示例#1
0
 def rfe_rf(self):
     """Build an unfitted RFE selector around a small random forest.

     The forest uses 5 trees of maximum depth 3, and the number of
     features to keep is read from ``self.fs_num``.
     """
     return RFE(
         RandomForestClassifier(max_depth=3, n_estimators=5),
         n_features_to_select=self.fs_num,
     )
示例#2
0
                for step in estimator:
                    current_features = get_feature_out(step, current_features)
                features_out = current_features
            else:
                features_out = get_feature_out(estimator, features)
            output_features.extend(features_out)
        elif estimator == 'passthrough':
            output_features.extend(ct._feature_names_in[features])

    return output_features


# Create the classifier object
# NOTE(review): `selected_classifier` is only assigned in this excerpt; its
# consumers are not visible here.
selected_classifier = "SVC"
classifier = SVC(kernel="linear", probability=True, class_weight="balanced")
# RFE ranks features using the classifier above. Per the scikit-learn RFE
# documentation, a float step in (0, 1) is the fraction of features removed
# per iteration.
selector = RFE(classifier, n_features_to_select=10, step=0.05)

# A pipeline chains two algorithms together so that the training process for both can be done in a single step and data is passed automatically from one to the other
# Step names ("RFE", "classifier") are the prefixes used by the parameter grid below.
pipeline = Pipeline([("preprocessor", preprocess_pipeline), ("RFE", selector),
                     ("classifier", classifier)])

#print(classifier.get_params().keys())

# Dictionary that contains the values for the parameter sweep
#param_grid = dict(RFE__n_features_to_select=[2,3,4,5,6,7,8,9,10], classifier__C=[0.001, 0.01, 0.1, 1, 10, 100, 1000], classifier__gamma=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
# NOTE(review): the classifier is built with kernel="linear"; confirm that
# sweeping classifier__gamma has any effect for that kernel.
param_grid = dict(RFE__n_features_to_select=[10, 20, 30, 40, 50],
                  classifier__C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
                  classifier__gamma=[1, 0.1, 0.001, 0.0001])

# Accumulators, presumably filled by evaluation code outside this excerpt.
scores = []
accuracy_scores = []
示例#3
0
a digit classification task.

.. note::

    See also :ref:`example_feature_selection_plot_rfe_with_cross_validation.py`

"""
print(__doc__)

from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt

# Load the digits dataset
digits = load_digits()
# Flatten every image into a single row so each pixel becomes one feature
X = digits.images.reshape((len(digits.images), -1))
y = digits.target

# Create the RFE object and rank each pixel
svc = SVC(kernel="linear", C=1)
# Eliminate one feature per iteration (step=1) until a single one is left
rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
rfe.fit(X, y)
# Fold the per-feature ranking back into the shape of one image for plotting
ranking = rfe.ranking_.reshape(digits.images[0].shape)

# Plot pixel ranking
plt.matshow(ranking, cmap=plt.cm.Blues)
plt.colorbar()
plt.title("Ranking of pixels with RFE")
plt.show()
def backward_feature_selection(data_set, y_values, number_of_features):
    """Reduce ``data_set`` to ``number_of_features`` columns using RFE.

    Recursive feature elimination is run around a ``LinearRegression``
    estimator and the reduced data is returned.

    Args:
        data_set: Feature matrix handed to ``RFE.fit_transform``.
        y_values: Target values handed to ``RFE.fit_transform``.
        number_of_features: Number of features RFE should keep.

    Returns:
        The result of ``RFE.fit_transform(data_set, y_values)``.
    """
    # n_features_to_select must be given by keyword: passing it positionally
    # was deprecated in scikit-learn 0.23 and is rejected from 1.0 onwards.
    selector = RFE(LinearRegression(),
                   n_features_to_select=number_of_features)
    return selector.fit_transform(data_set, y_values)
feat_names = leData2.drop(['Attrition'], axis=1).columns

# Order the features from most to least important
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(12, 6))
plt.title("Feature importances by DecisionTreeClassifier")
plt.bar(range(len(indices)), importances[indices], color='lightblue', align="center")
plt.step(range(len(indices)), np.cumsum(importances[indices]), where='mid', label='Cumulative')
plt.xticks(range(len(indices)), feat_names[indices], rotation='vertical', fontsize=14)
plt.xlim([-1, len(indices)])
# Save before show(): once the window shown by show() is closed the current
# figure is gone, so a later savefig() would write an empty image.
plt.savefig('DecisionTreeFeaturesImportance')
plt.show()

#2) Feature Selection using Recursive Feature Elimination (RFE)
model = LogisticRegression()

# Number of features to select; must be a keyword argument in scikit-learn >= 1.0
rfe = RFE(model, n_features_to_select=15)
rfe = rfe.fit(X, y)

print("Num Features: %s" % (rfe.n_features_))
print("Selected Features: %s" % (rfe.support_))
print("Feature Ranking: %s" % (rfe.ranking_))

sf = rfe.support_
fr = rfe.ranking_
featureNames = list(X.columns.values)

# Build the summary frame: the support mask is the single data column
# (named 0) and the ranking is used as the index, as in the original
# positional call pd.DataFrame(sf, fr); the feature names are added as a column.
RFE_df = pd.DataFrame(data=sf, index=fr)
RFE_df['featureNames'] = featureNames
# Columns 1..26 are the features, column 0 is the target
X = array[:, 1:27]
Y = array[:, 0]
"""
#feature extraction for non-negative features
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)
#summarize scores
set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)
#summarize selected features
print(features[0:5,:])
"""

model = LogisticRegression()
# n_features_to_select must be a keyword argument in scikit-learn >= 1.0
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)
# The % formatting has to happen inside print(): on Python 3 print() returns
# None, so `print("...") % value` raises TypeError after printing the
# unformatted template.
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

support = fit.support_
rank = fit.ranking_
# Print the name of every selected feature.
# NOTE(review): names[i] is indexed by feature position; confirm `names`
# does not also contain the target column, which would shift the labels.
for i, selected in enumerate(support):
    if selected:
        print(names[i])

# PCA

# Feature Importence with Extra Trees Classifier
model = ExtraTreesClassifier()
示例#7
0
def test():
    """Train and report heart-failure classifiers from ``patientData.csv``.

    Steps, in order: load the CSV and drop incomplete rows, one-hot encode
    the categorical columns, oversample a training split with SMOTE, run
    RFE around a logistic regression, print a statsmodels ``Logit`` summary
    for a fixed column list, then fit a scikit-learn ``LogisticRegression``
    on an 80/20 split of the oversampled data and print its accuracy on the
    held-out part together with the predictions.

    Takes no arguments and returns nothing; every result is printed.
    """
    import statsmodels.api as sm
    from imblearn.over_sampling import SMOTE
    from sklearn.feature_selection import RFE

    data = pd.read_csv('patientData.csv', header=0)
    data = data.dropna()

    print(data.shape)
    print(list(data.columns))

    # The raw categorical columns are still present at this point;
    # numeric_only=True restricts the per-class means to numeric columns
    # (pandas >= 2.0 raises on non-numeric columns instead of dropping them).
    print(data.groupby('HEARTFAILURE').mean(numeric_only=True))

    # One-hot encode each categorical column, then drop the raw column.
    cat_vars = ['SEX', 'FAMILYHISTORY', 'SMOKERLAST5YRS']
    for var in cat_vars:
        data = data.join(pd.get_dummies(data[var], prefix=var))
    to_keep = [c for c in data.columns.values.tolist() if c not in cat_vars]
    dataFinal = data[to_keep]

    X = dataFinal.loc[:, dataFinal.columns != 'HEARTFAILURE']
    y = dataFinal.loc[:, dataFinal.columns == 'HEARTFAILURE']

    # Only the training part of this first split is oversampled.
    smote = SMOTE(random_state=0)
    X_train, _, y_train, _ = train_test_split(X,
                                              y,
                                              test_size=0.15,
                                              random_state=0)
    columns = X_train.columns

    # fit_resample replaces fit_sample, which imbalanced-learn has removed.
    os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
    os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
    os_data_y = pd.DataFrame(data=os_data_y, columns=['HEARTFAILURE'])

    n_total = len(os_data_X)
    n_negative = len(os_data_y[os_data_y['HEARTFAILURE'] == 0])
    n_positive = len(os_data_y[os_data_y['HEARTFAILURE'] == 1])
    # NOTE(review): the "subscription" wording below is kept verbatim; the
    # counts are for HEARTFAILURE == 0 and HEARTFAILURE == 1 respectively.
    print("length of oversampled data is ", n_total)
    print("Number of no subscription in oversampled data", n_negative)
    print("Number of subscription", n_positive)
    print("Proportion of no subscription data in oversampled data is ",
          n_negative / n_total)
    print("Proportion of subscription data in oversampled data is ",
          n_positive / n_total)

    # RFE result is not used further below; inspect rfe.support_ /
    # rfe.ranking_ manually when choosing `col`.
    # n_features_to_select must be a keyword argument in scikit-learn >= 1.0.
    logreg = LogisticRegression()
    rfe = RFE(logreg, n_features_to_select=2)
    rfe = rfe.fit(os_data_X, os_data_y.values.ravel())

    col = [
        'PALPITATIONSPERDAY', 'BMI', 'AVGHEARTBEATSPERMIN', 'AGE',
        'EXERCISEMINPERWEEK', 'SEX_F', 'SEX_M', 'FAMILYHISTORY_N',
        'FAMILYHISTORY_Y', 'SMOKERLAST5YRS_N', 'SMOKERLAST5YRS_Y'
    ]

    y = os_data_y['HEARTFAILURE']
    X = os_data_X[col]
    final = sm.Logit(y, X)
    result = final.fit()
    print(result.summary2())

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=0)

    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    # Score on the held-out split so the number matches its label; the
    # previous code scored on all of X, y, which includes the training rows.
    print('Accuracy of logistic regression classifier on test set:',
          format(logreg.score(X_test, y_test)))
    print(y_pred)
示例#8
0
from sklearn.feature_selection import RFE, SelectKBest, chi2, SelectFpr
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# Compare three feature selectors on the same training data and print the
# column indices each of them keeps.
df = pd.read_csv('Train_CV_Data.csv')
# Features: columns 'srcPort' through 'HTTPM4', rows up to index label 2000000
X_train = np.asarray(df.loc[:2000000, 'srcPort':'HTTPM4'])
# Labels: the 'malicious' column for the same rows, as int32
Y_train = np.asarray(df.loc[:2000000, 'malicious'], dtype=np.int32)
# Number of rows labelled 1
print(np.sum(Y_train == 1))

# Selector 1: the 12 features with the highest chi-squared scores
kBest = SelectKBest(chi2, k=12)
kBest.fit(X_train, Y_train)
mask1 = kBest.get_support(indices=True)

# Selector 2: chi-squared test filtered by false-positive rate (alpha=0.0001)
fpr = SelectFpr(chi2, alpha=0.0001)
fpr.fit(X_train, Y_train)
mask2 = fpr.get_support(indices=True)

rf = RandomForestClassifier(n_estimators=50)

# Selector 3: RFE around the 50-tree forest, removing one feature per step
rfe = RFE(rf, n_features_to_select=12, step=1)
rfe.fit(X_train, Y_train)
mask3 = rfe.get_support(indices=True)

# get_support(indices=True) yields integer column positions, not names
print('K-Best Feat :', mask1)
print('False Positive based :', mask2)
print('RFE based :', mask3)
示例#9
0
def test_number_of_subsets_of_features():
    """Check two closed-form counts of RFE feature subsets against fits.

    Both formulas are compared with ``max(ranking_)`` of fitted ``RFE``
    objects and with the lengths of ``grid_scores_`` and every
    ``cv_results_`` entry of fitted ``RFECV`` objects.
    """
    # In RFE, 'number_of_subsets_of_features'
    # = the number of iterations in '_fit'
    # = max(ranking_)
    # = 1 + (n_features + step - n_features_to_select - 1) // step
    # After optimization #4534, this number
    # = 1 + np.ceil((n_features - n_features_to_select) / float(step))
    # This test case is to test their equivalence, refer to #4534 and #3824

    def formula1(n_features, n_features_to_select, step):
        # Integer (floor-division) form of the count
        return 1 + ((n_features + step - n_features_to_select - 1) // step)

    def formula2(n_features, n_features_to_select, step):
        # Ceiling form of the same count; returns a float
        return 1 + np.ceil((n_features - n_features_to_select) / float(step))

    # RFE
    # Case 1, n_features - n_features_to_select is divisible by step
    # Case 2, n_features - n_features_to_select is not divisible by step
    n_features_list = [11, 11]
    n_features_to_select_list = [3, 3]
    step_list = [2, 3]
    for n_features, n_features_to_select, step in zip(
            n_features_list, n_features_to_select_list, step_list):
        # Fixed seed so each case sees the same random data
        generator = check_random_state(43)
        X = generator.normal(size=(100, n_features))
        y = generator.rand(100).round()
        rfe = RFE(
            estimator=SVC(kernel="linear"),
            n_features_to_select=n_features_to_select,
            step=step,
        )
        rfe.fit(X, y)
        # this number also equals to the maximum of ranking_
        assert np.max(rfe.ranking_) == formula1(n_features,
                                                n_features_to_select, step)
        assert np.max(rfe.ranking_) == formula2(n_features,
                                                n_features_to_select, step)

    # In RFECV, 'fit' calls 'RFE._fit'
    # 'number_of_subsets_of_features' of RFE
    # = the size of each score in 'cv_results_' of RFECV
    # = the number of iterations of the for loop before optimization #4534

    # RFECV, n_features_to_select = 1
    # Case 1, n_features - 1 is divisible by step
    # Case 2, n_features - 1 is not divisible by step

    n_features_to_select = 1
    n_features_list = [11, 10]
    step_list = [2, 2]
    for n_features, step in zip(n_features_list, step_list):
        generator = check_random_state(43)
        X = generator.normal(size=(100, n_features))
        y = generator.rand(100).round()
        rfecv = RFECV(estimator=SVC(kernel="linear"), step=step)
        rfecv.fit(X, y)

        # TODO: Remove in v1.2 when grid_scores_ is removed
        msg = (
            r"The `grid_scores_` attribute is deprecated in version 1\.0 in "
            r"favor of `cv_results_` and will be removed in version 1\.2.")
        # Accessing grid_scores_ is expected to emit the deprecation warning
        with pytest.warns(FutureWarning, match=msg):
            assert len(rfecv.grid_scores_) == formula1(n_features,
                                                       n_features_to_select,
                                                       step)
            assert len(rfecv.grid_scores_) == formula2(n_features,
                                                       n_features_to_select,
                                                       step)

        # Every array stored in cv_results_ must have one entry per subset
        for key in rfecv.cv_results_.keys():
            assert len(rfecv.cv_results_[key]) == formula1(
                n_features, n_features_to_select, step)
            assert len(rfecv.cv_results_[key]) == formula2(
                n_features, n_features_to_select, step)
示例#10
0
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC

# NOTE: Make sure that the class is labeled 'target' in the data file
# 'PATH/TO/DATA/FILE' and 'COLUMN_SEPARATOR' look like placeholders that
# must be replaced before this script can run.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
# Every column except 'target' is used as a feature
features = tpot_data.drop('target', axis=1).values
# Fixed random_state so the train/test split is reproducible
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.9794871794871796
# Pipeline stages: RFE driven by an ExtraTreesClassifier -> MinMaxScaler -> LinearSVC
exported_pipeline = make_pipeline(
    RFE(estimator=ExtraTreesClassifier(criterion="gini", max_features=0.7000000000000001, n_estimators=100), step=0.8),
    MinMaxScaler(),
    LinearSVC(C=10.0, dual=True, loss="squared_hinge", penalty="l2", tol=0.001)
)

# Fit on the training split, then predict the held-out split
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
示例#11
0
acuu(dataX,dataY)
#--------------------------------------------------------------------------------------------------------------------

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Reduce dataX to 17 columns with three different techniques and evaluate
# each reduced matrix with acuu().

# 1) Univariate selection: keep the 17 best chi-squared features
test = SelectKBest(score_func=chi2, k=17)        # k is number of features
fit = test.fit(dataX, dataY)
train2 = test.transform(dataX)
acuu(train2, dataY)
#
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# 2) Recursive feature elimination around a logistic regression.
# n_features_to_select must be a keyword argument in scikit-learn >= 1.0.
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=17)
fit = rfe.fit(dataX, dataY)
train2 = fit.transform(dataX)
acuu(train2, dataY)

from sklearn.decomposition import PCA

# 3) Projection onto the first 17 principal components
pca = PCA(n_components=17)
fit = pca.fit(dataX, dataY)
train2 = pca.transform(dataX)
acuu(train2, dataY)

import warnings

warnings.filterwarnings("ignore")
示例#12
0
文件: codigo.py 项目: nataliafm/AA
plt.show()

# Split features and labels: the first column holds the labels
datos = np.array(datos)
datosy = np.transpose(datos)[0]
datosx = [i[1:] for i in datos]

# Remove feature 11 (column index 10)
datosx = np.delete(datosx, 10, 1)

# Remove feature 16 (column index 14 once feature 11 is gone)
datosx = np.delete(datosx, 14, 1)

# Feature selection: keep 13 features.
# n_features_to_select must be a keyword argument in scikit-learn >= 1.0.
estimator = SVR(kernel="linear")
selector = RFE(estimator, n_features_to_select=13)

datosx = selector.fit_transform(datosx, datosy)
print(selector.ranking_)
print(selector.support_)

# Split the data into train and test sets
trainx, testx, trainy, testy = train_test_split(datosx,
                                                datosy,
                                                test_size=0.3,
                                                shuffle=True)

# Neural networks
print('Resultados Redes Neuronales: ')
# One random 0/1 value per test sample
W = np.random.randint(low=0, high=2, size=len(testy))
    'anomalous(wrongSetUp)': 6,
    'normal': 7
}

train_data = pd.read_csv('dataset/balanced_noTimestamp_mixTrain.csv')
# Position of the label column (the last one) in the training file
label_index = train_data.shape[1] - 1

train_labels = train_data.iloc[:, -1]  # separate labels of training sets
train_data.drop(train_data.columns[-1], axis=1, inplace=True)

test_data = pd.read_csv('dataset/balanced_noTimestamp_mixTest.csv')
test_labels = test_data.iloc[:, -1]  # separate labels of testing set
# Drop the test frame's own last column, i.e. the one the labels were just
# read from, rather than reusing the training frame's column position.
test_data.drop(test_data.columns[-1], axis=1, inplace=True)

dt_clf = DecisionTreeClassifier()  # Train DecisionTreeClassifier
# n_features_to_select must be a keyword argument in scikit-learn >= 1.0;
# None keeps RFE's default number of features.
selector_dt = RFE(dt_clf, n_features_to_select=None,
                  step=1).fit(train_data, train_labels)
predicted_test_dt = selector_dt.predict(test_data)

# rf_clf = RandomForestClassifier(max_depth=6, random_state=0)  # RandomForestClassifier
# selector_rf = RFE(rf_clf, n_features_to_select=None, step=1).fit(train_data, train_labels)
# predicted_test_rf = selector_rf.predict(test_data)

knn_clf = KNeighborsClassifier(n_neighbors=5).fit(
    train_data, train_labels)  # Train KNN classifier
predicted_test_knn = knn_clf.predict(test_data)

# Train SVM classifier
svc_clf = svm.SVC(gamma='auto',
                  kernel='rbf',
                  decision_function_shape='ovo',
                  max_iter=-1,
            testX = X[1240:1280, :]
            testY = Y[1240:1280]
            trainX = X[0:1240, :]
            trainY = Y[0:1240]
        else:
            testX = X[subNo * trialNum:(subNo + 1) * trialNum, :]
            testY = Y[subNo * trialNum:(subNo + 1) * trialNum]
            trainX = np.vstack(
                (X[0:subNo * trialNum, :],
                 X[(subNo + 1) * trialNum:subNum * trialNum, :]))
            trainY = np.concatenate(
                (Y[0:subNo * trialNum],
                 Y[(subNo + 1) * trialNum:subNum * trialNum]))

        clf = svm.SVC(kernel='linear')
        sel_criteria = RFE(estimator=clf, n_features_to_select=num_k,
                           step=0.5).fit(trainX, trainY)
        sel_indx_mask = sel_criteria.get_support()
        sel_indx = np.where(sel_indx_mask == True)
        sel_indx = sel_indx[0]
        trainX = trainX[:, sel_indx]
        testX = testX[:, sel_indx]

        #svm
        clf1 = svm.SVC(kernel='linear')
        clf1.fit(trainX, trainY)
        predict_testY = clf1.predict(testX)
        f1_scores[no_k, subNo] = metrics.f1_score(testY, predict_testY)
        acc_scores[no_k, subNo] = metrics.accuracy_score(testY, predict_testY)

        print('current sub performance:', acc_scores[no_k, subNo], ' kbest:',
              num_k)
示例#15
0
## FUNCTIONS: RFE, RFECV
## DOCUMENTATION: http://scikit-learn.org/stable/modules/feature_selection.html
## DATA: Crime (n=319 non-null, p=122, type=regression)
## DATA DICTIONARY: http://archive.ics.uci.edu/ml/datasets/Communities+and+Crime

# define X and y: the last column is the target, all others are features
X = crime.iloc[:, :-1]
y = crime.iloc[:, -1]

# split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# select "best" features (half of them by default)
lm = LinearRegression()
from sklearn.feature_selection import RFE
selector = RFE(lm)
selector.fit(X_train, y_train)
# Bare attribute expressions: they only display a value in an interactive
# session and do nothing when run as a script.
selector.n_features_
selector.support_
selector.ranking_

# let RFECV select the "optimal" number of features
from sklearn.feature_selection import RFECV
# 'mean_squared_error' is no longer a valid scorer name; the negated variant
# is the supported one (higher is better, as scorers require).
selector = RFECV(lm, cv=3, scoring='neg_mean_squared_error')
selector.fit(X, y)
selector.n_features_
selector.support_
selector.ranking_

# *tentative* advice for usage:
# 1. scale features, then use RFECV to select the number of features (p)
示例#16
0
from sklearn.metrics import classification_report

# Print one confusion matrix per fitted classifier, using the predictions
# computed earlier (y_predict_* are defined outside this excerpt).
print('\n1:')
print("DTC confusioin_matrix:\n", confusion_matrix(y_test, y_predict_dtc))
print("\nKNC confusioin_matrix:\n", confusion_matrix(y_test, y_predict_knc))
print("\nRFC confusioin_matrix:\n", confusion_matrix(y_test, y_predict_rfc))
print("\nGBC confusioin_matrix:\n", confusion_matrix(y_test, y_predict_gbc))
print("\nAda confusioin_matrix:\n", confusion_matrix(y_test, y_predict_abc))
print("\nSVC confusioin_matrix:\n", confusion_matrix(y_test, y_predict_svc))
print("\nGauNB confusioin_matrix:\n", confusion_matrix(y_test, y_predict_gnb))
print("\nLR confusioin_matrix:\n", confusion_matrix(y_test, y_predict_lr))

# Score of every classifier on the test split, printed on a single line
print("1:","KNC:",knc.score(x_test,y_test),'DTC:',dtc.score(x_test,y_test),"RFC:",rfc.score(x_test,y_test),"GBC:",gbc.score(x_test,y_test),\
"Ada:",abc.score(x_test,y_test),"SVC:",svc.score(x_test,y_test),"GauNB:",gnb.score(x_test,y_test),\
"LR:",LR.score(x_test,y_test))

# Y.shape unpacks into two values, so Y is 2-D here; flatten it to a 1-D
# array of length c before fitting.
c, r = Y.shape
Y = Y.values
Y = Y.reshape(c, )

# Fresh estimators; only gnb1 is used below in this excerpt.
dtc1 = DecisionTreeClassifier()
gbc1 = GradientBoostingClassifier()
gnb1 = GaussianNB()

# NOTE(review): RFE needs an estimator exposing coef_ or feature_importances_
# (scikit-learn RFE docs); GaussianNB does not appear to provide either, so
# confirm this fit actually runs, or switch to dtc1/gbc1.
rfe = RFE(estimator=gnb1, n_features_to_select=5, step=1)
rfe.fit(X, Y)
ranking = rfe.ranking_
print("gnb RFE ranking:\n", ranking)

#print(X.index)
#print(X.iloc[0])
plt.title('Comparison of different Feature Importances')
plt.show()

# -

# ### Recursive Feature Elimination

# +
from sklearn.feature_selection import RFE
from sklearn import ensemble
from yellowbrick.features import RFECV

## RFE

# Rank the prepared training features with RFE around a random forest,
# asking it to keep 50 of them.
rf = RandomForestClassifier(random_state=42)
model = RFE(rf, n_features_to_select=50)
fit_model = model.fit(X_train_prepared, y_train)
# Pair every column name with its RFE rank
features = pd.DataFrame(list(zip(X_train_prepared.columns,
                                 fit_model.ranking_)),
                        columns=['predictor', 'ranking'])
# -

features = features.sort_values(by='ranking')

## Keep only the features whose RFE rank is 1 or 2 (ranking < 3).
## NOTE(review): the earlier wording said features with rank "greater than 3"
## are insignificant, which would also keep rank 3 -- confirm the cutoff.
chosen_features = features[features['ranking'] < 3]

# Bare expression: displays (rows, columns) in a notebook cell only
chosen_features.shape

# ### Sequential Feature Selection
'''
def create_model(number, features):
    print(f"\nExecuting Model {number}")
    PATH = "carInsurance.csv"
    df = pd.read_csv(
        PATH,
        skiprows=1,
        encoding="ISO-8859-1",
        sep=',',
        names=("ID", "KIDSDRIV", "BIRTH", "AGE", "HOMEKIDS", "YOJ", "INCOME",
               "PARENT1", "HOME_VAL", "MSTATUS", "GENDER", "EDUCATION",
               "OCCUPATION", "TRAVTIME", "CAR_USE", "BLUEBOOK", "TIF",
               "CAR_TYPE", "RED_CAR", "OLDCLAIM", "CLM_FREQ", "REVOKED",
               "MVR_PTS", "CLM_AMT", "CAR_AGE", "CLAIM_FLAG", "URBANICITY"))
    # Show all columns.
    pd.set_option('display.max_columns', 1000)
    pd.set_option('display.width', 1000)

    # Exploratory Analysis
    def exploratory_analysis(df):
        print(df.columns)  # list all column names
        print(df.shape)  # get number of rows and columns
        print(df.info())  # additional info about dataframe
        print(
            df.describe())  # statistical description, only for numeric values
        columns = [
            "ID", "KIDSDRIV", "BIRTH", "AGE", "HOMEKIDS", "YOJ", "INCOME",
            "PARENT1", "HOME_VAL", "MSTATUS", "GENDER", "EDUCATION",
            "OCCUPATION", "TRAVTIME", "CAR_USE", "BLUEBOOK", "TIF", "CAR_TYPE",
            "RED_CAR", "OLDCLAIM", "CLM_FREQ", "REVOKED", "MVR_PTS", "CLM_AMT",
            "CAR_AGE", "CLAIM_FLAG", "URBANICITY"
        ]
        for column in columns:
            print(df[column].value_counts(
                dropna=False))  # count unique values in a

    # End of Exploratory Analysis

    # Convert Columns with $ in entry
    def convert_dollar_sign_columns(df):
        """Strip non-digit characters from the currency columns and cast them.

        Every run of non-digit characters is removed from the 'INCOME',
        'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM' and 'CLM_AMT' columns and the
        result is converted with ``pd.to_numeric``.

        Args:
            df: DataFrame containing the five columns above; modified in
                place.

        Returns:
            The same DataFrame object, with the five columns numeric.
        """
        columns_with_dollar_sign = [
            'INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM', 'CLM_AMT'
        ]

        for column in columns_with_dollar_sign:
            # Raw string: '\D' in a plain literal is an invalid escape
            # sequence. The cleaned column is assigned back explicitly
            # instead of calling replace(inplace=True) on df[column], which
            # is not guaranteed to write through to df.
            cleaned = df[column].replace(to_replace=r'\D+',
                                         value='',
                                         regex=True)
            df[column] = pd.to_numeric(cleaned)
        return df

    df = convert_dollar_sign_columns(df=df)

    # End of Conversion

    # Imputation of Empty or NaN in Columns

    def convert_na_cells(colName, df, measureType):
        """Add an imputed copy of a column plus a missing-value indicator.

        Two columns are added to ``df`` and the original column is kept:
        ``'m_' + colName`` is 1 where the original value is missing and 0
        otherwise, and ``'imp_' + colName`` holds the original values with
        missing entries replaced by the chosen statistic.

        Args:
            colName: Name of the numeric column to impute.
            df: DataFrame to extend; modified in place.
            measureType: "median" or "mode"; any other value selects the
                mean.

        Returns:
            The same DataFrame object with the two new columns.
        """
        # Create two new column names based on original column name.
        indicatorColName = 'm_' + colName  # Tracks whether imputed.
        imputedColName = 'imp_' + colName  # Stores original & imputed data.

        # Pick the replacement value.
        column = df[colName]
        if measureType == "median":
            imputedValue = column.median()
        elif measureType == "mode":
            # mode() returns a Series (several values on ties, none when the
            # column is all-NaN); take the first one instead of float(Series).
            modes = column.mode()
            imputedValue = float(modes.iloc[0]) if len(modes) else float('nan')
        else:
            imputedValue = column.mean()

        # Vectorised on the column itself, so it works for any index, not
        # only the default 0..n-1 labels a df.loc[i] row loop requires.
        missing = column.isna()

        # Append new columns to dataframe but always keep original column.
        df[indicatorColName] = missing.astype(int)
        df[imputedColName] = column.fillna(imputedValue)
        return df

    def analysis_of_income_for_imputation(df):
        occurences_of_income = df['INCOME'].value_counts(
            dropna=False).to_dict()
        # print(occurences_of_income)
        plt.bar(["$0", "NaN"], [797, 570])
        plt.title("Occurences of Distinct Values for Income")
        plt.xlabel("Income")
        plt.ylabel("Occurences")
        plt.show()
        # Stats
        # Entries = 9732
        # Significant Non-Unique Occurences = {0: 797, "NaN": 570}
        # 1367 entries are significant non-unique. 8365 are entries remaining
        # that are relatively distinct.
        # Conclusion for imputation choice: Do not use mean or mode. Median is
        # ideal. 570 entries will be imputed.

    def analysis_of_age_for_imputation(df):
        occurences_of_age = df['AGE'].value_counts(dropna=False).to_dict()
        plt.bar(occurences_of_age.keys(), occurences_of_age.values())
        plt.title("Occurences of Distinct Values for Age")
        plt.xlabel("Ages")
        plt.ylabel("Occurences")
        plt.show()
        # The distribution of the age plot is normal and balanced.
        # Imputation with mean is reliable and only 7 entries need to be imputed so
        # the imputation will not heavily impact our regressional analysis later.

    def analysis_of_yoj_for_imputation(df):
        occurences_of_yoj = df['YOJ'].value_counts(dropna=False).to_dict()
        plt.bar(occurences_of_yoj.keys(), occurences_of_yoj.values())
        plt.title("Occurences of Distinct Values for YOJ")
        plt.xlabel("YOJ")
        plt.ylabel("Occurences")
        plt.show()
        # The distribution is not normal. There are 800, 0 value entries of the
        # 9754 entry total.
        # The majority of entries are focused around the mean.
        # There are 548 entries missing.
        # The 0 entries could be significant so testing median or mode are
        # likely more reliable than mean since there are so many 0 value entries.

    def analysis_of_home_val_for_imputation(df):
        # print(df['HOME_VAL'].value_counts(
        #     dropna=False))
        occurences_of_home_val = df['HOME_VAL'].value_counts(
            dropna=False).to_dict()
        plt.bar(["$0", "NaN"], [2908, 575])
        plt.title("Occurences of Distinct Values for Home Values")
        plt.xlabel("Home Values")
        plt.ylabel("Occurences")
        plt.show()
        # Value       Occurences
        # 0.0         2908
        # NaN          575
        # 6819 entries are not 0 or NaN
        # 575 entries are missing (NaN)
        # STD is quite high and is explainable by the occurences of $0 entries.
        # Our analysis of distinct values shows that mean and mode would be
        # invalid imputation methods for home_val. I will use median.
        #	        count	mean	std	        min	25%	50%	    75%	    max
        # HOME_VAL	9727	154523	129188.4	0	0	160661	238256	885282

    def analysis_of_car_age_for_imputation(df):
        occurences_of_car_age = df['CAR_AGE'].value_counts(
            dropna=False).to_dict()
        plt.bar(occurences_of_car_age.keys(), occurences_of_car_age.values())
        plt.title("Occurences of Distinct Values for Car Age")
        plt.xlabel("Car Age")
        plt.ylabel('Occurences')
        plt.show()
        # Distribution is not normal. There are roughly 2450 cars with an
        # age of 1. The second highest occurring age is 8 at around 650. The
        # occurences of age 1 are not likely to be erroneous. Imputation
        # with car age should be done with mean or median to find the better
        # imputation method.

    # Treat outlier in CAR_AGE column
    df.CAR_AGE = df.CAR_AGE.mask(df.CAR_AGE.lt(0), 0)

    def imputation_analysis(df):
        analysis_of_income_for_imputation(df)
        analysis_of_age_for_imputation(df)
        analysis_of_yoj_for_imputation(df)
        analysis_of_home_val_for_imputation(df)
        analysis_of_car_age_for_imputation(df)

    imputation_analysis(df)
    df = convert_na_cells("INCOME", df, "median")
    df = convert_na_cells("AGE", df, "mean")
    df = convert_na_cells("YOJ", df, "mode")
    df = convert_na_cells("HOME_VAL", df, "median")
    df = convert_na_cells("CAR_AGE", df, "mean")
    # End of Imputation

    # Dummy Variables: Handling Categorial and Ordinal/Nominal Columns
    # Treating categorical (string) information.
    df = pd.get_dummies(df,
                        columns=[
                            'PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION',
                            'OCCUPATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR',
                            'REVOKED', 'URBANICITY'
                        ])
    # End of Dummy Variable Handling

    # Binning
    df['AGE_bin'] = pd.cut(x=df['AGE'], bins=[0, 17, 27, 37, 47, 57, 67, 77])
    tempDf = df['AGE_bin']  # Isolate columns
    # Get dummies
    dummyDf = pd.get_dummies(tempDf, columns=['AGE_bin'])
    df = pd.concat(([df, dummyDf]), axis=1)  # Join dummy df with original
    predictors_test = [
        'BLUEBOOK', 'OLDCLAIM', 'CLM_FREQ', 'CLM_AMT', 'imp_INCOME', 'imp_YOJ',
        'imp_HOME_VAL', 'imp_CAR_AGE', 'PARENT1_No', 'MSTATUS_Yes',
        'MSTATUS_z_No', 'GENDER_M', 'GENDER_z_F', 'EDUCATION_<High School',
        'EDUCATION_Bachelors', 'EDUCATION_Masters', 'EDUCATION_PhD',
        'EDUCATION_z_High School', 'OCCUPATION_Clerical', 'OCCUPATION_Doctor',
        'OCCUPATION_Home Maker', 'OCCUPATION_Lawyer', 'OCCUPATION_Manager',
        'OCCUPATION_Professional', 'OCCUPATION_Student',
        'OCCUPATION_z_Blue Collar', 'CAR_USE_Commercial', 'CAR_USE_Private',
        'CAR_TYPE_Minivan', 'CAR_TYPE_Panel Truck', 'CAR_TYPE_Pickup',
        'CAR_TYPE_Sports Car', 'CAR_TYPE_Van', 'CAR_TYPE_z_SUV', 'RED_CAR_no',
        'RED_CAR_yes', 'REVOKED_No', 'REVOKED_Yes',
        'URBANICITY_Highly Urban/ Urban', 'URBANICITY_z_Highly Rural/ Rural'
    ]
    X = df[predictors_test]
    y = df['CLAIM_FLAG']

    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression

    # Scale the data prior to selection.
    print("Please wait for scaling...")
    sc_x = StandardScaler()
    X_scaled = sc_x.fit_transform(X)

    print("Please wait for automated feature selection...")
    n_features = features
    print(f"Selecting {n_features} features")
    logreg = LogisticRegression(max_iter=200)
    rfe = RFE(logreg, n_features)  # Select top 20 features.
    rfe = rfe.fit(X_scaled, y.values.ravel())
    print("Feature selection is complete.")

    def getSelectedColumns(ranking):
        """Return the labels yielded by ``X`` at positions ranked 1.

        ``X`` is taken from the enclosing scope; iterating it yields one
        label per column position.
        """
        # Positions whose rank equals 1.
        chosen = [pos for pos in range(0, len(ranking)) if ranking[pos] == 1]
        # Walk X in order and keep the labels sitting at a chosen position.
        return [label for pos, label in enumerate(X) if pos in chosen]

    selectedPredictorNames = getSelectedColumns(rfe.ranking_)

    # Show selected names from RFE.
    print("\n*** Selected Features:")
    for i in range(0, len(selectedPredictorNames)):
        print(selectedPredictorNames[i])

    # prepare cross validation with three folds and 1 as a random seed.
    # Separate into x and y values.
    count = 0
    kfold = KFold(3, True, 1)
    # Separate into x and y values.
    X = df[selectedPredictorNames]
    y = df[['CLAIM_FLAG']]

    # Show chi-square scores for each feature.
    # There is 1 degree freedom since 1 predictor during feature evaluation.
    # Generally, >=3.8 is good)

    test = SelectKBest(score_func=chi2, k=n_features)
    XScaled = MinMaxScaler().fit_transform(X)
    chiScores = test.fit(XScaled, y)  # Summarize scores
    np.set_printoptions(precision=3)

    # Search here for insignificant features.
    print("\nPredictor Chi-Square Scores: " + str(chiScores.scores_))
    for train, test in kfold.split(df[selectedPredictorNames]):
        X_train = X.iloc[train, :]  # Gets all rows with train indexes.
        y_train = y.iloc[train, :]
        X_test = X.iloc[test, :]
        y_test = y.iloc[test, :]
        X_scaled = sc_x.fit_transform(X)

        # Perform logistic regression.
        logisticModel = LogisticRegression(fit_intercept=True,
                                           random_state=0,
                                           solver='liblinear')
        # Fit the model.
        logisticModel.fit(X_train, np.ravel(y_train))

        y_pred = logisticModel.predict(X_test)
        y_prob = logisticModel.predict_proba(X_test)

        # Split data.
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.25,
                                                            random_state=0)

        # Perform logistic regression.
        logisticModel = LogisticRegression(fit_intercept=True,
                                           random_state=0,
                                           solver='liblinear')

        # Fit the model.
        logisticModel.fit(X_train, np.ravel(y_train))

        y_pred = logisticModel.predict(X_test)
        y_prob = logisticModel.predict_proba(X_test)

        # Show confusion matrix and accuracy scores.
        cm = pd.crosstab(np.ravel(y_test),
                         y_pred,
                         rownames=['Actual'],
                         colnames=['Predicted'])
        count += 1
        print("\n***K-fold: " + str(count))
        print('\nAccuracy: ', metrics.accuracy_score(y_test, y_pred))
        print("\nConfusion Matrix")
        print(cm)

        from sklearn.metrics import classification_report, roc_auc_score

        print(classification_report(y_test, y_pred))

        from sklearn.metrics import average_precision_score
        average_precision = average_precision_score(y_test, y_pred)

        print('Average precision-recall score: {0:0.2f}'.format(
            average_precision))

        # calculate scores
        auc = roc_auc_score(
            y_test,
            y_prob[:, 1],
        )
        print('Logistic: ROC AUC=%.3f' % (auc))

        # Stat Summary: accuracy, precision, recall, f1 scores along with averages and
        # standard deviations of these scores for all folds.
        # Show model coefficients and intercept.
        print(f"\nStatistical Summary of Model {number}")
        print("\nModel Intercept: ")
        print(logisticModel.intercept_)
        print("\nModel Coefficients: ")
        print(logisticModel.coef_)
        # Prediction with test data
        pred = logisticModel.predict(X_test)
        # Show stats about the regression.
        mse = mean_squared_error(y_test, pred)
        RMSE = np.sqrt(mse)
        print("\nRMSE: " + str(RMSE))
        print("\nr2_score", r2_score(y_test, pred))

    # ROC CURVE CHART, and CUMUL GAINS CHART
    def create_roc_curve_chart_and_cumul_chart():
        """Plot the ROC curve, then the cumulative-gain and lift charts.

        Reads ``X_train``, ``X_test``, ``y_train``, ``y_test`` and ``y_prob``
        from the enclosing scope, i.e. the values left behind by the last
        iteration of the fold loop.  Calls ``plt.show()`` twice and returns
        nothing.
        """
        # calculate roc curves chart
        CUT_OFF = 0.50
        lr_fpr, lr_tpr, _ = roc_curve(y_test, y_prob[:, 1])
        plt.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
        # Dashed diagonal from (0, 0) to (1, 1) as the reference line.
        plt.plot([0, 1], [0, 1], '--', label=f"CUT-OFF{CUT_OFF}")
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title("ROC CURVE")
        plt.legend()
        plt.show()

        # cumulative gains chart
        clf = LogisticRegression(random_state=0,
                                 multi_class='multinomial',
                                 solver='newton-cg')
        # Flatten the single-column target, as the other fits in this
        # function do, instead of handing the estimator a column vector.
        clf.fit(X_train, np.ravel(y_train))
        predicted_probas = clf.predict_proba(X_test)
        import scikitplot as skplt
        skplt.metrics.plot_cumulative_gain(y_test, predicted_probas)
        skplt.metrics.plot_lift_curve(y_test, predicted_probas)
        plt.show()

    create_roc_curve_chart_and_cumul_chart()
    print(f"\nEnd of Model {number}")
示例#19
0
def _select_rfe_features(regression, dataframe, num_features):
    """Return the column names RFE keeps when asked for ``num_features``."""
    predictors = dataframe.drop(['calories_per_ha'], axis=1)
    target = dataframe['calories_per_ha']
    # The selector is fitted on a random training split only; the held-out
    # part of the split is not used.
    train_x, _, train_y, _ = train_test_split(predictors, target)
    # n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects
    # it as a positional argument.
    selector = RFE(regression, n_features_to_select=num_features, step=1)
    selector.fit(train_x, train_y)
    return [
        train_x.columns[feature_pos]
        for feature_pos in selector.get_support(indices=True)
    ]


def _score_and_record(results_df, model, regression, data, num_features,
                      features, parameters_dict, extra_fields=None):
    """Score ``regression`` on ``data`` and return ``results_df`` plus one row."""
    import pandas as pd  # local import keeps this helper self-contained

    scores = do_regression(regression, data)
    print('    R2_score : ' + str(scores[0]))
    print('    MSE_score : ' + str(scores[1]))
    row = {
        'Model': model,
        'num_features': num_features,
        'Features': features,
        'params': parameters_dict,
        'R2': scores[0],
        'MSE': scores[1],
    }
    if extra_fields is not None:
        row.update(extra_fields)
    # DataFrame.append was removed in pandas 2.0; concatenating a one-row
    # frame is the supported equivalent.
    return pd.concat([results_df, pd.DataFrame([row])], ignore_index=True)


def run_regressions_and_save_results(model,
                                     regression,
                                     dataframe,
                                     features_selection,
                                     results_df,
                                     parameters_dict=None,
                                     inputs=None):
    """Run ``regression`` on a chosen feature set and record the scores.

    Args:
        model: Label stored in the 'Model' column of each result row.
        regression: Estimator handed to ``do_regression`` and, for the RFE
            modes, to ``RFE``.
        dataframe: Data containing the predictors and the
            'calories_per_ha' column.
        features_selection: One of
            'all'               - use ``dataframe`` as is;
            'all_but_climzones' - restrict to ``columns_without_climatezones``;
            'RFE'               - RFE with 5..29 features;
            'RFE_but_climzones' - same, on ``columns_without_climatezones``;
            'RFE_8_20'          - RFE with 8 and 20 features; these rows also
                                  carry an 'inputs' field.
            Any other value leaves ``results_df`` untouched.
        results_df: DataFrame the result rows are appended to.
        parameters_dict: Stored as-is in the 'params' column.
        inputs: Stored in the 'inputs' column ('RFE_8_20' mode only).

    Returns:
        A DataFrame made of ``results_df`` plus one row per regression run.
        ``scores[0]`` from ``do_regression`` is recorded as 'R2' and
        ``scores[1]`` as 'MSE'.
    """
    # - - - - - - - - - - - - - - - - - - - - - - -
    # Whole-frame modes: one regression, one row
    # - - - - - - - - - - - - - - - - - - - - - - -
    if features_selection in ('all', 'all_but_climzones'):
        if features_selection == 'all':
            print('    with all features...')
            features_label = 'All w/ climzones'
        else:
            dataframe = dataframe[columns_without_climatezones]
            print('    with all features but climate zones...')
            features_label = 'All w/o climzones'
        # The frame still contains the target column, hence the "- 1".
        return _score_and_record(results_df, model, regression, dataframe,
                                 len(dataframe.columns) - 1, features_label,
                                 parameters_dict)

    # - - - - - - - - - - - - - - - - - - - - - - -
    # RFE modes: one regression per feature count
    # - - - - - - - - - - - - - - - - - - - - - - -
    extra_fields = None
    if features_selection == 'RFE':
        feature_counts, banner = range(5, 30), ' RFE with '
    elif features_selection == 'RFE_but_climzones':
        dataframe = dataframe[columns_without_climatezones]
        feature_counts, banner = range(5, 30), 'RFE (no climzones) with '
    elif features_selection == 'RFE_8_20':
        feature_counts, banner = [8, 20], 'RFE with '
        extra_fields = {'inputs': inputs}
    else:
        # Unknown mode: nothing to run.
        return results_df

    for num_features in feature_counts:
        print(banner + str(num_features) + ' features ...')
        features_selected = _select_rfe_features(regression, dataframe,
                                                 num_features)
        # Re-run the regression on the selected predictors plus the target.
        results_df = _score_and_record(
            results_df, model, regression,
            dataframe[(features_selected + ['calories_per_ha'])],
            num_features, features_selected, parameters_dict, extra_fields)

    return results_df
示例#20
0
                l.append(tup)
    rd.seed(77)
    choice = []
    for i in l:
        r = rd.random()
        if i[0] not in choice and i[1] not in choice:
            if r >= 0.5:
                choice.append(i[0])
            else:
                choice.append(i[1])
    df = df.loc[:, choice]
    # df = df.drop(choice,axis=1)
    return df, choice


# Recursive feature elimination with a seeded decision tree, configured to
# keep 30 features; it is fitted against the 'days_to_death' column.
rfe = RFE(estimator=DecisionTreeClassifier(random_state=2),
          n_features_to_select=30)
selector = rfe.fit(all_features_norm, survival_df_filtered['days_to_death'])
# np.where on the support mask returns a one-element tuple holding the
# positions of the selected features.
selected_ind = np.where(selector.support_)

# Restrict the feature table to the selected columns.
all_features_selected = all_features_norm[
    all_features_norm.columns[selected_ind]]
events = survival_df_filtered['vital_status'].astype(bool)
# DropLowVariance is defined elsewhere; presumably it removes near-constant
# columns -- TODO confirm against its definition.
all_features_drop_low_var = DropLowVariance(all_features_selected, events)

# Names of the form "<feature_name>_<j>" for the first 115 feature names and
# j in 0..3, grouped per j and then flattened into a single list.
all_feature_names = [[feature_names[i] + '_' + str(j) for i in range(115)]
                     for j in range(4)]
all_feature_names_ls = list(chain.from_iterable(all_feature_names))

# NOTE(review): the surviving column labels are used directly as positions
# into the flat name list, which assumes integer column labels -- confirm.
all_reduced_features = [
    all_feature_names_ls[i] for i in list(all_features_drop_low_var.columns)
]
示例#21
0
def get_data():
    """Load the Airbnb listings CSV and return (features, target).

    Reads './airbnb_data.csv', drops columns that are unrelated to income or
    cannot be collected from a user, cleans a few columns, keeps the 20
    amenity columns chosen by RFE with a linear regression, and one-hot
    encodes what remains.

    Returns:
        tuple: ``(listings_df, estimated_income)`` where ``listings_df`` is
        the encoded feature frame without 'estimated_income_per_month' and
        ``estimated_income`` is that column.

    Raises:
        KeyError: if an expected column is missing from the CSV.
    """
    # Read csv
    listings_df = pd.read_csv('./airbnb_data.csv', low_memory=False)

    # Drop columns that aren't related to income or not feasible to capture from user
    columns_to_drop = [
        'Unnamed: 0', 'id', 'scrape_id', 'host_id',
        'host_total_listings_count', 'latitude', 'longitude',
        'availability_30', 'availability_60', 'availability_90',
        'availability_365', 'number_of_reviews',
        'calculated_host_listings_count', 'reviews_per_month', 'Other',
        'listing_url', 'last_scraped', 'host_name', 'experiences_offered',
        'picture_url', 'name', 'host_url', 'host_since', 'host_is_superhost',
        'host_thumbnail_url', 'host_picture_url', 'host_listings_count',
        'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
        'street', 'city', 'neighbourhood_group_cleansed', 'smart_location',
        'country_code', 'country', 'is_location_exact', 'amenities', 'price',
        'calendar_updated', 'has_availability', 'calendar_last_scraped',
        'first_review', 'last_review', 'requires_license',
        'jurisdiction_names', 'instant_bookable', 'is_business_travel_ready',
        'cancellation_policy', 'require_guest_profile_picture',
        'require_guest_phone_verification',
        'translation missing: en.hosting_amenity_49', 'summary', 'space',
        'description', 'neighborhood_overview', 'notes', 'transit', 'access',
        'interaction', 'house_rules', 'thumbnail_url', 'medium_url',
        'xl_picture_url', 'host_location', 'host_about', 'host_response_time',
        'host_response_rate', 'host_acceptance_rate', 'state',
        'neighbourhood_cleansed', 'host_neighbourhood', 'license',
        'review_scores_rating', 'review_scores_accuracy',
        'review_scores_cleanliness', 'review_scores_checkin',
        'review_scores_communication', 'review_scores_location',
        'review_scores_value', 'weekly_price', 'monthly_price',
        'security_deposit', 'cleaning_fee', 'market'
    ]
    # A single drop call; a missing column still raises KeyError.
    listings_df = listings_df.drop(columns_to_drop, axis=1)

    # Remove rows that don't have an estimated income per month.  dropna
    # returns a new frame, so the edits below never touch a slice of the
    # original (avoids SettingWithCopy problems).
    listings_df = listings_df.dropna(subset=['estimated_income_per_month'])

    # Dropping square feet because 7450 out of 7712 (97%) rows are null
    listings_df = listings_df.drop(['square_feet'], axis=1)

    # Fill values going forward (fillna(method='ffill') is deprecated)
    listings_df = listings_df.ffill()

    # Convert zipcode to string rather than float
    listings_df['zipcode'] = listings_df['zipcode'].astype('int').astype('str')

    # Convert $ amount for extra people from string to float by stripping
    # the first character
    listings_df['extra_people'] = listings_df['extra_people'].apply(
        lambda s: s[1:]).astype('float')

    # Columns from position 13 up to (not including) the last one are used
    # as the amenity features; the last column is the regression target
    # (presumably 'estimated_income_per_month' -- confirm against the CSV).
    amenities = listings_df.iloc[:, 13:-1]
    y = np.ravel(listings_df.iloc[:, [-1]])

    # Select 20 top amenities.  n_features_to_select is passed by keyword:
    # scikit-learn >= 1.0 rejects it as a positional argument.
    select = RFE(LinearRegression(), n_features_to_select=20).fit(amenities, y)

    # Remove amenities that weren't selected
    remove_cols = [
        col for col, keep in zip(amenities.columns.values,
                                 select.get_support())
        if not keep
    ]
    listings_df = listings_df.drop(remove_cols, axis=1)

    listings_df = pd.get_dummies(listings_df)

    estimated_income = listings_df['estimated_income_per_month']
    listings_df = listings_df.drop(['estimated_income_per_month'], axis=1)

    return listings_df, estimated_income
print("MSE: %0.2f" % mean_squared_error(Y, predicted))

# Support vector regression (RBF kernel): cross-validated R2 and MSE.
print('\nsupport vector machine rbf')
clf = svm.SVR(epsilon=0.2, kernel='rbf', C=1.)
scores = cross_val_score(clf, X, Y, cv=cv)
print("mean R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = cross_val_predict(clf, X, Y, cv=cv)
print("MSE: %0.2f" % mean_squared_error(Y, predicted))

# k-nearest-neighbours regression: cross-validated R2 and MSE.
print('\nknn')
knn = KNeighborsRegressor()
scores = cross_val_score(knn, X, Y, cv=cv)
print("mean R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = cross_val_predict(knn, X, Y, cv=cv)
print("MSE: %0.2f" % mean_squared_error(Y, predicted))

# Recursive feature elimination on the linear model.
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument.
best_features = 4
rfe_lin = RFE(lin, n_features_to_select=best_features).fit(X, Y)
supported_features = rfe_lin.get_support(indices=True)
# Walk the selected indices themselves rather than a hard-coded range(0, 4),
# so the listing follows best_features.
for i, z in enumerate(supported_features):
    print(i + 1, boston.feature_names[z])

print('feature selection on linear regression')
# The selector fitted above is reused; refitting the same estimator on the
# same data was redundant.
mask = np.array(rfe_lin.support_)
scores = cross_val_score(lin, X[:, mask], Y, cv=cv)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = cross_val_predict(lin, X[:, mask], Y, cv=cv)
print("MSE: %0.2f" % mean_squared_error(Y, predicted))
print('feature selection ridge regression')
rfe_ridge = RFE(ridge, n_features_to_select=best_features).fit(X, Y)
示例#23
0
print(model.feature_importances_)

feature_imp = pd.DataFrame({'Features': df5.columns.tolist(),
                            'Importance': model.feature_importances_})

'''

# RFE
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LinearRegression
#from sklearn.ensemble import RandomForestRegressor

# feature extraction
model = LinearRegression()
#model = RandomForestRegressor()
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument.
rfe = RFE(model, n_features_to_select=10, step=1)
#rfe = RFECV(model, step=1, cv=5)
fit = rfe.fit(X, y)

# Format inside the print call: on Python 3, print(...) returns None, so
# applying % to its result raises TypeError.  The one-element tuples keep
# array values from being unpacked as multiple format arguments.
print("Num Features: %d" % (fit.n_features_,))
print("Selected Features: %s" % (fit.support_,))
print("Feature Ranking: %s" % (fit.ranking_,))

# One row per column of df5: its name, whether RFE kept it, and its rank.
feature_imp = pd.DataFrame({
    'Features': df5.columns.tolist(),
    'Select': fit.support_,
    'Rank': fit.ranking_
})

feature_imp.to_csv(
    'C:/Users/Jie.Hu/Desktop/Correlation/corr_0906/feature_imp_rf.csv',
示例#24
0
x_train.shape

x_train

from sklearn.linear_model import LogisticRegression
#Train the model
model = LogisticRegression()
model.fit(x_train, y_train)  #Training the model

x_train.shape[1]

#recursive feature elimination
from sklearn.feature_selection import RFE

logreg = LogisticRegression()
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument.
# NOTE(review): asking for x_train.shape[1] features keeps every column, so
# nothing is eliminated and every feature gets rank 1 -- confirm the
# intended number of features.
rfe = RFE(logreg, n_features_to_select=x_train.shape[1])
rfe = rfe.fit(x_train, y_train)
print(rfe.support_)
print(rfe.ranking_)

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Predictions come from `model` (fitted on all columns), not from the RFE
# selector above.
predictions = model.predict(x_test)
print(predictions)  # printing predictions

print()  # Printing new line

#Check precision, recall, f1-score
print(classification_report(y_test, predictions))
print("Data Types:", dataframe.dtypes)

dataset = dataframe.values

# In[11]:

#splitting dataset
# Predictors are columns 0..13 only.  The previous slice 0:15 also included
# column 14, which is the target Y1 below, so feature selection for Y1 was
# fed the answer it was supposed to predict.
X = dataset[:, 0:14]
Y1 = dataset[:, 14]  #gt_c_decay
Y2 = dataset[:, 15]  #gt_t_decay

# In[12]:

#Feature Selection for gt_c_decay
estimator = ExtraTreesRegressor()
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument.
rfe = RFE(estimator, n_features_to_select=3)
fit = rfe.fit(X, Y1)

print("Number of Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

# In[13]:

#Feature Selection for gt_t_decay
estimator = ExtraTreesRegressor()
rfe = RFE(estimator, n_features_to_select=3)
fit = rfe.fit(X, Y2)

print("Number of Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
示例#26
0
 X_train_nol_sel = sk1.transform(X_train_nol)
 X_test_nol_sel = sk1.transform(X_test_nol)
 selected1 = sk1.get_support()
 # logger.debug("1st feature selection accomplished")
 if not os.path.exists(
         "Results/selected_{f}".format(f=featureName)):
     os.mkdir("Results/selected_{f}".format(f=featureName))
 savemat(
     "Results/selected_{f}/selected1_{c}_{fold}.mat".format(
         f=featureName, c=iterCount, fold=fold),
     {'data': selected1})
 for feature_num in range(10, X_train_nol_sel.shape[1], 1):
     for C in np.logspace(-4, 4, 9, base=2):
         # the second step feature selection
         lr = LinearRegression()
         rfe = RFE(lr, n_features_to_select=feature_num)
         rfe.fit(X_train_nol_sel, y_train)
         selected2 = rfe.support_
         # svm classification
         clf = svm.SVC(kernel='linear',
                       C=C).fit(X_train_nol_sel[:, selected2],
                                y_train)
         score = clf.score(X_test_nol_sel[:, selected2], y_test)
         y_score = clf.decision_function(
             X_test_nol_sel[:, selected2])
         res = clf.predict(X_test_nol_sel[:, selected2])
         ACC, SEN, SPE = model_evaluate(test_label=y_test,
                                        res_label=res)
         if score > acc_max:
             COp = C
             featureNum_op = feature_num
示例#27
0
from sklearn.preprocessing import MinMaxScaler
# Scale features to [0, 1]; the scaler is fitted on the training split only
# and then applied to the test split.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

print('Accuracy of Logistic regression classifier on training set: {:.2f}'.
      format(logreg.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'.format(
    logreg.score(X_test, y_test)))

from sklearn.feature_selection import RFE
classifier = LogisticRegression()
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument.
selector = RFE(classifier, n_features_to_select=10, step=1)
# NOTE(review): the selector and the cross-validation below use the full,
# unscaled X / y rather than the scaled training split -- confirm intent.
selector = selector.fit(X, y)
print(selector.support_)

# sklearn.cross_validation was removed from scikit-learn; cross_val_predict
# lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_predict
predicted = cross_val_predict(logreg, X, y, cv=10)
print(metrics.accuracy_score(y, predicted))
print(metrics.classification_report(y, predicted))

logit_model = sm.MNLogit(y, X)
result = logit_model.fit()
print(result.summary())

print(logreg.intercept_)
print(logreg.coef_)

# Binarize the output

# In[42]:


# Importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression


# In[43]:


# Running RFE with the number of output variables equal to 15
lm = LinearRegression()
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument.
rfe = RFE(lm, n_features_to_select=15)  # running RFE
rfe = rfe.fit(X_train, y_train)
print(rfe.support_)  # Printing the boolean results
print(rfe.ranking_)


# In[44]:


# Names of the columns RFE kept.
col = X_train.columns[rfe.support_]
print(col)


# In[45]:

示例#29
0
#

from sklearn import linear_model
# Linear regression without an intercept term, fitted on every column of
# scalstat except the response 'LifeExp'.
# NOTE(review): dropping the intercept presumably relies on scalstat being
# centred/standardised -- confirm where scalstat is built.
reg = linear_model.LinearRegression(fit_intercept=False)
X = scalstat.drop('LifeExp', axis=1)
reg.fit(X, scalstat.LifeExp)
# Bare expression: only displays a value in an interactive/notebook session.
reg.coef_

#

# Bare expression (notebook display); the model was built with
# fit_intercept=False.
reg.intercept_

#

from sklearn.feature_selection import RFE
# Eliminating down to a single feature makes ranking_ a complete ordering of
# the predictors (rank 1 is the last one standing).
selector = RFE(reg, n_features_to_select=1)
selector = selector.fit(X, scalstat.LifeExp)
selector.ranking_

#

# Column names sorted by ascending RFE rank.
X.columns[np.argsort(selector.ranking_)].tolist()

# ## Sample Splitting
#

import faraway.datasets.fat
fat = faraway.datasets.fat.load()

#
示例#30
0
 def rfe_tree(self):
     """Build an unfitted RFE selector driven by a default decision tree.

     The selector is configured to keep ``self.fs_num`` features.
     """
     return RFE(DecisionTreeClassifier(), n_features_to_select=self.fs_num)