def rfe_rf(self):
    """Return an unfitted RFE selector driven by a small random forest.

    The forest uses 5 trees with a maximum depth of 3; the number of
    features to keep is taken from ``self.fs_num``.
    """
    forest = RandomForestClassifier(n_estimators=5, max_depth=3)
    return RFE(forest, n_features_to_select=self.fs_num)
for step in estimator: current_features = get_feature_out(step, current_features) features_out = current_features else: features_out = get_feature_out(estimator, features) output_features.extend(features_out) elif estimator == 'passthrough': output_features.extend(ct._feature_names_in[features]) return output_features # Create the classifier object selected_classifier = "SVC" classifier = SVC(kernel="linear", probability=True, class_weight="balanced") selector = RFE(classifier, n_features_to_select=10, step=0.05) # A pipeline chains two algorithms together so that the training process for both can be done in a single step and data is passed automatically from one to the other pipeline = Pipeline([("preprocessor", preprocess_pipeline), ("RFE", selector), ("classifier", classifier)]) #print(classifier.get_params().keys()) # Dictionary that contains the values for the parameter sweep #param_grid = dict(RFE__n_features_to_select=[2,3,4,5,6,7,8,9,10], classifier__C=[0.001, 0.01, 0.1, 1, 10, 100, 1000], classifier__gamma=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]) param_grid = dict(RFE__n_features_to_select=[10, 20, 30, 40, 50], classifier__C=[0.001, 0.01, 0.1, 1, 10, 100, 1000], classifier__gamma=[1, 0.1, 0.001, 0.0001]) scores = [] accuracy_scores = []
a digit classification task. .. note:: See also :ref:`example_feature_selection_plot_rfe_with_cross_validation.py` """ print(__doc__) from sklearn.svm import SVC from sklearn.datasets import load_digits from sklearn.feature_selection import RFE import matplotlib.pyplot as plt # Load the digits dataset digits = load_digits() X = digits.images.reshape((len(digits.images), -1)) y = digits.target # Create the RFE object and rank each pixel svc = SVC(kernel="linear", C=1) rfe = RFE(estimator=svc, n_features_to_select=1, step=1) rfe.fit(X, y) ranking = rfe.ranking_.reshape(digits.images[0].shape) # Plot pixel ranking plt.matshow(ranking, cmap=plt.cm.Blues) plt.colorbar() plt.title("Ranking of pixels with RFE") plt.show()
def backward_feature_selection(data_set, y_values, number_of_features):
    """Keep the ``number_of_features`` best columns of ``data_set`` using RFE.

    A ``LinearRegression`` is used as the ranking estimator.

    Args:
        data_set: Feature matrix passed to ``RFE.fit_transform``.
        y_values: Target values passed to ``RFE.fit_transform``.
        number_of_features: Number of features RFE should retain.

    Returns:
        The result of ``RFE.fit_transform``, i.e. ``data_set`` reduced to
        the selected features.
    """
    # n_features_to_select must be passed by keyword: recent scikit-learn
    # releases reject it as a positional argument.
    selector = RFE(LinearRegression(), n_features_to_select=number_of_features)
    return selector.fit_transform(data_set, y_values)
# 1) Plot the feature importances, most important first, together with
#    their cumulative sum.
feat_names = leData2.drop(['Attrition'], axis=1).columns
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(12, 6))
plt.title("Feature importances by DecisionTreeClassifier")
plt.bar(range(len(indices)), importances[indices], color='lightblue', align="center")
plt.step(range(len(indices)), np.cumsum(importances[indices]), where='mid', label='Cumulative')
plt.xticks(range(len(indices)), feat_names[indices], rotation='vertical', fontsize=14)
plt.xlim([-1, len(indices)])
# Save before show(): once show() returns the figure may already be closed,
# in which case savefig() would write an empty image.
plt.savefig('DecisionTreeFeaturesImportance')
plt.show()

# 2) Feature Selection using Recursive Feature Elimination (RFE)
model = LogisticRegression()
# n_features_to_select is keyword-only in current scikit-learn releases.
rfe = RFE(model, n_features_to_select=15)  # Number of Features Selected
rfe = rfe.fit(X, y)
print("Num Features: %s" % (rfe.n_features_))
print("Selected Features: %s" % (rfe.support_))
print("Feature Ranking: %s" % (rfe.ranking_))
sf = rfe.support_
fr = rfe.ranking_
featureNames = list(X.columns.values)
# Collect the results in one frame: the support mask is the single data
# column, the RFE ranking is used as the row index, and the feature names
# are added as an extra column.
RFE_df = pd.DataFrame(data=sf, index=fr)
RFE_df['featureNames'] = featureNames
# Columns 1..26 are the features, column 0 is the target.
X = array[:, 1:27]
Y = array[:, 0]

# Disabled experiment: univariate selection for non-negative features.
# test = SelectKBest(score_func=chi2, k=4)
# fit = test.fit(X, Y)
# # summarize scores
# set_printoptions(precision=3)
# print(fit.scores_)
# features = fit.transform(X)
# # summarize selected features
# print(features[0:5,:])

# Recursive feature elimination with a logistic regression, keeping 3 features.
model = LogisticRegression()
# n_features_to_select is keyword-only in current scikit-learn releases.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)
# The % formatting must happen inside print(); applying it to the result of
# print() (None) raises TypeError on Python 3.
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)
support = fit.support_
rank = fit.ranking_
# Print the name of every selected feature.
# NOTE(review): names[i] is paired with feature i of X; if `names` also lists
# the target column first, the printed names are shifted by one — confirm.
for name, selected in zip(names, support):
    if selected:
        print(name)

# PCA
# Feature Importance with Extra Trees Classifier
model = ExtraTreesClassifier()
def test():
    """Fit and report heart-failure logistic models from ``patientData.csv``.

    Steps, in order:
      1. Load the CSV, drop rows with missing values and print basic info.
      2. One-hot encode the categorical columns.
      3. Split 85/15 and oversample the training part with SMOTE.
      4. Fit an RFE selector (2 features) on the oversampled data.
      5. Fit a statsmodels ``Logit`` on a fixed column list and print its
         summary.
      6. Re-split the oversampled data 80/20, fit a scikit-learn
         ``LogisticRegression`` and print its held-out accuracy and
         predictions.

    Returns:
        None. All results are printed.
    """
    import statsmodels.api as sm
    from imblearn.over_sampling import SMOTE
    from sklearn.feature_selection import RFE

    data = pd.read_csv('patientData.csv', header=0)
    data = data.dropna()
    print(data.shape)
    print(list(data.columns))
    #print(data['HEARTFAILURE'].value_counts())
    #sns.countplot(x= 'HEARTFAILURE',data=data,palette='hls')

    # Disabled exploratory snippet:
    # count_no_fail = len(data[data['HEARTFAILURE']==0])
    # count_fail = len(data[data['HEARTFAILURE']==1])
    # pct_of_no_fail = count_no_fail/(count_no_fail+count_fail)
    # print("percentage of no subscription is", pct_of_no_fail*100)
    # pct_of_fail = count_fail/(count_no_fail+count_fail)
    # print("percentage of subscription", pct_of_fail*100)

    # numeric_only=True: the categorical columns are still present here and
    # recent pandas raises instead of silently skipping them.
    print(data.groupby('HEARTFAILURE').mean(numeric_only=True))
    #logit_model = sm.Logit(y,X)
    #matplotlib inline

    # Disabled exploratory plots:
    # pd.crosstab(data.SMOKERLAST5YRS,data.HEARTFAILURE).plot(kind ='bar')
    # plt.title('smoke and heartfailure')
    # plt.xlabel('smoke')
    # plt.ylabel("failure")
    # plt.savefig('smoke')
    # data.PALPITATIONSPERDAY.hist()
    # plt.title('Histogram of palp')
    # plt.xlabel('palp')
    # plt.ylabel('failure')
    # plt.savefig('palpitations')

    # Replace each categorical column by its dummy columns.
    cat_vars = ['SEX', 'FAMILYHISTORY', 'SMOKERLAST5YRS']
    for var in cat_vars:
        data = data.join(pd.get_dummies(data[var], prefix=var))
    to_keep = [name for name in data.columns.values.tolist() if name not in cat_vars]
    dataFinal = data[to_keep]

    X = dataFinal.loc[:, dataFinal.columns != 'HEARTFAILURE']
    y = dataFinal.loc[:, dataFinal.columns == 'HEARTFAILURE']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)
    columns = X_train.columns
    # Named `smote`, not `os`, so the stdlib module name is not shadowed.
    # fit_resample is the supported name; fit_sample was removed from
    # imbalanced-learn.
    smote = SMOTE(random_state=0)
    os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
    os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
    os_data_y = pd.DataFrame(data=os_data_y, columns=['HEARTFAILURE'])

    n_total = len(os_data_X)
    n_negative = len(os_data_y[os_data_y['HEARTFAILURE'] == 0])
    n_positive = len(os_data_y[os_data_y['HEARTFAILURE'] == 1])
    print("length of oversampled data is ", n_total)
    print("Number of no subscription in oversampled data", n_negative)
    print("Number of subscription", n_positive)
    print("Proportion of no subscription data in oversampled data is ", n_negative / n_total)
    print("Proportion of subscription data in oversampled data is ", n_positive / n_total)

    logreg = LogisticRegression()
    # n_features_to_select is keyword-only in current scikit-learn releases.
    rfe = RFE(logreg, n_features_to_select=2)
    rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
    # print(rfe.support_)
    # print(rfe.ranking_)
    # print(rfe.estimator_)

    # Hand-picked predictor set used for the final models.
    col = [
        'PALPITATIONSPERDAY', 'BMI', 'AVGHEARTBEATSPERMIN', 'AGE',
        'EXERCISEMINPERWEEK', 'SEX_F', 'SEX_M', 'FAMILYHISTORY_N',
        'FAMILYHISTORY_Y', 'SMOKERLAST5YRS_N', 'SMOKERLAST5YRS_Y'
    ]
    #col = ['PALPITATIONSPERDAY', 'CHOLESTEROL', 'BMI', 'AVGHEARTBEATSPERMIN', 'AGE', 'EXERCISEMINPERWEEK', 'FAMILYHISTORY_N', 'FAMILYHISTORY_Y', 'SMOKERLAST5YRS_N', 'SMOKERLAST5YRS_Y']
    #col = ['BMI', 'SEX_M','SEX_F','AVGHEARTBEATSPERMIN', 'FAMILYHISTORY_N', 'AGE','FAMILYHISTORY_Y', 'SMOKERLAST5YRS_N', 'SMOKERLAST5YRS_Y']
    y = os_data_y['HEARTFAILURE']
    X = os_data_X[col]

    final = sm.Logit(y, X)
    result = final.fit()
    print(result.summary2())

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    # Score on the held-out split only; scoring on the full X, y would mix
    # training rows into a figure that is reported as test-set accuracy.
    print('Accuracy of logistic regression classifier on test set:',
          logreg.score(X_test, y_test))
    print(y_pred)
from sklearn.feature_selection import RFE, SelectKBest, chi2, SelectFpr
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# Compare the feature indices chosen by three selectors on the same data:
# chi2 k-best, chi2 false-positive-rate, and RFE over a random forest.
df = pd.read_csv('Train_CV_Data.csv')
X_train = np.asarray(df.loc[:2000000, 'srcPort':'HTTPM4'])
Y_train = np.asarray(df.loc[:2000000, 'malicious'], dtype=np.int32)
# Number of rows labelled 1.
print((Y_train == 1).sum())

# Univariate chi2 selection: fixed number of features.
kBest = SelectKBest(chi2, k=12).fit(X_train, Y_train)
mask1 = kBest.get_support(indices=True)

# Univariate chi2 selection: false-positive-rate threshold.
fpr = SelectFpr(chi2, alpha=0.0001).fit(X_train, Y_train)
mask2 = fpr.get_support(indices=True)

# Recursive elimination, one feature per step, ranked by a random forest.
rf = RandomForestClassifier(n_estimators=50)
rfe = RFE(rf, n_features_to_select=12, step=1).fit(X_train, Y_train)
mask3 = rfe.get_support(indices=True)

for label, mask in (
    ('K-Best Feat :', mask1),
    ('False Positive based :', mask2),
    ('RFE based :', mask3),
):
    print(label, mask)
def test_number_of_subsets_of_features():
    """Check two equivalent formulas for the number of feature subsets.

    In RFE, 'number_of_subsets_of_features'
      = the number of iterations in '_fit'
      = max(ranking_)
      = 1 + (n_features + step - n_features_to_select - 1) // step
    After optimization #4534, this number
      = 1 + np.ceil((n_features - n_features_to_select) / float(step))
    This test checks their equivalence; refer to #4534 and #3824.
    """

    def formula1(n_features, n_features_to_select, step):
        return 1 + ((n_features + step - n_features_to_select - 1) // step)

    def formula2(n_features, n_features_to_select, step):
        return 1 + np.ceil((n_features - n_features_to_select) / float(step))

    # RFE, as (n_features, n_features_to_select, step):
    #   case 1: n_features - n_features_to_select is divisible by step
    #   case 2: n_features - n_features_to_select is not divisible by step
    rfe_cases = [(11, 3, 2), (11, 3, 3)]
    for n_features, n_features_to_select, step in rfe_cases:
        rng = check_random_state(43)
        X = rng.normal(size=(100, n_features))
        y = rng.rand(100).round()
        rfe = RFE(
            estimator=SVC(kernel="linear"),
            n_features_to_select=n_features_to_select,
            step=step,
        )
        rfe.fit(X, y)
        # The subset count also equals the maximum of ranking_.
        expected_int = formula1(n_features, n_features_to_select, step)
        expected_ceil = formula2(n_features, n_features_to_select, step)
        assert np.max(rfe.ranking_) == expected_int
        assert np.max(rfe.ranking_) == expected_ceil

    # In RFECV, 'fit' calls 'RFE._fit'.
    # 'number_of_subsets_of_features' of RFE
    #   = the size of each score in 'cv_results_' of RFECV
    #   = the number of iterations of the for loop before optimization #4534
    # RFECV with n_features_to_select = 1, as (n_features, step):
    #   case 1: n_features - 1 is divisible by step
    #   case 2: n_features - 1 is not divisible by step
    n_features_to_select = 1
    rfecv_cases = [(11, 2), (10, 2)]
    for n_features, step in rfecv_cases:
        rng = check_random_state(43)
        X = rng.normal(size=(100, n_features))
        y = rng.rand(100).round()
        rfecv = RFECV(estimator=SVC(kernel="linear"), step=step)
        rfecv.fit(X, y)

        expected_int = formula1(n_features, n_features_to_select, step)
        expected_ceil = formula2(n_features, n_features_to_select, step)

        # TODO: Remove in v1.2 when grid_scores_ is removed
        msg = (
            r"The `grid_scores_` attribute is deprecated in version 1\.0 in "
            r"favor of `cv_results_` and will be removed in version 1\.2.")
        with pytest.warns(FutureWarning, match=msg):
            assert len(rfecv.grid_scores_) == expected_int
            assert len(rfecv.grid_scores_) == expected_ceil

        for scores in rfecv.cv_results_.values():
            assert len(scores) == expected_int
            assert len(scores) == expected_ceil
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = train_test_split(
    features,
    tpot_data['target'].values,
    random_state=42,
)

# Score on the training set was:0.9794871794871796
# Pipeline stages: RFE over extra-trees -> min-max scaling -> linear SVC.
exported_pipeline = make_pipeline(
    RFE(
        estimator=ExtraTreesClassifier(
            criterion="gini",
            max_features=0.7000000000000001,
            n_estimators=100,
        ),
        step=0.8,
    ),
    MinMaxScaler(),
    LinearSVC(C=10.0, dual=True, loss="squared_hinge", penalty="l2", tol=0.001),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# Baseline: evaluate on the full feature set.
acuu(dataX, dataY)
#--------------------------------------------------------------------------------------------------------------------
# Each section below reduces dataX to 17 columns with a different technique
# and re-runs the same evaluation.

# Univariate chi2 selection.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
test = SelectKBest(score_func=chi2, k=17)  # k is number of features
fit = test.fit(dataX, dataY)
train2 = test.transform(dataX)
acuu(train2, dataY)

# Recursive feature elimination with a logistic regression.
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
model = LogisticRegression()
# n_features_to_select is keyword-only in current scikit-learn releases.
rfe = RFE(model, n_features_to_select=17)
fit = rfe.fit(dataX, dataY)
train2 = fit.transform(dataX)
acuu(train2, dataY)

# Principal component analysis.
from sklearn.decomposition import PCA
pca = PCA(n_components=17)
fit = pca.fit(dataX, dataY)
train2 = pca.transform(dataX)
acuu(train2, dataY)

import warnings
warnings.filterwarnings("ignore")
plt.show()

# Separate features and labels: column 0 is the label, the rest are features.
datos = np.array(datos)
datosy = datos[:, 0]
datosx = datos[:, 1:]
# Remove feature 11 (index 10).
datosx = np.delete(datosx, 10, 1)
# Remove feature 16: its index was 15, and it shifted to 14 after the
# previous deletion.
datosx = np.delete(datosx, 14, 1)

# Feature selection: keep 13 features, ranked by a linear SVR.
estimator = SVR(kernel="linear")
# n_features_to_select is keyword-only in current scikit-learn releases.
selector = RFE(estimator, n_features_to_select=13)
datosx = selector.fit_transform(datosx, datosy)
print(selector.ranking_)
print(selector.support_)

# Split the data into train and test sets.
trainx, testx, trainy, testy = train_test_split(datosx, datosy, test_size=0.3, shuffle=True)

# Neural Networks
print('Resultados Redes Neuronales: ')
W = np.random.randint(low=0, high=2, size=len(testy))
'anomalous(wrongSetUp)': 6, 'normal': 7 } train_data = pd.read_csv('dataset/balanced_noTimestamp_mixTrain.csv') label_index = len(train_data.iloc[0][:]) - 1 train_labels = train_data.iloc[:, -1] # separate labels of training sets train_data.drop(train_data.columns[label_index], axis=1, inplace=True) test_data = pd.read_csv('dataset/balanced_noTimestamp_mixTest.csv') test_labels = test_data.iloc[:, -1] # separate labels of testing set test_data.drop(test_data.columns[label_index], axis=1, inplace=True) dt_clf = DecisionTreeClassifier() # Train DecisionTreeClassifier selector_dt = RFE(dt_clf, None, step=1).fit(train_data, train_labels) predicted_test_dt = selector_dt.predict(test_data) # rf_clf = RandomForestClassifier(max_depth=6, random_state=0) # RandomForestClassifier # selector_rf = RFE(rf_clf, None, step=1).fit(train_data, train_labels) # predicted_test_rf = selector_rf.predict(test_data) knn_clf = KNeighborsClassifier(n_neighbors=5).fit( train_data, train_labels) # Train KNN classifier predicted_test_knn = knn_clf.predict(test_data) # Train SVM classifier svc_clf = svm.SVC(gamma='auto', kernel='rbf', decision_function_shape='ovo', max_iter=-1,
testX = X[1240:1280, :] testY = Y[1240:1280] trainX = X[0:1240, :] trainY = Y[0:1240] else: testX = X[subNo * trialNum:(subNo + 1) * trialNum, :] testY = Y[subNo * trialNum:(subNo + 1) * trialNum] trainX = np.vstack( (X[0:subNo * trialNum, :], X[(subNo + 1) * trialNum:subNum * trialNum, :])) trainY = np.concatenate( (Y[0:subNo * trialNum], Y[(subNo + 1) * trialNum:subNum * trialNum])) clf = svm.SVC(kernel='linear') sel_criteria = RFE(estimator=clf, n_features_to_select=num_k, step=0.5).fit(trainX, trainY) sel_indx_mask = sel_criteria.get_support() sel_indx = np.where(sel_indx_mask == True) sel_indx = sel_indx[0] trainX = trainX[:, sel_indx] testX = testX[:, sel_indx] #svm clf1 = svm.SVC(kernel='linear') clf1.fit(trainX, trainY) predict_testY = clf1.predict(testX) f1_scores[no_k, subNo] = metrics.f1_score(testY, predict_testY) acc_scores[no_k, subNo] = metrics.accuracy_score(testY, predict_testY) print('current sub performance:', acc_scores[no_k, subNo], ' kbest:', num_k)
## FUNCTIONS: RFE, RFECV
## DOCUMENTATION: http://scikit-learn.org/stable/modules/feature_selection.html
## DATA: Crime (n=319 non-null, p=122, type=regression)
## DATA DICTIONARY: http://archive.ics.uci.edu/ml/datasets/Communities+and+Crime

# define X and y (the last column is the response)
X = crime.iloc[:, :-1]
y = crime.iloc[:, -1]

# split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# select "best" features (half of them by default)
lm = LinearRegression()
from sklearn.feature_selection import RFE
selector = RFE(lm)
selector.fit(X_train, y_train)
# The bare attribute expressions below are meant for interactive inspection.
selector.n_features_
selector.support_
selector.ranking_

# let RFECV select the "optimal" number of features
from sklearn.feature_selection import RFECV
# scikit-learn scorers follow "greater is better", so the MSE scorer is
# exposed as 'neg_mean_squared_error'; the bare 'mean_squared_error' name is
# rejected by current releases.
selector = RFECV(lm, cv=3, scoring='neg_mean_squared_error')
selector.fit(X, y)
selector.n_features_
selector.support_
selector.ranking_

# *tentative* advice for usage:
# 1. scale features, then use RFECV to select the number of features (p)
from sklearn.metrics import classification_report

# Report, for split "1", the confusion matrix of each already-fitted
# classifier against the held-out labels.
print('\n1:')
print("DTC confusioin_matrix:\n", confusion_matrix(y_test, y_predict_dtc))
print("\nKNC confusioin_matrix:\n", confusion_matrix(y_test, y_predict_knc))
print("\nRFC confusioin_matrix:\n", confusion_matrix(y_test, y_predict_rfc))
print("\nGBC confusioin_matrix:\n", confusion_matrix(y_test, y_predict_gbc))
print("\nAda confusioin_matrix:\n", confusion_matrix(y_test, y_predict_abc))
print("\nSVC confusioin_matrix:\n", confusion_matrix(y_test, y_predict_svc))
print("\nGauNB confusioin_matrix:\n", confusion_matrix(y_test, y_predict_gnb))
print("\nLR confusioin_matrix:\n", confusion_matrix(y_test, y_predict_lr))
# One-line summary of each classifier's score() on the test split.
print("1:","KNC:",knc.score(x_test,y_test),'DTC:',dtc.score(x_test,y_test),"RFC:",rfc.score(x_test,y_test),"GBC:",gbc.score(x_test,y_test),\
      "Ada:",abc.score(x_test,y_test),"SVC:",svc.score(x_test,y_test),"GauNB:",gnb.score(x_test,y_test),\
      "LR:",LR.score(x_test,y_test))

# Flatten Y to a 1-D array of length c (its number of rows).
# Presumably Y is a single-column DataFrame here — TODO confirm.
c, r = Y.shape
Y = Y.values
Y = Y.reshape(c, )

# Fresh, unfitted estimators; only gnb1 is used in the lines below.
dtc1 = DecisionTreeClassifier()
gbc1 = GradientBoostingClassifier()
gnb1 = GaussianNB()
# Rank features by recursive elimination, removing one feature per step
# until 5 remain.
# NOTE(review): RFE ranks features through the estimator's coef_ or
# feature_importances_; GaussianNB appears to expose neither, so this fit
# may raise — confirm, and consider a tree-based estimator such as dtc1.
rfe = RFE(estimator=gnb1, n_features_to_select=5, step=1)
rfe.fit(X, Y)
ranking = rfe.ranking_
print("gnb RFE ranking:\n", ranking)
#print(X.index)
#print(X.iloc[0])
plt.title('Comparison of different Feature Importances') plt.show() # - # ### Recursive Feature Elimination # + from sklearn.feature_selection import RFE from sklearn import ensemble from yellowbrick.features import RFECV ## RFE rf = RandomForestClassifier(random_state=42) model = RFE(rf, n_features_to_select=50) fit_model = model.fit(X_train_prepared, y_train) features = pd.DataFrame(list(zip(X_train_prepared.columns, fit_model.ranking_)), columns=['predictor', 'ranking']) # - features = features.sort_values(by='ranking') ## RFE and Tree based feature importance signify that features with rank greater than 3 in RFE are insignificant chosen_features = features[features['ranking'] < 3] chosen_features.shape # ### Sequential Feature Selection '''
def create_model(number, features):
    """Build, evaluate and report a logistic claim model on carInsurance.csv.

    Pipeline: load the CSV, strip currency formatting, plot distributions of
    the columns to impute, impute missing values, one-hot encode the
    categorical columns, select ``features`` predictors with RFE on scaled
    data, evaluate a logistic regression with 3-fold cross-validation, then
    print a summary and show ROC, cumulative-gain and lift charts for the
    last fold.

    Args:
        number: Label used only in the progress and summary messages.
        features: Number of predictors RFE should keep.

    Returns:
        None. Results are printed and plotted.
    """
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import (average_precision_score,
                                 classification_report, roc_auc_score)

    print(f"\nExecuting Model {number}")
    PATH = "carInsurance.csv"
    # Single source of truth for the CSV header, also used by the exploratory
    # helper below.
    column_names = [
        "ID", "KIDSDRIV", "BIRTH", "AGE", "HOMEKIDS", "YOJ", "INCOME",
        "PARENT1", "HOME_VAL", "MSTATUS", "GENDER", "EDUCATION", "OCCUPATION",
        "TRAVTIME", "CAR_USE", "BLUEBOOK", "TIF", "CAR_TYPE", "RED_CAR",
        "OLDCLAIM", "CLM_FREQ", "REVOKED", "MVR_PTS", "CLM_AMT", "CAR_AGE",
        "CLAIM_FLAG", "URBANICITY"
    ]
    df = pd.read_csv(PATH,
                     skiprows=1,
                     encoding="ISO-8859-1",
                     sep=',',
                     names=column_names)

    # Show all columns.
    pd.set_option('display.max_columns', 1000)
    pd.set_option('display.width', 1000)

    # Exploratory Analysis (helper kept for manual use; not called below).
    def exploratory_analysis(df):
        print(df.columns)  # list all column names
        print(df.shape)  # get number of rows and columns
        print(df.info())  # additional info about dataframe
        print(df.describe())  # statistical description, numeric columns only
        for column in column_names:
            # count unique values in a column
            print(df[column].value_counts(dropna=False))

    # End of Exploratory Analysis

    # Convert Columns with $ in entry
    def convert_dollar_sign_columns(df):
        columns_with_dollar_sign = [
            'INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM', 'CLM_AMT'
        ]
        for column in columns_with_dollar_sign:
            # Assign the cleaned column back instead of calling
            # replace(inplace=True) on a column selection, which is not
            # guaranteed to modify df. The pattern drops every non-digit
            # character, including '$', ',' and '.'.
            digits_only = df[column].replace(to_replace=r'\D+',
                                             value='',
                                             regex=True)
            df[column] = pd.to_numeric(digits_only)
        return df

    df = convert_dollar_sign_columns(df=df)

    # End of Conversion

    # Imputation of Empty or NaN in Columns
    def convert_na_cells(colName, df, measureType):
        """Add 'm_<col>' (1 if imputed, else 0) and 'imp_<col>' columns."""
        indicatorColName = 'm_' + colName  # Tracks whether imputed.
        imputedColName = 'imp_' + colName  # Stores original & imputed data.

        # Pick the fill value; anything other than median or mode means mean.
        if measureType == "median":
            imputedValue = df[colName].median()
        elif measureType == "mode":
            # mode() returns a Series, possibly with several tied values;
            # take the first one explicitly.
            imputedValue = float(df[colName].mode().iloc[0])
        else:
            imputedValue = df[colName].mean()

        # Vectorised; the original column is always kept.
        df[indicatorColName] = df[colName].isna().astype(int)
        df[imputedColName] = df[colName].fillna(imputedValue)
        return df

    def analysis_of_income_for_imputation(df):
        plt.bar(["$0", "NaN"], [797, 570])
        plt.title("Occurences of Distinct Values for Income")
        plt.xlabel("Income")
        plt.ylabel("Occurences")
        plt.show()
        # Stats
        # Entries = 9732
        # Significant Non-Unique Occurences = {0: 797, "NaN": 570}
        # 1367 entries are significant non-unique. 8365 are entries remaining
        # that are relatively distinct.
        # Conclusion for imputation choice: Do not use mean or mode. Median is
        # ideal. 570 entries will be imputed.

    def analysis_of_age_for_imputation(df):
        occurences_of_age = df['AGE'].value_counts(dropna=False).to_dict()
        plt.bar(list(occurences_of_age.keys()),
                list(occurences_of_age.values()))
        plt.title("Occurences of Distinct Values for Age")
        plt.xlabel("Ages")
        plt.ylabel("Occurences")
        plt.show()
        # The distribution of the age plot is normal and balanced.
        # Imputation with mean is reliable and only 7 entries need to be
        # imputed so the imputation will not heavily impact our regressional
        # analysis later.

    def analysis_of_yoj_for_imputation(df):
        occurences_of_yoj = df['YOJ'].value_counts(dropna=False).to_dict()
        plt.bar(list(occurences_of_yoj.keys()),
                list(occurences_of_yoj.values()))
        plt.title("Occurences of Distinct Values for YOJ")
        plt.xlabel("YOJ")
        plt.ylabel("Occurences")
        plt.show()
        # The distribution is not normal. There are 800, 0 value entries of
        # the 9754 entry total.
        # The majority of entries are focused around the mean.
        # There are 548 entries missing.
        # The 0 entries could be significant so testing median or mode are
        # likely more reliable than mean since there are so many 0 value
        # entries.

    def analysis_of_home_val_for_imputation(df):
        plt.bar(["$0", "NaN"], [2908, 575])
        plt.title("Occurences of Distinct Values for Home Values")
        plt.xlabel("Home Values")
        plt.ylabel("Occurences")
        plt.show()
        # Value   Occurences
        # 0.0     2908
        # NaN     575
        # 6819 entries are not 0 or NaN
        # 575 entries are missing (NaN)
        # STD is quite high and is explainable by the occurences of $0
        # entries. Our analysis of distinct values shows that mean and mode
        # would be invalid imputation methods for home_val. I will use median.
        #           count  mean    std       min  25%  50%     75%     max
        # HOME_VAL  9727   154523  129188.4  0    0    160661  238256  885282

    def analysis_of_car_age_for_imputation(df):
        occurences_of_car_age = df['CAR_AGE'].value_counts(
            dropna=False).to_dict()
        plt.bar(list(occurences_of_car_age.keys()),
                list(occurences_of_car_age.values()))
        plt.title("Occurences of Distinct Values for Car Age")
        plt.xlabel("Car Age")
        plt.ylabel('Occurences')
        plt.show()
        # Distribution is not normal. There are roughly 2450 cars with an
        # age of 1. The second highest occurring age is 8 at around 650. The
        # occurences of age 1 are not likely to be erroneous. Imputation
        # with car age should be done with mean or median to find the better
        # imputation method.

    # Treat outlier in CAR_AGE column: negative ages are clamped to 0.
    df['CAR_AGE'] = df['CAR_AGE'].mask(df['CAR_AGE'].lt(0), 0)

    def imputation_analysis(df):
        analysis_of_income_for_imputation(df)
        analysis_of_age_for_imputation(df)
        analysis_of_yoj_for_imputation(df)
        analysis_of_home_val_for_imputation(df)
        analysis_of_car_age_for_imputation(df)

    imputation_analysis(df)

    df = convert_na_cells("INCOME", df, "median")
    df = convert_na_cells("AGE", df, "mean")
    df = convert_na_cells("YOJ", df, "mode")
    df = convert_na_cells("HOME_VAL", df, "median")
    df = convert_na_cells("CAR_AGE", df, "mean")
    # End of Imputation

    # Dummy Variables: Handling Categorial and Ordinal/Nominal Columns
    # Treating categorical (string) information.
    df = pd.get_dummies(df,
                        columns=[
                            'PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION',
                            'OCCUPATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR',
                            'REVOKED', 'URBANICITY'
                        ])
    # End of Dummy Variable Handling

    # Binning
    df['AGE_bin'] = pd.cut(x=df['AGE'], bins=[0, 17, 27, 37, 47, 57, 67, 77])
    tempDf = df['AGE_bin']  # Isolate columns
    # Get dummies
    dummyDf = pd.get_dummies(tempDf, columns=['AGE_bin'])
    df = pd.concat(([df, dummyDf]), axis=1)  # Join dummy df with original

    predictors_test = [
        'BLUEBOOK', 'OLDCLAIM', 'CLM_FREQ', 'CLM_AMT', 'imp_INCOME',
        'imp_YOJ', 'imp_HOME_VAL', 'imp_CAR_AGE', 'PARENT1_No', 'MSTATUS_Yes',
        'MSTATUS_z_No', 'GENDER_M', 'GENDER_z_F', 'EDUCATION_<High School',
        'EDUCATION_Bachelors', 'EDUCATION_Masters', 'EDUCATION_PhD',
        'EDUCATION_z_High School', 'OCCUPATION_Clerical', 'OCCUPATION_Doctor',
        'OCCUPATION_Home Maker', 'OCCUPATION_Lawyer', 'OCCUPATION_Manager',
        'OCCUPATION_Professional', 'OCCUPATION_Student',
        'OCCUPATION_z_Blue Collar', 'CAR_USE_Commercial', 'CAR_USE_Private',
        'CAR_TYPE_Minivan', 'CAR_TYPE_Panel Truck', 'CAR_TYPE_Pickup',
        'CAR_TYPE_Sports Car', 'CAR_TYPE_Van', 'CAR_TYPE_z_SUV', 'RED_CAR_no',
        'RED_CAR_yes', 'REVOKED_No', 'REVOKED_Yes',
        'URBANICITY_Highly Urban/ Urban', 'URBANICITY_z_Highly Rural/ Rural'
    ]
    X = df[predictors_test]
    y = df['CLAIM_FLAG']

    # Scale the data prior to selection.
    print("Please wait for scaling...")
    sc_x = StandardScaler()
    X_scaled = sc_x.fit_transform(X)

    print("Please wait for automated feature selection...")
    n_features = features
    print(f"Selecting {n_features} features")
    logreg = LogisticRegression(max_iter=200)
    # n_features_to_select is keyword-only in current scikit-learn releases.
    rfe = RFE(logreg, n_features_to_select=n_features)
    rfe = rfe.fit(X_scaled, y.values.ravel())
    print("Feature selection is complete.")

    def getSelectedColumns(ranking):
        """Return the names of the columns of X whose RFE rank is 1."""
        return [col for col, rank in zip(X, ranking) if rank == 1]

    selectedPredictorNames = getSelectedColumns(rfe.ranking_)

    # Show selected names from RFE.
    print("\n*** Selected Features:")
    for name in selectedPredictorNames:
        print(name)

    # Prepare cross validation with three folds and 1 as a random seed.
    # The arguments are keyword-only in current scikit-learn releases.
    count = 0
    kfold = KFold(n_splits=3, shuffle=True, random_state=1)

    # Separate into x and y values.
    X = df[selectedPredictorNames]
    y = df[['CLAIM_FLAG']]

    # Show chi-square scores for each feature.
    # There is 1 degree freedom since 1 predictor during feature evaluation.
    # Generally, >=3.8 is good)
    chi_selector = SelectKBest(score_func=chi2, k=n_features)
    XScaled = MinMaxScaler().fit_transform(X)
    chiScores = chi_selector.fit(XScaled, y)

    # Summarize scores
    np.set_printoptions(precision=3)

    # Search here for insignificant features.
    print("\nPredictor Chi-Square Scores: " + str(chiScores.scores_))

    for train_idx, test_idx in kfold.split(X):
        # Each fold is evaluated on its own split. A train_test_split call
        # here used to overwrite these frames, so every "fold" reported the
        # same fixed split.
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx, :]
        X_test = X.iloc[test_idx, :]
        y_test = y.iloc[test_idx, :]

        # Perform logistic regression.
        logisticModel = LogisticRegression(fit_intercept=True,
                                           random_state=0,
                                           solver='liblinear')
        # Fit the model.
        logisticModel.fit(X_train, np.ravel(y_train))
        y_pred = logisticModel.predict(X_test)
        y_prob = logisticModel.predict_proba(X_test)

        # Show confusion matrix and accuracy scores.
        cm = pd.crosstab(np.ravel(y_test),
                         y_pred,
                         rownames=['Actual'],
                         colnames=['Predicted'])
        count += 1
        print("\n***K-fold: " + str(count))
        print('\nAccuracy: ', metrics.accuracy_score(y_test, y_pred))
        print("\nConfusion Matrix")
        print(cm)
        print(classification_report(y_test, y_pred))

        average_precision = average_precision_score(y_test, y_pred)
        print('Average precision-recall score: {0:0.2f}'.format(
            average_precision))

        # calculate scores
        auc = roc_auc_score(
            y_test,
            y_prob[:, 1],
        )
        print('Logistic: ROC AUC=%.3f' % (auc))

    # The summary and charts below use the model and split of the last fold.
    # Show model coefficients and intercept.
    print(f"\nStatistical Summary of Model {number}")
    print("\nModel Intercept: ")
    print(logisticModel.intercept_)
    print("\nModel Coefficients: ")
    print(logisticModel.coef_)

    # Prediction with test data
    pred = logisticModel.predict(X_test)

    # Show stats about the regression.
    mse = mean_squared_error(y_test, pred)
    RMSE = np.sqrt(mse)
    print("\nRMSE: " + str(RMSE))
    print("\nr2_score", r2_score(y_test, pred))

    # ROC CURVE CHART, and CUMUL GAINS CHART
    def create_roc_curve_chart_and_cumul_chart():
        # calculate roc curves chart
        CUT_OFF = 0.50
        lr_fpr, lr_tpr, _ = roc_curve(y_test, y_prob[:, 1])
        plt.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
        plt.plot([0, 1], [0, 1], '--', label=f"CUT-OFF{CUT_OFF}")
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title("ROC CURVE")
        plt.legend()
        plt.show()

        # cumulative gains chart
        clf = LogisticRegression(random_state=0,
                                 multi_class='multinomial',
                                 solver='newton-cg')
        # ravel: the estimator expects a 1-D target, y_train is one column.
        clf.fit(X_train, np.ravel(y_train))
        predicted_probas = clf.predict_proba(X_test)
        import scikitplot as skplt
        skplt.metrics.plot_cumulative_gain(y_test, predicted_probas)
        skplt.metrics.plot_lift_curve(y_test, predicted_probas)
        plt.show()

    create_roc_curve_chart_and_cumul_chart()
    print(f"\nEnd of Model {number}")
def run_regressions_and_save_results(model,
                                     regression,
                                     dataframe,
                                     features_selection,
                                     results_df,
                                     parameters_dict=None,
                                     inputs=None):
    """Score ``regression`` on a feature subset and append the result rows.

    Args:
        model: Label stored in the 'Model' column of every appended row.
        regression: Estimator handed to ``do_regression`` and, in the RFE
            modes, to ``RFE`` as the ranking estimator.
        dataframe: Data containing the 'calories_per_ha' target column.
        features_selection: One of 'all', 'all_but_climzones', 'RFE',
            'RFE_but_climzones' or 'RFE_8_20'. Any other value leaves
            ``results_df`` untouched.
        results_df: DataFrame collecting one row per regression run.
        parameters_dict: Stored as-is in the 'params' column.
        inputs: Stored in the 'inputs' column, only in 'RFE_8_20' mode.

    Returns:
        ``results_df`` with the new rows appended (a new object whenever at
        least one row was added).
    """
    target = 'calories_per_ha'

    # Both "*_but_climzones" modes work on the reduced column set.
    if features_selection in ('all_but_climzones', 'RFE_but_climzones'):
        dataframe = dataframe[columns_without_climatezones]

    if features_selection in ('all', 'all_but_climzones'):
        if features_selection == 'all':
            print(' with all features...')
            features_label = 'All w/ climzones'
        else:
            print(' with all features but climate zones...')
            features_label = 'All w/o climzones'
        row = _score_row(model, regression, dataframe,
                         len(dataframe.columns) - 1, features_label,
                         parameters_dict)
        return _append_row(results_df, row)

    # mode -> (feature counts to try, progress-message prefix)
    rfe_plans = {
        'RFE': (range(5, 30), ' RFE with '),
        'RFE_but_climzones': (range(5, 30), 'RFE (no climzones) with '),
        'RFE_8_20': ([8, 20], 'RFE with '),
    }
    if features_selection not in rfe_plans:
        return results_df

    feature_counts, banner = rfe_plans[features_selection]
    predictors = dataframe.drop([target], axis=1)
    response = dataframe[target]

    for num_features in feature_counts:
        print(banner + str(num_features) + ' features ...')

        # n_features_to_select must be passed by keyword: scikit-learn >= 1.0
        # rejects it as a positional argument.
        selector = RFE(regression, n_features_to_select=num_features, step=1)
        # A fresh random split per feature count, as before; only the
        # training part is used to rank the features.
        X_train, _, y_train, _ = train_test_split(predictors, response)
        selector.fit(X_train, y_train)
        features_selected = [
            X_train.columns[feature_pos]
            for feature_pos in selector.get_support(indices=True)
        ]

        row = _score_row(model, regression,
                         dataframe[features_selected + [target]],
                         num_features, features_selected, parameters_dict)
        if features_selection == 'RFE_8_20':
            row['inputs'] = inputs
        results_df = _append_row(results_df, row)

    return results_df


def _score_row(model, regression, frame, num_features, features,
               parameters_dict):
    """Run do_regression on ``frame``, print both scores, return a result row."""
    scores = do_regression(regression, frame)
    r2_value, mse_value = scores[0], scores[1]
    print(' R2_score : ' + str(r2_value))
    print(' MSE_score : ' + str(mse_value))
    return {
        'Model': model,
        'num_features': num_features,
        'Features': features,
        'params': parameters_dict,
        'R2': r2_value,
        'MSE': mse_value,
    }


def _append_row(results_df, row):
    """Return ``results_df`` with ``row`` (a dict) added as its last row."""
    import pandas as pd

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([results_df, pd.DataFrame([row])], ignore_index=True)
l.append(tup) rd.seed(77) choice = [] for i in l: r = rd.random() if i[0] not in choice and i[1] not in choice: if r >= 0.5: choice.append(i[0]) else: choice.append(i[1]) df = df.loc[:, choice] # df = df.drop(choice,axis=1) return df, choice rfe = RFE(estimator=DecisionTreeClassifier(random_state=2), n_features_to_select=30) selector = rfe.fit(all_features_norm, survival_df_filtered['days_to_death']) selected_ind = np.where(selector.support_) all_features_selected = all_features_norm[ all_features_norm.columns[selected_ind]] events = survival_df_filtered['vital_status'].astype(bool) all_features_drop_low_var = DropLowVariance(all_features_selected, events) all_feature_names = [[feature_names[i] + '_' + str(j) for i in range(115)] for j in range(4)] all_feature_names_ls = list(chain.from_iterable(all_feature_names)) all_reduced_features = [ all_feature_names_ls[i] for i in list(all_features_drop_low_var.columns) ]
def get_data():
    """Load the Airbnb listings CSV and prepare features and target.

    Reads './airbnb_data.csv', drops columns that are unrelated to income or
    cannot be collected from a user, keeps only rows with an estimated
    income, forward-fills gaps, keeps the 20 amenity columns chosen by RFE
    on a linear regression, and one-hot encodes the remaining columns.

    Returns:
        tuple: ``(listings_df, estimated_income)`` where ``listings_df`` is
        the encoded feature frame without the target and
        ``estimated_income`` is the 'estimated_income_per_month' column.
    """
    # Read csv
    listings_df = pd.read_csv('./airbnb_data.csv', low_memory=False)

    # Drop columns that aren't related to income or not feasible to capture
    # from user
    columns_to_drop = [
        'Unnamed: 0', 'id', 'scrape_id', 'host_id',
        'host_total_listings_count', 'latitude', 'longitude',
        'availability_30', 'availability_60', 'availability_90',
        'availability_365', 'number_of_reviews',
        'calculated_host_listings_count', 'reviews_per_month', 'Other',
        'listing_url', 'last_scraped', 'host_name', 'experiences_offered',
        'picture_url', 'name', 'host_url', 'host_since', 'host_is_superhost',
        'host_thumbnail_url', 'host_picture_url', 'host_listings_count',
        'host_verifications', 'host_has_profile_pic',
        'host_identity_verified', 'street', 'city',
        'neighbourhood_group_cleansed', 'smart_location', 'country_code',
        'country', 'is_location_exact', 'amenities', 'price',
        'calendar_updated', 'has_availability', 'calendar_last_scraped',
        'first_review', 'last_review', 'requires_license',
        'jurisdiction_names', 'instant_bookable', 'is_business_travel_ready',
        'cancellation_policy', 'require_guest_profile_picture',
        'require_guest_phone_verification',
        'translation missing: en.hosting_amenity_49', 'summary', 'space',
        'description', 'neighborhood_overview', 'notes', 'transit', 'access',
        'interaction', 'house_rules', 'thumbnail_url', 'medium_url',
        'xl_picture_url', 'host_location', 'host_about',
        'host_response_time', 'host_response_rate', 'host_acceptance_rate',
        'state', 'neighbourhood_cleansed', 'host_neighbourhood', 'license',
        'review_scores_rating', 'review_scores_accuracy',
        'review_scores_cleanliness', 'review_scores_checkin',
        'review_scores_communication', 'review_scores_location',
        'review_scores_value', 'weekly_price', 'monthly_price',
        'security_deposit', 'cleaning_fee', 'market',
    ]
    # One drop call instead of one per column; still raises KeyError if a
    # listed column is missing.
    listings_df = listings_df.drop(columns=columns_to_drop)

    # Remove rows that don't have an estimated income per month
    listings_df = listings_df[
        listings_df['estimated_income_per_month'].notna()]

    # Dropping square feet because 7450 out of 7712 (97%) rows are null
    listings_df = listings_df.drop(columns=['square_feet'])

    # Fill values going forward (fillna(method='ffill') is deprecated)
    listings_df = listings_df.ffill()

    # Convert zipcode to string rather than float
    listings_df['zipcode'] = listings_df['zipcode'].astype('int').astype('str')

    # Convert $ amount for extra people from string to float by stripping
    # the leading character
    listings_df['extra_people'] = (
        listings_df['extra_people'].str[1:].astype('float'))

    # NOTE(review): assumes the amenity flags start at column 13 and that the
    # last column is the income target - confirm against the CSV layout.
    amenities = listings_df.iloc[:, 13:-1]
    y = listings_df.iloc[:, -1].to_numpy()

    # Select 20 top amenities; n_features_to_select must be a keyword
    # argument on scikit-learn >= 1.0.
    select = RFE(LinearRegression(), n_features_to_select=20).fit(amenities, y)

    # Remove amenities that weren't selected
    rejected_cols = amenities.columns[~select.get_support()]
    listings_df = listings_df.drop(columns=rejected_cols)

    listings_df = pd.get_dummies(listings_df)
    estimated_income = listings_df['estimated_income_per_month']
    listings_df = listings_df.drop(columns=['estimated_income_per_month'])
    return listings_df, estimated_income
# MSE of the cross-validated predictions computed just before this point.
print("MSE: %0.2f" % mean_squared_error(Y, predicted))

# --- Support vector regression with an RBF kernel ---
print('\nsupport vector machine rbf')
clf = svm.SVR(epsilon=0.2, kernel='rbf', C=1.)
scores = cross_val_score(clf, X, Y, cv=cv)
print("mean R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = cross_val_predict(clf, X, Y, cv=cv)
print("MSE: %0.2f" % mean_squared_error(Y, predicted))

# --- k-nearest-neighbours regression ---
print('\nknn')
knn = KNeighborsRegressor()
scores = cross_val_score(knn, X, Y, cv=cv)
print("mean R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = cross_val_predict(knn, X, Y, cv=cv)
print("MSE: %0.2f" % mean_squared_error(Y, predicted))

# --- Recursive feature elimination on the linear model ---
best_features = 4
# n_features_to_select must be a keyword argument on scikit-learn >= 1.0.
rfe_lin = RFE(lin, n_features_to_select=best_features).fit(X, Y)
supported_features = rfe_lin.get_support(indices=True)
# Iterate over what RFE actually kept rather than a hard-coded range(0, 4),
# so the listing stays correct if best_features changes.
for position, z in enumerate(supported_features, start=1):
    print(position, boston.feature_names[z])

print('feature selection on linear regression')
# The selector fitted above is reused: it is built from the same estimator,
# feature count and data as the refit it replaces.
mask = np.array(rfe_lin.support_)
scores = cross_val_score(lin, X[:, mask], Y, cv=cv)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = cross_val_predict(lin, X[:, mask], Y, cv=cv)
print("MSE: %0.2f" % mean_squared_error(Y, predicted))

print('feature selection ridge regression')
rfe_ridge = RFE(ridge, n_features_to_select=best_features).fit(X, Y)
print(model.feature_importances_) feature_imp = pd.DataFrame({'Features': df5.columns.tolist(), 'Importance': model.feature_importances_}) ''' # RFE from sklearn.feature_selection import RFE, RFECV from sklearn.linear_model import LinearRegression #from sklearn.ensemble import RandomForestRegressor # feature extraction model = LinearRegression() #model = RandomForestRegressor() rfe = RFE(model, 10, step=1) #rfe = RFECV(model, step=1, cv=5) fit = rfe.fit(X, y) print("Num Features: %d") % fit.n_features_ print("Selected Features: %s") % fit.support_ print("Feature Ranking: %s") % fit.ranking_ feature_imp = pd.DataFrame({ 'Features': df5.columns.tolist(), 'Select': fit.support_, 'Rank': fit.ranking_ }) feature_imp.to_csv( 'C:/Users/Jie.Hu/Desktop/Correlation/corr_0906/feature_imp_rf.csv',
# Bare expressions: they only display something when run as notebook cells.
x_train.shape
x_train
from sklearn.linear_model import LogisticRegression

# Train the model
model = LogisticRegression()
model.fit(x_train, y_train)  # Training the model
x_train.shape[1]

# Recursive feature elimination
from sklearn.feature_selection import RFE

logreg = LogisticRegression()
# n_features_to_select must be passed by keyword on scikit-learn >= 1.0.
# NOTE(review): asking RFE to keep every feature means nothing is eliminated,
# so support_ is all True and ranking_ is all 1 - confirm this is intended.
rfe = RFE(logreg, n_features_to_select=x_train.shape[1])
rfe = rfe.fit(x_train, y_train)
print(rfe.support_)
print(rfe.ranking_)

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# The report below is for the plain model fitted above, not the RFE wrapper.
predictions = model.predict(x_test)
print(predictions)  # printing predictions
print()  # Printing new line
# Check precision, recall, f1-score
print(classification_report(y_test, predictions))
print("Data Types:", dataframe.dtypes)
dataset = dataframe.values

# In[11]:

# Splitting dataset.
# The predictors stop before column 14: the previous slice 0:15 included
# column 14, which is also the Y1 target, so RFE could "select" the target
# itself.
# NOTE(review): assumes columns 14 and 15 are the only target columns -
# confirm against the dataframe layout.
X = dataset[:, 0:14]
Y1 = dataset[:, 14]  # gt_c_decay
Y2 = dataset[:, 15]  # gt_t_decay

# In[12]:

# Feature Selection for gt_c_decay
estimator = ExtraTreesRegressor()
# n_features_to_select must be passed by keyword on scikit-learn >= 1.0.
rfe = RFE(estimator, n_features_to_select=3)
fit = rfe.fit(X, Y1)
print("Number of Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

# In[13]:

# Feature Selection for gt_t_decay
estimator = ExtraTreesRegressor()
rfe = RFE(estimator, n_features_to_select=3)
fit = rfe.fit(X, Y2)
print("Number of Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
X_train_nol_sel = sk1.transform(X_train_nol) X_test_nol_sel = sk1.transform(X_test_nol) selected1 = sk1.get_support() # logger.debug("1st feature selection accomplished") if not os.path.exists( "Results/selected_{f}".format(f=featureName)): os.mkdir("Results/selected_{f}".format(f=featureName)) savemat( "Results/selected_{f}/selected1_{c}_{fold}.mat".format( f=featureName, c=iterCount, fold=fold), {'data': selected1}) for feature_num in range(10, X_train_nol_sel.shape[1], 1): for C in np.logspace(-4, 4, 9, base=2): # the second step feature selection lr = LinearRegression() rfe = RFE(lr, n_features_to_select=feature_num) rfe.fit(X_train_nol_sel, y_train) selected2 = rfe.support_ # svm classification clf = svm.SVC(kernel='linear', C=C).fit(X_train_nol_sel[:, selected2], y_train) score = clf.score(X_test_nol_sel[:, selected2], y_test) y_score = clf.decision_function( X_test_nol_sel[:, selected2]) res = clf.predict(X_test_nol_sel[:, selected2]) ACC, SEN, SPE = model_evaluate(test_label=y_test, res_label=res) if score > acc_max: COp = C featureNum_op = feature_num
from sklearn.preprocessing import MinMaxScaler

# Scale features to [0, 1]; the scaler is fitted on the training split only.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'.
      format(logreg.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'.format(
    logreg.score(X_test, y_test)))

from sklearn.feature_selection import RFE
# sklearn.cross_validation was removed in scikit-learn 0.20;
# cross_val_predict now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_predict

classifier = LogisticRegression()
# n_features_to_select must be passed by keyword on scikit-learn >= 1.0.
selector = RFE(classifier, n_features_to_select=10, step=1)
# NOTE(review): RFE and the cross-validation below run on X / y rather than
# on the scaled X_train - confirm that is intended.
selector = selector.fit(X, y)
print(selector.support_)

predicted = cross_val_predict(logreg, X, y, cv=10)
print(metrics.accuracy_score(y, predicted))
print(metrics.classification_report(y, predicted))

# Multinomial logit from statsmodels, printed as a coefficient summary.
logit_model = sm.MNLogit(y, X)
result = logit_model.fit()
print(result.summary())

print(logreg.intercept_)
print(logreg.coef_)
# Binarize the output
# In[42]:

# Importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# In[43]:

# Running RFE with the output number of variables equal to 15
# (the earlier comment said 9, which did not match the code).
lm = LinearRegression()
# n_features_to_select must be passed by keyword on scikit-learn >= 1.0.
rfe = RFE(lm, n_features_to_select=15)  # running RFE
rfe = rfe.fit(X_train, y_train)
print(rfe.support_)  # Printing the boolean results
print(rfe.ranking_)

# In[44]:

# Names of the columns RFE kept.
col = X_train.columns[rfe.support_]
print(col)

# In[45]:
# from sklearn import linear_model reg = linear_model.LinearRegression(fit_intercept=False) X = scalstat.drop('LifeExp', axis=1) reg.fit(X, scalstat.LifeExp) reg.coef_ # reg.intercept_ # from sklearn.feature_selection import RFE selector = RFE(reg, n_features_to_select=1) selector = selector.fit(X, scalstat.LifeExp) selector.ranking_ # X.columns[np.argsort(selector.ranking_)].tolist() # ## Sample Splitting # import faraway.datasets.fat fat = faraway.datasets.fat.load() #
def rfe_tree(self):
    """Build an unfitted RFE selector around a default decision tree.

    Returns:
        An ``RFE`` instance wrapping ``DecisionTreeClassifier()`` and set to
        keep ``self.fs_num`` features.
    """
    tree = DecisionTreeClassifier()
    return RFE(tree, n_features_to_select=self.fs_num)