def cv(model, features, labels):
    global g_accuracy
    k = 10
    if g_accuracy:
        selector = RFECV(model, step=1, cv=k)
        selector = selector.fit(features, labels)
        score = selector.score(features, labels)
        return score, selector.n_features_, selector.ranking_
    selector_prec = RFECV(model, step=1, cv=k, scoring='precision_weighted')
    selector_prec.fit(features, labels)
    score_prec = selector_prec.score(features, labels)
    selector_rec = RFECV(model, step=1, cv=k, scoring='recall_weighted')
    selector_rec.fit(features, labels)
    score_rec = selector_rec.score(features, labels)
    selector_f1 = RFECV(model, step=1, cv=k, scoring='f1_weighted')
    selector_f1.fit(features, labels)
    score_f1 = selector_f1.score(features, labels)
    return (score_prec, selector_prec.n_features_, selector_prec.ranking_), \
           (score_rec, selector_rec.n_features_, selector_rec.ranking_), \
           (score_f1, selector_f1.n_features_, selector_f1.ranking_)
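# A hedged alternative sketch (not the original author's code): the three
# copy-pasted RFECV blocks above can be collapsed into a loop over scoring
# names. Each fit is still independent, so the cost is unchanged.
from sklearn.feature_selection import RFECV

def cv_multi_metric(model, features, labels, k=10):
    results = {}
    for scoring in ("precision_weighted", "recall_weighted", "f1_weighted"):
        sel = RFECV(model, step=1, cv=k, scoring=scoring)
        sel.fit(features, labels)
        results[scoring] = (sel.score(features, labels),
                            sel.n_features_, sel.ranking_)
    return results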
def randomforest_rfecv(X, y, X_test, y_test, columns):
    estimator = RandomForestClassifier(**CLASSIFIER_PARAMS)
    selector = RFECV(estimator, step=1, cv=5, verbose=0)
    selector = selector.fit(X, y)
    # selector ranking to column:rank pairs
    rank = {columns[i]: s for i, s in enumerate(selector.ranking_)}
    # Feature importances: selector.estimator_ is refit on the *selected*
    # features only, so its importances must be mapped to the selected column
    # names, not indexed against the full column list.
    selected = [c for c, s in zip(columns, selector.support_) if s]
    importances = {
        selected[i]: v
        for i, v in enumerate(selector.estimator_.feature_importances_)
    }
    labeled = {
        str(k): v
        for k, v in sorted(importances.items(), key=lambda item: -item[1])
    }
    return {
        # sort rank by values
        'rank': {
            str(k): int(v)
            for k, v in sorted(rank.items(), key=lambda item: item[1])
        },
        # pick selected feature names
        'support': [columns[i] for i, s in enumerate(selector.support_) if s],
        'feature_importances': labeled,
        'score': selector.score(X, y),
        'test_score': selector.score(X_test, y_test)
    }
def decision_tree():
    print("---bc---")
    clf = tree.DecisionTreeClassifier(criterion="gini")
    rfecv = RFECV(clf, cv=10)
    _decision_tree(clf, bc_data_train, bc_data_test, bc_target_train,
                   bc_target_test, "bc_gini")
    for depth in DEPTHS:
        clf = tree.DecisionTreeClassifier(criterion="gini", max_depth=depth)
        _decision_tree(clf, bc_data_train, bc_data_test, bc_target_train,
                       bc_target_test, "bc_gini" + str(depth))
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    _decision_tree(clf, bc_data_train, bc_data_test, bc_target_train,
                   bc_target_test, "bc_entropy")
    for depth in DEPTHS:
        clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=depth)
        _decision_tree(clf, bc_data_train, bc_data_test, bc_target_train,
                       bc_target_test, "bc_entropy" + str(depth))
    rfecv.fit(bc_data_train, bc_target_train)
    print(rfecv.support_)
    print(rfecv.ranking_)
    print(rfecv.score(bc_data_test, bc_target_test))
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    print("---v---")
    clf = tree.DecisionTreeClassifier(criterion="gini")
    rfecv = RFECV(clf, cv=10)
    _decision_tree(clf, v_data_train, v_data_test, v_target_train,
                   v_target_test, "v_gini")
    for depth in DEPTHS:
        clf = tree.DecisionTreeClassifier(criterion="gini", max_depth=depth)
        _decision_tree(clf, v_data_train, v_data_test, v_target_train,
                       v_target_test, "v_gini" + str(depth))
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    _decision_tree(clf, v_data_train, v_data_test, v_target_train,
                   v_target_test, "v_entropy")
    for depth in DEPTHS:
        clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=depth)
        _decision_tree(clf, v_data_train, v_data_test, v_target_train,
                       v_target_test, "v_entropy" + str(depth))
    rfecv.fit(v_data_train, v_target_train)
    print(rfecv.support_)
    print(rfecv.ranking_)
    print(rfecv.score(v_data_test, v_target_test))
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
from sklearn.model_selection import train_test_split  # replaces the removed sklearn.cross_validation


def main():
    train_df = munge_data('./data/train.csv', False)
    train_df = train_df.drop('PassengerId', axis=1)
    target_df = train_df['Survived']
    train_df = train_df.drop('Survived', axis=1)
    train_df = train_df.sort_index(axis=1)  # DataFrame.sort() was removed from pandas
    test_df = munge_data('./data/test.csv')
    test_ids = test_df.PassengerId.values
    test_df = test_df.drop('PassengerId', axis=1)
    test_df = test_df.sort_index(axis=1)
    train_data = train_df.values
    target_data = target_df.values
    test_data = test_df.values
    clf = svm.SVC(kernel='linear')
    selector = RFECV(clf, step=1, cv=5, scoring='accuracy')
    train_data, cx_data, target_data, cx_target_data = train_test_split(
        train_data, target_data, test_size=0.2)
    selector = selector.fit(train_data, target_data)
    print(selector.score(cx_data, cx_target_data))
    cx_predictions = selector.predict(cx_data)
    print(classification_report(cx_target_data, cx_predictions))
    predictions = selector.predict(test_data)
    with open('output.csv', 'w') as o:
        o.write('PassengerId,Survived\n')
        for passenger, prediction in zip(test_ids, predictions):
            o.write('{},{}\n'.format(passenger, prediction))
def optimal_features(model, x_train, y_train, x_test, y_test):
    rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(2), scoring='accuracy')
    rfecv.fit(x_train, y_train)
    print(rfecv.score(x_train, y_train), rfecv.score(x_test, y_test))
    print("Optimal number of features : %d" % rfecv.n_features_)
    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
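# Note: in scikit-learn 1.0 the `grid_scores_` attribute plotted above was
# deprecated, and it was removed in 1.2; the replacement is `cv_results_`.
# A minimal sketch of the same plot against the newer API, assuming a fitted
# `rfecv` with the default min_features_to_select=1 and matplotlib as plt:
def plot_rfecv_cv_results(rfecv):
    scores = rfecv.cv_results_["mean_test_score"]
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Mean cross validation score")
    plt.plot(range(1, len(scores) + 1), scores)
    plt.show()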
def featureSelectAndClassifyRFECV(X_train, X_test, y_train, y_test):
    scaler = MinMaxScaler()
    # scaler = StandardScaler()
    # scaler = RobustScaler()
    X_train_minmax = scaler.fit_transform(X_train)
    X_test_minmax = scaler.transform(X_test)
    # svc = svm.LinearSVC()
    rf = RandomForestClassifier(n_estimators=50, max_depth=20)
    rfecv = RFECV(estimator=rf, step=1, min_features_to_select=5,
                  cv=StratifiedKFold(5), scoring='accuracy')
    X_train_transformed = rfecv.fit_transform(X_train_minmax, y_train)
    # X_train_transformed = rfecv.fit_transform(X_train, y_train)
    X_test_transformed = rfecv.transform(X_test_minmax)
    # X_test_transformed = rfecv.transform(X_test)
    score = rfecv.score(X_test_minmax, y_test)
    # score = rfecv.score(X_test, y_test)
    print('Optimal no. of features are ' + str(rfecv.n_features_))
    print('Score for test set is ' + str(score))
    print(rfecv.ranking_.shape)
    print(X_train_transformed.shape)
    print(X_test_transformed.shape)
    plt.figure()
    plt.xlabel('no. of features')
    plt.ylabel('cv score')
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
def FeatureSelectGreedy(df, model, in_columns, target, step=100):
    y = df[target]
    selector = RFECV(model, step=1, cv=3)
    keep_columns = list()
    N = len(in_columns)
    for i in range(0, N, step):
        j = min(i + step, N)
        print("\n--\nNumber of test features = %d(/%d)" % (j, N))
        X = df[keep_columns + in_columns[i:j]]
        start_time = timer(None)
        selector = selector.fit(X, y)
        timer(start_time)
        keep_columns = X.columns[selector.support_].tolist()
        score = selector.score(X, y)
        print("Number of keep features =", len(keep_columns))
        print("Score =", score)
    return keep_columns
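# Hypothetical usage sketch for FeatureSelectGreedy above. The DataFrame,
# column list, and 'target' name are placeholders, not from the original code:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
feature_cols = [c for c in df.columns if c != 'target']
kept = FeatureSelectGreedy(df, model, feature_cols, 'target', step=100)
print("kept", len(kept), "features")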
def test_model(model, xtrain, ytrain, feature_list, prefix):
    """Use train_test_split to create validation train/test samples."""
    xTrain, xTest, yTrain, yTest = train_test_split(xtrain, ytrain, test_size=0.4)
    if DO_RFECV:
        model.fit(xtrain, ytrain)
        if hasattr(model, 'coef_'):
            model = RFECV(estimator=model, verbose=0, step=1,
                          scoring=score_fn, cv=3)
    model.fit(xTrain, yTrain)
    print('score', model.score(xTest, yTest))
    ypred = model.predict(xTest)
    # don't allow model to predict negative number of orders
    if any(ypred < 0):
        print(ypred[ypred < 0])
        ypred[ypred < 0] = 0
    print('RMSE', np.sqrt(mean_squared_error(ypred, yTest)))
    # debug_output(model, feature_list)
    debug_plots(model, yTest, ypred, prefix)
    return
def testModel(_model, _X, _Y):
    if _model == "LogisticRegression":
        model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
    elif _model == "MLPClassifier":
        model = MLPClassifier()
    elif _model == "RandomForestClassifier":
        model = RandomForestClassifier()
    elif _model == "GradientBoostingClassifier":
        model = GradientBoostingClassifier()
    elif _model == "XGBClassifier":
        model = XGBClassifier()
    X_train, X_test, y_train, y_test = __splitData(_X, _Y)
    # Since XGBoost is not part of sklearn
    if _model == "XGBClassifier":
        model.fit(X_train, y_train.values.ravel())
        y_pred = model.predict(X_test)  # was commented out, but is used below
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)
        print("Accuracy of ", _model,
              " classifier on test set: {:.2f}".format(accuracy))
    # For the sklearn stuff
    else:
        selector = RFECV(model)  # Use the RFE wrapper
        selector.fit(X_train, y_train.values.ravel())
        # y_pred = selector.predict(X_test)
        print("Accuracy of ", _model,
              " classifier on test set: {:.2f}".format(
                  selector.score(X_test, y_test)))
def rfecv(self):
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1,
                  cv=StratifiedKFold(10), scoring='accuracy')
    rfecv.fit(self.train_X, self.train_y)
    print("Best number of features: " + str(rfecv.n_features_))
    print("Accuracy on test data: " + str(rfecv.score(self.test_X, self.test_y)))
    print("RFECV feature ranking:")
    print(rfecv.ranking_)
def data_prediction():
    train, test = data_preprocessing()
    X = train.drop(columns=['gender'])
    y = train['gender']
    print('[INFO]....trainset shape: ', X.shape)
    print('[INFO]....testset shape: ', test.shape)
    encoding_columns = ['first_item_browsed']
    X, test = category_encoding(encoding_columns, 0.2, X, y, test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        stratify=y, random_state=123)

    ########################## FOR BASE LGBM ############################################
    '''model = lgb.LGBMClassifier()
    model.fit(X_train,y_train)
    print('score on validation data: ',model.score(X_test,y_test))
    final_pred = model.predict(test)'''

    ########################## FOR LGBM USING RFECV #####################################
    print('[INFO]....Creating an LGBM model')
    print('[INFO]....Applying RFECV to select 150 features')
    model = lgb.LGBMClassifier()
    model = RFECV(estimator=model, step=10, min_features_to_select=150,
                  scoring='accuracy')
    model.fit(X_train, y_train)
    X_train = model.transform(X_train)
    X_test = model.transform(X_test)
    test = model.transform(test)
    print('[INFO]....After transformation train shape :', X_train.shape)
    model = lgb.LGBMClassifier()
    model.fit(X_train, y_train)
    print('score on validation data: ', model.score(X_test, y_test))
    final_pred = model.predict(test)

    ########################## FOR STACKING PURPOSE #####################################
    '''basemodel_1,basemodel_2,basemodel_3,meta_model = stacking_models(X_train,X_test,y_train,y_test)
    base_pred_test = np.column_stack((basemodel_1.predict_proba(test)[:,1],basemodel_2.predict_proba(test)[:,1],\
                                      basemodel_3.predict_proba(test)[:,1]))
    final_pred = meta_model.predict(base_pred_test)'''

    ########################## FOR NEURAL NETWORK PURPOSE ###############################
    # model = neural_net(X_train,y_train,X_train.shape[1])
    # pd.Series(dict(zip(X.columns.tolist(),model.feature_importances_))).sort_values(ascending=False).head(20).plot(kind='bar')
    return (final_pred)
def train_classifier(X_train, y_train, X_test, y_test):  # renamed from train_classfier (typo)
    svc = LinearSVC()
    # step=0.1 removes ~10% of the remaining features at each RFE iteration
    clf = RFECV(svc, step=0.1, cv=7, n_jobs=-1)
    t = time.time()
    clf.fit(X_train, y_train)
    t2 = time.time()
    print(round(t2 - t, 2), 'Seconds to train SVC...')
    t = time.time()
    print('Test Accuracy of SVC = ', round(clf.score(X_test, y_test), 4))
    # Check the scoring time
    print('time takes: ', time.time() - t)
    return clf
class rfe_LBC(li_LBC):
    def fit(self, X, Y):
        params = self.get_params()
        model = li_LBC(**params)
        self.rfe = RFECV(model)
        self.rfe.fit(X, Y)
        return self  # scikit-learn estimators are expected to return self from fit

    def predict(self, X):
        return self.rfe.predict(X)

    def score(self, X, Y):
        return self.rfe.score(X, Y)
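# Because rfe_LBC exposes fit/predict/score (and presumably inherits
# get_params from li_LBC), it should drop into scikit-learn's model-selection
# utilities. A hedged usage sketch, assuming X and y are already defined:
from sklearn.model_selection import cross_val_score

print(cross_val_score(rfe_LBC(), X, y, cv=3))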
def recursive_feature_elimination_cv(X_train, y_train, X_test, y_test):
    # Create the RFE object and compute a cross-validated score.
    svc = SVC(kernel="linear")  # classifications
    rfecv = RFECV(estimator=svc, step=1, cv=KFold(10), scoring='accuracy', n_jobs=-1)
    rfecv.fit(X_train, y_train)
    # Determine the accuracy of the SVC model on the test data, the number of
    # features used, and the importance ranking of the features.
    accuracy = rfecv.score(X_test, y_test)
    RankFeatures = rfecv.ranking_
    Nfeatures = rfecv.n_features_
    return [rfecv, accuracy, Nfeatures, RankFeatures]
def feature_selection_with_cv(features_values_temp, rows_temp, columns_temp,
                              prediction_values_temp, kernel, threshold):
    # kernel: linear, poly, rbf, sigmoid, precomputed
    rows = int(rows_temp)      # originally counted down in a while loop
    columns = int(columns_temp)
    features_values = [x for x in features_values_temp]
    prediction_values = [y for y in prediction_values_temp]
    rotated = convert_list_to_matrix(features_values, rows, columns)
    scores = np.array(prediction_values)
    threshold = float(threshold)
    estimator = SVR(kernel=kernel)  # try changing to the model the test will run with (lasso, ridge, etc.)
    # START: experimenting with recursive feature elimination with cross-validation.
    # RFE (without cross-validation) lets us choose the number of features to keep;
    # RFECV appears to choose the optimal number itself, so no threshold. Not positive.
    selector = RFECV(estimator, step=1, cv=5)
    selector = selector.fit(rotated, scores)
    print(selector.support_)
    features_used = [i + 1 for i, x in enumerate(selector.support_) if x]  # i+1 b/c matlab starts indexing from 1
    print(features_used)
    features_used = []
    threshold = selector.score(rotated, scores)
    # perhaps if this is the "optimal # of features" we could use this value
    # as the RFE threshold value.
    print("threshold: ")
    print(threshold)
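# To make the open question in the comments above concrete: RFE keeps a
# caller-chosen number of features, while RFECV picks that number by
# cross-validation. A minimal sketch, reusing the `estimator`, `rotated`, and
# `scores` names local to the function above for illustration only:
from sklearn.feature_selection import RFE

rfe = RFE(estimator, n_features_to_select=5, step=1)  # fixed target of 5 features
rfe.fit(rotated, scores)
print(rfe.support_)
# RFECV (as used above) instead reports the CV-chosen count via n_features_.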
def select_feature_wrapping(self, estimator, X, y, scoring):
    estimator_name = (self.get_default_params_and_name(estimator))[0]
    print("using recursive feature elimination to tune features: " + estimator_name)
    selector = RFECV(estimator, step=1, cv=3, scoring=scoring, verbose=2)
    selector = selector.fit(X, y)
    sn = selector.n_features_
    sc = selector.score(X, y)
    sr = selector.ranking_
    print("features number and score:", sn, sc)
    print("selected features ranking:", sr)
    with open("tf_log.csv", 'a', newline='') as f:
        writer = csv.writer(f)
        str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
        writer.writerow(["feature selection with rfecv ", estimator_name, str_time])
        writer.writerow(["feature selection score: ", sc,
                         "selected feature number:", sn, "feature ranking:"])
        writer.writerow(sr)
    return {estimator_name: selector}
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFECV

print("GradientBoostingClassifier")  # the estimator actually passed to RFECV below

# RFECV: Select the algorithm to train with:
clf_Ranking = RFECV(GradientBoostingClassifier(random_state=0,
                                               learning_rate=0.05,
                                               max_depth=1),
                    scoring='accuracy', n_jobs=-1)
# RFECV: Fit and transform the RFECV function
clf_Ranking.fit_transform(features_train, labels_train)
print(clf_Ranking.score(features_train, labels_train))
print(clf_Ranking.ranking_)
# results of feature selection:
# [ 1 13  4 14  1 12 11  8  1  9  5  6  1  2 10  7  3  1]
# [1 4 5 1 1 1 1 1 3 1 1 1 6 2 1 1 1 1]
# [14  5  1 11  1 10  4  1  1  1  6  3  2  9  8 12 13  7  1]
# print(scores)
# GBC: [13 12 11 10  3  1  1  9  1  1  1  8  1  7  6  2  4  5  1  1]

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# SECTION 4: Classifier Selection
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
data_USA['target'] = np.where(condition, 0, 1)
data_USA_target = data_USA['target']
data_USA.drop(['num', 'id', 'target'], axis=1, inplace=True)
data_USA = pd.get_dummies(data_USA, columns=['cp', 'restecg', 'slope', 'thal', 'loc'])
data_std = Standardize(data_USA)
data_std['target'] = data_USA_target
print("Data preprocessed...")
data = data_std.values  # as_matrix() was removed from pandas
train_x, test_x, train_y, test_y = train_test_split(data[:, 0:-1], data[:, -1],
                                                    train_size=0.75)
names = list(data_USA.columns.values)
print("Executing Recursive Feature Elimination in SVM...")
svc = SVC(kernel="linear", C=5)
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(10), scoring='accuracy')
rfecv.fit(train_x, train_y)
Training_score = rfecv.score(train_x, train_y)
predicted = rfecv.predict(test_x)
accuracy = accuracy_score(test_y, predicted)
print("The support array \n", rfecv.support_)
print("The ranking array \n", rfecv.ranking_)
print(sorted(zip(map(lambda x: round(x, 4), rfecv.ranking_), names)))
print("Training Accuracy is ", Training_score)
print("Test Accuracy is ", accuracy)
print("The Cross-validation score :", max(rfecv.grid_scores_))
print("Optimal number of features : {}".format(rfecv.n_features_))

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
def predictAndPlot(data, header, features, name):
    print("\n%s" % name)
    # First reduce the data to relevant features.
    features_plus_date = np.hstack((0, features))
    analyzed_data = data[:, features_plus_date]
    # Remove rows with missing data.
    for i in range(len(analyzed_data[0])):
        analyzed_data = analyzed_data[analyzed_data[:, i] != '']
    # If it is a retention feature, skip the last X entries.
    if "retention" in name:
        if "1d" in name:
            retention_feature_linesSkipped = 3
        elif "3d" in name:
            retention_feature_linesSkipped = 7
        elif "7d" in name:
            retention_feature_linesSkipped = 15
        elif "14d" in name:
            retention_feature_linesSkipped = 29
        elif "28d" in name:
            retention_feature_linesSkipped = 57
        else:
            retention_feature_linesSkipped = 0
        # The original sliced unconditionally; a[:-0] would empty the array.
        if retention_feature_linesSkipped > 0:
            analyzed_data = analyzed_data[:-retention_feature_linesSkipped, :]
    # The second-last line is # votes. If smaller than 50, skip this entry.
    # analyzed_data = analyzed_data[analyzed_data[:, -2].astype(float) >= min_daily_regs]
    # I added the date simply for plotting reasons. Just in case. Could be removed if not needed.
    dates = analyzed_data[:, 0]
    # Set best model and best score default values.
    best_model = ""
    best_score = -100
    # Iterate through all models to obtain the best parameters and features via cross validation.
    for model_type in list_of_models:
        # Get training data X and y.
        X = analyzed_data[:, 1:-1].astype(float)  # Ignore dates (first column) and "y" (last column)
        y = analyzed_data[:, -1].astype(float)
        model = define_model(model_type)  # Set model parameters based on model_type
        # Perform differently depending on which model is used.
        # Random Forest has to be treated differently because it doesn't support RFECV.
        if model_type == "RF":
            to_be_used_threshold = "median"  # Default value. Will be overwritten.
            score = -100.
            # Loop through different thresholds. Use the one with the highest score.
            for model_threshold in ("10.*median", "3.*median", "1*median",
                                    "0.3*median", "0.1*median", "0.03*median"):
                try:
                    # Use only the "model_threshold" best features.
                    model.fit(X, y)
                    X_new = model.transform(X, threshold=model_threshold)
                    header_new = model.transform(header[features][:-1],
                                                 threshold=model_threshold)
                    # Fit the model again with reduced features X_new and return the out-of-bag score.
                    model.fit(X_new, y)
                    rf_score = model.oob_score_
                    # I try to keep the amount of features as small as possible.
                    # The rf_score of a model with more features needs to be 2% better to justify more params.
                    # In some cases the score is negative so it also needs to be better overall.
                    if (rf_score > score * 1.02) and (rf_score > score):
                        score = rf_score
                        to_be_used_threshold = model_threshold
                except:
                    # Just a debug output.
                    print("There was an error at model threshold: %s" % model_threshold)
            print("Score is %2.3f with threshold: %s" % (score, to_be_used_threshold))
        elif model_type in ("ElasticCV", "Elastic", "linear", "LassoCV"):
            selector = RFECV(model)
            selector = selector.fit(X, y)
            header_new = header[features][:-1]
            score = selector.score(X, y)
            print("Score is %2.3f with model: %s" % (score, model_type))
        else:
            print("Something went wrong!")
        if score > best_score:
            best_score = score
            best_model = model_type
    print("Best score is %2.3f with model: %s" % (best_score, best_model))
    # Predict using the best model, parameters and features obtained before.
    model_type = best_model
    model = define_model(model_type)
    if model_type == "RF":
        # In some rare cases the model does not work, because all features were discarded.
        # Therefore try to do it again without a threshold; that should always work.
        try:
            model.fit(X, y)
            X_new = model.transform(X, threshold=to_be_used_threshold)
            header_new = model.transform(header[features][:-1],
                                         threshold=to_be_used_threshold)
            model.fit(X_new, y)
            prediction = model.predict(X_new)
            score = model.oob_score_
        except:
            print("Fitting the model didn't work! The prediction might be sub-optimal. \nThreshold: %s" % model_threshold)
            model.fit(X, y)
            prediction = model.predict(X)
            # score = model.oob_score_
            score = 0
    elif model_type in ("ElasticCV", "Elastic", "linear", "LassoCV"):
        selector = RFECV(model)
        selector = selector.fit(X, y)
        header_new = header[features][:-1]
        prediction = selector.predict(X)
        score = selector.score(X, y)
    else:
        print("lol!")
    # Now derive the importances, respectively the feature coefficients.
    try:
        # This only works with "RF".
        importances = model.feature_importances_
        importances_list = np.vstack((importances, header_new))
        importances_list = np.transpose(importances_list)
        importances_list = importances_list[importances_list[:, 0].astype(float).argsort()][::-1]
    except:
        # This should work with all other models.
        try:
            X_new = selector.transform(X)
            header_new = selector.transform(header_new)
            model.fit(X_new, y)
            med_value = np.median(X_new, axis=0)
            med_value[med_value == 0] = np.mean(X_new, axis=0)[med_value == 0]
            importances = model.coef_ * np.median(X_new, axis=0)
            importances_list = np.vstack((importances, header_new))
            importances_list = np.transpose(importances_list)
            importances_list = importances_list[importances_list[:, 0].astype(float).argsort()][::1]
        except:
            # If the above doesn't work, just give a blank output.
            importances_list = np.zeros((10, 2))
    score = "%s, %s\nOOB Score = %2.2f" % (name, model_type, score)
    plot_predictionVsActual(prediction, y, score)
    return prediction, y, dates, importances_list
# In[ ]:

######################################

# In[ ]:

# Automagic to find the optimal number of features for some algorithm
rfecv = RFECV(estimator=RandomForestClassifier(n_estimators=100), step=1,
              cv=StratifiedKFold(2),  # older sklearn passed train_y to StratifiedKFold here
              scoring='accuracy')
rfecv.fit(train_X, train_y)
print(rfecv.score(train_X, train_y), rfecv.score(val_X, val_y))
print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

# In[ ]:

clf = RandomForestClassifier(n_estimators=100)

## Train the selected model
clf.fit(train_X, train_y)
def main():
    train = pd.read_csv("../input/train.csv")  # change filepath later
    test = pd.read_csv('../input/test.csv')    # was reading train.csv twice; change filepath later
    full = train.append(test, ignore_index=True)
    titanic = full[:891]
    del train, test
    print('Datasets:', 'full:', full.shape, 'titanic:', titanic.shape)

    # Peek at data to see what it looks like
    titanic.head()
    titanic.describe()

    # Plot correlation heat map
    plot_correlation_map(titanic)
    # Plot distribution of Age of passengers
    plot_distribution(titanic, var='Age', target='Survived', row='Sex')
    # Plot distribution of Fare of passengers
    plot_distribution(titanic, var='Fare', target='Survived', row='Pclass')
    # Plot survival rate by Embarked
    plot_categories(titanic, cat='Embarked', target='Survived')
    # Plot survival rate by Sex
    plot_categories(titanic, cat='Sex', target='Survived')
    # Plot survival rate by Pclass
    plot_categories(titanic, cat='Pclass', target='Survived')
    # Plot survival rate by SibSp
    plot_categories(titanic, cat='SibSp', target='Survived')
    # Plot survival rate by Parch
    plot_categories(titanic, cat='Parch', target='Survived')

    # Make sex into binary values 0 & 1 (needs to be numerical data)
    sex = pd.Series(np.where(full.Sex == 'male', 1, 0), name='Sex')

    # Create new variable for every unique Embarked value
    embarked = pd.get_dummies(full.Embarked, prefix='Embarked')
    embarked.head()

    # Create new variable for every unique value of Passenger Class
    pclass = pd.get_dummies(full.Pclass, prefix='Pclass')
    pclass.head()

    # Replace 2 missing embarkation values with the port closest to fare value
    # (assumes an `imputed` frame built elsewhere)
    imputed.head()

    # Extracting title
    title = pd.DataFrame()
    title['Title'] = full['Name'].map(
        lambda name: name.split(',')[1].split('.')[0].strip())
    Title_Dictionary = {
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir": "Royalty",
        "Dr": "Officer",
        "Rev": "Officer",
        "the Countess": "Royalty",
        "Dona": "Royalty",
        "Mme": "Mrs",
        "Mlle": "Miss",
        "Ms": "Mrs",
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Royalty",
        "Lady": "Royalty"
    }
    title['Title'] = title.Title.map(Title_Dictionary)
    title = pd.get_dummies(title.Title)
    # title = pd.concat([title, titles_dummies], axis=1)
    title.head()

    # Replace 1 missing fare value with the median
    full['Fare'] = full.Fare.fillna(full.Fare.median())

    # Fill missing values of Age
    # Option 1: fill with the average of Age
    # Age['Age'] = full.Age.fillna(full.Age.mean())
    # Option 2: use regression analysis to find likely value of age for missing values
    #   (will need to get rid of negative ages and other nonsense values)
    # Option 3: fill missing ages with medians separated by group
    stuff = pd.DataFrame()  # was never initialized in the original
    Age = pd.DataFrame()    # was never initialized in the original
    stuff['Title'] = title.Title  # note: title was replaced by its dummies above
    stuff['Sex'] = sex
    stuff['Pclass'] = pclass
    stuff['Age'] = full.Age
    stuff['Age'] = stuff.groupby(['Sex', 'Pclass', 'Title'])['Age'].transform(
        lambda x: x.fillna(x.median()))
    Age['Age'] = stuff.Age
    del stuff

    # Fill in missing cabin values
    # Use regression from Pclass, ticket, embarkation port, etc...
    cabin = pd.DataFrame()

    # Create family size variable
    family = pd.DataFrame()
    family['FamilySize'] = full['Parch'] + full['SibSp'] + 1
    # Single, small or large family
    family['Family_Single'] = family['FamilySize'].map(lambda s: 1 if s == 1 else 0)
    family['Family_Small'] = family['FamilySize'].map(lambda s: 1 if 2 <= s <= 4 else 0)
    family['Family_Large'] = family['FamilySize'].map(lambda s: 1 if 5 <= s else 0)
    family.head()

    # Create a wealth variable
    wealth = pd.DataFrame()
    money = pd.DataFrame()
    money['Pclass'] = full['Pclass']
    money['Title'] = title['Title']
    money['Fare'] = full['Fare']
    cabin['Cabin'] = full['Cabin']
    # wealth['Social_Class'] = ...  (construction missing in the original)
    # Create functions to define if Poor, Middle Class or Rich
    wealth['Poor'] = wealth['Social_Class'].map(determine_Poor(money))
    wealth['Middle_Class'] = wealth['Social_Class'].map(determine_Middle(money))
    wealth['Rich'] = wealth['Social_Class'].map(determine_Rich(money))

    full_X = pd.concat([Age, embarked, cabin, sex, wealth, family], axis=1)
    full_X.head()

    # Create all datasets necessary to test models
    train_valid_X = full_X[0:891]
    train_valid_Y = titanic.Survived
    test_X = full_X[891:]
    train_X, valid_X, train_Y, valid_Y = train_test_split(train_valid_X,
                                                          train_valid_Y,
                                                          train_size=0.7)
    print(full_X.shape, train_X.shape, valid_X.shape, train_Y.shape,
          valid_Y.shape, test_X.shape)

    plot_variable_importance(train_X, train_Y)

    # Run several different models
    model1 = RandomForestClassifier(n_estimators=100)
    model2 = SVC()
    model3 = GradientBoostingClassifier()  # n_neighbors=3 is not a GBC parameter (likely confused with KNeighborsClassifier)
    model4 = GaussianNB()
    model5 = LogisticRegression()
    model1.fit(train_X, train_Y)
    model2.fit(train_X, train_Y)
    model3.fit(train_X, train_Y)
    model4.fit(train_X, train_Y)
    model5.fit(train_X, train_Y)
    train_score1 = model1.score(train_X, train_Y)
    train_score2 = model2.score(train_X, train_Y)
    train_score3 = model3.score(train_X, train_Y)
    train_score4 = model4.score(train_X, train_Y)
    train_score5 = model5.score(train_X, train_Y)
    valid_score1 = model1.score(valid_X, valid_Y)
    valid_score2 = model2.score(valid_X, valid_Y)
    valid_score3 = model3.score(valid_X, valid_Y)
    valid_score4 = model4.score(valid_X, valid_Y)
    valid_score5 = model5.score(valid_X, valid_Y)

    # Print out score comparisons
    print("Train Data Score:    Validation Data Score:")
    print(train_score1, valid_score1)
    print(train_score2, valid_score2)
    print(train_score3, valid_score3)
    print(train_score4, valid_score4)
    print(train_score5, valid_score5)

    # Hopefully find the optimal features for the model
    plot_model_var_imp(model1, train_X, train_Y)
    rfecv = RFECV(estimator=model1, step=1,
                  cv=StratifiedKFold(2),  # older sklearn passed train_Y to StratifiedKFold here
                  scoring='accuracy')
    rfecv.fit(train_X, train_Y)
    print(rfecv.score(train_X, train_Y), rfecv.score(valid_X, valid_Y))
    print("Optimal number of features: %d" % rfecv.n_features_)

    # Plot number of features vs. cross validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
                        rownames=['True'], colnames=['Predicted'], margins=True)
print("confusion matrix")
print(confu_mat)
print("parameters")
statLogitModel = sm.Logit(train_target, pured_data).fit_regularized()
print(statLogitModel.params)
print("P-values")
scores, pvalues = chi2(pured_data, train_target)
for i in range(len(pvalues)):
    print(pured_data.columns[i], pvalues[i])

plt.figure(figsize=(16, 9))
plt.plot(falsePositiveRate, truePositiveRate)
plt.plot([0, 1], [0, 1], linestyle='dotted')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC and AUC')
plt.show()

test_data = feature.loc[feature['train_test'] == 0]
test_data.loc[(test_data['activities'] == 0) |
              (test_data['activities'] == 1), 'activities'] = 0
test_data.loc[(test_data['activities'] != 0) &
              (test_data['activities'] != 1), 'activities'] = 1
test_target = test_data.iloc[:, -1]
test_data = test_data[pured_data.columns]
accuracy = clf.score(test_data, test_target)
print("accuracy: %.2f" % accuracy)
clf = linear_model.Ridge(alpha=30)
# clf = linear_model.LinearRegression()
rfecv = RFECV(estimator=clf, step=1, cv=KFold(5, shuffle=True))
rfecv.fit(X_train, y_train)
print("Optimal number of features : %d" % rfecv.n_features_)
for i in np.array(features)[rfecv.support_]:
    print(i)
pred_train = rfecv.predict(X_train)
pred_test = rfecv.predict(X_test)
print("Train score :%.2f" % rfecv.score(X_train, y_train))
print("Validation score :%.2f" % rfecv.score(X_test, y_test))
sorted_features = []
sorted_scores = sorted(rfecv.ranking_)
for i in np.argsort(rfecv.ranking_):
    sorted_features.append(features[i])
sorted_scores = np.array(sorted_scores)
sorted_scores = 6 - sorted_scores

# plot feature scores
pos1 = range(len(sorted_features), len(sorted_features[:len(sorted_features) - 11]), -1)
pos2 = range(len(sorted_features[:len(sorted_features) - 11]), 0, -1)
pos = range(len(sorted_features), 0, -1)
barh(pos1, sorted_scores[:11], align='center', color='green')
barh(pos2, sorted_scores[11:], align='center', color='red')
yticks(pos, sorted_features)
X.head()
X.columns
XDF.columns
XDF.groupby('redirect')['n_count'].mean()
get_ipython().run_line_magic('matplotlib', '')
import seaborn as sns
X.var(0)
XDF.groupby('related_page')['n_count'].mean()
XX = X[[c for c in X if not c.startswith('M_')]]
XX = XX[[c for c in XX if not c.startswith('S_')]]
XX.columns
XX.var(0).plot(kind='bar')
XX.drop(['lifetime'], axis=1).var(0).plot(kind='bar')
rfecv.fit(XX, y)
rfecv.grid_scores_.max()
rfecv.score(XX, y)
lasso
lasso.fit(XX, y)
lasso.score(XX, y)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
XX.head()
get_ipython().run_line_magic('pinfo', 'SelectKBest')
XX.shape
for i in range(2, 20):
    best = SelectKBest(f_regression, k=i)
    XXX = best.fit_transform(XX.values, y.values)  # fit_transform needs the target as well
    lr = lm.LinearRegression().fit(XXX, y.values)
    print(i, lr.score(XXX, y.values))
# for i in range(2, 20):  (repeated loop header, truncated in the source)
# print(test_X.shape, test_Y.shape)
logistic_reg = LogisticRegression()
logistic_reg.fit(train_X, train_Y)
print(logistic_reg.score(test_X_1, test_Y_1))
# test_Y = logistic_reg.predict(test_X)
# result.to_csv('result.csv', encoding='utf-8', index=False)

Svc = SVC()
Svc.fit(train_X, train_Y)
print(Svc.score(test_X_1, test_Y_1))
# test_Y = Svc.predict(test_X)

model = RandomForestClassifier(n_estimators=100)
model.fit(train_X, train_Y)
print(model.score(test_X_1, test_Y_1))
# test_Y = model.predict(test_X)

rfecv = RFECV(estimator=model, step=1,
              cv=StratifiedKFold(2),  # older sklearn passed train_Y to StratifiedKFold here
              scoring='accuracy')
rfecv.fit(train_X, train_Y)
print(rfecv.score(test_X_1, test_Y_1))
test_Y = rfecv.predict(test_X)

passenger_id = full[891:].PassengerId
test = pd.DataFrame({'PassengerId': passenger_id, 'Survived': test_Y})
print(test.shape)
test.to_csv('pred.csv', index=False)
if (FeatSelection_RFE or FeatSelection_RFECV) == True:
    'RFE + - best feats'
    'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html'
    svc = LinearSVC(class_weight='balanced')  # 'auto' was removed from sklearn  # ,penalty='l1',dual=False)
    # svc = LogisticRegression(class_weight='balanced')  # ,C=1)
    if FeatSelection_RFECV == True:
        rfecv = RFECV(estimator=svc, step=0.1,
                      cv=StratifiedShuffleSplit(y, n_iter=7, test_size=0.33),
                      scoring='f1', verbose=0)
        # " scoring='roc_auc','recall','f1'..."
    else:
        rfecv = RFE(estimator=svc, n_features_to_select=RFE_FeatsToKeep, step=0.1)
    rfecv.fit(X, y)
    if FeatSelection_RFECV == True:
        print("RFEcv selected %d number of Optimal features : " % (rfecv.n_features_))
    print("RFE (%d Features) scorer : \n" % (rfecv.n_features_), rfecv.score(X, y))
    print("RFE selected feature names:")
    rfe_featnames = featureNames[rfecv.get_support()]  # index before overwriting featureNames
    featureNames = rfe_featnames
    print(rfe_featnames)
    X_RFE = rfecv.fit_transform(X, y)
    print(X_RFE.shape, "X_RFE \n")
    'Set GetRFEPerf To true or by user, if perf. of reduced set wanted'
    GetRFEPerf = False

print("\n X: \n")
ModelParam_GridSearch(X, y, cv=4)
if GetRFEPerf == True:
    pass  # truncated in the source
def GetAllPerf(filePaths=None):
    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq',
                                    pattern='trainingSetFeatures.csv'))
    # Sanity check:
    # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
    # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']
    print("FilePaths: \n", filePaths)
    fileNames = fileNameFromPaths(filePaths)
    print("FileNames:", fileNames)
    resDict = pd.DataFrame(index=fileNames,
                           columns=['Accuracy', 'Accuracy_SD',
                                    'f1', 'f1_SD', 'dummy_freq:Accuracy', 'dummy_freq:f1',
                                    'LargestClassPercent', 'Classes',
                                    # 'TopRFE-Features','Best (f1) Model parameters',
                                    '# Classes', 'Array-Acc-Scores', 'Array-f1-Scores',
                                    'bestML-Acc', 'bestML-f1', 'dummy_freq_f1_weighted'])
    # resDict holds results for each file/class, for saving to output-file
    i = -1
    for filePath in filePaths:
        i += 1
        'http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/'
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName = str(fileNames[i])  # Str added now 14.1
        print("fileName: %s" % (fileName))
        "resDict['Name']= fileName"
        # filePath = str(argv[1])
        # X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file')  # X, y = features, labels
        X, y, lb_encoder, featureNames = load_data(filePath, 'file')  # X, y = features, labels
        print(X.shape, "= (samples, features)")
        y_inv = Counter(lb_encoder.inverse_transform(y))
        MajorityPercent = round(100 * y_inv.most_common()[0][1] / sum(y_inv.values()), 1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)
        resDict.LargestClassPercent[fileName] = MajorityPercent
        resDict.Classes[fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName] = len(lb_encoder.classes_)
        KFilt = None
        KFilt = 350  # This is just temporary for the outputs - saves computation time. Barely filters compared to the model itself.
        if KFilt is not None:
            k = SelectKBest(k=KFilt).fit(X, y)
            X = k.transform(X)
            featureNames = featureNames[k.get_support()]
        Fwe = SelectFwe(alpha=0.01).fit(X, y)
        X = Fwe.transform(X)
        featureNames = featureNames[Fwe.get_support()]
        print("X reduced to K best features: ", X.shape)
        FeatSelection_SVM = False  # Feature Names need updating!!
        FeatSelection_RandLogReg = False
        if FeatSelection_RandLogReg == True:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5,
                                                       sample_fraction=0.95,
                                                       n_resampling=40,
                                                       selection_threshold=0.2,
                                                       n_jobs=-1).fit(X, y)
            X_L1 = LogRegFeats.transform(X)
            featureNames = featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:", X_L1.shape)
        elif FeatSelection_SVM == True:
            svc_L1 = LinearSVC(C=30, penalty="l2", dual=False,
                               class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X, y)
            featureNames = featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print("L1 SVM Transformed X:", X_L1.shape)
            # X=X_L1
        '''
        print("Performance as a function of percent of features used:")
        PlotPerfPercentFeatures(X,y,est=LinearSVC())
        '''
        'EG - graph best features; feature selection using RF, ensemble classifiers..'
        'http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb'
        RFE_FeatsToKeep = 16
        FeatSelection_RFE = False
        FeatSelection_RFECV = False
        if (FeatSelection_RFE or FeatSelection_RFECV) == True:
            'RFE + - best feats'
            'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html'
            svc = LinearSVC(class_weight='auto')  # ,penalty='l1',dual=False)
            # svc = LogisticRegression(class_weight='auto')  # ,C=1)
            if FeatSelection_RFECV == True:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,
                              scoring='average_precision')
                # ,cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3))
                # ,scoring='f1',verbose=0)
                # " scoring='roc_auc','recall','f1',accuracy..."
            else:
                rfecv = RFE(estimator=svc, n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV == True:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_), rfecv.score(X, y))
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = rfe_featnames
            print("RFE selected feature names:", rfe_featnames)
            X_RFE = rfecv.fit_transform(X, y)
            print("X_RFE", X_RFE.shape)
            resDict['TopRFE-Features'][fileName] = str(rfe_featnames)
            'Set GetRFEPerf To true or by user, if perf. of reduced set wanted'
            GetRFEPerf = False
        # print("lb_encoder.classes_",lb_encoder.classes_)
        'Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb'
        'Confusion matrixes + Dummies - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/'
        'http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators'
        "http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html"
        print()
        "Make custom F1 scorer. May not have fixed problem!"
        from sklearn.metrics import make_scorer  # make_scorer lives in sklearn.metrics, not sklearn.metrics.score
        f1_scorer = make_scorer(metrics.f1_score, greater_is_better=True, average="micro")
        # Maybe another metric? May NOT be fixed!?  (weighted, micro, macro, none)
        # print("Dummy classifiers output:")
        dummy_frequent = DummyClassifier(strategy='most_frequent', random_state=0)
        y_dummyPred = Get_yPred(X, y, clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y, y_dummyPred))
        dummy_freq_f1 = '{:.3}'.format(metrics.f1_score(y, y_dummyPred, average='weighted'))
        dummy_freq_f1_weighted = '{:.3}'.format(f1_scorer(y, y_dummyPred))
        # Get f1 from ALL classes..
        dummy_freq_f1_mean = (metrics.f1_score(y, y_dummyPred, average=None)).mean()
        # print("Dummy, most frequent acc:", dummy_freq_acc)
        # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0)
        # dummy_strat2 = '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent)))  # ,sample_weight=balance_weights(y)))
        # 'print("Dummy, Stratified Random:",dummy_strat2)'
        print()
        resDict['dummy_freq:Accuracy'][fileName] = dummy_freq_acc
        # resDict['dummy_freq:f1'][fileName] = dummy_freq_f1
        resDict['dummy_freq:f1'][fileName] = dummy_freq_f1_mean
        resDict['dummy_freq_f1_weighted'][fileName] = dummy_freq_f1_weighted
        # resDict.dummy_Stratfreq[fileName] = dummy_strat2
        "We can get separately the best model for Acc, and the best for f1!"
        "WARNING!? In binary case - default F1 works for the 1 class, in sklearn 15. and lower"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam='f1')
        "Temporary workaround until next SKlearn update of F1 metric:"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam='f1')
        bestEst_f1, bestScore_f1 = ModelParam_GridSearch(X, y, cv=3, scoreParam=f1_scorer)
        bestEst_acc, bestScore_acc = ModelParam_GridSearch(X, y, cv=2, scoreParam='accuracy')
        print("bestEst (f1):", bestEst_f1)    # ,"best f1",bestScore_f1)
        print("bestEst (acc):", bestEst_acc)  # ,"best acc",bestScore_acc)
        # Temp
        # bestEst_f1 = bestEst_acc = bestEst = RandomForestClassifier(n_jobs=-1)
        if GetRFEPerf == True:
            bestEst_RFE, bestScore_RFE = ModelParam_GridSearch(X_RFE, y, cv=3, scoreParam='f1')
        "Modified to get 2 estimators"
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y,
                                     cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18),
                                     n_jobs=-1)  # Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y,
                                    cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18),
                                    n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))
        resDict['Accuracy'][fileName] = round(scores_acc.mean(), 4)
        resDict['Accuracy_SD'][fileName] = round(scores_acc.std(), 4)
        resDict['f1'][fileName] = round(scores_f1.mean(), 4)
        resDict['f1_SD'][fileName] = round(scores_f1.std(), 4)
        resDict['Array-f1-Scores'][fileName] = (scores_f1)
        resDict['Array-Acc-Scores'][fileName] = (scores_acc)
        resDict['bestML-f1'][fileName] = (str(bestEst_f1))
        resDict['bestML-Acc'][fileName] = (str(bestEst_acc))
        # ORIG
        # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst, n=15)
        # resDict['Accuracy'][fileName] = round(Acc,4)
        # resDict['Accuracy_SD'][fileName] = round(Acc_SD,4)
        # resDict['f1 score'][fileName] = round(f1,4)
        # resDict['f1_SD'][fileName] = round(f1_SD,4)
        # resDict['Best (f1) Model parameters'][fileName] = bestEst
        print()
        # print(fileName, " Done")
    print("Saving results to file")
    resDict.to_csv("OutputData.tsv", sep=',')
def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None):
    # Store the original feature list and normalize the data
    list_temp = self.feature_list
    scaler = StandardScaler()
    X_minmax = scaler.fit_transform(X)
    self.X_minmax = copy.deepcopy(X_minmax)
    self.scores = []
    # Determine the number of folds to be used.
    kfold = StratifiedKFold(n_splits=5, shuffle=True)
    for outer in range(self.outer_loop):
        print("\n--------This is outer loop {}---------\n".format(outer + 1))
        # Run the outer loop from here
        for i, (train_o, test_o) in enumerate(kfold.split(X_minmax, y)):
            self.loop_indices.append((train_o, test_o))
            print("This is set {}".format(i + 1))
            X_train_o = X_minmax[train_o]
            y_train_o = y[train_o]
            X_test_o = X_minmax[test_o]
            y_test_o = y[test_o]
            X_train_transformed = copy.deepcopy(X_train_o)
            X_test_transformed = copy.deepcopy(X_test_o)
            # Run the inner loop from here
            for inner in range(self.inner_loop):
                # If the number of features is very high (>100), set the minimum number of
                # features to 100. If the number of features is moderate (15-100), set the
                # minimum to 10 fewer than are already present.
                n_feat = min(100, X_train_transformed.shape[1] - 10)
                # If the number of features is low (<15), select at least 10 features
                # so the loop can continue.
                n_feat = max(10, n_feat)
                list_temp_prev = list_temp
                print("\n\t--------This is inner loop {}---------\n".format(inner + 1))
                rfecv = RFECV(estimator=self.clf, step=1, min_features_to_select=n_feat,
                              cv=kfold, scoring='accuracy')
                # rfecv = xgb.XGBClassifier()
                # Transform the datasets at each loop to keep track of reduced features
                # rfecv.fit(X_train_transformed, y_train_o)
                # X_train_transformed = rfecv.transform(X_train_transformed)
                X_train_transformed = rfecv.fit_transform(X_train_transformed, y_train_o)
                self.models.append(rfecv)
                X_test_transformed = rfecv.transform(X_test_transformed)
                X_minmax = rfecv.transform(X_minmax)
                features = rfecv.n_features_
                print("\tShape of transformed train dataset is: {}".format(X_train_transformed.shape))
                print("\tOptimal no. of features are: {}".format(features))
                ranking = rfecv.ranking_
                # Update the feature list here
                list_temp = self.updateFeatures(list_temp_prev, ranking)
            # This is just used to check the score after the inner loop is finished, as the
            # test data was already transformed to reduced features; hence we invert the
            # transform before scoring.
            X_temp = rfecv.inverse_transform(X_test_transformed)
            score = rfecv.score(X_temp, y_test_o)
            self.scores.append(score)
            print("Shape of transformed train dataset is: {}".format(X_train_transformed.shape))
            print("Shape of ranks is: {}\n\n".format(ranking.shape))
    # Print the average scores after finishing the outer loop and save the features
    # in an excel file
    print("After outer loop CV, mean score is: {}".format(mean(self.scores)))
    self.list = list_temp_prev
    self.ranking = ranking
    print(X_train_transformed.shape)
    print(X_test_transformed.shape)
    self.X_transformed = np.vstack((X_train_transformed, X_test_transformed))
    return self
X_selected = X_perc.transform(X)

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# sag works well on large datasets but is sensitive to feature scaling; saga also handles sparsity
rfeLoR = RFE(LogisticRegression(solver='saga', max_iter=1000), n_features_to_select=100)
rfeLoR.fit(X, Y)
rfeLoR.n_features_

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

m_RFERFC = RFECV(RandomForestClassifier(n_estimators=100), scoring='accuracy')
m_RFERFC.fit(X, Y)  # returns the fitted model
X_RFERFC = m_RFERFC.predict(X)  # note: transform(X) would return the reduced feature matrix instead
m_RFERFC.score(X, Y)

from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel

m_lasso = SelectFromModel(LassoCV())
m_lasso.fit(X, Y)
m_lasso.transform(X).shape
X_lasso = m_lasso.transform(X)
m_lasso.get_params()
mask = m_lasso.get_support()
print(mask)
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
X.columns[mask]
# Using CV helps reduce selection bias due to the observations in the training set
# X_test_selected = modelfit.transform(X_test)
rfecv.fit(trainData, trainLabel)
print("Optimal number of features : %d" % rfecv.n_features_)

# Plotting features with cross validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

# After an hour, the SVM model has been trained, optimizing the features in the database.
# Using only these features will reduce the training time of the model, so only
# 373 of the 561 input features were used.
print('Accuracy of the SVM model on test data is ', rfecv.score(testData, testLabel))

# Getting the best features
best_features = []
for ix, val in enumerate(rfecv.support_):
    if val:
        best_features.append(testData[:, ix])

# The above yields an accuracy of approximately 97%. The following helps with visualization.
from pandas.plotting import scatter_matrix  # pandas.tools.plotting was removed

visualize = pd.DataFrame(np.asarray(best_features).T)
print(visualize.shape)
scatter_matrix(visualize.iloc[:, 0:5], alpha=0.2, figsize=(6, 6), diagonal='kde')
df['SexN'] = df['Sex']
df1['SexN'] = df1['Sex']
enc = LabelEncoder()
df['SexN'] = enc.fit_transform(df['Sex'])
df1['SexN'] = enc.fit_transform(df1['Sex'])
X_train = df[['Pclass', 'SibSp', 'Parch', 'Fare', 'AgeN', 'SexN']]
y_train = df['Survived']
X_test = df1[['Pclass', 'SibSp', 'Parch', 'Fare', 'AgeN', 'SexN']]
X_test1 = df1[['PassengerId', 'Pclass', 'SibSp', 'Parch', 'Fare', 'AgeN', 'SexN']]
svc = SVC(kernel='linear')
# svc = DecisionTreeClassifier(criterion='entropy')
rfecv = RFECV(estimator=svc, step=1,
              cv=StratifiedKFold(5),  # older sklearn passed y_train to StratifiedKFold here
              scoring='accuracy')
rfecv.fit(X_train, y_train)
predictions = rfecv.predict(X_test)
print(rfecv.score(X_train, y_train))
print("Optimal number of features : %d" % rfecv.n_features_)
finlist = zip(X_test1['PassengerId'], predictions)
with open("/Users/prakashchandraprasad/Desktop/datasets/Titanic/Decision_tree_titanic7.csv",
          "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["PassengerId", "Survived"])
    writer.writerows(finlist)
# #### making the test train data set

# In[37]:

X = train.iloc[:, :73]
Y = train.iloc[:, -1:]

# #### cross validation and calculate the train and test score.

# In[46]:

cv = KFold(n_splits=5, random_state=None, shuffle=True)
scores = []
for (train1, test1), i in zip(cv.split(X, Y), range(5)):
    rfe.fit(X.iloc[train1], Y.iloc[train1])
    train_score = rfe.score(X.iloc[train1], Y.iloc[train1])
    test_score = rfe.score(X.iloc[test1], Y.iloc[test1])
    scores.append((train_score, test_score))
pd.DataFrame(scores, columns=['Train', 'Test'])

# In[47]:

print('Optimal number of features:', rfe.n_features_)  # printing the optimal feature count after RFE

# #### plotting the AUC ROC graph to see the model score

# In[48]:

import matplotlib.pyplot as plt
# In[91]:

selector.n_features_

# Which features were retained?

# In[92]:

X_train.columns[selector.support_]

# Score of the underlying LinearSVC on the training set:

# In[93]:

selector.score(X_train, y_train)

# Hopefully there was not too much overfitting.
# Reduce our data to the retained features:

# In[94]:

X_train = X_train.loc[:, selector.support_]
X_test = X_test.loc[:, selector.support_]

# # 4 Predictive Modeling
# <a id='4'></a>

# In[95]:
def ExecuteRFECV(samples, y, featureNames, clusters, clusterNames, clf, kFolds,
                 nSplits, standardization, removedInfo, permutation,
                 nPermutation, currentDateTime, resultDir, debug, verbose):
    rfecv = RFECV(estimator=clf, cv=StratifiedKFold(kFolds),
                  scoring='accuracy', n_jobs=-1)
    # Create empty Pandas dataframes
    cvResults = pandas.DataFrame()
    decodingAccuracy = pandas.DataFrame()
    permResults = pandas.DataFrame()
    avg_perm_DA = []
    # Execute feature selection nSplits times
    for it in list(range(nSplits)):
        # Randomly create stratified train and test partitions (1/3 - 2/3)
        xTrain, xTest, yTrain, yTest = tts(samples, y['Cluster'],
                                           test_size=0.33, stratify=y['Cluster'])
        # Data z-score standardization
        xTrainSet, zPrm = Standardize(xTrain, yTrain, standardization, debug)
        # "accuracy" is proportional to the number of correct classifications
        if verbose:
            print(' Fitting for split #{}'.format(it))
        rfecv.fit(xTrainSet, yTrain)
        # Append the dataframe with the new cross-validation results.
        cvResults['cv_Scores_' + str(it)] = rfecv.grid_scores_
        cvResults['cv_Features_Rank_' + str(it)] = rfecv.ranking_
        if debug:
            print('cvResults for it %d' % it)
            print(cvResults)
        # Plot number of features VS. cross-validation scores
        fig_cv = plt.figure(dpi=300)
        plt.subplot(211)
        plt.title('Best performance = %.2f with %d features' %
                  (max(rfecv.grid_scores_), rfecv.n_features_))
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross-validation score %")
        plt.plot(range(len(rfecv.grid_scores_)), rfecv.grid_scores_)
        # Subplot selected features
        plt.subplot(212)
        plt.title('Features selection')
        plt.xlabel("Features")
        plt.xticks(range(len(rfecv.grid_scores_)), featureNames,
                   rotation='vertical')
        plt.ylabel("Selection")
        plt.scatter(range(len(rfecv.grid_scores_)), rfecv.support_)
        plt.grid()
        plt.tight_layout()
        savedPlotName = resultDir + 'RFECV' + '_CV_DA_' + clusters + '_' + \
            str(it + 1) + '_' + str(nSplits) + '.png'
        plt.savefig(savedPlotName, bbox_inches='tight')
        plt.close(fig_cv)
        if verbose:
            print('\tComplete')
        # ********************************** TEST *************************************
        # Standardize the test set using trainset standardization parameters
        xTestSet = ApplyStandardization(xTest, zPrm)
        if verbose:
            print(' Testing')
        # Use the score() function to calculate DAs
        if debug:
            print('scores' + str(it))
            print(rfecv.score(xTestSet, yTest))
        decodingAccuracy['test_DA_' + str(it)] = [rfecv.score(xTestSet, yTest)]
        # Plot confusion matrix
        y_pred = rfecv.predict(xTestSet)
        cm = confusion_matrix(yTest, y_pred)
        fig_CM = plt.figure(dpi=300)
        plot_confusion_matrix(cm, clusterNames, normalize=True, precision=2)
        savedPlotName = resultDir + 'RFECV' + '_' + clusters + '_ConfusionMatrix_' + \
            str(it + 1) + '_' + str(nSplits) + '.png'
        plt.savefig(savedPlotName, bbox_inches='tight')
        plt.close(fig_CM)
        if it == nSplits - 1:
            print('\nTest Decoding accuracy')
            decodingAccuracy['test_Avg_DA'] = decodingAccuracy.iloc[0][:].mean()
            for i in list(range(len(decodingAccuracy.iloc[0]))):
                print('\t' + str(decodingAccuracy.iloc[0].index[i]) + '\t' +
                      str(decodingAccuracy.iloc[0][i]))
            # Formatting test results to save in excel file
            fTest = []
            for i in range(len(list(decodingAccuracy)) - 1):
                fTest.append(decodingAccuracy.iloc[0][i])
            testDA = pandas.DataFrame()
            testDA['test_DA_per_epoch'] = fTest
            tmp = pandas.DataFrame(data=[np.mean(testDA['test_DA_per_epoch'])],
                                   columns=['avg_test_DA'])
            testDA = pandas.concat([testDA, tmp], axis=1)
            print('\tComplete\n')
        # ****************************** Permutation **********************************
        if permutation:
            if verbose:
                print(' Permuting')
            # Create subset based on selected best features
            xTrain_rfecv = rfecv.transform(xTrainSet)
            xTest_rfecv = rfecv.transform(xTestSet)
            permResults['permutation_DA_' + str(it)] = Permute(clusters,
                                                               xTrain_rfecv,
                                                               xTest_rfecv,
                                                               yTrain, yTest,
                                                               nPermutation,
                                                               debug_flag=0)
            avg_perm_DA.append(np.mean(permResults['permutation_DA_' + str(it)]))
            # savedHistName = resultDir+'/Permutation_hist_'+str(it)+'.png'
            # PlotPermHist(permResults, testDA.iloc[0][1],
            #              currentDateTime, savedHistName)
    if permutation:
        # Compute permutation DA average and keep results in a dataframe
        epochedPermDA = ComputePermutationAvgDA(avg_perm_DA)
        print('Average permutation DA per train epoch')
        for i in epochedPermDA['Avg_Permutation_DA_per_epoch']:
            print('\t' + str(i))
        print('\nAverage permutation DA : {}'.format(
            epochedPermDA['Global_Permutation_DA'][0]))
        savedHistName = resultDir + 'Average_Permutation_hist.png'
        PlotPermHist(permResults, testDA.iloc[0][1], currentDateTime, savedHistName)
        # Formatting permutation results to save in excel file
        permResults = pandas.concat([permResults, epochedPermDA], axis=1)
    # ************************ Select best of best features ***********************
    ranks = cvResults.iloc[:, 1::2]
    if debug:
        print(ranks)
    bestFeatures = pandas.DataFrame()
    bestFeatures = ranks[(ranks == 1).all(1)].index.tolist()
    print('\nBest features :')
    tmp = []
    for i in bestFeatures:
        tmp.append(featureNames[i])
        print('\t' + featureNames[i])
    bestFeaturesNames = pandas.DataFrame(data=tmp, columns=['Best_Features'])
    # Calculate the number of times every feature is selected as best
    bestFeaturesHist = ranks[(ranks == 1)].sum(axis=1)
    bestFeaturesHist.rename('Best_Features_Hist')
    # Build structure of histogram data to save in excel
    hist = pandas.DataFrame(data=featureNames, columns=['Features_Name'])
    hist['Occurence_Best'] = bestFeaturesHist
    nbSubject = pandas.DataFrame(data=[len(samples)],
                                 columns=['Number_Of_Subjects'])
    nbFeature = pandas.DataFrame(data=[samples.shape[1]],
                                 columns=['Number_Of_Features'])
    dataSize = pandas.concat([nbSubject, nbFeature], axis=1)
    # Get the best test DA and corresponding training set of features
    bestDA = testDA['test_DA_per_epoch'].max()
    bestDAepoch = testDA['test_DA_per_epoch'].idxmax()
    colName = 'cv_Features_Rank_' + str(bestDAepoch)
    bTrainFeat = cvResults[colName][(cvResults[colName] == 1)].index.tolist()
    tmp = []
    tmp.append(bestDA)
    for i in bTrainFeat:
        tmp.append(featureNames[i])
    bTrainFeatName = pandas.DataFrame(data=tmp, columns=['Best_Train_Features_Set'])
    # Build results structure to be saved in excel file
    excelResults = pandas.concat([cvResults, testDA, permResults, hist,
                                  bestFeaturesNames, removedInfo, dataSize,
                                  bTrainFeatName], axis=1)
    # excelResults.to_excel(resultDir+'results_RFECV_'+currentDateTime+'.xlsx',
    #                       sheet_name=xlSheetName)
    return excelResults
print(featureNames)
print(model.feature_importances_)
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
print('Variable importance obtained')

# estimator for rfecv
est = linear_model.LogisticRegression()
# fit rfecv
rfecv = RFECV(estimator=est, step=1,
              cv=StratifiedKFold(10),  # older sklearn passed y to StratifiedKFold here
              scoring='accuracy')
rfecv.fit(X, y)
print(rfecv.support_)
print(rfecv.ranking_)
print('score = ', rfecv.score(X, y))
print("Optimal number of features : %d" % rfecv.n_features_)
Rclf.fit(Xtrain, ytrain)
print("Residual sum of squares: %.2f" % np.mean((Rclf.predict(Xtest) - ytest) ** 2))
print('Regularization chosen, alpha = %.2f' % Rclf.alpha_)
print(' Coef values = ', Rclf.coef_)
print('Variance score: %.2f' % Rclf.score(Xtest, ytest))

selector = RFECV(rf, step=1,
                 cv=ShuffleSplit(n_splits=10, test_size=0.2))  # older sklearn: ShuffleSplit(len(X), 10, .2)
selector = selector.fit(X, y)
print(selector.n_features_)
for i, j in enumerate(selector.support_):
    if j == True:
        print(features[i])
print('Variance score Train: %.2f' % selector.score(X, y))
print('Variance score Test: %.2f' % selector.score(Xtest, ytest))
# print('Coeff of Test: ', selector.coef_)
print('No of Features selected by RFECV = %.2f' % sum(selector.support_))
plotfit(selector, Xtest, ytest)

# Learning Curve
from sklearn.model_selection import learning_curve  # sklearn.learning_curve was removed


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=list(range(3, 23, 3))):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
# print(sub_title)
svc = svm.LinearSVC()
# clf = tree.DecisionTreeClassifier()
rfecv = RFECV(svc, step=1,
              cv=StratifiedKFold(2),  # older sklearn passed target to StratifiedKFold here
              scoring='accuracy')
rfecv.fit(selected, target)
rf_support = rfecv.support_
# print(rf_support)
sub2_title = []
for i in range(len(rf_support)):
    if rf_support[i]:
        sub2_title.append(sub_title[i])
print(sub2_title)
# print(array)
print(rfecv.score(selected, target))
print("Optimal number of features : %d" % rfecv.n_features_)
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()