def feature_selection_LR():
    """Select 30 features with RFE and print a test-set model report.

    NOTE(review): the name suggests logistic regression, but the RFE
    estimator is a RandomForestClassifier -- confirm which was intended.
    Relies on module-level globals: X_train, X_train_scaled, X_test_scaled,
    y_train, y_test, predictors, metrics, RandomForestClassifier.
    """
    from sklearn.feature_selection import RFE
    # Eliminate 5 features per RFE iteration until 30 remain.
    rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=30, step=5, verbose=5)
    rfe_selector.fit(X_train_scaled, y_train)
    y_pred = rfe_selector.predict(X_test_scaled)
    # Positive-class probability, used for the AUC score below.
    y_predprob = rfe_selector.predict_proba(X_test_scaled)[:, 1]
    rfe_support = rfe_selector.get_support()
    # Map the boolean support mask back to the original column names.
    rfe_feature = X_train[predictors].loc[:, rfe_support].columns.tolist()
    print(str(len(rfe_feature)), 'selected features')
    print('RFE features')
    print(rfe_feature)
    # Print model report:
    print("\nModel Report")
    #print("Train Accuracy : %.4g" % metrics.accuracy_score(y_train, y_pred_train))
    print("Test Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred))
    #print('Train error: {:.3f}'.format(1 - metrics.accuracy_score(y_train, y_pred_train)))
    print('Test error: {:.3f}'.format(1 - metrics.accuracy_score(y_test, y_pred)))
    print("AUC Score : %f" % metrics.roc_auc_score(y_test, y_predprob))
    print("Recall : %f" % metrics.recall_score(y_test, y_pred))
    print("Precision : %f" % metrics.precision_score(y_test, y_pred))
    print("F-measure : %f" % metrics.f1_score(y_test, y_pred))
    # Confusion matrix: rows = actual, columns = predicted.
    c_matrix = metrics.confusion_matrix(y_test, y_pred)
    print('========Confusion Matrix==========')
    print(" Rejected Accepted")
    print('Rejected {} {}'.format(c_matrix[0][0], c_matrix[0][1]))
    print('Accepted {} {}'.format(c_matrix[1][0], c_matrix[1][1]))
def perform_rfe(model, train, test, filename, to_remove=None):
    """Iteratively eliminate one feature per round via RFE, scoring each
    feature subset through an external submission/results pipeline.

    Parameters:
        model     -- estimator exposing fit / predict_proba (used by RFE)
        train     -- DataFrame containing the TARGET column plus features
        test      -- DataFrame of test features (same columns as train minus TARGET)
        filename  -- results CSV name passed to build_results_csv
        to_remove -- number of elimination rounds; defaults to 30% of columns

    Relies on module-level names: TARGET, RFE, floor, sleep,
    build_results_csv, send_submission.

    NOTE(review): if to_remove resolves to 0 the loop never runs and
    `results` is unbound at the return -- confirm inputs guarantee >= 1 round.
    """
    if to_remove is None:
        # Default: eliminate roughly 30% of the available feature columns.
        to_remove = floor(0.3 * len(train.columns))
    X = train.drop(TARGET, axis=1)
    y = train[TARGET]
    # Baseline score with the full feature set before any elimination.
    model.fit(X, y)
    preds = model.predict_proba(test)[:, 1]
    build_results_csv(filename, X.columns,
                      send_submission("doesnt_matter.csv", preds),
                      create_file=True)
    sleep(3)  # throttle calls to the external submission service
    for i in range(to_remove):
        # Drop exactly one feature per round.
        rfe = RFE(model, n_features_to_select=len(X.columns) - 1).fit(X, y)
        # Score BEFORE narrowing `test`; RFE masks the dropped column itself.
        preds = rfe.predict_proba(test)[:, 1]
        X = X.iloc[:, rfe.get_support()]
        test = test.iloc[:, rfe.get_support()]
        results = build_results_csv(
            filename, X.columns,
            send_submission("doesnt_matter.csv", preds))
        sleep(3)
    return results
def train_recursive_feature_elimination(x_train, y_train, x_test, y_test,
                                        feature_num=10):
    """Fit an RFE-wrapped logistic regression and predict on the test set.

    Parameters:
        x_train, y_train -- training features / labels
        x_test, y_test   -- test features / labels (y_test is unused here)
        feature_num      -- number of features RFE should keep (default 10)

    Returns:
        (y_predicted, y_prob) -- hard predictions and class probabilities
        for x_test.
    """
    print("-------------RFE Model-------------")
    # Equal class weights (kept explicit so the weighting is easy to tune).
    class_weight = {0: 1, 1: 1}
    model = LogisticRegression(solver='sag', class_weight=class_weight)
    # model = RandomForestClassifier(n_estimators=100)
    # model = SVC(gamma='scale', probability=True, kernel='poly')
    # FIX: pass n_features_to_select by keyword -- the positional form was
    # deprecated in scikit-learn 1.0 and removed in 1.2.
    rfe = RFE(model, n_features_to_select=feature_num)
    # RFE Fit
    rfe.fit(x_train, y_train)
    # RFE Predict
    y_predicted = rfe.predict(x_test)
    y_prob = rfe.predict_proba(x_test)
    print(rfe.support_)  # boolean mask of the retained features
    return y_predicted, y_prob
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import pickle
import pandas as pd

# Features and labels pickled by an upstream preprocessing step.
train_data = pickle.load(open('xwd/train_feat.pkl', 'rb'))
train_label = pickle.load(open('xwd/train_label.pkl', 'rb'))
'''
train_data = train_data[:100]
train_label = train_label[:100]
train_data.fillna(0,inplace=True)
train_label.fillna(0,inplace=True)
'''
valid_data = pickle.load(open('xwd/valid_feat.pkl', 'rb'))
valid_label = pickle.load(open('xwd/valid_label.pkl', 'rb'))
test = pickle.load(open('xwd/test_feat.pkl', 'rb'))

# NOTE(review): `gbm` is constructed but never used below -- the RFE
# selector wraps LogisticRegression instead. Confirm which estimator
# was intended (GradientBoostingClassifier is imported and unused too).
gbm = lgb.LGBMClassifier(
    objective='binary',
    num_leaves=200,  #600W
    learning_rate=0.05,
    min_child_samples=100,
    n_estimators=1)

# Keep the 70 best features according to RFE over logistic regression,
# then score the test set with the refit selector.
model = RFE(estimator=LogisticRegression(),
            n_features_to_select=70).fit(train_data, train_label)
proba_test = model.predict_proba(test)
# Candidate predictor columns selected during earlier feature engineering.
cols = [
    "Age", "Fare", "TravelAlone", "Pclass_1", "Pclass_2",
    "Embarked_C", "Embarked_S", "Sex_male", "IsMinor"
]
X = final_train[cols]
Y = final_train["Survived"]
# Fixed seed so the train/test split is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=0)
logistic = LogisticRegression()
# FIX: pass n_features_to_select by keyword -- the positional form was
# deprecated in scikit-learn 1.0 and removed in 1.2.
rfe = RFE(logistic, n_features_to_select=8)
rfe.fit(x_train, y_train)
# summarize the selection of the attributes
print('Selected features: %s' % list(X.columns[rfe.support_]))
print("=========================")
y_pred = rfe.predict(x_test)
print("========================")
# Positive-class probability for ROC / log-loss metrics.
y_pred_proba = rfe.predict_proba(x_test)[:, 1]
[fpr, tpr, thr] = roc_curve(y_test, y_pred_proba)
print('Train/Test split results:')
print(rfe.__class__.__name__ + " accuracy is %2.3f" % accuracy_score(y_test, y_pred))
print(rfe.__class__.__name__ + " log_loss is %2.3f" % log_loss(y_test, y_pred_proba))
print(rfe.__class__.__name__ + " auc is %2.3f" % auc(fpr, tpr))
# import modules import numpy as np import pandas as pd from sklearn.naive_bayes import BernoulliNB from sklearn.feature_selection import RFE # read data data = pd.read_csv('data_cleaned.csv') # make data ready data_x = data.drop(['Target', 'Client_ID'], axis = 1) data_y = data.Target # set train and test train_x = data_x[data_x['X2006'] == 0].values train_y = data_y[data_x['X2006'] == 0].values test_x = data_x[data_x['X2006'] == 1].values test_y = data_y[data_x['X2006'] == 1].values # set model nb = BernoulliNB(250) # select features rfe = RFE(nb, n_features_to_select=39) rfe.fit(train_x, train_y); # make predictions pred = rfe.predict_proba(data_x) # create submission pd.DataFrame({'Client_ID':data.Client_ID, 'Cross_Sell':pred[:, 1]}).to_csv('sub_final.csv', index=False)
return target nullmod = nullmod(df, target, other) y210 = getTargetDf(df, target, other) ypred = nullmod.predict(y210) y210[target] = ypred # 递归特征选择 from sklearn.feature_selection import RFE estimator = xgb.XGBClassifier(**params) selector = RFE(estimator, 200, step=0.1) selector = selector.fit(xtrain, ytrain) p = selector.predict_proba(xvalid) roc_auc_score(yvalid, p[:, 1]) #------------------------------------------- # KNN KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, # power, minkowski 的2次方 即euclidean_distance metric='minkowski',
# Training-set ROC for the current model (positive-class probabilities).
y_pred_prob_train = model.predict_proba(X_train)[:, 1]
logreg_roc_auc = roc_auc_score(y_train, y_pred_prob_train)
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train, y_pred_prob_train)
len(thresholds)
# Smallest threshold above 0.8 (thresholds are returned in decreasing order).
thresholds[thresholds > 0.8][-1]
plt.plot(fpr, tpr, label="area=%0.4f" % logreg_roc_auc)
# FIX: legend's loc accepts "best" or an integer code; the string "0" is
# not a valid location name.
plt.legend(loc="best")
from sklearn.feature_selection import RFE
model = LogisticRegression()
# FIX: pass n_features_to_select by keyword -- the positional form was
# deprecated in scikit-learn 1.0 and removed in 1.2.
rfe = RFE(model, n_features_to_select=30)
rfe.fit(X_train, y_train)
# Features ranked 1 are the ones RFE kept.
feature = [c for c, r in zip(X_train.columns, rfe.ranking_) if r == 1]
feature
# Restrict both splits to the selected columns (column subsetting replaces
# the original per-column copy loops; result is identical).
x_train_new = X_train[feature].copy()
x_test_new = X_test[feature].copy()
# Refit on the reduced matrix, then evaluate on the held-out split.
rfe = rfe.fit(x_train_new, y_train)
y_pred = rfe.predict(x_test_new)
y_pred_prob = rfe.predict_proba(x_test_new)[:, 1]
score = roc_auc_score(y_test, y_pred_prob)
# Inspect which features the fitted RFE kept (mask and per-feature rank).
print(rfe.support_)
print(rfe.ranking_)
col = list(X_train.columns)
rank = list(rfe.ranking_)
# Features with rank 1 are the selected ones.
new_list = []
for i in range(len(col)):
    if rank[i] == 1:
        new_list.append(col[i])
# Rebuild the training matrix with only the selected columns.
x_new = pd.DataFrame()
for i in new_list:
    x_new[i] = X_train[i]
rfe = rfe.fit(x_new, y_train)
# NOTE(review): all metrics below are computed on TRAINING data only --
# they measure fit, not generalization.
y_pred_new = rfe.predict(x_new)
new_accuracy = rfe.score(x_new, y_train)
y_new_pred_prob = rfe.predict_proba(x_new)
new_roc_auc_score = roc_auc_score(y_train, y_new_pred_prob[:, 1])
#New model for test data
model = LogisticRegression(class_weight="balanced", penalty="l2")
model.fit(X_train, y_train)
y_pred_test = model.predict(X_test)
y_pred_test = y_pred_test.astype(np.int16)
# Build the Kaggle submission frame indexed by PassengerId.
df3 = df2["PassengerId"]
df3 = pd.DataFrame(df3)
df3.set_index("PassengerId", inplace=True)
df3["Survived"] = y_pred_test
df3.to_csv(r'C:\Users\Admin\Desktop\ml practice\kaggle_titanic.csv')
# NOTE(review): Python 2 syntax (print statement) -- this fragment predates
# Python 3 and also uses the long-removed DataFrame.ix indexer.
print matchups['home_team']
# Remove the 'week' 'home_team' and 'away_team' columns from matchups as they are not used in the algorithm
matchups.drop(['week', 'home_team', 'away_team'], axis=1, inplace=True)
'''You'll likely want to use the a pickled model from previous regression predicting 2015 results'''
# Sweep over candidate feature counts and regularization strengths.
for feat in range(1, len(matchups.columns)):
    for c in C_vec:
        # Create the classifier and check the score
        # clf = LogisticRegression()
        clf = linear_model.LogisticRegression(C=c, random_state=42)
        selector = RFE(clf)
        selector = selector.fit(X_train, y_train)
        # Calculate probabilities using the predict_proba method for logistic regression
        probabilities = selector.predict_proba(scaler.transform(matchups))
        # Vectorize the spread_conversion function and apply the function to the probabilities result vector
        vfunc = np.vectorize(spread_conversion)
        # Convert P(home loses) (column 0) into a predicted point spread.
        predicted_spreads = np.apply_along_axis(vfunc, 0, probabilities[:, 0])
        # If the actual line for the home team is lower than the predicted line then you would take the away team, otherwise take the home team
        bet_vector = np.array(np.where(predicted_spreads > spreads, 0, 1))
        # Create the actual result vector where a tie counts as a loss for the home team
        game_result = np.array(
            np.where(
                home_score.ix[:, 0] + predicted_spreads[:] > away_score.ix[:, 0], 1, 0))
        # Check to see where the bet_vector equals the actual game result with the spread included
from var_clus import VarClus

# Variable clustering: decompose the feature frame and show the structure.
demo = VarClus(max_eigenvalue=1.35, max_tries=5)
demo.decompose(dfc)
demo.print_cluster_structure()
#%%stepwise
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(class_weight='balanced', random_state=11,
                            solver='lbfgs')
# FIX: pass n_features_to_select by keyword -- the positional form was
# deprecated in scikit-learn 1.0 and removed in 1.2.
rfe = RFE(logreg, n_features_to_select=44)
rfe.fit(dfc, y)
rfe.predict_proba(dfc)
selectX = rfe.transform(dfc)
## find those selected variables
# NOTE(review): dfc.columns[rfe.support_] yields the names directly; the
# value-matching loop below is kept so printed output (order, duplicates)
# is unchanged.
for i in range(44):
    temp = selectX[:, i]
    for name in dfc.columns:
        temp1 = dfc[name]
        if (temp1 == temp).all():
            print(name)
# In-sample odds: P(class 0) / P(class 1).
prob = rfe.predict_proba(dfc)
odds = prob[:, 0] / prob[:, 1]
#%%
import matplotlib.pyplot as plt
# Sweep over candidate feature counts and regularization strengths for the
# RFE-wrapped logistic regression spread model.
for feat in range(1,len(matchups.columns)):
    for c in C_vec:
        # Create the classifier and check the score
        # clf = LogisticRegression()
        clf = linear_model.LogisticRegression(C=c,random_state=42)
        selector = RFE(clf)
        selector = selector.fit(X_train,y_train)
        # Calculate probabilities using the predict_proba method for logistic regression
        probabilities = selector.predict_proba(scaler.transform(matchups))
        # Vectorize the spread_conversion function and apply the function to the probabilities result vector
        vfunc = np.vectorize(spread_conversion)
        # Convert P(home loses) (column 0) into a predicted point spread.
        predicted_spreads = np.apply_along_axis(vfunc,0,probabilities[:,0])
        # If the actual line for the home team is lower than the predicted line then you would take the away team, otherwise take the home team
        bet_vector = np.array(np.where(predicted_spreads > spreads,0,1))
        # Create the actual result vector where a tie counts as a loss for the home team
        # NOTE(review): DataFrame.ix is removed in modern pandas; .iloc is the successor.
        game_result = np.array(np.where(home_score.ix[:,0] + predicted_spreads[:] > away_score.ix[:,0], 1, 0))
        # Check to see where the bet_vector equals the actual game result with the spread included
        result = np.array(np.where(bet_vector == game_result,1,0))
        # Fraction of bets that matched the actual outcome.
        prob_result = float(np.sum(result)) / len(result)
def Model(Label,Parameters=[]):
    """Train and evaluate the classifier named by Label (SVM/SVMF/SVMW/NB/
    NBF/NBW/DT/Ada.Boost/MLP/RNN/LSTM) and log averaged metrics.

    NOTE(review): Parameters=[] is a mutable default argument, and the bare
    `except: pass` below silently ignores missing keys -- both are flagged
    here but deliberately left unchanged.

    Returns the Evaluation results dict; when Parameters is non-empty it
    returns early after the first fold (used for AUC plotting).
    """
    global filepath, filename, fixed_seed_num, sequence_window, number_class, hidden_units, input_dim, learning_rate, epoch, is_multi_scale, training_level, cross_cv, is_add_noise, noise_ratio
    # Override module-level configuration from the Parameters dict when given.
    try:
        filepath = Parameters["filepath"]
        filename = Parameters["filename"]
        sequence_window = Parameters["sequence_window"]
        number_class = Parameters["number_class"]
        hidden_units = Parameters["hidden_units"]
        input_dim = Parameters["input_dim"]
        learning_rate = Parameters["learning_rate"]
        epoch = Parameters["epoch"]
        is_multi_scale = Parameters["is_multi_scale"]
        training_level = Parameters["training_level"]
        cross_cv = Parameters["cross_cv"]
        fixed_seed_num = Parameters["fixed_seed_num"]
        is_add_noise = Parameters["is_add_noise"]
        noise_ratio = Parameters["noise_ratio"]
    except:
        pass
    # Accumulates one score per fold for each metric, averaged at the end.
    result_list_dict = defaultdict(list)
    evaluation_list = ["ACCURACY","F1_SCORE","AUC","G_MEAN"]
    for each in evaluation_list:
        result_list_dict[each] = []
    np.random.seed(fixed_seed_num)  # for reproducibility
    #num_selected_features = 30
    #num_selected_features = 25#AS leak tab=0
    #num_selected_features = 32#Slammer tab=0
    num_selected_features = 33#Nimda tab=1
    for tab_cv in range(cross_cv):
        # Only the first CV fold is actually executed.
        if not tab_cv == 0 :continue
        epoch_training_loss_list = []
        epoch_val_loss_list = []
        #print(is_multi_scale)
        #using MLP to train
        if Label == "SVM":
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename, sequence_window, tab_cv, cross_cv, Multi_Scale=is_multi_scale, Wave_Let_Scale=training_level, Normalize=0)
            print(Label+" is running..............................................")
            y_train = y_train0
            clf = svm.SVC(kernel="rbf", gamma=0.00001, C=100000,probability=True)
            print(x_train.shape)
            clf.fit(x_train, y_train)
            result = clf.predict_proba(x_test)
            #return Evaluation.Evaluation(y_test, result)
            #results = Evaluation.Evaluation(y_test, result)
        elif Label == "SVMF":
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename, sequence_window, tab_cv, cross_cv, Multi_Scale=is_multi_scale, Wave_Let_Scale=training_level, Normalize=5)
            print(Label+" is running..............................................")
            clf = svm.SVC(kernel="rbf", gamma=0.00001, C=100000,probability=True)
            print(x_train.shape)
            #x_train_new = SelectKBest(f_classif, k=num_selected_features).fit_transform(x_train, y_train0)
            #x_test_new = SelectKBest(f_classif, k=num_selected_features).fit_transform(x_test, y_test0)
            clf.fit(x_train, y_train0)
            result = clf.predict_proba(x_test)
            #return Evaluation.Evaluation(y_test, result)
            #results = Evaluation.Evaluation(y_test, result)
        elif Label == "SVMW":
            # SVM with wavelet normalization plus RFE feature selection.
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename, sequence_window, tab_cv, cross_cv, Multi_Scale=is_multi_scale, Wave_Let_Scale=training_level, Normalize=6)
            print(Label + " is running..............................................")
            #SVR(kernel="linear") = svm.SVC(kernel="rbf", gamma=0.00001, C=100000, probability=True)
            estimator = svm.SVC(kernel="linear",probability=True)
            selector = RFE(estimator, num_selected_features, step=1)
            selector = selector.fit(x_train, y_train0)
            result = selector.predict_proba(x_test)
            # return Evaluation.Evaluation(y_test, result)
            # results = Evaluation.Evaluation(y_test, result)
        elif Label == "NBF":
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename, sequence_window, tab_cv, cross_cv, Multi_Scale=is_multi_scale, Wave_Let_Scale=training_level, Normalize=10)
            print(Label + " is running..............................................")
            clf = MultinomialNB()
            clf.fit(x_train, y_train0)
            result = clf.predict_proba(x_test)
        elif Label == "NBW":
            # Naive Bayes with wavelet normalization plus RFE selection.
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename, sequence_window, tab_cv, cross_cv, Multi_Scale=is_multi_scale, Wave_Let_Scale=training_level, Normalize=11)
            print(Label + " is running..............................................")
            #SVR(kernel="linear") = svm.SVC(kernel="rbf", gamma=0.00001, C=100000, probability=True)
            estimator = MultinomialNB()
            selector = RFE(estimator, num_selected_features, step=1)
            selector = selector.fit(x_train, y_train0)
            result = selector.predict_proba(x_test)
            # return Evaluation.Evaluation(y_test, result)
            # results = Evaluation.Evaluation(y_test, result)
        elif Label == "NB":
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename, sequence_window, tab_cv, cross_cv, Multi_Scale=is_multi_scale, Wave_Let_Scale=training_level, Normalize=1)
            print(Label+" is running..............................................")
            y_train = y_train0
            clf = MultinomialNB()
            clf.fit(x_train, y_train)
            result = clf.predict_proba(x_test)
            #return Evaluation.Evaluation(y_test, result)
            #results = Evaluation.Evaluation(y_test, result)
        elif Label == "DT":
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename, sequence_window, tab_cv, cross_cv, Multi_Scale=is_multi_scale, Wave_Let_Scale=training_level, Normalize=2)
            print(Label+" is running.............................................."+str(x_train.shape))
            y_train = y_train0
            clf = tree.DecisionTreeClassifier()
            clf.fit(x_train, y_train)
            result = clf.predict_proba(x_test)
            #return Evaluation.Evaluation(y_test, result)
            #results = Evaluation.Evaluation(y_test, result)
        elif Label == "Ada.Boost":
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename, sequence_window, tab_cv, cross_cv, Multi_Scale=is_multi_scale, Wave_Let_Scale=training_level, Normalize=0)
            print(Label+" is running.............................................."+str(x_train.shape))
            y_train = y_train0
            #clf = AdaBoostClassifier(n_estimators=10) #Nimda tab=1
            clf = AdaBoostClassifier(n_estimators=10)
            clf.fit(x_train, y_train)
            result = clf.predict_proba(x_test)
            #return Evaluation.Evaluation(y_test, result)
            #results = Evaluation.Evaluation(y_test, result)
        elif Label == "MLP":
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename, sequence_window, tab_cv, cross_cv, Multi_Scale=is_multi_scale, Wave_Let_Scale=training_level, Normalize=0)
            print(Label+" is running..............................................")
            # Full-batch training: batch size equals the training-set size.
            batch_size = len(y_train)
            start = time.clock()
            model = Sequential()
            model.add(Dense(hidden_units, activation="relu", input_dim=33))
            model.add(Dense(output_dim=number_class))
            model.add(Activation("sigmoid"))
            # model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
            model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
            model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=epoch)
            #result = model.predict(X_Testing, batch_size=batch_size)
            result = model.predict(x_test)
            end = time.clock()
            print("The Time For MLP is " + str(end - start))
            #return Evaluation.Evaluation(y_test, result)
            #results = Evaluation.Evaluation(y_test, result)
        #elif Label == "SVM-S":
            #x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData('Attention',filepath,filename,sequence_window,tab_cv,cross_cv)
            #x_train,y_train = Manipulation(x_train,y_train0,sequence_window)
            #x_test, y_test = Manipulation(x_test, y_test0, sequence_window)
            #clf = svm.SVC(kernel="rbf")
            #clf.fit(x_train, y_train)
            #result = clf.predict(x_test)
            #results = Evaluation.Evaluation_WithoutS(y_test, result)
        elif Label == "RNN":
            print(Label+" is running..............................................")
            start = time.clock()
            x_train_multi_list, x_train, y_train, x_testing_multi_list, x_test, y_test = LoadData.GetData(is_add_noise,noise_ratio,'Attention', filepath, filename, sequence_window, tab_cv, cross_cv, Multi_Scale=is_multi_scale, Wave_Let_Scale=training_level)
            batch_size = len(y_train)
            rnn_object = SimpleRNN(hidden_units, input_length=len(x_train[0]), input_dim=input_dim)
            model = Sequential()
            model.add(rnn_object)  # X.shape is (samples, timesteps, dimension)
            #model.add(Dense(30, activation="relu"))
            #model.add(Dropout(0.2))
            model.add(Dense(30, activation="sigmoid"))
            #model.add(Dropout(0.3))
            # model.add(Dense(5,activation="tanh"))
            model.add(Dense(output_dim=number_class))
            model.add(Activation("sigmoid"))
            # model.add(Activation("softmax"))
            # model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
            model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
            model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=epoch)
            #result = model.predict(X_Testing, batch_size=batch_size)
            result = model.predict(x_test)
            #return Evaluation.Evaluation(y_test, result)
            #results = Evaluation.Evaluation(y_test, result)
            end = time.clock()
            print("The Time For RNN is " + str(end - start))
            # print(result)
        elif Label == "LSTM":
            print(Label+" is running..............................................")
            start = time.clock()
            x_train_multi_list, x_train, y_train, x_testing_multi_list, x_test, y_test = LoadData.GetData(is_add_noise,noise_ratio,'Attention',filepath, filename, sequence_window, tab_cv, cross_cv, Multi_Scale=is_multi_scale, Wave_Let_Scale=training_level)
            batch_size = len(y_train)
            lstm_object = LSTM(hidden_units, input_length=len(x_train[0]), input_dim=input_dim)
            model = Sequential()
            model.add(lstm_object)  # X.shape is (samples, timesteps, dimension)
            # model.add(LSTM(lstm_size,return_sequences=True,input_shape=(len(X_Training[0]),33)))
            # model.add(LSTM(100,return_sequences=True))
            # model.add(Dense(10, activation="tanh"))
            # model.add(Dense(5,activation="tanh"))
            model.add(Dense(30, activation="relu"))
            #model.add(Dropout(0.2))
            #model.add(Dense(30, activation="sigmoid"))
            #model.add(Dropout(0.3))
            # model.add(Dense(5,activation="tanh"))
            model.add(Dense(output_dim=number_class))
            model.add(Activation("sigmoid"))
            #model.add(Activation("softmax"))
            # model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
            model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
            model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=epoch)
            #result = model.predict(X_Testing, batch_size=batch_size)
            result = model.predict(x_test)
            end = time.clock()
            print("The Time For LSTM is " + str(end - start))
        # Early exit used by callers that only want the plot data.
        if len(Parameters) > 0:
            return Evaluation.Evaluation(y_test, result)#Plotting AUC
        results = Evaluation.Evaluation(y_test, result)# Computing ACCURACY,F1-score,..,etc
        print(results)
        # Decode one-hot labels/predictions back to class indices and dump
        # them for false-alarm analysis.
        y_test2 = np.array(Evaluation.ReverseEncoder(y_test))
        result2 = np.array(Evaluation.ReverseEncoder(result))
        print("---------------------------1111111111111111")
        with open("StatFalseAlarm_"+filename+"_True.txt","w") as fout:
            for tab in range(len(y_test2)):
                fout.write(str(int(y_test2[tab]))+'\n')
        with open("StatFalseAlarm_"+filename+"_"+Label+"_"+"_Predict.txt","w") as fout:
            for tab in range(len(result2)):
                fout.write(str(int(result2[tab]))+'\n')
        print(result2.shape)
        print("---------------------------22222222222222222")
        for each_eval, each_result in results.items():
            result_list_dict[each_eval].append(each_result)
    # Average each metric over the executed folds.
    for eachk, eachv in result_list_dict.items():
        result_list_dict[eachk] = np.average(eachv)
    #print(result_list_dict)
    # Append the averaged metrics to the appropriate comparison log.
    if is_add_noise == False:
        with open(os.path.join(os.getcwd(),"Comparison_Log_"+filename+".txt"),"a")as fout:
            outfileline = Label+":__"
            fout.write(outfileline)
            for eachk,eachv in result_list_dict.items():
                fout.write(eachk+": "+str(round(eachv,3))+",\t")
            fout.write('\n')
    else:
        with open(os.path.join(os.getcwd(),"Comparison_Log_Adding_Noise_"+filename+".txt"),"a")as fout:
            outfileline = Label+":__"+"Noise_Ratio_:"+str(noise_ratio)
            fout.write(outfileline)
            for eachk,eachv in result_list_dict.items():
                fout.write(eachk+": "+str(round(eachv,3))+",\t")
            fout.write('\n')
    return results
'penalty': ['l2', 'l1'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] } grid = GridSearchCV(estimator=log_clf, param_grid=param_grid, scoring='roc_auc', verbose=1, n_jobs=-1) grid.fit(X, y) print("Best Score:" + str(grid.best_score_)) print("Best Parameters: " + str(grid.best_params_)) best_parameters = grid.best_params_ #Recursive feature elimination log_clf = LogisticRegression(**best_parameters) log_clf.fit(X, y) selector = RFE(log_clf, 25, step=1) selector.fit(X, y) scores_table(selector, 'selector_clf') #submission submission = pd.read_csv('../input/dont-overfit-ii/sample_submission.csv') X_test = test submission['target'] = selector.predict_proba(X_test) submission.to_csv('submission.csv', index=False)
pca = PCA(n_components=2)
pc = pca.fit_transform(x)
# FIX: project the test frame with the components fitted on x; refitting
# via fit_transform gave the two sets inconsistent (and leaked) components.
pc_t = pca.transform(t)
pcdf = pd.DataFrame(data=pc, columns=['pc1', 'pc2'])
pcdf_t = pd.DataFrame(data=pc_t, columns=['pc1', 'pc2'])
print(pcdf.shape)
print(pcdf_t.shape)
finalDf = pd.concat([pcdf, df[['target']]], axis=1)
print(finalDf.head())
print('주성분 설명력 : ', pca.explained_variance_ratio_)

# Apply RFE with a Random Forest
# StandardScaler for the RF inputs
ss = StandardScaler()
x_train_ss = ss.fit_transform(x_train)
# FIX: reuse the scaler fitted on the training split; refitting on test
# data leaks test-set statistics into the preprocessing.
x_test_ss = ss.transform(x_test)
test_ss = ss.transform(test)
print(train.shape, test.shape)

#sub model 1 : RandomForest
forest = RandomForestClassifier(n_estimators=500, random_state=7)
select = RFE(forest, n_features_to_select=77)
x_train_rf = select.fit_transform(x_train_ss, y_train)
x_test_rf = select.transform(x_test_ss)
test_rf = select.transform(test_ss)
print(x_train_rf.shape)
# NOTE(review): this refits the RFE on the already-reduced 77-feature
# matrix before scoring -- confirm that is intended.
score = select.fit(x_train_rf, y_train).score(x_test_rf, y_test)
print('RFE 후 acc : {:.3f}'.format(score))
rf_y_pred = select.predict_proba(test_rf)
attributes = attributes_balance.drop('fusao', axis=1)
print(attributes)

# Map sex to a numeric code (F -> 0, M -> 1), then one-hot encode the rest.
d = {'F': 0, 'M': 1}
attributes['Sexo'] = attributes['Sexo'].map(d).astype(int)
attributes = pd.get_dummies(attributes)
print(attributes)

# Randomly split the data into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(attributes, classes,
                                                    test_size=0.20)

# Create and train the RFE-wrapped logistic-regression model.
logreg = LogisticRegression(solver='liblinear')
# FIX: pass n_features_to_select by keyword -- the positional form was
# deprecated in scikit-learn 1.0 and removed in 1.2.
classifier = RFE(logreg, n_features_to_select=20)
classifier = classifier.fit(X_train, y_train)
# Persist the fitted selector+model for later serving.
jl.dump(classifier, 'models/diabetes_logistic_regression.joblib')
y_pred = classifier.predict(X_test)
print(y_pred)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# predict probabilities
probs = classifier.predict_proba(X_test)
print(probs)
# Report features ranked by importance.
for f in range(n_features):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
if (feat_roc):
    # 50/50 shuffled split for the ROC sweep.
    half = int(n_samples / 2)
    x, y = shuffle(x, y, random_state=random_state)
    # FIX: slice with [half:] -- [half:-1] silently dropped the last sample.
    X_train, X_test = x[0:half], x[half:]
    y_train, y_test = y[0:half], y[half:]
    rf_feat_sel = RandomForestClassifierWithCoef(n_estimators=n_trees)
    # One ROC curve per kept-feature count (1 .. n_features).
    for i in range(n_features):
        print(i)
        # FIX: pass n_features_to_select by keyword -- the positional form
        # was deprecated in scikit-learn 1.0 and removed in 1.2.
        rfe = RFE(rf_feat_sel, n_features_to_select=i + 1)
        rfe = rfe.fit(X_train, y_train)
        probas_ = rfe.predict_proba(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
        roc_auc = auc(fpr, tpr)
        if (i == 0 or i == 18):
            print("auc: ", roc_auc)
        pl.plot(fpr, tpr, lw=1)
    pl.xlim([-0.001, 1.001])
    pl.ylim([-0.001, 1.001])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title('Receiver operating characteristic example')
    pl.show()
# Try the RFE method
# Create the RFE model and select number of attributes
# We checked the appropriate number of attributes through the confusion matrix
# FIX: pass n_features_to_select by keyword -- the positional form was
# deprecated in scikit-learn 1.0 and removed in 1.2.
rfe = RFE(model, n_features_to_select=7).fit(x_train, y_train)

# summarize the selection of the attributes
print('Selected features: %s' % list(x_train.columns[rfe.support_]))

# Create a confusion matrix in the form of an array
# (evaluated on TRAINING data; the held-out test set is handled below)
y_pred = rfe.predict(x_train)
cnf_matrix = metrics.confusion_matrix(y_train, y_pred)
cnf_matrix

# Plot of predicted probabilities of survival vs actual survival
y_pred_prob = rfe.predict_proba(x_train)[:, 1]
plt.figure(7)
plt.scatter(y_pred_prob, y_train, s=10)
plt.xlabel('Predicted Chance Of Survival')
plt.ylabel('Actual Survival')
plt.tight_layout()
plt.show()
print("Accuracy:", metrics.accuracy_score(y_train, y_pred))

#####################################################
# Test our model against dataset_test
# Fill out the missing values of age with the mean
# Fill out the missing values of fare with the median
mean_age = dataset_test.loc[:, 'Age'].mean()
cfm = confusion_matrix(Y_test, Y_pred)
print(cfm)
#[[7397 26]
#[ 142 2204]]
print("Classification Report")
print(classification_report(Y_test, Y_pred))
# FIX: do not rebind the imported accuracy_score function to its result --
# the shadowing made any later accuracy_score(...) call a TypeError.
model_accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy of the Model:", model_accuracy)
# Accuracy of the Model: 0.982802743372
#%%
# Adjusting The Threshold
Y_pred_prob = rfe.predict_proba(X_test)
print(Y_pred_prob)
# Classify as 1 whenever P(class 0) < 0.72, i.e. a lower bar for the
# positive class than the default 0.5 cut.
Y_pred_class = []
for value in Y_pred_prob[:, 0]:
    if value < 0.72:
        Y_pred_class.append(1)
    else:
        Y_pred_class.append(0)
print(Y_pred_class)
#%%
from sklearn.metrics import confusion_matrix, accuracy_score
cfm = confusion_matrix(Y_test.tolist(), Y_pred_class)
print(cfm)
# Report features ranked by importance.
for f in range(n_features):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
if(feat_roc):
    # 50/50 shuffled split for the ROC sweep.
    half = int(n_samples / 2)
    x,y = shuffle(x,y,random_state=random_state)
    # FIX: slice with [half:] -- [half:-1] silently dropped the last sample.
    X_train, X_test = x[0:half], x[half:]
    y_train, y_test = y[0:half], y[half:]
    rf_feat_sel = RandomForestClassifierWithCoef(n_estimators=n_trees)
    # One ROC curve per kept-feature count (1 .. n_features).
    for i in range(n_features):
        print(i)
        # FIX: pass n_features_to_select by keyword -- the positional form
        # was deprecated in scikit-learn 1.0 and removed in 1.2.
        rfe = RFE(rf_feat_sel, n_features_to_select=i+1)
        rfe = rfe.fit(X_train,y_train)
        probas_ = rfe.predict_proba(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, probas_[:,1])
        roc_auc = auc(fpr, tpr)
        if (i==0 or i==18):
            print ("auc: ", roc_auc)
        pl.plot(fpr, tpr, lw=1)
    pl.xlim([-0.001, 1.001])
    pl.ylim([-0.001, 1.001])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title('Receiver operating characteristic example')
    pl.show()
# separate features from labels train_y = train_data['exclusion'] train_x = train_data.drop(columns=['exclusion']) test_y = test_data['exclusion'] test_x = test_data.drop(columns=['exclusion']) start = timeit.default_timer() rf_model = RandomForestClassifier(n_jobs=-1, n_estimators=tree_count) rfe = RFE(estimator=rf_model, n_features_to_select=subset_size, step=5) rfe.fit(train_x, train_y) logger.log_message( f'Using features {train_x.columns[rfe.support_].values}') # record performance posteriors = rfe.predict_proba(test_x) roc_auc = roc_auc_score(test_y, posteriors[:, 1]) # record results time_elapsed = timeit.default_timer() - start output += f'{roc_auc},{time_elapsed}\n' logger.log_message(f'Ending run {run}') logger.log_message(f'Results {output}') with open(results_file, 'a') as outfile: outfile.write(output) counter += 1
# ROC curve fpr, tpr, t = roc_curve(y_test, y_score) plot_roc() # In[113]: #Logistic regression with RFE log_clf = LogisticRegression(C=best_parameters['C'], penalty=best_parameters['penalty'], random_state=random_state) selector = RFE(log_clf) selector = selector.fit(X_train, y_train) y_pred = selector.predict(X_test) y_score = selector.predict_proba(X_test)[:, 1] # Confusion maxtrix & metrics cm = confusion_matrix(y_test, y_pred) class_names = [0, 1] plt.figure() plot_confusion_matrix(cm, classes=class_names, title='Logistic Confusion matrix') plt.xlim(-0.5, len(np.unique(y)) - 0.5) # ADD THIS LINE plt.ylim(len(np.unique(y)) - 0.5, -0.5) # ADD THIS LINE plt.show() show_metrics()
num_round = 250 # Number of rounds of training, increasing this increases the range of output values clf = xgbw.XGBWrapper(param, num_round, verbose_eval=0) k = 500 step = 25 result_all = [] for step in [400, 200, 100, 50, 25]: selector = RFE(clf, step=step, n_features_to_select=k, verbose=2) print "Fitting Selector: k = {}, step = {}".format(k, step) start = time.time() selector = selector.fit(X_train, y_train) train_time = time.time() - start support = selector.get_support(indices=True) file_name = str(data[0]).rjust(2, "0") + str(data[1]).rjust(2, "0") + "_k" + str(k) + "_s" + str(step) addr_out = os.path.join("/home/ubuntu/Weiyi/RFE_Select", file_name) np.save(addr_out, support) start = time.time() prob = selector.predict_proba(X_test) test_time = round(time.time() - start, 2) score, recall, filter_rate, cut, net_savings = search_cut(prob) result_all.append([k, train_time, test_time, score, recall, filter_rate, cut, net_savings, step]) data = pd.DataFrame(np.array(result_all), columns=["k", "train time", "test time", "score", "recall", "filter rate", "cut", "net savings", "step"]) data.to_csv("/home/ubuntu/Weiyi/RFE_Select/RFE_0604.csv")