def main():
    """Fit and report an RFECV-wrapped Multinomial Naive Bayes classifier.

    Reads the processed data and target CSV files, turns the numeric
    ``estimate`` column into quantile bins, removes highly correlated
    features, and runs recursive feature elimination with leave-one-out
    cross-validation on a 95% / 5% train/test split. A summary of the bins,
    the feature counts and the prediction for the first held-out row is
    printed, and the RFECV selection result is passed to
    ``plot_rfecv_selection``.
    """
    from sklearn.model_selection import LeaveOneOut, ShuffleSplit, StratifiedShuffleSplit

    # Load both CSV files; drop the 'doc_key' column and zero-fill missing cells.
    data = pd.read_csv("./data/processed_data.csv", index_col=0)
    target = pd.read_csv("./data/processed_target_data.csv", index_col=0)
    data_clean = data.drop(columns=['doc_key']).fillna(0)
    target_clean = target.drop(columns=['doc_key']).fillna(0)

    # Data Extraction
    features = data_clean.drop(columns=['estimate'])
    estimates = data_clean['estimate']

    # Available label sets keyed by number of bins; the 4-bin set is the one in use.
    label_sets = {
        2: ['S', 'L'],
        3: ['S', 'M', 'L'],
        4: ['S', 'M', 'L', 'XL'],
        5: ['XS', 'S', 'M', 'L', 'XL'],
    }
    bin_labels = label_sets[4]
    n_bins = len(bin_labels)

    # Labelled bins are the classification target; the unlabelled cut is kept
    # only so the numeric interval of each bin can be reported.
    size_labels = pd.qcut(estimates, q=n_bins, labels=bin_labels)
    interval_bins = pd.qcut(estimates, q=n_bins)

    features = drop_highly_correlated_features(features)

    X_train, X_test, y_train, y_test = train_test_split(
        features, size_labels, test_size=0.05)

    # Recursive feature elimination, one feature per step, scored by
    # leave-one-out accuracy, never going below 10 features.
    selector = RFECV(
        estimator=MultinomialNB(),
        min_features_to_select=10,
        step=1,
        cv=LeaveOneOut(),
        scoring='accuracy',
    )
    selector = selector.fit(X_train, y_train)
    y_pred = selector.predict(X_test)

    print("=== Results with Multinomial Naive Bayes ===")
    print('Bins: ', bin_labels)
    print('Bin Ranges: ', interval_bins.dtype.categories.to_tuples().to_numpy())
    print('Counts (training set):')
    print(y_train.value_counts())

    # data_clean still contains the 'estimate' column, hence the "- 1".
    print('\nTotal number of features (aka Words) in data: %d'
          % (data_clean.shape[1] - 1))
    print('Total number of uncorrelated features (aka Words) in data: %d'
          % features.shape[1])
    print("Number of features (aka Words) in optimal model: %d"
          % selector.n_features_)

    # Report the first held-out row, looked up in the original frame so that
    # its 'doc_key' is still available.
    first_test_index = X_test.index[0]
    print("\nTest doc: %s" % data.loc[first_test_index]['doc_key'])
    print("Expected vs Predicted Size: {} vs {}".format(
        y_test.iloc[0], y_pred[0]))
    print("Class Probabilities: ", selector.classes_,
          selector.predict_proba(X_test))

    plot_rfecv_selection(selector)
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold, train_test_split
from xgboost import XGBClassifier

# Script: select features with RFECV around a random forest, then plot the
# ROC curve of the selected model on a held-out half of the data.

# Ignore NumPy divide-by-zero and invalid-value floating-point errors.
np.seterr(divide='ignore', invalid='ignore')

df = pd.read_excel(r'Data/Null check.xlsx')

# Estimator used inside the recursive elimination. Alternatives left for reference:
#clf_feature_selection = XGBClassifier(colsample_bytree= 0.1, gamma= 0.1, learning_rate= 0.01, max_depth= 20, min_child_weight= 1, n_estimators= 20)
#clf_feature_selection = LogisticRegression()
#clf_feature_selection = XGBClassifier()
clf_feature_selection = RandomForestClassifier(
    bootstrap=False,
    criterion='entropy',
    max_depth=20,
    # 'sqrt' is what 'auto' meant for classifiers; newer scikit-learn
    # releases reject 'auto', so spell the behaviour out explicitly.
    max_features='sqrt',
    min_samples_leaf=5,
    min_samples_split=5,
    n_estimators=300,
)

# One feature removed per step, scored by ROC AUC over 2 stratified folds.
rfecv = RFECV(estimator=clf_feature_selection,
              step=1,
              cv=StratifiedKFold(2),
              scoring='roc_auc')

X = df.drop(columns='label')
y = df['label']

# Fixed seed so the 50/50 split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

rfecv.fit(X_train, y_train)
#rfecv.fit(X,y)

print("Best Features:", X_train.columns[rfecv.support_])
print("Optimal number of features : %d" % rfecv.n_features_)

# Class probabilities on the held-out half, fed to the ROC plot.
predicted_probas = rfecv.predict_proba(X_test)
y_true = y_test
y_probas = predicted_probas

# NOTE(review): scikit-plot documents plot_roc_curve as deprecated in favour
# of plot_roc — confirm against the installed version before switching.
skplt.metrics.plot_roc_curve(y_true, y_probas)
plt.show()
1, 10, 100, 500, 800, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 10000 ] K_scores = [] log_Loss = [] for K in K_range: # log_reg = LogisticRegression(solver='lbfgs', multi_class='multinomial' clf = linear_model.LogisticRegression(solver='lbfgs', multi_class='multinomial', C=K) rfecv = RFECV(clf, step=10, scoring='accuracy') rfecv.fit(X_trainf, y_train) train_predictions = rfecv.predict(X_valf) acc = accuracy_score(y_test, train_predictions) K_scores.append(acc) train_predictions = rfecv.predict_proba(X_valf) ll = log_loss(y_test, train_predictions) log_Loss.append(ll) plt.figure(1) plt.subplot(211) plt.plot(K_range, K_scores) plt.ylim([0.97, 1]) #plt.xlabel('Value of C') plt.ylabel('Accuracy') plt.subplot(212) plt.plot(K_range, log_Loss) plt.xlabel('Value of C') plt.ylabel('log_loss') plt.show()
# Mean cross-validation score for each feature-subset size that RFECV tried.
# Newer scikit-learn exposes this as cv_results_['mean_test_score'];
# grid_scores_ is the legacy attribute and is only used when cv_results_
# is not available.
if hasattr(rfecv, 'cv_results_'):
    cv_scores = np.asarray(rfecv.cv_results_['mean_test_score'])
else:
    cv_scores = np.asarray(rfecv.grid_scores_)

plt.figure(figsize=(12, 9))
plt.xlabel('Number of features tested x 2')
plt.ylabel('Cross-validation score (AUC)')
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.savefig('Porto-RFECV-01.png', dpi=150)
plt.show()

# Save sorted feature rankings.

# In[ ]:

ranking = pd.DataFrame({'Features': all_features})
ranking['Rank'] = np.asarray(rfecv.ranking_)
ranking.sort_values('Rank', inplace=True)
ranking.to_csv('Porto-RFECV-ranking-01.csv', index=False)

# Make a prediction. This is only a proof-of-principle as the prediction will
# likely be poor until more optimal parameters are used above.

# In[ ]:

# Best cross-validation score rescaled as 2 * score - 1 and rounded to five
# decimals; it is embedded in the submission file name.
score = round((np.max(cv_scores) * 2 - 1), 5)

# Probability of the second class becomes the submitted target.
test['target'] = rfecv.predict_proba(X_test)[:, 1]
test = test[['id', 'target']]

now = datetime.now()
sub_file = 'submission_5fold-RFECV-RandomForest-01_' + str(score) + '_' + str(
    now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
print("\n Writing submission file: %s" % sub_file)
test.to_csv(sub_file, index=False)
timer(starttime)
# Configure the classifier
xgb = XGBClassifier(n_estimators=300,
                    max_depth=5,
                    nthread=20,
                    scale_pos_weight=4,
                    learning_rate=0.07)

# Feature selection: drop 10 features per step, scored by ROC AUC over
# 3 stratified folds
rfecv = RFECV(estimator=xgb,
              step=10,
              cv=StratifiedKFold(3),
              n_jobs=20,
              scoring='roc_auc')
rfecv.fit(train_x, train_y)

# Second-column probabilities feed the ROC curve; hard labels feed the F1 score
pre_y = rfecv.predict_proba(test_x)[:, 1]
pre_y_categ = rfecv.predict(test_x)

# Compute AUC and F1 on the held-out data
fpr, tpr, thresholds = metrics.roc_curve(test_y, pre_y)
auc = metrics.auc(fpr, tpr)
f1 = metrics.f1_score(test_y, pre_y_categ)

print("AUC得分为:")
print(auc)
print('f1-score为:')
print(f1)

# The value printed here is the per-feature ranking array, so label it as
# such (it was previously announced as the optimal number of features).
print('ranking_')
print(rfecv.ranking_)
print('n_features_')
print(rfecv.n_features_)
print('support_')
print(rfecv.support_)
learning_rate=0.02, nthread=4, subsample=0.95, colsample_bytree=0.85, seed=4242) # bst = xgb.train(param, # dtrain, # num_round, # evallist, # early_stopping_rounds=10) # If error doesn't decrease in n rounds, stop early selector = RFECV(clf, step=1, cv=5) selector = selector.fit(X_train, y_train) print 'Selector fit...' # clf.dump_model('/home/rmendoza/Desktop/xgb_june_04_to_05_v2.txt') # bst.save_model('/home/rmendoza/Desktop/xgbtemp.model') y_pred = selector.predict_proba(test_data) cut = 0.1 results = [0, 0, 0, 0, 0, 0, 0] for cutoff in range(10, 15): cut = cutoff/float(100) # Cutoff in decimal form y = y_pred > cut # If y values are greater than the cutoff recall = metrics.recall_score(test_label, y) # true_negative_rate = sum(np.logical_not(np.logical_or(test_label, y)))/float(len(y_pred)) filter_rate = sum(np.logical_not(y))/float(len(y_pred)) if recall*6.7+filter_rate > results[0]: timer = time.time() - start_time results = evalModel(test_label, y_pred, start_time, cut) print results
test[t, nfeatures + 2] = np.std(feature_array) test[t, nfeatures + 3] = np.max(feature_array) test[t, nfeatures + 4] = np.min(feature_array) sort_list = sorted(range(len(feature_array)), key=lambda x: feature_array[x], reverse=True) test[t, nfeatures + 5 : nfeatures + 5 + ntwobyte] = sorted( range(len(feature_array)), key=lambda x: sort_list[x] ) # test_fourbyte[t] = map(float, row[nfeatures+1:nfeatures+nfourbyte+1]) if six.PY2 else list(map(float, row[nfeatures+1:nfeatures+nfourbyte+1])) Ids.append(row[0]) if (t + 1) % 1000 == 0: print(t + 1, "records loaded") print("test set loaded") # Predict for whole test set # test[:,nfeatures+1:] = pca.transform(test_fourbyte) # del test_fourbyte y_pred = rfecv.predict_proba(test) # y_pred = gscv2.predict_proba(test) # y_pred = clf2.predict_proba(test) # Writing results to file with gzip.open(fsubmission, write_mode) as f: fw = writer(f) # Header preparation header = ["Id"] + ["Prediction" + str(i) for i in range(1, 10)] fw.writerow(header) for t, (Id, pred) in enumerate(zp(Ids, y_pred.tolist())): fw.writerow([Id] + pred) if (t + 1) % 1000 == 0: print(t + 1, "prediction written")
# Load the training table; the file is GB2312-encoded
data_train = pd.read_csv('data_analysis/data_train.csv', encoding='gb2312')
targets = data_train['TARGET']
train_data = data_train.drop(labels=['EID', 'TARGET'], axis=1)

# Split the samples 50/50 with a fixed seed
train_x, test_x, train_y, test_y = train_test_split(
    train_data, targets, test_size=0.5, random_state=66)

# Configure the classifier
xgb = XGBClassifier(n_estimators=300,
                    max_depth=5,
                    nthread=20,
                    scale_pos_weight=4,
                    learning_rate=0.07)

# Feature selection: drop 10 features per step, scored by ROC AUC over
# 3 stratified folds
rfecv = RFECV(estimator=xgb,
              step=10,
              cv=StratifiedKFold(3),
              n_jobs=20,
              scoring='roc_auc')
rfecv.fit(train_x, train_y)

# Second-column probabilities feed the ROC curve; hard labels feed the F1 score
pre_y = rfecv.predict_proba(test_x)[:, 1]
pre_y_categ = rfecv.predict(test_x)

# Compute AUC and F1 on the held-out half
fpr, tpr, thresholds = metrics.roc_curve(test_y, pre_y)
auc = metrics.auc(fpr, tpr)
f1 = metrics.f1_score(test_y, pre_y_categ)

print("AUC得分为:")
print(auc)
print('f1-score为:')
print(f1)

# The value printed here is the per-feature ranking array, so label it as
# such (it was previously announced as the optimal number of features).
print('ranking_')
print(rfecv.ranking_)
print('n_features_')
print(rfecv.n_features_)
print('support_')
print(rfecv.support_)
test[t, nfeatures + 3] = np.max(feature_array) test[t, nfeatures + 4] = np.min(feature_array) sort_list = sorted(range(len(feature_array)), key=lambda x: feature_array[x], reverse=True) test[t, nfeatures + 5:nfeatures + 5 + ntwobyte] = sorted( range(len(feature_array)), key=lambda x: sort_list[x]) # test_fourbyte[t] = map(float, row[nfeatures+1:nfeatures+nfourbyte+1]) if six.PY2 else list(map(float, row[nfeatures+1:nfeatures+nfourbyte+1])) Ids.append(row[0]) if (t + 1) % 1000 == 0: print(t + 1, 'records loaded') print('test set loaded') # Predict for whole test set #test[:,nfeatures+1:] = pca.transform(test_fourbyte) #del test_fourbyte y_pred = rfecv.predict_proba(test) #y_pred = gscv2.predict_proba(test) #y_pred = clf2.predict_proba(test) # Writing results to file with gzip.open(fsubmission, write_mode) as f: fw = writer(f) # Header preparation header = ['Id'] + ['Prediction' + str(i) for i in range(1, 10)] fw.writerow(header) for t, (Id, pred) in enumerate(zp(Ids, y_pred.tolist())): fw.writerow([Id] + pred) if (t + 1) % 1000 == 0: print(t + 1, 'prediction written')