def get_stacked_model(X, y, is_processing=True):
    """Build and fit a single-layer SuperLearner around MyClassifier.

    Parameters
    ----------
    X, y : training features and labels.
    is_processing : when True, standard-scale inputs before the base layer.

    Returns the fitted ensemble.
    """
    ensemble = SuperLearner(scorer=accuracy_score, random_state=seed)
    if is_processing:
        steps = [StandardScaler()]
    else:
        steps = []
    ensemble.add([MyClassifier(5.0)], preprocessing=steps)
    ensemble.add_meta(MyClassifier(0.5))
    ensemble.fit(X, y)
    return ensemble
def get_stacked_model(X, y):
    """Fit a two-learner stacking ensemble scored with F1 during CV.

    Base layer: random forest + SVC; meta learner: logistic regression.
    Prints the cross-validated f1 summary table and returns the fitted
    ensemble.
    """
    ensemble = SuperLearner(scorer=f1, random_state=seed)
    base_layer = [RandomForestClassifier(random_state=seed), SVC()]
    ensemble.add(base_layer)
    ensemble.add_meta(LogisticRegression())
    ensemble.fit(X, y)
    print('f1-score in training')
    print('-m: mean. -s: std')
    print(pd.DataFrame(ensemble.data))
    return ensemble
def simple_statistic(comb):
    """Cross-validated SuperLearner evaluation for one feature combination.

    For every fold of the module-level splitter ``sfolder``, the training
    part is SMOTE-oversampled, the columns whose statistic prefix appears in
    ``comb`` are selected, and a stacked ensemble is fitted and scored on
    the held-out part.

    Returns one row per fold:
    [comb, auc, auprc, acc, precision, recall, f1, fpr, tpr, thresholds].
    """
    results = []
    for train, test in tqdm(list(sfolder.split(data_x, data_y))):
        # Columns that are always kept, regardless of the combination.
        cofff = ['age_interval', 'admission_type_EMERGENCY',
                 'admission_type_ELECTIVE', 'admission_type_URGENT',
                 'aids', 'hem', 'mets']
        # Candidate statistics: 'min','max','minmax','mean','std','stdmean',
        # 'median','qua25','qua75','qua2575','mode','skew','kurt','first'
        X_train, X_test = data_x.iloc[train, :], data_x.iloc[test, :]
        Y_train, Y_test = data_y[train], data_y[test]
        x_train, x_val, y_train, y_val = train_test_split(
            X_train, Y_train, test_size=0.25, random_state=42)

        # Oversample the positive class of the inner training split with
        # SMOTE, then re-attach the untouched validation split so the final
        # training set mixes resampled and original rows.
        smo = SMOTE(random_state=42, ratio={1: 2000})
        x_train_s, y_train_s = smo.fit_sample(x_train, y_train)
        x_train_s = pd.DataFrame(x_train_s, columns=x_val.columns)
        X_train_s = pd.concat([x_train_s, x_val], axis=0)
        Y_train_s = list(y_train_s)
        Y_train_s.extend(list(y_val))
        Y_train_s = np.array(Y_train_s)

        # Select every column whose statistic prefix occurs in the combination.
        best_combination_nowfold = comb
        for sts in best_combination_nowfold:
            for column in x_train.columns:
                if sts == column.split('_')[0]:
                    cofff.append(column)
        x_train_train = X_train_s[cofff]
        y_train_train = Y_train_s
        x_test = X_test[cofff]
        y_test = Y_test

        # Stacked ensemble: nine probabilistic base learners, LR meta learner.
        ensemble = SuperLearner(scorer=roc_auc_score, random_state=42,
                                folds=10, backend="multiprocessing")
        ensemble.add([GaussianNB(),
                      SVC(C=100, probability=True),
                      neighbors.KNeighborsClassifier(n_neighbors=3),
                      LogisticRegression(),
                      MLPClassifier(),
                      GradientBoostingClassifier(n_estimators=100),
                      RandomForestClassifier(random_state=42, n_estimators=100),
                      BaggingClassifier(),
                      tree.DecisionTreeClassifier()], proba=True)
        ensemble.add_meta(LogisticRegression(), proba=True)
        print('now is here -4\n')
        ensemble.fit(x_train_train, y_train_train)
        print('now is here -5\n')
        preds_prob = ensemble.predict_proba(x_test)
        print('now is here -6\n')

        # Hard labels from the positive-class probability, threshold 0.5.
        prob = preds_prob[:, 1]
        preds = [1 if i >= 0.5 else 0 for i in prob]

        auc_sl = roc_auc_score(y_test, preds_prob[:, 1])
        auprc_sl = average_precision_score(y_test, preds_prob[:, 1])
        recall_sl = recall_score(y_test, preds)
        acc_sl = accuracy_score(y_test, preds)
        p_sl = precision_score(y_test, preds)
        f1_sl = f1_score(y_test, preds)
        fpr_sl, tpr_sl, thr_sl = roc_curve(y_test, prob)
        print('now is here -7')
        results.append([best_combination_nowfold, auc_sl, auprc_sl, acc_sl,
                        p_sl, recall_sl, f1_sl, fpr_sl, tpr_sl, thr_sl])
    return results
def stacking_training(X, y, X_pred, layer_list, meta_learner):
    """Train a multi-layer stacking ensemble and predict probabilities.

    Each element of ``layer_list`` becomes one stacked layer of base
    learners; ``meta_learner`` sits on top.  Returns the probability
    predictions for ``X_pred`` together with the fitted ensemble.
    """
    model = SuperLearner(folds=5, backend='multiprocessing',
                         model_selection=False)
    for layer in layer_list:
        model.add(layer, proba=True)
        print('基学习器添加成功')
    model.add_meta(meta_learner, proba=True)
    print('元学习器添加成功')
    print('拟合中')
    model.fit(X, y)
    pred_proba = model.predict_proba(X_pred)
    return pred_proba, model
def get_stacked_model(X, y):
    """Fit an SVC + random-forest stack with an LR meta learner.

    The base layer emits class probabilities (``proba=True``), so the meta
    learner trains on predict_proba output rather than hard labels.
    Prints the in-training accuracy summary and returns the fitted model.
    """
    ensemble = SuperLearner(scorer=accuracy, random_state=seed)
    base_layer = [SVC(probability=True),
                  RandomForestClassifier(random_state=seed)]
    ensemble.add(base_layer, proba=True)
    ensemble.add_meta(LogisticRegression())
    ensemble.fit(X, y)
    print('accuracy score in training')
    print('-m: mean. -s: std')
    print(pd.DataFrame(ensemble.data))
    return ensemble
def esemble(data, data2, data5, during):
    """Fit a linear-regression stack on data2 and attach predictions to data.

    ``during`` selects which forward-momentum column ('prmom<during>_f') is
    the training target.  ``data5`` supplies the rows to score; predictions
    are written into data['pred_essemble'] and ``data`` is returned.
    """
    ensemble = SuperLearner(scorer=accuracy_score, random_state=45, verbose=2)
    ensemble.add(linear_model.LinearRegression())
    ensemble.add_meta([GaussianProcessRegressor()])

    # Training target and features from data2; NaNs filled with zero.
    y = data2['prmom' + during + '_f']
    x = data2.drop(['prmom1d_f', 'prmom1w_f', 'prmom2w_f', 'prmom3w_f',
                    'uniqcode', 'date'], axis=1)
    x = x.fillna(0)
    y = np.array(y)
    x = np.array(x)
    ensemble.fit(x, y)

    # Score data5 with the same feature layout (its 'pred' column dropped too).
    X = data5.drop(['prmom1d_f', 'prmom1w_f', 'prmom2w_f', 'prmom3w_f',
                    'uniqcode', 'date', 'pred'], axis=1)
    X = X.fillna(0)
    X = np.array(X)
    preds = ensemble.predict(X)
    data['pred_essemble'] = preds
    return data
def use_pack():
    """Fit the module-level SuperLearner pipeline and print its test ROC-AUC.

    Relies on module globals ``base_learners``, ``meta_learner`` and the
    xtrain/ytrain/xtest/ytest splits defined elsewhere in this file.
    """
    sl = SuperLearner(
        folds=10,
        random_state=SEED,
        verbose=2,
        # backend="multiprocessing"
    )
    # Base layer emits probabilities; meta learner consumes them.
    sl.add(list(base_learners.values()), proba=True)
    sl.add_meta(meta_learner, proba=True)
    # Train the ensemble, then score the held-out set.
    sl.fit(xtrain, ytrain)
    p_sl = sl.predict_proba(xtest)
    print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest, p_sl[:, 1]))
def perform_ensemble_adaboost(X_train, y_train, X_test, y_test):
    """Fit a linear-SVC base layer with an AdaBoost meta learner and report
    10-fold CV accuracy on the test set.

    Fixes in this revision:
    * the prediction timer that was started but never read is now reported,
    * the unused ``all_objects`` label list was removed.
    """
    import time

    ensemble = SuperLearner(folds=10, random_state=seed, verbose=2,
                            backend="multiprocessing", scorer=accuracy_score)
    # Single base layer (~95.50 accuracy in the original experiments).
    layer_1 = [SVC(kernel='linear', C=8)]
    ensemble.add(layer_1)
    # Meta learner: AdaBoost over shallow decision trees.
    ensemble.add_meta(
        AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=8,
                                   min_samples_split=5,
                                   min_samples_leaf=8)))
    ensemble.fit(X_train, y_train)

    # Time the prediction pass (the original started this timer but never
    # printed the elapsed time).
    start = time.time()
    yhat = ensemble.predict(X_test)
    print("Prediction time: {:.2f} seconds".format(time.time() - start))

    accuracies = cross_val_score(ensemble, X_test, y_test, cv=10,
                                 scoring="accuracy")
    print("Accuracy of Adaboost: {:.2f} %".format(accuracies.mean() * 100))
    print("Standard Deviation of Adaboost: {:.2f} %".format(accuracies.std() * 100))
def test_equivalence_super_learner():
    """[SequentialEnsemble] Test ensemble equivalence with SuperLearner."""
    ens = SuperLearner()
    seq = SequentialEnsemble()

    # Same estimator map, same dtype, stacked the two ways.
    ens.add(ECM, dtype=np.float64)
    seq.add('stack', ECM, dtype=np.float64)

    expected = ens.fit(X, y).predict(X)
    actual = seq.fit(X, y).predict(X)
    np.testing.assert_array_equal(actual, expected)
def test_subset_equiv():
    """[Subsemble] Test equivalence with SuperLearner for J=1."""
    sub = Subsemble(partitions=1)
    sl = SuperLearner()

    # A single-partition Subsemble must reduce to a plain SuperLearner.
    sub.add(ECM, dtype=np.float64)
    sl.add(ECM, dtype=np.float64)

    expected = sub.fit(X, y).predict(X)
    actual = sl.fit(X, y).predict(X)
    np.testing.assert_array_equal(actual, expected)
def train_model(ensemble, X, y):
    """Build, fit and return an outlier-detection stacking ensemble.

    The ``ensemble`` argument is kept for interface compatibility but, as in
    the original code, it is replaced by a freshly built SuperLearner.

    Fix in this revision: the fitted ensemble is now returned — the original
    fitted it and returned None, so the caller could never reach the model.
    """
    seed = 2017
    np.random.seed(seed)
    # Passing a scoring function creates cv scores during fitting; the
    # scorer must be a plain function taking two vectors, returning a scalar.
    ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)
    # First layer: unsupervised outlier detectors.
    ensemble.add([IsolationForest(), LOF(novelty=True)])
    # Final meta estimator: one-class SVM.
    ensemble.add_meta(OCSVM())
    ensemble.fit(X, y)
    return ensemble
def add_superlearner(name, models, X_train, Y_train, X_test, Y_test):
    """Stack ``models`` under an SVC meta learner and time the run.

    Returns a dict with the ensemble name, meta-classifier label, test
    accuracy and total fit+predict runtime in seconds.
    """
    ensemble = SuperLearner(scorer=accuracy_score, random_state=seed)
    ensemble.add(models)
    # Attach the final meta estimator.
    ensemble.add_meta(SVC())

    start = time.time()
    ensemble.fit(X_train, Y_train)
    preds = ensemble.predict(X_test)
    acc_score = accuracy_score(preds, Y_test)
    elapsed = time.time() - start

    return {
        "Ensemble": name,
        "Meta_Classifier": "SVC",
        "Accuracy_Score": acc_score,
        "Runtime": elapsed,
    }
from mlens.ensemble import SuperLearner # Instantiate the ensemble with 10 folds sl = SuperLearner( folds=10, random_state=SEED, verbose=2, backend="multiprocessing" ) # Add the base learners and the meta learner sl.add(list(base_learners.values()), proba=True) sl.add_meta(meta_learner, proba=True) # Train the ensemble sl.fit(X_train_sc, y_train_sc) # Predict the test set p_sl = sl.predict_proba(X_test_sc) # print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(y_test_sc, p_sl[:, 1])) # In[119]: pp = [] for p in p_sl[:, 1]: if p>0.5: pp.append(1.) else:
# Initial layer, propagate as before ensemble.add(estimators, propagate_features=[0, 1]) # Intermediate layer, keep propagating, but add a preprocessing # pipeline that selects a subset of the input ensemble.add(estimators, preprocessing=[Subset([2, 3])], propagate_features=[0, 1]) ############################################################################## # In the above example, the two first features of the original input data # will be propagated through both layers, but the second layer will not be # trained on it. Instead, it will only see the predictions made by the base # learners in the first layer. ensemble.fit(X, y) n = list(ensemble.layer_2.learners[0].learner )[0].estimator.feature_importances_.shape[0] m = ensemble.predict(X).shape[1] print("Num features seen by estimators in intermediate layer: %i" % n) print("Num features in the output array of the intermediate layer: %i" % m) ############################################################################## # .. _proba-tutorial: # # Probabilistic ensemble learning # ------------------------------- # # When the target to predict is a class label, it can often be beneficial to # let higher-order layers or the meta learner learn from *class probabilities*, # as opposed to the predicted class. Scikit-learn classifiers can return a
# Reproducible shuffle of the iris data.
seed = 2017
np.random.seed(seed)

data = load_iris()
idx = np.random.permutation(150)
X = data.data[idx]
y = data.target[idx]

# Building an ensemble
from mlens.ensemble import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# --- Multi-layer ensemble: two stacked layers plus a meta estimator ---
ensemble = SuperLearner(scorer=accuracy_score, random_state=seed)
ensemble.add([RandomForestClassifier(random_state=seed), LogisticRegression()])
ensemble.add([LogisticRegression(), SVC()])
ensemble.add_meta(SVC())

# Fit on the first half of the shuffled data, predict the second half.
ensemble.fit(X[:75], y[:75])
preds = ensemble.predict(X[75:])
print("Fit data:\n%r" % ensemble.data)
# Passing a scoring function will create cv scores during fitting # the scorer should be a simple function accepting to vectors and returning a scalar ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2) # Build the first layer ensemble.add([RandomForestClassifier(random_state=seed), SVC()]) # Attach the final meta estimator ensemble.add_meta(LogisticRegression()) ## Use the model for training and testing # start counting time for training time_train_start = time.clock() # Fit ensemble ensemble.fit(training_data, training_labels) # print training time time_train_end = time.clock() print("Training finished, training time: %g seconds \n" % (time_train_end - time_train_start)) # start counting time for testing time_test_start = time.clock() # Predict preds = ensemble.predict(test_data) # print testing time time_test_end = time.clock() print("Testing finished, testing time: %g seconds \n" %
def main():
    """End-to-end aviation-injury severity experiment.

    Loads the cleaned dataset, balances the two classes by downsampling,
    one-hot encodes the predictors, trains four base classifiers (MLP,
    XGBoost, LightGBM, random forest) and a SuperLearner stacking them,
    then prints validation reports and ROC curves for the test split.

    Fix in this revision: ``rf_proba`` was computed from ``clf_lgb`` (a
    copy-paste error); it now uses the random-forest model ``clf_rf``.
    """
    # Open and read in the cleaned data.
    with open('AviationData_cleaned_V3.csv', 'r') as input_all:
        df_raw = pd.read_csv(input_all, encoding='utf-8')

    # Final check on NA values.
    print('Check number of NA values from selected columns:\n',
          df_raw.isnull().sum())

    # Drop rows containing NA values and reset index.
    df_raw.dropna(axis=0, inplace=True)
    df_raw.reset_index(drop=True, inplace=True)

    # Prepare response label: treat 'Incident' as 'Non-Fatal'.
    df_raw['Injury Severity'] = df_raw['Injury Severity'].replace('Incident', 'Non-Fatal')

    # Separate the two classes and balance by downsampling the majority.
    df_none = df_raw.loc[df_raw['Injury Severity'] == 'Non-Fatal']
    df_fatl = df_raw.loc[df_raw['Injury Severity'] == 'Fatal']
    n_fatl = len(df_fatl)
    df_none = df_none.sample(n=n_fatl, replace=False, random_state=117)

    # Re-construct the balanced dataset.
    df_sampled = pd.concat([df_none, df_fatl], ignore_index=True)
    df_sampled.reset_index(drop=True, inplace=True)

    # Separate predictors and response.
    df_X = df_sampled.drop(['Injury Severity', 'Airport Code'], axis=1)
    df_y = df_sampled.loc[:, 'Injury Severity']

    # Convert string response to numerical response for convenience.
    df_y.replace('Non-Fatal', '0', inplace=True)
    df_y.replace('Fatal', '1', inplace=True)

    # One-hot encode the predictors.
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(df_X)
    df_X = pd.DataFrame(enc.transform(df_X).toarray(),
                        columns=enc.get_feature_names(list(df_X.columns)))

    # Separate train and test datasets.
    X_train, X_test, y_train, y_test = train_test_split(
        df_X, df_y, test_size=0.5, random_state=1378)

    # Reduce dataset dimension (disabled).
    # X_train, X_test = dimension_reduction(X_train, y_train, X_test, 80, method='PCA')

    # Define the four base classifiers.
    clf_mlp = MLPClassifier(hidden_layer_sizes=(100), activation='relu',
                            solver='adam', alpha=0.0001, batch_size='auto',
                            learning_rate='constant', learning_rate_init=0.001,
                            power_t=0.5, max_iter=200, shuffle=True,
                            random_state=117, tol=0.0001, verbose=False,
                            warm_start=False, momentum=0.9,
                            nesterovs_momentum=True, early_stopping=False,
                            validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                            epsilon=1e-08, n_iter_no_change=10)
    clf_xgb = xgb.XGBClassifier(booster='gbtree', objective='binary:logistic',
                                eval_metric='logloss', tree_method='auto',
                                max_depth=6, min_child_weight=1, gamma=0,
                                subsample=1, colsample_bytree=1, reg_alpha=0,
                                reg_lambda=1, learning_rate=0.1, seed=27)
    clf_lgb = lgb.LGBMClassifier(objective='binary', boosting='gbdt',
                                 metric='binary_logloss', num_leaves=15,
                                 min_data_in_leaf=10, max_depth=5,
                                 bagging_fraction=0.85, bagging_freq=11,
                                 feature_fraction=0.5, lambda_l1=0.01,
                                 lambda_l2=0.3, num_iterations=100,
                                 learning_rate=0.08, random_state=117)
    clf_rf = RandomForestClassifier(n_estimators=300, criterion='gini',
                                    max_depth=None, min_samples_split=2,
                                    min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0,
                                    max_features='auto', random_state=117)

    # Fit base learners on the whole train dataset.
    clf_mlp.fit(X_train, y_train)
    clf_xgb.fit(X_train, y_train)
    clf_lgb.fit(X_train, y_train)
    clf_rf.fit(X_train, y_train)

    # Positive-class probabilities from each base learner.
    mlp_proba = clf_mlp.predict_proba(X_test)[:, 1]
    xgb_proba = clf_xgb.predict_proba(X_test)[:, 1]
    lgb_proba = clf_lgb.predict_proba(X_test)[:, 1]
    # BUG FIX: was clf_lgb.predict_proba(...) — use the random forest.
    rf_proba = clf_rf.predict_proba(X_test)[:, 1]

    # Initialize hard predictions.
    pred_mlp = pd.Series(np.full(len(y_test), 0))
    pred_xgb = pd.Series(np.full(len(y_test), 0))
    pred_lgb = pd.Series(np.full(len(y_test), 0))
    pred_rf = pd.Series(np.full(len(y_test), 0))

    # Per-model decision thresholds.
    thres_mlp = 0.5
    thres_xgb = 0.5
    thres_lgb = 0.5
    thres_rf = 0.5

    # Make final predictions.
    pred_mlp[mlp_proba >= thres_mlp] = 1
    pred_xgb[xgb_proba >= thres_xgb] = 1
    pred_lgb[lgb_proba >= thres_lgb] = 1
    pred_rf[rf_proba >= thres_rf] = 1

    # Map test data response into integers.
    y_test = list(map(int, y_test))

    # Base-learner validation reports.
    print('\n\nMLP:')
    print_validate(y_test, pred_mlp)
    print('\n\nXGB:')
    print_validate(y_test, pred_xgb)
    print('\n\nLGB:')
    print_validate(y_test, pred_lgb)
    print('\n\nRF:')
    print_validate(y_test, pred_rf)

    # Super learner stacking the four base models.
    base_learners = {'mlp': clf_mlp, 'xgb': clf_xgb,
                     'lgb': clf_lgb, 'rf': clf_rf}
    sup_learner = SuperLearner(random_state=117)
    sup_learner.add(list(base_learners.values()), proba=True)
    sup_learner.add_meta(linear_model.BayesianRidge(alpha_1=1e-3))
    sup_learner.fit(X_train, y_train)

    # Super-learner predictions.
    # NOTE(review): sl_proba is used as-is in the comparison and ROC plot
    # below, matching the original behaviour — confirm whether
    # sl_proba[:, 1] (the positive-class column) was intended.
    sl_proba = sup_learner.predict_proba(X_test)
    pred_sl = pd.Series(np.full(len(y_test), 0))
    thres_sl = 0.5
    pred_sl[sl_proba >= thres_sl] = 1
    print('\n\nSL:')
    print_validate(y_test, pred_sl)

    # ROC curves for the test dataset.
    plt.figure(figsize=(8, 7))
    draw_roc(y_test, sl_proba, 'Super Learner', 'tab:cyan', '-')
    draw_roc(y_test, mlp_proba, 'MLP NN', 'royalblue', '-')
    draw_roc(y_test, xgb_proba, 'XGBoost', 'lightcoral', '--')
    draw_roc(y_test, lgb_proba, 'LightGBM', 'seagreen', '-.')
    draw_roc(y_test, rf_proba, 'Random Forest', 'darkorange', '-')
    plt.plot([0, 1], [0, 1], 'k--', lw=4)
    plt.xlim([-0.02, 1.0])
    plt.ylim([0.0, 1.02])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for Test Result')
    plt.legend(loc="lower right", fontsize=14, handlelength=4)
    plt.show()
# Manual stacking through the helper module E, then the equivalent mlens
# SuperLearner for comparison.  (The step-by-step variant that trained the
# base learners and the meta learner separately is superseded by E.stacking.)
base_learners, meta_learner1 = E.stacking(models, clone(meta_learner),
                                          xtrain_base, ytrain_base, KFold(2))
P_pred, p = E.ensemble_predict(base_learners, meta_learner1, X_test)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(y_test, p))

# Same stack through mlens: 10 folds, parallelised across processes.
sl = SuperLearner(folds=10, random_state=SEED, verbose=2,
                  backend="multiprocessing")
sl.add(list(models.values()), proba=True)
sl.add_meta(meta_learner, proba=True)
sl.fit(X_train, y_train)
p_sl = sl.predict_proba(X_test)
print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(y_test, p_sl[:, 1]))
    # NOTE(review): this chunk starts mid-function — the dict literal closed
    # below belongs to a get_models()-style factory defined before this view.
    }
    return models


# Gradient-boosting meta learner for the stacked ensemble.
meta_learner = GradientBoostingClassifier(
    n_estimators = 200,
    loss = 'exponential',
    max_features = 4,
    max_depth = 3,
    subsample = 0.5,
    random_state = SEED,
)

# 10-fold SuperLearner over the probabilistic base models.
s2 = SuperLearner(
    folds = 10,
    random_state = SEED,
    verbose = 2
)
base_learners2 = get_models()
s2.add(list(base_learners2.values()), proba=True)#!!
s2.add_meta(meta_learner, proba=True)
s2.fit(xstd, ytrain.values)

# Positive-class probabilities for the validation set.
p_mlens2 = s2.predict_proba(xvstd)[:, 1]
# NOTE(review): p_mlens2 is already 1-D after [:, 1] above, so the [:,1]
# indexing on the next two lines looks like it would fail — confirm.
roc_auc_score(yvalid, p_mlens2[:,1])
result12 = pd.DataFrame(p_mlens2[:,1], index=test.PERSONID)
result12.to_csv('result12_637f.csv', sep='\t', header=False)
from mlens.ensemble import SuperLearner

# Hold out 30% of the training frame, stratified on survival.
val_train, val_test = train_test_split(train, test_size=0.3,
                                       random_state=SEED,
                                       stratify=train['Survived'])
val_Xtrain = val_train[val_train.columns[1:]]
val_ytrain = val_train[val_train.columns[:1]]
# NOTE(review): `val` is not defined in this chunk — these two lines almost
# certainly intend `val_test[...]`; confirm against the full file.
val_Xtest = val[val_test.columns[1:]]
val_ytest = val[val_test.columns[:1]]

# Instantiate the ensemble with 10 folds
super_learner = SuperLearner(folds=10, random_state=SEED, verbose=2,
                             backend='multiprocessing')

# Add the base learners and the meta learner
super_learner.add(list(base_learners().values()), proba=True)
super_learner.add_meta(LogisticRegression(), proba=True)

# Train the ensemble
super_learner.fit(val_Xtrain, val_ytrain)

# Predict the test set and threshold the positive column at 0.5.
p_ens = super_learner.predict(val_Xtest)[:, 1]
p_ens_label = 1*(p_ens >= 0.5)
print('The acccuracy of super learner:',
      metrics.accuracy_score(p_ens_label, val_ytest))

# ### Producing the Submission file
#
# Finally having trained and fit the base and meta learners, we can now
# output the predictions into the proper format for submission to the
# Titanic competition as follows:

# In[ ]:

# Generate Submission File
# NOTE(review): the source chunk is truncated inside this constructor.
Submission = pd.DataFrame({
    'PassengerId': PassengerId_test,
# Metrics for the previous model's predictions (`ans` from the block above).
print('Recall:', '%.6f' % recall_score(y_test, ans))
fpr, tpr, thresholds = roc_curve(y_test, ans)
print('AUC:', '%.6f' % auc(fpr, tpr))

#-------------------------------------------------------------------------------------------------#
'''ensemble SL1'''
seed = 2018
np.random.seed(seed)

# Three-model base layer with an SVC meta learner, accuracy-scored CV.
ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)
ensemble.add([
    ExtraTreesClassifier(n_estimators=25, random_state=seed),
    KNeighborsClassifier(n_neighbors=2),
    AdaBoostClassifier(n_estimators=100),
])
ensemble.add_meta(SVC())
ensemble.fit(X_train, y_train)
ans = ensemble.predict(X_test)

FP, FN, TP, TN = conf_matrix(y_test, ans)
print('--------------------Super Learner--------------------')  # test 78.85%
print('Precision:', '%.6f' % precision_score(y_test, ans))
print('Recall:', '%.6f' % recall_score(y_test, ans))
fpr, tpr, thresholds = roc_curve(y_test, ans)
print('AUC:', '%.6f' % auc(fpr, tpr))

'''ensemble SL2'''
# Disabled variant: ExtraTrees + AdaBoost base layer, SVC meta learner.
#seed = 2018
#np.random.seed(seed)
#ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)
#ensemble.add([ExtraTreesClassifier(n_estimators=30,random_state=seed),AdaBoostClassifier(n_estimators=100)])
#ensemble.add_meta(SVC())
#ensemble.fit(X_train,y_train)
#ans = ensemble.predict(X_test)
# NOTE(review): the prints below appear to sit inside a triple-quoted block
# whose opening quotes are before this chunk; the ''' after print(Xt)
# closes it — confirm against the full file.
print(Xv)
print(yt)
print(yv)
Xt.fillna(-1)
Xv.fillna(-1)
yt.fillna(-1)
yv.fillna(-1)
print(Xt)
'''

# Try each candidate base layer; keep the one with the best hold-out accuracy.
for clf in stacked_clf_list:
    ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, folds=10)
    ensemble.add(clf[0])
    ensemble.add_meta(lr)
    ensemble.fit(Xt, yt)
    preds = ensemble.predict(Xv)
    accuracy = accuracy_score(preds, yv)

    # New best: remember its label and its predictions on the real test set.
    if accuracy > best_combination[0]:
        best_combination[0] = accuracy
        best_combination[1] = clf[1]
        preds = ensemble.predict(X_test)
        best_preds = preds
    print(f"Accuracy score: {accuracy} {clf[1]}")

print(
    f"\nBest stacking model is {best_combination[1]} with accuracy of: {best_combination[0]}"
)

# Output
print(best_preds)
# NOTE(review): this chunk starts mid-loop — the enclosing loop over layers
# (index j), plus `layer_weights`, `weights_total`, `num_to_slot`,
# `eval_ind`, `ens` and `highest_score`, are all defined before this view.
            continue
        # Allocate a weighted share of the remaining learners to layer j.
        num_in_layer = int(layer_weights[j] / weights_total * num_to_slot)
        layerlist = []
        for k in range(num_in_layer):
            layerlist.append(eval_ind.pop())
        ens.add(layerlist)
    # then add the meta model
    ens.add_meta(lgbm(n_estimators=1000, verbose=-1))
    try:
        ens.fit(X_train, y_train)
        # Fitness = train F1 * test F1.
        train_score = f1_score(ens.predict(X_train), y_train)
        test_score = f1_score(ens.predict(X_test), y_test)
        real_score = train_score * test_score
        print(' Training score is {}'.format(train_score))
        print(' Testing score is {}'.format(test_score))
        print(' Real score is {}'.format(real_score))
    except:
        # NOTE(review): bare except silently discards all errors (including
        # KeyboardInterrupt) — consider narrowing to `except Exception`.
        print(' There was an error with this one. Throwing it out')
        continue
    if real_score > highest_score:
        print(' New highest score found!')
        highest_score = real_score
        winning_model = ens
'''
# Manual 2-fold CV stacking baseline.
cv_base_learners, cv_meta_learner = stacking(
    get_models(), clone(meta_learner), xtrain.values, ytrain.values, KFold(2))
P_pred, p = ensemble_predict(cv_base_learners, cv_meta_learner, xtest,
                             verbose=False)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))  # 0.881

# The manual loop above is slow — the mlens SuperLearner below runs the same
# computation in parallel (multiprocessing backend) with 10 folds.
sl = SuperLearner(folds=10, random_state=SEED, verbose=2,
                  backend="multiprocessing")
sl.add(list(base_learners.values()), proba=True)
sl.add_meta(meta_learner, proba=True)
sl.fit(xtrain, ytrain)
p_sl = sl.predict_proba(xtest)
print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest, p_sl[:, 1]))

# Compare the super learner against the simple-average baseline.
plot_roc_curve(ytest, p.reshape(-1, 1), P.mean(axis=1), ["Simple average"],
               "Super Learner", 'ROC_curve_with_super_learning')  # 0.890
# NOTE(review): this chunk starts mid-call — the line below is the tail of an
# ensemble_predict(...) invocation begun before this view.
                                xtest, verbose=False)
print("\nEnsemble (Stacking) ROC-AUC score: %.3f" % roc_auc_score(ytest, p))

# Instantiate the ensemble with 10 folds
ensemble = SuperLearner(folds=10, random_state=SEED, verbose=2,
                        backend="multiprocessing")

# Add the base learners and the meta learner
ensemble.add(list(base_learners.values()), proba=True)
ensemble.add_meta(meta_learner, proba=True)

# Train the ensemble
ensemble.fit(xtrain, ytrain)

# Predict the test set
p_sl = ensemble.predict_proba(xtest)
print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest, p_sl[:, 1]))

# Compare ROC curves (`P` and plot_roc_curve come from earlier in the file).
plot_roc_curve(ytest, p.reshape(-1, 1), P.mean(axis=1), ["Simple average"],
               "Super Learner")

print('-------------------------------------')
print(test.head())
# Score the external test frame (first column skipped — presumably an id).
y_pred = ensemble.predict(test.iloc[:, 1:].values)
print(y_pred)
# NOTE(review): this chunk starts mid-call — the estimators below are the
# tail of an ensemble.add([...]) list begun before this view.
    RandomForestClassifier(random_state=seed, n_estimators=250),
    SVC(),
    LassoLarsIC(criterion='bic'),
    ElasticNet(random_state=0),
    BayesianRidge(),
    MLPClassifier(),
    BaggingClassifier(),
    neighbors.KNeighborsClassifier(),
    tree.DecisionTreeClassifier(),
    GradientBoostingClassifier(n_estimators=200)
])

# Attach the final meta estimator
ensemble.add_meta(LogisticRegression())
ensemble.fit(x_train, y_train)
preds = ensemble.predict(x_test)
ensemble_data = pd.DataFrame(ensemble.data)

# NOTE(review): scikit-learn metrics expect (y_true, y_pred); the calls
# below pass (preds, y_test).  Accuracy is symmetric, but AUROC, precision,
# recall and roc_curve are not — confirm the intended argument order.
auroc = roc_auc_score(preds, y_test)
acc = accuracy_score(preds, y_test)
p = precision_score(preds, y_test)
r = recall_score(preds, y_test)
frp, tpr, threshholds = roc_curve(preds, y_test)

# ROC curve of the ensemble's predictions.
fig = plt.figure()
plt.plot(frp, tpr)
plt.show()
from mlens.ensemble import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# %% Preparing the dataset and the output label
dataset = np.loadtxt('../dataset/train.csv', dtype=str, delimiter=",")
dataset, outcome = prgm1.pre_processing(dataset)
partition = np.round(0.8 * dataset.shape[0]).__int__()
train_set = dataset[0:partition, :]
test_set = dataset[partition:, :]

# %% Training
test_outcome = np.array(outcome[partition:]).astype(int)
train_outcome = np.array(outcome[0:partition]).astype(int)

seed = 2017
np.random.seed(seed)

ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)
# Build the first layer.
# BUG FIX: `gamma` is an SVC hyper-parameter, not a fit() argument — the
# original `ensemble.fit(..., gamma="auto")` would raise a TypeError.
ensemble.add([RandomForestClassifier(random_state=seed), SVC(gamma="auto")])
# Attach the final meta estimator
ensemble.add_meta(LogisticRegression())
# Fit ensemble
ensemble.fit(train_set, train_outcome)
# Predict
preds = ensemble.predict(test_set)
print("Fit data:\n%r" % ensemble.data)
print("Prediction score: %.3f" % accuracy_score(preds, test_outcome))