def stacknet_train_test(X, y, text=False):
    """Train a two-level StackNet on a 60/40 split and append a report to disk.

    Args:
        X: feature matrix handed to ``train_test_split``.
        y: target labels handed to ``train_test_split``.
        text: when truthy the report is labelled ``"stackNet_text"``,
            otherwise ``"stackNet"``.

    Side effects:
        Appends the report produced by ``get_report`` to
        ``final_results/final_report_stackNet.txt``. Returns nothing.
    """
    level_one = [
        RandomForestClassifier(n_estimators=500, max_depth=3, random_state=0),
        ExtraTreesClassifier(n_estimators=100, max_depth=5, random_state=0),
        SGDClassifier(loss="log", penalty="l2", max_iter=5),
        KNeighborsClassifier(n_neighbors=5),
        LogisticRegression(random_state=0),
        MLPClassifier(
            solver='lbfgs',
            alpha=1e-5,
            hidden_layer_sizes=(5, 2),
            random_state=0,
            learning_rate='invscaling',
        ),
        AdaBoostClassifier(n_estimators=500, learning_rate=1e-3, random_state=0),
    ]
    level_two = [
        RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=0),
    ]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=0, shuffle=True
    )

    stack = StackNetClassifier(
        [level_one, level_two],
        metric="f1",
        folds=4,
        restacking=True,
        use_retraining=True,
        use_proba=True,
        random_state=0,
        verbose=1,
    )
    stack.fit(X_train, y_train)
    probabilities = stack.predict_proba(X_test)

    # Label 0 only when column 0 strictly beats column 1; ties go to label 1.
    y_pred = np.array([0 if row[0] > row[1] else 1 for row in probabilities])
    # NOTE(review): the score handed to the report is the column-0 probability,
    # not column 1 -- confirm this is what get_report expects.
    y_score = [row[0] for row in probabilities]

    name = "stackNet_text" if text else "stackNet"

    # get_report is imported from the models_final.py file
    report_card = get_report(y_test, y_pred, y_score, name)

    with open("final_results/final_report_stackNet.txt", "a") as report_file:
        report_file.writelines([
            f"Classification report for {name}: \n",
            report_card,
            "\n",
            "-----------------------------------------------------------------",
            "\n",
        ])
def test_pystacknet():
    """Smoke-run StackNetClassifier and StackNetRegressor through many setups.

    Every scenario builds a model, fits it, predicts on the test data and
    prints one score line. Nothing is asserted and nothing is returned.

    Relies on names defined outside this function: ``x_train``, ``y_train``,
    ``x_test``, ``y_test``, ``y2d``, ``w_train``, ``w_test`` and the metric
    callables ``gini``, ``R``, ``rmse`` and ``mae``.
    """
    train_arr = np.array(x_train)
    target_arr = np.array(y_train)
    print(train_arr.shape, target_arr.shape)

    # Hyper-parameters shared by every tree-based estimator below.
    tree_kw = dict(max_depth=5, max_features=0.5, random_state=1)

    # ---- estimator factories (a fresh instance on every call) ----
    def forest_clf(n_trees):
        return RandomForestClassifier(n_estimators=n_trees, criterion="entropy", **tree_kw)

    def extra_clf():
        return ExtraTreesClassifier(n_estimators=100, criterion="entropy", **tree_kw)

    def boost_clf():
        return GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, **tree_kw)

    def logit():
        return LogisticRegression(random_state=1)

    def forest_reg(n_trees):
        return RandomForestRegressor(n_estimators=n_trees, **tree_kw)

    def extra_reg():
        return ExtraTreesRegressor(n_estimators=100, **tree_kw)

    def boost_reg():
        return GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, **tree_kw)

    def ridge():
        return Ridge(random_state=1)

    # ---- StackNet factories carrying the settings common to all scenarios ----
    def new_classifier(levels, metric, folds=4, restacking=False, use_retraining=True):
        return StackNetClassifier(
            levels,
            metric=metric,
            folds=folds,
            restacking=restacking,
            use_retraining=use_retraining,
            use_proba=True,
            random_state=12345,
            n_jobs=1,
            verbose=1,
        )

    def new_regressor(levels, metric, folds=4, restacking=False, use_retraining=True):
        return StackNetRegressor(
            levels,
            metric=metric,
            folds=folds,
            restacking=restacking,
            use_retraining=use_retraining,
            random_state=12345,
            n_jobs=1,
            verbose=1,
        )

    # =====================================================================
    # CLASSIFICATION
    # =====================================================================
    clf_levels = [
        [forest_clf(100), extra_clf(), boost_clf(), logit()],
        [forest_clf(200)],
    ]

    # 1. metric that does not use probabilities
    stack = new_classifier(clf_levels, "accuracy")
    stack.fit(x_train, y_train)
    scores = stack.predict_proba(x_test)[:, 1]
    print("accuracy test 1 , auc %f " % (roc_auc_score(y_test, scores)))

    # 2. probability-based metric
    stack = new_classifier(clf_levels, "auc")
    stack.fit(x_train, y_train)
    scores = stack.predict_proba(x_test)[:, 1]
    print("auc test 2 , auc %f " % (roc_auc_score(y_test, scores)))

    # 3. user-supplied metric callable
    # NOTE(review): the printed value is gini(...), although the label says "auc".
    stack = new_classifier(clf_levels, gini)
    stack.fit(x_train, y_train)
    scores = stack.predict_proba(x_test)[:, 1]
    print("custom metric gini test 3 , auc %f " % (gini(y_test, scores)))

    # 4. numpy array input
    stack = new_classifier(clf_levels, "auc")
    stack.fit(train_arr, target_arr)
    scores = stack.predict_proba(np.array(x_test))[:, 1]
    print("numpy auc test 4 , auc %f " % (roc_auc_score(y_test, scores)))

    # 5. csr_matrix input
    stack = new_classifier(clf_levels, "auc")
    stack.fit(csr_matrix(train_arr), target_arr)
    scores = stack.predict_proba(csr_matrix(x_test))[:, 1]
    print("csr auc test 5 , auc %f " % (roc_auc_score(y_test, scores)))

    # 6. restacking enabled
    stack = new_classifier(clf_levels, "auc", restacking=True)
    stack.fit(csr_matrix(train_arr), target_arr)
    scores = stack.predict_proba(csr_matrix(x_test))[:, 1]
    print("restacking auc test 6 , auc %f " % (roc_auc_score(y_test, scores)))

    # 7. retraining disabled
    stack = new_classifier(clf_levels, "auc", restacking=True, use_retraining=False)
    stack.fit(csr_matrix(train_arr), target_arr)
    scores = stack.predict_proba(csr_matrix(x_test))[:, 1]
    print("no retraining auc test 7 , auc %f " % (roc_auc_score(y_test, scores)))

    # 8. caller-supplied fold object
    # NOTE(review): StratifiedKFold(y, n_folds=...) looks like the legacy
    # sklearn.cross_validation call form -- confirm against the file's import.
    splitter = StratifiedKFold(target_arr, n_folds=4, shuffle=True, random_state=1251)
    stack = new_classifier(
        clf_levels, "auc", folds=splitter, restacking=True, use_retraining=False
    )
    stack.fit(csr_matrix(train_arr), target_arr)
    scores = stack.predict_proba(csr_matrix(x_test))[:, 1]
    print("custom kfold auc test 8 , auc %f " % (roc_auc_score(y_test, scores)))

    # 9. a regressor mixed into the first level
    levels_with_regressor = [
        [forest_clf(100), extra_reg(), boost_clf(), logit()],
        [forest_clf(200)],
    ]
    stack = new_classifier(levels_with_regressor, "auc")
    stack.fit(x_train, y_train)
    scores = stack.predict_proba(x_test)[:, 1]
    print("with regressor test 9 , auc %f " % (roc_auc_score(y_test, scores)))

    # 10. a transformer (PCA) mixed into the first level
    levels_with_pca = [
        [
            forest_clf(100),
            extra_reg(),
            boost_clf(),
            logit(),
            PCA(n_components=4, random_state=1),
        ],
        [forest_clf(200)],
    ]
    stack = new_classifier(levels_with_pca, "auc")
    stack.fit(x_train, y_train)
    scores = stack.predict_proba(x_test)[:, 1]
    print("with PCA test 10 , auc %f " % (roc_auc_score(y_test, scores)))

    # 11. multiclass metric; y2d is sliced at row 100 for the train/test targets
    # NOTE(review): the printed value is log_loss(...), although the label says "auc".
    stack = new_classifier(clf_levels, "logloss")
    stack.fit(x_train, y2d[:100])
    scores = stack.predict_proba(x_test)
    print("logloss test 11 , auc %f " % (log_loss(y2d[100:], scores)))

    # 12. three levels
    three_clf_levels = [
        [forest_clf(100), extra_clf(), boost_clf(), logit()],
        [boost_clf(), logit()],
        [forest_clf(200)],
    ]
    stack = new_classifier(three_clf_levels, "logloss")
    stack.fit(x_train, y2d[:100])
    scores = stack.predict_proba(x_test)
    print("3 levels test 12 , auc %f " % (log_loss(y2d[100:], scores)))

    # 13. sample weights
    stack = new_classifier(clf_levels, "auc")
    stack.fit(x_train, y_train, sample_weight=w_train)
    scores = stack.predict_proba(x_test)[:, 1]
    print("auc weighted test 13 , auc %f " % (roc_auc_score(y_test, scores, sample_weight=w_test)))

    # =====================================================================
    # REGRESSION
    # =====================================================================
    reg_levels = [
        [forest_reg(100), extra_reg(), boost_reg(), ridge()],
        [forest_reg(200)],
    ]

    # 1. rmse metric
    stack = new_regressor(reg_levels, "rmse")
    stack.fit(x_train, y_train)
    scores = stack.predict(x_test)
    print("rmse test 1 , %f " % (rmse(y_test, scores)))

    # 2. mae metric
    stack = new_regressor(reg_levels, "mae")
    stack.fit(x_train, y_train)
    scores = stack.predict(x_test)
    print("mae test 2 , %f " % (mae(y_test, scores)))

    # 3. user-supplied metric callable
    stack = new_regressor(reg_levels, R)
    stack.fit(x_train, y_train)
    scores = stack.predict(x_test)
    print("custom metric R test 3 %f " % (R(y_test, scores)))

    # 4. numpy array input (training side only)
    stack = new_regressor(reg_levels, "rmse")
    stack.fit(train_arr, target_arr)
    scores = stack.predict(x_test)
    print("numpy rmse test 4 %f " % (rmse(y_test, scores)))

    # 5. csr_matrix input (training side only)
    stack = new_regressor(reg_levels, "rmse")
    stack.fit(csr_matrix(train_arr), target_arr)
    scores = stack.predict(x_test)
    print("csr test 5 , rmse %f " % (rmse(y_test, scores)))

    # 6. restacking enabled
    stack = new_regressor(reg_levels, "rmse", restacking=True)
    stack.fit(csr_matrix(train_arr), target_arr)
    scores = stack.predict(x_test)
    print("restacking rmse test 6 , rmse %f " % (rmse(y_test, scores)))

    # 7. retraining disabled
    stack = new_regressor(reg_levels, "rmse", restacking=True, use_retraining=False)
    stack.fit(csr_matrix(train_arr), target_arr)
    scores = stack.predict(x_test)
    print("no retraining rmse test 7, rmse %f " % (rmse(y_test, scores)))

    # 8. caller-supplied fold object (same legacy call form as above)
    splitter = StratifiedKFold(target_arr, n_folds=4, shuffle=True, random_state=1251)
    stack = new_regressor(
        reg_levels, "rmse", folds=splitter, restacking=True, use_retraining=False
    )
    stack.fit(csr_matrix(train_arr), target_arr)
    scores = stack.predict(x_test)
    print("custom kfold rmse test 8, %f " % (rmse(y_test, scores)))

    # 9. a classifier mixed into the first level
    # NOTE(review): the label below says "with regressor" although the odd one
    # out here is a classifier.
    levels_with_classifier = [
        [
            forest_reg(100),
            ExtraTreesClassifier(n_estimators=100, **tree_kw),
            boost_reg(),
            ridge(),
        ],
        [forest_reg(200)],
    ]
    stack = new_regressor(levels_with_classifier, "rmse")
    stack.fit(x_train, y_train)
    scores = stack.predict(x_test)
    print("with regressor test 9, rmse %f " % (rmse(y_test, scores)))

    # 10. a transformer (PCA) mixed into the first level
    levels_with_pca = [
        [
            forest_reg(100),
            extra_reg(),
            boost_reg(),
            ridge(),
            PCA(n_components=4, random_state=1),
        ],
        [forest_reg(200)],
    ]
    stack = new_regressor(levels_with_pca, "rmse")
    stack.fit(x_train, y_train)
    scores = stack.predict(x_test)
    print("with PCA test 10 , rmse %f " % (rmse(y_test, scores)))

    # 11. two-column target (no gradient boosting in this first level)
    two_target_levels = [
        [forest_reg(100), extra_reg(), ridge()],
        [forest_reg(200)],
    ]
    stack = new_regressor(two_target_levels, "rmse")
    stack.fit(x_train, np.column_stack((y_train, y2d[:100])))
    scores = stack.predict(x_test)
    print("rmse test 11 , rmse %f " % (rmse(np.column_stack((y_test, y2d[100:])), scores)))

    # 12. three levels
    three_reg_levels = [
        [forest_reg(100), extra_reg(), ridge()],
        [extra_reg(), ridge()],
        [forest_reg(200)],
    ]
    stack = new_regressor(three_reg_levels, "rmse")
    stack.fit(x_train, y2d[:100])
    scores = stack.predict(x_test)
    print("3 levels test 12 , rmse %f " % (rmse(y2d[100:], scores)))

    # 13. sample weights
    stack = new_regressor(reg_levels, "rmse")
    stack.fit(x_train, y_train, sample_weight=w_train)
    scores = stack.predict(x_test)
    print("rmse weighted test 13 , %f " % (rmse(y_test, scores, sample_weight=w_test)))
epsilon=0.1) ] ] # leave 4 subject out kf = KFold(4) generator = kf.split(X_train, y_train) # build StackNet model = StackNetClassifier(models, metric="auc", folds=generator, restacking=False, use_retraining=True, use_proba=True, random_state=42, n_jobs=-1, verbose=1) # evaluate model model.fit(X_train, y_train) y_probs = model.predict_proba(X_test)[:, 1] # save score csv = pd.read_csv('./data/benchmark.csv') csv['Prediction'] = y_probs csv.to_csv('submission_StackNet.csv', index=False) print( '--------------------Submission file has been generated.--------------------------' )
use_proba=param_stacknet['use_proba'], random_state=param_stacknet['random_state'], n_jobs=param_stacknet['n_jobs'], verbose=param_stacknet['verbose']) if model: st.info("Génération du model StackNet est terminé") choix = st.checkbox("Afficher le datasetCovid") if choix: X_train, X_test, y_train, y_test = get_Covid_19() if st.checkbox("Affichez les shape"): st.text(X_train.shape) if st.checkbox("Evaluer") & choix: model.fit(X_train, y_train) output = model.predict_proba(X_test) output_copy = output output_copy = pd.DataFrame(output_copy) output_copy = output_copy.reset_index() output_copy = output_copy.rename(index=str, columns={'index': 'Personne Id', 0: 'Negative Proba', 1: 'Positive Proba'}) output_copy['Personne Id'] = output_copy.replace(range(0, 111), X_test['Personne Id']) output_copy["Personne Id"] = output_copy["Personne Id"].astype("int") output_copy["Covid test"] = output_copy['Negative Proba'] < output_copy['Positive Proba'] output_copy['Covid test'] = output_copy['Covid test'].replace([True, False], ['Positive', 'Negative'])
def test_pystacknet():
    """Fit a two-level StackNet on one-hot encoded data and save predictions.

    Loads ``train.csv`` and ``test.csv`` through ``load_data``, one-hot
    encodes both with a single encoder, fits a ``StackNetClassifier`` scored
    on AUC, and hands the second column of ``predict_proba`` to
    ``save_results`` under the file name ``pystacknet_pred.csv``.

    Takes no arguments and returns nothing.
    """
    path = ""
    y, X = load_data(path, 'train.csv')
    # The first element returned for the unlabeled test file is not used.
    _, X_test = load_data(path, 'test.csv', use_labels=False)

    # === one-hot encoding === #
    # Fit on train and test stacked together so that category IDs seen in
    # either set get a column.
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((X, X_test)))
    X = encoder.transform(X)  # sparse matrix by default (scipy.sparse)
    X_test = encoder.transform(X_test)

    first_level = [
        LogisticRegression(C=1, random_state=1),
        LogisticRegression(C=3, random_state=1),
        Ridge(alpha=0.1, random_state=1),
        # L1 needs a solver that supports it. The default became 'lbfgs' in
        # scikit-learn 0.22 and rejects penalty="l1"; 'liblinear' was the
        # previous default, so naming it keeps the old behaviour.
        LogisticRegression(penalty="l1", C=1, solver="liblinear", random_state=1),
        XGBClassifier(
            max_depth=5,
            learning_rate=0.1,
            n_estimators=300,
            objective="binary:logistic",
            n_jobs=1,
            booster="gbtree",
            random_state=1,
            colsample_bytree=0.4,
        ),
        XGBClassifier(
            max_depth=5,
            learning_rate=0.3,
            reg_lambda=0.1,
            n_estimators=300,
            objective="binary:logistic",
            n_jobs=1,
            booster="gblinear",
            random_state=1,
            colsample_bytree=0.4,
        ),
        XGBClassifier(
            max_depth=5,
            learning_rate=0.1,
            n_estimators=300,
            objective="rank:pairwise",
            n_jobs=1,
            booster="gbtree",
            random_state=1,
            colsample_bytree=0.4,
        ),
        LGBMClassifier(
            boosting_type='gbdt',
            num_leaves=40,
            max_depth=-1,
            learning_rate=0.01,
            n_estimators=1000,
            subsample_for_bin=1000,
            objective="xentropy",
            min_split_gain=0.0,
            min_child_weight=0.01,
            min_child_samples=10,
            subsample=0.9,
            subsample_freq=1,
            colsample_bytree=0.5,
            reg_alpha=0.0,
            reg_lambda=0.0,
            random_state=1,
            n_jobs=1,
        ),
    ]
    second_level = [
        RandomForestClassifier(
            n_estimators=300,
            criterion="entropy",
            max_depth=6,
            max_features=0.5,
            random_state=1,
        ),
    ]
    models = [first_level, second_level]

    # Probability-based metric.
    model = StackNetClassifier(
        models,
        metric="auc",
        folds=4,
        restacking=False,
        use_retraining=True,
        use_proba=True,
        random_state=12345,
        n_jobs=1,
        verbose=1,
    )
    model.fit(X, y)

    # Column 1 of predict_proba is what gets saved.
    preds = model.predict_proba(X_test)[:, 1]
    save_results(preds, path + "pystacknet_pred.csv")
######## Third level ######## [ RandomForestClassifier(n_estimators=200, criterion="entropy", max_depth=5, max_features=0.5, random_state=1), # LogisticRegression(random_state=1), ] ] from pystacknet.pystacknet import StackNetClassifier model = StackNetClassifier(models, metric=metric_self, folds=5, restacking=False, use_retraining=True, use_proba=True, random_state=12345, n_jobs=-1, verbose=1) model.fit(X_loc_train, y_loc_train) preds = model.predict_proba(X_loc_test)[:, 1] predict_result = test.loc[:, ['UID']] predict_result['Tag'] = preds now = datetime.datetime.now() now = now.strftime('%m-%d-%H-%M') predict_result[['UID', 'Tag']].to_csv("lgb_stacknet%s.csv" % now, index=False) print(predict_result.head())
# Fit a StackNet ensemble on every column of `train` except TARGET_COL.
# `models`, `train`, `test` and TARGET_COL are defined outside this fragment.
model = StackNetClassifier(models,
                           metric="auc",
                           folds=5,
                           restacking=False,
                           use_retraining=False,
                           use_proba=True,
                           random_state=555,
                           n_jobs=1,
                           verbose=2)

model.fit(train.drop(TARGET_COL, axis=1), train[TARGET_COL])

# Bare expression with no effect in a script (presumably left over from a
# notebook cell -- confirm before removing).
test.shape

# Predict on `test` restricted to the training feature columns, in training
# column order. NOTE(review): fit() received a DataFrame while this call
# passes `.values`.
y_pred = model.predict_proba(test[list(train.drop(TARGET_COL, axis=1).columns)].values)

# Read the two submission columns and store the column-1 probability under
# TARGET_COL.
sample_submission = pd.read_csv('sb_test.csv')[[
    'encounter_id', 'hospital_death'
]]
sample_submission[TARGET_COL] = y_pred[:, 1]

# NOTE(review): `pd` is already used above, so pandas must have been imported
# earlier in the file; this import is then redundant.
import pandas as pd

# From here on `test` is rebound to the contents of 'unlabeled.csv'.
test = pd.read_csv('unlabeled.csv')

# Copy of the feature columns with `age` divided by 10, rounded, and
# multiplied back by 10; scored with the already-fitted model.
test_2 = test[list(train.drop(TARGET_COL, axis=1).columns)].copy()
test_2['age'] = (test['age'] / 10).round() * 10
y_pred_2 = model.predict_proba(test_2)[:, 1]

# A second, unmodified copy of the feature columns.
test_3 = test[list(train.drop(TARGET_COL, axis=1).columns)].copy()
# Convert the training frame to a plain ndarray before fitting.
# DataFrame.as_matrix() was removed in pandas 1.0; `.values` gives the same
# array and also exists on the older releases that still had as_matrix().
X_train = X_train.values

# then refit model, it is ok
model = StackNetClassifier(
    models,
    metric="auc",
    folds=2,
    restacking=False,
    use_retraining=False,
    use_proba=True,
    random_state=42,
    verbose=1,
)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)
# The two predict_proba columns are written out as 'predictions' and 'isFraud'.
pd.DataFrame(y_pred, columns=['predictions', 'isFraud']).to_csv('prediction StackNetClassifier.csv')

# Neural Networks
from sklearn.neural_network import MLPClassifier

# hidden_layer_sizes was written as (3), which is the int 3 rather than a
# tuple; (3,) spells out the same single hidden layer of three units.
clf_nn = MLPClassifier(
    solver='lbfgs',
    activation='relu',
    alpha=1e-3,
    hidden_layer_sizes=(3,),
    random_state=123,
    verbose=False,
)
clf_nn.fit(X_train, y_train)
# y_pred is reused here and now holds the MLP probabilities.
y_pred = clf_nn.predict_proba(X_test)
pd.DataFrame(y_pred, columns=['predictions', 'isFraud']).to_csv('prediction Neural Networks.csv')
subsample_freq=5, colsample_bytree=0.05, reg_alpha=0.1, reg_lambda=0.35, random_state=1, n_jobs=-1) ], [ RandomForestClassifier(n_estimators=300, criterion="entropy", max_depth=6, max_features=0.5, random_state=1) ]] model = StackNetClassifier(models, metric="auc", folds=5, restacking=False, use_retraining=True, use_proba=True, random_state=0, n_jobs=8, verbose=1) model.fit(train_df.iloc[:, 2:].values, train_df.iloc[:, 1].values) preds = model.predict_proba(test_df.iloc[:, 1:].values) sub = test_df.iloc[:, :2].drop(columns=['var_0']) sub['target'] = preds[:, 1] sub.to_csv('submission.csv', index=False)