def model_stack2():
    """Fit a four-model stack on the scaled training data and write test predictions.

    Data comes from ``data_process``: the person data supplies the test frame
    (whose ``'ID'`` column is popped to label the output rows) and the training
    labels; the scaled data supplies the feature matrices. Predictions are
    written to ``result/result6.csv`` as headerless ``id,yhat`` rows.

    The original body also made an 80/20 ``train_test_split`` whose four
    results were never read; that dead split has been removed. The stack is
    fitted on the full training set, exactly as before.
    """
    _, test_df, train_label = data_process.get_person_data()
    train_data, test_data = data_process.get_scale_data()
    # pop() also removes the 'ID' column from test_df.
    id_list = list(test_df.pop('ID'))

    # The three boosters share every setting except the objective.
    shared = dict(n_estimators=1000, subsample=0.8, learning_rate=0.25)
    model1 = gbt.XGBRegressor(objective='reg:linear', **shared)
    model2 = gbt.XGBRegressor(objective='reg:gamma', **shared)
    model3 = gbt.XGBRegressor(objective='reg:tweedie', **shared)
    model4 = svm.SVR()

    # NOTE(review): regressors are stacked through StackingClassifier and the
    # tweedie booster doubles as the meta-learner — confirm this is intended.
    stack_model = StackingClassifier(
        classifiers=[model1, model2, model3, model4],
        meta_classifier=model3)
    stack_model.fit(train_data, train_label)
    yHat = stack_model.predict(test_data)

    result = pd.DataFrame({'id': id_list, 'yhat': yHat})
    result.to_csv('result/result6.csv', index=False, header=None,
                  encoding='utf-8')
def run(cls) -> StackingClassifier:
    """
    Build a StackingClassifier from every registered model and fit it.

    Instantiates ``cls``, loads the training data through ``load_train()``,
    stacks one fresh instance of each entry in ``_models`` (the class named
    ``DumbModel`` is left out) under a ``LogisticRegression`` meta-classifier,
    fits the ensemble on copies of the data and returns it.
    """
    instance = cls()
    features, target = instance.load_train()

    base_learners = []
    for model_cls in instance._models:
        if model_cls.__name__ == 'DumbModel':
            continue
        base_learners.append(model_cls())

    stacker = StackingClassifier(
        classifiers=base_learners,
        meta_classifier=LogisticRegression(),
        verbose=1,
        average_probas=False,
        use_probas=True,
    )

    # A cross_val_score sanity check (neg_log_loss, cv=2) used to run here; it
    # is currently disabled.

    # Final fit on the whole training set; copies keep the loaded data intact.
    print('Fitting Stacking Classifier to entire training dataset...')
    stacker.fit(features.copy(), target.copy())
    return stacker
class ClassifierBlender:
    """Stack three project-defined models behind a linear meta-learner.

    Call ``clf_blend()`` before ``score()`` or ``prediction()``; both read the
    ``blend`` attribute that ``clf_blend()`` creates.
    """

    def __init__(self, x_train, x_test, y_train, y_test=None):
        """Store the cleaned feature frames and the label arrays.

        Args:
            x_train: Training features. The ``'Unnamed: 0'`` and ``'Id'``
                columns are dropped **in place**, so the caller's frame is
                modified.
            x_test: Test features; same in-place column drop.
            y_train: Label container; its ``'y'`` entry's ``.values`` is kept.
            y_test: Optional label container for the test set, same layout.
                When omitted, ``self.y_test`` is ``None``.
        """
        x_train.drop(['Unnamed: 0', 'Id'], axis=1, inplace=True)
        x_test.drop(['Unnamed: 0', 'Id'], axis=1, inplace=True)
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train['y'].values
        # The guard used to test self.y_train (never None at this point), so
        # the default y_test=None crashed on None['y']. Test y_test itself and
        # always define the attribute.
        self.y_test = y_test['y'].values if y_test is not None else None

    def clf_blend(self):
        """Build the stack, fit it on the training data and return it."""
        mete_clf = LinearRegression()
        clf1 = model.svm_regressor()
        clf2 = model.randomforest_regressor()
        clf3 = model.xgb_regressor()
        self.blend = StackingClassifier(classifiers=[clf1, clf2, clf3],
                                        meta_classifier=mete_clf)
        self.blend.fit(self.x_train, self.y_train)
        return self.blend

    def score(self):
        """Return 10-fold cross-validation scores of the blend on the training data."""
        scores = cross_val_score(self.blend, X=self.x_train, y=self.y_train,
                                 cv=10, verbose=2)
        return scores

    def prediction(self):
        """Return the blend's predictions for the stored test features."""
        y_pred = self.blend.predict(self.x_test)
        return y_pred
def stacking2():
    """Stack two column-subset logistic-regression pipelines on iris.

    The stacked model is fitted once on the full data set and then scored
    with 5-fold cross-validation; the mean and standard deviation of the
    accuracy are printed.
    """
    from sklearn.datasets import load_iris
    from mlxtend.classifier import StackingClassifier
    from mlxtend.feature_selection import ColumnSelector
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression
    from sklearn import model_selection

    dataset = load_iris()
    features, target = dataset.data, dataset.target

    # One base pipeline per feature subset.
    base_pipelines = [
        make_pipeline(ColumnSelector(cols=column_subset), LogisticRegression())
        for column_subset in ((0, 2), (1, 2, 3))
    ]
    stacker = StackingClassifier(
        classifiers=base_pipelines,
        meta_classifier=LogisticRegression(),
        use_features_in_secondary=True,
        store_train_meta_features=True,
    )
    stacker.fit(features, target)

    cv_scores = model_selection.cross_val_score(
        stacker, features, target, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std()))
def test_StackingClassifier_avg_vs_concat():
    """Averaged probabilities give one block of columns, concatenated give two.

    Both stacks use the same logistic regression twice as base learner. With
    ``average_probas=True`` the meta-features are a single 3-column block whose
    rows sum to 1; with ``average_probas=False`` they are two 3-column blocks
    whose rows sum to 2.
    """
    np.random.seed(123)
    lr1 = LogisticRegression()

    sclf1 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               average_probas=True,
                               meta_classifier=lr1)
    sclf1.fit(X, y)
    r1 = sclf1._predict_meta_features(X[:2])
    assert r1.shape == (2, 3)
    assert_almost_equal(np.sum(r1[0]), 1.0, places=6)
    assert_almost_equal(np.sum(r1[1]), 1.0, places=6)

    sclf2 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               average_probas=False,
                               meta_classifier=lr1)
    sclf2.fit(X, y)
    r2 = sclf2._predict_meta_features(X[:2])
    assert r2.shape == (2, 6)
    assert_almost_equal(np.sum(r2[0]), 2.0, places=6)
    assert_almost_equal(np.sum(r2[1]), 2.0, places=6)
    # The comparison result used to be discarded, so it could never fail.
    # Identical base learners must yield matching halves.
    assert np.allclose(r2[0][:3], r2[0][3:])
def stacking3Model(model1, model2, metamodel, xtr, ytr, xts, yts):
    """Fit a two-model stack on MaxAbs-scaled data and evaluate it.

    Args:
        model1, model2: Base estimators for the stack.
        metamodel: Estimator used as the meta-classifier.
        xtr, ytr: Training features and labels.
        xts, yts: Test features and labels.

    Returns:
        Tuple ``(acc, predict)``: accuracy on the test set and the predicted
        labels it was computed from.
    """
    model = StackingClassifier(classifiers=[model1, model2],
                               meta_classifier=metamodel)
    train, testt = scaling(xtr, xts, MaxAbsScaler())
    model.fit(train, ytr)
    # Predict once and reuse the result; the test set used to be run through
    # the whole stack twice.
    predict = model.predict(testt)
    acc = accuracy_score(yts, predict)
    return acc, predict
def create_stacked(dataset, x_train, y_train):
    """Index-encode the labels in place and fit a six-model probability stack.

    Side effects: every entry of ``dataset.y_true`` and of ``y_train`` is
    overwritten with its position in ``dataset.class_labels``, after which
    ``dataset.class_labels`` is replaced by ``range(0, n_labels)``.

    Returns:
        The stacked classifier fitted on ``x_train`` and the encoded ``y_train``.
    """
    def _encode_in_place(labels):
        # Swap each label for its index in the (still original) label list.
        for position, label in enumerate(labels):
            labels[position] = dataset.class_labels.index(label)

    _encode_in_place(dataset.y_true)
    _encode_in_place(y_train)
    dataset.class_labels = range(0, len(dataset.class_labels))

    base_learners = [
        RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=42),
        KNeighborsClassifier(n_neighbors=10),
        GaussianNB(),
        MLPClassifier(activation='relu', max_iter=100000,
                      hidden_layer_sizes=(50, 50, 50, 50, 50)),
        MLPClassifier(activation='relu', max_iter=1000000,
                      hidden_layer_sizes=(500, 500)),
        RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42),
    ]
    stacker = StackingClassifier(classifiers=base_learners,
                                 meta_classifier=LogisticRegression(),
                                 use_probas=True)
    stacker.fit(x_train, y_train)
    return stacker
def kfold_train(mode):
    """Run repeated shuffled K-fold training and average the test predictions.

    For each of 10 repeats (``random_state`` = repeat index) the module-level
    ``x_train`` / ``labels`` are split into 5 folds. A fresh model of the
    requested family is fitted per fold, scored on the held-out part, and its
    ``predict_proba`` on the module-level ``x_test`` is accumulated.

    Args:
        mode: One of ``'cat'``, ``'lgb'``, ``'xgb'`` or ``'stack'``.

    Returns:
        Array of shape ``(x_test.shape[0], 4)``: the mean predicted
        probabilities over all fitted models.

    Raises:
        ValueError: If ``mode`` is not a supported family. Previously the code
            fell through and failed later on an unbound ``model``.
    """
    if mode not in ('cat', 'lgb', 'xgb', 'stack'):
        raise ValueError(
            "mode must be one of 'cat', 'lgb', 'xgb', 'stack'; got %r" % (mode,))

    n_repeats = 10
    n_splits = 5
    acc_list, loss_list = [], []
    # Four output columns, matching the one-hot width that argmax(…, 1) and
    # log_loss below work on — presumably four classes; confirm.
    prediction = np.zeros((x_test.shape[0], 4))
    for i in range(n_repeats):
        print(str(i + 1) + ' th kflod' + '*' * 50)
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=i)
        kfold_list = []
        for k, (train_index, test_index) in enumerate(kf.split(x_train)):
            print(str(k + 1) + 'fold--------------')
            train_x, train_y = x_train[train_index], labels[train_index]
            test_x, test_y = x_train[test_index], labels[test_index]
            # labels are one-hot rows; the models are fitted on class indices.
            train_cls = np.argmax(train_y, 1)
            test_cls = np.argmax(test_y, 1)

            # train
            if mode == 'cat':
                model = catboost_model()
                model.fit(train_x, train_cls,
                          eval_set=(test_x, test_cls),
                          verbose=False)
            elif mode == 'lgb':
                model = lightgbm_model()
                model.fit(train_x, train_cls,
                          eval_set=(test_x, test_cls),
                          verbose=False)
            elif mode == 'xgb':
                model = xgboost_model()
                model.fit(train_x, train_cls, verbose=True)
            else:  # 'stack'
                model = StackingClassifier(
                    classifiers=[
                        xgboost_model(),
                        catboost_model(),
                        lightgbm_model(),
                        adaboost_model(),
                    ],
                    use_probas=True,
                    average_probas=False,
                    meta_classifier=lr)
                model.fit(train_x, train_cls)

            # test
            pred = model.predict_proba(test_x)
            acc = accuracy_score(test_cls, np.argmax(pred, 1))
            loss = log_loss(test_y, pred)
            acc_list.append(acc)
            loss_list.append(loss)
            kfold_list.append(loss)
            print('test acc: %f, test loss: %f' % (acc, loss))

            # predict
            prediction += model.predict_proba(x_test)
        print('this fold mean loss:', np.mean(kfold_list))
    print('*' * 50)
    print('mean acc: %f, mean loss: %f' % (np.mean(acc_list), np.mean(loss_list)))
    # One prediction was accumulated per (repeat, fold) pair. Deriving the
    # divisor keeps the average right if either count changes (was a bare 50.).
    prediction = prediction / float(n_repeats * n_splits)
    return prediction
def test_StackingClassifier_avg_vs_concat():
    """Averaged probabilities give one block of columns, concatenated give two.

    Both stacks use the same logistic regression twice as base learner. With
    ``average_probas=True`` the meta-features are a single 3-column block whose
    rows sum to 1; with ``average_probas=False`` they are two 3-column blocks
    whose rows sum to 2.
    """
    np.random.seed(123)
    lr1 = LogisticRegression(solver='liblinear', multi_class='ovr')

    sclf1 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               average_probas=True,
                               meta_classifier=lr1)
    sclf1.fit(X, y)
    r1 = sclf1.predict_meta_features(X[:2])
    assert r1.shape == (2, 3)
    assert_almost_equal(np.sum(r1[0]), 1.0, decimal=6)
    assert_almost_equal(np.sum(r1[1]), 1.0, decimal=6)

    sclf2 = StackingClassifier(classifiers=[lr1, lr1],
                               use_probas=True,
                               average_probas=False,
                               meta_classifier=lr1)
    sclf2.fit(X, y)
    r2 = sclf2.predict_meta_features(X[:2])
    assert r2.shape == (2, 6)
    assert_almost_equal(np.sum(r2[0]), 2.0, decimal=6)
    assert_almost_equal(np.sum(r2[1]), 2.0, decimal=6)
    # The comparison result used to be discarded, so it could never fail.
    # Identical base learners must yield matching halves.
    assert np.allclose(r2[0][:3], r2[0][3:])
def test_sample_weight():
    """Sample weights must change the fit; all-ones weights must not.

    Expectations:
        random weights   -> probabilities differ from the unweighted fit
        all-ones weights -> probabilities match the unweighted fit
    """
    def _stacked_proba(weights):
        # Re-seed and rebuild everything so the three fits differ only in
        # the weights they receive.
        np.random.seed(123)
        meta_learner = LogisticRegression(solver='liblinear', multi_class='ovr')
        forest = RandomForestClassifier(n_estimators=10)
        bayes = GaussianNB()
        stacker = StackingClassifier(classifiers=[forest, bayes],
                                     meta_classifier=meta_learner)
        return stacker.fit(X, y, sample_weight=weights).predict_proba(X)

    random.seed(87)
    w = np.array([random.random() for _ in range(len(y))])

    weighted = _stacked_proba(w)
    unweighted = _stacked_proba(None)
    maxdiff = np.max(np.abs(weighted - unweighted))
    assert maxdiff > 1e-3, "max diff is %.4f" % maxdiff

    uniform = _stacked_proba(np.ones(len(y)))
    maxdiff = np.max(np.abs(unweighted - uniform))
    assert maxdiff < 1e-3, "max diff is %.4f" % maxdiff
def stacking_clf(train_x, train_y):
    """Fit and return a random-forest / XGBoost / gradient-boosting stack.

    A one-vs-rest logistic regression combines the three base classifiers.

    Changes from the previous version:
      * ``object="multi:softmax"`` was passed to ``XGBClassifier``. ``object``
        is not an XGBoost parameter, so the setting never took effect. The
        stray keyword is removed, which keeps the model that was actually
        being trained.
        NOTE(review): if ``objective="multi:softmax"`` was the intent, set it
        explicitly after confirming it suits the label set.
      * An ``svm.SVC(C=10)`` was constructed but never added to the stack; the
        unused instance is no longer created.

    Args:
        train_x: Training features.
        train_y: Training labels.

    Returns:
        The fitted stacking classifier.
    """
    clf1 = RandomForestClassifier(n_estimators=300, max_features="sqrt",
                                  min_samples_split=20, min_samples_leaf=15,
                                  max_depth=6, bootstrap=True, n_jobs=8)
    clf3 = xgb.XGBClassifier(n_estimators=300, learning_rate=0.1, n_jobs=8,
                             colsample_bylevel=0.8, reg_lambda=1,
                             max_depth=6, min_child_weight=1)
    clf4 = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1,
                                      min_samples_split=20,
                                      min_samples_leaf=15, max_depth=6,
                                      max_features="sqrt")
    clf5 = LogisticRegression(penalty='l2', C=100, multi_class='ovr')
    sclf = StackingClassifier(
        classifiers=[clf1, clf3, clf4],
        meta_classifier=clf5,
    )
    sclf.fit(train_x, train_y)
    return sclf
def stacking_prediction2(m1, m2, meta):
    """Fit a two-model stack on the MaxAbs-scaled module data and predict.

    Reads the module-level ``Xtrain2``, ``Xtest2`` and ``ytrain2``.

    Args:
        m1, m2: Base estimators.
        meta: Meta-classifier that combines them.

    Returns:
        Predictions of the fitted stack for the scaled ``Xtest2``.
    """
    scaled_train, scaled_test = scaling(Xtrain2, Xtest2, MaxAbsScaler())
    stacker = StackingClassifier(classifiers=[m1, m2], meta_classifier=meta)
    stacker.fit(scaled_train, ytrain2)
    return stacker.predict(scaled_test)
def stacking(self):
    """Train a five-model stack, report test metrics and per-model CV accuracy.

    Features come from ``self.Extract_feature.extract_count()``; labels from
    ``self.train_label`` / ``self.test_label``. Prints the classification
    report and score of the stack on the test set, then the 3-fold
    cross-validated accuracy of each base model and of the stack.

    Changes from the previous version:
      * ``SVC`` is now imported locally. The body used ``SVC`` while importing
        only ``SVR``, which was never used.
      * Function-local imports that were never referenced are dropped, so the
        method no longer needs lightgbm or scipy to be importable.
    """
    train_data, test_data = self.Extract_feature.extract_count()

    from sklearn.svm import SVC
    from sklearn.pipeline import make_pipeline
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    import xgboost as xgb
    from mlxtend.classifier import StackingClassifier
    from sklearn import model_selection

    # Historical name: this pipeline wraps an SVC, not a lasso model.
    lasso = make_pipeline(SVC(C=2.1, gamma=0.005))
    rforest = make_pipeline(
        RandomForestClassifier(random_state=0, n_estimators=6))
    Gboost = GradientBoostingClassifier(n_estimators=500, learning_rate=0.01,
                                        max_depth=12, max_features="sqrt",
                                        min_samples_leaf=15,
                                        min_samples_split=97,
                                        random_state=200)
    model_xgb = xgb.XGBClassifier(colsample_bytree=0.4603, gamma=10,
                                  learning_rate=0.01, max_depth=11,
                                  n_estimators=500, reg_alpha=0.01,
                                  reg_lambda=5, subsample=0.5213,
                                  seed=1024, nthread=-1)
    lr = LogisticRegression()

    # lr serves both as a base learner and as the meta-classifier.
    classifiers = [rforest, lasso, Gboost, model_xgb, lr]
    stregr = StackingClassifier(classifiers=classifiers, meta_classifier=lr)
    stregr.fit(train_data, self.train_label)

    prediction = stregr.predict(test_data)
    classification = classification_report(y_true=self.test_label,
                                           y_pred=prediction)
    print("classification:{}".format(classification))
    print("测试集的score:{}".format(stregr.score(test_data, self.test_label)))

    for clf, label in zip(
            [rforest, lasso, Gboost, lr, model_xgb, stregr],
            ['rf', 'svr', 'gboost', 'lr', 'xgb', 'stackingclassifier']):
        scores = model_selection.cross_val_score(clf, train_data,
                                                 self.train_label, cv=3,
                                                 scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
              (scores.mean(), scores.std(), label))
def test_weight_unsupported_no_weight():
    """Fitting without sample weights is fine even though KNN is in the stack."""
    base_learners = [
        RandomForestClassifier(),
        GaussianNB(),
        KNeighborsClassifier(),
    ]
    stacker = StackingClassifier(classifiers=base_learners,
                                 meta_classifier=LogisticRegression())
    # No sample_weight is passed, so this must succeed.
    stacker.fit(X, y)
def data_ensemble(cancer_type, feat):
    """Compare five classifiers and their stack on the selected features.

    Reads ``<cancer_type>_matrix.csv`` and ``<cancer_type>_output.txt`` from
    ``/home/ubuntu/cancer/``. The first line of the feature file is skipped and
    the next ``feat + 1`` lines name the columns to keep. The data is shuffled,
    normalised, scaled and split 50/50. Every base classifier and the stacked
    model is fitted, and accuracy, precision, recall, F1 and specificity for
    each are written to ``<cancer_type>_accuracy.txt``.

    Changes from the previous version:
      * Both files are handled by ``with`` blocks; neither was ever closed.
      * The output file is opened only once results exist, so a failure
        during training no longer leaves a truncated report behind.
      * Dead code is removed: a PCA that was fitted but never applied, and a
        first set of stack metrics that the loop overwrote unread.
      * Feature names are stripped with ``rstrip('\\n')`` instead of
        ``[:-1]``, which ate a real character on a final line without newline.
      * The loop no longer rebinds the label list it iterates over.

    Args:
        cancer_type: Prefix of the input and output file names.
        feat: Number of feature names to read beyond the first one.
    """
    data_dir = "/home/ubuntu/cancer/"
    data_file = data_dir + cancer_type + "_matrix.csv"
    features = data_dir + cancer_type + "_output.txt"
    output_file = data_dir + cancer_type + "_accuracy.txt"

    with open(features, "r") as file:
        file.readline()  # first line is skipped, as before
        feature_names = [file.readline().rstrip("\n") for _ in range(feat + 1)]

    df = pd.read_csv(data_file)
    df = shuffle(df)
    df.pop('file_id')
    y = df.pop('label').values
    # Matrix holding only the selected feature columns, in file order.
    dataf = np.column_stack([df.pop(name) for name in feature_names])
    X = normalize(dataf)
    X = scale(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=42)

    # multiple classifiers
    clf1 = RandomForestClassifier(random_state=1, n_estimators=100)
    clf2 = GradientBoostingClassifier(n_estimators=1200, subsample=0.5,
                                      random_state=3)
    clf3 = SVC(gamma='auto')
    clf4 = KNeighborsClassifier(n_neighbors=1)
    clf5 = DecisionTreeClassifier(random_state=0)
    lr = LogisticRegression(solver='lbfgs')

    # stacking for data ensemble
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5],
                              meta_classifier=lr)

    # The base models are fitted directly as well because each one is scored
    # on its own below.
    clf_list = [clf1, clf2, clf3, clf4, clf5, sclf]
    for clf in clf_list:
        clf.fit(X_train, y_train)

    labels = ['RF', 'GBDT', 'SVM', 'KNN', 'DT', 'Stacking']

    # score calculation
    with open(output_file, "w") as o_file:
        for clf, label in zip(clf_list, labels):
            y_test_predict = clf.predict(X_test)
            # ravel() into four names only works for a binary problem.
            tn, fp, fn, tp = confusion_matrix(y_test, y_test_predict).ravel()
            specificity = tn / (tn+fp)
            recall = tp / (tp+fn)
            precision = tp / (tp+fp)
            accuracy = (tp + tn) / (tp+tn+fp+fn)
            f1 = 2*tp / (2*tp+fp+fn)
            o_file.write("\nAccuracy: %.2f [%s] \nPrecision: %.2f [%s] \nRecall: %.2f [%s] \nF1 score: %.2f [%s] \nSpecificity: %.2f [%s]\n" % (accuracy, label, precision, label, recall, label, f1, label, specificity, label))
def test_weight_unsupported_no_weight():
    """Fitting without sample weights is fine even though KNN is in the stack."""
    np.random.seed(123)
    meta_learner = LogisticRegression(solver='liblinear', multi_class='ovr')
    base_learners = [
        RandomForestClassifier(n_estimators=10),
        GaussianNB(),
        KNeighborsClassifier(),
    ]
    stacker = StackingClassifier(classifiers=base_learners,
                                 meta_classifier=meta_learner)
    # No sample_weight is passed, so this must succeed.
    stacker.fit(X, y)
def test_verbose():
    """Smoke test: fitting a probability stack with verbose=3 runs through."""
    np.random.seed(123)
    meta_learner = LogisticRegression()
    base_learners = [RandomForestClassifier(), GaussianNB()]
    stacker = StackingClassifier(classifiers=base_learners,
                                 use_probas=True,
                                 meta_classifier=meta_learner,
                                 verbose=3)
    stacker.fit(iris.data, iris.target)
def stackingPerformanceEditor():
    """Fit an NB / random-forest / MLP stack and return its test accuracy.

    Trains on the module-level ``Xtrain2`` / ``ytrain2`` and scores the
    predictions for ``Xtest2`` against ``ytest``. The random forest is both a
    base learner and the meta-classifier.
    """
    naive_bayes = GaussianNB()
    forest = RandomForestClassifier(n_estimators=100, max_depth=400,
                                    random_state=5)
    perceptron = MLPClassifier(hidden_layer_sizes=(500, 500))

    stacker = StackingClassifier(
        classifiers=[naive_bayes, forest, perceptron],
        meta_classifier=forest)
    stacker.fit(Xtrain2, ytrain2)
    # NOTE(review): scored against `ytest` while the features are `Xtest2` —
    # confirm the two belong together.
    return accuracy_score(ytest, stacker.predict(Xtest2))
def stacking(self):
    """Fit a five-model stack with an XGBoost meta-classifier and report macro F1.

    Trains on ``self.X_train`` / ``self.y_train``, prints the macro-averaged
    F1 score on ``self.X_test`` / ``self.y_test`` and returns the fitted stack.

    Function-local imports that were never referenced (the preprocessing
    scalers and scipy) have been dropped; model settings are unchanged.

    Returns:
        The fitted stacking classifier.
    """
    from sklearn.svm import SVC
    from sklearn.pipeline import make_pipeline
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from lightgbm import LGBMClassifier
    import xgboost as xgb
    from mlxtend.classifier import StackingClassifier

    svc = make_pipeline(SVC(kernel='rbf', C=2.8, gamma=2))
    rf = RandomForestClassifier(random_state=590, n_estimators=6)
    GBoost = GradientBoostingClassifier(n_estimators=500, learning_rate=0.01,
                                        max_depth=12, max_features='sqrt',
                                        min_samples_leaf=15,
                                        min_samples_split=97,
                                        random_state=200)
    model_xgb = xgb.XGBClassifier(colsample_bytree=0.4603, gamma=10,
                                  learning_rate=0.01, max_depth=11,
                                  min_child_weight=1.7817, n_estimators=500,
                                  reg_alpha=0.01, reg_lambda=5,
                                  subsample=0.5213, silent=1, seed=1024,
                                  nthread=-1)
    # NOTE(review): objective='regression' on a classifier looks unintended —
    # confirm before changing, since it alters the trained model.
    model_lgb = LGBMClassifier(objective='regression', num_leaves=5,
                               learning_rate=0.05, n_estimators=550,
                               max_bin=25, bagging_fraction=1,
                               bagging_freq=5, feature_fraction=0.7,
                               feature_fraction_seed=9, bagging_seed=9,
                               min_data_in_leaf=42,
                               min_sum_hessian_in_leaf=40)

    # model_xgb is both a base learner and the meta-classifier.
    regressors = [rf, svc, GBoost, model_lgb, model_xgb]
    stregr = StackingClassifier(classifiers=regressors,
                                meta_classifier=model_xgb, verbose=1)
    stregr.fit(self.X_train, self.y_train)
    print(
        "the model is stregr and the valid's f1 is: ",
        f1_score(self.y_test, stregr.predict(self.X_test), average="macro"))
    return stregr
def train3():
    """Fit a stack of two column-subset logistic-regression pipelines on iris."""
    dataset = datasets.load_iris()
    features, target = dataset.data, dataset.target

    # One base pipeline per feature subset.
    base_pipelines = [
        make_pipeline(ColumnSelector(cols=column_subset), LogisticRegression())
        for column_subset in ((0, 2), (1, 2, 3))
    ]
    stacker = StackingClassifier(classifiers=base_pipelines,
                                 meta_classifier=LogisticRegression())
    stacker.fit(features, target)
def test_train_meta_features_():
    """Stored training meta-features have one column per base classifier."""
    neighbours = KNeighborsClassifier()
    meta_learner = LogisticRegression()
    bayes = GaussianNB()
    stacker = StackingClassifier(classifiers=[neighbours, bayes],
                                 meta_classifier=meta_learner,
                                 store_train_meta_features=True)

    # Only the training part of the split is needed here.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3)
    stacker.fit(X_train, y_train)

    stored = stacker.train_meta_features_
    assert stored.shape == (X_train.shape[0], 2)
def test_verbose():
    """Smoke test: fitting a probability stack with verbose=3 runs through."""
    np.random.seed(123)
    meta_learner = LogisticRegression(solver='liblinear', multi_class='ovr')
    base_learners = [RandomForestClassifier(n_estimators=10), GaussianNB()]
    stacker = StackingClassifier(classifiers=base_learners,
                                 use_probas=True,
                                 meta_classifier=meta_learner,
                                 verbose=3)
    features, target = iris_data()
    stacker.fit(features, target)
def test_weight_unsupported():
    """A weighted fit must fail when a base classifier cannot take weights.

    The stack contains a KNN, which does not support sample weights. The
    keyword used to be misspelled ``sample_seight``, so the weights never
    reached the base classifiers and the unsupported-weight path was never
    exercised. It is now spelled ``sample_weight``.

    The resulting error is allowed to propagate out of this function.
    NOTE(review): presumably an expected-exception decorator or harness
    outside this block catches it — confirm.
    """
    # Error since KNN does not support sample_weight
    meta = LogisticRegression()
    clf1 = RandomForestClassifier()
    clf2 = GaussianNB()
    clf3 = KNeighborsClassifier()
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=meta)
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y))])
    sclf.fit(X, y, sample_weight=w)
def classifer_stacking(data_file, alertgroup_name, classifier_list):
    """Pick the best stacked model for one alert group by 3-fold CV.

    Reads ``data_file``, selects the rows of ``alertgroup_name``, and splits
    them with ``get_data(..., split=True)``. One stack is then trained per
    stratified fold of the training part. The fold model with the highest
    F-beta (beta=1) on its validation fold is kept, evaluated on the held-out
    test part and returned.

    Changes from the previous version:
      * ``StratifiedKFold`` is built with ``n_splits`` and split on
        ``(X, y)``. The old ``n_folds`` keyword does not belong to the class
        that offers ``.split()``, and a stratified split needs the labels.
        ``random_state`` is dropped because it has no effect with
        ``shuffle=False``.
      * The fold loop no longer overwrites ``test_x`` / ``test_y``, so the
        final metrics use the held-out data rather than the last fold.
      * The first fold model is always kept as a fallback; ``best_model`` used
        to stay ``None`` when every fold scored 0.
      * ``beta`` is passed to ``fbeta_score`` by keyword.

    Args:
        data_file: CSV path; must contain an ``alertgroup`` column.
        alertgroup_name: Alert group to model.
        classifier_list: Keys among ``'KNN'``, ``'RF'``, ``'DT'`` and
            ``'GBDT'`` selecting the base classifiers.

    Returns:
        The selected fitted stack, or ``None`` when no group matches
        ``alertgroup_name``.
    """
    classifiers = {
        'KNN': KNeighborsClassifier(),
        'RF': RandomForestClassifier(),
        'DT': tree.DecisionTreeClassifier(),
        'GBDT': GradientBoostingClassifier(),
    }
    all_data = pd.read_csv(data_file, sep=',', dtype=str)
    for alertgroup, group in all_data.groupby('alertgroup'):
        if alertgroup != alertgroup_name:
            continue

        train_x, test_x, train_y, test_y = get_data(group, split=True)
        arr_x = train_x.values
        arr_y = train_y.values
        max_fs = 0
        best_model = None
        stratified_folder = StratifiedKFold(n_splits=3, shuffle=False)
        for train_index, valid_index in stratified_folder.split(arr_x, arr_y):
            fold_train_x = arr_x[train_index]
            fold_train_y = arr_y[train_index]
            fold_valid_x = arr_x[valid_index]
            fold_valid_y = arr_y[valid_index]
            classifiers_list = [classifiers[cl] for cl in classifier_list]
            stack_model = StackingClassifier(classifiers=classifiers_list,
                                             use_probas=True,
                                             average_probas=True,
                                             meta_classifier=classifiers['RF'])
            stack_model.fit(fold_train_x, fold_train_y)
            predict = stack_model.predict(fold_valid_x)
            fbetascore = fbeta_score(fold_valid_y, predict, beta=1)
            # The printed label says "f2" but beta is 1; text kept as emitted.
            print(' f2score:' + str(fbetascore))
            if best_model is None or fbetascore > max_fs:
                max_fs = fbetascore
                best_model = stack_model

        stack_model = best_model
        # Final evaluation on the hold-out part returned by get_data().
        predict = stack_model.predict(test_x)
        precision = metrics.precision_score(test_y, predict)
        recall = metrics.recall_score(test_y, predict)
        fbetascore = fbeta_score(test_y, predict, beta=0.5)
        accuracy = metrics.accuracy_score(test_y, predict)
        print('final performance:')
        print(alertgroup_name)
        print('precision: %.6f' % (100 * precision))
        print('recall: %.6f' % (100 * recall))
        print('f0.5score: %.6f' % (100 * fbetascore))
        print('accuracy: %.6f%%' % (100 * accuracy))
        return best_model
    return None
def test_train_meta_features_():
    """Stored training meta-features have one column per base classifier."""
    np.random.seed(123)
    neighbours = KNeighborsClassifier()
    meta_learner = LogisticRegression(solver='liblinear', multi_class='ovr')
    bayes = GaussianNB()
    stacker = StackingClassifier(classifiers=[neighbours, bayes],
                                 meta_classifier=meta_learner,
                                 store_train_meta_features=True)

    # Only the training part of the split is needed here.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3)
    stacker.fit(X_train, y_train)

    stored = stacker.train_meta_features_
    assert stored.shape == (X_train.shape[0], 2)
def test_predict_meta_features():
    """With default settings, predict() returns one value per test row."""
    neighbours = KNeighborsClassifier()
    meta_learner = LogisticRegression()
    bayes = GaussianNB()
    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.3)

    # test default (class labels)
    stacker = StackingClassifier(classifiers=[neighbours, bayes],
                                 meta_classifier=meta_learner,
                                 store_train_meta_features=True)
    stacker.fit(X_train, y_train)

    predicted = stacker.predict(X_test)
    expected_shape = (X_test.shape[0],)
    assert predicted.shape == expected_shape
def stack_models(self, x1, y1, x2, y2, meta):
    """Stack every fine-tuned model, fit on (x1, y1) and persist the result.

    All entries of ``self.fine_models`` become base classifiers; the entry
    named ``meta`` is also used as the meta-classifier. The fitted stack is
    dumped with joblib to ``stacking_models.m`` in ``self.cfg.model_dir``.

    Args:
        x1: Training features.
        y1: Training labels; its ``.values`` is passed to ``fit``.
        x2, y2: Accepted but not used by this method.
        meta: Key in ``self.fine_models`` selecting the meta-classifier.

    Returns:
        The fitted stacking classifier.
    """
    base_models = [entry['model'] for entry in self.fine_models.values()]
    final_model = self.fine_models[meta]['model']
    stacker = StackingClassifier(classifiers=base_models,
                                 meta_classifier=final_model,
                                 use_probas=True,
                                 average_probas=True,
                                 use_features_in_secondary=True)
    stacker.fit(x1, y1.values)

    payload = {
        'model': stacker,
        'name': 'stacking_models',
        'meta_classifier': meta,
    }
    target_path = os.path.join(self.cfg.model_dir, 'stacking_models.m')
    joblib.dump(payload, target_path, compress=5)
    return stacker
def test_predict_meta_features():
    """With default settings, predict() returns one value per test row."""
    base_learners = [KNeighborsClassifier(), GaussianNB()]
    meta_learner = LogisticRegression()
    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.3)

    # test default (class labels)
    stacker = StackingClassifier(classifiers=base_learners,
                                 meta_classifier=meta_learner,
                                 store_train_meta_features=True)
    stacker.fit(X_train, y_train)

    n_test_rows = X_test.shape[0]
    assert stacker.predict(X_test).shape == (n_test_rows,)
def stack_model(X_train, Y_train, X_test, expert_model, n_estimator):
    """Fit a decision-tree + MLP stack and predict the test set.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        X_test: Features to predict.
        expert_model: ``"DT"`` or ``"MLP"``; selects the final estimator.
        n_estimator: Accepted for interface compatibility; not used.

    Returns:
        Predictions of the fitted stack for ``X_test``.

    Raises:
        ValueError: If ``expert_model`` is neither ``"DT"`` nor ``"MLP"``.
            Previously this case left ``model`` unbound and failed with an
            unrelated error at ``model.fit``.
    """
    # Validate before building anything so a bad argument fails immediately.
    if expert_model not in ("DT", "MLP"):
        raise ValueError(
            'expert_model must be "DT" or "MLP", got %r' % (expert_model,))

    estimators = [('DT', DecisionTreeClassifier()), ('MLP', MLPClassifier())]
    if expert_model == "DT":
        final_estimator = DecisionTreeClassifier()
    else:
        final_estimator = MLPClassifier()

    model = StackingClassifier(estimators=estimators,
                               final_estimator=final_estimator)
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    return Y_pred
def test_use_features_in_secondary_sparse_input_predict_proba():
    """Sparse input works when the original features feed the meta-classifier."""
    np.random.seed(123)
    meta_learner = LogisticRegression(solver='liblinear', multi_class='ovr')
    forest = RandomForestClassifier(n_estimators=10)
    stacker = StackingClassifier(classifiers=[forest],
                                 use_features_in_secondary=True,
                                 meta_classifier=meta_learner)
    stacker.fit(sparse.csr_matrix(X), y)

    # Probability of the first class for the first three samples.
    rows = [0, 1, 2]
    sparse_rows = sparse.csr_matrix(X[rows])
    first_class_proba = stacker.predict_proba(sparse_rows)[:, 0]
    expected = np.array([0.910, 0.829, 0.882])
    np.testing.assert_almost_equal(first_class_proba, expected, 3)
def test_weight_unsupported():
    """A weighted fit must raise TypeError when a base classifier rejects weights.

    The stack contains a KNN, which does not support sample weights. The
    keyword used to be misspelled ``sample_seight``, so the expected
    ``TypeError`` came from the unknown keyword and the test passed without
    ever reaching the base classifiers. It is now spelled ``sample_weight``.
    """
    # Error since KNN does not support sample_weight
    np.random.seed(123)
    meta = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf1 = RandomForestClassifier(n_estimators=10)
    clf2 = GaussianNB()
    clf3 = KNeighborsClassifier()
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=meta)
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y))])

    with pytest.raises(TypeError):
        sclf.fit(X, y, sample_weight=w)
def test_use_features_in_secondary_predict_proba():
    """Check predict_proba when the original features feed the meta-classifier."""
    np.random.seed(123)
    meta_learner = LogisticRegression()
    base_learners = [RandomForestClassifier(), GaussianNB()]
    stacker = StackingClassifier(classifiers=base_learners,
                                 use_features_in_secondary=True,
                                 meta_classifier=meta_learner)
    stacker.fit(X, y)

    # Probability of the first class for the first three samples.
    rows = [0, 1, 2]
    first_class_proba = stacker.predict_proba(X[rows])[:, 0]
    expected = np.array([0.911, 0.829, 0.885])
    np.testing.assert_almost_equal(first_class_proba, expected, 3)
def model_test(self, model, best_params):
    """Fit a KNN / RFC / GNB stack with the given meta-model and return test AUC.

    The meta-classifier comes from ``self.model_init(model)``; ``best_params``
    is applied to the stack through ``set_params``. After fitting on
    ``self.train`` / ``self.train_label`` the method prints the 30 columns
    that rank highest by coefficient (``'LR'``) or by importance (``'RFC'``,
    ``'XGB'``), then scores ``predict_proba`` on the test set with ROC AUC.

    Changes from the previous version:
      * The ``'LR'`` branch referenced an undefined name ``clf``; it now uses
        ``sclf`` like the rest of the method.
      * The identical ``'RFC'`` and ``'XGB'`` branches are merged.
      * ``print`` uses single-argument call syntax, which gives the same
        output under Python 2 and also parses under Python 3.

    NOTE(review): the branches read ``coef_`` / ``feature_importances_`` from
    the stack itself — confirm the stacking class exposes them.

    Args:
        model: Name of the meta-model; matched case-insensitively.
        best_params: Keyword parameters forwarded to ``sclf.set_params``.

    Returns:
        ROC AUC of the positive-class probabilities on the test set.
    """
    print('Model Test')
    print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    lr = self.model_init(model)
    clf1 = self.model_init('KNN')
    clf2 = self.model_init('RFC')
    clf3 = self.model_init('GNB')
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=lr)
    sclf.set_params(**best_params)

    train_data = self.train.values.copy()
    train_label = self.train_label['label'].values.copy()
    sclf.fit(train_data, train_label)

    model_key = model.upper()
    if model_key == 'LR':
        coef = sclf.coef_.reshape(sclf.coef_.shape[1])
        ind = coef.argsort()
        # argsort is ascending, so the last 30 indices are the largest.
        att = self.train.columns[ind[-30:]].tolist()
        print(att)
    elif model_key in ('RFC', 'XGB'):
        imp = sclf.feature_importances_
        print(imp)
        ind = imp.argsort()
        att = self.train.columns[ind[-30:]].tolist()
        print(att)

    test_data = self.test.values.copy()
    test_label = self.test_label['label'].values.copy()
    test_label = test_label.reshape(test_label.shape[0])
    res_proba = sclf.predict_proba(test_data)
    # Column 1 holds the probability of the second class.
    res_auc = roc_auc_score(test_label, res_proba[:, 1])

    print('Model: {0} ; Test: {1}'.format(model, res_auc))
    print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    return res_auc
def test_use_features_in_secondary_sparse_input_predict_proba():
    """Sparse input works when the original features feed the meta-classifier."""
    np.random.seed(123)
    stacker = StackingClassifier(
        classifiers=[RandomForestClassifier(n_estimators=10)],
        use_features_in_secondary=True,
        meta_classifier=LogisticRegression(solver='liblinear',
                                           multi_class='ovr'))
    stacker.fit(sparse.csr_matrix(X), y)

    # Probability of the first class for the first three samples.
    sample_rows = [0, 1, 2]
    proba = stacker.predict_proba(sparse.csr_matrix(X[sample_rows]))
    expected = np.array([0.910, 0.829, 0.882])
    np.testing.assert_almost_equal(proba[:, 0], expected, 3)
def test_use_features_in_secondary_predict_proba():
    """Check predict_proba when the original features feed the meta-classifier."""
    np.random.seed(123)
    features, target = iris_data()
    meta_learner = LogisticRegression(solver='liblinear',
                                      multi_class='ovr',
                                      random_state=1)
    base_learners = [
        RandomForestClassifier(n_estimators=10, random_state=1),
        GaussianNB(),
    ]
    stacker = StackingClassifier(classifiers=base_learners,
                                 use_features_in_secondary=True,
                                 meta_classifier=meta_learner)
    stacker.fit(features, target)

    # Probability of the first class for the first three samples.
    rows = [0, 1, 2]
    first_class_proba = stacker.predict_proba(features[rows])[:, 0]
    expected = np.array([0.916, 0.828, 0.889])
    np.testing.assert_almost_equal(first_class_proba, expected, 3)
def test_multivariate_class():
    """Training accuracy on the y2 targets matches the recorded value."""
    np.random.seed(123)
    meta_learner = KNeighborsClassifier()
    base_learners = [RandomForestClassifier(), KNeighborsClassifier()]
    stacker = StackingClassifier(classifiers=base_learners,
                                 meta_classifier=meta_learner)
    stacker.fit(X, y2)
    predicted = stacker.predict(X)

    expected_accuracy = .973
    observed_accuracy = round((predicted == y2).mean(), 3)
    assert observed_accuracy == expected_accuracy
def model_processing(X_train, X_test, y_train, y_test):
    """Fit a four-model stack with an XGBoost meta-classifier and print metrics.

    Accuracy, precision, F1, recall and ROC AUC of the hard predictions are
    printed first for the training set and then for the test set. Nothing is
    returned.
    """
    base_learners = [
        LogisticRegression(C=0.01, penalty='l2'),
        SVC(C=0.7, kernel='linear'),
        DecisionTreeClassifier(criterion='entropy', max_depth=3,
                               min_samples_leaf=5),
        RandomForestClassifier(n_estimators=70, criterion='entropy',
                               max_features='auto', min_samples_leaf=6),
    ]
    booster = XGBClassifier(gamma=0.3, max_depth=4, min_child_weight=8,
                            reg_alpha=0.05)
    sclf = StackingClassifier(classifiers=base_learners,
                              meta_classifier=booster)
    sclf.fit(X_train, y_train)

    # Both prediction sets are produced before anything is printed.
    y_pred_train = sclf.predict(X_train)
    y_pred = sclf.predict(X_test)

    line_templates = (
        '{} 精确度 (accuracy):{:.2f}',
        '{} 准确度(precision):{:.2f}',
        '{} F1 Score :{:.2f}',
        '{} 召回率(recall Score):{:.2f}',
        '{} auc Score:{:.2f}',
    )
    # (banner, model name, true labels, predicted labels), training set first.
    reports = (
        ('在训练集上的得分', '堆叠模型-训练集', y_train, y_pred_train),
        ('在测试集上的得分', '堆叠模型', y_test, y_pred),
    )
    for banner, model_name, truth, guess in reports:
        print('*' * 30, banner)
        # All five metrics are computed before the first one is printed.
        values = (
            accuracy_score(truth, guess),
            precision_score(truth, guess),
            f1_score(truth, guess),
            recall_score(truth, guess),
            roc_auc_score(truth, guess),
        )
        for template, value in zip(line_templates, values):
            print(template.format(model_name, value))
def test_StackingClassifier_drop_last_proba():
    """drop_last_proba removes one probability column per base classifier."""
    np.random.seed(123)
    lr1 = LogisticRegression(solver='liblinear', multi_class='ovr')

    def _make_stack(drop_last):
        # Two copies of the same learner; only drop_last_proba varies.
        return StackingClassifier(classifiers=[lr1, lr1],
                                  use_probas=True,
                                  drop_last_proba=drop_last,
                                  meta_classifier=lr1)

    # All probability columns kept: 2 learners x 3 columns.
    keep_all = _make_stack(False)
    keep_all.fit(X, y)
    kept = keep_all.predict_meta_features(X[:2])
    assert kept.shape == (2, 6)

    # Last column dropped per learner: 2 x 2.
    drop_last = _make_stack(True)
    drop_last.fit(X, y)
    dropped = drop_last.predict_meta_features(X[:2])
    assert dropped.shape == (2, 4), dropped.shape

    # Fitted on the first 100 samples only.
    two_class = _make_stack(True)
    two_class.fit(X[0:100], y[0:100])  # only 2 classes
    dropped_binary = two_class.predict_meta_features(X[:2])
    assert dropped_binary.shape == (2, 2), dropped_binary.shape
def predictor_ev():
    # Builds five Keras networks, five XGBoost models and one logistic
    # regression, stacks them under a logistic-regression meta-classifier,
    # fits the stack on the module-level X_train_ev / y_train_ev, prints the
    # log loss on X_cv_ev / y_cv_ev and returns predict_proba for X_test_ev.
    #
    # This is Python 2 code (print statements, xrange) written against a
    # legacy Keras API (output_dim, W_regularizer, nb_epoch).
    print "Building Neural Net classifiers for devices with events"
    n_input = X_train_ev.shape[1]
    n_train = X_train_ev.shape[0]  # not read again inside this function

    from keras.models import Sequential
    from keras.layers import Dense, Activation
    from keras.layers.core import Dropout
    from keras.layers.advanced_activations import PReLU
    from keras.regularizers import l2
    from keras.optimizers import Adadelta
    from keras.optimizers import SGD
    from keras.wrappers.scikit_learn import KerasClassifier
    from keras.callbacks import ModelCheckpoint

    def create_model(n_hidden_layers=1, nodes=[50], reg=1.0, dropouts=[.5], acts=['relu']):
        # Builds n_hidden_layers blocks of Dense -> Activation -> Dropout,
        # followed by a 12-unit softmax layer. nodes, dropouts and acts are
        # indexed per hidden layer. The list defaults are only read here.
        n_in = n_input
        model = Sequential()
        for i in xrange(n_hidden_layers):
            n_out = nodes[i]
            dropout = dropouts[i]
            act = acts[i]
            model.add(Dense(output_dim=n_out, input_dim=n_in, W_regularizer=l2(reg)))
            model.add(Activation(act))
            model.add(Dropout(dropout))
            n_in = n_out
        model.add(Dense(output_dim=12, W_regularizer=l2(reg)))
        model.add(Activation("softmax"))
        # Compile model
        adadelta = Adadelta(lr=1.0, rho=0.95, epsilon=1e-08)
        # sgd is constructed but not used; compile() receives adadelta.
        sgd = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True)
        model.compile(loss='categorical_crossentropy', optimizer=adadelta, metrics=['accuracy'])
        return model

    class KerasClassifier2(KerasClassifier):
        # KerasClassifier variant that fixes the class count at 12 and
        # validates every fit against the module-level X_cv_ev / y_cv_ev.
        def __init__(self, build_fn, fn_args, random_state=0, nb_epoch=10, batch_size=500, verbose=2):
            # random_state is stored but not read elsewhere in this function.
            self.random_state = random_state
            self.nb_epoch = nb_epoch
            self.batch_size = batch_size
            self.verbose = verbose
            super(KerasClassifier2, self).__init__(build_fn, **fn_args)
            self.classes_= np.arange(12)
            self.n_classes_ = 12
            self.model = build_fn(**fn_args)

        def fit(self, X, y, sample_weight=None):
            # Labels pass through the external indicator() helper before
            # fitting — presumably a one-hot encoder; confirm.
            return super(KerasClassifier2, self).fit(X, indicator(y), verbose = self.verbose, sample_weight=sample_weight, validation_data=(X_cv_ev, indicator(y_cv_ev)), nb_epoch=self.nb_epoch, batch_size=self.batch_size)

        def predict_proba(self, X):
            return super(KerasClassifier2, self).predict_proba(X, batch_size=500, verbose=0)

        def predict(self, X):
            # NOTE(review): returns the output of predict_proba, not class
            # labels — confirm this is intended.
            return super(KerasClassifier2, self).predict_proba(X, batch_size=500, verbose=0)

    # One argument set per network; the list lengths match n_hidden_layers.
    nn1_args = {'n_hidden_layers': 2, 'nodes': [600, 400], 'reg': 1.8, 'dropouts': [.3, .4], 'acts': ['relu', 'relu']}
    nn2_args = {'n_hidden_layers': 3, 'nodes': [300, 100, 50], 'reg': 2.0, 'dropouts': [.2, .4, .5], 'acts': ['relu', 'relu', 'relu']}
    nn3_args = {'n_hidden_layers': 4, 'nodes': [1001, 511, 245, 99], 'reg': 2.0, 'dropouts': [.2, .3, .2, .3], 'acts': ['relu', 'relu', 'relu', 'relu']}
    nn4_args = {'n_hidden_layers': 1, 'nodes': [500], 'reg': 1.2, 'dropouts': [.25], 'acts': ['relu']}
    nn5_args = {'n_hidden_layers': 5, 'nodes': [1343, 1012, 757, 539, 117], 'reg': 2.5, 'dropouts': [.2, .3, .4, .4, .4], 'acts': ['relu', 'relu', 'relu', 'relu', 'relu']}

    clfNN1 = KerasClassifier2(create_model, nn1_args, random_state=5, nb_epoch=5)
    clfNN2 = KerasClassifier2(create_model, nn2_args, random_state=23, nb_epoch=11)
    clfNN3 = KerasClassifier2(create_model, nn3_args, random_state=710, nb_epoch=6)
    clfNN4 = KerasClassifier2(create_model, nn4_args, random_state=5072, nb_epoch=6)
    clfNN5 = KerasClassifier2(create_model, nn5_args, random_state=2016, nb_epoch=12)

    print "Building XGBoost classifiers for devices with events"
    # Only max_depth, eta, alpha and booster are read from this dict below.
    xgb_params = {
        "objective": "multi:softprob",
        "num_class": 12,
        "booster": "gblinear",
        "max_depth": 6,
        "eval_metric": "mlogloss",
        "eta": 0.07,
        "silent": 1,
        "alpha": 3.5,
    }

    class XGBClassifier2(xgb.XGBClassifier):
        # XGBClassifier variant that converts X to CSC and evaluates mlogloss
        # on the module-level X_cv_ev / y_cv_ev during every fit.
        def __init__(self, max_depth=xgb_params['max_depth'], objective='multi:softprob', missing=None, learning_rate=xgb_params['eta'], n_estimators=40, subsample=1, reg_alpha=xgb_params['alpha'], seed=2016, booster='gblinear'):
            super(XGBClassifier2, self).__init__(max_depth=max_depth, seed=seed, objective=objective, missing=missing, learning_rate=learning_rate, n_estimators=n_estimators, subsample=subsample, reg_alpha=reg_alpha)
            # NOTE(review): the booster argument is not forwarded to the
            # parent, and the attribute always comes from xgb_params
            # ('gblinear'), so 'gbtree' / 'dart' below are ignored — confirm.
            self.booster = xgb_params['booster']

        def fit(self, X, y):
            # No return statement: this fit() returns None, not the estimator.
            super(XGBClassifier2, self).fit(X.tocsc(), y, eval_metric='mlogloss', eval_set=[(X_cv_ev.tocsc(), y_cv_ev)])

    gbm1 = XGBClassifier2(seed=0, booster='gblinear', n_estimators=28)
    gbm2 = XGBClassifier2(seed=6, booster='gblinear', n_estimators=28)
    gbm3 = XGBClassifier2(seed=151, booster='gbtree', n_estimators=28)
    gbm4 = XGBClassifier2(seed=1047, booster='gbtree', n_estimators=28)
    gbm5 = XGBClassifier2(seed=22, booster='dart', n_estimators=28)

    print "Building Logistic Regression classifier for devices with events"
    clfLR = LogisticRegression(C=.02, random_state=2016, multi_class='multinomial', solver='newton-cg')

    #Combine results of classifiers
    print "Stacking classifiers for devices with events"
    clf_ls = [gbm1,gbm2,gbm3,gbm4,gbm5,clfNN1,clfNN2,clfNN3,clfNN4,clfNN5,clfLR]
    meta = LogisticRegression()
    # The meta-classifier is trained on the base models' class probabilities.
    stack = StackingClassifier(clf_ls, meta, use_probas=True, verbose=1)
    stack.fit(X_train_ev, y_train_ev)
    print log_loss(y_cv_ev, stack.predict_proba(X_cv_ev))
    y_pred_ev = stack.predict_proba(X_test_ev)
    #y_pre = (pred_prob_nn+y_pre)/2.0
    return y_pred_ev
#clfKNN = KNeighborsClassifier(n_neighbors=5) #clfKNN.fit(X_train_noev, y_train_noev) #print log_loss(y_cv_noev, clfKNN.predict_proba(X_cv_noev)) # ##NB #clfNB = MultinomialNB(alpha=1.0) #clfNB.fit(X_train_noev, y_train_noev) #print log_loss(y_cv_noev, clfNB.predict_proba(X_cv_noev)) #Combine results of classifiers print "Stacking classifiers for devices with no events" clf_ls = [gbm1,gbm2,gbm3,gbm4,gbm5,clfNN1,clfNN2,clfNN3,clfNN4,clfNN5,clfLR] meta = LogisticRegression() stack = StackingClassifier(clf_ls, meta, use_probas=True, verbose=1) stack.fit(X_train_noev, y_train_noev) print log_loss(y_cv_noev, stack.predict_proba(X_cv_noev)) y_pred_noev = stack.predict_proba(X_test_noev) #y_pre = (pred_prob_nn+y_pre)/2.0 # return y_pred_noev y_pred_ev = predictor_ev() #y_pred_noev = predictor_noev() # Write results result = pd.DataFrame(np.hstack(y_pred_ev, y_pred_noev), columns=le.classes_) result["device_id"] = test_dev result = result.set_index("device_id") result.to_csv('stacking_1.gz', index=True, index_label='device_id', compression="gzip")