def __init__(self, num_features, **kwargs):
    """Build a two-layer stacked ensemble topped by a meta-learner.

    :param num_features: number of input features for the first layer.
    :param kwargs: overrides for ``constants.STACKED_ENSEMBLE_PARAMS``;
        ``folds`` and ``meta_layer`` are consumed here, the rest is
        forwarded to every base model.
    """
    super(StackedEnsembleClassifier, self).__init__()
    # Defaults first, caller overrides win.
    kwargs = {**constants.STACKED_ENSEMBLE_PARAMS, **kwargs}
    self.num_features = num_features
    self.num_folds = kwargs.pop('folds', 2)
    self.meta_layer = kwargs.pop('meta_layer')

    def init_estimators(num_features):
        # One (name, kernel) pair per configured base classifier.
        return [
            (clf, utils.init_model(clf, num_features=num_features, **kwargs).kernel)
            for clf in constants.CLASSIFIERS_FOR_ENSEMBLE
        ]

    self.kernel = SuperLearner(verbose=2, n_jobs=1, folds=self.num_folds)
    layer_one = init_estimators(self.num_features)
    self.kernel.add(layer_one, proba=True)
    # Layer 2 consumes the per-fold probability outputs produced by layer 1.
    layer_two = init_estimators(len(layer_one) * self.num_folds)
    self.kernel.add(layer_two, proba=True)
    self.kernel.add_meta(
        utils.init_model(
            self.meta_layer, len(layer_two) * self.num_folds, **kwargs
        ).kernel,
        proba=True,
    )
def __init__(self, num_features, **kwargs):
    """Assemble the gated ensemble: base classifiers plus a meta layer.

    :param num_features: number of input features for the base layer.
    :param kwargs: overrides for ``constants.GATED_ENSEMBLE_PARAMS``;
        ``folds`` and ``meta_layer`` are consumed here, the rest is
        forwarded to every base model.
    """
    super(GatedEnsembleClassifier, self).__init__()
    # Merge defaults with caller overrides (caller wins).
    kwargs = {**constants.GATED_ENSEMBLE_PARAMS, **kwargs}
    self.num_features = num_features
    self.num_folds = kwargs.pop('folds', 2)
    self.meta_layer = kwargs.pop('meta_layer')

    base_learners = [
        (name, utils.init_model(name, num_features=self.num_features, **kwargs).kernel)
        for name in constants.CLASSIFIERS_FOR_ENSEMBLE
    ]

    self.kernel = SuperLearner(verbose=2, n_jobs=1, folds=self.num_folds)
    # Emit class probabilities (not hard labels) to feed the meta layer.
    self.kernel.add(base_learners, proba=True)
    self.kernel.add_meta(
        utils.init_model(
            self.meta_layer, len(base_learners) * self.num_folds, **kwargs
        ).kernel,
        proba=True,
    )
class GatedEnsembleClassifier(_MLensAdapter):
    """Gated ensemble: base classifiers whose probability outputs are joined
    by a trainable meta-learner that decides the final prediction.

    The *gating* functionality is implemented with
    :class:`mlens.ensemble.SuperLearner`.

    Tunable parameters (with defaults):

    - **meta_layer**: name of the classifier used as the *meta layer*
      (default: `single_layer_perceptron`)
    - **folds**: number of cross-validation folds used to generate the
      training set for the **meta_layer** (default: `2`). For a better
      explanation of this parameter, see: *Polley, Eric C. and
      van der Laan, Mark J., "Super Learner In Prediction" (May 2010).
      U.C. Berkeley Division of Biostatistics Working Paper Series.
      Working Paper 266*
      `<https://biostats.bepress.com/ucbbiostat/paper266/>`_
    """

    def __init__(self, num_features, **kwargs):
        super(GatedEnsembleClassifier, self).__init__()
        # Defaults first, caller overrides win.
        kwargs = {**constants.GATED_ENSEMBLE_PARAMS, **kwargs}
        self.num_features = num_features
        self.num_folds = kwargs.pop('folds', 2)
        self.meta_layer = kwargs.pop('meta_layer')

        base_learners = [
            (name, utils.init_model(name, num_features=self.num_features, **kwargs).kernel)
            for name in constants.CLASSIFIERS_FOR_ENSEMBLE
        ]

        self.kernel = SuperLearner(verbose=2, n_jobs=1, folds=self.num_folds)
        # Probabilities, not hard labels, feed the meta layer.
        self.kernel.add(base_learners, proba=True)
        self.kernel.add_meta(
            utils.init_model(
                self.meta_layer,
                len(base_learners) * self.num_folds,
                **kwargs,
            ).kernel,
            proba=True,
        )

    def __repr__(self):
        # Trailing space kept: matches the historical output format exactly.
        return f'{self.__class__.__name__}(num_folds={self.num_folds}, meta_layer={self.meta_layer}) '
def test_equivalence_super_learner():
    """[SequentialEnsemble] Test ensemble equivalence with SuperLearner."""
    reference = SuperLearner()
    sequential = SequentialEnsemble()
    # Same layer spec on both: a stacked ECM layer with float64 outputs.
    reference.add(ECM, dtype=np.float64)
    sequential.add('stack', ECM, dtype=np.float64)
    expected = reference.fit(X, y).predict(X)
    actual = sequential.fit(X, y).predict(X)
    np.testing.assert_array_equal(actual, expected)
def simple_statistic(comb):
    """Evaluate one statistic combination across CV folds with a SuperLearner.

    For every fold of the module-level ``sfolder`` split: oversample the
    inner-training partition with SMOTE, select the columns matching the
    statistics in ``comb``, fit a 9-model probability stack with a logistic
    meta layer, and collect AUC/AUPRC/accuracy/precision/recall/F1/ROC data.

    :param comb: iterable of statistic-name prefixes (e.g. 'mean', 'std')
        used to pick feature columns by their name prefix.
    :return: list of per-fold metric rows.
    """
    resres=[]
    for train, test in tqdm(list(sfolder.split(data_x,data_y))):
        # break
        # Clinical/demographic columns always kept, regardless of `comb`.
        cofff=['age_interval','admission_type_EMERGENCY','admission_type_ELECTIVE','admission_type_URGENT','aids','hem','mets']
        # stats_list=['min','max','minmax','mean','std','stdmean','median','qua25','qua75','qua2575','mode','skew','kurt','first']
        X_train, X_test = data_x.iloc[train,:], data_x.iloc[test,:]
        Y_train, Y_test = data_y[train], data_y[test]
        # Inner split: 75% for oversampling, 25% kept as-is for validation.
        x_train,x_val,y_train,y_val=train_test_split(X_train,Y_train,test_size=0.25,random_state=42)
        smo=SMOTE(random_state=42,ratio={1:2000})
        # Resample the GA training split to obtain the new GA training set x_train_s.
        x_train_s,y_train_s=smo.fit_sample(x_train,y_train)
        x_train_s=pd.DataFrame(x_train_s,columns=x_val.columns)
        # Recombine the oversampled part with the untouched validation part.
        X_train_s=pd.concat([x_train_s,x_val],axis=0)
        Y_train_s=list(y_train_s)
        Y_train_s.extend(list(y_val))
        Y_train_s=np.array(Y_train_s)
        best_combination_nowfold=comb
        # Keep every column whose name prefix (before '_') matches a selected statistic.
        for sts in best_combination_nowfold:
            for column in x_train.columns:
                if(sts == column.split('_')[0]):
                    cofff.append(column)
        x_train_train=X_train_s[cofff]
        y_train_train=Y_train_s
        x_test=X_test[cofff]
        y_test=Y_test
        # 10-fold super learner scored by ROC-AUC; base models run in parallel processes.
        ensemble = SuperLearner(scorer=roc_auc_score,random_state=42,folds=10,backend="multiprocessing")
        ensemble.add([GaussianNB(),SVC(C=100, probability=True),
                      neighbors.KNeighborsClassifier(n_neighbors=3),
                      LogisticRegression(), MLPClassifier(),
                      GradientBoostingClassifier(n_estimators=100),
                      RandomForestClassifier(random_state=42,n_estimators=100),
                      BaggingClassifier(), tree.DecisionTreeClassifier()],proba=True)
        ensemble.add_meta(LogisticRegression(),proba=True)
        print('now is here -4\n')
        ensemble.fit(x_train_train,y_train_train)
        print('now is here -5\n')
        preds_prob=ensemble.predict_proba(x_test)
        print('now is here -6\n')
        prob=preds_prob[:, 1]
        # Hard labels at the 0.5 probability threshold.
        preds=[]
        for i in prob:
            if i>=0.5:
                preds.append(1);
            else:
                preds.append(0)
        auc_sl=roc_auc_score(y_test,preds_prob[:,1])
        auprc_sl=average_precision_score(y_test,preds_prob[:,1])
        recall_sl=recall_score(y_test,preds)
        acc_sl=accuracy_score(y_test,preds)
        p_sl=precision_score(y_test,preds)
        f1_sl=f1_score(y_test,preds)
        fpr_sl,tpr_sl,thr_sl=roc_curve(y_test,prob)
        print('now is here -7')
        resres.append([best_combination_nowfold,auc_sl,auprc_sl,acc_sl,p_sl,recall_sl,f1_sl,fpr_sl,tpr_sl,thr_sl])
    return resres
def test_subset_equiv():
    """[Subsemble] Test equivalence with SuperLearner for J=1."""
    subsemble = Subsemble(partitions=1)
    learner = SuperLearner()
    # Identical ECM layer on both ensembles.
    subsemble.add(ECM, dtype=np.float64)
    learner.add(ECM, dtype=np.float64)
    expected = subsemble.fit(X, y).predict(X)
    actual = learner.fit(X, y).predict(X)
    np.testing.assert_array_equal(actual, expected)
def build_ensemble(incl_meta, meta_type='log', preprocessors=None, estimators=None, propagate_features=None):
    """Build a two-layer SuperLearner with optional feature propagation.

    :param incl_meta: if truthy, attach a meta learner.
    :param meta_type: 'log' for LogisticRegression, 'lin' for LinearRegression.
    :param preprocessors: unused; kept for interface compatibility.
    :param estimators: base estimators for both layers; defaults to
        named RandomForest/SVR/Ridge regressors.
    :param propagate_features: indices of input features to carry through
        the layers alongside the predictions.
    :return: the (unfitted) ensemble.
    """
    if propagate_features:
        n = len(propagate_features)
        propagate_features_1 = propagate_features
        # After layer 1, the propagated columns occupy the first n positions.
        propagate_features_2 = list(range(n))
    else:
        propagate_features_1 = propagate_features_2 = None

    if not estimators:
        estimators = [('rfr', RandomForestRegressor(random_state=seed)),
                      ('svr', SVR()),
                      ('rdg', Ridge())]

    ensemble = SuperLearner()
    ensemble.add(estimators, propagate_features=propagate_features_1)
    ensemble.add(estimators, propagate_features=propagate_features_2)

    # BUG FIX: original used bitwise '&', which binds tighter than '==',
    # so the condition parsed as (incl_meta & meta_type) == 'log' — a
    # TypeError for bool & str. Logical 'and' is the intended operator.
    if incl_meta and meta_type == 'log':
        ensemble.add_meta(LogisticRegression())
    elif incl_meta and meta_type == 'lin':
        ensemble.add_meta(LinearRegression())
    return ensemble
def get_stacked_model(X, y, is_processing=True):
    """Fit a two-level MyClassifier stack, optionally standard-scaling inputs.

    :param X: training features.
    :param y: training labels.
    :param is_processing: when True, prepend a StandardScaler to the base layer.
    :return: the fitted ensemble.
    """
    stack = SuperLearner(scorer=accuracy_score, random_state=seed)
    if is_processing:
        preprocessing_steps = [StandardScaler()]
    else:
        preprocessing_steps = []
    stack.add([MyClassifier(5.0)], preprocessing=preprocessing_steps)
    stack.add_meta(MyClassifier(0.5))
    stack.fit(X, y)
    return stack
def build_ensemble(incl_meta, proba, propagate_features=[0, 1]):
    """Return an ensemble.

    :param incl_meta: attach the 'lr' meta learner when truthy.
    :param proba: forward class probabilities (not labels) between layers.
    :param propagate_features: feature indices forwarded alongside the
        layer-1 output. NOTE: the list default is never mutated, so the
        shared default object is safe here.
    :return: the (unfitted) ensemble.
    """
    # Dead code removed: the original derived propagate_features_1/_2 from
    # the argument but never used them — the raw argument was passed to
    # add() directly, so only that path is kept.
    estimators_layer1 = [xgb]
    estimators_layer2 = [lgb]

    ensemble = SuperLearner()
    ensemble.add(estimators_layer1, proba=proba, propagate_features=propagate_features)
    ensemble.add(estimators_layer2, proba=proba)
    if incl_meta:
        ensemble.add_meta(lr)
    return ensemble
def stacking_training(X, y, X_pred, layer_list, meta_learner):
    """Fit a probability-stacking SuperLearner and predict on X_pred.

    :param X: training features.
    :param y: training labels.
    :param X_pred: data to score after fitting.
    :param layer_list: iterable of base-learner layers, added in order.
    :param meta_learner: estimator used as the final meta layer.
    :return: (predicted probabilities for X_pred, fitted ensemble).
    """
    model = SuperLearner(folds=5, backend='multiprocessing', model_selection=False)
    for layer in layer_list:
        model.add(layer, proba=True)
        print('基学习器添加成功')
    model.add_meta(meta_learner, proba=True)
    print('元学习器添加成功')
    print('拟合中')
    model.fit(X, y)
    pred_proba = model.predict_proba(X_pred)
    return pred_proba, model
def get_stacked_model(X, y):
    """Fit a RandomForest+SVC stack under a logistic meta learner and
    print the per-learner F1 scores gathered during training.

    :return: the fitted ensemble.
    """
    stack = SuperLearner(scorer=f1, random_state=seed)
    stack.add([RandomForestClassifier(random_state=seed), SVC()])
    stack.add_meta(LogisticRegression())
    stack.fit(X, y)
    print('f1-score in training')
    print('-m: mean. -s: std')
    print(pd.DataFrame(stack.data))
    return stack
def build_ensemble(incl_meta, propagate_features=None):
    """Return an ensemble.

    :param incl_meta: attach the LogisticRegression meta learner when truthy.
    :param propagate_features: input-feature indices propagated through
        both layers (layer 2 sees them re-indexed from position 0).
    :return: the (unfitted) two-layer ensemble.
    """
    if propagate_features:
        n = len(propagate_features)
        propagate_features_1 = propagate_features
        # After layer 1, the propagated columns occupy the first n positions.
        propagate_features_2 = list(range(n))
    else:
        propagate_features_1 = propagate_features_2 = None

    estimators = [rfc, rfc2, rfc3]
    ensemble2 = SuperLearner()
    ensemble2.add(estimators, propagate_features=propagate_features_1)
    ensemble2.add(estimators, propagate_features=propagate_features_2)
    if incl_meta:
        # BUG FIX: the original called add_meta on a global 'ensemble'
        # instead of the 'ensemble2' built and returned here.
        ensemble2.add_meta(LogisticRegression())
    return ensemble2
def esemble(data, data2, data5, during):
    """Fit a linear-regression/Gaussian-process SuperLearner on data2 and
    attach its predictions for data5 to `data`.

    :param data: frame that receives the 'pred_essemble' column.
    :param data2: training frame containing the forward-momentum targets.
    :param data5: scoring frame (same columns plus 'pred').
    :param during: horizon suffix selecting the target, e.g. '1d'.
    :return: `data` with the new prediction column.
    """
    model = SuperLearner(scorer=accuracy_score, random_state=45, verbose=2)
    model.add(linear_model.LinearRegression())
    model.add_meta([GaussianProcessRegressor()])
    # Columns that are targets/identifiers, never features.
    drop_cols = ['prmom1d_f', 'prmom1w_f', 'prmom2w_f', 'prmom3w_f', 'uniqcode', 'date']
    y = np.array(data2['prmom' + during + '_f'])
    x = np.array(data2.drop(drop_cols, axis=1).fillna(0))
    model.fit(x, y)
    X = np.array(data5.drop(drop_cols + ['pred'], axis=1).fillna(0))
    data['pred_essemble'] = model.predict(X)
    return data
def get_stacked_model(X, y):
    """Fit an SVC+RandomForest probability stack under a logistic meta
    learner and print per-learner training accuracy.

    :return: the fitted ensemble.
    """
    stack = SuperLearner(scorer=accuracy, random_state=seed)
    base_layer = [SVC(probability=True), RandomForestClassifier(random_state=seed)]
    # proba=True makes the layer call predict_proba instead of predict.
    stack.add(base_layer, proba=True)
    stack.add_meta(LogisticRegression())
    stack.fit(X, y)
    print('accuracy score in training')
    print('-m: mean. -s: std')
    print(pd.DataFrame(stack.data))
    return stack
def layer_hyperparam_tuning(X, y, pre_layer_learners, local_layer_learners, param_dicts_layer, n_iterations=50, pre_params='params_base.csv'):
    """Tune hyper-parameters of an intermediate stacking layer.

    (Translated from the original Chinese docstring: intermediate-layer
    hyper-parameter tuning; learners must be added in order.)

    The previous layer's already-tuned parameters are loaded from
    ``pre_params`` and applied, then that layer is wrapped as a
    preprocessing step while ``local_layer_learners`` are searched with
    an mlens Evaluator.

    :param X: feature frame (``.values`` is taken below).
    :param y: target series (``.values`` is taken below).
    :param pre_layer_learners: (name, estimator) pairs of the previous layer.
    :param local_layer_learners: learners whose parameters are tuned here.
    :param param_dicts_layer: search space passed to Evaluator.fit.
    :param n_iterations: number of random-search iterations.
    :param pre_params: CSV file holding the previous layer's parameters.
    :return: (fitted-structure in_layer, DataFrame of evaluator results).
    """
    X = X.values
    y = y.values
    scorer = make_scorer(metrics.roc_auc_score, greater_is_better=True)
    # Load previously tuned parameters, indexed by learner name.
    params_pre = pd.read_csv(pre_params)
    params_pre.set_index(['Unnamed: 0'], inplace=True)
    for case_name, params in params_pre["params"].items():
        case_est = case_name
        # WARNING: eval() on CSV contents — only use trusted parameter files.
        params = eval(params)
        for est_name, est in pre_layer_learners:
            if est_name == case_est:
                est.set_params(**params)
    in_layer = SuperLearner(folds=10, backend='multiprocessing', model_selection=True)
    in_layer.add(pre_layer_learners, proba=True)
    # The whole previous layer acts as preprocessing for the search below.
    preprocess = [in_layer]
    evl = Evaluator(scorer, cv=5, verbose=20, backend='multiprocessing')
    evl.fit(X, y, local_layer_learners, param_dicts=param_dicts_layer,
            preprocessing={'meta': preprocess}, n_iter=n_iterations)
    df_params_layer = pd.DataFrame(evl.results)
    return in_layer, df_params_layer
def use_pack():
    """Train the module-level SuperLearner setup and report test ROC-AUC.

    Uses the global base_learners, meta_learner and the xtrain/xtest split.
    """
    learner = SuperLearner(
        folds=10,
        random_state=SEED,
        verbose=2,
        # backend="multiprocessing"
    )
    # Base layer emits probabilities; meta layer consumes them.
    learner.add(list(base_learners.values()), proba=True)
    learner.add_meta(meta_learner, proba=True)
    learner.fit(xtrain, ytrain)
    scores = learner.predict_proba(xtest)
    print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest, scores[:, 1]))
def get_super_learner():
    """Two-fold, unshuffled SuperLearner: the GBM/elastic-net trio under a
    no-intercept linear meta model.

    :return: the (unfitted) ensemble.
    """
    stack = SuperLearner(folds=2, shuffle=False)
    stack.add([elastic_net, xgboost, light_gbm])
    stack.add_meta(LinearRegression(fit_intercept=False))
    return stack
def build_ensemble(**kwargs):
    """Generate ensemble.

    Base estimators are grouped by preprocessing pipeline (the dict keys of
    the two mappings must match); a gradient-boosting meta learner tops it.
    """
    ensemble = SuperLearner(**kwargs)
    preprocessing = {
        'Standard Scaling': [StandardScaler()],
        'Min Max Scaling': [MinMaxScaler()],
        'No Preprocessing': [],
    }
    estimators = {
        'Standard Scaling': [ElasticNet(), Lasso(), KNeighborsRegressor()],
        'Min Max Scaling': [SVR()],
        'No Preprocessing': [
            RandomForestRegressor(random_state=SEED),
            GradientBoostingRegressor(),
        ],
    }
    ensemble.add(estimators, preprocessing)
    ensemble.add(GradientBoostingRegressor(), meta=True)
    return ensemble
def get_super_learner(X):
    """Build a 2-fold RMSE-scored SuperLearner over get_models() topped by
    a linear meta model.

    :param X: training data, used only to size the fold sampling.
    :return: the (unfitted) ensemble.
    """
    learner = SuperLearner(scorer=rmse, folds=2, shuffle=True, sample_size=len(X))
    learner.add(get_models())             # base layer
    learner.add_meta(LinearRegression())  # meta layer
    return learner
def get_ensemble():
    """Assemble the 10-fold multiprocessing SuperLearner with probability
    outputs at every layer.

    :return: the (unfitted) ensemble.
    """
    ensemble = SuperLearner(
        folds=10,
        random_state=seed,
        verbose=2,
        backend='multiprocessing',
    )
    ensemble.add(list(get_models().values()), proba=True)
    ensemble.add_meta(get_meta(), proba=True)
    return ensemble
def perform_ensemble_adaboost(X_train, y_train, X_test, y_test):
    """Fit a linear-SVC base layer with an AdaBoost(decision-tree) meta
    learner and report 10-fold cross-validated accuracy on the test set.

    Dead code removed: an unused object-name list, an unused ``import
    time``/``start`` pair, and a discarded ``predict`` call.

    :param X_train: training features.
    :param y_train: training labels.
    :param X_test: test features (cross-validated for the printed scores).
    :param y_test: test labels.
    """
    ensemble = SuperLearner(folds=10,
                            random_state=seed,
                            verbose=2,
                            backend="multiprocessing",
                            scorer=accuracy_score)
    layer_1 = [SVC(kernel='linear', C=8)]
    ensemble.add(layer_1)  # 95.50
    ensemble.add_meta(
        AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=8,
                                   min_samples_split=5,
                                   min_samples_leaf=8)))
    ensemble.fit(X_train, y_train)
    # NOTE: cross_val_score refits clones of the whole ensemble on X_test.
    accuracies = cross_val_score(ensemble,
                                 X_test,
                                 y_test,
                                 cv=10,
                                 scoring="accuracy")
    print("Accuracy of Adaboost: {:.2f} %".format(accuracies.mean() * 100))
    print("Standard Deviation of Adaboost: {:.2f} %".format(accuracies.std() * 100))
def get_super_learner(X):
    """Build a 10-fold accuracy-scored SuperLearner over get_models() with a
    logistic-regression meta model.

    :param X: training data, used only to size the fold sampling.
    :return: the (unfitted) ensemble.
    """
    learner = SuperLearner(
        scorer=accuracy_score, folds=10, shuffle=True, sample_size=len(X)
    )
    learner.add(get_models())                             # base models
    learner.add_meta(LogisticRegression(solver='lbfgs'))  # meta model
    return learner
def build_ensemble(**kwargs):
    """Generate ensemble.

    Layer 1: linear models configured not to copy X; layer 2: a KNN
    regressor stacked on top (no meta learner).
    """
    ensemble = SuperLearner(**kwargs)
    ensemble.add([ElasticNet(copy_X=False), Lasso(copy_X=False)])
    ensemble.add(KNeighborsRegressor())
    return ensemble
def add_superlearner(name, models, X_train, Y_train, X_test, Y_test):
    """Fit a SuperLearner with an SVC meta estimator and report its
    test accuracy and wall-clock fit+predict time.

    Dead code removed: an unused ``acc_score_cv`` and pointless ``None``
    pre-assignments of variables set unconditionally below.

    :param name: label stored under "Ensemble" in the result row.
    :param models: base estimators for the first layer.
    :return: dict with ensemble name, meta-classifier label, accuracy, runtime.
    """
    ensemble = SuperLearner(scorer=accuracy_score, random_state=seed)
    ensemble.add(models)
    # Attach the final meta estimator.
    ensemble.add_meta(SVC())

    start = time.time()
    ensemble.fit(X_train, Y_train)
    preds = ensemble.predict(X_test)
    acc_score = accuracy_score(preds, Y_test)
    elapsed = time.time() - start

    return {
        "Ensemble": name,
        "Meta_Classifier": "SVC",
        "Accuracy_Score": acc_score,
        "Runtime": elapsed,
    }
def train_model(ensemble, X, y):
    """Build and fit a two-layer anomaly-detection SuperLearner on (X, y).

    NOTE(review): the ``ensemble`` argument is ignored — it is immediately
    rebound to a fresh SuperLearner below, and the function returns None,
    so callers only observe the side effect of fitting. Confirm intent.
    """
    seed = 2017
    np.random.seed(seed)
    # --- Build ---
    # Passing a scoring function will create cv scores during fitting;
    # the scorer should be a simple function accepting two vectors and returning a scalar.
    ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)
    # Build the first layer
    # ensemble.add([RandomForestClassifier(random_state=seed), SVC()])
    ensemble.add([IsolationForest(), LOF(novelty=True)])
    # Attach the final meta estimator
    # ensemble.add_meta(LogisticRegression())
    ensemble.add_meta(OCSVM())
    # Fit ensemble
    ensemble.fit(X, y)
def build_ensemble(incl_meta, propagate_features=None):
    """Two-layer RF+SVR SuperLearner with optional feature propagation.

    :param incl_meta: attach the LogisticRegression meta learner when truthy.
    :param propagate_features: input-feature indices carried through layers.
    :return: the (unfitted) ensemble.
    """
    if not propagate_features:
        first_pass = second_pass = None
    else:
        first_pass = propagate_features
        # After layer 1, the propagated columns occupy the first positions.
        second_pass = list(range(len(propagate_features)))

    base_layer = [RandomForestRegressor(random_state=seed), SVR()]
    ensemble = SuperLearner()
    ensemble.add(base_layer, propagate_features=first_pass)
    ensemble.add(base_layer, propagate_features=second_pass)
    if incl_meta:
        # NOTE(review): a classification meta learner over regressor layers —
        # presumably intentional; confirm against the calling code.
        ensemble.add_meta(LogisticRegression())
    return ensemble
def get_model(param: dict) -> BaseEstimator:
    """Instantiate the regressor selected by ``param['name']``.

    ``param[name]`` must hold the keyword arguments for that estimator.
    NOTE: this pops 'name' from the caller's dict (side effect preserved
    from the original); the recursive composite branches pass fresh copies
    via ``dict(param, name=...)`` so the pop never drains ``param`` twice.
    Returns None for an unrecognized name.
    """
    model_name = param.pop('name')

    def component(name):
        # Recurse on a copy with the component's name re-inserted.
        return get_model(dict(param, name=name))

    plain = {
        'xgb': XGBRegressor,
        'lgb': LGBMRegressor,
        'cb': CatBoostRegressor,
        'rf': RandomForestRegressor,
    }
    scaled = {
        'svm': SVR,
        'knn': KNeighborsRegressor,
        'mlp': MLPRegressor,
    }

    if model_name in plain:
        return plain[model_name](**param[model_name])
    if model_name in scaled:
        # These estimators are scale-sensitive: standardize first.
        return make_pipeline(StandardScaler(), scaled[model_name](**param[model_name]))
    if model_name == 'vote':
        return VotingRegressor(estimators=[
            ('svm', component('svm')),
            ('rf', component('rf')),
            ('lgb', component('lgb')),
            ('knn', component('knn')),
        ])
    if model_name == 'stack':
        stacked = SuperLearner(scorer=mean_squared_error, random_state=132)
        stacked.add([
            component('svm'),
            component('rf'),
            component('lgb'),
            component('knn'),
        ])
        stacked.add_meta(GradientBoostingRegressor(random_state=22))
        return stacked
    if model_name == 'sk_stack':
        return StackingRegressor(
            estimators=[
                ('svm', component('svm')),
                ('rf', component('rf')),
                ('lgb', component('lgb')),
                ('knn', component('knn')),
            ],
            final_estimator=GradientBoostingRegressor(random_state=42)
        )
def do_stacking_simple_models(regressors, X, y, w, meta):
    """Stack the given regressors with the mlens library.

    :param regressors: dict of regressors to feed into the ensemble pipeline
    :param X: training dataset
    :param y: outcome variable
    :param w: assignment variable
    :param meta: key into ``regressors`` selecting the meta learner
    :return: CATE predictions from the ensemble estimator
    """
    ensemble = SuperLearner(scorer=mean_squared_error, random_state=42)
    ensemble.add(list(regressors.values()))
    ensemble.add_meta(regressors[meta])
    e_preds, tau_test = simple_model.create_simple_ml_model(X, y, w, ensemble)
    return e_preds
# Reproducibility seed shared by the ensemble and the data split.
seed = 42
# NOTE(review): hard-coded absolute Windows path — parameterize before reuse.
data_all = pd.read_csv('C:\\Users\\u\\Desktop\\datathon\\data-sofain.csv', header=0)
# Mean-impute missing values in place, then persist the imputed table.
data_all.fillna(data_all.mean(), inplace=True)
data_all.to_csv('C:\\Users\\u\\Desktop\\datathon\\data_nomiss.csv')
# Features: all but the first and last two columns; target: in-hospital mortality flag.
data_x = data_all.iloc[:, 1:-2]
data_y = data_all['hospital_expire_flag']
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.25, random_state=42)
# 10-fold SuperLearner scored by ROC-AUC, fitting base models in parallel processes.
ensemble = SuperLearner(scorer=roc_auc_score, random_state=seed, folds=10, backend="multiprocessing")
ensemble.add([
    RandomForestClassifier(random_state=seed, n_estimators=250),
    SVC(),
    # NOTE(review): LassoLarsIC/ElasticNet/BayesianRidge are regressors mixed
    # into a classification layer — confirm this is intentional.
    LassoLarsIC(criterion='bic'),
    ElasticNet(random_state=0),
    BayesianRidge(),
    MLPClassifier(),
    BaggingClassifier(),
    neighbors.KNeighborsClassifier(),
    tree.DecisionTreeClassifier(),
    GradientBoostingClassifier(n_estimators=200)
])
# Attach the final meta estimator
# Train with stacking cv_base_learners, cv_meta_learner = stacking(get_models(), clone(meta_learner), xtrain.values, ytrain.values, KFold(2)) P_pred, p = ensemble_predict(cv_base_learners, cv_meta_learner, xtest, verbose=False) print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p)) # 0.881 ## 现在我们来想一想,这样的方法有啥问题呢?是不是速度会比较慢呀!推荐用下面的并行方法,速度大大提升! # Instantiate the ensemble with 10 folds sl = SuperLearner(folds=10, random_state=SEED, verbose=2, backend="multiprocessing") # Add the base learners and the meta learner sl.add(list(base_learners.values()), proba=True) sl.add_meta(meta_learner, proba=True) # Train the ensemble sl.fit(xtrain, ytrain) # Predict the test set p_sl = sl.predict_proba(xtest) print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest, p_sl[:, 1])) plot_roc_curve(ytest, p.reshape(-1, 1), P.mean(axis=1), ["Simple average"], "Super Learner", 'ROC_curve_with_super_learning') # 0.890