# Two XGBoost classifiers (full vs. feature-selected dataset) plus a
# RandomForest, stacked with a LogisticRegression meta-model and scored
# with log-loss.
xgb_params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 500,
    # 'num_class': 5,
    'objective': 'multi:softprob',
    'subsample': 0.8}
model_xgb = Classifier(dataset=dataset_full, estimator=xgb.sklearn.XGBClassifier,
                       parameters=xgb_params, name='xgb')
model_xgb_f = Classifier(dataset=dataset_f, estimator=xgb.sklearn.XGBClassifier,
                         parameters=xgb_params, name='xgb_f')
model_rf = Classifier(dataset=dataset_f, estimator=RandomForestClassifier,
                      parameters={'n_estimators': 700}, name='rf')
pipeline = ModelsPipeline(  # model_xgb,
    model_rf, model_xgb_f)
# k=5 out-of-fold predictions; full_test=True refits each base model on the
# whole training set before predicting the test set.
stack_ds = pipeline.stack(k=5, full_test=True, seed=111)
stacker = Classifier(stack_ds, LogisticRegression)
stacker.validate(k=5, scorer=log_loss)
# logging.info(val_results)
# # # logging.info(train_x.head(10))
# # print(test_x.columns.difference(train_x.columns))
# # boosters = np.array([])
# predictions = []
# Tail of the first LightGBM model definition (its opening
# `model_lgb = Classifier(... parameters={` is above this chunk).
'reg_alpha': 0.001,
    'colsample_bytree': 0.5,
    'min_child_samples': 24,
}, name='lgb')
model_lgb2 = Classifier(dataset=dataset,
                        estimator=lgb.LGBMClassifier,
                        parameters={
                            'n_estimators': 70,
                            'boosting_type': 'gbdt',
                            'max_depth': 6,
                            'min_child_weight': 0.001,
                            'num_leaves': 30,
                            'seed': 128,
                            'reg_alpha': 0.001,
                            'reg_lambda': 0.002,
                            'colsample_bytree': 0.5,
                            'min_child_samples': 24
                        # NOTE(review): same name 'lgb' as the first model —
                        # heamy caches/labels by name; this was probably meant
                        # to be 'lgb2'. Confirm before relying on the cache.
                        }, name='lgb')
model_lg = Classifier(dataset=dataset, estimator=LogisticRegression, name='lg')
# Stack all five base classifiers; LogisticRegression is the meta-model,
# validated with 10-fold CV on ROC-AUC.
pipeline = ModelsPipeline(model_lgb, model_lgb2, model_xgb, model_xgb2, model_lg)
stack_ds = pipeline.stack(k=10, seed=111)
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression)
stacker.validate(k=10, scorer=roc_auc_score)
results = stacker.predict()
###stacking model_rf = Regressor(dataset=dataset, estimator=rf, parameters=params_rf, name='rf') #model_gbrt = Regressor(dataset=dataset, estimator=gbrt, parameters=params_gbrt,name='gbrt') model_ext = Regressor(dataset=dataset, estimator=ext, parameters=params_ext, name='ext') model_rcv = Regressor(dataset=dataset, estimator=rcv, parameters=params_rcv, name='rcv') #model_lascv = Regressor(dataset=dataset, estimator=lascv, parameters=params_lascv,name='lascv') pipeline = ModelsPipeline(model_rf, model_rcv, model_ext) stack_ds = pipeline.stack(k=5, seed=111) stacker = Regressor(dataset=stack_ds, estimator=Lasso, parameters=params_las) y_pre = stacker.predict() y_pre_last = np.append(y_pre, y_pre) y_pre_last[10] * 1.08 ### #loss_gbrt = Evaluation([y_pre_gbrt],[y_test]) output(fw, i + 1, y_pre_last) ''' if loss_gbrt>0.015: output(fw_gbrt,i+1,y_pre_rf) fw_gbrt.write(str(i+1)+',gbrt,'+str(loss_gbrt)+'\n')
# Tail of the GBDT model definition (its opening `model_gbdt = Regressor(...
# parameters={` is above this chunk).
'max_features': 'sqrt',
    'min_samples_leaf': 15,
    'min_samples_split': 10
}, name='gbdt')
model_xgb = Regressor(dataset=dataset,
                      estimator=xgb.XGBRegressor,
                      parameters={
                          'n_estimators': 50,
                          'learning_rate': 0.05,
                          'max_depth': 3
                      }, name='xgb')
# Stack the base models: out-of-fold predictions become the second-level
# training set.
pipeline = ModelsPipeline(model_lr, model_rf, model_gbdt, model_xgb)
stack_ds = pipeline.stack(k=10, seed=111)
# Second level: stack with an XGBoost model.
stacker = Regressor(dataset=stack_ds, estimator=xgb.XGBRegressor)
results = stacker.predict()
# 10-fold cross-validation results.
results10 = stacker.validate(k=10, scorer=mean_squared_error)
print("r2_score: %f" % r2_score(y_test, results))
# Assemble features, dropped columns, target and predictions into one table
# for export.
test_y = pd.DataFrame(y_test)
predictions = pd.DataFrame(results)
data = pd.concat([data_XX, data_drop], axis=1)
data = pd.concat([data, data_y], axis=1)
data = pd.concat([data, predictions], axis=1)
data = np.array(data)
# (the body of this `with` block continues beyond this chunk)
with open('C:/20180402_pre_test.csv', 'w') as f:
test_size=0.1, random_state=111)
    # (continuation: the enclosing `boston_dataset` loader function opens above
    # this chunk; it returns the train/test split consumed by Dataset below)
    return X_train, y_train, X_test, y_test

# Lazily-built dataset: heamy calls the preprocessor and caches the result.
dataset = Dataset(preprocessor=boston_dataset, use_cache=True)
model = Regressor(dataset=dataset, estimator=LinearRegression,
                  parameters={'normalize': True}, name='lr')
model_2 = Regressor(dataset=dataset, estimator=RandomForestRegressor,
                    parameters={'n_estimators': 50}, name='rf')
pipeline = ModelsPipeline(model, model_2)


def test_apply():
    # Averaging the two models' predictions row-wise must yield one value per
    # test sample; 10-fold validation must yield 10 scores.
    output = pipeline.apply(lambda x: np.mean(x, axis=0)).execute()
    assert output.shape[0] == dataset.X_test.shape[0]
    output = pipeline.apply(lambda x: np.mean(x, axis=0)).validate(
        scorer=mean_absolute_error, k=10)
    assert len(output) == 10


def test_simple_functions():
    # Built-in pipeline reducers (max/mean/gmean) keep one row per test sample.
    assert dataset.X_test.shape[0] == pipeline.max().execute().shape[0]
    assert dataset.X_test.shape[0] == pipeline.mean().execute().shape[0]
    assert dataset.X_test.shape[0] == pipeline.gmean().execute().shape[0]
# Tail of the ExtraTrees model definition (its opening `et = Classifier(...`
# is above this chunk).
estimator=ExtraTreesClassifier, parameters=et_param, name='et')
lgb = Classifier(dataset=dataset, estimator=LGBMClassifier, use_cache=CACHE,
                 parameters=lgb_param, name='lgb')
lr = Classifier(dataset=dataset, estimator=LogisticRegression, use_cache=CACHE,
                parameters=lr_param, name='lr')
#------------------------------------------------------------------------------
#Stack the models and returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(knn, rf, et, lgb, lr)
stack_ds = pipeline.stack(k=NFOLDS, seed=1)
# Train LogisticRegression on stacked data (second stage)
lr1 = LogisticRegression
lr1_params = {
    'C': 5,
    'random_state': 1,
    'solver': 'liblinear',
    'multi_class': 'ovr',
}
stacker = Classifier(dataset=stack_ds, estimator=lr1, use_cache=False,
                     parameters=lr1_params)
# Tail of a model definition (its opening `Classifier(` call is above this
# chunk).
use_cache=class_use_cache)
model_mlp = Classifier(dataset=dataset, estimator=MLPClassifier, name="mlp",
                       use_cache=class_use_cache)
model_sgt = Classifier(dataset=dataset, estimator=SGDClassifier,
                       parameters={'penalty': 'l1'}, name="sgd",
                       use_cache=class_use_cache)
# Stack the models.
# Returns new dataset with out-of-fold prediction,model_svm,model_per
logging.info('stack_ds....')
# pipeline = ModelsPipeline(model_nb,model_lr,model_svc)
# Only the SGD model feeds the stack here; the alternatives are commented out.
pipeline = ModelsPipeline(model_sgt)
# pipeline = ModelsPipeline(model_nb),model_nb,model_lr,model_lr2
stack_ds = pipeline.stack(k=8, seed=111)
# Second layer: LinearSVC stacker.
logging.info('second layer....')
stacker = Classifier(dataset=stack_ds, estimator=svm.LinearSVC,
                     use_cache=False, probability=False)
results = stacker.predict()
# Cross-validation results (k=3, despite the original "10-fold" comment).
results10 = stacker.validate(k=3, scorer=accuracy_score)
logging.info(results10)
# NOTE(review): +1 presumably maps 0-based predictions back to 1-based labels
# — confirm against the label encoding used upstream.
result_list = list(results + 1)
# Tail of the `models` list (it opens above this chunk): LinearRegression,
# KNeighbors and CatBoost regressors sharing one dataset.
estimator=LinearRegression,
              parameters={'normalize': True},
              name='lr'),
    Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),
    Regressor(dataset=dataset,
              estimator=CatBoostRegressor,
              parameters={
                  'custom_metric': ['MAE'],
                  'random_seed': seed,
                  'logging_level': 'Silent'
              },
              name='cr')
]
# Define the pipeline and build the 2nd-level dataset.
pipeline = ModelsPipeline(*models)
stack_ds = pipeline.stack(k=10, seed=seed)
# Build the meta-model and validate.
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
y_trues, y_preds = stacker.validate(k=10)
# Output accuracy.
# Predict using X_test.
y_pred = stacker.predict()
print(y_pred)
sum = 0  # NOTE(review): shadows the builtin `sum`
buy = 0
for i, yosoku in enumerate(x_test):
    # NOTE(review): heamy's predict() takes no per-sample argument — calling
    # stacker.predict(x_test[i]) per iteration looks wrong and refits every
    # time; `y_pred[i]` above likely holds the intended value. Confirm against
    # the heamy API.
    if stacker.predict(x_test[i]) < 0.3:
# Confusion/correlation-matrix plot of the eight first-level models, followed
# by stacking four of them with a no-intercept LinearRegression meta-model and
# writing min-max-scaled probabilities.
fs = ['xgb1', 'xgb2', 'xgb3', 'et', 'svm', 'lr', 'lgb', 'gbdt']
import matplotlib.pyplot as plt


def plot_confusion_matrix(cm, title, cmap=plt.cm.Blues):
    """Render matrix `cm` as a heatmap with the model names in `fs` as ticks."""
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(8)
    plt.xticks(tick_marks, fs, rotation=45)
    plt.yticks(tick_marks, fs)
    plt.tight_layout()


plot_confusion_matrix(cm, title='mic')
plt.show()

model_xgb2 = Regressor(dataset=dataset, estimator=xgb_feature2, name='xgb2', use_cache=False)
model_lr = Regressor(dataset=dataset, estimator=logistic_model, name='lr', use_cache=False)
model_lgb = Regressor(dataset=dataset, estimator=lgb_model, name='lgb', use_cache=False)
# BUG FIX: the identifier was written as `model_ gbdt` (embedded space), which
# is a SyntaxError.
model_gbdt = Regressor(dataset=dataset, estimator=gbdt_model, name='gbdt', use_cache=False)
# NOTE(review): model_svm is referenced here but defined elsewhere in the file.
pipeline = ModelsPipeline(model_xgb2, model_lr, model_lgb, model_svm)
stack_data = pipeline.stack(k=5, seed=0, add_diff=False, full_test=True)
stacker = Regressor(dataset=stack_data, estimator=LinearRegression,
                    parameters={'fit_intercept': False})
predict_result = stacker.predict()
val = pd.read_csv('val_list.csv')
val['PROB'] = predict_result
# Min-max normalise the scores into [0, 1] and format to 4 decimal places.
minmin, maxmax = min(val['PROB']), max(val['PROB'])
val['PROB'] = val['PROB'].map(lambda x: (x - minmin) / (maxmax - minmin))
val['PROB'] = val['PROB'].map(lambda x: '%.4f' % x)
'n_jobs': -1 }, name='Perceptron', use_cache=class_use_cache) model_svc = Classifier(dataset=dataset, estimator=svm.LinearSVC, name="LinearSVC", use_cache=class_use_cache) model_svc = Classifier(dataset=dataset, estimator=svm.LinearSVC, name="LinearSVC", use_cache=class_use_cache) # Stack两个模型mhg # Returns new dataset with out-of-fold predictionmodel_svc, pipeline = ModelsPipeline(model_nb) stack_ds = pipeline.stack(k=3, seed=111) print(stack_ds.X_train.shape) #第二层使用lr模型stack stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={ 'C': 4, 'dual': True, 'n_jobs': -1 }, use_cache=False, probability=False) results = stacker.predict() result_list = list(results)
# Four first-level models (three regressors + one XGBoost classifier) combined
# both by learned weights and by stacking.
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor,
                     parameters={'n_estimators': 50}, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression,
                     parameters={'normalize': True}, name='lr')
model_knn = Regressor(dataset=dataset, estimator=KNeighborsRegressor,
                      parameters={'n_neighbors': 15}, name='knn')
# NOTE(review): LogisticRegression is a classifier; wrapping it in Regressor
# is suspect, though it is never added to the pipeline below — confirm intent.
model_lgt = Regressor(dataset=dataset, estimator=LogisticRegression,
                      parameters={'penalty': 'l2'}, name='lgt')
xgbclf = Classifier(dataset=dataset, estimator=XGBClassifier)
# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr, model_knn, xgbclf)
# Weighted-average ensemble: learn weights minimising MAE.
weights = pipeline.find_weights(mean_absolute_error)
result = pipeline.weight(weights)
stack_ds = pipeline.stack(k=10, seed=111)
# Then, train LinearRegression on stacked data
# BUG FIX: the second-stage model must train on the stacked out-of-fold
# predictions (stack_ds), not on the original `dataset` — using the raw
# dataset here would silently skip the stacking stage entirely.
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
results = stacker.validate(k=10, scorer=mean_absolute_error)
# XGBoost first-level model definition, then a two-model stack (SVC + LR)
# topped with a LinearSVC meta-model.
model_xgb = Classifier(dataset=dataset,
                       estimator=XGBClassifier,
                       parameters={
                           'subsample': 0.6,
                           # BUG FIX: was misspelled 'colsample_btree', which
                           # XGBoost does not recognise (the setting was
                           # silently dropped); the real name is
                           # 'colsample_bytree'.
                           'colsample_bytree': 0.6,
                           'random_state': 27,
                           'n_jobs': 1
                       },
                       name="xgb",
                       use_cache=class_use_cache)
# Stack the models.
# Returns new dataset with out-of-fold predictions.
logging.info('stack_ds....')
# pipeline = ModelsPipeline(model_nb,model_lr,model_svc)
pipeline = ModelsPipeline(model_svc, model_lr)
# pipeline = ModelsPipeline(model_nb),model_nb,model_lr,model_lr2
stack_ds = pipeline.stack(k=8, seed=111)
# Second layer: LinearSVC stacker.
logging.info('second layer....')
stacker = Classifier(dataset=stack_ds, estimator=svm.LinearSVC,
                     use_cache=False, probability=False)
results = stacker.predict()
# 8-fold cross-validation results.
results10 = stacker.validate(k=8, scorer=accuracy_score)
logging.info(results10)
# print(accuracy_score(y_test, results))
###stacking model_rf1 = Regressor(dataset=dataset, estimator=rf, parameters=params_rf,name='rf1') #model_ext = Regressor(dataset=dataset, estimator=ext, parameters=params_ext,name='ext') #model_rf2 = Regressor(dataset=dataset, estimator=rf2, parameters=params_rf2,name='rf2') model_rcv = Regressor(dataset=dataset, estimator=rcv, parameters=params_rcv,name='rcv') #model_gbrt = Regressor(dataset=dataset, estimator=gbrt, parameters=params_gbrt,name='gbrt') #model_lascv = Regressor(dataset=dataset, estimator=lascv, parameters=params_lascv,name='lascv') model_br = Regressor(dataset=dataset, estimator=br, parameters=params_br,name='br') model_knn = Regressor(dataset=dataset, estimator=knn, parameters=params_knn,name='knn') #blending = pipeline.blend(proportion=0.3,seed=111) params_las = {'alpha':1.7} params_rcv2 = {'cv':5,'normalize':True,'gcv_mode':'auto','scoring':'neg_mean_squared_error'} params_lascv = {'max_iter':500,'cv':8} pipeline = ModelsPipeline(model_rf1,model_knn) stack_ds = pipeline.stack(k=5,seed=111) stacker = Regressor(dataset=stack_ds,estimator=LassoCV, parameters=params_lascv) y_pre = stacker.predict() pipeline2 = ModelsPipeline(model_rf1,model_knn) stack_ds2 = pipeline2.blend(seed=111) blending = Regressor(dataset=stack_ds2,estimator=LassoCV, parameters=params_lascv) y_pre2 = blending.predict() blending_pre.append(y_pre2) #print(y_pre) #y_pre = pipeline.blend() #print(y_pre)
y_train = df_data.loc[df_data['sample']=='train', 'isDefault'] # 数据集划分 # X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2) from heamy.dataset import Dataset from heamy.estimator import Classifier model_dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test) model_xgb = Classifier(dataset=model_dataset, estimator=xgb_model, name='xgb', use_cache=False) model_lgb = Classifier(dataset=model_dataset, estimator=lgb_model, name='lgb', use_cache=False) from heamy.pipeline import ModelsPipeline pipeline = ModelsPipeline(model_xgb, model_lgb) pipeline # 构建第一层新特征,其中k默认是5,表示5折交叉验证,full_test=True,对全部训练集进行训练得到基学习器,然后用基学习器对测试集预测得到新特征 stack_ds = pipeline.stack(k=5, seed=111, full_test=True) from sklearn.linear_model import LogisticRegression # 第二层使用逻辑回归进行stack LogisticRegression(solver='lbfgs') stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={'solver': 'lbfgs'}) # 测试集的预测结果 test_pred = stacker.predict() test_pred """生成提交格式的DataFrame""" df_result = pd.DataFrame({'id': df_data.loc[df_data['sample']=='test', 'id'].values, 'isDefault': test_pred})
# create dataset dataset = Dataset(X_train, y_train, X_test) # initialize RandomForest & LinearRegression model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50}, name='rf') model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr') # Stack two models # Returns new dataset with out-of-fold predictions pipeline = ModelsPipeline(model_rf, model_lr) stack_ds = pipeline.stack(k=10, seed=111) # Train LinearRegression on stacked data (second stage) stacker = Regressor(dataset=stack_ds, estimator=LinearRegression) results = stacker.predict() # Validate results using 10 fold cross-validation results = stacker.validate(k=10, scorer=mean_absolute_error) #blend # load boston dataset from sklearn from sklearn.datasets import load_boston data = load_boston() X, y = data['data'], data['target'] X_train, X_test, y_train, y_test = train_test_split(X, y,
# Boston-housing stacking demo: RF + linear first-stage models, a linear
# second-stage model validated with 10-fold CV, then the raw data is reloaded
# for the blending example that follows.

# load boston dataset from sklearn
from sklearn.datasets import load_boston

data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=111)

# one shared first-stage dataset
dataset = Dataset(X_train, y_train, X_test)

# two first-stage regressors
rf_params = {'n_estimators': 50}
lr_params = {'normalize': True}
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor,
                     parameters=rf_params, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression,
                     parameters=lr_params, name='lr')

# out-of-fold predictions form the second-stage training set
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.stack(k=10, seed=111)

# second stage: linear regression over the stacked features
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
results = stacker.validate(k=10, scorer=mean_absolute_error)

#blend
# reload the raw data for the blending example below
from sklearn.datasets import load_boston

data = load_boston()
X, y = data['data'], data['target']
# ExtraTrees / GradientBoosting parameter sets and the weighted-average
# ensemble. The stacking/blending alternatives are commented out; only
# find_weights + weight is executed.
params_ext = {'max_features':'log2','n_estimators':500,'max_depth':12,'oob_score': True, 'n_jobs':4,'bootstrap':True}
ext = ExtraTreesRegressor
params_gbrt = {'loss':'huber','n_estimators': 400,'max_depth':12,'learning_rate': 0.01, 'random_state': 3}
gbrt = GradientBoostingRegressor
###stacking
model_rf1 = Regressor(dataset=dataset, estimator=rf, parameters=params_rf,name='rf1')
model_ext = Regressor(dataset=dataset, estimator=ext, parameters=params_ext,name='ext')
#model_rf2 = Regressor(dataset=dataset, estimator=rf2, parameters=params_rf2,name='rf2')
model_rcv = Regressor(dataset=dataset, estimator=rcv, parameters=params_rcv,name='rcv')
model_gbrt = Regressor(dataset=dataset, estimator=gbrt, parameters=params_gbrt,name='gbrt')
#model_lascv = Regressor(dataset=dataset, estimator=lascv, parameters=params_lascv,name='lascv')
model_br = Regressor(dataset=dataset, estimator=br, parameters=params_br,name='br')
model_knn = Regressor(dataset=dataset, estimator=knn, parameters=params_knn,name='knn')
model_adb = Regressor(dataset=dataset, estimator=adb, parameters=params_adb,name='adb')
# Only rf1, knn and rcv participate in the ensemble below.
pipeline = ModelsPipeline(model_rf1,model_knn,model_rcv)
#stack_ds = pipeline.stack(k=5,seed=111)
#blending = pipeline.blend(proportion=0.3,seed=111)
params_las = {'alpha':1.7}  # NOTE(review): unused in the active code path
params_rcv2 = {'cv':5,'normalize':True,'gcv_mode':'auto','scoring':'neg_mean_absolute_error'}  # NOTE(review): unused
#stacker = Regressor(dataset=stack_ds,estimator=rcv, parameters=params_rcv2)
#y_pre = stacker.predict()
#print(y_pre)
#y_pre = pipeline.blend()
#print(y_pre)
###
#loss_stack = Evaluation([y_pre],[y_test])
#stacking_pre.append(y_pre)
# Learn per-model weights that minimise MSE, then apply the weighted average.
weights = pipeline.find_weights(mean_squared_error)
#print(weights)
result = pipeline.weight(weights).execute()
# Stacking demo: split the loaded data, fit RF + linear base models, stack
# their out-of-fold predictions, and validate a linear meta-model (10-fold
# CV, mean absolute error).
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=111)

# create dataset
dataset = Dataset(X_train, y_train, X_test)

# first-stage models
forest_params = {'n_estimators': 50}
linear_params = {'normalize': True}
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor,
                     parameters=forest_params, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression,
                     parameters=linear_params, name='lr')

# stack the two models: the new dataset holds out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.stack(k=10, seed=111)

# second stage: linear regression on stacked features
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
results = stacker.validate(k=10, scorer=mean_absolute_error)
#print(results)
# Tail of the first XGBoost model definition (it opens above this chunk).
# Five base regressors are stacked with k=5 out-of-fold predictions; a
# no-intercept LinearRegression meta-model produces the final scores, which
# are min-max scaled for the risk-test submission.
estimator=xgb_feature2,
                       name='xgb2',
                       use_cache=False)
model_xgb3 = Regressor(dataset=xgb_dataset, estimator=xgb_feature3,
                       name='xgb3', use_cache=False)
model_lgb = Regressor(dataset=lgb_dataset, estimator=lgb_feature,
                      name='lgb', use_cache=False)
model_gbdt = Regressor(dataset=xgb_dataset, estimator=gbdt_model,
                       name='gbdt', use_cache=False)
pipeline = ModelsPipeline(model_xgb, model_xgb2, model_xgb3, model_lgb, model_gbdt)
# full_test=True: base models are refit on all training data before predicting
# the test set.
stack_ds = pipeline.stack(k=5, seed=111, add_diff=False, full_test=True)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression,
                    parameters={'fit_intercept': False})
predict_result = stacker.predict()
ans = pd.read_csv('../AI_risk_test_V3.0/test_list.csv', parse_dates=['appl_sbm_tm'])
ans['PROB'] = predict_result
ans = ans.drop(['appl_sbm_tm'], axis=1)
# Min-max scale PROB into [0, 1].
minmin, maxmax = min(ans['PROB']), max(ans['PROB'])
ans['PROB'] = ans['PROB'].map(lambda x: (x - minmin) / (maxmax - minmin))
# Tail of the gdbt2 model definition (it opens above this chunk); more
# first-level tree models, then a seven-model stack with a LogisticRegression
# meta-model.
name='gdbt2')
model_gdbt3 = Classifier(dataset=dataset, estimator=GradientBoostingClassifier,
                         parameters={'n_estimators':600,'loss' : 'deviance', 'max_depth':4,'min_samples_split':10,'min_weight_fraction_leaf':0.01,'learning_rate':0.07,'random_state':3},
                         name='gdbt3')
# NOTE(review): this "xgbt" model passes XGBoost-style parameters (nthread,
# min_child_weight, gamma, colsample_bytree, reg_alpha) to sklearn's
# GradientBoostingClassifier, which does not accept them — fitting it would
# raise TypeError. It is not in the pipeline below, so the bug is latent;
# the estimator was probably meant to be XGBClassifier. Confirm before use.
model_xgbt = Classifier(dataset=dataset, estimator=GradientBoostingClassifier,
                        parameters={'n_estimators' :1350, 'nthread':-1,'max_depth':3,'min_child_weight':6,'learning_rate':0.05, 'gamma':0,'subsample':0.9,'colsample_bytree':0.9,'reg_alpha':8,},name='xgbt')
model_ext1 = Classifier(dataset=dataset,estimator=ExtraTreesClassifier,parameters={'n_estimators':700,'max_depth':39,'n_jobs':-1, 'criterion':'gini','min_samples_split':18},name='ext1')
model_ext2 = Classifier(dataset=dataset,estimator=ExtraTreesClassifier,parameters={'n_estimators':700,'max_depth':39,'n_jobs':-1, 'criterion':'entropy','min_samples_split':18},name='ext2')
# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf1,model_rf2,model_gdbt1,model_gdbt2,model_gdbt3,model_ext1,model_ext2)
stack_ds = pipeline.stack(k=5,seed=111)
# Train LinearRegression on stacked data (second stage)
stacker1 = Classifier(dataset=stack_ds, estimator=LogisticRegression,parameters={'C': 10})
# stacker2 = Classifier(dataset=stack_ds, estimator=LogisticRegression,parameters={'C': 1,'penalty':'l1'})
# stacker3 = Classifier(dataset=stack_ds, estimator=SVC,parameters={'probability':True,'C':100})
# stacker4 = Classifier(dataset=stack_ds, estimator=SVC,parameters={'probability':True,'C':10})
pre_y1 = stacker1.predict()
# pre_y2 = stacker2.predict()
# pre_y3 = stacker3.predict()
# pre_y4 = stacker4.predict()
#print(pre_y)
# Compute AUC:
# fpr, tpr, thresholds = metrics.roc_curve(test_y, pre_y1)
# Boston-housing stacking walkthrough (CamelCase variable convention):
# RandomForest + LinearRegression first stage, LinearRegression second stage,
# 10-fold MAE validation.
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=2)

# wrap the split arrays for heamy
Data = Dataset(X_train, y_train, X_test)

# two first-stage regressors sharing the same dataset
RfModel = Regressor(dataset=Data,
                    estimator=RandomForestRegressor,
                    parameters={'n_estimators': 50},
                    name='rf')
LRModel = Regressor(dataset=Data,
                    estimator=LinearRegression,
                    parameters={'normalize': True},
                    name='lr')

# out-of-fold predictions become the second-stage dataset
Pipeline = ModelsPipeline(RfModel, LRModel)
StackModel = Pipeline.stack(k=10, seed=2)

# second stage: plain linear regression over the stacked features
Stacker = Regressor(dataset=StackModel, estimator=LinearRegression)
Results = Stacker.predict()
Results = Stacker.validate(k=10, scorer=mean_absolute_error)
# -*- coding: utf-8 -*-
# Blending demo on the Boston housing data: RF + linear base models blended
# on a 20% holdout, with a LinearRegression meta-model validated by 10-fold
# CV on mean absolute error.
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline
# FIX: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Load the dataset
# NOTE(review): load_boston itself was removed in scikit-learn 1.2 — consider
# fetch_california_housing for a drop-in demo dataset.
from sklearn.datasets import load_boston
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)

# Create the heamy dataset
dataset = Dataset(X_train, y_train, X_test)
# Create the RF and LR base models
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor,
                     parameters={'n_estimators': 50}, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression,
                     parameters={'normalize': True}, name='lr')

# Blend the two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.blend(proportion=0.2, seed=111)
# Second layer: LinearRegression stacker
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
# 10-fold cross-validation results
results10 = stacker.validate(k=10, scorer=mean_absolute_error)