Example #1
    xgb_params = {
        'learning_rate': 0.05,
        'max_depth': 6,
        'n_estimators': 500,
        # 'num_class': 5,
        'objective': 'multi:softprob',
        'subsample': 0.8}

    model_xgb = Classifier(dataset=dataset_full, estimator=xgb.sklearn.XGBClassifier, parameters=xgb_params, name='xgb')
    model_xgb_f = Classifier(dataset=dataset_f, estimator=xgb.sklearn.XGBClassifier, parameters=xgb_params, name='xgb_f')
    model_rf = Classifier(dataset=dataset_f, estimator=RandomForestClassifier, parameters={'n_estimators': 700},
                          name='rf')

    pipeline = ModelsPipeline(
        # model_xgb,
        model_rf,
        model_xgb_f)

    stack_ds = pipeline.stack(k=5, full_test=True, seed=111)
    stacker = Classifier(stack_ds, LogisticRegression)
    stacker.validate(k=5, scorer=log_loss)

    # logging.info(val_results)
    #
    # # logging.info(train_x.head(10))
    #
    # print(test_x.columns.difference(train_x.columns))

    #
    # boosters = np.array([])
    # predictions = []
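
A minimal sklearn-only sketch of what the second-stage validation above approximates, with toy data standing in for stack_ds (names and sizes are illustrative): out-of-fold probabilities from the meta-learner, scored with log_loss.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_predict

X_stack, y = make_classification(n_samples=500, n_classes=3, n_informative=6, random_state=0)
oof_proba = cross_val_predict(LogisticRegression(max_iter=1000), X_stack, y, cv=5, method='predict_proba')
print(log_loss(y, oof_proba))  # lower is better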
Example #2
                           'reg_alpha': 0.001,
                           'colsample_bytree': 0.5,
                           'min_child_samples': 24,
                       },
                       name='lgb')

model_lgb2 = Classifier(dataset=dataset,
                        estimator=lgb.LGBMClassifier,
                        parameters={
                            'n_estimators': 70,
                            'boosting_type': 'gbdt',
                            'max_depth': 6,
                            'min_child_weight': 0.001,
                            'num_leaves': 30,
                            'seed': 128,
                            'reg_alpha': 0.001,
                            'reg_lambda': 0.002,
                            'colsample_bytree': 0.5,
                            'min_child_samples': 24
                        },
                        name='lgb')

model_lg = Classifier(dataset=dataset, estimator=LogisticRegression, name='lg')

pipeline = ModelsPipeline(model_lgb, model_lgb2, model_xgb, model_xgb2,
                          model_lg)
stack_ds = pipeline.stack(k=10, seed=111)
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression)
stacker.validate(k=10, scorer=roc_auc_score)
results = stacker.predict()
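
A hedged sanity check of the scorer above: for binary targets, roc_auc_score is computed on the positive-class probability (toy data, illustrative only).

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, random_state=1)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.25, random_state=1)
proba = LogisticRegression(max_iter=1000).fit(X_tr, y_tr).predict_proba(X_val)
print(roc_auc_score(y_val, proba[:, 1]))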
Example #3
    ###stacking
    model_rf = Regressor(dataset=dataset,
                         estimator=rf,
                         parameters=params_rf,
                         name='rf')
    #model_gbrt = Regressor(dataset=dataset, estimator=gbrt, parameters=params_gbrt,name='gbrt')
    model_ext = Regressor(dataset=dataset,
                          estimator=ext,
                          parameters=params_ext,
                          name='ext')
    model_rcv = Regressor(dataset=dataset,
                          estimator=rcv,
                          parameters=params_rcv,
                          name='rcv')
    #model_lascv = Regressor(dataset=dataset, estimator=lascv, parameters=params_lascv,name='lascv')
    pipeline = ModelsPipeline(model_rf, model_rcv, model_ext)
    stack_ds = pipeline.stack(k=5, seed=111)

    stacker = Regressor(dataset=stack_ds,
                        estimator=Lasso,
                        parameters=params_las)
    y_pre = stacker.predict()
    y_pre_last = np.append(y_pre, y_pre)
    y_pre_last[10] *= 1.08  # the original was a bare expression with no effect; in-place scaling is the likely intent
    ###
    #loss_gbrt = Evaluation([y_pre_gbrt],[y_test])
    output(fw, i + 1, y_pre_last)
    '''
    if loss_gbrt>0.015:
        output(fw_gbrt,i+1,y_pre_rf)
        fw_gbrt.write(str(i+1)+',gbrt,'+str(loss_gbrt)+'\n')
Example #4
                           'max_features': 'sqrt',
                           'min_samples_leaf': 15,
                           'min_samples_split': 10
                       },
                       name='gbdt')
model_xgb = Regressor(dataset=dataset,
                      estimator=xgb.XGBRegressor,
                      parameters={
                          'n_estimators': 50,
                          'learning_rate': 0.05,
                          'max_depth': 3
                      },
                      name='xgb')

# Stack the four models
pipeline = ModelsPipeline(model_lr, model_rf, model_gbdt, model_xgb)
stack_ds = pipeline.stack(k=10, seed=111)
# Second stage: stack with an XGBoost model
stacker = Regressor(dataset=stack_ds, estimator=xgb.XGBRegressor)
results = stacker.predict()
# 10-fold cross-validation results
results10 = stacker.validate(k=10, scorer=mean_squared_error)
print("r2_score: %f" % r2_score(y_test, results))

test_y = pd.DataFrame(y_test)
predictions = pd.DataFrame(results)
data = pd.concat([data_XX, data_drop], axis=1)
data = pd.concat([data, data_y], axis=1)
data = pd.concat([data, predictions], axis=1)
data = np.array(data)
with open('C:/20180402_pre_test.csv', 'w') as f:
Example #5
                                                        test_size=0.1,
                                                        random_state=111)
    return X_train, y_train, X_test, y_test


dataset = Dataset(preprocessor=boston_dataset, use_cache=True)
model = Regressor(dataset=dataset,
                  estimator=LinearRegression,
                  parameters={'normalize': True},
                  name='lr')
model_2 = Regressor(dataset=dataset,
                    estimator=RandomForestRegressor,
                    parameters={'n_estimators': 50},
                    name='rf')

pipeline = ModelsPipeline(model, model_2)


def test_apply():
    output = pipeline.apply(lambda x: np.mean(x, axis=0)).execute()
    assert output.shape[0] == dataset.X_test.shape[0]

    output = pipeline.apply(lambda x: np.mean(x, axis=0)).validate(
        scorer=mean_absolute_error, k=10)
    assert len(output) == 10


def test_simple_functions():
    assert dataset.X_test.shape[0] == pipeline.max().execute().shape[0]
    assert dataset.X_test.shape[0] == pipeline.mean().execute().shape[0]
    assert dataset.X_test.shape[0] == pipeline.gmean().execute().shape[0]
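
What the reductions in test_simple_functions compute, in plain numpy (a sketch; gmean is assumed to match scipy.stats.gmean, consistent with the np.mean(x, axis=0) used in test_apply above):

import numpy as np
from scipy.stats import gmean

preds = np.array([[3.0, 2.0, 5.0],   # hypothetical predictions from model 'lr'
                  [4.0, 2.5, 4.0]])  # hypothetical predictions from model 'rf'
print(preds.mean(axis=0))    # pipeline.mean()
print(preds.max(axis=0))     # pipeline.max()
print(gmean(preds, axis=0))  # pipeline.gmean()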
Example #6
                estimator=ExtraTreesClassifier,
                parameters=et_param,
                name='et')
lgb = Classifier(dataset=dataset,
                 estimator=LGBMClassifier,
                 use_cache=CACHE,
                 parameters=lgb_param,
                 name='lgb')
lr = Classifier(dataset=dataset,
                estimator=LogisticRegression,
                use_cache=CACHE,
                parameters=lr_param,
                name='lr')
#------------------------------------------------------------------------------
#Stack the models and return a new dataset with out-of-fold predictions
pipeline = ModelsPipeline(knn, rf, et, lgb, lr)
stack_ds = pipeline.stack(k=NFOLDS, seed=1)

# Train LogisticRegression on stacked data (second stage)
lr1 = LogisticRegression
lr1_params = {
    'C': 5,
    'random_state': 1,
    'solver': 'liblinear',
    'multi_class': 'ovr',
}
stacker = Classifier(dataset=stack_ds,
                     estimator=lr1,
                     use_cache=False,
                     parameters=lr1_params)
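
A note on how these wrappers consume parameters (a sketch, not heamy's verbatim internals): the estimator class is instantiated with the parameter dict, so the second stage above behaves roughly like:

clf = lr1(**lr1_params)  # i.e. LogisticRegression(C=5, random_state=1, solver='liblinear', multi_class='ovr')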
Example #7
                       use_cache=class_use_cache)
model_mlp = Classifier(dataset=dataset,
                       estimator=MLPClassifier,
                       name="mlp",
                       use_cache=class_use_cache)
model_sgt = Classifier(dataset=dataset,
                       estimator=SGDClassifier,
                       parameters={'penalty': 'l1'},
                       name="sgd",
                       use_cache=class_use_cache)

# Stack the models
# Returns a new dataset with out-of-fold predictions
logging.info('stack_ds....')
# pipeline = ModelsPipeline(model_nb,model_lr,model_svc)
pipeline = ModelsPipeline(model_sgt)
# pipeline = ModelsPipeline(model_nb),model_nb,model_lr,model_lr2
stack_ds = pipeline.stack(k=8, seed=111)
# Second stage: stack with a LinearSVC model
logging.info('second layer....')
stacker = Classifier(dataset=stack_ds,
                     estimator=svm.LinearSVC,
                     use_cache=False,
                     probability=False)
results = stacker.predict()

# 3-fold cross-validation results
results10 = stacker.validate(k=3, scorer=accuracy_score)
logging.info(results10)

result_list = list(results + 1)
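
# Two asides on the snippet above: svm.LinearSVC exposes decision_function but
# no predict_proba, which is presumably why probability=False is passed to the
# second-stage Classifier; and the +1 (an assumption about intent) shifts
# 0-based predicted class indices back to 1-based labels for export.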
Example #8
File: test3.py, Project: thajime0125/keiba
              estimator=LinearRegression,
              parameters={'normalize': True},
              name='lr'),
    Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),
    Regressor(dataset=dataset,
              estimator=CatBoostRegressor,
              parameters={
                  'custom_metric': ['MAE'],
                  'random_seed': seed,
                  'logging_level': 'Silent'
              },
              name='cr')
]

# Define the pipeline and build the 2nd-level dataset
pipeline = ModelsPipeline(*models)
stack_ds = pipeline.stack(k=10, seed=seed)

# Build the second-stage model and run validation
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
y_trues, y_preds = stacker.validate(k=10)

# Output accuracy
# Predict on X_test
y_pred = stacker.predict()
print(y_pred)

sum = 0
buy = 0
for i, yosoku in enumerate(x_test):
    if stacker.predict(x_test[i]) < 0.3:
Example #9
fs = ['xgb1','xgb2','xgb3','et','svm','lr','lgb','gbdt']  
  
import matplotlib.pyplot as plt  
  
def plot_confusion_matrix(cm, title, cmap=plt.cm.Blues):  
    plt.imshow(cm, interpolation='nearest', cmap=cmap)  
    plt.title(title)  
    plt.colorbar()  
    tick_marks = np.arange(len(fs))
    plt.xticks(tick_marks, fs, rotation=45)  
    plt.yticks(tick_marks, fs)  
    plt.tight_layout()  
  
plot_confusion_matrix(cm, title='mic')  
plt.show() 
model_xgb2 = Regressor(dataset=dataset, estimator=xgb_feature2, name='xgb2', use_cache=False)
model_lr = Regressor(dataset=dataset, estimator=logistic_model, name='lr', use_cache=False)
model_lgb = Regressor(dataset=dataset, estimator=lgb_model, name='lgb', use_cache=False)
model_gbdt = Regressor(dataset=dataset, estimator=gbdt_model, name='gbdt', use_cache=False)
pipeline = ModelsPipeline(model_xgb2, model_lr, model_lgb, model_svm)  
stack_data = pipeline.stack(k=5, seed=0, add_diff=False, full_test=True)  
stacker = Regressor(dataset=stack_data,estimator=LinearRegression,
                      parameters={'fit_intercept': False})  
predict_result = stacker.predict()
val = pd.read_csv('val_list.csv')
val['PROB'] = predict_result
minmin, maxmax = min(val['PROB']), max(val['PROB'])
val['PROB'] = val['PROB'].map(lambda x: (x - minmin) / (maxmax - minmin))
val['PROB'] = val['PROB'].map(lambda x: '%.4f' % x)
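
The two .map calls above can be collapsed into a single vectorized pass; a sketch with the same semantics (frame contents are illustrative):

import pandas as pd

val = pd.DataFrame({'PROB': [0.2, 0.5, 0.9]})
lo, hi = val['PROB'].min(), val['PROB'].max()
val['PROB'] = ((val['PROB'] - lo) / (hi - lo)).map('{:.4f}'.format)
print(val)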

Example #10
File: stacking.py, Project: godkillok/heamy
                           'n_jobs': -1
                       },
                       name='Perceptron',
                       use_cache=class_use_cache)
model_svc = Classifier(dataset=dataset,
                       estimator=svm.LinearSVC,
                       name="LinearSVC",
                       use_cache=class_use_cache)

# Stack the model
# Returns a new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_nb)
stack_ds = pipeline.stack(k=3, seed=111)
print(stack_ds.X_train.shape)
# Second stage: stack with a logistic-regression model
stacker = Classifier(dataset=stack_ds,
                     estimator=LogisticRegression,
                     parameters={
                         'C': 4,
                         'dual': True,
                         'n_jobs': -1
                     },
                     use_cache=False,
                     probability=False)
results = stacker.predict()
result_list = list(results)
Example #11
model_rf = Regressor(dataset=dataset,
                     estimator=RandomForestRegressor,
                     parameters={'n_estimators': 50},
                     name='rf')
model_lr = Regressor(dataset=dataset,
                     estimator=LinearRegression,
                     parameters={'normalize': True},
                     name='lr')
model_knn = Regressor(dataset=dataset,
                      estimator=KNeighborsRegressor,
                      parameters={'n_neighbors': 15},
                      name='knn')
model_lgt = Regressor(dataset=dataset,
                      estimator=LogisticRegression,
                      parameters={'penalty': 'l2'},
                      name='lgt')
xgbclf = Classifier(dataset=dataset, estimator=XGBClassifier)

# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr, model_knn, xgbclf)
weights = pipeline.find_weights(mean_absolute_error)
result = pipeline.weight(weights).execute()  # .execute() materializes the weighted blend (cf. Example #17)
stack_ds = pipeline.stack(k=10, seed=111)

# Then, train LinearRegression on stacked data
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)  # the second stage trains on the stacked features, not the raw dataset
results = stacker.predict()

results = stacker.validate(k=10, scorer=mean_absolute_error)
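
What the find_weights/weight pair above approximates, hand-rolled with scipy (a sketch, not heamy's exact optimizer): choose convex weights that minimize the scorer on held-out targets.

import numpy as np
from scipy.optimize import minimize

P = np.array([[3.0, 2.0, 5.0],    # hypothetical test predictions, one model per row
              [4.0, 2.5, 4.0],
              [3.5, 2.2, 4.8]])
y_val = np.array([3.6, 2.3, 4.7])  # hypothetical targets
loss = lambda w: np.mean(np.abs(y_val - w @ P))  # mean_absolute_error of the blend
n = P.shape[0]
res = minimize(loss, np.full(n, 1.0 / n), bounds=[(0.0, 1.0)] * n,
               constraints={'type': 'eq', 'fun': lambda w: w.sum() - 1.0})
print(res.x, loss(res.x))  # learned weights and blended MAE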
Example #12
model_xgb = Classifier(dataset=dataset,
                       estimator=XGBClassifier,
                       parameters={
                           'subsample': 0.6,
                           'colsample_bytree': 0.6,
                           'random_state': 27,
                           'n_jobs': 1
                       },
                       name="xgb",
                       use_cache=class_use_cache)

# Stack the models
# Returns a new dataset with out-of-fold predictions
logging.info('stack_ds....')
# pipeline = ModelsPipeline(model_nb,model_lr,model_svc)
pipeline = ModelsPipeline(model_svc, model_lr)
# pipeline = ModelsPipeline(model_nb),model_nb,model_lr,model_lr2
stack_ds = pipeline.stack(k=8, seed=111)
# Second stage: stack with a LinearSVC model
logging.info('second layer....')
stacker = Classifier(dataset=stack_ds,
                     estimator=svm.LinearSVC,
                     use_cache=False,
                     probability=False)
results = stacker.predict()

# 8-fold cross-validation results
results10 = stacker.validate(k=8, scorer=accuracy_score)
logging.info(results10)

# print(accuracy_score(y_test, results))
Example #13
    ###stacking
    model_rf1 = Regressor(dataset=dataset, estimator=rf, parameters=params_rf,name='rf1')
    #model_ext = Regressor(dataset=dataset, estimator=ext, parameters=params_ext,name='ext')
    #model_rf2 = Regressor(dataset=dataset, estimator=rf2, parameters=params_rf2,name='rf2')
    model_rcv = Regressor(dataset=dataset, estimator=rcv, parameters=params_rcv,name='rcv')
    #model_gbrt = Regressor(dataset=dataset, estimator=gbrt, parameters=params_gbrt,name='gbrt')
    #model_lascv = Regressor(dataset=dataset, estimator=lascv, parameters=params_lascv,name='lascv')
    model_br = Regressor(dataset=dataset, estimator=br, parameters=params_br,name='br')
    model_knn = Regressor(dataset=dataset, estimator=knn, parameters=params_knn,name='knn')
    
    #blending = pipeline.blend(proportion=0.3,seed=111)
    params_las = {'alpha':1.7}
    params_rcv2 = {'cv':5,'normalize':True,'gcv_mode':'auto','scoring':'neg_mean_squared_error'}
    params_lascv = {'max_iter':500,'cv':8}

    pipeline = ModelsPipeline(model_rf1,model_knn)
    stack_ds = pipeline.stack(k=5,seed=111)
    stacker = Regressor(dataset=stack_ds,estimator=LassoCV, parameters=params_lascv)
    y_pre = stacker.predict()

    pipeline2 = ModelsPipeline(model_rf1,model_knn)
    stack_ds2 = pipeline2.blend(seed=111)
    blending =  Regressor(dataset=stack_ds2,estimator=LassoCV, parameters=params_lascv)
    y_pre2 = blending.predict()
    blending_pre.append(y_pre2)


    #print(y_pre)
    #y_pre = pipeline.blend()
    #print(y_pre)
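
The two strategies used above differ only in how the second-stage rows are produced; a compact sklearn-only contrast on toy data (a sketch, not heamy internals):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict, train_test_split

X, y = make_regression(n_samples=300, random_state=0)
# stack(): out-of-fold predictions cover every training row
oof = cross_val_predict(RandomForestRegressor(n_estimators=20, random_state=0), X, y, cv=5)
# blend(): a single holdout slice supplies the second-stage rows
X_base, X_hold, y_base, y_hold = train_test_split(X, y, test_size=0.2, random_state=0)
hold_pred = RandomForestRegressor(n_estimators=20, random_state=0).fit(X_base, y_base).predict(X_hold)
print(oof.shape, hold_pred.shape)  # (300,) vs (60,)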
Example #14
y_train = df_data.loc[df_data['sample']=='train', 'isDefault']
# Train/validation split
# X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)


from heamy.dataset import Dataset
from heamy.estimator import Classifier

model_dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test)
model_xgb = Classifier(dataset=model_dataset, estimator=xgb_model, name='xgb', use_cache=False)
model_lgb = Classifier(dataset=model_dataset, estimator=lgb_model, name='lgb', use_cache=False)

from heamy.pipeline import ModelsPipeline

pipeline = ModelsPipeline(model_xgb, model_lgb)
pipeline

# Build the first-level features. k defaults to 5 (5-fold CV); with full_test=True each base learner is refit on the full training set and then predicts the test set to produce its new feature
stack_ds = pipeline.stack(k=5, seed=111, full_test=True)
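# A hand-rolled equivalent of the first-level features described above, for
# intuition only (sklearn sketch; clf stands in for one base learner such as
# xgb_model, so the names are hypothetical):
#   from sklearn.model_selection import cross_val_predict
#   oof = cross_val_predict(clf, X_train, y_train, cv=5, method='predict_proba')[:, 1]  # train-side feature
#   test_feat = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]                   # full_test=True behaviour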

from sklearn.linear_model import LogisticRegression
# Second stage: stack with logistic regression
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={'solver': 'lbfgs'})
# Predictions on the test set
test_pred = stacker.predict()
test_pred

"""生成提交格式的DataFrame"""
df_result = pd.DataFrame({'id': df_data.loc[df_data['sample']=='test', 'id'].values, 'isDefault': test_pred})
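# A typical follow-up for the submission frame above (the path is
# illustrative, not from the source): df_result.to_csv('submission.csv', index=False)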
Example #15
# create dataset
dataset = Dataset(X_train, y_train, X_test)

# initialize RandomForest & LinearRegression
model_rf = Regressor(dataset=dataset,
                     estimator=RandomForestRegressor,
                     parameters={'n_estimators': 50},
                     name='rf')
model_lr = Regressor(dataset=dataset,
                     estimator=LinearRegression,
                     parameters={'normalize': True},
                     name='lr')

# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.stack(k=10, seed=111)

# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
# Validate results using 10 fold cross-validation
results = stacker.validate(k=10, scorer=mean_absolute_error)

#blend
# load boston dataset from sklearn
from sklearn.datasets import load_boston
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
Example #16
# load boston dataset from sklearn
from sklearn.datasets import load_boston
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)

# create dataset
dataset = Dataset(X_train,y_train,X_test)

# initialize RandomForest & LinearRegression
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50},name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True},name='lr')

# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf,model_lr)
stack_ds = pipeline.stack(k=10,seed=111)

# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
# Validate results using 10 fold cross-validation
results = stacker.validate(k=10,scorer=mean_absolute_error)



#blend
# load boston dataset from sklearn
from sklearn.datasets import load_boston
data = load_boston()
X, y = data['data'], data['target']
Example #17
  params_ext = {'max_features':'log2','n_estimators':500,'max_depth':12,'oob_score': True, 'n_jobs':4,'bootstrap':True}
  ext = ExtraTreesRegressor
 
  params_gbrt = {'loss':'huber','n_estimators': 400,'max_depth':12,'learning_rate': 0.01, 'random_state': 3}
  gbrt = GradientBoostingRegressor
  ###stacking
  model_rf1 = Regressor(dataset=dataset, estimator=rf, parameters=params_rf,name='rf1')
  model_ext = Regressor(dataset=dataset, estimator=ext, parameters=params_ext,name='ext')
  #model_rf2 = Regressor(dataset=dataset, estimator=rf2, parameters=params_rf2,name='rf2')
  model_rcv = Regressor(dataset=dataset, estimator=rcv, parameters=params_rcv,name='rcv')
  model_gbrt = Regressor(dataset=dataset, estimator=gbrt, parameters=params_gbrt,name='gbrt')
  #model_lascv = Regressor(dataset=dataset, estimator=lascv, parameters=params_lascv,name='lascv')
  model_br = Regressor(dataset=dataset, estimator=br, parameters=params_br,name='br')
  model_knn = Regressor(dataset=dataset, estimator=knn, parameters=params_knn,name='knn')
  model_adb = Regressor(dataset=dataset, estimator=adb, parameters=params_adb,name='adb')
  pipeline = ModelsPipeline(model_rf1,model_knn,model_rcv)
  #stack_ds = pipeline.stack(k=5,seed=111)
  #blending = pipeline.blend(proportion=0.3,seed=111)
  params_las = {'alpha':1.7}
  params_rcv2 = {'cv':5,'normalize':True,'gcv_mode':'auto','scoring':'neg_mean_absolute_error'}
  #stacker = Regressor(dataset=stack_ds,estimator=rcv, parameters=params_rcv2)
  #y_pre = stacker.predict()
  #print(y_pre)
  #y_pre = pipeline.blend()
  #print(y_pre)
  ###
  #loss_stack = Evaluation([y_pre],[y_test])
  #stacking_pre.append(y_pre)
  weights = pipeline.find_weights(mean_squared_error)
  #print(weights)
  result = pipeline.weight(weights).execute()
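  # Numpy reading of the line above (a sketch): with each model's test
  # predictions stacked row-wise in a matrix P, weight(weights).execute()
  # amounts to `result = weights @ P`.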
Example #18
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=111)

# create dataset
dataset = Dataset(X_train, y_train, X_test)

# initialize RandomForest & LinearRegression
model_rf = Regressor(dataset=dataset,
                     estimator=RandomForestRegressor,
                     parameters={'n_estimators': 50},
                     name='rf')
model_lr = Regressor(dataset=dataset,
                     estimator=LinearRegression,
                     parameters={'normalize': True},
                     name='lr')

# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.stack(k=10, seed=111)

# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
# Validate results using 10 fold cross-validation
results = stacker.validate(k=10, scorer=mean_absolute_error)

#print(results)
Example #19
                        estimator=xgb_feature2,
                        name='xgb2',
                        use_cache=False)
 model_xgb3 = Regressor(dataset=xgb_dataset,
                        estimator=xgb_feature3,
                        name='xgb3',
                        use_cache=False)
 model_lgb = Regressor(dataset=lgb_dataset,
                       estimator=lgb_feature,
                       name='lgb',
                       use_cache=False)
 model_gbdt = Regressor(dataset=xgb_dataset,
                        estimator=gbdt_model,
                        name='gbdt',
                        use_cache=False)
 pipeline = ModelsPipeline(model_xgb, model_xgb2, model_xgb3, model_lgb,
                           model_gbdt)
 stack_ds = pipeline.stack(k=5,
                           seed=111,
                           add_diff=False,
                           full_test=True)
 stacker = Regressor(dataset=stack_ds,
                     estimator=LinearRegression,
                     parameters={'fit_intercept': False})
 predict_result = stacker.predict()
 ans = pd.read_csv('../AI_risk_test_V3.0/test_list.csv',
                   parse_dates=['appl_sbm_tm'])
 ans['PROB'] = predict_result
 ans = ans.drop(['appl_sbm_tm'], axis=1)
 minmin, maxmax = min(ans['PROB']), max(ans['PROB'])
 ans['PROB'] = ans['PROB'].map(lambda x: (x - minmin) /
                               (maxmax - minmin))
Example #20
							name='gdbt2')
model_gdbt3 = Classifier(dataset=dataset, estimator=GradientBoostingClassifier, parameters={'n_estimators':600,'loss' : 'deviance',
							'max_depth':4,'min_samples_split':10,'min_weight_fraction_leaf':0.01,'learning_rate':0.07,'random_state':3},
							name='gdbt3')							
model_xgbt = Classifier(dataset=dataset, estimator=XGBClassifier, parameters={'n_estimators': 1350,
							'nthread': -1, 'max_depth': 3, 'min_child_weight': 6, 'learning_rate': 0.05,
							'gamma': 0, 'subsample': 0.9, 'colsample_bytree': 0.9, 'reg_alpha': 8}, name='xgbt')  # XGBoost-specific parameters; GradientBoostingClassifier would reject them
model_ext1 = Classifier(dataset=dataset,estimator=ExtraTreesClassifier,parameters={'n_estimators':700,'max_depth':39,'n_jobs':-1,
							'criterion':'gini','min_samples_split':18},name='ext1')
model_ext2 = Classifier(dataset=dataset,estimator=ExtraTreesClassifier,parameters={'n_estimators':700,'max_depth':39,'n_jobs':-1,
							'criterion':'entropy','min_samples_split':18},name='ext2')


# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf1,model_rf2,model_gdbt1,model_gdbt2,model_gdbt3,model_ext1,model_ext2)
stack_ds = pipeline.stack(k=5,seed=111)

# Train LinearRegression on stacked data (second stage)
stacker1 = Classifier(dataset=stack_ds, estimator=LogisticRegression,parameters={'C': 10})
# stacker2 = Classifier(dataset=stack_ds, estimator=LogisticRegression,parameters={'C': 1,'penalty':'l1'})
# stacker3 = Classifier(dataset=stack_ds, estimator=SVC,parameters={'probability':True,'C':100})
# stacker4 = Classifier(dataset=stack_ds, estimator=SVC,parameters={'probability':True,'C':10})
pre_y1 = stacker1.predict()
# pre_y2 = stacker2.predict()
# pre_y3 = stacker3.predict()
# pre_y4 = stacker4.predict()

#print(pre_y)
# Compute AUC
# fpr, tpr, thresholds = metrics.roc_curve(test_y, pre_y1)
Example #21
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)

# create dataset
Data = Dataset(X_train,y_train,X_test)

# initialize RandomForest & LinearRegression
RfModel = Regressor(dataset=Data, estimator=RandomForestRegressor, parameters={'n_estimators': 50},name='rf')
LRModel = Regressor(dataset=Data, estimator=LinearRegression, parameters={'normalize': True},name='lr')


# Stack two models
# Returns new dataset with out-of-fold predictions
Pipeline = ModelsPipeline(RfModel,LRModel)
StackModel = Pipeline.stack(k=10,seed=2)

# Train LinearRegression on stacked data (second stage)
Stacker = Regressor(dataset=StackModel, estimator=LinearRegression)
Results = Stacker.predict()
# Validate results using 10 fold cross-validation
Results = Stacker.validate(k=10,scorer=mean_absolute_error)
Example #22
# -*- coding: utf-8 -*-
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
# Load the dataset
from sklearn.datasets import load_boston
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)
# Create the dataset
dataset = Dataset(X_train,y_train,X_test)
# Create the RF and LR models
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50},name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True},name='lr')
# Blend the two models
# Returns a new dataset built from holdout predictions
pipeline = ModelsPipeline(model_rf,model_lr)
stack_ds = pipeline.blend(proportion=0.2,seed=111)
# Second stage: stack with an LR model
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
# 10-fold cross-validation results
results10 = stacker.validate(k=10,scorer=mean_absolute_error)
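
# Note on proportion=0.2 above (a reading of the argument, not verified
# against heamy internals): 20% of the training data is held out; the base
# models fit on the remaining 80%, and their holdout predictions become the
# second-stage training features.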