Example No. 1
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRFRegressor

def fit_model(X, y):
    # Fit an XGBoost random-forest regressor and report in-sample errors.
    model = XGBRFRegressor(n_estimators=1000, max_depth=7, random_state=42)
    model.fit(X, y)
    y_pred = model.predict(X)
    err_mae = mean_absolute_error(y, y_pred)
    err_rmse = np.sqrt(mean_squared_error(y, y_pred))
    return model, y_pred, err_mae, err_rmse
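A minimal usage sketch, assuming X_train/X_test, y_train/y_test already exist; note that fit_model scores its own training data, so the returned errors are in-sample and optimistic:

# Hypothetical usage; compare in-sample errors against a held-out RMSE.
model, y_pred, err_mae, err_rmse = fit_model(X_train, y_train)
test_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
print(f"train MAE={err_mae:.3f}, train RMSE={err_rmse:.3f}, test RMSE={test_rmse:.3f}")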
Example No. 2
class XGBRFRegressorOptimizer(BaseOptimizer):
    def __init__(self, src_file_index, bounds):
        self.model = XGBRFRegressor()
        self.model_name = "XGBRFRegressor"
        self.src = util.get_src_file(src_file_index=src_file_index)
        self.lower_bounds = bounds["lower_bounds"]
        self.upper_bounds = bounds["upper_bounds"]
        self.with_rain = False
        self.optimization_methods = optimization_methods
        self.num_iterations = 200
        self.results = {}
        self.result_save_path = 'optimization_result/with_rain_' + str(self.with_rain) + '/' + self.src.split('.')[0].split('/')[-1] + '/'
        self.optimization()
        self.save_optimization_result()

    def objective_function(self, x):
        print("Optimizing XGBRegressor...")
        train_x, test_x, train_y, test_y = util.get_train_test_split(
            self.src, int(np.round(x[0])), int(np.round(x[1])), with_rain=self.with_rain)
        print(self.model_name)
        self.tune_params = ['offset', 'period', 'max_depth',
                            # 'learning_rate',
                            'n_estimators',
                            'gamma',
                            'min_child_weight', 'max_delta_step', 'subsample',
                            'colsample_bytree', 'colsample_bylevel', 'colsample_bynode', 'reg_alpha',
                            'reg_lambda', 'scale_pos_weight', 'base_score']
        self.model.max_depth = int(x[2])
        self.model.n_estimators = int(x[3])
        self.model.gamma = x[4]
        self.model.min_child_weight = int(x[5])
        self.model.max_delta_step = int(x[6])
        self.model.subsample = x[7]
        self.model.colsample_bytree = x[8]
        self.model.colsample_bylevel = x[9]
        self.model.colsample_bynode = x[10]
        self.model.reg_alpha = x[11]
        self.model.reg_lambda = x[12]
        self.model.scale_pos_weight = x[13]
        self.model.base_score = x[14]
        self.model.objective = 'reg:squarederror'
        self.model.learning_rate = 0.001
        self.model.fit(X=train_x, y=train_y)
        y_hat = self.model.predict(test_x)
        mse = mean_squared_error(test_y, y_hat)
        return mse
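A minimal sketch of the bounds dictionary the constructor expects, assuming util and optimization_methods come from the surrounding module; the 15 entries line up with x[0]..x[14] in objective_function, and the concrete ranges here are illustrative assumptions, not values from the original:

# Hypothetical bounds, one (lower, upper) pair per tuned dimension x[0]..x[14]:
# offset, period, max_depth, n_estimators, gamma, min_child_weight,
# max_delta_step, subsample, colsample_bytree, colsample_bylevel,
# colsample_bynode, reg_alpha, reg_lambda, scale_pos_weight, base_score
bounds = {
    "lower_bounds": [0,   1,  2,   50, 0.0,  1,  0, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0,  0.1, 0.1],
    "upper_bounds": [30, 24, 12, 1000, 5.0, 10, 10, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 10.0, 0.9],
}
optimizer = XGBRFRegressorOptimizer(src_file_index=0, bounds=bounds)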
Example No. 3
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    shuffle=True,
                                                    train_size=0.8,
                                                    random_state=66)

model = XGBRFRegressor(n_jobs=-1)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print('R2', score)

thresholds = np.sort(model.feature_importances_)  # sort feature importances ascending
print(thresholds)

for thresh in thresholds:
    # Raise the importance threshold step by step, dropping one more feature each pass
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    select_x_train = selection.transform(x_train)  # training set with the reduced feature set

    selection_model = XGBRFRegressor(n_jobs=-1)
    selection_model.fit(select_x_train, y_train)

    select_x_test = selection.transform(x_test)  # test set with the same reduced feature set
    y_predict = selection_model.predict(select_x_test)

    score = r2_score(y_test, y_predict)

    print("Thresh=%.3f, n=%d, R2: %.2f%%" %
          (thresh, select_x_train.shape[1], score * 100.0))

score = model.score(x_test, y_test)
print('R2', score)
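SelectFromModel also accepts named thresholds, which avoids scanning every importance value; a short sketch using the same fitted model and data as above:

# Alternative: a single built-in threshold instead of looping over all importances.
# "median" keeps the most important half of the features; "mean" also works.
selection = SelectFromModel(model, threshold="median", prefit=True)
print("kept features:", selection.transform(x_train).shape[1])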
Example No. 4
temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for HOM_SIDO_NM in HOM_SIDO_NMs:
            for AGE in AGEs:
                for SEX_CTGO_CD in SEX_CTGO_CDs:
                    for FLC in FLCs:
                        for year in years:
                            for month in months:
                                temp.append([CARD_SIDO_NM, STD_CLSS_NM, HOM_SIDO_NM, AGE, SEX_CTGO_CD, FLC, year, month])
temp = np.array(temp)
temp = pd.DataFrame(data=temp, columns=train_features.columns)
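The eight nested loops build the full Cartesian product of the category values; an equivalent, flatter construction with itertools.product, as a sketch using the same variable names:

# Equivalent construction of the same prediction grid via itertools.product
import itertools
temp = pd.DataFrame(list(itertools.product(CARD_SIDO_NMs, STD_CLSS_NMs, HOM_SIDO_NMs,
                                           AGEs, SEX_CTGO_CDs, FLCs, years, months)),
                    columns=train_features.columns)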

# Predict
pred = model.predict(temp)
pred = np.expm1(pred)
temp['AMT'] = np.round(pred, 0)
temp['REG_YYMM'] = temp['year']*100 + temp['month']
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

# Decode the label-encoded columns
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

# Build the submission file
submission = pd.read_csv('D:/STUDY/dacon/comp4/submission.csv', index_col=0)
submission = submission.drop(['AMT'], axis=1)
submission = submission.merge(temp, left_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], right_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], how='left')
submission.index.name = 'id'
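The encoders mapping used for decoding is assumed to be a dict of column name to fitted LabelEncoder; a hypothetical sketch of how it would have been built during preprocessing (not from the original):

# Hypothetical construction of the encoders used above
from sklearn.preprocessing import LabelEncoder
encoders = {}
for col in ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM']:
    encoders[col] = LabelEncoder()
    train_features[col] = encoders[col].fit_transform(train_features[col])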
Example No. 5
class XGBoostOptimiser(Optimiser):
    name = "xgboost"

    def __init__(self, config):
        super().__init__(config)
        self.config.logger.info("XGBoostOptimiser::Init")
        self.model = XGBRFRegressor(verbosity=1, **(self.config.params))

    def train(self):
        self.config.logger.info("XGBoostOptimiser::train")
        inputs, exp_outputs = self.get_data_("train")
        self.model.fit(inputs, exp_outputs)
        self.save_model(self.model)

    def apply(self):
        self.config.logger.info("XGBoostOptimiser::apply, input size: %d",
                                self.config.dim_input)
        self.load_model()
        inputs, exp_outputs = self.get_data_("apply")
        pred_outputs = self.model.predict(inputs)
        self.plot_apply_(exp_outputs, pred_outputs)
        self.config.logger.info("Done apply")

    def search_grid(self):
        raise NotImplementedError("Search grid method not implemented yet")

    def save_model(self, model):
        # Snapshot - can be used for further training.
        # Note: the file is a Python pickle despite the .json suffix.
        out_filename = "%s/xgbmodel_%s_nEv%d.json" %\
                (self.config.dirmodel, self.config.suffix, self.config.train_events)
        pickle.dump(model, open(out_filename, "wb"), protocol=4)

    def load_model(self):
        # Loading a snapshot
        filename = "%s/xgbmodel_%s_nEv%d.json" %\
                (self.config.dirmodel, self.config.suffix, self.config.train_events)
        self.model = pickle.load(open(filename, "rb"))

    def get_data_(self, partition):
        inputs = []
        exp_outputs = []
        for indexev in self.config.partition[partition]:
            inputs_single, exp_outputs_single = load_event_idc(
                self.config.dirinput_train, indexev, self.config.input_z_range,
                self.config.output_z_range, self.config.opt_predout)
            inputs.append(inputs_single)
            exp_outputs.append(exp_outputs_single)
        inputs = np.concatenate(inputs)
        exp_outputs = np.concatenate(exp_outputs)
        return inputs, exp_outputs

    def plot_apply_(self, exp_outputs, pred_outputs):
        myfile = TFile.Open("%s/output_%s_nEv%d.root" % \
                            (self.config.dirval, self.config.suffix, self.config.train_events),
                            "recreate")
        h_dist_all_events, h_deltas_all_events, h_deltas_vs_dist_all_events =\
                plot_utils.create_apply_histos(self.config, self.config.suffix, infix="all_events_")
        distortion_numeric_flat_m, distortion_predict_flat_m, deltas_flat_a, deltas_flat_m =\
            plot_utils.get_apply_results_single_event(pred_outputs, exp_outputs)
        plot_utils.fill_apply_tree(h_dist_all_events, h_deltas_all_events,
                                   h_deltas_vs_dist_all_events,
                                   distortion_numeric_flat_m,
                                   distortion_predict_flat_m, deltas_flat_a,
                                   deltas_flat_m)

        for hist in (h_dist_all_events, h_deltas_all_events,
                     h_deltas_vs_dist_all_events):
            hist.Write()
        plot_utils.fill_profile_apply_hist(h_deltas_vs_dist_all_events,
                                           self.config.profile_name,
                                           self.config.suffix)
        plot_utils.fill_std_dev_apply_hist(h_deltas_vs_dist_all_events,
                                           self.config.h_std_dev_name,
                                           self.config.suffix, "all_events_")

        myfile.Close()
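Since the snapshot above is a pickle written under a .json name, a sketch of drop-in replacements using XGBoost's native JSON serialization instead; the sklearn wrappers expose save_model/load_model directly:

# Sketch: native JSON snapshots instead of pickle, reusing the same filenames
def save_model(self, model):
    out_filename = "%s/xgbmodel_%s_nEv%d.json" % \
            (self.config.dirmodel, self.config.suffix, self.config.train_events)
    model.save_model(out_filename)  # writes actual JSON

def load_model(self):
    filename = "%s/xgbmodel_%s_nEv%d.json" % \
            (self.config.dirmodel, self.config.suffix, self.config.train_events)
    self.model = XGBRFRegressor()
    self.model.load_model(filename)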
Example No. 6
# parameters=[
#     { 'n_estimators' : [300],
#     'learning_rate' : [1],
#     'colsample_bytree' : [0.9], # use 0.6~0.9
#     'colsample_bylevel': [0.9],
#     'max_depth' : [50]}
# ]

model = XGBRFRegressor(n_jobs=-1)
# model = GridSearchCV(model, parameters, cv =5)
model = MultiOutputRegressor(model)

warnings.filterwarnings('ignore')
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(y_pred)
print(y_pred.shape)
acc = model.score(x_test, y_test)  # R^2 for regressors
print(acc)
# print("Best parameters:  ", model.best_params_)
# print("Best estimator:", model.best_estimator_)

a = np.arange(10000, 20000)
y_pred = pd.DataFrame(y_pred, index=a)
y_pred.to_csv('./data/dacon/comp1/sample_submission.csv', index=True,
              header=['hhb', 'hbo2', 'ca', 'na'], index_label='id')
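The commented-out grid search can be combined with MultiOutputRegressor by prefixing parameter names with estimator__; a sketch with grid values borrowed from the commented block above:

# Sketch: re-enabling the grid search around the multi-output wrapper;
# MultiOutputRegressor exposes the inner model's params as 'estimator__<name>'.
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor

parameters = {'estimator__n_estimators': [300],
              'estimator__colsample_bytree': [0.6, 0.9],
              'estimator__max_depth': [50]}
search = GridSearchCV(MultiOutputRegressor(XGBRFRegressor(n_jobs=-1)),
                      parameters, cv=5)
search.fit(x_train, y_train)
print("best params:", search.best_params_)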


Example No. 7
xgb = XGBRFRegressor(n_estimators=2400,
                     reg_alpha=0.6,
                     reg_lambda=0.6)
lgbm = LGBMRegressor(objective='regression',
                     num_leaves=4,
                     learning_rate=0.01,
                     n_estimators=12000)

# In[158]:

xgb.fit(X_train, y_train)
lgbm.fit(X_train, y_train, eval_metric='rmse')

# In[162]:

predict1 = xgb.predict(X_test)
predict2 = lgbm.predict(X_test)

# In[164]:

print('Root Mean Square Error test (xgb) = ' +
      str(math.sqrt(mean_squared_error(y_test, predict1))))
print('Root Mean Square Error test (lgbm) = ' +
      str(math.sqrt(mean_squared_error(y_test, predict2))))

# In[165]:

predict3 = lgbm.predict(model_test)
predict4 = xgb.predict(model_test)
predict_y = (predict3 * 0.45 + predict4 * 0.55)
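The 0.45/0.55 blend weights are fixed by hand; a short sketch of picking the weight on the held-out split instead (reuses predict1/predict2 from above, and assumes numpy is imported as np):

# Sketch: scan the blend weight on the test predictions and keep the best RMSE
best_w, best_rmse = 0.5, float('inf')
for w in np.arange(0.0, 1.01, 0.05):
    rmse = math.sqrt(mean_squared_error(y_test, w * predict1 + (1 - w) * predict2))
    if rmse < best_rmse:
        best_w, best_rmse = w, rmse
print('best weight for xgb:', best_w, 'RMSE:', best_rmse)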
Example No. 8
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from time import time
from sklearn.datasets import load_boston  # removed in scikit-learn 1.2
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error as MSE
from xgboost import XGBRFRegressor as XGBR
from function import plot_learning_curve

boston = load_boston()
X, y = boston.data, boston.target

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Build the boosted-tree model
xgbr = XGBR(n_estimators=100)
xgbr.fit(x_train, y_train)
# Predict
predict = xgbr.predict(x_test)


# Compute the mean squared error
print(MSE(y_test, xgbr.predict(x_test)))

# Plot the learning curve
cv = KFold(n_splits=5, shuffle=True, random_state=32)
plot_learning_curve(XGBR(n_estimators=100, random_state=30), 'XGBR', X, y, ax=None, cv=cv)
plt.show()

# The learning curve shows the model overfits when data is scarce; as the amount of data grows, its generalization keeps improving.


# Start tuning: first sweep n_estimators and plot its learning curve to find the optimum
axisx = range(100, 300, 10)
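The snippet ends before the sweep itself; a sketch of the usual continuation, scoring each candidate n_estimators by cross-validated R^2 (cross_val_score is an assumed import, cv reused from above):

# Sketch: cross-validated score for each n_estimators candidate
from sklearn.model_selection import cross_val_score
rs = [cross_val_score(XGBR(n_estimators=i, random_state=30), X, y, cv=cv).mean()
      for i in axisx]
plt.plot(axisx, rs, label='XGBR')
plt.legend()
plt.show()
print('best n_estimators:', axisx[rs.index(max(rs))])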
Example No. 9
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRFRegressor

adver = pd.read_csv('../testdata/Advertising.csv')
print(adver.head(3))
x = np.array(adver.loc[:, 'tv':'newspaper'])
y = np.array(adver.sales)

print('\nKNeighborsRegressor')
kmodel = KNeighborsRegressor(n_neighbors=3).fit(x, y)
kpred = kmodel.predict(x)
print('k pred :', kpred[:5])
print('k r2   : {:.3f}'.format(r2_score(y, kpred)))

print('\nLinearRegression')
lmodel = LinearRegression().fit(x, y)
lpred = lmodel.predict(x)
print('l pred :', lpred[:5])
print('l r2   : {:.3f}'.format(r2_score(y, lpred)))

print('\nRandomForestRegressor')  # a representative bagging model
rmodel = RandomForestRegressor(n_estimators=100, criterion='squared_error').fit(x, y)  # 'squared_error' is the current name of the MSE criterion
rpred = rmodel.predict(x)
print('r pred :', rpred[:5])
print('r r2   : {:.3f}'.format(r2_score(y, rpred)))

print('\nXGBRFRegressor')  # from the boosting library, though XGBRFRegressor is actually XGBoost's random-forest (bagging) variant
xmodel = XGBRFRegressor(n_estimators=100).fit(x, y)
xpred = xmodel.predict(x)
print('x pred :', xpred[:5])
print('x r2   : {:.3f}'.format(r2_score(y, xpred)))
Example No. 10
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    shuffle=True,
                                                    random_state=66)

model = XGBRFRegressor(n_jobs=-1)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print("R2: ", score)

thresholds = np.sort(model.feature_importances_)
print(thresholds)

for thresh in thresholds:
    # One pass per feature column
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    selection_x_train = selection.transform(x_train)
    # print(selection_x_train.shape)

    selection_model = XGBRFRegressor()
    selection_model.fit(selection_x_train, y_train)

    selection_x_test = selection.transform(x_test)
    y_pred = selection_model.predict(selection_x_test)

    score = r2_score(y_test, y_pred)
    # print("R2: ", score)

    print("Thresh= %.3f, n = %d, R2: %.2f%%" %
          (thresh, selection_x_train.shape[1], score * 100.0))
Example No. 11
# model4 = XGBRFRegressor(n_estimators= 100,learning_rate=1,colsample_bytree=1,colsample_bylevel=0.7,max_depth=30,n_jobs=-1)

# model = GridSearchCV(model, parameters, cv =5)
# model = MultiOutputRegressor(model2)

warnings.filterwarnings('ignore')
# model1.fit(x_train, y1_train)
# Note: passing a list of two arrays is a multi-input (Keras-style) fit call;
# scikit-learn/xgboost estimators expect a single 2-D feature matrix instead.
model2.fit([x1_train, x2_train], y2_train)
# model3.fit(x_train, y3_train)
# model4.fit(x_train, y4_train)

# y1_pred = model1.predict(x_test)
# print(y1_pred)
# print(y1_pred.shape)

y2_pred = model2.predict([x1_test, x2_test])
# print(y2_pred)
# print(y2_pred.shape)

# y3_pred = model3.predict(x_test)
# print(y3_pred)
# print(y3_pred.shape)

# y4_pred = model4.predict(x_test)
# print(y4_pred)
# print(y4_pred.shape)

# mae1 = mean_absolute_error(y1_test, y1_pred)
mae2 = mean_absolute_error(y2_test, y2_pred)
# mae3 = mean_absolute_error(y3_test, y3_pred)
# mae4 = mean_absolute_error(y4_test, y4_pred)
Example No. 12
from sklearn import metrics

print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

fig, ax = plt.subplots()
ax.scatter(y_test, predictions)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.set_title('R2: ' + str(r2_score(y_test, predictions)))
plt.show()

xgbrfr = XGBRFRegressor(random_state=133).fit(X_train, y_train)
scores.append(xgbrfr.score(X_test, y_test))
rmse.append(np.sqrt(mean_squared_error(y_test, xgbrfr.predict(X_test))))
predictions = xgbrfr.predict(X_test)
mse.append(metrics.mean_squared_error(y_test, predictions))
mae.append(metrics.mean_absolute_error(y_test, predictions))
print('R^2-Coefficient of Determination value',xgbrfr.score(X_test, y_test))
print('MAE:', metrics.mean_absolute_error(y_test, predictions)) 
print('MSE:', metrics.mean_squared_error(y_test, predictions)) 
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions))) 

fig, ax = plt.subplots()
ax.scatter(y_test, predictions)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.set_title('R2: ' + str(r2_score(y_test, predictions)))
plt.show()
Example No. 13

# In[44]:


print("Test", mean_absolute_error(rf.predict(X_te), y_te))


# ### Xgboost

# In[45]:


xgb = XGBRFRegressor().fit(X_tr, y_tr)
print("XGB Results")
print("Train ", mean_absolute_error(xgb.predict(X_tr), y_tr))


# In[46]:


print("Test ", mean_absolute_error(xgb.predict(X_te), y_te))


# ## Predict Next Day

# In[47]:


df["pain_shift"] = df.pain - df.pain.shift()
df_lag = df.dropna().drop(columns="pain")
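The snippet ends midway through the Predict Next Day section; a sketch of a plausible continuation, feeding the most recent lagged row to the fitted model (the column layout of df_lag and its match to the training features are assumptions, not from the original):

# Sketch: predict the next day's pain change from the latest feature row
# (assumes df_lag's non-target columns match the features xgb was trained on).
latest = df_lag.drop(columns="pain_shift").iloc[[-1]]
next_day_change = xgb.predict(latest)[0]
print("Predicted next-day change in pain:", next_day_change)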