def fit_model(X, y):
    """Train an XGBoost random-forest regressor and report in-sample errors.

    Returns a tuple ``(model, y_pred, err_mae, err_rmse)`` where the error
    metrics are computed on the training data itself (no hold-out split).
    """
    regressor = XGBRFRegressor(n_estimators=1000, max_depth=7, random_state=42)
    regressor.fit(X, y)
    predictions = regressor.predict(X)
    # In-sample (training-set) error metrics.
    mae = mean_absolute_error(y, predictions)
    rmse = np.sqrt(mean_squared_error(y, predictions))
    return regressor, predictions, mae, rmse
class XGBRFRegressorOptimizer(BaseOptimizer):
    """Hyper-parameter optimizer for an XGBRFRegressor model.

    The optimization runs immediately on construction and the result is
    saved to ``result_save_path``.
    """

    def __init__(self, src_file_index, bounds):
        self.model = XGBRFRegressor()
        self.model_name = "XGBRFRegressor"
        self.src = util.get_src_file(src_file_index=src_file_index)
        # Search-space box constraints for the optimizer.
        self.lower_bounds = bounds["lower_bounds"]
        self.upper_bounds = bounds["upper_bounds"]
        self.with_rain = False
        self.optimization_methods = optimization_methods
        self.num_iterations = 200
        self.results = {}
        # e.g. "optimization_result/with_rain_False/<src-basename>/"
        self.result_save_path = ('optimization_result/with_rain_' + str(self.with_rain)
                                 + '/' + self.src.split('.')[0].split('/')[-1] + '/')
        # Run the whole pipeline at construction time.
        self.optimization()
        self.save_optimization_result()

    def objective_function(self, x):
        """Objective minimized by the optimizer.

        x[0], x[1] select the train/test split (offset/period); x[2:] map onto
        XGBRFRegressor hyper-parameters. Returns the test-set MSE.
        """
        print("XGBRegressor优化中...")
        train_x, test_x, train_y, test_y = util.get_train_test_split(
            self.src, int(np.round(x[0])), int(np.round(x[1])),
            with_rain=self.with_rain)
        print(self.model_name)
        # Names of the tuned parameters, aligned with the components of x.
        self.tune_params = ['offset', 'period', 'max_depth',
                            # 'learning_rate',
                            'n_estimators', 'gamma',  # bug fix: was misspelled 'gasmma'
                            'min_child_weight', 'max_delta_step', 'subsample',
                            'colsample_bytree', 'colsample_bylevel',
                            'colsample_bynode', 'reg_alpha',
                            'reg_lambda', 'scale_pos_weight', 'base_score']
        # Integer-valued parameters are rounded via int(); the rest are floats.
        self.model.max_depth = int(x[2])
        self.model.n_estimators = int(x[3])
        self.model.gamma = x[4]
        self.model.min_child_weight = int(x[5])
        self.model.max_delta_step = int(x[6])
        self.model.subsample = x[7]
        self.model.colsample_bytree = x[8]
        self.model.colsample_bylevel = x[9]
        self.model.colsample_bynode = x[10]
        self.model.reg_alpha = x[11]
        self.model.reg_lambda = x[12]
        self.model.scale_pos_weight = x[13]
        self.model.base_score = x[14]
        self.model.objective = 'reg:squarederror'
        self.model.learning_rate = 0.001
        self.model.fit(X=train_x, y=train_y)
        y_hat = self.model.predict(test_x)
        # sklearn convention is (y_true, y_pred); MSE is symmetric so the
        # value is unchanged, but the order now matches the docs.
        mse = mean_squared_error(test_y, y_hat)
        return mse
# NOTE(review): this chunk starts mid-statement — the opening of the
# train_test_split(...) call is outside this view.
shuffle=True, train_size=0.8, random_state=66)

# Baseline model trained on all features.
model = XGBRFRegressor(n_jobs=-1)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print('R2', score)

thresholds = np.sort(model.feature_importances_)  # sort the feature importances
print(thresholds)

for thresh in thresholds:
    # Remove features one at a time via an importance threshold.
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)  # train set with reduced features
    selection_model = XGBRFRegressor(n_jobs=-1)  # build a fresh model
    selection_model.fit(select_x_train, y_train)  # fit the model
    select_x_test = selection.transform(x_test)  # test set with reduced features
    y_predict = selection_model.predict(select_x_test)  # predict
    score = r2_score(y_test, y_predict)
    print("Thresh%.3f, n=%d, R2: %.2f%%" % (thresh, select_x_train.shape[1], score * 100.0))

# NOTE(review): in the collapsed source the loop extent is ambiguous; these
# two statements use no loop variables, so they are assumed to sit after it.
score = model.score(x_test, y_test)
print('R2', score)
from itertools import product

# Build the full cartesian grid of feature combinations.
# itertools.product yields exactly the same tuples, in the same order, as the
# original 8-level nested loop, but reads flat instead of nested.
temp = [list(combo) for combo in product(CARD_SIDO_NMs, STD_CLSS_NMs,
                                         HOM_SIDO_NMs, AGEs, SEX_CTGO_CDs,
                                         FLCs, years, months)]
temp = np.array(temp)
temp = pd.DataFrame(data=temp, columns=train_features.columns)

# Predict; expm1 undoes a log1p transform presumably applied during training
# — TODO confirm against the training code.
pred = model.predict(temp)
pred = np.expm1(pred)
temp['AMT'] = np.round(pred, 0)
temp['REG_YYMM'] = temp['year'] * 100 + temp['month']
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

# Decode the label-encoded columns back to their original values.
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

# Build the submission file by merging predictions into the template.
submission = pd.read_csv('D:/STUDY/dacon/comp4/submission.csv', index_col=0)
submission = submission.drop(['AMT'], axis=1)
submission = submission.merge(temp,
                              left_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'],
                              right_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'],
                              how='left')
submission.index.name = 'id'
class XGBoostOptimiser(Optimiser):
    """Optimiser implementation backed by an XGBoost random-forest regressor."""
    name = "xgboost"

    def __init__(self, config):
        super().__init__(config)
        self.config.logger.info("XGBoostOptimiser::Init")
        # Model hyper-parameters are taken from the config's params dict.
        self.model = XGBRFRegressor(verbosity=1, **(self.config.params))

    def train(self):
        # Fit on the "train" partition and snapshot the fitted model to disk.
        self.config.logger.info("XGBoostOptimiser::train")
        inputs, exp_outputs = self.get_data_("train")
        self.model.fit(inputs, exp_outputs)
        self.save_model(self.model)

    def apply(self):
        # Load the latest snapshot, predict on the "apply" partition and plot.
        self.config.logger.info("XGBoostOptimiser::apply, input size: %d",
                                self.config.dim_input)
        self.load_model()
        inputs, exp_outputs = self.get_data_("apply")
        pred_outputs = self.model.predict(inputs)
        self.plot_apply_(exp_outputs, pred_outputs)
        self.config.logger.info("Done apply")

    def search_grid(self):
        raise NotImplementedError("Search grid method not implemented yet")

    def save_model(self, model):
        # Snapshot - can be used for further training
        # NOTE(review): the file is written with pickle but named ".json";
        # the extension is misleading — confirm before relying on the format.
        out_filename = "%s/xgbmodel_%s_nEv%d.json" %\
            (self.config.dirmodel, self.config.suffix, self.config.train_events)
        pickle.dump(model, open(out_filename, "wb"), protocol=4)

    def load_model(self):
        # Loading a snapshot
        filename = "%s/xgbmodel_%s_nEv%d.json" %\
            (self.config.dirmodel, self.config.suffix, self.config.train_events)
        self.model = pickle.load(open(filename, "rb"))

    def get_data_(self, partition):
        # Load and concatenate per-event inputs/outputs for the requested
        # partition ("train" or "apply").
        # NOTE(review): dirinput_train is used for every partition, including
        # "apply" — confirm this is intentional.
        inputs = []
        exp_outputs = []
        for indexev in self.config.partition[partition]:
            inputs_single, exp_outputs_single = load_event_idc(
                self.config.dirinput_train, indexev, self.config.input_z_range,
                self.config.output_z_range, self.config.opt_predout)
            inputs.append(inputs_single)
            exp_outputs.append(exp_outputs_single)
        inputs = np.concatenate(inputs)
        exp_outputs = np.concatenate(exp_outputs)
        return inputs, exp_outputs

    def plot_apply_(self, exp_outputs, pred_outputs):
        # Fill and write validation histograms into a ROOT output file.
        myfile = TFile.Open("%s/output_%s_nEv%d.root" % \
            (self.config.dirval, self.config.suffix, self.config.train_events),
            "recreate")
        h_dist_all_events, h_deltas_all_events, h_deltas_vs_dist_all_events =\
            plot_utils.create_apply_histos(self.config, self.config.suffix,
                                           infix="all_events_")
        distortion_numeric_flat_m, distortion_predict_flat_m, deltas_flat_a, deltas_flat_m =\
            plot_utils.get_apply_results_single_event(pred_outputs, exp_outputs)
        plot_utils.fill_apply_tree(h_dist_all_events, h_deltas_all_events,
                                   h_deltas_vs_dist_all_events,
                                   distortion_numeric_flat_m,
                                   distortion_predict_flat_m,
                                   deltas_flat_a, deltas_flat_m)
        for hist in (h_dist_all_events, h_deltas_all_events,
                     h_deltas_vs_dist_all_events):
            hist.Write()
        plot_utils.fill_profile_apply_hist(h_deltas_vs_dist_all_events,
                                           self.config.profile_name,
                                           self.config.suffix)
        plot_utils.fill_std_dev_apply_hist(h_deltas_vs_dist_all_events,
                                           self.config.h_std_dev_name,
                                           self.config.suffix, "all_events_")
        myfile.Close()
# parameters=[ # { 'n_estimators' : [300], # 'learning_rate' : [1], # 'colsample_bytree' : [0.9], # 0.6~0.9사용 # 'colsample_bylevel': [0.9], # 'max_depth' : [50]} # ] model = XGBRFRegressor(n_jobs=-1) # model = GridSearchCV(model, parameters, cv =5) model = MultiOutputRegressor(model) warnings.filterwarnings('ignore') model.fit(x_train, y_train) y_pred = model.predict(x_test) print(y_pred) print(y_pred.shape) acc = model.score(x_test, y_test) warnings.filterwarnings('ignore') print(acc) # print("최적의 매개 변수 : ", model.best_params_) warnings.filterwarnings('ignore') # print("최적의모델은:", model.best_estimator_) a = np.arange(10000,20000) y_pred = pd.DataFrame(y_pred,a) y_pred.to_csv('./data/dacon/comp1/sample_submission.csv', index = True, header=['hhb','hbo2','ca','na'],index_label='id')
n_estimators=2400, reg_alpha=0.6, reg_lambda=0.6) lgbm = LGBMRegressor(objective='regression', num_leaves=4, learning_rate=0.01, n_estimators=12000) # In[158]: xgb.fit(X_train, y_train) lgbm.fit(X_train, y_train, eval_metric='rmse') # In[162]: predict1 = xgb.predict(X_test) predict2 = lgbm.predict(X_test) # In[164]: print('Root Mean Square Error test = ' + str(math.sqrt(mean_squared_error(y_test, predict1)))) print('Root Mean Square Erroe test = ' + str(math.sqrt(mean_squared_error(y_test, predict2)))) # In[165]: predcict3 = lgbm.predict(model_test) predcict4 = xgb.predict(model_test) predict_y = (predcict3 * 0.45 + predcict4 * 0.55)
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from time import time
from function import plot_learning_curve

# Load the Boston housing data and split it into train/test sets.
boston = load_boston()
X, y = boston.data, boston.target
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Build the gradient-boosted tree model.
xgbr = XGBR(n_estimators=100)
xgbr.fit(x_train, y_train)

# Predict on the held-out set.
predict = xgbr.predict(x_test)

# Report the mean squared error.
print(MSE(y_test, xgbr.predict(x_test)))

# Plot the learning curve.
cv = KFold(n_splits=5, shuffle=True, random_state=32)
plot_learning_curve(XGBR(n_estimators=100, random_state=30), 'XGBR', X, y, ax=None, cv=cv)
plt.show()
# From the plot: with very little data the model overfits; as the amount of
# data grows, its generalisation ability keeps improving.

# Start tuning: first draw the learning curve over n_estimators to find where
# it is optimal.
axisx = range(100, 300, 10)
from sklearn.metrics import r2_score

# Load the advertising data set; features tv..newspaper, target sales.
adver = pd.read_csv('../testdata/Advertising.csv')
print(adver.head(3))
x = np.array(adver.loc[:, 'tv':'newspaper'])
y = np.array(adver.sales)

print('\nKNeighborsRegressor')
kmodel = KNeighborsRegressor(n_neighbors=3).fit(x, y)
kpred = kmodel.predict(x)
print('k pred :', kpred[:5])
print('k r2 : {:.3f}'.format(r2_score(y, kpred)))

print('\nLinearRegression')
lmodel = LinearRegression().fit(x, y)
lpred = lmodel.predict(x)
print('l pred :', lpred[:5])
print('l r2 : {:.3f}'.format(r2_score(y, lpred)))

print('\nRandomForestRegressor')  # a representative bagging model
rmodel = RandomForestRegressor(n_estimators=100, criterion='mse').fit(x, y)
rpred = rmodel.predict(x)
print('r pred :', rpred[:5])
print('r r2 : {:.3f}'.format(r2_score(y, rpred)))

print('\nXGBRFRegressor')  # a representative boosting model
xmodel = XGBRFRegressor(n_estimators=100).fit(x, y)
xpred = xmodel.predict(x)
# bug fix: the original printed rpred (RandomForest output) here by
# copy-paste; the XGB predictions are xpred.
print('x pred :', xpred[:5])
print('x r2 : {:.3f}'.format(r2_score(y, xpred)))
# NOTE(review): this chunk starts mid-statement — the opening of the
# train_test_split(...) call is outside this view.
y, train_size=0.8, shuffle=True, random_state=66)

# Baseline model trained on all features.
model = XGBRFRegressor(n_jobs=-1)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print("R2: ", score)

thresholds = np.sort(model.feature_importances_)
print(thresholds)

for thresh in thresholds:  # iterate once per feature/column
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    selection_x_train = selection.transform(x_train)
    # print(selection_x_train.shape)
    selection_model = XGBRFRegressor()
    selection_model.fit(selection_x_train, y_train)
    selection_x_test = selection.transform(x_test)
    y_pred = selection_model.predict(selection_x_test)
    score = r2_score(y_test, y_pred)
    # print("R2: ", score)
    print("Thresh= %.3f, n = %d, R2: %.2f%%" % (thresh, selection_x_train.shape[1], score * 100.0))
# model4 = XGBRFRegressor(n_estimators= 100,learning_rate=1,colsample_bytree=1,colsample_bylevel=0.7,max_depth=30,n_jobs=-1) # model = GridSearchCV(model, parameters, cv =5) # model = MultiOutputRegressor(model2) warnings.filterwarnings('ignore') # model1.fit(x_train, y1_train) model2.fit([x1_train, x2_train], y2_train) # model3.fit(x_train, y3_train) # model4.fit(x_train, y4_train) # y1_pred = model1.predict(x_test) # print(y1_pred) # print(y1_pred.shape) y2_pred = model2.predict([x1_test, x2_test]) # print(y2_pred) # print(y2_pred.shape) # y3_pred = model3.predict(x_test) # print(y3_pred) # print(y3_pred.shape) # y4_pred = model4.predict(x_test) # print(y4_pred) # print(y4_pred.shape) # mae1 = mean_absolute_error(y1_test, y1_pred) mae2 = mean_absolute_error(y2_test, y2_pred) # mae3 = mean_absolute_error(y3_test, y3_pred) # mae4 = mean_absolute_error(y4_test, y4_pred)
# Error metrics for the previous model's predictions.
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

# Predicted-vs-measured scatter with the ideal y=x reference line.
fig, ax = plt.subplots()
ax.scatter(y_test, predictions)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.set_title('R2: ' + str(r2_score(y_test, predictions)))
plt.show()

from sklearn import metrics

# Fit the XGBoost random-forest regressor and append its metrics to the
# running comparison lists (scores/rmse/mse/mae are defined outside this view).
xgbrfr = XGBRFRegressor(random_state=133).fit(X_train, y_train)
scores.append(xgbrfr.score(X_test, y_test))
rmse.append(np.sqrt(mean_squared_error(y_test, xgbrfr.predict(X_test))))
predictions = xgbrfr.predict(X_test)
mse.append(metrics.mean_squared_error(y_test, predictions))
mae.append(metrics.mean_absolute_error(y_test, predictions))
print('R^2-Coefficient of Determination value', xgbrfr.score(X_test, y_test))
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

# Same scatter plot for this model's predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, predictions)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.set_title('R2: ' + str(r2_score(y_test, predictions)))
plt.show()
# In[44]: print("Test", mean_absolute_error(rf.predict(X_te), y_te)) # ### Xgboost # In[45]: xgb = XGBRFRegressor().fit(X_tr, y_tr) print("XGB Results") print("Train ", mean_absolute_error(xgb.predict(X_tr), y_tr)) # In[46]: print("Test ", mean_absolute_error(xgb.predict(X_te), y_te)) # ## Predict Next Day # In[47]: df["pain_shift"] = df.pain - df.pain.shift() df_lag = df.dropna().drop(columns="pain")