def fit_model(X, y):
    """Fit an XGBRFRegressor on ``(X, y)`` and measure its in-sample error.

    The model is built with 1000 estimators, a maximum depth of 7 and a
    fixed random state of 42. Both error figures are computed on the same
    data the model was fitted on, so they describe training error only.

    Returns:
        A 4-tuple ``(model, y_pred, err_mae, err_rmse)``: the fitted
        estimator, its predictions for ``X``, the mean absolute error and
        the root mean squared error of those predictions against ``y``.
    """
    regressor = XGBRFRegressor(n_estimators=1000, max_depth=7, random_state=42)
    regressor.fit(X, y)

    fitted_values = regressor.predict(X)
    mae = mean_absolute_error(y, fitted_values)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y, fitted_values))

    return regressor, fitted_values, mae, rmse
def test_xg_XGBRFRegressor():
    """Smoke test: fit a default XGBRFRegressor on ``iris_data`` and upload it.

    The fitted model is passed to ``upload`` together with the first row of
    the feature matrix and a small metadata dict.
    """
    print("Testing xgboost, XGBRFRegressor...")

    regressor = XGBRFRegressor()
    features, labels = iris_data
    regressor.fit(features, labels)

    metadata = {'name': "XGBRFRegressor test"}
    first_row = features[0, :]  # one feature vector handed over with the model
    upload(regressor, first_row, metadata)
class XGBRFRegressorOptimizer(BaseOptimizer):
    """Hyper-parameter optimiser for an ``XGBRFRegressor``.

    Constructing an instance immediately runs ``self.optimization()`` and
    then ``self.save_optimization_result()`` (both are expected to come from
    ``BaseOptimizer``; they are not defined in this class).
    """

    def __init__(self, src_file_index, bounds):
        """Configure the search and run it.

        Args:
            src_file_index: Index forwarded to ``util.get_src_file`` to
                resolve the source data file.
            bounds: Mapping with the keys ``"lower_bounds"`` and
                ``"upper_bounds"`` for the search vector.
        """
        self.model = XGBRFRegressor()
        self.model_name = "XGBRFRegressor"
        self.src = util.get_src_file(src_file_index=src_file_index)
        self.lower_bounds = bounds["lower_bounds"]
        self.upper_bounds = bounds["upper_bounds"]
        self.with_rain = False
        self.optimization_methods = optimization_methods
        self.num_iterations = 200
        self.results = {}
        # Results go to optimization_result/with_rain_<flag>/<source file stem>/
        self.result_save_path = (
            'optimization_result/with_rain_' + str(self.with_rain) + '/'
            + self.src.split('.')[0].split('/')[-1] + '/'
        )
        self.optimization()
        self.save_optimization_result()

    def objective_function(self, x):
        """Return the test-set MSE for the candidate parameter vector ``x``.

        ``x[0]`` and ``x[1]`` (rounded to int) are passed to
        ``util.get_train_test_split`` as its second and third arguments;
        ``x[2]`` .. ``x[14]`` are assigned to the model hyper-parameters in
        the order listed in ``self.tune_params``.
        """
        print("XGBRegressor优化中...")
        train_x, test_x, train_y, test_y = util.get_train_test_split(
            self.src,
            int(np.round(x[0])),
            int(np.round(x[1])),
            with_rain=self.with_rain,
        )
        print(self.model_name)

        # Names of the entries of ``x``, in index order. 'learning_rate' is
        # deliberately absent: it is fixed below rather than tuned.
        self.tune_params = [
            'offset', 'period', 'max_depth',
            # 'learning_rate',
            'n_estimators', 'gamma',  # was misspelled 'gasmma'
            'min_child_weight', 'max_delta_step', 'subsample',
            'colsample_bytree', 'colsample_bylevel', 'colsample_bynode',
            'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'base_score',
        ]

        self.model.max_depth = int(x[2])
        self.model.n_estimators = int(x[3])
        self.model.gamma = x[4]
        self.model.min_child_weight = int(x[5])
        self.model.max_delta_step = int(x[6])
        self.model.subsample = x[7]
        self.model.colsample_bytree = x[8]
        self.model.colsample_bylevel = x[9]
        self.model.colsample_bynode = x[10]
        self.model.reg_alpha = x[11]
        self.model.reg_lambda = x[12]
        self.model.scale_pos_weight = x[13]
        self.model.base_score = x[14]
        self.model.objective = 'reg:squarederror'
        # NOTE(review): random-forest style XGBoost models are normally run
        # with learning_rate=1; confirm that 0.001 is really intended here.
        self.model.learning_rate = 0.001

        self.model.fit(X=train_x, y=train_y)
        y_hat = self.model.predict(test_x)
        # Conventional (y_true, y_pred) order; MSE is symmetric, so the value
        # is the same as before.
        mse = mean_squared_error(test_y, y_hat)
        return mse
def xgrfboost(train, target, n_estimators=100, max_depth=8, random_state=17,
              learning_rate=0.1, colsample_bytree=0.9, colsample_bynode=0.9,
              colsample_bylevel=0.9, importance_type='split', reg_alpha=2,
              reg_lambda=2):
    '''Fit and return an XGBoost random-forest regressor (XGBRFRegressor).

    Params :-
    train - training feature set
    target - target values to predict
    n_estimators - number of trees (default 100)
    max_depth - maximum depth a tree can grow to (default 8)
    random_state - seed, so runs with the same params are reproducible
        (default 17)
    learning_rate - shrinkage factor applied to the tree outputs
        (default 0.1)
    colsample_bytree, colsample_bynode, colsample_bylevel - fraction of the
        features sampled per tree, per node and per level
    importance_type - which metric ``feature_importances_`` reports.
        xgboost accepts 'gain', 'weight', 'cover', 'total_gain' and
        'total_cover'. The default 'split' is kept for backward
        compatibility and is translated to 'weight' (number of times a
        feature is used to split).
    reg_alpha, reg_lambda - L1 and L2 regularisation respectively

    Returns the fitted model.
    '''
    from xgboost import XGBRFRegressor

    # 'split' is LightGBM's name; xgboost does not accept it and fails when
    # feature_importances_ is read. Its split-count metric is 'weight'.
    if importance_type == 'split':
        importance_type = 'weight'

    # NOTE(review): random-forest style models are usually run with
    # learning_rate=1; confirm the 0.1 default is intended.
    model = XGBRFRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        learning_rate=learning_rate,
        colsample_bytree=colsample_bytree,
        colsample_bynode=colsample_bynode,
        colsample_bylevel=colsample_bylevel,
        importance_type=importance_type,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
    )
    model.fit(train, target)
    return model
def train(self):
    """Load the training data, fit a fresh XGBRFRegressor and save it.

    Loading and fitting are timed with ``timer``/``log_time`` and memory
    usage is logged after each load. When ``self.config.plot_train`` is set,
    the validation partition is loaded as well and the still-unfitted model
    is handed to ``self.plot_train_`` together with both partitions before
    the actual fit.
    """
    self.config.logger.info("XGBoostOptimiser::train")
    regressor = XGBRFRegressor(verbosity=1, **(self.config.params))

    t_begin = timer()
    train_in, train_out = self.get_data_("train")
    t_finish = timer()
    log_time(t_begin, t_finish, "for loading training data")
    train_report = ((train_in, "Input train data"),
                    (train_out, "Output train data"))
    log_memory_usage(train_report)
    log_total_memory_usage("Memory usage after loading data")

    if self.config.plot_train:
        val_in, val_out = self.get_data_("validation")
        val_report = ((val_in, "Input val data"),
                      (val_out, "Output val data"))
        log_memory_usage(val_report)
        log_total_memory_usage("Memory usage after loading val data")
        self.plot_train_(regressor, train_in, train_out, val_in, val_out)

    t_begin = timer()
    regressor.fit(train_in, train_out)
    t_finish = timer()
    log_time(t_begin, t_finish, "actual train")

    self.save_model(regressor)
def train(self):
    """
    Train the optimizer.

    Logs a fatal message when ``self.config.dim_output`` exceeds 1, then
    loads the training data, optionally plots against the validation data
    (``self.config.plot_train``), fits a fresh XGBRFRegressor, names the
    booster's features, plots feature importance and saves the model.
    """
    self.config.logger.info("XGBoostOptimiser::train")
    if self.config.dim_output > 1:
        get_logger().fatal(
            "YOU CAN PREDICT ONLY 1 DISTORTION. dim_output is bigger than 1."
        )

    regressor = XGBRFRegressor(verbosity=1, **(self.config.params))

    t_begin = timer()
    # Only the first two returned items are used; any extras are discarded.
    train_in, train_out, *_ = self.__get_data("train")
    t_finish = timer()
    log_time(t_begin, t_finish, "for loading training data")
    train_report = ((train_in, "Input train data"),
                    (train_out, "Output train data"))
    log_memory_usage(train_report)
    log_total_memory_usage("Memory usage after loading data")

    if self.config.plot_train:
        val_in, val_out, *_ = self.__get_data("validation")
        val_report = ((val_in, "Input validation data"),
                      (val_out, "Output validation data"))
        log_memory_usage(val_report)
        log_total_memory_usage("Memory usage after loading validation data")
        self.__plot_train(regressor, train_in, train_out, val_in, val_out)

    t_begin = timer()
    regressor.fit(train_in, train_out)
    t_finish = timer()
    log_time(t_begin, t_finish, "actual train")

    # Attach readable input names to the booster before plotting importances.
    feature_names = get_input_names_oned_idc(
        self.config.opt_usederivative,
        self.config.num_fourier_coeffs_train)
    regressor.get_booster().feature_names = feature_names
    self.__plot_feature_importance(regressor)

    self.save_model(regressor)
from sklearn.metrics import r2_score, accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd

# Load the Boston housing data and hold out 20% of it for testing.
# NOTE(review): load_boston, train_test_split and XGBRFRegressor are not
# imported in the visible part of this script - presumably imported above.
x, y = load_boston(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True, train_size=0.8, random_state=66)

# Baseline model trained on all features.
model = XGBRFRegressor(n_jobs=-1)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print('R2', score)

thresholds = np.sort(model.feature_importances_)  # sort the feature importances (ascending)
print(thresholds)

# Use every importance value in turn as the selection threshold.
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)  # drops the features one at a time as the threshold rises
    select_x_train = selection.transform(x_train)  # training data reduced to the selected features
    selection_model = XGBRFRegressor(n_jobs=-1)  # create a fresh model
    selection_model.fit(select_x_train, y_train)  # fit it on the reduced feature set
encoder.fit(df[column]) encoders[column] = encoder df_num = df.copy() for column in encoders.keys(): encoder = encoders[column] df_num[column] = encoder.transform(df[column]) # feature, target 설정 train_num = df_num.sample(frac=1, random_state=0) train_features = train_num.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1) train_target = np.log1p(train_num['AMT']) # 훈련 model = XGBRFRegressor(n_jobs=-1) model.fit(train_features, train_target) # 예측 템플릿 만들기 CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique() STD_CLSS_NMs = df_num['STD_CLSS_NM'].unique() HOM_SIDO_NMs = df_num['HOM_SIDO_NM'].unique() AGEs = df_num['AGE'].unique() SEX_CTGO_CDs = df_num['SEX_CTGO_CD'].unique() FLCs = df_num['FLC'].unique() years = [2020] months = [4, 7] temp = [] for CARD_SIDO_NM in CARD_SIDO_NMs: for STD_CLSS_NM in STD_CLSS_NMs: for HOM_SIDO_NM in HOM_SIDO_NMs:
class XGBoostOptimiser(Optimiser):
    """Optimiser that trains and applies an ``XGBRFRegressor``.

    The model is built from ``config.params``; snapshots are written to and
    read from ``config.dirmodel`` with ``pickle``.
    """

    name = "xgboost"

    def __init__(self, config):
        """Store the configuration (via the base class) and build the model."""
        super().__init__(config)
        self.config.logger.info("XGBoostOptimiser::Init")
        self.model = XGBRFRegressor(verbosity=1, **(self.config.params))

    def train(self):
        """Fit the model on the "train" partition and save a snapshot."""
        self.config.logger.info("XGBoostOptimiser::train")
        inputs, exp_outputs = self.get_data_("train")
        self.model.fit(inputs, exp_outputs)
        self.save_model(self.model)

    def apply(self):
        """Load the snapshot, predict on the "apply" partition and plot."""
        self.config.logger.info("XGBoostOptimiser::apply, input size: %d",
                                self.config.dim_input)
        self.load_model()
        inputs, exp_outputs = self.get_data_("apply")
        pred_outputs = self.model.predict(inputs)
        self.plot_apply_(exp_outputs, pred_outputs)
        self.config.logger.info("Done apply")

    def search_grid(self):
        """Not implemented; always raises ``NotImplementedError``."""
        raise NotImplementedError("Search grid method not implemented yet")

    def _snapshot_path(self):
        """Return the snapshot file path shared by save_model/load_model."""
        # The file carries a .json suffix but its content is a pickle.
        return "%s/xgbmodel_%s_nEv%d.json" % \
            (self.config.dirmodel, self.config.suffix, self.config.train_events)

    def save_model(self, model):
        """Pickle ``model`` to the snapshot file."""
        # Snapshot - can be used for further training
        with open(self._snapshot_path(), "wb") as snapshot:
            pickle.dump(model, snapshot, protocol=4)

    def load_model(self):
        """Replace ``self.model`` with the pickled snapshot."""
        # Loading a snapshot
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # snapshots written by a trusted run.
        with open(self._snapshot_path(), "rb") as snapshot:
            self.model = pickle.load(snapshot)

    def get_data_(self, partition):
        """Load and concatenate every event of ``partition``.

        Args:
            partition: Key into ``self.config.partition`` whose value is the
                iterable of event indices to load.

        Returns:
            ``(inputs, exp_outputs)``, each the ``np.concatenate`` of the
            per-event arrays returned by ``load_event_idc``.
        """
        inputs = []
        exp_outputs = []
        for indexev in self.config.partition[partition]:
            inputs_single, exp_outputs_single = load_event_idc(
                self.config.dirinput_train, indexev,
                self.config.input_z_range, self.config.output_z_range,
                self.config.opt_predout)
            inputs.append(inputs_single)
            exp_outputs.append(exp_outputs_single)
        inputs = np.concatenate(inputs)
        exp_outputs = np.concatenate(exp_outputs)
        return inputs, exp_outputs

    def plot_apply_(self, exp_outputs, pred_outputs):
        """Fill the apply histograms and write them to a ROOT file.

        The output file is ``<dirval>/output_<suffix>_nEv<train_events>.root``
        and is recreated on every call.
        """
        myfile = TFile.Open("%s/output_%s_nEv%d.root" %
                            (self.config.dirval, self.config.suffix,
                             self.config.train_events),
                            "recreate")
        try:
            h_dist_all_events, h_deltas_all_events, h_deltas_vs_dist_all_events = \
                plot_utils.create_apply_histos(self.config, self.config.suffix,
                                               infix="all_events_")

            distortion_numeric_flat_m, distortion_predict_flat_m, \
                deltas_flat_a, deltas_flat_m = \
                plot_utils.get_apply_results_single_event(pred_outputs,
                                                          exp_outputs)
            plot_utils.fill_apply_tree(h_dist_all_events, h_deltas_all_events,
                                       h_deltas_vs_dist_all_events,
                                       distortion_numeric_flat_m,
                                       distortion_predict_flat_m,
                                       deltas_flat_a, deltas_flat_m)

            for hist in (h_dist_all_events, h_deltas_all_events,
                         h_deltas_vs_dist_all_events):
                hist.Write()
            plot_utils.fill_profile_apply_hist(h_deltas_vs_dist_all_events,
                                               self.config.profile_name,
                                               self.config.suffix)
            plot_utils.fill_std_dev_apply_hist(h_deltas_vs_dist_all_events,
                                               self.config.h_std_dev_name,
                                               self.config.suffix,
                                               "all_events_")
        finally:
            # Close the ROOT file even if filling or writing fails.
            myfile.Close()
from sklearn.datasets import load_boston
from sklearn.metrics import accuracy_score, r2_score

# Load the Boston housing data and hold out 20% of it for testing.
# NOTE(review): train_test_split, XGBRFRegressor, SelectFromModel and np are
# not imported in the visible part of this script - presumably imported above.
ds = load_boston()
x = ds.data
y = ds.target
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, shuffle=True, random_state=66)

# Baseline model trained on all features.
model = XGBRFRegressor(n_jobs=-1)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print("R2: ", score)

# Importances sorted ascending; each one is used as a selection threshold.
thresholds = np.sort(model.feature_importances_)
print(thresholds)

for thresh in thresholds:  # runs once per column
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    selection_x_train = selection.transform(x_train)
    # print(selection_x_train.shape)
    selection_model = XGBRFRegressor()
    selection_model.fit(selection_x_train, y_train)
# Two regressors compared on the same train/test split.
# ``max_delta_step`` replaces the original ``max_delta``, which is not an
# XGBoost parameter and was therefore not applied.
# NOTE(review): random-forest style XGBoost models are normally run with
# learning_rate=1; confirm that 0.01 (and XGBRFRegressor rather than
# XGBRegressor) is intended.
xgb = XGBRFRegressor(colsample_bynode=1, colsample_bytree=0.6, learning_rate=0.01, max_delta_step=4, min_child_weight=1.5, n_estimators=2400, reg_alpha=0.6, reg_lambda=0.6)
lgbm = LGBMRegressor(objective='regression', num_leaves=4, learning_rate=0.01, n_estimators=12000)


# In[158]:


xgb.fit(X_train, y_train)
lgbm.fit(X_train, y_train, eval_metric='rmse')


# In[162]:


predict1 = xgb.predict(X_test)
predict2 = lgbm.predict(X_test)


# In[164]:


# First line: XGBoost test RMSE; second line: LightGBM test RMSE.
print('Root Mean Square Error test = ' + str(math.sqrt(mean_squared_error(y_test, predict1))))
print('Root Mean Square Erroe test = ' + str(math.sqrt(mean_squared_error(y_test, predict2))))


# In[165]:
from sklearn.datasets import load_boston
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRFRegressor

# The loading line must be active: x and y are used immediately below and
# nothing else defines them.
x, y = load_boston(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

model = XGBRFRegressor(n_estimators=1000, learning_rate=0.1)

# Track RMSE on both the training and the test split while fitting.
model.fit(x_train, y_train, verbose=True, eval_metric="rmse",
          eval_set=[(x_train, y_train), (x_test, y_test)])
# other metrics: rmse, mae, logloss, error, auc

results = model.evals_result()
print("eval:", results)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print("r2:", r2)
# Fixed: the format string is now applied with ``%`` instead of being passed
# to print() as a separate argument next to the value.
print("r2: %.2f%%" % (r2 * 100.0))
from sklearn.datasets import load_boston
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRFRegressor

x, y = load_boston(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8,
                                                    shuffle=True, random_state=66)

model = XGBRFRegressor(n_estimators=1000, learning_rate=0.1)

# Track RMSE on both splits; early_stopping_rounds=20 is passed so fitting
# can stop when the metric stops improving.
model.fit(x_train, y_train, verbose=True, eval_metric="rmse",
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)
# other metrics: rmse, mae, logloss, error, auc

results = model.evals_result()
# Fixed: a stray line-continuation backslash after this call glued it to the
# next statement.
print("eval:", results)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print("r2:", r2)
print("r2: %.2f%%" % (r2 * 100.0))
# 'colsample_bylevel': [0.6, 0.8, 0.9], # 'max_depth' : [6,7,8]} # ] # model1 = XGBRFRegressor(n_estimators= 300,learning_rate=1,colsample_bytree=0.99,colsample_bylevel=0.99,max_depth=50,nrounds=1000,scale_pos_weight=1.5) #model2 = XGBRFRegressor(n_estimators= 400,learning_rate=1,colsample_bytree=0.99,colsample_bylevel=0.99,max_depth=50,nrounds=1000,scale_pos_weight=1.5) model3 = XGBRFRegressor(n_estimators= 400,learning_rate=1,colsample_bytree=0.99,colsample_bylevel=0.99,max_depth=10,nrounds=1000,scale_pos_weight=1.5) # model4 = XGBRFRegressor(n_estimators= 100,learning_rate=1,colsample_bytree=0.99,colsample_bylevel=0.99,max_depth=50,nrounds=1000,scale_pos_weight=1.5) # model = GridSearchCV(model, parameters, cv =5) # model = MultiOutputRegressor(model) warnings.filterwarnings('ignore') # model1.fit(x_train, y1_train) #model2.fit(x_train, y2_train) model3.fit(x_train, y3_train) # model4.fit(x_train, y4_train) # y1_pred = model1.predict(test) # print(y1_pred) # print(y1_pred.shape) # y2_pred = model2.predict(test) # print(y2_pred) # print(y2_pred.shape) # y3_pred = model3.predict(test) # print(y3_pred) # print(y3_pred.shape) # y4_pred = model4.predict(test)
class XGBoostText:
    """XGBoost-based predictor with on-disk checkpoints and results.

    Depending on the task type the predictor is an ``XGBClassifier``
    (binary / multi-class), a ``MultiOutputClassifier`` wrapping one
    (multi-label) or an ``XGBRFRegressor`` (regression). Models and configs
    are stored under ``./experiments_records/<expmodel_id>/checkouts`` and
    predictions under ``./experiments_records/<expmodel_id>/results``.
    """

    def __init__(self,
                 expmodel_id='test.new',
                 n_estimators=100,
                 use_gpu=False,
                 criterion='gini',
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0,
                 max_features='auto',
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.0,
                 min_impurity_split=None,
                 bootstrap=True,
                 oob_score=False,
                 n_jobs=None,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None,
                 ccp_alpha=0.0,
                 max_samples=None):
        """Store the hyper-parameters and prepare the experiment directories.

        All arguments are kept as attributes of the same name. Only
        ``n_estimators``, ``max_leaf_nodes``, ``min_impurity_split``,
        ``n_jobs``, ``random_state`` and ``max_samples`` are forwarded to the
        estimator by ``_build_model``.

        Parameters
        ----------
        expmodel_id : str
            Experiment identifier; names the directory used for checkpoints
            and results.
        use_gpu : bool
            Request a CUDA device (see ``_get_device``).
        """
        check_model_dir(expmodel_id=expmodel_id)
        self.checkout_dir = os.path.join('./experiments_records', expmodel_id,
                                         'checkouts')
        self.result_dir = os.path.join('./experiments_records', expmodel_id,
                                       'results')
        # make saving directory if needed
        os.makedirs(self.checkout_dir, exist_ok=True)
        os.makedirs(self.result_dir, exist_ok=True)

        self.expmodel_id = expmodel_id
        self.n_estimators = n_estimators
        self.use_gpu = use_gpu
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
        self.bootstrap = bootstrap
        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.warm_start = warm_start
        self.class_weight = class_weight
        self.ccp_alpha = ccp_alpha
        self.max_samples = max_samples
        self.task_type = None
        # self._args_check()
        self.device = self._get_device()

    def _reconcile_task_type(self, task_type_check):
        """Adopt or verify the task type detected from the labels.

        Parameters
        ----------
        task_type_check : set
            Task types detected by ``label_check``; must contain exactly one.

        Raises
        ------
        Exception
            If the data yields zero or several task types, or the detected
            type contradicts an already assigned ``self.task_type``.
        """
        if len(task_type_check) != 1:
            raise Exception('task_type is inconformity in data')

        pre_task_type = next(iter(task_type_check))
        if self.task_type is None:
            self.task_type = pre_task_type
        elif self.task_type != pre_task_type:
            raise Exception(
                'predifine task-type {0}, but data support task-type {1}'.
                format(self.task_type, pre_task_type))

    def _data_check(self, datalist):
        """Validate file-based datasets and settle ``self.task_type``.

        Checks that every episode file exists and that the labels of all
        datasets agree on one task type.

        Parameters
        ----------
        datalist = [data1 = {
                      'x':list[episode_file_path],
                      'y':list[label],
                      'l':list[seq_len],
                      'feat_n': n of feature space,
                      'label_n': n of label space
                      },
                    data2 = {...},
                    ...
                    ]

        Raises
        ------
        Exception
            If an episode file is missing or the task types disagree.
        """
        label_n_check = set()
        task_type_check = set()
        for each_data in datalist:
            for each_x_path in each_data['x']:
                if not os.path.exists(each_x_path):
                    raise Exception('episode file not exist')
            # Labels are indexed on their second axis here, so 1-D labels
            # are rejected with an IndexError.
            label_n_check.add(np.shape(np.array(each_data['y']))[1])
            task_type_check.add(
                label_check(each_data['y'],
                            hat_y=None,
                            assign_task_type=self.task_type))
        self._reconcile_task_type(task_type_check)

    def _get_device(self):
        """Return the torch device: CUDA if requested and available, else CPU."""
        if self.use_gpu:
            if torch.cuda.is_available():
                device = torch.device("cuda")
                print('use GPU recource')
            else:
                device = torch.device("cpu")
                print('not find effcient GPU, use CPU recource')
        else:
            device = torch.device("cpu")
            print('use CPU recource')
        return device

    def _build_model(self):
        """
        Build the estimator for ``self.task_type`` and save both configs.
        """
        # NOTE(review): max_leaf_nodes, min_impurity_split and max_samples
        # look like scikit-learn forest options; confirm XGBoost uses them.
        _config = {
            'n_estimators': self.n_estimators,
            'max_leaf_nodes': self.max_leaf_nodes,
            'min_impurity_split': self.min_impurity_split,
            'n_jobs': self.n_jobs,
            'random_state': self.random_state,
            'max_samples': self.max_samples
        }
        if self.task_type == 'binaryclass':
            self.predictor = XGBClassifier(**_config,
                                           objective='binary:logistic',
                                           eval_metric="logloss")
        elif self.task_type == 'multiclass':
            self.predictor = XGBClassifier(**_config)
        elif self.task_type == 'multilabel':
            xgb_estimator = XGBClassifier(**_config,
                                          objective='binary:logistic',
                                          eval_metric="logloss")
            self.predictor = MultiOutputClassifier(xgb_estimator)
        elif self.task_type == 'regression':
            self.predictor = XGBRFRegressor(**_config)
        self._save_config(_config, 'predictor')
        _config = {'tasktype': self.task_type}
        self._save_config(_config, 'tasktype')

    def fit(self, data_dict, X=None, y=None, assign_task_type=None):
        """Train the predictor and store it as ``best.model``.

        Parameters
        ----------
        data_dict : dict or None
            File-based training set with the schema documented on
            ``_data_check``. Takes precedence over ``X``/``y``.
        X, y : array-like, optional
            In-memory features and labels, used when ``data_dict`` is None.
        assign_task_type : str, optional
            Force a task type instead of inferring it from the labels.

        Raises
        ------
        Exception
            If neither ``data_dict`` nor both ``X`` and ``y`` are given, or
            the data fails the task-type checks.
        """
        self.task_type = assign_task_type
        if data_dict is not None:
            self._data_check([data_dict])
            data = ml_reader.DatasetReader(
                data_dict, device=self.device,
                task_type=self.task_type).get_data()
            _X = np.array(data['X'])
            _y = np.array(data['Y'])
        elif X is not None and y is not None:
            # In-memory data has no episode files to verify; only the task
            # type is derived from the labels.
            self._reconcile_task_type({
                label_check(y, hat_y=None, assign_task_type=self.task_type)
            })
            _X = X
            _y = y
        else:
            raise Exception('fill in correct data for model train')

        print(np.shape(_X), np.shape(_y))
        self._build_model()
        self.predictor.fit(_X, _y)
        model_path = os.path.join(self.checkout_dir, 'best.model')
        joblib.dump(self.predictor, model_path)

    def _save_config(self, config, config_type):
        """Write ``config`` as JSON to ``<checkout_dir>/<config_type>_config.json``."""
        temp_path = os.path.join(self.checkout_dir,
                                 "{0}_config.json".format(config_type))
        if os.path.exists(temp_path):
            os.remove(temp_path)
        with open(temp_path, "w", encoding='utf-8') as f:
            f.write(json.dumps(config, indent=4))

    def _load_config(self, config_type):
        """Read back the JSON config written by ``_save_config``."""
        temp_path = os.path.join(self.checkout_dir,
                                 '{0}_config.json'.format(config_type))
        assert os.path.exists(
            temp_path
        ), 'cannot find {0}_config.json, please it in dir {1}'.format(
            config_type, self.checkout_dir)
        with open(temp_path, 'r') as f:
            config = json.load(f)
        return config

    def load_model(self):
        """Restore the task type and the predictor saved by ``fit``.

        Reads ``tasktype_config.json`` and ``best.model`` from
        ``self.checkout_dir``.
        """
        model_path = os.path.join(self.checkout_dir, 'best.model')
        self.task_type = self._load_config('tasktype')['tasktype']
        self.predictor = joblib.load(model_path)

    def inference(self, data_dict, X=None, y=None):
        """Predict on test data and pickle the predictions and the truth.

        Writes ``hat_y`` (predictions) and ``y`` (ground truth) into
        ``self.result_dir``. ``self.task_type`` and ``self.predictor`` must
        already be set by ``fit`` or ``load_model``.

        Parameters
        ----------
        data_dict : dict or None
            File-based test set with the schema documented on
            ``_data_check``. Takes precedence over ``X``/``y``.
        X, y : array-like, optional
            In-memory features and labels, used when ``data_dict`` is None.

        Raises
        ------
        Exception
            If no usable data is given or the task type is unsupported.
        """
        if data_dict is not None:
            self._data_check([data_dict])
            data = ml_reader.DatasetReader(
                data_dict, device=self.device,
                task_type=self.task_type).get_data()
            _X = data['X']
            _y = data['Y']
        elif X is not None and y is not None:
            _X = X
            _y = y
        else:
            raise Exception('fill in correct data for model inference')

        real_v = np.array(_y)
        if self.task_type == 'regression':
            # Regressors have no predict_proba; use the predicted values.
            real_v = real_v.reshape(-1, 1)
            prob_v = np.asarray(self.predictor.predict(_X)).reshape(-1, 1)
        elif self.task_type == 'binaryclass':
            real_v = real_v.reshape(-1, 1)
            # Column 1 holds the probability of the positive class.
            prob_v = self.predictor.predict_proba(_X)[:, 1].reshape(-1, 1)
        elif self.task_type == 'multiclass':
            prob_v = self.predictor.predict_proba(_X).reshape(
                -1, np.shape(real_v)[1])
        elif self.task_type == 'multilabel':
            prob_v = []
            _prob_v = self.predictor.predict_proba(_X)
            for each_class in _prob_v:
                if len(each_class) == 1:
                    each_class = np.array([each_class])
                if np.shape(each_class)[1] == 2:
                    v = each_class[:, 1].reshape((-1, 1))
                else:
                    v = each_class
                prob_v.append(v)
            prob_v = np.concatenate(prob_v, 1)
        else:
            raise Exception('unsupported task_type {0}'.format(
                self.task_type))

        with open(os.path.join(self.result_dir, 'hat_y'), 'wb') as f:
            pickle.dump(prob_v, f)
        with open(os.path.join(self.result_dir, 'y'), 'wb') as f:
            pickle.dump(real_v, f)

    def get_results(self):
        """Load the saved prediction results of the current experiment.

        truth_value: proj_root/experiments_records/<exp_id>/results/y
        predict_value: proj_root/experiments_records/<exp_id>/results/hat_y

        Returns
        -------
        dict
            ``{'hat_y': predictions, 'y': ground truth}``.

        Raises
        ------
        IOError
            If either result file cannot be opened; the failing path is
            printed before the error is re-raised.
        """
        results = {}
        for key in ('hat_y', 'y'):
            path = os.path.join(self.result_dir, key)
            try:
                with open(path, 'rb') as f:
                    results[key] = pickle.load(f)
            except IOError:
                print('Error: cannot find file {0} or load failed'.format(
                    path))
                raise
        return results
# 1. 데이터 datasets = load_boston() x = datasets.data y = datasets.target print("init x.shape:", x.shape) # 1.1 데이터 전처리 (train_test_split) x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=44, shuffle=True, test_size=0.2) # 2 모델 (XGBRFRegressor) model = XGBRFRegressor(max_depth=4) model.fit(x_train, y_train) # 4. 평가 acc = model.score(x_test, y_test) print("acc:", acc) print(model.feature_importances_) # 피쳐 임포턴스 자르는 함수 def earseLowFI_index(fi_arr, low_value, input_arr): input_arr = input_arr.T temp = [] for i in range(fi_arr.shape[0]): if fi_arr[i] >= low_value: temp.append(input_arr[i, :]) temp = np.array(temp)
from sklearn.metrics import mean_squared_error as MSE
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from time import time
from function import plot_learning_curve

# NOTE(review): load_boston, train_test_split, XGBR and KFold are not imported
# in the visible part of this script - presumably imported above.
boston = load_boston()
X, y = boston.data, boston.target
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Build the gradient-boosted tree model
xgbr = XGBR(n_estimators=100)
xgbr.fit(x_train, y_train)

# Predict on the test split
predict = xgbr.predict(x_test)

# Compute the mean squared error (the prediction is recomputed here rather
# than reusing `predict`)
print(MSE(y_test, xgbr.predict(x_test)))

# Plot the learning curve
cv = KFold(n_splits=5, shuffle=True, random_state=32)
plot_learning_curve(XGBR(n_estimators=100, random_state=30), 'XGBR', X, y, ax=None, cv=cv)
plt.show()

# Observation from the plot: with very little data the model overfits; as the
# amount of data grows, its ability to generalise keeps improving.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRFRegressor
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_diabetes

# Diabetes regression data with a fixed 80/20 split.
x, y = load_diabetes(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42,
                                                    shuffle=True, train_size=0.8)

# Reference score of a default model trained on all features.
model1 = XGBRFRegressor()
model1.fit(x_train, y_train)
default_score = model1.score(x_test, y_test)

model = XGBRFRegressor()
model.fit(x_train, y_train)
print(model.feature_importances_)

# Cut-off: the importance found 70% of the way down the descending ranking.
index7 = np.sort(model.feature_importances_)[::-1][int(
    0.7 * len(model.feature_importances_))]

# Indices of the features whose importance is below the cut-off.
# enumerate() gives each feature's own position; looking the value up with
# list.index() returned the first match, so tied importances (for example
# several zeros) produced duplicate indices and skipped features.
delete_list = []
for feature_idx, i in enumerate(model.feature_importances_):
    if i < index7:
        print(i, "제거 ")
        delete_list.append(feature_idx)
# gpu_id=0, tree_method='gpu_hist' # model1 = XGBRFRegressor(n_estimators= 300,learning_rate=1,colsample_bytree=1,colsample_bylevel=1,max_depth=50,subsample=0.8, n_jobs=-1) model2 = XGBRFRegressor(n_estimators=400, learning_rate=1, colsample_bytree=1, colsample_bylevel=1, max_depth=50) # model3 = XGBRFRegressor(n_estimators= 350,learning_rate=1,colsample_bytree=1,colsample_bylevel=1,max_depth=40,subsample=1,n_jobs=-1) # model4 = XGBRFRegressor(n_estimators= 100,learning_rate=1,colsample_bytree=1,colsample_bylevel=0.7,max_depth=30,n_jobs=-1) # model = GridSearchCV(model, parameters, cv =5) # model = MultiOutputRegressor(model2) warnings.filterwarnings('ignore') # model1.fit(x_train, y1_train) model2.fit([x1_train, x2_train], y2_train) # model3.fit(x_train, y3_train) # model4.fit(x_train, y4_train) # y1_pred = model1.predict(x_test) # print(y1_pred) # print(y1_pred.shape) y2_pred = model2.predict([x1_test, x2_test]) # print(y2_pred) # print(y2_pred.shape) # y3_pred = model3.predict(x_test) # print(y3_pred) # print(y3_pred.shape)