def test_boston_housing_rf_regression():
    # Note: load_boston was removed in scikit-learn 1.2; later versions of this
    # test (see below) use fetch_california_housing instead.
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.model_selection import KFold

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    # `rng` is a module-level random state defined elsewhere in the test suite.
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRFRegressor(random_state=42).fit(
            X[train_index], y[train_index])
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        assert mean_squared_error(preds, labels) < 35
def optimal_model(df):
    clf_list = [RandomForestRegressor(), xgb.XGBRFRegressor()]
    # GradientBoostingRegressor(), LinearRegression(),
    #######################################################
    n_folds = 4
    X = df.drop('count', axis=1)
    y = df['count']
    # Keep the KFold object itself; .get_n_splits() would only return the
    # integer 4 and discard shuffle/random_state.
    kf = KFold(n_folds, shuffle=True, random_state=42)

    n_estimatorslist = [100, 300, 500]
    maxdepthList = [3, 5, 10]
    learning_ratelist = [0.01, 0.1, 1]
    colsample_bytreelist = [0.4, 0.6]
    gammalist = [0]
    gridBool = [True, False]
    bestscorelist = []
    best_searchlist = []
    ###################################################
    param_GridList = [
        # [{'fit_intercept': gridBool}],  # Linear regressor
        [{'n_estimators': n_estimatorslist,
          'max_depth': maxdepthList}],  # for Random Forest
        [{'n_estimators': n_estimatorslist,
          'max_depth': maxdepthList,  # was misspelled 'maxdepth'
          'learning_rate': learning_ratelist,
          'colsample_bytree': colsample_bytreelist,
          'gamma': gammalist}]  # xgb
    ]
    #####################################################
    def rmsle(y_true, y_pred):
        # RMSLE: add 1 before taking the log (the original logged first and
        # added 1 afterwards)
        error = np.sqrt(mean_squared_error(np.log1p(y_true), np.log1p(y_pred)))
        return error

    my_scoring = make_scorer(rmsle, greater_is_better=False)
    #####################################################
    for clf, params in zip(clf_list, param_GridList):
        best_search = GridSearchCV(estimator=clf, param_grid=params,
                                   cv=kf, scoring=my_scoring)
        best_search.fit(X, y)
        bestParams = best_search.best_params_
        # best_score_ is the negated RMSLE, so just flip the sign
        # (the original applied an extra sqrt here).
        bestscore = round(-best_search.best_score_, 5)
        bestscorelist.append(bestscore)
        best_searchlist.append(best_search)
        print('model, the best params are {}, the best score is {}'
              .format(bestParams, bestscore))
    return bestscorelist, best_searchlist
def test_num_parallel_tree(): from sklearn.datasets import load_boston reg = xgb.XGBRegressor(n_estimators=4, num_parallel_tree=4, tree_method='hist') boston = load_boston() bst = reg.fit(X=boston['data'], y=boston['target']) dump = bst.get_booster().get_dump(dump_format='json') assert len(dump) == 16 reg = xgb.XGBRFRegressor(n_estimators=4) bst = reg.fit(X=boston['data'], y=boston['target']) dump = bst.get_booster().get_dump(dump_format='json') assert len(dump) == 4 config = json.loads(bst.get_booster().save_config()) assert int(config['learner']['gradient_booster']['gbtree_train_param'][ 'num_parallel_tree']) == 4
def test_xgboost_regressor_unwrapped(self): """ Validate xgboost regressor without wrapper """ X, y = make_regression(n_samples=500, n_features=22, n_informative=8, random_state=8311982) X_train, X_test, y_train, y_test = tts(X, y) model = xgb.XGBRFRegressor() oz = residuals_plot(model, X_train, y_train, X_test, y_test, show=False) assert is_fitted(oz)
def test_num_parallel_tree(): from sklearn.datasets import fetch_california_housing reg = xgb.XGBRegressor(n_estimators=4, num_parallel_tree=4, tree_method='hist') X, y = fetch_california_housing(return_X_y=True) bst = reg.fit(X=X, y=y) dump = bst.get_booster().get_dump(dump_format='json') assert len(dump) == 16 reg = xgb.XGBRFRegressor(n_estimators=4) bst = reg.fit(X=X, y=y) dump = bst.get_booster().get_dump(dump_format='json') assert len(dump) == 4 config = json.loads(bst.get_booster().save_config()) assert int(config['learner']['gradient_booster']['gbtree_train_param'] ['num_parallel_tree']) == 4
def average_score(self, params):
    forrest = self.forrest
    x_train = self.x_train
    x_test = self.x_test
    y_train = self.y_train
    y_test = self.y_test
    features = self.features
    targets = self.targets
    split = self.split
    '''
    This method tries the first 50 random seeds for the train/test split,
    then returns the average of the R^2 scores across those seeds.

    params : usually the value returned by the best-param method, but you can
             pass any parameters you want, as long as they appear in the
             xgboost regressor documentation
    '''
    nums = []
    seeds = []
    for num in range(0, 50):
        try:
            x_train, x_test, y_train, y_test = train_test_split(
                features, targets, test_size=split, random_state=num)
            if forrest == False:
                xgb_r = xg.XGBRegressor(**params)
            else:
                xgb_r = xg.XGBRFRegressor(**params)
            # Fit the model
            xgb_r.fit(x_train, y_train)
            # Predict with the model
            pred = xgb_r.predict(x_test)
            r2 = metrics.r2_score(y_test.values, pred)
            nums.append(r2)
            seeds.append(num)
        except Exception as e:
            print(e)
            continue
    # store the seed that produced the best R^2
    # (the original stored the R^2 value itself)
    self.best_random_seed = seeds[nums.index(max(nums))]
    return sum(nums) / len(nums)
def r2_graph(self): ''' plots the predicted value aginst the actual value for both x_train, and x_test ''' x_train = self.x_train x_test = self.x_test y_train = self.y_train y_test = self.y_test forrest = self.forrest params = self.params best_param = self.best_grid features = self.features targets = self.targets split = self.split x_train, x_test, y_train, y_test = train_test_split (features, targets, test_size=split, random_state = 24) if forrest == False: xgb_r = xg.XGBRegressor(**best_param) else: xgb_r = xg.XGBRFRegressor(**best_param) # Fitting the model xgb_r.fit(x_train, y_train) # Predict the model pred_test = xgb_r.predict(x_test) pred_train = xgb_r.predict(x_train) r2_test = metrics.r2_score(y_test.values, pred_test) r2_train = metrics.r2_score(y_train.values, pred_train) #print(num) plt.scatter(pred_test,y_test, marker = "D", color = 'blue', label = 'Test Data') plt.scatter(pred_train,y_train, marker = "D", color = 'red', label = 'Train Data') plt.xlabel('Predicted Value') plt.ylabel('Actual Value') plt.legend() plt.show()
def test_num_parallel_tree(): from sklearn.datasets import fetch_california_housing reg = xgb.XGBRegressor(n_estimators=4, num_parallel_tree=4, tree_method="hist") X, y = fetch_california_housing(return_X_y=True) bst = reg.fit(X=X, y=y) dump = bst.get_booster().get_dump(dump_format="json") assert len(dump) == 16 reg = xgb.XGBRFRegressor(n_estimators=4) bst = reg.fit(X=X, y=y) dump = bst.get_booster().get_dump(dump_format="json") assert len(dump) == 4 config = json.loads(bst.get_booster().save_config()) assert ( int( config["learner"]["gradient_booster"]["gbtree_model_param"][ "num_parallel_tree" ] ) == 4 )
def test_regression_random_forest(): base_score = 0.6 estimator = xgboost.XGBRFRegressor(n_estimators=2, random_state=1, max_depth=1, base_score=base_score) utils.get_regression_model_trainer()(estimator) assembler = assemblers.XGBoostModelAssemblerSelector(estimator) actual = assembler.assemble() expected = ast.BinNumExpr( ast.BinNumExpr( ast.NumVal(0.6), ast.IfExpr( ast.CompExpr(ast.FeatureRef(5), ast.NumVal(6.94099998), ast.CompOpType.GTE), ast.NumVal(18.1008453), ast.NumVal(9.60167599)), ast.BinNumOpType.ADD), ast.IfExpr( ast.CompExpr(ast.FeatureRef(5), ast.NumVal(6.79699993), ast.CompOpType.GTE), ast.NumVal(17.780262), ast.NumVal(9.51712894)), ast.BinNumOpType.ADD) assert utils.cmp_exprs(actual, expected)
# XGBoost (tree method "hist")
regression(xgboost.XGBRegressor(**XGBOOST_HIST_PARAMS), test_fraction=0.2),
classification(xgboost.XGBClassifier(**XGBOOST_HIST_PARAMS), test_fraction=0.2),
classification_binary(xgboost.XGBClassifier(**XGBOOST_HIST_PARAMS), test_fraction=0.2),

# XGBoost (LINEAR)
regression(xgboost.XGBRegressor(**XGBOOST_PARAMS_LINEAR)),
classification(xgboost.XGBClassifier(**XGBOOST_PARAMS_LINEAR)),
classification_binary(xgboost.XGBClassifier(**XGBOOST_PARAMS_LINEAR)),

# XGBoost (RF)
regression(xgboost.XGBRFRegressor(**XGBOOST_PARAMS_RF)),
classification(xgboost.XGBRFClassifier(**XGBOOST_PARAMS_RF)),
classification_binary(xgboost.XGBRFClassifier(**XGBOOST_PARAMS_RF)),

# XGBoost (Boosted Random Forests)
regression(xgboost.XGBRegressor(**XGBOOST_PARAMS_BOOSTED_RF)),
classification(xgboost.XGBClassifier(**XGBOOST_PARAMS_BOOSTED_RF)),
classification_binary(
    xgboost.XGBClassifier(**XGBOOST_PARAMS_BOOSTED_RF)),

# XGBoost (Large Trees)
regression_random(xgboost.XGBRegressor(**XGBOOST_PARAMS_LARGE)),
classification_random(xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)),
classification_binary_random(
    xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)),
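# The XGBOOST_PARAMS_* dictionaries referenced above are defined elsewhere in
# this test suite. Below is only a minimal illustrative sketch of what the RF
# and boosted-RF configurations could look like (example names and values,
# not the project's actual settings):
XGBOOST_PARAMS_RF_EXAMPLE = {
    "n_estimators": 10,        # number of trees in the forest
    "subsample": 0.8,          # row subsampling per tree
    "colsample_bynode": 0.8,   # feature subsampling per split
    "random_state": 0,
}
XGBOOST_PARAMS_BOOSTED_RF_EXAMPLE = {
    "n_estimators": 5,         # boosting rounds
    "num_parallel_tree": 3,    # a small random forest grown per boosting round
    "subsample": 0.8,
    "colsample_bynode": 0.8,
    "random_state": 0,
}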
column_name_list.remove('time_to_failure') print(len(column_name_list)) feature_scaler = StandardScaler() feature_df[column_name_list] = feature_scaler.fit_transform(feature_df[column_name_list]) # Initialize models clf_ridg = Ridge(max_iter=5000) clf_laso = Lasso(max_iter=5000) clf_lala = LassoLars(max_iter=5000) clf_enet = ElasticNet(max_iter=5000) clf_xgbr = xgb.XGBRegressor() clf_xgrf = xgb.XGBRFRegressor() clf_rf = RandomForestRegressor(criterion='mae', max_features='sqrt') clf_tree = ExtraTreesRegressor(criterion='mae', max_features='sqrt') clf_ada = AdaBoostRegressor() clf_grad = GradientBoostingRegressor() clf_svr = SVR() # Model parameters # mae 2.160 param_ridg = { 'alpha': [1, 10, 30, 100, 300, 1000], # 300 'tol': [0.00001, 0.0000001, 0.000000001, 0.00000000001], # 1e-5 'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], # sparse_cg
# In[46]:

spark.conf.set("spark.synapse.ml.predict.enabled", "true")

# ## Train and Save Model
# ### Training

# In[47]:

data = np.random.rand(5, 10)  # 5 entities, each contains 10 features
label = np.random.randint(1, size=5)  # target vector (randint(1) yields only zeros)
dtrain = xgb.DMatrix(data, label=label)

# 'reg:squarederror' replaces the deprecated 'reg:linear' alias
xgr = xgb.XGBRFRegressor(objective='reg:squarederror', n_estimators=10, seed=123)
xgr.fit(data, label)

# In[48]:

xgr.save_model('./model.json')

# In[49]:

mlflow.pyfunc.save_model(data_path='./model.json',
                         path='./xgboost_pyfunc_model_path',
                         loader_module='mlflow.xgboost')

# In[50]:

MODEL_URI = './xgboost_pyfunc_model_path'
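# A minimal sketch (not part of the original notebook) of how the saved pyfunc
# model could be loaded back and used for prediction with plain MLflow,
# assuming the MODEL_URI defined above:
import pandas as pd
import mlflow.pyfunc

loaded_model = mlflow.pyfunc.load_model(MODEL_URI)
preds = loaded_model.predict(pd.DataFrame(data))  # same 5x10 feature matrix as above
print(preds)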
print(pca2.explained_variance_)
X_pca = pca2.transform(X_scaled)
pca_train = pd.DataFrame(data=X_pca,
                         columns=['principal component 1',
                                  'principal component 2',
                                  'principal component 3'])
print('Explained variation per principal component: {}'.format(
    pca2.explained_variance_ratio_))
#%%
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y,
                                                    test_size=.2,
                                                    random_state=42)
#%%
# 'reg:squarederror' replaces the deprecated 'reg:linear' alias
xg_without_tuning = xg.XGBRFRegressor(objective='reg:squarederror',
                                      n_estimators=10, seed=123)
xg_without_tuning.fit(X_train, y_train)
pred = xg_without_tuning.predict(X_test)
MAE = mean_absolute_error(y_test, pred)
print("MAE: %f" % MAE)

DM_train = xg.DMatrix(X_train, y_train)
DM_test = xg.DMatrix(X_test, y_test)
params = {'booster': 'gblinear', 'objective': 'reg:squarederror'}
xg_reg = xg.train(params=params, dtrain=DM_train, num_boost_round=5)
pred1 = xg_reg.predict(DM_test)
MAE1 = mean_absolute_error(y_test, pred1)
print("MAE1: %f" % MAE1)  # this is an MAE, not an RMSE
ames_X_test.iloc[[0]]
ames_df.loc[[2661]]
shap.summary_plot(vals, ames_X_test)
shap.dependence_plot('Overall Qual', shap_values=vals, features=ames_X_test)

# ## XGBoost
# Powerful algorithm using "boosting" (like golfing) to predict the target.
# Note that XGBRFRegressor is the random-forest variant of XGBoost, which bags
# trees rather than boosting them.
dt = tree.DecisionTreeRegressor(max_depth=10)
dt.fit(auto_X_train, auto_y_train)
dt.score(auto_X_test, auto_y_test)

xg = xgb.XGBRFRegressor()
xg.fit(auto_X_train, auto_y_train)
xg.score(auto_X_test, auto_y_test)
xg

xgb.plot_importance(xg)

booster = xg.get_booster()
print(booster.get_dump()[0])
print(booster.get_dump()[1])
print(booster.get_dump()[-1])
from sklearn import ensemble
from sklearn import linear_model
import xgboost as xgb

MODELS = {
    "randomforest_classifier": ensemble.RandomForestClassifier(n_estimators=200,
                                                               n_jobs=-1,
                                                               verbose=2),
    "randomforest_regressor": ensemble.RandomForestRegressor(n_estimators=200,
                                                             n_jobs=-1,
                                                             verbose=2),
    # note: these are the random-forest variants of the XGBoost estimators
    "xgb_classifier": xgb.XGBRFClassifier(
        learning_rate=1,
        subsample=0.9,
    ),
    "xgb_regressor": xgb.XGBRFRegressor(learning_rate=1, subsample=0.9),
    "logistic_regressor": linear_model.LogisticRegression(
        penalty='elasticnet',  # requires l1_ratio to be set when using saga
        fit_intercept=True,
        class_weight='balanced',
        random_state=42,
        solver='saga',
        verbose=2,
        n_jobs=-1,
    )
    # TODO: add more models here
}
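# A minimal sketch of how a dispatcher like MODELS is typically used: pick an
# estimator by name and fit it. The dataset here is synthetic and the
# `model_key` name is hypothetical, not part of the original module.
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=10, random_state=0)
model_key = "xgb_regressor"
estimator = MODELS[model_key]
estimator.fit(X_demo, y_demo)
print(model_key, "R^2:", estimator.score(X_demo, y_demo))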
y_data = pca.transform(y_data)
print(y_data.shape)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data,
                                                    train_size=0.8,
                                                    random_state=33)
print(y_test.shape)
print(y_train.shape)

# model = DecisionTreeRegressor(max_depth=4)  # above a certain max_depth it stops separating the data well
# model = RandomForestRegressor(n_estimators=200, max_depth=3)
# model = GradientBoostingRegressor()
model = xgb.XGBRFRegressor(eta=0.1, max_depth=5, colsample_bytree=0.5)
model.fit(x_train, y_train)
y_testpred = model.predict(x_test)
# y_test = pca.inverse_transform(y_test)
# y_testpred = pca.inverse_transform(y_testpred)
score = model.score(x_test, y_test)
print(score)

y4 = model.predict(x_prdeict)
print(y4.shape)
y4 = y4.reshape(y4.shape[0], 1)
y4 = pca.inverse_transform(y4)
y4 = scaler.inverse_transform(y4)
print(y_testpred.shape)
print(y4.shape)
# def tree_fit(y_train, y_test):
# ### Predictions # In[10]: file_1 = "../data/Test.csv" file_2 = "../data/additional_data/testRoot_edited.csv" processor = DataProcessor(file_1, file_2, test = True, minimal = True) x_test = processor.get_numpy_data(fillna = True, additional = True, encode = True, np_split = False, enocde_user = False, normalize = True, drop_ones = False) #print(x_test.head()) # In[6]: param_dist = {'objective':'reg:squarederror', 'n_estimators':1300, 'max_depth':9, 'min_child_weight': 49} bst = xgb.XGBRFRegressor(**param_dist) bst.fit(x_train, y_train.ravel(), eval_set=[(x_valid, y_valid)], verbose = True) pr = bst.predict(x_test) print(pr) # In[ ]: #test
df_test.head() x = x.drop(['scaled_amount','scaled_time'],axis=1) df_test = df_test.drop(['Time','Amount'],axis=1) x x.shape y.shape clf = RandomForestClassifier(max_depth=2, random_state=0) clf.fit(x, y) clf.feature_importances_ xb = xgb.XGBRFRegressor() xb.fit(x, y) xb.feature_importances_ def roc_curve_plots(y_test,y_predict_wrf,X_test,model): print(classification_report(y_test,y_predict_wrf),"\n") neigh_prob_linear=model.predict_proba(X_test) neigh_prob_linear1=neigh_prob_linear[:,1] fpr,tpr,thresh=roc_curve(y_test,neigh_prob_linear1) roc_auc_neigh=auc(fpr,tpr) plt.figure(dpi=80) plt.title("ROC Curve") plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.plot(fpr,tpr,'b',label='AUC Score = %0.2f'%roc_auc_neigh)
test_id_idx = test_df.index

print('X_train : ', len(X_train))
print('X_val : ', len(X_val))
print('X_test : ', len(X_test))

# Train with XGBoost, a model popular on Kaggle
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

param = {
    'n_estimators': range(550, 700, 50),
    'colsample_bytree': [0.5, 0.7, 1],
    'colsample_bylevel': [0.5, 0.7, 1],
}
model = xgb.XGBRFRegressor()
grid_search = GridSearchCV(estimator=model, param_grid=param, cv=5,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

# Use MSE as the validation metric
from sklearn.metrics import mean_squared_error, mean_absolute_error
pred_train = grid_search.predict(X_train)
pred_val = grid_search.predict(X_val)
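# A small follow-up sketch (assuming the X_train/X_val split used above and a
# matching y_val vector, which is not shown in the original snippet): turn the
# squared-error metric into RMSE for the train and validation predictions.
import numpy as np

rmse_train = np.sqrt(mean_squared_error(y_train, pred_train))
rmse_val = np.sqrt(mean_squared_error(y_val, pred_val))
mae_val = mean_absolute_error(y_val, pred_val)
print('train RMSE:', rmse_train)
print('val RMSE:', rmse_val, ' val MAE:', mae_val)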
x, y = run_it('Final_Data.csv', 7000000, 'CV') #%% # xgb_reg_model = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=1, # learning_rate=0.4, max_depth=20, # alpha=10, n_estimators=20) n_est = 20 max_depth = 20 alpha = 11 learning_rate = 0.4 xgb_reg_model = xgb.XGBRFRegressor(objective='reg:squarederror', colsample_bytree=1, min_child_weight=2, max_depth=max_depth, learning_rate=learning_rate, tree_method='hist', n_estimators=n_est, alpha=alpha) kfold = KFold(n_splits=5, shuffle=True, random_state=4) kfold_scores = cross_val_score(xgb_reg_model, x, y, scoring='neg_mean_squared_log_error', cv=kfold) kfold_scores = np.absolute(kfold_scores) print(np.sqrt(kfold_scores.mean())) # print("Beginning to Train the Model") # start = time.time() # xgb_reg_model.fit(x_train_pp, y_train)
def optuna_tuner(self, lower_bound = .80 , upper_bound = 1.20 ): warnings.filterwarnings('ignore') epoch = self.epoch n_trials = self.n_trials params = self.params x_train = self.x_train x_test = self.x_test y_train = self.y_train y_test = self.y_test forrest = self.forrest ''' Method Summary. This method uses Optuna's parameter tunning recurisvly to tune a XG Boost Regression model's hyper parameters. Optuna is more efficient then sklearns random grid as it prunes trees that are not promising to use more of its processing power on promising param combinations. This means that Optuna get better faster then sklearn's random or grid search epoch: This parameter moderates how many times the tunnner will cycle through a random search. Keep in mind that every epoch the parameters used are narrowed down by the random search run during the last epoch n_trials: this parameter dictates how many trials will be run each time a random search is called. Cross validataion is set to 5 so if n_trial was set to 20 each random search will actualy run through 100 trials params: this parameter is the hyperparamter grid you want to initialy feed into the tunner. An example parameter grid for the xgboost regression looks like this. params = { 'objective' : ['reg:gamma'], 'n_estimators' : range(50, 130,10), #500 'max_depth' : range(2,25), 'tree_method' : ['auto', 'exact','approx', 'hist'], 'booster' : ['gbtree', 'gblinear', 'dart'], 'sampling_method' : ['gradient_based'], 'reg_alpha' : [.05,.1,.15,.20,.25,.30], 'reg_lambda' : [0,.2,.4,.6,.8,1], 'learning_rate' : [.05,.08,.1,.15,.20], 'gamma' : [ 0.0, 0.1, 0.2], 'min_child_weight' : [ 1, 3, 5, 7], 'colsample_bytree': list(float_range(decimal.Decimal(0), decimal.Decimal(1), '0.01')), 'colsample_bylevel':list(float_range(decimal.Decimal(0), decimal.Decimal(1), '0.01')), 'colsample_bynode': list(float_range(decimal.Decimal(0), decimal.Decimal(1), '0.01')), 'importance_type' : ['gain', 'weight', 'cover', 'total_gain','total_cover']} x_train: The train features x_test: the test features y_train: the train targets y_test: the test targets Use this syntax to access the pandas data frame after tunning the model with either sklearn or optuna XGParis(**kwargs).best_params ''' top_params = [] all_params = pd.DataFrame(columns = ('scores', 'params')) def objective (trial: Trial, param_dic = params): new_params = {} for item in (param_dic): new_params[str(item)] = trial.suggest_categorical(str(item),list(param_dic[str(item)])) if forrest == False: xgb_r = xg.XGBRegressor(**new_params) else: xgb_r = xg.XGBRFRegressor(**new_params) xgb_r.fit(x_train,y_train) score = model_selection.cross_val_score(xgb_r, x_train, y_train, n_jobs=-1, cv=5) accuracy = score.mean() return accuracy study = optuna.create_study(direction='maximize',sampler=TPESampler()) study.optimize(lambda trial : objective(trial),n_trials= n_trials) for item in study.trials: all_params.loc[len(all_params)] = (item.value, item.params) param_dic_random = study.best_trial.params top_params.append([study.best_trial.value,study.best_trial.params]) counter = 2 def repeater(epoch,counter, param_dic): def objective_2 (trial: Trial, param_dic = param_dic): new_params = {} for item in (param_dic): if type(param_dic[str(item)]) == int: new_params[str(item)] = trial.suggest_int(str(item), param_dic[str(item)]*lower_bound, param_dic[str(item)]*upper_bound) elif type(param_dic[str(item)]) == float: if item == 'colsample_bytree' or item == 'colsample_bylevel' or item == 'colsample_bynode' : if param_dic[str(item)]*1.25 >= 1: 
new_params[str(item)] = trial.suggest_float(str(item), param_dic[str(item)]*lower_bound, 1) else: new_params[str(item)] = trial.suggest_float(str(item), param_dic[str(item)]*lower_bound, param_dic[str(item)]*upper_bound) else: new_params[str(item)] = trial.suggest_float(str(item), param_dic[str(item)]*lower_bound, param_dic[str(item)]*upper_bound) elif type(param_dic[str(item)]) == str: new_params[str(item)] = trial.suggest_categorical(str(item), [(param_dic[str(item)])]) else: print('error, skipped ' + str(item)) continue #print(new_params) if forrest == False: xgb_r = xg.XGBRegressor(**new_params) else: xgb_r = xg.XGBRFRegressor(**new_params) xgb_r.fit(x_train,y_train) score = model_selection.cross_val_score(xgb_r, x_train, y_train, n_jobs=-1, cv=5) accuracy = score.mean() return accuracy study = optuna.create_study(direction='maximize',sampler=TPESampler()) study.optimize(lambda trial : objective_2(trial),n_trials= n_trials) for item in study.trials: all_params.loc[len(all_params)] = (item.value, item.params) #print(study) if counter >= epoch: counter += 1 #print('epoch - ' + str(counter) + ' Done') return study.best_trial.params else: counter += 1 #print('epoch - ' + str(counter)) top_params.append([study.best_trial.value,study.best_trial.params]) return repeater(epoch,counter,study.best_trial.params) final_param = repeater(epoch,counter,param_dic_random) scores = [] params_list = [] for item in top_params: scores.append(item[0]) params_list.append(item[1]) top = pd.DataFrame(columns= ['scores','params']) top['scores'] = scores top['params'] = params_list if forrest == False: xgb_r = xg.XGBRegressor(seed = 123) else: xgb_r = xg.XGBRFRegressor(seed = 123) param_dic = top['params'][list(top['scores']).index(max(list(top['scores'])))] new_params = {} for item in param_dic: if type(param_dic[str(item)]) == str: new_params[str(item)] = params[str(item)] elif type(param_dic[str(item)]) == int: new_params[str(item)] = [(param_dic[str(item)])] elif type(param_dic[str(item)]) == float: new_params[str(item)] = [(param_dic[str(item)])] print(new_params) xgb_grid = GridSearchCV(estimator =xgb_r, param_grid = new_params, cv = 5, verbose=2, n_jobs = -1) xgb_grid.fit(x_train, y_train) top_params.append([(xgb_grid.best_score_),(xgb_grid.best_params_)]) self.best_grid = top['params'][list(top['scores']).index(max(list(top['scores'])))] self.best_params = top self.all_params = all_params self.optuna = True
#### restore Height prediction model (input: "Sex", "age"; model: XGBRegressor)
#### when Height is null, estimate the missing value with this model
height_model = xgboost.XGBRegressor(max_depth=10, learning_rate=1, n_estimators=100)
height_model.load_model(os.path.join(RESULT, "XGBRegressor_Height.model"))
with open(os.path.join(RESULT, "scaler_Height.pkl"), 'rb') as f:
    scaler = pickle.load(f)

#### fill null
data = scaler.transform(df.loc[df.Height.isnull(), ["Sex", "age"]])
pred = height_model.predict(data)
df.loc[df.Height.isnull(), "Height"] = (pred * patient_info['Height']['std']) + patient_info['Height']['mean']

#### restore Weight prediction model (input: "Height", "PatientSex", "age"; model: XGBRFRegressor)
#### when Weight is null, estimate the missing value with this model
weight_model = xgboost.XGBRFRegressor(max_depth=10, learning_rate=1, n_estimators=300)
weight_model.load_model(os.path.join(RESULT, "XGBRFRegressor_Weight.model"))
with open(os.path.join(RESULT, "scaler_Weight.pkl"), 'rb') as f:
    scaler = pickle.load(f)

#### fill null
data = scaler.transform(df.loc[df.Weight.isnull(), ["Height", "PatientSex", "age"]])
pred = weight_model.predict(data)
df.loc[df.Weight.isnull(), "Weight"] = (pred * patient_info['weight']['std']) + patient_info['weight']['mean']

df = df[df.age <= 192]
org_test = org_test[org_test.age <= 192]
df = labeling(df)
from sklearn.ensemble import RandomForestRegressor from sklearn.ensemble import GradientBoostingRegressor import lightgbm as lgb import xgboost as xgb from sklearn.model_selection import cross_val_score from sklearn.metrics import mean_squared_error, make_scorer #%% scores = make_scorer(mean_squared_error) models = [ RandomForestRegressor(n_estimators=200, max_depth=3, verbose=2, random_state=42), GradientBoostingRegressor(random_state=42), lgb.LGBMRegressor(random_state=42), xgb.XGBRFRegressor(random_state=42) ] model_mean = [] model_std = [] #%% for i in models: cross_score = cross_val_score(i, X_clean, y, scoring=scores, n_jobs=-1, cv=4) cross_score = np.sqrt(cross_score) model_mean.append(np.mean(cross_score)) model_std.append(np.std(cross_score)) model_results = pd.DataFrame({
from sklearn import tree
from sklearn import ensemble
from sklearn import linear_model
import xgboost as xgb

models = {
    # 'gini' and 'entropy' are classification criteria and would make
    # DecisionTreeRegressor raise; regression criteria are used instead
    # (the original key names are kept so existing lookups still work).
    "decision_tree_gini": tree.DecisionTreeRegressor(criterion='squared_error'),
    "decision_tree_entropy": tree.DecisionTreeRegressor(criterion='friedman_mse'),
    "rf": ensemble.RandomForestRegressor(),
    "Linres": linear_model.LinearRegression(),
    "xgb_rf_reg": xgb.XGBRFRegressor(),
    "xgb_reg": xgb.XGBRegressor()
}
def make_parameter_graph(self, parameter, test_range): ''' This method uses optuna to make a parameter graph, which keeps all other paramters constant as the paramter chosen varies. The param grid used is the best_param grid from the tunning process. parameter : this param is the paramter you would like a graph of the graph is created using matplotlib and with the parameter chosen on the x axis and the r^2 on the y axis test_range: This param is a list of all of the values you would like tested for your graph ex. list(range(0,10)) or [1,2,3,4,5,6,7,8,9] or [3,6,9] ''' x_train = self.x_train x_test = self.x_test y_train = self.y_train y_test = self.y_test forrest = self.forrest params = self.params best_param = self.best_grid features = self.features targets = self.targets split = self.split nums = [] x = [] for num in test_range: best_param[parameter] = num try: x_train, x_test, y_train, y_test = train_test_split (features, targets, test_size=split, random_state = 24) if forrest == False: xgb_r = xg.XGBRegressor(**best_param) else: xgb_r = xg.XGBRFRegressor(**best_param) # Fitting the model xgb_r.fit(x_train, y_train) # Predict the model pred = xgb_r.predict(x_test) r2 = metrics.r2_score(y_test.values, pred) x.append(num) nums.append(r2) #print(num) except Exception as e: print(e) continue plt.plot(x,nums) plt.xlabel(parameter) plt.ylabel('R^2') plt.show()
model.fit(train_x_pca, train_y)
light_pre = model.predict(test_x_pca)
mae(light_pre, test_y)  # 1.5838890402706525  # worse than the previous MAE

model.feature_importances_

#####
import xgboost as xgb

lgb.LGBMRegressor(params)  # check the parameter values

# NOTE: most of these are LightGBM-style keys; XGBRFRegressor will warn about
# parameters it does not recognise. Typos fixed: 'learning-rate' -> 'learning_rate',
# 'nem_leaves' -> 'num_leaves', 'feature_fracton' -> 'feature_fraction',
# 'bagginf_*' -> 'bagging_*'; 'reg:linear' is a deprecated alias of 'reg:squarederror'.
params1 = {'learning_rate': 0.01,
           'max_depth': 20,
           'boosting_type': 'gbdt',
           'objective': 'reg:squarederror',
           'metric': 'mae',
           'is_training_metric': True,
           'num_leaves': 144,
           'feature_fraction': 0.9,
           'bagging_fraction': 0.7,
           'bagging_freq': 5,
           'seed': 2020}

model = MultiOutputRegressor(xgb.XGBRFRegressor(**params1, random_state=0), n_jobs=-1)
model.fit(train_x, train_y)
preds = model.predict(test_x)
mae(preds, test_y)  # 1.211640226204764
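# For reference: MultiOutputRegressor fits one clone of the base estimator per
# target column. A self-contained toy sketch (synthetic data and illustrative
# parameters only, not taken from the snippet above):
import numpy as np
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb

rng_demo = np.random.RandomState(0)
X_demo = rng_demo.rand(100, 8)
Y_demo = rng_demo.rand(100, 3)  # three target columns

multi = MultiOutputRegressor(xgb.XGBRFRegressor(n_estimators=20, max_depth=4), n_jobs=-1)
multi.fit(X_demo, Y_demo)
print(len(multi.estimators_))   # one fitted XGBRFRegressor per target -> 3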
def test_xgb_base_module(root_client: sy.VirtualMachineClient) -> None: sy.load("xgboost") sy.load("numpy") # third party import numpy as np import xgboost as xgb xgb_remote = root_client.xgboost # import xgboost as xgb X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) y = np.array([0, 0, 1, 1]) param = {"eta": 0.3, "max_depth": 3, "num_class": 3} steps = 20 D_train = xgb.DMatrix(X, label=y) model = xgb.train(param, D_train, steps) preds = model.predict(D_train) D_train = xgb_remote.DMatrix(X, label=y) model = xgb_remote.train(param, D_train, steps) preds_remote = model.predict(D_train).get() classifier = xgb_remote.XGBClassifier( n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, use_label_encoder=False ) classifier.fit(X, y) y_pred_classifier_remote = classifier.predict(X).get() classifier = xgb.XGBClassifier( n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, use_label_encoder=False ) classifier.fit(X, y) y_pred_classifier = classifier.predict(X) classifier = xgb_remote.XGBRFClassifier( n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, use_label_encoder=False ) classifier.fit(X, y) y_pred_classifier_rf_remote = classifier.predict(X).get() classifier = xgb.XGBRFClassifier( n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, use_label_encoder=False ) classifier.fit(X, y) y_pred_classifier_rf = classifier.predict(X) regressor = xgb.XGBRegressor(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3) regressor.fit(X, y) y_pred_regressor = regressor.predict(X) regressor = xgb_remote.XGBRegressor( n_estimators=100, reg_lambda=1, gamma=0, max_depth=3 ) regressor.fit(X, y) y_pred_regressor_remote = regressor.predict(X).get() regressor = xgb.XGBRFRegressor(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3) regressor.fit(X, y) y_pred_regressor_rf = regressor.predict(X) regressor = xgb_remote.XGBRFRegressor( n_estimators=100, reg_lambda=1, gamma=0, max_depth=3 ) regressor.fit(X, y) y_pred_regressor_rf_remote = regressor.predict(X).get() assert np.array_equal(y_pred_classifier_rf, y_pred_classifier_rf_remote) assert np.array_equal(y_pred_regressor_rf, y_pred_regressor_rf_remote) assert np.array_equal(y_pred_regressor, y_pred_regressor_remote) assert np.array_equal(y_pred_classifier, y_pred_classifier_remote) assert np.array_equal(preds_remote, preds)
def sklearn_tuner(self,lower_bound = .90, upper_bound = 1.10): warnings.filterwarnings('ignore') epoch = self.epoch n_trials = self.n_trials params = self.params x_train = self.x_train x_test = self.x_test y_train = self.y_train y_test = self.y_test forrest = self.forrest ''' Method Summary. This method uses sklearns random search grid recurisvly to tune a XG Boost Regression model's hyper parameters. Sklearns random search tends to crash if epoch and n_trials are too high. epoch: This parameter moderates how many times the tunnner will cycle through a random search. Keep in mind that every epoch the parameters used are narrowed down by the random search run during the last epoch n_trials: this parameter dictates how many trials will be run each time a random search is called. Cross validataion is set to 5 so if n_trial was set to 20 each random search will actualy run through 100 trials params: this parameter is the hyperparamter grid you want to initialy feed into the tunner. An example parameter grid for the xgboost regression looks like this. params = { 'objective' : ['reg:gamma'], 'n_estimators' : range(50, 130,10), #500 'max_depth' : range(2,25), 'tree_method' : ['auto', 'exact','approx', 'hist'], 'booster' : ['gbtree', 'gblinear', 'dart'], 'sampling_method' : ['gradient_based'], 'reg_alpha' : [.05,.1,.15,.20,.25,.30], 'reg_lambda' : [0,.2,.4,.6,.8,1], 'learning_rate' : [.05,.08,.1,.15,.20], 'gamma' : [ 0.0, 0.1, 0.2], 'min_child_weight' : [ 1, 3, 5, 7], 'colsample_bytree' : [0,.2,0.3, 0.4,.6,.8,1], #'colsample_bylevel':[0,.2,0.3, 0.4,.6,.8,1], #'colsample_bynode': [0,.2,0.3, 0.4,.6,.8,1], 'importance_type' : ['gain', 'weight', 'cover', 'total_gain','total_cover']} features : model featres (x) targets : model target (y) forrest: Boolean. If you would like the model you are training to be a XGBoost Regression Tree then keep forrest = False (default) If you want to train and tune a forrest change forrest to True. Default False upper_bound: Every epoch the tunner takes the last epoch's best params and creates a new param grid with the chosen param*lower_bound as the lowest number in the new grid, and chosen param*upper_bound as the highest number in the new param_gird. Default is 1.10 lower_bound: Every epoch the tunner takes the last epoch's best params and creates a new param grid with the chosen param*lower_bound as the lowest number in the new grid, and chosen param*upper_bound as the highest number in the new param_gird. Default is .90 split: train/test split. .20 is the same as an 80/20 split. 
Default Use this syntax to access the pandas data frame after tunning the model with either sklearn or optuna XGParis(**kwargs).best_params ''' def float_range(start, stop, step): while start < stop: yield float(start) start += decimal.Decimal(step) top_params = [] if forrest == False: xgb_r = xg.XGBRegressor(seed = 123) else: xgb_r = xg.XGBRFRegressor(seed = 123) xgb_random = RandomizedSearchCV(estimator =xgb_r, param_distributions = params, n_iter = n_trials, cv = 5, verbose=2, random_state=42, n_jobs = -1) xgb_random.fit(x_train, y_train) param_random = (xgb_random.best_params_) top_params.append([(xgb_random.best_score_),(xgb_random.best_params_)]) #print(param_random ) counter = 2 def repeater(epoch, counter, param_dic): new_params = {} for item in (param_dic): #print(item) if type(param_dic[str(item)]) == int: new_params[str(item)] = list(range(round(param_dic[str(item)]*lower_bound), round(param_dic[str(item)]*upper_bound))) if new_params[str(item)] == []: new_params[str(item)] = [param_dic[str(item)]] elif type(param_dic[str(item)]) == float: if item == 'colsample_bytree' or item == 'colsample_bylevel' or item == 'colsample_bynode' : if param_dic[str(item)]*1.10 >= 1: new_params[str(item)] = (list(float_range(decimal.Decimal(param_dic[str(item)]*(upper_bound-lower_bound)), 1, '0.01'))) else: new_params[str(item)] = (list(float_range(decimal.Decimal(param_dic[str(item)]*lower_bound), decimal.Decimal(param_dic[str(item)]*upper_bound), '0.01'))) elif param_dic[str(item)] == 0.0: new_params[str(item)] = (list(float_range(0, decimal.Decimal(param_dic[str(item)]*(upper_bound-lower_bound)), '0.01'))) if new_params[str(item)] == []: new_params[str(item)] = [param_dic[str(item)]] elif item == 'learning_rate': new_params[str(item)] = (list(float_range(decimal.Decimal(param_dic[str(item)]*lower_bound), decimal.Decimal(param_dic[str(item)]*upper_bound), '0.001'))) else: new_params[str(item)] = (list(float_range(decimal.Decimal(param_dic[str(item)]*lower_bound), decimal.Decimal(param_dic[str(item)]*upper_bound), '0.01'))) elif type(param_dic[str(item)]) == str: new_params[str(item)] = [(param_dic[str(item)])] else: #print('error, skipped ' + str(item)) continue #print(new_params) if forrest == False: xgb_r = xg.XGBRegressor(seed = 123) else: xgb_r = xg.XGBRFRegressor(seed = 123) xgb_random = RandomizedSearchCV(estimator =xgb_r, param_distributions = new_params, n_iter = n_trials, cv = 5, verbose=2, random_state=42, n_jobs = -1) xgb_random.fit(x_train, y_train) if counter >= epoch: counter += 1 #print('epoch - ' + str(counter) + ' Done') return xgb_random.best_params_ else: counter += 1 #print('epoch - ' + str(counter)) top_params.append([(xgb_random.best_score_),(xgb_random.best_params_)]) #print(xgb_random.best_score_) return repeater(epoch,counter,(xgb_random.best_params_)) final_param = repeater(epoch,counter,param_random) scores = [] params_list = [] for item in top_params: scores.append(item[0]) params_list.append(item[1]) top = pd.DataFrame(columns= ['scores','params']) top['scores'] = scores top['params'] = params_list if forrest == False: xgb_r = xg.XGBRegressor(seed = 123) else: xgb_r = xg.XGBRFRegressor(seed = 123) param_dic = top['params'][list(top['scores']).index(max(list(top['scores'])))] new_params = {} for item in param_dic: if type(param_dic[str(item)]) == str: new_params[str(item)] = params[str(item)] elif type(param_dic[str(item)]) == int: new_params[str(item)] = [(param_dic[str(item)])] elif type(param_dic[str(item)]) == float: new_params[str(item)] = 
[(param_dic[str(item)])]

# GridSearchCV takes param_grid and has no n_iter/random_state
# (those belong to RandomizedSearchCV)
xgb_grid = GridSearchCV(estimator=xgb_r, param_grid=new_params,
                        cv=5, verbose=2, n_jobs=-1)
xgb_grid.fit(x_train, y_train)
top_params.append([(xgb_grid.best_score_), (xgb_grid.best_params_)])

self.best_grid = top['params'][list(top['scores']).index(max(list(top['scores'])))]
self.best_params = top
self.optuna = False
feature_scaler = StandardScaler() feature_df[feat_column_name_list] = feature_scaler.fit_transform( feature_df[feat_column_name_list]) test_x[test_column_name_list] = feature_scaler.transform( test_x[test_column_name_list]) # Initialize models clf_line = LinearRegression() clf_ridg = Ridge(alpha=300, tol=1e-05, solver='sparse_cg', max_iter=5000) clf_laso = Lasso(alpha=0.1, tol=1e-05, max_iter=5000) clf_lala = LassoLars(alpha=0.001, max_iter=5000) clf_enet = ElasticNet(alpha=0.1, tol=0.001, l1_ratio=0.2, max_iter=5000) clf_xgbr = xgb.XGBRegressor() # not yet clf_xgrf = xgb.XGBRFRegressor() # not yet clf_rf = RandomForestRegressor(criterion='mae', max_features='sqrt', n_estimators=200, max_depth=10) clf_tree = ExtraTreesRegressor(criterion='mae', max_features='sqrt', n_estimators=200, max_depth=10) clf_ada = AdaBoostRegressor(n_estimators=3, loss='linear') clf_grad = GradientBoostingRegressor() # not yet clf_svr = SVR(kernel='rbf', C=0.1) base_model_name = [ 'LinearReg', 'Ridge', 'Lasso', 'LassoLars', 'ElasticNet', 'XgbReg',
# Random Forest
print("Random Forest")
print(run_model(
    RandomForestRegressor(max_depth=5, n_estimators=50, random_state=0),
    cat_feats))

# XGBoost
xgb_params = {
    'max_depth': 5,
    'n_estimators': 50,
    # 'learning_rate': 0.1,
    'seed': 0
}
model = xgb.XGBRFRegressor(**xgb_params)
print("XGBoost")
print(run_model(model, cat_feats))

# most influential features
X = df[cat_feats].values
Y = df['price_value'].values
m = xgb.XGBRFRegressor(max_depth=5, n_estimators=50, learning_rate=0.1, seed=0)
m.fit(X, Y)
imp = PermutationImportance(m, random_state=0).fit(X, Y)
print(eli5.show_weights(imp, feature_names=cat_feats).data)

# the most influential features found above:
feats = [
    'param_napęd__cat',
    'param_skrzynia-biegów__cat',
    'param_faktura-vat__cat',