Example #1
    def _build_model(self):
        """
        
        Build the crucial components for model training 
 
        
        """

        # NOTE: sklearn RandomForest-style parameters; xgboost's sklearn
        # wrapper forwards unrecognized keyword args to the underlying booster.
        _config = {
            'n_estimators': self.n_estimators,
            'max_leaf_nodes': self.max_leaf_nodes,
            'min_impurity_split': self.min_impurity_split,
            'n_jobs': self.n_jobs,
            'random_state': self.random_state,
            'max_samples': self.max_samples
        }
        if self.task_type == 'binaryclass':
            self.predictor = XGBClassifier(**_config,
                                           objective='binary:logistic',
                                           eval_metric="logloss")
        elif self.task_type == 'multiclass':
            self.predictor = XGBClassifier(**_config)
        elif self.task_type == 'multilabel':
            xgb_estimator = XGBClassifier(**_config,
                                          objective='binary:logistic',
                                          eval_metric="logloss")
            self.predictor = MultiOutputClassifier(xgb_estimator)
        elif self.task_type == 'regression':
            self.predictor = XGBRFRegressor(**_config)
        self._save_config(_config, 'predictor')
        _config = {'tasktype': self.task_type}
        self._save_config(_config, 'tasktype')
Example #2
def test_xg_XGBRFRegressor():
    print("Testing xgboost, XGBRFRegressor...")
    mod = XGBRFRegressor()
    X, y = iris_data
    mod.fit(X, y)
    docs = {'name': "XGBRFRegressor test"}
    fv = X[0, :]
    upload(mod, fv, docs)
Example #3
from xgboost import XGBRFRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np


def fit_model(X, y):
    model = XGBRFRegressor(n_estimators=1000, max_depth=7, random_state=42)
    model.fit(X, y)
    y_pred = model.predict(X)
    # in-sample errors, computed on the training data itself
    err_mae = mean_absolute_error(y, y_pred)
    err_rmse = np.sqrt(mean_squared_error(y, y_pred))
    return model, y_pred, err_mae, err_rmse
Example #4
	def __init__(self,src_file_index,bounds):
		self.model = XGBRFRegressor()
		self.model_name = "XGBRFRegressor"
		self.src = util.get_src_file(src_file_index=src_file_index)
		self.lower_bounds = bounds["lower_bounds"]
		self.upper_bounds = bounds["upper_bounds"]
		self.with_rain = False
		self.optimization_methods = optimization_methods
		self.num_iterations = 200
		self.results = {}
		self.result_save_path = 'optimization_result/with_rain_'+str(self.with_rain)+'/'+self.src.split('.')[0].split('/')[-1]+'/'
		self.optimization()
		self.save_optimization_result()
Example #5
class XGBRFRegressorOptimizer(BaseOptimizer):
	def __init__(self,src_file_index,bounds):
		self.model = XGBRFRegressor()
		self.model_name = "XGBRFRegressor"
		self.src = util.get_src_file(src_file_index=src_file_index)
		self.lower_bounds = bounds["lower_bounds"]
		self.upper_bounds = bounds["upper_bounds"]
		self.with_rain = False
		self.optimization_methods = optimization_methods
		self.num_iterations = 200
		self.results = {}
		self.result_save_path = 'optimization_result/with_rain_'+str(self.with_rain)+'/'+self.src.split('.')[0].split('/')[-1]+'/'
		self.optimization()
		self.save_optimization_result()

	def objective_function(self,x):
		print("XGBRegressor优化中...")
		train_x, test_x, train_y, test_y = util.get_train_test_split(self.src,int(np.round(x[0])),int(np.round(x[1])),with_rain=self.with_rain)
		print(self.model_name)
		self.tune_params = ['offset','period','max_depth',
							# 'learning_rate',
		 					'n_estimators',
							'gamma',
							'min_child_weight','max_delta_step','subsample',
							'colsample_bytree','colsample_bylevel','colsample_bynode','reg_alpha',
							'reg_lambda','scale_pos_weight','base_score'
							]
		self.model.max_depth = int(x[2])
		self.model.n_estimators = int(x[3])
		self.model.gamma = x[4]
		self.model.min_child_weight = int(x[5])
		self.model.max_delta_step = int(x[6])
		self.model.subsample = x[7]
		self.model.colsample_bytree = x[8]
		self.model.colsample_bylevel = x[9]
		self.model.colsample_bynode = x[10]
		self.model.reg_alpha = x[11]
		self.model.reg_lambda = x[12]
		self.model.scale_pos_weight = x[13]
		self.model.base_score = x[14]
		self.model.objective = 'reg:squarederror'
		self.model.learning_rate = 0.001
		self.model.fit(X=train_x,y=train_y)
		y_hat = self.model.predict(test_x)
		mse = mean_squared_error(test_y, y_hat)
		return mse
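
For context, BaseOptimizer.optimization() and save_optimization_result() are not shown in this example. As a rough sketch of how an objective like the one above could be driven within the stated bounds (scipy's differential evolution here is an assumption, not the repository's actual method):

# hypothetical driver sketch; the real BaseOptimizer.optimization() is not shown
import numpy as np
from scipy.optimize import differential_evolution

def run_optimization(objective, lower_bounds, upper_bounds, num_iterations=200):
    bounds = list(zip(lower_bounds, upper_bounds))  # one (low, high) pair per tuned param
    result = differential_evolution(objective, bounds, maxiter=num_iterations, seed=42)
    return result.x, result.fun  # best parameter vector and its MSE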
Example #6
    def _build_model(self):
        """
        
        Build the crucial components for model training 
 
        
        """

        # NOTE: sklearn RandomForest-style parameters; xgboost's sklearn
        # wrapper forwards unrecognized keyword args to the underlying booster.
        _config = {
            'n_estimators': self.n_estimators,
            'criterion': self.criterion,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'min_samples_leaf': self.min_samples_leaf,
            'min_weight_fraction_leaf': self.min_weight_fraction_leaf,
            'max_features': self.max_features,
            'max_leaf_nodes': self.max_leaf_nodes,
            'min_impurity_split': self.min_impurity_split,
            'bootstrap': self.bootstrap,
            'oob_score': self.oob_score,
            'n_jobs': self.n_jobs,
            'random_state': self.random_state,
            'verbose': self.verbose,
            'warm_start': self.warm_start,
            'ccp_alpha': self.ccp_alpha,
            'max_samples': self.max_samples
        }
        if self.task_type == 'binaryclass':
            self.predictor = XGBClassifier(**_config,
                                           objective='binary:logistic')
        elif self.task_type == 'multiclass':
            self.predictor = XGBClassifier(**_config)
        elif self.task_type == 'multilabel':
            xgb_estimator = XGBClassifier(**_config,
                                          objective='binary:logistic')
            self.predictor = MultiOutputClassifier(xgb_estimator)
        elif self.task_type == 'regression':
            self.predictor = XGBRFRegressor(**_config)
        self._save_config(_config, 'predictor')
        _config = {'tasktype': self.task_type}
        self._save_config(_config, 'tasktype')
Example #7
def xgrfboost(train, target, n_estimators=100, max_depth=8, random_state=17,
              learning_rate=0.1, colsample_bytree=0.9, colsample_bynode=0.9,
              colsample_bylevel=0.9, importance_type='gain', reg_alpha=2,
              reg_lambda=2):
    '''XGRFBoost Regressor
       Params :-
       train - training set to fit on
       target - target values to predict
       n_estimators - number of trees (default 100)
       max_depth - maximum depth a tree can grow to (default 8)
       random_state - an arbitrary seed, so the same results come back when run
                      on a different machine with the same params (default 17)
       learning_rate - step size taken toward the minimum of the loss
       colsample_bytree, colsample_bynode, colsample_bylevel - fraction of the
                      features to use per tree, per node, and per level
       importance_type - how feature importance is computed (default 'gain';
                      note 'split' is a LightGBM name, not a valid xgboost one)
       reg_alpha, reg_lambda - L1 and L2 regularisation respectively'''

    from xgboost import XGBRFRegressor
    model = XGBRFRegressor(n_estimators=n_estimators, max_depth=max_depth,
                           random_state=random_state, learning_rate=learning_rate,
                           colsample_bytree=colsample_bytree,
                           colsample_bynode=colsample_bynode,
                           colsample_bylevel=colsample_bylevel,
                           importance_type=importance_type,
                           reg_alpha=reg_alpha, reg_lambda=reg_lambda)
    model.fit(train, target)

    return model
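
A quick usage sketch for the helper above, on synthetic placeholder data:

# usage sketch -- the arrays here are random placeholders
import numpy as np

train = np.random.rand(200, 5)
target = np.random.rand(200)
model = xgrfboost(train, target, n_estimators=200, max_depth=6)
print(model.predict(train[:3]))  # predictions for the first three rows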
Example #8
    def _set_surrogate(self, X, y=None):

        if not hasattr(self, "_surrogate"):
            target = type_of_target(y)
            if target == "continuous":
                self._surrogate = XGBRFRegressor(max_depth=7, n_estimators=150)
            elif target in ["binary", "multiclass"]:
                self._surrogate = XGBRFClassifier(max_depth=7,
                                                  n_estimators=150)
            else:
                raise ValueError(
                    "Multioutput and multilabel datasets are not supported.")
Example #9
 def train(self):
     self.config.logger.info("XGBoostOptimiser::train")
     model = XGBRFRegressor(verbosity=1, **(self.config.params))
     start = timer()
     inputs, exp_outputs = self.get_data_("train")
     end = timer()
     log_time(start, end, "for loading training data")
     log_memory_usage(
         ((inputs, "Input train data"), (exp_outputs, "Output train data")))
     log_total_memory_usage("Memory usage after loading data")
     if self.config.plot_train:
         inputs_val, outputs_val = self.get_data_("validation")
         log_memory_usage(((inputs_val, "Input val data"),
                           (outputs_val, "Output val data")))
         log_total_memory_usage("Memory usage after loading val data")
         self.plot_train_(model, inputs, exp_outputs, inputs_val,
                          outputs_val)
     start = timer()
     model.fit(inputs, exp_outputs)
     end = timer()
     log_time(start, end, "actual train")
     self.save_model(model)
Example #10
    def train(self):
        """
        Train the optimizer.
        """
        self.config.logger.info("XGBoostOptimiser::train")
        if self.config.dim_output > 1:
            logger = get_logger()
            logger.fatal(
                "YOU CAN PREDICT ONLY 1 DISTORTION. dim_output is bigger than 1."
            )

        model = XGBRFRegressor(verbosity=1, **(self.config.params))
        start = timer()
        inputs, exp_outputs, *_ = self.__get_data("train")
        end = timer()
        log_time(start, end, "for loading training data")
        log_memory_usage(
            ((inputs, "Input train data"), (exp_outputs, "Output train data")))
        log_total_memory_usage("Memory usage after loading data")
        if self.config.plot_train:
            inputs_val, outputs_val, *_ = self.__get_data("validation")
            log_memory_usage(((inputs_val, "Input validation data"),
                              (outputs_val, "Output validation data")))
            log_total_memory_usage(
                "Memory usage after loading validation data")
            self.__plot_train(model, inputs, exp_outputs, inputs_val,
                              outputs_val)
        start = timer()
        model.fit(inputs, exp_outputs)
        end = timer()
        log_time(start, end, "actual train")
        model.get_booster().feature_names = get_input_names_oned_idc(
            self.config.opt_usederivative,
            self.config.num_fourier_coeffs_train)
        self.__plot_feature_importance(model)
        self.save_model(model)
Example #11
def ml_train(df_train, target):

    X_train, y_train = get_x_y(df_train, target)
    # ML train

    X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(
        X_train, y_train, test_size=0.20, random_state=7)

    if target == 'target':
        #        classification
        model = XGBClassifier()
        model.fit(X_train_train,
                  y_train_train,
                  eval_metric='mlogloss',
                  eval_set=[(X_train_test, y_train_test)],
                  early_stopping_rounds=25,
                  verbose=False)
    else:
        #        regression
        model = XGBRFRegressor()
        model.fit(X_train_train,
                  y_train_train,
                  eval_metric='rmse',
                  eval_set=[(X_train_test, y_train_test)],
                  early_stopping_rounds=25,
                  verbose=False)

    print('Training Set: {} to {}'.format(df_train['date'].min(),
                                          df_train['date'].max()))

    # ML score
    y_pred = model.predict(X_train)

    if target == 'target':
        accuracy = accuracy_score(y_train, y_pred)
        print("In-Sample Accuracy: %.2f%%" % (accuracy * 100.0))
    else:
        mse = mean_squared_error(y_train, y_pred)
        print("In-Sample RMSE: %.2f%%" % (sqrt(mse) * 100))

    return model, X_train
Example #12
    def __create_pipeline(self):
        if self.mode == "bypass_knnsr":
            pipeline = [("regression",
                         KNNSRBypassRegression(column=KNNSR_BYPASS))]
            self.pipeline_params = {}
            self.n_jobs = 1
            self.training_cv_folds = 2

        if self.mode == "xgb":
            pipeline = [
                ("variance_treshold", VarianceThreshold()),
                ("scale", StandardScaler()),
                ("regression", XGBRFRegressor()),
            ]
            self.pipeline_params = {
                "regression__n_estimators": [100, 200, 400, 800],
                "regression__max_depth": [1, 3, 5, 7, 11],
                "regression__subsample": [0.5, 1],
                "regression__colsample_bylevel": [0.8, 1],
                "regression__random_state": [0],
                "regression__eval_metric": ["mae"],
                "regression__reg_lambda": [0, 1],
                "regression__reg_alpha": [0, 1],
                "regression__objective": ["reg:squarederror"],
            }
        if self.mode == "linear":
            pipeline = [
                (
                    "filter",
                    FilterColumns(
                        columns=[BASELINE, AugmentedTADPOLEData.FORECAST_DIST
                                 ]),
                ),
                ("scale", StandardScaler()),
                ("polynomial_features", PolynomialFeatures()),
                ("regression", LinearRegression()),
            ]
            self.pipeline_params = {}
            self.n_jobs = 1

        self.pipeline = Pipeline(pipeline)
Example #13
# imports implied by the snippet
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from xgboost import XGBRFRegressor
import numpy as np

# 1. Data
datasets = load_boston()
x = datasets.data
y = datasets.target
print("init x.shape:", x.shape)

# 1.1 Data preprocessing (train_test_split)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=44,
                                                    shuffle=True,
                                                    test_size=0.2)

# 2. Model (XGBRFRegressor)
model = XGBRFRegressor(max_depth=4)
model.fit(x_train, y_train)

# 4. Evaluation (score() returns R^2 for a regressor)
acc = model.score(x_test, y_test)
print("acc:", acc)
print(model.feature_importances_)


# Helper that trims away features with low feature importance
def eraseLowFI_index(fi_arr, low_value, input_arr):
    input_arr = input_arr.T
    temp = []
    for i in range(fi_arr.shape[0]):
        if fi_arr[i] >= low_value:          # keep columns at or above the cutoff
            temp.append(input_arr[i, :])
    return np.array(temp).T                 # assumed return: (n_samples, n_kept)
Example #14
# imports implied by the snippet
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from xgboost import XGBRFRegressor

#1. Data
dataset = load_boston()
x = dataset.data
y = dataset.target

df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=77)

#2. Model
# model = GradientBoostingClassifier(max_depth=4)
model = XGBRFRegressor(n_jobs=-1)
#3. Training
model.fit(x_train, y_train)

#4. Evaluation, prediction
acc = model.score(x_test, y_test)

print(model.feature_importances_)  # e.g. [0.0244404  0.01669101 0.00766884 0.95119975]; sums to 1
print('acc : ', acc)

fi = model.feature_importances_

new_data = []
feature = []
Example #15
parameters = {
    'n_estimators': [1, 50, 100],
    "max_depth": [2, 6, 8],
    'min_child_weight': [1, 0.1, 0.3],
    'eta': [0, 2, 10],
    'gamma': [0, 1, 2],
    'max_delta_step': [0, 1],
    'subsample': [0.5, 0.6],
    'colsample_bytree': [1, 0.5],
    'colsample_bylevel': [0, 1],
    'lambda': [1, 0.5, 1.5],
    'alpha': [0, 1],
    'scale_pos_weight': [1, 2]
}

model = RandomizedSearchCV(XGBRFRegressor(n_estimators=1000),
                           parameters,
                           cv=5,
                           n_jobs=-1)
model = MultiOutputRegressor(model)
warnings.filterwarnings('ignore')
model.fit(x_train, y_train)

score = model.score(x_test, y_test)
print(score)
y4 = model.predict(test.values)

# The helper and the for loop exist because GB and XGB only accept a scalar
# (single-column) target, so to run those two models the 4-column target set
# is cut into 4 single-column problems, which is what the loop does (see the
# sketch below). Random forest and decision tree run fine either way.
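
A minimal sketch of the per-column pattern that comment describes, assuming x_train, y_train (with 4 target columns), x_test, and XGBRFRegressor from the snippet above:

# per-column training sketch: one single-output regressor per target column
import numpy as np

per_column_preds = []
for col in range(y_train.shape[1]):
    single = XGBRFRegressor(n_estimators=100)
    single.fit(x_train, y_train[:, col])            # scalar (single-column) target
    per_column_preds.append(single.predict(x_test))
per_column_preds = np.array(per_column_preds).T     # back to (n_samples, 4)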
Example #16
}, {
    "anyway__n_estimators": [100, 200, 300],
    "anyway__learning_rate": [0.1, 0.09, 1],
    "anyway__colsample_bylevel": [0.6, 0.7, 0.8]
}]

# 1.1 Data preprocessing (train_test_split)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=44,
                                                    shuffle=True,
                                                    test_size=0.2)

kfold = KFold(n_splits=5, shuffle=True)

pipe = Pipeline([('scaler', StandardScaler()), ('anyway', XGBRFRegressor())])

# model = XGBRFRegressor(max_depth=max_depth, learning_rate=learning_rate,
#                         n_estimators=n_estimators, n_jobs=n_jobs,
#                         colsample_bylevel = colsample_bylevel,
#                         colsample_bytree=colsample_bytree )
# model = RandomizedSearchCV(XGBRFRegressor(),
#                     parameters,
#                     cv=kfold,
#                     verbose=2) # kfold 5 folds x 20 candidates = 100 fits total

model = RandomizedSearchCV(pipe, parameters, cv=5, verbose=2)
model.fit(x_train, y_train)

# 4. Evaluation, prediction
acc = model.score(x_test, y_test)
Example #17
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder
        
df_num = df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df_num[column] = encoder.transform(df[column])

# set up features and target
train_num = df_num.sample(frac=1, random_state=0)
train_features = train_num.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
train_target = np.log1p(train_num['AMT'])

# training
model = XGBRFRegressor(n_jobs=-1)
model.fit(train_features, train_target)

# build a prediction template
CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df_num['STD_CLSS_NM'].unique()
HOM_SIDO_NMs  = df_num['HOM_SIDO_NM'].unique()
AGEs          = df_num['AGE'].unique()
SEX_CTGO_CDs  = df_num['SEX_CTGO_CD'].unique()
FLCs          = df_num['FLC'].unique()
years         = [2020]
months        = [4, 7]

temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
Example #18
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRFRegressor
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_diabetes

x, y = load_diabetes(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=42,
                                                    shuffle=True,
                                                    train_size=0.8)

model1 = XGBRFRegressor()
model1.fit(x_train, y_train)

default_score = model1.score(x_test, y_test)

model = XGBRFRegressor()
model.fit(x_train, y_train)
print(model.feature_importances_)

index7 = np.sort(model.feature_importances_)[::-1][int(
    0.7 * len(model.feature_importances_))]

delete_list = []
for i in model.feature_importances_:
    if i < index7:
        print(i, "제거 ")
Example #19
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from xgboost import XGBRFRegressor
import numpy as np
import pandas as pd

x, y = load_boston(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    shuffle=True,
                                                    train_size=0.8,
                                                    random_state=66)

model = XGBRFRegressor(n_jobs=-1)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print('R2', score)

thresholds = np.sort(model.feature_importances_)  # sort features by importance
print(thresholds)

for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh,
                                prefit=True)  # drops one more feature each pass

    select_x_train = selection.transform(x_train)  # train set with reduced features

    selection_model = XGBRFRegressor(n_jobs=-1)  # build a model on the subset
    selection_model.fit(select_x_train, y_train)  # and fit it
Example #20
coef6 = pd.Series(alg6.feature_importances_, predictors).sort_values(ascending=False)
coef6.plot(kind='bar', title='Feature Importances')


# In[157]:


from xgboost import XGBRFRegressor
predictors = [x for x in train.columns if x not in [target]+IDcol]
alg7 = XGBRFRegressor()
modelfit(alg7, train, test, predictors, target, IDcol, 'alg7.csv')


Example #21
from xgboost import XGBRFRegressor
# imports implied by the snippet
import pandas as pd
from numpy import mean, std
from sklearn.model_selection import RepeatedKFold, cross_val_score

#from sklearn.linear_model import LinearRegression

df = pd.read_csv("wheel_prediction_data.csv",
                 encoding='ISO 8859-1',
                 sep=";",
                 decimal=",")
df.head()

# evaluate xgboost random forest ensemble for regression

y = df[['km_till_OMS']].values
X = df[["LeftWheelDiameter", "Littera", "VehicleOperatorname"]]
# define the model
model = XGBRFRegressor(n_estimators=100, subsample=0.9, colsample_bynode=0.2)
# define the model evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(model,
                           X,
                           y,
                           scoring='neg_mean_absolute_error',
                           cv=cv,
                           n_jobs=-1)
# report performance (note: scores are negated MAE under this scoring)
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# def model(df):
#
#     # With Statsmodels, we need to add our intercept term, B0, manually
Example #22
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from xgboost import XGBRFRegressor


def train_and_test(model):
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    report = metrics.mean_squared_error(y_test, y_hat)
    print(report)

    return y_hat


# XGBRF Regression
xgbrf_pred = train_and_test(XGBRFRegressor(n_estimators=400))
# kNN
knn_pred_4 = train_and_test(KNeighborsRegressor(n_neighbors=14))
# Random Forest
rf_pred = train_and_test(
    RandomForestRegressor(n_estimators=400, random_state=14))
# LGBM Regression
lgbm_pred = train_and_test(
    LGBMRegressor(boosting_type='gbdt',
                  random_state=94,
                  colsample_bytree=0.9,
                  max_depth=5,
                  subsample=0.9,
                  n_estimators=40))

#%%
Example #23
parameters = {
    'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95],
    'max_depth': [5, 10, 20, 30, 50, 80, 100],
    'n_jobs': [-1]
}
fit_params = {
    'verbose': True,
    'eval_set': [(x_train, y_train), (x_test, y_test)],
    # 'early_stopping_rounds' : 5
}
kfold = KFold(n_splits=5, shuffle=True, random_state=66)
# 2. Model
y_pred = []
y_test_pred = []
for i in range(4):
    model = RandomizedSearchCV(XGBRFRegressor(), parameters, cv=5, n_iter=50)
    model.fit(x_train, y_train[:, i])

    print("acc : ", model.score(x_test, y_test[:, i]))

    y_test_pred.append(model.predict(x_test))
    y_pred.append(model.predict(x_pred))

y_pred = np.array(y_pred).T
y_test_pred = np.array(y_test_pred).T
print(y_pred.shape)

mspe = kaeri_metric(y_test, y_test_pred)
print('mspe : ', mspe)

submissions = pd.DataFrame({
Example #24
print(x.shape)  # (506, 13)
print(y.shape)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True)

parameters = [{
    'n_estimators': [300, 500, 3300],
    'learning_rate': [0.01, 0.5, 1],
    'colsample_bytree': [0.6, 0.8, 0.9],  # typically 0.6-0.9
    'colsample_bylevel': [0.6, 0.8, 0.9],
    'max_depth': [6, 7, 8]
}]

model = GridSearchCV(XGBRFRegressor(), parameters, cv=5,
                     n_jobs=-1)  # no missing-value preprocessing needed

model.fit(x_train, y_train)

print(model.best_estimator_)
print("==========================================")
print(model.best_params_)
print("==========================================")
score = model.score(x_test, y_test)
print('score: ', score)

# plot_importance(model)
# plt.show()
Example #25
print("x_train 모양 : ", x_train.shape)  #(8000, 71)
print("x_test 모양 : ", x_test.shape)  #(2000, 71)
print("y_train 모양 : ", y_train.shape)  #(8000, 4)
print("y_test 모양 : ", y_test.shape)  #(2000, 4)

# Tree structure
# MultiOutputRegressor(xgb.XGBRFRegressor())
'''
model = MultiOutputRegressor(XGBRegressor())
model.fit(x_train,y_train)
score = model.score(x_test,y_test)
print(score)
y4 = model.predict(test.values)
'''

model = MultiOutputRegressor(XGBRFRegressor())
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print(score)
y4 = model.predict(test.values)

n_features = x.data.shape[1]  #30


def plot_feature_importances_x(model):
    n_features = x.data.shape[1]  #30
    plt.barh(
        np.arange(n_features),
        model.feature_importances_,  # draw horizontal bars
        align='center')
    plt.yticks(np.arange(n_features),
Example #26
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    shuffle=True,
                                                    random_state=66)

# These are about the only knobs that need adjusting
n_estimators = 1000  # The number of trees in the forest.
learning_rate = 1  # learning rate
colsample_bytree = None  # feature sample ratio per tree / 0.6-0.9 in practice, or 1
colsample_bylevel = 0.9  # [default: 1]: subsample and colsample_bytree already control how many variables and observations each tree uses, so it is questionable whether additionally setting colsample_bylevel means much.
max_depth = 29  # [default: 6]: guards against overfitting; an appropriate value should come from CV, usually 3-10.
n_jobs = -1

# Use CV
# XGB is very fast, and missing-value preprocessing is not required

model = XGBRFRegressor(max_depth=max_depth,
                       learning_rate=learning_rate,
                       n_estimators=n_estimators,
                       colsample_bylevel=colsample_bylevel,
                       colsample_bytree=colsample_bytree)

model.fit(x_train, y_train)

score = model.score(x_test, y_test)  # score() is the evaluation step
print('score:', score)

# print(model.feature_importances_)
plot_importance(model)
# plt.show()
Example #27
models = [
    LinearRegression(),
    LassoCV(alphas=np.logspace(-6, 6, 13)),
    ElasticNetCV(alphas=np.logspace(-6, 6, 13)),
    SGDRegressor(),
    PassiveAggressiveRegressor(),
    Ridge(),
    PassiveAggressiveRegressor(),
    RandomForestRegressor(max_depth=5),
    GradientBoostingRegressor(),
    AdaBoostRegressor(loss='exponential'),
    BaggingRegressor(),
    SVR(),
    NuSVR(),
    XGBRFRegressor(max_depth=5, objective="reg:squarederror"),
    XGBRegressor(max_depth=5, objective="reg:squarederror")
]


def show_score(x, y, estimator):
    """
    Returns MAE scores for specified models.
    Also returns r2 scores if applicable

    Arguments:
        x {[array/DataFrame]} -- [Array or matrix of features. Can also be dataframe]
        y {[array]} -- [Target values]
        estimator {[str]} -- [The estimator being used]
    """
    # Instantiate models and predict values
Example #28
class XGBoostText:
    def __init__(self,
                 expmodel_id='test.new',
                 n_estimators=100,
                 use_gpu=False,
                 criterion='gini',
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0,
                 max_features='auto',
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.0,
                 min_impurity_split=None,
                 bootstrap=True,
                 oob_score=False,
                 n_jobs=None,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None,
                 ccp_alpha=0.0,
                 max_samples=None):
        """
        XGboost from public XGBoostText Lib.


        Parameters

        ----------

        """
        check_model_dir(expmodel_id=expmodel_id)
        self.checkout_dir = os.path.join('./experiments_records', expmodel_id,
                                         'checkouts')
        self.result_dir = os.path.join('./experiments_records', expmodel_id,
                                       'results')
        # make saving directory if needed
        if not os.path.isdir(self.checkout_dir):
            os.makedirs(self.checkout_dir)

        if not os.path.isdir(self.result_dir):
            os.makedirs(self.result_dir)

        self.expmodel_id = expmodel_id
        self.n_estimators = n_estimators
        self.use_gpu = use_gpu
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
        self.bootstrap = bootstrap
        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.warm_start = warm_start
        self.class_weight = class_weight
        self.ccp_alpha = ccp_alpha
        self.max_samples = max_samples
        self.task_type = None
        # self._args_check()
        self.device = self._get_device()

    def _data_check(self, datalist):
        """
        
        Target to 1) check train_data/valid_data valid, if not give tips about data problem
                  2) check loss function valid, if not recommend proper loss func
        
        Parameters

        ----------

        datalist = [data1 = {
                      'x':list[episode_file_path], 
                      'y':list[label], 
                      'l':list[seq_len], 
                      'feat_n': n of feature space, 
                      'label_n': n of label space
                    },
                    data2 = {
                      'x':list[episode_file_path], 
                      'y':list[label], 
                      'l':list[seq_len], 
                      'feat_n': n of feature space, 
                      'label_n': n of label space
                    }, ...
                    ]
        Returns

        -------

        self : object


        """

        label_n_check = set([])
        task_type_check = set([])
        for each_data in datalist:
            for each_x_path in each_data['x']:
                if os.path.exists(each_x_path) is False:
                    raise Exception('episode file does not exist')
            label_n_check.add(np.shape(np.array(each_data['y']))[1])
            task_type_check.add(
                label_check(each_data['y'],
                            hat_y=None,
                            assign_task_type=self.task_type))

        if len(task_type_check) != 1:
            raise Exception('task_type is inconsistent across the data')

        pre_task_type = list(task_type_check)[0]
        if self.task_type is None:
            self.task_type = pre_task_type
        elif self.task_type != pre_task_type:
            raise Exception(
                'predefined task-type {0}, but the data supports task-type {1}'.
                format(self.task_type, pre_task_type))
        print('current task can be seen as {0}'.format(self.task_type))

    def _get_device(self):
        if self.use_gpu:
            if torch.cuda.is_available():
                device = torch.device("cuda")
                print('using GPU resource')
            else:
                device = torch.device("cpu")
                print('no usable GPU found, using CPU resource')
        else:
            device = torch.device("cpu")
            print('using CPU resource')
        return device

    def _build_model(self):
        """
        
        Build the crucial components for model training 
 
        
        """

        # NOTE: sklearn RandomForest-style parameters; xgboost's sklearn
        # wrapper forwards unrecognized keyword args to the underlying booster.
        _config = {
            'n_estimators': self.n_estimators,
            'max_leaf_nodes': self.max_leaf_nodes,
            'min_impurity_split': self.min_impurity_split,
            'n_jobs': self.n_jobs,
            'random_state': self.random_state,
            'max_samples': self.max_samples
        }
        if self.task_type == 'binaryclass':
            self.predictor = XGBClassifier(**_config,
                                           objective='binary:logistic',
                                           eval_metric="logloss")
        elif self.task_type == 'multiclass':
            self.predictor = XGBClassifier(**_config)
        elif self.task_type == 'multilabel':
            xgb_estimator = XGBClassifier(**_config,
                                          objective='binary:logistic',
                                          eval_metric="logloss")
            self.predictor = MultiOutputClassifier(xgb_estimator)
        elif self.task_type == 'regression':
            self.predictor = XGBRFRegressor(**_config)
        self._save_config(_config, 'predictor')
        _config = {'tasktype': self.task_type}
        self._save_config(_config, 'tasktype')

    def fit(self, data_dict, X=None, y=None, assign_task_type=None):
        """
        Parameters

        ----------

        train_data : {
                      'x':list[episode_file_path], 
                      'y':list[label], 
                      'l':list[seq_len], 
                      'feat_n': n of feature space, 
                      'label_n': n of label space
                      }

            The input train samples dict.
 
        valid_data : {
                      'x':list[episode_file_path], 
                      'y':list[label], 
                      'l':list[seq_len], 
                      'feat_n': n of feature space, 
                      'label_n': n of label space
                      }

            The input valid samples dict.


        Returns

        -------

        self : object

            Fitted estimator.

        """
        self.task_type = assign_task_type
        if data_dict is not None:
            self._data_check([data_dict])
            data = ml_reader.DatasetReader(
                data_dict, device=self.device,
                task_type=self.task_type).get_data()
            _X = np.array(data['X'])
            _y = np.array(data['Y'])
        elif X is not None and y is not None:
            self._data_check([{'X': X, 'Y': y}])
            _X = X
            _y = y
        else:
            raise Exception('fill in correct data for model train')

        print(np.shape(_X), np.shape(_y))
        self._build_model()
        self.predictor.fit(_X, _y)
        model_path = os.path.join(self.checkout_dir, 'best.model')
        joblib.dump(self.predictor, model_path)

    def _save_config(self, config, config_type):
        temp_path = os.path.join(self.checkout_dir,
                                 "{0}_config.json".format(config_type))
        if os.path.exists(temp_path):
            os.remove(temp_path)
        with open(temp_path, "w", encoding='utf-8') as f:
            f.write(json.dumps(config, indent=4))

    def _load_config(self, config_type):
        temp_path = os.path.join(self.checkout_dir,
                                 '{0}_config.json'.format(config_type))
        assert os.path.exists(
            temp_path
        ), 'cannot find {0}_config.json, please check it in dir {1}'.format(
            config_type, self.checkout_dir)
        with open(temp_path, 'r') as f:
            config = json.load(f)
        return config

    def load_model(self):
        """
        
        Parameters

        ----------

        loaded_epoch : str, loaded model name 
        
            we save the model by <epoch_count>.epoch, latest.epoch, best.epoch

        Returns

        -------

        self : object

            loaded estimator.

        """
        model_path = os.path.join(self.checkout_dir, 'best.model')
        self.task_type = self._load_config('tasktype')['tasktype']
        self.predictor = joblib.load(model_path)

    def inference(self, data_dict, X=None, y=None):
        """

        Parameters

        ----------

        test_data : {
                      'x':list[episode_file_path], 
                      'y':list[label], 
                      'l':list[seq_len], 
                      'feat_n': n of feature space, 
                      'label_n': n of label space
                      }

            The input test samples dict.
  
        """

        if data_dict is not None:
            self._data_check([data_dict])
            data = ml_reader.DatasetReader(
                data_dict, device=self.device,
                task_type=self.task_type).get_data()
            _X = data['X']
            _y = data['Y']
        elif X is not None and y is not None:
            self._data_check([{'X': X, 'Y': y}])
            _X = X
            _y = y
        else:
            raise Exception('fill in correct data for model inference')

        if self.task_type == 'regression':
            real_v = _y.reshape(-1, 1)
            # regressors have no predict_proba; use point predictions instead
            prob_v = self.predictor.predict(_X).reshape(-1, 1)
        elif self.task_type in ['binaryclass']:
            real_v = _y.reshape(-1, 1)
            prob_v = self.predictor.predict_proba(_X)[:, 1].reshape(-1, 1)
        elif self.task_type in ['multiclass']:
            real_v = np.array(_y)
            prob_v = self.predictor.predict_proba(_X).reshape(
                -1,
                np.shape(real_v)[1])
        elif self.task_type in ['multilabel']:
            real_v = np.array(_y)
            prob_v = []
            _prob_v = self.predictor.predict_proba(_X)
            for each_class in _prob_v:
                if len(each_class) == 1:
                    each_class = np.array([each_class])
                if np.shape(each_class)[1] == 2:
                    v = each_class[:, 1].reshape((-1, 1))
                else:
                    v = each_class
                prob_v.append(v)
            prob_v = np.concatenate(prob_v, 1)

        pickle.dump(prob_v, open(os.path.join(self.result_dir, 'hat_y'), 'wb'))
        pickle.dump(real_v, open(os.path.join(self.result_dir, 'y'), 'wb'))

    def get_results(self):
        """
        
        Load saved prediction results in current ExpID
            truth_value: proj_root/experiments_records/*****(exp_id)/results/y
            predict_value: proj_root/experiments_records/*****(exp_id)/results/hat_y
            xxx represents the loaded model
        
        """
        try:
            hat_y = pickle.load(
                open(os.path.join(self.result_dir, 'hat_y'), 'rb'))
        except IOError:
            print('Error: cannot find file {0} or load failed'.format(
                os.path.join(self.result_dir, 'hat_y')))
        try:
            y = pickle.load(open(os.path.join(self.result_dir, 'y'), 'rb'))
        except IOError:
            print('Error: cannot find file {0} or load failed'.format(
                os.path.join(self.result_dir, 'y')))

        results = {'hat_y': hat_y, 'y': y}

        return results
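
For reference, a hypothetical example of the data_dict shape the docstrings above describe; the file paths and values are placeholders, and fit() would then read the episode files through ml_reader:

# hypothetical data_dict matching the documented shape; paths are placeholders
data_dict = {
    'x': ['./episodes/ep_0.pkl', './episodes/ep_1.pkl'],  # episode file paths
    'y': [[0], [1]],                                      # labels
    'l': [24, 18],                                        # sequence lengths
    'feat_n': 76,                                         # size of feature space
    'label_n': 1,                                         # size of label space
}
# clf = XGBoostText(expmodel_id='test.new')
# clf.fit(data_dict, assign_task_type='binaryclass')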
Example #29
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=44,
                                                    shuffle=True,
                                                    test_size=0.2)

from xgboost import XGBClassifier, XGBRFRegressor, plot_importance
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score

kfold = KFold(n_splits=5, shuffle=True)

# model = XGBRFRegressor(max_depth=max_depth, learning_rate=learning_rate,
#                         n_estimators=n_estimators, n_jobs=n_jobs,
#                         colsample_bylevel = colsample_bylevel,
#                         colsample_bytree=colsample_bytree )
model = RandomizedSearchCV(XGBRFRegressor(), parameters, cv=kfold,
                           verbose=2)  # kfold 5 folds x 20 candidates = 100 fits total

# compare performance with the default-parameter model

model.fit(x_train, y_train)

acc = model.score(x_test, y_test)
print("acc:", acc)

print("최적의 매개변수:", model.best_estimator_)
print("최적의 파라미터:", model.best_params_)

y_predict = model.predict(x_test)
print('final R2 score:', r2_score(y_test, y_predict))
Example #30
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder

df_num = df.copy()
for column in encoders.keys():
    encoder = encoders[column]
    df_num[column] = encoder.transform(df[column])

    # set up features and target
train_num = df_num.sample(frac=1, random_state=0)
train_features = train_num.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
train_target = np.log1p(train_num['AMT'])

# training
model = XGBRFRegressor(learning_rate=0.1)
model.fit(train_features, train_target)

CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs = df_num['STD_CLSS_NM'].unique()
HOM_SIDO_NMs = df_num['HOM_SIDO_NM'].unique()
AGEs = df_num['AGE'].unique()
SEX_CTGO_CDs = df_num['SEX_CTGO_CD'].unique()
FLCs = df_num['FLC'].unique()
years = [2020]
months = [4, 7]

temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for HOM_SIDO_NM in HOM_SIDO_NMs: