def build_model(X_train, y_train, X_valid, y_valid):
    # assumes: from xgboost import XGBRFClassifier
    # and, on older xgboost releases, from xgboost.callback import print_evaluation, early_stop
    best_params = {
        'base_score': 2,
        'colsample_bylevel': 0.75,
        'colsample_bynode': 0.57,
        'colsample_bytree': 0.95,
        'gamma': 0.25,
        'learning_rate': 1.7,
        'max_depth': 18,
        'min_child_weight': 0.025,
        'n_estimators': 353,
        'n_jobs': -1,
        'num_class': 3,
        'num_parallel_tree': 105,
        'objective': 'multi:softmax',
        'random_state': 42,
        'subsample': 0.8,
        'verbosity': 0,
        'reg_alpha': 0.05,
        'reg_lambda': 1,
        'rate_drop': 0.5  # note: 'rate_drop' is a DART-booster parameter; the gbtree booster behind XGBRFClassifier ignores it
    }
    best_xgb = XGBRFClassifier(**best_params)
    best_xgb.fit(X_train, y_train,
                 eval_set=[(X_train, y_train), (X_valid, y_valid)],
                 eval_metric=['merror'],
                 early_stopping_rounds=50,
                 callbacks=[print_evaluation(period=5),
                            early_stop(stopping_rounds=15)],
                 verbose=False)
    return best_xgb
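The fit() signature above matches the pre-1.6 xgboost scikit-learn wrapper. On xgboost 1.6+ those arguments are deprecated in fit() (and later removed) and move to the constructor; a minimal sketch of the equivalent call under that assumption (the helper name build_model_v16 is hypothetical):

from xgboost import XGBRFClassifier

def build_model_v16(X_train, y_train, X_valid, y_valid, best_params):
    # same configuration, written against the 1.6+ constructor-based API
    best_xgb = XGBRFClassifier(
        **best_params,
        eval_metric='merror',        # moved from fit() to the constructor
        early_stopping_rounds=50,    # likewise; xgboost.callback.EarlyStopping /
    )                                # EvaluationMonitor replace early_stop / print_evaluation
    best_xgb.fit(X_train, y_train,
                 eval_set=[(X_train, y_train), (X_valid, y_valid)],
                 verbose=False)
    return best_xgb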
def test_xg_XGBRFClassifier():
    print("Testing xgboost, XGBRFClassifier...")
    # Note, only works with binary outcomes!
    mod = XGBRFClassifier()
    X, y = iris_data
    ybin = np.where(y <= 1, 0, 1)
    mod.fit(X, ybin)
    docs = {'name': "XGBRFClassifier test"}
    fv = X[0, :]
    upload(mod, fv, docs)
def fast_gbtree_classifier(
    X,
    y,
    *,
    learning_rate: float = 1.0,
    n_estimators: int = 100,
    subsample: float = 0.8,
    max_depth: Optional[int] = None,
    reg_alpha: Optional[float] = None,  # L1
    reg_lambda: Optional[float] = 1e-05,  # L2
    gamma: Optional[float] = None,
    missing: Optional[Any] = np.nan,
    objective: Objectives = 'binary:logistic',
    grow_policy: Literal['depthwise', 'lossguide'] = 'depthwise',
    tree_method: Literal['auto', 'exact', 'approx', 'hist', 'gpu_hist'] = 'auto',
    importance_type: Literal['gain', 'weight', 'cover', 'total_gain', 'total_cover'] = 'gain',
    random_state: int = 1,
    n_jobs: Optional[int] = None,
    framework: Literal['auto', 'xgboost', 'sklearn'] = 'auto',
    **kwargs,
) -> GradientBoostingClassifier:
    """Shared interface for the XGBoost and sklearn gradient boosting tree classifiers."""
    kw = dict(locals())
    kwargs = kw.pop('kwargs')
    X = kw.pop('X')
    y = kw.pop('y')
    kw.update(kwargs)
    framework = kw.pop('framework')
    ### XGBOOST
    is_xgboost = False
    if framework == 'sklearn':
        XGB = GradientBoostingClassifier
    else:
        try:
            from xgboost import XGBRFClassifier as XGB
            is_xgboost = True
        except ImportError as e:
            warn('Run `pip install xgboost` to get a significantly '
                 'faster GradientBoostingTree')
            XGB = GradientBoostingClassifier
    ### fine-tune the keywords for sklearn
    if not is_xgboost:
        org = dict(kw)
        spec = inspect.getfullargspec(XGB.__init__)
        kw = dict()
        for k in spec.args + spec.kwonlyargs:
            if k in org:
                kw[k] = org[k]
    ### training
    tree = XGB(**kw)
    tree.fit(X, y)
    return tree
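A brief usage sketch, not from the original source; the breast-cancer data and parameter choices are illustrative only:

from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
# uses XGBRFClassifier when xgboost is installed, otherwise falls back to
# sklearn's GradientBoostingClassifier with the compatible subset of keywords
clf = fast_gbtree_classifier(X, y, n_estimators=50, random_state=1)
print(type(clf).__name__, clf.score(X, y))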
def get_models():
    models = dict()
    for v in arange(0.1, 1.1, 0.1):
        key = '%.1f' % v
        models[key] = XGBRFClassifier(n_estimators=100, subsample=0.9, colsample_bynode=v)
    return models
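A hedged sketch of how such a dictionary of models is typically scored; the evaluate_model helper and the X, y arrays are assumptions, not part of the original:

from numpy import mean, std
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

def evaluate_model(model, X, y):
    # repeated stratified k-fold cross-validation, scored by accuracy
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# assuming X, y are already defined:
# for name, model in get_models().items():
#     scores = evaluate_model(model, X, y)
#     print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))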
def _set_surrogate(self, X, y=None):
    if not hasattr(self, "_surrogate"):
        target = type_of_target(y)
        if target == "continuous":
            self._surrogate = XGBRFRegressor(max_depth=7, n_estimators=150)
        elif target in ["binary", "multiclass"]:
            self._surrogate = XGBRFClassifier(max_depth=7, n_estimators=150)
        else:
            raise ValueError(
                "Multioutput and multilabel datasets are not supported.")
def xgrfboost_classification(train, target, n_estimators=100, max_depth=8, random_state=17,
                             learning_rate=0.1, colsample_bytree=0.9, colsample_bynode=0.9,
                             colsample_bylevel=0.9, importance_type='split', reg_alpha=2,
                             reg_lambda=2):
    '''XGRFBoost Classification

    Params :-
    train - training set to fit on
    target - target labels to predict
    n_estimators - number of trees (default 100)
    max_depth - maximum depth a tree can grow to (default 8)
    random_state - an arbitrary seed so the same results are reproduced on
                   different machines with the same params (default 17)
    learning_rate - step size taken towards the minimum at each update
    colsample_bytree, colsample_bynode, colsample_bylevel - fraction of the
                   features sampled per tree, per node, and per level
    importance_type - metric used to score feature importance (default 'split')
    reg_alpha, reg_lambda - L1 and L2 regularisation respectively'''
    from xgboost import XGBRFClassifier
    # note: xgboost itself expects importance_type in
    # {'gain', 'weight', 'cover', 'total_gain', 'total_cover'}
    model = XGBRFClassifier(n_estimators=n_estimators, max_depth=max_depth,
                            random_state=random_state, learning_rate=learning_rate,
                            colsample_bytree=colsample_bytree, colsample_bynode=colsample_bynode,
                            colsample_bylevel=colsample_bylevel, importance_type=importance_type,
                            reg_alpha=reg_alpha, reg_lambda=reg_lambda)
    model.fit(train, target)
    print("Training Completed .....")
    return model
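A minimal usage sketch, not from the original; the iris data stands in for the caller's train/target arrays, and importance_type is overridden to a value xgboost accepts:

from sklearn.datasets import load_iris

X_train, y_train = load_iris(return_X_y=True)
model = xgrfboost_classification(X_train, y_train, n_estimators=200,
                                 importance_type='gain')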
def train_XGB(self):
    try:
        self.xgb_signal = XGBRegressor()
        self.xgb_coverage = XGBClassifier()
        self.xgb_coverage2 = XGBRFClassifier()
        self.xgb_polluted = XGBClassifier()
        self.xgb_hole = XGBClassifier()
        self.xgb_preds = other_result(
            'xgb', self.xgb_signal, self.xgb_coverage, self.xgb_coverage2,
            self.xgb_polluted, self.xgb_hole, self.df_train, self.df_val,
            self.df_train_per_locs, self.df_val_per_locs,
            self.pollute_happened, self.model_info)
    except Exception:
        traceback.print_exc()
def multi_default_models(self, models=None):
    if models:
        ob2 = cl_modeling(self.X_train, self.X_test, self.y_train, self.y_test)
        for model in models:
            print(model)
            ob2.train_predict_model(model)
            print()
    else:
        ob2 = cl_modeling(self.X_train, self.X_test, self.y_train, self.y_test)

        # Import model libraries
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.linear_model import LogisticRegression
        from sklearn.naive_bayes import GaussianNB
        from sklearn.tree import DecisionTreeClassifier
        from lightgbm import LGBMClassifier
        from xgboost import XGBRFClassifier

        # Setting Models
        nb = GaussianNB()
        rf = RandomForestClassifier(criterion="entropy", n_estimators=500, max_depth=6)
        lr = LogisticRegression(solver="lbfgs", max_iter=1000)
        dt = DecisionTreeClassifier(max_depth=13, min_samples_leaf=10)
        xgb = XGBRFClassifier(max_depth=10, learning_rate=0.1)
        lgbm_rf = LGBMClassifier(
            boosting_type="rf",
            n_jobs=1,
            bagging_freq=3,
            bagging_fraction=0.3,
            importance_type="gain",
        )
        lgbm_dart = LGBMClassifier(boosting_type="dart", n_jobs=1, importance_type="gain")
        lgbm = LGBMClassifier(n_jobs=1, importance_type="gain")

        # Evaluating
        model_list = [nb, lr, dt, rf, xgb, lgbm_rf, lgbm_dart, lgbm]
        for model in model_list:
            print(model)
            ob2.train_predict_model(model)
            print()
        ob2 = cl_modeling(self.X_train, self.X_test, self.y_train, self.y_test)
x = np.load('./data/x_data.npy')
y = np.load('./data/y_data.npy')
x_pred = np.load('./data/x_pred.npy')

print("x.shape :", x.shape)
print("y.shape :", y.shape)
print("x_pred.shape :", x_pred.shape)

x = x.reshape(x.shape[0], 64 * 64 * 3)
x_pred = x_pred.reshape(x_pred.shape[0], 64 * 64 * 3)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=77, shuffle=True)

# model = XGBClassifier()
model = MultiOutputClassifier(XGBRFClassifier())

# 3. Train
model.fit(x_train, y_train)

# 4. Evaluate, predict
acc = model.score(x_test, y_test)
print("acc :", acc)

y_pred = model.predict(x_pred)
# base learners
# C0=best_classifier
C1 = DecisionTreeClassifier(max_depth=8)
C2 = CatBoostClassifier(verbose=0)
C3 = KNeighborsClassifier()
C4 = BernoulliNB()
C5 = RandomForestClassifier()
C6 = XGBClassifier()
C7 = RidgeClassifier()
C8 = KNeighborsClassifier()
C9 = AdaBoostClassifier()
C10 = MLPClassifier(alpha=1, max_iter=1000)
C11 = RidgeClassifier()
C12 = BaggingClassifier()
C13 = ExtraTreesClassifier()
C14 = XGBRFClassifier()
C15 = GradientBoostingClassifier()
C16 = GaussianNB()
C17 = HistGradientBoostingClassifier()
C18 = KNeighborsClassifier()
C19 = SVC()
C20 = RidgeClassifierCV()
Cm = LogisticRegression(max_iter=3000, C=0.2)
Cm1 = LogisticRegression(max_iter=3000, C=0.4)
Cm2 = LogisticRegression(max_iter=3000, C=0.6)
Cm3 = LogisticRegression(max_iter=3000, C=0.8)
Cm4 = LogisticRegression(max_iter=3000, C=1)
names = [
    'XGBClassifier',
    'RidgeClassifier',
    'RidgeClassifierCV',
    'HistGradientBoostingClassifier',
    'GradientBoostingClassifier',
    'BaggingClassifier',
    'ExtraTreesClassifier',
    'XGBRFClassifier',
) print(f"Before Label Encoder, x['Gender'].unique() = { x['Gender'].unique() }") from sklearn.preprocessing import StandardScaler sc = StandardScaler() print(f"\nBefore Standard Scaler, x.head() :- \n{ x.head() }") x = sc.fit_transform(x) print(f"\nAfter Standard Scaler, x :- \n{ x }") from sklearn.model_selection import train_test_split x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) from xgboost import XGBRFClassifier xgboost = XGBRFClassifier() xgboost.fit(x_train, y_train) y_pred = xgboost.predict(x_test) print( f"xgboost.score( x_test, y_test ) = { xgboost.score( x_test, y_test ) * 100 }%" ) import matplotlib.pyplot as plt plt.plot(x_test, y_test, label='Actual', marker='*', color='blue',
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRFRegressor, XGBRFClassifier

x, y = load_breast_cancer(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

model = XGBRFClassifier(n_estimators=1000, learning_rate=0.1)
model.fit(x_train, y_train, verbose=True, eval_metric="error",
          eval_set=[(x_train, y_train), (x_test, y_test)])
# eval_metric options: rmse, mae, logloss, error, auc

results = model.evals_result()
print("eval:", results)

y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("acc:", acc)

# import pickle
# pickle.dump(model, open("./model/sample/xgb_save/cancer.pickle.dat", "wb"))
# import joblib
# joblib.dump(model, "./model/sample/xgb_save/cancer.joblib.dat")
model.save_model("./model/sample/xgb_save/cancer.model")
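As a side note (not in the original), a model written with save_model can be restored into a fresh estimator; a minimal sketch reusing the path above:

from xgboost import XGBRFClassifier

loaded = XGBRFClassifier()
loaded.load_model("./model/sample/xgb_save/cancer.model")  # restores the trained booster
print("reloaded acc:", loaded.score(x_test, y_test))       # assumes x_test, y_test are still in scope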
    accuracy = accuracy_score(y_train, y_hat)
    print(accuracy)
    return y_hat

# SVM
svm_pred = train_and_test_survived(svm.SVC(kernel='rbf', C=100, gamma=0.01))
# kNN
knn_pred_4 = train_and_test_survived(KNeighborsClassifier(n_neighbors=6))
# Random Forest
rf_pred = train_and_test_survived(
    RandomForestClassifier(n_estimators=400, random_state=14))
# XGBRF Classifier
xgbrf_pred = train_and_test_survived(XGBRFClassifier(n_estimators=100))
# LGBM Classifier
lgbm_pred = train_and_test_survived(
    LGBMClassifier(boosting_type='gbdt', random_state=90, colsample_bytree=0.9,
                   max_depth=5, subsample=0.9, n_estimators=40))

#%%
from xgboost import XGBRFClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
# negative = train_data[train_data.target == 0]
# positive = train_data[train_data.target == 1]
#
# # downsample majority
# neg_downsampled = resample(negative,
#                            replace=True,              # sample with replacement
#                            n_samples=len(positive),   # match number in minority class
#                            random_state=27)           # reproducible results
# # combine minority and downsampled majority
# downsampled = pd.concat([positive, neg_downsampled]).dropna()  # check new class counts
#
# X_train = pd.DataFrame(downsampled.drop(columns="target"), index=downsampled.index)
# y_train = pd.Series(downsampled["target"], index=downsampled.index)

my_model = XGBRFClassifier(random_state=1).fit(X_train, y_train)
predictions = my_model.predict(X_val)
print("Matthews Correlation Coefficient: " + str(matthews_corrcoef(y_val, predictions)))
print("Precision Score: " + str(precision_score(y_val, predictions)))
print("Recall Score: " + str(recall_score(y_val, predictions)))
ROC_curve(y_val, predictions)

X_train_filtered = pd.DataFrame(X_train).iloc[:, [
    173, 141, 530, 683, 661, 498, 48, 183, 206, 716, 697, 185, 211, 624, 671,
    623, 67, 111, 118, 129
]]
X_val_filtered = pd.DataFrame(X_val).iloc[:, [
    173, 141, 530, 683, 661, 498, 48, 183, 206, 716, 697, 185, 211, 624, 671,
from xgboost import XGBRFClassifier
from sklearn.model_selection import train_test_split

df = pd.read_csv("heart_failure_clinical_records_dataset.csv")

t = np.array(list(df['creatinine_phosphokinase'])).reshape(-1, 1)
pt = PowerTransformer(method="yeo-johnson")
creatinine_phosphokinase = pt.fit_transform(t)
df['creatinine_phosphokinase'] = creatinine_phosphokinase

t = np.array(list(df['serum_creatinine'])).reshape(-1, 1)
pt = PowerTransformer(method="yeo-johnson")
serum_creatinine = pt.fit_transform(t)
df['serum_creatinine'] = serum_creatinine

df.drop(columns=['sex', 'diabetes'], inplace=True)

X = df.iloc[:, 0:10].values
Y = df['DEATH_EVENT'].values
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=6)

xrclf = XGBRFClassifier()
xrclf.fit(x_train, y_train)

pickle.dump(xrclf, open('xrclf.pkl', 'wb'))
clf = pickle.load(open('xrclf.pkl', 'rb'))
print(clf.score(x_test, y_test))
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRFRegressor, XGBRFClassifier

x, y = load_breast_cancer(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

model = XGBRFClassifier(n_estimators=1000, learning_rate=0.1)
model.fit(x_train, y_train, verbose=True, eval_metric="error",
          eval_set=[(x_train, y_train), (x_test, y_test)])
# eval_metric options: rmse, mae, logloss, error, auc

results = model.evals_result()
print("eval:", results)

y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("acc:", acc)

# import pickle
# pickle.dump(model, open("./model/sample/xgb_save/cancer.pickle.dat", "wb"))
import joblib
# Iris classification model
iris = load_iris()
x = iris.data
y = iris.target
print(x.shape)  # (150, 4)
print(y.shape)  # (150,)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True)

n_estimators = 3300        # number of trees
learning_rate = 1          # learning rate
colsample_bytree = 0.92    # typically 0.6-0.9
colsample_bylevel = 0.92   # typically 0.6-0.9
max_depth = 6
n_jobs = -1

model = XGBRFClassifier(max_depth=max_depth, learning_rate=learning_rate,
                        n_estimators=n_estimators, n_jobs=n_jobs,
                        colsample_bytree=colsample_bytree,
                        colsample_bylevel=colsample_bylevel)
# no preprocessing such as missing-value removal is required
model.fit(x_train, y_train)

score = model.score(x_test, y_test)
print('score: ', score)

plot_importance(model)
# plt.show()
## Data
x, y = load_breast_cancer(return_X_y=True)
print(x.shape)  # (569, 30)
print(y.shape)  # (569,)

## train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    shuffle=True, random_state=66)

## Modelling
model = XGBRFClassifier(
    n_estimators=300,   # number of boosting rounds reported by verbose, analogous to epochs
    learning_rate=0.1)

model.fit(x_train, y_train, verbose=True,
          eval_metric=['error', 'auc'],
          eval_set=[(x_train, y_train), (x_test, y_test)])
          # early_stopping_rounds=100)
# eval_metric options: rmse, mae, logloss, error (an error of 0.2 means an accuracy of 0.8), auc

results = model.evals_result()
print("eval's result : ", results)

y_pred = model.predict(x_test)
)
svm_svc = svm.SVC(C=50, degree=1, gamma="auto", kernel="rbf", probability=True,
                  random_state=RANDOM_STATE)
svm_nu = svm.NuSVC(degree=1, kernel="rbf", nu=0.25, probability=True,
                   random_state=RANDOM_STATE)
mlpc = MLPClassifier(activation="relu", alpha=0.1, hidden_layer_sizes=(10, 10, 10),
                     learning_rate="constant", max_iter=3000, random_state=RANDOM_STATE)
xgboost = XGBClassifier(n_estimators=600, objective='multi:softmax',
                        use_label_encoder=False, nthread=1)
xgforest = XGBRFClassifier(n_estimators=600, objective='multi:softmax',
                           subsample=0.9, colsample_bynode=0.2,
                           use_label_encoder=False)

model_name = PATH_MODEL + 'RandomForestOptimized.sav'
loaded_model = pickle.load(open(model_name, 'rb'))
# Vectorize the features
dv = DictVectorizer(sparse=False)  # one-hot encode the non-numeric values of the dict records
train_features = dv.fit_transform(
    train_features.to_dict(orient='records'))  # to_dict converts the frame to dict records
test_features = dv.transform(test_features.to_dict(orient='records'))

# Define the models
# Define the classifiers
classifiers = [
    SVC(random_state=1),
    DecisionTreeClassifier(random_state=1),
    KNeighborsClassifier(),
    LogisticRegression(random_state=1),
    CatBoostClassifier(random_state=1),
    XGBRFClassifier(random_state=1),
    LGBMClassifier(random_state=1)
]
# Classifier names
classifiers_names = ['svc', 'dt', 'knn', 'lr', 'cbc', 'xgbfc', 'lgbmc']
# Parameter grids
# classifiers_param=[{'svc_C':[0.1,1,10]},{'dt_min_samples_split':[1,3,5]},{'knn_n_neighbors':[3,5,7]},{'lr_c':[0.1,1,10]},
#                    {'cbc_learning_rate':[0.01,0.05,0.1]},{'xgbfc_learning_rate':[0.01,0.05,0.1]},{'lgbmc_learning_rate':[0.01,0.05,0.1]}]
classifiers_param = [{
    'C': [0.1, 0.5, 1]
}, {
    'criterion': ['gini', 'entropy']
}, {
    'n_neighbors': [1, 2, 3]
def HyperOptPipeline(algo, n_iter=-1):
    if algo in ['linreg', 'logreg', 'svr', 'svc']:
        ss = StandardScaler()
        mms = MinMaxScaler()
        if algo == 'linreg':
            model_linreg = LinearRegression()
            model_lasso = Lasso()
            model_ridge = Ridge()
            model_elasticnet = ElasticNet()
            params = [
                {
                    'scaler': [ss, mms],
                    'estimator': [model_linreg]
                }, {
                    'scaler': [ss, mms],
                    'estimator__alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                    'estimator': [model_lasso]
                }, {
                    'scaler': [ss, mms],
                    'estimator__alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                    'estimator': [model_ridge]
                }, {
                    'scaler': [ss, mms],
                    'estimator__alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                    'estimator__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
                    'estimator': [model_elasticnet]
                }
            ]
            pipeline = Pipeline([('scaler', ss), ('estimator', model_linreg)])
        if algo == 'logreg':
            model_logreg = LogisticRegression(class_weight='balanced', solver='saga',
                                              max_iter=100_000)
            params = [
                {
                    'scaler': [ss, mms],
                    'estimator__penalty': ['l1', 'l2'],
                    'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                }, {
                    'scaler': [ss, mms],
                    'estimator__penalty': ['elasticnet'],
                    'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                    'estimator__l1_ratio': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
                }, {
                    'scaler': [ss, mms],
                    'estimator__penalty': ['none'],
                },
            ]
            pipeline = Pipeline([('scaler', ss), ('estimator', model_logreg)])
        if algo in ['svc', 'svr']:
            model = SVC(class_weight='balanced') if algo == 'svc' else SVR()
            params = [
                {
                    'scaler': [ss, mms],
                    'estimator__kernel': ['linear'],
                    'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                }, {
                    'scaler': [ss, mms],
                    'estimator__kernel': ['rbf', 'sigmoid'],
                    'estimator__gamma': ['scale', 'auto'],
                    'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                }, {
                    'scaler': [ss, mms],
                    'estimator__kernel': ['poly'],
                    'estimator__gamma': ['scale', 'auto'],
                    'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                    'estimator__degree': [2, 3, 4, 5]
                }
            ]
            pipeline = Pipeline([('scaler', ss), ('estimator', model)])
    if algo in ['ctree', 'rtree']:
        if algo == 'ctree':
            model_rf = RandomForestClassifier(class_weight='balanced')
            model_gb = GradientBoostingClassifier()
            model_et = ExtraTreesClassifier(class_weight='balanced')
            model_xgb = XGBClassifier()
            model_xgbrf = XGBRFClassifier()
            model_cb = CatBoostClassifier(bootstrap_type='Bernoulli')
            model_lgbm = LGBMClassifier(class_weight='balanced')
        else:
            model_rf = RandomForestRegressor()
            model_gb = GradientBoostingRegressor()
            model_et = ExtraTreesRegressor()
            model_xgb = XGBRegressor()
            model_xgbrf = XGBRFRegressor()
            model_cb = CatBoostRegressor(bootstrap_type='Bernoulli')
            model_lgbm = LGBMRegressor()
        params = [
            {
                'estimator': [model_rf],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__max_depth': [5, 10, 15, 25, 30, None],
                'estimator__min_samples_split': [1.0, 2, 5, 10, 15, 100],
                'estimator__min_samples_leaf': [1, 2, 5, 10],
            }, {
                'estimator': [model_gb],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
                'estimator__min_samples_split': [1.0, 2, 5, 10, 15, 100],
                'estimator__min_samples_leaf': [1, 2, 5, 10],
            }, {
                'estimator': [model_et],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__max_depth': [5, 10, 15, 25, 30, None],
                'estimator__min_samples_split': [1.0, 2, 5, 10, 15, 100],
                'estimator__min_samples_leaf': [1, 2, 5, 10],
            }, {
                'estimator': [model_xgb],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
                'estimator__gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
                'estimator__min_child_weight': [1, 3, 5, 7],
                'estimator__reg_lambda': [0.01, 0.1, 1.0],
                'estimator__reg_alpha': [0, 0.1, 0.5, 1.0],
            }, {
                'estimator': [model_xgbrf],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
                'estimator__gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
                'estimator__min_child_weight': [1, 3, 5, 7],
                'estimator__reg_lambda': [0.01, 0.1, 1.0],
                'estimator__reg_alpha': [0, 0.1, 0.5, 1.0],
            }, {
                'estimator': [model_cb],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [3, 5, 7, 9, 12, 15, 16],
                'estimator__reg_lambda': [0.01, 0.1, 1.0],
            }, {
                'estimator': [model_lgbm],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__min_child_samples': [1, 2, 5, 10, 15, 100],
                'estimator__min_child_weight': [1, 3, 5, 7],
                'estimator__reg_lambda': [0.01, 0.1, 1.0],
                'estimator__reg_alpha': [0, 0.1, 0.5, 1.0],
            }
        ]
        pipeline = Pipeline([('estimator', model_rf)])

    n_params = 0
    for param_dict in params:
        n = 1
        for v in param_dict.values():
            n *= len(v)
        n_params += n
    print(n_params, 'parameter settings identified')

    if n_iter == -1:
        return GridSearchCV(pipeline, params,
                            cv=ShuffleSplit(test_size=0.1, n_splits=1, random_state=19))
    return RandomizedSearchCV(pipeline, params, n_iter=n_iter,
                              cv=ShuffleSplit(test_size=0.1, n_splits=1, random_state=19),
                              random_state=19)
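A hedged usage sketch for the pipeline builder above; the X, y arrays and the chosen algo are illustrative assumptions, not part of the original:

# exhaustive grid search over the classification tree ensembles
search = HyperOptPipeline('ctree')                 # n_iter=-1 -> GridSearchCV
# search = HyperOptPipeline('ctree', n_iter=50)    # -> RandomizedSearchCV with 50 draws
search.fit(X, y)                                   # X, y assumed to be defined by the caller
print(search.best_params_)
print(search.best_score_)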
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-2,
    verbose=2
)
gridfit_class1_xgb = grid_search.fit(X_train, y_train)
print('Best CV roc_auc score:', gridfit_class1_xgb.best_score_)
gridfit_class1_xgb.best_params_

# Import classifier, instantiate, and set target variable
from xgboost import XGBRFClassifier
clf = XGBRFClassifier(random_state=0, n_jobs=-1)
y = targets.class2

# Establish estimators to be used with IterativeImputer
estimators = [BayesianRidge(),
              DecisionTreeRegressor(max_features='sqrt', random_state=123),
              ExtraTreesRegressor(max_features='sqrt', n_estimators=10, random_state=123),
              KNeighborsRegressor(n_neighbors=15)]

# Use our wrapper function to compare imputation methods
ax1, ax2 = compare_imputer_scores(X_reduced, y, clf, 'roc_auc', estimators)

# Generate training and holdout (testing) set
    d = dataset.drop(columns=['PassengerId'])
    if test:
        y = None
        X = d.values
    else:
        y = np.ravel(d[['Survived']].values)
        X = d.drop(columns=['Survived']).values
    X = preprocessing.scale(X)
    return (X, y)

(Xtrain, ytrain) = for_model_input(trainset)

knn_imputer = KNNImputer()
Xtrain = knn_imputer.fit_transform(Xtrain)

boosted_model = XGBRFClassifier()
boosted_model.fit(Xtrain, ytrain)
boosted_scores = cross_val_score(boosted_model, Xtrain, ytrain, cv=5)
print("Gradient-Boosting Model CV scores:\n", boosted_scores, np.mean(boosted_scores))

(Xtest, _) = for_model_input(testset, test=True)
Xtest = knn_imputer.fit_transform(Xtest)

predictions_boosted = boosted_model.predict(Xtest)  # + 1) / 2
predictions_boosted = predictions_boosted.astype('int64')
pred_boosted_df = pandas.DataFrame(predictions_boosted, columns=['Survived'])
fin_ans_boosted = pandas.DataFrame(testset['PassengerId']).join(pred_boosted_df)

with open('predictions_xgboost_rf.csv', 'w') as f:
    f.write(fin_ans_boosted.to_csv(index=False))
print(x.shape)  # (506, 13)
print(y.shape)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True)

parameters = [{
    'n_estimators': [300, 500, 3300],
    'learning_rate': [0.01, 0.5, 1],
    'colsample_bytree': [0.6, 0.8, 0.9],   # typically 0.6-0.9
    'colsample_bylevel': [0.6, 0.8, 0.9],
    'max_depth': [6, 7, 8]
}]

model = GridSearchCV(XGBRFClassifier(), parameters, cv=5, n_jobs=-1)
# no preprocessing such as missing-value removal is required
model.fit(x_train, y_train)

print(model.best_estimator_)
print("==========================================")
print(model.best_params_)
print("==========================================")

score = model.score(x_test, y_test)
print('score: ', score)

# plot_importance(model)
# plt.show()
# print("r2 Score : %.2f%%" %(r2 * 100)) print("acc : ", acc) thresholds = np.sort(model.feature_importances_) print(thresholds) for thresh in thresholds: #중요하지 않은 컬럼들을 하나씩 지워나간다. selection = SelectFromModel(model, threshold=thresh, prefit=True) selection_x_train = selection.transform(x_train) selection_x_test = selection.transform(x_test) print(selection_x_train.shape) selection_model = XGBRFClassifier(objective="multi:softprob", n_jobs=-1) selection_model.fit(selection_x_train, y_train, eval_metric=['merror', 'mlogloss'], eval_set=[(selection_x_train, y_train), (selection_x_test, y_test)]) y_pred = selection_model.predict(selection_x_test) acc = accuracy_score(y_test, y_pred) #print("R2:",r2) for i in thresholds: pickle.dump( model, open(
x_test = scaler.transform(x_test)

# Only this much tuning is needed
n_estimators = 1000        # the number of trees in the forest
learning_rate = 1          # learning rate
colsample_bytree = None    # per-tree column sampling / 0.6-0.9 (or 1) is common in practice
colsample_bylevel = 0.9    # [default: 1]: subsample and colsample_bytree already control how many
                           # observations and features each tree uses, so it is debatable whether
                           # also setting colsample_bylevel adds much
max_depth = 29             # [default: 6]: used to prevent overfitting; pick a value via CV,
                           # usually between 3 and 10
n_jobs = -1                # use CV

# XGB is very fast and does not require preprocessing such as missing-value removal
model = XGBRFClassifier(max_depth=max_depth, learning_rate=learning_rate,
                        n_estimators=n_estimators,
                        colsample_bylevel=colsample_bylevel,
                        colsample_bytree=colsample_bytree)
model.fit(x_train, y_train)

score = model.score(x_test, y_test)  # score works like evaluate
print('score :', score)
# print(model.feature_importances_)

plot_importance(model)
# plt.show()

# XGBRFClassifier score : 0.9666666666666667
# XGBClassifier   score : 0.8666666666666667
def main():
    obj_left = load('rfecv_lefthemisphere_RF.joblib')
    obj_right = load('rfecv_righthemisphere_RF.joblib')

    # Get the data ready
    df_leftHemi_train, df_rightHemi_train, df_test_left, df_test_right = \
        get_csvfile_ready(constants.DATADIR_aparc)

    # Correlation analysis
    df_leftHemi_train_corr = correlation_analysis(df_leftHemi_train)
    df_rightHemi_train_corr = correlation_analysis(df_rightHemi_train)

    selected_left_feats = df_leftHemi_train_corr.columns[np.where(obj_left.ranking_ == 1)[0]]
    selected_right_feats = df_rightHemi_train_corr.columns[np.where(obj_right.ranking_ == 1)[0]]

    X_left_clean = df_leftHemi_train_corr[selected_left_feats]
    X_right_clean = df_rightHemi_train_corr[selected_right_feats]
    y_left = df_leftHemi_train_corr['labels']
    y_right = df_rightHemi_train_corr['labels']

    rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=372957125)
    left_scores_dict = defaultdict(list)
    right_scores_dict = defaultdict(list)

    for train_index, test_index in rskf.split(X_left_clean, y_left):
        X_left_clean_train, X_left_clean_test = X_left_clean.iloc[train_index, :], \
            X_left_clean.iloc[test_index, :]
        X_right_clean_train, X_right_clean_test = X_right_clean.iloc[train_index, :], \
            X_right_clean.iloc[test_index, :]
        y_left_train, y_left_test = y_left.iloc[train_index], y_left.iloc[test_index]
        y_right_train, y_right_test = y_right.iloc[train_index], y_right.iloc[test_index]

        rf_clc_left, rf_clc_right = RandomForestClassifier(), RandomForestClassifier()
        left_scores_dict = get_score_dict(rf_clc_left, X_left_clean_train, y_left_train,
                                          X_left_clean_test, y_left_test, left_scores_dict, 'rf')
        right_scores_dict = get_score_dict(rf_clc_right, X_right_clean_train, y_right_train,
                                           X_right_clean_test, y_right_test, right_scores_dict, 'rf')

        rf_clc_left, rf_clc_right = LogisticRegression(), LogisticRegression()
        left_scores_dict = get_score_dict(rf_clc_left, X_left_clean_train, y_left_train,
                                          X_left_clean_test, y_left_test, left_scores_dict, 'lg')
        right_scores_dict = get_score_dict(rf_clc_right, X_right_clean_train, y_right_train,
                                           X_right_clean_test, y_right_test, right_scores_dict, 'lg')

        rf_clc_left, rf_clc_right = XGBClassifier(), XGBClassifier()
        left_scores_dict = get_score_dict(rf_clc_left, X_left_clean_train, y_left_train,
                                          X_left_clean_test, y_left_test, left_scores_dict, 'xgb')
        right_scores_dict = get_score_dict(rf_clc_right, X_right_clean_train, y_right_train,
                                           X_right_clean_test, y_right_test, right_scores_dict, 'xgb')

        rf_clc_left, rf_clc_right = XGBRFClassifier(), XGBRFClassifier()
        left_scores_dict = get_score_dict(rf_clc_left, X_left_clean_train, y_left_train,
                                          X_left_clean_test, y_left_test, left_scores_dict, 'xgbrf')
        right_scores_dict = get_score_dict(rf_clc_right, X_right_clean_train, y_right_train,
                                           X_right_clean_test, y_right_test, right_scores_dict, 'xgbrf')

        rf_clc_left, rf_clc_right = GaussianNB(), GaussianNB()
        left_scores_dict = get_score_dict(rf_clc_left, X_left_clean_train, y_left_train,
                                          X_left_clean_test, y_left_test, left_scores_dict, 'nb')
        right_scores_dict = get_score_dict(rf_clc_right, X_right_clean_train, y_right_train,
                                           X_right_clean_test, y_right_test, right_scores_dict, 'nb')

        rf_clc_left, rf_clc_right = SVC(), SVC()
        left_scores_dict = get_score_dict(rf_clc_left, X_left_clean_train, y_left_train,
                                          X_left_clean_test, y_left_test, left_scores_dict, 'rsvm')
        right_scores_dict = get_score_dict(rf_clc_right, X_right_clean_train, y_right_train,
                                           X_right_clean_test, y_right_test, right_scores_dict, 'rsvm')

        rf_clc_left, rf_clc_right = LinearSVC(), LinearSVC()
        left_scores_dict = get_score_dict(rf_clc_left, X_left_clean_train, y_left_train,
                                          X_left_clean_test, y_left_test, left_scores_dict, 'lsvm')
        right_scores_dict = get_score_dict(rf_clc_right, X_right_clean_train, y_right_train,
                                           X_right_clean_test, y_right_test, right_scores_dict, 'lsvm')