Example No. 1
from xgboost import XGBRFClassifier
# print_evaluation / early_stop are the legacy callback helpers shipped by older xgboost releases
from xgboost.callback import early_stop, print_evaluation


def build_model(X_train, y_train, X_valid, y_valid):
    best_params = {
        'base_score': 2,
        'colsample_bylevel': 0.75,
        'colsample_bynode': 0.57,
        'colsample_bytree': 0.95,
        'gamma': 0.25,
        'learning_rate': 1.7,
        'max_depth': 18,
        'min_child_weight': 0.025,
        'n_estimators': 353,
        'n_jobs': -1,
        'num_class': 3,
        'num_parallel_tree': 105,
        'objective': 'multi:softmax',
        'random_state': 42,
        'subsample': 0.8,
        'verbosity': 0,
        'reg_alpha': 0.05,
        'reg_lambda': 1,
        'rate_drop': 0.5  # DART-only parameter; ignored by the random-forest booster
    }
    best_xgb = XGBRFClassifier(**best_params)

    best_xgb.fit(X_train, y_train,
                 eval_set=[(X_train, y_train),
                           (X_valid, y_valid)],
                 eval_metric=['merror'],
                 early_stopping_rounds=50,  # overlaps with the early_stop callback below
                 callbacks=[print_evaluation(period=5),
                            early_stop(stopping_rounds=15)],
                 verbose=False,)
    return best_xgb
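A minimal usage sketch for build_model (not part of the original example; the dataset, the split, and an older xgboost release that still ships the print_evaluation/early_stop callbacks are assumptions):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# iris has three classes, matching num_class=3 above
X, y = load_iris(return_X_y=True)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
model = build_model(X_train, y_train, X_valid, y_valid)
print("validation accuracy:", model.score(X_valid, y_valid))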
Example No. 2
def test_xg_XGBRFClassifier():
    print("Testing xgboost, XGBRFClassifier...")
    # Note, only works with binary outcomes!
    mod = XGBRFClassifier()
    X, y = iris_data
    ybin = np.where(y <= 1, 0, 1)
    mod.fit(X, ybin)
    docs = {'name': "XGBRFClassifier test"}
    fv = X[0, :]
    upload(mod, fv, docs)
Example No. 3
def fast_gbtree_classifier(
    X,
    y,
    *,
    learning_rate: float = 1.0,
    n_estimators: int = 100,
    subsample: float = 0.8,
    max_depth: Optional[int] = None,
    reg_alpha: Optional[float] = None,  # L1
    reg_lambda: Optional[float] = 1e-05,  # L2
    gamma: Optional[float] = None,
    missing: Optional[Any] = np.nan,
    objective: Objectives = 'binary:logistic',
    grow_policy: Literal['depthwise', 'lossguide'] = 'depthwise',
    tree_method: Literal['auto', 'exact', 'approx', 'hist',
                         'gpu_hist'] = 'auto',
    importance_type: Literal['gain', 'weight', 'cover', 'total_gain',
                             'total_cover'] = 'gain',
    random_state: int = 1,
    n_jobs: Optional[int] = None,
    framework: Literal['auto', 'xgboost', 'sklearn'] = 'auto',
    **kwargs,
) -> GradientBoostingClassifier:
    """Shared interface for XGBoost and sklearn Gradient Boosting Tree Classifier"""
    kw = dict(locals())
    kwargs = kw.pop('kwargs')
    X = kw.pop('X')
    y = kw.pop('y')
    kw.update(kwargs)
    framework = kw.pop('framework')
    ### XGBOOST
    is_xgboost = False
    if framework == 'sklearn':
        XGB = GradientBoostingClassifier
    else:
        try:
            from xgboost import XGBRFClassifier as XGB
            is_xgboost = True
        except ImportError as e:
            warn('Run `pip install xgboost` to get a significantly '
                 'faster GradientBoostingTree')
            XGB = GradientBoostingClassifier
    ### fine-tune the keywords for sklearn
    if not is_xgboost:
        org = dict(kw)
        spec = inspect.getfullargspec(XGB.__init__)
        kw = dict()
        for k in spec.args + spec.kwonlyargs:
            if k in org:
                kw[k] = org[k]
    ### training
    tree = XGB(**kw)
    tree.fit(X, y)
    return tree
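A short usage sketch (hypothetical; it assumes the snippet's missing pieces — the Objectives alias, inspect, numpy, warnings.warn, typing's Optional/Any/Literal and sklearn's GradientBoostingClassifier — are already defined or imported):

from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
clf = fast_gbtree_classifier(X, y, n_estimators=50)
# returns an XGBRFClassifier when xgboost is installed, otherwise a sklearn GradientBoostingClassifier
print(type(clf).__name__, clf.score(X, y))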
Example No. 4
from numpy import arange
from xgboost import XGBRFClassifier


def get_models():
    models = dict()
    for v in arange(0.1, 1.1, 0.1):
        key = '%.1f' % v
        models[key] = XGBRFClassifier(n_estimators=100,
                                      subsample=0.9,
                                      colsample_bynode=v)
    return models
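A hedged companion sketch that evaluates the models returned by get_models with cross-validation (the synthetic dataset and CV settings are assumptions, not part of the original example):

from numpy import mean, std
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

X, y = make_classification(n_samples=1000, n_features=20, random_state=7)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
for name, model in get_models().items():
    # mean accuracy for each colsample_bynode setting
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))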
Example No. 5
    def _set_surrogate(self, X, y=None):

        if not hasattr(self, "_surrogate"):
            target = type_of_target(y)
            if target == "continuous":
                self._surrogate = XGBRFRegressor(max_depth=7, n_estimators=150)
            elif target in ["binary", "multiclass"]:
                self._surrogate = XGBRFClassifier(max_depth=7,
                                                  n_estimators=150)
            else:
                raise ValueError(
                    "Multioutput and multilabel datasets are not supported.")
Example No. 6
    def xgrfboost_classification(train,
                                 target,
                                 n_estimators=100,
                                 max_depth=8,
                                 random_state=17,
                                 learning_rate=0.1,
                                 colsample_bytree=0.9,
                                 colsample_bynode=0.9,
                                 colsample_bylevel=0.9,
                                 importance_type='split',  # 'split' is LightGBM terminology; XGBoost expects 'gain', 'weight', 'cover', 'total_gain' or 'total_cover'
                                 reg_alpha=2,
                                 reg_lambda=2):
        '''XGRFBoost Classification
           Params :-
           train - training set to fit on
           target - target values to predict
           n_estimators - number of trees (default 100)
           max_depth - maximum depth a tree can grow to (default 8)
           random_state - an arbitrary number so the same results are reproduced on different machines with the same params (default 17)
           learning_rate - size of the step taken towards the local minimum
           colsample_bytree, colsample_bynode, colsample_bylevel - fraction of the features to use by tree, by node, by level
           importance_type - metric used to score feature importance (default 'split')
           reg_alpha, reg_lambda - L1 and L2 regularisation respectively'''

        from xgboost import XGBRFClassifier
        model = XGBRFClassifier(n_estimators=n_estimators,
                                max_depth=max_depth,
                                random_state=random_state,
                                learning_rate=learning_rate,
                                colsample_bytree=colsample_bytree,
                                colsample_bynode=colsample_bynode,
                                colsample_bylevel=colsample_bylevel,
                                importance_type=importance_type,
                                reg_alpha=reg_alpha,
                                reg_lambda=reg_lambda)
        model.fit(train, target)
        print("Training Completed .....")

        return model
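A minimal usage sketch, assuming the helper is available as a standalone function and the target is binary; 'gain' is passed instead of the default because 'split' is LightGBM terminology, while XGBoost expects 'gain', 'weight', 'cover', 'total_gain' or 'total_cover':

from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
clf = xgrfboost_classification(X, y, n_estimators=50, importance_type='gain')
print("train accuracy:", clf.score(X, y))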
Example No. 7
    def train_XGB(self):
        try:
            self.xgb_signal = XGBRegressor()
            self.xgb_coverage = XGBClassifier()
            self.xgb_coverage2 = XGBRFClassifier()
            self.xgb_polluted = XGBClassifier()
            self.xgb_hole = XGBClassifier()
            self.xgb_preds = other_result(
                'xgb', self.xgb_signal, self.xgb_coverage, self.xgb_coverage2,
                self.xgb_polluted, self.xgb_hole, self.df_train, self.df_val,
                self.df_train_per_locs, self.df_val_per_locs,
                self.pollute_happened, self.model_info)
        except Exception:
            traceback.print_exc()
Example No. 8
    def multi_default_models(self, models=None):
        if models:
            ob2 = cl_modeling(self.X_train, self.X_test, self.y_train, self.y_test)
            for model in models:
                print(model)
                ob2.train_predict_model(model)
                print()
        else:
            ob2 = cl_modeling(self.X_train, self.X_test, self.y_train, self.y_test)
            # Import the model libraries
            from sklearn.ensemble import RandomForestClassifier
            from sklearn.linear_model import LogisticRegression
            from sklearn.naive_bayes import GaussianNB
            from sklearn.tree import DecisionTreeClassifier
            from lightgbm import LGBMClassifier
            from xgboost import XGBRFClassifier

            # Setting Models
            nb = GaussianNB()
            rf = RandomForestClassifier(
                criterion="entropy", n_estimators=500, max_depth=6
            )
            lr = LogisticRegression(solver="lbfgs", max_iter=1000)
            dt = DecisionTreeClassifier(max_depth=13, min_samples_leaf=10)
            xgb = XGBRFClassifier(max_depth=10, learning_rate=0.1)
            lgbm_rf = LGBMClassifier(
                boosting_type="rf",
                n_jobs=1,
                bagging_freq=3,
                bagging_fraction=0.3,
                importance_type="gain",
            )
            lgbm_dart = LGBMClassifier(
                boosting_type="dart", n_jobs=1, importance_type="gain"
            )
            lgbm = LGBMClassifier(n_jobs=1, importance_type="gain")

            # Evaluating
            model_list = [nb, lr, dt, rf, xgb, lgbm_rf, lgbm_dart, lgbm]
            for model in model_list:
                print(model)
                ob2.train_predict_model(model)
                print()
            ob2 = cl_modeling(self.X_train, self.X_test, self.y_train, self.y_test)
Example No. 9
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBRFClassifier

x = np.load('./data/x_data.npy')
y = np.load('./data/y_data.npy')
x_pred = np.load('./data/x_pred.npy')

print("x.shape :", x.shape)
print("y.shape :", y.shape)
print("x_pred.shape :", x_pred.shape)

x = x.reshape(x.shape[0], 64 * 64 * 3)

x_pred = x_pred.reshape(x_pred.shape[0], 64 * 64 * 3)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=77,
                                                    shuffle=True)

# model = XGBClassifier()
model = MultiOutputClassifier(XGBRFClassifier())

# 3. Train
model.fit(x_train, y_train)

# 4. Evaluate, predict
acc = model.score(x_test, y_test)

print("acc :", acc)

y_pred = model.predict(x_pred)
Example No. 10
# base learners
# C0=best_classifier
C1 = DecisionTreeClassifier(max_depth=8)
C2 = CatBoostClassifier(verbose=0)
C3 = KNeighborsClassifier()
C4 = BernoulliNB()
C5 = RandomForestClassifier()
C6 = XGBClassifier()
C7 = RidgeClassifier()
C8 = KNeighborsClassifier()
C9 = AdaBoostClassifier()
C10 = MLPClassifier(alpha=1, max_iter=1000)
C11 = RidgeClassifier()
C12 = BaggingClassifier()
C13 = ExtraTreesClassifier()
C14 = XGBRFClassifier()
C15 = GradientBoostingClassifier()
C16 = GaussianNB()
C17 = HistGradientBoostingClassifier()
C18 = KNeighborsClassifier()
C19 = SVC()
C20 = RidgeClassifierCV()
Cm = LogisticRegression(max_iter=3000, C=0.2)
Cm1 = LogisticRegression(max_iter=3000, C=0.4)
Cm2 = LogisticRegression(max_iter=3000, C=0.6)
Cm3 = LogisticRegression(max_iter=3000, C=0.8)
Cm4 = LogisticRegression(max_iter=3000, C=1)
names = [
    'XGBClassifier', 'RidgeClassifier', 'RidgeClassifierCV',
    'HistGradientBoostingClassifier', 'GradientBoostingClassifier',
    'BaggingClassifier', 'ExtraTreesClassifier', 'XGBRFClassifier',
Example No. 11
)
print(f"Before Label Encoder, x['Gender'].unique() = { x['Gender'].unique() }")

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

print(f"\nBefore Standard Scaler, x.head() :- \n{ x.head() }")
x = sc.fit_transform(x)
print(f"\nAfter Standard Scaler, x :- \n{ x }")

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

from xgboost import XGBRFClassifier
xgboost = XGBRFClassifier()

xgboost.fit(x_train, y_train)
y_pred = xgboost.predict(x_test)

print(
    f"xgboost.score( x_test, y_test ) = { xgboost.score( x_test, y_test ) * 100 }%"
)

import matplotlib.pyplot as plt

plt.plot(x_test,
         y_test,
         label='Actual',
         marker='*',
         color='blue',
Example No. 12
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRFRegressor, XGBRFClassifier

#
x, y = load_breast_cancer(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(x,y, train_size=0.8)

model = XGBRFClassifier(n_estimators=1000, learning_rate=0.1)

model.fit(x_train, y_train, verbose=True, eval_metric="error",
                    eval_set = [(x_train, y_train), (x_test, y_test)])

#rmse,mae,logloss,error,auc

results = model.evals_result()
print("eval:", results)

y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred) 
print("acc:", acc)

# import pickle
# pickle.dump(model, open("./model/sample/xgb_save/cancer.pickle.dat", "wb"))

# import joblib
# joblib.dump(model, "./model/sample/xgb_save/cancer.joblib.dat")

model.save_model("./model/sample/xgb_save/cancer.model")
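For completeness, a hedged sketch of reloading the file saved above through the matching load_model API (how completely the sklearn-wrapper attributes are restored depends on the xgboost version and the model format):

model2 = XGBRFClassifier()
model2.load_model("./model/sample/xgb_save/cancer.model")
# recent xgboost releases restore enough state for the wrapper to predict directly
print("reloaded acc:", accuracy_score(y_test, model2.predict(x_test)))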
Example No. 13
    accuracy = accuracy_score(y_train, y_hat)
    print(accuracy)

    return y_hat


# SVM
svm_pred = train_and_test_survived(svm.SVC(kernel='rbf', C=100, gamma=0.01))
#kNN
knn_pred_4 = train_and_test_survived(KNeighborsClassifier(n_neighbors=6))
# Random Forest
rf_pred = train_and_test_survived(
    RandomForestClassifier(n_estimators=400, random_state=14))
# XGBRF Classifier
xgbrf_pred = train_and_test_survived(XGBRFClassifier(n_estimators=100))
# LGBM Classifier
lgbm_pred = train_and_test_survived(
    LGBMClassifier(boosting_type='gbdt',
                   random_state=90,
                   colsample_bytree=0.9,
                   max_depth=5,
                   subsample=0.9,
                   n_estimators=40))

#%%
from xgboost import XGBRFClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
Example No. 14
# negative = train_data[train_data.target == 0]
# positive = train_data[train_data.target == 1]
#
# # downsample majority
# neg_downsampled = resample(negative,
#                            replace=True,  # sample with replacement
#                            n_samples=len(positive),  # match number in minority class
#                            random_state=27)  # reproducible results
# # combine minority and downsampled majority
# downsampled = pd.concat([positive, neg_downsampled]).dropna()
# check new class counts
#
# X_train = pd.DataFrame(downsampled.drop(columns="target"), index=downsampled.index)
# y_train = pd.Series(downsampled["target"], index=downsampled.index)

my_model = XGBRFClassifier(random_state=1).fit(X_train, y_train)

predictions = my_model.predict(X_val)

print("Matthews Correlation Coefficient: " +
      str(matthews_corrcoef(predictions, y_val)))
print("Precision Score: " + str(precision_score(predictions, y_val)))
print("Recall Score: " + str(recall_score(predictions, y_val)))
ROC_curve(y_val, predictions)

X_train_filtered = pd.DataFrame(X_train).iloc[:, [
    173, 141, 530, 683, 661, 498, 48, 183, 206, 716, 697, 185, 211, 624, 671,
    623, 67, 111, 118, 129
]]
X_val_filtered = pd.DataFrame(X_val).iloc[:, [
    173, 141, 530, 683, 661, 498, 48, 183, 206, 716, 697, 185, 211, 624, 671,
Example No. 15
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from xgboost import XGBRFClassifier

df = pd.read_csv("heart_failure_clinical_records_dataset.csv")

t = np.array(list(df['creatinine_phosphokinase'])).reshape(-1, 1)
pt = PowerTransformer(method="yeo-johnson")
creatinine_phosphokinase = pt.fit_transform(t)
df['creatinine_phosphokinase'] = creatinine_phosphokinase

t = np.array(list(df['serum_creatinine'])).reshape(-1, 1)
pt = PowerTransformer(method="yeo-johnson")
serum_creatinine = pt.fit_transform(t)
df['serum_creatinine'] = serum_creatinine

df.drop(columns=['sex', 'diabetes'], inplace=True)
X = df.iloc[:, 0:10].values
Y = df['DEATH_EVENT'].values

x_train, x_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.1,
                                                    random_state=6)

xrclf = XGBRFClassifier()
xrclf.fit(x_train, y_train)

pickle.dump(xrclf, open('xrclf.pkl', 'wb'))

clf = pickle.load(open('xrclf.pkl', 'rb'))
print(clf.score(x_test, y_test))
Example No. 16
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRFRegressor, XGBRFClassifier

#
x, y = load_breast_cancer(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

model = XGBRFClassifier(n_estimators=1000, learning_rate=0.1)

model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric="error",
          eval_set=[(x_train, y_train), (x_test, y_test)])

#rmse,mae,logloss,error,auc

results = model.evals_result()
print("eval:", results)

y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("acc:", acc)

# import pickle
# pickle.dump(model, open("./model/sample/xgb_save/cancer.pickle.dat", "wb"))

import joblib
Example No. 17
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from xgboost import XGBRFClassifier, plot_importance

# Classification model (iris)
iris = load_iris()
x = iris.data
y = iris.target

print(x.shape)  # (150, 4)
print(y.shape)  # (150,)

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size =0.2, shuffle=True)

n_estimators = 3300    # number of trees
learning_rate = 1    # learning rate
colsample_bytree = 0.92  # typically 0.6~0.9 is used
colsample_bylevel = 0.92 # typically 0.6~0.9 is used

max_depth = 6
n_jobs = -1

model = XGBRFClassifier(max_depth=max_depth, learning_rate=learning_rate,
                    n_estimators=n_estimators, n_jobs=n_jobs,
                    colsample_bytree=colsample_bytree,
                    colsample_bylevel=colsample_bylevel)  # no need to remove missing values beforehand

model.fit(x_train, y_train)

score = model.score(x_test, y_test)
print('score: ', score)

plot_importance(model)
# plt.show()
Example No. 18
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from xgboost import XGBRFClassifier

## Data
x, y = load_breast_cancer(return_X_y=True)
print(x.shape)  # (569, 30)
print(y.shape)  # (569,)

## train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=66)

## Modeling
model = XGBRFClassifier(
    n_estimators=300,  # number of trees; one eval log line per round, similar to epochs
    learning_rate=0.1)

model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric=['error', 'auc'],
          eval_set=[(x_train, y_train), (x_test, y_test)])
#   early_stopping_rounds = 100)
# eval_metric options: rmse, mae, logloss, error (an error of 0.2 means an accuracy of 0.8), auc (area under the ROC curve; related to accuracy/precision)

results = model.evals_result()
print("eval's result : ", results)

y_pred = model.predict(x_test)
Example No. 19
)
svm_svc = svm.SVC(C=50,
                  degree=1,
                  gamma="auto",
                  kernel="rbf",
                  probability=True,
                  random_state=RANDOM_STATE)
svm_nu = svm.NuSVC(degree=1,
                   kernel="rbf",
                   nu=0.25,
                   probability=True,
                   random_state=RANDOM_STATE)
mlpc = MLPClassifier(activation="relu",
                     alpha=0.1,
                     hidden_layer_sizes=(10, 10, 10),
                     learning_rate="constant",
                     max_iter=3000,
                     random_state=RANDOM_STATE)

xgboost = XGBClassifier(n_estimators=600,
                        objective='multi:softmax',
                        use_label_encoder=False,
                        nthread=1)
xgforest = XGBRFClassifier(n_estimators=600,
                           objective='multi:softmax',
                           subsample=0.9,
                           colsample_bynode=0.2,
                           use_label_encoder=False)

model_name = PATH_MODEL + 'RandomForestOptimized.sav'
loaded_model = pickle.load(open(model_name, 'rb'))
Example No. 20
# Feature vectorization
dv = DictVectorizer(sparse=False)  # one-hot encode the non-numeric values passed in dict form
train_features = dv.fit_transform(
    train_features.to_dict(orient='records'))  # to_dict converts the frame to dict form
test_features = dv.transform(test_features.to_dict(orient='records'))

# Define the models
# Define the classifiers
classifiers = [
    SVC(random_state=1),
    DecisionTreeClassifier(random_state=1),
    KNeighborsClassifier(),
    LogisticRegression(random_state=1),
    CatBoostClassifier(random_state=1),
    XGBRFClassifier(random_state=1),
    LGBMClassifier(random_state=1)
]

# Define the classifier names
classifiers_names = ['svc', 'dt', 'knn', 'lr', 'cbc', 'xgbfc', 'lgbmc']

# Define the parameters
# classifiers_param=[{'svc_C':[0.1,1,10]},{'dt_min_samples_split':[1,3,5]},{'knn_n_neighbors':[3,5,7]},{'lr_c':[0.1,1,10]},
#                    {'cbc_learning_rate':[0.01,0.05,0.1]},{'xgbfc_learning_rate':[0.01,0.05,0.1]},{'lgbmc_learning_rate':[0.01,0.05,0.1]}]
classifiers_param = [{
    'C': [0.1, 0.5, 1]
}, {
    'criterion': ['gini', 'entropy']
}, {
    'n_neighbors': [1, 2, 3]
Example No. 21
def HyperOptPipeline(algo, n_iter=-1):
    if algo in ['linreg', 'logreg', 'svr', 'svc']:
        ss = StandardScaler()
        mms = MinMaxScaler()
        
    if algo == 'linreg':
        model_linreg = LinearRegression()
        model_lasso = Lasso()
        model_ridge = Ridge()
        model_elasticnet = ElasticNet()
        
        params = [
            {
                'scaler': [ss, mms],
                'estimator': [model_linreg]
            },{
                'scaler': [ss, mms],
                'estimator__alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                'estimator': [model_lasso]
            },{
                'scaler': [ss, mms],
                'estimator__alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                'estimator': [model_ridge]
            },{
                'scaler': [ss, mms],
                'estimator__alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                'estimator__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
                'estimator': [model_elasticnet]
            }
        ]
        
        pipeline = Pipeline([('scaler', ss), ('estimator', model_linreg)])
        
    if algo == 'logreg':
        model_logreg = LogisticRegression(class_weight='balanced', solver='saga', max_iter=100_000)

        params = [
            {
                'scaler': [ss, mms],
                'estimator__penalty': ['l1', 'l2'],
                'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
            },
            {
                'scaler': [ss, mms],
                'estimator__penalty': ['elasticnet'],
                'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                'estimator__l1_ratio': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
            },
            {
                'scaler': [ss, mms],
                'estimator__penalty': ['none'],
            },
        ]
        
        pipeline = Pipeline([('scaler', ss), ('estimator', model_logreg)])

    if algo in ['svc', 'svr']:
        
        model = SVC(class_weight='balanced') if algo == 'svc' else SVR()
        
        params = [
            {
                'scaler': [ss, mms],
                'estimator__kernel': ['linear'],
                'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
            },
            {
                'scaler': [ss, mms],
                'estimator__kernel': ['rbf', 'sigmoid'],
                'estimator__gamma': ['scale', 'auto'],
                'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
            },
            {
                'scaler': [ss, mms],
                'estimator__kernel': ['poly'],
                'estimator__gamma': ['scale', 'auto'],
                'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                'estimator__degree': [2, 3, 4, 5]
            }
        ]
        
        pipeline = Pipeline([('scaler', ss), ('estimator', model)])
        
    if algo in ['ctree', 'rtree']:
        if algo == 'ctree':
            model_rf = RandomForestClassifier(class_weight='balanced')
            model_gb = GradientBoostingClassifier()
            model_et = ExtraTreesClassifier(class_weight='balanced')
            model_xgb = XGBClassifier()
            model_xgbrf = XGBRFClassifier()
            model_cb = CatBoostClassifier(bootstrap_type='Bernoulli')
            model_lgbm = LGBMClassifier(class_weight='balanced')
        else:
            model_rf = RandomForestRegressor()
            model_gb = GradientBoostingRegressor()
            model_et = ExtraTreesRegressor()
            model_xgb = XGBRegressor()
            model_xgbrf = XGBRFRegressor()
            model_cb = CatBoostRegressor(bootstrap_type='Bernoulli')
            model_lgbm = LGBMRegressor()

        params =  [
            {
                'estimator': [model_rf],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__max_depth': [5, 10, 15, 25, 30, None],
                'estimator__min_samples_split': [1.0, 2, 5, 10, 15, 100],
                'estimator__min_samples_leaf': [1, 2, 5, 10],
            },
            {
                'estimator': [model_gb],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
                'estimator__min_samples_split': [1.0, 2, 5, 10, 15, 100],
                'estimator__min_samples_leaf': [1, 2, 5, 10],
            },
            {
                'estimator': [model_et],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__max_depth': [5, 10, 15, 25, 30, None],
                'estimator__min_samples_split': [1.0, 2, 5, 10, 15, 100],
                'estimator__min_samples_leaf': [1, 2, 5, 10],
            },
            {
                'estimator': [model_xgb],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
                'estimator__gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
                'estimator__min_child_weight': [1, 3, 5, 7],
                'estimator__reg_lambda': [0.01, 0.1, 1.0],
                'estimator__reg_alpha': [0, 0.1, 0.5, 1.0],
            },
            {
                'estimator': [model_xgbrf],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
                'estimator__gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
                'estimator__min_child_weight': [1, 3, 5, 7],
                'estimator__reg_lambda': [0.01, 0.1, 1.0],
                'estimator__reg_alpha': [0, 0.1, 0.5, 1.0],
            },
            {
                'estimator': [model_cb],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [3, 5, 7, 9, 12, 15, 16],
                'estimator__reg_lambda': [0.01, 0.1, 1.0],
            },
            {
                'estimator': [model_lgbm],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__min_child_samples': [1, 2, 5, 10, 15, 100],
                'estimator__min_child_weight': [1, 3, 5, 7],
                'estimator__reg_lambda': [0.01, 0.1, 1.0],
                'estimator__reg_alpha': [0, 0.1, 0.5, 1.0],
            } 
        ]  
        
        pipeline = Pipeline([('estimator', model_rf)]) 
    
    n_params = 0        
    for param_dict in params:    
        n = 1
        for v in param_dict.values():
            n *= len(v)
        n_params += n
        
    print(n_params, 'parameter settings identified')
    if n_iter == -1:
        return GridSearchCV(pipeline, params, cv=ShuffleSplit(test_size=0.1, n_splits=1, random_state=19))
    return RandomizedSearchCV(pipeline, params, n_iter=n_iter, cv=ShuffleSplit(test_size=0.1, n_splits=1, random_state=19), random_state=19)
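A brief usage sketch (hypothetical; it assumes the scalers, estimators and search classes the function references are already imported, and uses a synthetic dataset):

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=400, n_features=12, random_state=19)
search = HyperOptPipeline('svc', n_iter=25)  # RandomizedSearchCV over the SVC grids
search.fit(X, y)
print(search.best_params_)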
Example No. 22
                           param_grid=param_grid, 
                           scoring='roc_auc', 
                           cv=5,
                           n_jobs=-2,
                           verbose=2
                          )
gridfit_class1_xgb = grid_search.fit(X_train, y_train)


print('Best CV roc_auc score:', gridfit_class1_xgb.best_score_)
gridfit_class1_xgb.best_params_


# Import classifier, instantiate, and set target variable
from xgboost import XGBRFClassifier
clf = XGBRFClassifier(random_state=0, n_jobs=-1)
y = targets.class2

# Establish estimators to be used with IterativeImputer
estimators = [BayesianRidge(),
              DecisionTreeRegressor(max_features='sqrt', random_state=123),
              ExtraTreesRegressor(max_features='sqrt', n_estimators=10, random_state=123),
              KNeighborsRegressor(n_neighbors=15)]

# Use our wrapper function to compare imputation methods
ax1, ax2 = compare_imputer_scores(X_reduced, y, clf, 'roc_auc', estimators)




# Generate training and holdout (testing) set
Example No. 23
    d = dataset.drop(columns=['PassengerId'])
    if test:
        y = None
        X = d.values
    else:
        y = np.ravel(d[['Survived']].values)
        X = d.drop(columns=['Survived']).values
    X = preprocessing.scale(X)
    return (X, y)


(Xtrain, ytrain) = for_model_input(trainset)
knn_imputer = KNNImputer()
Xtrain = knn_imputer.fit_transform(Xtrain)

boosted_model = XGBRFClassifier()
boosted_model.fit(Xtrain, ytrain)
boosted_scores = cross_val_score(boosted_model, Xtrain, ytrain, cv=5)

print("Gradient-Boosting Model CV scores:\n", boosted_scores,
      np.mean(boosted_scores))

(Xtest, _) = for_model_input(testset, test=True)
Xtest = knn_imputer.transform(Xtest)  # reuse the imputer fitted on the training data
predictions_boosted = boosted_model.predict(Xtest)  # + 1) / 2
predictions_boosted = predictions_boosted.astype('int64')
pred_boosted_df = pandas.DataFrame(predictions_boosted, columns=['Survived'])
fin_ans_boosted = pandas.DataFrame(
    testset['PassengerId']).join(pred_boosted_df)
with open('predictions_xgboost_rf.csv', 'w') as f:
    f.write((fin_ans_boosted.to_csv(index=False)))
Example No. 24
print(x.shape)  # (506, 13)
print(y.shape)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True)

parameters = [{
    'n_estimators': [300, 500, 3300],
    'learning_rate': [0.01, 0.5, 1],
    'colsample_bytree': [0.6, 0.8, 0.9],  # typically 0.6~0.9 is used
    'colsample_bylevel': [0.6, 0.8, 0.9],
    'max_depth': [6, 7, 8]
}]

model = GridSearchCV(XGBRFClassifier(), parameters, cv=5,
                     n_jobs=-1)  # no need to remove missing values beforehand

model.fit(x_train, y_train)

print(model.best_estimator_)
print("==========================================")
print(model.best_params_)
print("==========================================")
score = model.score(x_test, y_test)
print('score: ', score)

# plot_importance(model)
# plt.show()
Example No. 25
# print("r2 Score : %.2f%%" %(r2 * 100))
print("acc : ", acc)

thresholds = np.sort(model.feature_importances_)

print(thresholds)

for thresh in thresholds:  # drop the least important columns one by one
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    selection_x_train = selection.transform(x_train)
    selection_x_test = selection.transform(x_test)

    print(selection_x_train.shape)

    selection_model = XGBRFClassifier(objective="multi:softprob", n_jobs=-1)

    selection_model.fit(selection_x_train,
                        y_train,
                        eval_metric=['merror', 'mlogloss'],
                        eval_set=[(selection_x_train, y_train),
                                  (selection_x_test, y_test)])

    y_pred = selection_model.predict(selection_x_test)

    acc = accuracy_score(y_test, y_pred)
    #print("R2:",r2)
    for i in thresholds:
        pickle.dump(
            model,
            open(
Example No. 26
x_test = scaler.transform(x_test)

# Only these parameters really need tuning
n_estimators = 1000  # The number of trees in the forest.
learning_rate = 1  # learning rate
colsample_bytree = None  # column sample per tree / in practice 0.6 ~ 0.9 is used / 1 is also common
colsample_bylevel = 0.9  # [default: 1]: subsample and colsample_bytree already control how many observations and features each tree uses, so it is debatable whether also setting colsample_bylevel adds anything.
max_depth = 29  # [default: 6]: used to limit overfitting; the right value should come from CV, usually 3-10.
n_jobs = -1

# Use CV
# XGB is very fast; missing values do not have to be removed in preprocessing

model = XGBRFClassifier(max_depth=max_depth,
                        learning_rate=learning_rate,
                        n_estimators=n_estimators,
                        colsample_bylevel=colsample_bylevel,
                        colsample_bytree=colsample_bytree)

model.fit(x_train, y_train)

score = model.score(x_test, y_test)  # score is the equivalent of evaluate
print('score :', score)

# print(model.feature_importances_)
plot_importance(model)
# plt.show()

# XGBRFClassifier score : 0.9666666666666667

# XGBClassifier score : 0.8666666666666667
Example No. 27
def main():
    obj_left = load('rfecv_lefthemisphere_RF.joblib')
    obj_right = load('rfecv_righthemisphere_RF.joblib')

    # Get the data ready
    df_leftHemi_train, df_rightHemi_train, df_test_left, df_test_right = \
        get_csvfile_ready(constants.DATADIR_aparc)

    # Correlation analysis
    df_leftHemi_train_corr = correlation_analysis(df_leftHemi_train)
    df_rightHemi_train_corr = correlation_analysis(df_rightHemi_train)

    selected_left_feats = df_leftHemi_train_corr.columns[np.where(
        obj_left.ranking_ == 1)[0]]
    selected_right_feats = df_rightHemi_train_corr.columns[np.where(
        obj_right.ranking_ == 1)[0]]
    X_left_clean = df_leftHemi_train_corr[selected_left_feats]
    X_right_clean = df_rightHemi_train_corr[selected_right_feats]
    y_left = df_leftHemi_train_corr['labels']
    y_right = df_rightHemi_train_corr['labels']

    rskf = RepeatedStratifiedKFold(n_splits=5,
                                   n_repeats=5,
                                   random_state=372957125)
    left_scores_dict = defaultdict(list)
    right_scores_dict = defaultdict(list)

    for train_index, test_index in rskf.split(X_left_clean, y_left):
        X_left_clean_train, X_left_clean_test = X_left_clean.iloc[
            train_index, :], X_left_clean.iloc[test_index, :]
        X_right_clean_train, X_right_clean_test = X_right_clean.iloc[
            train_index, :], X_right_clean.iloc[test_index, :]
        y_left_train, y_left_test = y_left.iloc[train_index], y_left.iloc[
            test_index]
        y_right_train, y_right_test = y_right.iloc[train_index], y_right.iloc[
            test_index]

        rf_clc_left, rf_clc_right = RandomForestClassifier(
        ), RandomForestClassifier()
        left_scores_dict = get_score_dict(rf_clc_left, X_left_clean_train,
                                          y_left_train, X_left_clean_test,
                                          y_left_test, left_scores_dict, 'rf')
        right_scores_dict = get_score_dict(rf_clc_right, X_right_clean_train,
                                           y_right_train, X_right_clean_test,
                                           y_right_test, right_scores_dict,
                                           'rf')

        rf_clc_left, rf_clc_right = LogisticRegression(), LogisticRegression()
        left_scores_dict = get_score_dict(rf_clc_left, X_left_clean_train,
                                          y_left_train, X_left_clean_test,
                                          y_left_test, left_scores_dict, 'lg')
        right_scores_dict = get_score_dict(rf_clc_right, X_right_clean_train,
                                           y_right_train, X_right_clean_test,
                                           y_right_test, right_scores_dict,
                                           'lg')

        rf_clc_left, rf_clc_right = XGBClassifier(), XGBClassifier()
        left_scores_dict = get_score_dict(rf_clc_left, X_left_clean_train,
                                          y_left_train, X_left_clean_test,
                                          y_left_test, left_scores_dict, 'xgb')
        right_scores_dict = get_score_dict(rf_clc_right, X_right_clean_train,
                                           y_right_train, X_right_clean_test,
                                           y_right_test, right_scores_dict,
                                           'xgb')

        rf_clc_left, rf_clc_right = XGBRFClassifier(), XGBRFClassifier()
        left_scores_dict = get_score_dict(rf_clc_left, X_left_clean_train,
                                          y_left_train, X_left_clean_test,
                                          y_left_test, left_scores_dict,
                                          'xgbrf')
        right_scores_dict = get_score_dict(rf_clc_right, X_right_clean_train,
                                           y_right_train, X_right_clean_test,
                                           y_right_test, right_scores_dict,
                                           'xgbrf')

        rf_clc_left, rf_clc_right = GaussianNB(), GaussianNB()
        left_scores_dict = get_score_dict(rf_clc_left, X_left_clean_train,
                                          y_left_train, X_left_clean_test,
                                          y_left_test, left_scores_dict, 'nb')
        right_scores_dict = get_score_dict(rf_clc_right, X_right_clean_train,
                                           y_right_train, X_right_clean_test,
                                           y_right_test, right_scores_dict,
                                           'nb')

        rf_clc_left, rf_clc_right = SVC(), SVC()
        left_scores_dict = get_score_dict(rf_clc_left, X_left_clean_train,
                                          y_left_train, X_left_clean_test,
                                          y_left_test, left_scores_dict,
                                          'rsvm')
        right_scores_dict = get_score_dict(rf_clc_right, X_right_clean_train,
                                           y_right_train, X_right_clean_test,
                                           y_right_test, right_scores_dict,
                                           'rsvm')

        rf_clc_left, rf_clc_right = LinearSVC(), LinearSVC()
        left_scores_dict = get_score_dict(rf_clc_left, X_left_clean_train,
                                          y_left_train, X_left_clean_test,
                                          y_left_test, left_scores_dict,
                                          'lsvm')
        right_scores_dict = get_score_dict(rf_clc_right, X_right_clean_train,
                                           y_right_train, X_right_clean_test,
                                           y_right_test, right_scores_dict,
                                           'lsvm')