Example #1
def test_deprecate_position_arg():
    from sklearn.datasets import load_digits
    X, y = load_digits(return_X_y=True, n_class=2)
    w = y
    with pytest.warns(FutureWarning):
        xgb.XGBRegressor(3, learning_rate=0.1)
    model = xgb.XGBRegressor(n_estimators=1)
    with pytest.warns(FutureWarning):
        model.fit(X, y, w)

    with pytest.warns(FutureWarning):
        xgb.XGBClassifier(1, use_label_encoder=False)
    model = xgb.XGBClassifier(n_estimators=1, use_label_encoder=False)
    with pytest.warns(FutureWarning):
        model.fit(X, y, w)

    with pytest.warns(FutureWarning):
        xgb.XGBRanker('rank:ndcg', learning_rate=0.1)
    model = xgb.XGBRanker(n_estimators=1)
    group = np.repeat(1, X.shape[0])
    with pytest.warns(FutureWarning):
        model.fit(X, y, group)

    with pytest.warns(FutureWarning):
        xgb.XGBRFRegressor(1, learning_rate=0.1)
    model = xgb.XGBRFRegressor(n_estimators=1)
    with pytest.warns(FutureWarning):
        model.fit(X, y, w)

    with pytest.warns(FutureWarning):
        xgb.XGBRFClassifier(1, use_label_encoder=True)
    model = xgb.XGBRFClassifier(n_estimators=1)
    with pytest.warns(FutureWarning):
        model.fit(X, y, w)
Example #2
def run_training(pred_df, fold):

    train_df = pred_df[pred_df.kfold != fold].reset_index(drop=True)
    valid_df = pred_df[pred_df.kfold == fold].reset_index(drop=True)

    xtrain = train_df[["lr_pred", "lr_cnt_pred", "rf_svd_pred",
                       "gnb_pred"]].values
    xvalid = valid_df[["lr_pred", "lr_cnt_pred", "rf_svd_pred",
                       "gnb_pred"]].values

    clf = xgb.XGBRFClassifier(use_label_encoder=False,
                              base_score=0.5,
                              colsample_bylevel=1,
                              colsample_bytree=1,
                              gamma=0,
                              learning_rate=0.1,
                              max_delta_step=0,
                              max_depth=10,
                              min_child_weight=1,
                              missing=None,
                              n_estimators=100,
                              nthread=-1,
                              objective='binary:logistic',
                              eval_metric='logloss')

    clf.fit(xtrain, train_df.is_duplicate.values)
    preds = clf.predict_proba(xvalid)[:, 1]
    auc = metrics.roc_auc_score(valid_df.is_duplicate.values, preds)
    print(f"{fold}, {auc}")
    valid_df.loc[:, "xgb_pred"] = preds
    return valid_df
Example #3
File: main.py Project: erhant/dasc521
def XGB(train, target, test, rf=True):
    if rf:
        prtstr = "XGBRF Score"
        classifier = xgb.XGBRFClassifier()
    else:
        prtstr = "XGB Score"
        classifier = xgb.XGBClassifier()
    classifier.fit(train, target)
    print(prtstr, classifier.score(train, target))
    prediction = classifier.predict_proba(test)[:, 1]
    return prediction
Example #4
def choose_ml(classifier_name, train_x, train_y, test_x, test_y):
    if classifier_name == "lr":
        print("logistic regression")
        model = LogisticRegression(solver='liblinear', max_iter=1000)

    if classifier_name == "svm":
        print("support vector machine")
        params_grid = [{
            'kernel': ['rbf'],
            'gamma': [1e-3, 1e-4],
            'C': [1, 10, 100, 1000]
        }, {
            'kernel': ['linear'],
            'C': [1, 10, 100, 1000]
        }]
        model = GridSearchCV(SVC(), params_grid, iid=True, cv=5)

    if classifier_name == "dt":
        print("decision tree")
        model = tree.DecisionTreeClassifier()

    if classifier_name == "rf":
        print("random forest")
        model = RandomForestClassifier(n_estimators=20,
                                       max_depth=10,
                                       random_state=42)

    if classifier_name == "ann":
        print("artificianl neural network")
        model = MLPClassifier(activation='logistic',
                              hidden_layer_sizes=(train_x.shape[1],
                                                  train_x.shape[1] + 1, 2),
                              max_iter=500)

    if classifier_name == "nb":
        print("naive bayes")
        model = GaussianNB()

    if classifier_name == "knn":
        print("k  nearest neighbours")
        model = KNeighborsClassifier()

    if classifier_name == "xgb":
        print("xgboost")
        model = xgb.XGBRFClassifier()

    model.fit(train_x, train_y)
    pred_y = model.predict(test_x)
    accuracy = metrics.accuracy_score(test_y, pred_y)
    f1score = metrics.f1_score(test_y, pred_y)
    print("we are here")
    return accuracy, f1score
Example #5
def xgboost_classifier(x_train_tf, train_df, x_test_tfidf):
    """
    classify data by xgboost classifier
    :param x_train_tf: training data represented as counted vector
    :param train_df: the training data
    :param x_test_tfidf: test data represented as counted vector
    :return: predicted labels and positive-class probabilities
    """
    model = xgb.XGBRFClassifier()
    model.fit(x_train_tf, train_df.label)
    predictions = model.predict(x_test_tfidf)
    predictions_proba = model.predict_proba(x_test_tfidf)[:, 1]
    return predictions, predictions_proba
Example #6
def select_model_train_with_vlaid(train_data, train_label, test_data,
                                  test_label):
    random_state = 0
    plt.figure(figsize=(12, 8))
    plt.subplots_adjust(wspace=0.7, hspace=0.5)
    y = train_label.reshape(1, -1)[0]
    y_test = test_label.reshape(1, -1)[0]
    for i in range(len(train_data)):
        X = train_data[i]
        X_test = test_data[i]
        classifiers = []
        classifiers.append(LogisticRegression(random_state=random_state))
        classifiers.append(KNeighborsClassifier())
        classifiers.append(DecisionTreeClassifier(random_state=random_state))
        classifiers.append(SVC(probability=True, random_state=random_state))
        classifiers.append(RandomForestClassifier(random_state=random_state))
        # classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),random_state=random_state))
        classifiers.append(
            GradientBoostingClassifier(random_state=random_state))
        classifiers.append(ExtraTreesClassifier(random_state=random_state))
        classifiers.append(lgb.LGBMClassifier(random_state=random_state))
        classifiers.append(xgb.XGBRFClassifier(random_state=random_state))
        acc = []
        for classifier in classifiers:
            clf = fit_model(classifier, X, y)
            y_pred = clf.predict(X_test)
            acc.append(accuracy_score(y_test, y_pred))
        indexs = [
            'Logistic', 'KNeighbors', 'DecisionTree', 'SVC', 'RandomForest',
            'GradientBoosting', 'ExtraTreeClassifier', 'lightgbm', 'xgbRF'
        ]
        titles = [
            "PSE-PP", "PSE-AAC", "PSE-PSSM", "AVB-PP", "AVB-AAC", "AVB-PSSM",
            "DWT-PP", "DWT-AAC", "DWT-PSSM"
        ]
        data = pd.DataFrame(acc, columns=['Acc'], index=indexs)
        print(titles[i])
        print(data)
        print()

        p = plt.subplot(3, 3, (i + 1))
        p.set_xlim([0, 1])
        p.set_title(titles[i])
        # p.set_ylabel('Model')
        g = sns.barplot(x=data['Acc'], y=data.index, data=data)
    # plt.savefig("./imgs/model_select.jpg")
    plt.show()
Example #7
def trainBDT(X, y, X_val, y_val, param, min_background):
    #Train trees
    evallist = [(X, y), (X_val, y_val)]
    model = xgb.XGBRFClassifier(**param)
    model.fit(X, y.ravel(), eval_set=evallist, verbose=True)

    #Get significance data
    ypred = model.predict(X_val)
    predictions = [round(value) for value in ypred]
    accuracy = accuracy_score(y_val, predictions)
    print("The training accuaracy is: {}".format(accuracy))
    conf_matrix = confusion_matrix(y_val, predictions)
    print("The confusion matrix: {}".format(conf_matrix))
    print("The precision is: {}".format(precision_score(y_val, predictions)))
    plot_BDTScore(X_val.copy(), y_val.copy(), model, min_background)

    return model, predictions
Example #8
def post_train_maxfeat_rf(
    config,
    train_dataloader,
    val_dataloader,
):
    ## Get and preprocess training data to maxfeat
    if is_rank0:
        print("### Get and preprocess training data ###")

    maxfeat_list, pred_list = to_maxfeat_feature(train_dataloader, is_rank0)

    # Gather training data to rank 0
    if hvd != None:
        maxfeat_list = maxfeat_list.tolist()
        pred_list = pred_list.tolist()

        all_maxfeat_list = MPI.COMM_WORLD.gather(maxfeat_list, root=0)
        all_pred_list = MPI.COMM_WORLD.gather(pred_list, root=0)

        if is_rank0:
            all_maxfeat_list = np.concatenate(np.array(all_maxfeat_list),
                                              axis=0)
            all_pred_list = np.concatenate(np.array(all_pred_list), axis=0)

        maxfeat_list = all_maxfeat_list
        pred_list = all_pred_list

    ## Train the post_train model
    if is_rank0:
        print("### Training the post-train model ###")

        post_train_model = xgb.XGBRFClassifier()

        post_train_model.fit(maxfeat_list, pred_list)
        accuracy = post_train_model.score(maxfeat_list, pred_list)
        print("Train Accuracy: {}".format(accuracy))
        with open(config["POST_TRAIN_MODEL_PATH"], "wb") as f:
            pickle.dump(post_train_model, f)
        print("Post-train model saved at {}.".format(
            config["POST_TRAIN_MODEL_PATH"]))
Example #9
 def _init_model(self, **kwargs):
     self._model_name = MODEL_XGB_RF
     self._model = xgb.XGBRFClassifier(**kwargs)
Example #10
File: test_e2e.py Project: goldv/m2cgen
        # XGBoost (tree method "hist")
        regression(xgboost.XGBRegressor(**XGBOOST_HIST_PARAMS),
                   test_fraction=0.2),
        classification(xgboost.XGBClassifier(**XGBOOST_HIST_PARAMS),
                       test_fraction=0.2),
        classification_binary(xgboost.XGBClassifier(**XGBOOST_HIST_PARAMS),
                              test_fraction=0.2),

        # XGBoost (LINEAR)
        regression(xgboost.XGBRegressor(**XGBOOST_PARAMS_LINEAR)),
        classification(xgboost.XGBClassifier(**XGBOOST_PARAMS_LINEAR)),
        classification_binary(xgboost.XGBClassifier(**XGBOOST_PARAMS_LINEAR)),

        # XGBoost (RF)
        regression(xgboost.XGBRFRegressor(**XGBOOST_PARAMS_RF)),
        classification(xgboost.XGBRFClassifier(**XGBOOST_PARAMS_RF)),
        classification_binary(xgboost.XGBRFClassifier(**XGBOOST_PARAMS_RF)),

        # XGBoost (Boosted Random Forests)
        regression(xgboost.XGBRegressor(**XGBOOST_PARAMS_BOOSTED_RF)),
        classification(xgboost.XGBClassifier(**XGBOOST_PARAMS_BOOSTED_RF)),
        classification_binary(
            xgboost.XGBClassifier(**XGBOOST_PARAMS_BOOSTED_RF)),

        # XGBoost (Large Trees)
        regression_random(xgboost.XGBRegressor(**XGBOOST_PARAMS_LARGE)),
        classification_random(xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)),
        classification_binary_random(
            xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)),

        # XGBoost (Huge Trees)
Example #11
    #,(vali_x,vali_y)
    model.fit(train_x1,
              train_y1,
              eval_set=[(train_x1, train_y1), (test_x1, test_y1)],
              eval_metric='auc')
    test['prob'] = test['prob'] + model.predict_proba(test_x)[:, 1]
test['prob'] = test['prob'] / 10
test.rename(columns={'seller_id': 'merchant_id'}, inplace=True)
test[['user_id', 'merchant_id', 'prob']].to_csv('result1.csv', index=False)

# Build a stacking model; the score improved by 0.001
train_x = train[features]
train_y = train['label']
test_x = test[features]
clf1 = xgb.XGBRFClassifier(learning_rate=0.01,
                           n_estimators=1500,
                           random_state=2019)
clf2 = lgb.LGBMClassifier(learning_rate=0.01,
                          n_estimators=1500,
                          random_state=2019)
dtc = DecisionTreeClassifier()
sclf = StackingClassifier(classifiers=[clf1, clf2], meta_classifier=dtc)
for clf, label in zip([clf1, clf2, sclf],
                      ['xgb', 'lgb', 'StackingClassifier']):
    scores = model_selection.cross_val_score(clf,
                                             train_x,
                                             train_y,
                                             cv=10,
                                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
          (scores.mean(), scores.std(), label))
Example #12
########################### xgboost ############################################
##### boosted tree
# xgb = xgboost.XGBClassifier(max_depth=8, scale_pos_weight=9, n_estimators=1000) ##n_estimators=1000, learning_rate=0.05
# xgb.fit(X=train.values.astype(np.float32), y=np.squeeze(train_labels.astype(np.float32)), early_stopping_rounds=20,
#         eval_set=[(val.values.astype(np.float32), val_labels.astype(np.float32))], verbose=True)
#
# # make predictions
# predxgb = xgb.predict(test.values.astype(np.float32))
# xgb_conf_mat = metrics.confusion_matrix(test_labels.astype(np.float32), predxgb) ## tree methods tend to have higher false negative rates than ANN
# print(xgb_conf_mat/np.expand_dims(np.sum(xgb_conf_mat, axis=1), axis=1))

# xgb.feature_importances_

#### random forest
xgbrf = xgboost.XGBRFClassifier(max_depth=8,
                                scale_pos_weight=9,
                                n_estimators=100)
# xgbrf = xgboost.XGBRFClassifier(scale_pos_weight=9)
xgbrf.fit(X=train.values.astype(np.float32),
          y=np.squeeze(train_labels.astype(np.float32)),
          early_stopping_rounds=20,
          eval_set=[(val.values.astype(np.float32),
                     val_labels.astype(np.float32))],
          verbose=True)
predxgbrf = xgbrf.predict(test.values.astype(np.float32))
xgbrf_conf_mat = metrics.confusion_matrix(
    test_labels.astype(np.float32), predxgbrf
)  ## tree methods tend to have higher false negative rates than ANN
print(xgbrf_conf_mat / np.expand_dims(np.sum(xgbrf_conf_mat, axis=1), axis=1))

# xgbrf.feature_importances_
Example #13
df = df.append(gen_df(vcf_reader_nn, flabel_nn), ignore_index=True)
df = df.append(gen_df(vcf_reader_ns, flabel_ns), ignore_index=True)
df = df.append(gen_df(vcf_reader_gn, flabel_gn), ignore_index=True)
df = df.append(gen_df(vcf_reader_gs, flabel_gs), ignore_index=True)
df = df.append(gen_df(vcf_reader_lp, flabel_lp), ignore_index=True)
#print(df)

print('Running classifier...')

# Fitting
X = df[['PRECISE', 'CIPOS', 'CIEND', 'RE', 'SVLEN', 'MAPQ', 'DEPTHPVAL']]
y = df['LABEL']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2)

clf = xgb.XGBRFClassifier(n_estimators=100)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred, pos_label="True"))
print("Recall:", metrics.recall_score(y_test, y_pred, pos_label="True"))
print("F1 score:", metrics.f1_score(y_test, y_pred, pos_label="True"))

callset = [i for i, x in enumerate(y_pred) if x == 'True']
print('Callset:', len(callset))
trueset = [i for i, x in enumerate(y_test) if x == 'True']
print('Trueset:', len(trueset))
intersect = [value for value in callset if value in trueset]
print('Intersect:', len(intersect))
Example #14
    def fit(self, X, y):
        self.model = LassoLarsIC(criterion='aic').fit(X, y)
        return self

    def transform(self, X):
        return np.asarray(X)[:, abs(self.model.coef_) > 0]


scale_pos_weight = Counter(y_train)[0] / Counter(y_train)[1]
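# Note: XGBRFClassifier grows n_estimators trees in a single boosting round
# (it maps to num_parallel_tree) and defaults learning_rate to 1; the non-default
# learning_rate below rescales every tree's leaf contributions.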
clf = xgb.XGBRFClassifier(objective='binary:logistic',
                          scale_pos_weight=scale_pos_weight,
                          learning_rate=0.01,
                          n_estimators=5000,
                          max_depth=10,
                          min_child_weight=1,
                          gamma=0,
                          subsample=0.3,
                          colsample_bytree=0.3,
                          reg_alpha=0.014,
                          nthread=4,
                          seed=27)

PL = Pipeline(steps=[
    ('PreProcessor', StandardScaler()),
    ('PCA', PCA()),
    ('EmbeddedSelector', LASSOJorn()),
    ('clf', CalibratedClassifierCV(base_estimator=clf, method='sigmoid')),
])

#tss = TimeSeriesSplit(n_splits=3)
#optimizer = GridSearchCV(PL, parameters, cv=tss, n_jobs=-1, verbose=10, scoring='roc_auc')
Example #15
groups = pd.DataFrame(list_groups, columns=['Severity'])
groups = groups.set_index(clin.index)
clin = clin.join(groups)

#scaling data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(prot)
X = scaler.transform(prot)
X = pd.DataFrame(X)
X.columns = prot.columns
X = X.set_index(prot.index)

groups['Severity_2'] = groups['Severity'].replace({"Control":0,  "Low":0, "Moderate":0, "Severe": 1, "Critical":1})
y = groups['Severity_2']


#setting xgboost
gbm_param_grid = {'learning_rate': [0.15, 0.20, 0.25, 0.30],
                  'num_boosting_rounds' :[10, 15, 20, 25, 30],
                  'subsample':[0.2, 0.3, 0.5, 0.8, 0.9],
                  'colsample_bytree':[0.2, 0.25, 0.30, 0.35],
                  'max_depth':[2, 3, 5]}

gbm = xgb.XGBRFClassifier()
grid_roc = GridSearchCV(estimator=gbm, param_grid=gbm_param_grid, scoring= "accuracy", cv = 3)
grid_roc.fit(X, y)
print("Best parameters found are", grid_roc.best_params_)
print("best roc score found", grid_roc.best_score_)
Example #16
def get_model():
    """Return model of specified type."""
    model_type = config["adu"]["model"]
    if model_type == "SVC":
        model = SVC()
    elif model_type == "LogisticRegression":
        model = LogisticRegression()
    elif model_type == "RandomForest":
        model = RandomForestClassifier()
    elif model_type == "AdaBoost":
        model = AdaBoostClassifier()
    elif model_type == "XGBoost":
        model = xgb.XGBClassifier()
    elif model_type == "XGBRF":
        model = xgb.XGBRFClassifier()
    elif model_type == "AutoML":
        model = autosklearn.classification.AutoSklearnClassifier(
            resampling_strategy="cv",
            resampling_strategy_arguments={"folds": 10},
            n_jobs=80,
            ensemble_memory_limit=10240,
            ml_memory_limit=30720,
        )
    elif model_type == "Stacking":
        cv_split = StratifiedShuffleSplit(n_splits=config["adu"]["n_splits"],
                                          test_size=0.33)
        rf_param_grid = Grid["RandomForest"]
        log_param_grid = Grid["LogisticRegression"]
        svc_param_grid = Grid["SVC"]
        ada_param_grid = Grid["AdaBoost"]
        xgb_param_grid = Grid["XGBoost"]
        xgbrf_param_grid = Grid["XGBRF"]
        train_method = config["adu"]["train_method"]
        estimator_dict = dict()
        if train_method == "GridSearch":
            estimator_dict["rf"] = GridSearchCV(
                RandomForestClassifier(),
                param_grid=rf_param_grid,
                cv=cv_split,
                refit=True,
            )
            estimator_dict["log"] = GridSearchCV(LogisticRegression(),
                                                 param_grid=log_param_grid,
                                                 cv=cv_split,
                                                 refit=True)
            estimator_dict["svc"] = GridSearchCV(SVC(),
                                                 param_grid=svc_param_grid,
                                                 cv=cv_split,
                                                 refit=True)
            estimator_dict["ada"] = GridSearchCV(AdaBoostClassifier(),
                                                 param_grid=ada_param_grid,
                                                 cv=cv_split,
                                                 refit=True)
            estimator_dict["xgbrf"] = GridSearchCV(
                xgb.XGBRFClassifier(),
                param_grid=xgbrf_param_grid,
                cv=cv_split,
                refit=True,
            )
            estimator_dict["xgb"] = GridSearchCV(xgb.XGBClassifier(),
                                                 param_grid=xgb_param_grid,
                                                 cv=cv_split,
                                                 refit=True)
        elif train_method == "RandomSearch":
            estimator_dict["rf"] = RandomizedSearchCV(
                RandomForestClassifier(),
                param_distributions=rf_param_grid,
                cv=cv_split,
                refit=True,
            )
            estimator_dict["log"] = RandomizedSearchCV(
                LogisticRegression(),
                param_distributions=log_param_grid,
                cv=cv_split,
                refit=True,
            )
            estimator_dict["svc"] = RandomizedSearchCV(
                SVC(),
                param_distributions=svc_param_grid,
                cv=cv_split,
                refit=True)
            estimator_dict["ada"] = RandomizedSearchCV(
                AdaBoostClassifier(),
                param_distributions=ada_param_grid,
                cv=cv_split,
                refit=True,
            )
            estimator_dict["xgbrf"] = RandomizedSearchCV(
                xgb.XGBRFClassifier(),
                param_distributions=xgbrf_param_grid,
                cv=cv_split,
                refit=True,
            )
            estimator_dict["xgb"] = RandomizedSearchCV(
                xgb.XGBClassifier(),
                param_distributions=xgb_param_grid,
                cv=cv_split,
                refit=True,
            )

        stacks = config["adu"]["stacking"]["estimator_stack"]
        final_est = estimator_dict[config["adu"]["stacking"]
                                   ["final_estimator"]]
        passth = config["adu"]["stacking"]["passthrough"]
        single_layer = []
        for i, m in enumerate(stacks):
            if isinstance(m, list):
                sublayer = [(mo + str(i + j), estimator_dict[mo])
                            for j, mo in enumerate(m)]
                layer = StackingClassifier(
                    estimators=sublayer,
                    final_estimator=final_est,
                    n_jobs=-1,
                    passthrough=passth,
                    verbose=0,
                )
                final_est = layer
            else:
                single_layer.append((m + str(i), estimator_dict[m]))

        if len(single_layer) > 0:
            model = StackingClassifier(
                estimators=single_layer,
                final_estimator=final_est,
                n_jobs=-1,
                passthrough=passth,
                verbose=0,
            )
        else:
            model = layer
    else:
        print("Invalid model option")
        exit(1)
    return model
Example #17
from sklearn import ensemble
import xgboost as xgb
from sklearn import linear_model 

#ML MODELS
MODELS = {
    "randomforest": ensemble.RandomForestClassifier(
        n_estimators=200,
        n_jobs=-1,
        verbose=2),
    "extratrees": ensemble.ExtraTreesClassifier(
        n_estimators=200,
        n_jobs=-1,
        verbose=2),

    "xgboost": xgb.XGBRFClassifier(verbosity=2, 
        max_depth=4, 
        n_estimators=200, 
        n_jobs=-1),

    "logreg": linear_model.LogisticRegression(
       n_jobs= -1 
    )

}

#deep learning models:

DL_MODELS = {
    
}
Example #18
train_x = df.iloc[train_ind, :]
train_y = labels[train_ind, :]
val_x = df.iloc[val_ind, :]
val_y = labels[val_ind, :]
test_x = df.iloc[test_ind, :]
test_y = labels[test_ind, :]

xgb = xgboost.XGBRFClassifier(
    learning_rate=0.05,
    max_depth=8,
    scale_pos_weight=10,
    n_estimators=100,
    n_jobs=8,
    nthread=-1,
    subsample=.6,
    verbosity=1,
    colsample_bylevel=.9,
    colsample_bynode=.9,
    colsample_bytree=.9,
    gamma=1,
    base_score=.5,
    min_child_weight=1,
    max_delta_step=10)  ##n_estimators=1000, learning_rate=0.05
xgb.fit(X=train_x.values.astype(np.float32),
        y=np.squeeze(train_y.astype(np.float32)),
        early_stopping_rounds=50,
        eval_set=[(val_x.values.astype(np.float32), val_y.astype(np.float32))],
        verbose=True)

# make predictions
predxgb = xgb.predict(test_x.values.astype(np.float32))
Example #19
    train.drop(TARGET_COL,
               axis=1).columns.get_loc('apache_4a_hospital_death_prob'))
print(
    train.drop(TARGET_COL, axis=1).columns.get_loc('apache_4a_icu_death_prob'))
print(
    train.drop(TARGET_COL,
               axis=1).columns.get_loc('apache_hospital_minus_apache_icu'))
print(
    train.drop(TARGET_COL,
               axis=1).columns.get_loc('apache_icu_div_apache_hospital'))
print(train.drop(TARGET_COL, axis=1).columns.get_loc('age'))
print(train.drop(TARGET_COL, axis=1).columns.get_loc('ventilated_apache'))

models = [[
    xgboost.XGBRFClassifier(n_estimators=300,
                            max_depth=50,
                            tree_method="gpu_hist",
                            verbose=10),
    xgboost.XGBRFClassifier(n_estimators=2000,
                            max_depth=12,
                            tree_method="gpu_hist",
                            n_jobs=1),
    SelectFromModel(
        CatBoostClassifier(iterations=2200,
                           depth=10,
                           objective="Logloss",
                           nan_mode="Max",
                           verbose=1000,
                           task_type="GPU")),
    SelectFromModel(
        xgboost.XGBClassifier(n_estimators=3000,
                              eta=0.02,
Example #20
columns_in_train = df_train.columns
ct = make_column_transformer((OneHotEncoder(handle_unknown='ignore'),
                              make_column_selector(dtype_include=object)),
                             remainder='passthrough')
X = ct.fit_transform(df_train)
label_encoder = LabelEncoder()

y = label_encoder.fit_transform(y)
print(X.shape, len(X), len(y))

# Model selection
print("\n")
print("#" * 30)
print('Model selection:')
clf = xgb.XGBRFClassifier()

clf.fit(X, y)

# Model application
print("\n")
print("#" * 30)
print('Model Application with missing values:')
df_test = pd.read_csv("./data/bank_marketing_test.csv")
df_test = df_test.astype(object).replace(to_replace={"unknown": np.nan})
scores = pd.Series(index=df_test.index, dtype=np.float64)
vld_index = df_test.index[df_test[cat_columns].notnull().all(axis=1)]
df_test_val = df_test.dropna(subset=cat_columns, inplace=False)
assert len(vld_index) == len(df_test_val)

df_test_val = df_test_val[columns_in_train]
Example #21
                         groups=uuid_groups)

    dump(clf.get_params(), "params_separated_lr.joblib")
    param_dict = load("params_separated_lr.joblib")
    print(param_dict)
    clf.fit(X_train_clean, y_train)
    y_pred = clf.predict(X_test_clean)
    print(
        "Balanced accuracy LR: ",
        balanced_accuracy_score(y_test.T,
                                y_pred,
                                average="macro",
                                zero_default=0))

    rf_clf = xgb.XGBRFClassifier(max_depth=12,
                                 n_estimators=200,
                                 tree_method="gpu_hist",
                                 objective="binary:logistic")

    clf = FlexOneVsRestClassifier(rf_clf, n_estimators=y_train.shape[1])
    bounds = {"max_depth": (8, 15), "colsample_bynode": (0.5, 0.9)}
    clf.tune_hyperparams(X=X_train,
                         y=y_train,
                         bounds=bounds,
                         metric=single_balanced_accuracy_score,
                         init_points=6,
                         n_iter=9,
                         int_params=["max_depth"],
                         groups=uuid_groups)

    dump(clf.get_params(), "params_separated_rf.joblib")
    param_dict = load("params_separated_rf.joblib")
Example #22
                                     ('GNB', model4)],
                         voting='soft')
eclf3 = VotingClassifier(estimators=[('dt', model1), ('lr', model3),
                                     ('GNB', model4)],
                         voting='soft')
eclf4 = VotingClassifier(estimators=[('knn', model2), ('lr', model3),
                                     ('GNB', model4)],
                         voting='soft')

#XGBoost
xgb_model1 = xgb.XGBClassifier(objective="binary:logistic",
                               seed=42,
                               learning_rate=0.01)

xgb_model2 = xgb.XGBRFClassifier(n_estimators=100,
                                 subsample=0.9,
                                 colsample_bynode=0.2)

gb_model = GradientBoostingClassifier(random_state=34)

catboost_model = CatBoostClassifier()

#Stacking Different Models and using logistic regression as a meta classifier
clf1 = KNeighborsClassifier()
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
clf4 = LogisticRegression(random_state=1, max_iter=300)
lr = lr_model

sclf1 = StackingClassifier(classifiers=[clf1, clf2, clf3],
                           use_probas=True,
Example #23
def test_evaluation_metric():
    from sklearn.datasets import load_diabetes, load_digits
    from sklearn.metrics import mean_absolute_error
    X, y = load_diabetes(return_X_y=True)
    n_estimators = 16

    with tm.captured_output() as (out, err):
        reg = xgb.XGBRegressor(
            tree_method="hist",
            eval_metric=mean_absolute_error,
            n_estimators=n_estimators,
        )
        reg.fit(X, y, eval_set=[(X, y)])
        lines = out.getvalue().strip().split('\n')

    assert len(lines) == n_estimators
    for line in lines:
        assert line.find("mean_absolute_error") != -1

    def metric(predt: np.ndarray, Xy: xgb.DMatrix):
        y = Xy.get_label()
        return "m", np.abs(predt - y).sum()

    with pytest.warns(UserWarning):
        reg = xgb.XGBRegressor(
            tree_method="hist",
            n_estimators=1,
        )
        reg.fit(X, y, eval_set=[(X, y)], eval_metric=metric)

    def merror(y_true: np.ndarray, predt: np.ndarray):
        n_samples = y_true.shape[0]
        assert n_samples == predt.size
        errors = np.zeros(y_true.shape[0])
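        # NB: the comparison below uses the enclosing-scope `y` (rebound to the
        # digits labels later in this test), which holds the same values as
        # `y_true` when the metric is actually called.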
        errors[y != predt] = 1.0
        return np.sum(errors) / n_samples

    X, y = load_digits(n_class=10, return_X_y=True)

    clf = xgb.XGBClassifier(use_label_encoder=False,
                            tree_method="hist",
                            eval_metric=merror,
                            n_estimators=16,
                            objective="multi:softmax")
    clf.fit(X, y, eval_set=[(X, y)])
    custom = clf.evals_result()

    clf = xgb.XGBClassifier(use_label_encoder=False,
                            tree_method="hist",
                            eval_metric="merror",
                            n_estimators=16,
                            objective="multi:softmax")
    clf.fit(X, y, eval_set=[(X, y)])
    internal = clf.evals_result()

    np.testing.assert_allclose(custom["validation_0"]["merror"],
                               internal["validation_0"]["merror"],
                               atol=1e-6)

    clf = xgb.XGBRFClassifier(
        use_label_encoder=False,
        tree_method="hist",
        n_estimators=16,
        objective=tm.softprob_obj(10),
        eval_metric=merror,
    )
    with pytest.raises(AssertionError):
        # shape check inside the `merror` function
        clf.fit(X, y, eval_set=[(X, y)])
Example #24
from sklearn import ensemble
import xgboost as xgb
from sklearn import linear_model

MODELS = {
    "randomforest_classifier":
    ensemble.RandomForestClassifier(n_estimators=200, n_jobs=-1, verbose=2),
    "randomforest_regressor":
    ensemble.RandomForestRegressor(n_estimators=200, n_jobs=-1, verbose=2),
    "xgb_classifier":
    xgb.XGBRFClassifier(
        learning_rate=1,
        subsample=0.9,
    ),
    "xgb_regressor":
    xgb.XGBRFRegressor(learning_rate=1, subsample=0.9),
    "logistic_regressor":
    linear_model.LogisticRegression(
        penalty='elasticnet',
        fit_intercept=True,
        class_weight='balanced',
        random_state=42,
        solver='saga',
        verbose=2,
        n_jobs=-1,
    )

    #TODO: add more models here
}
Example #25
# xgb.XGBRFClassifier(learning_rate=1,)
# For this case, we want objective='binary:hinge'
# Relevant parameters
#   n_estimators = number of trees in random forest to fit(RF only)
#   max_depth = maximum tree depth for base learners
#   learning_rate = (float) "eta" in xgb, boosted learning rate
#   objective = learning task and objective, or custom objective function
#   booster = 'gbtree', 'gblinear', or 'dart'
#   tree_method = ??? (leave as default for now)
#   n_threads = number of threads to use
#   gamma = (float) minimum loss reduction required to make a further partition
#   min_child_weight = minimum sum of instance weight needed in a child
#   missing = value to treat as missing
#   num_parallel_tree = used for random forest

xgb_rf_model = xgb.XGBRFClassifier(objective="binary:hinge", missing=-1)
xgb_param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [4, 6, 8, 10, 12],
    "learning_rate": [0.2, 0.3, 0.5, 0.75, 1, 1.25, 1.5, 2],
    "num_parallel_tree": [50, 100, 200],
    "gamma": [0, 0.1, 0.25],
    "min_child_weight": [0.5, 1, 2]
}

xgb_grid_rf_clf = model_selection.GridSearchCV(xgb_rf_model,
                                               xgb_param_grid,
                                               n_jobs=4)
#xgb_grid_rf_clf = xgb_rf_model
xgb_grid_rf_clf.fit(xgb_x_train, xgb_y_train)
Example #26
def test_xgb_base_module(root_client: sy.VirtualMachineClient) -> None:

    sy.load("xgboost")
    sy.load("numpy")

    # third party
    import numpy as np
    import xgboost as xgb

    xgb_remote = root_client.xgboost

    # import xgboost as xgb

    X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    y = np.array([0, 0, 1, 1])

    param = {"eta": 0.3, "max_depth": 3, "num_class": 3}

    steps = 20

    D_train = xgb.DMatrix(X, label=y)
    model = xgb.train(param, D_train, steps)
    preds = model.predict(D_train)

    D_train = xgb_remote.DMatrix(X, label=y)
    model = xgb_remote.train(param, D_train, steps)
    preds_remote = model.predict(D_train).get()

    classifier = xgb_remote.XGBClassifier(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, use_label_encoder=False
    )

    classifier.fit(X, y)
    y_pred_classifier_remote = classifier.predict(X).get()

    classifier = xgb.XGBClassifier(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, use_label_encoder=False
    )

    classifier.fit(X, y)
    y_pred_classifier = classifier.predict(X)

    classifier = xgb_remote.XGBRFClassifier(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, use_label_encoder=False
    )

    classifier.fit(X, y)
    y_pred_classifier_rf_remote = classifier.predict(X).get()

    classifier = xgb.XGBRFClassifier(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, use_label_encoder=False
    )

    classifier.fit(X, y)
    y_pred_classifier_rf = classifier.predict(X)

    regressor = xgb.XGBRegressor(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3)
    regressor.fit(X, y)
    y_pred_regressor = regressor.predict(X)

    regressor = xgb_remote.XGBRegressor(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3
    )
    regressor.fit(X, y)
    y_pred_regressor_remote = regressor.predict(X).get()

    regressor = xgb.XGBRFRegressor(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3)
    regressor.fit(X, y)
    y_pred_regressor_rf = regressor.predict(X)

    regressor = xgb_remote.XGBRFRegressor(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3
    )
    regressor.fit(X, y)
    y_pred_regressor_rf_remote = regressor.predict(X).get()

    assert np.array_equal(y_pred_classifier_rf, y_pred_classifier_rf_remote)
    assert np.array_equal(y_pred_regressor_rf, y_pred_regressor_rf_remote)
    assert np.array_equal(y_pred_regressor, y_pred_regressor_remote)
    assert np.array_equal(y_pred_classifier, y_pred_classifier_remote)
    assert np.array_equal(preds_remote, preds)
Example #27
def run_training(fold_):
    total_roc = []
    total_conf = []

    t0 = time.time()
    #df = pd.read_csv("../input/embedded_train_tiny_folds.csv")
    df = pd.read_hdf(path_or_buf="../input/tiny_data/full_data_folds.h5",
                     key='dataset')
    #print("tg\n",df.target.value_counts())
    #print(" ")
    t1 = time.time()
    total_time = t1 - t0
    print("time to read file", total_time)

    print(f"fold: {fold_}")

    t0 = time.time()

    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    #    print("train shape\n", train_df.shape)
    #   print("test shape\n", test_df.shape)

    #features
    xtrain = train_df.drop(["kfold", "target"], axis=1)
    xtest = test_df.drop(["kfold", "target"], axis=1)
    # Standard scaler

    #sc = StandardScaler()
    #sc.fit(xtrain)

    #xtrain = sc.transform(xtrain)
    #xtest = sc.transform(xtest)

    # target
    # First make the target binary
    train_df.target = train_df.target.apply(lambda x: 'open'
                                            if x == 'open' else 'closed')

    test_df.target = test_df.target.apply(lambda x: 'open'
                                          if x == 'open' else 'closed')
    # Encode labels
    le = preprocessing.LabelEncoder()
    le.fit(train_df.target)
    #print(le.classes_)
    ytrain = le.transform(train_df.target)

    ytest = le.transform(test_df.target)

    print("now do SMOTE")
    # define pipeline
    #over = RandomOverSampler(
    #    sampling_strategy=0.032,
    #    random_state=0)
    over = SMOTE(sampling_strategy=0.8, n_jobs=-1)
    under = RandomUnderSampler(sampling_strategy=0.9)

    steps = [('o', over), ('u', under)]

    pipeline = Pipeline(steps=steps)
    #transform the dataset
    X_res, y_res = pipeline.fit_resample(xtrain, ytrain)
    #X_res, y_res =xtrain, ytrain
    print("Before sampling %s" % Counter(ytrain))
    print('Resampled dataset shape %s' % Counter(y_res))

    #model

    model = xgb.XGBRFClassifier(use_label_encoder=False,
                                scale_pos_weight=0.9,
                                n_estimators=70,
                                max_depth=6,
                                n_jobs=-1,
                                subsample=0.4,
                                num_parallel_tree=20,
                                eval_metric='logloss',
                                tree_method='auto',
                                objective='reg:logistic',
                                gamma=.1,
                                min_child_weight=6,
                                booster='dart',
                                eta=0.8)
    #fit the model on training data
    model.fit(X_res, y_res)
    # make predictions
    preds = model.predict(xtest)
    preds_proba = model.predict_proba(xtest)[:, 1]
    # print('preds shape',preds_proba.shape)

    t1 = time.time()
    total_time = t1 - t0
    print('time to fit model:', total_time)

    accuracy_score = np.sum(preds == ytest) / len(ytest)
    #log_loss= metrics.log_loss(train_df.OpenStatus,preds)

    #print(f"Fold:{fold_}")
    #print(f"Accuracy={accuracy_score}")
    conf_m = confusion_matrix(ytest, preds)
    #print('Confusion matrix\n',conf_m)
    roc_score = roc_auc_score(ytest, preds_proba)
    print('ROC AUC score\n', roc_score)
    t = [fold_, roc_score]
    total_conf.append(conf_m)
    total_roc.append(t)
    test_df.loc[:, "xgb_pred_n"] = preds_proba
    print('Confusion matrix\n', confusion_matrix(ytest, preds))

    return test_df[["id", "target", "kfold", "xgb_pred_n"]], np.mean(total_roc,
                                                                     axis=0)[1]