def test_invalid_classification_loss():
    binary_clf = HistGradientBoostingClassifier(loss="binary_crossentropy")
    err_msg = ("loss='binary_crossentropy' is not defined for multiclass "
               "classification with n_classes=3, use "
               "loss='categorical_crossentropy' instead")
    with pytest.raises(ValueError, match=err_msg):
        binary_clf.fit(np.zeros(shape=(3, 2)), np.arange(3))
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                               n_informative=5, n_redundant=0, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
def test_early_stopping_classification(data, scoring, validation_fraction,
                                       n_iter_no_change, tol):

    max_iter = 50

    X, y = data

    gb = HistGradientBoostingClassifier(
        verbose=1,  # just for coverage
        min_samples_leaf=5,  # easier to overfit fast
        scoring=scoring,
        tol=tol,
        validation_fraction=validation_fraction,
        max_iter=max_iter,
        n_iter_no_change=n_iter_no_change,
        random_state=0
    )
    gb.fit(X, y)

    if n_iter_no_change is not None:
        assert n_iter_no_change <= gb.n_iter_ < max_iter
    else:
        assert gb.n_iter_ == max_iter
def test_binning_train_validation_are_separated():
    # Make sure training and validation data are binned separately.
    # See issue 13926

    rng = np.random.RandomState(0)
    validation_fraction = .2
    gb = HistGradientBoostingClassifier(
        n_iter_no_change=5,
        validation_fraction=validation_fraction,
        random_state=rng
    )
    gb.fit(X_classification, y_classification)
    mapper_training_data = gb.bin_mapper_

    # Note that since the data is small there is no subsampling and the
    # random_state doesn't matter
    mapper_whole_data = _BinMapper(random_state=0)
    mapper_whole_data.fit(X_classification)

    n_samples = X_classification.shape[0]
    assert np.all(mapper_training_data.actual_n_bins_ ==
                  int((1 - validation_fraction) * n_samples))
    assert np.all(mapper_training_data.actual_n_bins_ !=
                  mapper_whole_data.actual_n_bins_)
Example #5
# -*- coding: utf-8 -*-
"""
Created on Sun May  3 03:14:13 2020

@author: Jie.Hu
"""


''' 9: Hist Gradient Boosting'''
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingClassifier
clf_hgb = HistGradientBoostingClassifier(validation_fraction=0.2,
                                         n_iter_no_change=20, 
                                         tol=0.001,
                                         random_state= 1337)

from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1337)
acc = cross_val_score(estimator=clf_hgb, X=X_train, y=y_train, cv=cv, scoring='roc_auc')
acc.mean(), acc.std()

# KF & GS
#gmean_scorer = make_scorer(geometric_mean_score, greater_is_better=True)

parameters = {'learning_rate':[0.001, 0.01, 0.1], 
              'max_depth':[3,5,7],
              'max_leaf_nodes':[11,21,31],
              'min_samples_leaf':[1,3,5],
              'max_iter':[100,200,400],
              'l2_regularization':[0.001,0.01,0.1]}
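# A hypothetical sketch of how the grid above could be searched. GridSearchCV
# and the reuse of clf_hgb, cv, X_train and y_train are assumptions here; the
# original snippet stops at the parameter dictionary.
from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(estimator=clf_hgb,
                  param_grid=parameters,
                  scoring='roc_auc',
                  cv=cv,
                  n_jobs=-1)
gs.fit(X_train, y_train)
print(gs.best_params_, gs.best_score_)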

                                       
data = np.ascontiguousarray(df.values[:, 1:])
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=.2, random_state=0)

if subsample is not None:
    data_train, target_train = data_train[:subsample], target_train[:subsample]

n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records with {n_features} features.")

print("Fitting a sklearn model...")
tic = time()
est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                     learning_rate=lr,
                                     max_iter=n_trees,
                                     max_bins=max_bins,
                                     max_leaf_nodes=n_leaf_nodes,
                                     n_iter_no_change=None,
                                     random_state=0,
                                     verbose=1)
est.fit(data_train, target_train)
toc = time()
predicted_test = est.predict(data_test)
predicted_proba_test = est.predict_proba(data_test)
roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
acc = accuracy_score(target_test, predicted_test)
print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")

if args.lightgbm:
    print("Fitting a LightGBM model...")
    tic = time()
    lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
def test_should_stop(scores, n_iter_no_change, tol, stopping):

    gbdt = HistGradientBoostingClassifier(
        n_iter_no_change=n_iter_no_change, tol=tol
    )
    assert gbdt._should_stop(scores) == stopping
# %%
# Native support for missing values for gradient boosting
# -------------------------------------------------------
#
# The :class:`ensemble.HistGradientBoostingClassifier`
# and :class:`ensemble.HistGradientBoostingRegressor` now have native
# support for missing values (NaNs). This means that there is no need for
# imputing data when training or predicting.

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier

X = np.array([0, 1, 2, np.nan]).reshape(-1, 1)
y = [0, 0, 1, 1]

gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y)
print(gbdt.predict(X))

# %%
# Precomputed sparse nearest neighbors graph
# ------------------------------------------
# Most estimators based on nearest neighbors graphs now accept precomputed
# sparse graphs as input, to reuse the same graph for multiple estimator fits.
# To use this feature in a pipeline, one can use the `memory` parameter, along
# with one of the two new transformers,
# :class:`neighbors.KNeighborsTransformer` and
# :class:`neighbors.RadiusNeighborsTransformer`. The precomputation
# can also be performed by custom estimators to use alternative
# implementations, such as approximate nearest neighbors methods.
# See more details in the :ref:`User Guide <neighbors_transformer>`.
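
# A short illustrative sketch of this pattern, modelled on the scikit-learn
# 0.22 release highlights; Isomap is just one possible consumer of the
# precomputed graph and is an assumption here, not part of the original text.

from tempfile import TemporaryDirectory
from sklearn.datasets import make_classification
from sklearn.manifold import Isomap
from sklearn.neighbors import KNeighborsTransformer
from sklearn.pipeline import make_pipeline

X_graph, _ = make_classification(random_state=0)

with TemporaryDirectory(prefix="sklearn_graph_cache_") as tmpdir:
    estimator = make_pipeline(
        KNeighborsTransformer(n_neighbors=10, mode="distance"),
        Isomap(n_neighbors=10, metric="precomputed"),
        memory=tmpdir,
    )
    estimator.fit(X_graph)
    # Decreasing Isomap's n_neighbors reuses the cached neighbors graph
    # instead of recomputing it.
    estimator.set_params(isomap__n_neighbors=5)
    estimator.fit(X_graph)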
Example #9
    'race', 'native-country', 'sex'
]

categories = [data[column].unique() for column in data[categorical_columns]]

categorical_preprocessor = OrdinalEncoder(categories=categories)

preprocessor = ColumnTransformer(
    [('cat-preprocessor', categorical_preprocessor, categorical_columns)],
    remainder='passthrough',
    sparse_threshold=0)

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import make_pipeline

model = make_pipeline(preprocessor,
                      HistGradientBoostingClassifier(random_state=42))

# %% [markdown]
# TODO: write your solution here
#
# Use the previously defined model (called `model`) and, using two nested `for`
# loops, search for the best combination of the `learning_rate` and
# `max_leaf_nodes` parameters. To do so, you will need to set the parameters,
# then train and evaluate the model. The evaluation should be performed with
# `cross_val_score`. We propose the following parameter search (see the sketch
# after this list):
# - `learning_rate` for the values 0.01, 0.1, and 1;
# - `max_leaf_nodes` for the values 5, 25, 45.
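# %% [markdown]
# Below is a minimal sketch of one possible solution (not the course's
# reference answer). It assumes the training data is available as `data_train`
# and `target_train`; these names are hypothetical and not defined in the
# excerpt above.

# %%
from sklearn.model_selection import cross_val_score

best_score = -float("inf")
best_params = None
for learning_rate in (0.01, 0.1, 1):
    for max_leaf_nodes in (5, 25, 45):
        # set_params uses the step name given by make_pipeline
        model.set_params(
            histgradientboostingclassifier__learning_rate=learning_rate,
            histgradientboostingclassifier__max_leaf_nodes=max_leaf_nodes,
        )
        scores = cross_val_score(model, data_train, target_train, cv=2)
        print(f"lr={learning_rate}, max_leaf_nodes={max_leaf_nodes}: "
              f"{scores.mean():.3f} +/- {scores.std():.3f}")
        if scores.mean() > best_score:
            best_score = scores.mean()
            best_params = {"learning_rate": learning_rate,
                           "max_leaf_nodes": max_leaf_nodes}
print(f"Best params: {best_params} (mean CV accuracy {best_score:.3f})")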
x = np.concatenate([f_ds, m_ds], 0)
x = x.reshape(x.shape[0], x.shape[1]*x.shape[2])
y = np.concatenate([f_lb, m_lb], 0)
print(x.shape)  # (2141, 110336)
print(y.shape)  # (2141,)

# Preprocessing
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True, test_size=0.2, random_state=42)
print(x_train.shape)    # (1712, 110336)
print(x_test.shape)     # (429, 110336)
print(y_train.shape)    # (1712,)
print(y_test.shape)     # (429,)

# Build the model
model = HistGradientBoostingClassifier(verbose=1)
model.fit(x_train, y_train)

# model & weight save
# pickle.dump(model, open('E:/nmb/nmb_data/cp/m03_mels_HistGradientBoostingClassifier.data', 'wb')) # wb : write
# print("== save complete ==")

# model load
model = pickle.load(open('E:/nmb/nmb_data/cp/m03_mels_HistGradientBoostingClassifier.data', 'rb'))  # rb : read
# time >>  0:30:49.704071

# evaluate
y_pred = model.predict(x_test)
# print(y_pred[:100])
# print(y_pred[100:])
Example #11
def ClassificationModelDictionary():
    LR = dict(name='LogisticRegression',
              model=LogisticRegression(),
              parameters={
                  "penalty": ['l1', 'l2'],
                  'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
              },
              best_parameters={},
              cv_params={
                  'penalty': ['l1', 'l2'],
                  'random_state': [0, 8]
              })
    DT = dict(name='DecisionTreeClassifier',
              model=DecisionTreeClassifier(),
              parameters={
                  'criterion': ['gini', 'entropy'],
                  'splitter': ['best', 'random'],
                  'max_depth': [None, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                  'max_features': ['auto', 'log2', None],
                  'random_state': [8],
                  'min_samples_leaf': [1, 2, 3, 4, 5]
              },
              best_parameters={},
              cv_params={
                  'criterion': ['gini', 'entropy'],
                  'splitter': ['best'],
                  'max_features': ['auto', 'log2', None],
                  'random_state': [0, 8]
              })

    KNN = dict(
        name='KNeighborsClassifier',
        model=KNeighborsClassifier(),
        parameters={
            'n_neighbors': [i for i in range(1, 25)],
            'p': [1, 2]
        },  # p: 1 = Manhattan distance, 2 = Euclidean distance
        best_parameters={},
        cv_params={
            'n_neighbors': [i for i in range(1, 25)],
            'p': [1, 2]
        })

    GNB = dict(name='GaussianNB',
               model=GaussianNB(),
               parameters={
                   'priors': [
                       None,
                   ],
                   'var_smoothing': [
                       1e-09,
                   ]
               },
               best_parameters={},
               cv_params={
                   'priors': [None],
                   'var_smoothing': [1e-09]
               })
    BNB = dict(name='BernoulliNB',
               model=BernoulliNB(),
               parameters={
                   'alpha': [
                       1.0,
                   ],
                   'binarize': [
                       0.0,
                   ],
                   'fit_prior': [True, False],
                   'class_prior': [None]
               },
               best_parameters={},
               cv_params={
                   'alpha': [1.0],
                   'binarize': [0.0],
                   'fit_prior': [True, False],
                   'class_prior': [None]
               })

    RF = dict(name='RandomForestClassifier',
              model=RandomForestClassifier(),
              parameters={
                  'max_depth': [2, 3, 4],
                  'bootstrap': [True, False],
                  'max_features': ['auto', 'sqrt', 'log2', None],
                  'criterion': ['gini', 'entropy'],
                  'random_state': [8]
              },
              best_parameters={},
              cv_params={
                  'max_depth': [2, 3, 4],
                  'bootstrap': [True, False],
                  'max_features': ['auto', 'sqrt', 'log2', None],
                  'criterion': ['gini', 'entropy'],
                  'random_state': [8]
              })
    SVM = dict(
        name='SVC',
        model=SVC(),
        parameters={
            'C': [1, 10, 100, 500, 1000],
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf'],
            #'C': [1,10,100,500,1000] , 'kernel': ['linear', 'rbf']
            #'degree': [2,3,4,5,6] , 'C':[1,10,100,500,1000] , 'kernel':['poly']
        },
        best_parameters={},
        cv_params={
            'C': [1, 10, 100, 500, 1000],
            'kernel': ['rbf'],
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
        })

    BAG_params = {
        'base_estimator': [
            DecisionTreeClassifier(),
            DecisionTreeClassifier(max_depth=2),
            DecisionTreeClassifier(max_depth=4),
            BernoulliNB(),
            LogisticRegression(penalty='l1'),
            LogisticRegression(penalty='l2'),
        ],  #GaussianNB(),],
        'n_estimators': [
            10,
        ],
        'max_samples': [1.0],
        'max_features': [1.0],
        'bootstrap': [
            True,
        ],
        'bootstrap_features': [False],
        'oob_score': [False],  #'warm_start': [False], 
        'n_jobs': [None],
        'random_state': [8],
        'verbose': [0]
    }

    BAG = dict(name='BaggingClassifier',
               model=BaggingClassifier(),
               parameters=BAG_params,
               best_parameters={},
               cv_params={
                   'base_estimator': [
                       DecisionTreeClassifier(criterion='gini'),
                       DecisionTreeClassifier(criterion='entropy'),
                       BernoulliNB(),
                       LogisticRegression(penalty='l1'),
                       LogisticRegression(penalty='l2')
                   ],
                   'bootstrap': [True],
                   'random_state': [0, 8]
               })

    GB = dict(
        name='GradientBoostingClassifier',
        model=GradientBoostingClassifier(),
        parameters={
            'loss': ['deviance', 'exponential'],
            'learning_rate': [0.1, 0.01, 1.0],
            'n_estimators': [100, 200, 25, 50, 75],
            'subsample':
            [1.0, 0.75, 0.5, 0.25, 0.01
             ],  # < 1.0 leads to reduction of variance and increase in bias
            #  < 1.0 results in Stochastic Gradient Boosting
            'random_state': [8],
            #'ccp_alpha': [0.0,0.0001,0.001,0.01,0.1,1.0]# only in version 0.22
            #cost-complexity pruning algorithm to prune tree to avoid over fitting
            #'min_samples_split':[2,3,4],
            #'min_samples_leaf':[1,2,3],
            #'min_weight_fraction_leaf':[0],
            #'max_depth':[3,4,5],
            #'min_impurity_decrease':[0],
            #'init':[None],
            #'max_features':[None],
            #'verbose':[0],
        },
        best_parameters={},
        cv_params={
            'loss': ['deviance', 'exponential'],
            'n_estimators': [100],
            'random_state': [0, 8]
        })
    ADA = dict(
        name='AdaBoostClassifier',
        model=AdaBoostClassifier(),
        parameters={
            'base_estimator': [
                DecisionTreeClassifier(max_depth=1),
                DecisionTreeClassifier(max_depth=2),
                DecisionTreeClassifier(max_depth=3),
                DecisionTreeClassifier(max_depth=4),
                BernoulliNB(),
                #GaussianNB(),
            ],
            'n_estimators': [25, 50, 75, 100],  # ,100
            'learning_rate': [1.0, 0.1],
            #'algorithm': ['SAMME', 'SAMME.R'],
            'random_state': [8],
        },
        best_parameters={},
        cv_params={
            'base_estimator': [
                None,
                DecisionTreeClassifier(criterion='gini'),
                DecisionTreeClassifier(criterion='entropy'),
                BernoulliNB(),
                LogisticRegression(penalty='l1'),
                LogisticRegression(penalty='l2')
            ],
            'random_state': [0, 8]
        })

    XGB_params = {
        'max_depth': [3],
        'learning_rate': [0.1],
        'n_estimators': [
            100,
        ],  #50,150,200],
        'verbosity': [1],
        'objective': ['binary:logistic'],
        'booster': ['gbtree', 'gblinear', 'dart'],  # IMPORTANT
        'tree_method': ['auto', 'exact', 'approx',
                        'hist'],  #, 'gpu_hist' # IMPORTANT
        'n_jobs': [1],
        'gamma': [0],
        'min_child_weight': [1],
        'max_delta_step': [0],
        'subsample': [1],
        'colsample_bytree': [1],
        'colsample_bylevel': [1],
        'colsample_bynode': [1],
        'reg_alpha': [0],
        'reg_lambda': [1],
        'scale_pos_weight': [1],
        'base_score': [0.5],
        'random_state': [8],
        'missing': [None]
    }

    XGB = dict(name='XGBClassifier',
               model=XGBClassifier(),
               parameters=XGB_params,
               best_parameters={},
               cv_params={
                   'tree_method': ['auto', 'exact', 'approx', 'hist'],
                   'booster': ['gbtree', 'gblinear', 'dart'],
                   'random_state': [0, 8]
               })

    LGBM_params = {
        'boosting_type': ['gbdt', 'goss'],  # ,'dart','rf'
        'num_leaves': [31],
        'max_depth': [-1],
        'learning_rate': [0.1],
        'n_estimators': [100],
        'subsample_for_bin': [200000],
        'objective': [None],
        'class_weight': [None],
        'min_split_gain': [0.0],
        'min_child_weight': [0.001],
        'min_child_samples': [20],
        'subsample': [1.0],
        'subsample_freq': [0],
        'colsample_bytree': [1.0],
        'reg_alpha': [0.0],
        'reg_lambda': [0.0],
        'random_state': [8],
        'n_jobs': [-1],
        'silent': [True],
        'importance_type': ['split']
    }

    LGBM = dict(name='LGBMClassifier',
                model=LGBMClassifier(),
                parameters=LGBM_params,
                best_parameters={},
                cv_params={
                    'boosting_type': ['gbdt', 'goss'],
                    'random_state': [0, 8]
                })

    HGB_params = {
        'loss': [
            'auto',
            'binary_crossentropy',
        ],  # 'categorical_crossentropy'
        'learning_rate': [0.1],
        'max_iter': [100],
        'max_leaf_nodes': [31],
        'max_depth': [None],
        'min_samples_leaf': [20],
        'l2_regularization': [0, 1, 2],  # 0 for no regularization
        'max_bins': [255],
        #'warm_start': [False],
        'scoring': [None],
        'validation_fraction': [0.1],
        'n_iter_no_change': [None],
        'tol': [1e-07],
        'verbose': [0],
        'random_state': [8]
    }

    HGB = dict(name='HistGradientBoostingClassifier',
               model=HistGradientBoostingClassifier(),
               parameters=HGB_params,
               best_parameters={},
               cv_params={
                   'loss': ['auto', 'binary_crossentropy'],
                   'l2_regularization': [0, 1, 2],
                   'random_state': [0, 8]
               })

    models = {
        i: mod
        for i, mod in enumerate(
            [LR, DT, KNN, GNB, BNB, RF, SVM, BAG, GB, ADA, XGB, LGBM, HGB],
            start=1)
    }
    return models
Example #12
    args = parser.parse_args()

    n_trees = args.n_trees

    df = load_data()
    target = df.values[:, 0]
    data = np.ascontiguousarray(df.values[:, 1:])
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, test_size=.2, random_state=0)

    n_samples, n_features = data_train.shape
    print(f"Training set with {n_samples} records with {n_features} features.")

    est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                         max_iter=n_trees,
                                         n_iter_no_change=None,
                                         random_state=0,
                                         verbose=1)

    if args.library == 'sklearn':
        print("Fitting a sklearn model...")
        tic = time()
        est.fit(data_train, target_train)
        toc = time()
        predicted_test = est.predict(data_test)
        predicted_proba_test = est.predict_proba(data_test)
        roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
        acc = accuracy_score(target_test, predicted_test)
        print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, "
              f"ACC: {acc :.4f}")
Example #13
    else:
        X = X_
        y = y_

    # TODO convert into arguments
    n_components = 0.95
    inner_cv = 5
    max_iter = 300
    class_weight = None

    lrcv = LogisticRegressionCV(cv=inner_cv, scoring="accuracy",
                                n_jobs=1,
                                class_weight=class_weight,
                                random_state=random_state, max_iter=max_iter)
    gbm = GradientBoostingClassifier(n_estimators=100)
    hgbm = HistGradientBoostingClassifier(max_iter=100)

    print("Samples: ", str(n_samples))
    print("Features: ", str(n_features))
    print("Informative: ", str(n_informative))

    # convert single clf argument to list, if only one was passed
    if type(clf) is not list:
        clf = [clf]

    for clf_ in clf:
        print("Evaluating classifier: ", clf_)

        if add_pca:
            pipe = make_pipeline(
                StandardScaler(),
Example #14
    res["max_depth"] = max_depth
    res["n_estimators"] = n_estimators
    res["n_features"] = n_features
    if verbose:
        pprint(res)
    yield res


compilation.extend(list(measure_onnx_runtime(rf, X_test)))


########################################
# HistGradientBoostingClassifier
# ++++++++++++++++++++++++++++++

hist = HistGradientBoostingClassifier(
    max_iter=n_estimators, max_depth=max_depth)
print('train')
hist = train_cache(hist, X_train, y_train, max_depth, n_estimators, n_classes)

compilation.extend(list(measure_onnx_runtime(hist, X_test)))

########################################
# LightGBM
# ++++++++

lgb = LGBMClassifier(n_estimators=n_estimators,
                     max_depth=max_depth, pred_early_stop=False)
print('train')
lgb = train_cache(lgb, X_train, y_train, max_depth, n_estimators, n_classes)

compilation.extend(list(measure_onnx_runtime(lgb, X_test)))
Example #15
def Gridsearchcv(X_train, X_test, y_train, y_test):
    ############
    # Scale numeric values
    num_transformer = Pipeline(steps=[
        ('scaler', MinMaxScaler())])
    
    preprocessor = ColumnTransformer(
        remainder='passthrough',
        transformers=[
            ('num', num_transformer, make_column_selector(pattern='EDAD'))
            ])
    ############
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', PipelineHelper([
            ('svc', SVC()),
            ('gb', GradientBoostingClassifier()),
            ('xgb', XGBClassifier(use_label_encoder=False)),
            ('eec', EasyEnsembleClassifier()),
            ('rbc', RUSBoostClassifier()),
            ('bbc', BalancedBaggingClassifier()),
            ('brf', BalancedRandomForestClassifier()),
        ])),
    ])

    params = {
    'clf__selected_model': pipe.named_steps['clf'].generate({

        # # #EasyEnsembleClassifier
        'eec__n_estimators' : [10, 25, 50, 100],
        'eec__warm_start' : [False, True],
        'eec__replacement' : [False, True],

        # # #RUSBoostClassifier
        'rbc__algorithm' : ['SAMME','SAMME.R'],
        'rbc__n_estimators' : [10, 50, 100, 200, 500],
        'rbc__learning_rate' : [1e-3, 1e-2, 1e-1, 0.5, 1.],
        
        # # #BalancedBaggingClassifier
        'bbc__base_estimator': [HistGradientBoostingClassifier(), None],
        'bbc__n_estimators' : [10, 50, 100, 200, 500,750,1000],
        'bbc__max_samples':[0.5,0.6,0.7,0.8,0.9,1.0],
        'bbc__max_features':[0.5,0.6,0.7,0.8,0.9,1.0],

        # #BalancedRandomForestClassifier
        'brf__criterion': ['gini', 'entropy'],
        'brf__n_estimators' : [int(x) for x in np.linspace(start = 20, stop = 200, num = 5)],
        'brf__max_depth' : [int(x) for x in np.linspace(1, 45, num = 3)],
        'brf__min_samples_split' : range(2,10),
        'brf__min_samples_leaf': [1,3,5,10], 
        'brf__max_features' : ['auto', 'sqrt', 'log2'],

        # # #svm 
        'svc__C': [0.1, 0.5, 1, 10, 30, 40, 50, 75, 100, 500, 1000], 
        'svc__gamma' : [0.0001, 0.001, 0.005, 0.01, 0.05, 0.07, 0.1, 0.5, 1, 5, 10, 50],
        'svc__kernel': ['rbf'],
        
        # # #gb 3780
        "gb__learning_rate": [0.0001, 0.001, 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
        "gb__max_depth":[3,7,8,9,10,50],
        "gb__max_features":["log2","sqrt"],
        "gb__subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
        "gb__n_estimators":[10, 50, 100, 200, 300],
        
        # #xgboost
        'xgb__learning_rate' : [1e-3, 1e-2, 1e-1, 0.5, 1.],  
        'xgb__min_child_weight': np.arange(1, 21, 5),
        'xgb__subsample': np.arange(0.05, 1.01, 0.05),
        'xgb__verbosity': [0],

        # 'xgb__booster': ['gbtree', 'gblinear' ,'dart'], 
        # 'xgb__learning_rate' : [1e-3, 1e-2, 1e-1, 0.5, 1.], 
        # 'xgb__min_child_weight': range(1, 21, 5),
        # 'xgb__subsample': np.arange(0.05, 1.01, 0.05),
        # 'xgb__max_depth': [15,20,25],
        # 'xgb__verbosity': [0],

        # 'xgb__n_estimators': [100],
        # 'xgb__max_depth': range(1, 11),
        # 'xgb__learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        # 'xgb__subsample': np.arange(0.05, 1.01, 0.05),
        # 'xgb__min_child_weight': range(1, 21),
        # 'xgb__verbosity': [0], # add this line to slient warning 
        
        # 'xgb__n_estimators': [400, 700, 1000],
        # 'xgb__colsample_bytree': [0.7, 0.8],
        # 'xgb__max_depth': [15,20,25],
        # 'xgb__reg_alpha': [1.1, 1.2, 1.3],
        # 'xgb__reg_lambda': [1.1, 1.2, 1.3],
        # 'xgb__subsample': [0.7, 0.8, 0.9],
        # 'xgb__eval_metric' : ['mlogloss']
        }),
    }
    scoring = {'ba': 'balanced_accuracy','ap': 'average_precision', 'F1' : 'f1', 'ra': 'roc_auc', 'rc': 'recall'}
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3)
    #cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5)
    #https://towardsdatascience.com/hyper-parameter-tuning-with-randomised-grid-search-54f865d27926
    #n_iter: 30,60, 100
    grid = RandomizedSearchCV(
        pipe, 
        params,
        refit = 'ba',
        cv = cv, 
        verbose = 3, 
        n_jobs=-1,
        n_iter = 60,
        scoring= scoring,
        return_train_score = True
        )

    grid.fit(X_train, y_train)
    df_grid=pd.DataFrame(grid.cv_results_)
    df_grid = df_grid.sort_values(by=['mean_test_ba'],ascending=False)
    df_grid = df_grid[[
        'param_clf__selected_model',
        'params',
        'mean_fit_time',
        'std_fit_time',
        'mean_test_ba',
        'std_test_ba',
        'rank_test_ba',
        'mean_test_ap',
        'std_test_ap',
        'rank_test_ap',
        'mean_test_ra',
        'std_test_ra',
        'rank_test_ra',
        'mean_test_F1', 
        'std_test_F1', 
        'rank_test_F1'
    ]]

    print("Best-Fit Parameters From Training Data:\n",grid.best_params_)
    grid_predictions = grid.best_estimator_.predict(X_test) 
    report = classification_report(y_test, grid_predictions, output_dict=True)
    report = pd.DataFrame(report).transpose()
    print(report)
    print(confusion_matrix(y_test, grid_predictions))

    return grid, df_grid, report
Example #16
#Create the min-max scaler and apply it to our features. Drop all unneeded columns and store the column to be predicted as our y.
X = df.drop(columns=[
    'Unnamed: 0', 'id', 'title', 'category', 'subcategory', 'blurb', 'launch',
    'deadline', 'state', 'city', 'backers', 'pledged', 'ongoing', 'location',
    'success'
])
columns = X.columns
X = pd.DataFrame(X, columns=columns)
y = df['success']

#Set up 10-fold cross-validation and fit the min-max scaler.
kf = KFold(n_splits=10)
scaler = MinMaxScaler()
scaler.fit(X)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
#Create model, train it, and test it.
# The commented-out hyperparameters (learning_rate=0.1, loss='binary_crossentropy',
# max_bins=255, max_depth=3, max_iter=100, max_leaf_nodes=31, min_samples_leaf=10)
# gave slightly lower results: 71.96 average accuracy.
model = HistGradientBoostingClassifier()
model.fit(X, y)
pipeline = Pipeline([('scaler', scaler), ('HGB Classifier', model)])
score = cross_val_score(pipeline, X, y, cv=kf, scoring='accuracy').mean()
print(score)

#pickle the model for future use
pkl.dump(model, file1)
pkl.dump(encoder, file2)
pkl.dump(scaler, file3)
file1.close()
file2.close()
file3.close()
Example #17
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    n_classes = 2
    max_bins = 255

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=n_classes,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="log_loss",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn,
                                            lib="lightgbm",
                                            n_classes=n_classes)

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
Example #18
def cat(X):
    return X.dtypes == "category"


cat_imp = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse=False),
    TruncatedSVD(),
)
ct = ColumnTransformer([("cat", cat_imp, cat),
                        ("cont",
                         FunctionTransformer(lambda x: x,
                                             validate=False), cont)])
clf = sklearn.pipeline.Pipeline(steps=[
    ("transform", ct),
    ("estimator", HistGradientBoostingClassifier()),
])

suite = openml.study.get_suite(1)
# We'll create a study with one run on each of three random datasets
tasks = np.random.choice(suite.tasks, size=3, replace=False)
run_ids = []
for task_id in tasks:
    task = openml.tasks.get_task(task_id)
    run = openml.runs.run_model_on_task(clf, task)
    run.publish()
    run_ids.append(run.run_id)

# The study needs a machine-readable and unique alias. To obtain this,
# we simply generate a random uuid.
alias = uuid.uuid4().hex
def test_should_stop(scores, n_iter_no_change, tol, stopping):

    gbdt = HistGradientBoostingClassifier(n_iter_no_change=n_iter_no_change,
                                          tol=tol)
    assert gbdt._should_stop(scores) == stopping
Example #20
g_nb = GaussianNB()
# Parameters: n_neighbors (number of neighbors, default 5), weights (default
# 'uniform'), leaf_size (default 30)
knn = KNeighborsClassifier()
ran_for = RandomForestClassifier()
# n_estimators: number of trees; max_depth: maximum depth, used for pruning
# (anything deeper is cut off).
# min_samples_leaf: used together with max_depth; after a split, each child
# node must contain at least min_samples_leaf training samples.
# bootstrap: resample the original data to build new datasets; sampling is
# uniform and with replacement.
# penalty: penalty term (default L2); C: inverse of regularization strength
# (default 1.0); solver (default 'lbfgs'); 'saga' supports every penalty.
log_reg = LogisticRegression()
tree = DecisionTreeClassifier()
xgb = XGBClassifier()  # parameter details: https://www.itread01.com/content/1536594984.html
ada_boost = AdaBoostClassifier()  # parameter details: https://ask.hellobi.com/blog/zhangjunhong0428/12405
grad_boost = GradientBoostingClassifier(
    n_estimators=100)  # parameter details: https://www.itread01.com/content/1514358146.html
hist_grad_boost = HistGradientBoostingClassifier(
)  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html

# Parameter settings for training the models
clf = [("Naive Bayes", g_nb, {}), \
       ("K Nearest", knn, {"n_neighbors": [3, 5, 6, 7, 8, 9, 10], "leaf_size": [25, 30, 35]}), \
       ("Random Forest", ran_for,
        {"n_estimators": [10, 50, 100, 200, 400], "max_depth": [3, 10, 20, 40], "random_state": [99],
         "min_samples_leaf": [5, 10, 20, 40, 50], "bootstrap": [False]}), \
       ("Logistic Regression", log_reg, {"penalty": ['l2'], 'max_iter':[10, 20],"C": [100, 10, 1.0, 0.1, 0.01], "solver": ['saga']}), \
 \
       ("Decision Tree", tree, {}), \
       ("XGBoost", xgb,
        {"n_estimators": [200], "max_depth": [3, 4, 5], "learning_rate": [.01, .1, .2], "subsample": [.8],
         "colsample_bytree": [1], "gamma": [0, 1, 5], "lambda": [.01, .1, 1]}), \
 \
       ("Adapative Boost", ada_boost, {"n_estimators": [100], "learning_rate": [.6, .8, 1]}), \
def test_categorical_encoding_strategies():
    # Check native categorical handling vs different encoding strategies. We
    # make sure that native encoding needs only 1 split to achieve a perfect
    # prediction on a simple dataset. In contrast, OneHotEncoded data needs
    # more depth / splits, and treating categories as ordered (just using
    # OrdinalEncoder) requires even more depth.

    # dataset with one random continuous feature, and one categorical feature
    # with values in [0, 5], e.g. from an OrdinalEncoder.
    # class == 1 iff categorical value in {0, 2, 4}
    rng = np.random.RandomState(0)
    n_samples = 10_000
    f1 = rng.rand(n_samples)
    f2 = rng.randint(6, size=n_samples)
    X = np.c_[f1, f2]
    y = np.zeros(shape=n_samples)
    y[X[:, 1] % 2 == 0] = 1

    # make sure dataset is balanced so that the baseline_prediction doesn't
    # influence predictions too much with max_iter = 1
    assert 0.49 < y.mean() < 0.51

    clf_cat = HistGradientBoostingClassifier(
        max_iter=1, max_depth=1, categorical_features=[False, True])

    # Using native categorical encoding, we get perfect predictions with just
    # one split
    assert cross_val_score(clf_cat, X, y).mean() == 1

    # quick sanity check for the bitset: 0, 2, 4 = 2**0 + 2**2 + 2**4 = 21
    expected_left_bitset = [21, 0, 0, 0, 0, 0, 0, 0]
    left_bitset = clf_cat.fit(X, y)._predictors[0][0].raw_left_cat_bitsets[0]
    assert_array_equal(left_bitset, expected_left_bitset)

    # Treating categories as ordered, we need more depth / more splits to get
    # the same predictions
    clf_no_cat = HistGradientBoostingClassifier(max_iter=1,
                                                max_depth=4,
                                                categorical_features=None)
    assert cross_val_score(clf_no_cat, X, y).mean() < .9

    clf_no_cat.set_params(max_depth=5)
    assert cross_val_score(clf_no_cat, X, y).mean() == 1

    # Using OHEd data, we need fewer splits than with pure OEd data, but we
    # still need more splits than with the native categorical splits
    ct = make_column_transformer((OneHotEncoder(sparse=False), [1]),
                                 remainder='passthrough')
    X_ohe = ct.fit_transform(X)
    clf_no_cat.set_params(max_depth=2)
    assert cross_val_score(clf_no_cat, X_ohe, y).mean() < .9

    clf_no_cat.set_params(max_depth=3)
    assert cross_val_score(clf_no_cat, X_ohe, y).mean() == 1
Example #22
class GradientBoostingMsgClassifierModel(h1.Model):
    def load_data(self, num_files=None):
        return util.load_data(num_files, shuffle=True)

    def prep_data(self, data):
        def concat_processed_files(files):
            dfs = []
            for f in files:
                z = pd.read_csv(f)
                z.columns = [
                    'Timestamp',
                    'Label',
                    'CarSpeed',
                    'SteeringAngle',
                    'YawRate',
                    'Gx',
                    'Gy',
                ]
                z = util.compute_timediff_fillna(z)
                dfs.append(z)
            df2 = pd.concat(dfs)
            return df2

        result = {
            "train_attack_df":
            concat_processed_files(data["train_attack_files"]),
            "test_attack_df": concat_processed_files(data["test_attack_files"])
        }
        print("len train_attack_df = %s" % len(result["train_attack_df"]))
        print("len test_attack_df = %s" % len(result["test_attack_df"]))
        return result

    def train(self, prepared_data):
        df = prepared_data["train_attack_df"]
        from sklearn.experimental import enable_hist_gradient_boosting
        from sklearn.ensemble import HistGradientBoostingClassifier
        X = df[FEATURES]
        y = df.Label == "Tx"
        self.model = HistGradientBoostingClassifier(max_iter=500).fit(X, y)

    def evaluate(self, prepared_data):
        df = prepared_data["test_attack_df"]
        ypred = self.model.predict(df[FEATURES])
        import sklearn.metrics
        cf = sklearn.metrics.confusion_matrix(df.Label == "Tx", ypred)
        acc = sklearn.metrics.accuracy_score(df.Label == "Tx", ypred)
        print(cf)
        print("Accuracy = %.4f" % acc)
        self.metrics = {"confusion_matrix": cf, "accuracy": acc}

    def predict(self, data):
        df = data["df"].copy()
        df = util.compute_timediff_fillna(df)
        df['MsgIsAttack'] = 0
        df['WindowInAttack'] = 0
        for event_result in data["event_detection_results"]:
            if event_result['WindowInAttack']:
                # print("window %s in attack: event_result = %s" % (event_result['window_start'], event_result))
                in_window = (df.Timestamp >= event_result['window_start']) & (
                    df.Timestamp <
                    event_result['window_start'] + config.WINDOW_SIZE)
                w_df = df[in_window]
                ypred = self.model.predict(w_df[FEATURES])
                df.loc[in_window, "WindowInAttack"] = 1
                df.loc[in_window, "MsgIsAttack"] = ypred.astype(int)
        return {"injection_window_results": df}
Example #23
        y=target_column_name,
    )
elif args.library == 'lightgbm':
    import lightgbm as lgb
    model = lgb.LGBMClassifier(
        learning_rate=0.1,
        n_estimators=100,
        num_leaves=255,
    )
    model.fit(features_train, labels_train)
elif args.library == 'sklearn':
    from sklearn.experimental import enable_hist_gradient_boosting
    from sklearn.ensemble import HistGradientBoostingClassifier
    model = HistGradientBoostingClassifier(
        learning_rate=0.1,
        max_iter=100,
        max_leaf_nodes=255,
        validation_fraction=None,
    )
    model.fit(features_train, labels_train)
elif args.library == 'xgboost':
    import xgboost as xgb
    model = xgb.XGBClassifier(eta=0.1,
                              grow_policy='lossguide',
                              n_estimators=100,
                              tree_method='hist')
    model.fit(features_train, labels_train)
elif args.library == 'catboost':
    from catboost import CatBoostClassifier
    model = CatBoostClassifier(grow_policy='Lossguide',
                               learning_rate=0.1,
                               n_estimators=100,
Example #24
def run(argv=None):
    """Emulate a HP search and monitor fit time."""
    args = parser.parse_args(argv)

    imputers = {
        'Mean': SimpleImputer(strategy='mean'),
        'Mean+mask': SimpleImputer(strategy='mean', add_indicator=True),
        'Med': SimpleImputer(strategy='median'),
        'Med+mask': SimpleImputer(strategy='median', add_indicator=True),
        'Iterative': IterativeImputer(max_iter=args.max_iter),
        'Iterative+mask': IterativeImputer(add_indicator=True,
                                           max_iter=args.max_iter),
        'IterativeR': IterativeImputer(estimator=RidgeCV(),
                                       max_iter=args.max_iter),
        'IterativeR+mask': IterativeImputer(estimator=RidgeCV(),
                                            add_indicator=True,
                                            max_iter=args.max_iter),
        'KNN': KNNImputer(),
        'KNN+mask': KNNImputer(add_indicator=True),

    }

    task_name = args.task_name
    est = args.est
    imp = imputers.get(args.imp, None)

    if task_name is None or est is None:
        logger.info('No argv given.')
        task_name = 'TB/shock_hemo'
        est = 'HGBC'

    task = tasks[task_name]
    logger.info(f'Argv given. Task {task.meta.tag}. est {est}.')

    t0 = time()
    logger.info('Getting X.')
    X = task.X
    logger.info('Getting y.')
    y = task.y

    logger.info(f'X shape before splits: {X.shape}')

    # Simulate the outer CV (the one of KFold)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2)

    # Simulate the inner CV (the one of RandomSearchCV)
    X_train2, X_test2, y_train2, _ = train_test_split(X_train, y_train, test_size=0.2)

    # Now X has the same shape as in real experiment
    logger.info(f'X shape: {X_train2.shape}')

    t_X_ready = time()

    if imp is not None:
        logger.info(f'Fitting imputer {args.imp}')
        imp.fit(X_train2, y_train2)
        t_fit_imp = time()
        logger.info('Imputer fitted.')

        logger.info('Transforming X_train')
        imp.transform(X_train2)
        t_tra1_imp = time()
        logger.info('X_train transformed')

        logger.info('Transforming X_test')
        imp.transform(X_test2)
        t_tra2_imp = time()
        logger.info('X_test transformed')

    t_fits = [time()]

    for learning_rate in param_space['learning_rate']:
        for max_depth in param_space['max_depth']:
            if est == 'HGBC':
                estimator = HistGradientBoostingClassifier(
                    learning_rate=learning_rate,
                    max_depth=max_depth
                )
            elif est == 'HGBR':
                estimator = HistGradientBoostingRegressor(
                    loss='least_absolute_deviation',
                    learning_rate=learning_rate,
                    max_depth=max_depth
                )
            else:
                raise ValueError(f'Unknown estimator {est}')

            logger.info(f'Params: LR {learning_rate} MD {max_depth}')
            logger.info('Fitting estimator.')
            estimator.fit(X_train2, y_train2)
            t_fits.append(time())
            logger.info('Estimator fitted.')

    t_fits = np.diff(t_fits)

    data = {
        'task_tag': [task.meta.tag],
        'imp': [args.imp],
        'imp_params': [repr({'max_iter': args.max_iter})],
        'X_shape': [repr(X.shape)],
        'X_train_shape': [repr(X_train2.shape)],
        'X_test_shape': [repr(X_test2.shape)],
        'time_X_ready': [t_X_ready-t0],
        'time_fit_imp': np.around([0 if imp is None else t_fit_imp-t_X_ready], 2),
        'time_tra1_imp': np.around([0 if imp is None else t_tra1_imp-t_X_ready], 2),
        'time_tra2_imp': np.around([0 if imp is None else t_tra2_imp-t_tra1_imp], 2),
        'time_fits': [repr(np.around(t_fits.tolist(), 2))],
        'time_fits_mean': [np.around(t_fits.mean(), 2)]
    }

    new_df = pd.DataFrame(data)

    df = None
    filepath = 'results/fit_time.csv'
    if os.path.exists(filepath):
        df = pd.read_csv(filepath, index_col=0)

    if df is not None:
        new_df = pd.concat([df, new_df])

    new_df.to_csv(filepath)
Example #25
            #
            #         clf.fit(X_train_selected, y_train)
            #
            #         y_pred = clf.predict(X_test_selected)
            #         y_train_pred = clf.predict(X_train_selected)
            #
            #         balacc, acc, mse, r2, rho = gradeoutput(y_test, y_pred, class_boundary, tfm)
            #         outdf = writeresults(outdf, sel_name, clf_name, split, param1, param2, acc, balacc, mse, r2, rho)

            elif clf_name is "Huber":
                param1 = np.NaN
                param2 = np.NaN
                eps_list = [1.1, 1.2, 1.35, 1.5,
                            2]  # epsilon:  greater than 1.0, default 1.35
                for param1 in tqdm(eps_list):
                    clf = HistGradientBoostingClassifier(
                        learning_rate=param1, random_state=randomstate)

                    clf.fit(X_train_selected, y_train)

                    y_pred = clf.predict(X_test_selected)
                    y_train_pred = clf.predict(X_train_selected)

                    balacc, acc, mse, r2, rho = gradeoutput(
                        y_test, y_pred, class_boundary, qtfm)
                    outdf = writeresults(outdf, sel_name, clf_name, split,
                                         param1, param2, acc, balacc, mse, r2,
                                         rho)

            elif clf_name is "K-Neighbors":
                param1 = np.NaN
                param2 = np.NaN
Example #26
def main():
    EXPORT = True
    random_state = 49
    train_size = 0.8 if not EXPORT else 1

    df = pd.read_csv("train.csv")
    #df['Fare'].fillna(-1, inplace=True)
    #df['Embarked'].fillna('C', inplace=True)
    #df['Age'].fillna(-1, inplace=True)

    test = pd.read_csv('test.csv')
    #test['Fare'].fillna(-1, inplace=True)
    #test['Embarked'].fillna('C', inplace=True)
    #test['Age'].fillna(-1, inplace=True)
    #df_train: pd.DataFrame
    #df_valid: pd.DataFrame
    df_train, df_valid = train_test_split(df, train_size=train_size, random_state=random_state)\
        if not EXPORT else (df, df)

    preprocessor = Preprocessor()
    preprocessor.fit(df_train)

    x_train = preprocessor.transform(df_train)
    y_train = df_train["target"]

    x_valid = preprocessor.transform(df_valid) if not EXPORT else None
    y_valid = df_valid["target"] if not EXPORT else None

    x_test = preprocessor.transform(test)

    # This tells us that there are not too many features strongly correlated
    # with one another. Low correlation is good because it means there isn't
    # much redundant or superfluous data. The plot disappears when the program
    # finishes.
    plt.figure(figsize=(14, 12))
    plt.title('Pearson Correlation of Features', y=1.05, size=15)
    sns.heatmap(pd.DataFrame(x_test.astype(float)).corr(),
                linewidths=0.1,
                vmax=1.0,
                square=True,
                cmap=plt.cm.get_cmap('RdBu'),
                linecolor='white',
                annot=True)

    #model = RandomForestClassifier(n_estimators=200, bootstrap=False, min_samples_split=49, criterion='entropy')     #78.81 ... 86.19
    #model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=5, random_state=0, verbose=True)   #77.08 ... 86.55
    model = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                           tol=1e-12,
                                           max_iter=10000,
                                           min_samples_leaf=2,
                                           verbose=True)  #78.87 ... 86.85
    #model = MLPClassifier(hidden_layer_sizes=20, activation='tanh', solver='adam', tol=1e-6, verbose=True, max_iter=500, random_state=0) #100    78.92 79.11 81.24 81.85 81.74 .. 84.03

    #GridSearchCV
    # Going to use these 5 base models for the stacking
    # from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier)
    # from sklearn.svm import SVC

    #model = DecisionTreeClassifier(max_depth=None, min_samples_split=10, random_state = 0)                #78.56 ... 82.26
    #model = LogisticRegression(max_iter=1000, random_state=0)                                             #77.96 ... 81.83 ...
    #model = LogisticRegressionCV(max_iter=1000, random_state=0)                                           #78.016 ... 82.03
    #model = RadiusNeighborsClassifier(radius=9)                                                           #75.87 ... 75.87
    #model = KNeighborsClassifier(n_neighbors=6)                                                           #73.795 ... 81.76
    ## model = GaussianNB()                                                                                  #78.667
    model.fit(x_train, y_train)

    if EXPORT:
        y_test_pred = model.predict(x_test)
        result = pd.DataFrame(np.stack((np.array(test['Id']), y_test_pred),
                                       axis=1),
                              columns=['Id', 'Predicted'])
        result.to_csv('submission.csv', index=False)
        print("Result exported.")
    else:
        y_valid_pred = model.predict(x_valid)
        accuracy = accuracy_score(y_valid, y_valid_pred)
        # mcc = matthews_corrcoef(y_valid, y_valid_pred)
        #metrics = dict(accuracy=accuracy) ###############################################################
        #params = dict(min_samples_split=min_samples_split)
        print("Accuracy: {}".format(accuracy))
        # print("Mcc: {}".format(mcc))
        # tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel()

    #print("Train size: {} Random state: {} Accuracy: {}".format(i, j, accuracy))
    #best.append(accuracy)
    #print(max(best))

    pass
Example #27
def test_same_predictions_multiclass_classification(seed, min_samples_leaf,
                                                    n_samples, max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255
    lr = 1

    X, y = make_classification(n_samples=n_samples,
                               n_classes=3,
                               n_features=5,
                               n_informative=5,
                               n_redundant=0,
                               n_clusters_per_class=1,
                               random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss='categorical_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=lr,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_sklearn = est_sklearn.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_sklearn = est_sklearn.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
Example #28
                                                    y,
                                                    shuffle=True,
                                                    test_size=0.2,
                                                    random_state=42)
print(x_train.shape)  # (1712, 110336)
print(x_test.shape)  # (429, 110336)
print(y_train.shape)  # (1712,)
print(y_test.shape)  # (429,)

# Build the model
# model = SVC(verbose=1)
# hist = model.fit(x_train, y_train)

# SVC Visual
plt.figure(figsize=(10, 6))
model = HistGradientBoostingClassifier(verbose=1)

# mse
# train_sizes, train_scores_model, test_scores_model = \
#     learning_curve(model, x_train[:100], y_train[:100], train_sizes=np.linspace(0.1, 1.0, 10),
#                    scoring="neg_mean_squared_error", cv=8, shuffle=True, random_state=42)
# plt.plot(train_sizes, -test_scores_model.mean(1), 'o-', color="r", label="mse")

# accuracy
train_sizes, train_scores_model, test_scores_model = \
    learning_curve(model, x_train[:100], y_train[:100], train_sizes=np.linspace(0.1, 1.0, 10),
                   scoring="accuracy", cv=8, shuffle=True, random_state=42)

train_scores_mean = np.mean(train_scores_model, axis=1)
train_scores_std = np.std(train_scores_model, axis=1)
test_scores_mean = np.mean(test_scores_model, axis=1)
print(f"Number of numerical features: {n_numerical_features}")

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Note: no need to use an OrdinalEncoder because categorical features are
# already clean
is_categorical = [name in data.categories for name in data.feature_names]
est = HistGradientBoostingClassifier(
    loss="log_loss",
    learning_rate=lr,
    max_iter=n_trees,
    max_bins=max_bins,
    max_leaf_nodes=n_leaf_nodes,
    categorical_features=is_categorical,
    early_stopping=False,
    random_state=0,
    verbose=verbose,
)

fit(est, X_train, y_train, "sklearn")
predict(est, X_test, y_test)

if args.lightgbm:
    est = get_equivalent_estimator(est, lib="lightgbm", n_classes=n_classes)
    est.set_params(max_cat_to_onehot=1)  # don't use one-hot encoding
    categorical_features = [
        f_idx for (f_idx, is_cat) in enumerate(is_categorical) if is_cat
    ]
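    # Hedged completion (the original snippet is truncated here; it assumes
    # the local ``fit``/``predict`` helpers forward extra keyword arguments to
    # ``est.fit``):
    fit(est, X_train, y_train, "lightgbm",
        categorical_feature=categorical_features)
    predict(est, X_test, y_test)
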
preprocessor = ColumnTransformer(
    [
        ('cat_preprocessor', categorical_preprocessor, categorical_columns),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

# %%
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier",
     HistGradientBoostingClassifier(random_state=42, max_leaf_nodes=4)),
])
model

# %% [markdown]
# ## Evaluation
#
# ### Without hyperparameter tuning
#
# In the module "Selecting the best model", we saw that one must use
# cross-validation to evaluate such a model. Cross-validation gives us a
# distribution of scores for the model; with this distribution at hand, we can
# assess the variability of our estimate of the model's generalization
# performance. Here, we recall the `scikit-learn` tools needed to obtain the
# mean and standard deviation of the scores.
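#
# As a minimal sketch of those tools (assuming the `model` pipeline defined
# above and feature/target objects named `data` and `target`, which are not
# shown in this excerpt):

# %%
from sklearn.model_selection import cross_validate

cv_results = cross_validate(model, data, target, cv=5)
scores = cv_results["test_score"]
print(f"Mean accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")
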
df_scores = evaluate_classifier(rf_clf, df_scores, "Balanced RF")
df_scores

###############################################################################
# The performance of the `BalancedRandomForestClassifier` is better than
# applying a single random under-sampling step. We will now use a
# gradient-boosting classifier within a `BalancedBaggingClassifier`.

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.ensemble import BalancedBaggingClassifier

bag_clf = make_pipeline(
    preprocessor_tree,
    BalancedBaggingClassifier(
        base_estimator=HistGradientBoostingClassifier(random_state=42),
        n_estimators=10,
        random_state=42,
        n_jobs=2))

df_scores = evaluate_classifier(bag_clf, df_scores, "Balanced bagging")
df_scores

###############################################################################
# This last approach is the most effective. Under-sampling a different subset
# of the majority class for each GBDT brings diversity to the ensemble, so
# that no single learner focuses on only a portion of the majority class.
#
# We will repeat the same experiment but with a ratio of 100:1 and make a
# similar analysis.
Example #32
# arbitrary. Therefore we adapt the preprocessing pipeline as follows:

# %%
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# For each categorical column, extract the list of all possible categories
# in some arbitrary order.
categories = [data[column].unique() for column in data[categorical_columns]]

preprocessor = ColumnTransformer([
    ('categorical', OrdinalEncoder(categories=categories), categorical_columns)
],
                                 remainder="passthrough")

model = make_pipeline(preprocessor, HistGradientBoostingClassifier())
model.fit(data_train, target_train)
print(model.score(data_test, target_test))

# %% [markdown]
# We can observe that we get significantly higher accuracies with the Gradient
# Boosting model. This is often what we observe whenever the dataset has a
# large number of samples and a limited number of informative features (e.g.
# fewer than 1000) with a mix of numerical and categorical variables.
#
# This explains why Gradient Boosted Machines are very popular among data
# science practitioners who work with tabular data.
Example #33
# training models

models = [
    [
        DecisionTreeClassifier(random_state=42),
        DecisionTreeClassifier(random_state=42),
        DecisionTreeClassifier(random_state=42),
        DecisionTreeClassifier(random_state=42),
    ],
    [
        RandomForestClassifier(n_jobs=6, random_state=42),
        RandomForestClassifier(n_jobs=6, random_state=42),
        RandomForestClassifier(n_jobs=6, random_state=42),
        RandomForestClassifier(n_jobs=6, random_state=42),
    ],
    [
        HistGradientBoostingClassifier(max_depth=4, random_state=42),
        HistGradientBoostingClassifier(max_depth=4, random_state=42),
        HistGradientBoostingClassifier(max_depth=4, random_state=42),
        HistGradientBoostingClassifier(max_depth=4, random_state=42),
    ],
]

names = [
    'Decision tree',
    'Random forest',
    'Gradient boosting',
]

y_preds = []
confusion_matrices = []

for model_list, name in zip(models, names):
Example #34
    QuantileTransformer(output_distribution='uniform'),
    Normalizer()
]

#=================Classifier
classifier_test = [
    OneVsRestClassifier(SVC()),
    DecisionTreeClassifier(max_depth=5),
    SVC(),
    SVC(kernel="linear", C=0.025),
    LogisticRegressionCV(cv=5, random_state=0),
    GradientBoostingClassifier(random_state=0),
    BaggingClassifier(base_estimator=SVC(), n_estimators=10,
                      random_state=0).fit(features, target),
    ExtraTreesClassifier(n_estimators=100, random_state=0),
    HistGradientBoostingClassifier(),
    MLPClassifier(random_state=1, max_iter=300),
    OneVsOneClassifier(LinearSVC(random_state=0)),
    OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),
                         random_state=0)
]
print('Imports OK')

# %%
# =================Looping here

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
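
# Hedged sketch of the loop this section announces (the original snippet is
# truncated here); it reuses the ``features`` / ``target`` arrays from above:
for clf in classifier_test:
    pipe = Pipeline([('scaler', StandardScaler()), ('model', clf)])
    pipe.fit(features, target)
    print(type(clf).__name__, pipe.score(features, target))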

Example #35
class tuned_HGB(BaseEstimator):
    """
    Scikit-learn histogram gradient-boosted tree models, tuned with nested
    cross-validation to minimize the error on an unseen table.
    
    Parameters
    ----------
    
    task : str
        The estimation task to perform, either 'salary', 'quantile', or 'sex'.
    learning_rate : None or float
        The learning rate of the model. If None, a nested cross-validation
        procedure is used to determine the best one.
    fit_on : str
        If fit_on='all', all the validation data is used to compute the
        validation error. Set fit_on='seen' or fit_on='unseen' to optimize the
        learning rate on seen or unseen categories only, respectively.
        
    """
    def __init__(self, task, learning_rate=None, fit_on='all'):

        self.task = task
        self.learning_rate = learning_rate
        self.fit_on = fit_on
        return

    def param_tuning(self, X1, y1):

        D_var = make_D_var(self.X1_nem, self.X1_mem, n_jobs=1)
        n_var = n_variants(self.X1_nem,
                           self.X1_mem,
                           y1,
                           self.groups1,
                           n_splits=None,
                           test_size=None,
                           D_var=D_var,
                           n_jobs=1,
                           nested_cross_val=True)
        lr_list = np.logspace(-2, -0.5, 4)
        res = np.zeros(len(lr_list))
        for k in range(len(lr_list)):
            if self.task == "salary":
                self2 = HistGradientBoostingRegressor(learning_rate=lr_list[k])
            else:
                self2 = HistGradientBoostingClassifier(
                    learning_rate=lr_list[k])
            cv_err = cv_errors(self.task,
                               self2,
                               X1,
                               self.X1_nem,
                               self.X1_mem,
                               y1,
                               self.groups1,
                               n_splits=None,
                               test_size=None,
                               n_jobs=1,
                               nested_cross_val=True)
            if self.task != 'quantile':
                cv_err = cv_err**2
            if self.fit_on == 'unseen':
                res[k] = cv_err[n_var == 0].mean()
            elif self.fit_on == 'seen':
                res[k] = cv_err[n_var >= 1].mean()
            else:
                res[k] = cv_err.mean()
        self.learning_rate = lr_list[np.argmin(res)]
        # Report the fraction of validation cases with unseen categories
        # (rounded to two decimals).
        print(int(sum(n_var == 0) / len(n_var) * 100) / 100)
        return

    def fit(self, X1, y1):

        # Parameter tuning
        if self.learning_rate is None:
            self.param_tuning(X1, y1)
            print(self.learning_rate)
        # Fit on all train data with tuned params
        if self.task == "salary":
            self.model = HistGradientBoostingRegressor(
                learning_rate=self.learning_rate)
        else:
            self.model = HistGradientBoostingClassifier(
                learning_rate=self.learning_rate)
        self.model.fit(X1, y1)
        return

    def predict(self, X2):
        return self.model.predict(X2)

    def predict_proba(self, X2):
        return self.model.predict_proba(X2)
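

# Hypothetical usage sketch for ``tuned_HGB`` (assumes feature tables ``X1``,
# ``X2`` and labels ``y1`` prepared elsewhere; with an explicit learning_rate
# the nested cross-validation helpers above are not needed):
#
#     hgb = tuned_HGB(task='sex', learning_rate=0.1)
#     hgb.fit(X1, y1)
#     proba = hgb.predict_proba(X2)
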
def main():
    #====================================================
    #     DATA PREPARATION
    #====================================================

    #Let's have a look at the dataset:
    data_full = pd.read_csv('dataset_higgs_challenge.csv')

    #For this classification I used only the "t" (training data), "b" (validation data) and "v" (test data) subsets:
    print('Total number of events: ', len(data_full), '\n')
    for KaggleSetID in ['t', 'b', 'v', 'u']:
        print('Number of events in the {} KaggleSet: {}'.format(
            KaggleSetID,
            len(data_full['KaggleSet'][data_full['KaggleSet'] ==
                                       KaggleSetID])))

    #Description of the sub-dataset in each line:
    #1) Splitting of the dataset into train, validation and test sets.
    #2) Extracting the weights of the validation and test sets.
    #3) Extracting the binary target arrays for the networks.
    #4) Extracting the binary target arrays for the BDT.
    #During the splitting, some feature engineering is applied to each subset.
    #The "phi" variables have a signal distribution very similar to the
    #background one, so it is better to use a linear combination of them
    #(here, their difference) to make them useful for classification.
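    # Hypothetical illustration (not the author's actual code inside
    # ``splitting``; the column names are assumptions based on the Higgs
    # challenge dataset):
    #   X['DELTA_phi_tau_lep'] = X['PRI_tau_phi'] - X['PRI_lep_phi']
    #   X = X.drop(columns=['PRI_tau_phi', 'PRI_lep_phi'])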
    X, df_empty, y_train, y_train_BDT = splitting(data_full, "t")
    X_val, weights_val, y_val, y_val_BDT = splitting(data_full, "b")
    X_test, weights_test, y_test, y_test_BDT = splitting(data_full, "v")
    del (data_full)

    #====================================================
    #     BDT
    #====================================================

    #Let's first scale my data:
    standard = StandardScaler()
    standard.fit(X)
    X_standard = standard.transform(X)
    X_val_standard = standard.transform(X_val)
    X_test_standard = standard.transform(X_test)

    #BDT classification:
    BDT = HistGradientBoostingClassifier(max_iter=90,
                                         verbose=1,
                                         l2_regularization=0.5,
                                         learning_rate=.1,
                                         max_leaf_nodes=50,
                                         random_state=45,
                                         max_depth=15,
                                         max_bins=50)
    BDT.fit(X_standard, y_train_BDT)

    y_pred_val = BDT.predict_proba(X_val_standard)
    y_pred_test = BDT.predict_proba(X_test_standard)

    del X_standard, X_val_standard, X_test_standard

    #I will split the results just to be able to combine them with the DNN result later:
    BDT_0jets_val = y_pred_val[X_val['PRI_jet_num'] == 0]
    BDT_1jet_val = y_pred_val[X_val['PRI_jet_num'] == 1]
    BDT_2jets_val = y_pred_val[X_val['PRI_jet_num'] >= 2]

    y_pred_BDT_val = np.concatenate(
        (BDT_0jets_val, BDT_1jet_val, BDT_2jets_val))

    BDT_0jets_test = y_pred_test[X_test['PRI_jet_num'] == 0]
    BDT_1jet_test = y_pred_test[X_test['PRI_jet_num'] == 1]
    BDT_2jets_test = y_pred_test[X_test['PRI_jet_num'] >= 2]

    y_pred_BDT_test = np.concatenate(
        (BDT_0jets_test, BDT_1jet_test, BDT_2jets_test))

    #====================================================
    #     DATA PROCESSING
    #====================================================

    #Let's construct the data for the case with 0 jets:
    X_0jets, y_train_0jets, empty_0 = splitting_jets(X, y_train, df_empty, 0)
    X_val_0jets, y_val_0jets, weights_0jets_val = splitting_jets(
        X_val, y_val, weights_val, 0)
    X_test_0jets, y_test_0jets, weights_0jets_test = splitting_jets(
        X_test, y_test, weights_test, 0)

    #Let's construct the data for the case with 1 jet:
    X_1jet, y_train_1jet, empty_1 = splitting_jets(X, y_train, df_empty, 1)
    X_val_1jet, y_val_1jet, weights_1jet_val = splitting_jets(
        X_val, y_val, weights_val, 1)
    X_test_1jet, y_test_1jet, weights_1jet_test = splitting_jets(
        X_test, y_test, weights_test, 1)

    #Let's construct the data for the case with 2 jets:
    X_2jets, y_train_2jets, empty_2 = splitting_jets(X, y_train, df_empty, 2)
    X_val_2jets, y_val_2jets, weights_2jets_val = splitting_jets(
        X_val, y_val, weights_val, 2)
    X_test_2jets, y_test_2jets, weights_2jets_test = splitting_jets(
        X_test, y_test, weights_test, 2)

    del empty_0, empty_1, empty_2

    #====================================================
    #     2-JETS DNN
    #====================================================

    #Scaling data:
    standard_2jets = StandardScaler()
    standard_2jets.fit(X_2jets)
    X_2jets_standard = standard_2jets.transform(X_2jets)
    X_val_2jets_standard = standard_2jets.transform(X_val_2jets)
    X_test_2jets_standard = standard_2jets.transform(X_test_2jets)

    #DNN:
    np.random.seed(42)
    DNN_2jets = make_model([64, 128, 64, 64, 32, 8], 'relu', 0.1, 'Adam', 'L2',
                           0.0001, X_2jets.shape[-1])

    early_stopping = EarlyStopping(monitor='val_accuracy',
                                   min_delta=0,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   baseline=None,
                                   restore_best_weights=True)

    history = DNN_2jets.fit(X_2jets_standard,
                            y_train_2jets,
                            batch_size=256,
                            epochs=50,
                            verbose=1,
                            validation_data=(X_val_2jets_standard,
                                             y_val_2jets),
                            callbacks=[early_stopping],
                            class_weight=None)

    y_pred_2jets_val = DNN_2jets.predict(X_val_2jets_standard)
    y_pred_2jets_test = DNN_2jets.predict(X_test_2jets_standard)

    del X_2jets_standard, X_val_2jets_standard, X_2jets, X_val_2jets, X_test_2jets_standard, X_test_2jets

    #====================================================
    #     1-JET DNN
    #====================================================

    #Scaling data:
    standard_1jet = StandardScaler()
    standard_1jet.fit(X_1jet)
    X_1jet_standard = standard_1jet.transform(X_1jet)
    X_val_1jet_standard = standard_1jet.transform(X_val_1jet)
    X_test_1jet_standard = standard_1jet.transform(X_test_1jet)

    #DNN:
    np.random.seed(42)
    DNN_1jet = make_model([64, 64, 64, 32, 8], 'relu', 0.1, 'Adagrad', 'L1',
                          0.0001, X_1jet.shape[-1])

    early_stopping = EarlyStopping(monitor='val_accuracy',
                                   min_delta=0,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   baseline=None,
                                   restore_best_weights=True)

    history = DNN_1jet.fit(X_1jet_standard,
                           y_train_1jet,
                           batch_size=256,
                           epochs=50,
                           verbose=1,
                           validation_data=(X_val_1jet_standard, y_val_1jet),
                           callbacks=[early_stopping],
                           class_weight=None)

    y_pred_1jet_val = DNN_1jet.predict(X_val_1jet_standard)
    y_pred_1jet_test = DNN_1jet.predict(X_test_1jet_standard)

    del X_1jet_standard, X_val_1jet_standard, X_1jet, X_val_1jet, X_test_1jet_standard, X_test_1jet

    #====================================================
    #     0-JET DNN
    #====================================================

    #Scaling data:
    standard_0jets = StandardScaler()
    standard_0jets.fit(X_0jets)
    X_0jets_standard = standard_0jets.transform(X_0jets)
    X_val_0jets_standard = standard_0jets.transform(X_val_0jets)
    X_test_0jets_standard = standard_0jets.transform(X_test_0jets)

    #DNN:
    np.random.seed(42)
    DNN_0jets = make_model([32, 64, 128, 64, 32, 8], 'elu', 0.1, 'Adagrad',
                           'L1', 0.0001, X_0jets.shape[-1])

    early_stopping = EarlyStopping(monitor='val_accuracy',
                                   min_delta=0,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   baseline=None,
                                   restore_best_weights=True)

    history = DNN_0jets.fit(X_0jets_standard,
                            y_train_0jets,
                            batch_size=256,
                            epochs=50,
                            verbose=1,
                            validation_data=(X_val_0jets_standard,
                                             y_val_0jets),
                            callbacks=[early_stopping],
                            class_weight=None)

    y_pred_0jets_val = DNN_0jets.predict(X_val_0jets_standard)
    y_pred_0jets_test = DNN_0jets.predict(X_test_0jets_standard)

    del X_0jets_standard, X_val_0jets_standard, X_0jets, X_val_0jets, X_test_0jets_standard, X_test_0jets

    #====================================================
    #     TOTAL AMS SCORE OF DNNs
    #====================================================

    #Total AMS score considering all the AMS of each subset:
    y_pred_DNN_val = np.concatenate(
        (y_pred_0jets_val, y_pred_1jet_val, y_pred_2jets_val))
    y_val_total = np.concatenate((y_val_0jets, y_val_1jet, y_val_2jets))
    weights_total_val = np.concatenate(
        (weights_0jets_val, weights_1jet_val, weights_2jets_val))

    y_pred_DNN_test = np.concatenate(
        (y_pred_0jets_test, y_pred_1jet_test, y_pred_2jets_test))
    y_test_total = np.concatenate((y_test_0jets, y_test_1jet, y_test_2jets))
    weights_total_test = np.concatenate(
        (weights_0jets_test, weights_1jet_test, weights_2jets_test))

    #====================================================
    #     COMBINING DNNs AND BDT AMS
    #====================================================

    dataset_blend_val = np.append(y_pred_DNN_val[:, 1].reshape(-1, 1),
                                  y_pred_BDT_val[:, 1].reshape(-1, 1),
                                  axis=1)
    dataset_blend_test = np.append(y_pred_DNN_test[:, 1].reshape(-1, 1),
                                   y_pred_BDT_test[:, 1].reshape(-1, 1),
                                   axis=1)
    blend = LogisticRegression(solver='lbfgs')
    blend.fit(dataset_blend_val, y_val_total[:, 1])
    blended_val = blend.predict_proba(dataset_blend_val)
    blended_test = blend.predict_proba(dataset_blend_test)

    #====================================================
    #     FINAL RESULTS
    #====================================================

    print('DNN:')
    plot_AMS(y_pred_DNN_test, y_test_total, weights_total_test)
    print('BDT:')
    plot_AMS(y_pred_BDT_test, y_test_total, weights_total_test)
    print('Combination:')
    plot_AMS(blended_test, y_test_total, weights_total_test)
    plt.legend(['DNN', 'BDT', 'DNN + BDT'])
    plt.ylim(2.8, )
    plt.savefig('AMS_total.png', dpi=300)
    plt.show()

    plot_distributions_final(blended_val, blended_test, y_val_total, 50, False,
                             weights_total_val, weights_total_test)
    plt.savefig('Final_distribution_unweighted.png', dpi=300)
    plt.show()

    plot_distributions_final(blended_val, blended_test, y_val_total, 50, True,
                             weights_total_val, weights_total_test)
    plt.savefig('Final_distribution_weighted.png', dpi=300)
    plt.show()