def test_invalid_classification_loss():
    """Binary cross-entropy must be rejected when the target has 3 classes."""
    expected_message = (
        "loss='binary_crossentropy' is not defined for multiclass "
        "classification with n_classes=3, use "
        "loss='categorical_crossentropy' instead"
    )
    features = np.zeros(shape=(3, 2))
    labels = np.arange(3)  # three distinct labels -> multiclass target
    estimator = HistGradientBoostingClassifier(loss="binary_crossentropy")

    with pytest.raises(ValueError, match=expected_message):
        estimator.fit(features, labels)
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    """Check that sklearn and LightGBM make (nearly) the same predictions.

    Same as test_same_predictions_regression but for classification. A
    single boosting iteration is fitted with both libraries on the same
    binary problem; the predicted labels must agree on more than 89% of
    the samples and the accuracies must match.
    """
    # Skip (rather than error out) when LightGBM is not installed.
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples, n_classes=2,
                               n_features=5, n_informative=5, n_redundant=0,
                               random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    # The held-out comparison is only made for small trees fitted on enough
    # samples.
    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
def test_early_stopping_classification(data, scoring, validation_fraction,
                                       n_iter_no_change, tol):
    """Early stopping must cut training short exactly when it is enabled."""
    max_iter = 50
    features, labels = data

    settings = dict(
        verbose=1,  # just for coverage
        min_samples_leaf=5,  # easier to overfit fast
        scoring=scoring,
        tol=tol,
        validation_fraction=validation_fraction,
        max_iter=max_iter,
        n_iter_no_change=n_iter_no_change,
        random_state=0,
    )
    classifier = HistGradientBoostingClassifier(**settings)
    classifier.fit(features, labels)

    if n_iter_no_change is None:
        # No early stopping: every iteration is run.
        assert classifier.n_iter_ == max_iter
    else:
        assert n_iter_no_change <= classifier.n_iter_ < max_iter
def test_binning_train_validation_are_separated():
    """Training and validation data must be binned separately.

    See issue 13926.
    """
    validation_fraction = .2
    rng = np.random.RandomState(0)

    classifier = HistGradientBoostingClassifier(
        n_iter_no_change=5,
        validation_fraction=validation_fraction,
        random_state=rng,
    )
    classifier.fit(X_classification, y_classification)
    train_bins = classifier.bin_mapper_.actual_n_bins_

    # Note that since the data is small there is no subsampling and the
    # random_state doesn't matter
    reference_mapper = _BinMapper(random_state=0)
    reference_mapper.fit(X_classification)
    whole_data_bins = reference_mapper.actual_n_bins_

    n_samples = X_classification.shape[0]
    expected_train_bins = int((1 - validation_fraction) * n_samples)

    assert np.all(train_bins == expected_train_bins)
    assert np.all(train_bins != whole_data_bins)
# -*- coding: utf-8 -*-
"""
Created on Sun May 3 03:14:13 2020

@author: Jie.Hu
"""


''' 9: Hist Gradient Boosting'''
# The experimental import is needed only for its side effect: it makes
# HistGradientBoostingClassifier importable from sklearn.ensemble.
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingClassifier

clf_hgb = HistGradientBoostingClassifier(validation_fraction=0.2,
                                         n_iter_no_change=20,
                                         tol=0.001,
                                         random_state=1337)

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1337)

# Evaluate the estimator defined in this section. The original code passed
# `clf_gb`, so the histogram-based model was never scored.
# Despite its name, `acc` holds ROC AUC scores (scoring='roc_auc').
acc = cross_val_score(estimator=clf_hgb,
                      X=X_train,
                      y=y_train,
                      cv=cv,
                      scoring='roc_auc')
acc.mean(), acc.std()

# KF & GS
#gmean_scorer = make_scorer(geometric_mean_score, greater_is_better=True)
parameters = {'learning_rate': [0.001, 0.01, 0.1],
              'max_depth': [3, 5, 7],
              'max_leaf_nodes': [11, 21, 31],
              'min_samples_leaf': [1, 3, 5],
              'max_iter': [100, 200, 400],
              'l2_regularization': [0.001, 0.01, 0.1]}
# Every column of `df` except the first is used as a feature, copied into a
# contiguous array.
data = np.ascontiguousarray(df.values[:, 1:])
# Hold out 20% of the rows for testing, with a fixed split.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=.2, random_state=0)

# Optionally keep only the first `subsample` training rows.
if subsample is not None:
    data_train, target_train = data_train[:subsample], target_train[:subsample]

n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records with {n_features} features.")

print("Fitting a sklearn model...")
# The timer starts before the estimator is built, so construction is
# included in the reported duration.
tic = time()
est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                     learning_rate=lr,
                                     max_iter=n_trees,
                                     max_bins=max_bins,
                                     max_leaf_nodes=n_leaf_nodes,
                                     n_iter_no_change=None,
                                     random_state=0,
                                     verbose=1)
est.fit(data_train, target_train)
toc = time()
predicted_test = est.predict(data_test)
predicted_proba_test = est.predict_proba(data_test)
# Column 1 of predict_proba is used as the score for ROC AUC
# (presumably the probability of the positive class -- confirm).
roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
acc = accuracy_score(target_test, predicted_test)
print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")

if args.lightgbm:
    print("Fitting a LightGBM model...")
    tic = time()
    # Build a LightGBM estimator from the sklearn estimator `est`.
    lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
def test_should_stop(scores, n_iter_no_change, tol, stopping):
    """`_should_stop` must return the expected decision for `scores`."""
    estimator = HistGradientBoostingClassifier(
        tol=tol,
        n_iter_no_change=n_iter_no_change,
    )
    decision = estimator._should_stop(scores)
    assert decision == stopping
# %% # Native support for missing values for gradient boosting # ------------------------------------------------------- # # The :class:`ensemble.HistGradientBoostingClassifier` # and :class:`ensemble.HistGradientBoostingRegressor` now have native # support for missing values (NaNs). This means that there is no need for # imputing data when training or predicting. from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier X = np.array([0, 1, 2, np.nan]).reshape(-1, 1) y = [0, 0, 1, 1] gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y) print(gbdt.predict(X)) # %% # Precomputed sparse nearest neighbors graph # ------------------------------------------ # Most estimators based on nearest neighbors graphs now accept precomputed # sparse graphs as input, to reuse the same graph for multiple estimator fits. # To use this feature in a pipeline, one can use the `memory` parameter, along # with one of the two new transformers, # :class:`neighbors.KNeighborsTransformer` and # :class:`neighbors.RadiusNeighborsTransformer`. The precomputation # can also be performed by custom estimators to use alternative # implementations, such as approximate nearest neighbors methods. # See more details in the :ref:`User Guide <neighbors_transformer>`.
'race', 'native-country', 'sex'
]

# Collect the observed values of each categorical column from `data` and hand
# them to the encoder explicitly, instead of letting the encoder infer them.
categories = [data[column].unique()
              for column in data[categorical_columns]]

categorical_preprocessor = OrdinalEncoder(categories=categories)

# Only the categorical columns are encoded; all other columns are passed
# through unchanged.
preprocessor = ColumnTransformer(
    [('cat-preprocessor', categorical_preprocessor, categorical_columns)],
    remainder='passthrough', sparse_threshold=0)

# The experimental import is kept for its side effect only.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import make_pipeline

model = make_pipeline(preprocessor,
                      HistGradientBoostingClassifier(random_state=42))

# %% [markdown]
# TODO: write your solution here
#
# Use the previously defined model (called `model`) and using two nested `for`
# loops, make a search of the best combinations of the `learning_rate` and
# `max_leaf_nodes` parameters. In this regard, you will need to train and test
# the model by setting the parameters. The evaluation of the model should be
# performed using `cross_val_score`. We can propose to define the following
# parameters search:
# - `learning_rate` for the values 0.01, 0.1, and 1;
# - `max_leaf_nodes` for the values 5, 25, 45.
# Stack the two feature sets along the sample axis, then flatten each sample
# (axes 1 and 2) into one feature vector.
x = np.concatenate([f_ds, m_ds], 0)
x = x.reshape(x.shape[0], x.shape[1]*x.shape[2])
y = np.concatenate([f_lb, m_lb], 0)
print(x.shape)  # (2141, 110336)
print(y.shape)  # (2141,)

# Preprocessing
x_train, x_test, y_train, y_test = train_test_split(
    x, y, shuffle=True, test_size=0.2, random_state=42)
print(x_train.shape)  # (1712, 110336)
print(x_test.shape)   # (429, 110336)
print(y_train.shape)  # (1712,)
print(y_test.shape)   # (429,)

# Build the model
model = HistGradientBoostingClassifier(verbose=1)
model.fit(x_train, y_train)

# model & weight save
# with open('E:/nmb/nmb_data/cp/m03_mels_HistGradientBoostingClassifier.data', 'wb') as fh:  # wb : write
#     pickle.dump(model, fh)
# print("== save complete ==")

# model load
# NOTE(review): this replaces the model fitted just above with the pickled
# one, so the fit is redundant while loading is enabled -- confirm which of
# the two is wanted.
# The file is opened in a `with` block so the handle is always closed.
with open('E:/nmb/nmb_data/cp/m03_mels_HistGradientBoostingClassifier.data', 'rb') as fh:  # rb : read
    model = pickle.load(fh)
# time >> 0:30:49.704071

# evaluate
y_pred = model.predict(x_test)
# print(y_pred[:100])
# print(y_pred[100:])
def ClassificationModelDictionary():
    """Build the catalogue of candidate classifiers and their search grids.

    Returns:
        dict: maps a 1-based integer index to one entry per classifier. Each
        entry is a dict with the keys ``name`` (estimator class name),
        ``model`` (a fresh, unfitted estimator), ``parameters`` (parameter
        grid), ``best_parameters`` (empty dict, to be filled by the caller)
        and ``cv_params`` (second, usually smaller, parameter grid).
    """
    LR = dict(name='LogisticRegression',
              model=LogisticRegression(),
              parameters={
                  "penalty": ['l1', 'l2'],
                  'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
              },
              best_parameters={},
              cv_params={
                  'penalty': ['l1', 'l2'],
                  'random_state': [0, 8]
              })

    DT = dict(name='DecisionTreeClassifier',
              model=DecisionTreeClassifier(),
              parameters={
                  'criterion': ['gini', 'entropy'],
                  'splitter': ['best', 'random'],
                  'max_depth': [None, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                  'max_features': ['auto', 'log2', None],
                  'random_state': [8],
                  'min_samples_leaf': [1, 2, 3, 4, 5]
              },
              best_parameters={},
              cv_params={
                  'criterion': ['gini', 'entropy'],
                  'splitter': ['best'],
                  'max_features': ['auto', 'log2', None],
                  'random_state': [0, 8]
              })

    KNN = dict(
        name='KNeighborsClassifier',
        model=KNeighborsClassifier(),
        parameters={
            'n_neighbors': list(range(1, 25)),
            'p': [1, 2]  # 1 = manhattan, 2 = euclidean
        },
        best_parameters={},
        # NOTE(review): 'priors' and 'var_smoothing' are the GaussianNB
        # entries below; they look copy-pasted here. Left unchanged because
        # the consumer of cv_params is not visible -- confirm and replace
        # with KNeighborsClassifier parameters.
        cv_params={
            'priors': [None],
            'var_smoothing': [1e-09]
        })

    GNB = dict(name='GaussianNB',
               model=GaussianNB(),
               parameters={
                   'priors': [None],
                   'var_smoothing': [1e-09]
               },
               best_parameters={},
               cv_params={
                   'priors': [None],
                   'var_smoothing': [1e-09]
               })

    BNB = dict(name='BernoulliNB',
               model=BernoulliNB(),
               parameters={
                   'alpha': [1.0],
                   'binarize': [0.0],
                   'fit_prior': [True, False],
                   'class_prior': [None]
               },
               best_parameters={},
               cv_params={
                   'alpha': [1.0],
                   'binarize': [0.0],
                   'fit_prior': [True, False],
                   'class_prior': [None]
               })

    RF = dict(name='RandomForestClassifier',
              model=RandomForestClassifier(),
              parameters={
                  'max_depth': [2, 3, 4],
                  'bootstrap': [True, False],
                  'max_features': ['auto', 'sqrt', 'log2', None],
                  'criterion': ['gini', 'entropy'],
                  'random_state': [8]
              },
              best_parameters={},
              cv_params={
                  'max_depth': [2, 3, 4],
                  'bootstrap': [True, False],
                  'max_features': ['auto', 'sqrt', 'log2', None],
                  'criterion': ['gini', 'entropy'],
                  'random_state': [8]
              })

    SVM = dict(
        name='SVC',
        model=SVC(),
        # The original literal repeated the 'C' and 'kernel' keys. In a dict
        # literal the last value wins, so the ['linear', 'rbf'] kernel list
        # was silently discarded. The grid below is the value that was
        # actually in effect, written without duplicate keys.
        # NOTE(review): if a separate linear-kernel grid was intended, it
        # has to be a second dict in a list of grids -- confirm with the
        # consumer.
        parameters={
            'C': [1, 10, 100, 500, 1000],
            'kernel': ['rbf'],
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            #'degree': [2,3,4,5,6] , 'C':[1,10,100,500,1000] , 'kernel':['poly']
        },
        best_parameters={},
        cv_params={
            'C': [1, 10, 100, 500, 1000],
            'kernel': ['rbf'],
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
        })

    BAG_params = {
        'base_estimator': [
            DecisionTreeClassifier(),
            DecisionTreeClassifier(max_depth=2),
            DecisionTreeClassifier(max_depth=4),
            BernoulliNB(),
            LogisticRegression(penalty='l1'),
            LogisticRegression(penalty='l2'),
        ],  #GaussianNB(),],
        'n_estimators': [10],
        'max_samples': [1.0],
        'max_features': [1.0],
        'bootstrap': [True],
        'bootstrap_features': [False],
        'oob_score': [False],
        #'warm_start': [False],
        'n_jobs': [None],
        'random_state': [8],
        'verbose': [0]
    }
    BAG = dict(name='BaggingClassifier',
               model=BaggingClassifier(),
               parameters=BAG_params,
               best_parameters={},
               cv_params={
                   'base_estimator': [
                       DecisionTreeClassifier(criterion='gini'),
                       DecisionTreeClassifier(criterion='entropy'),
                       BernoulliNB(),
                       LogisticRegression(penalty='l1'),
                       LogisticRegression(penalty='l2')
                   ],
                   'bootstrap': [True],
                   'random_state': [0, 8]
               })

    GB = dict(
        name='GradientBoostingClassifier',
        model=GradientBoostingClassifier(),
        parameters={
            'loss': ['deviance', 'exponential'],
            'learning_rate': [0.1, 0.01, 1.0],
            'n_estimators': [100, 200, 25, 50, 75],
            # < 1.0 leads to reduction of variance and increase in bias
            # < 1.0 results in Stochastic Gradient Boosting
            'subsample': [1.0, 0.75, 0.5, 0.25, 0.01],
            'random_state': [8],
            #'ccp_alpha': [0.0,0.0001,0.001,0.01,0.1,1.0]# only in version 0.22
            #cost-complexity pruning algorithm to prune tree to avoid over fitting
            #'min_samples_split':[2,3,4],
            #'min_samples_leaf':[1,2,3],
            #'min_weight_fraction_leaf':[0],
            #'max_depth':[3,4,5],
            #'min_impurity_decrease':[0],
            #'init':[None],
            #'max_features':[None],
            #'verbose':[0],
        },
        best_parameters={},
        cv_params={
            'loss': ['deviance', 'exponential'],
            'n_estimators': [100],
            'random_state': [0, 8]
        })

    ADA = dict(
        name='AdaBoostClassifier',
        model=AdaBoostClassifier(),
        parameters={
            'base_estimator': [
                DecisionTreeClassifier(max_depth=1),
                DecisionTreeClassifier(max_depth=2),
                DecisionTreeClassifier(max_depth=3),
                DecisionTreeClassifier(max_depth=4),
                BernoulliNB(),
                #GaussianNB(),
            ],
            'n_estimators': [25, 50, 75, 100],  # ,100
            'learning_rate': [1.0, 0.1],
            #'alogorithm':['SAMME', 'SAMME.R'],
            'random_state': [8],
        },
        best_parameters={},
        cv_params={
            'base_estimator': [
                None,
                DecisionTreeClassifier(criterion='gini'),
                DecisionTreeClassifier(criterion='entropy'),
                BernoulliNB(),
                LogisticRegression(penalty='l1'),
                LogisticRegression(penalty='l2')
            ],
            'random_state': [0, 8]
        })

    XGB_params = {
        'max_depth': [3],
        'learning_rate': [0.1],
        'n_estimators': [100],  #50,150,200],
        'verbosity': [1],
        'objective': ['binary:logistic'],
        'booster': ['gbtree', 'gblinear', 'dart'],  # IMPORTANT
        'tree_method': ['auto', 'exact', 'approx', 'hist'],  #, 'gpu_hist' # IMPORTANT
        'n_jobs': [1],
        'gamma': [0],
        'min_child_weight': [1],
        'max_delta_step': [0],
        'subsample': [1],
        'colsample_bytree': [1],
        'colsample_bylevel': [1],
        'colsample_bynode': [1],
        'reg_alpha': [0],
        'reg_lambda': [1],
        'scale_pos_weight': [1],
        'base_score': [0.5],
        'random_state': [8],
        'missing': [None]
    }
    XGB = dict(name='XGBClassifier',
               model=XGBClassifier(),
               parameters=XGB_params,
               best_parameters={},
               cv_params={
                   'tree_method': ['auto', 'exact', 'approx', 'hist'],
                   'booster': ['gbtree', 'gblinear', 'dart'],
                   'random_state': [0, 8]
               })

    LGBM_params = {
        'boosting_type': ['gbdt', 'goss'],  # ,'dart','rf'
        'num_leaves': [31],
        'max_depth': [-1],
        'learning_rate': [0.1],
        'n_estimators': [100],
        'subsample_for_bin': [200000],
        'objective': [None],
        'class_weight': [None],
        'min_split_gain': [0.0],
        'min_child_weight': [0.001],
        'min_child_samples': [20],
        'subsample': [1.0],
        'subsample_freq': [0],
        'colsample_bytree': [1.0],
        'reg_alpha': [0.0],
        'reg_lambda': [0.0],
        'random_state': [8],
        'n_jobs': [-1],
        'silent': [True],
        'importance_type': ['split']
    }
    LGBM = dict(name='LGBMClassifier',
                model=LGBMClassifier(),
                parameters=LGBM_params,
                best_parameters={},
                cv_params={
                    'boosting_type': ['gbdt', 'goss'],
                    'random_state': [0, 8]
                })

    HGB_params = {
        'loss': ['auto', 'binary_crossentropy'],  # 'categorical_crossentropy'
        'learning_rate': [0.1],
        'max_iter': [100],
        'max_leaf_nodes': [31],
        'max_depth': [None],
        'min_samples_leaf': [20],
        'l2_regularization': [0, 1, 2],  # 0 for no regularization
        'max_bins': [255],
        #'warm_start': [False],
        'scoring': [None],
        'validation_fraction': [0.1],
        'n_iter_no_change': [None],
        'tol': [1e-07],
        'verbose': [0],
        'random_state': [8]
    }
    HGB = dict(name='HistGradientBoostingClassifier',
               model=HistGradientBoostingClassifier(),
               parameters=HGB_params,
               best_parameters={},
               cv_params={
                   'loss': ['auto', 'binary_crossentropy'],
                   'l2_regularization': [0, 1, 2],
                   'random_state': [0, 8]
               })

    # Number the entries from 1 in the order listed here.
    models = dict(
        enumerate(
            [LR, DT, KNN, GNB, BNB, RF, SVM, BAG, GB, ADA, XGB, LGBM, HGB],
            start=1))
    return models
args = parser.parse_args()

n_trees = args.n_trees

df = load_data()
# Column 0 of the loaded frame is the target; the remaining columns are the
# features, copied into a contiguous array.
target = df.values[:, 0]
data = np.ascontiguousarray(df.values[:, 1:])
# Hold out 20% of the rows for testing, with a fixed split.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=.2, random_state=0)

n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records with {n_features} features.")

# n_iter_no_change=None is passed so that exactly `n_trees` iterations are
# requested (presumably this disables early stopping -- confirm for the
# installed sklearn version).
est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                     max_iter=n_trees,
                                     n_iter_no_change=None,
                                     random_state=0,
                                     verbose=1)

if args.library == 'sklearn':
    print("Fitting a sklearn model...")
    # Only the call to fit() is timed.
    tic = time()
    est.fit(data_train, target_train)
    toc = time()
    predicted_test = est.predict(data_test)
    predicted_proba_test = est.predict_proba(data_test)
    # Column 1 of predict_proba is used as the score for ROC AUC.
    roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
    acc = accuracy_score(target_test, predicted_test)
    print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, "
          f"ACC: {acc :.4f}")
else:
    # Use the inputs as given.
    X = X_
    y = y_

# TODO convert into arguments
n_components = 0.95
inner_cv = 5
max_iter = 300
class_weight = None

# Candidate estimators.
lrcv = LogisticRegressionCV(cv=inner_cv,
                            scoring="accuracy",
                            n_jobs=1,
                            class_weight=class_weight,
                            random_state=random_state,
                            max_iter=max_iter)
gbm = GradientBoostingClassifier(n_estimators=100)
hgbm = HistGradientBoostingClassifier(max_iter=100)

print("Samples: ", str(n_samples))
print("Features: ", str(n_features))
print("Informative: ", str(n_informative))

# convert single clf argument to list, if only one was passed
# NOTE(review): an exact type check -- a list subclass would be wrapped in
# another list; confirm whether isinstance() is wanted here.
if type(clf) is not list:
    clf = [clf]

for clf_ in clf:
    print("Evaluating classifier: ", clf_)
    if add_pca:
        pipe = make_pipeline(
            StandardScaler(),
# NOTE(review): this fragment starts inside a generator whose header is not
# visible; the original indentation of these first statements is unknown.
# Attach the benchmark configuration to the result record before yielding it.
res["max_depth"] = max_depth
res["n_estimators"] = n_estimators
res["n_features"] = n_features
if verbose:
    pprint(res)
yield res
# Collect the measurements produced for the model `rf`.
compilation.extend(list(measure_onnx_runtime(rf, X_test)))

########################################
# HistGradientBoostingClassifier
# ++++++++++++++++++++++++++++++

hist = HistGradientBoostingClassifier(
    max_iter=n_estimators, max_depth=max_depth)
print('train')
# train_cache returns the estimator that is measured below (presumably
# fitted or loaded from a cache -- confirm in train_cache).
hist = train_cache(hist, X_train, y_train, max_depth, n_estimators, n_classes)

compilation.extend(list(measure_onnx_runtime(hist, X_test)))

########################################
# LightGBM
# ++++++++

lgb = LGBMClassifier(n_estimators=n_estimators,
                     max_depth=max_depth, pred_early_stop=False)
print('train')
lgb = train_cache(lgb, X_train, y_train, max_depth, n_estimators, n_classes)

compilation.extend(list(measure_onnx_runtime(lgb, X_test)))
def Gridsearchcv(X_train, X_test, y_train, y_test):
    """Randomized model and hyper-parameter search, then a test-set report.

    A pipeline made of a column preprocessor and a switchable classifier is
    tuned with RandomizedSearchCV (60 candidates, repeated stratified 5-fold
    CV, refit on balanced accuracy). The refitted best estimator is then
    evaluated on the held-out test data.

    Returns:
        tuple: ``(grid, df_grid, report)`` -- the fitted search object, the
        CV results sorted by mean balanced accuracy (selected columns only)
        and the classification report as a DataFrame.
    """
    # Min-max scale the columns matching 'EDAD'; pass the rest through.
    scaling_steps = Pipeline(steps=[('scaler', MinMaxScaler())])
    preprocessor = ColumnTransformer(
        remainder='passthrough',
        transformers=[
            ('num', scaling_steps, make_column_selector(pattern='EDAD')),
        ])

    # Classifier step that switches between the candidate models.
    model_switch = PipelineHelper([
        ('svc', SVC()),
        ('gb', GradientBoostingClassifier()),
        ('xgb', XGBClassifier(use_label_encoder=False)),
        ('eec', EasyEnsembleClassifier()),
        ('rbc', RUSBoostClassifier()),
        ('bbc', BalancedBaggingClassifier()),
        ('brf', BalancedRandomForestClassifier()),
    ])
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', model_switch),
    ])

    per_model_grid = {
        # EasyEnsembleClassifier
        'eec__n_estimators': [10, 25, 50, 100],
        'eec__warm_start': [False, True],
        'eec__replacement': [False, True],
        # RUSBoostClassifier
        'rbc__algorithm': ['SAMME', 'SAMME.R'],
        'rbc__n_estimators': [10, 50, 100, 200, 500],
        'rbc__learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        # BalancedBaggingClassifier
        'bbc__base_estimator': [HistGradientBoostingClassifier(), None],
        'bbc__n_estimators': [10, 50, 100, 200, 500, 750, 1000],
        'bbc__max_samples': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'bbc__max_features': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        # BalancedRandomForestClassifier
        'brf__criterion': ['gini', 'entropy'],
        'brf__n_estimators': [
            int(x) for x in np.linspace(start=20, stop=200, num=5)
        ],
        'brf__max_depth': [int(x) for x in np.linspace(1, 45, num=3)],
        'brf__min_samples_split': range(2, 10),
        'brf__min_samples_leaf': [1, 3, 5, 10],
        'brf__max_features': ['auto', 'sqrt', 'log2'],
        # SVC
        'svc__C': [0.1, 0.5, 1, 10, 30, 40, 50, 75, 100, 500, 1000],
        'svc__gamma': [0.0001, 0.001, 0.005, 0.01, 0.05, 0.07, 0.1, 0.5, 1,
                       5, 10, 50],
        'svc__kernel': ['rbf'],
        # GradientBoostingClassifier (3780 combinations)
        "gb__learning_rate": [0.0001, 0.001, 0.01, 0.025, 0.05, 0.075, 0.1,
                              0.15, 0.2],
        "gb__max_depth": [3, 7, 8, 9, 10, 50],
        "gb__max_features": ["log2", "sqrt"],
        "gb__subsample": [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
        "gb__n_estimators": [10, 50, 100, 200, 300],
        # XGBClassifier
        'xgb__learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        'xgb__min_child_weight': np.arange(1, 21, 5),
        'xgb__subsample': np.arange(0.05, 1.01, 0.05),
        'xgb__verbosity': [0],
        # Alternative XGBoost settings tried earlier, kept for reference:
        # 'xgb__booster': ['gbtree', 'gblinear' ,'dart'],
        # 'xgb__min_child_weight': range(1, 21, 5),
        # 'xgb__min_child_weight': range(1, 21),
        # 'xgb__max_depth': [15,20,25],
        # 'xgb__max_depth': range(1, 11),
        # 'xgb__n_estimators': [100],
        # 'xgb__n_estimators': [400, 700, 1000],
        # 'xgb__colsample_bytree': [0.7, 0.8],
        # 'xgb__reg_alpha': [1.1, 1.2, 1.3],
        # 'xgb__reg_lambda': [1.1, 1.2, 1.3],
        # 'xgb__subsample': [0.7, 0.8, 0.9],
        # 'xgb__eval_metric' : ['mlogloss']
    }
    params = {'clf__selected_model': model_switch.generate(per_model_grid)}

    scoring = {
        'ba': 'balanced_accuracy',
        'ap': 'average_precision',
        'F1': 'f1',
        'ra': 'roc_auc',
        'rc': 'recall',
    }

    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3)
    #cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5)

    #https://towardsdatascience.com/hyper-parameter-tuning-with-randomised-grid-search-54f865d27926
    #n_iter: 30,60, 100
    grid = RandomizedSearchCV(
        pipe,
        params,
        refit='ba',
        cv=cv,
        verbose=3,
        n_jobs=-1,
        n_iter=60,
        scoring=scoring,
        return_train_score=True,
    )
    grid.fit(X_train, y_train)

    # Rank the candidates by balanced accuracy and keep the summary columns.
    summary_columns = [
        'param_clf__selected_model', 'params',
        'mean_fit_time', 'std_fit_time',
        'mean_test_ba', 'std_test_ba', 'rank_test_ba',
        'mean_test_ap', 'std_test_ap', 'rank_test_ap',
        'mean_test_ra', 'std_test_ra', 'rank_test_ra',
        'mean_test_F1', 'std_test_F1', 'rank_test_F1',
    ]
    ranked = pd.DataFrame(grid.cv_results_).sort_values(
        by=['mean_test_ba'], ascending=False)
    df_grid = ranked[summary_columns]

    print("Best-Fit Parameters From Training Data:\n", grid.best_params_)

    # Evaluate the refitted best estimator on the held-out test set.
    grid_predictions = grid.best_estimator_.predict(X_test)
    report_dict = classification_report(y_test, grid_predictions,
                                        output_dict=True)
    report = pd.DataFrame(report_dict).transpose()
    print(report)
    print(confusion_matrix(y_test, grid_predictions))

    return grid, df_grid, report
# Create the min-max scaler and apply it to our parameters. Drop all unneeded
# columns and store the column to be predicted as our y.
X = df.drop(columns=[
    'Unnamed: 0', 'id', 'title', 'category', 'subcategory', 'blurb', 'launch',
    'deadline', 'state', 'city', 'backers', 'pledged', 'ongoing', 'location',
    'success'
])
columns = X.columns
X = pd.DataFrame(X, columns=columns)
y = df['success']

# Set up 10-fold cross-validation and fit the scaler on the full feature set.
kf = KFold(n_splits=10)
scaler = MinMaxScaler()
scaler.fit(X)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

# Create model, train it, and test it.
model = HistGradientBoostingClassifier()
#learning_rate=0.1, loss='binary_crossentropy', max_bins=255, max_depth=3, max_iter=100, max_leaf_nodes=31, min_samples_leaf=10)
#hyperparameters had slightly lower results of 71.96 average accuracy

# Fit on the *scaled* features. The original fitted on raw X, while both the
# cross-validated pipeline and the pickled scaler imply that the model
# receives min-max scaled inputs at prediction time.
# NOTE(review): confirm that the code loading these pickles applies
# scaler.transform() before model.predict().
model.fit(scaler.transform(X), y)

# cross_val_score clones the pipeline, so the fitted `model` and `scaler`
# above are not modified by the evaluation.
pipeline = Pipeline([('scaler', scaler), ('HGB Classifier', model)])
score = cross_val_score(pipeline, X, y, cv=kf, scoring='accuracy').mean()
print(score)

# pickle the model for future use; always close the files, even if a dump
# fails.
try:
    pkl.dump(model, file1)
    pkl.dump(encoder, file2)
    pkl.dump(scaler, file3)
finally:
    file1.close()
    file2.close()
    file3.close()
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    """Compare sklearn and LightGBM predictions on a binary problem.

    Same as test_same_predictions_regression but for classification. After a
    single boosting iteration the two libraries must agree on more than 89%
    of the predicted labels and reach the same accuracy.
    """
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    n_classes = 2
    max_bins = 255
    max_iter = 1

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=n_classes,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        binner = _BinMapper(n_bins=max_bins + 1)
        X = binner.fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="log_loss",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(
        est_sklearn, lib="lightgbm", n_classes=n_classes
    )

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated an numerical data, not pre-binned data.
    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)

    # Training data: predictions agree and accuracies match to the default
    # precision.
    train_lightgbm = est_lightgbm.predict(X_train)
    train_sklearn = est_sklearn.predict(X_train)
    assert np.mean(train_sklearn == train_lightgbm) > 0.89
    np.testing.assert_almost_equal(
        accuracy_score(y_train, train_lightgbm),
        accuracy_score(y_train, train_sklearn),
    )

    if not (max_leaf_nodes < 10 and n_samples >= 1000):
        return

    # Held-out data: only compared for small trees on enough samples, with a
    # looser accuracy tolerance.
    test_lightgbm = est_lightgbm.predict(X_test)
    test_sklearn = est_sklearn.predict(X_test)
    assert np.mean(test_sklearn == test_lightgbm) > 0.89
    np.testing.assert_almost_equal(
        accuracy_score(y_test, test_lightgbm),
        accuracy_score(y_test, test_sklearn),
        decimal=2,
    )
def cat(X):
    """Return a boolean mask of the columns of ``X`` with dtype "category".

    Passed to the ColumnTransformer below as the column selector of the
    categorical branch.
    """
    return X.dtypes == "category"


# Categorical branch: impute with the most frequent value, one-hot encode
# (categories unseen during fit are ignored), then apply TruncatedSVD.
cat_imp = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse=False),
    TruncatedSVD(),
)

# The "cont" branch passes its columns through unchanged (identity function).
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", FunctionTransformer(lambda x: x, validate=False), cont)])
clf = sklearn.pipeline.Pipeline(steps=[
    ("transform", ct),
    ("estimator", HistGradientBoostingClassifier()),
])

suite = openml.study.get_suite(1)
# We'll create a study with one run on three random datasets each
tasks = np.random.choice(suite.tasks, size=3, replace=False)
run_ids = []
for task_id in tasks:
    task = openml.tasks.get_task(task_id)
    run = openml.runs.run_model_on_task(clf, task)
    # Side effect: the run is published before its id is recorded.
    run.publish()
    run_ids.append(run.run_id)

# The study needs a machine-readable and unique alias. To obtain this,
# we simply generate a random uuid.
alias = uuid.uuid4().hex
def test_should_stop(scores, n_iter_no_change, tol, stopping):
    """The early-stopping decision for `scores` must equal `stopping`."""
    stopping_config = {"n_iter_no_change": n_iter_no_change, "tol": tol}
    booster = HistGradientBoostingClassifier(**stopping_config)
    assert booster._should_stop(scores) == stopping
g_nb = GaussianNB()
knn = KNeighborsClassifier(
)  # Parameters: n_neighbors (number of neighbors, default 5), weights (default 'uniform'), leaf_size (default 30)
ran_for = RandomForestClassifier()
# n_estimators: number of trees; max_depth: maximum depth, used for pruning -- anything deeper is cut off.
# min_samples_leaf: used together with max_depth; after a split every child node must contain at least min_samples_leaf training samples
# bootstrap: resample the original data to produce new data; sampling is uniform and with replacement
log_reg = LogisticRegression(
)  # penalty: penalty function (default L2); C: inverse of regularization strength, default 1.0; solver: default 'lbfgs'; saga can be used with every penalty
tree = DecisionTreeClassifier()
xgb = XGBClassifier()  # https://www.itread01.com/content/1536594984.html parameter details
ada_boost = AdaBoostClassifier(
)  # https://ask.hellobi.com/blog/zhangjunhong0428/12405 parameter details
grad_boost = GradientBoostingClassifier(
    n_estimators=100)  # https://www.itread01.com/content/1514358146.html parameter details
hist_grad_boost = HistGradientBoostingClassifier(
)  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html

# Parameter settings for training the models.
# Each entry is (display name, estimator instance, parameter grid); an empty
# grid means the estimator is used with the settings it was created with.
clf = [("Naive Bayes", g_nb, {}), \
       ("K Nearest", knn, {"n_neighbors": [3, 5, 6, 7, 8, 9, 10], "leaf_size": [25, 30, 35]}), \
       ("Random Forest", ran_for, {"n_estimators": [10, 50, 100, 200, 400], "max_depth": [3, 10, 20, 40], "random_state": [99], "min_samples_leaf": [5, 10, 20, 40, 50], "bootstrap": [False]}), \
       ("Logistic Regression", log_reg, {"penalty": ['l2'], 'max_iter':[10, 20],"C": [100, 10, 1.0, 0.1, 0.01], "solver": ['saga']}), \
       \
       ("Decision Tree", tree, {}), \
       ("XGBoost", xgb, {"n_estimators": [200], "max_depth": [3, 4, 5], "learning_rate": [.01, .1, .2], "subsample": [.8], "colsample_bytree": [1], "gamma": [0, 1, 5], "lambda": [.01, .1, 1]}), \
       \
       ("Adapative Boost", ada_boost, {"n_estimators": [100], "learning_rate": [.6, .8, 1]}), \
def test_categorical_encoding_strategies():
    """Compare native categorical splits with ordinal and one-hot encodings.

    Native categorical handling needs only one split to predict a simple
    dataset perfectly. One-hot encoded data needs more depth / splits, and
    treating the categories as ordered (plain OrdinalEncoder output) needs
    even more.
    """

    def mean_cv_score(estimator, data):
        # Mean cross-validated score of `estimator` on `data` against `y`.
        return cross_val_score(estimator, data, y).mean()

    # dataset with one random continuous feature, and one categorical feature
    # with values in [0, 5], e.g. from an OrdinalEncoder.
    # class == 1 iff categorical value in {0, 2, 4}
    rng = np.random.RandomState(0)
    n_samples = 10_000
    continuous = rng.rand(n_samples)
    categorical = rng.randint(6, size=n_samples)
    X = np.c_[continuous, categorical]
    y = (X[:, 1] % 2 == 0).astype(float)

    # make sure dataset is balanced so that the baseline_prediction doesn't
    # influence predictions too much with max_iter = 1
    assert 0.49 < y.mean() < 0.51

    # Using native categorical encoding, we get perfect predictions with just
    # one split
    clf_cat = HistGradientBoostingClassifier(
        max_iter=1, max_depth=1, categorical_features=[False, True])
    assert mean_cv_score(clf_cat, X) == 1

    # quick sanity check for the bitset: 0, 2, 4 = 2**0 + 2**2 + 2**4 = 21
    expected_left_bitset = [21, 0, 0, 0, 0, 0, 0, 0]
    fitted = clf_cat.fit(X, y)
    left_bitset = fitted._predictors[0][0].raw_left_cat_bitsets[0]
    assert_array_equal(left_bitset, expected_left_bitset)

    # Treating categories as ordered, we need more depth / more splits to get
    # the same predictions
    clf_no_cat = HistGradientBoostingClassifier(
        max_iter=1, max_depth=4, categorical_features=None)
    assert mean_cv_score(clf_no_cat, X) < .9

    clf_no_cat.set_params(max_depth=5)
    assert mean_cv_score(clf_no_cat, X) == 1

    # Using OHEd data, we need less splits than with pure OEd data, but we
    # still need more splits than with the native categorical splits
    one_hot = make_column_transformer((OneHotEncoder(sparse=False), [1]),
                                      remainder='passthrough')
    X_ohe = one_hot.fit_transform(X)

    clf_no_cat.set_params(max_depth=2)
    assert mean_cv_score(clf_no_cat, X_ohe) < .9

    clf_no_cat.set_params(max_depth=3)
    assert mean_cv_score(clf_no_cat, X_ohe) == 1
class GradientBoostingMsgClassifierModel(h1.Model):
    """Message-level classifier backed by ``HistGradientBoostingClassifier``.

    Rows whose ``Label`` equals ``"Tx"`` form the positive class. At
    prediction time the model is only applied to messages that fall inside
    windows already flagged by an upstream event-detection step.
    """

    def load_data(self, num_files=None):
        """Return the result of ``util.load_data`` with shuffling enabled."""
        return util.load_data(num_files, shuffle=True)

    def prep_data(self, data):
        """Load the train and test attack CSV files into two DataFrames.

        Reads ``data["train_attack_files"]`` and ``data["test_attack_files"]``
        and returns a dict with keys ``"train_attack_df"`` and
        ``"test_attack_df"``. The row counts are printed as a side effect.
        """
        column_names = [
            'Timestamp',
            'Label',
            'CarSpeed',
            'SteeringAngle',
            'YawRate',
            'Gx',
            'Gy',
        ]

        def read_one(path):
            # Rename the columns positionally, then let the shared helper
            # post-process the frame.
            frame = pd.read_csv(path)
            frame.columns = list(column_names)
            return util.compute_timediff_fillna(frame)

        def concat_processed_files(files):
            return pd.concat([read_one(path) for path in files])

        train_df = concat_processed_files(data["train_attack_files"])
        test_df = concat_processed_files(data["test_attack_files"])
        print("len train_attack_df = %s" % len(train_df))
        print("len test_attack_df = %s" % len(test_df))
        return {"train_attack_df": train_df, "test_attack_df": test_df}

    def train(self, prepared_data):
        """Fit the classifier on the training frame and store it on ``self.model``."""
        train_df = prepared_data["train_attack_df"]
        # The experimental import is kept: older scikit-learn releases need it
        # before the estimator can be imported.
        from sklearn.experimental import enable_hist_gradient_boosting
        from sklearn.ensemble import HistGradientBoostingClassifier

        features = train_df[FEATURES]
        is_tx = train_df.Label == "Tx"
        classifier = HistGradientBoostingClassifier(max_iter=500)
        self.model = classifier.fit(features, is_tx)

    def evaluate(self, prepared_data):
        """Print and store the confusion matrix and accuracy on the test frame."""
        test_df = prepared_data["test_attack_df"]
        predicted = self.model.predict(test_df[FEATURES])
        import sklearn.metrics

        actual = test_df.Label == "Tx"
        cf = sklearn.metrics.confusion_matrix(actual, predicted)
        acc = sklearn.metrics.accuracy_score(actual, predicted)
        print(cf)
        print("Accuracy = %.4f" % acc)
        self.metrics = {"confusion_matrix": cf, "accuracy": acc}

    def predict(self, data):
        """Label individual messages inside windows flagged as under attack.

        Works on a copy of ``data["df"]``. Adds ``MsgIsAttack`` and
        ``WindowInAttack`` columns (both 0 by default) and fills them for the
        rows of every flagged window in ``data["event_detection_results"]``.
        Returns ``{"injection_window_results": df}``.
        """
        df = util.compute_timediff_fillna(data["df"].copy())
        df['MsgIsAttack'] = 0
        df['WindowInAttack'] = 0

        for event_result in data["event_detection_results"]:
            if not event_result['WindowInAttack']:
                continue
            # Half-open window: [window_start, window_start + WINDOW_SIZE)
            window_start = event_result['window_start']
            window_end = window_start + config.WINDOW_SIZE
            in_window = (df.Timestamp >= window_start) & (df.Timestamp < window_end)
            window_pred = self.model.predict(df[in_window][FEATURES])
            df.loc[in_window, "WindowInAttack"] = 1
            df.loc[in_window, "MsgIsAttack"] = window_pred.astype(int)

        return {"injection_window_results": df}
y=target_column_name, ) elif args.library == 'lightgbm': import lightgbm as lgb model = lgb.LGBMClassifier( learning_rate=0.1, n_estimators=100, num_leaves=255, ) model.fit(features_train, labels_train) elif args.library == 'sklearn': from sklearn.experimental import enable_hist_gradient_boosting from sklearn.ensemble import HistGradientBoostingClassifier model = HistGradientBoostingClassifier( learning_rate=0.1, max_iter=100, max_leaf_nodes=255, validation_fraction=None, ) model.fit(features_train, labels_train) elif args.library == 'xgboost': import xgboost as xgb model = xgb.XGBClassifier(eta=0.1, grow_policy='lossguide', n_estimators=100, tree_method='hist') model.fit(features_train, labels_train) elif args.library == 'catboost': from catboost import CatBoostClassifier model = CatBoostClassifier(grow_policy='Lossguide', learning_rate=0.1, n_estimators=100,
def run(argv=None):
    """Emulate a HP search and monitor fit time.

    Parses ``argv`` with the module-level ``parser``, optionally fits and
    applies one imputer, then fits one gradient-boosting estimator per
    (learning_rate, max_depth) pair of ``param_space``. One row of timings is
    appended to ``results/fit_time.csv``.

    Parameters
    ----------
    argv : list of str or None
        Command-line arguments forwarded to ``parser.parse_args``.

    Raises
    ------
    ValueError
        If the requested estimator is neither ``'HGBC'`` nor ``'HGBR'``.
    """
    args = parser.parse_args(argv)

    imputers = {
        'Mean': SimpleImputer(strategy='mean'),
        'Mean+mask': SimpleImputer(strategy='mean', add_indicator=True),
        'Med': SimpleImputer(strategy='median'),
        'Med+mask': SimpleImputer(strategy='median', add_indicator=True),
        'Iterative': IterativeImputer(max_iter=args.max_iter),
        'Iterative+mask': IterativeImputer(add_indicator=True,
                                           max_iter=args.max_iter),
        'IterativeR': IterativeImputer(estimator=RidgeCV(),
                                       max_iter=args.max_iter),
        'IterativeR+mask': IterativeImputer(estimator=RidgeCV(),
                                            add_indicator=True,
                                            max_iter=args.max_iter),
        'KNN': KNNImputer(),
        'KNN+mask': KNNImputer(add_indicator=True),
    }

    task_name = args.task_name
    est = args.est
    # An unknown or missing imputer name means "no imputation".
    imp = imputers.get(args.imp, None)

    if task_name is None or est is None:
        logger.info('No argv given.')
        task_name = 'TB/shock_hemo'
        est = 'HGBC'

    task = tasks[task_name]
    # NOTE(review): this message is also emitted when the defaults above were
    # used; the wording is kept unchanged to preserve the log format.
    logger.info(f'Argv given. Task {task.meta.tag}. est {est}.')

    t0 = time()
    logger.info('Getting X.')
    X = task.X
    logger.info('Getting y.')
    y = task.y

    logger.info(f'X shape before splits: {X.shape}')

    # Simulate the outer CV (the one of KFold)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2)
    # Simulate the inner CV (the one of RandomSearchCV)
    X_train2, X_test2, y_train2, _ = train_test_split(X_train, y_train,
                                                      test_size=0.2)

    # Now X has the same shape as in real experiment
    logger.info(f'X shape: {X_train2.shape}')
    t_X_ready = time()

    if imp is not None:
        # Only the durations matter here: the transformed arrays are discarded
        # and the estimators below are fitted on the raw X_train2.
        logger.info(f'Fitting imputer {args.imp}')
        imp.fit(X_train2, y_train2)
        t_fit_imp = time()
        logger.info('Imputer fitted.')
        logger.info('Transforming X_train')
        imp.transform(X_train2)
        t_tra1_imp = time()
        logger.info('X_train transformed')
        logger.info('Transforming X_test')
        imp.transform(X_test2)
        t_tra2_imp = time()
        logger.info('X_test transformed')

    # Consecutive timestamps; np.diff below turns them into per-fit durations.
    t_fits = [time()]
    for learning_rate in param_space['learning_rate']:
        for max_depth in param_space['max_depth']:
            if est == 'HGBC':
                estimator = HistGradientBoostingClassifier(
                    learning_rate=learning_rate,
                    max_depth=max_depth
                )
            elif est == 'HGBR':
                estimator = HistGradientBoostingRegressor(
                    loss='least_absolute_deviation',
                    learning_rate=learning_rate,
                    max_depth=max_depth
                )
            else:
                raise ValueError(f'Unknown estimator {est}')

            logger.info(f'Params: LR {learning_rate} MD {max_depth}')
            logger.info('Fitting estimator.')
            estimator.fit(X_train2, y_train2)
            t_fits.append(time())
            logger.info('Estimator fitted.')

    t_fits = np.diff(t_fits)

    data = {
        'task_tag': [task.meta.tag],
        'imp': [args.imp],
        'imp_params': [repr({'max_iter': args.max_iter})],
        'X_shape': [repr(X.shape)],
        'X_train_shape': [repr(X_train2.shape)],
        'X_test_shape': [repr(X_test2.shape)],
        'time_X_ready': [t_X_ready-t0],
        'time_fit_imp': np.around(
            [0 if imp is None else t_fit_imp-t_X_ready], 2),
        # Measured from the end of the imputer fit, so that it covers only the
        # transform of X_train (mirrors how time_tra2_imp is measured).
        'time_tra1_imp': np.around(
            [0 if imp is None else t_tra1_imp-t_fit_imp], 2),
        'time_tra2_imp': np.around(
            [0 if imp is None else t_tra2_imp-t_tra1_imp], 2),
        'time_fits': [repr(np.around(t_fits.tolist(), 2))],
        'time_fits_mean': [np.around(t_fits.mean(), 2)]
    }

    new_df = pd.DataFrame(data)

    filepath = 'results/fit_time.csv'
    # Make sure the output directory exists before writing the first row.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    if os.path.exists(filepath):
        # Append to the rows recorded by previous runs.
        previous_df = pd.read_csv(filepath, index_col=0)
        new_df = pd.concat([previous_df, new_df])

    new_df.to_csv(filepath)
# # clf.fit(X_train_selected, y_train) # # y_pred = clf.predict(X_test_selected) # y_train_pred = clf.predict(X_train_selected) # # balacc, acc, mse, r2, rho = gradeoutput(y_test, y_pred, class_boundary, tfm) # outdf = writeresults(outdf, sel_name, clf_name, split, param1, param2, acc, balacc, mse, r2, rho) elif clf_name is "Huber": param1 = np.NaN param2 = np.NaN eps_list = [1.1, 1.2, 1.35, 1.5, 2] # epsilon: greater than 1.0, default 1.35 for param1 in tqdm(eps_list): clf = HistGradientBoostingClassifier( learning_rate=param1, random_state=randomstate) clf.fit(X_train_selected, y_train) y_pred = clf.predict(X_test_selected) y_train_pred = clf.predict(X_train_selected) balacc, acc, mse, r2, rho = gradeoutput( y_test, y_pred, class_boundary, qtfm) outdf = writeresults(outdf, sel_name, clf_name, split, param1, param2, acc, balacc, mse, r2, rho) elif clf_name is "K-Neighbors": param1 = np.NaN param2 = np.NaN
def main():
    """Train a classifier on ``train.csv`` and export or evaluate predictions.

    With ``EXPORT`` set, the model is fitted on the whole training file and
    the predictions for ``test.csv`` are written to ``submission.csv``.
    Otherwise 80% of the training file is used for fitting and the accuracy
    on the remaining rows is printed.
    """
    EXPORT = True
    random_state = 49
    train_size = 0.8 if not EXPORT else 1

    df = pd.read_csv("train.csv")
    test = pd.read_csv('test.csv')

    # In export mode there is no hold-out set: train on everything.
    if EXPORT:
        df_train, df_valid = df, df
    else:
        df_train, df_valid = train_test_split(
            df, train_size=train_size, random_state=random_state)

    preprocessor = Preprocessor()
    preprocessor.fit(df_train)

    x_train = preprocessor.transform(df_train)
    y_train = df_train["target"]

    x_valid = preprocessor.transform(df_valid) if not EXPORT else None
    y_valid = df_valid["target"] if not EXPORT else None

    x_test = preprocessor.transform(test)

    # The heatmap shows whether many features are strongly correlated with one
    # another. Weak correlation is good: it means there is little redundant or
    # superfluous data.
    # NOTE: the figure is neither shown nor saved here, so it is discarded
    # when the program finishes.
    plt.figure(figsize=(14, 12))
    plt.title('Pearson Correlation of Features', y=1.05, size=15)
    # The colormap is passed by name: `plt.cm.get_cmap` was removed in
    # Matplotlib 3.9, and the name resolves to the same 'RdBu' colormap.
    sns.heatmap(pd.DataFrame(x_test.astype(float)).corr(), linewidths=0.1,
                vmax=1.0, square=True, cmap='RdBu', linecolor='white',
                annot=True)

    # Alternative models tried by the author, with the scores they recorded:
    #model = RandomForestClassifier(n_estimators=200, bootstrap=False, min_samples_split=49, criterion='entropy') #78.81 ... 86.19
    #model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=5, random_state=0, verbose=True) #77.08 ... 86.55
    #model = MLPClassifier(hidden_layer_sizes=20, activation='tanh', solver='adam', tol=1e-6, verbose=True, max_iter=500, random_state=0) #100 78.92 79.11 81.24 81.85 81.74 .. 84.03
    #model = DecisionTreeClassifier(max_depth=None, min_samples_split=10, random_state = 0) #78.56 ... 82.26
    #model = LogisticRegression(max_iter=1000, random_state=0) #77.96 ... 81.83 ...
    #model = LogisticRegressionCV(max_iter=1000, random_state=0) #78.016 ... 82.03
    #model = RadiusNeighborsClassifier(radius=9) #75.87 ... 75.87
    #model = KNeighborsClassifier(n_neighbors=6) #73.795 ... 81.76
    #model = GaussianNB() #78.667
    model = HistGradientBoostingClassifier(loss='binary_crossentropy', tol=1e-12, max_iter=10000,
                                           min_samples_leaf=2, verbose=True)  #78.87 ... 86.85

    model.fit(x_train, y_train)

    if EXPORT:
        y_test_pred = model.predict(x_test)
        # Two-column table: the test ids next to the predicted labels.
        result = pd.DataFrame(
            np.stack((np.array(test['Id']), y_test_pred), axis=1),
            columns=['Id', 'Predicted'])
        result.to_csv('submission.csv', index=False)
        print("Result exported.")
    else:
        y_valid_pred = model.predict(x_valid)
        accuracy = accuracy_score(y_valid, y_valid_pred)
        # mcc = matthews_corrcoef(y_valid, y_valid_pred)
        print("Accuracy: {}".format(accuracy))
        # print("Mcc: {}".format(mcc))
def test_same_predictions_multiclass_classification(seed, min_samples_leaf,
                                                    n_samples, max_leaf_nodes):
    """Check that sklearn and LightGBM agree on a 3-class problem.

    Same as test_same_predictions_regression but for multiclass
    classification. Both libraries fit a single boosting iteration with
    equivalent parameters; predicted labels, predicted probabilities and
    accuracies are then compared on the training set and, for small trees on
    large enough datasets, on the held-out set.
    """
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255
    lr = 1

    X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5,
                               n_informative=5, n_redundant=0,
                               n_clusters_per_class=1, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss='categorical_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=lr,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_sklearn = est_sklearn.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        # Compare probabilities on the held-out data. The training-set
        # probabilities were already checked above, so re-using X_train here
        # would leave test-set probabilities unchecked.
        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_sklearn = est_sklearn.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
y, shuffle=True, test_size=0.2, random_state=42) print(x_train.shape) # (1712, 110336) print(x_test.shape) # (429, 110336) print(y_train.shape) # (1712,) print(y_test.shape) # (429,) # 모델 구성 # model = SVC(verbose=1) # hist = model.fit(x_train, y_train) # SVC Visual plt.figure(figsize=(10, 6)) model = HistGradientBoostingClassifier(verbose=1) # mse # train_sizes, train_scores_model, test_scores_model = \ # learning_curve(model, x_train[:100], y_train[:100], train_sizes=np.linspace(0.1, 1.0, 10), # scoring="neg_mean_squared_error", cv=8, shuffle=True, random_state=42) # plt.plot(train_sizes, -test_scores_model.mean(1), 'o-', color="r", label="mse") # accuracy train_sizes, train_scores_model, test_scores_model = \ learning_curve(model, x_train[:100], y_train[:100], train_sizes=np.linspace(0.1, 1.0, 10), scoring="accuracy", cv=8, shuffle=True, random_state=42) train_scores_mean = np.mean(train_scores_model, axis=1) train_scores_std = np.std(train_scores_model, axis=1) test_scores_mean = np.mean(test_scores_model, axis=1)
print(f"Number of numerical features: {n_numerical_features}")

# Keep 20% of the samples aside for evaluation; the seed is fixed so that the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Note: no need to use an OrdinalEncoder because categorical features are
# already clean
# One boolean per feature name: True when the name is listed in
# data.categories.
is_categorical = [name in data.categories for name in data.feature_names]

est = HistGradientBoostingClassifier(
    loss="log_loss",
    learning_rate=lr,
    max_iter=n_trees,
    max_bins=max_bins,
    max_leaf_nodes=n_leaf_nodes,
    categorical_features=is_categorical,
    early_stopping=False,
    random_state=0,
    verbose=verbose,
)

fit(est, X_train, y_train, "sklearn")
predict(est, X_test, y_test)

if args.lightgbm:
    # Build the LightGBM counterpart from the scikit-learn estimator's
    # parameters.
    est = get_equivalent_estimator(est, lib="lightgbm", n_classes=n_classes)
    est.set_params(max_cat_to_onehot=1)  # don't use OHE
    # Indices of the features flagged as categorical above.
    categorical_features = [
        f_idx for (f_idx, is_cat) in enumerate(is_categorical) if is_cat
    ]
preprocessor = ColumnTransformer( [ ('cat_preprocessor', categorical_preprocessor, categorical_columns), ], remainder='passthrough', sparse_threshold=0, ) # %% from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.pipeline import Pipeline model = Pipeline([ ("preprocessor", preprocessor), ("classifier", HistGradientBoostingClassifier(random_state=42, max_leaf_nodes=4)), ]) model # %% [markdown] # ## Evaluation # # ### Without hyperparameter tuning # # In the module "Selecting the best model", we saw that one must use # cross-validation to evaluate such a model. Cross-validation allows to get a # distribution of the scores of the model. Thus, having this distribution at # hand, we can get to assess the variability of our estimate of the # generalization performance of the model. Here, we recall the necessary # `scikit-learn` tools needed to obtain the mean and standard deviation of the # scores.
df_scores = evaluate_classifier(rf_clf, df_scores, "Balanced RF")
df_scores

###############################################################################
# The performance with the `BalancedRandomForestClassifier` is better than
# applying a single random under-sampling. We will use a gradient-boosting
# classifier within a `BalancedBaggingClassifier`.

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.ensemble import BalancedBaggingClassifier

bag_clf = make_pipeline(
    preprocessor_tree,
    BalancedBaggingClassifier(
        base_estimator=HistGradientBoostingClassifier(random_state=42),
        n_estimators=10, random_state=42, n_jobs=2))

df_scores = evaluate_classifier(bag_clf, df_scores, "Balanced bagging")
df_scores

###############################################################################
# This last approach is the most effective. The different under-sampled
# subsets bring some diversity for the different GBDTs to learn from, so that
# they do not all focus on the same portion of the majority class.
#
# We will repeat the same experiment but with a ratio of 100:1 and make a
# similar analysis.
# arbitrary. Therefore we adapt the preprocessing pipeline as follows: # %% from sklearn.experimental import enable_hist_gradient_boosting from sklearn.ensemble import HistGradientBoostingClassifier # For each categorical column, extract the list of all possible categories # in some arbritrary order. categories = [data[column].unique() for column in data[categorical_columns]] preprocessor = ColumnTransformer([ ('categorical', OrdinalEncoder(categories=categories), categorical_columns) ], remainder="passthrough") model = make_pipeline(preprocessor, HistGradientBoostingClassifier()) model.fit(data_train, target_train) print(model.score(data_test, target_test)) # %% [markdown] # We can observe that we get significantly higher accuracies with the Gradient # Boosting model. This is often what we observe whenever the dataset has a large # number of samples and limited number of informative features (e.g. less than # 1000) with a mix of numerical and categorical variables. # # This explains why Gradient Boosted Machines are very popular among datascience # practitioners who work with tabular data. # # # #
# training models models = [[ DecisionTreeClassifier(random_state=42), DecisionTreeClassifier(random_state=42), DecisionTreeClassifier(random_state=42), DecisionTreeClassifier(random_state=42), ], [ RandomForestClassifier(n_jobs=6, random_state=42), RandomForestClassifier(n_jobs=6, random_state=42), RandomForestClassifier(n_jobs=6, random_state=42), RandomForestClassifier(n_jobs=6, random_state=42), ], [ HistGradientBoostingClassifier(max_depth=4, random_state=42), HistGradientBoostingClassifier(max_depth=4, random_state=42), HistGradientBoostingClassifier(max_depth=4, random_state=42), HistGradientBoostingClassifier(max_depth=4, random_state=42), ]] names = [ 'Drzewo decyzyjne', 'Las losowy', 'Wzmocnienie gradientowe', ] y_preds = [] confusion_matrices = [] for model_list, name in zip(models, names):
QuantileTransformer(output_distribution='uniform'), Normalizer() ] #=================Classifier classifier_test = [ OneVsRestClassifier(SVC()), DecisionTreeClassifier(max_depth=5), SVC(), SVC(kernel="linear", C=0.025), LogisticRegressionCV(cv=5, random_state=0), GradientBoostingClassifier(random_state=0), BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=0).fit(features, target), ExtraTreesClassifier(n_estimators=100, random_state=0), HistGradientBoostingClassifier(), MLPClassifier(random_state=1, max_iter=300), OneVsOneClassifier(LinearSVC(random_state=0)), OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0), random_state=0) ] print('Importacao OK') # %% # =================Looping here from sklearn.preprocessing import StandardScaler from sklearn.ensemble import GradientBoostingClassifier from sklearn.pipeline import Pipeline
class tuned_HGB(BaseEstimator):
    """
    Scikit-learn histogram gradient-boosted tree models, tuned with
    nested cross-validation to minimize the error on an unseen table.

    Parameters
    ----------
    task : str
        The estimation task to perform, either 'salary', 'quantile', or 'sex'.
        'salary' uses a regressor; any other value uses a classifier.
    learning_rate : None or float
        The learning rate of the model. If None, a nested cross-validation
        procedure is used to determine the best one.
    fit_on : str
        If fit_on = 'all', all the validation data is used to compute the
        validation error. Set fit_on = 'seen' or 'unseen' to optimize the
        learning rate for seen or unseen categories only.

    Notes
    -----
    ``param_tuning`` reads ``self.X1_nem``, ``self.X1_mem`` and
    ``self.groups1``. This class never sets them, so they must be assigned
    on the instance before ``fit`` is called with ``learning_rate=None``.
    """

    def __init__(self, task, learning_rate=None, fit_on='all'):
        self.task = task
        self.learning_rate = learning_rate
        self.fit_on = fit_on

    def param_tuning(self, X1, y1):
        """Select the learning rate with the lowest cross-validated error.

        Four candidates, log-spaced between 1e-2 and 10**-0.5, are compared.
        The best one is stored in ``self.learning_rate``. The fraction of
        validation entries with zero variants is printed, truncated to two
        decimals.
        """
        D_var = make_D_var(self.X1_nem, self.X1_mem, n_jobs=1)
        n_var = n_variants(self.X1_nem, self.X1_mem, y1, self.groups1,
                           n_splits=None, test_size=None, D_var=D_var,
                           n_jobs=1, nested_cross_val=True)
        lr_list = np.logspace(-2, -0.5, 4)
        res = np.zeros(len(lr_list))
        for k, lr in enumerate(lr_list):
            if self.task == "salary":
                candidate = HistGradientBoostingRegressor(learning_rate=lr)
            else:
                candidate = HistGradientBoostingClassifier(learning_rate=lr)
            cv_err = cv_errors(self.task, candidate, X1, self.X1_nem,
                               self.X1_mem, y1, self.groups1, n_splits=None,
                               test_size=None, n_jobs=1,
                               nested_cross_val=True)
            # Errors are squared for every task except 'quantile'.
            if self.task != 'quantile':
                cv_err = cv_err**2
            # n_var == 0 selects the entries scored under fit_on='unseen';
            # n_var >= 1 selects those scored under fit_on='seen'.
            if self.fit_on == 'unseen':
                res[k] = cv_err[n_var == 0].mean()
            elif self.fit_on == 'seen':
                res[k] = cv_err[n_var >= 1].mean()
            else:
                res[k] = cv_err.mean()
        self.learning_rate = lr_list[np.argmin(res)]
        print(int(sum(n_var == 0) / len(n_var) * 100) / 100)

    def fit(self, X1, y1):
        """Tune the learning rate if needed, then fit on all the data.

        Returns
        -------
        self : tuned_HGB
            The fitted estimator, so that calls can be chained.
        """
        # Parameter tuning (identity check: None is a singleton)
        if self.learning_rate is None:
            self.param_tuning(X1, y1)
            print(self.learning_rate)
        # Fit on all train data with tuned params
        if self.task == "salary":
            self.model = HistGradientBoostingRegressor(
                learning_rate=self.learning_rate)
        else:
            self.model = HistGradientBoostingClassifier(
                learning_rate=self.learning_rate)
        self.model.fit(X1, y1)
        return self

    def predict(self, X2):
        """Return the predictions of the fitted model for ``X2``."""
        return self.model.predict(X2)

    def predict_proba(self, X2):
        """Return ``predict_proba`` of the fitted model for ``X2``."""
        return self.model.predict_proba(X2)
def main():
    """Train a BDT and three per-jet-multiplicity DNNs, then blend them.

    The steps are: read ``dataset_higgs_challenge.csv``; split it by
    ``KaggleSet`` into training ("t"), validation ("b") and test ("v") parts;
    fit a ``HistGradientBoostingClassifier`` on the whole training part; fit
    one DNN for each of the 0-, 1- and 2-jet subsets; blend the DNN and BDT
    outputs with a logistic regression fitted on the validation part; and
    finally save the AMS and distribution plots as PNG files.

    Statement order matters: large intermediates are deleted with ``del`` as
    soon as they are no longer needed, and ``np.random.seed(42)`` is reset
    before each DNN is built.
    """
    #====================================================
    # DATA PREPARATION
    #====================================================

    #Let's have a look at the dataset:
    data_full = pd.read_csv('dataset_higgs_challenge.csv')

    #For this classification I used only the "t" (training data), "b" (validation data) and "v" (test data) sets of variables:
    print('Total number of events: ', len(data_full), '\n')
    for KaggleSetID in ['t', 'b', 'v', 'u']:
        print('Number of events in the {} KaggleSet: {}'.format(
            KaggleSetID,
            len(data_full['KaggleSet'][data_full['KaggleSet'] == KaggleSetID])))

    #Description of the sub-dataset in each line:
    #1) Splitting of the dataset into train, test and validation set.
    #2) Extracting the weights of the validation and test set.
    #3) Extracting the binary arrays for my networks.
    #4) Extracting the binary arrays for my BDT
    #While splitting the dataset, some feature engineering is applied to each subset. The problem is that the "phi" variables have a signal distribution that is very similar to the background one, so it is better to consider their linear combination (difference in this case) to make them useful in my classification.
    # NOTE(review): the feature engineering described above presumably happens
    # inside `splitting`, which is defined elsewhere - confirm there.
    X, df_empty, y_train, y_train_BDT = splitting(data_full, "t")
    X_val, weights_val, y_val, y_val_BDT = splitting(data_full, "b")
    X_test, weights_test, y_test, y_test_BDT = splitting(data_full, "v")
    # The full table is no longer needed once the three subsets exist.
    del (data_full)

    #====================================================
    # BDT
    #====================================================

    #Let's first scale my data:
    # The scaler is fitted on the training features only and then applied to
    # the validation and test features.
    standard = StandardScaler()
    standard.fit(X)
    X_standard = standard.transform(X)
    X_val_standard = standard.transform(X_val)
    X_test_standard = standard.transform(X_test)

    #BDT classification:
    BDT = HistGradientBoostingClassifier(max_iter=90, verbose=1,
                                         l2_regularization=0.5,
                                         learning_rate=.1, max_leaf_nodes=50,
                                         random_state=45, max_depth=15,
                                         max_bins=50)
    BDT.fit(X_standard, y_train_BDT)
    y_pred_val = BDT.predict_proba(X_val_standard)
    y_pred_test = BDT.predict_proba(X_test_standard)
    del X_standard, X_val_standard, X_test_standard

    #I will split the results just to be able to combine them with the DNN result later:
    # The BDT outputs are regrouped by jet multiplicity (0, 1, >= 2) and
    # concatenated in that order, matching the order in which the DNN outputs
    # are concatenated further down.
    BDT_0jets_val = y_pred_val[X_val['PRI_jet_num'] == 0]
    BDT_1jet_val = y_pred_val[X_val['PRI_jet_num'] == 1]
    BDT_2jets_val = y_pred_val[X_val['PRI_jet_num'] >= 2]
    y_pred_BDT_val = np.concatenate(
        (BDT_0jets_val, BDT_1jet_val, BDT_2jets_val))

    BDT_0jets_test = y_pred_test[X_test['PRI_jet_num'] == 0]
    BDT_1jet_test = y_pred_test[X_test['PRI_jet_num'] == 1]
    BDT_2jets_test = y_pred_test[X_test['PRI_jet_num'] >= 2]
    y_pred_BDT_test = np.concatenate(
        (BDT_0jets_test, BDT_1jet_test, BDT_2jets_test))

    #====================================================
    # DATA PROCESSING
    #====================================================

    # NOTE(review): `splitting_jets(..., 2)` is assumed to select events with
    # PRI_jet_num >= 2, to line up with the BDT split above - confirm.

    #Let's construct the data for the case with 0 jets:
    X_0jets, y_train_0jets, empty_0 = splitting_jets(X, y_train, df_empty, 0)
    X_val_0jets, y_val_0jets, weights_0jets_val = splitting_jets(
        X_val, y_val, weights_val, 0)
    X_test_0jets, y_test_0jets, weights_0jets_test = splitting_jets(
        X_test, y_test, weights_test, 0)

    #Let's construct the data for the case with 1 jet:
    X_1jet, y_train_1jet, empty_1 = splitting_jets(X, y_train, df_empty, 1)
    X_val_1jet, y_val_1jet, weights_1jet_val = splitting_jets(
        X_val, y_val, weights_val, 1)
    X_test_1jet, y_test_1jet, weights_1jet_test = splitting_jets(
        X_test, y_test, weights_test, 1)

    #Let's construct the data for the case with 2 jets:
    X_2jets, y_train_2jets, empty_2 = splitting_jets(X, y_train, df_empty, 2)
    X_val_2jets, y_val_2jets, weights_2jets_val = splitting_jets(
        X_val, y_val, weights_val, 2)
    X_test_2jets, y_test_2jets, weights_2jets_test = splitting_jets(
        X_test, y_test, weights_test, 2)
    # The third return value for the training subsets is unused.
    del empty_0, empty_1, empty_2

    #====================================================
    # 2-JETS DNN
    #====================================================

    #Scaling data:
    standard_2jets = StandardScaler()
    standard_2jets.fit(X_2jets)
    X_2jets_standard = standard_2jets.transform(X_2jets)
    X_val_2jets_standard = standard_2jets.transform(X_val_2jets)
    X_test_2jets_standard = standard_2jets.transform(X_test_2jets)

    #DNN:
    # The NumPy seed is reset before each network is built.
    np.random.seed(42)
    DNN_2jets = make_model([64, 128, 64, 64, 32, 8], 'relu', 0.1, 'Adam',
                           'L2', 0.0001, X_2jets.shape[-1])
    # Stop after 10 epochs without improvement of the validation accuracy and
    # restore the best weights.
    early_stopping = EarlyStopping(monitor='val_accuracy', min_delta=0,
                                   patience=10, verbose=0, mode='auto',
                                   baseline=None, restore_best_weights=True)
    history = DNN_2jets.fit(X_2jets_standard, y_train_2jets, batch_size=256,
                            epochs=50, verbose=1,
                            validation_data=(X_val_2jets_standard,
                                             y_val_2jets),
                            callbacks=[early_stopping], class_weight=None)
    y_pred_2jets_val = DNN_2jets.predict(X_val_2jets_standard)
    y_pred_2jets_test = DNN_2jets.predict(X_test_2jets_standard)
    del X_2jets_standard, X_val_2jets_standard, X_2jets, X_val_2jets, X_test_2jets_standard, X_test_2jets

    #====================================================
    # 1-JET DNN
    #====================================================

    #Scaling data:
    standard_1jet = StandardScaler()
    standard_1jet.fit(X_1jet)
    X_1jet_standard = standard_1jet.transform(X_1jet)
    X_val_1jet_standard = standard_1jet.transform(X_val_1jet)
    X_test_1jet_standard = standard_1jet.transform(X_test_1jet)

    #DNN:
    np.random.seed(42)
    DNN_1jet = make_model([64, 64, 64, 32, 8], 'relu', 0.1, 'Adagrad', 'L1',
                          0.0001, X_1jet.shape[-1])
    early_stopping = EarlyStopping(monitor='val_accuracy', min_delta=0,
                                   patience=10, verbose=0, mode='auto',
                                   baseline=None, restore_best_weights=True)
    history = DNN_1jet.fit(X_1jet_standard, y_train_1jet, batch_size=256,
                           epochs=50, verbose=1,
                           validation_data=(X_val_1jet_standard, y_val_1jet),
                           callbacks=[early_stopping], class_weight=None)
    y_pred_1jet_val = DNN_1jet.predict(X_val_1jet_standard)
    y_pred_1jet_test = DNN_1jet.predict(X_test_1jet_standard)
    del X_1jet_standard, X_val_1jet_standard, X_1jet, X_val_1jet, X_test_1jet_standard, X_test_1jet

    #====================================================
    # 0-JET DNN
    #====================================================

    #Scaling data:
    standard_0jets = StandardScaler()
    standard_0jets.fit(X_0jets)
    X_0jets_standard = standard_0jets.transform(X_0jets)
    X_val_0jets_standard = standard_0jets.transform(X_val_0jets)
    X_test_0jets_standard = standard_0jets.transform(X_test_0jets)

    #DNN:
    np.random.seed(42)
    DNN_0jets = make_model([32, 64, 128, 64, 32, 8], 'elu', 0.1, 'Adagrad',
                           'L1', 0.0001, X_0jets.shape[-1])
    early_stopping = EarlyStopping(monitor='val_accuracy', min_delta=0,
                                   patience=10, verbose=0, mode='auto',
                                   baseline=None, restore_best_weights=True)
    history = DNN_0jets.fit(X_0jets_standard, y_train_0jets, batch_size=256,
                            epochs=50, verbose=1,
                            validation_data=(X_val_0jets_standard,
                                             y_val_0jets),
                            callbacks=[early_stopping], class_weight=None)
    y_pred_0jets_val = DNN_0jets.predict(X_val_0jets_standard)
    y_pred_0jets_test = DNN_0jets.predict(X_test_0jets_standard)
    del X_0jets_standard, X_val_0jets_standard, X_0jets, X_val_0jets, X_test_0jets_standard, X_test_0jets

    #====================================================
    # TOTAL AMS SCORE OF DNNs
    #====================================================

    #Total AMS score considering all the AMS of each subset:
    # Predictions, labels and weights are concatenated in the same
    # 0-jet / 1-jet / 2-jets order so that rows stay aligned.
    y_pred_DNN_val = np.concatenate(
        (y_pred_0jets_val, y_pred_1jet_val, y_pred_2jets_val))
    y_val_total = np.concatenate((y_val_0jets, y_val_1jet, y_val_2jets))
    weights_total_val = np.concatenate(
        (weights_0jets_val, weights_1jet_val, weights_2jets_val))

    y_pred_DNN_test = np.concatenate(
        (y_pred_0jets_test, y_pred_1jet_test, y_pred_2jets_test))
    y_test_total = np.concatenate((y_test_0jets, y_test_1jet, y_test_2jets))
    weights_total_test = np.concatenate(
        (weights_0jets_test, weights_1jet_test, weights_2jets_test))

    #====================================================
    # COMBINING DNNs AND BDT AMS
    #====================================================

    # Two-column blending dataset: column 1 of the DNN output next to column 1
    # of the BDT output (presumably the signal-class score - confirm).
    dataset_blend_val = np.append(y_pred_DNN_val[:, 1].reshape(-1, 1),
                                  y_pred_BDT_val[:, 1].reshape(-1, 1),
                                  axis=1)
    dataset_blend_test = np.append(y_pred_DNN_test[:, 1].reshape(-1, 1),
                                   y_pred_BDT_test[:, 1].reshape(-1, 1),
                                   axis=1)
    # The blender is fitted on the validation predictions and then applied to
    # both the validation and the test predictions.
    blend = LogisticRegression(solver='lbfgs')
    blend.fit(dataset_blend_val, y_val_total[:, 1])
    blended_val = blend.predict_proba(dataset_blend_val)
    blended_test = blend.predict_proba(dataset_blend_test)

    #====================================================
    # FINAL RESULTS
    #====================================================

    print('DNN:')
    plot_AMS(y_pred_DNN_test, y_test_total, weights_total_test)
    print('BDT:')
    plot_AMS(y_pred_BDT_test, y_test_total, weights_total_test)
    print('Combination:')
    plot_AMS(blended_test, y_test_total, weights_total_test)
    plt.legend(['DNN', 'BDT', 'DNN + BDT'])
    # Only the lower y limit is set.
    plt.ylim(2.8, )
    plt.savefig('AMS_total.png', dpi=300)
    plt.show()

    # The same distribution plot is saved twice; the calls differ only in the
    # fifth argument (False here, True below), as the file names reflect.
    plot_distributions_final(blended_val, blended_test, y_val_total, 50,
                             False, weights_total_val, weights_total_test)
    plt.savefig('Final_distribution_unweighted.png', dpi=300)
    plt.show()

    plot_distributions_final(blended_val, blended_test, y_val_total, 50, True,
                             weights_total_val, weights_total_test)
    plt.savefig('Final_distribution_weighted.png', dpi=300)
    plt.show()