def test_sklearn_classification():
    check_classifier(
        SklearnClassifier(clf=AdaBoostClassifier(n_estimators=10)))
    check_classifier(
        SklearnClassifier(clf=AdaBoostClassifier(n_estimators=10)),
        n_classes=3)
    check_classifier(
        SklearnClassifier(clf=GradientBoostingClassifier(n_estimators=10)))
def test_folding_classifier():
    base_ada = SklearnClassifier(AdaBoostClassifier())
    folding_str = FoldingClassifier(base_ada, n_folds=2)
    check_folding(folding_str, True, True, True)

    base_log_reg = SklearnClassifier(LogisticRegression())
    folding_str = FoldingClassifier(base_log_reg, n_folds=4)
    check_folding(folding_str, True, False, False, False)
def test_folding_classifier():
    base_ada = SklearnClassifier(AdaBoostClassifier())
    folding_str = FoldingClassifier(base_ada, n_folds=2)
    check_folding(folding_str, True, True, True)

    base_svm = SklearnClassifier(SVC())
    folding_str = FoldingClassifier(base_svm, n_folds=4)
    check_folding(folding_str, True, False, False)
def test_sklearn_classification():
    # supports weights
    check_classifier(
        SklearnClassifier(clf=AdaBoostClassifier(n_estimators=10)))
    check_classifier(
        SklearnClassifier(clf=AdaBoostClassifier(n_estimators=10)),
        n_classes=3)
    # doesn't support weights
    check_classifier(
        SklearnClassifier(clf=GradientBoostingClassifier(n_estimators=10)),
        supports_weight=False)
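# The tests above rely on a `check_classifier` helper from the surrounding
# suite, whose implementation is not shown here. A minimal sketch of what a
# helper with this call signature could do (the import location and the exact
# checks are assumptions, not the original helper):
import numpy
from rep.test.test_estimators import generate_classification_data  # assumed location

def check_classifier_sketch(classifier, n_classes=2, supports_weight=True):
    X, y, sample_weight = generate_classification_data(n_classes=n_classes)
    if supports_weight:
        classifier.fit(X, y, sample_weight=sample_weight)
    else:
        classifier.fit(X, y)
    # predicted probabilities should have one column per class and sum to 1
    proba = classifier.predict_proba(X)
    assert proba.shape == (len(X), n_classes)
    assert numpy.allclose(proba.sum(axis=1), 1)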
def test_own_classification_reports():
    """
    testing clf.test_on
    """
    X, y, sample_weight = generate_classification_data()
    clf = SklearnClassifier(RandomForestClassifier())
    clf.fit(X, y, sample_weight=sample_weight)
    report = clf.test_on(X, y, sample_weight=sample_weight)
    roc1 = report.compute_metric(RocAuc())

    lds = LabeledDataStorage(X, y, sample_weight=sample_weight)
    roc2 = clf.test_on_lds(lds=lds).compute_metric(RocAuc())
    assert roc1 == roc2, 'Something wrong with test_on'
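# Illustrative cross-check, not part of the original test: for binary labels,
# the report metric above should agree with sklearn's weighted ROC AUC
# computed directly on the signal-class probabilities.
from sklearn.metrics import roc_auc_score
roc3 = roc_auc_score(y, clf.predict_proba(X)[:, 1], sample_weight=sample_weight)
assert numpy.isclose(roc1, roc3)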
def test_gridsearch_metrics_threads(n_threads=3):
    X, y, sample_weight = generate_classification_data(n_classes=2, distance=0.7)
    param_grid = OrderedDict({'reg_param': numpy.linspace(0, 1, 20)})

    from itertools import cycle
    optimizers = cycle([
        RegressionParameterOptimizer(param_grid=param_grid, n_evaluations=4,
                                     start_evaluations=2),
        SubgridParameterOptimizer(param_grid=param_grid, n_evaluations=4),
        RandomParameterOptimizer(param_grid=param_grid, n_evaluations=4),
    ])

    for metric in [RocAuc(), OptimalAMS(), OptimalSignificance(), log_loss]:
        scorer = FoldingScorer(metric)
        clf = SklearnClassifier(QDA())
        grid = GridOptimalSearchCV(
            estimator=clf,
            params_generator=next(optimizers),
            scorer=scorer,
            parallel_profile='threads-{}'.format(n_threads))
        grid.fit(X, y)
        print(grid.params_generator.best_score_)
        print(grid.params_generator.best_params_)
        grid.params_generator.print_results()
def test_simple_stacking_pybrain():
    base_pybrain = PyBrainClassifier()
    check_classifier(
        SklearnClassifier(clf=BaggingClassifier(base_estimator=base_pybrain,
                                                n_estimators=3)),
        has_staged_pp=False, has_importances=False, supports_weight=False)
def test_gridsearch_sklearn():
    metric = numpy.random.choice([OptimalAMS(), RocAuc(), LogLoss()])
    scorer = ClassificationFoldingScorer(metric)
    maximization = True
    if isinstance(metric, LogLoss):
        maximization = False
    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']],
    })
    generator = RegressionParameterOptimizer(grid_param, n_evaluations=4,
                                             maximize=maximization)
    grid = GridOptimalSearchCV(SklearnClassifier(clf=AdaBoostClassifier()),
                               generator, scorer, parallel_profile='threads-3')

    _ = check_grid(grid, False, False, False, use_weights=True)
    classifier = check_grid(grid, False, False, False, use_weights=False)

    # Check parameters of best fitted classifier
    assert 2 <= len(classifier.features) <= 3, 'Features were not set'
    params = classifier.get_params()
    for key in grid_param:
        if key in params:
            assert params[key] == grid.generator.best_params_[key]
        else:
            assert params['clf__' + key] == grid.generator.best_params_[key]
def test_complex_stacking_mn():
    # Ada over kFold over MatrixNet
    base_kfold = FoldingClassifier(base_estimator=MatrixNetClassifier(iterations=30))
    check_classifier(
        SklearnClassifier(clf=AdaBoostClassifier(base_estimator=base_kfold,
                                                 n_estimators=3)),
        has_staged_pp=False, has_importances=False)
def _make_clf(self, clf, bagging=None):
    """Creates a classifier from a dict or returns the clf"""
    if isinstance(clf, dict):
        key, val = clf.popitem()
        try:
            # direct indexing (instead of .get) so an unknown key actually
            # raises the KeyError handled below
            val = self.__DEFAULT_CLF_CFG[key] if val is None else val
        except KeyError:
            logger.error(str(val) + " not an implemented classifier.")
            raise

        temp_bagging = val.pop('bagging', bagging)
        bagging = temp_bagging if bagging is None else bagging

        if key == 'rdf':
            config_clf = dict(val)  # possible multi-threading arguments
            clf = SklearnClassifier(RandomForestClassifier(**config_clf))
        elif key == 'erf':
            config_clf = dict(val)  # possible multi-threading arguments
            clf = SklearnClassifier(ExtraTreesClassifier(**config_clf))
        elif key == 'nn':
            config_clf = dict(val)  # possible multi-threading arguments
            clf = TheanetsClassifier(**config_clf)
        elif key == 'ada':
            config_clf = dict(val)  # possible multi-threading arguments
            clf = SklearnClassifier(AdaBoostClassifier(**config_clf))
        elif key == 'gb':
            config_clf = dict(val)  # possible multi-threading arguments
            clf = SklearnClassifier(GradientBoostingClassifier(**config_clf))
        elif key == 'xgb':
            config_clf = dict(val)  # possible multi-threading arguments
            clf = XGBoostClassifier(**config_clf)
        elif hasattr(clf, 'fit'):
            bagging = False  # return the classifier

        # bagging over the instantiated estimators
        if isinstance(bagging, int) and bagging >= 1:
            bagging = dict(self.__DEFAULT_BAG_CFG, n_estimators=bagging)
        if isinstance(bagging, dict):
            # TODO: implement multi-thread:
            bagging.update({'base_estimator': clf})
            clf = SklearnClassifier(BaggingClassifier(**bagging))
    else:
        raise ValueError(str(clf) + " not valid as a classifier.")

    clf = {key: clf}
    return clf
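# A minimal standalone sketch of the wrapping that _make_clf performs for a
# config like {'rdf': {'n_estimators': 100}} with integer bagging=10 (names
# here are illustrative; __DEFAULT_BAG_CFG defaults are omitted):
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from rep.estimators import SklearnClassifier

# base estimator built from the config dict, wrapped for the REP interface
base = SklearnClassifier(RandomForestClassifier(n_estimators=100))
# integer bagging becomes a BaggingClassifier with that many estimators
bagged = SklearnClassifier(BaggingClassifier(base_estimator=base, n_estimators=10))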
def fit_categorical_labels(df_train, df_test, df_labels, fit_type='regressor',
                           fit_framework='theanets', labels_list=None):
    import numpy as np
    import pandas as pd
    from rep.estimators import SklearnClassifier, SklearnRegressor
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import GradientBoostingRegressor
    from rep.estimators.neurolab import NeurolabRegressor
    from rep.estimators.theanets import TheanetsRegressor
    from rep.estimators import XGBoostRegressor  # needed for the 'xgboost' branch

    # Using gradient boosting with default settings
    if fit_framework == 'sklearn':
        if fit_type == 'classifier':
            sk = SklearnClassifier(GradientBoostingClassifier(),
                                   features=df_train.columns.values)
        elif fit_type == 'regressor':
            sk = SklearnRegressor(GradientBoostingRegressor(),
                                  features=df_train.columns.values)
    elif fit_framework == 'neural':
        if fit_type == 'regressor':
            sk = NeurolabRegressor(features=df_train.columns.values)
    elif fit_framework == 'xgboost':
        if fit_type == 'regressor':
            sk = XGBoostRegressor(features=df_train.columns.values)
    elif fit_framework == 'theanets':
        if fit_type == 'regressor':
            sk = TheanetsRegressor(features=df_train.columns.values)
    else:
        raise ValueError('No correct combo of fit_type and fit_framework found')

    prediction_array = np.empty((len(df_test), len(df_labels.columns)))
    for i, column in enumerate(df_labels.columns.values):
        # get a single column to predict
        labels = df_labels[column]
        # fit the data with the training set
        sk.fit(df_train, labels)
        # predict new countries
        prediction = np.squeeze(sk.predict(df_test))
        prediction_array[:, i] = prediction
        # prediction = pd.read_pickle(filename).squeeze()

    df_predict = pd.DataFrame(prediction_array, columns=df_labels.columns.values)
    df_predict = gather_dummy_predictions(df_predict, labels_list)
    # print('unique labels', np.unique(df_predict))
    return df_predict
def test_classifier_with_dataframe():
    try:
        from rep.estimators import SklearnClassifier
        clf = SklearnClassifier(GradientBoostingClassifier(n_estimators=1))
        X, y = generate_sample(n_samples=100, n_features=4)
        for X_ in [X, pandas.DataFrame(X)]:
            lookup = LookupClassifier(clf, n_bins=16).fit(X_, y)
            lookup.predict_proba(X)
    except ImportError:
        print('expected fail: yandex/rep not installed')
def test_gridsearch_threads(n_threads=3):
    scorer = FoldingScorer(numpy.random.choice([OptimalAMS(), RocAuc()]))
    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']],
    })
    generator = RegressionParameterOptimizer(grid_param, n_evaluations=4)
    base = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(base, generator, scorer,
                               parallel_profile='threads-{}'.format(n_threads))

    X, y, sample_weight = generate_classification_data()
    grid.fit(X, y, sample_weight=sample_weight)
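# If run interactively, the tuned configuration can then be read back from the
# parameter generator, using the same attributes other snippets in this
# listing print after fitting:
print(grid.params_generator.best_params_)
grid.params_generator.print_results()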
def grid_sklearn(score_function):
    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']],
    })
    generator = RegressionParameterOptimizer(grid_param)
    scorer = FoldingScorer(score_function)
    grid = GridOptimalSearchCV(SklearnClassifier(clf=AdaBoostClassifier()),
                               generator, scorer)

    cl = check_grid(grid, False, False, False)
    assert 1 <= len(cl.features) <= 3
    params = cl.get_params()
    for key in grid_param:
        if key in params:
            assert params[key] == grid.generator.best_params_[key]
        else:
            assert params['clf__' + key] == grid.generator.best_params_[key]
def grid_custom(custom):
    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']],
    })
    generator = SubgridParameterOptimizer(grid_param)
    grid = GridOptimalSearchCV(
        SklearnClassifier(clf=AdaBoostClassifier(),
                          features=['column0', 'column1']),
        generator, custom)

    cl = check_grid(grid, False, False, False)
    assert 1 <= len(cl.features) <= 3
    params = cl.get_params()
    for key in grid_param:
        if key in params:
            assert params[key] == grid.generator.best_params_[key]
        else:
            assert params['clf__' + key] == grid.generator.best_params_[key]
def test_grid_with_custom_scorer():
    """
    Introducing here a special scorer which always uses all data passed to
    gridsearch.fit for training and tests on another fixed dataset (which was
    passed to the scorer), by computing roc_auc_score from sklearn.
    """
    class CustomScorer(object):
        def __init__(self, testX, testY):
            self.testY = testY
            self.testX = testX

        def __call__(self, base_estimator, params, X, y, sample_weight=None):
            cl = clone(base_estimator)
            cl.set_params(**params)
            if sample_weight is not None:
                cl.fit(X, y, sample_weight)
            else:
                cl.fit(X, y)
            return roc_auc_score(self.testY, cl.predict_proba(self.testX)[:, 1])

    X, y, _ = generate_classification_data()
    custom_scorer = CustomScorer(X, y)
    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']],
    })
    generator = SubgridParameterOptimizer(grid_param)
    base_estimator = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(base_estimator, generator, custom_scorer)

    cl = check_grid(grid, False, False, False)
    assert len(cl.features) <= 3
    params = cl.get_params()
    for key in grid_param:
        if key in params:
            assert params[key] == grid.generator.best_params_[key]
        else:
            assert params['clf__' + key] == grid.generator.best_params_[key]
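# The scorer protocol used above appears to be the generic one in these
# examples: any callable (base_estimator, params, X, y, sample_weight=None)
# returning a single float, larger meaning better when the generator
# maximizes. A hypothetical wrapper (PenalizedScorer is not part of REP) that
# discourages larger feature sets could be plugged in the same way:
class PenalizedScorer(object):
    """Hypothetical wrapper: inner scorer's result minus a size penalty."""

    def __init__(self, scorer, penalty=0.001):
        self.scorer = scorer
        self.penalty = penalty

    def __call__(self, base_estimator, params, X, y, sample_weight=None):
        score = self.scorer(base_estimator, params, X, y, sample_weight)
        # penalize grid points that select more features
        return score - self.penalty * len(params.get('features', []))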
                  'IP_p0p2', 'IP_p1p2', 'isolationa', 'isolationb', 'isolationc',
                  'isolationd', 'isolatione', 'isolationf', 'iso', 'CDF1', 'CDF2',
                  'p0_track_Chi2Dof', 'p1_track_Chi2Dof', 'p2_track_Chi2Dof',
                  'p0_IP', 'p1_IP', 'p2_IP', 'p0_IPSig', 'p1_IPSig', 'p2_IPSig',
                  'p0_eta', 'p1_eta', 'p2_eta']

uniform_features = ["mass"]
n_estimators = 150
base_estimator = DecisionTreeClassifier(max_depth=4)

base_ada = GradientBoostingClassifier(max_depth=4, n_estimators=100,
                                      learning_rate=0.1)
AdaBoost = SklearnClassifier(base_ada, features=train_features)

knnloss = ugb.KnnAdaLossFunction(uniform_features, knn=10, uniform_label=1)
ugbKnn = ugb.UGradientBoostingClassifier(loss=knnloss, max_depth=4,
                                         n_estimators=n_estimators,
                                         learning_rate=0.4,
                                         train_features=train_features)
uGB_knnAda = SklearnClassifier(ugbKnn)

uboost_clf = uboost.uBoostClassifier(uniform_features=uniform_features,
                                     uniform_label=1,
                                     base_estimator=base_estimator,
                                     n_estimators=n_estimators,
                                     train_features=train_features,
                                     efficiency_steps=12, n_threads=4)
uBoost = SklearnClassifier(uboost_clf)

flatnessloss = ugb.KnnFlatnessLossFunction(uniform_features, fl_coefficient=3.,
                                           power=1.3, uniform_label=1)
ugbFL = ugb.UGradientBoostingClassifier(loss=flatnessloss, max_depth=4,
def test_simple_stacking_mn():
    base_mn = MatrixNetClassifier(iterations=10)
    check_classifier(
        SklearnClassifier(clf=AdaBoostClassifier(base_estimator=base_mn,
                                                 n_estimators=2)),
        has_staged_pp=True)
def clf_mayou(data1, data2, n_folds=3, n_base_clf=5):
    """DEVELOPMENT, WIP. Test a setup of clf involving bagging and stacking."""
    # import raredecay.analysis.ml_analysis as ml_ana
    # import pandas as pd
    import copy

    from rep.estimators import SklearnClassifier, XGBoostClassifier
    from rep.metaml.folding import FoldingClassifier
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.ensemble import BaggingClassifier  # , VotingClassifier, AdaBoostClassifier
    from rep.estimators.theanets import TheanetsClassifier
    from sklearn.linear_model import LogisticRegression
    from rep.metaml.cache import CacheClassifier
    from rep.report.metrics import RocAuc
    import rep.metaml.cache
    from rep.metaml._cache import CacheHelper

    rep.metaml.cache.cache_helper = CacheHelper('/home/mayou/cache', 100000)

    # data1.make_folds(n_folds)
    # data2.make_folds(n_folds)
    output = {}

    # for i in range(n_folds):
    xgb_clf = XGBoostClassifier(n_estimators=350, eta=0.1, max_depth=4, nthreads=3)
    xgb_folded = FoldingClassifier(base_estimator=xgb_clf, stratified=True,
                                   parallel_profile='threads-2')
    xgb_bagged = BaggingClassifier(base_estimator=xgb_folded,
                                   n_estimators=n_base_clf, bootstrap=False)
    xgb_bagged = SklearnClassifier(xgb_bagged)
    xgb_big_stacker = copy.deepcopy(xgb_bagged)
    xgb_bagged = CacheClassifier(name='xgb_bagged1', clf=xgb_bagged)

    xgb_single = XGBoostClassifier(n_estimators=350, eta=0.1, max_depth=4, nthreads=3)
    xgb_single = FoldingClassifier(base_estimator=xgb_single, stratified=True,
                                   n_folds=10, parallel_profile='threads-2')
    xgb_single = CacheClassifier(name='xgb_singled1', clf=xgb_single)

    rdf_clf = SklearnClassifier(RandomForestClassifier(n_estimators=300, n_jobs=3))
    rdf_folded = FoldingClassifier(base_estimator=rdf_clf, stratified=True,
                                   parallel_profile='threads-2')
    rdf_bagged = BaggingClassifier(base_estimator=rdf_folded,
                                   n_estimators=n_base_clf, bootstrap=False)
    rdf_bagged = SklearnClassifier(rdf_bagged)
    rdf_bagged = CacheClassifier(name='rdf_bagged1', clf=rdf_bagged)

    gb_clf = SklearnClassifier(GradientBoostingClassifier(n_estimators=50))
    gb_folded = FoldingClassifier(base_estimator=gb_clf, stratified=True,
                                  parallel_profile='threads-6')
    gb_bagged = BaggingClassifier(base_estimator=gb_folded,
                                  n_estimators=n_base_clf, bootstrap=False, n_jobs=5)
    gb_bagged = SklearnClassifier(gb_bagged)
    gb_bagged = CacheClassifier(name='gb_bagged1', clf=gb_bagged)

    nn_clf = TheanetsClassifier(layers=[300, 300], hidden_dropout=0.03,
                                trainers=[{'optimize': 'adagrad', 'patience': 5,
                                           'learning_rate': 0.2,
                                           'min_improvement': 0.1,
                                           'momentum': 0.4, 'nesterov': True,
                                           'loss': 'xe'}])
    nn_folded = FoldingClassifier(base_estimator=nn_clf, stratified=True,
                                  parallel_profile=None)  # 'threads-6')
    nn_bagged = BaggingClassifier(base_estimator=nn_folded,
                                  n_estimators=n_base_clf, bootstrap=False, n_jobs=1)
    nn_bagged = CacheClassifier(name='nn_bagged1', clf=nn_bagged)

    nn_single_clf = TheanetsClassifier(layers=[300, 300, 300], hidden_dropout=0.03,
                                       trainers=[{'optimize': 'adagrad', 'patience': 5,
                                                  'learning_rate': 0.2,
                                                  'min_improvement': 0.1,
                                                  'momentum': 0.4, 'nesterov': True,
                                                  'loss': 'xe'}])
    nn_single = FoldingClassifier(base_estimator=nn_single_clf, n_folds=3,
                                  stratified=True)
    nn_single = CacheClassifier(name='nn_single1', clf=nn_single)

    logit_stacker = SklearnClassifier(LogisticRegression(penalty='l2', solver='sag'))
    logit_stacker = FoldingClassifier(base_estimator=logit_stacker, n_folds=n_folds,
                                      stratified=True, parallel_profile='threads-6')
    logit_stacker = CacheClassifier(name='logit_stacker1', clf=logit_stacker)
    xgb_stacker = XGBoostClassifier(n_estimators=400, eta=0.1, max_depth=4, nthreads=8)
    # HACK
    xgb_stacker = xgb_big_stacker
    xgb_stacker = FoldingClassifier(base_estimator=xgb_stacker, n_folds=n_folds,
                                    random_state=42, stratified=True,
                                    parallel_profile='threads-6')
    xgb_stacker = CacheClassifier(name='xgb_stacker1', clf=xgb_stacker)

    # train1, test1 = data1.get_fold(i)
    # train2, test2 = data1.get_fold(i)
    #
    # t_data, t_targets, t_weights =
    data, targets, weights = data1.make_dataset(data2, weights_ratio=1)

    # xgb_bagged.fit(data, targets, weights)
    # xgb_report = xgb_bagged.test_on(data, targets, weights)
    # xgb_report.roc(physics_notion=True).plot(new_plot=True,
    #                                          title="ROC AUC xgb_base classifier")
    # output['xgb_base'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    # xgb_proba = xgb_report.prediction['clf'][:, 1]
    # del xgb_bagged, xgb_folded, xgb_clf, xgb_report
    #
    # xgb_single.fit(data, targets, weights)
    # xgb_report = xgb_single.test_on(data, targets, weights)
    # xgb_report.roc(physics_notion=True).plot(new_plot=True,
    #                                          title="ROC AUC xgb_single classifier")
    # output['xgb_single'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    # xgb_proba = xgb_report.prediction['clf'][:, 1]
    # del xgb_single, xgb_report

    nn_single.fit(data, targets, weights)
    nn_report = nn_single.test_on(data, targets, weights)
    nn_report.roc(physics_notion=True).plot(new_plot=True,
                                            title="ROC AUC nn_single classifier")
    output['nn_single'] = "roc auc:" + str(nn_report.compute_metric(metric=RocAuc()))
    # nn_proba = nn_report.prediction['clf'][:, 1]
    del nn_single, nn_report

    # rdf_bagged.fit(data, targets, weights)
    # rdf_report = rdf_bagged.test_on(data, targets, weights)
    # rdf_report.roc(physics_notion=True).plot(new_plot=True,
    #                                          title="ROC AUC rdf_base classifier")
    # output['rdf_base'] = "roc auc:" + str(rdf_report.compute_metric(metric=RocAuc()))
    # rdf_proba = rdf_report.prediction['clf'][:, 1]
    # del rdf_bagged, rdf_clf, rdf_folded, rdf_report

    # gb_bagged.fit(data, targets, weights)
    # gb_report = gb_bagged.test_on(data, targets, weights)
    # gb_report.roc(physics_notion=True).plot(new_plot=True,
    #                                         title="ROC AUC gb_base classifier")
    # output['gb_base'] = "roc auc:" + str(gb_report.compute_metric(metric=RocAuc()))
    # gb_proba = gb_report.prediction['clf'][:, 1]
    # del gb_bagged, gb_clf, gb_folded, gb_report

    # nn_bagged.fit(data, targets, weights)
    # nn_report = nn_bagged.test_on(data, targets, weights)
    # nn_report.roc(physics_notion=True).plot(new_plot=True,
    #                                         title="ROC AUC nn_base classifier")
    # output['nn_base'] = "roc auc:" + str(nn_report.compute_metric(metric=RocAuc()))
    # nn_proba = nn_report.prediction['clf'][:, 1]
    # del nn_bagged, nn_clf, nn_folded, nn_report
    #
    # base_predict = pd.DataFrame({'xgb': xgb_proba,
    #                              # 'rdf': rdf_proba,
    #                              # 'gb': gb_proba,
    #                              'nn': nn_proba
    #                              })
    #
    # xgb_stacker.fit(base_predict, targets, weights)
    # xgb_report = xgb_stacker.test_on(base_predict, targets, weights)
    # xgb_report.roc(physics_notion=True).plot(new_plot=True,
    #                                          title="ROC AUC xgb_stacked classifier")
    # output['stacker_xgb'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    # del xgb_stacker, xgb_report
    #
    # logit_stacker.fit(base_predict, targets, weights)
    # logit_report = logit_stacker.test_on(base_predict, targets, weights)
    # logit_report.roc(physics_notion=True).plot(new_plot=True,
    #                                            title="ROC AUC logit_stacked classifier")
    # output['stacker_logit'] = "roc auc:" + str(logit_report.compute_metric(metric=RocAuc()))
    # del logit_stacker, logit_report

    print(output)
", test_size=" + str(uconfig.training.size)) # create classifiers classifiers = ClassifiersFactory() weights = OrderedDict() # standard bdt if "bdt" in uconfig.training.algorithms: base_grad = GradientBoostingClassifier( max_depth=uconfig.hyper.max_depth, n_estimators=uconfig.hyper.n_estimators, subsample=uconfig.hyper.subsample, learning_rate=uconfig.hyper.learning_rate, min_samples_leaf=uconfig.hyper.min_samples_leaf, ) classifiers["bdt"] = SklearnClassifier(base_grad, features=uconfig.features.train) weights["bdt"] = trainW[uconfig.training.algorithms["bdt"]] # uniform bdt if "ubdt" in uconfig.training.algorithms: if uconfig.hyper.uloss == "log": from mods import flat_log_loss flat_log_loss() flatnessloss = ugb.BinFlatnessLossFunction( uconfig.features.uniform, fl_coefficient=uconfig.hyper.fl_coefficient, power=uconfig.hyper.power, uniform_label=uconfig.hyper.uniform_label, n_bins=uconfig.hyper.n_bins, ) ugbFL = ugb.UGradientBoostingClassifier(
if primitiv:
    X = pd.DataFrame({'odin': np.array([2., 2., 2., 2., 3., 3., 2., 3.,
                                        8., 7., 8., 7., 8., 8., 7., 8.]),
                      'dwa': np.array([2.2, 2.1, 2.2, 2.3, 3.1, 3.1, 2.1, 3.2,
                                       8.1, 7.5, 8.2, 7.1, 8.5, 8.2, 7.6, 8.1])})
    y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])
    w = np.ones(16)
    branch_names = ['odin', 'dwa']
print(branch_names)

X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, w, test_size=0.33)
lds = LabeledDataStorage(X_test, y_test, w_test)

# CLASSIFIER
clf_stacking = SklearnClassifier(RandomForestClassifier(n_estimators=5000,
                                                        bootstrap=False,
                                                        n_jobs=7))
# clf_stacking = XGBoostClassifier(n_estimators=700, eta=0.1, nthreads=8,
#                                  subsample=0.5)
# clf_stacking = 'nn'
clf = Mayou(base_estimators={'xgb': None}, bagging_base=None, bagging_stack=8,
            stacking=clf_stacking, features_stack=branch_names,
            transform=False, transform_pred=False)
# clf = SklearnClassifier(GaussianNB())
# clf = SklearnClassifier(BaggingClassifier(n_jobs=1, max_features=1.,
#                                           bootstrap=False, base_estimator=clf,
#                                           n_estimators=20, max_samples=0.1))
# clf = XGBoostClassifier(n_estimators=400, eta=0.1, nthreads=6)
# clf = SklearnClassifier(BaggingClassifier(clf, max_samples=0.8))
# clf = SklearnClassifier(NuSVC(cache_size=1000000))
# clf = SklearnClassifier(clf)

if folding:
def test_simple_stacking_pybrain():
    base_pybrain = PyBrainClassifier(epochs=2)
    base_bagging = BaggingClassifier(base_estimator=base_pybrain, n_estimators=3)
    check_classifier(SklearnClassifier(clf=base_bagging), **classifier_params)
data = data.drop('g', axis=1)

import numpy
import numexpr
import pandas
from rep import utils
from sklearn.ensemble import GradientBoostingClassifier
from rep.report.metrics import RocAuc
from rep.metaml import GridOptimalSearchCV, FoldingScorer, RandomParameterOptimizer
from rep.estimators import SklearnClassifier, TMVAClassifier, XGBoostRegressor

# define grid parameters
grid_param = {}
grid_param['learning_rate'] = [0.2, 0.1, 0.05, 0.02, 0.01]
grid_param['max_depth'] = [2, 3, 4, 5]

# use random hyperparameter optimization algorithm
generator = RandomParameterOptimizer(grid_param)

# define folding scorer
scorer = FoldingScorer(RocAuc(), folds=3, fold_checks=3)

estimator = SklearnClassifier(GradientBoostingClassifier(n_estimators=30))

# grid_finder = GridOptimalSearchCV(estimator, generator, scorer)
# %time grid_finder.fit(data, labels)
grid_finder = GridOptimalSearchCV(estimator, generator, scorer,
                                  parallel_profile="default")
print("start grid search")
grid_finder.fit(data, labels)
grid_finder.params_generator.print_results()
assert 10 == grid_finder.params_generator.n_evaluations, "oops"
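# The winning configuration can be read back from the parameter generator,
# via the same attributes the other grid-search snippets in this listing use:
print(grid_finder.params_generator.best_params_)
print(grid_finder.params_generator.best_score_)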