Example #1
def test_sklearn_classification():
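    # check_classifier is REP's test helper: it fits the wrapped estimator
    # on generated data and checks its predictions and probabilities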
    check_classifier(
        SklearnClassifier(clf=AdaBoostClassifier(n_estimators=10)))
    check_classifier(
        SklearnClassifier(clf=AdaBoostClassifier(n_estimators=10)),
        n_classes=3)
    check_classifier(
        SklearnClassifier(clf=GradientBoostingClassifier(n_estimators=10)))
Example #2
def test_folding_classifier():
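    # FoldingClassifier fits one copy of the base estimator per fold and
    # returns out-of-fold predictions, so no event is predicted by a model
    # that saw it during training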
    base_ada = SklearnClassifier(AdaBoostClassifier())
    folding_str = FoldingClassifier(base_ada, n_folds=2)
    check_folding(folding_str, True, True, True)

    base_log_reg = SklearnClassifier(LogisticRegression())
    folding_str = FoldingClassifier(base_log_reg, n_folds=4)
    check_folding(folding_str, True, False, False, False)
Example #3
def test_folding_classifier():
    base_ada = SklearnClassifier(AdaBoostClassifier())
    folding_str = FoldingClassifier(base_ada, n_folds=2)
    check_folding(folding_str, True, True, True)

    base_svm = SklearnClassifier(SVC())
    folding_str = FoldingClassifier(base_svm, n_folds=4)
    check_folding(folding_str, True, False, False)
Example #4
def test_sklearn_classification():
    # supports weights
    check_classifier(
        SklearnClassifier(clf=AdaBoostClassifier(n_estimators=10)))
    check_classifier(
        SklearnClassifier(clf=AdaBoostClassifier(n_estimators=10)),
        n_classes=3)
    # doesn't support weights
    check_classifier(
        SklearnClassifier(clf=GradientBoostingClassifier(n_estimators=10)),
        supports_weight=False)
Example #5
def test_own_classification_reports():
    """
    testing clf.test_on
    """
    X, y, sample_weight = generate_classification_data()
    clf = SklearnClassifier(RandomForestClassifier())
    clf.fit(X, y, sample_weight=sample_weight)
    report = clf.test_on(X, y, sample_weight=sample_weight)
    roc1 = report.compute_metric(RocAuc())

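    # the same data wrapped in a LabeledDataStorage must produce an identical report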
    lds = LabeledDataStorage(X, y, sample_weight=sample_weight)
    roc2 = clf.test_on_lds(lds=lds).compute_metric(RocAuc())
    assert roc1 == roc2, 'Something wrong with test_on'
Example #6
def test_gridsearch_metrics_threads(n_threads=3):
    X, y, sample_weight = generate_classification_data(n_classes=2,
                                                       distance=0.7)
    param_grid = OrderedDict({'reg_param': numpy.linspace(0, 1, 20)})

    from itertools import cycle

    optimizers = cycle([
        RegressionParameterOptimizer(param_grid=param_grid,
                                     n_evaluations=4,
                                     start_evaluations=2),
        SubgridParameterOptimizer(param_grid=param_grid, n_evaluations=4),
        RandomParameterOptimizer(param_grid=param_grid, n_evaluations=4),
    ])

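    # pair each metric with the next optimizer strategy from the cycle above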
    for metric in [RocAuc(), OptimalAMS(), OptimalSignificance(), log_loss]:
        scorer = FoldingScorer(metric)
        clf = SklearnClassifier(QDA())
        grid = GridOptimalSearchCV(
            estimator=clf,
            params_generator=next(optimizers),
            scorer=scorer,
            parallel_profile='threads-{}'.format(n_threads))
        grid.fit(X, y)
        print(grid.params_generator.best_score_)
        print(grid.params_generator.best_params_)
        grid.params_generator.print_results()
Example #7
def test_simple_stacking_pybrain():
    base_pybrain = PyBrainClassifier()
    check_classifier(SklearnClassifier(
        clf=BaggingClassifier(base_estimator=base_pybrain, n_estimators=3)),
                     has_staged_pp=False,
                     has_importances=False,
                     supports_weight=False)
Example #8
def test_gridsearch_sklearn():
    metric = numpy.random.choice([OptimalAMS(), RocAuc(), LogLoss()])
    scorer = ClassificationFoldingScorer(metric)
    maximization = True
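    # log-loss is minimized; the other candidate metrics are maximized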
    if isinstance(metric, LogLoss):
        maximization = False
    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]
    })
    generator = RegressionParameterOptimizer(grid_param,
                                             n_evaluations=4,
                                             maximize=maximization)

    grid = GridOptimalSearchCV(SklearnClassifier(clf=AdaBoostClassifier()),
                               generator,
                               scorer,
                               parallel_profile='threads-3')

    _ = check_grid(grid, False, False, False, use_weights=True)
    classifier = check_grid(grid, False, False, False, use_weights=False)

    # Check parameters of best fitted classifier
    assert 2 <= len(classifier.features) <= 3, 'Features were not set'
    params = classifier.get_params()
    for key in grid_param:
        if key in params:
            assert params[key] == grid.generator.best_params_[key]
        else:
            assert params['clf__' + key] == grid.generator.best_params_[key]
Example #9
def test_complex_stacking_mn():
    # Ada over kFold over MatrixNet
    base_kfold = FoldingClassifier(base_estimator=MatrixNetClassifier(
        iterations=30))
    check_classifier(SklearnClassifier(
        clf=AdaBoostClassifier(base_estimator=base_kfold, n_estimators=3)),
                     has_staged_pp=False,
                     has_importances=False)
Example #10
    def _make_clf(self, clf, bagging=None):
        """Creates a classifier from a dict or returns the clf"""
        if isinstance(clf, dict):
            key, val = clf.popitem()
            try:
                val = self.__DEFAULT_CLF_CFG[key] if val is None else val
            except KeyError:
                logger.error(str(key) + " not an implemented classifier.")
                raise

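            # an explicit bagging argument overrides the config's 'bagging' entry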
            temp_bagging = val.pop('bagging', bagging)
            bagging = temp_bagging if bagging is None else bagging

            if key == 'rdf':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = SklearnClassifier(RandomForestClassifier(**config_clf))
            elif key == 'erf':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = SklearnClassifier(ExtraTreesClassifier(**config_clf))
            elif key == 'nn':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = TheanetsClassifier(**config_clf)
            elif key == 'ada':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = SklearnClassifier(AdaBoostClassifier(**config_clf))
            elif key == 'gb':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = SklearnClassifier(GradientBoostingClassifier(**config_clf))
            elif key == 'xgb':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = XGBoostClassifier(**config_clf)
            elif hasattr(clf, 'fit'):
                bagging = False  # return the classifier

            # bagging over the instantiated estimators
            if isinstance(bagging, int) and bagging >= 1:
                bagging = dict(self.__DEFAULT_BAG_CFG, n_estimators=bagging)
            if isinstance(bagging, dict):
                # TODO: implement multi-thread:
                bagging.update({'base_estimator': clf})
                clf = SklearnClassifier(BaggingClassifier(**bagging))
        else:
            raise ValueError(str(clf) + " not valid as a classifier.")

        clf = {key: clf}
        return clf
Example #11
File: myfuncs.py  Project: ezbc/airbnb
def fit_categorical_labels(df_train, df_test, df_labels,
        fit_type='regressor', fit_framework='theanets', labels_list=None):

    from rep.estimators import SklearnClassifier, SklearnRegressor
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import GradientBoostingRegressor
    from rep.estimators.neurolab import NeurolabRegressor
    from rep.estimators.theanets import TheanetsRegressor
    from rep.estimators import XGBoostRegressor  # needed by the 'xgboost' branch below

    # Using gradient boosting with default settings
    if fit_framework == 'sklearn':
        if fit_type == 'classifier':
            sk = SklearnClassifier(GradientBoostingClassifier(),
                                   features=df_train.columns.values)
        elif fit_type == 'regressor':
            sk = SklearnRegressor(GradientBoostingRegressor(),
                                  features=df_train.columns.values)
    elif fit_framework == 'neural':
        if fit_type == 'regressor':
            sk = NeurolabRegressor(features=df_train.columns.values)
    elif fit_framework == 'xgboost':
        if fit_type == 'regressor':
            sk = XGBoostRegressor(features=df_train.columns.values)
    elif fit_framework == 'theanets':
        if fit_type == 'regressor':
            sk = TheanetsRegressor(features=df_train.columns.values)
    else:
        raise ValueError('No correct combo of fit_type and fit_framework found')

    prediction_array = np.empty((len(df_test), len(df_labels.columns)))
    for i, column in enumerate(df_labels.columns.values):
        # get a single column to predict
        labels = df_labels[column]

        # fit the data with the training set
        sk.fit(df_train, labels)

        # predict new countries
        prediction = np.squeeze(sk.predict(df_test))
        prediction_array[:, i] = prediction

        #prediction = pd.read_pickle(filename).squeeze()

    df_predict = pd.DataFrame(prediction_array, columns=df_labels.columns.values)
    df_predict = gather_dummy_predictions(df_predict, labels_list)

    #print('unique labels', np.unique(df_predict))

    return df_predict
Example #12
def test_classifier_with_dataframe():
    try:
        from rep.estimators import SklearnClassifier
        clf = SklearnClassifier(GradientBoostingClassifier(n_estimators=1))
        X, y = generate_sample(n_samples=100, n_features=4)
        for X_ in [X, pandas.DataFrame(X)]:
            lookup = LookupClassifier(clf, n_bins=16).fit(X_, y)
            lookup.predict_proba(X)
    except ImportError:
        print('expected fail: yandex/rep not installed')
Example #13
def test_gridsearch_threads(n_threads=3):
    scorer = FoldingScorer(numpy.random.choice([OptimalAMS(), RocAuc()]))

    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]
    })
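    # RegressionParameterOptimizer starts from random evaluations, then
    # regresses the observed scores to propose promising parameter points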
    generator = RegressionParameterOptimizer(grid_param, n_evaluations=4)

    base = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(base,
                               generator,
                               scorer,
                               parallel_profile='threads-{}'.format(n_threads))

    X, y, sample_weight = generate_classification_data()
    grid.fit(X, y, sample_weight=sample_weight)
Example #14
def grid_sklearn(score_function):
    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]
    })
    generator = RegressionParameterOptimizer(grid_param)
    scorer = FoldingScorer(score_function)

    grid = GridOptimalSearchCV(SklearnClassifier(clf=AdaBoostClassifier()),
                               generator, scorer)

    cl = check_grid(grid, False, False, False)
    assert 1 <= len(cl.features) <= 3
    params = cl.get_params()
    for key in grid_param:
        if key in params:
            assert params[key] == grid.generator.best_params_[key]
        else:
            assert params['clf__' + key] == grid.generator.best_params_[key]
Example #15
def grid_custom(custom):
    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]
    })
    generator = SubgridParameterOptimizer(grid_param)

    grid = GridOptimalSearchCV(
        SklearnClassifier(clf=AdaBoostClassifier(),
                          features=['column0', 'column1']), generator, custom)

    cl = check_grid(grid, False, False, False)
    assert 1 <= len(cl.features) <= 3
    params = cl.get_params()
    for key in grid_param:
        if key in params:
            assert params[key] == grid.generator.best_params_[key]
        else:
            assert params['clf__' + key] == grid.generator.best_params_[key]
Example #16
def test_grid_with_custom_scorer():
    """
    A custom scorer that always uses all of the data passed to gridsearch.fit
    for training and evaluates on a separate fixed dataset (given to the scorer)
    by computing sklearn's roc_auc_score.
    """
    class CustomScorer(object):
        def __init__(self, testX, testY):
            self.testY = testY
            self.testX = testX

        def __call__(self, base_estimator, params, X, y, sample_weight=None):
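            # fit a fresh clone on whatever the grid search passes in, then
            # score on the fixed hold-out set captured at construction time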
            cl = clone(base_estimator)
            cl.set_params(**params)
            if sample_weight is not None:
                cl.fit(X, y, sample_weight)
            else:
                cl.fit(X, y)
            return roc_auc_score(self.testY,
                                 cl.predict_proba(self.testX)[:, 1])

    X, y, _ = generate_classification_data()
    custom_scorer = CustomScorer(X, y)

    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]
    })
    generator = SubgridParameterOptimizer(grid_param)

    base_estimator = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(base_estimator, generator, custom_scorer)

    cl = check_grid(grid, False, False, False)
    assert len(cl.features) <= 3
    params = cl.get_params()
    for key in grid_param:
        if key in params:
            assert params[key] == grid.generator.best_params_[key]
        else:
            assert params['clf__' + key] == grid.generator.best_params_[key]
Example #17
       'IP_p0p2', 'IP_p1p2', 'isolationa', 'isolationb', 'isolationc',
       'isolationd', 'isolatione', 'isolationf', 'iso', 'CDF1', 'CDF2',
       'p0_track_Chi2Dof', 'p1_track_Chi2Dof', 'p2_track_Chi2Dof', 'p0_IP',
       'p1_IP', 'p2_IP', 'p0_IPSig', 'p1_IPSig', 'p2_IPSig',
       'p0_eta', 'p1_eta', 'p2_eta']

uniform_features = ["mass"]

n_estimators = 150
base_estimator = DecisionTreeClassifier(max_depth=4)

base_ada = GradientBoostingClassifier(max_depth=4, n_estimators=100, learning_rate=0.1)
AdaBoost = SklearnClassifier(base_ada, features=train_features)


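# KnnAdaLossFunction pushes the classifier response to be uniform in the
# uniform_features ('mass') for events with uniform_label=1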
knnloss = ugb.KnnAdaLossFunction(uniform_features, knn=10, uniform_label=1)
ugbKnn = ugb.UGradientBoostingClassifier(loss=knnloss, max_depth=4, n_estimators=n_estimators,
                                        learning_rate=0.4, train_features=train_features)
uGBknnAda = SklearnClassifier(ugbKnn)

uboost_clf = uboost.uBoostClassifier(uniform_features=uniform_features, uniform_label=1,
                                     base_estimator=base_estimator, 
                                     n_estimators=n_estimators, train_features=train_features, 
                                     efficiency_steps=12, n_threads=4)
uBoost = SklearnClassifier(uboost_clf)

flatnessloss = ugb.KnnFlatnessLossFunction(uniform_features, fl_coefficient=3., power=1.3, uniform_label=1)
ugbFL = ugb.UGradientBoostingClassifier(loss=flatnessloss, max_depth=4, 
Example #18
def test_simple_stacking_mn():
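    # sklearn's AdaBoostClassifier accepts any base estimator with a
    # sklearn-style interface, here REP's MatrixNetClassifier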
    base_mn = MatrixNetClassifier(iterations=10)
    check_classifier(SklearnClassifier(
        clf=AdaBoostClassifier(base_estimator=base_mn, n_estimators=2)),
                     has_staged_pp=True)
Example #19
def clf_mayou(data1, data2, n_folds=3, n_base_clf=5):
    """DEVELOPEMENT, WIP. Test a setup of clf involving bagging and stacking."""
    # import raredecay.analysis.ml_analysis as ml_ana
    # import pandas as pd
    import copy

    from rep.estimators import SklearnClassifier, XGBoostClassifier
    from rep.metaml.folding import FoldingClassifier
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.ensemble import BaggingClassifier  # , VotingClassifier, AdaBoostClassifier
    from rep.estimators.theanets import TheanetsClassifier
    from sklearn.linear_model import LogisticRegression
    from rep.metaml.cache import CacheClassifier

    from rep.report.metrics import RocAuc

    import rep.metaml.cache
    from rep.metaml._cache import CacheHelper
    rep.metaml.cache.cache_helper = CacheHelper('/home/mayou/cache', 100000)

    #    data1.make_folds(n_folds)
    #    data2.make_folds(n_folds)
    output = {}

    # for i in range(n_folds):
    xgb_clf = XGBoostClassifier(n_estimators=350,
                                eta=0.1,
                                max_depth=4,
                                nthreads=3)
    xgb_folded = FoldingClassifier(base_estimator=xgb_clf,
                                   stratified=True,
                                   parallel_profile='threads-2')
    xgb_bagged = BaggingClassifier(base_estimator=xgb_folded,
                                   n_estimators=n_base_clf,
                                   bootstrap=False)
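    # re-wrap the sklearn bagger so it exposes REP's estimator interface again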
    xgb_bagged = SklearnClassifier(xgb_bagged)
    xgb_big_stacker = copy.deepcopy(xgb_bagged)
    xgb_bagged = CacheClassifier(name='xgb_bagged1', clf=xgb_bagged)

    xgb_single = XGBoostClassifier(n_estimators=350,
                                   eta=0.1,
                                   max_depth=4,
                                   nthreads=3)
    xgb_single = FoldingClassifier(base_estimator=xgb_single,
                                   stratified=True,
                                   n_folds=10,
                                   parallel_profile='threads-2')
    xgb_single = CacheClassifier(name='xgb_singled1', clf=xgb_single)

    rdf_clf = SklearnClassifier(
        RandomForestClassifier(n_estimators=300, n_jobs=3))
    rdf_folded = FoldingClassifier(base_estimator=rdf_clf,
                                   stratified=True,
                                   parallel_profile='threads-2')
    rdf_bagged = BaggingClassifier(base_estimator=rdf_folded,
                                   n_estimators=n_base_clf,
                                   bootstrap=False)
    rdf_bagged = SklearnClassifier(rdf_bagged)
    rdf_bagged = CacheClassifier(name='rdf_bagged1', clf=rdf_bagged)

    gb_clf = SklearnClassifier(GradientBoostingClassifier(n_estimators=50))
    gb_folded = FoldingClassifier(base_estimator=gb_clf,
                                  stratified=True,
                                  parallel_profile='threads-6')
    gb_bagged = BaggingClassifier(base_estimator=gb_folded,
                                  n_estimators=n_base_clf,
                                  bootstrap=False,
                                  n_jobs=5)
    gb_bagged = SklearnClassifier(gb_bagged)
    gb_bagged = CacheClassifier(name='gb_bagged1', clf=gb_bagged)

    nn_clf = TheanetsClassifier(layers=[300, 300],
                                hidden_dropout=0.03,
                                trainers=[{
                                    'optimize': 'adagrad',
                                    'patience': 5,
                                    'learning_rate': 0.2,
                                    'min_improvement': 0.1,
                                    'momentum': 0.4,
                                    'nesterov': True,
                                    'loss': 'xe'
                                }])
    nn_folded = FoldingClassifier(base_estimator=nn_clf,
                                  stratified=True,
                                  parallel_profile=None)  # 'threads-6')
    nn_bagged = BaggingClassifier(base_estimator=nn_folded,
                                  n_estimators=n_base_clf,
                                  bootstrap=False,
                                  n_jobs=1)
    nn_bagged = CacheClassifier(name='nn_bagged1', clf=nn_bagged)

    nn_single_clf = TheanetsClassifier(layers=[300, 300, 300],
                                       hidden_dropout=0.03,
                                       trainers=[{
                                           'optimize': 'adagrad',
                                           'patience': 5,
                                           'learning_rate': 0.2,
                                           'min_improvement': 0.1,
                                           'momentum': 0.4,
                                           'nesterov': True,
                                           'loss': 'xe'
                                       }])
    nn_single = FoldingClassifier(base_estimator=nn_single_clf,
                                  n_folds=3,
                                  stratified=True)
    nn_single = CacheClassifier(name='nn_single1', clf=nn_single)

    logit_stacker = SklearnClassifier(
        LogisticRegression(penalty='l2', solver='sag'))
    logit_stacker = FoldingClassifier(base_estimator=logit_stacker,
                                      n_folds=n_folds,
                                      stratified=True,
                                      parallel_profile='threads-6')
    logit_stacker = CacheClassifier(name='logit_stacker1', clf=logit_stacker)

    xgb_stacker = XGBoostClassifier(n_estimators=400,
                                    eta=0.1,
                                    max_depth=4,
                                    nthreads=8)
    # HACK
    xgb_stacker = xgb_big_stacker
    xgb_stacker = FoldingClassifier(base_estimator=xgb_stacker,
                                    n_folds=n_folds,
                                    random_state=42,
                                    stratified=True,
                                    parallel_profile='threads-6')
    xgb_stacker = CacheClassifier(name='xgb_stacker1', clf=xgb_stacker)

    #        train1, test1 = data1.get_fold(i)
    #        train2, test2 = data1.get_fold(i)
    #
    #        t_data, t_targets, t_weights =
    data, targets, weights = data1.make_dataset(data2, weights_ratio=1)

    #    xgb_bagged.fit(data, targets, weights)
    #    xgb_report = xgb_bagged.test_on(data, targets, weights)
    #    xgb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC xgb_base classifier")
    #    output['xgb_base'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    #    xgb_proba = xgb_report.prediction['clf'][:, 1]
    #    del xgb_bagged, xgb_folded, xgb_clf, xgb_report
    #
    #    xgb_single.fit(data, targets, weights)
    #    xgb_report = xgb_single.test_on(data, targets, weights)
    #    xgb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC xgb_single classifier")
    #    output['xgb_single'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    #    xgb_proba = xgb_report.prediction['clf'][:, 1]
    #    del xgb_single, xgb_report

    nn_single.fit(data, targets, weights)
    nn_report = nn_single.test_on(data, targets, weights)
    nn_report.roc(physics_notion=True).plot(
        new_plot=True, title="ROC AUC nn_single classifier")
    output['nn_single'] = "roc auc:" + str(
        nn_report.compute_metric(metric=RocAuc()))
    # nn_proba = nn_report.prediction['clf'][:, 1]
    del nn_single, nn_report

    #    rdf_bagged.fit(data, targets, weights)
    #    rdf_report = rdf_bagged.test_on(data, targets, weights)
    #    rdf_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC rdf_base classifier")
    #    output['rdf_base'] = "roc auc:" + str(rdf_report.compute_metric(metric=RocAuc()))
    #    rdf_proba = rdf_report.prediction['clf'][:, 1]
    #    del rdf_bagged, rdf_clf, rdf_folded, rdf_report

    #    gb_bagged.fit(data, targets, weights)
    #    gb_report = gb_bagged.test_on(data, targets, weights)
    #    gb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC gb_base classifier")
    #    output['gb_base'] = "roc auc:" + str(gb_report.compute_metric(metric=RocAuc()))
    #    gb_proba = gb_report.prediction['clf'][:, 1]
    #    del gb_bagged, gb_clf, gb_folded, gb_report

    #    nn_bagged.fit(data, targets, weights)
    #    nn_report = nn_bagged.test_on(data, targets, weights)
    #    nn_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC nn_base classifier")
    #    output['nn_base'] = "roc auc:" + str(nn_report.compute_metric(metric=RocAuc()))
    #    nn_proba = nn_report.prediction['clf'][:, 1]
    #    del nn_bagged, nn_clf, nn_folded, nn_report
    #
    #    base_predict = pd.DataFrame({'xgb': xgb_proba,
    #                                 #'rdf': rdf_proba,
    #                                 #'gb': gb_proba,
    #                                 'nn': nn_proba
    #                                 })
    #
    #
    #    xgb_stacker.fit(base_predict, targets, weights)
    #    xgb_report = xgb_stacker.test_on(base_predict, targets, weights)
    #    xgb_report.roc(physics_notion=True).plot(new_plot=True,
    #    title="ROC AUC xgb_stacked classifier")
    #    output['stacker_xgb'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    #    del xgb_stacker, xgb_report
    #
    #    logit_stacker.fit(base_predict, targets, weights)
    #    logit_report = logit_stacker.test_on(base_predict, targets, weights)
    #    logit_report.roc(physics_notion=True).plot(new_plot=True,
    #    title="ROC AUC logit_stacked classifier")
    #    output['stacker_logit'] = "roc auc:" + str(logit_report.compute_metric(metric=RocAuc()))
    #    del logit_stacker, logit_report

    print(output)
Example #20
           ", test_size=" + str(uconfig.training.size))

# create classifiers
classifiers = ClassifiersFactory()
weights = OrderedDict()

# standard bdt
if "bdt" in uconfig.training.algorithms:
    base_grad = GradientBoostingClassifier(
        max_depth=uconfig.hyper.max_depth,
        n_estimators=uconfig.hyper.n_estimators,
        subsample=uconfig.hyper.subsample,
        learning_rate=uconfig.hyper.learning_rate,
        min_samples_leaf=uconfig.hyper.min_samples_leaf,
    )
    classifiers["bdt"] = SklearnClassifier(base_grad,
                                           features=uconfig.features.train)
    weights["bdt"] = trainW[uconfig.training.algorithms["bdt"]]

# uniform bdt
if "ubdt" in uconfig.training.algorithms:
    if uconfig.hyper.uloss == "log":
        from mods import flat_log_loss
        flat_log_loss()
    flatnessloss = ugb.BinFlatnessLossFunction(
        uconfig.features.uniform,
        fl_coefficient=uconfig.hyper.fl_coefficient,
        power=uconfig.hyper.power,
        uniform_label=uconfig.hyper.uniform_label,
        n_bins=uconfig.hyper.n_bins,
    )
    ugbFL = ugb.UGradientBoostingClassifier(
Example #21
    if primitiv:
        X = pd.DataFrame({'odin': np.array([2., 2., 2., 2., 3., 3., 2., 3., 8.,
                                            7., 8., 7., 8., 8., 7., 8.]),
                          'dwa': np.array([2.2, 2.1, 2.2, 2.3, 3.1, 3.1, 2.1, 3.2, 8.1,
                                           7.5, 8.2, 7.1, 8.5, 8.2, 7.6, 8.1])
                          })
        y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])
        w = np.ones(16)
        branch_names = ['odin', 'dwa']
    print(branch_names)
    X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, w, test_size=0.33)

    lds = LabeledDataStorage(X_test, y_test, w_test)
    # CLASSIFIER
    clf_stacking = SklearnClassifier(RandomForestClassifier(n_estimators=5000, bootstrap=False,
                                                            n_jobs=7))
    # clf_stacking = XGBoostClassifier(n_estimators=700, eta=0.1, nthreads=8,
    #                                 subsample=0.5
    #                                 )
    # clf_stacking='nn'
    clf = Mayou(base_estimators={'xgb': None}, bagging_base=None, bagging_stack=8,
                stacking=clf_stacking, features_stack=branch_names,
                transform=False, transform_pred=False)
    # clf = SklearnClassifier(GaussianNB())
    # clf = SklearnClassifier(BaggingClassifier(n_jobs=1, max_features=1.,
    # bootstrap=False, base_estimator=clf, n_estimators=20, max_samples=0.1))
    # clf = XGBoostClassifier(n_estimators=400, eta=0.1, nthreads=6)
    # clf = SklearnClassifier(BaggingClassifier(clf, max_samples=0.8))
    # clf = SklearnClassifier(NuSVC(cache_size=1000000))
    # clf = SklearnClassifier(clf)
    if folding:
Example #22
def test_simple_stacking_pybrain():
    base_pybrain = PyBrainClassifier(epochs=2)
    base_bagging = BaggingClassifier(base_estimator=base_pybrain, n_estimators=3)
    check_classifier(SklearnClassifier(clf=base_bagging), **classifier_params)
Example #23
data = data.drop('g', axis=1)
import numpy
import numexpr
import pandas
from rep import utils
from sklearn.ensemble import GradientBoostingClassifier
from rep.report.metrics import RocAuc
from rep.metaml import GridOptimalSearchCV, FoldingScorer, RandomParameterOptimizer
from rep.estimators import SklearnClassifier, TMVAClassifier, XGBoostRegressor
# define grid parameters
grid_param = {}
grid_param['learning_rate'] = [0.2, 0.1, 0.05, 0.02, 0.01]
grid_param['max_depth'] = [2, 3, 4, 5]
# use random hyperparameter optimization algorithm
generator = RandomParameterOptimizer(grid_param)
# define folding scorer
scorer = FoldingScorer(RocAuc(), folds=3, fold_checks=3)
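# 3-fold cross-validation; each parameter point is scored on all 3 folds
# and the results are averaged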
estimator = SklearnClassifier(GradientBoostingClassifier(n_estimators=30))
#grid_finder = GridOptimalSearchCV(estimator, generator, scorer)
#% time grid_finder.fit(data, labels)
grid_finder = GridOptimalSearchCV(estimator,
                                  generator,
                                  scorer,
                                  parallel_profile="default")
print "start grid search"
grid_finder.fit(data, labels)

grid_finder.params_generator.print_results()

assert 10 == grid_finder.params_generator.n_evaluations, "oops"