Example #1
def _classification_mask_report(report, mask, X, labels_dict):
    report.features_correlation_matrix(mask=mask).plot()
    report.features_correlation_matrix_by_class(
        mask=mask, labels_dict=labels_dict).plot()
    report.efficiencies(features=X.columns[1:3],
                        mask=mask,
                        labels_dict=labels_dict).plot()
    report.features_pdf(mask=mask, labels_dict=labels_dict).plot()
    report.learning_curve(RocAuc(), mask=mask, metric_label='roc').plot()
    significance = lambda s, b: s / (numpy.sqrt(b) + 0.01)
    report.metrics_vs_cut(significance, mask=mask, metric_label='sign').plot()
    report.prediction_pdf(mask=mask, labels_dict=labels_dict).plot()
    report.scatter([X.columns[:2], X.columns[1:3]],
                   mask=mask,
                   labels_dict=labels_dict).plot()
    report.feature_importance().plot()
    if labels_dict is None:
        report.feature_importance_shuffling(mask=mask).plot()
        report.roc(mask=mask).plot()
        report.roc(mask=mask, physics_notion=False).plot()
    report.efficiencies_2d(['column0', 'column1'],
                           0.3,
                           mask=mask,
                           labels_dict=labels_dict)
    print(report.compute_metric(RocAuc()))
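A minimal driver sketch for this helper, assuming the binary setup from Example #2. generate_classification_data is assumed to be importable from the same test utilities used there, the string mask follows the convention shown in Example #6, and labels_dict=None is an illustrative choice; none of this is part of the original example.

from sklearn.ensemble import RandomForestClassifier
from rep.estimators import SklearnClassifier

# Hypothetical driver: build a report as in Example #2, then run the mask report.
X, y, sample_weight = generate_classification_data()  # assumed available, as in Example #2
clf = SklearnClassifier(RandomForestClassifier())
clf.fit(X, y, sample_weight=sample_weight)
report = clf.test_on(X, y, sample_weight=sample_weight)
# labels_dict=None selects the binary branch (feature_importance_shuffling, roc).
_classification_mask_report(report, mask='column0 > 0', X=X, labels_dict=None)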
Example #2
def test_own_classification_reports():
    """
    testing clf.test_on
    """
    X, y, sample_weight = generate_classification_data()
    clf = SklearnClassifier(RandomForestClassifier())
    clf.fit(X, y, sample_weight=sample_weight)
    report = clf.test_on(X, y, sample_weight=sample_weight)
    roc1 = report.compute_metric(RocAuc())

    lds = LabeledDataStorage(X, y, sample_weight=sample_weight)
    roc2 = clf.test_on_lds(lds=lds).compute_metric(RocAuc())
    assert roc1 == roc2, 'Something wrong with test_on'
Example #3
def test_gridsearch_sklearn():
    metric = numpy.random.choice([OptimalAMS(), RocAuc(), LogLoss()])
    scorer = ClassificationFoldingScorer(metric)
    maximization = True
    if isinstance(metric, LogLoss):
        maximization = False
    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]
    })
    generator = RegressionParameterOptimizer(grid_param,
                                             n_evaluations=4,
                                             maximize=maximization)

    grid = GridOptimalSearchCV(SklearnClassifier(clf=AdaBoostClassifier()),
                               generator,
                               scorer,
                               parallel_profile='threads-3')

    _ = check_grid(grid, False, False, False, use_weights=True)
    classifier = check_grid(grid, False, False, False, use_weights=False)

    # Check parameters of best fitted classifier
    assert 2 <= len(classifier.features) <= 3, 'Features were not set'
    params = classifier.get_params()
    for key in grid_param:
        if key in params:
            assert params[key] == grid.generator.best_params_[key]
        else:
            assert params['clf__' + key] == grid.generator.best_params_[key]
Example #4
def test_gridsearch_metrics_threads(n_threads=3):
    X, y, sample_weight = generate_classification_data(n_classes=2,
                                                       distance=0.7)
    param_grid = OrderedDict({'reg_param': numpy.linspace(0, 1, 20)})

    from itertools import cycle

    optimizers = cycle([
        RegressionParameterOptimizer(param_grid=param_grid,
                                     n_evaluations=4,
                                     start_evaluations=2),
        SubgridParameterOptimizer(param_grid=param_grid, n_evaluations=4),
        RandomParameterOptimizer(param_grid=param_grid, n_evaluations=4),
    ])

    for metric in [RocAuc(), OptimalAMS(), OptimalSignificance(), log_loss]:
        scorer = FoldingScorer(metric)
        clf = SklearnClassifier(QDA())
        grid = GridOptimalSearchCV(
            estimator=clf,
            params_generator=next(optimizers),
            scorer=scorer,
            parallel_profile='threads-{}'.format(n_threads))
        grid.fit(X, y)
        print(grid.params_generator.best_score_)
        print(grid.params_generator.best_params_)
        grid.params_generator.print_results()
Example #5
def check_report_with_mask(report, mask, X):
    report.roc(mask=mask).plot()
    report.prediction_pdf(mask=mask).plot()
    report.features_pdf(mask=mask).plot()
    report.efficiencies(list(X.columns), mask=mask).plot()
    report.features_correlation_matrix(mask=mask).plot()
    report.feature_importance().plot()
    report.scatter([(X.columns[0], X.columns[2])], mask=mask).plot()
    report.learning_curve(RocAuc(), mask=mask).plot()
    report.metrics_vs_cut(significance, mask=mask).plot()
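Example #5 relies on a module-level significance function that is not shown here; one plausible definition, mirroring the lambda used in Example #1, is:

import numpy

def significance(s, b):
    # Same figure of merit as the lambda in Example #1; the 0.01 guards against division by zero.
    return s / (numpy.sqrt(b) + 0.01)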
Example #6
def check_classification_learning_curve_masks(report, n_classes):
    """testing predict_only_mask option"""
    for mask in [None, "column0 > 0", "column0 > -0.5"]:
        if n_classes == 2:
            loss = RocAuc()
        else:
            loss = log_loss
        lc1 = report.learning_curve(loss,
                                    mask=mask,
                                    predict_only_masked=True,
                                    steps=3)
        lc2 = report.learning_curve(loss,
                                    mask=mask,
                                    predict_only_masked=False,
                                    steps=3)
        assert lc1.functions == lc2.functions
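A hedged setup for this check, reusing the building blocks from Examples #2 and #4; the classifier choice and the n_classes value are assumptions made for illustration only.

# Hypothetical setup: build a report as in Example #2, then run the learning-curve mask check.
n_classes = 2
X, y, sample_weight = generate_classification_data(n_classes=n_classes)  # n_classes kwarg as in Example #4
clf = SklearnClassifier(RandomForestClassifier())
clf.fit(X, y, sample_weight=sample_weight)
report = clf.test_on(X, y, sample_weight=sample_weight)
check_classification_learning_curve_masks(report, n_classes=n_classes)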
Example #7
def test_gridsearch_threads(n_threads=3):
    scorer = FoldingScorer(numpy.random.choice([OptimalAMS(), RocAuc()]))

    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]
    })
    generator = RegressionParameterOptimizer(grid_param, n_evaluations=4)

    base = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(base,
                               generator,
                               scorer,
                               parallel_profile='threads-{}'.format(n_threads))

    X, y, sample_weight = generate_classification_data()
    grid.fit(X, y, sample_weight=sample_weight)
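After the fit, the search bookkeeping could be inspected at the end of the test in the same way Example #4 does; the attribute names below are taken from that example, not from this test.

# Inspect the optimizer's results (same attributes as printed in Example #4).
print(grid.params_generator.best_score_)
print(grid.params_generator.best_params_)
grid.params_generator.print_results()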
Example #8
def test_gridsearch_on_tmva():
    metric = numpy.random.choice([OptimalAMS(), RocAuc()])
    scorer = FoldingScorer(metric)

    grid_param = OrderedDict({"MaxDepth": [4, 5], "NTrees": [10, 20]})
    generator = SubgridParameterOptimizer(grid_param)

    try:
        from rep.estimators import TMVAClassifier

        grid = GridOptimalSearchCV(
            TMVAClassifier(features=['column0', 'column1']), generator, scorer)
        classifier = check_grid(grid, False, False, False)
        # checking parameters
        assert len(classifier.features) == 2
        params = classifier.get_params()
        for key in grid_param:
            assert params[key] == grid.generator.best_params_[key]
    except ImportError:
        pass
Example #9
def test_gridsearch_on_tmva():
    metric = numpy.random.choice([OptimalAMS(), RocAuc()])
    scorer = FoldingScorer(metric)

    grid_param = OrderedDict({"MaxDepth": [4, 5], "NTrees": [10, 20]})
    generator = SubgridParameterOptimizer(n_evaluations=5,
                                          param_grid=grid_param)

    try:
        from rep.estimators import TMVAClassifier

        base_tmva = TMVAClassifier(
            factory_options="Silent=True:V=False:DrawProgressBar=False",
            features=['column0', 'column1'],
            method='kBDT')
        grid = GridOptimalSearchCV(base_tmva, generator, scorer)
        classifier = check_grid(grid, False, False, False)
        # checking parameters
        assert len(classifier.features) == 2
        params = classifier.get_params()
        for key in grid_param:
            assert params[key] == grid.generator.best_params_[key]
    except ImportError:
        pass
Example #10
def clf_mayou(data1, data2, n_folds=3, n_base_clf=5):
    """DEVELOPEMENT, WIP. Test a setup of clf involving bagging and stacking."""
    # import raredecay.analysis.ml_analysis as ml_ana
    # import pandas as pd
    import copy

    from rep.estimators import SklearnClassifier, XGBoostClassifier
    from rep.metaml.folding import FoldingClassifier
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.ensemble import BaggingClassifier  # , VotingClassifier, AdaBoostClassifier
    from rep.estimators.theanets import TheanetsClassifier
    from sklearn.linear_model import LogisticRegression
    from rep.metaml.cache import CacheClassifier

    from rep.report.metrics import RocAuc

    import rep.metaml.cache
    from rep.metaml._cache import CacheHelper
    rep.metaml.cache.cache_helper = CacheHelper('/home/mayou/cache', 100000)

    #    data1.make_folds(n_folds)
    #    data2.make_folds(n_folds)
    output = {}

    # for i in range(n_folds):
    xgb_clf = XGBoostClassifier(n_estimators=350,
                                eta=0.1,
                                max_depth=4,
                                nthreads=3)
    xgb_folded = FoldingClassifier(base_estimator=xgb_clf,
                                   stratified=True,
                                   parallel_profile='threads-2')
    xgb_bagged = BaggingClassifier(base_estimator=xgb_folded,
                                   n_estimators=n_base_clf,
                                   bootstrap=False)
    xgb_bagged = SklearnClassifier(xgb_bagged)
    xgb_big_stacker = copy.deepcopy(xgb_bagged)
    xgb_bagged = CacheClassifier(name='xgb_bagged1', clf=xgb_bagged)

    xgb_single = XGBoostClassifier(n_estimators=350,
                                   eta=0.1,
                                   max_depth=4,
                                   nthreads=3)
    xgb_single = FoldingClassifier(base_estimator=xgb_single,
                                   stratified=True,
                                   n_folds=10,
                                   parallel_profile='threads-2')
    xgb_single = CacheClassifier(name='xgb_singled1', clf=xgb_single)

    rdf_clf = SklearnClassifier(
        RandomForestClassifier(n_estimators=300, n_jobs=3))
    rdf_folded = FoldingClassifier(base_estimator=rdf_clf,
                                   stratified=True,
                                   parallel_profile='threads-2')
    rdf_bagged = BaggingClassifier(base_estimator=rdf_folded,
                                   n_estimators=n_base_clf,
                                   bootstrap=False)
    rdf_bagged = SklearnClassifier(rdf_bagged)
    rdf_bagged = CacheClassifier(name='rdf_bagged1', clf=rdf_bagged)

    gb_clf = SklearnClassifier(GradientBoostingClassifier(n_estimators=50))
    gb_folded = FoldingClassifier(base_estimator=gb_clf,
                                  stratified=True,
                                  parallel_profile='threads-6')
    gb_bagged = BaggingClassifier(base_estimator=gb_folded,
                                  n_estimators=n_base_clf,
                                  bootstrap=False,
                                  n_jobs=5)
    gb_bagged = SklearnClassifier(gb_bagged)
    gb_bagged = CacheClassifier(name='gb_bagged1', clf=gb_bagged)

    nn_clf = TheanetsClassifier(layers=[300, 300],
                                hidden_dropout=0.03,
                                trainers=[{
                                    'optimize': 'adagrad',
                                    'patience': 5,
                                    'learning_rate': 0.2,
                                    'min_improvement': 0.1,
                                    'momentum': 0.4,
                                    'nesterov': True,
                                    'loss': 'xe'
                                }])
    nn_folded = FoldingClassifier(base_estimator=nn_clf,
                                  stratified=True,
                                  parallel_profile=None)  # 'threads-6')
    nn_bagged = BaggingClassifier(base_estimator=nn_folded,
                                  n_estimators=n_base_clf,
                                  bootstrap=False,
                                  n_jobs=1)
    nn_bagged = CacheClassifier(name='nn_bagged1', clf=nn_bagged)

    nn_single_clf = TheanetsClassifier(layers=[300, 300, 300],
                                       hidden_dropout=0.03,
                                       trainers=[{
                                           'optimize': 'adagrad',
                                           'patience': 5,
                                           'learning_rate': 0.2,
                                           'min_improvement': 0.1,
                                           'momentum': 0.4,
                                           'nesterov': True,
                                           'loss': 'xe'
                                       }])
    nn_single = FoldingClassifier(base_estimator=nn_single_clf,
                                  n_folds=3,
                                  stratified=True)
    nn_single = CacheClassifier(name='nn_single1', clf=nn_single)

    logit_stacker = SklearnClassifier(
        LogisticRegression(penalty='l2', solver='sag'))
    logit_stacker = FoldingClassifier(base_estimator=logit_stacker,
                                      n_folds=n_folds,
                                      stratified=True,
                                      parallel_profile='threads-6')
    logit_stacker = CacheClassifier(name='logit_stacker1', clf=logit_stacker)

    xgb_stacker = XGBoostClassifier(n_estimators=400,
                                    eta=0.1,
                                    max_depth=4,
                                    nthreads=8)
    # HACK
    xgb_stacker = xgb_big_stacker
    xgb_stacker = FoldingClassifier(base_estimator=xgb_stacker,
                                    n_folds=n_folds,
                                    random_state=42,
                                    stratified=True,
                                    parallel_profile='threads-6')
    xgb_stacker = CacheClassifier(name='xgb_stacker1', clf=xgb_stacker)

    #        train1, test1 = data1.get_fold(i)
    #        train2, test2 = data1.get_fold(i)
    #
    #        t_data, t_targets, t_weights =
    data, targets, weights = data1.make_dataset(data2, weights_ratio=1)

    #    xgb_bagged.fit(data, targets, weights)
    #    xgb_report = xgb_bagged.test_on(data, targets, weights)
    #    xgb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC xgb_base classifier")
    #    output['xgb_base'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    #    xgb_proba = xgb_report.prediction['clf'][:, 1]
    #    del xgb_bagged, xgb_folded, xgb_clf, xgb_report
    #
    #    xgb_single.fit(data, targets, weights)
    #    xgb_report = xgb_single.test_on(data, targets, weights)
    #    xgb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC xgb_single classifier")
    #    output['xgb_single'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    #    xgb_proba = xgb_report.prediction['clf'][:, 1]
    #    del xgb_single, xgb_report

    nn_single.fit(data, targets, weights)
    nn_report = nn_single.test_on(data, targets, weights)
    nn_report.roc(physics_notion=True).plot(
        new_plot=True, title="ROC AUC nn_single classifier")
    output['nn_single'] = "roc auc:" + str(
        nn_report.compute_metric(metric=RocAuc()))
    # nn_proba = nn_report.prediction['clf'][:, 1]
    del nn_single, nn_report

    #    rdf_bagged.fit(data, targets, weights)
    #    rdf_report = rdf_bagged.test_on(data, targets, weights)
    #    rdf_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC rdf_base classifier")
    #    output['rdf_base'] = "roc auc:" + str(rdf_report.compute_metric(metric=RocAuc()))
    #    rdf_proba = rdf_report.prediction['clf'][:, 1]
    #    del rdf_bagged, rdf_clf, rdf_folded, rdf_report

    #    gb_bagged.fit(data, targets, weights)
    #    gb_report = gb_bagged.test_on(data, targets, weights)
    #    gb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC gb_base classifier")
    #    output['gb_base'] = "roc auc:" + str(gb_report.compute_metric(metric=RocAuc()))
    #    gb_proba = gb_report.prediction['clf'][:, 1]
    #    del gb_bagged, gb_clf, gb_folded, gb_report

    #    nn_bagged.fit(data, targets, weights)
    #    nn_report = nn_bagged.test_on(data, targets, weights)
    #    nn_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC nn_base classifier")
    #    output['nn_base'] = "roc auc:" + str(nn_report.compute_metric(metric=RocAuc()))
    #    nn_proba = nn_report.prediction['clf'][:, 1]
    #    del nn_bagged, nn_clf, nn_folded, nn_report
    #
    #    base_predict = pd.DataFrame({'xgb': xgb_proba,
    #                                 #'rdf': rdf_proba,
    #                                 #'gb': gb_proba,
    #                                 'nn': nn_proba
    #                                 })
    #
    #
    #    xgb_stacker.fit(base_predict, targets, weights)
    #    xgb_report = xgb_stacker.test_on(base_predict, targets, weights)
    #    xgb_report.roc(physics_notion=True).plot(new_plot=True,
    #    title="ROC AUC xgb_stacked classifier")
    #    output['stacker_xgb'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    #    del xgb_stacker, xgb_report
    #
    #    logit_stacker.fit(base_predict, targets, weights)
    #    logit_report = logit_stacker.test_on(base_predict, targets, weights)
    #    logit_report.roc(physics_notion=True).plot(new_plot=True,
    #    title="ROC AUC logit_stacked classifier")
    #    output['stacker_logit'] = "roc auc:" + str(logit_report.compute_metric(metric=RocAuc()))
    #    del logit_stacker, logit_report

    print(output)
Example #11
data = data.drop('g', axis=1)
import numpy
import numexpr
import pandas
from rep import utils
from sklearn.ensemble import GradientBoostingClassifier
from rep.report.metrics import RocAuc
from rep.metaml import GridOptimalSearchCV, FoldingScorer, RandomParameterOptimizer
from rep.estimators import SklearnClassifier, TMVAClassifier, XGBoostRegressor
# define grid parameters
grid_param = {}
grid_param['learning_rate'] = [0.2, 0.1, 0.05, 0.02, 0.01]
grid_param['max_depth'] = [2, 3, 4, 5]
# use random hyperparameter optimization algorithm
generator = RandomParameterOptimizer(grid_param)
# define folding scorer
scorer = FoldingScorer(RocAuc(), folds=3, fold_checks=3)
estimator = SklearnClassifier(GradientBoostingClassifier(n_estimators=30))
#grid_finder = GridOptimalSearchCV(estimator, generator, scorer)
#% time grid_finder.fit(data, labels)
grid_finder = GridOptimalSearchCV(estimator,
                                  generator,
                                  scorer,
                                  parallel_profile="default")
print "start grid search"
grid_finder.fit(data, labels)

grid_finder.params_generator.print_results()

assert 10 == grid_finder.params_generator.n_evaluations, "oops"
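As a hedged follow-up, the parameters found above could be pushed back into the wrapped estimator and refit on the full data. The 'clf__' prefix follows the wrapper-parameter convention checked in Example #3; this step is an assumption, not part of the original snippet.

# Hypothetical refit with the best parameters found by the random search.
best_params = grid_finder.params_generator.best_params_
estimator.set_params(**{'clf__' + key: value for key, value in best_params.items()})
estimator.fit(data, labels)
print(estimator.test_on(data, labels).compute_metric(RocAuc()))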