Пример #1
0
def test_gridsearch_metrics_threads(n_threads=3):
    """Fit one grid search per metric, rotating through three optimizers.

    Every metric is wrapped in a ``FoldingScorer`` and combined with the next
    optimizer taken from an endless rotation.  Each search is fitted with the
    parallel profile ``'threads-<n_threads>'`` and its best score, best
    parameters and result table are printed.

    :param n_threads: number substituted into the parallel profile name.
    """
    from itertools import cycle

    X, y, _ = generate_classification_data(n_classes=2, distance=0.7)
    param_grid = OrderedDict({'reg_param': numpy.linspace(0, 1, 20)})
    profile = 'threads-{}'.format(n_threads)

    candidates = [
        RegressionParameterOptimizer(param_grid=param_grid,
                                     n_evaluations=4,
                                     start_evaluations=2),
        SubgridParameterOptimizer(param_grid=param_grid, n_evaluations=4),
        RandomParameterOptimizer(param_grid=param_grid, n_evaluations=4),
    ]
    rotation = cycle(candidates)
    metrics = [RocAuc(), OptimalAMS(), OptimalSignificance(), log_loss]

    # Four metrics share three optimizers, so the rotation hands the first
    # optimizer out a second time for the last metric.
    for metric in metrics:
        scorer = FoldingScorer(metric)
        clf = SklearnClassifier(QDA())
        grid = GridOptimalSearchCV(estimator=clf,
                                   params_generator=next(rotation),
                                   scorer=scorer,
                                   parallel_profile=profile)
        grid.fit(X, y)

        used_generator = grid.params_generator
        print(used_generator.best_score_)
        print(used_generator.best_params_)
        used_generator.print_results()
Пример #2
0
def test_gridsearch_threads(n_threads=3):
    """Fit a weighted AdaBoost grid search with a ``'threads-<n>'`` profile.

    The scoring metric is drawn at random from OptimalAMS and RocAuc; the test
    only checks that ``fit`` completes with sample weights supplied.

    :param n_threads: number substituted into the parallel profile name.
    """
    scorer = FoldingScorer(numpy.random.choice([OptimalAMS(), RocAuc()]))

    # Built from pairs: a dict literal only keeps insertion order on
    # Python 3.7+, so OrderedDict({...}) could scramble the grid order on
    # older interpreters.
    grid_param = OrderedDict([
        ("n_estimators", [10, 20]),
        ("learning_rate", [0.1, 0.05]),
        ('features', [['column0', 'column1'],
                      ['column0', 'column1', 'column2']]),
    ])
    generator = RegressionParameterOptimizer(grid_param, n_evaluations=4)

    base = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(base, generator, scorer,
                               parallel_profile='threads-{}'.format(n_threads))

    X, y, sample_weight = generate_classification_data()
    grid.fit(X, y, sample_weight=sample_weight)
Пример #3
0
def test_gridsearch_sklearn():
    """Grid-search AdaBoost via ``SklearnClassifier`` and verify the winner.

    A metric is drawn at random from OptimalAMS, RocAuc and LogLoss, and the
    optimizer is set to minimize only when LogLoss was drawn.  ``check_grid``
    is run with and without weights; the estimator returned by the unweighted
    run must hold the parameters the generator reports as best.
    """
    metric = numpy.random.choice([OptimalAMS(), RocAuc(), LogLoss()])
    scorer = ClassificationFoldingScorer(metric)
    # LogLoss is the only candidate for which the search minimizes.
    maximization = not isinstance(metric, LogLoss)

    # Built from pairs: a dict literal only keeps insertion order on
    # Python 3.7+, so OrderedDict({...}) could scramble the grid order on
    # older interpreters.
    grid_param = OrderedDict([
        ("n_estimators", [10, 20]),
        ("learning_rate", [0.1, 0.05]),
        ('features', [['column0', 'column1'],
                      ['column0', 'column1', 'column2']]),
    ])
    generator = RegressionParameterOptimizer(grid_param,
                                             n_evaluations=4,
                                             maximize=maximization)

    grid = GridOptimalSearchCV(SklearnClassifier(clf=AdaBoostClassifier()),
                               generator,
                               scorer,
                               parallel_profile='threads-3')

    # The weighted run is exercised only for its side effects.
    check_grid(grid, False, False, False, use_weights=True)
    classifier = check_grid(grid, False, False, False, use_weights=False)

    # Check parameters of best fitted classifier
    assert 2 <= len(classifier.features) <= 3, 'Features were not set'
    params = classifier.get_params()
    best_params = grid.generator.best_params_
    for key in grid_param:
        # Keys missing from the wrapper's own parameters are looked up under
        # the 'clf__' prefix instead.
        actual = params[key] if key in params else params['clf__' + key]
        assert actual == best_params[key]
Пример #4
0
def test_gridsearch_sklearn_regression():
    """Grid-search AdaBoostRegressor and verify the returned regressor.

    Scoring uses ``mean_squared_error`` inside a ``RegressionFoldingScorer``.
    ``check_grid`` is run with and without weights (``classification=False``);
    the regressor returned by the unweighted run must hold the parameters the
    generator reports as best.  No ``parallel_profile`` is passed here.
    """
    scorer = RegressionFoldingScorer(mean_squared_error)

    # Built from pairs: a dict literal only keeps insertion order on
    # Python 3.7+, so OrderedDict({...}) could scramble the grid order on
    # older interpreters.
    grid_param = OrderedDict([
        ("n_estimators", [10, 20]),
        ("learning_rate", [0.1, 0.05]),
        ('features', [['column0', 'column1'],
                      ['column0', 'column1', 'column2']]),
    ])
    generator = RegressionParameterOptimizer(grid_param, n_evaluations=4)

    grid = GridOptimalSearchCV(SklearnRegressor(clf=AdaBoostRegressor()),
                               generator, scorer)

    # The weighted run is exercised only for its side effects.
    check_grid(grid, False, False, False,
               use_weights=True, classification=False)
    regressor = check_grid(grid, False, False, False,
                           use_weights=False, classification=False)

    # Check parameters of best fitted regressor
    assert 2 <= len(regressor.features) <= 3, 'Features were not set'
    params = regressor.get_params()
    best_params = grid.generator.best_params_
    for key in grid_param:
        # Keys missing from the wrapper's own parameters are looked up under
        # the 'clf__' prefix instead.
        actual = params[key] if key in params else params['clf__' + key]
        assert actual == best_params[key]
Пример #5
0
def test_gridsearch_threads(n_threads=3):
    """Fit a weighted AdaBoost grid search with a ``'threads-<n>'`` profile.

    The scoring metric is drawn at random from OptimalAMS and RocAuc; the test
    only checks that ``fit`` completes with sample weights supplied.

    :param n_threads: number substituted into the parallel profile name.
    """
    metric = numpy.random.choice([OptimalAMS(), RocAuc()])
    scorer = FoldingScorer(metric)

    # Built from pairs: a dict literal only keeps insertion order on
    # Python 3.7+, so OrderedDict({...}) could scramble the grid order on
    # older interpreters.
    grid_param = OrderedDict([
        ("n_estimators", [10, 20]),
        ("learning_rate", [0.1, 0.05]),
        ('features', [['column0', 'column1'],
                      ['column0', 'column1', 'column2']]),
    ])
    generator = RegressionParameterOptimizer(grid_param, n_evaluations=4)

    base = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(base,
                               generator,
                               scorer,
                               parallel_profile='threads-{}'.format(n_threads))

    X, y, sample_weight = generate_classification_data()
    grid.fit(X, y, sample_weight=sample_weight)
Пример #6
0
def test_gridsearch_metrics_threads(n_threads=3):
    """Run a grid search for each metric, cycling over three optimizers.

    Each metric is wrapped in a ``FoldingScorer``; the optimizer for a given
    search is the next one from an endless cycle.  Every search is fitted with
    the parallel profile ``'threads-<n_threads>'`` and then prints its best
    score, best parameters and result table.

    :param n_threads: number substituted into the parallel profile name.
    """
    from itertools import cycle

    X, y, _ = generate_classification_data(n_classes=2, distance=0.7)
    param_grid = OrderedDict({'reg_param': numpy.linspace(0, 1, 20)})
    parallel_profile = 'threads-{}'.format(n_threads)

    regression_opt = RegressionParameterOptimizer(param_grid=param_grid,
                                                  n_evaluations=4,
                                                  start_evaluations=2)
    subgrid_opt = SubgridParameterOptimizer(param_grid=param_grid,
                                            n_evaluations=4)
    random_opt = RandomParameterOptimizer(param_grid=param_grid,
                                          n_evaluations=4)
    optimizers = cycle([regression_opt, subgrid_opt, random_opt])

    # Four metrics, three optimizers: the cycle wraps around, so the last
    # metric is searched with the first optimizer again.
    for metric in (RocAuc(), OptimalAMS(), OptimalSignificance(), log_loss):
        scorer = FoldingScorer(metric)
        clf = SklearnClassifier(QuadraticDiscriminantAnalysis())
        grid = GridOptimalSearchCV(
            estimator=clf,
            params_generator=next(optimizers),
            scorer=scorer,
            parallel_profile=parallel_profile,
        )
        grid.fit(X, y)

        finished = grid.params_generator
        print(finished.best_score_)
        print(finished.best_params_)
        finished.print_results()
Пример #7
0
def grid_tmva(score_function):
    """Grid-search a ``TMVAClassifier`` and verify the returned estimator.

    :param score_function: metric wrapped in a ``FoldingScorer`` for scoring.
    """
    # Built from pairs: a dict literal only keeps insertion order on
    # Python 3.7+, so OrderedDict({...}) could scramble the grid order on
    # older interpreters.
    grid_param = OrderedDict([("MaxDepth", [4, 5]), ("NTrees", [10, 20])])

    generator = SubgridParameterOptimizer(grid_param)
    scorer = FoldingScorer(score_function)
    from rep.estimators import TMVAClassifier

    grid = GridOptimalSearchCV(TMVAClassifier(features=['column0', 'column1']),
                               generator, scorer)

    cl = check_grid(grid, False, False, False)
    assert 1 <= len(cl.features) <= 3
    params = cl.get_params()
    best_params = grid.generator.best_params_
    # Every searched parameter must match what the generator reports as best.
    for key in grid_param:
        assert params[key] == best_params[key]
Пример #8
0
def grid_custom(custom):
    """Grid-search AdaBoost with a caller-supplied scorer and verify the result.

    :param custom: scorer object passed straight to ``GridOptimalSearchCV``.
    """
    # Built from pairs: a dict literal only keeps insertion order on
    # Python 3.7+, so OrderedDict({...}) could scramble the grid order on
    # older interpreters.
    grid_param = OrderedDict([
        ("n_estimators", [10, 20]),
        ("learning_rate", [0.1, 0.05]),
        ('features', [['column0', 'column1'],
                      ['column0', 'column1', 'column2']]),
    ])
    generator = SubgridParameterOptimizer(grid_param)

    base = SklearnClassifier(clf=AdaBoostClassifier(),
                             features=['column0', 'column1'])
    grid = GridOptimalSearchCV(base, generator, custom)

    cl = check_grid(grid, False, False, False)
    assert 1 <= len(cl.features) <= 3
    params = cl.get_params()
    best_params = grid.generator.best_params_
    for key in grid_param:
        # Keys missing from the wrapper's own parameters are looked up under
        # the 'clf__' prefix instead.
        actual = params[key] if key in params else params['clf__' + key]
        assert actual == best_params[key]
Пример #9
0
def grid_sklearn(score_function):
    """Grid-search AdaBoost via ``SklearnClassifier`` and verify the result.

    :param score_function: metric wrapped in a ``FoldingScorer`` for scoring.
    """
    # Built from pairs: a dict literal only keeps insertion order on
    # Python 3.7+, so OrderedDict({...}) could scramble the grid order on
    # older interpreters.
    grid_param = OrderedDict([
        ("n_estimators", [10, 20]),
        ("learning_rate", [0.1, 0.05]),
        ('features', [['column0', 'column1'],
                      ['column0', 'column1', 'column2']]),
    ])
    generator = RegressionParameterOptimizer(grid_param)
    scorer = FoldingScorer(score_function)

    grid = GridOptimalSearchCV(SklearnClassifier(clf=AdaBoostClassifier()),
                               generator, scorer)

    cl = check_grid(grid, False, False, False)
    assert 1 <= len(cl.features) <= 3
    params = cl.get_params()
    best_params = grid.generator.best_params_
    for key in grid_param:
        # Keys missing from the wrapper's own parameters are looked up under
        # the 'clf__' prefix instead.
        actual = params[key] if key in params else params['clf__' + key]
        assert actual == best_params[key]
Пример #10
0
def test_gridsearch_on_tmva():
    """Grid-search a ``TMVAClassifier``; do nothing if it cannot be imported.

    The metric is drawn at random from OptimalAMS and RocAuc.  When the import
    succeeds, the estimator returned by ``check_grid`` must keep its two
    features and hold the parameters the generator reports as best.
    """
    metric = numpy.random.choice([OptimalAMS(), RocAuc()])
    scorer = FoldingScorer(metric)

    # Built from pairs: a dict literal only keeps insertion order on
    # Python 3.7+, so OrderedDict({...}) could scramble the grid order on
    # older interpreters.
    grid_param = OrderedDict([("MaxDepth", [4, 5]), ("NTrees", [10, 20])])
    generator = SubgridParameterOptimizer(grid_param)

    # Only the import is guarded: an ImportError raised later, while fitting
    # or checking, is a real failure and must not be silently swallowed.
    try:
        from rep.estimators import TMVAClassifier
    except ImportError:
        return

    grid = GridOptimalSearchCV(
        TMVAClassifier(features=['column0', 'column1']), generator, scorer)
    classifier = check_grid(grid, False, False, False)
    # checking parameters
    assert len(classifier.features) == 2
    params = classifier.get_params()
    best_params = grid.generator.best_params_
    for key in grid_param:
        assert params[key] == best_params[key]
Пример #11
0
def test_grid_with_custom_scorer():
    """
    Introduces a special scorer which always uses all data passed to
    gridsearch.fit as training and tests on another fixed dataset (which was
    passed to the scorer) by computing roc_auc_score from sklearn.
    """
    class CustomScorer(object):
        """Scores a parameter set against the fixed dataset given at creation."""

        def __init__(self, testX, testY):
            self.testY = testY
            self.testX = testX

        def __call__(self, base_estimator, params, X, y, sample_weight=None):
            # Work on a clone so the estimator handed in is never fitted.
            cl = clone(base_estimator)
            cl.set_params(**params)
            if sample_weight is not None:
                cl.fit(X, y, sample_weight)
            else:
                cl.fit(X, y)
            # Column 1 of predict_proba is used as the score for roc_auc_score.
            return roc_auc_score(self.testY,
                                 cl.predict_proba(self.testX)[:, 1])

    X, y, _ = generate_classification_data()
    custom_scorer = CustomScorer(X, y)

    # Built from pairs: a dict literal only keeps insertion order on
    # Python 3.7+, so OrderedDict({...}) could scramble the grid order on
    # older interpreters.
    grid_param = OrderedDict([
        ("n_estimators", [10, 20]),
        ("learning_rate", [0.1, 0.05]),
        ('features', [['column0', 'column1'],
                      ['column0', 'column1', 'column2']]),
    ])
    generator = SubgridParameterOptimizer(grid_param)

    base_estimator = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(base_estimator, generator, custom_scorer)

    cl = check_grid(grid, False, False, False)
    assert len(cl.features) <= 3
    params = cl.get_params()
    best_params = grid.generator.best_params_
    for key in grid_param:
        # Keys missing from the wrapper's own parameters are looked up under
        # the 'clf__' prefix instead.
        actual = params[key] if key in params else params['clf__' + key]
        assert actual == best_params[key]
Пример #12
0
def test_gridsearch_on_tmva():
    """Grid-search a kBDT ``TMVAClassifier``; do nothing if it cannot be imported.

    The metric is drawn at random from OptimalAMS and RocAuc and the subgrid
    optimizer is limited to five evaluations.  When the import succeeds, the
    estimator returned by ``check_grid`` must keep its two features and hold
    the parameters the generator reports as best.
    """
    metric = numpy.random.choice([OptimalAMS(), RocAuc()])
    scorer = FoldingScorer(metric)

    # Built from pairs: a dict literal only keeps insertion order on
    # Python 3.7+, so OrderedDict({...}) could scramble the grid order on
    # older interpreters.
    grid_param = OrderedDict([("MaxDepth", [4, 5]), ("NTrees", [10, 20])])
    generator = SubgridParameterOptimizer(n_evaluations=5,
                                          param_grid=grid_param)

    # Only the import is guarded: an ImportError raised later, while fitting
    # or checking, is a real failure and must not be silently swallowed.
    try:
        from rep.estimators import TMVAClassifier
    except ImportError:
        return

    base_tmva = TMVAClassifier(
        factory_options="Silent=True:V=False:DrawProgressBar=False",
        features=['column0', 'column1'],
        method='kBDT')
    grid = GridOptimalSearchCV(base_tmva, generator, scorer)
    classifier = check_grid(grid, False, False, False)
    # checking parameters
    assert len(classifier.features) == 2
    params = classifier.get_params()
    best_params = grid.generator.best_params_
    for key in grid_param:
        assert params[key] == best_params[key]
Пример #13
0
# Script fragment: random hyperparameter search for gradient boosting on the
# magic04 toy dataset.  Imports come first so that ``pandas`` and ``numpy``
# are bound before they are used.
import numpy
import numexpr
import pandas
from rep import utils
from sklearn.ensemble import GradientBoostingClassifier
from rep.report.metrics import RocAuc
from rep.metaml import GridOptimalSearchCV, FoldingScorer, RandomParameterOptimizer
from rep.estimators import SklearnClassifier, TMVAClassifier, XGBoostRegressor

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print("Downloaded magic04.data")
# NOTE(review): ``columns`` is not defined in this fragment; presumably it is
# set up earlier in the original script -- confirm.
data = pandas.read_csv('toy_datasets/magic04.data', names=columns)
# Label is 1 where column 'g' holds the string 'g', otherwise 0.
labels = numpy.array(data['g'] == 'g', dtype=int)
data = data.drop('g', axis=1)

# define grid parameters
grid_param = {}
grid_param['learning_rate'] = [0.2, 0.1, 0.05, 0.02, 0.01]
grid_param['max_depth'] = [2, 3, 4, 5]
# use random hyperparameter optimization algorithm
generator = RandomParameterOptimizer(grid_param)
# define folding scorer
scorer = FoldingScorer(RocAuc(), folds=3, fold_checks=3)
estimator = SklearnClassifier(GradientBoostingClassifier(n_estimators=30))
grid_finder = GridOptimalSearchCV(estimator, generator, scorer, parallel_profile="default")
print("start grid search")
grid_finder.fit(data, labels)

grid_finder.params_generator.print_results()

# No n_evaluations was passed to RandomParameterOptimizer above, so 10 is
# presumably its default -- confirm against the library.
assert 10 == grid_finder.params_generator.n_evaluations, "oops"
Пример #14
0
# Script fragment: random hyperparameter search for gradient boosting.
# Imports are grouped at the top instead of sitting between statements.
import numpy
import numexpr
import pandas
from rep import utils
from sklearn.ensemble import GradientBoostingClassifier
from rep.report.metrics import RocAuc
from rep.metaml import GridOptimalSearchCV, FoldingScorer, RandomParameterOptimizer
from rep.estimators import SklearnClassifier, TMVAClassifier, XGBoostRegressor

# NOTE(review): ``data`` and ``labels`` are not defined in this fragment;
# presumably they are loaded earlier in the original script -- confirm.
data = data.drop('g', axis=1)

# define grid parameters
grid_param = {}
grid_param['learning_rate'] = [0.2, 0.1, 0.05, 0.02, 0.01]
grid_param['max_depth'] = [2, 3, 4, 5]
# use random hyperparameter optimization algorithm
generator = RandomParameterOptimizer(grid_param)
# define folding scorer
scorer = FoldingScorer(RocAuc(), folds=3, fold_checks=3)
estimator = SklearnClassifier(GradientBoostingClassifier(n_estimators=30))
grid_finder = GridOptimalSearchCV(estimator,
                                  generator,
                                  scorer,
                                  parallel_profile="default")
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print("start grid search")
grid_finder.fit(data, labels)

grid_finder.params_generator.print_results()

# No n_evaluations was passed to RandomParameterOptimizer above, so 10 is
# presumably its default -- confirm against the library.
assert 10 == grid_finder.params_generator.n_evaluations, "oops"