Exemplo n.º 1
0
def test_gridsearch_sklearn():
    """Grid-search an AdaBoost sklearn classifier and verify that the best
    parameters found by the generator end up on the fitted estimator."""
    metric = numpy.random.choice([OptimalAMS(), RocAuc(), LogLoss()])
    scorer = ClassificationFoldingScorer(metric)
    # LogLoss is the only metric here that must be minimized.
    maximization = not isinstance(metric, LogLoss)

    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]
    })
    generator = RegressionParameterOptimizer(
        grid_param, n_evaluations=4, maximize=maximization)

    estimator = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(estimator, generator, scorer,
                               parallel_profile='threads-3')

    _ = check_grid(grid, False, False, False, use_weights=True)
    classifier = check_grid(grid, False, False, False, use_weights=False)

    # Check parameters of best fitted classifier
    assert 2 <= len(classifier.features) <= 3, 'Features were not set'
    params = classifier.get_params()
    for name in grid_param:
        best_value = grid.generator.best_params_[name]
        # sklearn-wrapped estimators expose nested params with a 'clf__' prefix
        if name in params:
            assert params[name] == best_value
        else:
            assert params['clf__' + name] == best_value
Exemplo n.º 2
0
def test_gridsearch_metrics_threads(n_threads=3):
    """Grid-search a QDA classifier with several parameter optimizers and
    several metrics, running the evaluations in a thread pool.

    :param int n_threads: number of worker threads for the parallel profile.
    """
    # sample weights are never used in this test, so discard them
    X, y, _ = generate_classification_data(n_classes=2, distance=0.7)
    param_grid = OrderedDict({'reg_param': numpy.linspace(0, 1, 20)})

    from itertools import cycle

    # cycle endlessly so each metric below is paired with a (possibly
    # repeating) optimizer from this list
    optimizers = cycle([
        RegressionParameterOptimizer(param_grid=param_grid,
                                     n_evaluations=4,
                                     start_evaluations=2),
        SubgridParameterOptimizer(param_grid=param_grid, n_evaluations=4),
        RandomParameterOptimizer(param_grid=param_grid, n_evaluations=4),
    ])

    for metric in [RocAuc(), OptimalAMS(), OptimalSignificance(), log_loss]:
        scorer = FoldingScorer(metric)
        clf = SklearnClassifier(QDA())
        grid = GridOptimalSearchCV(
            estimator=clf,
            params_generator=next(optimizers),
            scorer=scorer,
            parallel_profile='threads-{}'.format(n_threads))
        grid.fit(X, y)
        print(grid.params_generator.best_score_)
        print(grid.params_generator.best_params_)
        grid.params_generator.print_results()
Exemplo n.º 3
0
def test_optimal_metric_function(size=10000):
    """Check that OptimalSignificance/OptimalAMS agree with the plain
    significance/ams functions at every scanned threshold.

    :param int size: number of random events to generate.
    """
    labels = numpy.random.randint(0, 2, size=size)
    predictions = numpy.random.random(size=[size, 2])
    # normalize rows so each event's class probabilities sum to 1
    predictions /= predictions.sum(axis=1, keepdims=True)
    sample_weight = numpy.random.random(size=size)

    for metric, optimal_metric in [(significance, OptimalSignificance()),
                                   (ams, OptimalAMS())]:
        optimal_metric.fit(None, labels, sample_weight=sample_weight)
        value = optimal_metric(labels,
                               predictions,
                               sample_weight=sample_weight)
        thresholds, values = optimal_metric.compute(
            labels, predictions, sample_weight=sample_weight)
        # the scalar metric value must be the best over all scanned thresholds
        assert numpy.max(values) == value, "maximal value doesn't coincide"

        # recompute the metric by hand at one randomly chosen threshold
        index = numpy.random.randint(0, len(thresholds))
        threshold = thresholds[index]
        # the comparison already yields a numpy bool array;
        # the original numpy.array(...) wrapper was a redundant copy
        passed = predictions[:, 1] >= threshold

        s = optimal_metric.expected_s * numpy.average(
            passed, weights=sample_weight * (labels == 1))
        b = optimal_metric.expected_b * numpy.average(
            passed, weights=sample_weight * (labels == 0))
        assert numpy.allclose(metric(s, b), values[index]), \
            'no coincidence {} {} {}'.format(type(optimal_metric), metric(s, b), values[index])
Exemplo n.º 4
0
def test_gridsearch_threads(n_threads=3):
    """Smoke-test a threaded grid search over an AdaBoost sklearn classifier.

    :param int n_threads: number of worker threads for the parallel profile.
    """
    folding_scorer = FoldingScorer(numpy.random.choice([OptimalAMS(), RocAuc()]))

    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]
    })

    grid = GridOptimalSearchCV(
        SklearnClassifier(clf=AdaBoostClassifier()),
        RegressionParameterOptimizer(grid_param, n_evaluations=4),
        folding_scorer,
        parallel_profile='threads-{}'.format(n_threads))

    X, y, sample_weight = generate_classification_data()
    grid.fit(X, y, sample_weight=sample_weight)
Exemplo n.º 5
0
def test_gridsearch_on_tmva():
    """Grid-search a TMVA classifier; silently skipped when ROOT/TMVA
    (rep.estimators.TMVAClassifier) is not installed."""
    metric = numpy.random.choice([OptimalAMS(), RocAuc()])
    scorer = FoldingScorer(metric)

    grid_param = OrderedDict({"MaxDepth": [4, 5], "NTrees": [10, 20]})
    generator = SubgridParameterOptimizer(grid_param)

    # keep the try body minimal: only the optional import can raise
    # ImportError; previously the whole test body sat inside the try
    try:
        from rep.estimators import TMVAClassifier
    except ImportError:
        # TMVA is an optional dependency — skip when unavailable
        return

    grid = GridOptimalSearchCV(
        TMVAClassifier(features=['column0', 'column1']), generator, scorer)
    classifier = check_grid(grid, False, False, False)
    # checking parameters
    assert len(classifier.features) == 2
    params = classifier.get_params()
    for key in grid_param:
        assert params[key] == grid.generator.best_params_[key]
Exemplo n.º 6
0
def test_gridsearch_on_tmva():
    """Grid-search a TMVA BDT with explicit factory options; silently skipped
    when ROOT/TMVA (rep.estimators.TMVAClassifier) is not installed."""
    metric = numpy.random.choice([OptimalAMS(), RocAuc()])
    scorer = FoldingScorer(metric)

    grid_param = OrderedDict({"MaxDepth": [4, 5], "NTrees": [10, 20]})
    generator = SubgridParameterOptimizer(n_evaluations=5,
                                          param_grid=grid_param)

    # keep the try body minimal: only the optional import can raise
    # ImportError; previously the whole test body sat inside the try
    try:
        from rep.estimators import TMVAClassifier
    except ImportError:
        # TMVA is an optional dependency — skip when unavailable
        return

    base_tmva = TMVAClassifier(
        factory_options="Silent=True:V=False:DrawProgressBar=False",
        features=['column0', 'column1'],
        method='kBDT')
    grid = GridOptimalSearchCV(base_tmva, generator, scorer)
    classifier = check_grid(grid, False, False, False)
    # checking parameters
    assert len(classifier.features) == 2
    params = classifier.get_params()
    for key in grid_param:
        assert params[key] == grid.generator.best_params_[key]