Example #1
def test_theanets_partial_fit():
    clf_complete = TheanetsClassifier(layers=[2],
                                      trainers=[{
                                          'algo': 'rmsprop',
                                          'learning_rate': 0.1
                                      }, {
                                          'algo': 'rprop',
                                          'learning_rate': 0.1
                                      }])
    clf_partial = TheanetsClassifier(layers=[2],
                                     trainers=[{
                                         'algo': 'rmsprop',
                                         'learning_rate': 0.1
                                     }])
    X, y, sample_weight = generate_classification_data()
    clf_complete.fit(X, y)
    clf_partial.fit(X, y)
    clf_partial.partial_fit(X, y, algo='rprop', learning_rate=0.1)

    assert clf_complete.trainers == clf_partial.trainers, 'trainers not saved in partial fit'

    auc_complete = roc_auc_score(y, clf_complete.predict_proba(X)[:, 1])
    auc_partial = roc_auc_score(y, clf_partial.predict_proba(X)[:, 1])

    # Known failure of theanets
    assert auc_complete == auc_partial, 'same networks return different results'
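Note: every example in this collection calls a generate_classification_data helper from REP's test utilities without showing it. As a point of reference, here is a minimal sketch of what such a helper could look like, assuming only the signature implied by the calls in these tests (optional n_classes and distance keywords; features named 'column0', 'column1', ...); it is not REP's actual implementation:

import numpy
import pandas

def generate_classification_data(n_classes=2, distance=0.7, n_samples=1000, n_features=4):
    # labels drawn uniformly; class centers shifted `distance` apart along every axis
    y = numpy.random.randint(0, n_classes, size=n_samples)
    X = numpy.random.normal(size=(n_samples, n_features)) + distance * y[:, numpy.newaxis]
    X = pandas.DataFrame(X, columns=['column%d' % i for i in range(n_features)])
    sample_weight = numpy.ones(n_samples)
    return X, y, sample_weight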
Example #2
def test_cache_classifier():
    cache_helper.clear_cache()

    for Wrapper, Model in [(CacheClassifier, LogisticRegression),
                           (CacheRegressor, LinearRegression)]:
        X, y, weights = generate_classification_data(n_classes=2)
        clf = Wrapper('first', Model()).fit(X, y)
        assert clf._used_cache == False
        clf = Wrapper('first', Model()).fit(X + 0, y + 0)
        assert clf._used_cache == True
        # changed name
        clf = Wrapper('second', Model()).fit(X, y)
        assert clf._used_cache == False
        # changed data
        X_new = X.copy()
        X_new.iloc[0, 0] += 1
        clf = Wrapper('first', Model()).fit(X_new, y)
        assert clf._used_cache == False
        # changed labels
        y_new = y.copy()
        y_new[0] += 1
        clf = Wrapper('first', Model()).fit(X, y_new)
        assert clf._used_cache == False
        # added weights
        clf = Wrapper('first', Model()).fit(X, y, sample_weight=None)
        assert clf._used_cache == False
        # changed parameters
        clf = Wrapper('first', Model(n_jobs=2)).fit(X, y)
        assert clf._used_cache == False
        # fitting previous once again. Checking that overwriting is correct.
        clf = Wrapper('first', Model(n_jobs=2)).fit(X, y)
        assert clf._used_cache == True

    cache_helper.clear_cache()
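The sequence of asserts above pins down the cache invalidation rule: a cached fit is reused only when the name, the estimator parameters, the data, the labels and the exact fit keyword arguments all match. A rough sketch of such a key (an assumption for illustration, not CacheClassifier's actual code):

import hashlib
import pickle

import numpy

def cache_key(name, model, X, y, **fit_kwargs):
    # the raw kwargs are part of the key, which is why an explicit
    # sample_weight=None above differs from passing no weights at all
    payload = pickle.dumps((name,
                            sorted(model.get_params().items()),
                            X.values.tobytes(),
                            numpy.asarray(y).tobytes(),
                            sorted(fit_kwargs.items())))
    return hashlib.md5(payload).hexdigest()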
Example #3
def very_basic_xgboost_test():
    X, y, w = generate_classification_data(n_classes=2)
    clf = XGBoostClassifier(n_estimators=10).fit(X, y)
    clf.predict(X)
    clf.predict_proba(X)
    # testing that returned features in importances are correct and in the same order
    assert numpy.all(clf.features == clf.get_feature_importances().index)
Example #4
def test_basic_xgboost():
    X, y, w = generate_classification_data(n_classes=2)
    clf = XGBoostClassifier(n_estimators=10).fit(X, y)
    clf.predict(X)
    clf.predict_proba(X)
    # testing that returned features in importances are correct and in the same order
    assert numpy.all(clf.features == clf.get_feature_importances().index)
Example #5
def test_gridsearch_metrics_threads(n_threads=3):
    X, y, sample_weight = generate_classification_data(n_classes=2,
                                                       distance=0.7)
    param_grid = OrderedDict({'reg_param': numpy.linspace(0, 1, 20)})

    from itertools import cycle

    optimizers = cycle([
        RegressionParameterOptimizer(param_grid=param_grid,
                                     n_evaluations=4,
                                     start_evaluations=2),
        SubgridParameterOptimizer(param_grid=param_grid, n_evaluations=4),
        RandomParameterOptimizer(param_grid=param_grid, n_evaluations=4),
    ])

    for metric in [RocAuc(), OptimalAMS(), OptimalSignificance(), log_loss]:
        scorer = FoldingScorer(metric)
        clf = SklearnClassifier(QDA())
        grid = GridOptimalSearchCV(
            estimator=clf,
            params_generator=next(optimizers),
            scorer=scorer,
            parallel_profile='threads-{}'.format(n_threads))
        grid.fit(X, y)
        print(grid.params_generator.best_score_)
        print(grid.params_generator.best_params_)
        grid.params_generator.print_results()
Example #6
def check_folding(classifier, check_instance=True, has_staged_pp=True, has_importances=True):
    X, y, sample_weight = generate_classification_data(distance=0.6)

    assert classifier == classifier.fit(X, y, sample_weight=sample_weight)
    assert list(classifier.features) == list(X.columns)

    check_classification_model(classifier, X, y, check_instance=check_instance, has_staged_pp=has_staged_pp,
                               has_importances=has_importances)

    def mean_vote(x):
        return numpy.mean(x, axis=0)

    labels = classifier.predict(X, mean_vote)
    proba = classifier.predict_proba(X, mean_vote)
    assert numpy.all(proba == classifier.predict_proba(X, mean_vote))

    score = accuracy_score(y, labels)
    print(score)
    assert score > 0.7
    assert numpy.allclose(proba.sum(axis=1), 1), 'probabilities do not sum to 1'
    assert numpy.all(proba >= 0.), 'negative probabilities'

    auc_score = roc_auc_score(y, proba[:, 1])
    print(auc_score)
    assert auc_score > 0.8
    if has_staged_pp:
        for p in classifier.staged_predict_proba(X, mean_vote):
            assert p.shape == (len(X), 2)
        # checking that the last staged prediction coincides with the final one
        assert numpy.all(p == proba)
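The vote function passed to predict and predict_proba above decides how per-fold predictions are combined. A simplified sketch of the assumed mechanism (not REP's actual folding code): every fold model predicts the whole dataset and the vote function reduces the stacked results along the fold axis, so mean_vote is just the average of the fold models' probabilities:

import numpy

def vote_predict_proba(fold_models, X, vote_function):
    # stacked shape: (n_folds, n_samples, n_classes)
    stacked = numpy.array([model.predict_proba(X) for model in fold_models])
    return vote_function(stacked)  # mean_vote averages over axis 0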
Example #7
def check_folding(classifier, check_instance=True, has_staged_pp=True, has_importances=True):
    X, y, sample_weight = generate_classification_data(distance=0.6)

    assert classifier == classifier.fit(X, y, sample_weight=sample_weight)
    assert list(classifier.features) == list(X.columns)

    check_classification_model(classifier, X, y, check_instance=check_instance, has_staged_pp=has_staged_pp,
                               has_importances=has_importances)

    def mean_vote(x):
        return numpy.mean(x, axis=0)

    labels = classifier.predict(X, mean_vote)
    proba = classifier.predict_proba(X, mean_vote)
    assert numpy.all(proba == classifier.predict_proba(X, mean_vote))

    score = accuracy_score(y, labels)
    print(score)
    assert score > 0.7
    assert numpy.allclose(proba.sum(axis=1), 1), 'probabilities do not sum to 1'
    assert numpy.all(proba >= 0.), 'negative probabilities'

    auc_score = roc_auc_score(y, proba[:, 1])
    print(auc_score)
    assert auc_score > 0.8
    if has_staged_pp:
        for p in classifier.staged_predict_proba(X, mean_vote):
            assert p.shape == (len(X), 2)
        # checking that the last staged prediction coincides with the final one
        assert numpy.all(p == proba)
Example #8
def test_xgboost_random_states():
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    for random_state in [
            145, None,
            check_random_state(None),
            check_random_state(145)
    ]:
        clf1 = XGBoostClassifier(n_estimators=5,
                                 max_depth=1,
                                 subsample=0.1,
                                 random_state=random_state)
        clf1.fit(X, y)
        clf2 = XGBoostClassifier(n_estimators=5,
                                 max_depth=1,
                                 subsample=0.1,
                                 random_state=random_state)
        clf2.fit(X, y)
        if isinstance(random_state, numpy.random.RandomState):
            assert not numpy.allclose(
                clf1.predict_proba(X),
                clf2.predict_proba(X)), 'seed: {}'.format(random_state)
        else:
            assert numpy.allclose(
                clf1.predict_proba(X),
                clf2.predict_proba(X)), 'seed: {}'.format(random_state)
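The two branches of the assert reflect how seeding presumably works: an integer (or the fixed default used for None) can be handed to both classifiers unchanged, whereas a numpy RandomState instance is stateful, so each fit that draws from it advances the generator and the two classifiers end up with different seeds. A sketch of that assumed logic:

import numpy

def draw_seed(random_state):
    if isinstance(random_state, numpy.random.RandomState):
        # stateful: every call advances the generator, so two fits
        # sharing one RandomState instance get different seeds
        return random_state.randint(0, 2 ** 31)
    # an int (or None, mapped to a fixed default) is reused as-is
    return random_state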
Example #9
def test_cache_classifier():
    cache_helper.clear_cache()

    for Wrapper, Model in [(CacheClassifier, LogisticRegression), (CacheRegressor, LinearRegression)]:
        X, y, weights = generate_classification_data(n_classes=2)
        clf = Wrapper('first', Model()).fit(X, y)
        assert clf._used_cache == False
        clf = Wrapper('first', Model()).fit(X + 0, y + 0)
        assert clf._used_cache == True
        # changed name
        clf = Wrapper('second', Model()).fit(X, y)
        assert clf._used_cache == False
        # changed data
        X_new = X.copy()
        X_new.iloc[0, 0] += 1
        clf = Wrapper('first', Model()).fit(X_new, y)
        assert clf._used_cache == False
        # changed labels
        y_new = y.copy()
        y_new[0] += 1
        clf = Wrapper('first', Model()).fit(X, y_new)
        assert clf._used_cache == False
        # added weights
        clf = Wrapper('first', Model()).fit(X, y, sample_weight=None)
        assert clf._used_cache == False
        # changed parameters
        clf = Wrapper('first', Model(n_jobs=2)).fit(X, y)
        assert clf._used_cache == False
        # fitting previous once again. Checking that overwriting is correct.
        clf = Wrapper('first', Model(n_jobs=2)).fit(X, y)
        assert clf._used_cache == True

    cache_helper.clear_cache()
Example #10
def test_xgboost_works_with_different_dtypes():
    dtypes = ['float32', 'float64', 'int32', 'int64', 'uint32']
    for dtype in dtypes:
        X, y, weights = generate_classification_data(n_classes=2, distance=5)
        clf = XGBoostClassifier(n_estimators=10)
        clf.fit(X.astype(dtype=dtype), y.astype(dtype=dtype), sample_weight=weights.astype(dtype))
        probabilities = clf.predict_proba(X.astype(dtype))

    # testing single pandas.DataFrame with different dtypes
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    import pandas
    X = pandas.DataFrame()
    for dtype in dtypes:
        X[dtype] = numpy.random.normal(0, 10, size=len(y)).astype(dtype)
    clf = XGBoostClassifier(n_estimators=10)
    clf.fit(X, y, sample_weight=weights)
    probabilities = clf.predict_proba(X)
Example #11
def test_theanets_reproducibility():
    clf = TheanetsClassifier(trainers=[{
        'algo': 'nag',
        'min_improvement': 0.1,
        'max_updates': 10
    }])
    X, y, _ = generate_classification_data()
    check_classification_reproducibility(clf, X, y)
Example #12
def test_pybrain_reproducibility():
    # This test fails because PyBrain can't reproduce training.
    X, y, _ = generate_classification_data()
    clf1 = PyBrainClassifier(layers=[4], epochs=2).fit(X, y)
    clf2 = PyBrainClassifier(layers=[4], epochs=2).fit(X, y)
    print(clf1.predict_proba(X) - clf2.predict_proba(X))
    assert numpy.allclose(clf1.predict_proba(X), clf2.predict_proba(X)), 'different predictions'
    check_classification_reproducibility(clf1, X, y)
Example #13
def test_nolearn_reproducibility():
    X, y, sample_weight = generate_classification_data()
    cl = NolearnClassifier()
    y_predicted_1 = cl.fit(X, y).predict(X)
    y_predicted_2 = cl.fit(X, y).predict(X)
    assert (y_predicted_1 == y_predicted_2).all(), 'fitting the classifier twice gives different predictions'
    y_predicted_3 = clone(cl).fit(X, y).predict(X)
    assert (y_predicted_1 == y_predicted_3).all(), 'cloned classifier gives different prediction'
Example #14
def test_factory():
    factory = RegressorsFactory()
    try:
        from rep.estimators.tmva import TMVARegressor
        factory.add_regressor('tmva', TMVARegressor())
    except ImportError:
        pass
    factory.add_regressor('rf', RandomForestRegressor(n_estimators=10))
    factory.add_regressor('ada', AdaBoostRegressor(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    assert factory == factory.fit(X,
                                  y,
                                  sample_weight=sample_weight,
                                  features=list(X.columns))
    values = factory.predict(X)

    for cl in factory.values():
        assert list(cl.features) == list(X.columns)

    for key, val in values.items():
        score = mean_squared_error(y, val)
        print(score)
        assert score < 0.2

    for key, iterator in factory.staged_predict(X).items():
        assert key != 'tmva', 'tmva does not support staged pp'
        for p in iterator:
            assert p.shape == (len(X), )

        # checking that the last staged prediction coincides with the final one
        assert numpy.all(p == values[key])

    # testing picklability
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)

    assert type(factory) == type(clf_loaded)

    probs1 = factory.predict(X)
    probs2 = clf_loaded.predict(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'

    report = RegressionReport({'rf': factory['rf']},
                              LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(mean_squared_mod).plot(new_plot=True,
                                                               figsize=(18, 3))
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)
    report.feature_importance()
    report.features_correlation_matrix()
    report.predictions_scatter()

    val = numpy.mean(X['column0'])
    report_mask(report, "column0 > %f" % val, X)
    report_mask(report, lambda x: numpy.array(x['column0']) < val, X)
    report_mask(report, None, X)
Example #15
def test_pybrain_reproducibility():
    # This test fails because PyBrain can't reproduce training.
    X, y, _ = generate_classification_data()
    clf1 = PyBrainClassifier(layers=[4], epochs=2).fit(X, y)
    clf2 = PyBrainClassifier(layers=[4], epochs=2).fit(X, y)
    print(clf1.predict_proba(X) - clf2.predict_proba(X))
    assert numpy.allclose(clf1.predict_proba(X),
                          clf2.predict_proba(X)), 'different predictions'
    check_classification_reproducibility(clf1, X, y)
Example #16
def test_factory():
    factory = ClassifiersFactory()
    try:
        from rep.estimators.tmva import TMVAClassifier
        factory.add_classifier('tmva', TMVAClassifier())
    except ImportError:
        pass
    factory.add_classifier('rf', RandomForestClassifier(n_estimators=10))
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    assert factory == factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns),
                                  parallel_profile='threads-4')
    for cl in factory.values():
        assert list(cl.features) == list(X.columns)
    proba = factory.predict_proba(X, parallel_profile='threads-4')
    labels = factory.predict(X, parallel_profile='threads-4')
    for key, val in labels.items():
        score = accuracy_score(y, val)
        print(key, score)
        assert score > 0.7, key

    for key, val in proba.items():
        assert numpy.allclose(val.sum(axis=1), 1), 'probabilities do not sum to 1'
        assert numpy.all(val >= 0.), 'negative probabilities'

        auc_score = roc_auc_score(y, val[:, 1])
        print(auc_score)
        assert auc_score > 0.8

    for key, iterator in factory.staged_predict_proba(X).items():
        assert key != 'tmva', 'tmva does not support staged pp'
        for p in iterator:
            assert p.shape == (len(X), 2)

        # checking that the last staged prediction coincides with the final one
        assert numpy.all(p == proba[key])

    # testing picklability
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)

    assert type(factory) == type(clf_loaded)

    probs1 = factory.predict_proba(X)
    probs2 = clf_loaded.predict_proba(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'

    report = ClassificationReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(roc_auc_score_mod).plot(new_plot=True, figsize=(18, 3))
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)
    val = numpy.mean(X['column0'])
    yield check_report_with_mask, report, "column0 > %f" % (val / 2.), X
    yield check_report_with_mask, report, lambda x: numpy.array(x['column0']) < val * 2., X
    yield check_report_with_mask, report, None, X
Example #17
def test_theanets_reproducibility():
    clf = TheanetsClassifier(trainers=[{
        'algo': 'nag',
        'min_improvement': 0.1
    }])
    X, y, _ = generate_classification_data()
    import numpy
    numpy.random.seed(43)
    check_classification_reproducibility(clf, X, y)
Example #18
def test_xgboost_works_with_different_dtypes():
    dtypes = ['float32', 'float64', 'int32', 'int64', 'uint32']
    for dtype in dtypes:
        X, y, weights = generate_classification_data(n_classes=2, distance=5)
        clf = XGBoostClassifier(n_estimators=10)
        clf.fit(X.astype(dtype=dtype),
                y.astype(dtype=dtype),
                sample_weight=weights.astype(dtype))
        probabilities = clf.predict_proba(X.astype(dtype))

    # testing single pandas.DataFrame with different dtypes
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    import pandas
    X = pandas.DataFrame()
    for dtype in dtypes:
        X[dtype] = numpy.random.normal(0, 10, size=len(y)).astype(dtype)
    clf = XGBoostClassifier(n_estimators=10)
    clf.fit(X, y, sample_weight=weights)
    probabilities = clf.predict_proba(X)
Example #19
def test_xgboost_feature_importance():
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    clf = XGBoostClassifier(n_estimators=1, max_depth=1)
    clf.fit(X, y)
    importances = clf.get_feature_importances()
    original_features = set(X.columns)
    importances_features = set(importances.index)
    print(original_features, importances_features)
    assert original_features == importances_features, 'feature_importances_ returned something wrong'

    assert len(original_features) == len(clf.feature_importances_)
Example #20
def test_pretrain():
    trainX, trainY, _ = generate_classification_data()
    trainers = [{
        'algo': 'pretrain',
        'learning_rate': 0.5,
        'patience': 1,
        'validate_every': 1
    }]
    # only checking that fitting doesn't throw errors
    # this frequently gets stuck on CI
    TheanetsClassifier(layers=[5], trainers=trainers).fit(trainX, trainY)
Example #21
def test_xgboost_random_states():
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    for random_state in [145, None, check_random_state(None), check_random_state(145)]:
        clf1 = XGBoostClassifier(n_estimators=5, max_depth=1, subsample=0.1, random_state=random_state)
        clf1.fit(X, y)
        clf2 = XGBoostClassifier(n_estimators=5, max_depth=1, subsample=0.1, random_state=random_state)
        clf2.fit(X, y)
        if isinstance(random_state, numpy.random.RandomState):
            assert not numpy.allclose(clf1.predict_proba(X), clf2.predict_proba(X)), 'seed: {}'.format(random_state)
        else:
            assert numpy.allclose(clf1.predict_proba(X), clf2.predict_proba(X)), 'seed: {}'.format(random_state)
Example #22
def test_xgboost_feature_importance():
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    clf = XGBoostClassifier(n_estimators=1, max_depth=1)
    clf.fit(X, y)
    importances = clf.get_feature_importances()
    original_features = set(X.columns)
    importances_features = set(importances.index)
    print(original_features, importances_features)
    assert original_features == importances_features, 'feature_importances_ returned something wrong'

    assert len(original_features) == len(clf.feature_importances_)
Example #23
def test_factory():
    factory = RegressorsFactory()
    try:
        from rep.estimators.tmva import TMVARegressor
        factory.add_regressor('tmva', TMVARegressor())
    except ImportError:
        pass
    factory.add_regressor('rf', RandomForestRegressor(n_estimators=10))
    factory.add_regressor('ada', AdaBoostRegressor(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    assert factory == factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns))
    values = factory.predict(X)

    for cl in factory.values():
        assert list(cl.features) == list(X.columns)

    for key, val in values.items():
        score = mean_squared_error(y, val)
        print(score)
        assert score < 0.2

    for key, iterator in factory.staged_predict(X).items():
        assert key != 'tmva', 'tmva does not support staged pp'
        for p in iterator:
            assert p.shape == (len(X), )

        # checking that the last staged prediction coincides with the final one
        assert numpy.all(p == values[key])

    # testing picklability
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)

    assert type(factory) == type(clf_loaded)

    probs1 = factory.predict(X)
    probs2 = clf_loaded.predict(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'

    report = RegressionReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(mean_squared_mod).plot(new_plot=True, figsize=(18, 3))
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)
    report.feature_importance()
    report.features_correlation_matrix()
    report.predictions_scatter()

    val = numpy.mean(X['column0'])
    report_mask(report, "column0 > %f" % val, X)
    report_mask(report, lambda x: numpy.array(x['column0']) < val, X)
    report_mask(report, None, X)
Example #24
def test_own_classification_reports():
    """
    testing clf.test_on
    """
    X, y, sample_weight = generate_classification_data()
    clf = SklearnClassifier(RandomForestClassifier())
    clf.fit(X, y, sample_weight=sample_weight)
    report = clf.test_on(X, y, sample_weight=sample_weight)
    roc1 = report.compute_metric(RocAuc())

    lds = LabeledDataStorage(X, y, sample_weight=sample_weight)
    roc2 = clf.test_on_lds(lds=lds).compute_metric(RocAuc())
    assert roc1 == roc2, 'Something wrong with test_on'
Example #25
def test_gridsearch_threads(n_threads=3):
    scorer = FoldingScorer(numpy.random.choice([OptimalAMS(), RocAuc()]))

    grid_param = OrderedDict({"n_estimators": [10, 20],
                              "learning_rate": [0.1, 0.05],
                              'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]})
    generator = RegressionParameterOptimizer(grid_param, n_evaluations=4)

    base = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(base, generator, scorer, parallel_profile='threads-{}'.format(n_threads))

    X, y, sample_weight = generate_classification_data()
    grid.fit(X, y, sample_weight=sample_weight)
Example #26
def test_own_classification_reports():
    """
    testing clf.test_on
    """
    X, y, sample_weight = generate_classification_data()
    clf = SklearnClassifier(RandomForestClassifier())
    clf.fit(X, y, sample_weight=sample_weight)
    report = clf.test_on(X, y, sample_weight=sample_weight)
    roc1 = report.compute_metric(RocAuc())

    lds = LabeledDataStorage(X, y, sample_weight=sample_weight)
    roc2 = clf.test_on_lds(lds=lds).compute_metric(RocAuc())
    assert roc1 == roc2, 'Something wrong with test_on'
Example #27
def test_theanets_reproducibility():
    clf = TheanetsClassifier()
    X, y, sample_weight = generate_classification_data()
    clf.fit(X, y)
    auc = roc_auc_score(y, clf.predict_proba(X)[:, 1])
    for i in range(2):
        clf.fit(X, y)
        curr_auc = roc_auc_score(y, clf.predict_proba(X)[:, 1])
        assert auc == curr_auc, 'running a network twice produces different results'

    cloned_clf = clone(clf)
    cloned_clf.fit(X, y)
    cloned_auc = roc_auc_score(y, cloned_clf.predict_proba(X)[:, 1])
    assert cloned_auc == auc, 'cloned network produces different result'
Example #28
def test_theanets_partial_fit():
    clf_complete = TheanetsClassifier(trainers=[{'optimize': 'rmsprop', 'patience': 0}, {'optimize': 'rprop'}])
    clf_partial = TheanetsClassifier(trainers=[{'optimize': 'rmsprop', 'patience': 0}])
    X, y, sample_weight = generate_classification_data()
    clf_complete.fit(X, y)
    clf_partial.fit(X, y)
    clf_partial.partial_fit(X, y, optimize='rprop')

    assert clf_complete.trainers == clf_partial.trainers, 'trainers not saved in partial fit'

    auc_complete = roc_auc_score(y, clf_complete.predict_proba(X)[:, 1])
    auc_partial = roc_auc_score(y, clf_partial.predict_proba(X)[:, 1])

    assert auc_complete == auc_partial, 'same networks return different results'
Example #29
def test_theanets_reproducibility():
    clf = TheanetsClassifier()
    X, y, sample_weight = generate_classification_data()
    clf.fit(X, y)
    auc = roc_auc_score(y, clf.predict_proba(X)[:, 1])
    for i in range(2):
        clf.fit(X, y)
        curr_auc = roc_auc_score(y, clf.predict_proba(X)[:, 1])
        assert auc == curr_auc, 'running a network twice produces different results'

    cloned_clf = clone(clf)
    cloned_clf.fit(X, y)
    cloned_auc = roc_auc_score(y, cloned_clf.predict_proba(X)[:, 1])
    assert cloned_auc == auc, 'cloned network produces different result'
Example #30
def test_feature_importances():
    clf = XGBoostClassifier()
    X, y, sample_weight = generate_classification_data()
    clf.fit(X, y, sample_weight=sample_weight)
    # checking feature importance (three ways)

    res_default = clf.xgboost_classifier.get_fscore()
    res2 = clf._get_fscore()
    res3 = clf.feature_importances_

    assert res_default == res2, res_default
    for i, val in enumerate(res3):
        if val > 0.0:
            assert val == res_default['f' + str(i)]
Example #31
def test_folding_regressor_functions():
    """Testing folding functions """
    data, y, sample_weight = generate_classification_data()

    for X in [data, numpy.array(data)]:
        kfolder = FoldingRegressor(SklearnRegressor(GradientBoostingRegressor(n_estimators=5)), n_folds=2)
        kfolder.fit(X, y, sample_weight=sample_weight)
        preds = kfolder.predict(X)
        for p in kfolder.staged_predict(X):
            pass
        assert numpy.allclose(p, preds)

        importances = kfolder.feature_importances_
        other_importances = kfolder.get_feature_importances()
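FoldingRegressor predicts its own training set out-of-fold: each sample is scored by the fold model that never saw it during training. A compact sketch of that scheme, assuming plain numpy arrays (a hypothetical helper, not REP's implementation):

import numpy
from sklearn.model_selection import KFold

def out_of_fold_predict(make_estimator, X, y, n_folds=2):
    predictions = numpy.zeros(len(X))
    for train_idx, test_idx in KFold(n_splits=n_folds).split(X):
        # predict each held-out chunk with the model trained on the rest
        model = make_estimator().fit(X[train_idx], y[train_idx])
        predictions[test_idx] = model.predict(X[test_idx])
    return predictions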
Example #32
def test_folding_regressor_functions():
    """Testing folding functions """
    data, y, sample_weight = generate_classification_data()

    for X in [data, numpy.array(data)]:
        kfolder = FoldingRegressor(SklearnRegressor(GradientBoostingRegressor(n_estimators=5)), n_folds=2)
        kfolder.fit(X, y, sample_weight=sample_weight)
        preds = kfolder.predict(X)
        for p in kfolder.staged_predict(X):
            pass
        assert numpy.allclose(p, preds)

        importances = kfolder.feature_importances_
        other_importances = kfolder.get_feature_importances()
Example #33
def test_feature_splitter():
    # testing splitter
    from rep.metaml import FeatureSplitter

    X, y, sample_weight = generate_classification_data(n_classes=3)
    split_column = X.columns[0]
    splitters = numpy.random.randint(0, 3, size=len(X))
    X[split_column] = splitters
    X.loc[splitters == 1, :] += 4
    X.loc[splitters == 2, :] -= 4
    fs = FeatureSplitter(base_estimator=XGBoostClassifier(features=list(X.columns[1:]), n_estimators=10, max_depth=3),
                         split_feature=split_column)
    fs.fit(X, y, sample_weight=sample_weight)
    assert fs.score(X, y) > 0.9
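FeatureSplitter, exercised above, trains one copy of the base estimator per distinct value of the split column; prediction then routes each row to its matching sub-model. A simplified sketch of the fitting half (assumed behaviour, not the rep.metaml source):

import numpy
from sklearn.base import clone

def fit_per_split_value(base_estimator, X, y, split_feature):
    models = {}
    for value in numpy.unique(X[split_feature]):
        # each sub-model sees only the rows carrying its split value
        mask = numpy.array(X[split_feature]) == value
        models[value] = clone(base_estimator).fit(X[mask], y[mask])
    return models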
Example #34
def test_theanets_partial_fit():
    clf_complete = TheanetsClassifier(layers=[2], trainers=[{'algo': 'rmsprop', 'learning_rate': 0.1},
                                                            {'algo': 'rprop', 'learning_rate': 0.1}])
    clf_partial = TheanetsClassifier(layers=[2], trainers=[{'algo': 'rmsprop', 'learning_rate': 0.1}])
    X, y, sample_weight = generate_classification_data()
    clf_complete.fit(X, y)
    clf_partial.fit(X, y)
    clf_partial.partial_fit(X, y, algo='rprop', learning_rate=0.1)

    assert clf_complete.trainers == clf_partial.trainers, 'trainers not saved in partial fit'

    auc_complete = roc_auc_score(y, clf_complete.predict_proba(X)[:, 1])
    auc_partial = roc_auc_score(y, clf_partial.predict_proba(X)[:, 1])

    # Known failure of theanets
    assert auc_complete == auc_partial, 'same networks return different results'
Example #35
File: test_grid.py Project: 0x0all/rep
def test_grid():
    def generate_scorer(test, labels):
        def custom(base_estimator, params, X, y, sample_weight=None):
            cl = clone(base_estimator)
            cl.set_params(**params)
            if sample_weight is not None:
                cl.fit(X, y, sample_weight)
            else:
                cl.fit(X, y)
            return roc_auc_score(labels, cl.predict_proba(test)[:, 1])

        return custom
    X, y, _ = generate_classification_data()

    grid_custom(generate_scorer(X, y))
    run_grid(grid_sklearn)
    run_grid(grid_tmva)
Example #36
def test_feature_splitter():
    # testing splitter
    from rep.metaml import FeatureSplitter

    X, y, sample_weight = generate_classification_data(n_classes=3)
    split_column = X.columns[0]
    splitters = numpy.random.randint(0, 3, size=len(X))
    X[split_column] = splitters
    X.loc[splitters == 1, :] += 4
    X.loc[splitters == 2, :] -= 4
    fs = FeatureSplitter(base_estimator=XGBoostClassifier(features=list(X.columns[1:]),
                                                          n_estimators=10,
                                                          max_depth=3),
                         split_feature=split_column)
    fs.fit(X, y, sample_weight=sample_weight)
    assert fs.score(X, y) > 0.9
Example #37
def test_theanets_partial_fit():
    clf_complete = TheanetsClassifier(trainers=[{
        'optimize': 'rmsprop'
    }, {
        'optimize': 'rprop'
    }])
    clf_partial = TheanetsClassifier(trainers=[{'optimize': 'rmsprop'}])
    X, y, sample_weight = generate_classification_data()
    clf_complete.fit(X, y)
    clf_partial.fit(X, y)
    clf_partial.partial_fit(X, y, optimize='rprop')

    assert clf_complete.trainers == clf_partial.trainers, 'trainers not saved in partial fit'

    auc_complete = roc_auc_score(y, clf_complete.predict_proba(X)[:, 1])
    auc_partial = roc_auc_score(y, clf_partial.predict_proba(X)[:, 1])

    assert auc_complete == auc_partial, 'same networks return different results'
Example #38
def test_grid():
    def generate_scorer(test, labels):
        def custom(base_estimator, params, X, y, sample_weight=None):
            cl = clone(base_estimator)
            cl.set_params(**params)
            if sample_weight is not None:
                cl.fit(X, y, sample_weight)
            else:
                cl.fit(X, y)
            return roc_auc_score(labels, cl.predict_proba(test)[:, 1])

        return custom

    X, y, _ = generate_classification_data()

    grid_custom(generate_scorer(X, y))
    run_grid(grid_sklearn)
    run_grid(grid_tmva)
Example #39
File: test_stacking.py Project: Afey/rep
def test_feature_splitter():
    # testing splitter
    from rep.metaml import FeatureSplitter

    X, y, sample_weight = generate_classification_data(n_classes=3)
    split_column = X.columns[0]
    splitters = numpy.random.randint(0, 3, size=len(X))
    X[split_column] = splitters
    X.loc[splitters == 1, :] += 4
    X.loc[splitters == 2, :] -= 4
    fs = FeatureSplitter(base_estimator=XGBoostClassifier(features=list(X.columns[1:]), n_estimators=10, max_depth=3),
                         split_feature=split_column)
    fs.fit(X, y, sample_weight=sample_weight)
    assert fs.score(X, y) > 0.9
    p_final = fs.predict_proba(X)
    for p in fs.staged_predict_proba(X):
        pass
    assert numpy.allclose(p_final, p), 'end of iterations differs from expected'
Example #40
def test_feature_splitter():
    # testing splitter
    from rep.metaml import FeatureSplitter

    X, y, sample_weight = generate_classification_data(n_classes=3)
    split_column = X.columns[0]
    splitters = numpy.random.randint(0, 3, size=len(X))
    X[split_column] = splitters
    X.loc[splitters == 1, :] += 4
    X.loc[splitters == 2, :] -= 4
    fs = FeatureSplitter(base_estimator=XGBoostClassifier(n_estimators=10, max_depth=3),
                         split_feature=split_column, train_features=list(X.columns[1:]))
    fs.fit(X, y, sample_weight=sample_weight)
    assert fs.score(X, y) > 0.9
    p_final = fs.predict_proba(X)
    for p in fs.staged_predict_proba(X):
        pass
    assert numpy.allclose(p_final, p), 'end of iterations differs from expected'
Example #41
def test_gridsearch_threads(n_threads=3):
    scorer = FoldingScorer(numpy.random.choice([OptimalAMS(), RocAuc()]))

    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]
    })
    generator = RegressionParameterOptimizer(grid_param, n_evaluations=4)

    base = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(base,
                               generator,
                               scorer,
                               parallel_profile='threads-{}'.format(n_threads))

    X, y, sample_weight = generate_classification_data()
    grid.fit(X, y, sample_weight=sample_weight)
Example #42
def test_grid_with_custom_scorer():
    """
    Introducing here a special scorer which always uses all the data passed to gridsearch.fit
    for training, and tests on a separate fixed dataset (passed to the scorer) by computing
    sklearn's roc_auc_score.
    """
    class CustomScorer(object):
        def __init__(self, testX, testY):
            self.testY = testY
            self.testX = testX

        def __call__(self, base_estimator, params, X, y, sample_weight=None):
            cl = clone(base_estimator)
            cl.set_params(**params)
            if sample_weight is not None:
                cl.fit(X, y, sample_weight)
            else:
                cl.fit(X, y)
            return roc_auc_score(self.testY,
                                 cl.predict_proba(self.testX)[:, 1])

    X, y, _ = generate_classification_data()
    custom_scorer = CustomScorer(X, y)

    grid_param = OrderedDict({
        "n_estimators": [10, 20],
        "learning_rate": [0.1, 0.05],
        'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]
    })
    generator = SubgridParameterOptimizer(grid_param)

    base_estimator = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(base_estimator, generator, custom_scorer)

    cl = check_grid(grid, False, False, False)
    assert len(cl.features) <= 3
    params = cl.get_params()
    for key in grid_param:
        if key in params:
            assert params[key] == grid.generator.best_params_[key]
        else:
            assert params['clf__' + key] == grid.generator.best_params_[key]
Example #43
def test_grid_with_custom_scorer():
    """
    Introducing here a special scorer which always uses all the data passed to gridsearch.fit
    for training, and tests on a separate fixed dataset (passed to the scorer) by computing
    sklearn's roc_auc_score.
    """

    class CustomScorer(object):
        def __init__(self, testX, testY):
            self.testY = testY
            self.testX = testX

        def __call__(self, base_estimator, params, X, y, sample_weight=None):
            cl = clone(base_estimator)
            cl.set_params(**params)
            if sample_weight is not None:
                cl.fit(X, y, sample_weight)
            else:
                cl.fit(X, y)
            return roc_auc_score(self.testY, cl.predict_proba(self.testX)[:, 1])

    X, y, _ = generate_classification_data()
    custom_scorer = CustomScorer(X, y)

    grid_param = OrderedDict({"n_estimators": [10, 20],
                              "learning_rate": [0.1, 0.05],
                              'features': [['column0', 'column1'], ['column0', 'column1', 'column2']]})
    generator = SubgridParameterOptimizer(grid_param)

    base_estimator = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(base_estimator, generator, custom_scorer)

    cl = check_grid(grid, False, False, False)
    assert len(cl.features) <= 3
    params = cl.get_params()
    for key in grid_param:
        if key in params:
            assert params[key] == grid.generator.best_params_[key]
        else:
            assert params['clf__' + key] == grid.generator.best_params_[key]
Example #44
def test_gridsearch_metrics_threads(n_threads=3):
    X, y, sample_weight = generate_classification_data(n_classes=2, distance=0.7)
    param_grid = OrderedDict({
        'reg_param': numpy.linspace(0, 1, 20)
    })

    from itertools import cycle

    optimizers = cycle([
        RegressionParameterOptimizer(param_grid=param_grid, n_evaluations=4, start_evaluations=2),
        SubgridParameterOptimizer(param_grid=param_grid, n_evaluations=4),
        RandomParameterOptimizer(param_grid=param_grid, n_evaluations=4),
    ])

    for metric in [RocAuc(), OptimalAMS(), OptimalSignificance(), log_loss]:
        scorer = FoldingScorer(metric)
        clf = SklearnClassifier(QuadraticDiscriminantAnalysis())
        grid = GridOptimalSearchCV(estimator=clf, params_generator=next(optimizers),
                                   scorer=scorer, parallel_profile='threads-{}'.format(n_threads))
        grid.fit(X, y)
        print(grid.params_generator.best_score_)
        print(grid.params_generator.best_params_)
        grid.params_generator.print_results()
Example #45
def check_grid(estimator, check_instance=True, has_staged_pp=True, has_importances=True, use_weights=False,
               classification=True):
    if classification:
        X, y, sample_weight = generate_classification_data()
    else:
        X, y, sample_weight = generate_regression_data()
    assert len(sample_weight) == len(X), 'somehow lengths are different'

    if use_weights:
        assert estimator == estimator.fit(X, y, sample_weight=sample_weight)
        estimator = estimator.fit_best_estimator(X, y, sample_weight=sample_weight)
    else:
        assert estimator == estimator.fit(X, y)
        estimator = estimator.fit_best_estimator(X, y)

    if classification:
        check_classification_model(estimator, X, y, check_instance=check_instance, has_staged_pp=has_staged_pp,
                                   has_importances=has_importances)
    else:
        check_regression_model(estimator, X, y, check_instance=check_instance, has_stages=has_staged_pp,
                               has_importances=has_importances)

    return estimator
Example #46
def check_grid(estimator,
               check_instance=True,
               has_staged_pp=True,
               has_importances=True,
               use_weights=False,
               classification=True):
    if classification:
        X, y, sample_weight = generate_classification_data()
    else:
        X, y, sample_weight = generate_regression_data()
    assert len(sample_weight) == len(X), 'somehow lengths are different'

    if use_weights:
        assert estimator == estimator.fit(X, y, sample_weight=sample_weight)
        estimator = estimator.fit_best_estimator(X,
                                                 y,
                                                 sample_weight=sample_weight)
    else:
        assert estimator == estimator.fit(X, y)
        estimator = estimator.fit_best_estimator(X, y)

    if classification:
        check_classification_model(estimator,
                                   X,
                                   y,
                                   check_instance=check_instance,
                                   has_staged_pp=has_staged_pp,
                                   has_importances=has_importances)
    else:
        check_regression_model(estimator,
                               X,
                               y,
                               check_instance=check_instance,
                               has_stages=has_staged_pp,
                               has_importances=has_importances)

    return estimator
Example #47
def test_theanets_reproducibility():
    clf = TheanetsClassifier(trainers=[{'min_improvement': 1}])
    X, y, _ = generate_classification_data()
    check_classification_reproducibility(clf, X, y)
Example #48
def test_Exception_trained_status():
    X, _, _ = generate_classification_data()
    cl = MatrixNetClassifier(api_config_file=CONFIG_FILE_WRONG_URL, iterations=50)
    cl.training_status()
Example #49
def test_mn_reproducibility():
    clf = MatrixNetClassifier(iterations=10)
    X, y, _ = generate_classification_data()
    check_classification_reproducibility(clf, X, y)
Example #50
def test_Exception_synchronized():
    X, _, _ = generate_classification_data()
    cl = MatrixNetClassifier(api_config_file=CONFIG_FILE_WRONG_URL, iterations=50)
    cl.synchronize()
Example #51
def test_mn_reproducibility():
    clf = MatrixNetClassifier(iterations=10)
    X, y, _ = generate_classification_data()
    check_classification_reproducibility(clf, X, y)
Example #52
def test_theanets_reproducibility():
    clf = TheanetsClassifier(trainers=[{'min_improvement': 1}])
    X, y, _ = generate_classification_data()
    check_classification_reproducibility(clf, X, y)
Example #53
def test_neurolab_reproducibility():
    clf = NeurolabClassifier(layers=[4, 5], epochs=2, trainf=nl.train.train_gd)
    X, y, _ = generate_classification_data()
    check_classification_reproducibility(clf, X, y)
Example #54
def pybrain_test_partial_fit():
    clf = PyBrainClassifier(layers=[4], epochs=2)
    X, y, _ = generate_classification_data()
    clf.partial_fit(X, y)
    clf.partial_fit(X[:2], y[:2])
Example #55
def test_neurolab_reproducibility():
    clf = NeurolabClassifier(layers=[4, 5], epochs=2, trainf=nl.train.train_gd)
    X, y, _ = generate_classification_data()
    check_classification_reproducibility(clf, X, y)
Example #56
def test_partial_fit():
    clf = NeurolabClassifier(layers=[4, 5], epochs=2, trainf=nl.train.train_gd)
    X, y, _ = generate_classification_data()
    clf.fit(X, y)
    clf.partial_fit(X[:2], y[:2])
Example #57
def test_partial_fit():
    clf = NeurolabClassifier(layers=[4, 5], epochs=2, trainf=nl.train.train_gd)
    X, y, _ = generate_classification_data()
    clf.fit(X, y)
    clf.partial_fit(X[:2], y[:2])