Example #1
def test_theanets_single_classification():
    check_classifier(TheanetsClassifier(),
                     supports_weight=False,
                     has_staged_pp=False,
                     has_importances=False)
    check_classifier(TheanetsClassifier(layers=[]),
                     supports_weight=False,
                     has_staged_pp=False,
                     has_importances=False)
    check_classifier(TheanetsClassifier(layers=[20],
                                        trainers=[{
                                            'optimize': 'sgd',
                                            'learning_rate': 0.3
                                        }]),
                     supports_weight=False,
                     has_staged_pp=False,
                     has_importances=False)
    check_classifier(TheanetsClassifier(layers=[5, 5],
                                        trainers=[{
                                            'optimize': 'sgd',
                                            'learning_rate': 0.3
                                        }]),
                     supports_weight=False,
                     has_staged_pp=False,
                     has_importances=False)
Example #2
def test_theanets_configurations():
    check_classifier(
        TheanetsClassifier(layers=[13], scaler=False,
                           trainers=[{'algo': 'nag', 'learning_rate': 0.1}]),
        **classifier_params)
    check_classifier(
        TheanetsClassifier(layers=[5, 5], scaler='minmax',
                           trainers=[{'algo': 'adadelta', 'learning_rate': 0.1}]),
        **classifier_params)
Example #3
def test_theanets_single_classification():
    check_classifier(TheanetsClassifier(trainers=[{
        'patience': 0
    }]), **classifier_params)
    check_classifier(
        TheanetsClassifier(layers=[],
                           scaler='minmax',
                           trainers=[{
                               'patience': 0
                           }]), **classifier_params)
Example #4
def test_theanets_configurations():
    check_classifier(
        TheanetsClassifier(
            layers=[13],
            scaler=False,
            trainers=[dict(algo='nag', learning_rate=0.1, **impatient)]),
        **classifier_params)
    check_classifier(
        TheanetsClassifier(
            layers=[5, 5],
            trainers=[dict(algo='adam', learning_rate=0.01, momentum=0.9)]),
        **classifier_params)
Example #5
def test_theanets_partial_fit():
    clf_complete = TheanetsClassifier(trainers=[{'optimize': 'rmsprop', 'patience': 0}, {'optimize': 'rprop'}])
    clf_partial = TheanetsClassifier(trainers=[{'optimize': 'rmsprop', 'patience': 0}])
    X, y, sample_weight = generate_classification_data()
    clf_complete.fit(X, y)
    clf_partial.fit(X, y)
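    # continue training with the second trainer; clf_partial should now
    # match clf_complete, which ran both trainers in a single fit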
    clf_partial.partial_fit(X, y, optimize='rprop')

    assert clf_complete.trainers == clf_partial.trainers, 'trainers not saved in partial fit'

    auc_complete = roc_auc_score(y, clf_complete.predict_proba(X)[:, 1])
    auc_partial = roc_auc_score(y, clf_partial.predict_proba(X)[:, 1])

    assert auc_complete == auc_partial, 'same networks return different results'
Example #6
def test_theanets_configurations():
    check_classifier(
        TheanetsClassifier(layers=[20],
                           scaler=False,
                           trainers=[{
                               'optimize': 'nag',
                               'learning_rate': 0.3,
                               'min_improvement': 0.5
                           }]), **classifier_params)
    check_classifier(
        TheanetsClassifier(layers=[5, 5],
                           trainers=[{
                               'optimize': 'nag',
                               'learning_rate': 0.3,
                               'min_improvement': 0.5
                           }]), **classifier_params)
Example #7
def test_theanets_multiclassification():
    check_classifier(TheanetsClassifier(trainers=[{
        'min_improvement': 0.1,
        'learning_rate': 0.1
    }]),
                     n_classes=4,
                     **classifier_params)
Example #8
def test_theanets_partial_fit():
    clf_complete = TheanetsClassifier(layers=[2], trainers=[{'algo': 'rmsprop', 'learning_rate': 0.1},
                                                            {'algo': 'rprop', 'learning_rate': 0.1}])
    clf_partial = TheanetsClassifier(layers=[2], trainers=[{'algo': 'rmsprop', 'learning_rate': 0.1}])
    X, y, sample_weight = generate_classification_data()
    clf_complete.fit(X, y)
    clf_partial.fit(X, y)
    clf_partial.partial_fit(X, y, algo='rprop', learning_rate=0.1)

    assert clf_complete.trainers == clf_partial.trainers, 'trainers not saved in partial fit'

    auc_complete = roc_auc_score(y, clf_complete.predict_proba(X)[:, 1])
    auc_partial = roc_auc_score(y, clf_partial.predict_proba(X)[:, 1])

    # Known fail of theanets
    assert auc_complete == auc_partial, 'same networks return different results'
Example #9
def test_theanets_simple_stacking():
    base_tnt = TheanetsClassifier()
    check_classifier(SklearnClassifier(
        clf=BaggingClassifier(base_estimator=base_tnt, n_estimators=3)),
                     supports_weight=False,
                     has_staged_pp=False,
                     has_importances=False)
Example #10
def test_theanets_multiple_classification():
    check_classifier(
        TheanetsClassifier(trainers=[{
            'optimize': 'adadelta',
            'min_improvement': 0.5
        }, {
            'optimize': 'nag'
        }]), **classifier_params)
Example #11
def test_theanets_reproducibility():
    clf = TheanetsClassifier(trainers=[{
        'algo': 'nag',
        'min_improvement': 0.1,
        'max_updates': 10
    }])
    X, y, _ = generate_classification_data()
    check_classification_reproducibility(clf, X, y)
Example #12
def test_theanets_reproducibility():
    clf = TheanetsClassifier(trainers=[{
        'algo': 'nag',
        'min_improvement': 0.1
    }])
    X, y, _ = generate_classification_data()
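    # fix numpy's global seed so that weight initialisation is deterministic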
    import numpy
    numpy.random.seed(43)
    check_classification_reproducibility(clf, X, y)
Example #13
def test_theanets_multiple_classification():
    check_classifier(TheanetsClassifier(trainers=[{
        'optimize': 'adadelta'
    }, {
        'optimize': 'nag'
    }]),
                     supports_weight=False,
                     has_staged_pp=False,
                     has_importances=False)
Example #14
def test_pretrain():
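    # unsupervised layer-wise pretraining ('pretrain'), then supervised 'nag'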
    clf = TheanetsClassifier(layers=[5, 5],
                             trainers=[{
                                 'algo': 'pretrain',
                                 'learning_rate': 0.1
                             }, {
                                 'algo': 'nag',
                                 'learning_rate': 0.1
                             }])
    check_classifier(clf, **classifier_params)
Example #15
def test_pretrain():
    clf = TheanetsClassifier(trainers=[{
        'optimize': 'pretrain',
        'patience': 1,
        'learning_rate': 0.1
    }, {
        'optimize': 'nag',
        'patience': 1
    }])
    check_classifier(clf, **classifier_params)
Example #16
def test_pretrain():
    trainX, trainY, _ = generate_classification_data()
    trainers = [{
        'algo': 'pretrain',
        'learning_rate': 0.5,
        'patience': 1,
        'validate_every': 1
    }]
    # only checking that fitting doesn't throw errors
    # this frequently gets stuck on CI
    TheanetsClassifier(layers=[5], trainers=trainers).fit(trainX, trainY)
Example #17
def test_theanets_reproducibility():
    clf = TheanetsClassifier()
    X, y, sample_weight = generate_classification_data()
    clf.fit(X, y)
    auc = roc_auc_score(y, clf.predict_proba(X)[:, 1])
    for i in range(2):
        clf.fit(X, y)
        curr_auc = roc_auc_score(y, clf.predict_proba(X)[:, 1])
        assert auc == curr_auc, 'running a network twice produces different results'

    cloned_clf = clone(clf)
    cloned_clf.fit(X, y)
    cloned_auc = roc_auc_score(y, cloned_clf.predict_proba(X)[:, 1])
    assert cloned_auc == auc, 'cloned network produces different result'
Example #18
    def _make_clf(self, clf, bagging=None):
        """Creates a classifier from a dict or returns the clf"""
        if isinstance(clf, dict):
            key, val = clf.popitem()
            try:
                val = self.__DEFAULT_CLF_CFG[key] if val is None else val
            except KeyError:
                logger.error(str(key) + " not an implemented classifier.")
                raise

            temp_bagging = val.pop('bagging', bagging)
            bagging = temp_bagging if bagging is None else bagging

            if key == 'rdf':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = SklearnClassifier(RandomForestClassifier(**config_clf))
            elif key == 'erf':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = SklearnClassifier(ExtraTreesClassifier(**config_clf))
            elif key == 'nn':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = TheanetsClassifier(**config_clf)
            elif key == 'ada':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = SklearnClassifier(AdaBoostClassifier(**config_clf))
            elif key == 'gb':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = SklearnClassifier(GradientBoostingClassifier(**config_clf))
            elif key == 'xgb':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = XGBoostClassifier(**config_clf)
            elif hasattr(clf, 'fit'):
                bagging = False  # return the classifier

            # bagging over the instantiated estimators
            if isinstance(bagging, int) and bagging >= 1:
                bagging = dict(self.__DEFAULT_BAG_CFG, n_estimators=bagging)
            if isinstance(bagging, dict):
                # TODO: implement multi-thread:
                bagging.update({'base_estimator': clf})
                clf = SklearnClassifier(BaggingClassifier(**bagging))
        else:
            raise ValueError(str(clf) + " not valid as a classifier.")

        clf = {key: clf}
        return clf
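A minimal usage sketch of the method above; the host class is hypothetical
(its name and the two default-config dicts are assumptions, only _make_clf
itself comes from the example):

import logging
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from rep.estimators import SklearnClassifier

logger = logging.getLogger(__name__)

class ClfFactory(object):
    # assumed defaults; the original project ships its own configs
    __DEFAULT_CLF_CFG = {'rdf': {'n_estimators': 100}}
    __DEFAULT_BAG_CFG = {'bootstrap': False}

    # _make_clf from the example above would be pasted here

factory = ClfFactory()
# resolve 'rdf' via the defaults, then wrap it in a 20-estimator bagger
clf = factory._make_clf({'rdf': None}, bagging=20)['rdf']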
Example #19
def clf_mayou(data1, data2, n_folds=3, n_base_clf=5):
    """DEVELOPEMENT, WIP. Test a setup of clf involving bagging and stacking."""
    # import raredecay.analysis.ml_analysis as ml_ana
    # import pandas as pd
    import copy

    from rep.estimators import SklearnClassifier, XGBoostClassifier
    from rep.metaml.folding import FoldingClassifier
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.ensemble import BaggingClassifier  # , VotingClassifier, AdaBoostClassifier
    from rep.estimators.theanets import TheanetsClassifier
    from sklearn.linear_model import LogisticRegression
    from rep.metaml.cache import CacheClassifier

    from rep.report.metrics import RocAuc

    import rep.metaml.cache
    from rep.metaml._cache import CacheHelper
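    # configure the on-disk cache used by the CacheClassifier wrappers below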
    rep.metaml.cache.cache_helper = CacheHelper('/home/mayou/cache', 100000)

    #    data1.make_folds(n_folds)
    #    data2.make_folds(n_folds)
    output = {}

    # for i in range(n_folds):
    xgb_clf = XGBoostClassifier(n_estimators=350,
                                eta=0.1,
                                max_depth=4,
                                nthreads=3)
    xgb_folded = FoldingClassifier(base_estimator=xgb_clf,
                                   stratified=True,
                                   parallel_profile='threads-2')
    xgb_bagged = BaggingClassifier(base_estimator=xgb_folded,
                                   n_estimators=n_base_clf,
                                   bootstrap=False)
    xgb_bagged = SklearnClassifier(xgb_bagged)
    xgb_big_stacker = copy.deepcopy(xgb_bagged)
    xgb_bagged = CacheClassifier(name='xgb_bagged1', clf=xgb_bagged)

    xgb_single = XGBoostClassifier(n_estimators=350,
                                   eta=0.1,
                                   max_depth=4,
                                   nthreads=3)
    xgb_single = FoldingClassifier(base_estimator=xgb_single,
                                   stratified=True,
                                   n_folds=10,
                                   parallel_profile='threads-2')
    xgb_single = CacheClassifier(name='xgb_singled1', clf=xgb_single)

    rdf_clf = SklearnClassifier(
        RandomForestClassifier(n_estimators=300, n_jobs=3))
    rdf_folded = FoldingClassifier(base_estimator=rdf_clf,
                                   stratified=True,
                                   parallel_profile='threads-2')
    rdf_bagged = BaggingClassifier(base_estimator=rdf_folded,
                                   n_estimators=n_base_clf,
                                   bootstrap=False)
    rdf_bagged = SklearnClassifier(rdf_bagged)
    rdf_bagged = CacheClassifier(name='rdf_bagged1', clf=rdf_bagged)

    gb_clf = SklearnClassifier(GradientBoostingClassifier(n_estimators=50))
    gb_folded = FoldingClassifier(base_estimator=gb_clf,
                                  stratified=True,
                                  parallel_profile='threads-6')
    gb_bagged = BaggingClassifier(base_estimator=gb_folded,
                                  n_estimators=n_base_clf,
                                  bootstrap=False,
                                  n_jobs=5)
    gb_bagged = SklearnClassifier(gb_bagged)
    gb_bagged = CacheClassifier(name='gb_bagged1', clf=gb_bagged)

    nn_clf = TheanetsClassifier(layers=[300, 300],
                                hidden_dropout=0.03,
                                trainers=[{
                                    'optimize': 'adagrad',
                                    'patience': 5,
                                    'learning_rate': 0.2,
                                    'min_improvement': 0.1,
                                    'momentum': 0.4,
                                    'nesterov': True,
                                    'loss': 'xe'
                                }])
    nn_folded = FoldingClassifier(base_estimator=nn_clf,
                                  stratified=True,
                                  parallel_profile=None)  # 'threads-6')
    nn_bagged = BaggingClassifier(base_estimator=nn_folded,
                                  n_estimators=n_base_clf,
                                  bootstrap=False,
                                  n_jobs=1)
    nn_bagged = CacheClassifier(name='nn_bagged1', clf=nn_bagged)

    nn_single_clf = TheanetsClassifier(layers=[300, 300, 300],
                                       hidden_dropout=0.03,
                                       trainers=[{
                                           'optimize': 'adagrad',
                                           'patience': 5,
                                           'learning_rate': 0.2,
                                           'min_improvement': 0.1,
                                           'momentum': 0.4,
                                           'nesterov': True,
                                           'loss': 'xe'
                                       }])
    nn_single = FoldingClassifier(base_estimator=nn_single_clf,
                                  n_folds=3,
                                  stratified=True)
    nn_single = CacheClassifier(name='nn_single1', clf=nn_single)

    logit_stacker = SklearnClassifier(
        LogisticRegression(penalty='l2', solver='sag'))
    logit_stacker = FoldingClassifier(base_estimator=logit_stacker,
                                      n_folds=n_folds,
                                      stratified=True,
                                      parallel_profile='threads-6')
    logit_stacker = CacheClassifier(name='logit_stacker1', clf=logit_stacker)

    xgb_stacker = XGBoostClassifier(n_estimators=400,
                                    eta=0.1,
                                    max_depth=4,
                                    nthreads=8)
    # HACK: replace the configured stacker with the deep copy of the
    # bagged xgb classifier defined above
    xgb_stacker = xgb_big_stacker
    xgb_stacker = FoldingClassifier(base_estimator=xgb_stacker,
                                    n_folds=n_folds,
                                    random_state=42,
                                    stratified=True,
                                    parallel_profile='threads-6')
    xgb_stacker = CacheClassifier(name='xgb_stacker1', clf=xgb_stacker)

    #        train1, test1 = data1.get_fold(i)
    #        train2, test2 = data1.get_fold(i)
    #
    #        t_data, t_targets, t_weights =
    data, targets, weights = data1.make_dataset(data2, weights_ratio=1)

    #    xgb_bagged.fit(data, targets, weights)
    #    xgb_report = xgb_bagged.test_on(data, targets, weights)
    #    xgb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC xgb_base classifier")
    #    output['xgb_base'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    #    xgb_proba = xgb_report.prediction['clf'][:, 1]
    #    del xgb_bagged, xgb_folded, xgb_clf, xgb_report
    #
    #    xgb_single.fit(data, targets, weights)
    #    xgb_report = xgb_single.test_on(data, targets, weights)
    #    xgb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC xgb_single classifier")
    #    output['xgb_single'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    #    xgb_proba = xgb_report.prediction['clf'][:, 1]
    #    del xgb_single, xgb_report

    nn_single.fit(data, targets, weights)
    nn_report = nn_single.test_on(data, targets, weights)
    nn_report.roc(physics_notion=True).plot(
        new_plot=True, title="ROC AUC nn_single classifier")
    output['nn_single'] = "roc auc:" + str(
        nn_report.compute_metric(metric=RocAuc()))
    # nn_proba = nn_report.prediction['clf'][:, 1]
    del nn_single, nn_report

    #    rdf_bagged.fit(data, targets, weights)
    #    rdf_report = rdf_bagged.test_on(data, targets, weights)
    #    rdf_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC rdf_base classifier")
    #    output['rdf_base'] = "roc auc:" + str(rdf_report.compute_metric(metric=RocAuc()))
    #    rdf_proba = rdf_report.prediction['clf'][:, 1]
    #    del rdf_bagged, rdf_clf, rdf_folded, rdf_report

    #    gb_bagged.fit(data, targets, weights)
    #    gb_report = gb_bagged.test_on(data, targets, weights)
    #    gb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC gb_base classifier")
    #    output['gb_base'] = "roc auc:" + str(gb_report.compute_metric(metric=RocAuc()))
    #    gb_proba = gb_report.prediction['clf'][:, 1]
    #    del gb_bagged, gb_clf, gb_folded, gb_report

    #    nn_bagged.fit(data, targets, weights)
    #    nn_report = nn_bagged.test_on(data, targets, weights)
    #    nn_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC nn_base classifier")
    #    output['nn_base'] = "roc auc:" + str(nn_report.compute_metric(metric=RocAuc()))
    #    nn_proba = nn_report.prediction['clf'][:, 1]
    #    del nn_bagged, nn_clf, nn_folded, nn_report
    #
    #    base_predict = pd.DataFrame({'xgb': xgb_proba,
    #                                 #'rdf': rdf_proba,
    #                                 #'gb': gb_proba,
    #                                 'nn': nn_proba
    #                                 })
    #
    #
    #    xgb_stacker.fit(base_predict, targets, weights)
    #    xgb_report = xgb_stacker.test_on(base_predict, targets, weights)
    #    xgb_report.roc(physics_notion=True).plot(new_plot=True,
    #    title="ROC AUC xgb_stacked classifier")
    #    output['stacker_xgb'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    #    del xgb_stacker, xgb_report
    #
    #    logit_stacker.fit(base_predict, targets, weights)
    #    logit_report = logit_stacker.test_on(base_predict, targets, weights)
    #    logit_report.roc(physics_notion=True).plot(new_plot=True,
    #    title="ROC AUC logit_stacked classifier")
    #    output['stacker_logit'] = "roc auc:" + str(logit_report.compute_metric(metric=RocAuc()))
    #    del logit_stacker, logit_report

    print(output)
Example #20
def test_theanets_reproducibility():
    clf = TheanetsClassifier(trainers=[{'min_improvement': 1}])
    X, y, _ = generate_classification_data()
    check_classification_reproducibility(clf, X, y)
Example #21
def test_theanets_multiclassification():
    check_classifier(TheanetsClassifier(trainers=[{
        'patience': 0
    }]),
                     n_classes=4,
                     **classifier_params)
Example #22
def test_theanets_partial_fit():
    clf_complete = TheanetsClassifier(trainers=[{
        'optimize': 'rmsprop'
    }, {
        'optimize': 'rprop'
    }])
    clf_partial = TheanetsClassifier(trainers=[{'optimize': 'rmsprop'}])
    X, y, sample_weight = generate_classification_data()
    clf_complete.fit(X, y)
    clf_partial.fit(X, y)
    clf_partial.partial_fit(X, y, optimize='rprop')

    assert clf_complete.trainers == clf_partial.trainers, 'trainers not saved in partial fit'

    auc_complete = roc_auc_score(y, clf_complete.predict_proba(X)[:, 1])
    auc_partial = roc_auc_score(y, clf_partial.predict_proba(X)[:, 1])

    assert auc_complete == auc_partial, 'same networks return different results'
Example #23
def test_theanets_simple_stacking():
    base_tnt = TheanetsClassifier(trainers=[{'min_improvement': 0.1}])
    base_bagging = BaggingClassifier(base_estimator=base_tnt, n_estimators=3)
    check_classifier(SklearnClassifier(clf=base_bagging), **classifier_params)
Example #24
def test_theanets_partial_fit():
    clf_complete = TheanetsClassifier(layers=[2],
                                      trainers=[{
                                          'algo': 'rmsprop',
                                          'learning_rate': 0.1
                                      }, {
                                          'algo': 'rprop',
                                          'learning_rate': 0.1
                                      }])
    clf_partial = TheanetsClassifier(layers=[2],
                                     trainers=[{
                                         'algo': 'rmsprop',
                                         'learning_rate': 0.1
                                     }])
    X, y, sample_weight = generate_classification_data()
    clf_complete.fit(X, y)
    clf_partial.fit(X, y)
    clf_partial.partial_fit(X, y, algo='rprop', learning_rate=0.1)

    assert clf_complete.trainers == clf_partial.trainers, 'trainers not saved in partial fit'

    auc_complete = roc_auc_score(y, clf_complete.predict_proba(X)[:, 1])
    auc_partial = roc_auc_score(y, clf_partial.predict_proba(X)[:, 1])

    # Known fail of theanets
    assert auc_complete == auc_partial, 'same networks return different results'