Пример #1
0
def test_factory():
    """Smoke-test ``ClassifiersFactory``: fit, predict, staged predict, pickle, reports.

    The function is a generator: its trailing ``yield`` statements emit
    ``(check_report_with_mask, report, mask, X)`` tuples, one per mask flavour
    (string expression, callable, ``None``).
    NOTE(review): presumably these are consumed by a nose-style runner that
    calls the first element with the remaining ones -- confirm that the test
    runner in use still supports yield-based tests.
    """
    factory = ClassifiersFactory()
    # TMVA is an optional backend: deliberately skip it when it cannot be imported.
    try:
        from rep.estimators.tmva import TMVAClassifier
        factory.add_classifier('tmva', TMVAClassifier())
    except ImportError:
        pass
    factory.add_classifier('rf', RandomForestClassifier(n_estimators=10))
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    # fit() is expected to return the factory itself.
    assert factory == factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns),
                                  parallel_profile='threads-4')
    for cl in factory.values():
        assert list(cl.features) == list(X.columns)
    proba = factory.predict_proba(X, parallel_profile='threads-4')
    labels = factory.predict(X, parallel_profile='threads-4')
    for key, val in labels.items():
        score = accuracy_score(y, val)
        print(key, score)
        assert score > 0.7, key

    for key, val in proba.items():
        assert numpy.allclose(val.sum(axis=1), 1), 'probabilities do not sum to 1'
        assert numpy.all(val >= 0.), 'negative probabilities'

        auc_score = roc_auc_score(y, val[:, 1])
        print(auc_score)
        # Report which classifier failed, as the accuracy check above does.
        assert auc_score > 0.8, key

    for key, iterator in factory.staged_predict_proba(X).items():
        assert key != 'tmva', 'tmva does not support staged pp'
        # Reset per classifier: without this, an empty iterator would either
        # raise NameError (first classifier) or silently compare the previous
        # classifier's last stage against this classifier's probabilities.
        stage = None
        for stage in iterator:
            assert stage.shape == (len(X), 2)
        assert stage is not None, 'no staged predictions for %s' % key

        # checking that last iteration coincides with previous
        assert numpy.all(stage == proba[key])

    # testing picklability
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)

    # The round trip must give back exactly the same class.
    assert type(clf_loaded) is type(factory)

    probs1 = factory.predict_proba(X)
    probs2 = clf_loaded.predict_proba(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'

    report = ClassificationReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(roc_auc_score_mod).plot(new_plot=True, figsize=(18, 3))
    # Smoke check only: the report returned by test_on_lds is not inspected.
    factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)
    val = numpy.mean(X['column0'])
    yield check_report_with_mask, report, "column0 > %f" % (val / 2.), X
    yield check_report_with_mask, report, lambda x: numpy.array(x['column0']) < val * 2., X
    yield check_report_with_mask, report, None, X
Пример #2
0
# Persist the trained classifiers as a pickle file.
with open(cname, 'wb') as classifiers_file:
    pickle.dump(classifiers, classifiers_file)

# Export to TMVA format: every training feature is paired with the type code 'F'.
tmva_vars = [(f, 'F') for f in uconfig.features.train]

if "bdt" in uconfig.training.algorithms:
    skTMVA.convert_bdt__Grad(classifiers["bdt"], tmva_vars, wname_bdt)

if "ubdt" in uconfig.training.algorithms:
    from mods import uGB_to_GB

    # The UGradientBoostingClassifier is first made compatible with sklearn's
    # GradientBoostingClassifier, then exported like the plain BDT.
    # NOTE(review): the helper's return value is discarded, so it presumably
    # adapts the classifier in place -- confirm against mods.uGB_to_GB.
    ubdt_clf = classifiers["ubdt"]
    uGB_to_GB(ubdt_clf)
    skTMVA.convert_bdt__Grad(ubdt_clf, tmva_vars, wname_ubdt)

# Build the reports: because of the report structure, the classifiers must be
# evaluated once per set of weights, on both the train and the test sample.
reports = {}
for weight in sorted(W):
    train_report = classifiers.test_on(
        trainX, trainY, sample_weight=trainW[weight])
    reports["train" + weight] = train_report

    test_report = classifiers.test_on(
        testX, testY, sample_weight=testW[weight])
    reports["test" + weight] = test_report

# Persist all reports in a single pickle file.
with open(rname, 'wb') as reports_file:
    pickle.dump(reports, reports_file)

if args.verbose:
    fprint("Finish saving")