예제 #1
0
def _test_classification_report(n_classes=2):
    """Fit a small factory of ensembles and run the mask-report checks.

    Three classifiers ('gb', 'rf', 'ada') are trained on one generated sample
    and evaluated on a second, independently generated sample.  The resulting
    report is then passed to ``_classification_mask_report`` with three kinds
    of mask: a string expression, a callable and ``None``.

    Parameters
    ----------
    n_classes : int
        Number of classes requested from ``generate_classification_sample``.
        When it is greater than 2, an explicit ``{class_index: str(index)}``
        label mapping is handed to the report checks; otherwise ``None``.
    """
    factory = ClassifiersFactory()
    factory.add_classifier('gb', GradientBoostingClassifier(n_estimators=10))
    factory.add_classifier('rf', RandomForestClassifier())
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=10))

    # Training sample.
    train_X, train_y = generate_classification_sample(1000, 5,
                                                      n_classes=n_classes)
    factory.fit(train_X, train_y)

    # Fresh sample used only for evaluation.
    X, y = generate_classification_sample(1000, 5, n_classes=n_classes)
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight=None))

    # The mean of the first column serves as the cut value for the masks.
    val = numpy.mean(X['column0'])

    if n_classes > 2:
        labels_dict = {label: str(label) for label in range(n_classes)}
    else:
        labels_dict = None

    masks = (
        "column0 > %f" % val,
        lambda x: numpy.array(x['column0']) < val,
        None,
    )
    for mask in masks:
        _classification_mask_report(report, mask, X, labels_dict)
예제 #2
0
def test_factory():
    """Exercise ``ClassifiersFactory``: fit, predict, staged output, pickling, reports.

    The function is a generator-style test: after the inline assertions it
    yields three ``(check_report_with_mask, report, mask, X)`` tuples, one per
    mask flavour (string expression, callable, ``None``), for the test runner
    to execute.

    Steps performed, in order:

    1. Build a factory with 'rf' and 'ada' (plus 'tmva' when it is importable).
    2. Fit it with four threads and check that every classifier was given the
       full feature list.
    3. Check accuracy, probability normalisation/positivity and ROC AUC.
    4. Check that the last staged prediction equals ``predict_proba``.
    5. Check that a pickled/unpickled factory predicts identically.
    6. Build classification reports and yield the mask checks.
    """
    factory = ClassifiersFactory()
    # TMVA is optional; the factory is simply built without it if the import fails.
    try:
        from rep.estimators.tmva import TMVAClassifier
        factory.add_classifier('tmva', TMVAClassifier())
    except ImportError:
        pass
    factory.add_classifier('rf', RandomForestClassifier(n_estimators=10))
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    # Fit outside of the assert: assert statements are removed under
    # ``python -O``, which would otherwise skip the training entirely.
    fitted = factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns),
                         parallel_profile='threads-4')
    assert factory == fitted
    for cl in factory.values():
        assert list(cl.features) == list(X.columns)
    proba = factory.predict_proba(X, parallel_profile='threads-4')
    labels = factory.predict(X, parallel_profile='threads-4')
    for key, val in labels.items():
        score = accuracy_score(y, val)
        print(key, score)
        assert score > 0.7, key

    for key, val in proba.items():
        assert numpy.allclose(val.sum(axis=1), 1), 'probabilities do not sum to 1'
        assert numpy.all(val >= 0.), 'negative probabilities'

        auc_score = roc_auc_score(y, val[:, 1])
        print(auc_score)
        assert auc_score > 0.8

    for key, iterator in factory.staged_predict_proba(X).items():
        assert key != 'tmva', 'tmva does not support staged pp'
        # Reset per classifier so an empty iterator cannot silently reuse the
        # last stage of the previous classifier.
        last_stage = None
        for last_stage in iterator:
            assert last_stage.shape == (len(X), 2)
        assert last_stage is not None, 'no staged predictions for %s' % key

        # The final stage must coincide with the plain predict_proba output.
        assert numpy.all(last_stage == proba[key])

    # testing picklability
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)

    assert type(factory) is type(clf_loaded)

    probs1 = factory.predict_proba(X)
    probs2 = clf_loaded.predict_proba(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'

    report = ClassificationReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(roc_auc_score_mod).plot(new_plot=True, figsize=(18, 3))
    # Called only to check that it runs; the report used below comes from test_on.
    factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)
    val = numpy.mean(X['column0'])
    yield check_report_with_mask, report, "column0 > %f" % (val / 2.), X
    yield check_report_with_mask, report, lambda x: numpy.array(x['column0']) < val * 2., X
    yield check_report_with_mask, report, None, X
예제 #3
0
    def __init__(self, base_estimators=None, bagging_base=None, stacking='xgb',
                 features_stack=None, bagging_stack=None, hunting=False,
                 transform=True, transform_pred=True):
        """Store the configuration of the level-0 and stacking classifiers.

        Nothing is trained here; the arguments are only normalised and kept
        on the instance.

        Parameters
        ----------
        base_estimators : dict('clf': classifier OR keyword-parameters)
            Contains all the level-0 classifiers. The key is the name of the
            classifier and the value is either a **prefitted** classifier or
            a dictionary containing the keyword arguments to instantiate
            such a classifier.

            If no pre-trained classifier is provided, the key-value has to
            be one of the following:
             - **'xgb'** creates an XGBoost classifier
             - **'rdf'** creates a Random Forest classifier
             - **'erf'** creates a Forest consisting of Extra Randomized Trees
             - **'nn'** creates an artificial Neural Network from TheaNets
             - **'ada'** creates an AdaBoost instance with Decision Trees as
               basis
             - **'gb'** creates a Gradient Boosted classifier with Decision
               Trees as basis

            If None, a copy of the class-level default configuration is used.
        bagging_base
            Stored unchanged as ``_bagging``.
        stacking : str, dict, classifier, False or None
            Configuration of the stacking classifier, stored as ``_clf_1``:
             - a string becomes ``{stacking: None}``
             - a dict is stored as given
             - False or None stores an empty dict
             - anything else is treated as a classifier instance and stored
               as ``{'clf_stacking': stacking}``
        features_stack
            Stored unchanged as ``_features_stack``.
        bagging_stack
            Stored unchanged as ``_clf_1_bagging``.
        hunting
            Stored unchanged as ``_hunting``.
        transform
            Stored unchanged as ``_transform_data``.
        transform_pred
            Accepted for interface compatibility; not stored by this method.
        """
        if base_estimators is None:
            # Copy the defaults so later changes cannot leak into the class
            # attribute.  (Previously the copy was built and then discarded,
            # leaving ``_base_estimators`` undefined.)
            self._base_estimators = OrderedDict(self.__DEFAULT_CLF_CFG)
        else:
            self._base_estimators = base_estimators

        if isinstance(stacking, str):
            self._clf_1 = {stacking: None}
        elif isinstance(stacking, dict):
            self._clf_1 = stacking
        elif stacking in (False, None):
            # Previously this branch left ``_clf_1`` undefined.
            # NOTE(review): an empty mapping is used to mean "no stacking
            # classifier" - confirm that downstream code treats it that way.
            self._clf_1 = {}
        else:
            self._clf_1 = {'clf_stacking': stacking}  # stacking is a classifier

        # NOTE(review): transform_pred is accepted but never stored - confirm
        # whether it should be kept on the instance.
        self._transform_data = transform
        self._bagging = bagging_base
        self._hunting = hunting
        self._clf_1_bagging = bagging_stack
        self._features_stack = features_stack
        self._clf_0 = {}
        self._factory = ClassifiersFactory()
        self._base_scaler = None
        self._pred_scaler = None
예제 #4
0
    # Load the pickled reports from "<args.dir>/train_uniform_reports.pkl".
    # NOTE(review): pickle.load executes arbitrary code from the file - only
    # use this on files from a trusted source.
    if args.verbose: fprint("Start loading input")
    with open(args.dir + "/train_uniform_reports.pkl", 'rb') as infile:
        reports = pickle.load(infile)
    if args.verbose: fprint("Finish loading input")

    # Import the user configuration module named by args.config (a trailing
    # ".py" is stripped first) and take its ``uconfig`` attribute.
    # config_path() is called before the import; presumably it makes the
    # configuration directory importable - verify in mods.
    from mods import config_path
    config_path()
    args.config = args.config.replace(".py", "")
    # A non-empty fromlist makes __import__ return the named module itself
    # rather than its top-level package.
    # NOTE(review): fromlist is conventionally a list or tuple of names, not a
    # plain string.
    uconfig = getattr(__import__(args.config, fromlist="uconfig"), "uconfig")

    # If specific classifiers were requested, restrict every report to that
    # subset (in the order given in args.classifiers).  A requested classifier
    # that is missing from a report raises ValueError.
    if len(args.classifiers) > 0:
        # NOTE(review): iteritems() exists only on Python 2 dicts.
        for rname, report in reports.iteritems():
            est_old = report.estimators
            pred_old = report.prediction
            est_new = ClassifiersFactory()
            pred_new = OrderedDict()
            for classifier in args.classifiers:
                if classifier in est_old:
                    est_new[classifier] = est_old[classifier]
                    pred_new[classifier] = pred_old[classifier]
                else:
                    raise ValueError("Requested classifier " + classifier +
                                     " not found in report " + rname)
            # Replace the report's estimators and predictions with the subset.
            report.estimators = est_new
            report.prediction = pred_new

    # Containers for the plots to be serialized.
    repplots = OrderedDict()
    matplots = OrderedDict()
예제 #5
0
# Unpack the split data: index 0 holds the features, index 1 the labels.
trainX = train_data[0]
testX = test_data[0]
trainY = train_data[1]
testY = test_data[1]
# The weight arrays follow from index 2 on, in the sorted order of the names
# in W; map each weight name to its train/test array.
trainW = {}
testW = {}
for iw, weight in enumerate(sorted(W)):
    trainW[weight] = train_data[iw + 2]
    testW[weight] = test_data[iw + 2]

if args.verbose:
    # NOTE(review): both train_size and test_size print uconfig.training.size -
    # confirm whether the test size should come from a different setting.
    fprint("Split data into train_size=" + str(uconfig.training.size) +
           ", test_size=" + str(uconfig.training.size))

# Classifiers to train, and for each classifier name the training weights to use.
classifiers = ClassifiersFactory()
weights = OrderedDict()

# Standard BDT: a gradient boosting classifier with hyper-parameters taken from
# uconfig.hyper, wrapped in SklearnClassifier restricted to the training features.
if "bdt" in uconfig.training.algorithms:
    base_grad = GradientBoostingClassifier(
        max_depth=uconfig.hyper.max_depth,
        n_estimators=uconfig.hyper.n_estimators,
        subsample=uconfig.hyper.subsample,
        learning_rate=uconfig.hyper.learning_rate,
        min_samples_leaf=uconfig.hyper.min_samples_leaf,
    )
    classifiers["bdt"] = SklearnClassifier(base_grad,
                                           features=uconfig.features.train)
    # uconfig.training.algorithms["bdt"] names the weight set used for this classifier.
    weights["bdt"] = trainW[uconfig.training.algorithms["bdt"]]