Example #1
def apply():
    if len(sys.argv) < 4:
        print("Usage: ", sys.argv[0], " apply datafile weightfile")
        return 1

    datafile = sys.argv[2]
    weightfile = sys.argv[3]

    X, y, w = readDataFile(datafile)

    forest = FastBDT.Classifier()
    forest.load(weightfile)
    analyse(forest, X, y)
    return 0
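The readDataFile and analyse helpers used by apply(), output() and train() are not part of these excerpts. Below is a minimal sketch of what readDataFile might look like; the file layout (label in the first column, weight in the last, features in between) is an assumption for illustration, not the format used by the original script.

import numpy as np

def readDataFile(datafile):
    # Hypothetical reader: whitespace-separated text file with the label in
    # the first column, the event weight in the last column and the feature
    # values in between (this layout is an assumption).
    data = np.loadtxt(datafile)
    y = data[:, 0]
    w = data[:, -1]
    X = data[:, 1:-1]
    return X, y, w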
Example #2
def worker(reduced, data, workerID=None):
    """Worker process
    
    reduced:
        Feature subset
    data:
        Data
    """
    # Make a copy of data
    data = data.copy()
    bdt = FastBDT.Classifier()
    bdt.fit(data[reduced], data.isSignal)
    data['mva'] = bdt.predict(data[reduced])
    auc = roc_auc_score(data.isSignal, data.mva)
    if workerID is not None:
        print("Worker %d finished the job!" % workerID)
    return {'auc': auc}
Example #3
def output():
    if len(sys.argv) < 4:
        print("Usage: ", sys.argv[0], " output datafile weightfile")
        return 1

    datafile = sys.argv[2]
    weightfile = sys.argv[3]

    X, y, w = readDataFile(datafile)

    forest = FastBDT.Classifier()
    forest.load(weightfile)

    p = forest.predict(X)
    for i, prob in enumerate(p):
        print(int(y[i] == 1), prob)
    return 0
Example #4
def train():
    if len(sys.argv) < 4:
        print(
            "Usage: ", sys.argv[0],
            " train datafile weightfile [nCuts=4] [nTrees=100] [nLevels=3] [shrinkage=0.1] [randRatio=0.5]"
        )
        return 1

    datafile = sys.argv[2]
    weightfile = sys.argv[3]

    forest = FastBDT.Classifier(*sys.argv[4:])
    X, y, w = readDataFile(datafile)

    forest.fit(X, y, w)
    forest.save(weightfile)
    analyse(forest, X, y)
    return 0
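The apply(), output() and train() functions above read like subcommands of a single command-line script that dispatches on sys.argv[1]. The dispatcher itself is not shown in the excerpts; a minimal sketch of how it could look (the exact structure is an assumption):

if __name__ == '__main__':
    # Hypothetical entry point: pick the subcommand from argv[1],
    # print a short usage message otherwise.
    commands = {'train': train, 'apply': apply, 'output': output}
    if len(sys.argv) < 2 or sys.argv[1] not in commands:
        print("Usage: ", sys.argv[0], " train|apply|output datafile weightfile")
        sys.exit(1)
    sys.exit(commands[sys.argv[1]]())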
Example #5
def backward_selection(features, data, min_auc=0.9975):
    # Make a copy of features so we do not accidentally modify them
    features = features.copy()

    print('Fitting the model with all the features...')
    bdt = FastBDT.Classifier()
    bdt.fit(data[features], data.isSignal)
    data['mva'] = bdt.predict(data[features])
    init_auc = roc_auc_score(data.isSignal, data.mva)
    print('Initial AUC:', init_auc)
    print('Minimum AUC to continue the search: %.4f' % min_auc)

    current_auc = init_auc
    best = pd.DataFrame()
    best = best.append({'n_features': len(features), 'best_auc': current_auc},
                       ignore_index=True)

    while current_auc >= min_auc:
        print("Trying to remove one feature...")
        result = pd.DataFrame()
        n_workers = 10
        pool = multiprocessing.Pool(n_workers)
        worker_results = []
        for i, _ in enumerate(features):
            reduced = [features[j] for j in range(len(features)) if j != i]
            worker_results.append(pool.apply_async(worker, [reduced, data]))
            print('Worker[%d]: Trying to remove %s' % (i, features[i]))
        print("Waiting for the workers to finish...")
        for i, res in enumerate(worker_results):
            res_value = res.get()
            print("Return value from worker %d is %r" % (i, res_value))
            result = result.append({'feature': features[i], **res_value},
                                   ignore_index=True)
        pool.close()
        pool.join()
        print(result)
        result = result.sort_values(by='auc', ascending=False)
        current_auc = result.iloc[0].auc
        best = best.append({'n_features': len(features) - 1,
                            'best_auc': current_auc},
                           ignore_index=True)
        features = result.iloc[1:].feature.values

        print("Highest AUC = %.4f" % current_auc)
        print("%s will be removed" % result.iloc[0].feature)
        print('Features left (%d):' % (len(features)))
        for f in features:
            print(f)
        print("AUC table begins".center(40, '='))
        print(result)
        print("AUC table ends".center(40, '='))

        fig = plot_auc_chart(init_auc, result)
        fig.savefig('../models/multiproc/auc_%d.png' % (len(features)),
                    bbox_inches="tight")

    return best
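A hedged usage sketch for backward_selection(); the input file name and the assumption that the DataFrame carries an isSignal label column (as the worker above expects) are for illustration only:

if __name__ == '__main__':
    # Hypothetical driver: load a DataFrame with an 'isSignal' label column
    # and treat every other column as a candidate feature.
    data = pd.read_csv('training_data.csv')  # file name is an assumption
    features = [c for c in data.columns if c != 'isSignal']
    best = backward_selection(features, data, min_auc=0.9975)
    print(best)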
Example #6
        mean = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
        cov = [[1.0, 0.8, 0.4, 0.2, 0.1, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
               [0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
               [0.0, 0.0, 0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]]

        for i in range(len(mean)):
            for j in range(i + 1, len(mean)):
                cov[j][i] = cov[i][j]

        N_train, N_test = 100000, 2000
        data = np.random.multivariate_normal(mean, cov, N_train + N_test)
        X_train, y_train = data[:N_train, 1:], data[:N_train, 0] > 0
        X_test, y_test = data[N_train:, 1:], data[N_train:, 0] > 0

        # First variable is the variable we want to have independent of our network output
        prior = Prior(X_train[y_train == 1, 0], X_train[y_train == 0, 0])
        p_prior = prior.get_prior(X_test[:, 0])
        evaluation("Prior", X_test, y_test, p_prior, p_prior)

        p = FastBDT.Classifier().fit(X=X_train, y=y_train).predict(X_test)
        evaluation("Full", X_test, y_test, p, p_prior)

        p = FastBDT.Classifier().fit(X=X_train[:, 1:],
                                     y=y_train).predict(X_test[:, 1:])
        evaluation("Restricted", X_test, y_test, p, p_prior)

        boost_p = FastBDT.Classifier().fit(
            X=numpy.r_[X_train[:, 1:], X_train[:, 1:]],
            y=numpy.r_[numpy.ones(N_train), numpy.zeros(N_train)],
            weights=prior.get_boost_weights(X_train[:, 0])).predict(X_train[:, 1:])

        p = FastBDT.Classifier().fit(X=X_train[:, 1:],
Example #7
        mean = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
        cov = [[1.0, 0.8, 0.4, 0.2, 0.1, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
               [0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
               [0.0, 0.0, 0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]]

        for i in range(len(mean)):
            for j in range(i + 1, len(mean)):
                cov[j][i] = cov[i][j]

        N_train, N_test = 100000, 2000
        data = np.random.multivariate_normal(mean, cov, N_train + N_test)
        X_train, y_train = data[:N_train, 1:], data[:N_train, 0] > 0
        X_test, y_test = data[N_train:, 1:], data[N_train:, 0] > 0

        # First variable is the variable we want to have independent of our network output
        prior = Prior(X_train[y_train == 1, 0], X_train[y_train == 0, 0])
        p_prior = prior.get_prior(X_test[:, 0])
        evaluation("Prior", X_test, y_test, p_prior, p_prior)

        p = FastBDT.Classifier(flatnessLoss=1.0,
                               numberOfFlatnessFeatures=1).fit(
                                   X=np.c_[X_train[:, 1:], X_train[:, 0]],
                                   y=y_train).predict(X_test[:, 1:])
        print(p)
        evaluation("UBoost", X_test, y_test, p, p_prior)

        p = FastBDT.Classifier().fit(X=X_train, y=y_train).predict(X_test)
        evaluation("Full", X_test, y_test, p, p_prior)

        p = FastBDT.Classifier().fit(X=X_train[:, 1:],
                                     y=y_train).predict(X_test[:, 1:])
        evaluation("Restricted", X_test, y_test, p, p_prior)
Example #8
    mean = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
    cov = [[1.0, 0.8, 0.4, 0.2, 0.1, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
           [0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
           [0.0, 0.0, 0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]]

    for i in range(len(mean)):
        for j in range(i + 1, len(mean)):
            cov[j][i] = cov[i][j]

    N_train, N_test = 10000, 10000
    data = np.random.multivariate_normal(mean, cov, N_train + N_test)
    X_train, y_train = data[:N_train, 1:], data[:N_train, 0] > 0
    X_test, y_test = data[N_train:, 1:], data[N_train:, 0] > 0

    # Train FastBDT using its PythonInterface, which is based on the SKLearn classifiers
    clf = FastBDT.Classifier(purityTransformation=1)
    clf.fit(X=X_train, y=y_train)
    p = clf.predict(X_test)
    global_auc = sklearn.metrics.roc_auc_score(y_test, p)
    print("Global AUC", global_auc)

    # Intern feature importance is calculated using the sum of the information gains
    # provided by each feature in all decision trees
    print("Intern Feature Importance")
    print(clf.internFeatureImportance())

    # Extern feature importance is calculated using the drop in the area under the receiver operating characteristics curve
    # if the most important feature is left out recursively
    print("Extern Feature Importance")
    print(
        clf.externFeatureImportance(X_train, y_train, None, X_test, y_test,
                                    None))
Example #9
from PyFastBDT import FastBDT

import pandas
import numpy as np
import sklearn.metrics

if __name__ == '__main__':

    # Single feature in 0..99 whose parity determines the label: no single cut
    # on the raw feature separates the classes, but the purity transformation
    # (mapping each feature bin to its signal purity) makes one cut sufficient.
    data = np.arange(100000)
    X = (data % 100).reshape((100000, 1))
    y = (data % 2) == 1

    clf = FastBDT.Classifier(nTrees=1, depth=1, shrinkage=0.1, subsample=1.0, purityTransformation=[False]).fit(X=X, y=y)
    p = clf.predict(X)
    print('No Purity Transformation', sklearn.metrics.roc_auc_score(y, p))

    clf = FastBDT.Classifier(nTrees=1, depth=1, shrinkage=0.1, subsample=1.0, purityTransformation=[True]).fit(X=X, y=y)
    p = clf.predict(X)
    print('With Purity Transformation', sklearn.metrics.roc_auc_score(y, p))
Example #10
    mean = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
    cov = [[1.0, 0.8, 0.4, 0.2, 0.1, 0.0],
           [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
           [0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
           [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
           [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
           [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]]

    for i in range(len(mean)):
        for j in range(i+1, len(mean)):
            cov[j][i] = cov[i][j]

    N_train, N_test = 10000, 10000
    data = np.random.multivariate_normal(mean, cov, N_train + N_test)
    X_train, y_train = data[:N_train, 1:], data[:N_train, 0] > 0 
    X_test, y_test = data[N_train:, 1:], data[N_train:, 0] > 0 

    # Train FastBDT using its PythonInterface, which is based on the SKLearn classifiers
    clf = FastBDT.Classifier()
    clf.fit(X=X_train, y=y_train)
    p = clf.predict(X_test)
    global_auc = sklearn.metrics.roc_auc_score(y_test, p)
    print("Global AUC", global_auc)

    # Intern feature importance is calculated using the sum of the information gains
    # provided by each feature in all decision trees
    print("Intern Feature Importance")
    print(clf.internFeatureImportance())

    # Extern feature importance is calculated using the drop in the area under the receiver operating characteristics curve
    # if the most important feature is left out recursively
    print("Extern Feature Importance")
    print(clf.externFeatureImportance(X_train, y_train, None, X_test, y_test, None))
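The comments above describe externFeatureImportance as the drop in ROC AUC when the most important feature is left out recursively. As a rough, non-recursive illustration of the same idea (a sketch, not the library's actual implementation), one could measure a per-feature AUC drop like this:

def simple_auc_drop_importance(X_train, y_train, X_test, y_test, global_auc):
    # For every feature, retrain without it and record how much the test AUC
    # drops compared to the model trained on all features (hypothetical helper).
    importances = {}
    for i in range(X_train.shape[1]):
        keep = [j for j in range(X_train.shape[1]) if j != i]
        clf = FastBDT.Classifier()
        clf.fit(X=X_train[:, keep], y=y_train)
        auc = sklearn.metrics.roc_auc_score(y_test, clf.predict(X_test[:, keep]))
        importances[i] = global_auc - auc
    return importances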
Example #11
    mean = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
    cov = [[1.0, 0.8, 0.4, 0.2, 0.1, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
           [0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
           [0.0, 0.0, 0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]]

    for i in range(len(mean)):
        for j in range(i + 1, len(mean)):
            cov[j][i] = cov[i][j]

    N_train, N_test = 10000, 10000
    data = np.random.multivariate_normal(mean, cov, N_train + N_test)
    X_train, y_train = data[:N_train, 1:], data[:N_train, 0] > 0
    X_test, y_test = data[N_train:, 1:], data[N_train:, 0] > 0

    # Train FastBDT using its PythonInterface, which is based on the SKLearn classifiers
    clf = FastBDT.Classifier()
    clf.fit(X=X_train, y=y_train)
    p = clf.predict(X_test)
    global_auc = sklearn.metrics.roc_auc_score(y_test, p)
    print("Global AUC", global_auc)

    # Intern feature importance is calculated using the sum of the information gains
    # provided by each feature in all decision trees
    print("Intern Feature Importance")
    print(clf.internFeatureImportance())

    # Extern feature importance is calculated using the drop in the area under the receiver operating characteristics curve
    # if the most important feature is left out recursively
    print("Extern Feature Importance")
    print(
        clf.externFeatureImportance(X_train, y_train, None, X_test, y_test,
                                    None))
Example #12
        mean = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
        cov = [[1.0, 0.8, 0.4, 0.2, 0.1, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
               [0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
               [0.0, 0.0, 0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]]

        for i in range(len(mean)):
            for j in range(i + 1, len(mean)):
                cov[j][i] = cov[i][j]

        N_train, N_test = 100000, 2000
        data = np.random.multivariate_normal(mean, cov, N_train + N_test)
        X_train, y_train = data[:N_train, 1:], data[:N_train, 0] > 0
        X_test, y_test = data[N_train:, 1:], data[N_train:, 0] > 0

        # First variable is the variable we want to have independent of our network output
        prior = Prior(X_train[y_train == 1, 0], X_train[y_train == 0, 0])
        p_prior = prior.get_prior(X_test[:, 0])
        evaluation("Prior", X_test, y_test, p_prior, p_prior)

        p = FastBDT.Classifier(flatnessLoss=10.0).fit(
            X=np.c_[X_train[:, 1:], X_train[:, 0]],
            y=y_train,
            nSpectators=1).predict(X_test[:, 1:])
        print(p)
        evaluation("UBoost", X_test, y_test, p, p_prior)

        p = FastBDT.Classifier().fit(X=X_train, y=y_train).predict(X_test)
        evaluation("Full", X_test, y_test, p, p_prior)

        p = FastBDT.Classifier().fit(X=X_train[:, 1:],
                                     y=y_train).predict(X_test[:, 1:])
        evaluation("Restricted", X_test, y_test, p, p_prior)