def apply():
    if len(sys.argv) < 4:
        print("Usage: ", sys.argv[0], " apply datafile weightfile")
        return 1
    datafile = sys.argv[2]
    weightfile = sys.argv[3]
    X, y, w = readDataFile(datafile)
    forest = FastBDT.Classifier()
    forest.load(weightfile)
    analyse(forest, X, y)
    return 0
def worker(reduced, data, workerID=None):
    """Worker process: train a FastBDT on a reduced feature subset and return its ROC AUC.
    reduced: list of feature columns to use
    data: pandas DataFrame containing the features and the isSignal label
    """
    # Make a copy of data
    data = data.copy()
    bdt = FastBDT.Classifier()
    bdt.fit(data[reduced], data.isSignal)
    data['mva'] = bdt.predict(data[reduced])
    auc = roc_auc_score(data.isSignal, data.mva)
    if workerID is not None:
        print("Worker %d finished the job!" % workerID)
    return {'auc': auc}
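# Usage sketch for worker() above (hedged: the toy DataFrame and its column
# names are illustrative only, not the analysis data; FastBDT is assumed to be
# imported by the surrounding script as in the other examples).
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score  # used inside worker()

rng = np.random.default_rng(0)
toy = pd.DataFrame({'f1': rng.normal(size=1000), 'f2': rng.normal(size=1000)})
toy['isSignal'] = (toy.f1 + 0.5 * rng.normal(size=1000) > 0).astype(int)
print(worker(['f1', 'f2'], toy, workerID=0))  # -> {'auc': ...}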
def output():
    if len(sys.argv) < 4:
        print("Usage: ", sys.argv[0], " output datafile weightfile")
        return 1
    datafile = sys.argv[2]
    weightfile = sys.argv[3]
    X, y, w = readDataFile(datafile)
    forest = FastBDT.Classifier()
    forest.load(weightfile)
    p = forest.predict(X)
    for i, prob in enumerate(p):
        print(int(y[i] == 1), prob)
    return 0
def train():
    if len(sys.argv) < 4:
        print(
            "Usage: ", sys.argv[0],
            " train datafile weightfile [nCuts=4] [nTrees=100] [nLevels=3] [shrinkage=0.1] [randRatio=0.5]"
        )
        return 1
    datafile = sys.argv[2]
    weightfile = sys.argv[3]
    # Any further command-line arguments are forwarded to the classifier as positional parameters
    forest = FastBDT.Classifier(*sys.argv[4:])
    X, y, w = readDataFile(datafile)
    forest.fit(X, y, w)
    forest.save(weightfile)
    analyse(forest, X, y)
    return 0
def backward_selection(features, data, min_auc=0.9975):
    # Make a copy of features so we do not accidentally modify them
    features = features.copy()
    print('Fitting the model with all the features...')
    bdt = FastBDT.Classifier()
    bdt.fit(data[features], data.isSignal)
    data['mva'] = bdt.predict(data[features])
    init_auc = roc_auc_score(data.isSignal, data.mva)
    print('Initial AUC:', init_auc)
    print('Minimum AUC to continue the search: %.4f' % min_auc)
    current_auc = init_auc
    best = pd.DataFrame()
    best = best.append({'n_features': len(features), 'best_auc': current_auc},
                       ignore_index=True)
    while current_auc >= min_auc:
        print("Trying to remove one feature...")
        result = pd.DataFrame()
        n_workers = 10
        pool = multiprocessing.Pool(n_workers)
        worker_results = []
        for i, _ in enumerate(features):
            reduced = [features[j] for j in range(len(features)) if j != i]
            worker_results.append(pool.apply_async(worker, [reduced, data]))
            print('Worker[%d]: Trying to remove %s' % (i, features[i]))
        print("Waiting for the workers to finish...")
        for i, res in enumerate(worker_results):
            print("Return value from worker %d is %r" % (i, res.get()))
            result = result.append({'feature': features[i], **res.get()},
                                   ignore_index=True)
        pool.close()
        pool.join()
        print(result)
        result = result.sort_values(by='auc', ascending=False)
        current_auc = result.iloc[0].auc
        best = best.append({'n_features': len(features) - 1,
                            'best_auc': current_auc}, ignore_index=True)
        features = result.iloc[1:].feature.values
        print("Highest AUC = %.4f" % current_auc)
        print("%s will be removed" % result.iloc[0].feature)
        print('Features left (%d):' % (len(features)))
        for f in features:
            print(f)
        print("AUC table begins".center(40, '='))
        print(result)
        print("AUC table ends".center(40, '='))
        fig = plot_auc_chart(init_auc, result)
        fig.savefig('../models/multiproc/auc_%d.png' % (len(features)),
                    bbox_inches="tight")
    return best
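# Usage sketch for backward_selection() (hedged: 'df' and its column names are
# placeholders; the real script builds the DataFrame and feature list elsewhere
# and provides plot_auc_chart()). The DataFrame must contain every feature
# column plus the 'isSignal' label, and the output directory for the AUC plots
# must exist before the call.
#
#   feature_list = [c for c in df.columns if c != 'isSignal']
#   best = backward_selection(feature_list, df, min_auc=0.9975)
#   print(best)   # one row per iteration: n_features vs. best_auc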
for i in range(len(mean)):
    for j in range(i + 1, len(mean)):
        cov[j][i] = cov[i][j]

N_train, N_test = 100000, 2000
data = np.random.multivariate_normal(mean, cov, N_train + N_test)
X_train, y_train = data[:N_train, 1:], data[:N_train, 0] > 0
X_test, y_test = data[N_train:, 1:], data[N_train:, 0] > 0

# The first variable is the one we want the network output to be independent of
prior = Prior(X_train[y_train == 1, 0], X_train[y_train == 0, 0])
p_prior = prior.get_prior(X_test[:, 0])
evaluation("Prior", X_test, y_test, p_prior, p_prior)

p = FastBDT.Classifier().fit(X=X_train, y=y_train).predict(X_test)
evaluation("Full", X_test, y_test, p, p_prior)

p = FastBDT.Classifier().fit(X=X_train[:, 1:], y=y_train).predict(X_test[:, 1:])
evaluation("Restricted", X_test, y_test, p, p_prior)

boost_p = FastBDT.Classifier().fit(
    X=np.r_[X_train[:, 1:], X_train[:, 1:]],
    y=np.r_[np.ones(N_train), np.zeros(N_train)],
    weights=prior.get_boost_weights(X_train[:, 0])).predict(X_train[:, 1:])
p = FastBDT.Classifier().fit(X=X_train[:, 1:],
       [0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
       [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
       [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]]

for i in range(len(mean)):
    for j in range(i + 1, len(mean)):
        cov[j][i] = cov[i][j]

N_train, N_test = 100000, 2000
data = np.random.multivariate_normal(mean, cov, N_train + N_test)
X_train, y_train = data[:N_train, 1:], data[:N_train, 0] > 0
X_test, y_test = data[N_train:, 1:], data[N_train:, 0] > 0

# The first variable is the one we want the network output to be independent of
prior = Prior(X_train[y_train == 1, 0], X_train[y_train == 0, 0])
p_prior = prior.get_prior(X_test[:, 0])
evaluation("Prior", X_test, y_test, p_prior, p_prior)

p = FastBDT.Classifier(flatnessLoss=1.0, numberOfFlatnessFeatures=1).fit(
    X=np.c_[X_train[:, 1:], X_train[:, 0]], y=y_train).predict(X_test[:, 1:])
print(p)
evaluation("UBoost", X_test, y_test, p, p_prior)

p = FastBDT.Classifier().fit(X=X_train, y=y_train).predict(X_test)
evaluation("Full", X_test, y_test, p, p_prior)

p = FastBDT.Classifier().fit(X=X_train[:, 1:], y=y_train).predict(X_test[:, 1:])
evaluation("Restricted", X_test, y_test, p, p_prior)
mean = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
cov = [[1.0, 0.8, 0.4, 0.2, 0.1, 0.0],
       [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
       [0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
       [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
       [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]]

for i in range(len(mean)):
    for j in range(i + 1, len(mean)):
        cov[j][i] = cov[i][j]

N_train, N_test = 10000, 10000
data = np.random.multivariate_normal(mean, cov, N_train + N_test)
X_train, y_train = data[:N_train, 1:], data[:N_train, 0] > 0
X_test, y_test = data[N_train:, 1:], data[N_train:, 0] > 0

# Train FastBDT using its Python interface, which follows the scikit-learn classifier API
clf = FastBDT.Classifier(purityTransformation=1)
clf.fit(X=X_train, y=y_train)
p = clf.predict(X_test)
global_auc = sklearn.metrics.roc_auc_score(y_test, p)
print("Global AUC", global_auc)

# Internal feature importance is calculated as the sum of the information gains
# provided by each feature over all decision trees
print("Intern Feature Importance")
print(clf.internFeatureImportance())

# External feature importance is calculated from the drop in the area under the
# receiver operating characteristic curve when the most important feature is
# left out, applied recursively
print("Extern Feature Importance")
print(clf.externFeatureImportance(X_train, y_train, None, X_test, y_test, None))
from PyFastBDT import FastBDT

import pandas
import numpy as np
import sklearn.metrics

if __name__ == '__main__':
    data = np.arange(100000)
    X = (data % 100).reshape((100000, 1))
    y = (data % 2) == 1

    clf = FastBDT.Classifier(nTrees=1, depth=1, shrinkage=0.1, subsample=1.0,
                             purityTransformation=[False]).fit(X=X, y=y)
    p = clf.predict(X)
    print('No Purity Transformation', sklearn.metrics.roc_auc_score(y, p))

    clf = FastBDT.Classifier(nTrees=1, depth=1, shrinkage=0.1, subsample=1.0,
                             purityTransformation=[True]).fit(X=X, y=y)
    p = clf.predict(X)
    print('With Purity Transformation', sklearn.metrics.roc_auc_score(y, p))
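# What the purity transformation buys us, sketched by hand (hedged: this
# illustrates the idea, not FastBDT's internal implementation). The raw feature
# X = n % 100 has no monotonic relation to the label y = n % 2, so a single cut
# on X is useless; replacing each feature value by the signal purity of its bin
# makes the two classes separable with one cut.
import numpy as np
import sklearn.metrics

data = np.arange(100000)
X = (data % 100).reshape((100000, 1))
y = (data % 2) == 1

# signal purity per feature value: fraction of y == True in each bin of X
values = np.unique(X)
purity = np.array([y[X[:, 0] == v].mean() for v in values])
X_purity = purity[np.searchsorted(values, X[:, 0])]

print('Raw feature as score  ', sklearn.metrics.roc_auc_score(y, X[:, 0]))
print('Purity-mapped feature ', sklearn.metrics.roc_auc_score(y, X_purity))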
mean = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
cov = [[1.0, 0.8, 0.4, 0.2, 0.1, 0.0],
       [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
       [0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
       [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
       [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]]

for i in range(len(mean)):
    for j in range(i + 1, len(mean)):
        cov[j][i] = cov[i][j]

N_train, N_test = 10000, 10000
data = np.random.multivariate_normal(mean, cov, N_train + N_test)
X_train, y_train = data[:N_train, 1:], data[:N_train, 0] > 0
X_test, y_test = data[N_train:, 1:], data[N_train:, 0] > 0

# Train FastBDT using its Python interface, which follows the scikit-learn classifier API
clf = FastBDT.Classifier()
clf.fit(X=X_train, y=y_train)
p = clf.predict(X_test)
global_auc = sklearn.metrics.roc_auc_score(y_test, p)
print("Global AUC", global_auc)

# Internal feature importance is calculated as the sum of the information gains
# provided by each feature over all decision trees
print("Intern Feature Importance")
print(clf.internFeatureImportance())

# External feature importance is calculated from the drop in the area under the
# receiver operating characteristic curve when the most important feature is
# left out, applied recursively
print("Extern Feature Importance")
print(clf.externFeatureImportance(X_train, y_train, None, X_test, y_test, None))
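# A hand-rolled, single-round version of the external importance idea above,
# continuing from the example (hedged: externFeatureImportance applies this
# recursively and may differ in detail). For each feature we retrain without it
# and record how much the test AUC drops relative to the full model.
import sklearn.metrics

auc_drop = {}
for k in range(X_train.shape[1]):
    keep = [j for j in range(X_train.shape[1]) if j != k]
    clf_k = FastBDT.Classifier()
    clf_k.fit(X=X_train[:, keep], y=y_train)
    p_k = clf_k.predict(X_test[:, keep])
    auc_drop[k] = global_auc - sklearn.metrics.roc_auc_score(y_test, p_k)
print("AUC drop per left-out feature:", auc_drop)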
if __name__ == '__main__':
    FastBDT.PrintVersion()
    if len(sys.argv) <= 1:
        print("Usage ", sys.argv[0], " [train|apply|output]")
        sys.exit(1)
    if sys.argv[1] == 'train':
        ret = train()
    elif sys.argv[1] == 'apply':
        ret = apply()
    elif sys.argv[1] == 'output':
        ret = output()
    else:
        print("Unknown option", sys.argv[1])
        ret = 1
    sys.exit(ret)
       [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]]

for i in range(len(mean)):
    for j in range(i + 1, len(mean)):
        cov[j][i] = cov[i][j]

N_train, N_test = 100000, 2000
data = np.random.multivariate_normal(mean, cov, N_train + N_test)
X_train, y_train = data[:N_train, 1:], data[:N_train, 0] > 0
X_test, y_test = data[N_train:, 1:], data[N_train:, 0] > 0

# The first variable is the one we want the network output to be independent of
prior = Prior(X_train[y_train == 1, 0], X_train[y_train == 0, 0])
p_prior = prior.get_prior(X_test[:, 0])
evaluation("Prior", X_test, y_test, p_prior, p_prior)

p = FastBDT.Classifier(flatnessLoss=10.0).fit(
    X=np.c_[X_train[:, 1:], X_train[:, 0]], y=y_train,
    nSpectators=1).predict(X_test[:, 1:])
print(p)
evaluation("UBoost", X_test, y_test, p, p_prior)

p = FastBDT.Classifier().fit(X=X_train, y=y_train).predict(X_test)
evaluation("Full", X_test, y_test, p, p_prior)

p = FastBDT.Classifier().fit(X=X_train[:, 1:], y=y_train).predict(X_test[:, 1:])
evaluation("Restricted", X_test, y_test, p, p_prior)
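# A simple flatness diagnostic for the flatnessLoss examples above (hedged:
# this is not the evaluation() helper used in the scripts, just one way to
# check uniformity). If the classifier output is flat with respect to the
# spectator variable, the mean response should be roughly constant across
# spectator bins, in particular for background events.
import numpy as np

def mean_response_per_bin(spectator, response, n_bins=10):
    """Return the mean classifier response in equal-frequency bins of the spectator."""
    edges = np.quantile(spectator, np.linspace(0.0, 1.0, n_bins + 1))
    bins = np.digitize(spectator, edges[1:-1])
    return np.array([response[bins == b].mean() for b in range(n_bins)])

# e.g. check a prediction vector p from the examples above on background events,
# where X_test[:, 0] holds the spectator variable:
print(mean_response_per_bin(X_test[y_test == 0, 0], p[y_test == 0]))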