def check_ks_of_expression(expression):
    '''
    Do KS test on original and reweighted distributions.
    Draws a bar chart to compare results for the different weighting methods.
    Original version taken from the hep-ml package.
    '''
    col_original = original_test.eval(expression, engine='python')
    col_target = target_test.eval(expression, engine='python')
    w_target = np.ones(len(col_target), dtype='float')
    w_foldingTarget = np.ones(len(targetCollisions['p3_energy']), dtype='float')

    noReweight = ks_2samp_weighted(col_original, col_target,
                                   weights1=original_weights_test, weights2=w_target)
    binReweight = ks_2samp_weighted(col_original, col_target,
                                    weights1=bins_weights_test, weights2=w_target)
    gbReweight = ks_2samp_weighted(col_original, col_target,
                                   weights1=gb_weights_test, weights2=w_target)
    # Folding reweight uses the whole dataset, so hardwire the expression in for now
    foldingReweight = ks_2samp_weighted(
        originalCollisions.eval('p3_momentum * p4_momentum * p3_energy * p4_energy * p3_theta * p4_theta'),
        targetCollisions.eval('p3_momentum * p4_momentum * p3_energy * p4_energy * p3_theta * p4_theta'),
        weights1=folding_weights,
        weights2=w_foldingTarget)

    print('No Reweight KS:', noReweight)
    print('Bins Reweight KS:', binReweight)
    print('GB Reweight KS:', gbReweight)
    print('Folding Reweight KS:', foldingReweight)

    # neuralReweight, adaReweight and lbfgsReweight are expected to exist at module
    # level (results of the neural-network based reweighters)
    plt.bar(['No Weights', 'Bin Reweighting', 'GB Weights', 'Folding Weights',
             'NN Weights', 'Ada NN', 'lbfgs NN'],
            [noReweight, binReweight, gbReweight, foldingReweight,
             neuralReweight, adaReweight, lbfgsReweight],
            color=['green', 'blue', 'blue', 'blue', 'red', 'red', 'red'])
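# A minimal sketch (assuming the hep_ml package) of how the weight arrays used in
# check_ks_of_expression above (bins_weights_test, gb_weights_test, folding_weights)
# could be produced. The training-sample names original_train and target_train are
# hypothetical; only original_test, target_test, originalCollisions and
# targetCollisions appear in the snippets in this section.
from hep_ml.metrics_utils import ks_2samp_weighted
from hep_ml.reweight import BinsReweighter, GBReweighter, FoldingReweighter

# Histogram-based reweighting, evaluated on the held-out test sample
bins_reweighter = BinsReweighter(n_bins=20, n_neighs=1.)
bins_reweighter.fit(original_train, target_train)
bins_weights_test = bins_reweighter.predict_weights(original_test)

# Gradient-boosted reweighting, evaluated on the held-out test sample
gb_reweighter = GBReweighter(n_estimators=50, learning_rate=0.1, max_depth=3,
                             min_samples_leaf=1000)
gb_reweighter.fit(original_train, target_train)
gb_weights_test = gb_reweighter.predict_weights(original_test)

# Folding reweighting is trained and evaluated on the whole dataset,
# which is why check_ks_of_expression treats it separately
folding_reweighter = FoldingReweighter(GBReweighter(n_estimators=50), n_folds=2)
folding_reweighter.fit(originalCollisions, targetCollisions)
folding_weights = folding_reweighter.predict_weights(originalCollisions)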
def check_reweighter(n_dimensions, n_samples, reweighter):
    mean_original = numpy.random.normal(size=n_dimensions)
    cov_original = numpy.diag([1.] * n_dimensions)
    mean_target = numpy.random.mtrand.multivariate_normal(mean=mean_original, cov=cov_original)
    cov_target = cov_original * 0.4 + numpy.ones([n_dimensions, n_dimensions]) * 0.2

    original = numpy.random.mtrand.multivariate_normal(mean=mean_original, cov=cov_original,
                                                       size=n_samples + 1)
    original_weight = numpy.ones(n_samples + 1)
    target = numpy.random.mtrand.multivariate_normal(mean=mean_target, cov=cov_target,
                                                     size=n_samples)
    target_weight = numpy.ones(n_samples)

    reweighter.fit(original, target, original_weight=original_weight, target_weight=target_weight)
    new_weights = reweighter.predict_weights(original, original_weight=original_weight)

    av_orig = numpy.average(original, weights=original_weight, axis=0)
    print('WAS', av_orig)
    av_now = numpy.average(original, weights=new_weights, axis=0)
    print('NOW:', av_now)
    av_ideal = numpy.average(target, weights=target_weight, axis=0)
    print('IDEAL:', av_ideal)

    print('COVARIANCE')
    print('WAS', weighted_covar(original, original_weight))
    print('NOW', weighted_covar(original, new_weights))
    print('IDEAL', weighted_covar(target, target_weight))

    assert numpy.all(abs(av_now - av_ideal) < abs(av_orig - av_ideal)), 'deviation is wrong'
    for dim in range(n_dimensions):
        diff1 = ks_2samp_weighted(original[:, dim], target[:, dim], original_weight, target_weight)
        diff2 = ks_2samp_weighted(original[:, dim], target[:, dim], new_weights, target_weight)
        print('KS', diff1, diff2)
        assert diff2 < diff1, 'Differences {} {}'.format(diff1, diff2)
def check_reweighter(n_dimensions, n_samples, reweighter, folding=False):
    mean_original = numpy.random.normal(size=n_dimensions)
    cov_original = numpy.diag([1.] * n_dimensions)
    mean_target = numpy.random.mtrand.multivariate_normal(mean=mean_original, cov=cov_original)
    cov_target = cov_original * 0.4 + numpy.ones([n_dimensions, n_dimensions]) * 0.2

    original = numpy.random.mtrand.multivariate_normal(mean=mean_original, cov=cov_original,
                                                       size=n_samples + 1)
    original_weight = numpy.ones(n_samples + 1)
    target = numpy.random.mtrand.multivariate_normal(mean=mean_target, cov=cov_target,
                                                     size=n_samples)
    target_weight = numpy.ones(n_samples)

    reweighter.fit(original, target, original_weight=original_weight, target_weight=target_weight)

    new_weights_array = []
    new_weights_array.append(reweighter.predict_weights(original, original_weight=original_weight))
    if folding:
        def mean_vote(x):
            return numpy.mean(x, axis=0)

        new_weights_array.append(reweighter.predict_weights(original,
                                                            original_weight=original_weight,
                                                            vote_function=mean_vote))

    for new_weights in new_weights_array:
        av_orig = numpy.average(original, weights=original_weight, axis=0)
        print('WAS', av_orig)
        av_now = numpy.average(original, weights=new_weights, axis=0)
        print('NOW:', av_now)
        av_ideal = numpy.average(target, weights=target_weight, axis=0)
        print('IDEAL:', av_ideal)

        print('COVARIANCE')
        print('WAS', weighted_covariance(original, original_weight))
        print('NOW', weighted_covariance(original, new_weights))
        print('IDEAL', weighted_covariance(target, target_weight))

        assert numpy.all(abs(av_now - av_ideal) < abs(av_orig - av_ideal)), 'averages are wrong'
        for dim in range(n_dimensions):
            diff1 = ks_2samp_weighted(original[:, dim], target[:, dim], original_weight, target_weight)
            diff2 = ks_2samp_weighted(original[:, dim], target[:, dim], new_weights, target_weight)
            print('KS', diff1, diff2)
            assert diff2 < diff1, 'Differences {} {}'.format(diff1, diff2)
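# A hypothetical usage sketch for the folding branch of check_reweighter above,
# assuming the hep_ml reweighters; the parameter values are illustrative only.
from hep_ml.reweight import GBReweighter, FoldingReweighter

folding_gb = FoldingReweighter(GBReweighter(n_estimators=30, max_depth=3,
                                            min_samples_leaf=1000),
                               n_folds=2)
# folding=True additionally checks prediction with a custom vote_function (mean over folds)
check_reweighter(n_dimensions=2, n_samples=100000, reweighter=folding_gb, folding=True)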
def check_ks_of_expression(expression):
    col_original = original_test.eval(expression, engine='python')
    col_target = target_test.eval(expression, engine='python')
    w_target = numpy.ones(len(col_target), dtype='float')
    print('Variable: %s' % expression)
    print('No reweight KS:', ks_2samp_weighted(col_original, col_target,
                                               weights1=original_weights_test,
                                               weights2=w_target))
    print('GB Reweight KS:', ks_2samp_weighted(col_original, col_target,
                                               weights1=gb_weights_test,
                                               weights2=w_target))
def draw_distributions_weighted(original, target, new_original_weights, target_sWeights, filename):
    fig = plt.figure()
    for id, column in enumerate(columns, 1):
        xlim = numpy.percentile(numpy.hstack([target[column]]), [0.01, 99.99])
        ax = plt.subplot(2, 3, id)
        ax.hist(original[column], weights=new_original_weights, range=xlim, **hist_settings)
        ax.hist(target[column], weights=target_sWeights, range=xlim, **hist_settings)
        ax.set_title(column)
        print('KS over %s = %s' % (column,
                                   ks_2samp_weighted(original[column], target[column],
                                                     weights1=new_original_weights,
                                                     weights2=target_sWeights)))
    fig.savefig(filename)
def draw_distributions(myoriginal, mytarget, new_original_weights, targetwts):
    sum_ks = 0
    ctr = 0
    plt.figure(figsize=[15, 7])
    for id, column in enumerate(columns[0:6], 1):  # first six variables fill the 2x3 grid
        ctr = ctr + 1
        xlim = numpy.percentile(numpy.hstack([mytarget[column]]), [0.01, 99.99])
        plt.subplot(2, 3, id)
        plt.hist(myoriginal[column], weights=new_original_weights, range=xlim, **hist_settings)
        plt.hist(mytarget[column], weights=targetwts, range=xlim, **hist_settings)
        plt.title(column)
        myks = ks_2samp_weighted(myoriginal[column], mytarget[column],
                                 weights1=new_original_weights,
                                 weights2=targetwts)
        sum_ks = sum_ks + myks
        # print('KS over ', column, ' = ', myks)
    plt.draw()

    # The same loop can be repeated in further figures for columns[6:12] and
    # columns[12:14], adding their KS distances to sum_ks.

    avg_ks = sum_ks / ctr
    print('average of KS distances = ', avg_ks)
    return avg_ks
def draw_distributions(original, target, new_original_weights, splot_weights):
    hist1_settings = {'bins': 20, 'density': True, 'alpha': 0.7}
    plt.figure(figsize=[15, 7])
    for id, column in enumerate(used_branch, 1):
        xlim = numpy.percentile(numpy.hstack([target[column]]), [0.01, 99.99])
        plt.subplot(2, 3, id)
        plt.hist(original[column], weights=new_original_weights, range=xlim,
                 **hist1_settings, label="MC(weighted)")
        plt.hist(target[column], weights=splot_weights, range=xlim,
                 **hist1_settings, label="Data(splot)")
        handles, labels = plt.gca().get_legend_handles_labels()
        plt.legend(loc='best')
        plt.title(column)
        print('KS over ', column, ' = ',
              ks_2samp_weighted(original[column], target[column],
                                weights1=new_original_weights,
                                weights2=splot_weights))
    plt.savefig('compare_show.pdf')
    plt.show()
def draw_distributions(original, target, new_original_weights, evaluation_method='ks'):
    # Draws histograms of target data and reweighted Monte Carlo data.
    # evaluation_method is 'ks' or 'kl' for Kolmogorov-Smirnov or Kullback-Leibler.
    plt.figure(figsize=[15, 8])  # swap these around
    for id, column in enumerate(columns, 1):
        xlim = np.percentile(np.hstack([target[column]]), [0.01, 99.99])
        plt.subplot(2, 3, id)  # and these around to change how hists are stacked
        # Plot angles in degrees rather than radians
        if column in ('p3_theta', 'p4_theta', 'p5_theta', 'p6_theta'):
            plt.hist(original[column] * (180 / math.pi), weights=new_original_weights,
                     range=xlim * (180 / math.pi), **hist_settings)
            plt.hist(target[column] * (180 / math.pi), range=xlim * (180 / math.pi), **hist_settings)
            plt.title(column)
        else:
            plt.hist(original[column], weights=new_original_weights, range=xlim, **hist_settings)
            plt.hist(target[column], range=xlim, **hist_settings)
            plt.title(column)
        if evaluation_method == 'ks':
            print('KS over ', column, ' = ',
                  ks_2samp_weighted(original[column], target[column],
                                    weights1=new_original_weights,
                                    weights2=np.ones(len(target), dtype=float)))
        elif evaluation_method == 'kl':
            # Histogram both samples on common bin edges and compare the bin
            # contents with scipy's entropy (Kullback-Leibler divergence)
            target_counts, bin_edges = np.histogram(target[column], density=True, bins=20)
            original_counts, _ = np.histogram(original[column], density=True,
                                              weights=new_original_weights, bins=bin_edges)
            print('KL over ', column, ' = ', entropy(original_counts, target_counts))
def test_ks2samp_fast(size=1000):
    y1 = RandomState().uniform(size=size)
    y2 = y1[RandomState().uniform(size=size) > 0.5]
    a = ks_2samp(y1, y2)[0]
    prep_data, prep_weights, prep_F = prepare_distibution(y1, numpy.ones(len(y1)))
    b = _ks_2samp_fast(prep_data, y2, prep_weights, numpy.ones(len(y2)), F1=prep_F)
    c = _ks_2samp_fast(prep_data, y2, prep_weights, numpy.ones(len(y2)), F1=prep_F)
    d = ks_2samp_weighted(y1, y2, numpy.ones(len(y1)) / 3, numpy.ones(len(y2)) / 4)
    assert numpy.allclose(a, b, rtol=1e-2, atol=1e-3)
    assert numpy.allclose(b, c)
    assert numpy.allclose(b, d)
    print('ks2samp is ok')
def test_ks2samp_fast(size=1000):
    y1 = RandomState().uniform(size=size)
    y2 = y1[RandomState().uniform(size=size) > 0.5]
    a = ks_2samp(y1, y2)[0]
    prep_data, prep_weights, prep_F = prepare_distribution(y1, numpy.ones(len(y1)))
    b = _ks_2samp_fast(prep_data, y2, prep_weights, numpy.ones(len(y2)), cdf1=prep_F)
    c = _ks_2samp_fast(prep_data, y2, prep_weights, numpy.ones(len(y2)), cdf1=prep_F)
    d = ks_2samp_weighted(y1, y2, numpy.ones(len(y1)) / 3, numpy.ones(len(y2)) / 4)
    assert numpy.allclose(a, b, rtol=1e-2, atol=1e-3)
    assert numpy.allclose(b, c)
    assert numpy.allclose(b, d)
    print('ks2samp is ok')
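# For reference, a minimal sketch of the quantity the tests above check: the
# two-sample KS distance between weighted empirical CDFs. This is an illustrative
# reimplementation, not the hep_ml one, and it omits the fast-path bookkeeping
# that prepare_distribution / _ks_2samp_fast provide.
def weighted_ecdf(data, weights, points):
    # weighted empirical CDF of `data` evaluated at `points`
    order = numpy.argsort(data)
    data, weights = data[order], weights[order]
    cdf = numpy.cumsum(weights) / numpy.sum(weights)
    # number of data values <= each evaluation point, then look up the CDF step
    idx = numpy.searchsorted(data, points, side='right')
    return numpy.concatenate([[0.], cdf])[idx]


def weighted_ks_sketch(data1, data2, weights1, weights2):
    # maximum absolute difference between the two weighted ECDFs,
    # evaluated over the pooled sample points
    points = numpy.sort(numpy.concatenate([data1, data2]))
    return numpy.max(numpy.abs(weighted_ecdf(data1, weights1, points)
                               - weighted_ecdf(data2, weights2, points)))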
def ks_test(original, target, variables, original_weights):
    ksresults = []
    for column in variables:
        ks = ks_2samp_weighted(original[column], target[column],
                               weights1=original_weights,
                               weights2=numpy.ones(len(target), dtype=int))
        ksresults.append(ks)
    return ksresults
def averageKS(original, target, weights):
    '''
    Averages the KS distance over all variables in a data set after applying weights.

    original (DataFrame): original collisions
    target (DataFrame): target collisions
    weights (array): weights for the original data
    '''
    nColumns = len(original.columns)
    total = 0
    for column in original.columns:
        total += ks_2samp_weighted(original[column], target[column],
                                   weights1=weights,
                                   weights2=np.ones(len(target), dtype=float))
    return total / nColumns
def print_statistics(names, original, target, original_weights=None, target_weights=None):
    # Assume the weights to be equal if they are not provided
    original_weights = numpy.ones(len(original)) if original_weights is None else original_weights
    target_weights = numpy.ones(len(target)) if target_weights is None else target_weights
    for n in names:
        print('KS over %s = %s' % (n, ks_2samp_weighted(original[n], target[n],
                                                        weights1=original_weights,
                                                        weights2=target_weights)))
    print('========')
def test_ks2samp(n_samples1=100, n_samples2=100):
    """
    Check that KS can be computed with a ROC curve
    """
    data1 = numpy.random.normal(size=n_samples1)
    weights1 = numpy.random.random(size=n_samples1)
    data2 = numpy.random.normal(size=n_samples2)
    weights2 = numpy.random.random(size=n_samples2)
    print(weights1.sum(), 'SUM')
    KS = ks_2samp_weighted(data1, data2, weights1=weights1, weights2=weights2)

    # alternative way to check
    labels = [0] * len(data1) + [1] * len(data2)
    data = numpy.concatenate([data1, data2])
    weights = numpy.concatenate([weights1, weights2])
    from sklearn.metrics import roc_curve
    fpr, tpr, _ = roc_curve(labels, data, sample_weight=weights)
    KS2 = numpy.max(numpy.abs(symmetrize(fpr) - symmetrize(tpr)))
    print(KS, KS2)
    print(weights1.sum(), 'SUM')
    assert numpy.allclose(KS, KS2), 'different values of KS'