def get_efficiencies(prediction, spectator, sample_weight=None, bins_number=20,
                     thresholds=None, errors=False, ignored_sideband=0.0):
    """
    Construct efficiency as a function of the spectator for each threshold.

    The efficiency in a bin is the weighted fraction of events of that bin
    with ``prediction > threshold``. Bins that contain no events, or whose
    weights sum to zero, are skipped.

    :param prediction: list of predictions, [n_samples]
    :param spectator: list of spectator's values, [n_samples]
    :param sample_weight: optional list of weights, [n_samples];
        unit weights are used for binning when it is None
    :param bins_number: int, count of bins (capped by the number of events
        left after the sideband cut)
    :param thresholds: list of prediction's thresholds
        (default=prediction's cuts for which efficiency will be
        [0.2, 0.4, 0.5, 0.6, 0.8])
    :param errors: bool, if True errors are returned and bin centers are
        used as x values; otherwise x is the mean spectator value in the bin
    :param ignored_sideband: float, quantile level of the spectator that is
        cut away on each side before binning
    :return:
        if errors=False
        OrderedDict threshold -> (x_values, y_values)

        if errors=True
        OrderedDict threshold -> (x_values, y_values, y_err, x_err)

        All the parts: x_values, y_values, y_err, x_err are numpy.arrays of
        the same length (one entry per non-skipped bin).
    """
    prediction, spectator, sample_weight = check_arrays(
        prediction, spectator, sample_weight)

    # Sideband limits are quantiles of the spectator; sample_weight is not
    # passed to this call.
    spectator_min, spectator_max = weighted_quantile(
        spectator, [ignored_sideband, (1. - ignored_sideband)])
    mask = (spectator >= spectator_min) & (spectator <= spectator_max)
    spectator = spectator[mask]
    prediction = prediction[mask]
    bins_number = min(bins_number, len(prediction))
    if sample_weight is not None:
        sample_weight = numpy.array(sample_weight)[mask]

    if thresholds is None:
        thresholds = [
            weighted_quantile(prediction, quantiles=1 - eff,
                              sample_weight=sample_weight)
            for eff in [0.2, 0.4, 0.5, 0.6, 0.8]
        ]

    binner = Binner(spectator, bins_number=bins_number)
    if sample_weight is None:
        sample_weight = numpy.ones(len(prediction))
    bins_data = binner.split_into_bins(spectator, prediction, sample_weight)

    bin_edges = numpy.array(
        [spectator_min] + list(binner.limits) + [spectator_max])
    half_widths = numpy.diff(bin_edges) / 2.

    result = OrderedDict()
    for threshold in thresholds:
        x_values = []
        y_values = []
        x_err = []
        N_in_bin = []
        for num, (masses, probabilities, weights) in enumerate(bins_data):
            total_weight = numpy.sum(weights)
            # numpy.average cannot normalise by a zero weight sum, so such
            # bins (and empty ones) are left out of the result.
            if len(weights) == 0 or total_weight == 0.0:
                continue
            y_values.append(
                numpy.average(probabilities > threshold, weights=weights))
            N_in_bin.append(total_weight)
            if errors:
                x_values.append((bin_edges[num + 1] + bin_edges[num]) / 2.)
                # Keep the half-width of this very bin so that x_err stays
                # aligned with x_values when other bins are skipped.
                x_err.append(half_widths[num])
            else:
                x_values.append(numpy.mean(masses))

        x_values, y_values, N_in_bin = check_arrays(
            x_values, y_values, N_in_bin)
        if errors:
            y_err = numpy.sqrt(y_values * (1 - y_values) / N_in_bin)
            result[threshold] = (x_values, y_values, y_err,
                                 numpy.array(x_err))
        else:
            result[threshold] = (x_values, y_values)
    return result
def plot_flatness_particle(labels, predictions_dict, spectator, spectator_name,
                           particle_name, weights=None, bins_number=30,
                           ignored_sideband=0.1, thresholds=None,
                           cuts_values=False):
    """
    Plot, for each classifier output, its efficiency on one particle type
    as a function of the spectator (one subplot per classifier output).

    :param labels: [n_samples], targets
    :param predictions_dict: predictions, indexed as
        ``predictions_dict[label][mask]``
    :param spectator: [n_samples], values of spectator variable
    :param spectator_name: str, name shown on the x axis
    :param particle_name: str, key of ``names_labels_correspondence``;
        events of this particle type are the ones plotted
    :param weights: [n_samples], optional
    :param bins_number: int, forwarded to ``get_efficiencies``
    :param ignored_sideband: forwarded to ``get_efficiencies``
    :param thresholds: efficiencies in percent; iterated whenever
        ``cuts_values`` is falsy
    :param cuts_values: if truthy, thresholds are taken from ``cut_values``
        and no legend is drawn
    """
    plt.figure(figsize=(18, 22))
    subplots = enumerate(names_labels_correspondence.items())
    for position, (name, label) in subplots:
        plt.subplot(3, 2, position + 1)

        # Events of the requested particle type: these are what is plotted.
        mask = labels == names_labels_correspondence[particle_name]
        probs = predictions_dict[label][mask]
        # Events of the subplot's own type: these define the cut values.
        mask_signal = labels == label
        probs_signal = predictions_dict[label][mask_signal]

        if not cuts_values:
            thresholds_values = []
            for efficiency in thresholds:
                cut = weighted_quantile(
                    probs_signal,
                    quantiles=1 - efficiency / 100.,
                    sample_weight=(None if weights is None
                                   else weights[mask_signal]))
                thresholds_values.append(cut)
        else:
            # NOTE(review): `cut_values` is neither a parameter nor a local
            # here (the flag is spelled `cuts_values`); presumably a
            # module-level name - confirm.
            thresholds_values = cut_values

        eff = get_efficiencies(
            probs,
            spectator[mask],
            sample_weight=None if weights is None else weights[mask],
            bins_number=bins_number,
            errors=True,
            ignored_sideband=ignored_sideband,
            thresholds=thresholds_values)

        # Express efficiencies and their errors in percent.
        for thr in thresholds_values:
            centers, values, value_errors, half_widths = eff[thr]
            eff[thr] = (centers,
                        100 * numpy.array(values),
                        100 * numpy.array(value_errors),
                        half_widths)

        plot_fig = ErrorPlot(eff)
        plot_fig.xlabel = '{} {}'.format(particle_name, spectator_name)
        plot_fig.ylabel = 'Efficiency'
        plot_fig.title = 'MVA {}'.format(name)
        plot_fig.ylim = (0, 100)
        plot_fig.plot(fontsize=22)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)

        if cuts_values:
            continue
        legend_labels = ['Signal Eff {}%'.format(thr) for thr in thresholds]
        plt.legend(legend_labels, loc='best', fontsize=18, framealpha=0.5)
def plot_flatness_by_particle(labels, predictions_dict, spectator,
                              spectator_name,
                              predictions_dict_comparison=None,
                              names_algorithms=['MVA', 'Baseline'],
                              weights=None, bins_number=30,
                              ignored_sideband=0.1, thresholds=None,
                              cuts_values=False, ncol=1):
    """
    Plot, for every particle type, the efficiency of its own classifier
    output as a function of the spectator (one subplot per particle type),
    optionally overlaying a second algorithm for comparison.

    :param labels: [n_samples], targets
    :param predictions_dict: predictions of the main algorithm, indexed as
        ``predictions_dict[label][mask]``
    :param spectator: [n_samples], values of spectator variable
    :param spectator_name: str, name shown on the x axis
    :param predictions_dict_comparison: optional predictions of a second
        algorithm, same layout as ``predictions_dict``
    :param names_algorithms: names of the compared algorithms, used in the
        legend (the default list is never mutated)
    :param weights: [n_samples], optional
    :param bins_number: int, forwarded to ``get_efficiencies``
    :param ignored_sideband: forwarded to ``get_efficiencies``
    :param thresholds: efficiencies in percent; always iterated to build
        the legend labels
    :param cuts_values: if truthy, thresholds are taken from ``cut_values``
        instead of being computed from ``thresholds``
    :param ncol: number of columns in the legend
    """
    plt.figure(figsize=(22, 20))
    for n, (name, label) in enumerate(names_labels_correspondence.items()):
        plt.subplot(3, 2, n + 1)
        mask = labels == label
        legends = []
        for preds, name_algo in zip(
                [predictions_dict, predictions_dict_comparison],
                names_algorithms):
            if preds is None:
                continue
            probs = preds[label][mask]
            mask_weights = None if weights is None else weights[mask]
            if cuts_values:
                # NOTE(review): `cut_values` is neither a parameter nor a
                # local here (the flag is spelled `cuts_values`); presumably
                # a module-level name - confirm.
                thresholds_values = cut_values
            else:
                thresholds_values = [
                    weighted_quantile(probs,
                                      quantiles=1 - efficiency / 100.,
                                      sample_weight=mask_weights)
                    for efficiency in thresholds
                ]
            eff = get_efficiencies(probs, spectator[mask],
                                   sample_weight=mask_weights,
                                   bins_number=bins_number, errors=True,
                                   ignored_sideband=ignored_sideband,
                                   thresholds=thresholds_values)
            # Express efficiencies and their errors in percent.
            for thr in thresholds_values:
                x_values, y_values, y_err, x_err = eff[thr]
                eff[thr] = (x_values, 100 * numpy.array(y_values),
                            100 * numpy.array(y_err), x_err)
            plot_fig = ErrorPlot(eff)
            plot_fig.xlabel = '{} {}'.format(name, spectator_name)
            plot_fig.ylabel = 'Efficiency'
            plot_fig.title = name
            plot_fig.ylim = (0, 100)
            plot_fig.plot(fontsize=22)
            plt.xticks(fontsize=12)
            plt.yticks(fontsize=12)
            # Algorithm name first, efficiency second: "MVA Eff 90%".
            legends.append(['{} Eff {}%'.format(name_algo, thr)
                            for thr in thresholds])
        plt.legend(numpy.concatenate(legends), loc='best', fontsize=12,
                   framealpha=0.5, ncol=ncol)
def plot_flatness_by_particle(labels, predictions_dict, spectator,
                              spectator_name,
                              predictions_dict_comparison=None,
                              names_algorithms=['MVA', 'Baseline'],
                              weights=None, bins_number=30,
                              ignored_sideband=0.1, thresholds=None,
                              cuts_values=False, ncol=1):
    """
    Plot, for every particle type, the efficiency of its own classifier
    output as a function of the spectator (one subplot per particle type),
    optionally overlaying a second algorithm for comparison.

    :param labels: [n_samples], targets
    :param predictions_dict: predictions of the main algorithm, indexed as
        ``predictions_dict[label][mask]``
    :param spectator: [n_samples], values of spectator variable
    :param spectator_name: str, name shown on the x axis
    :param predictions_dict_comparison: optional predictions of a second
        algorithm, same layout as ``predictions_dict``
    :param names_algorithms: names of the compared algorithms, used in the
        legend (the default list is never mutated)
    :param weights: [n_samples], optional
    :param bins_number: int, forwarded to ``get_efficiencies``
    :param ignored_sideband: forwarded to ``get_efficiencies``
    :param thresholds: efficiencies in percent; always iterated to build
        the legend labels
    :param cuts_values: if truthy, thresholds are taken from ``cut_values``
        instead of being computed from ``thresholds``
    :param ncol: number of columns in the legend
    """
    plt.figure(figsize=(22, 20))
    algorithms = [predictions_dict, predictions_dict_comparison]
    for n, (name, label) in enumerate(names_labels_correspondence.items()):
        plt.subplot(3, 2, n + 1)
        mask = labels == label
        legends = []
        for preds, name_algo in zip(algorithms, names_algorithms):
            if preds is None:
                continue
            probs = preds[label][mask]
            mask_weights = None if weights is None else weights[mask]
            if cuts_values:
                # NOTE(review): `cut_values` is neither a parameter nor a
                # local here (the flag is spelled `cuts_values`); presumably
                # a module-level name - confirm.
                thresholds_values = cut_values
            else:
                thresholds_values = [
                    weighted_quantile(probs,
                                      quantiles=1 - efficiency / 100.,
                                      sample_weight=mask_weights)
                    for efficiency in thresholds
                ]
            eff = get_efficiencies(
                probs,
                spectator[mask],
                sample_weight=mask_weights,
                bins_number=bins_number,
                errors=True,
                ignored_sideband=ignored_sideband,
                thresholds=thresholds_values)
            # Express efficiencies and their errors in percent.
            for thr in thresholds_values:
                x_values, y_values, y_err, x_err = eff[thr]
                eff[thr] = (x_values, 100 * numpy.array(y_values),
                            100 * numpy.array(y_err), x_err)
            plot_fig = ErrorPlot(eff)
            plot_fig.xlabel = '{} {}'.format(name, spectator_name)
            plot_fig.ylabel = 'Efficiency'
            plot_fig.title = name
            plot_fig.ylim = (0, 100)
            plot_fig.plot(fontsize=22)
            plt.xticks(fontsize=12)
            plt.yticks(fontsize=12)
            # Algorithm name first, efficiency second: "MVA Eff 90%".
            legends.append(
                ['{} Eff {}%'.format(name_algo, thr) for thr in thresholds])
        plt.legend(numpy.concatenate(legends), loc='best', fontsize=12,
                   framealpha=0.5, ncol=ncol)
def test_weighted_quantile(size=10000):
    """
    Check that the weight lying below the weighted quantile matches the
    requested quantile level times the total weight.

    :param size: int, number of random samples drawn
    """
    values = numpy.random.normal(size=size)
    sample_weights = numpy.random.random(size=size)
    level = numpy.random.random()

    cut = utils.weighted_quantile(values, level,
                                  sample_weight=sample_weights)

    below_cut = values < cut
    weight_below = numpy.sum(below_cut * sample_weights)
    weight_expected = level * numpy.sum(sample_weights)
    deviation = numpy.abs(weight_below - weight_expected)
    # Every weight is drawn from [0, 1), hence the tolerance of about one
    # event's worth of weight.
    assert deviation < 1.1, 'wrong cut'
def plot_flatness_particle(labels, predictions_dict, spectator, spectator_name,
                           particle_name, weights=None, bins_number=30,
                           ignored_sideband=0.1, thresholds=None,
                           cuts_values=False):
    """
    Draw one subplot per classifier output showing its efficiency on the
    events of ``particle_name`` versus the spectator variable.

    :param labels: [n_samples], targets
    :param predictions_dict: predictions, indexed as
        ``predictions_dict[label][mask]``
    :param spectator: [n_samples], values of spectator variable
    :param spectator_name: str, name shown on the x axis
    :param particle_name: str, key of ``names_labels_correspondence``
        selecting the events that are plotted
    :param weights: [n_samples], optional
    :param bins_number: int, forwarded to ``get_efficiencies``
    :param ignored_sideband: forwarded to ``get_efficiencies``
    :param thresholds: efficiencies in percent; iterated whenever
        ``cuts_values`` is falsy
    :param cuts_values: if truthy, thresholds are taken from ``cut_values``
        and the legend is omitted
    """
    plt.figure(figsize=(18, 22))
    for n, item in enumerate(names_labels_correspondence.items()):
        name, label = item
        plt.subplot(3, 2, n + 1)

        # Plotted sample: events whose true type is `particle_name`.
        mask = labels == names_labels_correspondence[particle_name]
        probs = predictions_dict[label][mask]
        # Reference sample: events whose true type is this subplot's label.
        mask_signal = labels == label
        probs_signal = predictions_dict[label][mask_signal]

        if cuts_values:
            # NOTE(review): `cut_values` is neither a parameter nor a local
            # here (the flag is spelled `cuts_values`); presumably a
            # module-level name - confirm.
            thresholds_values = cut_values
        else:
            thresholds_values = []
            for percent in thresholds:
                signal_weights = (None if weights is None
                                  else weights[mask_signal])
                thresholds_values.append(
                    weighted_quantile(probs_signal,
                                      quantiles=1 - percent / 100.,
                                      sample_weight=signal_weights))

        selected_weights = None if weights is None else weights[mask]
        eff = get_efficiencies(probs, spectator[mask],
                               sample_weight=selected_weights,
                               bins_number=bins_number, errors=True,
                               ignored_sideband=ignored_sideband,
                               thresholds=thresholds_values)

        # Rescale efficiencies and their errors from fractions to percent.
        for thr in thresholds_values:
            x_values, y_values, y_err, x_err = eff[thr]
            y_percent = 100 * numpy.array(y_values)
            y_err_percent = 100 * numpy.array(y_err)
            eff[thr] = (x_values, y_percent, y_err_percent, x_err)

        plot_fig = ErrorPlot(eff)
        plot_fig.xlabel = '{} {}'.format(particle_name, spectator_name)
        plot_fig.ylabel = 'Efficiency'
        plot_fig.title = 'MVA {}'.format(name)
        plot_fig.ylim = (0, 100)
        plot_fig.plot(fontsize=22)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)

        if not cuts_values:
            plt.legend(
                ['Signal Eff {}%'.format(thr) for thr in thresholds],
                loc='best', fontsize=18, framealpha=0.5)
def get_profiles(prediction, spectator, sample_weight=None, bins_number=20,
                 errors=False, ignored_sideband=0.0):
    """
    Construct profile of prediction vs. spectator: the weighted mean of the
    prediction in each spectator bin.

    Bins that are empty, or whose weights do not sum to a positive value,
    get ``y = 0`` and ``y_err = 0`` (they are not skipped).

    :param prediction: list of probabilities
    :param spectator: list of spectator's values
    :param sample_weight: optional list of weights; unit weights are used
        when it is None
    :param bins_number: int, count of bins for plot (capped by the number
        of events left after the sideband cut)
    :param errors: bool, whether to return the errors as well
    :param ignored_sideband: float, quantile level of the spectator that is
        cut away on each side before binning

    :return:
        if errors=False
        tuple (x_values, y_values)

        if errors=True
        tuple (x_values, y_values, y_err, x_err)

        All the parts: x_values, y_values, y_err, x_err are numpy.arrays of
        the same length.
    """
    prediction, spectator, sample_weight = check_arrays(
        prediction, spectator, sample_weight)

    # Sideband limits are quantiles of the spectator; sample_weight is not
    # passed to this call.
    spectator_min, spectator_max = weighted_quantile(
        spectator, [ignored_sideband, (1. - ignored_sideband)])
    mask = (spectator >= spectator_min) & (spectator <= spectator_max)
    spectator = spectator[mask]
    prediction = prediction[mask]
    bins_number = min(bins_number, len(prediction))
    if sample_weight is not None:
        sample_weight = numpy.array(sample_weight)[mask]

    binner = Binner(spectator, bins_number=bins_number)
    if sample_weight is None:
        sample_weight = numpy.ones(len(prediction))
    bins_data = binner.split_into_bins(spectator, prediction, sample_weight)

    bin_edges = numpy.array(
        [spectator_min] + list(binner.limits) + [spectator_max])
    x_err = numpy.diff(bin_edges) / 2.

    x_values = []
    y_values = []
    y_err = []
    for num, (_, probabilities, weights) in enumerate(bins_data):
        total_weight = numpy.sum(weights)
        # An empty bin sums to zero, so this single test also covers it.
        if total_weight > 0.0:
            y_values.append(numpy.average(probabilities, weights=weights))
            variance = numpy.cov(probabilities,
                                 aweights=numpy.abs(weights), ddof=0)
            y_err.append(numpy.sqrt(variance / total_weight))
        else:
            y_values.append(0)
            y_err.append(0)
        x_values.append((bin_edges[num + 1] + bin_edges[num]) / 2.)

    # y_err goes through the same conversion as x/y, so all returned parts
    # are arrays as documented.
    x_values, y_values, y_err = check_arrays(x_values, y_values, y_err)
    if errors:
        return (x_values, y_values, y_err, x_err)
    else:
        return (x_values, y_values)
def plot_flatness_by_particle(labels, predictions, spectator, spectator_name,
                              predictions_comparison=None,
                              names_algorithms=['MVA', 'Baseline'],
                              for_particle=None, weights=None, bins_number=30,
                              ignored_sideband=0.1, thresholds=None, n_col=1):
    """
    Build a flatness-plot, which demonstrates the dependency between
    efficiency and some observable.

    :param labels: [n_samples], contains targets
    :param predictions: [n_samples, n_particle_types] with predictions of an
        algorithm
    :param spectator: [n_samples], values of spectator variable
    :param spectator_name: str, name shown on the plot
    :param predictions_comparison: [n_samples, n_particle types], optionally
        for comparison this may be provided
    :param names_algorithms: names for compared algorithms (the default list
        is never mutated)
    :param for_particle: optional key of ``names_labels_correspondence``;
        when given, the cut values are still computed on each subplot's own
        particle type, but the efficiencies are plotted for events of
        ``for_particle``
    :param weights: [n_samples], optional
    :param bins_number: int,
    :param ignored_sideband: fraction of ignored sidebands
    :param thresholds: efficiencies (in percent), for which flatness is
        drawn; must be provided, it is iterated unconditionally
    :param n_col: number of columns in legend.
    """
    plt.figure(figsize=(22, 24))
    if predictions_comparison is not None:
        colors = ['blue', 'green']
        markers = ['o', 's', 'v', 'o', 's', 'v']
    else:
        colors = [None, None]
        markers = ['o'] * len(thresholds)

    for n, (particle_name, label) in enumerate(
            names_labels_correspondence.items()):
        plt.subplot(3, 2, n + 1)
        title = '{} algorithm'.format(particle_name)
        # Running axis limits over all algorithms drawn in this subplot;
        # starting the lower y bound at 20 keeps it from ever exceeding 20.
        xlim_all = (1e10, -1e10)
        ylim_all = (20, -1e8)
        legends = []
        for preds, algo_name, color in zip(
                [predictions, predictions_comparison],
                names_algorithms, colors):
            if preds is None:
                continue
            particle_mask = labels == label
            particle_probs = preds[particle_mask, label]
            particle_weights = (None if weights is None
                                else weights[particle_mask])

            thresholds_values = [
                weighted_quantile(particle_probs,
                                  quantiles=1 - efficiency / 100.,
                                  sample_weight=particle_weights)
                for efficiency in thresholds
            ]

            if for_particle is not None:
                # Cuts were fixed above; now switch the plotted sample to
                # the events of `for_particle`.
                particle_mask = (
                    labels == names_labels_correspondence[for_particle])
                particle_probs = preds[particle_mask, label]
                particle_weights = (None if weights is None
                                    else weights[particle_mask])
                title = '{} algorithm for {}'.format(particle_name,
                                                     for_particle)

            eff = get_efficiencies(particle_probs, spectator[particle_mask],
                                   sample_weight=particle_weights,
                                   bins_number=bins_number, errors=True,
                                   ignored_sideband=ignored_sideband,
                                   thresholds=thresholds_values)
            # Express efficiencies and their errors in percent.
            for thr in thresholds_values:
                x_values, y_values, y_err, x_err = eff[thr]
                eff[thr] = (x_values, 100 * numpy.array(y_values),
                            100 * numpy.array(y_err), x_err)

            xlim, ylim = compute_limits_and_plot_errorbar(eff, markers,
                                                          color=color)
            plt.xlabel('{} {}\n\n'.format(particle_name, spectator_name),
                       fontsize=22)
            plt.ylabel('Efficiency', fontsize=22)
            # The format string needs a placeholder, otherwise the computed
            # title is silently dropped.
            plt.title('{}\n\n'.format(title), fontsize=22)
            plt.xticks(fontsize=12)
            plt.yticks(fontsize=12)
            legends.append(
                ['{} Eff {}%'.format(algo_name, thr) for thr in thresholds])
            plt.grid(True)

            xlim_all = (min(xlim_all[0], xlim[0]), max(xlim_all[1], xlim[1]))
            ylim_all = (min(ylim_all[0], ylim[0]), max(ylim_all[1], ylim[1]))

        plt.legend(numpy.concatenate(legends), loc='best', fontsize=16,
                   framealpha=0.5, ncol=n_col)
        plt.xlim(xlim_all[0], xlim_all[1])
        plt.ylim(ylim_all[0], ylim_all[1])