def _calculate_binning(
    self, theta, x, weights=None, observables=None, lower_cutoff_percentile=0.0, upper_cutoff_percentile=100.0
):
    """
    Derives histogram bin edges for every parameter and observable dimension from weighted quantiles.

    Parameters
    ----------
    theta : ndarray
        Parameter points, one row per sample and one column per parameter.

    x : ndarray
        Observations, one row per sample and one column per observable.

    weights : None or ndarray, optional
        Passed to `weighted_quantile` as `sample_weight`. Default value: None.

    observables : None or list of int, optional
        Indices of the observable columns that get `self.n_bins_x` bins. All other observables get a single bin.
        If None, all observables are used. Default value: None.

    lower_cutoff_percentile : float, optional
        Percentile at which the quantile grid starts. Default value: 0.

    upper_cutoff_percentile : float, optional
        Percentile at which the quantile grid ends. Default value: 100.

    Returns
    -------
    all_n_bins : list of int
        Number of bins per dimension (parameters first, then observables) after zero-width bins were removed.

    all_edges : list of ndarray
        Bin edges per dimension.

    all_ranges : list of tuple
        `(min - 0.01, max + 0.01)` of the data per dimension, ignoring NaNs.

    Raises
    ------
    RuntimeError
        If `self.n_bins_thetas` is a sequence whose length differs from the number of parameters.

    """

    # Rows of all_theta_x are the individual dimensions: first all parameters, then all observables
    all_theta_x = np.hstack([theta, x]).T

    n_parameters = theta.shape[1]
    n_all_observables = x.shape[1]

    # Observables to actually use
    if observables is None:
        observables = list(range(n_all_observables))

    # Number of bins: unused observables are integrated over with a single bin
    all_n_bins_x = [1 for _ in range(n_all_observables)]
    for i in observables:
        all_n_bins_x[i] = self.n_bins_x

    if isinstance(self.n_bins_thetas, (int, np.integer)):
        all_n_bins_theta = [self.n_bins_thetas for _ in range(n_parameters)]
    elif len(self.n_bins_thetas) == n_parameters:
        # Copy into a list so that tuples / arrays concatenate correctly below
        all_n_bins_theta = list(self.n_bins_thetas)
    else:
        raise RuntimeError(
            "Inconsistent bin numbers for parameters: {} vs {} parameters".format(self.n_bins_thetas, n_parameters)
        )

    all_n_bins = all_n_bins_theta + all_n_bins_x

    # Find edges based on percentiles
    all_edges = []
    all_ranges = []

    for i, (data, n_bins) in enumerate(zip(all_theta_x, all_n_bins)):
        edges = weighted_quantile(
            data,
            quantiles=np.linspace(lower_cutoff_percentile / 100.0, upper_cutoff_percentile / 100.0, n_bins + 1),
            sample_weight=weights,
            old_style=True,
        )

        # The outermost edges are always widened to enclose the full data range, whatever the cutoff percentiles
        range_ = (np.nanmin(data) - 0.01, np.nanmax(data) + 0.01)
        edges[0], edges[-1] = range_

        # Remove zero-width bins: an edge is dropped if the next edge is (almost) identical. The appended 1.0
        # makes sure the last edge is always kept.
        widths = np.array(list(edges[1:] - edges[:-1]) + [1.0])
        edges = edges[widths > 1.0e-9]

        all_n_bins[i] = len(edges) - 1
        all_edges.append(edges)
        all_ranges.append(range_)

    return all_n_bins, all_edges, all_ranges
def _adaptive_binning(x, n_bins, weights=None, lower_cutoff_percentile=0.1, upper_cutoff_percentile=99.9):
    """
    Computes quantile-based bin edges between two cutoff percentiles.

    Parameters
    ----------
    x : ndarray
        Data that is handed to `weighted_quantile`.

    n_bins : int
        Requested number of bins; `n_bins + 1` equally spaced quantiles are evaluated.

    weights : None or ndarray, optional
        Passed to `weighted_quantile` as `sample_weight`. Default value: None.

    lower_cutoff_percentile : float, optional
        Percentile of the first edge. Default value: 0.1.

    upper_cutoff_percentile : float, optional
        Percentile of the last edge. Default value: 99.9.

    Returns
    -------
    edges : ndarray
        Bin edges with (almost) zero-width bins removed, so fewer than `n_bins + 1` edges may be returned. The
        outermost edges are the cutoff quantiles themselves; no safety margin is added.

    """

    lower_quantile = lower_cutoff_percentile / 100.0
    upper_quantile = upper_cutoff_percentile / 100.0
    quantile_grid = np.linspace(lower_quantile, upper_quantile, n_bins + 1)

    edges = weighted_quantile(x, quantiles=quantile_grid, sample_weight=weights, old_style=True)

    # Distance from each edge to its successor; the trailing 1.0 guarantees that the last edge survives the cut
    gaps = np.append(np.diff(edges), 1.0)

    return edges[gaps > 1.0e-9]
def plot_distributions(
    filename,
    observables=None,
    parameter_points=None,
    uncertainties="nuisance",
    nuisance_parameters=None,
    draw_nuisance_toys=None,
    normalize=False,
    log=False,
    observable_labels=None,
    n_bins=50,
    line_labels=None,
    colors=None,
    linestyles=None,
    linewidths=1.5,
    toy_linewidths=0.5,
    alpha=0.15,
    toy_alpha=0.75,
    n_events=None,
    n_toys=100,
    n_cols=3,
):
    """
    Plots one-dimensional histograms of observables in a MadMiner file for a given set of benchmarks.

    Events for which any benchmark weight is negative are discarded, and the remaining events are shuffled before
    they are (optionally) truncated to `n_events`.

    Parameters
    ----------
    filename : str
        Filename of a MadMiner HDF5 file.

    observables : list of str or None, optional
        Which observables to plot, given by a list of their names. Unknown names are ignored with a warning. If
        None, all observables in the file are plotted. Default value: None.

    parameter_points : list of (str or ndarray) or None, optional
        Which parameter points to use for histogramming the data. Given by a list, each element can either be the
        name of a benchmark in the MadMiner file, or an ndarray specifying any parameter point in a morphing setup.
        If None, all physics (non-nuisance) benchmarks defined in the MadMiner file are plotted. Default value:
        None.

    uncertainties : {"nuisance", "none"}, optional
        Defines how uncertainty bands are drawn. With "nuisance", the bands span the 16th to 84th percentile of the
        histograms obtained from `n_toys` nuisance parameter toys. With any other value, no error bands are drawn.
        Default value: "nuisance".

    nuisance_parameters : None or list of int, optional
        If uncertainties is "nuisance", this can restrict which nuisance parameters are used to draw the
        uncertainty bands. Each entry of this list is the index of one nuisance parameter (same order as in the
        MadMiner file). Default value: None.

    draw_nuisance_toys : None or int, optional
        If not None and uncertainties is "nuisance", sets the number of nuisance toy distributions that are drawn
        (in addition to the error bands). At most `n_toys` toys can be drawn; larger values are clipped. Default
        value: None.

    normalize : bool, optional
        Whether the distribution is normalized to the total cross section. Default value: False.

    log : bool, optional
        Whether to draw the y axes on a logarithmic scale. Default value: False.

    observable_labels : None or list of (str or None), optional
        x-axis labels naming the observables. If None, the observable names from the MadMiner file are used.
        Default value: None.

    n_bins : int, optional
        Number of histogram bins. Default value: 50.

    line_labels : None or list of (str or None), optional
        Labels for the different parameter points. If None, the entries of parameter_points (by default the
        benchmark names from the MadMiner file) are used. Default value: None.

    colors : None or str or list of str, optional
        Matplotlib line (and error band) colors for the distributions. If None, uses default colors. Default value:
        None.

    linestyles : None or str or list of str, optional
        Matplotlib line styles for the distributions. If None, uses default linestyles. Default value: None.

    linewidths : float or list of float, optional
        Line widths for the distributions. Default value: 1.5.

    toy_linewidths : float or list of float or None, optional
        Line widths for the toy replicas, if uncertainties is "nuisance" and draw_nuisance_toys is not None. If
        None, linewidths is used. Default value: 0.5.

    alpha : float, optional
        alpha value for the uncertainty bands. Default value: 0.15.

    toy_alpha : float, optional
        alpha value for the toy replicas, if uncertainties is "nuisance" and draw_nuisance_toys is not None.
        Default value: 0.75.

    n_events : None or int, optional
        If not None, sets the number of events from the MadMiner file that will be analyzed and plotted. Default
        value: None.

    n_toys : int, optional
        Number of toy nuisance parameter vectors used to estimate the systematic uncertainties. Default value: 100.

    n_cols : int, optional
        Number of columns of subfigures in the plot. Default value: 3.

    Returns
    -------
    figure : Figure
        Plot as Matplotlib Figure instance.

    Raises
    ------
    RuntimeError
        If uncertainties is "nuisance" but the MadMiner file does not define any nuisance parameters.

    """

    # Load data
    sa = SampleAugmenter(filename, include_nuisance_parameters=True)
    if uncertainties == "nuisance":
        nuisance_morpher = NuisanceMorpher(
            sa.nuisance_parameters, list(sa.benchmarks.keys()), reference_benchmark=sa.reference_benchmark
        )

    # Default settings
    if parameter_points is None:
        parameter_points = [
            key for key, is_nuisance in zip(sa.benchmarks, sa.benchmark_is_nuisance) if not is_nuisance
        ]

    if line_labels is None:
        line_labels = parameter_points

    n_parameter_points = len(parameter_points)

    # Default style cycles are repeated often enough to cover every parameter point
    if colors is None:
        colors = ["C" + str(i) for i in range(10)] * (n_parameter_points // 10 + 1)
    elif not isinstance(colors, list):
        colors = [colors for _ in range(n_parameter_points)]

    if linestyles is None:
        linestyles = ["solid", "dashed", "dotted", "dashdot"] * (n_parameter_points // 4 + 1)
    elif not isinstance(linestyles, list):
        linestyles = [linestyles for _ in range(n_parameter_points)]

    if not isinstance(linewidths, list):
        linewidths = [linewidths for _ in range(n_parameter_points)]

    if toy_linewidths is None:
        toy_linewidths = linewidths
    if not isinstance(toy_linewidths, list):
        toy_linewidths = [toy_linewidths for _ in range(n_parameter_points)]

    # Observables
    all_observables = list(sa.observables.keys())
    if observables is None:
        observable_indices = list(range(len(sa.observables)))
    else:
        observable_indices = []
        for obs in observables:
            try:
                observable_indices.append(all_observables.index(str(obs)))
            except ValueError:
                logger.warning("Ignoring unknown observable %s", obs)

    logger.debug("Observable indices: %s", observable_indices)

    n_observables = len(observable_indices)

    if observable_labels is None:
        observable_labels = [all_observables[obs] for obs in observable_indices]

    # Get event data (observations and weights)
    x, weights_benchmarks = sa.extract_raw_data()
    logger.debug("Loaded raw data with shapes %s, %s", x.shape, weights_benchmarks.shape)

    # Remove negative weights
    sane_event_filter = np.all(weights_benchmarks >= 0.0, axis=1)

    n_events_before = weights_benchmarks.shape[0]
    x = x[sane_event_filter]
    weights_benchmarks = weights_benchmarks[sane_event_filter]
    n_events_removed = n_events_before - weights_benchmarks.shape[0]

    if n_events_removed > 0:
        logger.warning("Removed %s / %s events with negative weights", n_events_removed, n_events_before)

    # Shuffle events
    x, weights_benchmarks = shuffle(x, weights_benchmarks)

    # Only analyze n_events
    if n_events is not None and n_events < x.shape[0]:
        logger.debug("Only analyzing first %s / %s events", n_events, x.shape[0])

        x = x[:n_events]
        weights_benchmarks = weights_benchmarks[:n_events]

    if uncertainties != "nuisance":
        n_toys = 0

    # Only toys that were actually generated can be drawn, so clip the request to [0, n_toys]
    n_nuisance_toys_drawn = 0
    if draw_nuisance_toys is not None:
        n_nuisance_toys_drawn = max(0, min(draw_nuisance_toys, n_toys))

    # Matrices that translate benchmark weights into weights at each parameter point
    theta_matrices = []
    for theta in parameter_points:
        if isinstance(theta, six.string_types):
            matrix = get_theta_benchmark_matrix("benchmark", theta, sa.benchmarks)
        else:
            matrix = get_theta_benchmark_matrix("morphing", theta, sa.benchmarks, sa.morpher)
        theta_matrices.append(matrix)

    logger.debug("Calculated %s theta matrices", len(theta_matrices))

    # Nuisance parameters
    nuisance_toy_factors = []

    if uncertainties == "nuisance":
        n_nuisance_params = sa.n_nuisance_parameters

        if not n_nuisance_params > 0:
            raise RuntimeError("Cannot draw systematic uncertainties -- no nuisance parameters found!")

        logger.debug("Drawing nuisance toys")

        nuisance_toys = np.random.normal(loc=0.0, scale=1.0, size=n_nuisance_params * n_toys)
        nuisance_toys = nuisance_toys.reshape(n_toys, n_nuisance_params)

        # Restrict nuisance parameters: all others are fixed to their nominal value 0
        if nuisance_parameters is not None:
            for i in range(n_nuisance_params):
                if i not in nuisance_parameters:
                    nuisance_toys[:, i] = 0.0

        logger.debug("Drew %s toy values for nuisance parameters", n_toys * n_nuisance_params)

        nuisance_toy_factors = np.array(
            [
                nuisance_morpher.calculate_nuisance_factors(nuisance_toy, weights_benchmarks)
                for nuisance_toy in nuisance_toys
            ]
        )  # Shape (n_toys, n_events)

        nuisance_toy_factors = sanitize_array(
            nuisance_toy_factors, min_value=1.0e-2, max_value=100.0
        )  # Shape (n_toys, n_events)

    # Preparing plot
    n_rows = (n_observables + n_cols - 1) // n_cols
    n_events_for_range = 10000 if n_events is None else min(10000, n_events)

    fig = plt.figure(figsize=(4.0 * n_cols, 4.0 * n_rows))

    for i_panel, (i_obs, xlabel) in enumerate(zip(observable_indices, observable_labels)):
        logger.debug("Plotting panel %s: observable %s, label %s", i_panel, i_obs, xlabel)

        # Figure out x range: 5% to 95% weighted quantiles of a subsample, widened by 10% of their distance on
        # each side, but never beyond the actual data range
        x_obs = x[:, i_obs]
        x_obs_small = x[:n_events_for_range][:, i_obs]
        data_min = np.min(x_obs)
        data_max = np.max(x_obs)

        xmins, xmaxs = [], []
        for theta_matrix in theta_matrices:
            weights_small = mdot(theta_matrix, weights_benchmarks[:n_events_for_range])

            xmin = weighted_quantile(x_obs_small, 0.05, weights_small)
            xmax = weighted_quantile(x_obs_small, 0.95, weights_small)
            xwidth = xmax - xmin
            xmin -= xwidth * 0.1
            xmax += xwidth * 0.1

            xmins.append(max(xmin, data_min))
            xmaxs.append(min(xmax, data_max))

        xmin = min(xmins)
        xmax = max(xmaxs)
        x_range = (xmin, xmax)

        logger.debug("Ranges for observable %s: min = %s, max = %s", xlabel, xmins, xmaxs)

        # Subfigure
        ax = plt.subplot(n_rows, n_cols, i_panel + 1)

        # Calculate histograms
        bin_edges = None
        histos = []
        histos_up = []
        histos_down = []
        histos_toys = []

        for theta_matrix in theta_matrices:
            theta_weights = mdot(theta_matrix, weights_benchmarks)  # Shape (n_events,)

            histo, bin_edges = np.histogram(
                x_obs, bins=n_bins, range=x_range, weights=theta_weights, density=normalize
            )
            histos.append(histo)

            if uncertainties == "nuisance":
                histos_toys_this_theta = []
                for nuisance_toy_factors_this_toy in nuisance_toy_factors:
                    toy_histo, _ = np.histogram(
                        x_obs,
                        bins=n_bins,
                        range=x_range,
                        weights=theta_weights * nuisance_toy_factors_this_toy,
                        density=normalize,
                    )
                    histos_toys_this_theta.append(toy_histo)

                histos_up.append(np.percentile(histos_toys_this_theta, 84.0, axis=0))
                histos_down.append(np.percentile(histos_toys_this_theta, 16.0, axis=0))
                histos_toys.append(histos_toys_this_theta[:n_nuisance_toys_drawn])

        # Every bin is drawn as a flat step: both of its edges are paired with the same (repeated) bin content
        step_edges = np.repeat(bin_edges, 2)[1:-1]

        # Draw error bands
        if uncertainties == "nuisance":
            # The zip over all style lists is kept so that bands and central lines stop at the same entry
            for histo_up, histo_down, _, color, _, _ in zip(
                histos_up, histos_down, linewidths, colors, line_labels, linestyles
            ):
                plt.fill_between(
                    step_edges,
                    np.repeat(histo_down, 2),
                    np.repeat(histo_up, 2),
                    facecolor=color,
                    edgecolor="none",
                    alpha=alpha,
                )

            # Draw some toys
            for histo_toys, lw, color, ls in zip(histos_toys, toy_linewidths, colors, linestyles):
                for toy_histo in histo_toys:
                    plt.plot(step_edges, np.repeat(toy_histo, 2), color=color, alpha=toy_alpha, lw=lw, ls=ls)

        # Draw central lines
        for histo, lw, color, label, ls in zip(histos, linewidths, colors, line_labels, linestyles):
            plt.plot(step_edges, np.repeat(histo, 2), color=color, lw=lw, ls=ls, label=label, alpha=1.0)

        plt.legend()

        plt.xlabel(xlabel)
        if normalize:
            plt.ylabel("Normalized distribution")
        else:
            plt.ylabel(r"$\frac{d\sigma}{dx}$ [pb / bin]")

        plt.xlim(x_range[0], x_range[1])
        if log:
            try:
                ax.set_yscale("log", nonpositive="clip")
            except (TypeError, ValueError):
                # Matplotlib versions before 3.3 only know the axis-specific keyword
                ax.set_yscale("log", nonposy="clip")
        else:
            plt.ylim(0.0, None)

    plt.tight_layout()

    return fig