class DataAnalyzer(object):
    """
    Collects common functionality that is used when analysing data in the MadMiner file.

    Parameters
    ----------
    filename : str
        Path to MadMiner file (for instance the output of `madminer.delphes.DelphesProcessor.save()`).

    disable_morphing : bool, optional
        If True, the morphing setup is not loaded from the file. Default value: False.

    include_nuisance_parameters : bool, optional
        If True, nuisance parameters are taken into account. Default value: True.
    """

    def __init__(self, filename, disable_morphing=False, include_nuisance_parameters=True):
        # Save setup
        self.include_nuisance_parameters = include_nuisance_parameters
        self.madminer_filename = filename

        # Load data
        logger.info("Loading data from %s", filename)
        (
            self.parameters,
            self.benchmarks,
            self.benchmark_is_nuisance,
            self.morphing_components,
            self.morphing_matrix,
            self.observables,
            self.n_samples,
            _,
            self.reference_benchmark,
            self.nuisance_parameters,
            self.n_events_generated_per_benchmark,
            self.n_events_backgrounds,
        ) = load_madminer_settings(filename, include_nuisance_benchmarks=include_nuisance_parameters)

        self.n_parameters = len(self.parameters)
        self.n_benchmarks = len(self.benchmarks)
        self.n_benchmarks_phys = np.sum(np.logical_not(self.benchmark_is_nuisance))
        self.n_observables = 0 if self.observables is None else len(self.observables)

        self.n_nuisance_parameters = 0
        if self.nuisance_parameters is not None and include_nuisance_parameters:
            self.n_nuisance_parameters = len(self.nuisance_parameters)
        else:
            self.nuisance_parameters = None

        # Morphing
        self.morpher = None
        if self.morphing_matrix is not None and self.morphing_components is not None and not disable_morphing:
            self.morpher = PhysicsMorpher(self.parameters)
            self.morpher.set_components(self.morphing_components)
            self.morpher.set_basis(self.benchmarks, morphing_matrix=self.morphing_matrix)

        # Nuisance morphing
        self.nuisance_morpher = None
        if self.nuisance_parameters is not None:
            self.nuisance_morpher = NuisanceMorpher(
                self.nuisance_parameters, list(self.benchmarks.keys()), self.reference_benchmark
            )
        else:
            self.include_nuisance_parameters = False

        # Check event numbers
        self._check_n_events()

        self._report_setup()

    def event_loader(
        self,
        start=0,
        end=None,
        batch_size=100000,
        include_nuisance_parameters=None,
        generated_close_to=None,
        return_sampling_ids=False,
    ):
        """
        Yields batches of events in the MadMiner file.

        Parameters
        ----------
        start : int, optional
            Index of the first event to load. Default value: 0.

        end : int or None, optional
            Index of the last event to load. If None, events are loaded until the end of the file.
            Default value: None.

        batch_size : int, optional
            Batch size. Default value: 100000.

        include_nuisance_parameters : bool, optional
            Whether nuisance parameter benchmarks are included in the returned data. If None, the
            setting from the constructor is used. Default value: None.

        generated_close_to : None or ndarray, optional
            If None, this function yields all events. Otherwise, it yields just the events that were
            generated at the benchmark closest to the given parameter point. Default value: None.

        return_sampling_ids : bool, optional
            If True, the iterator returns the sampling IDs in addition to observables and weights.
            Default value: False.

        Yields
        ------
        observations : ndarray
            Event data.

        weights : ndarray
            Event weights.

        sampling_ids : int
            Sampling IDs (benchmark used for sampling for signal events, -1 for background events).
            Only returned if return_sampling_ids = True was set.
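
        Examples
        --------
        A minimal sketch of batched iteration; the filename is a placeholder, not a fixture
        shipped with the library:

        >>> analyzer = DataAnalyzer("madminer_data.h5")  # doctest: +SKIP
        >>> for observations, weights in analyzer.event_loader(batch_size=10000):  # doctest: +SKIP
        ...     print(observations.shape, weights.shape)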
""" if include_nuisance_parameters is None: include_nuisance_parameters = self.include_nuisance_parameters sampling_benchmark = self._find_closest_benchmark(generated_close_to) logger.debug("Sampling benchmark closest to %s: %s", generated_close_to, sampling_benchmark) if sampling_benchmark is None: sampling_factors = self._calculate_sampling_factors() else: sampling_factors = np.ones(self.n_benchmarks_phys + 1) logger.debug("Sampling factors: %s", sampling_factors) for data in madminer_event_loader( self.madminer_filename, start, end, batch_size, include_nuisance_parameters, benchmark_is_nuisance=self.benchmark_is_nuisance, sampling_benchmark=sampling_benchmark, sampling_factors=sampling_factors, return_sampling_ids=return_sampling_ids, ): yield data def weighted_events( self, theta=None, nu=None, start_event=None, end_event=None, derivative=False, generated_close_to=None, n_draws=None, ): """ Returns all events together with the benchmark weights (if theta is None) or weights for a given theta. Parameters ---------- theta : None or ndarray or str, optional If None, the function returns all benchmark weights. If str, the function returns the weights for a given benchmark name. If ndarray, it uses morphing to calculate the weights for this value of theta. Default value: None. nu : None or ndarray, optional If None, the nuisance parameters are set to their nominal values. Otherwise, and if theta is an ndarray, sets the values of the nuisance parameters. start_event : int Index (in the MadMiner file) of the first event to consider. end_event : int Index (in the MadMiner file) of the last unweighted event to consider. derivative : bool, optional If True and if theta is not None, the derivative of the weights with respect to theta are returned. Default value: False. generated_close_to : None or int, optional Only returns benchmarks generated from this benchmark (and background events). Default value: None. n_draws : None or int, optional If not None, returns only this number of events, drawn randomly. Returns ------- x : ndarray Observables with shape `(n_unweighted_samples, n_observables)`. weights : ndarray If theta is None and derivative is False, benchmark weights with shape `(n_unweighted_samples, n_benchmarks)` in pb. If theta is not None and derivative is True, the gradient of the weight for the given parameter with respect to theta with shape `(n_unweighted_samples, n_gradients)` in pb. Otherwise, weights for the given parameter theta with shape `(n_unweighted_samples,)` in pb. 
""" x, weights_benchmarks = next( self.event_loader(batch_size=None, start=start_event, end=end_event, generated_close_to=generated_close_to)) # Pick events randomly n_events = len(x) if n_draws is not None and n_draws < n_events: idx = np.random.choice(n_events, n_draws, replace=False) x = x[idx] weights_benchmarks = weights_benchmarks[idx] elif n_draws is not None: logger.warning("Requested %s events, but only %s available", n_draws, n_events) # Process and return appropriate weights if theta is None: return x, weights_benchmarks elif isinstance(theta, six.string_types): i_benchmark = list(self.benchmarks.keys()).index(theta) return x, weights_benchmarks[:, i_benchmark] elif derivative: dtheta_matrix = self._get_dtheta_benchmark_matrix(theta) gradients_theta = mdot( dtheta_matrix, weights_benchmarks) # (n_gradients, n_samples) gradients_theta = gradients_theta.T return x, gradients_theta else: # TODO: nuisance params if nu is not None: raise NotImplementedError theta_matrix = self._get_theta_benchmark_matrix(theta) weights_theta = mdot(theta_matrix, weights_benchmarks) return x, weights_theta def xsecs( self, thetas=None, nus=None, events="all", test_split=0.2, include_nuisance_benchmarks=True, batch_size=100000, generated_close_to=None, ): """ Returns the total cross sections for benchmarks or parameter points. Parameters ---------- thetas : None or list of (ndarray or str), optional If None, the function returns all benchmark cross sections. Otherwise, it returns the cross sections for a series of parameter points that are either given by their benchmark name (as a str), their benchmark index (as an int), or their parameter value (as an ndarray, using morphing). Default value: None. nus : None or list of (None or ndarray), optional If None, the nuisance parameters are set to their nominal values (0), i.e. no systematics are taken into account. Otherwise, the list has to have the same number of elements as thetas, and each entry can specify nuisance parameters at nominal value (None) or a value of the nuisance parameters (ndarray). include_nuisance_benchmarks : bool, optional Whether to include nuisance benchmarks if thetas is None. Default value: True. test_split : float, optional Fraction of events reserved for testing. Default value: 0.2. events : {"train", "test", "all"}, optional Which events to use. Default: "all". batch_size : int, optional Size of the batches of events that are loaded into memory at the same time. Default value: 100000. generated_close_to : None or ndarray, optional If not None, only events originally generated from the closest benchmark to this parameter point will be used. Default value : None. Returns ------- xsecs : ndarray Calculated cross sections in pb. xsec_uncertainties : ndarray Cross-section uncertainties in pb. Basically calculated as sum(weights**2)**0.5. """ logger.debug("Calculating cross sections for thetas = %s and nus = %s", thetas, nus) # Inputs if thetas is not None: include_nuisance_benchmarks = True if thetas is not None: if nus is None: nus = [None for _ in thetas] assert len(nus) == len( thetas), "Numbers of thetas and nus don't match!" 
        # Which events to use
        if events == "all":
            start_event, end_event = None, None
            correction_factor = 1.0
        elif events == "train":
            start_event, end_event, correction_factor = self._train_test_split(True, test_split)
        elif events == "test":
            start_event, end_event, correction_factor = self._train_test_split(False, test_split)
        else:
            raise ValueError("Events has to be either 'all', 'train', or 'test', but got {}!".format(events))

        # Theta matrices (translation of benchmarks to theta, at nominal nuisance params)
        if thetas is None:
            theta_matrices = None
        else:
            theta_matrices = np.asarray(
                [self._get_theta_benchmark_matrix(theta) for theta in thetas]
            )  # Shape (n_thetas, n_benchmarks)

        # Loop over events
        xsecs = 0.0
        xsec_uncertainties = 0.0
        n_events = 0

        for i_batch, (_, benchmark_weights) in enumerate(
            self.event_loader(
                start=start_event,
                end=end_event,
                include_nuisance_parameters=include_nuisance_benchmarks,
                batch_size=batch_size,
                generated_close_to=generated_close_to,
            )
        ):
            n_batch, _ = benchmark_weights.shape
            n_events += n_batch

            # Benchmark xsecs
            if thetas is None:
                xsecs += np.sum(benchmark_weights, axis=0)
                xsec_uncertainties += np.sum(benchmark_weights * benchmark_weights, axis=0)

            # xsecs at given parameter points (theta, nu)
            else:
                # Weights at nominal nuisance params (nu=0)
                weights_nom = mdot(theta_matrices, benchmark_weights)  # Shape (n_thetas, n_batch)
                weights_sq_nom = mdot(theta_matrices, benchmark_weights * benchmark_weights)  # Same shape

                # Effect of nuisance parameters
                nuisance_factors = self._calculate_nuisance_factors(nus, benchmark_weights)
                weights = nuisance_factors * weights_nom
                weights_sq = nuisance_factors * weights_sq_nom

                # Sum up
                xsecs += np.sum(weights, axis=1)
                xsec_uncertainties += np.sum(weights_sq, axis=1)

        if n_events == 0:
            raise RuntimeError(
                "Did not find events with test_split = {} and generated_close_to = {}".format(
                    test_split, generated_close_to
                )
            )

        xsec_uncertainties = np.maximum(xsec_uncertainties, 0.0) ** 0.5

        # Correct for not using all events
        xsecs *= correction_factor
        xsec_uncertainties *= correction_factor

        logger.debug("xsecs and uncertainties [pb]:")
        for this_xsec, this_uncertainty in zip(xsecs, xsec_uncertainties):
            logger.debug(
                "  (%4f +/- %4f) pb (%4f %%)", this_xsec, this_uncertainty, 100 * this_uncertainty / this_xsec
            )

        return xsecs, xsec_uncertainties

    def xsec_gradients(
        self,
        thetas,
        nus=None,
        events="all",
        test_split=0.2,
        gradients="all",
        batch_size=100000,
        generated_close_to=None,
    ):
        """
        Returns the gradient of total cross sections with respect to parameters.

        Parameters
        ----------
        thetas : list of (ndarray or str)
            Parameter points at which the gradients are evaluated. Each point is either given by its
            benchmark name (as a str), its benchmark index (as an int), or its parameter value (as an
            ndarray, using morphing).

        nus : None or list of (None or ndarray), optional
            If None, the nuisance parameters are set to their nominal values (0), i.e. no systematics
            are taken into account. Otherwise, the list has to have the same number of elements as
            thetas, and each entry can specify nuisance parameters at nominal value (None) or a value
            of the nuisance parameters (ndarray). Default value: None.

        events : {"train", "test", "all"}, optional
            Which events to use. Default value: "all".

        test_split : float, optional
            Fraction of events reserved for testing. Default value: 0.2.

        gradients : {"all", "theta", "nu"}, optional
            Which gradients to calculate. Default value: "all".
        batch_size : int, optional
            Size of the batches of events that are loaded into memory at the same time. Default
            value: 100000.

        generated_close_to : None or ndarray, optional
            If not None, only events originally generated from the closest benchmark to this
            parameter point will be used. Default value: None.

        Returns
        -------
        xsecs_gradients : ndarray
            Calculated cross-section gradients in pb with shape `(n_thetas, n_gradients)`.
        """
        logger.debug("Calculating cross section gradients for thetas = %s and nus = %s", thetas, nus)

        # Inputs
        include_nuisance_benchmarks = nus is not None or gradients in ["all", "nu"]
        if nus is None:
            nus = [None for _ in thetas]
        assert len(nus) == len(thetas), "Numbers of thetas and nus don't match!"
        if gradients not in ["all", "theta", "nu"]:
            raise RuntimeError("Gradients has to be 'all', 'theta', or 'nu', but got {}".format(gradients))

        # Without nuisance parameters, only the theta gradients are defined
        # (this mirrors the corresponding check in _weight_gradients)
        if gradients == "all" and self.n_nuisance_parameters == 0:
            gradients = "theta"

        # Which events to use
        if events == "all":
            start_event, end_event = None, None
            correction_factor = 1.0
        elif events == "train":
            start_event, end_event, correction_factor = self._train_test_split(True, test_split)
        elif events == "test":
            start_event, end_event, correction_factor = self._train_test_split(False, test_split)
        else:
            raise ValueError("Events has to be either 'all', 'train', or 'test', but got {}!".format(events))

        # Theta matrices (translation of benchmarks to theta, at nominal nuisance params)
        theta_matrices = np.asarray(
            [self._get_theta_benchmark_matrix(theta) for theta in thetas]
        )  # Shape (n_thetas, n_benchmarks)
        theta_gradient_matrices = np.asarray(
            [self._get_dtheta_benchmark_matrix(theta) for theta in thetas]
        )  # Shape (n_thetas, n_gradients, n_benchmarks)

        # Loop over events
        xsec_gradients = 0.0

        for i_batch, (_, benchmark_weights) in enumerate(
            self.event_loader(
                start=start_event,
                end=end_event,
                include_nuisance_parameters=include_nuisance_benchmarks,
                batch_size=batch_size,
                generated_close_to=generated_close_to,
            )
        ):
            n_batch, _ = benchmark_weights.shape
            logger.debug("Batch %s with %s events", i_batch + 1, n_batch)

            if gradients in ["all", "theta"]:
                nom_gradients = mdot(
                    theta_gradient_matrices, benchmark_weights
                )  # Shape (n_thetas, n_phys_gradients, n_batch)
                nuisance_factors = self._calculate_nuisance_factors(nus, benchmark_weights)
                # nuisance_factors is the scalar 1.0 when no nuisance parameters are varied; in that
                # case the indexing below raises a TypeError and the nominal gradients are used
                try:
                    dweight_dtheta = nuisance_factors[:, np.newaxis, :] * nom_gradients
                except TypeError:
                    dweight_dtheta = nom_gradients

            if gradients in ["all", "nu"]:
                weights_nom = mdot(theta_matrices, benchmark_weights)  # Shape (n_thetas, n_batch)
                nuisance_factor_gradients = np.asarray(
                    [
                        self.nuisance_morpher.calculate_nuisance_factor_gradients(nu, benchmark_weights)
                        for nu in nus
                    ]
                )  # Shape (n_thetas, n_nuisance_gradients, n_batch)
                dweight_dnu = nuisance_factor_gradients * weights_nom[:, np.newaxis, :]

            if gradients == "all":
                dweight_dall = np.concatenate((dweight_dtheta, dweight_dnu), 1)
            elif gradients == "theta":
                dweight_dall = dweight_dtheta
            elif gradients == "nu":
                dweight_dall = dweight_dnu
            xsec_gradients += np.sum(dweight_dall, axis=2)

        # Correct for not using all events
        xsec_gradients *= correction_factor

        return xsec_gradients

    def _check_n_events(self):
        if self.n_events_generated_per_benchmark is None:
            return

        n_events_check = sum(self.n_events_generated_per_benchmark)
        if self.n_events_backgrounds is not None:
            n_events_check += self.n_events_backgrounds

        if self.n_samples != n_events_check:
            logger.warning(
                "Inconsistent event numbers in HDF5 file! Please recalculate them by calling "
                "combine_and_shuffle(recalculate_header=True)."
            )
    def _report_setup(self):
        logger.info("Found %s parameters", self.n_parameters)
        for key, values in six.iteritems(self.parameters):
            logger.debug(
                "   %s (LHA: %s %s, maximal power in squared ME: %s, range: %s)",
                key,
                values[0],
                values[1],
                values[2],
                values[3],
            )

        if self.nuisance_parameters is not None:
            logger.info("Found %s nuisance parameters", self.n_nuisance_parameters)
            for key, values in six.iteritems(self.nuisance_parameters):
                logger.debug("   %s (%s)", key, values)
        else:
            logger.info("Did not find nuisance parameters")
            self.include_nuisance_parameters = False

        logger.info("Found %s benchmarks, of which %s physical", self.n_benchmarks, self.n_benchmarks_phys)
        for (key, values), is_nuisance in zip(six.iteritems(self.benchmarks), self.benchmark_is_nuisance):
            if is_nuisance:
                logger.debug("   %s: systematics", key)
            else:
                logger.debug("   %s: %s", key, format_benchmark(values))

        logger.info("Found %s observables", self.n_observables)
        if self.observables is not None:
            for i, obs in enumerate(self.observables):
                logger.debug("  %2.2s %s", i, obs)

        logger.info("Found %s events", self.n_samples)
        if self.n_events_generated_per_benchmark is not None:
            for events, name in zip(self.n_events_generated_per_benchmark, six.iterkeys(self.benchmarks)):
                if events > 0:
                    logger.info("  %s signal events sampled from benchmark %s", events, name)
            if self.n_events_backgrounds is not None and self.n_events_backgrounds > 0:
                logger.info("  %s background events", self.n_events_backgrounds)
        else:
            logger.debug("  Did not find sample summary information")

        if self.morpher is not None:
            logger.info("Found morphing setup with %s components", len(self.morphing_components))
        else:
            logger.info("Did not find morphing setup.")

        if self.nuisance_morpher is not None:
            logger.info("Found nuisance morphing setup")
        else:
            logger.info("Did not find nuisance morphing setup")

    def _calculate_nuisance_factors(self, nus, benchmark_weights):
        if self._any_nontrivial_nus(nus):
            return np.asarray(
                [
                    self.nuisance_morpher.calculate_nuisance_factors(nu, benchmark_weights)
                    for nu in nus
                ]
            )  # Shape (n_thetas, n_batch)
        else:
            return 1.0

    @staticmethod
    def _any_nontrivial_nus(nus):
        if nus is None:
            return False
        for nu in nus:
            if nu is not None:
                return True
        return False

    def _weights(self, thetas, nus, benchmark_weights, theta_matrices=None):
        """
        Turns benchmark weights into weights for given parameter points (theta, nu).

        Parameters
        ----------
        thetas : list of (ndarray or str)
            Parameter points, each given either by a benchmark name (as a str), a benchmark index (as
            an int), or a parameter value (as an ndarray, using morphing).

        nus : None or list of (None or ndarray)
            If None, the nuisance parameters are set to their nominal values (0), i.e. no systematics
            are taken into account. Otherwise, the list has to have the same number of elements as
            thetas, and each entry can specify nuisance parameters at nominal value (None) or a value
            of the nuisance parameters (ndarray).

        benchmark_weights : ndarray
            Benchmark weights with shape `(n_events, n_benchmarks)`.

        theta_matrices : None or ndarray, optional
            Precomputed morphing matrices. If None, they are calculated from thetas. Default value:
            None.

        Returns
        -------
        weights : ndarray
            Calculated weights in pb.
        """
        n_events, _ = benchmark_weights.shape

        # Inputs
        if nus is None:
            nus = [None for _ in thetas]
        assert len(nus) == len(thetas), "Numbers of thetas and nus don't match!"
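
        # The weight for each (theta, nu) is a morphing combination of the stored benchmark weights:
        # w(theta, nu) = f(nu) * (A(theta) . w_benchmarks), where A(theta) is the morphing matrix row
        # for theta and f(nu) the nuisance rescaling factor computed below.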
        # Theta matrices (translation of benchmarks to theta, at nominal nuisance params)
        if theta_matrices is None:
            theta_matrices = [self._get_theta_benchmark_matrix(theta) for theta in thetas]
        theta_matrices = np.asarray(theta_matrices)  # Shape (n_thetas, n_benchmarks)

        # Weights at nominal nuisance params (nu=0)
        weights_nom = mdot(theta_matrices, benchmark_weights)  # Shape (n_thetas, n_batch)

        # Effect of nuisance parameters
        nuisance_factors = self._calculate_nuisance_factors(nus, benchmark_weights)
        weights = nuisance_factors * weights_nom

        return weights

    def _weight_gradients(
        self, thetas, nus, benchmark_weights, gradients="all", theta_matrices=None, theta_gradient_matrices=None
    ):
        """
        Turns benchmark weights into weight gradients for given parameter points (theta, nu).

        Parameters
        ----------
        thetas : list of (ndarray or str)
            Parameter points, each given either by a benchmark name (as a str), a benchmark index (as
            an int), or a parameter value (as an ndarray, using morphing).

        nus : None or list of (None or ndarray)
            If None, the nuisance parameters are set to their nominal values (0), i.e. no systematics
            are taken into account. Otherwise, the list has to have the same number of elements as
            thetas, and each entry can specify nuisance parameters at nominal value (None) or a value
            of the nuisance parameters (ndarray).

        gradients : {"all", "theta", "nu"}, optional
            Which gradients to calculate. Default value: "all".

        Returns
        -------
        gradients : ndarray
            Calculated gradients in pb.
        """
        n_events, _ = benchmark_weights.shape

        # Inputs
        if gradients == "all" and self.n_nuisance_parameters == 0:
            gradients = "theta"
        if nus is None:
            nus = [None for _ in thetas]
        assert len(nus) == len(thetas), "Numbers of thetas and nus don't match!"

        # Theta matrices (translation of benchmarks to theta, at nominal nuisance params)
        if theta_matrices is None:
            theta_matrices = [self._get_theta_benchmark_matrix(theta) for theta in thetas]
        if theta_gradient_matrices is None:
            theta_gradient_matrices = [self._get_dtheta_benchmark_matrix(theta) for theta in thetas]
        theta_matrices = np.asarray(theta_matrices)  # Shape (n_thetas, n_benchmarks)
        theta_gradient_matrices = np.asarray(
            theta_gradient_matrices
        )  # Shape (n_thetas, n_gradients, n_benchmarks)

        # Calculate theta gradient
        if gradients in ["all", "theta"]:
            nom_gradients = mdot(
                theta_gradient_matrices, benchmark_weights
            )  # Shape (n_thetas, n_phys_gradients, n_batch)
            nuisance_factors = self._calculate_nuisance_factors(nus, benchmark_weights)
            try:
                dweight_dtheta = nuisance_factors[:, np.newaxis, :] * nom_gradients
            except TypeError:
                dweight_dtheta = nom_gradients
        else:
            dweight_dtheta = None

        # Calculate nu gradient
        if gradients in ["all", "nu"]:
            weights_nom = mdot(theta_matrices, benchmark_weights)  # Shape (n_thetas, n_batch)
            nuisance_factor_gradients = np.asarray(
                [
                    self.nuisance_morpher.calculate_nuisance_factor_gradients(nu, benchmark_weights)
                    for nu in nus
                ]
            )  # Shape (n_thetas, n_nuisance_gradients, n_batch)
            dweight_dnu = nuisance_factor_gradients * weights_nom[:, np.newaxis, :]
        else:
            dweight_dnu = None

        if gradients == "theta":
            return dweight_dtheta
        elif gradients == "nu":
            return dweight_dnu
        return np.concatenate((dweight_dtheta, dweight_dnu), 1)

    def _train_test_split(self, train, test_split):
        """
        Returns the start and end event for train samples (train = True) or test samples
        (train = False).
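
        For example, with test_split = 0.2 and 1000 total samples, the training split covers
        roughly the first 800 events and the test split the remaining events.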

        Parameters
        ----------
        train : bool
            True if training data is generated, False if test data is generated.

        test_split : float
            Fraction of events reserved for testing.

        Returns
        -------
        start_event : int
            Index (in the MadMiner file) of the first event to consider.

        end_event : int
            Index (in the MadMiner file) of the last unweighted event to consider.

        correction_factor : float
            Factor with which the weights and cross sections will have to be multiplied to make up
            for the missing events.
        """
        if train:
            start_event = 0

            if test_split is None or test_split <= 0.0 or test_split >= 1.0:
                end_event = None
                correction_factor = 1.0
            else:
                end_event = int(round((1.0 - test_split) * self.n_samples, 0))
                correction_factor = 1.0 / (1.0 - test_split)
                if end_event < 0 or end_event > self.n_samples:
                    raise ValueError(
                        "Irregular train / test split: sample {} / {}".format(end_event, self.n_samples)
                    )
        else:
            if test_split is None or test_split <= 0.0 or test_split >= 1.0:
                start_event = 0
                correction_factor = 1.0
            else:
                start_event = int(round((1.0 - test_split) * self.n_samples, 0)) + 1
                correction_factor = 1.0 / test_split
                if start_event < 0 or start_event > self.n_samples:
                    raise ValueError(
                        "Irregular train / test split: sample {} / {}".format(start_event, self.n_samples)
                    )
            end_event = None

        return start_event, end_event, correction_factor

    def _get_theta_value(self, theta):
        if isinstance(theta, six.string_types):
            benchmark = self.benchmarks[theta]
            theta_value = np.array([benchmark[key] for key in benchmark])
        elif isinstance(theta, int):
            benchmark = self.benchmarks[list(self.benchmarks.keys())[theta]]
            theta_value = np.array([benchmark[key] for key in benchmark])
        else:
            theta_value = np.asarray(theta)
        return theta_value

    def _get_nu_value(self, nu):
        if nu is None:
            nu_value = np.zeros(self.n_nuisance_parameters)
        else:
            nu_value = np.asarray(nu)
        return nu_value

    def _get_theta_benchmark_matrix(self, theta, zero_pad=True):
        """Calculates vector A such that dsigma(theta) = A * dsigma_benchmarks"""
        if zero_pad:
            unpadded_theta_matrix = self._get_theta_benchmark_matrix(theta, zero_pad=False)
            theta_matrix = np.zeros(self.n_benchmarks)
            theta_matrix[: unpadded_theta_matrix.shape[0]] = unpadded_theta_matrix
        elif isinstance(theta, six.string_types):
            i_benchmark = list(self.benchmarks).index(theta)
            theta_matrix = self._get_theta_benchmark_matrix(i_benchmark)
        elif isinstance(theta, int):
            n_benchmarks = len(self.benchmarks)
            theta_matrix = np.zeros(n_benchmarks)
            theta_matrix[theta] = 1.0
        else:
            theta_matrix = self.morpher.calculate_morphing_weights(theta)
        return theta_matrix

    def _get_dtheta_benchmark_matrix(self, theta, zero_pad=True):
        """Calculates matrix A_ij such that d dsigma(theta) / d theta_i = A_ij * dsigma (benchmark j)"""
        if self.morpher is None:
            raise RuntimeError("Cannot calculate score without morphing")

        if zero_pad:
            unpadded_theta_matrix = self._get_dtheta_benchmark_matrix(theta, zero_pad=False)
            dtheta_matrix = np.zeros((unpadded_theta_matrix.shape[0], self.n_benchmarks))
            dtheta_matrix[:, : unpadded_theta_matrix.shape[1]] = unpadded_theta_matrix
        elif isinstance(theta, six.string_types):
            benchmark = self.benchmarks[theta]
            benchmark = np.array([value for _, value in six.iteritems(benchmark)])
            dtheta_matrix = self._get_dtheta_benchmark_matrix(benchmark)
        elif isinstance(theta, int):
            benchmark = self.benchmarks[list(self.benchmarks.keys())[theta]]
            benchmark = np.array([value for _, value in six.iteritems(benchmark)])
            dtheta_matrix = self._get_dtheta_benchmark_matrix(benchmark)
        else:
            dtheta_matrix = self.morpher.calculate_morphing_weight_gradient(
                theta
            )  # Shape (n_parameters, n_benchmarks_phys)
        return dtheta_matrix

    def _calculate_sampling_factors(self):
        events = np.asarray(self.n_events_generated_per_benchmark, dtype=float)
        logger.debug("Events per benchmark: %s", events)
        factors = events / np.sum(events)
        factors = np.hstack((factors, 1.0))  # Background events
        return factors

    def _find_closest_benchmark(self, theta):
        if theta is None:
            return None

        benchmarks = self._benchmark_array()
        distances = np.asarray([np.linalg.norm(benchmark - theta) for benchmark in benchmarks])
        logger.debug("Distances from %s: %s", theta, distances)

        # Don't use benchmarks where we don't actually have events
        if self.n_events_generated_per_benchmark is not None:
            logger.debug("n_events_generated_per_benchmark: %s", self.n_events_generated_per_benchmark)
            n_events = np.asarray(self.n_events_generated_per_benchmark)
            distances = distances + 1.0e9 * (n_events == 0).astype(float)

        closest_idx = np.argmin(distances)
        return closest_idx

    def _benchmark_array(self):
        benchmarks_array = []
        for benchmark in six.itervalues(self.benchmarks):
            benchmarks_array.append(list(six.itervalues(benchmark)))
        return np.asarray(benchmarks_array)
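
# A hedged usage sketch of DataAnalyzer, kept as a comment so it does not execute on import.
# The filename and benchmark name are placeholders, not fixtures shipped with the library:
#
#     analyzer = DataAnalyzer("madminer_data.h5")
#     x, weights = analyzer.weighted_events(theta="sm")
#     xsecs, xsec_errors = analyzer.xsecs(thetas=["sm"])
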
class MadMiner:
    """
    The central class to manage parameter spaces, benchmarks, and the generation of events through
    MadGraph and Pythia.

    An instance of this class is the starting point of most MadMiner applications. It is typically
    used in four steps:

    * Defining the parameter space through `MadMiner.add_parameter`
    * Defining the benchmarks, i.e. the points at which the squared matrix elements will be evaluated
      in MadGraph, with `MadMiner.add_benchmark()` or, if operator morphing is used, with
      `MadMiner.set_morphing()`
    * Saving this setup with `MadMiner.save()` (it can be loaded in a new instance with
      `MadMiner.load()`)
    * Running MadGraph and Pythia with the appropriate settings with `MadMiner.run()` or
      `MadMiner.run_multiple()` (the latter allows the user to combine runs from multiple run cards
      and sampling points)

    Please see the tutorial for a hands-on introduction to its methods.
    """

    def __init__(self):
        self.parameters = OrderedDict()
        self.benchmarks = OrderedDict()
        self.default_benchmark = None
        self.morpher = None
        self.export_morphing = False
        self.systematics = OrderedDict()
        self.finite_difference_benchmarks = OrderedDict()
        self.finite_difference_epsilon = 0.0

    def _reset_systematics(self):
        self.systematics = OrderedDict()

    def _reset_benchmarks(self):
        self.benchmarks = OrderedDict()
        self.default_benchmark = None

    def _reset_morpher(self):
        self.morpher = None
        self.export_morphing = False

    def add_parameter(
        self,
        lha_block,
        lha_id,
        parameter_name=None,
        param_card_transform=None,
        morphing_max_power=2,
        parameter_range=(0.0, 1.0),
    ):
        """
        Adds an individual parameter.

        Parameters
        ----------
        lha_block : str
            The name of the LHA block as used in the param_card. Case-sensitive.

        lha_id : int
            The LHA id as used in the param_card.

        parameter_name : str or None
            An internal name for the parameter. If None, the default 'parameter_i' is used.

        param_card_transform : None or str
            Represents a one-parameter function mapping the parameter (`"theta"`) to the value that
            should be written in the parameter cards. This str is parsed by Python's `eval()`
            function, and `"theta"` is parsed as the parameter value. Default value: None.

        morphing_max_power : int
            The maximal power with which this parameter contributes to the squared matrix element of
            the process of interest. Typically at tree level, this maximal number is 2 for parameters
            that affect one vertex (e.g. only production or only decay of a particle), and 4 for
            parameters that affect two vertices (e.g. production and decay). Default value: 2.

        parameter_range : tuple of float
            The range of parameter values of primary interest. Only affects the basis optimization.
            Default value: (0., 1.).

        Returns
        -------
        None
        """
        # Default names
        if parameter_name is None:
            parameter_name = f"parameter_{len(self.parameters)}"
        if param_card_transform is None:
            param_card_transform = "_"

        # Check and sanitize input
        assert isinstance(lha_block, str), f"LHA block is not a string: {lha_block}"
        assert isinstance(lha_id, int), f"LHA id is not an integer: {lha_id}"
        assert isinstance(parameter_name, str), f"Parameter name is not a string: {parameter_name}"
        assert isinstance(morphing_max_power, int), f"Morphing max power is not an integer: {morphing_max_power}"

        parameter_name = parameter_name.replace(" ", "_")
        parameter_name = parameter_name.replace("-", "_")
        assert parameter_name not in self.parameters, f"Parameter already exists: {parameter_name}"

        parameter = AnalysisParameter(
            parameter_name,
            lha_block,
            lha_id,
            morphing_max_power,
            parameter_range,
            param_card_transform,
        )

        # Add parameter
        logger.info("Adding parameter: %s", parameter)
        self.parameters[parameter_name] = parameter

        # The morphing information is not accurate anymore
        logger.warning("Resetting benchmarks and morphing")
        self._reset_benchmarks()
        self._reset_morpher()

    def set_parameters(self, parameters: Union[Dict[str, AnalysisParameter], List[tuple]]):
        """
        Manually sets all parameters, overwriting previously added parameters.

        Parameters
        ----------
        parameters : dict or list
            If parameters is a dict, the keys should be str and give the parameter names, and the
            values are AnalysisParameter model instances. If parameters is a list, the items should
            be tuples of the form (LHA_block, LHA_ID).

        Returns
        -------
        None
        """
        self.parameters = OrderedDict()

        if isinstance(parameters, dict):
            for param in parameters.values():
                self.add_parameter(
                    lha_block=param.lha_block,
                    lha_id=param.lha_id,
                    parameter_name=param.name,
                    morphing_max_power=param.max_power,
                    parameter_range=param.val_range,
                )
        elif isinstance(parameters, list):
            for values in parameters:
                self.add_parameter(values[0], values[1])
        else:
            raise RuntimeError(f"Invalid set of parameters: {parameters}")

        # The morphing information is not accurate anymore
        logger.warning("Resetting benchmarks and morphing")
        self._reset_benchmarks()
        self._reset_morpher()

    def add_benchmark(self, parameter_values: Dict[str, float], benchmark_name: str = None, verbose: bool = True):
        """
        Manually adds an individual benchmark, that is, a parameter point that will be evaluated by
        MadGraph.

        Parameters
        ----------
        parameter_values : dict
            The keys of this dict should be the parameter names and the values the corresponding
            parameter values.

        benchmark_name : str or None, optional
            Name of benchmark. If None, a default name is used. Default value: None.

        verbose : bool, optional
            If True, prints output about each benchmark. Default value: True.

        Returns
        -------
        None

        Raises
        ------
        RuntimeError
            If a benchmark with the same name already exists, if parameter_values is not a dict, or
            if a key of parameter_values does not correspond to a defined parameter.
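
        Examples
        --------
        A minimal sketch; the block, id, parameter name, and values are illustrative only:

        >>> miner = MadMiner()  # doctest: +SKIP
        >>> miner.add_parameter(lha_block="dim6", lha_id=2, parameter_name="CWL2")  # doctest: +SKIP
        >>> miner.add_benchmark({"CWL2": 0.0}, "sm")  # doctest: +SKIP
        >>> miner.add_benchmark({"CWL2": 10.0}, "w")  # doctest: +SKIP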
""" # Default names if benchmark_name is None: benchmark_name = f"benchmark_{len(self.benchmarks)}" # Check input if not isinstance(parameter_values, dict): raise RuntimeError( f"Parameter values are not a dict: {parameter_values}") for p_name in parameter_values.keys(): if p_name not in self.parameters.keys(): raise RuntimeError(f"Unknown parameter: {p_name}") if benchmark_name in self.benchmarks.keys(): raise RuntimeError(f"Benchmark {benchmark_name} exists already") # Add benchmark self.benchmarks[benchmark_name] = Benchmark( name=benchmark_name, values=parameter_values, ) # If first benchmark, this will be the default for sampling if len(self.benchmarks) == 1: self.default_benchmark = benchmark_name if verbose: logger.info("Added benchmark %s", self.benchmarks[benchmark_name]) else: logger.debug("Added benchmark %s", self.benchmarks[benchmark_name]) def set_benchmarks(self, benchmarks: Union[Dict[str, dict], List[dict]], verbose: bool = True): """ Manually sets all benchmarks, that is, parameter points that will be evaluated by MadGraph. Calling this function overwrites all previously defined benchmarks. Parameters ---------- benchmarks : dict or list Specifies all benchmarks. If None, all benchmarks are reset. If dict, the keys are the benchmark names and the values the Benchmark instances. If list, the entries are dicts {parameter_name:value} (and the benchmark names are chosen automatically). Default value: None. verbose : bool, optional If True, prints output about each benchmark. Default value: True. Returns ------- None """ self.benchmarks = OrderedDict() self.default_benchmark = None if isinstance(benchmarks, dict): for name, values in benchmarks.items(): self.add_benchmark(values, name, verbose=verbose) elif isinstance(benchmarks, list): for values in benchmarks: self.add_benchmark(values) else: raise RuntimeError(f"Invalid set of benchmarks: {benchmarks}") # After manually adding benchmarks, the morphing information is not accurate anymore if self.morpher is not None: logger.warning("Reset morphing") self.morpher = None self.export_morphing = False def set_morphing( self, max_overall_power=4, n_bases=1, include_existing_benchmarks=True, n_trials=100, n_test_thetas=100, ): """ Sets up the morphing environment. Sets benchmarks, i.e. parameter points that will be evaluated by MadGraph, for a morphing algorithm, and calculates all information required for morphing. Morphing is a technique that allows MadMax to infer the full probability distribution `p(x_i | theta)` for each simulated event `x_i` and any `theta`, not just the benchmarks. The morphing basis is optimized with respect to the expected mean squared morphing weights over the parameter region of interest. If keep_existing_benchmarks=True, benchmarks defined previously will be incorporated in the morphing basis and only the remaining basis points will be optimized. Note that any subsequent call to `set_benchmarks` or `add_benchmark` will overwrite the morphing setup. The correct order is therefore to manually define benchmarks first, using `set_benchmarks` or `add_benchmark`, and then to create the morphing setup and complete the basis by calling `set_benchmarks_from_morphing(keep_existing_benchmarks=True)`. Parameters ---------- max_overall_power : int, optional The maximal sum of powers of all parameters contributing to the squared matrix element. Typically, if parameters can affect the couplings at n vertices, this number is 2n. Default value: 4. n_bases : int, optional The number of morphing bases generated. 
            If n_bases > 1, multiple bases are combined, and the weights for each basis are reduced
            by a factor 1 / n_bases. Currently only the default choice of 1 is fully implemented. Do
            not use any other value for now. Default value: 1.

        include_existing_benchmarks : bool, optional
            If True, the previously defined benchmarks are included in the morphing basis. In that
            case, the number of free parameters in the optimization routine is reduced. If False, the
            existing benchmarks will still be simulated, but are not part of the morphing routine.
            Default value: True.

        n_trials : int, optional
            Number of random basis configurations tested in the optimization procedure. A larger
            number will increase the run time of the optimization, but lead to better results.
            Default value: 100.

        n_test_thetas : int, optional
            Number of random parameter points used to evaluate the expected mean squared morphing
            weights. A larger number will increase the run time of the optimization, but lead to
            better results. Default value: 100.

        Returns
        -------
        None
        """
        logger.info("Optimizing basis for morphing")

        morpher = PhysicsMorpher(parameters_from_madminer=self.parameters)
        morpher.find_components(max_overall_power)

        if include_existing_benchmarks:
            n_predefined_benchmarks = len(self.benchmarks)
            basis = morpher.optimize_basis(
                n_bases=n_bases,
                benchmarks_from_madminer=self.benchmarks,
                n_trials=n_trials,
                n_test_thetas=n_test_thetas,
            )
        else:
            n_predefined_benchmarks = 0
            basis = morpher.optimize_basis(
                n_bases=n_bases,
                benchmarks_from_madminer=None,
                n_trials=n_trials,
                n_test_thetas=n_test_thetas,
            )
            basis.update(self.benchmarks)

        self.set_benchmarks(basis, verbose=False)
        self.morpher = morpher
        self.export_morphing = True

        logger.info(
            "Set up morphing with %s parameters, %s morphing components, %s predefined basis points, "
            "and %s new basis points",
            morpher.n_parameters,
            morpher.n_components,
            n_predefined_benchmarks,
            morpher.n_components - n_predefined_benchmarks,
        )

    def finite_differences(self, epsilon=0.01):
        """
        Adds benchmarks so that the score can be computed from finite differences.

        Don't add any more benchmarks or parameters after calling this!
        """
        logger.info("Adding finite-differences benchmarks with epsilon = %s", epsilon)
        self.finite_difference_epsilon = epsilon

        # Copy is necessary to avoid an endless loop, since add_benchmark extends self.benchmarks
        for b_name, benchmark in self.benchmarks.copy().items():
            fd_keys = {}
            for param_name, param_value in benchmark.values.items():
                fd_key = f"{b_name}_plus_{param_name}"
                # Shift a copy of the parameter values, leaving the original benchmark untouched
                fd_values = dict(benchmark.values)
                fd_values[param_name] = param_value + epsilon
                self.add_benchmark(fd_values, fd_key)
                fd_keys[param_name] = fd_key
            self.finite_difference_benchmarks[b_name].shift_names = fd_keys

    def add_systematics(
        self,
        effect,
        systematic_name=None,
        norm_variation=1.1,
        scale="mu",
        scale_variations=(0.5, 1.0, 2.0),
        pdf_variation="CT10",
    ):
        """
        Adds a systematic uncertainty to the setup.

        Parameters
        ----------
        effect : {"norm", "scale", "pdf"}
            Type of the nuisance parameter. If "norm", it will affect the overall normalization of
            one or multiple samples in the process. If "scale", the nuisance parameter effect will be
            determined by varying the renormalization or factorization scales (depending on scale and
            scale_variations). If "pdf", the effect of the nuisance parameters will be determined by
            varying the PDF used.

        systematic_name : None or str, optional
            Name of the systematic. If None, a default name derived from the effect type is used.
            Default value: None.

        scale : {"mu", "mur", "muf"}, optional
            If type is "scale", this sets whether only the renormalization scale ("mur"), only the
            factorization scale ("muf"), or both simultaneously ("mu") are varied. Default value:
            "mu".
        norm_variation : float, optional
            If type is "norm", this sets the relative effect of the nuisance parameter on the cross
            section at the "plus 1 sigma" variation. 1.1 corresponds to a 10% increase, 0.9 to a 10%
            decrease relative to the nominal cross section. Default value: 1.1.

        scale_variations : tuple of float, optional
            If type is "scale", this sets how the renormalization and / or factorization scales are
            varied. A tuple like (0.5, 1.0, 2.0) specifies the factors with which they are varied.
            Default value: (0.5, 1.0, 2.0).

        pdf_variation : str, optional
            If type is "pdf", defines the PDF set for the variation. The option is passed along to
            the `--pdf` option of MadGraph's systematics module. See
            https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics for a list. The option
            "CT10" would, as an example, run over all the eigenvectors of the CTEQ10 set.
            Default value: "CT10".

        Returns
        -------
        None
        """
        assert scale in ["mu", "mur", "muf"]

        # Default name
        if systematic_name is None:
            i = 0
            while f"{effect}_{i}" in list(self.systematics.keys()):
                i += 1
            systematic_name = f"{effect}_{i}"

        systematic_name = systematic_name.replace(" ", "_")
        systematic_name = systematic_name.replace("-", "_")

        scale = SystematicScale.from_str(scale)
        effect = SystematicType.from_str(effect)

        if effect is SystematicType.PDF:
            self.systematics[systematic_name] = Systematic(
                systematic_name,
                SystematicType.PDF,
                pdf_variation,
            )
        elif effect is SystematicType.SCALE:
            scale_variation_string = ",".join(str(factor) for factor in scale_variations)
            self.systematics[systematic_name] = Systematic(
                systematic_name,
                SystematicType.SCALE,
                scale_variation_string,
                scale,
            )
        elif effect is SystematicType.NORM:
            self.systematics[systematic_name] = Systematic(
                systematic_name,
                SystematicType.NORM,
                norm_variation,
            )

    def load(self, filename, disable_morphing=False):
        """
        Loads MadMiner setup from a file. All parameters, benchmarks, and morphing settings are
        overwritten. See `save` for more details.

        Parameters
        ----------
        filename : str
            Path to the MadMiner file.

        disable_morphing : bool, optional
            If True, the morphing setup is not loaded from the file. Default value: False.

        Returns
        -------
        None
        """
        # Load data
        (
            self.parameters,
            self.benchmarks,
            _,
            morphing_components,
            morphing_matrix,
            _,
            _,
            self.systematics,
            _,
            _,
            _,
            _,
            self.finite_difference_benchmarks,
            self.finite_difference_epsilon,
        ) = load_madminer_settings(filename, include_nuisance_benchmarks=False)

        logger.info("Found %s parameters:", len(self.parameters))
        for param in self.parameters.values():
            logger.info("   %s", param)

        logger.info("Found %s benchmarks:", len(self.benchmarks))
        for benchmark in self.benchmarks.values():
            logger.info("   %s", benchmark)
            if self.default_benchmark is None:
                self.default_benchmark = benchmark.name

        # Morphing
        self.morpher = None
        self.export_morphing = False

        if morphing_matrix is not None and morphing_components is not None and not disable_morphing:
            self.morpher = PhysicsMorpher(self.parameters)
            self.morpher.set_components(morphing_components)
            self.morpher.set_basis(self.benchmarks, morphing_matrix=morphing_matrix)
            self.export_morphing = True

            logger.info("Found morphing setup with %s components", len(morphing_components))
        else:
            logger.info("Did not find morphing setup.")

        # Systematics setup
        if len(self.systematics) == 0:
            logger.info("Did not find systematics setup.")
        else:
            logger.info("Found systematics setup with %s groups", len(self.systematics))
            for name, systematic in self.systematics.items():
                logger.debug("  %s: %s", name, systematic)

    def save(self, filename):
        """
        Saves MadMiner setup into a file. The file format follows the HDF5 standard.

        The saved information includes:

        * the parameter definitions,
        * the benchmark points,
        * the systematics setup (if defined), and
        * the morphing setup (if defined).

        This file is an important input to later stages in the analysis chain, including the
        processing of generated events, extraction of training samples, and calculation of Fisher
        information matrices. In these downstream tasks, additional information will be written to
        the MadMiner file, including the observations and event weights.

        Parameters
        ----------
        filename : str
            Path to the MadMiner file.

        Returns
        -------
        None
        """
        Path(filename).parent.mkdir(parents=True, exist_ok=True)

        if self.morpher is not None:
            logger.info("Saving setup (including morphing) to %s", filename)
            save_madminer_settings(
                file_name=filename,
                file_override=True,
                parameters=self.parameters,
                benchmarks=self.benchmarks,
                morphing_components=self.morpher.components,
                morphing_matrix=self.morpher.morphing_matrix,
                systematics=self.systematics,
                finite_differences=self.finite_difference_benchmarks,
                finite_differences_epsilon=self.finite_difference_epsilon,
            )
        else:
            logger.info("Saving setup (without morphing) to %s", filename)
            save_madminer_settings(
                file_name=filename,
                file_override=True,
                parameters=self.parameters,
                benchmarks=self.benchmarks,
                systematics=self.systematics,
                finite_differences=self.finite_difference_benchmarks,
                finite_differences_epsilon=self.finite_difference_epsilon,
            )

    def _export_cards(
        self,
        param_card_template_file,
        mg_process_directory,
        sample_benchmark=None,
        param_card_filename=None,
        reweight_card_filename=None,
        include_param_card=True,
        benchmarks=None,
    ):
        """
        Writes out a param_card and reweight_card for MadGraph. Instead of this low-level function,
        it is recommended to use `run` or `run_multiple`.

        Parameters
        ----------
        param_card_template_file : str
            Path to a param_card.dat of the used model.

        mg_process_directory : str
            Path to the directory of the MG process.

        sample_benchmark : str or None, optional
            Name of the benchmark used for sampling. If None, the very first defined benchmark is
            used.
            Default value: None.

        param_card_filename : str or None, optional
            Output filename for the generated param card. If None, a default filename in the MG
            process folder is used. Default value: None.

        reweight_card_filename : str or None, optional
            Output filename for the generated reweight card. If None, a default filename in the MG
            process folder is used. Default value: None.

        include_param_card : bool, optional
            If False, no param card is exported, only a reweight card. Default value: True.

        benchmarks : None or OrderedDict, optional
            If None, uses all benchmarks. Otherwise uses these benchmarks. Default value: None.

        Returns
        -------
        None
        """
        if param_card_filename is None or reweight_card_filename is None:
            logger.info("Creating param and reweight cards in %s", mg_process_directory)
        else:
            logger.info("Creating param and reweight cards in %s, %s", param_card_filename, reweight_card_filename)

        if benchmarks is None:
            benchmarks = self.benchmarks

        # Check status
        assert self.default_benchmark is not None
        assert len(self.benchmarks) > 0

        # Default benchmark
        if sample_benchmark is None:
            sample_benchmark = self.default_benchmark

        # Export param card
        if include_param_card:
            export_param_card(
                benchmark=benchmarks[sample_benchmark],
                parameters=self.parameters,
                param_card_template_file=param_card_template_file,
                mg_process_directory=mg_process_directory,
                param_card_filename=param_card_filename,
            )

        # Export reweight card
        export_reweight_card(
            sample_benchmark=sample_benchmark,
            benchmarks=benchmarks,
            parameters=self.parameters,
            mg_process_directory=mg_process_directory,
            reweight_card_filename=reweight_card_filename,
        )

    def run(
        self,
        mg_directory,
        proc_card_file,
        param_card_template_file,
        run_card_file=None,
        mg_process_directory=None,
        pythia8_card_file=None,
        configuration_file=None,
        sample_benchmark=None,
        is_background=False,
        only_prepare_script=False,
        ufo_model_directory=None,
        log_directory=None,
        temp_directory=None,
        initial_command=None,
        systematics=None,
        order="LO",
        python_executable=None,
    ):
        """
        High-level function that creates the MadGraph process, all required cards, and prepares or
        runs the event generation for one combination of cards.

        If `only_prepare_script=True`, the event generation is not run directly, but a bash script is
        created in `<process_folder>/madminer/run.sh` that will start the event generation with the
        correct settings.

        Parameters
        ----------
        mg_directory : str
            Path to the MadGraph 5 base directory.

        proc_card_file : str
            Path to the process card that tells MadGraph how to generate the process.

        param_card_template_file : str
            Path to a param card that will be used as template to create the appropriate param cards
            for these runs.

        run_card_file : str or None, optional
            Path to the MadGraph run card. If None, the default run_card is used. Default value:
            None.

        mg_process_directory : str or None, optional
            Path to the MG process directory. If None, MadMiner uses ./MG_process. Default value:
            None.

        pythia8_card_file : str or None, optional
            Path to the MadGraph Pythia8 card. If None, the card present in the process folder is
            used. Default value: None.

        configuration_file : str or None, optional
            Path to the MadGraph me5_configuration card.
            If None, the card present in the process folder is used. Default value: None.

        sample_benchmark : str or None, optional
            Name of the benchmark that should be used to sample events. A different sampling does not
            change the expected differential cross sections, but will change which regions of phase
            space have many events (small variance) or few events (high variance). If None, the
            benchmark added first is used. Default value: None.

        is_background : bool, optional
            Should be True for background processes, i.e. processes in which the differential cross
            section does not depend on the parameters (i.e. is the same for all benchmarks). In this
            case, no reweighting is run, which can substantially speed up the event generation.
            Default value: False.

        only_prepare_script : bool, optional
            If True, the event generation is not started, but instead a run.sh script is created in
            the process directory. Default value: False.

        ufo_model_directory : str or None, optional
            Path to an UFO model directory that should be used, but is not yet installed in
            mg_directory/models. The model will be copied to the MadGraph model directory before the
            process directory is generated. Default value: None.

        log_directory : str or None, optional
            Directory for log files with the MadGraph output. If None, ./logs is used. Default value:
            None.

        temp_directory : str or None, optional
            Path to a temporary directory. If None, a system default is used. Default value: None.

        initial_command : str or None, optional
            Initial shell commands that have to be executed before MG is run (e.g. to load a virtual
            environment). Default value: None.

        systematics : None or list of str, optional
            If list of str, defines which systematics are used for this run. Default value: None.

        order : {"LO", "NLO"}, optional
            Differentiates between LO and NLO order runs. Minor changes to writing, reading and
            naming cards. Default value: "LO".

        python_executable : None or str, optional
            Provides a path to the Python executable that should be used to call MadMiner. Default
            value: None.

        Returns
        -------
        None
        """
        if sample_benchmark is None:
            sample_benchmark = self.default_benchmark

        self.run_multiple(
            mg_directory=mg_directory,
            proc_card_file=proc_card_file,
            param_card_template_file=param_card_template_file,
            run_card_files=[run_card_file],
            mg_process_directory=mg_process_directory,
            pythia8_card_file=pythia8_card_file,
            configuration_file=configuration_file,
            sample_benchmarks=[sample_benchmark],
            is_background=is_background,
            only_prepare_script=only_prepare_script,
            ufo_model_directory=ufo_model_directory,
            log_directory=log_directory,
            temp_directory=temp_directory,
            initial_command=initial_command,
            systematics=systematics,
            order=order,
            python_executable=python_executable,
        )

    def run_multiple(
        self,
        mg_directory,
        proc_card_file,
        param_card_template_file,
        run_card_files,
        mg_process_directory=None,
        pythia8_card_file=None,
        configuration_file=None,
        sample_benchmarks=None,
        is_background=False,
        only_prepare_script=False,
        ufo_model_directory=None,
        log_directory=None,
        temp_directory=None,
        initial_command=None,
        systematics=None,
        order="LO",
        python_executable=None,
    ):
        """
        High-level function that creates the MadGraph process, all required cards, and prepares or
        runs the event generation for multiple combinations of run_cards or importance samplings
        (`sample_benchmarks`).

        If `only_prepare_script=True`, the event generation is not run directly, but a bash script is
        created in `<process_folder>/madminer/run.sh` that will start the event generation with the
        correct settings.

        Parameters
        ----------
        mg_directory : str
            Path to the MadGraph 5 base directory.

        proc_card_file : str
            Path to the process card that tells MadGraph how to generate the process.

        param_card_template_file : str
            Path to a param card that will be used as template to create the appropriate param cards
            for these runs.

        run_card_files : list of str
            Paths to the MadGraph run cards.

        mg_process_directory : str or None, optional
            Path to the MG process directory. If None, MadMiner uses ./MG_process. Default value:
            None.

        pythia8_card_file : str or None, optional
            Path to the MadGraph Pythia8 card. If None, the card present in the process folder is
            used. Default value: None.

        configuration_file : str or None, optional
            Path to the MadGraph me5_configuration card. If None, the card present in the process
            folder is used. Default value: None.

        sample_benchmarks : list of str or None, optional
            Lists the names of benchmarks that should be used to sample events. A different sampling
            does not change the expected differential cross sections, but will change which regions
            of phase space have many events (small variance) or few events (high variance). If None,
            a run is started for each of the benchmarks, which should map out all regions of phase
            space well. Default value: None.

        is_background : bool, optional
            Should be True for background processes, i.e. processes in which the differential cross
            section does not depend on the parameters (i.e. is the same for all benchmarks). In this
            case, no reweighting is run, which can substantially speed up the event generation.
            Default value: False.

        only_prepare_script : bool, optional
            If True, the event generation is not started, but instead a run.sh script is created in
            the process directory. Default value: False.

        ufo_model_directory : str or None, optional
            Path to an UFO model directory that should be used, but is not yet installed in
            mg_directory/models. The model will be copied to the MadGraph model directory before the
            process directory is generated. Default value: None.

        log_directory : str or None, optional
            Directory for log files with the MadGraph output. If None, ./logs is used. Default value:
            None.

        temp_directory : str or None, optional
            Path to a temporary directory. If None, a system default is used. Default value: None.

        initial_command : str or None, optional
            Initial shell commands that have to be executed before MG is run (e.g. to load a virtual
            environment). If not specified and `python_executable` is given, the folder containing
            that executable is prepended to the PATH. Default value: None.

        systematics : None or list of str, optional
            If list of str, defines which systematics are used for these runs. Default value: None.

        order : {"LO", "NLO"}, optional
            Differentiates between LO and NLO order runs. Minor changes to writing, reading and
            naming cards. Default value: "LO".

        python_executable : None or str, optional
            Provides a path to the Python executable that should be used to call MadMiner. Default
            value: None.

        Returns
        -------
        None
        """
        # Defaults
        if mg_process_directory is None:
            mg_process_directory = "./MG_process"

        if temp_directory is None:
            temp_directory = tempfile.gettempdir()

        if log_directory is None:
            log_directory = "./logs"

        if sample_benchmarks is None:
            sample_benchmarks = list(self.benchmarks.keys())

        # This snippet is useful when using virtual envs.
        # (Derives from a Python2 - Python3 issue.)
        # Ref: https://github.com/madminer-tool/madminer/issues/422
        if python_executable and initial_command is None:
            logger.info(f"Adding {python_executable} bin folder to PATH")
            binary_path = os.popen(f"command -v {python_executable}").read().strip()
            binary_folder = Path(binary_path).parent
            initial_command = f"export PATH={binary_folder}:$PATH"
            logger.info(f"Using Python executable {binary_path}")

        # Generate process folder
        log_file_generate = f"{log_directory}/generate.log"

        generate_mg_process(
            mg_directory,
            temp_directory,
            proc_card_file,
            mg_process_directory,
            ufo_model_directory=ufo_model_directory,
            initial_command=initial_command,
            log_file=log_file_generate,
            python_executable=python_executable,
        )

        # Make MadMiner folders
        Path(mg_process_directory, "madminer", "cards").mkdir(parents=True, exist_ok=True)
        Path(mg_process_directory, "madminer", "scripts").mkdir(parents=True, exist_ok=True)

        # Systematics
        if systematics is None:
            systematics_used = self.systematics
        else:
            systematics_used = OrderedDict()
            for key in systematics:
                systematics_used[key] = self.systematics[key]

        # Loop over settings
        i = 0
        mg_scripts = []

        for run_card_file in run_card_files:
            for sample_benchmark in sample_benchmarks:
                # Files
                script_file = f"madminer/scripts/run_{i}.sh"
                log_file_run = f"run_{i}.log"
                mg_commands_filename = f"madminer/cards/mg_commands_{i}.dat"
                param_card_file = f"madminer/cards/param_card_{i}.dat"
                reweight_card_file = f"madminer/cards/reweight_card_{i}.dat"

                new_pythia8_card_file = None
                if pythia8_card_file is not None:
                    new_pythia8_card_file = f"madminer/cards/pythia8_card_{i}.dat"

                new_run_card_file = None
                if run_card_file is not None:
                    new_run_card_file = f"madminer/cards/run_card_{i}.dat"

                new_configuration_file = None
                if configuration_file is not None:
                    new_configuration_file = f"madminer/cards/me5_configuration_{i}.txt"

                logger.info("Run %s", i)
                logger.info("  Sampling from benchmark: %s", sample_benchmark)
                logger.info("  Original run card: %s", run_card_file)
                logger.info("  Original Pythia8 card: %s", pythia8_card_file)
                logger.info("  Original config card: %s", configuration_file)
                logger.info("  Copied run card: %s", new_run_card_file)
                logger.info("  Copied Pythia8 card: %s", new_pythia8_card_file)
                logger.info("  Copied config card: %s", new_configuration_file)
                logger.info("  Param card: %s", param_card_file)
                logger.info("  Reweight card: %s", reweight_card_file)
                logger.info("  Log file: %s", log_file_run)

                # Check input
                if run_card_file is None and any(
                    syst.type in {SystematicType.PDF, SystematicType.SCALE}
                    for syst in systematics_used.values()
                ):
                    logger.warning(
                        "Warning: No run card given, but PDF or scale variation set up. The correct"
                        " systematics settings are not set automatically. Make sure to set them correctly!"
                    )
) # Create param and reweight cards self._export_cards( param_card_template_file, mg_process_directory, sample_benchmark=sample_benchmark, param_card_filename= f"{mg_process_directory}/{param_card_file}", reweight_card_filename= f"{mg_process_directory}/{reweight_card_file}", ) # Create run card if run_card_file is not None: export_run_card( template_filename=run_card_file, run_card_filename= f"{mg_process_directory}/{new_run_card_file}", systematics=systematics_used, order=order, ) # Copy Pythia card if pythia8_card_file is not None: copy_file( pythia8_card_file, f"{mg_process_directory}/{new_pythia8_card_file}") # Copy Configuration card if configuration_file is not None: copy_file( configuration_file, f"{mg_process_directory}/{new_configuration_file}") # Run MG and Pythia if only_prepare_script: mg_script = setup_mg_with_scripts( mg_process_directory, proc_card_filename_from_mgprocdir=mg_commands_filename, run_card_file_from_mgprocdir=new_run_card_file, param_card_file_from_mgprocdir=param_card_file, reweight_card_file_from_mgprocdir=reweight_card_file, pythia8_card_file_from_mgprocdir=new_pythia8_card_file, configuration_file_from_mgprocdir= new_configuration_file, is_background=is_background, script_file_from_mgprocdir=script_file, initial_command=initial_command, log_dir=log_directory, log_file_from_logdir=log_file_run, python_executable=python_executable, order=order, ) mg_scripts.append(mg_script) else: run_mg( mg_directory, mg_process_directory, f"{mg_process_directory}/{mg_commands_filename}", f"{mg_process_directory}/{new_run_card_file}", f"{mg_process_directory}/{param_card_file}", f"{mg_process_directory}/{reweight_card_file}", None if new_pythia8_card_file is None else f"{mg_process_directory}/{new_pythia8_card_file}", None if new_configuration_file is None else f"{mg_process_directory}/{new_configuration_file}", is_background=is_background, initial_command=initial_command, log_file=f"{log_directory}/{log_file_run}", python_executable=python_executable, order=order, ) i += 1 n_runs_total = i # Master shell script if only_prepare_script: master_script_filename = f"{mg_process_directory}/madminer/run.sh" create_master_script( log_directory, master_script_filename, mg_directory, mg_process_directory, mg_scripts, ) logger.info( "To generate events, please run:\n\n %s [MG_directory] [MG_process_directory] [log_dir]\n\n", master_script_filename, ) else: expected_event_files = [ f"{mg_process_directory}/Events/run_{(i+1):02d}" for i in range(n_runs_total) ] expected_event_files = "\n".join(expected_event_files) logger.info( "Finished running MadGraph! Please check that events were successfully generated in the following " "folders:\n\n%s\n\n", expected_event_files, ) def reweight_existing_sample( self, mg_process_directory, run_name, param_card_template_file, sample_benchmark, reweight_benchmarks=None, only_prepare_script=False, log_directory=None, initial_command=None, ): """ High-level function that adds the weights required for MadMiner to an existing sample. If `only_prepare_scripts=True`, the event generation is not run directly, but a bash script is created in `<process_folder>/madminer/run.sh` that will start the event generation with the correct settings. Currently does not support adding systematics. Parameters ---------- mg_process_directory : str Path to the MG process directory. If None, MadMiner uses ./MG_process. run_name : str Run name. 
        param_card_template_file : str
            Path to a param card that will be used as template to create the appropriate param cards for these runs.

        sample_benchmark : str
            The name of the benchmark used to generate this sample.

        reweight_benchmarks : list of str or None
            Lists the names of benchmarks to which the sample should be reweighted. If None, all benchmarks (except
            sample_benchmark) are used.

        only_prepare_script : bool, optional
            If True, the event generation is not started, but instead a run.sh script is created in the process
            directory. Default value: False.

        log_directory : str or None, optional
            Directory for log files with the MadGraph output. If None, ./logs is used. Default value: None.

        initial_command : str or None, optional
            Initial shell commands that have to be executed before MG is run (e.g. to load a virtual environment).
            Default value: None.

        Returns
        -------
            None
        """

        # TODO: check that we don't reweight to benchmarks that already have weights in the LHE file
        # TODO: add systematics

        # Defaults
        if log_directory is None:
            log_directory = "./logs"

        # Make MadMiner folders
        Path(mg_process_directory, "madminer", "cards").mkdir(parents=True, exist_ok=True)
        Path(mg_process_directory, "madminer", "scripts").mkdir(parents=True, exist_ok=True)

        # Files
        script_file = "madminer/scripts/run_reweight.sh"
        log_file_run = "reweight.log"
        reweight_card_file = "madminer/cards/reweight_card_reweight.dat"

        # Missing benchmarks (default: all benchmarks except the sampling benchmark)
        if reweight_benchmarks is None:
            reweight_benchmarks = [b for b in self.benchmarks if b != sample_benchmark]

        missing_benchmarks = OrderedDict()
        for benchmark_name in reweight_benchmarks:
            missing_benchmarks[benchmark_name] = self.benchmarks[benchmark_name]

        # Inform user
        logger.info("Reweighting setup")
        logger.info("  Originally sampled from benchmark: %s", sample_benchmark)
        logger.info("  Now reweighting to benchmarks:     %s", reweight_benchmarks)
        logger.info("  Reweight card:                     %s", reweight_card_file)
        logger.info("  Log file:                          %s", log_file_run)

        # Create param and reweight cards
        self._export_cards(
            param_card_template_file,
            mg_process_directory,
            sample_benchmark=sample_benchmark,
            reweight_card_filename=f"{mg_process_directory}/{reweight_card_file}",
            include_param_card=False,
            benchmarks=missing_benchmarks,
        )

        # Run reweighting
        if only_prepare_script:
            call_instruction = setup_mg_reweighting_with_scripts(
                mg_process_directory,
                run_name=run_name,
                reweight_card_file_from_mgprocdir=reweight_card_file,
                script_file_from_mgprocdir=script_file,
                initial_command=initial_command,
                log_dir=log_directory,
                log_file_from_logdir=log_file_run,
            )

            logger.info("To generate events, please run:\n\n %s \n\n", call_instruction)

        else:
            run_mg_reweighting(
                mg_process_directory,
                run_name=run_name,
                reweight_card_file=f"{mg_process_directory}/{reweight_card_file}",
                initial_command=initial_command,
                log_file=f"{log_directory}/{log_file_run}",
            )
            logger.info(
                "Finished running reweighting! Please check that events were successfully reweighted in the following "
                "folder:\n\n %s/Events/%s \n\n",
                mg_process_directory,
                run_name,
            )
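# Illustrative usage sketch (not part of the library; the file names, benchmark
# names, and run name below are placeholders):
#
#     miner = MadMiner()
#     miner.load("setup.h5")
#     miner.reweight_existing_sample(
#         mg_process_directory="./MG_process",
#         run_name="run_01",
#         param_card_template_file="cards/param_card_template.dat",
#         sample_benchmark="sm",
#         reweight_benchmarks=["morphing_basis_vector_1"],
#     )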
class MadMiner:
    """
    The central class to manage parameter spaces, benchmarks, and the generation of events through MadGraph and
    Pythia.

    An instance of this class is the starting point of most MadMiner applications. It is typically used in four steps:

    * Defining the parameter space through `MadMiner.add_parameter()`
    * Defining the benchmarks, i.e. the points at which the squared matrix elements will be evaluated in MadGraph,
      with `MadMiner.add_benchmark()` or, if operator morphing is used, with `MadMiner.set_morphing()`
    * Saving this setup with `MadMiner.save()` (it can be loaded in a new instance with `MadMiner.load()`)
    * Running MadGraph and Pythia with the appropriate settings with `MadMiner.run()` or `MadMiner.run_multiple()`
      (the latter allows the user to combine runs from multiple run cards and sampling points)

    Please see the tutorial for a hands-on introduction to its methods.
    """

    def __init__(self):
        self.parameters = OrderedDict()
        self.benchmarks = OrderedDict()
        self.default_benchmark = None
        self.morpher = None
        self.export_morphing = False
        self.systematics = OrderedDict()

    def add_parameter(
        self,
        lha_block,
        lha_id,
        parameter_name=None,
        param_card_transform=None,
        morphing_max_power=2,
        parameter_range=(0.0, 1.0),
    ):
        """
        Adds an individual parameter.

        Parameters
        ----------
        lha_block : str
            The name of the LHA block as used in the param_card. Case-sensitive.

        lha_id : int
            The LHA id as used in the param_card.

        parameter_name : str or None
            An internal name for the parameter. If None, the default 'parameter_i' is used.

        param_card_transform : None or str
            Represents a one-parameter function mapping the parameter (`"theta"`) to the value that should be written
            in the parameter cards. This str is parsed by Python's `eval()` function, and `"theta"` is parsed as the
            parameter value. For instance, `param_card_transform="2*theta"` would write twice the parameter value into
            the param card. Default value: None.

        morphing_max_power : int or tuple of int
            The maximal power with which this parameter contributes to the squared matrix element of the process of
            interest. If a tuple is given, gives this maximal power for each of several operator configurations.
            Typically at tree level, this maximal number is 2 for parameters that affect one vertex (e.g. only
            production or only decay of a particle), and 4 for parameters that affect two vertices (e.g. production
            and decay). Default value: 2.

        parameter_range : tuple of float
            The range of parameter values of primary interest. Only affects the basis optimization.
            Default value: (0., 1.).
        Returns
        -------
            None
        """

        # Default names
        if parameter_name is None:
            parameter_name = "parameter_" + str(len(self.parameters))

        # Check and sanitize input
        assert isinstance(parameter_name, six.string_types), "Parameter name is not a string: {}".format(
            parameter_name
        )
        assert isinstance(lha_block, six.string_types), "LHA block is not a string: {}".format(lha_block)
        assert isinstance(lha_id, int), "LHA id is not an integer: {}".format(lha_id)

        parameter_name = parameter_name.replace(" ", "_")
        parameter_name = parameter_name.replace("-", "_")

        assert parameter_name not in self.parameters, "Parameter name exists already: {}".format(parameter_name)

        if isinstance(morphing_max_power, int):
            morphing_max_power = (morphing_max_power,)

        # Add parameter
        self.parameters[parameter_name] = (lha_block, lha_id, morphing_max_power, parameter_range, param_card_transform)

        # After manually adding parameters, the morphing information is not accurate anymore
        self.morpher = None

        logger.info(
            "Added parameter %s (LHA: %s %s, maximal power in squared ME: %s, range: %s)",
            parameter_name,
            lha_block,
            lha_id,
            morphing_max_power,
            parameter_range,
        )

        # Reset benchmarks
        if len(self.benchmarks) > 0:
            logger.warning("Resetting benchmarks and morphing")
            self.benchmarks = OrderedDict()
            self.default_benchmark = None
            self.morpher = None
            self.export_morphing = False

    def set_parameters(self, parameters=None):
        """
        Manually sets all parameters, overwriting previously added parameters.

        Parameters
        ----------
        parameters : dict or list or None, optional
            If parameters is None, resets parameters. If parameters is a dict, the keys should be str and give the
            parameter names, and the values are tuples of the form
            (LHA_block, LHA_ID, morphing_max_power, param_min, param_max) or of the form (LHA_block, LHA_ID). If
            parameters is a list, the items should be tuples of the form (LHA_block, LHA_ID). Default value: None.

        Returns
        -------
            None
        """

        if parameters is None:
            parameters = OrderedDict()

        self.parameters = OrderedDict()

        if isinstance(parameters, dict):
            for key, values in six.iteritems(parameters):
                if len(values) == 5:
                    self.add_parameter(
                        lha_block=values[0],
                        lha_id=values[1],
                        parameter_name=key,
                        parameter_range=[values[3], values[4]],
                        morphing_max_power=values[2],
                    )
                elif len(values) == 2:
                    self.add_parameter(lha_block=values[0], lha_id=values[1], parameter_name=key)
                else:
                    raise ValueError("Parameter properties have unexpected length: {0}".format(values))

        else:
            for values in parameters:
                assert len(values) == 2, "Parameter list entry does not have length 2: {0}".format(values)
                self.add_parameter(values[0], values[1])

        # After manually adding parameters, the morphing information is not accurate anymore
        if len(self.benchmarks) > 0:
            logger.warning("Resetting benchmarks and morphing")
            self.benchmarks = OrderedDict()
            self.default_benchmark = None
            self.morpher = None
            self.export_morphing = False

    def add_benchmark(self, parameter_values, benchmark_name=None, verbose=True):
        """
        Manually adds an individual benchmark, that is, a parameter point that will be evaluated by MadGraph.

        Parameters
        ----------
        parameter_values : dict
            The keys of this dict should be the parameter names and the values the corresponding parameter values.

        benchmark_name : str or None, optional
            Name of benchmark. If None, a default name is used. Default value: None.

        verbose : bool, optional
            If True, prints output about each benchmark. Default value: True.
        Returns
        -------
            None

        Raises
        ------
        RuntimeError
            If a benchmark with the same name already exists, if parameter_values is not a dict, or if a key of
            parameter_values does not correspond to a defined parameter.
        """

        # Default names
        if benchmark_name is None:
            benchmark_name = "benchmark_" + str(len(self.benchmarks))

        # Check input
        if not isinstance(parameter_values, dict):
            raise RuntimeError("Parameter values are not a dict: {}".format(parameter_values))

        for key in six.iterkeys(parameter_values):
            if key not in self.parameters:
                raise RuntimeError("Unknown parameter: {0}".format(key))

        if benchmark_name in self.benchmarks:
            raise RuntimeError("Benchmark name {} exists already".format(benchmark_name))

        # Add benchmark
        self.benchmarks[benchmark_name] = parameter_values

        # If first benchmark, this will be the default for sampling
        if len(self.benchmarks) == 1:
            self.default_benchmark = benchmark_name

        if verbose:
            logger.info("Added benchmark %s: %s", benchmark_name, format_benchmark(parameter_values))
        else:
            logger.debug("Added benchmark %s: %s", benchmark_name, format_benchmark(parameter_values))

    def set_benchmarks(self, benchmarks=None, verbose=True):
        """
        Manually sets all benchmarks, that is, parameter points that will be evaluated by MadGraph. Calling this
        function overwrites all previously defined benchmarks.

        Parameters
        ----------
        benchmarks : dict or list or None, optional
            Specifies all benchmarks. If None, all benchmarks are reset. If dict, the keys are the benchmark names and
            the values are dicts of the form {parameter_name: value}. If list, the entries are dicts
            {parameter_name: value} (and the benchmark names are chosen automatically). Default value: None.

        verbose : bool, optional
            If True, prints output about each benchmark. Default value: True.

        Returns
        -------
            None
        """

        if benchmarks is None:
            benchmarks = OrderedDict()

        self.benchmarks = OrderedDict()
        self.default_benchmark = None

        if isinstance(benchmarks, dict):
            for name, values in six.iteritems(benchmarks):
                self.add_benchmark(values, name, verbose=verbose)
        else:
            for values in benchmarks:
                self.add_benchmark(values, verbose=verbose)

        # After manually adding benchmarks, the morphing information is not accurate anymore
        if self.morpher is not None:
            logger.warning("Reset morphing")
            self.morpher = None
            self.export_morphing = False

    def set_morphing(
        self, max_overall_power=4, n_bases=1, include_existing_benchmarks=True, n_trials=100, n_test_thetas=100
    ):
        """
        Sets up the morphing environment.

        Sets benchmarks, i.e. parameter points that will be evaluated by MadGraph, for a morphing algorithm, and
        calculates all information required for morphing. Morphing is a technique that allows MadMiner to infer the
        full probability distribution `p(x_i | theta)` for each simulated event `x_i` and any `theta`, not just the
        benchmarks.

        The morphing basis is optimized with respect to the expected mean squared morphing weights over the parameter
        region of interest. If `include_existing_benchmarks=True`, benchmarks defined previously will be incorporated
        in the morphing basis and only the remaining basis points will be optimized.

        Note that any subsequent call to `set_benchmarks` or `add_benchmark` will overwrite the morphing setup. The
        correct order is therefore to manually define benchmarks first, using `set_benchmarks` or `add_benchmark`, and
        then to create the morphing setup and complete the basis by calling
        `set_morphing(include_existing_benchmarks=True)`.
        Parameters
        ----------
        max_overall_power : int or tuple of int, optional
            The maximal sum of powers of all parameters contributing to the squared matrix element. If a tuple is
            given, gives the maximal sum of powers for each of several operator configurations (see `add_parameter`).
            Typically, if parameters can affect the couplings at n vertices, this number is 2n. Default value: 4.

        n_bases : int, optional
            The number of morphing bases generated. If n_bases > 1, multiple bases are combined, and the weights for
            each basis are reduced by a factor 1 / n_bases. Currently only the default choice of 1 is fully
            implemented. Do not use any other value for now. Default value: 1.

        include_existing_benchmarks : bool, optional
            If True, the previously defined benchmarks are included in the morphing basis. In that case, the number of
            free parameters in the optimization routine is reduced. If False, the existing benchmarks will still be
            simulated, but are not part of the morphing routine. Default value: True.

        n_trials : int, optional
            Number of random basis configurations tested in the optimization procedure. A larger number will increase
            the run time of the optimization, but lead to better results. Default value: 100.

        n_test_thetas : int, optional
            Number of random parameter points used to evaluate the expected mean squared morphing weights. A larger
            number will increase the run time of the optimization, but lead to better results. Default value: 100.

        Returns
        -------
            None
        """

        logger.info("Optimizing basis for morphing")

        if isinstance(max_overall_power, int):
            max_overall_power = (max_overall_power,)

        morpher = PhysicsMorpher(parameters_from_madminer=self.parameters)
        morpher.find_components(max_overall_power)

        if include_existing_benchmarks:
            n_predefined_benchmarks = len(self.benchmarks)
            basis = morpher.optimize_basis(
                n_bases=n_bases,
                fixed_benchmarks_from_madminer=self.benchmarks,
                n_trials=n_trials,
                n_test_thetas=n_test_thetas,
            )
        else:
            n_predefined_benchmarks = 0
            basis = morpher.optimize_basis(
                n_bases=n_bases,
                fixed_benchmarks_from_madminer=None,
                n_trials=n_trials,
                n_test_thetas=n_test_thetas,
            )

            basis.update(self.benchmarks)

        self.set_benchmarks(basis, verbose=False)
        self.morpher = morpher
        self.export_morphing = True

        logger.info(
            "Set up morphing with %s parameters, %s morphing components, %s predefined basis points, and %s "
            "new basis points",
            morpher.n_parameters,
            morpher.n_components,
            n_predefined_benchmarks,
            morpher.n_components - n_predefined_benchmarks,
        )

    def reset_systematics(self):
        self.systematics = OrderedDict()

    def add_systematics(
        self,
        effect,
        systematic_name=None,
        norm_variation=1.1,
        scale="mu",
        scale_variations=(0.5, 1.0, 2.0),
        pdf_variation="CT10",
    ):
        """
        Adds a systematic uncertainty (a group of nuisance parameters) to the setup.

        Parameters
        ----------
        effect : {"norm", "scale", "pdf"}
            Type of the nuisance parameter. If "norm", it will affect the overall normalization of one or multiple
            samples in the process. If "scale", the nuisance parameter effect will be determined by varying
            factorization or renormalization scales (depending on `scale` and `scale_variations`). If "pdf", the
            effect of the nuisance parameters will be determined by varying the PDF used.

        systematic_name : None or str, optional
            Name of the systematic. If None, a default name like "scale_0" is chosen automatically.
            Default value: None.

        norm_variation : float, optional
            If effect is "norm", this sets the relative effect of the nuisance parameter on the cross section at the
            "plus 1 sigma" variation. 1.1 corresponds to a 10% increase, 0.9 to a 10% decrease relative to the nominal
            cross section. Default value: 1.1.

        scale : {"mu", "mur", "muf"}, optional
            If effect is "scale", this sets whether only the renormalization scale ("mur"), only the factorization
            scale ("muf"), or both simultaneously ("mu") are varied. Default value: "mu".

        scale_variations : tuple of float, optional
            If effect is "scale", this sets how the renormalization and / or factorization scales are varied. A tuple
            like (0.5, 1.0, 2.0) specifies the factors with which they are varied. Default value: (0.5, 1.0, 2.0).

        pdf_variation : str, optional
            If effect is "pdf", defines the PDF set for the variation. The option is passed along to the `--pdf`
            option of MadGraph's systematics module. See
            https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics for a list. The option "CT10" would, as an
            example, run over all the eigenvectors of the CT10 set. Default value: "CT10".

        Returns
        -------
            None
        """

        # Default name
        if systematic_name is None:
            i = 0
            while "{}_{}".format(effect, i) in list(six.iterkeys(self.systematics)):
                i += 1
            systematic_name = "{}_{}".format(effect, i)

        systematic_name = systematic_name.replace(" ", "_")
        systematic_name = systematic_name.replace("-", "_")

        if effect == "pdf":
            self.systematics[systematic_name] = ("pdf", pdf_variation)
        elif effect == "scale":
            scale_variation_string = ",".join([str(factor) for factor in scale_variations])
            assert scale in ["mu", "mur", "muf"]
            self.systematics[systematic_name] = ("scale", scale, scale_variation_string)
        elif effect == "norm":
            self.systematics[systematic_name] = ("norm", norm_variation)
        else:
            raise ValueError("Unknown systematic type {}, has to be one of 'norm', 'scale', or 'pdf'!".format(effect))

    def load(self, filename, disable_morphing=False):
        """
        Loads MadMiner setup from a file. All parameters, benchmarks, and morphing settings are overwritten. See
        `save` for more details.

        Parameters
        ----------
        filename : str
            Path to the MadMiner file.

        disable_morphing : bool, optional
            If True, the morphing setup is not loaded from the file. Default value: False.

        Returns
        -------
            None
        """

        # Load data
        (
            self.parameters,
            self.benchmarks,
            _,
            morphing_components,
            morphing_matrix,
            _,
            _,
            self.systematics,
            _,
            _,
            _,
            _,
        ) = load_madminer_settings(filename, include_nuisance_benchmarks=False)

        logger.info("Found %s parameters:", len(self.parameters))
        for key, values in six.iteritems(self.parameters):
            logger.info(
                "   %s (LHA: %s %s, maximal power in squared ME: %s, range: %s)",
                key,
                values[0],
                values[1],
                values[2],
                values[3],
            )

        logger.info("Found %s benchmarks:", len(self.benchmarks))
        for key, values in six.iteritems(self.benchmarks):
            logger.info("   %s: %s", key, format_benchmark(values))

            if self.default_benchmark is None:
                self.default_benchmark = key

        # Morphing
        self.morpher = None
        self.export_morphing = False

        if morphing_matrix is not None and morphing_components is not None and not disable_morphing:
            self.morpher = PhysicsMorpher(self.parameters)
            self.morpher.set_components(morphing_components)
            self.morpher.set_basis(self.benchmarks, morphing_matrix=morphing_matrix)
            self.export_morphing = True

            logger.info("Found morphing setup with %s components", len(morphing_components))
        else:
            logger.info("Did not find morphing setup.")

        # Systematics setup
        if len(self.systematics) == 0:
            logger.info("Did not find systematics setup.")
        else:
            logger.info("Found systematics setup with %s nuisance parameter groups", len(self.systematics))
            for key, value in six.iteritems(self.systematics):
                logger.debug("  %s: %s", key, " / ".join(str(x) for x in value))

    def save(self, filename):
        """
        Saves MadMiner setup into a file. The file format follows the HDF5 standard.
        The saved information includes:

        * the parameter definitions,
        * the benchmark points,
        * the systematics setup (if defined), and
        * the morphing setup (if defined).

        This file is an important input to later stages in the analysis chain, including the processing of generated
        events, extraction of training samples, and calculation of Fisher information matrices. In these downstream
        tasks, additional information will be written to the MadMiner file, including the observations and event
        weights.

        Parameters
        ----------
        filename : str
            Path to the MadMiner file.

        Returns
        -------
            None
        """

        create_missing_folders([os.path.dirname(filename)])

        if self.morpher is not None:
            logger.info("Saving setup (including morphing) to %s", filename)

            save_madminer_settings(
                filename=filename,
                parameters=self.parameters,
                benchmarks=self.benchmarks,
                morphing_components=self.morpher.components,
                morphing_matrix=self.morpher.morphing_matrix,
                systematics=self.systematics,
                overwrite_existing_files=True,
            )
        else:
            logger.info("Saving setup (without morphing) to %s", filename)

            save_madminer_settings(
                filename=filename,
                parameters=self.parameters,
                benchmarks=self.benchmarks,
                systematics=self.systematics,
                overwrite_existing_files=True,
            )

    def _export_cards(
        self,
        param_card_template_file,
        mg_process_directory,
        sample_benchmark=None,
        param_card_filename=None,
        reweight_card_filename=None,
        include_param_card=True,
        benchmarks=None,
    ):
        """
        Writes out a param_card and reweight_card for MadGraph. Instead of this low-level function, it is recommended
        to use `run` or `run_multiple`.

        Parameters
        ----------
        param_card_template_file : str
            Path to a param_card.dat of the used model.

        mg_process_directory : str
            Path to the directory of the MG process.

        sample_benchmark : str or None, optional
            Name of the benchmark used for sampling. If None, the very first defined benchmark is used.
            Default value: None.

        param_card_filename : str or None, optional
            Output filename for the generated param card. If None, a default filename in the MG process folder is
            used. Default value: None.

        reweight_card_filename : str or None, optional
            Output filename for the generated reweight card. If None, a default filename in the MG process folder is
            used. Default value: None.

        include_param_card : bool, optional
            If False, no param card is exported, only a reweight card. Default value: True.

        benchmarks : None or OrderedDict, optional
            If None, uses all benchmarks. Otherwise uses these benchmarks. Default value: None.
        Returns
        -------
            None
        """

        if param_card_filename is None or reweight_card_filename is None:
            logger.info("Creating param and reweight cards in %s", mg_process_directory)
        else:
            logger.info("Creating param and reweight cards in %s, %s", param_card_filename, reweight_card_filename)

        if benchmarks is None:
            benchmarks = self.benchmarks

        # Check status
        assert self.default_benchmark is not None
        assert len(self.benchmarks) > 0

        # Default benchmark
        if sample_benchmark is None:
            sample_benchmark = self.default_benchmark

        # Export param card
        if include_param_card:
            export_param_card(
                benchmark=benchmarks[sample_benchmark],
                parameters=self.parameters,
                param_card_template_file=param_card_template_file,
                mg_process_directory=mg_process_directory,
                param_card_filename=param_card_filename,
            )

        # Export reweight card
        export_reweight_card(
            sample_benchmark=sample_benchmark,
            benchmarks=benchmarks,
            parameters=self.parameters,
            mg_process_directory=mg_process_directory,
            reweight_card_filename=reweight_card_filename,
        )

    def run(
        self,
        mg_directory,
        proc_card_file,
        param_card_template_file,
        run_card_file=None,
        mg_process_directory=None,
        pythia8_card_file=None,
        configuration_file=None,
        sample_benchmark=None,
        is_background=False,
        only_prepare_script=False,
        ufo_model_directory=None,
        log_directory=None,
        temp_directory=None,
        initial_command=None,
        python2_override=False,
        systematics=None,
    ):
        """
        High-level function that creates the MadGraph process, all required cards, and prepares or runs the event
        generation for one combination of cards.

        If `only_prepare_script=True`, the event generation is not run directly, but a bash script is created in
        `<process_folder>/madminer/run.sh` that will start the event generation with the correct settings.

        Parameters
        ----------
        mg_directory : str
            Path to the MadGraph 5 base directory.

        proc_card_file : str
            Path to the process card that tells MadGraph how to generate the process.

        param_card_template_file : str
            Path to a param card that will be used as template to create the appropriate param cards for these runs.

        run_card_file : str or None, optional
            Path to the MadGraph run card. If None, the default run_card is used. Default value: None.

        mg_process_directory : str or None, optional
            Path to the MG process directory. If None, MadMiner uses ./MG_process. Default value: None.

        pythia8_card_file : str or None, optional
            Path to the MadGraph Pythia8 card. If None, the card present in the process folder is used.
            Default value: None.

        configuration_file : str, optional
            Path to the MadGraph me5_configuration card. If None, the card present in the process folder is used.
            Default value: None.

        sample_benchmark : str or None, optional
            The name of the benchmark that should be used to sample events. A different sampling does not change the
            expected differential cross sections, but will change which regions of phase space have many events (small
            variance) or few events (high variance). If None, the benchmark added first is used. Default value: None.

        is_background : bool, optional
            Should be True for background processes, i.e. processes in which the differential cross section does not
            depend on the parameters (i.e.
            is the same for all benchmarks). In this case, no reweighting is run, which can substantially speed up the
            event generation. Default value: False.

        only_prepare_script : bool, optional
            If True, the event generation is not started, but instead a run.sh script is created in the process
            directory. Default value: False.

        ufo_model_directory : str or None, optional
            Path to a UFO model directory that should be used, but is not yet installed in mg_directory/models. The
            model will be copied to the MadGraph model directory before the process directory is generated.
            Default value: None.

        log_directory : str or None, optional
            Directory for log files with the MadGraph output. If None, ./logs is used. Default value: None.

        temp_directory : str or None, optional
            Path to a temporary directory. If None, a system default is used. Default value: None.

        initial_command : str or None, optional
            Initial shell commands that have to be executed before MG is run (e.g. to load a virtual environment).
            Default value: None.

        python2_override : bool, optional
            If True, MadMiner explicitly calls "python2" instead of relying on the system Python version to be
            Python 2.6 or Python 2.7. If you use systematics, make sure that the python interface of LHAPDF was
            compiled with the Python version you are using. Default: False.

        systematics : None or list of str, optional
            If list of str, defines which systematics are used for this run. Default value: None.

        Returns
        -------
            None
        """

        if sample_benchmark is None:
            sample_benchmark = self.default_benchmark

        self.run_multiple(
            mg_directory=mg_directory,
            proc_card_file=proc_card_file,
            param_card_template_file=param_card_template_file,
            run_card_files=[run_card_file],
            mg_process_directory=mg_process_directory,
            pythia8_card_file=pythia8_card_file,
            configuration_file=configuration_file,
            sample_benchmarks=[sample_benchmark],
            is_background=is_background,
            only_prepare_script=only_prepare_script,
            ufo_model_directory=ufo_model_directory,
            log_directory=log_directory,
            temp_directory=temp_directory,
            initial_command=initial_command,
            python2_override=python2_override,
            systematics=systematics,
        )

    def run_multiple(
        self,
        mg_directory,
        proc_card_file,
        param_card_template_file,
        run_card_files,
        mg_process_directory=None,
        pythia8_card_file=None,
        configuration_file=None,
        sample_benchmarks=None,
        is_background=False,
        only_prepare_script=False,
        ufo_model_directory=None,
        log_directory=None,
        temp_directory=None,
        initial_command=None,
        python2_override=False,
        systematics=None,
    ):
        """
        High-level function that creates the MadGraph process, all required cards, and prepares or runs the event
        generation for multiple combinations of run_cards or importance samplings (`sample_benchmarks`).

        If `only_prepare_script=True`, the event generation is not run directly, but a bash script is created in
        `<process_folder>/madminer/run.sh` that will start the event generation with the correct settings.

        Parameters
        ----------
        mg_directory : str
            Path to the MadGraph 5 base directory.

        proc_card_file : str
            Path to the process card that tells MadGraph how to generate the process.

        param_card_template_file : str
            Path to a param card that will be used as template to create the appropriate param cards for these runs.

        run_card_files : list of str
            Paths to the MadGraph run cards.

        mg_process_directory : str or None, optional
            Path to the MG process directory. If None, MadMiner uses ./MG_process. Default value: None.

        pythia8_card_file : str, optional
            Path to the MadGraph Pythia8 card. If None, the card present in the process folder is used.
            Default value: None.
        configuration_file : str, optional
            Path to the MadGraph me5_configuration card. If None, the card present in the process folder is used.
            Default value: None.

        sample_benchmarks : list of str or None, optional
            Lists the names of benchmarks that should be used to sample events. A different sampling does not change
            the expected differential cross sections, but will change which regions of phase space have many events
            (small variance) or few events (high variance). If None, a run is started for each of the benchmarks,
            which should map out all regions of phase space well. Default value: None.

        is_background : bool, optional
            Should be True for background processes, i.e. processes in which the differential cross section does not
            depend on the parameters (i.e. is the same for all benchmarks). In this case, no reweighting is run, which
            can substantially speed up the event generation. Default value: False.

        only_prepare_script : bool, optional
            If True, the event generation is not started, but instead a run.sh script is created in the process
            directory. Default value: False.

        ufo_model_directory : str or None, optional
            Path to a UFO model directory that should be used, but is not yet installed in mg_directory/models. The
            model will be copied to the MadGraph model directory before the process directory is generated.
            Default value: None.

        log_directory : str or None, optional
            Directory for log files with the MadGraph output. If None, ./logs is used. Default value: None.

        temp_directory : str or None, optional
            Path to a temporary directory. If None, a system default is used. Default value: None.

        initial_command : str or None, optional
            Initial shell commands that have to be executed before MG is run (e.g. to load a virtual environment). If
            not specified and `python2_override` is True, it adds the user-installed Python2 binaries to the PATH.
            Default value: None.

        python2_override : bool, optional
            If True, MadMiner explicitly calls "python2" instead of relying on the system Python version to be
            Python 2.6 or Python 2.7. If you use systematics, make sure that the python interface of LHAPDF was
            compiled with the Python version you are using. Default: False.

        systematics : None or list of str, optional
            If list of str, defines which systematics are used for these runs. Default value: None.

        Returns
        -------
            None
        """

        # Defaults
        if mg_process_directory is None:
            mg_process_directory = "./MG_process"

        if temp_directory is None:
            temp_directory = tempfile.gettempdir()

        if log_directory is None:
            log_directory = "./logs"

        if sample_benchmarks is None:
            sample_benchmarks = [benchmark for benchmark in self.benchmarks]

        # Gives 'python2_override' full power if 'initial_command' is empty.
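        # Illustrative example (not from the original source; path is a placeholder):
        # if `command -v python2.7` prints /usr/local/bin/python2.7, this block sets
        # initial_command to "export PATH=/usr/local/bin:$PATH".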
# (Reference: https://github.com/diana-hep/madminer/issues/422) if python2_override and initial_command is None: logger.info("Adding Python2.7 bin folder to PATH") binary_path = os.popen("command -v python2.7").read().strip() binary_folder = os.path.dirname(os.path.realpath(binary_path)) initial_command = "export PATH={}:$PATH".format(binary_folder) # Generate process folder log_file_generate = log_directory + "/generate.log" generate_mg_process( mg_directory, temp_directory, proc_card_file, mg_process_directory, ufo_model_directory=ufo_model_directory, initial_command=initial_command, log_file=log_file_generate, explicit_python_call=python2_override, ) # Make MadMiner folders create_missing_folders([ mg_process_directory + "/madminer", mg_process_directory + "/madminer/cards", mg_process_directory + "/madminer/scripts", ]) # Systematics if systematics is None: systematics_used = self.systematics else: systematics_used = OrderedDict() for key in systematics: systematics_used[key] = self.systematics[key] # Loop over settings i = 0 mg_scripts = [] for run_card_file in run_card_files: for sample_benchmark in sample_benchmarks: # Files script_file = "madminer/scripts/run_{}.sh".format(i) log_file_run = "run_{}.log".format(i) mg_commands_filename = "madminer/cards/mg_commands_{}.dat".format( i) param_card_file = "madminer/cards/param_card_{}.dat".format(i) reweight_card_file = "madminer/cards/reweight_card_{}.dat".format( i) new_pythia8_card_file = None if pythia8_card_file is not None: new_pythia8_card_file = "madminer/cards/pythia8_card_{}.dat".format( i) new_run_card_file = None if run_card_file is not None: new_run_card_file = "madminer/cards/run_card_{}.dat".format( i) new_configuration_file = None if configuration_file is not None: new_configuration_file = "madminer/cards/me5_configuration_{}.txt".format( i) logger.info("Run %s", i) logger.info(" Sampling from benchmark: %s", sample_benchmark) logger.info(" Original run card: %s", run_card_file) logger.info(" Original Pythia8 card: %s", pythia8_card_file) logger.info(" Original config card: %s", configuration_file) logger.info(" Copied run card: %s", new_run_card_file) logger.info(" Copied Pythia8 card: %s", new_pythia8_card_file) logger.info(" Copied config card: %s", new_configuration_file) logger.info(" Param card: %s", param_card_file) logger.info(" Reweight card: %s", reweight_card_file) logger.info(" Log file: %s", log_file_run) # Check input if run_card_file is None and self._check_pdf_or_scale_variation( systematics_used): logger.warning( "Warning: No run card given, but PDF or scale variation set up. The correct systematics" " settings are not set automatically. Make sure to set them correctly!" 
                )

            # Create param and reweight cards
            self._export_cards(
                param_card_template_file,
                mg_process_directory,
                sample_benchmark=sample_benchmark,
                param_card_filename=mg_process_directory + "/" + param_card_file,
                reweight_card_filename=mg_process_directory + "/" + reweight_card_file,
            )

            # Create run card
            if run_card_file is not None:
                export_run_card(
                    template_filename=run_card_file,
                    run_card_filename=mg_process_directory + "/" + new_run_card_file,
                    systematics=systematics_used,
                )

            # Copy Pythia card
            if pythia8_card_file is not None:
                copy_file(pythia8_card_file, mg_process_directory + "/" + new_pythia8_card_file)

            # Copy Configuration card
            if configuration_file is not None:
                copy_file(configuration_file, mg_process_directory + "/" + new_configuration_file)

            # Run MG and Pythia
            if only_prepare_script:
                mg_script = setup_mg_with_scripts(
                    mg_process_directory,
                    proc_card_filename_from_mgprocdir=mg_commands_filename,
                    run_card_file_from_mgprocdir=new_run_card_file,
                    param_card_file_from_mgprocdir=param_card_file,
                    reweight_card_file_from_mgprocdir=reweight_card_file,
                    pythia8_card_file_from_mgprocdir=new_pythia8_card_file,
                    configuration_file_from_mgprocdir=new_configuration_file,
                    is_background=is_background,
                    script_file_from_mgprocdir=script_file,
                    initial_command=initial_command,
                    log_dir=log_directory,
                    log_file_from_logdir=log_file_run,
                    explicit_python_call=python2_override,
                )
                mg_scripts.append(mg_script)
            else:
                run_mg(
                    mg_directory,
                    mg_process_directory,
                    mg_process_directory + "/" + mg_commands_filename,
                    mg_process_directory + "/" + new_run_card_file,
                    mg_process_directory + "/" + param_card_file,
                    mg_process_directory + "/" + reweight_card_file,
                    None if new_pythia8_card_file is None else mg_process_directory + "/" + new_pythia8_card_file,
                    None if new_configuration_file is None else mg_process_directory + "/" + new_configuration_file,
                    is_background=is_background,
                    initial_command=initial_command,
                    log_file=log_directory + "/" + log_file_run,
                    explicit_python_call=python2_override,
                )

            i += 1

        n_runs_total = i

        # Master shell script
        if only_prepare_script:
            master_script_filename = "{}/madminer/run.sh".format(mg_process_directory)
            create_master_script(
                log_directory, master_script_filename, mg_directory, mg_process_directory, mg_scripts
            )
            logger.info(
                "To generate events, please run:\n\n %s [MG_directory] [MG_process_directory] [log_dir]\n\n",
                master_script_filename,
            )
        else:
            expected_event_files = [
                mg_process_directory + "/Events/run_{:02d}".format(i + 1) for i in range(n_runs_total)
            ]
            expected_event_files = "\n".join(expected_event_files)
            logger.info(
                "Finished running MadGraph! Please check that events were successfully generated in the following "
                "folders:\n\n%s\n\n",
                expected_event_files,
            )

    def reweight_existing_sample(
        self,
        mg_process_directory,
        run_name,
        param_card_template_file,
        sample_benchmark,
        reweight_benchmarks=None,
        only_prepare_script=False,
        log_directory=None,
        temp_directory=None,
        initial_command=None,
    ):
        """
        High-level function that adds the weights required for MadMiner to an existing sample.

        If `only_prepare_script=True`, the reweighting is not run directly, but a bash script is created in
        `<process_folder>/madminer/run.sh` that will start the reweighting with the correct settings.

        Currently does not support adding systematics.

        Parameters
        ----------
        mg_process_directory : str
            Path to the MG process directory. If None, MadMiner uses ./MG_process.

        run_name : str
            Run name.
        param_card_template_file : str
            Path to a param card that will be used as template to create the appropriate param cards for these runs.

        sample_benchmark : str
            The name of the benchmark used to generate this sample.

        reweight_benchmarks : list of str or None
            Lists the names of benchmarks to which the sample should be reweighted. If None, all benchmarks (except
            sample_benchmark) are used.

        only_prepare_script : bool, optional
            If True, the event generation is not started, but instead a run.sh script is created in the process
            directory. Default value: False.

        log_directory : str or None, optional
            Directory for log files with the MadGraph output. If None, ./logs is used. Default value: None.

        temp_directory : str or None, optional
            Path to a temporary directory. If None, a system default is used. Default value: None.

        initial_command : str or None, optional
            Initial shell commands that have to be executed before MG is run (e.g. to load a virtual environment).
            Default value: None.

        Returns
        -------
            None
        """

        # TODO: check that we don't reweight to benchmarks that already have weights in the LHE file
        # TODO: add systematics

        # Defaults
        if log_directory is None:
            log_directory = "./logs"

        # Make MadMiner folders
        create_missing_folders(
            [
                mg_process_directory + "/madminer",
                mg_process_directory + "/madminer/cards",
                mg_process_directory + "/madminer/scripts",
            ]
        )

        # Files
        script_file = "madminer/scripts/run_reweight.sh"
        log_file_run = "reweight.log"
        reweight_card_file = "madminer/cards/reweight_card_reweight.dat"

        # Missing benchmarks (default: all benchmarks except the sampling benchmark)
        if reweight_benchmarks is None:
            reweight_benchmarks = [b for b in self.benchmarks if b != sample_benchmark]

        missing_benchmarks = OrderedDict()
        for benchmark in reweight_benchmarks:
            missing_benchmarks[benchmark] = self.benchmarks[benchmark]

        # Inform user
        logger.info("Reweighting setup")
        logger.info("  Originally sampled from benchmark: %s", sample_benchmark)
        logger.info("  Now reweighting to benchmarks:     %s", reweight_benchmarks)
        logger.info("  Reweight card:                     %s", reweight_card_file)
        logger.info("  Log file:                          %s", log_file_run)

        # Create param and reweight cards
        self._export_cards(
            param_card_template_file,
            mg_process_directory,
            sample_benchmark=sample_benchmark,
            reweight_card_filename=mg_process_directory + "/" + reweight_card_file,
            include_param_card=False,
            benchmarks=missing_benchmarks,
        )

        # Run reweighting
        if only_prepare_script:
            call_instruction = setup_mg_reweighting_with_scripts(
                mg_process_directory,
                run_name=run_name,
                reweight_card_file_from_mgprocdir=reweight_card_file,
                script_file_from_mgprocdir=script_file,
                initial_command=initial_command,
                log_dir=log_directory,
                log_file_from_logdir=log_file_run,
            )

            logger.info("To generate events, please run:\n\n %s \n\n", call_instruction)

        else:
            run_mg_reweighting(
                mg_process_directory,
                run_name=run_name,
                reweight_card_file=mg_process_directory + "/" + reweight_card_file,
                initial_command=initial_command,
                log_file=log_directory + "/" + log_file_run,
            )
            logger.info(
                "Finished running reweighting! Please check that events were successfully reweighted in the following "
                "folder:\n\n %s/Events/%s \n\n",
                mg_process_directory,
                run_name,
            )

    def _check_pdf_or_scale_variation(self, systematics):
        for value in six.itervalues(systematics):
            if value[0] in ["pdf", "scale"]:
                return True
        return False
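# Illustrative end-to-end sketch (not part of the library; the paths, the
# parameter name, and the values below are placeholders):
#
#     miner = MadMiner()
#     miner.add_parameter(
#         lha_block="dim6",
#         lha_id=2,
#         parameter_name="CWL2",
#         morphing_max_power=2,
#         parameter_range=(-20.0, 20.0),
#     )
#     miner.set_morphing(max_overall_power=4)
#     miner.save("setup.h5")
#     miner.run(
#         mg_directory="/path/to/MG5_aMC",
#         proc_card_file="cards/proc_card.dat",
#         param_card_template_file="cards/param_card_template.dat",
#         run_card_file="cards/run_card.dat",
#     )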
class DataAnalyzer:
    """
    Collects common functionality that is used when analysing data in the MadMiner file.

    Parameters
    ----------
    filename : str
        Path to MadMiner file (for instance the output of `madminer.delphes.DelphesProcessor.save()`).

    disable_morphing : bool, optional
        If True, the morphing setup is not loaded from the file. Default value: False.

    include_nuisance_parameters : bool, optional
        If True, nuisance parameters are taken into account. Default value: True.
    """

    def __init__(self, filename, disable_morphing=False, include_nuisance_parameters=True):
        # Save setup
        self.include_nuisance_parameters = include_nuisance_parameters
        self.madminer_filename = filename

        # Load data
        logger.info("Loading data from %s", filename)
        (
            self.parameters,
            self.benchmarks,
            self.benchmark_nuisance_flags,
            self.morphing_components,
            self.morphing_matrix,
            self.observables,
            self.n_samples,
            self.systematics,
            self.reference_benchmark,
            self.nuisance_parameters,
            self.n_events_generated_per_benchmark,
            self.n_events_backgrounds,
            self.finite_difference_benchmarks,
            self.finite_difference_epsilon,
        ) = load_madminer_settings(filename, include_nuisance_benchmarks=include_nuisance_parameters)

        self.n_observables = len(self.observables)
        self.n_parameters = len(self.parameters)
        self.n_benchmarks = len(self.benchmarks)
        self.n_benchmarks_phys = np.sum(np.logical_not(self.benchmark_nuisance_flags))
        self.n_nuisance_parameters = len(self.nuisance_parameters)

        # Morphing
        self.morpher = None
        if self.morphing_matrix is not None and self.morphing_components is not None and not disable_morphing:
            self.morpher = PhysicsMorpher(self.parameters)
            self.morpher.set_components(self.morphing_components)
            self.morpher.set_basis(self.benchmarks, morphing_matrix=self.morphing_matrix)

        # Nuisance morphing
        self.nuisance_morpher = None
        if self.n_nuisance_parameters > 0:
            self.nuisance_morpher = NuisanceMorpher(
                self.nuisance_parameters,
                self.benchmarks.keys(),
                self.reference_benchmark,
            )

        # Check event numbers
        self._check_n_events()
        self._report_setup()

    def event_loader(
        self,
        start=0,
        end=None,
        batch_size=100000,
        include_nuisance_parameters=None,
        generated_close_to=None,
        return_sampling_ids=False,
    ):
        """
        Yields batches of events in the MadMiner file.

        Parameters
        ----------
        start : int, optional
            First event index to load.

        end : int or None, optional
            Last event index to load.

        batch_size : int, optional
            Batch size.

        include_nuisance_parameters : bool, optional
            Whether nuisance parameter benchmarks are included in the returned data.

        generated_close_to : None or ndarray, optional
            If None, this function yields all events. Otherwise, it only yields the events that were generated at the
            benchmark point closest to the given parameter point.

        return_sampling_ids : bool, optional
            If True, the iterator returns the sampling IDs in addition to observables and weights.

        Yields
        ------
        observations : ndarray
            Event data.

        weights : ndarray
            Event weights.

        sampling_ids : int
            Sampling IDs (benchmark used for sampling for signal events, -1 for background events). Only returned if
            return_sampling_ids = True was set.
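
        Examples
        --------
        A minimal sketch (the file name `data.h5` is a placeholder, not part of the library):

        >>> analyzer = DataAnalyzer("data.h5")
        >>> for observations, weights in analyzer.event_loader(batch_size=10000):
        ...     print(observations.shape, weights.shape)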
""" if include_nuisance_parameters is None: include_nuisance_parameters = self.include_nuisance_parameters sampling_benchmark = self._find_closest_benchmark(generated_close_to) logger.debug( f"Sampling benchmark closest to {generated_close_to}: {sampling_benchmark}" ) if sampling_benchmark is None: sampling_factors = self._calculate_sampling_factors() else: sampling_factors = np.ones(self.n_benchmarks_phys + 1) for data in load_events( file_name=self.madminer_filename, start_index=start, final_index=end, batch_size=batch_size, benchmark_nuisance_flags=self.benchmark_nuisance_flags, sampling_benchmark=sampling_benchmark, sampling_factors=sampling_factors, include_nuisance_params=include_nuisance_parameters, include_sampling_ids=return_sampling_ids, ): yield data def weighted_events( self, theta=None, nu=None, start_event=None, end_event=None, derivative=False, generated_close_to=None, n_draws=None, ): """ Returns all events together with the benchmark weights (if theta is None) or weights for a given theta. Parameters ---------- theta : None or ndarray or str, optional If None, the function returns all benchmark weights. If str, the function returns the weights for a given benchmark name. If ndarray, it uses morphing to calculate the weights for this value of theta. Default value: None. nu : None or ndarray, optional If None, the nuisance parameters are set to their nominal values. Otherwise, and if theta is an ndarray, sets the values of the nuisance parameters. start_event : int Index (in the MadMiner file) of the first event to consider. end_event : int Index (in the MadMiner file) of the last unweighted event to consider. derivative : bool, optional If True and if theta is not None, the derivative of the weights with respect to theta are returned. Default value: False. generated_close_to : None or int, optional Only returns benchmarks generated from this benchmark (and background events). Default value: None. n_draws : None or int, optional If not None, returns only this number of events, drawn randomly. Returns ------- x : ndarray Observables with shape `(n_unweighted_samples, n_observables)`. weights : ndarray If theta is None and derivative is False, benchmark weights with shape `(n_unweighted_samples, n_benchmarks)` in pb. If theta is not None and derivative is True, the gradient of the weight for the given parameter with respect to theta with shape `(n_unweighted_samples, n_gradients)` in pb. Otherwise, weights for the given parameter theta with shape `(n_unweighted_samples,)` in pb. 
""" x, weights_benchmarks = next( self.event_loader( start=start_event, end=end_event, batch_size=None, generated_close_to=generated_close_to, )) # Pick events randomly n_events = len(x) if n_draws is not None and n_draws < n_events: idx = np.random.choice(n_events, n_draws, replace=False) x = x[idx] weights_benchmarks = weights_benchmarks[idx] elif n_draws is not None: logger.warning( f"Requested {n_draws} events, but only {n_events} available") # Process and return appropriate weights if theta is None: return x, weights_benchmarks elif isinstance(theta, str): i_benchmark = list(self.benchmarks.keys()).index(theta) return x, weights_benchmarks[:, i_benchmark] elif derivative: dtheta_matrix = self._get_dtheta_benchmark_matrix(theta) gradients_theta = mdot( dtheta_matrix, weights_benchmarks) # (n_gradients, n_samples) gradients_theta = gradients_theta.T return x, gradients_theta else: # TODO: nuisance params if nu is not None: raise NotImplementedError() theta_matrix = self._get_theta_benchmark_matrix(theta) weights_theta = mdot(theta_matrix, weights_benchmarks) return x, weights_theta def xsecs( self, thetas=None, nus=None, partition="all", test_split=0.2, validation_split=0.2, include_nuisance_benchmarks=True, batch_size=100000, generated_close_to=None, ): """ Returns the total cross sections for benchmarks or parameter points. Parameters ---------- thetas : None or list of (ndarray or str), optional If None, the function returns all benchmark cross sections. Otherwise, it returns the cross sections for a series of parameter points that are either given by their benchmark name (as a str), their benchmark index (as an int), or their parameter value (as an ndarray, using morphing). Default value: None. nus : None or list of (None or ndarray), optional If None, the nuisance parameters are set to their nominal values (0), i.e. no systematics are taken into account. Otherwise, the list has to have the same number of elements as thetas, and each entry can specify nuisance parameters at nominal value (None) or a value of the nuisance parameters (ndarray). partition : {"train", "test", "validation", "all"}, optional Which event partition to use. Default: "all". test_split : float, optional Fraction of events reserved for testing. Default value: 0.2. validation_split : float, optional Fraction of weighted events reserved for validation. Default value: 0.2. include_nuisance_benchmarks : bool, optional Whether to include nuisance benchmarks if thetas is None. Default value: True. batch_size : int, optional Size of the batches of events that are loaded into memory at the same time. Default value: 100000. generated_close_to : None or ndarray, optional If not None, only events originally generated from the closest benchmark to this parameter point will be used. Default value : None. Returns ------- xsecs : ndarray Calculated cross sections in pb. xsec_uncertainties : ndarray Cross-section uncertainties in pb. Basically calculated as sum(weights**2)**0.5. """ logger.debug("Calculating cross sections for thetas = %s and nus = %s", thetas, nus) # Inputs if thetas is not None: include_nuisance_benchmarks = True if thetas is not None: if nus is None: nus = [None for _ in thetas] assert len(nus) == len( thetas), "Numbers of thetas and nus don't match!" 
# Which events to use if partition == "all": start_event, end_event = None, None correction_factor = 1.0 elif partition in ["train", "validation", "test"]: start_event, end_event, correction_factor = self._train_validation_test_split( partition, test_split, validation_split) else: raise ValueError(f"Invalid partition type: {partition}") # Theta matrices (translation of benchmarks to theta, at nominal nuisance params) if thetas is None: theta_matrices = np.identity(self.n_benchmarks) else: theta_matrices = [ self._get_theta_benchmark_matrix(theta) for theta in thetas ] theta_matrices = np.asarray( theta_matrices) # Shape (n_thetas, n_benchmarks) # Loop over events xsecs = 0.0 xsec_uncertainties = 0.0 n_events = 0 for i_batch, (_, benchmark_weights) in enumerate( self.event_loader( start=start_event, end=end_event, include_nuisance_parameters=include_nuisance_benchmarks, batch_size=batch_size, generated_close_to=generated_close_to, )): n_batch, _ = benchmark_weights.shape n_events += n_batch # Benchmark xsecs if thetas is None: xsecs += np.sum(benchmark_weights, axis=0) xsec_uncertainties += np.sum(benchmark_weights * benchmark_weights, axis=0) # xsecs at given parameters(theta, nu) else: # Weights at nominal nuisance params (nu=0) weights_nom = mdot( theta_matrices, benchmark_weights) # Shape (n_thetas, n_batch) weights_sq_nom = mdot(theta_matrices, benchmark_weights * benchmark_weights) # same # Effect of nuisance parameters nuisance_factors = self._calculate_nuisance_factors( nus, benchmark_weights) weights = nuisance_factors * weights_nom weights_sq = nuisance_factors * weights_sq_nom # Sum up xsecs += np.sum(weights, axis=1) xsec_uncertainties += np.sum(weights_sq, axis=1) if n_events == 0: raise RuntimeError( f"Did not find events with test_split = {test_split} " f"and generated_close_to = {generated_close_to}") xsec_uncertainties = np.maximum(xsec_uncertainties, 0.0)**0.5 # Correct for not using all events xsecs *= correction_factor xsec_uncertainties *= correction_factor logger.debug("xsecs and uncertainties [pb]:") for this_xsec, this_uncertainty in zip(xsecs, xsec_uncertainties): logger.debug(" (%4f +/- %4f) pb (%4f %%)", this_xsec, this_uncertainty, 100 * this_uncertainty / this_xsec) return xsecs, xsec_uncertainties def xsec_gradients( self, thetas, nus=None, partition="all", test_split=0.2, validation_split=0.2, gradients="all", batch_size=100000, generated_close_to=None, ): """ Returns the gradient of total cross sections with respect to parameters. Parameters ---------- thetas : list of (ndarray or str), optional If None, the function returns all benchmark cross sections. Otherwise, it returns the cross sections for a series of parameter points that are either given by their benchmark name (as a str), their benchmark index (as an int), or their parameter value (as an ndarray, using morphing). Default value: None. nus : None or list of (None or ndarray), optional If None, the nuisance parameters are set to their nominal values (0), i.e. no systematics are taken into account. Otherwise, the list has to have the same number of elements as thetas, and each entry can specify nuisance parameters at nominal value (None) or a value of the nuisance parameters (ndarray). partition : {"train", "test", "validation", "all"}, optional Which events to use. Default: "all". test_split : float, optional Fraction of events reserved for testing. Default value: 0.2. validation_split : float, optional Fraction of weighted events reserved for validation. Default value: 0.2. 
gradients : {"all", "theta", "nu"}, optional Which gradients to calculate. Default value: "all". batch_size : int, optional Size of the batches of events that are loaded into memory at the same time. Default value: 100000. generated_close_to : None or ndarray, optional If not None, only events originally generated from the closest benchmark to this parameter point will be used. Default value : None. Returns ------- xsecs_gradients : ndarray Calculated cross section gradients in pb with shape (n_gradients,). """ logger.debug( f"Calculating cross section gradients for thetas = {thetas} and nus = {nus}" ) # Inputs include_nuisance_benchmarks = nus is not None or gradients in [ "all", "nu" ] if nus is None: nus = [None for _ in thetas] assert len(nus) == len( thetas), "Numbers of thetas and nus don't match!" if gradients not in ["all", "theta", "nu"]: raise RuntimeError(f"Invalid gradients type: {gradients}") # Which events to use if partition == "all": start_event, end_event = None, None correction_factor = 1.0 elif partition in ["train", "validation", "test"]: start_event, end_event, correction_factor = self._train_validation_test_split( partition, test_split, validation_split) else: raise ValueError(f"Invalid partition type: {partition}") # Theta matrices (translation of benchmarks to theta, at nominal nuisance params) theta_matrices = np.asarray([ self._get_theta_benchmark_matrix(theta) for theta in thetas ]) # shape (n_thetas, n_benchmarks) theta_gradient_matrices = np.asarray([ self._get_dtheta_benchmark_matrix(theta) for theta in thetas ]) # shape (n_thetas, n_gradients, n_benchmarks) # Loop over events xsec_gradients = 0.0 for i_batch, (_, benchmark_weights) in enumerate( self.event_loader( start=start_event, end=end_event, include_nuisance_parameters=include_nuisance_benchmarks, batch_size=batch_size, generated_close_to=generated_close_to, )): n_batch, _ = benchmark_weights.shape logger.debug(f"Batch {i_batch+1} with {n_batch} events") if gradients in ["all", "theta"]: nom_gradients = mdot( theta_gradient_matrices, benchmark_weights ) # Shape (n_thetas, n_phys_gradients, n_batch) nuisance_factors = self._calculate_nuisance_factors( nus, benchmark_weights) # Shape (n_thetas, n_batch) try: dweight_dtheta = nuisance_factors[:, np. newaxis, :] * nom_gradients except TypeError: dweight_dtheta = nom_gradients if gradients in ["all", "nu"]: weights_nom = mdot( theta_matrices, benchmark_weights) # Shape (n_thetas, n_batch) nuisance_factor_gradients = np.asarray([ self.nuisance_morpher.calculate_nuisance_factor_gradients( nu, benchmark_weights) for nu in nus ]) # Shape (n_thetas, n_nuisance_gradients, n_batch) dweight_dnu = nuisance_factor_gradients * weights_nom[:, np. newaxis, :] if gradients == "all": dweight_dall = np.concatenate((dweight_dtheta, dweight_dnu), 1) elif gradients == "theta": dweight_dall = dweight_dtheta elif gradients == "nu": dweight_dall = dweight_dnu xsec_gradients += np.sum(dweight_dall, axis=2) # Correct for not using all events xsec_gradients *= correction_factor return xsec_gradients def _check_n_events(self): n_events_check = \ sum(self.n_events_generated_per_benchmark) \ + self.n_events_backgrounds if self.n_samples != n_events_check: logger.warning( "Inconsistent event numbers in HDF5 file! 
    def _check_n_events(self):
        n_events_check = sum(self.n_events_generated_per_benchmark) + self.n_events_backgrounds
        if self.n_samples != n_events_check:
            logger.warning(
                "Inconsistent event numbers in HDF5 file! Please recalculate them by calling "
                "combine_and_shuffle(recalculate_header=True).")

    def _report_setup(self):
        logger.info(f"Found {self.n_parameters} parameters")
        for i, param in enumerate(self.parameters.values()):
            logger.info("  %s: %s", i, param)

        if self.n_nuisance_parameters > 0:
            logger.info(f"Found {self.n_nuisance_parameters} nuisance parameters")
            for i, param in enumerate(self.nuisance_parameters.values()):
                logger.info("  %s: %s", i, param)
        else:
            logger.info("Did not find nuisance parameters")

        logger.info(f"Found {self.n_benchmarks} benchmarks")
        for benchmark in self.benchmarks.values():
            if benchmark.is_nuisance:
                logger.debug("  %s: systematics", benchmark.name)
            else:
                logger.debug("  %s", benchmark)

        logger.info(f"Found {self.n_observables} observables")
        for i, obs in enumerate(self.observables):
            logger.debug("  %2.2s %s", i, obs)

        logger.info(f"Found {self.n_samples} events")
        if len(self.n_events_generated_per_benchmark) > 0:
            for events, name in zip(self.n_events_generated_per_benchmark, self.benchmarks.keys()):
                logger.info("  %s signal events sampled from benchmark %s", events, name)
            if self.n_events_backgrounds is not None and self.n_events_backgrounds > 0:
                logger.info("  %s background events", self.n_events_backgrounds)
        else:
            logger.debug("  Did not find sample summary information")

        if self.morpher is not None:
            logger.info("Found morphing setup with %s components", len(self.morphing_components))
        else:
            logger.info("Did not find morphing setup.")

        if self.nuisance_morpher is not None:
            logger.info("Found nuisance morphing setup")
        else:
            logger.info("Did not find nuisance morphing setup")

    def _calculate_nuisance_factors(self, nus, benchmark_weights):
        if self._any_nontrivial_nus(nus):
            return np.asarray([
                self.nuisance_morpher.calculate_nuisance_factors(nu, benchmark_weights)
                for nu in nus
            ])  # Shape (n_thetas, n_batch)
        else:
            return 1.0

    def _finite_differences_theta_gradient_matrices(self):
        """ Constructs the matrix that translates benchmark weights to the gradient of the weight evaluated at the benchmarks """
        assert self.finite_difference_benchmarks is not None
        assert self.finite_difference_epsilon is not None

        matrix = np.zeros(
            (self.n_benchmarks, self.n_parameters, self.n_benchmarks)
        )  # (n_thetas, n_gradients, n_benchmarks)

        benchmark_names = list(self.benchmarks.keys())

        # For each benchmark i and parameter p, we look for the benchmark j shifted by epsilon
        # in parameter p and set matrix[i, p, j] = +1 / eps and matrix[i, p, i] = -1 / eps
        for i, b_name in enumerate(benchmark_names):
            # For the FD-shifted benchmarks, we assume that the gradients are
            # the same as at the original point, and will just copy the matrix later
            copy_to = []

            if b_name not in self.finite_difference_benchmarks.keys():
                continue

            for p, p_name in enumerate(self.parameters.keys()):
                shifted_benchmark_dict = self.finite_difference_benchmarks[b_name].shift_names
                shifted_benchmark_name = shifted_benchmark_dict[p_name]
                j = benchmark_names.index(shifted_benchmark_name)
                copy_to.append(j)

                matrix[i, p, j] = +1.0 / self.finite_difference_epsilon
                matrix[i, p, i] = -1.0 / self.finite_difference_epsilon

            for j in copy_to:
                matrix[j, :, :] = matrix[i, :, :]

        return matrix

    @staticmethod
    def _any_nontrivial_nus(nus):
        if nus is None:
            return False
        for nu in nus:
            if nu is not None:
                return True
        return False

    def _derivative_mode(self):
        if self.morpher is not None:
            mode = "morphing"
        elif self.finite_difference_benchmarks is not None:
            mode = "fd"
        else:
            raise RuntimeError(
                "Cannot compute xsec gradients when neither morphing nor finite differences "
                "are correctly set up!")
        return mode
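    # A worked toy case for the finite-difference matrix above (names and
    # numbers are illustrative): with one parameter, finite_difference_epsilon = 0.1,
    # and benchmarks ["nominal", "nominal_shifted"], the row for "nominal" is
    #
    #     matrix[0, 0, :] = [-10.0, +10.0]
    #
    # so that mdot(matrix[0], benchmark_weights) approximates the forward
    # finite difference (w_shifted - w_nominal) / epsilon.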
    def _weights(self, thetas, nus, benchmark_weights, theta_matrices=None):
        """
        Turns benchmark weights into weights for given parameter points (theta, nu).

        Parameters
        ----------
        thetas : list of (ndarray or str)
            Parameter points, each given by a benchmark name (as a str), a benchmark index
            (as an int), or a parameter value (as an ndarray, using morphing).

        nus : None or list of (None or ndarray)
            If None, the nuisance parameters are set to their nominal values (0), i.e. no
            systematics are taken into account. Otherwise, the list has to have the same number of
            elements as thetas, and each entry can specify nuisance parameters at nominal value
            (None) or a value of the nuisance parameters (ndarray).

        benchmark_weights : ndarray
            Benchmark weights with shape `(n_events, n_benchmarks)`.

        theta_matrices : None or ndarray, optional
            Precalculated theta matrices with shape `(n_thetas, n_benchmarks)`. If None, they are
            calculated from thetas. Default value: None.

        Returns
        -------
        weights : ndarray
            Calculated weights in pb.
        """
        n_events, _ = benchmark_weights.shape

        # Inputs
        if nus is None:
            nus = [None for _ in thetas]
        assert len(nus) == len(thetas), "Numbers of thetas and nus don't match!"

        # Theta matrices (translation of benchmarks to theta, at nominal nuisance params)
        if theta_matrices is None:
            theta_matrices = [self._get_theta_benchmark_matrix(theta) for theta in thetas]
        theta_matrices = np.asarray(theta_matrices)  # Shape (n_thetas, n_benchmarks)

        # Weights at nominal nuisance params (nu=0)
        weights_nom = mdot(theta_matrices, benchmark_weights)  # Shape (n_thetas, n_batch)

        # Effect of nuisance parameters
        nuisance_factors = self._calculate_nuisance_factors(nus, benchmark_weights)
        weights = nuisance_factors * weights_nom

        return weights
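    # A toy illustration of the weight translation in _weights (numbers are
    # made up): with two benchmarks and a single theta matrix [0.75, 0.25],
    # an event with benchmark weights [2.0, 4.0] pb gets
    #
    #     0.75 * 2.0 + 0.25 * 4.0 = 2.5 pb
    #
    # i.e. each event weight at theta is a linear combination of the stored
    # benchmark weights, before any nuisance factors are applied.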
    def _weight_gradients(
        self,
        thetas,
        nus,
        benchmark_weights,
        gradients="all",
        theta_matrices=None,
        theta_gradient_matrices=None,
    ):
        """
        Turns benchmark weights into weight gradients for given parameter points (theta, nu).

        Parameters
        ----------
        thetas : list of (ndarray or str)
            Parameter points, each given by a benchmark name (as a str), a benchmark index
            (as an int), or a parameter value (as an ndarray, using morphing).

        nus : None or list of (None or ndarray)
            If None, the nuisance parameters are set to their nominal values (0), i.e. no
            systematics are taken into account. Otherwise, the list has to have the same number of
            elements as thetas, and each entry can specify nuisance parameters at nominal value
            (None) or a value of the nuisance parameters (ndarray).

        gradients : {"all", "theta", "nu"}, optional
            Which gradients to calculate. Default value: "all".

        Returns
        -------
        gradients : ndarray
            Calculated gradients in pb.
        """
        n_events, _ = benchmark_weights.shape

        # Inputs
        if gradients == "all" and self.n_nuisance_parameters == 0:
            gradients = "theta"
        if nus is None:
            nus = [None for _ in thetas]
        assert len(nus) == len(thetas), "Numbers of thetas and nus don't match!"

        # Theta matrices (translation of benchmarks to theta, at nominal nuisance params)
        if theta_matrices is None:
            theta_matrices = [self._get_theta_benchmark_matrix(theta) for theta in thetas]
        if theta_gradient_matrices is None:
            theta_gradient_matrices = [self._get_dtheta_benchmark_matrix(theta) for theta in thetas]
        theta_matrices = np.asarray(theta_matrices)  # Shape (n_thetas, n_benchmarks)
        theta_gradient_matrices = np.asarray(
            theta_gradient_matrices)  # Shape (n_thetas, n_gradients, n_benchmarks)

        # Calculate theta gradient
        if gradients in ["all", "theta"]:
            nom_gradients = mdot(
                theta_gradient_matrices, benchmark_weights
            )  # Shape (n_thetas, n_phys_gradients, n_batch)
            nuisance_factors = self._calculate_nuisance_factors(nus, benchmark_weights)
            try:
                dweight_dtheta = nuisance_factors[:, np.newaxis, :] * nom_gradients
            except TypeError:
                # nuisance_factors is the scalar 1.0 when no nontrivial nus
                # are given, in which case it cannot be indexed
                dweight_dtheta = nom_gradients
        else:
            dweight_dtheta = None

        # Calculate nu gradient
        if gradients in ["all", "nu"]:
            weights_nom = mdot(theta_matrices, benchmark_weights)  # Shape (n_thetas, n_batch)
            nuisance_factor_gradients = np.asarray([
                self.nuisance_morpher.calculate_nuisance_factor_gradients(nu, benchmark_weights)
                for nu in nus
            ])  # Shape (n_thetas, n_nuisance_gradients, n_batch)
            dweight_dnu = nuisance_factor_gradients * weights_nom[:, np.newaxis, :]
        else:
            dweight_dnu = None

        if gradients == "theta":
            return dweight_dtheta
        elif gradients == "nu":
            return dweight_dnu
        return np.concatenate((dweight_dtheta, dweight_dnu), 1)

    def _train_test_split(self, train, test_split):
        """
        Returns the start and end event for train samples (train = True) or test samples (train = False).

        Parameters
        ----------
        train : bool
            True if training data is generated, False if test data is generated.

        test_split : float
            Fraction of events reserved for testing.

        Returns
        -------
        start_event : int
            Index (in the MadMiner file) of the first event to consider.

        end_event : int
            Index (in the MadMiner file) of the last unweighted event to consider.

        correction_factor : float
            Factor with which the weights and cross sections will have to be multiplied to make up
            for the missing events.
        """
        if train:
            start_event = 0

            if test_split is None or test_split <= 0.0 or test_split >= 1.0:
                end_event = None
                correction_factor = 1.0
            else:
                end_event = int(round((1.0 - test_split) * self.n_samples, 0))
                correction_factor = 1.0 / (1.0 - test_split)
                if end_event < 0 or end_event > self.n_samples:
                    raise ValueError(f"Irregular split: sample {end_event} / {self.n_samples}")

        else:
            if test_split is None or test_split <= 0.0 or test_split >= 1.0:
                start_event = 0
                correction_factor = 1.0
            else:
                start_event = int(round((1.0 - test_split) * self.n_samples, 0)) + 1
                correction_factor = 1.0 / test_split
                if start_event < 0 or start_event > self.n_samples:
                    raise ValueError(f"Irregular split: sample {start_event} / {self.n_samples}")

            end_event = None

        return start_event, end_event, correction_factor
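    # Worked numbers for the two-way split above: with n_samples = 1000 and
    # test_split = 0.2, the train partition gets start_event = 0,
    # end_event = 800, correction_factor = 1.25, while the test partition gets
    # start_event = 801, end_event = None, correction_factor = 5.0.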
    def _train_validation_test_split(self, partition, test_split, validation_split):
        """
        Returns the start and end event for the train, validation, or test partition.

        Parameters
        ----------
        partition : {"train", "validation", "test"}
            Which partition to consider.

        test_split : float
            Fraction of events reserved for testing.

        validation_split : float
            Fraction of events reserved for validation.

        Returns
        -------
        start_event : int
            Index (in the MadMiner file) of the first event to consider.

        end_event : int
            Index (in the MadMiner file) of the last unweighted event to consider.

        correction_factor : float
            Factor with which the weights and cross sections will have to be multiplied to make up
            for the missing events.
        """
        if test_split is None or test_split < 0.0:
            test_split = 0.0
        if validation_split is None or validation_split < 0.0:
            validation_split = 0.0
        assert test_split + validation_split <= 1.0
        train_split = 1.0 - test_split - validation_split

        if partition == "train":
            start_event = 0

            if test_split <= 0.0 or test_split >= 1.0:
                end_event = None
                correction_factor = 1.0
            else:
                end_event = int(round(train_split * self.n_samples, 0))
                correction_factor = 1.0 / train_split
                if end_event < 0 or end_event > self.n_samples:
                    raise ValueError(f"Irregular split: sample {end_event} / {self.n_samples}")

        elif partition == "validation":
            if validation_split <= 0.0 or validation_split >= 1.0:
                start_event = 0
                end_event = None
                correction_factor = 1.0
            else:
                start_event = int(round(train_split * self.n_samples, 0)) + 1
                end_event = int(round((1.0 - test_split) * self.n_samples, 0))
                correction_factor = 1.0 / validation_split

                if start_event < 0 or start_event > self.n_samples:
                    raise ValueError(f"Irregular split: sample {start_event} / {self.n_samples}")
                if end_event < 0 or end_event > self.n_samples:
                    raise ValueError(f"Irregular split: sample {end_event} / {self.n_samples}")

        elif partition == "test":
            end_event = None

            if test_split <= 0.0 or test_split >= 1.0:
                start_event = 0
                correction_factor = 1.0
            else:
                start_event = int(round((1.0 - test_split) * self.n_samples, 0)) + 1
                correction_factor = 1.0 / test_split
                if start_event < 0 or start_event > self.n_samples:
                    raise ValueError(f"Irregular split: sample {start_event} / {self.n_samples}")

        else:
            raise RuntimeError(f"Unknown partition {partition}")

        return start_event, end_event, correction_factor

    def _get_theta_value(self, theta):
        if isinstance(theta, str):
            benchmark = self.benchmarks[theta]
            theta_value = np.array([val for val in benchmark.values.values()])
        elif isinstance(theta, int):
            benchmark = self.benchmarks[list(self.benchmarks.keys())[theta]]
            theta_value = np.array([val for val in benchmark.values.values()])
        else:
            theta_value = np.asarray(theta)

        return theta_value

    def _get_nu_value(self, nu):
        if nu is None:
            nu_value = np.zeros(self.n_nuisance_parameters)
        else:
            nu_value = np.asarray(nu)

        return nu_value

    def _get_theta_benchmark_matrix(self, theta, zero_pad=True):
        """Calculates vector A such that dsigma(theta) = A * dsigma_benchmarks"""
        if zero_pad:
            unpadded_theta_matrix = self._get_theta_benchmark_matrix(theta, zero_pad=False)
            theta_matrix = np.zeros(self.n_benchmarks)
            theta_matrix[: unpadded_theta_matrix.shape[0]] = unpadded_theta_matrix
        elif isinstance(theta, str):
            i_benchmark = list(self.benchmarks).index(theta)
            theta_matrix = self._get_theta_benchmark_matrix(i_benchmark)
        elif isinstance(theta, int):
            n_benchmarks = len(self.benchmarks)
            theta_matrix = np.zeros(n_benchmarks)
            theta_matrix[theta] = 1.0
        else:
            theta_matrix = self.morpher.calculate_morphing_weights(theta)

        return theta_matrix
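    # For example (benchmark names are illustrative): with benchmarks
    # ["sm", "bsm1", "bsm2"], _get_theta_benchmark_matrix("bsm1") returns the
    # one-hot vector [0., 1., 0.], while an ndarray theta is translated via the
    # morphing weights instead.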
    def _get_dtheta_benchmark_matrix(self, theta, zero_pad=True):
        """Calculates matrix A_ij such that d dsigma(theta) / d theta_i = A_ij * dsigma (benchmark j)"""
        mode = self._derivative_mode()

        if zero_pad:
            unpadded_theta_matrix = self._get_dtheta_benchmark_matrix(theta, zero_pad=False)
            dtheta_matrix = np.zeros((unpadded_theta_matrix.shape[0], self.n_benchmarks))
            dtheta_matrix[:, : unpadded_theta_matrix.shape[1]] = unpadded_theta_matrix
        elif isinstance(theta, str) and mode == "morphing":
            benchmark = self.benchmarks[theta]
            benchmark = np.array([val for val in benchmark.values.values()])
            dtheta_matrix = self._get_dtheta_benchmark_matrix(benchmark)
        elif isinstance(theta, int) and mode == "morphing":
            benchmark = self.benchmarks[list(self.benchmarks.keys())[theta]]
            benchmark = np.array([val for val in benchmark.values.values()])
            dtheta_matrix = self._get_dtheta_benchmark_matrix(benchmark)
        elif isinstance(theta, str):
            benchmark_id = list(self.benchmarks.keys()).index(theta)
            dtheta_matrix = self._get_dtheta_benchmark_matrix(benchmark_id)
        elif isinstance(theta, int):
            # Finite differences
            # TODO: avoid constructing the full matrix every time
            dtheta_matrix = self._finite_differences_theta_gradient_matrices()[theta]
        else:
            if mode == "fd":
                raise RuntimeError("Cannot calculate score for arbitrary parameter points without morphing setup")
            # Shape (n_parameters, n_benchmarks_phys)
            dtheta_matrix = self.morpher.calculate_morphing_weight_gradient(theta)

        return dtheta_matrix

    def _calculate_sampling_factors(self):
        events = np.asarray(self.n_events_generated_per_benchmark, dtype=np.float64)
        logger.debug(f"Events per benchmark: {events}")
        factors = events / np.sum(events)
        factors = np.hstack((factors, 1.0))  # Background events
        return factors

    def _find_closest_benchmark(self, theta):
        if theta is None:
            return None

        benchmarks = self._benchmark_array()[: self.n_benchmarks_phys]
        distances = np.asarray([np.linalg.norm(benchmark - theta) for benchmark in benchmarks])

        # Don't use benchmarks where we don't actually have events
        if len(self.n_events_generated_per_benchmark) > 0:
            events_per_benchmark = np.asarray(self.n_events_generated_per_benchmark)
            distances = distances + 1.0e9 * (events_per_benchmark == 0).astype(np.float64)

        closest_idx = np.argmin(distances)
        return closest_idx

    def _benchmark_array(self):
        return np.asarray(
            [list(benchmark.values.values()) for benchmark in self.benchmarks.values()]
        )
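# A minimal end-to-end sketch of how this class is typically used. The file
# name and parameter values are illustrative, and `xsec` refers to the
# cross-section method defined earlier in this class:
#
#     analyzer = DataAnalyzer("madminer_data.h5")
#
#     for observations, weights in analyzer.event_loader(batch_size=10000):
#         pass  # observations: (n_batch, n_observables), weights: (n_batch, n_benchmarks)
#
#     xsecs, uncertainties = analyzer.xsec(thetas=[np.array([0.0, 0.5])])
#     grads = analyzer.xsec_gradients(thetas=[np.array([0.0, 0.5])], gradients="all")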