def test_simfactory_factory_split(sim_factory):
    """Test function splitting is done properly."""
    # With yields
    syst_fac = ToyRandomizer(sim_factory)
    assert list(syst_fac._gen_pdfs.keys()) == ['label1', 'label2']
    assert len(syst_fac._gen_pdfs['label1']) == 2
    assert syst_fac._gen_pdfs['label1'][0].expectedEvents(
        list_to_rooargset(sim_factory.get_observables())) == 999
    assert syst_fac._gen_pdfs['label1'][1].expectedEvents(
        list_to_rooargset(sim_factory.get_observables())) == 320
    assert len(syst_fac._gen_pdfs['label2']) == 1
    assert syst_fac._gen_pdfs['label2'][0].expectedEvents(
        list_to_rooargset(sim_factory.get_observables())) == 231

def get_constraints(self):
    """Get the constraints of this factory merged with those of its children."""
    child_constraints = list_to_rooargset(self._constraints)
    for child in self.get_children().values():
        tmp_constraints = child.get_constraints()
        if tmp_constraints:
            # Reuse the already-retrieved set instead of calling get_constraints() again
            child_constraints = ROOT.RooArgSet(child_constraints, tmp_constraints)
    return child_constraints

def test_prodfactory_split(prod_factory):
    """Test function splitting is done properly."""
    syst_fac = ToyRandomizer(prod_factory)
    assert list(syst_fac._gen_pdfs.keys()) == [None]
    assert len(syst_fac._gen_pdfs[None]) == 1
    assert syst_fac._gen_pdfs[None][0].expectedEvents(
        list_to_rooargset(prod_factory.get_observables())) == 999

def test_sumfactory_factory_split(sum_factory, sum_factory_frac):
    """Test function splitting is done properly."""
    # With yields
    syst_fac = ToyRandomizer(sum_factory)
    assert list(syst_fac._gen_pdfs.keys()) == [None]
    assert len(syst_fac._gen_pdfs[None]) == 2
    assert syst_fac._gen_pdfs[None][0].expectedEvents(
        list_to_rooargset(sum_factory.get_observables())) == 999
    assert syst_fac._gen_pdfs[None][1].expectedEvents(
        list_to_rooargset(sum_factory.get_observables())) == 999
    # With fraction
    syst_fac = ToyRandomizer(sum_factory_frac, {'yield': 100})
    assert list(syst_fac._gen_pdfs.keys()) == [None]
    assert len(syst_fac._gen_pdfs[None]) == 1
    assert syst_fac._gen_pdfs[None][0].expectedEvents(
        list_to_rooargset(sum_factory_frac.get_observables())) == 100

def get_dataset(self, randomize=True):
    """Get dataset generated from the input model.

    If an acceptance was given on initialization, accept-reject is applied on the
    dataset, and an extra variable (`fit_weight`), representing the inverse of the
    per-event weight, is added as the dataset weight.

    Arguments:
        randomize (bool, optional): Randomize the parameters? Defaults to `True`.

    Return:
        `ROOT.RooDataSet`.

    """
    import ROOT

    # TODO: Add weights?
    if randomize:
        logger.debug("Applying randomization")
        self.randomize()
    obs = list_to_rooargset(self._model.get_observables())
    datasets_to_merge = []
    cats = list_to_rooarglist(self._model.get_category_vars())
    for label, pdf_list in self._gen_pdfs.items():
        if cats:
            for lab_num, lab in enumerate(label.split(',')):
                cats[lab_num].setLabel(lab)
        for pdf in pdf_list:
            logger.debug("Generating PDF -> %s", pdf.GetName())
            if self._gen_acceptance:
                # TODO: Fixed yields
                yield_to_generate = poisson.rvs(pdf.expectedEvents(obs))
                pandas_dataset = None
                while yield_to_generate:
                    events = self._gen_acceptance.apply_accept_reject(
                        pandas_from_dataset(
                            pdf.generate(obs, yield_to_generate * 2)))
                    # Sample if the dataset is too large
                    if events.shape[0] > yield_to_generate:
                        events = events.sample(yield_to_generate)
                    # Merge with the existing dataset (use an explicit None check,
                    # since the truth value of a DataFrame is ambiguous)
                    if pandas_dataset is None:
                        pandas_dataset = events
                    else:
                        pandas_dataset = pandas_dataset.append(events, ignore_index=True)
                    yield_to_generate -= len(events)
                logger.debug("Adding fitting weights")
                pandas_dataset['fit_weight'] = self._fit_acceptance.get_fit_weights(pandas_dataset)
                dataset = dataset_from_pandas(pandas_dataset, "GenData", "GenData",
                                              weight_var='fit_weight')
            else:
                dataset = pdf.generate(obs, ROOT.RooFit.Extended(True))
            if cats:
                dataset.addColumns(cats)
            datasets_to_merge.append(dataset)
    return merge_root(datasets_to_merge, 'GenData', 'GenData')

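# --- Usage sketch (illustrative only, not part of the library) ---------------
# A minimal example of how `ToyRandomizer.get_dataset` might be driven.
# `build_factory` is a placeholder for whatever builds a configured physics
# factory in the caller's code; it is an assumption, not a library function.
def _example_toy_generation(build_factory):
    """Generate one randomized toy dataset (sketch, assumes `build_factory`)."""
    randomizer = ToyRandomizer(build_factory())
    # Parameters are randomized before generation; pass randomize=False to
    # generate with the central values instead.
    return randomizer.get_dataset(randomize=True)
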
def test_factory_split(factory, factory_with_yield):
    """Test function splitting is done properly."""
    # Building the randomizer without any yield information should fail
    try:
        ToyRandomizer(factory)
    except ValueError:
        pass
    else:
        raise AssertionError("ToyRandomizer should raise ValueError when no yield is given")
    # Manual yield
    syst_fac = ToyRandomizer(factory, {'yield': 100})
    assert list(syst_fac._gen_pdfs.keys()) == [None]
    assert len(syst_fac._gen_pdfs[None]) == 1
    assert syst_fac._gen_pdfs[None][0].expectedEvents(
        list_to_rooargset(factory.get_observables())) == 100
    # Yield from config
    syst_fac = ToyRandomizer(factory_with_yield)
    assert list(syst_fac._gen_pdfs.keys()) == [None]
    assert len(syst_fac._gen_pdfs[None]) == 1
    assert syst_fac._gen_pdfs[None][0].expectedEvents(
        list_to_rooargset(factory_with_yield.get_observables())) == 1000

def __init__(self, factories, children_yields, parameters=None):
    """Initialize.

    In this case, the children are a map of PDF name -> Factory. `children_yields`
    maps each child name to a (yield, constraint) pair: if every child has an entry,
    the sum is extended and the yields are absolute; if exactly the last child is
    missing, the values are interpreted as fractions and the last child takes the
    remainder.

    Raise:
        InvalidRequestError: When the observables of the factories are incompatible.
        ValueError: If `children_yields` is not an OrderedDict, the ordering is
            wrong, or a fraction is larger than 1.
        KeyError: On configuration error.

    """
    # Check observable compatibility
    if len({tuple(obs.GetName() for obs in factory.get_observables())
            for factory in factories.values()}) != 1:
        raise InvalidRequestError("Incompatible observables")
    # Check children yields type
    if not isinstance(children_yields, OrderedDict):
        raise ValueError("children_yields must be an ordered dictionary")
    super(SumPhysicsFactory, self).__init__({}, parameters)
    # Set children
    self._children = factories
    # Set observables
    observables = {obs.getStringAttribute('originalName'): obs
                   for obs in list(self._children.values())[0].get_observables()}
    for obs_name, obs in observables.items():
        for child in list(self._children.values())[1:]:
            child.set_observable(obs_name, obs=obs)
    # Set yields
    yield_ = None
    if parameters and 'yield' in parameters:
        yield_, constraint = parameters.pop('yield')
    yield_values = [child_yield for child_yield, _ in children_yields.values()]
    if len(factories) == len(children_yields):  # Extended
        if yield_ is not None:
            raise KeyError("Specified yield on a sum of RooExtendPdf")
        self['Yield'] = ROOT.RooAddition("Yield", "Yield",
                                         list_to_rooarglist(yield_values))
        self._constraints.update({constraint
                                  for _, constraint in children_yields.values()})
        for child_name, child in self._children.items():
            child.set_yield_var(children_yields[child_name])
    elif (len(factories) - len(children_yields)) == 1:
        # Check order is correct
        if list(self._children.keys())[-1] in children_yields.keys():
            logger.error("The last child must not appear in `children_yields` to ensure consistency.")
            raise ValueError("Wrong PDF ordering")
        # Store the fractions and propagate
        for yield_val in yield_values:
            if yield_val.getVal() > 1:
                raise ValueError("Specified a fraction larger than 1 -> {}"
                                 .format(yield_val.GetName()))
            # Not a very good heuristic
            if yield_val.getStringAttribute('shared') != 'true':
                yield_val.SetName(yield_val.GetName().replace('Yield', 'Fraction'))
                yield_val.SetTitle(yield_val.GetTitle().replace('Yield', 'Fraction'))
        self['Fractions'] = yield_values
        for child_name, child in self._children.items():
            if child_name in children_yields:
                child_yield, child_constraint = children_yields[child_name]
                child['Fraction'] = child_yield
                child._constraints.add(child_constraint)
            else:
                # A temporary rename is needed because RooFracRemainder takes a RooArgSet,
                # so variables all named 'Fraction' would clash. The names are not used
                # afterwards, so they are restored below.
                for yield_num, yield_val in enumerate(yield_values):
                    yield_val.SetName('{}_{}'.format(yield_val.GetName(), yield_num))
                child['Fraction'] = ROOT.RooFracRemainder("Fraction", "Fraction",
                                                          list_to_rooargset(yield_values))
                child._constraints.update({constraint
                                           for _, constraint in children_yields.values()
                                           if constraint})
                # Put the names back where they belong
                for yield_num, yield_val in enumerate(yield_values):
                    yield_val.SetName('_'.join(yield_val.GetName().split('_')[:-1]))
        # Set the overall yield if one was given
        if yield_ is not None:
            self.set_yield_var((yield_, constraint))
    else:
        raise KeyError("Badly specified yields/fractions")

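# --- Configuration sketch (illustrative only, not part of the library) -------
# The constructor above supports two layouts for `children_yields`. The child
# factories (`sig_factory`, `bkg_factory`) are placeholders, and passing None
# as the constraint entry is an assumption for "no constraint"; substitute real
# constraint PDFs where applicable.
def _example_sum_factory_configs(sig_factory, bkg_factory):
    from collections import OrderedDict
    import ROOT

    children = OrderedDict([('signal', sig_factory), ('background', bkg_factory)])
    # Extended mode: one yield per child; the total yield is their RooAddition.
    extended = SumPhysicsFactory(
        children,
        OrderedDict([('signal', (ROOT.RooRealVar("YieldSig", "YieldSig", 1000.0), None)),
                     ('background', (ROOT.RooRealVar("YieldBkg", "YieldBkg", 500.0), None))]))
    # Fraction mode: the last child is omitted and takes the remainder fraction;
    # the overall yield is passed through `parameters`.
    fractions = SumPhysicsFactory(
        children,
        OrderedDict([('signal', (ROOT.RooRealVar("FracSig", "FracSig", 0.3), None))]),
        parameters={'yield': (ROOT.RooRealVar("Yield", "Yield", 1500.0), None)})
    return extended, fractions
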
def dataset_from_pandas(frame, name, title, var_list=None, weight_var=None,
                        categories=None, ranges=None):
    """Build a RooDataSet from a pandas DataFrame.

    Arguments:
        frame (pandas.DataFrame): DataFrame to convert.
        name (str): RooDataSet name.
        title (str): RooDataSet title.
        var_list (list[str], optional): List of variables to add to the dataset.
            If not given, all variables are converted.
        weight_var (str, optional): Assign the given variable name as weight.
            Defaults to None.
        categories (list[`ROOT.RooCategory`], optional): Categories to separate the
            data in. Their names must correspond to columns in `frame`.
        ranges (dict, optional): Variables to set a range for. Defaults to `None`,
            in which case all variables are unbounded.

    Return:
        ROOT.RooDataSet: Frame converted to dataset.

    Raise:
        KeyError: If `weight_var` or one of the categories is not present in `frame`.

    """
    def fill_dataset(name, title, var_set, input_data):
        """Fill a dataset from a pandas DataFrame.

        Arguments:
            name (str): Name of the dataset.
            title (str): Title of the dataset.
            var_set (ROOT.RooArgSet): Variables in the dataset.
            input_data (pandas.DataFrame): Input data.

        Return:
            ROOT.RooDataSet: Output data set.

        """
        dataset = ROOT.RooDataSet(name, title, var_set)
        for _, row in input_data.iterrows():
            for var_name in var_names:
                if isinstance(row[var_name], (float, int)):
                    var_set.setRealValue(var_name, row[var_name])
            for cat_name in cat_names:
                var_set.setCatLabel(cat_name, row[cat_name])
            dataset.add(var_set)
        return dataset

    var_names = var_list if var_list else list(frame.columns)
    if weight_var and weight_var not in frame.columns:
        raise KeyError("Cannot find weight variable -> {}".format(weight_var))
    cat_names = []
    roovar_list = []
    if categories:
        for category in categories:
            cat_var = category.GetName()
            if cat_var not in frame.columns:
                raise KeyError("Cannot find category variable -> {}".format(cat_var))
            roovar_list.append(category)
            if cat_var in var_names:
                var_names.pop(var_names.index(cat_var))
            cat_names.append(cat_var)
        super_category = 'x'.join(cat.GetName() for cat in categories)
        if super_category in var_names:
            logger.warning("You asked for variable %s but this is the name of a SuperCategory. Ignoring it.",
                           super_category)
            var_names.pop(var_names.index(super_category))
    roovar_list.extend(ROOT.RooRealVar(var_name, var_name, 0.0) for var_name in var_names)
    dataset_set = list_to_rooargset(roovar_list)
    if ranges:
        for var_name, (min_, max_) in ranges.items():
            dataset_set[var_name].setMin(min_)
            dataset_set[var_name].setMax(max_)
    dataset = fill_dataset(name, title, dataset_set, frame)
    if weight_var:
        dataset = ROOT.RooDataSet(name, title, dataset_set,
                                  ROOT.RooFit.Import(dataset),
                                  ROOT.RooFit.WeightVar(weight_var))
    return dataset

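# --- Usage sketch (illustrative only, not part of the library) ---------------
# How `dataset_from_pandas` can be called with a weight column, a category and
# an explicit range. The column and variable names used here are assumptions.
def _example_dataset_from_pandas():
    import pandas as pd
    import ROOT

    frame = pd.DataFrame({'mass': [5279.1, 5280.3, 5278.5],
                          'fit_weight': [1.0, 0.8, 1.2],
                          'category': ['label1', 'label1', 'label2']})
    category = ROOT.RooCategory("category", "category")
    category.defineType("label1")
    category.defineType("label2")
    return dataset_from_pandas(frame, "ExampleData", "Example dataset",
                               weight_var='fit_weight',
                               categories=[category],
                               ranges={'mass': (5150.0, 5400.0)})
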
def generate(physics_factory, n_events):
    """Perform generation of toys.

    Note:
        If the factory is simultaneous, events are generated in steps. For that
        reason, the configuration for 'gen/nevents' must be a dictionary of
        {label -> nevents} keys.

    Arguments:
        physics_factory (`analysis.physics.PhysicsFactory`): Physics factory object
            to get observables, parameters and PDFs from.
        n_events (dict or int): Number of events to generate.

    Return:
        `pandas.DataFrame`: Generated events.

    Raise:
        ValueError: If the number of events to generate is not properly specified.
        KeyError: If an unknown simultaneous category label is requested.

    """
    def generate_events(gen_pdf, obs_set, n_events):
        """Generate events according to the given PDF.

        The result is converted to a pandas DataFrame.

        Arguments:
            gen_pdf (`ROOT.RooAbsPdf`): PDF to use for generation.
            obs_set (`ROOT.RooArgSet`): Observables to generate.
            n_events (int): Number of events to generate.

        Return:
            `pandas.DataFrame`: Generated events.

        """
        data = gen_pdf.generate(obs_set, n_events)
        dataframe = pandas_from_dataset(data)
        destruct_object(data)
        return dataframe

    observables = list_to_rooargset(physics_factory.get_observables())
    if physics_factory.is_simultaneous():
        if not isinstance(n_events, dict):
            raise ValueError("Generation of a simultaneous PDF requires a dictionary for the number of events.")
        output_dataset = None
        for label, n_events_label in n_events.items():
            label_factory = physics_factory.get_children().get(label)
            if not label_factory:
                raise KeyError("Unknown label -> {}".format(label))
            label_df = generate_events(label_factory.get_pdf("GenPdf_{}".format(label),
                                                             "GenPdf_{}".format(label)),
                                       observables,
                                       n_events_label).assign(category=label)
            if output_dataset is None:
                output_dataset = label_df
            else:
                output_dataset = output_dataset.append(label_df)
        return output_dataset
    else:
        if not isinstance(n_events, int):
            raise ValueError("Number of events to generate is not an integer")
        return generate_events(physics_factory.get_pdf("GenPdf", "GenPdf"),
                               observables,
                               n_events)

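# --- Usage sketch (illustrative only, not part of the library) ---------------
# How `generate` is called for simultaneous and non-simultaneous factories.
# The label names ('label1', 'label2') and event counts are assumptions.
def _example_generate(physics_factory):
    if physics_factory.is_simultaneous():
        # One entry per simultaneous category label.
        return generate(physics_factory, {'label1': 500, 'label2': 200})
    # Plain factories take a single integer.
    return generate(physics_factory, 1000)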