Exemplo n.º 1
0
    def get_dataset(self, randomize=True):
        """Get dataset generated from the input model.

        If a generation acceptance was given on initialization, accept-reject is applied
        on the generated events and an extra variable (`fit_weight`), obtained from the
        fit acceptance, is added to the dataset as weight.

        Arguments:
            randomize (bool, optional): Randomize the parameters? Defaults to `True`.

        Return:
            `ROOT.RooDataSet`.

        """
        import ROOT
        import pandas as pd

        # TODO: Add weights?
        if randomize:
            logger.debug("Applying randomization")
            self.randomize()
        obs = list_to_rooargset(self._model.get_observables())
        datasets_to_merge = []
        cats = list_to_rooarglist(self._model.get_category_vars())
        for label, pdf_list in self._gen_pdfs.items():
            if cats:
                # The label is a comma-separated list with one entry per category variable
                for lab_num, lab in enumerate(label.split(',')):
                    cats[lab_num].setLabel(lab)
            for pdf in pdf_list:
                logger.debug("Generating PDF -> %s", pdf.GetName())
                if self._gen_acceptance:
                    # TODO: Fixed yields
                    yield_to_generate = poisson.rvs(pdf.expectedEvents(obs))
                    accepted_chunks = []
                    # NOTE(review): if the acceptance rejects every generated event this
                    # loop never terminates -- confirm whether a retry limit is needed.
                    while yield_to_generate:
                        # Generate twice what is still missing to compensate for rejections
                        events = self._gen_acceptance.apply_accept_reject(
                            pandas_from_dataset(
                                pdf.generate(obs, yield_to_generate * 2)))
                        # Sample if the dataset is too large
                        if events.shape[0] > yield_to_generate:
                            events = events.sample(yield_to_generate)
                        accepted_chunks.append(events)
                        yield_to_generate -= len(events)
                    if not accepted_chunks:
                        # The poisson draw gave zero events: nothing to add for this PDF
                        logger.debug("No events to generate for PDF -> %s", pdf.GetName())
                        continue
                    # Merge all accepted chunks at once. The truth value of a DataFrame is
                    # ambiguous, so chunks are collected in a list instead of testing the
                    # accumulated frame.
                    pandas_dataset = pd.concat(accepted_chunks, ignore_index=True)
                    logger.debug("Adding fitting weights")
                    pandas_dataset['fit_weight'] = self._fit_acceptance.get_fit_weights(pandas_dataset)
                    dataset = dataset_from_pandas(pandas_dataset, "GenData", "GenData", weight_var='fit_weight')
                else:
                    dataset = pdf.generate(obs, ROOT.RooFit.Extended(True))
                if cats:
                    dataset.addColumns(cats)
                datasets_to_merge.append(dataset)
        return merge_root(datasets_to_merge, 'GenData', 'GenData')
Exemplo n.º 2
0
def _get_root_from_dataframe(frame, kwargs):
    """Properly load a pandas DataFrame into a `ROOT.RooDataSet`.

    Needed keys in `kwargs` are:
        + `name`: Name of the `RooDataSet`.

    Optional keys are:
        + `title`: Title of the `RooDataSet`. Defaults to `name`.
        + `selection`: Selection to apply.
        + `weights-to-normalize`: Variables defining the weights that are normalized
            to the total number of entries of the dataset.
        + `weights-not-to-normalize`: Variables defining the weights that are not normalized.
        + `weight-var-name`: Name of the weight variable. If there is only one weight,
            it is not needed. Otherwise it has to be specified.
        + `acceptance`: Load an acceptance. This needs to be accompanied with a weight
            specification, either in `weights-to-normalize` or `weights-not-to-normalize`, which
            is either `acceptance_fit` or `acceptance_gen`. Depending on which one is
            specified, `acceptance.get_fit_weights` or `acceptance.get_gen_weights` is used.
        + `categories`: RooCategory variables to use.
        + `ranges`: Dictionary specifying min and max for the given variables, either as
            a whitespace-separated string or as a pair of values. If not given,
            variables are unbound.

    Note:
        The acceptance weight column and the total weight column are added to `frame`
        in place.

    Arguments:
        frame (pandas.DataFrame): Data to convert.
        kwargs (dict): Extra configuration.

    Return:
        ROOT.RooDataSet: pandas.DataFrame converted to RooDataSet.

    Raise:
        KeyError: If there are errors in the `kwargs` variables.
        ValueError: If there is an error in loading the acceptance, or if some of the
            requested weights are not present in `frame`.

    """
    logger.debug("Loading pandas DataFrame in RooDataSet format")
    # Checks and variable preparation
    try:
        name = kwargs['name']
    except KeyError as error:
        raise KeyError("Missing configuration key -> {}".format(error))
    title = kwargs.get('title', name)
    # Check weights
    try:
        weight_var, weights_to_normalize, weights_not_to_normalize = _analyze_weight_config(
            kwargs)
    except KeyError:
        raise KeyError("Badly specified weights")
    all_weights = weights_to_normalize + weights_not_to_normalize
    # Acceptance specified: find out which acceptance weight has been requested.
    # This is done only once; repeating the detection would find `acc_var` already
    # set and wrongly report that both acceptance weights had been specified.
    acc_var = ''
    if 'acceptance' in kwargs:
        requested = [acc_name
                     for acc_name in ('acceptance_fit', 'acceptance_gen')
                     if acc_name in all_weights]
        if len(requested) > 1:
            raise ValueError(
                "Specified both 'acceptance_fit' and 'acceptance_gen' as weights."
            )
        if requested:
            acc_var = requested[0]
        else:
            logger.warning(
                "Requested acceptance but it has not been specified as a weight to use. Ignoring."
            )
    # Variables (taken before any extra column is added to the frame)
    var_list = list(frame.columns)
    # Raise an error if some weights are not loaded. The acceptance weight is excluded
    # because it is computed below and must NOT be present in the input data.
    required_weights = set(all_weights)
    if acc_var:
        required_weights.discard(acc_var)
    if var_list and not required_weights.issubset(set(var_list)):
        raise ValueError(
            "Missing weights in the list of variables read from input file.")
    if acc_var:
        from analysis.efficiency import get_acceptance
        try:
            acceptance = get_acceptance(kwargs['acceptance'])
        except Exception as error:
            raise ValueError(str(error))
        if acc_var in frame.columns:
            raise ValueError(
                "Name clash: the column '{}' is present in the dataset".format(
                    acc_var))
        if acc_var == 'acceptance_fit':
            frame['acceptance_fit'] = acceptance.get_fit_weights(frame)
        else:
            frame['acceptance_gen'] = acceptance.get_gen_weights(frame)
    # Apply weights
    if weight_var:
        # Snapshot the input weights first: `weight_var` may have the same name as one
        # of them, and overwriting that column before reading it would lose the weight.
        to_normalize = [frame[w_var].copy() for w_var in weights_to_normalize]
        not_to_normalize = [
            frame[w_var].copy() for w_var in weights_not_to_normalize
        ]
        # Product of the weights to normalize (1 if there are none), rescaled so that
        # it sums up to the number of entries
        frame[weight_var] = np.prod(to_normalize, axis=0)
        frame[weight_var] = frame[weight_var] / frame[weight_var].sum(
        ) * frame.shape[0]
        # Multiply by the weights that are not normalized
        frame[weight_var] = np.prod(not_to_normalize + [frame[weight_var]],
                                    axis=0)
        if weight_var not in var_list:
            var_list.append(weight_var)
    # Process ranges
    ranges = kwargs.get('ranges')
    if ranges:
        # Build a new dictionary so the configuration of the caller is left untouched
        parsed_ranges = {}
        for var_name, range_val in ranges.items():
            try:
                if isinstance(range_val, str):
                    min_, max_ = range_val.split()
                else:
                    min_, max_ = range_val
                parsed_ranges[var_name] = (float(min_), float(max_))
            except (TypeError, ValueError):
                raise KeyError(
                    "Malformed range specification for {} -> {}".format(
                        var_name, range_val))
        ranges = parsed_ranges
    # Convert it
    return dataset_from_pandas(frame,
                               name,
                               title,
                               var_list=var_list,
                               weight_var=weight_var,
                               categories=kwargs.get('categories'),
                               ranges=ranges)
Exemplo n.º 3
0
def get_datasets(data_frames, acceptance, fit_models):
    """Build the datasets from the input toys.

    If an acceptance is specified, events are selected using accept-reject.

    Logic regarding the poisson variation of yields is as follows:
        - If the fit model is extended, the yields of the individual data sets are
            varied as randomly following a poisson distribution.
        - If the fit model is not extended, the yields of the individual data set
            are exactly the number of events given in the configuration.
        - In both cases, an explicit (not `None`) poisson flag in the input takes
            precedence. Whether the model is extended is decided from the first
            fit model only.

    Note:
        Proper handling of poissonian variations is not fool proof. If datasets contain a
        mixture of populations that have different yield parameters in the fit model, they
        cannot be varied properly and therefore their pulls will be wrong.

    Arguments:
        data_frames (dict[tuple(pandas.DataFrame, int, bool, str)]): Data frames with
            - the requested number of events (int)
            - a boolean telling whether to use poisson statistics (bool). If `None`,
              it is decided from the fit model.
            - the corresponding category (str)
        acceptance (analysis.efficiency.acceptance.Acceptance): Acceptance description.
            Can be None, in which case it is ignored.
        fit_models (dict): Fit models to use to transform datasets and (possibly) establish
            the data categories, with the name of the output as key.

    Return:
        tuple (dict (str: ROOT.RooDataSet), dict (str: int)): Datasets made of the
            combination of the several input sources with the transformations applied,
            and number of generated events per data sample.

    Raise:
        KeyError: If there is information missing from the data configuration.
        ValueError: If one of the input data frames contains more than one category.

    """
    sample_sizes = {}
    weight_var = None
    logger.debug("Sampling datasets -> %s", list(data_frames.keys()))
    # Dictionary views cannot be indexed in Python 3, so go through a list
    is_extended = list(fit_models.values())[0].is_extended()
    sampled_frames = []
    for data_name, (data, n_events, do_poisson,
                    category) in data_frames.items():
        if acceptance:
            data = acceptance.apply_accept_reject(data)
        # Do poisson if it is extended and it has not been disabled
        if do_poisson is None:
            do_poisson = is_extended
        sample_sizes[data_name] = poisson.rvs(
            n_events) if do_poisson else n_events
        # Extract suitable number of rows and transform them
        rows = data.sample(sample_sizes[data_name])  # TODO: Weights support?
        # Add category column
        if category:
            # By default the label is stored in the 'category' column
            if 'category' in rows:
                found_categories = set(rows['category'])
                if len(found_categories) > 1:
                    logger.error(
                        "Data %s contains more than one category: %s!",
                        data_name, found_categories)
                    raise ValueError(
                        "Data {} contains more than one category: {}".format(
                            data_name,
                            found_categories))  # TODO: replace with DataError
                elif not rows['category'].iloc[0] == category:
                    logger.info(
                        "Data %s contains a category %s, dropping it and "
                        "replacing it with the specified one %s", data_name,
                        rows['category'].iloc[0], category)
            rows['category'] = category
        elif 'category' in rows:
            logger.warning(
                "Data %s contains a 'category' column but no category was specified "
                "in the config file -> ignoring 'category' column for the fit",
                data_name)
            del rows['category']
        sampled_frames.append(rows)
    # Merge all the samples at once instead of concatenating inside the loop
    dataset = pd.concat(sampled_frames) if sampled_frames else None
    logger.debug("Done loading")
    # Get the fit weight
    if acceptance:
        logger.debug("Adding fitting weights")
        weight_var = 'fit_weight'
        dataset[weight_var] = acceptance.get_fit_weights(dataset)
    # Convert dataset to RooDataset
    try:
        # TODO: Check the categories
        return ({
            ds_name: dataset_from_pandas(model.transform_dataset(dataset),
                                         "data_{}".format(ds_name),
                                         "data_{}".format(ds_name),
                                         weight_var=weight_var,
                                         categories=model.get_category_vars())
            for ds_name, model in fit_models.items()
        }, sample_sizes)
    except KeyError:
        logger.error("Error transforming dataset.")
        raise