Exemplo n.º 1
0
    def from_yaml_file(name):
        """Initialize from a YAML file.

        File name is determined by get_fit_result_path.

        Arguments:
            name (str): Name of the fit result.

        Return:
            FitResult: Instance built from the loaded YAML data.

        Raise:
            OSError: If the file cannot be found.
            KeyError: If any of the FitResult data is missing from the input file.

        """
        try:
            yaml_config = load_config(
                _paths.get_fit_result_path(name),
                validate=('fit-parameters', 'fit-parameters-initial',
                          'covariance-matrix/quality',
                          'covariance-matrix/matrix', 'status'))
            return FitResult.from_yaml(yaml_config)
        except ConfigError as error:
            # Chain the ConfigError so the original missing-key context is
            # preserved in the traceback.
            raise KeyError("Missing keys in input file -> {}".format(
                ','.join(error.missing_keys))) from error
Exemplo n.º 2
0
def load_data(config_file, key=None, **kwargs):
    """Load a data object from a configuration file.

    Arguments:
        config_file (str): Name of the configuration file.
        key (str, optional): Key to load in the configuration file. If none is
            given, the root of the YAML file will be used as configuration.
        **kwargs (dict): Dictionary to override keys from the dictionary.

    Return:
        object: Data object.

    Raise:
        FileNotFoundError: If the config file cannot be loaded.
        ConfigError: If the validation of the ConfigFile fails.

    """
    abs_path = os.path.abspath(config_file)
    if not os.path.exists(abs_path):
        raise FileNotFoundError(
            "Cannot find config file -> {}".format(abs_path))
    # Validate the mandatory data keys before handing over to get_data
    data_config = load_config(abs_path,
                              root=key,
                              validate=['source', 'tree', 'output-format'])
    return get_data(data_config, **kwargs)
Exemplo n.º 3
0
def get_acceptance(config):
    """Get an acceptance object.

    Arguments:
        config (dict): Acceptance to load. Its keys are:
            + `variables` (list[str]): List of variable names.
            + `generation` (dict): Generation configuration. It needs to have a `name` entry, which corresponds
                to the name of the generator efficiency. Any other key will be passed to `get_efficiency` as
                `extra_parameters`
            + `reconstruction` (dict): Reconstruction configuration. It needs to have a `name` entry, which corresponds
                to the name of the reconstruction efficiency. Any other key will be passed to `get_efficiency` as
                `extra_parameters`

    Return:
        `analysis.efficiency.acceptance.Acceptance`: Acceptance object.

    Raise:
        analysis.utils.config.ConfigError: If the input config is missing keys.
        See `analysis.utils.config.load_config`.

    """
    provided_keys = {key for key, _ in unfold_config(config)}
    # Complain if any mandatory key was not provided
    missing_keys = {'variables', 'generation/name',
                    'reconstruction/name'} - provided_keys
    if missing_keys:
        raise ConfigError(
            "Missing configuration key! -> {}".format(missing_keys))
    # Load both efficiency models; 'name' is popped so the remaining keys are
    # forwarded as extra parameters.
    efficiencies = {}
    for stage in ('generation', 'reconstruction'):
        stage_config = config[stage]
        model_config = load_config(
            get_efficiency_path(stage_config.pop('name')),
            validate=('model', 'variables', 'parameters'))
        efficiencies[stage] = get_efficiency_model(model_config, **stage_config)
    # The efficiency variables must match the acceptance variables exactly
    acceptance_vars = set(config['variables'])
    if acceptance_vars != set(efficiencies['generation'].get_variables()):
        raise ConfigError(
            "Mismatch in variables between acceptance and generation")
    if acceptance_vars != set(efficiencies['reconstruction'].get_variables()):
        raise ConfigError(
            "Mismatch in variables between acceptance and reconstruction")
    # Now create the acceptance
    return Acceptance(config['variables'],
                      efficiencies['generation'],
                      efficiencies['reconstruction'])
Exemplo n.º 4
0
def run(config_files):
    """Run the script.

    Analyze the toys according to the configuration.

    Arguments:
        config_files (list[str]): Path to the configuration files.

    Return:
        list[dict]: Results of the executed analysis tasks.

    Raise:
        OSError: If the configuration file or some other input does not exist.
        KeyError: If some configuration data are missing.
        RuntimeError: If there is a problem during the analysis.

    """
    try:
        config = _config.load_config(*config_files,
                                     validate=['toys-to-analyze',
                                               'analysis'])
    except OSError as error:
        # Chain the original error so the failing path stays in the traceback
        raise OSError(
            "Cannot load configuration files: {}".format(config_files)) from error
    except _config.ConfigError as error:
        if 'toys-to-analyze' in error.missing_keys:
            logger.error("Toys to analyze not specified")
        if 'analysis' in error.missing_keys:
            logger.error("Analysis actions not specified")
        raise KeyError(
            "ConfigError raised -> {}".format(error.missing_keys)) from error
    except KeyError as error:
        logger.error("YAML parsing error -> %s", error)
        raise
    # Load the input toys
    input_toys = _paths.get_toy_fit_path(config['toys-to-analyze'])
    if not os.path.exists(input_toys):
        raise OSError("Cannot find input toy file: {}".format(input_toys))
    # Make sure analysis is in the correct format: a list of task dicts
    analysis_tasks = config['analysis']
    if not isinstance(analysis_tasks, list):
        analysis_tasks = [analysis_tasks]
    task_results = []
    with pd.HDFStore(input_toys) as hdf_file:
        for analysis_task in analysis_tasks:
            try:
                task_action = analysis_task.pop('action')
            except KeyError as error:
                logger.error("Missing analysis task action -> %s", analysis_task)
                raise KeyError("Malformed analysis task") from error
            if task_action not in ANALYSIS_TASKS:
                raise KeyError("Unknown analysis task -> {}".format(task_action))
            try:
                task_result = ANALYSIS_TASKS[task_action](hdf_file, analysis_task)
            except ValueError as error:
                raise RuntimeError(repr(error)) from error
            # Normalize a single-dict result to a one-element list
            if isinstance(task_result, dict):
                task_result = [task_result]
            if not isinstance(task_result, list):
                raise RuntimeError("Wrong format for task result -> {}".format(type(task_result)))
            task_results.extend(task_result)
    # Previously the accumulated results were silently discarded; return them
    # so callers can use them (backward compatible: None was returned before).
    return task_results
Exemplo n.º 5
0
    def __init__(self, config_files, link_from, extend, overwrite, verbose=False):
        """Configure the toy submitter.

        Arguments:
            config_files (list[str]): Configuration files.
            link_from (str): Storage to link from.
            extend (bool): Extend the production?
            overwrite (bool): Overwrite an existing production?
            verbose (bool, optional): Give verbose output? Defaults to False.

        Raise:
            NotImplementedError: If some of the mandatory attributes are not
                set.
            OSError: If there is a problem loading the configuration file.
            KeyError: If the configuration is missing mandatory keys.
            ValueError: If conflicting options are passed.

        """
        # Check the getters: subclasses must define these class attributes
        if any(getter is None
               for getter in (self.TOY_PATH_GETTER,
                              self.TOY_CONFIG_PATH_GETTER,
                              self.NTOYS_KEY,
                              self.NTOYS_PER_JOB_KEY)):
            raise NotImplementedError("Getters not implemented")
        try:
            config = _config.load_config(*config_files,
                                         validate=self.VALIDATION.keys())
        except _config.ConfigError as error:
            # Log a per-key explanation before raising
            for key, error_message in self.VALIDATION.items():
                if key in error.missing_keys:
                    logger.error(error_message)
            raise KeyError(
                "Missing configuration keys -> {}".format(error.missing_keys)) from error
        except OSError as error:
            raise OSError("Cannot load configuration file: {}"
                          .format(config_files)) from error
        except KeyError:
            raise
        # Check conflicting arguments
        if extend and overwrite:
            logger.error("The --extend and --overwrite options have been specified at the same time!")
            raise ValueError("Cannot specify --extend and --overwrite at the same time")
        # Store information
        self.config = config
        self.allowed_config_diffs = set([self.NTOYS_KEY, self.NTOYS_PER_JOB_KEY, 'batch/runtime']
                                        + self.ALLOWED_CONFIG_DIFFS)
        # Assign link-from giving priority to the command-line argument
        self.config['link-from'] = link_from if link_from else config.get('link-from')
        self.link_from = link_from
        self.extend = extend
        self.overwrite = overwrite
        self.verbose = verbose
        # Get the batch system
        self.batch_system = get_batch_system()
Exemplo n.º 6
0
def load_efficiency_model(model_name, **extra_parameters):
    """Load an efficiency model from its configuration file.

    The file path is determined from the `name` using the `paths.get_efficiency_path`
    function.

    Arguments:
        model_name (str): Name of the efficiency model.
        **extra_parameters (dict): Extra configuration parameters to override the entries
            in the `parameters` node loaded from the efficiency file.

    Raise:
        OSError: If the efficiency file does not exist.
        analysis.utils.config.ConfigError: If there is a problem with the efficiency model.

    """
    model_path = get_efficiency_path(model_name)
    if not os.path.exists(model_path):
        raise OSError("Cannot find efficiency file -> {}".format(model_path))
    # Validate the mandatory nodes on load
    model_config = load_config(model_path,
                               validate=('model', 'variables', 'parameters'))
    return get_efficiency_model(model_config, **extra_parameters)
Exemplo n.º 7
0
def load_acceptance(name, **extra_parameters):
    """Load an acceptance configuration file.

    The file path is determined from the `name` using the `paths.get_acceptance_path`
    function.

    Note:
        For the exact configuration, see `get_acceptance`.

    Arguments:
        name (str): Name of the acceptance.
        **extra_parameters (dict): Extra configuration parameters to override the entries
            in the `parameters` nodes from the `generation` and `reconstruction` efficiencies.
            As such, the extra parameters need to be placed under the `generation` or
            `reconstruction` keys. For example:

                >>> load_acceptance('Test',
                                    reconstruction={'rename-vars':{'acc_q2':'q2',
                                                                   'acc_cosThetaL':'ctl'}})

    Return:
        `analysis.efficiency.Acceptance`: Acceptance object.

    Raise:
        OSError: If the efficiency file does not exist.
        analysis.utils.config.ConfigError: If there is a problem with the efficiency model.

    """
    # pylint: disable=E1101
    acceptance_path = get_acceptance_path(name)
    if not os.path.exists(acceptance_path):
        raise OSError("Cannot find efficiency file -> {}".format(acceptance_path))
    acceptance_config = load_config(acceptance_path,
                                    validate=('variables', 'generation',
                                              'reconstruction'))
    # Apply the per-stage overrides before building the acceptance
    for stage in ('generation', 'reconstruction'):
        acceptance_config[stage].update(extra_parameters.get(stage, {}))
    return get_acceptance(acceptance_config)
Exemplo n.º 8
0
def main():
    """Toy fitting submission application.

    Parses the command line, configures the toy fitters and submit the
    jobs, catching intermediate errors and transforming them to status codes.

    Status codes:
        0: All good.
        1: Error in the configuration files.
        2: Error in preparing the output folders.
        3: Conflicting options given.
        4: A non-matching configuration file was found in the output.
        5: The queue submission command cannot be found.
        128: Uncaught error. An exception is logged.

    """

    def flatten(list_, typ_):
        """Flatten one level of nesting by summing the elements onto `typ_` (e.g. `tuple()`)."""
        return list(sum(list_, typ_))

    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--verbose',
                        action='store_true',
                        help="Verbose output")
    parser.add_argument('--link-from',
                        action='store', type=str,
                        help="Folder to actually store the toy files")
    parser.add_argument('--extend',
                        action='store_true', default=False,
                        help="Extend previous production")
    parser.add_argument('--overwrite',
                        action='store_true', default=False,
                        help="Overwrite previous production")
    parser.add_argument('config',
                        action='store', type=str, nargs='+',
                        help="Configuration file")
    args = parser.parse_args()
    if args.verbose:
        # Level 1 is the most verbose logging level
        get_logger('analysis').setLevel(1)
        logger.setLevel(1)
    try:
        config = _config.load_config(*args.config)
        # Which type of toy are we running? The last matching entry in
        # TOY_TYPES wins if several toy-type keys are present.
        script_to_run = None
        submitter = None
        for toy_type, (toy_class, script_name) in TOY_TYPES.items():
            if toy_type in config:
                script_to_run = script_name
                submitter = toy_class
        if submitter is None:
            raise KeyError("Unknown job type")
        # Is there something to scan?
        scan_config = 'scan' in config
        if scan_config:
            config_files = []
            base_config = _config.unfold_config(config)
            scan_groups = []
            for scan_group in config['scan']:
                scan_group_dict = {}
                for key, val_str in scan_group.items():
                    scan_group_dict[key] = process_scan_val(val_str, scan_group_dict)
                scan_groups.append(scan_group_dict)
            # Check lengths: within one scan group every key must provide the
            # same number of values, since group values are iterated in lockstep.
            if not all(len({len(val) for val in scan_group.values()}) == 1
                       for scan_group in scan_groups):
                raise ValueError("Unmatched length in scan parameters")
            # Build values to scan: lockstep within each group, cartesian
            # product across groups.
            keys, values = list(zip(*[zip(*scan_group.items()) for scan_group in scan_groups]))
            keys = flatten(keys, tuple())
            for value_tuple in itertools.product(*[zip(*val) for val in values]):
                values = dict(zip(keys, flatten(value_tuple, tuple())))
                temp_config = dict(base_config)
                del temp_config['scan']
                # The 'name' entry acts as a template for the scan values
                temp_config['name'] = temp_config['name'].format(**values)
                for key, value in values.items():
                    temp_config[key] = value
                logger.debug("Creating configuration %s for scan values -> %s",
                             temp_config['name'],
                             ", ".join('{}: {}'.format(*val) for val in values.items()))
                # Write temp_file: only the unique name is taken from the
                # NamedTemporaryFile; write_config fills it in below.
                with tempfile.NamedTemporaryFile(delete=False) as file_:
                    file_name = file_.name
                _config.write_config(_config.fold_config(list(temp_config.items())), file_name)
                config_files.append(file_name)
        else:
            config_files = args.config
    # pylint: disable=W0702
    except:
        # Any failure up to here is treated as a configuration error (code 1)
        logger.exception("Bad configuration given")
        parser.exit(1)
    try:
        script_to_run = os.path.join(get_global_var('BASE_PATH'),
                                     'toys',
                                     script_to_run)
        for config_file in config_files:
            submitter(config_files=[config_file],
                      link_from=args.link_from,
                      extend=args.extend,
                      overwrite=args.overwrite,
                      verbose=args.verbose).run(script_to_run, )
            # Scan configs are temporary files; clean them up after submission
            if scan_config:
                os.remove(config_file)
        exit_status = 0
    except KeyError:
        logger.error("Bad configuration given")
        exit_status = 1
    except OSError as error:
        logger.error(str(error))
        exit_status = 2
    except ValueError:
        logger.error("Conflicting options found")
        exit_status = 3
    except AttributeError:
        logger.error("Mismatching configuration found")
        exit_status = 4
    except AssertionError:
        logger.error("Cannot find the queue submission command")
        exit_status = 5
    # pylint: disable=W0703
    except Exception as error:
        exit_status = 128
        logger.exception('Uncaught exception -> %s', repr(error))
    finally:
        # parser.exit raises SystemExit with the mapped status code
        parser.exit(exit_status)
Exemplo n.º 9
0
def run(config_files, link_from, verbose):
    """Run the script.

    Run a generate/fit sequence as many times as requested.

    Arguments:
        config_files (list[str]): Path to the configuration files.
        link_from (str): Path to link the results from.
        verbose (bool): Give verbose output?

    Raise:
        OSError: If the configuration file or some other input does not exist.
        AttributeError: If the input data are incompatible with a previous fit.
        KeyError: If some configuration data are missing.
        ValueError: If there is any problem in configuring the PDF factories.
        RuntimeError: If there is a problem during the fitting.

    """
    try:
        config = _config.load_config(
            *config_files, validate=['syst/ntoys', 'name', 'randomizer'])
    except OSError:
        raise OSError(
            "Cannot load configuration files: {}".format(config_files))
    except _config.ConfigError as error:
        # Give a targeted log message per missing key before raising
        if 'syst/ntoys' in error.missing_keys:
            logger.error("Number of toys not specified")
        if 'name' in error.missing_keys:
            logger.error("No name was specified in the config file!")
        if 'randomizer' in error.missing_keys:
            logger.error(
                "No randomizer configuration specified in config file!")
        raise KeyError("ConfigError raised -> {}".format(error.missing_keys))
    except KeyError as error:
        logger.error("YAML parsing error -> %s", error)
        raise
    model_name = config['syst'].get('model',
                                    'model')  # TODO: 'model' returns name?
    try:
        model_config = config[model_name]
    except KeyError as error:
        logger.error("Missing model configuration -> %s", str(error))
        raise KeyError("Missing model configuration")
    # Load fit model: two independent copies of the same model config, one
    # fitted as the nominal model and one handed to the randomizer.
    try:
        fit_model = configure_model(copy.deepcopy(model_config))
        randomizer_model = configure_model(copy.deepcopy(model_config))
    except KeyError:
        logger.exception('Error loading model')
        raise ValueError('Error loading model')
    # Some info
    ntoys = config['syst'].get('ntoys-per-job', config['syst']['ntoys'])
    logger.info("Doing %s generate/fit sequences", ntoys)
    logger.info("Systematics job name: %s", config['name'])
    # The command-line link-from takes precedence over the config entry
    if link_from:
        config['link-from'] = link_from
    if 'link-from' in config:
        logger.info("Linking toy data from %s", config['link-from'])
    else:
        logger.debug("No linking specified")
    # Now load the acceptance (optional)
    try:
        acceptance = get_acceptance(config['acceptance']) \
            if 'acceptance' in config \
            else None
    except _config.ConfigError as error:
        raise KeyError("Error loading acceptance -> {}".format(error))
    # Fit strategy
    fit_strategy = config['syst'].get('strategy', 'simple')
    # Load randomizer configuration
    randomizer = get_randomizer(config['randomizer'])(
        model=randomizer_model,
        config=config['randomizer'],
        acceptance=acceptance)
    # Set seed
    job_id = get_job_id()
    # Start looping
    fit_results = {}
    logger.info("Starting sampling-fit loop (print frequency is 20)")
    initial_mem = memory_usage()
    initial_time = default_timer()
    do_extended = config['syst'].get('extended', False)
    do_minos = config['syst'].get('minos', False)
    for fit_num in range(ntoys):
        # Logging
        if (fit_num + 1) % 20 == 0:
            logger.info("  Fitting event %s/%s", fit_num + 1, ntoys)
        # Generate a dataset: seed both numpy and ROOT from the same value so
        # the toy is reproducible from the stored seed.
        seed = get_urandom_int(4)
        np.random.seed(seed=seed)
        ROOT.RooRandom.randomGenerator().SetSeed(seed)
        try:
            # Get a randomized dataset and fit it with the nominal fit
            dataset = randomizer.get_dataset(randomize=True)
            gen_values = randomizer.get_current_values()
            fit_result_nominal = fit(fit_model,
                                     model_name,
                                     fit_strategy,
                                     dataset,
                                     verbose,
                                     Extended=do_extended,
                                     Minos=do_minos)
            # Fit the randomized dataset with the randomized values as nominal
            fit_result_rand = fit(randomizer_model,
                                  model_name,
                                  fit_strategy,
                                  dataset,
                                  verbose,
                                  Extended=do_extended,
                                  Minos=do_minos)
            randomizer.reset_values(
            )  # Needed to avoid generating unphysical values
        except ValueError:
            raise RuntimeError()
        except Exception:
            # logger.exception()
            raise RuntimeError()  # TODO: provide more information?
        result = {}
        result['fitnum'] = fit_num
        result['seed'] = seed
        # Save the results of the randomized fit
        result_roofit_rand = FitResult.from_roofit(fit_result_rand)
        result['param_names'] = result_roofit_rand.get_fit_parameters().keys()
        result['rand'] = result_roofit_rand.to_plain_dict()
        result['rand_cov'] = result_roofit_rand.get_covariance_matrix()
        # Explicitly destruct ROOT objects to keep memory usage bounded
        _root.destruct_object(fit_result_rand)
        # Save the results of the nominal fit
        result_roofit_nominal = FitResult.from_roofit(fit_result_nominal)
        result['nominal'] = result_roofit_nominal.to_plain_dict()
        result['nominal_cov'] = result_roofit_nominal.get_covariance_matrix()
        result['gen'] = gen_values
        _root.destruct_object(result_roofit_nominal)
        _root.destruct_object(dataset)
        fit_results[fit_num] = result
        logger.debug("Cleaning up")
    logger.info("Fitting loop over")
    logger.info("--> Memory leakage: %.2f MB/sample-fit",
                (memory_usage() - initial_mem) / ntoys)
    logger.info("--> Spent %.0f ms/sample-fit",
                (default_timer() - initial_time) * 1000.0 / ntoys)
    logger.info("Saving to disk")
    data_res = []
    cov_matrices = {}
    # Get covariance matrices: one DataFrame per fit per (rand/nominal) stage,
    # keyed by the HDF folder 'jobid/fitnum/<stage>' it will be saved under.
    for fit_num, fit_res_i in fit_results.items():
        fit_res = {
            'fitnum': fit_res_i['fitnum'],
            'seed': fit_res_i['seed'],
            'model_name': model_name,
            'fit_strategy': fit_strategy
        }
        param_names = fit_res_i['param_names']
        cov_folder_rand = os.path.join(str(job_id), str(fit_res['fitnum']),
                                       'rand')
        cov_matrices[cov_folder_rand] = pd.DataFrame(fit_res_i['rand_cov'],
                                                     index=param_names,
                                                     columns=param_names)
        cov_folder_nominal = os.path.join(str(job_id), str(fit_res['fitnum']),
                                          'nominal')
        cov_matrices[cov_folder_nominal] = pd.DataFrame(
            fit_res_i['nominal_cov'], index=param_names, columns=param_names)
        # Flatten per-parameter results into suffixed columns
        for res_name, res_value in fit_res_i['rand'].items():
            fit_res['{}_rand'.format(res_name)] = res_value
        for res_name, res_value in fit_res_i['nominal'].items():
            fit_res['{}_nominal'.format(res_name)] = res_value
        for res_name, res_value in fit_res_i['gen'].items():
            fit_res['{}_gen'.format(res_name)] = res_value
        data_res.append(fit_res)
    data_frame = pd.DataFrame(data_res)
    # Attach a constant 'jobid' column to every row
    fit_result_frame = pd.concat([
        data_frame,
        pd.concat([pd.DataFrame({'jobid': [job_id]})] *
                  data_frame.shape[0]).reset_index(drop=True)
    ],
                                 axis=1)
    try:
        # pylint: disable=E1101
        with _paths.work_on_file(config['name'],
                                 path_func=_paths.get_toy_fit_path,
                                 link_from=config.get('link-from',
                                                      None)) as toy_fit_file:
            with modify_hdf(toy_fit_file) as hdf_file:
                # First fit results
                hdf_file.append('fit_results', fit_result_frame)
                # Save covarinance matrix under 'covariance/jobid/fitnum
                for cov_folder, cov_matrix in cov_matrices.items():
                    cov_path = os.path.join('covariance', cov_folder)
                    hdf_file.append(cov_path, cov_matrix)
                # Generator info
                hdf_file.append(
                    'input_values',
                    pd.DataFrame.from_dict(randomizer.get_input_values(),
                                           orient='index'))

            logger.info("Written output to %s", toy_fit_file)
            if 'link-from' in config:
                logger.info("Linked to %s", config['link-from'])
    except OSError as excp:
        logger.error(str(excp))
        raise
    except ValueError as error:
        logger.exception("Exception on dataset saving")
        raise RuntimeError(str(error))
Exemplo n.º 10
0
def run(config_files, link_from):
    """Run the script.

    Submit MC generation jobs according to the configuration.

    Arguments:
        config_files (list[str]): Path to the configuration files.
        link_from (str): Path to link the results from.

    Return:
        int: Number of submitted jobs.

    Raise:
        OSError: If the configuration file does not exist.
        KeyError: If some configuration data are missing.
        ValueError: If no suitable batch backend is found.
        RuntimeError: If something goes wrong during submission.

    """
    try:
        config = _config.load_config(*config_files,
                                     validate=[
                                         'event-type', 'simulation-version',
                                         'year', 'magnet-polarity',
                                         'prod/nevents', 'prod/nevents-per-job'
                                     ])
    except OSError as error:
        # Bug fix: the message was previously passed as a second OSError
        # argument ("%s", config_files) and never interpolated.
        raise OSError(
            "Cannot load configuration files: {}".format(config_files)) from error
    except _config.ConfigError as error:
        if 'event-type' in error.missing_keys:
            logger.error("No event type was specified in the config file!")
        if 'simulation-version' in error.missing_keys:
            logger.error(
                "No simulation version was specified in the config file!")
        if 'year' in error.missing_keys:
            logger.error(
                "No simulation year was specified in the config file!")
        if 'magnet-polarity' in error.missing_keys:
            logger.error(
                "No magnet polarity was specified in the config file!")
        if 'prod/nevents' in error.missing_keys:
            logger.error(
                "The number of events to produce was not specified in the config file!"
            )
        if 'prod/nevents-per-job' in error.missing_keys:
            logger.error(
                "The number of events per job was not specified in the config file!"
            )
        raise KeyError(
            "ConfigError raised -> {}".format(error.missing_keys)) from error
    except KeyError as error:
        logger.error("YAML parsing error -> %s", error)
        raise
    # Event type: either a numeric event-type ID or a path to a decfile
    evt_type = config['event-type']
    try:
        evt_type = int(evt_type)
    except ValueError:  # There's non-numerical chars, we assume it's a path
        decfile = evt_type if os.path.isabs(evt_type) else os.path.abspath(
            evt_type)
        evt_type = os.path.splitext(os.path.split(decfile)[1])[0]
    else:
        decfile = '$DECFILESROOT/options/{}.py'.format(evt_type)
    # Prepare job
    _, _, log_file = _paths.prepare_path(
        name='mc/{}'.format(evt_type),
        path_func=_paths.get_log_path,
        link_from=link_from)  # No linking is done for logs
    # MC config
    sim_version = config['simulation-version'].lower()
    year = int(config['year'])
    # NOTE(review): lstrip removes *characters* (any of 'm','a','g','n','e','t'),
    # not a prefix; this works for 'magup'/'magdown' but would mangle other
    # values -- confirm the allowed inputs before changing.
    magnet_polarity = config['magnet-polarity'].lower().lstrip(
        'magnet').lstrip('mag')
    remove_detector = config.get('remove-detector', True)
    # Prepare paths
    do_link, output_path, output_path_link = _paths.prepare_path(
        name='',
        path_func=_paths.get_genlevel_mc_path,
        link_from=link_from,
        evt_type=evt_type,
        sim_version=sim_version,
        year=year,
        magnet_polarity=magnet_polarity,
        remove_detector=remove_detector)
    link_status = 'true' if do_link else 'false'
    try:
        options = get_gaudirun_options(sim_version, year, magnet_polarity,
                                       remove_detector)
        gauss_version = get_gauss_version(sim_version, year)
        dddb_tag, conddb_tag = get_db_tags(sim_version, year, magnet_polarity)
    except KeyError as error:
        logger.error("Unknown Gauss configuration")
        raise KeyError(str(error)) from error
    # Add compression and our decfile
    options.append('$APPCONFIGOPTS/Persistency/Compression-ZLIB-1.py')
    options.append(decfile)
    # Prepare to submit
    nevents = min(config['prod']['nevents-per-job'], config['prod']['nevents'])
    logger.info("Generating %s events of decfile -> %s", nevents, decfile)
    logger.info("Output path: %s", output_path)
    logger.info("Log file location: %s", os.path.dirname(log_file))
    if do_link:
        logger.info("Linking to %s", output_path_link)
    extra_config = {
        'workdir': '$TMPDIR',
        'do_link': link_status,
        'gaudirun_options': ' '.join(options),
        'gauss_version': gauss_version,
        'dddb_tag': dddb_tag,
        'conddb_tag': conddb_tag,
        'output_extension': 'xgen' if remove_detector else 'sim',
        'output_path': output_path,
        'output_path_link': output_path_link,
        'n_events': nevents
    }
    # Prepare batch: a ValueError from get_batch_system (no suitable backend)
    # simply propagates, as documented above.
    batch_config = config.get('batch', {})
    batch_system = get_batch_system(batch_config.get('backend', None))
    # Submit enough jobs to cover the requested number of events
    njobs = int(
        ceil(1.0 * config['prod']['nevents'] /
             config['prod']['nevents-per-job']))
    logger.info("About to send %s jobs with %s events each.", njobs, nevents)
    for _ in range(njobs):
        # Submit
        try:
            job_id = batch_system.submit_job('MC_%s' % evt_type,
                                             SCRIPT,
                                             log_file,
                                             extra_config=extra_config,
                                             **batch_config)
            if 'submit error' in job_id:
                # Some backends report failure in the returned string instead
                # of raising; funnel it into the common error path below.
                logger.error(job_id)
                raise Exception
            logger.debug("Submitted job -> %s", job_id)
        except Exception as error:
            logger.exception('Error submitting MC production job')
            raise RuntimeError('Error submitting MC production job') from error
    return njobs
Exemplo n.º 11
0
def run(config_files, link_from):
    """Run the script.

    If the efficiency file exists, only the plots are remade.

    Arguments:
        config_files (list[str]): Path to the configuration files.
        link_from (str): Path to link the results from.

    Raise:
        OSError: If either the configuration file does not exist or some
            of the input files cannot be found.
        KeyError: If some configuration data are missing.
        ValueError: If there is any problem in configuring the efficiency model.
        RuntimeError: If there is a problem during the efficiency fitting.

    """
    try:
        config = _config.load_config(*config_files,
                                     validate=[
                                         'name', 'data/source', 'data/tree',
                                         'parameters', 'model', 'variables'
                                     ])
    except OSError as error:
        # Chain the original exception so the underlying cause is preserved.
        raise OSError(
            "Cannot load configuration files: {}".format(config_files)) \
            from error
    except _config.ConfigError as error:
        # Emit one targeted message per missing mandatory key.
        if 'name' in error.missing_keys:
            logger.error("No name was specified in the config file!")
        # Fixed: this used to test 'data/file', which is not in the validation
        # list above ('data/source' is), so the message could never fire.
        if 'data/source' in error.missing_keys:
            logger.error("No input data specified in the config file!")
        # Fixed: this used to duplicate the 'data/source' message verbatim.
        if 'data/tree' in error.missing_keys:
            logger.error("No input tree specified in the config file!")
        if 'model' in error.missing_keys:
            logger.error("No efficiency model specified in the config file!")
        if 'parameters' in error.missing_keys:
            logger.error(
                "No efficiency model parameters specified in the config file!")
        if 'variables' in error.missing_keys:
            logger.error(
                "No efficiency variables to model have been specified in the config file!"
            )
        raise KeyError("ConfigError raised -> {}".format(error.missing_keys))
    except KeyError as error:
        logger.error("YAML parsing error -> %s", error)
        raise
    # Do checks and load things
    plot_files = {}
    if config.get('plot', False):
        for var_name in config['variables']:
            plot_files[var_name] = get_efficiency_plot_path(config['name'],
                                                            var=var_name)
    efficiency_class = get_efficiency_model_class(config['model'])
    if not efficiency_class:
        raise ValueError("Unknown efficiency model -> {}".format(
            config['model']))
    # Let's do it: only load data if the efficiency file or any plot is missing
    # pylint: disable=E1101
    if not all(os.path.exists(file_name)
               for file_name in plot_files.values()) or \
            not os.path.exists(_paths.get_efficiency_path(config['name'])):
        logger.info("Loading data, this may take a while...")
        weight_var = config['data'].get('weight-var-name', None)
        # Prepare data: force pandas output and restrict to the model variables
        config['data']['output-format'] = 'pandas'
        config['data']['variables'] = list(config['variables'])
        if weight_var:
            config['data']['variables'].append(weight_var)
        input_data = get_data(config['data'], **{'output-format': 'pandas'})
        if weight_var:
            logger.info("Data loaded, using %s as weight", weight_var)
        else:
            logger.info("Data loaded, not using any weights")

        if not os.path.exists(_paths.get_efficiency_path(config['name'])):
            logger.info("Fitting efficiency model")
            try:
                eff = efficiency_class.fit(input_data, config['variables'],
                                           weight_var, **config['parameters'])
            except (ValueError, TypeError) as error:
                # Fixed: `error.message` does not exist on Python 3 exceptions
                # and raised an AttributeError while reporting; format the
                # exception itself instead.
                raise ValueError(
                    "Cannot configure the efficiency model -> {}".format(
                        error))
            except KeyError as error:
                raise RuntimeError("Missing key -> {}".format(error))
            except Exception as error:
                raise RuntimeError(error)
            output_file = eff.write_to_disk(config['name'], link_from)
            logger.info("Written efficiency file -> %s", output_file)
        else:
            logger.warning(
                "Output efficiency already exists, only redoing plots")
            eff = load_efficiency_model(config['name'])
        if plot_files:
            import seaborn as sns
            sns.set_style("white")
            plt.style.use('file://{}'.format(
                os.path.join(get_global_var('STYLE_PATH'),
                             'matplotlib_LHCb.mplstyle')))
            plots = eff.plot(input_data,
                             weight_var,
                             labels=config.get('plot-labels', {}))
            for var_name, plot in plots.items():
                logger.info("Plotting '%s' efficiency -> %s", var_name,
                            plot_files[var_name])
                plot.savefig(plot_files[var_name], bbox_inches='tight')
    else:
        logger.info("Efficiency file exists: %s. Nothing to do!",
                    _paths.get_efficiency_path(config['name']))
Exemplo n.º 12
0
    def run(self, script_to_run):
        """Run the script.

        If the output exists and no extension or overwrite has been configured, nothing
        is done.

        Arguments:
            script_to_run (str): Script to run in the cluster.

        Raise:
            AssertionError: If the qsub command cannot be found.
            AttributeError: If non-matching configuration file was found.
            OSError: If there is a problem preparing the output path.

        """
        # Flatten the nested config so keys like NTOYS_KEY can be looked up flat.
        flat_config = dict(_config.unfold_config(self.config))
        # Check if it has not been produced yet
        # pylint: disable=E1102
        config_file_dest = self.TOY_CONFIG_PATH_GETTER(self.config['name'])
        # First check the config (we may have already checked)
        if os.path.exists(config_file_dest):  # It exists, check they match
            config_dest = _config.load_config(config_file_dest)
            # Only differences outside the allowed set count as a conflict.
            if _config.compare_configs(flat_config, config_dest).difference(self.allowed_config_diffs):
                logger.error("Non-matching configuration already exists with that name!")
                raise AttributeError()
        # Now check output
        _, expected_src, expected_dest = _paths.prepare_path(name=self.config['name'],
                                                             path_func=self.TOY_PATH_GETTER,
                                                             link_from=self.config['link-from'])
        # Check file existence
        if os.path.exists(expected_src):
            logger.warning("Output data file exists! %s", expected_src)
            if self.overwrite:
                # Overwrite requested: clear both the source and its link.
                os.remove(expected_src)
                if os.path.exists(expected_dest):
                    os.remove(expected_dest)
            else:
                # Create the symlink if necessary
                if not os.path.exists(expected_dest):
                    os.symlink(expected_src, expected_dest)
                if not self.extend:
                    # Output exists and neither extend nor overwrite was asked for.
                    logger.info("Nor --extend nor --overwrite have been specified. Nothing to do.")
                    return
        # Source doesn't exist, delete the destination if needed
        else:
            if os.path.exists(expected_dest):
                os.remove(expected_dest)
        # Some bookkeeping
        if not os.path.exists(script_to_run):
            raise OSError("Cannot find {}!".format(script_to_run))
        # Build the command-line arguments forwarded to the batch script.
        script_args = []
        if self.config['link-from']:
            script_args.append('--link-from={}'.format(self.config['link-from']))
        if self.verbose:
            script_args.append('--verbose')
        script_args.append(config_file_dest)
        # Prepare paths
        # pylint: disable=E1101
        _, log_file_fmt, _ = _paths.prepare_path(name=self.config['name'],
                                                 path_func=_paths.get_log_path,
                                                 link_from=None)  # No linking is done for logs
        # Calculate number of jobs and submit
        ntoys = flat_config[self.NTOYS_KEY]
        ntoys_per_job = flat_config.get(self.NTOYS_PER_JOB_KEY, ntoys)
        # Ceiling division: one extra job picks up the remainder of toys.
        n_jobs = int(1.0 * ntoys / ntoys_per_job)
        if ntoys % ntoys_per_job:
            n_jobs += 1
        # Submit!
        _config.write_config(self.config, config_file_dest)
        for _ in range(n_jobs):
            # Write the config file
            job_id = self.batch_system.submit_script(job_name=self.config['name'],
                                                     cmd_script=script_to_run,
                                                     script_args=script_args,
                                                     log_file=log_file_fmt,
                                                     **self.config.get('batch', {}))
            logger.info('Submitted JobID: %s', job_id)
Exemplo n.º 13
0
def test_global_replace(config_simple_globals_1, config_simple_globals_2,
                        config_simple_globals_target):
    """Loading two configs merges them into the expected target."""
    merged = load_config(config_simple_globals_1, config_simple_globals_2)
    assert merged == config_simple_globals_target
Exemplo n.º 14
0
def test_fails_loudly(config_simple_fail_noload):
    """Loading an invalid config raises ConfigError."""
    # Fixed: the exception-info object was bound to `error_info` but never
    # inspected, so no binding is needed.
    with pytest.raises(ConfigError):
        load_config(config_simple_fail_noload)
Exemplo n.º 15
0
def test_simple_signal(config_simple_load_signal, config_simple_load_target):
    """A signal config file loads to the expected target dictionary."""
    loaded = load_config(config_simple_load_signal)
    assert loaded == config_simple_load_target
Exemplo n.º 16
0
def test_simple(config_simple_load, config_simple_load_target):
    """A plain config file loads to the expected target dictionary."""
    loaded = load_config(config_simple_load)
    assert loaded == config_simple_load_target
Exemplo n.º 17
0
def run(config_files, link_from):
    """Run the script.

    Arguments:
        config_files (list[str]): Path to the configuration files.
        link_from (str): Path to link the results from.

    Raise:
        KeyError: If some configuration data are missing.
        OSError: If either the configuration file does not exist or if
            there is a problem preparing the output path.
        ValueError: If there is any problem in configuring the PDF factories.
        RuntimeError: If there is a problem during the generation.

    """
    # Configure
    try:
        config = load_config(*config_files,
                             validate=['gen/nevents', 'name', 'gen-model'])
    except OSError:
        raise OSError(
            "Cannot load configuration files: {}".format(config_files))
    except ConfigError as error:
        # One targeted message per missing mandatory key.
        if 'gen/nevents' in error.missing_keys:
            logger.error("Number of events not specified")
        if 'name' in error.missing_keys:
            logger.error("No name was specified in the config file!")
        if 'gen-model' in error.missing_keys:
            logger.error(
                "No generation model were specified in the config file!")
        raise KeyError("ConfigError raised -> {}".format(error.missing_keys))
    except KeyError as error:
        logger.error("YAML parsing error -> %s", error)
        raise
    # Ignore renaming
    logger.info("Generating %s events", config['gen']['nevents'])
    logger.info("Generation job name: %s", config['name'])
    # A command-line link_from overrides the config file value.
    if link_from:
        config['link-from'] = link_from
    if 'link-from' in config:
        logger.info("Linking toy data from %s", config['link-from'])
    else:
        logger.debug("No linking specified")
    # Set seed: seed both NumPy's and ROOT's generators identically so the
    # whole toy is reproducible from the stored seed.
    job_id = get_job_id()
    seed = get_urandom_int(4)
    np.random.seed(seed=seed)
    ROOT.RooRandom.randomGenerator().SetSeed(seed)
    # Generate
    try:
        physics = configure_model(config['gen-model'])
    except KeyError as error:
        logger.error("Cannot find physics factory")
        raise ValueError('{}'.format(error))
    except ValueError:
        logger.error("Problem dealing with shared parameters")
        raise
    if isinstance(physics, (SumPhysicsFactory, SimultaneousPhysicsFactory)):
        logger.warning("Generating a RooAddPdf or a RooSimultaneous: "
                       "yields will be generated at a fixed value")
    try:
        # Per-job event count falls back to the full requested number.
        dataset = generate(
            physics, config['gen'].get('nevents-per-job',
                                       config['gen']['nevents']))
    except ValueError as error:
        logger.exception("Exception on generation")
        raise RuntimeError(str(error))
    # Get toy information: record each generator parameter's value.
    toy_info = {
        var.GetName(): [var.getVal()]
        for var in physics.get_gen_parameters()
    }
    # 'nevents' may be a per-category dict; store the total in that case.
    n_evts = sum(config['gen']['nevents'].values()) \
        if isinstance(config['gen']['nevents'], dict) \
        else config['gen']['nevents']
    toy_info.update({'seed': [seed], 'jobid': [job_id], 'nevents': n_evts})
    try:
        # Save dataset and its bookkeeping info in the same HDF file.
        with work_on_file(config['name'],
                          path_func=get_toy_path,
                          link_from=config.get('link-from')) as toy_file:
            with modify_hdf(toy_file) as hdf_file:
                hdf_file.append('data', dataset.assign(jobid=job_id))
                hdf_file.append('toy_info', pd.DataFrame(toy_info))
        # Say something
        logger.info("Written output to %s", toy_file)
        if 'link-from' in config:
            logger.info("Linked to %s", config['link-from'])
    except OSError as excp:
        logger.error(str(excp))
        raise
    except ValueError as error:
        logger.exception("Exception on dataset saving")
        raise RuntimeError(str(error))
Exemplo n.º 18
0
def run(config_files, link_from, verbose):
    """Run the script.

    Run a sample/fit sequence as many times as requested.

    Arguments:
        config_files (list[str]): Path to the configuration files.
        link_from (str): Path to link the results from.
        verbose (bool): Give verbose output?

    Raise:
        OSError: If either the configuration file does not exist or some
            of the input toys cannot be found.
        AttributeError: If the input data are incompatible with a previous fit.
        KeyError: If some configuration data are missing.
        ValueError: If there is any problem in configuring the PDF factories.
        RuntimeError: If there is a problem during the fitting.

    """
    try:
        config = _config.load_config(*config_files,
                                     validate=['fit/nfits', 'name', 'data'])
    except OSError:
        raise OSError(
            "Cannot load configuration files: {}".format(config_files))
    except ConfigError as error:
        # One targeted message per missing mandatory key.
        if 'fit/nfits' in error.missing_keys:
            logger.error("Number of fits not specified")
        if 'name' in error.missing_keys:
            logger.error("No name was specified in the config file!")
        if 'data' in error.missing_keys:
            logger.error("No input data specified in the config file!")
        raise KeyError("ConfigError raised -> {}".format(error.missing_keys))
    except KeyError as error:
        logger.error("YAML parsing error -> %s", error)
        # Fixed: the exception was previously swallowed here, letting
        # execution fall through with `config` unbound (NameError below).
        raise
    try:
        models = {
            model_name: config[model_name]
            for model_name in config['fit'].get('models', ['model'])
        }
    except KeyError as error:
        logger.error("Missing model configuration -> %s", str(error))
        raise KeyError("Missing model configuration")
    if not models:
        logger.error(
            "Empty list specified in the config file under 'fit/models'!")
        raise KeyError()
    fit_strategies = config['fit'].get('strategies', ['simple'])
    if not fit_strategies:
        logger.error("Empty fit strategies were specified in the config file!")
        raise KeyError()
    # Some info
    nfits = config['fit'].get('nfits-per-job', config['fit']['nfits'])
    logger.info("Doing %s sample/fit sequences", nfits)
    logger.info("Fit job name: %s", config['name'])
    if link_from:
        config['link-from'] = link_from
    if 'link-from' in config:
        logger.info("Linking toy data from %s", config['link-from'])
    else:
        logger.debug("No linking specified")
    # Analyze data requirements: either all sources declare a category or none.
    logger.info("Loading input data")
    data = {}
    gen_values = {}
    if len(set('category' in data_source
               for data_source in config['data'])) > 1:
        raise KeyError("Categories in 'data' not consistently specified.")
    for data_id, data_source in config['data'].items():
        try:
            source_toy = data_source['source']
        except KeyError:
            logger.error("Data source not specified")
            raise
        data[data_id] = (get_data({
            'source': source_toy,
            'source-type': 'toy',
            'tree': 'data',
            'output-format': 'pandas',
            'selection': data_source.get('selection')
        }), data_source['nevents'], data_source.get('poisson'),
                         data_source.get('category'))
        # Generator values: keep everything except bookkeeping columns.
        toy_info = get_data({
            'source': source_toy,
            'source-type': 'toy',
            'tree': 'toy_info',
            'output-format': 'pandas'
        })
        gen_values[data_id] = {}
        for var_name in toy_info.columns:
            if var_name in ('seed', 'jobid', 'nevents'):
                continue
            gen_values[data_id][var_name] = toy_info[var_name].iloc[0]
    try:
        fit_models = {}
        for model_name in models:
            if model_name not in config:
                raise KeyError(
                    "Missing model definition -> {}".format(model_name))
            fit_models[model_name] = configure_model(config[model_name])
            if any(yield_.isConstant()
                   for yield_ in fit_models[model_name].get_yield_vars()
                   if yield_):
                logger.warning(
                    "Model %s has constant yields. "
                    "Be careful when configuring the input data, you may need to disable poisson sampling",
                    model_name)
    except KeyError:
        logger.exception("Error loading model")
        raise ValueError("Error loading model")
    if len(set(model.is_extended() for model in fit_models.values())) == 2:
        logger.error("Mix of extended and non-extended models!")
        raise ValueError("Error loading fit models")
    # Let's check these generator values against the output file
    try:
        gen_values_frame = {}
        # pylint: disable=E1101
        with _paths.work_on_file(config['name'], _paths.get_toy_fit_path,
                                 config.get('link-from')) as toy_fit_file:
            with modify_hdf(toy_fit_file) as hdf_file:
                logger.debug("Checking generator values")
                test_gen = [('gen_{}'.format(data_source)) in hdf_file
                            for data_source in gen_values]
                if all(test_gen
                       ):  # The data were written already, crosscheck values
                    for source_id, gen_value in gen_values.items():
                        # Fixed: the HDF key was built from the stale
                        # `data_source` variable left over from an earlier
                        # loop; it must use the source being checked.
                        if not all(
                                hdf_file['gen_{}'.format(source_id)]
                            [var_name].iloc[0] == var_value
                                for var_name, var_value in gen_value.items()):
                            raise AttributeError(
                                "Generated and stored values don't match for source '{}'"
                                .format(source_id))
                elif not any(test_gen):  # No data were there, just overwrite
                    # Fixed: the loop variable used to shadow (and clobber)
                    # the `gen_values` dict it was iterating.
                    for source_id, source_gen_values in gen_values.items():
                        gen_data = {
                            'id':
                            source_id,
                            'source':
                            _paths.get_toy_path(
                                config['data'][source_id]['source']),
                            'nevents':
                            config['data'][source_id]['nevents']
                        }
                        gen_data.update(source_gen_values)
                        gen_values_frame['gen_{}'.format(
                            source_id)] = pd.DataFrame([gen_data])
                else:
                    raise AttributeError("Inconsistent number of data sources")
    except OSError as excp:
        logger.error(str(excp))
        raise
    # Now load the acceptance
    try:
        acceptance = get_acceptance(config['acceptance']) \
            if 'acceptance' in config \
            else None
    except ConfigError as error:
        raise KeyError("Error loading acceptance -> {}".format(error))
    # Prepare output
    gen_events = defaultdict(list)
    # Set seed: derive it from the batch job id when available so parallel
    # jobs don't repeat each other.
    job_id = get_job_id()
    if job_id:
        seed = int(job_id.split('.')[0])
    else:
        import random
        job_id = 'local'
        seed = random.randint(0, 100000)
    np.random.seed(seed=seed)
    ROOT.RooRandom.randomGenerator().SetSeed(seed)
    # Start looping
    fit_results = defaultdict(list)
    logger.info("Starting sampling-fit loop (print frequency is 20)")
    initial_mem = memory_usage()
    initial_time = default_timer()
    for fit_num in range(nfits):
        # Logging
        if (fit_num + 1) % 20 == 0:
            logger.info("  Fitting event %s/%s", fit_num + 1, nfits)
        # Get a compound dataset: reseed both RNGs per iteration so each
        # sample is reproducible from its stored seed.
        seed = get_urandom_int(4)
        np.random.seed(seed=seed)
        ROOT.RooRandom.randomGenerator().SetSeed(seed)
        try:
            logger.debug("Sampling input data")
            datasets, sample_sizes = get_datasets(data, acceptance, fit_models)
            for sample_name, sample_size in sample_sizes.items():
                gen_events['N^{{{}}}_{{gen}}'.format(sample_name)].append(
                    sample_size)
            logger.debug("Sampling finalized")
        except KeyError:
            logger.exception("Bad data configuration")
            raise
        logger.debug("Fitting")
        for model_name in models:
            dataset = datasets.pop(model_name)
            fit_model = fit_models[model_name]
            # Now fit
            for fit_strategy in fit_strategies:
                toy_key = (model_name, fit_strategy)
                try:
                    fit_result = fit(fit_model,
                                     model_name,
                                     fit_strategy,
                                     dataset,
                                     verbose,
                                     Extended=config['fit'].get(
                                         'extended', False),
                                     Minos=config['fit'].get('minos', False))
                except ValueError:
                    raise RuntimeError()
                # Now results are in fit_parameters
                result_roofit = FitResult.from_roofit(fit_result)
                result = result_roofit.to_plain_dict()
                result['cov_matrix'] = result_roofit.get_covariance_matrix()
                # Materialize the keys view: it is used below as a DataFrame
                # index after the result dict has been copied around.
                result['param_names'] = list(
                    result_roofit.get_fit_parameters().keys())
                result['fitnum'] = fit_num
                result['seed'] = seed
                fit_results[toy_key].append(result)
                # Explicitly destruct ROOT objects to avoid leaks.
                _root.destruct_object(fit_result)
            _root.destruct_object(dataset)
        logger.debug("Cleaning up")
    logger.info("Fitting loop over")
    logger.info("--> Memory leakage: %.2f MB/sample-fit",
                (memory_usage() - initial_mem) / nfits)
    logger.info("--> Spent %.0f ms/sample-fit",
                (default_timer() - initial_time) * 1000.0 / nfits)
    logger.info("Saving to disk")
    data_res = []
    cov_matrices = {}
    # Get gen values for this model
    for (model_name, fit_strategy), fits in fit_results.items():
        for fit_res in fits:
            fit_res = fit_res.copy()
            fit_res['model_name'] = model_name
            fit_res['fit_strategy'] = fit_strategy

            cov_folder = os.path.join(str(job_id), str(fit_res['fitnum']))
            param_names = fit_res.pop('param_names')
            cov_matrices[cov_folder] = pd.DataFrame(fit_res.pop('cov_matrix'),
                                                    index=param_names,
                                                    columns=param_names)
            data_res.append(fit_res)
    data_frame = pd.DataFrame(data_res)
    # Attach the per-sample generated yields and a constant jobid column.
    fit_result_frame = pd.concat([
        pd.DataFrame(gen_events), data_frame,
        pd.concat([pd.DataFrame({'jobid': [job_id]})] *
                  data_frame.shape[0]).reset_index(drop=True)
    ],
                                 axis=1)
    try:
        # pylint: disable=E1101
        with _paths.work_on_file(
                config['name'],
                path_func=_paths.get_toy_fit_path,
                link_from=config.get('link-from')) as toy_fit_file:
            with modify_hdf(toy_fit_file) as hdf_file:
                # First fit results
                hdf_file.append('fit_results', fit_result_frame)
                # Save covariance matrix under 'covariance/jobid/fitnum'
                for cov_folder, cov_matrix in cov_matrices.items():
                    cov_path = os.path.join('covariance', cov_folder)
                    hdf_file.append(cov_path, cov_matrix)
                # Generator info
                for key_name, gen_frame in gen_values_frame.items():
                    hdf_file.append(key_name, gen_frame)

            logger.info("Written output to %s", toy_fit_file)
            if 'link-from' in config:
                logger.info("Linked to %s", config['link-from'])
    except OSError as excp:
        logger.error(str(excp))
        raise
    except ValueError as error:
        logger.exception("Exception on dataset saving")
        raise RuntimeError(str(error))