def from_yaml_file(name):
    """Load a fit result from its YAML file.

    The file location is resolved through `get_fit_result_path`.

    Arguments:
        name (str): Name of the fit result.

    Return:
        self

    Raise:
        OSError: If the file cannot be found.
        KeyError: If any of the FitResult data is missing from the input file.

    """
    required_keys = ('fit-parameters',
                     'fit-parameters-initial',
                     'covariance-matrix/quality',
                     'covariance-matrix/matrix',
                     'status')
    file_path = _paths.get_fit_result_path(name)
    try:
        return FitResult.from_yaml(load_config(file_path, validate=required_keys))
    except ConfigError as error:
        missing = ','.join(error.missing_keys)
        raise KeyError("Missing keys in input file -> {}".format(missing))
def load_data(config_file, key=None, **kwargs):
    """Load from file.

    Arguments:
        config_file (str): Name of the configuration file.
        key (str, optional): Key to load in the configuration file. If none
            is given, the root of the YAML file will be used as configuration.
        **kwargs (dict): Dictionary to override keys from the dictionary.

    Return:
        object: Data object.

    Raise:
        FileNotFoundError: If the config file cannot be loaded.
        ConfigError: If the validation of the ConfigFile fails.

    """
    abs_path = os.path.abspath(config_file)
    if not os.path.exists(abs_path):
        raise FileNotFoundError("Cannot find config file -> {}".format(abs_path))
    data_config = load_config(abs_path,
                              root=key,
                              validate=['source', 'tree', 'output-format'])
    return get_data(data_config, **kwargs)
def get_acceptance(config):
    """Get an acceptance object.

    Arguments:
        config (dict): Acceptance to load. Its keys are:
            + `variables` (list[str]): List of variable names.
            + `generation` (dict): Generation configuration. It needs to have
                a `name` entry, which corresponds to the name of the generator
                efficiency. Any other key will be passed to `get_efficiency`
                as `extra_parameters`.
            + `reconstruction` (dict): Reconstruction configuration. It needs
                to have a `name` entry, which corresponds to the name of the
                reconstruction efficiency. Any other key will be passed to
                `get_efficiency` as `extra_parameters`.

    Return:
        `analysis.efficiency.acceptance.Acceptance`: Acceptance object.

    Raise:
        analysis.utils.config.ConfigError: If the input config is missing keys.
            See `analysis.utils.config.load_config`.

    """
    config_keys = [key for key, _ in unfold_config(config)]
    # missing_keys should be empty if the needed keys have been provided. Otherwise complain!
    missing_keys = {'variables', 'generation/name', 'reconstruction/name'} - set(config_keys)
    if missing_keys:
        raise ConfigError("Missing configuration key! -> {}".format(missing_keys))

    def _load_efficiency(efficiency_config):
        """Load one efficiency model from its named config node."""
        # Work on a copy so the caller's config dict is not mutated by the pop
        # (the original popped 'name' straight out of the input dict).
        params = dict(efficiency_config)
        name = params.pop('name')
        return get_efficiency_model(
            load_config(get_efficiency_path(name),
                        validate=('model', 'variables', 'parameters')),
            **params)

    # Load the efficiencies
    gen_efficiency = _load_efficiency(config['generation'])
    reco_efficiency = _load_efficiency(config['reconstruction'])
    # Check the variables
    if set(config['variables']) != set(gen_efficiency.get_variables()):
        raise ConfigError("Mismatch in variables between acceptance and generation")
    if set(config['variables']) != set(reco_efficiency.get_variables()):
        raise ConfigError("Mismatch in variables between acceptance and reconstruction")
    # Now create the acceptance
    return Acceptance(config['variables'], gen_efficiency, reco_efficiency)
def run(config_files):
    """Run the script.

    Analyze the toys according to the configuration.

    Arguments:
        config_files (list[str]): Path to the configuration files.

    Raise:
        OSError: If the configuration file or some other input does not exist.
        KeyError: If some configuration data are missing.
        RuntimeError: If there is a problem during the analysis.

    """
    # Load and validate the configuration
    try:
        config = _config.load_config(*config_files,
                                     validate=['toys-to-analyze', 'analysis'])
    except OSError:
        raise OSError("Cannot load configuration files: {}".format(config_files))
    except _config.ConfigError as error:
        for missing_key, message in (('toys-to-analyze', "Toys to analyze not specified"),
                                     ('analysis', "Analysis actions not specified")):
            if missing_key in error.missing_keys:
                logger.error(message)
        raise KeyError("ConfigError raised -> {}".format(error.missing_keys))
    except KeyError as error:
        logger.error("YAML parsing error -> %s", error)
        raise
    # Locate the input toys
    input_toys = _paths.get_toy_fit_path(config['toys-to-analyze'])
    if not os.path.exists(input_toys):
        raise OSError("Cannot find input toy file: {}".format(input_toys))
    # A single task may be given as a bare dict; normalize to a list
    tasks = config['analysis']
    if not isinstance(tasks, list):
        tasks = [tasks]
    task_results = []
    with pd.HDFStore(input_toys) as hdf_file:
        for task_config in tasks:
            try:
                action = task_config.pop('action')
            except KeyError:
                logger.error("Missing analysis task action -> %s", task_config)
                raise KeyError("Malformed analysis task")
            if action not in ANALYSIS_TASKS:
                raise KeyError("Unknown analysis task -> {}".format(action))
            try:
                result = ANALYSIS_TASKS[action](hdf_file, task_config)
            except ValueError as error:
                raise RuntimeError(repr(error))
            # Tasks may return a single dict or a list of dicts
            if isinstance(result, dict):
                result = [result]
            if not isinstance(result, list):
                raise RuntimeError("Wrong format for task result -> {}".format(type(result)))
            task_results.extend(result)
def __init__(self, config_files, link_from, extend, overwrite, verbose=False):
    """Configure the toy submitter.

    Arguments:
        config_files (list[str]): Configuration files.
        link_from (str): Storage to link from.
        extend (bool): Extend the production?
        overwrite (bool): Overwrite an existing production?
        verbose (bool, optional): Give verbose output? Defaults to False.

    Raise:
        NotImplementedError: If some of the mandatory attributes are not set.
        OSError: If there is a problem with the configuration file, either in
            loading or validation.
        KeyError: If mandatory configuration keys are missing.
        ValueError: If conflicting options are passed.

    """
    # Check the getters: subclasses must define these class attributes
    if any(getter is None for getter in (self.TOY_PATH_GETTER,
                                         self.TOY_CONFIG_PATH_GETTER,
                                         self.NTOYS_KEY,
                                         self.NTOYS_PER_JOB_KEY)):
        raise NotImplementedError("Getters not implemented")
    try:
        config = _config.load_config(*config_files,
                                     validate=self.VALIDATION.keys())
    except _config.ConfigError as error:
        # Log a specific message for each missing mandatory key
        for key, error_message in self.VALIDATION.items():
            if key in error.missing_keys:
                logger.error(error_message)
        raise KeyError("Missing configuration keys -> {}".format(error.missing_keys))
    except OSError:
        raise OSError("Cannot load configuration file: {}".format(config_files))
    # Check conflicting arguments
    if extend and overwrite:
        # Fixed wording: the two options are mutually exclusive
        logger.error("The --extend and --overwrite options cannot be specified at the same time!")
        raise ValueError("Cannot specify both --extend and --overwrite")
    # Store information
    self.config = config
    self.allowed_config_diffs = set([self.NTOYS_KEY,
                                     self.NTOYS_PER_JOB_KEY,
                                     'batch/runtime'] +
                                    self.ALLOWED_CONFIG_DIFFS)
    # Assign link-from giving priority to the argument
    self.config['link-from'] = link_from if link_from else config.get('link-from')
    self.link_from = link_from
    self.extend = extend
    self.overwrite = overwrite
    self.verbose = verbose
    # Get the batch system
    self.batch_system = get_batch_system()
def load_efficiency_model(model_name, **extra_parameters):
    """Load efficiency from file.

    The file path is determined from the `name` using the
    `paths.get_efficiency_path` function.

    Arguments:
        model_name (str): Name of the efficiency model.
        **extra_parameters (dict): Extra configuration parameters to override
            the entries in the `parameters` node loaded from the efficiency file.

    Raise:
        OSError: If the efficiency file does not exist.
        analysis.utils.config.ConfigError: If there is a problem with the
            efficiency model.

    """
    config_path = get_efficiency_path(model_name)
    if not os.path.exists(config_path):
        raise OSError("Cannot find efficiency file -> {}".format(config_path))
    model_config = load_config(config_path,
                               validate=('model', 'variables', 'parameters'))
    return get_efficiency_model(model_config, **extra_parameters)
def load_acceptance(name, **extra_parameters):
    """Load an acceptance configuration file.

    The file path is determined from the `name` using the
    `paths.get_acceptance_path` function.

    Note:
        For the exact configuration, see `get_acceptance`.

    Arguments:
        name (str): Name of the acceptance.
        **extra_parameters (dict): Extra configuration parameters to override
            the entries in the `parameters` nodes from the `generation` and
            `reconstruction` efficiencies. As such, the extra parameters need
            to be placed under the `generation` or `reconstruction` keys.
            For example:

            >>> load_acceptance('Test', reconstruction={'rename-vars':{'acc_q2':'q2', 'acc_cosThetaL':'ctl'}})

    Return:
        `analysis.efficiency.Acceptance`: Acceptance object.

    Raise:
        OSError: If the efficiency file does not exist.
        analysis.utils.config.ConfigError: If there is a problem with the
            efficiency model.

    """
    # pylint: disable=E1101
    acceptance_path = get_acceptance_path(name)
    if not os.path.exists(acceptance_path):
        raise OSError("Cannot find efficiency file -> {}".format(acceptance_path))
    config = load_config(acceptance_path,
                         validate=('variables', 'generation', 'reconstruction'))
    # Merge the caller overrides into each efficiency node
    for node in ('generation', 'reconstruction'):
        config[node].update(extra_parameters.get(node, {}))
    return get_acceptance(config)
def main():
    """Toy fitting submission application.

    Parses the command line, configures the toy fitters and submit the
    jobs, catching intermediate errors and transforming them to status codes.

    Status codes:
        0: All good.
        1: Error in the configuration files.
        2: Error in preparing the output folders.
        3: Conflicting options given.
        4: A non-matching configuration file was found in the output.
        5: The queue submission command cannot be found.
        128: Uncaught error. An exception is logged.

    """
    def flatten(list_, typ_):
        """Flatten a list."""
        # sum() concatenates the nested sequences onto the empty `typ_`
        # accumulator, e.g. flatten([(1,), (2,)], tuple()) -> [1, 2].
        return list(sum(list_, typ_))

    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--verbose',
                        action='store_true',
                        help="Verbose output")
    parser.add_argument('--link-from',
                        action='store', type=str,
                        help="Folder to actually store the toy files")
    parser.add_argument('--extend',
                        action='store_true', default=False,
                        help="Extend previous production")
    parser.add_argument('--overwrite',
                        action='store_true', default=False,
                        help="Overwrite previous production")
    parser.add_argument('config',
                        action='store', type=str, nargs='+',
                        help="Configuration file")
    args = parser.parse_args()
    if args.verbose:
        # Level 1 is the most verbose logging level
        get_logger('analysis').setLevel(1)
        logger.setLevel(1)
    try:
        config = _config.load_config(*args.config)
        # Which type of toy are we running?
        script_to_run = None
        submitter = None
        for toy_type, (toy_class, script_name) in TOY_TYPES.items():
            if toy_type in config:
                script_to_run = script_name
                submitter = toy_class
        if submitter is None:
            raise KeyError("Unknown job type")
        # Is there something to scan?
        scan_config = 'scan' in config
        if scan_config:
            config_files = []
            base_config = _config.unfold_config(config)
            # Each scan group maps config keys to equal-length value lists;
            # process_scan_val expands the value specification strings.
            scan_groups = []
            for scan_group in config['scan']:
                scan_group_dict = {}
                for key, val_str in scan_group.items():
                    scan_group_dict[key] = process_scan_val(val_str, scan_group_dict)
                scan_groups.append(scan_group_dict)
            # Check lengths: within a group all value lists must match
            if not all(len({len(val) for val in scan_group.values()}) == 1
                       for scan_group in scan_groups):
                raise ValueError("Unmatched length in scan parameters")
            # Build values to scan: cartesian product across groups, with
            # the keys of a given group varying together
            keys, values = list(zip(*[zip(*scan_group.items())
                                      for scan_group in scan_groups]))
            keys = flatten(keys, tuple())
            for value_tuple in itertools.product(*[zip(*val) for val in values]):
                values = dict(zip(keys, flatten(value_tuple, tuple())))
                temp_config = dict(base_config)
                del temp_config['scan']
                # The job name is templated with the current scan values
                temp_config['name'] = temp_config['name'].format(**values)
                for key, value in values.items():
                    temp_config[key] = value
                logger.debug("Creating configuration %s for scan values -> %s",
                             temp_config['name'],
                             ", ".join('{}: {}'.format(*val) for val in values.items()))
                # Write temp_file (delete=False so it survives the context
                # exit; it is removed after submission below)
                with tempfile.NamedTemporaryFile(delete=False) as file_:
                    file_name = file_.name
                _config.write_config(_config.fold_config(list(temp_config.items())), file_name)
                config_files.append(file_name)
        else:
            config_files = args.config
    # pylint: disable=W0702
    except:
        # Any failure up to here is a configuration problem -> status 1
        logger.exception("Bad configuration given")
        parser.exit(1)
    try:
        script_to_run = os.path.join(get_global_var('BASE_PATH'),
                                     'toys',
                                     script_to_run)
        for config_file in config_files:
            submitter(config_files=[config_file],
                      link_from=args.link_from,
                      extend=args.extend,
                      overwrite=args.overwrite,
                      verbose=args.verbose).run(script_to_run, )
            # Scan configurations are temporary files, clean them up
            if scan_config:
                os.remove(config_file)
        exit_status = 0
    except KeyError:
        logger.error("Bad configuration given")
        exit_status = 1
    except OSError as error:
        logger.error(str(error))
        exit_status = 2
    except ValueError:
        logger.error("Conflicting options found")
        exit_status = 3
    except AttributeError:
        logger.error("Mismatching configuration found")
        exit_status = 4
    except AssertionError:
        logger.error("Cannot find the queue submission command")
        exit_status = 5
    # pylint: disable=W0703
    except Exception as error:
        exit_status = 128
        logger.exception('Uncaught exception -> %s', repr(error))
    finally:
        # Translate the collected status into the process exit code
        parser.exit(exit_status)
def run(config_files, link_from, verbose):
    """Run the script.

    Run a generate/fit sequence as many times as requested.

    Arguments:
        config_files (list[str]): Path to the configuration files.
        link_from (str): Path to link the results from.
        verbose (bool): Give verbose output?

    Raise:
        OSError: If the configuration file or some other input does not exist.
        AttributeError: If the input data are incompatible with a previous fit.
        KeyError: If some configuration data are missing.
        ValueError: If there is any problem in configuring the PDF factories.
        RuntimeError: If there is a problem during the fitting.

    """
    try:
        config = _config.load_config(*config_files,
                                     validate=['syst/ntoys', 'name', 'randomizer'])
    except OSError:
        raise OSError("Cannot load configuration files: {}".format(config_files))
    except _config.ConfigError as error:
        if 'syst/ntoys' in error.missing_keys:
            logger.error("Number of toys not specified")
        if 'name' in error.missing_keys:
            logger.error("No name was specified in the config file!")
        if 'randomizer' in error.missing_keys:
            logger.error("No randomizer configuration specified in config file!")
        raise KeyError("ConfigError raised -> {}".format(error.missing_keys))
    except KeyError as error:
        logger.error("YAML parsing error -> %s", error)
        raise
    model_name = config['syst'].get('model', 'model')  # TODO: 'model' returns name?
    try:
        model_config = config[model_name]
    except KeyError as error:
        logger.error("Missing model configuration -> %s", str(error))
        raise KeyError("Missing model configuration")
    # Load fit model twice: one nominal, one whose parameters get randomized
    try:
        fit_model = configure_model(copy.deepcopy(model_config))
        randomizer_model = configure_model(copy.deepcopy(model_config))
    except KeyError:
        logger.exception('Error loading model')
        raise ValueError('Error loading model')
    # Some info: without 'ntoys-per-job' all toys run in this job
    ntoys = config['syst'].get('ntoys-per-job', config['syst']['ntoys'])
    logger.info("Doing %s generate/fit sequences", ntoys)
    logger.info("Systematics job name: %s", config['name'])
    if link_from:
        config['link-from'] = link_from
    if 'link-from' in config:
        logger.info("Linking toy data from %s", config['link-from'])
    else:
        logger.debug("No linking specified")
    # Now load the acceptance (optional)
    try:
        acceptance = get_acceptance(config['acceptance']) \
            if 'acceptance' in config \
            else None
    except _config.ConfigError as error:
        raise KeyError("Error loading acceptance -> {}".format(error))
    # Fit strategy
    fit_strategy = config['syst'].get('strategy', 'simple')
    # Load randomizer configuration
    randomizer = get_randomizer(config['randomizer'])(model=randomizer_model,
                                                      config=config['randomizer'],
                                                      acceptance=acceptance)
    # Set seed
    job_id = get_job_id()
    # Start looping
    fit_results = {}
    logger.info("Starting sampling-fit loop (print frequency is 20)")
    initial_mem = memory_usage()
    initial_time = default_timer()
    do_extended = config['syst'].get('extended', False)
    do_minos = config['syst'].get('minos', False)
    for fit_num in range(ntoys):
        # Logging
        if (fit_num + 1) % 20 == 0:
            logger.info(" Fitting event %s/%s", fit_num + 1, ntoys)
        # Generate a dataset with a fresh OS-entropy seed for both numpy and ROOT
        seed = get_urandom_int(4)
        np.random.seed(seed=seed)
        ROOT.RooRandom.randomGenerator().SetSeed(seed)
        try:
            # Get a randomized dataset and fit it with the nominal fit
            dataset = randomizer.get_dataset(randomize=True)
            gen_values = randomizer.get_current_values()
            fit_result_nominal = fit(fit_model,
                                     model_name,
                                     fit_strategy,
                                     dataset,
                                     verbose,
                                     Extended=do_extended,
                                     Minos=do_minos)
            # Fit the randomized dataset with the randomized values as nominal
            fit_result_rand = fit(randomizer_model,
                                  model_name,
                                  fit_strategy,
                                  dataset,
                                  verbose,
                                  Extended=do_extended,
                                  Minos=do_minos)
            randomizer.reset_values()  # Needed to avoid generating unphysical values
        except ValueError:
            raise RuntimeError()
        except Exception:
            # logger.exception()
            raise RuntimeError()  # TODO: provide more information?
        result = {}
        result['fitnum'] = fit_num
        result['seed'] = seed
        # Save the results of the randomized fit
        result_roofit_rand = FitResult.from_roofit(fit_result_rand)
        result['param_names'] = result_roofit_rand.get_fit_parameters().keys()
        result['rand'] = result_roofit_rand.to_plain_dict()
        result['rand_cov'] = result_roofit_rand.get_covariance_matrix()
        _root.destruct_object(fit_result_rand)
        # Save the results of the nominal fit
        result_roofit_nominal = FitResult.from_roofit(fit_result_nominal)
        result['nominal'] = result_roofit_nominal.to_plain_dict()
        result['nominal_cov'] = result_roofit_nominal.get_covariance_matrix()
        result['gen'] = gen_values
        # Explicitly destruct ROOT objects to limit memory growth in the loop
        _root.destruct_object(result_roofit_nominal)
        _root.destruct_object(dataset)
        fit_results[fit_num] = result
        logger.debug("Cleaning up")
    logger.info("Fitting loop over")
    logger.info("--> Memory leakage: %.2f MB/sample-fit",
                (memory_usage() - initial_mem) / ntoys)
    logger.info("--> Spent %.0f ms/sample-fit",
                (default_timer() - initial_time) * 1000.0 / ntoys)
    logger.info("Saving to disk")
    data_res = []
    cov_matrices = {}
    # Get covariance matrices, keyed by their destination path in the HDF file
    for fit_num, fit_res_i in fit_results.items():
        fit_res = {'fitnum': fit_res_i['fitnum'],
                   'seed': fit_res_i['seed'],
                   'model_name': model_name,
                   'fit_strategy': fit_strategy}
        param_names = fit_res_i['param_names']
        cov_folder_rand = os.path.join(str(job_id),
                                       str(fit_res['fitnum']),
                                       'rand')
        cov_matrices[cov_folder_rand] = pd.DataFrame(fit_res_i['rand_cov'],
                                                     index=param_names,
                                                     columns=param_names)
        cov_folder_nominal = os.path.join(str(job_id),
                                          str(fit_res['fitnum']),
                                          'nominal')
        cov_matrices[cov_folder_nominal] = pd.DataFrame(fit_res_i['nominal_cov'],
                                                        index=param_names,
                                                        columns=param_names)
        # Flatten the rand/nominal/gen results into suffixed columns
        for res_name, res_value in fit_res_i['rand'].items():
            fit_res['{}_rand'.format(res_name)] = res_value
        for res_name, res_value in fit_res_i['nominal'].items():
            fit_res['{}_nominal'.format(res_name)] = res_value
        for res_name, res_value in fit_res_i['gen'].items():
            fit_res['{}_gen'.format(res_name)] = res_value
        data_res.append(fit_res)
    data_frame = pd.DataFrame(data_res)
    # Attach the job id to every row
    fit_result_frame = pd.concat([data_frame,
                                  pd.concat([pd.DataFrame({'jobid': [job_id]})] *
                                            data_frame.shape[0]).reset_index(drop=True)],
                                 axis=1)
    try:
        # pylint: disable=E1101
        with _paths.work_on_file(config['name'],
                                 path_func=_paths.get_toy_fit_path,
                                 link_from=config.get('link-from', None)) as toy_fit_file:
            with modify_hdf(toy_fit_file) as hdf_file:
                # First fit results
                hdf_file.append('fit_results', fit_result_frame)
                # Save covariance matrices under 'covariance/jobid/fitnum'
                for cov_folder, cov_matrix in cov_matrices.items():
                    cov_path = os.path.join('covariance', cov_folder)
                    hdf_file.append(cov_path, cov_matrix)
                # Generator info
                hdf_file.append('input_values',
                                pd.DataFrame.from_dict(randomizer.get_input_values(),
                                                       orient='index'))
            logger.info("Written output to %s", toy_fit_file)
            if 'link-from' in config:
                logger.info("Linked to %s", config['link-from'])
    except OSError as excp:
        logger.error(str(excp))
        raise
    except ValueError as error:
        logger.exception("Exception on dataset saving")
        raise RuntimeError(str(error))
def run(config_files, link_from):
    """Run the script.

    Arguments:
        config_files (list[str]): Path to the configuration files.
        link_from (str): Path to link the results from.

    Return:
        int: Number of submitted jobs.

    Raise:
        OSError: If the configuration file does not exist.
        KeyError: If some configuration data are missing.
        ValueError: If no suitable batch backend is found.
        RuntimeError: If something goes wrong during submission.

    """
    try:
        config = _config.load_config(*config_files,
                                     validate=['event-type',
                                               'simulation-version',
                                               'year',
                                               'magnet-polarity',
                                               'prod/nevents',
                                               'prod/nevents-per-job'])
    except OSError:
        # Fixed: the message was previously passed logger-style
        # (OSError("... %s", files)) and never interpolated.
        raise OSError("Cannot load configuration files: {}".format(config_files))
    except _config.ConfigError as error:
        if 'event-type' in error.missing_keys:
            logger.error("No event type was specified in the config file!")
        if 'simulation-version' in error.missing_keys:
            logger.error("No simulation version was specified in the config file!")
        if 'year' in error.missing_keys:
            logger.error("No simulation year was specified in the config file!")
        if 'magnet-polarity' in error.missing_keys:
            logger.error("No magnet polarity was specified in the config file!")
        if 'prod/nevents' in error.missing_keys:
            logger.error("The number of events to produce was not specified in the config file!")
        if 'prod/nevents-per-job' in error.missing_keys:
            logger.error("The number of events per job was not specified in the config file!")
        raise KeyError("ConfigError raised -> {}".format(error.missing_keys))
    except KeyError as error:
        logger.error("YAML parsing error -> %s", error)
        raise
    # Event type: either a numeric event type or a path to a decfile
    evt_type = config['event-type']
    try:
        evt_type = int(evt_type)
    except ValueError:
        # There's non-numerical chars, we assume it's a path
        decfile = evt_type if os.path.isabs(evt_type) else os.path.abspath(evt_type)
        evt_type = os.path.splitext(os.path.split(decfile)[1])[0]
    else:
        decfile = '$DECFILESROOT/options/{}.py'.format(evt_type)
    # Prepare job
    # NOTE(review): link_from is passed here although the comment claims no
    # linking is done for logs (elsewhere link_from=None is used) — confirm.
    _, _, log_file = _paths.prepare_path(name='mc/{}'.format(evt_type),
                                         path_func=_paths.get_log_path,
                                         link_from=link_from)  # No linking is done for logs
    # MC config
    sim_version = config['simulation-version'].lower()
    year = int(config['year'])
    # NOTE(review): lstrip strips a *character set*, not a prefix; it works
    # for 'magup'/'magdown' but would misbehave on other spellings — confirm.
    magnet_polarity = config['magnet-polarity'].lower().lstrip('magnet').lstrip('mag')
    remove_detector = config.get('remove-detector', True)
    # Prepare paths
    do_link, output_path, output_path_link = _paths.prepare_path(
        name='',
        path_func=_paths.get_genlevel_mc_path,
        link_from=link_from,
        evt_type=evt_type,
        sim_version=sim_version,
        year=year,
        magnet_polarity=magnet_polarity,
        remove_detector=remove_detector)
    link_status = 'true' if do_link else 'false'
    # Resolve the Gauss configuration for this simulation setup
    try:
        options = get_gaudirun_options(sim_version, year, magnet_polarity, remove_detector)
        gauss_version = get_gauss_version(sim_version, year)
        dddb_tag, conddb_tag = get_db_tags(sim_version, year, magnet_polarity)
    except KeyError as error:
        logger.error("Unknown Gauss configuration")
        raise KeyError(str(error))
    # Add compression and our decfile
    options.append('$APPCONFIGOPTS/Persistency/Compression-ZLIB-1.py')
    options.append(decfile)
    # Prepare to submit
    nevents = min(config['prod']['nevents-per-job'], config['prod']['nevents'])
    logger.info("Generating %s events of decfile -> %s", nevents, decfile)
    logger.info("Output path: %s", output_path)
    logger.info("Log file location: %s", os.path.dirname(log_file))
    if do_link:
        logger.info("Linking to %s", output_path_link)
    extra_config = {'workdir': '$TMPDIR',
                    'do_link': link_status,
                    'gaudirun_options': ' '.join(options),
                    'gauss_version': gauss_version,
                    'dddb_tag': dddb_tag,
                    'conddb_tag': conddb_tag,
                    'output_extension': 'xgen' if remove_detector else 'sim',
                    'output_path': output_path,
                    'output_path_link': output_path_link,
                    'n_events': nevents}
    # Prepare batch (ValueError propagates if no backend is found)
    batch_config = config.get('batch', {})
    batch_system = get_batch_system(batch_config.get('backend', None))
    # Submit as many jobs as needed to cover the requested statistics
    njobs = int(ceil(1.0 * config['prod']['nevents'] /
                     config['prod']['nevents-per-job']))
    logger.info("About to send %s jobs with %s events each.", njobs, nevents)
    for _ in range(njobs):
        # Submit
        try:
            job_id = batch_system.submit_job('MC_%s' % evt_type,
                                             SCRIPT,
                                             log_file,
                                             extra_config=extra_config,
                                             **batch_config)
            # The backend signals failures through the returned message
            if 'submit error' in job_id:
                logger.error(job_id)
                raise Exception
            logger.debug("Submitted job -> %s", job_id)
        except Exception:
            logger.exception('Error submitting MC production job')
            raise RuntimeError
    return njobs
def run(config_files, link_from):
    """Run the script.

    If the efficiency file exists, only the plots are remade.

    Arguments:
        config_files (list[str]): Path to the configuration files.
        link_from (str): Path to link the results from.

    Raise:
        OSError: If there either the configuration file does not exist some
            of the input files cannot be found.
        KeyError: If some configuration data are missing.
        ValueError: If there is any problem in configuring the efficiency model.
        RuntimeError: If there is a problem during the efficiency fitting.

    """
    try:
        config = _config.load_config(*config_files,
                                     validate=['name',
                                               'data/source',
                                               'data/tree',
                                               'parameters',
                                               'model',
                                               'variables'])
    except OSError:
        raise OSError("Cannot load configuration files: {}".format(config_files))
    except _config.ConfigError as error:
        if 'name' in error.missing_keys:
            logger.error("No name was specified in the config file!")
        # Fixed: the validated key is 'data/source', not 'data/file'
        if 'data/source' in error.missing_keys:
            logger.error("No input data specified in the config file!")
        if 'data/tree' in error.missing_keys:
            logger.error("No input data specified in the config file!")
        if 'model' in error.missing_keys:
            logger.error("No efficiency model specified in the config file!")
        if 'parameters' in error.missing_keys:
            logger.error("No efficiency model parameters specified in the config file!")
        if 'variables' in error.missing_keys:
            logger.error("No efficiency variables to model have been specified in the config file!")
        raise KeyError("ConfigError raised -> {}".format(error.missing_keys))
    except KeyError as error:
        logger.error("YAML parsing error -> %s", error)
        raise
    # Do checks and load things
    plot_files = {}
    if config.get('plot', False):
        for var_name in config['variables']:
            plot_files[var_name] = get_efficiency_plot_path(config['name'],
                                                            var=var_name)
    efficiency_class = get_efficiency_model_class(config['model'])
    if not efficiency_class:
        raise ValueError("Unknown efficiency model -> {}".format(config['model']))
    # Let's do it
    # pylint: disable=E1101
    if not all(os.path.exists(file_name) for file_name in plot_files.values()) or \
            not os.path.exists(_paths.get_efficiency_path(config['name'])):
        # If plots don't exist, we load data
        logger.info("Loading data, this may take a while...")
        weight_var = config['data'].get('weight-var-name', None)
        # Prepare data
        config['data']['output-format'] = 'pandas'
        config['data']['variables'] = list(config['variables'])
        if weight_var:
            config['data']['variables'].append(weight_var)
        input_data = get_data(config['data'], **{'output-format': 'pandas'})
        if weight_var:
            logger.info("Data loaded, using %s as weight", weight_var)
        else:
            logger.info("Data loaded, not using any weights")
        if not os.path.exists(_paths.get_efficiency_path(config['name'])):
            logger.info("Fitting efficiency model")
            try:
                eff = efficiency_class.fit(input_data, config['variables'], weight_var,
                                           **config['parameters'])
            except (ValueError, TypeError) as error:
                # Fixed: exceptions have no `.message` attribute in Python 3
                raise ValueError("Cannot configure the efficiency model -> {}".format(error))
            except KeyError as error:
                raise RuntimeError("Missing key -> {}".format(error))
            except Exception as error:
                raise RuntimeError(error)
            output_file = eff.write_to_disk(config['name'], link_from)
            logger.info("Written efficiency file -> %s", output_file)
        else:
            logger.warning("Output efficiency already exists, only redoing plots")
            eff = load_efficiency_model(config['name'])
        if plot_files:
            import seaborn as sns
            sns.set_style("white")
            plt.style.use('file://{}'.format(
                os.path.join(get_global_var('STYLE_PATH'),
                             'matplotlib_LHCb.mplstyle')))
            plots = eff.plot(input_data, weight_var,
                             labels=config.get('plot-labels', {}))
            for var_name, plot in plots.items():
                logger.info("Plotting '%s' efficiency -> %s",
                            var_name, plot_files[var_name])
                plot.savefig(plot_files[var_name], bbox_inches='tight')
    else:
        logger.info("Efficiency file exists: %s. Nothing to do!",
                    _paths.get_efficiency_path(config['name']))
def run(self, script_to_run):
    """Run the script.

    If the output exists and no extension or overwrite has been configured,
    nothing is done.

    Arguments:
        script_to_run (str): Script to run in the cluster.

    Raise:
        AssertionError: If the qsub command cannot be found.
        AttributeError: If non-matching configuration file was found.
        OSError: If there is a problem preparing the output path.

    """
    flat_config = dict(_config.unfold_config(self.config))
    # Check if it has not been produced yet
    # pylint: disable=E1102
    config_file_dest = self.TOY_CONFIG_PATH_GETTER(self.config['name'])
    # First check the config (we may have already checked)
    if os.path.exists(config_file_dest):
        # It exists, check they match up to the allowed differences
        config_dest = _config.load_config(config_file_dest)
        if _config.compare_configs(flat_config, config_dest).difference(self.allowed_config_diffs):
            logger.error("Non-matching configuration already exists with that name!")
            raise AttributeError("Non-matching configuration")
    # Now check output
    _, expected_src, expected_dest = _paths.prepare_path(name=self.config['name'],
                                                         path_func=self.TOY_PATH_GETTER,
                                                         link_from=self.config['link-from'])
    # Check file existence
    if os.path.exists(expected_src):
        logger.warning("Output data file exists! %s", expected_src)
        if self.overwrite:
            os.remove(expected_src)
            if os.path.exists(expected_dest):
                os.remove(expected_dest)
        else:
            # Create the symlink if necessary
            if not os.path.exists(expected_dest):
                os.symlink(expected_src, expected_dest)
            if not self.extend:
                # Fixed wording of the message ("Nor ... nor ...")
                logger.info("Neither --extend nor --overwrite have been specified. Nothing to do.")
                return
    else:
        # Source doesn't exist, delete the destination if needed
        if os.path.exists(expected_dest):
            os.remove(expected_dest)
    # Some bookkeeping
    if not os.path.exists(script_to_run):
        raise OSError("Cannot find {}!".format(script_to_run))
    script_args = []
    if self.config['link-from']:
        script_args.append('--link-from={}'.format(self.config['link-from']))
    if self.verbose:
        script_args.append('--verbose')
    script_args.append(config_file_dest)
    # Prepare paths
    # pylint: disable=E1101
    _, log_file_fmt, _ = _paths.prepare_path(name=self.config['name'],
                                             path_func=_paths.get_log_path,
                                             link_from=None)  # No linking is done for logs
    # Calculate number of jobs (ceiling division) and submit
    ntoys = flat_config[self.NTOYS_KEY]
    ntoys_per_job = flat_config.get(self.NTOYS_PER_JOB_KEY, ntoys)
    n_jobs = int(1.0 * ntoys / ntoys_per_job)
    if ntoys % ntoys_per_job:
        n_jobs += 1
    # Write the config file once, then submit all jobs
    _config.write_config(self.config, config_file_dest)
    for _ in range(n_jobs):
        job_id = self.batch_system.submit_script(job_name=self.config['name'],
                                                 cmd_script=script_to_run,
                                                 script_args=script_args,
                                                 log_file=log_file_fmt,
                                                 **self.config.get('batch', {}))
        logger.info('Submitted JobID: %s', job_id)
def test_global_replace(config_simple_globals_1, config_simple_globals_2, config_simple_globals_target):
    """Globals from a later file must override those from an earlier one."""
    loaded = load_config(config_simple_globals_1, config_simple_globals_2)
    assert loaded == config_simple_globals_target
def test_fails_loudly(config_simple_fail_noload):
    """Loading an invalid configuration must raise ConfigError."""
    # Fixed: the `as error_info` binding was unused
    with pytest.raises(ConfigError):
        load_config(config_simple_fail_noload)
def test_simple_signal(config_simple_load_signal, config_simple_load_target):
    """A signal-only configuration loads to the expected target."""
    result = load_config(config_simple_load_signal)
    assert result == config_simple_load_target
def test_simple(config_simple_load, config_simple_load_target):
    """A plain configuration loads to the expected target."""
    result = load_config(config_simple_load)
    assert result == config_simple_load_target
def run(config_files, link_from):
    """Run the script.

    Arguments:
        config_files (list[str]): Path to the configuration files.
        link_from (str): Path to link the results from.

    Raise:
        KeyError: If some configuration data are missing.
        OSError: If there either the configuration file does not exist or
            if there is a problem preparing the output path.
        ValueError: If there is any problem in configuring the PDF factories.
        RuntimeError: If there is a problem during the generation.

    """
    # Configure: load and merge the config files, requiring the keys below.
    try:
        config = load_config(*config_files,
                             validate=['gen/nevents', 'name', 'gen-model'])
    except OSError:
        raise OSError("Cannot load configuration files: {}".format(config_files))
    except ConfigError as error:
        # Log one message per missing mandatory key before re-raising as KeyError.
        if 'gen/nevents' in error.missing_keys:
            logger.error("Number of events not specified")
        if 'name' in error.missing_keys:
            logger.error("No name was specified in the config file!")
        if 'gen-model' in error.missing_keys:
            logger.error("No generation model were specified in the config file!")
        raise KeyError("ConfigError raised -> {}".format(error.missing_keys))
    except KeyError as error:
        logger.error("YAML parsing error -> %s", error)
        raise
    # Ignore renaming
    logger.info("Generating %s events", config['gen']['nevents'])
    logger.info("Generation job name: %s", config['name'])
    # The command-line --link-from takes precedence over the config file entry.
    if link_from:
        config['link-from'] = link_from
    if 'link-from' in config:
        logger.info("Linking toy data from %s", config['link-from'])
    else:
        logger.debug("No linking specified")
    # Set seed: drawn from the OS entropy pool so parallel jobs differ,
    # and applied to both numpy and ROOT generators.
    job_id = get_job_id()
    seed = get_urandom_int(4)
    np.random.seed(seed=seed)
    ROOT.RooRandom.randomGenerator().SetSeed(seed)
    # Generate
    try:
        physics = configure_model(config['gen-model'])
    except KeyError as error:
        logger.error("Cannot find physics factory")
        raise ValueError('{}'.format(error))
    except ValueError:
        logger.error("Problem dealing with shared parameters")
        raise
    if isinstance(physics, (SumPhysicsFactory, SimultaneousPhysicsFactory)):
        logger.warning("Generating a RooAddPdf or a RooSimultaneous: "
                       "yields will be generated at a fixed value")
    try:
        # 'nevents-per-job' (if present) caps the events produced by this job.
        dataset = generate(physics,
                           config['gen'].get('nevents-per-job',
                                             config['gen']['nevents']))
    except ValueError as error:
        logger.exception("Exception on generation")
        raise RuntimeError(str(error))
    # Get toy information: the generation parameter values, plus bookkeeping.
    toy_info = {var.GetName(): [var.getVal()]
                for var in physics.get_gen_parameters()}
    # 'nevents' may be a per-category dict; total them in that case.
    n_evts = sum(config['gen']['nevents'].values()) \
        if isinstance(config['gen']['nevents'], dict) \
        else config['gen']['nevents']
    toy_info.update({'seed': [seed],
                     'jobid': [job_id],
                     'nevents': n_evts})
    try:
        # Save: append the dataset (tagged with this job id) and its metadata
        # to the HDF toy file.
        with work_on_file(config['name'],
                          path_func=get_toy_path,
                          link_from=config.get('link-from')) as toy_file:
            with modify_hdf(toy_file) as hdf_file:
                hdf_file.append('data', dataset.assign(jobid=job_id))
                hdf_file.append('toy_info', pd.DataFrame(toy_info))
            # Say something
            logger.info("Written output to %s", toy_file)
            if 'link-from' in config:
                logger.info("Linked to %s", config['link-from'])
    except OSError as excp:
        logger.error(str(excp))
        raise
    except ValueError as error:
        logger.exception("Exception on dataset saving")
        raise RuntimeError(str(error))
def run(config_files, link_from, verbose):
    """Run the script.

    Run a sample/fit sequence as many times as requested.

    Arguments:
        config_files (list[str]): Path to the configuration files.
        link_from (str): Path to link the results from.
        verbose (bool): Give verbose output?

    Raise:
        OSError: If there either the configuration file does not exist some
            of the input toys cannot be found.
        AttributeError: If the input data are incompatible with a previous fit.
        KeyError: If some configuration data are missing.
        ValueError: If there is any problem in configuring the PDF factories.
        RuntimeError: If there is a problem during the fitting.

    """
    try:
        config = _config.load_config(*config_files,
                                     validate=['fit/nfits', 'name', 'data'])
    except OSError:
        raise OSError("Cannot load configuration files: {}".format(config_files))
    except ConfigError as error:
        if 'fit/nfits' in error.missing_keys:
            logger.error("Number of fits not specified")
        if 'name' in error.missing_keys:
            logger.error("No name was specified in the config file!")
        if 'data' in error.missing_keys:
            logger.error("No input data specified in the config file!")
        raise KeyError("ConfigError raised -> {}".format(error.missing_keys))
    except KeyError as error:
        logger.error("YAML parsing error -> %s", error)
        # FIX: re-raise (as the generation script does); previously execution
        # fell through with `config` undefined, crashing with a NameError.
        raise
    try:
        models = {model_name: config[model_name]
                  for model_name in config['fit'].get('models', ['model'])}
    except KeyError as error:
        logger.error("Missing model configuration -> %s", str(error))
        raise KeyError("Missing model configuration")
    if not models:
        logger.error("Empty list specified in the config file under 'fit/models'!")
        raise KeyError("Empty 'fit/models' list")
    fit_strategies = config['fit'].get('strategies', ['simple'])
    if not fit_strategies:
        logger.error("Empty fit strategies were specified in the config file!")
        raise KeyError("Empty 'fit/strategies' list")
    # Some info
    nfits = config['fit'].get('nfits-per-job', config['fit']['nfits'])
    logger.info("Doing %s sample/fit sequences", nfits)
    logger.info("Fit job name: %s", config['name'])
    # Command-line --link-from takes precedence over the config entry.
    if link_from:
        config['link-from'] = link_from
    if 'link-from' in config:
        logger.info("Linking toy data from %s", config['link-from'])
    else:
        logger.debug("No linking specified")
    # Analyze data requirements
    logger.info("Loading input data")
    data = {}
    gen_values = {}
    # FIX: iterate the source *dicts*, not the dict keys; the original tested
    # the substring 'category' against the data-id strings.
    if len(set('category' in data_source
               for data_source in config['data'].values())) > 1:
        raise KeyError("Categories in 'data' not consistently specified.")
    for data_id, data_source in config['data'].items():
        try:
            source_toy = data_source['source']
        except KeyError:
            logger.error("Data source not specified")
            raise
        data[data_id] = (get_data({'source': source_toy,
                                   'source-type': 'toy',
                                   'tree': 'data',
                                   'output-format': 'pandas',
                                   'selection': data_source.get('selection')}),
                         data_source['nevents'],
                         data_source.get('poisson'),
                         data_source.get('category'))
        # Generator values: keep every toy_info column except the bookkeeping ones.
        toy_info = get_data({'source': source_toy,
                             'source-type': 'toy',
                             'tree': 'toy_info',
                             'output-format': 'pandas'})
        gen_values[data_id] = {}
        for var_name in toy_info.columns:
            if var_name in ('seed', 'jobid', 'nevents'):
                continue
            gen_values[data_id][var_name] = toy_info[var_name].iloc[0]
    try:
        fit_models = {}
        for model_name in models:
            if model_name not in config:
                raise KeyError("Missing model definition -> {}".format(model_name))
            fit_models[model_name] = configure_model(config[model_name])
            if any(yield_.isConstant()
                   for yield_ in fit_models[model_name].get_yield_vars()
                   if yield_):
                logger.warning("Model %s has constant yields. "
                               "Be careful when configuring the input data, "
                               "you may need to disable poisson sampling",
                               model_name)
    except KeyError:
        logger.exception("Error loading model")
        raise ValueError("Error loading model")
    # Extended and non-extended models cannot be mixed in one job.
    if len(set(model.is_extended() for model in fit_models.values())) == 2:
        logger.error("Mix of extended and non-extended models!")
        raise ValueError("Error loading fit models")
    # Let's check these generator values against the output file
    try:
        gen_values_frame = {}
        # pylint: disable=E1101
        with _paths.work_on_file(config['name'],
                                 _paths.get_toy_fit_path,
                                 config.get('link-from')) as toy_fit_file:
            with modify_hdf(toy_fit_file) as hdf_file:
                logger.debug("Checking generator values")
                test_gen = [('gen_{}'.format(source_id)) in hdf_file
                            for source_id in gen_values]
                if all(test_gen):
                    # The data were written already, crosscheck values
                    for source_id, gen_value in gen_values.items():
                        # FIX: look up the entry for *this* source; the original
                        # formatted the stale `data_source` dict into the key.
                        if not all(hdf_file['gen_{}'.format(source_id)][var_name].iloc[0] == var_value
                                   for var_name, var_value in gen_value.items()):
                            raise AttributeError(
                                "Generated and stored values don't match for source '{}'"
                                .format(source_id))
                elif not any(test_gen):
                    # No data were there, just overwrite
                    # FIX: loop variable renamed; it previously rebound (shadowed)
                    # the `gen_values` dict being iterated.
                    for source_id, source_gen_values in gen_values.items():
                        gen_data = {'id': source_id,
                                    'source': _paths.get_toy_path(
                                        config['data'][source_id]['source']),
                                    'nevents': config['data'][source_id]['nevents']}
                        gen_data.update(source_gen_values)
                        gen_values_frame['gen_{}'.format(source_id)] = pd.DataFrame([gen_data])
                else:
                    raise AttributeError("Inconsistent number of data sources")
    except OSError as excp:
        logger.error(str(excp))
        raise
    # Now load the acceptance
    try:
        acceptance = get_acceptance(config['acceptance']) \
            if 'acceptance' in config \
            else None
    except ConfigError as error:
        raise KeyError("Error loading acceptance -> {}".format(error))
    # Prepare output
    gen_events = defaultdict(list)
    # Set seed: derive it from the batch job id when running on the cluster,
    # otherwise draw a random one for local runs.
    job_id = get_job_id()
    if job_id:
        seed = int(job_id.split('.')[0])
    else:
        import random
        job_id = 'local'
        seed = random.randint(0, 100000)
    np.random.seed(seed=seed)
    ROOT.RooRandom.randomGenerator().SetSeed(seed)
    # Start looping
    fit_results = defaultdict(list)
    logger.info("Starting sampling-fit loop (print frequency is 20)")
    initial_mem = memory_usage()
    initial_time = default_timer()
    for fit_num in range(nfits):
        # Logging
        if (fit_num + 1) % 20 == 0:
            logger.info(" Fitting event %s/%s", fit_num + 1, nfits)
        # Get a compound dataset: reseed per iteration so every sample differs.
        seed = get_urandom_int(4)
        np.random.seed(seed=seed)
        ROOT.RooRandom.randomGenerator().SetSeed(seed)
        try:
            logger.debug("Sampling input data")
            datasets, sample_sizes = get_datasets(data, acceptance, fit_models)
            for sample_name, sample_size in sample_sizes.items():
                gen_events['N^{{{}}}_{{gen}}'.format(sample_name)].append(sample_size)
            logger.debug("Sampling finalized")
        except KeyError:
            logger.exception("Bad data configuration")
            raise
        logger.debug("Fitting")
        for model_name in models:
            dataset = datasets.pop(model_name)
            fit_model = fit_models[model_name]
            # Now fit
            for fit_strategy in fit_strategies:
                toy_key = (model_name, fit_strategy)
                try:
                    fit_result = fit(fit_model,
                                     model_name,
                                     fit_strategy,
                                     dataset,
                                     verbose,
                                     Extended=config['fit'].get('extended', False),
                                     Minos=config['fit'].get('minos', False))
                except ValueError:
                    raise RuntimeError()
                # Now results are in fit_parameters
                result_roofit = FitResult.from_roofit(fit_result)
                result = result_roofit.to_plain_dict()
                result['cov_matrix'] = result_roofit.get_covariance_matrix()
                # FIX: materialize the keys view so it survives dict mutation
                # and is a plain list under Python 3.
                result['param_names'] = list(result_roofit.get_fit_parameters().keys())
                result['fitnum'] = fit_num
                result['seed'] = seed
                fit_results[toy_key].append(result)
                # Free ROOT-owned objects explicitly to limit memory growth.
                _root.destruct_object(fit_result)
            _root.destruct_object(dataset)
        logger.debug("Cleaning up")
    logger.info("Fitting loop over")
    logger.info("--> Memory leakage: %.2f MB/sample-fit",
                (memory_usage() - initial_mem) / nfits)
    logger.info("--> Spent %.0f ms/sample-fit",
                (default_timer() - initial_time) * 1000.0 / nfits)
    logger.info("Saving to disk")
    data_res = []
    cov_matrices = {}
    # Get gen values for this model
    for (model_name, fit_strategy), fits in fit_results.items():
        for fit_res in fits:
            fit_res = fit_res.copy()
            fit_res['model_name'] = model_name
            fit_res['fit_strategy'] = fit_strategy
            cov_folder = os.path.join(str(job_id), str(fit_res['fitnum']))
            param_names = fit_res.pop('param_names')
            cov_matrices[cov_folder] = pd.DataFrame(fit_res.pop('cov_matrix'),
                                                    index=param_names,
                                                    columns=param_names)
            data_res.append(fit_res)
    data_frame = pd.DataFrame(data_res)
    # One 'jobid' row per fit result, aligned with data_frame by position.
    fit_result_frame = pd.concat([pd.DataFrame(gen_events),
                                  data_frame,
                                  pd.concat([pd.DataFrame({'jobid': [job_id]})] *
                                            data_frame.shape[0]).reset_index(drop=True)],
                                 axis=1)
    try:
        # pylint: disable=E1101
        with _paths.work_on_file(config['name'],
                                 path_func=_paths.get_toy_fit_path,
                                 link_from=config.get('link-from')) as toy_fit_file:
            with modify_hdf(toy_fit_file) as hdf_file:
                # First fit results
                hdf_file.append('fit_results', fit_result_frame)
                # Save covariance matrix under 'covariance/jobid/fitnum'
                for cov_folder, cov_matrix in cov_matrices.items():
                    cov_path = os.path.join('covariance', cov_folder)
                    hdf_file.append(cov_path, cov_matrix)
                # Generator info
                for key_name, gen_frame in gen_values_frame.items():
                    hdf_file.append(key_name, gen_frame)
            logger.info("Written output to %s", toy_fit_file)
            if 'link-from' in config:
                logger.info("Linked to %s", config['link-from'])
    except OSError as excp:
        logger.error(str(excp))
        raise
    except ValueError as error:
        logger.exception("Exception on dataset saving")
        raise RuntimeError(str(error))