def __init__(self, filename, config_path, config_raw, config, message_store):
    self.logger = get_logger()
    self.task_index = {t: i for i, t in enumerate(self.task_order)}
    self.message_store = message_store
    self.filename = filename
    self.filename_path = config_path
    self.file_raw = config_raw
    self.run_config = config
    self.global_config = get_config()

    self.prefix = self.global_config["QUEUE"]["prefix"] + "_" + filename
    self.max_jobs = int(self.global_config["QUEUE"]["max_jobs"])
    self.max_jobs_gpu = int(self.global_config["QUEUE"]["max_gpu_jobs"])
    self.max_jobs_in_queue = int(self.global_config["QUEUE"]["max_jobs_in_queue"])
    self.max_jobs_in_queue_gpu = int(self.global_config["QUEUE"]["max_gpu_jobs_in_queue"])
    self.logger.debug(self.global_config.keys())

    self.sbatch_cpu_path = get_data_loc(self.global_config["SBATCH"]["cpu_location"])
    with open(self.sbatch_cpu_path, 'r') as f:
        self.sbatch_cpu_header = f.read()
    self.sbatch_gpu_path = get_data_loc(self.global_config["SBATCH"]["gpu_location"])
    with open(self.sbatch_gpu_path, 'r') as f:
        self.sbatch_gpu_header = f.read()
    self.sbatch_cpu_header = self.clean_header(self.sbatch_cpu_header)
    self.sbatch_gpu_header = self.clean_header(self.sbatch_gpu_header)
    self.setup_task_location = self.global_config["SETUP"]["location"]
    self.load_task_setup()

    self.output_dir = os.path.join(get_output_dir(), self.filename)
    self.tasks = None
    self.num_jobs_queue = 0
    self.num_jobs_queue_gpu = 0

    self.start = None
    self.finish = None
    self.force_refresh = False
    self.force_ignore_stage = None
    self.running = []
    self.done = []
    self.failed = []
    self.blocked = []
def get_sys_file_in(self):
    set_file = self.options.get("SYS_SCALE")
    if set_file is not None:
        self.logger.debug(f"Explicit SYS_SCALE file specified: {set_file}")
        path = get_data_loc(set_file)
        if path is None:
            raise ValueError(f"Unable to resolve path to {set_file}")
    else:
        self.logger.debug("Searching for SYS_SCALE source from biascor task")
        fitopt_files = [f for f in self.biascor_dep.output["fitopt_files"] if f is not None]
        assert len(set(fitopt_files)) < 2, f"Cannot automatically determine scaling from FITOPT file as you have multiple files: {fitopt_files}"
        if fitopt_files:
            path = fitopt_files[0]
        else:
            path = None
    self.options["SYS_SCALE"] = path  # Save to options so it's serialised out
    self.logger.info(f"Setting systematics scaling file to {path}")
    return path
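# Illustrative resolution order for get_sys_file_in above (the config values
# here are hypothetical, not from any real run): an explicit SYS_SCALE option
# always wins, otherwise the FITOPT file inherited from the biascor task is
# used, and it must be unique.
#
#   options = {"SYS_SCALE": "surveys/des/bbc/scale_5yr.list"}
#   # -> path = get_data_loc("surveys/des/bbc/scale_5yr.list")
#
#   options = {}
#   # -> falls back to biascor_dep.output["fitopt_files"], which may contain
#   #    at most one distinct non-None entry, else the assert fires.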
def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None):
    super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name)
    self.global_config = get_config()
    self.options = options
    self.gpu = self.options.get("GPU", True)
    self.conda_env = self.global_config["SCONE"]["conda_env_cpu"] if not self.gpu else self.global_config["SCONE"]["conda_env_gpu"]
    self.path_to_classifier = self.global_config["SCONE"]["location"]
    self.job_base_name = os.path.basename(Path(output_dir).parents[1]) + "__" + os.path.basename(output_dir)
    self.batch_file = self.options.get("BATCH_FILE")
    if self.batch_file is not None:
        self.batch_file = get_data_loc(self.batch_file)
    self.batch_replace = self.options.get("BATCH_REPLACE", {})
    self.config_path = os.path.join(self.output_dir, "model_config.yml")
    self.heatmaps_path = os.path.join(self.output_dir, "heatmaps")
    self.csvs_path = os.path.join(self.output_dir, "sim_csvs")
    self.slurm = """{sbatch_header}
{task_setup}"""
    self.logfile = os.path.join(self.output_dir, "output.log")
    remake_heatmaps = self.options.get("REMAKE_HEATMAPS", False)
    self.keep_heatmaps = not remake_heatmaps
def validate_fitopts(self, config):
    # Loading fitopts
    fitopts = config.get("FITOPTS", [])
    if isinstance(fitopts, str):
        fitopts = [fitopts]

    self.logger.debug("Loading fitopts")
    has_file = False
    self.output["fitopt_file"] = None
    self.raw_fitopts = []
    for f in fitopts:
        self.logger.debug(f"Parsing fitopt {f}")
        potential_path = get_data_loc(f)
        if potential_path is not None and os.path.exists(potential_path):
            if has_file:
                raise ValueError("It seems that you're trying to load in two files for the FITOPTS! Please specify only one file path!")
            self.logger.debug(f"Loading in fitopts from {potential_path}")
            y = read_yaml(potential_path)
            assert isinstance(y, dict), "New FITOPT format for external files is a yaml dictionary. See global.yml for an example."
            has_file = True
            self.raw_fitopts.append(y)
            self.logger.debug(f"Loaded a fitopt dictionary file from {potential_path}")
            self.output["fitopt_file"] = potential_path
        else:
            assert f.strip().startswith("/"), f"Manual fitopt {f} for lcfit {self.name} should specify a label wrapped with /. If this is meant to be a file, it doesn't exist."
            self.logger.debug(f"Adding manual fitopt {f}")
            self.raw_fitopts.append(f)
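# FITOPTS accepts two forms, sketched here with hypothetical values: a path
# to a yaml dictionary file (at most one, which also sets fitopt_file in the
# output), or manual entries whose label is wrapped in slashes.
#
#   FITOPTS:
#     - surveys/global/lcfit_fitopts/global.yml  # yaml dict resolved on disk
#     - "/CAL_SHIFT/ SOME_FITOPT_ARGS"           # manual, must start with /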
def __init__(self, name, output_dir, options, global_config, dependencies=None, index=0):
    base_file = get_data_loc("create_cov/input_file.txt")
    super().__init__(name, output_dir, base_file, default_assignment=": ", dependencies=dependencies)

    self.options = options
    self.global_config = get_config()
    self.index = index
    self.job_name = os.path.basename(Path(output_dir).parents[1]) + "_CREATE_COV_" + name
    self.path_to_code = os.path.abspath(os.path.dirname(inspect.stack()[0][1]) + "/external")

    self.logfile = os.path.join(self.output_dir, "output.log")
    self.sys_file_in = get_data_loc(options.get("SYS_SCALE", "surveys/des/bbc/scale_5yr.list"))
    self.sys_file_out = os.path.join(self.output_dir, "sys_scale.LIST")
    self.chain_dir = os.path.join(self.output_dir, "chains/")
    self.config_dir = os.path.join(self.output_dir, "output")

    self.biascor_dep = self.get_dep(BiasCor, fail=True)
    self.output["blind"] = self.biascor_dep.output["blind"]
    self.input_file = os.path.join(self.output_dir, self.biascor_dep.output["subdirs"][index] + ".input")
    self.output["hubble_plot"] = self.biascor_dep.output["hubble_plot"]
    self.output["ini_dir"] = self.config_dir
    covopts_map = {"ALL": 0}
    for i, covopt in enumerate(self.options.get("COVOPTS", [])):
        covopts_map[covopt.split("]")[0][1:]] = i + 1
    self.output["covopts"] = covopts_map
    self.output["index"] = index
    self.output["bcor_name"] = self.biascor_dep.name
    self.slurm = """#!/bin/bash
def load_task_setup(self):
    tasks = ['cosmomc', 'snirf', 'analyse', 'supernnova', 'nearest_neighbour', 'create_cov', 'supernnova_yml', 'scone', 'dataprep']
    self.task_setup = {}
    for task in tasks:
        with open(get_data_loc(f"{self.setup_task_location}/{task}"), 'r') as f:
            self.task_setup[task] = f.read()
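# A minimal sketch of how these setup templates are consumed downstream
# (grounded in the CosmoMC _run method later in this file): each task pulls
# its entry out of task_setup and substitutes a per-task dictionary into it
# before appending it to the sbatch header.
#
#   setup_dict = {"output_dir": "/path/to/out"}  # keys are task-specific
#   task_body = self.update_setup(setup_dict, self.task_setup['cosmomc'])
#   final_slurm = self.slurm.format(sbatch_header=header, task_setup=task_body)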
def add_plot_script_to_run(self, script_name):
    script_path = get_data_loc(script_name, extra=self.plot_code_dir)
    if script_path is None:
        self.fail_config(f"Cannot resolve script {script_name} relative to {self.plot_code_dir}. Please use a variable or abs path.")
    else:
        self.logger.debug(f"Adding script path {script_path} to plotting code.")
    self.path_to_codes.append(script_path)
    self.done_files.append(os.path.join(self.output_dir, os.path.basename(script_name).split(".")[0] + ".done"))
def validate_model(self):
    if self.mode == Classifier.PREDICT:
        model = self.options.get("MODEL")
        if model is None:
            Task.fail_config(f"Classifier {self.name} is in predict mode but does not have a model specified")
        model_classifier = self.get_model_classifier()
        if model_classifier is not None and model_classifier.name == model:
            return True
        path = get_data_loc(model)
        if not os.path.exists(path):
            Task.fail_config(f"Classifier {self.name} does not have a classifier dependency and model is not a serialised file path")
    return True
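# Example predict-mode configurations for validate_model above (values are
# hypothetical): MODEL may name another classifier task in the same pipeline,
# or point at a serialised model file resolvable by get_data_loc.
#
#   OPTS:
#     MODEL: SNN_TRAIN          # matched against get_model_classifier().name
#
#   OPTS:
#     MODEL: model_files/model.pt  # otherwise must exist on disk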
def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None):
    super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name)
    self.global_config = get_config()
    self.num_jobs = 1

    self.conda_env = self.global_config["SNIRF"]["conda_env"]
    self.path_to_classifier = os.path.dirname(inspect.stack()[0][1])
    self.job_base_name = os.path.basename(Path(output_dir).parents[1]) + "__" + os.path.basename(output_dir)
    self.features = options.get("FEATURES", "zHD x1 c cERR x1ERR COV_x1_c COV_x1_x0 COV_c_x0 PKMJDERR")
    # self.model_pk_file = self.get_unique_name() + ".pkl"
    self.model_pk_file = "model.pkl"
    self.output_pk_file = os.path.join(self.output_dir, self.model_pk_file)
    self.predictions_filename = os.path.join(self.output_dir, "predictions.csv")

    self.fitopt = options.get("FITOPT", "DEFAULT")

    self.batch_file = self.options.get("BATCH_FILE")
    if self.batch_file is not None:
        self.batch_file = get_data_loc(self.batch_file)
    self.batch_replace = self.options.get("BATCH_REPLACE", {})

    self.output["predictions_filename"] = self.predictions_filename
    self.output["model_filename"] = self.output_pk_file
    self.validate_model()

    self.slurm = """{sbatch_header}
def calculate_input(self):
    self.logger.debug(f"Calculating input")
    self.set_property("COSMOMC_TEMPLATES", get_data_loc("cosmomc_templates"))
    self.set_property("BASEOUTPUT", self.name)
    self.set_property("SYSFILE", self.sys_file_out)
    self.set_property("TOPDIR", self.biascor_dep.output["fit_output_dir"])
    self.set_property("OUTPUTDIR", self.config_dir)
    self.set_property("SUBDIR", self.biascor_dep.output["subdirs"][self.index])
    self.set_property("ROOTDIR", self.chain_dir)
    self.set_property("SYSDEFAULT", self.options.get("SYSDEFAULT", 0))

    # More bs hacks
    covopt_str = ""
    for i, covopt in enumerate(self.options.get("COVOPTS", [])):
        if i > 0:
            covopt_str += "COVOPT: "
        covopt_str += covopt + "\n"
    self.set_property("COVOPT", covopt_str)

    # Load in sys file, add muopt arguments if needed
    # Get the MUOPT_SCALES and FITOPT scales keywords
    self.logger.debug(f"Loading sys scaling from {self.sys_file_in}")
    with open(self.sys_file_in) as f:
        sys_scale = f.read().splitlines()

    # Overwrite the fitopt scales
    fitopt_scale_overwrites = self.options.get("FITOPT_SCALES", {})
    for label, overwrite in fitopt_scale_overwrites.items():
        for i, line in enumerate(sys_scale):
            comps = line.split()
            if label in comps[1]:
                sys_scale[i] = " ".join(comps[:-1] + [f"{overwrite}"])
                self.logger.debug(f"FITOPT_SCALES: Setting {' '.join(comps)} to {sys_scale[i]}")

    # Set the muopt scales
    muopt_scales = self.options.get("MUOPT_SCALES", {})
    muopts = self.biascor_dep.output["muopts"]
    for muopt in muopts:
        scale = muopt_scales.get(muopt, 1.0)
        sys_scale.append(f"ERRSCALE: DEFAULT {muopt} {scale}")
    return sys_scale
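# Worked example of the FITOPT_SCALES overwrite above, using a made-up
# sys-scale line of the generic form "<key> <label> ... <scale>" (the exact
# file format comes from the survey's scale list, e.g. scale_5yr.list). Note
# the substring match on comps[1], so a label of "CAL" would also match a
# line labelled "CALSPEC".
#
#   line before: "ERRSCALE: CAL_SHIFT 1.0"
#   option:      FITOPT_SCALES: {CAL: 1.3}
#   line after:  "ERRSCALE: CAL_SHIFT 1.3"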
def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None):
    super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name)
    self.global_config = get_config()
    self.num_jobs = 4

    self.conda_env = self.global_config["SNIRF"]["conda_env"]
    self.path_to_classifier = get_output_loc(self.global_config["SNIRF"]["location"])
    self.job_base_name = os.path.basename(Path(output_dir).parents[1]) + "__" + os.path.basename(output_dir)
    self.features = options.get("FEATURES", "x1 c zHD x1ERR cERR PKMJDERR")
    self.validate_model()

    self.model_pk_file = "model.pkl"
    self.output_pk_file = os.path.join(self.output_dir, self.model_pk_file)

    self.fitopt = options.get("FITOPT", "DEFAULT")
    self.fitres_filename = None
    self.fitres_file = None

    self.batch_file = self.options.get("BATCH_FILE")
    if self.batch_file is not None:
        self.batch_file = get_data_loc(self.batch_file)
    self.batch_replace = self.options.get("BATCH_REPLACE", {})

    self.slurm = """{sbatch_header}
def __init__(self, name, output_dir, create_cov_tasks, config, options, global_config):
    # First check if all required options exist
    # In this case, WFITOPTS must exist with at least 1 entry
    self.wfitopts = options.get("WFITOPTS")
    if self.wfitopts is None:
        Task.fail_config(f"You have not specified any WFITOPTS for task {name}")
    Task.logger.debug(f"WFITOPTS for task {name}: {self.wfitopts}")
    if len(self.wfitopts) == 0:
        Task.fail_config(f"WFITOPTS for task {name} does not have any options!")

    base_file = get_data_loc("wfit/input_file.INPUT")
    super().__init__(name, output_dir, config, base_file, default_assignment=": ", dependencies=create_cov_tasks)

    self.num_jobs = len(self.wfitopts)
    self.create_cov_tasks = create_cov_tasks
    self.logger.debug(f"CreateCov tasks: {self.create_cov_tasks}")
    self.create_cov_dirs = [os.path.join(t.output_dir, "output") for t in self.create_cov_tasks]
    self.logger.debug(f"CreateCov directories: {self.create_cov_dirs}")
    self.options = options
    self.global_config = global_config

    self.done_file = os.path.join(self.output_dir, "output", "ALL.DONE")
    self.job_name = os.path.basename(Path(output_dir).parents[1]) + "_WFIT_" + name
    self.logfile = os.path.join(self.output_dir, "output.log")
    self.input_name = f"{self.job_name}.INPUT"
    self.input_file = os.path.join(self.output_dir, self.input_name)
def calculate_input(self):
    self.logger.debug(f"Calculating input")
    if self.prepare_cosmomc:
        self.yaml["COSMOMC_TEMPLATES_PATH"] = get_data_loc(self.templates_dir)
    else:
        self.yaml.pop("COSMOMC_TEMPLATES_PATH", None)
    self.yaml["NAME"] = self.name
    self.yaml["SYS_SCALE_FILE"] = self.sys_file_out
    self.yaml["INPUT_DIR"] = self.biascor_dep.output["fit_output_dir"]
    self.yaml["OUTDIR"] = self.config_dir
    self.yaml["VERSION"] = self.biascor_dep.output["subdirs"][self.index]
    self.yaml["MUOPT_SCALES"] = self.biascor_dep.output["muopt_scales"]
    self.yaml["COVOPTS"] = self.options.get("COVOPTS", [])
    self.yaml["EXTRA_COVS"] = self.options.get("EXTRA_COVS", [])
    self.yaml["CALIBRATORS"] = self.calibration_set

    # Load in sys file, add muopt arguments if needed
    # Get the MUOPT_SCALES and FITOPT scales keywords
    sys_scale = {**self.get_scales_from_fitopt_file(), **self.options.get("FITOPT_SCALES", {})}
    return sys_scale
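# The dict merge above gives user config precedence: scales read from the
# fitopt file act as defaults, and any FITOPT_SCALES entries override them
# because later keys win in a {**a, **b} merge. Sketch with hypothetical
# labels:
#
#   get_scales_from_fitopt_file()  -> {"CAL": 1.0, "ZSHIFT": 1.0}
#   options["FITOPT_SCALES"]       -> {"CAL": 1.3}
#   sys_scale                      == {"CAL": 1.3, "ZSHIFT": 1.0}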
def _run(self):
    if self.static:
        self.logger.info("CMB only constraints detected, copying static files")

        cosmomc_static_loc = get_data_loc(self.static_path + self.ini_prefix)
        if cosmomc_static_loc is None:
            self.logger.error("Seems like we can't find the static chains...")
            return False
        else:
            new_hash = self.get_hash_from_string(cosmomc_static_loc)
            if self._check_regenerate(new_hash):
                self.logger.debug("Regenerating and copying static chains")
                shutil.rmtree(self.chain_dir, ignore_errors=True)
                shutil.copytree(cosmomc_static_loc, self.chain_dir)
                for done_file in self.done_files:
                    df = os.path.join(self.output_dir, done_file)
                    with open(df, "w") as f:
                        f.write("SUCCESS")
                self.save_new_hash(new_hash)
            else:
                self.should_be_done()
                self.logger.info("Hash check passed, not rerunning")
    else:
        ini_filecontents = self.get_ini_file()
        if ini_filecontents is None:
            return False

        if self.batch_file is None:
            if self.gpu:
                self.sbatch_header = self.sbatch_gpu_header
            else:
                self.sbatch_header = self.sbatch_cpu_header
        else:
            with open(self.batch_file, 'r') as f:
                self.sbatch_header = f.read()
            self.sbatch_header = self.clean_header(self.sbatch_header)

        header_dict = {
            "REPLACE_NAME": self.job_name,
            "REPLACE_WALLTIME": "34:00:00",
            "REPLACE_LOGFILE": self.logfile,
            "REPLACE_MEM": "2GB",
            "APPEND": [
                f"#SBATCH --ntasks={self.ntasks}",
                f"#SBATCH --array=1-{len(self.ini_files)}",
                "#SBATCH --cpus-per-task=1"
            ]
        }
        header_dict = merge_dict(header_dict, self.batch_replace)
        self.update_header(header_dict)

        setup_dict = {
            "done_files": " ".join(self.done_files),
            "path_to_cosmomc": self.path_to_cosmomc,
            "output_dir": self.output_dir,
            "ini_files": " ".join(self.ini_files),
            "num_jobs": len(self.ini_files),
            "num_walkers": self.num_walkers,
        }

        format_dict = {
            "sbatch_header": self.sbatch_header,
            "task_setup": self.update_setup(setup_dict, self.task_setup['cosmomc'])
        }
        final_slurm = self.slurm.format(**format_dict)

        new_hash = self.get_hash_from_string(final_slurm + " ".join(ini_filecontents))
        if self._check_regenerate(new_hash):
            self.logger.debug("Regenerating and launching task")
            shutil.rmtree(self.output_dir, ignore_errors=True)
            mkdirs(self.output_dir)
            self.save_new_hash(new_hash)
            slurm_output_file = os.path.join(self.output_dir, "slurm.job")
            with open(slurm_output_file, "w") as f:
                f.write(final_slurm)
            for file, content in zip(self.ini_files, ini_filecontents):
                filepath = os.path.join(self.output_dir, file)
                with open(filepath, "w") as f:
                    f.write(content)
            mkdirs(self.chain_dir)

            needed_dirs = ["data", "paramnames", "camb", "batch1", "batch2", "batch3"]
            for d in needed_dirs:
                self.logger.debug(f"Creating symlink to {d} dir")
                original_data_dir = os.path.join(self.path_to_cosmomc, d)
                new_data_dir = os.path.join(self.output_dir, d)
                os.symlink(original_data_dir, new_data_dir, target_is_directory=True)

            self.logger.info(f"Submitting batch job for data prep")
            subprocess.run(["sbatch", slurm_output_file], cwd=self.output_dir)
        else:
            self.should_be_done()
            self.logger.info("Hash check passed, not rerunning")
    return True
def _run(self, force_refresh):
    if self.static:
        self.logger.info("CMB only constraints detected, copying static files")

        cosmomc_static_loc = get_data_loc(self.static_path + self.ini_prefix)
        if cosmomc_static_loc is None:
            self.logger.error("Seems like we can't find the static chains...")
            return False
        else:
            new_hash = self.get_hash_from_string(cosmomc_static_loc)
            old_hash = self.get_old_hash()

            if force_refresh or new_hash != old_hash:
                self.logger.debug("Regenerating and copying static chains")
                shutil.rmtree(self.chain_dir, ignore_errors=True)
                shutil.copytree(cosmomc_static_loc, self.chain_dir)
                for done_file in self.done_files:
                    df = os.path.join(self.output_dir, done_file)
                    with open(df, "w") as f:
                        f.write("SUCCESS")
                self.save_new_hash(new_hash)
            else:
                self.should_be_done()
                self.logger.info("Hash check passed, not rerunning")
    else:
        ini_filecontents = self.get_ini_file()
        if ini_filecontents is None:
            return False

        format_dict = {
            "job_name": self.job_name,
            "log_file": self.logfile,
            "done_files": " ".join(self.done_files),
            "path_to_cosmomc": self.path_to_cosmomc,
            "output_dir": self.output_dir,
            "ini_files": " ".join(self.ini_files),
            "num_jobs": len(self.ini_files),
            "num_walkers": self.num_walkers,
        }
        final_slurm = self.slurm.format(**format_dict)

        new_hash = self.get_hash_from_string(final_slurm + " ".join(ini_filecontents))
        old_hash = self.get_old_hash()

        if force_refresh or new_hash != old_hash:
            self.logger.debug("Regenerating and launching task")
            shutil.rmtree(self.output_dir, ignore_errors=True)
            mkdirs(self.output_dir)
            self.save_new_hash(new_hash)
            slurm_output_file = os.path.join(self.output_dir, "slurm.job")
            with open(slurm_output_file, "w") as f:
                f.write(final_slurm)
            for file, content in zip(self.ini_files, ini_filecontents):
                filepath = os.path.join(self.output_dir, file)
                with open(filepath, "w") as f:
                    f.write(content)
            mkdirs(self.chain_dir)

            needed_dirs = ["data", "paramnames", "camb", "batch1", "batch2", "batch3"]
            for d in needed_dirs:
                self.logger.debug(f"Creating symlink to {d} dir")
                original_data_dir = os.path.join(self.path_to_cosmomc, d)
                new_data_dir = os.path.join(self.output_dir, d)
                os.symlink(original_data_dir, new_data_dir, target_is_directory=True)

            self.logger.info(f"Submitting batch job for data prep")
            subprocess.run(["sbatch", slurm_output_file], cwd=self.output_dir)
        else:
            self.should_be_done()
            self.logger.info("Hash check passed, not rerunning")
    return True
def __init__(self, name, output_dir, sim_task, config, global_config):
    self.config = config
    self.global_config = global_config

    base = config.get("BASE")
    if base is None:
        Task.fail_config(f"You have not specified a BASE nml file for task {name}")
    self.base_file = get_data_loc(base)
    if self.base_file is None:
        Task.fail_config(f"Base file {base} cannot be found for task {name}")

    super().__init__(name, output_dir, self.base_file, " = ", dependencies=[sim_task])

    self.sim_task = sim_task
    self.sim_version = sim_task.output["genversion"]
    self.config_path = self.output_dir + "/FIT_" + self.sim_version + ".nml"
    self.lc_output_dir = os.path.join(self.output_dir, "output")
    self.lc_log_dir = os.path.join(self.lc_output_dir, "SPLIT_JOBS_LCFIT")
    self.fitres_dirs = [os.path.join(self.lc_output_dir, os.path.basename(s)) for s in self.sim_task.output["sim_folders"]]

    self.logging_file = self.config_path.replace(".nml", ".nml_log")
    self.done_file = f"{self.output_dir}/FINISHED.DONE"
    secondary_log = os.path.join(self.lc_log_dir, "MERGELOGS/MERGE2.LOG")
    self.log_files = [self.logging_file, secondary_log]
    self.num_empty_threshold = 20  # Damn that tarball creation can be so slow
    self.display_threshold = 8
    self.output["fitres_dirs"] = self.fitres_dirs
    self.output["nml_file"] = self.config_path
    self.output["genversion"] = self.sim_version
    self.output["sim_name"] = sim_task.output["name"]
    self.output["blind"] = sim_task.output["blind"]
    self.output["lc_output_dir"] = self.lc_output_dir
    self.str_pattern = re.compile("[A-DG-SU-Za-dg-su-z]")

    is_data = False
    for d in self.dependencies:
        if isinstance(d, DataPrep):
            is_data = not d.output["is_sim"]
    self.output["is_data"] = is_data

    # Loading fitopts
    fitopts = config.get("FITOPTS", [])
    if isinstance(fitopts, str):
        fitopts = [fitopts]

    self.logger.debug("Loading fitopts")
    self.fitopts = []
    for f in fitopts:
        potential_path = get_data_loc(f)
        if potential_path is not None and os.path.exists(potential_path):
            self.logger.debug(f"Loading in fitopts from {potential_path}")
            with open(potential_path) as infile:
                new_fitopts = list(infile.read().splitlines())
            self.fitopts += new_fitopts
            self.logger.debug(f"Loaded {len(new_fitopts)} fitopts from {potential_path}")
        else:
            assert "[" in f and "]" in f, f"Manual fitopt {f} for lcfit {self.name} should specify a label in square brackets"
            if not f.startswith("FITOPT:"):
                f = "FITOPT: " + f
            self.logger.debug(f"Adding manual fitopt {f}")
            self.fitopts.append(f)

    # Map the fitopt outputs
    mapped = {"DEFAULT": "FITOPT000.FITRES"}
    mapped2 = {0: "DEFAULT"}
    for i, line in enumerate(self.fitopts):
        label = line.split("[")[1].split("]")[0]
        mapped[line] = f"FITOPT{i + 1:03d}.FITRES"
        mapped2[i] = label
    self.output["fitopt_map"] = mapped
    self.output["fitopt_index"] = mapped2
    self.output["fitres_file"] = os.path.join(self.fitres_dirs[0], mapped["DEFAULT"])

    self.options = self.config.get("OPTS", {})

    # Try to determine how many jobs will be put in the queue
    try:
        property = self.options.get("BATCH_INFO") or self.get_property("BATCH_INFO", assignment=": ")
        self.num_jobs = int(property.split()[-1])
    except Exception:
        self.num_jobs = 10
def __init__(self, name, output_dir, sim_task, config, global_config):
    self.config = config
    self.global_config = global_config

    base = config.get("BASE")
    if base is None:
        Task.fail_config(f"You have not specified a BASE nml file for task {name}")
    self.base_file = get_data_loc(base)
    if self.base_file is None:
        Task.fail_config(f"Base file {base} cannot be found for task {name}")

    super().__init__(name, output_dir, config, self.base_file, " = ", dependencies=[sim_task])

    self.sim_task = sim_task
    self.sim_version = sim_task.output["genversion"]
    self.config_path = self.output_dir + "/FIT_" + self.sim_version + ".nml"
    self.lc_output_dir = os.path.join(self.output_dir, "output")
    self.lc_log_dir = os.path.join(self.lc_output_dir, "SPLIT_JOBS_LCFIT")
    self.fitres_dirs = [os.path.join(self.lc_output_dir, os.path.basename(s)) for s in self.sim_task.output["sim_folders"]]

    self.logging_file = self.config_path.replace(".nml", ".LOG")
    self.kill_file = self.config_path.replace(".input", "_KILL.LOG")
    self.done_file = f"{self.lc_output_dir}/ALL.DONE"
    self.merge_log = os.path.join(self.lc_output_dir, "MERGE.LOG")

    self.log_files = [self.logging_file]
    self.num_empty_threshold = 20  # Damn that tarball creation can be so slow
    self.display_threshold = 8
    self.output["fitres_dirs"] = self.fitres_dirs
    self.output["base_file"] = self.base_file
    self.output["nml_file"] = self.config_path
    self.output["genversion"] = self.sim_version
    self.output["sim_name"] = sim_task.output["name"]
    self.output["blind"] = sim_task.output["blind"]
    self.output["lc_output_dir"] = self.lc_output_dir
    self.str_pattern = re.compile("[A-DG-SU-Za-dg-su-z]")

    self.validate_fitopts(config)

    is_data = False
    for d in self.dependencies:
        if isinstance(d, DataPrep):
            is_data = not d.output["is_sim"]
    self.output["is_data"] = is_data

    self.options = self.config.get("OPTS", {})

    # Try to determine how many jobs will be put in the queue
    # First see if it's been explicitly set
    num_jobs = self.options.get("NUM_JOBS")
    if num_jobs is not None:
        self.num_jobs = num_jobs
        self.logger.debug("Num jobs set by NUM_JOBS option")
    else:
        try:
            property = self.options.get("BATCH_INFO") or self.yaml["CONFIG"].get("BATCH_INFO")
            self.num_jobs = int(property.split()[-1])
            self.logger.debug("Num jobs set by BATCH_INFO")
        except Exception:
            self.logger.warning("Could not determine BATCH_INFO for job, setting num_jobs to 10")
            self.num_jobs = 10
            self.logger.debug("Num jobs set to default")
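# num_jobs resolution order for the fit task above, with hypothetical values:
#
#   OPTS:
#     NUM_JOBS: 20                      # 1) explicit option wins outright
#     BATCH_INFO: "sbatch batch.sh 15"  # 2) else last token of BATCH_INFO -> 15
#   # 3) else fall back to 10 with a warning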
def __init__(self, name, output_dir, config, options, global_config, dependencies=None):
    super().__init__(name, output_dir, config=config, dependencies=dependencies)
    self.options = options
    self.global_config = get_config()
    self.logfile = os.path.join(self.output_dir, "output.log")
    self.conda_env = self.global_config["DataSkimmer"]["conda_env"]
    self.path_to_task = output_dir

    self.unparsed_raw = self.options.get("RAW_DIR")
    self.raw_dir = get_data_loc(self.unparsed_raw)
    if self.raw_dir is None:
        Task.fail_config(f"Unable to find {self.options.get('RAW_DIR')}")
    self.genversion = os.path.basename(self.raw_dir)
    self.data_path = os.path.dirname(self.raw_dir)
    if self.unparsed_raw == "$SCRATCH_SIMDIR" or "SNDATA_ROOT/SIM" in self.raw_dir:
        self.logger.debug("Removing PRIVATE_DATA_PATH from NML file")
        self.data_path = ""
    self.job_name = os.path.basename(Path(output_dir).parents[1]) + "_DATAPREP_" + self.name
    self.output_info = os.path.join(self.output_dir, f"{self.genversion}.YAML")
    self.output["genversion"] = self.genversion
    self.opt_setpkmjd = options.get("OPT_SETPKMJD", 16)
    self.photflag_mskrej = options.get("PHOTFLAG_MSKREJ", 1016)
    self.output["data_path"] = self.data_path
    self.output["photometry_dirs"] = [get_output_loc(self.raw_dir)]
    self.output["sim_folders"] = [get_output_loc(self.raw_dir)]
    self.output["raw_dir"] = self.raw_dir
    self.clump_file = os.path.join(self.output_dir, self.genversion + ".SNANA.TEXT")
    self.output["clump_file"] = self.clump_file
    self.output["ranseed_change"] = False
    is_sim = options.get("SIM", False)
    self.output["is_sim"] = is_sim
    self.output["blind"] = options.get("BLIND", True)

    self.types_dict = options.get("TYPES")
    if self.types_dict is None:
        self.types_dict = {
            "IA": [1],
            "NONIA": [2, 20, 21, 22, 29, 30, 31, 32, 33, 39, 40, 41, 42, 43, 80, 81]
        }
    else:
        for key in self.types_dict.keys():
            self.types_dict[key] = [int(c) for c in self.types_dict[key]]

    self.batch_file = self.options.get("BATCH_FILE")
    if self.batch_file is not None:
        self.batch_file = get_data_loc(self.batch_file)
    self.batch_replace = self.options.get("BATCH_REPLACE", {})

    self.logger.debug(f"\tIA types are {self.types_dict['IA']}")
    self.logger.debug(f"\tNONIA types are {self.types_dict['NONIA']}")
    self.output["types_dict"] = self.types_dict
    self.types = OrderedDict()
    for n in self.types_dict["IA"]:
        self.types.update({n: "Ia"})
    for n in self.types_dict["NONIA"]:
        self.types.update({n: "II"})
    self.output["types"] = self.types

    self.slurm = """{sbatch_header}
{task_setup}"""

    self.clump_command = """#
def __init__(self, name, output_dir, genversion, config, global_config, combine="combine.input"):
    self.data_dirs = global_config["DATA_DIRS"]
    base_file = get_data_loc(combine)
    super().__init__(name, output_dir, base_file, ": ")

    self.genversion = genversion
    if len(genversion) < 30:
        self.genprefix = self.genversion
    else:
        hash = get_hash(self.genversion)[:5]
        self.genprefix = self.genversion[:25] + hash

    self.config = config
    self.options = config.get("OPTS", {})
    self.reserved_keywords = ["BASE"]
    self.config_path = f"{self.output_dir}/{self.genversion}.input"  # Make sure this syncs with the tmp file name

    # Determine the type of each component
    keys = [k for k in config.keys() if k != "GLOBAL" and k != "OPTS"]
    self.base_ia = []
    self.base_cc = []
    types = {}
    types_dict = {"IA": [], "NONIA": []}
    for k in keys:
        d = config[k]
        base_file = d.get("BASE")
        if base_file is None:
            Task.fail_config(f"Your simulation component {k} for sim name {self.name} needs to specify a BASE input file")
        base_path = get_data_loc(base_file)
        if base_path is None:
            Task.fail_config(f"Cannot find sim component {k} base file at {base_path} for sim name {self.name}")

        gentype, genmodel = None, None
        with open(base_path) as f:
            for line in f.read().splitlines():
                if line.upper().strip().startswith("GENTYPE:"):
                    gentype = line.upper().split(":")[1].strip()
                if line.upper().strip().startswith("GENMODEL:"):
                    genmodel = line.upper().split(":")[1].strip()
        gentype = gentype or d.get("GENTYPE")
        genmodel = genmodel or d.get("GENMODEL")

        if not gentype:
            Task.fail_config(f"Cannot find GENTYPE for component {k} and base file {base_path}")
        if not genmodel:
            Task.fail_config(f"Cannot find GENMODEL for component {k} and base file {base_path}")

        type2 = "1" + f"{int(gentype):02d}"
        if "SALT2" in genmodel:
            self.base_ia.append(base_file)
            types[gentype] = "Ia"
            types[type2] = "Ia"
            types_dict["IA"].append(int(gentype))
            types_dict["IA"].append(int(type2))
        else:
            self.base_cc.append(base_file)
            types[gentype] = "II"
            types[type2] = "II"
            types_dict["NONIA"].append(int(gentype))
            types_dict["NONIA"].append(int(type2))

    sorted_types = collections.OrderedDict(sorted(types.items()))
    self.logger.debug(f"Types found: {json.dumps(sorted_types)}")
    self.output["types_dict"] = types_dict
    self.output["types"] = sorted_types

    self.global_config = global_config

    rankeys = [r for r in config["GLOBAL"].keys() if r.startswith("RANSEED_")]
    value = int(config["GLOBAL"][rankeys[0]].split(" ")[0]) if rankeys else 1
    self.set_num_jobs(2 * value)

    self.sim_log_dir = f"{self.output_dir}/LOGS"
    self.total_summary = os.path.join(self.sim_log_dir, "TOTAL_SUMMARY.LOG")
    self.done_file = f"{self.output_dir}/FINISHED.DONE"
    self.logging_file = self.config_path.replace(".input", ".LOG")
    self.output["blind"] = self.options.get("BLIND", False)
    self.derived_batch_info = None

    # Determine if all the top level input files exist
    if len(self.base_ia + self.base_cc) == 0:
        Task.fail_config("Your sim has no components specified! Please add something to simulate!")

    # Try to determine how many jobs will be put in the queue
    try:
        # If BATCH_INFO is set, we'll use that
        batch_info = self.config.get("GLOBAL", {}).get("BATCH_INFO")
        default_batch_info = self.get_property("BATCH_INFO", assignment=": ")

        # If it's not set, let's check for RANSEED_REPEAT or RANSEED_CHANGE
        if batch_info is None:
            ranseed_repeat = self.config.get("GLOBAL", {}).get("RANSEED_REPEAT")
            ranseed_change = self.config.get("GLOBAL", {}).get("RANSEED_CHANGE")
            ranseed = ranseed_repeat or ranseed_change

            if ranseed:
                num_jobs = int(ranseed.strip().split()[0])
                self.logger.debug(f"Found a ranseed with {num_jobs}, deriving batch info")
                comps = default_batch_info.strip().split()
                comps[-1] = str(num_jobs)
                self.derived_batch_info = " ".join(comps)
                self.num_jobs = num_jobs
        else:
            # self.logger.debug(f"BATCH INFO property detected as {property}")
            self.num_jobs = int(default_batch_info.split()[-1])
    except Exception:
        self.logger.warning(f"Unable to determine how many jobs simulation {self.name} has")
        self.num_jobs = 10

    self.output["genversion"] = self.genversion
    self.output["genprefix"] = self.genprefix

    ranseed_change = self.config.get("GLOBAL", {}).get("RANSEED_CHANGE")
    base = os.path.expandvars(f"{self.global_config['SNANA']['sim_dir']}/{self.genversion}")
    if ranseed_change:
        num_sims = int(ranseed_change.split()[0])
        self.logger.debug(f"Detected ranseed change with {num_sims} sims, updating sim_folders")
        self.sim_folders = [base + f"-{i + 1:04d}" for i in range(num_sims)]
    else:
        self.sim_folders = [base]
    self.output["ranseed_change"] = ranseed_change is not None
    self.output["sim_folders"] = self.sim_folders
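# Example of the RANSEED handling above (values are made up): with
# GLOBAL: {RANSEED_CHANGE: "5 12345"}, num_jobs becomes 5 and sim_folders
# expands to [".../GENVERSION-0001", ..., ".../GENVERSION-0005"]. With
# RANSEED_REPEAT the seed count still drives the derived batch info, but
# a single sim folder is kept.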
def write_input(self, force_refresh):
    self.set_property("GENVERSION", self.genversion, assignment=": ", section_end="ENDLIST_GENVERSION")
    self.set_property("LOGDIR", os.path.basename(self.sim_log_dir), assignment=": ", section_end="ENDLIST_GENVERSION")

    for k in self.config.keys():
        if k.upper() != "GLOBAL":
            run_config = self.config[k]
            run_config_keys = list(run_config.keys())
            assert "BASE" in run_config_keys, "You must specify a base file for each option"
            for key in run_config_keys:
                if key.upper() in self.reserved_keywords:
                    continue
                base_file = run_config["BASE"]
                match = os.path.basename(base_file).split(".")[0]
                val = run_config[key]
                if not isinstance(val, list):
                    val = [val]
                for v in val:
                    self.set_property(f"GENOPT({match})", f"{key} {v}", section_end="ENDLIST_GENVERSION", only_add=True)

    if len(self.data_dirs) > 1:
        data_dir = self.data_dirs[0]
        self.set_property("PATH_USER_INPUT", data_dir, assignment=": ")

    for key in self.config.get("GLOBAL", []):
        if key.upper() == "BASE":
            continue
        direct_set = ["FORMAT_MASK", "RANSEED_REPEAT", "RANSEED_CHANGE", "BATCH_INFO", "BATCH_MEM", "NGEN_UNIT", "RESET_CIDOFF"]
        if key in direct_set:
            self.set_property(key, self.config["GLOBAL"][key], assignment=": ")
        else:
            self.set_property(f"GENOPT_GLOBAL: {key}", self.config["GLOBAL"][key], assignment=" ")

        if self.derived_batch_info:
            self.set_property("BATCH_INFO", self.derived_batch_info, assignment=": ")

        if key == "RANSEED_CHANGE":
            self.delete_property("RANSEED_REPEAT")
        elif key == "RANSEED_REPEAT":
            self.delete_property("RANSEED_CHANGE")

    self.set_property("SIMGEN_INFILE_Ia", " ".join([os.path.basename(f) for f in self.base_ia]) if self.base_ia else None)
    self.set_property("SIMGEN_INFILE_NONIa", " ".join([os.path.basename(f) for f in self.base_cc]) if self.base_cc else None)
    self.set_property("GENPREFIX", self.genprefix)

    # Put config in a temp directory
    temp_dir_obj = tempfile.TemporaryDirectory()
    temp_dir = temp_dir_obj.name

    # Copy the base files across
    input_paths = []
    for f in self.base_ia + self.base_cc:
        resolved = get_data_loc(f)
        shutil.copy(resolved, temp_dir)
        input_paths.append(os.path.join(temp_dir, os.path.basename(f)))
        self.logger.debug(f"Copying input file {resolved} to {temp_dir}")

    # Copy the include input file if there is one
    input_copied = []
    fs = self.base_ia + self.base_cc
    for ff in fs:
        if ff not in input_copied:
            input_copied.append(ff)
            path = get_data_loc(ff)
            copied_path = os.path.join(temp_dir, os.path.basename(path))
            with open(path, "r") as f:
                for line in f.readlines():
                    line = line.strip()
                    if line.startswith("INPUT_FILE_INCLUDE"):
                        include_file = line.split(":")[-1].strip()
                        include_file_path = get_data_loc(include_file)
                        self.logger.debug(f"Copying INPUT_FILE_INCLUDE file {include_file_path} to {temp_dir}")
                        include_file_basename = os.path.basename(include_file_path)
                        include_file_output = os.path.join(temp_dir, include_file_basename)
                        if include_file_output not in input_copied:
                            # Copy include file into the temp dir
                            shutil.copy(include_file_path, temp_dir)
                            # Then SED the file to replace the full path with just the basename
                            if include_file != include_file_basename:
                                sed_command = f"sed -i -e 's|{include_file}|{include_file_basename}|g' {copied_path}"
                                self.logger.debug(f"Running sed command: {sed_command}")
                                subprocess.run(sed_command, stderr=subprocess.STDOUT, cwd=temp_dir, shell=True)
                            # And make sure we don't do this file again
                            fs.append(include_file_output)

    # Write the primary input file
    main_input_file = f"{temp_dir}/{self.genversion}.input"
    with open(main_input_file, "w") as f:
        f.writelines(map(lambda s: s + "\n", self.base))
    self.logger.info(f"Input file written to {main_input_file}")

    # Remove any duplicates and order the output files
    output_files = [f"{temp_dir}/{a}" for a in sorted(os.listdir(temp_dir))]
    self.logger.debug(f"{len(output_files)} files used to create simulation. Hashing them.")

    # Get current hash
    new_hash = self.get_hash_from_files(output_files)
    old_hash = self.get_old_hash()
    regenerate = force_refresh or (old_hash is None or old_hash != new_hash)

    if regenerate:
        self.logger.info(f"Running simulation")
        # Clean output dir. God I feel dangerous doing this, so hopefully unnecessary check
        if "//" not in self.output_dir and len(self.output_dir) > 30:
            self.logger.debug(f"Cleaning output directory {self.output_dir}")
            shutil.rmtree(self.output_dir, ignore_errors=True)
            mkdirs(self.output_dir)
            self.logger.debug(f"Copying from {temp_dir} to {self.output_dir}")
            copytree(temp_dir, self.output_dir)
            self.save_new_hash(new_hash)
        else:
            self.logger.error(f"Seems to be an issue with the output dir path: {self.output_dir}")
        chown_dir(self.output_dir)
    else:
        self.logger.info("Hash check passed, not rerunning")
    temp_dir_obj.cleanup()
    return regenerate, new_hash
def __init__(self, name, output_dir, dependencies=None, config=None, done_file="done.txt"):
    self.name = name
    self.output_dir = output_dir
    self.num_jobs = 1
    if dependencies is None:
        dependencies = []
    self.dependencies = dependencies
    if config is None:
        config = {}
    self.config = copy.deepcopy(config)
    self.output = {}

    # Determine if this is an external (already done) job or not
    external_dirs = self.config.get("EXTERNAL_DIRS", [])
    external_names = [os.path.basename(d) for d in external_dirs]
    external_map = self.config.get("EXTERNAL_MAP", {})
    output_name = os.path.basename(output_dir)
    name_match = external_map.get(output_name)
    if external_dirs:
        # This will only trigger if EXTERNAL_MAP is defined and output_name is in external_map
        if name_match is not None:
            matching_dirs = [d for d in external_dirs if name_match in d]
            if len(matching_dirs) == 0:
                self.logger.error(f"Task {output_name} has external mapping {name_match} but there were no matching EXTERNAL_DIRS")
            else:
                if len(matching_dirs) > 1:
                    self.logger.warning(f"Task {output_name} has external mapping {name_match} which matched with multiple EXTERNAL_DIRS: {matching_dirs}. Defaulting to {matching_dirs[0]}")
                self.logger.info(f"Found external match for {output_name}")
                self.config["EXTERNAL"] = matching_dirs[0]
        # If you haven't specified an EXTERNAL_MAP for this output_name, check for exact match
        elif output_name in external_names:
            self.config["EXTERNAL"] = external_dirs[external_names.index(output_name)]
        else:
            self.logger.info(f"No external match found for {output_name}")

    self.external = self.config.get("EXTERNAL")
    if self.external is not None:
        self.logger.debug(f"External config stated to be {self.external}")
        self.external = get_data_loc(self.external)
        # External directory might be compressed
        if not os.path.exists(self.external):
            self.logger.warning(f"External config {self.external} does not exist, checking if it's compressed")
            compressed_dir = self.external + ".tar.gz"
            if not os.path.exists(compressed_dir):
                self.logger.error(f"{self.external} and {compressed_dir} do not exist")
            else:
                self.external = compressed_dir
                self.logger.debug(f"External config file path resolved to {self.external}")
                with tarfile.open(self.external, "r:gz") as tar:
                    for member in tar:
                        if member.isfile():
                            filename = os.path.basename(member.name)
                            if filename != "config.yml":
                                continue
                            with tar.extractfile(member) as f:
                                external_config = yaml.load(f, Loader=yaml.Loader)
                                conf = external_config.get("CONFIG", {})
                                conf.update(self.config)
                                self.config = conf
                                self.output = external_config.get("OUTPUT", {})
                                self.logger.debug("Loaded external config successfully")
        else:
            if os.path.isdir(self.external):
                self.external = os.path.join(self.external, "config.yml")
            self.logger.debug(f"External config file path resolved to {self.external}")
            with open(self.external, "r") as f:
                external_config = yaml.load(f, Loader=yaml.Loader)
                conf = external_config.get("CONFIG", {})
                conf.update(self.config)
                self.config = conf
                self.output = external_config.get("OUTPUT", {})
                self.logger.debug("Loaded external config successfully")

    self.hash = None
    self.hash_file = os.path.join(self.output_dir, "hash.txt")
    self.done_file = os.path.join(self.output_dir, done_file)

    # Info about the job run
    self.start_time = None
    self.end_time = None
    self.wall_time = None
    self.stage = None
    self.fresh_run = True
    self.num_empty = 0
    self.num_empty_threshold = 10
    self.display_threshold = 0
    self.gpu = False
    self.force_refresh = False
    self.force_ignore = False

    self.output.update({
        "name": name,
        "output_dir": output_dir,
        "hash_file": self.hash_file,
        "done_file": self.done_file
    })
    self.config_file = os.path.join(output_dir, "config.yml")
def __init__(self, name, output_dir, config, options, global_config, dependencies=None, index=0):
    base_file = get_data_loc("create_cov/input_file.txt")
    super().__init__(name, output_dir, config, base_file, default_assignment=": ", dependencies=dependencies)

    if options is None:
        options = {}
    self.options = options
    self.templates_dir = self.options.get("INI_DIR", "cosmomc_templates")
    self.global_config = get_config()
    self.index = index
    self.job_name = os.path.basename(Path(output_dir).parents[1]) + "_CREATE_COV_" + name
    # self.path_to_code = os.path.abspath(os.path.dirname(inspect.stack()[0][1]) + "/external/")
    self.path_to_code = '$SNANA_DIR/util/'  # Now maintained by SNANA
    self.batch_mem = options.get("BATCH_MEM", "4GB")

    self.logfile = os.path.join(self.output_dir, "output.log")
    self.sys_file_out = os.path.join(self.output_dir, "sys_scale.yml")
    self.chain_dir = os.path.join(self.output_dir, "chains/")
    self.config_dir = os.path.join(self.output_dir, "output")
    self.subtract_vpec = options.get("SUBTRACT_VPEC", False)
    self.unbinned_covmat_addin = options.get("UNBINNED_COVMAT_ADDIN", [])

    self.batch_file = self.options.get("BATCH_FILE")
    if self.batch_file is not None:
        self.batch_file = get_data_loc(self.batch_file)
    self.batch_replace = self.options.get("BATCH_REPLACE", {})

    self.binned = options.get("BINNED", not self.subtract_vpec)
    self.rebinned_x1 = options.get("REBINNED_X1", "")
    if self.rebinned_x1 != "":
        self.rebinned_x1 = f"--nbin_x1 {self.rebinned_x1}"
    self.rebinned_c = options.get("REBINNED_C", "")
    if self.rebinned_c != "":
        self.rebinned_c = f"--nbin_c {self.rebinned_c}"

    self.biascor_dep = self.get_dep(BiasCor, fail=True)
    self.sys_file_in = self.get_sys_file_in()
    self.output["blind"] = self.biascor_dep.output["blind"]
    self.input_file = os.path.join(self.output_dir, self.biascor_dep.output["subdirs"][index] + ".input")
    self.calibration_set = options.get("CALIBRATORS", [])
    self.output["hubble_plot"] = self.biascor_dep.output["hubble_plot"]

    if self.config.get("COSMOMC", True):
        self.logger.info("Generating cosmomc output")
        self.output["ini_dir"] = os.path.join(self.config_dir, "cosmomc")
        self.prepare_cosmomc = True
    else:
        self.logger.info("Not generating cosmomc output")
        self.prepare_cosmomc = False

    covopts_map = {"ALL": 0}
    for i, covopt in enumerate(self.options.get("COVOPTS", [])):
        covopts_map[covopt.split("]")[0][1:]] = i + 1
    self.output["covopts"] = covopts_map
    self.output["index"] = index
    self.output["bcor_name"] = self.biascor_dep.name
    self.slurm = """{sbatch_header}
def __init__(self, name, output_dir, config, dependencies, options, global_config):
    base = get_data_loc(config.get("BASE", "surveys/des/bbc/bbc_5yr.input"))
    self.base_file = base
    super().__init__(name, output_dir, config, base, "=", dependencies=dependencies)

    self.options = options
    self.logging_file = os.path.join(self.output_dir, "output.log")
    self.global_config = get_config()
    self.prob_cols = config["PROB_COLS"]

    self.merged_data = config.get("DATA")
    self.merged_iasim = config.get("SIMFILE_BIASCOR")
    self.merged_ccsim = config.get("SIMFILE_CCPRIOR")
    self.classifier = config.get("CLASSIFIER")
    if self.classifier is not None:
        self.config["CLASSIFIER"] = self.classifier.name
    self.make_all = config.get("MAKE_ALL_HUBBLE", True)
    self.use_recalibrated = config.get("USE_RECALIBRATED", False)
    self.consistent_sample = config.get("CONSISTENT_SAMPLE", True)

    self.bias_cor_fits = None
    self.cc_prior_fits = None
    self.data = None
    self.data_fitres = None
    self.sim_names = [m.output["sim_name"] for m in self.merged_data]
    self.blind = self.get_blind(config, options)
    self.logger.debug(f"Blinding set to {self.blind}")
    self.output["blind"] = self.blind
    self.genversions = [m.output["genversion"] for m in self.merged_data]
    self.num_verions = [len(m.output["fitres_dirs"]) for m in self.merged_data]
    self.output["fitopt_files"] = [m.output.get("fitopt_file") for m in self.merged_data]
    self.genversion = "_".join(self.sim_names) + ("" if self.classifier is None else "_" + self.classifier.name)

    self.config_filename = f"{self.name}.input"  # Make sure this syncs with the tmp file name
    self.config_path = os.path.join(self.output_dir, self.config_filename)
    self.kill_file = self.config_path.replace(".input", "_KILL.LOG")
    self.job_name = os.path.basename(self.config_path)
    self.fit_output_dir = os.path.join(self.output_dir, "output")
    self.merge_log = os.path.join(self.fit_output_dir, "MERGE.LOG")
    self.reject_list = os.path.join(self.output_dir, "reject.list")

    self.done_file = os.path.join(self.fit_output_dir, "ALL.DONE")
    self.done_file_iteration = os.path.join(self.output_dir, "RESUBMITTED.DONE")
    self.run_iteration = 1 if os.path.exists(self.done_file_iteration) else 0

    self.probability_column_name = None
    if self.config.get("PROB_COLUMN_NAME") is not None:
        self.probability_column_name = self.config.get("PROB_COLUMN_NAME")
    elif self.classifier is not None:
        self.probability_column_name = self.prob_cols[self.classifier.name]
    self.output["prob_column_name"] = self.probability_column_name

    if self.use_recalibrated:
        new_name = self.probability_column_name.replace("PROB_", "CPROB_")
        self.logger.debug(f"Updating prob column name from {self.probability_column_name} to {new_name}. I hope it exists!")
        self.probability_column_name = new_name

    self.output["fit_output_dir"] = self.fit_output_dir
    self.output["NSPLITRAN"] = "NSPLITRAN" in [x.upper() for x in self.options.keys()]
    if self.output["NSPLITRAN"]:
        self.output["NSPLITRAN_VAL"] = {x.upper(): y for x, y in self.options.items()}["NSPLITRAN"]

    self.w_summary = os.path.join(self.fit_output_dir, "BBC_SUMMARY_wfit.FITRES")
    self.output["w_summary"] = self.w_summary

    self.set_m0dif_dirs()

    if not self.make_all:
        self.output_plots = [self.output_plots[0]]
    self.logger.debug(f"Making {len(self.output_plots)} plots")

    self.muopts = self.config.get("MUOPTS", {})
    self.muopt_order = list(self.muopts.keys())
    self.output["muopts"] = self.muopt_order
    self.output["hubble_plot"] = self.output_plots

    self.devel = self.options.get('devel', 0)
    self.logger.debug(f"Devel option: {self.devel}")
    self.do_iterate = False  # Temp flag to stop iterating as BBC will reiterate natively
    self.logger.debug(f"Do iterate: {self.do_iterate}")
    self.logger.debug(f"SNANA_DIR: {os.environ['SNANA_DIR']}")
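# Probability column resolution for the biascor task above (the names here
# are illustrative, not canonical): an explicit PROB_COLUMN_NAME wins,
# otherwise the column registered for the attached classifier is used.
#
#   PROB_COLUMN_NAME: PROB_MYCOL          # explicit override
#   # else: prob_cols[classifier.name], e.g. {"SNN_TEST": "PROB_SNN_TEST"}
#   USE_RECALIBRATED: True                # then PROB_ -> CPROB_ (see above)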
def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None):
    super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name)
    self.global_config = get_config()
    self.dump_dir = output_dir + "/dump"
    self.job_base_name = os.path.basename(output_dir)
    self.gpu = config.get("GPU", True)
    self.tmp_output = None
    self.done_file = os.path.join(self.output_dir, "done_task.txt")
    self.done_file2 = os.path.join(self.output_dir, "done_task2.txt")
    self.variant = options.get("VARIANT", "vanilla").lower()
    self.redshift = "zspe" if options.get("REDSHIFT", True) else "none"
    self.norm = options.get("NORM", "cosmo")
    self.cyclic = options.get("CYCLIC", True)
    self.seed = options.get("SEED", 0)
    self.clean = config.get("CLEAN", True)
    self.batch_size = options.get("BATCH_SIZE", 128)
    self.num_layers = options.get("NUM_LAYERS", 2)
    self.hidden_dim = options.get("HIDDEN_DIM", 32)

    # Setup yml files
    self.data_yml_file = options.get("DATA_YML", None)
    self.output_data_yml = os.path.join(self.output_dir, "data.yml")
    self.classification_yml_file = options.get("CLASSIFICATION_YML", None)
    self.output_classification_yml = os.path.join(self.output_dir, "classification.yml")
    # XOR - only runs if either but not both ymls are None
    if (self.data_yml_file is None) ^ (self.classification_yml_file is None):
        self.logger.error(f"If using yml inputs, both 'DATA_YML' (currently {self.data_yml_file}) and 'CLASSIFICATION_YML' (currently {self.classification_yml_file}) must be provided")
    elif self.data_yml_file is not None:
        with open(self.data_yml_file, 'r') as f:
            self.data_yml = f.read()
        with open(self.classification_yml_file, 'r') as f:
            self.classification_yml = f.read()
        self.has_yml = True
        self.variant = self.get_variant_from_yml(self.classification_yml)
    else:
        self.data_yml = None
        self.classification_yml = None
        self.has_yml = False

    self.batch_file = self.options.get("BATCH_FILE")
    if self.batch_file is not None:
        self.batch_file = get_data_loc(self.batch_file)
    self.batch_replace = self.options.get("BATCH_REPLACE", {})

    self.validate_model()

    assert self.norm in ["global", "cosmo", "perfilter", "cosmo_quantile", "none"], \
        f"Norm option is set to {self.norm}, needs to be one of 'global', 'cosmo', 'perfilter', 'cosmo_quantile', 'none'"
    assert self.variant in ["vanilla", "variational", "bayesian"], \
        f"Variant {self.variant} is not vanilla, variational or bayesian"

    self.slurm = """{sbatch_header}
{task_setup}
"""
    self.conda_env = self.global_config["SuperNNova"]["conda_env"]
    self.path_to_classifier = get_output_loc(self.global_config["SuperNNova"]["location"])
def __init__(self, name, output_dir, config, options, global_config, dependencies=None):
    super().__init__(name, output_dir, config=config, dependencies=dependencies)
    self.options = options
    self.global_config = global_config
    self.job_name = os.path.basename(Path(output_dir).parents[1]) + "_COSMOMC_" + name
    self.logfile = os.path.join(self.output_dir, "output.log")

    self.path_to_cosmomc = get_output_loc(self.global_config["CosmoMC"]["location"])

    self.create_cov_dep = self.get_dep(CreateCov)
    self.blind = self.create_cov_dep.output["blind"] if self.create_cov_dep is not None else self.options.get("BLIND", False)
    assert isinstance(self.blind, (bool, np.bool_)), "Blind should be set to a boolean value!"
    self.ini_prefix = options.get("INI").replace(".ini", "")
    self.static = self.ini_prefix in ["cmb_omw", "cmb_omol"]
    self.static_path = "cosmomc_static_chains/"

    if self.create_cov_dep is None:
        self.ini_files = [f"{self.ini_prefix}.ini"]
        self.num_walkers = 4
        self.covopts = ["ALL"]
        self.covopts_numbers = [0]
        self.labels = [self.name]
        self.num_jobs = 1
    else:
        self.num_walkers = options.get("NUM_WALKERS", 8)
        avail_cov_opts = self.create_cov_dep.output["covopts"]
        self.covopts = options.get("COVOPTS") or list(avail_cov_opts.keys())
        self.covopts_numbers = [avail_cov_opts[k] for k in self.covopts]
        self.ini_files = [f"{self.ini_prefix}_{num}.ini" for num in self.covopts_numbers]
        self.output["hubble_plot"] = self.create_cov_dep.output["hubble_plot"]
        self.output["bcor_name"] = self.create_cov_dep.output["bcor_name"]
        self.labels = [self.name + "_" + c for c in self.covopts]
        self.num_jobs = len(self.covopts)

    self.ntasks = 10
    self.logger.debug(f"Num Walkers: {self.num_walkers}")
    self.chain_dir = os.path.join(self.output_dir, "chains/")
    self.param_dict = {
        l: os.path.join(self.chain_dir, i.replace(".ini", ".paramnames"))
        for l, i in zip(self.covopts, self.ini_files)
    }
    self.done_files = [f"done_{num}.txt" for num in self.covopts_numbers]
    self.chain_dict = {
        l: os.path.join(self.chain_dir, i.replace(".ini", f"_{n + 1}.txt"))
        for l, i in zip(self.covopts, self.ini_files)
        for n in range(self.ntasks)
    }
    self.base_dict = {
        l: os.path.join(self.chain_dir, i.replace(".ini", ""))
        for l, i in zip(self.covopts, self.ini_files)
        for n in range(self.ntasks)
    }
    self.output["chain_dir"] = self.chain_dir
    self.output["param_dict"] = self.param_dict
    self.output["chain_dict"] = self.chain_dict
    self.output["base_dict"] = self.base_dict
    self.output["covopts"] = self.covopts
    self.output["blind"] = self.blind
    self.output["label"] = (
        self.options.get("LABEL", f"({' + '.join(self.ini_prefix.upper().split('_')[:-1])})")
        + " "
        + (self.create_cov_dep.output["name"] if self.create_cov_dep is not None else "")
    )

    # TODO: Better logic here please
    final = self.ini_prefix.split("_")[-1]
    ps = {
        "omw": ["omegam", "w"],
        "flatomol": ["omegam"],
        "omol": ["omegam", "omegal"],
        "wnu": ["w", "nu"],
        "wwa": ["w", "wa"]
    }
    if final not in ps.keys():
        self.fail_config(f"The filename passed in ({self.ini_prefix}) needs to have format 'components_cosmology.ini', where the cosmology is omw, omol, wnu or wwa. Is this a custom file?")
    self.output["cosmology_params"] = ps[final]

    self.batch_file = self.options.get("BATCH_FILE")
    if self.batch_file is not None:
        self.batch_file = get_data_loc(self.batch_file)
    self.batch_replace = self.options.get("BATCH_REPLACE", {})

    self.slurm = """{sbatch_header}
def __init__(self, name, output_dir, config, options, dependencies=None):
    super().__init__(name, output_dir, config=config, dependencies=dependencies)
    self.options = options
    self.global_config = get_config()
    self.logfile = os.path.join(self.output_dir, "output.log")
    self.job_name = os.path.basename(Path(output_dir).parents[1]) + "_ANALYSE_" + os.path.basename(output_dir)
    self.path_to_codes = []
    self.done_files = []
    self.plot_code_dir = os.path.join(os.path.dirname(inspect.stack()[0][1]), "external")

    self.covopts = options.get("COVOPTS")
    self.singular_blind = options.get("SINGULAR_BLIND", False)
    if isinstance(self.covopts, str):
        self.covopts = [self.covopts]

    self.cosmomc_input_files = []
    self.cosmomc_output_files = []
    self.cosmomc_covopts = []
    self.names = []
    self.params = []

    # Assuming all deps are cosmomc tasks
    self.cosmomc_deps = self.get_deps(CosmoMC)
    self.blind = np.any([c.output["blind"] for c in self.cosmomc_deps])
    if self.blind:
        self.blind_params = ["w", "om", "ol", "omegam", "omegal"]
    else:
        if options.get("BLIND", False):
            self.blind_params = options.get("BLIND")
        else:
            self.blind_params = []

    self.biascor_deps = self.get_deps(BiasCor)
    self.lcfit_deps = self.get_deps(SNANALightCurveFit)

    if self.cosmomc_deps:
        self.add_plot_script_to_run("parse_cosmomc.py")
        self.add_plot_script_to_run("plot_cosmomc.py")
        self.add_plot_script_to_run("plot_errbudget.py")
    if self.biascor_deps:
        self.add_plot_script_to_run("parse_biascor.py")
        self.add_plot_script_to_run("plot_biascor.py")
    if self.lcfit_deps:
        self.add_plot_script_to_run("parse_lcfit.py")
        self.add_plot_script_to_run("plot_histogram.py")
        self.add_plot_script_to_run("plot_efficiency.py")

    if self.options.get("ADDITIONAL_SCRIPTS") is not None:
        vals = ensure_list(self.options.get("ADDITIONAL_SCRIPTS"))
        for v in vals:
            self.add_plot_script_to_run(v)

    self.done_file = self.done_files[-1]

    for c in self.cosmomc_deps:
        for covopt in c.output["covopts"]:
            self.cosmomc_input_files.append(c.output["base_dict"][covopt])
            self.cosmomc_output_files.append(c.output["label"] + "_" + covopt + ".csv.gz")
            self.cosmomc_covopts.append(covopt)
            self.names.append(c.output["label"].replace("_", " ") + " " + covopt)
        for p in c.output["cosmology_params"]:
            if p not in self.params:
                self.params.append(p)
    self.logger.debug(f"Analyse task will create CosmoMC plots with {len(self.cosmomc_input_files)} covopts/plots")

    self.wsummary_files = [b.output["w_summary"] for b in self.biascor_deps]

    # Get the fitres and m0diff files we'd want to parse for Hubble diagram plotting
    self.biascor_fitres_input_files = [
        os.path.join(m, "FITOPT000_MUOPT000.FITRES.gz")
        for b in self.biascor_deps
        for m in b.output["m0dif_dirs"]
    ]
    self.biascor_prob_col_names = [
        b.output["prob_column_name"]
        for b in self.biascor_deps
        for m in b.output["m0dif_dirs"]
    ]
    self.biascor_fitres_output_files = [
        b.name + "__" + os.path.basename(m).replace("OUTPUT_BBCFIT", "1") + "__FITOPT0_MUOPT0.fitres.gz"
        for b in self.biascor_deps
        for m in b.output["m0dif_dirs"]
    ]

    self.biascor_m0diffs = []
    self.biascor_m0diff_output = "all_biascor_m0diffs.csv"
    self.biascor_fitres_combined = "all_biascor_fitres.csv.gz"

    self.batch_file = self.options.get("BATCH_FILE")
    if self.batch_file is not None:
        self.batch_file = get_data_loc(self.batch_file)
    self.batch_replace = self.options.get("BATCH_REPLACE", {})

    self.slurm = """{sbatch_header}
def write_input(self): # As Pippin only does one GENVERSION at a time, lets extract it first, and also the config c = self.yaml["CONFIG"] d = self.yaml["GENVERSION_LIST"][0] g = self.yaml["GENOPT_GLOBAL"] # Ensure g is a dict with a ref we can update if g is None: g = {} self.yaml["GENOPT_GLOBAL"] = g # Start setting properties in the right area d["GENVERSION"] = self.genversion # Logging now goes in the "CONFIG" c["LOGDIR"] = os.path.basename(self.sim_log_dir) for k in self.config.keys(): if k.upper() not in self.reserved_top: run_config = self.config[k] run_config_keys = list(run_config.keys()) assert "BASE" in run_config_keys, "You must specify a base file for each option" for key in run_config_keys: if key.upper() in self.reserved_keywords: continue base_file = run_config["BASE"] match = os.path.basename(base_file).split(".")[0] val = run_config[key] if not isinstance(val, list): val = [val] lookup = f"GENOPT({match})" if lookup not in d: d[lookup] = {} for v in val: d[lookup][key] = v if len(self.data_dirs) > 1: data_dir = self.data_dirs[0] c["PATH_USER_INPUT"] = data_dir for key in self.config.get("GLOBAL", []): if key.upper() == "BASE": continue direct_set = [ "FORMAT_MASK", "RANSEED_REPEAT", "RANSEED_CHANGE", "BATCH_INFO", "BATCH_MEM", "NGEN_UNIT", "RESET_CIDOFF" ] if key in direct_set: c[key] = self.config["GLOBAL"][key] else: g[key] = self.config["GLOBAL"][key] if self.derived_batch_info: c["BATCH_INFO"] = self.derived_batch_info if key == "RANSEED_CHANGE" and c.get("RANSEED_REPEAT") is not None: del c["RANSEED_REPEAT"] elif key == "RANSEED_REPEAT" and c.get( "RANSEED_CHANGE") is not None: del c["RANSEED_CHANGE"] if self.base_ia: c["SIMGEN_INFILE_Ia"] = [os.path.basename(f) for f in self.base_ia] else: del c["SIMGEN_INFILE_Ia"] if self.base_cc: c["SIMGEN_INFILE_NONIa"] = [ os.path.basename(f) for f in self.base_cc ] else: del c["SIMGEN_INFILE_NONIa"] c["GENPREFIX"] = self.genprefix # Put config in a temp directory temp_dir_obj = tempfile.TemporaryDirectory() temp_dir = temp_dir_obj.name # Copy the base files across input_paths = [] for f in self.base_ia + self.base_cc: resolved = get_data_loc(f) shutil.copy(resolved, temp_dir) input_paths.append(os.path.join(temp_dir, os.path.basename(f))) self.logger.debug(f"Copying input file {resolved} to {temp_dir}") # Copy the include input file if there is one input_copied = [] fs = self.base_ia + self.base_cc for ff in fs: if ff not in input_copied: input_copied.append(ff) path = get_data_loc(ff) copied_path = os.path.join(temp_dir, os.path.basename(path)) with open(path, "r") as f: for line in f.readlines(): line = line.strip() if line.startswith("INPUT_FILE_INCLUDE"): include_file = line.split(":")[-1].strip() include_file_path = get_data_loc(include_file) self.logger.debug( f"Copying INPUT_FILE_INCLUDE file {include_file_path} to {temp_dir}" ) include_file_basename = os.path.basename( include_file_path) include_file_output = os.path.join( temp_dir, include_file_basename) if include_file_output not in input_copied: # Copy include file into the temp dir shutil.copy(include_file_path, temp_dir) # Then SED the file to replace the full path with just the basename if include_file != include_file_basename: sed_command = f"sed -i -e 's|{include_file}|{include_file_basename}|g' {copied_path}" self.logger.debug( f"Running sed command: {sed_command}") subprocess.run(sed_command, stderr=subprocess.STDOUT, cwd=temp_dir, shell=True) # And make sure we dont do this file again fs.append(include_file_output) # Write the primary input file 
main_input_file = f"{temp_dir}/{self.genversion}.input" self.write_output_file(main_input_file) # Remove any duplicates and order the output files output_files = [ f"{temp_dir}/{a}" for a in sorted(os.listdir(temp_dir)) ] self.logger.debug( f"{len(output_files)} files used to create simulation. Hashing them." ) # Get current hash new_hash = self.get_hash_from_files(output_files) regenerate = self._check_regenerate(new_hash) if regenerate: self.logger.info(f"Running simulation") # Clean output dir. God I feel dangerous doing this, so hopefully unnecessary check if "//" not in self.output_dir and len(self.output_dir) > 30: self.logger.debug( f"Cleaning output directory {self.output_dir}") shutil.rmtree(self.output_dir, ignore_errors=True) mkdirs(self.output_dir) self.logger.debug( f"Copying from {temp_dir} to {self.output_dir}") copytree(temp_dir, self.output_dir) self.save_new_hash(new_hash) else: self.logger.error( f"Seems to be an issue with the output dir path: {self.output_dir}" ) chown_dir(self.output_dir) else: self.logger.info("Hash check passed, not rerunning") temp_dir_obj.cleanup() return regenerate, new_hash
def __init__(self, name, output_dir, options, global_config, dependencies=None):
    super().__init__(name, output_dir, dependencies=dependencies)
    self.options = options
    self.global_config = get_config()
    self.logfile = os.path.join(self.output_dir, "output.log")
    self.conda_env = self.global_config["DataSkimmer"]["conda_env"]
    self.path_to_task = output_dir

    self.unparsed_raw = self.options.get("RAW_DIR")
    self.raw_dir = get_data_loc(self.unparsed_raw)
    if self.raw_dir is None:
        Task.fail_config(f"Unable to find {self.options.get('RAW_DIR')}")

    self.genversion = os.path.basename(self.raw_dir)
    self.data_path = os.path.dirname(self.raw_dir)
    if self.unparsed_raw == "$SCRATCH_SIMDIR" or "SNDATA_ROOT/SIM" in self.raw_dir:
        self.logger.debug("Removing PRIVATE_DATA_PATH from NML file")
        self.data_path = ""
    self.job_name = os.path.basename(Path(output_dir).parents[1]) + "_DATAPREP_" + self.name

    self.output["genversion"] = self.genversion
    self.output["data_path"] = self.data_path
    self.output["photometry_dirs"] = [get_output_loc(self.raw_dir)]
    self.output["sim_folders"] = [get_output_loc(self.raw_dir)]
    self.output["raw_dir"] = self.raw_dir
    self.clump_file = os.path.join(self.output_dir, self.genversion + ".SNANA.TEXT")
    self.output["clump_file"] = self.clump_file
    self.output["ranseed_change"] = False
    is_sim = options.get("SIM", False)
    self.output["is_sim"] = is_sim
    self.output["blind"] = options.get("BLIND", not is_sim)

    self.types_dict = options.get("TYPES")
    if self.types_dict is None:
        self.types_dict = {
            "IA": [1],
            "NONIA": [2, 20, 21, 22, 29, 30, 31, 32, 33, 39, 40, 41, 42, 43, 80, 81],
        }
    else:
        for key in self.types_dict.keys():
            self.types_dict[key] = [int(c) for c in self.types_dict[key]]

    self.logger.debug(f"\tIA types are {self.types_dict['IA']}")
    self.logger.debug(f"\tNONIA types are {self.types_dict['NONIA']}")
    self.output["types_dict"] = self.types_dict

    self.types = OrderedDict()
    for n in self.types_dict["IA"]:
        self.types.update({n: "Ia"})
    for n in self.types_dict["NONIA"]:
        self.types.update({n: "II"})
    self.output["types"] = self.types

    self.slurm = """#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH --time=0:20:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --partition=broadwl
#SBATCH --output={log_file}
#SBATCH --account=pi-rkessler
#SBATCH --mem=2GB

cd {path_to_task}
snana.exe clump.nml
if [ $? -eq 0 ]; then
    echo SUCCESS > {done_file}
else
    echo FAILURE > {done_file}
fi
"""
    self.clump_command = """#
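# Short standalone sketch (hypothetical input) of the TYPES mapping built in
# the DataPrep constructor above: integer SNANA types are labelled "Ia" or
# "II", with the IA list applied first and NONIA second.
from collections import OrderedDict

def _build_type_map_sketch(types_dict):
    types = OrderedDict()
    for n in types_dict["IA"]:
        types[n] = "Ia"
    for n in types_dict["NONIA"]:
        types[n] = "II"
    return types

assert _build_type_map_sketch({"IA": [1], "NONIA": [2, 20]}) == OrderedDict([(1, "Ia"), (2, "II"), (20, "II")])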
def __init__(self, name, output_dir, config, global_config, combine="combine.input"):
    self.data_dirs = global_config["DATA_DIRS"]
    base_file = get_data_loc(combine)
    super().__init__(name, output_dir, config, base_file, ": ")

    # Check for any replacements
    path_sndata_sim = get_config().get("SNANA").get("sim_dir")
    self.logger.debug(f"Setting PATH_SNDATA_SIM to {path_sndata_sim}")
    self.yaml["CONFIG"]["PATH_SNDATA_SIM"] = path_sndata_sim

    self.genversion = self.config["GENVERSION"]
    if len(self.genversion) < 30:
        self.genprefix = self.genversion
    else:
        hash_suffix = get_hash(self.genversion)[:5]
        self.genprefix = self.genversion[:25] + hash_suffix

    self.options = self.config.get("OPTS", {})
    self.reserved_keywords = ["BASE"]
    self.reserved_top = ["GENVERSION", "GLOBAL", "OPTS", "EXTERNAL"]
    self.config_path = f"{self.output_dir}/{self.genversion}.input"  # Make sure this syncs with the tmp file name
    self.global_config = global_config

    self.sim_log_dir = f"{self.output_dir}/LOGS"
    self.total_summary = os.path.join(self.sim_log_dir, "MERGE.LOG")
    self.done_file = f"{self.output_dir}/LOGS/ALL.DONE"
    self.logging_file = self.config_path.replace(".input", ".LOG")
    self.kill_file = self.config_path.replace(".input", "_KILL.LOG")

    if "EXTERNAL" not in self.config.keys():
        # Determine the type of each component
        keys = [k for k in self.config.keys() if k not in self.reserved_top]
        self.base_ia = []
        self.base_cc = []
        types = {}
        types_dict = {"IA": [], "NONIA": []}
        for k in keys:
            d = self.config[k]
            base_file = d.get("BASE")
            if base_file is None:
                Task.fail_config(f"Your simulation component {k} for sim name {self.name} needs to specify a BASE input file")
            base_path = get_data_loc(base_file)
            if base_path is None:
                Task.fail_config(f"Cannot find base file {base_file} for sim component {k} in sim name {self.name}")

            gentype, genmodel = None, None
            with open(base_path) as f:
                for line in f.read().splitlines():
                    if line.upper().strip().startswith("GENTYPE:"):
                        gentype = line.upper().split(":")[1].strip()
                    if line.upper().strip().startswith("GENMODEL:"):
                        genmodel = line.upper().split(":")[1].strip()

            gentype = gentype or d.get("GENTYPE")
            if gentype is None:
                Task.fail_config(f"The simulation component {k} needs to specify a GENTYPE in its input file")
            gentype = int(gentype)
            genmodel = genmodel or d.get("GENMODEL")

            if not gentype:
                Task.fail_config(f"Cannot find GENTYPE for component {k} and base file {base_path}")
            if not genmodel:
                Task.fail_config(f"Cannot find GENMODEL for component {k} and base file {base_path}")

            type2 = 100 + gentype
            if "SALT2" in genmodel:
                self.base_ia.append(base_file)
                types[gentype] = "Ia"
                types[type2] = "Ia"
                types_dict["IA"].append(gentype)
                types_dict["IA"].append(type2)
            else:
                self.base_cc.append(base_file)
                types[gentype] = "II"
                types[type2] = "II"
                types_dict["NONIA"].append(gentype)
                types_dict["NONIA"].append(type2)

        sorted_types = dict(sorted(types.items()))
        self.logger.debug(f"Types found: {json.dumps(sorted_types)}")
        self.output["types_dict"] = types_dict
        self.output["types"] = sorted_types

        rankeys = [r for r in self.config.get("GLOBAL", {}).keys() if r.startswith("RANSEED_")]
        value = int(self.config["GLOBAL"][rankeys[0]].split(" ")[0]) if rankeys else 1
        self.set_num_jobs(2 * value)

        self.output["blind"] = self.options.get("BLIND", False)
        self.derived_batch_info = None

        # Make sure at least one top level input component exists
        if len(self.base_ia + self.base_cc) == 0:
            Task.fail_config("Your sim has no components specified! Please add something to simulate!")
        # Try to determine how many jobs will be put in the queue
        # First see if it's been explicitly set
        num_jobs = self.options.get("NUM_JOBS")
        if num_jobs is not None:
            self.num_jobs = num_jobs
            self.logger.debug(f"Num jobs set by NUM_JOBS option to {self.num_jobs}")
        else:
            try:
                # If BATCH_INFO is set, we'll use that
                batch_info = self.config.get("GLOBAL", {}).get("BATCH_INFO")
                default_batch_info = self.yaml["CONFIG"].get("BATCH_INFO")

                # If it's not set, check for RANSEED_REPEAT or RANSEED_CHANGE
                if batch_info is None:
                    ranseed_repeat = self.config.get("GLOBAL", {}).get("RANSEED_REPEAT")
                    ranseed_change = self.config.get("GLOBAL", {}).get("RANSEED_CHANGE")
                    default = self.yaml.get("CONFIG", {}).get("RANSEED_REPEAT")
                    ranseed = ranseed_repeat or ranseed_change or default

                    if ranseed:
                        num_jobs = int(ranseed.strip().split()[0])
                        self.logger.debug(f"Found a RANSEED with {num_jobs} jobs, deriving batch info")
                        comps = default_batch_info.strip().split()
                        comps[-1] = str(num_jobs)
                        self.derived_batch_info = " ".join(comps)
                        self.num_jobs = num_jobs
                        self.logger.debug(f"Num jobs set by RANSEED to {self.num_jobs}")
                else:
                    self.num_jobs = int(batch_info.split()[-1])
                    self.logger.debug(f"Num jobs set by BATCH_INFO to {self.num_jobs}")
            except Exception:
                self.logger.warning(f"Unable to determine how many jobs simulation {self.name} has")
                self.num_jobs = 1

        self.output["genversion"] = self.genversion
        self.output["genprefix"] = self.genprefix
        self.ranseed_change = self.config.get("GLOBAL", {}).get("RANSEED_CHANGE")
        base = os.path.expandvars(self.global_config["SNANA"]["sim_dir"])
        self.output["ranseed_change"] = self.ranseed_change is not None
        self.output["ranseed_change_val"] = self.ranseed_change
        self.get_sim_folders(base, self.genversion)
        self.output["sim_folders"] = self.sim_folders
    else:
        self.sim_folders = self.output["sim_folders"]
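# Minimal standalone sketch (the option and config strings below are
# hypothetical) of the num_jobs resolution order implemented above: an
# explicit NUM_JOBS option wins; otherwise the last field of BATCH_INFO;
# otherwise the job count in RANSEED_REPEAT / RANSEED_CHANGE; else fall
# back to a single job.
def _derive_num_jobs_sketch(options, global_section, default_ranseed=None):
    if options.get("NUM_JOBS") is not None:
        return int(options["NUM_JOBS"])
    try:
        batch_info = global_section.get("BATCH_INFO")
        if batch_info is not None:
            return int(batch_info.split()[-1])
        ranseed = (global_section.get("RANSEED_REPEAT")
                   or global_section.get("RANSEED_CHANGE")
                   or default_ranseed)
        if ranseed:
            return int(str(ranseed).strip().split()[0])
    except Exception:
        pass
    return 1

assert _derive_num_jobs_sketch({"NUM_JOBS": 5}, {}) == 5
assert _derive_num_jobs_sketch({}, {"BATCH_INFO": "sbatch batch.sh 20"}) == 20
assert _derive_num_jobs_sketch({}, {"RANSEED_REPEAT": "10 12345"}) == 10
assert _derive_num_jobs_sketch({}, {}) == 1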