def classify(self):
    new_hash = self.get_hash_from_string(self.name)
    if self._check_regenerate(new_hash):
        mkdirs(self.output_dir)
        input = self.get_fit_dependency()
        fitres_file = os.path.join(input["fitres_dirs"][self.index], input["fitopt_map"][self.fitopt])
        self.logger.debug(f"Looking for {fitres_file}")
        if not os.path.exists(fitres_file):
            self.logger.error(f"FITRES file could not be found at {fitres_file}, classifier has nothing to work with")
            self.passed = False
            return False

        df = pd.read_csv(fitres_file, delim_whitespace=True, comment="#")
        df = df[["CID", "FITPROB"]].rename(columns={"FITPROB": self.get_prob_column_name()})

        self.logger.info(f"Saving probabilities to {self.output_file}")
        df.to_csv(self.output_file, index=False, float_format="%0.4f")
        chown_dir(self.output_dir)
        with open(self.done_file, "w") as f:
            f.write("SUCCESS")
        self.save_new_hash(new_hash)
    self.passed = True
    return True
def classify(self, force_refresh, command):
    format_dict = {
        "job_name": self.job_base_name,
        "conda_env": self.conda_env,
        "path_to_classifier": self.path_to_classifier,
        "command_opts": command,
        "done_file": self.done_file,
    }
    slurm_script = self.slurm.format(**format_dict)

    old_hash = self.get_old_hash()
    new_hash = self.get_hash_from_string(slurm_script)

    if force_refresh or new_hash != old_hash:
        self.logger.debug("Regenerating")
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)

        slurm_output_file = self.output_dir + "/job.slurm"
        with open(slurm_output_file, "w") as f:
            f.write(slurm_script)

        self.save_new_hash(new_hash)
        self.logger.info(f"Submitting batch job {slurm_output_file}")
        subprocess.run(["sbatch", slurm_output_file], cwd=self.output_dir)
    else:
        self.logger.info("Hash check passed, not rerunning")
        self.should_be_done()
    return True
def write_input(self, force_refresh):
    self.bias_cor_fits = ",".join([m.output["fitres_file"] for m in self.merged_iasim])
    self.cc_prior_fits = ",".join([m.output["fitres_file"] for m in self.merged_ccsim])
    self.data = [m.output["fitres_dir"] for m in self.merged_data]

    self.set_property("simfile_biascor", self.bias_cor_fits)
    self.set_property("simfile_ccprior", self.cc_prior_fits)
    self.set_property("varname_pIa", self.probability_column_name)

    final_output = "\n".join(self.base)

    new_hash = self.get_hash_from_string(final_output)
    old_hash = self.get_old_hash()

    if force_refresh or new_hash != old_hash:
        self.logger.debug("Regenerating results")
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)

        with open(self.config_path, "w") as f:
            f.writelines(final_output)
        self.logger.info(f"Input file written to {self.config_path}")

        self.save_new_hash(new_hash)
        return True
    else:
        self.logger.debug("Hash check passed, not rerunning")
        return False
def _run(self, force_refresh):
    command_string = self.clump_command.format(genversion=self.genversion, data_path=self.data_path)
    format_dict = {
        "job_name": self.job_name,
        "log_file": self.logfile,
        "path_to_task": self.path_to_task,
        "done_file": self.done_file,
    }
    final_slurm = self.slurm.format(**format_dict)

    new_hash = self.get_hash_from_string(command_string + final_slurm)
    old_hash = self.get_old_hash()

    if force_refresh or new_hash != old_hash:
        self.logger.debug("Regenerating and launching task")
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)
        self.save_new_hash(new_hash)

        slurm_output_file = os.path.join(self.output_dir, "slurm.job")
        clump_file = os.path.join(self.output_dir, "clump.nml")
        with open(slurm_output_file, "w") as f:
            f.write(final_slurm)
        with open(clump_file, "w") as f:
            f.write(command_string)

        self.logger.info("Submitting batch job for data prep")
        subprocess.run(["sbatch", slurm_output_file], cwd=self.output_dir)
    else:
        self.should_be_done()
        self.logger.info("Hash check passed, not rerunning")
    return True
def write_nml(self, force_refresh):
    # Parse config, first SNLCINP and then FITINP
    for key, value in self.config.get("SNLCINP", {}).items():
        self.set_snlcinp(key, value)
    for key, value in self.config.get("FITINP", {}).items():
        self.set_fitinp(key, value)
    for key, value in self.options.items():
        self.set_property(key, value, assignment=": ", section_end="&SNLCINP")

    if self.sim_task.output["ranseed_change"]:
        self.set_property("VERSION", self.sim_version + "-0*", assignment=": ", section_end="&SNLCINP")
    else:
        self.set_property("VERSION", self.sim_version, assignment=": ", section_end="&SNLCINP")

    self.set_property("OUTDIR", self.lc_output_dir, assignment=": ", section_end="&SNLCINP")
    self.set_property("DONE_STAMP", "FINISHED.DONE", assignment=": ", section_end="&SNLCINP")

    if isinstance(self.sim_task, DataPrep):
        self.set_snlcinp("PRIVATE_DATA_PATH", f"'{self.sim_task.output['data_path']}'")
        self.set_snlcinp("VERSION_PHOTOMETRY", f"'{self.sim_task.output['genversion']}'")

    # We want to do our hashing check here
    string_to_hash = self.fitopts + self.base
    new_hash = self.get_hash_from_string("".join(string_to_hash))
    old_hash = self.get_old_hash()
    regenerate = force_refresh or (old_hash is None or old_hash != new_hash)

    if regenerate:
        self.logger.info("Running light curve fit. Removing output_dir")
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)

        # Write main file
        with open(self.config_path, "w") as f:
            f.writelines(map(lambda s: s + "\n", string_to_hash))
        self.logger.info(f"NML file written to {self.config_path}")

        self.save_new_hash(new_hash)
        chown_dir(self.output_dir)
    else:
        self.logger.info("Hash check passed, not rerunning")
    return regenerate, new_hash
def classify(self):
    new_hash = self.get_hash_from_string(self.name + f"{self.prob_ia}_{self.prob_cc}")
    if self._check_regenerate(new_hash):
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)
        try:
            name = self.get_prob_column_name()
            cid = "CID"
            s = self.get_simulation_dependency()
            df = None
            phot_dir = s.output["photometry_dirs"][self.index]
            headers = [os.path.join(phot_dir, a) for a in os.listdir(phot_dir) if "HEAD" in a]
            if not headers:
                Task.fail_config(f"No HEAD fits files found in {phot_dir}!")
            else:
                types = self.get_simulation_dependency().output["types_dict"]
                self.logger.debug(f"Input types are {types}")

                for h in headers:
                    with fits.open(h) as hdul:
                        data = hdul[1].data
                        snid = np.array(data.field("SNID"))
                        sntype = np.array(data.field("SNTYPE")).astype(np.int64)
                        is_ia = np.isin(sntype, types["IA"])
                        prob = (is_ia * self.prob_ia) + (~is_ia * self.prob_cc)

                        dataframe = pd.DataFrame({cid: snid, name: prob})
                        dataframe[cid] = dataframe[cid].apply(str)
                        dataframe[cid] = dataframe[cid].str.strip()
                        if df is None:
                            df = dataframe
                        else:
                            df = pd.concat([df, dataframe])
                df.drop_duplicates(subset=cid, inplace=True)

            self.logger.info(f"Saving probabilities to {self.output_file}")
            df.to_csv(self.output_file, index=False, float_format="%0.4f")
            chown_dir(self.output_dir)
            with open(self.done_file, "w") as f:
                f.write("SUCCESS")
            self.save_new_hash(new_hash)
        except Exception as e:
            self.logger.exception(e, exc_info=True)
            self.passed = False
            with open(self.done_file, "w") as f:
                f.write("FAILED")
            return False
    else:
        self.should_be_done()
    self.passed = True
    return True
def get_ini_file(self):
    mkdirs(self.chain_dir)
    directory = self.create_cov_dep.output["ini_dir"]
    self.logger.debug(f"Directory: {directory}")

    input_files = []
    for file in self.ini_files:
        path = os.path.join(directory, file)
        self.logger.debug(f"Path: {path}")
        if not os.path.exists(path):
            self.logger.error(f"Cannot find the file {path}, make sure you specified a correct INI string matching an existing template")
            return None

        self.logger.debug(f"Reading in {path} to format")
        with open(path) as f:
            input_files.append(
                f.read().format(**{
                    "path_to_cosmomc": self.path_to_cosmomc,
                    "ini_dir": self.create_cov_dep.output["ini_dir"],
                    "root_dir": self.chain_dir,
                })
            )
    self.logger.debug(f"Input Files: {input_files}")
    return input_files
def classify(self, force_refresh):
    new_hash = self.check_regenerate(force_refresh)
    if new_hash:
        mkdirs(self.output_dir)
        input = self.get_fit_dependency()
        fitres_file = input["fitres_file"]
        self.logger.debug(f"Looking for {fitres_file}")
        if not os.path.exists(fitres_file):
            self.logger.error(f"FITRES file could not be found at {fitres_file}, classifier has nothing to work with")
            self.passed = False
            return False

        df = pd.read_csv(fitres_file, sep=r"\s+", comment="#", compression="infer")
        df = df[["CID", "FITPROB"]].rename(columns={"FITPROB": self.get_prob_column_name()})

        self.logger.info(f"Saving probabilities to {self.output_file}")
        df.to_csv(self.output_file, index=False, float_format="%0.4f")
        chown_dir(self.output_dir)
        with open(self.done_file, "w") as f:
            f.write("SUCCESS")
        self.save_new_hash(new_hash)
    self.passed = True
    return True
def classify(self, command):
    self.setup()
    if self.batch_file is None:
        if self.gpu:
            self.sbatch_header = self.sbatch_gpu_header
        else:
            self.sbatch_header = self.sbatch_cpu_header
    else:
        with open(self.batch_file, "r") as f:
            self.sbatch_header = f.read()
        self.sbatch_header = self.clean_header(self.sbatch_header)

    header_dict = {
        "REPLACE_NAME": self.job_base_name,
        "REPLACE_LOGFILE": "output.log",
        "REPLACE_WALLTIME": "00:55:00",
        "REPLACE_MEM": "8GB",
        "APPEND": ["#SBATCH --ntasks=1", "#SBATCH --cpus-per-task=4"],
    }
    header_dict = merge_dict(header_dict, self.batch_replace)
    self.update_header(header_dict)

    setup_dict = {
        "job_name": self.job_base_name,
        "conda_env": self.conda_env,
        "path_to_classifier": self.path_to_classifier,
        "command_opts": command,
    }
    format_dict = {
        "done_file": self.done_file,
        "sbatch_header": self.sbatch_header,
        "task_setup": self.update_setup(setup_dict, self.task_setup["nearest_neighbour"]),
    }
    slurm_script = self.slurm.format(**format_dict)

    new_hash = self.get_hash_from_string(slurm_script)
    if self._check_regenerate(new_hash):
        self.logger.debug("Regenerating")
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)

        slurm_output_file = self.output_dir + "/job.slurm"
        with open(slurm_output_file, "w") as f:
            f.write(slurm_script)

        self.save_new_hash(new_hash)
        self.logger.info(f"Submitting batch job {slurm_output_file}")
        subprocess.run(["sbatch", slurm_output_file], cwd=self.output_dir)
    else:
        self.logger.info("Hash check passed, not rerunning")
        self.should_be_done()
    return True
def write_nml(self, force_refresh):
    self.logger.debug(f"Loading fitopts file from {self.fitopts_file}")
    with open(self.fitopts_file, "r") as f:
        self.fitopts = list(f.read().splitlines())
    self.logger.info(f"Loaded {len(self.fitopts)} fitopts from {self.fitopts_file}")

    # Parse config, first SNLCINP and then FITINP
    for key, value in self.config.get("SNLCINP", {}).items():
        self.set_snlcinp(key, value)
    for key, value in self.config.get("FITINP", {}).items():
        self.set_fitinp(key, value)

    self.set_property("VERSION", self.sim_version + "*", assignment=": ", section_end="&SNLCINP")  # TODO FIX THIS, DOUBLE VERSION KEY
    self.set_property("OUTDIR", self.lc_output_dir, assignment=": ", section_end="&SNLCINP")

    if isinstance(self.sim_task, DataPrep):
        self.set_snlcinp("PRIVATE_DATA_PATH", f"'{self.sim_task.output['data_path']}'")
        self.set_snlcinp("VERSION_PHOTOMETRY", f"'{self.sim_task.output['genversion']}'")

    # We want to do our hashing check here
    string_to_hash = self.fitopts + self.base
    # with open(os.path.abspath(inspect.stack()[0][1]), "r") as f:
    #     string_to_hash += f.read()
    new_hash = self.get_hash_from_string("".join(string_to_hash))
    old_hash = self.get_old_hash()
    regenerate = force_refresh or (old_hash is None or old_hash != new_hash)

    if regenerate:
        self.logger.info("Running light curve fit. Removing output_dir")
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)

        # Write main file
        with open(self.config_path, "w") as f:
            f.writelines(map(lambda s: s + "\n", string_to_hash))
        self.logger.info(f"NML file written to {self.config_path}")

        self.save_new_hash(new_hash)
        chown_dir(self.output_dir)
    else:
        self.logger.info("Hash check passed, not rerunning")
    return regenerate, new_hash
def predict(self, force_refresh):
    train_info = self.get_fit_dependency()
    model = self.options.get("MODEL")
    assert model is not None, "If TRAIN is not specified, you have to point to a model to use"
    for t in self.dependencies:
        if model == t.name:
            self.logger.debug(f"Found task dependency {t.name} with model file {t.output['model_filename']}")
            model = t.output["model_filename"]

    model_path = get_output_loc(model)
    self.logger.debug(f"Looking for model in {model_path}")
    if not os.path.exists(model_path):
        self.logger.error(f"Cannot find {model_path}")
        return False

    old_hash = self.get_old_hash()
    new_hash = self.get_hash_from_string(self.name + model_path)

    if force_refresh or new_hash != old_hash:
        self.logger.debug("Regenerating")
        if os.path.exists(self.output_dir):
            shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)
        self.save_new_hash(new_hash)

        job_name = "nearnbr_apply.exe"
        inArgs = f'-inFile_data {train_info["fitres_file"]} -inFile_MLpar {model_path}'
        outArgs = f"-outFile {self.outfile_predict} -varName_prob {self.get_prob_column_name()}"
        cmd_job = f"{job_name} {inArgs} {outArgs}"
        self.logger.debug(f"Executing command {cmd_job}")
        with open(self.logging_file, "w") as f:
            val = subprocess.run(cmd_job.split(" "), stdout=f, stderr=subprocess.STDOUT, cwd=self.output_dir)
        with open(self.done_file, "w") as f:
            if val.returncode == 0:
                f.write("SUCCESS")
            else:
                f.write("FAILURE")
    else:
        self.logger.debug("Not regenerating")
    return True
def write_nml(self):
    # Parse config, first SNLCINP and then FITINP
    for key, value in self.config.get("SNLCINP", {}).items():
        self.set_snlcinp(key, value)
    for key, value in self.config.get("FITINP", {}).items():
        self.set_fitinp(key, value)
    for key, value in self.options.items():
        self.yaml["CONFIG"][key] = value

    self.compute_fitopts()

    if self.sim_task.output["ranseed_change"]:
        self.yaml["CONFIG"]["VERSION"] = [self.sim_version + "-0*"]
    else:
        self.yaml["CONFIG"]["VERSION"] = [self.sim_version]

    self.yaml["CONFIG"]["OUTDIR"] = self.lc_output_dir
    # self.yaml["CONFIG"]["DONE_STAMP"] = "ALL.DONE"

    if isinstance(self.sim_task, DataPrep):
        data_path = self.sim_task.output["data_path"]
        if "SNDATA_ROOT/lcmerge" not in data_path:
            self.set_snlcinp("PRIVATE_DATA_PATH", f"'{self.sim_task.output['data_path']}'")
        self.set_snlcinp("VERSION_PHOTOMETRY", f"'{self.sim_task.output['genversion']}'")

    # We want to do our hashing check here
    string_to_hash = self.get_output_string()
    new_hash = self.get_hash_from_string(string_to_hash)
    regenerate = self._check_regenerate(new_hash)

    if regenerate:
        self.logger.info("Running light curve fit. Removing output_dir")
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)

        # Write the primary input file
        self.write_output_file(self.config_path)
        self.logger.info(f"NML file written to {self.config_path}")

        self.save_new_hash(new_hash)
        chown_dir(self.output_dir)
    else:
        self.logger.info("Hash check passed, not rerunning")
    return regenerate, new_hash
def _run(self):
    self.yaml["CONFIG"]["WFITOPT"] = self.wfitopts
    self.yaml["CONFIG"]["INPDIR"] = self.create_cov_dirs
    self.yaml["CONFIG"]["OUTDIR"] = os.path.join(self.output_dir, "output")

    # Pass all OPTS keys through to the yaml dictionary
    for k, v in self.options.items():
        # Clobber WFITOPTS to WFITOPT
        if k == "WFITOPTS":
            k = "WFITOPT"
        self.yaml["CONFIG"][k] = v

    final_output_for_hash = self.get_output_string()
    new_hash = self.get_hash_from_string(final_output_for_hash)

    if self._check_regenerate(new_hash):
        self.logger.debug("Regenerating and launching task")
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)
        self.save_new_hash(new_hash)

        with open(self.input_file, "w") as f:
            f.write(self.get_output_string())

        cmd = ["submit_batch_jobs.sh", os.path.basename(self.input_file)]
        self.logger.debug(f"Submitting wfit job: {' '.join(cmd)} in cwd: {self.output_dir}")
        self.logger.debug(f"Logging to {self.logfile}")
        with open(self.logfile, "w") as f:
            subprocess.run(" ".join(cmd), stdout=f, stderr=subprocess.STDOUT, cwd=self.output_dir, shell=True)
        chown_dir(self.output_dir)
    else:
        self.should_be_done()
        self.logger.info("Hash check passed, not rerunning")
    return True
def classify(self):
    mkdirs(self.output_dir)
    fitres = f"{self.fit_dir}/FITOPT000.FITRES.gz"
    self.logger.debug(f"Looking for {fitres}")
    if not os.path.exists(fitres):
        self.logger.error(f"FITRES file could not be found at {fitres}, classifier has nothing to work with")
        return False

    data = pd.read_csv(fitres, sep=r"\s+", comment="#", compression="infer")
    ids = data["CID"].values
    probability = np.random.uniform(size=ids.size)
    combined = np.vstack((ids, probability)).T

    output_file = self.output_dir + "/prob.txt"
    self.logger.info(f"Saving probabilities to {output_file}")
    np.savetxt(output_file, combined)
    chown_dir(self.output_dir)
    return True  # change to hash
def _run(self, force_refresh):
    command = self.cmd_prefix + [self.lc_fit["fitres_file"], self.agg["merge_key_filename"]] + self.cmd_suffix

    old_hash = self.get_old_hash()
    new_hash = self.get_hash_from_string(" ".join(command))

    if force_refresh or new_hash != old_hash:
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)
        self.logger.debug("Regenerating, running combine_fitres")
        self.save_new_hash(new_hash)
        with open(self.logfile, "w") as f:
            subprocess.run(command, stdout=f, stderr=subprocess.STDOUT, cwd=self.output_dir)
    else:
        self.logger.debug("Not regenerating")
    return True
def run(args):
    # Load YAML config file
    yaml_path = os.path.abspath(os.path.expandvars(args.yaml))
    assert os.path.exists(yaml_path), f"File {yaml_path} cannot be found."
    with open(yaml_path, "r") as f:
        config = yaml.safe_load(f)

    overwrites = config.get("GLOBAL")
    if config.get("GLOBALS") is not None:
        logging.warning("Your config file has a GLOBALS section in it. If you're trying to overwrite cfg.yml, rename this to GLOBAL")
    global_config = get_config(initial_path=args.config, overwrites=overwrites)

    config_filename = os.path.basename(args.yaml).split(".")[0].upper()
    output_dir = get_output_dir()
    logging_folder = os.path.abspath(os.path.join(output_dir, config_filename))
    if not args.check:
        mkdirs(logging_folder)

    message_store, logging_filename = setup_logging(config_filename, logging_folder, args)

    for i, d in enumerate(global_config["DATA_DIRS"]):
        logging.debug(f"Data directory {i + 1} set as {d}")
        assert d is not None, "Data directory is None, which means it failed to resolve. Check the error message above for why."

    manager = Manager(config_filename, yaml_path, config, message_store)
    if args.start is not None:
        args.refresh = True
    manager.set_start(args.start)
    manager.set_finish(args.finish)
    manager.set_force_refresh(args.refresh)
    manager.execute(args.check)
    chown_file(logging_filename)
    return manager
def _run(self, force_refresh):
    sys_scale = self.calculate_input()
    format_dict = {
        "job_name": self.job_name,
        "log_file": self.logfile,
        "done_file": self.done_file,
        "path_to_code": self.path_to_code,
        "input_file": self.input_file,
    }
    final_slurm = self.slurm.format(**format_dict)

    new_hash = self.get_hash_from_string("\n".join(self.base + sys_scale) + final_slurm)
    old_hash = self.get_old_hash()

    if force_refresh or new_hash != old_hash:
        self.logger.debug("Regenerating and launching task")
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)
        mkdirs(self.config_dir)
        self.save_new_hash(new_hash)

        # Write sys scales and the main input file
        with open(self.sys_file_out, "w") as f:
            f.write("\n".join(sys_scale))
        with open(self.input_file, "w") as f:
            f.write("\n".join(self.base))

        # Write out slurm job script
        slurm_output_file = os.path.join(self.output_dir, "slurm.job")
        with open(slurm_output_file, "w") as f:
            f.write(final_slurm)

        self.logger.info("Submitting batch job for data prep")
        subprocess.run(["sbatch", slurm_output_file], cwd=self.output_dir)
    else:
        self.should_be_done()
        self.logger.info("Hash check passed, not rerunning")
    return True
def _run(self):
    self.output["fitopt_map"] = self.lc_fit["fitopt_map"]
    self.output["fitopt_index"] = self.lc_fit["fitopt_index"]
    self.output["fitres_file"] = self.lc_fit["fitres_file"]
    self.output["SURVEY"] = self.lc_fit["SURVEY"]
    self.output["SURVEY_ID"] = self.lc_fit["SURVEY_ID"]

    fitres_files, symlink_files = [], []
    for index, (fitres_dir, outdir) in enumerate(zip(self.lc_fit["fitres_dirs"], self.fitres_outdirs)):
        files = os.listdir(fitres_dir)
        fitres_files += [
            (fitres_dir, outdir, f, index, self.lc_fit["name"])
            for f in files
            if "FITRES" in f and not os.path.islink(os.path.join(fitres_dir, f))
        ]
        symlink_files += [
            (fitres_dir, outdir, f, index, self.lc_fit["name"])
            for f in files
            if "FITRES" in f and os.path.islink(os.path.join(fitres_dir, f))
        ]

    new_hash = self.get_hash_from_string(
        " ".join([a + b + c + f"{d}" + e for a, b, c, d, e in (fitres_files + symlink_files)])
    )

    if self._check_regenerate(new_hash):
        shutil.rmtree(self.output_dir, ignore_errors=True)
        self.logger.debug("Regenerating, running combine_fitres")
        try:
            for fitres_dir in self.fitres_outdirs:
                self.logger.debug(f"Creating directory {fitres_dir}")
                mkdirs(fitres_dir)
                for f in fitres_files:
                    if f[1] == fitres_dir:
                        self.add_to_fitres(os.path.join(f[0], f[2]), f[1], f[4], index=f[3])
                for s in symlink_files:
                    if s[1] == fitres_dir:
                        self.logger.debug(f"Creating symlink for {os.path.join(s[1], s[2])} to {os.path.join(s[1], 'FITOPT000.FITRES.gz')}")
                        os.symlink(os.path.join(s[1], "FITOPT000.FITRES.gz"), os.path.join(s[1], s[2]))

            self.logger.debug("Copying MERGE.LOG")
            filenames = ["MERGE.LOG", "SUBMIT.INFO"]
            for f in filenames:
                original = os.path.join(self.lc_fit["lc_output_dir"], f)
                moved = os.path.join(self.suboutput_dir, f)
                if not os.path.exists(moved):
                    self.logger.debug(f"Copying file {f} into output directory")
                    shutil.copy(original, moved)

            self.save_new_hash(new_hash)
            with open(self.done_file, "w") as f:
                f.write("SUCCESS\n")
        except Exception as e:
            self.logger.error("Error running merger!")
            self.logger.error(f"Check log at {self.logfile}")
            self.logger.exception(e, exc_info=True)
            return False
    else:
        self.should_be_done()
        self.logger.info("Hash check passed, not rerunning")
    return True
def _run(self, force_refresh):
    if self.static:
        self.logger.info("CMB only constraints detected, copying static files")

        cosmomc_static_loc = get_data_loc(self.static_path + self.ini_prefix)
        if cosmomc_static_loc is None:
            self.logger.error("Seems like we can't find the static chains...")
            return False
        else:
            new_hash = self.get_hash_from_string(cosmomc_static_loc)
            old_hash = self.get_old_hash()

            if force_refresh or new_hash != old_hash:
                self.logger.debug("Regenerating and copying static chains")
                shutil.rmtree(self.chain_dir, ignore_errors=True)
                shutil.copytree(cosmomc_static_loc, self.chain_dir)
                for done_file in self.done_files:
                    df = os.path.join(self.output_dir, done_file)
                    with open(df, "w") as f:
                        f.write("SUCCESS")
                self.save_new_hash(new_hash)
            else:
                self.should_be_done()
                self.logger.info("Hash check passed, not rerunning")
    else:
        ini_filecontents = self.get_ini_file()
        if ini_filecontents is None:
            return False

        format_dict = {
            "job_name": self.job_name,
            "log_file": self.logfile,
            "done_files": " ".join(self.done_files),
            "path_to_cosmomc": self.path_to_cosmomc,
            "output_dir": self.output_dir,
            "ini_files": " ".join(self.ini_files),
            "num_jobs": len(self.ini_files),
            "num_walkers": self.num_walkers,
        }
        final_slurm = self.slurm.format(**format_dict)

        new_hash = self.get_hash_from_string(final_slurm + " ".join(ini_filecontents))
        old_hash = self.get_old_hash()

        if force_refresh or new_hash != old_hash:
            self.logger.debug("Regenerating and launching task")
            shutil.rmtree(self.output_dir, ignore_errors=True)
            mkdirs(self.output_dir)
            self.save_new_hash(new_hash)

            slurm_output_file = os.path.join(self.output_dir, "slurm.job")
            with open(slurm_output_file, "w") as f:
                f.write(final_slurm)
            for file, content in zip(self.ini_files, ini_filecontents):
                filepath = os.path.join(self.output_dir, file)
                with open(filepath, "w") as f:
                    f.write(content)

            mkdirs(self.chain_dir)

            needed_dirs = ["data", "paramnames", "camb", "batch1", "batch2", "batch3"]
            for d in needed_dirs:
                self.logger.debug(f"Creating symlink to {d} dir")
                original_data_dir = os.path.join(self.path_to_cosmomc, d)
                new_data_dir = os.path.join(self.output_dir, d)
                os.symlink(original_data_dir, new_data_dir, target_is_directory=True)

            self.logger.info("Submitting batch job for CosmoMC")
            subprocess.run(["sbatch", slurm_output_file], cwd=self.output_dir)
        else:
            self.should_be_done()
            self.logger.info("Hash check passed, not rerunning")
    return True
def execute(self, check_config):
    self.logger.info(f"Executing pipeline for prefix {self.prefix}")
    self.logger.info(f"Output will be located in {self.output_dir}")
    if check_config:
        self.logger.info("Only verifying config, not launching anything")

    mkdirs(self.output_dir)
    c = self.run_config

    self.tasks = self.get_tasks(c)

    if check_config:
        self.logger.notice("Config verified, exiting")
        return

    self.num_jobs_queue = 0
    self.num_jobs_queue_gpu = 0
    running_tasks = []
    done_tasks = []
    failed_tasks = []
    blocked_tasks = []
    squeue = None

    start_sleep_time = self.global_config["OUTPUT"]["ping_frequency"]
    max_sleep_time = self.global_config["OUTPUT"]["max_ping_frequency"]
    current_sleep_time = start_sleep_time

    config_file_output = os.path.join(self.output_dir, os.path.basename(self.filename_path))
    if not check_config and self.filename_path != config_file_output:
        self.logger.info(f"Saving parsed config file from {self.filename_path} to {config_file_output}")
        shutil.copy(self.filename_path, config_file_output)
        chown_file(config_file_output)

    # Welcome to the primary loop
    while self.tasks or running_tasks:
        small_wait = False

        # Check status of current jobs
        for t in running_tasks:
            try:
                completed = self.check_task_completion(t, blocked_tasks, done_tasks, failed_tasks, running_tasks, squeue)
                small_wait = small_wait or completed
            except Exception as e:
                self.logger.exception(e, exc_info=True)
                self.fail_task(t, running_tasks, failed_tasks, blocked_tasks)

        # Submit new jobs if needed
        while self.num_jobs_queue < self.max_jobs:
            t = self.get_task_to_run(self.tasks, done_tasks)
            if t is not None:
                self.logger.info("")
                self.tasks.remove(t)
                self.logger.notice(f"LAUNCHING: {t}")
                try:
                    started = t.run(self.get_force_refresh(t))
                except Exception as e:
                    self.logger.exception(e, exc_info=True)
                    started = False
                if started:
                    if t.gpu:
                        self.num_jobs_queue_gpu += t.num_jobs
                    else:
                        self.num_jobs_queue += t.num_jobs
                    self.logger.notice(f"LAUNCHED: {t} with total {self.num_jobs_queue} jobs")
                    running_tasks.append(t)
                    completed = self.check_task_completion(t, blocked_tasks, done_tasks, failed_tasks, running_tasks, squeue)
                    small_wait = small_wait or completed
                else:
                    self.logger.error(f"FAILED TO LAUNCH: {t}")
                    self.fail_task(t, running_tasks, failed_tasks, blocked_tasks)
                small_wait = True
            else:
                break

        # Check quickly if we've added a new job, etc, in case of immediate failure
        if small_wait:
            self.log_status(self.tasks, running_tasks, done_tasks, failed_tasks, blocked_tasks)
            current_sleep_time = start_sleep_time
            time.sleep(0.1)
            squeue = None
        else:
            time.sleep(current_sleep_time)
            current_sleep_time *= 2
            if current_sleep_time > max_sleep_time:
                current_sleep_time = max_sleep_time
            squeue = [
                i.strip()
                for i in subprocess.check_output("squeue -h -u $USER -o '%.200j'", shell=True, text=True).splitlines()
            ]
            n = len(squeue)
            if n == 0 or n > self.max_jobs:
                self.logger.debug(f"Squeue is reporting {n} jobs in the queue... this is either 0 or toeing the line of too many")

    self.log_finals(done_tasks, failed_tasks, blocked_tasks)
def write_input(self):
    # Load previous hash here if it exists
    old_hash = None
    hash_file = f"{self.output_dir}/hash.txt"
    if os.path.exists(hash_file):
        with open(hash_file, "r") as f:
            old_hash = f.read().strip()
            self.logger.debug(f"Previous result found, hash is {old_hash}")

    # Put config in a temp directory
    temp_dir_obj = tempfile.TemporaryDirectory()
    temp_dir = temp_dir_obj.name

    # Copy the base files across
    for f in self.base_ia:
        shutil.copy(self.data_dir + f, temp_dir)
    for f in self.base_cc:
        shutil.copy(self.data_dir + f, temp_dir)

    # Copy the include input file if there is one
    input_copied = []
    fs = self.base_ia + self.base_cc
    for ff in fs:
        if ff not in input_copied:
            input_copied.append(ff)
            with open(self.data_dir + ff, "r") as f:
                for line in f.readlines():
                    line = line.strip()
                    if line.startswith("INPUT_FILE_INCLUDE"):
                        include_file = line.split(":")[-1].strip()
                        self.logger.debug(f"Copying included file {include_file}")
                        shutil.copy(self.data_dir + include_file, temp_dir)

    # Write the primary input file
    main_input_file = f"{temp_dir}/{self.genversion}.input"
    with open(main_input_file, "w") as f:
        f.writelines(map(lambda s: s + "\n", self.base))
    self.logger.info(f"Input file written to {main_input_file}")

    # Remove any duplicates and order the output files
    output_files = [f"{temp_dir}/{a}" for a in sorted(os.listdir(temp_dir))]
    self.logger.debug(f"{len(output_files)} files used to create simulation. Hashing them.")

    # Also add this file to the hash, so if the code changes we also regenerate. Smart.
    output_files.append(os.path.abspath(inspect.stack()[0][1]))

    # Get current hash
    string_to_hash = ""
    for file in output_files:
        with open(file, "r") as f:
            string_to_hash += f.read()
    new_hash = get_hash(string_to_hash)
    self.logger.debug(f"Current hash set to {new_hash}")

    regenerate = old_hash is None or old_hash != new_hash

    if regenerate:
        self.logger.info("Running simulation, hash check failed")
        # Clean output dir. God I feel dangerous doing this, so hopefully unnecessary check
        if "//" not in self.output_dir and "Pippin" in self.output_dir:
            self.logger.debug(f"Cleaning output directory {self.output_dir}")
            shutil.rmtree(self.output_dir, ignore_errors=True)
            mkdirs(self.output_dir)
            self.logger.debug(f"Copying from {temp_dir} to {self.output_dir}")
            copytree(temp_dir, self.output_dir)
            with open(hash_file, "w") as f:
                f.write(str(new_hash))
            self.logger.debug(f"New hash saved to {hash_file}")
            self.hash_file = hash_file
            chown_dir(self.output_dir)
    else:
        self.logger.info("Hash check passed, not rerunning")

    temp_dir_obj.cleanup()
    return regenerate, new_hash
def _run(self, force_refresh):
    new_hash = self.check_regenerate(force_refresh)
    if new_hash:
        mkdirs(self.output_dir)
        prediction_files = [d.output["predictions_filename"] for d in self.classifiers]
        df = None

        for f in prediction_files:
            dataframe = self.load_prediction_file(f)
            dataframe = dataframe.rename(columns={dataframe.columns[0]: self.id})
            if df is None:
                df = dataframe
                self.logger.debug(f"Merging on column {self.id} for file {f}")
            else:
                self.logger.debug(f"Merging on column {self.id} for file {f}")
                df = pd.merge(df, dataframe, on=self.id, how="outer")  # Inner join atm, should I make this outer?

        if self.include_type:
            self.logger.info("Finding original types")
            s = self.get_underlying_sim_task()
            type_df = None
            phot_dir = s.output["photometry_dir"]
            headers = [os.path.join(phot_dir, a) for a in os.listdir(phot_dir) if "HEAD" in a]
            if not headers:
                self.logger.error(f"No HEAD fits files found in {phot_dir}!")
            else:
                for h in headers:
                    with fits.open(h) as hdul:
                        data = hdul[1].data
                        snid = np.array(data.field("SNID")).astype(np.int64)
                        sntype = np.array(data.field("SNTYPE")).astype(np.int64)
                        dataframe = pd.DataFrame({self.id: snid, self.type_name: sntype})
                        if type_df is None:
                            type_df = dataframe
                        else:
                            type_df = pd.concat([type_df, dataframe])
            df = pd.merge(df, type_df, on=self.id)

        if self.plot:
            self._plot(df)

        self.logger.info(f"Merged into dataframe of {df.shape[0]} rows, with columns {list(df.columns)}")
        df.to_csv(self.output_df, index=False, float_format="%0.4f")
        self.save_key_format(df)
        self.logger.debug(f"Saving merged dataframe to {self.output_df}")
        self.save_new_hash(new_hash)

    self.output["merge_predictions_filename"] = self.output_df
    self.output["merge_key_filename"] = self.output_df_key
    self.output["sn_column_name"] = self.id
    if self.include_type:
        self.output["sn_type_name"] = self.type_name

    self.passed = True
    return True
def run(args):
    if args is None:
        return None

    init()

    # Load YAML config file
    yaml_path = os.path.abspath(os.path.expandvars(args.yaml))
    assert os.path.exists(yaml_path), f"File {yaml_path} cannot be found."
    config_raw, config = load_yaml(yaml_path)
    # with open(yaml_path, "r") as f:
    #     config = yaml.safe_load(f)

    overwrites = config.get("GLOBAL")
    if config.get("GLOBALS") is not None:
        logging.warning("Your config file has a GLOBALS section in it. If you're trying to overwrite cfg.yml, rename this to GLOBAL")

    cfg = None
    if config.get("GLOBAL"):
        cfg = config.get("GLOBAL").get("CFG_PATH")
    if cfg is None:
        cfg = args.config

    global_config = get_config(initial_path=cfg, overwrites=overwrites)

    config_filename = os.path.basename(args.yaml).split(".")[0].upper()
    output_dir = get_output_dir()
    logging_folder = os.path.abspath(os.path.join(output_dir, config_filename))

    if not args.check:
        mkdirs(logging_folder)
    if os.path.exists(logging_folder):
        chown_dir(logging_folder, walk=args.permission)
    if args.permission:
        return

    message_store, logging_filename = setup_logging(config_filename, logging_folder, args)

    for i, d in enumerate(global_config["DATA_DIRS"]):
        logging.debug(f"Data directory {i + 1} set as {d}")
        assert d is not None, "Data directory is None, which means it failed to resolve. Check the error message above for why."

    logging.info(f"Running on: {os.environ.get('HOSTNAME', '$HOSTNAME not set')} login node.")

    manager = Manager(config_filename, yaml_path, config_raw, config, message_store)

    # Gracefully handle Ctrl-C
    def handler(signum, frame):
        logging.error("Ctrl-c was pressed.")
        logging.warning("All remaining tasks will be killed and their hash reset")
        manager.kill_remaining_tasks()
        exit(1)

    signal.signal(signal.SIGINT, handler)

    if args.start is not None:
        args.refresh = True
    manager.set_start(args.start)
    manager.set_finish(args.finish)
    manager.set_force_refresh(args.refresh)
    manager.set_force_ignore_stage(args.ignore)
    manager.execute(args.check, args.compress, args.uncompress)
    chown_file(logging_filename)
    return manager
def prepare_train_job(self, force_refresh):
    self.logger.debug("Preparing NML file for Nearest Neighbour training")

    fit_output = self.get_fit_dependency()
    genversion = fit_output["genversion"]
    fitres_dir = fit_output["fitres_dir"]
    fitres_file = fit_output["fitres_file"]
    nml_file_orig = fit_output["nml_file"]

    # Put config in a temp directory
    temp_dir_obj = tempfile.TemporaryDirectory()
    temp_dir = temp_dir_obj.name

    outfile_train = f"{self.name}_train.out"
    nml_file_train1 = f"{temp_dir}/{genversion}-2.nml"
    nml_file_train2 = f"{self.output_dir}/{genversion}-2.nml"

    train_info_local = {
        "outfile_NNtrain": outfile_train,
        "nml_file_NNtrain": nml_file_train2,
    }

    # Construct sed command to copy original NMLFILE and to
    #   + replace OUTDIR:
    #   + include ROOTFILE_OUT (to store histograms for NN train)
    #   + include DONE stamp for Sam/pippin
    #   + run afterburner to process ROOT file and get NN_trainPar;
    #     copy NN_trainPar up to where pippin can find it
    #
    # TODO: Check with Rick if the FITOPT000.ROOT is needed / should be hardcoded
    afterBurn = f"nearnbr_maxFoM.exe FITOPT000.ROOT -truetype 1 -outfile {outfile_train} ; cp {outfile_train} {self.outfile_train}"

    sedstr = "sed"
    sedstr += r" -e '/OUTDIR:/a\OUTDIR: %s' " % self.splitfit_output_dir
    sedstr += r" -e '/OUTDIR:/d'"
    sedstr += r" -e '/DONE_STAMP:/d'"
    sedstr += r" -e '/SNTABLE_LIST/a\ ROOTFILE_OUT = \"bla.root\"'"
    sedstr += r" -e '/_OUT/d '"
    sedstr += r" -e '/VERSION:/a\VERSION_AFTERBURNER: %s'" % afterBurn
    sedstr += r" -e '/VERSION:/a\DONE_STAMP: %s'" % self.done_file
    sed_command = "%s %s > %s" % (sedstr, nml_file_orig, nml_file_train1)

    # Use system call to apply sed command
    # self.logger.debug(f"Running sed command {sed_command}")
    subprocess.run(sed_command, stderr=subprocess.STDOUT, cwd=temp_dir, shell=True)

    # Make sure that the new NML file is really there
    if not os.path.isfile(nml_file_train1):
        self.logger.error(f"Unable to create {nml_file_train1} with sed command {sed_command}")
        return None

    # Check that expected FITRES ref file is really there
    if not os.path.exists(fitres_file):
        self.logger.error(f"Cannot find expected FITRES file at {fitres_file}")
        return None

    # Open NML file in append mode and tack on NNINP namelist
    with open(nml_file_train1, "a") as f:
        f.write("\n# NNINP below added by prepare_NNtrainJob\n")
        f.write("\n&NNINP \n")
        f.write(" NEARNBR_TRAINFILE_PATH = '%s' \n" % fitres_dir)
        f.write(" NEARNBR_TRAINFILE_LIST = '%s' \n" % os.path.basename(fitres_file))
        f.write(" NEARNBR_SEPMAX_VARDEF = '%s' \n" % self.nn_options)
        f.write(" NEARNBR_TRUETYPE_VARNAME = 'SIM_TYPE_INDEX' \n")
        f.write(" NEARNBR_TRAIN_ODDEVEN = T \n")
        f.write("\n&END\n")

    input_files = [nml_file_train1]
    old_hash = self.get_old_hash()
    new_hash = self.get_hash_from_files(input_files)

    if force_refresh or new_hash != old_hash:
        self.logger.debug("Regenerating")
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)
        self.logger.debug(f"Copying from {temp_dir} to {self.output_dir}")
        copytree(temp_dir, self.output_dir)
        self.save_new_hash(new_hash)
        return new_hash, train_info_local
    else:
        self.logger.debug("Not regenerating")
        return None, train_info_local
"--finish", help="Stage to finish at (it runs this stage too)", default=None) parser.add_argument("-r", "--refresh", help="Refresh all tasks, do not use hash", action="store_true") args = parser.parse_args() level = logging.DEBUG if args.verbose else logging.INFO # Get base filename config_filename = os.path.basename(args.config).split(".")[0].upper() logging_folder = os.path.abspath( f"{get_config()['OUTPUT']['output_dir']}/{config_filename}") mkdirs(logging_folder) logging_filename = f"{logging_folder}/{config_filename}.log" message_store = MessageStore() NOTICE_LEVELV_NUM = 25 logging.addLevelName(NOTICE_LEVELV_NUM, "NOTICE") def notice(self, message, *args, **kws): if self.isEnabledFor(NOTICE_LEVELV_NUM): self._log(NOTICE_LEVELV_NUM, message, args, **kws) logging.Logger.notice = notice fmt = "[%(levelname)8s |%(filename)21s:%(lineno)3d] %(message)s" if args.verbose else "%(message)s" logging.basicConfig(level=level, format=fmt, handlers=[
def _run(self):
    # Get the m0diff files for everything
    for b in self.biascor_deps:
        for m in b.output["m0dif_dirs"]:
            self.logger.info(f"Looking at M0diff dir {m}")
            sim_number = 1
            if os.path.basename(m).isdigit():
                sim_number = int(os.path.basename(m))
            files = [f for f in sorted(os.listdir(m)) if f.endswith(".M0DIF") or f.endswith(".M0DIF.gz")]
            for f in files:
                muopt_num = int(f.split("MUOPT")[-1].split(".")[0])
                fitopt_num = int(f.split("FITOPT")[-1].split("_")[0])
                if muopt_num == 0:
                    muopt = "DEFAULT"
                else:
                    muopt = b.output["muopts"][muopt_num - 1]  # Because 0 is default
                if fitopt_num == 0:
                    fitopt = "DEFAULT"
                else:
                    fitopt = b.output["fitopt_index"][fitopt_num]
                self.biascor_m0diffs.append((b.name, sim_number, muopt, muopt_num, fitopt, fitopt_num, os.path.join(m, f)))

    data_fitres_files = [
        os.path.join(l.output["fitres_dirs"][0], l.output["fitopt_map"]["DEFAULT"])
        for l in self.lcfit_deps
        if l.output["is_data"]
    ]
    data_fitres_output = [d.split("/")[-4] + ".csv.gz" for d in data_fitres_files]
    sim_fitres_files = [
        os.path.join(l.output["fitres_dirs"][0], l.output["fitopt_map"]["DEFAULT"])
        for l in self.lcfit_deps
        if not l.output["is_data"]
    ]
    sim_fitres_output = [d.split("/")[-4] + ".csv.gz" for d in sim_fitres_files]
    types = list(set([a for l in self.lcfit_deps for a in l.sim_task.output["types_dict"]["IA"]]))

    input_yml_file = "input.yml"
    output_dict = {
        "COSMOMC": {
            "INPUT_FILES": self.cosmomc_input_files,
            "PARSED_FILES": self.cosmomc_output_files,
            "PARSED_COVOPTS": self.cosmomc_covopts,
            "PARAMS": self.params,
            "SHIFT": self.options.get("SHIFT", False),
            "PRIOR": self.options.get("PRIOR"),
            "NAMES": self.names,
            "CONTOUR_COVOPTS": self.covopts,
            "SINGULAR_BLIND": self.singular_blind,
        },
        "BIASCOR": {
            "WFIT_SUMMARY_INPUT": self.wsummary_files,
            "WFIT_SUMMARY_OUTPUT": "all_biascor.csv",
            "FITRES_INPUT": self.biascor_fitres_input_files,
            "FITRES_PROB_COLS": self.biascor_prob_col_names,
            "FITRES_PARSED": self.biascor_fitres_output_files,
            "M0DIFF_INPUTS": self.biascor_m0diffs,
            "M0DIFF_PARSED": self.biascor_m0diff_output,
            "FITRES_COMBINED": self.biascor_fitres_combined,
        },
        "OUTPUT_NAME": self.name,
        "BLIND": self.blind_params,
        "LCFIT": {
            "DATA_FITRES_INPUT": data_fitres_files,
            "SIM_FITRES_INPUT": sim_fitres_files,
            "DATA_FITRES_PARSED": data_fitres_output,
            "SIM_FITRES_PARSED": sim_fitres_output,
            "IA_TYPES": types,
        },
    }

    if self.batch_file is None:
        if self.gpu:
            self.sbatch_header = self.sbatch_gpu_header
        else:
            self.sbatch_header = self.sbatch_cpu_header
    else:
        with open(self.batch_file, "r") as f:
            self.sbatch_header = f.read()
        self.sbatch_header = self.clean_header(self.sbatch_header)

    header_dict = {
        "REPLACE_NAME": self.job_name,
        "REPLACE_WALLTIME": "1:00:00",
        "REPLACE_LOGFILE": self.logfile,
        "REPLACE_MEM": "20GB",
        "APPEND": ["#SBATCH --ntasks=1", "#SBATCH --cpus-per-task=1"],
    }
    header_dict = merge_dict(header_dict, self.batch_replace)
    self.update_header(header_dict)

    setup_dict = {"output_dir": self.output_dir}
    format_dict = {
        "sbatch_header": self.sbatch_header,
        "task_setup": self.update_setup(setup_dict, self.task_setup["analyse"]),
        "input_yml": input_yml_file,
    }
    final_slurm = self.get_slurm_raw().format(**format_dict)

    new_hash = self.get_hash_from_string(final_slurm + json.dumps(output_dict))

    if self._check_regenerate(new_hash):
        self.logger.debug("Regenerating and launching task")
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)
        self.save_new_hash(new_hash)
        for c in self.path_to_codes:
            shutil.copy(c, self.output_dir)

        input_yml_path = os.path.join(self.output_dir, input_yml_file)
        with open(input_yml_path, "w") as f:
            json.dump(output_dict, f, indent=2)
        self.logger.debug(f"Input yml file written out to {input_yml_path}")

        slurm_output_file = os.path.join(self.output_dir, "slurm.job")
        with open(slurm_output_file, "w") as f:
            f.write(final_slurm)

        self.logger.info("Submitting batch job for analyse chains")
        subprocess.run(["sbatch", slurm_output_file], cwd=self.output_dir)
    else:
        self.logger.info("Hash check passed, not rerunning")
    return True
def classify(self, training):
    model = self.options.get("MODEL")
    model_path = ""
    if not training:
        assert model is not None, "If TRAIN is not specified, you have to point to a model to use"
        if not os.path.exists(get_output_loc(model)):
            for t in self.dependencies:
                if model == t.name:
                    self.logger.debug(f"Found task dependency {t.name} with model file {t.output['model_filename']}")
                    model = t.output["model_filename"]
        model_path = get_output_loc(model)
        self.logger.debug(f"Looking for model in {model_path}")
        assert os.path.exists(model_path), f"Cannot find {model_path}"

    types = self.get_types()
    if types is None:
        types = OrderedDict({
            "1": "Ia",
            "0": "unknown",
            "2": "SNIax",
            "3": "SNIa-pec",
            "20": "SNIIP",
            "21": "SNIIL",
            "22": "SNIIn",
            "29": "SNII",
            "32": "SNIb",
            "33": "SNIc",
            "39": "SNIbc",
            "41": "SLSN-I",
            "42": "SLSN-II",
            "43": "SLSN-R",
            "80": "AGN",
            "81": "galaxy",
            "98": "None",
            "99": "pending",
            "101": "Ia",
            "120": "SNII",
            "130": "SNIbc",
        })
    else:
        has_ia = False
        has_cc = False
        self.logger.debug(f"Input types set to {types}")
        for key, value in types.items():
            if value.upper() == "IA":
                has_ia = True
            elif value.upper() in ["II", "IBC"]:
                has_cc = True
        if not has_ia:
            self.logger.debug("No Ia type found, injecting type")
            types[1] = "Ia"
            types = dict(sorted(types.items(), key=lambda x: -1 if x[0] == 1 else x[0]))
            self.logger.debug(f"Injected types with Ias are {types}")
        if not has_cc:
            self.logger.debug("No cc type found, injecting type")
            types[29] = "II"

    str_types = json.dumps(types)
    self.logger.debug(f"Types set to {str_types}")

    sim_dep = self.get_simulation_dependency()
    light_curve_dir = sim_dep.output["photometry_dirs"][self.index]
    self.raw_dir = light_curve_dir
    fit = self.get_fit_dependency()
    fit_dir = "" if fit is None else f"--fits_dir {fit['fitres_dirs'][self.index]}"
    cyclic = "--cyclic" if self.variant in ["vanilla", "variational"] and self.cyclic else ""
    batch_size = f"--batch_size {self.batch_size}"
    num_layers = f"--num_layers {self.num_layers}"
    hidden_dim = f"--hidden_dim {self.hidden_dim}"
    variant = f"--model {self.variant}"
    if self.variant == "bayesian":
        variant += " --num_inference_samples 20"

    clump = sim_dep.output.get("clump_file")
    if clump is None:
        clump_txt = ""
    else:
        clump_txt = f"--photo_window_files {clump}"

    if self.batch_file is None:
        if self.gpu:
            self.sbatch_header = self.sbatch_gpu_header
        else:
            self.sbatch_header = self.sbatch_cpu_header
    else:
        with open(self.batch_file, "r") as f:
            self.sbatch_header = f.read()
        self.sbatch_header = self.clean_header(self.sbatch_header)

    if self.has_yml:
        self.update_yml()
        setup_file = "supernnova_yml"
    else:
        setup_file = "supernnova"

    header_dict = {
        "REPLACE_NAME": self.job_base_name,
        "REPLACE_WALLTIME": "23:00:00",
        "REPLACE_LOGFILE": "output.log",
        "REPLACE_MEM": "32GB",
        "APPEND": ["#SBATCH --ntasks=1", "#SBATCH --cpus-per-task=1"],
    }
    header_dict = merge_dict(header_dict, self.batch_replace)
    self.update_header(header_dict)

    setup_dict = {
        "conda_env": self.conda_env,
        "dump_dir": self.dump_dir,
        "photometry_dir": light_curve_dir,
        "fit_dir": fit_dir,
        "path_to_classifier": self.path_to_classifier,
        "job_name": self.job_base_name,
        "command": "--train_rnn" if training else "--validate_rnn",
        "sntypes": str_types,
        "variant": variant,
        "cyclic": cyclic,
        "model": "" if training else f"--model_files {model_path}",
        "phot": "",
        "test_or_train": "" if training else "--data_testing",
        "redshift": "--redshift " + self.redshift,
        "norm": "--norm " + self.norm,
        "done_file": self.done_file,
        "clump": clump_txt,
        "done_file2": self.done_file2,
        "partition": "gpu2" if self.gpu else "broadwl",
        "gres": "#SBATCH --gres=gpu:1" if self.gpu else "",
        "cuda": "--use_cuda" if self.gpu else "",
        "clean_command": f"rm -rf {self.dump_dir}/processed" if self.clean else "",
        "seed": f"--seed {self.seed}" if self.seed else "",
        "batch_size": batch_size,
        "num_layers": num_layers,
        "hidden_dim": hidden_dim,
        "data_yml": self.output_data_yml,
        "classification_yml": self.output_classification_yml,
        "classification_command": "train_rnn" if training else "validate_rnn",
    }

    format_dict = {
        "sbatch_header": self.sbatch_header,
        "task_setup": self.update_setup(setup_dict, self.task_setup[setup_file]),
    }

    slurm_output_file = self.output_dir + "/job.slurm"
    self.logger.info(f"Running SuperNNova, slurm job outputting to {slurm_output_file}")
    slurm_text = self.slurm.format(**format_dict)

    new_hash = self.get_hash_from_string(slurm_text)

    if not self._check_regenerate(new_hash):
        self.should_be_done()
    else:
        self.logger.info("Rerunning. Cleaning output_dir")
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)
        if self.has_yml:
            with open(self.output_data_yml, "w") as f:
                f.write(self.data_yml)
            with open(self.output_classification_yml, "w") as f:
                f.write(self.classification_yml)
        self.save_new_hash(new_hash)
        with open(slurm_output_file, "w") as f:
            f.write(slurm_text)
        self.logger.info(f"Submitting batch job to {'train' if training else 'predict using'} SuperNNova")
        subprocess.run(["sbatch", slurm_output_file], cwd=self.output_dir)
    return True
def write_input(self, force_refresh):
    self.set_property("GENVERSION", self.genversion, assignment=": ", section_end="ENDLIST_GENVERSION")

    for k in self.config.keys():
        if k.upper() != "GLOBAL":
            run_config = self.config[k]
            run_config_keys = list(run_config.keys())
            assert "BASE" in run_config_keys, "You must specify a base file for each option"
            for key in run_config_keys:
                if key.upper() in self.reserved_keywords:
                    continue
                base_file = run_config["BASE"]
                match = base_file.split(".")[0]
                self.set_property(f"GENOPT({match})", f"{key} {run_config[key]}", section_end="ENDLIST_GENVERSION")

    for key in self.config.get("GLOBAL", []):
        if key.upper() == "BASE":
            continue
        self.set_property(key, self.config["GLOBAL"][key])
        if key == "RANSEED_CHANGE":
            self.delete_property("RANSEED_REPEAT")
        elif key == "RANSEED_REPEAT":
            self.delete_property("RANSEED_CHANGE")

    self.set_property("SIMGEN_INFILE_Ia", " ".join(self.base_ia) if self.base_ia else None)
    self.set_property("SIMGEN_INFILE_NONIa", " ".join(self.base_cc) if self.base_cc else None)
    self.set_property("GENPREFIX", self.genversion)

    # Put config in a temp directory
    temp_dir_obj = tempfile.TemporaryDirectory()
    temp_dir = temp_dir_obj.name

    # Copy the base files across
    for f in self.base_ia:
        shutil.copy(self.data_dir + f, temp_dir)
    for f in self.base_cc:
        shutil.copy(self.data_dir + f, temp_dir)

    # Copy the include input file if there is one
    input_copied = []
    fs = self.base_ia + self.base_cc
    for ff in fs:
        if ff not in input_copied:
            input_copied.append(ff)
            with open(self.data_dir + ff, "r") as f:
                for line in f.readlines():
                    line = line.strip()
                    if line.startswith("INPUT_FILE_INCLUDE"):
                        include_file = line.split(":")[-1].strip()
                        self.logger.debug(f"Copying included file {include_file}")
                        shutil.copy(self.data_dir + include_file, temp_dir)

    # Write the primary input file
    main_input_file = f"{temp_dir}/{self.genversion}.input"
    with open(main_input_file, "w") as f:
        f.writelines(map(lambda s: s + "\n", self.base))
    self.logger.info(f"Input file written to {main_input_file}")

    # Remove any duplicates and order the output files
    output_files = [f"{temp_dir}/{a}" for a in sorted(os.listdir(temp_dir))]
    self.logger.debug(f"{len(output_files)} files used to create simulation. Hashing them.")

    # Get current hash
    new_hash = self.get_hash_from_files(output_files)
    old_hash = self.get_old_hash()
    regenerate = force_refresh or (old_hash is None or old_hash != new_hash)

    if regenerate:
        self.logger.info("Running simulation")
        # Clean output dir. God I feel dangerous doing this, so hopefully unnecessary check
        if "//" not in self.output_dir and len(self.output_dir) > 30:
            self.logger.debug(f"Cleaning output directory {self.output_dir}")
            shutil.rmtree(self.output_dir, ignore_errors=True)
            mkdirs(self.output_dir)
            self.logger.debug(f"Copying from {temp_dir} to {self.output_dir}")
            copytree(temp_dir, self.output_dir)
            self.save_new_hash(new_hash)
        else:
            self.logger.error(f"Seems to be an issue with the output dir path: {self.output_dir}")
        chown_dir(self.output_dir)
    else:
        self.logger.info("Hash check passed, not rerunning")

    temp_dir_obj.cleanup()
    return regenerate, new_hash
def execute(self, check_config, compress_output, uncompress_output):
    self.logger.info(f"Executing pipeline for prefix {self.prefix}")
    self.logger.info(f"Output will be located in {self.output_dir}")
    if check_config:
        self.logger.info("Only verifying config, not launching anything")

    assert not (compress_output and uncompress_output), "-C / --compress and -U / --uncompress are mutually exclusive"
    # Whilst compressing is being debugged, false by default
    self.compress = False
    if compress_output:
        self.compress = True
        self.logger.info("Compressing output")
    if uncompress_output:
        self.compress = False
        self.logger.info("Uncompressing output")

    mkdirs(self.output_dir)
    c = self.run_config

    self.tasks = self.get_tasks(c)
    self.num_jobs_queue = 0
    self.num_jobs_queue_gpu = 0
    squeue = None

    if check_config:
        if compress_output:
            self.compress_all()
        if uncompress_output:
            self.uncompress_all()
        self.logger.notice("Config verified, exiting")
        return

    self.print_dashboard()

    start_sleep_time = self.global_config["OUTPUT"]["ping_frequency"]
    max_sleep_time = self.global_config["OUTPUT"]["max_ping_frequency"]
    current_sleep_time = start_sleep_time

    config_file_output = os.path.join(self.output_dir, os.path.basename(self.filename_path))
    if not check_config and self.filename_path != config_file_output:
        self.logger.info(f"Saving processed and parsed config file to {config_file_output}")
        with open(config_file_output, "w") as f:
            f.write(self.file_raw)
        # shutil.copy(self.filename_path, config_file_output)
        chown_file(config_file_output)

    # Welcome to the primary loop
    while self.tasks or self.running:
        small_wait = False

        # Check status of current jobs
        for t in self.running:
            try:
                completed = self.check_task_completion(t, squeue)
                small_wait = small_wait or completed
            except Exception as e:
                self.logger.exception(e, exc_info=True)
                self.fail_task(t)

        # Submit new jobs if needed
        while self.num_jobs_queue < self.max_jobs:
            t = self.get_task_to_run()
            if t is not None:
                self.logger.info("")
                self.tasks.remove(t)
                self.logger.notice(f"LAUNCHING: {t}")
                try:
                    t.set_force_refresh(self.get_force_refresh(t))
                    t.set_force_ignore(self.get_force_ignore(t))
                    t.set_sbatch_cpu_header(self.sbatch_cpu_header)
                    t.set_sbatch_gpu_header(self.sbatch_gpu_header)
                    t.set_setup(self.task_setup)
                    started = t.run()
                except Exception as e:
                    self.logger.exception(e, exc_info=True)
                    started = False
                if started:
                    if t.gpu:
                        self.num_jobs_queue_gpu += t.num_jobs
                        message = f"LAUNCHED: {t} with {t.num_jobs} GPU NUM_JOBS. Total GPU NUM_JOBS now {self.num_jobs_queue_gpu}/{self.max_jobs_in_queue_gpu}"
                    else:
                        self.num_jobs_queue += t.num_jobs
                        message = f"LAUNCHED: {t} with {t.num_jobs} NUM_JOBS. Total NUM_JOBS now {self.num_jobs_queue}/{self.max_jobs_in_queue}"
                    self.logger.notice(message)
                    self.running.append(t)
                    completed = False
                    try:
                        completed = self.check_task_completion(t, squeue)
                    except Exception as e:
                        self.logger.exception(e, exc_info=True)
                        self.fail_task(t)
                    small_wait = small_wait or completed
                else:
                    self.logger.error(f"FAILED TO LAUNCH: {t}")
                    self.fail_task(t)
                small_wait = True
            else:
                break

        # Check quickly if we've added a new job, etc, in case of immediate failure
        if small_wait:
            self.log_status()
            current_sleep_time = start_sleep_time
            time.sleep(0.1)
            squeue = None
        else:
            time.sleep(current_sleep_time)
            current_sleep_time *= 2
            if current_sleep_time > max_sleep_time:
                current_sleep_time = max_sleep_time
            p = subprocess.run(
                "squeue -h -u $USER -o '%.j'",
                shell=True,
                text=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            if (p.returncode != 0) or (p.stderr != ""):
                self.logger.error(f"Command '{p.args}' failed with exit status '{p.returncode}' and error '{p.stderr.strip()}'")
            else:
                squeue = [i.strip() for i in p.stdout.splitlines()]
                n = len(squeue)
                if n == 0 or n > self.max_jobs:
                    self.logger.debug(f"Squeue is reporting {n} NUM_JOBS in the queue... this is either 0 or toeing the line of too many")

    self.log_finals()
def classify(self, training, force_refresh):
    model = self.options.get("MODEL")
    model_path = ""
    if not training:
        assert model is not None, "If TRAIN is not specified, you have to point to a model to use"
        if not os.path.exists(get_output_loc(model)):
            for t in self.dependencies:
                if model == t.name:
                    self.logger.debug(f"Found task dependency {t.name} with model file {t.output['model_filename']}")
                    model = t.output["model_filename"]
        model_path = get_output_loc(model)
        self.logger.debug(f"Looking for model in {model_path}")
        assert os.path.exists(model_path), f"Cannot find {model_path}"

    types = self.get_types()
    if types is None:
        types = OrderedDict({
            "1": "Ia",
            "0": "unknown",
            "2": "SNIax",
            "3": "SNIa-pec",
            "20": "SNIIP",
            "21": "SNIIL",
            "22": "SNIIn",
            "29": "SNII",
            "32": "SNIb",
            "33": "SNIc",
            "39": "SNIbc",
            "41": "SLSN-I",
            "42": "SLSN-II",
            "43": "SLSN-R",
            "80": "AGN",
            "81": "galaxy",
            "98": "None",
            "99": "pending",
            "101": "Ia",
            "120": "SNII",
            "130": "SNIbc",
        })
    else:
        has_ia = False
        has_cc = False
        self.logger.debug(f"Input types set to {types}")
        for key, value in types.items():
            if value.upper() == "IA":
                has_ia = True
            elif value.upper() in ["II", "IBC"]:
                has_cc = True
        if not has_ia:
            self.logger.debug("No Ia type found, injecting type")
            types.update({"1": "Ia"})
            types.move_to_end("1", last=False)
        if not has_cc:
            self.logger.debug("No cc type found, injecting type")
            types.update({"29": "II"})

    str_types = json.dumps(types)
    self.logger.debug(f"Types set to {str_types}")

    sim_dep = self.get_simulation_dependency()
    light_curve_dir = sim_dep.output["photometry_dirs"][self.index]
    fit = self.get_fit_dependency()
    fit_dir = "" if fit is None else f"--fits_dir {fit['fitres_dirs'][self.index]}"
    cyclic = "--cyclic" if self.variant in ["vanilla", "variational"] else ""
    variant = f"--model {self.variant}"
    if self.variant == "bayesian":
        variant += " --num_inference_samples 20"

    clump = sim_dep.output.get("clump_file")
    if clump is None:
        clump_txt = ""
    else:
        clump_txt = f"--photo_window_files {clump}"

    format_dict = {
        "conda_env": self.conda_env,
        "dump_dir": self.dump_dir,
        "photometry_dir": light_curve_dir,
        "fit_dir": fit_dir,
        "path_to_classifier": self.path_to_classifier,
        "job_name": self.job_base_name,
        "command": "--train_rnn" if training else "--validate_rnn",
        "sntypes": str_types,
        "variant": variant,
        "cyclic": cyclic,
        "model": "" if training else f"--model_files {model_path}",
        "phot": "",
        "test_or_train": "" if training else "--data_testing",
        "redshift": "--redshift " + self.redshift,
        "norm": "--norm " + self.norm,
        "done_file": self.done_file,
        "clump": clump_txt,
        "done_file2": self.done_file2,
    }

    slurm_output_file = self.output_dir + "/job.slurm"
    self.logger.info(f"Running SuperNNova, slurm job outputting to {slurm_output_file}")
    slurm_text = self.slurm.format(**format_dict)

    old_hash = self.get_old_hash()
    new_hash = self.get_hash_from_string(slurm_text)

    if not force_refresh and new_hash == old_hash:
        self.logger.info("Hash check passed, not rerunning")
        self.should_be_done()
    else:
        self.logger.info("Rerunning. Cleaning output_dir")
        shutil.rmtree(self.output_dir, ignore_errors=True)
        mkdirs(self.output_dir)
        self.save_new_hash(new_hash)
        with open(slurm_output_file, "w") as f:
            f.write(slurm_text)
        self.logger.info(f"Submitting batch job to {'train' if training else 'predict using'} SuperNNova")
        subprocess.run(["sbatch", slurm_output_file], cwd=self.output_dir)
    return True