def check_mgmt_db(self):
    """Record an error for each base task in the management db that did not complete.

    Queries the management db found under ``self.stage_dir`` for EMOD3D, HF,
    BB and IM_calculation tasks whose status is ``unknown`` or ``failed`` and
    appends one ``Error`` per returned entry to ``self.errors``.

    ``self.canceled_running`` is passed to the query as ``blocked_ids``
    (presumably so that jobs cancelled on purpose are not reported here;
    confirm against ``MgmtDB.command_builder``).
    """
    base_proc_types = [
        const.ProcessType.EMOD3D,
        const.ProcessType.HF,
        const.ProcessType.BB,
        const.ProcessType.IM_calculation,
    ]
    db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))

    entries = db.command_builder(
        allowed_tasks=base_proc_types,
        allowed_states=[const.Status.unknown, const.Status.failed],
        blocked_ids=self.canceled_running,
    )

    for entry in entries:
        # The message template previously lacked its closing parenthesis.
        message = "Run {} did not complete task {} (Status {}, JobId {})".format(
            entry.run_name,
            const.ProcessType(entry.proc_type),
            const.Status(entry.status),
            entry.job_id,
        )
        self.errors.append(Error("Slurm task", message))
def cancel_running(self, proc_types: List[const.ProcessType]):
    """Attempt to cancel one running job for each of the given process types.

    Looks up the running tasks of the requested process types in the
    management db under ``self.stage_dir`` and asks the scheduler to cancel
    the first one found per type. A cancellation counts as successful when
    neither the scheduler's stdout nor stderr contains "error"
    (case-insensitive); the job id is then recorded (as a string) in
    ``self.canceled_running``.

    Args:
        proc_types: Process types to cancel a job for. NOTE: this list is
            modified in place; every type whose job was cancelled
            successfully is removed from it.

    Returns:
        The same ``proc_types`` list, now holding only the process types for
        which no job was cancelled.
    """
    # Get all running jobs in the mgmt db
    db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))
    entries = db.command_builder(
        allowed_tasks=proc_types, allowed_states=[const.Status.running]
    )

    # Cancel one for each process type
    for entry in entries:
        # Types already cancelled have been removed from proc_types, so any
        # further running jobs of that type are skipped.
        if entry.proc_type not in proc_types:
            continue

        # Both halves must be f-strings; the second half previously was not,
        # so its placeholders were printed literally.
        print(
            f"Checkpoint testing: Cancelling job-id {entry.job_id} "
            f"for {entry.run_name} and process type {entry.proc_type}"
        )

        out, err = Scheduler.get_scheduler().cancel_job(entry.job_id)
        print("Scancel out: ", out, err)

        if "error" not in out.lower() and "error" not in err.lower():
            self.canceled_running.append(str(entry.job_id))
            proc_types.remove(entry.proc_type)
            print("Cancelled job-id {}".format(entry.job_id))

    return proc_types
def check_mgmt_db_progress(self):
    """Report auto submit progress as counted from the management db.

    Only EMOD3D, HF, BB and IM_calculation tasks are counted.

    Returns:
        A ``(total, completed, failed)`` tuple of entry counts, where
        ``failed`` covers entries whose status is ``failed`` or ``unknown``.
    """
    tracked_tasks = [
        const.ProcessType.EMOD3D,
        const.ProcessType.HF,
        const.ProcessType.BB,
        const.ProcessType.IM_calculation,
    ]
    mgmt_db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))

    def count_entries(**state_filter):
        # Number of tracked-task entries returned for the given extra filter.
        return len(
            mgmt_db.command_builder(allowed_tasks=tracked_tasks, **state_filter)
        )

    # Tuple elements are evaluated left to right: total, completed, failed.
    return (
        count_entries(),
        count_entries(allowed_states=[const.Status.completed]),
        count_entries(
            allowed_states=[const.Status.failed, const.Status.unknown]
        ),
    )
def check_completed(self):
    """Check the results of simulations that have newly completed.

    A simulation is picked up when its IM_calculation task is marked
    completed in the management db and it is in neither ``self._sim_passed``
    nor ``self._sim_failed``. Each one is checked with
    ``self.check_sim_result`` and added to the matching set.

    Returns:
        ``False`` as soon as a simulation fails while ``self._stop_on_error``
        is set, ``True`` otherwise.
    """
    mgmt_db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))
    finished_entries = mgmt_db.command_builder(
        allowed_tasks=[const.ProcessType.IM_calculation],
        allowed_states=[const.Status.completed],
    )
    finished_names = [entry.run_name for entry in finished_entries]

    # Only check the ones that haven't been checked already
    already_checked = self._sim_passed | self._sim_failed
    unchecked = set(finished_names).difference(already_checked)

    for sim in unchecked:
        sim_dir = os.path.join(
            self.runs_dir, sim_struct.get_fault_from_realisation(sim), sim
        )

        if self.check_sim_result(sim_dir):
            self._sim_passed.add(sim)
            continue

        self._sim_failed.add(sim)

        if self._stop_on_error:
            print("Quitting as the following errors occured: ")
            self.print_errors()
            return False

        # Keep going, but report the most recently recorded error.
        print("The following error occured for simulation {}:".format(sim))
        print(
            "ERROR: {}, {}\n".format(
                self.errors[-1].location, self.errors[-1].error
            )
        )

    print(
        "Passed/Failed/Total simulations: {}/{}/{}, ".format(
            len(self._sim_passed), len(self._sim_failed), len(self.sim_dirs)
        )
    )
    return True