class _TestJobExecutor(BatchSchedulerExecutor):
    _STATE_MAP = {
        'F': JobState.FAILED,
        'X': JobState.CANCELED,
        'C': JobState.COMPLETED,
        'Q': JobState.QUEUED,
        'R': JobState.ACTIVE,
    }

    def __init__(self, url: Optional[str] = None, config: Optional[_TestExecutorConfig] = None):
        if not config:
            config = _TestExecutorConfig()
        super().__init__(config=config)
        self.generator = TemplatedScriptGenerator(config,
                                                  Path(__file__).parent / 'test' / 'test.mustache')

    def generate_submit_script(self, job: Job, context: Dict[str, object],
                               submit_file: TextIO) -> None:
        self.generator.generate_submit_script(job, context, submit_file)

    def get_submit_command(self, job: Job, submit_file_path: Path) -> List[str]:
        return [sys.executable, QSUB_PATH, str(submit_file_path.absolute())]

    def get_cancel_command(self, native_id: str) -> List[str]:
        return [sys.executable, QDEL_PATH, native_id]

    def process_cancel_command_output(self, exit_code: int, out: str) -> None:
        if exit_code == 16:
            raise InvalidJobStateError()
        else:
            raise SubmitException(out)

    def get_status_command(self, native_ids: Collection[str]) -> List[str]:
        ids = ','.join(native_ids)
        return [sys.executable, QSTAT_PATH, ids]

    def job_id_from_submit_output(self, out: str) -> str:
        return out.strip().split()[-1]

    def parse_status_output(self, exit_code: int, out: str) -> Dict[str, JobStatus]:
        check_status_exit_code(QSTAT_PATH, exit_code, out)
        r = {}
        for line in out.split('\n'):
            if not line:
                continue
            cols = line.split(maxsplit=2)
            native_id = cols[0]
            state = self._get_state(cols[1])
            msg = cols[2] if len(cols) == 3 else None
            r[native_id] = JobStatus(state, message=msg)
        return r

    def _get_state(self, state: str) -> JobState:
        assert state in _TestJobExecutor._STATE_MAP
        return _TestJobExecutor._STATE_MAP[state]
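# A hedged sketch (illustrative only, not part of the test suite) of the status
# protocol the mock scheduler above speaks: one job per line, formatted as
# "<id> <state> [message]", with single-letter states drawn from _STATE_MAP.
# The sample output and job IDs below are made up:
#
#     ex = _TestJobExecutor()
#     ex.parse_status_output(0, '123 R\n124 F node failure\n')
#     # -> {'123': JobStatus(JobState.ACTIVE),
#     #     '124': JobStatus(JobState.FAILED, message='node failure')}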
class LsfJobExecutor(BatchSchedulerExecutor):
    """A :class:`~psij.JobExecutor` for the LSF Workload Manager.

    The `IBM Spectrum LSF workload manager <https://www.ibm.com/docs/en/spectrum-lsf>`_ is the
    system resource manager on LLNL's Sierra and Lassen, and ORNL's Summit.

    Uses the 'bsub', 'bjobs', and 'bkill' commands, respectively, to submit, monitor, and
    cancel jobs.

    Creates a batch script with #BSUB directives when submitting a job.
    """

    # see https://www.ibm.com/docs/en/spectrum-lsf/10.1.0?topic=bjobs-description
    _STATE_MAP = {
        "PEND": JobState.QUEUED,     # normal queued state
        "PROV": JobState.QUEUED,     # the job has been dispatched...
        "PSUSP": JobState.QUEUED,    # the job has been suspended... while pending
        "RUN": JobState.ACTIVE,      # the job is currently running
        "USUSP": JobState.ACTIVE,    # the job has been suspended... while running
        "SSUSP": JobState.ACTIVE,    # the job has been suspended by LSF
        "DONE": JobState.COMPLETED,  # normal 0 exit
        "EXIT": JobState.FAILED,     # nonzero exit
        "UNKWN": JobState.ACTIVE,    # mbatchd has lost contact with the job host...
        "WAIT": JobState.QUEUED,     # for... members of a chunk job waiting to run
        "ZOMBI": JobState.ACTIVE,    # something unusual, but probably active
    }

    def __init__(self, url: Optional[str] = None, config: Optional[LsfExecutorConfig] = None):
        """Initializes a :class:`~LsfJobExecutor`."""
        if not config:
            config = LsfExecutorConfig()
        super().__init__(config=config)
        self.generator = TemplatedScriptGenerator(
            config, Path(__file__).parent / "lsf" / "lsf.mustache")

    def generate_submit_script(self, job: Job, context: Dict[str, object],
                               submit_file: TextIO) -> None:
        """See :meth:`~.BatchSchedulerExecutor.generate_submit_script`."""
        assert job.spec is not None
        # LSF expects the run limit in minutes
        context["job_duration"] = int(job.spec.attributes.duration.total_seconds() // 60)
        self.generator.generate_submit_script(job, context, submit_file)

    def get_submit_command(self, job: Job, submit_file_path: Path) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_submit_command`."""
        return ["bsub", str(submit_file_path.absolute())]

    def get_cancel_command(self, native_id: str) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_cancel_command`.

        ``bkill`` will exit with an error set if the job does not exist or has already
        finished.
        """
        return ["bkill", native_id]

    def process_cancel_command_output(self, exit_code: int, out: str) -> None:
        """See :meth:`~.BatchSchedulerExecutor.process_cancel_command_output`.

        Check if the error was raised only because the job already exited.
        """
        if _BKILL_FAILURE_REGEX.search(out) is None:
            raise SubmitException(out)

    def get_status_command(self, native_ids: Collection[str]) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_status_command`."""
        return [
            _BJOBS_COMMAND,
            "-o",
            "JOBID STAT EXIT_REASON KILL_REASON SUSPEND_REASON",
            "-json",
            "-a",
            *native_ids,
        ]

    def parse_status_output(self, exit_code: int, out: str) -> Dict[str, JobStatus]:
        """See :meth:`~.BatchSchedulerExecutor.parse_status_output`.

        Iterates through the RECORDS entries, grabbing the JOBID and STAT fields, as well as
        any state-change reason that is present.
        """
        check_status_exit_code(_BJOBS_COMMAND, exit_code, out)
        output = json.loads(out)
        status_map = {}
        for entry in output["RECORDS"]:
            if "ERROR" in entry:
                continue
            state = self._STATE_MAP[entry["STAT"]]
            message = None
            for reason in ("EXIT_REASON", "KILL_REASON", "SUSPEND_REASON"):
                if entry[reason]:
                    # look up the reason field that matched, not a literal "reason" key
                    message = entry[reason]
                    break
            status_map[entry["JOBID"]] = JobStatus(state, message=message)
        return status_map

    def job_id_from_submit_output(self, out: str) -> str:
        """See :meth:`~.BatchSchedulerExecutor.job_id_from_submit_output`."""
        # bsub prints a line like 'Job <1234> is submitted to queue <normal>.';
        # strip the leading 'Job <' and trailing '>' from the regex match
        match = _BSUB_REGEX.search(out)
        if match is None:
            raise SubmitException(out)
        return match.group(0)[5:-1]
class SlurmJobExecutor(BatchSchedulerExecutor):
    """A :class:`~psij.JobExecutor` for the Slurm Workload Manager.

    The `Slurm Workload Manager <https://slurm.schedmd.com/overview.html>`_ is a widely used
    resource manager running on machines such as NERSC's Perlmutter, as well as a variety of
    LLNL machines.

    Uses the 'sbatch', 'squeue', and 'scancel' commands, respectively, to submit, monitor, and
    cancel jobs.

    Creates a batch script with #SBATCH directives when submitting a job.
    """

    # see https://slurm.schedmd.com/squeue.html
    _STATE_MAP = {
        'BF': JobState.FAILED,
        'CA': JobState.CANCELED,
        'CD': JobState.COMPLETED,
        'CF': JobState.QUEUED,
        'CG': JobState.ACTIVE,
        'DL': JobState.FAILED,
        'F': JobState.FAILED,
        'NF': JobState.FAILED,
        'OOM': JobState.FAILED,
        'PD': JobState.QUEUED,
        'PR': JobState.FAILED,
        'R': JobState.ACTIVE,
        'RD': JobState.QUEUED,
        'RF': JobState.QUEUED,
        'RH': JobState.QUEUED,
        'RQ': JobState.ACTIVE,
        'SO': JobState.ACTIVE,
        'TO': JobState.FAILED,
        # TODO: double-check these
        'RS': JobState.ACTIVE,
        'RV': JobState.QUEUED,
        'SI': JobState.ACTIVE,
        'SE': JobState.ACTIVE,
        'ST': JobState.ACTIVE,
        'S': JobState.ACTIVE,
    }

    # see https://slurm.schedmd.com/squeue.html
    _REASONS_MAP = {
        'AssociationJobLimit': 'The job\'s association has reached its maximum job count.',
        'AssociationResourceLimit': 'The job\'s association has reached some resource limit.',
        'AssociationTimeLimit': 'The job\'s association has reached its time limit.',
        'BadConstraints': 'The job\'s constraints can not be satisfied.',
        'BeginTime': 'The job\'s earliest start time has not yet been reached.',
        'Cleaning': 'The job is being requeued and still cleaning up from its previous '
                    'execution.',
        'Dependency': 'This job is waiting for a dependent job to complete.',
        'FrontEndDown': 'No front end node is available to execute this job.',
        'InactiveLimit': 'The job reached the system InactiveLimit.',
        'InvalidAccount': 'The job\'s account is invalid.',
        'InvalidQOS': 'The job\'s QOS is invalid.',
        'JobHeldAdmin': 'The job is held by a system administrator.',
        'JobHeldUser': 'The job is held by the user.',
        'JobLaunchFailure': 'The job could not be launched. This may be due to a file system '
                            'problem, invalid program name, etc.',
        'Licenses': 'The job is waiting for a license.',
        'NodeDown': 'A node required by the job is down.',
        'NonZeroExitCode': 'The job terminated with a non-zero exit code.',
        'PartitionDown': 'The partition required by this job is in a DOWN state.',
        'PartitionInactive': 'The partition required by this job is in an Inactive state and '
                             'not able to start jobs.',
        'PartitionNodeLimit': 'The number of nodes required by this job is outside of its '
                              'partition\'s current limits. Can also indicate that required '
                              'nodes are DOWN or DRAINED.',
        'PartitionTimeLimit': 'The job\'s time limit exceeds its partition\'s current time '
                              'limit.',
        'Priority': 'One or more higher priority jobs exist for this partition or advanced '
                    'reservation.',
        'Prolog': 'Its PrologSlurmctld program is still running.',
        'QOSJobLimit': 'The job\'s QOS has reached its maximum job count.',
        'QOSResourceLimit': 'The job\'s QOS has reached some resource limit.',
        'QOSTimeLimit': 'The job\'s QOS has reached its time limit.',
        'ReqNodeNotAvail': 'Some node specifically required by the job is not currently '
                           'available. The node may currently be in use, reserved for another '
                           'job, in an advanced reservation, DOWN, DRAINED, or not responding. '
                           'Nodes which are DOWN, DRAINED, or not responding will be identified '
                           'as part of the job\'s "reason" field as "UnavailableNodes". Such '
                           'nodes will typically require the intervention of a system '
                           'administrator to make available.',
        'Reservation': 'The job is waiting for its advanced reservation to become available.',
        'Resources': 'The job is waiting for resources to become available.',
        'SystemFailure': 'Failure of the Slurm system, a file system, the network, etc.',
        'TimeLimit': 'The job exhausted its time limit.',
        'QOSUsageThreshold': 'Required QOS threshold has been breached.',
        'WaitingForScheduling': 'No reason has been set for this job yet. Waiting for the '
                                'scheduler to determine the appropriate reason.',
    }

    def __init__(self, url: Optional[str] = None, config: Optional[SlurmExecutorConfig] = None):
        """Initializes a :class:`~SlurmJobExecutor`."""
        if not config:
            config = SlurmExecutorConfig()
        super().__init__(config=config)
        self.generator = TemplatedScriptGenerator(config,
                                                  Path(__file__).parent / 'slurm'
                                                  / 'slurm.mustache')

    def generate_submit_script(self, job: Job, context: Dict[str, object],
                               submit_file: TextIO) -> None:
        """See :meth:`~.BatchSchedulerExecutor.generate_submit_script`."""
        self.generator.generate_submit_script(job, context, submit_file)

    def get_submit_command(self, job: Job, submit_file_path: Path) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_submit_command`."""
        return ['sbatch', str(submit_file_path.absolute())]

    def get_cancel_command(self, native_id: str) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_cancel_command`."""
        # -Q suppresses the error (and nonzero exit code) for jobs that have already completed
        return ['scancel', '-Q', native_id]

    def process_cancel_command_output(self, exit_code: int, out: str) -> None:
        """See :meth:`~.BatchSchedulerExecutor.process_cancel_command_output`."""
        raise SubmitException('Failed to cancel job: %s' % out)

    def get_status_command(self, native_ids: Collection[str]) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_status_command`."""
        ids = ','.join(native_ids)
        # We're not really using job arrays, so JobArrayID is equivalent to the job ID.
        # However, if we were to use arrays, squeue would return one ID for the entire array
        # rather than listing each element of the array independently.
        return [_SQUEUE_COMMAND, '-O', 'JobArrayID,StateCompact,Reason', '-t', 'all', '-j', ids]

    def parse_status_output(self, exit_code: int, out: str) -> Dict[str, JobStatus]:
        """See :meth:`~.BatchSchedulerExecutor.parse_status_output`."""
        check_status_exit_code(_SQUEUE_COMMAND, exit_code, out)
        r = {}
        lines = iter(out.split('\n'))
        # skip header
        next(lines)
        for line in lines:
            if not line:
                continue
            cols = line.split()
            assert len(cols) == 3
            native_id = cols[0]
            state = self._get_state(cols[1])
            msg = self._get_message(cols[2]) if state == JobState.FAILED else None
            r[native_id] = JobStatus(state, message=msg)
        return r

    def _get_state(self, state: str) -> JobState:
        assert state in SlurmJobExecutor._STATE_MAP
        return SlurmJobExecutor._STATE_MAP[state]

    def _get_message(self, reason: str) -> str:
        assert reason in SlurmJobExecutor._REASONS_MAP
        return SlurmJobExecutor._REASONS_MAP[reason]

    def job_id_from_submit_output(self, out: str) -> str:
        """See :meth:`~.BatchSchedulerExecutor.job_id_from_submit_output`."""
        return out.strip().split()[-1]
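# A minimal usage sketch through the high-level PSI/J API, assuming this executor is
# registered under the 'slurm' alias and that sbatch/squeue/scancel are on PATH (both
# assumptions; adapt to your installation):
#
#     from psij import Job, JobSpec, JobExecutor
#
#     ex = JobExecutor.get_instance('slurm')
#     job = Job(JobSpec(executable='/bin/date'))
#     ex.submit(job)
#     job.wait()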
class CobaltJobExecutor(BatchSchedulerExecutor):
    """A :class:`~psij.JobExecutor` for the Cobalt Workload Manager.

    The `Cobalt HPC Job Scheduler <https://xgitlab.cels.anl.gov/aig-public/cobalt>`_ is used by
    `Argonne's <www.anl.gov>`_ `ALCF <www.alcf.anl.gov>`_ systems.

    Uses the ``qsub``, ``qstat``, and ``qdel`` commands, respectively, to submit, monitor, and
    cancel jobs.

    Creates a batch script with #COBALT directives when submitting a job.
    """

    # Cobalt job states as reported by qstat
    _STATE_MAP = {
        "starting": JobState.ACTIVE,
        "queued": JobState.QUEUED,
        "running": JobState.ACTIVE,
        "exiting": JobState.ACTIVE,
        "killing": JobState.FAILED,
    }

    def __init__(self, url: Optional[str] = None, config: Optional[CobaltExecutorConfig] = None):
        """Initializes a :class:`~CobaltJobExecutor`."""
        if not config:
            config = CobaltExecutorConfig()
        super().__init__(config=config)
        self.generator = TemplatedScriptGenerator(
            config, Path(__file__).parent / "cobalt" / "cobalt.mustache")

    def generate_submit_script(self, job: Job, context: Dict[str, object],
                               submit_file: TextIO) -> None:
        """See :meth:`~.BatchSchedulerExecutor.generate_submit_script`."""
        self.generator.generate_submit_script(job, context, submit_file)

    def get_submit_command(self, job: Job, submit_file_path: Path) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_submit_command`."""
        # Cobalt requires the submit script to be executable
        str_path = str(submit_file_path.absolute())
        os.chmod(str_path, os.stat(str_path).st_mode | stat.S_IEXEC)
        return ["qsub", str_path]

    def get_cancel_command(self, native_id: str) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_cancel_command`."""
        return ["qdel", native_id]

    def process_cancel_command_output(self, exit_code: int, out: str) -> None:
        """See :meth:`~.BatchSchedulerExecutor.process_cancel_command_output`.

        This should be unnecessary because ``qdel`` only seems to fail on non-integer job IDs.
        """
        raise SubmitException("Failed to cancel job: %s" % out)

    def get_status_command(self, native_ids: Collection[str]) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_status_command`."""
        return [_QSTAT_COMMAND, "-l", "--header=Jobid:State", *native_ids]

    def parse_status_output(self, exit_code: int, out: str) -> Dict[str, JobStatus]:
        """See :meth:`~.BatchSchedulerExecutor.parse_status_output`."""
        # If none of the job IDs passed to qstat are recognized, it returns 1, but we
        # shouldn't treat that as an error.
        if exit_code != 0 and out == UNKNOWN_ERROR:
            return {}
        check_status_exit_code(_QSTAT_COMMAND, exit_code, out)
        job_statuses = {}
        index = 0
        lines = out.split("\n")
        # each job is reported on two consecutive lines: a Jobid line followed by a State line
        while index < len(lines) - 1:
            jobid_match = _QSTAT_JOBID_REGEX.search(lines[index])
            if jobid_match is not None:
                state_match = _QSTAT_STATE_REGEX.search(lines[index + 1])
                if state_match is not None:
                    job_statuses[jobid_match.group(2)] = JobStatus(
                        self._STATE_MAP[state_match.group(2)])
                    index += 2
                else:
                    index += 1
            else:
                index += 1
        return job_statuses

    def job_id_from_submit_output(self, out: str) -> str:
        """See :meth:`~.BatchSchedulerExecutor.job_id_from_submit_output`."""
        match = _QSUB_REGEX.search(out)
        if match is None:
            raise SubmitException(out)
        return match.group(0)
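# A hedged sketch of the long-format `qstat -l --header=Jobid:State` layout that
# parse_status_output walks, assuming _QSTAT_JOBID_REGEX and _QSTAT_STATE_REGEX capture
# the field value in group(2); the exact layout and values below are assumptions for
# illustration:
#
#     Jobid: 12345
#     State: queued
#
# would yield {'12345': JobStatus(JobState.QUEUED)}.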
class PBSProJobExecutor(BatchSchedulerExecutor):
    """A :class:`~psij.JobExecutor` for PBS Pro.

    `PBS Pro <https://www.altair.com/pbs-professional/>`_ is a resource manager on certain
    machines at Argonne National Lab, among others.

    Uses the 'qsub', 'qstat', and 'qdel' commands, respectively, to submit, monitor, and
    cancel jobs.

    Creates a batch script with #PBS directives when submitting a job.
    """

    def __init__(self, url: Optional[str] = None, config: Optional[PBSProExecutorConfig] = None):
        """Initializes a :class:`~PBSProJobExecutor`."""
        if not config:
            config = PBSProExecutorConfig()
        super().__init__(url=url, config=config)
        self.generator = TemplatedScriptGenerator(
            config, Path(__file__).parent / 'pbspro' / 'pbspro.mustache')

    # Submit methods

    def generate_submit_script(self, job: Job, context: Dict[str, object],
                               submit_file: TextIO) -> None:
        """See :meth:`~.BatchSchedulerExecutor.generate_submit_script`."""
        self.generator.generate_submit_script(job, context, submit_file)

    def get_submit_command(self, job: Job, submit_file_path: Path) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_submit_command`."""
        return ['qsub', str(submit_file_path.absolute())]

    def job_id_from_submit_output(self, out: str) -> str:
        """See :meth:`~.BatchSchedulerExecutor.job_id_from_submit_output`."""
        return out.strip().split()[-1]

    # Cancel methods

    def get_cancel_command(self, native_id: str) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_cancel_command`."""
        # The Slurm cancel command has a -Q parameter which does not report an error if the
        # job has already completed.
        # TODO: what's the PBS equivalent of that? There is -x, but it also removes job
        # history, so we would need to check that it doesn't cause implicit COMPLETED states
        # when, perhaps, they should be CANCELED states.
        return ['qdel', native_id]

    def process_cancel_command_output(self, exit_code: int, out: str) -> None:
        """See :meth:`~.BatchSchedulerExecutor.process_cancel_command_output`."""
        raise SubmitException('Failed to cancel job: %s' % out)

    # Status methods

    def get_status_command(self, native_ids: Collection[str]) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_status_command`."""
        # -x includes finished jobs; -f -F json gives JSON status output that is more
        # mechanically parseable than the default human-readable output. Most importantly,
        # native job IDs will be full length and so match up with the IDs returned by qsub
        # (123.a vs 123.a.domain.foo).
        return [_QSTAT_COMMAND, '-f', '-F', 'json', '-x'] + list(native_ids)

    def parse_status_output(self, exit_code: int, out: str) -> Dict[str, JobStatus]:
        """See :meth:`~.BatchSchedulerExecutor.parse_status_output`."""
        check_status_exit_code(_QSTAT_COMMAND, exit_code, out)
        r = {}
        report = json.loads(out)
        jobs = report['Jobs']
        for native_id in jobs:
            job_report = jobs[native_id]
            native_state = job_report['job_state']
            state = self._get_state(native_state)
            if state == JobState.COMPLETED:
                if 'Exit_status' in job_report and job_report['Exit_status'] == 265:
                    # an exit status of 265 typically means the job was killed
                    # (256 + SIGKILL's signal number 9), so treat it as a cancellation
                    state = JobState.CANCELED
                elif 'Exit_status' in job_report and job_report['Exit_status'] != 0:
                    state = JobState.FAILED
            msg = job_report['comment']
            r[native_id] = JobStatus(state, message=msg)
        return r

    def _get_state(self, state: str) -> JobState:
        assert state in _STATE_MAP, f'PBS state {state} is not known to PSI/J'
        return _STATE_MAP[state]
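# A hedged sketch of the `qstat -f -F json -x` record shape interpreted above, assuming
# the module-level _STATE_MAP sends PBS's finished state to JobState.COMPLETED; the job
# ID, hostname, and comment are illustrative:
#
#     out = '''{"Jobs": {"123.pbs01": {"job_state": "F", "Exit_status": 265,
#                                      "comment": "job killed on request"}}}'''
#     PBSProJobExecutor().parse_status_output(0, out)
#     # -> {'123.pbs01': JobStatus(JobState.CANCELED, message='job killed on request')}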