Example #1
def __init__(self, url: Optional[str] = None, config: Optional[SlurmExecutorConfig] = None):
    """Initializes a :class:`~SlurmJobExecutor`."""
    if not config:
        config = SlurmExecutorConfig()
    super().__init__(config=config)
    self.generator = TemplatedScriptGenerator(config, Path(__file__).parent / 'slurm'
                                              / 'slurm.mustache')
Example #2
class _TestJobExecutor(BatchSchedulerExecutor):
    _STATE_MAP = {
        'F': JobState.FAILED,
        'X': JobState.CANCELED,
        'C': JobState.COMPLETED,
        'Q': JobState.QUEUED,
        'R': JobState.ACTIVE,
    }

    def __init__(self, url: Optional[str] = None, config: Optional[_TestExecutorConfig] = None):
        if not config:
            config = _TestExecutorConfig()
        super().__init__(config=config)
        self.generator = TemplatedScriptGenerator(config, Path(__file__).parent / 'test'
                                                  / 'test.mustache')

    def generate_submit_script(self, job: Job, context: Dict[str, object],
                               submit_file: TextIO) -> None:
        self.generator.generate_submit_script(job, context, submit_file)

    def get_submit_command(self, job: Job, submit_file_path: Path) -> List[str]:
        return [sys.executable, QSUB_PATH, str(submit_file_path.absolute())]

    def get_cancel_command(self, native_id: str) -> List[str]:
        return [sys.executable, QDEL_PATH, native_id]

    def process_cancel_command_output(self, exit_code: int, out: str) -> None:
        if exit_code == 16:
            raise InvalidJobStateError()
        else:
            raise SubmitException(out)

    def get_status_command(self, native_ids: Collection[str]) -> List[str]:
        ids = ','.join(native_ids)
        return [sys.executable, QSTAT_PATH, ids]

    def job_id_from_submit_output(self, out: str) -> str:
        return out.strip().split()[-1]

    def parse_status_output(self, exit_code: int, out: str) -> Dict[str, JobStatus]:
        check_status_exit_code(QSTAT_PATH, exit_code, out)
        r = {}
        lines = iter(out.split('\n'))
        for line in lines:
            if not line:
                continue
            cols = line.split(maxsplit=2)
            native_id = cols[0]
            state = self._get_state(cols[1])
            msg = cols[2] if len(cols) == 3 else None
            r[native_id] = JobStatus(state, message=msg)
        return r

    def _get_state(self, state: str) -> JobState:
        assert state in _TestJobExecutor._STATE_MAP
        return _TestJobExecutor._STATE_MAP[state]
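The status listing this test executor parses is one whitespace-separated record per line: a native ID, a one-letter state code from _STATE_MAP, and an optional free-form message. A minimal sketch of that format (the sample lines below are hypothetical, not taken from the real mock qstat script):

# Hypothetical status output in the "<native_id> <state> [message]" shape that
# parse_status_output() splits with line.split(maxsplit=2).
sample_out = "17 R\n18 F node failure\n"
# parse_status_output(0, sample_out) would yield
# {'17': JobStatus(ACTIVE), '18': JobStatus(FAILED, message='node failure')}.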
Example #3
def __init__(self,
             url: Optional[str],
             config: Optional[LsfExecutorConfig] = None):
    """Initializes a :class:`~LsfJobExecutor`."""
    if not config:
        config = LsfExecutorConfig()
    super().__init__(config=config)
    self.generator = TemplatedScriptGenerator(
        config,
        Path(__file__).parent / "lsf" / "lsf.mustache")
Example #4
def __init__(self,
             url: Optional[str] = None,
             config: Optional[PBSProExecutorConfig] = None):
    """Initializes a :class:`~PBSProJobExecutor`."""
    if not config:
        config = PBSProExecutorConfig()
    super().__init__(url=url, config=config)
    self.generator = TemplatedScriptGenerator(
        config,
        Path(__file__).parent / 'pbspro' / 'pbspro.mustache')
Example #5
class LsfJobExecutor(BatchSchedulerExecutor):
    """A :class:`~psij.JobExecutor` for the LSF Workload Manager.

    The `IBM Spectrum LSF workload manager <https://www.ibm.com/docs/en/spectrum-lsf>`_
    is the system resource manager on LLNL's Sierra and Lassen, and ORNL's Summit.

    Uses the 'bsub', 'bjobs', and 'bkill' commands, respectively, to submit,
    monitor, and cancel jobs.

    Creates a batch script with #BSUB directives when submitting a job.
    """

    # see https://www.ibm.com/docs/en/spectrum-lsf/10.1.0?topic=bjobs-description
    _STATE_MAP = {
        "PEND": JobState.QUEUED,     # normal queued state
        "PROV": JobState.QUEUED,     # The job has been dispatched...
        "PSUSP": JobState.QUEUED,    # The job has been suspended... while pending.
        "RUN": JobState.ACTIVE,      # The job is currently running.
        "USUSP": JobState.ACTIVE,    # The job has been suspended... while running.
        "SSUSP": JobState.ACTIVE,    # The job has been suspended by LSF.
        "DONE": JobState.COMPLETED,  # normal 0 exit
        "EXIT": JobState.FAILED,     # nonzero exit
        "UNKWN": JobState.ACTIVE,    # mbatchd has lost contact with the job host...
        "WAIT": JobState.QUEUED,     # For... members of a chunk job waiting to run.
        "ZOMBI": JobState.ACTIVE,    # something unusual, but probably active
    }

    def __init__(self,
                 url: Optional[str],
                 config: Optional[LsfExecutorConfig] = None):
        """Initializes a :class:`~LsfJobExecutor`."""
        if not config:
            config = LsfExecutorConfig()
        super().__init__(config=config)
        self.generator = TemplatedScriptGenerator(
            config,
            Path(__file__).parent / "lsf" / "lsf.mustache")

    def generate_submit_script(self, job: Job, context: Dict[str, object],
                               submit_file: TextIO) -> None:
        """See :meth:`~.BatchSchedulerExecutor.generate_submit_script`."""
        assert job.spec is not None
        context["job_duration"] = int(
            job.spec.attributes.duration.total_seconds() // 60)
        self.generator.generate_submit_script(job, context, submit_file)

    def get_submit_command(self, job: Job,
                           submit_file_path: Path) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_submit_command`."""
        return ["bsub", str(submit_file_path.absolute())]

    def get_cancel_command(self, native_id: str) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_cancel_command`.

        ``bkill`` will exit with an error set if the job does not exist
        or has already finished.
        """
        return ["bkill", native_id]

    def process_cancel_command_output(self, exit_code: int, out: str) -> None:
        """See :meth:`~.BatchSchedulerExecutor.process_cancel_command_output`.

        Check if the error was raised only because a job already exited.
        """
        if _BKILL_FAILURE_REGEX.search(out) is None:
            raise SubmitException(out)

    def get_status_command(self, native_ids: Collection[str]) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_status_command`."""
        return [
            _BJOBS_COMMAND,
            "-o",
            "JOBID STAT EXIT_REASON KILL_REASON SUSPEND_REASON",
            "-json",
            "-a",
            *native_ids,
        ]

    def parse_status_output(self, exit_code: int,
                            out: str) -> Dict[str, JobStatus]:
        """See :meth:`~.BatchSchedulerExecutor.parse_status_output`.

        Iterate through the RECORDS entry, grabbing JOBID and STAT entries, as well
        as any state-change reasons if present.
        """
        check_status_exit_code(_BJOBS_COMMAND, exit_code, out)
        output = json.loads(out)
        status_map = {}
        for entry in output["RECORDS"]:
            if "ERROR" in entry:
                continue
            state = self._STATE_MAP[entry["STAT"]]
            message = None
            for reason in ("EXIT_REASON", "KILL_REASON", "SUSPEND_REASON"):
                if entry[reason]:
                    message = entry[reason]
                    break
            status_map[entry["JOBID"]] = JobStatus(state, message=message)
        return status_map

    def job_id_from_submit_output(self, out: str) -> str:
        """See :meth:`~.BatchSchedulerExecutor.job_id_from_submit_output`."""
        match = _BSUB_REGEX.search(out)
        if match is None:
            raise SubmitException(out)
        return match.group(0)[5:-1]
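For reference, bjobs run with -o ... -json wraps its results in a top-level RECORDS array, which is what parse_status_output iterates over. A minimal sketch of that shape (the values below are illustrative, not captured from a real cluster):

import json

# Illustrative bjobs -json payload; real output carries additional fields.
sample_out = json.dumps({
    "RECORDS": [
        {"JOBID": "1001", "STAT": "RUN",
         "EXIT_REASON": "", "KILL_REASON": "", "SUSPEND_REASON": ""},
        {"JOBID": "1002", "STAT": "EXIT",
         "EXIT_REASON": "job killed by owner", "KILL_REASON": "", "SUSPEND_REASON": ""},
    ]
})
# parse_status_output(0, sample_out) would map 1001 -> ACTIVE and 1002 -> FAILED,
# with the non-empty EXIT_REASON carried over as the status message.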
Example #6
class SlurmJobExecutor(BatchSchedulerExecutor):
    """A :class:`~psij.JobExecutor` for the Slurm Workload Manager.

    The `Slurm Workload Manager <https://slurm.schedmd.com/overview.html>`_ is a
    widely used resource manager running on machines such as
    NERSC's Perlmutter, as well as a variety of LLNL machines.

    Uses the 'sbatch', 'squeue', and 'scancel' commands, respectively, to submit,
    monitor, and cancel jobs.

    Creates a batch script with #SBATCH directives when submitting a job.
    """

    # see https://slurm.schedmd.com/squeue.html
    _STATE_MAP = {
        'BF': JobState.FAILED,
        'CA': JobState.CANCELED,
        'CD': JobState.COMPLETED,
        'CF': JobState.QUEUED,
        'CG': JobState.ACTIVE,
        'DL': JobState.FAILED,
        'F': JobState.FAILED,
        'NF': JobState.FAILED,
        'OOM': JobState.FAILED,
        'PD': JobState.QUEUED,
        'PR': JobState.FAILED,
        'R': JobState.ACTIVE,
        'RD': JobState.QUEUED,
        'RF': JobState.QUEUED,
        'RH': JobState.QUEUED,
        'RQ': JobState.ACTIVE,
        'SO': JobState.ACTIVE,
        'TO': JobState.FAILED,
        # TODO: double-check these
        'RS': JobState.ACTIVE,
        'RV': JobState.QUEUED,
        'SI': JobState.ACTIVE,
        'SE': JobState.ACTIVE,
        'ST': JobState.ACTIVE,
        'S': JobState.ACTIVE
    }

    # see https://slurm.schedmd.com/squeue.html
    _REASONS_MAP = {
        'AssociationJobLimit': 'The job\'s association has reached its maximum job count.',
        'AssociationResourceLimit': 'The job\'s association has reached some resource limit.',
        'AssociationTimeLimit': 'The job\'s association has reached its time limit.',
        'BadConstraints': 'The job\'s constraints can not be satisfied.',
        'BeginTime': 'The job\'s earliest start time has not yet been reached.',
        'Cleaning': 'The job is being requeued and still cleaning up from its previous execution.',
        'Dependency': 'This job is waiting for a dependent job to complete.',
        'FrontEndDown': 'No front end node is available to execute this job.',
        'InactiveLimit': 'The job reached the system InactiveLimit.',
        'InvalidAccount': 'The job\'s account is invalid.',
        'InvalidQOS': 'The job\'s QOS is invalid.',
        'JobHeldAdmin': 'The job is held by a system administrator.',
        'JobHeldUser': 'The job is held by the user.',
        'JobLaunchFailure': 'The job could not be launched. This may be due to a file system '
                            'problem, invalid program name, etc.',
        'Licenses': 'The job is waiting for a license.',
        'NodeDown': 'A node required by the job is down.',
        'NonZeroExitCode': 'The job terminated with a non-zero exit code.',
        'PartitionDown': 'The partition required by this job is in a DOWN state.',
        'PartitionInactive': 'The partition required by this job is in an Inactive state and not '
                             'able to start jobs.',
        'PartitionNodeLimit': 'The number of nodes required by this job is outside of its '
                              'partition\'s current limits. Can also indicate that required nodes '
                              'are DOWN or DRAINED.',
        'PartitionTimeLimit': 'The job\'s time limit exceeds its partition\'s current time limit.',
        'Priority': 'One or more higher priority jobs exist for this partition or advanced '
                    'reservation.',
        'Prolog': 'Its PrologSlurmctld program is still running.',
        'QOSJobLimit': 'The job\'s QOS has reached its maximum job count.',
        'QOSResourceLimit': 'The job\'s QOS has reached some resource limit.',
        'QOSTimeLimit': 'The job\'s QOS has reached its time limit.',
        'ReqNodeNotAvail': 'Some node specifically required by the job is not currently available. '
                           'The node may currently be in use, reserved for another job, in an '
                           'advanced reservation, DOWN, DRAINED, or not responding. Nodes which '
                           'are DOWN, DRAINED, or not responding will be identified as part of '
                           'the job\'s "reason" field as "UnavailableNodes". Such nodes will '
                           'typically require the intervention of a system administrator to make '
                           'available.',
        'Reservation': 'The job is waiting for its advanced reservation to become available.',
        'Resources': 'The job is waiting for resources to become available.',
        'SystemFailure': 'Failure of the Slurm system, a file system, the network, etc.',
        'TimeLimit': 'The job exhausted its time limit.',
        'QOSUsageThreshold': 'Required QOS threshold has been breached.',
        'WaitingForScheduling': 'No reason has been set for this job yet. Waiting for the '
                                'scheduler to determine the appropriate reason.'
    }

    def __init__(self, url: Optional[str] = None, config: Optional[SlurmExecutorConfig] = None):
        """Initializes a :class:`~SlurmJobExecutor`."""
        if not config:
            config = SlurmExecutorConfig()
        super().__init__(config=config)
        self.generator = TemplatedScriptGenerator(config, Path(__file__).parent / 'slurm'
                                                  / 'slurm.mustache')

    def generate_submit_script(self, job: Job, context: Dict[str, object],
                               submit_file: TextIO) -> None:
        """See :meth:`~.BatchSchedulerExecutor.generate_submit_script`."""
        self.generator.generate_submit_script(job, context, submit_file)

    def get_submit_command(self, job: Job, submit_file_path: Path) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_submit_command`."""
        return ['sbatch', str(submit_file_path.absolute())]

    def get_cancel_command(self, native_id: str) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_cancel_command`."""
        return ['scancel', '-Q', native_id]

    def process_cancel_command_output(self, exit_code: int, out: str) -> None:
        """See :meth:`~.BatchSchedulerExecutor.process_cancel_command_output`."""
        raise SubmitException('Failed to cancel job: %s' % out)

    def get_status_command(self, native_ids: Collection[str]) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_status_command`."""
        ids = ','.join(native_ids)

        # we're not really using job arrays, so this is equivalent to the job ID. However, if
        # we were to use arrays, this would return one ID for the entire array rather than
        # listing each element of the array independently
        return [_SQUEUE_COMMAND, '-O', 'JobArrayID,StateCompact,Reason', '-t', 'all', '-j', ids]

    def parse_status_output(self, exit_code: int, out: str) -> Dict[str, JobStatus]:
        """See :meth:`~.BatchSchedulerExecutor.parse_status_output`."""
        check_status_exit_code(_SQUEUE_COMMAND, exit_code, out)
        r = {}
        lines = iter(out.split('\n'))
        # skip header
        next(lines)
        for line in lines:
            if not line:
                continue
            cols = line.split()
            assert len(cols) == 3
            native_id = cols[0]
            state = self._get_state(cols[1])
            msg = self._get_message(cols[2]) if state == JobState.FAILED else None
            r[native_id] = JobStatus(state, message=msg)

        return r

    def _get_state(self, state: str) -> JobState:
        assert state in SlurmJobExecutor._STATE_MAP
        return SlurmJobExecutor._STATE_MAP[state]

    def _get_message(self, reason: str) -> str:
        assert reason in SlurmJobExecutor._REASONS_MAP
        return SlurmJobExecutor._REASONS_MAP[reason]

    def job_id_from_submit_output(self, out: str) -> str:
        """See :meth:`~.BatchSchedulerExecutor.job_id_from_submit_output`."""
        return out.strip().split()[-1]
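In normal use this class is not instantiated directly; jobs go through the generic PSI/J front end, which resolves executor names to implementations. A minimal sketch, assuming a working Slurm installation and the psij-python package with this executor registered under the name 'slurm':

from psij import Job, JobExecutor, JobSpec

# Resolves to SlurmJobExecutor through PSI/J's executor registry.
executor = JobExecutor.get_instance('slurm')
job = Job(JobSpec(executable='/bin/date'))
executor.submit(job)  # writes the #SBATCH script and invokes sbatch
job.wait()            # polls squeue until the job reaches a terminal state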
Example #7
class CobaltJobExecutor(BatchSchedulerExecutor):
    """A :class:`~psij.JobExecutor` for the Cobalt Workload Manager.

    The `Cobalt HPC Job Scheduler <https://xgitlab.cels.anl.gov/aig-public/cobalt>`_
    is used by `Argonne's <https://www.anl.gov>`_ `ALCF <https://www.alcf.anl.gov>`_ systems.

    Uses the ``qsub``, ``qstat``, and ``qdel`` commands, respectively, to submit,
    monitor, and cancel jobs.

    Creates a batch script with #COBALT directives when submitting a job.
    """

    # Cobalt job states as reported by qstat
    _STATE_MAP = {
        "starting": JobState.ACTIVE,
        "queued": JobState.QUEUED,
        "running": JobState.ACTIVE,
        "exiting": JobState.ACTIVE,
        "killing": JobState.FAILED,
    }

    def __init__(self,
                 url: Optional[str] = None,
                 config: Optional[CobaltExecutorConfig] = None):
        """Initializes a :class:`~CobaltJobExecutor`."""
        if not config:
            config = CobaltExecutorConfig()
        super().__init__(config=config)
        self.generator = TemplatedScriptGenerator(
            config,
            Path(__file__).parent / "cobalt" / "cobalt.mustache")

    def generate_submit_script(self, job: Job, context: Dict[str, object],
                               submit_file: TextIO) -> None:
        """See :meth:`~.BatchSchedulerExecutor.generate_submit_script`."""
        self.generator.generate_submit_script(job, context, submit_file)

    def get_submit_command(self, job: Job,
                           submit_file_path: Path) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_submit_command`."""
        str_path = str(submit_file_path.absolute())
        os.chmod(str_path, os.stat(str_path).st_mode | stat.S_IEXEC)
        return ["qsub", str_path]

    def get_cancel_command(self, native_id: str) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_cancel_command`."""
        return ["qdel", native_id]

    def process_cancel_command_output(self, exit_code: int, out: str) -> None:
        """See :meth:`~.BatchSchedulerExecutor.process_cancel_command_output`.

        This should be unnecessary because ``qdel`` only seems to fail on
        non-integer job IDs.
        """
        raise SubmitException("Failed job cancel job: %s" % out)

    def get_status_command(self, native_ids: Collection[str]) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_status_command`."""
        return [_QSTAT_COMMAND, "-l", "--header=Jobid:State", *native_ids]

    def parse_status_output(self, exit_code: int,
                            out: str) -> Dict[str, JobStatus]:
        """See :meth:`~.BatchSchedulerExecutor.parse_status_output`."""
        # if none of the job IDs passed to Cobalt are recognized, qstat returns 1,
        # but we shouldn't treat that as an error
        if exit_code != 0 and out == UNKNOWN_ERROR:
            return {}
        check_status_exit_code(_QSTAT_COMMAND, exit_code, out)
        job_statuses = {}
        index = 0
        lines = out.split("\n")
        while index < len(lines) - 1:
            jobid_match = _QSTAT_JOBID_REGEX.search(lines[index])
            if jobid_match is not None:
                state_match = _QSTAT_STATE_REGEX.search(lines[index + 1])
                if state_match is not None:
                    job_statuses[jobid_match.group(2)] = JobStatus(
                        self._STATE_MAP[state_match.group(2)])
                    index += 2
                else:
                    index += 1
            else:
                index += 1
        return job_statuses

    def job_id_from_submit_output(self, out: str) -> str:
        """See :meth:`~.BatchSchedulerExecutor.job_id_from_submit_output`."""
        match = _QSUB_REGEX.search(out)
        if match is None:
            raise SubmitException(out)
        return match.group(0)
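parse_status_output scans consecutive line pairs because qstat -l --header=Jobid:State prints each job as a Jobid line followed by a State line. A minimal sketch of that shape (illustrative values; the exact patterns live in _QSTAT_JOBID_REGEX and _QSTAT_STATE_REGEX, which are assumed to capture the value in group(2)):

# Illustrative qstat -l --header=Jobid:State output for two jobs.
sample_out = (
    "Jobid: 4242\n"
    "State: running\n"
    "Jobid: 4243\n"
    "State: queued\n"
)
# parse_status_output(0, sample_out) would map '4242' -> ACTIVE and
# '4243' -> QUEUED via _STATE_MAP.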
Example #8
def __init__(self, url: Optional[str] = None, config: Optional[_TestExecutorConfig] = None):
    if not config:
        config = _TestExecutorConfig()
    super().__init__(config=config)
    self.generator = TemplatedScriptGenerator(config, Path(__file__).parent / 'test'
                                              / 'test.mustache')
Example #9
class PBSProJobExecutor(BatchSchedulerExecutor):
    """A :class:`~psij.JobExecutor` for PBS Pro.

    `PBS Pro <https://www.altair.com/pbs-professional/>`_ is a resource manager
    on certain machines at Argonne National Lab, among others.

    Uses the 'qsub', 'qstat', and 'qdel' commands, respectively, to submit,
    monitor, and cancel jobs.

    Creates a batch script with #PBS directives when submitting a job.
    """
    def __init__(self,
                 url: Optional[str] = None,
                 config: Optional[PBSProExecutorConfig] = None):
        """Initializes a :class:`~PBSProJobExecutor`."""
        if not config:
            config = PBSProExecutorConfig()
        super().__init__(url=url, config=config)
        self.generator = TemplatedScriptGenerator(
            config,
            Path(__file__).parent / 'pbspro' / 'pbspro.mustache')

    # Submit methods

    def generate_submit_script(self, job: Job, context: Dict[str, object],
                               submit_file: TextIO) -> None:
        """See :meth:`~.BatchSchedulerExecutor.generate_submit_script`."""
        self.generator.generate_submit_script(job, context, submit_file)

    def get_submit_command(self, job: Job,
                           submit_file_path: Path) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_submit_command`."""
        return ['qsub', str(submit_file_path.absolute())]

    def job_id_from_submit_output(self, out: str) -> str:
        """See :meth:`~.BatchSchedulerExecutor.job_id_from_submit_output`."""
        return out.strip().split()[-1]

    # Cancel methods

    def get_cancel_command(self, native_id: str) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_cancel_command`."""
        # the Slurm cancel command had a -Q parameter, which suppresses
        # the error reported if the job is already completed.
        # TODO: what's the PBS equivalent of that?
        # there is -x, which also removes job history (so we would need to
        # check that this doesn't cause implicit COMPLETED states when
        # maybe they should be CANCELED states)
        return ['qdel', native_id]

    def process_cancel_command_output(self, exit_code: int, out: str) -> None:
        """See :meth:`~.BatchSchedulerExecutor.process_cancel_command_output`."""
        raise SubmitException('Failed to cancel job: %s' % out)

    # Status methods

    def get_status_command(self, native_ids: Collection[str]) -> List[str]:
        """See :meth:`~.BatchSchedulerExecutor.get_status_command`."""
        # -x will include finished jobs
        # -f -F json will give JSON status output that is more mechanically
        # parseable than the default human-readable output. Most importantly,
        # native job IDs will be full length and so match up with the IDs
        # returned by qsub (123.a vs 123.a.domain.foo).
        return [_QSTAT_COMMAND, '-f', '-F', 'json', '-x'] + list(native_ids)

    def parse_status_output(self, exit_code: int,
                            out: str) -> Dict[str, JobStatus]:
        """See :meth:`~.BatchSchedulerExecutor.parse_status_output`."""
        check_status_exit_code(_QSTAT_COMMAND, exit_code, out)
        r = {}

        report = json.loads(out)
        jobs = report['Jobs']
        for native_id in jobs:
            job_report = jobs[native_id]
            native_state = job_report["job_state"]
            state = self._get_state(native_state)

            if state == JobState.COMPLETED:
                # PBS reports termination by a signal as 256 + the signal
                # number, so an Exit_status of 265 means killed by SIGKILL (9).
                exit_status = job_report.get('Exit_status')
                if exit_status == 265:
                    state = JobState.CANCELED
                elif exit_status is not None and exit_status != 0:
                    state = JobState.FAILED

            msg = job_report["comment"]
            r[native_id] = JobStatus(state, message=msg)

        return r

    def _get_state(self, state: str) -> JobState:
        assert state in _STATE_MAP, f"PBS state {state} is not known to PSI/J"
        return _STATE_MAP[state]
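The JSON produced by qstat -f -F json -x keys each job's record by its full native ID under a top-level 'Jobs' object. A minimal sketch of the fields this parser reads (illustrative values, and assuming the module-level _STATE_MAP maps PBS's 'F', finished, to COMPLETED):

import json

# Illustrative qstat -f -F json -x payload; real output has many more fields.
sample_out = json.dumps({
    "Jobs": {
        "123.headnode": {"job_state": "R", "comment": "Job run"},
        "124.headnode": {"job_state": "F", "Exit_status": 265,
                         "comment": "job killed"},
    }
})
# parse_status_output(0, sample_out) would report 123.headnode as ACTIVE, while
# 124.headnode's Exit_status of 265 would turn its COMPLETED mapping into CANCELED.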