Example #1
    def prepare(self, commands, environs=None, **gen_opts):
        environs = environs or []
        if self.num_tasks <= 0:
            num_tasks_per_node = self.num_tasks_per_node or 1
            min_num_tasks = (-self.num_tasks
                             if self.num_tasks else num_tasks_per_node)

            try:
                guessed_num_tasks = self.guess_num_tasks()
            except NotImplementedError as e:
                raise JobError('flexible node allocation is not supported by '
                               'this backend') from e

            if guessed_num_tasks < min_num_tasks:
                raise JobError(
                    'could not satisfy the minimum task requirement: '
                    'required %s, found %s' %
                    (min_num_tasks, guessed_num_tasks))

            self.num_tasks = guessed_num_tasks
            getlogger().debug('flex_alloc_nodes: setting num_tasks to %s' %
                              self.num_tasks)

        with shell.generate_script(self.script_filename,
                                   **gen_opts) as builder:
            builder.write_prolog(self.scheduler.emit_preamble(self))
            builder.write(runtime.emit_loadenv_commands(*environs))
            for c in commands:
                builder.write_body(c)
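Both variants of prepare() use the same flexible-allocation convention for num_tasks. A worked illustration with made-up numbers of how min_num_tasks is derived:

# Illustration with made-up numbers of the convention used above:
# num_tasks == 0 asks for at least one node's worth of tasks, while a
# negative value -N asks for at least N tasks; in both cases num_tasks
# is later replaced by whatever guess_num_tasks() finds.
num_tasks = -16
num_tasks_per_node = 4

min_num_tasks = -num_tasks if num_tasks else num_tasks_per_node
assert min_num_tasks == 16   # guess_num_tasks() must return at least 16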
Example #2
    def prepare(self, commands, environs=None, **gen_opts):
        environs = environs or []
        if self.num_tasks <= 0:
            num_tasks_per_node = self.num_tasks_per_node or 1
            min_num_tasks = (-self.num_tasks
                             if self.num_tasks else num_tasks_per_node)

            try:
                guessed_num_tasks = self.guess_num_tasks()
            except NotImplementedError as e:
                raise JobError('flexible task allocation is not supported by '
                               'this backend') from e

            if guessed_num_tasks < min_num_tasks:
                nodes_required = min_num_tasks // num_tasks_per_node
                nodes_found = guessed_num_tasks // num_tasks_per_node
                raise JobError('could not find enough nodes: '
                               'required %s, found %s' %
                               (nodes_required, nodes_found))

            self._num_tasks = guessed_num_tasks
            getlogger().debug('flex_alloc_tasks: setting num_tasks to %s' %
                              self._num_tasks)

        with shell.generate_script(self.script_filename,
                                   **gen_opts) as builder:
            builder.write_prolog(self.emit_preamble())
            for e in environs:
                builder.write(e.emit_load_commands())

            for c in commands:
                builder.write_body(c)
Example #3
    def guess_num_tasks(self):
        if isinstance(self.sched_flex_alloc_tasks, int):
            if self.sched_flex_alloc_tasks <= 0:
                raise JobError('invalid number of flex_alloc_tasks: %s' %
                               self.sched_flex_alloc_tasks)

            return self.sched_flex_alloc_tasks

        available_nodes = self.get_all_nodes()
        getlogger().debug('flex_alloc_tasks: total available nodes %s ' %
                          len(available_nodes))

        # Try to guess the number of tasks now
        available_nodes = self.filter_nodes(available_nodes,
                                            self.sched_access + self.options)

        if self.sched_flex_alloc_tasks == 'idle':
            available_nodes = {n for n in available_nodes
                               if n.is_available()}
            getlogger().debug(
                'flex_alloc_tasks: selecting idle nodes: '
                'available nodes now: %s' % len(available_nodes))

        num_tasks_per_node = self.num_tasks_per_node or 1
        num_tasks = len(available_nodes) * num_tasks_per_node
        return num_tasks
Example #4
    def guess_num_tasks(self):
        num_tasks_per_node = self.num_tasks_per_node or 1
        if isinstance(self.sched_flex_alloc_nodes, int):
            if self.sched_flex_alloc_nodes <= 0:
                raise JobError('invalid number of flex_alloc_nodes: %s' %
                               self.sched_flex_alloc_nodes)

            return self.sched_flex_alloc_nodes * num_tasks_per_node

        available_nodes = self.scheduler.allnodes()
        getlogger().debug(f'[F] Total available nodes: {len(available_nodes)}')

        # Try to guess the number of tasks now
        available_nodes = self.scheduler.filternodes(self, available_nodes)
        if self.sched_flex_alloc_nodes.casefold() != 'all':
            available_nodes = {
                n
                for n in available_nodes
                if n.in_state(self.sched_flex_alloc_nodes)
            }
            getlogger().debug(f'[F] Selecting nodes in state '
                              f'{self.sched_flex_alloc_nodes!r}: '
                              f'available nodes now: {len(available_nodes)}')

        return len(available_nodes) * num_tasks_per_node
Example #5
    def _extract_attribute(self, attr_name, node_descr):
        attr_match = re.search(r'%s=(\S+)' % attr_name, node_descr)
        if attr_match:
            return attr_match.group(1)
        else:
            raise JobError("could not extract attribute '%s' from "
                           "node description" % attr_name)
Example #6
    def get_all_nodes(self):
        try:
            completed = _run_strict('scontrol -a show -o nodes')
        except SpawnedProcessError as e:
            raise JobError('could not retrieve node information') from e

        node_descriptions = completed.stdout.splitlines()
        return {SlurmNode(descr) for descr in node_descriptions}
Example #7
    def allnodes(self):
        try:
            completed = _run_strict('scontrol -a show -o nodes')
        except SpawnedProcessError as e:
            raise JobError('could not retrieve node information') from e

        node_descriptions = completed.stdout.splitlines()
        return _create_nodes(node_descriptions)
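Many of these examples call a `_run_strict` helper that is not shown on this page. Judging from its use, it is `run_command` with `check=True`, so a non-zero exit status raises SpawnedProcessError; a minimal sketch, assuming the utility module layout of recent ReFrame releases:

import functools

import reframe.utility.osext as osext   # called `os_ext` in older releases

# Sketch: run a command synchronously, capture stdout/stderr and raise
# SpawnedProcessError when the command exits with a non-zero status.
_run_strict = functools.partial(osext.run_command, check=True)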
Example #8
    def _cancel_if_pending_too_long(self, job):
        if not job.max_pending_time or not slurm_state_pending(job.state):
            return

        t_pending = time.time() - job.submit_time
        if t_pending >= job.max_pending_time:
            self.cancel(job)
            job._exception = JobError('maximum pending time exceeded')
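The `slurm_state_pending` helper used here (and `slurm_state_completed` in later examples) are small predicates over Slurm's state string, which may be a compound, comma-separated value. A rough sketch with deliberately abridged state sets:

def slurm_state_completed(state):
    # Sketch only; the real set of terminal states is larger (e.g.
    # BOOT_FAIL, DEADLINE, NODE_FAIL, OUT_OF_MEMORY, PREEMPTED).
    completion_states = {'CANCELLED', 'COMPLETED', 'FAILED', 'TIMEOUT'}
    return bool(state) and all(s in completion_states
                               for s in state.split(','))


def slurm_state_pending(state):
    # Sketch only; "pending" covers jobs that are queued or held but not
    # yet running.
    pending_states = {'PENDING', 'CONFIGURING', 'REQUEUE_HOLD',
                      'RESV_DEL_HOLD'}
    return bool(state) and all(s in pending_states
                               for s in state.split(','))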
Example #9
    def _get_nodes_by_name(self, nodespec):
        try:
            completed = _run_strict('scontrol -a show -o node %s' % nodespec)
        except SpawnedProcessError as e:
            raise JobError('could not retrieve the node description '
                           'of nodes: %s' % nodespec) from e

        node_descriptions = completed.stdout.splitlines()
        return {SlurmNode(descr) for descr in node_descriptions}
Example #10
    def submit(self):
        cmd = 'sbatch %s' % self.script_filename
        completed = _run_strict(cmd, timeout=settings().job_submit_timeout)
        jobid_match = re.search(r'Submitted batch job (?P<jobid>\d+)',
                                completed.stdout)
        if not jobid_match:
            raise JobError(
                'could not retrieve the job id of the submitted job')

        self._jobid = int(jobid_match.group('jobid'))
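The job id is recovered purely by parsing sbatch's standard output; a tiny self-contained check of that regular expression against a made-up output line:

import re

# Made-up sbatch output used only to exercise the job-id regex above.
out = 'Submitted batch job 123456\n'
jobid_match = re.search(r'Submitted batch job (?P<jobid>\d+)', out)
assert jobid_match and int(jobid_match.group('jobid')) == 123456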
Example #11
    def __init__(self, node_descr):
        self._name = self._extract_attribute('NodeName', node_descr)
        if not self._name:
            raise JobError('could not extract NodeName from node description')

        self._partitions = self._extract_attribute(
            'Partitions', node_descr, sep=',')
        self._active_features = self._extract_attribute(
            'ActiveFeatures', node_descr, sep=',')
        self._states = self._extract_attribute('State', node_descr, sep='+')
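This constructor passes a `sep` keyword, so it assumes a slightly richer `_extract_attribute` than the one shown in Example #5: one that splits the matched value on the separator and returns None for a missing attribute (hence the `if not self._name` check). A sketch of such a variant, written here as a plain function:

import re

def _extract_attribute(attr_name, node_descr, sep=None):
    # Sketch of a sep-aware variant: like Example #5, but optionally split
    # the matched value on `sep` and return None when the attribute is
    # missing instead of raising.
    match = re.search(r'%s=(\S+)' % attr_name, node_descr)
    if match:
        attr = match.group(1)
        return attr.split(sep) if sep else attr

    return None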
Example #12
def test_finished_raises_error(make_job, exec_ctx):
    minimal_job = make_job(sched_access=exec_ctx.access)
    prepare_job(minimal_job, 'echo hello')
    minimal_job.submit()
    minimal_job.wait()

    # Emulate an error during polling and verify that it is raised correctly
    # when finished() is called
    minimal_job._exception = JobError('fake error')
    with pytest.raises(JobError, match='fake error'):
        minimal_job.finished()
Example #13
    def prepare(self, builder):
        if self.num_tasks == 0:
            if self.sched_reservation:
                nodes = self._get_reservation_nodes()
                num_nodes = self._count_compatible_nodes(nodes)
                getlogger().debug(
                    'found %s available node(s) in reservation %s' %
                    (num_nodes, self.sched_reservation))
                if num_nodes == 0:
                    raise JobError("could not find any node satisfying the "
                                   "required criteria in reservation '%s'" %
                                   self.sched_reservation)
                num_tasks_per_node = self.num_tasks_per_node or 1
                self._num_tasks = num_nodes * num_tasks_per_node
                getlogger().debug('automatically setting num_tasks to %s' %
                                  self.num_tasks)
            else:
                raise JobError('a reservation has to be specified '
                               'when num_tasks is set to 0')

        super().prepare(builder)
Example #14
    def _get_reservation_nodes(self, reservation):
        completed = _run_strict('scontrol -a show res %s' % reservation)
        node_match = re.search(r'(Nodes=\S+)', completed.stdout)
        if node_match:
            reservation_nodes = node_match[1]
        else:
            raise JobError("could not extract the nodes names for "
                           "reservation '%s'" % valid_reservation)

        completed = _run_strict('scontrol -a show -o %s' % reservation_nodes)
        node_descriptions = completed.stdout.splitlines()
        return {SlurmNode(descr) for descr in node_descriptions}
Example #15
    def _get_excluded_node_names(self):
        if not self.sched_exclude_nodelist:
            return set()

        command = 'scontrol show -o node %s' % self.sched_exclude_nodelist
        try:
            completed = os_ext.run_command(command, check=True)
        except SpawnedProcessError as e:
            raise JobError('could not retrieve the node description '
                           'of nodes: %s' % self.sched_exclude_nodelist) from e

        node_descriptions = completed.stdout.splitlines()
        slurm_nodes = (SlurmNode(descr) for descr in node_descriptions)
        return {n.name for n in slurm_nodes}
Example #16
    def submit(self):
        # `-o` and `-e` options are only recognized in command line by the PBS
        # Slurm wrappers.
        cmd = 'qsub -o %s -e %s %s' % (self.stdout, self.stderr,
                                       self.script_filename)
        completed = self._run_command(cmd, settings().job_submit_timeout)
        jobid_match = re.search(r'^(?P<jobid>\S+)', completed.stdout)
        if not jobid_match:
            raise JobError('could not retrieve the job id '
                           'of the submitted job')

        jobid, *info = jobid_match.group('jobid').split('.', maxsplit=2)
        self._jobid = int(jobid)
        if info:
            self._pbs_server = info[0]
Example #17
    def _get_reservation_nodes(self):
        command = 'scontrol show res %s' % self.sched_reservation
        completed = os_ext.run_command(command, check=True)
        node_match = re.search(r'(Nodes=\S+)', completed.stdout)
        if node_match:
            reservation_nodes = node_match[1]
        else:
            raise JobError("could not extract the nodes names for "
                           "reservation '%s'" % self.sched_reservation)

        completed = os_ext.run_command('scontrol show -o %s' %
                                       reservation_nodes,
                                       check=True)
        node_descriptions = completed.stdout.splitlines()
        return (SlurmNode(descr) for descr in node_descriptions)
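The reservation parsing hinges on grabbing the whole `Nodes=...` token from the `scontrol show res` output; a small check of that regular expression against a made-up, abridged output line:

import re

# Made-up `scontrol show res` output, abridged to the relevant fields.
res_out = ('ReservationName=maintenance StartTime=2023-01-01T00:00:00 '
           'Nodes=nid00[001-004] NodeCnt=4')
node_match = re.search(r'(Nodes=\S+)', res_out)
assert node_match and node_match[1] == 'Nodes=nid00[001-004]'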
Example #18
    def finished(self, job):
        try:
            self._update_state(job)
        except JobError as e:
            # We ignore these exceptions at this point and we simply mark the
            # job as unfinished.
            getlogger().debug('ignoring error during polling: %s' % e)
            return False
        else:
            if job.max_pending_time and job.state in [
                    'QUEUED', 'HELD', 'WAITING'
            ]:
                if datetime.now() - self._submit_time >= job.max_pending_time:
                    self.cancel(job)
                    raise JobError('maximum pending time exceeded',
                                   jobid=job.jobid)

            return job.state == 'COMPLETED'
Example #19
    def finished(self, job):
        try:
            self._update_state(job)
        except JobBlockedError:
            # Job blocked forever; reraise the exception to notify our caller
            raise
        except JobError as e:
            # We ignore these exceptions at this point and we simply mark the
            # job as unfinished.
            getlogger().debug('ignoring error during polling: %s' % e)
            return False
        else:
            if job.max_pending_time and slurm_state_pending(job.state):
                if datetime.now() - self._submit_time >= job.max_pending_time:
                    self.cancel(job)
                    raise JobError('maximum pending time exceeded',
                                   jobid=job.jobid)

            return slurm_state_completed(job.state)
Example #20
    def _update_state(self, job):
        '''Check the status of the job.'''

        completed = os_ext.run_command('qstat -f %s' % job.jobid)

        # Depending on the configuration, completed jobs will remain on the job
        # list for a limited time, or be removed upon completion.
        # If qstat cannot find the jobid, it returns code 153.
        if completed.returncode == 153:
            getlogger().debug(
                'jobid not known by scheduler, assuming job completed')
            job.state = 'COMPLETED'
            return

        if completed.returncode != 0:
            raise JobError('qstat failed: %s' % completed.stderr, job.jobid)

        nodelist_match = re.search(r'exec_host = (?P<nodespec>[\S\t\n]+)',
                                   completed.stdout, re.MULTILINE)
        if nodelist_match:
            nodespec = nodelist_match.group('nodespec')
            nodespec = re.sub(r'[\n\t]*', '', nodespec)
            self._set_nodelist(job, nodespec)

        state_match = re.search(r'^\s*job_state = (?P<state>[A-Z])',
                                completed.stdout, re.MULTILINE)
        if not state_match:
            getlogger().debug('job state not found (stdout follows)\n%s' %
                              completed.stdout)
            return

        state = state_match.group('state')
        job.state = JOB_STATES[state]
        if job.state == 'COMPLETED':
            code_match = re.search(
                r'^\s*exit_status = (?P<code>\d+)',
                completed.stdout,
                re.MULTILINE,
            )
            if not code_match:
                return

            job.exitcode = int(code_match.group('code'))
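The JOB_STATES mapping referenced here translates PBS's single-letter job_state codes into the scheduler-neutral names used elsewhere in these examples. A sketch of what such a table typically looks like; the exact codes vary between PBS Pro and Torque, so treat it as illustrative:

# Illustrative mapping of PBS single-letter job_state codes; actual codes
# differ between PBS variants, so verify against your installation.
JOB_STATES = {
    'Q': 'QUEUED',
    'H': 'HELD',
    'R': 'RUNNING',
    'E': 'EXITING',
    'T': 'MOVED',
    'W': 'WAITING',
    'S': 'SUSPENDED',
    'C': 'COMPLETED',
}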
Example #21
    def guess_num_tasks(self):
        num_tasks_per_node = self.num_tasks_per_node or 1
        if isinstance(self.sched_flex_alloc_nodes, int):
            if self.sched_flex_alloc_nodes <= 0:
                raise JobError('invalid number of flex_alloc_nodes: %s' %
                               self.sched_flex_alloc_nodes)

            return self.sched_flex_alloc_nodes * num_tasks_per_node

        available_nodes = self.scheduler.allnodes()
        getlogger().debug('flex_alloc_nodes: total available nodes %s ' %
                          len(available_nodes))

        # Try to guess the number of tasks now
        available_nodes = self.scheduler.filternodes(self, available_nodes)
        if self.sched_flex_alloc_nodes == 'idle':
            available_nodes = {n for n in available_nodes if n.is_available()}
            getlogger().debug('flex_alloc_nodes: selecting idle nodes: '
                              'available nodes now: %s' % len(available_nodes))

        return len(available_nodes) * num_tasks_per_node
Example #22
    def finished(self, job):
        try:
            self._update_state(job)
        except JobError as e:
            # We ignore these exceptions at this point and we simply mark the
            # job as unfinished.
            getlogger().debug('ignoring error during polling: %s' % e)
            return False
        else:
            if job.max_pending_time and job.state in [
                    'QUEUED', 'HELD', 'WAITING'
            ]:
                if datetime.now() - self._submit_time >= job.max_pending_time:
                    self.cancel(job)
                    raise JobError('maximum pending time exceeded',
                                   jobid=job.jobid)

            stdout = os.path.join(job.workdir, job.stdout)
            stderr = os.path.join(job.workdir, job.stderr)
            output_ready = os.path.exists(stdout) and os.path.exists(stderr)
            done = self._cancelled or output_ready
            return job.state == 'COMPLETED' and done
Example #23
    def wait(self, job):
        # Quickly return in case we have finished already
        if slurm_state_completed(job.state):
            if self.is_array(job):
                self._merge_files(job)

            return

        intervals = itertools.cycle([1, 2, 3])
        self._update_state(job)

        while not slurm_state_completed(job.state):
            if job.max_pending_time and slurm_state_pending(job.state):
                if datetime.now() - self._submit_time >= job.max_pending_time:
                    self.cancel(job)
                    raise JobError('maximum pending time exceeded',
                                   jobid=job.jobid)

            time.sleep(next(intervals))
            self._update_state(job)

        if self.is_array(job):
            self._merge_files(job)
Example #24
    def poll(self, *jobs):
        def output_ready(job):
            # We report a job as finished only when its stdout/stderr are
            # written back to the working directory
            stdout = os.path.join(job.workdir, job.stdout)
            stderr = os.path.join(job.workdir, job.stderr)
            return os.path.exists(stdout) and os.path.exists(stderr)

        if jobs:
            # Filter out non-jobs
            jobs = [job for job in jobs if job is not None]

        if not jobs:
            return

        completed = osext.run_command(
            f'qstat -f {" ".join(job.jobid for job in jobs)}')

        # Depending on the configuration, completed jobs will remain on the job
        # list for a limited time, or be removed upon completion.
        # If qstat cannot find any of the job IDs, it will return 153.
        # Otherwise, it will return with return code 0 and print information
        # only for the jobs it could find.
        if completed.returncode in (153, 35):
            self.log(f'Return code is {completed.returncode}')
            for job in jobs:
                job._state = 'COMPLETED'
                if job.cancelled or output_ready(job):
                    self.log(f'Assuming job {job.jobid} completed')
                    job._completed = True

            return

        if completed.returncode != 0:
            raise JobSchedulerError(
                f'qstat failed with exit code {completed.returncode} '
                f'(standard error follows):\n{completed.stderr}')

        # Store information for each job separately
        jobinfo = {}
        for job_raw_info in completed.stdout.split('\n\n'):
            jobid_match = re.search(r'^Job Id:\s*(?P<jobid>\S+)', job_raw_info,
                                    re.MULTILINE)
            if jobid_match:
                jobid = jobid_match.group('jobid')
                jobinfo[jobid] = job_raw_info

        for job in jobs:
            if job.jobid not in jobinfo:
                self.log(f'Job {job.jobid} not known to scheduler')
                job._state = 'COMPLETED'
                if job.cancelled or output_ready(job):
                    self.log(f'Assuming job {job.jobid} completed')
                    job._completed = True

                continue

            info = jobinfo[job.jobid]
            state_match = re.search(r'^\s*job_state = (?P<state>[A-Z])', info,
                                    re.MULTILINE)
            if not state_match:
                self.log(f'Job state not found (job info follows):\n{info}')
                continue

            state = state_match.group('state')
            job._state = JOB_STATES[state]
            nodelist_match = re.search(r'exec_host = (?P<nodespec>[\S\t\n]+)',
                                       info, re.MULTILINE)
            if nodelist_match:
                nodespec = nodelist_match.group('nodespec')
                nodespec = re.sub(r'[\n\t]*', '', nodespec)
                self._update_nodelist(job, nodespec)

            if job.state == 'COMPLETED':
                exitcode_match = re.search(
                    r'^\s*exit_status = (?P<code>\d+)',
                    info,
                    re.MULTILINE,
                )
                if exitcode_match:
                    job._exitcode = int(exitcode_match.group('code'))

                # We report a job as finished only when its stdout/stderr are
                # written back to the working directory
                done = job.cancelled or output_ready(job)
                if done:
                    job._completed = True
            elif (job.state in ['QUEUED', 'HELD', 'WAITING']
                  and job.max_pending_time):
                if (time.time() - job.submit_time >= job.max_pending_time):
                    self.cancel(job)
                    job._exception = JobError('maximum pending time exceeded',
                                              job.jobid)
Example #25
    def poll(self, *jobs):
        if jobs:
            # Filter out non-jobs
            jobs = [job for job in jobs if job is not None]

        if not jobs:
            return

        for job in jobs:
            completed = _run_strict(f'oarstat -fj {job.jobid}')

            # Store information for each job separately
            jobinfo = {}

            # Typical oarstat -fj <job_id> output:
            # https://github.com/oar-team/oar/blob/0fccc4fc3bb86ee935ce58effc5aec514a3e155d/sources/core/qfunctions/oarstat#L310
            job_raw_info = completed.stdout
            jobid_match = re.search(r'^Job_Id:\s*(?P<jobid>\S+)',
                                    completed.stdout, re.MULTILINE)
            if jobid_match:
                jobid = jobid_match.group('jobid')
                jobinfo[jobid] = job_raw_info

            if job.jobid not in jobinfo:
                self.log(f'Job {job.jobid} not known to scheduler, '
                         f'assuming job completed')
                job._state = 'Terminated'
                job._completed = True
                continue

            info = jobinfo[job.jobid]
            state_match = re.search(r'^\s*state = (?P<state>[A-Z]\S+)', info,
                                    re.MULTILINE)
            if not state_match:
                self.log(f'Job state not found (job info follows):\n{info}')
                continue

            job._state = state_match.group('state')
            if oar_state_completed(job.state):
                exitcode_match = re.search(
                    r'^\s*exit_code = (?P<code>\d+)',
                    info,
                    re.MULTILINE,
                )

                if exitcode_match:
                    job._exitcode = int(exitcode_match.group('code'))

                # We report a job as finished only when its stdout/stderr are
                # written back to the working directory
                stdout = os.path.join(job.workdir, job.stdout)
                stderr = os.path.join(job.workdir, job.stderr)
                out_ready = os.path.exists(stdout) and os.path.exists(stderr)
                done = job.cancelled or out_ready
                if done:
                    job._completed = True
            elif oar_state_pending(job.state) and job.max_pending_time:
                if time.time() - job.submit_time >= job.max_pending_time:
                    self.cancel(job)
                    job._exception = JobError('maximum pending time exceeded',
                                              job.jobid)
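The `oar_state_completed` and `oar_state_pending` predicates are not shown in these examples; like their Slurm counterparts, they classify the state string reported by oarstat. A rough sketch with state sets that may not be exhaustive:

def oar_state_completed(state):
    # Sketch only: OAR reports finished jobs as 'Terminated' or 'Error'.
    completion_states = {'Error', 'Terminated'}
    return bool(state) and all(s in completion_states
                               for s in state.split(','))


def oar_state_pending(state):
    # Sketch only; the real set of pending states may be larger.
    pending_states = {'Waiting', 'toLaunch', 'Launching', 'Hold',
                      'toAckReservation', 'Suspended', 'Resuming'}
    return bool(state) and all(s in pending_states
                               for s in state.split(','))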
Example #26
    def _run_command(self, cmd, timeout=None):
        """Run command ``cmd`` and re-raise any spawn error as a JobError."""
        try:
            return os_ext.run_command(cmd, check=True, timeout=timeout)
        except SpawnedProcessError as e:
            raise JobError(jobid=self._jobid) from e
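The point of this wrapper is that scheduler backends only ever raise JobError, while the original failure stays reachable through exception chaining (`raise ... from e`). A self-contained illustration using stand-in exception classes:

class SpawnedProcessError(Exception):
    # Stand-in for the real exception type, for illustration only.
    pass


class JobError(Exception):
    # Stand-in for the real exception type, for illustration only.
    def __init__(self, msg=None, jobid=None):
        super().__init__(msg)
        self.jobid = jobid


def run_command(cmd):
    # Pretend the spawned command failed.
    raise SpawnedProcessError('command failed: %s' % cmd)


try:
    try:
        run_command('qdel 12345')
    except SpawnedProcessError as e:
        raise JobError(jobid=12345) from e
except JobError as err:
    # `from e` stores the original error on __cause__, so the command's
    # exit details remain available for debugging.
    print(type(err.__cause__).__name__, err.__cause__)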