Example #1
def test_job_queue__is_full():
    """Verify the queue reports full once submissions reach its capacity."""
    duration = 10
    jobs = [FakeJob(str(i), duration) for i in range(4)]
    queue = JobQueue(2, poll_interval=1)
    assert not queue.is_full()
    for job in jobs:
        queue.submit(job)
    assert queue.is_full()
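
The test only exercises submit() and is_full(). For context, below is a minimal sketch of a JobQueue consistent with that behavior; the internal names (_max_jobs, _outstanding, _pending) and the job.run() call are illustrative assumptions, not the real implementation.

class JobQueue:
    """Minimal sketch of the queue surface used by the test above (assumed internals)."""

    def __init__(self, max_jobs, existing_jobs=None, poll_interval=60):
        self._max_jobs = max_jobs
        self._poll_interval = poll_interval
        self._outstanding = list(existing_jobs or [])  # jobs currently running
        self._pending = []  # jobs waiting for capacity

    def is_full(self):
        # The queue is full once the running jobs reach the cap.
        return len(self._outstanding) >= self._max_jobs

    def submit(self, job):
        # Start the job if there is capacity; otherwise hold it until a
        # later poll detects a completion and frees a slot.
        if self.is_full():
            self._pending.append(job)
        else:
            job.run()  # assumed job interface
            self._outstanding.append(job)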
Example #2
    def run(self):
        """Try to submit batches of jobs to the HPC.

        Returns
        -------
        bool
            Returns True if all jobs are complete.

        """
        starting_batch_index = self._batch_index
        # TODO: consider whether we need to save the real job names
        hpc_submitters = [
            AsyncHpcSubmitter.create_from_id(self._hpc_mgr,
                                             self._status_collector, x)
            for x in self._cluster.iter_hpc_job_ids()
        ]

        queue = JobQueue(
            self._max_nodes,
            existing_jobs=hpc_submitters,
            poll_interval=self._poll_interval,
        )
        # Statuses may have changed since we last ran.
        # Persistent network errors could cause this submitter to fail.
        # Another submitter will try again later (unless this is the last submitter).
        queue.process_queue()
        completed_job_names, canceled_jobs = self._update_completed_jobs()

        lock_file = Path(self._output) / self.LOCK_FILENAME
        if lock_file.exists():
            raise Exception(
                f"{lock_file} exists. A previous submitter crashed in an unknown state."
            )
        lock_file.touch()

        # Start submitting jobs. If any unexpected exception prevents us from updating the
        # status file, leave the lock_file in place and intentionally cause a deadlock.
        try:
            blocked_jobs = []
            submitted_jobs = []
            for group in self._cluster.config.submission_groups:
                if not queue.is_full():
                    self._submit_batches(queue, group, blocked_jobs,
                                         submitted_jobs)

            num_submissions = self._batch_index - starting_batch_index
            logger.info(
                "num_batches=%s num_submitted=%s num_blocked=%s new_completions=%s",
                num_submissions,
                len(submitted_jobs),
                len(blocked_jobs),
                len(completed_job_names),
            )

            hpc_job_ids = sorted([x.job_id for x in queue.outstanding_jobs])
            self._update_status(
                submitted_jobs,
                blocked_jobs,
                canceled_jobs,
                hpc_job_ids,
                completed_job_names,
            )

            is_complete = self._is_complete()
            os.remove(lock_file)
            return is_complete
        except Exception:
            logger.exception(
                "An exception occurred while the submitter was active. "
                "The state of the cluster is unknown. A deadlock will occur.")
            raise
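
The lock-file protocol above (fail if the file exists, create it, remove it only on a clean exit, and leave it behind on any exception) could be factored into a context manager. A hedged sketch; submitter_lock and the default filename are illustrative names, not taken from the source.

from contextlib import contextmanager
from pathlib import Path

@contextmanager
def submitter_lock(output_dir, filename="submitter.lock"):
    # Refuse to start if a previous submitter died mid-run.
    lock_file = Path(output_dir) / filename
    if lock_file.exists():
        raise Exception(
            f"{lock_file} exists. A previous submitter crashed in an unknown state."
        )
    lock_file.touch()
    yield lock_file
    # Reached only if the body did not raise; a crash leaves the file in
    # place so the next submitter deadlocks instead of running against an
    # unknown cluster state.
    lock_file.unlink()

With such a helper, run() could wrap its submission logic in a with submitter_lock(self._output) block and keep the except clause purely for logging.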
Example #3
    def run(self,
            output,
            queue_depth,
            per_node_batch_size,
            num_processes,
            poll_interval=60,
            try_add_blocked_jobs=False,
            verbose=False):
        """Run all jobs defined in the configuration on the HPC."""
        queue = JobQueue(queue_depth, poll_interval=poll_interval)
        jobs = list(self._config.iter_jobs())
        while jobs:
            self._update_completed_jobs(jobs)
            batch = _BatchJobs()
            jobs_to_pop = []
            num_blocked = 0
            for i, job in enumerate(jobs):
                if batch.is_job_blocked(job, try_add_blocked_jobs):
                    num_blocked += 1
                else:
                    batch.append(job)
                    jobs_to_pop.append(i)
                    if batch.num_jobs >= per_node_batch_size:
                        break

            if batch.num_jobs > 0:
                async_submitter = self._make_async_submitter(
                    batch.serialize(),
                    num_processes,
                    output,
                    verbose,
                )
                queue.submit(async_submitter)

                # It might be better to delay submission for a limited number
                # of rounds if there are blocked jobs and the batch isn't full.
                # We can review these events from our runs to gauge how well
                # this logic works for our jobs.
                event = StructuredLogEvent(
                    source=self._name,
                    category=EVENT_CATEGORY_HPC,
                    name=EVENT_NAME_HPC_SUBMIT,
                    message="Submitted HPC batch",
                    batch_size=batch.num_jobs,
                    num_blocked=num_blocked,
                    per_node_batch_size=per_node_batch_size,
                )
                log_event(event)
                for i in reversed(jobs_to_pop):
                    jobs.pop(i)
            else:
                logger.debug("No jobs are ready for submission")

            logger.debug("num_submitted=%s num_blocked=%s", batch.num_jobs,
                         num_blocked)

            if batch.num_jobs > 0 and not queue.is_full():
                # Keep submitting.
                continue

            # TODO: this will cause up to <queue_depth> slurm status commands
            # per poll. We could send one command, get all statuses, and
            # share the result among the submitters.
            queue.process_queue()
            time.sleep(poll_interval)

        queue.wait()
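
Example #3 depends on a _BatchJobs helper exposing append, num_jobs, is_job_blocked, and serialize. A minimal sketch consistent with that usage follows; the job.name, job.get_blocking_jobs(), and job.serialize() accessors are assumptions about the job model, not confirmed by the source.

class _BatchJobs:
    """Sketch of the batching helper used above (assumed internals)."""

    def __init__(self):
        self._jobs = []
        self._job_names = set()

    @property
    def num_jobs(self):
        return len(self._jobs)

    def append(self, job):
        self._jobs.append(job)
        self._job_names.add(job.name)  # assumed job attribute

    def is_job_blocked(self, job, try_add_blocked_jobs):
        # A job is blocked while any of its dependencies is unfinished.
        # With try_add_blocked_jobs, a dependency that is already in this
        # batch does not block, because the batch runs its jobs in order.
        for name in job.get_blocking_jobs():  # assumed job accessor
            if try_add_blocked_jobs and name in self._job_names:
                continue
            return True
        return False

    def serialize(self):
        return {"jobs": [job.serialize() for job in self._jobs]}  # assumed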