def test_job_queue__is_full():
    duration = 10
    jobs = [FakeJob(str(i), duration) for i in range(4)]
    queue = JobQueue(2, poll_interval=1)
    assert not queue.is_full()

    for job in jobs:
        queue.submit(job)
    assert queue.is_full()
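# A minimal FakeJob sketch, assuming the test double above only needs a name
# and a duration and reports itself complete once that duration has elapsed.
# The real FakeJob in this repository may differ; the fields and methods below
# are illustrative assumptions, not the actual implementation.
import time


class FakeJob:
    """Hypothetical test double for a queued job."""

    def __init__(self, name, duration):
        self.name = name
        self.duration = duration
        self._start = None

    def run(self):
        # Record the start time so is_complete() can report progress.
        self._start = time.time()

    def is_complete(self):
        # Consider the job done once its duration has elapsed.
        return self._start is not None and time.time() - self._start >= self.duration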
def run(self):
    """Try to submit batches of jobs to the HPC.

    Returns
    -------
    bool
        Returns True if all jobs are complete.

    """
    starting_batch_index = self._batch_index

    # TODO: consider whether we need to save the real job names
    hpc_submitters = [
        AsyncHpcSubmitter.create_from_id(self._hpc_mgr, self._status_collector, x)
        for x in self._cluster.iter_hpc_job_ids()
    ]

    queue = JobQueue(
        self._max_nodes,
        existing_jobs=hpc_submitters,
        poll_interval=self._poll_interval,
    )
    # Statuses may have changed since we last ran.
    # Persistent network errors could cause this submitter to fail.
    # Another submitter will try again later (unless this is the last submitter).
    queue.process_queue()
    completed_job_names, canceled_jobs = self._update_completed_jobs()

    lock_file = Path(self._output) / self.LOCK_FILENAME
    if lock_file.exists():
        raise Exception(
            f"{lock_file} exists. A previous submitter crashed in an unknown state."
        )
    lock_file.touch()

    # Start submitting jobs. If any unexpected exception prevents us from updating the
    # status file, leave the lock_file in place and intentionally cause a deadlock.
    try:
        blocked_jobs = []
        submitted_jobs = []
        for group in self._cluster.config.submission_groups:
            if not queue.is_full():
                self._submit_batches(queue, group, blocked_jobs, submitted_jobs)

        num_submissions = self._batch_index - starting_batch_index
        logger.info(
            "num_batches=%s num_submitted=%s num_blocked=%s new_completions=%s",
            num_submissions,
            len(submitted_jobs),
            len(blocked_jobs),
            len(completed_job_names),
        )

        hpc_job_ids = sorted([x.job_id for x in queue.outstanding_jobs])
        self._update_status(
            submitted_jobs,
            blocked_jobs,
            canceled_jobs,
            hpc_job_ids,
            completed_job_names,
        )
        is_complete = self._is_complete()
        os.remove(lock_file)
        return is_complete
    except Exception:
        logger.exception(
            "An exception occurred while the submitter was active. "
            "The state of the cluster is unknown. A deadlock will occur."
        )
        raise
def run(
    self,
    output,
    queue_depth,
    per_node_batch_size,
    num_processes,
    poll_interval=60,
    try_add_blocked_jobs=False,
    verbose=False,
):
    """Run all jobs defined in the configuration on the HPC."""
    queue = JobQueue(queue_depth, poll_interval=poll_interval)
    jobs = list(self._config.iter_jobs())

    while jobs:
        self._update_completed_jobs(jobs)
        batch = _BatchJobs()
        jobs_to_pop = []
        num_blocked = 0
        for i, job in enumerate(jobs):
            if batch.is_job_blocked(job, try_add_blocked_jobs):
                num_blocked += 1
            else:
                batch.append(job)
                jobs_to_pop.append(i)
                if batch.num_jobs >= per_node_batch_size:
                    break

        if batch.num_jobs > 0:
            async_submitter = self._make_async_submitter(
                batch.serialize(),
                num_processes,
                output,
                verbose,
            )
            queue.submit(async_submitter)

            # It might be better to delay submission for a limited number
            # of rounds if there are blocked jobs and the batch isn't full.
            # We can look at these events on our runs to see how this
            # logic is working with our jobs.
            event = StructuredLogEvent(
                source=self._name,
                category=EVENT_CATEGORY_HPC,
                name=EVENT_NAME_HPC_SUBMIT,
                message="Submitted HPC batch",
                batch_size=batch.num_jobs,
                num_blocked=num_blocked,
                per_node_batch_size=per_node_batch_size,
            )
            log_event(event)

            for i in reversed(jobs_to_pop):
                jobs.pop(i)
        else:
            logger.debug("No jobs are ready for submission")

        logger.debug("num_submitted=%s num_blocked=%s", batch.num_jobs, num_blocked)
        if batch.num_jobs > 0 and not queue.is_full():
            # Keep submitting.
            continue

        # TODO: this will cause up to <queue_depth> slurm status commands
        # every poll. We could send one command, get all statuses, and
        # share it among the submitters.
        queue.process_queue()
        time.sleep(poll_interval)

    queue.wait()
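# A minimal JobQueue sketch covering only the interface exercised above
# (submit, is_full, process_queue, wait, outstanding_jobs). The body is an
# illustrative guess at the semantics, not the repository's implementation:
# it assumes jobs expose run() and is_complete(), and that process_queue()
# drops finished jobs and starts queued ones up to the maximum depth.
import time


class JobQueueSketch:
    def __init__(self, max_queue_depth, existing_jobs=None, poll_interval=60):
        self._max_queue_depth = max_queue_depth
        self._poll_interval = poll_interval
        self._pending = []
        self.outstanding_jobs = list(existing_jobs or [])

    def is_full(self):
        return len(self.outstanding_jobs) >= self._max_queue_depth

    def submit(self, job):
        # Start the job immediately if there is room; otherwise hold it.
        if not self.is_full():
            job.run()
            self.outstanding_jobs.append(job)
        else:
            self._pending.append(job)

    def process_queue(self):
        # Drop completed jobs, then promote pending jobs into the free slots.
        self.outstanding_jobs = [x for x in self.outstanding_jobs if not x.is_complete()]
        while self._pending and not self.is_full():
            job = self._pending.pop(0)
            job.run()
            self.outstanding_jobs.append(job)

    def wait(self):
        # Block until every submitted job has finished.
        while self.outstanding_jobs or self._pending:
            self.process_queue()
            time.sleep(self._poll_interval)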