def test_job_queue__monitor_func():
    has_run = []

    def monitor():
        has_run.append(1)

    duration = 0.1
    jobs = [FakeJob(str(i), duration) for i in range(1)]
    JobQueue.run_jobs(jobs, 5, poll_interval=0.1, monitor_func=monitor)
    assert has_run
def _run_jobs(self, jobs, num_processes=None):
    num_jobs = len(jobs)
    if num_processes is None:
        max_num_workers = self._intf.get_num_cpus()
    else:
        max_num_workers = num_processes
    num_workers = min(num_jobs, max_num_workers)
    logger.info(
        "Generated %s jobs to execute on %s workers max=%s.",
        num_jobs,
        num_workers,
        max_num_workers,
    )
    self._intf.log_environment_variables()

    name = f"resource_monitor_batch_{self._batch_id}_{self._node_id}"
    group = self._config.get_default_submission_group()
    monitor_type = group.submitter_params.resource_monitor_type
    resource_aggregator = None
    if monitor_type == ResourceMonitorType.AGGREGATION:
        resource_aggregator = ResourceMonitorAggregator(name)
        monitor_func = resource_aggregator.update_resource_stats
        resource_monitor_interval = group.submitter_params.resource_monitor_interval
    elif monitor_type == ResourceMonitorType.PERIODIC:
        resource_monitor = ResourceMonitorLogger(name)
        monitor_func = resource_monitor.log_resource_stats
        resource_monitor_interval = group.submitter_params.resource_monitor_interval
    elif monitor_type == ResourceMonitorType.NONE:
        monitor_func = None
        resource_monitor_interval = None
    else:
        assert False, monitor_type

    JobQueue.run_jobs(
        jobs,
        max_queue_depth=num_workers,
        monitor_func=monitor_func,
        monitor_interval=resource_monitor_interval,
    )
    logger.info("Jobs are complete. count=%s", num_jobs)

    if resource_aggregator is not None:
        resource_aggregator.finalize(self._output)

    self._aggregate_events()
    return Status.GOOD  # TODO
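# Illustrative sketch (not part of the source): the monitor_func passed to
# JobQueue.run_jobs above is just a zero-argument callable that the queue is
# expected to invoke every monitor_interval seconds while jobs run. A minimal
# stand-in compatible with that contract could look like the class below; the
# class name and poll_count attribute are assumptions for illustration only.
class _ExampleResourceMonitor:
    """Hypothetical monitor that records how many times it was polled."""

    def __init__(self, name):
        self.name = name
        self.poll_count = 0

    def log_resource_stats(self):
        # Called periodically by the job queue; a real implementation would
        # sample CPU/memory usage here instead of just counting polls.
        self.poll_count += 1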
def test_job_queue__is_full():
    duration = 10
    jobs = [FakeJob(str(i), duration) for i in range(4)]
    queue = JobQueue(2, poll_interval=1)
    assert not queue.is_full()

    for job in jobs:
        queue.submit(job)
    assert queue.is_full()
def _run_jobs(self, jobs, num_processes=None):
    num_jobs = len(jobs)
    if num_processes is None:
        max_num_workers = self._intf.get_num_cpus()
    else:
        max_num_workers = num_processes
    num_workers = min(num_jobs, max_num_workers)
    logger.info(
        "Generated %s jobs to execute on %s workers max=%s.",
        num_jobs,
        num_workers,
        max_num_workers,
    )
    self._intf.log_environment_variables()

    name = f"resource_monitor_batch_{self._batch_id}"
    resource_monitor = ResourceMonitor(name)
    # TODO: make this non-blocking so that we can report status.
    JobQueue.run_jobs(
        jobs,
        max_queue_depth=num_workers,
        monitor_func=resource_monitor.log_resource_stats,
    )
    logger.info("Jobs are complete. count=%s", num_jobs)

    self._aggregate_events()
    return Status.GOOD  # TODO
def test_job_queue__run_jobs_ordering():
    duration = 0.1
    jobs = {}
    for i in range(1, 11):
        name = str(i)
        if i == 1:
            job = FakeJob(name, duration, blocking_jobs=set(["10"]))
        elif i == 2:
            job = FakeJob(name, duration, blocking_jobs=set(["1"]))
        elif i == 3:
            job = FakeJob(name, duration, blocking_jobs=set(["4", "5"]))
        else:
            job = FakeJob(name, duration)
        jobs[name] = job

    JobQueue.run_jobs(jobs.values(), 5, poll_interval=0.1)
    for job in jobs.values():
        assert job.is_complete()
        assert not job.get_blocking_jobs()

    assert jobs["1"].start_time > jobs["10"].end_time
    assert jobs["2"].start_time > jobs["1"].end_time
    assert jobs["3"].start_time > jobs["4"].end_time
    assert jobs["3"].start_time > jobs["5"].end_time
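# Illustrative sketch (not part of the source): the ordering test above relies
# on a small FakeJob surface -- a name, a simulated duration, an optional set
# of blocking job names, completion/blocking queries, and start/end
# timestamps. A minimal stand-in consistent with that usage is shown below;
# any method the queue itself would call (e.g. run()) is an assumption here,
# not taken from the source.
import time


class _ExampleFakeJob:
    def __init__(self, name, duration, blocking_jobs=None):
        self.name = name
        self._duration = duration
        self._blocking_jobs = set(blocking_jobs or [])
        self._complete = False
        self.start_time = None
        self.end_time = None

    def get_blocking_jobs(self):
        return self._blocking_jobs

    def set_blocking_jobs_complete(self, names):
        # Hypothetical hook: the queue would clear finished blockers so this
        # job becomes eligible to run.
        self._blocking_jobs -= set(names)

    def is_complete(self):
        return self._complete

    def run(self):
        # Simulate work for the configured duration and record timestamps,
        # which is what the ordering assertions in the test compare.
        self.start_time = time.time()
        time.sleep(self._duration)
        self.end_time = time.time()
        self._complete = True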
def run(self):
    """Try to submit batches of jobs to the HPC.

    Returns
    -------
    bool
        Returns True if all jobs are complete.

    """
    starting_batch_index = self._batch_index

    # TODO: consider whether we need to save the real job names
    hpc_submitters = [
        AsyncHpcSubmitter.create_from_id(self._hpc_mgr, self._status_collector, x)
        for x in self._cluster.iter_hpc_job_ids()
    ]

    queue = JobQueue(
        self._max_nodes,
        existing_jobs=hpc_submitters,
        poll_interval=self._poll_interval,
    )
    # Statuses may have changed since we last ran.
    # Persistent network errors could cause this submitter to fail.
    # Another submitter will try again later (unless this is the last submitter).
    queue.process_queue()
    completed_job_names, canceled_jobs = self._update_completed_jobs()

    lock_file = Path(self._output) / self.LOCK_FILENAME
    if lock_file.exists():
        raise Exception(
            f"{lock_file} exists. A previous submitter crashed in an unknown state."
        )
    lock_file.touch()

    # Start submitting jobs. If any unexpected exception prevents us from updating the
    # status file, leave the lock_file in place and intentionally cause a deadlock.
    try:
        blocked_jobs = []
        submitted_jobs = []
        for group in self._cluster.config.submission_groups:
            if not queue.is_full():
                self._submit_batches(queue, group, blocked_jobs, submitted_jobs)

        num_submissions = self._batch_index - starting_batch_index
        logger.info(
            "num_batches=%s num_submitted=%s num_blocked=%s new_completions=%s",
            num_submissions,
            len(submitted_jobs),
            len(blocked_jobs),
            len(completed_job_names),
        )
        hpc_job_ids = sorted([x.job_id for x in queue.outstanding_jobs])
        self._update_status(
            submitted_jobs,
            blocked_jobs,
            canceled_jobs,
            hpc_job_ids,
            completed_job_names,
        )
        is_complete = self._is_complete()
        os.remove(lock_file)
        return is_complete
    except Exception:
        logger.exception(
            "An exception occurred while the submitter was active. "
            "The state of the cluster is unknown. A deadlock will occur."
        )
        raise
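# Illustrative sketch (not part of the source): the lock-file handling in
# run() above follows a "remove only on success" pattern -- the lock is
# created up front and deliberately left behind on any unexpected exception so
# that no later submitter runs against an unknown cluster state. The helper
# below captures that pattern in isolation; the function name and signature
# are assumptions for illustration.
from pathlib import Path


def _run_with_crash_lock(lock_file: Path, work):
    """Run ``work`` while holding a crash-detection lock file."""
    if lock_file.exists():
        raise Exception(f"{lock_file} exists. A previous run crashed in an unknown state.")
    lock_file.touch()
    result = work()      # any exception propagates and leaves the lock in place
    lock_file.unlink()   # removed only if work() succeeded
    return result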
def test_job_queue__run_jobs_no_ordering():
    duration = 0.1
    jobs = [FakeJob(str(i), duration) for i in range(10)]
    JobQueue.run_jobs(jobs, 5, poll_interval=0.1)
    for job in jobs:
        assert job.is_complete()
def run(
    self,
    output,
    queue_depth,
    per_node_batch_size,
    num_processes,
    poll_interval=60,
    try_add_blocked_jobs=False,
    verbose=False,
):
    """Run all jobs defined in the configuration on the HPC."""
    queue = JobQueue(queue_depth, poll_interval=poll_interval)
    jobs = list(self._config.iter_jobs())

    while jobs:
        self._update_completed_jobs(jobs)
        batch = _BatchJobs()
        jobs_to_pop = []
        num_blocked = 0
        for i, job in enumerate(jobs):
            if batch.is_job_blocked(job, try_add_blocked_jobs):
                num_blocked += 1
            else:
                batch.append(job)
                jobs_to_pop.append(i)
                if batch.num_jobs >= per_node_batch_size:
                    break

        if batch.num_jobs > 0:
            async_submitter = self._make_async_submitter(
                batch.serialize(),
                num_processes,
                output,
                verbose,
            )
            queue.submit(async_submitter)

            # It might be better to delay submission for a limited number
            # of rounds if there are blocked jobs and the batch isn't full.
            # We can look at these events on our runs to see how this
            # logic is working with our jobs.
            event = StructuredLogEvent(
                source=self._name,
                category=EVENT_CATEGORY_HPC,
                name=EVENT_NAME_HPC_SUBMIT,
                message="Submitted HPC batch",
                batch_size=batch.num_jobs,
                num_blocked=num_blocked,
                per_node_batch_size=per_node_batch_size,
            )
            log_event(event)
            for i in reversed(jobs_to_pop):
                jobs.pop(i)
        else:
            logger.debug("No jobs are ready for submission")

        logger.debug("num_submitted=%s num_blocked=%s", batch.num_jobs, num_blocked)
        if batch.num_jobs > 0 and not queue.is_full():
            # Keep submitting.
            continue

        # TODO: this will cause up to <queue_depth> slurm status commands
        # every poll. We could send one command, get all statuses, and
        # share it among the submitters.
        queue.process_queue()
        time.sleep(poll_interval)

    queue.wait()
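# Illustrative sketch (not part of the source): the batching loop above only
# depends on a small _BatchJobs surface -- is_job_blocked(), append(),
# num_jobs, and serialize(). A minimal stand-in consistent with that usage
# might look like the following; the blocking check assumes jobs expose
# get_blocking_jobs() (carried over from the tests above), and the
# try_add_blocked_jobs behavior shown here is an assumption, not the
# library's actual logic.
class _ExampleBatchJobs:
    """Hypothetical batch container that tracks which job names it holds."""

    def __init__(self):
        self._jobs = []
        self._job_names = set()

    def append(self, job):
        self._jobs.append(job)
        self._job_names.add(job.name)

    def is_job_blocked(self, job, try_add_blocked_jobs):
        blocking = job.get_blocking_jobs()
        if not blocking:
            return False
        # Optionally admit a blocked job when everything blocking it is
        # already part of this same batch.
        if try_add_blocked_jobs and blocking.issubset(self._job_names):
            return False
        return True

    @property
    def num_jobs(self):
        return len(self._jobs)

    def serialize(self):
        # Placeholder serialization: a real implementation would emit whatever
        # the async submitter expects (e.g. a config file payload).
        return [job.name for job in self._jobs]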