Example #1
def test_job_queue__monitor_func():
    """monitor_func should be invoked at least once while the jobs run."""
    has_run = []

    def monitor():
        has_run.append(1)

    duration = 0.1
    jobs = [FakeJob(str(i), duration) for i in range(1)]
    JobQueue.run_jobs(jobs, 5, poll_interval=0.1, monitor_func=monitor)
    assert has_run
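
Example #1's monitor only records that it was called. In practice the callback takes no arguments and is invoked periodically while jobs run (Example #2 wires the interval through monitor_interval). A minimal sketch of a more realistic callback, assuming psutil is available (it is not part of the examples above; JADE's own monitors, ResourceMonitorLogger and ResourceMonitorAggregator, appear in Example #2):

import logging

import psutil  # assumption: a third-party dependency, not shown in these examples

logger = logging.getLogger(__name__)

def sample_system_stats():
    """Zero-argument callback suitable for monitor_func."""
    cpu = psutil.cpu_percent(interval=None)   # CPU utilization since the last call
    mem = psutil.virtual_memory().percent     # system memory utilization
    logger.info("cpu=%.1f%% mem=%.1f%%", cpu, mem)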
Example #2
File: job_runner.py  Project: NREL/jade
    def _run_jobs(self, jobs, num_processes=None):
        num_jobs = len(jobs)
        if num_processes is None:
            max_num_workers = self._intf.get_num_cpus()
        else:
            max_num_workers = num_processes
        num_workers = min(num_jobs, max_num_workers)
        logger.info(
            "Generated %s jobs to execute on %s workers max=%s.",
            num_jobs,
            num_workers,
            max_num_workers,
        )
        self._intf.log_environment_variables()

        name = f"resource_monitor_batch_{self._batch_id}_{self._node_id}"
        group = self._config.get_default_submission_group()
        monitor_type = group.submitter_params.resource_monitor_type
        resource_aggregator = None
        if monitor_type == ResourceMonitorType.AGGREGATION:
            resource_aggregator = ResourceMonitorAggregator(name)
            monitor_func = resource_aggregator.update_resource_stats
            resource_monitor_interval = group.submitter_params.resource_monitor_interval
        elif monitor_type == ResourceMonitorType.PERIODIC:
            resource_monitor = ResourceMonitorLogger(name)
            monitor_func = resource_monitor.log_resource_stats
            resource_monitor_interval = group.submitter_params.resource_monitor_interval
        elif monitor_type == ResourceMonitorType.NONE:
            monitor_func = None
            resource_monitor_interval = None
        else:
            assert False, monitor_type
        JobQueue.run_jobs(
            jobs,
            max_queue_depth=num_workers,
            monitor_func=monitor_func,
            monitor_interval=resource_monitor_interval,
        )

        logger.info("Jobs are complete. count=%s", num_jobs)
        if resource_aggregator is not None:
            resource_aggregator.finalize(self._output)
        self._aggregate_events()
        return Status.GOOD  # TODO
Example #3
def test_job_queue__is_full():
    """The queue reports full once more jobs than its depth have been submitted."""
    duration = 10
    jobs = [FakeJob(str(i), duration) for i in range(4)]
    queue = JobQueue(2, poll_interval=1)
    assert not queue.is_full()
    for job in jobs:
        queue.submit(job)
    assert queue.is_full()
Example #4
    def _run_jobs(self, jobs, num_processes=None):
        num_jobs = len(jobs)
        if num_processes is None:
            max_num_workers = self._intf.get_num_cpus()
        else:
            max_num_workers = num_processes
        num_workers = min(num_jobs, max_num_workers)
        logger.info("Generated %s jobs to execute on %s workers max=%s.",
                    num_jobs, num_workers, max_num_workers)
        self._intf.log_environment_variables()

        name = f"resource_monitor_batch_{self._batch_id}"
        resource_monitor = ResourceMonitor(name)
        # TODO: make this non-blocking so that we can report status.
        JobQueue.run_jobs(
            jobs,
            max_queue_depth=num_workers,
            monitor_func=resource_monitor.log_resource_stats,
        )

        logger.info("Jobs are complete. count=%s", num_jobs)
        self._aggregate_events()
        return Status.GOOD  # TODO
Example #5
def test_job_queue__run_jobs_ordering():
    """Jobs declaring blocking_jobs must not start until their blockers finish."""
    duration = 0.1
    jobs = {}
    for i in range(1, 11):
        name = str(i)
        if i == 1:
            job = FakeJob(name, duration, blocking_jobs=set(["10"]))
        elif i == 2:
            job = FakeJob(name, duration, blocking_jobs=set(["1"]))
        elif i == 3:
            job = FakeJob(name, duration, blocking_jobs=set(["4", "5"]))
        else:
            job = FakeJob(name, duration)
        jobs[name] = job
    JobQueue.run_jobs(jobs.values(), 5, poll_interval=0.1)

    for job in jobs.values():
        assert job.is_complete()
        assert not job.get_blocking_jobs()

    assert jobs["1"].start_time > jobs["10"].end_time
    assert jobs["2"].start_time > jobs["1"].end_time
    assert jobs["3"].start_time > jobs["4"].end_time
    assert jobs["3"].start_time > jobs["5"].end_time
Example #6
    def run(self):
        """Try to submit batches of jobs to the HPC.

        Returns
        -------
        bool
            Returns True if all jobs are complete.

        """
        starting_batch_index = self._batch_index
        # TODO: consider whether we need to save the real job names
        hpc_submitters = [
            AsyncHpcSubmitter.create_from_id(self._hpc_mgr,
                                             self._status_collector, x)
            for x in self._cluster.iter_hpc_job_ids()
        ]

        queue = JobQueue(
            self._max_nodes,
            existing_jobs=hpc_submitters,
            poll_interval=self._poll_interval,
        )
        # Statuses may have changed since we last ran.
        # Persistent network errors could cause this submitter to fail.
        # Another submitter will try again later (unless this is the last submitter).
        queue.process_queue()
        completed_job_names, canceled_jobs = self._update_completed_jobs()

        lock_file = Path(self._output) / self.LOCK_FILENAME
        if lock_file.exists():
            raise Exception(
                f"{lock_file} exists. A previous submitter crashed in an unknown state."
            )
        lock_file.touch()

        # Start submitting jobs. If any unexpected exception prevents us from updating the
        # status file, leave the lock_file in place and intentionally cause a deadlock.
        try:
            blocked_jobs = []
            submitted_jobs = []
            for group in self._cluster.config.submission_groups:
                if not queue.is_full():
                    self._submit_batches(queue, group, blocked_jobs,
                                         submitted_jobs)

            num_submissions = self._batch_index - starting_batch_index
            logger.info(
                "num_batches=%s num_submitted=%s num_blocked=%s new_completions=%s",
                num_submissions,
                len(submitted_jobs),
                len(blocked_jobs),
                len(completed_job_names),
            )

            hpc_job_ids = sorted([x.job_id for x in queue.outstanding_jobs])
            self._update_status(
                submitted_jobs,
                blocked_jobs,
                canceled_jobs,
                hpc_job_ids,
                completed_job_names,
            )

            is_complete = self._is_complete()
            os.remove(lock_file)
            return is_complete
        except Exception:
            logger.exception(
                "An exception occurred while the submitter was active. "
                "The state of the cluster is unknown. A deadlock will occur.")
            raise
Example #7
def test_job_queue__run_jobs_no_ordering():
    """With no blocking relationships, all jobs simply run to completion."""
    duration = 0.1
    jobs = [FakeJob(str(i), duration) for i in range(10)]
    JobQueue.run_jobs(jobs, 5, poll_interval=0.1)
    for job in jobs:
        assert job.is_complete()
Example #8
    def run(self,
            output,
            queue_depth,
            per_node_batch_size,
            num_processes,
            poll_interval=60,
            try_add_blocked_jobs=False,
            verbose=False):
        """Run all jobs defined in the configuration on the HPC."""
        queue = JobQueue(queue_depth, poll_interval=poll_interval)
        jobs = list(self._config.iter_jobs())
        while jobs:
            self._update_completed_jobs(jobs)
            batch = _BatchJobs()
            jobs_to_pop = []
            num_blocked = 0
            for i, job in enumerate(jobs):
                if batch.is_job_blocked(job, try_add_blocked_jobs):
                    num_blocked += 1
                else:
                    batch.append(job)
                    jobs_to_pop.append(i)
                    if batch.num_jobs >= per_node_batch_size:
                        break

            if batch.num_jobs > 0:
                async_submitter = self._make_async_submitter(
                    batch.serialize(),
                    num_processes,
                    output,
                    verbose,
                )
                queue.submit(async_submitter)

                # It might be better to delay submission for a limited number
                # of rounds if there are blocked jobs and the batch isn't full.
                # We can look at these events on our runs to see how this
                # logic is working with our jobs.
                event = StructuredLogEvent(
                    source=self._name,
                    category=EVENT_CATEGORY_HPC,
                    name=EVENT_NAME_HPC_SUBMIT,
                    message="Submitted HPC batch",
                    batch_size=batch.num_jobs,
                    num_blocked=num_blocked,
                    per_node_batch_size=per_node_batch_size,
                )
                log_event(event)
                for i in reversed(jobs_to_pop):
                    jobs.pop(i)
            else:
                logger.debug("No jobs are ready for submission")

            logger.debug("num_submitted=%s num_blocked=%s", batch.num_jobs,
                         num_blocked)

            if batch.num_jobs > 0 and not queue.is_full():
                # Keep submitting.
                continue

            # TODO: this will cause up to <queue_depth> slurm status commands
            # every poll.  We could send one command, get all statuses, and
            # share it among the submitters.
            queue.process_queue()
            time.sleep(poll_interval)

        queue.wait()
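
Taken together, the examples show two ways to use JobQueue: the run_jobs classmethod (Examples #1, #2, #5, and #7) for fire-and-wait batches, and an explicit instance (Examples #3, #6, and #8) when the caller needs to interleave submission with polling. A minimal instance-level sketch, reusing the FakeJob sketch above; the scheduling semantics are inferred from these call sites, not from JADE's documentation:

queue = JobQueue(2, poll_interval=1)       # at most two jobs outstanding at once
for i in range(4):
    queue.submit(FakeJob(str(i), 0.1))     # submissions beyond the depth are queued (Example #3)
queue.process_queue()                      # one polling pass, as in Examples #6 and #8
queue.wait()                               # block until every submitted job completes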