def test_iter_with_busy_state():
    """Iterate a two-vertex chain with the busy state enabled.

    The iterator yields a ``None`` id while 'b' is waiting for 'a'; the
    test reacts by releasing 'a' so the iteration can run to completion.
    """
    graph = DAG()
    graph.add_vertex('a')
    graph.add_vertex('b', predecessors=['a'])

    busy_iter = DAGIterator(graph, enable_busy_state=True)
    for vertex_id, _data in busy_iter:
        if vertex_id is not None:
            continue
        # Nothing is ready yet: mark 'a' as done to unblock 'b'.
        busy_iter.leave('a')
def test_iter_with_busy_state():
    """Iterate a two-vertex chain with the busy state enabled.

    Whenever the iterator reports that no vertex is ready (``None`` id),
    vertex "a" is released so that its successor "b" becomes available.
    """
    graph = DAG()
    graph.add_vertex("a")
    graph.add_vertex("b", predecessors=["a"])

    busy_iter = DAGIterator(graph, enable_busy_state=True)
    for vertex_id, _ in busy_iter:
        if vertex_id is not None:
            continue
        # Nothing is ready yet: mark "a" as done to unblock "b".
        busy_iter.leave("a")
def __init__( self, dag: DAG, job_factory: JobFactoryCallback, collect_result: CollectResultCallback, jobs: int = 0, dyn_poll_interval: bool = True, ): """Initialize a MultiprocessScheduler instance. :param dag: DAG in which nodes represent units of work to do and edges represent dependencies between them. :param job_factory: Callback to turn DAG nodes into corresponding Worker instances. :param collect_result: Callback to extract work result from a worker. :param jobs: Maximum of worker allowed to run in parallel. If left to 0, use the number of available cores on the current machine. :param dyn_poll_interval: If True the interval between each polling iteration is automatically updated. Otherwise it's set to 0.1 seconds. """ e = Env() self.parallelism = jobs or e.build.cpu.cores self.dag = dag self.workers: List[Optional[SomeWorker]] = [None] * self.parallelism """ List of active workers. Indexes in this list correspond to slot IDs passed to workers: `self.workers[N].slot == N` for all present wor,kers. When the worker is done, we just replace it with None, and when a slot is None we can create a new worker for it. """ self.iterator = DAGIterator(self.dag, enable_busy_state=True) """Iterator to get ready-to-run units of work.""" self.job_factory = job_factory self.collect_result = collect_result self.active_workers = 0 """Equivalent to the number of non-None slots in ``self.workers``.""" self.poll_interval = 0.1 """Time (in seconds) to wait between each round of worker polling.""" self.dyn_poll_interval = dyn_poll_interval self.no_free_item = False """ True if there is work waiting to be executed, False if all work to be scheduled depends on work that hasn't completed. """ self.no_work_left = False """
def test_reverse_dag():
    """Check that reversing a linear DAG reverses its iteration order."""
    chain = DAG()
    chain.add_vertex('a')
    chain.add_vertex('b', predecessors=['a'])
    chain.add_vertex('c', predecessors=['b'])
    chain.add_vertex('d', predecessors=['c'])

    # Forward traversal follows the dependency chain.
    forward_iter = DAGIterator(chain)
    forward_order = [vertex_id for vertex_id, _ in forward_iter]
    assert forward_order == ['a', 'b', 'c', 'd']

    # The reversed graph must be traversed the other way round.
    reversed_chain = chain.reverse_graph()
    backward_iter = DAGIterator(reversed_chain)
    backward_order = [vertex_id for vertex_id, _ in backward_iter]
    assert backward_order == ['d', 'c', 'b', 'a']
def test_reverse_dag():
    """Check that reversing a linear DAG reverses its iteration order."""
    chain = DAG()
    chain.add_vertex("a")
    chain.add_vertex("b", predecessors=["a"])
    chain.add_vertex("c", predecessors=["b"])
    chain.add_vertex("d", predecessors=["c"])

    # Forward traversal follows the dependency chain.
    forward_iter = DAGIterator(chain)
    forward_order = [vertex_id for vertex_id, _ in forward_iter]
    assert forward_order == ["a", "b", "c", "d"]

    # The reversed graph must be traversed the other way round.
    reversed_chain = chain.reverse_graph()
    backward_iter = DAGIterator(reversed_chain)
    backward_order = [vertex_id for vertex_id, _ in backward_iter]
    assert backward_order == ["d", "c", "b", "a"]
def init_state(self, dag: DAG) -> None:
    """Reset the per-run scheduler fields for ``dag`` (internal function).

    :param dag: the dag representing the list of job to execute
    """
    # Nothing is running, nothing is queued and nothing has been recorded
    # yet for this run.
    self.active_jobs = []
    self.queued_jobs = 0
    self.max_active_jobs = 0

    # The DAG iterator has not been exhausted yet.
    self.all_jobs_queued = False

    # Fresh queue on which job end notifications are received.
    self.message_queue = Queue()

    # Graph to execute, walked with the busy state enabled.
    self.dag = dag
    self.dag_iterator = DAGIterator(dag, enable_busy_state=True)

    # Timing information: the run starts now and has no end yet.
    self.start_time = datetime.now()
    self.stop_time = None
class MultiprocessScheduler(Generic[WorkData, SomeWorker]):
    """Dispatch the units of work of a DAG to a bounded set of workers."""

    def __init__(
        self,
        dag: DAG,
        job_factory: JobFactoryCallback,
        collect_result: CollectResultCallback,
        jobs: int = 0,
        dyn_poll_interval: bool = True,
    ):
        """Initialize a MultiprocessScheduler instance.

        :param dag: DAG in which nodes represent units of work to do and
            edges represent dependencies between them.
        :param job_factory: Callback to turn DAG nodes into corresponding
            Worker instances.
        :param collect_result: Callback to extract work result from a worker.
        :param jobs: Maximum of worker allowed to run in parallel. If left to
            0, use the number of available cores on the current machine.
        :param dyn_poll_interval: If True the interval between each polling
            iteration is automatically updated. Otherwise it's set to 0.1
            seconds.
        """
        env = Env()
        self.parallelism = jobs or env.build.cpu.cores
        self.dag = dag

        # Worker slots. The index in this list is the slot ID handed to the
        # job factory. A finished worker is replaced by None, and a None
        # entry is a slot in which a new worker can be created.
        self.workers: List[Optional[SomeWorker]] = [None] * self.parallelism

        # Iterator providing the units of work that are ready to run.
        self.iterator = DAGIterator(self.dag, enable_busy_state=True)

        self.job_factory = job_factory
        self.collect_result = collect_result

        # Number of non-None entries in ``self.workers``.
        self.active_workers = 0

        # Time (in seconds) to sleep between two rounds of worker polling.
        self.poll_interval = 0.1
        self.dyn_poll_interval = dyn_poll_interval

        # Set to True when the iterator reports that every pending unit of
        # work depends on a non-completed one; reset to False as soon as a
        # worker completes.
        self.no_free_item = False

        # Set to True once ``self.iterator`` has raised ``StopIteration``,
        # i.e. all items have been handed out.
        self.no_work_left = False

    @property
    def has_free_slots(self) -> bool:
        """Tell whether at least one slot is available for a new worker."""
        return self.active_workers < self.parallelism

    def spawn_worker(self, uid: str, data: WorkData, slot: int) -> None:
        """Build a worker through the job factory and store it in ``slot``."""
        assert self.workers[slot] is None
        self.workers[slot] = self.job_factory(uid, data, slot)
        self.active_workers += 1

    def release_worker(self, slot: int) -> None:
        """Empty ``slot`` and account for one less active worker."""
        assert self.workers[slot] is not None
        self.workers[slot] = None
        self.active_workers -= 1

    def run(self) -> None:
        """Poll until all units of work are started and have completed.

        On ``KeyboardInterrupt`` no new work is started: the workers that
        are still active are awaited (their results are not collected) and
        the exception is then re-raised.
        """
        try:
            while self.active_workers > 0 or not self.no_work_left:
                self.poll()

        except KeyboardInterrupt:
            logger.error(
                "Scheduling abortion requested, waiting for all active workers..."
            )

            # Sweep over the active workers at a small fixed interval until
            # none is left.
            while self.active_workers > 0:
                for slot, worker in enumerate(self.workers):
                    if worker is None or worker.poll(self):
                        continue

                    # Completed worker: release it without collecting it, as
                    # results produced after the abortion are of no interest.
                    self.release_worker(slot)
                    self.iterator.leave(worker.uid)
                time.sleep(0.1)

            # Let the caller know about the abnormal interruption.
            raise

    def poll(self) -> None:
        """Fill the free slots, then wait for workers to complete.

        The first phase asks the iterator for work for each empty slot. The
        second phase sleeps and polls workers for as long as no slot is
        free, all pending work is blocked, or there is nothing left to
        schedule while workers are still active.
        """
        # Scanning the slots is pointless when there is no work left to
        # schedule, when every pending unit is blocked, or when all slots
        # are occupied.
        scheduling_possible = (
            not (self.no_work_left or self.no_free_item) and self.has_free_slots
        )
        if scheduling_possible:
            for slot, worker in enumerate(self.workers):
                if worker is not None:
                    continue

                # The iterator either returns a unit of work to start now,
                # returns two None values when all remaining units are
                # blocked by non-completed ones, or raises StopIteration
                # when everything has been scheduled.
                try:
                    uid, work_data = next(self.iterator)
                except StopIteration:
                    self.no_work_left = True
                    break

                if work_data is None:
                    # Everything pending is blocked: stop scanning slots.
                    self.no_free_item = True
                    break

                assert isinstance(uid, str)
                self.spawn_worker(uid, work_data, slot)

                # That was the last free slot: stop scanning.
                if not self.has_free_slots:
                    break

        logger.debug("Wait for free worker")
        poll_counter = 0
        while (
            self.no_free_item
            or not self.has_free_slots
            or (self.no_work_left and self.active_workers > 0)
        ):
            poll_counter += 1

            for slot, worker in enumerate(self.workers):
                if worker is None or worker.poll(self):
                    continue

                # The worker has completed: free its slot, unblock its
                # successors in the DAG and collect its result.
                self.release_worker(slot)
                self.iterator.leave(worker.uid)
                self.no_free_item = False
                self.collect_result(worker)

            time.sleep(self.poll_interval)

            # Adjust the poll interval if it is dynamic.
            if self.dyn_poll_interval:
                self.poll_interval = compute_next_dyn_poll(
                    poll_counter, self.poll_interval
                )
class Scheduler:
    """Handle parallel execution of interdependent jobs."""

    def __init__(
        self,
        job_provider: JobProviderCallback,
        collect: Optional[CollectCallback] = None,
        queues: Optional[Dict[str, int]] = None,
        tokens: int = 1,
        job_timeout: Optional[int] = DEFAULT_JOB_MAX_DURATION,
    ):
        """Initialize Scheduler.

        :param job_provider: function that returns instances of Job. The
            function takes as arguments: the job uid, the data associated
            with it, the list of predecessors id and a notification
            function called when the job ends.
        :param collect: function that collects results from the jobs. If the
            function returns True then the job is requeued. When None, a
            function that always returns False is used.
        :param queues: describes the list of queues handled by the scheduler.
            The format is a dictionary for which keys are queue names and
            values the max number of tokens available at a given time.
            If empty then a single queue called "default" is created. Its
            size is then given by the tokens parameter.
        :param tokens: number of tokens for the default queue. Relevant only
            when queues is empty
        :param job_timeout: maximum execution time for a job (used as a
            timeout when waiting on the message queue). Defaults to
            ``DEFAULT_JOB_MAX_DURATION``. If set to None timeouts are
            disabled, but it also makes the scheduler non interruptible
            when waiting for a job to finish.
        """
        self.job_provider = job_provider
        self.job_timeout = job_timeout
        if collect is None:
            self.collect: Callable[[Job], bool] = lambda x: False
        else:
            self.collect = collect

        self.active_jobs: List[Job] = []
        self.queued_jobs = 0
        self.all_jobs_queued = False
        self.message_queue: Queue[Any] = Queue()
        self.dag: Optional[DAG] = None
        self.dag_iterator: Optional[DAGIterator] = None
        self.start_time: Optional[datetime] = None
        self.stop_time: Optional[datetime] = None
        self.max_active_jobs = 0

        # Initialize named queues
        self.queues: Dict[str, List[Tuple[int, int, Job]]] = {}
        self.tokens = {}
        self.n_tokens = 0

        if not queues:
            # If no queues are specified create a single one named
            # "default".
            queues = {"default": tokens}

        # Monotonic counter used as a tie-breaker in the priority queues
        self.global_queue_index = 1

        # Create the queues
        for name, max_token in queues.items():
            self.queues[name] = []
            self.tokens[name] = max_token
            self.n_tokens += max_token

        # Create a slot reserve. The goal is to give a given job a number
        # which is unique among the active jobs. We need a maximum of
        # self.n_tokens slots
        self.slots = list(range(self.n_tokens))

    def safe_collect(self, job: Job) -> bool:
        """Protect call to collect.

        This ensures for job such as JobProcess that there are no calls
        to Run during collect. Main goal is to avoid leak of handles from
        collect to a process spawned by a Job. On Unixes consequences of
        such leak is more a security concern than an operational one. On
        Windows, this can lead easily to file locking issues and thus might
        cause crashes.

        :param job: the job to collect
        :return: the value returned by the collect callback
        """
        with Job.lock:
            return self.collect(job)

    def safe_job_provider(
        self,
        uid: str,
        data: Any,
        predecessors: FrozenSet[str],
        notify_end: Callable[[str], None],
    ) -> Job:
        """Protect call to job_provider.

        The call is done while holding ``Job.lock``, for the same reason as
        in safe_collect.
        """
        with Job.lock:
            return self.job_provider(uid, data, predecessors, notify_end)

    @classmethod
    def simple_provider(cls, job_class: Type[Job]) -> JobProviderCallback:
        """Return a simple provider based on a given Job class.

        The returned provider ignores the predecessors and instantiates
        ``job_class`` with the uid, the data and the notification function.

        :param job_class: a subclass of Job
        """

        def provider(
            uid: str,
            data: Any,
            predecessors: FrozenSet[str],
            notify_end: Callable[[str], None],
        ) -> Job:
            del predecessors
            return job_class(uid, data, notify_end)

        return provider

    def init_state(self, dag: DAG) -> None:
        """Reinitialize the scheduler state (internal function).

        :param dag: the dag representing the list of job to execute
        """
        # Active jobs
        self.active_jobs = []
        # Total number of jobs in the queues
        self.queued_jobs = 0
        # Have all jobs been queued?
        self.all_jobs_queued = False
        # Message queue to get job end notifications
        self.message_queue = Queue()

        self.dag = dag
        self.dag_iterator = DAGIterator(dag, enable_busy_state=True)
        self.start_time = datetime.now()
        self.stop_time = None
        self.max_active_jobs = 0

    @property
    def is_finished(self) -> bool:
        """Check if all jobs have been executed (internal).

        :return: True if complete
        """
        # The run is considered finished once there is no more job
        # in the DAG, the queues and that no job is running.
        return self.all_jobs_queued and self.queued_jobs == 0 and not self.active_jobs

    def log_state(self) -> None:
        """Log the current state of the scheduler (internal)."""
        logger.debug(
            "non-ready?: %s, in queue: %s, running: %s",
            not self.all_jobs_queued,
            self.queued_jobs,
            len(self.active_jobs),
        )

    def run(self, dag: DAG) -> None:
        """Launch the scheduler.

        :param dag: the dag representing the list of job to execute
        :raise KeyboardInterrupt: when interrupted; active jobs are
            interrupted, collected and finalized before the exception is
            propagated
        """
        self.init_state(dag)

        try:
            while not self.is_finished:
                self.enqueue()
                self.launch()
                self.log_state()
                self.max_active_jobs = max(self.max_active_jobs, len(self.active_jobs))
                self.wait()

        except KeyboardInterrupt:
            logger.info("Interrupting jobs...")
            for p in self.active_jobs:
                p.interrupt()
                self.safe_collect(p)
                p.on_finish(self)
            # Re-raise the caught exception itself so that its traceback
            # is preserved.
            raise

        self.stop_time = datetime.now()

    def push(self, job: Job) -> None:
        """Push a job into a queue.

        :param job: the job to add to the queue named ``job.queue_name``
        """
        # We use a tuple here to ensure the stability of the queue
        # when two jobs have similar priorities.
        heapq.heappush(
            self.queues[job.queue_name], (-job.priority, self.global_queue_index, job)
        )
        self.global_queue_index += 1
        self.queued_jobs += 1

    def enqueue(self) -> None:
        """Push into the queues job that are ready (internal)."""
        if self.all_jobs_queued:
            return

        if TYPE_CHECKING:
            assert self.dag_iterator is not None

        try:
            while True:
                uid, data, predecessors = self.dag_iterator.next_element()
                if uid is None:
                    # No more jobs ready
                    return

                job = self.safe_job_provider(
                    typing.cast(str, uid),
                    data,
                    predecessors=typing.cast(typing.FrozenSet[str], predecessors),
                    notify_end=self.message_queue.put,
                )

                if job.should_skip:
                    # Skipped jobs are collected and marked as done right
                    # away, without going through a queue.
                    self.safe_collect(job)
                    job.on_finish(self)
                    self.dag_iterator.leave(uid)
                else:
                    self.push(job)
        except StopIteration:
            # The iterator is exhausted: every job has been handed out
            self.all_jobs_queued = True

    def launch(self) -> None:
        """Launch next jobs in the queues (internal)."""
        if self.queued_jobs == 0:
            return

        for name, q in self.queues.items():
            # Start jobs as long as the head of the queue fits in the
            # tokens still available for that queue.
            while q and q[0][2].tokens <= self.tokens[name]:
                _, _, next_job = heapq.heappop(q)
                self.queued_jobs -= 1
                next_job.on_start(self)
                next_job.start(slot=self.slots.pop())
                self.tokens[name] -= next_job.tokens
                self.active_jobs.append(next_job)

    def wait(self) -> None:
        """Wait for the end of an active job.

        Return immediately when no job is active. Otherwise block until one
        active job notifies its end, then release its resources, collect
        it, and either requeue it or mark it as completed in the DAG.
        """
        if not self.active_jobs:
            return

        if TYPE_CHECKING:
            assert self.dag is not None
            assert self.dag_iterator is not None

        # Wait for message from one the active jobs
        while True:
            current_timeout: Optional[float]
            if self.job_timeout is None:
                # Timeouts are disabled: block until a job reports its end.
                current_timeout = None
            else:
                # The first job in active jobs is the oldest one
                # compute the get timeout based on its startup information
                first_job = self.active_jobs[0]
                # Ensure waiting time is a positive number
                current_timeout = max(
                    0.1, self.job_timeout - first_job.timing_info.duration
                )

            try:
                uid = self.message_queue.get(block=True, timeout=current_timeout)
            except Empty:
                # If after timeout we get an empty result, it means that
                # the oldest job has reached the timeout. Interrupt it
                # and wait for the queue to receive the end notification
                self.active_jobs[0].interrupt()
                time.sleep(0.1)
            else:
                job_index, job = next(
                    (
                        (index, job)
                        for index, job in enumerate(self.active_jobs)
                        if job.uid == uid
                    )
                )
                ti = job.timing_info
                logger.debug(
                    "job %s %s after %s",
                    uid,
                    "interrupted" if job.interrupted else "finished",
                    ti.duration,
                )
                self.slots.append(job.slot)

                # Liberate the resources taken by the job
                self.tokens[job.queue_name] += job.tokens

                collect_result = self.safe_collect(job)
                job.on_finish(self)

                if collect_result:
                    # Requeue when needed
                    self.push(job)
                else:
                    # Mark the job as completed
                    if job.uid in self.dag:
                        self.dag_iterator.leave(job.uid)

                del self.active_jobs[job_index]
                return
def __init__(self, item_list, run_job, collect_result,
             parallelism=None, abort_file=None, dyn_poll_interval=True):
    """Launch loop.

    :param item_list: a list of jobs or a dag
    :param run_job: a function that takes a job for argument and
        return the spawned process (:class:`e3.os_process.Run` object).
        Its prototype should be ``func (name, job_info)`` with name the job
        identifier and job_info the related information, passed in a tuple
        (slot_number, job_retry). Note that if you want to take advantage
        of the parallelism the spawned process should be launched in
        background (ie with bg=True when using
        :class:`e3.os_process.Run`).
        If run_job returns SKIP_EXECUTION instead of a Run object
        the mainloop will directly call collect_result without waiting.
    :param collect_result: a function called when a job is finished. The
        prototype should be func (name, process, job_info). If
        collect_result raise NeedRequeue then the test will be requeued.
        job_info is a tuple: (slot_number, job_nb_retry)
    :param parallelism: number of workers
    :type parallelism: int | None
    :param abort_file: If specified, the loop will abort if the file is
        present
    :type abort_file: str | None
    :param dyn_poll_interval: If True the interval between each polling
        iteration is automatically updated. Otherwise it's set to 0.1
        seconds.
    :type dyn_poll_interval: bool
    """
    env = e3.env.Env()
    self.parallelism = env.get_attr("main_options.mainloop_jobs",
                                    default_value=1,
                                    forced_value=parallelism)
    self.abort_file = env.get_attr("main_options.mainloop_abort_file",
                                   default_value=None,
                                   forced_value=abort_file)

    # A parallelism of 0 means "one worker per core", falling back to a
    # single worker when the number of cores is unknown.
    if self.parallelism == 0:
        if env.build.cpu.cores != UNKNOWN:
            self.parallelism = env.build.cpu.cores
        else:
            self.parallelism = 1

    e3.log.debug("start main loop with %d workers (abort on %s)",
                 self.parallelism, self.abort_file)

    # One entry per slot: the worker occupying it, and the id of the DAG
    # vertex that worker is processing.
    self.workers = [None] * self.parallelism
    self.locked_items = [None] * self.parallelism

    # A plain list of jobs is turned into a DAG without dependencies whose
    # vertex ids are the stringified list indexes.
    if not isinstance(item_list, DAG):
        self.item_list = DAG()
        for index, item in enumerate(item_list):
            self.item_list.add_vertex(str(index), item)
    else:
        self.item_list = item_list

    self.iterator = DAGIterator(self.item_list, enable_busy_state=True)
    self.collect_result = collect_result

    active_workers = 0
    max_active_workers = self.parallelism
    poll_sleep = 0.1
    no_free_item = False

    try:
        while True:
            # Check for abortion
            if self.abort_file is not None and \
                    os.path.isfile(self.abort_file):
                logger.info('Aborting: file %s has been found',
                            self.abort_file)
                self.abort()
                return      # Exit the loop

            # Find free workers
            for slot, worker in enumerate(self.workers):
                if worker is not None:
                    continue

                # A worker slot is free so use it for the next job. The
                # StopIteration raised once the iterator is exhausted is
                # what ends the main loop (see the handler below).
                next_id, next_job = next(self.iterator)
                if next_job is None:
                    # Nothing can be started until a running job ends.
                    no_free_item = True
                    break

                self.locked_items[slot] = next_id
                self.workers[slot] = Worker(next_job,
                                            run_job,
                                            collect_result,
                                            slot)
                active_workers += 1

            poll_counter = 0
            e3.log.debug('Wait for free worker')
            while active_workers >= max_active_workers or no_free_item:
                # All workers are occupied so wait for one to finish
                poll_counter += 1
                for slot, worker in enumerate(self.workers):
                    if worker is None:
                        continue

                    # Test if the worker is still active and has more
                    # jobs pending
                    if not (worker.poll() or worker.execute_next()):
                        # If not the case free the worker slot
                        active_workers -= 1
                        self.workers[slot] = None
                        self.iterator.leave(self.locked_items[slot])
                        no_free_item = False
                        self.locked_items[slot] = None

                sleep(poll_sleep)
                if dyn_poll_interval:
                    poll_sleep = compute_next_dyn_poll(poll_counter,
                                                       poll_sleep)

    except (StopIteration, KeyboardInterrupt) as exc:
        # isinstance (rather than an exact class comparison) so that a
        # subclass of KeyboardInterrupt is also aborted and re-raised
        # instead of being silently swallowed.
        interrupted = isinstance(exc, KeyboardInterrupt)
        if interrupted:
            # Got ^C, abort the mainloop
            logger.error("User interrupt")

        # No more job will be started: wait for the active ones
        while active_workers > 0:
            for slot, worker in enumerate(self.workers):
                if worker is None:
                    continue

                # Test if the worker is still active and ignore any
                # job pending
                try:
                    still_running = worker.poll()
                except TooManyErrors:
                    # We're not spawning more jobs so we can safely
                    # ignore all TooManyErrors exceptions.
                    still_running = False
                if not still_running:
                    active_workers -= 1
                    self.workers[slot] = None
            sleep(0.1)

        if interrupted:
            self.abort()
            raise

    except TooManyErrors:
        # too many failures, abort the execution
        logger.error("Too many errors, aborting")
        self.abort()