Example #1
def test_iter_with_busy_state():
    d = DAG()
    d.add_vertex('a')
    d.add_vertex('b', predecessors=['a'])

    it = DAGIterator(d, enable_busy_state=True)
    for nid, data in it:
        if nid is None:
            it.leave('a')
Example #3
def test_iter_with_busy_state():
    d = DAG()
    d.add_vertex("a")
    d.add_vertex("b", predecessors=["a"])

    it = DAGIterator(d, enable_busy_state=True)
    for nid, _ in it:
        if nid is None:
            it.leave("a")
Example #5
def test_reverse_dag():
    d = DAG()
    d.add_vertex('a')
    d.add_vertex('b', predecessors=['a'])
    d.add_vertex('c', predecessors=['b'])
    d.add_vertex('d', predecessors=['c'])

    it = DAGIterator(d)
    assert [k for k, _ in it] == ['a', 'b', 'c', 'd']

    reverse_d = d.reverse_graph()
    reverse_it = DAGIterator(reverse_d)
    assert [k for k, _ in reverse_it] == ['d', 'c', 'b', 'a']
Example #6
def test_reverse_dag():
    d = DAG()
    d.add_vertex("a")
    d.add_vertex("b", predecessors=["a"])
    d.add_vertex("c", predecessors=["b"])
    d.add_vertex("d", predecessors=["c"])

    it = DAGIterator(d)
    assert [k for k, _ in it] == ["a", "b", "c", "d"]

    reverse_d = d.reverse_graph()
    reverse_it = DAGIterator(reverse_d)
    assert [k for k, _ in reverse_it] == ["d", "c", "b", "a"]
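
Both variants show that reverse_graph() flips every edge, so iterating the
reversed DAG visits vertices in the opposite topological order. A small
sketch on a branching graph (same assumed e3.collection.dag import):

from e3.collection.dag import DAG, DAGIterator

d = DAG()
d.add_vertex("root")
d.add_vertex("left", predecessors=["root"])
d.add_vertex("right", predecessors=["root"])
d.add_vertex("leaf", predecessors=["left", "right"])

# Forward order starts at "root"; in the reversed graph "leaf" has no
# predecessors anymore, so it comes out first.
assert [k for k, _ in DAGIterator(d)][0] == "root"
assert [k for k, _ in DAGIterator(d.reverse_graph())][0] == "leaf"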
Example #7
    def init_state(self, dag: DAG) -> None:
        """Reinitialize the scheduler state (internal function).

        :param dag: the DAG representing the list of jobs to execute
        """
        # Active jobs
        self.active_jobs = []

        # Total number of jobs in the queues
        self.queued_jobs = 0

        # Have all jobs been queued?
        self.all_jobs_queued = False

        # Message queue to get job end notifications
        self.message_queue = Queue()

        self.dag = dag
        self.dag_iterator = DAGIterator(dag, enable_busy_state=True)
        self.start_time = datetime.now()
        self.stop_time = None
        self.max_active_jobs = 0
Example #8
class MultiprocessScheduler(Generic[WorkData, SomeWorker]):
    """Scheduler to dispatch units of work to subprocesses."""
    def __init__(
        self,
        dag: DAG,
        job_factory: JobFactoryCallback,
        collect_result: CollectResultCallback,
        jobs: int = 0,
        dyn_poll_interval: bool = True,
    ):
        """Initialize a MultiprocessScheduler instance.

        :param dag: DAG in which nodes represent units of work to do and edges
            represent dependencies between them.
        :param job_factory: Callback to turn DAG nodes into corresponding
            Worker instances.
        :param collect_result: Callback to extract work result from a worker.
        :param jobs: Maximum number of workers allowed to run in parallel.
            If set to 0, use the number of available cores on the current
            machine.
        :param dyn_poll_interval: If True, the interval between polling
            iterations is automatically adjusted. Otherwise it's set to 0.1
            seconds.
        """
        e = Env()
        self.parallelism = jobs or e.build.cpu.cores
        self.dag = dag

        self.workers: List[Optional[SomeWorker]] = [None] * self.parallelism
        """
        List of active workers. Indexes in this list correspond to slot IDs
        passed to workers: ``self.workers[N].slot == N`` for all present
        workers. When a worker is done, we just replace it with None, and
        when a slot is None we can create a new worker for it.
        """

        self.iterator = DAGIterator(self.dag, enable_busy_state=True)
        """Iterator to get ready-to-run units of work."""

        self.job_factory = job_factory
        self.collect_result = collect_result

        self.active_workers = 0
        """Equivalent to the number of non-None slots in ``self.workers``."""

        self.poll_interval = 0.1
        """Time (in seconds) to wait between each round of worker polling."""

        self.dyn_poll_interval = dyn_poll_interval

        self.no_free_item = False
        """
        True if there is work waiting to be executed, False if all work to be
        scheduled depends on work that hasn't completed.
        """

        self.no_work_left = False
        """
        True if we processed all items from ``self.iterator`` (i.e. we got a
        ``StopIteration`` exception from it).
        """

    @property
    def has_free_slots(self) -> bool:
        """Return whether there is a free slot to spawn a worker."""
        return self.active_workers < self.parallelism

    def spawn_worker(self, uid: str, data: WorkData, slot: int) -> None:
        """Create a worker and assign it to the given slot."""
        assert self.workers[slot] is None
        worker = self.job_factory(uid, data, slot)
        self.workers[slot] = worker
        self.active_workers += 1

    def release_worker(self, slot: int) -> None:
        """Release a worker, freeing the corresponding slot."""
        assert self.workers[slot] is not None
        self.workers[slot] = None
        self.active_workers -= 1

    def run(self) -> None:
        """Run the loop to execute all units of work."""
        # Run the main loop until all fragments are started and have
        # completed. We need to wait for started fragments when catching a
        # KeyboardInterrupt exception (user asked to stop, or the testsuite
        # decided to stop because of too many consecutive failures).
        try:
            while self.active_workers > 0 or not self.no_work_left:
                self.poll()

        except KeyboardInterrupt:
            logger.error(
                "Scheduling abortion requested, waiting for all active"
                " workers...")

            # Poll active workers at regular but small interval until they all
            # complete.
            while self.active_workers > 0:
                for slot, worker in enumerate(self.workers):
                    if worker is None:
                        continue

                    # If the worker has completed, release it, but do not
                    # collect it: we are not interested in test results created
                    # after testsuite abortion.
                    still_running = worker.poll(self)
                    if not still_running:
                        self.release_worker(slot)
                        self.iterator.leave(worker.uid)

                time.sleep(0.1)

            # Let the caller know about abnormal interruption
            raise

    def poll(self) -> None:
        # Perform a linear scan to find free slots: allocate a worker for each
        # of them.
        #
        # Note that there is no need to go through this if we already know
        # that:
        #
        # * there is no work left to schedule;
        # * all slots are occupied by workers;
        # * all pending work units depend on non-completed units.
        if (not self.no_work_left and self.has_free_slots
                and not self.no_free_item):
            for slot, worker in enumerate(self.workers):
                if worker is None:
                    # Three possible cases:
                    #
                    # * At least one work unit can be scheduled right now: the
                    #   call to `next` returns non-None results.
                    #
                    # * There are work units left, but they all depend on
                    #   non-completed other units: we get two None values.
                    #
                    # * All work units were scheduled: the call to `next`
                    #   raises a StopIteration exception.
                    try:
                        uid, work_data = next(self.iterator)
                    except StopIteration:
                        self.no_work_left = True
                        break

                    if work_data is None:
                        # All pending work units depend on non-completed units.
                        # There is no need to continue scanning workers.
                        self.no_free_item = True
                        break

                    assert isinstance(uid, str)
                    self.spawn_worker(uid, work_data, slot)

                    # No need to continue scanning if that was the last free
                    # slot.
                    if not self.has_free_slots:
                        break

        # Now, wait for some work units to complete if either:
        #
        # * all worker slots are busy;
        # * all pending work depends on non-completed work;
        # * there is no work left to schedule but some workers are still
        #   running.
        logger.debug("Wait for free worker")
        poll_counter = 0
        while (not self.has_free_slots or self.no_free_item
               or (self.no_work_left and self.active_workers > 0)):
            poll_counter += 1
            for slot, worker in enumerate(self.workers):
                # If the worker has completed, release the corresponding slot
                if worker is not None and not worker.poll(self):
                    self.release_worker(slot)
                    self.iterator.leave(worker.uid)
                    self.no_free_item = False
                    self.collect_result(worker)

            time.sleep(self.poll_interval)

        # Adjust the poll interval if it is dynamic
        if self.dyn_poll_interval:
            self.poll_interval = compute_next_dyn_poll(poll_counter,
                                                       self.poll_interval)
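
A hedged sketch of driving the scheduler above; EchoWorker is a hypothetical
stand-in that provides the only two things the scheduler relies on, a uid
attribute and a poll(scheduler) method returning whether the worker is still
running:

from e3.collection.dag import DAG  # assumed import path, as in e3-core

class EchoWorker:
    """Hypothetical worker that finishes on its first poll."""

    def __init__(self, uid, data, slot):
        self.uid = uid
        self.data = data
        self.slot = slot

    def poll(self, scheduler):
        # False means "no longer running": the scheduler then releases the
        # slot, calls iterator.leave(self.uid) and collects the result.
        return False

results = {}

d = DAG()
d.add_vertex("a", "payload-a")
d.add_vertex("b", "payload-b", predecessors=["a"])

scheduler = MultiprocessScheduler(
    dag=d,
    job_factory=lambda uid, data, slot: EchoWorker(uid, data, slot),
    collect_result=lambda worker: results.update({worker.uid: worker.data}),
)
scheduler.run()
assert set(results) == {"a", "b"}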
Example #9
class Scheduler:
    """Handle parallel execution of interdependent jobs."""

    def __init__(
        self,
        job_provider: JobProviderCallback,
        collect: Optional[CollectCallback] = None,
        queues: Optional[Dict[str, int]] = None,
        tokens: int = 1,
        job_timeout: int = DEFAULT_JOB_MAX_DURATION,
    ):
        """Initialize Scheduler.

        :param job_provider: function that returns instances of Job.
            The function takes as arguments: the job uid, the data
            associated with it, the list of predecessor ids, and a
            notification function called when the job ends.
        :param collect: function that collects results from the jobs.
            If the function returns True then the job is requeued.
        :param queues: describes the list of queues handled by the
            scheduler. The format is a dictionary whose keys are queue
            names and whose values are the max number of tokens available
            at a given time. If empty, a single queue called "default" is
            created. Its size is then given by the tokens parameter.
        :param tokens: number of tokens for the default queue. Relevant
            only when queues is empty.
        :param job_timeout: maximum execution time for a job. The default
            is 24h. If set to None, timeouts are disabled, but this also
            makes the scheduler non-interruptible while waiting for a job
            to finish.
        """
        self.job_provider = job_provider
        self.job_timeout = job_timeout
        if collect is None:
            self.collect: Callable[[Job], bool] = lambda x: False
        else:
            self.collect = collect

        self.active_jobs: List[Job] = []
        self.queued_jobs = 0
        self.all_jobs_queued = False
        self.message_queue: Queue[Any] = Queue()
        self.dag: Optional[DAG] = None
        self.dag_iterator: Optional[DAGIterator] = None
        self.start_time: Optional[datetime] = None
        self.stop_time: Optional[datetime] = None
        self.max_active_jobs = 0

        # Initialize named queues
        self.queues: Dict[str, List[Tuple[int, int, Job]]] = {}
        self.tokens = {}
        self.n_tokens = 0

        if not queues:
            # If no queues are specified, create a default one
            # with no name.
            queues = {"default": tokens}

        self.global_queue_index = 1

        # Create the queues
        for name, max_token in queues.items():
            self.queues[name] = []
            self.tokens[name] = max_token
            self.n_tokens += max_token

        # Create a slot reserve. The goal is to give a given job a number
        # which is unique among the active jobs. We need a maximum of
        # self.n_tokens slots
        self.slots = list(range(self.n_tokens))

    def safe_collect(self, job: Job) -> bool:
        """Protect call to collect.

        This ensures, for jobs such as JobProcess, that there are no calls
        to Run during collect. The main goal is to avoid leaking handles
        from collect to a process spawned by a Job. On Unixes, the
        consequences of such a leak are more a security concern than an
        operational one. On Windows, it can easily lead to file locking
        issues and thus might cause crashes.
        """
        with Job.lock:
            return self.collect(job)

    def safe_job_provider(
        self,
        uid: str,
        data: Any,
        predecessors: FrozenSet[str],
        notify_end: Callable[[str], None],
    ) -> Job:
        """Protect call to job_provider.

        See the safe_collect comment above.
        """
        with Job.lock:
            return self.job_provider(uid, data, predecessors, notify_end)

    @classmethod
    def simple_provider(cls, job_class: Type[Job]) -> JobProviderCallback:
        """Return a simple provider based on a given Job class.

        :param job_class: a subclass of Job
        """

        def provider(
            uid: str,
            data: Any,
            predecessors: FrozenSet[str],
            notify_end: Callable[[str], None],
        ) -> Job:
            del predecessors
            return job_class(uid, data, notify_end)

        return provider

    def init_state(self, dag: DAG) -> None:
        """Reinitialize the scheduler state (internal function).

        :param dag: the DAG representing the list of jobs to execute
        """
        # Active jobs
        self.active_jobs = []

        # Total number of jobs in the queues
        self.queued_jobs = 0

        # Have all jobs been queued?
        self.all_jobs_queued = False

        # Message queue to get job end notifications
        self.message_queue = Queue()

        self.dag = dag
        self.dag_iterator = DAGIterator(dag, enable_busy_state=True)
        self.start_time = datetime.now()
        self.stop_time = None
        self.max_active_jobs = 0

    @property
    def is_finished(self) -> bool:
        """Check if all jobs have been executed (internal).

        :return: True if complete
        """
        # The run is considered finished once there are no more jobs in
        # the DAG or in the queues, and no job is running.
        return self.all_jobs_queued and self.queued_jobs == 0 and not self.active_jobs

    def log_state(self) -> None:
        """Log the current state of the scheduler (internal)."""
        logger.debug(
            "non-ready?: %s, in queue: %s, running: %s",
            not self.all_jobs_queued,
            self.queued_jobs,
            len(self.active_jobs),
        )

    def run(self, dag: DAG) -> None:
        """Launch the scheduler."""
        self.init_state(dag)

        try:
            while not self.is_finished:
                self.enqueue()
                self.launch()
                self.log_state()
                self.max_active_jobs = max(self.max_active_jobs, len(self.active_jobs))
                self.wait()
        except KeyboardInterrupt:
            logger.info("Interrupting jobs...")
            for p in self.active_jobs:
                p.interrupt()
                self.safe_collect(p)
                p.on_finish(self)
            raise KeyboardInterrupt
        self.stop_time = datetime.now()

    def push(self, job: Job) -> None:
        """Push a job into a queue."""
        # We use a tuple here to ensure the stability of the queue
        # when two jobs have similar priorities.
        heapq.heappush(
            self.queues[job.queue_name], (-job.priority, self.global_queue_index, job)
        )
        self.global_queue_index += 1
        self.queued_jobs += 1

    def enqueue(self) -> None:
        """Push into the queues job that are ready (internal)."""
        if self.all_jobs_queued:
            return

        if TYPE_CHECKING:
            assert self.dag_iterator is not None

        try:
            while True:
                uid, data, predecessors = self.dag_iterator.next_element()
                if uid is None:
                    # No more jobs ready
                    return

                job = self.safe_job_provider(
                    typing.cast(str, uid),
                    data,
                    predecessors=typing.cast(typing.FrozenSet[str], predecessors),
                    notify_end=self.message_queue.put,
                )
                if job.should_skip:
                    self.safe_collect(job)
                    job.on_finish(self)
                    self.dag_iterator.leave(uid)
                else:
                    self.push(job)
        except StopIteration:
            self.all_jobs_queued = True

    def launch(self) -> None:
        """Launch next jobs in the queues (internal)."""
        if self.queued_jobs == 0:
            return

        for name in self.queues:
            q = self.queues[name]
            while q and q[0][2].tokens <= self.tokens[name]:
                _, _, next_job = heapq.heappop(q)
                self.queued_jobs -= 1
                next_job.on_start(self)
                next_job.start(slot=self.slots.pop())
                self.tokens[name] -= next_job.tokens
                self.active_jobs.append(next_job)

    def wait(self) -> None:
        """Wait for the end of an active job."""
        if not self.active_jobs:
            return

        if TYPE_CHECKING:
            assert self.dag is not None
            assert self.dag_iterator is not None

        # Wait for a message from one of the active jobs
        while True:
            # The first job in active_jobs is the oldest one; compute the
            # get() timeout based on its startup information.
            first_job = self.active_jobs[0]
            current_timeout = self.job_timeout - first_job.timing_info.duration

            # Ensure waiting time is a positive number
            current_timeout = max(0.1, current_timeout)

            try:
                uid = self.message_queue.get(block=True, timeout=current_timeout)

            except Empty:
                # If the get times out, it means that the oldest job has
                # reached its timeout. Interrupt it and wait for the queue
                # to receive the end notification.
                self.active_jobs[0].interrupt()
                time.sleep(0.1)

            else:
                job_index, job = next(
                    (
                        (index, job)
                        for index, job in enumerate(self.active_jobs)
                        if job.uid == uid
                    )
                )
                ti = job.timing_info
                logger.debug(
                    "job %s %s after %s",
                    uid,
                    "interrupted" if job.interrupted else "finished",
                    ti.duration,
                )
                self.slots.append(job.slot)

                # Release the resources taken by the job
                self.tokens[job.queue_name] += job.tokens
                collect_result = self.safe_collect(job)
                job.on_finish(self)

                if collect_result:
                    # Requeue when needed
                    self.push(job)
                else:
                    # Mark the job as completed
                    if job.uid in self.dag:
                        self.dag_iterator.leave(job.uid)

                del self.active_jobs[job_index]
                return
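
A hedged sketch of running this Scheduler end to end; PrintJob is a
hypothetical Job subclass, and the e3.job / e3.job.scheduler import paths
are assumptions based on e3-core's layout:

from e3.collection.dag import DAG
from e3.job import Job  # assumed module path
from e3.job.scheduler import Scheduler  # assumed module path

class PrintJob(Job):
    """Hypothetical job whose run() just echoes its payload."""

    def run(self):
        print(self.uid, self.data)

d = DAG()
d.add_vertex("a", "payload-a")
d.add_vertex("b", "payload-b", predecessors=["a"])

# simple_provider drops the predecessors argument and builds
# PrintJob(uid, data, notify_end) for each ready vertex.
scheduler = Scheduler(Scheduler.simple_provider(PrintJob), tokens=2)
scheduler.run(d)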
Example #10
File: mainloop.py Project: AdaCore/e3-core
    def __init__(self,
                 item_list,
                 run_job,
                 collect_result,
                 parallelism=None,
                 abort_file=None,
                 dyn_poll_interval=True):
        """Launch loop.

        :param item_list: a list of jobs or a DAG
        :param run_job: a function that takes a job as argument and
            returns the spawned process (:class:`e3.os_process.Run` object).
            Its prototype should be ``func (name, job_info)`` with name the
            job identifier and job_info the related information, passed in a
            tuple (slot_number, job_retry). Note that if you want to take
            advantage of the parallelism, the spawned process should be
            launched in the background (i.e. with bg=True when using
            :class:`e3.os_process.Run`). If run_job returns SKIP_EXECUTION
            instead of a Run object, the mainloop will directly call
            collect_result without waiting.
        :param collect_result: a function called when a job is finished. Its
            prototype should be ``func (name, process, job_info)``. If
            collect_result raises NeedRequeue then the test will be requeued.
            job_info is a tuple: (slot_number, job_nb_retry)
        :param parallelism: number of workers
        :type parallelism: int | None
        :param abort_file: If specified, the loop will abort if the file is
            present
        :type abort_file: str | None
        :param dyn_poll_interval: If True, the interval between polling
            iterations is automatically adjusted. Otherwise it's set to 0.1
            seconds.
        :type dyn_poll_interval: bool
        """
        e = e3.env.Env()
        self.parallelism = e.get_attr("main_options.mainloop_jobs",
                                      default_value=1,
                                      forced_value=parallelism)
        self.abort_file = e.get_attr("main_options.mainloop_abort_file",
                                     default_value=None,
                                     forced_value=abort_file)

        if self.parallelism == 0:
            if e.build.cpu.cores != UNKNOWN:
                self.parallelism = e.build.cpu.cores
            else:
                self.parallelism = 1

        e3.log.debug("start main loop with %d workers (abort on %s)",
                     self.parallelism, self.abort_file)
        self.workers = [None] * self.parallelism
        self.locked_items = [None] * self.parallelism

        if not isinstance(item_list, DAG):
            self.item_list = DAG()
            for index, item in enumerate(item_list):
                self.item_list.add_vertex(str(index), item)
        else:
            self.item_list = item_list

        self.iterator = DAGIterator(self.item_list, enable_busy_state=True)
        self.collect_result = collect_result
        active_workers = 0
        max_active_workers = self.parallelism
        poll_sleep = 0.1
        no_free_item = False

        try:
            while True:
                # Check for abortion
                if self.abort_file is not None and \
                        os.path.isfile(self.abort_file):
                    logger.info('Aborting: file %s has been found',
                                self.abort_file)
                    self.abort()
                    return      # Exit the loop

                # Find free workers
                for slot, worker in enumerate(self.workers):
                    if worker is None:
                        # a worker slot is free, so use it for the next job
                        next_id, next_job = next(self.iterator)
                        if next_job is None:
                            no_free_item = True
                            break
                        else:
                            self.locked_items[slot] = next_id
                            self.workers[slot] = Worker(next_job,
                                                        run_job,
                                                        collect_result,
                                                        slot)
                            active_workers += 1

                poll_counter = 0
                e3.log.debug('Wait for free worker')
                while active_workers >= max_active_workers or no_free_item:
                    # All workers are occupied, so wait for one to finish
                    poll_counter += 1
                    for slot, worker in enumerate(self.workers):
                        if worker is None:
                            continue

                        # Test if the worker is still active and has more
                        # jobs pending
                        if not (worker.poll() or worker.execute_next()):
                            # If not the case free the worker slot
                            active_workers -= 1
                            self.workers[slot] = None
                            self.iterator.leave(self.locked_items[slot])
                            no_free_item = False
                            self.locked_items[slot] = None

                    sleep(poll_sleep)

                if dyn_poll_interval:
                    poll_sleep = compute_next_dyn_poll(poll_counter,
                                                       poll_sleep)

        except (StopIteration, KeyboardInterrupt) as e:
            if e.__class__ == KeyboardInterrupt:
                # Got ^C, abort the mainloop
                logger.error("User interrupt")

            # All the jobs are finished
            while active_workers > 0:
                for slot, worker in enumerate(self.workers):
                    if worker is None:
                        continue

                    # Test if the worker is still active and ignore any
                    # jobs pending
                    try:
                        still_running = worker.poll()
                    except TooManyErrors:
                        still_running = False
                        # We're not spawning more jobs so we can safely
                        # ignore all TooManyErrors exceptions.
                    if not still_running:
                        active_workers -= 1
                        self.workers[slot] = None
                    sleep(0.1)

            if e.__class__ == KeyboardInterrupt:
                self.abort()
                raise

        except TooManyErrors:
            # too many failures, abort the execution
            logger.error("Too many errors, aborting")
            self.abort()
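
A hedged sketch of driving this legacy mainloop; the e3.mainloop and
e3.os.process import paths are assumptions (the docstring above still uses
the older e3.os_process spelling), and note that the whole loop runs inside
the constructor:

from e3.mainloop import MainLoop  # assumed module path
from e3.os.process import Run  # assumed module path

def run_job(name, job_info):
    slot, nb_retry = job_info
    # Spawn in the background (bg=True) so several jobs can run in parallel.
    return Run(["echo", name], bg=True)

def collect_result(name, process, job_info):
    print(name, "exited with status", process.status)

# Passing a plain list: the constructor wraps it in a DAG keyed by index
# and runs every job to completion before returning.
MainLoop(["job-0", "job-1"], run_job, collect_result, parallelism=2)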