Example #1
def process_batch(stage: BatchStage, items: Sequence[DataItem],
                  error_manager: ErrorManager) -> List[Optional[DataItem]]:
    """
    Execute the :meth:`.stage.BatchStage.process_batch` method of a batch stage for a batch of items
    """
    ret: List[Optional[DataItem]] = [None] * len(items)
    to_process = {}
    for i, item in enumerate(items):
        if error_manager.check_critical_errors(item):
            ret[i] = item
        else:
            _logger.debug(f"{stage} is going to process {item}")
            to_process[i] = item
    time1 = time.time()
    try:
        _logger.debug(f"{stage} is processing {len(to_process)} items")
        processed = stage.process_batch(list(to_process.values()))
        _logger.debug(
            f"{stage} has finished processing {len(to_process)} items")
    except Exception as e:
        _logger.debug(
            f"{stage} had failures in processing {len(to_process)} items")
        spent = (time.time() - time1) / (len(to_process) or 1.0)
        for i, item in to_process.items():
            item.set_timing(stage.name, spent)
            error_manager.handle(e, stage, item)
            ret[i] = item
        return ret
    spent = (time.time() - time1) / (len(to_process) or 1.0)
    for n, i in enumerate(to_process.keys()):
        item = processed[n]
        item.set_timing(stage.name, spent)
        ret[i] = item
    return ret
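
A minimal sketch of driving process_batch directly, outside any pipeline. UpperBatch is hypothetical: it assumes BatchStage subclasses implement process_batch(items) and accept the batch size in their constructor, as the test stages on this page (e.g. BatchTextReverser(size=5)) suggest; the stage name used for timings is normally assigned by Pipeline.append_stage.

class UpperBatch(BatchStage):  # hypothetical stage, assuming the BatchStage interface above
    name = "upper"  # normally set by Pipeline.append_stage (assumption)

    def __init__(self, size=10):
        super().__init__(size=size)  # assumed constructor signature

    def process_batch(self, items):
        # upper-case the "text" payload field of every item in the batch
        for item in items:
            item.payload["text"] = item.payload.get("text", "").upper()
        return items

items = []
for text in ("foo", "bar", "baz"):
    item = DataItem()
    item.payload["text"] = text
    items.append(item)

results = process_batch(UpperBatch(), items, ErrorManager())
assert all(r.payload["text"].isupper() for r in results)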
Example #2
def test_batch_errors(caplog):
    pipeline = (_pipeline().set_error_manager(ErrorManager()).set_source(
        RandomTextSource(22)).append_stage("reverser", BatchTextReverser(
            size=5)).append_stage("error", BatchErrorStage(size=3)).build())
    for item in pipeline.run():
        assert item.has_errors()
        assert item.get_timing("reverser")
        assert item.get_timing("error")
        error = next(item.soft_errors())
        assert error.get_exception() is None
        assert str(error) == "test pipeline error"
    assert pipeline.count == 22
    assert any(caplog.records)
    pipeline = (_pipeline().set_error_manager(ErrorManager()).set_source(
        RandomTextSource(28)).append_stage("reverser", BatchTextReverser(
            size=8)).append_stage("error",
                                  BatchErrorStage(size=7)).append_stage(
                                      "duplicator",
                                      BatchTextDuplicator(size=5)).build())
    for item in pipeline.run():
        assert item.has_errors()
        assert item.get_timing("reverser")
        assert item.get_timing("duplicator")
        assert any(k.startswith("text_") for k in item.payload.keys())
        assert item.get_timing("error")
        error = next(item.soft_errors())
        assert error.get_exception() is None
        assert str(error) == "test pipeline error"
    assert pipeline.count == 28
    assert any(caplog.records)
    pipeline = (_pipeline().set_error_manager(ErrorManager()).set_source(
        RandomTextSource(10)).append_stage("reverser", BatchTextReverser(
            size=3)).append_stage("error1",
                                  BatchExceptionStage(size=7)).append_stage(
                                      "error2",
                                      BatchErrorStage(size=1)).build())
    for item in pipeline.run():
        assert item.has_critical_errors()
        assert item.get_timing("reverser")
        assert item.get_timing("error1") >= 0.0003
        assert not item.get_timing("error2")
        for error in item.critical_errors():
            assert isinstance(error.get_exception(), Exception)
            assert (str(error.get_exception()) == "test exception"
                    and str(error) != "test pipeline error")
    assert pipeline.count == 10
    assert any(caplog.records)
    with pytest.raises(Exception):
        pipeline = (_pipeline().set_source(RandomTextSource(10)).append_stage(
            "reverser", BatchTextReverser(size=4)).append_stage(
                "error", BatchExceptionStage(size=3)).build())
        for _ in pipeline.run():
            pass
        assert pipeline.count == 1
Example #3
def test_batch_concurrent_stage_container1():
    manager = Manager()
    source = SourceContainer()
    source.set(ListSource([DataItem() for _ in range(200)]))
    previous = BatchStageContainer("test0", BatchTextGenerator(),
                                   ErrorManager())
    previous.set_previous(source)
    container = BatchConcurrentStageContainer(
        "test2",
        BatchTextReverser(timeout=1.0),
        ErrorManager(),
        manager.Queue,
        lambda: ProcessCounter(manager),
        manager.Event,
    )
    container.set_previous(previous)
    container.run()
    for _ in range(10):
        previous.process()
    items5 = list(_get_items(container))
    assert items5 and all(items5)
    assert container.count() == len(items5)
    for _ in range(11):
        previous.process()
    items6 = list(_get_items(container))
    assert items6 and all(items6)
    assert all(item.payload.get("text") for item in items5)
    assert all(item.payload.get("text") for item in items6)
    assert items5 != items6
    assert not container.is_stopped() and not container.is_terminated()
    container.empty_queues()
    container.terminate()

    container = BatchConcurrentStageContainer(
        "test2",
        BatchTextReverser(timeout=0.0),
        ErrorManager(),
        manager.Queue,
        lambda: ProcessCounter(manager),
        manager.Event,
    )
    container.set_previous(previous)
    container.run()
    queue = container.out_queue
    for item in items6:
        queue.put(item)
    result = list(_get_items(container))
    for i, item in enumerate(items6):
        assert item.payload == result[i].payload, "On item {}".format(i)
    container.terminate()
Example #4
def test_batch_concurrent_stage_container2():
    manager = Manager()
    source = SourceContainer()
    items = [DataItem() for _ in range(10)]
    for item in items:
        item.payload["text"] = "something"
    source.set(ListSource(items))
    container = BatchConcurrentStageContainer(
        "test3",
        BatchTextGenerator(),
        ErrorManager(),
        manager.Queue,
        lambda: ProcessCounter(manager),
        manager.Event,
    )
    container.set_previous(source)
    container.run()
    for _ in range(10):
        source.pop_into_queue()
    time.sleep(2)
    assert list(_get_items(container))
    container.terminate()
    source.prepend_item(None)
    time.sleep(1)
    assert container.is_terminated()
    container.shutdown()
Example #5
def batch_stage_executor(
    stage: BatchStage,
    in_queue: ItemsQueue,
    out_queue: ItemsQueue,
    error_manager: ErrorManager,
    terminated: Event,
    has_started_counter: ConcurrentCounter,
    counter: ConcurrentCounter,
):
    """
    Consume items in batches from an input queue, process them and put them in an output queue, indefinitely,
    until a termination event is set
    """
    if isinstance(counter, ProcessCounter):
        # call these only if the stage and the error manager are copies of the original,
        # ergo this executor is running in a child process
        error_manager.on_start()
        stage.on_start()
    has_started_counter += 1
    while True:
        if terminated.is_set() and in_queue.empty():
            return
        items = []
        try:
            for _ in range(stage.size):
                item = in_queue.get(block=True, timeout=stage.timeout)
                # give priority to the Stop event item
                if isinstance(item, Stop):
                    out_queue.put(item, block=True)
                elif item is not None:
                    items.append(item)
                in_queue.task_done()
        except queue.Empty:
            if not any(items):
                continue
        if any(items):
            try:
                items = process_batch(stage, items, error_manager)
            except Exception as e:
                raise e
            else:
                for item in items:
                    if item is not None:
                        out_queue.put(item, block=True)
                        if not isinstance(item, Stop):
                            counter += 1
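
A hedged sketch of wiring batch_stage_executor by hand on plain threading and queue primitives. BatchTextReverser is a test helper referenced elsewhere on this page, assumed importable and assumed to accept both keyword arguments (the tests above show size= and timeout= separately). _Counter is a stand-in for the library's thread counter (an assumption): the executor only needs counter += 1, and a counter that is not a ProcessCounter skips the child-process on_start calls. The fragment after stage_executor below shows feeding and shutdown with the same primitives.

import queue
import threading

class _Counter:  # stand-in for the library's ThreadCounter (assumption)
    def __init__(self):
        self.value = 0
        self._lock = threading.Lock()

    def __iadd__(self, n):
        # returning self makes `counter += 1` work on a shared instance
        with self._lock:
            self.value += n
        return self

in_queue, out_queue = queue.Queue(), queue.Queue()
terminated = threading.Event()
worker = threading.Thread(
    target=batch_stage_executor,
    args=(BatchTextReverser(size=5, timeout=1.0), in_queue, out_queue,
          ErrorManager(), terminated, _Counter(), _Counter()),
    daemon=True,
)
worker.start()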
Example #6
def process(stage: Stage, item: DataItem,
            error_manager: ErrorManager) -> DataItem:
    """
    Execute the :meth:`.stage.Stage.process` method of a stage for an item
    """
    if error_manager.check_critical_errors(item):
        return item
    time1 = time.time()
    try:
        _logger.debug(f"{stage} is processing {item}")
        processed_item = stage.process(item)
        _logger.debug(f"{stage} has finished processing {processed_item}")
    except Exception as e:
        _logger.debug(f"{stage} has failed processing {item}")
        item.set_timing(stage.name, time.time() - time1)
        error_manager.handle(e, stage, item)
        return item
    # this can't be in a finally, otherwise it would register the `error_manager.handle` time
    processed_item.set_timing(stage.name, time.time() - time1)
    return processed_item
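
The single-item analogue of the batch sketch above. Shout is hypothetical and assumes the Stage interface used here: a process(item) method that returns the item, plus a name available for timings (normally assigned by Pipeline.append_stage).

class Shout(Stage):  # hypothetical, assuming the Stage interface above
    name = "shout"  # normally set by Pipeline.append_stage (assumption)

    def process(self, item):
        item.payload["text"] = item.payload.get("text", "") + "!"
        return item

item = DataItem()
item.payload["text"] = "hello"
out = process(Shout(), item, ErrorManager())
assert out.payload["text"] == "hello!"
assert out.get_timing("shout") is not None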
Example #7
def stage_executor(
    stage: Stage,
    in_queue: ItemsQueue,
    out_queue: ItemsQueue,
    error_manager: ErrorManager,
    terminated: Event,
    has_started_counter: ConcurrentCounter,
    counter: ConcurrentCounter,
):
    """
    Consume items from an input queue, process them and put them in an output queue, indefinitely,
    until a termination event is set
    """
    if isinstance(counter, ProcessCounter):
        # call these only if the stage and the error manager are copies of the original,
        # ergo this executor is running in a child process
        error_manager.on_start()
        stage.on_start()
    has_started_counter += 1
    while True:
        if terminated.is_set() and in_queue.empty():
            return
        try:
            item = in_queue.get(block=True, timeout=CONCURRENCY_WAIT)
        except queue.Empty:
            continue
        if isinstance(item, Stop):
            out_queue.put(item, block=True)
            in_queue.task_done()
        elif item is not None:
            try:
                item = process(stage, item, error_manager)
            except Exception as e:
                raise e
            else:
                if item is not None:
                    out_queue.put(item, block=True)
                    if not isinstance(item, Stop):
                        counter += 1
            finally:
                in_queue.task_done()
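
stage_executor can be wired exactly like batch_stage_executor (this fragment reuses the _Counter stand-in and the queue/threading imports from the batch sketch above) and illustrates the shutdown contract: the executor returns only once the termination event is set and its input queue has been drained. TextReverser is the test helper used throughout this page, assumed importable and assumed to reverse the "text" payload field.

in_q, out_q = queue.Queue(), queue.Queue()
stop = threading.Event()
w = threading.Thread(
    target=stage_executor,
    args=(TextReverser(), in_q, out_q, ErrorManager(), stop,
          _Counter(), _Counter()),
    daemon=True,
)
w.start()
item = DataItem()
item.payload["text"] = "hello"
in_q.put(item)
stop.set()  # the executor exits once in_q is also empty
w.join()
print(out_q.get().payload["text"])  # "olleh", assuming TextReverser reverses the text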
Example #8
def test_manager(caplog):
    manager = ErrorManager()
    stage = TextReverser()
    item = DataItem()
    manager.handle(SoftError(), stage, item)
    assert any(caplog.records)
    assert item.has_errors()
    assert not item.has_critical_errors()
    item = DataItem()
    manager.handle(CriticalError(), stage, item)
    assert not item.has_errors()
    assert item.has_critical_errors()
    assert any(caplog.records)
    item = DataItem()
    manager.handle(ValueError(), stage, item)
    manager.handle(KeyError(), stage, item)
    manager.handle(KeyError(), stage, item)
    assert not item.has_errors()
    assert item.has_critical_errors()
    assert len(list(item.critical_errors())) == 3
    for record in caplog.records:
        assert "has generated an error" in record.message
Example #9
def __init__(
    self,
    max_init_workers: Optional[int] = None,
    max_queues_size: int = MAX_QUEUES_SIZE,
):
    """
    :param max_init_workers: Number of workers to use for concurrent initialization of stages, defaults to the number of CPUs
    :param max_queues_size: Maximum size of any queue instantiated for the pipeline (stage input and output queues)
    """
    self._containers = LastOrderedDict()
    self._error_manager = ErrorManager()
    self._max_init_workers = max_init_workers
    self._init_executor = None
    self._wait_previous_executor = None
    self._pipeline_executor = None
    self._max_queues_size = max_queues_size
    self._out_queue = None
    self._enqueue_source = False
    self._sync_manager = None
    # an empty source, to which we can only occasionally send items
    self._source_container = SourceContainer()
    self._count = 0
    self._executors_ready = False
Example #10
def test_batch_stage_container2():
    source = SourceContainer()
    container = BatchStageContainer("test1", BatchTextReverser(),
                                    ErrorManager())
    items = [DataItem() for _ in range(10)]
    for item in items:
        item.payload["text"] = "something"
    source.set(ListSource(items))
    container.set_previous(source)
    processed = container.process()
    assert len(processed) == 10 and not any(
        isinstance(item, Stop) for item in processed)
    reprocessed = container.process()
    assert any(isinstance(item, Stop) for item in reprocessed)
    assert container.is_stopped() and not container.is_terminated()
Example #11
def test_batch_stage_container1():
    manager = Manager()
    simple_item = DataItem()
    simple_item.payload["text"] = "hello world"
    source = SourceContainer()
    source.set(ListSource([DataItem() for _ in range(200)]))
    previous = BatchStageContainer("test0", BatchTextGenerator(),
                                   ErrorManager())
    previous.set_previous(source)
    container = BatchStageContainer("test1", BatchTextReverser(),
                                    ErrorManager())
    container.set_previous(previous)
    previous.process()
    items1 = container.process()
    assert len(items1) == container.count()
    items2 = list(_get_items(container))
    assert all(items1) and all(items2)
    assert all(item.payload.get("text") for item in items1)
    assert all(item.payload.get("text") for item in items2)
    assert items1 == items2
    previous.process()
    items3 = container.process()
    items4 = list(_get_items(container))
    assert all(items3) and all(items4)
    assert all(item.payload.get("text") for item in items3)
    assert all(item.payload.get("text") for item in items4)
    assert items1 != items3
    assert items3 == items4
    assert not container.is_stopped() and not container.is_terminated()
    container.init_queue(manager.Queue)
    queue = container.out_queue
    for item in items4:
        queue.put(item)
    result = list(_get_items(container))
    for i, item in enumerate(items4):
        assert item.payload == result[i].payload
Example #12
def test_stage_container():
    manager = Manager()
    simple_item = DataItem()
    simple_item.payload["text"] = "hello world"
    source = SourceContainer()
    source.set(ListSource([DataItem() for _ in range(20)]))
    previous = StageContainer("test0", TextGenerator(), ErrorManager())
    previous.set_previous(source)
    container = StageContainer("test1", TextReverser(), ErrorManager())
    container.set_previous(previous)
    previous.process()
    assert container.count() == 0
    item1 = container.process()
    item2 = container.get_processed()
    assert item1 and item2
    assert item1 == item2
    assert container.count() == 1
    previous.process()
    item3 = container.process()
    item4 = container.get_processed()
    assert container.count() == 2
    assert item3 and item4
    assert item1 != item3
    assert item3 == item4
    assert not container.is_stopped() and not container.is_terminated()
    container.init_queue(manager.Queue)
    queue = container.out_queue
    queue.put(item4)
    assert item4.payload == container.get_processed().payload
    source = SourceContainer()
    source.set(ListSource([simple_item]))
    container.set_previous(source)
    assert container.process()
    assert isinstance(container.process(), Stop)
    assert container.is_stopped() and not container.is_terminated()

    container = ConcurrentStageContainer(
        "test2",
        TextReverser(),
        ErrorManager(),
        manager.Queue,
        lambda: ProcessCounter(manager),
        manager.Event,
    )
    container.set_previous(previous)
    container.run()
    previous.process()
    item5 = container.get_processed(block=True)
    assert item5
    previous.process()
    item6 = container.get_processed(block=True)
    assert item6
    assert item5 != item6
    assert not container.is_stopped() and not container.is_terminated()
    queue = container.out_queue
    queue.put(item6)
    assert item6.payload == container.get_processed().payload
    container.terminate()
    container.shutdown()

    source = SourceContainer()
    source.set(ListSource([simple_item]))
    container = ConcurrentStageContainer(
        "test2",
        TextReverser(),
        ErrorManager(),
        manager.Queue,
        lambda: ProcessCounter(manager),
        manager.Event,
    )
    container.set_previous(source)
    container.run()
    source.pop_into_queue()
    assert container.get_processed(block=True)
    source.pop_into_queue()
    assert isinstance(container.get_processed(block=True), Stop)
    container.terminate()
    source.prepend_item(None)
    time.sleep(1)
    assert container.is_terminated()
    container.shutdown()
Example #13
def _pipeline(*args, **kwargs):
    return Pipeline(*args, **kwargs).set_error_manager(
        ErrorManager().raise_on_critical_error())
Example #14
def test_critical_errors(caplog):
    stage = TextReverser()
    manager = ErrorManager()
    item = DataItem()
    error = CriticalError()
    error.with_exception(Exception())
    managed_critical_error = manager.handle(error, stage, item)
    assert not item.has_errors()
    assert item.has_critical_errors()
    assert isinstance(
        next(item.critical_errors()).get_exception(),
        type(managed_critical_error.get_exception()),
    )
    assert any(caplog.records)
    manager = ErrorManager().raise_on_critical_error()
    item = DataItem()
    with pytest.raises(CriticalError):
        manager.handle(CriticalError(), stage, item)
    with pytest.raises(Exception):
        error = CriticalError().with_exception(Exception())
        manager.handle(error, stage, item)
    assert any(caplog.records)
    assert item.has_critical_errors()
    assert not item.has_errors()
    manager = ErrorManager().no_skip_on_critical_error()
    item = DataItem()
    assert manager.handle(CriticalError(), stage, item) is None
    assert not item.has_errors()
    assert item.has_critical_errors()
    assert any(caplog.records)
    item = DataItem()
    manager.handle(ValueError(), stage, item)
    manager.handle(KeyError(), stage, item)
    manager.handle(KeyError(), stage, item)
    assert not item.has_errors()
    assert item.has_critical_errors()
    assert len(list(item.critical_errors())) == 3
    for record in caplog.records:
        assert "has generated an error" in record.message
Example #15
class Pipeline:
    def __init__(
        self,
        max_init_workers: Optional[int] = None,
        max_queues_size: int = MAX_QUEUES_SIZE,
    ):
        """
        :param max_init_workers: Number of workers to use for concurrent initialization of stages, defaults to the number of CPUs
        :param max_queues_size: Maximum size of any queue instantiated for the pipeline (stage input and output queues)
        """
        self._containers = LastOrderedDict()
        self._error_manager = ErrorManager()
        self._max_init_workers = max_init_workers
        self._init_executor = None
        self._wait_previous_executor = None
        self._pipeline_executor = None
        self._max_queues_size = max_queues_size
        self._out_queue = None
        self._enqueue_source = False
        self._sync_manager = None
        # an empty source, to which we can only occasionally send items
        self._source_container = SourceContainer()
        self._count = 0
        self._executors_ready = False

    def _new_mp_queue(self) -> ItemsQueue:
        """
        Construct queue for multiprocessing communication
        """
        if self._sync_manager is None:
            self._sync_manager = Manager()
        return self._sync_manager.Queue(maxsize=self._max_queues_size)

    def _new_queue(self) -> ItemsQueue:
        """
        Construct queue for communication
        """
        return Queue(maxsize=self._max_queues_size)

    def _new_mp_event(self) -> Event:
        """
        Construct synchronization event for multiprocessing
        """
        if self._sync_manager is None:
            self._sync_manager = Manager()
        return self._sync_manager.Event()

    @staticmethod
    def _new_event() -> Event:
        """
        Construct synchronization event
        """
        return Event()

    def _new_mp_counter(self) -> ProcessCounter:
        """
        Construct a safe counter for multiprocessing
        """
        if self._sync_manager is None:
            self._sync_manager = Manager()
        return ProcessCounter(self._sync_manager)

    @staticmethod
    def _new_counter() -> ThreadCounter:
        """
        Construct a safe counter for threads
        """
        return ThreadCounter()

    def _wait_executors(self, wait_seconds: float = CONCURRENCY_WAIT):
        """
        Wait for all containers to start

        :param wait_seconds: Seconds to repeatedly sleep while waiting for all stage initializers to finish
        """
        if self._executors_ready:
            return
        if self._init_executor is not None:
            self._init_executor.shutdown(wait=True)
            self._init_executor = None
        while not all(self._containers.values()):
            time.sleep(wait_seconds)
        self._wait_previous_executor.shutdown(wait=True)
        for name, container in self._containers.items():
            if isinstance(
                    container,
                (ConcurrentStageContainer, BatchConcurrentStageContainer)):
                container.run()
        # finalize initialization of the error manager shared by this and other stage threads
        self._error_manager.on_start()
        self._executors_ready = True
        _logger.debug("Pipeline ready to run")

    def shutdown(self):
        if self._out_queue is not None:
            self._out_queue.join()
        # if self._init_executor is not None:
        #     self._init_executor.shutdown()
        # FIXME stage shutdown may raise exception, the executor gets stuck
        # for name, stage in self._containers.items():
        #     if isinstance(stage, (ConcurrentStageContainer, BatchConcurrentStageContainer)):
        #         stage.shutdown()
        if self._sync_manager is not None:
            self._sync_manager.shutdown()

    def __del__(self):
        self.shutdown()

    def build(self) -> Pipeline:
        """
        Pipeline builder method
        """
        if not any(self._containers):
            raise ValueError("Must append at least a stage")
        _logger.debug(f"Building the pipeline on stages: {self._log_stages()}")
        self._wait_executors()
        return self

    def run(self) -> Generator[DataItem, None, None]:
        """
        Run the pipeline given a source and a concatenation of stages.
        Get the sequence of items through iteration

        :return: Iterator over processed items
        :raises ValueError: When a source has not been set for the pipeline
        """
        if not self._source_container.is_set():
            raise ValueError("Set the data source for this pipeline")
        _logger.debug(f"Running the pipeline on stages: {self._log_stages()}")
        counter = 0
        last_stage_name = self._last_stage_name()
        terminator_thread = None
        source_thread = None
        # in case the first stage is concurrent
        if self._enqueue_source:
            source_thread = Thread(
                target=self._source_container.pop_into_queue)
            source_thread.start()
        while True:
            for name, container in self._containers.items():
                try:
                    # concurrent stages run by themselves in threads/processes
                    if not isinstance(
                            container,
                        (ConcurrentStageContainer,
                         BatchConcurrentStageContainer),
                    ):
                        container.process()
                    # but we must periodically check for errors
                    else:
                        container.check_errors()
                except Exception as e:
                    self.stop()
                    # TODO in case of errors we lose pending items!
                    self._terminate_all(force=True)
                    self.shutdown()
                    self._count += 1
                    raise e
                # retrieve finally processed items from the last stage
                if name == last_stage_name:
                    for _ in range(container.size if isinstance(
                            container, BatchStageContainer) else 1):
                        item = container.get_processed()
                        if item is not None:
                            if not isinstance(item, Stop):
                                yield item
                                counter += 1
                                self._count += 1
                            # if a stop is finally signaled, start termination of all containers
                            elif (not self._all_terminated()
                                  and terminator_thread is None):
                                terminator_thread = Thread(
                                    target=self._terminate_all)
                                terminator_thread.start()
                        # an item is None if the final output queue is empty
                        else:
                            break
            # exit the loop only when all items have been returned
            if self._all_empty() and counter >= self._source_container.count():
                if source_thread is not None:
                    source_thread.join()
                if terminator_thread is not None:
                    terminator_thread.join()
                    self.shutdown()
                return

    @property
    def count(self) -> int:
        """
        Get the number of items processed across all executed runs, including items that have failed

        :return: Count of processed items
        """
        return self._count

    def _terminate_all(self,
                       force: bool = False,
                       wait_seconds: float = CONCURRENCY_WAIT):
        """
        Terminate all running containers

        :param force: If True do not wait for a container to process all items produced by the source
        :param wait_seconds: Seconds to wait between checks of a container for its termination
        """
        _logger.debug("Terminating the pipeline")
        # scroll the pipeline by its order and terminate stages after the relative queues are empty
        for container in self._containers.values():
            if not force:
                # ensure the stage has processed all source items
                while container.count() < self._source_container.count():
                    time.sleep(wait_seconds)
            container.terminate()
            if isinstance(container, ConcurrentStageContainer):
                if force:
                    # empty the queues, losing pending items
                    container.empty_queues()
                while not container.is_terminated():
                    time.sleep(wait_seconds)
                container.queues_join()
                while not container.queues_empty():
                    time.sleep(wait_seconds)
        _logger.debug("Termination done")

    def _all_terminated(self) -> bool:
        """
        Check if all containers have been signaled to terminate and have exited
        """
        return all(container.is_terminated()
                   for container in self._containers.values())

    def _all_empty(self) -> bool:
        """
        Check if all containers are terminated and there are no items left in the queues
        """
        return self._all_terminated() and all(
            container.queues_empty()
            for container in self._containers.values()
            if isinstance(container, (ConcurrentStageContainer,
                                      BatchConcurrentStageContainer)))

    def process(self, item: DataItem) -> DataItem:
        """
        Process a single item synchronously (no concurrency) through the pipeline
        """
        _logger.debug(f"Processing {item} on stages: {self._log_stages()}")
        last_stage_name = self._containers.last_key()
        self._source_container.prepend_item(item)
        for name, container in self._containers.items():
            container.process()
            if name == last_stage_name:
                return container.get_processed(block=True)

    def process_async(self,
                      item: DataItem,
                      callback: Optional[Callable[[DataItem], Any]] = None):
        """
        Process a single item asynchronously through the pipeline, stages may run concurrently.
        The call returns immediately, processed items are retrieved with :meth:`.Pipeline.get_item`

        :param callback: A function to call after a successful process of the item
        """
        _logger.debug(
            f"Processing asynchronously {item} on stages: {self._log_stages()}"
        )
        if callback is not None:
            item.set_callback(callback)
        self._source_container.prepend_item(item)
        self._start_pipeline_executor()

    def stop(self):
        """
        Tell the source to stop generating items, and consequently stop the pipeline
        """
        self._source_container.stop()

    def get_item(self, block: bool = True) -> DataItem:
        """
        Get a single item from the asynchronous execution of the pipeline on single items from :meth:`.Pipeline.process_async`

        :param block: If True wait indefinitely for the next processed item
        :raises ValueError: When there is no output queue set, i.e. the pipeline is not running asynchronously
        :raises queue.Empty: When we do not block and the queue is empty
        """
        if self._out_queue is not None:
            item = self._out_queue.get(block)
            self._out_queue.task_done()
            return item
        else:
            raise ValueError(
                "No pipeline is running asynchronously, not item can be retrieved from the output queue"
            )

    def set_source(self, source: Source) -> Pipeline:
        """
        Set the source of the pipeline: a subclass of :class:`.stage.Source`
        """
        self._source_container.set(source)
        return self

    def set_error_manager(self, error_manager: ErrorManager) -> Pipeline:
        """
        Set the error manager for handling errors from each stage item processing
        """
        self._error_manager = error_manager
        for container in self._containers.values():
            container.set_error_manager(self._error_manager)
        return self

    def _last_stage_name(self) -> str:
        if self._containers:
            return self._containers.last_key()

    def _last_container(self) -> BaseContainer:
        if self._containers:
            return self._containers[self._last_stage_name()]
        else:
            return self._source_container

    def _wait_for_previous(
        self,
        container: ConnectedStageMixin,
        last_stage_name: str,
        wait_seconds: float = CONCURRENCY_WAIT,
    ):
        """
        Given a container we want to append to the pipeline, wait for the last one (added to the pipeline) to be created

        :param container: A container to add to the pipeline
        :param last_stage_name: Name of the last stage currently in the pipeline
        :param wait_seconds: Seconds to repeatedly wait for the construction of the container of the last stage in the pipeline
        """
        def _waiter():
            if last_stage_name is not None:
                while self._containers[last_stage_name] is None:
                    time.sleep(wait_seconds)
                container.set_previous(self._containers[last_stage_name])
            else:
                container.set_previous(self._source_container)

        executor = self._get_wait_previous_executor()
        executor.submit(_waiter)

    def _build_container(self, name: str, stage: StageType, concurrency: int,
                         parallel: bool) -> BaseContainer:
        """
        Get a new container instance according to the pipeline configuration

        :param name: Stage name
        :param stage: A stage instance
        :param concurrency: Number of concurrent stage executions, if 0 then just create the non-concurrent containers
        :param parallel: If True use multiprocessing, otherwise threads
        """
        if concurrency <= 0:
            constructor = (BatchStageContainer if isinstance(
                stage, BatchStage) else StageContainer)
            # if not concurrent we must explicitly finalize initialization of this single stage object
            stage.on_start()
            return constructor(name, stage, self._error_manager)
        else:
            constructor = (BatchConcurrentStageContainer if isinstance(
                stage, BatchStage) else ConcurrentStageContainer)
            if parallel:
                return constructor(
                    name,
                    stage,
                    self._error_manager,
                    self._new_mp_queue,
                    self._new_mp_counter,
                    self._new_mp_event,
                    concurrency,
                    parallel,
                )
            else:
                # if the stage is executed on multiple threads we must finalize initialization once,
                # while on multiprocessing each process executor calls it for its own copy of the stage
                stage.on_start()
                return constructor(
                    name,
                    stage,
                    self._error_manager,
                    self._new_queue,
                    self._new_counter,
                    self._new_event,
                    concurrency,
                    parallel,
                )

    def get_stage(self, name: str) -> StageType:
        """
        Get a stage instance by its name
        """
        return self._containers.get(name).stage

    def append_stage(
        self,
        name: str,
        stage: StageType,
        concurrency: int = 0,
        parallel: bool = False,
    ) -> Pipeline:
        """
        Append a stage to the pipeline just after the last one appended, or after the source if it is the first stage

        :param name: Name that identifies the stage in the pipeline; it is also set on the stage and must be unique in the pipeline
        :param stage: Instance of a stage
        :param concurrency: Number of concurrent stage executions, if 0 then threads/processes won't be involved for this stage
        :param parallel: If True use multiprocessing, otherwise threads
        """
        self._executors_ready = False
        # FIXME here we force a BatchStage to run on a thread, though we would prefer to leave it on the main thread
        if concurrency < 1 and isinstance(stage, BatchStage):
            parallel = False
            concurrency = 1
        self._check_stage_name(name)
        container = self._build_container(name, stage, concurrency, parallel)
        if concurrency > 0:
            # if it is concurrent and it is the first stage, make the source work on an output queue
            if not self._containers:
                self._enqueue_source = True
        self._wait_for_previous(
            container,
            self._last_stage_name())  # wait until the previous stage is initialized
        self._containers[name] = container
        return self

    def append_stage_concurrently(
        self,
        name: str,
        stage_class: Callable,
        args: Sequence = None,
        kwargs: Mapping = None,
        concurrency: int = 0,
        parallel: bool = False,
    ) -> Pipeline:
        """
        Append a stage class to the pipeline just after the last one appended, or after the source if it is the first stage.
        The stage construction will be executed concurrently with respect to the overall pipeline construction

        :param name: Name that identifies the stage in the pipeline; it is also set on the stage and must be unique in the pipeline
        :param stage_class: Class of a stage
        :param args: List of arguments for the stage constructor
        :param kwargs: Dictionary of keyword arguments for the stage constructor
        :param concurrency: Number of concurrent stage executions, if 0 then threads/processes won't be involved for this stage
        :param parallel: If True use multiprocessing, otherwise threads
        """
        self._executors_ready = False
        # FIXME here we force a BatchStage to run on a thread, though we would prefer to leave it on the main thread
        if concurrency < 1 and issubclass(stage_class, BatchStage):
            parallel = False
            concurrency = 1
        if kwargs is None:
            kwargs = {}
        if args is None:
            args = []
        self._check_stage_name(name)
        # if it is concurrent and it is the first stage, make the source work on an output queue
        if concurrency > 0 and not self._containers:
            self._enqueue_source = True
        last_stage_name = self._last_stage_name()
        # set it immediately so the order of the calls of this method is followed in `_containers`
        self._containers[name] = None
        future = self._get_init_executor(parallel).submit(
            stage_class, *args, **kwargs)

        def append_stage(stage_future: Future):
            stage = stage_future.result()
            container = self._build_container(name, stage, concurrency,
                                              parallel)
            self._wait_for_previous(container, last_stage_name)
            self._containers[name] = container

        future.add_done_callback(append_stage)
        return self

    def _get_init_executor(self, parallel: bool = False) -> Executor:
        """
        Get a pool executor for concurrent stage initialization

        :param parallel: True if the executor uses multiprocessing, otherwise threads
        """
        if self._init_executor is None:
            executor = ThreadPoolExecutor if not parallel else ProcessPoolExecutor
            self._init_executor = executor(max_workers=self._max_init_workers)
        return self._init_executor

    def _get_wait_previous_executor(self) -> Executor:
        """
        Get a pool executor for the function that will repeatedly wait for a container to be ready
        """
        if self._wait_previous_executor is None:
            self._wait_previous_executor = ThreadPoolExecutor()
        return self._wait_previous_executor

    def _start_pipeline_executor(self) -> Thread:
        """
        Get a thread on which to run a pipeline that accepts asynchronous processing of single items
        """
        if self._pipeline_executor is None:
            self._init_out_queue()

            def pipeline_runner():
                for item in self.run():
                    item.callback()
                    self._out_queue.put(item)

            self._pipeline_executor = Thread(target=pipeline_runner,
                                             daemon=True)
            self._pipeline_executor.start()
        return self._pipeline_executor

    def _check_stage_name(self, name: str):
        """
        Check if a stage name is not already defined in the pipeline
        :raises ValueError: Stage name is already defined in the pipeline
        """
        if name in self._containers:
            raise ValueError(
                f"The stage name {name} is already used in this pipeline")

    def _init_out_queue(self):
        """
        Construct the internal output queue for asynchronous processing of single items
        """
        self._out_queue = self._new_queue()

    def _log_stages(self):
        return ", ".join(self._containers.keys())
Example #16
def test_errors(caplog):
    pipeline = (_pipeline().set_error_manager(ErrorManager()).set_source(
        RandomTextSource(10)).append_stage("reverser",
                                           TextReverser()).append_stage(
                                               "error", ErrorStage()).build())
    for item in pipeline.run():
        assert item.has_errors()
        assert item.get_timing("reverser")
        assert item.get_timing("error")
        error = next(item.soft_errors())
        assert error.get_exception() is None
        assert str(error) == "test pipeline error"
    assert all("stage error has generated an error" in record.msg.lower()
               for record in caplog.records if record.levelno == logging.ERROR)
    assert pipeline.count == 10
    pipeline = (_pipeline().set_error_manager(ErrorManager()).set_source(
        RandomTextSource(10)).append_stage("reverser",
                                           TextReverser()).append_stage(
                                               "error",
                                               ErrorStage()).append_stage(
                                                   "duplicator",
                                                   TextDuplicator()).build())
    caplog.clear()
    for item in pipeline.run():
        assert item.has_errors()
        assert item.get_timing("reverser")
        assert item.get_timing("duplicator")
        assert any(k.startswith("text_") for k in item.payload.keys())
        assert item.get_timing("error")
        error = next(item.soft_errors())
        assert error.get_exception() is None
        assert str(error) == "test pipeline error"
    assert all("stage error has generated an error" in record.msg.lower()
               for record in caplog.records if record.levelno == logging.ERROR)
    assert pipeline.count == 10
    pipeline = (_pipeline().set_error_manager(ErrorManager()).set_source(
        RandomTextSource(10)).append_stage("reverser",
                                           TextReverser()).append_stage(
                                               "error1",
                                               CriticalIOErrorStage()).build())
    for item in pipeline.run():
        assert item.has_critical_errors()
        assert item.get_timing("reverser")
        assert item.get_timing("error1")
        for error in item.critical_errors():
            assert isinstance(error.get_exception(), IOError)
            assert str(error) == "test pipeline critical IO error"
    pipeline = (_pipeline().set_error_manager(ErrorManager()).set_source(
        RandomTextSource(10)).append_stage("reverser",
                                           TextReverser()).append_stage(
                                               "error1",
                                               ExceptionStage()).append_stage(
                                                   "error2",
                                                   ErrorStage()).build())
    caplog.clear()
    for item in pipeline.run():
        assert item.has_critical_errors()
        assert item.get_timing("reverser")
        assert item.get_timing("error1")
        assert not item.get_timing("error2")
        for error in item.critical_errors():
            assert isinstance(error.get_exception(), Exception)
            assert (str(error) == "test exception"
                    and str(error.get_exception()) == "test exception"
                    and str(error) != "test pipeline error")
    assert all("stage error1 has generated an error" in record.msg.lower()
               for record in caplog.records if record.levelno == logging.ERROR)
    assert pipeline.count == 10
    with pytest.raises(Exception):
        pipeline = (_pipeline().set_source(RandomTextSource(10)).append_stage(
            "reverser", TextReverser()).append_stage("error",
                                                     ExceptionStage()).build())
        try:
            for _ in pipeline.run():
                pass
        except Exception:
            assert 'Exception("test exception")' in traceback.format_exc()
            raise
        assert pipeline.count == 1
    pipeline = (_pipeline().set_error_manager(
        ErrorManager().no_skip_on_critical_error()).set_source(
            RandomTextSource(10)).append_stage(
                "reverser1", TextReverser()).append_stage(
                    "error",
                    ExceptionStage()).append_stage("reverser2",
                                                   TextReverser()).build())
    for item in pipeline.run():
        assert item.get_timing("reverser1")
        assert item.get_timing("error")
        assert item.get_timing("reverser2")
    assert pipeline.count == 10