def __init__(
    self,
    f,
    workers,
    maxsize,
    on_start,
    on_done,
    dependencies,
    timeout,
):
    """
    Store the stage configuration and reset every build-time field.

    Arguments:
        f: The callable applied by the stage (may be falsy).
        workers: Stored as `self.workers`.
        maxsize: Stored as `self.maxsize`.
        on_start: Optional callback stored as `self.on_start`.
        on_done: Optional callback stored as `self.on_done`.
        dependencies: Stored as `self.dependencies`.
        timeout: Stored as `self.timeout`.
    """
    # user supplied configuration
    self.f, self.workers, self.maxsize = f, workers, maxsize
    self.on_start, self.on_done = on_start, on_done
    self.timeout = timeout
    self.dependencies = dependencies
    self.output_queues = utils.MultiQueue()

    # Result of `pypeln_utils.function_args` for each callable; an empty
    # set is used whenever the callable is falsy.
    if self.f:
        self.f_args = pypeln_utils.function_args(self.f)
    else:
        self.f_args = set()

    if self.on_start:
        self.on_start_args = pypeln_utils.function_args(self.on_start)
    else:
        self.on_start_args = set()

    if self.on_done:
        self.on_done_args = pypeln_utils.function_args(self.on_done)
    else:
        self.on_done_args = set()

    # build fields: all start as None and are assigned elsewhere
    for build_field in (
        "input_queue",
        "stage_namespace",
        "stage_lock",
        "pipeline_namespace",
        "pipeline_error_queue",
        "pipeline_stages",
        "loop",
    ):
        setattr(self, build_field, None)
def __call__(self):
    """
    Run this worker: `on_start`, then `process_fn`, then `on_done`.

    The dict returned by `on_start` (or an empty dict) is extended with
    `worker_info` and, before `on_done` runs, with `stage_status`. Each
    callable only receives the entries whose key is listed among its
    declared arguments.

    `pypeln_utils.StopThreadException` is swallowed; any other exception is
    handed to `self.main_queue.raise_exception`. In every case the worker is
    flagged as done and the output queues are notified.
    """
    worker_info = WorkerInfo(index=self.index)

    on_start_args: tp.List[str] = []
    if self.on_start:
        on_start_args = pypeln_utils.function_args(self.on_start)

    on_done_args: tp.List[str] = []
    if self.on_done:
        on_done_args = pypeln_utils.function_args(self.on_done)

    def pick(available, accepted):
        # keep only the entries whose key appears in `accepted`
        return {
            name: item for name, item in available.items() if name in accepted
        }

    try:
        kwargs = {}
        if self.on_start is not None:
            kwargs = self.on_start(
                **pick(dict(worker_info=worker_info), on_start_args)
            )

        # `on_start` is allowed to return nothing
        if kwargs is None:
            kwargs = {}

        kwargs.setdefault("worker_info", worker_info)

        self.process_fn(self, **pick(kwargs, self.f_args))

        self.stage_params.worker_done()

        if self.on_done is not None:
            kwargs.setdefault(
                "stage_status",
                StageStatus(
                    namespace=self.stage_params.namespace,
                    lock=self.stage_params.lock,
                ),
            )
            self.on_done(**pick(kwargs, on_done_args))

    except pypeln_utils.StopThreadException:
        pass
    except BaseException as e:
        self.main_queue.raise_exception(e)
        time.sleep(0.01)
    finally:
        self.namespace.done = True
        self.stage_params.output_queues.done()
def __init__(self, f, on_start, on_done, dependencies, timeout):
    """
    Store the stage configuration and the argument names of its callables.

    Arguments:
        f: The callable applied by the stage (may be falsy).
        on_start: Optional callback stored as `self.on_start`.
        on_done: Optional callback stored as `self.on_done`.
        dependencies: Stored as `self.dependencies`.
        timeout: Stored as `self.timeout`.
    """
    self.f, self.on_start, self.on_done = f, on_start, on_done
    self.timeout = timeout
    self.dependencies = dependencies

    # Result of `pypeln_utils.function_args` for each callable; an empty
    # set is used whenever the callable is falsy.
    if self.f:
        self.f_args = pypeln_utils.function_args(self.f)
    else:
        self.f_args = set()

    if self.on_start:
        self.on_start_args = pypeln_utils.function_args(self.on_start)
    else:
        self.on_start_args = set()

    if self.on_done:
        self.on_done_args = pypeln_utils.function_args(self.on_done)
    else:
        self.on_done_args = set()
def run(self) -> tp.Iterable:
    """
    Lazily run the stage: `on_start`, then `process_fn`, then `on_done`.

    This is a generator; nothing executes until it is iterated. The dict
    returned by `on_start` (or an empty dict) is extended with `worker_info`
    and, before `on_done` runs, with `stage_status`. Each callable only
    receives the entries whose key is listed among its declared arguments.

    Yields:
        Every element produced by `self.process_fn`.
    """
    worker_info = WorkerInfo(index=0)

    on_start_args: tp.List[str] = []
    if self.on_start:
        on_start_args = pypeln_utils.function_args(self.on_start)

    on_done_args: tp.List[str] = []
    if self.on_done:
        on_done_args = pypeln_utils.function_args(self.on_done)

    def pick(available, accepted):
        # keep only the entries whose key appears in `accepted`
        return {
            name: item for name, item in available.items() if name in accepted
        }

    kwargs = {}
    if self.on_start is not None:
        kwargs = self.on_start(
            **pick(dict(worker_info=worker_info), on_start_args)
        )

    # `on_start` is allowed to return nothing
    if kwargs is None:
        kwargs = {}

    kwargs.setdefault("worker_info", worker_info)

    yield from self.process_fn(self, **pick(kwargs, self.f_args))

    if self.on_done is not None:
        kwargs.setdefault("stage_status", StageStatus())
        self.on_done(**pick(kwargs, on_done_args))
def flat_map(
    f: FlatMapFn,
    stage: tp.Union[Stage[A], tp.Iterable[A], pypeln_utils.Undefined] = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: tp.Callable = None,
    on_done: tp.Callable = None,
) -> tp.Union[Stage[B], pypeln_utils.Partial[Stage[B]]]:
    """
    Creates a stage that maps a function `f` over the data where, unlike `map`,
    `f` returns an iterable. As the name implies, `flat_map` flattens these
    iterables so the resulting stage just contains their elements.

    ```python
    import pypeln as pl
    import time
    from random import random

    def slow_integer_pair(x):
        time.sleep(random()) # <= some slow computation

        if x == 0:
            yield x
        else:
            yield x
            yield -x

    data = range(10) # [0, 1, 2, ..., 9]
    stage = pl.thread.flat_map(slow_integer_pair, data, workers=3, maxsize=4)

    list(stage) # e.g. [2, -2, 3, -3, 0, 1, -1, 6, -6, 4, -4, ...]
    ```

    !!! note
        Because of concurrency order is not guaranteed.

    `flat_map` is a more general operation: both `map` and `filter` can be
    expressed with it, for example:

    ```python
    import pypeln as pl

    # equivalent to pl.thread.map(f, stage)
    pl.thread.flat_map(lambda x: [f(x)], stage)

    # equivalent to pl.thread.filter(f, stage)
    pl.thread.flat_map(lambda x: [x] if f(x) else [], stage)
    ```

    Using `flat_map` with a generator function is very useful, e.g. to filter
    out unwanted elements when there are exceptions, missing data, etc.

    Arguments:
        f: A function with signature `f(x) -> iterable`. `f` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        stage: A Stage or Iterable.
        workers: The number of workers the stage should contain.
        maxsize: The maximum number of objects the stage can hold simultaneously, if set to `0` (default) then the stage can grow unbounded.
        timeout: Seconds before stopping the worker if its current task is not yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs?`, where `kwargs` can be a `dict` of keyword arguments that can be consumed by `f` and `on_done`. `on_start` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        on_done: A function with signature `on_done(stage_status?)`. This function is executed once per worker when the worker finishes. `on_done` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).

    Returns:
        Returns a `Stage` if the `stage` parameter is given, else it returns a `Partial`.
    """

    if isinstance(stage, pypeln_utils.Undefined):

        def apply_to(stage):
            # re-enter `flat_map` once the upstream stage is known
            return flat_map(
                f,
                stage=stage,
                workers=workers,
                maxsize=maxsize,
                timeout=timeout,
                on_start=on_start,
                on_done=on_done,
            )

        return pypeln_utils.Partial(apply_to)

    upstream = to_stage(stage, maxsize=maxsize)

    return Stage(
        process_fn=FlatMap(f),
        workers=workers,
        maxsize=maxsize,
        timeout=timeout,
        total_sources=upstream.workers,
        dependencies=[upstream],
        on_start=on_start,
        on_done=on_done,
        f_args=pypeln_utils.function_args(f),
    )
def each(
    f: EachFn,
    stage: tp.Union[
        Stage[A], tp.Iterable[A], pypeln_utils.Undefined
    ] = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: tp.Callable = None,
    on_done: tp.Callable = None,
    run: bool = False,
) -> tp.Union[tp.Optional[Stage[None]], pypeln_utils.Partial[tp.Optional[Stage[None]]]]:
    """
    Creates a stage that runs the function `f` for each element in the data but the stage itself yields no elements. It is useful for sink stages that perform certain actions such as writing to disk, saving to a database, etc, and don't produce any results. For example:

    ```python
    import pypeln as pl

    def process_image(image_path):
        image = load_image(image_path)
        image = transform_image(image)
        save_image(image_path, image)

    file_paths = get_file_paths()
    stage = pl.sync.each(process_image, file_paths, workers=4)
    pl.sync.run(stage)
    ```

    or alternatively

    ```python
    file_paths = get_file_paths()
    pl.sync.each(process_image, file_paths, workers=4, run=True)
    ```

    !!! note
        Because of concurrency order is not guaranteed.

    Arguments:
        f: A function with signature `f(x) -> None`. `f` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        stage: A Stage or Iterable.
        workers: This parameter is not used and only kept for API compatibility with the other modules.
        maxsize: Only forwarded to `to_stage` when wrapping `stage`; otherwise kept for API compatibility with the other modules.
        timeout: Seconds before stopping the worker if its current task is not yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs?`, where `kwargs` can be a `dict` of keyword arguments that can be consumed by `f` and `on_done`. `on_start` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        on_done: A function with signature `on_done(stage_status?)`. This function is executed once per worker when the worker finishes. `on_done` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        run: Whether or not to execute the stage immediately. It is honoured in the partial form too, i.e. once the returned `Partial` receives its stage.

    !!! warning
        To implement `timeout` we use `stopit.ThreadingTimeout` which has some limitations.

    Returns:
        If the `stage` parameter is not given then this function returns a `Partial`, else if `run=False` (default) it returns a new stage, if `run=True` then it runs the stage and returns `None`.
    """

    if isinstance(stage, pypeln_utils.Undefined):
        # `run` must be forwarded as well: without it `each(f, run=True)` used
        # in partial form would silently build the stage and never execute it.
        return pypeln_utils.Partial(
            lambda stage: each(
                f,
                stage=stage,
                workers=workers,
                maxsize=maxsize,
                timeout=timeout,
                on_start=on_start,
                on_done=on_done,
                run=run,
            )
        )

    stage_ = to_stage(stage, maxsize=maxsize)

    stage_ = Stage(
        process_fn=Each(f),
        timeout=timeout,
        dependencies=[stage_],
        on_start=on_start,
        on_done=on_done,
        f_args=pypeln_utils.function_args(f),
    )

    if not run:
        return stage_

    # Drain the stage so that `f` is applied to every element.
    for _ in stage_:
        pass

    return None
def filter(
    f: FilterFn,
    stage: tp.Union[Stage[A], tp.Iterable[A], tp.Iterable[A], pypeln_utils.Undefined] = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: tp.Callable = None,
    on_done: tp.Callable = None,
) -> tp.Union[Stage[B], pypeln_utils.Partial[Stage[B]]]:
    """
    Creates a stage that filters the data given a predicate function `f`. It is intended to behave like python's built-in `filter` function but with the added concurrency.

    ```python
    import pypeln as pl
    import time
    from random import random

    def slow_gt3(x):
        time.sleep(random()) # <= some slow computation
        return x > 3

    data = range(10) # [0, 1, 2, ..., 9]
    stage = pl.thread.filter(slow_gt3, data, workers=3, maxsize=4)

    data = list(stage) # e.g. [5, 6, 4, 7, 8, 9]
    ```

    !!! note
        Because of concurrency order is not guaranteed.

    Arguments:
        f: A function with signature `f(x) -> bool`. `f` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        stage: A Stage or Iterable.
        workers: The number of workers the stage should contain.
        maxsize: The maximum number of objects the stage can hold simultaneously, if set to `0` (default) then the stage can grow unbounded.
        timeout: Seconds before stopping the worker if its current task is not yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs?`, where `kwargs` can be a `dict` of keyword arguments that can be consumed by `f` and `on_done`. `on_start` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        on_done: A function with signature `on_done(stage_status?)`. This function is executed once per worker when the worker finishes. `on_done` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).

    Returns:
        Returns a `Stage` if the `stage` parameter is given, else it returns a `Partial`.
    """

    if isinstance(stage, pypeln_utils.Undefined):

        def apply_to(stage):
            # re-enter `filter` once the upstream stage is known
            return filter(
                f,
                stage=stage,
                workers=workers,
                maxsize=maxsize,
                timeout=timeout,
                on_start=on_start,
                on_done=on_done,
            )

        return pypeln_utils.Partial(apply_to)

    upstream = to_stage(stage)

    return Stage(
        process_fn=Filter(f),
        workers=workers,
        maxsize=maxsize,
        timeout=timeout,
        total_sources=upstream.workers,
        dependencies=[upstream],
        on_start=on_start,
        on_done=on_done,
        f_args=pypeln_utils.function_args(f),
    )
def filter(
    f: FilterFn,
    stage: tp.Union[
        Stage[A], tp.Iterable[A], tp.Iterable[A], pypeln_utils.Undefined
    ] = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: tp.Callable = None,
    on_done: tp.Callable = None,
) -> tp.Union[Stage[B], pypeln_utils.Partial[Stage[B]]]:
    """
    Creates a stage that filters the data given a predicate function `f`, exactly like python's built-in `filter` function.

    ```python
    import pypeln as pl
    import time
    from random import random

    def slow_gt3(x):
        time.sleep(random()) # <= some slow computation
        return x > 3

    data = range(10) # [0, 1, 2, ..., 9]
    stage = pl.sync.filter(slow_gt3, data, workers=3, maxsize=4)

    data = list(stage) # [4, 5, 6, 7, 8, 9]
    ```

    Arguments:
        f: A function with signature `f(x) -> bool`. `f` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        stage: A Stage or Iterable.
        workers: This parameter is not used and only kept for API compatibility with the other modules.
        maxsize: This parameter is not used and only kept for API compatibility with the other modules.
        timeout: Seconds before stopping the worker if its current task is not yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs?`, where `kwargs` can be a `dict` of keyword arguments that can be consumed by `f` and `on_done`. `on_start` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        on_done: A function with signature `on_done(stage_status?)`. This function is executed once per worker when the worker finishes. `on_done` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).

    !!! warning
        To implement `timeout` we use `stopit.ThreadingTimeout` which has some limitations.

    Returns:
        Returns a `Stage` if the `stage` parameter is given, else it returns a `Partial`.
    """

    if isinstance(stage, pypeln_utils.Undefined):

        def apply_to(stage):
            # re-enter `filter` once the upstream stage is known
            return filter(
                f,
                stage=stage,
                workers=workers,
                maxsize=maxsize,
                timeout=timeout,
                on_start=on_start,
                on_done=on_done,
            )

        return pypeln_utils.Partial(apply_to)

    upstream = to_stage(stage)

    return Stage(
        process_fn=Filter(f),
        timeout=timeout,
        dependencies=[upstream],
        on_start=on_start,
        on_done=on_done,
        f_args=pypeln_utils.function_args(f),
    )
def each(
    f: EachFn,
    stage: tp.Union[Stage[A], tp.Iterable[A], tp.AsyncIterable[A], pypeln_utils.Undefined] = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: tp.Callable = None,
    on_done: tp.Callable = None,
    run: bool = False,
) -> tp.Union[tp.Optional[Stage[B]], pypeln_utils.Partial[tp.Optional[Stage[B]]]]:
    """
    Creates a stage that runs the function `f` for each element in the data but the stage itself yields no elements. It is useful for sink stages that perform certain actions such as writing to disk, saving to a database, etc, and don't produce any results. For example:

    ```python
    import pypeln as pl

    def process_image(image_path):
        image = load_image(image_path)
        image = transform_image(image)
        save_image(image_path, image)

    file_paths = get_file_paths()
    stage = pl.task.each(process_image, file_paths, workers=4)
    pl.task.run(stage)
    ```

    or alternatively

    ```python
    file_paths = get_file_paths()
    pl.task.each(process_image, file_paths, workers=4, run=True)
    ```

    !!! note
        Because of concurrency order is not guaranteed.

    Arguments:
        f: A function with signature `f(x) -> None`. `f` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        stage: A Stage, Iterable or AsyncIterable.
        workers: The number of workers the stage should contain.
        maxsize: The maximum number of objects the stage can hold simultaneously, if set to `0` (default) then the stage can grow unbounded.
        timeout: Seconds before stopping the worker if its current task is not yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs?`, where `kwargs` can be a `dict` of keyword arguments that can be consumed by `f` and `on_done`. `on_start` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        on_done: A function with signature `on_done(stage_status?)`. This function is executed once per worker when the worker finishes. `on_done` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        run: Whether or not to execute the stage immediately. It is honoured in the partial form too, i.e. once the returned `Partial` receives its stage. If each is running inside another coroutine / task then avoid using `run=True` since it will block the event loop, use `await pl.task.each(...)` instead.

    Returns:
        If the `stage` parameter is not given then this function returns a `Partial`, else if `run=False` (default) it returns a new stage, if `run=True` then it runs the stage and returns `None`.
    """

    if isinstance(stage, pypeln_utils.Undefined):
        # `run` must be forwarded as well: without it `each(f, run=True)` used
        # in partial form would silently build the stage and never execute it.
        return pypeln_utils.Partial(
            lambda stage: each(
                f,
                stage=stage,
                workers=workers,
                maxsize=maxsize,
                timeout=timeout,
                on_start=on_start,
                on_done=on_done,
                run=run,
            )
        )

    stage = to_stage(stage, maxsize=maxsize)

    stage = Stage(
        process_fn=Each(f),
        workers=workers,
        maxsize=maxsize,
        timeout=timeout,
        total_sources=1,
        dependencies=[stage],
        on_start=on_start,
        on_done=on_done,
        f_args=pypeln_utils.function_args(f),
    )

    if not run:
        return stage

    # Drain the stage so that `f` is applied to every element.
    for _ in stage:
        pass

    return None
async def __call__(self):
    """
    Run this worker: `on_start`, then `process_fn`, then `on_done`.

    `on_start` and `on_done` may be plain functions or return awaitables; an
    awaitable result is awaited. The dict produced by `on_start` (or an empty
    dict) is extended with `worker_info` and, before `on_done` runs, with
    `stage_status`. Each callable only receives the entries whose key is
    listed among its declared arguments.

    `asyncio.CancelledError` is swallowed; any other exception is handed to
    `self.main_queue.raise_exception`. In every case the worker is flagged as
    done and `self.tasks` is stopped.
    """
    worker_info = WorkerInfo(index=0)

    on_start_args: tp.List[str] = []
    if self.on_start:
        on_start_args = pypeln_utils.function_args(self.on_start)

    on_done_args: tp.List[str] = []
    if self.on_done:
        on_done_args = pypeln_utils.function_args(self.on_done)

    def pick(available, accepted):
        # keep only the entries whose key appears in `accepted`
        return {
            name: item for name, item in available.items() if name in accepted
        }

    try:
        kwargs = {}
        if self.on_start is not None:
            kwargs = self.on_start(
                **pick(dict(worker_info=worker_info), on_start_args)
            )
            if isinstance(kwargs, tp.Awaitable):
                kwargs = await kwargs

        # `on_start` is allowed to return nothing
        if kwargs is None:
            kwargs = {}

        kwargs.setdefault("worker_info", worker_info)

        async with self.tasks:
            await self.process_fn(self, **pick(kwargs, self.f_args))

        self.stage_params.worker_done()

        if self.on_done is not None:
            kwargs.setdefault("stage_status", StageStatus())

            pending = self.on_done(**pick(kwargs, on_done_args))
            if isinstance(pending, tp.Awaitable):
                await pending

        await self.stage_params.output_queues.worker_done()

    except asyncio.CancelledError:
        pass
    except BaseException as e:
        await self.main_queue.raise_exception(e)
    finally:
        self.is_done = True
        self.tasks.stop()