def _get_read_tasks(
    ds: Datasource,
    ctx: DatasetContext,
    cur_pg: Optional[PlacementGroup],
    parallelism: int,
    kwargs: dict,
) -> Tuple[int, int, List[ReadTask]]:
    """Generates read tasks.

    Args:
        ds: Datasource to read from.
        ctx: Dataset config to use.
        cur_pg: The current placement group, if any.
        parallelism: The user-requested parallelism, or -1 for autodetection.
        kwargs: Additional kwargs to pass to the reader.

    Returns:
        Request parallelism from the datasource, the min safe parallelism to
        avoid OOM, and the list of read tasks generated.
    """
    kwargs = _unwrap_arrow_serialization_workaround(kwargs)
    # Propagate the dataset config so it is visible in this (possibly remote)
    # process before the reader is created.
    DatasetContext._set_current(ctx)
    reader = ds.create_reader(**kwargs)
    requested_parallelism, min_safe_parallelism = _autodetect_parallelism(
        parallelism, cur_pg, DatasetContext.get_current(), reader)
    return (
        requested_parallelism,
        min_safe_parallelism,
        reader.get_read_tasks(requested_parallelism),
    )
def pipeline_stage(fn: Callable[[], Dataset[T]],
                   context: DatasetContext) -> Dataset[T]:
    """Run one pipeline stage with progress bars suppressed.

    Args:
        fn: Zero-arg callable that executes the stage and returns its dataset.
        context: Dataset config to propagate into this process.

    Returns:
        The dataset produced by ``fn``.
    """
    DatasetContext._set_current(context)
    # Capture the previous progress-bar setting *before* entering the try
    # block; if this assignment lived inside the try and raised, the finally
    # clause would hit a NameError on `prev`, masking the real error.
    prev = set_progress_bars(False)
    try:
        return fn()
    finally:
        # Always restore the caller's progress-bar setting.
        set_progress_bars(prev)
def __init__(self, pipeline: "DatasetPipeline[T]", n: int,
             splitter: Callable[[Dataset], "DatasetPipeline[T]"],
             context: DatasetContext):
    """Initialize the split coordinator.

    Args:
        pipeline: The pipeline whose output will be split.
        n: Number of output splits.
        splitter: Callable that splits a dataset into pipelines.
        context: Dataset config to propagate into this process.
    """
    # Make the caller's dataset config current in this process first.
    DatasetContext._set_current(context)
    self.n = n
    self.splitter = splitter
    # One slot per split; filled lazily as splits are produced.
    self.cur_splits = [None for _ in range(n)]
    self.executor = PipelineExecutor(pipeline)
def run(self, fn: Callable[[], Dataset[T]], context: DatasetContext) -> Dataset[T]:
    """Run one pipeline stage eagerly with progress bars suppressed.

    Args:
        fn: Zero-arg callable that executes the stage and returns its dataset.
        context: Dataset config to propagate into this process.

    Returns:
        The fully-read dataset produced by ``fn``.
    """
    DatasetContext._set_current(context)
    # Capture the previous progress-bar setting *before* entering the try
    # block; if this assignment lived inside the try and raised, the finally
    # clause would hit a NameError on `prev`, masking the real error.
    prev = set_progress_bars(False)
    try:
        # Force eager evaluation of all blocks in the pipeline stage. This
        # prevents resource deadlocks due to overlapping stage execution
        # (e.g., task -> actor stage).
        return fn().force_reads()
    finally:
        # Always restore the caller's progress-bar setting.
        set_progress_bars(prev)
def __init__(
    self,
    pipeline: "DatasetPipeline[T]",
    n: int,
    splitter: Callable[[Dataset], List["Dataset[T]"]],
    context: DatasetContext,
):
    """Initialize the split coordinator.

    Args:
        pipeline: The pipeline whose output will be split.
        n: Number of output splits.
        splitter: Callable that splits a dataset into ``n`` datasets.
        context: Dataset config to propagate into this process.
    """
    # Make the caller's dataset config current in this process first.
    DatasetContext._set_current(context)
    self.n = n
    self.splitter = splitter
    # One slot per split; filled lazily as splits are produced.
    self.cur_splits = [None for _ in range(n)]
    # Fuse/optimize stages before constructing the executor.
    pipeline._optimize_stages()
    self.executor = PipelineExecutor(pipeline)
def remote_read(i: int, task: ReadTask) -> MaybeBlockPartition:
    """Execute one read task and report its metadata to the stats actor.

    Args:
        i: Index of this read task (used as the stats key).
        task: The read task to execute.

    Returns:
        The block (or block partition) produced by the task.
    """
    DatasetContext._set_current(context)
    stats_builder = BlockExecStats.builder()

    # Execute the read task.
    result = task()

    if not context.block_splitting_enabled:
        metadata = BlockAccessor.for_block(result).get_metadata(
            input_files=task.get_metadata().input_files,
            exec_stats=stats_builder.build())
    else:
        metadata = task.get_metadata()
        metadata.exec_stats = stats_builder.build()
    stats_actor.record_task.remote(stats_uuid, i, metadata)
    return result
def remote_read(i: int, task: ReadTask) -> MaybeBlockPartition:
    """Execute one read task, timing it, and report stats to the stats actor.

    Args:
        i: Index of this read task (used as the stats key).
        task: The read task to execute.

    Returns:
        The block (or block partition) produced by the task.
    """
    DatasetContext._set_current(context)
    wall_start = time.perf_counter()
    cpu_start = time.process_time()
    exec_stats = BlockExecStats()

    # Execute the read task.
    result = task()

    exec_stats.cpu_time_s = time.process_time() - cpu_start
    exec_stats.wall_time_s = time.perf_counter() - wall_start
    if not context.block_splitting_enabled:
        metadata = BlockAccessor.for_block(result).get_metadata(
            input_files=task.get_metadata().input_files,
            exec_stats=exec_stats)
    else:
        metadata = task.get_metadata()
        metadata.exec_stats = exec_stats
    stats_actor.add.remote(stats_uuid, i, metadata)
    return result
def _execute_read_task(
    i: int,
    task: ReadTask,
    context: DatasetContext,
    stats_uuid: str,
    stats_actor: ray.actor.ActorHandle,
) -> Tuple[MaybeBlockPartition, BlockPartitionMetadata]:
    """Execute one read task and record its stats with the stats actor.

    Args:
        i: Index of this read task (used as the stats key).
        task: The read task to execute.
        context: Dataset config to propagate into this process.
        stats_uuid: UUID identifying the stats record for this read.
        stats_actor: Actor handle that collects per-task stats.

    Returns:
        The block (or block partition) produced by the task, and its metadata.
    """
    DatasetContext._set_current(context)
    stats_builder = BlockExecStats.builder()

    # Execute the task.
    result = task()

    metadata = task.get_metadata()
    if not context.block_splitting_enabled:
        metadata = BlockAccessor.for_block(result).get_metadata(
            input_files=metadata.input_files,
            exec_stats=stats_builder.build())
    else:
        metadata.exec_stats = stats_builder.build()
    stats_actor.record_task.remote(stats_uuid, i, metadata)
    return result, metadata
def _prepare_read(ds: Datasource, ctx: DatasetContext, parallelism: int,
                  kwargs: dict) -> List[ReadTask]:
    """Generate read tasks from the datasource under the given config.

    Args:
        ds: Datasource to read from.
        ctx: Dataset config to propagate into this process.
        parallelism: Requested read parallelism.
        kwargs: Additional kwargs to pass to the datasource.

    Returns:
        The list of read tasks generated by the datasource.
    """
    unwrapped_kwargs = _unwrap_s3_filesystem_workaround(kwargs)
    # Propagate the dataset config before invoking the datasource.
    DatasetContext._set_current(ctx)
    return ds.prepare_read(parallelism, **unwrapped_kwargs)
def remote_read(task: ReadTask) -> Block:
    """Execute a read task with the propagated dataset config.

    Args:
        task: The read task to execute.

    Returns:
        The block produced by the task.
    """
    DatasetContext._set_current(context)
    block = task()
    return block
def remote_read(task: ReadTask) -> MaybeBlockPartition:
    """Execute a read task with the propagated dataset config.

    Args:
        task: The read task to execute.

    Returns:
        The block (or block partition) produced by the task.
    """
    DatasetContext._set_current(context)
    partition = task()
    return partition