Example #1
File: read_api.py Project: parasj/ray
def _get_read_tasks(
    ds: Datasource,
    ctx: DatasetContext,
    cur_pg: Optional[PlacementGroup],
    parallelism: int,
    kwargs: dict,
) -> Tuple[int, int, List[ReadTask]]:
    """Generates read tasks.

    Args:
        ds: Datasource to read from.
        ctx: Dataset config to use.
        cur_pg: The current placement group, if any.
        parallelism: The user-requested parallelism, or -1 for autodetection.
        kwargs: Additional kwargs to pass to the reader.

    Returns:
        The requested parallelism from the datasource, the minimum safe parallelism
        to avoid OOM, and the list of read tasks generated.
    """
    kwargs = _unwrap_arrow_serialization_workaround(kwargs)
    DatasetContext._set_current(ctx)
    reader = ds.create_reader(**kwargs)
    requested_parallelism, min_safe_parallelism = _autodetect_parallelism(
        parallelism, cur_pg, DatasetContext.get_current(), reader)
    return (
        requested_parallelism,
        min_safe_parallelism,
        reader.get_read_tasks(requested_parallelism),
    )
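
A note on the pattern: DatasetContext holds process-local configuration, so _get_read_tasks re-installs the driver's context before creating the reader. Below is a minimal driver-side sketch, assuming the function is wrapped with ray.remote and handed the current context explicitly; the datasource variable and the zero-CPU option are illustrative, not the exact call site in Ray.

import ray
from ray.data.context import DatasetContext

# Illustrative wiring only: capture the driver's context and pass it to the
# worker, where _get_read_tasks calls DatasetContext._set_current(ctx).
ctx = DatasetContext.get_current()
get_read_tasks = ray.remote(num_cpus=0)(_get_read_tasks)

requested_parallelism, min_safe_parallelism, read_tasks = ray.get(
    get_read_tasks.remote(
        datasource,  # a ray.data.Datasource instance, assumed defined
        ctx,
        None,        # cur_pg: no placement group in this sketch
        -1,          # parallelism: -1 asks for autodetection
        {},          # extra kwargs forwarded to the reader
    )
)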
Example #2
def pipeline_stage(fn: Callable[[], Dataset[T]],
                   context: DatasetContext) -> Dataset[T]:
    DatasetContext._set_current(context)
    try:
        prev = set_progress_bars(False)
        return fn()
    finally:
        set_progress_bars(prev)
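
Since each stage function may execute in a separate worker process, a caller would capture the driver's context and pass it along with the stage. A hypothetical usage sketch (make_dataset is a placeholder, not part of the source):

import ray
from ray.data.context import DatasetContext

# Hypothetical usage: run a pipeline stage as a Ray task, forwarding the
# driver's DatasetContext so its settings survive the process boundary.
def make_dataset():
    return ray.data.range(100)

run_stage = ray.remote(pipeline_stage)
dataset = ray.get(run_stage.remote(make_dataset, DatasetContext.get_current()))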
Example #3
def __init__(self, pipeline: "DatasetPipeline[T]", n: int,
             splitter: Callable[[Dataset], "DatasetPipeline[T]"],
             context: DatasetContext):
    DatasetContext._set_current(context)
    self.executor = PipelineExecutor(pipeline)
    self.n = n
    self.splitter = splitter
    self.cur_splits = [None] * self.n
Example #4
def run(self, fn: Callable[[], Dataset[T]],
        context: DatasetContext) -> Dataset[T]:
    DatasetContext._set_current(context)
    try:
        prev = set_progress_bars(False)
        # Force eager evaluation of all blocks in the pipeline stage. This
        # prevents resource deadlocks due to overlapping stage execution
        # (e.g., task -> actor stage).
        return fn().force_reads()
    finally:
        set_progress_bars(prev)
Example #5
def __init__(
    self,
    pipeline: "DatasetPipeline[T]",
    n: int,
    splitter: Callable[[Dataset], List["Dataset[T]"]],
    context: DatasetContext,
):
    DatasetContext._set_current(context)
    pipeline._optimize_stages()
    self.executor = PipelineExecutor(pipeline)
    self.n = n
    self.splitter = splitter
    self.cur_splits = [None] * self.n
Example #6
    def remote_read(i: int, task: ReadTask) -> MaybeBlockPartition:
        DatasetContext._set_current(context)
        stats = BlockExecStats.builder()

        # Execute the read task.
        block = task()

        if context.block_splitting_enabled:
            metadata = task.get_metadata()
            metadata.exec_stats = stats.build()
        else:
            metadata = BlockAccessor.for_block(block).get_metadata(
                input_files=task.get_metadata().input_files,
                exec_stats=stats.build())
        stats_actor.record_task.remote(stats_uuid, i, metadata)
        return block
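
Examples #6 through #8 also report per-task execution stats to a named actor. The real stats actor in Ray has a richer interface; the stand-in below is a hypothetical minimal version, shown only to make the record_task call above concrete.

import ray

# Hypothetical stand-in for the stats actor: it simply collects the metadata
# keyed by (stats_uuid, task index) so the driver can assemble dataset stats.
@ray.remote(num_cpus=0)
class StatsActor:
    def __init__(self):
        self._metadata = {}

    def record_task(self, stats_uuid, i, metadata):
        self._metadata.setdefault(stats_uuid, {})[i] = metadata

    def get(self, stats_uuid):
        return self._metadata.get(stats_uuid, {})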
Example #7
File: read_api.py Project: novahe/ray
    def remote_read(i: int, task: ReadTask) -> MaybeBlockPartition:
        DatasetContext._set_current(context)
        start_time, start_cpu = time.perf_counter(), time.process_time()
        exec_stats = BlockExecStats()

        # Execute the read task.
        block = task()

        exec_stats.cpu_time_s = time.process_time() - start_cpu
        exec_stats.wall_time_s = time.perf_counter() - start_time
        if context.block_splitting_enabled:
            metadata = task.get_metadata()
            metadata.exec_stats = exec_stats
        else:
            metadata = BlockAccessor.for_block(block).get_metadata(
                input_files=task.get_metadata().input_files,
                exec_stats=exec_stats)
        stats_actor.add.remote(stats_uuid, i, metadata)
        return block
Example #8
def _execute_read_task(
    i: int,
    task: ReadTask,
    context: DatasetContext,
    stats_uuid: str,
    stats_actor: ray.actor.ActorHandle,
) -> Tuple[MaybeBlockPartition, BlockPartitionMetadata]:
    DatasetContext._set_current(context)
    stats = BlockExecStats.builder()

    # Execute the task.
    block = task()

    metadata = task.get_metadata()
    if context.block_splitting_enabled:
        metadata.exec_stats = stats.build()
    else:
        metadata = BlockAccessor.for_block(block).get_metadata(
            input_files=metadata.input_files, exec_stats=stats.build())
    stats_actor.record_task.remote(stats_uuid, i, metadata)
    return block, metadata
Example #9
def _prepare_read(ds: Datasource, ctx: DatasetContext, parallelism: int,
                  kwargs: dict) -> List[ReadTask]:
    kwargs = _unwrap_s3_filesystem_workaround(kwargs)
    DatasetContext._set_current(ctx)
    return ds.prepare_read(parallelism, **kwargs)
Example #10
def remote_read(task: ReadTask) -> Block:
    DatasetContext._set_current(context)
    return task()
Example #11
def remote_read(task: ReadTask) -> MaybeBlockPartition:
    DatasetContext._set_current(context)
    return task()
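
The common thread across all eleven examples: DatasetContext is a per-process singleton, so any function that might run in a different Ray worker receives the driver's context as an argument and re-installs it via _set_current before doing real work. A minimal self-contained sketch of the pattern:

import ray
from ray.data.context import DatasetContext

context = DatasetContext.get_current()  # captured on the driver

@ray.remote
def remote_read(task):
    # Restore the driver's settings inside the worker process before
    # executing the read task.
    DatasetContext._set_current(context)
    return task()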