Example #1
def from_pandas_refs(
    dfs: Union[ObjectRef["pandas.DataFrame"], List[ObjectRef["pandas.DataFrame"]]]
) -> Dataset[ArrowRow]:
    """Create a dataset from a list of Ray object references to Pandas
    dataframes.

    Args:
        dfs: A Ray object reference to a pandas dataframe, or a list of
             Ray object references to pandas dataframes.

    Returns:
        Dataset holding Arrow records read from the dataframes.
    """
    if isinstance(dfs, ray.ObjectRef):
        dfs = [dfs]
    elif isinstance(dfs, list):
        for df in dfs:
            if not isinstance(df, ray.ObjectRef):
                raise ValueError(
                    "Expected list of Ray object refs, "
                    f"got list containing {type(df)}"
                )
    else:
        raise ValueError(
            "Expected Ray object ref or list of Ray object refs, " f"got {type(df)}"
        )

    context = DatasetContext.get_current()
    if context.enable_pandas_block:
        get_metadata = cached_remote_fn(_get_metadata)
        metadata = [get_metadata.remote(df) for df in dfs]
        return Dataset(
            ExecutionPlan(BlockList(dfs, ray.get(metadata)), DatasetStats.TODO()),
            0,
            False,
        )

    df_to_block = cached_remote_fn(_df_to_block, num_returns=2)

    res = [df_to_block.remote(df) for df in dfs]
    blocks, metadata = zip(*res)
    return Dataset(
        ExecutionPlan(
            BlockList(blocks, ray.get(list(metadata))),
            DatasetStats(stages={"from_pandas_refs": metadata}, parent=None),
        ),
        0,
        False,
    )
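For reference, a minimal usage sketch of from_pandas_refs as a public ray.data API (assuming Ray and pandas are installed and ray.init() has been called; the dataframe contents are illustrative):

import pandas as pd
import ray

# Two illustrative dataframes, placed in the object store so only their
# references are passed to from_pandas_refs.
df1 = pd.DataFrame({"a": [1, 2, 3]})
df2 = pd.DataFrame({"a": [4, 5, 6]})
ds = ray.data.from_pandas_refs([ray.put(df1), ray.put(df2)])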
Example #2
def _test_equal_split_balanced(block_sizes, num_splits):
    blocks = []
    metadata = []
    total_rows = 0
    for block_size in block_sizes:
        block = list(range(total_rows, total_rows + block_size))
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(None, None))
        total_rows += block_size
    block_list = BlockList(blocks, metadata)
    ds = Dataset(
        ExecutionPlan(block_list, DatasetStats.TODO()),
        0,
        False,
    )

    splits = ds.split(num_splits, equal=True)
    split_counts = [split.count() for split in splits]
    assert len(split_counts) == num_splits
    expected_block_size = total_rows // num_splits
    # Check that all splits are the expected size.
    assert all([count == expected_block_size for count in split_counts])
    expected_total_rows = sum(split_counts)
    # Check that the expected number of rows were dropped.
    assert total_rows - expected_total_rows == total_rows % num_splits
    # Check that all rows are unique (content check).
    split_rows = [row for split in splits for row in split.take(total_rows)]
    assert len(set(split_rows)) == len(split_rows)
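A hedged invocation of the test helper above; the block sizes and split count are arbitrary:

# Three blocks of 2, 5, and 3 rows split into two equal parts; with 10 total
# rows and num_splits=2, no remainder rows need to be dropped.
_test_equal_split_balanced(block_sizes=[2, 5, 3], num_splits=2)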
Example #3
File: read_api.py (Project: novahe/ray)
def from_items(items: List[Any], *, parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> ray.data.from_items([1, 2, 3, 4, 5])

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.
            Parallelism may be limited by the number of items.

    Returns:
        Dataset holding the items.
    """
    block_size = max(1, len(items) // parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        builder = DelegatingBlockBuilder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(
                input_files=None, exec_stats=BlockExecStats.TODO))
        i += block_size

    return Dataset(BlockList(blocks, metadata), 0, DatasetStats.TODO())
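A usage sketch of from_items (a public ray.data API); the items are illustrative:

import ray

# With the default parallelism of 200 and only five items,
# block_size = max(1, 5 // 200) == 1, so each item becomes its own block.
ds = ray.data.from_items([1, 2, 3, 4, 5])
assert ds.count() == 5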
Example #4
def from_arrow_refs(
    tables: Union[
        ObjectRef[Union["pyarrow.Table", bytes]],
        List[ObjectRef[Union["pyarrow.Table", bytes]]],
    ]
) -> Dataset[ArrowRow]:
    """Create a dataset from a set of Arrow tables.

    Args:
        tables: A Ray object reference to an Arrow table (or to the table's
                streaming format in bytes), or a list of such references.

    Returns:
        Dataset holding Arrow records from the tables.
    """
    if isinstance(tables, ray.ObjectRef):
        tables = [tables]

    get_metadata = cached_remote_fn(_get_metadata)
    metadata = [get_metadata.remote(t) for t in tables]
    return Dataset(
        ExecutionPlan(
            BlockList(tables, ray.get(metadata)),
            DatasetStats(stages={"from_arrow_refs": metadata}, parent=None),
        ),
        0,
        False,
    )
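A usage sketch of from_arrow_refs (assuming pyarrow is installed; the table is illustrative):

import pyarrow as pa
import ray

# A single object reference is accepted as well as a list of references.
table = pa.table({"a": [1, 2, 3]})
ds = ray.data.from_arrow_refs(ray.put(table))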
Example #5
File: plan.py (Project: alipay/ray)
def _rewrite_read_stage(
    in_blocks: LazyBlockList,
) -> Tuple[BlockList, DatasetStats, Stage]:
    """Rewrite the read stage into a OneToOne stage that takes read tasks as input.

    For example, suppose the plan was [Read -> MapBatches(Fn)]. These stages cannot
    be fused, since read stages are handled specially.
    After rewriting to [GetReadTasks -> MapBatches(DoRead) -> MapBatches(Fn)],
    now we can fuse the latter two MapBatches stages into a single OneToOne stage:
    [GetReadTasks -> MapBatches(DoRead -> Fn)].

    Args:
        in_blocks: Lazy block list representing the read stage.

    Returns:
        Non-lazy block list containing read tasks for not-yet-read block partitions,
        new stats for the block list, and the new one-to-one read stage.
    """
    # Generate the "GetReadTasks" stage blocks.
    remote_args = in_blocks._remote_args
    blocks, metadata = [], []
    for read_task in in_blocks._tasks:
        blocks.append(ray.put(read_task._read_fn))
        metadata.append(read_task.get_metadata())
    block_list = BlockList(blocks, metadata)

    def block_fn(read_fn: Callable[[], Iterator[Block]]) -> Iterator[Block]:
        for block in read_fn():
            yield block

    stage = OneToOneStage("read", block_fn, "tasks", remote_args)
    stats = DatasetStats(stages={}, parent=None)
    return block_list, stats, stage
Example #6
File: plan.py (Project: vakker/ray)
def _rewrite_read_stages(self) -> None:
    """Rewrites read stages into one-to-one stages."""
    if self._stages and self._has_read_stage():
        block_list, stage = self._rewrite_read_stage()
        self._in_blocks = block_list
        self._in_stats = DatasetStats(stages={}, parent=None)
        self._stages.insert(0, stage)
Example #7
    def _optimize_stages(self):
        """Optimize this pipeline, fusing stages together as possible."""
        context = DatasetContext.get_current()

        if not context.optimize_fuse_stages:
            self._optimized_stages = self._stages
            return

        # This dummy dataset will be used to get a set of optimized stages.
        dummy_ds = Dataset(
            ExecutionPlan(BlockList([], []),
                          DatasetStats(stages={}, parent=None)),
            0,
            True,
        )
        # Apply all pipeline operations to the dummy dataset.
        for stage in self._stages:
            dummy_ds = stage(dummy_ds)
        # Get the optimized stages.
        _, _, stages = dummy_ds._plan._optimize()
        # Apply these optimized stages to the datasets underlying the pipeline.
        # These optimized stages will be executed by the PipelineExecutor.
        optimized_stages = []
        for stage in stages:
            optimized_stages.append(lambda ds, stage=stage: Dataset(
                ds._plan.with_stage(stage), ds._epoch, True))
        self._optimized_stages = optimized_stages
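_optimize_stages appears to be invoked internally when a DatasetPipeline is executed; a hedged sketch of pipeline code that would exercise it (window() and map() are public APIs, the transform is illustrative):

import ray

# Pipeline a dataset and apply a per-row transform; when the pipeline is
# consumed, compatible stages are fused as described above.
pipe = ray.data.range(1000).window(blocks_per_window=10).map(lambda x: x * 2)
for batch in pipe.iter_batches():
    pass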
Example #8
File: plan.py (Project: alipay/ray)
    def __init__(self,
                 in_blocks: BlockList,
                 stats: DatasetStats,
                 dataset_uuid=None):
        """Create a plan with no transformation stages.

        Args:
            in_blocks: Base list of blocks.
            stats: Stats for the base blocks.
            dataset_uuid: Dataset's UUID.
        """
        self._in_blocks = in_blocks
        self._in_stats = stats
        # A computed snapshot of some prefix of stages.
        self._snapshot_blocks = None
        self._snapshot_stats = None
        # Chains of stages.
        self._stages_before_snapshot = []
        self._stages_after_snapshot = []
        # Cache of optimized stages.
        self._last_optimized_stages = None

        self._dataset_uuid = dataset_uuid or uuid.uuid4().hex
        if not stats.dataset_uuid:
            stats.dataset_uuid = self._dataset_uuid
Example #9
def stats(self) -> DatasetStats:
    """Create DatasetStats for this LazyBlockList."""
    return DatasetStats(
        stages={"read": self.get_metadata(fetch_if_missing=False)},
        parent=None,
        needs_stats_actor=True,
        stats_uuid=self._stats_uuid,
    )
Example #10
File: plan.py (Project: alipay/ray)
def _rewrite_read_stages(
    blocks: BlockList,
    stats: DatasetStats,
    stages: List[Stage],
    dataset_uuid: str,
) -> Tuple[BlockList, DatasetStats, List[Stage]]:
    """Rewrites read stages into one-to-one stages, if needed."""
    if _is_lazy(blocks) and stages:
        blocks, stats, stage = _rewrite_read_stage(blocks)
        stats.dataset_uuid = dataset_uuid
        stages.insert(0, stage)
    return blocks, stats, stages
Example #11
File: read_api.py (Project: novahe/ray)
def from_numpy(ndarrays: List[ObjectRef[np.ndarray]]) -> Dataset[ArrowRow]:
    """Create a dataset from a set of NumPy ndarrays.

    Args:
        ndarrays: A list of Ray object references to NumPy ndarrays.

    Returns:
        Dataset holding the given ndarrays.
    """
    ndarray_to_block = cached_remote_fn(_ndarray_to_block, num_returns=2)

    res = [ndarray_to_block.remote(ndarray) for ndarray in ndarrays]
    blocks, metadata = zip(*res)
    return Dataset(BlockList(blocks, ray.get(list(metadata))), 0,
                   DatasetStats.TODO())
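A usage sketch matching the list-of-references signature shown above (in later Ray versions this reference-taking variant was renamed from_numpy_refs):

import numpy as np
import ray

# This version of from_numpy expects object references to ndarrays.
arr_ref = ray.put(np.arange(12).reshape(3, 4))
ds = ray.data.from_numpy([arr_ref])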
Example #12
File: plan.py (Project: vakker/ray)
    def __init__(self,
                 in_blocks: BlockList,
                 stats: DatasetStats,
                 dataset_uuid=None):
        """Create a plan with no transformation stages.

        Args:
            in_blocks: Base list of blocks.
            stats: Stats for the base blocks.
            dataset_uuid: Dataset's UUID.
        """
        self._in_blocks = in_blocks
        self._out_blocks = None
        self._in_stats = stats
        self._out_stats = None
        self._stages = []
        self._dataset_uuid = dataset_uuid or uuid.uuid4().hex
        if not stats.dataset_uuid:
            stats.dataset_uuid = self._dataset_uuid
Example #13
    def _optimize_stages(self):
        """Optimize this pipeline, fusing stages together as possible."""
        context = DatasetContext.get_current()

        if not context.optimize_fuse_stages:
            self._optimized_stages = self._stages
            return

        dummy_ds = Dataset(
            ExecutionPlan(BlockList([], []),
                          DatasetStats(stages={}, parent=None)),
            0,
            True,
        )
        for stage in self._stages:
            dummy_ds = stage(dummy_ds)
        dummy_ds._plan._optimize()
        optimized_stages = []
        for stage in dummy_ds._plan._stages:
            optimized_stages.append(lambda ds, stage=stage: Dataset(
                ds._plan.with_stage(stage), ds._epoch, True))
        self._optimized_stages = optimized_stages
Example #14
def from_numpy_refs(
    ndarrays: Union[ObjectRef[np.ndarray], List[ObjectRef[np.ndarray]]],
) -> Dataset[ArrowRow]:
    """Create a dataset from a list of NumPy ndarray futures.

    Args:
        ndarrays: A Ray object reference to a NumPy ndarray or a list of Ray object
            references to NumPy ndarrays.

    Returns:
        Dataset holding the given ndarrays.
    """
    if isinstance(ndarrays, ray.ObjectRef):
        ndarrays = [ndarrays]
    elif isinstance(ndarrays, list):
        for ndarray in ndarrays:
            if not isinstance(ndarray, ray.ObjectRef):
                raise ValueError(
                    "Expected list of Ray object refs, "
                    f"got list containing {type(ndarray)}"
                )
    else:
        raise ValueError(
            f"Expected Ray object ref or list of Ray object refs, got {type(ndarray)}"
        )

    ndarray_to_block = cached_remote_fn(_ndarray_to_block, num_returns=2)

    res = [ndarray_to_block.remote(ndarray) for ndarray in ndarrays]
    blocks, metadata = zip(*res)
    return Dataset(
        ExecutionPlan(
            BlockList(blocks, ray.get(list(metadata))),
            DatasetStats(stages={"from_numpy": metadata}, parent=None),
        ),
        0,
        False,
    )
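A usage sketch of from_numpy_refs; a single object reference or a list of references is accepted:

import numpy as np
import ray

# A single object reference works; a list of references would, too.
ds = ray.data.from_numpy_refs(ray.put(np.ones((4, 4))))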
Example #15
File: read_api.py (Project: novahe/ray)
def from_pandas_refs(
    dfs: Union[ObjectRef["pandas.DataFrame"],
               List[ObjectRef["pandas.DataFrame"]]]
) -> Dataset[ArrowRow]:
    """Create a dataset from a list of Ray object references to Pandas
    dataframes.

    Args:
        dfs: A Ray object reference to a pandas dataframe, or a list of
             Ray object references to pandas dataframes.

    Returns:
        Dataset holding Arrow records read from the dataframes.
    """
    if isinstance(dfs, ray.ObjectRef):
        dfs = [dfs]

    df_to_block = cached_remote_fn(_df_to_block, num_returns=2)

    res = [df_to_block.remote(df) for df in dfs]
    blocks, metadata = zip(*res)
    return Dataset(BlockList(blocks, ray.get(list(metadata))), 0,
                   DatasetStats.TODO())
Example #16
def fast_repartition(blocks, num_blocks):
    from ray.data.dataset import Dataset

    wrapped_ds = Dataset(ExecutionPlan(blocks,
                                       DatasetStats(stages={}, parent=None)),
                         0,
                         lazy=False)
    # Compute the (n-1) indices needed for an equal split of the data.
    count = wrapped_ds.count()
    dataset_format = wrapped_ds._dataset_format()
    indices = []
    cur_idx = 0
    for _ in range(num_blocks - 1):
        cur_idx += count / num_blocks
        indices.append(int(cur_idx))
    assert len(indices) < num_blocks, (indices, num_blocks)
    if indices:
        splits = wrapped_ds.split_at_indices(indices)
    else:
        splits = [wrapped_ds]
    # TODO(ekl) include stats for the split tasks. We may also want to
    # consider combining the split and coalesce tasks as an optimization.

    # Coalesce each split into a single block.
    reduce_task = cached_remote_fn(
        _ShufflePartitionOp.reduce).options(num_returns=2)
    reduce_bar = ProgressBar("Repartition", position=0, total=len(splits))
    reduce_out = [
        reduce_task.remote(False, None, *s.get_internal_block_refs())
        for s in splits if s.num_blocks() > 0
    ]

    # Early-release memory.
    del splits, blocks, wrapped_ds

    new_blocks, new_metadata = zip(*reduce_out)
    new_blocks, new_metadata = list(new_blocks), list(new_metadata)
    new_metadata = reduce_bar.fetch_until_complete(new_metadata)
    reduce_bar.close()

    # Handle empty blocks.
    if len(new_blocks) < num_blocks:
        from ray.data.impl.arrow_block import ArrowBlockBuilder
        from ray.data.impl.pandas_block import PandasBlockBuilder
        from ray.data.impl.simple_block import SimpleBlockBuilder

        num_empties = num_blocks - len(new_blocks)
        if dataset_format == "arrow":
            builder = ArrowBlockBuilder()
        elif dataset_format == "pandas":
            builder = PandasBlockBuilder()
        else:
            builder = SimpleBlockBuilder()
        empty_block = builder.build()
        empty_meta = BlockAccessor.for_block(empty_block).get_metadata(
            input_files=None, exec_stats=None)  # No stats for empty block.
        empty_blocks, empty_metadata = zip(*[(ray.put(empty_block), empty_meta)
                                             for _ in range(num_empties)])
        new_blocks += empty_blocks
        new_metadata += empty_metadata

    return BlockList(new_blocks, new_metadata), {}
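fast_repartition appears to back the non-shuffle path of the public Dataset.repartition(); a hedged driver-level sketch:

import ray

ds = ray.data.range(1000)
# shuffle=False takes the split-and-coalesce path sketched above.
ds = ds.repartition(20, shuffle=False)
assert ds.num_blocks() == 20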
Example #17
def read_datasource(
    datasource: Datasource[T],
    *,
    parallelism: int = 200,
    ray_remote_args: Dict[str, Any] = None,
    _spread_resource_prefix: Optional[str] = None,
    **read_args,
) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read. Parallelism may be
            limited by the available partitioning of the datasource.
        read_args: Additional kwargs to pass to the datasource impl.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.

    Returns:
        Dataset holding the data read from the datasource.
    """

    # TODO(ekl) remove this feature flag.
    if "RAY_DATASET_FORCE_LOCAL_METADATA" in os.environ:
        read_tasks = datasource.prepare_read(parallelism, **read_args)
    else:
        # Prepare read in a remote task so that in Ray client mode, we aren't
        # attempting metadata resolution from the client machine.
        ctx = DatasetContext.get_current()
        prepare_read = cached_remote_fn(_prepare_read,
                                        retry_exceptions=False,
                                        num_cpus=0)
        read_tasks = ray.get(
            prepare_read.remote(datasource, ctx, parallelism,
                                _wrap_s3_filesystem_workaround(read_args)))

    context = DatasetContext.get_current()
    stats_actor = get_or_create_stats_actor()
    stats_uuid = uuid.uuid4()
    stats_actor.record_start.remote(stats_uuid)

    def remote_read(i: int, task: ReadTask) -> MaybeBlockPartition:
        DatasetContext._set_current(context)
        stats = BlockExecStats.builder()

        # Execute the read task.
        block = task()

        if context.block_splitting_enabled:
            metadata = task.get_metadata()
            metadata.exec_stats = stats.build()
        else:
            metadata = BlockAccessor.for_block(block).get_metadata(
                input_files=task.get_metadata().input_files,
                exec_stats=stats.build())
        stats_actor.record_task.remote(stats_uuid, i, metadata)
        return block

    if ray_remote_args is None:
        ray_remote_args = {}
    # Increase the read parallelism by default to maximize IO throughput. This
    # is particularly important when reading from e.g., remote storage.
    if "num_cpus" not in ray_remote_args:
        # Note that the too many workers warning triggers at 4x subscription,
        # so we go at 0.5 to avoid the warning message.
        ray_remote_args["num_cpus"] = 0.5
    remote_read = cached_remote_fn(remote_read)

    if _spread_resource_prefix is not None:
        # Use given spread resource prefix for round-robin resource-based
        # scheduling.
        nodes = ray.nodes()
        resource_iter = _get_spread_resources_iter(nodes,
                                                   _spread_resource_prefix,
                                                   ray_remote_args)
    else:
        # If no spread resource prefix given, yield an empty dictionary.
        resource_iter = itertools.repeat({})

    calls: List[Callable[[], ObjectRef[MaybeBlockPartition]]] = []
    metadata: List[BlockPartitionMetadata] = []

    for i, task in enumerate(read_tasks):
        calls.append(
            lambda i=i, task=task, resources=next(resource_iter): remote_read.
            options(**ray_remote_args, resources=resources).remote(i, task))
        metadata.append(task.get_metadata())

    block_list = LazyBlockList(calls, metadata)

    # Get the schema from the first block synchronously.
    if metadata and metadata[0].schema is None:
        block_list.ensure_schema_for_first_block()

    return Dataset(
        block_list,
        0,
        DatasetStats(
            stages={"read": metadata},
            parent=None,
            stats_actor=stats_actor,
            stats_uuid=stats_uuid,
        ),
    )
Example #18
File: read_api.py (Project: scv119/ray)
def read_datasource(
    datasource: Datasource[T],
    *,
    parallelism: int = 200,
    ray_remote_args: Dict[str, Any] = None,
    _spread_resource_prefix: Optional[str] = None,
    **read_args,
) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read. Parallelism may be
            limited by the available partitioning of the datasource.
        read_args: Additional kwargs to pass to the datasource impl.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.

    Returns:
        Dataset holding the data read from the datasource.
    """
    # TODO(ekl) remove this feature flag.
    force_local = "RAY_DATASET_FORCE_LOCAL_METADATA" in os.environ
    pa_ds = _lazy_import_pyarrow_dataset()
    if pa_ds:
        partitioning = read_args.get("dataset_kwargs", {}).get("partitioning", None)
        if isinstance(partitioning, pa_ds.Partitioning):
            logger.info(
                "Forcing local metadata resolution since the provided partitioning "
                f"{partitioning} is not serializable."
            )
            force_local = True

    if force_local:
        read_tasks = datasource.prepare_read(parallelism, **read_args)
    else:
        # Prepare read in a remote task so that in Ray client mode, we aren't
        # attempting metadata resolution from the client machine.
        ctx = DatasetContext.get_current()
        prepare_read = cached_remote_fn(
            _prepare_read, retry_exceptions=False, num_cpus=0
        )
        read_tasks = ray.get(
            prepare_read.remote(
                datasource,
                ctx,
                parallelism,
                _wrap_arrow_serialization_workaround(read_args),
            )
        )

    context = DatasetContext.get_current()
    stats_actor = get_or_create_stats_actor()
    stats_uuid = uuid.uuid4()
    stats_actor.record_start.remote(stats_uuid)

    def remote_read(i: int, task: ReadTask, stats_actor) -> MaybeBlockPartition:
        DatasetContext._set_current(context)
        stats = BlockExecStats.builder()

        # Execute the read task.
        block = task()

        if context.block_splitting_enabled:
            metadata = task.get_metadata()
            metadata.exec_stats = stats.build()
        else:
            metadata = BlockAccessor.for_block(block).get_metadata(
                input_files=task.get_metadata().input_files, exec_stats=stats.build()
            )
        stats_actor.record_task.remote(stats_uuid, i, metadata)
        return block

    if ray_remote_args is None:
        ray_remote_args = {}
    if "scheduling_strategy" not in ray_remote_args:
        ray_remote_args["scheduling_strategy"] = "SPREAD"
    remote_read = cached_remote_fn(remote_read)

    if _spread_resource_prefix is not None:
        if context.optimize_fuse_stages:
            logger.warning(
                "_spread_resource_prefix has no effect when optimize_fuse_stages "
                "is enabled. Tasks are spread by default."
            )
        # Use given spread resource prefix for round-robin resource-based
        # scheduling.
        nodes = ray.nodes()
        resource_iter = _get_spread_resources_iter(
            nodes, _spread_resource_prefix, ray_remote_args
        )
    else:
        # If no spread resource prefix given, yield an empty dictionary.
        resource_iter = itertools.repeat({})

    calls: List[Callable[[], ObjectRef[MaybeBlockPartition]]] = []
    metadata: List[BlockPartitionMetadata] = []

    for i, task in enumerate(read_tasks):
        calls.append(
            lambda i=i, task=task, resources=next(resource_iter): remote_read.options(
                **ray_remote_args, resources=resources
            ).remote(i, task, stats_actor)
        )
        metadata.append(task.get_metadata())

    block_list = LazyBlockList(calls, metadata)
    # TODO(ekl) consider refactoring LazyBlockList to take read_tasks explicitly.
    block_list._read_tasks = read_tasks
    block_list._read_remote_args = ray_remote_args

    # Get the schema from the first block synchronously.
    if metadata and metadata[0].schema is None:
        block_list.ensure_schema_for_first_block()

    stats = DatasetStats(
        stages={"read": metadata},
        parent=None,
        stats_actor=stats_actor,
        stats_uuid=stats_uuid,
    )
    return Dataset(
        ExecutionPlan(block_list, stats),
        0,
        False,
    )
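A hedged usage sketch of read_datasource; it assumes the built-in RangeDatasource (the source behind ray.data.range()) and its n read argument:

import ray
from ray.data.datasource import RangeDatasource

# Extra keyword arguments are forwarded through **read_args to the
# datasource's prepare_read(); ray_remote_args is passed to the read tasks.
ds = ray.data.read_datasource(
    RangeDatasource(),
    parallelism=10,
    n=1000,
    ray_remote_args={"num_cpus": 0.5},
)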