Example #1
def from_pandas_refs(
    dfs: Union[ObjectRef["pandas.DataFrame"], List[ObjectRef["pandas.DataFrame"]]]
) -> Dataset[ArrowRow]:
    """Create a dataset from a list of Ray object references to Pandas
    dataframes.

    Args:
        dfs: A Ray object reference to a pandas dataframe, or a list of
             Ray object references to pandas dataframes.

    Returns:
        Dataset holding Arrow records read from the dataframes.
    """
    if isinstance(dfs, ray.ObjectRef):
        dfs = [dfs]
    elif isinstance(dfs, list):
        for df in dfs:
            if not isinstance(df, ray.ObjectRef):
                raise ValueError(
                    "Expected list of Ray object refs, "
                    f"got list containing {type(df)}"
                )
    else:
        raise ValueError(
            "Expected Ray object ref or list of Ray object refs, " f"got {type(df)}"
        )

    context = DatasetContext.get_current()
    if context.enable_pandas_block:
        get_metadata = cached_remote_fn(_get_metadata)
        metadata = ray.get([get_metadata.remote(df) for df in dfs])
        return Dataset(
            ExecutionPlan(
                BlockList(dfs, metadata),
                DatasetStats(stages={"from_pandas_refs": metadata}, parent=None),
            ),
            0,
            False,
        )

    df_to_block = cached_remote_fn(_df_to_block, num_returns=2)

    res = [df_to_block.remote(df) for df in dfs]
    blocks, metadata = map(list, zip(*res))
    metadata = ray.get(metadata)
    return Dataset(
        ExecutionPlan(
            BlockList(blocks, metadata),
            DatasetStats(stages={"from_pandas_refs": metadata}, parent=None),
        ),
        0,
        False,
    )
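
A minimal usage sketch, assuming the public ray.data.from_pandas_refs wrapper that exposes this function (Ray ~1.13/2.0 API):

import pandas as pd
import ray

ray.init(ignore_reinit_error=True)

# Put two dataframes into the object store and build a Dataset from their
# object references, without copying the data back to the driver.
df_refs = [
    ray.put(pd.DataFrame({"a": [1, 2], "b": [3, 4]})),
    ray.put(pd.DataFrame({"a": [5, 6], "b": [7, 8]})),
]
ds = ray.data.from_pandas_refs(df_refs)
print(ds.count())  # 4 rows across 2 blocks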
Example #2
def from_arrow_refs(
    tables: Union[
        ObjectRef[Union["pyarrow.Table", bytes]],
        List[ObjectRef[Union["pyarrow.Table", bytes]]],
    ]
) -> Dataset[ArrowRow]:
    """Create a dataset from a set of Arrow tables.

    Args:
        tables: A Ray object reference to an Arrow table (or to its serialized
                streaming-format bytes), or a list of such references.

    Returns:
        Dataset holding Arrow records from the tables.
    """
    if isinstance(tables, ray.ObjectRef):
        tables = [tables]

    get_metadata = cached_remote_fn(_get_metadata)
    metadata = ray.get([get_metadata.remote(t) for t in tables])
    return Dataset(
        ExecutionPlan(
            BlockList(tables, metadata),
            DatasetStats(stages={"from_arrow_refs": metadata}, parent=None),
        ),
        0,
        False,
    )
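
The same pattern works for Arrow tables via the public ray.data.from_arrow_refs API (a sketch under the same version assumptions):

import pyarrow as pa
import ray

ray.init(ignore_reinit_error=True)

table_ref = ray.put(pa.table({"x": [1, 2, 3]}))
ds = ray.data.from_arrow_refs([table_ref])
print(ds.schema())  # x: int64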
Example #3
File: split.py Project: parasj/ray
def _calculate_blocks_rows(
    blocks_with_metadata: List[Tuple[ObjectRef[Block], BlockMetadata]],
) -> List[int]:
    """Calculate the number of rows for a list of blocks with metadata."""
    get_num_rows = cached_remote_fn(_get_num_rows)
    block_rows = []
    for block, metadata in blocks_with_metadata:
        if metadata.num_rows is None:
            # Need to fetch number of rows.
            num_rows = ray.get(get_num_rows.remote(block))
        else:
            num_rows = metadata.num_rows
        block_rows.append(num_rows)
    return block_rows
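
The fallback pattern above (use cached metadata when present, otherwise count rows remotely) can be sketched with a plain Ray task; the helper below is hypothetical, not part of split.py:

import ray

ray.init(ignore_reinit_error=True)

@ray.remote
def count_rows(block):
    # For a simple list-based block, the row count is just its length.
    return len(block)

blocks_with_metadata = [
    (ray.put([1, 2, 3]), {"num_rows": 3}),        # metadata already known
    (ray.put([4, 5, 6, 7]), {"num_rows": None}),  # must be fetched remotely
]
block_rows = [
    meta["num_rows"]
    if meta["num_rows"] is not None
    else ray.get(count_rows.remote(block))
    for block, meta in blocks_with_metadata
]
print(block_rows)  # [3, 4]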
Example #4
def _fetch_metadata_remotely(
    pieces: List["pyarrow._dataset.ParquetFileFragment"],
) -> List[ObjectRef["pyarrow.parquet.FileMetaData"]]:

    remote_fetch_metadata = cached_remote_fn(
        _fetch_metadata_serialization_wrapper)
    metas = []
    parallelism = min(len(pieces) // PIECES_PER_META_FETCH, 100)
    meta_fetch_bar = ProgressBar("Metadata Fetch Progress", total=parallelism)
    for pcs in np.array_split(pieces, parallelism):
        if len(pcs) == 0:
            continue
        metas.append(
            remote_fetch_metadata.remote([_SerializedPiece(p) for p in pcs]))
    metas = meta_fetch_bar.fetch_until_complete(metas)
    return list(itertools.chain.from_iterable(metas))
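
The batching in the loop above relies on np.array_split, which yields roughly equal chunks and may yield empty ones when there are more batches than items; a standalone sketch:

import numpy as np

items = list(range(10))
# At most 4 batches; empty batches are skipped, mirroring the loop above.
batches = [b for b in np.array_split(items, 4) if len(b) > 0]
print([list(b) for b in batches])  # [[0, 1, 2], [3, 4, 5], [6, 7], [8, 9]]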
Example #5
    def _submit_task(
        self, task_idx: int
    ) -> Tuple[ObjectRef[MaybeBlockPartition], ObjectRef[BlockPartitionMetadata]]:
        """Submit the task with index task_idx."""
        stats_actor = _get_or_create_stats_actor()
        if not self._execution_started:
            stats_actor.record_start.remote(self._stats_uuid)
            self._execution_started = True
        task = self._tasks[task_idx]
        return (
            cached_remote_fn(_execute_read_task)
            .options(num_returns=2, **self._remote_args)
            .remote(
                i=task_idx,
                task=task,
                context=DatasetContext.get_current(),
                stats_uuid=self._stats_uuid,
                stats_actor=stats_actor,
            )
        )
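
The num_returns=2 pattern used here (payload and metadata as separate objects) can be illustrated with a standalone Ray task; the names below are illustrative only:

import ray

ray.init(ignore_reinit_error=True)

@ray.remote(num_returns=2)
def produce():
    # Return the payload and a small metadata record as two separate objects,
    # so the caller can fetch the metadata without pulling the payload.
    data = list(range(1000))
    meta = {"num_rows": len(data)}
    return data, meta

data_ref, meta_ref = produce.remote()
print(ray.get(meta_ref))  # {'num_rows': 1000}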
Example #6
    def _execute_reduce_stage(
        self,
        output_num_blocks: int,
        schedule: _PushBasedShuffleTaskSchedule,
        reduce_ray_remote_args: Dict[str, Any],
        all_merge_results: List[List[ObjectRef]],
    ):
        shuffle_reduce = cached_remote_fn(self.reduce)
        # Execute the final reduce stage.
        shuffle_reduce_out = []
        for reducer_idx in range(output_num_blocks):
            merge_idx = schedule.get_merge_idx_for_reducer_idx(reducer_idx)
            # Submit one partition of reduce tasks, one for each of the P
            # outputs produced by the corresponding merge task.
            # We also add the merge task arguments so that the reduce task
            # is colocated with its inputs.
            shuffle_reduce_out.append(
                shuffle_reduce.options(
                    **reduce_ray_remote_args,
                    **schedule.get_merge_task_options(merge_idx),
                    num_returns=2,
                ).remote(
                    *self._reduce_args,
                    *[
                        merge_results.pop(0)
                        for merge_results in all_merge_results[merge_idx]
                    ],
                )
            )
        for merge_idx, merge_results in enumerate(all_merge_results):
            assert all(len(merge_result) == 0 for merge_result in merge_results), (
                "Reduce stage did not process outputs from merge tasks at index: "
                f"{merge_idx}"
            )
        assert (
            len(shuffle_reduce_out) == output_num_blocks
        ), f"Expected {output_num_blocks} outputs, produced {len(shuffle_reduce_out)}"

        reduce_bar = ProgressBar("Shuffle Reduce", total=output_num_blocks)
        reduce_blocks, reduce_metadata = zip(*shuffle_reduce_out)
        reduce_metadata = reduce_bar.fetch_until_complete(list(reduce_metadata))
        reduce_bar.close()

        return reduce_metadata, reduce_blocks
Example #7
def _fetch_metadata_remotely(
    pieces: List["pyarrow._dataset.ParquetFileFragment"],
) -> List[ObjectRef["pyarrow.parquet.FileMetaData"]]:
    from ray import cloudpickle

    remote_fetch_metadata = cached_remote_fn(
        _fetch_metadata_serialization_wrapper)
    metas = []
    parallelism = min(len(pieces) // PIECES_PER_META_FETCH, 100)
    meta_fetch_bar = ProgressBar("Metadata Fetch Progress", total=parallelism)
    try:
        _register_parquet_file_fragment_serialization()
        for pcs in np.array_split(pieces, parallelism):
            if len(pcs) == 0:
                continue
            metas.append(remote_fetch_metadata.remote(cloudpickle.dumps(pcs)))
    finally:
        _deregister_parquet_file_fragment_serialization()
    metas = meta_fetch_bar.fetch_until_complete(metas)
    return list(itertools.chain.from_iterable(metas))
Example #8
def from_numpy_refs(
    ndarrays: Union[ObjectRef[np.ndarray], List[ObjectRef[np.ndarray]]],
) -> Dataset[ArrowRow]:
    """Create a dataset from a list of NumPy ndarray futures.

    Args:
        ndarrays: A Ray object reference to a NumPy ndarray or a list of Ray object
            references to NumPy ndarrays.

    Returns:
        Dataset holding the given ndarrays.
    """
    if isinstance(ndarrays, ray.ObjectRef):
        ndarrays = [ndarrays]
    elif isinstance(ndarrays, list):
        for ndarray in ndarrays:
            if not isinstance(ndarray, ray.ObjectRef):
                raise ValueError(
                    "Expected list of Ray object refs, "
                    f"got list containing {type(ndarray)}"
                )
    else:
        raise ValueError(
            f"Expected Ray object ref or list of Ray object refs, got {type(ndarray)}"
        )

    ndarray_to_block = cached_remote_fn(_ndarray_to_block, num_returns=2)

    res = [ndarray_to_block.remote(ndarray) for ndarray in ndarrays]
    blocks, metadata = map(list, zip(*res))
    metadata = ray.get(metadata)
    return Dataset(
        ExecutionPlan(
            BlockList(blocks, metadata),
            DatasetStats(stages={"from_numpy_refs": metadata}, parent=None),
        ),
        0,
        False,
    )
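
A minimal usage sketch via the public ray.data.from_numpy_refs API (same version assumptions as above):

import numpy as np
import ray

ray.init(ignore_reinit_error=True)

arr_refs = [ray.put(np.arange(4)), ray.put(np.arange(4, 8))]
ds = ray.data.from_numpy_refs(arr_refs)
print(ds.count())  # 8 rows, one per ndarray element along the first axis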
Example #9
        def do_zip_all(block_list, clear_input_blocks: bool, *_):
            blocks1 = block_list.get_blocks()
            blocks2 = other.get_internal_block_refs()

            if clear_input_blocks:
                block_list.clear()

            if len(blocks1) != len(blocks2):
                # TODO(ekl) consider supporting if num_rows are equal.
                raise ValueError(
                    "Cannot zip dataset of different num blocks: {} vs {}".format(
                        len(blocks1), len(blocks2)
                    )
                )

            def do_zip(block1: Block, block2: Block) -> Tuple[Block, BlockMetadata]:
                stats = BlockExecStats.builder()
                b1 = BlockAccessor.for_block(block1)
                result = b1.zip(block2)
                br = BlockAccessor.for_block(result)
                return result, br.get_metadata(input_files=[], exec_stats=stats.build())

            do_zip_fn = cached_remote_fn(do_zip, num_returns=2)

            blocks = []
            metadata = []
            for b1, b2 in zip(blocks1, blocks2):
                res, meta = do_zip_fn.remote(b1, b2)
                blocks.append(res)
                metadata.append(meta)

            # Early release memory.
            del blocks1, blocks2

            # TODO(ekl) it might be nice to have a progress bar here.
            metadata = ray.get(metadata)
            blocks = BlockList(blocks, metadata)
            return blocks, {}
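
From the user's side, this plan runs through Dataset.zip; a small sketch (assuming both datasets end up with the same number of blocks, as required above):

import ray

ray.init(ignore_reinit_error=True)

ds1 = ray.data.range(4)
ds2 = ray.data.range(4).map(lambda x: x * 10)
zipped = ds1.zip(ds2)
# Each output row pairs the corresponding rows of the two datasets.
print(zipped.take(4))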
Example #10
def sample_boundaries(blocks: List[ObjectRef[Block]], key: SortKeyT,
                      num_reducers: int) -> List[T]:
    """
    Return (num_reducers - 1) items in ascending order from the blocks that
    partition the domain into ranges with approximately equally many elements.
    """
    # TODO(Clark): Support multiple boundary sampling keys.
    if isinstance(key, list) and len(key) > 1:
        raise ValueError("Multiple boundary sampling keys not supported.")

    n_samples = int(num_reducers * 10 / len(blocks))

    sample_block = cached_remote_fn(_sample_block)

    sample_results = [
        sample_block.remote(block, n_samples, key) for block in blocks
    ]
    sample_bar = ProgressBar("Sort Sample", len(sample_results))
    samples = sample_bar.fetch_until_complete(sample_results)
    sample_bar.close()
    del sample_results
    samples = [s for s in samples if len(s) > 0]
    # The dataset is empty
    if len(samples) == 0:
        return [None] * (num_reducers - 1)
    builder = DelegatingBlockBuilder()
    for sample in samples:
        builder.add_block(sample)
    samples = builder.build()
    column = key[0][0] if isinstance(key, list) else None
    sample_items = BlockAccessor.for_block(samples).to_numpy(column)
    sample_items = np.sort(sample_items)
    ret = [
        np.quantile(sample_items, q, interpolation="nearest")
        for q in np.linspace(0, 1, num_reducers)
    ]
    return ret[1:]
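
The boundary computation at the end reduces to taking evenly spaced quantiles of the pooled samples and dropping the 0-quantile; a NumPy-only sketch:

import numpy as np

samples = np.sort(np.array([3, 1, 7, 9, 2, 8, 5, 4]))
num_reducers = 4
# Quantiles at 1/3, 2/3 and 1 of the sorted samples; the 0-quantile is dropped,
# leaving num_reducers - 1 = 3 boundaries.
boundaries = [np.quantile(samples, q) for q in np.linspace(0, 1, num_reducers)][1:]
print(boundaries)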
Example #11
File: split.py Project: parasj/ray
def _split_all_blocks(
    blocks_with_metadata: List[Tuple[ObjectRef[Block], BlockMetadata]],
    block_rows: List[int],
    per_block_split_indices: List[List[int]],
) -> List[List[Tuple[ObjectRef[Block], BlockMetadata]]]:
    """Split all the input blocks based on the split indices"""
    split_single_block = cached_remote_fn(_split_single_block)

    all_blocks_split_results: List[List[Tuple[
        ObjectRef[Block], BlockMetadata]]] = [None] * len(blocks_with_metadata)

    split_single_block_futures = []

    for block_id, block_split_indices in enumerate(per_block_split_indices):
        (block_ref, meta) = blocks_with_metadata[block_id]
        block_row = block_rows[block_id]
        if len(block_split_indices) == 0:
            # optimization: if no split is needed, we just need to add it to the
            # result
            all_blocks_split_results[block_id] = [(block_ref, meta)]
        else:
            # otherwise call split remote function.
            split_single_block_futures.append(
                split_single_block.options(
                    scheduling_strategy="SPREAD").remote(
                        block_id,
                        block_ref,
                        meta,
                        block_row,
                        block_split_indices,
                    ))
    if split_single_block_futures:
        split_single_block_results = ray.get(split_single_block_futures)
        for block_id, block_split_result in split_single_block_results:
            all_blocks_split_results[block_id] = block_split_result
    return all_blocks_split_results
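
The scheduling_strategy="SPREAD" option used above asks Ray to spread the split tasks across nodes; a self-contained sketch with a hypothetical splitting task:

import ray

ray.init(ignore_reinit_error=True)

@ray.remote
def split_block(block, indices):
    # Split a list-based block at the given row offsets.
    out, prev = [], 0
    for idx in indices + [len(block)]:
        out.append(block[prev:idx])
        prev = idx
    return out

block_ref = ray.put(list(range(10)))
parts = ray.get(
    split_block.options(scheduling_strategy="SPREAD").remote(block_ref, [3, 7])
)
print(parts)  # [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9]]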
Example #12
    def execute(
        self,
        input_blocks: BlockList,
        output_num_blocks: int,
        clear_input_blocks: bool,
        *,
        map_ray_remote_args: Optional[Dict[str, Any]] = None,
        reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
        merge_factor: int = 2,
    ) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
        logger.info("Using experimental push-based shuffle.")
        # TODO(swang): For jobs whose reduce work is heavier than the map work,
        # we should support fractional merge factors.
        # TODO(swang): For large jobs, we should try to choose the merge factor
        # automatically, e.g., by running one test round of map and merge tasks
        # and comparing their run times.
        # TODO(swang): Add option to automatically reduce write amplification
        # during map-merge stage, by limiting how many partitions can be
        # processed concurrently.
        input_blocks_list = input_blocks.get_blocks()
        # Preemptively clear the blocks list since we will incrementally delete
        # the last remaining references as we submit the dependent map tasks
        # during the map-merge stage.
        if clear_input_blocks:
            input_blocks.clear()

        if map_ray_remote_args is None:
            map_ray_remote_args = {}
        if reduce_ray_remote_args is None:
            reduce_ray_remote_args = {}
        # The placement strategy for reduce tasks is overwritten to colocate
        # them with their inputs from the merge stage, so remove any
        # pre-specified scheduling strategy here.
        reduce_ray_remote_args = reduce_ray_remote_args.copy()
        reduce_ray_remote_args.pop("scheduling_strategy", None)

        # Compute all constants used for task scheduling.
        num_cpus_per_node_map = _get_num_cpus_per_node_map()
        stage = self._compute_shuffle_schedule(
            num_cpus_per_node_map,
            len(input_blocks_list),
            merge_factor,
            output_num_blocks,
        )

        map_fn = self._map_partition
        merge_fn = self._merge

        def map_partition(*args, **kwargs):
            return map_fn(self.map, *args, **kwargs)

        def merge(*args, **kwargs):
            return merge_fn(self.reduce, *args, **kwargs)

        shuffle_map = cached_remote_fn(map_partition)
        shuffle_map = shuffle_map.options(
            **map_ray_remote_args,
            num_returns=1 + stage.num_merge_tasks_per_round,
        )

        map_stage_iter = _MapStageIterator(
            input_blocks_list,
            shuffle_map,
            [output_num_blocks, stage.merge_schedule, *self._map_args],
        )
        map_bar = ProgressBar("Shuffle Map",
                              position=0,
                              total=len(input_blocks_list))
        map_stage_executor = _PipelinedStageExecutor(
            map_stage_iter,
            stage.num_map_tasks_per_round,
            progress_bar=map_bar)

        shuffle_merge = cached_remote_fn(merge)
        merge_stage_iter = _MergeStageIterator(map_stage_iter, shuffle_merge,
                                               stage, self._reduce_args)
        merge_stage_executor = _PipelinedStageExecutor(
            merge_stage_iter,
            stage.num_merge_tasks_per_round,
            max_concurrent_rounds=2)

        # Execute the map-merge stage. This submits tasks in rounds of M map
        # tasks and N merge tasks each. Task execution between map and merge is
        # pipelined, so that while executing merge for one round of inputs, we
        # also execute the map tasks for the following round.
        map_done = False
        merge_done = False
        map_stage_metadata = []
        merge_stage_metadata = []
        while not (map_done and merge_done):
            try:
                map_stage_metadata += next(map_stage_executor)
            except StopIteration:
                map_done = True
                break

            try:
                merge_stage_metadata += next(merge_stage_executor)
            except StopIteration:
                merge_done = True
                break

        map_bar.close()
        all_merge_results = merge_stage_iter.pop_merge_results()

        # Execute and wait for the reduce stage.
        reduce_bar = ProgressBar("Shuffle Reduce", total=output_num_blocks)
        shuffle_reduce = cached_remote_fn(self.reduce)
        reduce_stage_iter = _ReduceStageIterator(
            stage,
            shuffle_reduce,
            all_merge_results,
            reduce_ray_remote_args,
            self._reduce_args,
        )

        max_reduce_tasks_in_flight = output_num_blocks
        ctx = DatasetContext.get_current()
        if ctx.pipeline_push_based_shuffle_reduce_tasks:
            # If pipelining is enabled, we should still try to utilize all
            # cores.
            max_reduce_tasks_in_flight = min(
                max_reduce_tasks_in_flight,
                sum(num_cpus_per_node_map.values()))

        reduce_stage_executor = _PipelinedStageExecutor(
            reduce_stage_iter,
            max_reduce_tasks_in_flight,
            max_concurrent_rounds=2,
            progress_bar=reduce_bar,
        )
        reduce_stage_metadata = []
        while True:
            try:
                reduce_stage_metadata += next(reduce_stage_executor)
            except StopIteration:
                break

        new_blocks = reduce_stage_iter.pop_reduce_results()
        sorted_blocks = [(block[0], block[1], reduce_stage_metadata[i])
                         for i, block in enumerate(new_blocks)]
        sorted_blocks.sort(key=lambda x: x[0])
        _, new_blocks, reduce_stage_metadata = zip(*sorted_blocks)
        del sorted_blocks

        assert (
            len(new_blocks) == output_num_blocks
        ), f"Expected {output_num_blocks} outputs, produced {len(new_blocks)}"
        reduce_bar.close()

        stats = {
            "map": map_stage_metadata,
            "merge": merge_stage_metadata,
            "reduce": reduce_stage_metadata,
        }

        return BlockList(list(new_blocks), list(reduce_stage_metadata)), stats
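
From the user's side, this experimental path is opted into via a DatasetContext flag before running a shuffle-heavy operation (flag and environment-variable names assumed from Ray ~1.13/2.0):

import ray
from ray.data.context import DatasetContext

ray.init(ignore_reinit_error=True)

ctx = DatasetContext.get_current()
ctx.use_push_based_shuffle = True  # also settable via RAY_DATASET_PUSH_BASED_SHUFFLE=1

ds = ray.data.range(1000).random_shuffle()
print(ds.count())  # 1000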
Example #13
    def __init__(
        self,
        dataset: "Dataset[T]",
        key: str,
        num_workers: int,
    ):
        """Construct a RandomAccessDataset (internal API).

        The constructor is a private API. Use ``dataset.to_random_access_dataset()``
        to construct a RandomAccessDataset.
        """
        self._format = dataset._dataset_format()
        if self._format not in ["arrow", "pandas"]:
            raise ValueError(
                "RandomAccessDataset only supports Arrow- or Pandas-format datasets."
            )

        start = time.perf_counter()
        logger.info("[setup] Indexing dataset by sort key.")
        sorted_ds = dataset.sort(key)
        get_bounds = cached_remote_fn(_get_bounds)
        blocks = sorted_ds.get_internal_block_refs()

        logger.info("[setup] Computing block range bounds.")
        bounds = ray.get(
            [get_bounds.remote(b, key, self._format) for b in blocks])
        self._non_empty_blocks = []
        self._lower_bound = None
        self._upper_bounds = []
        for i, b in enumerate(bounds):
            if b:
                self._non_empty_blocks.append(blocks[i])
                if self._lower_bound is None:
                    self._lower_bound = b[0]
                self._upper_bounds.append(b[1])

        logger.info(
            "[setup] Creating {} random access workers.".format(num_workers))
        ctx = DatasetContext.get_current()
        if ctx.scheduling_strategy != DEFAULT_SCHEDULING_STRATEGY:
            scheduling_strategy = ctx.scheduling_strategy
        else:
            scheduling_strategy = "SPREAD"
        self._workers = [
            _RandomAccessWorker.options(
                scheduling_strategy=scheduling_strategy).remote(
                    key, self._format) for _ in range(num_workers)
        ]
        (
            self._block_to_workers_map,
            self._worker_to_blocks_map,
        ) = self._compute_block_to_worker_assignments()

        logger.info("[setup] Worker to blocks assignment: {}".format(
            self._worker_to_blocks_map))
        ray.get([
            w.assign_blocks.remote({
                i: self._non_empty_blocks[i]
                for i in self._worker_to_blocks_map[w]
            }) for w in self._workers
        ])

        logger.info("[setup] Finished assigning blocks to workers.")
        self._build_time = time.perf_counter() - start
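
The public entry point for this constructor is Dataset.to_random_access_dataset; a sketch assuming the experimental API shape of Ray ~2.0:

import ray

ray.init(ignore_reinit_error=True)

ds = ray.data.range_table(100)  # Arrow-format dataset with a "value" column
rad = ds.to_random_access_dataset(key="value", num_workers=2)
# get_async returns an ObjectRef to the record whose key equals 42 (or None).
print(ray.get(rad.get_async(42)))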
Example #14
    def execute(
        self,
        input_blocks: BlockList,
        output_num_blocks: int,
        clear_input_blocks: bool,
        *,
        map_ray_remote_args: Optional[Dict[str, Any]] = None,
        reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
        merge_factor: int = 2,
    ) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
        logger.info("Using experimental push-based shuffle.")
        # TODO(swang): For jobs whose reduce work is heavier than the map work,
        # we should support fractional merge factors.
        # TODO(swang): For large jobs, we should try to choose the merge factor
        # automatically, e.g., by running one test round of map and merge tasks
        # and comparing their run times.
        # TODO(swang): Add option to automatically reduce write amplification
        # during map-merge stage, by limiting how many partitions can be
        # processed concurrently.
        input_blocks_list = input_blocks.get_blocks()
        # Preemptively clear the blocks list since we will incrementally delete
        # the last remaining references as we submit the dependent map tasks
        # during the map-merge stage.
        if clear_input_blocks:
            input_blocks.clear()

        if map_ray_remote_args is None:
            map_ray_remote_args = {}
        if reduce_ray_remote_args is None:
            reduce_ray_remote_args = {}
        # The placement strategy for reduce tasks is overwritten to colocate
        # them with their inputs from the merge stage, so remove any
        # pre-specified scheduling strategy here.
        reduce_ray_remote_args = reduce_ray_remote_args.copy()
        reduce_ray_remote_args.pop("scheduling_strategy", None)

        map_fn = self._map_partition
        merge_fn = self._merge

        def map_partition(*args, **kwargs):
            return map_fn(self.map, *args, **kwargs)

        def merge(*args, **kwargs):
            return merge_fn(self.reduce, *args, **kwargs)

        shuffle_map = cached_remote_fn(map_partition)
        shuffle_merge = cached_remote_fn(merge)

        def submit_map_task(arg):
            mapper_idx, block = arg
            # NOTE(swang): Results are shuffled between map and merge tasks, so
            # there is no advantage to colocating specific map and merge tasks.
            # Therefore, we do not specify a node affinity policy for map tasks
            # in case the caller or Ray has a better scheduling strategy, e.g.,
            # based on data locality.
            map_result = shuffle_map.options(
                **map_ray_remote_args,
                num_returns=1 + schedule.num_merge_tasks_per_round,
            ).remote(
                mapper_idx,
                block,
                output_num_blocks,
                schedule,
                *self._map_args,
            )
            metadata_ref = map_result.pop(0)
            return metadata_ref, map_result

        def submit_merge_task(arg):
            merge_idx, map_results = arg
            num_merge_returns = schedule.get_num_reducers_per_merge_idx(merge_idx)
            merge_result = shuffle_merge.options(
                num_returns=1 + num_merge_returns,
                **schedule.get_merge_task_options(merge_idx),
            ).remote(
                *map_results,
                reduce_args=self._reduce_args,
            )
            metadata_ref = merge_result.pop(0)
            return metadata_ref, merge_result

        # Compute all constants used for task scheduling.
        num_cpus_per_node_map = _get_num_cpus_per_node_map()
        schedule = self._compute_shuffle_schedule(
            num_cpus_per_node_map,
            len(input_blocks_list),
            merge_factor,
            output_num_blocks,
        )

        # ObjectRef results from the last round of tasks. Used to add
        # backpressure during pipelining of map and merge tasks.
        last_map_metadata_results = []
        last_merge_metadata_results = []
        # Final outputs from the map-merge stage.
        # This is a map from merge task index to a nested list of merge results
        # (ObjectRefs). Each merge task index corresponds to a partition of P
        # final reduce tasks.
        all_merge_results = [[] for _ in range(schedule.num_merge_tasks_per_round)]
        shuffle_map_metadata = []
        shuffle_merge_metadata = []
        map_bar = ProgressBar("Shuffle Map", position=0, total=len(input_blocks_list))

        # Execute the map-merge stage. This submits tasks in rounds of M map
        # tasks and N merge tasks each. Task execution between map and merge is
        # pipelined, so that while executing merge for one round of inputs, we
        # also execute the map tasks for the following round.
        input_blocks_list = list(enumerate(input_blocks_list))
        while input_blocks_list:
            # Execute one round of the map stage.
            # Pop from the inputs so that we can clear the memory ASAP.
            round_input_blocks = []
            try:
                for _ in range(schedule.num_map_tasks_per_round):
                    round_input_blocks.append(input_blocks_list.pop(0))
            except IndexError:
                pass
            (
                prev_map_metadata,
                last_map_metadata_results,
                map_results,
            ) = _execute_pipelined_stage(
                submit_map_task,
                last_map_metadata_results,
                round_input_blocks,
                progress_bar=map_bar,
            )
            shuffle_map_metadata += prev_map_metadata

            # Shuffle the map results for the merge tasks.
            merge_args = [
                (merge_idx, [map_result.pop(0) for map_result in map_results])
                for merge_idx in range(schedule.num_merge_tasks_per_round)
            ]
            assert all([not map_result for map_result in map_results])
            # Execute one round of the merge stage.
            (
                prev_merge_metadata,
                last_merge_metadata_results,
                merge_results,
            ) = _execute_pipelined_stage(
                submit_merge_task,
                last_merge_metadata_results,
                merge_args,
            )
            shuffle_merge_metadata += prev_merge_metadata
            for merge_idx, merge_result in enumerate(merge_results):
                all_merge_results[merge_idx].append(merge_result)
            del merge_results

        # Wait for last map and merge tasks to finish.
        prev_map_metadata, _, _ = _execute_pipelined_stage(
            None, last_map_metadata_results, [], progress_bar=map_bar
        )
        shuffle_map_metadata += prev_map_metadata
        map_bar.close()
        prev_merge_metadata, _, _ = _execute_pipelined_stage(
            None, last_merge_metadata_results, []
        )
        shuffle_merge_metadata += prev_merge_metadata

        # Execute and wait for the reduce stage.
        new_metadata, new_blocks = self._execute_reduce_stage(
            output_num_blocks, schedule, reduce_ray_remote_args, all_merge_results
        )

        stats = {
            "map": shuffle_map_metadata,
            "merge": shuffle_merge_metadata,
            "reduce": new_metadata,
        }

        return BlockList(list(new_blocks), list(new_metadata)), stats
Example #15
def read_datasource(
    datasource: Datasource[T],
    *,
    parallelism: int = -1,
    ray_remote_args: Dict[str, Any] = None,
    **read_args,
) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read. Parallelism may be
            limited by the available partitioning of the datasource. If set to -1,
            parallelism will be automatically chosen based on the available cluster
            resources and estimated in-memory data size.
        read_args: Additional kwargs to pass to the datasource impl.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.

    Returns:
        Dataset holding the data read from the datasource.
    """
    ctx = DatasetContext.get_current()
    # TODO(ekl) remove this feature flag.
    force_local = "RAY_DATASET_FORCE_LOCAL_METADATA" in os.environ
    cur_pg = ray.util.get_current_placement_group()
    pa_ds = _lazy_import_pyarrow_dataset()
    if pa_ds:
        partitioning = read_args.get("dataset_kwargs",
                                     {}).get("partitioning", None)
        if isinstance(partitioning, pa_ds.Partitioning):
            logger.info(
                "Forcing local metadata resolution since the provided partitioning "
                f"{partitioning} is not serializable.")
            force_local = True

    if force_local:
        requested_parallelism, min_safe_parallelism, read_tasks = _get_read_tasks(
            datasource, ctx, cur_pg, parallelism, read_args)
    else:
        # Prepare read in a remote task so that in Ray client mode, we aren't
        # attempting metadata resolution from the client machine.
        get_read_tasks = cached_remote_fn(_get_read_tasks,
                                          retry_exceptions=False,
                                          num_cpus=0)

        requested_parallelism, min_safe_parallelism, read_tasks = ray.get(
            get_read_tasks.remote(
                datasource,
                ctx,
                cur_pg,
                parallelism,
                _wrap_and_register_arrow_serialization_workaround(read_args),
            ))

    if read_tasks and len(read_tasks) < min_safe_parallelism * 0.7:
        perc = 1 + round(
            (min_safe_parallelism - len(read_tasks)) / len(read_tasks), 1)
        logger.warning(
            f"{WARN_PREFIX} The blocks of this dataset are estimated to be {perc}x "
            "larger than the target block size "
            f"of {int(ctx.target_max_block_size / 1024 / 1024)} MiB. This may lead to "
            "out-of-memory errors during processing. Consider reducing the size of "
            "input files or using `.repartition(n)` to increase the number of "
            "dataset blocks.")
    elif len(read_tasks) < requested_parallelism and (
            len(read_tasks) < ray.available_resources().get("CPU", 1) // 2):
        logger.warning(
            f"{WARN_PREFIX} The number of blocks in this dataset ({len(read_tasks)}) "
            f"limits its parallelism to {len(read_tasks)} concurrent tasks. "
            "This is much less than the number "
            "of available CPU slots in the cluster. Use `.repartition(n)` to "
            "increase the number of "
            "dataset blocks.")

    if ray_remote_args is None:
        ray_remote_args = {}
    if ("scheduling_strategy" not in ray_remote_args
            and ctx.scheduling_strategy == DEFAULT_SCHEDULING_STRATEGY):
        ray_remote_args["scheduling_strategy"] = "SPREAD"

    block_list = LazyBlockList(read_tasks, ray_remote_args=ray_remote_args)
    block_list.compute_first_block()
    block_list.ensure_metadata_for_first_block()

    return Dataset(
        ExecutionPlan(block_list, block_list.stats()),
        0,
        False,
    )
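
read_datasource is the generic entry point that the format-specific readers (read_csv, read_parquet, ...) delegate to; a sketch with a hypothetical local file path:

import ray
from ray.data.datasource import CSVDatasource

ray.init(ignore_reinit_error=True)

ds = ray.data.read_datasource(
    CSVDatasource(),
    paths="example.csv",  # assumption: a local CSV file exists at this path
    parallelism=-1,       # let Ray choose based on cluster size and data size
)
print(ds.schema())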
Example #16
def fast_repartition(blocks, num_blocks):
    from ray.data.dataset import Dataset

    wrapped_ds = Dataset(
        ExecutionPlan(blocks, DatasetStats(stages={}, parent=None)), 0, lazy=False
    )
    # Compute the (n-1) indices needed for an equal split of the data.
    count = wrapped_ds.count()
    dataset_format = wrapped_ds._dataset_format()
    indices = []
    cur_idx = 0
    for _ in range(num_blocks - 1):
        cur_idx += count / num_blocks
        indices.append(int(cur_idx))
    assert len(indices) < num_blocks, (indices, num_blocks)
    if indices:
        splits = wrapped_ds.split_at_indices(indices)
    else:
        splits = [wrapped_ds]
    # TODO(ekl) include stats for the split tasks. We may also want to
    # consider combining the split and coalesce tasks as an optimization.

    # Coalesce each split into a single block.
    reduce_task = cached_remote_fn(_ShufflePartitionOp.reduce).options(num_returns=2)
    reduce_bar = ProgressBar("Repartition", position=0, total=len(splits))
    reduce_out = [
        reduce_task.remote(False, None, *s.get_internal_block_refs())
        for s in splits
        if s.num_blocks() > 0
    ]

    # Early-release memory.
    del splits, blocks, wrapped_ds

    new_blocks, new_metadata = zip(*reduce_out)
    new_blocks, new_metadata = list(new_blocks), list(new_metadata)
    new_metadata = reduce_bar.fetch_until_complete(new_metadata)
    reduce_bar.close()

    # Handle empty blocks.
    if len(new_blocks) < num_blocks:
        from ray.data._internal.arrow_block import ArrowBlockBuilder
        from ray.data._internal.pandas_block import PandasBlockBuilder
        from ray.data._internal.simple_block import SimpleBlockBuilder

        num_empties = num_blocks - len(new_blocks)
        if dataset_format == "arrow":
            builder = ArrowBlockBuilder()
        elif dataset_format == "pandas":
            builder = PandasBlockBuilder()
        else:
            builder = SimpleBlockBuilder()
        empty_block = builder.build()
        empty_meta = BlockAccessor.for_block(empty_block).get_metadata(
            input_files=None, exec_stats=None
        )  # No stats for empty block.
        empty_blocks, empty_metadata = zip(
            *[(ray.put(empty_block), empty_meta) for _ in range(num_empties)]
        )
        new_blocks += empty_blocks
        new_metadata += empty_metadata

    return BlockList(new_blocks, new_metadata), {}
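
This fast path is what Dataset.repartition uses when shuffling is disabled; a small sketch:

import ray

ray.init(ignore_reinit_error=True)

ds = ray.data.range(1000)
# shuffle=False selects the split-and-coalesce repartition sketched above
# instead of a full all-to-all shuffle.
ds2 = ds.repartition(10, shuffle=False)
print(ds2.num_blocks())  # 10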
Example #17
def read_datasource(
    datasource: Datasource[T],
    *,
    parallelism: int = 200,
    ray_remote_args: Dict[str, Any] = None,
    **read_args,
) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read. Parallelism may be
            limited by the available partitioning of the datasource.
        read_args: Additional kwargs to pass to the datasource impl.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.

    Returns:
        Dataset holding the data read from the datasource.
    """
    ctx = DatasetContext.get_current()
    # TODO(ekl) remove this feature flag.
    force_local = "RAY_DATASET_FORCE_LOCAL_METADATA" in os.environ
    pa_ds = _lazy_import_pyarrow_dataset()
    if pa_ds:
        partitioning = read_args.get("dataset_kwargs", {}).get("partitioning", None)
        if isinstance(partitioning, pa_ds.Partitioning):
            logger.info(
                "Forcing local metadata resolution since the provided partitioning "
                f"{partitioning} is not serializable."
            )
            force_local = True

    if force_local:
        read_tasks = datasource.prepare_read(parallelism, **read_args)
    else:
        # Prepare read in a remote task so that in Ray client mode, we aren't
        # attempting metadata resolution from the client machine.
        prepare_read = cached_remote_fn(
            _prepare_read, retry_exceptions=False, num_cpus=0
        )
        read_tasks = ray.get(
            prepare_read.remote(
                datasource,
                ctx,
                parallelism,
                _wrap_and_register_arrow_serialization_workaround(read_args),
            )
        )

    if len(read_tasks) < parallelism and (
        len(read_tasks) < ray.available_resources().get("CPU", 1) // 2
    ):
        logger.warning(
            "The number of blocks in this dataset ({}) limits its parallelism to {} "
            "concurrent tasks. This is much less than the number of available "
            "CPU slots in the cluster. Use `.repartition(n)` to increase the number of "
            "dataset blocks.".format(len(read_tasks), len(read_tasks))
        )

    if ray_remote_args is None:
        ray_remote_args = {}
    if (
        "scheduling_strategy" not in ray_remote_args
        and ctx.scheduling_strategy == DEFAULT_SCHEDULING_STRATEGY
    ):
        ray_remote_args["scheduling_strategy"] = "SPREAD"

    block_list = LazyBlockList(read_tasks, ray_remote_args=ray_remote_args)
    block_list.compute_first_block()
    block_list.ensure_metadata_for_first_block()

    return Dataset(
        ExecutionPlan(block_list, block_list.stats()),
        0,
        False,
    )
Example #18
    def _apply(
        self,
        fn: Any,
        remote_args: dict,
        block_list: BlockList,
        clear_input_blocks: bool,
        name: Optional[str] = None,
    ) -> BlockList:
        context = DatasetContext.get_current()

        # Handle empty datasets.
        if block_list.initial_num_blocks() == 0:
            return block_list

        blocks = block_list.get_blocks_with_metadata()
        if name is None:
            name = "map"
        name = name.title()
        map_bar = ProgressBar(name, total=len(blocks))

        if context.block_splitting_enabled:
            map_block = cached_remote_fn(_map_block_split).options(
                **remote_args)
            refs = [map_block.remote(b, fn, m.input_files) for b, m in blocks]
        else:
            map_block = cached_remote_fn(_map_block_nosplit).options(
                **dict(remote_args, num_returns=2))
            all_refs = [
                map_block.remote(b, fn, m.input_files) for b, m in blocks
            ]
            data_refs = [r[0] for r in all_refs]
            refs = [r[1] for r in all_refs]

        # Release input block references.
        if clear_input_blocks:
            del blocks
            block_list.clear()

        # Common wait for non-data refs.
        try:
            results = map_bar.fetch_until_complete(refs)
        except (ray.exceptions.RayTaskError, KeyboardInterrupt) as e:
            # One or more mapper tasks failed, or we received a SIGINT signal
            # while waiting; either way, we cancel all map tasks.
            for ref in refs:
                ray.cancel(ref)
            # Wait until all tasks have failed or been cancelled.
            for ref in refs:
                try:
                    ray.get(ref)
                except (ray.exceptions.RayTaskError,
                        ray.exceptions.TaskCancelledError):
                    pass
            # Reraise the original task failure exception.
            raise e from None

        new_blocks, new_metadata = [], []
        if context.block_splitting_enabled:
            for result in results:
                for block, metadata in result:
                    new_blocks.append(block)
                    new_metadata.append(metadata)
        else:
            for block, metadata in zip(data_refs, results):
                new_blocks.append(block)
                new_metadata.append(metadata)
        return BlockList(list(new_blocks), list(new_metadata))
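
The task-based compute strategy above backs the ordinary map operations on a Dataset; a minimal sketch:

import ray

ray.init(ignore_reinit_error=True)

ds = ray.data.range(8)
# map() submits one task per block through the _apply() path shown above
# (when the default task-based compute strategy is used).
doubled = ds.map(lambda x: x * 2)
print(doubled.take(4))  # [0, 2, 4, 6]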
Example #19
    def execute(
        self,
        input_blocks: BlockList,
        output_num_blocks: int,
        clear_input_blocks: bool,
        *,
        map_ray_remote_args: Optional[Dict[str, Any]] = None,
        reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
    ) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
        input_blocks_list = input_blocks.get_blocks()
        input_num_blocks = len(input_blocks_list)

        if map_ray_remote_args is None:
            map_ray_remote_args = {}
        if reduce_ray_remote_args is None:
            reduce_ray_remote_args = {}
        if "scheduling_strategy" not in reduce_ray_remote_args:
            reduce_ray_remote_args = reduce_ray_remote_args.copy()
            reduce_ray_remote_args["scheduling_strategy"] = "SPREAD"

        shuffle_map = cached_remote_fn(self.map)
        shuffle_reduce = cached_remote_fn(self.reduce)

        map_bar = ProgressBar("Shuffle Map", total=input_num_blocks)

        shuffle_map_out = [
            shuffle_map.options(
                **map_ray_remote_args,
                num_returns=1 + output_num_blocks,
            ).remote(i, block, output_num_blocks, *self._map_args)
            for i, block in enumerate(input_blocks_list)
        ]

        # The first item returned is the BlockMetadata.
        shuffle_map_metadata = []
        for i, refs in enumerate(shuffle_map_out):
            shuffle_map_metadata.append(refs[0])
            shuffle_map_out[i] = refs[1:]

        # Eagerly delete the input block references in order to eagerly release
        # the blocks' memory.
        del input_blocks_list
        if clear_input_blocks:
            input_blocks.clear()
        shuffle_map_metadata = map_bar.fetch_until_complete(
            shuffle_map_metadata)
        map_bar.close()

        reduce_bar = ProgressBar("Shuffle Reduce", total=output_num_blocks)
        shuffle_reduce_out = [
            shuffle_reduce.options(
                **reduce_ray_remote_args,
                num_returns=2,
            ).remote(
                *self._reduce_args,
                *[shuffle_map_out[i][j] for i in range(input_num_blocks)],
            ) for j in range(output_num_blocks)
        ]
        # Eagerly delete the map block references in order to eagerly release
        # the blocks' memory.
        del shuffle_map_out
        new_blocks, new_metadata = zip(*shuffle_reduce_out)
        new_metadata = reduce_bar.fetch_until_complete(list(new_metadata))
        reduce_bar.close()

        stats = {
            "map": shuffle_map_metadata,
            "reduce": new_metadata,
        }

        return BlockList(list(new_blocks), list(new_metadata)), stats
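
Operations such as sort() and random_shuffle() drive this map stage / reduce stage pipeline; a small end-to-end sketch:

import ray

ray.init(ignore_reinit_error=True)

ds = ray.data.from_items([5, 3, 1, 4, 2]).sort()
print(ds.take(5))  # [1, 2, 3, 4, 5]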
Example #20
    def do_write(
        self,
        blocks: List[ObjectRef[Block]],
        metadata: List[BlockMetadata],
        path: str,
        dataset_uuid: str,
        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
        try_create_dir: bool = True,
        open_stream_args: Optional[Dict[str, Any]] = None,
        block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(),
        write_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
        _block_udf: Optional[Callable[[Block], Block]] = None,
        ray_remote_args: Dict[str, Any] = None,
        **write_args,
    ) -> List[ObjectRef[WriteResult]]:
        """Creates and returns write tasks for a file-based datasource."""
        path, filesystem = _resolve_paths_and_filesystem(path, filesystem)
        path = path[0]
        if try_create_dir:
            filesystem.create_dir(path, recursive=True)
        filesystem = _wrap_s3_serialization_workaround(filesystem)

        _write_block_to_file = self._write_block

        if open_stream_args is None:
            open_stream_args = {}

        if ray_remote_args is None:
            ray_remote_args = {}

        def write_block(write_path: str, block: Block):
            logger.debug(f"Writing {write_path} file.")
            fs = filesystem
            if isinstance(fs, _S3FileSystemWrapper):
                fs = fs.unwrap()
            if _block_udf is not None:
                block = _block_udf(block)

            with fs.open_output_stream(write_path, **open_stream_args) as f:
                _write_block_to_file(
                    f,
                    BlockAccessor.for_block(block),
                    writer_args_fn=write_args_fn,
                    **write_args,
                )

        write_block = cached_remote_fn(write_block).options(**ray_remote_args)

        file_format = self._FILE_EXTENSION
        if isinstance(file_format, list):
            file_format = file_format[0]

        write_tasks = []
        if not block_path_provider:
            block_path_provider = DefaultBlockWritePathProvider()
        for block_idx, block in enumerate(blocks):
            write_path = block_path_provider(
                path,
                filesystem=filesystem,
                dataset_uuid=dataset_uuid,
                block=block,
                block_index=block_idx,
                file_format=file_format,
            )
            write_task = write_block.remote(write_path, block)
            write_tasks.append(write_task)

        return write_tasks
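
These write tasks are created by the user-facing write_* methods; a sketch with a hypothetical output directory:

import ray

ray.init(ignore_reinit_error=True)

ds = ray.data.range_table(100)  # Arrow-format dataset
# write_parquet() creates one write task per block via do_write() above;
# the output directory is created if it does not already exist.
ds.write_parquet("/tmp/ray_output")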