Example #1
def _map_block_split(
    block: Block,
    block_fn: BlockTransform,
    input_files: List[str],
    fn: Optional[UDF],
    *fn_args,
    **fn_kwargs,
) -> BlockPartition:
    output = []
    stats = BlockExecStats.builder()
    if fn is not None:
        fn_args = (fn,) + fn_args
    for new_block in block_fn(block, *fn_args, **fn_kwargs):
        accessor = BlockAccessor.for_block(new_block)
        new_meta = BlockMetadata(
            num_rows=accessor.num_rows(),
            size_bytes=accessor.size_bytes(),
            schema=accessor.schema(),
            input_files=input_files,
            exec_stats=stats.build(),
        )
        owner = DatasetContext.get_current().block_owner
        output.append((ray.put(new_block, _owner=owner), new_meta))
        stats = BlockExecStats.builder()
    return output
Example #2
def _map_block_split(block: Block, fn: Any,
                     input_files: List[str]) -> BlockPartition:
    output = []
    stats = BlockExecStats.builder()
    for new_block in fn(block):
        accessor = BlockAccessor.for_block(new_block)
        new_meta = BlockMetadata(num_rows=accessor.num_rows(),
                                 size_bytes=accessor.size_bytes(),
                                 schema=accessor.schema(),
                                 input_files=input_files,
                                 exec_stats=stats.build())
        owner = DatasetContext.get_current().block_owner
        output.append((ray.put(new_block, _owner=owner), new_meta))
        stats = BlockExecStats.builder()
    return output
Example #3
def from_items(items: List[Any], *, parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> ray.data.from_items([1, 2, 3, 4, 5])

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.
            Parallelism may be limited by the number of items.

    Returns:
        Dataset holding the items.
    """
    block_size = max(1, len(items) // parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        stats = BlockExecStats.builder()
        builder = DelegatingBlockBuilder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(
                input_files=None, exec_stats=stats.build()))
        i += block_size

    return Dataset(BlockList(blocks, metadata), 0,
                   DatasetStats(stages={"from_items": metadata}, parent=None))
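With the default parallelism of 200, a five-item list gives a block size of 1 and therefore five single-item blocks. Below is a minimal sketch of just that chunking arithmetic in plain Python, with no Ray objects involved (the chunk_items helper is hypothetical, for illustration only):

from typing import Any, List

def chunk_items(items: List[Any], parallelism: int = 200) -> List[List[Any]]:
    # Same arithmetic as from_items: at least one item per block,
    # and parallelism is effectively capped by the number of items.
    block_size = max(1, len(items) // parallelism)
    chunks = []
    i = 0
    while i < len(items):
        chunks.append(items[i:i + block_size])
        i += block_size
    return chunks

print(chunk_items([1, 2, 3, 4, 5]))     # [[1], [2], [3], [4], [5]]
print(chunk_items(list(range(10)), 3))  # [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]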
Example #4
def _ndarray_to_block(ndarray: np.ndarray) -> Block[np.ndarray]:
    stats = BlockExecStats.builder()
    block = BlockAccessor.batch_to_block(ndarray)
    metadata = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build()
    )
    return block, metadata
Example #5
def _shuffle_map(
    block: Block,
    idx: int,
    output_num_blocks: int,
    random_shuffle: bool,
    random_seed: Optional[int],
) -> List[Union[BlockMetadata, Block]]:
    """Returns list of [BlockMetadata, O1, O2, O3, ...output_num_blocks]."""
    stats = BlockExecStats.builder()
    block = BlockAccessor.for_block(block)

    # Randomize the distribution of records to blocks.
    if random_shuffle:
        seed_i = random_seed + idx if random_seed is not None else None
        block = block.random_shuffle(seed_i)
        block = BlockAccessor.for_block(block)

    slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks))
    slices = []
    for i in range(output_num_blocks):
        slices.append(block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True))

    # Randomize the distribution order of the blocks (this matters when
    # some blocks are larger than others).
    if random_shuffle:
        random = np.random.RandomState(seed_i)
        random.shuffle(slices)

    num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices)
    assert num_rows == block.num_rows(), (num_rows, block.num_rows())
    metadata = block.get_metadata(input_files=None, exec_stats=stats.build())
    return [metadata] + slices
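The slice size above uses ceiling division, so when the row count does not divide evenly the trailing slices are shorter (possibly empty), which is also why the row-count assertion holds. A small sketch of the same slicing over a plain list (slice_into_blocks is a hypothetical helper, not Ray code):

import math
from typing import Any, List

def slice_into_blocks(rows: List[Any], output_num_blocks: int) -> List[List[Any]]:
    # Ceiling division, as in _shuffle_map; trailing slices may be empty.
    slice_sz = max(1, math.ceil(len(rows) / output_num_blocks))
    slices = [rows[i * slice_sz:(i + 1) * slice_sz] for i in range(output_num_blocks)]
    assert sum(len(s) for s in slices) == len(rows)
    return slices

print(slice_into_blocks(list(range(7)), 3))  # [[0, 1, 2], [3, 4, 5], [6]]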
Example #6
File: split.py Project: parasj/ray
def _split_single_block(
    block_id: int,
    block: Block,
    meta: BlockMetadata,
    block_row: int,
    split_indices: List[int],
) -> Tuple[int, List[Tuple[ObjectRef[Block], BlockMetadata]]]:
    """Split the provided block at the given indices."""
    split_result = []
    block_accessor = BlockAccessor.for_block(block)
    prev_index = 0
    # Append the total row count as one more split index so the loop
    # also emits the final slice without a special case.
    split_indices.append(block_row)
    for index in split_indices:
        logger.debug(f"slicing block {prev_index}:{index}")
        stats = BlockExecStats.builder()
        split_block = block_accessor.slice(prev_index, index, copy=True)
        accessor = BlockAccessor.for_block(split_block)
        split_meta = BlockMetadata(
            num_rows=accessor.num_rows(),
            size_bytes=accessor.size_bytes(),
            schema=meta.schema,
            input_files=meta.input_files,
            exec_stats=stats.build(),
        )
        split_result.append((ray.put(split_block), split_meta))
        prev_index = index
    return (block_id, split_result)
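The trick of appending one extra index lets the loop emit the final slice too. Here is a plain-list sketch of that indexing (split_at_indices is a hypothetical helper, for illustration):

from typing import Any, List

def split_at_indices(rows: List[Any], split_indices: List[int]) -> List[List[Any]]:
    # Append the length so the loop also emits the final slice,
    # mirroring the trick used in _split_single_block above.
    indices = list(split_indices) + [len(rows)]
    result, prev = [], 0
    for index in indices:
        result.append(rows[prev:index])
        prev = index
    return result

print(split_at_indices(list("abcdef"), [2, 4]))  # [['a', 'b'], ['c', 'd'], ['e', 'f']]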
Example #7
def _ndarray_to_block(ndarray: np.ndarray) -> Block[np.ndarray]:
    stats = BlockExecStats.builder()
    import pyarrow as pa
    from ray.data.extensions import TensorArray
    table = pa.Table.from_pydict({"value": TensorArray(ndarray)})
    return (table, BlockAccessor.for_block(table).get_metadata(
        input_files=None, exec_stats=stats.build()))
Example #8
def _sort_block(block, boundaries, key, descending):
    stats = BlockExecStats.builder()
    out = BlockAccessor.for_block(block).sort_and_partition(
        boundaries, key, descending)
    meta = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build())
    return out + [meta]
Example #9
def merge_sorted_blocks(
        blocks: List[Block[T]], key: "SortKeyT",
        descending: bool) -> Tuple[Block[T], BlockMetadata]:
    stats = BlockExecStats.builder()
    ret = [x for block in blocks for x in block]
    ret.sort(key=key, reverse=descending)
    return ret, SimpleBlockAccessor(ret).get_metadata(
        None, exec_stats=stats.build())
Example #10
def _df_to_block(df: "pandas.DataFrame") -> Block[ArrowRow]:
    stats = BlockExecStats.builder()
    import pyarrow as pa

    block = pa.table(df)
    return (
        block,
        BlockAccessor.for_block(block).get_metadata(input_files=None,
                                                    exec_stats=stats.build()),
    )
Example #11
def _map_block_nosplit(block: Block, fn: Any,
                       input_files: List[str]) -> Tuple[Block, BlockMetadata]:
    stats = BlockExecStats.builder()
    builder = DelegatingBlockBuilder()
    for new_block in fn(block):
        builder.add_block(new_block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    return new_block, accessor.get_metadata(input_files=input_files,
                                            exec_stats=stats.build())
Example #12
def from_items(items: List[Any], *, parallelism: int = -1) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> import ray
        >>> ds = ray.data.from_items([1, 2, 3, 4, 5]) # doctest: +SKIP
        >>> ds # doctest: +SKIP
        Dataset(num_blocks=5, num_rows=5, schema=<class 'int'>)
        >>> ds.take(2) # doctest: +SKIP
        [1, 2]

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.
            Parallelism may be limited by the number of items.

    Returns:
        Dataset holding the items.
    """

    detected_parallelism, _ = _autodetect_parallelism(
        parallelism,
        ray.util.get_current_placement_group(),
        DatasetContext.get_current(),
    )
    block_size = max(
        1,
        len(items) // detected_parallelism,
    )

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        stats = BlockExecStats.builder()
        builder = DelegatingBlockBuilder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(
                input_files=None, exec_stats=stats.build()))
        i += block_size

    return Dataset(
        ExecutionPlan(
            BlockList(blocks, metadata),
            DatasetStats(stages={"from_items": metadata}, parent=None),
        ),
        0,
        False,
    )
Example #13
def merge_sorted_blocks(
    blocks: List[Block[T]], key: "SortKeyT", _descending: bool
) -> Tuple[Block[T], BlockMetadata]:
    stats = BlockExecStats.builder()
    blocks = [b for b in blocks if b.num_rows > 0]
    if len(blocks) == 0:
        ret = ArrowBlockAccessor._empty_table()
    else:
        ret = pyarrow.concat_tables(blocks, promote=True)
        indices = pyarrow.compute.sort_indices(ret, sort_keys=key)
        ret = ArrowBlockAccessor.take_table(ret, indices)
    return ret, ArrowBlockAccessor(ret).get_metadata(None, exec_stats=stats.build())
Example #14
def merge_sorted_blocks(
    blocks: List[Block[T]], key: "SortKeyT", _descending: bool
) -> Tuple[Block[T], BlockMetadata]:
    stats = BlockExecStats.builder()
    blocks = [b for b in blocks if b.num_rows > 0]
    if len(blocks) == 0:
        ret = ArrowBlockAccessor._empty_table()
    else:
        concat_and_sort = get_concat_and_sort_transform(
            DatasetContext.get_current()
        )
        ret = concat_and_sort(blocks, key, _descending)
    return ret, ArrowBlockAccessor(ret).get_metadata(None, exec_stats=stats.build())
Example #15
File: sort.py Project: tchordia/ray
def map(
    idx: int,
    block: Block,
    output_num_blocks: int,
    boundaries: List[T],
    key: SortKeyT,
    descending: bool,
) -> List[Union[BlockMetadata, Block]]:
    stats = BlockExecStats.builder()
    out = BlockAccessor.for_block(block).sort_and_partition(
        boundaries, key, descending)
    meta = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build())
    return [meta] + out
Example #16
def _partition_and_combine_block(
        block: Block[T], boundaries: List[KeyType], key: KeyFn,
        aggs: Tuple[AggregateFn]) -> List[Union[Block, BlockMetadata]]:
    """Partition the block and combine rows with the same key."""
    stats = BlockExecStats.builder()
    if key is None:
        partitions = [block]
    else:
        partitions = BlockAccessor.for_block(block).sort_and_partition(
            boundaries, [(key, "ascending")] if isinstance(key, str) else key,
            descending=False)
    parts = [BlockAccessor.for_block(p).combine(key, aggs) for p in partitions]
    meta = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build())
    return parts + [meta]
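sort_and_partition sorts the block and then cuts it at the given key boundaries, producing one partition per boundary interval. A rough sketch of that idea over plain (key, value) tuples using bisect; this is an illustrative stand-in, not the accessor's actual implementation:

import bisect
from typing import Any, List, Tuple

def sort_and_partition(rows: List[Tuple[Any, ...]],
                       boundaries: List[Any]) -> List[List[Tuple[Any, ...]]]:
    # Sort by the first field, then cut at each boundary key;
    # n boundaries produce n + 1 partitions.
    rows = sorted(rows, key=lambda r: r[0])
    keys = [r[0] for r in rows]
    cuts = [bisect.bisect_left(keys, b) for b in boundaries]
    parts, prev = [], 0
    for cut in cuts + [len(rows)]:
        parts.append(rows[prev:cut])
        prev = cut
    return parts

print(sort_and_partition([(3, "c"), (1, "a"), (5, "e")], boundaries=[2, 4]))
# [[(1, 'a')], [(3, 'c')], [(5, 'e')]]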
Example #17
def _shuffle_reduce(*mapper_outputs: List[Block]) -> (Block, BlockMetadata):
    stats = BlockExecStats.builder()
    builder = DelegatingBlockBuilder()
    for block in mapper_outputs:
        builder.add_block(block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    new_metadata = BlockMetadata(
        num_rows=accessor.num_rows(),
        size_bytes=accessor.size_bytes(),
        schema=accessor.schema(),
        input_files=None,
        exec_stats=stats.build(),
    )
    return new_block, new_metadata
Example #18
    def remote_read(i: int, task: ReadTask) -> MaybeBlockPartition:
        DatasetContext._set_current(context)
        stats = BlockExecStats.builder()

        # Execute the read task.
        block = task()

        if context.block_splitting_enabled:
            metadata = task.get_metadata()
            metadata.exec_stats = stats.build()
        else:
            metadata = BlockAccessor.for_block(block).get_metadata(
                input_files=task.get_metadata().input_files,
                exec_stats=stats.build())
        stats_actor.record_task.remote(stats_uuid, i, metadata)
        return block
Example #19
def _map_block_nosplit(
    block: Block,
    block_fn: BlockTransform,
    input_files: List[str],
    fn: Optional[UDF],
    *fn_args,
    **fn_kwargs,
) -> Tuple[Block, BlockMetadata]:
    stats = BlockExecStats.builder()
    builder = DelegatingBlockBuilder()
    if fn is not None:
        fn_args = (fn, ) + fn_args
    for new_block in block_fn(block, *fn_args, **fn_kwargs):
        builder.add_block(new_block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    return new_block, accessor.get_metadata(input_files=input_files,
                                            exec_stats=stats.build())
Example #20
    def map(
        idx: int,
        block: Block,
        output_num_blocks: int,
        block_udf: Optional[Callable[[Block], Iterable[Block]]],
        random_shuffle: bool,
        random_seed: Optional[int],
    ) -> List[Union[BlockMetadata, Block]]:
        stats = BlockExecStats.builder()
        if block_udf:
            # TODO(ekl) note that this effectively disables block splitting.
            blocks = list(block_udf(block))
            if len(blocks) > 1:
                builder = BlockAccessor.for_block(blocks[0]).builder()
                for b in blocks:
                    builder.add_block(b)
                block = builder.build()
            else:
                block = blocks[0]
        block = BlockAccessor.for_block(block)

        # Randomize the distribution of records to blocks.
        if random_shuffle:
            seed_i = random_seed + idx if random_seed is not None else None
            block = block.random_shuffle(seed_i)
            block = BlockAccessor.for_block(block)

        slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks))
        slices = []
        for i in range(output_num_blocks):
            slices.append(
                block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True))

        # Randomize the distribution order of the blocks (this prevents empty
        # outputs when input blocks are very small).
        if random_shuffle:
            random = np.random.RandomState(seed_i)
            random.shuffle(slices)

        num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices)
        assert num_rows == block.num_rows(), (num_rows, block.num_rows())
        metadata = block.get_metadata(input_files=None,
                                      exec_stats=stats.build())
        return [metadata] + slices
Example #21
def reduce(random_shuffle: bool, random_seed: Optional[int],
           *mapper_outputs: List[Block]) -> (Block, BlockMetadata):
    stats = BlockExecStats.builder()
    builder = DelegatingBlockBuilder()
    for block in mapper_outputs:
        builder.add_block(block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    if random_shuffle:
        new_block = accessor.random_shuffle(
            random_seed if random_seed is not None else None)
        accessor = BlockAccessor.for_block(new_block)
    new_metadata = BlockMetadata(
        num_rows=accessor.num_rows(),
        size_bytes=accessor.size_bytes(),
        schema=accessor.schema(),
        input_files=None,
        exec_stats=stats.build(),
    )
    return new_block, new_metadata
Example #22
def _execute_read_task(
    i: int,
    task: ReadTask,
    context: DatasetContext,
    stats_uuid: str,
    stats_actor: ray.actor.ActorHandle,
) -> Tuple[MaybeBlockPartition, BlockPartitionMetadata]:
    DatasetContext._set_current(context)
    stats = BlockExecStats.builder()

    # Execute the task.
    block = task()

    metadata = task.get_metadata()
    if context.block_splitting_enabled:
        metadata.exec_stats = stats.build()
    else:
        metadata = BlockAccessor.for_block(block).get_metadata(
            input_files=metadata.input_files, exec_stats=stats.build())
    stats_actor.record_task.remote(stats_uuid, i, metadata)
    return block, metadata
Example #23
def _merge(
    reduce_fn,
    *all_mapper_outputs: List[List[Block]],
    reduce_args: Optional[List[Any]] = None,
) -> List[Union[BlockMetadata, Block]]:
    """
    Returns list of [BlockMetadata, O1, O2, O3, ...output_num_blocks].
    """
    assert (
        len({len(mapper_outputs) for mapper_outputs in all_mapper_outputs}) == 1
    ), "Received different number of map inputs"
    stats = BlockExecStats.builder()
    merged_outputs = []
    if not reduce_args:
        reduce_args = []
    for mapper_outputs in zip(*all_mapper_outputs):
        block, meta = reduce_fn(*reduce_args, *mapper_outputs)
        merged_outputs.append(block)
    meta = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build()
    )
    return [meta] + merged_outputs
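_merge pairs up the i-th output of every mapper via zip(*all_mapper_outputs) and reduces each group into one merged block. A stripped-down sketch of that fan-in pattern with an ordinary reducer function (names here are hypothetical):

from typing import Any, List

def merge(reduce_fn, *all_mapper_outputs: List[Any]) -> List[Any]:
    # Every mapper must have produced the same number of outputs; the i-th
    # outputs of all mappers are reduced together into the i-th merged block.
    assert len({len(outs) for outs in all_mapper_outputs}) == 1
    return [reduce_fn(*group) for group in zip(*all_mapper_outputs)]

# Two mappers, each with three outputs; list concatenation stands in for the reducer.
print(merge(lambda *parts: sum(parts, []), [[1], [2], [3]], [[4], [5], [6]]))
# [[1, 4], [2, 5], [3, 6]]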
Example #24
    def _merge(
        reduce_fn,
        *all_mapper_outputs: List[List[Block]],
        reduce_args: Optional[List[Any]] = None,
    ) -> List[Union[BlockMetadata, Block]]:
        """
        Returns list of [BlockMetadata, O1, O2, O3, ...output_num_blocks].
        """
        assert (len({
            len(mapper_outputs)
            for mapper_outputs in all_mapper_outputs
        }) == 1), "Received different number of map inputs"
        stats = BlockExecStats.builder()
        if not reduce_args:
            reduce_args = []

        num_rows = 0
        size_bytes = 0
        schema = None
        for i, mapper_outputs in enumerate(zip(*all_mapper_outputs)):
            block, meta = reduce_fn(*reduce_args,
                                    *mapper_outputs,
                                    partial_reduce=True)
            yield block

            block = BlockAccessor.for_block(block)
            num_rows += block.num_rows()
            size_bytes += block.size_bytes()
            schema = block.schema()
            del block

        yield BlockMetadata(
            num_rows=num_rows,
            size_bytes=size_bytes,
            schema=schema,
            input_files=None,
            exec_stats=stats.build(),
        )
Example #25
def _get_metadata(table: "pyarrow.Table") -> BlockMetadata:
    stats = BlockExecStats.builder()
    return BlockAccessor.for_block(table).get_metadata(
        input_files=None, exec_stats=stats.build())
Example #26
def _get_metadata(
        table: Union["pyarrow.Table", "pandas.DataFrame"]) -> BlockMetadata:
    stats = BlockExecStats.builder()
    return BlockAccessor.for_block(table).get_metadata(
        input_files=None, exec_stats=stats.build())
Example #27
def do_zip(block1: Block, block2: Block) -> (Block, BlockMetadata):
    stats = BlockExecStats.builder()
    b1 = BlockAccessor.for_block(block1)
    result = b1.zip(block2)
    br = BlockAccessor.for_block(result)
    return result, br.get_metadata(input_files=[], exec_stats=stats.build())
Example #28
    def aggregate_combined_blocks(
        blocks: List[Block[ArrowRow]],
        key: KeyFn,
        aggs: Tuple[AggregateFn],
        finalize: bool,
    ) -> Tuple[Block[ArrowRow], BlockMetadata]:
        """Aggregate sorted, partially combined blocks with the same key range.

        This assumes blocks are already sorted by key in ascending order,
        so we can do merge sort to get all the rows with the same key.

        Args:
            blocks: A list of partially combined and sorted blocks.
            key: The column name of key or None for global aggregation.
            aggs: The aggregations to do.
            finalize: Whether to finalize the aggregation. This is used as an
                optimization for cases where we repeatedly combine partially
                aggregated groups.

        Returns:
            A block of [k, v_1, ..., v_n] columns and its metadata where k is
            the groupby key and v_i is the corresponding aggregation result for
            the ith given aggregation.
            If key is None then the k column is omitted.
        """

        stats = BlockExecStats.builder()
        key_fn = (
            (lambda r: r[r._row.schema.names[0]]) if key is not None else (lambda r: 0)
        )

        iter = heapq.merge(
            *[ArrowBlockAccessor(block).iter_rows() for block in blocks], key=key_fn
        )
        next_row = None
        builder = ArrowBlockBuilder()
        while True:
            try:
                if next_row is None:
                    next_row = next(iter)
                next_key = key_fn(next_row)
                next_key_name = (
                    next_row._row.schema.names[0] if key is not None else None
                )

                def gen():
                    nonlocal iter
                    nonlocal next_row
                    while key_fn(next_row) == next_key:
                        yield next_row
                        try:
                            next_row = next(iter)
                        except StopIteration:
                            next_row = None
                            break

                # Merge.
                first = True
                accumulators = [None] * len(aggs)
                resolved_agg_names = [None] * len(aggs)
                for r in gen():
                    if first:
                        count = collections.defaultdict(int)
                        for i in range(len(aggs)):
                            name = aggs[i].name
                            # Check for conflicts with existing aggregation
                            # name.
                            if count[name] > 0:
                                name = ArrowBlockAccessor._munge_conflict(
                                    name, count[name]
                                )
                            count[name] += 1
                            resolved_agg_names[i] = name
                            accumulators[i] = r[name]
                        first = False
                    else:
                        for i in range(len(aggs)):
                            accumulators[i] = aggs[i].merge(
                                accumulators[i], r[resolved_agg_names[i]]
                            )
                # Build the row.
                row = {}
                if key is not None:
                    row[next_key_name] = next_key

                for agg, agg_name, accumulator in zip(
                    aggs, resolved_agg_names, accumulators
                ):
                    if finalize:
                        row[agg_name] = agg.finalize(accumulator)
                    else:
                        row[agg_name] = accumulator

                builder.add(row)
            except StopIteration:
                break

        ret = builder.build()
        return ret, ArrowBlockAccessor(ret).get_metadata(None, exec_stats=stats.build())
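Because every input block is already sorted by key, heapq.merge yields rows in global key order, and consecutive rows with equal keys can be folded into a single aggregate. A simplified sketch of that merge-and-group step over (key, value) tuples, using itertools.groupby in place of the manual gen() loop above (illustrative only, not the accessor code):

import heapq
import itertools
from operator import itemgetter
from typing import List, Tuple

def aggregate_sorted_blocks(blocks: List[List[Tuple[str, int]]]) -> List[Tuple[str, int]]:
    # Merge the pre-sorted blocks into one key-ordered stream, then sum
    # the values of each run of equal keys.
    merged = heapq.merge(*blocks, key=itemgetter(0))
    return [(k, sum(v for _, v in rows))
            for k, rows in itertools.groupby(merged, key=itemgetter(0))]

print(aggregate_sorted_blocks([[("a", 1), ("b", 2)], [("a", 3), ("c", 4)]]))
# [('a', 4), ('b', 2), ('c', 4)]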
Example #29
    def aggregate_combined_blocks(
        blocks: List[Block[Tuple[KeyType, AggType]]],
        key: KeyFn,
        aggs: Tuple[AggregateFn],
    ) -> Tuple[Block[Tuple[KeyType, U]], BlockMetadata]:
        """Aggregate sorted, partially combined blocks with the same key range.

        This assumes blocks are already sorted by key in ascending order,
        so we can do merge sort to get all the rows with the same key.

        Args:
            blocks: A list of partially combined and sorted blocks.
            key: The key function that returns the key from the row
                or None for global aggregation.
            aggs: The aggregations to do.

        Returns:
            A block of (k, v_1, ..., v_n) tuples and its metadata where k is
            the groupby key and v_i is the corresponding aggregation result for
            the ith given aggregation.
            If key is None then the k element of tuple is omitted.
        """

        stats = BlockExecStats.builder()
        key_fn = (lambda r: r[0]) if key else (lambda r: 0)

        iter = heapq.merge(
            *[SimpleBlockAccessor(block).iter_rows() for block in blocks], key=key_fn
        )
        next_row = None
        ret = []
        while True:
            try:
                if next_row is None:
                    next_row = next(iter)
                next_key = key_fn(next_row)

                def gen():
                    nonlocal iter
                    nonlocal next_row
                    while key_fn(next_row) == next_key:
                        yield next_row
                        try:
                            next_row = next(iter)
                        except StopIteration:
                            next_row = None
                            break

                first = True
                accumulators = [None] * len(aggs)
                for r in gen():
                    if first:
                        for i in range(len(aggs)):
                            accumulators[i] = r[i + 1] if key else r[i]
                        first = False
                    else:
                        for i in range(len(aggs)):
                            accumulators[i] = aggs[i].merge(
                                accumulators[i], r[i + 1] if key else r[i]
                            )
                if key is None:
                    ret.append(
                        tuple(
                            agg.finalize(accumulator)
                            for agg, accumulator in zip(aggs, accumulators)
                        )
                    )
                else:
                    ret.append(
                        (next_key,)
                        + tuple(
                            agg.finalize(accumulator)
                            for agg, accumulator in zip(aggs, accumulators)
                        )
                    )
            except StopIteration:
                break

        return ret, SimpleBlockAccessor(ret).get_metadata(
            None, exec_stats=stats.build()
        )