Example #1
def _partition_and_combine_block(
        block: Block[T], boundaries: List[KeyType], key: KeyFn,
        aggs: Tuple[AggregateFn]) -> List[Union[Block, BlockMetadata]]:
    """Partition the block and combine rows with the same key."""
    stats = BlockExecStats.builder()
    if key is None:
        partitions = [block]
    else:
        partitions = BlockAccessor.for_block(block).sort_and_partition(
            boundaries,
            [(key, "ascending")] if isinstance(key, str) else key,
            descending=False,
        )
    parts = [BlockAccessor.for_block(p).combine(key, aggs) for p in partitions]
    meta = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build())
    return parts + [meta]
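
This helper backs the groupby path: rows are sorted by key, partitioned at the boundaries, then rows sharing a key are combined. A minimal stand-alone sketch of the same sort-then-combine idea in plain Python (illustrative only, not Ray's Block types):

from collections import defaultdict

rows = [("b", 2), ("a", 1), ("a", 3), ("b", 4)]
rows.sort(key=lambda r: r[0])      # sort by key, as sort_and_partition does
combined = defaultdict(int)
for k, v in rows:
    combined[k] += v               # combine rows with the same key
print(dict(combined))              # {'a': 4, 'b': 6}
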
Example #2
    def remote_read(i: int, task: ReadTask,
                    stats_actor) -> MaybeBlockPartition:
        # `context` and `stats_uuid` are captured from the enclosing scope.
        DatasetContext._set_current(context)
        stats = BlockExecStats.builder()

        # Execute the read task.
        block = task()

        if context.block_splitting_enabled:
            metadata = task.get_metadata()
            metadata.exec_stats = stats.build()
        else:
            metadata = BlockAccessor.for_block(block).get_metadata(
                input_files=task.get_metadata().input_files,
                exec_stats=stats.build())
        stats_actor.record_task.remote(stats_uuid, i, metadata)
        return block
Example #3
def _map_block_nosplit(
    block: Block,
    block_fn: BlockTransform,
    input_files: List[str],
    fn: Optional[UDF],
    *fn_args,
    **fn_kwargs,
) -> Tuple[Block, BlockMetadata]:
    stats = BlockExecStats.builder()
    builder = DelegatingBlockBuilder()
    if fn is not None:
        fn_args = (fn, ) + fn_args
    for new_block in block_fn(block, *fn_args, **fn_kwargs):
        builder.add_block(new_block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    return new_block, accessor.get_metadata(input_files=input_files,
                                            exec_stats=stats.build())
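
The nosplit variant folds every output of the block transform back into a single block via DelegatingBlockBuilder. At the public API level, per-block transforms like this are what row UDFs compile down to; a minimal usage sketch (public Ray Data API, exact internals vary by version):

import ray

ds = ray.data.from_items([1, 2, 3, 4, 5])
# Each map task applies a block transform to one input block.
print(ds.map(lambda x: x * 2).take())  # [2, 4, 6, 8, 10]
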
Example #4
    def map(
        idx: int,
        block: Block,
        output_num_blocks: int,
        block_udf: Optional[Callable[[Block], Iterable[Block]]],
        random_shuffle: bool,
        random_seed: Optional[int],
    ) -> List[Union[BlockMetadata, Block]]:
        stats = BlockExecStats.builder()
        if block_udf:
            # TODO(ekl) note that this effectively disables block splitting.
            blocks = list(block_udf(block))
            if len(blocks) > 1:
                builder = BlockAccessor.for_block(blocks[0]).builder()
                for b in blocks:
                    builder.add_block(b)
                block = builder.build()
            else:
                block = blocks[0]
        block = BlockAccessor.for_block(block)

        # Randomize the distribution of records to blocks.
        if random_shuffle:
            seed_i = random_seed + idx if random_seed is not None else None
            block = block.random_shuffle(seed_i)
            block = BlockAccessor.for_block(block)

        slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks))
        slices = []
        for i in range(output_num_blocks):
            slices.append(
                block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True))

        # Randomize the distribution order of the blocks (this prevents empty
        # outputs when input blocks are very small).
        if random_shuffle:
            random = np.random.RandomState(seed_i)
            random.shuffle(slices)

        num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices)
        assert num_rows == block.num_rows(), (num_rows, block.num_rows())
        metadata = block.get_metadata(input_files=None,
                                      exec_stats=stats.build())
        return [metadata] + slices
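
The slice size rounds up, so the slices always cover every input row and the trailing slices may be short or empty; that is exactly why the slice order is shuffled above. The same arithmetic in isolation (plain Python, illustrative):

import math

rows = list(range(10))
output_num_blocks = 4
slice_sz = max(1, math.ceil(len(rows) / output_num_blocks))
slices = [rows[i * slice_sz:(i + 1) * slice_sz]
          for i in range(output_num_blocks)]
print(slices)  # [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
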
Example #5
def from_items(items: List[Any], *, parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> import ray
        >>> ray.data.from_items([1, 2, 3, 4, 5]) # doctest: +SKIP

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.
            Parallelism may be limited by the number of items.

    Returns:
        Dataset holding the items.
    """
    block_size = max(1, len(items) // parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        stats = BlockExecStats.builder()
        builder = DelegatingBlockBuilder()
        for item in items[i : i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(
                input_files=None, exec_stats=stats.build()
            )
        )
        i += block_size

    return Dataset(
        ExecutionPlan(
            BlockList(blocks, metadata),
            DatasetStats(stages={"from_items": metadata}, parent=None),
        ),
        0,
        False,
    )
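
Usage follows the docstring; the number of blocks follows from the block_size computation above (5 items at parallelism=2 give block_size=2, hence blocks of 2, 2, and 1 rows):

import ray

ds = ray.data.from_items([1, 2, 3, 4, 5], parallelism=2)
print(ds.num_blocks())  # 3
print(ds.take())        # [1, 2, 3, 4, 5]
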
Example #6
def reduce(random_shuffle: bool, random_seed: Optional[int],
           *mapper_outputs: List[Block]) -> Tuple[Block, BlockMetadata]:
    stats = BlockExecStats.builder()
    builder = DelegatingBlockBuilder()
    # Concatenate all mapper outputs into a single block.
    for block in mapper_outputs:
        builder.add_block(block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    if random_shuffle:
        new_block = accessor.random_shuffle(random_seed)
        accessor = BlockAccessor.for_block(new_block)
    new_metadata = BlockMetadata(
        num_rows=accessor.num_rows(),
        size_bytes=accessor.size_bytes(),
        schema=accessor.schema(),
        input_files=None,
        exec_stats=stats.build(),
    )
    return new_block, new_metadata
Example #7
def _execute_read_task(
    i: int,
    task: ReadTask,
    context: DatasetContext,
    stats_uuid: str,
    stats_actor: ray.actor.ActorHandle,
) -> Tuple[MaybeBlockPartition, BlockPartitionMetadata]:
    DatasetContext._set_current(context)
    stats = BlockExecStats.builder()

    # Execute the task.
    block = task()

    metadata = task.get_metadata()
    if context.block_splitting_enabled:
        metadata.exec_stats = stats.build()
    else:
        metadata = BlockAccessor.for_block(block).get_metadata(
            input_files=metadata.input_files, exec_stats=stats.build())
    stats_actor.record_task.remote(stats_uuid, i, metadata)
    return block, metadata
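
The metadata recorded through stats_actor is what surfaces later in Dataset.stats(). A minimal sketch (the stats string format varies across Ray versions):

import ray

ds = ray.data.range(100).map(lambda x: x + 1)
ds.take(5)         # make sure the stage has executed
print(ds.stats())  # per-stage wall time, CPU time, and output sizes
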
Example #8
def _merge(
    reduce_fn,
    *all_mapper_outputs: List[List[Block]],
    reduce_args: Optional[List[Any]] = None,
) -> List[Union[BlockMetadata, Block]]:
    """
    Returns list of [BlockMetadata, O1, O2, O3, ...output_num_blocks].
    """
    assert (
        len({len(mapper_outputs) for mapper_outputs in all_mapper_outputs}) == 1
    ), "Received different number of map inputs"
    stats = BlockExecStats.builder()
    merged_outputs = []
    if not reduce_args:
        reduce_args = []
    for mapper_outputs in zip(*all_mapper_outputs):
        block, _ = reduce_fn(*reduce_args, *mapper_outputs)
        merged_outputs.append(block)
    # Metadata is built from the last reduced block.
    meta = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build()
    )
    return [meta] + merged_outputs
Example #9
    def _merge(
        reduce_fn,
        *all_mapper_outputs: List[List[Block]],
        reduce_args: Optional[List[Any]] = None,
    ) -> Iterator[Union[Block, BlockMetadata]]:
        """
        Yields the reduced output blocks O1, O2, ..., then a single
        BlockMetadata aggregated over all of them.
        """
        assert (len({
            len(mapper_outputs)
            for mapper_outputs in all_mapper_outputs
        }) == 1), "Received different number of map inputs"
        stats = BlockExecStats.builder()
        if not reduce_args:
            reduce_args = []

        num_rows = 0
        size_bytes = 0
        schema = None
        for i, mapper_outputs in enumerate(zip(*all_mapper_outputs)):
            block, meta = reduce_fn(*reduce_args,
                                    *mapper_outputs,
                                    partial_reduce=True)
            yield block

            block = BlockAccessor.for_block(block)
            num_rows += block.num_rows()
            size_bytes += block.size_bytes()
            schema = block.schema()
            del block

        yield BlockMetadata(
            num_rows=num_rows,
            size_bytes=size_bytes,
            schema=schema,
            input_files=None,
            exec_stats=stats.build(),
        )
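
Unlike the list-building variant in Example #8, this one streams: each reduced block is yielded as soon as it is ready, and a single aggregated BlockMetadata trails the data. The shape of that protocol in plain Python (illustrative types only):

from typing import Iterator, Union

def produce(chunks) -> Iterator[Union[list, dict]]:
    total = 0
    for chunk in chunks:
        yield chunk                # stream each output as it is reduced
        total += len(chunk)
    yield {"num_rows": total}      # metadata trails the data

*blocks, meta = list(produce([[1, 2], [3]]))
print(blocks, meta)                # [[1, 2], [3]] {'num_rows': 3}
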
Example #10
File: arrow_block.py (project: parasj/ray)
    def aggregate_combined_blocks(
        blocks: List[Block[ArrowRow]],
        key: KeyFn,
        aggs: Tuple[AggregateFn],
        finalize: bool,
    ) -> Tuple[Block[ArrowRow], BlockMetadata]:
        """Aggregate sorted, partially combined blocks with the same key range.

        This assumes blocks are already sorted by key in ascending order,
        so we can do merge sort to get all the rows with the same key.

        Args:
            blocks: A list of partially combined and sorted blocks.
            key: The column name of key or None for global aggregation.
            aggs: The aggregations to do.
            finalize: Whether to finalize the aggregation. This is used as an
                optimization for cases where we repeatedly combine partially
                aggregated groups.

        Returns:
            A block of [k, v_1, ..., v_n] columns and its metadata where k is
            the groupby key and v_i is the corresponding aggregation result for
            the ith given aggregation.
            If key is None then the k column is omitted.
        """

        stats = BlockExecStats.builder()
        key_fn = (
            (lambda r: r[r._row.schema.names[0]]) if key is not None else (lambda r: 0)
        )

        iter = heapq.merge(
            *[ArrowBlockAccessor(block).iter_rows() for block in blocks], key=key_fn
        )
        next_row = None
        builder = ArrowBlockBuilder()
        while True:
            try:
                if next_row is None:
                    next_row = next(iter)
                next_key = key_fn(next_row)
                next_key_name = (
                    next_row._row.schema.names[0] if key is not None else None
                )

                def gen():
                    nonlocal iter
                    nonlocal next_row
                    while key_fn(next_row) == next_key:
                        yield next_row
                        try:
                            next_row = next(iter)
                        except StopIteration:
                            next_row = None
                            break

                # Merge.
                first = True
                accumulators = [None] * len(aggs)
                resolved_agg_names = [None] * len(aggs)
                for r in gen():
                    if first:
                        count = collections.defaultdict(int)
                        for i in range(len(aggs)):
                            name = aggs[i].name
                            # Check for conflicts with existing aggregation
                            # name.
                            if count[name] > 0:
                                name = ArrowBlockAccessor._munge_conflict(
                                    name, count[name]
                                )
                            count[name] += 1
                            resolved_agg_names[i] = name
                            accumulators[i] = r[name]
                        first = False
                    else:
                        for i in range(len(aggs)):
                            accumulators[i] = aggs[i].merge(
                                accumulators[i], r[resolved_agg_names[i]]
                            )
                # Build the row.
                row = {}
                if key is not None:
                    row[next_key_name] = next_key

                for agg, agg_name, accumulator in zip(
                    aggs, resolved_agg_names, accumulators
                ):
                    if finalize:
                        row[agg_name] = agg.finalize(accumulator)
                    else:
                        row[agg_name] = accumulator

                builder.add(row)
            except StopIteration:
                break

        ret = builder.build()
        return ret, ArrowBlockAccessor(ret).get_metadata(None, exec_stats=stats.build())
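
The loop above is a merge-sort grouping: heapq.merge keeps the combined row stream sorted, and runs of equal keys are folded together. Stripped of the accumulator plumbing, the same pattern with stdlib tools (illustrative, not Ray code):

import heapq
import itertools

# Two blocks, each already sorted by key, as the docstring assumes.
b1 = [("a", 1), ("c", 3)]
b2 = [("a", 2), ("b", 5)]
merged = heapq.merge(b1, b2, key=lambda r: r[0])
for k, rows in itertools.groupby(merged, key=lambda r: r[0]):
    print(k, sum(v for _, v in rows))  # a 3, then b 5, then c 3
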
Example #11
def _get_metadata(
        table: Union["pyarrow.Table", "pandas.DataFrame"]) -> BlockMetadata:
    stats = BlockExecStats.builder()
    return BlockAccessor.for_block(table).get_metadata(
        input_files=None, exec_stats=stats.build())
Example #12
def do_zip(block1: Block, block2: Block) -> Tuple[Block, BlockMetadata]:
    stats = BlockExecStats.builder()
    b1 = BlockAccessor.for_block(block1)
    result = b1.zip(block2)
    br = BlockAccessor.for_block(result)
    return result, br.get_metadata(input_files=[], exec_stats=stats.build())
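
Dataset.zip is the public entry point that invokes this per pair of aligned blocks; a minimal usage sketch (public API; the exact row format of the result depends on the block type and Ray version):

import ray

left = ray.data.range(3)
right = ray.data.range(3).map(lambda x: x * 10)
print(left.zip(right).take())  # pairs of (left row, right row)
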
Example #13
File: read_api.py (project: parasj/ray)
def _ndarray_to_block(
        ndarray: np.ndarray) -> Tuple[Block[np.ndarray], BlockMetadata]:
    stats = BlockExecStats.builder()
    block = BlockAccessor.batch_to_block(ndarray)
    metadata = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build())
    return block, metadata
Example #14
    def aggregate_combined_blocks(
        blocks: List[Block[Tuple[KeyType, AggType]]],
        key: KeyFn,
        aggs: Tuple[AggregateFn],
    ) -> Tuple[Block[Tuple[KeyType, U]], BlockMetadata]:
        """Aggregate sorted, partially combined blocks with the same key range.

        This assumes blocks are already sorted by key in ascending order,
        so we can do merge sort to get all the rows with the same key.

        Args:
            blocks: A list of partially combined and sorted blocks.
            key: The key function that returns the key from the row
                or None for global aggregation.
            aggs: The aggregations to do.

        Returns:
            A block of (k, v_1, ..., v_n) tuples and its metadata where k is
            the groupby key and v_i is the corresponding aggregation result for
            the ith given aggregation.
            If key is None then the k element of tuple is omitted.
        """

        stats = BlockExecStats.builder()
        key_fn = (lambda r: r[0]) if key else (lambda r: 0)

        iter = heapq.merge(
            *[SimpleBlockAccessor(block).iter_rows() for block in blocks], key=key_fn
        )
        next_row = None
        ret = []
        while True:
            try:
                if next_row is None:
                    next_row = next(iter)
                next_key = key_fn(next_row)

                def gen():
                    nonlocal iter
                    nonlocal next_row
                    while key_fn(next_row) == next_key:
                        yield next_row
                        try:
                            next_row = next(iter)
                        except StopIteration:
                            next_row = None
                            break

                first = True
                accumulators = [None] * len(aggs)
                for r in gen():
                    if first:
                        for i in range(len(aggs)):
                            accumulators[i] = r[i + 1] if key else r[i]
                        first = False
                    else:
                        for i in range(len(aggs)):
                            accumulators[i] = aggs[i].merge(
                                accumulators[i], r[i + 1] if key else r[i]
                            )
                if key is None:
                    ret.append(
                        tuple(
                            agg.finalize(accumulator)
                            for agg, accumulator in zip(aggs, accumulators)
                        )
                    )
                else:
                    ret.append(
                        (next_key,)
                        + tuple(
                            agg.finalize(accumulator)
                            for agg, accumulator in zip(aggs, accumulators)
                        )
                    )
            except StopIteration:
                break

        return ret, SimpleBlockAccessor(ret).get_metadata(
            None, exec_stats=stats.build()
        )
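
The merge/finalize split used here is the standard distributed-aggregation protocol: partial accumulators from different blocks are merged pairwise, and finalize turns the accumulator into the reported value. A self-contained sketch with a hypothetical MeanAgg (not Ray's AggregateFn):

class MeanAgg:
    # The accumulator is a (sum, count) pair so partials can be merged.
    def merge(self, a, b):
        return (a[0] + b[0], a[1] + b[1])

    def finalize(self, acc):
        return acc[0] / acc[1]

agg = MeanAgg()
partials = [(6, 2), (9, 3)]   # partially combined results, one per block
acc = partials[0]
for p in partials[1:]:
    acc = agg.merge(acc, p)
print(agg.finalize(acc))      # (6 + 9) / (2 + 3) = 3.0
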
Example #15
File: read_api.py (project: stjordanis/ray)
def _get_metadata(table: "pyarrow.Table") -> BlockMetadata:
    stats = BlockExecStats.builder()
    return BlockAccessor.for_block(table).get_metadata(
        input_files=None, exec_stats=stats.build())
Example #16
File: read_api.py (project: stjordanis/ray)
def _df_to_block(
        df: "pandas.DataFrame") -> Tuple[Block[ArrowRow], BlockMetadata]:
    stats = BlockExecStats.builder()
    import pyarrow as pa
    block = pa.table(df)
    return (block, BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build()))