def _partition_and_combine_block(
        block: Block[T], boundaries: List[KeyType], key: KeyFn,
        aggs: Tuple[AggregateFn]) -> List[Union[Block, BlockMetadata]]:
    """Partition the block and combine rows with the same key."""
    stats = BlockExecStats.builder()
    if key is None:
        partitions = [block]
    else:
        partitions = BlockAccessor.for_block(block).sort_and_partition(
            boundaries,
            [(key, "ascending")] if isinstance(key, str) else key,
            descending=False,
        )
    parts = [BlockAccessor.for_block(p).combine(key, aggs) for p in partitions]
    meta = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build())
    return parts + [meta]
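# Illustrative sketch only (plain Python, no Ray APIs; the helper name
# `partition_and_sum` is made up for this example): the same
# "sort, partition at boundaries, then combine rows per key" pattern that
# _partition_and_combine_block applies to a block through BlockAccessor.
# Summation stands in for the AggregateFn accumulators.
import bisect
from collections import defaultdict
from typing import Dict, List, Tuple


def partition_and_sum(rows: List[Tuple[str, int]],
                      boundaries: List[str]) -> List[Dict[str, int]]:
    # Sort by key, then cut into len(boundaries) + 1 partitions.
    rows = sorted(rows, key=lambda r: r[0])
    keys = [r[0] for r in rows]
    cuts = [bisect.bisect_left(keys, b) for b in boundaries]
    bounds = [0] + cuts + [len(rows)]
    partitions = [rows[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]
    # Combine rows with the same key inside each partition.
    combined = []
    for part in partitions:
        acc: Dict[str, int] = defaultdict(int)
        for k, v in part:
            acc[k] += v
        combined.append(dict(acc))
    return combined


# partition_and_sum([("b", 1), ("a", 2), ("a", 3), ("c", 5)], boundaries=["b"])
# -> [{"a": 5}, {"b": 1, "c": 5}]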
def remote_read(i: int, task: ReadTask, stats_actor) -> MaybeBlockPartition:
    # Note: `context` and `stats_uuid` are captured from the enclosing scope.
    DatasetContext._set_current(context)
    stats = BlockExecStats.builder()

    # Execute the read task.
    block = task()

    if context.block_splitting_enabled:
        metadata = task.get_metadata()
        metadata.exec_stats = stats.build()
    else:
        metadata = BlockAccessor.for_block(block).get_metadata(
            input_files=task.get_metadata().input_files,
            exec_stats=stats.build())
    stats_actor.record_task.remote(stats_uuid, i, metadata)
    return block
def _map_block_nosplit(
    block: Block,
    block_fn: BlockTransform,
    input_files: List[str],
    fn: Optional[UDF],
    *fn_args,
    **fn_kwargs,
) -> Tuple[Block, BlockMetadata]:
    stats = BlockExecStats.builder()
    builder = DelegatingBlockBuilder()
    if fn is not None:
        fn_args = (fn,) + fn_args
    for new_block in block_fn(block, *fn_args, **fn_kwargs):
        builder.add_block(new_block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    return new_block, accessor.get_metadata(
        input_files=input_files, exec_stats=stats.build())
def map(
    idx: int,
    block: Block,
    output_num_blocks: int,
    block_udf: Optional[Callable[[Block], Iterable[Block]]],
    random_shuffle: bool,
    random_seed: Optional[int],
) -> List[Union[BlockMetadata, Block]]:
    stats = BlockExecStats.builder()
    if block_udf:
        # TODO(ekl) note that this effectively disables block splitting.
        blocks = list(block_udf(block))
        if len(blocks) > 1:
            builder = BlockAccessor.for_block(blocks[0]).builder()
            for b in blocks:
                builder.add_block(b)
            block = builder.build()
        else:
            block = blocks[0]
    block = BlockAccessor.for_block(block)

    # Randomize the distribution of records to blocks.
    if random_shuffle:
        seed_i = random_seed + idx if random_seed is not None else None
        block = block.random_shuffle(seed_i)
        block = BlockAccessor.for_block(block)

    slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks))
    slices = []
    for i in range(output_num_blocks):
        slices.append(block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True))

    # Randomize the distribution order of the blocks (this prevents empty
    # outputs when input blocks are very small).
    if random_shuffle:
        random = np.random.RandomState(seed_i)
        random.shuffle(slices)

    num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices)
    assert num_rows == block.num_rows(), (num_rows, block.num_rows())
    metadata = block.get_metadata(input_files=None, exec_stats=stats.build())
    return [metadata] + slices
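# Minimal sketch (the helper `split_into_slices` is invented for illustration,
# not part of the source): the slice math used above, applied to a plain list.
# slice_sz = ceil(n / output_num_blocks), and the last slice simply runs past
# the end of the data, so short inputs still produce output_num_blocks slices,
# some of which may be empty.
import math
from typing import Any, List


def split_into_slices(rows: List[Any],
                      output_num_blocks: int) -> List[List[Any]]:
    slice_sz = max(1, math.ceil(len(rows) / output_num_blocks))
    return [rows[i * slice_sz:(i + 1) * slice_sz]
            for i in range(output_num_blocks)]


# split_into_slices(list(range(5)), 3) -> [[0, 1], [2, 3], [4]]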
def from_items(items: List[Any], *, parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> import ray
        >>> ray.data.from_items([1, 2, 3, 4, 5]) # doctest: +SKIP

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.
            Parallelism may be limited by the number of items.

    Returns:
        Dataset holding the items.
    """
    block_size = max(1, len(items) // parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        stats = BlockExecStats.builder()
        builder = DelegatingBlockBuilder()
        for item in items[i : i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(
                input_files=None, exec_stats=stats.build()
            )
        )
        i += block_size

    return Dataset(
        ExecutionPlan(
            BlockList(blocks, metadata),
            DatasetStats(stages={"from_items": metadata}, parent=None),
        ),
        0,
        False,
    )
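# Usage sketch (assumes a running Ray session; not taken from the source):
# with parallelism=2 and five items, block_size is max(1, 5 // 2) == 2, so the
# loop above builds three blocks holding [1, 2], [3, 4], and [5].
#
#     import ray
#     ds = ray.data.from_items([1, 2, 3, 4, 5], parallelism=2)
#     assert ds.num_blocks() == 3
#     assert ds.take(5) == [1, 2, 3, 4, 5]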
def reduce(random_shuffle: bool, random_seed: Optional[int],
           *mapper_outputs: List[Block]) -> Tuple[Block, BlockMetadata]:
    stats = BlockExecStats.builder()
    # Concatenate all mapper outputs destined for this reducer into one block.
    builder = DelegatingBlockBuilder()
    for block in mapper_outputs:
        builder.add_block(block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    if random_shuffle:
        new_block = accessor.random_shuffle(
            random_seed if random_seed is not None else None)
        accessor = BlockAccessor.for_block(new_block)
    new_metadata = BlockMetadata(
        num_rows=accessor.num_rows(),
        size_bytes=accessor.size_bytes(),
        schema=accessor.schema(),
        input_files=None,
        exec_stats=stats.build(),
    )
    return new_block, new_metadata
def _execute_read_task(
    i: int,
    task: ReadTask,
    context: DatasetContext,
    stats_uuid: str,
    stats_actor: ray.actor.ActorHandle,
) -> Tuple[MaybeBlockPartition, BlockPartitionMetadata]:
    DatasetContext._set_current(context)
    stats = BlockExecStats.builder()

    # Execute the task.
    block = task()

    metadata = task.get_metadata()
    if context.block_splitting_enabled:
        metadata.exec_stats = stats.build()
    else:
        metadata = BlockAccessor.for_block(block).get_metadata(
            input_files=metadata.input_files, exec_stats=stats.build())
    stats_actor.record_task.remote(stats_uuid, i, metadata)
    return block, metadata
def _merge(
    reduce_fn,
    *all_mapper_outputs: List[List[Block]],
    reduce_args: Optional[List[Any]] = None,
) -> List[Union[BlockMetadata, Block]]:
    """
    Returns list of [BlockMetadata, O1, O2, O3, ...output_num_blocks].
    """
    assert (
        len({len(mapper_outputs) for mapper_outputs in all_mapper_outputs}) == 1
    ), "Received different number of map inputs"
    stats = BlockExecStats.builder()
    merged_outputs = []
    if not reduce_args:
        reduce_args = []
    for mapper_outputs in zip(*all_mapper_outputs):
        block, meta = reduce_fn(*reduce_args, *mapper_outputs)
        merged_outputs.append(block)
    meta = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build()
    )
    return [meta] + merged_outputs
def _merge(
    reduce_fn,
    *all_mapper_outputs: List[List[Block]],
    reduce_args: Optional[List[Any]] = None,
) -> List[Union[BlockMetadata, Block]]:
    """
    Yields the merged output blocks O1, O2, ..., O_output_num_blocks,
    followed by a single BlockMetadata describing all of them.
    """
    assert (
        len({len(mapper_outputs) for mapper_outputs in all_mapper_outputs}) == 1
    ), "Received different number of map inputs"
    stats = BlockExecStats.builder()
    if not reduce_args:
        reduce_args = []

    num_rows = 0
    size_bytes = 0
    schema = None
    for i, mapper_outputs in enumerate(zip(*all_mapper_outputs)):
        block, meta = reduce_fn(*reduce_args, *mapper_outputs,
                                partial_reduce=True)
        yield block
        block = BlockAccessor.for_block(block)
        num_rows += block.num_rows()
        size_bytes += block.size_bytes()
        schema = block.schema()
        del block

    yield BlockMetadata(
        num_rows=num_rows,
        size_bytes=size_bytes,
        schema=schema,
        input_files=None,
        exec_stats=stats.build(),
    )
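# Rough standalone sketch (plain lists, invented helper name): the column-wise
# zip used by both _merge variants above. Mapper i produces one output per
# reducer, and zip(*all_mapper_outputs) regroups them so each reducer sees its
# own partition from every mapper. Concatenation stands in for reduce_fn.
from typing import List


def merge_sketch(*all_mapper_outputs: List[List[int]]) -> List[List[int]]:
    merged = []
    for mapper_outputs in zip(*all_mapper_outputs):
        # Stand-in for reduce_fn: concatenate the partitions destined for
        # the same output block.
        merged.append([row for part in mapper_outputs for row in part])
    return merged


# merge_sketch([[1], [2]], [[3], [4]]) -> [[1, 3], [2, 4]]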
def aggregate_combined_blocks(
    blocks: List[Block[ArrowRow]],
    key: KeyFn,
    aggs: Tuple[AggregateFn],
    finalize: bool,
) -> Tuple[Block[ArrowRow], BlockMetadata]:
    """Aggregate sorted, partially combined blocks with the same key range.

    This assumes blocks are already sorted by key in ascending order,
    so we can do merge sort to get all the rows with the same key.

    Args:
        blocks: A list of partially combined and sorted blocks.
        key: The column name of key or None for global aggregation.
        aggs: The aggregations to do.
        finalize: Whether to finalize the aggregation. This is used as an
            optimization for cases where we repeatedly combine partially
            aggregated groups.

    Returns:
        A block of [k, v_1, ..., v_n] columns and its metadata where k is
        the groupby key and v_i is the corresponding aggregation result for
        the ith given aggregation.
        If key is None then the k column is omitted.
    """
    stats = BlockExecStats.builder()
    key_fn = (
        (lambda r: r[r._row.schema.names[0]]) if key is not None else (lambda r: 0)
    )

    iter = heapq.merge(
        *[ArrowBlockAccessor(block).iter_rows() for block in blocks], key=key_fn
    )
    next_row = None
    builder = ArrowBlockBuilder()
    while True:
        try:
            if next_row is None:
                next_row = next(iter)
            next_key = key_fn(next_row)
            next_key_name = (
                next_row._row.schema.names[0] if key is not None else None
            )

            def gen():
                nonlocal iter
                nonlocal next_row
                while key_fn(next_row) == next_key:
                    yield next_row
                    try:
                        next_row = next(iter)
                    except StopIteration:
                        next_row = None
                        break

            # Merge.
            first = True
            accumulators = [None] * len(aggs)
            resolved_agg_names = [None] * len(aggs)
            for r in gen():
                if first:
                    count = collections.defaultdict(int)
                    for i in range(len(aggs)):
                        name = aggs[i].name
                        # Check for conflicts with existing aggregation
                        # name.
                        if count[name] > 0:
                            name = ArrowBlockAccessor._munge_conflict(
                                name, count[name]
                            )
                        count[name] += 1
                        resolved_agg_names[i] = name
                        accumulators[i] = r[name]
                    first = False
                else:
                    for i in range(len(aggs)):
                        accumulators[i] = aggs[i].merge(
                            accumulators[i], r[resolved_agg_names[i]]
                        )
            # Build the row.
            row = {}
            if key is not None:
                row[next_key_name] = next_key
            for agg, agg_name, accumulator in zip(
                aggs, resolved_agg_names, accumulators
            ):
                if finalize:
                    row[agg_name] = agg.finalize(accumulator)
                else:
                    row[agg_name] = accumulator
            builder.add(row)
        except StopIteration:
            break

    ret = builder.build()
    return ret, ArrowBlockAccessor(ret).get_metadata(None, exec_stats=stats.build())
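# Illustrative sketch (standard library only; function name and use of
# itertools.groupby are this example's, not the source's): the merge-sort
# grouping that aggregate_combined_blocks performs over already-sorted blocks.
# heapq.merge yields rows in key order across all blocks, and consecutive rows
# with the same key are folded into one accumulator; summation stands in for
# AggregateFn.merge / finalize.
import heapq
import itertools
from typing import List, Tuple


def merge_sorted_partial_sums(
    blocks: List[List[Tuple[str, int]]]
) -> List[Tuple[str, int]]:
    merged = heapq.merge(*blocks, key=lambda r: r[0])
    out = []
    for key, rows in itertools.groupby(merged, key=lambda r: r[0]):
        out.append((key, sum(v for _, v in rows)))
    return out


# merge_sorted_partial_sums([[("a", 1), ("b", 2)], [("a", 3), ("c", 4)]])
# -> [("a", 4), ("b", 2), ("c", 4)]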
def _get_metadata(
        table: Union["pyarrow.Table", "pandas.DataFrame"]) -> BlockMetadata:
    stats = BlockExecStats.builder()
    return BlockAccessor.for_block(table).get_metadata(
        input_files=None, exec_stats=stats.build())
def do_zip(block1: Block, block2: Block) -> Tuple[Block, BlockMetadata]:
    stats = BlockExecStats.builder()
    b1 = BlockAccessor.for_block(block1)
    result = b1.zip(block2)
    br = BlockAccessor.for_block(result)
    return result, br.get_metadata(input_files=[], exec_stats=stats.build())
def _ndarray_to_block(
        ndarray: np.ndarray) -> Tuple[Block[np.ndarray], BlockMetadata]:
    stats = BlockExecStats.builder()
    block = BlockAccessor.batch_to_block(ndarray)
    metadata = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build())
    return block, metadata
def aggregate_combined_blocks(
    blocks: List[Block[Tuple[KeyType, AggType]]],
    key: KeyFn,
    aggs: Tuple[AggregateFn],
) -> Tuple[Block[Tuple[KeyType, U]], BlockMetadata]:
    """Aggregate sorted, partially combined blocks with the same key range.

    This assumes blocks are already sorted by key in ascending order,
    so we can do merge sort to get all the rows with the same key.

    Args:
        blocks: A list of partially combined and sorted blocks.
        key: The key function that returns the key from the row
            or None for global aggregation.
        aggs: The aggregations to do.

    Returns:
        A block of (k, v_1, ..., v_n) tuples and its metadata where k is
        the groupby key and v_i is the corresponding aggregation result for
        the ith given aggregation.
        If key is None then the k element of tuple is omitted.
    """
    stats = BlockExecStats.builder()
    key_fn = (lambda r: r[0]) if key else (lambda r: 0)

    iter = heapq.merge(
        *[SimpleBlockAccessor(block).iter_rows() for block in blocks], key=key_fn
    )
    next_row = None
    ret = []
    while True:
        try:
            if next_row is None:
                next_row = next(iter)
            next_key = key_fn(next_row)

            def gen():
                nonlocal iter
                nonlocal next_row
                while key_fn(next_row) == next_key:
                    yield next_row
                    try:
                        next_row = next(iter)
                    except StopIteration:
                        next_row = None
                        break

            first = True
            accumulators = [None] * len(aggs)
            for r in gen():
                if first:
                    for i in range(len(aggs)):
                        accumulators[i] = r[i + 1] if key else r[i]
                    first = False
                else:
                    for i in range(len(aggs)):
                        accumulators[i] = aggs[i].merge(
                            accumulators[i], r[i + 1] if key else r[i]
                        )
            if key is None:
                ret.append(
                    tuple(
                        agg.finalize(accumulator)
                        for agg, accumulator in zip(aggs, accumulators)
                    )
                )
            else:
                ret.append(
                    (next_key,)
                    + tuple(
                        agg.finalize(accumulator)
                        for agg, accumulator in zip(aggs, accumulators)
                    )
                )
        except StopIteration:
            break

    return ret, SimpleBlockAccessor(ret).get_metadata(
        None, exec_stats=stats.build()
    )
def _get_metadata(table: "pyarrow.Table") -> BlockMetadata:
    stats = BlockExecStats.builder()
    return BlockAccessor.for_block(table).get_metadata(
        input_files=None, exec_stats=stats.build())
def _df_to_block(
        df: "pandas.DataFrame") -> Tuple[Block[ArrowRow], BlockMetadata]:
    stats = BlockExecStats.builder()
    import pyarrow as pa

    block = pa.table(df)
    return (block, BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build()))