def map(
    idx: int,
    block: Block,
    output_num_blocks: int,
    block_udf: Optional[Callable[[Block], Iterable[Block]]],
    random_shuffle: bool,
    random_seed: Optional[int],
) -> List[Union[BlockMetadata, Block]]:
    stats = BlockExecStats.builder()
    if block_udf:
        # TODO(ekl) note that this effectively disables block splitting.
        blocks = list(block_udf(block))
        if len(blocks) > 1:
            builder = BlockAccessor.for_block(blocks[0]).builder()
            for b in blocks:
                builder.add_block(b)
            block = builder.build()
        else:
            block = blocks[0]
    block = BlockAccessor.for_block(block)

    # Randomize the distribution of records to blocks.
    if random_shuffle:
        seed_i = random_seed + idx if random_seed is not None else None
        block = block.random_shuffle(seed_i)
        block = BlockAccessor.for_block(block)

    slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks))
    slices = []
    for i in range(output_num_blocks):
        slices.append(block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True))

    # Randomize the distribution order of the blocks (this prevents empty
    # outputs when input blocks are very small).
    if random_shuffle:
        random = np.random.RandomState(seed_i)
        random.shuffle(slices)

    num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices)
    assert num_rows == block.num_rows(), (num_rows, block.num_rows())
    metadata = block.get_metadata(input_files=None, exec_stats=stats.build())
    return [metadata] + slices

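A minimal standalone sketch (pure Python, no Ray) of the slicing math above; split_rows is a hypothetical helper and plain lists stand in for blocks.

import math
import random
from typing import List, Optional

def split_rows(rows: List[int], output_num_blocks: int, seed: Optional[int] = None) -> List[List[int]]:
    # Mirror of the shuffle-map flow above: optionally shuffle the rows, cut
    # them into output_num_blocks contiguous slices, then shuffle slice order.
    if seed is not None:
        random.Random(seed).shuffle(rows)
    slice_sz = max(1, math.ceil(len(rows) / output_num_blocks))
    slices = [rows[i * slice_sz:(i + 1) * slice_sz] for i in range(output_num_blocks)]
    if seed is not None:
        random.Random(seed).shuffle(slices)
    assert sum(len(s) for s in slices) == len(rows)
    return slices

# 10 rows into 4 output blocks -> slice sizes [3, 3, 3, 1] before reordering.
print([len(s) for s in split_rows(list(range(10)), 4)])
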
def _ndarray_to_block(ndarray: np.ndarray) -> Block[np.ndarray]:
    stats = BlockExecStats.builder()
    import pyarrow as pa
    from ray.data.extensions import TensorArray

    table = pa.Table.from_pydict({"value": TensorArray(ndarray)})
    return (
        table,
        BlockAccessor.for_block(table).get_metadata(
            input_files=None, exec_stats=stats.build()
        ),
    )

def reduce(
    random_shuffle: bool, random_seed: Optional[int], *mapper_outputs: List[Block]
) -> (Block, BlockMetadata):
    stats = BlockExecStats.builder()
    builder = DelegatingBlockBuilder()
    for block in mapper_outputs:
        builder.add_block(block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    if random_shuffle:
        new_block = accessor.random_shuffle(
            random_seed if random_seed is not None else None
        )
        accessor = BlockAccessor.for_block(new_block)
    new_metadata = BlockMetadata(
        num_rows=accessor.num_rows(),
        size_bytes=accessor.size_bytes(),
        schema=accessor.schema(),
        input_files=None,
        exec_stats=stats.build(),
    )
    return new_block, new_metadata

def _df_to_block(df: "pandas.DataFrame") -> Block[ArrowRow]:
    stats = BlockExecStats.builder()
    import pyarrow as pa

    block = pa.table(df)
    return (
        block,
        BlockAccessor.for_block(block).get_metadata(
            input_files=None, exec_stats=stats.build()
        ),
    )

def _shuffle_reduce(*mapper_outputs: List[Block]) -> (Block, BlockMetadata):
    builder = DelegatingArrowBlockBuilder()
    for block in mapper_outputs:
        builder.add_block(block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    new_metadata = BlockMetadata(
        num_rows=accessor.num_rows(),
        size_bytes=accessor.size_bytes(),
        schema=accessor.schema(),
        input_files=None,
    )
    return new_block, new_metadata

def _write_block(
    self,
    f: "pyarrow.NativeFile",
    block: BlockAccessor,
    writer_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
    **writer_args,
):
    import pyarrow.parquet as pq

    writer_args = _resolve_kwargs(writer_args_fn, **writer_args)
    pq.write_table(block.to_arrow(), f, **writer_args)

def _buffer_size(self) -> int:
    """Return shuffle buffer size."""
    buffer_size = self._builder.num_rows()
    if self._shuffle_buffer is not None:
        # Include the size of the concrete (materialized) shuffle buffer,
        # adjusting for the batch head position, which also serves as a
        # counter of the number of already-yielded rows from the current
        # concrete shuffle buffer.
        buffer_size += (
            BlockAccessor.for_block(self._shuffle_buffer).num_rows()
            - self._batch_head
        )
    return buffer_size

def write_block(write_path: str, block: Block):
    logger.debug(f"Writing {write_path} file.")
    fs = filesystem
    if isinstance(fs, _S3FileSystemWrapper):
        fs = fs.unwrap()
    if _block_udf is not None:
        block = _block_udf(block)
    with fs.open_output_stream(write_path, **open_stream_args) as f:
        _write_block_to_file(f, BlockAccessor.for_block(block), **write_args)

def vectorized_mean(block: Block[T]) -> AggType:
    block_acc = BlockAccessor.for_block(block)
    count = block_acc.count(on)
    if count == 0 or count is None:
        # Empty or all null.
        return None
    sum_ = block_acc.sum(on, ignore_nulls)
    if sum_ is None:
        # ignore_nulls=False and at least one null.
        return None
    return [sum_, count]

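A hedged sketch of how the [sum, count] partial state above could be merged and finalized into a mean; merge_mean_states and finalize_mean are illustrative names, not Ray APIs.

def merge_mean_states(a, b):
    # None marks an empty / all-null partial result.
    if a is None:
        return b
    if b is None:
        return a
    return [a[0] + b[0], a[1] + b[1]]

def finalize_mean(state):
    return None if state is None else state[0] / state[1]

# Block [1, 2, 3] -> [6, 3]; block [4, 5] -> [9, 2]; overall mean is 3.0.
assert finalize_mean(merge_mean_states([6, 3], [9, 2])) == 3.0
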
def add_block(self, block: Any) -> None:
    if not isinstance(block, self._block_type):
        raise TypeError(
            f"Got a block of type {type(block)}, expected {self._block_type}. "
            "If you are mapping a function, ensure it returns an "
            "object with the expected type. Block:\n"
            f"{block}"
        )
    accessor = BlockAccessor.for_block(block)
    self._tables.append(block)
    self._tables_size_bytes += accessor.size_bytes()
    self._num_rows += accessor.num_rows()

def from_items(items: List[Any], *, parallelism: int = -1) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> import ray
        >>> ds = ray.data.from_items([1, 2, 3, 4, 5]) # doctest: +SKIP
        >>> ds # doctest: +SKIP
        Dataset(num_blocks=5, num_rows=5, schema=<class 'int'>)
        >>> ds.take(2) # doctest: +SKIP
        [1, 2]

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.
            Parallelism may be limited by the number of items.

    Returns:
        Dataset holding the items.
    """
    detected_parallelism, _ = _autodetect_parallelism(
        parallelism,
        ray.util.get_current_placement_group(),
        DatasetContext.get_current(),
    )
    block_size = max(
        1,
        len(items) // detected_parallelism,
    )

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        stats = BlockExecStats.builder()
        builder = DelegatingBlockBuilder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(
                input_files=None, exec_stats=stats.build()
            )
        )
        i += block_size

    return Dataset(
        ExecutionPlan(
            BlockList(blocks, metadata),
            DatasetStats(stages={"from_items": metadata}, parent=None),
        ),
        0,
        False,
    )

def _format_batch(batch: Block, batch_format: str) -> BatchType:
    import pyarrow as pa

    if batch_format == "native":
        # Always promote Arrow blocks to pandas for consistency, since
        # we lazily convert pandas->Arrow internally for efficiency.
        if isinstance(batch, pa.Table) or isinstance(batch, bytes):
            batch = BlockAccessor.for_block(batch)
            batch = batch.to_pandas()
        return batch
    elif batch_format == "pandas":
        batch = BlockAccessor.for_block(batch)
        return batch.to_pandas()
    elif batch_format == "pyarrow":
        batch = BlockAccessor.for_block(batch)
        return batch.to_arrow()
    else:
        raise ValueError(
            f"The given batch format {batch_format} is invalid. "
            "Supported batch formats: 'native', 'pandas', 'pyarrow'."
        )

def _map_block_split(block: Block, fn: Any, input_files: List[str]) -> BlockPartition:
    output = []
    for new_block in fn(block):
        accessor = BlockAccessor.for_block(new_block)
        new_meta = BlockMetadata(
            num_rows=accessor.num_rows(),
            size_bytes=accessor.size_bytes(),
            schema=accessor.schema(),
            input_files=input_files,
        )
        owner = DatasetContext.get_current().block_owner
        output.append((ray.put(new_block, _owner=owner), new_meta))
    return output

def _write_block(
    self,
    f: "pyarrow.NativeFile",
    block: BlockAccessor,
    writer_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
    **writer_args,
):
    from pyarrow import csv

    writer_args = _resolve_kwargs(writer_args_fn, **writer_args)
    write_options = writer_args.pop("write_options", None)
    csv.write_csv(block.to_arrow(), f, write_options, **writer_args)

def _get(self, block_index, key):
    if block_index is None:
        return None
    block = self.blocks[block_index]
    column = block[self.key_field]
    if self.dataset_format == "arrow":
        column = _ArrowListWrapper(column)
    i = _binary_search_find(column, key)
    if i is None:
        return None
    acc = BlockAccessor.for_block(block)
    return acc._create_table_row(acc.slice(i, i + 1, copy=True))

def _map_block_nosplit(
    block: Block, fn: Any, input_files: List[str]
) -> Tuple[Block, BlockMetadata]:
    stats = BlockExecStats.builder()
    builder = DelegatingBlockBuilder()
    for new_block in fn(block):
        builder.add_block(new_block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    return new_block, accessor.get_metadata(
        input_files=input_files, exec_stats=stats.build()
    )

def test_sort_arrow_with_empty_blocks(ray_start_regular, use_push_based_shuffle):
    ctx = ray.data.context.DatasetContext.get_current()
    try:
        original = ctx.use_push_based_shuffle
        ctx.use_push_based_shuffle = use_push_based_shuffle

        assert (
            BlockAccessor.for_block(pa.Table.from_pydict({}))
            .sample(10, "A")
            .num_rows
            == 0
        )

        partitions = BlockAccessor.for_block(
            pa.Table.from_pydict({})
        ).sort_and_partition([1, 5, 10], "A", descending=False)
        assert len(partitions) == 4
        for partition in partitions:
            assert partition.num_rows == 0

        assert (
            BlockAccessor.for_block(pa.Table.from_pydict({}))
            .merge_sorted_blocks([pa.Table.from_pydict({})], "A", False)[0]
            .num_rows
            == 0
        )

        ds = ray.data.from_items(
            [{"A": (x % 3), "B": x} for x in range(3)], parallelism=3
        )
        ds = ds.filter(lambda r: r["A"] == 0)
        assert [row.as_pydict() for row in ds.sort("A").iter_rows()] == [
            {"A": 0, "B": 0}
        ]

        # Test empty dataset.
        ds = ray.data.range_table(10).filter(lambda r: r["value"] > 10)
        assert (
            len(
                ray.data.impl.sort.sample_boundaries(
                    ds._plan.execute().get_blocks(), "value", 3
                )
            )
            == 2
        )
        assert ds.sort("value").count() == 0
    finally:
        ctx.use_push_based_shuffle = original

def zip(self, other: "Block[T]") -> "Block[T]":
    acc = BlockAccessor.for_block(other)
    if not isinstance(acc, type(self)):
        raise ValueError(
            "Cannot zip {} with block of type {}".format(type(self), type(other))
        )
    if acc.num_rows() != self.num_rows():
        raise ValueError(
            "Cannot zip self (length {}) with block of length {}".format(
                self.num_rows(), acc.num_rows()
            )
        )
    return self._zip(acc)

def vectorized_std(block: Block[T]) -> AggType:
    block_acc = BlockAccessor.for_block(block)
    count = block_acc.count(on)
    if count == 0 or count is None:
        # Empty or all null.
        return None
    sum_ = block_acc.sum(on, ignore_nulls)
    if sum_ is None:
        # ignore_nulls=False and at least one null.
        return None
    mean = sum_ / count
    M2 = block_acc.sum_of_squared_diffs_from_mean(on, ignore_nulls, mean)
    return [M2, mean, count]

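Similarly, a standalone sketch of combining two [M2, mean, count] partial states with the standard parallel-variance update (Chan et al.); the helper names are made up.

import math

def merge_std_states(a, b):
    # Combine two [M2, mean, count] states into one equivalent state.
    M2_a, mean_a, count_a = a
    M2_b, mean_b, count_b = b
    count = count_a + count_b
    delta = mean_b - mean_a
    mean = mean_a + delta * count_b / count
    M2 = M2_a + M2_b + delta ** 2 * count_a * count_b / count
    return [M2, mean, count]

def finalize_std(state, ddof=1):
    M2, _, count = state
    return math.sqrt(M2 / (count - ddof))

# Block [1, 2, 3] -> [2.0, 2.0, 3]; block [4, 5] -> [0.5, 4.5, 2].
merged = merge_std_states([2.0, 2.0, 3], [0.5, 4.5, 2])
assert merged == [10.0, 3.0, 5]
assert abs(finalize_std(merged) - 1.5811388300841898) < 1e-12
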
def group_fn(batch):
    block_accessor = BlockAccessor.for_block(batch)
    boundaries = get_boundaries(block_accessor)
    builder = block_accessor.builder()
    start = 0
    for end in boundaries:
        group = block_accessor.slice(start, end, False)
        applied = fn(group)
        builder.add_block(applied)
        start = end
    rs = builder.build()
    return rs

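A toy, Ray-free sketch of the boundary-based group apply above: boundaries are end offsets of each sorted group, and list slices stand in for blocks.

def apply_per_group(rows, boundaries, fn):
    # Slice each sorted group, apply fn, and stitch the results back together.
    out = []
    start = 0
    for end in boundaries:
        out.extend(fn(rows[start:end]))
        start = end
    return out

# Two groups (rows[0:2] and rows[2:5]); fn reverses each group.
assert apply_per_group([1, 2, 3, 4, 5], [2, 5], lambda g: g[::-1]) == [2, 1, 5, 4, 3]
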
def _get_write_path_for_block(
    self,
    base_path,
    *,
    filesystem=None,
    dataset_uuid=None,
    block=None,
    block_index=None,
    file_format=None,
):
    num_rows = BlockAccessor.for_block(ray.get(block)).num_rows()
    suffix = (
        f"{block_index:06}_{num_rows:02}_{dataset_uuid}.test.{file_format}"
    )
    print(f"Writing to: {base_path}/{suffix}")
    return f"{base_path}/{suffix}"

def map(
    idx: int,
    block: Block,
    output_num_blocks: int,
    boundaries: List[KeyType],
    key: KeyFn,
    aggs: Tuple[AggregateFn],
) -> List[Union[BlockMetadata, Block]]:
    """Partition the block and combine rows with the same key."""
    stats = BlockExecStats.builder()
    if key is None:
        partitions = [block]
    else:
        partitions = BlockAccessor.for_block(block).sort_and_partition(
            boundaries,
            [(key, "ascending")] if isinstance(key, str) else key,
            descending=False,
        )
    parts = [BlockAccessor.for_block(p).combine(key, aggs) for p in partitions]
    meta = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build()
    )
    return [meta] + parts

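A rough standalone sketch of the partition step, using NumPy's searchsorted on a sorted key column; it mirrors the idea of sort_and_partition, not its actual implementation.

import numpy as np

def partition_sorted_keys(keys, boundaries):
    # Sort, then cut at each boundary value; yields len(boundaries) + 1 partitions.
    keys = np.sort(np.asarray(keys))
    cut_points = np.searchsorted(keys, boundaries)
    return np.split(keys, cut_points)

parts = partition_sorted_keys([7, 1, 5, 12, 3], boundaries=[4, 10])
assert [p.tolist() for p in parts] == [[1, 3], [5, 7], [12]]
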
def _zip(self, acc: BlockAccessor) -> "Block[T]":
    r = self.to_arrow()
    s = acc.to_arrow()
    for col_name in s.column_names:
        col = s.column(col_name)
        # Ensure the column names are unique after zip.
        if col_name in r.column_names:
            i = 1
            new_name = col_name
            while new_name in r.column_names:
                new_name = "{}_{}".format(col_name, i)
                i += 1
            col_name = new_name
        r = r.append_column(col_name, col)
    return r

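A small self-contained example (plain pyarrow, outside Ray) of the column-name deduplication loop above; the table contents are made up.

import pyarrow as pa

left = pa.table({"id": [1, 2], "value": [10, 20]})
right = pa.table({"value": [30, 40]})

r = left
for col_name in right.column_names:
    col = right.column(col_name)
    # Rename colliding columns to value_1, value_2, ... before appending.
    if col_name in r.column_names:
        i = 1
        new_name = col_name
        while new_name in r.column_names:
            new_name = "{}_{}".format(col_name, i)
            i += 1
        col_name = new_name
    r = r.append_column(col_name, col)

assert r.column_names == ["id", "value", "value_1"]
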
def _zip(self, acc: BlockAccessor) -> "pandas.DataFrame":
    r = self.to_pandas().copy(deep=False)
    s = acc.to_pandas()
    for col_name in s.columns:
        col = s[col_name]
        # Ensure the column names are unique after zip.
        if col_name in r.columns:
            i = 1
            new_name = col_name
            while new_name in r.columns:
                new_name = "{}_{}".format(col_name, i)
                i += 1
            col_name = new_name
        r[col_name] = col
    return r

def process_block(
    self, block: Block, input_files: List[str]
) -> Iterable[Tuple[Block, BlockMetadata]]:
    output = []
    for new_block in fn(block):
        accessor = BlockAccessor.for_block(new_block)
        new_metadata = BlockMetadata(
            num_rows=accessor.num_rows(),
            size_bytes=accessor.size_bytes(),
            schema=accessor.schema(),
            input_files=input_files,
        )
        owner = DatasetContext.get_current().block_owner
        output.append((ray.put(new_block, _owner=owner), new_metadata))
    return output

def _get_write_path_for_block(
    self,
    base_path,
    *,
    filesystem=None,
    dataset_uuid=None,
    block=None,
    block_index=None,
    file_format=None,
):
    num_rows = BlockAccessor.for_block(ray.get(block)).num_rows()
    suffix = (
        f"{block_index:06}_{num_rows:02}_{dataset_uuid}.test.{file_format}"
    )
    return posixpath.join(base_path, suffix)

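A quick illustration of the filename pattern produced above, with made-up values.

import posixpath

block_index, num_rows, dataset_uuid, file_format = 3, 42, "abc123", "csv"
suffix = f"{block_index:06}_{num_rows:02}_{dataset_uuid}.test.{file_format}"
assert suffix == "000003_42_abc123.test.csv"
print(posixpath.join("/tmp/out", suffix))
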
def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True):
    self._set_key_fn(on)
    null_merge = _null_wrap_merge(ignore_nulls, max)
    super().__init__(
        init=_null_wrap_init(lambda k: float("-inf")),
        merge=null_merge,
        accumulate_block=_null_wrap_accumulate_block(
            ignore_nulls,
            lambda block: BlockAccessor.for_block(block).max(on, ignore_nulls),
            null_merge,
        ),
        finalize=_null_wrap_finalize(lambda a: a),
        name=(f"max({str(on)})"),
    )

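A simplified, standalone approximation of the null-propagation pattern that the internal _null_wrap_* helpers implement; null_safe_merge is hypothetical and only sketches the ignore_nulls semantics.

def null_safe_merge(ignore_nulls, merge):
    # With ignore_nulls=True a None partial result is skipped; with
    # ignore_nulls=False it makes the whole aggregation None.
    def _merge(a, b):
        if a is None or b is None:
            if not ignore_nulls:
                return None
            return a if b is None else b
        return merge(a, b)
    return _merge

assert null_safe_merge(True, max)(3, None) == 3
assert null_safe_merge(False, max)(3, None) is None
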
def multiget(self, block_indices, keys):
    start = time.perf_counter()
    if self.dataset_format == "arrow" and len(set(block_indices)) == 1:
        # Fast path: use np.searchsorted for vectorized search on a single
        # block. This is ~3x faster than the naive case.
        block = self.blocks[block_indices[0]]
        col = block[self.key_field]
        indices = np.searchsorted(col, keys)
        acc = BlockAccessor.for_block(block)
        result = [acc._get_row(i, copy=True) for i in indices]
        # assert result == [self._get(i, k) for i, k in zip(block_indices, keys)]
    else:
        result = [self._get(i, k) for i, k in zip(block_indices, keys)]
    self.total_time += time.perf_counter() - start
    self.num_accesses += 1
    return result

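A standalone NumPy sketch of the vectorized fast path, assuming exact-match keys on a sorted column; the variable names are illustrative.

import numpy as np

# Sorted key column and its rows, standing in for a single sorted Arrow block.
col = np.array([2, 4, 8, 16, 32])
rows = ["a", "b", "c", "d", "e"]

keys = [4, 16, 32]  # keys assumed to be present in the column
indices = np.searchsorted(col, keys)
assert [rows[i] for i in indices] == ["b", "d", "e"]
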
def remote_read(i: int, task: ReadTask) -> MaybeBlockPartition:
    DatasetContext._set_current(context)
    stats = BlockExecStats.builder()

    # Execute the read task.
    block = task()

    if context.block_splitting_enabled:
        metadata = task.get_metadata()
        metadata.exec_stats = stats.build()
    else:
        metadata = BlockAccessor.for_block(block).get_metadata(
            input_files=task.get_metadata().input_files,
            exec_stats=stats.build(),
        )
    stats_actor.record_task.remote(stats_uuid, i, metadata)
    return block

def make_block(start: int, count: int) -> Block:
    if block_format == "arrow":
        import pyarrow as pa

        return pa.Table.from_arrays(
            [np.arange(start, start + count)], names=["value"]
        )
    elif block_format == "tensor":
        import pyarrow as pa

        tensor = np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
            np.arange(start, start + count),
            tuple(range(1, 1 + len(tensor_shape))),
        )
        return BlockAccessor.batch_to_block(tensor)
    else:
        return list(builtins.range(start, start + count))

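A brief pure-NumPy check of the broadcasting trick in the tensor branch: expanding the row ids over the trailing tensor dimensions yields one constant-valued tensor per row; the shapes and values here are arbitrary.

import numpy as np

tensor_shape = (2, 2)
start, count = 5, 3
tensor = np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
    np.arange(start, start + count),
    tuple(range(1, 1 + len(tensor_shape))),
)
# (count, 1, 1) broadcast against tensor_shape gives (count, *tensor_shape).
assert tensor.shape == (count,) + tensor_shape
assert (tensor[0] == 5).all() and (tensor[2] == 7).all()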