def test_memory_release_pipeline(shutdown_only, lazy_input):
    """Check that an unfused multi-stage pipeline runs without spilling.

    Builds a single-window pipeline of ``n`` blocks, maps every block to a
    100 MiB uint8 array of ones, adds one to it ``num_rounds`` times in
    separate stages, validates the values, and finally asserts that the
    memory summary does not mention "Spilled".
    """
    ctx = DatasetContext.get_current()
    # Stage fusion is turned off so reads and maps are not fused together;
    # multi-stage memory releasing is what is being tested here.
    ctx.optimize_fuse_stages = False

    # This object store allocation can hold at most 1 copy of the transformed
    # dataset. The same budget applies to both lazy and eager input.
    object_store_memory = 3000e6

    n = 10
    info = ray.init(num_cpus=n, object_store_memory=object_store_memory)
    if lazy_input:
        ds = ray.data.read_datasource(
            OnesSource(),
            parallelism=n,
            n_per_block=100 * 1024 * 1024,
        )
    else:
        ds = ray.data.from_items(list(range(n)), parallelism=n)

    # A single window covering every block.
    pipe = ds.window(blocks_per_window=n)

    # Round 1: make sure every element is a 100 MiB array.
    def gen(x):
        import time

        # TODO(Clark): Remove this sleep once we have fixed memory pressure
        # handling.
        time.sleep(2)
        return x if isinstance(x, np.ndarray) else np.ones(
            100 * 1024 * 1024, dtype=np.uint8)

    pipe = pipe.map(gen)

    def inc(x):
        import time

        # TODO(Clark): Remove this sleep once we have fixed memory pressure
        # handling.
        time.sleep(2)
        return x + 1

    num_rounds = 10
    for _ in range(num_rounds):
        pipe = pipe.map(inc)

    # Every array must equal ones + num_rounds after all increments.
    expected = np.ones(100 * 1024 * 1024, dtype=np.uint8) + num_rounds
    for block in pipe.iter_batches(batch_size=None):
        for arr in block:
            np.testing.assert_equal(arr, expected)

    meminfo = memory_summary(info["address"], stats_only=True)
    assert "Spilled" not in meminfo, meminfo
def prepare_read(self, parallelism: int):
    """Return a single read task that yields the context's ``foo`` value.

    ``foo`` is read from the current DatasetContext when this method runs;
    the read task's callable returns it as the only row of the only block.
    ``parallelism`` is not used.
    """
    captured = DatasetContext.get_current().foo
    metadata = BlockMetadata(
        num_rows=1,
        size_bytes=8,
        schema=None,
        input_files=None,
        exec_stats=None,
    )
    read_task = ReadTask(lambda: [[captured]], metadata)
    return [read_task]
def test_dataset_pipeline_stats_basic(ray_start_regular_shared):
    """Compare canonicalized pipeline stats against a fixed expected report.

    With stage fusion enabled, a 1000-row range dataset is passed through an
    identity ``map_batches``, repeated 5 times, and mapped again with an
    identity function. The pipeline is fully consumed before stats are read.
    """
    context = DatasetContext.get_current()
    context.optimize_fuse_stages = True
    ds = ray.data.range(1000, parallelism=10)
    ds = ds.map_batches(lambda x: x)
    pipe = ds.repeat(5)
    pipe = pipe.map(lambda x: x)
    # Drain the pipeline; the batches themselves are not inspected.
    for batch in pipe.iter_batches():
        pass
    stats = canonicalize(pipe.stats())
    # NOTE(review): this is an exact string comparison, so whitespace and line
    # breaks inside the literal are significant -- confirm they match the
    # layout produced by canonicalize().
    assert (stats == """== Pipeline Window N == Stage N read->map_batches: N/N blocks executed in T * Remote wall time: T min, T max, T mean, T total * Remote cpu time: T min, T max, T mean, T total * Output num rows: N min, N max, N mean, N total * Output size bytes: N min, N max, N mean, N total * Tasks per node: N min, N max, N mean; N nodes used Stage N map: N/N blocks executed in T * Remote wall time: T min, T max, T mean, T total * Remote cpu time: T min, T max, T mean, T total * Output num rows: N min, N max, N mean, N total * Output size bytes: N min, N max, N mean, N total * Tasks per node: N min, N max, N mean; N nodes used == Pipeline Window N == Stage N read->map_batches: [execution cached] Stage N map: N/N blocks executed in T * Remote wall time: T min, T max, T mean, T total * Remote cpu time: T min, T max, T mean, T total * Output num rows: N min, N max, N mean, N total * Output size bytes: N min, N max, N mean, N total * Tasks per node: N min, N max, N mean; N nodes used == Pipeline Window N == Stage N read->map_batches: [execution cached] Stage N map: N/N blocks executed in T * Remote wall time: T min, T max, T mean, T total * Remote cpu time: T min, T max, T mean, T total * Output num rows: N min, N max, N mean, N total * Output size bytes: N min, N max, N mean, N total * Tasks per node: N min, N max, N mean; N nodes used ##### Overall Pipeline Time Breakdown ##### * Time stalled waiting for next dataset: T min, T max, T mean, T total DatasetPipeline iterator time breakdown: * Waiting for next dataset: T * In ray.wait(): T * In ray.get(): T * In format_batch(): T * In user code: T * Total time: T """)
def test_repeat_forever(ray_start_regular_shared):
    """Check that ``repeat()`` without a count cycles through the rows.

    The pipeline over ``range(10)`` must report an infinite window count and
    keep yielding ``0..9`` in order; iteration stops once the index passes
    1000.
    """
    DatasetContext.get_current().optimize_fuse_stages = True
    pipe = ray.data.range(10).repeat()
    assert str(pipe) == "DatasetPipeline(num_windows=inf, num_stages=2)"
    for idx, value in enumerate(pipe.iter_rows()):
        expected = idx % 10
        assert value == expected, (value, idx, expected)
        if idx > 1000:
            break
def __next__(self):
    """Advance the stage slots until the last one produces an output.

    Each slot in ``self._stages`` holds either ``None`` or a pending object
    ref. Completed slots are moved one position forward (last slot first)
    whenever the following slot is free; a result leaving the last slot is
    returned.

    Raises:
        StopIteration: When every slot is empty at the start of a pass.
    """
    output = None
    while output is None:
        if all(slot is None for slot in self._stages):
            raise StopIteration

        # Wait briefly for any in-flight slot to complete.
        in_flight = [slot for slot in self._stages if slot is not None]
        ready, _ = ray.wait(
            in_flight, timeout=0.1, num_returns=len(in_flight))

        # Walk the slots back to front so a freed slot can be refilled by its
        # predecessor within the same pass.
        num_slots = len(self._stages)
        for i in reversed(range(num_slots)):
            is_last = i == num_slots - 1
            if not is_last and self._stages[i + 1] is not None:
                # The next slot is still occupied.
                continue
            if self._stages[i] not in ready:
                # This slot is empty or has not finished yet.
                continue

            # Bubble the finished element forward.
            result = ray.get(self._stages[i])
            if self._bars:
                self._bars[i].update(1)
            self._stages[i] = None
            if is_last:
                output = result
                continue
            fn = self._pipeline._stages[i]
            self._stages[i + 1] = pipeline_stage.remote(
                lambda: fn(result), DatasetContext.get_current())

        # Refill the first slot from the source iterator when it is free.
        if self._stages[0] is None:
            try:
                self._stages[0] = pipeline_stage.remote(
                    next(self._iter), DatasetContext.get_current())
            except StopIteration:
                pass

    return output
def iter_blocks_with_metadata(
        self,
        block_for_metadata: bool = False,
) -> Iterator[Tuple[ObjectRef[Block], BlockMetadata]]:
    """Iterate over block references paired with their metadata.

    By default (``block_for_metadata=False``) the metadata comes from the
    read tasks held by this list, so iteration does not wait for the read
    tasks to run; such metadata may be under-specified, e.g. missing the
    schema or the row count. Pass ``block_for_metadata=True`` to fetch the
    metadata produced by the executed read task instead.

    The length of this iterator is not known until execution.

    Args:
        block_for_metadata: Whether to block on the execution of read tasks
            in order to obtain fully-specified block metadata.

    Returns:
        An iterator of block references and the corresponding block metadata.
    """
    context = DatasetContext.get_current()
    outer = self

    class Iter:
        def __init__(self):
            self._base_iter = outer._iter_block_partition_refs()
            # Index of the most recently requested read task.
            self._pos = -1
            # (block_ref, metadata) pairs waiting to be handed out.
            self._buffer = []

        def __iter__(self):
            return self

        def __next__(self):
            # Refill until at least one pair is buffered; a partition may be
            # empty, hence the loop.
            while not self._buffer:
                self._pos += 1
                if context.block_splitting_enabled:
                    part_ref, _ = next(self._base_iter)
                    partition = ray.get(part_ref)
                else:
                    block_ref, metadata_ref = next(self._base_iter)
                    if block_for_metadata:
                        # Blocks until the read task completes, returning
                        # fully-specified block metadata.
                        metadata = ray.get(metadata_ref)
                    else:
                        # Does not block, returning (possibly
                        # under-specified) pre-read block metadata.
                        metadata = outer._tasks[self._pos].get_metadata()
                    partition = [(block_ref, metadata)]
                self._buffer.extend((ref, meta) for ref, meta in partition)
            return self._buffer.pop(0)

    return Iter()
def is_read_stage_equivalent(self) -> bool:
    """Return whether this plan can be executed as only a read stage.

    A leading ``RandomizeBlocksStage`` after the snapshot is disregarded when
    stage fusion is enabled. The plan qualifies when it has lazy input, has
    no stages before the snapshot, has no other stages after it, and its
    snapshot blocks are either falsy or a ``LazyBlockList``.
    """
    ctx = DatasetContext.get_current()
    pending = self._stages_after_snapshot
    skip_first = (
        ctx.optimize_fuse_stages
        and pending
        and isinstance(pending[0], RandomizeBlocksStage)
    )
    if skip_first:
        pending = pending[1:]
    return (
        self.has_lazy_input()
        and not self._stages_before_snapshot
        and not pending
        and (
            not self._snapshot_blocks
            or isinstance(self._snapshot_blocks, LazyBlockList)
        )
    )
def test_window_randomize_fusion(ray_start_regular_shared):
    """Check that read, block-order randomization and map_batches fuse.

    With the fusion and reordering optimizations enabled, the stats of the
    windowed pipeline must contain the fused stage name.
    """
    ctx = DatasetContext.get_current()
    ctx.optimize_fuse_stages = True
    ctx.optimize_fuse_read_stages = True
    ctx.optimize_reorder_stages = True

    shuffled = ray.data.range(100).randomize_block_order()
    pipe = shuffled.window().map_batches(lambda x: x)
    pipe.take()

    stats = pipe.stats()
    assert "read->randomize_block_order->map_batches" in stats, stats
def from_pandas_refs(
        dfs: Union[ObjectRef["pandas.DataFrame"],
                   List[ObjectRef["pandas.DataFrame"]]]
) -> Dataset[ArrowRow]:
    """Create a dataset from a list of Ray object references to Pandas
    dataframes.

    Args:
        dfs: A Ray object reference to a pandas dataframe, or a list of Ray
            object references to pandas dataframes.

    Returns:
        Dataset holding Arrow records read from the dataframes.

    Raises:
        ValueError: If ``dfs`` is neither an object ref nor a list, or if the
            list contains something that is not an object ref.
    """
    if isinstance(dfs, ray.ObjectRef):
        dfs = [dfs]
    elif isinstance(dfs, list):
        for df in dfs:
            if not isinstance(df, ray.ObjectRef):
                raise ValueError(
                    "Expected list of Ray object refs, "
                    f"got list containing {type(df)}"
                )
    else:
        # Report the type of the argument itself; no loop variable is bound
        # on this path.
        raise ValueError(
            "Expected Ray object ref or list of Ray object refs, "
            f"got {type(dfs)}"
        )

    context = DatasetContext.get_current()
    if context.enable_pandas_block:
        # The dataframe refs are used as blocks directly; only their metadata
        # is computed remotely.
        get_metadata = cached_remote_fn(_get_metadata)
        blocks = dfs
        metadata = ray.get([get_metadata.remote(df) for df in dfs])
    else:
        # Each remote call returns a (block, metadata) pair of refs.
        df_to_block = cached_remote_fn(_df_to_block, num_returns=2)
        res = [df_to_block.remote(df) for df in dfs]
        if res:
            blocks, metadata = map(list, zip(*res))
        else:
            # zip(*[]) yields nothing to unpack; an empty input gives an
            # empty block list, as it does on the pandas-block path.
            blocks, metadata = [], []
        metadata = ray.get(metadata)

    return Dataset(
        ExecutionPlan(
            BlockList(blocks, metadata),
            DatasetStats(stages={"from_pandas_refs": metadata}, parent=None),
        ),
        0,
        False,
    )
def from_items(items: List[Any], *, parallelism: int = -1) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    The items are cut into consecutive chunks of
    ``max(1, len(items) // detected_parallelism)`` elements; each chunk is
    built into a block and stored with ``ray.put``.

    Examples:
        >>> import ray
        >>> ds = ray.data.from_items([1, 2, 3, 4, 5]) # doctest: +SKIP
        >>> ds # doctest: +SKIP
        Dataset(num_blocks=5, num_rows=5, schema=<class 'int'>)
        >>> ds.take(2) # doctest: +SKIP
        [1, 2]

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.
            Parallelism may be limited by the number of items.

    Returns:
        Dataset holding the items.
    """
    detected_parallelism, _ = _autodetect_parallelism(
        parallelism,
        ray.util.get_current_placement_group(),
        DatasetContext.get_current(),
    )
    # Never drop below one item per block.
    block_size = max(1, len(items) // detected_parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    for start in range(0, len(items), block_size):
        stats = BlockExecStats.builder()
        builder = DelegatingBlockBuilder()
        for item in items[start:start + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        block_meta = BlockAccessor.for_block(block).get_metadata(
            input_files=None, exec_stats=stats.build())
        metadata.append(block_meta)

    plan = ExecutionPlan(
        BlockList(blocks, metadata),
        DatasetStats(stages={"from_items": metadata}, parent=None),
    )
    return Dataset(plan, 0, False)
def _map_block_split(block: Block, fn: Any,
                     input_files: List[str]) -> BlockPartition:
    """Apply ``fn`` to ``block`` and store every block it yields.

    Args:
        block: Input block handed to ``fn``.
        fn: Callable returning an iterable of output blocks.
        input_files: Recorded in the metadata of every output block.

    Returns:
        A list of ``(block_ref, metadata)`` pairs, one per output block, in
        the order ``fn`` produced them.
    """
    partition = []
    for out_block in fn(block):
        acc = BlockAccessor.for_block(out_block)
        out_meta = BlockMetadata(
            num_rows=acc.num_rows(),
            size_bytes=acc.size_bytes(),
            schema=acc.schema(),
            input_files=input_files,
        )
        # The stored block is owned by the context's configured block owner.
        out_ref = ray.put(
            out_block, _owner=DatasetContext.get_current().block_owner)
        partition.append((out_ref, out_meta))
    return partition
def apply(self, fn: Any, remote_args: dict, blocks: BlockList) -> BlockList:
    """Run ``fn`` over every block as a remote task and collect the results.

    Args:
        fn: Block transform passed through to the remote map function.
        remote_args: Options applied to the remote map function.
        blocks: Input block list; returned unchanged when it has no blocks.

    Returns:
        A new BlockList holding the output block refs and their metadata.

    Raises:
        ray.exceptions.RayTaskError, KeyboardInterrupt: Re-raised after all
            map tasks have been cancelled and awaited.
    """
    context = DatasetContext.get_current()

    # Nothing to map for an empty dataset.
    if blocks.initial_num_blocks() == 0:
        return blocks

    block_pairs = blocks.get_blocks_with_metadata()
    progress = ProgressBar("Map Progress", total=len(block_pairs))

    if context.block_splitting_enabled:
        # One ref per task; each resolves to a list of (block, metadata).
        remote_map = cached_remote_fn(_map_block_split).options(
            **remote_args)
        refs = [
            remote_map.remote(b, fn, m.input_files) for b, m in block_pairs
        ]
    else:
        # Two refs per task: the data block and its metadata. Only the
        # metadata refs are waited on below.
        remote_map = cached_remote_fn(_map_block_nosplit).options(
            **dict(remote_args, num_returns=2))
        task_refs = [
            remote_map.remote(b, fn, m.input_files) for b, m in block_pairs
        ]
        data_refs = [pair[0] for pair in task_refs]
        refs = [pair[1] for pair in task_refs]

    # Common wait for non-data refs.
    try:
        results = progress.fetch_until_complete(refs)
    except (ray.exceptions.RayTaskError, KeyboardInterrupt) as e:
        # A mapper task failed or a SIGINT arrived while waiting; either
        # way, cancel every map task.
        for ref in refs:
            ray.cancel(ref)
        # Wait until all tasks have failed or been cancelled.
        for ref in refs:
            try:
                ray.get(ref)
            except (ray.exceptions.RayTaskError,
                    ray.exceptions.TaskCancelledError):
                pass
        # Reraise the original task failure exception.
        raise e from None

    if context.block_splitting_enabled:
        pairs = [(b, m) for partition in results for b, m in partition]
    else:
        pairs = list(zip(data_refs, results))
    new_blocks = [b for b, _ in pairs]
    new_metadata = [m for _, m in pairs]
    return BlockList(new_blocks, new_metadata)
def read_pieces(serialized_pieces: str) -> Iterator[pa.Table]:
    """Read serialized Parquet fragments and yield buffered output blocks.

    Args:
        serialized_pieces: Pickled list of Parquet file fragments, loaded
            with ``cloudpickle.loads``.

    Yields:
        Blocks released by the output buffer while non-empty tables are
        added, followed by any block still available after finalization.
    """
    # Importing pyarrow.fs implicitly triggers S3 subsystem initialization.
    import pyarrow.fs  # noqa: F401

    # Deserialize only after the filesystem class has been loaded; the
    # fragment serialization hooks are always removed again afterwards.
    try:
        _register_parquet_file_fragment_serialization()
        pieces: List[
            "pyarrow._dataset.ParquetFileFragment"
        ] = cloudpickle.loads(serialized_pieces)
    finally:
        _deregister_parquet_file_fragment_serialization()

    # At least one dataset fragment must be read.
    assert len(pieces) > 0

    from pyarrow.dataset import _get_partition_keys

    ctx = DatasetContext.get_current()
    output_buffer = BlockOutputBuffer(
        block_udf=_block_udf,
        target_max_block_size=ctx.target_max_block_size,
    )

    logger.debug(f"Reading {len(pieces)} parquet pieces")
    use_threads = reader_args.pop("use_threads", False)
    for piece in pieces:
        part = _get_partition_keys(piece.partition_expression)
        for batch in piece.to_batches(
                use_threads=use_threads,
                columns=columns,
                schema=schema,
                batch_size=PARQUET_READER_ROW_BATCH_SIZE,
                **reader_args,
        ):
            table = pyarrow.Table.from_batches([batch], schema=schema)
            if part:
                # Fill each partition column with its constant value.
                for col, value in part.items():
                    col_index = table.schema.get_field_index(col)
                    col_values = pa.array([value] * len(table))
                    table = table.set_column(col_index, col, col_values)
            # Empty tables are dropped.
            if table.num_rows <= 0:
                continue
            output_buffer.add_block(table)
            if output_buffer.has_next():
                yield output_buffer.next()

    output_buffer.finalize()
    if output_buffer.has_next():
        yield output_buffer.next()
def test_autodetect_parallelism(avail_cpus, data_size, expected):
    """Check the detected parallelism for a given CPU count and data size.

    A stub reader reports ``data_size`` as its in-memory size estimate; the
    first element returned by ``_autodetect_parallelism`` must equal
    ``expected``.
    """

    class MockReader:
        def estimate_inmemory_data_size(self):
            return data_size

    detected, _ = _autodetect_parallelism(
        parallelism=-1,
        cur_pg=None,
        ctx=DatasetContext.get_current(),
        reader=MockReader(),
        avail_cpus=avail_cpus,
    )
    assert detected == expected, (detected, expected)
def test_read(ray_start_regular_shared):
    """Check that a custom attribute on the context is visible to a datasource.

    ``foo`` is set on the current DatasetContext and read back inside
    ``prepare_read``; the dataset's first row must carry that value.
    """

    class CustomDatasource(Datasource):
        def prepare_read(self, parallelism: int):
            captured = DatasetContext.get_current().foo
            metadata = BlockMetadata(
                num_rows=1,
                size_bytes=8,
                schema=None,
                input_files=None,
            )
            return [ReadTask(lambda: [[captured]], metadata)]

    DatasetContext.get_current().foo = 12345
    rows = ray.data.read_datasource(CustomDatasource()).take_all()
    assert rows[0] == 12345
def merge_sorted_blocks(
        blocks: List[Block[T]], key: "SortKeyT", _descending: bool
) -> Tuple[Block[T], BlockMetadata]:
    """Combine blocks into one table using the context's concat-and-sort.

    Blocks without rows are ignored. If nothing remains, an empty table is
    returned instead.

    Args:
        blocks: Blocks to merge.
        key: Sort key passed to the concat-and-sort transform.
        _descending: Sort direction passed to the transform.

    Returns:
        The merged block and its metadata, including execution stats timed
        from the start of this call.
    """
    stats = BlockExecStats.builder()
    non_empty = [b for b in blocks if b.num_rows > 0]
    if non_empty:
        merge = get_concat_and_sort_transform(DatasetContext.get_current())
        ret = merge(non_empty, key, _descending)
    else:
        ret = ArrowBlockAccessor._empty_table()
    metadata = ArrowBlockAccessor(ret).get_metadata(
        None, exec_stats=stats.build())
    return ret, metadata
def _split(self, n: int,
           splitter: Callable[[Dataset], "DatasetPipeline[T]"]):
    """Split this pipeline into ``n`` pipelines fed by a shared coordinator.

    Args:
        n: Number of output pipelines.
        splitter: Callable handed to the coordinator together with this
            pipeline.

    Returns:
        A list of ``n`` DatasetPipelines, each pulling datasets for its own
        split index from the coordinator. Progress bars are disabled on them.

    Raises:
        RuntimeError: If this pipeline has already been executed.
    """
    # Reject a second read before spawning the coordinator, so a failed call
    # does not leave an unused actor behind.
    if self._executed[0]:
        raise RuntimeError("Pipeline cannot be read multiple times.")

    # The coordinator is created before the executed flag is flipped, the
    # same relative order as before this check was moved up.
    coordinator = PipelineSplitExecutorCoordinator.remote(
        self, n, splitter, DatasetContext.get_current())
    self._executed[0] = True

    class SplitIterator:
        def __init__(self, split_index, coordinator):
            self.split_index = split_index
            self.coordinator = coordinator
            # Number of polls after which a warning is printed; doubled
            # after every warning.
            self.warn_threshold = 100
            # Seconds to sleep between polls.
            self.wait_delay_s = 0.1

        def __iter__(self):
            return self

        def __next__(self):
            ds = None
            tries = 0
            while ds is None:
                ds = ray.get(
                    self.coordinator.next_dataset_if_ready.remote(
                        self.split_index))
                # Wait for other shards to catch up reading. Test against
                # None explicitly, matching the loop condition, rather than
                # relying on the truthiness of the returned dataset.
                if ds is None:
                    time.sleep(self.wait_delay_s)
                    tries += 1
                if tries > self.warn_threshold:
                    print("Warning: reader on shard {} of the pipeline "
                          "has been blocked more than {}s waiting for "
                          "other readers to catch up. All pipeline shards "
                          "must be read from concurrently.".format(
                              self.split_index,
                              self.wait_delay_s * self.warn_threshold,
                          ))
                    self.warn_threshold *= 2
            return lambda: ds

    return [
        # Disable progress bars for the split readers since they would
        # overwhelm the console.
        DatasetPipeline(
            SplitIterator(idx, coordinator),
            length=self._length,
            progress_bars=False,
        ) for idx in range(n)
    ]
def can_fuse(self, prev: Stage):
    """Return whether ``prev`` can be fused into this stage.

    Fusion requires all of: shuffle-stage fusion enabled on the context, this
    stage supporting a block UDF, ``prev`` being a ``OneToOneStage`` computed
    with "tasks", and every remote arg of ``prev`` being inheritable.
    """
    context = DatasetContext.get_current()
    # TODO(ekl) also support fusing shuffle stages to subsequent 1:1 stages.
    return bool(
        context.optimize_fuse_shuffle_stages
        and self.supports_block_udf
        and isinstance(prev, OneToOneStage)
        and prev.compute == "tasks"
        and all(k in INHERITABLE_REMOTE_ARGS for k in prev.ray_remote_args)
    )
def test_dataset_stats_read_parquet(ray_start_regular_shared, tmp_path):
    """Compare the stats of a fused parquet read + map with a fixed report.

    A 1000-row range dataset is written to ``tmp_path`` as parquet, read
    back, and mapped with an identity function while stage fusion is enabled.
    """
    context = DatasetContext.get_current()
    context.optimize_fuse_stages = True
    ds = ray.data.range(1000, parallelism=10)
    ds.write_parquet(str(tmp_path))
    ds = ray.data.read_parquet(str(tmp_path)).map(lambda x: x)
    stats = canonicalize(ds.stats())
    # NOTE(review): this is an exact string comparison, so whitespace and line
    # breaks inside the literal are significant -- confirm they match the
    # layout produced by canonicalize().
    assert (stats == """Stage N read->map: N/N blocks executed in T * Remote wall time: T min, T max, T mean, T total * Remote cpu time: T min, T max, T mean, T total * Output num rows: N min, N max, N mean, N total * Output size bytes: N min, N max, N mean, N total * Tasks per node: N min, N max, N mean; N nodes used """)
def test_auto_parallelism_basic(shutdown_only):
    """Check auto-detected block counts for three tensor dataset sizes.

    Runs on 8 CPUs with the context's minimum parallelism set to 1 and
    ``parallelism=-1`` on every dataset.
    """
    ray.init(num_cpus=8)
    DatasetContext.get_current().min_parallelism = 1

    cases = [
        # Datasource bound.
        (5, 5),
        # CPU bound. TODO(ekl) we should fix range datasource to respect
        # parallelism more properly, currently it can go a little over.
        (10000, 16),
        # Block size bound.
        (100000000, 150),
    ]
    for num_rows, expected_blocks in cases:
        ds = ray.data.range_tensor(num_rows, shape=(100,), parallelism=-1)
        assert ds.num_blocks() == expected_blocks, ds
def test_dataset_pipeline_split_stats_basic(ray_start_regular_shared):
    """Compare the stats of one pipeline split against a fixed report.

    A 1000-row range dataset is repeated twice and split in two; each split
    is consumed in its own remote task, which returns that split's stats.
    Only the first split's stats are compared.
    """
    context = DatasetContext.get_current()
    context.optimize_fuse_stages = True
    ds = ray.data.range(1000, parallelism=10)
    pipe = ds.repeat(2)

    @ray.remote
    def consume(split):
        # Drain the split; the batches themselves are not inspected.
        for batch in split.iter_batches():
            pass
        return split.stats()

    s0, s1 = pipe.split(2)
    # Both consumers are submitted before waiting on either result.
    stats = ray.get([consume.remote(s0), consume.remote(s1)])
    # NOTE(review): this is an exact string comparison, so whitespace and line
    # breaks inside the literal are significant -- confirm they match the
    # layout produced by canonicalize().
    assert (canonicalize(stats[0]) == """== Pipeline Window Z == Stage N read: N/N blocks executed in T * Remote wall time: T min, T max, T mean, T total * Remote cpu time: T min, T max, T mean, T total * Output num rows: N min, N max, N mean, N total * Output size bytes: N min, N max, N mean, N total * Tasks per node: N min, N max, N mean; N nodes used Dataset iterator time breakdown: * In ray.wait(): T * In ray.get(): T * In format_batch(): T * In user code: T * Total time: T == Pipeline Window N == Stage N read: N/N blocks executed in T * Remote wall time: T min, T max, T mean, T total * Remote cpu time: T min, T max, T mean, T total * Output num rows: N min, N max, N mean, N total * Output size bytes: N min, N max, N mean, N total * Tasks per node: N min, N max, N mean; N nodes used Dataset iterator time breakdown: * In ray.wait(): T * In ray.get(): T * In format_batch(): T * In user code: T * Total time: T ##### Overall Pipeline Time Breakdown ##### * Time stalled waiting for next dataset: T min, T max, T mean, T total * Time in dataset iterator: T * Time in user code: T * Total time: T """)
def _read_pieces(
        block_udf, reader_args, columns, schema,
        serialized_pieces: List[_SerializedPiece]
) -> Iterator["pyarrow.Table"]:
    """Read serialized Parquet fragments and yield buffered output blocks.

    Args:
        block_udf: Passed to the output buffer as its block UDF.
        reader_args: Extra keyword arguments for ``to_batches``; the
            ``use_threads`` entry is popped from this dict.
        columns: Column selection passed to ``to_batches``.
        schema: Schema passed to ``to_batches`` and ``Table.from_batches``.
        serialized_pieces: Serialized fragments to deserialize and read.

    Yields:
        Blocks released by the output buffer while non-empty tables are
        added, followed by any block still available after finalization.
    """
    # Deserialize after loading the filesystem class.
    pieces: List[
        "pyarrow._dataset.ParquetFileFragment"
    ] = _deserialize_pieces_with_retry(serialized_pieces)

    # At least one dataset fragment must be read.
    assert len(pieces) > 0

    import pyarrow as pa
    from pyarrow.dataset import _get_partition_keys

    ctx = DatasetContext.get_current()
    output_buffer = BlockOutputBuffer(
        block_udf=block_udf,
        target_max_block_size=ctx.target_max_block_size,
    )

    logger.debug(f"Reading {len(pieces)} parquet pieces")
    use_threads = reader_args.pop("use_threads", False)
    for piece in pieces:
        part = _get_partition_keys(piece.partition_expression)
        for batch in piece.to_batches(
                use_threads=use_threads,
                columns=columns,
                schema=schema,
                batch_size=PARQUET_READER_ROW_BATCH_SIZE,
                **reader_args,
        ):
            table = pa.Table.from_batches([batch], schema=schema)
            if part:
                # Fill each partition column with its constant value.
                for col, value in part.items():
                    col_index = table.schema.get_field_index(col)
                    col_values = pa.array([value] * len(table))
                    table = table.set_column(col_index, col, col_values)
            # Empty tables are dropped.
            if table.num_rows <= 0:
                continue
            output_buffer.add_block(table)
            if output_buffer.has_next():
                yield output_buffer.next()

    output_buffer.finalize()
    if output_buffer.has_next():
        yield output_buffer.next()
def read_files(
        read_paths: List[str],
        fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper],
) -> Iterable[Block]:
    """Read the given files and yield buffered output blocks.

    For every path the compression codec is taken from the ``compression``
    open-stream argument when present, otherwise detected from the path.
    Snappy is forwarded to the stream reader as a reader argument; any other
    codec is forwarded to the input-source opener.

    Args:
        read_paths: Paths of the files to read.
        fs: Filesystem to read from; an ``_S3FileSystemWrapper`` is unwrapped
            first.

    Yields:
        Blocks released by the output buffer while data is added, followed by
        any block still available after finalization.
    """
    logger.debug(f"Reading {len(read_paths)} files.")
    if isinstance(fs, _S3FileSystemWrapper):
        fs = fs.unwrap()
    ctx = DatasetContext.get_current()
    output_buffer = BlockOutputBuffer(
        block_udf=_block_udf,
        target_max_block_size=ctx.target_max_block_size)
    for read_path in read_paths:
        # Work on per-file copies. Mutating the shared argument dicts would
        # carry the codec chosen for one file over to every later file (and
        # to any re-run of this function).
        file_open_args = dict(open_stream_args)
        file_reader_args = dict(reader_args)

        compression = file_open_args.pop("compression", None)
        if compression is None:
            import pyarrow as pa
            try:
                # If no compression manually given, try to detect
                # compression codec from path.
                compression = pa.Codec.detect(read_path).name
            except (ValueError, TypeError):
                # Arrow's compression inference on the file path
                # doesn't work for Snappy, so we double-check ourselves.
                import pathlib
                if pathlib.Path(read_path).suffix == ".snappy":
                    compression = "snappy"
                else:
                    compression = None

        if compression == "snappy":
            # Pass Snappy compression as a reader arg, so datasource
            # subclasses can manually handle streaming decompression in
            # self._read_stream().
            file_reader_args["compression"] = compression
            file_reader_args["filesystem"] = fs
        elif compression is not None:
            # Non-Snappy compression, pass as open_input_stream() arg so
            # Arrow can take care of streaming decompression for us.
            file_open_args["compression"] = compression

        with self._open_input_source(fs, read_path, **file_open_args) as f:
            for data in read_stream(f, read_path, **file_reader_args):
                output_buffer.add_block(data)
                if output_buffer.has_next():
                    yield output_buffer.next()
    output_buffer.finalize()
    if output_buffer.has_next():
        yield output_buffer.next()
def test_memory_release_lazy(shutdown_only):
    """Check that three fused lazy map stages run without spilling.

    Each map replaces every row with a 100 MiB uint8 array of ones. After
    full execution the memory summary must not mention "Spilled".
    """
    # Stage fusion must be enabled for this test.
    DatasetContext.get_current().optimize_fuse_stages = True
    info = ray.init(num_cpus=1, object_store_memory=1500e6)

    # The maps below should get fused into a single stage.
    ds = ray.data.range(10).experimental_lazy()
    for _ in range(3):
        ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
    ds.fully_executed()

    meminfo = memory_summary(info.address_info["address"], stats_only=True)
    assert "Spilled" not in meminfo, meminfo
def _optimize(self) -> Tuple[BlockList, DatasetStats, List[Stage]]:
    """Apply stage fusion optimizations.

    Returns:
        The (possibly updated) source block list, its stats, and the list of
        optimized stages. When stage fusion is disabled on the context, the
        source blocks, stats and stages are returned as obtained.
    """
    ctx = DatasetContext.get_current()
    blocks, stats, stages = self._get_source_blocks_and_stages()
    if not ctx.optimize_fuse_stages:
        return blocks, stats, stages

    if ctx.optimize_fuse_read_stages:
        # If using a lazy datasource, rewrite read stage into one-to-one
        # stage so it can be fused into downstream stages.
        blocks, stats, stages = _rewrite_read_stages(
            blocks, stats, stages, self._dataset_uuid)
    stages = _fuse_one_to_one_stages(stages)
    # Keep the fused stages on the instance.
    self._last_optimized_stages = stages
    return blocks, stats, stages
def process_block(
        self, block: Block, input_files: List[str]
) -> Iterable[Tuple[Block, BlockMetadata]]:
    """Apply ``fn`` to ``block`` and store every block it yields.

    Args:
        block: Input block handed to ``fn``.
        input_files: Recorded in the metadata of every output block.

    Returns:
        A list of ``(block_ref, metadata)`` pairs, one per block produced by
        ``fn``, in order.
    """
    stored = []
    for produced in fn(block):
        acc = BlockAccessor.for_block(produced)
        produced_meta = BlockMetadata(
            num_rows=acc.num_rows(),
            size_bytes=acc.size_bytes(),
            schema=acc.schema(),
            input_files=input_files,
        )
        # The stored block is owned by the context's configured block owner.
        block_owner = DatasetContext.get_current().block_owner
        produced_ref = ray.put(produced, _owner=block_owner)
        stored.append((produced_ref, produced_meta))
    return stored
def test_optimize_equivalent_remote_args(ray_start_regular_shared):
    """Check that stages with equivalent remote args are fused together.

    Every ordered pair of the keyword-argument sets below is applied to two
    consecutive ``map_batches`` calls; each pipeline must execute as the
    single stage ``read->map_batches->map_batches``.
    """
    ctx = DatasetContext.get_current()
    ctx.optimize_fuse_stages = True
    ctx.optimize_fuse_read_stages = True
    ctx.optimize_fuse_shuffle_stages = True

    equivalent_kwargs = [
        {},
        {"resources": {"blah": 0}},
        {"resources": {"blah": None}},
        {"num_cpus": None},
        {"num_cpus": 1},
        {"num_cpus": 1, "num_gpus": 0},
        {"num_cpus": 1, "num_gpus": None},
    ]
    for kwa in equivalent_kwargs:
        for kwb in equivalent_kwargs:
            print("CHECKING", kwa, kwb)
            pipe = (
                ray.data.range(3)
                .repeat(2)
                .map_batches(lambda x: x, compute="tasks", **kwa)
                .map_batches(lambda x: x, compute="tasks", **kwb)
            )
            pipe.take()
            expect_stages(pipe, 1, ["read->map_batches->map_batches"])
def test_dataset_stats_shuffle(ray_start_regular_shared):
    """Compare the stats of a shuffle followed by a repartition with a fixed
    report.

    With stage fusion enabled, a 1000-row range dataset is randomly shuffled
    and then repartitioned into one block with ``shuffle=True``.
    """
    context = DatasetContext.get_current()
    context.optimize_fuse_stages = True
    ds = ray.data.range(1000, parallelism=10)
    ds = ds.random_shuffle().repartition(1, shuffle=True)
    stats = canonicalize(ds.stats())
    # NOTE(review): this is an exact string comparison, so whitespace and line
    # breaks inside the literal are significant -- confirm they match the
    # layout produced by canonicalize().
    assert (
        stats == """Stage N read->random_shuffle: executed in T Substage Z read->random_shuffle_map: N/N blocks executed * Remote wall time: T min, T max, T mean, T total * Remote cpu time: T min, T max, T mean, T total * Peak heap memory usage (MiB): N min, N max, N mean * Output num rows: N min, N max, N mean, N total * Output size bytes: N min, N max, N mean, N total * Tasks per node: N min, N max, N mean; N nodes used Substage N random_shuffle_reduce: N/N blocks executed * Remote wall time: T min, T max, T mean, T total * Remote cpu time: T min, T max, T mean, T total * Peak heap memory usage (MiB): N min, N max, N mean * Output num rows: N min, N max, N mean, N total * Output size bytes: N min, N max, N mean, N total * Tasks per node: N min, N max, N mean; N nodes used Stage N repartition: executed in T Substage Z repartition_map: N/N blocks executed * Remote wall time: T min, T max, T mean, T total * Remote cpu time: T min, T max, T mean, T total * Peak heap memory usage (MiB): N min, N max, N mean * Output num rows: N min, N max, N mean, N total * Output size bytes: N min, N max, N mean, N total * Tasks per node: N min, N max, N mean; N nodes used Substage N repartition_reduce: N/N blocks executed * Remote wall time: T min, T max, T mean, T total * Remote cpu time: T min, T max, T mean, T total * Peak heap memory usage (MiB): N min, N max, N mean * Output num rows: N min, N max, N mean, N total * Output size bytes: N min, N max, N mean, N total * Tasks per node: N min, N max, N mean; N nodes used """ )
def _get_or_create_stats_actor():
    """Return the cached stats actor handle, creating it when needed.

    The cached handle is reused only if it is set, Ray is initialized, and
    the cached job id matches the current job. Otherwise the named actor is
    fetched or created, the cache is refreshed, and a post-init hook is
    registered that clears the cached handle.
    """
    cache_is_current = (
        _stats_actor[0]
        and ray.is_initialized()
        and _stats_actor[1] == ray.get_runtime_context().job_id.hex()
    )
    if cache_is_current:
        return _stats_actor[0]

    # Need to re-create it if Ray restarts (mostly for unit tests).
    ctx = DatasetContext.get_current()
    actor_cls = _StatsActor.options(
        name="datasets_stats_actor",
        get_if_exists=True,
        scheduling_strategy=ctx.scheduling_strategy,
    )
    _stats_actor[0] = actor_cls.remote()
    _stats_actor[1] = ray.get_runtime_context().job_id.hex()

    # Clear the actor handle after Ray reinits since it's no longer valid.
    def clear_actor():
        _stats_actor[0] = None

    ray.worker._post_init_hooks.append(clear_actor)
    return _stats_actor[0]
def __call__(self) -> BlockPartition:
    """Run the read function and store every block it returns.

    Returns:
        A non-empty list of ``(block_ref, metadata)`` pairs, one per block
        returned by the read function. The metadata carries this task's
        input files.

    Raises:
        ValueError: If the read function returns no blocks.
    """
    import warnings

    context = DatasetContext.get_current()
    result = self._read_fn()
    if not hasattr(result, "__iter__"):
        # Emit the warning through the warnings machinery; merely
        # constructing the exception object has no effect.
        warnings.warn(
            "Read function must return Iterable[Block], got {}. "
            "Probably you need to return `[block]` instead of "
            "`block`.".format(result),
            DeprecationWarning,
        )
    partition: BlockPartition = []
    for block in result:
        metadata = BlockAccessor.for_block(block).get_metadata(
            input_files=self._metadata.input_files)
        # A block owner must be configured on the context before blocks can
        # be stored.
        assert context.block_owner
        partition.append((ray.put(block, _owner=context.block_owner),
                          metadata))
    if len(partition) == 0:
        raise ValueError("Read task must return non-empty list.")
    return partition