def from_vineyard(object_id):
    """Create a dataset from the chunks of a vineyard object.

    Every chunk is converted to a block by a task pinned to the placement
    group bundle whose vineyard instance is recorded as the chunk's location.

    Args:
        object_id: Identifier handed to the remote chunk-map lookup.

    Returns:
        Dataset with one block per entry of the chunk map.

    Raises:
        KeyError: If a chunk's location is not among the instance ids
            reported by the placement group bundles.
    """
    vineyard_to_block = cached_remote_fn(
        _vineyard_to_block, num_cpus=0.1, num_returns=2)
    get_vineyard_instance_id = cached_remote_fn(
        _get_vineyard_instance_id, num_cpus=0.1)
    get_remote_chunks_map = cached_remote_fn(
        _get_remote_chunks_map, num_cpus=0.1)

    chunks = ray.get(get_remote_chunks_map.remote(object_id))

    with spread_to_all_nodes(get_vineyard_instance_id) as (nodes, pg):
        # Submit one probe per bundle first and fetch them together, so the
        # probes run concurrently instead of one blocking ray.get() per node.
        instance_refs = [
            get_vineyard_instance_id.options(
                placement_group=pg,
                placement_group_bundle_index=index).remote()
            for index in range(nodes)
        ]
        # instance_id -> placement group index
        instances = {
            instance: index
            for index, instance in enumerate(ray.get(instance_refs))
        }

        blocks, metadatas = [], []
        # A distinct loop name keeps the ``object_id`` parameter intact.
        for chunk_id, location in chunks.items():
            block, metadata = vineyard_to_block.options(
                placement_group=pg,
                placement_group_bundle_index=instances[location]).remote(
                    vineyard.ObjectID(chunk_id))
            blocks.append(block)
            metadatas.append(metadata)

        return Dataset(BlockList(blocks, ray.get(metadatas)))
def _apply(
    self,
    fn: Any,
    remote_args: dict,
    block_list: BlockList,
    clear_input_blocks: bool,
) -> BlockList:
    """Run ``fn`` over every block of ``block_list`` as remote map tasks.

    Args:
        fn: Passed through to the remote map function for each block.
        remote_args: Options applied to the remote map function.
        block_list: Input blocks; returned unchanged when it has no blocks.
        clear_input_blocks: When true, drop the local input references and
            clear ``block_list`` once every task has been submitted.

    Returns:
        A new BlockList assembled from the map task outputs.

    Raises:
        ray.exceptions.RayTaskError, KeyboardInterrupt: Re-raised after all
            map tasks have been cancelled and awaited.
    """
    context = DatasetContext.get_current()

    # An empty dataset has nothing to map.
    if block_list.initial_num_blocks() == 0:
        return block_list

    blocks = block_list.get_blocks_with_metadata()
    map_bar = ProgressBar("Map Progress", total=len(blocks))

    split_enabled = context.block_splitting_enabled
    if split_enabled:
        map_block = cached_remote_fn(_map_block_split).options(**remote_args)
        refs = [map_block.remote(b, fn, m.input_files) for b, m in blocks]
    else:
        map_block = cached_remote_fn(_map_block_nosplit).options(
            **dict(remote_args, num_returns=2)
        )
        all_refs = [map_block.remote(b, fn, m.input_files) for b, m in blocks]
        # Each task yields (data ref, metadata ref); only the latter is waited on.
        data_refs = [pair[0] for pair in all_refs]
        refs = [pair[1] for pair in all_refs]

    # Let go of the input block references.
    if clear_input_blocks:
        del blocks
        block_list.clear()

    # Wait on the non-data refs, whichever path produced them.
    try:
        results = map_bar.fetch_until_complete(refs)
    except (ray.exceptions.RayTaskError, KeyboardInterrupt) as e:
        # A mapper failed or a SIGINT arrived while waiting: cancel everything.
        for pending in refs:
            ray.cancel(pending)
        # Block until each task has either failed or been cancelled.
        for pending in refs:
            try:
                ray.get(pending)
            except (ray.exceptions.RayTaskError,
                    ray.exceptions.TaskCancelledError):
                pass
        # Surface the original failure.
        raise e from None

    if split_enabled:
        pairs = [pair for result in results for pair in result]
    else:
        pairs = list(zip(data_refs, results))

    new_blocks = [block for block, _ in pairs]
    new_metadata = [metadata for _, metadata in pairs]
    return BlockList(new_blocks, new_metadata)
def sort_impl(blocks: BlockList, key: SortKeyT,
              descending: bool = False) -> Tuple[BlockList, dict]:
    """Sort the given blocks with a sample / partition / merge pipeline.

    Args:
        blocks: Input block list.
        key: Sort key; a string is expanded to a one-element key list.
        descending: Sort direction. When ``key`` is a list, the direction of
            its first entry is used instead.

    Returns:
        ``(sorted_block_list, stage_info)`` where ``stage_info`` holds the
        fetched metadata under ``"map"`` and ``"merge"`` (empty dict for an
        empty input).
    """
    stage_info = {}
    blocks = blocks.get_blocks()
    if len(blocks) == 0:
        return BlockList([], []), stage_info

    if isinstance(key, str):
        direction = "descending" if descending else "ascending"
        key = [(key, direction)]
    if isinstance(key, list):
        descending = key[0][1] == "descending"

    num_mappers = num_reducers = len(blocks)
    boundaries = sample_boundaries(blocks, key, num_reducers)
    if descending:
        boundaries.reverse()

    # Each map task returns one partition per reducer plus trailing metadata.
    sort_block = cached_remote_fn(_sort_block).options(
        num_returns=num_reducers + 1)
    merge_sorted_blocks = cached_remote_fn(
        _merge_sorted_blocks, num_returns=2)

    map_results = np.empty((num_mappers, num_reducers), dtype=object)
    map_meta = []
    for row, block in enumerate(blocks):
        outputs = sort_block.remote(block, boundaries, key, descending)
        map_results[row, :] = outputs[:-1]
        map_meta.append(outputs[-1])

    # Release the inputs early.
    del blocks

    map_bar = ProgressBar("Sort Map", len(map_results))
    map_bar.block_until_complete(map_meta)
    map_bar.close()
    stage_info["map"] = ray.get(map_meta)

    # Reducer j merges column j of the map outputs.
    reduce_results = [
        merge_sorted_blocks.remote(
            key, descending, *map_results[:, col].tolist())
        for col in range(num_reducers)
    ]

    # Release the map outputs early.
    del map_results

    merge_bar = ProgressBar("Sort Merge", len(reduce_results))
    merge_bar.block_until_complete([pair[0] for pair in reduce_results])
    merge_bar.close()

    sorted_blocks = [block for block, _ in reduce_results]
    metadata = ray.get([meta for _, meta in reduce_results])
    stage_info["merge"] = metadata
    return BlockList(sorted_blocks, metadata), stage_info
def do_agg(blocks, clear_input_blocks: bool, block_udf):
    """Aggregate ``blocks`` with a partition/combine map and a reduce stage.

    Relies on ``self`` and ``aggs`` from the enclosing scope.

    Args:
        blocks: Input block list.
        clear_input_blocks: Currently unused (see TODO below).
        block_udf: Unused by this function.

    Returns:
        ``(block_list, stage_info)``; ``stage_info`` has ``"map"`` and
        ``"reduce"`` metadata, or is empty for an empty input.

    Raises:
        ValueError: If ``aggs`` is empty.
    """
    # TODO: implement clear_input_blocks
    stage_info = {}
    if len(aggs) == 0:
        raise ValueError("Aggregate requires at least one aggregation")
    for agg in aggs:
        agg._validate(self._dataset)

    # Empty dataset: nothing to aggregate.
    num_mappers = blocks.initial_num_blocks()
    if num_mappers == 0:
        return blocks, stage_info

    if self._key is None:
        # Without a key everything is reduced by a single task.
        num_reducers = 1
        boundaries = []
    else:
        num_reducers = num_mappers
        if isinstance(self._key, str):
            sample_key = [(self._key, "ascending")]
        else:
            sample_key = self._key
        boundaries = sort.sample_boundaries(
            blocks.get_blocks(), sample_key, num_reducers)

    # Map tasks return one partition per reducer plus trailing metadata.
    partition_and_combine_block = cached_remote_fn(
        _partition_and_combine_block).options(num_returns=num_reducers + 1)
    aggregate_combined_blocks = cached_remote_fn(
        _aggregate_combined_blocks, num_returns=2)

    map_results = np.empty((num_mappers, num_reducers), dtype=object)
    map_meta = []
    for row, block in enumerate(blocks.get_blocks()):
        outputs = partition_and_combine_block.remote(
            block, boundaries, self._key, aggs)
        map_results[row, :] = outputs[:-1]
        map_meta.append(outputs[-1])

    map_bar = ProgressBar("GroupBy Map", len(map_results))
    map_bar.block_until_complete(map_meta)
    stage_info["map"] = ray.get(map_meta)
    map_bar.close()

    out_blocks = []
    meta_refs = []
    for col in range(num_reducers):
        block, meta = aggregate_combined_blocks.remote(
            num_reducers, self._key, aggs, *map_results[:, col].tolist())
        out_blocks.append(block)
        meta_refs.append(meta)

    reduce_bar = ProgressBar("GroupBy Reduce", len(out_blocks))
    reduce_bar.block_until_complete(out_blocks)
    reduce_bar.close()

    metadata = ray.get(meta_refs)
    stage_info["reduce"] = metadata
    return BlockList(out_blocks, metadata), stage_info
def from_pandas_refs(
    dfs: Union[ObjectRef["pandas.DataFrame"], List[ObjectRef["pandas.DataFrame"]]]
) -> Dataset[ArrowRow]:
    """Create a dataset from a list of Ray object references to Pandas dataframes.

    Args:
        dfs: A Ray object reference to a pandas dataframe, or a list of
             Ray object references to pandas dataframes.

    Returns:
        Dataset built from the dataframes. When the current context has
        ``enable_pandas_block`` set, the references are used as blocks
        directly; otherwise each dataframe is converted by a remote task.

    Raises:
        ValueError: If ``dfs`` is neither an object ref nor a list of
            object refs.
    """
    if isinstance(dfs, ray.ObjectRef):
        dfs = [dfs]
    elif isinstance(dfs, list):
        for df in dfs:
            if not isinstance(df, ray.ObjectRef):
                raise ValueError(
                    "Expected list of Ray object refs, "
                    f"got list containing {type(df)}"
                )
    else:
        # Report the type of the argument itself: the loop variable ``df`` is
        # not bound on this path, so formatting it raised NameError instead
        # of the intended ValueError.
        raise ValueError(
            "Expected Ray object ref or list of Ray object refs, "
            f"got {type(dfs)}"
        )

    context = DatasetContext.get_current()
    if context.enable_pandas_block:
        # The dataframe refs serve as the blocks; only metadata is computed.
        get_metadata = cached_remote_fn(_get_metadata)
        metadata = [get_metadata.remote(df) for df in dfs]
        return Dataset(
            ExecutionPlan(BlockList(dfs, ray.get(metadata)), DatasetStats.TODO()),
            0,
            False,
        )

    df_to_block = cached_remote_fn(_df_to_block, num_returns=2)
    res = [df_to_block.remote(df) for df in dfs]
    blocks, metadata = zip(*res)
    return Dataset(
        ExecutionPlan(
            BlockList(blocks, ray.get(list(metadata))),
            DatasetStats(stages={"from_pandas_refs": metadata}, parent=None),
        ),
        0,
        False,
    )
def read_datasource(datasource: Datasource[T],
                    *,
                    parallelism: int = 200,
                    ray_remote_args: Dict[str, Any] = None,
                    **read_args) -> Dataset[T]:
    """Build a dataset by reading from a custom datasource.

    Args:
        datasource: Source whose ``prepare_read`` produces the read tasks.
        parallelism: Requested read parallelism, forwarded to
            ``prepare_read``.
        ray_remote_args: Options for the remote read function; ``None``
            means no extra options.
        read_args: Extra keyword arguments forwarded to ``prepare_read``.

    Returns:
        Dataset backed by a lazy block list with one entry per read task.
    """
    read_tasks = datasource.prepare_read(parallelism, **read_args)

    def remote_read(task: ReadTask) -> Block:
        return task()

    remote_options = {} if ray_remote_args is None else ray_remote_args
    remote_read = cached_remote_fn(remote_read, **remote_options)

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []
    for read_task in read_tasks:
        # Bind the task as a default so each thunk keeps its own task.
        calls.append(lambda task=read_task: remote_read.remote(task))
        metadata.append(read_task.get_metadata())

    block_list = LazyBlockList(calls, metadata)

    schema_missing = bool(metadata) and metadata[0].schema is None
    if not schema_missing:
        return Dataset(block_list)

    # Fetch the schema of the first block synchronously and record it.
    get_schema = cached_remote_fn(_get_schema)
    schema0 = ray.get(get_schema.remote(next(iter(block_list))))
    head = metadata[0]
    block_list.set_metadata(
        0,
        BlockMetadata(
            num_rows=head.num_rows,
            size_bytes=head.size_bytes,
            schema=schema0,
            input_files=head.input_files,
        ))
    return Dataset(block_list)
def sample_boundaries(blocks: BlockList[T], key: SortKeyT,
                      num_reducers: int) -> List[T]:
    """
    Return (num_reducers - 1) items in ascending order from the blocks that
    partition the domain into ranges with approximately equally many elements.

    Args:
        blocks: Blocks to sample from; must contain at least one block, since
            the per-block sample count divides by ``len(blocks)``.
        key: Sort key forwarded to the remote sampling function.
        num_reducers: Number of ranges the boundaries should delimit.

    Returns:
        The sampled items nearest to the quantiles ``k / num_reducers`` for
        ``k = 1 .. num_reducers - 1``.
    """
    n_samples = int(num_reducers * 10 / len(blocks))

    sample_block = cached_remote_fn(_sample_block)
    sample_results = [
        sample_block.remote(block, n_samples, key) for block in blocks
    ]
    sample_bar = ProgressBar("Sort Sample", len(sample_results))
    sample_bar.block_until_complete(sample_results)
    sample_bar.close()

    samples = ray.get(sample_results)
    sample_items = np.concatenate(samples)
    sample_items.sort()

    # Build the fractions from an integer range: np.arange with a float step
    # (0, 1, 1 / num_reducers) has an unstable length and can yield an extra
    # element, which produced num_reducers boundaries instead of
    # num_reducers - 1.
    fractions = np.arange(1, num_reducers) / num_reducers
    return [
        np.quantile(sample_items, q, interpolation="nearest")
        for q in fractions
    ]
def do_write(self,
             blocks: List[ObjectRef[Block]],
             metadata: List[BlockMetadata],
             path: str,
             dataset_uuid: str,
             filesystem: Optional["pyarrow.fs.FileSystem"] = None,
             **write_args) -> List[ObjectRef[WriteResult]]:
    """Creates and returns write tasks for a file-based datasource.

    One remote task is submitted per block; block ``i`` is written to
    ``<path>/<dataset_uuid>_<i, zero-padded to 6>.<file_format>``.

    Returns:
        The object refs of the submitted write tasks, in block order.
    """
    resolved_paths, filesystem = _resolve_paths_and_filesystem(
        path, filesystem)
    path = resolved_paths[0]
    filesystem = _wrap_s3_serialization_workaround(filesystem)

    # Bind the writer to a local so the remote closure does not reference
    # ``self`` directly.
    _write_block_to_file = self._write_block

    def write_block(write_path: str, block: Block):
        logger.debug(f"Writing {write_path} file.")
        if isinstance(filesystem, _S3FileSystemWrapper):
            fs = filesystem.unwrap()
        else:
            fs = filesystem
        with fs.open_output_stream(write_path) as f:
            _write_block_to_file(f, BlockAccessor.for_block(block))

    write_block = cached_remote_fn(write_block)

    file_format = self._file_format()
    return [
        write_block.remote(
            os.path.join(
                path, f"{dataset_uuid}_{block_idx:06}.{file_format}"),
            block)
        for block_idx, block in enumerate(blocks)
    ]
def from_arrow_refs(
    tables: Union[
        ObjectRef[Union["pyarrow.Table", bytes]],
        List[ObjectRef[Union["pyarrow.Table", bytes]]],
    ]
) -> Dataset[ArrowRow]:
    """Create a dataset from Ray object references to Arrow tables.

    Args:
        tables: A single Ray object reference, or a list of them, each
            pointing at an Arrow table or its streaming format in bytes.

    Returns:
        Dataset holding Arrow records from the tables.
    """
    table_refs = [tables] if isinstance(tables, ray.ObjectRef) else tables

    get_metadata = cached_remote_fn(_get_metadata)
    metadata_refs = [get_metadata.remote(ref) for ref in table_refs]

    block_list = BlockList(table_refs, ray.get(metadata_refs))
    stats = DatasetStats(
        stages={"from_arrow_refs": metadata_refs}, parent=None)
    return Dataset(ExecutionPlan(block_list, stats), 0, False)
def do_write(self,
             blocks: List[ObjectRef[Block]],
             metadata: List[BlockMetadata],
             path: str,
             dataset_uuid: str,
             filesystem: Optional["pyarrow.fs.FileSystem"] = None,
             try_create_dir: bool = True,
             open_stream_args: Optional[Dict[str, Any]] = None,
             block_path_provider:
             BlockWritePathProvider = DefaultBlockWritePathProvider(),
             write_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
             _block_udf: Optional[Callable[[Block], Block]] = None,
             **write_args) -> List[ObjectRef[WriteResult]]:
    """Creates and returns write tasks for a file-based datasource.

    Args:
        blocks: Refs of the blocks to write, one file per block.
        metadata: Not used by this method.
        path: Output location; the first resolved path is used.
        dataset_uuid: Forwarded to the block path provider.
        filesystem: Optional filesystem, resolved together with ``path``.
        try_create_dir: When true, create ``path`` (recursively) first.
        open_stream_args: Keyword arguments for ``open_output_stream``.
        block_path_provider: Callable producing each block's write path; a
            falsy value is replaced by ``DefaultBlockWritePathProvider()``.
        write_args_fn: Forwarded to the block writer as ``writer_args_fn``.
        _block_udf: Optional transform applied to a block before writing.
        write_args: Extra keyword arguments forwarded to the block writer.

    Returns:
        The object refs of the submitted write tasks, in block order.
    """
    resolved_paths, filesystem = _resolve_paths_and_filesystem(
        path, filesystem)
    path = resolved_paths[0]
    if try_create_dir:
        filesystem.create_dir(path, recursive=True)
    filesystem = _wrap_s3_serialization_workaround(filesystem)

    # Bind the writer to a local so the remote closure does not reference
    # ``self`` directly.
    _write_block_to_file = self._write_block

    stream_kwargs = {} if open_stream_args is None else open_stream_args

    def write_block(write_path: str, block: Block):
        logger.debug(f"Writing {write_path} file.")
        if isinstance(filesystem, _S3FileSystemWrapper):
            fs = filesystem.unwrap()
        else:
            fs = filesystem
        if _block_udf is not None:
            block = _block_udf(block)

        with fs.open_output_stream(write_path, **stream_kwargs) as f:
            _write_block_to_file(
                f,
                BlockAccessor.for_block(block),
                writer_args_fn=write_args_fn,
                **write_args)

    write_block = cached_remote_fn(write_block)

    file_format = self._file_format()
    if not block_path_provider:
        block_path_provider = DefaultBlockWritePathProvider()

    return [
        write_block.remote(
            block_path_provider(
                path,
                filesystem=filesystem,
                dataset_uuid=dataset_uuid,
                block=block,
                block_index=block_idx,
                file_format=file_format),
            block)
        for block_idx, block in enumerate(blocks)
    ]
def __init__(
        self,
        dataset: "Dataset[T]",
        key: str,
        num_workers: int,
):
    """Construct a RandomAccessDataset (internal API).

    The constructor is a private API. Use
    ``dataset.to_random_access_dataset()`` to construct a
    RandomAccessDataset.

    The dataset is sorted by ``key``, per-block bounds are computed
    remotely, empty blocks are dropped, and the remaining blocks are
    assigned to ``num_workers`` worker actors.

    Raises:
        ValueError: If the dataset format is neither "arrow" nor "pandas".
    """
    self._format = dataset._dataset_format()
    if self._format not in ["arrow", "pandas"]:
        raise ValueError(
            "RandomAccessDataset only supports Arrow-format datasets.")

    start = time.perf_counter()
    logger.info("[setup] Indexing dataset by sort key.")
    sorted_ds = dataset.sort(key)
    get_bounds = cached_remote_fn(_get_bounds)
    blocks = sorted_ds.get_internal_block_refs()

    logger.info("[setup] Computing block range bounds.")
    bound_refs = [get_bounds.remote(b, key, self._format) for b in blocks]
    bounds = ray.get(bound_refs)

    self._non_empty_blocks = []
    self._lower_bound = None
    self._upper_bounds = []
    for block, bound in zip(blocks, bounds):
        # A falsy bound marks a block that is skipped.
        if not bound:
            continue
        self._non_empty_blocks.append(block)
        if self._lower_bound is None:
            self._lower_bound = bound[0]
        self._upper_bounds.append(bound[1])

    logger.info(
        "[setup] Creating {} random access workers.".format(num_workers))
    self._workers = [
        _RandomAccessWorker.options(scheduling_strategy="SPREAD").remote(
            key, self._format) for _ in range(num_workers)
    ]
    assignments = self._compute_block_to_worker_assignments()
    self._block_to_workers_map, self._worker_to_blocks_map = assignments

    logger.info("[setup] Worker to blocks assignment: {}".format(
        self._worker_to_blocks_map))
    assign_refs = [
        worker.assign_blocks.remote({
            idx: self._non_empty_blocks[idx]
            for idx in self._worker_to_blocks_map[worker]
        }) for worker in self._workers
    ]
    ray.get(assign_refs)

    logger.info("[setup] Finished assigning blocks to workers.")
    self._build_time = time.perf_counter() - start
def sort_impl(blocks: BlockList, key: SortKeyT,
              descending: bool = False) -> BlockList:
    """Sort the given blocks with a sample / partition / merge pipeline.

    Args:
        blocks: Input block list.
        key: Sort key; a string is expanded to a one-element key list.
        descending: Sort direction. When ``key`` is a list, the direction of
            its first entry is used instead.

    Returns:
        BlockList of the merged, sorted blocks (empty for an empty input).
    """
    blocks = list(blocks.iter_blocks())
    if not blocks:
        return BlockList([], [])

    if isinstance(key, str):
        direction = "descending" if descending else "ascending"
        key = [(key, direction)]
    if isinstance(key, list):
        descending = key[0][1] == "descending"

    num_mappers = num_reducers = len(blocks)
    boundaries = sample_boundaries(blocks, key, num_reducers)
    if descending:
        boundaries.reverse()

    sort_block = cached_remote_fn(_sort_block).options(
        num_returns=num_reducers)
    merge_sorted_blocks = cached_remote_fn(
        _merge_sorted_blocks, num_returns=2)

    # Row i holds the partitions produced by mapper i.
    map_results = np.empty((num_mappers, num_reducers), dtype=object)
    for row, block in enumerate(blocks):
        map_results[row, :] = sort_block.remote(
            block, boundaries, key, descending)

    map_bar = ProgressBar("Sort Map", len(map_results))
    map_bar.block_until_complete([row_refs[0] for row_refs in map_results])
    map_bar.close()

    # Reducer j merges column j of the map outputs.
    reduce_results = [
        merge_sorted_blocks.remote(
            key, descending, *map_results[:, col].tolist())
        for col in range(num_reducers)
    ]

    merge_bar = ProgressBar("Sort Merge", len(reduce_results))
    merge_bar.block_until_complete([pair[0] for pair in reduce_results])
    merge_bar.close()

    sorted_blocks = [block for block, _ in reduce_results]
    metadata = ray.get([meta for _, meta in reduce_results])
    return BlockList(sorted_blocks, metadata)
def simple_shuffle(input_blocks: BlockList[T],
                   output_num_blocks: int,
                   *,
                   random_shuffle: bool = False,
                   random_seed: Optional[int] = None) -> BlockList[T]:
    """Shuffle ``input_blocks`` into ``output_num_blocks`` output blocks.

    Every input block is split by a map task into ``output_num_blocks``
    pieces; reduce task ``j`` then combines piece ``j`` of every map output.

    Args:
        input_blocks: Blocks to shuffle.
        output_num_blocks: Number of blocks to produce.
        random_shuffle: Forwarded to the map tasks; when true the order of
            the map outputs is also randomized before reducing.
        random_seed: Seed forwarded to the map tasks and used for the
            reduce-order randomization.

    Returns:
        BlockList of the reduce outputs and their fetched metadata.
    """
    num_inputs = len(input_blocks)

    mapper = cached_remote_fn(_shuffle_map).options(
        num_returns=output_num_blocks)
    reducer = cached_remote_fn(_shuffle_reduce, num_returns=2)

    map_bar = ProgressBar("Shuffle Map", position=0, total=num_inputs)
    map_outputs = [
        mapper.remote(block, idx, output_num_blocks, random_shuffle,
                      random_seed)
        for idx, block in enumerate(input_blocks)
    ]
    if output_num_blocks == 1:
        # With num_returns=1 a bare ref comes back instead of a list.
        map_outputs = [[ref] for ref in map_outputs]

    map_bar.block_until_complete([refs[0] for refs in map_outputs])
    map_bar.close()

    # Randomize the order in which map outputs are reduced.
    if random_shuffle:
        np.random.RandomState(random_seed).shuffle(map_outputs)

    reduce_bar = ProgressBar(
        "Shuffle Reduce", position=0, total=output_num_blocks)
    reduce_outputs = [
        reducer.remote(*[map_outputs[i][j] for i in range(num_inputs)])
        for j in range(output_num_blocks)
    ]
    block_refs, metadata_refs = zip(*reduce_outputs)
    reduce_bar.block_until_complete(list(block_refs))
    fetched_metadata = ray.get(list(metadata_refs))
    reduce_bar.close()

    return BlockList(list(block_refs), list(fetched_metadata))
def from_arrow(tables: List[ObjectRef["pyarrow.Table"]]) -> Dataset[ArrowRow]:
    """Create a dataset from a set of Arrow tables.

    Args:
        tables: A list of Ray object references to Arrow tables.

    Returns:
        Dataset holding Arrow records from the tables.
    """
    get_metadata = cached_remote_fn(_get_metadata)
    # One metadata task per table; the tables themselves serve as blocks.
    metadata_refs = [get_metadata.remote(table) for table in tables]
    block_list = BlockList(tables, ray.get(metadata_refs))
    return Dataset(block_list)
def from_pandas(dfs: List[ObjectRef["pandas.DataFrame"]]) -> Dataset[ArrowRow]:
    """Create a dataset from a set of Pandas dataframes.

    Args:
        dfs: A list of Ray object references to pandas dataframes.

    Returns:
        Dataset holding Arrow records read from the dataframes.
    """
    df_to_block = cached_remote_fn(_df_to_block, num_returns=2)
    # Each conversion task returns a (block ref, metadata ref) pair.
    conversions = [df_to_block.remote(df) for df in dfs]
    block_refs, metadata_refs = zip(*conversions)
    fetched_metadata = ray.get(list(metadata_refs))
    return Dataset(BlockList(block_refs, fetched_metadata))
def from_numpy(ndarrays: List[ObjectRef[np.ndarray]]) -> Dataset[np.ndarray]:
    """Create a dataset from a set of NumPy ndarrays.

    Args:
        ndarrays: A list of Ray object references to NumPy ndarrays.

    Returns:
        Dataset holding the given ndarrays.
    """
    ndarray_to_block = cached_remote_fn(_ndarray_to_block, num_returns=2)
    # Each conversion task returns a (block ref, metadata ref) pair.
    conversions = [ndarray_to_block.remote(arr) for arr in ndarrays]
    block_refs, metadata_refs = zip(*conversions)
    fetched_metadata = ray.get(list(metadata_refs))
    return Dataset(BlockList(block_refs, fetched_metadata))
def _fetch_metadata_remotely( pieces: List[bytes] ) -> List[ObjectRef["pyarrow.parquet.FileMetaData"]]: remote_fetch_metadata = cached_remote_fn( _fetch_metadata_serialization_wrapper) metas = [] parallelism = min(len(pieces) // PIECES_PER_META_FETCH, 100) meta_fetch_bar = ProgressBar("Metadata Fetch Progress", total=parallelism) for pieces_ in np.array_split(pieces, parallelism): if len(pieces_) == 0: continue metas.append(remote_fetch_metadata.remote(pieces_)) metas = meta_fetch_bar.fetch_until_complete(metas) return list(itertools.chain.from_iterable(metas))
def ensure_schema_for_first_block(
        self) -> Optional[Union["pyarrow.Schema", type]]:
    """Make sure the metadata of the first block carries a schema.

    The schema is computed remotely from the first block and stored on
    ``self._metadata[0]``.

    Returns:
        The schema of the first block, or None when there is no first block
        to read (iteration stops immediately or raises ValueError).
    """
    get_schema = cached_remote_fn(_get_schema)
    try:
        first_block = next(self.iter_blocks())
    except (StopIteration, ValueError):
        # No blocks at all, or the list was manually cleared.
        return None
    else:
        schema = ray.get(get_schema.remote(first_block))
        # Record the schema on the first block's metadata.
        self._metadata[0].schema = schema
        return schema
def apply(self, fn: Any, remote_args: dict,
          blocks: BlockList[Any]) -> BlockList[Any]:
    """Run ``fn`` over every block as a remote map task.

    Args:
        fn: Passed through to the remote map function for each block.
        remote_args: Options for the remote map function; ``num_returns`` is
            forced to 2 on a copy, the caller's dict is not modified.
        blocks: Input blocks; returned unchanged when empty.

    Returns:
        BlockList of the mapped blocks with their fetched metadata.
    """
    # Handle empty datasets: with no tasks there is nothing to unzip below,
    # and unpacking ``zip(*[])`` into two names raises ValueError.
    if len(blocks) == 0:
        return blocks

    map_bar = ProgressBar("Map Progress", total=len(blocks))

    kwargs = remote_args.copy()
    kwargs["num_returns"] = 2

    map_block = cached_remote_fn(_map_block)
    refs = [
        map_block.options(**kwargs).remote(b, m, fn)
        for b, m in zip(blocks, blocks.get_metadata())
    ]
    new_blocks, new_metadata = zip(*refs)

    map_bar.block_until_complete(list(new_blocks))
    new_metadata = ray.get(list(new_metadata))
    return BlockList(list(new_blocks), list(new_metadata))
def _submit_task(
    self, task_idx: int
) -> Tuple[ObjectRef[MaybeBlockPartition], ObjectRef[BlockPartitionMetadata]]:
    """Submit the read task stored at index ``task_idx``.

    On the first submission the stats actor is told that execution has
    started for ``self._stats_uuid``.

    Returns:
        The two object refs returned by the remote read task.
    """
    stats_actor = _get_or_create_stats_actor()
    if not self._execution_started:
        stats_actor.record_start.remote(self._stats_uuid)
        self._execution_started = True

    task = self._tasks[task_idx]
    execute_read = cached_remote_fn(_execute_read_task).options(
        num_returns=2, **self._remote_args)
    return execute_read.remote(
        i=task_idx,
        task=task,
        context=DatasetContext.get_current(),
        stats_uuid=self._stats_uuid,
        stats_actor=stats_actor,
    )
def _execute_reduce_stage(
    self,
    output_num_blocks: int,
    schedule: _PushBasedShuffleTaskSchedule,
    reduce_ray_remote_args: Dict[str, Any],
    all_merge_results: List[List[ObjectRef]],
):
    """Submit the final reduce tasks and wait for their metadata.

    Args:
        output_num_blocks: Number of reduce tasks to submit.
        schedule: Maps each reducer index to a merge task index and supplies
            that merge task's remote options.
        reduce_ray_remote_args: Extra remote options for the reduce tasks.
        all_merge_results: Merge outputs indexed by merge task index. This
            argument is consumed: refs are popped from its inner lists.

    Returns:
        ``(reduce_metadata, reduce_blocks)``: the fetched metadata and the
        tuple of block refs.
    """
    shuffle_reduce = cached_remote_fn(self.reduce)

    # Final reduce stage: one task per output block.
    reduce_outputs = []
    for reducer_idx in range(output_num_blocks):
        merge_idx = schedule.get_merge_idx_for_reducer_idx(reducer_idx)
        # The merge task's options are added so that the reduce task is
        # colocated with its inputs.
        reduce_task = shuffle_reduce.options(
            **reduce_ray_remote_args,
            **schedule.get_merge_task_options(merge_idx),
            num_returns=2,
        )
        # Take the next pending output of every merge result at merge_idx.
        reduce_inputs = [
            pending.pop(0) for pending in all_merge_results[merge_idx]
        ]
        reduce_outputs.append(
            reduce_task.remote(*self._reduce_args, *reduce_inputs))

    # Every merge output must have been handed to some reducer.
    for merge_idx, merge_results in enumerate(all_merge_results):
        assert all(len(merge_result) == 0 for merge_result in merge_results), (
            "Reduce stage did not process outputs from merge tasks at index: "
            f"{merge_idx}"
        )
    assert (
        len(reduce_outputs) == output_num_blocks
    ), f"Expected {output_num_blocks} outputs, produced {len(reduce_outputs)}"

    reduce_bar = ProgressBar("Shuffle Reduce", total=output_num_blocks)
    reduce_blocks, metadata_refs = zip(*reduce_outputs)
    reduce_metadata = reduce_bar.fetch_until_complete(list(metadata_refs))
    reduce_bar.close()

    return reduce_metadata, reduce_blocks
def _fetch_metadata_remotely( pieces: List["pyarrow._dataset.ParquetFileFragment"], ) -> List[ObjectRef["pyarrow.parquet.FileMetaData"]]: from ray import cloudpickle remote_fetch_metadata = cached_remote_fn(_fetch_metadata_serialization_wrapper) metas = [] parallelism = min(len(pieces) // PIECES_PER_META_FETCH, 100) meta_fetch_bar = ProgressBar("Metadata Fetch Progress", total=parallelism) try: _register_parquet_file_fragment_serialization() for pcs in np.array_split(pieces, parallelism): if len(pcs) == 0: continue metas.append(remote_fetch_metadata.remote(cloudpickle.dumps(pcs))) finally: _deregister_parquet_file_fragment_serialization() metas = meta_fetch_bar.fetch_until_complete(metas) return list(itertools.chain.from_iterable(metas))
def from_numpy(ndarrays: List[ObjectRef[np.ndarray]]) -> Dataset[ArrowRow]:
    """Create a dataset from a set of NumPy ndarrays.

    Args:
        ndarrays: A list of Ray object references to NumPy ndarrays.

    Returns:
        Dataset holding the given ndarrays.
    """
    ndarray_to_block = cached_remote_fn(_ndarray_to_block, num_returns=2)
    # Each conversion task returns a (block ref, metadata ref) pair.
    conversions = [ndarray_to_block.remote(arr) for arr in ndarrays]
    block_refs, metadata_refs = zip(*conversions)

    block_list = BlockList(block_refs, ray.get(list(metadata_refs)))
    stats = DatasetStats(stages={"from_numpy": metadata_refs}, parent=None)
    return Dataset(ExecutionPlan(block_list, stats), 0, False)
def from_numpy_refs(
    ndarrays: Union[ObjectRef[np.ndarray], List[ObjectRef[np.ndarray]]],
) -> Dataset[ArrowRow]:
    """Create a dataset from a list of NumPy ndarray futures.

    Args:
        ndarrays: A Ray object reference to a NumPy ndarray or a list of Ray
            object references to NumPy ndarrays.

    Returns:
        Dataset holding the given ndarrays.

    Raises:
        ValueError: If ``ndarrays`` is neither an object ref nor a list of
            object refs.
    """
    if isinstance(ndarrays, ray.ObjectRef):
        ndarrays = [ndarrays]
    elif isinstance(ndarrays, list):
        for ndarray in ndarrays:
            if not isinstance(ndarray, ray.ObjectRef):
                raise ValueError(
                    "Expected list of Ray object refs, "
                    f"got list containing {type(ndarray)}"
                )
    else:
        # Report the type of the argument itself: the loop variable
        # ``ndarray`` is not bound on this path, so formatting it raised
        # NameError instead of the intended ValueError.
        raise ValueError(
            f"Expected Ray object ref or list of Ray object refs, got {type(ndarrays)}"
        )

    ndarray_to_block = cached_remote_fn(_ndarray_to_block, num_returns=2)
    res = [ndarray_to_block.remote(ndarray) for ndarray in ndarrays]
    blocks, metadata = zip(*res)
    # NOTE(review): the stats stage is keyed "from_numpy", not
    # "from_numpy_refs" -- confirm whether that is intentional.
    return Dataset(
        ExecutionPlan(
            BlockList(blocks, ray.get(list(metadata))),
            DatasetStats(stages={"from_numpy": metadata}, parent=None),
        ),
        0,
        False,
    )
def sample_boundaries(blocks: List[ObjectRef[Block]], key: SortKeyT,
                      num_reducers: int) -> List[T]:
    """
    Return (num_reducers - 1) items in ascending order from the blocks that
    partition the domain into ranges with approximately equally many elements.

    Returns a list of ``None`` placeholders of that length when every
    sample comes back empty.

    Raises:
        ValueError: If ``key`` is a list with more than one entry.
    """
    # TODO(Clark): Support multiple boundary sampling keys.
    if isinstance(key, list) and len(key) > 1:
        raise ValueError("Multiple boundary sampling keys not supported.")

    n_samples = int(num_reducers * 10 / len(blocks))

    sample_block = cached_remote_fn(_sample_block)
    sample_refs = [
        sample_block.remote(block, n_samples, key) for block in blocks
    ]
    sample_bar = ProgressBar("Sort Sample", len(sample_refs))
    sample_bar.block_until_complete(sample_refs)
    sample_bar.close()

    non_empty = [s for s in ray.get(sample_refs) if len(s) > 0]
    # No samples at all: the dataset is empty.
    if not non_empty:
        return [None] * (num_reducers - 1)

    # Combine all samples into a single block.
    builder = DelegatingArrowBlockBuilder()
    for sample in non_empty:
        builder.add_block(sample)
    merged = builder.build()

    column = key[0][0] if isinstance(key, list) else None
    sample_items = np.sort(BlockAccessor.for_block(merged).to_numpy(column))

    quantile_items = [
        np.quantile(sample_items, q, interpolation="nearest")
        for q in np.linspace(0, 1, num_reducers)
    ]
    # Drop the first (q = 0) entry.
    return quantile_items[1:]
def apply(self, fn: Any, remote_args: dict,
          blocks: BlockList) -> BlockList:
    """Run ``fn`` over every block as a remote map task.

    Args:
        fn: Passed through to the remote map function for each block.
        remote_args: Options applied to the remote map function.
        blocks: Input blocks; returned unchanged when it has no blocks.

    Returns:
        BlockList flattened from the (block, metadata) pairs of each task.

    Raises:
        ray.exceptions.RayTaskError, KeyboardInterrupt: Re-raised after all
            map tasks have been cancelled and awaited.
    """
    # An empty dataset has nothing to map.
    if blocks.initial_num_blocks() == 0:
        return blocks

    blocks = list(blocks.iter_blocks_with_metadata())
    map_bar = ProgressBar("Map Progress", total=len(blocks))

    map_block = cached_remote_fn(_map_block)
    refs = [
        map_block.options(**remote_args).remote(b, fn, m.input_files)
        for b, m in blocks
    ]

    try:
        results = map_bar.fetch_until_complete(refs)
    except (ray.exceptions.RayTaskError, KeyboardInterrupt) as e:
        # A mapper failed or a SIGINT arrived while waiting: cancel everything.
        for pending in refs:
            ray.cancel(pending)
        # Block until each task has either failed or been cancelled.
        for pending in refs:
            try:
                ray.get(pending)
            except (ray.exceptions.RayTaskError,
                    ray.exceptions.TaskCancelledError):
                pass
        # Surface the original failure.
        raise e from None

    # Every task result is a sequence of (block, metadata) pairs.
    pairs = [pair for result in results for pair in result]
    new_blocks = [block for block, _ in pairs]
    new_metadata = [metadata for _, metadata in pairs]
    return BlockList(new_blocks, new_metadata)
def from_pandas_refs(
    dfs: Union[ObjectRef["pandas.DataFrame"], List[ObjectRef["pandas.DataFrame"]]]
) -> Dataset[ArrowRow]:
    """Create a dataset from Ray object references to Pandas dataframes.

    Args:
        dfs: A single Ray object reference to a pandas dataframe, or a list
            of such references.

    Returns:
        Dataset holding Arrow records read from the dataframes.
    """
    df_refs = [dfs] if isinstance(dfs, ray.ObjectRef) else dfs

    df_to_block = cached_remote_fn(_df_to_block, num_returns=2)
    # Each conversion task returns a (block ref, metadata ref) pair.
    conversions = [df_to_block.remote(ref) for ref in df_refs]
    block_refs, metadata_refs = zip(*conversions)

    block_list = BlockList(block_refs, ray.get(list(metadata_refs)))
    return Dataset(block_list, 0, DatasetStats.TODO())
def apply(self, fn: Any, remote_args: dict,
          blocks: BlockList[Any]) -> BlockList[Any]:
    """Run ``fn`` over every block as a remote map task.

    Args:
        fn: Passed through to the remote map function for each block.
        remote_args: Options for the remote map function; ``num_returns`` is
            forced to 2 on a copy.
        blocks: Input blocks; returned unchanged when empty.

    Returns:
        BlockList of the mapped block refs with their fetched metadata.

    Raises:
        ray.exceptions.RayTaskError, KeyboardInterrupt: Re-raised after all
            map tasks have been cancelled and awaited.
    """
    # An empty dataset has nothing to map.
    if len(blocks) == 0:
        return blocks

    map_bar = ProgressBar("Map Progress", total=len(blocks))

    task_options = {**remote_args, "num_returns": 2}
    map_block = cached_remote_fn(_map_block)
    task_refs = [
        map_block.options(**task_options).remote(block, meta, fn)
        for block, meta in zip(blocks, blocks.get_metadata())
    ]
    block_refs, metadata_refs = zip(*task_refs)
    metadata_refs = list(metadata_refs)

    try:
        fetched_metadata = map_bar.fetch_until_complete(metadata_refs)
    except (ray.exceptions.RayTaskError, KeyboardInterrupt) as e:
        # A mapper failed or a SIGINT arrived while waiting: cancel everything.
        for pending in metadata_refs:
            ray.cancel(pending)
        # Block until each task has either failed or been cancelled.
        for pending in metadata_refs:
            try:
                ray.get(pending)
            except (ray.exceptions.RayTaskError,
                    ray.exceptions.TaskCancelledError):
                pass
        # Surface the original failure.
        raise e from None

    return BlockList(list(block_refs), list(fetched_metadata))
def execute( self, input_blocks: BlockList, output_num_blocks: int, clear_input_blocks: bool, *, map_ray_remote_args: Optional[Dict[str, Any]] = None, reduce_ray_remote_args: Optional[Dict[str, Any]] = None, merge_factor: int = 2, ) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]: logger.info("Using experimental push-based shuffle.") # TODO(swang): For jobs whose reduce work is heavier than the map work, # we should support fractional merge factors. # TODO(swang): For large jobs, we should try to choose the merge factor # automatically, e.g., by running one test round of map and merge tasks # and comparing their run times. # TODO(swang): Add option to automatically reduce write amplification # during map-merge stage, by limiting how many partitions can be # processed concurrently. input_blocks_list = input_blocks.get_blocks() # Preemptively clear the blocks list since we will incrementally delete # the last remaining references as we submit the dependent map tasks # during the map-merge stage. if clear_input_blocks: input_blocks.clear() if map_ray_remote_args is None: map_ray_remote_args = {} if reduce_ray_remote_args is None: reduce_ray_remote_args = {} # The placement strategy for reduce tasks is overwritten to colocate # them with their inputs from the merge stage, so remove any # pre-specified scheduling strategy here. reduce_ray_remote_args = reduce_ray_remote_args.copy() reduce_ray_remote_args.pop("scheduling_strategy", None) map_fn = self._map_partition merge_fn = self._merge def map_partition(*args, **kwargs): return map_fn(self.map, *args, **kwargs) def merge(*args, **kwargs): return merge_fn(self.reduce, *args, **kwargs) shuffle_map = cached_remote_fn(map_partition) shuffle_merge = cached_remote_fn(merge) def submit_map_task(arg): mapper_idx, block = arg # NOTE(swang): Results are shuffled between map and merge tasks, so # there is no advantage to colocating specific map and merge tasks. 
# Therefore, we do not specify a node affinity policy for map tasks # in case the caller or Ray has a better scheduling strategy, e.g., # based on data locality. map_result = shuffle_map.options( **map_ray_remote_args, num_returns=1 + schedule.num_merge_tasks_per_round, ).remote( mapper_idx, block, output_num_blocks, schedule, *self._map_args, ) metadata_ref = map_result.pop(0) return metadata_ref, map_result def submit_merge_task(arg): merge_idx, map_results = arg num_merge_returns = schedule.get_num_reducers_per_merge_idx(merge_idx) merge_result = shuffle_merge.options( num_returns=1 + num_merge_returns, **schedule.get_merge_task_options(merge_idx), ).remote( *map_results, reduce_args=self._reduce_args, ) metadata_ref = merge_result.pop(0) return metadata_ref, merge_result # Compute all constants used for task scheduling. num_cpus_per_node_map = _get_num_cpus_per_node_map() schedule = self._compute_shuffle_schedule( num_cpus_per_node_map, len(input_blocks_list), merge_factor, output_num_blocks, ) # ObjectRef results from the last round of tasks. Used to add # backpressure during pipelining of map and merge tasks. last_map_metadata_results = [] last_merge_metadata_results = [] # Final outputs from the map-merge stage. # This is a map from merge task index to a nested list of merge results # (ObjectRefs). Each merge task index corresponds to a partition of P # final reduce tasks. all_merge_results = [[] for _ in range(schedule.num_merge_tasks_per_round)] shuffle_map_metadata = [] shuffle_merge_metadata = [] map_bar = ProgressBar("Shuffle Map", position=0, total=len(input_blocks_list)) # Execute the map-merge stage. This submits tasks in rounds of M map # tasks and N merge tasks each. Task execution between map and merge is # pipelined, so that while executing merge for one round of inputs, we # also execute the map tasks for the following round. input_blocks_list = list(enumerate(input_blocks_list)) while input_blocks_list: # Execute one round of the map stage. 
# Pop from the inputs so that we can clear the memory ASAP. round_input_blocks = [] try: for _ in range(schedule.num_map_tasks_per_round): round_input_blocks.append(input_blocks_list.pop(0)) except IndexError: pass ( prev_map_metadata, last_map_metadata_results, map_results, ) = _execute_pipelined_stage( submit_map_task, last_map_metadata_results, round_input_blocks, progress_bar=map_bar, ) shuffle_map_metadata += prev_map_metadata # Shuffle the map results for the merge tasks. merge_args = [ (merge_idx, [map_result.pop(0) for map_result in map_results]) for merge_idx in range(schedule.num_merge_tasks_per_round) ] assert all([not map_result for map_result in map_results]) # Execute one round of the merge stage. ( prev_merge_metadata, last_merge_metadata_results, merge_results, ) = _execute_pipelined_stage( submit_merge_task, last_merge_metadata_results, merge_args, ) shuffle_merge_metadata += prev_merge_metadata for merge_idx, merge_result in enumerate(merge_results): all_merge_results[merge_idx].append(merge_result) del merge_results # Wait for last map and merge tasks to finish. prev_map_metadata, _, _ = _execute_pipelined_stage( None, last_map_metadata_results, [], progress_bar=map_bar ) shuffle_map_metadata += prev_map_metadata map_bar.close() prev_merge_metadata, _, _ = _execute_pipelined_stage( None, last_merge_metadata_results, [] ) shuffle_merge_metadata += prev_merge_metadata # Execute and wait for the reduce stage. new_metadata, new_blocks = self._execute_reduce_stage( output_num_blocks, schedule, reduce_ray_remote_args, all_merge_results ) stats = { "map": shuffle_map_metadata, "merge": shuffle_merge_metadata, "reduce": new_metadata, } return BlockList(list(new_blocks), list(new_metadata)), stats
def execute(
    self,
    input_blocks: BlockList,
    output_num_blocks: int,
    clear_input_blocks: bool,
    *,
    map_ray_remote_args: Optional[Dict[str, Any]] = None,
    reduce_ray_remote_args: Optional[Dict[str, Any]] = None
) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
    """Run an all-to-all shuffle as one map stage followed by one reduce stage.

    One map task is submitted per input block. Each map task is asked for
    ``1 + output_num_blocks`` return values: the first is treated as the
    task's ``BlockMetadata`` and the remaining ones as one output per reduce
    partition. Reduce task ``j`` then receives the ``j``-th output of every
    map task and returns a ``(block, metadata)`` pair.

    Args:
        input_blocks: Blocks to shuffle.
        output_num_blocks: Number of reduce tasks to submit, i.e. the number
            of blocks in the returned ``BlockList``.
        clear_input_blocks: If true, ``input_blocks.clear()`` is called once
            the map tasks have been submitted.
        map_ray_remote_args: Extra options forwarded to every map task.
        reduce_ray_remote_args: Extra options forwarded to every reduce
            task. If ``"scheduling_strategy"`` is absent, a copy with
            ``"SPREAD"`` is used; the caller's dict is never mutated.

    Returns:
        A tuple of the output ``BlockList`` and a stats dict holding the
        fetched metadata under the keys ``"map"`` and ``"reduce"``.

    Raises:
        Whatever ``ProgressBar.fetch_until_complete`` raises while waiting
        on the map or reduce tasks; the corresponding progress bar is closed
        before the exception propagates.
    """
    input_blocks_list = input_blocks.get_blocks()
    input_num_blocks = len(input_blocks_list)

    if map_ray_remote_args is None:
        map_ray_remote_args = {}
    if reduce_ray_remote_args is None:
        reduce_ray_remote_args = {}
    if "scheduling_strategy" not in reduce_ray_remote_args:
        # Copy before injecting the default so the caller's dict is untouched.
        reduce_ray_remote_args = reduce_ray_remote_args.copy()
        reduce_ray_remote_args["scheduling_strategy"] = "SPREAD"

    shuffle_map = cached_remote_fn(self.map)
    shuffle_reduce = cached_remote_fn(self.reduce)

    map_bar = ProgressBar("Shuffle Map", total=input_num_blocks)

    map_returns = [
        shuffle_map.options(
            **map_ray_remote_args,
            num_returns=1 + output_num_blocks,
        ).remote(i, block, output_num_blocks, *self._map_args)
        for i, block in enumerate(input_blocks_list)
    ]

    # The first item returned is the BlockMetadata; the rest are the map
    # task's per-partition outputs.
    shuffle_map_metadata = [refs[0] for refs in map_returns]
    shuffle_map_out = [refs[1:] for refs in map_returns]
    del map_returns

    # Eagerly delete the input block references in order to eagerly release
    # the blocks' memory.
    del input_blocks_list
    if clear_input_blocks:
        input_blocks.clear()

    try:
        shuffle_map_metadata = map_bar.fetch_until_complete(
            shuffle_map_metadata)
    finally:
        # Close the bar even if a map task failed.
        map_bar.close()

    reduce_bar = ProgressBar("Shuffle Reduce", total=output_num_blocks)
    try:
        shuffle_reduce_out = [
            shuffle_reduce.options(
                **reduce_ray_remote_args,
                num_returns=2,
            ).remote(
                *self._reduce_args,
                # The j-th output of every map task feeds reduce task j.
                *[map_out[j] for map_out in shuffle_map_out],
            )
            for j in range(output_num_blocks)
        ]
        # Eagerly delete the map block references in order to eagerly
        # release the blocks' memory.
        del shuffle_map_out

        if shuffle_reduce_out:
            new_blocks, new_metadata = zip(*shuffle_reduce_out)
        else:
            # zip() of nothing yields nothing to unpack; with no reduce
            # tasks the result is simply empty.
            new_blocks, new_metadata = (), ()
        new_metadata = reduce_bar.fetch_until_complete(list(new_metadata))
    finally:
        # Close the bar even if a reduce task failed.
        reduce_bar.close()

    stats = {
        "map": shuffle_map_metadata,
        "reduce": new_metadata,
    }

    return BlockList(list(new_blocks), list(new_metadata)), stats