from typing import Any, Dict, Optional

import itertools

import numpy as np

import ray

# BlockList, SortKeyT, ProgressBar, cached_remote_fn, sample_boundaries and
# the private helpers (_sort_block, _merge_sorted_blocks, _shuffle_map,
# _shuffle_reduce, _get_spread_resources_iter) are assumed to be imported or
# defined elsewhere in this module.


def sort_impl(blocks: BlockList, key: SortKeyT,
              descending: bool = False) -> BlockList:
    blocks = list(blocks.iter_blocks())
    if len(blocks) == 0:
        return BlockList([], [])

    if isinstance(key, str):
        key = [(key, "descending" if descending else "ascending")]

    if isinstance(key, list):
        descending = key[0][1] == "descending"

    num_mappers = len(blocks)
    num_reducers = num_mappers
    boundaries = sample_boundaries(blocks, key, num_reducers)
    if descending:
        boundaries.reverse()

    sort_block = cached_remote_fn(_sort_block).options(
        num_returns=num_reducers)
    merge_sorted_blocks = cached_remote_fn(_merge_sorted_blocks, num_returns=2)

    # Map phase: each mapper sorts its block and partitions it at the sampled
    # boundaries, producing one output per reducer.
    map_results = np.empty((num_mappers, num_reducers), dtype=object)
    for i, block in enumerate(blocks):
        map_results[i, :] = sort_block.remote(block, boundaries, key,
                                              descending)
    map_bar = ProgressBar("Sort Map", len(map_results))
    map_bar.block_until_complete([ret[0] for ret in map_results])
    map_bar.close()

    # Reduce phase: each reducer merges the sorted partitions that fall into
    # its key range (one partition from every mapper).
    reduce_results = []
    for j in range(num_reducers):
        ret = merge_sorted_blocks.remote(key, descending,
                                         *map_results[:, j].tolist())
        reduce_results.append(ret)
    merge_bar = ProgressBar("Sort Merge", len(reduce_results))
    merge_bar.block_until_complete([ret[0] for ret in reduce_results])
    merge_bar.close()

    blocks = [b for b, _ in reduce_results]
    metadata = ray.get([m for _, m in reduce_results])
    return BlockList(blocks, metadata)


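# The sketch below is an editor's illustration, not part of the library: it
# shows the range-partition idea behind sort_impl with plain Python lists
# instead of Ray tasks. Each "mapper" sorts its block and cuts it at the
# sampled boundaries; each "reducer" merges the partitions that fall into its
# key range (ascending case only). The function name and the hard-coded data
# are hypothetical.
def _local_range_sort_sketch():
    import bisect
    import heapq

    blocks = [[5, 1, 9], [3, 7, 2], [8, 4, 6]]
    boundaries = [4, 7]  # produced by sample_boundaries in the real code
    num_reducers = len(boundaries) + 1

    # "Map": sort each block, then slice it at the boundaries.
    partitions = []
    for block in blocks:
        block = sorted(block)
        cuts = [bisect.bisect_left(block, b) for b in boundaries]
        partitions.append(
            [block[lo:hi] for lo, hi in zip([0] + cuts, cuts + [len(block)])])

    # "Reduce": merge the j-th partition of every block; yields
    # [[1, 2, 3], [4, 5, 6], [7, 8, 9]] for the data above.
    return [
        list(heapq.merge(*[p[j] for p in partitions]))
        for j in range(num_reducers)
    ]

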
def simple_shuffle(input_blocks: BlockList,
                   output_num_blocks: int,
                   *,
                   random_shuffle: bool = False,
                   random_seed: Optional[int] = None,
                   map_ray_remote_args: Optional[Dict[str, Any]] = None,
                   reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
                   _spread_resource_prefix: Optional[str] = None
                   ) -> BlockList:
    input_blocks = list(input_blocks.iter_blocks())
    if map_ray_remote_args is None:
        map_ray_remote_args = {}
    if reduce_ray_remote_args is None:
        reduce_ray_remote_args = {}
    input_num_blocks = len(input_blocks)
    if _spread_resource_prefix is not None:
        # Use given spread resource prefix for round-robin resource-based
        # scheduling.
        nodes = ray.nodes()
        map_resource_iter = _get_spread_resources_iter(
            nodes, _spread_resource_prefix, map_ray_remote_args)
        reduce_resource_iter = _get_spread_resources_iter(
            nodes, _spread_resource_prefix, reduce_ray_remote_args)
    else:
        # If no spread resource prefix given, yield an empty dictionary.
        map_resource_iter, reduce_resource_iter = itertools.tee(
            itertools.repeat({}), 2)

    shuffle_map = cached_remote_fn(_shuffle_map)
    shuffle_reduce = cached_remote_fn(_shuffle_reduce)

    map_bar = ProgressBar("Shuffle Map", position=0, total=input_num_blocks)

    shuffle_map_out = [
        shuffle_map.options(
            **map_ray_remote_args,
            num_returns=output_num_blocks,
            resources=next(map_resource_iter)).remote(
                block, i, output_num_blocks, random_shuffle, random_seed)
        for i, block in enumerate(input_blocks)
    ]
    # Eagerly delete the input block references in order to eagerly release
    # the blocks' memory.
    del input_blocks
    if output_num_blocks == 1:
        # Handle the num_returns=1 edge case which doesn't return a list.
        shuffle_map_out = [[x] for x in shuffle_map_out]
    map_bar.block_until_complete([x[0] for x in shuffle_map_out])
    map_bar.close()

    # Randomize the reduce order of the blocks.
    if random_shuffle:
        random = np.random.RandomState(random_seed)
        random.shuffle(shuffle_map_out)

    reduce_bar = ProgressBar(
        "Shuffle Reduce", position=0, total=output_num_blocks)
    shuffle_reduce_out = [
        shuffle_reduce.options(
            **reduce_ray_remote_args,
            num_returns=2,
            resources=next(reduce_resource_iter)).remote(
                *[shuffle_map_out[i][j] for i in range(input_num_blocks)])
        for j in range(output_num_blocks)
    ]
    # Eagerly delete the map block references in order to eagerly release
    # the blocks' memory.
    del shuffle_map_out
    new_blocks, new_metadata = zip(*shuffle_reduce_out)
    reduce_bar.block_until_complete(list(new_blocks))
    new_metadata = ray.get(list(new_metadata))
    reduce_bar.close()

    return BlockList(list(new_blocks), list(new_metadata))


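# The sketch below is an editor's illustration, not part of the library: it
# shows the same all-to-all pattern as simple_shuffle, written against the
# public Ray task API with plain lists instead of BlockList/cached_remote_fn.
# The function names, the round-robin split rule, and the demo data are all
# hypothetical.
def _simple_shuffle_sketch():
    import ray

    ray.init(ignore_reinit_error=True)

    @ray.remote
    def split(block, num_outputs):
        # Return one piece per reducer; with num_returns=num_outputs, Ray
        # turns this list into one object reference per piece.
        return [block[j::num_outputs] for j in range(num_outputs)]

    @ray.remote
    def combine(*pieces):
        # Concatenate the pieces this reducer received from every mapper.
        out = []
        for piece in pieces:
            out.extend(piece)
        return out

    blocks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
    num_outputs = 2
    map_out = [
        split.options(num_returns=num_outputs).remote(block, num_outputs)
        for block in blocks
    ]
    # Reducer j takes the j-th piece from every mapper.
    reduce_out = [
        combine.remote(*[map_out[i][j] for i in range(len(blocks))])
        for j in range(num_outputs)
    ]
    # [[0, 2, 3, 5, 6, 8], [1, 4, 7]] for the data above.
    return ray.get(reduce_out)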