def _shuffle_map(block: Block, idx: int, output_num_blocks: int, random_shuffle: bool, random_seed: Optional[int]) -> List[Block]: block = BlockAccessor.for_block(block) # Randomize the distribution of records to blocks. if random_shuffle: seed_i = random_seed + idx if random_seed is not None else None block = block.random_shuffle(seed_i) block = BlockAccessor.for_block(block) slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks)) slices = [] for i in range(output_num_blocks): slices.append(block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True)) # Randomize the distribution order of the blocks (this matters when # some blocks are larger than others). if random_shuffle: random = np.random.RandomState(seed_i) random.shuffle(slices) num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices) assert num_rows == block.num_rows(), (num_rows, block.num_rows()) # Needed to handle num_returns=1 edge case in Ray API. if len(slices) == 1: return slices[0] else: return slices
def _shuffle_map( block: Block, idx: int, output_num_blocks: int, random_shuffle: bool, random_seed: Optional[int], ) -> List[Union[BlockMetadata, Block]]: """Returns list of [BlockMetadata, O1, O2, O3, ...output_num_blocks].""" stats = BlockExecStats.builder() block = BlockAccessor.for_block(block) # Randomize the distribution of records to blocks. if random_shuffle: seed_i = random_seed + idx if random_seed is not None else None block = block.random_shuffle(seed_i) block = BlockAccessor.for_block(block) slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks)) slices = [] for i in range(output_num_blocks): slices.append(block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True)) # Randomize the distribution order of the blocks (this matters when # some blocks are larger than others). if random_shuffle: random = np.random.RandomState(seed_i) random.shuffle(slices) num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices) assert num_rows == block.num_rows(), (num_rows, block.num_rows()) metadata = block.get_metadata(input_files=None, exec_stats=stats.build()) return [metadata] + slices
def map( idx: int, block: Block, output_num_blocks: int, block_udf: Optional[Callable[[Block], Iterable[Block]]], random_shuffle: bool, random_seed: Optional[int], ) -> List[Union[BlockMetadata, Block]]: stats = BlockExecStats.builder() if block_udf: # TODO(ekl) note that this effectively disables block splitting. blocks = list(block_udf(block)) if len(blocks) > 1: builder = BlockAccessor.for_block(blocks[0]).builder() for b in blocks: builder.add_block(b) block = builder.build() else: block = blocks[0] block = BlockAccessor.for_block(block) # Randomize the distribution of records to blocks. if random_shuffle: seed_i = random_seed + idx if random_seed is not None else None block = block.random_shuffle(seed_i) block = BlockAccessor.for_block(block) slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks)) slices = [] for i in range(output_num_blocks): slices.append( block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True)) # Randomize the distribution order of the blocks (this prevents empty # outputs when input blocks are very small). if random_shuffle: random = np.random.RandomState(seed_i) random.shuffle(slices) num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices) assert num_rows == block.num_rows(), (num_rows, block.num_rows()) metadata = block.get_metadata(input_files=None, exec_stats=stats.build()) return [metadata] + slices