def transform(block: Block) -> Block:
    block = BlockAccessor.for_block(block)
    builder = block.builder()
    # Keep only the rows for which the captured predicate `fn` holds.
    for row in block.iter_rows():
        if fn(row):
            builder.add(row)
    return builder.build()
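# Hedged usage sketch for the filter-style transform above: in context,
# `transform` is a closure that captures a user predicate `fn` from the
# enclosing scope (as in Dataset.filter). The stand-alone analogue below
# (hypothetical helper, no Ray required) mirrors the same
# iterate/filter/build flow with plain Python lists.
def filter_rows(rows, predicate):
    out = []                      # stands in for block.builder()
    for row in rows:              # stands in for block.iter_rows()
        if predicate(row):
            out.append(row)       # stands in for builder.add(row)
    return out

assert filter_rows([1, 2, 3, 4], lambda r: r % 2 == 0) == [2, 4]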
def csv_write(write_path: str, block: Block):
    block = BlockAccessor.for_block(block)
    logger.debug(f"Writing {block.num_rows()} records to {write_path}.")
    # Appends to the target file; note that header=True emits a header row
    # on every append.
    block.to_pandas().to_csv(write_path, mode="a", header=True, index=False)
def format_batch(batch: Block, batch_format: str) -> BatchType:
    if batch_format == "pandas":
        batch = BlockAccessor.for_block(batch)
        return batch.to_pandas()
    elif batch_format == "pyarrow":
        batch = BlockAccessor.for_block(batch)
        return batch.to_arrow_table()
    elif batch_format == "_blocks":
        return batch
    else:
        raise ValueError(
            f"The given batch format: {batch_format} "
            f"is invalid. Supported batch type: {BatchType}")
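# Hedged usage note (assumes Ray 1.x-era internals): format_batch returns a
# pandas.DataFrame for "pandas", a pyarrow.Table for "pyarrow", and the raw
# block unchanged for the internal "_blocks" format, e.g.:
#
#     df = format_batch(block, "pandas")    # -> pandas.DataFrame
#     tbl = format_batch(block, "pyarrow")  # -> pyarrow.Table
#     raw = format_batch(block, "_blocks")  # -> block, unchanged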
def shuffle_map(block: Block) -> List[Block]:
    block = BlockAccessor.for_block(block)
    # Split the block into output_num_blocks roughly equal slices.
    slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks))
    slices = []
    for i in range(output_num_blocks):
        slices.append(
            block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True))
    # Sanity check: the slices must cover every input row exactly once.
    num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices)
    assert num_rows == block.num_rows(), (num_rows, block.num_rows())

    # Needed to handle num_returns=1 edge case in Ray API.
    if len(slices) == 1:
        return slices[0]
    else:
        return slices
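# Self-contained sketch of the slicing arithmetic in shuffle_map (no Ray
# required; `slice_bounds` is a hypothetical helper). block.slice clamps the
# end index, which is why the last slice may be short:
import math

def slice_bounds(num_rows: int, output_num_blocks: int):
    slice_sz = max(1, math.ceil(num_rows / output_num_blocks))
    return [(i * slice_sz, min(num_rows, (i + 1) * slice_sz))
            for i in range(output_num_blocks)]

# 10 rows into 3 blocks: slice_sz = ceil(10 / 3) = 4, rows 4 + 4 + 2 = 10.
assert slice_bounds(10, 3) == [(0, 4), (4, 8), (8, 10)]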
def transform(block: Block) -> Block:
    block = BlockAccessor.for_block(block)
    builder = DelegatingArrowBlockBuilder()
    for row in block.iter_rows():
        for r2 in fn(row):
            builder.add(r2)
    return builder.build()
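# Stand-alone analogue of the flat_map-style transform above (hypothetical
# helper, no Ray): each input row may yield zero or more output rows, which
# are all accumulated into a single output block.
def flat_map_rows(rows, fn):
    out = []
    for row in rows:
        out.extend(fn(row))
    return out

assert flat_map_rows([1, 2, 3], lambda r: [r] * r) == [1, 2, 2, 3, 3, 3]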
def block_to_df(block: Block):
    if isinstance(block, (ray.ObjectRef, ClientObjectRef)):
        raise ValueError(
            "Dataset.to_dask() must be used with Dask-on-Ray, please "
            "set the Dask scheduler to ray_dask_get (located in "
            "ray.util.dask).")
    # The ObjectRef check must run before wrapping: BlockAccessor.for_block
    # expects a materialized block, not a reference.
    block = BlockAccessor.for_block(block)
    return block.to_pandas()
def truncate(block: Block, meta: BlockMetadata,
             count: int) -> Tuple[Block, BlockMetadata]:
    block = BlockAccessor.for_block(block)
    logger.debug("Truncating last block to size: {}".format(count))
    new_block = block.slice(0, count, copy=True)
    accessor = BlockAccessor.for_block(new_block)
    # num_rows and size_bytes change with the slice; schema and input_files
    # carry over from the original metadata.
    new_meta = BlockMetadata(
        num_rows=accessor.num_rows(),
        size_bytes=accessor.size_bytes(),
        schema=meta.schema,
        input_files=meta.input_files)
    return new_block, new_meta
def transform(block: Block) -> Block:
    block = BlockAccessor.for_block(block)
    total_rows = block.num_rows()
    max_batch_size = batch_size
    if max_batch_size is None:
        max_batch_size = total_rows

    builder = DelegatingArrowBlockBuilder()
    for start in range(0, total_rows, max_batch_size):
        # Build a block for each batch.
        end = min(total_rows, start + max_batch_size)
        view = block.slice(start, end, copy=False)
        if batch_format == "pandas":
            view = BlockAccessor.for_block(view).to_pandas()
        elif batch_format == "pyarrow":
            view = BlockAccessor.for_block(view).to_arrow_table()
        else:
            raise ValueError(
                f"The given batch format: {batch_format} "
                f"is invalid. Supported batch type: {BatchType}")

        applied = fn(view)
        # Lists and Arrow tables pass through unchanged; pandas DataFrames
        # are converted to Arrow so the builder can concatenate them.
        if isinstance(applied, (list, pa.Table)):
            pass
        elif isinstance(applied, pd.DataFrame):
            applied = pa.Table.from_pandas(applied)
        else:
            raise ValueError(
                f"The map batches UDF returned a value of type "
                f"{type(applied)}, which is not allowed. The return type "
                "must be list, pandas.DataFrame, or pyarrow.Table.")
        builder.add_block(applied)
    return builder.build()
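# Self-contained sketch of the batch-windowing arithmetic above (no Ray;
# `batch_bounds` is a hypothetical helper): the range() step yields full
# batches plus a short remainder clamped by min().
def batch_bounds(total_rows: int, max_batch_size: int):
    return [(start, min(total_rows, start + max_batch_size))
            for start in range(0, total_rows, max_batch_size)]

# 10 rows with batch size 4 -> two full batches and a 2-row remainder.
assert batch_bounds(10, 4) == [(0, 4), (4, 8), (8, 10)]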
def json_write(write_path: str, block: Block):
    block = BlockAccessor.for_block(block)
    logger.debug(f"Writing {block.num_rows()} records to {write_path}.")
    block.to_pandas().to_json(write_path, orient="records")
def block_to_arrow(block: Block):
    block = BlockAccessor.for_block(block)
    return block.to_arrow_table()
def block_to_df(block: Block):
    block = BlockAccessor.for_block(block)
    return block.to_pandas()
def write(self, block: Block) -> str:
    block = BlockAccessor.for_block(block)
    # Refuse the write when the datasource has been disabled.
    if not self.enabled:
        raise ValueError("disabled")
    self.rows_written += block.num_rows()
    return "ok"
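# Hedged sketch of the guard-then-count flow in write above: `enabled` and
# `rows_written` are attributes of the enclosing datasource object (assumed
# from context). The minimal stand-in class below mirrors the same flow
# without Ray blocks.
class _StubWriter:
    def __init__(self, enabled: bool = True):
        self.enabled = enabled
        self.rows_written = 0

    def write(self, rows) -> str:
        if not self.enabled:
            raise ValueError("disabled")
        self.rows_written += len(rows)
        return "ok"

w = _StubWriter()
assert w.write([1, 2, 3]) == "ok" and w.rows_written == 3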
def agg(block: Block) -> int:
    block = BlockAccessor.for_block(block)
    # Assumes the rows are numeric values that can be summed directly.
    return sum(block.iter_rows())
def count(block: Block) -> int:
    block = BlockAccessor.for_block(block)
    return block.num_rows()