def map(self,
        fn: Callable[[T], U],
        compute: Optional[str] = None,
        **ray_remote_args) -> "Dataset[U]":
    """Transform every record of this dataset with ``fn``.

    This is a blocking operation. Mapping individual records can be quite
    slow; consider using ``.map_batches()`` for performance.

    Examples:
        >>> # Transform python objects.
        >>> ds.map(lambda x: x * 2)

        >>> # Transform Arrow records.
        >>> ds.map(lambda record: {"v2": record["value"] * 2})

    Time complexity: O(dataset size / parallelism)

    Args:
        fn: The function to apply to each record.
        compute: The compute strategy, either "tasks" (default) to use Ray
            tasks, or "actors" to use an autoscaling Ray actor pool.
        ray_remote_args: Additional resource requirements to request from
            ray (e.g., num_gpus=1 to request GPUs for the map tasks).

    Returns:
        A new ``Dataset`` wrapping the result of applying the per-block
        transform to this dataset's blocks.
    """

    def transform(block: Block[T]) -> Block[U]:
        # Push each mapped record into a fresh builder, one row at a time.
        out = DelegatingArrowBlockBuilder()
        for record in block.iter_rows():
            out.add(fn(record))
        return out.build()

    strategy = get_compute(compute)
    new_blocks = strategy.apply(transform, ray_remote_args, self._blocks)
    return Dataset(new_blocks)
def filter(self,
           fn: Callable[[T], bool],
           compute: Optional[str] = None,
           **ray_remote_args) -> "Dataset[T]":
    """Filter out records that do not satisfy the given predicate.

    This is a blocking operation. Consider using ``.map_batches()`` for
    better performance (you can implement filter by dropping records).

    Examples:
        >>> # Keep only the even values.
        >>> ds.filter(lambda x: x % 2 == 0)

    Time complexity: O(dataset size / parallelism)

    Args:
        fn: The predicate function to apply to each record. Records for
            which it returns a truthy value are kept.
        compute: The compute strategy, either "tasks" (default) to use Ray
            tasks, or "actors" to use an autoscaling Ray actor pool.
        ray_remote_args: Additional resource requirements to request from
            ray (e.g., num_gpus=1 to request GPUs for the map tasks).

    Returns:
        A new ``Dataset`` wrapping the result of applying the per-block
        filter to this dataset's blocks.
    """

    def transform(block: Block[T]) -> Block[T]:
        # The builder comes from the input block itself (not a delegating
        # builder) -- presumably so the output keeps the input's block
        # format; confirm against the Block implementation.
        builder = block.builder()
        for row in block.iter_rows():
            if fn(row):
                builder.add(row)
        return builder.build()

    compute = get_compute(compute)
    return Dataset(compute.apply(transform, ray_remote_args, self._blocks))
def flat_map(self,
             fn: Callable[[T], Iterable[U]],
             compute: Optional[str] = None,
             **ray_remote_args) -> "Dataset[U]":
    """Map each record to an iterable of outputs and flatten the results.

    This is a blocking operation. Consider using ``.map_batches()`` for
    better performance (the batch size can be altered in map_batches).

    Examples:
        >>> ds.flat_map(lambda x: [x, x ** 2, x ** 3])

    Time complexity: O(dataset size / parallelism)

    Args:
        fn: The function to apply to each record. Every element of the
            iterable it returns becomes one output record.
        compute: The compute strategy, either "tasks" (default) to use Ray
            tasks, or "actors" to use an autoscaling Ray actor pool.
        ray_remote_args: Additional resource requirements to request from
            ray (e.g., num_gpus=1 to request GPUs for the map tasks).

    Returns:
        A new ``Dataset`` wrapping the result of applying the per-block
        transform to this dataset's blocks.
    """

    def transform(block: Block[T]) -> Block[U]:
        out = DelegatingArrowBlockBuilder()
        # Lazily flatten: rows are pulled and ``fn`` is called only as the
        # loop below consumes items, preserving row-then-item order.
        flattened = (item for record in block.iter_rows()
                     for item in fn(record))
        for item in flattened:
            out.add(item)
        return out.build()

    strategy = get_compute(compute)
    new_blocks = strategy.apply(transform, ray_remote_args, self._blocks)
    return Dataset(new_blocks)
def map(self,
        fn: Union[CallableClass, Callable[[T], U]],
        compute: Optional[str] = None,
        **ray_remote_args) -> "Dataset[U]":
    """Transform every record of this dataset with ``fn``.

    This is a blocking operation. Mapping individual records can be quite
    slow; consider using ``.map_batches()`` for performance.

    Examples:
        >>> # Transform python objects.
        >>> ds.map(lambda x: x * 2)

        >>> # Transform Arrow records.
        >>> ds.map(lambda record: {"v2": record["value"] * 2})

        >>> # Define a callable class that persists state across
        >>> # function invocations for efficiency.
        >>> class CachedModel:
        ...    def __init__(self):
        ...        self.model = init_model()
        ...    def __call__(self, item):
        ...        return self.model(item)

        >>> # Apply the transform in parallel on GPUs. Since
        >>> # compute="actors", the transform will be applied on an
        >>> # autoscaling pool of Ray actors, each allocated 1 GPU by Ray.
        >>> ds.map(CachedModel, compute="actors", num_gpus=1)

    Time complexity: O(dataset size / parallelism)

    Args:
        fn: The function to apply to each record, or a class type
            that can be instantiated to create such a callable.
        compute: The compute strategy, either "tasks" (default) to use Ray
            tasks, or "actors" to use an autoscaling Ray actor pool.
        ray_remote_args: Additional resource requirements to request from
            ray (e.g., num_gpus=1 to request GPUs for the map tasks).

    Returns:
        A new ``Dataset`` wrapping the result of applying the per-block
        transform to this dataset's blocks.
    """
    # cache_wrapper is defined elsewhere; presumably it turns a callable
    # class into a plain callable that is instantiated once -- confirm.
    cached_fn = cache_wrapper(fn)

    def transform(block: Block[T]) -> Block[U]:
        accessor = BlockAccessor.for_block(block)
        out = DelegatingArrowBlockBuilder()
        for record in accessor.iter_rows():
            out.add(cached_fn(record))
        return out.build()

    strategy = get_compute(compute)
    new_blocks = strategy.apply(transform, ray_remote_args, self._blocks)
    return Dataset(new_blocks)
def map_batches(self,
                fn: Callable[[BatchType], BatchType],
                batch_size: Optional[int] = None,
                compute: Optional[str] = None,
                batch_format: str = "pandas",
                **ray_remote_args) -> "Dataset[Any]":
    """Apply the given function to batches of records of this dataset.

    This is a blocking operation.

    Examples:
        >>> # Transform batches in parallel.
        >>> ds.map_batches(lambda batch: [v * 2 for v in batch])

        >>> # Define a batch transform function that persists state across
        >>> # function invocations for efficiency with compute="actors".
        >>> def batch_infer_fn(batch):
        ...    global model
        ...    if model is None:
        ...        model = init_model()
        ...    return model(batch)

        >>> # Apply the transform in parallel on GPUs. Since
        >>> # compute="actors", the transform will be applied on an
        >>> # autoscaling pool of Ray actors, each allocated 1 GPU by Ray.
        >>> ds.map_batches(
        ...    batch_infer_fn,
        ...    batch_size=256, compute="actors", num_gpus=1)

    Time complexity: O(dataset size / parallelism)

    Args:
        fn: The function to apply to each record batch.
        batch_size: Request a specific batch size, or leave unspecified
            to use entire blocks as batches.
        compute: The compute strategy, either "tasks" (default) to use Ray
            tasks, or "actors" to use an autoscaling Ray actor pool. When
            using actors, state can be preserved across function
            invocations in Python global variables. This can be useful for
            one-time setups, e.g., initializing a model once and re-using
            it across many function applications.
        batch_format: Specify "pandas" to select ``pandas.DataFrame`` as
            the batch format, or "pyarrow" to select ``pyarrow.Table``.
        ray_remote_args: Additional resource requirements to request from
            ray (e.g., num_gpus=1 to request GPUs for the map tasks).

    Returns:
        A new ``Dataset`` wrapping the result of applying the per-block
        transform to this dataset's blocks.

    Raises:
        ValueError: If ``batch_size`` is given and is less than 1. The
            per-block transform additionally raises ``ValueError`` for an
            unsupported ``batch_format`` or when ``fn`` returns something
            other than a list, ``pandas.DataFrame`` or ``pyarrow.Table``.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError("Batch size cannot be negative or 0")

    import pyarrow as pa
    import pandas as pd

    def transform(block: Block[T]) -> Block[U]:
        total_rows = block.num_rows()
        max_batch_size = batch_size
        if max_batch_size is None:
            # Whole block as a single batch. Clamp to 1 so that an empty
            # block does not yield range(0, 0, 0), which raises
            # "ValueError: range() arg 3 must not be zero"; with the clamp
            # the loop below is simply skipped for empty blocks.
            max_batch_size = max(total_rows, 1)

        builder = DelegatingArrowBlockBuilder()
        for start in range(0, total_rows, max_batch_size):
            # Build a block for each batch.
            end = min(total_rows, start + max_batch_size)
            view = block.slice(start, end, copy=False)
            if batch_format == "pandas":
                view = view.to_pandas()
            elif batch_format == "pyarrow":
                view = view.to_arrow_table()
            else:
                raise ValueError(
                    f"The given batch format: {batch_format} "
                    f"is invalid. Supported batch type: {BatchType}")

            # Wrap the UDF output back into a block type the builder
            # accepts.
            applied = fn(view)
            if isinstance(applied, list):
                applied = SimpleBlock(applied)
            elif isinstance(applied, pd.core.frame.DataFrame):
                applied = ArrowBlock(pa.Table.from_pandas(applied))
            elif isinstance(applied, pa.Table):
                applied = ArrowBlock(applied)
            else:
                raise ValueError("The map batch UDF returns a type "
                                 f"{type(applied)}, which is not allowed. "
                                 "The return type must be either list, "
                                 "pandas.DataFrame, or pyarrow.Table")
            builder.add_block(applied)

        return builder.build()

    compute = get_compute(compute)
    return Dataset(compute.apply(transform, ray_remote_args, self._blocks))
def map_batches(self,
                fn: Union[CallableClass, Callable[[BatchType], BatchType]],
                batch_size: Optional[int] = None,
                compute: Optional[str] = None,
                batch_format: str = "pandas",
                **ray_remote_args) -> "Dataset[Any]":
    """Apply the given function to batches of records of this dataset.

    This is a blocking operation.

    Examples:
        >>> # Transform batches in parallel.
        >>> ds.map_batches(lambda batch: [v * 2 for v in batch])

        >>> # Define a callable class that persists state across
        >>> # function invocations for efficiency.
        >>> class CachedModel:
        ...    def __init__(self):
        ...        self.model = init_model()
        ...    def __call__(self, batch):
        ...        return self.model(batch)

        >>> # Apply the transform in parallel on GPUs. Since
        >>> # compute="actors", the transform will be applied on an
        >>> # autoscaling pool of Ray actors, each allocated 1 GPU by Ray.
        >>> ds.map_batches(
        ...    CachedModel,
        ...    batch_size=256, compute="actors", num_gpus=1)

    Time complexity: O(dataset size / parallelism)

    Args:
        fn: The function to apply to each record batch, or a class type
            that can be instantiated to create such a callable.
        batch_size: Request a specific batch size, or leave unspecified
            to use entire blocks as batches.
        compute: The compute strategy, either "tasks" (default) to use Ray
            tasks, or "actors" to use an autoscaling Ray actor pool.
        batch_format: Specify "pandas" to select ``pandas.DataFrame`` as
            the batch format, or "pyarrow" to select ``pyarrow.Table``.
        ray_remote_args: Additional resource requirements to request from
            ray (e.g., num_gpus=1 to request GPUs for the map tasks).

    Returns:
        A new ``Dataset`` wrapping the result of applying the per-block
        transform to this dataset's blocks.

    Raises:
        ValueError: If ``batch_size`` is given and is less than 1. The
            per-block transform additionally raises ``ValueError`` for an
            unsupported ``batch_format`` or when ``fn`` returns something
            other than a list, ``pandas.DataFrame`` or ``pyarrow.Table``.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError("Batch size cannot be negative or 0")

    import pyarrow as pa
    import pandas as pd

    # cache_wrapper is defined elsewhere; presumably it turns a callable
    # class into a plain callable that is instantiated once -- confirm.
    fn = cache_wrapper(fn)

    def transform(block: Block[T]) -> Block[U]:
        block = BlockAccessor.for_block(block)
        total_rows = block.num_rows()
        max_batch_size = batch_size
        if max_batch_size is None:
            # Whole block as a single batch. Clamp to 1 so that an empty
            # block does not yield range(0, 0, 0), which raises
            # "ValueError: range() arg 3 must not be zero"; with the clamp
            # the loop below is simply skipped for empty blocks.
            max_batch_size = max(total_rows, 1)

        builder = DelegatingArrowBlockBuilder()
        for start in range(0, total_rows, max_batch_size):
            # Build a block for each batch.
            end = min(total_rows, start + max_batch_size)
            view = block.slice(start, end, copy=False)
            if batch_format == "pandas":
                view = BlockAccessor.for_block(view).to_pandas()
            elif batch_format == "pyarrow":
                view = BlockAccessor.for_block(view).to_arrow_table()
            else:
                raise ValueError(
                    f"The given batch format: {batch_format} "
                    f"is invalid. Supported batch type: {BatchType}")

            applied = fn(view)
            # Lists and Arrow tables are handed to the builder unchanged;
            # only DataFrames need converting to an Arrow table first.
            if isinstance(applied, pd.core.frame.DataFrame):
                applied = pa.Table.from_pandas(applied)
            elif not isinstance(applied, (list, pa.Table)):
                raise ValueError("The map batch UDF returns a type "
                                 f"{type(applied)}, which is not allowed. "
                                 "The return type must be either list, "
                                 "pandas.DataFrame, or pyarrow.Table")
            builder.add_block(applied)

        return builder.build()

    compute = get_compute(compute)
    return Dataset(compute.apply(transform, ray_remote_args, self._blocks))
def map_batches(self,
                fn: Callable[[BatchType], BatchType],
                batch_size: int = None,
                compute: str = "tasks",
                batch_format: str = "pandas",
                **ray_remote_args) -> "Dataset[Any]":
    """Apply the given function to batches of records of this dataset.

    This is a blocking operation.

    Examples:
        >>> # Transform batches in parallel.
        >>> ds.map_batches(lambda batch: [v * 2 for v in batch])

        >>> # Transform batches in parallel on GPUs.
        >>> ds.map_batches(
        ...    batch_infer_fn,
        ...    batch_size=256, compute="actors", num_gpus=1)

    Time complexity: O(dataset size / parallelism)

    Args:
        fn: The function to apply to each record batch.
        batch_size: Request a specific batch size, or leave unspecified
            to use entire blocks as batches.
        compute: The compute strategy, either "tasks" to use Ray tasks,
            or "actors" to use an autoscaling Ray actor pool.
        batch_format: Specify "pandas" to select ``pandas.DataFrame`` as
            the batch format, or "pyarrow" to select ``pyarrow.Table``.
        ray_remote_args: Additional resource requirements to request from
            ray (e.g., num_gpus=1 to request GPUs for the map tasks).

    Returns:
        A new ``Dataset`` wrapping the result of applying the per-block
        transform to this dataset's blocks.

    Raises:
        ValueError: If ``batch_size`` is given and is less than 1. The
            per-block transform additionally raises ``ValueError`` for an
            unsupported ``batch_format`` or when ``fn`` returns something
            other than a list, ``pandas.DataFrame`` or ``pyarrow.Table``.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError("Batch size cannot be negative or 0")

    import pyarrow as pa
    import pandas as pd

    def transform(block: Block[T]) -> Block[U]:
        total_rows = block.num_rows()
        max_batch_size = batch_size
        if max_batch_size is None:
            # Whole block as a single batch. Clamp to 1 so that an empty
            # block does not yield range(0, 0, 0), which raises
            # "ValueError: range() arg 3 must not be zero"; with the clamp
            # the loop below is simply skipped for empty blocks.
            max_batch_size = max(total_rows, 1)

        builder = DelegatingArrowBlockBuilder()
        for start in range(0, total_rows, max_batch_size):
            # Build a block for each batch.
            end = min(total_rows, start + max_batch_size)
            # Note: if the block is a list, it doesn't support zero-copy.
            view = block.slice(start, end)
            if batch_format == "pandas":
                view = view.to_pandas()
            elif batch_format == "pyarrow":
                # Reads the block's private ``_table`` attribute directly;
                # presumably the underlying pyarrow.Table -- confirm.
                view = view._table
            else:
                raise ValueError(
                    f"The given batch format: {batch_format} "
                    f"is invalid. Supported batch type: {BatchType}")

            # Wrap the UDF output back into a block type the builder
            # accepts.
            applied = fn(view)
            if isinstance(applied, list):
                applied = ListBlock(applied)
            elif isinstance(applied, pd.core.frame.DataFrame):
                applied = ArrowBlock(pa.Table.from_pandas(applied))
            elif isinstance(applied, pa.Table):
                applied = ArrowBlock(applied)
            else:
                raise ValueError("The map batch UDF returns a type "
                                 f"{type(applied)}, which is not allowed. "
                                 "The return type must be either list, "
                                 "pandas.DataFrame, or pyarrow.Table")
            builder.add_block(applied)

        return builder.build()

    compute = get_compute(compute)
    return Dataset(compute.apply(transform, ray_remote_args, self._blocks))