def from_items(items: List[Any], parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> ds.from_items([1, 2, 3, 4, 5])

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.

    Returns:
        Dataset holding the items.
    """
    block_size = max(1, len(items) // parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        builder = SimpleBlock.builder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockMetadata(
                num_rows=block.num_rows(),
                size_bytes=block.size_bytes(),
                schema=type(items[0]),
                input_files=None))
        i += block_size

    return Dataset(BlockList(blocks, metadata))
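# A hedged usage sketch (not part of the library source): it calls from_items
# exactly as defined above and assumes a Ray runtime has been started. The item
# count and parallelism are illustrative values.
import ray

ray.init()
dataset = from_items(list(range(1000)), parallelism=4)
# block_size = max(1, 1000 // 4) = 250, so the 1000 items are packed into four
# SimpleBlocks of 250 rows each, each stored in the object store via ray.put().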
def wrapped_fn(block: Block, meta: BlockMetadata):
    # Apply the captured transform ``fn`` and re-derive the block's metadata,
    # carrying over the input file provenance from the source block.
    new_block = fn(block)
    new_meta = BlockMetadata(
        num_rows=new_block.num_rows(),
        size_bytes=new_block.size_bytes(),
        schema=new_block.schema(),
        input_files=meta.input_files)
    return new_block, new_meta
def truncate(block: Block[T], meta: BlockMetadata,
             count: int) -> (Block[T], BlockMetadata):
    logger.debug("Truncating last block to size: {}".format(count))
    new_block = block.slice(0, count, copy=True)
    new_meta = BlockMetadata(
        num_rows=new_block.num_rows(),
        size_bytes=new_block.size_bytes(),
        schema=meta.schema,
        input_files=meta.input_files)
    return new_block, new_meta
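# A hedged sketch of how a limit-style operation could use truncate() above:
# keep whole blocks while they fit under ``count``, then truncate the first
# block that overshoots. The name limit_blocks and the local-call style are
# illustrative assumptions, not the actual Dataset.limit() implementation.
def limit_blocks(blocks: List[Block], metadata: List[BlockMetadata],
                 count: int) -> (List[Block], List[BlockMetadata]):
    out_blocks, out_meta, taken = [], [], 0
    for block, meta in zip(blocks, metadata):
        if taken + meta.num_rows <= count:
            # The whole block fits under the limit.
            out_blocks.append(block)
            out_meta.append(meta)
            taken += meta.num_rows
        else:
            # Only part of this block is needed; cut it down and stop.
            new_block, new_meta = truncate(block, meta, count - taken)
            out_blocks.append(new_block)
            out_meta.append(new_meta)
            taken = count
        if taken >= count:
            break
    return out_blocks, out_meta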
def process_block(
        self, block: Block[T],
        meta: BlockMetadata) -> (Block[U], BlockMetadata):
    # Apply the captured transform ``fn`` to a single block and rebuild its
    # metadata, preserving the input file provenance.
    new_block = fn(block)
    new_metadata = BlockMetadata(
        num_rows=new_block.num_rows(),
        size_bytes=new_block.size_bytes(),
        schema=new_block.schema(),
        input_files=meta.input_files)
    return new_block, new_metadata
def shuffle_reduce(
        *mapper_outputs: List[Block[T]]) -> (Block[T], BlockMetadata):
    # Each reducer receives exactly one intermediate block from every mapper
    # and concatenates them into a single output block.
    builder = DelegatingArrowBlockBuilder()
    assert len(mapper_outputs) == input_num_blocks
    for block in mapper_outputs:
        builder.add_block(block)
    new_block = builder.build()
    new_metadata = BlockMetadata(
        num_rows=new_block.num_rows(),
        size_bytes=new_block.size_bytes(),
        schema=new_block.schema(),
        input_files=None)
    return new_block, new_metadata
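# A hedged sketch of the reduce-side fan-in implied by the assertion above:
# reducer j is handed one intermediate block from each of the input_num_blocks
# mappers (column j of the mapper-output matrix). Assumptions: shuffle_reduce
# has been declared as a Ray remote function with num_returns=2, this code sits
# in the same enclosing shuffle function that defines input_num_blocks, and
# map_results / output_num_blocks are illustrative names rather than the actual
# shuffle driver code.
reduce_out = [
    shuffle_reduce.remote(
        *[map_results[i][j] for i in range(input_num_blocks)])
    for j in range(output_num_blocks)
]
# Each remote call yields two object refs: the merged block and its metadata.
new_blocks, new_metadata = map(list, zip(*reduce_out))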
def read_parquet(paths: Union[str, List[str]],
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 columns: Optional[List[str]] = None,
                 parallelism: int = 200,
                 **arrow_parquet_args) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from parquet files.

    Examples:
        # Read a directory of files in remote storage.
        >>> ds.read_parquet("s3://bucket/path")

        # Read multiple local files.
        >>> ds.read_parquet(["/path/to/file1", "/path/to/file2"])

    Args:
        paths: A single file path or a list of file paths (or directories).
        filesystem: The filesystem implementation to read from.
        columns: A list of column names to read.
        parallelism: The amount of parallelism to use for the dataset.
        arrow_parquet_args: Other parquet read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    import pyarrow.parquet as pq

    pq_ds = pq.ParquetDataset(
        paths, filesystem=filesystem, **arrow_parquet_args)

    # Assign dataset pieces to read tasks round-robin, then drop empty tasks.
    read_tasks = [[] for _ in builtins.range(parallelism)]
    # TODO(ekl) support reading row groups (maybe as an option)
    for i, piece in enumerate(pq_ds.pieces):
        read_tasks[i % len(read_tasks)].append(piece)
    nonempty_tasks = [r for r in read_tasks if r]
    partitions = pq_ds.partitions

    @ray.remote
    def gen_read(pieces: List[pq.ParquetDatasetPiece]):
        import pyarrow
        logger.debug("Reading {} parquet pieces".format(len(pieces)))
        tables = [
            piece.read(
                columns=columns, use_threads=False, partitions=partitions)
            for piece in pieces
        ]
        if len(tables) > 1:
            table = pyarrow.concat_tables(tables)
        else:
            table = tables[0]
        return ArrowBlock(table)

    # Defer the actual reads: each call launches gen_read only when invoked,
    # while the block metadata is computed eagerly from the parquet footers.
    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []
    for pieces in nonempty_tasks:
        calls.append(lambda pieces=pieces: gen_read.remote(pieces))
        piece_metadata = [p.get_metadata() for p in pieces]
        metadata.append(
            BlockMetadata(
                num_rows=sum(m.num_rows for m in piece_metadata),
                size_bytes=sum(
                    sum(
                        m.row_group(i).total_byte_size
                        for i in builtins.range(m.num_row_groups))
                    for m in piece_metadata),
                schema=piece_metadata[0].schema.to_arrow_schema(),
                input_files=[p.path for p in pieces]))

    return Dataset(LazyBlockList(calls, metadata))
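# A hedged usage sketch (not part of the library source): it calls read_parquet
# as defined above and assumes a running Ray instance; the path and column
# names are placeholders, not real data.
import ray

ray.init()
dataset = read_parquet(
    "/path/to/parquet_dir", columns=["user_id", "score"], parallelism=8)
# At most 8 read tasks are created; each task reads its share of the parquet
# pieces with piece.read() and concatenates them into one ArrowBlock. Because
# the Dataset is backed by a LazyBlockList, the gen_read tasks are only
# launched when the blocks are actually consumed.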