Example #1
def from_items(items: List[Any], parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> ds.from_items([1, 2, 3, 4, 5])

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.

    Returns:
        Dataset holding the items.
    """
    block_size = max(1, len(items) // parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        builder = SimpleBlock.builder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockMetadata(num_rows=block.num_rows(),
                          size_bytes=block.size_bytes(),
                          schema=type(items[0]),
                          input_files=None))
        i += block_size

    return Dataset(BlockList(blocks, metadata))
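
The partitioning step above is easy to check in isolation. The following is a standalone sketch of just that step with no Ray dependency; the helper name split_into_blocks is hypothetical and not part of the API, but the sizing rule is the same max(1, len(items) // parallelism) used by from_items.

from typing import Any, List

def split_into_blocks(items: List[Any],
                      parallelism: int = 200) -> List[List[Any]]:
    # Same sizing rule as from_items: at least one item per block, and
    # roughly `parallelism` blocks (any remainder spills into extra blocks).
    block_size = max(1, len(items) // parallelism)
    chunks = []
    i = 0
    while i < len(items):
        chunks.append(items[i:i + block_size])
        i += block_size
    return chunks

# 5 items at parallelism=2 -> block_size=2 -> three blocks, not two.
assert split_into_blocks([1, 2, 3, 4, 5], parallelism=2) == [[1, 2], [3, 4], [5]]
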
Example #2
def wrapped_fn(block: Block, meta: BlockMetadata):
    # Apply the user function to the block and rebuild its metadata:
    # row count, size, and schema are recomputed from the new block,
    # while the input-file lineage is carried over from the old metadata.
    new_block = fn(block)
    new_meta = BlockMetadata(num_rows=new_block.num_rows(),
                             size_bytes=new_block.size_bytes(),
                             schema=new_block.schema(),
                             input_files=meta.input_files)
    return new_block, new_meta
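
The wrapping pattern can be illustrated without Ray's Block machinery. In the sketch below, plain lists stand in for blocks and a dict stands in for BlockMetadata (both stand-ins are assumptions for illustration): the row count is recomputed from the transformed block while the input-file lineage is copied from the old metadata, mirroring wrapped_fn above.

def make_wrapped_fn(fn):
    def wrapped_fn(block, meta):
        new_block = fn(block)
        # Row count is recomputed from the result; lineage carries over.
        new_meta = {"num_rows": len(new_block),
                    "input_files": meta["input_files"]}
        return new_block, new_meta
    return wrapped_fn

double = make_wrapped_fn(lambda rows: [r * 2 for r in rows])
rows, meta = double([1, 2, 3], {"input_files": None})
assert rows == [2, 4, 6] and meta["num_rows"] == 3
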
Example #3
def truncate(block: Block[T], meta: BlockMetadata,
             count: int) -> Tuple[Block[T], BlockMetadata]:
    logger.debug("Truncating last block to size: {}".format(count))
    new_block = block.slice(0, count, copy=True)
    new_meta = BlockMetadata(num_rows=new_block.num_rows(),
                             size_bytes=new_block.size_bytes(),
                             schema=meta.schema,
                             input_files=meta.input_files)
    return new_block, new_meta
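
Truncation like this is what lets a row-limit operation land exactly on the requested count: whole blocks are kept until the row budget runs out, and only the final block is sliced. Below is a self-contained sketch of that planning step; the helper name plan_limit is hypothetical.

from typing import List

def plan_limit(block_row_counts: List[int], limit: int) -> List[int]:
    # Keep whole blocks while the row budget allows; the final kept block
    # is truncated to whatever remains of the budget.
    kept = []
    remaining = limit
    for num_rows in block_row_counts:
        if remaining <= 0:
            break
        kept.append(min(num_rows, remaining))
        remaining -= num_rows
    return kept

# Three 4-row blocks limited to 10 rows: two kept whole, the last cut to 2.
assert plan_limit([4, 4, 4], limit=10) == [4, 4, 2]
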
Example #4
def process_block(
        self, block: Block[T],
        meta: BlockMetadata) -> Tuple[Block[U], BlockMetadata]:
    # Transform the block with the user-provided fn and rebuild the
    # metadata from the result, preserving the input-file lineage.
    new_block = fn(block)
    new_metadata = BlockMetadata(num_rows=new_block.num_rows(),
                                 size_bytes=new_block.size_bytes(),
                                 schema=new_block.schema(),
                                 input_files=meta.input_files)
    return new_block, new_metadata
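
For context, a per-block processor like this would typically be dispatched as one Ray task per block. The sketch below is a minimal, hedged version of that idea using a free function and plain Python lists as blocks; it is illustrative only and not the library's actual execution path.

import ray

@ray.remote(num_returns=2)
def run_process_block(fn, block, meta):
    # Apply the transform and build lightweight metadata for the result.
    new_block = fn(block)
    new_meta = {"num_rows": len(new_block),
                "input_files": meta.get("input_files")}
    return new_block, new_meta

# Usage (requires ray.init() beforehand):
#   block_ref, meta_ref = run_process_block.remote(
#       lambda rows: [r + 1 for r in rows], [1, 2, 3], {"input_files": None})
#   assert ray.get(block_ref) == [2, 3, 4]
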
Example #5
def shuffle_reduce(
        *mapper_outputs: Block[T]) -> Tuple[Block[T], BlockMetadata]:
    # Concatenate one output block from every mapper into a single
    # reduced block, then compute fresh metadata for it.
    builder = DelegatingArrowBlockBuilder()
    assert len(mapper_outputs) == input_num_blocks
    for block in mapper_outputs:
        builder.add_block(block)
    new_block = builder.build()
    new_metadata = BlockMetadata(num_rows=new_block.num_rows(),
                                 size_bytes=new_block.size_bytes(),
                                 schema=new_block.schema(),
                                 input_files=None)
    return new_block, new_metadata
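
This reducer is the second half of an all-to-all shuffle: each mapper splits its block into one partition per reducer, and reducer i concatenates partition i from every mapper. The sketch below shows that exchange with plain lists; the function name shuffle and the strided partitioning are illustrative assumptions, with list concatenation standing in for DelegatingArrowBlockBuilder.

from typing import Any, List

def shuffle(blocks: List[List[Any]], num_reducers: int) -> List[List[Any]]:
    # Map side: split every block into `num_reducers` partitions.
    mapper_outputs = [[block[i::num_reducers] for i in range(num_reducers)]
                      for block in blocks]
    # Reduce side: reducer i concatenates partition i from every mapper.
    return [sum((parts[i] for parts in mapper_outputs), [])
            for i in range(num_reducers)]

reduced = shuffle([[1, 2, 3, 4], [5, 6]], num_reducers=2)
assert reduced == [[1, 3, 5], [2, 4, 6]]
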
Example #6
def read_parquet(paths: Union[str, List[str]],
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 columns: Optional[List[str]] = None,
                 parallelism: int = 200,
                 **arrow_parquet_args) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from parquet files.

    Examples:
        # Read a directory of files in remote storage.
        >>> ds.read_parquet("s3://bucket/path")

        # Read multiple local files.
        >>> ds.read_parquet(["/path/to/file1", "/path/to/file2"])

    Args:
        paths: A single file path or a list of file paths (or directories).
        filesystem: The filesystem implementation to read from.
        columns: A list of column names to read.
        parallelism: The amount of parallelism to use for the dataset.
        arrow_parquet_args: Other parquet read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    import pyarrow.parquet as pq

    pq_ds = pq.ParquetDataset(paths, filesystem=filesystem,
                              **arrow_parquet_args)

    read_tasks = [[] for _ in builtins.range(parallelism)]
    # TODO(ekl) support reading row groups (maybe as an option)
    for i, piece in enumerate(pq_ds.pieces):
        read_tasks[i % len(read_tasks)].append(piece)
    nonempty_tasks = [r for r in read_tasks if r]
    partitions = pq_ds.partitions

    @ray.remote
    def gen_read(pieces: List[pq.ParquetDatasetPiece]):
        import pyarrow
        logger.debug("Reading {} parquet pieces".format(len(pieces)))
        tables = [
            piece.read(columns=columns,
                       use_threads=False,
                       partitions=partitions) for piece in pieces
        ]
        if len(tables) > 1:
            table = pyarrow.concat_tables(tables)
        else:
            table = tables[0]
        return ArrowBlock(table)

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []
    for pieces in nonempty_tasks:
        calls.append(lambda pieces=pieces: gen_read.remote(pieces))
        piece_metadata = [p.get_metadata() for p in pieces]
        metadata.append(
            BlockMetadata(num_rows=sum(m.num_rows for m in piece_metadata),
                          size_bytes=sum(
                              sum(
                                  m.row_group(i).total_byte_size
                                  for i in builtins.range(m.num_row_groups))
                              for m in piece_metadata),
                          schema=piece_metadata[0].schema.to_arrow_schema(),
                          input_files=[p.path for p in pieces]))

    return Dataset(LazyBlockList(calls, metadata))
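
The piece-to-task assignment above is a plain round-robin with empty buckets dropped. Below is a standalone sketch of just that step, with no pyarrow dependency; the helper name round_robin is hypothetical.

from typing import Any, List

def round_robin(pieces: List[Any], parallelism: int) -> List[List[Any]]:
    # Piece i goes to task i % parallelism; empty tasks are dropped, so a
    # short input never produces more read tasks than there are pieces.
    tasks: List[List[Any]] = [[] for _ in range(parallelism)]
    for i, piece in enumerate(pieces):
        tasks[i % parallelism].append(piece)
    return [t for t in tasks if t]

tasks = round_robin(["a", "b", "c", "d", "e"], parallelism=2)
assert tasks == [["a", "c", "e"], ["b", "d"]]
assert round_robin(["a"], parallelism=8) == [["a"]]
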