def from_items(items: List[Any], parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> ds.from_items([1, 2, 3, 4, 5])

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.

    Returns:
        Dataset holding the items.
    """
    block_size = max(1, len(items) // parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        builder = SimpleBlock.builder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockMetadata(
                num_rows=block.num_rows(),
                size_bytes=block.size_bytes(),
                schema=type(items[0]),
                input_files=None))
        i += block_size

    return Dataset(BlockList(blocks, metadata))
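# A hedged usage sketch (not part of the library source): it calls from_items
# exactly as defined above and assumes a Ray runtime has been started. The item
# count and parallelism are illustrative values.
import ray

ray.init()
dataset = from_items(list(range(1000)), parallelism=4)
# block_size = max(1, 1000 // 4) = 250, so the 1000 items are packed into four
# SimpleBlocks of 250 rows each, each stored in the object store via ray.put().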
def wrapped_fn(block: Block, meta: BlockMetadata):
    # Apply the captured transform ``fn`` and re-derive the block's metadata,
    # carrying over the input file provenance from the source block.
    new_block = fn(block)
    new_meta = BlockMetadata(
        num_rows=new_block.num_rows(),
        size_bytes=new_block.size_bytes(),
        schema=new_block.schema(),
        input_files=meta.input_files)
    return new_block, new_meta
def truncate(block: Block[T], meta: BlockMetadata,
             count: int) -> (Block[T], BlockMetadata):
    logger.debug("Truncating last block to size: {}".format(count))
    new_block = block.slice(0, count, copy=True)
    new_meta = BlockMetadata(
        num_rows=new_block.num_rows(),
        size_bytes=new_block.size_bytes(),
        schema=meta.schema,
        input_files=meta.input_files)
    return new_block, new_meta
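# A hedged sketch of how a limit-style operation could use truncate() above:
# keep whole blocks while they fit under ``count``, then truncate the first
# block that overshoots. The name limit_blocks and the local-call style are
# illustrative assumptions, not the actual Dataset.limit() implementation.
def limit_blocks(blocks: List[Block], metadata: List[BlockMetadata],
                 count: int) -> (List[Block], List[BlockMetadata]):
    out_blocks, out_meta, taken = [], [], 0
    for block, meta in zip(blocks, metadata):
        if taken + meta.num_rows <= count:
            # The whole block fits under the limit.
            out_blocks.append(block)
            out_meta.append(meta)
            taken += meta.num_rows
        else:
            # Only part of this block is needed; cut it down and stop.
            new_block, new_meta = truncate(block, meta, count - taken)
            out_blocks.append(new_block)
            out_meta.append(new_meta)
            taken = count
        if taken >= count:
            break
    return out_blocks, out_meta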
def process_block(
        self, block: Block[T],
        meta: BlockMetadata) -> (Block[U], BlockMetadata):
    # Apply the captured transform ``fn`` to a single block and rebuild its
    # metadata, preserving the input file provenance.
    new_block = fn(block)
    new_metadata = BlockMetadata(
        num_rows=new_block.num_rows(),
        size_bytes=new_block.size_bytes(),
        schema=new_block.schema(),
        input_files=meta.input_files)
    return new_block, new_metadata
def shuffle_reduce(
        *mapper_outputs: List[Block[T]]) -> (Block[T], BlockMetadata):
    # Each reducer receives exactly one intermediate block from every mapper
    # and concatenates them into a single output block.
    builder = DelegatingArrowBlockBuilder()
    assert len(mapper_outputs) == input_num_blocks
    for block in mapper_outputs:
        builder.add_block(block)
    new_block = builder.build()
    new_metadata = BlockMetadata(
        num_rows=new_block.num_rows(),
        size_bytes=new_block.size_bytes(),
        schema=new_block.schema(),
        input_files=None)
    return new_block, new_metadata
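# A hedged sketch of the reduce-side fan-in implied by the assertion above:
# reducer j is handed one intermediate block from each of the input_num_blocks
# mappers (column j of the mapper-output matrix). Assumptions: shuffle_reduce
# has been declared as a Ray remote function with num_returns=2, this code sits
# in the same enclosing shuffle function that defines input_num_blocks, and
# map_results / output_num_blocks are illustrative names rather than the actual
# shuffle driver code.
reduce_out = [
    shuffle_reduce.remote(
        *[map_results[i][j] for i in range(input_num_blocks)])
    for j in range(output_num_blocks)
]
# Each remote call yields two object refs: the merged block and its metadata.
new_blocks, new_metadata = map(list, zip(*reduce_out))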
def read_parquet(paths: Union[str, List[str]],
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 columns: Optional[List[str]] = None,
                 parallelism: int = 200,
                 **arrow_parquet_args) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from parquet files.

    Examples:
        # Read a directory of files in remote storage.
        >>> ds.read_parquet("s3://bucket/path")

        # Read multiple local files.
        >>> ds.read_parquet(["/path/to/file1", "/path/to/file2"])

    Args:
        paths: A single file path or a list of file paths (or directories).
        filesystem: The filesystem implementation to read from.
        columns: A list of column names to read.
        parallelism: The amount of parallelism to use for the dataset.
        arrow_parquet_args: Other parquet read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    import pyarrow.parquet as pq

    pq_ds = pq.ParquetDataset(
        paths, filesystem=filesystem, **arrow_parquet_args)

    # Assign dataset pieces to read tasks round-robin, then drop empty tasks.
    read_tasks = [[] for _ in builtins.range(parallelism)]
    # TODO(ekl) support reading row groups (maybe as an option)
    for i, piece in enumerate(pq_ds.pieces):
        read_tasks[i % len(read_tasks)].append(piece)
    nonempty_tasks = [r for r in read_tasks if r]
    partitions = pq_ds.partitions

    @ray.remote
    def gen_read(pieces: List[pq.ParquetDatasetPiece]):
        import pyarrow
        logger.debug("Reading {} parquet pieces".format(len(pieces)))
        tables = [
            piece.read(
                columns=columns, use_threads=False, partitions=partitions)
            for piece in pieces
        ]
        if len(tables) > 1:
            table = pyarrow.concat_tables(tables)
        else:
            table = tables[0]
        return ArrowBlock(table)

    # Defer the actual reads: each call launches gen_read only when invoked,
    # while the block metadata is computed eagerly from the parquet footers.
    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []
    for pieces in nonempty_tasks:
        calls.append(lambda pieces=pieces: gen_read.remote(pieces))
        piece_metadata = [p.get_metadata() for p in pieces]
        metadata.append(
            BlockMetadata(
                num_rows=sum(m.num_rows for m in piece_metadata),
                size_bytes=sum(
                    sum(
                        m.row_group(i).total_byte_size
                        for i in builtins.range(m.num_row_groups))
                    for m in piece_metadata),
                schema=piece_metadata[0].schema.to_arrow_schema(),
                input_files=[p.path for p in pieces]))

    return Dataset(LazyBlockList(calls, metadata))
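# A hedged usage sketch (not part of the library source): it calls read_parquet
# as defined above and assumes a running Ray instance; the path and column
# names are placeholders, not real data.
import ray

ray.init()
dataset = read_parquet(
    "/path/to/parquet_dir", columns=["user_id", "score"], parallelism=8)
# At most 8 read tasks are created; each task reads its share of the parquet
# pieces with piece.read() and concatenates them into one ArrowBlock. Because
# the Dataset is backed by a LazyBlockList, the gen_read tasks are only
# launched when the blocks are actually consumed.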