def read_datasource(datasource: Datasource[T],
                    *,
                    parallelism: int = 200,
                    ray_remote_args: Dict[str, Any] = None,
                    **read_args) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read.
        read_args: Additional kwargs to pass to the datasource impl.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.

    Returns:
        Dataset holding the data read from the datasource.
    """
    read_tasks = datasource.prepare_read(parallelism, **read_args)

    def remote_read(task: ReadTask) -> Block:
        return task()

    if ray_remote_args:
        remote_read = ray.remote(**ray_remote_args)(remote_read)
    else:
        remote_read = ray.remote(remote_read)

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []

    for task in read_tasks:
        calls.append(lambda task=task: remote_read.remote(task))
        metadata.append(task.get_metadata())

    block_list = LazyBlockList(calls, metadata)

    # Get the schema from the first block synchronously.
    if metadata and metadata[0].schema is None:
        get_schema = cached_remote_fn(_get_schema)
        schema0 = ray.get(get_schema.remote(next(iter(block_list))))
        block_list.set_metadata(
            0,
            BlockMetadata(
                num_rows=metadata[0].num_rows,
                size_bytes=metadata[0].size_bytes,
                schema=schema0,
                input_files=metadata[0].input_files,
            ))

    return Dataset(block_list)
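
# Illustrative usage sketch for read_datasource() above; not part of the
# function itself. It defines a toy Datasource whose prepare_read() emits one
# ReadTask per block of a simple integer range. The ReadTask(read_fn, metadata)
# argument order, the BlockMetadata fields, and the use of a plain Python list
# as a Block are assumptions about this Ray version; the class name is made up.
class SimpleRangeDatasource(Datasource[int]):
    def prepare_read(self, parallelism: int, n: int) -> List[ReadTask]:
        # Split [0, n) into roughly `parallelism` contiguous blocks.
        read_tasks: List[ReadTask] = []
        block_size = max(1, n // parallelism)
        for start in range(0, n, block_size):
            count = min(block_size, n - start)
            meta = BlockMetadata(
                num_rows=count,
                size_bytes=8 * count,
                schema=int,
                input_files=None)
            read_tasks.append(
                ReadTask(
                    lambda start=start, count=count: list(
                        range(start, start + count)), meta))
        return read_tasks


# ds = read_datasource(SimpleRangeDatasource(), parallelism=4, n=1000)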
def read_datasource(datasource: Datasource[T],
                    *,
                    parallelism: int = 200,
                    ray_remote_args: Dict[str, Any] = None,
                    **read_args) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read.
        read_args: Additional kwargs to pass to the datasource impl.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.

    Returns:
        Dataset holding the data read from the datasource.
    """
    read_tasks = datasource.prepare_read(parallelism, **read_args)

    def remote_read(task: ReadTask) -> Block:
        return task()

    if ray_remote_args is None:
        ray_remote_args = {}
    # Increase the read parallelism by default to maximize IO throughput. This
    # is particularly important when reading from e.g., remote storage.
    if "num_cpus" not in ray_remote_args:
        # Note that the too many workers warning triggers at 4x subscription,
        # so we go at 0.5 to avoid the warning message.
        ray_remote_args["num_cpus"] = 0.5
    remote_read = cached_remote_fn(remote_read)

    read_spread_custom_resource_labels = os.getenv(
        "RAY_DATASETS_READ_SPREAD_CUSTOM_RESOURCE_LABELS", None)
    if read_spread_custom_resource_labels is not None:
        read_spread_custom_resource_labels = (
            read_spread_custom_resource_labels.split(","))
        round_robin_resource_provider = itertools.cycle(
            map(lambda resource: {resource: 0.001},
                read_spread_custom_resource_labels))
    else:
        round_robin_resource_provider = itertools.repeat({})

    resource_iter = iter(round_robin_resource_provider)

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []

    for task in read_tasks:
        calls.append(
            lambda task=task, resources=next(resource_iter): remote_read.
            options(**ray_remote_args, resources=resources).remote(task))
        metadata.append(task.get_metadata())

    block_list = LazyBlockList(calls, metadata)

    # Get the schema from the first block synchronously.
    if metadata and metadata[0].schema is None:
        get_schema = cached_remote_fn(_get_schema)
        schema0 = ray.get(get_schema.remote(next(iter(block_list))))
        block_list.set_metadata(
            0,
            BlockMetadata(
                num_rows=metadata[0].num_rows,
                size_bytes=metadata[0].size_bytes,
                schema=schema0,
                input_files=metadata[0].input_files,
            ))

    return Dataset(block_list)
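
# Illustrative sketch of the RAY_DATASETS_READ_SPREAD_CUSTOM_RESOURCE_LABELS
# handling above; the resource names are made up. If the cluster nodes expose
# custom resources "datasets_node_0" and "datasets_node_1", setting the env
# var to "datasets_node_0,datasets_node_1" makes consecutive read tasks
# request those resources round-robin (0.001 each, so they only steer
# placement). The standalone snippet below reproduces just that cycling logic.
import itertools
import os

os.environ["RAY_DATASETS_READ_SPREAD_CUSTOM_RESOURCE_LABELS"] = (
    "datasets_node_0,datasets_node_1")
labels = os.getenv(
    "RAY_DATASETS_READ_SPREAD_CUSTOM_RESOURCE_LABELS").split(",")
resource_iter = iter(
    itertools.cycle({label: 0.001} for label in labels))
print([next(resource_iter) for _ in range(4)])
# -> [{'datasets_node_0': 0.001}, {'datasets_node_1': 0.001},
#     {'datasets_node_0': 0.001}, {'datasets_node_1': 0.001}]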
def read_datasource(datasource: Datasource[T],
                    *,
                    parallelism: int = 200,
                    ray_remote_args: Dict[str, Any] = None,
                    _spread_resource_prefix: Optional[str] = None,
                    **read_args) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read.
        read_args: Additional kwargs to pass to the datasource impl.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.

    Returns:
        Dataset holding the data read from the datasource.
    """
    read_tasks = datasource.prepare_read(parallelism, **read_args)

    def remote_read(task: ReadTask) -> Block:
        return task()

    if ray_remote_args is None:
        ray_remote_args = {}
    # Increase the read parallelism by default to maximize IO throughput. This
    # is particularly important when reading from e.g., remote storage.
    if "num_cpus" not in ray_remote_args:
        # Note that the too many workers warning triggers at 4x subscription,
        # so we go at 0.5 to avoid the warning message.
        ray_remote_args["num_cpus"] = 0.5
    remote_read = cached_remote_fn(remote_read)

    if _spread_resource_prefix is not None:
        # Use given spread resource prefix for round-robin resource-based
        # scheduling.
        nodes = ray.nodes()
        resource_iter = _get_spread_resources_iter(nodes,
                                                   _spread_resource_prefix,
                                                   ray_remote_args)
    else:
        # If no spread resource prefix given, yield an empty dictionary.
        resource_iter = itertools.repeat({})

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []

    for task in read_tasks:
        calls.append(
            lambda task=task, resources=next(resource_iter): remote_read.
            options(**ray_remote_args, resources=resources).remote(task))
        metadata.append(task.get_metadata())

    block_list = LazyBlockList(calls, metadata)

    # Get the schema from the first block synchronously.
    if metadata and metadata[0].schema is None:
        get_schema = cached_remote_fn(_get_schema)
        schema0 = ray.get(get_schema.remote(next(iter(block_list))))
        block_list.set_metadata(
            0,
            BlockMetadata(
                num_rows=metadata[0].num_rows,
                size_bytes=metadata[0].size_bytes,
                schema=schema0,
                input_files=metadata[0].input_files,
            ))

    return Dataset(block_list)
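
# Illustrative call sketch for the _spread_resource_prefix variant above; the
# resource names and the datasource are made up. Assuming each node in the
# cluster was started with a custom resource whose name begins with "bar:"
# (e.g. "bar:node_a", "bar:node_b"), passing _spread_resource_prefix="bar:"
# lets _get_spread_resources_iter() (defined elsewhere in this module)
# round-robin the read tasks over the matching node resources.
#
# ds = read_datasource(
#     SimpleRangeDatasource(),  # hypothetical datasource from the sketch above
#     parallelism=8,
#     _spread_resource_prefix="bar:",
#     n=10_000)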