Example #1
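All three examples assume the module-level imports of the surrounding read API. A reconstruction is sketched below; the exact import paths are assumptions based on a Ray 1.x-era source layout, and `_get_schema` / `_get_spread_resources_iter` are private helpers of the same module, not shown in the examples.

import itertools
import os
from typing import Any, Callable, Dict, List, Optional

import ray
from ray.types import ObjectRef

# Paths below are assumptions; in a Ray 1.x-era tree these lived under
# ray.data (in earlier releases, ray.experimental.data).
from ray.data.block import Block, BlockMetadata, T
from ray.data.datasource import Datasource, ReadTask
from ray.data.dataset import Dataset
from ray.data.impl.lazy_block_list import LazyBlockList
from ray.data.impl.remote_fn import cached_remote_fn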
def read_datasource(datasource: Datasource[T],
                    *,
                    parallelism: int = 200,
                    ray_remote_args: Optional[Dict[str, Any]] = None,
                    **read_args) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.
        read_args: Additional kwargs to pass to the datasource impl.

    Returns:
        Dataset holding the data read from the datasource.
    """

    read_tasks = datasource.prepare_read(parallelism, **read_args)

    def remote_read(task: ReadTask) -> Block:
        return task()

    if ray_remote_args:
        remote_read = ray.remote(**ray_remote_args)(remote_read)
    else:
        remote_read = ray.remote(remote_read)

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []

    for task in read_tasks:
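        # Bind `task` through a default argument so each deferred call
        # captures the current task, not the loop variable's final value.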
        calls.append(lambda task=task: remote_read.remote(task))
        metadata.append(task.get_metadata())

    block_list = LazyBlockList(calls, metadata)

    # Get the schema from the first block synchronously.
    if metadata and metadata[0].schema is None:
        get_schema = cached_remote_fn(_get_schema)
        schema0 = ray.get(get_schema.remote(next(iter(block_list))))
        block_list.set_metadata(
            0,
            BlockMetadata(
                num_rows=metadata[0].num_rows,
                size_bytes=metadata[0].size_bytes,
                schema=schema0,
                input_files=metadata[0].input_files,
            ))

    return Dataset(block_list)
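For context, here is a minimal end-to-end sketch of a datasource this reader can consume. It assumes only the interface the code above exercises: `prepare_read()` returns `ReadTask` objects that are callable, carry `BlockMetadata`, and (an assumption) can be constructed from a zero-argument read function plus that metadata. `RangeDatasource` is illustrative, not part of the example above.

from typing import List

class RangeDatasource(Datasource[int]):
    """Illustrative datasource producing the integers 0..n-1."""

    def __init__(self, n: int):
        self._n = n

    def prepare_read(self, parallelism: int, **read_args) -> List[ReadTask]:
        read_tasks = []
        chunk = max(1, self._n // parallelism)
        for start in range(0, self._n, chunk):
            end = min(start + chunk, self._n)
            meta = BlockMetadata(
                num_rows=end - start,
                size_bytes=8 * (end - start),  # rough estimate
                schema=int,
                input_files=None,
            )
            # Assumes ReadTask(read_fn, metadata); start/end are bound as
            # defaults so each task reads its own range.
            read_tasks.append(
                ReadTask(
                    lambda start=start, end=end: list(range(start, end)),
                    meta))
        return read_tasks

ds = read_datasource(RangeDatasource(1000), parallelism=8)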
Example #2
def read_datasource(datasource: Datasource[T],
                    *,
                    parallelism: int = 200,
                    ray_remote_args: Optional[Dict[str, Any]] = None,
                    **read_args) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.
        read_args: Additional kwargs to pass to the datasource impl.

    Returns:
        Dataset holding the data read from the datasource.
    """

    read_tasks = datasource.prepare_read(parallelism, **read_args)

    def remote_read(task: ReadTask) -> Block:
        return task()

    if ray_remote_args is None:
        ray_remote_args = {}
    # Increase the read parallelism by default to maximize IO throughput.
    # This is particularly important when reading from, e.g., remote storage.
    if "num_cpus" not in ray_remote_args:
        # Fractional CPUs oversubscribe cores; Ray's "too many workers"
        # warning triggers at 4x subscription, so 0.5 (i.e., 2x) stays
        # safely below it.
        ray_remote_args["num_cpus"] = 0.5
    remote_read = cached_remote_fn(remote_read)

    read_spread_custom_resource_labels = os.getenv(
        "RAY_DATASETS_READ_SPREAD_CUSTOM_RESOURCE_LABELS", None)
    if read_spread_custom_resource_labels is not None:
        read_spread_custom_resource_labels = (
            read_spread_custom_resource_labels.split(","))
        round_robin_resource_provider = itertools.cycle(
            map(lambda resource: {resource: 0.001},
                read_spread_custom_resource_labels))
    else:
        round_robin_resource_provider = itertools.repeat({})

    resource_iter = iter(round_robin_resource_provider)

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []

    for task in read_tasks:
        calls.append(
            lambda task=task, resources=next(resource_iter):
                remote_read.options(
                    **ray_remote_args, resources=resources).remote(task))
        metadata.append(task.get_metadata())

    block_list = LazyBlockList(calls, metadata)

    # Get the schema from the first block synchronously.
    if metadata and metadata[0].schema is None:
        get_schema = cached_remote_fn(_get_schema)
        schema0 = ray.get(get_schema.remote(next(iter(block_list))))
        block_list.set_metadata(
            0,
            BlockMetadata(
                num_rows=metadata[0].num_rows,
                size_bytes=metadata[0].size_bytes,
                schema=schema0,
                input_files=metadata[0].input_files,
            ))

    return Dataset(block_list)
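The spreading mechanism added in Example #2 boils down to a cycling iterator over tiny custom-resource requests. A standalone sketch of just that mechanism (the resource labels are illustrative and would normally correspond to custom resources defined on your nodes):

import itertools
import os

os.environ["RAY_DATASETS_READ_SPREAD_CUSTOM_RESOURCE_LABELS"] = "io_node_a,io_node_b"

labels = os.getenv("RAY_DATASETS_READ_SPREAD_CUSTOM_RESOURCE_LABELS")
if labels is not None:
    # Each task requests a negligible 0.001 of one custom resource, so the
    # scheduler round-robins tasks across the labeled nodes without ever
    # exhausting the resource.
    provider = itertools.cycle([{label: 0.001} for label in labels.split(",")])
else:
    provider = itertools.repeat({})

print([next(provider) for _ in range(4)])
# [{'io_node_a': 0.001}, {'io_node_b': 0.001}, {'io_node_a': 0.001}, {'io_node_b': 0.001}]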
Example #3
def read_datasource(datasource: Datasource[T],
                    *,
                    parallelism: int = 200,
                    ray_remote_args: Optional[Dict[str, Any]] = None,
                    _spread_resource_prefix: Optional[str] = None,
                    **read_args) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.
        _spread_resource_prefix: If given, prefix of node resource labels
            used to round-robin-spread the read tasks across nodes.
        read_args: Additional kwargs to pass to the datasource impl.

    Returns:
        Dataset holding the data read from the datasource.
    """

    read_tasks = datasource.prepare_read(parallelism, **read_args)

    def remote_read(task: ReadTask) -> Block:
        return task()

    if ray_remote_args is None:
        ray_remote_args = {}
    # Increase the read parallelism by default to maximize IO throughput.
    # This is particularly important when reading from, e.g., remote storage.
    if "num_cpus" not in ray_remote_args:
        # Fractional CPUs oversubscribe cores; Ray's "too many workers"
        # warning triggers at 4x subscription, so 0.5 (i.e., 2x) stays
        # safely below it.
        ray_remote_args["num_cpus"] = 0.5
    remote_read = cached_remote_fn(remote_read)

    if _spread_resource_prefix is not None:
        # Use given spread resource prefix for round-robin resource-based
        # scheduling.
        nodes = ray.nodes()
        resource_iter = _get_spread_resources_iter(nodes,
                                                   _spread_resource_prefix,
                                                   ray_remote_args)
    else:
        # If no spread resource prefix given, yield an empty dictionary.
        resource_iter = itertools.repeat({})

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []

    for task in read_tasks:
        calls.append(
            lambda task=task, resources=next(resource_iter):
                remote_read.options(
                    **ray_remote_args, resources=resources).remote(task))
        metadata.append(task.get_metadata())

    block_list = LazyBlockList(calls, metadata)

    # Get the schema from the first block synchronously.
    if metadata and metadata[0].schema is None:
        get_schema = cached_remote_fn(_get_schema)
        schema0 = ray.get(get_schema.remote(next(iter(block_list))))
        block_list.set_metadata(
            0,
            BlockMetadata(
                num_rows=metadata[0].num_rows,
                size_bytes=metadata[0].size_bytes,
                schema=schema0,
                input_files=metadata[0].input_files,
            ))

    return Dataset(block_list)
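`_get_spread_resources_iter` is not shown above. A plausible sketch, assuming it generalizes Example #2's cycling behavior by discovering matching resource labels from `ray.nodes()`; the body below is a guess at the private helper, not the actual Ray implementation:

import itertools
from typing import Any, Dict, Iterator, List

def _get_spread_resources_iter(
        nodes: List[Dict[str, Any]],
        spread_resource_prefix: str,
        ray_remote_args: Dict[str, Any]) -> Iterator[Dict[str, float]]:
    # `ray_remote_args` is accepted for parity with the call site; this
    # sketch does not use it.
    # Discover custom resource labels matching the prefix on live nodes.
    labels = sorted({
        label
        for node in nodes if node.get("Alive")
        for label in node.get("Resources", {})
        if label.startswith(spread_resource_prefix)
    })
    if not labels:
        return itertools.repeat({})
    # As in Example #2: request a negligible 0.001 of each label in turn,
    # so read tasks round-robin across the labeled nodes.
    return itertools.cycle([{label: 0.001} for label in labels])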