Example #1
import concurrent.futures
import itertools
from typing import Dict, List, Optional, Tuple

import boto3

# _utils, _read_parquet_metadata_file and _logger are internal members of the
# awswrangler s3 module and are assumed to be in scope here.


def _read_schemas_from_files(
    paths: List[str],
    sampling: float,
    use_threads: bool,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> Tuple[Dict[str, str], ...]:
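    # Optionally down-sample the file list so only a fraction of the footers is read.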
    paths = _utils.list_sampling(lst=paths, sampling=sampling)
    schemas: Tuple[Dict[str, str], ...] = tuple()
    n_paths: int = len(paths)
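    # Read each file's metadata serially when threading is disabled or there is only one file.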
    if use_threads is False or n_paths == 1:
        schemas = tuple(
            _read_parquet_metadata_file(
                path=p,
                boto3_session=boto3_session,
                s3_additional_kwargs=s3_additional_kwargs,
                use_threads=use_threads) for p in paths)
    elif n_paths > 1:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=cpus) as executor:
            schemas = tuple(
                executor.map(
                    _read_parquet_metadata_file,
                    paths,
                    # boto3.Session objects are not thread-safe, so pass
                    # pickle-able primitives that each worker can rebuild from.
                    itertools.repeat(
                        _utils.boto3_to_primitives(
                            boto3_session=boto3_session)),
                    itertools.repeat(s3_additional_kwargs),
                    itertools.repeat(use_threads),
                ))
    _logger.debug("schemas: %s", schemas)
    return schemas
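
A minimal usage sketch (hypothetical bucket and keys; _read_schemas_from_files
is an internal awswrangler helper, not part of the public API):

schemas = _read_schemas_from_files(
    paths=[
        "s3://my-bucket/data/part-0.parquet",  # hypothetical objects
        "s3://my-bucket/data/part-1.parquet",
    ],
    sampling=1.0,  # inspect every file; e.g. 0.5 would sample half of them
    use_threads=True,
    boto3_session=boto3.Session(),
    s3_additional_kwargs=None,
)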
Example #2
from typing import Dict, List, Optional, Tuple, Union

import boto3

# _utils, exceptions, path2list, _read_parquet_metadata_file and _logger are
# internal members of the awswrangler s3 module and are assumed to be in scope.


def read_parquet_metadata_internal(
    path: Union[str, List[str]],
    dtype: Optional[Dict[str, str]],
    sampling: float,
    dataset: bool,
    use_threads: bool,
    boto3_session: Optional[boto3.Session],
) -> Tuple[Dict[str, str],
           Optional[Dict[str, str]],
           Optional[Dict[str, List[str]]]]:
    """Handle wr.s3.read_parquet_metadata internally."""
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if dataset is True:
        if isinstance(path, str):
            _path: Optional[str] = path if path.endswith("/") else f"{path}/"
            paths: List[str] = path2list(path=_path, boto3_session=session)
        else:  # pragma: no cover
            raise exceptions.InvalidArgumentType(
                "Argument <path> must be str if dataset=True.")
    else:
        if isinstance(path, str):
            _path = None
            paths = path2list(path=path, boto3_session=session)
        elif isinstance(path, list):
            _path = None
            paths = path
        else:  # pragma: no cover
            raise exceptions.InvalidArgumentType(
                f"Argument path must be str or List[str] instead of {type(path)}."
            )
    schemas: List[Dict[str, str]] = [
        _read_parquet_metadata_file(path=x,
                                    use_threads=use_threads,
                                    boto3_session=session)
        for x in _utils.list_sampling(lst=paths, sampling=sampling)
    ]
    _logger.debug("schemas: %s", schemas)
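    # Merge the per-file schemas; conflicting types for the same column are an error.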
    columns_types: Dict[str, str] = {}
    for schema in schemas:
        for column, _dtype in schema.items():
            if (column in columns_types) and (columns_types[column] !=
                                              _dtype):  # pragma: no cover
                raise exceptions.InvalidSchemaConvergence(
                    f"Detected at least 2 different types in column {column} "
                    f"({columns_types[column]} and {_dtype})."
                )
            columns_types[column] = _dtype
    partitions_types: Optional[Dict[str, str]] = None
    partitions_values: Optional[Dict[str, List[str]]] = None
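    # For partitioned datasets, infer partition columns and values from the key paths.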
    if (dataset is True) and (_path is not None):
        partitions_types, partitions_values = _utils.extract_partitions_metadata_from_paths(
            path=_path, paths=paths)
    if dtype:
        for k, v in dtype.items():
            if columns_types and k in columns_types:
                columns_types[k] = v
            if partitions_types and k in partitions_types:
                partitions_types[k] = v
    _logger.debug("columns_types: %s", columns_types)
    return columns_types, partitions_types, partitions_values
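
A hedged usage sketch (hypothetical bucket; read_parquet_metadata_internal is
internal to awswrangler, and dataset=True requires a single prefix string):

columns_types, partitions_types, partitions_values = read_parquet_metadata_internal(
    path="s3://my-bucket/dataset/",
    dtype={"col_a": "string"},  # force col_a to this type in the result
    sampling=1.0,
    dataset=True,  # treat the prefix as a partitioned dataset
    use_threads=True,
    boto3_session=None,  # _utils.ensure_session falls back to a default session
)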
Example #3
import concurrent.futures
import itertools
from typing import Any, Dict, List, Optional, Tuple, Union, cast

import boto3

# _utils, _read_parquet_metadata_file and _logger are internal members of the
# awswrangler s3 module and are assumed to be in scope here.


def _read_schemas_from_files(
    paths: List[str],
    sampling: float,
    use_threads: Union[bool, int],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    version_ids: Optional[Dict[str, str]] = None,
    ignore_null: bool = False,
    pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[Dict[str, str], ...]:
    paths = _utils.list_sampling(lst=paths, sampling=sampling)
    schemas: Tuple[Optional[Dict[str, str]], ...] = tuple()
    n_paths: int = len(paths)
    cpus: int = _utils.ensure_cpu_count(use_threads)
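    # ensure_cpu_count maps True to all available CPUs, False to 1, and an
    # int to that many workers (behavior of the internal helper, not shown here).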
    if cpus == 1 or n_paths == 1:
        schemas = tuple(
            _read_parquet_metadata_file(
                path=p,
                boto3_session=boto3_session,
                s3_additional_kwargs=s3_additional_kwargs,
                use_threads=use_threads,
                version_id=version_ids.get(p) if isinstance(version_ids, dict) else None,
                ignore_null=ignore_null,
                pyarrow_additional_kwargs=pyarrow_additional_kwargs,
            )
            for p in paths
        )
    elif n_paths > 1:
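        # Fan out across a thread pool; itertools.repeat broadcasts the
        # constant arguments to every call made by executor.map.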
        versions = [version_ids.get(p) if isinstance(version_ids, dict) else None for p in paths]
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            schemas = tuple(
                executor.map(
                    _read_parquet_metadata_file,
                    paths,
                    # boto3.Session objects are not thread-safe, so pass
                    # pickle-able primitives that each worker can rebuild from.
                    itertools.repeat(_utils.boto3_to_primitives(boto3_session=boto3_session)),
                    itertools.repeat(s3_additional_kwargs),
                    itertools.repeat(use_threads),
                    versions,
                    itertools.repeat(ignore_null),
                    itertools.repeat(pyarrow_additional_kwargs),
                )
            )
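    # Drop files for which no schema could be read (None entries).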
    schemas = cast(Tuple[Dict[str, str], ...], tuple(x for x in schemas if x is not None))
    _logger.debug("schemas: %s", schemas)
    return schemas
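
A sketch of the extended signature (hypothetical values; use_threads may now
be an int that caps the pool size):

schemas = _read_schemas_from_files(
    paths=["s3://my-bucket/data/part-0.parquet"],  # hypothetical object
    sampling=1.0,
    use_threads=4,  # at most 4 worker threads
    boto3_session=boto3.Session(),
    s3_additional_kwargs=None,
    version_ids=None,  # or {"s3://my-bucket/data/part-0.parquet": "<version-id>"}
    ignore_null=True,  # skip columns with null type
    pyarrow_additional_kwargs=None,
)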