Example #1
def _read_parquet_metadata_file(
    path: str,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: Union[bool, int],
    version_id: Optional[str] = None,
    ignore_null: bool = False,
    pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> Optional[Dict[str, str]]:
    pyarrow_args = _set_default_pyarrow_additional_kwargs(pyarrow_additional_kwargs)
    with open_s3_object(
        path=path,
        mode="rb",
        version_id=version_id,
        use_threads=use_threads,
        s3_block_size=131_072,  # 128 KB (128 * 2**10)
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
        pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
            source=f, coerce_int96_timestamp_unit=pyarrow_args["coerce_int96_timestamp_unit"]
        )
        if pq_file is None:
            return None
        return _data_types.athena_types_from_pyarrow_schema(
            schema=pq_file.schema.to_arrow_schema(), partitions=None, ignore_null=ignore_null
        )[0]
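
A hedged usage sketch of Example #1, assuming a hypothetical S3 object path and default options; the function returns a mapping of column names to Athena types, or None when the Parquet footer cannot be read:

import boto3

athena_types = _read_parquet_metadata_file(
    path="s3://my-bucket/data/part-0000.snappy.parquet",  # hypothetical object
    boto3_session=boto3.Session(),
    s3_additional_kwargs=None,
    use_threads=True,
)
print(athena_types)  # e.g. {"id": "bigint", "name": "string"} for a file with those columns
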
Example #2
def _read_parquet_metadata_file(
        path: str, use_threads: bool,
        boto3_session: boto3.Session) -> Dict[str, str]:
    data: pyarrow.parquet.ParquetDataset = _read_parquet_init(
        path=path,
        filters=None,
        dataset=False,
        use_threads=use_threads,
        boto3_session=boto3_session)
    return _data_types.athena_types_from_pyarrow_schema(
        schema=data.schema.to_arrow_schema(), partitions=None)[0]
Example #3
def _read_parquet_metadata_file(
    path: str, boto3_session: boto3.Session, s3_additional_kwargs: Optional[Dict[str, str]], use_threads: bool
) -> Dict[str, str]:
    with open_s3_object(
        path=path,
        mode="rb",
        use_threads=use_threads,
        s3_block_size=131_072,  # 128 KB (128 * 2**10)
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f)
        return _data_types.athena_types_from_pyarrow_schema(
            schema=pq_file.schema.to_arrow_schema(), partitions=None)[0]
Example #4
def _read_parquet_metadata_file(
    path: str,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> Dict[str, str]:
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=4_194_304,  # 4 MB (4 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs
    )
    with _utils.open_file(fs=fs, path=path, mode="rb") as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(
            source=f)
        return _data_types.athena_types_from_pyarrow_schema(
            schema=pq_file.schema.to_arrow_schema(), partitions=None)[0]
Example #5
def _read_parquet_chunked(
    paths: List[str],
    chunked: Union[bool, int],
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    validate_schema: bool,
    safe: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> Iterator[pd.DataFrame]:
    next_slice: Optional[pd.DataFrame] = None
    last_schema: Optional[Dict[str, str]] = None
    last_path: str = ""
    for path in paths:
        with open_s3_object(
                path=path,
                mode="rb",
                use_threads=use_threads,
                s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
                s3_additional_kwargs=s3_additional_kwargs,
                boto3_session=boto3_session,
        ) as f:
            pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(
                source=f, read_dictionary=categories)
            schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
                schema=pq_file.schema.to_arrow_schema(), partitions=None)[0]
            if validate_schema is True and last_schema is not None:
                if schema != last_schema:
                    raise exceptions.InvalidSchemaConvergence(
                        "Detected at least 2 different schemas:\n"
                        f"    - {last_path} -> {last_schema}\n"
                        f"    - {path} -> {schema}")
            last_schema = schema
            last_path = path
            num_row_groups: int = pq_file.num_row_groups
            _logger.debug("num_row_groups: %s", num_row_groups)
            for i in range(num_row_groups):
                _logger.debug("Reading Row Group %s...", i)
                df: pd.DataFrame = _arrowtable2df(
                    table=pq_file.read_row_group(i=i,
                                                 columns=columns,
                                                 use_threads=use_threads,
                                                 use_pandas_metadata=False),
                    categories=categories,
                    safe=safe,
                    use_threads=use_threads,
                    dataset=dataset,
                    path=path,
                    path_root=path_root,
                )
                if chunked is True:
                    yield df
                elif isinstance(chunked, int) and chunked > 0:
                    if next_slice is not None:
                        df = _union(dfs=[next_slice, df], ignore_index=None)
                    while len(df.index) >= chunked:
                        yield df.iloc[:chunked]
                        df = df.iloc[chunked:]
                    if df.empty:
                        next_slice = None
                    else:
                        next_slice = df
                else:
                    raise exceptions.InvalidArgument(f"chunked: {chunked}")
Example #6
def _read_parquet_chunked(  # pylint: disable=too-many-branches
    paths: List[str],
    chunked: Union[bool, int],
    validate_schema: bool,
    ignore_index: Optional[bool],
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    safe: bool,
    map_types: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: Union[bool, int],
) -> Iterator[pd.DataFrame]:
    next_slice: Optional[pd.DataFrame] = None
    last_schema: Optional[Dict[str, str]] = None
    last_path: str = ""
    for path in paths:
        with open_s3_object(
                path=path,
                mode="rb",
                use_threads=use_threads,
                s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
                s3_additional_kwargs=s3_additional_kwargs,
                boto3_session=boto3_session,
        ) as f:
            pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
                source=f, read_dictionary=categories)
            if pq_file is None:
                continue
            if validate_schema is True:
                schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
                    schema=pq_file.schema.to_arrow_schema(), partitions=None)[0]
                if last_schema is not None:
                    if schema != last_schema:
                        raise exceptions.InvalidSchemaConvergence(
                            "Detected at least 2 different schemas:\n"
                            f"    - {last_path} -> {last_schema}\n"
                            f"    - {path} -> {schema}")
                last_schema = schema
                last_path = path
            num_row_groups: int = pq_file.num_row_groups
            _logger.debug("num_row_groups: %s", num_row_groups)
            use_threads_flag: bool = (
                use_threads if isinstance(use_threads, bool) else bool(use_threads > 1)
            )
            # iter_batches is only available for pyarrow >= 3.0.0
            if callable(getattr(pq_file, "iter_batches", None)):
                chunk_generator = _pyarrow_chunk_generator(
                    pq_file=pq_file,
                    chunked=chunked,
                    columns=columns,
                    use_threads_flag=use_threads_flag)
            else:
                chunk_generator = _row_group_chunk_generator(
                    pq_file=pq_file,
                    columns=columns,
                    use_threads_flag=use_threads_flag,
                    num_row_groups=num_row_groups)

            for chunk in chunk_generator:
                df: pd.DataFrame = _arrowtable2df(
                    table=chunk,
                    categories=categories,
                    safe=safe,
                    map_types=map_types,
                    use_threads=use_threads,
                    dataset=dataset,
                    path=path,
                    path_root=path_root,
                )
                if chunked is True:
                    yield df
                elif isinstance(chunked, int) and chunked > 0:
                    if next_slice is not None:
                        df = _union(dfs=[next_slice, df],
                                    ignore_index=ignore_index)
                    while len(df.index) >= chunked:
                        yield df.iloc[:chunked, :].copy()
                        df = df.iloc[chunked:, :]
                    if df.empty:
                        next_slice = None
                    else:
                        next_slice = df
                else:
                    raise exceptions.InvalidArgument(f"chunked: {chunked}")
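
Example #6 picks its chunk source at runtime: pyarrow's ParquetFile.iter_batches (available from pyarrow 3.0.0) when present, otherwise whole row groups. The helpers _pyarrow_chunk_generator and _row_group_chunk_generator are not shown above; a minimal sketch of the same selection, with an assumed default batch size, could look like this:

import pyarrow
import pyarrow.parquet


def _iter_chunks(pq_file: pyarrow.parquet.ParquetFile, chunked, columns, use_threads_flag: bool):
    # Sketch only: the real generator helpers are not part of the examples above.
    if callable(getattr(pq_file, "iter_batches", None)):  # pyarrow >= 3.0.0
        # Assumed: use `chunked` as the batch size when it is an int, else a 64 Ki-row default.
        batch_size = chunked if isinstance(chunked, int) and chunked > 0 else 65_536
        for batch in pq_file.iter_batches(batch_size=batch_size, columns=columns,
                                          use_threads=use_threads_flag):
            yield pyarrow.Table.from_batches([batch])
    else:
        for i in range(pq_file.num_row_groups):
            yield pq_file.read_row_group(i=i, columns=columns,
                                         use_threads=use_threads_flag,
                                         use_pandas_metadata=False)
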