def _to_parquet_chunked(
    file_path: str,
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    compression: Optional[str],
    pyarrow_additional_kwargs: Optional[Dict[str, Any]],
    table: pa.Table,
    max_rows_by_file: int,
    num_of_rows: int,
    cpus: int,
) -> List[str]:
    chunks: int = math.ceil(num_of_rows / max_rows_by_file)  # number of output files
    use_threads: Union[bool, int] = cpus > 1
    proxy: _WriteProxy = _WriteProxy(use_threads=use_threads)
    for chunk in range(chunks):
        offset: int = chunk * max_rows_by_file  # first row of this slice
        write_path: str = _get_file_path(chunk, file_path)  # per-chunk object key
        proxy.write(
            func=_write_chunk,
            file_path=write_path,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
            compression=compression,
            pyarrow_additional_kwargs=pyarrow_additional_kwargs,
            table=table,
            offset=offset,
            chunk_size=max_rows_by_file,
            use_threads=use_threads,
        )
    return proxy.close()  # blocking
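The helper _get_file_path is not included in this snippet. A minimal sketch of what it might do, assuming the chunk index is inserted into the object key just before the file extension (the exact naming scheme used by the library is an assumption here):

def _get_file_path(chunk: int, file_path: str) -> str:
    # Hypothetical sketch: insert the chunk index before the extension,
    # e.g. "s3://bucket/data.parquet" -> "s3://bucket/data_3.parquet".
    dot = file_path.rfind(".")
    if dot <= file_path.rfind("/"):
        return f"{file_path}_{chunk}"  # no extension: just append the index
    return f"{file_path[:dot]}_{chunk}{file_path[dot:]}"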
Example #2
def _to_buckets(
    func: Callable[..., List[str]],
    df: pd.DataFrame,
    path_root: str,
    bucketing_info: Tuple[List[str], int],
    filename_prefix: str,
    boto3_session: boto3.Session,
    use_threads: Union[bool, int],
    proxy: Optional[_WriteProxy] = None,
    **func_kwargs: Any,
) -> List[str]:
    # Reuse the caller's proxy when bucketing runs inside a partitioning job;
    # otherwise create a synchronous one and close it at the end.
    _proxy: _WriteProxy = proxy if proxy else _WriteProxy(use_threads=False)
    bucket_number_series = df.astype("O").apply(
        lambda row: _get_bucket_number(bucketing_info[1], [row[col_name] for col_name in bucketing_info[0]]),
        axis="columns",
    )
    # Categorical dtype + observed=False makes groupby emit a group for every
    # bucket number, so empty buckets still produce an (empty) file.
    bucket_number_series = bucket_number_series.astype(pd.CategoricalDtype(range(bucketing_info[1])))
    for bucket_number, subgroup in df.groupby(by=bucket_number_series, observed=False):
        _proxy.write(
            func=func,
            df=subgroup,
            path_root=path_root,
            filename_prefix=f"{filename_prefix}_bucket-{bucket_number:05d}",
            boto3_session=boto3_session,
            use_threads=use_threads,
            **func_kwargs,
        )
    if proxy:
        return []

    paths: List[str] = _proxy.close()  # blocking
    return paths
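_get_bucket_number is another internal helper not shown in this snippet. awswrangler's real implementation has to be Hive/Athena-compatible; the simplified, hypothetical stand-in below only illustrates the general shape of the computation, combining the hashes of the bucketing-column values and reducing them modulo the bucket count:

from typing import Any, List

def _get_bucket_number(number_of_buckets: int, values: List[Any]) -> int:
    # Hypothetical, simplified hash: NOT the Hive-compatible hash the library
    # actually uses, so bucket assignments here are illustrative only.
    combined = 0
    for value in values:
        combined = combined * 31 + hash(str(value))
    return combined % number_of_buckets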
Example #3
def _to_partitions(
    func: Callable[..., List[str]],
    concurrent_partitioning: bool,
    df: pd.DataFrame,
    path_root: str,
    use_threads: bool,
    mode: str,
    partition_cols: List[str],
    boto3_session: boto3.Session,
    **func_kwargs: Any,
) -> Tuple[List[str], Dict[str, List[str]]]:
    partitions_values: Dict[str, List[str]] = {}
    proxy: _WriteProxy = _WriteProxy(use_threads=concurrent_partitioning)
    for keys, subgroup in df.groupby(by=partition_cols, observed=True):
        subgroup = subgroup.drop(partition_cols, axis="columns")
        keys = (keys,) if not isinstance(keys, tuple) else keys
        # Hive-style partition prefix, e.g. "<path_root>col1=a/col2=b/"
        subdir = "/".join([f"{name}={val}" for name, val in zip(partition_cols, keys)])
        prefix: str = f"{path_root}{subdir}/"
        if mode == "overwrite_partitions":
            # Remove whatever is already stored under this partition before rewriting it
            delete_objects(path=prefix, use_threads=use_threads, boto3_session=boto3_session)
        proxy.write(
            func=func,
            df=subgroup,
            path_root=prefix,
            boto3_session=boto3_session,
            **func_kwargs,
        )
        partitions_values[prefix] = [str(k) for k in keys]
    paths: List[str] = proxy.close()  # blocking
    return paths, partitions_values
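A purely illustrative, self-contained run of the partitioning loop above, showing how the Hive-style prefixes and the partitions_values mapping are derived (the bucket name and data are placeholders):

import pandas as pd

df = pd.DataFrame({"year": [2023, 2023, 2024], "month": [1, 2, 1], "value": [10, 20, 30]})
partition_cols = ["year", "month"]
path_root = "s3://bucket/table/"  # placeholder

partitions_values = {}
for keys, subgroup in df.groupby(by=partition_cols, observed=True):
    keys = (keys,) if not isinstance(keys, tuple) else keys
    subdir = "/".join(f"{name}={val}" for name, val in zip(partition_cols, keys))
    partitions_values[f"{path_root}{subdir}/"] = [str(k) for k in keys]
# partitions_values == {
#     "s3://bucket/table/year=2023/month=1/": ["2023", "1"],
#     "s3://bucket/table/year=2023/month=2/": ["2023", "2"],
#     "s3://bucket/table/year=2024/month=1/": ["2024", "1"],
# }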
Example #4
def _to_partitions(
    func: Callable[..., List[str]],
    concurrent_partitioning: bool,
    df: pd.DataFrame,
    path_root: str,
    use_threads: Union[bool, int],
    mode: str,
    partition_cols: List[str],
    partitions_types: Optional[Dict[str, str]],
    catalog_id: Optional[str],
    database: Optional[str],
    table: Optional[str],
    table_type: Optional[str],
    transaction_id: Optional[str],
    bucketing_info: Optional[Tuple[List[str], int]],
    filename_prefix: str,
    boto3_session: boto3.Session,
    **func_kwargs: Any,
) -> Tuple[List[str], Dict[str, List[str]]]:
    partitions_values: Dict[str, List[str]] = {}
    proxy: _WriteProxy = _WriteProxy(use_threads=concurrent_partitioning)

    for keys, subgroup in df.groupby(by=partition_cols, observed=True):
        subgroup = subgroup.drop(partition_cols, axis="columns")
        keys = (keys,) if not isinstance(keys, tuple) else keys
        # Hive-style partition prefix, e.g. "<path_root>col1=a/col2=b/"
        subdir = "/".join([f"{name}={val}" for name, val in zip(partition_cols, keys)])
        prefix: str = f"{path_root}{subdir}/"
        if mode == "overwrite_partitions":
            if (table_type == "GOVERNED") and (table is not None) and (database is not None):
                # Governed (Lake Formation) tables: look up the partition's objects and
                # mark them as deleted within the transaction instead of deleting from S3 directly.
                del_objects: List[Dict[str, Any]] = lakeformation._get_table_objects(  # pylint: disable=protected-access
                    catalog_id=catalog_id,
                    database=database,
                    table=table,
                    transaction_id=transaction_id,  # type: ignore
                    partition_cols=partition_cols,
                    partitions_values=keys,
                    partitions_types=partitions_types,
                    boto3_session=boto3_session,
                )
                if del_objects:
                    lakeformation._update_table_objects(  # pylint: disable=protected-access
                        catalog_id=catalog_id,
                        database=database,
                        table=table,
                        transaction_id=transaction_id,  # type: ignore
                        del_objects=del_objects,
                        boto3_session=boto3_session,
                    )
            else:
                delete_objects(
                    path=prefix,
                    use_threads=use_threads,
                    boto3_session=boto3_session,
                    s3_additional_kwargs=func_kwargs.get("s3_additional_kwargs"),
                )
        if bucketing_info:
            _to_buckets(
                func=func,
                df=subgroup,
                path_root=prefix,
                bucketing_info=bucketing_info,
                boto3_session=boto3_session,
                use_threads=use_threads,
                proxy=proxy,
                filename_prefix=filename_prefix,
                **func_kwargs,
            )
        else:
            proxy.write(
                func=func,
                df=subgroup,
                path_root=prefix,
                filename_prefix=filename_prefix,
                boto3_session=boto3_session,
                use_threads=use_threads,
                **func_kwargs,
            )
        partitions_values[prefix] = [str(k) for k in keys]
    paths: List[str] = proxy.close()  # blocking
    return paths, partitions_values
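All four examples funnel their writes through _WriteProxy, whose implementation is not included here. A minimal, hypothetical sketch of the contract these functions rely on: write() either runs the function immediately or submits it to a pool, and close() blocks until every submitted write finishes and returns the collected S3 paths. This is not awswrangler's code, which differs in detail (e.g. in how boto3 sessions are handled across processes):

import concurrent.futures
from typing import Any, Callable, List, Optional, Union


class _WriteProxy:
    # Hypothetical sketch of the interface implied above, not the library's implementation.
    def __init__(self, use_threads: Union[bool, int]):
        self._exec: Optional[concurrent.futures.ProcessPoolExecutor] = (
            concurrent.futures.ProcessPoolExecutor() if use_threads else None
        )
        self._futures: List["concurrent.futures.Future[List[str]]"] = []
        self._paths: List[str] = []

    def write(self, func: Callable[..., List[str]], **kwargs: Any) -> None:
        if self._exec is None:
            self._paths += func(**kwargs)  # synchronous path
        else:
            self._futures.append(self._exec.submit(func, **kwargs))

    def close(self) -> List[str]:
        for future in self._futures:  # blocking: wait for every pending write
            self._paths += future.result()
        if self._exec is not None:
            self._exec.shutdown()
        return self._paths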