Example #1
def read_parquet_metadata_internal(
    path: Union[str, List[str]],
    dtype: Optional[Dict[str, str]],
    sampling: float,
    dataset: bool,
    use_threads: bool,
    boto3_session: Optional[boto3.Session],
) -> Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[
        str, List[str]]]]:
    """Handle wr.s3.read_parquet_metadata internally."""
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if dataset is True:
        if isinstance(path, str):
            _path: Optional[str] = path if path.endswith("/") else f"{path}/"
            paths: List[str] = path2list(path=_path, boto3_session=session)
        else:  # pragma: no cover
            raise exceptions.InvalidArgumentType(
                "Argument <path> must be str if dataset=True.")
    else:
        if isinstance(path, str):
            _path = None
            paths = path2list(path=path, boto3_session=session)
        elif isinstance(path, list):
            _path = None
            paths = path
        else:  # pragma: no cover
            raise exceptions.InvalidArgumentType(
                f"Argument path must be str or List[str] instead of {type(path)}."
            )
    schemas: List[Dict[str, str]] = [
        _read_parquet_metadata_file(path=x,
                                    use_threads=use_threads,
                                    boto3_session=session)
        for x in _utils.list_sampling(lst=paths, sampling=sampling)
    ]
    _logger.debug("schemas: %s", schemas)
    columns_types: Dict[str, str] = {}
    for schema in schemas:
        for column, _dtype in schema.items():
            if (column in columns_types) and (columns_types[column] !=
                                              _dtype):  # pragma: no cover
                raise exceptions.InvalidSchemaConvergence(
                    f"Detected at least 2 different types in column {column} ({columns_types[column]} and {_dtype})."
                )
            columns_types[column] = _dtype
    partitions_types: Optional[Dict[str, str]] = None
    partitions_values: Optional[Dict[str, List[str]]] = None
    if (dataset is True) and (_path is not None):
        partitions_types, partitions_values = _utils.extract_partitions_metadata_from_paths(
            path=_path, paths=paths)
    if dtype:
        for k, v in dtype.items():
            if columns_types and k in columns_types:
                columns_types[k] = v
            if partitions_types and k in partitions_types:
                partitions_types[k] = v
    _logger.debug("columns_types: %s", columns_types)
    return columns_types, partitions_types, partitions_values
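
The helper above powers the public wr.s3.read_parquet_metadata call: it samples a subset of the objects, reads each footer, and refuses to merge schemas whose column types disagree. A minimal usage sketch of that public wrapper, assuming awswrangler is installed; the bucket and prefix names are hypothetical:

import awswrangler as wr

# Inspect only ~25% of the files' footers and also derive partition columns
# from the Hive-style key layout (dataset=True).
columns_types, partitions_types = wr.s3.read_parquet_metadata(
    path="s3://bucket/dataset/",  # hypothetical prefix
    dataset=True,
    sampling=0.25,
)
print(columns_types, partitions_types)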
Example #2
def _read_parquet_init(
    path: Union[str, List[str]],
    filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None,
    categories: Optional[List[str]] = None,
    validate_schema: bool = True,
    dataset: bool = False,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> pyarrow.parquet.ParquetDataset:
    """Encapsulate all initialization before the use of the pyarrow.parquet.ParquetDataset."""
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if dataset is False:
        path_or_paths: Union[str, List[str]] = path2list(path=path,
                                                         boto3_session=session)
    elif isinstance(path, str):
        path_or_paths = path[:-1] if path.endswith("/") else path
    else:
        path_or_paths = path
    _logger.debug("path_or_paths: %s", path_or_paths)
    fs: s3fs.S3FileSystem = _utils.get_fs(
        session=session, s3_additional_kwargs=s3_additional_kwargs)
    cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
    data: pyarrow.parquet.ParquetDataset = pyarrow.parquet.ParquetDataset(
        path_or_paths=path_or_paths,
        filesystem=fs,
        metadata_nthreads=cpus,
        filters=filters,
        read_dictionary=categories,
        validate_schema=validate_schema,
        split_row_groups=False,
        use_legacy_dataset=True,
    )
    return data
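
For reference, the same initialization can be reproduced outside the library with pyarrow and s3fs directly. This is a minimal sketch for pyarrow versions that still accept use_legacy_dataset; the prefix and filter values are hypothetical:

import pyarrow.parquet as pq
import s3fs

fs = s3fs.S3FileSystem()  # credentials come from the default AWS chain
dataset = pq.ParquetDataset(
    path_or_paths="bucket/my-parquet-prefix",   # hypothetical prefix, no trailing slash
    filesystem=fs,
    filters=[("year", "=", "2020")],            # partition-level predicate pushdown
    use_legacy_dataset=True,                    # matches the behaviour relied on above
)
table = dataset.read()                          # pyarrow.Table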
Example #3
def describe_objects(
    path: Union[str, List[str]],
    wait_time: Optional[Union[int, float]] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Dict[str, Any]]:
    """Describe Amazon S3 objects from a received S3 prefix or list of S3 objects paths.

    Fetch attributes like ContentLength, DeleteMarker, LastModified, ContentType, etc.
    The full list of attributes can be explored under the boto3 head_object documentation:
    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    wait_time : Union[int,float], optional
        How much time (in seconds) Wrangler should try to reach these objects.
        Very useful to overcome eventual consistency issues.
        `None` means only a single attempt will be made.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled, os.cpu_count() will be used as the maximum number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, Dict[str, Any]]
        Return a dictionary of objects returned from head_object where the key is the object path.
        The response object can be explored here:
        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object

    Examples
    --------
    >>> import awswrangler as wr
    >>> descs0 = wr.s3.describe_objects(['s3://bucket/key0', 's3://bucket/key1'])  # Describe both objects
    >>> descs1 = wr.s3.describe_objects('s3://bucket/prefix')  # Describe all objects under the prefix
    >>> descs2 = wr.s3.describe_objects('s3://bucket/prefix', wait_time=30)  # Overcoming eventual consistency issues

    """
    paths: List[str] = path2list(path=path, boto3_session=boto3_session)
    if len(paths) < 1:
        return {}
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    resp_list: List[Tuple[str, Dict[str, Any]]]
    if use_threads is False:
        resp_list = [_describe_object(path=p, wait_time=wait_time, client_s3=client_s3) for p in paths]
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            resp_list = list(
                executor.map(_describe_object, paths, itertools.repeat(wait_time), itertools.repeat(client_s3))
            )
    desc_dict: Dict[str, Dict[str, Any]] = dict(resp_list)
    return desc_dict
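
The threaded branch relies on a small but reusable pattern: executor.map iterates over the varying argument (the paths) while itertools.repeat pins the constant ones in positional order. A self-contained sketch of that pattern with a stand-in worker; the names here are illustrative, not library code:

import concurrent.futures
import itertools
from typing import Any, Dict, Tuple

def _head(path: str, wait_time: Any, client: Any) -> Tuple[str, Dict[str, Any]]:
    # Stand-in for _describe_object: one HEAD-style lookup per object path.
    return path, {"wait_time": wait_time, "client": client}

paths = ["s3://bucket/key0", "s3://bucket/key1", "s3://bucket/key2"]
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    resp_list = list(executor.map(_head, paths, itertools.repeat(None), itertools.repeat(None)))
print(dict(resp_list))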
Example #4
def delete_objects(path: Union[str, List[str]],
                   use_threads: bool = True,
                   boto3_session: Optional[boto3.Session] = None) -> None:
    """Delete Amazon S3 objects from a received S3 prefix or list of S3 objects paths.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled, os.cpu_count() will be used as the maximum number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.s3.delete_objects(['s3://bucket/key0', 's3://bucket/key1'])  # Delete both objects
    >>> wr.s3.delete_objects('s3://bucket/prefix')  # Delete all objects under the received prefix

    """
    paths: List[str] = path2list(path=path, boto3_session=boto3_session)
    if len(paths) < 1:
        return
    client_s3: boto3.client = _utils.client(service_name="s3",
                                            session=boto3_session)
    buckets: Dict[str, List[str]] = _split_paths_by_bucket(paths=paths)
    for bucket, keys in buckets.items():
        chunks: List[List[str]] = _utils.chunkify(lst=keys, max_length=1_000)
        if use_threads is False:
            for chunk in chunks:
                _delete_objects(bucket=bucket, keys=chunk, client_s3=client_s3)
        else:
            cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=cpus) as executor:
                list(
                    executor.map(_delete_objects, itertools.repeat(bucket),
                                 chunks, itertools.repeat(client_s3)))
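
The 1_000 cap mirrors the S3 DeleteObjects API limit of 1000 keys per request. A simplified stand-in for the _utils.chunkify call that shows the effect of that cap (the real helper may balance chunk sizes differently):

from typing import List

def chunkify(lst: List[str], max_length: int) -> List[List[str]]:
    # Split the key list into consecutive slices of at most max_length items.
    return [lst[i:i + max_length] for i in range(0, len(lst), max_length)]

keys = [f"key-{i}" for i in range(2_500)]
print([len(chunk) for chunk in chunkify(keys, 1_000)])  # [1000, 1000, 500]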
Example #5
def _read_text(
    parser_func: Callable,
    path: Union[str, List[str]],
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
    chunksize: Optional[int] = None,
    dataset: bool = False,
    **pandas_kwargs,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    if "iterator" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "Please, use chunksize instead of iterator.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if (dataset is True) and (not isinstance(path, str)):  # pragma: no cover
        raise exceptions.InvalidArgument(
            "The path argument must be a string Amazon S3 prefix if dataset=True."
        )
    if dataset is True:
        path_root: str = str(path)
    else:
        path_root = ""
    paths: List[str] = path2list(path=path, boto3_session=session)
    _logger.debug("paths:\n%s", paths)
    if chunksize is not None:
        dfs: Iterator[pd.DataFrame] = _read_text_chunksize(
            parser_func=parser_func,
            paths=paths,
            boto3_session=session,
            chunksize=chunksize,
            pandas_args=pandas_kwargs,
            s3_additional_kwargs=s3_additional_kwargs,
            dataset=dataset,
            path_root=path_root,
        )
        return dfs
    if use_threads is False:
        df: pd.DataFrame = pd.concat(
            objs=[
                _read_text_full(
                    parser_func=parser_func,
                    path=p,
                    boto3_session=session,
                    pandas_args=pandas_kwargs,
                    s3_additional_kwargs=s3_additional_kwargs,
                    dataset=dataset,
                    path_root=path_root,
                ) for p in paths
            ],
            ignore_index=True,
            sort=False,
        )
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=cpus) as executor:
            df = pd.concat(
                objs=executor.map(
                    _read_text_full,
                    itertools.repeat(parser_func),
                    itertools.repeat(path_root),
                    paths,
                    itertools.repeat(
                        _utils.boto3_to_primitives(
                            boto3_session=session)),  # Boto3.Session
                    itertools.repeat(pandas_kwargs),
                    itertools.repeat(s3_additional_kwargs),
                    itertools.repeat(dataset),
                ),
                ignore_index=True,
                sort=False,
            )
    return df
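
In the library this helper is parameterized by parser_func (for example pandas.read_csv or pandas.read_json) and sits behind the public readers, so extra pandas keyword arguments pass straight through. A usage sketch of wr.s3.read_csv, assuming awswrangler is installed and using a hypothetical prefix:

import awswrangler as wr

# One DataFrame for everything under the prefix; sep is forwarded to pandas.read_csv.
df = wr.s3.read_csv(path="s3://bucket/csv-prefix/", sep=";")

# chunksize switches the return value to an iterator of DataFrames.
for chunk in wr.s3.read_csv(path="s3://bucket/csv-prefix/", chunksize=10_000):
    print(len(chunk))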
Example #6
def copy_files_to_redshift(  # pylint: disable=too-many-locals,too-many-arguments
    path: Union[str, List[str]],
    manifest_directory: str,
    con: sqlalchemy.engine.Engine,
    table: str,
    schema: str,
    iam_role: str,
    mode: str = "append",
    diststyle: str = "AUTO",
    distkey: Optional[str] = None,
    sortstyle: str = "COMPOUND",
    sortkey: Optional[List[str]] = None,
    primary_keys: Optional[List[str]] = None,
    varchar_lengths_default: int = 256,
    varchar_lengths: Optional[Dict[str, int]] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Load Parquet files from S3 to a Table on Amazon Redshift (Through COPY command).

    https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html

    Note
    ----
    If the table does not exist yet,
    it will be automatically created for you
    using the Parquet metadata to
    infer the columns data types.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    manifest_directory : str
        S3 prefix (e.g. s3://bucket/prefix)
    con : sqlalchemy.engine.Engine
        SQLAlchemy Engine. Please use
        wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine().
    table : str
        Table name
    schema : str
        Schema name
    iam_role : str
        AWS IAM role with the related permissions.
    mode : str
        Append, overwrite or upsert.
    diststyle : str
        Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"].
        https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
    distkey : str, optional
        Specifies a column name or positional number for the distribution key.
    sortstyle : str
        Sorting can be "COMPOUND" or "INTERLEAVED".
        https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
    sortkey : List[str], optional
        List of columns to be sorted.
    primary_keys : List[str], optional
        Primary keys.
    varchar_lengths_default : int
        The size that will be set for all VARCHAR columns not specified with varchar_lengths.
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by columns. (e.g. {"col1": 10, "col5": 200}).
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled, os.cpu_count() will be used as the maximum number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.db.copy_files_to_redshift(
    ...     path="s3://bucket/my_parquet_files/",
    ...     con=wr.catalog.get_engine(connection="my_glue_conn_name"),
    ...     table="my_table",
    ...     schema="public"
    ...     iam_role="arn:aws:iam::XXX:role/XXX"
    ... )

    """
    _varchar_lengths: Dict[
        str, int] = {} if varchar_lengths is None else varchar_lengths
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    paths: List[str] = path2list(path=path, boto3_session=session)  # pylint: disable=protected-access
    manifest_directory = manifest_directory if manifest_directory.endswith(
        "/") else f"{manifest_directory}/"
    manifest_path: str = f"{manifest_directory}manifest.json"
    write_redshift_copy_manifest(manifest_path=manifest_path,
                                 paths=paths,
                                 use_threads=use_threads,
                                 boto3_session=session)
    s3.wait_objects_exist(paths=paths + [manifest_path],
                          use_threads=False,
                          boto3_session=session)
    athena_types, _ = s3.read_parquet_metadata(path=paths,
                                               dataset=False,
                                               use_threads=use_threads,
                                               boto3_session=session)
    _logger.debug("athena_types: %s", athena_types)
    redshift_types: Dict[str, str] = {}
    for col_name, col_type in athena_types.items():
        length: int = _varchar_lengths[
            col_name] if col_name in _varchar_lengths else varchar_lengths_default
        redshift_types[col_name] = _data_types.athena2redshift(
            dtype=col_type, varchar_length=length)
    with con.begin() as _con:
        created_table, created_schema = _rs_create_table(
            con=_con,
            table=table,
            schema=schema,
            redshift_types=redshift_types,
            mode=mode,
            diststyle=diststyle,
            sortstyle=sortstyle,
            distkey=distkey,
            sortkey=sortkey,
            primary_keys=primary_keys,
        )
        _rs_copy(
            con=_con,
            table=created_table,
            schema=created_schema,
            manifest_path=manifest_path,
            iam_role=iam_role,
            num_files=len(paths),
        )
        if table != created_table:  # upsert
            _rs_upsert(con=_con,
                       schema=schema,
                       table=table,
                       temp_table=created_table,
                       primary_keys=primary_keys)
    s3.delete_objects(path=[manifest_path],
                      use_threads=use_threads,
                      boto3_session=session)
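
For orientation, the manifest file that write_redshift_copy_manifest creates and s3.delete_objects removes at the end follows the standard Redshift COPY manifest layout, roughly as below; the paths and sizes are hypothetical, and the meta/content_length entries are what Redshift expects when copying columnar formats such as Parquet via a manifest:

# Rough shape of the COPY manifest, shown as the equivalent Python dict
# (hand-written illustration, not the library's exact output).
manifest = {
    "entries": [
        {
            "url": "s3://bucket/my_parquet_files/part-0000.parquet",  # hypothetical object
            "mandatory": True,
            "meta": {"content_length": 1024},  # object size in bytes
        },
    ]
}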