Example No. 1
def _add_partitions(
    database: str,
    table: str,
    boto3_session: Optional[boto3.Session],
    inputs: List[Dict[str, Any]],
    catalog_id: Optional[str] = None,
) -> None:
    chunks: List[List[Dict[str, Any]]] = _utils.chunkify(lst=inputs,
                                                         max_length=100)
    client_glue: boto3.client = _utils.client(service_name="glue",
                                              session=boto3_session)
    for chunk in chunks:
        res: Dict[str, Any] = client_glue.batch_create_partition(
            **_catalog_id(catalog_id=catalog_id,
                          DatabaseName=database,
                          TableName=table,
                          PartitionInputList=chunk))
        for error in res.get("Errors", []):
            # Partitions that already exist are not treated as failures.
            error_code = error.get("ErrorDetail", {}).get("ErrorCode")
            if error_code is not None and error_code != "AlreadyExistsException":
                raise exceptions.ServiceApiError(str(res["Errors"]))
Example No. 2
def add_parquet_partitions(
    database: str,
    table: str,
    partitions_values: Dict[str, List[str]],
    compression: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Add partitions (metadata) to a Parquet Table in the AWS Glue Catalog.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    partitions_values: Dict[str, List[str]]
        Dictionary with keys as S3 path locations and values as lists of partition values as str
        (e.g. {'s3://bucket/prefix/y=2020/m=10/': ['2020', '10']}).
    compression: str, optional
        Compression style (``None``, ``snappy``, ``gzip``, etc).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.add_parquet_partitions(
    ...     database='default',
    ...     table='my_table',
    ...     partitions_values={
    ...         's3://bucket/prefix/y=2020/m=10/': ['2020', '10'],
    ...         's3://bucket/prefix/y=2020/m=11/': ['2020', '11'],
    ...         's3://bucket/prefix/y=2020/m=12/': ['2020', '12']
    ...     }
    ... )

    """
    inputs: List[Dict[str, Any]] = [
        _parquet_partition_definition(location=k,
                                      values=v,
                                      compression=compression)
        for k, v in partitions_values.items()
    ]
    chunks: List[List[Dict[str, Any]]] = _utils.chunkify(lst=inputs,
                                                         max_length=100)
    client_glue: boto3.client = _utils.client(service_name="glue",
                                              session=boto3_session)
    for chunk in chunks:
        res: Dict[str, Any] = client_glue.batch_create_partition(
            DatabaseName=database, TableName=table, PartitionInputList=chunk)
        if ("Errors" in res) and res["Errors"]:  # pragma: no cover
            raise exceptions.ServiceApiError(str(res["Errors"]))
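
The _parquet_partition_definition helper referenced above is not shown here. A minimal sketch, assuming each Glue PartitionInput needs a Parquet StorageDescriptor pointing at its S3 location, might be:

from typing import Any, Dict, List, Optional

def _parquet_partition_definition(location: str, values: List[str],
                                  compression: Optional[str]) -> Dict[str, Any]:
    # Sketch: build the Glue PartitionInput for one Parquet partition.
    # The Hive Parquet input/output formats and SerDe below are the
    # standard choices for Parquet tables in the Glue Catalog.
    return {
        "Values": values,
        "StorageDescriptor": {
            "Location": location,
            "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
            "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
            "Compressed": compression is not None,
            "SerdeInfo": {
                "SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
            },
        },
    }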
Example No. 3
def delete_partitions(
    table: str,
    database: str,
    partitions_values: List[List[str]],
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Delete specified partitions in a AWS Glue Catalog table.

    Parameters
    ----------
    table : str
        Table name.
    database : str
        Database name.
    partitions_values : List[List[str]]
        List of lists of partition values as strings
        (e.g. [['2020', '10', '25'], ['2020', '11', '16'], ['2020', '12', '19']]).
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.delete_partitions(
    ...     table='my_table',
    ...     database='awswrangler_test',
    ...     partitions_values=[['2020', '10', '25'], ['2020', '11', '16'], ['2020', '12', '19']]
    ... )
    """
    client_glue: boto3.client = _utils.client(service_name="glue",
                                              session=boto3_session)
    chunks: List[List[List[str]]] = _utils.chunkify(lst=partitions_values,
                                                    max_length=25)
    for chunk in chunks:
        client_glue.batch_delete_partition(**_catalog_id(
            catalog_id=catalog_id,
            DatabaseName=database,
            TableName=table,
            PartitionsToDelete=[{"Values": v} for v in chunk],
        ))
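
Every example on this page relies on _utils.chunkify to stay under the batch API limits (100 partitions per batch_create_partition call, 25 per batch_delete_partition, 1,000 keys per S3 DeleteObjects). A minimal sketch covering only the max_length usage seen here:

from typing import Any, List

def chunkify(lst: List[Any], max_length: int) -> List[List[Any]]:
    # Sketch: split lst into consecutive chunks of at most max_length items.
    return [lst[i:i + max_length] for i in range(0, len(lst), max_length)]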
Example No. 4
def delete_objects(path: Union[str, List[str]],
                   use_threads: bool = True,
                   boto3_session: Optional[boto3.Session] = None) -> None:
    """Delete Amazon S3 objects from a received S3 prefix or list of S3 objects paths.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned is obtained from os.cpu_count().

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.s3.delete_objects(['s3://bucket/key0', 's3://bucket/key1'])  # Delete both objects
    >>> wr.s3.delete_objects('s3://bucket/prefix')  # Delete all objects under the received prefix

    """
    paths: List[str] = path2list(path=path, boto3_session=boto3_session)
    if len(paths) < 1:
        return
    client_s3: boto3.client = _utils.client(service_name="s3",
                                            session=boto3_session)
    buckets: Dict[str, List[str]] = _split_paths_by_bucket(paths=paths)
    for bucket, keys in buckets.items():
        chunks: List[List[str]] = _utils.chunkify(lst=keys, max_length=1_000)
        if use_threads is False:
            for chunk in chunks:
                _delete_objects(bucket=bucket, keys=chunk, client_s3=client_s3)
        else:
            cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=cpus) as executor:
                list(
                    executor.map(_delete_objects, itertools.repeat(bucket),
                                 chunks, itertools.repeat(client_s3)))
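
The _split_paths_by_bucket helper groups the listed object paths by bucket, since the S3 DeleteObjects API operates on a single bucket per request. A minimal sketch, assuming plain s3://bucket/key paths:

from collections import defaultdict
from typing import Dict, List

def _split_paths_by_bucket(paths: List[str]) -> Dict[str, List[str]]:
    buckets: Dict[str, List[str]] = defaultdict(list)
    for path in paths:
        # Sketch: strip the scheme, then split bucket from key at the first "/".
        bucket, _, key = path[len("s3://"):].partition("/")
        buckets[bucket].append(key)
    return dict(buckets)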
Example No. 5
def write(
    df: pd.DataFrame,
    database: str,
    table: str,
    time_col: str,
    measure_col: str,
    dimensions_cols: List[str],
    num_threads: int = 32,
    boto3_session: Optional[boto3.Session] = None,
) -> List[Dict[str, str]]:
    """Store a Pandas DataFrame into a Amazon Timestream table.

    Parameters
    ----------
    df: pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    database : str
        Amazon Timestream database name.
    table : str
        Amazon Timestream table name.
    time_col : str
        DataFrame column name to be used as time. MUST be a timestamp column.
    measure_col : str
        DataFrame column name to be used as measure.
    dimensions_cols : List[str]
        List of DataFrame column names to be used as dimensions.
    num_threads : int
        Number of threads to be used for concurrent writing.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.

    Returns
    -------
    List[Dict[str, str]]
        Rejected records.

    Examples
    --------
    Store a Pandas DataFrame into an Amazon Timestream table.

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> from datetime import datetime
    >>> df = pd.DataFrame(
    ...     {
    ...         "time": [datetime.now(), datetime.now(), datetime.now()],
    ...         "dim0": ["foo", "boo", "bar"],
    ...         "dim1": [1, 2, 3],
    ...         "measure": [1.0, 1.1, 1.2],
    ...     }
    ... )
    >>> rejected_records = wr.timestream.write(
    ...     df=df,
    ...     database="sampleDB",
    ...     table="sampleTable",
    ...     time_col="time",
    ...     measure_col="measure",
    ...     dimensions_cols=["dim0", "dim1"],
    ... )
    >>> assert len(rejected_records) == 0

    """
    measure_type: str = _data_types.timestream_type_from_pandas(
        df[[measure_col]])
    _logger.debug("measure_type: %s", measure_type)
    cols_names: List[str] = [time_col, measure_col] + dimensions_cols
    _logger.debug("cols_names: %s", cols_names)
    batches: List[List[Any]] = _utils.chunkify(lst=_df2list(df=df[cols_names]),
                                               max_length=100)
    _logger.debug("len(batches): %s", len(batches))
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=num_threads) as executor:
        res: List[List[Any]] = list(
            executor.map(
                _write_batch,
                itertools.repeat(database),
                itertools.repeat(table),
                itertools.repeat(cols_names),
                itertools.repeat(measure_type),
                batches,
                itertools.repeat(
                    _utils.boto3_to_primitives(boto3_session=boto3_session)),
            ))
        return [item for sublist in res for item in sublist]
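
The _df2list helper converts the selected DataFrame columns into plain rows before chunking. A minimal sketch, ignoring any NaN or timestamp normalization the real helper may perform:

from typing import Any, List
import pandas as pd

def _df2list(df: pd.DataFrame) -> List[List[Any]]:
    # Sketch: one list per row, columns in DataFrame order.
    return df.values.tolist()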
Example No. 6
def delete_objects(
    path: Union[str, List[str]],
    use_threads: bool = True,
    last_modified_begin: Optional[datetime.datetime] = None,
    last_modified_end: Optional[datetime.datetime] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Delete Amazon S3 objects from a received S3 prefix or list of S3 objects paths.

    This function accepts Unix shell-style wildcards in the path argument:
    * (matches everything), ? (matches any single character),
    [seq] (matches any character in seq), [!seq] (matches any character not in seq).

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned is obtained from os.cpu_count().

    Note
    ----
    The filter by last_modified_begin and last_modified_end is applied after listing all S3 files.

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (accepts Unix shell-style wildcards)
        (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    last_modified_begin : datetime, optional
        Filter the S3 files by the last modified date of the object.
        The filter is applied only after listing all S3 files.
    last_modified_end : datetime, optional
        Filter the S3 files by the last modified date of the object.
        The filter is applied only after listing all S3 files.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.s3.delete_objects(['s3://bucket/key0', 's3://bucket/key1'])  # Delete both objects
    >>> wr.s3.delete_objects('s3://bucket/prefix')  # Delete all objects under the received prefix

    """
    paths: List[str] = _path2list(
        path=path,
        boto3_session=boto3_session,
        last_modified_begin=last_modified_begin,
        last_modified_end=last_modified_end,
    )
    if len(paths) < 1:
        return
    buckets: Dict[str, List[str]] = _split_paths_by_bucket(paths=paths)
    for bucket, keys in buckets.items():
        chunks: List[List[str]] = _utils.chunkify(lst=keys, max_length=1_000)
        if len(chunks) == 1:
            _delete_objects(bucket=bucket,
                            keys=chunks[0],
                            boto3_session=boto3_session)
        elif use_threads is False:
            for chunk in chunks:
                _delete_objects(bucket=bucket,
                                keys=chunk,
                                boto3_session=boto3_session)
        else:
            cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=cpus) as executor:
                list(
                    executor.map(
                        _delete_objects_concurrent,
                        itertools.repeat(bucket),
                        chunks,
                        itertools.repeat(
                            _utils.boto3_to_primitives(
                                boto3_session=boto3_session)),
                    ))
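
Note that the threaded path passes _utils.boto3_to_primitives(...) instead of the session itself: boto3 sessions are not safely shared across threads, so each worker rebuilds its own. A hypothetical sketch of the worker, assuming a boto3_from_primitives counterpart exists in _utils:

from typing import Any, List

def _delete_objects_concurrent(bucket: str, keys: List[str],
                               boto3_primitives: Any) -> None:
    # Sketch: rebuild a per-thread session from pickleable primitives
    # (boto3_from_primitives is assumed here as the counterpart of
    # boto3_to_primitives), then delegate to the serial helper.
    boto3_session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    _delete_objects(bucket=bucket, keys=keys, boto3_session=boto3_session)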