Example #1
def delete_database(
    database: str,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Delete a given Timestream database. This is an irreversible operation.

    After a database is deleted, the time series data from its tables cannot be recovered.

    All tables in the database must be deleted first, or a ValidationException error will be thrown.

    Due to the nature of distributed retries,
    the operation can return either success or a ResourceNotFoundException.
    Clients should consider them equivalent.

    Parameters
    ----------
    database: str
        Database name.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    Deleting a database

    >>> import awswrangler as wr
    >>> wr.timestream.delete_database("MyDatabase")

    """
    client: boto3.client = _utils.client(service_name="timestream-write",
                                         session=boto3_session)
    client.delete_database(DatabaseName=database)
Example #2
def get_table_types(
        database: str,
        table: str,
        boto3_session: Optional[boto3.Session] = None) -> Dict[str, str]:
    """Get all columns and types from a table.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, str]
        A dictionary as {'col name': 'col data type'}.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.get_table_types(database='default', table='my_table')
    {'col0': 'int', 'col1': 'double'}

    """
    client_glue: boto3.client = _utils.client(service_name="glue",
                                              session=boto3_session)
    response: Dict[str, Any] = client_glue.get_table(DatabaseName=database,
                                                     Name=table)
    dtypes: Dict[str, str] = {}
    for col in response["Table"]["StorageDescriptor"]["Columns"]:
        dtypes[col["Name"]] = col["Type"]
    for par in response["Table"]["PartitionKeys"]:
        dtypes[par["Name"]] = par["Type"]
    return dtypes
Example #3
def _get_table_input(
    database: str,
    table: str,
    boto3_session: Optional[boto3.Session],
    transaction_id: Optional[str] = None,
    catalog_id: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    client_glue: boto3.client = _utils.client(service_name="glue",
                                              session=boto3_session)
    args: Dict[str, Any] = _catalog_id(catalog_id=catalog_id,
                                       **_transaction_id(
                                           transaction_id=transaction_id,
                                           DatabaseName=database,
                                           Name=table))
    try:
        response: Dict[str, Any] = client_glue.get_table(**args)
    except client_glue.exceptions.EntityNotFoundException:
        return None
    table_input: Dict[str, Any] = {}
    for k, v in response["Table"].items():
        if k in [
                "Name",
                "Description",
                "Owner",
                "LastAccessTime",
                "LastAnalyzedTime",
                "Retention",
                "StorageDescriptor",
                "PartitionKeys",
                "ViewOriginalText",
                "ViewExpandedText",
                "TableType",
                "Parameters",
                "TargetTable",
        ]:
            table_input[k] = v
    return table_input
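
A hedged sketch (not part of the module) of how the table input returned by _get_table_input could be reused, e.g. to clone a table definition into another Glue database. The helper name is hypothetical; only the Glue create_table call and the functions shown above are assumed.

def _copy_table_input_sketch(
    source_database: str,
    target_database: str,
    table: str,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    # Hypothetical helper: fetch the filtered table input and recreate it elsewhere.
    table_input = _get_table_input(database=source_database, table=table, boto3_session=boto3_session)
    if table_input is None:
        raise ValueError(f"Table {table} not found in database {source_database}.")
    client_glue = _utils.client(service_name="glue", session=boto3_session)
    client_glue.create_table(DatabaseName=target_database, TableInput=table_input)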
Example #4
def get_table_versions(
    database: str, table: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> List[Dict[str, Any]]:
    """Get all versions.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    List[Dict[str, Any]]
        List of table inputs:
        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html#Glue.Client.get_table_versions

    Examples
    --------
    >>> import awswrangler as wr
    >>> tables_versions = wr.catalog.get_table_versions(database="...", table="...")

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    paginator = client_glue.get_paginator("get_table_versions")
    versions: List[Dict[str, Any]] = []
    response_iterator = paginator.paginate(**_catalog_id(DatabaseName=database, TableName=table, catalog_id=catalog_id))
    for page in response_iterator:
        for tbl in page["TableVersions"]:
            versions.append(tbl)
    return versions
Example #5
def get_query_columns_types(
        query_execution_id: str,
        boto3_session: Optional[boto3.Session] = None) -> Dict[str, str]:
    """Get the data type of all columns queried.

    https://docs.aws.amazon.com/athena/latest/ug/data-types.html

    Parameters
    ----------
    query_execution_id : str
        Athena query execution ID.
    boto3_session : boto3.Session(), optional
        Boto3 Session. If none, the default boto3 session is used.

    Returns
    -------
    Dict[str, str]
        Dictionary with all data types.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.athena.get_query_columns_types('query-execution-id')
    {'col0': 'int', 'col1': 'double'}

    """
    client_athena: boto3.client = _utils.client(
        service_name="athena",
        session=_utils.ensure_session(session=boto3_session))
    response: Dict[str, Any] = client_athena.get_query_results(
        QueryExecutionId=query_execution_id, MaxResults=1)
    col_info: List[Dict[
        str, str]] = response["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]
    return dict(
        (c["Name"], f"{c['Type']}({c['Precision']},{c.get('Scale', 0)})"
         ) if c["Type"] in ["decimal"] else (c["Name"], c["Type"])
        for c in col_info)
Example #6
def delete_table_if_exists(
    database: str, table: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> bool:
    """Delete Glue table if exists.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    bool
        True if deleted, otherwise False.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.delete_table_if_exists(database='default', table='my_table')  # deleted
    True
    >>> wr.catalog.delete_table_if_exists(database='default', table='my_table')  # Nothing to be deleted
    False

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    try:
        client_glue.delete_table(**_catalog_id(DatabaseName=database, Name=table, catalog_id=catalog_id))
        return True
    except client_glue.exceptions.EntityNotFoundException:
        return False
Example #7
def get_connection(
        name: str,
        catalog_id: Optional[str] = None,
        boto3_session: Optional[boto3.Session] = None) -> Dict[str, Any]:
    """Get Glue connection details.

    Parameters
    ----------
    name : str
        Connection name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, Any]
        API Response for:
        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html#Glue.Client.get_connection

    Examples
    --------
    >>> import awswrangler as wr
    >>> res = wr.catalog.get_connection(name='my_connection')

    """
    client_glue: boto3.client = _utils.client(service_name="glue",
                                              session=boto3_session)
    if catalog_id is None:
        return client_glue.get_connection(Name=name,
                                          HidePassword=False)["Connection"]
    return client_glue.get_connection(CatalogId=catalog_id,
                                      Name=name,
                                      HidePassword=False)["Connection"]
Example #8
def delete_table_if_exists(
        database: str,
        table: str,
        boto3_session: Optional[boto3.Session] = None) -> bool:
    """Delete Glue table if exists.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    bool
        True if deleted, otherwise False.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.delete_table_if_exists(database='default', table='my_table')  # deleted
    True
    >>> wr.catalog.delete_table_if_exists(database='default', table='my_table')  # Nothing to be deleted
    False

    """
    client_glue: boto3.client = _utils.client(service_name="glue",
                                              session=boto3_session)
    try:
        client_glue.delete_table(DatabaseName=database, Name=table)
        return True
    except client_glue.exceptions.EntityNotFoundException:
        return False
Example #9
def get_table_description(
        database: str,
        table: str,
        catalog_id: Optional[str] = None,
        boto3_session: Optional[boto3.Session] = None) -> Optional[str]:
    """Get table description.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Optional[str]
        Description if exists.

    Examples
    --------
    >>> import awswrangler as wr
    >>> desc = wr.catalog.get_table_description(database="...", table="...")

    """
    client_glue: boto3.client = _utils.client(service_name="glue",
                                              session=boto3_session)
    response: Dict[str, Any] = client_glue.get_table(**_catalog_id(
        catalog_id=catalog_id, DatabaseName=database, Name=table))
    desc: Optional[str] = response["Table"].get("Description", None)
    return desc
Example #10
def search_tables(text: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None):
    """Get Pandas DataFrame of tables filtered by a search string.

    Parameters
    ----------
    text : str
        Select only tables with the given string in table's properties.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Iterator[Dict[str, Any]]
        Iterator of tables.

    Examples
    --------
    >>> import awswrangler as wr
    >>> tables = wr.catalog.search_tables(text='my_property')

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    args: Dict[str, Any] = {"SearchText": text}
    if catalog_id is not None:
        args["CatalogId"] = catalog_id
    response: Dict[str, Any] = client_glue.search_tables(**args)
    for tbl in response["TableList"]:
        yield tbl
    while "NextToken" in response:  # pragma: no cover
        args["NextToken"] = response["NextToken"]
        response = client_glue.search_tables(**args)
        for tbl in response["TableList"]:
            yield tbl
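
Because search_tables yields plain Glue table dicts, a tabular view can be built from the iterator if needed (usage sketch; the pandas import is an assumption, not part of the function above):

>>> import pandas as pd
>>> import awswrangler as wr
>>> tables = list(wr.catalog.search_tables(text='my_property'))  # drain the generator
>>> df_tables = pd.DataFrame(tables)  # one row per Glue table dict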
Example #11
def get_account_id(boto3_session: Optional[boto3.Session] = None) -> str:
    """Get Account ID.

    Parameters
    ----------
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Account ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> account_id = wr.sts.get_account_id()

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    return cast(
        str,
        _utils.client(service_name="sts",
                      session=session).get_caller_identity().get("Account"))
Example #12
def terminate_cluster(cluster_id: str, boto3_session: Optional[boto3.Session] = None) -> None:
    """Terminate EMR cluster.

    Parameters
    ----------
    cluster_id : str
        Cluster ID.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.emr.terminate_cluster("cluster-id")

    """
    client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session)
    response: Dict[str, Any] = client_emr.terminate_job_flows(JobFlowIds=[cluster_id])
    _logger.debug("response: \n%s", pprint.pformat(response))
Example #13
def _list(
    func_name: str,
    attr_name: str,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
    **kwargs,
) -> List[Dict[str, Any]]:
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(
            boto3_session=session)  # pragma: no cover
    client: boto3.client = _utils.client(service_name="quicksight",
                                         session=session)
    func: Callable = getattr(client, func_name)
    response = func(AwsAccountId=account_id, **kwargs)
    next_token: str = response.get("NextToken", None)
    result: List[Dict[str, Any]] = response[attr_name]
    while next_token is not None:  # pragma: no cover
        response = func(AwsAccountId=account_id,
                        NextToken=next_token,
                        **kwargs)
        next_token = response.get("NextToken", None)
        result += response[attr_name]
    return result
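
For context, a public listing function would typically be a thin wrapper over _list. The sketch below is illustrative only; the wrapper name is hypothetical, while "list_dashboards"/"DashboardSummaryList" follow the boto3 QuickSight API:

def list_dashboards_sketch(
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> List[Dict[str, Any]]:
    # Hypothetical wrapper: paginate the QuickSight list_dashboards operation via _list.
    return _list(
        func_name="list_dashboards",
        attr_name="DashboardSummaryList",
        account_id=account_id,
        boto3_session=boto3_session,
    )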
Example #14
def wait_query(query_execution_id: str, boto3_session: Optional[boto3.Session] = None) -> Dict[str, Any]:
    """Wait for the query end.

    Parameters
    ----------
    query_execution_id : str
        Athena query execution ID.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, Any]
        Dictionary with the get_query_execution response.

    Examples
    --------
    >>> import awswrangler as wr
    >>> res = wr.athena.wait_query(query_execution_id='query-execution-id')

    """
    final_states: List[str] = ["FAILED", "SUCCEEDED", "CANCELLED"]
    client_athena: boto3.client = _utils.client(service_name="athena", session=boto3_session)
    response: Dict[str, Any] = client_athena.get_query_execution(QueryExecutionId=query_execution_id)
    state: str = response["QueryExecution"]["Status"]["State"]
    while state not in final_states:
        time.sleep(_QUERY_WAIT_POLLING_DELAY)
        response = client_athena.get_query_execution(QueryExecutionId=query_execution_id)
        state = response["QueryExecution"]["Status"]["State"]
    _logger.debug("state: %s", state)
    _logger.debug("StateChangeReason: %s", response["QueryExecution"]["Status"].get("StateChangeReason"))
    if state == "FAILED":
        raise exceptions.QueryFailed(response["QueryExecution"]["Status"].get("StateChangeReason"))
    if state == "CANCELLED":
        raise exceptions.QueryCancelled(response["QueryExecution"]["Status"].get("StateChangeReason"))
    return response
Example #15
def read_sql_query(
    sql: str,
    database: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    categories: Optional[List[str]] = None,
    safe: bool = True,
    map_types: bool = True,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    params: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    """Execute PartiQL query on AWS Glue Table (Transaction ID or time travel timestamp). Return Pandas DataFrame.

    Note
    ----
    ORDER BY operations are not honoured.
    i.e. sql="SELECT * FROM my_table ORDER BY my_column" is NOT valid

    Note
    ----
    The database must NOT be explicitly defined in the PartiQL statement.
    i.e. sql="SELECT * FROM my_table" is valid
    but sql="SELECT * FROM my_db.my_table" is NOT valid

    Note
    ----
    Pass one of `transaction_id` or `query_as_of_time`, not both.

    Parameters
    ----------
    sql : str
        PartiQL query.
    database : str
        AWS Glue database name
    transaction_id : str, optional
        The ID of the transaction at which to read the table contents.
        Cannot be specified alongside query_as_of_time
    query_as_of_time : str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    categories: Optional[List[str]], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    safe : bool, default True
        For certain data types, a cast is needed in order to store the
        data in a pandas DataFrame or Series (e.g. timestamps are always
        stored as nanoseconds in pandas). This option controls whether it
        is a safe cast or not.
    map_types : bool, default True
        True to convert pyarrow DataTypes to pandas ExtensionDtypes. It is
        used to override the default pandas type for conversion of built-in
        pyarrow types or in absence of pandas_metadata in the Table schema.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        When enabled, os.cpu_count() is used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session is used if boto3_session receives None.
    params: Dict[str, Any], optional
        Dict of parameters used to format the partiQL query. Only named parameters are supported.
        The dict must contain the information in the form {"name": "value"} and the SQL query must contain
        `:name`.

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.lakeformation.read_sql_query(
    ...     sql="SELECT * FROM my_table;",
    ...     database="my_db",
    ...     catalog_id="111111111111"
    ... )

    >>> import awswrangler as wr
    >>> df = wr.lakeformation.read_sql_query(
    ...     sql="SELECT * FROM my_table LIMIT 10;",
    ...     database="my_db",
    ...     transaction_id="1b62811fa3e02c4e5fdbaa642b752030379c4a8a70da1f8732ce6ccca47afdc9"
    ... )

    >>> import awswrangler as wr
    >>> df = wr.lakeformation.read_sql_query(
    ...     sql="SELECT * FROM my_table WHERE name=:name; AND city=:city;",
    ...     database="my_db",
    ...     query_as_of_time="1611142914",
    ...     params={"name": "'filtered_name'", "city": "'filtered_city'"}
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_lakeformation: boto3.client = _utils.client(
        service_name="lakeformation", session=session)
    commit_trans: bool = False
    if params is None:
        params = {}
    for key, value in params.items():
        sql = sql.replace(f":{key};", str(value))

    if not any([transaction_id, query_as_of_time]):
        _logger.debug(
            "Neither `transaction_id` nor `query_as_of_time` were specified, starting transaction"
        )
        transaction_id = start_transaction(read_only=True,
                                           boto3_session=session)
        commit_trans = True
    args: Dict[str, Optional[str]] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(transaction_id=transaction_id,
                          query_as_of_time=query_as_of_time,
                          DatabaseName=database),
    )
    query_id: str = client_lakeformation.start_query_planning(
        QueryString=sql, QueryPlanningContext=args)["QueryId"]
    df = _resolve_sql_query(
        query_id=query_id,
        categories=categories,
        safe=safe,
        map_types=map_types,
        use_threads=use_threads,
        boto3_session=session,
    )
    if commit_trans:
        commit_transaction(transaction_id=transaction_id)  # type: ignore
    return df
Example #16
def start_query(
    query: str,
    log_group_names: List[str],
    start_time: datetime.datetime = datetime.datetime(
        year=1970, month=1, day=1, tzinfo=datetime.timezone.utc),
    end_time: datetime.datetime = datetime.datetime.now(),
    limit: Optional[int] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Run a query against AWS CloudWatchLogs Insights.

    https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/CWL_QuerySyntax.html

    Parameters
    ----------
    query : str
        The query string.
    log_group_names : List[str]
        The list of log groups to be queried. You can include up to 20 log groups.
    start_time : datetime.datetime
        The beginning of the time range to query.
    end_time : datetime.datetime
        The end of the time range to query.
    limit : Optional[int]
        The maximum number of log events to return in the query.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Query ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> query_id = wr.cloudwatch.start_query(
    ...     log_group_names=["loggroup"],
    ...     query="fields @timestamp, @message | sort @timestamp desc | limit 5",
    ... )

    """
    _logger.debug("log_group_names: %s", log_group_names)
    start_timestamp: int = int(1000 * start_time.timestamp())
    end_timestamp: int = int(1000 * end_time.timestamp())
    _logger.debug("start_timestamp: %s", start_timestamp)
    _logger.debug("end_timestamp: %s", end_timestamp)
    _validate_args(start_timestamp=start_timestamp,
                   end_timestamp=end_timestamp)
    args: Dict[str, Any] = {
        "logGroupNames": log_group_names,
        "startTime": start_timestamp,
        "endTime": end_timestamp,
        "queryString": query,
    }
    if limit is not None:
        args["limit"] = limit
    client_logs: boto3.client = _utils.client(service_name="logs",
                                              session=boto3_session)
    response: Dict[str, Any] = client_logs.start_query(**args)
    return cast(str, response["queryId"])
Example #17
    def __init__(
        self,
        path: str,
        s3_block_size: int,
        mode: str,
        use_threads: bool,
        s3_additional_kwargs: Optional[Dict[str, str]],
        boto3_session: Optional[boto3.Session],
        newline: Optional[str],
        encoding: Optional[str],
    ) -> None:
        self.closed: bool = False
        self._use_threads = use_threads
        self._newline: str = "\n" if newline is None else newline
        self._encoding: str = "utf-8" if encoding is None else encoding
        self._bucket, self._key = _utils.parse_path(path=path)
        self._boto3_session: boto3.Session = _utils.ensure_session(
            session=boto3_session)
        if mode not in {"rb", "wb", "r", "w"}:
            raise NotImplementedError(
                "File mode must be {'rb', 'wb', 'r', 'w'}, not %s" % mode)
        self._mode: str = "rb" if mode is None else mode
        self._one_shot_download: bool = False
        if 0 < s3_block_size < 3:
            raise exceptions.InvalidArgumentValue(
                "s3_block_size MUST be > 2 to define a valid block size or "
                "< 1 to avoid blocks and always execute one-shot downloads.")
        if s3_block_size <= 0:
            _logger.debug("s3_block_size of %d, enabling one_shot_download.",
                          s3_block_size)
            self._one_shot_download = True
        self._s3_block_size: int = s3_block_size
        self._s3_half_block_size: int = s3_block_size // 2
        self._s3_additional_kwargs: Dict[
            str,
            str] = {} if s3_additional_kwargs is None else s3_additional_kwargs
        self._client: boto3.client = _utils.client(service_name="s3",
                                                   session=self._boto3_session)
        self._loc: int = 0

        if self.readable() is True:
            self._cache: bytes = b""
            self._start: int = 0
            self._end: int = 0
            size: Optional[int] = size_objects(
                path=[path],
                use_threads=False,
                boto3_session=self._boto3_session)[path]
            if size is None:
                raise exceptions.InvalidArgumentValue(
                    f"S3 object w/o defined size: {path}")
            self._size: int = size
            _logger.debug("self._size: %s", self._size)
            _logger.debug("self._s3_block_size: %s", self._s3_block_size)
        elif self.writable() is True:
            self._mpu: Dict[str, Any] = {}
            self._buffer: io.BytesIO = io.BytesIO()
            self._parts_count: int = 0
            self._size = 0
            self._upload_proxy: _UploadProxy = _UploadProxy(
                use_threads=self._use_threads)
        else:
            raise RuntimeError(f"Invalid mode: {self._mode}")
Example #18
def create_table(
    database: str,
    table: str,
    memory_retention_hours: int,
    magnetic_retention_days: int,
    tags: Optional[Dict[str, str]] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Create a new Timestream database.

    Note
    ----
    If the KMS key is not specified, the database will be encrypted with a
    Timestream managed KMS key located in your account.

    Parameters
    ----------
    database: str
        Database name.
    table: str
        Table name.
    memory_retention_hours: int
        The duration for which data must be stored in the memory store.
    magnetic_retention_days: int
        The duration for which data must be stored in the magnetic store.
    tags: Optional[Dict[str, str]]
        Key/Value dict to put on the table.
        Tags enable you to categorize databases and/or tables, for example,
        by purpose, owner, or environment.
        e.g. {"foo": "boo", "bar": "xoo"})
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.

    Returns
    -------
    str
        The Amazon Resource Name (ARN) that uniquely identifies this table.

    Examples
    --------
    Creating a table.

    >>> import awswrangler as wr
    >>> arn = wr.timestream.create_table(
    ...     database="MyDatabase",
    ...     table="MyTable",
    ...     memory_retention_hours=3,
    ...     magnetic_retention_days=7
    ... )

    """
    client: boto3.client = _utils.client(service_name="timestream-write",
                                         session=boto3_session)
    args: Dict[str, Any] = {
        "DatabaseName": database,
        "TableName": table,
        "RetentionProperties": {
            "MemoryStoreRetentionPeriodInHours": memory_retention_hours,
            "MagneticStoreRetentionPeriodInDays": magnetic_retention_days,
        },
    }
    if tags is not None:
        args["Tags"] = [{"Key": k, "Value": v} for k, v in tags.items()]
    response: Dict[str, Dict[str, Any]] = client.create_table(**args)
    return cast(str, response["Table"]["Arn"])
Example #19
def _extract_ctas_manifest_paths(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]:
    """Get the list of paths of the generated files."""
    bucket_name, key_path = _utils.parse_path(path)
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    body: bytes = client_s3.get_object(Bucket=bucket_name, Key=key_path)["Body"].read()
    return [x for x in body.decode("utf-8").split("\n") if x != ""]
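
The manifest written by an Athena CTAS query is just a newline-separated list of S3 object paths, so the parsing step above can be sanity-checked locally (illustrative paths):

>>> body = b"s3://bucket/path/file0.parquet\ns3://bucket/path/file1.parquet\n"
>>> [x for x in body.decode("utf-8").split("\n") if x != ""]
['s3://bucket/path/file0.parquet', 's3://bucket/path/file1.parquet']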
Example #20
def _list_objects(  # pylint: disable=too-many-branches
    path: str,
    delimiter: Optional[str] = None,
    suffix: Union[str, List[str], None] = None,
    ignore_suffix: Union[str, List[str], None] = None,
    last_modified_begin: Optional[datetime.datetime] = None,
    last_modified_end: Optional[datetime.datetime] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> List[str]:
    bucket: str
    prefix_original: str
    bucket, prefix_original = _utils.parse_path(path=path)
    prefix: str = _prefix_cleanup(prefix=prefix_original)
    _suffix: Union[List[str],
                   None] = [suffix] if isinstance(suffix, str) else suffix
    _ignore_suffix: Union[List[str], None] = [ignore_suffix] if isinstance(
        ignore_suffix, str) else ignore_suffix
    client_s3: boto3.client = _utils.client(service_name="s3",
                                            session=boto3_session)
    paginator = client_s3.get_paginator("list_objects_v2")
    args: Dict[str, Any] = {
        "Bucket": bucket,
        "Prefix": prefix,
        "PaginationConfig": {
            "PageSize": 1000
        }
    }
    if delimiter is not None:
        args["Delimiter"] = delimiter
    response_iterator = paginator.paginate(**args)
    paths: List[str] = []
    _validate_datetimes(last_modified_begin=last_modified_begin,
                        last_modified_end=last_modified_end)

    for page in response_iterator:  # pylint: disable=too-many-nested-blocks
        if delimiter is None:
            contents: Optional[List[Dict[str, Any]]] = page.get("Contents")
            if contents is not None:
                for content in contents:
                    if (content is not None) and ("Key" in content):
                        key: str = content["Key"]
                        if (_suffix is None) or key.endswith(tuple(_suffix)):
                            if last_modified_begin is not None:
                                if content["LastModified"] < last_modified_begin:
                                    continue
                            if last_modified_end is not None:
                                if content["LastModified"] > last_modified_end:
                                    continue
                            paths.append(f"s3://{bucket}/{key}")
        else:
            prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get(
                "CommonPrefixes")
            if prefixes is not None:
                for pfx in prefixes:
                    if (pfx is not None) and ("Prefix" in pfx):
                        key = pfx["Prefix"]
                        paths.append(f"s3://{bucket}/{key}")

    if prefix != prefix_original:
        paths = fnmatch.filter(paths, path)

    if _ignore_suffix is not None:
        paths = [
            p for p in paths if p.endswith(tuple(_ignore_suffix)) is False
        ]

    return paths
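
When the original prefix contains wildcards, _prefix_cleanup presumably trims it to its literal part (the helper is not shown here) and the final fnmatch.filter applies the full pattern. The filtering step alone behaves like this sketch (made-up paths):

>>> import fnmatch
>>> paths = [
...     "s3://bucket/year=2021/part-00.parquet",
...     "s3://bucket/year=2022/part-00.parquet",
... ]
>>> fnmatch.filter(paths, "s3://bucket/year=2021/*")
['s3://bucket/year=2021/part-00.parquet']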
Example #21
def _create_table(  # pylint: disable=too-many-branches,too-many-statements
    database: str,
    table: str,
    description: Optional[str],
    parameters: Optional[Dict[str, str]],
    mode: str,
    catalog_versioning: bool,
    boto3_session: Optional[boto3.Session],
    table_input: Dict[str, Any],
    table_exist: bool,
    projection_enabled: bool,
    partitions_types: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
    projection_types: Optional[Dict[str, str]],
    projection_ranges: Optional[Dict[str, str]],
    projection_values: Optional[Dict[str, str]],
    projection_intervals: Optional[Dict[str, str]],
    projection_digits: Optional[Dict[str, str]],
    catalog_id: Optional[str],
) -> None:
    # Description
    mode = _update_if_necessary(dic=table_input,
                                key="Description",
                                value=description,
                                mode=mode)

    # Parameters
    parameters = parameters if parameters else {}
    for k, v in parameters.items():
        mode = _update_if_necessary(dic=table_input["Parameters"],
                                    key=k,
                                    value=v,
                                    mode=mode)

    # Projection
    if projection_enabled is True:
        table_input["Parameters"]["projection.enabled"] = "true"
        partitions_types = partitions_types if partitions_types else {}
        projection_types = projection_types if projection_types else {}
        projection_ranges = projection_ranges if projection_ranges else {}
        projection_values = projection_values if projection_values else {}
        projection_intervals = projection_intervals if projection_intervals else {}
        projection_digits = projection_digits if projection_digits else {}
        projection_types = {
            sanitize_column_name(k): v
            for k, v in projection_types.items()
        }
        projection_ranges = {
            sanitize_column_name(k): v
            for k, v in projection_ranges.items()
        }
        projection_values = {
            sanitize_column_name(k): v
            for k, v in projection_values.items()
        }
        projection_intervals = {
            sanitize_column_name(k): v
            for k, v in projection_intervals.items()
        }
        projection_digits = {
            sanitize_column_name(k): v
            for k, v in projection_digits.items()
        }
        for k, v in projection_types.items():
            dtype: Optional[str] = partitions_types.get(k)
            if dtype is None:
                raise exceptions.InvalidArgumentCombination(
                    f"Column {k} appears as projected column but not as partitioned column."
                )
            if dtype == "date":
                table_input["Parameters"][
                    f"projection.{k}.format"] = "yyyy-MM-dd"
            elif dtype == "timestamp":
                table_input["Parameters"][
                    f"projection.{k}.format"] = "yyyy-MM-dd HH:mm:ss"
                table_input["Parameters"][
                    f"projection.{k}.interval.unit"] = "SECONDS"
                table_input["Parameters"][f"projection.{k}.interval"] = "1"
        for k, v in projection_types.items():
            mode = _update_if_necessary(dic=table_input["Parameters"],
                                        key=f"projection.{k}.type",
                                        value=v,
                                        mode=mode)
        for k, v in projection_ranges.items():
            mode = _update_if_necessary(dic=table_input["Parameters"],
                                        key=f"projection.{k}.range",
                                        value=v,
                                        mode=mode)
        for k, v in projection_values.items():
            mode = _update_if_necessary(dic=table_input["Parameters"],
                                        key=f"projection.{k}.values",
                                        value=v,
                                        mode=mode)
        for k, v in projection_intervals.items():
            mode = _update_if_necessary(dic=table_input["Parameters"],
                                        key=f"projection.{k}.interval",
                                        value=str(v),
                                        mode=mode)
        for k, v in projection_digits.items():
            mode = _update_if_necessary(dic=table_input["Parameters"],
                                        key=f"projection.{k}.digits",
                                        value=str(v),
                                        mode=mode)
    else:
        table_input["Parameters"]["projection.enabled"] = "false"

    # Column comments
    columns_comments = columns_comments if columns_comments else {}
    columns_comments = {
        sanitize_column_name(k): v
        for k, v in columns_comments.items()
    }
    if columns_comments:
        for col in table_input["StorageDescriptor"]["Columns"]:
            name: str = col["Name"]
            if name in columns_comments:
                mode = _update_if_necessary(dic=col,
                                            key="Comment",
                                            value=columns_comments[name],
                                            mode=mode)
        for par in table_input["PartitionKeys"]:
            name = par["Name"]
            if name in columns_comments:
                mode = _update_if_necessary(dic=par,
                                            key="Comment",
                                            value=columns_comments[name],
                                            mode=mode)

    _logger.debug("table_input: %s", table_input)

    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_glue: boto3.client = _utils.client(service_name="glue",
                                              session=session)
    skip_archive: bool = not catalog_versioning
    if mode not in ("overwrite", "append", "overwrite_partitions", "update"):
        raise exceptions.InvalidArgument(
            f"{mode} is not a valid mode. It must be 'overwrite', 'append' or 'overwrite_partitions'."
        )
    if table_exist is True and mode == "overwrite":
        delete_all_partitions(table=table,
                              database=database,
                              catalog_id=catalog_id,
                              boto3_session=session)
        _logger.debug("Updating table (%s)...", mode)
        client_glue.update_table(**_catalog_id(catalog_id=catalog_id,
                                               DatabaseName=database,
                                               TableInput=table_input,
                                               SkipArchive=skip_archive))
    elif (table_exist is True) and (mode in ("append", "overwrite_partitions",
                                             "update")):
        if mode == "update":
            _logger.debug("Updating table (%s)...", mode)
            client_glue.update_table(**_catalog_id(catalog_id=catalog_id,
                                                   DatabaseName=database,
                                                   TableInput=table_input,
                                                   SkipArchive=skip_archive))
    elif table_exist is False:
        try:
            _logger.debug("Creating table (%s)...", mode)
            client_glue.create_table(**_catalog_id(catalog_id=catalog_id,
                                                   DatabaseName=database,
                                                   TableInput=table_input))
        except client_glue.exceptions.AlreadyExistsException:
            if mode == "overwrite":
                _utils.try_it(
                    f=_overwrite_table,
                    ex=client_glue.exceptions.AlreadyExistsException,
                    client_glue=client_glue,
                    catalog_id=catalog_id,
                    database=database,
                    table=table,
                    table_input=table_input,
                    boto3_session=boto3_session,
                )
    _logger.debug("Leaving table as is (%s)...", mode)
Example #22
def _resolve_sql_query(
    query_id: str,
    categories: Optional[List[str]],
    safe: bool,
    map_types: bool,
    use_threads: bool,
    boto3_session: boto3.Session,
) -> pd.DataFrame:
    client_lakeformation: boto3.client = _utils.client(
        service_name="lakeformation", session=boto3_session)

    wait_query(query_id=query_id, boto3_session=boto3_session)

    # The LF Query Engine distributes the load across workers
    # Retrieve the tokens and their associated work units until NextToken is ''
    # One Token can span multiple work units
    # PageSize determines the size of the "Units" array in each call
    scan_kwargs: Dict[str, Union[str, int]] = {
        "QueryId": query_id,
        "PageSize": 10
    }
    next_token: str = "init_token"  # Dummy token
    token_work_units: List[Tuple[str, int]] = []
    while next_token:
        response = client_lakeformation.get_work_units(**scan_kwargs)
        token_work_units.extend(  # [(Token0, WorkUnitId0), (Token0, WorkUnitId1), (Token1, WorkUnitId2) ... ]
            [
                (unit["WorkUnitToken"], unit_id)
                for unit in response["WorkUnitRanges"] for unit_id in range(
                    unit["WorkUnitIdMin"], unit["WorkUnitIdMax"] +
                    1)  # Max is inclusive
            ])
        next_token = response.get("NextToken", None)
        scan_kwargs["NextToken"] = next_token

    tables: List[Table] = []
    if use_threads is False:
        tables = list(
            _get_work_unit_results(
                query_id=query_id,
                token_work_unit=token_work_unit,
                client_lakeformation=client_lakeformation,
            ) for token_work_unit in token_work_units)
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=cpus) as executor:
            tables = list(
                executor.map(
                    _get_work_unit_results,
                    itertools.repeat(query_id),
                    token_work_units,
                    itertools.repeat(client_lakeformation),
                ))
    table = concat_tables(tables)
    args = {
        "use_threads": use_threads,
        "split_blocks": True,
        "self_destruct": True,
        "integer_object_nulls": False,
        "date_as_object": True,
        "ignore_metadata": True,
        "strings_to_categorical": False,
        "categories": categories,
        "safe": safe,
        "types_mapper": _data_types.pyarrow2pandas_extension if map_types else None,
    }
    return _utils.ensure_df_is_mutable(df=table.to_pandas(**args))
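
The token/work-unit expansion in the middle of the function flattens each WorkUnitRange into one (token, work_unit_id) pair per ID; on a made-up response it behaves like this:

>>> response = {"WorkUnitRanges": [
...     {"WorkUnitToken": "tok0", "WorkUnitIdMin": 0, "WorkUnitIdMax": 1},
...     {"WorkUnitToken": "tok1", "WorkUnitIdMin": 2, "WorkUnitIdMax": 2},
... ]}
>>> [(unit["WorkUnitToken"], unit_id)
...  for unit in response["WorkUnitRanges"]
...  for unit_id in range(unit["WorkUnitIdMin"], unit["WorkUnitIdMax"] + 1)]
[('tok0', 0), ('tok0', 1), ('tok1', 2)]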
Example #23
def create_athena_dataset(
    name: str,
    database: Optional[str] = None,
    table: Optional[str] = None,
    sql: Optional[str] = None,
    sql_name: str = "CustomSQL",
    data_source_name: Optional[str] = None,
    data_source_arn: Optional[str] = None,
    import_mode: str = "DIRECT_QUERY",
    allowed_to_use: Optional[List[str]] = None,
    allowed_to_manage: Optional[List[str]] = None,
    logical_table_alias: str = "LogicalTable",
    rename_columns: Optional[Dict[str, str]] = None,
    cast_columns_types: Optional[Dict[str, str]] = None,
    tags: Optional[Dict[str, str]] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Create a QuickSight dataset.

    Note
    ----
    You will not be able to see the dataset in the console
    if you do not pass your user to one of the ``allowed_*`` arguments.

    Note
    ----
    You must pass ``database``/``table`` OR ``sql`` argument.

    Note
    ----
    You must pass ``data_source_name`` OR ``data_source_arn`` argument.

    Parameters
    ----------
    name : str
        Dataset name.
    database : str
        Athena's database name.
    table : str
        Athena's table name.
    sql : str
        Use a SQL query to define your table.
    sql_name : str
        Query name.
    data_source_name : str, optional
        QuickSight data source name.
    data_source_arn : str, optional
        QuickSight data source ARN.
    import_mode : str
        Indicates whether you want to import the data into SPICE.
        'SPICE'|'DIRECT_QUERY'
    tags : Dict[str, str], optional
        Key/Value collection to put on the Cluster.
        e.g. {"foo": "boo", "bar": "xoo"})
    allowed_to_use : optional
        List of principals that will be allowed to see and use the data source.
        e.g. ["john", "Mary"]
    allowed_to_manage : optional
        List of principals that will be allowed to see, use, update and delete the data source.
        e.g. ["Mary"]
    logical_table_alias : str
        A display name for the logical table.
    rename_columns : Dict[str, str], optional
        Dictionary to map column renames. e.g. {"old_name": "new_name", "old_name2": "new_name2"}
    cast_columns_types : Dict[str, str], optional
        Dictionary to map column casts. e.g. {"col_name": "STRING", "col_name2": "DECIMAL"}
        Valid types: 'STRING'|'INTEGER'|'DECIMAL'|'DATETIME'
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Dataset ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> dataset_id = wr.quicksight.create_athena_dataset(
    ...     name="...",
    ...     database="...",
    ...     table="...",
    ...     data_source_name="...",
    ...     allowed_to_manage=["Mary"]
    ... )
    """
    if (data_source_name is None) and (data_source_arn is None):
        raise exceptions.InvalidArgument("You must pass a not None data_source_name or data_source_arn argument.")
    if ((database is None) and (table is None)) and (sql is None):
        raise exceptions.InvalidArgument("You must pass database/table OR sql argument.")
    if (database is not None) and (sql is not None):
        raise exceptions.InvalidArgument(
            "If you provide the sql argument, please include the database name inside the sql statement. "
            "Do NOT pass it through the database argument as well."
        )
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (data_source_arn is None) and (data_source_name is not None):
        data_source_arn = get_data_source_arn(name=data_source_name, account_id=account_id, boto3_session=session)
    if sql is not None:
        physical_table: Dict[str, Dict[str, Any]] = {
            "CustomSql": {
                "DataSourceArn": data_source_arn,
                "Name": sql_name,
                "SqlQuery": sql,
                "Columns": extract_athena_query_columns(
                    sql=sql,
                    data_source_arn=data_source_arn,  # type: ignore
                    account_id=account_id,
                    boto3_session=session,
                ),
            }
        }
    else:
        physical_table = {
            "RelationalTable": {
                "DataSourceArn": data_source_arn,
                "Schema": database,
                "Name": table,
                "InputColumns": extract_athena_table_columns(
                    database=database,  # type: ignore
                    table=table,  # type: ignore
                    boto3_session=session,
                ),
            }
        }
    table_uuid: str = uuid.uuid4().hex
    dataset_id: str = uuid.uuid4().hex
    args: Dict[str, Any] = {
        "AwsAccountId": account_id,
        "DataSetId": dataset_id,
        "Name": name,
        "ImportMode": import_mode,
        "PhysicalTableMap": {table_uuid: physical_table},
        "LogicalTableMap": {table_uuid: {"Alias": logical_table_alias, "Source": {"PhysicalTableId": table_uuid}}},
    }
    trans: List[Dict[str, Dict[str, Any]]] = _generate_transformations(
        rename_columns=rename_columns, cast_columns_types=cast_columns_types
    )
    if trans:
        args["LogicalTableMap"][table_uuid]["DataTransforms"] = trans
    permissions: List[Dict[str, Union[str, List[str]]]] = _generate_permissions(
        resource="dataset",
        account_id=account_id,
        boto3_session=session,
        allowed_to_use=allowed_to_use,
        allowed_to_manage=allowed_to_manage,
    )
    if permissions:
        args["Permissions"] = permissions
    if tags is not None:
        _tags: List[Dict[str, str]] = [{"Key": k, "Value": v} for k, v in tags.items()]
        args["Tags"] = _tags
    client.create_data_set(**args)
    return dataset_id
Example #24
def get_redshift_temp_engine(
    cluster_identifier: str,
    user: str,
    database: Optional[str] = None,
    duration: int = 900,
    auto_create: bool = True,
    db_groups: Optional[List[str]] = None,
    boto3_session: Optional[boto3.Session] = None,
    **sqlalchemy_kwargs: Any,
) -> sqlalchemy.engine.Engine:
    """Get Glue connection details.

    Parameters
    ----------
    cluster_identifier : str
        The unique identifier of a cluster.
        This parameter is case sensitive.
    user : str
        The name of a database user.
    database : str, optional
        Database name. If None, the default Database is used.
    duration : int, optional
        The number of seconds until the returned temporary password expires.
        Constraint: minimum 900, maximum 3600.
        Default: 900
    auto_create : bool
        Create a database user with the name specified for the user named in user if one does not exist.
    db_groups: List[str], optional
        A list of the names of existing database groups that the user named in DbUser will join for the current session,
        in addition to any group memberships for an existing user.
        If not specified, a new user is added only to PUBLIC.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    sqlalchemy_kwargs
        keyword arguments forwarded to sqlalchemy.create_engine().
        https://docs.sqlalchemy.org/en/13/core/engines.html

    Returns
    -------
    sqlalchemy.engine.Engine
        SQLAlchemy Engine.

    Examples
    --------
    >>> import awswrangler as wr
    >>> engine = wr.db.get_redshift_temp_engine('my_cluster', 'my_user')

    """
    client_redshift: boto3.client = _utils.client(service_name="redshift",
                                                  session=boto3_session)
    args: Dict[str, Any] = {
        "DbUser": user,
        "ClusterIdentifier": cluster_identifier,
        "DurationSeconds": duration,
        "AutoCreate": auto_create,
    }
    if db_groups is not None:
        args["DbGroups"] = db_groups
    res: Dict[str, Any] = client_redshift.get_cluster_credentials(**args)
    _user: str = _quote_plus(res["DbUser"])
    password: str = _quote_plus(res["DbPassword"])
    cluster: Dict[str, Any] = client_redshift.describe_clusters(
        ClusterIdentifier=cluster_identifier)["Clusters"][0]
    host: str = cluster["Endpoint"]["Address"]
    port: str = cluster["Endpoint"]["Port"]
    if database is None:
        database = cluster["DBName"]
    conn_str: str = f"redshift+psycopg2://{_user}:{password}@{host}:{port}/{database}"
    sqlalchemy_kwargs["executemany_mode"] = "values"
    sqlalchemy_kwargs["executemany_values_page_size"] = 100_000
    return sqlalchemy.create_engine(conn_str, **sqlalchemy_kwargs)
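
The returned object is an ordinary SQLAlchemy engine, so it can be handed straight to pandas (usage sketch):

>>> import pandas as pd
>>> import awswrangler as wr
>>> engine = wr.db.get_redshift_temp_engine('my_cluster', 'my_user')
>>> df = pd.read_sql("SELECT 1 AS col0;", con=engine)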
Example #25
def write_redshift_copy_manifest(
    manifest_path: str,
    paths: List[str],
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]:
    """Write Redshift copy manifest and return its structure.

    Only Parquet files are supported.

    Note
    ----
    If `use_threads=True`, the number of threads spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    manifest_path : str
        Amazon S3 manifest path (e.g. s3://...)
    paths: List[str]
        List of S3 paths (Parquet Files) to be copied.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs:
        Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass",
        "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging".
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}

    Returns
    -------
    Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]
        Manifest content.

    Examples
    --------
    Copying two files to Redshift cluster.

    >>> import awswrangler as wr
    >>> wr.db.write_redshift_copy_manifest(
    ...     manifest_path="s3://bucket/my.manifest",
    ...     paths=["s3://...parquet", "s3://...parquet"]
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    objects_sizes: Dict[str, Optional[int]] = s3.size_objects(
        path=paths, use_threads=use_threads, boto3_session=session)
    manifest: Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]] = {
        "entries": []
    }
    path: str
    size: Optional[int]
    for path, size in objects_sizes.items():
        if size is not None:
            entry: Dict[str, Union[str, bool, Dict[str, int]]] = {
                "url": path,
                "mandatory": True,
                "meta": {
                    "content_length": size
                },
            }
            manifest["entries"].append(entry)
    payload: str = json.dumps(manifest)
    bucket: str
    bucket, key = _utils.parse_path(manifest_path)
    additional_kwargs: Dict[
        str,
        str] = {} if s3_additional_kwargs is None else s3_additional_kwargs
    _logger.debug("payload: %s", payload)
    client_s3: boto3.client = _utils.client(service_name="s3", session=session)
    _logger.debug("bucket: %s", bucket)
    _logger.debug("key: %s", key)
    client_s3.put_object(Body=payload,
                         Bucket=bucket,
                         Key=key,
                         **additional_kwargs)
    return manifest
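
For two 1024-byte Parquet objects, the manifest that gets uploaded and returned has the shape below (illustrative paths and sizes):

# Illustrative manifest content for two objects of 1024 bytes each:
example_manifest = {
    "entries": [
        {"url": "s3://bucket/file0.parquet", "mandatory": True, "meta": {"content_length": 1024}},
        {"url": "s3://bucket/file1.parquet", "mandatory": True, "meta": {"content_length": 1024}},
    ]
}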
Example #26
def _start_query_execution(
    sql: str,
    wg_config: _WorkGroupConfig,
    database: Optional[str] = None,
    data_source: Optional[str] = None,
    s3_output: Optional[str] = None,
    workgroup: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    args: Dict[str, Any] = {"QueryString": sql}
    session: boto3.Session = _utils.ensure_session(session=boto3_session)

    # s3_output
    args["ResultConfiguration"] = {
        "OutputLocation":
        _get_s3_output(s3_output=s3_output,
                       wg_config=wg_config,
                       boto3_session=session)
    }

    # encryption
    if wg_config.enforced is True:
        if wg_config.encryption is not None:
            args["ResultConfiguration"]["EncryptionConfiguration"] = {
                "EncryptionOption": wg_config.encryption
            }
            if wg_config.kms_key is not None:
                args["ResultConfiguration"]["EncryptionConfiguration"][
                    "KmsKey"] = wg_config.kms_key
    else:
        if encryption is not None:
            args["ResultConfiguration"]["EncryptionConfiguration"] = {
                "EncryptionOption": encryption
            }
            if kms_key is not None:
                args["ResultConfiguration"]["EncryptionConfiguration"][
                    "KmsKey"] = kms_key

    # database
    if database is not None:
        args["QueryExecutionContext"] = {"Database": database}
        if data_source is not None:
            args["QueryExecutionContext"]["Catalog"] = data_source

    # workgroup
    if workgroup is not None:
        args["WorkGroup"] = workgroup

    client_athena: boto3.client = _utils.client(service_name="athena",
                                                session=session)
    _logger.debug("args: \n%s", pprint.pformat(args))
    response: Dict[str, Any] = _utils.try_it(
        f=client_athena.start_query_execution,
        ex=botocore.exceptions.ClientError,
        ex_code="ThrottlingException",
        max_num_tries=5,
        **args,
    )
    return cast(str, response["QueryExecutionId"])
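# For illustration only: a sketch of the request shape the helper above ends up sending
# to Athena's StartQueryExecution API. All values below are placeholders.
example_args = {
    "QueryString": "SELECT 1",
    "ResultConfiguration": {
        "OutputLocation": "s3://bucket/athena-results/",
        "EncryptionConfiguration": {"EncryptionOption": "SSE_KMS", "KmsKey": "YOUR_KMS_KEY_ARN"},
    },
    "QueryExecutionContext": {"Database": "default", "Catalog": "AwsDataCatalog"},
    "WorkGroup": "primary",
}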
Example #27
def create_athena_data_source(
    name: str,
    workgroup: str = "primary",
    allowed_to_use: Optional[List[str]] = None,
    allowed_to_manage: Optional[List[str]] = None,
    tags: Optional[Dict[str, str]] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Create a QuickSight data source pointing to an Athena/Workgroup.

    Note
    ----
    You will not be able to see the data source in the console
    if you do not pass your user to one of the ``allowed_*`` arguments.

    Parameters
    ----------
    name : str
        Data source name.
    workgroup : str
        Athena workgroup.
    allowed_to_use : optional
        List of principals that will be allowed to see and use the data source.
        e.g. ["John"]
    allowed_to_manage : optional
        List of principals that will be allowed to see, use, update and delete the data source.
        e.g. ["Mary"]
    tags : Dict[str, str], optional
        Key/Value collection to put on the data source.
        e.g. {"foo": "boo", "bar": "xoo"}
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.create_athena_data_source(
    ...     name="...",
    ...     allowed_to_manage=["john"]
    ... )
    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    args: Dict[str, Any] = {
        "AwsAccountId": account_id,
        "DataSourceId": name,
        "Name": name,
        "Type": "ATHENA",
        "DataSourceParameters": {"AthenaParameters": {"WorkGroup": workgroup}},
        "SslProperties": {"DisableSsl": True},
    }
    permissions: List[Dict[str, Union[str, List[str]]]] = _generate_permissions(
        resource="data_source",
        account_id=account_id,
        boto3_session=session,
        allowed_to_use=allowed_to_use,
        allowed_to_manage=allowed_to_manage,
    )
    if permissions:
        args["Permissions"] = permissions
    if tags is not None:
        _tags: List[Dict[str, str]] = [{"Key": k, "Value": v} for k, v in tags.items()]
        args["Tags"] = _tags
    client.create_data_source(**args)
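# A slightly fuller usage sketch. The data source name, principal names and tag values
# below are hypothetical placeholders.
import awswrangler as wr

wr.quicksight.create_athena_data_source(
    name="my-athena-source",
    workgroup="primary",
    allowed_to_use=["John"],
    allowed_to_manage=["Mary"],
    tags={"env": "dev"},
)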
Example #28
def get_tables(
    catalog_id: Optional[str] = None,
    database: Optional[str] = None,
    transaction_id: Optional[str] = None,
    name_contains: Optional[str] = None,
    name_prefix: Optional[str] = None,
    name_suffix: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Iterator[Dict[str, Any]]:
    """Get an iterator of tables.

    Note
    ----
    Please do not filter using name_contains and name_prefix/name_suffix at the same time.
    Only name_prefix and name_suffix can be combined together.

    Parameters
    ----------
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    database : str, optional
        Database name.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    name_contains : str, optional
        Select by a specific string on table name
    name_prefix : str, optional
        Select by a specific prefix on table name
    name_suffix : str, optional
        Select by a specific suffix on table name
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Iterator[Dict[str, Any]]
        Iterator of tables.

    Examples
    --------
    >>> import awswrangler as wr
    >>> tables = wr.catalog.get_tables()

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    paginator = client_glue.get_paginator("get_tables")
    args: Dict[str, str] = {}
    if (name_contains is not None) and ((name_prefix is not None) or (name_suffix is not None)):
        raise exceptions.InvalidArgumentCombination(
            "Please do not filter using name_contains and "
            "name_prefix/name_suffix at the same time. Only "
            "name_prefix and name_suffix can be combined together."
        )
    if (name_prefix is not None) and (name_suffix is not None):
        args["Expression"] = f"{name_prefix}*{name_suffix}"
    elif name_contains is not None:
        args["Expression"] = f"*{name_contains}*"
    elif name_prefix is not None:
        args["Expression"] = f"{name_prefix}*"
    elif name_suffix is not None:
        args["Expression"] = f"*{name_suffix}"
    if database is not None:
        dbs: List[str] = [database]
    else:
        dbs = [x["Name"] for x in get_databases(catalog_id=catalog_id)]
    for db in dbs:
        args["DatabaseName"] = db
        response_iterator = paginator.paginate(
            **_catalog_id(catalog_id=catalog_id, **_transaction_id(transaction_id=transaction_id, **args))
        )
        try:
            for page in response_iterator:
                for tbl in page["TableList"]:
                    yield tbl
        except client_glue.exceptions.EntityNotFoundException:
            continue
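# Usage sketch of the filtering options documented above. The database name and name
# patterns are hypothetical placeholders.
import awswrangler as wr

# Tables in "default" whose names start with "sales_" and end with "_2020".
names = [
    tbl["Name"]
    for tbl in wr.catalog.get_tables(database="default", name_prefix="sales_", name_suffix="_2020")
]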
Example #29
def read_parquet_table(
    table: str,
    database: str,
    catalog_id: Optional[str] = None,
    partition_filter: Optional[Callable[[Dict[str, str]], bool]] = None,
    columns: Optional[List[str]] = None,
    validate_schema: bool = True,
    categories: Optional[List[str]] = None,
    safe: bool = True,
    chunked: Union[bool, int] = False,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Read Apache Parquet table registered on AWS Glue Catalog.

    Note
    ----
    ``Batching`` (`chunked` argument) (Memory Friendly):

    Will enable the function to return an Iterable of DataFrames instead of a regular DataFrame.

    There are two batching strategies on Wrangler:

    - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset.

    - If **chunked=INTEGER**, Wrangler will paginate through files slicing and concatenating
      to return DataFrames with the number of rows equal to the received INTEGER.

    `P.S.` `chunked=True` is faster and uses less memory while `chunked=INTEGER` is more precise
    in the number of rows for each DataFrame.


    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    table : str
        AWS Glue Catalog table name.
    database : str
        AWS Glue Catalog database name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    partition_filter: Optional[Callable[[Dict[str, str]], bool]]
        Callback Function filters to apply on PARTITION columns (PUSH-DOWN filter).
        This function MUST receive a single argument (Dict[str, str]) where keys are partitions
        names and values are partitions values. Partitions values will be always strings extracted from S3.
        This function MUST return a bool, True to read the partition or False to ignore it.
        E.g ``lambda x: True if x["year"] == "2020" and x["month"] == "1" else False``
        https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/023%20-%20Flexible%20Partitions%20Filter.ipynb
    columns : List[str], optional
        Names of columns to read from the file(s).
    validate_schema : bool
        Check that individual file schemas are all the same / compatible. Schemas within a
        folder prefix should all be the same. Disable if you have schemas that differ
        and want to skip this check.
    categories: Optional[List[str]], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    safe : bool, default True
        For certain data types, a cast is needed in order to store the
        data in a pandas DataFrame or Series (e.g. timestamps are always
        stored as nanoseconds in pandas). This option controls whether it
        is a safe cast or not.
    chunked : Union[bool, int]
        If True will break the data in smaller DataFrames (Non deterministic number of lines).
        Otherwise return a single DataFrame with the whole data.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered.

    Returns
    -------
    Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]]
        Pandas DataFrame or a Generator in case of `chunked=True`.

    Examples
    --------
    Reading Parquet Table

    >>> import awswrangler as wr
    >>> df = wr.s3.read_parquet_table(database='...', table='...')

    Reading Parquet Table encrypted

    >>> import awswrangler as wr
    >>> df = wr.s3.read_parquet_table(
    ...     database='...',
    ...     table='...',
    ...     s3_additional_kwargs={
    ...         'ServerSideEncryption': 'aws:kms',
    ...         'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'
    ...     }
    ... )

    Reading Parquet Table in chunks (Chunk by file)

    >>> import awswrangler as wr
    >>> dfs = wr.s3.read_parquet_table(database='...', table='...', chunked=True)
    >>> for df in dfs:
    >>>     print(df)  # Smaller Pandas DataFrame

    Reading Parquet Table with PUSH-DOWN filter over partitions

    >>> import awswrangler as wr
    >>> my_filter = lambda x: True if x["city"].startswith("new") else False
    >>> df = wr.s3.read_parquet_table(database='...', table='...', partition_filter=my_filter)

    """
    client_glue: boto3.client = _utils.client(service_name="glue",
                                              session=boto3_session)
    args: Dict[str, Any] = {"DatabaseName": database, "Name": table}
    if catalog_id is not None:
        args["CatalogId"] = catalog_id
    res: Dict[str, Any] = client_glue.get_table(**args)
    try:
        path: str = res["Table"]["StorageDescriptor"]["Location"]
    except KeyError as ex:
        raise exceptions.InvalidTable(
            f"Missing s3 location for {database}.{table}.") from ex
    return _data_types.cast_pandas_with_athena_types(
        df=read_parquet(
            path=path,
            partition_filter=partition_filter,
            columns=columns,
            validate_schema=validate_schema,
            categories=categories,
            safe=safe,
            chunked=chunked,
            dataset=True,
            use_threads=use_threads,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        ),
        dtype=_extract_partitions_dtypes_from_table_details(response=res),
    )
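# A sketch combining the push-down partition filter with file-by-file chunking.
# The database, table and partition column names are hypothetical placeholders.
import awswrangler as wr

recent = lambda x: x["year"] == "2020" and x["month"] in ("11", "12")
for df in wr.s3.read_parquet_table(
    database="default", table="my_table", partition_filter=recent, chunked=True
):
    print(len(df))  # one smaller DataFrame per Parquet file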
Example #30
def table(
    database: str,
    table: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> pd.DataFrame:
    """Get table details as Pandas DataFrame.

    Note
    ----
    If reading from a governed table, pass only one of `transaction_id` or `query_as_of_time`.

    Parameters
    ----------
    database: str
        Database name.
    table: str
        Table name.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    query_as_of_time: str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id.
    catalog_id: str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session: boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    pandas.DataFrame
        Pandas DataFrame filled with formatted table information.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df_table = wr.catalog.table(database='default', table='my_table')

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    tbl = client_glue.get_table(
        **_catalog_id(
            catalog_id=catalog_id,
            **_transaction_id(
                transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database, Name=table
            ),
        )
    )["Table"]
    df_dict: Dict[str, List[Union[str, bool]]] = {"Column Name": [], "Type": [], "Partition": [], "Comment": []}
    if "StorageDescriptor" in tbl:
        for col in tbl["StorageDescriptor"].get("Columns", []):
            df_dict["Column Name"].append(col["Name"])
            df_dict["Type"].append(col["Type"])
            df_dict["Partition"].append(False)
            if "Comment" in col:
                df_dict["Comment"].append(col["Comment"])
            else:
                df_dict["Comment"].append("")
    if "PartitionKeys" in tbl:
        for col in tbl["PartitionKeys"]:
            df_dict["Column Name"].append(col["Name"])
            df_dict["Type"].append(col["Type"])
            df_dict["Partition"].append(True)
            if "Comment" in col:
                df_dict["Comment"].append(col["Comment"])
            else:
                df_dict["Comment"].append("")
    return pd.DataFrame(data=df_dict)
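# A short sketch of working with the returned DataFrame, e.g. listing only the partition
# columns. The database and table names are hypothetical placeholders.
import awswrangler as wr

df_table = wr.catalog.table(database="default", table="my_table")
partition_cols = df_table.loc[df_table["Partition"], "Column Name"].tolist()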