def _list( func_name: str, attr_name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, **kwargs: Any, ) -> List[Dict[str, Any]]: session: boto3.Session = _utils.ensure_session(session=boto3_session) if account_id is None: account_id = sts.get_account_id(boto3_session=session) client: boto3.client = _utils.client(service_name="quicksight", session=session) func: Callable[..., Dict[str, Any]] = getattr(client, func_name) response: Dict[str, Any] = func(AwsAccountId=account_id, **kwargs) next_token: str = response.get("NextToken", None) result: List[Dict[str, Any]] = response[attr_name] while next_token is not None: response = func(AwsAccountId=account_id, NextToken=next_token, **kwargs) next_token = response.get("NextToken", None) result += response[attr_name] return result
def start_transaction(read_only: Optional[bool] = False, time_out: Optional[float] = inf, boto3_session: Optional[boto3.Session] = None) -> str: """Start a new transaction and returns its transaction ID. The transaction is periodically extended until it's committed, canceled or the defined time-out is reached. Parameters ---------- read_only : bool, optional Indicates that that this transaction should be read only. Writes made using a read-only transaction ID will be rejected. Read-only transactions do not need to be committed. time_out: float, optional Maximum duration over which a transaction is extended. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session received None. Returns ------- str An opaque identifier for the transaction. Examples -------- >>> import awswrangler as wr >>> transaction_id = wr.lakeformation.start_transaction(read_only=False) """ session: boto3.Session = _utils.ensure_session(session=boto3_session) client_lakeformation: boto3.client = _utils.client( service_name="lakeformation", session=session) transaction_type: str = "READ_ONLY" if read_only else "READ_AND_WRITE" transaction_id: str = client_lakeformation.start_transaction( TransactionType=transaction_type)["TransactionId"] # Extend the transaction while in "active" state in a separate thread t = Thread(target=_monitor_transaction, args=(transaction_id, time_out, boto3_session)) t.daemon = True # Ensures thread is killed when any exception is raised t.start() return transaction_id
def get_account_id(boto3_session: Optional[boto3.Session] = None) -> str: """Get Account ID. Parameters ---------- boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. Returns ------- str Account ID. Examples -------- >>> import awswrangler as wr >>> account_id = wr.sts.get_account_id() """ session: boto3.Session = _utils.ensure_session(session=boto3_session) return _utils.client(service_name="sts", session=session).get_caller_identity().get("Account")
def wait_query( query_id: str, boto3_session: Optional[boto3.Session] = None) -> Dict[str, Any]: """Wait for the query to end. Parameters ---------- query_id : str Lake Formation query execution ID. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session received None. Returns ------- Dict[str, Any] Dictionary with the get_query_state response. Examples -------- >>> import awswrangler as wr >>> res = wr.lakeformation.wait_query(query_id='query-id') """ session: boto3.Session = _utils.ensure_session(session=boto3_session) client_lakeformation: boto3.client = _utils.client( service_name="lakeformation", session=session) response: Dict[str, Any] = client_lakeformation.get_query_state( QueryId=query_id) state: str = response["State"] while state not in _QUERY_FINAL_STATES: time.sleep(_QUERY_WAIT_POLLING_DELAY) response = client_lakeformation.get_query_state(QueryId=query_id) state = response["State"] _logger.debug("state: %s", state) if state == "ERROR": raise exceptions.QueryFailed(response.get("Error")) return response
def commit_transaction(transaction_id: str, boto3_session: Optional[boto3.Session] = None) -> None: """Commit the specified transaction. Returns exception if the transaction was previously canceled. Parameters ---------- transaction_id : str The ID of the transaction. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session received None. Returns ------- None None. Examples -------- >>> import awswrangler as wr >>> wr.lakeformation.commit_transaction(transaction_id="...") """ session: boto3.Session = _utils.ensure_session(session=boto3_session) client_lakeformation: boto3.client = _utils.client( service_name="lakeformation", session=session) client_lakeformation.commit_transaction(TransactionId=transaction_id) committed: bool = False # Confirm transaction was committed while not committed: state: str = describe_transaction(transaction_id=transaction_id, boto3_session=session) if state == "committed": committed = True elif state == "aborted": raise exceptions.CommitCancelled( f"Transaction commit with id {transaction_id} was aborted.") time.sleep(_TRANSACTION_WAIT_COMMIT_DELAY)
def get_query_columns_types( query_execution_id: str, boto3_session: Optional[boto3.Session] = None) -> Dict[str, str]: """Get the data type of all columns queried. https://docs.aws.amazon.com/athena/latest/ug/data-types.html Parameters ---------- query_execution_id : str Athena query execution ID. boto3_session : boto3.Session(), optional Boto3 Session. If none, the default boto3 session is used. Returns ------- Dict[str, str] Dictionary with all data types. Examples -------- >>> import awswrangler as wr >>> wr.athena.get_query_columns_types('query-execution-id') {'col0': 'int', 'col1': 'double'} """ client_athena: boto3.client = _utils.client( service_name="athena", session=_utils.ensure_session(session=boto3_session)) response: Dict[str, Any] = client_athena.get_query_results( QueryExecutionId=query_execution_id, MaxResults=1) col_info: List[Dict[ str, str]] = response["ResultSet"]["ResultSetMetadata"]["ColumnInfo"] return dict( (c["Name"], f"{c['Type']}({c['Precision']},{c.get('Scale', 0)})" ) if c["Type"] in ["decimal"] else (c["Name"], c["Type"]) for c in col_info)
def _read_parquet( path: str, version_id: Optional[str], columns: Optional[List[str]], categories: Optional[List[str]], safe: bool, map_types: bool, boto3_session: Union[boto3.Session, _utils.Boto3PrimitivesType], dataset: bool, path_root: Optional[str], s3_additional_kwargs: Optional[Dict[str, str]], use_threads: Union[bool, int], pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None, ) -> pd.DataFrame: pyarrow_args = _set_default_pyarrow_additional_kwargs(pyarrow_additional_kwargs) boto3_session = _utils.ensure_session(boto3_session) return _arrowtable2df( table=_read_parquet_file( path=path, columns=columns, categories=categories, boto3_session=boto3_session, s3_additional_kwargs=s3_additional_kwargs, use_threads=use_threads, version_id=version_id, pyarrow_additional_kwargs=pyarrow_args, ), categories=categories, safe=safe, map_types=map_types, use_threads=use_threads, dataset=dataset, path=path, path_root=path_root, timestamp_as_object=pyarrow_args["timestamp_as_object"], )
def read_parquet( path: Union[str, List[str]], path_suffix: Union[str, List[str], None] = None, path_ignore_suffix: Union[str, List[str], None] = None, partition_filter: Optional[Callable[[Dict[str, str]], bool]] = None, columns: Optional[List[str]] = None, validate_schema: bool = False, chunked: Union[bool, int] = False, dataset: bool = False, categories: Optional[List[str]] = None, safe: bool = True, use_threads: bool = True, last_modified_begin: Optional[datetime.datetime] = None, last_modified_end: Optional[datetime.datetime] = None, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, Any]] = None, ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: """Read Apache Parquet file(s) from from a received S3 prefix or list of S3 objects paths. The concept of Dataset goes beyond the simple idea of files and enable more complex features like partitioning and catalog integration (AWS Glue Catalog). This function accepts Unix shell-style wildcards in the path argument. * (matches everything), ? (matches any single character), [seq] (matches any character in seq), [!seq] (matches any character not in seq). Note ---- ``Batching`` (`chunked` argument) (Memory Friendly): Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. There are two batching strategies on Wrangler: - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset. - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows igual the received INTEGER. `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise in number of rows for each Dataframe. Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Note ---- The filter by last_modified begin last_modified end is applied after list all S3 files Parameters ---------- path : Union[str, List[str]] S3 prefix (accepts Unix shell-style wildcards) (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). path_suffix: Union[str, List[str], None] Suffix or List of suffixes for filtering S3 keys. path_ignore_suffix: Union[str, List[str], None] Suffix or List of suffixes for S3 keys to be ignored. partition_filter: Optional[Callable[[Dict[str, str]], bool]] Callback Function filters to apply on PARTITION columns (PUSH-DOWN filter). This function MUST receive a single argument (Dict[str, str]) where keys are partitions names and values are partitions values. Partitions values will be always strings extracted from S3. This function MUST return a bool, True to read the partition or False to ignore it. Ignored if `dataset=False`. E.g ``lambda x: True if x["year"] == "2020" and x["month"] == "1" else False`` columns : List[str], optional Names of columns to read from the file(s). validate_schema: Check that individual file schemas are all the same / compatible. Schemas within a folder prefix should all be the same. Disable if you have schemas that are different and want to disable this check. chunked : Union[int, bool] If passed will split the data in a Iterable of DataFrames (Memory friendly). If `True` wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize. If an `INTEGER` is passed Wrangler will iterate on the data by number of rows igual the received INTEGER. dataset: bool If `True` read a parquet dataset instead of simple file(s) loading all the related partitions as columns. categories: Optional[List[str]], optional List of columns names that should be returned as pandas.Categorical. Recommended for memory restricted environments. safe : bool, default True For certain data types, a cast is needed in order to store the data in a pandas DataFrame or Series (e.g. timestamps are always stored as nanoseconds in pandas). This option controls whether it is a safe cast or not. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. last_modified_begin Filter the s3 files by the Last modified date of the object. The filter is applied only after list all s3 files. last_modified_end: datetime, optional Filter the s3 files by the Last modified date of the object. The filter is applied only after list all s3 files. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs : Optional[Dict[str, Any]] Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered. Returns ------- Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]] Pandas DataFrame or a Generator in case of `chunked=True`. Examples -------- Reading all Parquet files under a prefix >>> import awswrangler as wr >>> df = wr.s3.read_parquet(path='s3://bucket/prefix/') Reading all Parquet files from a list >>> import awswrangler as wr >>> df = wr.s3.read_parquet(path=['s3://bucket/filename0.parquet', 's3://bucket/filename1.parquet']) Reading in chunks (Chunk by file) >>> import awswrangler as wr >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=True) >>> for df in dfs: >>> print(df) # Smaller Pandas DataFrame Reading in chunks (Chunk by 1MM rows) >>> import awswrangler as wr >>> dfs = wr.s3.read_parquet(path=['s3://bucket/filename0.csv', 's3://bucket/filename1.csv'], chunked=1_000_000) >>> for df in dfs: >>> print(df) # 1MM Pandas DataFrame Reading Parquet Dataset with PUSH-DOWN filter over partitions >>> import awswrangler as wr >>> my_filter = lambda x: True if x["city"].startswith("new") else False >>> df = wr.s3.read_parquet(path, dataset=True, partition_filter=my_filter) """ session: boto3.Session = _utils.ensure_session(session=boto3_session) paths: List[str] = _path2list( path=path, boto3_session=session, suffix=path_suffix, ignore_suffix=_get_path_ignore_suffix( path_ignore_suffix=path_ignore_suffix), last_modified_begin=last_modified_begin, last_modified_end=last_modified_end, ) path_root: Optional[str] = _get_path_root(path=path, dataset=dataset) if path_root is not None: paths = _apply_partition_filter(path_root=path_root, paths=paths, filter_func=partition_filter) if len(paths) < 1: raise exceptions.NoFilesFound(f"No files Found on: {path}.") _logger.debug("paths:\n%s", paths) args: Dict[str, Any] = { "columns": columns, "categories": categories, "safe": safe, "boto3_session": session, "dataset": dataset, "path_root": path_root, "s3_additional_kwargs": s3_additional_kwargs, "use_threads": use_threads, } _logger.debug("args:\n%s", pprint.pformat(args)) if chunked is not False: return _read_parquet_chunked(paths=paths, chunked=chunked, validate_schema=validate_schema, **args) if len(paths) == 1: return _read_parquet(path=paths[0], **args) if validate_schema is True: _validate_schemas_from_files( paths=paths, sampling=1.0, use_threads=True, boto3_session=boto3_session, s3_additional_kwargs=s3_additional_kwargs, ) return _union(dfs=[_read_parquet(path=p, **args) for p in paths], ignore_index=None)
def create_athena_data_source( name: str, workgroup: str = "primary", allowed_to_use: Optional[List[str]] = None, allowed_to_manage: Optional[List[str]] = None, tags: Optional[Dict[str, str]] = None, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ) -> None: """Create a QuickSight data source pointing to an Athena/Workgroup. Note ---- You will not be able to see the the data source in the console if you not pass your user to one of the ``allowed_*`` arguments. Parameters ---------- name : str Data source name. workgroup : str Athena workgroup. tags : Dict[str, str], optional Key/Value collection to put on the Cluster. e.g. {"foo": "boo", "bar": "xoo"}) allowed_to_use : optional List of principals that will be allowed to see and use the data source. e.g. ["John"] allowed_to_manage : optional List of principals that will be allowed to see, use, update and delete the data source. e.g. ["Mary"] account_id : str, optional If None, the account ID will be inferred from your boto3 session. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. Returns ------- None None. Examples -------- >>> import awswrangler as wr >>> wr.quicksight.create_athena_data_source( ... name="...", ... allowed_to_manage=["john"] ... ) """ session: boto3.Session = _utils.ensure_session(session=boto3_session) client: boto3.client = _utils.client(service_name="quicksight", session=session) if account_id is None: account_id = sts.get_account_id(boto3_session=session) args: Dict[str, Any] = { "AwsAccountId": account_id, "DataSourceId": name, "Name": name, "Type": "ATHENA", "DataSourceParameters": {"AthenaParameters": {"WorkGroup": workgroup}}, "SslProperties": {"DisableSsl": True}, } permissions: List[Dict[str, Union[str, List[str]]]] = _generate_permissions( resource="data_source", account_id=account_id, boto3_session=session, allowed_to_use=allowed_to_use, allowed_to_manage=allowed_to_manage, ) if permissions: args["Permissions"] = permissions if tags is not None: _tags: List[Dict[str, str]] = [{"Key": k, "Value": v} for k, v in tags.items()] args["Tags"] = _tags client.create_data_source(**args)
def unload_redshift_to_files( sql: str, path: str, con: sqlalchemy.engine.Engine, iam_role: str, region: Optional[str] = None, max_file_size: Optional[float] = None, kms_key_id: Optional[str] = None, use_threads: bool = True, manifest: bool = False, partition_cols: Optional[List[str]] = None, boto3_session: Optional[boto3.Session] = None, ) -> List[str]: """Unload Parquet files from a Amazon Redshift query result to parquet files on s3 (Through UNLOAD command). https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Parameters ---------- sql: str SQL query. path : Union[str, List[str]] S3 path to write stage files (e.g. s3://bucket_name/any_name/) con : sqlalchemy.engine.Engine SQLAlchemy Engine. Please use, wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine() iam_role : str AWS IAM role with the related permissions. region : str, optional Specifies the AWS Region where the target Amazon S3 bucket is located. REGION is required for UNLOAD to an Amazon S3 bucket that isn't in the same AWS Region as the Amazon Redshift cluster. By default, UNLOAD assumes that the target Amazon S3 bucket is located in the same AWS Region as the Amazon Redshift cluster. max_file_size : float, optional Specifies the maximum size (MB) of files that UNLOAD creates in Amazon S3. Specify a decimal value between 5.0 MB and 6200.0 MB. If None, the default maximum file size is 6200.0 MB. kms_key_id : str, optional Specifies the key ID for an AWS Key Management Service (AWS KMS) key to be used to encrypt data files on Amazon S3. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. manifest : bool Unload a manifest file on S3. partition_cols: List[str], optional Specifies the partition keys for the unload operation. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. Returns ------- List[str] Paths list with all unloaded files. Examples -------- >>> import awswrangler as wr >>> paths = wr.db.unload_redshift_to_files( ... sql="SELECT * FROM public.mytable", ... path="s3://bucket/extracted_parquet_files/", ... con=wr.catalog.get_engine(connection="my_glue_connection"), ... iam_role="arn:aws:iam::XXX:role/XXX" ... ) """ path = path if path.endswith("/") else f"{path}/" session: boto3.Session = _utils.ensure_session(session=boto3_session) s3.delete_objects(path=path, use_threads=use_threads, boto3_session=session) with con.connect() as _con: partition_str: str = f"\nPARTITION BY ({','.join(partition_cols)})" if partition_cols else "" manifest_str: str = "\nmanifest" if manifest is True else "" region_str: str = f"\nREGION AS '{region}'" if region is not None else "" max_file_size_str: str = f"\nMAXFILESIZE AS {max_file_size} MB" if max_file_size is not None else "" kms_key_id_str: str = f"\nKMS_KEY_ID '{kms_key_id}'" if kms_key_id is not None else "" sql = (f"UNLOAD ('{sql}')\n" f"TO '{path}'\n" f"IAM_ROLE '{iam_role}'\n" "ALLOWOVERWRITE\n" "PARALLEL ON\n" "FORMAT PARQUET\n" "ENCRYPTED" f"{kms_key_id_str}" f"{partition_str}" f"{region_str}" f"{max_file_size_str}" f"{manifest_str};") _logger.debug("sql: \n%s", sql) _con.execute(sql) sql = "SELECT pg_last_query_id() AS query_id" query_id: int = _con.execute(sql).fetchall()[0][0] sql = f"SELECT path FROM STL_UNLOAD_LOG WHERE query={query_id};" paths = [x[0].replace(" ", "") for x in _con.execute(sql).fetchall()] _logger.debug("paths: %s", paths) return paths
def write_redshift_copy_manifest( manifest_path: str, paths: List[str], use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, ) -> Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]: """Write Redshift copy manifest and return its structure. Only Parquet files are supported. Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Parameters ---------- manifest_path : str Amazon S3 manifest path (e.g. s3://...) paths: List[str] List of S3 paths (Parquet Files) to be copied. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs: Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass", "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging". e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN'} Returns ------- Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]] Manifest content. Examples -------- Copying two files to Redshift cluster. >>> import awswrangler as wr >>> wr.db.write_redshift_copy_manifest( ... path="s3://bucket/my.manifest", ... paths=["s3://...parquet", "s3://...parquet"] ... ) """ session: boto3.Session = _utils.ensure_session(session=boto3_session) objects_sizes: Dict[str, Optional[int]] = s3.size_objects( path=paths, use_threads=use_threads, boto3_session=session) manifest: Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]] = { "entries": [] } path: str size: Optional[int] for path, size in objects_sizes.items(): if size is not None: entry: Dict[str, Union[str, bool, Dict[str, int]]] = { "url": path, "mandatory": True, "meta": { "content_length": size }, } manifest["entries"].append(entry) payload: str = json.dumps(manifest) bucket: str bucket, key = _utils.parse_path(manifest_path) additional_kwargs: Dict[ str, str] = {} if s3_additional_kwargs is None else s3_additional_kwargs _logger.debug("payload: %s", payload) client_s3: boto3.client = _utils.client(service_name="s3", session=session) _logger.debug("bucket: %s", bucket) _logger.debug("key: %s", key) client_s3.put_object(Body=payload, Bucket=bucket, Key=key, **additional_kwargs) return manifest
def to_csv( # pylint: disable=too-many-arguments,too-many-locals df: pd.DataFrame, path: str, sep: str = ",", index: bool = True, columns: Optional[List[str]] = None, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, Any]] = None, sanitize_columns: bool = False, dataset: bool = False, partition_cols: Optional[List[str]] = None, concurrent_partitioning: bool = False, mode: Optional[str] = None, catalog_versioning: bool = False, database: Optional[str] = None, table: Optional[str] = None, dtype: Optional[Dict[str, str]] = None, description: Optional[str] = None, parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, regular_partitions: bool = True, projection_enabled: bool = False, projection_types: Optional[Dict[str, str]] = None, projection_ranges: Optional[Dict[str, str]] = None, projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, catalog_id: Optional[str] = None, **pandas_kwargs: Any, ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: """Write CSV file or dataset on Amazon S3. The concept of Dataset goes beyond the simple idea of ordinary files and enable more complex features like partitioning and catalog integration (Amazon Athena/AWS Glue Catalog). Note ---- If database` and `table` arguments are passed, the table name and all column names will be automatically sanitized using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. Please, pass `sanitize_columns=True` to enforce this behaviour always. Note ---- If `dataset=True`, `pandas_kwargs` will be ignored due restrictive quoting, date_format, escapechar, encoding, etc required by Athena/Glue Catalog. Note ---- By now Pandas does not support in-memory CSV compression. https://github.com/pandas-dev/pandas/issues/22555 So the `compression` will not be supported on Wrangler too. Note ---- On `append` mode, the `parameters` will be upsert on an existing table. Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Parameters ---------- df: pandas.DataFrame Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html path : str Amazon S3 path (e.g. s3://bucket/filename.csv). sep : str String of length 1. Field delimiter for the output file. index : bool Write row names (index). columns : List[str], optional Columns to write. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 Session will be used if boto3_session receive None. s3_additional_kwargs : Optional[Dict[str, Any]] Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass", "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging". e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN'} sanitize_columns : bool True to sanitize columns names or False to keep it as is. True value is forced if `dataset=True`. dataset : bool If True store a parquet dataset instead of a ordinary file(s) If True, enable all follow arguments: partition_cols, mode, database, table, description, parameters, columns_comments, concurrent_partitioning, catalog_versioning, projection_enabled, projection_types, projection_ranges, projection_values, projection_intervals, projection_digits, catalog_id, schema_evolution. partition_cols: List[str], optional List of column names that will be used to create partitions. Only takes effect if dataset=True. concurrent_partitioning: bool If True will increase the parallelism level during the partitions writing. It will decrease the writing time and increase the memory usage. https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/022%20-%20Writing%20Partitions%20Concurrently.ipynb mode : str, optional ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. For details check the related tutorial: https://aws-data-wrangler.readthedocs.io/en/stable/stubs/awswrangler.s3.to_parquet.html#awswrangler.s3.to_parquet catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. database : str, optional Glue/Athena catalog: Database name. table : str, optional Glue/Athena catalog: Table name. dtype : Dict[str, str], optional Dictionary of columns names and Athena/Glue types to be casted. Useful when you have columns with undetermined or mixed data types. (e.g. {'col name': 'bigint', 'col2 name': 'int'}) description : str, optional Glue/Athena catalog: Table description parameters : Dict[str, str], optional Glue/Athena catalog: Key/value pairs to tag the table. columns_comments : Dict[str, str], optional Glue/Athena catalog: Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). regular_partitions : bool Create regular partitions (Non projected partitions) on Glue Catalog. Disable when you will work only with Partition Projection. Keep enabled even when working with projections is useful to keep Redshift Spectrum working with the regular partitions. projection_enabled : bool Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) projection_types : Optional[Dict[str, str]] Dictionary of partitions names and Athena projections types. Valid types: "enum", "integer", "date", "injected" https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) projection_ranges: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections ranges. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) projection_values: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections values. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) projection_intervals: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections intervals. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '5'}) projection_digits: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. pandas_kwargs : KEYWORD arguments forwarded to pandas.DataFrame.to_csv(). You can NOT pass `pandas_kwargs` explicit, just add valid Pandas arguments in the function call and Wrangler will accept it. e.g. wr.s3.to_csv(df, path, sep='|', na_rep='NULL', decimal=',') https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html Returns ------- Dict[str, Union[List[str], Dict[str, List[str]]]] Dictionary with: 'paths': List of all stored files paths on S3. 'partitions_values': Dictionary of partitions added with keys as S3 path locations and values as a list of partitions values as str. Examples -------- Writing single file >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.csv', ... ) { 'paths': ['s3://bucket/prefix/my_file.csv'], 'partitions_values': {} } Writing single file with pandas_kwargs >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.csv', ... sep='|', ... na_rep='NULL', ... decimal=',' ... ) { 'paths': ['s3://bucket/prefix/my_file.csv'], 'partitions_values': {} } Writing single file encrypted with a KMS key >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.csv', ... s3_additional_kwargs={ ... 'ServerSideEncryption': 'aws:kms', ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' ... } ... ) { 'paths': ['s3://bucket/prefix/my_file.csv'], 'partitions_values': {} } Writing partitioned dataset >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... partition_cols=['col2'] ... ) { 'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'], 'partitions_values: { 's3://.../col2=A/': ['A'], 's3://.../col2=B/': ['B'] } } Writing dataset to S3 with metadata on Athena/Glue Catalog. >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... partition_cols=['col2'], ... database='default', # Athena/Glue database ... table='my_table' # Athena/Glue table ... ) { 'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'], 'partitions_values: { 's3://.../col2=A/': ['A'], 's3://.../col2=B/': ['B'] } } Writing dataset casting empty column data type >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'], ... 'col3': [None, None, None] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... database='default', # Athena/Glue database ... table='my_table' # Athena/Glue table ... dtype={'col3': 'date'} ... ) { 'paths': ['s3://.../x.csv'], 'partitions_values: {} } """ if "pandas_kwargs" in pandas_kwargs: raise exceptions.InvalidArgument( "You can NOT pass `pandas_kwargs` explicit, just add valid " "Pandas arguments in the function call and Wrangler will accept it." "e.g. wr.s3.to_csv(df, path, sep='|', na_rep='NULL', decimal=',')") _validate_args( df=df, table=table, database=database, dataset=dataset, path=path, partition_cols=partition_cols, mode=mode, description=description, parameters=parameters, columns_comments=columns_comments, ) # Initializing defaults partition_cols = partition_cols if partition_cols else [] dtype = dtype if dtype else {} partitions_values: Dict[str, List[str]] = {} mode = "append" if mode is None else mode session: boto3.Session = _utils.ensure_session(session=boto3_session) # Sanitize table to respect Athena's standards if (sanitize_columns is True) or (database is not None and table is not None): df, dtype, partition_cols = _sanitize(df=df, dtype=dtype, partition_cols=partition_cols) # Evaluating dtype catalog_table_input: Optional[Dict[str, Any]] = None if database is not None and table is not None: catalog_table_input = catalog._get_table_input( # pylint: disable=protected-access database=database, table=table, boto3_session=session, catalog_id=catalog_id) df = _apply_dtype(df=df, dtype=dtype, catalog_table_input=catalog_table_input, mode=mode) if dataset is False: pandas_kwargs["sep"] = sep pandas_kwargs["index"] = index pandas_kwargs["columns"] = columns _to_text( file_format="csv", df=df, use_threads=use_threads, path=path, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, **pandas_kwargs, ) paths = [path] else: df = df[columns] if columns else df paths, partitions_values = _to_dataset( func=_to_text, concurrent_partitioning=concurrent_partitioning, df=df, path_root=path, index=index, sep=sep, use_threads=use_threads, partition_cols=partition_cols, mode=mode, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, file_format="csv", quoting=csv.QUOTE_NONE, escapechar="\\", header=False, date_format="%Y-%m-%d %H:%M:%S.%f", ) if (database is not None) and (table is not None): try: columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( df=df, index=index, partition_cols=partition_cols, dtype=dtype, index_left=True) catalog._create_csv_table( # pylint: disable=protected-access database=database, table=table, path=path, columns_types=columns_types, partitions_types=partitions_types, description=description, parameters=parameters, columns_comments=columns_comments, boto3_session=session, mode=mode, catalog_versioning=catalog_versioning, sep=sep, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, catalog_table_input=catalog_table_input, catalog_id=catalog_id, compression=None, skip_header_line_count=None, ) if partitions_values and (regular_partitions is True): _logger.debug("partitions_values:\n%s", partitions_values) catalog.add_csv_partitions( database=database, table=table, partitions_values=partitions_values, boto3_session=session, sep=sep, catalog_id=catalog_id, columns_types=columns_types, ) except Exception: _logger.debug( "Catalog write failed, cleaning up S3 (paths: %s).", paths) delete_objects(path=paths, use_threads=use_threads, boto3_session=session) raise return {"paths": paths, "partitions_values": partitions_values}
def to_parquet( # pylint: disable=too-many-arguments,too-many-locals,too-many-branches,too-many-statements df: pd.DataFrame, path: Optional[str] = None, index: bool = False, compression: Optional[str] = "snappy", pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None, max_rows_by_file: Optional[int] = None, use_threads: Union[bool, int] = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, Any]] = None, sanitize_columns: bool = False, dataset: bool = False, filename_prefix: Optional[str] = None, partition_cols: Optional[List[str]] = None, bucketing_info: Optional[Tuple[List[str], int]] = None, concurrent_partitioning: bool = False, mode: Optional[str] = None, catalog_versioning: bool = False, schema_evolution: bool = True, database: Optional[str] = None, table: Optional[str] = None, table_type: Optional[str] = None, transaction_id: Optional[str] = None, dtype: Optional[Dict[str, str]] = None, description: Optional[str] = None, parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, regular_partitions: bool = True, projection_enabled: bool = False, projection_types: Optional[Dict[str, str]] = None, projection_ranges: Optional[Dict[str, str]] = None, projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, catalog_id: Optional[str] = None, ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: """Write Parquet file or dataset on Amazon S3. The concept of Dataset goes beyond the simple idea of ordinary files and enable more complex features like partitioning and catalog integration (Amazon Athena/AWS Glue Catalog). Note ---- This operation may mutate the original pandas dataframe in-place. To avoid this behaviour please pass in a deep copy instead (i.e. `df.copy()`) Note ---- If `database` and `table` arguments are passed, the table name and all column names will be automatically sanitized using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. Please, pass `sanitize_columns=True` to enforce this behaviour always. Note ---- On `append` mode, the `parameters` will be upsert on an existing table. Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Parameters ---------- df: pandas.DataFrame Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html path : str, optional S3 path (for file e.g. ``s3://bucket/prefix/filename.parquet``) (for dataset e.g. ``s3://bucket/prefix``). Required if dataset=False or when dataset=True and creating a new dataset index : bool True to store the DataFrame index in file, otherwise False to ignore it. compression: str, optional Compression style (``None``, ``snappy``, ``gzip``). pyarrow_additional_kwargs : Optional[Dict[str, Any]] Additional parameters forwarded to pyarrow. e.g. pyarrow_additional_kwargs={'coerce_timestamps': 'ns', 'use_deprecated_int96_timestamps': False, 'allow_truncated_timestamps'=False} max_rows_by_file : int Max number of rows in each file. Default is None i.e. dont split the files. (e.g. 33554432, 268435456) use_threads : bool, int True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. If integer is provided, specified number is used. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs : Optional[Dict[str, Any]] Forwarded to botocore requests. e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'} sanitize_columns : bool True to sanitize columns names (using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`) or False to keep it as is. True value behaviour is enforced if `database` and `table` arguments are passed. dataset : bool If True store a parquet dataset instead of a ordinary file(s) If True, enable all follow arguments: partition_cols, mode, database, table, description, parameters, columns_comments, concurrent_partitioning, catalog_versioning, projection_enabled, projection_types, projection_ranges, projection_values, projection_intervals, projection_digits, catalog_id, schema_evolution. filename_prefix: str, optional If dataset=True, add a filename prefix to the output files. partition_cols: List[str], optional List of column names that will be used to create partitions. Only takes effect if dataset=True. bucketing_info: Tuple[List[str], int], optional Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the second element. Only `str`, `int` and `bool` are supported as column data types for bucketing. concurrent_partitioning: bool If True will increase the parallelism level during the partitions writing. It will decrease the writing time and increase the memory usage. https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/022%20-%20Writing%20Partitions%20Concurrently.html mode: str, optional ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. For details check the related tutorial: https://aws-data-wrangler.readthedocs.io/en/2.13.0/stubs/awswrangler.s3.to_parquet.html#awswrangler.s3.to_parquet catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. schema_evolution : bool If True allows schema evolution (new or missing columns), otherwise a exception will be raised. True by default. (Only considered if dataset=True and mode in ("append", "overwrite_partitions")) Related tutorial: https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/014%20-%20Schema%20Evolution.html database : str, optional Glue/Athena catalog: Database name. table : str, optional Glue/Athena catalog: Table name. table_type: str, optional The type of the Glue Table. Set to EXTERNAL_TABLE if None. transaction_id: str, optional The ID of the transaction when writing to a Governed Table. dtype : Dict[str, str], optional Dictionary of columns names and Athena/Glue types to be casted. Useful when you have columns with undetermined or mixed data types. (e.g. {'col name': 'bigint', 'col2 name': 'int'}) description : str, optional Glue/Athena catalog: Table description parameters : Dict[str, str], optional Glue/Athena catalog: Key/value pairs to tag the table. columns_comments : Dict[str, str], optional Glue/Athena catalog: Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). regular_partitions : bool Create regular partitions (Non projected partitions) on Glue Catalog. Disable when you will work only with Partition Projection. Keep enabled even when working with projections is useful to keep Redshift Spectrum working with the regular partitions. projection_enabled : bool Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) projection_types : Optional[Dict[str, str]] Dictionary of partitions names and Athena projections types. Valid types: "enum", "integer", "date", "injected" https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) projection_ranges: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections ranges. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) projection_values: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections values. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) projection_intervals: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections intervals. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '5'}) projection_digits: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. Returns ------- Dict[str, Union[List[str], Dict[str, List[str]]]] Dictionary with: 'paths': List of all stored files paths on S3. 'partitions_values': Dictionary of partitions added with keys as S3 path locations and values as a list of partitions values as str. Examples -------- Writing single file >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.parquet', ... ) { 'paths': ['s3://bucket/prefix/my_file.parquet'], 'partitions_values': {} } Writing single file encrypted with a KMS key >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.parquet', ... s3_additional_kwargs={ ... 'ServerSideEncryption': 'aws:kms', ... 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN' ... } ... ) { 'paths': ['s3://bucket/prefix/my_file.parquet'], 'partitions_values': {} } Writing partitioned dataset >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... partition_cols=['col2'] ... ) { 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], 'partitions_values: { 's3://.../col2=A/': ['A'], 's3://.../col2=B/': ['B'] } } Writing bucketed dataset >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... bucketing_info=(["col2"], 2) ... ) { 'paths': ['s3://.../x_bucket-00000.csv', 's3://.../col2=B/x_bucket-00001.csv'], 'partitions_values: {} } Writing dataset to S3 with metadata on Athena/Glue Catalog. >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... partition_cols=['col2'], ... database='default', # Athena/Glue database ... table='my_table' # Athena/Glue table ... ) { 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], 'partitions_values: { 's3://.../col2=A/': ['A'], 's3://.../col2=B/': ['B'] } } Writing dataset to Glue governed table >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'], ... 'col3': [None, None, None] ... }), ... dataset=True, ... mode='append', ... database='default', # Athena/Glue database ... table='my_table', # Athena/Glue table ... table_type='GOVERNED', ... transaction_id="xxx", ... ) { 'paths': ['s3://.../x.parquet'], 'partitions_values: {} } Writing dataset casting empty column data type >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'], ... 'col3': [None, None, None] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... database='default', # Athena/Glue database ... table='my_table' # Athena/Glue table ... dtype={'col3': 'date'} ... ) { 'paths': ['s3://.../x.parquet'], 'partitions_values: {} } """ _validate_args( df=df, table=table, database=database, dataset=dataset, path=path, partition_cols=partition_cols, bucketing_info=bucketing_info, mode=mode, description=description, parameters=parameters, columns_comments=columns_comments, ) # Evaluating compression if _COMPRESSION_2_EXT.get(compression, None) is None: raise exceptions.InvalidCompression(f"{compression} is invalid, please use None, 'snappy' or 'gzip'.") compression_ext: str = _COMPRESSION_2_EXT[compression] # Initializing defaults partition_cols = partition_cols if partition_cols else [] dtype = dtype if dtype else {} partitions_values: Dict[str, List[str]] = {} mode = "append" if mode is None else mode commit_trans: bool = False if transaction_id: table_type = "GOVERNED" filename_prefix = filename_prefix + uuid.uuid4().hex if filename_prefix else uuid.uuid4().hex cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) session: boto3.Session = _utils.ensure_session(session=boto3_session) # Sanitize table to respect Athena's standards if (sanitize_columns is True) or (database is not None and table is not None): df, dtype, partition_cols = _sanitize(df=df, dtype=dtype, partition_cols=partition_cols) # Evaluating dtype catalog_table_input: Optional[Dict[str, Any]] = None if database is not None and table is not None: catalog_table_input = catalog._get_table_input( # pylint: disable=protected-access database=database, table=table, boto3_session=session, transaction_id=transaction_id, catalog_id=catalog_id ) catalog_path: Optional[str] = None if catalog_table_input: table_type = catalog_table_input["TableType"] catalog_path = catalog_table_input["StorageDescriptor"]["Location"] if path is None: if catalog_path: path = catalog_path else: raise exceptions.InvalidArgumentValue( "Glue table does not exist in the catalog. Please pass the `path` argument to create it." ) elif path and catalog_path: if path.rstrip("/") != catalog_path.rstrip("/"): raise exceptions.InvalidArgumentValue( f"The specified path: {path}, does not match the existing Glue catalog table path: {catalog_path}" ) if (table_type == "GOVERNED") and (not transaction_id): _logger.debug("`transaction_id` not specified for GOVERNED table, starting transaction") transaction_id = lakeformation.start_transaction(read_only=False, boto3_session=boto3_session) commit_trans = True df = _apply_dtype(df=df, dtype=dtype, catalog_table_input=catalog_table_input, mode=mode) schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( df=df, index=index, ignore_cols=partition_cols, dtype=dtype ) _logger.debug("schema: \n%s", schema) if dataset is False: paths = _to_parquet( df=df, path=path, schema=schema, index=index, cpus=cpus, compression=compression, compression_ext=compression_ext, pyarrow_additional_kwargs=pyarrow_additional_kwargs, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, dtype=dtype, max_rows_by_file=max_rows_by_file, use_threads=use_threads, ) else: columns_types: Dict[str, str] = {} partitions_types: Dict[str, str] = {} if (database is not None) and (table is not None): columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( df=df, index=index, partition_cols=partition_cols, dtype=dtype ) if schema_evolution is False: _utils.check_schema_changes(columns_types=columns_types, table_input=catalog_table_input, mode=mode) if (catalog_table_input is None) and (table_type == "GOVERNED"): catalog._create_parquet_table( # pylint: disable=protected-access database=database, table=table, path=path, # type: ignore columns_types=columns_types, table_type=table_type, partitions_types=partitions_types, bucketing_info=bucketing_info, compression=compression, description=description, parameters=parameters, columns_comments=columns_comments, boto3_session=session, mode=mode, transaction_id=transaction_id, catalog_versioning=catalog_versioning, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, projection_storage_location_template=None, catalog_id=catalog_id, catalog_table_input=catalog_table_input, ) catalog_table_input = catalog._get_table_input( # pylint: disable=protected-access database=database, table=table, boto3_session=session, transaction_id=transaction_id, catalog_id=catalog_id, ) paths, partitions_values = _to_dataset( func=_to_parquet, concurrent_partitioning=concurrent_partitioning, df=df, path_root=path, # type: ignore filename_prefix=filename_prefix, index=index, compression=compression, compression_ext=compression_ext, catalog_id=catalog_id, database=database, table=table, table_type=table_type, transaction_id=transaction_id, pyarrow_additional_kwargs=pyarrow_additional_kwargs, cpus=cpus, use_threads=use_threads, partition_cols=partition_cols, partitions_types=partitions_types, bucketing_info=bucketing_info, dtype=dtype, mode=mode, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, schema=schema, max_rows_by_file=max_rows_by_file, ) if (database is not None) and (table is not None): try: catalog._create_parquet_table( # pylint: disable=protected-access database=database, table=table, path=path, # type: ignore columns_types=columns_types, table_type=table_type, partitions_types=partitions_types, bucketing_info=bucketing_info, compression=compression, description=description, parameters=parameters, columns_comments=columns_comments, boto3_session=session, mode=mode, transaction_id=transaction_id, catalog_versioning=catalog_versioning, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, projection_storage_location_template=None, catalog_id=catalog_id, catalog_table_input=catalog_table_input, ) if partitions_values and (regular_partitions is True) and (table_type != "GOVERNED"): _logger.debug("partitions_values:\n%s", partitions_values) catalog.add_parquet_partitions( database=database, table=table, partitions_values=partitions_values, bucketing_info=bucketing_info, compression=compression, boto3_session=session, catalog_id=catalog_id, columns_types=columns_types, ) if commit_trans: lakeformation.commit_transaction( transaction_id=transaction_id, boto3_session=boto3_session # type: ignore ) except Exception: _logger.debug("Catalog write failed, cleaning up S3 (paths: %s).", paths) delete_objects( path=paths, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, ) raise return {"paths": paths, "partitions_values": partitions_values}
def to_parquet( # pylint: disable=too-many-arguments,too-many-locals df: pd.DataFrame, path: str, index: bool = False, compression: Optional[str] = "snappy", max_rows_by_file: Optional[int] = None, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, sanitize_columns: bool = False, dataset: bool = False, partition_cols: Optional[List[str]] = None, concurrent_partitioning: bool = False, mode: Optional[str] = None, catalog_versioning: bool = False, database: Optional[str] = None, table: Optional[str] = None, dtype: Optional[Dict[str, str]] = None, description: Optional[str] = None, parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, regular_partitions: bool = True, projection_enabled: bool = False, projection_types: Optional[Dict[str, str]] = None, projection_ranges: Optional[Dict[str, str]] = None, projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, catalog_id: Optional[str] = None, ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: """Write Parquet file or dataset on Amazon S3. The concept of Dataset goes beyond the simple idea of files and enable more complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog). Note ---- If `dataset=True` The table name and all column names will be automatically sanitized using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. Please, pass `sanitize_columns=True` to force the same behaviour for `dataset=False`. Note ---- On `append` mode, the `parameters` will be upsert on an existing table. Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Parameters ---------- df: pandas.DataFrame Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html path : str S3 path (for file e.g. ``s3://bucket/prefix/filename.parquet``) (for dataset e.g. ``s3://bucket/prefix``). index : bool True to store the DataFrame index in file, otherwise False to ignore it. compression: str, optional Compression style (``None``, ``snappy``, ``gzip``). max_rows_by_file : int Max number of rows in each file. Default is None i.e. dont split the files. (e.g. 33554432, 268435456) use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs: Forward to s3fs, useful for server side encryption https://s3fs.readthedocs.io/en/latest/#serverside-encryption sanitize_columns : bool True to sanitize columns names or False to keep it as is. True value is forced if `dataset=True`. dataset : bool If True store a parquet dataset instead of a single file. If True, enable all follow arguments: partition_cols, mode, database, table, description, parameters, columns_comments, . partition_cols: List[str], optional List of column names that will be used to create partitions. Only takes effect if dataset=True. concurrent_partitioning: bool If True will increase the parallelism level during the partitions writing. It will decrease the writing time and increase the memory usage. https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/022%20-%20Writing%20Partitions%20Concurrently.ipynb mode: str, optional ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. For details check the related tutorial: https://aws-data-wrangler.readthedocs.io/en/latest/stubs/awswrangler.s3.to_parquet.html#awswrangler.s3.to_parquet catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. database : str, optional Glue/Athena catalog: Database name. table : str, optional Glue/Athena catalog: Table name. dtype : Dict[str, str], optional Dictionary of columns names and Athena/Glue types to be casted. Useful when you have columns with undetermined or mixed data types. (e.g. {'col name': 'bigint', 'col2 name': 'int'}) description : str, optional Glue/Athena catalog: Table description parameters : Dict[str, str], optional Glue/Athena catalog: Key/value pairs to tag the table. columns_comments : Dict[str, str], optional Glue/Athena catalog: Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). regular_partitions : bool Create regular partitions (Non projected partitions) on Glue Catalog. Disable when you will work only with Partition Projection. Keep enabled even when working with projections is useful to keep Redshift Spectrum working with the regular partitions. projection_enabled : bool Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) projection_types : Optional[Dict[str, str]] Dictionary of partitions names and Athena projections types. Valid types: "enum", "integer", "date", "injected" https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) projection_ranges: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections ranges. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) projection_values: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections values. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) projection_intervals: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections intervals. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '5'}) projection_digits: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. Returns ------- Dict[str, Union[List[str], Dict[str, List[str]]]] Dictionary with: 'paths': List of all stored files paths on S3. 'partitions_values': Dictionary of partitions added with keys as S3 path locations and values as a list of partitions values as str. Examples -------- Writing single file >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.parquet', ... ) { 'paths': ['s3://bucket/prefix/my_file.parquet'], 'partitions_values': {} } Writing single file encrypted with a KMS key >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.parquet', ... s3_additional_kwargs={ ... 'ServerSideEncryption': 'aws:kms', ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' ... } ... ) { 'paths': ['s3://bucket/prefix/my_file.parquet'], 'partitions_values': {} } Writing partitioned dataset >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... partition_cols=['col2'] ... ) { 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], 'partitions_values: { 's3://.../col2=A/': ['A'], 's3://.../col2=B/': ['B'] } } Writing dataset to S3 with metadata on Athena/Glue Catalog. >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... partition_cols=['col2'], ... database='default', # Athena/Glue database ... table='my_table' # Athena/Glue table ... ) { 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], 'partitions_values: { 's3://.../col2=A/': ['A'], 's3://.../col2=B/': ['B'] } } Writing dataset casting empty column data type >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'], ... 'col3': [None, None, None] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... database='default', # Athena/Glue database ... table='my_table' # Athena/Glue table ... dtype={'col3': 'date'} ... ) { 'paths': ['s3://.../x.parquet'], 'partitions_values: {} } """ _validate_args( df=df, table=table, dataset=dataset, path=path, partition_cols=partition_cols, mode=mode, description=description, parameters=parameters, columns_comments=columns_comments, ) # Evaluating compression if _COMPRESSION_2_EXT.get(compression, None) is None: raise exceptions.InvalidCompression( f"{compression} is invalid, please use None, 'snappy' or 'gzip'.") compression_ext: str = _COMPRESSION_2_EXT[compression] # Initializing defaults partition_cols = partition_cols if partition_cols else [] dtype = dtype if dtype else {} partitions_values: Dict[str, List[str]] = {} mode = "append" if mode is None else mode cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) session: boto3.Session = _utils.ensure_session(session=boto3_session) # Sanitize table to respect Athena's standards if (sanitize_columns is True) or (dataset is True): df, dtype, partition_cols = _sanitize(df=df, dtype=dtype, partition_cols=partition_cols) # Evaluating dtype catalog_table_input: Optional[Dict[str, Any]] = None if database is not None and table is not None: catalog_table_input = catalog._get_table_input( # pylint: disable=protected-access database=database, table=table, boto3_session=session, catalog_id=catalog_id) df = _apply_dtype(df=df, dtype=dtype, catalog_table_input=catalog_table_input, mode=mode) schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( df=df, index=index, ignore_cols=partition_cols, dtype=dtype) _logger.debug("schema: \n%s", schema) if dataset is False: paths = _to_parquet( df=df, path=path, schema=schema, index=index, cpus=cpus, compression=compression, compression_ext=compression_ext, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, dtype=dtype, max_rows_by_file=max_rows_by_file, ) else: paths, partitions_values = _to_dataset( func=_to_parquet, concurrent_partitioning=concurrent_partitioning, df=df, path_root=path, index=index, compression=compression, compression_ext=compression_ext, cpus=cpus, use_threads=use_threads, partition_cols=partition_cols, dtype=dtype, mode=mode, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, schema=schema, max_rows_by_file=max_rows_by_file, ) if (database is not None) and (table is not None): columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( df=df, index=index, partition_cols=partition_cols, dtype=dtype) catalog._create_parquet_table( # pylint: disable=protected-access database=database, table=table, path=path, columns_types=columns_types, partitions_types=partitions_types, compression=compression, description=description, parameters=parameters, columns_comments=columns_comments, boto3_session=session, mode=mode, catalog_versioning=catalog_versioning, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, catalog_id=catalog_id, catalog_table_input=catalog_table_input, ) if partitions_values and (regular_partitions is True): _logger.debug("partitions_values:\n%s", partitions_values) catalog.add_parquet_partitions( database=database, table=table, partitions_values=partitions_values, compression=compression, boto3_session=session, ) return {"paths": paths, "partitions_values": partitions_values}
def merge_datasets( source_path: str, target_path: str, mode: str = "append", use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, Any]] = None, ) -> List[str]: """Merge a source dataset into a target dataset. This function accepts Unix shell-style wildcards in the source_path argument. * (matches everything), ? (matches any single character), [seq] (matches any character in seq), [!seq] (matches any character not in seq). Note ---- If you are merging tables (S3 datasets + Glue Catalog metadata), remember that you will also need to update your partitions metadata in some cases. (e.g. wr.athena.repair_table(table='...', database='...')) Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Parameters ---------- source_path : str, S3 Path for the source directory. target_path : str, S3 Path for the target directory. mode: str, optional ``append`` (Default), ``overwrite``, ``overwrite_partitions``. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs : Optional[Dict[str, Any]] Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass", "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging". e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'} Returns ------- List[str] List of new objects paths. Examples -------- Merging >>> import awswrangler as wr >>> wr.s3.merge_datasets( ... source_path="s3://bucket0/dir0/", ... target_path="s3://bucket1/dir1/", ... mode="append" ... ) ["s3://bucket1/dir1/key0", "s3://bucket1/dir1/key1"] Merging with a KMS key >>> import awswrangler as wr >>> wr.s3.merge_datasets( ... source_path="s3://bucket0/dir0/", ... target_path="s3://bucket1/dir1/", ... mode="append", ... s3_additional_kwargs={ ... 'ServerSideEncryption': 'aws:kms', ... 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN' ... } ... ) ["s3://bucket1/dir1/key0", "s3://bucket1/dir1/key1"] """ source_path = source_path[:-1] if source_path[-1] == "/" else source_path target_path = target_path[:-1] if target_path[-1] == "/" else target_path session: boto3.Session = _utils.ensure_session(session=boto3_session) paths: List[str] = list_objects(path=f"{source_path}/", boto3_session=session) _logger.debug("len(paths): %s", len(paths)) if len(paths) < 1: return [] if mode == "overwrite": _logger.debug("Deleting to overwrite: %s/", target_path) delete_objects(path=f"{target_path}/", use_threads=use_threads, boto3_session=session) elif mode == "overwrite_partitions": paths_wo_prefix: List[str] = [ x.replace(f"{source_path}/", "") for x in paths ] paths_wo_filename: List[str] = [ f"{x.rpartition('/')[0]}/" for x in paths_wo_prefix ] partitions_paths: List[str] = list(set(paths_wo_filename)) target_partitions_paths = [ f"{target_path}/{x}" for x in partitions_paths ] for path in target_partitions_paths: _logger.debug("Deleting to overwrite_partitions: %s", path) delete_objects(path=path, use_threads=use_threads, boto3_session=session) elif mode != "append": raise exceptions.InvalidArgumentValue( f"{mode} is a invalid mode option.") new_objects: List[str] = copy_objects( paths=paths, source_path=source_path, target_path=target_path, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, ) _logger.debug("len(new_objects): %s", len(new_objects)) return new_objects
def read_sql_query( sql: str, database: str, ctas_approach: bool = True, categories: Optional[List[str]] = None, chunksize: Optional[Union[int, bool]] = None, s3_output: Optional[str] = None, workgroup: Optional[str] = None, encryption: Optional[str] = None, kms_key: Optional[str] = None, keep_files: bool = True, ctas_temp_table_name: Optional[str] = None, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, max_cache_seconds: int = 0, max_cache_query_inspections: int = 50, max_remote_cache_entries: int = 50, max_local_cache_entries: int = 100, data_source: Optional[str] = None, params: Optional[Dict[str, Any]] = None, s3_additional_kwargs: Optional[Dict[str, Any]] = None, ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: """Execute any SQL query on AWS Athena and return the results as a Pandas DataFrame. **Related tutorial:** - `Amazon Athena <https://aws-data-wrangler.readthedocs.io/en/2.5.0/ tutorials/006%20-%20Amazon%20Athena.html>`_ - `Athena Cache <https://aws-data-wrangler.readthedocs.io/en/2.5.0/ tutorials/019%20-%20Athena%20Cache.html>`_ - `Global Configurations <https://aws-data-wrangler.readthedocs.io/en/2.5.0/ tutorials/021%20-%20Global%20Configurations.html>`_ **There are two approaches to be defined through ctas_approach parameter:** **1** - ctas_approach=True (Default): Wrap the query with a CTAS and then reads the table data as parquet directly from s3. PROS: - Faster for mid and big result sizes. - Can handle some level of nested types. CONS: - Requires create/delete table permissions on Glue. - Does not support timestamp with time zone - Does not support columns with repeated names. - Does not support columns with undefined data types. - A temporary table will be created and then deleted immediately. - Does not support custom data_source/catalog_id. **2** - ctas_approach=False: Does a regular query on Athena and parse the regular CSV result on s3. PROS: - Faster for small result sizes (less latency). - Does not require create/delete table permissions on Glue - Supports timestamp with time zone. - Support custom data_source/catalog_id. CONS: - Slower for big results (But stills faster than other libraries that uses the regular Athena's API) - Does not handle nested types at all. Note ---- The resulting DataFrame (or every DataFrame in the returned Iterator for chunked queries) have a `query_metadata` attribute, which brings the query result metadata returned by `Boto3/Athena <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services /athena.html#Athena.Client.get_query_execution>`_ . For a practical example check out the `related tutorial <https://aws-data-wrangler.readthedocs.io/en/2.5.0/ tutorials/024%20-%20Athena%20Query%20Metadata.html>`_! Note ---- Valid encryption modes: [None, 'SSE_S3', 'SSE_KMS']. `P.S. 'CSE_KMS' is not supported.` Note ---- Create the default Athena bucket if it doesn't exist and s3_output is None. (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/) Note ---- `chunksize` argument (Memory Friendly) (i.e batching): Return an Iterable of DataFrames instead of a regular DataFrame. There are two batching strategies: - If **chunksize=True**, a new DataFrame will be returned for each file in the query result. - If **chunksize=INTEGER**, Wrangler will iterate on the data by number of rows igual the received INTEGER. `P.S.` `chunksize=True` is faster and uses less memory while `chunksize=INTEGER` is more precise in number of rows for each Dataframe. `P.P.S.` If `ctas_approach=False` and `chunksize=True`, you will always receive an interador with a single DataFrame because regular Athena queries only produces a single output file. Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Parameters ---------- sql : str SQL query. database : str AWS Glue/Athena database name - It is only the origin database from where the query will be launched. You can still using and mixing several databases writing the full table name within the sql (e.g. `database.table`). ctas_approach: bool Wraps the query using a CTAS, and read the resulted parquet data on S3. If false, read the regular CSV on S3. categories: List[str], optional List of columns names that should be returned as pandas.Categorical. Recommended for memory restricted environments. chunksize : Union[int, bool], optional If passed will split the data in a Iterable of DataFrames (Memory friendly). If `True` wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize. If an `INTEGER` is passed Wrangler will iterate on the data by number of rows igual the received INTEGER. s3_output : str, optional Amazon S3 path. workgroup : str, optional Athena workgroup. encryption : str, optional Valid values: [None, 'SSE_S3', 'SSE_KMS']. Notice: 'CSE_KMS' is not supported. kms_key : str, optional For SSE-KMS, this is the KMS key ARN or ID. keep_files : bool Should Wrangler delete or keep the staging files produced by Athena? ctas_temp_table_name : str, optional The name of the temporary table and also the directory name on S3 where the CTAS result is stored. If None, it will use the follow random pattern: `f"temp_table_{uuid.uuid4().hex()}"`. On S3 this directory will be under under the pattern: `f"{s3_output}/{ctas_temp_table_name}/"`. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. max_cache_seconds : int Wrangler can look up in Athena's history if this query has been run before. If so, and its completion time is less than `max_cache_seconds` before now, wrangler skips query execution and just returns the same results as last time. If cached results are valid, wrangler ignores the `ctas_approach`, `s3_output`, `encryption`, `kms_key`, `keep_files` and `ctas_temp_table_name` params. If reading cached data fails for any reason, execution falls back to the usual query run path. max_cache_query_inspections : int Max number of queries that will be inspected from the history to try to find some result to reuse. The bigger the number of inspection, the bigger will be the latency for not cached queries. Only takes effect if max_cache_seconds > 0. max_remote_cache_entries : int Max number of queries that will be retrieved from AWS for cache inspection. The bigger the number of inspection, the bigger will be the latency for not cached queries. Only takes effect if max_cache_seconds > 0 and default value is 50. max_local_cache_entries : int Max number of queries for which metadata will be cached locally. This will reduce the latency and also enables keeping more than `max_remote_cache_entries` available for the cache. This value should not be smaller than max_remote_cache_entries. Only takes effect if max_cache_seconds > 0 and default value is 100. data_source : str, optional Data Source / Catalog name. If None, 'AwsDataCatalog' will be used by default. params: Dict[str, any], optional Dict of parameters that will be used for constructing the SQL query. Only named parameters are supported. The dict needs to contain the information in the form {'name': 'value'} and the SQL query needs to contain `:name;`. s3_additional_kwargs : Optional[Dict[str, Any]] Forward to botocore requests. Valid parameters: "RequestPayer", "ExpectedBucketOwner". e.g. s3_additional_kwargs={'RequestPayer': 'requester'} Returns ------- Union[pd.DataFrame, Iterator[pd.DataFrame]] Pandas DataFrame or Generator of Pandas DataFrames if chunksize is passed. Examples -------- >>> import awswrangler as wr >>> df = wr.athena.read_sql_query(sql="...", database="...") >>> scanned_bytes = df.query_metadata["Statistics"]["DataScannedInBytes"] >>> import awswrangler as wr >>> df = wr.athena.read_sql_query( ... sql="SELECT * FROM my_table WHERE name=:name;", ... params={"name": "filtered_name"} ... ) """ if ctas_approach and data_source not in (None, "AwsDataCatalog"): raise exceptions.InvalidArgumentCombination( "Queries with ctas_approach=True (default) does not support " "data_source values different than None and 'AwsDataCatalog'. " "Please check the related tutorial for more details " "(https://github.com/awslabs/aws-data-wrangler/blob/main/" "tutorials/006%20-%20Amazon%20Athena.ipynb)") chunksize = sys.maxsize if ctas_approach is False and chunksize is True else chunksize session: boto3.Session = _utils.ensure_session(session=boto3_session) if params is None: params = {} for key, value in params.items(): sql = sql.replace(f":{key};", str(value)) if max_remote_cache_entries > max_local_cache_entries: max_remote_cache_entries = max_local_cache_entries _cache_manager.max_cache_size = max_local_cache_entries cache_info: _CacheInfo = _check_for_cached_results( sql=sql, boto3_session=session, workgroup=workgroup, max_cache_seconds=max_cache_seconds, max_cache_query_inspections=max_cache_query_inspections, max_remote_cache_entries=max_remote_cache_entries, ) _logger.debug("cache_info:\n%s", cache_info) if cache_info.has_valid_cache is True: _logger.debug("Valid cache found. Retrieving...") try: return _resolve_query_with_cache( cache_info=cache_info, categories=categories, chunksize=chunksize, use_threads=use_threads, session=session, s3_additional_kwargs=s3_additional_kwargs, ) except Exception as e: # pylint: disable=broad-except _logger.error( e ) # if there is anything wrong with the cache, just fallback to the usual path _logger.debug("Corrupted cache. Continuing to execute query...") return _resolve_query_without_cache( sql=sql, database=database, data_source=data_source, ctas_approach=ctas_approach, categories=categories, chunksize=chunksize, s3_output=s3_output, workgroup=workgroup, encryption=encryption, kms_key=kms_key, keep_files=keep_files, ctas_temp_table_name=ctas_temp_table_name, use_threads=use_threads, s3_additional_kwargs=s3_additional_kwargs, boto3_session=session, )
def read_sql_query( sql: str, database: str, transaction_id: Optional[str] = None, query_as_of_time: Optional[str] = None, catalog_id: Optional[str] = None, categories: Optional[List[str]] = None, safe: bool = True, map_types: bool = True, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, params: Optional[Dict[str, Any]] = None, ) -> pd.DataFrame: """Execute PartiQL query on AWS Glue Table (Transaction ID or time travel timestamp). Return Pandas DataFrame. Note ---- ORDER BY operations are not honoured. i.e. sql="SELECT * FROM my_table ORDER BY my_column" is NOT valid Note ---- The database must NOT be explicitely defined in the PartiQL statement. i.e. sql="SELECT * FROM my_table" is valid but sql="SELECT * FROM my_db.my_table" is NOT valid Note ---- Pass one of `transaction_id` or `query_as_of_time`, not both. Parameters ---------- sql : str partiQL query. database : str AWS Glue database name transaction_id : str, optional The ID of the transaction at which to read the table contents. Cannot be specified alongside query_as_of_time query_as_of_time : str, optional The time as of when to read the table contents. Must be a valid Unix epoch timestamp. Cannot be specified alongside transaction_id catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. categories: Optional[List[str]], optional List of columns names that should be returned as pandas.Categorical. Recommended for memory restricted environments. safe : bool, default True For certain data types, a cast is needed in order to store the data in a pandas DataFrame or Series (e.g. timestamps are always stored as nanoseconds in pandas). This option controls whether it is a safe cast or not. map_types : bool, default True True to convert pyarrow DataTypes to pandas ExtensionDtypes. It is used to override the default pandas type for conversion of built-in pyarrow types or in absence of pandas_metadata in the Table schema. use_threads : bool True to enable concurrent requests, False to disable multiple threads. When enabled, os.cpu_count() is used as the max number of threads. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session is used if boto3_session receives None. params: Dict[str, any], optional Dict of parameters used to format the partiQL query. Only named parameters are supported. The dict must contain the information in the form {"name": "value"} and the SQL query must contain `:name`. Returns ------- pd.DataFrame Pandas DataFrame. Examples -------- >>> import awswrangler as wr >>> df = wr.lakeformation.read_sql_query( ... sql="SELECT * FROM my_table;", ... database="my_db", ... catalog_id="111111111111" ... ) >>> import awswrangler as wr >>> df = wr.lakeformation.read_sql_query( ... sql="SELECT * FROM my_table LIMIT 10;", ... database="my_db", ... transaction_id="1b62811fa3e02c4e5fdbaa642b752030379c4a8a70da1f8732ce6ccca47afdc9" ... ) >>> import awswrangler as wr >>> df = wr.lakeformation.read_sql_query( ... sql="SELECT * FROM my_table WHERE name=:name; AND city=:city;", ... database="my_db", ... query_as_of_time="1611142914", ... params={"name": "'filtered_name'", "city": "'filtered_city'"} ... ) """ session: boto3.Session = _utils.ensure_session(session=boto3_session) client_lakeformation: boto3.client = _utils.client( service_name="lakeformation", session=session) commit_trans: bool = False if params is None: params = {} for key, value in params.items(): sql = sql.replace(f":{key};", str(value)) if not any([transaction_id, query_as_of_time]): _logger.debug( "Neither `transaction_id` nor `query_as_of_time` were specified, starting transaction" ) transaction_id = start_transaction(read_only=True, boto3_session=session) commit_trans = True args: Dict[str, Optional[str]] = _catalog_id( catalog_id=catalog_id, **_transaction_id(transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database), ) query_id: str = client_lakeformation.start_query_planning( QueryString=sql, QueryPlanningContext=args)["QueryId"] df = _resolve_sql_query( query_id=query_id, categories=categories, safe=safe, map_types=map_types, use_threads=use_threads, boto3_session=session, ) if commit_trans: commit_transaction(transaction_id=transaction_id) # type: ignore return df
def __init__( self, path: str, s3_block_size: int, mode: str, use_threads: bool, s3_additional_kwargs: Optional[Dict[str, str]], boto3_session: Optional[boto3.Session], newline: Optional[str], encoding: Optional[str], ) -> None: self.closed: bool = False self._use_threads = use_threads self._newline: str = "\n" if newline is None else newline self._encoding: str = "utf-8" if encoding is None else encoding self._bucket, self._key = _utils.parse_path(path=path) self._boto3_session: boto3.Session = _utils.ensure_session( session=boto3_session) if mode not in {"rb", "wb", "r", "w"}: raise NotImplementedError( "File mode must be {'rb', 'wb', 'r', 'w'}, not %s" % mode) self._mode: str = "rb" if mode is None else mode self._one_shot_download: bool = False if 0 < s3_block_size < 3: raise exceptions.InvalidArgumentValue( "s3_block_size MUST > 2 to define a valid size or " "< 1 to avoid blocks and always execute one shot downloads.") if s3_block_size <= 0: _logger.debug("s3_block_size of %d, enabling one_shot_download.", s3_block_size) self._one_shot_download = True self._s3_block_size: int = s3_block_size self._s3_half_block_size: int = s3_block_size // 2 self._s3_additional_kwargs: Dict[ str, str] = {} if s3_additional_kwargs is None else s3_additional_kwargs self._client: boto3.client = _utils.client(service_name="s3", session=self._boto3_session) self._loc: int = 0 if self.readable() is True: self._cache: bytes = b"" self._start: int = 0 self._end: int = 0 size: Optional[int] = size_objects( path=[path], use_threads=False, boto3_session=self._boto3_session)[path] if size is None: raise exceptions.InvalidArgumentValue( f"S3 object w/o defined size: {path}") self._size: int = size _logger.debug("self._size: %s", self._size) _logger.debug("self._s3_block_size: %s", self._s3_block_size) elif self.writable() is True: self._mpu: Dict[str, Any] = {} self._buffer: io.BytesIO = io.BytesIO() self._parts_count: int = 0 self._size = 0 self._upload_proxy: _UploadProxy = _UploadProxy( use_threads=self._use_threads) else: raise RuntimeError(f"Invalid mode: {self._mode}")
def store_parquet_metadata( # pylint: disable=too-many-arguments path: str, database: str, table: str, catalog_id: Optional[str] = None, path_suffix: Optional[str] = None, path_ignore_suffix: Optional[str] = None, ignore_empty: bool = True, dtype: Optional[Dict[str, str]] = None, sampling: float = 1.0, dataset: bool = False, use_threads: Union[bool, int] = True, description: Optional[str] = None, parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, compression: Optional[str] = None, mode: str = "overwrite", catalog_versioning: bool = False, regular_partitions: bool = True, projection_enabled: bool = False, projection_types: Optional[Dict[str, str]] = None, projection_ranges: Optional[Dict[str, str]] = None, projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, s3_additional_kwargs: Optional[Dict[str, Any]] = None, boto3_session: Optional[boto3.Session] = None, ) -> Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]: """Infer and store parquet metadata on AWS Glue Catalog. Infer Apache Parquet file(s) metadata from from a received S3 prefix or list of S3 objects paths And then stores it on AWS Glue Catalog including all inferred partitions (No need of 'MSCK REPAIR TABLE') The concept of Dataset goes beyond the simple idea of files and enable more complex features like partitioning and catalog integration (AWS Glue Catalog). This function accepts Unix shell-style wildcards in the path argument. * (matches everything), ? (matches any single character), [seq] (matches any character in seq), [!seq] (matches any character not in seq). If you want to use a path which includes Unix shell-style wildcard characters (`*, ?, []`), you can use `glob.escape(path)` before passing the path to this function. Note ---- On `append` mode, the `parameters` will be upsert on an existing table. Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Parameters ---------- path : Union[str, List[str]] S3 prefix (accepts Unix shell-style wildcards) (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). database : str Glue/Athena catalog: Database name. table : str Glue/Athena catalog: Table name. database : str AWS Glue Catalog database name. catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. path_suffix: Union[str, List[str], None] Suffix or List of suffixes for filtering S3 keys. path_ignore_suffix: Union[str, List[str], None] Suffix or List of suffixes for S3 keys to be ignored. ignore_empty: bool Ignore files with 0 bytes. dtype : Dict[str, str], optional Dictionary of columns names and Athena/Glue types to be casted. Useful when you have columns with undetermined data types as partitions columns. (e.g. {'col name': 'bigint', 'col2 name': 'int'}) sampling : float Random sample ratio of files that will have the metadata inspected. Must be `0.0 < sampling <= 1.0`. The higher, the more accurate. The lower, the faster. dataset: bool If True read a parquet dataset instead of simple file(s) loading all the related partitions as columns. use_threads : bool, int True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. If integer is provided, specified number is used. description: str, optional Glue/Athena catalog: Table description parameters: Dict[str, str], optional Glue/Athena catalog: Key/value pairs to tag the table. columns_comments: Dict[str, str], optional Glue/Athena catalog: Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). compression: str, optional Compression style (``None``, ``snappy``, ``gzip``, etc). mode: str 'overwrite' to recreate any possible existing table or 'append' to keep any possible existing table. catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. regular_partitions : bool Create regular partitions (Non projected partitions) on Glue Catalog. Disable when you will work only with Partition Projection. Keep enabled even when working with projections is useful to keep Redshift Spectrum working with the regular partitions. projection_enabled : bool Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) projection_types : Optional[Dict[str, str]] Dictionary of partitions names and Athena projections types. Valid types: "enum", "integer", "date", "injected" https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) projection_ranges: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections ranges. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) projection_values: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections values. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) projection_intervals: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections intervals. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '5'}) projection_digits: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) s3_additional_kwargs : Optional[Dict[str, Any]] Forwarded to botocore requests. e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'} boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. Returns ------- Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]] The metadata used to create the Glue Table. columns_types: Dictionary with keys as column names and values as data types (e.g. {'col0': 'bigint', 'col1': 'double'}). / partitions_types: Dictionary with keys as partition names and values as data types (e.g. {'col2': 'date'}). / partitions_values: Dictionary with keys as S3 path locations and values as a list of partitions values as str (e.g. {'s3://bucket/prefix/y=2020/m=10/': ['2020', '10']}). Examples -------- Reading all Parquet files metadata under a prefix >>> import awswrangler as wr >>> columns_types, partitions_types, partitions_values = wr.s3.store_parquet_metadata( ... path='s3://bucket/prefix/', ... database='...', ... table='...', ... dataset=True ... ) """ session: boto3.Session = _utils.ensure_session(session=boto3_session) columns_types: Dict[str, str] partitions_types: Optional[Dict[str, str]] partitions_values: Optional[Dict[str, List[str]]] columns_types, partitions_types, partitions_values = _read_parquet_metadata( path=path, dtype=dtype, sampling=sampling, dataset=dataset, path_suffix=path_suffix, path_ignore_suffix=path_ignore_suffix, ignore_empty=ignore_empty, use_threads=use_threads, s3_additional_kwargs=s3_additional_kwargs, boto3_session=session, ) _logger.debug("columns_types: %s", columns_types) _logger.debug("partitions_types: %s", partitions_types) _logger.debug("partitions_values: %s", partitions_values) catalog.create_parquet_table( database=database, table=table, path=path, columns_types=columns_types, partitions_types=partitions_types, description=description, parameters=parameters, columns_comments=columns_comments, mode=mode, compression=compression, catalog_versioning=catalog_versioning, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, boto3_session=session, catalog_id=catalog_id, ) if (partitions_types is not None) and (partitions_values is not None) and (regular_partitions is True): catalog.add_parquet_partitions( database=database, table=table, partitions_values=partitions_values, compression=compression, boto3_session=session, catalog_id=catalog_id, columns_types=columns_types, ) return columns_types, partitions_types, partitions_values
def repair_table( table: str, database: Optional[str] = None, s3_output: Optional[str] = None, workgroup: Optional[str] = None, encryption: Optional[str] = None, kms_key: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ) -> str: """Run the Hive's metastore consistency check: 'MSCK REPAIR TABLE table;'. Recovers partitions and data associated with partitions. Use this statement when you add partitions to the catalog. It is possible it will take some time to add all partitions. If this operation times out, it will be in an incomplete state where only a few partitions are added to the catalog. Note ---- Create the default Athena bucket if it doesn't exist and s3_output is None. (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/) Parameters ---------- table : str Table name. database : str, optional AWS Glue/Athena database name. s3_output : str, optional AWS S3 path. workgroup : str, optional Athena workgroup. encryption : str, optional None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'. kms_key : str, optional For SSE-KMS and CSE-KMS , this is the KMS key ARN or ID. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. Returns ------- str Query final state ('SUCCEEDED', 'FAILED', 'CANCELLED'). Examples -------- >>> import awswrangler as wr >>> query_final_state = wr.athena.repair_table(table='...', database='...') """ query = f"MSCK REPAIR TABLE `{table}`;" if (database is not None) and (not database.startswith("`")): database = f"`{database}`" session: boto3.Session = _utils.ensure_session(session=boto3_session) query_id = start_query_execution( sql=query, database=database, s3_output=s3_output, workgroup=workgroup, encryption=encryption, kms_key=kms_key, boto3_session=session, ) response: Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session) return cast(str, response["Status"]["State"])
Examples -------- >>> import awswrangler as wr >>> import pandas as pd >>> wr.db.copy_to_redshift( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path="s3://bucket/my_parquet_files/", ... con=wr.catalog.get_engine(connection="my_glue_conn_name"), ... table="my_table", ... schema="public" ... iam_role="arn:aws:iam::XXX:role/XXX" ... ) """ path = path if path.endswith("/") else f"{path}/" session: boto3.Session = _utils.ensure_session(session=boto3_session) paths: List[str] = s3.to_parquet( df=df, path=path, index=index, dataset=True, mode="append", dtype=dtype, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, max_rows_by_file=max_rows_by_file, )["paths"] s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
def describe_table( table: str, database: Optional[str] = None, s3_output: Optional[str] = None, workgroup: Optional[str] = None, encryption: Optional[str] = None, kms_key: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ) -> pd.DataFrame: """Show the list of columns, including partition columns: 'DESCRIBE table;'. Shows the list of columns, including partition columns, for the named column. The result of this function will be equal to `wr.catalog.table`. Note ---- Create the default Athena bucket if it doesn't exist and s3_output is None. (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/) Parameters ---------- table : str Table name. database : str, optional AWS Glue/Athena database name. s3_output : str, optional AWS S3 path. workgroup : str, optional Athena workgroup. encryption : str, optional None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'. kms_key : str, optional For SSE-KMS and CSE-KMS , this is the KMS key ARN or ID. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. Returns ------- pandas.DataFrame Pandas DataFrame filled by formatted infos. Examples -------- >>> import awswrangler as wr >>> df_table = wr.athena.describe_table(table='my_table', database='default') """ query = f"DESCRIBE `{table}`;" if (database is not None) and (not database.startswith("`")): database = f"`{database}`" session: boto3.Session = _utils.ensure_session(session=boto3_session) query_id = start_query_execution( sql=query, database=database, s3_output=s3_output, workgroup=workgroup, encryption=encryption, kms_key=kms_key, boto3_session=session, ) query_metadata: _QueryMetadata = _get_query_metadata(query_execution_id=query_id, boto3_session=session) raw_result = _fetch_txt_result(query_metadata=query_metadata, keep_files=True, boto3_session=session,) return _parse_describe_table(raw_result)
def unload_redshift( sql: str, path: str, con: sqlalchemy.engine.Engine, iam_role: str, region: Optional[str] = None, max_file_size: Optional[float] = None, kms_key_id: Optional[str] = None, categories: Optional[List[str]] = None, chunked: Union[bool, int] = False, keep_files: bool = False, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: """Load Pandas DataFrame from a Amazon Redshift query result using Parquet files on s3 as stage. This is a **HIGH** latency and **HIGH** throughput alternative to `wr.db.read_sql_query()`/`wr.db.read_sql_table()` to extract large Amazon Redshift data into a Pandas DataFrames through the **UNLOAD command**. This strategy has more overhead and requires more IAM privileges than the regular `wr.db.read_sql_query()`/`wr.db.read_sql_table()` function, so it is only recommended to fetch +1MM rows at once. https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html Note ---- ``Batching`` (`chunked` argument) (Memory Friendly): Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame. There are two batching strategies on Wrangler: - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset. - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows igual the received INTEGER. `P.S.` `chunked=True` if faster and uses less memory while `chunked=INTEGER` is more precise in number of rows for each Dataframe. Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Parameters ---------- sql: str SQL query. path : Union[str, List[str]] S3 path to write stage files (e.g. s3://bucket_name/any_name/) con : sqlalchemy.engine.Engine SQLAlchemy Engine. Please use, wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine() iam_role : str AWS IAM role with the related permissions. region : str, optional Specifies the AWS Region where the target Amazon S3 bucket is located. REGION is required for UNLOAD to an Amazon S3 bucket that isn't in the same AWS Region as the Amazon Redshift cluster. By default, UNLOAD assumes that the target Amazon S3 bucket is located in the same AWS Region as the Amazon Redshift cluster. max_file_size : float, optional Specifies the maximum size (MB) of files that UNLOAD creates in Amazon S3. Specify a decimal value between 5.0 MB and 6200.0 MB. If None, the default maximum file size is 6200.0 MB. kms_key_id : str, optional Specifies the key ID for an AWS Key Management Service (AWS KMS) key to be used to encrypt data files on Amazon S3. categories: List[str], optional List of columns names that should be returned as pandas.Categorical. Recommended for memory restricted environments. keep_files : bool Should keep the stage files? chunked : Union[int, bool] If passed will split the data in a Iterable of DataFrames (Memory friendly). If `True` wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize. If an `INTEGER` is passed Wrangler will iterate on the data by number of rows igual the received INTEGER. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs: Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered. Returns ------- Union[pandas.DataFrame, Iterator[pandas.DataFrame]] Result as Pandas DataFrame(s). Examples -------- >>> import awswrangler as wr >>> import pandas as pd >>> df = wr.db.unload_redshift( ... sql="SELECT * FROM public.mytable", ... path="s3://bucket/extracted_parquet_files/", ... con=wr.catalog.get_engine(connection="my_glue_connection"), ... iam_role="arn:aws:iam::XXX:role/XXX" ... ) """ session: boto3.Session = _utils.ensure_session(session=boto3_session) paths: List[str] = unload_redshift_to_files( sql=sql, path=path, con=con, iam_role=iam_role, region=region, max_file_size=max_file_size, kms_key_id=kms_key_id, use_threads=use_threads, boto3_session=session, ) s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session) if chunked is False: if not paths: return pd.DataFrame() df: pd.DataFrame = s3.read_parquet( path=paths, categories=categories, chunked=chunked, dataset=False, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, ) if keep_files is False: s3.delete_objects(path=paths, use_threads=use_threads, boto3_session=session) return df if not paths: return _utils.empty_generator() return _read_parquet_iterator( paths=paths, categories=categories, chunked=chunked, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, keep_files=keep_files, )
def show_create_table( table: str, database: Optional[str] = None, s3_output: Optional[str] = None, workgroup: Optional[str] = None, encryption: Optional[str] = None, kms_key: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ) -> str: """Generate the query that created it: 'SHOW CREATE TABLE table;'. Analyzes an existing table named table_name to generate the query that created it. Note ---- Create the default Athena bucket if it doesn't exist and s3_output is None. (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/) Parameters ---------- table : str Table name. database : str, optional AWS Glue/Athena database name. s3_output : str, optional AWS S3 path. workgroup : str, optional Athena workgroup. encryption : str, optional None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'. kms_key : str, optional For SSE-KMS and CSE-KMS , this is the KMS key ARN or ID. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. Returns ------- str The query that created the table. Examples -------- >>> import awswrangler as wr >>> df_table = wr.athena.show_create_table(table='my_table', database='default') """ query = f"SHOW CREATE TABLE `{table}`;" if (database is not None) and (not database.startswith("`")): database = f"`{database}`" session: boto3.Session = _utils.ensure_session(session=boto3_session) query_id = start_query_execution( sql=query, database=database, s3_output=s3_output, workgroup=workgroup, encryption=encryption, kms_key=kms_key, boto3_session=session, ) query_metadata: _QueryMetadata = _get_query_metadata(query_execution_id=query_id, boto3_session=session) raw_result = _fetch_txt_result(query_metadata=query_metadata, keep_files=True, boto3_session=session,) return cast(str, raw_result.createtab_stmt.str.strip().str.cat(sep=" "))
def copy_files_to_redshift( # pylint: disable=too-many-locals,too-many-arguments path: Union[str, List[str]], manifest_directory: str, con: sqlalchemy.engine.Engine, table: str, schema: str, iam_role: str, parquet_infer_sampling: float = 1.0, mode: str = "append", diststyle: str = "AUTO", distkey: Optional[str] = None, sortstyle: str = "COMPOUND", sortkey: Optional[List[str]] = None, primary_keys: Optional[List[str]] = None, varchar_lengths_default: int = 256, varchar_lengths: Optional[Dict[str, int]] = None, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, ) -> None: """Load Parquet files from S3 to a Table on Amazon Redshift (Through COPY command). https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html This function accepts Unix shell-style wildcards in the path argument. * (matches everything), ? (matches any single character), [seq] (matches any character in seq), [!seq] (matches any character not in seq). Note ---- If the table does not exist yet, it will be automatically created for you using the Parquet metadata to infer the columns data types. Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Parameters ---------- path : Union[str, List[str]] S3 prefix (accepts Unix shell-style wildcards) (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). manifest_directory : str S3 prefix (e.g. s3://bucket/prefix) con : sqlalchemy.engine.Engine SQLAlchemy Engine. Please use, wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine() table : str Table name schema : str Schema name iam_role : str AWS IAM role with the related permissions. parquet_infer_sampling : float Random sample ratio of files that will have the metadata inspected. Must be `0.0 < sampling <= 1.0`. The higher, the more accurate. The lower, the faster. mode : str Append, overwrite or upsert. diststyle : str Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"]. https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html distkey : str, optional Specifies a column name or positional number for the distribution key. sortstyle : str Sorting can be "COMPOUND" or "INTERLEAVED". https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html sortkey : List[str], optional List of columns to be sorted. primary_keys : List[str], optional Primary keys. varchar_lengths_default : int The size that will be set for all VARCHAR columns not specified with varchar_lengths. varchar_lengths : Dict[str, int], optional Dict of VARCHAR length by columns. (e.g. {"col1": 10, "col5": 200}). use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs: Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass", "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging". e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN'} Returns ------- None None. Examples -------- >>> import awswrangler as wr >>> wr.db.copy_files_to_redshift( ... path="s3://bucket/my_parquet_files/", ... con=wr.catalog.get_engine(connection="my_glue_conn_name"), ... table="my_table", ... schema="public" ... iam_role="arn:aws:iam::XXX:role/XXX" ... ) """ _varchar_lengths: Dict[ str, int] = {} if varchar_lengths is None else varchar_lengths session: boto3.Session = _utils.ensure_session(session=boto3_session) paths: List[str] = _path2list(path=path, boto3_session=session) # pylint: disable=protected-access manifest_directory = manifest_directory if manifest_directory.endswith( "/") else f"{manifest_directory}/" manifest_path: str = f"{manifest_directory}manifest.json" write_redshift_copy_manifest( manifest_path=manifest_path, paths=paths, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, ) s3.wait_objects_exist(paths=paths + [manifest_path], use_threads=False, boto3_session=session) athena_types, _ = s3.read_parquet_metadata(path=paths, sampling=parquet_infer_sampling, dataset=False, use_threads=use_threads, boto3_session=session) _logger.debug("athena_types: %s", athena_types) redshift_types: Dict[str, str] = {} for col_name, col_type in athena_types.items(): length: int = _varchar_lengths[ col_name] if col_name in _varchar_lengths else varchar_lengths_default redshift_types[col_name] = _data_types.athena2redshift( dtype=col_type, varchar_length=length) with con.begin() as _con: created_table, created_schema = _rs_create_table( con=_con, table=table, schema=schema, redshift_types=redshift_types, mode=mode, diststyle=diststyle, sortstyle=sortstyle, distkey=distkey, sortkey=sortkey, primary_keys=primary_keys, ) _rs_copy( con=_con, table=created_table, schema=created_schema, manifest_path=manifest_path, iam_role=iam_role, num_files=len(paths), ) if table != created_table: # upsert _rs_upsert(con=_con, schema=schema, table=table, temp_table=created_table, primary_keys=primary_keys) s3.delete_objects(path=[manifest_path], use_threads=use_threads, boto3_session=session)
def to_excel( df: pd.DataFrame, path: str, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, Any]] = None, use_threads: Union[bool, int] = True, **pandas_kwargs: Any, ) -> str: """Write EXCEL file on Amazon S3. Note ---- This function accepts any Pandas's read_excel() argument. https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html Note ---- Depending on the file extension ('xlsx', 'xls', 'odf'...), an additional library might have to be installed first (e.g. xlrd). Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Parameters ---------- df: pandas.DataFrame Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html path : str Amazon S3 path (e.g. s3://bucket/filename.xlsx). boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 Session will be used if boto3_session receive None. s3_additional_kwargs : Optional[Dict[str, Any]] Forwarded to botocore requests. e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'} use_threads : bool, int True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. If integer is provided, specified number is used. pandas_kwargs: KEYWORD arguments forwarded to pandas.DataFrame.to_excel(). You can NOT pass `pandas_kwargs` explicit, just add valid Pandas arguments in the function call and Wrangler will accept it. e.g. wr.s3.to_excel(df, path, na_rep="", index=False) https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html Returns ------- str Written S3 path. Examples -------- Writing EXCEL file >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_excel(df, 's3://bucket/filename.xlsx') """ if "pandas_kwargs" in pandas_kwargs: raise exceptions.InvalidArgument( "You can NOT pass `pandas_kwargs` explicit, just add valid " "Pandas arguments in the function call and Wrangler will accept it." "e.g. wr.s3.to_excel(df, path, na_rep=" ", index=False)") session: boto3.Session = _utils.ensure_session(session=boto3_session) with open_s3_object( path=path, mode="wb", use_threads=use_threads, s3_additional_kwargs=s3_additional_kwargs, boto3_session=session, ) as f: _logger.debug("pandas_kwargs: %s", pandas_kwargs) df.to_excel(f, **pandas_kwargs) return path
def create_athena_dataset( name: str, database: Optional[str] = None, table: Optional[str] = None, sql: Optional[str] = None, sql_name: str = "CustomSQL", data_source_name: Optional[str] = None, data_source_arn: Optional[str] = None, import_mode: str = "DIRECT_QUERY", allowed_to_use: Optional[List[str]] = None, allowed_to_manage: Optional[List[str]] = None, logical_table_alias: str = "LogicalTable", rename_columns: Optional[Dict[str, str]] = None, cast_columns_types: Optional[Dict[str, str]] = None, tags: Optional[Dict[str, str]] = None, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ) -> str: """Create a QuickSight dataset. Note ---- You will not be able to see the the dataset in the console if you not pass your user to one of the ``allowed_*`` arguments. Note ---- You must pass ``database``/``table`` OR ``sql`` argument. Note ---- You must pass ``data_source_name`` OR ``data_source_arn`` argument. Parameters ---------- name : str Dataset name. database : str Athena's database name. table : str Athena's table name. sql : str Use a SQL query to define your table. sql_name : str Query name. data_source_name : str, optional QuickSight data source name. data_source_arn : str, optional QuickSight data source ARN. import_mode : str Indicates whether you want to import the data into SPICE. 'SPICE'|'DIRECT_QUERY' tags : Dict[str, str], optional Key/Value collection to put on the Cluster. e.g. {"foo": "boo", "bar": "xoo"}) allowed_to_use : optional List of principals that will be allowed to see and use the data source. e.g. ["john", "Mary"] allowed_to_manage : optional List of principals that will be allowed to see, use, update and delete the data source. e.g. ["Mary"] logical_table_alias : str A display name for the logical table. rename_columns : Dict[str, str], optional Dictionary to map column renames. e.g. {"old_name": "new_name", "old_name2": "new_name2"} cast_columns_types : Dict[str, str], optional Dictionary to map column casts. e.g. {"col_name": "STRING", "col_name2": "DECIMAL"} Valid types: 'STRING'|'INTEGER'|'DECIMAL'|'DATETIME' account_id : str, optional If None, the account ID will be inferred from your boto3 session. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. Returns ------- str Dataset ID. Examples -------- >>> import awswrangler as wr >>> dataset_id = wr.quicksight.create_athena_dataset( ... name="...", ... database="..." ... table="..." ... data_source_name="..." ... allowed_to_manage=["Mary"] ... ) """ if (data_source_name is None) and (data_source_arn is None): raise exceptions.InvalidArgument("You must pass a not None data_source_name or data_source_arn argument.") if ((database is None) and (table is None)) and (sql is None): raise exceptions.InvalidArgument("You must pass database/table OR sql argument.") if (database is not None) and (sql is not None): raise exceptions.InvalidArgument( "If you provide sql argument, please include the database name inside the sql statement." "Do NOT pass in with database argument." ) session: boto3.Session = _utils.ensure_session(session=boto3_session) client: boto3.client = _utils.client(service_name="quicksight", session=session) if account_id is None: account_id = sts.get_account_id(boto3_session=session) if (data_source_arn is None) and (data_source_name is not None): data_source_arn = get_data_source_arn(name=data_source_name, account_id=account_id, boto3_session=session) if sql is not None: physical_table: Dict[str, Dict[str, Any]] = { "CustomSql": { "DataSourceArn": data_source_arn, "Name": sql_name, "SqlQuery": sql, "Columns": extract_athena_query_columns( sql=sql, data_source_arn=data_source_arn, # type: ignore account_id=account_id, boto3_session=session, ), } } else: physical_table = { "RelationalTable": { "DataSourceArn": data_source_arn, "Schema": database, "Name": table, "InputColumns": extract_athena_table_columns( database=database, # type: ignore table=table, # type: ignore boto3_session=session, ), } } table_uuid: str = uuid.uuid4().hex dataset_id: str = uuid.uuid4().hex args: Dict[str, Any] = { "AwsAccountId": account_id, "DataSetId": dataset_id, "Name": name, "ImportMode": import_mode, "PhysicalTableMap": {table_uuid: physical_table}, "LogicalTableMap": {table_uuid: {"Alias": logical_table_alias, "Source": {"PhysicalTableId": table_uuid}}}, } trans: List[Dict[str, Dict[str, Any]]] = _generate_transformations( rename_columns=rename_columns, cast_columns_types=cast_columns_types ) if trans: args["LogicalTableMap"][table_uuid]["DataTransforms"] = trans permissions: List[Dict[str, Union[str, List[str]]]] = _generate_permissions( resource="dataset", account_id=account_id, boto3_session=session, allowed_to_use=allowed_to_use, allowed_to_manage=allowed_to_manage, ) if permissions: args["Permissions"] = permissions if tags is not None: _tags: List[Dict[str, str]] = [{"Key": k, "Value": v} for k, v in tags.items()] args["Tags"] = _tags client.create_data_set(**args) return dataset_id
def to_csv( # pylint: disable=too-many-arguments,too-many-locals df: pd.DataFrame, path: str, sep: str = ",", index: bool = True, columns: Optional[List[str]] = None, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, sanitize_columns: bool = False, dataset: bool = False, partition_cols: Optional[List[str]] = None, mode: Optional[str] = None, catalog_versioning: bool = False, database: Optional[str] = None, table: Optional[str] = None, dtype: Optional[Dict[str, str]] = None, description: Optional[str] = None, parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, regular_partitions: bool = True, projection_enabled: bool = False, projection_types: Optional[Dict[str, str]] = None, projection_ranges: Optional[Dict[str, str]] = None, projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, **pandas_kwargs, ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: """Write CSV file or dataset on Amazon S3. The concept of Dataset goes beyond the simple idea of files and enable more complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog). Note ---- If `dataset=True` The table name and all column names will be automatically sanitized using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. Please, pass `sanitize_columns=True` to force the same behaviour for `dataset=False`. Note ---- If `dataset=True`, `pandas_kwargs` will be ignored due restrictive quoting, date_format, escapechar, encoding, etc required by Athena/Glue Catalog. Note ---- By now Pandas does not support in-memory CSV compression. https://github.com/pandas-dev/pandas/issues/22555 So the `compression` will not be supported on Wrangler too. Note ---- On `append` mode, the `parameters` will be upsert on an existing table. Note ---- In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). Parameters ---------- df: pandas.DataFrame Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html path : str Amazon S3 path (e.g. s3://bucket/filename.csv). sep : str String of length 1. Field delimiter for the output file. index : bool Write row names (index). columns : List[str], optional Columns to write. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 Session will be used if boto3_session receive None. s3_additional_kwargs: Forward to s3fs, useful for server side encryption https://s3fs.readthedocs.io/en/latest/#serverside-encryption sanitize_columns : bool True to sanitize columns names or False to keep it as is. True value is forced if `dataset=True`. dataset : bool If True store a parquet dataset instead of a single file. If True, enable all follow arguments: partition_cols, mode, database, table, description, parameters, columns_comments, . partition_cols: List[str], optional List of column names that will be used to create partitions. Only takes effect if dataset=True. mode : str, optional ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. database : str, optional Glue/Athena catalog: Database name. table : str, optional Glue/Athena catalog: Table name. dtype : Dict[str, str], optional Dictionary of columns names and Athena/Glue types to be casted. Useful when you have columns with undetermined or mixed data types. (e.g. {'col name': 'bigint', 'col2 name': 'int'}) description : str, optional Glue/Athena catalog: Table description parameters : Dict[str, str], optional Glue/Athena catalog: Key/value pairs to tag the table. columns_comments : Dict[str, str], optional Glue/Athena catalog: Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). regular_partitions : bool Create regular partitions (Non projected partitions) on Glue Catalog. Disable when you will work only with Partition Projection. Keep enabled even when working with projections is useful to keep Redshift Spectrum working with the regular partitions. projection_enabled : bool Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) projection_types : Optional[Dict[str, str]] Dictionary of partitions names and Athena projections types. Valid types: "enum", "integer", "date", "injected" https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) projection_ranges: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections ranges. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) projection_values: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections values. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) projection_intervals: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections intervals. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '5'}) projection_digits: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) pandas_kwargs : keyword arguments forwarded to pandas.DataFrame.to_csv() https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html Returns ------- None None. Examples -------- Writing single file >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.csv', ... ) { 'paths': ['s3://bucket/prefix/my_file.csv'], 'partitions_values': {} } Writing single file encrypted with a KMS key >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.csv', ... s3_additional_kwargs={ ... 'ServerSideEncryption': 'aws:kms', ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' ... } ... ) { 'paths': ['s3://bucket/prefix/my_file.csv'], 'partitions_values': {} } Writing partitioned dataset >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... partition_cols=['col2'] ... ) { 'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'], 'partitions_values: { 's3://.../col2=A/': ['A'], 's3://.../col2=B/': ['B'] } } Writing dataset to S3 with metadata on Athena/Glue Catalog. >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... partition_cols=['col2'], ... database='default', # Athena/Glue database ... table='my_table' # Athena/Glue table ... ) { 'paths': ['s3://.../col2=A/x.csv', 's3://.../col2=B/y.csv'], 'partitions_values: { 's3://.../col2=A/': ['A'], 's3://.../col2=B/': ['B'] } } Writing dataset casting empty column data type >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_csv( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'], ... 'col3': [None, None, None] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... database='default', # Athena/Glue database ... table='my_table' # Athena/Glue table ... dtype={'col3': 'date'} ... ) { 'paths': ['s3://.../x.csv'], 'partitions_values: {} } """ if (database is None) ^ (table is None): raise exceptions.InvalidArgumentCombination( "Please pass database and table arguments to be able to store the metadata into the Athena/Glue Catalog." ) if df.empty is True: raise exceptions.EmptyDataFrame() partition_cols = partition_cols if partition_cols else [] dtype = dtype if dtype else {} partitions_values: Dict[str, List[str]] = {} # Sanitize table to respect Athena's standards if (sanitize_columns is True) or (dataset is True): df = catalog.sanitize_dataframe_columns_names(df=df) partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} catalog.drop_duplicated_columns(df=df) session: boto3.Session = _utils.ensure_session(session=boto3_session) fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) if dataset is False: if partition_cols: raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use partition_cols.") if mode is not None: raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use mode.") if columns_comments: raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use columns_comments.") if any(arg is not None for arg in (database, table, description, parameters)): raise exceptions.InvalidArgumentCombination( "Please pass dataset=True to be able to use any one of these " "arguments: database, table, description, parameters, " "columns_comments." ) pandas_kwargs["sep"] = sep pandas_kwargs["index"] = index pandas_kwargs["columns"] = columns _to_text(file_format="csv", df=df, path=path, fs=fs, **pandas_kwargs) paths = [path] else: mode = "append" if mode is None else mode if columns: df = df[columns] if ( (mode in ("append", "overwrite_partitions")) and (database is not None) and (table is not None) ): # Fetching Catalog Types catalog_types: Optional[Dict[str, str]] = catalog.get_table_types( database=database, table=table, boto3_session=session ) if catalog_types is not None: for k, v in catalog_types.items(): dtype[k] = v paths, partitions_values = _to_csv_dataset( df=df, path=path, index=index, sep=sep, fs=fs, use_threads=use_threads, partition_cols=partition_cols, dtype=dtype, mode=mode, boto3_session=session, ) if (database is not None) and (table is not None): columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( df=df, index=index, partition_cols=partition_cols, dtype=dtype, index_left=True ) catalog.create_csv_table( database=database, table=table, path=path, columns_types=columns_types, partitions_types=partitions_types, description=description, parameters=parameters, columns_comments=columns_comments, boto3_session=session, mode=mode, catalog_versioning=catalog_versioning, sep=sep, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, ) if partitions_values and (regular_partitions is True): _logger.debug("partitions_values:\n%s", partitions_values) catalog.add_csv_partitions( database=database, table=table, partitions_values=partitions_values, boto3_session=session, sep=sep ) return {"paths": paths, "partitions_values": partitions_values}
def read_parquet_metadata( path: Union[str, List[str]], path_suffix: Optional[str] = None, path_ignore_suffix: Optional[str] = None, dtype: Optional[Dict[str, str]] = None, sampling: float = 1.0, dataset: bool = False, use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, Any]] = None, ) -> Tuple[Dict[str, str], Optional[Dict[str, str]]]: """Read Apache Parquet file(s) metadata from from a received S3 prefix or list of S3 objects paths. The concept of Dataset goes beyond the simple idea of files and enable more complex features like partitioning and catalog integration (AWS Glue Catalog). This function accepts Unix shell-style wildcards in the path argument. * (matches everything), ? (matches any single character), [seq] (matches any character in seq), [!seq] (matches any character not in seq). Note ---- In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count(). Parameters ---------- path : Union[str, List[str]] S3 prefix (accepts Unix shell-style wildcards) (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]). path_suffix: Union[str, List[str], None] Suffix or List of suffixes for filtering S3 keys. path_ignore_suffix: Union[str, List[str], None] Suffix or List of suffixes for S3 keys to be ignored. dtype : Dict[str, str], optional Dictionary of columns names and Athena/Glue types to be casted. Useful when you have columns with undetermined data types as partitions columns. (e.g. {'col name': 'bigint', 'col2 name': 'int'}) sampling : float Random sample ratio of files that will have the metadata inspected. Must be `0.0 < sampling <= 1.0`. The higher, the more accurate. The lower, the faster. dataset: bool If True read a parquet dataset instead of simple file(s) loading all the related partitions as columns. use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs : Optional[Dict[str, Any]] Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered. Returns ------- Tuple[Dict[str, str], Optional[Dict[str, str]]] columns_types: Dictionary with keys as column names and values as data types (e.g. {'col0': 'bigint', 'col1': 'double'}). / partitions_types: Dictionary with keys as partition names and values as data types (e.g. {'col2': 'date'}). Examples -------- Reading all Parquet files (with partitions) metadata under a prefix >>> import awswrangler as wr >>> columns_types, partitions_types = wr.s3.read_parquet_metadata(path='s3://bucket/prefix/', dataset=True) Reading all Parquet files metadata from a list >>> import awswrangler as wr >>> columns_types, partitions_types = wr.s3.read_parquet_metadata(path=[ ... 's3://bucket/filename0.parquet', ... 's3://bucket/filename1.parquet' ... ]) """ return _read_parquet_metadata( path=path, path_suffix=path_suffix, path_ignore_suffix=path_ignore_suffix, dtype=dtype, sampling=sampling, dataset=dataset, use_threads=use_threads, s3_additional_kwargs=s3_additional_kwargs, boto3_session=_utils.ensure_session(session=boto3_session), )[:2]
def to_parquet( # pylint: disable=too-many-arguments,too-many-locals df: pd.DataFrame, path: str, index: bool = False, compression: Optional[str] = "snappy", use_threads: bool = True, boto3_session: Optional[boto3.Session] = None, s3_additional_kwargs: Optional[Dict[str, str]] = None, sanitize_columns: bool = False, dataset: bool = False, partition_cols: Optional[List[str]] = None, mode: Optional[str] = None, catalog_versioning: bool = False, database: Optional[str] = None, table: Optional[str] = None, dtype: Optional[Dict[str, str]] = None, description: Optional[str] = None, parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, regular_partitions: bool = True, projection_enabled: bool = False, projection_types: Optional[Dict[str, str]] = None, projection_ranges: Optional[Dict[str, str]] = None, projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: """Write Parquet file or dataset on Amazon S3. The concept of Dataset goes beyond the simple idea of files and enable more complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog). Note ---- If `dataset=True` The table name and all column names will be automatically sanitized using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. Please, pass `sanitize_columns=True` to force the same behaviour for `dataset=False`. Note ---- On `append` mode, the `parameters` will be upsert on an existing table. Note ---- In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count(). Parameters ---------- df: pandas.DataFrame Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html path : str S3 path (for file e.g. ``s3://bucket/prefix/filename.parquet``) (for dataset e.g. ``s3://bucket/prefix``). index : bool True to store the DataFrame index in file, otherwise False to ignore it. compression: str, optional Compression style (``None``, ``snappy``, ``gzip``). use_threads : bool True to enable concurrent requests, False to disable multiple threads. If enabled os.cpu_count() will be used as the max number of threads. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. s3_additional_kwargs: Forward to s3fs, useful for server side encryption https://s3fs.readthedocs.io/en/latest/#serverside-encryption sanitize_columns : bool True to sanitize columns names or False to keep it as is. True value is forced if `dataset=True`. dataset : bool If True store a parquet dataset instead of a single file. If True, enable all follow arguments: partition_cols, mode, database, table, description, parameters, columns_comments, . partition_cols: List[str], optional List of column names that will be used to create partitions. Only takes effect if dataset=True. mode: str, optional ``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True. catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. database : str, optional Glue/Athena catalog: Database name. table : str, optional Glue/Athena catalog: Table name. dtype : Dict[str, str], optional Dictionary of columns names and Athena/Glue types to be casted. Useful when you have columns with undetermined or mixed data types. (e.g. {'col name': 'bigint', 'col2 name': 'int'}) description : str, optional Glue/Athena catalog: Table description parameters : Dict[str, str], optional Glue/Athena catalog: Key/value pairs to tag the table. columns_comments : Dict[str, str], optional Glue/Athena catalog: Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). regular_partitions : bool Create regular partitions (Non projected partitions) on Glue Catalog. Disable when you will work only with Partition Projection. Keep enabled even when working with projections is useful to keep Redshift Spectrum working with the regular partitions. projection_enabled : bool Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) projection_types : Optional[Dict[str, str]] Dictionary of partitions names and Athena projections types. Valid types: "enum", "integer", "date", "injected" https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) projection_ranges: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections ranges. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) projection_values: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections values. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) projection_intervals: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections intervals. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '5'}) projection_digits: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) Returns ------- Dict[str, Union[List[str], Dict[str, List[str]]]] Dictionary with: 'paths': List of all stored files paths on S3. 'partitions_values': Dictionary of partitions added with keys as S3 path locations and values as a list of partitions values as str. Examples -------- Writing single file >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.parquet', ... ) { 'paths': ['s3://bucket/prefix/my_file.parquet'], 'partitions_values': {} } Writing single file encrypted with a KMS key >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({'col': [1, 2, 3]}), ... path='s3://bucket/prefix/my_file.parquet', ... s3_additional_kwargs={ ... 'ServerSideEncryption': 'aws:kms', ... 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN' ... } ... ) { 'paths': ['s3://bucket/prefix/my_file.parquet'], 'partitions_values': {} } Writing partitioned dataset >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... partition_cols=['col2'] ... ) { 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], 'partitions_values: { 's3://.../col2=A/': ['A'], 's3://.../col2=B/': ['B'] } } Writing dataset to S3 with metadata on Athena/Glue Catalog. >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... partition_cols=['col2'], ... database='default', # Athena/Glue database ... table='my_table' # Athena/Glue table ... ) { 'paths': ['s3://.../col2=A/x.parquet', 's3://.../col2=B/y.parquet'], 'partitions_values: { 's3://.../col2=A/': ['A'], 's3://.../col2=B/': ['B'] } } Writing dataset casting empty column data type >>> import awswrangler as wr >>> import pandas as pd >>> wr.s3.to_parquet( ... df=pd.DataFrame({ ... 'col': [1, 2, 3], ... 'col2': ['A', 'A', 'B'], ... 'col3': [None, None, None] ... }), ... path='s3://bucket/prefix', ... dataset=True, ... database='default', # Athena/Glue database ... table='my_table' # Athena/Glue table ... dtype={'col3': 'date'} ... ) { 'paths': ['s3://.../x.parquet'], 'partitions_values: {} } """ if (database is None) ^ (table is None): raise exceptions.InvalidArgumentCombination( "Please pass database and table arguments to be able to store the metadata into the Athena/Glue Catalog." ) if df.empty is True: raise exceptions.EmptyDataFrame() partition_cols = partition_cols if partition_cols else [] dtype = dtype if dtype else {} partitions_values: Dict[str, List[str]] = {} # Sanitize table to respect Athena's standards if (sanitize_columns is True) or (dataset is True): df = catalog.sanitize_dataframe_columns_names(df=df) partition_cols = [catalog.sanitize_column_name(p) for p in partition_cols] dtype = {catalog.sanitize_column_name(k): v.lower() for k, v in dtype.items()} catalog.drop_duplicated_columns(df=df) session: boto3.Session = _utils.ensure_session(session=boto3_session) cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) fs: s3fs.S3FileSystem = _utils.get_fs(session=session, s3_additional_kwargs=s3_additional_kwargs) compression_ext: Optional[str] = _COMPRESSION_2_EXT.get(compression, None) if compression_ext is None: raise exceptions.InvalidCompression(f"{compression} is invalid, please use None, snappy or gzip.") if dataset is False: if path.endswith("/"): # pragma: no cover raise exceptions.InvalidArgumentValue( "If <dataset=False>, the argument <path> should be a object path, not a directory." ) if partition_cols: raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use partition_cols.") if mode is not None: raise exceptions.InvalidArgumentCombination("Please pass dataset=True to be able to use mode.") if any(arg is not None for arg in (database, table, description, parameters)): raise exceptions.InvalidArgumentCombination( "Please pass dataset=True to be able to use any one of these " "arguments: database, table, description, parameters, " "columns_comments." ) df = _data_types.cast_pandas_with_athena_types(df=df, dtype=dtype) schema: pa.Schema = _data_types.pyarrow_schema_from_pandas( df=df, index=index, ignore_cols=partition_cols, dtype=dtype ) _logger.debug("schema: \n%s", schema) paths = [ _to_parquet_file( df=df, path=path, schema=schema, index=index, compression=compression, cpus=cpus, fs=fs, dtype=dtype ) ] else: mode = "append" if mode is None else mode if ( (mode in ("append", "overwrite_partitions")) and (database is not None) and (table is not None) ): # Fetching Catalog Types catalog_types: Optional[Dict[str, str]] = catalog.get_table_types( database=database, table=table, boto3_session=session ) if catalog_types is not None: for k, v in catalog_types.items(): dtype[k] = v paths, partitions_values = _to_parquet_dataset( df=df, path=path, index=index, compression=compression, compression_ext=compression_ext, cpus=cpus, fs=fs, use_threads=use_threads, partition_cols=partition_cols, dtype=dtype, mode=mode, boto3_session=session, ) if (database is not None) and (table is not None): columns_types, partitions_types = _data_types.athena_types_from_pandas_partitioned( df=df, index=index, partition_cols=partition_cols, dtype=dtype ) catalog.create_parquet_table( database=database, table=table, path=path, columns_types=columns_types, partitions_types=partitions_types, compression=compression, description=description, parameters=parameters, columns_comments=columns_comments, boto3_session=session, mode=mode, catalog_versioning=catalog_versioning, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, ) if partitions_values and (regular_partitions is True): _logger.debug("partitions_values:\n%s", partitions_values) catalog.add_parquet_partitions( database=database, table=table, partitions_values=partitions_values, compression=compression, boto3_session=session, ) return {"paths": paths, "partitions_values": partitions_values}