def _validate_datetimes( last_modified_begin: Optional[datetime.datetime] = None, last_modified_end: Optional[datetime.datetime] = None ) -> None: if (last_modified_begin is not None) and (last_modified_begin.tzinfo is None): raise exceptions.InvalidArgumentValue("Timezone is not defined for last_modified_begin.") if (last_modified_end is not None) and (last_modified_end.tzinfo is None): raise exceptions.InvalidArgumentValue("Timezone is not defined for last_modified_end.") if (last_modified_begin is not None) and (last_modified_end is not None): if last_modified_begin > last_modified_end: raise exceptions.InvalidArgumentValue("last_modified_begin is bigger than last_modified_end.")
def _resolve_query_with_cache( cache_info: _CacheInfo, categories: Optional[List[str]], chunksize: Optional[Union[int, bool]], use_threads: bool, session: Optional[boto3.Session], ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: """Fetch cached data and return it as a pandas DataFrame (or list of DataFrames).""" _logger.debug("cache_info:\n%s", cache_info) if cache_info.query_execution_id is None: raise RuntimeError("Trying to resolve with cache but w/o any query execution ID.") query_metadata: _QueryMetadata = _get_query_metadata( query_execution_id=cache_info.query_execution_id, boto3_session=session, categories=categories, query_execution_payload=cache_info.query_execution_payload, ) if cache_info.file_format == "parquet": return _fetch_parquet_result( query_metadata=query_metadata, keep_files=True, categories=categories, chunksize=chunksize, use_threads=use_threads, boto3_session=session, ) if cache_info.file_format == "csv": return _fetch_csv_result( query_metadata=query_metadata, keep_files=True, chunksize=chunksize, use_threads=use_threads, boto3_session=session, ) raise exceptions.InvalidArgumentValue(f"Invalid data type: {cache_info.file_format}.")
def _resolve_query_without_cache_ctas( sql: str, database: Optional[str], data_source: Optional[str], s3_output: Optional[str], keep_files: bool, chunksize: Union[int, bool, None], categories: Optional[List[str]], encryption: Optional[str], workgroup: Optional[str], kms_key: Optional[str], wg_config: _WorkGroupConfig, name: Optional[str], use_threads: bool, s3_additional_kwargs: Optional[Dict[str, Any]], boto3_session: boto3.Session, ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]: path: str = f"{s3_output}/{name}" ext_location: str = "\n" if wg_config.enforced is True else f",\n external_location = '{path}'\n" sql = (f'CREATE TABLE "{name}"\n' f"WITH(\n" f" format = 'Parquet',\n" f" parquet_compression = 'SNAPPY'" f"{ext_location}" f") AS\n" f"{sql}") _logger.debug("sql: %s", sql) try: query_id: str = _start_query_execution( sql=sql, wg_config=wg_config, database=database, data_source=data_source, s3_output=s3_output, workgroup=workgroup, encryption=encryption, kms_key=kms_key, boto3_session=boto3_session, ) except botocore.exceptions.ClientError as ex: error: Dict[str, Any] = ex.response["Error"] if error[ "Code"] == "InvalidRequestException" and "Exception parsing query" in error[ "Message"]: raise exceptions.InvalidCtasApproachQuery( "Is not possible to wrap this query into a CTAS statement. Please use ctas_approach=False." ) if error[ "Code"] == "InvalidRequestException" and "extraneous input" in error[ "Message"]: raise exceptions.InvalidCtasApproachQuery( "Is not possible to wrap this query into a CTAS statement. Please use ctas_approach=False." ) raise ex _logger.debug("query_id: %s", query_id) try: query_metadata: _QueryMetadata = _get_query_metadata( query_execution_id=query_id, boto3_session=boto3_session, categories=categories, metadata_cache_manager=_cache_manager, ) except exceptions.QueryFailed as ex: msg: str = str(ex) if "Column name" in msg and "specified more than once" in msg: raise exceptions.InvalidCtasApproachQuery( f"Please, define distinct names for your columns OR pass ctas_approach=False. Root error message: {msg}" ) if "Column name not specified" in msg: raise exceptions.InvalidArgumentValue( "Please, define all columns names in your query. (E.g. 'SELECT MAX(col1) AS max_col1, ...')" ) if "Column type is unknown" in msg: raise exceptions.InvalidArgumentValue( "Please, don't leave undefined columns types in your query. You can cast to ensure it. " "(E.g. 'SELECT CAST(NULL AS INTEGER) AS MY_COL, ...')") raise ex return _fetch_parquet_result( query_metadata=query_metadata, keep_files=keep_files, categories=categories, chunksize=chunksize, use_threads=use_threads, s3_additional_kwargs=s3_additional_kwargs, boto3_session=boto3_session, )
def create_ctas_table( # pylint: disable=too-many-locals sql: str, database: str, ctas_table: Optional[str] = None, ctas_database: Optional[str] = None, s3_output: Optional[str] = None, storage_format: Optional[str] = None, write_compression: Optional[str] = None, partitioning_info: Optional[List[str]] = None, bucketing_info: Optional[Tuple[List[str], int]] = None, field_delimiter: Optional[str] = None, schema_only: bool = False, workgroup: Optional[str] = None, data_source: Optional[str] = None, encryption: Optional[str] = None, kms_key: Optional[str] = None, categories: Optional[List[str]] = None, wait: bool = False, boto3_session: Optional[boto3.Session] = None, ) -> Dict[str, Union[str, _QueryMetadata]]: """Create a new table populated with the results of a SELECT query. https://docs.aws.amazon.com/athena/latest/ug/create-table-as.html Parameters ---------- sql : str SELECT SQL query. database : str The name of the database where the original table is stored. ctas_table : Optional[str], optional The name of the CTAS table. If None, a name with a random string is used. ctas_database : Optional[str], optional The name of the alternative database where the CTAS table should be stored. If None, `database` is used, that is the CTAS table is stored in the same database as the original table. s3_output : Optional[str], optional The output Amazon S3 path. If None, either the Athena workgroup or client-side location setting is used. If a workgroup enforces a query results location, then it overrides this argument. storage_format : Optional[str], optional The storage format for the CTAS query results, such as ORC, PARQUET, AVRO, JSON, or TEXTFILE. PARQUET by default. write_compression : Optional[str], optional The compression type to use for any storage format that allows compression to be specified. partitioning_info : Optional[List[str]], optional A list of columns by which the CTAS table will be partitioned. bucketing_info : Optional[Tuple[List[str], int]], optional Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the second element. Only `str`, `int` and `bool` are supported as column data types for bucketing. field_delimiter : Optional[str], optional The single-character field delimiter for files in CSV, TSV, and text files. schema_only : bool, optional _description_, by default False workgroup : Optional[str], optional Athena workgroup. data_source : Optional[str], optional Data Source / Catalog name. If None, 'AwsDataCatalog' is used. encryption : str, optional Valid values: [None, 'SSE_S3', 'SSE_KMS']. Note: 'CSE_KMS' is not supported. kms_key : str, optional For SSE-KMS, this is the KMS key ARN or ID. categories: List[str], optional List of columns names that should be returned as pandas.Categorical. Recommended for memory restricted environments. wait : bool, default False Whether to wait for the query to finish and return a dictionary with the Query metadata. boto3_session : Optional[boto3.Session], optional Boto3 Session. The default boto3 session is used if boto3_session is None. Returns ------- Dict[str, Union[str, _QueryMetadata]] A dictionary with the the CTAS database and table names. If `wait` is `False`, the query ID is included, otherwise a Query metadata object is added instead. Examples -------- Select all into a new table and encrypt the results >>> import awswrangler as wr >>> wr.athena.create_ctas_table( ... sql="select * from table", ... database="default", ... encryption="SSE_KMS", ... kms_key="1234abcd-12ab-34cd-56ef-1234567890ab", ... ) {'ctas_database': 'default', 'ctas_table': 'temp_table_5669340090094....', 'ctas_query_id': 'cc7dfa81-831d-...'} Create a table with schema only >>> wr.athena.create_ctas_table( ... sql="select col1, col2 from table", ... database="default", ... ctas_table="my_ctas_table", ... schema_only=True, ... wait=True, ... ) Partition data and save to alternative CTAS database >>> wr.athena.create_ctas_table( ... sql="select * from table", ... database="default", ... ctas_database="my_ctas_db", ... storage_format="avro", ... write_compression="snappy", ... partitioning_info=["par0", "par1"], ... wait=True, ... ) """ ctas_table = catalog.sanitize_table_name( ctas_table) if ctas_table else f"temp_table_{uuid.uuid4().hex}" ctas_database = ctas_database if ctas_database else database fully_qualified_name = f'"{ctas_database}"."{ctas_table}"' wg_config: _WorkGroupConfig = _get_workgroup_config(session=boto3_session, workgroup=workgroup) s3_output = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=boto3_session) s3_output = s3_output[:-1] if s3_output[-1] == "/" else s3_output # If the workgroup enforces an external location, then it overrides the user supplied argument external_location_str: str = ( f" external_location = '{s3_output}/{ctas_table}',\n" if (not wg_config.enforced) and (s3_output) else "") # At least one property must be specified within `WITH()` in the query. We default to `PARQUET` for `storage_format` storage_format_str: str = f""" format = '{storage_format.upper() if storage_format else "PARQUET"}'""" write_compression_str: str = ( f" write_compression = '{write_compression.upper()}',\n" if write_compression else "") partitioning_str: str = f" partitioned_by = ARRAY{partitioning_info},\n" if partitioning_info else "" bucketing_str: str = ( f" bucketed_by = ARRAY{bucketing_info[0]},\n bucket_count = {bucketing_info[1]},\n" if bucketing_info else "") field_delimiter_str: str = f" field_delimiter = '{field_delimiter}',\n" if field_delimiter else "" schema_only_str: str = "\nWITH NO DATA" if schema_only else "" ctas_sql = (f"CREATE TABLE {fully_qualified_name}\n" f"WITH(\n" f"{external_location_str}" f"{partitioning_str}" f"{bucketing_str}" f"{field_delimiter_str}" f"{write_compression_str}" f"{storage_format_str}" f")\n" f"AS {sql}" f"{schema_only_str}") _logger.debug("ctas sql: %s", ctas_sql) try: query_execution_id: str = _start_query_execution( sql=ctas_sql, wg_config=wg_config, database=database, data_source=data_source, s3_output=s3_output, workgroup=workgroup, encryption=encryption, kms_key=kms_key, boto3_session=boto3_session, ) except botocore.exceptions.ClientError as ex: error: Dict[str, Any] = ex.response["Error"] if error[ "Code"] == "InvalidRequestException" and "Exception parsing query" in error[ "Message"]: raise exceptions.InvalidCtasApproachQuery( f"It is not possible to wrap this query into a CTAS statement. Root error message: {error['Message']}" ) if error[ "Code"] == "InvalidRequestException" and "extraneous input" in error[ "Message"]: raise exceptions.InvalidCtasApproachQuery( f"It is not possible to wrap this query into a CTAS statement. Root error message: {error['Message']}" ) raise ex response: Dict[str, Union[str, _QueryMetadata]] = { "ctas_database": ctas_database, "ctas_table": ctas_table } if wait: try: response["ctas_query_metadata"] = _get_query_metadata( query_execution_id=query_execution_id, boto3_session=boto3_session, categories=categories, metadata_cache_manager=_cache_manager, ) except exceptions.QueryFailed as ex: msg: str = str(ex) if "Column name" in msg and "specified more than once" in msg: raise exceptions.InvalidCtasApproachQuery( f"Please, define distinct names for your columns. Root error message: {msg}" ) if "Column name not specified" in msg: raise exceptions.InvalidArgumentValue( "Please, define all columns names in your query. (E.g. 'SELECT MAX(col1) AS max_col1, ...')" ) if "Column type is unknown" in msg: raise exceptions.InvalidArgumentValue( "Please, don't leave undefined columns types in your query. You can cast to ensure it. " "(E.g. 'SELECT CAST(NULL AS INTEGER) AS MY_COL, ...')") raise ex else: response["ctas_query_id"] = query_execution_id return response