def get_table_location(database: str, table: str, boto3_session: Optional[boto3.Session] = None) -> str:
    """Get table's location on Glue catalog.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Table's location.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.get_table_location(database='default', table='my_table')
    's3://bucket/prefix/'

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    res: Dict[str, Any] = client_glue.get_table(DatabaseName=database, Name=table)
    try:
        return res["Table"]["StorageDescriptor"]["Location"]
    except KeyError:  # pragma: no cover
        raise exceptions.InvalidTable(f"{database}.{table}")
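
# Minimal usage sketch (illustrative only, not part of the library): resolve a table's
# S3 prefix with get_table_location() and read the files back with wr.s3.read_parquet().
# The helper name and the database/table names below are placeholders.
def _example_read_from_table_location():  # pragma: no cover
    import awswrangler as wr

    path = wr.catalog.get_table_location(database="default", table="my_table")
    return wr.s3.read_parquet(path=path, dataset=True)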
def overwrite_table_parameters(
    parameters: Dict[str, str],
    database: str,
    table: str,
    transaction_id: Optional[str] = None,
    catalog_versioning: bool = False,
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, str]:
    """Overwrite all existing parameters.

    Parameters
    ----------
    parameters : Dict[str, str]
        e.g. {"source": "mysql", "destination": "datalake"}
    database : str
        Database name.
    table : str
        Table name.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    catalog_versioning : bool
        If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, str]
        All parameters after the overwrite (the same as received).

    Examples
    --------
    >>> import awswrangler as wr
    >>> pars = wr.catalog.overwrite_table_parameters(
    ...     parameters={"source": "mysql", "destination": "datalake"},
    ...     database="...",
    ...     table="...")

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    table_input: Optional[Dict[str, Any]] = _get_table_input(
        database=database, table=table, transaction_id=transaction_id, catalog_id=catalog_id, boto3_session=session
    )
    if table_input is None:
        raise exceptions.InvalidTable(f"Table {table} does not exist on database {database}.")
    return _overwrite_table_parameters(
        parameters=parameters,
        database=database,
        catalog_id=catalog_id,
        transaction_id=transaction_id,
        table_input=table_input,
        boto3_session=session,
        catalog_versioning=catalog_versioning,
    )
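
# Hedged sketch (not part of the library): since overwrite_table_parameters() replaces
# every existing parameter, a caller who only wants to add or update keys can merge the
# current parameters first. This assumes the Glue GetTable response exposes them under
# res["Table"]["Parameters"]; the helper name and database/table names are placeholders.
def _example_merge_and_overwrite_parameters():  # pragma: no cover
    import awswrangler as wr

    client_glue = _utils.client(service_name="glue", session=None)
    res = client_glue.get_table(DatabaseName="default", Name="my_table")
    current: Dict[str, str] = res["Table"].get("Parameters", {})
    merged: Dict[str, str] = {**current, "source": "mysql", "destination": "datalake"}
    return wr.catalog.overwrite_table_parameters(
        parameters=merged, database="default", table="my_table", catalog_versioning=True
    )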
def read_parquet_table(
    table: str,
    database: str,
    catalog_id: Optional[str] = None,
    partition_filter: Optional[Callable[[Dict[str, str]], bool]] = None,
    columns: Optional[List[str]] = None,
    validate_schema: bool = True,
    categories: Optional[List[str]] = None,
    safe: bool = True,
    chunked: Union[bool, int] = False,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Read Apache Parquet table registered on AWS Glue Catalog.

    Note
    ----
    ``Batching`` (`chunked` argument) (Memory Friendly):

    Will enable the function to return an Iterable of DataFrames instead of a regular DataFrame.

    There are two batching strategies on Wrangler:

    - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset.

    - If **chunked=INTEGER**, Wrangler will paginate through files slicing and concatenating
      to return DataFrames with the number of rows equal to the received INTEGER.

    `P.S.` `chunked=True` is faster and uses less memory while `chunked=INTEGER` is more precise
    in the number of rows for each DataFrame.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    table : str
        AWS Glue Catalog table name.
    database : str
        AWS Glue Catalog database name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    partition_filter: Optional[Callable[[Dict[str, str]], bool]]
        Callback Function filters to apply on PARTITION columns (PUSH-DOWN filter).
        This function MUST receive a single argument (Dict[str, str]) where keys are partition
        names and values are partition values. Partition values will always be strings extracted from S3.
        This function MUST return a bool, True to read the partition or False to ignore it.
        Ignored if `dataset=False`.
        E.g ``lambda x: True if x["year"] == "2020" and x["month"] == "1" else False``
        https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/023%20-%20Flexible%20Partitions%20Filter.ipynb
    columns : List[str], optional
        Names of columns to read from the file(s).
    validate_schema : bool
        Check that individual file schemas are all the same / compatible. Schemas within a
        folder prefix should all be the same. Disable if you have schemas that are different
        and want to disable this check.
    categories: Optional[List[str]], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    safe : bool, default True
        For certain data types, a cast is needed in order to store the
        data in a pandas DataFrame or Series (e.g. timestamps are always
        stored as nanoseconds in pandas). This option controls whether it
        is a safe cast or not.
    chunked : bool
        If True, will break the data into smaller DataFrames (non-deterministic number of lines).
        Otherwise, return a single DataFrame with the whole data.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered.

    Returns
    -------
    Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]]
        Pandas DataFrame or a Generator in case of `chunked=True`.

    Examples
    --------
    Reading Parquet Table

    >>> import awswrangler as wr
    >>> df = wr.s3.read_parquet_table(database='...', table='...')

    Reading Parquet Table encrypted

    >>> import awswrangler as wr
    >>> df = wr.s3.read_parquet_table(
    ...     database='...',
    ...     table='...',
    ...     s3_additional_kwargs={
    ...         'ServerSideEncryption': 'aws:kms',
    ...         'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'
    ...     }
    ... )

    Reading Parquet Table in chunks (Chunk by file)

    >>> import awswrangler as wr
    >>> dfs = wr.s3.read_parquet_table(database='...', table='...', chunked=True)
    >>> for df in dfs:
    >>>     print(df)  # Smaller Pandas DataFrame

    Reading Parquet Table with PUSH-DOWN filter over partitions

    >>> import awswrangler as wr
    >>> my_filter = lambda x: True if x["city"].startswith("new") else False
    >>> df = wr.s3.read_parquet_table(database='...', table='...', partition_filter=my_filter)

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    args: Dict[str, Any] = {"DatabaseName": database, "Name": table}
    if catalog_id is not None:
        args["CatalogId"] = catalog_id
    res: Dict[str, Any] = client_glue.get_table(**args)
    try:
        path: str = res["Table"]["StorageDescriptor"]["Location"]
    except KeyError as ex:
        raise exceptions.InvalidTable(f"Missing s3 location for {database}.{table}.") from ex
    return _data_types.cast_pandas_with_athena_types(
        df=read_parquet(
            path=path,
            partition_filter=partition_filter,
            columns=columns,
            validate_schema=validate_schema,
            categories=categories,
            safe=safe,
            chunked=chunked,
            dataset=True,
            use_threads=use_threads,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        ),
        dtype=_extract_partitions_dtypes_from_table_details(response=res),
    )
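
# Hedged sketch (illustrative only, not part of the library): combine a partition
# push-down filter with chunked=INTEGER batching as described in the docstring above,
# processing the table in DataFrames of up to 100,000 rows each. The helper name and
# the database/table names are placeholders.
def _example_chunked_filtered_read():  # pragma: no cover
    import awswrangler as wr

    dfs = wr.s3.read_parquet_table(
        database="default",
        table="my_table",
        partition_filter=lambda x: x["year"] == "2020",
        chunked=100_000,
    )
    for df in dfs:
        print(df.shape)  # each chunk holds at most 100,000 rows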
def read_parquet_table(
    table: str,
    database: str,
    filename_suffix: Union[str, List[str], None] = None,
    filename_ignore_suffix: Union[str, List[str], None] = None,
    catalog_id: Optional[str] = None,
    partition_filter: Optional[Callable[[Dict[str, str]], bool]] = None,
    columns: Optional[List[str]] = None,
    validate_schema: bool = True,
    categories: Optional[List[str]] = None,
    safe: bool = True,
    map_types: bool = True,
    chunked: Union[bool, int] = False,
    use_threads: Union[bool, int] = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Read Apache Parquet table registered on AWS Glue Catalog.

    Note
    ----
    ``Batching`` (`chunked` argument) (Memory Friendly):

    Will enable the function to return an Iterable of DataFrames instead of a regular DataFrame.

    There are two batching strategies on Wrangler:

    - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset.

    - If **chunked=INTEGER**, Wrangler will paginate through files slicing and concatenating
      to return DataFrames with the number of rows equal to the received INTEGER.

    `P.S.` `chunked=True` is faster and uses less memory while `chunked=INTEGER` is more precise
    in the number of rows for each DataFrame.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    table : str
        AWS Glue Catalog table name.
    database : str
        AWS Glue Catalog database name.
    filename_suffix: Union[str, List[str], None]
        Suffix or List of suffixes to be read (e.g. [".gz.parquet", ".snappy.parquet"]).
        If None, will try to read all files. (default)
    filename_ignore_suffix: Union[str, List[str], None]
        Suffix or List of suffixes for S3 keys to be ignored (e.g. [".csv", "_SUCCESS"]).
        If None, will try to read all files. (default)
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    partition_filter: Optional[Callable[[Dict[str, str]], bool]]
        Callback Function filters to apply on PARTITION columns (PUSH-DOWN filter).
        This function MUST receive a single argument (Dict[str, str]) where keys are partition
        names and values are partition values. Partition values will always be strings extracted from S3.
        This function MUST return a bool, True to read the partition or False to ignore it.
        Ignored if `dataset=False`.
        E.g ``lambda x: True if x["year"] == "2020" and x["month"] == "1" else False``
        https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/023%20-%20Flexible%20Partitions%20Filter.html
    columns : List[str], optional
        Names of columns to read from the file(s).
    validate_schema : bool
        Check that individual file schemas are all the same / compatible. Schemas within a
        folder prefix should all be the same. Disable if you have schemas that are different
        and want to disable this check.
    categories: Optional[List[str]], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    safe : bool, default True
        For certain data types, a cast is needed in order to store the
        data in a pandas DataFrame or Series (e.g. timestamps are always
        stored as nanoseconds in pandas). This option controls whether it
        is a safe cast or not.
    map_types : bool, default True
        True to convert pyarrow DataTypes to pandas ExtensionDtypes. It is
        used to override the default pandas type for conversion of built-in
        pyarrow types or in absence of pandas_metadata in the Table schema.
    chunked : bool
        If True, will break the data into smaller DataFrames (non-deterministic number of lines).
        Otherwise, return a single DataFrame with the whole data.
    use_threads : Union[bool, int]
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
        If integer is provided, specified number is used.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered.

    Returns
    -------
    Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]]
        Pandas DataFrame or a Generator in case of `chunked=True`.

    Examples
    --------
    Reading Parquet Table

    >>> import awswrangler as wr
    >>> df = wr.s3.read_parquet_table(database='...', table='...')

    Reading Parquet Table encrypted

    >>> import awswrangler as wr
    >>> df = wr.s3.read_parquet_table(
    ...     database='...',
    ...     table='...',
    ...     s3_additional_kwargs={
    ...         'ServerSideEncryption': 'aws:kms',
    ...         'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'
    ...     }
    ... )

    Reading Parquet Table in chunks (Chunk by file)

    >>> import awswrangler as wr
    >>> dfs = wr.s3.read_parquet_table(database='...', table='...', chunked=True)
    >>> for df in dfs:
    >>>     print(df)  # Smaller Pandas DataFrame

    Reading Parquet Table with PUSH-DOWN filter over partitions

    >>> import awswrangler as wr
    >>> my_filter = lambda x: True if x["city"].startswith("new") else False
    >>> df = wr.s3.read_parquet_table(database='...', table='...', partition_filter=my_filter)

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    args: Dict[str, Any] = {"DatabaseName": database, "Name": table}
    if catalog_id is not None:
        args["CatalogId"] = catalog_id
    res: Dict[str, Any] = client_glue.get_table(**args)
    try:
        location: str = res["Table"]["StorageDescriptor"]["Location"]
        path: str = location if location.endswith("/") else f"{location}/"
    except KeyError as ex:
        raise exceptions.InvalidTable(f"Missing s3 location for {database}.{table}.") from ex
    path_root: Optional[str] = None
    paths: Union[str, List[str]] = path
    # If filter is available, fetch & filter out partitions
    # Then list objects & process individual object keys under path_root
    if partition_filter is not None:
        available_partitions_dict = _get_partitions(
            database=database,
            table=table,
            catalog_id=catalog_id,
            boto3_session=boto3_session,
        )
        available_partitions = list(available_partitions_dict.keys())
        if available_partitions:
            paths = []
            path_root = path
            partitions: Union[str, List[str]] = _apply_partition_filter(
                path_root=path_root, paths=available_partitions, filter_func=partition_filter
            )
            for partition in partitions:
                paths += _path2list(
                    path=partition,
                    boto3_session=boto3_session,
                    suffix=filename_suffix,
                    ignore_suffix=_get_path_ignore_suffix(path_ignore_suffix=filename_ignore_suffix),
                    s3_additional_kwargs=s3_additional_kwargs,
                )
    df = read_parquet(
        path=paths,
        path_root=path_root,
        path_suffix=filename_suffix if path_root is None else None,
        path_ignore_suffix=filename_ignore_suffix if path_root is None else None,
        columns=columns,
        validate_schema=validate_schema,
        categories=categories,
        safe=safe,
        map_types=map_types,
        chunked=chunked,
        dataset=True,
        use_threads=use_threads,
        boto3_session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    partial_cast_function = functools.partial(
        _data_types.cast_pandas_with_athena_types, dtype=_extract_partitions_dtypes_from_table_details(response=res)
    )
    if isinstance(df, pd.DataFrame):
        return partial_cast_function(df)
    # df is a generator, so map is needed for casting dtypes
    return map(partial_cast_function, df)
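
# Hedged sketch (illustrative only, not part of the library): exercise the newer
# signature's filename filters so non-Parquet artifacts (e.g. "_SUCCESS" markers) under
# the table location are skipped, with pyarrow -> pandas ExtensionDtype mapping kept on
# and a fixed thread count. The helper name and database/table names are placeholders.
def _example_read_with_filename_filters():  # pragma: no cover
    import awswrangler as wr

    return wr.s3.read_parquet_table(
        database="default",
        table="my_table",
        filename_suffix=".snappy.parquet",
        filename_ignore_suffix="_SUCCESS",
        map_types=True,
        use_threads=4,
    )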