def _redshift_types_from_path(
    path: Optional[Union[str, List[str]]],
    varchar_lengths_default: int,
    varchar_lengths: Optional[Dict[str, int]],
    parquet_infer_sampling: float,
    use_threads: bool,
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> Dict[str, str]:
    """Extract Redshift data types from the metadata of Parquet files stored on S3."""
    _varchar_lengths: Dict[str, int] = {} if varchar_lengths is None else varchar_lengths
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    athena_types, _ = s3.read_parquet_metadata(
        path=path,
        sampling=parquet_infer_sampling,
        dataset=False,
        use_threads=use_threads,
        boto3_session=session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    _logger.debug("athena_types: %s", athena_types)
    redshift_types: Dict[str, str] = {}
    for col_name, col_type in athena_types.items():
        # Use the per-column VARCHAR override when available, otherwise the default length.
        length: int = _varchar_lengths[col_name] if col_name in _varchar_lengths else varchar_lengths_default
        redshift_types[col_name] = _data_types.athena2redshift(dtype=col_type, varchar_length=length)
    return redshift_types
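
# Illustrative sketch (kept as comments so the module stays importable): how the private
# helper above could be called to infer Redshift DDL types before issuing a COPY.
# The bucket/prefix and the "event_name" column override are hypothetical placeholders,
# not values used anywhere in this module.
#
# redshift_types = _redshift_types_from_path(
#     path="s3://bucket/my_parquet_files/",
#     varchar_lengths_default=256,
#     varchar_lengths={"event_name": 64},  # columns not listed fall back to varchar_lengths_default
#     parquet_infer_sampling=1.0,          # inspect the metadata of every file
#     use_threads=True,
#     boto3_session=None,                  # _utils.ensure_session() falls back to the default session
#     s3_additional_kwargs=None,
# )
# # Result maps column names to Redshift types, e.g. something like
# # {"event_name": "VARCHAR(64)", "event_ts": "TIMESTAMP", ...}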
def copy_files_to_redshift(  # pylint: disable=too-many-locals,too-many-arguments
    path: Union[str, List[str]],
    manifest_directory: str,
    con: sqlalchemy.engine.Engine,
    table: str,
    schema: str,
    iam_role: str,
    parquet_infer_sampling: float = 1.0,
    mode: str = "append",
    diststyle: str = "AUTO",
    distkey: Optional[str] = None,
    sortstyle: str = "COMPOUND",
    sortkey: Optional[List[str]] = None,
    primary_keys: Optional[List[str]] = None,
    varchar_lengths_default: int = 256,
    varchar_lengths: Optional[Dict[str, int]] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> None:
    """Load Parquet files from S3 to a Table on Amazon Redshift (Through COPY command).

    https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html

    This function accepts Unix shell-style wildcards in the path argument.
    * (matches everything), ? (matches any single character),
    [seq] (matches any character in seq), [!seq] (matches any character not in seq).

    Note
    ----
    If the table does not exist yet,
    it will be automatically created for you
    using the Parquet metadata to
    infer the columns data types.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (accepts Unix shell-style wildcards)
        (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    manifest_directory : str
        S3 prefix (e.g. s3://bucket/prefix)
    con : sqlalchemy.engine.Engine
        SQLAlchemy Engine. Please use
        wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine().
    table : str
        Table name
    schema : str
        Schema name
    iam_role : str
        AWS IAM role with the related permissions.
    parquet_infer_sampling : float
        Random sample ratio of files that will have the metadata inspected.
        Must be `0.0 < sampling <= 1.0`.
        The higher, the more accurate.
        The lower, the faster.
    mode : str
        Append, overwrite or upsert.
    diststyle : str
        Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"].
        https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
    distkey : str, optional
        Specifies a column name or positional number for the distribution key.
    sortstyle : str
        Sorting can be "COMPOUND" or "INTERLEAVED".
        https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
    sortkey : List[str], optional
        List of columns to be sorted.
    primary_keys : List[str], optional
        Primary keys.
    varchar_lengths_default : int
        The size that will be set for all VARCHAR columns not specified with varchar_lengths.
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by columns. (e.g. {"col1": 10, "col5": 200}).
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs:
        Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass",
        "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging".
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.db.copy_files_to_redshift(
    ...     path="s3://bucket/my_parquet_files/",
    ...     manifest_directory="s3://bucket/my_manifest_directory/",
    ...     con=wr.catalog.get_engine(connection="my_glue_conn_name"),
    ...     table="my_table",
    ...     schema="public",
    ...     iam_role="arn:aws:iam::XXX:role/XXX",
    ... )

    """
    _varchar_lengths: Dict[str, int] = {} if varchar_lengths is None else varchar_lengths
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    paths: List[str] = _path2list(path=path, boto3_session=session)  # pylint: disable=protected-access
    manifest_directory = manifest_directory if manifest_directory.endswith("/") else f"{manifest_directory}/"
    manifest_path: str = f"{manifest_directory}manifest.json"
    write_redshift_copy_manifest(
        manifest_path=manifest_path,
        paths=paths,
        use_threads=use_threads,
        boto3_session=session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    s3.wait_objects_exist(paths=paths + [manifest_path], use_threads=False, boto3_session=session)
    athena_types, _ = s3.read_parquet_metadata(
        path=paths, sampling=parquet_infer_sampling, dataset=False, use_threads=use_threads, boto3_session=session
    )
    _logger.debug("athena_types: %s", athena_types)
    redshift_types: Dict[str, str] = {}
    for col_name, col_type in athena_types.items():
        length: int = _varchar_lengths[col_name] if col_name in _varchar_lengths else varchar_lengths_default
        redshift_types[col_name] = _data_types.athena2redshift(dtype=col_type, varchar_length=length)
    with con.begin() as _con:
        created_table, created_schema = _rs_create_table(
            con=_con,
            table=table,
            schema=schema,
            redshift_types=redshift_types,
            mode=mode,
            diststyle=diststyle,
            sortstyle=sortstyle,
            distkey=distkey,
            sortkey=sortkey,
            primary_keys=primary_keys,
        )
        _rs_copy(
            con=_con,
            table=created_table,
            schema=created_schema,
            manifest_path=manifest_path,
            iam_role=iam_role,
            num_files=len(paths),
        )
        if table != created_table:  # upsert
            _rs_upsert(con=_con, schema=schema, table=table, temp_table=created_table, primary_keys=primary_keys)
    s3.delete_objects(path=[manifest_path], use_threads=use_threads, boto3_session=session)