... path="s3://bucket/my_parquet_files/", ... con=wr.catalog.get_engine(connection="my_glue_conn_name"), ... table="my_table", ... schema="public" ... iam_role="arn:aws:iam::XXX:role/XXX" ... ) """ path = path if path.endswith("/") else f"{path}/" session: boto3.Session = _utils.ensure_session(session=boto3_session) paths: List[str] = s3.to_parquet( df=df, path=path, index=index, dataset=True, mode="append", dtype=dtype, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, max_rows_by_file=max_rows_by_file, )["paths"] s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session) copy_files_to_redshift( path=paths, manifest_directory=_utils.get_directory(path=path), con=con, table=table, schema=schema, iam_role=iam_role,
def copy_to_redshift(  # pylint: disable=too-many-arguments
    df: pd.DataFrame,
    path: str,
    con: sqlalchemy.engine.Engine,
    table: str,
    schema: str,
    iam_role: str,
    index: bool = False,
    dtype: Optional[Dict[str, str]] = None,
    mode: str = "append",
    diststyle: str = "AUTO",
    distkey: Optional[str] = None,
    sortstyle: str = "COMPOUND",
    sortkey: Optional[str] = None,
    primary_keys: Optional[List[str]] = None,
    varchar_lengths_default: int = 256,
    varchar_lengths: Optional[Dict[str, int]] = None,
    keep_files: bool = False,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> None:
    """Load a Pandas DataFrame as a table on Amazon Redshift using Parquet files on S3 as a stage.

    This is a **HIGH** latency and **HIGH** throughput alternative to `wr.db.to_sql()` for loading large
    DataFrames into Amazon Redshift through the **SQL COPY command**.

    This strategy has more overhead and requires more IAM privileges
    than the regular `wr.db.to_sql()` function, so it is only recommended
    for inserting 1MM+ rows at once.

    https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html

    Note
    ----
    If the table does not exist yet,
    it will be automatically created for you
    using the Parquet metadata to
    infer the column data types.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas DataFrame.
    path : str
        S3 path to write stage files (e.g. s3://bucket_name/any_name/).
    con : sqlalchemy.engine.Engine
        SQLAlchemy Engine. Please use wr.db.get_engine(), wr.db.get_redshift_temp_engine()
        or wr.catalog.get_engine().
    table : str
        Table name.
    schema : str
        Schema name.
    iam_role : str
        AWS IAM role with the related permissions.
    index : bool
        True to store the DataFrame index in the files, False to ignore it.
    dtype : Dict[str, str], optional
        Dictionary of column names and Athena/Glue types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        (e.g. {'col name': 'bigint', 'col2 name': 'int'})
    mode : str
        Append, overwrite or upsert.
    diststyle : str
        Redshift distribution style. Must be in ["AUTO", "EVEN", "ALL", "KEY"].
        https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
    distkey : str, optional
        Specifies a column name or positional number for the distribution key.
    sortstyle : str
        Sorting can be "COMPOUND" or "INTERLEAVED".
        https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
    sortkey : str, optional
        Column to be used as the sort key.
    primary_keys : List[str], optional
        Primary keys.
    varchar_lengths_default : int
        The size that will be set for all VARCHAR columns not specified with varchar_lengths.
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR lengths by column name. (e.g. {"col1": 10, "col5": 200}).
    keep_files : bool
        True to keep the stage files after the load, False to delete them.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled, os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs : Dict[str, str], optional
        Forwarded to s3fs, useful for server-side encryption.
        https://s3fs.readthedocs.io/en/latest/#serverside-encryption

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.db.copy_to_redshift(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
path="s3://bucket/my_parquet_files/", ... con=wr.catalog.get_engine(connection="my_glue_conn_name"), ... table="my_table", ... schema="public" ... iam_role="arn:aws:iam::XXX:role/XXX" ... ) """ path = path if path.endswith("/") else f"{path}/" session: boto3.Session = _utils.ensure_session(session=boto3_session) paths: List[str] = s3.to_parquet( # type: ignore df=df, path=path, index=index, dataset=True, mode="append", dtype=dtype, use_threads=use_threads, boto3_session=session, s3_additional_kwargs=s3_additional_kwargs, )["paths"] s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session) copy_files_to_redshift( path=paths, manifest_directory=_utils.get_directory(path=path), con=con, table=table, schema=schema, iam_role=iam_role, mode=mode, diststyle=diststyle, distkey=distkey, sortstyle=sortstyle, sortkey=sortkey, primary_keys=primary_keys, varchar_lengths_default=varchar_lengths_default, varchar_lengths=varchar_lengths, use_threads=use_threads, boto3_session=session, ) if keep_files is False: s3.delete_objects(path=paths, use_threads=use_threads, boto3_session=session)