def copy_from_files(  # pylint: disable=too-many-locals,too-many-arguments
    path: str,
    con: redshift_connector.Connection,
    table: str,
    schema: str,
    iam_role: str,
    parquet_infer_sampling: float = 1.0,
    mode: str = "append",
    diststyle: str = "AUTO",
    distkey: Optional[str] = None,
    sortstyle: str = "COMPOUND",
    sortkey: Optional[List[str]] = None,
    primary_keys: Optional[List[str]] = None,
    varchar_lengths_default: int = 256,
    varchar_lengths: Optional[Dict[str, int]] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> None:
    """Load Parquet files from S3 to a Table on Amazon Redshift (through the COPY command).

    https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html

    Note
    ----
    If the table does not exist yet, it will be automatically created for you,
    using the Parquet metadata to infer the columns' data types.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    path : str
        S3 prefix (e.g. s3://bucket/prefix/)
    con : redshift_connector.Connection
        Use redshift_connector.connect() to use credentials directly or
        wr.redshift.connect() to fetch it from the Glue Catalog.
    table : str
        Table name
    schema : str
        Schema name
    iam_role : str
        AWS IAM role with the related permissions.
    parquet_infer_sampling : float
        Random sample ratio of files that will have the metadata inspected.
        Must be `0.0 < sampling <= 1.0`.
        The higher, the more accurate.
        The lower, the faster.
    mode : str
        Append, overwrite or upsert.
    diststyle : str
        Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"].
        https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
    distkey : str, optional
        Specifies a column name or positional number for the distribution key.
    sortstyle : str
        Sorting can be "COMPOUND" or "INTERLEAVED".
        https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
    sortkey : List[str], optional
        List of columns to be sorted.
    primary_keys : List[str], optional
        Primary keys.
    varchar_lengths_default : int
        The size that will be set for all VARCHAR columns not specified with varchar_lengths.
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by columns. (e.g. {"col1": 10, "col5": 200}).
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled, os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs : Dict[str, str], optional
        Forwarded to botocore requests.
        Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass",
        "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging".
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> con = wr.redshift.connect("MY_GLUE_CONNECTION")
    >>> wr.redshift.copy_from_files(
    ...     path="s3://bucket/my_parquet_files/",
    ...     con=con,
    ...     table="my_table",
    ...     schema="public",
    ...     iam_role="arn:aws:iam::XXX:role/XXX",
    ... )
    >>> con.close()

    """
    con.autocommit = False
    try:
        with con.cursor() as cursor:
            created_table, created_schema = _create_table(
                df=None,
                path=path,
                parquet_infer_sampling=parquet_infer_sampling,
                cursor=cursor,
                table=table,
                schema=schema,
                mode=mode,
                diststyle=diststyle,
                sortstyle=sortstyle,
                distkey=distkey,
                sortkey=sortkey,
                primary_keys=primary_keys,
                varchar_lengths_default=varchar_lengths_default,
                varchar_lengths=varchar_lengths,
                index=False,
                dtype=None,
                use_threads=use_threads,
                boto3_session=boto3_session,
                s3_additional_kwargs=s3_additional_kwargs,
            )
            _copy(
                cursor=cursor,
                path=path,
                table=created_table,
                schema=created_schema,
                iam_role=iam_role,
            )
            if table != created_table:  # upsert
                _upsert(cursor=cursor, schema=schema, table=table, temp_table=created_table, primary_keys=primary_keys)
            con.commit()
    except Exception as ex:
        con.rollback()
        _logger.error(ex)
        raise
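

# Illustrative sketch (not part of the library API): one way copy_from_files could be called in
# "upsert" mode, which loads into a staging table and then merges it into the target table.
# The Glue connection name, S3 prefix, IAM role ARN, and the "id" primary key column below are
# placeholders for this example only.
def _example_copy_from_files_upsert() -> None:
    import awswrangler as wr

    con = wr.redshift.connect("MY_GLUE_CONNECTION")  # placeholder Glue Catalog connection name
    try:
        wr.redshift.copy_from_files(
            path="s3://bucket/my_parquet_files/",  # placeholder S3 prefix containing Parquet files
            con=con,
            table="my_table",
            schema="public",
            iam_role="arn:aws:iam::XXX:role/XXX",  # placeholder IAM role with access to the S3 prefix
            mode="upsert",  # merge into "public"."my_table" instead of appending
            primary_keys=["id"],  # placeholder column(s) used to match rows during the upsert
        )
    finally:
        con.close()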


def to_sql(
    df: pd.DataFrame,
    con: redshift_connector.Connection,
    table: str,
    schema: str,
    mode: str = "append",
    index: bool = False,
    dtype: Optional[Dict[str, str]] = None,
    diststyle: str = "AUTO",
    distkey: Optional[str] = None,
    sortstyle: str = "COMPOUND",
    sortkey: Optional[List[str]] = None,
    primary_keys: Optional[List[str]] = None,
    varchar_lengths_default: int = 256,
    varchar_lengths: Optional[Dict[str, int]] = None,
) -> None:
    """Write records stored in a DataFrame into Redshift.

    Note
    ----
    For large DataFrames (1K+ rows) consider the function **wr.redshift.copy()**.

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    con : redshift_connector.Connection
        Use redshift_connector.connect() to use credentials directly or
        wr.redshift.connect() to fetch it from the Glue Catalog.
    table : str
        Table name
    schema : str
        Schema name
    mode : str
        Append, overwrite or upsert.
    index : bool
        True to store the DataFrame index as a column in the table,
        otherwise False to ignore it.
    dtype : Dict[str, str], optional
        Dictionary of column names and Redshift types to cast them to.
        Useful when you have columns with undetermined or mixed data types.
        (e.g. {'col name': 'VARCHAR(10)', 'col2 name': 'FLOAT'})
    diststyle : str
        Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"].
        https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
    distkey : str, optional
        Specifies a column name or positional number for the distribution key.
    sortstyle : str
        Sorting can be "COMPOUND" or "INTERLEAVED".
        https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
    sortkey : List[str], optional
        List of columns to be sorted.
    primary_keys : List[str], optional
        Primary keys.
    varchar_lengths_default : int
        The size that will be set for all VARCHAR columns not specified with varchar_lengths.
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by columns. (e.g. {"col1": 10, "col5": 200}).

    Returns
    -------
    None
        None.

    Examples
    --------
    Writing to Redshift using a Glue Catalog Connection

    >>> import awswrangler as wr
    >>> con = wr.redshift.connect("MY_GLUE_CONNECTION")
    >>> wr.redshift.to_sql(
    ...     df=df,
    ...     table="my_table",
    ...     schema="public",
    ...     con=con,
    ... )
    >>> con.close()

    """
    if df.empty:
        raise exceptions.EmptyDataFrame()
    _validate_connection(con=con)
    con.autocommit = False
    try:
        with con.cursor() as cursor:
            created_table, created_schema = _create_table(
                df=df,
                path=None,
                cursor=cursor,
                table=table,
                schema=schema,
                mode=mode,
                index=index,
                dtype=dtype,
                diststyle=diststyle,
                sortstyle=sortstyle,
                distkey=distkey,
                sortkey=sortkey,
                primary_keys=primary_keys,
                varchar_lengths_default=varchar_lengths_default,
                varchar_lengths=varchar_lengths,
            )
            if index:
                df.reset_index(level=df.index.names, inplace=True)
            placeholders: str = ", ".join(["%s"] * len(df.columns))
            schema_str = f"{created_schema}." if created_schema else ""
            sql: str = f"INSERT INTO {schema_str}{created_table} VALUES ({placeholders})"
            _logger.debug("sql: %s", sql)
            parameters: List[List[Any]] = _db_utils.extract_parameters(df=df)
            cursor.executemany(sql, parameters)
            if table != created_table:  # upsert
                _upsert(cursor=cursor, schema=schema, table=table, temp_table=created_table, primary_keys=primary_keys)
            con.commit()
    except Exception as ex:
        con.rollback()
        _logger.error(ex)
        raise
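

# Illustrative sketch (not part of the library API): a small to_sql call that overrides the
# inferred Redshift type of one column via `dtype`. The Glue connection name, the DataFrame
# contents, and the VARCHAR(64) override are placeholders for this example only.
def _example_to_sql_with_dtype() -> None:
    import awswrangler as wr
    import pandas as pd

    df = pd.DataFrame({"id": [1, 2], "name": ["foo", "boo"]})  # placeholder data
    con = wr.redshift.connect("MY_GLUE_CONNECTION")  # placeholder Glue Catalog connection name
    try:
        wr.redshift.to_sql(
            df=df,
            con=con,
            table="my_table",
            schema="public",
            mode="overwrite",  # replace any existing table instead of appending
            dtype={"name": "VARCHAR(64)"},  # force a Redshift type instead of the inferred one
        )
    finally:
        con.close()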