def s3_folder_to_df(self, bucket: str, folder: str, prefix: str = None, **kwargs):
    """Read all files in a folder with the given prefix into a DataFrame.

    Args:
        bucket: S3 bucket name.
        folder: S3 folder.
        prefix (optional): File prefix.

    Returns:
        A DataFrame.
    """
    s3_get_kwargs = filter_kwargs(kwargs, S3_GET_KWARGS)
    read_table_kwargs = filter_kwargs(kwargs, PANDAS_READ_TABLE_KWARGS)
    if folder[-1] != "/":
        folder = folder + "/"
    pattern = make_valid_uri(folder, prefix or "/")
    allfiles = [f for f in self.list_object_keys(bucket, pattern) if f != folder]
    dfs = []
    for f in allfiles:
        LOGGER.info(f"Reading file {f}")
        dfs.append(self.s3_to_df(bucket, f, **s3_get_kwargs, **read_table_kwargs))
    return pd.concat(dfs)
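# Usage sketch for `s3_folder_to_df` (illustrative only): the instance name `rp`,
# the bucket, the folder, and the assumption that `sep` is whitelisted in
# PANDAS_READ_TABLE_KWARGS are not taken from this module.
#
#     df = rp.s3_folder_to_df(
#         "my-bucket",
#         "exports/daily",
#         prefix="part-",
#         sep=",",  # forwarded to pandas via PANDAS_READ_TABLE_KWARGS, if accepted
#     )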
def test_make_valid_uri(test_input):
    URI = "s3://bucket/path"
    assert make_valid_uri(*test_input) == URI
def test_make_valid_uri_raises_with_invalid_args():
    with pytest.raises(ValueError):
        make_valid_uri(["1"])
def redshift_to_s3(
    self,
    sql: str,
    bucket: str = None,
    path: str = None,
    prefix: str = None,
    iam_role: str = None,
    file_format: str = None,
    partition_by: List[str] = None,
    include_partition_column: bool = False,
    manifest: bool = False,
    header: bool = False,
    delimiter: str = None,
    fixedwidth: Union[str, int] = None,
    encrypted: bool = False,
    bzip2: bool = False,
    gzip: bool = False,
    zstd: bool = False,
    addquotes: bool = False,
    null: str = None,
    escape: bool = False,
    allowoverwrite: bool = False,
    parallel: str = "ON",
    maxfilesize: Union[str, int, float] = None,
    region: str = None,
):
    """Run sql and unload the result to S3.

    Args:
        sql: SQL query.
        bucket (optional): S3 bucket name, falls back to `default_bucket` if not
            provided.
        path (optional): S3 path (key prefix). Created if it does not exist.
        prefix (optional): Prefix of the set of files.
        iam_role (optional): IAM Role string. If provided, this will be used as
            authorization instead of access_key_id/secret_access_key. This
            feature is untested.
        file_format (optional): CSV or PARQUET.
        partition_by (optional): List of columns to partition the unloaded
            files by.
        include_partition_column (optional): Whether or not to include the
            partition columns in the unloaded files.
        manifest (optional): Whether or not to create the manifest file.
        header (optional): Whether or not to include header.
        delimiter (optional): Delimiter character if the output file is
            delimited.
        fixedwidth (optional): If not None, it will overwrite delimiter and use
            fixedwidth format instead.
        encrypted (optional): Whether or not the files should be encrypted.
        bzip2 (optional): Whether or not the files should be compressed with
            bzip2.
        gzip (optional): Whether or not the files should be compressed with
            gzip.
        zstd (optional): Whether or not the files should be compressed with
            zstd.
        addquotes (optional): Whether or not values with delimiter characters
            should be quoted.
        null (optional): Specify the NULL AS string.
        escape (optional): Whether to include the ESCAPE argument in UNLOAD.
        allowoverwrite (optional): Whether or not existing files should be
            overwritten. Redshift will fail with an error message if this is
            False and there are existing files.
        parallel (optional): ON or OFF. Whether or not to unload in parallel
            into multiple files.
        maxfilesize (optional): Maxfilesize argument for UNLOAD.
        region (optional): AWS region if the S3 region is different from the
            Redshift region.
    """
    destination_option = ""
    if path is not None:
        destination_option = make_valid_uri(destination_option, f"{path}")
        if destination_option[-1] != "/":
            destination_option = destination_option + "/"
    if prefix is not None:
        destination_option = make_valid_uri(destination_option, f"{prefix}")
    dest_bucket: Optional[str] = bucket or self.default_bucket
    if dest_bucket is None:
        raise ValueError("bucket cannot be None.")
    existing_keys = self._get_s3_pattern_existence(dest_bucket, destination_option)
    if existing_keys:
        warn_message = f"""\
        These keys already exist. It may cause data consistency issues.
        {existing_keys}
        """
        warnings.warn(dedent(warn_message))
    destination_option = make_valid_uri(f"s3://{dest_bucket}", destination_option)
    if sum([bzip2, gzip, zstd]) > 1:
        raise ValueError("Only one of [bzip2, gzip, zstd] should be True.")
    file_format_option = ""
    if file_format is not None:
        if file_format == "CSV":
            if fixedwidth:
                raise ValueError(
                    "fixedwidth should not be specified for CSV file_format."
                )
            delimiter = ","
            file_format_option = "format CSV"
        elif file_format == "PARQUET":
            file_format_option = "format PARQUET"
            if delimiter:
                raise ValueError(
                    "delimiter should not be specified for PARQUET file_format."
                )
            if fixedwidth:
                raise ValueError(
                    "fixedwidth should not be specified for PARQUET file_format."
                )
            if addquotes:
                raise ValueError(
                    "addquotes should not be specified for PARQUET file_format."
                )
            if escape:
                raise ValueError(
                    "escape should not be specified for PARQUET file_format."
                )
            if null:
                raise ValueError(
                    "null should not be specified for PARQUET file_format."
                )
            if header:
                raise ValueError(
                    "header should not be specified for PARQUET file_format."
                )
            if gzip:
                raise ValueError(
                    "gzip should not be specified for PARQUET file_format."
                )
            if bzip2:
                raise ValueError(
                    "bzip2 should not be specified for PARQUET file_format."
                )
            if zstd:
                raise ValueError(
                    "zstd should not be specified for PARQUET file_format."
                )
        else:
            raise ValueError("File format can only be CSV or PARQUET if specified.")
    partition_include_option = " INCLUDE" if include_partition_column else ""
    partition_option = (
        f"partition by ({','.join(partition_by)}){partition_include_option}"
        if partition_by
        else ""
    )
    manifest_option = "manifest" if manifest else ""
    header_option = "header" if header else ""
    delimiter_option = f"delimiter '{delimiter}'" if delimiter else "delimiter '|'"
    if fixedwidth is not None:
        fixedwidth_option = f"fixedwidth '{fixedwidth}'"
        delimiter_option = ""
    else:
        fixedwidth_option = ""
    encrypted_option = "encrypted" if encrypted else ""
    bzip2_option = "bzip2" if bzip2 else ""
    gzip_option = "gzip" if gzip else ""
    zstd_option = "zstd" if zstd else ""
    addquotes_option = "addquotes" if addquotes else ""
    null_option = f"null as '{null}'" if null is not None else ""
    escape_option = "escape" if escape else ""
    allowoverwrite_option = "allowoverwrite" if allowoverwrite else ""
    parallel_option = f"parallel {parallel}"
    maxfilesize_option = (
        f"maxfilesize '{maxfilesize}'" if maxfilesize is not None else ""
    )
    region_option = f"region {region}" if region else ""
    aws_access_key_id = self.aws_config.get("aws_access_key_id")
    aws_secret_access_key = self.aws_config.get("aws_secret_access_key")
    if (
        aws_access_key_id is None
        and aws_secret_access_key is None
        and iam_role is None
    ):
        raise ValueError(
            "Must provide at least one of [iam_role, aws_access_key_id/aws_secret_access_key]"
        )
    aws_token = self.aws_config.get("aws_session_token")
    aws_token_option = (
        f"session_token '{aws_token}'" if aws_token is not None else ""
    )
    if iam_role is not None:
        iam_role_option = f"iam_role '{iam_role}'"
        access_key_id_option = ""
        secret_access_key_option = ""
    else:
        iam_role_option = ""
        access_key_id_option = f"access_key_id '{aws_access_key_id}'"
        secret_access_key_option = f"secret_access_key '{aws_secret_access_key}'"
    sql = sql.replace("\n", " ")
    unload_template = f"""\
    unload ('{sql}')
    to '{destination_option}'
    {file_format_option}
    {partition_option}
    {manifest_option}
    {header_option}
    {delimiter_option}
    {fixedwidth_option}
    {encrypted_option}
    {bzip2_option}
    {gzip_option}
    {zstd_option}
    {addquotes_option}
    {null_option}
    {escape_option}
    {allowoverwrite_option}
    {parallel_option}
    {maxfilesize_option}
    {region_option}
    {access_key_id_option}
    {secret_access_key_option}
    {aws_token_option}
    {iam_role_option}
    """
    self.run_query(unload_template)
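# Usage sketch for `redshift_to_s3` (illustrative only): the instance name `rp`,
# the bucket, the path, and the query are assumptions, and the call presumes
# `aws_config` holds valid credentials or that an `iam_role` with UNLOAD
# permissions is passed instead.
#
#     rp.redshift_to_s3(
#         "select event_id, event_date from analytics.events",
#         bucket="my-bucket",
#         path="unloads/events",
#         prefix="events_",
#         file_format="PARQUET",
#         partition_by=["event_date"],
#         allowoverwrite=True,
#     )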
def df_to_redshift(
    self,
    df: pd.DataFrame,
    table_name: str,
    bucket: str = None,
    column_definition: dict = None,
    append: bool = False,
    path: str = None,
    file_name: str = None,
    cleanup: bool = True,
    **kwargs,
):
    """Pandas DataFrame to Redshift table.

    Args:
        df: Source dataframe.
        table_name: Redshift table name (optionally include schema name).
        bucket (optional): S3 bucket name, falls back to `default_bucket` if not
            present.
        column_definition (optional): Specify the column definition for CREATE
            TABLE. If not given and append is False, data types will be
            inferred.
        append (optional): If True, df will be appended to the Redshift table,
            otherwise the table will be dropped and recreated.
        path (optional): S3 key excluding file name.
        file_name (optional): If None, file_name will be randomly generated.
        cleanup (optional): Default True; the S3 file will be deleted after
            COPY.
        **kwargs: Keyword arguments to pass to Pandas `to_csv` and Redshift
            COPY.
    """
    bridge_bucket = bucket or self.default_bucket
    if not bridge_bucket:
        raise ValueError("Either bucket or default_bucket must be provided.")
    to_csv_kwargs = filter_kwargs(kwargs, PANDAS_TOCSV_KWARGS)
    copy_kwargs = filter_kwargs(kwargs, REDSHIFT_COPY_KWARGS)
    if column_definition is None:
        column_definition = map_types(OrderedDict(df.dtypes))
    # default pandas behavior is true when index is not specified
    if to_csv_kwargs.get("index") is None or to_csv_kwargs.get("index"):
        if df.index.name:
            full_column_definition = OrderedDict({df.index.name: df.index.dtype})
        else:
            full_column_definition = OrderedDict({"index": df.index.dtype})
        full_column_definition = map_types(full_column_definition)
        full_column_definition.update(column_definition)
        column_definition = full_column_definition
    check_invalid_columns(list(column_definition))
    if file_name is None:
        import uuid

        file_name = f"redpanda-{uuid.uuid4()}"
    s3_key = make_valid_uri(path if path is not None else "", file_name)
    self.df_to_s3(df, bucket=bridge_bucket, key=s3_key, **to_csv_kwargs)
    try:
        self.s3_to_redshift(
            bridge_bucket,
            s3_key,
            table_name,
            column_definition=column_definition,
            append=append,
            **copy_kwargs,
        )
    finally:
        if cleanup:
            self.delete_from_s3(bridge_bucket, s3_key)
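# Usage sketch for `df_to_redshift` (illustrative only): `rp` stands for a
# configured instance with Redshift credentials and either `bucket=` or
# `default_bucket` set, and the table name is an assumption. `index=False` is
# forwarded to `DataFrame.to_csv` via PANDAS_TOCSV_KWARGS.
#
#     df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
#     rp.df_to_redshift(df, "analytics.users", append=False, index=False)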