Example #1
    def s3_folder_to_df(self,
                        bucket: str,
                        folder: str,
                        prefix: str = None,
                        **kwargs):
        """Read all files in folder with prefix to a df.

        Args:
            bucket: S3 bucket name.
            folder: S3 folder.
            prefix: File prefix.

        Returns:
            A DataFrame.
        """
        s3_get_kwargs = filter_kwargs(kwargs, S3_GET_KWARGS)
        read_table_kwargs = filter_kwargs(kwargs, PANDAS_READ_TABLE_KWARGS)
        if folder[-1] != "/":
            folder = folder + "/"
        pattern = make_valid_uri(folder, prefix or "/")
        allfiles = [
            f for f in self.list_object_keys(bucket, pattern) if f != folder
        ]
        dfs = []
        for f in allfiles:
            LOGGER.info(f"Reading file {f}")
            dfs.append(
                self.s3_to_df(bucket, f, **s3_get_kwargs, **read_table_kwargs))
        return pd.concat(dfs)
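
The method above splits **kwargs between the S3 client and pandas via filter_kwargs. Below is a minimal sketch of how such a helper and its allow-list constants might look, assuming filter_kwargs simply keeps the allowed keys; the real implementation and the contents of S3_GET_KWARGS / PANDAS_READ_TABLE_KWARGS may differ.

# Minimal sketch, not the library's actual implementation.
def filter_kwargs(kwargs: dict, allowed: set) -> dict:
    """Keep only the keyword arguments whose names appear in `allowed`."""
    return {k: v for k, v in kwargs.items() if k in allowed}

# Assumed allow-lists for illustration; the real constants may contain other keys.
S3_GET_KWARGS = {"VersionId", "SSECustomerKey"}
PANDAS_READ_TABLE_KWARGS = {"sep", "header", "dtype"}

kwargs = {"sep": "\t", "VersionId": "abc", "dtype": str}
print(filter_kwargs(kwargs, S3_GET_KWARGS))             # {'VersionId': 'abc'}
print(filter_kwargs(kwargs, PANDAS_READ_TABLE_KWARGS))  # {'sep': '\t', 'dtype': <class 'str'>}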
Example #2
def test_make_valid_uri(test_input):
    URI = "s3://bucket/path"
    assert make_valid_uri(*test_input) == URI
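
The test above receives test_input from a pytest.mark.parametrize decorator that is not shown in this excerpt. Here is a self-contained sketch of how it is typically wired up; the parametrized inputs and the stand-in make_valid_uri are assumptions, not the library's actual code.

import pytest

def make_valid_uri(*parts: str) -> str:
    # Stand-in for the real helper: join parts with single slashes.
    if len(parts) < 2:
        raise ValueError("make_valid_uri needs at least two parts")
    return "/".join(p.rstrip("/") if i == 0 else p.strip("/") for i, p in enumerate(parts))

@pytest.mark.parametrize(
    "test_input",
    [("s3://bucket", "path"), ("s3://bucket/", "/path"), ("s3://bucket", "/path/")],
)
def test_make_valid_uri(test_input):
    URI = "s3://bucket/path"
    assert make_valid_uri(*test_input) == URI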
Example #3
def test_make_valid_uri_raises_with_invalid_args():
    with pytest.raises(ValueError):
        make_valid_uri(["1"])
Example #4
    def redshift_to_s3(
        self,
        sql: str,
        bucket: str = None,
        path: str = None,
        prefix: str = None,
        iam_role: str = None,
        file_format: str = None,
        partition_by: List[str] = None,
        include_partition_column: bool = False,
        manifest: bool = False,
        header: bool = False,
        delimiter: str = None,
        fixedwidth: Union[str, int] = None,
        encrypted: bool = False,
        bzip2: bool = False,
        gzip: bool = False,
        zstd: bool = False,
        addquotes: bool = False,
        null: str = None,
        escape: bool = False,
        allowoverwrite: bool = False,
        parallel: str = "ON",
        maxfilesize: Union[str, int, float] = None,
        region: str = None,
    ):
        """Run sql and unload result to S3.

        Args:
            sql: SQL query.
            bucket: S3 bucket name.
            key (optional): S3 key. Create if does not exist.
            prefix (optional): Prefix of the set of files.
            iam_role (optional): IAM Role string. If provided, this will be used as authorization 
                instead of access_key_id/secret_access_key. This feature is untested.
            file_format (optional): CSV or PARQUET.
            manifest (optional): Whether or not to create the manifest file.
            header (optional): Whether or not to include header.
            delimiter (optional): Delimiter charater if the output file is delimited.
            fixedwidth (optional): If not None, it will overwrite delimiter and use fixedwidth 
                format instead.
            encrypted (optional): Whether or not the files should be encrypted.
            bzip2 (optional): Whether or not the files should be compressed with bzip2.
            gzip (optional): Whether or not the files should be compressed with gzip.
            zstd (optional): Whether or not the files should be compressed with zstd.
            addquotes (optional): Whether or not values with delimiter characters should be quoted.
            null (optional): Specify the NULL AS string.
            escape (optional): Whether to include the ESCAPE argument in UNLOAD.
            allowoverwrite (optional): Whether or not existing files should be overwritten. Redshift
                will fail with error message if this is False and there are existing files.
            parallel (optional): ON or OFF. Whether or not to use parallel and unload into multiple 
                files.
            maxfilesize (optional): Maxfilesize argument for UNLOAD.
            region (optional): AWS region if S3 region is different from Redshift region.
        """
        destination_option = ""
        if path is not None:
            destination_option = make_valid_uri(destination_option, f"{path}")
            if destination_option[-1] != "/":
                destination_option = destination_option + "/"
        if prefix is not None:
            destination_option = make_valid_uri(destination_option, f"{prefix}")
        dest_bucket: Optional[str] = bucket or self.default_bucket
        if dest_bucket is None:
            raise ValueError("bucket cannot be None.")
        existing_keys = self._get_s3_pattern_existence(dest_bucket, destination_option)
        if existing_keys:
            warn_message = f"""\
            These keys already exist. It may cause data consistency issues.
            {existing_keys}
            """
            warnings.warn(dedent(warn_message))
        destination_option = make_valid_uri(f"s3://{dest_bucket}", destination_option)

        if sum([bzip2, gzip, zstd]) > 1:
            raise ValueError("Only one of [bzip2, gzip, zstd] should be True.")

        file_format_option = ""
        if file_format is not None:
            if file_format == "CSV":
                if fixedwidth:
                    raise ValueError(
                        "fixedwidth should not be specified for CSV file_format."
                    )
                delimiter = ","
                file_format_option = "format CSV"
            elif file_format == "PARQUET":
                file_format_option = "format PARQUET"
                # None of these options are valid together with PARQUET output.
                invalid_parquet_options = {
                    "delimiter": delimiter,
                    "fixedwidth": fixedwidth,
                    "addquotes": addquotes,
                    "escape": escape,
                    "null": null,
                    "header": header,
                    "gzip": gzip,
                    "bzip2": bzip2,
                    "zstd": zstd,
                }
                for name, value in invalid_parquet_options.items():
                    if value:
                        raise ValueError(
                            f"{name} should not be specified for PARQUET file_format."
                        )
            else:
                raise ValueError("File format can only be CSV or PARQUET if specified.")

        partition_include_option = " INCLUDE" if include_partition_column else ""
        partition_option = (
            f"partition by ({','.join(partition_by)}){partition_include_option}"
            if partition_by
            else ""
        )

        manifest_option = "manifest" if manifest else ""
        header_option = "header" if header else ""
        delimiter_option = f"delimiter '{delimiter}'" if delimiter else "delimiter '|'"
        if fixedwidth is not None:
            fixedwidth_option = f"fixedwidth '{fixedwidth}'"
            delimiter_option = ""
        else:
            fixedwidth_option = ""
        encrypted_option = "encrypted" if encrypted else ""
        bzip2_option = "bzip2" if bzip2 else ""
        gzip_option = "gzip" if gzip else ""
        zstd_option = "zstd" if zstd else ""
        addquotes_option = "addquotes" if addquotes else ""
        null_option = f"null as '{null}'" if null is not None else ""
        escape_option = "escape" if escape else ""
        allowoverwrite_option = "allowoverwrite" if allowoverwrite else ""
        parallel_option = f"parallel {parallel}"
        maxfilesize_option = (
            f"maxfilesize '{maxfilesize}'" if maxfilesize is not None else ""
        )
        region_option = f"region '{region}'" if region else ""
        aws_access_key_id = self.aws_config.get("aws_access_key_id")
        aws_secret_access_key = self.aws_config.get("aws_secret_access_key")
        if (
            aws_access_key_id is None
            and aws_secret_access_key is None
            and iam_role is None
        ):
            raise ValueError(
                "Must provide at least one of [iam_role, aws_access_key_id/aws_secret_access_key]"
            )
        aws_token = self.aws_config.get("aws_session_token")
        aws_token_option = (
            f"session_token '{aws_token}'" if aws_token is not None else ""
        )
        if iam_role is not None:
            iam_role_option = f"iam_role '{iam_role}'"
            access_key_id_option = ""
            secret_access_key_option = ""
        else:
            iam_role_option = ""
            access_key_id_option = f"access_key_id '{aws_access_key_id}'"
            secret_access_key_option = f"secret_access_key '{aws_secret_access_key}'"

        # Flatten newlines so the query can be embedded as a single-quoted string literal.
        sql = sql.replace("\n", " ")
        unload_template = f"""\
        unload ('{sql}')
        to '{destination_option}'
        {file_format_option}
        {partition_option}
        {manifest_option}
        {header_option}
        {delimiter_option}
        {fixedwidth_option}
        {encrypted_option}
        {bzip2_option}
        {gzip_option}
        {zstd_option}
        {addquotes_option}
        {null_option}
        {escape_option}
        {allowoverwrite_option}
        {parallel_option}
        {maxfilesize_option}
        {region_option}
        {access_key_id_option}
        {secret_access_key_option}
        {aws_token_option}
        {iam_role_option}
        """
        self.run_query(unload_template)
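
To make the option assembly above concrete, here is what a rendered statement could look like for a partitioned PARQUET unload. The query, bucket, and IAM role are made-up values, and the option spelling follows the method above rather than the full Redshift UNLOAD grammar.

from textwrap import dedent

# Made-up inputs for illustration only.
sql = "select user_id, event_date, revenue from analytics.daily_revenue"
destination = "s3://example-bucket/exports/daily_revenue/"
iam_role = "arn:aws:iam::123456789012:role/example-unload-role"

unload_stmt = dedent(f"""\
    unload ('{sql}')
    to '{destination}'
    format PARQUET
    partition by (event_date) INCLUDE
    allowoverwrite
    parallel ON
    maxfilesize '256 MB'
    iam_role '{iam_role}'
    """)
print(unload_stmt)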
Example #5
    def df_to_redshift(
        self,
        df: pd.DataFrame,
        table_name: str,
        bucket: str = None,
        column_definition: dict = None,
        append: bool = False,
        path: str = None,
        file_name: str = None,
        cleanup: bool = True,
        **kwargs,
    ):
        """Pandas DataFrame to Redshift table.

        Args:
            df: Source dataframe.
            table_name: Redshift table name (optionally include schema name).
            bucket (optional): S3 bucket name; falls back to `default_bucket` if not provided.
            column_definition (optional): Specify the column definition for CREATE TABLE. If
                not given and append is False, data types will be inferred from the DataFrame.
            append (optional): If True, df will be appended to the Redshift table, otherwise the
                table will be dropped and recreated.
            path (optional): S3 key excluding file name.
            file_name (optional): If None, file_name will be randomly generated.
            cleanup (optional): If True (default), the intermediate S3 file is deleted after COPY.
            **kwargs: Keyword arguments to pass to pandas `to_csv` and Redshift COPY.
        """
        bridge_bucket = bucket or self.default_bucket
        if not bridge_bucket:
            raise ValueError("Either bucket or default_bucket must be provided.")

        to_csv_kwargs = filter_kwargs(kwargs, PANDAS_TOCSV_KWARGS)
        copy_kwargs = filter_kwargs(kwargs, REDSHIFT_COPY_KWARGS)

        if column_definition is None:
            column_definition = map_types(OrderedDict(df.dtypes))

        # to_csv writes the index by default, so include it in the column
        # definition unless index=False was explicitly passed
        if to_csv_kwargs.get("index") is None or to_csv_kwargs.get("index"):
            if df.index.name:
                full_column_definition = OrderedDict({df.index.name: df.index.dtype})
            else:
                full_column_definition = OrderedDict({"index": df.index.dtype})
            full_column_definition = map_types(full_column_definition)
            full_column_definition.update(column_definition)
            column_definition = full_column_definition

        check_invalid_columns(list(column_definition))

        if file_name is None:
            import uuid

            file_name = f"redpanda-{uuid.uuid4()}"

        s3_key = make_valid_uri(path if path is not None else "", file_name)
        self.df_to_s3(df, bucket=bridge_bucket, key=s3_key, **to_csv_kwargs)
        try:
            self.s3_to_redshift(
                bridge_bucket,
                s3_key,
                table_name,
                column_definition=column_definition,
                append=append,
                **copy_kwargs,
            )
        finally:
            if cleanup:
                self.delete_from_s3(bridge_bucket, s3_key)
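
A small, self-contained illustration of the index handling in df_to_redshift: because pandas to_csv writes the index by default, the index becomes the first CSV column and must be prepended to the column definition. The map_types stand-in below is an assumption; the library's real mapping from pandas dtypes to Redshift types may differ.

from collections import OrderedDict
import pandas as pd

def map_types(columns: OrderedDict) -> OrderedDict:
    # Stand-in for the library's dtype -> Redshift type mapping (assumed).
    simple = {"int64": "BIGINT", "float64": "DOUBLE PRECISION", "object": "VARCHAR(256)"}
    return OrderedDict(
        (name, simple.get(str(dtype), "VARCHAR(256)")) for name, dtype in columns.items()
    )

df = pd.DataFrame({"city": ["SF", "NY"], "revenue": [1.5, 2.0]})
df.index.name = "row_id"

column_definition = map_types(OrderedDict(df.dtypes))
# to_csv writes the index by default, so prepend it, mirroring df_to_redshift above.
full_column_definition = map_types(OrderedDict({df.index.name or "index": df.index.dtype}))
full_column_definition.update(column_definition)
print(full_column_definition)
# OrderedDict([('row_id', 'BIGINT'), ('city', 'VARCHAR(256)'), ('revenue', 'DOUBLE PRECISION')])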