示例#1
0
def write_redshift_copy_manifest(
    manifest_path: str,
    paths: List[str],
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]:
    """Write Redshift copy manifest and return its structure.

    Only Parquet files are supported.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    manifest_path : str
        Amazon S3 manifest path (e.g. s3://...)
    paths: List[str]
        List of S3 paths (Parquet Files) to be copied.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
    s3_additional_kwargs:
        Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass",
        "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging".
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN'}

    Returns
    -------
    Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]
        Manifest content.

    Examples
    --------
    Copying two files to Redshift cluster.

    >>> import awswrangler as wr
    >>> wr.db.write_redshift_copy_manifest(
    ...     path="s3://bucket/my.manifest",
    ...     paths=["s3://...parquet", "s3://...parquet"]
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    objects_sizes: Dict[str, Optional[int]] = s3.size_objects(
        path=paths, use_threads=use_threads, boto3_session=session)
    manifest: Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]] = {
        "entries": []
    }
    path: str
    size: Optional[int]
    for path, size in objects_sizes.items():
        if size is not None:
            entry: Dict[str, Union[str, bool, Dict[str, int]]] = {
                "url": path,
                "mandatory": True,
                "meta": {
                    "content_length": size
                },
            }
            manifest["entries"].append(entry)
    payload: str = json.dumps(manifest)
    bucket: str
    bucket, key = _utils.parse_path(manifest_path)
    additional_kwargs: Dict[
        str,
        str] = {} if s3_additional_kwargs is None else s3_additional_kwargs
    _logger.debug("payload: %s", payload)
    client_s3: boto3.client = _utils.client(service_name="s3", session=session)
    _logger.debug("bucket: %s", bucket)
    _logger.debug("key: %s", key)
    client_s3.put_object(Body=payload,
                         Bucket=bucket,
                         Key=key,
                         **additional_kwargs)
    return manifest
示例#2
0
def write_redshift_copy_manifest(
    manifest_path: str,
    paths: List[str],
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None
) -> Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]:
    """Write Redshift copy manifest and return its structure.

    Only Parquet files are supported.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be get from os.cpu_count().

    Parameters
    ----------
    manifest_path : str
        Amazon S3 manifest path (e.g. s3://...)
    paths: List[str]
        List of S3 paths (Parquet Files) to be copied.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]
        Manifest content.

    Examples
    --------
    Copying two files to Redshift cluster.

    >>> import awswrangler as wr
    >>> wr.db.write_redshift_copy_manifest(
    ...     path="s3://bucket/my.manifest",
    ...     paths=["s3://...parquet", "s3://...parquet"]
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    objects_sizes: Dict[str, Optional[int]] = s3.size_objects(
        path=paths, use_threads=use_threads, boto3_session=session)
    manifest: Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]] = {
        "entries": []
    }
    path: str
    size: Optional[int]
    for path, size in objects_sizes.items():
        if size is not None:
            entry: Dict[str, Union[str, bool, Dict[str, int]]] = {
                "url": path,
                "mandatory": True,
                "meta": {
                    "content_length": size
                },
            }
            manifest["entries"].append(entry)
    payload: str = json.dumps(manifest)
    bucket: str
    bucket, key = _utils.parse_path(manifest_path)
    _logger.debug(f"payload: {payload}")
    client_s3: boto3.client = _utils.client(service_name="s3", session=session)
    _logger.debug(f"bucket: {bucket}")
    _logger.debug(f"key: {key}")
    client_s3.put_object(Body=payload, Bucket=bucket, Key=key)
    return manifest