def write_redshift_copy_manifest(
    manifest_path: str,
    paths: List[str],
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]:
    """Write Redshift copy manifest and return its structure.

    Only Parquet files are supported.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    manifest_path : str
        Amazon S3 manifest path (e.g. s3://...)
    paths: List[str]
        List of S3 paths (Parquet Files) to be copied.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
    s3_additional_kwargs:
        Forward to botocore requests. Valid parameters: "ACL", "Metadata",
        "ServerSideEncryption", "StorageClass", "SSECustomerAlgorithm",
        "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging".
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}

    Returns
    -------
    Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]
        Manifest content.

    Examples
    --------
    Copying two files to Redshift cluster.

    >>> import awswrangler as wr
    >>> wr.db.write_redshift_copy_manifest(
    ...     manifest_path="s3://bucket/my.manifest",
    ...     paths=["s3://...parquet", "s3://...parquet"],
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    objects_sizes: Dict[str, Optional[int]] = s3.size_objects(
        path=paths, use_threads=use_threads, boto3_session=session
    )
    manifest: Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]] = {"entries": []}
    path: str
    size: Optional[int]
    # Objects whose size could not be resolved (size is None) are skipped,
    # since the manifest's "content_length" meta requires an integer size.
    for path, size in objects_sizes.items():
        if size is not None:
            entry: Dict[str, Union[str, bool, Dict[str, int]]] = {
                "url": path,
                "mandatory": True,
                "meta": {"content_length": size},
            }
            manifest["entries"].append(entry)
    payload: str = json.dumps(manifest)
    bucket: str
    bucket, key = _utils.parse_path(manifest_path)
    additional_kwargs: Dict[str, str] = {} if s3_additional_kwargs is None else s3_additional_kwargs
    _logger.debug("payload: %s", payload)
    client_s3: boto3.client = _utils.client(service_name="s3", session=session)
    _logger.debug("bucket: %s", bucket)
    _logger.debug("key: %s", key)
    client_s3.put_object(Body=payload, Bucket=bucket, Key=key, **additional_kwargs)
    return manifest
# NOTE(review): this is a second definition of `write_redshift_copy_manifest`
# in the same file — it duplicates the variant that accepts
# `s3_additional_kwargs`, minus that parameter. If both live in one module,
# whichever is defined last silently shadows the other; one of the two should
# be removed (keeping the more general `s3_additional_kwargs` variant).
def write_redshift_copy_manifest(
    manifest_path: str,
    paths: List[str],
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]:
    """Write Redshift copy manifest and return its structure.

    Only Parquet files are supported.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    manifest_path : str
        Amazon S3 manifest path (e.g. s3://...)
    paths: List[str]
        List of S3 paths (Parquet Files) to be copied.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]
        Manifest content.

    Examples
    --------
    Copying two files to Redshift cluster.

    >>> import awswrangler as wr
    >>> wr.db.write_redshift_copy_manifest(
    ...     manifest_path="s3://bucket/my.manifest",
    ...     paths=["s3://...parquet", "s3://...parquet"],
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    objects_sizes: Dict[str, Optional[int]] = s3.size_objects(
        path=paths, use_threads=use_threads, boto3_session=session
    )
    manifest: Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]] = {"entries": []}
    path: str
    size: Optional[int]
    # Objects whose size could not be resolved (size is None) are skipped,
    # since the manifest's "content_length" meta requires an integer size.
    for path, size in objects_sizes.items():
        if size is not None:
            entry: Dict[str, Union[str, bool, Dict[str, int]]] = {
                "url": path,
                "mandatory": True,
                "meta": {"content_length": size},
            }
            manifest["entries"].append(entry)
    payload: str = json.dumps(manifest)
    bucket: str
    bucket, key = _utils.parse_path(manifest_path)
    # Lazy %-style args so the formatting cost is skipped when DEBUG is off
    # (also consistent with the sibling definition's logging style).
    _logger.debug("payload: %s", payload)
    client_s3: boto3.client = _utils.client(service_name="s3", session=session)
    _logger.debug("bucket: %s", bucket)
    _logger.debug("key: %s", key)
    client_s3.put_object(Body=payload, Bucket=bucket, Key=key)
    return manifest