def _copy_objects(
    batch: List[Tuple[str, str]],
    use_threads: bool,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, Any]],
) -> None:
    """Copy every (source, target) S3 path pair in *batch*.

    Each copy is a managed transfer (``resource.meta.client.copy``) so large
    objects are handled via multipart; ``use_threads`` is forwarded to the
    ``TransferConfig`` and ``s3_additional_kwargs`` (filtered down to the
    arguments ``copy_object`` accepts) are passed as ``ExtraArgs``.
    """
    _logger.debug("len(batch): %s", len(batch))
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    resource_s3: boto3.resource = _utils.resource(service_name="s3", session=boto3_session)
    # Filter the caller-supplied kwargs once, up front, for the whole batch.
    boto3_kwargs: Optional[Dict[str, Any]] = (
        None
        if s3_additional_kwargs is None
        else get_botocore_valid_kwargs(
            function_name="copy_object", s3_additional_kwargs=s3_additional_kwargs
        )
    )
    for source, target in batch:
        src_bucket, src_key = _utils.parse_path(path=source)
        dst_bucket, dst_key = _utils.parse_path(path=target)
        resource_s3.meta.client.copy(
            CopySource={"Bucket": src_bucket, "Key": src_key},
            Bucket=dst_bucket,
            Key=dst_key,
            SourceClient=client_s3,
            ExtraArgs=boto3_kwargs,
            # 10 download attempts: retry transient S3 read failures during copy.
            Config=TransferConfig(num_download_attempts=10, use_threads=use_threads),
        )
def _describe_object(
    path: str,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, Any]],
    version_id: Optional[str] = None,
) -> Tuple[str, Dict[str, Any]]:
    """Return ``(path, head_object response)`` for a single S3 object.

    The HEAD request is wrapped in ``_utils.try_it`` so that a transient
    ``NoSuchKey`` (e.g. eventual consistency) is retried. If ``version_id``
    is given, that specific object version is described.
    """
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    bucket, key = _utils.parse_path(path=path)
    # Keep only the kwargs that head_object actually accepts.
    extra_kwargs: Dict[str, Any] = (
        _fs.get_botocore_valid_kwargs(
            function_name="head_object", s3_additional_kwargs=s3_additional_kwargs
        )
        if s3_additional_kwargs
        else {}
    )
    if version_id:
        extra_kwargs["VersionId"] = version_id
    desc: Dict[str, Any] = _utils.try_it(
        f=client_s3.head_object,
        ex=client_s3.exceptions.NoSuchKey,
        Bucket=bucket,
        Key=key,
        **extra_kwargs,
    )
    return path, desc
def does_object_exist(path: str,
                      s3_additional_kwargs: Optional[Dict[str, Any]] = None,
                      boto3_session: Optional[boto3.Session] = None) -> bool:
    """Check if object exists on S3.

    Parameters
    ----------
    path: str
        S3 path (e.g. s3://bucket/key).
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forward to botocore requests. Valid parameters: "RequestPayer", "ExpectedBucketOwner".
        e.g. s3_additional_kwargs={'RequestPayer': 'requester'}
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    bool
        True if exists, False otherwise.

    Examples
    --------
    Using the default boto3 session

    >>> import awswrangler as wr
    >>> wr.s3.does_object_exist('s3://bucket/key_real')
    True
    >>> wr.s3.does_object_exist('s3://bucket/key_unreal')
    False

    Using a custom boto3 session

    >>> import boto3
    >>> import awswrangler as wr
    >>> wr.s3.does_object_exist('s3://bucket/key_real', boto3_session=boto3.Session())
    True
    >>> wr.s3.does_object_exist('s3://bucket/key_unreal', boto3_session=boto3.Session())
    False

    """
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    bucket, key = _utils.parse_path(path=path)
    # Restrict the forwarded kwargs to what head_object accepts.
    extra_kwargs: Dict[str, Any] = (
        _fs.get_botocore_valid_kwargs(
            function_name="head_object", s3_additional_kwargs=s3_additional_kwargs
        )
        if s3_additional_kwargs
        else {}
    )
    try:
        client_s3.head_object(Bucket=bucket, Key=key, **extra_kwargs)
    except botocore.exceptions.ClientError as ex:
        # HEAD on a missing key surfaces as a ClientError with HTTP 404;
        # anything else (403, throttling, ...) is re-raised to the caller.
        if ex.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
            return False
        raise ex
    return True
def _delete_objects(
    bucket: str,
    keys: List[str],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, Any]],
    attempt: int = 1,
) -> None:
    """Delete *keys* from *bucket* with a single DeleteObjects call, retrying InternalErrors.

    Keys that fail with S3's transient "InternalError" code are retried with a
    linearly increasing delay, up to 5 attempts (1+2+3+4+5 = 15 seconds total).
    NOTE(review): DeleteObjects accepts at most 1000 keys per request — the
    caller is presumably responsible for batching; confirm at call sites.

    Parameters
    ----------
    bucket : str
        Bucket name.
    keys : List[str]
        Object keys to delete.
    boto3_session : boto3.Session
        Session used to build the S3 client.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Extra botocore kwargs forwarded to delete_objects (filtered for validity).
    attempt : int
        Current retry attempt (internal; starts at 1).

    Raises
    ------
    exceptions.ServiceApiError
        On any non-InternalError failure, or when InternalErrors persist
        after 5 attempts.
    """
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    _logger.debug("len(keys): %s", len(keys))
    batch: List[Dict[str, str]] = [{"Key": key} for key in keys]
    if s3_additional_kwargs:
        # BUG FIX: kwargs were previously validated against "list_objects_v2",
        # which silently dropped valid delete_objects arguments (e.g. "MFA",
        # "BypassGovernanceRetention") and let through list-only ones.
        extra_kwargs: Dict[str, Any] = get_botocore_valid_kwargs(
            function_name="delete_objects", s3_additional_kwargs=s3_additional_kwargs
        )
    else:
        extra_kwargs = {}
    res = client_s3.delete_objects(Bucket=bucket, Delete={"Objects": batch}, **extra_kwargs)
    deleted: List[Dict[str, Any]] = res.get("Deleted", [])
    for obj in deleted:
        _logger.debug("s3://%s/%s has been deleted.", bucket, obj.get("Key"))
    errors: List[Dict[str, Any]] = res.get("Errors", [])
    internal_errors: List[str] = []
    for error in errors:
        _logger.debug("error: %s", error)
        if "Code" not in error or error["Code"] != "InternalError":
            # Any non-transient error aborts the whole batch immediately.
            raise exceptions.ServiceApiError(errors)
        # Keys come back URL-encoded in the error response.
        internal_errors.append(_unquote_plus(error["Key"]))
    if len(internal_errors) > 0:
        if attempt > 5:  # Maximum of 5 attempts (Total of 15 seconds)
            raise exceptions.ServiceApiError(errors)
        time.sleep(attempt)  # Incremental delay (linear)
        _delete_objects(
            bucket=bucket,
            keys=internal_errors,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
            attempt=(attempt + 1),
        )
def _list_objects(  # pylint: disable=too-many-branches
    path: str,
    s3_additional_kwargs: Optional[Dict[str, Any]],
    delimiter: Optional[str] = None,
    suffix: Union[str, List[str], None] = None,
    ignore_suffix: Union[str, List[str], None] = None,
    last_modified_begin: Optional[datetime.datetime] = None,
    last_modified_end: Optional[datetime.datetime] = None,
    boto3_session: Optional[boto3.Session] = None,
    ignore_empty: bool = False,
) -> Iterator[List[str]]:
    """Yield pages of S3 paths under *path*, applying the requested filters.

    Paginates ``list_objects_v2`` and yields one list of ``s3://bucket/key``
    strings per non-empty page. When *delimiter* is given, CommonPrefixes
    ("directories") are listed instead of object keys. Filters applied per
    page: *suffix* (keep only matching keys), *ignore_suffix* (drop matching
    paths), *last_modified_begin*/*last_modified_end* (inclusive-boundary
    window on LastModified), *ignore_empty* (skip zero-byte objects), and —
    when the prefix contained wildcard characters — an ``fnmatch`` of the
    full original *path* pattern.
    """
    bucket: str
    prefix_original: str
    bucket, prefix_original = _utils.parse_path(path=path)
    # Strip any wildcard tail so the API prefix is the literal leading part;
    # the full pattern is re-applied below via fnmatch when they differ.
    prefix: str = _prefix_cleanup(prefix=prefix_original)
    # Normalize suffix filters to lists (a bare string becomes a 1-item list).
    _suffix: Union[List[str], None] = [suffix] if isinstance(suffix, str) else suffix
    _ignore_suffix: Union[List[str], None] = [ignore_suffix] if isinstance(
        ignore_suffix, str) else ignore_suffix
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    default_pagination: Dict[str, int] = {"PageSize": 1000}
    extra_kwargs: Dict[str, Any] = {"PaginationConfig": default_pagination}
    if s3_additional_kwargs:
        # Caller kwargs replace the defaults entirely; keep only the arguments
        # list_objects_v2 accepts, and preserve a caller-supplied
        # PaginationConfig if present (otherwise restore the default).
        extra_kwargs = _fs.get_botocore_valid_kwargs(
            function_name="list_objects_v2", s3_additional_kwargs=s3_additional_kwargs)
        extra_kwargs["PaginationConfig"] = (
            s3_additional_kwargs["PaginationConfig"]
            if "PaginationConfig" in s3_additional_kwargs else default_pagination)
    paginator = client_s3.get_paginator("list_objects_v2")
    args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, **extra_kwargs}
    if delimiter is not None:
        args["Delimiter"] = delimiter
    _logger.debug("args: %s", args)
    response_iterator = paginator.paginate(**args)
    paths: List[str] = []
    # Fails fast on inconsistent datetime bounds before any API traffic.
    _validate_datetimes(last_modified_begin=last_modified_begin, last_modified_end=last_modified_end)

    for page in response_iterator:  # pylint: disable=too-many-nested-blocks
        if delimiter is None:
            # Object listing: one entry per key in this page.
            contents: Optional[List[Dict[str, Any]]] = page.get("Contents")
            if contents is not None:
                for content in contents:
                    key: str = content["Key"]
                    if ignore_empty and content.get("Size", 0) == 0:
                        _logger.debug("Skipping empty file: %s", f"s3://{bucket}/{key}")
                    elif (content is not None) and ("Key" in content):
                        if (_suffix is None) or key.endswith(tuple(_suffix)):
                            # Both datetime bounds are exclusive-skip, i.e. the
                            # boundary instants themselves are kept.
                            if last_modified_begin is not None:
                                if content["LastModified"] < last_modified_begin:
                                    continue
                            if last_modified_end is not None:
                                if content["LastModified"] > last_modified_end:
                                    continue
                            paths.append(f"s3://{bucket}/{key}")
        else:
            # Delimiter listing: collect the "directory" prefixes instead.
            prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes")
            if prefixes is not None:
                for pfx in prefixes:
                    if (pfx is not None) and ("Prefix" in pfx):
                        key = pfx["Prefix"]
                        paths.append(f"s3://{bucket}/{key}")

        if prefix != prefix_original:
            # The original prefix contained wildcards: match the full pattern.
            paths = fnmatch.filter(paths, path)

        if _ignore_suffix is not None:
            paths = [p for p in paths if p.endswith(tuple(_ignore_suffix)) is False]

        if paths:
            yield paths
        paths = []