def __init__(self, anon=False, key=None, secret=None, token=None, use_ssl=True, client_kwargs=None, requester_pays=False, default_block_size=None, default_fill_cache=True, default_cache_type='bytes', version_aware=False, config_kwargs=None, s3_additional_kwargs=None, session=None, username=None, password=None, **kwargs): if key and username: raise KeyError('Supply either key or username, not both') if secret and password: raise KeyError('Supply secret or password, not both') if username: key = username if password: secret = password if self._cached: return super().__init__() self.anon = anon self.session = None self.passed_in_session = session if self.passed_in_session: self.session = self.passed_in_session self.key = key self.secret = secret self.token = token self.kwargs = kwargs if client_kwargs is None: client_kwargs = {} if config_kwargs is None: config_kwargs = {} self.default_block_size = default_block_size or self.default_block_size self.default_fill_cache = default_fill_cache self.default_cache_type = default_cache_type self.version_aware = version_aware self.client_kwargs = client_kwargs self.config_kwargs = config_kwargs self.req_kw = {'RequestPayer': 'requester'} if requester_pays else {} self.s3_additional_kwargs = s3_additional_kwargs or {} self.use_ssl = use_ssl self.s3 = self.connect() self._kwargs_helper = ParamKwargsHelper(self.s3)
def __init__(self, anon=False, key=None, secret=None, token=None, use_ssl=True, client_kwargs=None, requester_pays=False, default_block_size=None, default_fill_cache=True, config_kwargs=None, s3_additional_kwargs=None, **kwargs): self.anon = anon self.session = None self.key = key self.secret = secret self.token = token self.kwargs = kwargs if client_kwargs is None: client_kwargs = {} if default_block_size is not None: self.default_block_size = default_block_size if config_kwargs is None: config_kwargs = {} self.default_fill_cache = default_fill_cache self.client_kwargs = client_kwargs self.config_kwargs = config_kwargs self.dirs = {} self.req_kw = {'RequestPayer': 'requester'} if requester_pays else {} self.s3_additional_kwargs = s3_additional_kwargs or {} self.use_ssl = use_ssl self.s3 = self.connect() self._kwargs_helper = ParamKwargsHelper(self.s3) self._singleton[0] = self
class S3FileSystem(AbstractFileSystem): """ Access S3 as if it were a file system. This exposes a filesystem-like API (ls, cp, open, etc.) on top of S3 storage. Provide credentials either explicitly (``key=``, ``secret=``) or depend on boto's credential methods. See boto3 documentation for more information. If no credentials are available, use ``anon=True``. Parameters ---------- anon : bool (False) Whether to use anonymous connection (public buckets only). If False, uses the key/secret given, or boto's credential resolver (environment variables, config files, EC2 IAM server, in that order) key : string (None) If not anonymous, use this access key ID, if specified secret : string (None) If not anonymous, use this secret access key, if specified token : string (None) If not anonymous, use this security token, if specified use_ssl : bool (True) Whether to use SSL in connections to S3; may be faster without, but insecure s3_additional_kwargs : dict of parameters that are used when calling s3 api methods. Typically used for things like "ServerSideEncryption". client_kwargs : dict of parameters for the boto3 client requester_pays : bool (False) If RequesterPays buckets are supported. default_block_size: int (None) If given, the default block size value used for ``open()``, if no specific value is given at all time. The built-in default is 5MB. default_fill_cache : Bool (True) Whether to use cache filling with open by default. Refer to ``S3File.open``. default_cache_type : string ('bytes') If given, the default cache_type value used for ``open()``. Set to "none" if no caching is desired. See fsspec's documentation for other available cache_type values. Default cache_type is 'bytes'. version_aware : bool (False) Whether to support bucket versioning. If enable this will require the user to have the necessary IAM permissions for dealing with versioned objects. config_kwargs : dict of parameters passed to ``botocore.client.Config`` kwargs : other parameters for boto3 session session : botocore Session object to be used for all connections. This session will be used inplace of creating a new session inside S3FileSystem. Examples -------- >>> s3 = S3FileSystem(anon=False) # doctest: +SKIP >>> s3.ls('my-bucket/') # doctest: +SKIP ['my-file.txt'] >>> with s3.open('my-bucket/my-file.txt', mode='rb') as f: # doctest: +SKIP ... print(f.read()) # doctest: +SKIP b'Hello, world!' """ root_marker = "" connect_timeout = 5 read_timeout = 15 default_block_size = 5 * 2**20 protocol = 's3' def __init__(self, anon=False, key=None, secret=None, token=None, use_ssl=True, client_kwargs=None, requester_pays=False, default_block_size=None, default_fill_cache=True, default_cache_type='bytes', version_aware=False, config_kwargs=None, s3_additional_kwargs=None, session=None, username=None, password=None, **kwargs): if key and username: raise KeyError('Supply either key or username, not both') if secret and password: raise KeyError('Supply secret or password, not both') if username: key = username if password: secret = password if self._cached: return super().__init__() self.anon = anon self.session = None self.passed_in_session = session if self.passed_in_session: self.session = self.passed_in_session self.key = key self.secret = secret self.token = token self.kwargs = kwargs if client_kwargs is None: client_kwargs = {} if config_kwargs is None: config_kwargs = {} self.default_block_size = default_block_size or self.default_block_size self.default_fill_cache = default_fill_cache self.default_cache_type = default_cache_type self.version_aware = version_aware self.client_kwargs = client_kwargs self.config_kwargs = config_kwargs self.req_kw = {'RequestPayer': 'requester'} if requester_pays else {} self.s3_additional_kwargs = s3_additional_kwargs or {} self.use_ssl = use_ssl self.s3 = self.connect() self._kwargs_helper = ParamKwargsHelper(self.s3) def _filter_kwargs(self, s3_method, kwargs): return self._kwargs_helper.filter_dict(s3_method.__name__, kwargs) def _call_s3(self, method, *akwarglist, **kwargs): kw2 = kwargs.copy() kw2.pop('Body', None) logger.debug("CALL: %s - %s - %s" % (method.__name__, akwarglist, kw2)) additional_kwargs = self._get_s3_method_kwargs(method, *akwarglist, **kwargs) return method(**additional_kwargs) def _get_s3_method_kwargs(self, method, *akwarglist, **kwargs): additional_kwargs = self.s3_additional_kwargs.copy() for akwargs in akwarglist: additional_kwargs.update(akwargs) # Add the normal kwargs in additional_kwargs.update(kwargs) # filter all kwargs return self._filter_kwargs(method, additional_kwargs) def connect(self, refresh=True): """ Establish S3 connection object. Parameters ---------- refresh : bool Whether to create new session/client, even if a previous one with the same parameters already exists. If False (default), an existing one will be used if possible """ if refresh is False: # back compat: we store whole FS instance now return self.s3 anon, key, secret, kwargs, ckwargs, token, ssl = (self.anon, self.key, self.secret, self.kwargs, self.client_kwargs, self.token, self.use_ssl) if self.anon: from botocore import UNSIGNED conf = Config(connect_timeout=self.connect_timeout, read_timeout=self.read_timeout, signature_version=UNSIGNED, **self.config_kwargs) if not self.passed_in_session: self.session = boto3.Session(**self.kwargs) else: conf = Config(connect_timeout=self.connect_timeout, read_timeout=self.read_timeout, **self.config_kwargs) if not self.passed_in_session: self.session = boto3.Session(self.key, self.secret, self.token, **self.kwargs) logger.debug("Setting up s3fs instance") self.s3 = self.session.client('s3', config=conf, use_ssl=ssl, **self.client_kwargs) return self.s3 def get_delegated_s3pars(self, exp=3600): """Get temporary credentials from STS, appropriate for sending across a network. Only relevant where the key/secret were explicitly provided. Parameters ---------- exp : int Time in seconds that credentials are good for Returns ------- dict of parameters """ if self.anon: return {'anon': True} if self.token: # already has temporary cred return { 'key': self.key, 'secret': self.secret, 'token': self.token, 'anon': False } if self.key is None or self.secret is None: # automatic credentials return {'anon': False} sts = self.session.client('sts') cred = sts.get_session_token(DurationSeconds=exp)['Credentials'] return { 'key': cred['AccessKeyId'], 'secret': cred['SecretAccessKey'], 'token': cred['SessionToken'], 'anon': False } def _open(self, path, mode='rb', block_size=None, acl='', version_id=None, fill_cache=None, cache_type=None, autocommit=True, **kwargs): """ Open a file for reading or writing Parameters ---------- path: string Path of file on S3 mode: string One of 'r', 'w', 'a', 'rb', 'wb', or 'ab'. These have the same meaning as they do for the built-in `open` function. block_size: int Size of data-node blocks if reading fill_cache: bool If seeking to new a part of the file beyond the current buffer, with this True, the buffer will be filled between the sections to best support random access. When reading only a few specific chunks out of a file, performance may be better if False. acl: str Canned ACL to set when writing version_id : str Explicit version of the object to open. This requires that the s3 filesystem is version aware and bucket versioning is enabled on the relevant bucket. encoding : str The encoding to use if opening the file in text mode. The platform's default text encoding is used if not given. cache_type : str See fsspec's documentation for available cache_type values. Set to "none" if no caching is desired. If None, defaults to ``self.default_cache_type``. kwargs: dict-like Additional parameters used for s3 methods. Typically used for ServerSideEncryption. """ if block_size is None: block_size = self.default_block_size if fill_cache is None: fill_cache = self.default_fill_cache acl = acl or self.s3_additional_kwargs.get('ACL', '') kw = self.s3_additional_kwargs.copy() kw.update(kwargs) if not self.version_aware and version_id: raise ValueError( "version_id cannot be specified if the filesystem " "is not version aware") if cache_type is None: cache_type = self.default_cache_type return S3File(self, path, mode, block_size=block_size, acl=acl, version_id=version_id, fill_cache=fill_cache, s3_additional_kwargs=kw, cache_type=cache_type, autocommit=autocommit) def _lsdir(self, path, refresh=False, max_items=None): if path.startswith('s3://'): path = path[len('s3://'):] path = path.rstrip('/') bucket, prefix = split_path(path) prefix = prefix + '/' if prefix else "" if path not in self.dircache or refresh: try: logger.debug("Get directory listing page for %s" % path) pag = self.s3.get_paginator('list_objects_v2') config = {} if max_items is not None: config.update(MaxItems=max_items, PageSize=2 * max_items) it = pag.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/', PaginationConfig=config, **self.req_kw) files = [] dircache = [] for i in it: dircache.extend(i.get('CommonPrefixes', [])) for c in i.get('Contents', []): c['type'] = 'file' c['size'] = c['Size'] files.append(c) if dircache: files.extend([{ 'Key': l['Prefix'][:-1], 'Size': 0, 'StorageClass': "DIRECTORY", 'type': 'directory', 'size': 0 } for l in dircache]) for f in files: f['Key'] = '/'.join([bucket, f['Key']]) f['name'] = f['Key'] except ClientError as e: raise translate_boto_error(e) self.dircache[path] = files return self.dircache[path] def mkdir(self, path, acl="", **kwargs): path = self._strip_protocol(path).rstrip('/') if not self._parent(path): if acl and acl not in buck_acls: raise ValueError('ACL not in %s', buck_acls) try: params = {"Bucket": path, 'ACL': acl} region_name = (kwargs.get("region_name", None) or self.client_kwargs.get("region_name", None)) if region_name: params['CreateBucketConfiguration'] = { 'LocationConstraint': region_name } self.s3.create_bucket(**params) self.invalidate_cache('') self.invalidate_cache(path) except ClientError as e: raise translate_boto_error(e) except ParamValidationError as e: raise ValueError('Bucket create failed %r: %s' % (path, e)) def rmdir(self, path): path = self._strip_protocol(path).rstrip('/') if not self._parent(path): try: self.s3.delete_bucket(Bucket=path) except ClientError as e: raise translate_boto_error(e) self.invalidate_cache(path) self.invalidate_cache('') def _lsbuckets(self, refresh=False): if '' not in self.dircache or refresh: if self.anon: # cannot list buckets if not logged in return [] try: files = self.s3.list_buckets()['Buckets'] except ClientError: # listbucket permission missing return [] for f in files: f['Key'] = f['Name'] f['Size'] = 0 f['StorageClass'] = 'BUCKET' f['size'] = 0 f['type'] = 'directory' f['name'] = f['Name'] del f['Name'] self.dircache[''] = files return self.dircache[''] def _ls(self, path, refresh=False): """ List files in given bucket, or list of buckets. Listing is cached unless `refresh=True`. Note: only your buckets associated with the login will be listed by `ls('')`, not any public buckets (even if already accessed). Parameters ---------- path : string/bytes location at which to list files refresh : bool (=False) if False, look in local cache for file details first """ if path.startswith('s3://'): path = path[len('s3://'):] if path in ['', '/']: return self._lsbuckets(refresh) else: return self._lsdir(path, refresh) def exists(self, path): if path in ['', '/']: # the root always exists, even if anon return True bucket, key = split_path(path) if key: return super().exists(path) else: try: self.ls(path) return True except FileNotFoundError: return False def touch(self, path, truncate=True, data=None, **kwargs): """Create empty file or truncate""" bucket, key = split_path(path) if not truncate and self.exists(path): raise ValueError("S3 does not support touching existent files") try: self._call_s3(self.s3.put_object, kwargs, Bucket=bucket, Key=key) except ClientError as ex: raise translate_boto_error(ex) self.invalidate_cache(self._parent(path)) def info(self, path, version_id=None): if path in ['/', '']: return {'name': path, 'size': 0, 'type': 'directory'} kwargs = self.kwargs.copy() if version_id is not None: if not self.version_aware: raise ValueError("version_id cannot be specified if the " "filesystem is not version aware") kwargs['VersionId'] = version_id if self.version_aware: try: bucket, key = split_path(path) out = self._call_s3(self.s3.head_object, kwargs, Bucket=bucket, Key=key, **self.req_kw) return { 'ETag': out['ETag'], 'Key': '/'.join([bucket, key]), 'LastModified': out['LastModified'], 'Size': out['ContentLength'], 'size': out['ContentLength'], 'path': '/'.join([bucket, key]), 'StorageClass': "STANDARD", 'VersionId': out.get('VersionId') } except ClientError as e: ee = translate_boto_error(e) # This could have failed since the thing we are looking for is a prefix. if isinstance(ee, FileNotFoundError): return super().info(path) else: raise ee except ParamValidationError as e: raise ValueError('Failed to head path %r: %s' % (path, e)) return super().info(path) def ls(self, path, detail=False, refresh=False, **kwargs): """ List single "directory" with or without details Parameters ---------- path : string/bytes location at which to list files detail : bool (=True) if True, each list item is a dict of file properties; otherwise, returns list of filenames refresh : bool (=False) if False, look in local cache for file details first kwargs : dict additional arguments passed on """ path = self._strip_protocol(path).rstrip('/') files = self._ls(path, refresh=refresh) if not files: files = self._ls(self._parent(path), refresh=refresh) files = [ o for o in files if o['name'].rstrip('/') == path and o['type'] != 'directory' ] if detail: return files else: return list(sorted(set([f['name'] for f in files]))) def object_version_info(self, path, **kwargs): if not self.version_aware: raise ValueError("version specific functionality is disabled for " "non-version aware filesystems") bucket, key = split_path(path) kwargs = {} out = {'IsTruncated': True} versions = [] while out['IsTruncated']: out = self._call_s3(self.s3.list_object_versions, kwargs, Bucket=bucket, Prefix=key, **self.req_kw) versions.extend(out['Versions']) kwargs['VersionIdMarker'] = out.get('NextVersionIdMarker', '') return versions _metadata_cache = {} def metadata(self, path, refresh=False, **kwargs): """ Return metadata of path. Metadata is cached unless `refresh=True`. Parameters ---------- path : string/bytes filename to get metadata for refresh : bool (=False) if False, look in local cache for file metadata first """ bucket, key = split_path(path) if refresh or path not in self._metadata_cache: response = self._call_s3(self.s3.head_object, kwargs, Bucket=bucket, Key=key, **self.req_kw) self._metadata_cache[path] = response['Metadata'] return self._metadata_cache[path] def get_tags(self, path): """Retrieve tag key/values for the given path Returns ------- {str: str} """ bucket, key = split_path(path) response = self._call_s3(self.s3.get_object_tagging, Bucket=bucket, Key=key) return {v['Key']: v['Value'] for v in response['TagSet']} def put_tags(self, path, tags, mode='o'): """Set tags for given existing key Tags are a str:str mapping that can be attached to any key, see https://docs.aws.amazon.com/awsaccountbilling/latest/aboutv2/allocation-tag-restrictions.html This is similar to, but distinct from, key metadata, which is usually set at key creation time. Parameters ---------- path: str Existing key to attach tags to tags: dict str, str Tags to apply. mode: One of 'o' or 'm' 'o': Will over-write any existing tags. 'm': Will merge in new tags with existing tags. Incurs two remote calls. """ bucket, key = split_path(path) if mode == 'm': existing_tags = self.get_tags(path=path) existing_tags.update(tags) new_tags = [{ 'Key': k, 'Value': v } for k, v in existing_tags.items()] elif mode == 'o': new_tags = [{'Key': k, 'Value': v} for k, v in tags.items()] else: raise ValueError("Mode must be {'o', 'm'}, not %s" % mode) tag = {'TagSet': new_tags} self._call_s3(self.s3.put_object_tagging, Bucket=bucket, Key=key, Tagging=tag) def getxattr(self, path, attr_name, **kwargs): """ Get an attribute from the metadata. Examples -------- >>> mys3fs.getxattr('mykey', 'attribute_1') # doctest: +SKIP 'value_1' """ xattr = self.metadata(path, **kwargs) if attr_name in xattr: return xattr[attr_name] return None def setxattr(self, path, copy_kwargs=None, **kw_args): """ Set metadata. Attributes have to be of the form documented in the `Metadata Reference`_. Parameters --------- kw_args : key-value pairs like field="value", where the values must be strings. Does not alter existing fields, unless the field appears here - if the value is None, delete the field. copy_kwargs : dict, optional dictionary of additional params to use for the underlying s3.copy_object. Examples -------- >>> mys3file.setxattr(attribute_1='value1', attribute_2='value2') # doctest: +SKIP # Example for use with copy_args >>> mys3file.setxattr(copy_kwargs={'ContentType': 'application/pdf'}, ... attribute_1='value1') # doctest: +SKIP .. Metadata Reference: http://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html#object-metadata """ bucket, key = split_path(path) metadata = self.metadata(path) metadata.update(**kw_args) copy_kwargs = copy_kwargs or {} # remove all keys that are None for kw_key in kw_args: if kw_args[kw_key] is None: metadata.pop(kw_key, None) self._call_s3( self.s3.copy_object, copy_kwargs, CopySource="{}/{}".format(bucket, key), Bucket=bucket, Key=key, Metadata=metadata, MetadataDirective='REPLACE', ) # refresh metadata self._metadata_cache[path] = metadata def chmod(self, path, acl, **kwargs): """ Set Access Control on a bucket/key See http://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl Parameters ---------- path : string the object to set acl : string the value of ACL to apply """ bucket, key = split_path(path) if key: if acl not in key_acls: raise ValueError('ACL not in %s', key_acls) self._call_s3(self.s3.put_object_acl, kwargs, Bucket=bucket, Key=key, ACL=acl) else: if acl not in buck_acls: raise ValueError('ACL not in %s', buck_acls) self._call_s3(self.s3.put_bucket_acl, kwargs, Bucket=bucket, ACL=acl) def url(self, path, expires=3600, **kwargs): """ Generate presigned URL to access path by HTTP Parameters ---------- path : string the key path we are interested in expires : int the number of seconds this signature will be good for. """ bucket, key = split_path(path) return self.s3.generate_presigned_url(ClientMethod='get_object', Params=dict(Bucket=bucket, Key=key, **kwargs), ExpiresIn=expires) def merge(self, path, filelist, **kwargs): """ Create single S3 file from list of S3 files Uses multi-part, no data is downloaded. The original files are not deleted. Parameters ---------- path : str The final file to produce filelist : list of str The paths, in order, to assemble into the final file. """ bucket, key = split_path(path) mpu = self._call_s3(self.s3.create_multipart_upload, kwargs, Bucket=bucket, Key=key) out = [ self._call_s3(self.s3.upload_part_copy, kwargs, Bucket=bucket, Key=key, UploadId=mpu['UploadId'], CopySource=f, PartNumber=i + 1) for (i, f) in enumerate(filelist) ] parts = [{ 'PartNumber': i + 1, 'ETag': o['CopyPartResult']['ETag'] } for (i, o) in enumerate(out)] part_info = {'Parts': parts} self.s3.complete_multipart_upload(Bucket=bucket, Key=key, UploadId=mpu['UploadId'], MultipartUpload=part_info) self.invalidate_cache(path) def copy_basic(self, path1, path2, **kwargs): """ Copy file between locations on S3 """ buc1, key1 = split_path(path1) buc2, key2 = split_path(path2) try: self._call_s3(self.s3.copy_object, kwargs, Bucket=buc2, Key=key2, CopySource='/'.join([buc1, key1])) except ClientError as e: raise translate_boto_error(e) except ParamValidationError as e: raise ValueError('Copy failed (%r -> %r): %s' % (path1, path2, e)) def copy_managed(self, path1, path2, **kwargs): buc1, key1 = split_path(path1) buc2, key2 = split_path(path2) copy_source = {'Bucket': buc1, 'Key': key1} try: self.s3.copy(CopySource=copy_source, Bucket=buc2, Key=key2, ExtraArgs=self._get_s3_method_kwargs( self.s3.copy_object, kwargs)) except ClientError as e: raise translate_boto_error(e) except ParamValidationError as e: raise ValueError('Copy failed (%r -> %r): %s' % (path1, path2, e)) def copy(self, path1, path2, **kwargs): self.copy_managed(path1, path2, **kwargs) self.invalidate_cache(path2) def bulk_delete(self, pathlist, **kwargs): """ Remove multiple keys with one call Parameters ---------- pathlist : listof strings The keys to remove, must all be in the same bucket. """ if not pathlist: return buckets = {split_path(path)[0] for path in pathlist} if len(buckets) > 1: raise ValueError("Bulk delete files should refer to only one " "bucket") bucket = buckets.pop() if len(pathlist) > 1000: for i in range((len(pathlist) // 1000) + 1): self.bulk_delete(pathlist[i * 1000:(i + 1) * 1000]) return delete_keys = { 'Objects': [{ 'Key': split_path(path)[1] } for path in pathlist] } for path in pathlist: self.invalidate_cache(self._parent(path)) try: self._call_s3(self.s3.delete_objects, kwargs, Bucket=bucket, Delete=delete_keys) except ClientError as e: raise translate_boto_error(e) def rm(self, path, recursive=False, **kwargs): """ Remove keys and/or bucket. Parameters ---------- path : string The location to remove. recursive : bool (True) Whether to remove also all entries below, i.e., which are returned by `walk()`. """ bucket, key = split_path(path) if recursive: files = self.find(path, maxdepth=None) if key and not files: raise FileNotFoundError(path) self.bulk_delete(files, **kwargs) if not key: self.rmdir(bucket) return if key: if not self.exists(path): raise FileNotFoundError(path) try: self._call_s3(self.s3.delete_object, kwargs, Bucket=bucket, Key=key) except ClientError as e: raise translate_boto_error(e) self.invalidate_cache(self._parent(path)) else: if self.exists(bucket): try: self.s3.delete_bucket(Bucket=bucket) except BotoCoreError as e: raise IOError('Delete bucket %r failed: %s' % (bucket, e)) self.invalidate_cache(bucket) self.invalidate_cache('') else: raise FileNotFoundError(path) def invalidate_cache(self, path=None): if path is None: self.dircache.clear() else: path = self._strip_protocol(path) self.dircache.pop(path, None) self.dircache.pop(self._parent(path), None) def walk(self, path, maxdepth=None, **kwargs): if path in ['', '*', 's3://']: raise ValueError('Cannot crawl all of S3') return super().walk(path, maxdepth=maxdepth, **kwargs)
def __setstate__(self, state): self.__dict__.update(state) self.s3 = self.connect() self.dircache = {} self._kwargs_helper = ParamKwargsHelper(self.s3)
class S3FileSystem(object): """ Access S3 as if it were a file system. This exposes a filesystem-like API (ls, cp, open, etc.) on top of S3 storage. Provide credentials either explicitly (``key=``, ``secret=``) or depend on boto's credential methods. See boto3 documentation for more information. If no credentials are available, use ``anon=True``. Parameters ---------- anon : bool (False) Whether to use anonymous connection (public buckets only). If False, uses the key/secret given, or boto's credential resolver (environment variables, config files, EC2 IAM server, in that order) key : string (None) If not anonymous, use this access key ID, if specified secret : string (None) If not anonymous, use this secret access key, if specified token : string (None) If not anonymous, use this security token, if specified use_ssl : bool (True) Whether to use SSL in connections to S3; may be faster without, but insecure s3_additional_kwargs : dict of parameters that are used when calling s3 api methods. Typically used for things like "ServerSideEncryption". client_kwargs : dict of parameters for the boto3 client requester_pays : bool (False) If RequesterPays buckets are supported. default_block_size: None, int If given, the default block size value used for ``open()``, if no specific value is given at all time. The built-in default is 5MB. default_fill_cache : Bool (True) Whether to use cache filling with open by default. Refer to ``S3File.open``. config_kwargs : dict of parameters passed to ``botocore.client.Config`` kwargs : other parameters for boto3 session Examples -------- >>> s3 = S3FileSystem(anon=False) # doctest: +SKIP >>> s3.ls('my-bucket/') # doctest: +SKIP ['my-file.txt'] >>> with s3.open('my-bucket/my-file.txt', mode='rb') as f: # doctest: +SKIP ... print(f.read()) # doctest: +SKIP b'Hello, world!' """ _conn = {} _singleton = [None] connect_timeout = 5 read_timeout = 15 default_block_size = 5 * 2**20 def __init__(self, anon=False, key=None, secret=None, token=None, use_ssl=True, client_kwargs=None, requester_pays=False, default_block_size=None, default_fill_cache=True, config_kwargs=None, s3_additional_kwargs=None, **kwargs): self.anon = anon self.session = None self.key = key self.secret = secret self.token = token self.kwargs = kwargs if client_kwargs is None: client_kwargs = {} if default_block_size is not None: self.default_block_size = default_block_size if config_kwargs is None: config_kwargs = {} self.default_fill_cache = default_fill_cache self.client_kwargs = client_kwargs self.config_kwargs = config_kwargs self.dirs = {} self.req_kw = {'RequestPayer': 'requester'} if requester_pays else {} self.s3_additional_kwargs = s3_additional_kwargs or {} self.use_ssl = use_ssl self.s3 = self.connect() self._kwargs_helper = ParamKwargsHelper(self.s3) self._singleton[0] = self def _filter_kwargs(self, s3_method, kwargs): return self._kwargs_helper.filter_dict(s3_method.__name__, kwargs) def _call_s3(self, method, *akwarglist, **kwargs): additional_kwargs = self.s3_additional_kwargs.copy() for akwargs in akwarglist: additional_kwargs.update(self._filter_kwargs(method, akwargs)) # Add the normal kwargs in additional_kwargs.update(kwargs) return method(**additional_kwargs) @classmethod def current(cls): """ Return the most recently created S3FileSystem If no S3FileSystem has been created, then create one """ if not cls._singleton[0]: return cls() else: return cls._singleton[0] def connect(self, refresh=False): """ Establish S3 connection object. Parameters ---------- refresh : bool (True) Whether to use cached filelists, if already read """ anon, key, secret, kwargs, ckwargs, token, ssl = (self.anon, self.key, self.secret, self.kwargs, self.client_kwargs, self.token, self.use_ssl) # Include the current PID in the connection key so that different # SSL connections are made for each process. tok = tokenize(anon, key, secret, kwargs, ckwargs, token, ssl, os.getpid()) if refresh: self._conn.pop(tok, None) if tok not in self._conn: logger.debug("Open S3 connection. Anonymous: %s", self.anon) if self.anon: from botocore import UNSIGNED conf = Config(connect_timeout=self.connect_timeout, read_timeout=self.read_timeout, signature_version=UNSIGNED, **self.config_kwargs) self.session = boto3.Session(**self.kwargs) else: conf = Config(connect_timeout=self.connect_timeout, read_timeout=self.read_timeout, **self.config_kwargs) self.session = boto3.Session(self.key, self.secret, self.token, **self.kwargs) s3 = self.session.client('s3', config=conf, use_ssl=ssl, **self.client_kwargs) self._conn[tok] = (s3, self.session) else: s3, session = self._conn[tok] self.session = session return s3 def get_delegated_s3pars(self, exp=3600): """Get temporary credentials from STS, appropriate for sending across a network. Only relevant where the key/secret were explicitly provided. Parameters ---------- exp : int Time in seconds that credentials are good for Returns ------- dict of parameters """ if self.anon: return {'anon': True} if self.token: # already has temporary cred return { 'key': self.key, 'secret': self.secret, 'token': self.token, 'anon': False } if self.key is None or self.secret is None: # automatic credentials return {'anon': False} sts = self.session.client('sts') cred = sts.get_session_token(DurationSeconds=exp)['Credentials'] return { 'key': cred['AccessKeyId'], 'secret': cred['SecretAccessKey'], 'token': cred['SessionToken'], 'anon': False } def __getstate__(self): d = self.__dict__.copy() del d['s3'] del d['session'] del d['_kwargs_helper'] logger.debug("Serialize with state: %s", d) return d def __setstate__(self, state): self.__dict__.update(state) self._conn = {} self.s3 = self.connect() self._kwargs_helper = ParamKwargsHelper(self.s3) def open(self, path, mode='rb', block_size=None, acl='', fill_cache=None, **kwargs): """ Open a file for reading or writing Parameters ---------- path: string Path of file on S3 mode: string One of 'rb' or 'wb' block_size: int Size of data-node blocks if reading fill_cache: bool If seeking to new a part of the file beyond the current buffer, with this True, the buffer will be filled between the sections to best support random access. When reading only a few specific chunks out of a file, performance may be better if False. acl: str Canned ACL to set when writing kwargs: dict-like Additional parameters used for s3 methods. Typically used for ServerSideEncryption. """ if block_size is None: block_size = self.default_block_size if fill_cache is None: fill_cache = self.default_fill_cache if 'b' not in mode: raise NotImplementedError("Text mode not supported, use mode='%s'" " and manage bytes" % (mode[0] + 'b')) return S3File(self, path, mode, block_size=block_size, acl=acl, fill_cache=fill_cache, s3_additional_kwargs=kwargs) def _lsdir(self, path, refresh=False): if path.startswith('s3://'): path = path[len('s3://'):] path = path.rstrip('/') bucket, prefix = split_path(path) prefix = prefix + '/' if prefix else "" if path not in self.dirs or refresh: try: pag = self.s3.get_paginator('list_objects_v2') it = pag.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/', **self.req_kw) files = [] dirs = None for i in it: dirs = dirs or i.get('CommonPrefixes', None) files.extend(i.get('Contents', [])) if dirs: files.extend([{ 'Key': l['Prefix'][:-1], 'Size': 0, 'StorageClass': "DIRECTORY" } for l in dirs]) files = [f for f in files if len(f['Key']) > len(prefix)] for f in files: f['Key'] = '/'.join([bucket, f['Key']]) except ClientError: # path not accessible files = [] self.dirs[path] = files return self.dirs[path] def _lsbuckets(self, refresh=False): if '' not in self.dirs or refresh: if self.anon: # cannot list buckets if not logged in return [] files = self.s3.list_buckets()['Buckets'] for f in files: f['Key'] = f['Name'] f['Size'] = 0 f['StorageClass'] = 'BUCKET' del f['Name'] self.dirs[''] = files return self.dirs[''] def _ls(self, path, refresh=False): """ List files in given bucket, or list of buckets. Listing is cached unless `refresh=True`. Note: only your buckets associated with the login will be listed by `ls('')`, not any public buckets (even if already accessed). Parameters ---------- path : string/bytes location at which to list files detail : bool (=True) if True, each list item is a dict of file properties; otherwise, returns list of filenames refresh : bool (=False) if False, look in local cache for file details first kwargs : dict additional arguments passed on """ if path.startswith('s3://'): path = path[len('s3://'):] if path in ['', '/']: return self._lsbuckets(refresh) else: return self._lsdir(path, refresh) def ls(self, path, detail=False, refresh=False, **kwargs): """ List single "directory" with or without details """ if path.startswith('s3://'): path = path[len('s3://'):] path = path.rstrip('/') files = self._ls(path, refresh=refresh) if not files: if split_path(path)[1]: files = [self.info(path, **kwargs)] elif path: raise FileNotFoundError(path) if detail: return files else: return [f['Key'] for f in files] def info(self, path, refresh=False, **kwargs): """ Detail on the specific file pointed to by path. Gets details only for a specific key, directories/buckets cannot be used with info. """ parent = path.rsplit('/', 1)[0] files = self._lsdir(parent, refresh=refresh) files = [ f for f in files if f['Key'] == path and f['StorageClass'] not in ['DIRECTORY', 'BUCKET'] ] if len(files) == 1: return files[0] else: try: bucket, key = split_path(path) out = self._call_s3(self.s3.head_object, kwargs, Bucket=bucket, Key=key, **self.req_kw) out = { 'ETag': out['ETag'], 'Key': '/'.join([bucket, key]), 'LastModified': out['LastModified'], 'Size': out['ContentLength'], 'StorageClass': "STANDARD" } return out except (ClientError, ParamValidationError): raise FileNotFoundError(path) _metadata_cache = {} def metadata(self, path, refresh=False, **kwargs): """ Return metadata of path. Metadata is cached unless `refresh=True`. Parameters ---------- path : string/bytes filename to get metadata for refresh : bool (=False) if False, look in local cache for file metadata first """ bucket, key = split_path(path) if refresh or path not in self._metadata_cache: response = self._call_s3(self.s3.head_object, kwargs, Bucket=bucket, Key=key, **self.req_kw) self._metadata_cache[path] = response['Metadata'] return self._metadata_cache[path] def getxattr(self, path, attr_name, **kwargs): """ Get an attribute from the metadata. Examples -------- >>> mys3fs.getxattr('mykey', 'attribute_1') # doctest: +SKIP 'value_1' """ xattr = self.metadata(path, **kwargs) if attr_name in xattr: return xattr[attr_name] return None def setxattr(self, path, copy_kwargs=None, **kw_args): """ Set metadata. Attributes have to be of the form documented in the `Metadata Reference`_. Parameters --------- kw_args : key-value pairs like field="value", where the values must be strings. Does not alter existing fields, unless the field appears here - if the value is None, delete the field. copy_args : dict, optional dictionary of additional params to use for the underlying s3.copy_object. Examples -------- >>> mys3file.setxattr(attribute_1='value1', attribute_2='value2') # doctest: +SKIP # Example for use with copy_args >>> mys3file.setxattr(copy_kwargs={'ContentType': 'application/pdf'}, attribute_1='value1') # doctest: +SKIP .. Metadata Reference: http://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html#object-metadata """ bucket, key = split_path(path) metadata = self.metadata(path) metadata.update(**kw_args) copy_kwargs = copy_kwargs or {} # remove all keys that are None for kw_key in kw_args: if kw_args[kw_key] is None: metadata.pop(kw_key, None) self._call_s3( self.s3.copy_object, copy_kwargs, CopySource="{}/{}".format(bucket, key), Bucket=bucket, Key=key, Metadata=metadata, MetadataDirective='REPLACE', ) # refresh metadata self._metadata_cache[path] = metadata def _walk(self, path, refresh=False): if path.startswith('s3://'): path = path[len('s3://'):] if path in ['', '/']: raise ValueError('Cannot walk all of S3') filenames = self._ls(path, refresh=refresh)[:] for f in filenames[:]: if f['StorageClass'] == 'DIRECTORY': filenames.extend(self._walk(f['Key'], refresh)) return [ f for f in filenames if f['StorageClass'] not in ['BUCKET', 'DIRECTORY'] ] def walk(self, path, refresh=False): """ Return all real keys below path """ return [f['Key'] for f in self._walk(path, refresh)] def chmod(self, path, acl, **kwargs): """ Set Access Control on a bucket/key See http://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl Parameters ---------- path : string the object to set acl : string the value of ACL to apply """ bucket, key = split_path(path) if key: if acl not in key_acls: raise ValueError('ACL not in %s', key_acls) self._call_s3(self.s3.put_object_acl, kwargs, Bucket=bucket, Key=key, ACL=acl) else: if acl not in buck_acls: raise ValueError('ACL not in %s', buck_acls) self._call_s3(self.s3.put_bucket_acl, kwargs, Bucket=bucket, ACL=acl) def glob(self, path): """ Find files by glob-matching. Note that the bucket part of the path must not contain a "*" """ path0 = path if path.startswith('s3://'): path = path[len('s3://'):] path = path.rstrip('/') bucket, key = split_path(path) if "*" in bucket: raise ValueError('Bucket cannot contain a "*"') if '*' not in path: path = path.rstrip('/') + '/*' if '/' in path[:path.index('*')]: ind = path[:path.index('*')].rindex('/') root = path[:ind + 1] else: root = '/' allfiles = self.walk(root) pattern = re.compile("^" + path.replace('//', '/').rstrip('/').replace( '*', '[^/]*').replace('?', '.') + "$") out = [ f for f in allfiles if re.match(pattern, f.replace('//', '/').rstrip('/')) ] if not out: out = self.ls(path0) return out def du(self, path, total=False, deep=False, **kwargs): """ Bytes in keys at path """ if deep: files = self.walk(path) files = [self.info(f, **kwargs) for f in files] else: files = self.ls(path, detail=True) if total: return sum(f.get('Size', 0) for f in files) else: return {p['Key']: p['Size'] for p in files} def exists(self, path): """ Does such a file/directory exist? """ bucket, key = split_path(path) if key or bucket not in self.ls(''): return not raises(FileNotFoundError, lambda: self.ls(path)) else: return True def cat(self, path, **kwargs): """ Returns contents of file """ with self.open(path, 'rb', **kwargs) as f: return f.read() def tail(self, path, size=1024, **kwargs): """ Return last bytes of file """ length = self.info(path, **kwargs)['Size'] if size > length: return self.cat(path, **kwargs) with self.open(path, 'rb', **kwargs) as f: f.seek(length - size) return f.read(size) def head(self, path, size=1024, **kwargs): """ Return first bytes of file """ with self.open(path, 'rb', block_size=size, **kwargs) as f: return f.read(size) def url(self, path, expires=3600, **kwargs): """ Generate presigned URL to access path by HTTP Parameters ---------- path : string the key path we are interested in expires : int the number of seconds this signature will be good for. """ bucket, key = split_path(path) return self.s3.generate_presigned_url(ClientMethod='get_object', Params=dict(Bucket=bucket, Key=key, **kwargs), ExpiresIn=expires) def get(self, path, filename, **kwargs): """ Stream data from file at path to local filename """ with self.open(path, 'rb', **kwargs) as f: with open(filename, 'wb') as f2: while True: data = f.read(f.blocksize) if len(data) == 0: break f2.write(data) def put(self, filename, path, **kwargs): """ Stream data from local filename to file at path """ with open(filename, 'rb') as f: with self.open(path, 'wb', **kwargs) as f2: while True: data = f.read(f2.blocksize) if len(data) == 0: break f2.write(data) def mkdir(self, path, acl="", **kwargs): """ Make new bucket or empty key """ self.touch(path, acl=acl, **kwargs) def rmdir(self, path, **kwargs): """ Remove empty key or bucket """ bucket, key = split_path(path) if (key and self.info(path)['Size'] == 0) or not key: self.rm(path, **kwargs) else: raise IOError('Path is not directory-like', path) def mv(self, path1, path2, **kwargs): """ Move file between locations on S3 """ self.copy(path1, path2, **kwargs) self.rm(path1) def merge(self, path, filelist, **kwargs): """ Create single S3 file from list of S3 files Uses multi-part, no data is downloaded. The original files are not deleted. Parameters ---------- path : str The final file to produce filelist : list of str The paths, in order, to assemble into the final file. """ bucket, key = split_path(path) mpu = self._call_s3(self.s3.create_multipart_upload, kwargs, Bucket=bucket, Key=key) out = [ self._call_s3(self.s3.upload_part_copy, kwargs, Bucket=bucket, Key=key, UploadId=mpu['UploadId'], CopySource=f, PartNumber=i + 1) for (i, f) in enumerate(filelist) ] parts = [{ 'PartNumber': i + 1, 'ETag': o['CopyPartResult']['ETag'] } for (i, o) in enumerate(out)] part_info = {'Parts': parts} self.s3.complete_multipart_upload(Bucket=bucket, Key=key, UploadId=mpu['UploadId'], MultipartUpload=part_info) self.invalidate_cache(path) def copy(self, path1, path2, **kwargs): """ Copy file between locations on S3 """ buc1, key1 = split_path(path1) buc2, key2 = split_path(path2) try: self._call_s3(self.s3.copy_object, kwargs, Bucket=buc2, Key=key2, CopySource='/'.join([buc1, key1])) except (ClientError, ParamValidationError): raise IOError('Copy failed', (path1, path2)) self.invalidate_cache(path2) def bulk_delete(self, pathlist, **kwargs): """ Remove multiple keys with one call Parameters ---------- pathlist : listof strings The keys to remove, must all be in the same bucket. """ if not pathlist: return buckets = {split_path(path)[0] for path in pathlist} if len(buckets) > 1: raise ValueError( "Bulk delete files should refer to only one bucket") bucket = buckets.pop() if len(pathlist) > 1000: for i in range((len(pathlist) // 1000) + 1): self.bulk_delete(pathlist[i * 1000:(i + 1) * 1000]) return delete_keys = { 'Objects': [{ 'Key': split_path(path)[1] } for path in pathlist] } try: self._call_s3(self.s3.delete_objects, kwargs, Bucket=bucket, Delete=delete_keys) for path in pathlist: self.invalidate_cache(path) except ClientError: raise IOError('Bulk delete failed') def rm(self, path, recursive=False, **kwargs): """ Remove keys and/or bucket. Parameters ---------- path : string The location to remove. recursive : bool (True) Whether to remove also all entries below, i.e., which are returned by `walk()`. """ if not self.exists(path): raise FileNotFoundError(path) if recursive: self.invalidate_cache(path) self.bulk_delete(self.walk(path), **kwargs) if not self.exists(path): return bucket, key = split_path(path) if key: try: self._call_s3(self.s3.delete_object, kwargs, Bucket=bucket, Key=key) except ClientError: raise IOError('Delete key failed', (bucket, key)) self.invalidate_cache(path) else: if not self.s3.list_objects(Bucket=bucket).get('Contents'): try: self.s3.delete_bucket(Bucket=bucket) except ClientError: raise IOError('Delete bucket failed', bucket) self.invalidate_cache(bucket) self.invalidate_cache('') else: raise IOError('Not empty', path) def invalidate_cache(self, path=None): if path is None: self.dirs.clear() else: self.dirs.pop(path, None) parent = path.rsplit('/', 1)[0] self.dirs.pop(parent, None) def touch(self, path, acl="", **kwargs): """ Create empty key If path is a bucket only, attempt to create bucket. """ bucket, key = split_path(path) if key: if acl and acl not in key_acls: raise ValueError('ACL not in %s', key_acls) self._call_s3(self.s3.put_object, kwargs, Bucket=bucket, Key=key, ACL=acl) self.invalidate_cache(path) else: if acl and acl not in buck_acls: raise ValueError('ACL not in %s', buck_acls) try: self.s3.create_bucket(Bucket=bucket, ACL=acl) self.invalidate_cache('') self.invalidate_cache(bucket) except (ClientError, ParamValidationError): raise IOError('Bucket create failed', path) def read_block(self, fn, offset, length, delimiter=None, **kwargs): """ Read a block of bytes from an S3 file Starting at ``offset`` of the file, read ``length`` bytes. If ``delimiter`` is set then we ensure that the read starts and stops at delimiter boundaries that follow the locations ``offset`` and ``offset + length``. If ``offset`` is zero then we start at zero. The bytestring returned WILL include the end delimiter string. If offset+length is beyond the eof, reads to eof. Parameters ---------- fn: string Path to filename on S3 offset: int Byte offset to start read length: int Number of bytes to read delimiter: bytes (optional) Ensure reading starts and stops at delimiter bytestring Examples -------- >>> s3.read_block('data/file.csv', 0, 13) # doctest: +SKIP b'Alice, 100\\nBo' >>> s3.read_block('data/file.csv', 0, 13, delimiter=b'\\n') # doctest: +SKIP b'Alice, 100\\nBob, 200\\n' Use ``length=None`` to read to the end of the file. >>> s3.read_block('data/file.csv', 0, None, delimiter=b'\\n') # doctest: +SKIP b'Alice, 100\\nBob, 200\\nCharlie, 300' See Also -------- distributed.utils.read_block """ with self.open(fn, 'rb', **kwargs) as f: size = f.info()['Size'] if length is None: length = size if offset + length > size: length = size - offset bytes = read_block(f, offset, length, delimiter) return bytes