Example #1
    def __init__(self, anon=False, key=None, secret=None, token=None,
                 use_ssl=True, client_kwargs=None, requester_pays=False,
                 default_block_size=None, default_fill_cache=True,
                 default_cache_type='bytes', version_aware=False, config_kwargs=None,
                 s3_additional_kwargs=None, session=None, username=None,
                 password=None, **kwargs):
        if key and username:
            raise KeyError('Supply either key or username, not both')
        if secret and password:
            raise KeyError('Supply secret or password, not both')
        if username:
            key = username
        if password:
            secret = password

        if self._cached:
            return
        super().__init__()
        self.anon = anon
        self.session = None
        self.passed_in_session = session
        if self.passed_in_session:
            self.session = self.passed_in_session
        self.key = key
        self.secret = secret
        self.token = token
        self.kwargs = kwargs

        if client_kwargs is None:
            client_kwargs = {}
        if config_kwargs is None:
            config_kwargs = {}
        self.default_block_size = default_block_size or self.default_block_size
        self.default_fill_cache = default_fill_cache
        self.default_cache_type = default_cache_type
        self.version_aware = version_aware
        self.client_kwargs = client_kwargs
        self.config_kwargs = config_kwargs
        self.req_kw = {'RequestPayer': 'requester'} if requester_pays else {}
        self.s3_additional_kwargs = s3_additional_kwargs or {}
        self.use_ssl = use_ssl
        self.s3 = self.connect()
        self._kwargs_helper = ParamKwargsHelper(self.s3)
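
A minimal construction sketch based on the signature above (the block size and the placeholder credentials are illustrative, not taken from the source):

    import boto3
    import s3fs

    # Reuse an existing boto3 session instead of letting S3FileSystem build one.
    session = boto3.Session()
    fs = s3fs.S3FileSystem(session=session, default_block_size=10 * 2**20)

    # username/password are accepted as aliases for key/secret; passing both
    # key and username (or both secret and password) raises KeyError.
    fs2 = s3fs.S3FileSystem(username='my-access-key-id', password='my-secret-key')
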
Example #2
File: core.py  Project: del82/s3fs
    def __init__(self,
                 anon=False,
                 key=None,
                 secret=None,
                 token=None,
                 use_ssl=True,
                 client_kwargs=None,
                 requester_pays=False,
                 default_block_size=None,
                 default_fill_cache=True,
                 config_kwargs=None,
                 s3_additional_kwargs=None,
                 **kwargs):
        self.anon = anon
        self.session = None
        self.key = key
        self.secret = secret
        self.token = token
        self.kwargs = kwargs

        if client_kwargs is None:
            client_kwargs = {}
        if default_block_size is not None:
            self.default_block_size = default_block_size
        if config_kwargs is None:
            config_kwargs = {}
        self.default_fill_cache = default_fill_cache
        self.client_kwargs = client_kwargs
        self.config_kwargs = config_kwargs
        self.dirs = {}
        self.req_kw = {'RequestPayer': 'requester'} if requester_pays else {}
        self.s3_additional_kwargs = s3_additional_kwargs or {}
        self.use_ssl = use_ssl
        self.s3 = self.connect()
        self._kwargs_helper = ParamKwargsHelper(self.s3)
        self._singleton[0] = self
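
This older constructor stores each new instance in the class-level ``_singleton`` slot. A short sketch of the resulting behaviour, assuming the pre-fsspec version of the class shown in Example #5 (which exposes the ``current()`` classmethod):

    import s3fs

    fs_a = s3fs.S3FileSystem(anon=True)
    fs_b = s3fs.S3FileSystem(anon=True)

    # The most recently constructed instance occupies the singleton slot,
    # so current() returns fs_b here.
    assert s3fs.S3FileSystem.current() is fs_b
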
Example #3
File: core.py  Project: jrbourbeau/s3fs
class S3FileSystem(AbstractFileSystem):
    """
    Access S3 as if it were a file system.

    This exposes a filesystem-like API (ls, cp, open, etc.) on top of S3
    storage.

    Provide credentials either explicitly (``key=``, ``secret=``) or depend
    on boto's credential methods. See boto3 documentation for more
    information. If no credentials are available, use ``anon=True``.

    Parameters
    ----------
    anon : bool (False)
        Whether to use anonymous connection (public buckets only). If False,
        uses the key/secret given, or boto's credential resolver (environment
        variables, config files, EC2 IAM server, in that order)
    key : string (None)
        If not anonymous, use this access key ID, if specified
    secret : string (None)
        If not anonymous, use this secret access key, if specified
    token : string (None)
        If not anonymous, use this security token, if specified
    use_ssl : bool (True)
        Whether to use SSL in connections to S3; may be faster without, but
        insecure
    s3_additional_kwargs : dict of parameters that are used when calling s3 api
        methods. Typically used for things like "ServerSideEncryption".
    client_kwargs : dict of parameters for the boto3 client
    requester_pays : bool (False)
        Whether to use requester-pays requests (adds ``RequestPayer='requester'``
        to S3 calls), for use with RequesterPays buckets.
    default_block_size : int (None)
        If given, the default block size value used for ``open()``, if no
        specific value is given at call time. The built-in default is 5MB.
    default_fill_cache : bool (True)
        Whether to use cache filling with open by default. Refer to
        ``S3File.open``.
    default_cache_type : string ('bytes')
        If given, the default cache_type value used for ``open()``. Set to "none"
        if no caching is desired. See fsspec's documentation for other available
        cache_type values. Default cache_type is 'bytes'.
    version_aware : bool (False)
        Whether to support bucket versioning.  If enabled, this will require the
        user to have the necessary IAM permissions for dealing with versioned
        objects.
    config_kwargs : dict of parameters passed to ``botocore.client.Config``
    kwargs : other parameters for boto3 session
    session : botocore Session object to be used for all connections.
        This session will be used in place of creating a new session inside S3FileSystem.


    Examples
    --------
    >>> s3 = S3FileSystem(anon=False)  # doctest: +SKIP
    >>> s3.ls('my-bucket/')  # doctest: +SKIP
    ['my-file.txt']

    >>> with s3.open('my-bucket/my-file.txt', mode='rb') as f:  # doctest: +SKIP
    ...     print(f.read())  # doctest: +SKIP
    b'Hello, world!'
    """
    root_marker = ""
    connect_timeout = 5
    read_timeout = 15
    default_block_size = 5 * 2**20
    protocol = 's3'

    def __init__(self,
                 anon=False,
                 key=None,
                 secret=None,
                 token=None,
                 use_ssl=True,
                 client_kwargs=None,
                 requester_pays=False,
                 default_block_size=None,
                 default_fill_cache=True,
                 default_cache_type='bytes',
                 version_aware=False,
                 config_kwargs=None,
                 s3_additional_kwargs=None,
                 session=None,
                 username=None,
                 password=None,
                 **kwargs):
        if key and username:
            raise KeyError('Supply either key or username, not both')
        if secret and password:
            raise KeyError('Supply secret or password, not both')
        if username:
            key = username
        if password:
            secret = password

        if self._cached:
            return
        super().__init__()
        self.anon = anon
        self.session = None
        self.passed_in_session = session
        if self.passed_in_session:
            self.session = self.passed_in_session
        self.key = key
        self.secret = secret
        self.token = token
        self.kwargs = kwargs

        if client_kwargs is None:
            client_kwargs = {}
        if config_kwargs is None:
            config_kwargs = {}
        self.default_block_size = default_block_size or self.default_block_size
        self.default_fill_cache = default_fill_cache
        self.default_cache_type = default_cache_type
        self.version_aware = version_aware
        self.client_kwargs = client_kwargs
        self.config_kwargs = config_kwargs
        self.req_kw = {'RequestPayer': 'requester'} if requester_pays else {}
        self.s3_additional_kwargs = s3_additional_kwargs or {}
        self.use_ssl = use_ssl
        self.s3 = self.connect()
        self._kwargs_helper = ParamKwargsHelper(self.s3)

    def _filter_kwargs(self, s3_method, kwargs):
        return self._kwargs_helper.filter_dict(s3_method.__name__, kwargs)

    def _call_s3(self, method, *akwarglist, **kwargs):
        kw2 = kwargs.copy()
        kw2.pop('Body', None)
        logger.debug("CALL: %s - %s - %s" % (method.__name__, akwarglist, kw2))
        additional_kwargs = self._get_s3_method_kwargs(method, *akwarglist,
                                                       **kwargs)
        return method(**additional_kwargs)

    def _get_s3_method_kwargs(self, method, *akwarglist, **kwargs):
        additional_kwargs = self.s3_additional_kwargs.copy()
        for akwargs in akwarglist:
            additional_kwargs.update(akwargs)
        # Add the normal kwargs in
        additional_kwargs.update(kwargs)
        # filter all kwargs
        return self._filter_kwargs(method, additional_kwargs)

    def connect(self, refresh=True):
        """
        Establish S3 connection object.

        Parameters
        ----------
        refresh : bool (True)
            Whether to create a new session/client, even if a previous one with
            the same parameters already exists. If False, an existing one will
            be used if possible.
        """
        if refresh is False:
            # back compat: we store whole FS instance now
            return self.s3
        anon, key, secret, kwargs, ckwargs, token, ssl = (self.anon, self.key,
                                                          self.secret,
                                                          self.kwargs,
                                                          self.client_kwargs,
                                                          self.token,
                                                          self.use_ssl)

        if self.anon:
            from botocore import UNSIGNED
            conf = Config(connect_timeout=self.connect_timeout,
                          read_timeout=self.read_timeout,
                          signature_version=UNSIGNED,
                          **self.config_kwargs)
            if not self.passed_in_session:
                self.session = boto3.Session(**self.kwargs)
        else:
            conf = Config(connect_timeout=self.connect_timeout,
                          read_timeout=self.read_timeout,
                          **self.config_kwargs)
            if not self.passed_in_session:
                self.session = boto3.Session(self.key, self.secret, self.token,
                                             **self.kwargs)

        logger.debug("Setting up s3fs instance")
        self.s3 = self.session.client('s3',
                                      config=conf,
                                      use_ssl=ssl,
                                      **self.client_kwargs)
        return self.s3
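
    # Usage sketch for connect() (illustrative; behaviour as read from the code
    # above, and no bucket access is needed just to build the client):
    #
    #     fs = S3FileSystem(anon=True)
    #     client = fs.connect(refresh=True)   # rebuild the session/client, stored on fs.s3
    #     same = fs.connect(refresh=False)    # back-compat path: returns the existing fs.s3
    #     assert same is fs.s3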

    def get_delegated_s3pars(self, exp=3600):
        """Get temporary credentials from STS, appropriate for sending across a
        network. Only relevant where the key/secret were explicitly provided.

        Parameters
        ----------
        exp : int
            Time in seconds that credentials are good for

        Returns
        -------
        dict of parameters
        """
        if self.anon:
            return {'anon': True}
        if self.token:  # already has temporary cred
            return {
                'key': self.key,
                'secret': self.secret,
                'token': self.token,
                'anon': False
            }
        if self.key is None or self.secret is None:  # automatic credentials
            return {'anon': False}
        sts = self.session.client('sts')
        cred = sts.get_session_token(DurationSeconds=exp)['Credentials']
        return {
            'key': cred['AccessKeyId'],
            'secret': cred['SecretAccessKey'],
            'token': cred['SessionToken'],
            'anon': False
        }
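
    # Usage sketch: hand short-lived credentials to another process or machine
    # (illustrative only; requires a real key/secret and access to STS):
    #
    #     fs = S3FileSystem(key='...', secret='...')
    #     pars = fs.get_delegated_s3pars(exp=900)   # temporary STS token, valid 15 minutes
    #     remote_fs = S3FileSystem(**pars)          # the dict keys match __init__ arguments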

    def _open(self,
              path,
              mode='rb',
              block_size=None,
              acl='',
              version_id=None,
              fill_cache=None,
              cache_type=None,
              autocommit=True,
              **kwargs):
        """ Open a file for reading or writing

        Parameters
        ----------
        path: string
            Path of file on S3
        mode: string
            One of 'r', 'w', 'a', 'rb', 'wb', or 'ab'. These have the same meaning
            as they do for the built-in `open` function.
        block_size: int
            Size of data-node blocks if reading
        fill_cache: bool
            If seeking to a new part of the file beyond the current buffer,
            with this True, the buffer will be filled between the sections to
            best support random access. When reading only a few specific chunks
            out of a file, performance may be better if False.
        acl: str
            Canned ACL to set when writing
        version_id : str
            Explicit version of the object to open.  This requires that the s3
            filesystem is version aware and bucket versioning is enabled on the
            relevant bucket.
        encoding : str
            The encoding to use if opening the file in text mode. The platform's
            default text encoding is used if not given.
        cache_type : str
            See fsspec's documentation for available cache_type values. Set to "none"
            if no caching is desired. If None, defaults to ``self.default_cache_type``.
        kwargs: dict-like
            Additional parameters used for s3 methods.  Typically used for
            ServerSideEncryption.
        """
        if block_size is None:
            block_size = self.default_block_size
        if fill_cache is None:
            fill_cache = self.default_fill_cache

        acl = acl or self.s3_additional_kwargs.get('ACL', '')
        kw = self.s3_additional_kwargs.copy()
        kw.update(kwargs)
        if not self.version_aware and version_id:
            raise ValueError(
                "version_id cannot be specified if the filesystem "
                "is not version aware")

        if cache_type is None:
            cache_type = self.default_cache_type

        return S3File(self,
                      path,
                      mode,
                      block_size=block_size,
                      acl=acl,
                      version_id=version_id,
                      fill_cache=fill_cache,
                      s3_additional_kwargs=kw,
                      cache_type=cache_type,
                      autocommit=autocommit)

    def _lsdir(self, path, refresh=False, max_items=None):
        if path.startswith('s3://'):
            path = path[len('s3://'):]
        path = path.rstrip('/')
        bucket, prefix = split_path(path)
        prefix = prefix + '/' if prefix else ""
        if path not in self.dircache or refresh:
            try:
                logger.debug("Get directory listing page for %s" % path)
                pag = self.s3.get_paginator('list_objects_v2')
                config = {}
                if max_items is not None:
                    config.update(MaxItems=max_items, PageSize=2 * max_items)
                it = pag.paginate(Bucket=bucket,
                                  Prefix=prefix,
                                  Delimiter='/',
                                  PaginationConfig=config,
                                  **self.req_kw)
                files = []
                dircache = []
                for i in it:
                    dircache.extend(i.get('CommonPrefixes', []))
                    for c in i.get('Contents', []):
                        c['type'] = 'file'
                        c['size'] = c['Size']
                        files.append(c)
                if dircache:
                    files.extend([{
                        'Key': l['Prefix'][:-1],
                        'Size': 0,
                        'StorageClass': "DIRECTORY",
                        'type': 'directory',
                        'size': 0
                    } for l in dircache])
                for f in files:
                    f['Key'] = '/'.join([bucket, f['Key']])
                    f['name'] = f['Key']
            except ClientError as e:
                raise translate_boto_error(e)

            self.dircache[path] = files
        return self.dircache[path]

    def mkdir(self, path, acl="", **kwargs):
        path = self._strip_protocol(path).rstrip('/')
        if not self._parent(path):
            if acl and acl not in buck_acls:
                raise ValueError('ACL not in %s' % buck_acls)
            try:
                params = {"Bucket": path, 'ACL': acl}
                region_name = (kwargs.get("region_name", None)
                               or self.client_kwargs.get("region_name", None))
                if region_name:
                    params['CreateBucketConfiguration'] = {
                        'LocationConstraint': region_name
                    }
                self.s3.create_bucket(**params)
                self.invalidate_cache('')
                self.invalidate_cache(path)
            except ClientError as e:
                raise translate_boto_error(e)
            except ParamValidationError as e:
                raise ValueError('Bucket create failed %r: %s' % (path, e))

    def rmdir(self, path):
        path = self._strip_protocol(path).rstrip('/')
        if not self._parent(path):
            try:
                self.s3.delete_bucket(Bucket=path)
            except ClientError as e:
                raise translate_boto_error(e)
            self.invalidate_cache(path)
            self.invalidate_cache('')

    def _lsbuckets(self, refresh=False):
        if '' not in self.dircache or refresh:
            if self.anon:
                # cannot list buckets if not logged in
                return []
            try:
                files = self.s3.list_buckets()['Buckets']
            except ClientError:
                # listbucket permission missing
                return []
            for f in files:
                f['Key'] = f['Name']
                f['Size'] = 0
                f['StorageClass'] = 'BUCKET'
                f['size'] = 0
                f['type'] = 'directory'
                f['name'] = f['Name']
                del f['Name']
            self.dircache[''] = files
        return self.dircache['']

    def _ls(self, path, refresh=False):
        """ List files in given bucket, or list of buckets.

        Listing is cached unless `refresh=True`.

        Note: only your buckets associated with the login will be listed by
        `ls('')`, not any public buckets (even if already accessed).

        Parameters
        ----------
        path : string/bytes
            location at which to list files
        refresh : bool (=False)
            if False, look in local cache for file details first
        """
        if path.startswith('s3://'):
            path = path[len('s3://'):]
        if path in ['', '/']:
            return self._lsbuckets(refresh)
        else:
            return self._lsdir(path, refresh)

    def exists(self, path):
        if path in ['', '/']:
            # the root always exists, even if anon
            return True
        bucket, key = split_path(path)
        if key:
            return super().exists(path)
        else:
            try:
                self.ls(path)
                return True
            except FileNotFoundError:
                return False

    def touch(self, path, truncate=True, data=None, **kwargs):
        """Create empty file or truncate"""
        bucket, key = split_path(path)
        if not truncate and self.exists(path):
            raise ValueError("S3 does not support touching existent files")
        try:
            self._call_s3(self.s3.put_object, kwargs, Bucket=bucket, Key=key)
        except ClientError as ex:
            raise translate_boto_error(ex)
        self.invalidate_cache(self._parent(path))

    def info(self, path, version_id=None):
        if path in ['/', '']:
            return {'name': path, 'size': 0, 'type': 'directory'}
        kwargs = self.kwargs.copy()
        if version_id is not None:
            if not self.version_aware:
                raise ValueError("version_id cannot be specified if the "
                                 "filesystem is not version aware")
            kwargs['VersionId'] = version_id
        if self.version_aware:
            try:
                bucket, key = split_path(path)
                out = self._call_s3(self.s3.head_object,
                                    kwargs,
                                    Bucket=bucket,
                                    Key=key,
                                    **self.req_kw)
                return {
                    'ETag': out['ETag'],
                    'Key': '/'.join([bucket, key]),
                    'LastModified': out['LastModified'],
                    'Size': out['ContentLength'],
                    'size': out['ContentLength'],
                    'path': '/'.join([bucket, key]),
                    'StorageClass': "STANDARD",
                    'VersionId': out.get('VersionId')
                }
            except ClientError as e:
                ee = translate_boto_error(e)
                # This could have failed since the thing we are looking for is a prefix.
                if isinstance(ee, FileNotFoundError):
                    return super().info(path)
                else:
                    raise ee
            except ParamValidationError as e:
                raise ValueError('Failed to head path %r: %s' % (path, e))
        return super().info(path)

    def ls(self, path, detail=False, refresh=False, **kwargs):
        """ List single "directory" with or without details

        Parameters
        ----------
        path : string/bytes
            location at which to list files
        detail : bool (=False)
            if True, each list item is a dict of file properties;
            otherwise, returns list of filenames
        refresh : bool (=False)
            if False, look in local cache for file details first
        kwargs : dict
            additional arguments passed on
        """
        path = self._strip_protocol(path).rstrip('/')
        files = self._ls(path, refresh=refresh)
        if not files:
            files = self._ls(self._parent(path), refresh=refresh)
            files = [
                o for o in files
                if o['name'].rstrip('/') == path and o['type'] != 'directory'
            ]
        if detail:
            return files
        else:
            return list(sorted(set([f['name'] for f in files])))

    def object_version_info(self, path, **kwargs):
        if not self.version_aware:
            raise ValueError("version specific functionality is disabled for "
                             "non-version aware filesystems")
        bucket, key = split_path(path)
        kwargs = {}
        out = {'IsTruncated': True}
        versions = []
        while out['IsTruncated']:
            out = self._call_s3(self.s3.list_object_versions,
                                kwargs,
                                Bucket=bucket,
                                Prefix=key,
                                **self.req_kw)
            versions.extend(out['Versions'])
            kwargs['VersionIdMarker'] = out.get('NextVersionIdMarker', '')
        return versions

    _metadata_cache = {}

    def metadata(self, path, refresh=False, **kwargs):
        """ Return metadata of path.

        Metadata is cached unless `refresh=True`.

        Parameters
        ----------
        path : string/bytes
            filename to get metadata for
        refresh : bool (=False)
            if False, look in local cache for file metadata first
        """
        bucket, key = split_path(path)

        if refresh or path not in self._metadata_cache:
            response = self._call_s3(self.s3.head_object,
                                     kwargs,
                                     Bucket=bucket,
                                     Key=key,
                                     **self.req_kw)
            self._metadata_cache[path] = response['Metadata']

        return self._metadata_cache[path]

    def get_tags(self, path):
        """Retrieve tag key/values for the given path

        Returns
        -------
        {str: str}
        """
        bucket, key = split_path(path)
        response = self._call_s3(self.s3.get_object_tagging,
                                 Bucket=bucket,
                                 Key=key)
        return {v['Key']: v['Value'] for v in response['TagSet']}

    def put_tags(self, path, tags, mode='o'):
        """Set tags for given existing key

        Tags are a str:str mapping that can be attached to any key, see
        https://docs.aws.amazon.com/awsaccountbilling/latest/aboutv2/allocation-tag-restrictions.html

        This is similar to, but distinct from, key metadata, which is usually
        set at key creation time.

        Parameters
        ----------
        path: str
            Existing key to attach tags to
        tags: dict str, str
            Tags to apply.
        mode:
            One of 'o' or 'm'
            'o': Will over-write any existing tags.
            'm': Will merge in new tags with existing tags.  Incurs two remote
            calls.
        """
        bucket, key = split_path(path)

        if mode == 'm':
            existing_tags = self.get_tags(path=path)
            existing_tags.update(tags)
            new_tags = [{
                'Key': k,
                'Value': v
            } for k, v in existing_tags.items()]
        elif mode == 'o':
            new_tags = [{'Key': k, 'Value': v} for k, v in tags.items()]
        else:
            raise ValueError("Mode must be {'o', 'm'}, not %s" % mode)

        tag = {'TagSet': new_tags}
        self._call_s3(self.s3.put_object_tagging,
                      Bucket=bucket,
                      Key=key,
                      Tagging=tag)
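
    # Usage sketch for the two tagging modes (bucket/key names are hypothetical):
    #
    #     fs.put_tags('my-bucket/data.csv', {'team': 'analytics'})        # 'o': overwrite all tags
    #     fs.put_tags('my-bucket/data.csv', {'stage': 'raw'}, mode='m')   # 'm': merge with existing
    #     fs.get_tags('my-bucket/data.csv')
    #     # -> {'team': 'analytics', 'stage': 'raw'}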

    def getxattr(self, path, attr_name, **kwargs):
        """ Get an attribute from the metadata.

        Examples
        --------
        >>> mys3fs.getxattr('mykey', 'attribute_1')  # doctest: +SKIP
        'value_1'
        """
        xattr = self.metadata(path, **kwargs)
        if attr_name in xattr:
            return xattr[attr_name]
        return None

    def setxattr(self, path, copy_kwargs=None, **kw_args):
        """ Set metadata.

        Attributes have to be of the form documented in the
        `Metadata Reference`_.

        Parameters
        ----------
        kw_args : key-value pairs like field="value", where the values must be
            strings. Does not alter existing fields, unless
            the field appears here - if the value is None, delete the
            field.
        copy_kwargs : dict, optional
            dictionary of additional params to use for the underlying
            s3.copy_object.

        Examples
        --------
        >>> mys3file.setxattr(attribute_1='value1', attribute_2='value2')  # doctest: +SKIP
        # Example for use with copy_args
        >>> mys3file.setxattr(copy_kwargs={'ContentType': 'application/pdf'},
        ...     attribute_1='value1')  # doctest: +SKIP


        .. _Metadata Reference:
           http://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html#object-metadata
        """

        bucket, key = split_path(path)
        metadata = self.metadata(path)
        metadata.update(**kw_args)
        copy_kwargs = copy_kwargs or {}

        # remove all keys that are None
        for kw_key in kw_args:
            if kw_args[kw_key] is None:
                metadata.pop(kw_key, None)

        self._call_s3(
            self.s3.copy_object,
            copy_kwargs,
            CopySource="{}/{}".format(bucket, key),
            Bucket=bucket,
            Key=key,
            Metadata=metadata,
            MetadataDirective='REPLACE',
        )

        # refresh metadata
        self._metadata_cache[path] = metadata

    def chmod(self, path, acl, **kwargs):
        """ Set Access Control on a bucket/key

        See http://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl

        Parameters
        ----------
        path : string
            the object to set
        acl : string
            the value of ACL to apply
        """
        bucket, key = split_path(path)
        if key:
            if acl not in key_acls:
                raise ValueError('ACL not in %s' % key_acls)
            self._call_s3(self.s3.put_object_acl,
                          kwargs,
                          Bucket=bucket,
                          Key=key,
                          ACL=acl)
        else:
            if acl not in buck_acls:
                raise ValueError('ACL not in %s' % buck_acls)
            self._call_s3(self.s3.put_bucket_acl,
                          kwargs,
                          Bucket=bucket,
                          ACL=acl)

    def url(self, path, expires=3600, **kwargs):
        """ Generate presigned URL to access path by HTTP

        Parameters
        ----------
        path : string
            the key path we are interested in
        expires : int
            the number of seconds this signature will be good for.
        """
        bucket, key = split_path(path)
        return self.s3.generate_presigned_url(ClientMethod='get_object',
                                              Params=dict(Bucket=bucket,
                                                          Key=key,
                                                          **kwargs),
                                              ExpiresIn=expires)
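
    # Usage sketch: fetch an object over plain HTTP via the presigned URL
    # (requests is used purely for illustration; any HTTP client works):
    #
    #     import requests
    #     link = fs.url('my-bucket/my-file.txt', expires=600)
    #     data = requests.get(link).content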

    def merge(self, path, filelist, **kwargs):
        """ Create single S3 file from list of S3 files

        Uses multi-part, no data is downloaded. The original files are
        not deleted.

        Parameters
        ----------
        path : str
            The final file to produce
        filelist : list of str
            The paths, in order, to assemble into the final file.
        """
        bucket, key = split_path(path)
        mpu = self._call_s3(self.s3.create_multipart_upload,
                            kwargs,
                            Bucket=bucket,
                            Key=key)
        out = [
            self._call_s3(self.s3.upload_part_copy,
                          kwargs,
                          Bucket=bucket,
                          Key=key,
                          UploadId=mpu['UploadId'],
                          CopySource=f,
                          PartNumber=i + 1) for (i, f) in enumerate(filelist)
        ]
        parts = [{
            'PartNumber': i + 1,
            'ETag': o['CopyPartResult']['ETag']
        } for (i, o) in enumerate(out)]
        part_info = {'Parts': parts}
        self.s3.complete_multipart_upload(Bucket=bucket,
                                          Key=key,
                                          UploadId=mpu['UploadId'],
                                          MultipartUpload=part_info)
        self.invalidate_cache(path)
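
    # Usage sketch: server-side concatenation of already-uploaded objects
    # (paths are hypothetical; note that every part except the last normally
    # has to meet S3's multipart minimum of 5 MB for the copy to succeed):
    #
    #     fs.merge('my-bucket/combined.bin',
    #              ['my-bucket/part-0.bin', 'my-bucket/part-1.bin'])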

    def copy_basic(self, path1, path2, **kwargs):
        """ Copy file between locations on S3 """
        buc1, key1 = split_path(path1)
        buc2, key2 = split_path(path2)
        try:
            self._call_s3(self.s3.copy_object,
                          kwargs,
                          Bucket=buc2,
                          Key=key2,
                          CopySource='/'.join([buc1, key1]))
        except ClientError as e:
            raise translate_boto_error(e)
        except ParamValidationError as e:
            raise ValueError('Copy failed (%r -> %r): %s' % (path1, path2, e))

    def copy_managed(self, path1, path2, **kwargs):
        buc1, key1 = split_path(path1)
        buc2, key2 = split_path(path2)
        copy_source = {'Bucket': buc1, 'Key': key1}
        try:
            self.s3.copy(CopySource=copy_source,
                         Bucket=buc2,
                         Key=key2,
                         ExtraArgs=self._get_s3_method_kwargs(
                             self.s3.copy_object, kwargs))
        except ClientError as e:
            raise translate_boto_error(e)
        except ParamValidationError as e:
            raise ValueError('Copy failed (%r -> %r): %s' % (path1, path2, e))

    def copy(self, path1, path2, **kwargs):
        self.copy_managed(path1, path2, **kwargs)
        self.invalidate_cache(path2)

    def bulk_delete(self, pathlist, **kwargs):
        """
        Remove multiple keys with one call

        Parameters
        ----------
        pathlist : list of strings
            The keys to remove, must all be in the same bucket.
        """
        if not pathlist:
            return
        buckets = {split_path(path)[0] for path in pathlist}
        if len(buckets) > 1:
            raise ValueError("Bulk delete files should refer to only one "
                             "bucket")
        bucket = buckets.pop()
        if len(pathlist) > 1000:
            for i in range((len(pathlist) // 1000) + 1):
                self.bulk_delete(pathlist[i * 1000:(i + 1) * 1000])
            return
        delete_keys = {
            'Objects': [{
                'Key': split_path(path)[1]
            } for path in pathlist]
        }
        for path in pathlist:
            self.invalidate_cache(self._parent(path))
        try:
            self._call_s3(self.s3.delete_objects,
                          kwargs,
                          Bucket=bucket,
                          Delete=delete_keys)
        except ClientError as e:
            raise translate_boto_error(e)
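
    # Usage sketch (keys are hypothetical; all keys must live in one bucket, and
    # lists longer than 1000 entries are split into multiple delete_objects calls):
    #
    #     fs.bulk_delete(['my-bucket/tmp/a.csv', 'my-bucket/tmp/b.csv'])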

    def rm(self, path, recursive=False, **kwargs):
        """
        Remove keys and/or bucket.

        Parameters
        ----------
        path : string
            The location to remove.
        recursive : bool (False)
            Whether to also remove all entries below, i.e., those returned
            by `walk()`.
        """
        bucket, key = split_path(path)
        if recursive:
            files = self.find(path, maxdepth=None)
            if key and not files:
                raise FileNotFoundError(path)
            self.bulk_delete(files, **kwargs)
            if not key:
                self.rmdir(bucket)
            return
        if key:
            if not self.exists(path):
                raise FileNotFoundError(path)
            try:
                self._call_s3(self.s3.delete_object,
                              kwargs,
                              Bucket=bucket,
                              Key=key)
            except ClientError as e:
                raise translate_boto_error(e)
            self.invalidate_cache(self._parent(path))
        else:
            if self.exists(bucket):
                try:
                    self.s3.delete_bucket(Bucket=bucket)
                except BotoCoreError as e:
                    raise IOError('Delete bucket %r failed: %s' % (bucket, e))
                self.invalidate_cache(bucket)
                self.invalidate_cache('')
            else:
                raise FileNotFoundError(path)

    def invalidate_cache(self, path=None):
        if path is None:
            self.dircache.clear()
        else:
            path = self._strip_protocol(path)
            self.dircache.pop(path, None)
            self.dircache.pop(self._parent(path), None)

    def walk(self, path, maxdepth=None, **kwargs):
        if path in ['', '*', 's3://']:
            raise ValueError('Cannot crawl all of S3')
        return super().walk(path, maxdepth=maxdepth, **kwargs)
Example #4
 def __setstate__(self, state):
     self.__dict__.update(state)
     self.s3 = self.connect()
     self.dircache = {}
     self._kwargs_helper = ParamKwargsHelper(self.s3)
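
Together with the matching __getstate__ (shown in Example #5), this makes the filesystem picklable, for example for shipping to worker processes. A minimal round-trip sketch:

    import pickle
    import s3fs

    fs = s3fs.S3FileSystem(anon=True)
    blob = pickle.dumps(fs)    # __getstate__ drops the client, session and kwargs helper
    fs2 = pickle.loads(blob)   # __setstate__ reconnects and starts with an empty dircache
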
Example #5
File: core.py  Project: del82/s3fs
class S3FileSystem(object):
    """
    Access S3 as if it were a file system.

    This exposes a filesystem-like API (ls, cp, open, etc.) on top of S3
    storage.

    Provide credentials either explicitly (``key=``, ``secret=``) or depend
    on boto's credential methods. See boto3 documentation for more
    information. If no credentials are available, use ``anon=True``.

    Parameters
    ----------
    anon : bool (False)
        Whether to use anonymous connection (public buckets only). If False,
        uses the key/secret given, or boto's credential resolver (environment
        variables, config files, EC2 IAM server, in that order)
    key : string (None)
        If not anonymous, use this access key ID, if specified
    secret : string (None)
        If not anonymous, use this secret access key, if specified
    token : string (None)
        If not anonymous, use this security token, if specified
    use_ssl : bool (True)
        Whether to use SSL in connections to S3; may be faster without, but
        insecure
    s3_additional_kwargs : dict of parameters that are used when calling s3 api methods.
           Typically used for things like "ServerSideEncryption".
    client_kwargs : dict of parameters for the boto3 client
    requester_pays : bool (False)
        Whether to use requester-pays requests (adds ``RequestPayer='requester'``
        to S3 calls), for use with RequesterPays buckets.
    default_block_size : int (None)
        If given, the default block size value used for ``open()``, if no
        specific value is given at call time. The built-in default is 5MB.
    default_fill_cache : bool (True)
        Whether to use cache filling with open by default. Refer to
        ``S3File.open``.
    config_kwargs : dict of parameters passed to ``botocore.client.Config``
    kwargs : other parameters for boto3 session

    Examples
    --------
    >>> s3 = S3FileSystem(anon=False)  # doctest: +SKIP
    >>> s3.ls('my-bucket/')  # doctest: +SKIP
    ['my-file.txt']

    >>> with s3.open('my-bucket/my-file.txt', mode='rb') as f:  # doctest: +SKIP
    ...     print(f.read())  # doctest: +SKIP
    b'Hello, world!'
    """
    _conn = {}
    _singleton = [None]
    connect_timeout = 5
    read_timeout = 15
    default_block_size = 5 * 2**20

    def __init__(self,
                 anon=False,
                 key=None,
                 secret=None,
                 token=None,
                 use_ssl=True,
                 client_kwargs=None,
                 requester_pays=False,
                 default_block_size=None,
                 default_fill_cache=True,
                 config_kwargs=None,
                 s3_additional_kwargs=None,
                 **kwargs):
        self.anon = anon
        self.session = None
        self.key = key
        self.secret = secret
        self.token = token
        self.kwargs = kwargs

        if client_kwargs is None:
            client_kwargs = {}
        if default_block_size is not None:
            self.default_block_size = default_block_size
        if config_kwargs is None:
            config_kwargs = {}
        self.default_fill_cache = default_fill_cache
        self.client_kwargs = client_kwargs
        self.config_kwargs = config_kwargs
        self.dirs = {}
        self.req_kw = {'RequestPayer': 'requester'} if requester_pays else {}
        self.s3_additional_kwargs = s3_additional_kwargs or {}
        self.use_ssl = use_ssl
        self.s3 = self.connect()
        self._kwargs_helper = ParamKwargsHelper(self.s3)
        self._singleton[0] = self

    def _filter_kwargs(self, s3_method, kwargs):
        return self._kwargs_helper.filter_dict(s3_method.__name__, kwargs)

    def _call_s3(self, method, *akwarglist, **kwargs):
        additional_kwargs = self.s3_additional_kwargs.copy()
        for akwargs in akwarglist:
            additional_kwargs.update(self._filter_kwargs(method, akwargs))
        # Add the normal kwargs in
        additional_kwargs.update(kwargs)
        return method(**additional_kwargs)

    @classmethod
    def current(cls):
        """ Return the most recently created S3FileSystem

        If no S3FileSystem has been created, then create one
        """
        if not cls._singleton[0]:
            return cls()
        else:
            return cls._singleton[0]

    def connect(self, refresh=False):
        """
        Establish S3 connection object.

        Parameters
        ----------
        refresh : bool (False)
            Whether to create a new session/client, even if a cached one with
            the same parameters already exists
        """
        anon, key, secret, kwargs, ckwargs, token, ssl = (self.anon, self.key,
                                                          self.secret,
                                                          self.kwargs,
                                                          self.client_kwargs,
                                                          self.token,
                                                          self.use_ssl)

        # Include the current PID in the connection key so that different
        # SSL connections are made for each process.
        tok = tokenize(anon, key, secret, kwargs, ckwargs, token, ssl,
                       os.getpid())
        if refresh:
            self._conn.pop(tok, None)
        if tok not in self._conn:
            logger.debug("Open S3 connection.  Anonymous: %s", self.anon)
            if self.anon:
                from botocore import UNSIGNED
                conf = Config(connect_timeout=self.connect_timeout,
                              read_timeout=self.read_timeout,
                              signature_version=UNSIGNED,
                              **self.config_kwargs)
                self.session = boto3.Session(**self.kwargs)
            else:
                conf = Config(connect_timeout=self.connect_timeout,
                              read_timeout=self.read_timeout,
                              **self.config_kwargs)
                self.session = boto3.Session(self.key, self.secret, self.token,
                                             **self.kwargs)
            s3 = self.session.client('s3',
                                     config=conf,
                                     use_ssl=ssl,
                                     **self.client_kwargs)
            self._conn[tok] = (s3, self.session)
        else:
            s3, session = self._conn[tok]
            self.session = session
        return s3

    def get_delegated_s3pars(self, exp=3600):
        """Get temporary credentials from STS, appropriate for sending across a
        network. Only relevant where the key/secret were explicitly provided.

        Parameters
        ----------
        exp : int
            Time in seconds that credentials are good for

        Returns
        -------
        dict of parameters
        """
        if self.anon:
            return {'anon': True}
        if self.token:  # already has temporary cred
            return {
                'key': self.key,
                'secret': self.secret,
                'token': self.token,
                'anon': False
            }
        if self.key is None or self.secret is None:  # automatic credentials
            return {'anon': False}
        sts = self.session.client('sts')
        cred = sts.get_session_token(DurationSeconds=exp)['Credentials']
        return {
            'key': cred['AccessKeyId'],
            'secret': cred['SecretAccessKey'],
            'token': cred['SessionToken'],
            'anon': False
        }

    def __getstate__(self):
        d = self.__dict__.copy()
        del d['s3']
        del d['session']
        del d['_kwargs_helper']
        logger.debug("Serialize with state: %s", d)
        return d

    def __setstate__(self, state):
        self.__dict__.update(state)
        self._conn = {}
        self.s3 = self.connect()
        self._kwargs_helper = ParamKwargsHelper(self.s3)

    def open(self,
             path,
             mode='rb',
             block_size=None,
             acl='',
             fill_cache=None,
             **kwargs):
        """ Open a file for reading or writing

        Parameters
        ----------
        path: string
            Path of file on S3
        mode: string
            One of 'rb' or 'wb'
        block_size: int
            Size of data-node blocks if reading
        fill_cache: bool
            If seeking to a new part of the file beyond the current buffer,
            with this True, the buffer will be filled between the sections to
            best support random access. When reading only a few specific chunks
            out of a file, performance may be better if False.
        acl: str
            Canned ACL to set when writing
        kwargs: dict-like
            Additional parameters used for s3 methods.  Typically used for ServerSideEncryption.
        """
        if block_size is None:
            block_size = self.default_block_size
        if fill_cache is None:
            fill_cache = self.default_fill_cache
        if 'b' not in mode:
            raise NotImplementedError("Text mode not supported, use mode='%s'"
                                      " and manage bytes" % (mode[0] + 'b'))
        return S3File(self,
                      path,
                      mode,
                      block_size=block_size,
                      acl=acl,
                      fill_cache=fill_cache,
                      s3_additional_kwargs=kwargs)

    def _lsdir(self, path, refresh=False):
        if path.startswith('s3://'):
            path = path[len('s3://'):]
        path = path.rstrip('/')
        bucket, prefix = split_path(path)
        prefix = prefix + '/' if prefix else ""
        if path not in self.dirs or refresh:
            try:
                pag = self.s3.get_paginator('list_objects_v2')
                it = pag.paginate(Bucket=bucket,
                                  Prefix=prefix,
                                  Delimiter='/',
                                  **self.req_kw)
                files = []
                dirs = None
                for i in it:
                    dirs = dirs or i.get('CommonPrefixes', None)
                    files.extend(i.get('Contents', []))
                if dirs:
                    files.extend([{
                        'Key': l['Prefix'][:-1],
                        'Size': 0,
                        'StorageClass': "DIRECTORY"
                    } for l in dirs])
                files = [f for f in files if len(f['Key']) > len(prefix)]
                for f in files:
                    f['Key'] = '/'.join([bucket, f['Key']])
            except ClientError:
                # path not accessible
                files = []
            self.dirs[path] = files
        return self.dirs[path]

    def _lsbuckets(self, refresh=False):
        if '' not in self.dirs or refresh:
            if self.anon:
                # cannot list buckets if not logged in
                return []
            files = self.s3.list_buckets()['Buckets']
            for f in files:
                f['Key'] = f['Name']
                f['Size'] = 0
                f['StorageClass'] = 'BUCKET'
                del f['Name']
            self.dirs[''] = files
        return self.dirs['']

    def _ls(self, path, refresh=False):
        """ List files in given bucket, or list of buckets.

        Listing is cached unless `refresh=True`.

        Note: only your buckets associated with the login will be listed by
        `ls('')`, not any public buckets (even if already accessed).

        Parameters
        ----------
        path : string/bytes
            location at which to list files
        refresh : bool (=False)
            if False, look in local cache for file details first
        """
        if path.startswith('s3://'):
            path = path[len('s3://'):]
        if path in ['', '/']:
            return self._lsbuckets(refresh)
        else:
            return self._lsdir(path, refresh)

    def ls(self, path, detail=False, refresh=False, **kwargs):
        """ List single "directory" with or without details """
        if path.startswith('s3://'):
            path = path[len('s3://'):]
        path = path.rstrip('/')
        files = self._ls(path, refresh=refresh)
        if not files:
            if split_path(path)[1]:
                files = [self.info(path, **kwargs)]
            elif path:
                raise FileNotFoundError(path)
        if detail:
            return files
        else:
            return [f['Key'] for f in files]

    def info(self, path, refresh=False, **kwargs):
        """ Detail on the specific file pointed to by path.

        Gets details only for a specific key, directories/buckets cannot be
        used with info.
        """
        parent = path.rsplit('/', 1)[0]
        files = self._lsdir(parent, refresh=refresh)
        files = [
            f for f in files if f['Key'] == path
            and f['StorageClass'] not in ['DIRECTORY', 'BUCKET']
        ]
        if len(files) == 1:
            return files[0]
        else:
            try:
                bucket, key = split_path(path)
                out = self._call_s3(self.s3.head_object,
                                    kwargs,
                                    Bucket=bucket,
                                    Key=key,
                                    **self.req_kw)
                out = {
                    'ETag': out['ETag'],
                    'Key': '/'.join([bucket, key]),
                    'LastModified': out['LastModified'],
                    'Size': out['ContentLength'],
                    'StorageClass': "STANDARD"
                }
                return out
            except (ClientError, ParamValidationError):
                raise FileNotFoundError(path)

    _metadata_cache = {}

    def metadata(self, path, refresh=False, **kwargs):
        """ Return metadata of path.

        Metadata is cached unless `refresh=True`.

        Parameters
        ----------
        path : string/bytes
            filename to get metadata for
        refresh : bool (=False)
            if False, look in local cache for file metadata first
        """
        bucket, key = split_path(path)

        if refresh or path not in self._metadata_cache:
            response = self._call_s3(self.s3.head_object,
                                     kwargs,
                                     Bucket=bucket,
                                     Key=key,
                                     **self.req_kw)
            self._metadata_cache[path] = response['Metadata']

        return self._metadata_cache[path]

    def getxattr(self, path, attr_name, **kwargs):
        """ Get an attribute from the metadata.

        Examples
        --------
        >>> mys3fs.getxattr('mykey', 'attribute_1')  # doctest: +SKIP
        'value_1'
        """
        xattr = self.metadata(path, **kwargs)
        if attr_name in xattr:
            return xattr[attr_name]
        return None

    def setxattr(self, path, copy_kwargs=None, **kw_args):
        """ Set metadata.

        Attributes have to be of the form documented in the `Metadata Reference`_.

        Parameters
        ----------
        kw_args : key-value pairs like field="value", where the values must be
            strings. Does not alter existing fields, unless the field appears
            here - if the value is None, delete the field.
        copy_kwargs : dict, optional
            dictionary of additional params to use for the underlying
            s3.copy_object.

        Examples
        --------
        >>> mys3file.setxattr(attribute_1='value1', attribute_2='value2')  # doctest: +SKIP
        # Example for use with copy_args
        >>> mys3file.setxattr(copy_kwargs={'ContentType': 'application/pdf'}, attribute_1='value1')  # doctest: +SKIP
        

        .. _Metadata Reference:
           http://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html#object-metadata
        """

        bucket, key = split_path(path)
        metadata = self.metadata(path)
        metadata.update(**kw_args)
        copy_kwargs = copy_kwargs or {}

        # remove all keys that are None
        for kw_key in kw_args:
            if kw_args[kw_key] is None:
                metadata.pop(kw_key, None)

        self._call_s3(
            self.s3.copy_object,
            copy_kwargs,
            CopySource="{}/{}".format(bucket, key),
            Bucket=bucket,
            Key=key,
            Metadata=metadata,
            MetadataDirective='REPLACE',
        )

        # refresh metadata
        self._metadata_cache[path] = metadata

    def _walk(self, path, refresh=False):
        if path.startswith('s3://'):
            path = path[len('s3://'):]
        if path in ['', '/']:
            raise ValueError('Cannot walk all of S3')
        filenames = self._ls(path, refresh=refresh)[:]
        for f in filenames[:]:
            if f['StorageClass'] == 'DIRECTORY':
                filenames.extend(self._walk(f['Key'], refresh))
        return [
            f for f in filenames
            if f['StorageClass'] not in ['BUCKET', 'DIRECTORY']
        ]

    def walk(self, path, refresh=False):
        """ Return all real keys below path """
        return [f['Key'] for f in self._walk(path, refresh)]

    def chmod(self, path, acl, **kwargs):
        """ Set Access Control on a bucket/key

        See http://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl

        Parameters
        ----------
        path : string
            the object to set
        acl : string
            the value of ACL to apply
        """
        bucket, key = split_path(path)
        if key:
            if acl not in key_acls:
                raise ValueError('ACL not in %s' % key_acls)
            self._call_s3(self.s3.put_object_acl,
                          kwargs,
                          Bucket=bucket,
                          Key=key,
                          ACL=acl)
        else:
            if acl not in buck_acls:
                raise ValueError('ACL not in %s' % buck_acls)
            self._call_s3(self.s3.put_bucket_acl,
                          kwargs,
                          Bucket=bucket,
                          ACL=acl)

    def glob(self, path):
        """
        Find files by glob-matching.

        Note that the bucket part of the path must not contain a "*"
        """
        path0 = path
        if path.startswith('s3://'):
            path = path[len('s3://'):]
        path = path.rstrip('/')
        bucket, key = split_path(path)
        if "*" in bucket:
            raise ValueError('Bucket cannot contain a "*"')
        if '*' not in path:
            path = path.rstrip('/') + '/*'
        if '/' in path[:path.index('*')]:
            ind = path[:path.index('*')].rindex('/')
            root = path[:ind + 1]
        else:
            root = '/'
        allfiles = self.walk(root)
        pattern = re.compile("^" + path.replace('//', '/').rstrip('/').replace(
            '*', '[^/]*').replace('?', '.') + "$")
        out = [
            f for f in allfiles if re.match(pattern,
                                            f.replace('//', '/').rstrip('/'))
        ]
        if not out:
            out = self.ls(path0)
        return out
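
    # Usage sketch (paths are hypothetical; '*' does not cross '/' because it is
    # translated to the regex fragment '[^/]*'):
    #
    #     fs.glob('my-bucket/logs/2017-*.gz')   # wildcard within one "directory" level
    #     fs.glob('my-bucket/logs')             # no '*': matches keys one level under the prefix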

    def du(self, path, total=False, deep=False, **kwargs):
        """ Bytes in keys at path """
        if deep:
            files = self.walk(path)
            files = [self.info(f, **kwargs) for f in files]
        else:
            files = self.ls(path, detail=True)
        if total:
            return sum(f.get('Size', 0) for f in files)
        else:
            return {p['Key']: p['Size'] for p in files}

    def exists(self, path):
        """ Does such a file/directory exist? """
        bucket, key = split_path(path)
        if key or bucket not in self.ls(''):
            return not raises(FileNotFoundError, lambda: self.ls(path))
        else:
            return True

    def cat(self, path, **kwargs):
        """ Returns contents of file """
        with self.open(path, 'rb', **kwargs) as f:
            return f.read()

    def tail(self, path, size=1024, **kwargs):
        """ Return last bytes of file """
        length = self.info(path, **kwargs)['Size']
        if size > length:
            return self.cat(path, **kwargs)
        with self.open(path, 'rb', **kwargs) as f:
            f.seek(length - size)
            return f.read(size)

    def head(self, path, size=1024, **kwargs):
        """ Return first bytes of file """
        with self.open(path, 'rb', block_size=size, **kwargs) as f:
            return f.read(size)

    def url(self, path, expires=3600, **kwargs):
        """ Generate presigned URL to access path by HTTP

        Parameters
        ----------
        path : string
            the key path we are interested in
        expires : int
            the number of seconds this signature will be good for.
        """
        bucket, key = split_path(path)
        return self.s3.generate_presigned_url(ClientMethod='get_object',
                                              Params=dict(Bucket=bucket,
                                                          Key=key,
                                                          **kwargs),
                                              ExpiresIn=expires)
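
    # Presigned-URL sketch (``s3`` and the path are hypothetical): the
    # returned HTTPS URL grants time-limited GET access without requiring
    # the caller to hold credentials:
    #
    #   >>> link = s3.url('mybucket/report.pdf', expires=600)
    #   >>> # fetch ``link`` with any HTTP client within the next 10 minutes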

    def get(self, path, filename, **kwargs):
        """ Stream data from file at path to local filename """
        with self.open(path, 'rb', **kwargs) as f:
            with open(filename, 'wb') as f2:
                while True:
                    data = f.read(f.blocksize)
                    if len(data) == 0:
                        break
                    f2.write(data)

    def put(self, filename, path, **kwargs):
        """ Stream data from local filename to file at path """
        with open(filename, 'rb') as f:
            with self.open(path, 'wb', **kwargs) as f2:
                while True:
                    data = f.read(f2.blocksize)
                    if len(data) == 0:
                        break
                    f2.write(data)
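
    # ``put``/``get`` sketch (names hypothetical): whole objects are streamed
    # to and from local files in ``blocksize`` chunks, so memory use stays
    # bounded regardless of object size:
    #
    #   >>> s3.put('/tmp/local.csv', 'mybucket/remote.csv')
    #   >>> s3.get('mybucket/remote.csv', '/tmp/copy.csv')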

    def mkdir(self, path, acl="", **kwargs):
        """ Make new bucket or empty key """
        self.touch(path, acl=acl, **kwargs)

    def rmdir(self, path, **kwargs):
        """ Remove empty key or bucket """
        bucket, key = split_path(path)
        if (key and self.info(path)['Size'] == 0) or not key:
            self.rm(path, **kwargs)
        else:
            raise IOError('Path is not directory-like', path)

    def mv(self, path1, path2, **kwargs):
        """ Move file between locations on S3 """
        self.copy(path1, path2, **kwargs)
        self.rm(path1)

    def merge(self, path, filelist, **kwargs):
        """ Create single S3 file from list of S3 files

        Uses multi-part upload; no data is downloaded locally. The original
        files are not deleted.

        Parameters
        ----------
        path : str
            The final file to produce
        filelist : list of str
            The paths, in order, to assemble into the final file.
        """
        bucket, key = split_path(path)
        mpu = self._call_s3(self.s3.create_multipart_upload,
                            kwargs,
                            Bucket=bucket,
                            Key=key)
        out = [
            self._call_s3(self.s3.upload_part_copy,
                          kwargs,
                          Bucket=bucket,
                          Key=key,
                          UploadId=mpu['UploadId'],
                          CopySource=f,
                          PartNumber=i + 1) for (i, f) in enumerate(filelist)
        ]
        parts = [{
            'PartNumber': i + 1,
            'ETag': o['CopyPartResult']['ETag']
        } for (i, o) in enumerate(out)]
        part_info = {'Parts': parts}
        self.s3.complete_multipart_upload(Bucket=bucket,
                                          Key=key,
                                          UploadId=mpu['UploadId'],
                                          MultipartUpload=part_info)
        self.invalidate_cache(path)
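
    # ``merge`` sketch (``s3`` and the paths are hypothetical): the parts are
    # concatenated server-side via upload_part_copy; note that S3 requires
    # every part except the last to be at least 5 MB:
    #
    #   >>> s3.merge('mybucket/combined.bin',
    #   ...          ['mybucket/part-0.bin', 'mybucket/part-1.bin'])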

    def copy(self, path1, path2, **kwargs):
        """ Copy file between locations on S3 """
        buc1, key1 = split_path(path1)
        buc2, key2 = split_path(path2)
        try:
            self._call_s3(self.s3.copy_object,
                          kwargs,
                          Bucket=buc2,
                          Key=key2,
                          CopySource='/'.join([buc1, key1]))
        except (ClientError, ParamValidationError):
            raise IOError('Copy failed', (path1, path2))
        self.invalidate_cache(path2)

    def bulk_delete(self, pathlist, **kwargs):
        """
        Remove multiple keys with one call

        Parameters
        ----------
        pathlist : list of strings
            The keys to remove; they must all be in the same bucket.
        """
        if not pathlist:
            return
        buckets = {split_path(path)[0] for path in pathlist}
        if len(buckets) > 1:
            raise ValueError(
                "Bulk delete files should refer to only one bucket")
        bucket = buckets.pop()
        if len(pathlist) > 1000:
            for i in range((len(pathlist) // 1000) + 1):
                self.bulk_delete(pathlist[i * 1000:(i + 1) * 1000])
            return
        delete_keys = {
            'Objects': [{
                'Key': split_path(path)[1]
            } for path in pathlist]
        }
        try:
            self._call_s3(self.s3.delete_objects,
                          kwargs,
                          Bucket=bucket,
                          Delete=delete_keys)
            for path in pathlist:
                self.invalidate_cache(path)
        except ClientError:
            raise IOError('Bulk delete failed')
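
    # ``bulk_delete`` sketch (``s3`` and the keys hypothetical): all keys must
    # be in the same bucket, and the call batches itself into groups of 1000
    # keys, the S3 per-request limit:
    #
    #   >>> s3.bulk_delete(['mybucket/tmp/a', 'mybucket/tmp/b'])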

    def rm(self, path, recursive=False, **kwargs):
        """
        Remove keys and/or bucket.

        Parameters
        ----------
        path : string
            The location to remove.
        recursive : bool (False)
            Whether to also remove all entries below the path, i.e., those
            returned by `walk()`.
        """
        if not self.exists(path):
            raise FileNotFoundError(path)
        if recursive:
            self.invalidate_cache(path)
            self.bulk_delete(self.walk(path), **kwargs)
            if not self.exists(path):
                return
        bucket, key = split_path(path)
        if key:
            try:
                self._call_s3(self.s3.delete_object,
                              kwargs,
                              Bucket=bucket,
                              Key=key)
            except ClientError:
                raise IOError('Delete key failed', (bucket, key))
            self.invalidate_cache(path)
        else:
            if not self.s3.list_objects(Bucket=bucket).get('Contents'):
                try:
                    self.s3.delete_bucket(Bucket=bucket)
                except ClientError:
                    raise IOError('Delete bucket failed', bucket)
                self.invalidate_cache(bucket)
                self.invalidate_cache('')
            else:
                raise IOError('Not empty', path)
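
    # ``rm`` sketch (names hypothetical): a single key, a whole prefix
    # (with ``recursive=True``), or an empty bucket can be removed:
    #
    #   >>> s3.rm('mybucket/data/file.csv')
    #   >>> s3.rm('mybucket/data', recursive=True)
    #   >>> s3.rm('mybucket')    # only succeeds once the bucket is empty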

    def invalidate_cache(self, path=None):
        if path is None:
            self.dirs.clear()
        else:
            self.dirs.pop(path, None)
            parent = path.rsplit('/', 1)[0]
            self.dirs.pop(parent, None)

    def touch(self, path, acl="", **kwargs):
        """
        Create empty key

        If path is a bucket only, attempt to create bucket.
        """
        bucket, key = split_path(path)
        if key:
            if acl and acl not in key_acls:
                raise ValueError('ACL not in %s' % key_acls)
            self._call_s3(self.s3.put_object,
                          kwargs,
                          Bucket=bucket,
                          Key=key,
                          ACL=acl)
            self.invalidate_cache(path)
        else:
            if acl and acl not in buck_acls:
                raise ValueError('ACL not in %s' % buck_acls)
            try:
                self.s3.create_bucket(Bucket=bucket, ACL=acl)
                self.invalidate_cache('')
                self.invalidate_cache(bucket)
            except (ClientError, ParamValidationError):
                raise IOError('Bucket create failed', path)
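
    # ``touch``/``mkdir`` sketch (names hypothetical): with a bare bucket name
    # a new bucket is created, otherwise a zero-byte key is written:
    #
    #   >>> s3.touch('newbucket')               # create the bucket
    #   >>> s3.touch('newbucket/empty.flag')    # create an empty key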

    def read_block(self, fn, offset, length, delimiter=None, **kwargs):
        """ Read a block of bytes from an S3 file

        Starting at ``offset`` of the file, read ``length`` bytes.  If
        ``delimiter`` is set then we ensure that the read starts and stops at
        delimiter boundaries that follow the locations ``offset`` and ``offset
        + length``.  If ``offset`` is zero then we start at zero.  The
        bytestring returned WILL include the end delimiter string.

        If offset+length is beyond the eof, reads to eof.

        Parameters
        ----------
        fn: string
            Path to filename on S3
        offset: int
            Byte offset to start read
        length: int
            Number of bytes to read
        delimiter: bytes (optional)
            Ensure reading starts and stops at delimiter bytestring

        Examples
        --------
        >>> s3.read_block('data/file.csv', 0, 13)  # doctest: +SKIP
        b'Alice, 100\\nBo'
        >>> s3.read_block('data/file.csv', 0, 13, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200\\n'

        Use ``length=None`` to read to the end of the file.

        >>> s3.read_block('data/file.csv', 0, None, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200\\nCharlie, 300'

        See Also
        --------
        distributed.utils.read_block
        """
        with self.open(fn, 'rb', **kwargs) as f:
            size = f.info()['Size']
            if length is None:
                length = size
            if offset + length > size:
                length = size - offset
            block = read_block(f, offset, length, delimiter)
        return block