Example #1
    def __init__(self,
                 key,
                 mode='r',
                 connection=None,
                 encrypt=True,
                 version_id=None):
        from baiji.connection import S3Connection
        self.encrypt = encrypt
        self.key = key
        if path.islocal(key):
            self.should_upload_on_close = False
            self.mode = FileMode(mode, allowed_modes='arwxb+t')
            from six.moves import builtins
            local_path = path.parse(key).path
            if self.mode.is_output and not os.path.exists(
                    os.path.dirname(local_path)):
                from baiji.util.shutillib import mkdir_p
                mkdir_p(os.path.dirname(local_path))
            try:
                # Use os.open to catch exclusive access to the file, but use open to get a nice, useful file object
                self.fd = os.open(local_path, self.mode.flags)
                self.f = builtins.open(local_path,
                                       self.mode.mode.replace('x', 'w'))
                os.close(self.fd)
            except OSError as e:
                import errno
                if e.errno == errno.EEXIST:
                    raise KeyExists("Local file exists: %s" % local_path)
                elif e.errno == errno.ENOENT:
                    raise KeyNotFound("Local file does not exist: %s" %
                                      local_path)
                else:
                    raise IOError(e.errno, "%s: %s" % (e.strerror, e.filename))
        else:
            if connection is None:
                connection = S3Connection()
            self.connection = connection

            self.mode = FileMode(mode, allowed_modes='rwxbt')
            self.should_upload_on_close = self.mode.is_output
            if self.mode.creating_exclusively:
                if self.connection.exists(self.key):
                    raise KeyExists("Key exists in bucket: %s" % self.key)
                else:
                    self.connection.touch(self.key, encrypt=self.encrypt)
            # Use w+ so we can read back the contents in upload()
            new_mode = ('w+' + (self.mode.binary and 'b' or '') +
                        (self.mode.text and 't' or ''))
            from baiji.util import tempfile
            self.f = tempfile.NamedTemporaryFile(
                mode=new_mode,
                suffix=os.path.splitext(path.parse(self.key).path)[1])
            self.name = self.f.name
            self.remotename = key  # Used by some serialization code to find files which sit along side the file in question, like textures which sit next to a mesh file
            if self.mode.reading:
                self.connection.cp(self.key,
                                   self.name,
                                   force=True,
                                   version_id=version_id)
Example #2
 def info(self, key_or_file):
     '''
     Get info about a file
     '''
     from datetime import datetime
     k = path.parse(key_or_file)
     result = {
         'uri': '%s://%s%s' % (k.scheme, k.netloc, k.path),
     }
     if k.scheme == 'file':
         if not os.path.exists(k.path):
             raise KeyNotFound("Error getting info on %s: File doesn't exist" % (key_or_file, ))
         stat = os.stat(k.path)
         result['size'] = stat.st_size
         result['last_modified'] = datetime.fromtimestamp(stat.st_mtime)
     elif k.scheme == 's3':
         remote_object = self._lookup(k.netloc, k.path)
         if remote_object is None:
             raise KeyNotFound("Error getting info on %s: Key doesn't exist" % (key_or_file, ))
         result['size'] = remote_object.size
         result['last_modified'] = datetime.strptime(remote_object.last_modified, "%a, %d %b %Y %H:%M:%S GMT")
         result['content_type'] = remote_object.content_type
         result['content_encoding'] = remote_object.content_encoding
         result['encrypted'] = bool(remote_object.encrypted)
         result['acl'] = remote_object.get_acl()
         result['owner'] = remote_object.owner
         result['version_id'] = remote_object.version_id
     else:
         raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
     return result
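A short sketch of calling info(), assuming the method lives on the S3Connection object that Example #1 constructs; the uri is hypothetical. Local files report only size and last_modified, while s3 keys also carry content type, encryption, ACL, owner and version id:

from baiji.connection import S3Connection

conn = S3Connection()
meta = conn.info('s3://example-bucket/meshes/body.obj')  # hypothetical key
print(meta['uri'], meta['size'], meta['last_modified'])
if meta.get('encrypted'):
    print('stored with server-side encryption')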
Example #3
    def get_url(self, key, ttl):
        """
        Get a temporary https url for a file on AWS S3

        Returns the url as a string.
        The url will time out and return an error after ``ttl`` seconds.
        """
        k = path.parse(key)
        return self._lookup(k.netloc, k.path).generate_url(ttl)
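A hypothetical call, again assuming the method hangs off an S3Connection; the second argument is the lifetime of the signed link in seconds:

from baiji.connection import S3Connection

conn = S3Connection()
# Anyone with this url can fetch the file for the next 10 minutes.
url = conn.get_url('s3://example-bucket/exports/report.pdf', ttl=600)
print(url)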
Example #4
 def restore(self, key):
     from boto.s3.deletemarker import DeleteMarker
     k = path.parse(key)
     prefix = k.path
     if prefix.startswith(path.sep):
         prefix = prefix[len(path.sep):]
     versions = self._bucket(k.netloc).list_versions(prefix)
     delete_marker = [x for x in versions if x.name == prefix and isinstance(x, DeleteMarker) and x.is_latest]
     if delete_marker:
         self._bucket(k.netloc).delete_key(delete_marker[0].name, version_id=delete_marker[0].version_id)
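restore() only applies to versioned buckets: deleting a key there leaves a delete marker as the latest version, and removing that marker makes the previous version current again. A sketch under the assumption that the hypothetical bucket has versioning enabled and that rm and exists live on the same connection:

from baiji.connection import S3Connection

conn = S3Connection()
conn.rm('s3://example-bucket/meshes/body.obj')       # leaves a delete marker
conn.restore('s3://example-bucket/meshes/body.obj')  # drops the marker
assert conn.exists('s3://example-bucket/meshes/body.obj')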
Example #5
    def ls(self, s3prefix, return_full_urls=False, require_s3_scheme=False, shallow=False, followlinks=False, list_versions=False):
        '''
        List files on AWS S3.
        The prefix is given as an S3 url: ``s3://bucket-name/path/to/dir``.
        It will return all values in the bucket that have that prefix.

        Note that ``/dir/filename.ext`` is found by ``ls('s3://bucket-name/dir/fil')``; it's really a prefix and not a directory name.

        A local prefix generally is acceptable, but if require_s3_scheme
        is True, the prefix must be an s3 URL.

        If `shallow` is `True`, the key names are processed hierarchically
        using '/' as a delimiter, and only the immediate "children" are
        returned.

        '''
        import six
        k = path.parse(s3prefix)
        if k.scheme == 's3':
            prefix = k.path
            if prefix.startswith(path.sep):
                prefix = prefix[len(path.sep):]
            delimiter = shallow and path.sep or ''
            if return_full_urls:
                clean_paths = lambda x: "s3://" + k.netloc + path.sep + x.name
            else:
                clean_paths = lambda x: path.sep + x.name

            if list_versions:
                result_list_iterator = self._bucket(k.netloc).list_versions(prefix=prefix, delimiter=delimiter)
            else:
                result_list_iterator = self._bucket(k.netloc).list(prefix=prefix, delimiter=delimiter)

            return six.moves.map(clean_paths, result_list_iterator)
        elif k.scheme == 'file':
            if require_s3_scheme:
                raise InvalidSchemeException('URI should begin with s3://')
            paths = []
            remove = ''
            if not return_full_urls:
                remove = k.path
                if not remove.endswith(os.sep):
                    remove += os.sep
            for root, _, files in os.walk(k.path, followlinks=followlinks):
                for f in files:
                    # On Windows, results of os.abspath() and os.walk() have '\',
                    # so we replace them with '/'
                    paths.append(path.join(root, f).replace(remove, '').replace(os.sep, path.sep))
                if shallow:
                    break
            return paths
        else:
            raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
Example #6
    def cp_r(self, dir_from, dir_to, parallel=False, **kwargs):
        '''
        kwargs are passed on directly to s3.cp; see defaults there.
        '''
        (from_scheme, _, from_path, _, _, _) = path.parse(dir_from)
        if from_scheme == 'file':
            files_to_copy = [(path.join(dir_from, f), path.join(dir_to, f))
                             for f in self.ls(dir_from, return_full_urls=False)]
        else:
            if from_path.endswith(path.sep):
                # Emulate `cp`, which copies the contents of the path.
                # Get path relative to from_path
                files_to_copy = [(f, path.join(dir_to, os.path.relpath(path.parse(f).path, from_path)))
                                 for f in self.ls(dir_from, return_full_urls=True) if not path.isdirlike(f)]
            else:
                # Get path relative to from_path's parent
                # Since from_path has no '/', we can get this with os.path.dirname()
                files_to_copy = [(f, path.join(dir_to, os.path.relpath(path.parse(f).path, os.path.dirname(from_path))))
                                 for f in self.ls(dir_from, return_full_urls=True) if not path.isdirlike(f)]

        if 'force' not in kwargs or not kwargs['force']:
            # we're not supposed to overwrite. Locally this is easy, since `exists` checks are cheap, but
            # on s3, it's more expensive, so we avoid it if possible:
            if path.isremote(dir_to):
                def common_prefix(a, b):
                    try:
                        ind = [x == y for x, y in zip(a, b)].index(False)
                    except ValueError:
                        return a
                    return a[:ind]
                destinations = [y for _, y in files_to_copy]
                from functools import reduce  # reduce is not a builtin on Python 3
                prefix = reduce(common_prefix, destinations[1:], destinations[0])
                try:
                    # note that we can't use `exists` here, as it only works for full keys
                    next(self.ls(prefix))
                except StopIteration:
                    # There's nothing in the iterator, so there are no files to be found, so
                    # we set force for the copy so that we don't have to check each one:
                    kwargs['force'] = True
        self.cp_many(files_to_copy, parallel, **kwargs)
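As with cp, a trailing slash on the source changes what lands at the destination: with the slash, the contents of the prefix are copied directly into dir_to; without it, the last path component is recreated underneath dir_to. A hypothetical illustration (bucket names made up, cp_r assumed to live on the connection):

from baiji.connection import S3Connection

conn = S3Connection()
# .../textures/skin.png  ->  s3://backup-bucket/skin.png
conn.cp_r('s3://example-bucket/textures/', 's3://backup-bucket/')
# .../textures/skin.png  ->  s3://backup-bucket/textures/skin.png
conn.cp_r('s3://example-bucket/textures', 's3://backup-bucket/')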
Example #7
 def size(self, key_or_file, version_id=None):
     '''
     Return the size of a file. If it's on s3, don't download it.
     '''
     k = path.parse(key_or_file)
     if k.scheme == 'file':
         return os.path.getsize(k.path)
     elif k.scheme == 's3':
          remote_object = self._lookup(k.netloc, k.path, version_id=version_id)
          if remote_object is None:
              raise KeyNotFound("s3://%s/%s not found on s3" % (k.netloc, k.path))
          return remote_object.size
     else:
         raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
Example #8
 def md5(self, key_or_file):
     '''
     Return the MD5 checksum of a file. If it's on s3, don't download it.
     '''
     k = path.parse(key_or_file)
     if k.scheme == 'file':
         from baiji.util.md5 import md5_for_file
         return md5_for_file(k.path)
     elif k.scheme == 's3':
         res = self._get_etag(k.netloc, k.path)
         if "-" in res:
             raise ValueError("md5 hashes not available from s3 for files that were uploaded as multipart (if over 5gb, there's no hope; if under, try copying it to itself to have S3 reset the etag)")
         return res
     else:
         raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
Example #9
    def put_string(self, key, s, encrypt=True, replace=True):
        '''
        Save string ``s`` to S3 as ``key``.

        If ``replace=True``, this will overwrite an existing key.
        If ``replace=False``, this will be a no-op when the key already exists.

        '''
        from boto.s3.key import Key
        from baiji.util.munging import _strip_initial_slashes
        key = path.parse(key)
        b = self._bucket(key.netloc)
        k = Key(b)
        k.key = _strip_initial_slashes(key.path)
        k.set_contents_from_string(s, encrypt_key=encrypt, replace=replace)
Example #10
File: copy.py  Project: bodylabs/baiji
 def __init__(self, key, connection):
     import re
     from baiji import path
     self.raw = key
     self.connection = connection
     self.parsed = path.parse(key)
      self.remote_path = None # value will be set when self.path is assigned below; this just satisfies lint
     self.isdir = path.isdirlike(key)
     self.path = self.parsed.path
     if not (self.path.startswith(path.sep) or re.match(r'^[a-zA-Z]:', self.path)):
         self.path = path.sep + self.path
     self.bucket_name = self.parsed.netloc
     self.scheme = self.parsed.scheme
     if self.scheme not in ['file', 's3']:
         raise InvalidSchemeException("URI Scheme %s is not implemented" % self.scheme)
Example #11
    def touch(self, key, encrypt=True):
        """
        Touch a local file or a path on s3

        Locally, this is analogous to the unix touch command

        On s3, it creates an empty file if there is not one there already,
        but does not change the timestamps (not possible to do without
        actually moving the file)
        """
        if path.islocal(key):
            filename = path.parse(key).path
            with open(filename, 'a'):
                os.utime(filename, None)
        else:
            # The replace=False here means that we only take action if
            # the file doesn't exist, so we don't accidentally truncate
            # files when we just mean to be touching them
            self.put_string(key, '', encrypt=encrypt, replace=False)
Example #12
 def encrypt_at_rest(self, key):
     '''
     This method takes a key on s3 and encrypts it.
     Note that calling this method on a local file is an error
     and that calling it on an s3 key that is already encrypted,
     while allowed, is a no-op.
     '''
     k = path.parse(key)
     if k.scheme != 's3':
         raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
     remote_object = self._lookup(k.netloc, k.path)
     if remote_object is None:
         raise KeyNotFound("Error encrypting %s: Key doesn't exist" % (key, ))
     if not bool(remote_object.encrypted):
         bucket = self._bucket(k.netloc)
         src = k.path
         if src.startswith(path.sep):
             src = src[len(path.sep):] # NB: copy_key is failing with absolute src keys...
         bucket.copy_key(src, k.netloc, src, preserve_acl=True, metadata=None, encrypt_key=True)
Example #13
 def etag(self, key_or_file):
     '''
     Return the s3 etag of the file. For single part uploads (for us, files less than 5gb) this is the same as md5.
     '''
     from baiji.copy import S3_MAX_UPLOAD_SIZE
     k = path.parse(key_or_file)
     if k.scheme == 'file':
         import math
         from baiji.util.md5 import md5_for_file
         file_size = os.path.getsize(k.path)
         if file_size > S3_MAX_UPLOAD_SIZE:
             n_parts = int(math.ceil(float(file_size) / S3_MAX_UPLOAD_SIZE))
             return self._build_etag(k.path, n_parts, S3_MAX_UPLOAD_SIZE)
         else:
             return md5_for_file(k.path)
     elif k.scheme == 's3':
         return self._get_etag(k.netloc, k.path)
     else:
         raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
Example #14
    def rm_r(self, key_or_file, force=False, quiet=False):
        '''
        Prompts for confirmation on each file when force is False.

        Raises an exception when not using AWS.
        '''
        k = path.parse(key_or_file)
        if not k.scheme == 's3':
            raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
        bucket = k.netloc
        keys_to_delete = self.ls(key_or_file)
        for key_to_delete in keys_to_delete:
            url = "s3://%s%s" % (bucket, key_to_delete)
            if not force:
                from baiji.util.console import confirm
                if not confirm("Remove %s" % url):
                    continue
            self.rm(url)
            if not quiet:
                print("[deleted] %s" % url)
Example #15
 def rm(self, key_or_file, version_id=None):
     '''
     Remove a key from AWS S3
     '''
     import shutil
     from baiji.util.munging import _strip_initial_slashes
     k = path.parse(key_or_file)
     if k.scheme == 'file':
         if os.path.isdir(k.path):
             shutil.rmtree(k.path)
         elif os.path.exists(k.path):
             return os.remove(k.path)
         else:
             raise KeyNotFound("%s does not exist" % key_or_file)
     elif k.scheme == 's3':
         if not self.exists(key_or_file, version_id=version_id):
             raise KeyNotFound("%s does not exist" % key_or_file)
         return self._bucket(k.netloc).delete_key(_strip_initial_slashes(k.path), version_id=version_id)
     else:
         raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
Example #16
    def etag_matches(self, key_or_file, other_etag):
        import math
        from baiji.copy import S3_MAX_UPLOAD_SIZE
        k = path.parse(key_or_file)
        # print "***", key_or_file, other_etag
        if "-" not in other_etag or k.scheme == 's3':
            return self.etag(key_or_file) == other_etag
        else: # This is the case where the key was uploaded multipart and has a `md5-n_parts` type etag
            n_parts = int(other_etag.split("-")[1])
            file_size = os.path.getsize(k.path)
            # There are a number of possible part sizes that could produce any given
            # number of parts. The most likely and only ones we've seen so far are
            # these, but we might someday need to try others, which might require
            # exhaustively searching the possibilities....

            # (n_parts-1) * part_size < file_size <= n_parts * part_size
            min_part_size = int(math.ceil(float(file_size)/n_parts))
            max_part_size = file_size / (n_parts-1)
            # print "  - min part size {} gives last block size of {}".format(min_part_size, file_size - min_part_size*(n_parts-1))
            # print "  - max part size {} gives last block size of {}".format(max_part_size, file_size - max_part_size*(n_parts-1))
            possible_part_sizes = [
                S3_MAX_UPLOAD_SIZE, # what we do
                file_size/n_parts, # seen this from third party uploaders
                min_part_size, # just in case
                max_part_size, # seen this from third party uploaders
                1024*1024*8, # seen this from third party uploaders
                1024*1024*5, # the minimum s3 will allow
            ]
            # print "  - {} parts, file size {} bytes".format(n_parts, file_size)
            # print "  - possible_part_sizes:", possible_part_sizes
            possible_part_sizes = set([part_size for part_size in possible_part_sizes if part_size <= max_part_size and part_size >= 1024*1024*5])
            # print "  - possible_part_sizes:", possible_part_sizes
            if not possible_part_sizes:
                return False
            for part_size in possible_part_sizes:
                # print "  -", part_size, self._build_etag(k.path, n_parts, part_size)
                if self._build_etag(k.path, n_parts, part_size) == other_etag:
                    return True
            return False
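The part-size arithmetic is easier to see with concrete numbers. The suffix of a multipart etag is the part count; any part size from ceil(file_size / n_parts) up to file_size // (n_parts - 1) could have produced that count, and the method tries only a few likely candidates (S3_MAX_UPLOAD_SIZE, the 5 MiB and 8 MiB defaults, and sizes derived from the bounds). A worked sketch with made-up numbers:

import math

file_size = 23 * 1024 * 1024                          # hypothetical 23 MiB file
other_etag = '0123456789abcdef0123456789abcdef-3'     # made-up multipart etag
n_parts = int(other_etag.split('-')[1])                # -> 3

min_part_size = int(math.ceil(float(file_size) / n_parts))  # 8039083 bytes (~7.7 MiB)
max_part_size = file_size // (n_parts - 1)                   # 12058624 bytes (11.5 MiB)
# An 8 MiB part size (a common uploader default) lies inside
# [min_part_size, max_part_size], so _build_etag would be tried with it.
print(min_part_size, max_part_size)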
Example #17
    def exists(self, key_or_file, retries_allowed=3, version_id=None):
        '''
        Check if a file exists on AWS S3

        Returns a boolean.

        If the key is not found then we recheck up to `retries_allowed` times. We only do this
        on s3. We've had some observations of what appears to be eventual consistency, so this
        makes it a bit more reliable. This does slow down the call in the case where the key
        does not exist.

        On a relatively slow, high-latency connection, a test of 100 retrievals of a
        non-existent file gives:

        With retries_allowed=1: median=457.587 ms, mean=707.12387 ms
        With retries_allowed=3: median=722.969 ms, mean=1185.86299 ms
        with retries_allowed=10: median=2489.767 ms, mean=2995.34233 ms
        With retries_allowed=100: median=24694.0815 ms, mean=26754.64137 ms

        So assume that letting retries_allowed=3 will cost you a bit less than double the time.
        '''
        k = path.parse(key_or_file)
        if k.scheme == 'file':
            return os.path.exists(k.path)
        elif k.scheme == 's3':
            retry_attempts = 0
            while retry_attempts < retries_allowed:
                key = self._lookup(k.netloc, k.path, cache_buckets=True, version_id=version_id)
                if key:
                    if retry_attempts > 0: # only if we find it after failing at least once
                        import warnings
                        from baiji.exceptions import EventualConsistencyWarning
                        warnings.warn("S3 is behaving in an eventually consistent way in s3.exists({}) -- it took {} attempts to locate the key".format(key_or_file, retry_attempts+1), EventualConsistencyWarning)
                    return True
                retry_attempts += 1
            return False
        else:
            raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
Example #18
 def get_string(self, key):
     '''
     Get string stored in S3 ``key``.
     '''
     k = path.parse(key)
     return self._lookup(k.netloc, k.path).get_contents_as_string()
Example #19
 def get_string(self, key, encoding=None):
     '''
     Get string stored in S3 ``key``.
     '''
     k = path.parse(key)
     return self._lookup(k.netloc, k.path).get_contents_as_string(encoding=encoding)
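put_string and get_string round-trip small payloads without touching the local filesystem; the encoding argument on the newer signature decodes the returned bytes for you. A closing sketch, with the key name made up and the methods assumed to live on the same connection:

from baiji.connection import S3Connection

conn = S3Connection()
# replace=False makes this a no-op if the marker already exists.
conn.put_string('s3://example-bucket/markers/DONE', 'ok', replace=False)
print(conn.get_string('s3://example-bucket/markers/DONE', encoding='utf-8'))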