def __init__(self, key, mode='r', connection=None, encrypt=True, version_id=None):
    from baiji.connection import S3Connection
    self.encrypt = encrypt
    self.key = key
    if path.islocal(key):
        self.should_upload_on_close = False
        self.mode = FileMode(mode, allowed_modes='arwxb+t')
        from six.moves import builtins
        local_path = path.parse(key).path
        if self.mode.is_output and not os.path.exists(os.path.dirname(local_path)):
            from baiji.util.shutillib import mkdir_p
            mkdir_p(os.path.dirname(local_path))
        try:
            # Use os.open to catch exclusive access to the file, but use open to get a nice, useful file object
            self.fd = os.open(local_path, self.mode.flags)
            self.f = builtins.open(local_path, self.mode.mode.replace('x', 'w'))
            os.close(self.fd)
        except OSError as e:
            import errno
            if e.errno == errno.EEXIST:
                raise KeyExists("Local file exists: %s" % local_path)
            elif e.errno == errno.ENOENT:
                raise KeyNotFound("Local file does not exist: %s" % local_path)
            else:
                raise IOError(e.errno, "%s: %s" % (e.strerror, e.filename))
    else:
        if connection is None:
            connection = S3Connection()
        self.connection = connection
        self.mode = FileMode(mode, allowed_modes='rwxbt')
        self.should_upload_on_close = self.mode.is_output
        if self.mode.creating_exclusively:
            if self.connection.exists(self.key):
                raise KeyExists("Key exists in bucket: %s" % self.key)
            else:
                self.connection.touch(self.key, encrypt=self.encrypt)
        # Use w+ so we can read back the contents in upload()
        new_mode = ('w+' +
                    (self.mode.binary and 'b' or '') +
                    (self.mode.text and 't' or ''))
        from baiji.util import tempfile
        self.f = tempfile.NamedTemporaryFile(
            mode=new_mode,
            suffix=os.path.splitext(path.parse(self.key).path)[1])
        self.name = self.f.name
        self.remotename = key  # Used by some serialization code to find files which sit alongside the file in question, like textures which sit next to a mesh file
        if self.mode.reading:
            self.connection.cp(self.key, self.name, force=True, version_id=version_id)

def info(self, key_or_file):
    '''
    Get info about a file
    '''
    from datetime import datetime
    k = path.parse(key_or_file)
    result = {
        'uri': '%s://%s%s' % (k.scheme, k.netloc, k.path),
    }
    if k.scheme == 'file':
        if not os.path.exists(k.path):
            raise KeyNotFound("Error getting info on %s: File doesn't exist" % (key_or_file, ))
        stat = os.stat(k.path)
        result['size'] = stat.st_size
        result['last_modified'] = datetime.fromtimestamp(stat.st_mtime)
    elif k.scheme == 's3':
        remote_object = self._lookup(k.netloc, k.path)
        if remote_object is None:
            raise KeyNotFound("Error getting info on %s: Key doesn't exist" % (key_or_file, ))
        result['size'] = remote_object.size
        result['last_modified'] = datetime.strptime(remote_object.last_modified, "%a, %d %b %Y %H:%M:%S GMT")
        result['content_type'] = remote_object.content_type
        result['content_encoding'] = remote_object.content_encoding
        result['encrypted'] = bool(remote_object.encrypted)
        result['acl'] = remote_object.get_acl()
        result['owner'] = remote_object.owner
        result['version_id'] = remote_object.version_id
    else:
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
    return result

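# Usage sketch (not part of the original source). The bucket and key below are
# hypothetical, and this assumes the methods in this listing live on
# baiji.connection.S3Connection, as the import in __init__ above suggests.
def _example_print_info():
    from baiji.connection import S3Connection
    s3 = S3Connection()
    # For an s3:// URI the result also carries content_type, acl, version_id, etc.
    meta = s3.info('s3://example-bucket/path/to/file.obj')
    print("%s is %d bytes, last modified %s" % (meta['uri'], meta['size'], meta['last_modified']))
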
def get_url(self, key, ttl):
    """
    Get a temporary https url for a file on AWS S3

    Returns the url as a string. The url will time out and return an
    error after ``ttl`` seconds.
    """
    k = path.parse(key)
    return self._lookup(k.netloc, k.path).generate_url(ttl)

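# Usage sketch (not part of the original source; the key name is hypothetical).
# The returned presigned https URL stops working after `ttl` seconds.
def _example_share_link():
    from baiji.connection import S3Connection
    s3 = S3Connection()
    url = s3.get_url('s3://example-bucket/reports/summary.pdf', ttl=3600)  # valid for one hour
    print(url)
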
def restore(self, key):
    '''
    Restore a versioned key that has been deleted by removing its latest delete marker.
    '''
    from boto.s3.deletemarker import DeleteMarker
    k = path.parse(key)
    prefix = k.path
    if prefix.startswith(path.sep):
        prefix = prefix[len(path.sep):]
    versions = self._bucket(k.netloc).list_versions(prefix)
    delete_marker = [
        x for x in versions
        if x.name == prefix and isinstance(x, DeleteMarker) and x.is_latest
    ]
    if delete_marker:
        self._bucket(k.netloc).delete_key(delete_marker[0].name, version_id=delete_marker[0].version_id)

def ls(self, s3prefix, return_full_urls=False, require_s3_scheme=False, shallow=False, followlinks=False, list_versions=False):
    '''
    List files on AWS S3

    prefix is given as an S3 url: ``s3://bucket-name/path/to/dir``.
    It will return all values in the bucket that have that prefix.

    Note that ``/dir/filename.ext`` is found by ``ls('s3://bucket-name/dir/fil')``;
    it's really a prefix and not a directory name.

    A local prefix generally is acceptable, but if require_s3_scheme
    is True, the prefix must be an s3 URL.

    If `shallow` is `True`, the key names are processed hierarchically
    using '/' as a delimiter, and only the immediate "children" are
    returned.
    '''
    import six
    k = path.parse(s3prefix)
    if k.scheme == 's3':
        prefix = k.path
        if prefix.startswith(path.sep):
            prefix = prefix[len(path.sep):]
        delimiter = shallow and path.sep or ''
        if return_full_urls:
            clean_paths = lambda x: "s3://" + k.netloc + path.sep + x.name
        else:
            clean_paths = lambda x: path.sep + x.name
        if list_versions:
            result_list_iterator = self._bucket(k.netloc).list_versions(prefix=prefix, delimiter=delimiter)
        else:
            result_list_iterator = self._bucket(k.netloc).list(prefix=prefix, delimiter=delimiter)
        return six.moves.map(clean_paths, result_list_iterator)
    elif k.scheme == 'file':
        if require_s3_scheme:
            raise InvalidSchemeException('URI should begin with s3://')
        paths = []
        remove = ''
        if not return_full_urls:
            remove = k.path
            if not remove.endswith(os.sep):
                remove += os.sep
        for root, _, files in os.walk(k.path, followlinks=followlinks):
            for f in files:
                # On Windows, results of os.abspath() and os.walk() have '\',
                # so we replace them with '/'
                paths.append(path.join(root, f).replace(remove, '').replace(os.sep, path.sep))
            if shallow:
                break
        return paths
    else:
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)

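# Usage sketch (not part of the original source; bucket and prefix are hypothetical).
# Illustrates that ls() matches on key prefix, and that shallow=True stops at the
# first '/' past the prefix.
def _example_list_keys():
    from baiji.connection import S3Connection
    s3 = S3Connection()
    # All keys under the prefix, as full s3:// URLs:
    for url in s3.ls('s3://example-bucket/meshes/', return_full_urls=True):
        print(url)
    # Only the immediate "children" of the prefix:
    for name in s3.ls('s3://example-bucket/meshes/', shallow=True):
        print(name)
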
def cp_r(self, dir_from, dir_to, parallel=False, **kwargs):
    '''
    kwargs are passed on directly to s3.cp; see defaults there.
    '''
    from functools import reduce
    (from_scheme, _, from_path, _, _, _) = path.parse(dir_from)
    if from_scheme == 'file':
        files_to_copy = [(path.join(dir_from, f), path.join(dir_to, f))
                         for f in self.ls(dir_from, return_full_urls=False)]
    else:
        if from_path.endswith(path.sep):
            # Emulate `cp`, which copies the contents of the path.
            # Get path relative to from_path
            files_to_copy = [(f, path.join(dir_to, os.path.relpath(path.parse(f).path, from_path)))
                             for f in self.ls(dir_from, return_full_urls=True)
                             if not path.isdirlike(f)]
        else:
            # Get path relative to from_path's parent.
            # Since from_path has no trailing '/', we can get this with os.path.dirname()
            files_to_copy = [(f, path.join(dir_to, os.path.relpath(path.parse(f).path, os.path.dirname(from_path))))
                             for f in self.ls(dir_from, return_full_urls=True)
                             if not path.isdirlike(f)]
    if 'force' not in kwargs or not kwargs['force']:
        # We're not supposed to overwrite. Locally this is easy, since `exists` checks are cheap,
        # but on s3 it's more expensive, so we avoid it if possible:
        if path.isremote(dir_to):
            def common_prefix(a, b):
                try:
                    ind = [x == y for x, y in zip(a, b)].index(False)
                except ValueError:
                    return a
                return a[:ind]
            destinations = [y for _, y in files_to_copy]
            prefix = reduce(common_prefix, destinations[1:], destinations[0])
            try:
                # Note that we can't use `exists` here, as it only works for full keys
                next(self.ls(prefix))
            except StopIteration:
                # There's nothing in the iterator, so there are no files to be found, so
                # we set force for the copy so that we don't have to check each one:
                kwargs['force'] = True
    self.cp_many(files_to_copy, parallel, **kwargs)

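# Usage sketch (not part of the original source; all paths are hypothetical). As with
# `cp -r`, a trailing separator on the source copies its contents; without it, the
# source directory itself is recreated under the destination.
def _example_recursive_copy():
    from baiji.connection import S3Connection
    s3 = S3Connection()
    # Copies s3://example-bucket/meshes/foo.obj to s3://backup-bucket/archive/foo.obj, etc.
    s3.cp_r('s3://example-bucket/meshes/', 's3://backup-bucket/archive/', force=True)
    # Copies s3://example-bucket/meshes/foo.obj to s3://backup-bucket/archive/meshes/foo.obj, etc.
    s3.cp_r('s3://example-bucket/meshes', 's3://backup-bucket/archive/', force=True)
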
def size(self, key_or_file, version_id=None):
    '''
    Return the size of a file. If it's on s3, don't download it.
    '''
    k = path.parse(key_or_file)
    if k.scheme == 'file':
        return os.path.getsize(k.path)
    elif k.scheme == 's3':
        # Don't rebind k here: if the lookup fails, we still need k.netloc
        # and k.path for the error message.
        remote_object = self._lookup(k.netloc, k.path, version_id=version_id)
        if remote_object is None:
            raise KeyNotFound("s3://%s/%s not found on s3" % (k.netloc, k.path))
        return remote_object.size
    else:
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)

def md5(self, key_or_file):
    '''
    Return the MD5 checksum of a file. If it's on s3, don't download it.
    '''
    k = path.parse(key_or_file)
    if k.scheme == 'file':
        from baiji.util.md5 import md5_for_file
        return md5_for_file(k.path)
    elif k.scheme == 's3':
        res = self._get_etag(k.netloc, k.path)
        if "-" in res:
            raise ValueError(
                "md5 hashes not available from s3 for files that were uploaded as multipart "
                "(if over 5gb, there's no hope; if under, try copying it to itself to have S3 reset the etag)")
        return res
    else:
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)

def put_string(self, key, s, encrypt=True, replace=True):
    '''
    Save string ``s`` to S3 as ``key``.

    If ``replace=True``, this will overwrite an existing key.
    If ``replace=False``, this will be a no-op when the key already exists.
    '''
    from boto.s3.key import Key
    from baiji.util.munging import _strip_initial_slashes
    key = path.parse(key)
    b = self._bucket(key.netloc)
    k = Key(b)
    k.key = _strip_initial_slashes(key.path)
    k.set_contents_from_string(s, encrypt_key=encrypt, replace=replace)

def __init__(self, key, connection):
    import re
    from baiji import path
    self.raw = key
    self.connection = connection
    self.parsed = path.parse(key)
    self.remote_path = None  # value here will be set by the path setting, this just satisfies lint
    self.isdir = path.isdirlike(key)
    self.path = self.parsed.path
    if not (self.path.startswith(path.sep) or re.match(r'^[a-zA-Z]:', self.path)):
        self.path = path.sep + self.path
    self.bucket_name = self.parsed.netloc
    self.scheme = self.parsed.scheme
    if self.scheme not in ['file', 's3']:
        raise InvalidSchemeException("URI Scheme %s is not implemented" % self.scheme)

def touch(self, key, encrypt=True):
    """
    Touch a local file or a path on s3

    Locally, this is analogous to the unix touch command

    On s3, it creates an empty file if there is not one there already,
    but does not change the timestamps (not possible to do without
    actually moving the file)
    """
    if path.islocal(key):
        filename = path.parse(key).path
        with open(filename, 'a'):
            os.utime(filename, None)
    else:
        # The replace=False here means that we only take action if
        # the file doesn't exist, so we don't accidentally truncate
        # files when we just mean to be touching them
        self.put_string(key, '', encrypt=encrypt, replace=False)

def encrypt_at_rest(self, key):
    '''
    This method takes a key on s3 and encrypts it.
    Note that calling this method on a local file is an error
    and that calling it on an s3 key that is already encrypted,
    while allowed, is a no-op.
    '''
    k = path.parse(key)
    if k.scheme != 's3':
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
    remote_object = self._lookup(k.netloc, k.path)
    if remote_object is None:
        raise KeyNotFound("Error encrypting %s: Key doesn't exist" % (key, ))
    if not bool(remote_object.encrypted):
        bucket = self._bucket(k.netloc)
        src = k.path
        if src.startswith(path.sep):
            src = src[len(path.sep):]  # NB: copy_key is failing with absolute src keys...
        bucket.copy_key(src, k.netloc, src, preserve_acl=True, metadata=None, encrypt_key=True)

def etag(self, key_or_file):
    '''
    Return the s3 etag of the file. For single part uploads (for us, files
    less than 5gb) this is the same as md5.
    '''
    from baiji.copy import S3_MAX_UPLOAD_SIZE
    k = path.parse(key_or_file)
    if k.scheme == 'file':
        import math
        from baiji.util.md5 import md5_for_file
        file_size = os.path.getsize(k.path)
        if file_size > S3_MAX_UPLOAD_SIZE:
            n_parts = int(math.ceil(float(file_size) / S3_MAX_UPLOAD_SIZE))
            return self._build_etag(k.path, n_parts, S3_MAX_UPLOAD_SIZE)
        else:
            return md5_for_file(k.path)
    elif k.scheme == 's3':
        return self._get_etag(k.netloc, k.path)
    else:
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)

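# Minimal sketch (not part of the original source) of the etag scheme that etag()
# and etag_matches() rely on: for a multipart upload, S3 takes the MD5 digest of
# each part, concatenates those digests, MD5s the concatenation, and appends
# "-<part count>". The function name and part_size argument are illustrative;
# the real implementation lives in self._build_etag.
def _multipart_etag_sketch(local_path, part_size):
    import hashlib
    whole = hashlib.md5()       # plain MD5, used when the upload fits in one part
    part_digests = []           # raw 16-byte MD5 digests, one per part
    with open(local_path, 'rb') as f:
        while True:
            chunk = f.read(part_size)
            if not chunk:
                break
            whole.update(chunk)
            part_digests.append(hashlib.md5(chunk).digest())
    if len(part_digests) <= 1:
        # A single-part upload gets a plain MD5 etag.
        return whole.hexdigest()
    return "%s-%d" % (hashlib.md5(b''.join(part_digests)).hexdigest(), len(part_digests))
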
def rm_r(self, key_or_file, force=False, quiet=False):
    '''
    Remove a key and everything below it from AWS S3.

    Prompts for confirmation on each file when force is False.
    Raises an exception when not using AWS.
    '''
    k = path.parse(key_or_file)
    if not k.scheme == 's3':
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
    bucket = k.netloc
    keys_to_delete = self.ls(key_or_file)
    for key_to_delete in keys_to_delete:
        url = "s3://%s%s" % (bucket, key_to_delete)
        if not force:
            from baiji.util.console import confirm
            if not confirm("Remove %s" % url):
                continue
        self.rm(url)
        if not quiet:
            print("[deleted] %s" % url)

def rm(self, key_or_file, version_id=None):
    '''
    Remove a key from AWS S3
    '''
    import shutil
    from baiji.util.munging import _strip_initial_slashes
    k = path.parse(key_or_file)
    if k.scheme == 'file':
        if os.path.isdir(k.path):
            shutil.rmtree(k.path)
        elif os.path.exists(k.path):
            return os.remove(k.path)
        else:
            raise KeyNotFound("%s does not exist" % key_or_file)
    elif k.scheme == 's3':
        if not self.exists(key_or_file, version_id=version_id):
            raise KeyNotFound("%s does not exist" % key_or_file)
        return self._bucket(k.netloc).delete_key(_strip_initial_slashes(k.path), version_id=version_id)
    else:
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)

def etag_matches(self, key_or_file, other_etag):
    '''
    Check whether the etag of a local file or s3 key matches ``other_etag``,
    including the ``md5-n_parts`` style etags that s3 assigns to multipart uploads.
    '''
    import math
    from baiji.copy import S3_MAX_UPLOAD_SIZE
    k = path.parse(key_or_file)
    if "-" not in other_etag or k.scheme == 's3':
        return self.etag(key_or_file) == other_etag
    else:
        # This is the case where the key was uploaded multipart and has a `md5-n_parts` type etag
        n_parts = int(other_etag.split("-")[1])
        file_size = os.path.getsize(k.path)
        # There are a number of possible part sizes that could produce any given
        # number of parts. The most likely and only ones we've seen so far are
        # these, but we might someday need to try others, which might require
        # exhaustively searching the possibilities....
        # (n_parts - 1) * part_size < file_size <= n_parts * part_size
        min_part_size = int(math.ceil(float(file_size) / n_parts))
        max_part_size = file_size // (n_parts - 1)
        possible_part_sizes = [
            S3_MAX_UPLOAD_SIZE,     # what we do
            file_size // n_parts,   # seen this from third party uploaders
            min_part_size,          # just in case
            max_part_size,          # seen this from third party uploaders
            1024 * 1024 * 8,        # seen this from third party uploaders
            1024 * 1024 * 5,        # the minimum s3 will allow
        ]
        possible_part_sizes = set([
            part_size for part_size in possible_part_sizes
            if part_size <= max_part_size and part_size >= 1024 * 1024 * 5
        ])
        if not possible_part_sizes:
            return False
        for part_size in possible_part_sizes:
            if self._build_etag(k.path, n_parts, part_size) == other_etag:
                return True
        return False

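# Worked example (not part of the original source) of the part-size bound used above.
# For a hypothetical 104857600-byte (100 MiB) file whose etag ends in "-13", any
# candidate part size must satisfy
# (n_parts - 1) * part_size < file_size <= n_parts * part_size.
def _example_part_size_bounds():
    import math
    file_size = 104857600   # hypothetical local file size in bytes
    n_parts = 13            # taken from an etag like "....-13"
    min_part_size = int(math.ceil(float(file_size) / n_parts))  # 8065970
    max_part_size = file_size // (n_parts - 1)                  # 8738133
    print("part size must be between %d and %d bytes" % (min_part_size, max_part_size))
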
def exists(self, key_or_file, retries_allowed=3, version_id=None):
    '''
    Check if a file exists on AWS S3

    Returns a boolean.

    If the key is not found then we recheck up to `retries_allowed` times. We only
    do this on s3. We've had some observations of what appears to be eventual
    consistency, so this makes it a bit more reliable. This does slow down the call
    in the case where the key does not exist.

    On a relatively slow, high latency connection, a test of 100 attempts to
    retrieve a non-existent file gives:

    With retries_allowed=1: median=457.587 ms, mean=707.12387 ms
    With retries_allowed=3: median=722.969 ms, mean=1185.86299 ms
    With retries_allowed=10: median=2489.767 ms, mean=2995.34233 ms
    With retries_allowed=100: median=24694.0815 ms, mean=26754.64137 ms

    So assume that leaving retries_allowed=3 will cost you a bit less than double the time.
    '''
    k = path.parse(key_or_file)
    if k.scheme == 'file':
        return os.path.exists(k.path)
    elif k.scheme == 's3':
        retry_attempts = 0
        while retry_attempts < retries_allowed:
            key = self._lookup(k.netloc, k.path, cache_buckets=True, version_id=version_id)
            if key:
                if retry_attempts > 0:  # only if we find it after failing at least once
                    import warnings
                    from baiji.exceptions import EventualConsistencyWarning
                    warnings.warn(
                        "S3 is behaving in an eventually consistent way in s3.exists({}) -- "
                        "it took {} attempts to locate the key".format(key_or_file, retry_attempts + 1),
                        EventualConsistencyWarning)
                return True
            retry_attempts += 1
        return False
    else:
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)

def get_string(self, key, encoding=None):
    '''
    Get string stored in S3 ``key``.
    '''
    k = path.parse(key)
    return self._lookup(k.netloc, k.path).get_contents_as_string(encoding=encoding)

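# Usage sketch (not part of the original source; the key is hypothetical).
# put_string and get_string round-trip small payloads without touching the
# local filesystem.
def _example_string_round_trip():
    from baiji.connection import S3Connection
    s3 = S3Connection()
    s3.put_string('s3://example-bucket/notes/hello.txt', 'hello, world', replace=True)
    print(s3.get_string('s3://example-bucket/notes/hello.txt', encoding='utf-8'))
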