def test_get_all_keys_tree(self):
    """
    test storing and retrieving a directory tree
    """
    # 2011-12-04 -- s3 clips leading slash
    key_names = [
        "aaa/b/cccc/1",
        "aaa/b/ccccccccc/1",
        "aaa/b/ccccccccc/2",
        "aaa/b/ccccccccc/3",
        "aaa/b/dddd/1",
        "aaa/b/dddd/2",
        "aaa/e/ccccccccc/1",
        "fff/e/ccccccccc/1",
    ]

    # create the bucket
    bucket = self._s3_connection.create_unique_bucket()
    self.assertTrue(bucket is not None)
    for key in bucket.list():
        key.delete()

    # create some keys
    keys = list()
    for key_name in key_names:
        key = Key(bucket)

        # set the name
        key.name = key_name

        # upload some data
        test_string = os.urandom(1024)
        key.set_contents_from_string(test_string)
        self.assertTrue(key.exists())

        keys.append(key)

    result_set = BucketListResultSet(bucket, prefix="aaa")
    self.assertEqual(len(list(result_set)), 7)

    result_set = BucketListResultSet(bucket, prefix="aaa/b")
    self.assertEqual(len(list(result_set)), 6)

    result_set = BucketListResultSet(bucket, prefix="aaa/b/ccccccccc/")
    self.assertEqual(len(list(result_set)), 3)

    result_set = BucketListResultSet(bucket, prefix="aaa/b/dddd")
    self.assertEqual(len(list(result_set)), 2)

    result_set = BucketListResultSet(bucket, prefix="aaa/e")
    self.assertEqual(len(list(result_set)), 1)

    # delete the keys
    for key in bucket.list():
        key.delete()

    # delete the bucket
    self._s3_connection.delete_bucket(bucket.name)
def list(self, prefix='', delimiter='', marker='', headers=None):
    """
    List key objects within a bucket.  This returns an instance of a
    BucketListResultSet that automatically handles all of the result
    paging, etc. from S3.  You just need to keep iterating until there
    are no more results.

    Called with no arguments, this will return an iterator object across
    all keys within the bucket.

    :type prefix: string
    :param prefix: allows you to limit the listing to a particular
        prefix.  For example, if you call the method with
        prefix='/foo/' then the iterator will only cycle through
        the keys that begin with the string '/foo/'.

    :type delimiter: string
    :param delimiter: can be used in conjunction with the prefix
        to allow you to organize and browse your keys
        hierarchically.  See:
        http://docs.amazonwebservices.com/AmazonS3/2006-03-01/
        for more details.

    :type marker: string
    :param marker: The "marker" of where you are in the result set

    :rtype: :class:`boto.s3.bucketlistresultset.BucketListResultSet`
    :return: an instance of a BucketListResultSet that handles paging, etc
    """
    return BucketListResultSet(self, prefix, delimiter, marker, headers)
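A minimal usage sketch for the list() method above, showing prefix and delimiter filtering. The bucket name and prefixes are hypothetical placeholders; credentials are assumed to be available to boto from the environment.

import boto
from boto.s3.prefix import Prefix

conn = boto.connect_s3()
bucket = conn.get_bucket('my-bucket')  # hypothetical bucket name

# Iterate over every key under 'logs/2023/'; result paging is handled
# internally by the BucketListResultSet that list() returns.
for key in bucket.list(prefix='logs/2023/'):
    print(key.name, key.size)

# With a delimiter, common prefixes come back as Prefix objects, giving a
# one-level "directory" view of the keys.
for item in bucket.list(prefix='logs/', delimiter='/'):
    if isinstance(item, Prefix):
        print('subdir: %s' % item.name)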
def list_bucket_contents(self, bucket, subdir=None):
    """Returns files in the Google Storage bucket as a (dirs, files) tuple.

    TODO(epoger): This should raise an exception if subdir does not exist in
    Google Storage; right now, it just returns empty contents.

    Args:
      bucket: name of the Google Storage bucket
      subdir: directory within the bucket to list, or None for root directory
    """
    # The GS command relies on the prefix (if any) ending with a slash.
    prefix = subdir or ''
    if prefix and not prefix.endswith('/'):
        prefix += '/'
    prefix_length = len(prefix) if prefix else 0

    b = self._connect_to_bucket(bucket=bucket)
    items = BucketListResultSet(bucket=b, prefix=prefix, delimiter='/')
    dirs = []
    files = []
    for item in items:
        t = type(item)
        if t is Key:
            files.append(item.name[prefix_length:])
        elif t is Prefix:
            dirs.append(item.name[prefix_length:-1])
    return (dirs, files)
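A sketch of how list_bucket_contents() might be called. Here gs is assumed to be an instance of the utility class that defines the method, and the bucket and subdir names are placeholders.

# List one level of a hypothetical bucket; Prefix items become 'dirs',
# Key items become 'files', both with the subdir prefix stripped.
dirs, files = gs.list_bucket_contents(bucket='my-gs-bucket', subdir='rendered')
print('subdirectories: %s' % dirs)
print('files: %s' % files)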
def reap(s3_bucket, capacity, dry=False):
    keys = [key for key in BucketListResultSet(s3_bucket)]
    if len(keys) <= capacity:
        return 0
    keys = sorted(keys, key=lambda x: iso8601.parse_date(x.last_modified))
    keys.reverse()
    for key in keys[capacity:]:
        logger.debug("deleting key %s last modified @ %s from s3 bucket %s",
                     key.name, key.last_modified, s3_bucket.name)
        if not dry:
            key.delete()
    return len(keys) - capacity
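A sketch of how reap() might be driven. The bucket name and capacity are placeholder values, and the iso8601 module and logger used inside reap() are assumed to be set up elsewhere.

import boto

conn = boto.connect_s3()
backups = conn.get_bucket('nightly-backups')  # hypothetical bucket name

# Keep only the 30 most recently modified keys; dry=True logs what would be
# deleted without actually deleting anything.
removable = reap(backups, capacity=30, dry=True)
print('%d keys would be deleted' % removable)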
def info():
    if connected == 0:
        print 'Not connected!'
    elif connected == 1:
        bucket = raw_input('Bucket name:').strip()
        filename = raw_input('Filename:').strip()
        from boto.s3.bucketlistresultset import BucketListResultSet
        b = conn.get_bucket(bucket)
        brs = BucketListResultSet(bucket=b)
        for f in brs:
            key = b.lookup(f.name)
            print 'File: ' + f.name
            print 'size: ' + str(key.size)
            print 'last modified: ' + str(key.last_modified)
            print 'etag (md5): ' + str(key.etag)
def test_get_all_keys_empty_bucket(self):
    """
    test get_all_keys() on an empty bucket
    """
    log = logging.getLogger("empty")

    # create the bucket
    bucket = self._s3_connection.create_unique_bucket()
    self.assertTrue(bucket is not None)
    for key in bucket.list():
        key.delete()

    # try a simple get_all_keys()
    result_set = BucketListResultSet(bucket)
    self.assertEqual(list(result_set), [])

    # delete the bucket
    self._s3_connection.delete_bucket(bucket.name)
def list(self, prefix='', delimiter='', marker='', headers=None):
    """
    List key objects within a bucket.  This returns an instance of a
    BucketListResultSet that automatically handles all of the result
    paging, etc. from S3.  You just need to keep iterating until there
    are no more results.

    Called with no arguments, this will return an iterator object across
    all keys within the bucket.

    The Key objects returned by the iterator are obtained by parsing
    the results of a GET on the bucket, also known as the List Objects
    request.  The XML returned by this request contains only a subset
    of the information about each key.  Certain metadata fields such
    as Content-Type and user metadata are not available in the XML.
    Therefore, if you want these additional metadata fields you will
    have to do a HEAD request on the Key in the bucket.

    :type prefix: string
    :param prefix: allows you to limit the listing to a particular
        prefix.  For example, if you call the method with
        prefix='/foo/' then the iterator will only cycle through
        the keys that begin with the string '/foo/'.

    :type delimiter: string
    :param delimiter: can be used in conjunction with the prefix
        to allow you to organize and browse your keys
        hierarchically.  See:
        http://docs.amazonwebservices.com/AmazonS3/2006-03-01/
        for more details.

    :type marker: string
    :param marker: The "marker" of where you are in the result set

    :rtype: :class:`boto.s3.bucketlistresultset.BucketListResultSet`
    :return: an instance of a BucketListResultSet that handles paging, etc
    """
    return BucketListResultSet(self, prefix, delimiter, marker, headers)
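The docstring above notes that Key objects coming out of a listing lack some metadata such as Content-Type. A small sketch of fetching the full metadata with a per-key HEAD request via bucket.get_key(); the bucket name and prefix are placeholders.

import boto

conn = boto.connect_s3()
bucket = conn.get_bucket('my-bucket')  # hypothetical bucket name

for listed_key in bucket.list(prefix='reports/'):
    # get_key() issues a HEAD request and returns a Key populated with
    # Content-Type and user metadata, unlike the listed Key itself.
    full_key = bucket.get_key(listed_key.name)
    print(full_key.name, full_key.content_type)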
def __iter__(self):
    return iter(BucketListResultSet(self))
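Because __iter__ wraps a BucketListResultSet, a Bucket object can be iterated directly. A minimal sketch, assuming a placeholder bucket named 'my-bucket'.

import boto

conn = boto.connect_s3()
bucket = conn.get_bucket('my-bucket')  # hypothetical bucket name

for key in bucket:  # equivalent to: for key in bucket.list()
    print(key.name)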
def upload_dir_contents(self, source_dir, dest_bucket, dest_dir,
                        num_threads=DEFAULT_UPLOAD_THREADS,
                        upload_if=UploadIf.ALWAYS, **kwargs):
    """Recursively upload contents of a local directory to Google Storage.

    params:
      source_dir: full path (local-OS-style) on local disk of directory to
          copy contents of
      dest_bucket: GS bucket to copy the files into
      dest_dir: full path (Posix-style) within that bucket; write the files
          into this directory.  If None, write into the root directory of
          the bucket.
      num_threads: how many files to upload at once
      upload_if: one of the UploadIf values, describing in which cases we
          should upload the file
      kwargs: any additional keyword arguments "inherited" from upload_file()

    The copy operates as a merge: any files in source_dir will be "overlaid"
    on top of the existing content in dest_dir.  Existing files with the
    same names may or may not be overwritten, depending on the value of
    upload_if.

    TODO(epoger): Upload multiple files simultaneously to reduce latency.
    """
    b = self._connect_to_bucket(bucket=dest_bucket)
    if not dest_dir:
        dest_dir = ''

    # Create a set of all files within source_dir.
    source_fileset = set()
    prefix_length = len(source_dir) + 1
    for dirpath, _, filenames in os.walk(source_dir):
        relative_dirpath = dirpath[prefix_length:]
        for filename in filenames:
            source_fileset.add(os.path.join(relative_dirpath, filename))
    num_files_total = len(source_fileset)

    # If we are only uploading files conditionally, remove any unnecessary
    # files from source_fileset.
    if upload_if == self.UploadIf.ALWAYS:
        pass  # there are no shortcuts... upload them all
    else:
        # Create a mapping of filename to Key for existing files within dest_dir
        existing_dest_filemap = {}
        prefix = dest_dir
        if prefix and not prefix.endswith('/'):
            prefix += '/'
        prefix_length = len(prefix)
        items = BucketListResultSet(bucket=b, prefix=prefix)
        for item in items:
            if type(item) is Key:
                existing_dest_filemap[item.name[prefix_length:]] = item

        # Now, depending on upload_if, trim files we should skip uploading.
        files_in_common = source_fileset.intersection(
            existing_dest_filemap.keys())
        if upload_if == self.UploadIf.IF_NEW:
            source_fileset -= files_in_common
        elif upload_if == self.UploadIf.IF_MODIFIED:
            for rel_path in files_in_common:
                local_md5 = '"%s"' % _get_local_md5(
                    path=os.path.join(source_dir, rel_path))
                key = existing_dest_filemap[rel_path]
                if local_md5 == key.etag:
                    source_fileset.remove(rel_path)
        else:
            raise Exception('unknown value of upload_if: %s' % upload_if)

    # Upload any files still in source_fileset.
    num_files_to_upload = len(source_fileset)
    print('Uploading %d files, skipping %d ...' % (
        num_files_to_upload, num_files_total - num_files_to_upload))
    if num_files_to_upload == 0:
        return
    if num_threads > num_files_to_upload:
        num_threads = num_files_to_upload

    # Create a work queue with all files that need to be uploaded.
    q = Queue.Queue(maxsize=num_files_to_upload)
    for rel_path in source_fileset:
        q.put(rel_path)
    err = {}

    # Spin up worker threads to read from the task queue.
    def worker():
        while True:
            try:
                rel_path = q.get(block=False)
            except Queue.Empty:
                return  # no more tasks in the queue, so exit
            print(' Uploading file %d/%d: %s' % (
                num_files_to_upload - q.qsize(), num_files_to_upload,
                rel_path))
            retries = 5
            for retry in range(retries):
                try:
                    self.upload_file(
                        source_path=os.path.join(source_dir, rel_path),
                        dest_bucket=b,
                        dest_path=posixpath.join(dest_dir, rel_path),
                        upload_if=self.UploadIf.ALWAYS,
                        **kwargs)
                    q.task_done()
                    break
                except Exception as error:
                    if retry < retries - 1:
                        print ' Retrying upload, attempt #%d' % (retry + 1)
                        time.sleep(2 ** retry)
                    else:
                        err[rel_path] = error

    for _ in range(num_threads):
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()

    # Block until all files have been uploaded and all workers have exited.
    q.join()

    if err:
        errMsg = 'Failed to upload the following: \n\n'
        for rel_path, e in err.iteritems():
            errMsg += '%s: %s\n' % (rel_path, e)
        raise Exception(errMsg)
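A hypothetical invocation of upload_dir_contents() above. Here gs is assumed to be an instance of the utility class that defines it, and the local path, bucket, and destination directory names are placeholders.

gs.upload_dir_contents(
    source_dir='/tmp/render-output',      # hypothetical local directory
    dest_bucket='my-gs-bucket',            # hypothetical GS bucket
    dest_dir='renders/latest',
    num_threads=4,
    upload_if=gs.UploadIf.IF_MODIFIED)     # skip files whose MD5 already matches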