def put_string_to_key(bucket, key_name, content, is_public, callback=None):
    """Write string to key in S3 bucket.

    If contents of existing key are unchanged, there will be no modification.

    Params:
        bucket (boto.s3 object): The bucket to write to.
        key_name (str): The key to write to (must include any applicable prefix).
        content (str): The content to write to the key.
        is_public (bool): Whether the new object should be publicly readable.
        callback (function): An optional progress callback.
    """
    key = bucket.get_key(key_name)
    if key:
        etag = key.etag.strip('"').lower()
        local_etag = hashlib.md5(content).hexdigest().lower()

        if etag == local_etag:
            # key contents haven't changed
            return

    key = bucket.new_key(key_name)
    mimetype = mimetypes.guess_type(key_name)[0]
    if mimetype:
        key.set_metadata('Content-Type', mimetype)

    policy = 'public-read' if is_public else None
    key.set_contents_from_string(content, policy=policy, cb=callback)
    key.close()
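# Usage sketch for put_string_to_key, assuming an existing boto connection.
# The bucket name and key path below are placeholders, not taken from the code above.
import boto

bucket = boto.connect_s3().get_bucket('mybucket')

# Upload a small JSON document and make it publicly readable. Because of the
# etag comparison above, re-running this call with identical content is a no-op.
put_string_to_key(bucket, 'reports/latest.json', '{"status": "ok"}', is_public=True)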
def test_delete_key(self):
    bucket = self.s3.get_bucket(self.old_style_bucket_id)
    key = bucket.new_key('testkey')
    key.set_contents_from_string('test')
    key.close()

    self.assertEqual([key.name for key in bucket.get_all_keys()], ['testkey'])
    _delete_key(bucket, 'testkey')
    self.assertEqual([key.name for key in bucket.get_all_keys()], [])
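# A minimal sketch (an assumption, not the tested module's actual code) of the
# _delete_key helper exercised by the test above: it removes the named key from
# the bucket via boto's Bucket.delete_key.
def _delete_key(bucket, key_name):
    """Delete `key_name` from `bucket`."""
    bucket.delete_key(key_name)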
def s3_iter_bucket(bucket, prefix="", accept_key=lambda key: True, key_limit=None, workers=16):
    """
    Iterate and download all S3 files under `bucket/prefix`, yielding out
    `(key, key content)` 2-tuples (generator).

    `accept_key` is a function that accepts a key name (unicode string) and
    returns True/False, signalling whether the given key should be downloaded
    or not (default: accept all keys).

    If `key_limit` is given, stop after yielding out that many results.

    The keys are processed in parallel, using `workers` processes (default: 16),
    to speed up downloads greatly. If multiprocessing is not available (i.e.
    NO_MULTIPROCESSING is True), this parameter is ignored.

    Example::

      >>> mybucket = boto.connect_s3().get_bucket('mybucket')

      >>> # get all JSON files under "mybucket/foo/"
      >>> for key, content in s3_iter_bucket(mybucket, prefix='foo/', accept_key=lambda key: key.endswith('.json')):
      ...     print key, len(content)

      >>> # limit to 10k files, using 32 parallel workers (default is 16)
      >>> for key, content in s3_iter_bucket(mybucket, key_limit=10000, workers=32):
      ...     print key, len(content)

    """
    total_size, key_no = 0, -1
    keys = (key for key in bucket.list(prefix=prefix) if accept_key(key.name))

    if NO_MULTIPROCESSING:
        logger.info("iterating over keys from %s without multiprocessing" % bucket)
        iterator = imap(s3_iter_bucket_process_key, keys)
    else:
        logger.info("iterating over keys from %s with %i workers" % (bucket, workers))
        pool = multiprocessing.pool.Pool(processes=workers)
        iterator = pool.imap_unordered(s3_iter_bucket_process_key, keys)

    for key_no, (key, content) in enumerate(iterator):
        if key_no % 1000 == 0:
            logger.info(
                "yielding key #%i: %s, size %i (total %.1fMB)"
                % (key_no, key, len(content), total_size / 1024.0 ** 2)
            )

        yield key, content
        key.close()
        total_size += len(content)

        if key_limit is not None and key_no + 1 >= key_limit:
            # we were asked to output only a limited number of keys => we're done
            break

    if not NO_MULTIPROCESSING:
        pool.terminate()

    logger.info("processed %i keys, total size %i" % (key_no + 1, total_size))
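# A minimal sketch, under assumptions, of the per-key worker referenced above.
# The real s3_iter_bucket_process_key is not shown in this section; conceptually
# it downloads one key's contents so that the pool workers do the network I/O in
# parallel and the caller only consumes ready (key, content) pairs.
def s3_iter_bucket_process_key(key):
    """Download a single key's contents and return (key, content)."""
    return key, key.get_contents_as_string()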
def s3_iter_bucket(bucket, prefix='', accept_key=lambda key: True, key_limit=None, workers=16, retries=3):
    """
    Iterate and download all S3 files under `bucket/prefix`, yielding out
    `(key, key content)` 2-tuples (generator).

    `accept_key` is a function that accepts a key name (unicode string) and
    returns True/False, signalling whether the given key should be downloaded
    or not (default: accept all keys).

    If `key_limit` is given, stop after yielding out that many results.

    The keys are processed in parallel, using `workers` processes (default: 16),
    to speed up downloads greatly. If multiprocessing is not available (i.e.
    MULTIPROCESSING is False), this parameter is ignored.

    Example::

      >>> mybucket = boto.connect_s3().get_bucket('mybucket')

      >>> # get all JSON files under "mybucket/foo/"
      >>> for key, content in s3_iter_bucket(mybucket, prefix='foo/', accept_key=lambda key: key.endswith('.json')):
      ...     print key, len(content)

      >>> # limit to 10k files, using 32 parallel workers (default is 16)
      >>> for key, content in s3_iter_bucket(mybucket, key_limit=10000, workers=32):
      ...     print key, len(content)

    """
    total_size, key_no = 0, -1
    keys = ({'key': key, 'retries': retries} for key in bucket.list(prefix=prefix) if accept_key(key.name))

    if MULTIPROCESSING:
        logger.info("iterating over keys from %s with %i workers" % (bucket, workers))
        pool = multiprocessing.pool.Pool(processes=workers)
        iterator = pool.imap_unordered(s3_iter_bucket_process_key_with_kwargs, keys)
    else:
        logger.info("iterating over keys from %s without multiprocessing" % bucket)
        iterator = imap(s3_iter_bucket_process_key_with_kwargs, keys)

    for key_no, (key, content) in enumerate(iterator):
        if key_no % 1000 == 0:
            logger.info(
                "yielding key #%i: %s, size %i (total %.1fMB)"
                % (key_no, key, len(content), total_size / 1024.0 ** 2)
            )

        yield key, content
        key.close()
        total_size += len(content)

        if key_limit is not None and key_no + 1 >= key_limit:
            # we were asked to output only a limited number of keys => we're done
            break

    if MULTIPROCESSING:
        pool.terminate()

    logger.info("processed %i keys, total size %i" % (key_no + 1, total_size))
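# A minimal sketch, under assumptions, of the kwargs wrapper used by the
# retry-aware variant above. Pool.imap_unordered passes a single argument per
# task, so s3_iter_bucket packs each key and the retry count into a dict; the
# wrapper unpacks it and retries the download on transient errors. The retry
# loop here is illustrative, not the library's actual implementation.
def s3_iter_bucket_process_key_with_kwargs(kwargs):
    key, retries = kwargs['key'], kwargs['retries']
    for attempt in range(retries + 1):
        try:
            return key, key.get_contents_as_string()
        except Exception:
            if attempt == retries:
                raise
            logger.info("retrying %s (attempt %i of %i)" % (key, attempt + 1, retries))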