Example 1
import hashlib
import mimetypes


def put_string_to_key(bucket, key_name, content, is_public, callback=None):
    """Write string to key in S3 bucket. If contents of existing key are
    unchanged, there will be no modification.
    Params:
        bucket (boto.s3 object): The bucket to write to.
        key_name (str): The key to write to (must include any applicable prefix).
        content (str): The content to write to the key.
        is_public (bool): Whether the new object should be publicly readable.
        callback (function): An optional progress callback.
    """
    key = bucket.get_key(key_name)
    if key:
        etag = key.etag.strip('"').lower()
        local_etag = hashlib.md5(content).hexdigest().lower()

        if etag == local_etag:
            # key contents haven't changed
            return

    key = bucket.new_key(key_name)
    mimetype = mimetypes.guess_type(key_name)[0]
    if mimetype:
        key.set_metadata('Content-Type', mimetype)

    policy = 'public-read' if is_public else None

    key.set_contents_from_string(content, policy=policy, cb=callback)
    key.close()
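For context, here is a minimal usage sketch. It is illustrative only: it assumes boto 2.x credentials are already configured and that a bucket named 'mybucket' exists; the bucket name and key name are made up.

import boto

# Hypothetical setup: connect and fetch an existing bucket (names are examples only).
bucket = boto.connect_s3().get_bucket('mybucket')

# Upload a small text object privately; pass is_public=True for a 'public-read' ACL.
put_string_to_key(bucket, 'reports/summary.txt', 'hello world', is_public=False)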
Example 2
def test_delete_key(self):
    bucket = self.s3.get_bucket(self.old_style_bucket_id)
    key = bucket.new_key('testkey')
    key.set_contents_from_string('test')
    key.close()
    self.assertEqual([key.name for key in bucket.get_all_keys()], ['testkey'])
    _delete_key(bucket, 'testkey')
    self.assertEqual([key.name for key in bucket.get_all_keys()], [])
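The helper `_delete_key` is referenced here but not shown in this listing. A minimal sketch of what it might look like, assuming plain boto 2.x semantics and no bucket versioning:

def _delete_key(bucket, key_name):
    # Hypothetical implementation, not from the listing: boto's
    # Bucket.delete_key removes the named object from the bucket.
    bucket.delete_key(key_name)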
Example 3
def s3_iter_bucket(bucket, prefix="", accept_key=lambda key: True, key_limit=None, workers=16):
    """
    Iterate over and download all S3 files under `bucket/prefix`, yielding
    `(key, key content)` 2-tuples (generator).

    `accept_key` is a function that accepts a key name (unicode string) and
    returns True/False, signalling whether the given key should be downloaded
    (default: accept all keys).

    If `key_limit` is given, stop after yielding that many results.

    The keys are processed in parallel, using `workers` processes (default: 16),
    to speed up downloads greatly. If multiprocessing is not available (i.e.
    NO_MULTIPROCESSING is True), this parameter is ignored.

    Example::

      >>> mybucket = boto.connect_s3().get_bucket('mybucket')

      >>> # get all JSON files under "mybucket/foo/"
      >>> for key, content in s3_iter_bucket(mybucket, prefix='foo/', accept_key=lambda key: key.endswith('.json')):
      ...     print key, len(content)

      >>> # limit to 10k files, using 32 parallel workers (default is 16)
      >>> for key, content in s3_iter_bucket(mybucket, key_limit=10000, workers=32):
      ...     print key, len(content)

    """
    total_size, key_no = 0, -1
    keys = (key for key in bucket.list(prefix=prefix) if accept_key(key.name))

    if NO_MULTIPROCESSING:
        logger.info("iterating over keys from %s without multiprocessing" % bucket)
        iterator = imap(s3_iter_bucket_process_key, keys)
    else:
        logger.info("iterating over keys from %s with %i workers" % (bucket, workers))
        pool = multiprocessing.pool.Pool(processes=workers)
        iterator = pool.imap_unordered(s3_iter_bucket_process_key, keys)

    for key_no, (key, content) in enumerate(iterator):
        if key_no % 1000 == 0:
            logger.info(
                "yielding key #%i: %s, size %i (total %.1fMB)" % (key_no, key, len(content), total_size / 1024.0 ** 2)
            )

        yield key, content
        key.close()
        total_size += len(content)

        if key_limit is not None and key_no + 1 >= key_limit:
            # we were asked to output only a limited number of keys => we're done
            break

    if not NO_MULTIPROCESSING:
        pool.terminate()

    logger.info("processed %i keys, total size %i" % (key_no + 1, total_size))
Example 4
def s3_iter_bucket(bucket, prefix='', accept_key=lambda key: True, key_limit=None, workers=16, retries=3):
    """
    Iterate over and download all S3 files under `bucket/prefix`, yielding
    `(key, key content)` 2-tuples (generator).

    `accept_key` is a function that accepts a key name (unicode string) and
    returns True/False, signalling whether the given key should be downloaded
    (default: accept all keys).

    If `key_limit` is given, stop after yielding that many results.

    The keys are processed in parallel, using `workers` processes (default: 16),
    to speed up downloads greatly. If multiprocessing is not available (i.e.
    MULTIPROCESSING is False), this parameter is ignored.

    Example::

      >>> mybucket = boto.connect_s3().get_bucket('mybucket')

      >>> # get all JSON files under "mybucket/foo/"
      >>> for key, content in s3_iter_bucket(mybucket, prefix='foo/', accept_key=lambda key: key.endswith('.json')):
      ...     print key, len(content)

      >>> # limit to 10k files, using 32 parallel workers (default is 16)
      >>> for key, content in s3_iter_bucket(mybucket, key_limit=10000, workers=32):
      ...     print key, len(content)

    """
    total_size, key_no = 0, -1
    keys = ({'key': key, 'retries': retries} for key in bucket.list(prefix=prefix) if accept_key(key.name))

    if MULTIPROCESSING:
        logger.info("iterating over keys from %s with %i workers" % (bucket, workers))
        pool = multiprocessing.pool.Pool(processes=workers)
        iterator = pool.imap_unordered(s3_iter_bucket_process_key_with_kwargs, keys)
    else:
        logger.info("iterating over keys from %s without multiprocessing" % bucket)
        iterator = imap(s3_iter_bucket_process_key_with_kwargs, keys)

    for key_no, (key, content) in enumerate(iterator):
        if key_no % 1000 == 0:
            logger.info("yielding key #%i: %s, size %i (total %.1fMB)" %
                (key_no, key, len(content), total_size / 1024.0 ** 2))

        yield key, content
        key.close()
        total_size += len(content)

        if key_limit is not None and key_no + 1 >= key_limit:
            # we were asked to output only a limited number of keys => we're done
            break

    if MULTIPROCESSING:
        pool.terminate()

    logger.info("processed %i keys, total size %i" % (key_no + 1, total_size))