def put_multipart(self, local_path, destination_s3_path, part_size=67108864, **kwargs):
    """
    Put an object stored locally to an S3 path
    using S3 multi-part upload (for files > 5GB).

    :param local_path: Path to source local file
    :param destination_s3_path: URL for target S3 location
    :param part_size: Part size in bytes. Default: 67108864 (64MB), must be >= 5MB and <= 5 GB.
    :param kwargs: Keyword arguments are passed to the boto function `initiate_multipart_upload`
    """
    # calculate number of parts to upload
    # based on the size of the file
    source_size = os.stat(local_path).st_size

    if source_size <= part_size:
        # fall back to the standard, non-multipart strategy
        return self.put(local_path, destination_s3_path, **kwargs)

    (bucket, key) = self._path_to_bucket_and_key(destination_s3_path)

    # grab and validate the bucket
    s3_bucket = self.s3.get_bucket(bucket, validate=True)

    # calculate the number of parts (int division).
    # use modulo to avoid float precision issues
    # for files that fit into an exact number of parts
    num_parts = \
        (source_size // part_size) \
        if source_size % part_size == 0 \
        else (source_size // part_size) + 1

    mp = None
    try:
        mp = s3_bucket.initiate_multipart_upload(key, **kwargs)

        for i in range(num_parts):
            # upload one part at a time to S3
            offset = part_size * i
            bytes = min(part_size, source_size - offset)
            with open(local_path, 'rb') as fp:
                part_num = i + 1
                logger.info('Uploading part %s/%s to %s', part_num, num_parts,
                            destination_s3_path)
                fp.seek(offset)
                mp.upload_part_from_file(fp, part_num=part_num, size=bytes)

        # finish the upload, making the file available in S3
        mp.complete_upload()
    except BaseException:
        if mp:
            logger.info('Canceling multipart s3 upload for %s', destination_s3_path)
            # cancel the upload so we don't get charged for
            # storage consumed by uploaded parts
            mp.cancel_upload()
        raise
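# Illustrative usage sketch (hypothetical file/bucket names; assumes the enclosing
# client class has already been instantiated as `client` with valid AWS credentials):
#
#   # an 8 GB file uploaded in the default 64 MB parts -> 128 parts
#   client.put_multipart('/tmp/events-2013-01-17.log',
#                        's3://my-bucket/logs/events-2013-01-17.log')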
def is_writable(self):
    """
    Currently only works with hadoopcli
    """
    if "/" in self.path:
        # example path: /log/ap/2013-01-17/00
        parts = self.path.split("/")
        # start with the full path and then up the tree until we can check
        length = len(parts)
        for part in range(length):
            path = "/".join(parts[0:length - part]) + "/"
            if self.fs.exists(path):
                # if the path exists and we can write there, great!
                if self._is_writable(path):
                    return True
                # if it exists and we can't =( sad panda
                else:
                    return False
        # We went through all parts of the path and we still couldn't find
        # one that exists.
        return False
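# Walk-through of the prefix checks above for the docstring's example path
# "/log/ap/2013-01-17/00" (purely illustrative):
#
#   part=0 -> checks "/log/ap/2013-01-17/00/"
#   part=1 -> checks "/log/ap/2013-01-17/"
#   part=2 -> checks "/log/ap/"
#   part=3 -> checks "/log/"
#   part=4 -> checks "/"
#
# The first prefix that exists decides the outcome via self._is_writable(path);
# if no prefix exists, the method returns False.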
def __copy_multipart(self, pool, src_bucket, src_key, dst_bucket, dst_key, part_size, **kwargs):
    """
    Copy a single S3 object to another S3 object, falling back to multipart copy where necessary

    NOTE: This is a private method and should only be called from within the `s3.copy` method

    :param pool: The threadpool to put the s3 copy processes onto
    :param src_bucket: source bucket name
    :param src_key: source key name
    :param dst_bucket: destination bucket name
    :param dst_key: destination key name
    :param part_size: Part size in bytes. Must be >= 5MB and <= 5 GB.
    :param kwargs: Keyword arguments are passed to the boto function `initiate_multipart_upload`
    """
    source_bucket = self.s3.get_bucket(src_bucket, validate=True)
    dest_bucket = self.s3.get_bucket(dst_bucket, validate=True)

    key_size = source_bucket.lookup(src_key).size

    # We can't do a multipart copy on an empty Key, so handle this specially.
    # Also, don't bother using the multipart machinery if we're only dealing
    # with a small non-multipart file
    if key_size == 0 or key_size <= part_size:
        result = pool.apply_async(dest_bucket.copy_key, args=(dst_key, src_bucket, src_key), kwds=kwargs)
        # Bubble up any errors we may encounter
        return result.get()

    mp = None

    try:
        mp = dest_bucket.initiate_multipart_upload(dst_key, **kwargs)
        cur_pos = 0

        # Store the results from the apply_async in a list so we can check for failures
        results = []

        # Calculate the number of chunks the file will be split into
        num_parts = (key_size + part_size - 1) // part_size

        for i in range(num_parts):
            # Issue an S3 copy request, one part at a time, from one S3 object to another
            part_start = cur_pos
            cur_pos += part_size
            part_end = min(cur_pos - 1, key_size - 1)
            part_num = i + 1
            results.append(pool.apply_async(mp.copy_part_from_key,
                                            args=(src_bucket, src_key, part_num, part_start, part_end)))
            logger.info('Requesting copy of %s/%s to %s/%s', part_num, num_parts, dst_bucket, dst_key)

        logger.info('Waiting for multipart copy of %s/%s to finish', dst_bucket, dst_key)

        # This will raise any exceptions in any of the copy threads
        for result in results:
            result.get()

        # finish the copy, making the file available in S3
        mp.complete_upload()
        return mp.key_name
    except BaseException:
        logger.info('Error during multipart s3 copy for %s/%s to %s/%s...', src_bucket, src_key, dst_bucket, dst_key)
        # cancel the copy so we don't get charged for storage consumed by copied parts
        if mp:
            mp.cancel_upload()
        raise
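# Worked example of the part ranges computed above (illustrative numbers):
#   key_size = 157286400 (150 MB), part_size = 67108864 (64 MB)
#   num_parts = (157286400 + 67108864 - 1) // 67108864 = 3
#     part 1: bytes 0         .. 67108863   (64 MB)
#     part 2: bytes 67108864  .. 134217727  (64 MB)
#     part 3: bytes 134217728 .. 157286399  (remaining 22 MB, capped at key_size - 1)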
def copy_multipart(self, source_path, destination_path, part_size=67108864, **kwargs):
    """
    Copy a single S3 object to another S3 object using S3 multi-part copy (for files > 5GB).
    It uses one thread per part so that all parts are requested simultaneously
    for maximum speed.

    :param source_path: URL for S3 Source
    :param destination_path: URL for target S3 location
    :param part_size: Part size in bytes. Default: 67108864 (64MB), must be >= 5MB and <= 5 GB.
    :param kwargs: Keyword arguments are passed to the boto function `initiate_multipart_upload`
    """
    (src_bucket, src_key) = self._path_to_bucket_and_key(source_path)
    (dst_bucket, dst_key) = self._path_to_bucket_and_key(destination_path)

    dest_bucket = self.s3.get_bucket(dst_bucket, validate=True)
    source_bucket = self.s3.get_bucket(src_bucket, validate=True)

    source_size = source_bucket.lookup(src_key).size
    num_parts = (source_size + part_size - 1) // part_size

    # As the S3 copy command is completely server side, there is no issue with issuing a single
    # API call per part. However, this may in theory cause issues on systems with low ulimits for
    # number of threads when copying really large files, e.g. with a ~100GB file this will open ~1500
    # threads. We take the max of num_parts and 1, because copying an empty file gives num_parts == 0.
    pool = ThreadPool(processes=max(1, num_parts))

    mp = None
    try:
        mp = dest_bucket.initiate_multipart_upload(dst_key, **kwargs)
        cur_pos = 0

        # Store the results from the apply_async in a list so we can check for failures
        results = []

        for i in range(num_parts):
            # Issue an S3 copy request, one part at a time, from one S3 object to another
            part_start = cur_pos
            cur_pos += part_size
            part_end = min(cur_pos - 1, source_size - 1)
            part_num = i + 1
            results.append(pool.apply_async(mp.copy_part_from_key,
                                            args=(src_bucket, src_key, part_num, part_start, part_end)))
            logger.info('Requesting copy of %s/%s to %s', part_num, num_parts, destination_path)

        logger.info('Waiting for multipart copy of %s to finish', destination_path)
        pool.close()
        pool.join()

        # This will raise any exceptions in any of the copy threads
        for result in results:
            result.get()

        # finish the copy, making the file available in S3
        mp.complete_upload()
    except BaseException:
        if mp:
            logger.info('Canceling multipart s3 copy for %s to %s', source_path, destination_path)
            # cancel the copy so we don't get charged for
            # storage consumed by copied parts
            mp.cancel_upload()
        raise
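# Illustrative usage sketch (hypothetical bucket/key names; assumes the enclosing
# client class has already been instantiated as `client`):
#
#   # a ~10 GB object copied server-side in 64 MB parts -> 160 parallel copy requests
#   client.copy_multipart('s3://source-bucket/big-object.bin',
#                         's3://dest-bucket/big-object.bin',
#                         part_size=64 * 1024 * 1024)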