def multi_upload(self):
    """
    Performs a multipart upload. It initiates the multipart upload and
    creates a queue ``part_queue`` which is directly responsible for
    controlling the progress of the multipart upload. It then creates
    ``UploadPartTasks`` for threads to run via the ``executer``. This
    function waits for all of the parts in the multipart upload to
    finish, and then it completes the multipart upload.

    This method waits on its parts to finish, so threads are required
    to process the parts for this function to complete.
    """
    part_queue = NoBlockQueue(self.interrupt)
    complete_upload_queue = Queue.PriorityQueue()
    part_counter = MultiCounter()
    counter_lock = threading.Lock()
    bucket, key = find_bucket_key(self.dest)
    params = {'endpoint': self.endpoint, 'bucket': bucket, 'key': key}
    if self.parameters['acl']:
        params['acl'] = self.parameters['acl'][0]
    if self.parameters['guess_mime_type']:
        self._inject_content_type(params, self.src)
    response_data, http = operate(self.service, 'CreateMultipartUpload',
                                  params)
    upload_id = response_data['UploadId']
    size_uploads = self.chunksize
    num_uploads = int(math.ceil(self.size / float(size_uploads)))
    for i in range(1, (num_uploads + 1)):
        part_info = (self, upload_id, i, size_uploads)
        part_queue.put(part_info)
        task = UploadPartTask(session=self.session, executer=self.executer,
                              part_queue=part_queue,
                              dest_queue=complete_upload_queue,
                              region=self.region,
                              printQueue=self.printQueue,
                              interrupt=self.interrupt,
                              part_counter=part_counter,
                              counter_lock=counter_lock)
        self.executer.submit(task)
    part_queue.join()
    # The following ensures that if the multipart upload is in progress,
    # all part uploads finish before aborting or completing. This
    # really only applies when an interrupt signal is sent because the
    # ``part_queue.join()`` ensures this if the process is not
    # interrupted.
    while part_counter.count:
        time.sleep(0.1)
    parts_list = []
    while not complete_upload_queue.empty():
        part = complete_upload_queue.get()
        parts_list.append(part[1])
    if len(parts_list) == num_uploads:
        parts = {'Parts': parts_list}
        params = {'endpoint': self.endpoint, 'bucket': bucket,
                  'key': key, 'upload_id': upload_id,
                  'multipart_upload': parts}
        operate(self.service, 'CompleteMultipartUpload', params)
    else:
        abort_params = {'endpoint': self.endpoint, 'bucket': bucket,
                        'key': key, 'upload_id': upload_id}
        operate(self.service, 'AbortMultipartUpload', abort_params)
        raise Exception()
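
# NOTE: ``UploadPartTask`` is defined elsewhere. The sketch below only
# illustrates, under assumptions, how such a task could consume the
# ``(filename, upload_id, part_number, chunk_size)`` tuples placed on
# ``part_queue`` above and report completed parts on the priority queue
# so ``CompleteMultipartUpload`` receives them in order. The byte-range
# arithmetic and internals are illustrative, not the actual class.
class UploadPartTaskSketch(object):
    def __init__(self, part_queue, dest_queue, part_counter, counter_lock):
        self.part_queue = part_queue
        self.dest_queue = dest_queue
        self.part_counter = part_counter
        self.counter_lock = counter_lock

    def __call__(self):
        filename, upload_id, part_number, chunk_size = self.part_queue.get()
        with self.counter_lock:
            self.part_counter.count += 1
        try:
            with open(filename.src, 'rb') as f:
                # Seek to this part's offset and read exactly one chunk.
                f.seek(chunk_size * (part_number - 1))
                body = f.read(chunk_size)
            bucket, key = find_bucket_key(filename.dest)
            params = {'endpoint': filename.endpoint, 'bucket': bucket,
                      'key': key, 'upload_id': upload_id,
                      'part_number': part_number, 'body': body}
            response_data, http = operate(filename.service, 'UploadPart',
                                          params)
            # The priority queue orders entries by part number.
            self.dest_queue.put((part_number,
                                 {'ETag': response_data['ETag'],
                                  'PartNumber': part_number}))
        finally:
            with self.counter_lock:
                self.part_counter.count -= 1
            self.part_queue.task_done()
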
def multi_download(self):
    """
    Performs a multipart download. It assigns byte ranges of the s3
    object to individual tasks and creates a queue ``part_queue`` which
    is directly responsible for controlling the progress of the
    multipart download. It then creates ``DownloadPartTasks`` for
    threads to run via the ``executer``. This function waits for all of
    the parts in the multipart download to finish, and then the local
    file's modification time is set to the last modified time of the s3
    object.

    This method waits on its parts to finish, so threads are required
    to process the parts for this function to complete.
    """
    part_queue = NoBlockQueue(self.interrupt)
    dest_queue = NoBlockQueue(self.interrupt)
    part_counter = MultiCounter()
    write_lock = threading.Lock()
    counter_lock = threading.Lock()
    d = os.path.dirname(self.dest)
    try:
        if not os.path.exists(d):
            os.makedirs(d)
    except Exception:
        pass
    size_uploads = self.chunksize
    num_uploads = int(self.size / size_uploads)
    with open(self.dest, 'wb') as f:
        for i in range(num_uploads):
            part = (self, i, size_uploads)
            part_queue.put(part)
            task = DownloadPartTask(session=self.session,
                                    executer=self.executer,
                                    part_queue=part_queue,
                                    dest_queue=dest_queue, f=f,
                                    region=self.region,
                                    printQueue=self.printQueue,
                                    write_lock=write_lock,
                                    part_counter=part_counter,
                                    counter_lock=counter_lock)
            self.executer.submit(task)
        part_queue.join()
        # The following ensures that if the multipart download is
        # in progress, all part downloads finish before releasing the
        # file handle. This really only applies when an interrupt
        # signal is sent because the ``part_queue.join()`` ensures this
        # if the process is not interrupted.
        while part_counter.count:
            time.sleep(0.1)
    part_list = []
    while not dest_queue.empty():
        part = dest_queue.get()
        part_list.append(part)
    if len(part_list) != num_uploads:
        raise Exception()
    last_update_tuple = self.last_update.timetuple()
    mod_timestamp = time.mktime(last_update_tuple)
    os.utime(self.dest, (int(mod_timestamp), int(mod_timestamp)))
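
# NOTE: the sketch below only illustrates, under assumptions, how a
# download task could turn the ``(filename, part_number, chunk_size)``
# tuples placed on ``part_queue`` above into an HTTP Range GET and a
# seek-and-write into the shared file handle. The Range arithmetic and
# helper names are illustrative, not the actual ``DownloadPartTask``.
def download_part_sketch(part_queue, dest_queue, f, write_lock):
    filename, part_number, chunk_size = part_queue.get()
    try:
        bucket, key = find_bucket_key(filename.src)
        start = part_number * chunk_size
        # An open-ended range (``bytes=start-``) would also work for the
        # last part; here every part asks for exactly one chunk.
        byte_range = 'bytes=%s-%s' % (start, start + chunk_size - 1)
        params = {'endpoint': filename.endpoint, 'bucket': bucket,
                  'key': key, 'range': byte_range}
        response_data, http = operate(filename.service, 'GetObject', params)
        body = response_data['Body'].read()
        with write_lock:
            # All tasks share one file handle, so the seek and the write
            # must happen atomically with respect to the other parts.
            f.seek(start)
            f.write(body)
        dest_queue.put(part_number)
    finally:
        part_queue.task_done()
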
class Executer(object):
    """
    This class is in charge of all of the threads. It starts up the
    threads and cleans them up when done. The two types of threads the
    ``Executer`` runs are worker threads and a print thread.
    """
    def __init__(self, done, num_threads, timeout, print_queue, quiet,
                 interrupt, max_multi, max_queue_size):
        self.queue = None
        self.done = done
        self.num_threads = num_threads
        self.timeout = timeout
        self.print_queue = print_queue
        self.quiet = quiet
        self.interrupt = interrupt
        self.threads_list = []
        self.max_multi = max_multi
        self.multi_lock = threading.Lock()
        self.multi_counter = MultiCounter()
        self._max_queue_size = max_queue_size

    def start(self):
        self.queue = NoBlockQueue(self.interrupt,
                                  maxsize=self._max_queue_size)
        self.multi_counter.count = 0
        self.print_thread = PrintThread(self.print_queue, self.done,
                                        self.quiet, self.interrupt,
                                        self.timeout)
        self.print_thread.setDaemon(True)
        self.threads_list.append(self.print_thread)
        self.print_thread.start()
        for i in range(self.num_threads):
            worker = Worker(queue=self.queue, done=self.done,
                            timeout=self.timeout,
                            multi_lock=self.multi_lock,
                            multi_counter=self.multi_counter,
                            max_multi=self.max_multi)
            worker.setDaemon(True)
            self.threads_list.append(worker)
            worker.start()

    def submit(self, task):
        """
        This is the function used to submit a task to the ``Executer``.
        """
        self.queue.put(task)

    def wait(self):
        """
        This is the function used to wait on all of the tasks to finish
        in the ``Executer``.
        """
        self.queue.join()

    def join(self):
        """
        This is used to clean up the ``Executer``.
        """
        for thread in self.threads_list:
            thread.join()
class Executer(object):
    """
    This class is in charge of all of the threads. It starts up the
    threads and cleans them up when done. The two types of threads the
    ``Executer`` runs are worker threads and a print thread.
    """
    def __init__(self, done, num_threads, timeout, printQueue, quiet,
                 interrupt, max_multi):
        self.queue = None
        self.done = done
        self.num_threads = num_threads
        self.timeout = timeout
        self.printQueue = printQueue
        self.quiet = quiet
        self.interrupt = interrupt
        self.threads_list = []
        self.max_multi = max_multi
        self.multi_lock = threading.Lock()
        self.multi_counter = MultiCounter()

    def start(self):
        self.queue = NoBlockQueue(self.interrupt)
        self.multi_counter.count = 0
        self.print_thread = PrintThread(self.printQueue, self.done,
                                        self.quiet, self.interrupt,
                                        self.timeout)
        self.print_thread.setDaemon(True)
        self.threads_list.append(self.print_thread)
        self.print_thread.start()
        for i in range(self.num_threads):
            worker = Worker(queue=self.queue, done=self.done,
                            timeout=self.timeout,
                            multi_lock=self.multi_lock,
                            multi_counter=self.multi_counter,
                            max_multi=self.max_multi)
            worker.setDaemon(True)
            self.threads_list.append(worker)
            worker.start()

    def submit(self, task):
        """
        This is the function used to submit a task to the ``Executer``.
        """
        self.queue.put(task)

    def wait(self):
        """
        This is the function used to wait on all of the tasks to finish
        in the ``Executer``.
        """
        self.queue.join()

    def join(self):
        """
        This is used to clean up the ``Executer``.
        """
        for thread in self.threads_list:
            thread.join()
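
# NOTE: ``Worker``, ``PrintThread``, ``NoBlockQueue`` and ``MultiCounter``
# are defined elsewhere. The sketch below only illustrates, under
# assumptions, the shape of the worker loop implied by the ``Executer``
# above: pull a callable task off the queue with a timeout, run it, and
# keep checking the ``done`` event so the thread can exit cleanly. It is
# not the actual Worker class.
class WorkerSketch(threading.Thread):
    def __init__(self, queue, done, timeout):
        threading.Thread.__init__(self)
        self.queue = queue
        self.done = done
        self.timeout = timeout

    def run(self):
        while True:
            try:
                task = self.queue.get(timeout=self.timeout)
            except Queue.Empty:
                # No work right now; exit if the Executer signaled done.
                if self.done.is_set():
                    break
                continue
            try:
                task()
            finally:
                self.queue.task_done()
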
class Executer(object):
    """
    This class is in charge of all of the threads. It starts up the
    threads and cleans them up when done. The two types of threads the
    ``Executer`` runs are worker threads and a print thread.
    """
    def __init__(self, done, num_threads, result_queue, quiet,
                 interrupt, max_queue_size):
        self.queue = None
        self.done = done
        self.num_threads = num_threads
        self.result_queue = result_queue
        self.quiet = quiet
        self.interrupt = interrupt
        self.threads_list = []
        self._max_queue_size = max_queue_size
        self.print_thread = None

    @property
    def num_tasks_failed(self):
        tasks_failed = 0
        if self.print_thread is not None:
            tasks_failed = self.print_thread.num_errors_seen
        return tasks_failed

    def start(self):
        self.print_thread = PrintThread(self.result_queue, self.done,
                                        self.quiet, self.interrupt)
        self.print_thread.daemon = True
        self.queue = NoBlockQueue(self.interrupt,
                                  maxsize=self._max_queue_size)
        self.threads_list.append(self.print_thread)
        self.print_thread.start()
        for i in range(self.num_threads):
            worker = Worker(queue=self.queue, done=self.done)
            worker.setDaemon(True)
            self.threads_list.append(worker)
            worker.start()

    def submit(self, task):
        """
        This is the function used to submit a task to the ``Executer``.
        """
        LOGGER.debug("Submitting task: %s", task)
        self.queue.put(task)

    def wait(self):
        """
        This is the function used to wait on all of the tasks to finish
        in the ``Executer``.
        """
        self.queue.join()

    def join(self):
        """
        This is used to clean up the ``Executer``.
        """
        self.result_queue.put(QUEUE_END_SENTINEL)
        for i in range(self.num_threads):
            self.queue.put(QUEUE_END_SENTINEL)
        for thread in self.threads_list:
            thread.join()
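
# NOTE: ``join()`` above shuts the pool down by enqueuing one
# ``QUEUE_END_SENTINEL`` per worker (plus one for the print thread).
# The sketch below only illustrates, under assumptions, how a worker
# loop could honor that sentinel; it is not the actual Worker class.
def worker_loop_sketch(queue, done):
    while True:
        task = queue.get()
        try:
            if task is QUEUE_END_SENTINEL:
                # Each thread consumes exactly one sentinel and exits,
                # so every worker put in join() unblocks one thread.
                break
            task()
        finally:
            queue.task_done()
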
class S3Handler(object):
    """
    This class sets up the process to perform the tasks sent to it. It
    sources the ``self.executer`` from which threads inside the class
    pull tasks to complete.
    """
    def __init__(self, session, params, multi_threshold=MULTI_THRESHOLD,
                 chunksize=CHUNKSIZE):
        self.session = session
        self.done = threading.Event()
        self.interrupt = threading.Event()
        self.result_queue = NoBlockQueue()
        self.params = {'dryrun': False, 'quiet': False, 'acl': None,
                       'guess_mime_type': True, 'sse': False,
                       'storage_class': None, 'website_redirect': None,
                       'content_type': None, 'cache_control': None,
                       'content_disposition': None,
                       'content_encoding': None,
                       'content_language': None, 'expires': None,
                       'grants': None}
        self.params['region'] = params['region']
        for key in self.params.keys():
            if key in params:
                self.params[key] = params[key]
        self.multi_threshold = multi_threshold
        self.chunksize = chunksize
        self.executer = Executer(
            done=self.done,
            num_threads=NUM_THREADS,
            timeout=QUEUE_TIMEOUT_GET,
            result_queue=self.result_queue,
            quiet=self.params['quiet'],
            interrupt=self.interrupt,
            max_queue_size=MAX_QUEUE_SIZE,
        )
        self._multipart_uploads = []
        self._multipart_downloads = []

    def call(self, files):
        """
        This function pulls a ``FileInfo`` or ``TaskInfo`` object from
        the list ``files``. Each object is checked to determine whether
        it will be a multipart operation, and the necessary attributes
        are added if so. Each object is then wrapped with a
        ``BasicTask`` object, which is essentially a thread of execution
        for a thread to follow. These tasks are then submitted to the
        main executer.
        """
        self.done.clear()
        self.interrupt.clear()
        try:
            self.executer.start()
            total_files, total_parts = self._enqueue_tasks(files)
            self.executer.print_thread.set_total_files(total_files)
            self.executer.print_thread.set_total_parts(total_parts)
            self.executer.wait()
            self.result_queue.join()
        except Exception as e:
            LOGGER.debug('Exception caught during task execution: %s',
                         str(e), exc_info=True)
            self.result_queue.put({'message': str(e), 'error': True})
        except KeyboardInterrupt:
            self.interrupt.set()
            self.result_queue.put({'message': "Cleaning up. Please wait...",
                                   'error': False})
        self._shutdown()
        return self.executer.num_tasks_failed

    def _shutdown(self):
        # self.done will tell threads to shutdown.
        self.done.set()
        # This will wait until all the threads are joined.
        self.executer.join()
        # And finally we need to make a pass through all the existing
        # multipart uploads and abort any pending multipart uploads.
        self._abort_pending_multipart_uploads()
        self._remove_pending_downloads()

    def _abort_pending_multipart_uploads(self):
        # For the purpose of aborting uploads, we consider any
        # upload context with an upload id.
        for upload, filename in self._multipart_uploads:
            if upload.is_cancelled():
                try:
                    upload.wait_for_upload_id()
                except tasks.UploadCancelledError:
                    pass
                else:
                    # This means that the upload went from
                    # STARTED -> CANCELLED. This could happen if a part
                    # thread decided to cancel the upload. We need to
                    # explicitly abort the upload here.
                    self._cancel_upload(upload.wait_for_upload_id(),
                                        filename)
            upload.cancel_upload(self._cancel_upload, args=(filename,))

    def _remove_pending_downloads(self):
        # The downloads case is easier than the uploads case because we
        # don't need to make any service calls. To properly cleanup we
        # just need to go through the multipart downloads that were in
        # progress but cancelled and remove the local file.
        for context, local_filename in self._multipart_downloads:
            if (context.is_cancelled() or context.is_started()) and \
                    os.path.exists(local_filename):
                # The file is in an inconsistent state (not all the parts
                # were written to the file) so we should remove the
                # local file rather than leave it in a bad state. We
                # don't want to remove the files if the download has
                # *not* been started because we haven't touched the file
                # yet, so it's better to leave the old version of the
                # file rather than deleting the file entirely.
                os.remove(local_filename)

    def _cancel_upload(self, upload_id, filename):
        bucket, key = find_bucket_key(filename.dest)
        params = {
            'bucket': bucket, 'key': key,
            'upload_id': upload_id,
            'endpoint': filename.endpoint,
        }
        LOGGER.debug("Aborting multipart upload for: %s", key)
        response_data, http = operate(
            filename.service, 'AbortMultipartUpload', params)

    def _enqueue_tasks(self, files):
        total_files = 0
        total_parts = 0
        for filename in files:
            num_uploads = 1
            is_multipart_task = self._is_multipart_task(filename)
            too_large = False
            if hasattr(filename, 'size'):
                too_large = filename.size > MAX_UPLOAD_SIZE
            if too_large and filename.operation_name == 'upload':
                warning = "Warning %s exceeds 5 TB and upload is " \
                          "being skipped" % relative_path(filename.src)
                self.result_queue.put({'message': warning, 'error': True})
            elif is_multipart_task and not self.params['dryrun']:
                # If we're in dryrun mode, then we don't need the
                # real multipart tasks. We can just use a BasicTask
                # in the else clause below, which will print out the
                # fact that it's transferring a file rather than
                # the specific part tasks required to perform the
                # transfer.
                num_uploads = self._enqueue_multipart_tasks(filename)
            else:
                task = tasks.BasicTask(
                    session=self.session, filename=filename,
                    parameters=self.params,
                    result_queue=self.result_queue)
                self.executer.submit(task)
            total_files += 1
            total_parts += num_uploads
        return total_files, total_parts

    def _is_multipart_task(self, filename):
        # First we need to determine if it's an operation that even
        # qualifies for multipart upload.
        if hasattr(filename, 'size'):
            above_multipart_threshold = filename.size > self.multi_threshold
            if above_multipart_threshold:
                if filename.operation_name in ('upload', 'download',
                                               'move', 'copy'):
                    return True
                else:
                    return False
        else:
            return False

    def _enqueue_multipart_tasks(self, filename):
        num_uploads = 1
        if filename.operation_name == 'upload':
            num_uploads = self._enqueue_multipart_upload_tasks(filename)
        elif filename.operation_name == 'move':
            if filename.src_type == 'local' and filename.dest_type == 's3':
                num_uploads = self._enqueue_multipart_upload_tasks(
                    filename, remove_local_file=True)
            elif filename.src_type == 's3' and \
                    filename.dest_type == 'local':
                num_uploads = self._enqueue_range_download_tasks(
                    filename, remove_remote_file=True)
            elif filename.src_type == 's3' and filename.dest_type == 's3':
                num_uploads = self._enqueue_multipart_copy_tasks(
                    filename, remove_remote_file=True)
            else:
                raise ValueError("Unknown transfer type of %s -> %s" %
                                 (filename.src_type, filename.dest_type))
        elif filename.operation_name == 'copy':
            num_uploads = self._enqueue_multipart_copy_tasks(
                filename, remove_remote_file=False)
        elif filename.operation_name == 'download':
            num_uploads = self._enqueue_range_download_tasks(filename)
        return num_uploads

    def _enqueue_range_download_tasks(self, filename,
                                      remove_remote_file=False):
        chunksize = find_chunksize(filename.size, self.chunksize)
        num_downloads = int(filename.size / chunksize)
        context = tasks.MultipartDownloadContext(num_downloads)
        create_file_task = tasks.CreateLocalFileTask(context=context,
                                                     filename=filename)
        self.executer.submit(create_file_task)
        for i in range(num_downloads):
            task = tasks.DownloadPartTask(
                part_number=i, chunk_size=chunksize,
                result_queue=self.result_queue, service=filename.service,
                filename=filename, context=context)
            self.executer.submit(task)
        complete_file_task = tasks.CompleteDownloadTask(
            context=context, filename=filename,
            result_queue=self.result_queue, params=self.params)
        self.executer.submit(complete_file_task)
        self._multipart_downloads.append((context, filename.dest))
        if remove_remote_file:
            remove_task = tasks.RemoveRemoteObjectTask(
                filename=filename, context=context)
            self.executer.submit(remove_task)
        return num_downloads

    def _enqueue_multipart_upload_tasks(self, filename,
                                        remove_local_file=False):
        # First we need to create a CreateMultipartUpload task,
        # then create UploadTask objects for each of the parts.
        # And finally enqueue a CompleteMultipartUploadTask.
        chunksize = find_chunksize(filename.size, self.chunksize)
        num_uploads = int(math.ceil(filename.size / float(chunksize)))
        upload_context = self._enqueue_upload_start_task(
            chunksize, num_uploads, filename)
        self._enqueue_upload_tasks(
            num_uploads, chunksize, upload_context, filename,
            tasks.UploadPartTask)
        self._enqueue_upload_end_task(filename, upload_context)
        if remove_local_file:
            remove_task = tasks.RemoveFileTask(local_filename=filename.src,
                                               upload_context=upload_context)
            self.executer.submit(remove_task)
        return num_uploads

    def _enqueue_multipart_copy_tasks(self, filename,
                                      remove_remote_file=False):
        chunksize = find_chunksize(filename.size, self.chunksize)
        num_uploads = int(math.ceil(filename.size / float(chunksize)))
        upload_context = self._enqueue_upload_start_task(
            chunksize, num_uploads, filename)
        self._enqueue_upload_tasks(
            num_uploads, chunksize, upload_context, filename,
            tasks.CopyPartTask)
        self._enqueue_upload_end_task(filename, upload_context)
        if remove_remote_file:
            remove_task = tasks.RemoveRemoteObjectTask(
                filename=filename, context=upload_context)
            self.executer.submit(remove_task)
        return num_uploads

    def _enqueue_upload_start_task(self, chunksize, num_uploads, filename):
        upload_context = tasks.MultipartUploadContext(
            expected_parts=num_uploads)
        create_multipart_upload_task = tasks.CreateMultipartUploadTask(
            session=self.session, filename=filename,
            parameters=self.params, result_queue=self.result_queue,
            upload_context=upload_context)
        self.executer.submit(create_multipart_upload_task)
        return upload_context

    def _enqueue_upload_tasks(self, num_uploads, chunksize, upload_context,
                              filename, task_class):
        for i in range(1, (num_uploads + 1)):
            task = task_class(
                part_number=i, chunk_size=chunksize,
                result_queue=self.result_queue,
                upload_context=upload_context, filename=filename)
            self.executer.submit(task)

    def _enqueue_upload_end_task(self, filename, upload_context):
        complete_multipart_upload_task = tasks.CompleteMultipartUploadTask(
            session=self.session, filename=filename,
            parameters=self.params, result_queue=self.result_queue,
            upload_context=upload_context)
        self.executer.submit(complete_multipart_upload_task)
        self._multipart_uploads.append((upload_context, filename))
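
# NOTE: ``find_chunksize`` is defined elsewhere; the sketch below only
# illustrates, under assumptions, the guarantee the code above relies
# on: S3 multipart uploads allow at most 10,000 parts, so the chunk
# size must grow until the part count fits. The constant name and the
# doubling strategy here are illustrative, not the actual helper.
MAX_PARTS_SKETCH = 10000

def find_chunksize_sketch(size, current_chunksize):
    chunksize = current_chunksize
    num_parts = int(math.ceil(size / float(chunksize)))
    while num_parts > MAX_PARTS_SKETCH:
        # Doubling keeps the loop short even for very large objects.
        chunksize *= 2
        num_parts = int(math.ceil(size / float(chunksize)))
    return chunksize
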
class S3Handler(object):
    """
    This class sets up the process to perform the tasks sent to it. It
    sources the ``self.executer`` from which threads inside the class
    pull tasks to complete.
    """
    def __init__(self, session, params, multi_threshold=MULTI_THRESHOLD,
                 chunksize=CHUNKSIZE):
        self.session = session
        self.done = threading.Event()
        self.interrupt = threading.Event()
        self.printQueue = NoBlockQueue()
        self.params = {'dryrun': False, 'quiet': False, 'acl': None}
        self.params['region'] = params['region']
        for key in self.params.keys():
            if key in params:
                self.params[key] = params[key]
        self.multi_threshold = multi_threshold
        self.chunksize = chunksize
        self.executer = Executer(done=self.done,
                                 num_threads=NUM_THREADS,
                                 timeout=QUEUE_TIMEOUT_GET,
                                 printQueue=self.printQueue,
                                 quiet=self.params['quiet'],
                                 interrupt=self.interrupt,
                                 max_multi=NUM_MULTI_THREADS)

    def call(self, files):
        """
        This function pulls a ``FileInfo`` or ``TaskInfo`` object from
        the list ``files``. Each object is checked to determine whether
        it will be a multipart operation, and the necessary attributes
        are added if so. Each object is then wrapped with a
        ``BasicTask`` object, which is essentially a thread of execution
        for a thread to follow. These tasks are then submitted to the
        main executer.
        """
        self.done.clear()
        self.interrupt.clear()
        try:
            self.executer.start()
            tot_files = 0
            tot_parts = 0
            for filename in files:
                num_uploads = 1
                is_larger = False
                chunksize = self.chunksize
                too_large = False
                if hasattr(filename, 'size'):
                    is_larger = filename.size > self.multi_threshold
                    too_large = filename.size > MAX_UPLOAD_SIZE
                if is_larger:
                    if filename.operation == 'upload':
                        num_uploads = int(math.ceil(filename.size /
                                                    float(chunksize)))
                        chunksize = find_chunksize(filename.size, chunksize)
                        filename.set_multi(executer=self.executer,
                                           printQueue=self.printQueue,
                                           interrupt=self.interrupt,
                                           chunksize=chunksize)
                    elif filename.operation == 'download':
                        num_uploads = int(filename.size / chunksize)
                        filename.set_multi(executer=self.executer,
                                           printQueue=self.printQueue,
                                           interrupt=self.interrupt,
                                           chunksize=chunksize)
                task = BasicTask(session=self.session, filename=filename,
                                 executer=self.executer, done=self.done,
                                 parameters=self.params,
                                 multi_threshold=self.multi_threshold,
                                 chunksize=chunksize,
                                 printQueue=self.printQueue,
                                 interrupt=self.interrupt)
                if too_large and filename.operation == 'upload':
                    warning = "Warning %s exceeds 5 TB and upload is " \
                              "being skipped" % os.path.relpath(filename.src)
                    self.printQueue.put({'result': warning})
                else:
                    self.executer.submit(task)
                tot_files += 1
                tot_parts += num_uploads
            self.executer.print_thread.totalFiles = tot_files
            self.executer.print_thread.totalParts = tot_parts
            self.executer.wait()
            self.printQueue.join()
        except Exception as e:
            LOGGER.debug('%s' % str(e))
        except KeyboardInterrupt:
            self.interrupt.set()
            self.printQueue.put({'result': "Cleaning up. Please wait..."})
        self.done.set()
        self.executer.join()
class Executor(object):
    """
    This class is in charge of all of the threads. It starts up the
    threads and cleans them up when done. The two types of threads the
    ``Executor`` runs are worker threads and a print thread.
    """
    def __init__(self, done, num_threads, result_queue, quiet,
                 interrupt, max_queue_size, write_queue):
        self.queue = None
        self.done = done
        self.num_threads = num_threads
        self.result_queue = result_queue
        self.quiet = quiet
        self.interrupt = interrupt
        self.threads_list = []
        self._max_queue_size = max_queue_size
        self.write_queue = write_queue
        self.print_thread = None
        self.io_thread = None

    @property
    def num_tasks_failed(self):
        tasks_failed = 0
        if self.print_thread is not None:
            tasks_failed = self.print_thread.num_errors_seen
        return tasks_failed

    def start(self):
        self.print_thread = PrintThread(self.result_queue, self.done,
                                        self.quiet, self.interrupt)
        self.print_thread.daemon = True
        self.io_thread = IOWriterThread(self.write_queue, self.done)
        self.io_thread.start()
        self.threads_list.append(self.io_thread)
        self.queue = NoBlockQueue(self.interrupt,
                                  maxsize=self._max_queue_size)
        self.threads_list.append(self.print_thread)
        self.print_thread.start()
        for i in range(self.num_threads):
            worker = Worker(queue=self.queue, done=self.done)
            worker.setDaemon(True)
            self.threads_list.append(worker)
            worker.start()

    def submit(self, task):
        """
        This is the function used to submit a task to the ``Executor``.
        """
        LOGGER.debug("Submitting task: %s", task)
        self.queue.put(task)

    def wait(self):
        """
        This is the function used to wait on all of the tasks to finish
        in the ``Executor``.
        """
        self.queue.join()

    def join(self):
        """
        This is used to clean up the ``Executor``.
        """
        self.write_queue.put(QUEUE_END_SENTINEL)
        self.result_queue.put(QUEUE_END_SENTINEL)
        for i in range(self.num_threads):
            self.queue.put(QUEUE_END_SENTINEL)
        for thread in self.threads_list:
            thread.join()
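
# NOTE: ``IOWriterThread`` is defined elsewhere; the sketch below only
# illustrates, under assumptions, the role implied above: a single
# thread that drains ``write_queue`` so all file writes happen from one
# place, and that exits when it sees ``QUEUE_END_SENTINEL``. The
# ``(filename, offset, data)`` item shape is an assumption for
# illustration, not the actual contract.
class IOWriterThreadSketch(threading.Thread):
    def __init__(self, write_queue, done):
        threading.Thread.__init__(self)
        self.write_queue = write_queue
        self.done = done

    def run(self):
        while True:
            item = self.write_queue.get()
            try:
                if item is QUEUE_END_SENTINEL:
                    break
                filename, offset, data = item
                # Serializing writes through one thread avoids having
                # every download worker contend for a shared file lock.
                with open(filename, 'r+b') as f:
                    f.seek(offset)
                    f.write(data)
            finally:
                self.write_queue.task_done()
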
class S3Handler(object):
    """
    This class sets up the process to perform the tasks sent to it. It
    sources the ``self.executor`` from which threads inside the class
    pull tasks to complete.
    """
    MAX_IO_QUEUE_SIZE = 20

    def __init__(self, session, params, multi_threshold=MULTI_THRESHOLD,
                 chunksize=CHUNKSIZE):
        self.session = session
        self.done = threading.Event()
        self.interrupt = threading.Event()
        self.result_queue = NoBlockQueue()
        # The write_queue has potential for optimizations, so the constant
        # for maxsize is scoped to this class (as opposed to constants.py)
        # so we have the ability to change this value later.
        self.write_queue = NoBlockQueue(self.interrupt,
                                        maxsize=self.MAX_IO_QUEUE_SIZE)
        self.params = {'dryrun': False, 'quiet': False, 'acl': None,
                       'guess_mime_type': True, 'sse': False,
                       'storage_class': None, 'website_redirect': None,
                       'content_type': None, 'cache_control': None,
                       'content_disposition': None,
                       'content_encoding': None,
                       'content_language': None, 'expires': None,
                       'grants': None}
        self.params['region'] = params['region']
        for key in self.params.keys():
            if key in params:
                self.params[key] = params[key]
        self.multi_threshold = multi_threshold
        self.chunksize = chunksize
        self.executor = Executor(done=self.done,
                                 num_threads=NUM_THREADS,
                                 result_queue=self.result_queue,
                                 quiet=self.params['quiet'],
                                 interrupt=self.interrupt,
                                 max_queue_size=MAX_QUEUE_SIZE,
                                 write_queue=self.write_queue)
        self._multipart_uploads = []
        self._multipart_downloads = []

    def call(self, files):
        """
        This function pulls a ``FileInfo`` or ``TaskInfo`` object from
        the list ``files``. Each object is checked to determine whether
        it will be a multipart operation, and the necessary attributes
        are added if so. Each object is then wrapped with a
        ``BasicTask`` object, which is essentially a thread of execution
        for a thread to follow. These tasks are then submitted to the
        main executor.
        """
        self.done.clear()
        self.interrupt.clear()
        try:
            self.executor.start()
            total_files, total_parts = self._enqueue_tasks(files)
            self.executor.print_thread.set_total_files(total_files)
            self.executor.print_thread.set_total_parts(total_parts)
            self.executor.wait()
            self.result_queue.join()
        except Exception as e:
            LOGGER.debug('Exception caught during task execution: %s',
                         str(e), exc_info=True)
            self.result_queue.put({'message': str(e), 'error': True})
        except KeyboardInterrupt:
            self.interrupt.set()
            self.result_queue.put({'message': "Cleaning up. Please wait...",
                                   'error': False})
        self._shutdown()
        return self.executor.num_tasks_failed

    def _shutdown(self):
        # self.done will tell threads to shutdown.
        self.done.set()
        # This will wait until all the threads are joined.
        self.executor.join()
        # And finally we need to make a pass through all the existing
        # multipart uploads and abort any pending multipart uploads.
        self._abort_pending_multipart_uploads()
        self._remove_pending_downloads()

    def _abort_pending_multipart_uploads(self):
        # For the purpose of aborting uploads, we consider any
        # upload context with an upload id.
        for upload, filename in self._multipart_uploads:
            if upload.is_cancelled():
                try:
                    upload.wait_for_upload_id()
                except tasks.UploadCancelledError:
                    pass
                else:
                    # This means that the upload went from
                    # STARTED -> CANCELLED. This could happen if a part
                    # thread decided to cancel the upload. We need to
                    # explicitly abort the upload here.
                    self._cancel_upload(upload.wait_for_upload_id(),
                                        filename)
            upload.cancel_upload(self._cancel_upload, args=(filename,))

    def _remove_pending_downloads(self):
        # The downloads case is easier than the uploads case because we
        # don't need to make any service calls. To properly cleanup we
        # just need to go through the multipart downloads that were in
        # progress but cancelled and remove the local file.
        for context, local_filename in self._multipart_downloads:
            if (context.is_cancelled() or context.is_started()) and \
                    os.path.exists(local_filename):
                # The file is in an inconsistent state (not all the parts
                # were written to the file) so we should remove the
                # local file rather than leave it in a bad state. We
                # don't want to remove the files if the download has
                # *not* been started because we haven't touched the file
                # yet, so it's better to leave the old version of the
                # file rather than deleting the file entirely.
                os.remove(local_filename)

    def _cancel_upload(self, upload_id, filename):
        bucket, key = find_bucket_key(filename.dest)
        params = {
            'bucket': bucket, 'key': key,
            'upload_id': upload_id,
            'endpoint': filename.endpoint,
        }
        LOGGER.debug("Aborting multipart upload for: %s", key)
        response_data, http = operate(filename.service,
                                      'AbortMultipartUpload', params)

    def _enqueue_tasks(self, files):
        total_files = 0
        total_parts = 0
        for filename in files:
            num_uploads = 1
            is_multipart_task = self._is_multipart_task(filename)
            too_large = False
            if hasattr(filename, 'size'):
                too_large = filename.size > MAX_UPLOAD_SIZE
            if too_large and filename.operation_name == 'upload':
                warning = "Warning %s exceeds 5 TB and upload is " \
                          "being skipped" % relative_path(filename.src)
                self.result_queue.put({'message': warning, 'error': True})
            elif is_multipart_task and not self.params['dryrun']:
                # If we're in dryrun mode, then we don't need the
                # real multipart tasks. We can just use a BasicTask
                # in the else clause below, which will print out the
                # fact that it's transferring a file rather than
                # the specific part tasks required to perform the
                # transfer.
                num_uploads = self._enqueue_multipart_tasks(filename)
            else:
                task = tasks.BasicTask(session=self.session,
                                       filename=filename,
                                       parameters=self.params,
                                       result_queue=self.result_queue)
                self.executor.submit(task)
            total_files += 1
            total_parts += num_uploads
        return total_files, total_parts

    def _is_multipart_task(self, filename):
        # First we need to determine if it's an operation that even
        # qualifies for multipart upload.
        if hasattr(filename, 'size'):
            above_multipart_threshold = filename.size > self.multi_threshold
            if above_multipart_threshold:
                if filename.operation_name in ('upload', 'download',
                                               'move', 'copy'):
                    return True
                else:
                    return False
        else:
            return False

    def _enqueue_multipart_tasks(self, filename):
        num_uploads = 1
        if filename.operation_name == 'upload':
            num_uploads = self._enqueue_multipart_upload_tasks(filename)
        elif filename.operation_name == 'move':
            if filename.src_type == 'local' and filename.dest_type == 's3':
                num_uploads = self._enqueue_multipart_upload_tasks(
                    filename, remove_local_file=True)
            elif filename.src_type == 's3' and \
                    filename.dest_type == 'local':
                num_uploads = self._enqueue_range_download_tasks(
                    filename, remove_remote_file=True)
            elif filename.src_type == 's3' and filename.dest_type == 's3':
                num_uploads = self._enqueue_multipart_copy_tasks(
                    filename, remove_remote_file=True)
            else:
                raise ValueError("Unknown transfer type of %s -> %s" %
                                 (filename.src_type, filename.dest_type))
        elif filename.operation_name == 'copy':
            num_uploads = self._enqueue_multipart_copy_tasks(
                filename, remove_remote_file=False)
        elif filename.operation_name == 'download':
            num_uploads = self._enqueue_range_download_tasks(filename)
        return num_uploads

    def _enqueue_range_download_tasks(self, filename,
                                      remove_remote_file=False):
        chunksize = find_chunksize(filename.size, self.chunksize)
        num_downloads = int(filename.size / chunksize)
        context = tasks.MultipartDownloadContext(num_downloads)
        create_file_task = tasks.CreateLocalFileTask(context=context,
                                                     filename=filename)
        self.executor.submit(create_file_task)
        for i in range(num_downloads):
            task = tasks.DownloadPartTask(part_number=i,
                                          chunk_size=chunksize,
                                          result_queue=self.result_queue,
                                          service=filename.service,
                                          filename=filename,
                                          context=context,
                                          io_queue=self.write_queue)
            self.executor.submit(task)
        complete_file_task = tasks.CompleteDownloadTask(
            context=context, filename=filename,
            result_queue=self.result_queue, params=self.params,
            io_queue=self.write_queue)
        self.executor.submit(complete_file_task)
        self._multipart_downloads.append((context, filename.dest))
        if remove_remote_file:
            remove_task = tasks.RemoveRemoteObjectTask(filename=filename,
                                                       context=context)
            self.executor.submit(remove_task)
        return num_downloads

    def _enqueue_multipart_upload_tasks(self, filename,
                                        remove_local_file=False):
        # First we need to create a CreateMultipartUpload task,
        # then create UploadTask objects for each of the parts.
        # And finally enqueue a CompleteMultipartUploadTask.
        chunksize = find_chunksize(filename.size, self.chunksize)
        num_uploads = int(math.ceil(filename.size / float(chunksize)))
        upload_context = self._enqueue_upload_start_task(
            chunksize, num_uploads, filename)
        self._enqueue_upload_tasks(num_uploads, chunksize, upload_context,
                                   filename, tasks.UploadPartTask)
        self._enqueue_upload_end_task(filename, upload_context)
        if remove_local_file:
            remove_task = tasks.RemoveFileTask(local_filename=filename.src,
                                               upload_context=upload_context)
            self.executor.submit(remove_task)
        return num_uploads

    def _enqueue_multipart_copy_tasks(self, filename,
                                      remove_remote_file=False):
        chunksize = find_chunksize(filename.size, self.chunksize)
        num_uploads = int(math.ceil(filename.size / float(chunksize)))
        upload_context = self._enqueue_upload_start_task(
            chunksize, num_uploads, filename)
        self._enqueue_upload_tasks(num_uploads, chunksize, upload_context,
                                   filename, tasks.CopyPartTask)
        self._enqueue_upload_end_task(filename, upload_context)
        if remove_remote_file:
            remove_task = tasks.RemoveRemoteObjectTask(
                filename=filename, context=upload_context)
            self.executor.submit(remove_task)
        return num_uploads

    def _enqueue_upload_start_task(self, chunksize, num_uploads, filename):
        upload_context = tasks.MultipartUploadContext(
            expected_parts=num_uploads)
        create_multipart_upload_task = tasks.CreateMultipartUploadTask(
            session=self.session, filename=filename,
            parameters=self.params, result_queue=self.result_queue,
            upload_context=upload_context)
        self.executor.submit(create_multipart_upload_task)
        return upload_context

    def _enqueue_upload_tasks(self, num_uploads, chunksize, upload_context,
                              filename, task_class):
        for i in range(1, (num_uploads + 1)):
            task = task_class(part_number=i, chunk_size=chunksize,
                              result_queue=self.result_queue,
                              upload_context=upload_context,
                              filename=filename)
            self.executor.submit(task)

    def _enqueue_upload_end_task(self, filename, upload_context):
        complete_multipart_upload_task = tasks.CompleteMultipartUploadTask(
            session=self.session, filename=filename,
            parameters=self.params, result_queue=self.result_queue,
            upload_context=upload_context)
        self.executor.submit(complete_multipart_upload_task)
        self._multipart_uploads.append((upload_context, filename))
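
# NOTE: ``MultipartUploadContext`` lives in the ``tasks`` module; the
# sketch below only illustrates, under assumptions, the coordination it
# must provide for the code above: part tasks block until the
# CreateMultipartUploadTask announces an upload id, and the complete
# task needs to know when every expected part has finished. The method
# names, attributes, and bodies are illustrative, not the real class.
class MultipartUploadContextSketch(object):
    def __init__(self, expected_parts):
        self._expected_parts = expected_parts
        self._finished_parts = 0
        self._upload_id = None
        self._cancelled = False
        self._condition = threading.Condition()

    def announce_upload_id(self, upload_id):
        with self._condition:
            self._upload_id = upload_id
            self._condition.notify_all()

    def wait_for_upload_id(self):
        with self._condition:
            while self._upload_id is None and not self._cancelled:
                self._condition.wait()
            if self._cancelled:
                raise Exception("Upload has been cancelled")
            return self._upload_id

    def announce_finished_part(self):
        with self._condition:
            self._finished_parts += 1
            self._condition.notify_all()

    def wait_for_parts_to_finish(self):
        with self._condition:
            while self._finished_parts < self._expected_parts:
                self._condition.wait()

    def cancel(self):
        with self._condition:
            self._cancelled = True
            self._condition.notify_all()

    def is_cancelled(self):
        with self._condition:
            return self._cancelled
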
class S3Handler(object):
    """
    This class sets up the process to perform the tasks sent to it. It
    sources the ``self.executer`` from which threads inside the class
    pull tasks to complete.
    """
    def __init__(self, session, params, multi_threshold=MULTI_THRESHOLD,
                 chunksize=CHUNKSIZE):
        self.session = session
        self.done = threading.Event()
        self.interrupt = threading.Event()
        self.print_queue = NoBlockQueue()
        self.params = {'dryrun': False, 'quiet': False, 'acl': None,
                       'guess_mime_type': True, 'sse': False,
                       'storage_class': None, 'website_redirect': None,
                       'content_type': None, 'cache_control': None,
                       'content_disposition': None,
                       'content_encoding': None,
                       'content_language': None, 'expires': None,
                       'grants': None}
        self.params['region'] = params['region']
        for key in self.params.keys():
            if key in params:
                self.params[key] = params[key]
        self.multi_threshold = multi_threshold
        self.chunksize = chunksize
        self.executer = Executer(
            done=self.done,
            num_threads=NUM_THREADS,
            timeout=QUEUE_TIMEOUT_GET,
            print_queue=self.print_queue,
            quiet=self.params['quiet'],
            interrupt=self.interrupt,
            max_multi=NUM_MULTI_THREADS,
            max_queue_size=MAX_QUEUE_SIZE,
        )
        self._multipart_uploads = []

    def call(self, files):
        """
        This function pulls a ``FileInfo`` or ``TaskInfo`` object from
        the list ``files``. Each object is checked to determine whether
        it will be a multipart operation, and the necessary attributes
        are added if so. Each object is then wrapped with a
        ``BasicTask`` object, which is essentially a thread of execution
        for a thread to follow. These tasks are then submitted to the
        main executer.
        """
        self.done.clear()
        self.interrupt.clear()
        try:
            self.executer.start()
            total_files, total_parts = self._enqueue_tasks(files)
            self.executer.print_thread.set_total_files(total_files)
            self.executer.print_thread.set_total_parts(total_parts)
            self.executer.wait()
            self.print_queue.join()
        except Exception as e:
            LOGGER.debug('Exception caught during task execution: %s',
                         str(e), exc_info=True)
        except KeyboardInterrupt:
            self.interrupt.set()
            self.print_queue.put({'result': "Cleaning up. Please wait..."})
        self._shutdown()

    def _shutdown(self):
        # self.done will tell threads to shutdown.
        self.done.set()
        # This will wait until all the threads are joined.
        self.executer.join()
        # And finally we need to make a pass through all the existing
        # multipart uploads and abort any pending multipart uploads.
        self._abort_pending_multipart_uploads()

    def _abort_pending_multipart_uploads(self):
        # For the purpose of aborting uploads, we consider any
        # upload context with an upload id.
        for upload, filename in self._multipart_uploads:
            if upload.is_cancelled():
                try:
                    upload_id = upload.wait_for_upload_id()
                except tasks.UploadCancelledError:
                    pass
                else:
                    # This means that the upload went from
                    # STARTED -> CANCELLED. This could happen if a part
                    # thread decided to cancel the upload. We need to
                    # explicitly abort the upload here.
                    self._cancel_upload(upload.wait_for_upload_id(),
                                        filename)
            upload.cancel_upload(self._cancel_upload, args=(filename,))

    def _cancel_upload(self, upload_id, filename):
        bucket, key = find_bucket_key(filename.dest)
        params = {
            'bucket': bucket, 'key': key,
            'upload_id': upload_id,
            'endpoint': filename.endpoint,
        }
        LOGGER.debug("Aborting multipart upload for: %s", key)
        response_data, http = operate(
            filename.service, 'AbortMultipartUpload', params)

    def _enqueue_tasks(self, files):
        total_files = 0
        total_parts = 0
        for filename in files:
            filename.set_session(self.session, self.params['region'])
            num_uploads = 1
            is_multipart_task = False
            too_large = False
            if hasattr(filename, 'size'):
                is_multipart_task = (
                    filename.size > self.multi_threshold and
                    filename.operation == 'upload')
                too_large = filename.size > MAX_UPLOAD_SIZE
            if too_large and filename.operation == 'upload':
                warning = "Warning %s exceeds 5 TB and upload is " \
                          "being skipped" % os.path.relpath(filename.src)
                self.print_queue.put({'result': warning})
            elif is_multipart_task:
                num_uploads = self._enqueue_multipart_tasks(filename)
            else:
                task = tasks.BasicTask(
                    session=self.session, filename=filename,
                    parameters=self.params,
                    print_queue=self.print_queue)
                self.executer.submit(task)
            total_files += 1
            total_parts += num_uploads
        return total_files, total_parts

    def _enqueue_multipart_tasks(self, filename):
        num_uploads = 1
        chunksize = self.chunksize
        if filename.operation == 'upload':
            num_uploads = self._enqueue_multipart_upload_tasks(filename)
        elif filename.operation == 'download':
            num_uploads = int(filename.size / chunksize)
            filename.set_multi(executer=self.executer,
                               print_queue=self.print_queue,
                               interrupt=self.interrupt,
                               chunksize=chunksize)
        return num_uploads

    def _enqueue_multipart_upload_tasks(self, filename):
        # First we need to create a CreateMultipartUpload task,
        # then create UploadTask objects for each of the parts.
        # And finally enqueue a CompleteMultipartUploadTask.
        chunksize = find_chunksize(filename.size, self.chunksize)
        num_uploads = int(math.ceil(filename.size / float(chunksize)))
        upload_context = tasks.MultipartUploadContext(
            expected_parts=num_uploads)
        create_multipart_upload_task = tasks.CreateMultipartUploadTask(
            session=self.session, filename=filename,
            parameters=self.params, print_queue=self.print_queue,
            upload_context=upload_context)
        self.executer.submit(create_multipart_upload_task)
        for i in range(1, (num_uploads + 1)):
            task = tasks.UploadPartTask(
                part_number=i, chunk_size=chunksize,
                print_queue=self.print_queue,
                upload_context=upload_context, filename=filename)
            self.executer.submit(task)
        complete_multipart_upload_task = tasks.CompleteMultipartUploadTask(
            session=self.session, filename=filename,
            parameters=self.params, print_queue=self.print_queue,
            upload_context=upload_context)
        self.executer.submit(complete_multipart_upload_task)
        self._multipart_uploads.append((upload_context, filename))
        return num_uploads
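
# NOTE: a minimal usage sketch, under assumptions. The ``FileInfo``
# objects and the session come from the surrounding package; the region
# and parameter values below are hypothetical and only show the
# intended calling pattern: build a handler with the CLI parameters,
# then hand it the list of files to transfer.
def transfer_files_sketch(session, fileinfos):
    params = {'region': 'us-east-1', 'quiet': False, 'dryrun': False}
    handler = S3Handler(session, params)
    # ``fileinfos`` would be FileInfo/TaskInfo objects describing, e.g.,
    # local-path -> s3://bucket/key uploads built by the command layer.
    handler.call(fileinfos)
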