class BaseSubmissionTaskTest(BaseTaskTest):
    def setUp(self):
        super(BaseSubmissionTaskTest, self).setUp()
        self.config = TransferConfig()
        self.osutil = OSUtils()
        self.executor = BoundedExecutor(
            1000, 1, {
                IN_MEMORY_UPLOAD_TAG: TaskSemaphore(10),
                IN_MEMORY_DOWNLOAD_TAG: SlidingWindowSemaphore(10)
            })

    def tearDown(self):
        super(BaseSubmissionTaskTest, self).tearDown()
        self.executor.shutdown()
示例#2
0
    def test_association_and_disassociation_on_submit(self):
        self.transfer_coordinator = RecordingTransferCoordinator()

        # Submit a callable to the transfer coordinator.
        executor = BoundedExecutor(1, 1)
        task = ReturnFooTask(self.transfer_coordinator)
        future = self.transfer_coordinator.submit(executor, task)
        executor.shutdown()

        # Make sure the future that got submitted was associated to the
        # transfer future at some point.
        self.assertEqual(
            self.transfer_coordinator.all_transfer_futures_ever_associated,
            set([future]))

        # Make sure the future got disassociated once the future is now done
        # by looking at the currently associated futures.
        self.assertEqual(self.transfer_coordinator.associated_futures, set([]))
示例#3
0
    def test_submit_writes_from_internal_queue(self):
        class FakeQueue(object):
            def request_writes(self, offset, data):
                return [
                    {
                        'offset': 0,
                        'data': 'foo'
                    },
                    {
                        'offset': 3,
                        'data': 'bar'
                    },
                ]

        q = FakeQueue()
        io_executor = BoundedExecutor(1000, 1)
        manager = DownloadNonSeekableOutputManager(self.osutil,
                                                   self.transfer_coordinator,
                                                   io_executor=io_executor,
                                                   defer_queue=q)
        fileobj = WriteCollector()
        manager.queue_file_io_task(fileobj=fileobj, data='foo', offset=1)
        io_executor.shutdown()
        self.assertEqual(fileobj.writes, [(0, 'foo'), (3, 'bar')])
示例#4
0
class TransferManager(object):
    ALLOWED_DOWNLOAD_ARGS = ALLOWED_DOWNLOAD_ARGS

    ALLOWED_UPLOAD_ARGS = [
        'ACL',
        'CacheControl',
        'ContentDisposition',
        'ContentEncoding',
        'ContentLanguage',
        'ContentType',
        'Expires',
        'GrantFullControl',
        'GrantRead',
        'GrantReadACP',
        'GrantWriteACP',
        'Metadata',
        'RequestPayer',
        'ServerSideEncryption',
        'StorageClass',
        'SSECustomerAlgorithm',
        'SSECustomerKey',
        'SSECustomerKeyMD5',
        'SSEKMSKeyId',
        'WebsiteRedirectLocation',
        'RetentionExpirationDate',
        'RetentionLegalHoldId',
        'RetentionPeriod',
    ]

    ALLOWED_COPY_ARGS = ALLOWED_UPLOAD_ARGS + [
        'CopySourceIfMatch', 'CopySourceIfModifiedSince',
        'CopySourceIfNoneMatch', 'CopySourceIfUnmodifiedSince',
        'CopySourceSSECustomerAlgorithm', 'CopySourceSSECustomerKey',
        'CopySourceSSECustomerKeyMD5', 'MetadataDirective'
    ]

    ALLOWED_DELETE_ARGS = [
        'MFA',
        'VersionId',
        'RequestPayer',
    ]

    def __init__(self, client, config=None, osutil=None, executor_cls=None):
        """A transfer manager interface for Amazon S3

        :param client: Client to be used by the manager
        :param config: TransferConfig to associate specific configurations
        :param osutil: OSUtils object to use for os-related behavior when
            using with transfer manager.

        :type executor_cls: ibm_s3transfer.futures.BaseExecutor
        :param executor_cls: The class of executor to use with the transfer
            manager. By default, concurrent.futures.ThreadPoolExecutor is used.
        """
        self._client = client
        self._config = config
        if config is None:
            self._config = TransferConfig()
        self._osutil = osutil
        if osutil is None:
            self._osutil = OSUtils()
        self._coordinator_controller = TransferCoordinatorController()
        # A counter to create unique id's for each transfer submitted.
        self._id_counter = 0

        # The executor responsible for making S3 API transfer requests
        self._request_executor = BoundedExecutor(
            max_size=self._config.max_request_queue_size,
            max_num_threads=self._config.max_request_concurrency,
            tag_semaphores={
                IN_MEMORY_UPLOAD_TAG:
                TaskSemaphore(self._config.max_in_memory_upload_chunks),
                IN_MEMORY_DOWNLOAD_TAG:
                SlidingWindowSemaphore(
                    self._config.max_in_memory_download_chunks)
            },
            executor_cls=executor_cls)

        # The executor responsible for submitting the necessary tasks to
        # perform the desired transfer
        self._submission_executor = BoundedExecutor(
            max_size=self._config.max_submission_queue_size,
            max_num_threads=self._config.max_submission_concurrency,
            executor_cls=executor_cls)

        # There is one thread available for writing to disk. It will handle
        # downloads for all files.
        self._io_executor = BoundedExecutor(
            max_size=self._config.max_io_queue_size,
            max_num_threads=1,
            executor_cls=executor_cls)

        # The component responsible for limiting bandwidth usage if it
        # is configured.
        self._bandwidth_limiter = None
        if self._config.max_bandwidth is not None:
            logger.debug('Setting max_bandwidth to %s',
                         self._config.max_bandwidth)
            leaky_bucket = LeakyBucket(self._config.max_bandwidth)
            self._bandwidth_limiter = BandwidthLimiter(leaky_bucket)

        self._register_handlers()

    def upload(self, fileobj, bucket, key, extra_args=None, subscribers=None):
        """Uploads a file to S3

        :type fileobj: str or seekable file-like object
        :param fileobj: The name of a file to upload or a seekable file-like
            object to upload. It is recommended to use a filename because
            file-like objects may result in higher memory usage.

        :type bucket: str
        :param bucket: The name of the bucket to upload to

        :type key: str
        :param key: The name of the key to upload to

        :type extra_args: dict
        :param extra_args: Extra arguments that may be passed to the
            client operation

        :type subscribers: list(ibm_s3transfer.subscribers.BaseSubscriber)
        :param subscribers: The list of subscribers to be invoked in the
            order provided based on the event emit during the process of
            the transfer request.

        :rtype: ibm_s3transfer.futures.TransferFuture
        :returns: Transfer future representing the upload
        """
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = []
        self._validate_all_known_args(extra_args, self.ALLOWED_UPLOAD_ARGS)
        call_args = CallArgs(fileobj=fileobj,
                             bucket=bucket,
                             key=key,
                             extra_args=extra_args,
                             subscribers=subscribers)
        extra_main_kwargs = {}
        if self._bandwidth_limiter:
            extra_main_kwargs['bandwidth_limiter'] = self._bandwidth_limiter
        return self._submit_transfer(call_args, UploadSubmissionTask,
                                     extra_main_kwargs)

    def download(self,
                 bucket,
                 key,
                 fileobj,
                 extra_args=None,
                 subscribers=None):
        """Downloads a file from S3

        :type bucket: str
        :param bucket: The name of the bucket to download from

        :type key: str
        :param key: The name of the key to download from

        :type fileobj: str or seekable file-like object
        :param fileobj: The name of a file to download or a seekable file-like
            object to download. It is recommended to use a filename because
            file-like objects may result in higher memory usage.

        :type extra_args: dict
        :param extra_args: Extra arguments that may be passed to the
            client operation

        :type subscribers: list(ibm_s3transfer.subscribers.BaseSubscriber)
        :param subscribers: The list of subscribers to be invoked in the
            order provided based on the event emit during the process of
            the transfer request.

        :rtype: ibm_s3transfer.futures.TransferFuture
        :returns: Transfer future representing the download
        """
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = []
        self._validate_all_known_args(extra_args, self.ALLOWED_DOWNLOAD_ARGS)
        call_args = CallArgs(bucket=bucket,
                             key=key,
                             fileobj=fileobj,
                             extra_args=extra_args,
                             subscribers=subscribers)
        extra_main_kwargs = {'io_executor': self._io_executor}
        if self._bandwidth_limiter:
            extra_main_kwargs['bandwidth_limiter'] = self._bandwidth_limiter
        return self._submit_transfer(call_args, DownloadSubmissionTask,
                                     extra_main_kwargs)

    def copy(self,
             copy_source,
             bucket,
             key,
             extra_args=None,
             subscribers=None,
             source_client=None):
        """Copies a file in S3

        :type copy_source: dict
        :param copy_source: The name of the source bucket, key name of the
            source object, and optional version ID of the source object. The
            dictionary format is:
            ``{'Bucket': 'bucket', 'Key': 'key', 'VersionId': 'id'}``. Note
            that the ``VersionId`` key is optional and may be omitted.

        :type bucket: str
        :param bucket: The name of the bucket to copy to

        :type key: str
        :param key: The name of the key to copy to

        :type extra_args: dict
        :param extra_args: Extra arguments that may be passed to the
            client operation

        :type subscribers: a list of subscribers
        :param subscribers: The list of subscribers to be invoked in the
            order provided based on the event emit during the process of
            the transfer request.

        :type source_client: ibm_botocore or ibm_boto3 Client
        :param source_client: The client to be used for operation that
            may happen at the source object. For example, this client is
            used for the head_object that determines the size of the copy.
            If no client is provided, the transfer manager's client is used
            as the client for the source object.

        :rtype: ibm_s3transfer.futures.TransferFuture
        :returns: Transfer future representing the copy
        """
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = []
        if source_client is None:
            source_client = self._client
        self._validate_all_known_args(extra_args, self.ALLOWED_COPY_ARGS)
        call_args = CallArgs(copy_source=copy_source,
                             bucket=bucket,
                             key=key,
                             extra_args=extra_args,
                             subscribers=subscribers,
                             source_client=source_client)
        return self._submit_transfer(call_args, CopySubmissionTask)

    def delete(self, bucket, key, extra_args=None, subscribers=None):
        """Delete an S3 object.

        :type bucket: str
        :param bucket: The name of the bucket.

        :type key: str
        :param key: The name of the S3 object to delete.

        :type extra_args: dict
        :param extra_args: Extra arguments that may be passed to the
            DeleteObject call.

        :type subscribers: list
        :param subscribers: A list of subscribers to be invoked during the
            process of the transfer request.  Note that the ``on_progress``
            callback is not invoked during object deletion.

        :rtype: ibm_s3transfer.futures.TransferFuture
        :return: Transfer future representing the deletion.

        """
        if extra_args is None:
            extra_args = {}
        if subscribers is None:
            subscribers = []
        self._validate_all_known_args(extra_args, self.ALLOWED_DELETE_ARGS)
        call_args = CallArgs(bucket=bucket,
                             key=key,
                             extra_args=extra_args,
                             subscribers=subscribers)
        return self._submit_transfer(call_args, DeleteSubmissionTask)

    def _validate_all_known_args(self, actual, allowed):
        for kwarg in actual:
            if kwarg not in allowed:
                raise ValueError("Invalid extra_args key '%s', "
                                 "must be one of: %s" %
                                 (kwarg, ', '.join(allowed)))

    def _submit_transfer(self,
                         call_args,
                         submission_task_cls,
                         extra_main_kwargs=None):
        if not extra_main_kwargs:
            extra_main_kwargs = {}

        # Create a TransferFuture to return back to the user
        transfer_future, components = self._get_future_with_components(
            call_args)

        # Add any provided done callbacks to the created transfer future
        # to be invoked on the transfer future being complete.
        for callback in get_callbacks(transfer_future, 'done'):
            components['coordinator'].add_done_callback(callback)

        # Get the main kwargs needed to instantiate the submission task
        main_kwargs = self._get_submission_task_main_kwargs(
            transfer_future, extra_main_kwargs)

        # Submit a SubmissionTask that will submit all of the necessary
        # tasks needed to complete the S3 transfer.
        self._submission_executor.submit(
            submission_task_cls(transfer_coordinator=components['coordinator'],
                                main_kwargs=main_kwargs))

        # Increment the unique id counter for future transfer requests
        self._id_counter += 1

        return transfer_future

    def _get_future_with_components(self, call_args):
        transfer_id = self._id_counter
        # Creates a new transfer future along with its components
        transfer_coordinator = TransferCoordinator(transfer_id=transfer_id)
        # Track the transfer coordinator for transfers to manage.
        self._coordinator_controller.add_transfer_coordinator(
            transfer_coordinator)
        # Also make sure that the transfer coordinator is removed once
        # the transfer completes so it does not stick around in memory.
        transfer_coordinator.add_done_callback(
            self._coordinator_controller.remove_transfer_coordinator,
            transfer_coordinator)
        components = {
            'meta': TransferMeta(call_args, transfer_id=transfer_id),
            'coordinator': transfer_coordinator
        }
        transfer_future = TransferFuture(**components)
        return transfer_future, components

    def _get_submission_task_main_kwargs(self, transfer_future,
                                         extra_main_kwargs):
        main_kwargs = {
            'client': self._client,
            'config': self._config,
            'osutil': self._osutil,
            'request_executor': self._request_executor,
            'transfer_future': transfer_future
        }
        main_kwargs.update(extra_main_kwargs)
        return main_kwargs

    def _register_handlers(self):
        # Register handlers to enable/disable callbacks on uploads.
        event_name = 'request-created.s3'
        self._client.meta.events.register_first(
            event_name,
            signal_not_transferring,
            unique_id='s3upload-not-transferring')
        self._client.meta.events.register_last(
            event_name, signal_transferring, unique_id='s3upload-transferring')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, *args):
        cancel = False
        cancel_msg = ''
        cancel_exc_type = FatalError
        # If a exception was raised in the context handler, signal to cancel
        # all of the inprogress futures in the shutdown.
        if exc_type:
            cancel = True
            cancel_msg = six.text_type(exc_value)
            if not cancel_msg:
                cancel_msg = repr(exc_value)
            # If it was a KeyboardInterrupt, the cancellation was initiated
            # by the user.
            if isinstance(exc_value, KeyboardInterrupt):
                cancel_exc_type = CancelledError
        self._shutdown(cancel, cancel_msg, cancel_exc_type)

    def shutdown(self, cancel=False, cancel_msg=''):
        """Shutdown the TransferManager

        It will wait till all transfers complete before it completely shuts
        down.

        :type cancel: boolean
        :param cancel: If True, calls TransferFuture.cancel() for
            all in-progress in transfers. This is useful if you want the
            shutdown to happen quicker.

        :type cancel_msg: str
        :param cancel_msg: The message to specify if canceling all in-progress
            transfers.
        """
        self._shutdown(cancel, cancel, cancel_msg)

    def _shutdown(self, cancel, cancel_msg, exc_type=CancelledError):
        if cancel:
            # Cancel all in-flight transfers if requested, before waiting
            # for them to complete.
            self._coordinator_controller.cancel(cancel_msg, exc_type)
        try:
            # Wait until there are no more in-progress transfers. This is
            # wrapped in a try statement because this can be interrupted
            # with a KeyboardInterrupt that needs to be caught.
            self._coordinator_controller.wait()
        except KeyboardInterrupt:
            # If not errors were raised in the try block, the cancel should
            # have no coordinators it needs to run cancel on. If there was
            # an error raised in the try statement we want to cancel all of
            # the inflight transfers before shutting down to speed that
            # process up.
            self._coordinator_controller.cancel('KeyboardInterrupt()')
            raise
        finally:
            # Shutdown all of the executors.
            self._submission_executor.shutdown()
            self._request_executor.shutdown()
            self._io_executor.shutdown()
示例#5
0
class TestGetObjectTask(BaseTaskTest):
    def setUp(self):
        super(TestGetObjectTask, self).setUp()
        self.bucket = 'mybucket'
        self.key = 'mykey'
        self.extra_args = {}
        self.callbacks = []
        self.max_attempts = 5
        self.io_executor = BoundedExecutor(1000, 1)
        self.content = b'my content'
        self.stream = six.BytesIO(self.content)
        self.fileobj = WriteCollector()
        self.osutil = OSUtils()
        self.io_chunksize = 64 * (1024**2)
        self.task_cls = GetObjectTask
        self.download_output_manager = DownloadSeekableOutputManager(
            self.osutil, self.transfer_coordinator, self.io_executor)

    def get_download_task(self, **kwargs):
        default_kwargs = {
            'client': self.client,
            'bucket': self.bucket,
            'key': self.key,
            'fileobj': self.fileobj,
            'extra_args': self.extra_args,
            'callbacks': self.callbacks,
            'max_attempts': self.max_attempts,
            'download_output_manager': self.download_output_manager,
            'io_chunksize': self.io_chunksize,
        }
        default_kwargs.update(kwargs)
        self.transfer_coordinator.set_status_to_queued()
        return self.get_task(self.task_cls, main_kwargs=default_kwargs)

    def assert_io_writes(self, expected_writes):
        # Let the io executor process all of the writes before checking
        # what writes were sent to it.
        self.io_executor.shutdown()
        self.assertEqual(self.fileobj.writes, expected_writes)

    def test_main(self):
        self.stubber.add_response('get_object',
                                  service_response={'Body': self.stream},
                                  expected_params={
                                      'Bucket': self.bucket,
                                      'Key': self.key
                                  })
        task = self.get_download_task()
        task()

        self.stubber.assert_no_pending_responses()
        self.assert_io_writes([(0, self.content)])

    def test_extra_args(self):
        self.stubber.add_response('get_object',
                                  service_response={'Body': self.stream},
                                  expected_params={
                                      'Bucket': self.bucket,
                                      'Key': self.key,
                                      'Range': 'bytes=0-'
                                  })
        self.extra_args['Range'] = 'bytes=0-'
        task = self.get_download_task()
        task()

        self.stubber.assert_no_pending_responses()
        self.assert_io_writes([(0, self.content)])

    def test_control_chunk_size(self):
        self.stubber.add_response('get_object',
                                  service_response={'Body': self.stream},
                                  expected_params={
                                      'Bucket': self.bucket,
                                      'Key': self.key
                                  })
        task = self.get_download_task(io_chunksize=1)
        task()

        self.stubber.assert_no_pending_responses()
        expected_contents = []
        for i in range(len(self.content)):
            expected_contents.append((i, bytes(self.content[i:i + 1])))

        self.assert_io_writes(expected_contents)

    def test_start_index(self):
        self.stubber.add_response('get_object',
                                  service_response={'Body': self.stream},
                                  expected_params={
                                      'Bucket': self.bucket,
                                      'Key': self.key
                                  })
        task = self.get_download_task(start_index=5)
        task()

        self.stubber.assert_no_pending_responses()
        self.assert_io_writes([(5, self.content)])

    def test_uses_bandwidth_limiter(self):
        bandwidth_limiter = mock.Mock(BandwidthLimiter)

        self.stubber.add_response('get_object',
                                  service_response={'Body': self.stream},
                                  expected_params={
                                      'Bucket': self.bucket,
                                      'Key': self.key
                                  })
        task = self.get_download_task(bandwidth_limiter=bandwidth_limiter)
        task()

        self.stubber.assert_no_pending_responses()
        self.assertEqual(
            bandwidth_limiter.get_bandwith_limited_stream.call_args_list,
            [mock.call(mock.ANY, self.transfer_coordinator)])

    def test_retries_succeeds(self):
        self.stubber.add_response('get_object',
                                  service_response={
                                      'Body':
                                      StreamWithError(self.stream,
                                                      SOCKET_ERROR)
                                  },
                                  expected_params={
                                      'Bucket': self.bucket,
                                      'Key': self.key
                                  })
        self.stubber.add_response('get_object',
                                  service_response={'Body': self.stream},
                                  expected_params={
                                      'Bucket': self.bucket,
                                      'Key': self.key
                                  })
        task = self.get_download_task()
        task()

        # Retryable error should have not affected the bytes placed into
        # the io queue.
        self.stubber.assert_no_pending_responses()
        self.assert_io_writes([(0, self.content)])

    def test_retries_failure(self):
        for _ in range(self.max_attempts):
            self.stubber.add_response('get_object',
                                      service_response={
                                          'Body':
                                          StreamWithError(
                                              self.stream, SOCKET_ERROR)
                                      },
                                      expected_params={
                                          'Bucket': self.bucket,
                                          'Key': self.key
                                      })

        task = self.get_download_task()
        task()
        self.transfer_coordinator.announce_done()

        # Should have failed out on a RetriesExceededError
        with self.assertRaises(RetriesExceededError):
            self.transfer_coordinator.result()
        self.stubber.assert_no_pending_responses()

    def test_retries_in_middle_of_streaming(self):
        # After the first read a retryable error will be thrown
        self.stubber.add_response('get_object',
                                  service_response={
                                      'Body':
                                      StreamWithError(
                                          copy.deepcopy(self.stream),
                                          SOCKET_ERROR, 1)
                                  },
                                  expected_params={
                                      'Bucket': self.bucket,
                                      'Key': self.key
                                  })
        self.stubber.add_response('get_object',
                                  service_response={'Body': self.stream},
                                  expected_params={
                                      'Bucket': self.bucket,
                                      'Key': self.key
                                  })
        task = self.get_download_task(io_chunksize=1)
        task()

        self.stubber.assert_no_pending_responses()
        expected_contents = []
        # This is the content intially read in before the retry hit on the
        # second read()
        expected_contents.append((0, bytes(self.content[0:1])))

        # The rest of the content should be the entire set of data partitioned
        # out based on the one byte stream chunk size. Note the second
        # element in the list should be a copy of the first element since
        # a retryable exception happened in between.
        for i in range(len(self.content)):
            expected_contents.append((i, bytes(self.content[i:i + 1])))
        self.assert_io_writes(expected_contents)

    def test_cancels_out_of_queueing(self):
        self.stubber.add_response('get_object',
                                  service_response={
                                      'Body':
                                      CancelledStreamWrapper(
                                          self.stream,
                                          self.transfer_coordinator)
                                  },
                                  expected_params={
                                      'Bucket': self.bucket,
                                      'Key': self.key
                                  })
        task = self.get_download_task()
        task()

        self.stubber.assert_no_pending_responses()
        # Make sure that no contents were added to the queue because the task
        # should have been canceled before trying to add the contents to the
        # io queue.
        self.assert_io_writes([])