def test_bucket_put_file(self):
    with patch("quilt3.bucket.copy_file") as copy_mock:
        bucket = Bucket('s3://test-bucket')
        bucket.put_file(key='README.md', path='./README')  # put local file to bucket

        copy_mock.assert_called_once_with(
            PhysicalKey.from_path('README'),
            PhysicalKey.from_url('s3://test-bucket/README.md'))
def test_upload_large_file(self):
    path = DATA_DIR / 'large_file.npy'

    self.s3_stubber.add_client_error(
        method='head_object',
        http_status_code=404,
        expected_params={
            'Bucket': 'example',
            'Key': 'large_file.npy',
        }
    )

    self.s3_stubber.add_response(
        method='put_object',
        service_response={'VersionId': 'v1'},
        expected_params={
            'Body': ANY,
            'Bucket': 'example',
            'Key': 'large_file.npy',
        }
    )

    urls = data_transfer.copy_file_list([
        (PhysicalKey.from_path(path), PhysicalKey.from_url('s3://example/large_file.npy'), path.stat().st_size),
    ])
    assert urls[0] == PhysicalKey.from_url('s3://example/large_file.npy?versionId=v1')
def test_upload_large_file_etag_mismatch(self):
    path = DATA_DIR / 'large_file.npy'

    self.s3_stubber.add_response(
        method='head_object',
        service_response={
            'ContentLength': path.stat().st_size,
            'ETag': '"123"',
            'VersionId': 'v1',
        },
        expected_params={
            'Bucket': 'example',
            'Key': 'large_file.npy',
        }
    )

    self.s3_stubber.add_response(
        method='put_object',
        service_response={'VersionId': 'v2'},
        expected_params={
            'Body': ANY,
            'Bucket': 'example',
            'Key': 'large_file.npy',
        }
    )

    urls = data_transfer.copy_file_list([
        (PhysicalKey.from_path(path), PhysicalKey.from_url('s3://example/large_file.npy'), path.stat().st_size),
    ])
    assert urls[0] == PhysicalKey.from_url('s3://example/large_file.npy?versionId=v2')
def get_pkg(src_registry, data):
    quilt3.util.validate_package_name(data['parent']['name'])
    manifest_pk = src_registry.manifest_pk(data['parent']['name'], data['parent']['top_hash'])

    manifest_size, version = quilt3.data_transfer.get_size_and_version(manifest_pk)
    if manifest_size > PROMOTE_PKG_MAX_MANIFEST_SIZE:
        raise ApiException(
            HTTPStatus.BAD_REQUEST,
            f"Manifest size of {manifest_size} exceeds supported limit of {PROMOTE_PKG_MAX_MANIFEST_SIZE}"
        )
    manifest_pk = PhysicalKey(manifest_pk.bucket, manifest_pk.path, version)

    # TODO: it's better to use TemporaryFile() here, but we don't have API
    #       for downloading to fileobj.
    with tempfile.NamedTemporaryFile() as tmp_file:
        quilt3.data_transfer.copy_file(
            manifest_pk,
            PhysicalKey.from_path(tmp_file.name),
            size=manifest_size,
        )
        pkg = quilt3.Package.load(tmp_file)

    if any(e.physical_key.is_local() for lk, e in pkg.walk()):
        raise ApiException(HTTPStatus.BAD_REQUEST, "Parent's manifest contains non-S3 physical keys.")

    return pkg
def get_pkg(src_registry, data):
    quilt3.util.validate_package_name(data['parent']['name'])
    manifest_pk = src_registry.manifest_pk(data['parent']['name'], data['parent']['top_hash'])

    manifest_size, version = quilt3.data_transfer.get_size_and_version(manifest_pk)
    if manifest_size > PROMOTE_PKG_MAX_MANIFEST_SIZE:
        raise PkgpushException("ManifestTooLarge", {
            "size": manifest_size,
            "max_size": PROMOTE_PKG_MAX_MANIFEST_SIZE,
        })
    manifest_pk = PhysicalKey(manifest_pk.bucket, manifest_pk.path, version)

    # TODO: it's better to use TemporaryFile() here, but we don't have API
    #       for downloading to fileobj.
    with tempfile.NamedTemporaryFile() as tmp_file:
        quilt3.data_transfer.copy_file(
            manifest_pk,
            PhysicalKey.from_path(tmp_file.name),
            size=manifest_size,
        )
        pkg = quilt3.Package.load(tmp_file)

    if any(e.physical_key.is_local() for lk, e in pkg.walk()):
        raise PkgpushException("ManifestHasLocalKeys")

    return pkg
def test_multipart_upload(self):
    name = 'very_large_file.bin'
    path = pathlib.Path(name)

    size = 30 * 1024 * 1024
    chunksize = 8 * 1024 * 1024
    chunks = -(-size // chunksize)  # ceiling division: number of upload parts

    # Create an empty 30MB file; shouldn't take up any actual space on any reasonable filesystem.
    with open(path, 'wb') as fd:
        fd.seek(size - 1)
        fd.write(b'!')

    self.s3_stubber.add_client_error(
        method='head_object',
        http_status_code=404,
        expected_params={
            'Bucket': 'example',
            'Key': name,
        }
    )

    self.s3_stubber.add_response(
        method='create_multipart_upload',
        service_response={'UploadId': '123'},
        expected_params={
            'Bucket': 'example',
            'Key': name,
        }
    )

    for part_num in range(1, chunks + 1):
        self.s3_stubber.add_response(
            method='upload_part',
            service_response={'ETag': 'etag%d' % part_num},
            expected_params={
                'Bucket': 'example',
                'Key': name,
                'UploadId': '123',
                'Body': ANY,
                'PartNumber': part_num,
            }
        )

    self.s3_stubber.add_response(
        method='complete_multipart_upload',
        service_response={},
        expected_params={
            'Bucket': 'example',
            'Key': name,
            'UploadId': '123',
            'MultipartUpload': {
                'Parts': [{
                    'ETag': 'etag%d' % i,
                    'PartNumber': i,
                } for i in range(1, chunks + 1)]
            }
        }
    )

    # The stubber expects calls in order, so force single-threaded uploads.
    with mock.patch('quilt3.data_transfer.MAX_CONCURRENCY', 1):
        data_transfer.copy_file_list([
            (PhysicalKey.from_path(path), PhysicalKey.from_url(f's3://example/{name}'), path.stat().st_size),
        ])
class S3DownloadTest(QuiltTestCase):
    data = b'0123456789abcdef'
    size = len(data)

    bucket = 'test-bucket'
    key = 'test-key'
    src = PhysicalKey(bucket, key, None)

    filename = 'some-file-name'
    dst = PhysicalKey(None, filename, None)

    def _test_download(self, *, threshold, chunksize, parts=data, devnull=False):
        dst = PhysicalKey(None, os.devnull, None) if devnull else self.dst

        with self.s3_test_multi_thread_download(
            self.bucket, self.key, parts, threshold=threshold, chunksize=chunksize
        ):
            data_transfer.copy_file_list([(self.src, dst, self.size)])

        if not devnull:
            with open(self.filename, 'rb') as f:
                assert f.read() == self.data

    def test_threshold_gt_size(self):
        self._test_download(threshold=self.size + 1, chunksize=5)

    def test_threshold_eq_size(self):
        parts = {
            'bytes=0-4': self.data[:5],
            'bytes=5-9': self.data[5:10],
            'bytes=10-14': self.data[10:15],
            'bytes=15-15': self.data[15:],
        }
        self._test_download(threshold=self.size, chunksize=5, parts=parts)

    def test_threshold_eq_size_special_file(self):
        if os.name == 'nt':
            with pytest.raises(
                ValueError,
                match=f'Cannot download to {os.devnull!r}: reserved file name',
            ):
                self._test_download(threshold=self.size, chunksize=5, devnull=True)
        else:
            self._test_download(threshold=self.size, chunksize=5, devnull=True)

    def test_threshold_eq_chunk_eq_size(self):
        self._test_download(threshold=self.size, chunksize=self.size)

    def test_threshold_eq_chunk_gt_size(self):
        self._test_download(threshold=self.size, chunksize=self.size + 1)
def test_body_is_seekable(self):
    """
    No errors if request body.read() or body.seek() are called right before sending request.
    """
    def handler(request, **kwargs):
        request.body.read(2)
        request.body.seek(0)
        raise Success

    path = DATA_DIR / 'small_file.csv'
    self.s3_client.meta.events.register_first('before-send.*', handler)

    with pytest.raises(Success):
        data_transfer.copy_file(
            PhysicalKey.from_path(path),
            PhysicalKey.from_url('s3://example/foo.csv'))
def test_simple_upload(self):
    path = DATA_DIR / 'small_file.csv'

    # Unversioned bucket
    self.s3_stubber.add_response(
        method='put_object',
        service_response={},
        expected_params={
            'Body': ANY,
            'Bucket': 'example',
            'Key': 'foo.csv',
        }
    )

    data_transfer.copy_file(
        PhysicalKey.from_path(path),
        PhysicalKey.from_url('s3://example/foo.csv'))
def test_copy_file_list_retry(self):
    bucket = 'test-bucket'
    other_bucket = f'{bucket}-other'
    key = 'dir/a'
    vid = None

    src = PhysicalKey(bucket, key, vid)
    dst = PhysicalKey(other_bucket, key, vid)

    with mock.patch(
        'botocore.client.BaseClient._make_api_call',
        side_effect=ClientError({}, 'CopyObject'),
    ) as mocked_api_call:
        with pytest.raises(ClientError):
            data_transfer.copy_file_list([(src, dst, 1)])
        self.assertEqual(mocked_api_call.call_count, data_transfer.MAX_COPY_FILE_LIST_RETRIES)
def test_list_local_url(self):
    dir_path = DATA_DIR / 'dir'
    contents = set(list(data_transfer.list_url(PhysicalKey.from_path(dir_path))))
    assert contents == set([
        ('foo.txt', 4),
        ('x/blah.txt', 6),
    ])
def test_schema_load_error_s3(self):
    schema_pk = PhysicalKey.from_url('s3://schema-bucket/schema-key')
    data = get_v1_conf_data('''
        workflows:
          w1:
            name: Name
            metadata_schema: schema-id
        schemas:
          schema-id:
            url: %s
    ''' % schema_pk)
    registry = get_package_registry('s3://some-bucket')
    self.s3_mock_config(data, registry)

    self.s3_stubber.add_client_error(
        'get_object',
        service_error_code='NoSuchKey',
        expected_params={
            'Bucket': 'schema-bucket',
            'Key': 'schema-key',
        },
        http_status_code=404,
    )

    with pytest.raises(QuiltException, match=fr"Couldn't load schema at {schema_pk}"):
        self._validate(registry=registry, workflow='w1')
def exec_module(cls, module):
    """
    Module executor.
    """
    name_parts = module.__name__.split('.')
    registry = get_from_config('default_local_registry')

    if module.__name__ == 'quilt3.data':
        # __path__ must be set even if the package is virtual. Since __path__ will be
        # scanned by all other finders preceding this one in sys.meta_path order, make sure
        # it points to someplace lacking importable objects
        module.__path__ = MODULE_PATH
        return module
    elif len(name_parts) == 3:  # e.g. module.__name__ == quilt3.data.foo
        namespace = name_parts[2]

        # we do not know the name the user will ask for, so populate all valid names
        for pkg in _list_packages(PhysicalKey.from_url(get_package_registry(registry))):
            pkg_user, pkg_name = pkg.split('/')
            if pkg_user == namespace:
                module.__dict__[pkg_name] = Package._browse(pkg, registry=registry)

        module.__path__ = MODULE_PATH
        return module
    else:
        assert False
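# Usage sketch (illustrative only, not part of the module): the finder above lets
# packages in the default local registry be imported under quilt3.data.<user>,
# with each package exposed as an attribute of that submodule. The user and
# package names below are assumptions for the example.
#
#   import quilt3.data.example_user              # triggers exec_module() for the namespace
#   pkg = quilt3.data.example_user.example_pkg   # a quilt3.Package browsed from the local registry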
def test_copy_file_list_retry_non_client_error(self):
    """
    copy_file_list() does not retry on arbitrary (non-ClientError) exceptions.
    """
    bucket = 'test-bucket'
    other_bucket = f'{bucket}-other'
    key = 'dir/a'
    vid = None

    src = PhysicalKey(bucket, key, vid)
    dst = PhysicalKey(other_bucket, key, vid)

    with mock.patch(
        'botocore.client.BaseClient._make_api_call',
        side_effect=Exception('test exception'),
    ) as mocked_api_call:
        with pytest.raises(Exception, match='test exception'):
            data_transfer.copy_file_list([(src, dst, 1)])
        assert mocked_api_call.call_count == 1
def setUp(self):
    self.pkg = Package()
    self.entry_with_hash = PackageEntry(
        PhysicalKey('test-bucket', 'with-hash', 'with-hash'),
        42,
        {'type': 'SHA256', 'value': '0' * 64},
        {},
    )
    self.entry_without_hash = PackageEntry(
        PhysicalKey('test-bucket', 'without-hash', 'without-hash'),
        42,
        None,
        {},
    )
    self.pkg.set('with-hash', self.entry_with_hash)
    self.pkg.set('without-hash', self.entry_without_hash)
def test_progress_updates(self, mocked_update):
    """
    Progress callback is called when calling body.read() or body.seek().
    """
    def handler(request, **kwargs):
        request.body.read(2)
        mocked_update.assert_called_once_with(2)
        mocked_update.reset_mock()

        request.body.seek(0)
        mocked_update.assert_called_once_with(-2)

        raise Success

    path = DATA_DIR / 'small_file.csv'
    self.s3_client.meta.events.register_first('before-send.*', handler)

    with pytest.raises(Success):
        data_transfer.copy_file(
            PhysicalKey.from_path(path),
            PhysicalKey.from_url('s3://example/foo.csv'))
class S3HashingTest(QuiltTestCase):
    data = b'0123456789abcdef'
    size = len(data)
    hasher = hashlib.sha256

    bucket = 'test-bucket'
    key = 'test-key'
    src = PhysicalKey(bucket, key, None)

    def _hashing_subtest(self, *, threshold, chunksize, data=data):
        with self.s3_test_multi_thread_download(
            self.bucket, self.key, data, threshold=threshold, chunksize=chunksize
        ):
            assert data_transfer.calculate_sha256([self.src], [self.size]) == [
                self.hasher(self.data).hexdigest()
            ]

    def test_single_request(self):
        params = (
            (self.size + 1, 5),
            (self.size, self.size),
            (self.size, self.size + 1),
            (5, self.size),
        )
        for threshold, chunksize in params:
            with self.subTest(threshold=threshold, chunksize=chunksize):
                self._hashing_subtest(threshold=threshold, chunksize=chunksize)

    def test_multi_request(self):
        params = (
            (self.size, 5, {
                'bytes=0-4': self.data[:5],
                'bytes=5-9': self.data[5:10],
                'bytes=10-14': self.data[10:15],
                'bytes=15-15': self.data[15:],
            }),
            (5, self.size - 1, {
                'bytes=0-14': self.data[:15],
                'bytes=15-15': self.data[15:],
            }),
        )
        for threshold, chunksize, data in params:
            for concurrency in (len(data), 1):
                with mock.patch(
                    'quilt3.data_transfer.s3_transfer_config.max_request_concurrency',
                    concurrency,
                ):
                    with self.subTest(threshold=threshold, chunksize=chunksize,
                                      data=data, concurrency=concurrency):
                        self._hashing_subtest(threshold=threshold, chunksize=chunksize, data=data)
def _test_download(self, *, threshold, chunksize, parts=data, devnull=False):
    dst = PhysicalKey(None, os.devnull, None) if devnull else self.dst

    with self.s3_test_multi_thread_download(
        self.bucket, self.key, parts, threshold=threshold, chunksize=chunksize
    ):
        data_transfer.copy_file_list([(self.src, dst, self.size)])

    if not devnull:
        with open(self.filename, 'rb') as f:
            assert f.read() == self.data
def test_copy_file_list_multipart_retry(self):
    bucket = 'test-bucket'
    other_bucket = f'{bucket}-other'
    key = 'dir/a'
    vid = None

    src = PhysicalKey(bucket, key, vid)
    dst = PhysicalKey(other_bucket, key, vid)

    parts = 2 * data_transfer.s3_transfer_config.max_request_concurrency
    size = parts * data_transfer.s3_transfer_config.multipart_threshold

    def side_effect(operation_name, *args, **kwargs):
        if operation_name == 'CreateMultipartUpload':
            return {'UploadId': '123'}
        time.sleep(0.1)
        raise ClientError({}, 'CopyObject')

    with mock.patch('botocore.client.BaseClient._make_api_call', side_effect=side_effect):
        with pytest.raises(ClientError):
            data_transfer.copy_file_list([(src, dst, size)])
def get_package_registry(path=None) -> PackageRegistry:
    """ Returns the package registry for a given path """
    # TODO: Don't check if it's PackageRegistry? Then we need better separation
    #       to external functions that receive string and internal that receive
    #       PackageRegistry.
    if isinstance(path, PackageRegistry):
        return path
    if not isinstance(path, PhysicalKey):
        path = PhysicalKey.from_url(
            get_from_config('default_local_registry')
            if path is None else
            fix_url(path)
        )
    return (LocalPackageRegistryV1 if path.is_local() else S3PackageRegistryV1)(path)
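# Usage sketch (illustrative; 'example-bucket' is an assumption, not a real registry):
# an existing PackageRegistry instance passes through unchanged, None resolves the
# configured default local registry, and string paths are normalized via fix_url().
#
#   local_registry = get_package_registry()                        # LocalPackageRegistryV1
#   remote_registry = get_package_registry('s3://example-bucket')  # S3PackageRegistryV1
#   assert get_package_registry(remote_registry) is remote_registry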
def test_calculate_sha256_read_timeout(self):
    bucket = 'test-bucket'
    key = 'dir/a'
    vid = 'a1234'

    a_contents = b'a' * 10

    pk = PhysicalKey(bucket, key, vid)
    with mock.patch(
        'botocore.client.BaseClient._make_api_call',
        side_effect=ReadTimeoutError('Error Uploading', endpoint_url="s3://foobar"),
    ):
        results = data_transfer.calculate_sha256([pk], [len(a_contents)])
        assert list(results) == [None]
def get_pkg_entry(cls, path):
    pk = PhysicalKey.from_url(f's3://{cls.parent_bucket}/{path}?versionId=obj{path}Version')
    return PackageEntry(
        pk,
        cls.file_size,
        {'type': 'SHA256', 'value': cls.get_file_hash(pk)},
        cls.get_file_meta(pk),
    )
def test_download_latest_in_versioned_bucket(self):
    bucket = 'example'
    key = 'foo.csv'
    src = PhysicalKey(bucket, key, None)
    latest_version = '1'
    latest_size = 3

    # Check the latest version and size.
    expected_params = {
        'Bucket': bucket,
        'Key': key,
    }
    self.s3_stubber.add_response(
        'head_object',
        service_response={
            'VersionId': latest_version,
            'ContentLength': latest_size,
        },
        expected_params=expected_params,
    )

    for i in range(latest_size):
        self.s3_stubber.add_response(
            'get_object',
            service_response={
                'Body': io.BytesIO(b'0'),
            },
            expected_params={
                **expected_params,
                'Range': f'bytes={i}-{i}',
                # Version must be specified, otherwise we could end up with a truncated
                # file if the object was modified after getting the latest version/size.
                'VersionId': latest_version,
            },
        )

    data_transfer.copy_file(
        src,
        PhysicalKey.from_path('some-file'),
    )
def test_calculate_sha256_read_timeout(self, mocked_api_call):
    bucket = 'test-bucket'
    key = 'dir/a'
    vid = 'a1234'

    a_contents = b'a' * 10

    pk = PhysicalKey(bucket, key, vid)
    exc = ReadTimeoutError('Error Uploading', endpoint_url="s3://foobar")
    mocked_api_call.side_effect = exc

    results = data_transfer.calculate_sha256([pk], [len(a_contents)])
    assert mocked_api_call.call_count == data_transfer.MAX_FIX_HASH_RETRIES
    assert results == [exc]
def prepare_pkg(self, *, copy_data):
    expected_pkg = Package()
    pkg_entries = self.entries.items()
    if copy_data:
        pkg_entries = [
            (
                lk,
                e.with_physical_key(PhysicalKey(
                    self.dst_bucket,
                    f'{self.dst_pkg_name}/{lk}',
                    'dst_' + e.physical_key.version_id,
                )),
            )
            for lk, e in pkg_entries
        ]
    for lk, entry in pkg_entries:
        expected_pkg.set(lk, entry)
    expected_pkg._set_commit_message(None)
    return expected_pkg
def test_get_size_and_version(self):
    response = {
        'ETag': '12345',
        'VersionId': '1.0',
        'ContentLength': 123,
    }
    expected_params = {
        'Bucket': 'my_bucket',
        'Key': 'my_obj',
    }
    self.s3_stubber.add_response('head_object', response, expected_params)

    # Verify the version is present
    assert data_transfer.get_size_and_version(PhysicalKey.from_url('s3://my_bucket/my_obj'))[1] == '1.0'
def test_remote_registry_local_schema(self):
    data = get_v1_conf_data('''
        workflows:
          w1:
            name: Name
            metadata_schema: schema-id
        schemas:
          schema-id:
            url: file:///local/path
    ''')
    registry = 's3://some-bucket'
    self.s3_mock_config(data, get_package_registry(registry))

    schema_pk = PhysicalKey.from_path('/local/path')
    error_msg = rf"Local schema '{schema_pk}' can't be used on the remote registry."
    with pytest.raises(QuiltException, match=error_msg):
        self._validate(registry=registry, workflow='w1')
def _test_download(self, *, threshold, chunksize, parts=None, devnull=False):
    num_parts = 1 if parts is None else len(parts)
    barrier = threading.Barrier(num_parts, timeout=2)

    def side_effect(*args, **kwargs):
        barrier.wait()  # This ensures that we have concurrent calls to get_object().
        return {
            'VersionId': 'v1',
            'Body': io.BytesIO(self.data if parts is None else parts[kwargs['Range']]),
        }

    dst = PhysicalKey(None, os.devnull, None) if devnull else self.dst

    with mock.patch('quilt3.data_transfer.s3_transfer_config.max_request_concurrency', num_parts), \
         mock.patch('quilt3.data_transfer.s3_transfer_config.multipart_threshold', threshold), \
         mock.patch('quilt3.data_transfer.s3_transfer_config.multipart_chunksize', chunksize), \
         mock.patch.object(self.s3_client, 'get_object', side_effect=side_effect) as get_object_mock:
        data_transfer.copy_file_list([(self.src, dst, self.size)])

    expected_params = {
        'Bucket': self.bucket,
        'Key': self.key,
    }
    if parts is None:
        get_object_mock.assert_called_once_with(**expected_params)
    else:
        get_object_mock.assert_has_calls(
            [mock.call(**expected_params, Range=r) for r in parts],
            any_order=True,
        )
        assert len(get_object_mock.call_args_list) == num_parts

    if not devnull:
        with open(self.filename, 'rb') as f:
            assert f.read() == self.data
def test_bucket_put_dir(self):
    path = pathlib.Path(__file__).parent / 'data'
    bucket = Bucket('s3://test-bucket')

    with patch("quilt3.bucket.copy_file") as copy_mock:
        bucket.put_dir('test', path)
        copy_mock.assert_called_once_with(
            PhysicalKey.from_path(str(path) + '/'),
            PhysicalKey.from_url('s3://test-bucket/test/'))

    with patch("quilt3.bucket.copy_file") as copy_mock:
        bucket.put_dir('test/', path)
        copy_mock.assert_called_once_with(
            PhysicalKey.from_path(str(path) + '/'),
            PhysicalKey.from_url('s3://test-bucket/test/'))

    with patch("quilt3.bucket.copy_file") as copy_mock:
        bucket.put_dir('', path)
        copy_mock.assert_called_once_with(
            PhysicalKey.from_path(str(path) + '/'),
            PhysicalKey.from_url('s3://test-bucket/'))
def test_multi_upload(self):
    path1 = DATA_DIR / 'small_file.csv'
    path2 = DATA_DIR / 'dir/foo.txt'

    # Unversioned bucket
    self.s3_stubber.add_response(
        method='put_object',
        service_response={},
        expected_params={
            'Body': ANY,
            'Bucket': 'example1',
            'Key': 'foo.csv',
        }
    )

    # Versioned bucket
    self.s3_stubber.add_response(
        method='put_object',
        service_response={'VersionId': 'v123'},
        expected_params={
            'Body': ANY,
            'Bucket': 'example2',
            'Key': 'foo.txt',
        }
    )

    # stubber expects responses in order, so disable multi-threading.
    with mock.patch('quilt3.data_transfer.s3_transfer_config.max_request_concurrency', 1):
        urls = data_transfer.copy_file_list([
            (PhysicalKey.from_path(path1), PhysicalKey.from_url('s3://example1/foo.csv'), path1.stat().st_size),
            (PhysicalKey.from_path(path2), PhysicalKey.from_url('s3://example2/foo.txt'), path2.stat().st_size),
        ])

        assert urls[0] == PhysicalKey.from_url('s3://example1/foo.csv')
        assert urls[1] == PhysicalKey.from_url('s3://example2/foo.txt?versionId=v123')