async def test_zip_files(self, temp_files):
    files = []
    for filename in ['file1.ext', 'zip.zip', 'file2.ext']:
        path = temp_files.add_file(filename)
        contents = os.urandom(2**5)
        with open(path, 'wb') as f:
            f.write(contents)
        files.append({
            'filename': filename,
            'path': path,
            'contents': contents,
            'handle': open(path, 'rb'),
        })

    stream = streams.ZipStreamReader(
        AsyncIterator(
            (file['filename'], streams.FileStreamReader(file['handle']))
            for file in files
        )
    )
    data = await stream.read()
    for file in files:
        file['handle'].close()

    zip = zipfile.ZipFile(io.BytesIO(data))

    # Verify CRCs: `.testzip()` returns `None` if there are no bad files in the zipfile
    assert zip.testzip() is None

    for file in files:
        assert zip.open(file['filename']).read() == file['contents']

        compression_type = zip.open(file['filename'])._compress_type
        if file['filename'].endswith('.zip'):
            assert compression_type == zipfile.ZIP_STORED
        else:
            assert compression_type != zipfile.ZIP_STORED

async def test_multiple_large_files(self, temp_files):
    files = []
    for index in range(5):
        filename = 'file{}.ext'.format(index)
        path = temp_files.add_file(filename)
        contents = os.urandom(2**18)
        with open(path, 'wb') as f:
            f.write(contents)
        files.append({
            'filename': filename,
            'path': path,
            'contents': contents,
        })

    for file in files:
        file['handle'] = open(file['path'], 'rb')

    stream = streams.ZipStreamReader(
        AsyncIterator(
            (file['filename'], streams.FileStreamReader(file['handle']))
            for file in files
        )
    )
    data = await stream.read()
    for file in files:
        file['handle'].close()

    zip = zipfile.ZipFile(io.BytesIO(data))

    # Verify CRCs
    assert zip.testzip() is None

    for file in files:
        assert zip.open(file['filename']).read() == file['contents']

async def _upload_parity(path, credentials, settings):
    _, name = os.path.split(path)
    provider_name = settings.get('provider')
    provider = make_provider(provider_name, {}, credentials, settings)
    with open(path, 'rb') as file_pointer:
        stream = streams.FileStreamReader(file_pointer)
        await provider.upload(stream, (await provider.validate_path('/' + name)))

def download(self, path, revision=None, **kwargs):
    if not os.path.exists(path.full_path):
        raise exceptions.DownloadError(
            'Could not retrieve file \'{0}\''.format(path),
            code=404,
        )
    file_pointer = open(path.full_path, 'rb')
    return streams.FileStreamReader(file_pointer)

async def _upload_parity(path, credentials, settings):
    _, name = os.path.split(path)
    provider_name = settings.get('provider')
    provider = make_provider(provider_name, {}, credentials, settings)
    with open(path, 'rb') as file_pointer:
        stream = streams.FileStreamReader(file_pointer)
        stream.add_writer('sha256', streams.HashStreamWriter(hashlib.sha256))
        await provider.upload(stream, (await provider.validate_path('/' + name)))
    return (name, stream.writers['sha256'].hexdigest)

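# The writer attached above computes the hash as a side effect of the upload itself,
# so the file is only read once. A minimal sketch of just that pattern, assuming (as
# the snippets here do) that reading a stream feeds each chunk to its registered
# writers; the helper name `sha256_of_stream` is hypothetical:
import hashlib


async def sha256_of_stream(stream):
    """Drain ``stream`` and return its sha256 hex digest."""
    stream.add_writer('sha256', streams.HashStreamWriter(hashlib.sha256))
    while await stream.read(64 * 1024):  # each chunk read is mirrored to the writer
        pass
    return stream.writers['sha256'].hexdigest
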
async def upload(self, stream, path, **kwargs):
    """Zips the given stream, then uploads it to Dataverse. Deletes any existing
    draft file with the same name.

    :param waterbutler.core.streams.RequestWrapper stream: The stream to put to Dataverse
    :param str path: The filename prepended with '/'
    :rtype: dict, bool
    """
    stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5))
    zip_stream = streams.ZipStreamReader(AsyncIterator([(path.name, stream)]))

    # Write the zip stream to disk -- necessary to learn the zip file's size, since
    # the upload request needs a Content-Length header
    f = tempfile.TemporaryFile()
    chunk = await zip_stream.read()
    while chunk:
        f.write(chunk)
        chunk = await zip_stream.read()
    file_stream = streams.FileStreamReader(f)

    dv_headers = {
        'Content-Disposition': 'filename=temp.zip',
        'Content-Type': 'application/zip',
        'Packaging': 'http://purl.org/net/sword/package/SimpleZip',
        'Content-Length': str(file_stream.size),
    }

    # Delete the old file if it exists
    if path.identifier:
        await self.delete(path)

    resp = await self.make_request(
        'POST',
        self.build_url(settings.EDIT_MEDIA_BASE_URL, 'study', self.doi),
        headers=dv_headers,
        auth=(self.token, ),
        data=file_stream,
        expects=(201, ),
        throws=exceptions.UploadError,
    )
    await resp.release()

    # Find the appropriate version of the file's metadata
    metadata = await self._get_data('latest')
    files = metadata if isinstance(metadata, list) else []
    file_metadata = next(file for file in files if file.name == path.name)

    if stream.writers['md5'].hexdigest != file_metadata.extra['hashes']['md5']:
        raise exceptions.UploadChecksumMismatchError()

    return file_metadata, path.identifier is None

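# The tempfile round-trip above exists because the SWORD endpoint needs a
# Content-Length up front, while a ZipStreamReader cannot know its compressed size
# until it has been fully consumed. A minimal sketch of that buffering step in
# isolation, using only the stream APIs already shown; the helper name
# `buffer_to_file_stream` is hypothetical:
import tempfile


async def buffer_to_file_stream(source_stream):
    """Drain an async stream to a temporary file and rewrap it as a
    FileStreamReader, whose ``size`` is then knowable, at the cost of
    holding the whole payload on disk."""
    f = tempfile.TemporaryFile()
    chunk = await source_stream.read()
    while chunk:
        f.write(chunk)
        chunk = await source_stream.read()
    return streams.FileStreamReader(f)
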
async def _upload_part(self, stream: streams.BaseStream, part_id: str,
                       part_size: int, start_offset: int, session_id: str) -> dict:
    """Upload one part/chunk of the given stream to Box.

    Box requires that the sha of the part be sent along in the headers of the
    request.  To do this WB must write the stream segment to disk before uploading.
    The part sha is calculated as the tempfile is written.

    API Docs: https://developer.box.com/reference#upload-part
    """
    cutoff_stream = streams.CutoffStream(stream, cutoff=part_size)
    part_hasher_name = 'part-{}-sha1'.format(part_id)
    stream.add_writer(part_hasher_name, streams.HashStreamWriter(hashlib.sha1))

    f = tempfile.TemporaryFile()
    chunk = await cutoff_stream.read(self.TEMP_CHUNK_SIZE)
    while chunk:
        f.write(chunk)
        chunk = await cutoff_stream.read(self.TEMP_CHUNK_SIZE)
    file_stream = streams.FileStreamReader(f)

    part_sha = stream.writers[part_hasher_name].digest
    part_sha_b64 = base64.standard_b64encode(part_sha).decode()
    stream.remove_writer(part_hasher_name)

    byte_range = self._build_range_header((start_offset, start_offset + part_size - 1))
    content_range = str(byte_range).replace('=', ' ') + '/{}'.format(stream.size)

    async with self.request(
        'PUT',
        self._build_upload_url('files', 'upload_sessions', session_id),
        headers={
            # ``Content-Length`` is required for ``asyncio`` to use inner chunked stream read
            'Content-Length': str(part_size),
            'Content-Range': content_range,
            'Content-Type': 'application/octet-stream',
            'Digest': 'sha={}'.format(part_sha_b64),
        },
        data=file_stream,
        expects=(201, 200),
        throws=exceptions.UploadError,
    ) as resp:
        data = await resp.json()

    f.close()
    return data['part']

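# The header bookkeeping above is the subtle part of the part upload, so here is the
# same arithmetic in isolation. A minimal sketch, assuming Box's documented
# ``bytes {first}-{last}/{total}`` Content-Range format and base64-encoded sha1
# Digest (both visible in the code above); `box_part_headers` is a hypothetical name:
import base64
import hashlib


def box_part_headers(part_bytes: bytes, start_offset: int, total_size: int) -> dict:
    """Build the per-part headers for a Box upload-session PUT."""
    end_offset = start_offset + len(part_bytes) - 1
    sha_b64 = base64.standard_b64encode(hashlib.sha1(part_bytes).digest()).decode()
    return {
        'Content-Length': str(len(part_bytes)),
        # A Range header reads ``bytes=0-999``; Content-Range swaps '=' for a space
        # and appends the total stream size: ``bytes 0-999/262144``.
        'Content-Range': 'bytes {}-{}/{}'.format(start_offset, end_offset, total_size),
        'Content-Type': 'application/octet-stream',
        'Digest': 'sha={}'.format(sha_b64),
    }
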
async def test_upload_nested_update(self, provider):
    file_name = 'nested.txt'
    file_folder = '/subfolder'
    file_path = os.path.join(file_folder, file_name)
    file_content = b'Test Update Nested Content'
    file_stream = streams.FileStreamReader(io.BytesIO(file_content))

    path = await provider.validate_path(file_path)
    metadata, created = await provider.upload(file_stream, path)

    assert metadata.name == file_name
    assert metadata.path == file_path
    assert metadata.size == len(file_content)
    assert created is False

async def test_upload_update(self, provider):
    file_name = 'flower.jpg'
    file_folder = '/'
    file_path = os.path.join(file_folder, file_name)
    file_content = b'Short and stout'
    file_stream = streams.FileStreamReader(io.BytesIO(file_content))

    path = await provider.validate_path(file_path)
    metadata, created = await provider.upload(file_stream, path)

    assert metadata.name == file_name
    assert metadata.path == file_path
    assert metadata.size == len(file_content)
    assert created is False

async def test_upload_nested_create(self, provider):
    file_name = 'new.txt'
    file_folder = '/newsubfolder'
    file_path = os.path.join(file_folder, file_name)
    file_content = b'Test New Nested Content'
    file_stream = streams.FileStreamReader(io.BytesIO(file_content))

    path = await provider.validate_path(file_path)
    metadata, created = await provider.upload(file_stream, path)

    assert metadata.name == file_name
    assert metadata.path == file_path
    assert metadata.size == len(file_content)
    assert created is True

async def test_file_stream_reader_after_seek(self):
    with open(DUMMY_FILE, 'r') as fp:
        fp.seek(3)
        reader = streams.FileStreamReader(fp)
        assert reader.size == 27  # still gives the full size
        assert fp.tell() == 3     # returns to the original seek position

        data = await reader.read()
        assert data == 'abcdefghijklmnopqrstuvwxyz\n'  # always reads the full data
        at_eof = reader.at_eof()
        assert not at_eof

        data = await reader.read()
        assert data == b''
        at_eof = reader.at_eof()
        assert at_eof

async def upload(self, stream, path, **kwargs):
    """Zips the given stream, then uploads it to Dataverse. Deletes any existing
    draft file with the same name.

    :param waterbutler.core.streams.RequestWrapper stream: The stream to put to Dataverse
    :param str path: The filename prepended with '/'
    :rtype: dict, bool
    """
    zip_stream = streams.ZipStreamReader(AsyncIterator([(path.name, stream)]))

    # Write the zip stream to disk -- necessary to learn the zip file's size
    f = tempfile.TemporaryFile()
    chunk = await zip_stream.read()
    while chunk:
        f.write(chunk)
        chunk = await zip_stream.read()
    file_stream = streams.FileStreamReader(f)

    dv_headers = {
        'Content-Disposition': 'filename=temp.zip',
        'Content-Type': 'application/zip',
        'Packaging': 'http://purl.org/net/sword/package/SimpleZip',
        'Content-Length': str(file_stream.size),
    }

    # Delete the old file if it exists
    if path.identifier:
        await self.delete(path)

    await self.make_request(
        'POST',
        self.build_url(settings.EDIT_MEDIA_BASE_URL, 'study', self.doi),
        headers=dv_headers,
        auth=(self.token, ),
        data=file_stream,
        expects=(201, ),
        throws=exceptions.UploadError,
    )

    # Find the appropriate version of the file
    metadata = await self._get_data('latest')
    files = metadata if isinstance(metadata, list) else []
    file_metadata = next(file for file in files if file['name'] == path.name)

    return file_metadata, path.identifier is None

async def test_file_stream_reader(self):
    with open(DUMMY_FILE, 'r') as fp:
        reader = streams.FileStreamReader(fp)
        assert reader.size == 27

        data = await reader.read()
        assert data == 'abcdefghijklmnopqrstuvwxyz\n'
        at_eof = reader.at_eof()
        assert not at_eof

        data = await reader.read()
        assert data == b''
        at_eof = reader.at_eof()
        assert at_eof

        reader.close()
        at_eof = reader.at_eof()
        assert at_eof

        # Closing the reader also closes the underlying file handle
        with pytest.raises(ValueError):
            fp.read()

async def test_single_large_file(self, temp_files):
    filename = 'foo.txt'
    path = temp_files.add_file(filename)
    random_data = os.urandom(2**18)
    with open(path, 'wb') as f:
        f.write(random_data)

    with open(path, 'rb') as f:
        stream = streams.ZipStreamReader(
            AsyncIterator([(filename, streams.FileStreamReader(f))])
        )
        data = await stream.read()

    zip = zipfile.ZipFile(io.BytesIO(data))

    # Verify CRCs
    assert zip.testzip() is None

    result = zip.open('foo.txt')

    # Check content of included file
    assert result.read() == random_data

async def test_file_stream_reader_subset(self):
    with open(DUMMY_FILE, 'r') as fp:
        reader = streams.FileStreamReader(fp)

        data = await reader.read(10)
        assert data == 'abcdefghij'
        at_eof = reader.at_eof()
        assert not at_eof

        data = await reader.read(2)
        assert data == 'kl'
        at_eof = reader.at_eof()
        assert not at_eof

        data = await reader.read()
        assert data == 'mnopqrstuvwxyz\n'
        at_eof = reader.at_eof()
        assert not at_eof

        data = await reader.read()
        assert data == b''
        at_eof = reader.at_eof()
        assert at_eof

def file_stream(file_like):
    return streams.FileStreamReader(file_like)

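# A minimal usage sketch for the fixture above, mirroring the in-memory pattern the
# upload tests in this section use (FileStreamReader accepts any seekable file-like
# object, including ``io.BytesIO``); the function name is hypothetical:
import io


async def example_read_to_eof():
    content = b'Short and stout'
    reader = file_stream(io.BytesIO(content))
    assert reader.size == len(content)
    assert await reader.read() == content  # the first read returns the full payload
    assert await reader.read() == b''      # subsequent reads return empty bytes
    assert reader.at_eof()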