from dvc.fs.local import LocalFileSystem
from dvc.istextfile import DEFAULT_CHUNK_SIZE, istextfile
from dvc.utils import file_md5
from dvc.utils.stream import HashedStreamReader


def test_hashed_stream_reader(tmp_dir):
    tmp_dir.gen({"foo": "foo"})

    foo = tmp_dir / "foo"
    with open(foo, "rb") as fobj:
        stream_reader = HashedStreamReader(fobj)
        assert stream_reader.read(3) == b"foo"

    hex_digest = file_md5(foo, LocalFileSystem(None, {}))
    assert stream_reader.is_text_file
    assert hex_digest == stream_reader.hash_info.value
def test_hashed_stream_reader_as_chunks(tmp_dir):
    tmp_dir.gen({"foo": b"foo \x00" * 16})

    foo = tmp_dir / "foo"
    with open(foo, "rb") as fobj:
        stream_reader = HashedStreamReader(fobj)
        while True:
            chunk = stream_reader.read(16)
            if not chunk:
                break

    hex_digest = file_md5(foo, LocalFileSystem(None, {}))
    assert not stream_reader.is_text_file
    assert hex_digest == stream_reader.hash_info.value
def test_hashed_stream_reader_compatibility(tmp_dir, contents):
    # Always read more than the DEFAULT_CHUNK_SIZE (512 bytes).
    # This imitates the read actions performed by upload_fobj.
    chunk_size = DEFAULT_CHUNK_SIZE * 2

    tmp_dir.gen("data", contents)
    data = tmp_dir / "data"

    with open(data, "rb") as fobj:
        stream_reader = HashedStreamReader(fobj)
        stream_reader.read(chunk_size)

    local_fs = LocalFileSystem()
    hex_digest = file_md5(data, local_fs)

    assert stream_reader.is_text_file is istextfile(data, local_fs)
    assert stream_reader.hash_info.value == hex_digest
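# The tests above pin down three behaviours of HashedStreamReader: the bytes
# pass through unchanged, the running digest ends up equal to file_md5 over
# the whole file, and text detection agrees with istextfile. A minimal sketch
# of that wrapper pattern (illustrative only, not DVC's implementation; the
# class name and the NUL-byte text heuristic are assumptions):

import hashlib
import io


class _HashedReaderSketch(io.RawIOBase):
    """Wrap a binary stream and hash every chunk as it is read."""

    def __init__(self, fobj):
        self.fobj = fobj
        self.md5 = hashlib.md5()
        self.is_text_file = None  # decided from the first chunk
        self.total_read = 0

    def readable(self):
        return True  # seekable() stays False: seeking would corrupt the digest

    def read(self, n=-1):
        chunk = self.fobj.read(n)
        if self.is_text_file is None:
            # Assumed heuristic: treat the stream as binary if the first
            # chunk contains a NUL byte (similar to git's check).
            self.is_text_file = b"\0" not in chunk
        self.md5.update(chunk)
        self.total_read += len(chunk)
        return chunk

    def tell(self):
        return self.total_read


if __name__ == "__main__":
    # Draining the sketch to EOF reproduces the assertions above.
    reader = _HashedReaderSketch(io.BytesIO(b"foo"))
    assert reader.read() == b"foo"
    assert reader.is_text_file
    assert reader.md5.hexdigest() == hashlib.md5(b"foo").hexdigest()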
def _upload_file(path_info, fs, odb, upload_odb):
    from dvc.utils import tmp_fname
    from dvc.utils.stream import HashedStreamReader

    # The hash is not known up front, so stream into a temporary
    # location first and register the object once it is computed.
    tmp_info = upload_odb.path_info / tmp_fname()
    with fs.open(path_info, mode="rb", chunk_size=fs.CHUNK_SIZE) as stream:
        stream = HashedStreamReader(stream)
        size = fs.getsize(path_info)
        upload_odb.fs.upload(stream, tmp_info, desc=path_info.name, total=size)

    odb.add(tmp_info, upload_odb.fs, stream.hash_info)
    return path_info, odb.get(stream.hash_info)
def test_hashed_stream_reader_as_chunks(tmp_dir):
    tmp_dir.gen({"foo": b"foo \x00" * 16})

    foo = tmp_dir / "foo"

    actual_size = len(foo.read_bytes())
    with open(foo, "rb") as fobj:
        stream_reader = HashedStreamReader(fobj)

        total_read = 0
        while True:
            chunk = stream_reader.read(16)
            total_read += len(chunk)
            assert stream_reader.tell() == total_read
            if not chunk:
                break

        assert stream_reader.tell() == actual_size == total_read

    hex_digest = file_md5(foo, LocalFileSystem(None, {}))
    assert not stream_reader.is_text_file
    assert hex_digest == stream_reader.hash_info.value
def _upload_file(path_info, fs, odb):
    from dvc.objects.file import HashFile
    from dvc.utils import tmp_fname
    from dvc.utils.stream import HashedStreamReader

    tmp_info = odb.path_info / tmp_fname()
    with fs.open(path_info, mode="rb", chunk_size=fs.CHUNK_SIZE) as stream:
        stream = HashedStreamReader(stream)
        odb.fs.upload_fobj(
            stream, tmp_info, desc=path_info.name, size=fs.getsize(path_info)
        )

    obj = HashFile(tmp_info, odb.fs, stream.hash_info)
    return path_info, obj
def _upload_file(from_fs_path, fs, odb, upload_odb):
    from dvc.utils import tmp_fname
    from dvc.utils.stream import HashedStreamReader

    fs_path = upload_odb.fs.path
    tmp_info = fs_path.join(upload_odb.fs_path, tmp_fname())
    with fs.open(from_fs_path, mode="rb", chunk_size=fs.CHUNK_SIZE) as stream:
        stream = HashedStreamReader(stream)
        size = fs.getsize(from_fs_path)
        upload_odb.fs.upload(
            stream, tmp_info, desc=fs_path.name(from_fs_path), total=size
        )

    odb.add(tmp_info, upload_odb.fs, stream.hash_info)
    meta = Meta(size=size)
    return from_fs_path, meta, odb.get(stream.hash_info)
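# All three _upload_file variants above share one shape: because the digest is
# unknown until the stream is exhausted, the file is streamed into a temporary
# name first and only then registered under its hash. A filesystem-only sketch
# of that flow (hypothetical helper, not DVC code; the two-level digest layout
# mimics typical content-addressed caches):

import hashlib
import os
import shutil
import uuid


def _upload_then_address(src_path, store_dir, chunk_size=1024 * 1024):
    """Copy src_path into store_dir under its MD5, hashed while copying."""
    os.makedirs(store_dir, exist_ok=True)
    tmp_path = os.path.join(store_dir, f".{uuid.uuid4().hex}.tmp")
    md5 = hashlib.md5()
    with open(src_path, "rb") as src, open(tmp_path, "wb") as tmp:
        while True:
            chunk = src.read(chunk_size)
            if not chunk:
                break
            md5.update(chunk)  # hash and write in a single pass
            tmp.write(chunk)
    digest = md5.hexdigest()
    final_path = os.path.join(store_dir, digest[:2], digest[2:])
    os.makedirs(os.path.dirname(final_path), exist_ok=True)
    # Move only once the hash is known, so partially written data never
    # appears under an addressed name.
    shutil.move(tmp_path, final_path)
    return digest, final_path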
def test_hashed_stream_reader(tmp_dir):
    tmp_dir.gen({"foo": "foo"})

    foo = tmp_dir / "foo"
    with open(foo, "rb") as fobj:
        stream_reader = HashedStreamReader(fobj)

        assert stream_reader.readable()
        assert not stream_reader.seekable()

        assert stream_reader.read(2) == b"fo"
        assert stream_reader.tell() == 2

        assert stream_reader.read(1) == b"o"
        assert stream_reader.tell() == 3

    hex_digest = file_md5(foo, LocalFileSystem())
    assert stream_reader.is_text_file
    assert hex_digest == stream_reader.hash_info.value
def _transfer_file(odb, from_fs, from_info):
    from dvc.utils import tmp_fname
    from dvc.utils.stream import HashedStreamReader

    tmp_info = odb.fs.path_info / tmp_fname()
    with from_fs.open(
        from_info, mode="rb", chunk_size=from_fs.CHUNK_SIZE
    ) as stream:
        stream_reader = HashedStreamReader(stream)
        # Since we don't know the hash beforehand, we'll
        # upload it to a temporary location and then move
        # it.
        odb.fs.upload_fobj(
            stream_reader,
            tmp_info,
            total=from_fs.getsize(from_info),
            desc=from_info.name,
        )

    hash_info = stream_reader.hash_info
    return tmp_info, hash_info
def _transfer_file(self, from_tree, from_info):
    from dvc.utils import tmp_fname
    from dvc.utils.stream import HashedStreamReader

    tmp_info = self.tree.path_info / tmp_fname()
    with from_tree.open(
        from_info, mode="rb", chunk_size=from_tree.CHUNK_SIZE
    ) as stream:
        stream_reader = HashedStreamReader(stream)
        # Since we don't know the hash beforehand, we'll
        # upload it to a temporary location and then move
        # it.
        self.tree.upload_fobj(
            stream_reader,
            tmp_info,
            total=from_tree.getsize(from_info),
            desc=from_info.name,
        )

    hash_info = stream_reader.hash_info
    self.move(tmp_info, self.tree.hash_to_path_info(hash_info.value))
    return hash_info
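# Both _transfer_file versions depend on upload_fobj pulling the wrapped
# stream in chunks; each read is what feeds the incremental hash. A sketch of
# such a chunked copy (hypothetical helper; DVC's upload_fobj also handles
# remote filesystems and progress bars):

def _upload_fobj_sketch(fobj, to_path, total=None, chunk_size=1024 * 1024):
    """Drain a readable file-like object into to_path chunk by chunk."""
    copied = 0
    with open(to_path, "wb") as dest:
        while True:
            chunk = fobj.read(chunk_size)
            if not chunk:
                break
            dest.write(chunk)
            copied += len(chunk)
            if total is not None:
                print(f"\r{copied}/{total} bytes", end="")  # stand-in for tqdm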