def test_invalid_type(self):
    # get_frame_parameters() must reject None and unicode text input.
    with self.assertRaises(TypeError):
        zstd.get_frame_parameters(None)

    # Python 3 doesn't appear to convert unicode to Py_buffer.
    # On Python 2, CPython will convert unicode to Py_buffer (so the call
    # gets as far as raising ZstdError), but CFFI won't.
    if sys.version_info[0] < 3 and zstd.backend != 'cffi':
        expected_error = zstd.ZstdError
    else:
        expected_error = TypeError

    with self.assertRaises(expected_error):
        zstd.get_frame_parameters(u'foobarbaz')
def test_multithreaded(self):
    # Compress two chunk-sized runs with threads=2, then check the frame
    # parameters and that the data round-trips.
    half = multithreaded_chunk_size(1)
    payload = (b'x' * half) + (b'y' * half)

    compressor = zstd.ZstdCompressor(level=1, threads=2)
    frame = compressor.compress(payload)

    info = zstd.get_frame_parameters(frame)
    self.assertEqual(info.content_size, half * 2)
    self.assertEqual(info.dict_id, 0)
    self.assertFalse(info.has_checksum)

    decompressor = zstd.ZstdDecompressor()
    self.assertEqual(decompressor.decompress(frame), payload)
def test_write_checksum(self):
    # Stream the same payload through copy_stream() with and without
    # write_checksum and compare the two resulting frames.
    payload = io.BytesIO(b'foobar')

    plain = io.BytesIO()
    zstd.ZstdCompressor(level=1).copy_stream(payload, plain)

    # Rewind so the second compressor sees the same input.
    payload.seek(0)
    checked = io.BytesIO()
    zstd.ZstdCompressor(level=1, write_checksum=True).copy_stream(
        payload, checked)

    plain_frame = plain.getvalue()
    checked_frame = checked.getvalue()

    # The frame written with a checksum is expected to be 4 bytes longer.
    self.assertEqual(len(checked_frame), len(plain_frame) + 4)

    plain_params = zstd.get_frame_parameters(plain_frame)
    checked_params = zstd.get_frame_parameters(checked_frame)
    self.assertEqual(plain_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
    self.assertEqual(checked_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
    self.assertEqual(plain_params.dict_id, 0)
    self.assertEqual(checked_params.dict_id, 0)
    self.assertFalse(plain_params.has_checksum)
    self.assertTrue(checked_params.has_checksum)
def test_empty(self):
    # Writing an empty payload through stream_writer() must still emit a
    # complete frame with the expected bytes and parameters.
    sink = io.BytesIO()
    cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
    with cctx.stream_writer(sink) as writer:
        writer.write(b'')

    frame = sink.getvalue()
    self.assertEqual(frame, b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')

    info = zstd.get_frame_parameters(frame)
    self.assertEqual(info.content_size, zstd.CONTENTSIZE_UNKNOWN)
    self.assertEqual(info.window_size, 524288)
    self.assertEqual(info.dict_id, 0)
    self.assertFalse(info.has_checksum)
def test_write_content_size(self):
    # Check when stream_writer() records the content size in the frame.
    payload = b"foobar" * 256
    payload_len = len(payload)

    no_size = io.BytesIO()
    cctx = zstd.ZstdCompressor(level=1, write_content_size=False)
    with cctx.stream_writer(no_size, closefd=False) as writer:
        self.assertEqual(writer.write(payload), payload_len)

    with_size = io.BytesIO()
    cctx = zstd.ZstdCompressor(level=1)
    with cctx.stream_writer(with_size, closefd=False) as writer:
        self.assertEqual(writer.write(payload), payload_len)

    # Source size is not known in streaming mode, so header not
    # written.
    self.assertEqual(len(with_size.getvalue()), len(no_size.getvalue()))

    # Declaring size will write the header.
    with_size = io.BytesIO()
    with cctx.stream_writer(
        with_size, size=payload_len, closefd=False
    ) as writer:
        self.assertEqual(writer.write(payload), payload_len)

    no_size_frame = no_size.getvalue()
    with_size_frame = with_size.getvalue()

    no_params = zstd.get_frame_parameters(no_size_frame)
    with_params = zstd.get_frame_parameters(with_size_frame)
    self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
    self.assertEqual(with_params.content_size, 1536)
    self.assertEqual(no_params.dict_id, 0)
    self.assertEqual(with_params.dict_id, 0)
    self.assertFalse(no_params.has_checksum)
    self.assertFalse(with_params.has_checksum)

    self.assertEqual(len(with_size_frame), len(no_size_frame) + 1)
def test_read_large(self):
    # Feed read_to_iter() an input one byte longer than the recommended
    # input size and compare the result with one-shot compression.
    cctx = zstd.ZstdCompressor(level=1)

    source = io.BytesIO(
        b'f' * zstd.COMPRESSION_RECOMMENDED_INPUT_SIZE + b'o')
    data = source.getvalue()

    # Creating an iterator should not perform any compression until
    # first read.
    it = cctx.read_to_iter(source, size=len(data))
    self.assertEqual(source.tell(), 0)

    # We should have exactly 2 output chunks.
    first = next(it)
    self.assertIsNotNone(first)
    self.assertEqual(source.tell(), zstd.COMPRESSION_RECOMMENDED_INPUT_SIZE)

    second = next(it)
    self.assertIsNotNone(second)
    self.assertEqual(source.tell(), len(data))

    # The iterator must stay exhausted; check twice for good measure.
    for _ in range(2):
        with self.assertRaises(StopIteration):
            next(it)

    # We should get the same output as the one-shot compression mechanism.
    streamed = b''.join((first, second))
    self.assertEqual(streamed, cctx.compress(data))

    params = zstd.get_frame_parameters(streamed)
    self.assertEqual(params.content_size, 0)
    self.assertEqual(params.window_size, 262144)
    self.assertEqual(params.dict_id, 0)
    self.assertFalse(params.has_checksum)

    # Now check the buffer protocol.
    buffer_chunks = list(cctx.read_to_iter(data))
    self.assertEqual(len(buffer_chunks), 2)
    self.assertEqual(b''.join(buffer_chunks), cctx.compress(data))
def test_compress_empty(self):
    # One-shot compression of empty input: check the exact frame bytes and
    # the frame parameters, then the write_content_size=True behaviour.
    cctx = zstd.ZstdCompressor(level=1)
    result = cctx.compress(b'')
    self.assertEqual(result, b'\x28\xb5\x2f\xfd\x00\x48\x01\x00\x00')

    params = zstd.get_frame_parameters(result)
    self.assertEqual(params.content_size, 0)
    self.assertEqual(params.window_size, 524288)
    self.assertEqual(params.dict_id, 0)
    # assertFalse() takes (expr, msg); only the expression is wanted here.
    self.assertFalse(params.has_checksum)

    # TODO should be temporary until https://github.com/facebook/zstd/issues/506
    # is fixed.
    cctx = zstd.ZstdCompressor(write_content_size=True)
    with self.assertRaises(ValueError):
        cctx.compress(b'')

    # With allow_empty=True the same call is expected not to raise.
    cctx.compress(b'', allow_empty=True)
def test_compressobj_large(self):
    # Compress 255 runs of 16384 identical bytes (byte values 0..254)
    # through compressobj() and check the output size and frame parameters.
    byte_packer = struct.Struct('>B')
    payload = b''.join(byte_packer.pack(value) * 16384
                       for value in range(255))

    cctx = zstd.ZstdCompressor(level=3)
    cobj = cctx.compressobj()

    body = cobj.compress(payload)
    tail = cobj.flush()
    result = body + tail

    self.assertEqual(len(result), 999)
    self.assertEqual(result[0:4], b'\x28\xb5\x2f\xfd')

    info = zstd.get_frame_parameters(result)
    self.assertEqual(info.content_size, zstd.CONTENTSIZE_UNKNOWN)
    self.assertEqual(info.window_size, 1048576)
    self.assertEqual(info.dict_id, 0)
    self.assertFalse(info.has_checksum)
def test_large_data(self):
    # Push 255 runs of 16384 identical bytes through copy_stream() and
    # check the byte counts it reports plus the frame parameters.
    byte_packer = struct.Struct('>B')
    source = io.BytesIO(
        b''.join(byte_packer.pack(value) * 16384 for value in range(255)))
    dest = io.BytesIO()

    cctx = zstd.ZstdCompressor()
    bytes_read, bytes_written = cctx.copy_stream(source, dest)

    self.assertEqual(bytes_read, 255 * 16384)
    self.assertEqual(bytes_written, 999)

    info = zstd.get_frame_parameters(dest.getvalue())
    self.assertEqual(info.content_size, zstd.CONTENTSIZE_UNKNOWN)
    self.assertEqual(info.window_size, 1048576)
    self.assertEqual(info.dict_id, 0)
    self.assertFalse(info.has_checksum)
def test_input_types(self):
    # get_frame_parameters() should accept several buffer-like inputs and
    # report the same parameters for each.
    header = zstd.FRAME_HEADER + b'\x00\x00'

    # A pre-sized bytearray filled in place via slice assignment.
    filled_in_place = bytearray(len(header))
    filled_in_place[:] = header

    candidates = (memoryview(header), bytearray(header), filled_in_place)
    for candidate in candidates:
        info = zstd.get_frame_parameters(candidate)
        self.assertEqual(info.content_size, zstd.CONTENTSIZE_UNKNOWN)
        self.assertEqual(info.window_size, 1024)
        self.assertEqual(info.dict_id, 0)
        self.assertFalse(info.has_checksum)
def test_input_types(self):
    # get_frame_parameters() should accept several buffer-like inputs and
    # report the same parameters for each.
    header = zstd.FRAME_HEADER + b"\x00\x00"

    # A pre-sized bytearray filled in place via slice assignment.
    filled_in_place = bytearray(len(header))
    filled_in_place[:] = header

    candidates = (memoryview(header), bytearray(header), filled_in_place)
    for candidate in candidates:
        info = zstd.get_frame_parameters(candidate)
        self.assertEqual(info.content_size, zstd.CONTENTSIZE_UNKNOWN)
        self.assertEqual(info.window_size, 1024)
        self.assertEqual(info.dict_id, 0)
        self.assertFalse(info.has_checksum)
def test_multithreaded_dict(self):
    # Compress a tiny input with a trained dictionary and threads=2, then
    # check the frame parameters and the exact output bytes.
    # Same three samples repeated 128 times, in foo/bar/foobar order.
    samples = [b'foo' * 64, b'bar' * 64, b'foobar' * 64] * 128

    d = zstd.train_dictionary(1024, samples)

    cctx = zstd.ZstdCompressor(dict_data=d, threads=2)
    result = cctx.compress(b'foo')

    info = zstd.get_frame_parameters(result)
    self.assertEqual(info.content_size, 3)
    self.assertEqual(info.dict_id, d.dict_id())

    self.assertEqual(result,
                     b'\x28\xb5\x2f\xfd\x23\x06\x59\xb5\x52\x03\x19\x00\x00'
                     b'\x66\x6f\x6f')
def test_compression_params(self):
    # Compress through write_to() with explicit compression parameters and
    # check the frame parameters and a hash of the output.
    cparams = zstd.CompressionParameters(
        20, 6, 12, 5, 4, 10, zstd.STRATEGY_FAST)

    sink = io.BytesIO()
    cctx = zstd.ZstdCompressor(compression_params=cparams)
    with cctx.write_to(sink) as compressor:
        # Every write() is expected to report 0 here.
        for piece in (b'foo', b'bar', b'foobar' * 16384):
            self.assertEqual(compressor.write(piece), 0)

    compressed = sink.getvalue()

    frame_params = zstd.get_frame_parameters(compressed)
    self.assertEqual(frame_params.content_size, 0)
    self.assertEqual(frame_params.window_size, 1048576)
    self.assertEqual(frame_params.dict_id, 0)
    self.assertFalse(frame_params.has_checksum)

    digest = hashlib.sha1(compressed).hexdigest()
    self.assertEqual(digest, '1ae31f270ed7de14235221a604b31ecd517ebd99')
def test_attributes(self):
    # Parse hand-built frame headers and check every reported attribute.
    unknown = zstd.CONTENTSIZE_UNKNOWN

    # Each case: (bytes appended to zstd.FRAME_HEADER, expected
    # content_size, window_size, dict_id, has_checksum).
    cases = [
        (b"\x00\x00", unknown, 1024, 0, False),
        # Lowest 2 bits indicate a dictionary and length. Here, the dict id
        # is 1 byte.
        (b"\x01\x00\xff", unknown, 1024, 255, False),
        # Lowest 3rd bit indicates if checksum is present.
        (b"\x04\x00", unknown, 1024, 0, True),
        # Upper 2 bits indicate content size.
        (b"\x40\x00\xff\x00", 511, 1024, 0, False),
        # Window descriptor is 2nd byte after frame header.
        (b"\x00\x40", unknown, 262144, 0, False),
        # Set multiple things.
        (b"\x45\x40\x0f\x10\x00", 272, 262144, 15, True),
    ]

    for tail, content_size, window_size, dict_id, has_checksum in cases:
        params = zstd.get_frame_parameters(zstd.FRAME_HEADER + tail)
        self.assertEqual(params.content_size, content_size)
        self.assertEqual(params.window_size, window_size)
        self.assertEqual(params.dict_id, dict_id)
        if has_checksum:
            self.assertTrue(params.has_checksum)
        else:
            self.assertFalse(params.has_checksum)
def test_dictionary(self):
    # Train a dictionary, stream-compress with it, then check the frame
    # parameters, output hashes and the decompression round trip.
    # Same three samples repeated 128 times, in foo/bar/foobar order.
    samples = [b'foo' * 64, b'bar' * 64, b'foobar' * 64] * 128

    d = zstd.train_dictionary(8192, samples)
    dict_digest = hashlib.sha1(d.as_bytes()).hexdigest()
    self.assertEqual(dict_digest, '2b3b6428da5bf2c9cc9d4bb58ba0bc5990dd0e79')

    pieces = (b'foo', b'bar', b'foo' * 16384)

    sink = io.BytesIO()
    cctx = zstd.ZstdCompressor(level=9, dict_data=d)
    with cctx.stream_writer(sink) as compressor:
        # Every write() is expected to report 0 here.
        for piece in pieces:
            self.assertEqual(compressor.write(piece), 0)

    compressed = sink.getvalue()

    info = zstd.get_frame_parameters(compressed)
    self.assertEqual(info.content_size, zstd.CONTENTSIZE_UNKNOWN)
    self.assertEqual(info.window_size, 2097152)
    self.assertEqual(info.dict_id, d.dict_id())
    self.assertFalse(info.has_checksum)

    frame_digest = hashlib.sha1(compressed).hexdigest()
    self.assertEqual(frame_digest, 'd118cd7a008c7b8f416aa3a5f609eab4c629af95')

    source = b''.join(pieces)
    dctx = zstd.ZstdDecompressor(dict_data=d)
    restored = dctx.decompress(compressed, max_output_size=len(source))
    self.assertEqual(restored, source)
def from_file(cls: Type[DB], path: Union[str, PathLike], create_new=False) -> DB:
    """Load a Database from a path.

    The file is read as bytes and decompressed according to its suffix
    (``.gz`` via gzip, ``.zst`` via zstd, anything else is used as-is).
    The result is parsed with ``orjson`` and passed to ``cls.from_dict``.

    Args:
        path: Location of the database file.
        create_new: When true and ``path`` does not exist, log a warning and
            return a blank ``cls()`` instead of trying to open the file.

    Returns:
        The object built by ``cls.from_dict``, or a blank ``cls()`` when the
        file is missing and ``create_new`` is set.

    Raises:
        DatabaseException: If a ``.zst`` frame declares a content checksum
            and it does not match the decompressed data.
    """
    path = Path(path)
    if not path.exists() and create_new:
        logger = logging.getLogger(__name__)
        logger.warning(
            "Database file does not exist. Starting with blank database."
        )
        return cls()

    if path.suffix == ".gz":
        with gzip.open(path, "rb") as f:
            s = f.read()
    elif path.suffix == ".zst":
        with open(path, "rb") as f:
            c = f.read()
        # The file is fully in memory; the handle is closed before the
        # (potentially slow) decompression and hashing below.
        has_checksum = zstd.get_frame_parameters(c).has_checksum
        checksum = c[-4:]
        s = zstd.decompress(c)
        del c  # release the compressed copy before hashing
        if has_checksum:
            # The frame's trailing 4 bytes are compared against the last
            # 4 bytes of the XXH64 digest in reverse order (zstd stores the
            # low 32 bits of XXH64 little-endian; the digest is big-endian).
            expected = xxhash.xxh64_digest(s)[-4:][::-1]
            if checksum != expected:
                raise DatabaseException(
                    f"zstd content checksum verification failed: "
                    f"{checksum.hex()} != {expected.hex()}"
                )
    else:
        with open(path, "rb") as f:
            s = f.read()

    db = orjson.loads(s)
    del s  # the raw bytes are no longer needed once parsed
    db = cls.from_dict(db)
    return db
def test_dictionary(self):
    # Train a dictionary, stream-compress with it, then check the frame
    # parameters, output hashes and the decompression round trip.
    # Same three samples repeated 128 times, in foo/bar/foobar order.
    samples = [b"foo" * 64, b"bar" * 64, b"foobar" * 64] * 128

    d = zstd.train_dictionary(8192, samples)
    dict_digest = hashlib.sha1(d.as_bytes()).hexdigest()
    self.assertEqual(dict_digest, "e739fb6cecd613386b8fffc777f756f5e6115e73")

    pieces = (b"foo", b"bar", b"foo" * 16384)

    sink = io.BytesIO()
    cctx = zstd.ZstdCompressor(level=9, dict_data=d)
    with cctx.stream_writer(sink, closefd=False) as compressor:
        # Each write() is expected to report the number of bytes given.
        for piece in pieces:
            self.assertEqual(compressor.write(piece), len(piece))

    compressed = sink.getvalue()

    info = zstd.get_frame_parameters(compressed)
    self.assertEqual(info.content_size, zstd.CONTENTSIZE_UNKNOWN)
    self.assertEqual(info.window_size, 2097152)
    self.assertEqual(info.dict_id, d.dict_id())
    self.assertFalse(info.has_checksum)

    frame_digest = hashlib.sha1(compressed).hexdigest()
    self.assertEqual(frame_digest, "8703b4316f274d26697ea5dd480f29c08e85d940")

    source = b"".join(pieces)
    dctx = zstd.ZstdDecompressor(dict_data=d)
    restored = dctx.decompress(compressed, max_output_size=len(source))
    self.assertEqual(restored, source)
def test_dictionary(self):
    # Train a dictionary, stream-compress with it, then check the frame
    # parameters, output hashes and the decompression round trip.
    # Same three samples repeated 128 times, in foo/bar/foobar order.
    samples = [b"foo" * 64, b"bar" * 64, b"foobar" * 64] * 128

    d = zstd.train_dictionary(8192, samples)
    dict_digest = hashlib.sha1(d.as_bytes()).hexdigest()
    self.assertEqual(dict_digest, "a46d2f7a3bc3357c9d717d3dadf9a26fde23e93d")

    pieces = (b"foo", b"bar", b"foo" * 16384)

    sink = io.BytesIO()
    cctx = zstd.ZstdCompressor(level=9, dict_data=d)
    with cctx.stream_writer(sink, closefd=False) as compressor:
        # Each write() is expected to report the number of bytes given.
        for piece in pieces:
            self.assertEqual(compressor.write(piece), len(piece))

    compressed = sink.getvalue()

    info = zstd.get_frame_parameters(compressed)
    self.assertEqual(info.content_size, zstd.CONTENTSIZE_UNKNOWN)
    self.assertEqual(info.window_size, 4194304)
    self.assertEqual(info.dict_id, d.dict_id())
    self.assertFalse(info.has_checksum)

    frame_digest = hashlib.sha1(compressed).hexdigest()
    self.assertEqual(frame_digest, "f8ca6ebe269a822615e86d710c74d61cb4d4e3ca")

    source = b"".join(pieces)
    dctx = zstd.ZstdDecompressor(dict_data=d)
    restored = dctx.decompress(compressed, max_output_size=len(source))
    self.assertEqual(restored, source)
def test_attributes(self):
    # Parse hand-built frame headers and check every reported attribute.
    unknown = zstd.CONTENTSIZE_UNKNOWN

    def check(tail, content_size, window_size, dict_id, has_checksum):
        # Parse zstd.FRAME_HEADER + tail and compare all four attributes.
        params = zstd.get_frame_parameters(zstd.FRAME_HEADER + tail)
        self.assertEqual(params.content_size, content_size)
        self.assertEqual(params.window_size, window_size)
        self.assertEqual(params.dict_id, dict_id)
        if has_checksum:
            self.assertTrue(params.has_checksum)
        else:
            self.assertFalse(params.has_checksum)

    check(b'\x00\x00', unknown, 1024, 0, False)

    # Lowest 2 bits indicate a dictionary and length. Here, the dict id is
    # 1 byte.
    check(b'\x01\x00\xff', unknown, 1024, 255, False)

    # Lowest 3rd bit indicates if checksum is present.
    check(b'\x04\x00', unknown, 1024, 0, True)

    # Upper 2 bits indicate content size.
    check(b'\x40\x00\xff\x00', 511, 1024, 0, False)

    # Window descriptor is 2nd byte after frame header.
    check(b'\x00\x40', unknown, 262144, 0, False)

    # Set multiple things.
    check(b'\x45\x40\x0f\x10\x00', 272, 262144, 15, True)
def test_invalid_input_sizes(self):
    # An empty buffer and a bare zstd.FRAME_HEADER are both expected to
    # raise ZstdError with 'not enough data for frame'.
    #
    # assertRaisesRegexp is a deprecated alias that was removed in
    # Python 3.12, while Python 2.7 only offers that old spelling; use
    # whichever one this TestCase provides.
    assert_raises_regex = (getattr(self, 'assertRaisesRegex', None)
                           or self.assertRaisesRegexp)

    with assert_raises_regex(zstd.ZstdError, 'not enough data for frame'):
        zstd.get_frame_parameters(b'')

    with assert_raises_regex(zstd.ZstdError, 'not enough data for frame'):
        zstd.get_frame_parameters(zstd.FRAME_HEADER)
def test_invalid_frame(self):
    # Bytes that are not a zstd frame are expected to raise ZstdError
    # mentioning an unknown frame descriptor.
    not_a_frame = b"foobarbaz"
    expect_error = self.assertRaisesRegex(
        zstd.ZstdError, "Unknown frame descriptor")
    with expect_error:
        zstd.get_frame_parameters(not_a_frame)
def test_invalid_frame(self):
    # Bytes that are not a zstd frame are expected to raise ZstdError
    # mentioning an unknown frame descriptor.
    #
    # assertRaisesRegexp is a deprecated alias that was removed in
    # Python 3.12, while Python 2.7 only offers that old spelling; use
    # whichever one this TestCase provides.
    assert_raises_regex = (getattr(self, 'assertRaisesRegex', None)
                           or self.assertRaisesRegexp)

    with assert_raises_regex(zstd.ZstdError, 'Unknown frame descriptor'):
        zstd.get_frame_parameters(b'foobarbaz')
def test_invalid_type(self):
    # Neither None nor a text (unicode) string is accepted as input; both
    # are expected to raise TypeError.
    for bad_input in (None, u"foobarbaz"):
        with self.assertRaises(TypeError):
            zstd.get_frame_parameters(bad_input)