def test_no_dict_id(self):
    """write_dict_id=False must omit the 4-byte dictionary ID from the frame."""
    samples = [blob * 64 for _ in range(128)
               for blob in (b'foo', b'bar', b'foobar')]
    d = zstd.train_dictionary(1024, samples)

    with_dict_id = io.BytesIO()
    cctx = zstd.ZstdCompressor(level=1, dict_data=d)
    with cctx.write_to(with_dict_id) as compressor:
        self.assertEqual(compressor.write(b'foobarfoobar'), 0)

    cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_dict_id=False)
    no_dict_id = io.BytesIO()
    with cctx.write_to(no_dict_id) as compressor:
        self.assertEqual(compressor.write(b'foobarfoobar'), 0)

    no_params = zstd.get_frame_parameters(no_dict_id.getvalue())
    with_params = zstd.get_frame_parameters(with_dict_id.getvalue())
    self.assertEqual(no_params.content_size, 0)
    self.assertEqual(with_params.content_size, 0)
    self.assertEqual(no_params.dict_id, 0)
    self.assertEqual(with_params.dict_id, d.dict_id())
    self.assertFalse(no_params.has_checksum)
    self.assertFalse(with_params.has_checksum)

    # The only difference between the two frames is the 4-byte dict ID field.
    self.assertEqual(len(with_dict_id.getvalue()),
                     len(no_dict_id.getvalue()) + 4)
def test_no_dict_id(self):
    """Frames written with write_dict_id=False must lack the dict ID field."""
    samples = [blob * 64 for _ in range(128)
               for blob in (b"foo", b"bar", b"foobar")]
    d = zstd.train_dictionary(1024, samples)

    with_dict_id = io.BytesIO()
    cctx = zstd.ZstdCompressor(level=1, dict_data=d)
    with cctx.stream_writer(with_dict_id, closefd=False) as compressor:
        self.assertEqual(compressor.write(b"foobarfoobar"), 12)

    # Frame header descriptor byte advertises a dictionary ID.
    self.assertEqual(with_dict_id.getvalue()[4:5], b"\x03")

    cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_dict_id=False)
    no_dict_id = io.BytesIO()
    with cctx.stream_writer(no_dict_id, closefd=False) as compressor:
        self.assertEqual(compressor.write(b"foobarfoobar"), 12)

    self.assertEqual(no_dict_id.getvalue()[4:5], b"\x00")

    no_params = zstd.get_frame_parameters(no_dict_id.getvalue())
    with_params = zstd.get_frame_parameters(with_dict_id.getvalue())
    self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
    self.assertEqual(with_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
    self.assertEqual(no_params.dict_id, 0)
    self.assertEqual(with_params.dict_id, d.dict_id())
    self.assertFalse(no_params.has_checksum)
    self.assertFalse(with_params.has_checksum)

    # The frames differ only by the 4-byte dictionary ID field.
    self.assertEqual(len(with_dict_id.getvalue()),
                     len(no_dict_id.getvalue()) + 4)
def test_set_dict_id(self):
    """An explicit dict_id given at training time is stored verbatim."""
    trained = zstd.train_dictionary(
        8192, generate_samples(), k=64, d=16, dict_id=42
    )
    self.assertEqual(trained.dict_id(), 42)
def test_dictionary(self):
    """Dictionary training and compression are deterministic for this input."""
    samples = [blob * 64 for _ in range(128)
               for blob in (b'foo', b'bar', b'foobar')]
    d = zstd.train_dictionary(8192, samples)

    # Pin the trained dictionary content via its SHA-1.
    digest = hashlib.sha1(d.as_bytes()).hexdigest()
    self.assertEqual(digest, '3040faa0ddc37d50e71a4dd28052cb8db5d9d027')

    buffer = io.BytesIO()
    cctx = zstd.ZstdCompressor(level=9, dict_data=d)
    with cctx.stream_writer(buffer) as compressor:
        self.assertEqual(compressor.write(b'foo'), 0)
        self.assertEqual(compressor.write(b'bar'), 0)
        self.assertEqual(compressor.write(b'foo' * 16384), 0)

    compressed = buffer.getvalue()
    params = zstd.get_frame_parameters(compressed)
    self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN)
    self.assertEqual(params.window_size, 2097152)
    self.assertEqual(params.dict_id, d.dict_id())
    self.assertFalse(params.has_checksum)

    self.assertEqual(
        compressed,
        b'\x28\xb5\x2f\xfd\x03\x58\x06\x59\xb5\x52\x5d\x00'
        b'\x00\x00\x02\xfc\x3d\x3f\xd9\xb0\x51\x03\x45\x89')
def test_dictionary(self):
    """Round-trip data through dictionary-backed stream writers."""
    samples = [blob * 64 for _ in range(128)
               for blob in (b"foo", b"bar", b"foobar")]
    d = zstd.train_dictionary(8192, samples)
    orig = b"foobar" * 16384

    buffer = io.BytesIO()
    cctx = zstd.ZstdCompressor(dict_data=d)
    with cctx.stream_writer(buffer, closefd=False) as compressor:
        self.assertEqual(compressor.write(orig), len(orig))
    compressed = buffer.getvalue()

    # Decompress via a bare writer (no context manager)...
    buffer = io.BytesIO()
    dctx = zstd.ZstdDecompressor(dict_data=d)
    decompressor = dctx.stream_writer(buffer)
    self.assertEqual(decompressor.write(compressed), len(compressed))
    self.assertEqual(buffer.getvalue(), orig)

    # ...and again via the context-manager form.
    buffer = io.BytesIO()
    with dctx.stream_writer(buffer, closefd=False) as decompressor:
        self.assertEqual(decompressor.write(compressed), len(compressed))
    self.assertEqual(buffer.getvalue(), orig)
def test_dictionary(self):
    """Compression with a dictionary yields the expected deterministic frame."""
    samples = [blob * 64 for _ in range(128)
               for blob in (b'foo', b'bar', b'foobar')]
    d = zstd.train_dictionary(8192, samples)

    buffer = io.BytesIO()
    cctx = zstd.ZstdCompressor(level=9, dict_data=d)
    with cctx.write_to(buffer) as compressor:
        self.assertEqual(compressor.write(b'foo'), 0)
        self.assertEqual(compressor.write(b'bar'), 0)
        self.assertEqual(compressor.write(b'foo' * 16384), 634)

    compressed = buffer.getvalue()
    params = zstd.get_frame_parameters(compressed)
    self.assertEqual(params.content_size, 0)
    self.assertEqual(params.window_size, 1024)
    self.assertEqual(params.dict_id, d.dict_id())
    self.assertFalse(params.has_checksum)

    self.assertEqual(
        compressed[0:32],
        b'\x28\xb5\x2f\xfd\x03\x00\x55\x7b\x6b\x5e\x54\x00'
        b'\x00\x00\x02\xfc\xf4\xa5\xba\x23\x3f\x85\xb3\x54'
        b'\x00\x00\x18\x6f\x6f\x66\x01\x00')

    # Pin the remainder of the frame via its SHA-1.
    digest = hashlib.sha1(compressed).hexdigest()
    self.assertEqual(digest, '1c5bcd25181bcd8c1a73ea8773323e0056129f92')
def compression_dict():
    """Train a compression dictionary from synthetic sample text."""
    # TODO: update dictionary
    training_text = "aasfdkjgd kajgsdf af kajgakjsg fkajgs df\n" * 653
    # Prefix each line with random noise so samples are not identical.
    samples = [
        (str(random()) + line).encode()
        for line in training_text.split("\n")
    ]
    return zs.train_dictionary(dict_size=99999999, samples=samples, threads=-1)
def batch_done(self, batch_lines):
    """Fold a finished batch into the rolling history and retrain the dict."""
    history_limit = 400
    self.acc_lines.extend(batch_lines)
    # we truncate before training as in practice we'd keep a size limited history
    if len(self.acc_lines) > history_limit:
        self.acc_lines = self.acc_lines[-history_limit:]
    self.cur_dict = zstd.train_dictionary(self.train_dict_size, self.acc_lines)
def test_optimize(self):
    """Single-step cover optimization with fixed d selects a platform k."""
    trained = zstd.train_dictionary(
        8192, generate_samples(), threads=-1, steps=1, d=16
    )
    # This varies by platform.
    self.assertIn(trained.k, (50, 2000))
    self.assertEqual(trained.d, 16)
def test_set_dict_id(self):
    """dict_id passed to the trainer is recorded in the dictionary."""
    samples = [blob * 64 for _ in range(128)
               for blob in (b'foo', b'foobar')]
    trained = zstd.train_dictionary(8192, samples, dict_id=42)
    self.assertEqual(trained.dict_id(), 42)
def test_optimize(self):
    """Optimization with a single step yields deterministic cover params."""
    trained = zstd.train_dictionary(
        8192, generate_samples(), threads=-1, steps=1, d=16
    )
    self.assertEqual(trained.k, 50)
    self.assertEqual(trained.d, 16)
def test_bad_precompute_compress(self):
    """precompute_compress() needs exactly one of level/compression_params."""
    d = zstd.train_dictionary(8192, generate_samples(), k=64, d=16)

    # Neither argument supplied.
    with self.assertRaisesRegexp(ValueError, 'must specify one of level or '):
        d.precompute_compress()

    # Both arguments supplied.
    with self.assertRaisesRegexp(ValueError,
                                 'must only specify one of level or '):
        d.precompute_compress(
            level=3, compression_params=zstd.CompressionParameters())
def test_dict(self):
    """Frames compressed with a dictionary round-trip through the batch API."""
    d = zstd.train_dictionary(16384, generate_samples(), k=64, d=16)

    cctx = zstd.ZstdCompressor(dict_data=d, level=1)
    frames = [cctx.compress(sample) for sample in generate_samples()]

    dctx = zstd.ZstdDecompressor(dict_data=d)
    result = dctx.multi_decompress_to_buffer(frames)
    self.assertEqual([segment.tobytes() for segment in result],
                     generate_samples())
def test_bad_precompute_compress(self):
    """Zero or two of level/compression_params must raise ValueError."""
    trained = zstd.train_dictionary(8192, generate_samples(), k=64, d=16)

    with self.assertRaisesRegexp(ValueError, 'must specify one of level or '):
        trained.precompute_compress()

    with self.assertRaisesRegexp(ValueError,
                                 'must only specify one of level or '):
        trained.precompute_compress(
            level=3, compression_params=zstd.CompressionParameters())
def test_basic(self):
    """Trained dictionaries expose id, magic header, and cover parameters."""
    d = zstd.train_dictionary(8192, generate_samples(), k=64, d=16)
    self.assertIsInstance(d.dict_id(), int_type)

    raw = d.as_bytes()
    # Leading bytes are the zstd dictionary magic number.
    self.assertEqual(raw[0:4], b'\x37\xa4\x30\xec')

    self.assertEqual(d.k, 64)
    self.assertEqual(d.d, 16)
def test_dict(self):
    """multi_decompress_to_buffer recovers every dictionary-compressed frame."""
    trained = zstd.train_dictionary(16384, generate_samples(), k=64, d=16)

    compressor = zstd.ZstdCompressor(dict_data=trained, level=1)
    frames = [compressor.compress(s) for s in generate_samples()]

    decompressor = zstd.ZstdDecompressor(dict_data=trained)
    outputs = decompressor.multi_decompress_to_buffer(frames)
    self.assertEqual([buf.tobytes() for buf in outputs], generate_samples())
def test_basic(self):
    """A trained dictionary carries its id, magic header, and k/d params."""
    trained = zstd.train_dictionary(8192, generate_samples(), k=64, d=16)
    self.assertIsInstance(trained.dict_id(), int_type)

    # First four bytes are the zstd dictionary magic number.
    self.assertEqual(trained.as_bytes()[0:4], b'\x37\xa4\x30\xec')

    self.assertEqual(trained.k, 64)
    self.assertEqual(trained.d, 16)
def test_no_params(self):
    """Training without cover params still yields a valid dictionary header."""
    d = zstd.train_dictionary(8192, generate_samples())
    self.assertIsInstance(d.dict_id(), int_type)

    # The dictionary ID may be different across platforms.
    header = b'\x37\xa4\x30\xec' + struct.pack('<I', d.dict_id())
    self.assertEqual(d.as_bytes()[0:8], header)
def test_no_params(self):
    """Default training emits the magic header followed by the dict ID."""
    trained = zstd.train_dictionary(8192, generate_samples())
    self.assertIsInstance(trained.dict_id(), int_type)

    # The dictionary ID may be different across platforms.
    expected_prefix = b'\x37\xa4\x30\xec' + struct.pack('<I', trained.dict_id())
    self.assertEqual(trained.as_bytes()[0:8], expected_prefix)
def test_basic(self):
    """A trained dictionary records its cover parameters and magic header."""
    d = zstd.train_dictionary(8192, generate_samples(), k=500, d=8)
    self.assertIsInstance(d.dict_id(), int)

    raw = d.as_bytes()
    # Leading bytes are the zstd dictionary magic number.
    self.assertEqual(raw[0:4], b"\x37\xa4\x30\xec")

    self.assertEqual(d.k, 500)
    self.assertEqual(d.d, 8)
def test_no_params(self):
    """Default training produces the magic header followed by the dict ID."""
    d = zstd.train_dictionary(8192, random_input_data())
    self.assertIsInstance(d.dict_id(), int)

    # The dictionary ID may be different across platforms.
    header = b"\x37\xa4\x30\xec" + struct.pack("<I", d.dict_id())
    self.assertEqual(d.as_bytes()[0:8], header)
def train_dictionary(dss, dict_sz=8 * 1024):
    """Train a zstandard dictionary over serialized datasets/documents.

    Plain dicts are serialized via doc2bytes, everything else via ds2bytes.
    Returns the raw dictionary bytes.
    """
    def serialize(obj):
        if isinstance(obj, dict):
            _, payload = doc2bytes(obj)
        else:
            _, payload = ds2bytes(obj)
        return payload

    sample = [serialize(obj) for obj in dss]
    return zstandard.train_dictionary(dict_sz, sample).as_bytes()
def test_set_dict_id(self):
    """dict_id supplied at training time is preserved in the dictionary."""
    samples = generate_samples()
    trained = zstd.train_dictionary(
        get_optimal_dict_size_heuristically(samples),
        samples,
        k=64,
        d=8,
        dict_id=42,
    )
    self.assertEqual(trained.dict_id(), 42)
def test_basic(self):
    """A trained dictionary fits its budget and starts with the magic bytes."""
    samples = generate_samples()
    trained = zstd.train_dictionary(8192, samples)
    self.assertLessEqual(len(trained), 8192)

    dict_id = trained.dict_id()
    self.assertIsInstance(dict_id, int_type)

    # Leading bytes are the zstd dictionary magic number.
    self.assertEqual(trained.as_bytes()[0:4], b'\x37\xa4\x30\xec')
def _save_training_dict(training_set, f4_file_path, compression_level, num_processes):
    """Train a zstd dictionary over the training set and write it to disk."""
    training_list = sorted(training_set)
    # Budget the dictionary size by the total character count of the samples.
    total_chars = sum(len(item) for item in training_set)

    trained = zstandard.train_dictionary(
        total_chars, training_list,
        level=compression_level, threads=num_processes)

    dict_path = CompressionHelper._get_training_dict_file_path(f4_file_path)
    with open(dict_path, "wb") as dict_file:
        dict_file.write(trained.as_bytes())
def train_dictionary(docs, dict_sz=8 * 1024):
    """
    Given a finite sequence of Datasets train zstandard compression
    dictionary of a given size.

    Accepts both `Dataset` as well as "raw" datasets.

    Will return None if input sequence is empty.
    """
    sample = [payload for _, payload in map(doc2bytes, docs)]
    if not sample:
        return None
    return zstandard.train_dictionary(dict_sz, sample).as_bytes()
def test_dict(self):
    """Dictionary-compressed frames round-trip via multi_decompress_to_buffer."""
    samples = generate_samples()
    opt_size = get_optimal_dict_size_heuristically(samples)
    d = zstd.train_dictionary(opt_size, samples, k=64, d=8)

    cctx = zstd.ZstdCompressor(dict_data=d, level=1)
    frames = [cctx.compress(sample) for sample in generate_samples()]

    dctx = zstd.ZstdDecompressor(dict_data=d)
    result = dctx.multi_decompress_to_buffer(frames)
    self.assertEqual([segment.tobytes() for segment in result], samples)
def test_compress_dict_multiple(self):
    """A compressor with dict_data can be reused across many compress calls."""
    samples = [blob * 64 for _ in range(128)
               for blob in (b'foo', b'bar', b'foobar')]
    d = zstd.train_dictionary(8192, samples)

    cctx = zstd.ZstdCompressor(level=1, dict_data=d)
    for _ in range(32):
        cctx.compress(b'foo bar foobar foo bar foobar')
def create_dictionary(database: Database):
    """Train a zstd dictionary from up to 10000 entry contents in the DB."""
    # make this parse through the entire entries table and make
    # a dictionary from it...
    dict_size = 10485760  # 10 MiB budget

    rows = database.query("SELECT content from entries limit 10000")
    samples = [bytes(row["content"], encoding='utf8') for row in rows]

    # https://python-zstandard.readthedocs.io/en/latest/compressor.html
    return zstd.train_dictionary(dict_size, samples)
def test_optimize(self):
    """Single-step cover optimization with fixed d picks a platform-specific k."""
    samples = generate_samples()
    trained = zstd.train_dictionary(
        get_optimal_dict_size_heuristically(samples),
        samples,
        threads=-1,
        steps=1,
        d=6,
        notifications=2,
    )
    # This varies by platform.
    self.assertIn(trained.k, (50, 2000))
    self.assertEqual(trained.d, 6)
def test_dict_precompute(self):
    """precompute_compress() must not break subsequent compress() calls."""
    samples = [blob * 64 for _ in range(128)
               for blob in (b"foo", b"bar", b"foobar")]
    d = zstd.train_dictionary(8192, samples)
    d.precompute_compress(level=1)

    cctx = zstd.ZstdCompressor(level=1, dict_data=d)
    for _ in range(32):
        cctx.compress(b"foo bar foobar foo bar foobar")
def test_bad_precompute_compress(self):
    """precompute_compress() rejects zero or two of level/compression_params."""
    samples = generate_samples()
    d = zstd.train_dictionary(
        get_optimal_dict_size_heuristically(samples), samples, k=64, d=8
    )

    # Neither argument supplied.
    with self.assertRaisesRegex(ValueError, "must specify one of level or "):
        d.precompute_compress()

    # Both arguments supplied.
    with self.assertRaisesRegex(ValueError,
                                "must only specify one of level or "):
        d.precompute_compress(
            level=3, compression_params=zstd.ZstdCompressionParameters()
        )
def train_dictionary(docs: Iterable[Document_], dict_sz: int = 8 * 1024) -> Optional[bytes]:
    """Given a finite sequence of Documents train zstandard compression
    dictionary of a given size.

    Document is either a

    - (id, {..}) tuple
    - {id: str|UUID, ...} a dictionary with `id` key of type UUID
      (possibly string formatted UUID)

    Will return None if input sequence is empty.
    """
    sample = [payload for _, payload in map(doc2bytes, docs)]
    if not sample:
        return None
    return zstandard.train_dictionary(dict_sz, sample).as_bytes()
def process_header(self, dict_dump):
    """Return the compressed size of the header dump plus its zstd dictionary.

    Retrains self.cur_zdict from the current key set, then sums the size of
    dict_dump compressed with that dictionary and the size of the dictionary
    itself compressed without one.
    """
    # train on each action independently
    train_data = [bytes(key, 'utf-8') for key in self.cur_dict]
    self.cur_zdict = zstd.train_dictionary(self.max_zdict_size, train_data)

    payload = bytes(dict_dump, 'utf-8')
    dedup_dict_size = len(
        zstd.ZstdCompressor(level=self.level,
                            dict_data=self.cur_zdict).compress(payload))

    dict_bytes = self.cur_zdict.as_bytes()
    comp_dict_size = len(
        zstd.ZstdCompressor(level=self.level).compress(dict_bytes))

    return dedup_dict_size + comp_dict_size
def test_dictionary(self):
    """Data compressed with a dictionary decompresses back to the original."""
    samples = [blob * 64 for _ in range(128)
               for blob in (b'foo', b'bar', b'foobar')]
    d = zstd.train_dictionary(8192, samples)
    orig = b'foobar' * 16384

    cctx = zstd.ZstdCompressor(level=1, dict_data=d)
    compressed = cctx.compress(orig)

    dctx = zstd.ZstdDecompressor(dict_data=d)
    self.assertEqual(dctx.decompress(compressed), orig)
def test_dictionary_multiple(self):
    """A single dictionary round-trips several independent sources."""
    samples = [blob * 64 for _ in range(128)
               for blob in (b'foo', b'bar', b'foobar')]
    d = zstd.train_dictionary(8192, samples)

    sources = (b'foobar' * 8192, b'foo' * 8192, b'bar' * 8192)
    cctx = zstd.ZstdCompressor(level=1, dict_data=d)
    compressed = [cctx.compress(source) for source in sources]

    dctx = zstd.ZstdDecompressor(dict_data=d)
    for source, frame in zip(sources, compressed):
        self.assertEqual(dctx.decompress(frame), source)
def test_dictionary(self):
    """Stream writers round-trip data when both sides share a dictionary."""
    samples = [blob * 64 for _ in range(128)
               for blob in (b'foo', b'bar', b'foobar')]
    d = zstd.train_dictionary(8192, samples)
    orig = b'foobar' * 16384

    buffer = io.BytesIO()
    cctx = zstd.ZstdCompressor(dict_data=d)
    with cctx.stream_writer(buffer) as compressor:
        self.assertEqual(compressor.write(orig), 0)
    compressed = buffer.getvalue()

    buffer = io.BytesIO()
    dctx = zstd.ZstdDecompressor(dict_data=d)
    with dctx.stream_writer(buffer) as decompressor:
        self.assertEqual(decompressor.write(compressed), len(orig))
    self.assertEqual(buffer.getvalue(), orig)
def test_optimize(self):
    """One optimization step with fixed d yields deterministic k and d."""
    d = zstd.train_dictionary(
        8192, generate_samples(), threads=-1, steps=1, d=16
    )
    self.assertEqual(d.k, 50)
    self.assertEqual(d.d, 16)
def test_no_args(self):
    """Calling the trainer with no arguments is a TypeError."""
    with self.assertRaises(TypeError):
        zstd.train_dictionary()
def test_bad_args(self):
    """Non-bytes sample inputs are rejected with the appropriate exception."""
    # A bare string where a collection of samples is expected.
    with self.assertRaises(TypeError):
        zstd.train_dictionary(8192, u'foo')

    # A collection containing a non-bytes element.
    with self.assertRaises(ValueError):
        zstd.train_dictionary(8192, [u'foo'])
def test_set_dict_id(self):
    """The trainer honors an explicitly requested dictionary ID."""
    d = zstd.train_dictionary(
        8192, generate_samples(), k=64, d=16, dict_id=42
    )
    self.assertEqual(d.dict_id(), 42)
else: training_chunks = chunks train_args = { 'level': args.level, } if args.cover_k: train_args['k'] = args.cover_k if args.cover_d: train_args['d'] = args.cover_d # Always use all available threads in optimize mode. train_args['threads'] = -1 dict_data = zstd.train_dictionary(args.dict_size, training_chunks, **train_args) print('trained dictionary of size %d (wanted %d) (l=%d)' % ( len(dict_data), args.dict_size, args.level)) if args.zlib and args.discrete: compressed_discrete_zlib = [] ratios = [] for chunk in chunks: c = zlib.compress(chunk, args.zlib_level) compressed_discrete_zlib.append(c) ratios.append(float(len(c)) / float(len(chunk))) compressed_size = sum(map(len, compressed_discrete_zlib)) ratio = float(compressed_size) / float(orig_size) * 100.0 bad_count = sum(1 for r in ratios if r >= 1.00) good_ratio = 100.0 - (float(bad_count) / float(len(chunks)) * 100.0)