Exemplo n.º 1
0
    def test_no_dict_id(self):
        """Compare frames written with and without ``write_dict_id``."""
        # Same 384 samples, in the same order, as appending the three
        # patterns one after another 128 times.
        samples = [b'foo' * 64, b'bar' * 64, b'foobar' * 64] * 128
        d = zstd.train_dictionary(1024, samples)

        payload = b'foobarfoobar'

        # Frame produced with the default settings.
        with_dict_id = io.BytesIO()
        cctx = zstd.ZstdCompressor(level=1, dict_data=d)
        with cctx.write_to(with_dict_id) as compressor:
            self.assertEqual(compressor.write(payload), 0)

        # Frame produced with the dictionary ID suppressed.
        cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_dict_id=False)
        no_dict_id = io.BytesIO()
        with cctx.write_to(no_dict_id) as compressor:
            self.assertEqual(compressor.write(payload), 0)

        no_params = zstd.get_frame_parameters(no_dict_id.getvalue())
        with_params = zstd.get_frame_parameters(with_dict_id.getvalue())

        self.assertEqual(no_params.content_size, 0)
        self.assertEqual(with_params.content_size, 0)
        self.assertEqual(no_params.dict_id, 0)
        self.assertEqual(with_params.dict_id, d.dict_id())
        self.assertFalse(no_params.has_checksum)
        self.assertFalse(with_params.has_checksum)

        # The frame carrying the dictionary ID is expected to be 4 bytes longer.
        with_len = len(with_dict_id.getvalue())
        no_len = len(no_dict_id.getvalue())
        self.assertEqual(with_len, no_len + 4)
Exemplo n.º 2
0
    def test_no_dict_id(self):
        """Compare stream_writer frames written with and without ``write_dict_id``."""
        # Same 384 samples, in the same order, as appending the three
        # patterns one after another 128 times.
        samples = [b"foo" * 64, b"bar" * 64, b"foobar" * 64] * 128
        d = zstd.train_dictionary(1024, samples)

        payload = b"foobarfoobar"

        # Frame produced with the default settings.
        with_dict_id = io.BytesIO()
        cctx = zstd.ZstdCompressor(level=1, dict_data=d)
        with cctx.stream_writer(with_dict_id, closefd=False) as compressor:
            self.assertEqual(compressor.write(payload), 12)

        # Byte 4 of the frame (right after the 4-byte magic) is pinned here.
        self.assertEqual(with_dict_id.getvalue()[4:5], b"\x03")

        # Frame produced with the dictionary ID suppressed.
        cctx = zstd.ZstdCompressor(level=1, dict_data=d, write_dict_id=False)
        no_dict_id = io.BytesIO()
        with cctx.stream_writer(no_dict_id, closefd=False) as compressor:
            self.assertEqual(compressor.write(payload), 12)

        self.assertEqual(no_dict_id.getvalue()[4:5], b"\x00")

        no_params = zstd.get_frame_parameters(no_dict_id.getvalue())
        with_params = zstd.get_frame_parameters(with_dict_id.getvalue())

        self.assertEqual(no_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
        self.assertEqual(with_params.content_size, zstd.CONTENTSIZE_UNKNOWN)
        self.assertEqual(no_params.dict_id, 0)
        self.assertEqual(with_params.dict_id, d.dict_id())
        self.assertFalse(no_params.has_checksum)
        self.assertFalse(with_params.has_checksum)

        # The frame carrying the dictionary ID is expected to be 4 bytes longer.
        with_len = len(with_dict_id.getvalue())
        no_len = len(no_dict_id.getvalue())
        self.assertEqual(with_len, no_len + 4)
Exemplo n.º 3
0
 def test_set_dict_id(self):
     """An explicit ``dict_id`` passed to training is reported by ``dict_id()``."""
     train_kwargs = dict(k=64, d=16, dict_id=42)
     trained = zstd.train_dictionary(8192, generate_samples(), **train_kwargs)
     self.assertEqual(trained.dict_id(), 42)
Exemplo n.º 4
0
    def test_dictionary(self):
        """Stream-compress with a trained dictionary and pin the exact output bytes."""
        # Same 384 samples, in the same order, as appending the three
        # patterns one after another 128 times.
        samples = [b'foo' * 64, b'bar' * 64, b'foobar' * 64] * 128
        d = zstd.train_dictionary(8192, samples)

        # The trained dictionary content is pinned via its SHA-1 digest.
        dict_digest = hashlib.sha1(d.as_bytes()).hexdigest()
        self.assertEqual(dict_digest, '3040faa0ddc37d50e71a4dd28052cb8db5d9d027')

        buffer = io.BytesIO()
        cctx = zstd.ZstdCompressor(level=9, dict_data=d)
        with cctx.stream_writer(buffer) as compressor:
            for chunk in (b'foo', b'bar', b'foo' * 16384):
                self.assertEqual(compressor.write(chunk), 0)

        compressed = buffer.getvalue()

        params = zstd.get_frame_parameters(compressed)
        self.assertEqual(params.content_size, zstd.CONTENTSIZE_UNKNOWN)
        self.assertEqual(params.window_size, 2097152)
        self.assertEqual(params.dict_id, d.dict_id())
        self.assertFalse(params.has_checksum)

        expected_frame = (
            b'\x28\xb5\x2f\xfd\x03\x58\x06\x59\xb5\x52\x5d\x00'
            b'\x00\x00\x02\xfc\x3d\x3f\xd9\xb0\x51\x03\x45\x89'
        )
        self.assertEqual(compressed, expected_frame)
    def test_dictionary(self):
        """Round-trip data through dictionary-based stream writers."""
        # Same 384 samples, in the same order, as appending the three
        # patterns one after another 128 times.
        samples = [b"foo" * 64, b"bar" * 64, b"foobar" * 64] * 128
        d = zstd.train_dictionary(8192, samples)

        orig = b"foobar" * 16384

        # Compress through a stream writer that leaves the sink open.
        sink = io.BytesIO()
        cctx = zstd.ZstdCompressor(dict_data=d)
        with cctx.stream_writer(sink, closefd=False) as compressor:
            self.assertEqual(compressor.write(orig), len(orig))
        compressed = sink.getvalue()

        dctx = zstd.ZstdDecompressor(dict_data=d)

        # First pass: use the decompression writer directly, without ``with``.
        sink = io.BytesIO()
        decompressor = dctx.stream_writer(sink)
        self.assertEqual(decompressor.write(compressed), len(compressed))
        self.assertEqual(sink.getvalue(), orig)

        # Second pass: same check through the context-manager form.
        sink = io.BytesIO()
        with dctx.stream_writer(sink, closefd=False) as decompressor:
            self.assertEqual(decompressor.write(compressed), len(compressed))

        self.assertEqual(sink.getvalue(), orig)
Exemplo n.º 6
0
    def test_dictionary(self):
        """Compress via ``write_to`` with a dictionary and pin the frame contents."""
        # Same 384 samples, in the same order, as appending the three
        # patterns one after another 128 times.
        samples = [b'foo' * 64, b'bar' * 64, b'foobar' * 64] * 128
        d = zstd.train_dictionary(8192, samples)

        buffer = io.BytesIO()
        cctx = zstd.ZstdCompressor(level=9, dict_data=d)
        # Each entry pairs a chunk with the value its write() call must return.
        writes = (
            (b'foo', 0),
            (b'bar', 0),
            (b'foo' * 16384, 634),
        )
        with cctx.write_to(buffer) as compressor:
            for chunk, expected_return in writes:
                self.assertEqual(compressor.write(chunk), expected_return)

        compressed = buffer.getvalue()

        params = zstd.get_frame_parameters(compressed)
        self.assertEqual(params.content_size, 0)
        self.assertEqual(params.window_size, 1024)
        self.assertEqual(params.dict_id, d.dict_id())
        self.assertFalse(params.has_checksum)

        # Pin the first 32 bytes explicitly and the full frame via SHA-1.
        expected_prefix = (
            b'\x28\xb5\x2f\xfd\x03\x00\x55\x7b\x6b\x5e\x54\x00'
            b'\x00\x00\x02\xfc\xf4\xa5\xba\x23\x3f\x85\xb3\x54'
            b'\x00\x00\x18\x6f\x6f\x66\x01\x00'
        )
        self.assertEqual(compressed[0:32], expected_prefix)

        frame_digest = hashlib.sha1(compressed).hexdigest()
        self.assertEqual(frame_digest, '1c5bcd25181bcd8c1a73ea8773323e0056129f92')
Exemplo n.º 7
0
def compression_dict():  # TODO: update dictionary
    """Train a dictionary from randomly prefixed copies of a fixed text line.

    Returns whatever ``zs.train_dictionary`` produces for those samples.
    """
    training_text = "aasfdkjgd kajgsdf af kajgakjsg fkajgs df\n" * 653

    samples = []
    for line in training_text.split("\n"):
        # A random prefix keeps the samples from being identical.
        samples.append((str(random()) + line).encode())

    return zs.train_dictionary(dict_size=99999999, samples=samples, threads=-1)
Exemplo n.º 8
0
    def batch_done(self, batch_lines):
        """Accumulate a finished batch and retrain the current dictionary.

        Only the 400 most recent accumulated lines are kept for training.
        """
        history_limit = 400

        self.acc_lines.extend(batch_lines)
        # Truncate before training: in practice only a size-limited history
        # would be kept around.
        if len(self.acc_lines) > history_limit:
            self.acc_lines = self.acc_lines[-history_limit:]

        retrained = zstd.train_dictionary(self.train_dict_size, self.acc_lines)
        self.cur_dict = retrained
Exemplo n.º 9
0
    def test_optimize(self):
        """Train with ``threads=-1, steps=1`` and check the resulting ``k``/``d``."""
        optimize_kwargs = dict(threads=-1, steps=1, d=16)
        trained = zstd.train_dictionary(8192, generate_samples(), **optimize_kwargs)

        # This varies by platform.
        accepted_k = (50, 2000)
        self.assertIn(trained.k, accepted_k)
        self.assertEqual(trained.d, 16)
    def test_set_dict_id(self):
        """An explicit ``dict_id`` passed to training is reported by ``dict_id()``."""
        # Same 256 samples, in the same order, as appending the two
        # patterns one after another 128 times.
        samples = [b'foo' * 64, b'foobar' * 64] * 128

        trained = zstd.train_dictionary(8192, samples, dict_id=42)
        self.assertEqual(trained.dict_id(), 42)
Exemplo n.º 11
0
    def test_optimize(self):
        """Train with ``threads=-1, steps=1`` and check the resulting ``k``/``d``."""
        optimize_kwargs = dict(threads=-1, steps=1, d=16)
        trained = zstd.train_dictionary(8192, generate_samples(), **optimize_kwargs)

        self.assertEqual(trained.k, 50)
        self.assertEqual(trained.d, 16)
    def test_bad_precompute_compress(self):
        """``precompute_compress()`` must reject missing and conflicting arguments."""
        d = zstd.train_dictionary(8192, generate_samples(), k=64, d=16)

        # ``assertRaisesRegexp`` is a deprecated alias that was removed in
        # Python 3.12. Prefer the modern name and fall back to the old one
        # only on interpreters that do not provide it.
        assert_raises_regex = (getattr(self, 'assertRaisesRegex', None)
                               or self.assertRaisesRegexp)

        # Neither ``level`` nor ``compression_params`` supplied.
        with assert_raises_regex(ValueError, 'must specify one of level or '):
            d.precompute_compress()

        # Both supplied at once.
        with assert_raises_regex(ValueError, 'must only specify one of level or '):
            d.precompute_compress(level=3,
                                  compression_params=zstd.CompressionParameters())
Exemplo n.º 13
0
    def test_dict(self):
        """Batch-decompress dictionary-compressed frames back to the samples."""
        d = zstd.train_dictionary(16384, generate_samples(), k=64, d=16)

        cctx = zstd.ZstdCompressor(dict_data=d, level=1)
        frames = list(map(cctx.compress, generate_samples()))

        dctx = zstd.ZstdDecompressor(dict_data=d)
        result = dctx.multi_decompress_to_buffer(frames)

        decompressed = [segment.tobytes() for segment in result]
        self.assertEqual(decompressed, generate_samples())
Exemplo n.º 14
0
    def test_bad_precompute_compress(self):
        """``precompute_compress()`` must reject missing and conflicting arguments."""
        d = zstd.train_dictionary(8192, generate_samples(), k=64, d=16)

        # ``assertRaisesRegexp`` is a deprecated alias that was removed in
        # Python 3.12. Prefer the modern name and fall back to the old one
        # only on interpreters that do not provide it.
        assert_raises_regex = (getattr(self, 'assertRaisesRegex', None)
                               or self.assertRaisesRegexp)

        # Neither ``level`` nor ``compression_params`` supplied.
        with assert_raises_regex(ValueError, 'must specify one of level or '):
            d.precompute_compress()

        # Both supplied at once.
        with assert_raises_regex(ValueError, 'must only specify one of level or '):
            d.precompute_compress(level=3,
                                  compression_params=zstd.CompressionParameters())
Exemplo n.º 15
0
    def test_basic(self):
        """Train with explicit ``k``/``d`` and check ID type, header bytes and parameters."""
        trained = zstd.train_dictionary(8192, generate_samples(), k=64, d=16)
        self.assertIsInstance(trained.dict_id(), int_type)

        # The first four bytes of the serialized dictionary are pinned.
        header = trained.as_bytes()[0:4]
        self.assertEqual(header, b'\x37\xa4\x30\xec')

        self.assertEqual(trained.k, 64)
        self.assertEqual(trained.d, 16)
Exemplo n.º 16
0
    def test_dict(self):
        """Batch-decompress dictionary-compressed frames back to the samples."""
        d = zstd.train_dictionary(16384, generate_samples(), k=64, d=16)

        cctx = zstd.ZstdCompressor(dict_data=d, level=1)
        frames = []
        for sample in generate_samples():
            frames.append(cctx.compress(sample))

        dctx = zstd.ZstdDecompressor(dict_data=d)
        result = dctx.multi_decompress_to_buffer(frames)

        decompressed = [segment.tobytes() for segment in result]
        self.assertEqual(decompressed, generate_samples())
    def test_basic(self):
        """Train with explicit ``k``/``d`` and check ID type, header bytes and parameters."""
        trained = zstd.train_dictionary(8192, generate_samples(), k=64, d=16)
        self.assertIsInstance(trained.dict_id(), int_type)

        # The first four bytes of the serialized dictionary are pinned.
        serialized = trained.as_bytes()
        self.assertEqual(serialized[0:4], b'\x37\xa4\x30\xec')

        for attr, expected in (('k', 64), ('d', 16)):
            self.assertEqual(getattr(trained, attr), expected)
Exemplo n.º 18
0
    def test_no_params(self):
        """Train without tuning parameters and check the serialized header."""
        trained = zstd.train_dictionary(8192, generate_samples())
        self.assertIsInstance(trained.dict_id(), int_type)

        # The dictionary ID may be different across platforms, so the expected
        # header is built from the ID the dictionary reports.
        magic = b'\x37\xa4\x30\xec'
        expected = magic + struct.pack('<I', trained.dict_id())

        header = trained.as_bytes()[0:8]
        self.assertEqual(header, expected)
Exemplo n.º 19
0
    def test_no_params(self):
        """Train without tuning parameters and check the serialized header."""
        trained = zstd.train_dictionary(8192, generate_samples())
        self.assertIsInstance(trained.dict_id(), int_type)

        # The dictionary ID may be different across platforms, so it is
        # packed from the value the dictionary reports.
        packed_id = struct.pack('<I', trained.dict_id())
        expected = b'\x37\xa4\x30\xec' + packed_id

        serialized = trained.as_bytes()
        self.assertEqual(serialized[0:8], expected)
    def test_basic(self):
        """Train with ``k=500, d=8`` and check ID type, header bytes and parameters."""
        trained = zstd.train_dictionary(8192, generate_samples(), k=500, d=8)
        self.assertIsInstance(trained.dict_id(), int)

        # The first four bytes of the serialized dictionary are pinned.
        header = trained.as_bytes()[0:4]
        self.assertEqual(header, b"\x37\xa4\x30\xec")

        self.assertEqual(trained.k, 500)
        self.assertEqual(trained.d, 8)
    def test_no_params(self):
        """Train on ``random_input_data()`` without tuning parameters; check the header."""
        trained = zstd.train_dictionary(8192, random_input_data())
        self.assertIsInstance(trained.dict_id(), int)

        # The dictionary ID may be different across platforms, so the expected
        # header is built from the ID the dictionary reports.
        magic = b"\x37\xa4\x30\xec"
        expected = magic + struct.pack("<I", trained.dict_id())

        header = trained.as_bytes()[0:8]
        self.assertEqual(header, expected)
Exemplo n.º 22
0
def train_dictionary(dss, dict_sz=8 * 1024):
    """Train a zstandard dictionary from ``dss`` and return it as bytes.

    Each item is serialized with ``doc2bytes`` when it is a ``dict`` and with
    ``ds2bytes`` otherwise; the second element of the returned pair is used
    as the training sample.
    """
    def to_bytes(o):
        serialize = doc2bytes if isinstance(o, dict) else ds2bytes
        _, payload = serialize(o)
        return payload

    sample = [to_bytes(item) for item in dss]
    trained = zstandard.train_dictionary(dict_sz, sample)
    return trained.as_bytes()
 def test_set_dict_id(self):
     """An explicit ``dict_id`` passed to training is reported by ``dict_id()``."""
     samples = generate_samples()
     target_size = get_optimal_dict_size_heuristically(samples)

     trained = zstd.train_dictionary(target_size, samples, k=64, d=8, dict_id=42)
     self.assertEqual(trained.dict_id(), 42)
    def test_basic(self):
        """Train with defaults and check size bound, ID type and header bytes."""
        requested_size = 8192
        trained = zstd.train_dictionary(requested_size, generate_samples())

        # The dictionary must not exceed the requested size.
        self.assertLessEqual(len(trained), requested_size)

        self.assertIsInstance(trained.dict_id(), int_type)

        # The first four bytes of the serialized dictionary are pinned.
        header = trained.as_bytes()[0:4]
        self.assertEqual(header, b'\x37\xa4\x30\xec')
Exemplo n.º 25
0
    def _save_training_dict(training_set, f4_file_path, compression_level,
                            num_processes):
        """Train a zstandard dictionary on ``training_set`` and write it to disk.

        The samples are passed to training in sorted order, the requested
        dictionary size is the summed length of all samples, and the output
        path is derived from ``f4_file_path`` by
        ``CompressionHelper._get_training_dict_file_path``.
        """
        training_list = sorted(training_set)
        training_num_chars = sum(len(item) for item in training_set)

        training_dict = zstandard.train_dictionary(
            training_num_chars,
            training_list,
            level=compression_level,
            threads=num_processes,
        )

        dict_path = CompressionHelper._get_training_dict_file_path(f4_file_path)
        with open(dict_path, "wb") as dict_file:
            dict_file.write(training_dict.as_bytes())
Exemplo n.º 26
0
def train_dictionary(docs, dict_sz=8 * 1024):
    """Train a zstandard compression dictionary of the given size.

    ``docs`` is a finite sequence of Datasets; both `Dataset` objects and
    "raw" datasets are accepted. Each one is serialized with ``doc2bytes``
    and the second element of the returned pair is used as a sample.

    Returns the trained dictionary as bytes, or None when ``docs`` is empty.
    """
    sample = [payload for _, payload in map(doc2bytes, docs)]

    if not sample:
        return None

    trained = zstandard.train_dictionary(dict_sz, sample)
    return trained.as_bytes()
    def test_dict(self):
        """Batch-decompress dictionary-compressed frames back to the samples."""
        samples = generate_samples()
        dict_size = get_optimal_dict_size_heuristically(samples)
        d = zstd.train_dictionary(dict_size, samples, k=64, d=8)

        # Frames are built from a fresh generate_samples() call, then compared
        # against the list used for training.
        cctx = zstd.ZstdCompressor(dict_data=d, level=1)
        frames = list(map(cctx.compress, generate_samples()))

        dctx = zstd.ZstdDecompressor(dict_data=d)
        result = dctx.multi_decompress_to_buffer(frames)

        decompressed = [segment.tobytes() for segment in result]
        self.assertEqual(decompressed, samples)
Exemplo n.º 28
0
    def test_compress_dict_multiple(self):
        """Reuse one dictionary-backed compressor for 32 consecutive compress() calls."""
        # Same 384 samples, in the same order, as appending the three
        # patterns one after another 128 times.
        samples = [b'foo' * 64, b'bar' * 64, b'foobar' * 64] * 128
        d = zstd.train_dictionary(8192, samples)

        cctx = zstd.ZstdCompressor(level=1, dict_data=d)

        # No assertions: the test passes as long as no call raises.
        for _ in range(32):
            cctx.compress(b'foo bar foobar foo bar foobar')
Exemplo n.º 29
0
def create_dictionary(database: Database):
    """Train a zstd dictionary from ``content`` values in the ``entries`` table.

    Up to 10000 rows are queried; each row's ``content`` is encoded as UTF-8
    and used as one training sample. The requested dictionary size is
    10485760 bytes.
    """
    # make this parse through the entire entries table and make
    # a dictionary from it...
    rows = database.query("SELECT content from entries limit 10000")
    samples = [bytes(row["content"], encoding='utf8') for row in rows]

    # https://python-zstandard.readthedocs.io/en/latest/compressor.html
    dict_size = 10485760
    return zstd.train_dictionary(dict_size, samples)
    def test_optimize(self):
        """Train with ``threads=-1, steps=1`` and check the resulting ``k``/``d``."""
        samples = generate_samples()
        dict_size = get_optimal_dict_size_heuristically(samples)
        optimize_kwargs = dict(threads=-1, steps=1, d=6, notifications=2)

        trained = zstd.train_dictionary(dict_size, samples, **optimize_kwargs)

        # This varies by platform.
        accepted_k = (50, 2000)
        self.assertIn(trained.k, accepted_k)
        self.assertEqual(trained.d, 6)
    def test_dict_precompute(self):
        """Compress repeatedly with a dictionary after ``precompute_compress(level=1)``."""
        # Same 384 samples, in the same order, as appending the three
        # patterns one after another 128 times.
        samples = [b"foo" * 64, b"bar" * 64, b"foobar" * 64] * 128

        d = zstd.train_dictionary(8192, samples)
        d.precompute_compress(level=1)

        cctx = zstd.ZstdCompressor(level=1, dict_data=d)

        # No assertions: the test passes as long as no call raises.
        for _ in range(32):
            cctx.compress(b"foo bar foobar foo bar foobar")
    def test_bad_precompute_compress(self):
        """``precompute_compress()`` must reject missing and conflicting arguments."""
        samples = generate_samples()
        dict_size = get_optimal_dict_size_heuristically(samples)
        trained = zstd.train_dictionary(dict_size, samples, k=64, d=8)

        # Neither ``level`` nor ``compression_params`` supplied.
        missing_msg = "must specify one of level or "
        with self.assertRaisesRegex(ValueError, missing_msg):
            trained.precompute_compress()

        # Both supplied at once.
        conflict_msg = "must only specify one of level or "
        with self.assertRaisesRegex(ValueError, conflict_msg):
            trained.precompute_compress(
                level=3,
                compression_params=zstd.ZstdCompressionParameters(),
            )
Exemplo n.º 33
0
    def train_dictionary(docs: Iterable[Document_],
                         dict_sz: int = 8 * 1024) -> Optional[bytes]:
        """Train a zstandard compression dictionary of the given size.

        ``docs`` is a finite sequence of Documents, where a Document is either
         - an (id, {..}) tuple, or
         - a dictionary {id: str|UUID, ...} with an `id` key of type UUID
           (possibly a string-formatted UUID).

        Each document is serialized with ``doc2bytes`` and the second element
        of the returned pair is used as a training sample.

        Returns the trained dictionary as bytes, or None when ``docs`` is empty.
        """
        sample = [payload for _, payload in map(doc2bytes, docs)]

        if not sample:
            return None

        trained = zstandard.train_dictionary(dict_sz, sample)
        return trained.as_bytes()
Exemplo n.º 34
0
    def process_header(self, dict_dump):
        """Retrain ``self.cur_zdict`` and return the combined compressed size.

        The dictionary is trained on the UTF-8 encoded items of
        ``self.cur_dict``. The result is the size of ``dict_dump`` compressed
        with that dictionary plus the size of the dictionary itself
        compressed without one.
        """
        # train on each action independently
        train_data = [bytes(k, 'utf-8') for k in self.cur_dict]
        self.cur_zdict = zstd.train_dictionary(self.max_zdict_size, train_data)

        # Size of the dump when compressed against the fresh dictionary.
        data = bytes(dict_dump, 'utf-8')
        with_dict = zstd.ZstdCompressor(level=self.level,
                                        dict_data=self.cur_zdict)
        dedup_dict_size = len(with_dict.compress(data))

        # Size of the dictionary bytes when compressed on their own.
        dict_bytes = self.cur_zdict.as_bytes()
        without_dict = zstd.ZstdCompressor(level=self.level)
        comp_dict_size = len(without_dict.compress(dict_bytes))

        return dedup_dict_size + comp_dict_size
Exemplo n.º 35
0
    def test_dictionary(self):
        """Round-trip data through one-shot compress/decompress with a dictionary."""
        # Same 384 samples, in the same order, as appending the three
        # patterns one after another 128 times.
        samples = [b'foo' * 64, b'bar' * 64, b'foobar' * 64] * 128
        d = zstd.train_dictionary(8192, samples)

        orig = b'foobar' * 16384

        cctx = zstd.ZstdCompressor(level=1, dict_data=d)
        compressed = cctx.compress(orig)

        dctx = zstd.ZstdDecompressor(dict_data=d)
        self.assertEqual(dctx.decompress(compressed), orig)
Exemplo n.º 36
0
    def test_dictionary_multiple(self):
        """Round-trip several inputs through one shared compressor/decompressor pair."""
        # Same 384 samples, in the same order, as appending the three
        # patterns one after another 128 times.
        samples = [b'foo' * 64, b'bar' * 64, b'foobar' * 64] * 128
        d = zstd.train_dictionary(8192, samples)

        sources = (b'foobar' * 8192, b'foo' * 8192, b'bar' * 8192)

        # Compress everything first, then decompress everything.
        cctx = zstd.ZstdCompressor(level=1, dict_data=d)
        compressed = [cctx.compress(source) for source in sources]

        dctx = zstd.ZstdDecompressor(dict_data=d)
        for source, frame in zip(sources, compressed):
            self.assertEqual(dctx.decompress(frame), source)
Exemplo n.º 37
0
    def test_dictionary(self):
        """Round-trip data through dictionary-based stream writers."""
        # Same 384 samples, in the same order, as appending the three
        # patterns one after another 128 times.
        samples = [b'foo' * 64, b'bar' * 64, b'foobar' * 64] * 128
        d = zstd.train_dictionary(8192, samples)

        orig = b'foobar' * 16384

        compressed_sink = io.BytesIO()
        cctx = zstd.ZstdCompressor(dict_data=d)
        with cctx.stream_writer(compressed_sink) as compressor:
            self.assertEqual(compressor.write(orig), 0)
        compressed = compressed_sink.getvalue()

        decompressed_sink = io.BytesIO()
        dctx = zstd.ZstdDecompressor(dict_data=d)
        with dctx.stream_writer(decompressed_sink) as decompressor:
            self.assertEqual(decompressor.write(compressed), len(orig))

        self.assertEqual(decompressed_sink.getvalue(), orig)
Exemplo n.º 38
0
    def test_optimize(self):
        """Train with ``threads=-1, steps=1`` and check the resulting ``k``/``d``."""
        trained = zstd.train_dictionary(
            8192,
            generate_samples(),
            threads=-1,
            steps=1,
            d=16,
        )

        for attr, expected in (('k', 50), ('d', 16)):
            self.assertEqual(getattr(trained, attr), expected)
Exemplo n.º 39
0
 def test_no_args(self):
     """Calling ``train_dictionary()`` with no arguments raises ``TypeError``."""
     self.assertRaises(TypeError, zstd.train_dictionary)
Exemplo n.º 40
0
    def test_bad_args(self):
        """Text samples are rejected, both as a bare string and inside a list."""
        # A text string in place of the samples collection.
        self.assertRaises(TypeError, zstd.train_dictionary, 8192, u'foo')

        # A list whose element is text rather than bytes.
        self.assertRaises(ValueError, zstd.train_dictionary, 8192, [u'foo'])
Exemplo n.º 41
0
 def test_set_dict_id(self):
     """An explicit ``dict_id`` passed to training is reported by ``dict_id()``."""
     requested_id = 42
     trained = zstd.train_dictionary(
         8192,
         generate_samples(),
         k=64,
         d=16,
         dict_id=requested_id,
     )
     self.assertEqual(trained.dict_id(), requested_id)
Exemplo n.º 42
0
        else:
            training_chunks = chunks

        train_args = {
            'level': args.level,
        }

        if args.cover_k:
            train_args['k'] = args.cover_k
        if args.cover_d:
            train_args['d'] = args.cover_d

        # Always use all available threads in optimize mode.
        train_args['threads'] = -1

        dict_data = zstd.train_dictionary(args.dict_size, training_chunks,
                                          **train_args)
        print('trained dictionary of size %d (wanted %d) (l=%d)' % (
            len(dict_data), args.dict_size, args.level))

    if args.zlib and args.discrete:
        compressed_discrete_zlib = []
        ratios = []
        for chunk in chunks:
            c = zlib.compress(chunk, args.zlib_level)
            compressed_discrete_zlib.append(c)
            ratios.append(float(len(c)) / float(len(chunk)))

        compressed_size = sum(map(len, compressed_discrete_zlib))
        ratio = float(compressed_size) / float(orig_size) * 100.0
        bad_count = sum(1 for r in ratios if r >= 1.00)
        good_ratio = 100.0 - (float(bad_count) / float(len(chunks)) * 100.0)