def test_simple(self):
    frame = zstd.compress(b"foobar")
    fp = zstd.get_frame_parameters(frame)
    self.assertEqual(fp.content_size, 6)
    self.assertFalse(fp.has_checksum)
    zstd.compress(b"foobar" * 16384, level=7)

def transmit(result, sock):
    # Send the column names first, then a (dtype, compressed bytes) pair per column.
    pickler = pickle.Pickler(sock)
    cols = list(result.keys())
    pickler.dump(cols)
    for col in cols:
        if result[col].dtype == object:
            # Object columns cannot be compressed as a raw buffer, so pickle them first.
            colz = zstd.compress(pickle.dumps(result[col]))
        else:
            colz = zstd.compress(result[col])
        pickler.dump(result[col].dtype)
        pickler.dump(colz)

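# A hypothetical receive() counterpart (not in the original snippet), shown to
# make the wire format explicit: the column list first, then a (dtype,
# compressed bytes) pair per column. The reconstruction of numeric columns into
# numpy arrays is an assumption about how they were produced on the sending side.
import numpy as np

def receive(sock):
    unpickler = pickle.Unpickler(sock)
    result = {}
    for col in unpickler.load():
        dtype = unpickler.load()
        colz = unpickler.load()
        raw = zstd.decompress(colz)
        if dtype == object:
            # Object columns were pickled before compression.
            result[col] = pickle.loads(raw)
        else:
            result[col] = np.frombuffer(raw, dtype=dtype)
    return result
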
def compress_data(self):
    if self.settings['use_lzma']:
        print('[*] Compressing texture with lzma')
        filters = [
            {
                "id": lzma.FILTER_LZMA1,
                "dict_size": 256 * 1024,
                "lc": 3,
                "lp": 0,
                "pb": 2,
                "mode": lzma.MODE_NORMAL,
            },
        ]
        compressed = lzma.compress(self.buffer, format=lzma.FORMAT_ALONE, filters=filters)
        # Replace the 8-byte uncompressed-size field of the LZMA_ALONE header
        # with a 4-byte little-endian size, keeping the 5 header bytes before it.
        compressed = compressed[0:5] + len(self.buffer).to_bytes(4, 'little') + compressed[13:]
    elif self.settings['use_lzham']:
        print('[*] Compressing texture with lzham')
        dict_size = 18
        compressed = lzham.compress(self.buffer, {'dict_size_log2': dict_size})
        compressed = ('SCLZ'.encode('utf-8')
                      + dict_size.to_bytes(1, 'big')
                      + len(self.buffer).to_bytes(4, 'little')
                      + compressed)
    else:
        print('[*] Compressing texture with zstandard')
        compressed = zstandard.compress(self.buffer, level=zstandard.MAX_COMPRESSION_LEVEL)

    fileMD5 = hashlib.md5(self.buffer).digest()

    # Flush the previous buffer
    self.buffer = b''

    if self.settings['header']:
        self.write('SC'.encode('utf-8'))
        if self.settings['use_zstd']:
            self.write_uint32(3, 'big')
        else:
            self.write_uint32(1, 'big')
        self.write_uint32(len(fileMD5), 'big')
        self.write(fileMD5)
        print('[*] Header written!')

    self.write(compressed)
    print('[*] Compression done!')

def __serialize(self) -> bytes:
    batch = tsbatch_pb2.Batch()
    for timeseries in self.__timeseries.values():
        protobuf_ts = batch.timeseries.add()
        timeseries.serialize_to(protobuf_ts=protobuf_ts)
    frame = tsbatch_pb2.Frame()
    frame.contentType = tsbatch_pb2.Frame.ContentType.ZSTD_COMPRESSED_BATCH
    # Level 14 has been empirically determined as the threshold for diminishing returns.
    content = zstandard.compress(batch.SerializeToString(), level=14)
    h = hashlib.sha3_512()
    h.update(content)
    frame.content = content
    frame.messageId = h.hexdigest()
    self.__message = frame.SerializeToString()
    return self.__message

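# A minimal deserialization sketch (an assumption, not part of the original
# class): parse the Frame, verify the SHA3-512 messageId over the compressed
# content, then decompress and parse the Batch.
def deserialize(message: bytes):
    frame = tsbatch_pb2.Frame()
    frame.ParseFromString(message)
    if frame.contentType != tsbatch_pb2.Frame.ContentType.ZSTD_COMPRESSED_BATCH:
        raise ValueError("unexpected content type")
    if hashlib.sha3_512(frame.content).hexdigest() != frame.messageId:
        raise ValueError("messageId does not match frame content")
    batch = tsbatch_pb2.Batch()
    batch.ParseFromString(zstandard.decompress(frame.content))
    return batch
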
def mutation_create(item):
    path, source, coverage, mutation_predicate = item
    if not coverage:
        msg = "Ignoring file {} because there is no associated coverage."
        log.trace(msg, path)
        return []
    log.trace("Mutating file: {}...", path)
    mutations = [m for m in Mutation.ALL if mutation_predicate(m)]
    deltas = deltas_compute(source, path, coverage, mutations)
    # Return the compressed deltas to save some time in the main thread.
    out = [(path, zstd.compress(x.encode("utf8"))) for x in deltas]
    log.trace("There are {} mutations for the file `{}`", len(out), path)
    return out

async def index(tx, store, docuid, counter):
    # Translate keys that are string tokens into uuid4 bytes with store.tokens.
    tokens = dict()
    for string, count in counter.items():
        query = nstore.select(tx, store.tokens, string, nstore.var('uid'))
        try:
            uid = await query.__anext__()
        except StopAsyncIteration:
            uid = uuid4()
            nstore.add(tx, store.tokens, string, uid)
        else:
            uid = uid['uid']
        tokens[uid] = count
    # Store token counts to use later during search for filtering.
    found.set(
        tx,
        found.pack((store.prefix_counters, docuid)),
        zstd.compress(found.pack(tuple(tokens.items()))),
    )
    # Store token keys for candidate selection.
    for token in tokens:
        found.set(tx, found.pack((store.prefix_index, token, docuid)), b'')

def serialize_message(message: BaseModel, compress: bool = False) -> bytes:
    data = message.json().encode("utf-8")
    if compress:
        data = b"Z" + zstandard.compress(data)
    return data

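# A hedged counterpart sketch, not part of the original code: serialize_message()
# prefixes compressed payloads with b"Z", so a reader can branch on that marker.
# `deserialize_message` and the `model_cls` parameter are assumed names; parse_raw
# follows the pydantic v1 API implied by message.json() above.
from typing import Type

def deserialize_message(payload: bytes, model_cls: Type[BaseModel]) -> BaseModel:
    if payload[:1] == b"Z":
        payload = zstandard.decompress(payload[1:])
    return model_cls.parse_raw(payload)
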
def test_simple(self):
    source = b"foobar" * 8192
    frame = zstd.compress(source)
    self.assertEqual(zstd.decompress(frame), source)

def commit(self):
    committed_blob_name = ""
    if len(self.buffer) > 0:
        lock = threading.Lock()
        try:
            lock.acquire(blocking=True, timeout=10)

            if self.format == "parquet":
                try:
                    import pyarrow.json
                    import pyarrow.parquet as pq  # type:ignore
                except ImportError as err:  # pragma: no cover
                    raise MissingDependencyError(
                        "`pyarrow` is missing, please install or include in requirements.txt"
                    )

                import io

                tempfile = io.BytesIO()
                temp_list = [
                    orjson.loads(record) for record in self.buffer.splitlines()
                ]
                pytable = pyarrow.Table.from_pylist(temp_list)
                pyarrow.parquet.write_table(pytable, where=tempfile, compression="zstd")
                tempfile.seek(0)
                self.buffer = tempfile.read()

            if self.format == "zstd":
                # zstandard is a non-optional installed dependency
                self.buffer = zstandard.compress(self.buffer)

            committed_blob_name = self.inner_writer.commit(
                byte_data=bytes(self.buffer), override_blob_name=None
            )

            for column in self.indexes:
                index = self.index_builders[column].build()
                bucket, path, stem, suffix = get_parts(committed_blob_name)
                index_name = f"{bucket}/{path}{stem}.{safe_field_name(column)}.idx"
                committed_index_name = self.inner_writer.commit(
                    byte_data=index.bytes(), override_blob_name=index_name
                )

            if "BACKOUT" in committed_blob_name:
                get_logger().warning(
                    f"{self.records_in_buffer:n} failed records written to BACKOUT partition `{committed_blob_name}`"
                )
            get_logger().debug({
                "committed_blob": committed_blob_name,
                "records": self.records_in_buffer,
                "bytes": len(self.buffer),
            })
        finally:
            lock.release()
            self.buffer = bytearray()
    return committed_blob_name

def encode_base64zstd(self, layer):
    '''
    base64 layers are one huge row, thanks to CSV assuming row data.
    '''
    format = '<' + 'I' * len(layer[0])
    data = zstandard.compress(struct.pack(format, *(layer[0])))
    return base64.b64encode(data).decode('utf-8')

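# Inverse sketch for reference (an assumption, not in the original class):
# base64-decode, zstd-decompress, then unpack the little-endian uint32 row.
def decode_base64zstd(self, text):
    data = zstandard.decompress(base64.b64decode(text))
    count = len(data) // 4
    return list(struct.unpack('<' + 'I' * count, data))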