Example #1
    def test_simple(self):
        frame = zstd.compress(b"foobar")

        fp = zstd.get_frame_parameters(frame)
        self.assertEqual(fp.content_size, 6)
        self.assertFalse(fp.has_checksum)

        zstd.compress(b"foobar" * 16384, level=7)
Example #2
def transmit(result, sock):
    # Send the column names first, then a (dtype, compressed bytes) pair per column.
    pickler = pickle.Pickler(sock)
    cols = list(result.keys())
    pickler.dump(cols)

    for col in cols:
        if result[col].dtype == object:
            # Object columns cannot be compressed directly, so pickle them first.
            colz = zstd.compress(pickle.dumps(result[col]))
        else:
            colz = zstd.compress(result[col])
        pickler.dump(result[col].dtype)
        pickler.dump(colz)
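A hypothetical receive-side counterpart, not part of the original source: it mirrors the framing above (column list first, then a dtype and a compressed payload per column), assumes sock is a readable file-like object, and leaves rebuilding the columns from the raw bytes to the caller.

def receive(sock):
    # Hypothetical inverse of transmit(): read the column list, then one
    # (dtype, compressed payload) pair per column.
    unpickler = pickle.Unpickler(sock)
    cols = unpickler.load()

    columns = {}
    for col in cols:
        dtype = unpickler.load()
        payload = zstd.decompress(unpickler.load())
        # Object columns were pickled before compression; the others are the
        # raw column bytes, to be rebuilt with the recorded dtype.
        columns[col] = (dtype, payload)
    return columns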
Example #3
    def compress_data(self):
        if self.settings['use_lzma']:
            print('[*] Compressing texture with lzma')

            filters = [
                       {
                        "id": lzma.FILTER_LZMA1,
                        "dict_size": 256 * 1024,
                        "lc": 3,
                        "lp": 0,
                        "pb": 2,
                        "mode": lzma.MODE_NORMAL
                        },
                       ]

            compressed = lzma.compress(self.buffer, format=lzma.FORMAT_ALONE, filters=filters)
            compressed = compressed[0:5] + len(self.buffer).to_bytes(4, 'little') + compressed[13:]

        elif self.settings['use_lzham']:
            print('[*] Compressing texture with lzham')

            dict_size = 18

            compressed = lzham.compress(self.buffer, {'dict_size_log2': dict_size})
            compressed = 'SCLZ'.encode('utf-8') + dict_size.to_bytes(1, 'big') + len(self.buffer).to_bytes(4, 'little') + compressed

        else:
            print('[*] Compressing texture with zstandard')
            compressed = zstandard.compress(self.buffer, level=zstandard.MAX_COMPRESSION_LEVEL)

        fileMD5 = hashlib.md5(self.buffer).digest()

        # Flush the previous buffer
        self.buffer = b''

        if self.settings['header']:
            self.write('SC'.encode('utf-8'))

            if self.settings['use_zstd']:
                self.write_uint32(3, 'big')

            else:
                self.write_uint32(1, 'big')

            self.write_uint32(len(fileMD5), 'big')
            self.write(fileMD5)

            print('[*] Header written!')

        self.write(compressed)

        print('[*] Compression done!')
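For the zstandard branch the round trip is straightforward; a minimal sketch of how the compressed result above could be checked (my addition, not part of the original tool):

decompressed = zstandard.decompress(compressed)
assert hashlib.md5(decompressed).digest() == fileMD5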
Example #4
    def __serialize(self) -> bytes:
        batch = tsbatch_pb2.Batch()
        for timeseries in self.__timeseries.values():
            protobuf_ts = batch.timeseries.add()
            timeseries.serialize_to(protobuf_ts=protobuf_ts)

        frame = tsbatch_pb2.Frame()
        frame.contentType = tsbatch_pb2.Frame.ContentType.ZSTD_COMPRESSED_BATCH
        # level 14 has been empirically determined as the threshold for diminishing returns
        content = zstandard.compress(batch.SerializeToString(), level=14)
        h = hashlib.sha3_512()
        h.update(content)
        frame.content = content
        frame.messageId = h.hexdigest()
        self.__message = frame.SerializeToString()
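A hypothetical inverse, sketched here for clarity and not taken from the original module: parse the Frame, verify the SHA3-512 message id, then decompress and parse the Batch.

    def __deserialize(self, message: bytes) -> tsbatch_pb2.Batch:
        frame = tsbatch_pb2.Frame()
        frame.ParseFromString(message)

        # The message id is the SHA3-512 hex digest of the compressed content.
        h = hashlib.sha3_512()
        h.update(frame.content)
        if h.hexdigest() != frame.messageId:
            raise ValueError("messageId does not match content hash")

        batch = tsbatch_pb2.Batch()
        batch.ParseFromString(zstandard.decompress(frame.content))
        return batch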
Example #5
def mutation_create(item):
    path, source, coverage, mutation_predicate = item

    if not coverage:
        msg = "Ignoring file {} because there is no associated coverage."
        log.trace(msg, path)
        return []

    log.trace("Mutating file: {}...", path)
    mutations = [m for m in Mutation.ALL if mutation_predicate(m)]
    deltas = deltas_compute(source, path, coverage, mutations)
    # return the compressed deltas to save some time in the
    # main thread.
    out = [(path, zstd.compress(x.encode("utf8"))) for x in deltas]
    log.trace("There are {} mutations for the file `{}`", len(out), path)
    return out
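On the consuming side the compressed deltas can be recovered with the matching one-shot call; a minimal sketch, assuming the same zstd binding (not from the original module):

# `pairs` is the list returned by mutation_create() on a worker process.
for path, blob in pairs:
    delta = zstd.decompress(blob).decode("utf8")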
Example #6
async def index(tx, store, docuid, counter):
    # translate keys that are string tokens into uuid4 bytes kept in
    # store.tokens
    tokens = dict()
    for string, count in counter.items():
        query = nstore.select(tx, store.tokens, string, nstore.var('uid'))
        try:
            uid = await query.__anext__()
        except StopAsyncIteration:
            uid = uuid4()
            nstore.add(tx, store.tokens, string, uid)
        else:
            uid = uid['uid']
        tokens[uid] = count

    # store tokens to use later during search for filtering
    found.set(tx, found.pack((store.prefix_counters, docuid)),
              zstd.compress(found.pack(tuple(tokens.items()))))

    # store tokens keys for candidate selection
    for token in tokens:
        found.set(tx, found.pack((store.prefix_index, token, docuid)), b'')
Example #7
File: message.py  Project: yifan/pipeline
def serialize_message(message: BaseModel, compress: bool = False) -> bytes:
    data = message.json().encode("utf-8")
    if compress:
        data = b"Z" + zstandard.compress(data)
    return data
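A hypothetical decoding counterpart, my sketch rather than part of yifan/pipeline, assuming the same pydantic v1 BaseModel and zstandard imports: strip the b"Z" marker, decompress, and hand the JSON back to the model.

from typing import Type

def deserialize_message(data: bytes, model: Type[BaseModel]) -> BaseModel:
    # Undo the optional b"Z" compression marker written by serialize_message().
    if data.startswith(b"Z"):
        data = zstandard.decompress(data[1:])
    return model.parse_raw(data)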
Example #8
    def test_simple(self):
        source = b"foobar" * 8192
        frame = zstd.compress(source)
        self.assertEqual(zstd.decompress(frame), source)
Example #9
    def commit(self):

        committed_blob_name = ""

        if len(self.buffer) > 0:

            lock = threading.Lock()

            try:
                lock.acquire(blocking=True, timeout=10)

                if self.format == "parquet":
                    try:
                        import pyarrow.json
                        import pyarrow.parquet as pq  # type:ignore
                    except ImportError as err:  # pragma: no cover
                        raise MissingDependencyError(
                            "`pyarrow` is missing, please install it or include it in requirements.txt"
                        ) from err

                    import io

                    tempfile = io.BytesIO()

                    temp_list = [
                        orjson.loads(record)
                        for record in self.buffer.splitlines()
                    ]
                    pytable = pyarrow.Table.from_pylist(temp_list)
                    pyarrow.parquet.write_table(pytable,
                                                where=tempfile,
                                                compression="zstd")

                    tempfile.seek(0)
                    self.buffer = tempfile.read()

                if self.format == "zstd":
                    # zstandard is a non-optional dependency, so it is always installed
                    self.buffer = zstandard.compress(self.buffer)

                committed_blob_name = self.inner_writer.commit(
                    byte_data=bytes(self.buffer), override_blob_name=None)

                for column in self.indexes:
                    index = self.index_builders[column].build()

                    bucket, path, stem, suffix = get_parts(committed_blob_name)
                    index_name = f"{bucket}/{path}{stem}.{safe_field_name(column)}.idx"
                    committed_index_name = self.inner_writer.commit(
                        byte_data=index.bytes(), override_blob_name=index_name)

                if "BACKOUT" in committed_blob_name:
                    get_logger().warning(
                        f"{self.records_in_buffer:n} failed records written to BACKOUT partition `{committed_blob_name}`"
                    )
                get_logger().debug({
                    "committed_blob": committed_blob_name,
                    "records": self.records_in_buffer,
                    "bytes": len(self.buffer),
                })
            finally:
                lock.release()

        self.buffer = bytearray()
        return committed_blob_name
Example #10
    def encode_base64zstd(self, layer):
        ''' base64 layers are one huge row, thanks to CSV assuming row data.
        '''
        format = '<' + 'I' * len(layer[0])
        data = zstandard.compress(struct.pack(format, *(layer[0])))
        return base64.b64encode(data).decode('utf-8')
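A hypothetical decoder for the same layout, my addition rather than the original project's code: undo the base64, decompress, and unpack the single row of little-endian uint32 values.

    def decode_base64zstd(self, text):
        data = zstandard.decompress(base64.b64decode(text))
        count = len(data) // 4  # each tile id is a little-endian uint32
        return [list(struct.unpack('<' + 'I' * count, data))]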