Example #1
def test_compress_decompress():
    INPUT_SIZE = 10000
    test_data = (np.random.randint(0, 255, size=INPUT_SIZE)
                 .astype(np.uint8)
                 .tobytes())
    test_buf = pa.py_buffer(test_data)

    codecs = ['lz4', 'snappy', 'gzip', 'zstd', 'brotli']
    for codec in codecs:
        compressed_buf = pa.compress(test_buf, codec=codec)
        compressed_bytes = pa.compress(test_data, codec=codec, asbytes=True)

        assert isinstance(compressed_bytes, bytes)

        decompressed_buf = pa.decompress(compressed_buf,
                                         INPUT_SIZE,
                                         codec=codec)
        decompressed_bytes = pa.decompress(compressed_bytes,
                                           INPUT_SIZE,
                                           codec=codec,
                                           asbytes=True)

        assert isinstance(decompressed_bytes, bytes)

        assert decompressed_buf.equals(test_buf)
        assert decompressed_bytes == test_data

        with pytest.raises(ValueError):
            pa.decompress(compressed_bytes, codec=codec)
Example #2
File: test_io.py Project: sunchao/arrow
def test_compress_decompress():
    INPUT_SIZE = 10000
    test_data = (np.random.randint(0, 255, size=INPUT_SIZE)
                 .astype(np.uint8)
                 .tobytes())
    test_buf = pa.py_buffer(test_data)

    codecs = ['lz4', 'snappy', 'gzip', 'zstd', 'brotli']
    for codec in codecs:
        compressed_buf = pa.compress(test_buf, codec=codec)
        compressed_bytes = pa.compress(test_data, codec=codec, asbytes=True)

        assert isinstance(compressed_bytes, bytes)

        decompressed_buf = pa.decompress(compressed_buf, INPUT_SIZE,
                                         codec=codec)
        decompressed_bytes = pa.decompress(compressed_bytes, INPUT_SIZE,
                                           codec=codec, asbytes=True)

        assert isinstance(decompressed_bytes, bytes)

        assert decompressed_buf.equals(test_buf)
        assert decompressed_bytes == test_data

        with pytest.raises(ValueError):
            pa.decompress(compressed_bytes, codec=codec)
Example #3
def test_compress_decompress(compression):
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))

    INPUT_SIZE = 10000
    test_data = (np.random.randint(0, 255, size=INPUT_SIZE)
                 .astype(np.uint8)
                 .tobytes())
    test_buf = pa.py_buffer(test_data)

    compressed_buf = pa.compress(test_buf, codec=compression)
    compressed_bytes = pa.compress(test_data, codec=compression, asbytes=True)

    assert isinstance(compressed_bytes, bytes)

    decompressed_buf = pa.decompress(compressed_buf,
                                     INPUT_SIZE,
                                     codec=compression)
    decompressed_bytes = pa.decompress(compressed_bytes,
                                       INPUT_SIZE,
                                       codec=compression,
                                       asbytes=True)

    assert isinstance(decompressed_bytes, bytes)

    assert decompressed_buf.equals(test_buf)
    assert decompressed_bytes == test_data

    with pytest.raises(ValueError):
        pa.decompress(compressed_bytes, codec=compression)
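The ValueError at the end of these tests is the behaviour being exercised: pa.compress emits a raw compressed stream with no length header, so pa.decompress must be told the decompressed size. The same round trip can also be written against the pa.Codec object API available in newer pyarrow releases; the sketch below is illustrative and not part of the original tests.

import numpy as np
import pyarrow as pa

def roundtrip_with_codec(compression='gzip', input_size=10000):
    # Skip codecs that this pyarrow build was compiled without
    if not pa.Codec.is_available(compression):
        return

    data = np.random.randint(0, 255, size=input_size).astype(np.uint8).tobytes()

    codec = pa.Codec(compression)
    compressed = codec.compress(data, asbytes=True)

    # Raw streams carry no length header, so supply the original size here
    restored = codec.decompress(compressed,
                                decompressed_size=len(data),
                                asbytes=True)
    assert restored == data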
Example #4
def setRedis(keyname, data, sessionID):
    # Serialize with pyarrow, then compress (pa.compress defaults to lz4)
    inData = pa.serialize(data).to_buffer()
    compressLength = len(inData)
    inDataCompress = pa.compress(inData, asbytes=True)

    # Store the compressed payload alongside its decompressed size, which
    # pa.decompress will need on the way back
    inDataDict = {'compressLength': compressLength,
                  'inDataCompress': inDataCompress}
    keyDict = {'key': f"{keyname}Cache{sessionID}"}
    redis.hmset(keyDict['key'], inDataDict)
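A read-back counterpart is needed to make use of the stored compressLength, since pa.decompress cannot infer the size from an lz4 raw stream. The getRedis below is a hypothetical sketch under that assumption, reusing the module-level redis client from setRedis; pa.serialize/pa.deserialize are deprecated in recent pyarrow, as in the original.

def getRedis(keyname, sessionID):
    key = f"{keyname}Cache{sessionID}"
    stored = redis.hgetall(key)
    if not stored:
        return None

    # redis-py returns the hash with bytes field names
    decompressed_size = int(stored[b'compressLength'])
    buf = pa.decompress(stored[b'inDataCompress'],
                        decompressed_size=decompressed_size)
    return pa.deserialize(buf)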
Example #5
    def test_download_result(self, stub):
        job = Job(
            job_pb2.Job(
                id="foo",
                status=job_pb2.STATUS_SUCCESS,
                error=job_pb2.JobError(code=errors_pb2.ERROR_NONE),
            ))

        result = {}
        buffer = pa.serialize(result,
                              context=serialization_context).to_buffer()
        codec = "lz4"

        responses.add(
            responses.GET,
            Job.BUCKET_PREFIX.format(job.id),
            body=pa.compress(buffer, codec=codec, asbytes=True),
            headers={
                "x-goog-meta-codec": codec,
                "x-goog-meta-decompressed_size": str(len(buffer)),
            },
            status=200,
        )

        assert job._download_result() == result
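The mocked response above carries the codec and the decompressed size as metadata headers, which is exactly what a client needs to invert pa.compress. A hedged sketch of that client-side step, assuming the same header names and a plain requests call (the real Job._download_result may differ):

import pyarrow as pa
import requests

def fetch_result(url, context):
    r = requests.get(url)
    codec = r.headers["x-goog-meta-codec"]
    size = int(r.headers["x-goog-meta-decompressed_size"])

    # Undo pa.compress(..., asbytes=True), then pa.serialize(..., context=...)
    buf = pa.decompress(r.content, decompressed_size=size, codec=codec)
    return pa.deserialize(buf, context=context)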
Example #6
def object2proto(obj: pd.Series) -> PandasSeries_PB:
    """Convert pd.Series to PandasDataFrame_PB with pyarrow.

    Args:
        obj: target Series

    Returns:
        Serialized version of the Series, which will be used for reconstruction.

    """
    # https://arrow.apache.org/docs/python/pandas.html
    # series must either be converted to a dataframe or use pa.Array
    # however pa.Array mentions you must account for the null values yourself
    dataframe = obj.to_frame()
    schema = pa.Schema.from_pandas(dataframe)
    table = pa.Table.from_pandas(dataframe)
    sink = pa.BufferOutputStream()

    writer = pa.ipc.new_file(sink, schema)
    writer.write(table)
    writer.close()

    buf = sink.getvalue()

    siz = len(buf)
    df_bytes = pa.compress(buf, asbytes=True)

    return PandasSeries_PB(series=df_bytes, decompressed_size=siz)
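The reverse conversion has to undo both steps: decompress with the stored size (pa.compress defaults to lz4) and read the Arrow IPC file back into a single-column frame. A minimal sketch, assuming the PandasSeries_PB fields written above; this is not the project's actual proto2object:

import pandas as pd
import pyarrow as pa

def proto2object(proto: "PandasSeries_PB") -> pd.Series:
    buf = pa.decompress(proto.series,
                        decompressed_size=proto.decompressed_size,
                        codec='lz4')

    # object2proto wrote a single-column table built from Series.to_frame()
    table = pa.ipc.open_file(buf).read_all()
    dataframe = table.to_pandas()
    return dataframe[dataframe.columns[0]]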
Example #7
    def f(val):
        # first serialize the data
        buf = pa.serialize(val, context=context).to_buffer()
        if compress:
            original_len = len(buf)

            # compress the data
            buf = pa.compress(buf, codec=CODEC, asbytes=True)

            # add metadata required for decompression
            return pa.serialize((original_len, CODEC, buf)).to_buffer()
        else:
            return buf
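The inverse of f has to unwrap the (original_len, CODEC, buf) envelope before it can decompress and deserialize the payload. A hypothetical sketch, assuming the same enclosing compress flag, CODEC and context that f closes over:

    def g(buf):
        if compress:
            # the outer envelope was serialized without the custom context
            original_len, codec, payload = pa.deserialize(buf)

            # undo pa.compress(..., asbytes=True) using the stored length
            payload = pa.decompress(payload,
                                    decompressed_size=original_len,
                                    codec=codec)
            return pa.deserialize(payload, context=context)
        else:
            return pa.deserialize(buf, context=context)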
Example #8
    def test_download_result(self, stub):
        job = Job._from_proto(
            job_pb2.Job(
                id="foo", state=job_pb2.Job.State(stage=job_pb2.Job.Stage.SUCCEEDED),
            )
        )

        result = {}
        buffer = pa.serialize(result, context=serialization_context).to_buffer()
        codec = "lz4"

        responses.add(
            responses.GET,
            Job.BUCKET_PREFIX.format(job.id),
            body=pa.compress(buffer, codec=codec, asbytes=True),
            headers={
                "x-goog-meta-codec": codec,
                "x-goog-meta-decompressed_size": str(len(buffer)),
            },
            status=200,
        )

        assert job._download_result() == result
Example #9
File: frame.py Project: stoic-signs/PySyft
def object2proto(obj: pd.DataFrame) -> PandasDataFrame_PB:
    """Convert pd.DataFrame to PandasDataFrame_PB with pyarrow.

    Args:
        obj: target Dataframe

    Returns:
        Serialized version of the DataFrame, which will be used for reconstruction.

    """
    schema = pa.Schema.from_pandas(obj)
    table = pa.Table.from_pandas(obj)
    sink = pa.BufferOutputStream()

    writer = pa.ipc.new_file(sink, schema)
    writer.write(table)
    writer.close()

    buf = sink.getvalue()

    siz = len(buf)
    df_bytes = pa.compress(buf, asbytes=True)

    return PandasDataFrame_PB(dataframe=df_bytes, decompressed_size=siz)
Example #10
    def write_time_series_cache_to_disk(
            self,
            fname,
            data_frame,
            engine='hdf5_fixed',
            append_data=False,
            db_server=constants.db_server,
            db_port=constants.db_port,
            username=constants.db_username,
            password=constants.db_password,
            filter_out_matching=None,
            timeout=10,
            use_cache_compression=constants.use_cache_compression,
            parquet_compression=constants.parquet_compression,
            md_request=None,
            ticker=None):
        """Writes Pandas data frame to disk as HDF5 format or bcolz format or in Arctic

        Parmeters
        ---------
        fname : str
            path of file
        data_frame : DataFrame
            data frame to be written to disk
        engine : str
            'hdf5_fixed' - use HDF5 fixed format, very quick, but cannot append to this
            'hdf5_table' - use HDF5 table format, slower but can append to
            'parquet' - use Parquet
            'arctic' - use Arctic/MongoDB database
            'redis' - use Redis
        append_data : bool
            False - write a fresh copy of data on disk each time
            True - append data to disk
        db_server : str
            Database server for arctic (default: '127.0.0.1')
        timeout : int
            Number of seconds before the connection times out
        """

        logger = LoggerManager().getLogger(__name__)

        if md_request is not None:
            fname = self.path_join(
                fname, md_request.create_category_key(ticker=ticker))

        # default HDF5 format
        hdf5_format = 'fixed'

        if 'hdf5' in engine:
            hdf5_format = engine.split('_')[1]
            engine = 'hdf5'

        if (engine == 'bcolz'):
            # convert invalid characters (which bcolz can't deal with) to substitutes
            data_frame.columns = self.find_replace_chars(
                data_frame.columns, _invalid_chars, _replace_chars)
            data_frame.columns = ['A_' + x for x in data_frame.columns]

            data_frame['DTS_'] = pandas.to_datetime(data_frame.index,
                                                    unit='ns')

            bcolzpath = self.get_bcolz_filename(fname)
            shutil.rmtree(bcolzpath, ignore_errors=True)
            zlens = bcolz.ctable.fromdataframe(data_frame, rootdir=bcolzpath)
        elif (engine == 'redis'):

            fname = os.path.basename(fname).replace('.', '_')

            # Will fail if Redis is not installed
            try:
                r = redis.StrictRedis(host=db_server,
                                      port=db_port,
                                      db=0,
                                      socket_timeout=timeout,
                                      socket_connect_timeout=timeout)

                ping = r.ping()

                # If Redis is alive, try pushing to it
                if ping:
                    if data_frame is not None:
                        if isinstance(data_frame, pandas.DataFrame):
                            mem = data_frame.memory_usage(deep=True).sum()
                            mem_float = round(
                                float(mem) / (1024.0 * 1024.0), 3)

                            if mem_float < 500:
                                # msgpack/blosc is deprecated
                                # r.set(fname, data_frame.to_msgpack(compress='blosc'))

                                # now uses pyarrow
                                context = pa.default_serialization_context()

                                ser = context.serialize(data_frame).to_buffer()

                                if use_cache_compression:
                                    comp = pa.compress(ser,
                                                       codec='lz4',
                                                       asbytes=True)
                                    # embed the uncompressed size in the key,
                                    # as pa.decompress will need it later
                                    siz = len(ser)

                                    r.set('comp_' + str(siz) + '_' + fname,
                                          comp)
                                else:
                                    r.set(fname, ser.to_pybytes())

                                logger.info("Pushed " + fname + " to Redis")
                            else:
                                logger.warn("Did not push " + fname +
                                            " to Redis, given size")
                    else:
                        logger.info("Object " + fname +
                                    " is empty, not pushed to Redis.")
                else:
                    logger.warning("Didn't push " + fname +
                                   " to Redis given not running")

            except Exception as e:
                logger.warning("Couldn't push " + fname + " to Redis: " +
                               str(e))

        elif (engine == 'arctic'):
            from arctic import Arctic
            import pymongo

            socketTimeoutMS = 30 * 1000
            fname = os.path.basename(fname).replace('.', '_')

            logger.info('Load Arctic/MongoDB library: ' + fname)

            if username is not None and password is not None:
                c = pymongo.MongoClient(
                    host="mongodb://" + username + ":" + password + "@" +
                    str(db_server) + ":" + str(db_port),
                    connect=False)  # , username=username, password=password)
            else:
                c = pymongo.MongoClient(host="mongodb://" + str(db_server) +
                                        ":" + str(db_port),
                                        connect=False)

            store = Arctic(c,
                           socketTimeoutMS=socketTimeoutMS,
                           serverSelectionTimeoutMS=socketTimeoutMS,
                           connectTimeoutMS=socketTimeoutMS)

            database = None

            try:
                database = store[fname]
            except:
                pass

            if database is None:
                store.initialize_library(fname, audit=False)
                logger.info("Created MongoDB library: " + fname)
            else:
                logger.info("Got MongoDB library: " + fname)

            # Access the library
            library = store[fname]

            if ('intraday' in fname):
                data_frame = data_frame.astype('float32')

            if filter_out_matching is not None:
                cols = data_frame.columns

                new_cols = []

                for col in cols:
                    if filter_out_matching not in col:
                        new_cols.append(col)

                data_frame = data_frame[new_cols]

            # Problems with Arctic when writing timezone to disk sometimes, so strip
            data_frame = data_frame.copy().tz_localize(None)

            try:
                # Can duplicate values if we have existing dates
                if append_data:
                    library.append(fname, data_frame)
                else:
                    library.write(fname, data_frame)

                c.close()
                logger.info("Written MongoDB library: " + fname)
            except Exception as e:
                logger.warning("Couldn't write MongoDB library: " + fname +
                               " " + str(e))

        elif (engine == 'hdf5'):
            h5_filename = self.get_h5_filename(fname)

            # append data only works for HDF5 stored as tables (but this is much slower than fixed format)
            # removes duplicated entries at the end
            if append_data:
                store = pandas.HDFStore(h5_filename,
                                        format=hdf5_format,
                                        complib="zlib",
                                        complevel=9)

                if ('intraday' in fname):
                    data_frame = data_frame.astype('float32')

                # get the last row which matches and remove everything after that
                # (because the append function doesn't check for duplicated rows)
                nrows = len(store['data'].index)
                last_point = data_frame.index[-1]

                i = nrows - 1

                while (i > 0):
                    read_index = store.select('data', start=i,
                                              stop=nrows).index[0]

                    if (read_index <= last_point): break

                    i = i - 1

                # remove rows at the end, which are duplicates of the incoming time series
                store.remove(key='data', start=i, stop=nrows)
                store.put(key='data',
                          value=data_frame,
                          format=hdf5_format,
                          append=True)
                store.close()
            else:
                h5_filename_temp = self.get_h5_filename(fname + ".temp")

                # delete the old copy
                try:
                    os.remove(h5_filename_temp)
                except:
                    pass

                store = pandas.HDFStore(h5_filename_temp,
                                        complib="zlib",
                                        complevel=9)

                if ('intraday' in fname):
                    data_frame = data_frame.astype('float32')

                store.put(key='data', value=data_frame, format=hdf5_format)
                store.close()

                # delete the old copy
                try:
                    os.remove(h5_filename)
                except:
                    pass

                # once written to disk rename
                os.rename(h5_filename_temp, h5_filename)

            logger.info("Written HDF5: " + fname)

        elif (engine == 'parquet'):
            if '.parquet' not in fname:
                if fname[-5:] != '.gzip':
                    fname = fname + '.parquet'

            self.to_parquet(data_frame,
                            fname,
                            aws_region=constants.aws_region,
                            parquet_compression=parquet_compression)
            # data_frame.to_parquet(fname, compression=parquet_compression)

            logger.info("Written Parquet: " + fname)
        elif engine == 'csv':
            if '.csv' not in fname:
                fname = fname + '.csv'

            data_frame.to_csv(fname)

            logger.info("Written CSV: " + fname)
Example #11
File: use_arrow.py Project: hahagan/study
def test_compress(buf, codec='lz4'):
    # pass the requested codec through (pa.compress defaults to lz4)
    return pa.compress(buf, codec=codec)
Example #12
File: use_arrow.py Project: hahagan/study
    with open(test_file) as fin:
        raw_dict = json.load(fin)
        raw_dict = {'hits': raw_dict['hits']['hits'] * 1}
        batch = pa.Table.from_pydict(raw_dict)

    # logging.info("arrow 序列化反序列化:")
    # buf = test_serialize(batch)
    # data = test_deserialize(buf)

    raw_bytes = str.encode(json.dumps(raw_dict))
    print("raw txt: ", len(raw_bytes))
    print("Table: ", batch.nbytes)
    buf = pa.serialize(batch).to_buffer()
    print("serialize buf: ", len(buf.to_pybytes()))

    com_buf = pa.compress(buf, codec='gzip')
    com_txt = pa.compress(raw_bytes, codec='gzip')

    print("compressed raw txt", len(com_txt.to_pybytes()))
    print("compress buf: ", len(com_buf.to_pybytes()))
    print(buf.to_pybytes())
    print(raw_bytes)

    # array = batch.to_batches()[0][0]
    # field = ['_id', '_index', '_score', '_source', '_type']
    # sum = 0
    # array_size = 0
    # for i in field:
    #     tmp = array.field(i)
    #     print('-'*50)
    #     print(i, tmp.nbytes)
Example #13
    def convert_python_to_binary(self, obj, key):
        """

        Parameters
        ----------
        obj : DataFrame (or Figure)
            Object to serialize

        key : str
            Key to store object

        Returns
        -------
        binary, str
        """

        if obj is None:
            return None

        # For pandas DataFrames
        if '_df' in key and isinstance(obj, pd.DataFrame):
            obj_list = self._chunk_dataframes(
                obj,
                chunk_size_mb=constants.
                volatile_cache_redis_max_cache_chunk_size_mb)

            # If compression has been specified (recommended!)
            if '_comp' in key:
                if constants.volatile_cache_redis_format == 'msgpack':

                    for i in range(0, len(obj_list)):
                        if obj_list[i] is not None:
                            obj_list[i] = obj_list[i].to_msgpack(
                                compress=constants.
                                volatile_cache_redis_compression[
                                    constants.volatile_cache_redis_format])

                elif constants.volatile_cache_redis_format == 'arrow':
                    # Record the size of each compressed object, so it can be read back later
                    # e.g. key might be xxxx_size_354534_size_345345_endsize etc.
                    # Ignore the bit before the first '_size_' and after '_endsize'
                    for i in range(0, len(obj_list)):
                        if obj_list[i] is not None:
                            ser = context.serialize(obj_list[i]).to_buffer()

                            obj_list[i] = pa.compress(
                                ser,
                                codec=constants.
                                volatile_cache_redis_compression[
                                    constants.volatile_cache_redis_format],
                                asbytes=True)

                            key = key + '_size_' + str(len(ser))

                    key = key + '_endsizearrow_'

                else:
                    raise Exception("Invalid volatile cache format specified.")
            elif '_comp' not in key:
                if constants.volatile_cache_redis_format == 'msgpack':

                    for i in range(0, len(obj_list)):
                        if obj_list[i] is not None:
                            obj_list[i] = obj_list[i].to_msgpack()
                elif constants.volatile_cache_redis_format == 'arrow':
                    # context = pa.default_serialization_context()

                    for i in range(0, len(obj_list)):
                        if obj_list[i] is not None:
                            obj_list[i] = context.serialize(
                                obj_list[i]).to_buffer().to_pybytes()
                else:
                    raise Exception("Invalid volatile cache format specified.")

        # For Plotly JSON style objects (assume these will fit in the cache, as they tend to use downsampled data)
        elif '_fig' in key:
            # print("--------------- Converting " + key)
            # print(obj)
            obj_list = [self._plotly_fig_2_json(obj)]
        else:
            obj_list = [obj]

        return obj_list, key
Example #14
    def _convert_python_to_binary(self, obj, key, convert_cache_handle=True):
        logger = LoggerManager.getLogger(__name__)

        if obj is None:
            return None

        # For pandas DataFrames
        if '_df' in key and isinstance(obj, pd.DataFrame):
            # if obj.empty:
                # return None
            obj_list = self._chunk_dataframes(obj)

            if '_comp' in key:
                if constants.volatile_cache_format == 'msgpack':
                    # def to_msgpack(convert):
                    #     if convert is not None:
                    #         return convert.to_msgpack(
                    #                 compress=constants.volatile_cache_compression[constants.volatile_cache_format])
                    #
                    #     return convert
                    #
                    # with PoolExecutor(max_workers=100) as executor:
                    #     obj_list = executor.map(to_msgpack, obj_list)

                    for i in range(0, len(obj_list)):
                        if obj_list[i] is not None:
                            obj_list[i] = obj_list[i].to_msgpack(
                                compress=constants.volatile_cache_compression[constants.volatile_cache_format])

                elif constants.volatile_cache_format == 'arrow':
                    # Get the size of each compressed object
                    # e.g. key might be xxxx_size_354534_size_345345_endsize etc.
                    # Ignore the bit before the first '_size_' and after '_endsize'

                    # context = pa.default_serialization_context()

                    # def compress(convert):
                    #     if convert is not None:
                    #         ser = context.serialize(convert).to_buffer()
                    #
                    #         convert = pa.compress(ser,
                    #                                 codec=constants.volatile_cache_compression[constants.volatile_cache_format],
                    #                                 asbytes=True)
                    #
                    #         size = len(ser)
                    #
                    #     return convert, size
                    #
                    # with PoolExecutor(max_workers=100) as executor:
                    #     obj_list, size_list = zip(*executor.map(compress, obj_list))
                    #
                    # # obj_list, size_list = zip(*temp)
                    #
                    # for s in size_list:
                    #     key = key + '_size_' + str(s)

                    for i in range(0, len(obj_list)):
                        if obj_list[i] is not None:
                            ser = context.serialize(obj_list[i]).to_buffer()

                            obj_list[i] = pa.compress(
                                ser,
                                codec=constants.volatile_cache_compression[constants.volatile_cache_format],
                                asbytes=True)

                            key = key + '_size_' + str(len(ser))

                    key = key + '_endsize_'

                else:
                    raise Exception("Invalid volatile cache format specified.")
            elif '_comp' not in key:
                if constants.volatile_cache_format == 'msgpack':

                    for i in range(0, len(obj_list)):
                        if obj_list[i] is not None:
                            obj_list[i] = obj_list[i].to_msgpack()
                elif constants.volatile_cache_format == 'arrow':
                    # context = pa.default_serialization_context()

                    for i in range(0, len(obj_list)):
                        if obj_list[i] is not None:
                            obj_list[i] = context.serialize(obj_list[i]).to_buffer().to_pybytes()
                else:
                    raise Exception("Invalid volatile cache format specified.")

        # For Plotly JSON style objects (assume these will fit in the cache, as they tend to use downsampled data)
        elif '_fig' in key:
            # print("--------------- Converting " + key)
            # print(obj)
            obj_list = [self._plotly_fig_2_json(obj)]
        elif isinstance(obj, CacheHandle) and convert_cache_handle:
            obj_list = self._convert_python_to_binary(self.get_dataframe_handle(obj, burn_after_reading=True), key)
        else:
            obj_list = [obj]

        return obj_list, key
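Both variants append a '_size_<n>' marker to the key for every non-empty chunk they compress, so the read path has to parse those sizes back out before it can call pa.decompress. A hypothetical inverse under that convention (the codec, serialization context and marker layout are assumptions based on the code above):

import pandas as pd
import pyarrow as pa

def _convert_binary_to_python(obj_list, key, codec='lz4'):
    # Everything between the first '_size_' and the '_endsize' marker is a
    # list of chunk sizes, one per non-empty chunk, in order
    sizes = [int(s.split('_endsize')[0]) for s in key.split('_size_')[1:]]
    size_iter = iter(sizes)

    context = pa.default_serialization_context()
    frames = []

    for chunk in obj_list:
        if chunk is not None:
            buf = pa.decompress(chunk,
                                decompressed_size=next(size_iter),
                                codec=codec)
            frames.append(context.deserialize(buf))

    return pd.concat(frames) if frames else None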