Example #1
def test_compress_decompress():
    INPUT_SIZE = 10000
    test_data = (np.random.randint(0, 255, size=INPUT_SIZE)
                 .astype(np.uint8)
                 .tobytes())
    test_buf = pa.py_buffer(test_data)

    codecs = ['lz4', 'snappy', 'gzip', 'zstd', 'brotli']
    for codec in codecs:
        compressed_buf = pa.compress(test_buf, codec=codec)
        compressed_bytes = pa.compress(test_data, codec=codec, asbytes=True)

        assert isinstance(compressed_bytes, bytes)

        decompressed_buf = pa.decompress(compressed_buf, INPUT_SIZE,
                                         codec=codec)
        decompressed_bytes = pa.decompress(compressed_bytes, INPUT_SIZE,
                                           codec=codec, asbytes=True)

        assert isinstance(decompressed_bytes, bytes)

        assert decompressed_buf.equals(test_buf)
        assert decompressed_bytes == test_data

        with pytest.raises(ValueError):
            pa.decompress(compressed_bytes, codec=codec)
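
For reference, a minimal round trip outside of pytest (a sketch, assuming a pyarrow build with the zstd codec available): the one-shot pa.compress()/pa.decompress() helpers do not record the original length, which is why the test above passes INPUT_SIZE back in and why the final assertion expects a ValueError when decompressed_size is omitted.

import pyarrow as pa

data = b"some payload to cache" * 100

# compress() returns a pyarrow Buffer by default; asbytes=True gives bytes
compressed = pa.compress(data, codec="zstd", asbytes=True)

# the caller must carry the uncompressed length alongside the payload
restored = pa.decompress(compressed, decompressed_size=len(data),
                         codec="zstd", asbytes=True)
assert restored == data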
Example #2
def test_compress_decompress(compression):
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))

    INPUT_SIZE = 10000
    test_data = (np.random.randint(0, 255, size=INPUT_SIZE)
                 .astype(np.uint8)
                 .tobytes())
    test_buf = pa.py_buffer(test_data)

    compressed_buf = pa.compress(test_buf, codec=compression)
    compressed_bytes = pa.compress(test_data, codec=compression,
                                   asbytes=True)

    assert isinstance(compressed_bytes, bytes)

    decompressed_buf = pa.decompress(compressed_buf, INPUT_SIZE,
                                     codec=compression)
    decompressed_bytes = pa.decompress(compressed_bytes, INPUT_SIZE,
                                       codec=compression, asbytes=True)

    assert isinstance(decompressed_bytes, bytes)

    assert decompressed_buf.equals(test_buf)
    assert decompressed_bytes == test_data

    with pytest.raises(ValueError):
        pa.decompress(compressed_bytes, codec=compression)
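
The same round trip can also go through the pa.Codec object API that the skip check above already uses for availability; a minimal sketch, assuming the lz4 codec is built:

codec = pa.Codec("lz4")
payload = b"x" * 1000

compressed = codec.compress(payload, asbytes=True)

# as with pa.decompress(), the uncompressed size has to be supplied
restored = codec.decompress(compressed, decompressed_size=len(payload),
                            asbytes=True)
assert restored == payload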
Example #3
def getRedis(keyname, sessionID):
    key = f"{keyname}Cache{sessionID}"

    # the hash stores the lz4-compressed payload plus its uncompressed length,
    # which pa.decompress() needs as decompressed_size
    cacheDataCompress = redis.hget(key, 'inDataCompress')
    cacheDataLen = int(redis.hget(key, 'compressLength'))
    cacheSerialize = pa.decompress(cacheDataCompress,
                                   decompressed_size=cacheDataLen)

    # legacy pyarrow serialization API
    cache = pa.deserialize(cacheSerialize)
    return cache
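
The read side above implies a write side that stores both the compressed payload and its original length in the same Redis hash. A hypothetical counterpart (setRedis is not part of the original snippet; the field names mirror the ones read back above, pa.serialize is the legacy counterpart of pa.deserialize, and a redis-py client that accepts hset(..., mapping=...) is assumed):

def setRedis(keyname, sessionID, cache):
    key = f"{keyname}Cache{sessionID}"

    # legacy pyarrow serialization, the counterpart of pa.deserialize() above
    serialized = pa.serialize(cache).to_buffer()

    # store the lz4-compressed bytes together with the uncompressed length,
    # because pa.decompress() needs decompressed_size on the way back out
    redis.hset(key, mapping={
        'inDataCompress': pa.compress(serialized, codec='lz4', asbytes=True),
        'compressLength': len(serialized),
    })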
Example #4
    def f(buf):
        if compress:
            # first deserialize the (length, codec, payload) wrapper tuple
            size, codec, buf = pa.deserialize(buf)

            # then decompress the actual payload
            buf = pa.decompress(buf, size, codec=codec)

        # deserialize the actual data
        return pa.deserialize(buf, context=context)
Example #5
    def _download_result(self):
        response = requests.get(self.BUCKET_PREFIX.format(self.id))
        response.raise_for_status()

        buffer = pa.decompress(
            response.content,
            codec=response.headers["x-goog-meta-codec"],
            decompressed_size=int(
                response.headers["x-goog-meta-decompressed_size"]),
        )
        return pa.deserialize(buffer, context=serialization_context)
Example #6
def proto2object(proto: PandasDataFrame_PB) -> pd.DataFrame:
    """Proto to object conversion using to return desired model.

    Args:
        proto: Serialized version of Dataframe, which will be used to reconstruction.

    Returns:
        Re-constructed dataframe.
    """
    buf = pa.decompress(proto.dataframe,
                        decompressed_size=proto.decompressed_size)
    return pa.ipc.open_file(buf).read_pandas()
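
A sketch of the opposite direction shows where proto.decompressed_size comes from. The object2proto below is hypothetical, but it only uses the pieces already visible here: the Arrow IPC file format read by pa.ipc.open_file() and the default lz4 codec shared by pa.compress()/pa.decompress().

def object2proto(df: pd.DataFrame) -> PandasDataFrame_PB:
    table = pa.Table.from_pandas(df)

    # write the table into an in-memory Arrow IPC file
    sink = pa.BufferOutputStream()
    with pa.ipc.new_file(sink, table.schema) as writer:
        writer.write_table(table)
    buf = sink.getvalue()

    # compress the IPC payload and record its uncompressed size,
    # which pa.decompress() needs when the proto is read back
    return PandasDataFrame_PB(
        dataframe=pa.compress(buf, asbytes=True),
        decompressed_size=len(buf),
    )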
Example #7
    def data_sparql(self,
                    sparql,
                    source=None,
                    start=None,
                    end=None,
                    agg=None,
                    window=None):
        params = {"sparql": sparql}
        if agg is not None and window is not None:
            params["agg"] = agg
            params["window"] = window
        if start is not None:
            if isinstance(start, datetime):
                # assumes the datetime is already in UTC
                params["start"] = start.strftime("%Y-%m-%dT%H:%M:%SZ")
            else:
                params["start"] = start
        else:
            params["start"] = "1970-01-01T00:00:00Z"

        if end is not None:
            if isinstance(end, datetime):
                params["end"] = end.strftime("%Y-%m-%dT%H:%M:%SZ")
            else:
                params["end"] = end
        else:
            params["end"] = "2100-01-01T00:00:00Z"

        if source is not None:
            params["source"] = source

        metadata = self.sparql(sparql,
                               sites=[source] if source is not None else None)

        resp = requests.get(f"{self._endpoint}/query", params=params)
        # print(len(resp.content))

        buf = pa.decompress(resp.content,
                            decompressed_size=4e10,
                            codec='lz4',
                            asbytes=True)
        buf = io.BytesIO(buf)
        # read metadata first
        r = pa.ipc.open_stream(buf)
        md = r.read_pandas()
        # then read data
        r = pa.ipc.open_stream(buf)
        df = r.read_pandas()
        return Dataset(metadata, md, df)
Example #8
def proto2object(proto: PandasSeries_PB) -> pd.Series:
    """Convert PandasSeries_PB to pd.Series with pyarrow.

    Args:
        proto: Serialized version of Series, which will be used to reconstruction.

    Returns:
        Re-constructed Series.

    """
    buf = pa.decompress(proto.series,
                        decompressed_size=proto.decompressed_size)
    dataframe = pa.ipc.open_file(buf).read_pandas()
    # we know that this is a series stored as a dataframe, so just grab the first column
    return dataframe[dataframe.columns[0]]
Example #9
    def data_uris(self, uris, start=None, end=None, agg=None, window=None):
        parts = []
        if start is not None:
            if isinstance(start, datetime):
                # assumes the datetime is already in UTC
                parts.append(f"start={start.strftime('%Y-%m-%dT%H:%M:%SZ')}")
            else:
                parts.append(f"start={start}")
        else:
            parts.append("start=1970-01-01T00:00:00Z")

        for uri in uris:
            uri = urllib.parse.quote_plus(uri)
            parts.append(f"uri={uri}")

        query_string = "&".join(parts)
        if agg is not None and window is not None:
            resp = requests.get(
                f"{self._endpoint}/query?{query_string}&agg={agg}&window={window}"
            )
        else:
            resp = requests.get(f"{self._endpoint}/query?{query_string}")

        if not resp.ok:
            logging.error("Error getting data %s" % resp.content)
            raise Exception(resp.content)

        buf = pa.decompress(resp.content,
                            decompressed_size=4e10,
                            codec='lz4',
                            asbytes=True)
        buf = io.BytesIO(buf)
        # read metadata first
        try:
            r = pa.ipc.open_stream(buf)
        except pa.ArrowInvalid as e:
            logging.error("Error deserializing metadata %s" % e)
            raise Exception(e)
        md = r.read_pandas()

        # then read data
        try:
            r = pa.ipc.open_stream(buf)
        except pa.ArrowInvalid as e:
            logging.error("Error deserializing data %s" % e)
            raise Exception(e)
        df = r.read_pandas()
        return Dataset(None, md, df)
Example #10
    def inspect(self,
                obj,
                timeout=30,
                format="pyarrow",
                format_options=None,
                **params):
        graft = obj.graft
        params_dict = parameters_to_grafts(**params)

        # TODO little dumb to have to serialize the typespec just to get the unmarshal name; EC-300 plz
        typespec = serialize_typespec(type(obj))
        result_type = typespec_to_unmarshal_str(typespec)
        # ^ this also preemptively checks whether the result type is something we'll know how to unmarshal

        mimetype = format_to_mimetype(format, format_options=format_options)

        # TODO stream=True, use resp.raw and stream through pyarrow?
        try:
            resp = self.session.post(
                "/inspect",
                json={
                    "graft": graft,
                    "parameters": params_dict
                },
                timeout=timeout,
                headers={"Accept": mimetype},
            )
        except requests.exceptions.Timeout as e:
            raise JobTimeoutError(e) from None

        if resp.headers["content-type"] == "application/vnd.pyarrow":
            buffer = pa.decompress(
                resp.content,
                codec=resp.headers["X-Arrow-Codec"],
                decompressed_size=int(resp.headers["X-Decompressed-Size"]),
            )

            marshalled = pa.deserialize(buffer, context=serialization_context)
            return unmarshal.unmarshal(result_type, marshalled)
        elif resp.headers["content-type"] == "application/json":
            return json.loads(resp.content)
        else:
            # return the raw data
            return resp.content
Example #11
    def read_time_series_cache_from_disk(self,
                                         fname,
                                         engine='hdf5',
                                         start_date=None,
                                         finish_date=None,
                                         db_server=constants.db_server,
                                         db_port=constants.db_port,
                                         username=constants.db_username,
                                         password=constants.db_password):
        """Reads time series cache from disk in either HDF5 or bcolz

        Parameters
        ----------
        fname : str (or list)
            file to be read from
        engine : str (optional)
            'hd5' - reads HDF5 files (default)
            'arctic' - reads from Arctic/MongoDB database
            'bcolz' - reads from bcolz file (not fully implemented)
            'parquet' - reads from Parquet
        start_date : str/datetime (optional)
            Start date
        finish_date : str/datetime (optional)
            Finish data
        db_server : str
            IP address of MongdDB (default '127.0.0.1')

        Returns
        -------
        DataFrame
        """

        logger = LoggerManager.getLogger(__name__)

        data_frame_list = []

        if not (isinstance(fname, list)):
            if '*' in fname:
                fname = glob.glob(fname)
            else:
                fname = [fname]

        for fname_single in fname:
            logger.debug("Reading " + fname_single + "..")

            if engine == 'parquet' and '.gzip' not in fname_single and '.parquet' not in fname_single:
                fname_single = fname_single + '.parquet'

            if (engine == 'bcolz'):
                try:
                    name = self.get_bcolz_filename(fname_single)
                    zlens = bcolz.open(rootdir=name)
                    data_frame = zlens.todataframe()

                    data_frame.index = pandas.DatetimeIndex(data_frame['DTS_'])
                    data_frame.index.name = 'Date'
                    del data_frame['DTS_']

                    # convert invalid characters (which Bcolz can't deal with) to more readable characters for pandas
                    data_frame.columns = self.find_replace_chars(
                        data_frame.columns, _replace_chars, _invalid_chars)
                    data_frame.columns = [x[2:] for x in data_frame.columns]
                except Exception:
                    data_frame = None

            elif (engine == 'redis'):
                fname_single = os.path.basename(fname_single).replace('.', '_')

                msg = None

                try:
                    # for pyarrow
                    context = pa.default_serialization_context()

                    r = redis.StrictRedis(host=db_server, port=db_port, db=0)

                    # is there a compressed key stored?
                    k = r.keys('comp_*_' + fname_single)

                    # if so, then it means that we have stored it as a compressed object
                    # if there is more than 1 element, take the last (which will be the latest to be added)
                    if (len(k) >= 1):
                        k = k[-1].decode('utf-8')

                        comp = r.get(k)

                        siz = int(k.split('_')[1])
                        dec = pa.decompress(comp,
                                            codec='lz4',
                                            decompressed_size=siz)

                        msg = context.deserialize(dec)
                    else:
                        msg = r.get(fname_single)

                        # print(fname_single)
                        if msg is not None:
                            msg = context.deserialize(msg)
                            # logger.warning("Key " + fname_single + " not in Redis cache?")

                except Exception as e:
                    logger.info("Cache not existent for " + fname_single +
                                " in Redis: " + str(e))

                if msg is None:
                    data_frame = None
                else:
                    logger.info('Load Redis cache: ' + fname_single)

                    data_frame = msg  # pandas.read_msgpack(msg)

            elif (engine == 'arctic'):
                socketTimeoutMS = 2 * 1000

                import pymongo
                from arctic import Arctic

                fname_single = os.path.basename(fname_single).replace('.', '_')

                logger.info('Load Arctic/MongoDB library: ' + fname_single)

                if username is not None and password is not None:
                    c = pymongo.MongoClient(
                        host="mongodb://" + username + ":" + password + "@" +
                        str(db_server) + ":" + str(db_port),
                        connect=False
                    )  # , username=username, password=password)
                else:
                    c = pymongo.MongoClient(host="mongodb://" +
                                            str(db_server) + ":" +
                                            str(db_port),
                                            connect=False)

                store = Arctic(c,
                               socketTimeoutMS=socketTimeoutMS,
                               serverSelectionTimeoutMS=socketTimeoutMS)

                # Access the library
                try:
                    library = store[fname_single]

                    if start_date is None and finish_date is None:
                        item = library.read(fname_single)

                    else:
                        from arctic.date import DateRange
                        item = library.read(
                            fname_single,
                            date_range=DateRange(
                                start_date.replace(tzinfo=None),
                                finish_date.replace(tzinfo=None)))

                    c.close()

                    logger.info('Read ' + fname_single)

                    data_frame = item.data

                except Exception as e:
                    logger.warning('Library may not exist or another error: ' +
                                   fname_single + ' & message is ' + str(e))
                    data_frame = None

            elif self.path_exists(self.get_h5_filename(fname_single)):
                store = pandas.HDFStore(self.get_h5_filename(fname_single))
                data_frame = store.select("data")

                if ('intraday' in fname_single):
                    data_frame = data_frame.astype('float32')

                store.close()

            elif self.path_exists(fname_single) and '.csv' in fname_single:
                data_frame = pandas.read_csv(fname_single, index_col=0)

                data_frame.index = pandas.to_datetime(data_frame.index)

            elif self.path_exists(fname_single):
                data_frame = self.read_parquet(fname_single)
                # data_frame = pandas.read_parquet(fname_single)

            data_frame_list.append(data_frame)

        if len(data_frame_list) == 1:
            return data_frame_list[0]

        return data_frame_list
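
The Redis branch above looks up keys of the form 'comp_<size>_<fname>' and recovers the uncompressed size from the key itself. A hypothetical sketch of the matching write path (not part of this snippet; context, r and fname_single reuse the names from the reader):

# serialize the DataFrame, compress it with lz4, and encode the uncompressed
# size into the key so the reader can pass it back as decompressed_size
ser = context.serialize(data_frame).to_buffer()
comp = pa.compress(ser, codec='lz4')

r.set('comp_' + str(len(ser)) + '_' + fname_single, comp.to_pybytes())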
Example #12
def test_decompress(buf, *args, **kwargs):
    return pa.decompress(buf, *args, **kwargs)
Example #13
    def convert_binary_to_python(self, obj, key):
        if obj is None: return None

        if '_df' in key:
            if not (isinstance(obj, list)):
                obj = [obj]

            if constants.volatile_cache_redis_format == 'msgpack':

                for i in range(0, len(obj)):
                    if obj[i] is not None:
                        obj[i] = pd.read_msgpack(obj[i])

            elif constants.volatile_cache_redis_format == 'arrow':

                # If compressed, we need to know the original size to decompress it
                if '_comp' in key:
                    # Get the size of each compressed object
                    # e.g. key might be xxxx_size_354534_size_345345_endsize etc.
                    # Ignore the bit before the first '_size_' and after '_endsize'

                    start = '_size_'
                    end = '_endsizearrow_'

                    if len(obj) > 0:
                        key = self._util_func.find_sub_string_between(
                            key, start, end)
                        siz = self._util_func.keep_numbers_list(
                            key.split('_size_'))

                    for i in range(0, len(obj)):
                        if obj[i] is not None:
                            obj[i] = pa.decompress(
                                obj[i],
                                codec=constants.volatile_cache_redis_compression[
                                    constants.volatile_cache_redis_format],
                                decompressed_size=siz[i])

                            obj[i] = context.deserialize(obj[i])
                else:
                    for i in range(0, len(obj)):
                        if obj[i] is not None:
                            obj[i] = context.deserialize(obj[i])

                # Need to copy because Arrow doesn't allow writing on a DataFrame
                for i in range(0, len(obj)):
                    if obj[i] is not None:
                        obj[i] = obj[i].copy()
            else:
                raise Exception("Invalid volatile cache format specified.")

            if len(obj) == 1:
                obj = obj[0]
            elif len(obj) > 1:
                obj = pd.concat(obj)
            else:
                obj = None

        elif '_fig' in key:
            # print("--------- " + len(obj) + " ---------")
            obj = self._plotly_from_json(obj[0].decode("utf-8"))

        return obj
Example #14
    def data_sparql(self,
                    sparql,
                    start=None,
                    end=None,
                    agg=None,
                    window=None,
                    sites=None,
                    memsize=4e10):
        params = {"sparql": sparql}
        if agg is not None and window is not None:
            params["agg"] = agg
            params["window"] = window
        if start is not None:
            if isinstance(start, datetime):
                # assumes the datetime is already in UTC
                params["start"] = start.strftime("%Y-%m-%dT%H:%M:%SZ")
            else:
                params["start"] = start
        else:
            params["start"] = "1970-01-01T00:00:00Z"

        if end is not None:
            if isinstance(end, datetime):
                params["end"] = end.strftime("%Y-%m-%dT%H:%M:%SZ")
            else:
                params["end"] = end
        else:
            params["end"] = "2100-01-01T00:00:00Z"

        if sites is not None:
            params["sites"] = sites

        metadata = self.sparql(sparql, sites=sites)

        resp = requests.get(f"{self._endpoint}/query", params=params)
        if not resp.ok:
            logging.error("Error getting data %s" % resp.content)
            raise Exception(resp.content)
        # print(len(resp.content))

        buf = pa.decompress(resp.content,
                            decompressed_size=memsize,
                            codec='lz4',
                            asbytes=True)
        buf = io.BytesIO(buf)
        # # before: no compression
        # buf = io.BytesIO(resp.content)
        # read metadata first
        try:
            rdr = pa.ipc.open_stream(buf)
        except pa.ArrowInvalid as e:
            logging.error("Error deserializing metadata %s" % e)
            raise Exception(e)
        md = rdr.read_pandas()

        # then read data
        try:
            rdr = pa.ipc.open_stream(buf)
        except pa.ArrowInvalid as e:
            logging.error("Error deserializing data %s" % e)
            raise Exception(e)
        df = rdr.read_pandas()
        return Dataset(metadata, md, df)