def test_compress_decompress():
    INPUT_SIZE = 10000
    test_data = (np.random.randint(0, 255, size=INPUT_SIZE)
                 .astype(np.uint8)
                 .tobytes())  # ndarray.tostring() was removed; tobytes() is the replacement
    test_buf = pa.py_buffer(test_data)

    codecs = ['lz4', 'snappy', 'gzip', 'zstd', 'brotli']
    for codec in codecs:
        compressed_buf = pa.compress(test_buf, codec=codec)
        compressed_bytes = pa.compress(test_data, codec=codec, asbytes=True)

        assert isinstance(compressed_bytes, bytes)

        decompressed_buf = pa.decompress(compressed_buf, INPUT_SIZE,
                                         codec=codec)
        decompressed_bytes = pa.decompress(compressed_bytes, INPUT_SIZE,
                                           codec=codec, asbytes=True)

        assert isinstance(decompressed_bytes, bytes)

        assert decompressed_buf.equals(test_buf)
        assert decompressed_bytes == test_data

        # decompressed_size is required: the raw compressed bytes are not
        # self-describing
        with pytest.raises(ValueError):
            pa.decompress(compressed_bytes, codec=codec)
def test_compress_decompress(compression):
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))
    INPUT_SIZE = 10000
    test_data = (np.random.randint(0, 255, size=INPUT_SIZE)
                 .astype(np.uint8)
                 .tobytes())  # ndarray.tostring() was removed; tobytes() is the replacement
    test_buf = pa.py_buffer(test_data)

    compressed_buf = pa.compress(test_buf, codec=compression)
    compressed_bytes = pa.compress(test_data, codec=compression, asbytes=True)

    assert isinstance(compressed_bytes, bytes)

    decompressed_buf = pa.decompress(compressed_buf, INPUT_SIZE,
                                     codec=compression)
    decompressed_bytes = pa.decompress(compressed_bytes, INPUT_SIZE,
                                       codec=compression, asbytes=True)

    assert isinstance(decompressed_bytes, bytes)

    assert decompressed_buf.equals(test_buf)
    assert decompressed_bytes == test_data

    # decompressed_size is required: the raw compressed bytes are not
    # self-describing
    with pytest.raises(ValueError):
        pa.decompress(compressed_bytes, codec=compression)
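For reference, the same round trip can also be written against the pyarrow.Codec class used in the skip check above; a minimal sketch with an illustrative codec and payload:

import numpy as np
import pyarrow as pa

data = np.random.randint(0, 256, size=10000, dtype=np.uint8).tobytes()
codec = pa.Codec("gzip")  # assumes gzip support was built into pyarrow
compressed = codec.compress(data, asbytes=True)
# raw compressed bytes are not self-describing, so pass the original size
restored = codec.decompress(compressed, decompressed_size=len(data),
                            asbytes=True)
assert restored == data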
def getRedis(keyname, sessionID):
    # Look up the compressed, serialized object and its decompressed length,
    # then reverse both steps; pa.decompress needs the original length.
    key = f"{keyname}Cache{sessionID}"
    cacheDataCompress = redis.hget(key, 'inDataCompress')
    cacheDataLen = int(redis.hget(key, 'compressLength'))
    cacheSerialize = pa.decompress(cacheDataCompress,
                                   decompressed_size=cacheDataLen)
    cache = pa.deserialize(cacheSerialize)
    return cache
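The write side is not shown here; a hypothetical setRedis counterpart, assuming the same key layout and the same (since-deprecated) pa.serialize API, might look like:

def setRedis(keyname, sessionID, cache):
    # Hypothetical counterpart to getRedis above; not part of the original
    # source. Uses pa.compress's default codec ('lz4'), which matches the
    # default pa.decompress codec used in getRedis.
    key = f"{keyname}Cache{sessionID}"
    serialized = pa.serialize(cache).to_buffer()
    compressed = pa.compress(serialized, asbytes=True)
    redis.hset(key, 'inDataCompress', compressed)
    redis.hset(key, 'compressLength', serialized.size)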
def f(buf):
    if compress:
        # first deserialize the envelope around the compressed data
        l, codec, buf = pa.deserialize(buf)
        # extract the data
        buf = pa.decompress(buf, l, codec=codec)
    # deserialize the actual data
    return pa.deserialize(buf, context=context)
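A matching compress side for f would have to produce the (length, codec, payload) envelope it unpacks; a minimal hypothetical sketch, assuming the same module-level context:

def g(obj):
    # Hypothetical inverse of f above; not part of the original source.
    buf = pa.serialize(obj, context=context).to_buffer()
    compressed = pa.compress(buf, codec='lz4', asbytes=True)
    # wrap decompressed length and codec alongside the compressed bytes
    return pa.serialize((buf.size, 'lz4', compressed)).to_buffer()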
def _download_result(self):
    response = requests.get(self.BUCKET_PREFIX.format(self.id))
    response.raise_for_status()

    # codec and decompressed size travel as custom GCS metadata headers
    buffer = pa.decompress(
        response.content,
        codec=response.headers["x-goog-meta-codec"],
        decompressed_size=int(
            response.headers["x-goog-meta-decompressed_size"]),
    )
    return pa.deserialize(buffer, context=serialization_context)
def proto2object(proto: PandasDataFrame_PB) -> pd.DataFrame:
    """Proto to object conversion, used to return the desired model.

    Args:
        proto: Serialized version of the DataFrame, used for reconstruction.

    Returns:
        Re-constructed DataFrame.
    """
    buf = pa.decompress(proto.dataframe,
                        decompressed_size=proto.decompressed_size)
    return pa.ipc.open_file(buf).read_pandas()
def data_sparql(self, sparql, source=None, start=None, end=None, agg=None,
                window=None):
    params = {"sparql": sparql}
    if agg is not None and window is not None:
        params["agg"] = agg
        params["window"] = window
    if start is not None:
        if isinstance(start, datetime):
            # format as UTC ISO-8601 (datetime has no .localize() method;
            # that is a pytz method on timezone objects)
            params["start"] = start.strftime("%Y-%m-%dT%H:%M:%SZ")
        else:
            params["start"] = start
    else:
        params["start"] = "1970-01-01T00:00:00Z"
    if end is not None:
        if isinstance(end, datetime):
            params["end"] = end.strftime("%Y-%m-%dT%H:%M:%SZ")
        else:
            params["end"] = end
    else:
        params["end"] = "2100-01-01T00:00:00Z"
    if source is not None:
        params["source"] = source

    metadata = self.sparql(sparql,
                           sites=[source] if source is not None else None)
    resp = requests.get(f"{self._endpoint}/query", params=params)

    # decompressed_size must be an int; 4e10 is a generous upper bound
    buf = pa.decompress(resp.content, decompressed_size=int(4e10),
                        codec='lz4', asbytes=True)
    buf = io.BytesIO(buf)

    # read metadata first
    r = pa.ipc.open_stream(buf)
    md = r.read_pandas()
    # then read data
    r = pa.ipc.open_stream(buf)
    df = r.read_pandas()

    return Dataset(metadata, md, df)
def proto2object(proto: PandasSeries_PB) -> pd.Series:
    """Convert PandasSeries_PB to pd.Series with pyarrow.

    Args:
        proto: Serialized version of the Series, used for reconstruction.

    Returns:
        Re-constructed Series.
    """
    buf = pa.decompress(proto.series,
                        decompressed_size=proto.decompressed_size)
    dataframe = pa.ipc.open_file(buf).read_pandas()
    # the Series is stored as a single-column DataFrame, so grab the
    # first column
    return dataframe[dataframe.columns[0]]
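The decompress-then-open_file pattern above has a natural inverse; a hypothetical object2proto-style sketch (function name and return shape are illustrative, not the library's API):

def object2proto_sketch(series: pd.Series):
    # Hypothetical inverse of proto2object above; not part of the original
    # source. Store the Series as a single-column DataFrame in an Arrow IPC
    # file, then compress it, keeping the decompressed size for later.
    table = pa.Table.from_pandas(series.to_frame())
    sink = pa.BufferOutputStream()
    with pa.ipc.new_file(sink, table.schema) as writer:
        writer.write_table(table)
    buf = sink.getvalue()
    return pa.compress(buf, asbytes=True), buf.size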
def data_uris(self, uris, start=None, end=None, agg=None, window=None):
    parts = []
    if start is not None:
        if isinstance(start, datetime):
            # format as UTC ISO-8601 (datetime has no .localize() method)
            parts.append(f"start={start.strftime('%Y-%m-%dT%H:%M:%SZ')}")
        else:
            parts.append(f"start={start}")
    else:
        parts.append("start=1970-01-01T00:00:00Z")
    for uri in uris:
        uri = urllib.parse.quote_plus(uri)
        parts.append(f"uri={uri}")
    query_string = "&".join(parts)

    if agg is not None and window is not None:
        resp = requests.get(
            f"{self._endpoint}/query?{query_string}&agg={agg}&window={window}")
    else:
        resp = requests.get(f"{self._endpoint}/query?{query_string}")
    if not resp.ok:
        logging.error("Error getting data %s" % resp.content)
        raise Exception(resp.content)

    # decompressed_size must be an int; 4e10 is a generous upper bound
    buf = pa.decompress(resp.content, decompressed_size=int(4e10),
                        codec='lz4', asbytes=True)
    buf = io.BytesIO(buf)

    # read metadata first
    try:
        r = pa.ipc.open_stream(buf)
    except pa.ArrowInvalid as e:
        logging.error("Error deserializing metadata %s" % e)
        raise Exception(e)
    md = r.read_pandas()
    # then read data
    try:
        r = pa.ipc.open_stream(buf)
    except pa.ArrowInvalid as e:
        logging.error("Error deserializing data %s" % e)
        raise Exception(e)
    df = r.read_pandas()

    return Dataset(None, md, df)
def inspect(self, obj, timeout=30, format="pyarrow", format_options=None,
            **params):
    graft = obj.graft
    params_dict = parameters_to_grafts(**params)

    # TODO little dumb to have to serialize the typespec just to get the
    # unmarshal name; EC-300 plz
    typespec = serialize_typespec(type(obj))
    result_type = typespec_to_unmarshal_str(typespec)
    # ^ this also preemptively checks whether the result type is something
    # we'll know how to unmarshal

    mimetype = format_to_mimetype(format, format_options=format_options)

    # TODO stream=True, use resp.raw and stream through pyarrow?
    try:
        resp = self.session.post(
            "/inspect",
            json={"graft": graft, "parameters": params_dict},
            timeout=timeout,
            headers={"Accept": mimetype},
        )
    except requests.exceptions.Timeout as e:
        raise JobTimeoutError(e) from None

    if resp.headers["content-type"] == "application/vnd.pyarrow":
        # codec and decompressed size travel in custom response headers
        buffer = pa.decompress(
            resp.content,
            codec=resp.headers["X-Arrow-Codec"],
            decompressed_size=int(resp.headers["X-Decompressed-Size"]),
        )
        marshalled = pa.deserialize(buffer, context=serialization_context)
        return unmarshal.unmarshal(result_type, marshalled)
    elif resp.headers["content-type"] == "application/json":
        return json.loads(resp.content)
    else:
        # return the raw data
        return resp.content
def read_time_series_cache_from_disk(self, fname, engine='hdf5',
                                     start_date=None, finish_date=None,
                                     db_server=constants.db_server,
                                     db_port=constants.db_port,
                                     username=constants.db_username,
                                     password=constants.db_password):
    """Reads time series cache from disk in either HDF5 or bcolz

    Parameters
    ----------
    fname : str (or list)
        file to be read from
    engine : str (optional)
        'hdf5' - reads HDF5 files (default)
        'arctic' - reads from Arctic/MongoDB database
        'bcolz' - reads from bcolz file (not fully implemented)
        'parquet' - reads from Parquet
        'redis' - reads from a Redis cache
    start_date : str/datetime (optional)
        Start date
    finish_date : str/datetime (optional)
        Finish date
    db_server : str
        IP address of MongoDB (default '127.0.0.1')

    Returns
    -------
    DataFrame
    """
    logger = LoggerManager.getLogger(__name__)

    data_frame_list = []

    if not isinstance(fname, list):
        if '*' in fname:
            fname = glob.glob(fname)
        else:
            fname = [fname]

    for fname_single in fname:
        logger.debug("Reading " + fname_single + "..")

        if engine == 'parquet' and '.gzip' not in fname_single \
                and '.parquet' not in fname_single:
            fname_single = fname_single + '.parquet'

        if engine == 'bcolz':
            try:
                name = self.get_bcolz_filename(fname_single)
                zlens = bcolz.open(rootdir=name)
                data_frame = zlens.todataframe()

                data_frame.index = pandas.DatetimeIndex(data_frame['DTS_'])
                data_frame.index.name = 'Date'
                del data_frame['DTS_']

                # convert invalid characters (which bcolz can't deal with)
                # to more readable characters for pandas
                data_frame.columns = self.find_replace_chars(
                    data_frame.columns, _replace_chars, _invalid_chars)
                data_frame.columns = [x[2:] for x in data_frame.columns]
            except Exception:
                data_frame = None
        elif engine == 'redis':
            fname_single = os.path.basename(fname_single).replace('.', '_')

            msg = None

            try:
                # for pyarrow
                context = pa.default_serialization_context()

                r = redis.StrictRedis(host=db_server, port=db_port, db=0)

                # is there a compressed key stored?
                k = r.keys('comp_*_' + fname_single)

                # if so, we stored it as a compressed object; if there is
                # more than one element, take the last (the latest added)
                if len(k) >= 1:
                    k = k[-1].decode('utf-8')
                    comp = r.get(k)

                    # the decompressed size is encoded in the key name
                    siz = int(k.split('_')[1])
                    dec = pa.decompress(comp, codec='lz4',
                                        decompressed_size=siz)

                    msg = context.deserialize(dec)
                else:
                    msg = r.get(fname_single)

                    if msg is not None:
                        msg = context.deserialize(msg)
            except Exception as e:
                logger.info("Cache not existent for " + fname_single +
                            " in Redis: " + str(e))

            if msg is None:
                data_frame = None
            else:
                logger.info('Load Redis cache: ' + fname_single)
                data_frame = msg
        elif engine == 'arctic':
            socketTimeoutMS = 2 * 1000

            import pymongo
            from arctic import Arctic

            fname_single = os.path.basename(fname_single).replace('.', '_')
            logger.info('Load Arctic/MongoDB library: ' + fname_single)

            if username is not None and password is not None:
                c = pymongo.MongoClient(
                    host="mongodb://" + username + ":" + password + "@" +
                         str(db_server) + ":" + str(db_port),
                    connect=False)
            else:
                c = pymongo.MongoClient(
                    host="mongodb://" + str(db_server) + ":" + str(db_port),
                    connect=False)

            store = Arctic(c, socketTimeoutMS=socketTimeoutMS,
                           serverSelectionTimeoutMS=socketTimeoutMS)

            # access the library
            try:
                library = store[fname_single]

                if start_date is None and finish_date is None:
                    item = library.read(fname_single)
                else:
                    from arctic.date import DateRange
                    item = library.read(
                        fname_single,
                        date_range=DateRange(
                            start_date.replace(tzinfo=None),
                            finish_date.replace(tzinfo=None)))

                c.close()
                logger.info('Read ' + fname_single)

                data_frame = item.data
            except Exception as e:
                logger.warning('Library may not exist or another error: ' +
                               fname_single + ' & message is ' + str(e))
                data_frame = None
        elif self.path_exists(self.get_h5_filename(fname_single)):
            store = pandas.HDFStore(self.get_h5_filename(fname_single))
            data_frame = store.select("data")

            if 'intraday' in fname_single:
                data_frame = data_frame.astype('float32')

            store.close()
        elif self.path_exists(fname_single) and '.csv' in fname_single:
            data_frame = pandas.read_csv(fname_single, index_col=0)
            data_frame.index = pd.to_datetime(data_frame.index)
        elif self.path_exists(fname_single):
            data_frame = self.read_parquet(fname_single)

        data_frame_list.append(data_frame)

    if len(data_frame_list) == 1:
        return data_frame_list[0]

    return data_frame_list
def test_decompress(buf, *args, **kwargs):
    return pa.decompress(buf, *args, **kwargs)
def convert_binary_to_python(self, obj, key):
    if obj is None:
        return None

    if '_df' in key:
        if not isinstance(obj, list):
            obj = [obj]

        if constants.volatile_cache_redis_format == 'msgpack':
            for i in range(0, len(obj)):
                if obj[i] is not None:
                    obj[i] = pd.read_msgpack(obj[i])
        elif constants.volatile_cache_redis_format == 'arrow':
            # if compressed, we need to know the size to decompress it
            if '_comp' in key:
                # get the size of each compressed object from the key name,
                # eg. the key might be xxxx_size_354534_size_345345_endsize;
                # ignore the bit before the first '_size_' and after
                # '_endsize'
                start = '_size_'
                end = '_endsizearrow_'

                if len(obj) > 0:
                    key = self._util_func.find_sub_string_between(
                        key, start, end)

                siz = self._util_func.keep_numbers_list(key.split('_size_'))

                for i in range(0, len(obj)):
                    if obj[i] is not None:
                        obj[i] = pa.decompress(
                            obj[i],
                            codec=constants.volatile_cache_redis_compression[
                                constants.volatile_cache_redis_format],
                            decompressed_size=siz[i])
                        obj[i] = context.deserialize(obj[i])
            else:
                for i in range(0, len(obj)):
                    if obj[i] is not None:
                        obj[i] = context.deserialize(obj[i])

            # need to copy because Arrow doesn't allow writing on a DataFrame
            for i in range(0, len(obj)):
                if obj[i] is not None:
                    obj[i] = obj[i].copy()
        else:
            raise Exception("Invalid volatile cache format specified.")

        if len(obj) == 1:
            obj = obj[0]
        elif len(obj) > 1:
            obj = pd.concat(obj)
        else:
            obj = None
    elif '_fig' in key:
        obj = self._plotly_from_json(obj[0].decode("utf-8"))

    return obj
def data_sparql(self, sparql, start=None, end=None, agg=None, window=None,
                sites=None, memsize=4e10):
    params = {"sparql": sparql}
    if agg is not None and window is not None:
        params["agg"] = agg
        params["window"] = window
    if start is not None:
        if isinstance(start, datetime):
            # format as UTC ISO-8601 (datetime has no .localize() method)
            params["start"] = start.strftime("%Y-%m-%dT%H:%M:%SZ")
        else:
            params["start"] = start
    else:
        params["start"] = "1970-01-01T00:00:00Z"
    if end is not None:
        if isinstance(end, datetime):
            params["end"] = end.strftime("%Y-%m-%dT%H:%M:%SZ")
        else:
            params["end"] = end
    else:
        params["end"] = "2100-01-01T00:00:00Z"
    if sites is not None:
        params["sites"] = sites

    metadata = self.sparql(sparql, sites=sites)
    resp = requests.get(f"{self._endpoint}/query", params=params)
    if not resp.ok:
        logging.error("Error getting data %s" % resp.content)
        raise Exception(resp.content)

    # memsize is an upper bound on the decompressed payload;
    # decompressed_size must be an int
    buf = pa.decompress(resp.content, decompressed_size=int(memsize),
                        codec='lz4', asbytes=True)
    buf = io.BytesIO(buf)

    # read metadata first
    try:
        rdr = pa.ipc.open_stream(buf)
    except pa.ArrowInvalid as e:
        logging.error("Error deserializing metadata %s" % e)
        raise Exception(e)
    md = rdr.read_pandas()
    # then read data
    try:
        rdr = pa.ipc.open_stream(buf)
    except pa.ArrowInvalid as e:
        logging.error("Error deserializing data %s" % e)
        raise Exception(e)
    df = rdr.read_pandas()

    return Dataset(metadata, md, df)
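The two back-to-back pa.ipc.open_stream calls above imply the server concatenates two Arrow IPC streams (metadata, then data) into one LZ4-compressed payload; a hypothetical sketch of that producing side (all names are assumptions, not the service's actual code):

def encode_response_sketch(md_df, data_df):
    # Hypothetical producer for the payload the clients above consume:
    # write two consecutive Arrow IPC streams into one sink, then
    # LZ4-compress the whole thing as a single blob.
    sink = pa.BufferOutputStream()
    for df in (md_df, data_df):
        table = pa.Table.from_pandas(df)
        with pa.ipc.new_stream(sink, table.schema) as writer:
            writer.write_table(table)
    payload = sink.getvalue()
    return pa.compress(payload, codec='lz4', asbytes=True), payload.size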