def get_coldata(coldata):
    """ return values and rowmask """
    dtype = np.dtype(coldata[DTYPE])
    values = np.frombuffer(decompress(coldata[DATA]), dtype=dtype)
    rowmask = np.unpackbits(np.frombuffer(decompress(coldata[ROWMASK]), dtype='uint8'))
    return list(values), list(rowmask)

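# A minimal, self-contained sketch of the column layout that get_coldata() decodes:
# DATA holds the column values and ROWMASK a bit-packed presence mask, both
# lz4-compressed. The plain string keys and the lz4.block codec are assumptions
# standing in for the module constants and the compress/decompress helpers used above.
def sketch_coldata_roundtrip():
    import numpy as np
    from lz4.block import compress, decompress

    values = np.array([120, 122, 3], dtype='int64')
    rowmask_bits = np.array([1, 1, 1, 0, 0, 0, 0, 0], dtype='uint8')
    coldata = {
        'DTYPE': 'int64',
        'DATA': compress(values.tobytes()),
        'ROWMASK': compress(np.packbits(rowmask_bits).tobytes()),
    }

    # decode exactly as get_coldata() does
    decoded = np.frombuffer(decompress(coldata['DATA']), dtype=np.dtype(coldata['DTYPE']))
    mask = np.unpackbits(np.frombuffer(decompress(coldata['ROWMASK']), dtype='uint8'))
    assert list(decoded) == [120, 122, 3]
    assert list(mask) == [1, 1, 1, 0, 0, 0, 0, 0]
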
def _read_bucket(self, doc, column_set, column_dtypes, include_symbol, include_images, columns):
    rtn = {}
    if doc[VERSION] != 3:
        raise ArcticException("Unhandled document version: %s" % doc[VERSION])
    # np.cumsum copies the read-only array created with frombuffer
    rtn[INDEX] = np.cumsum(np.frombuffer(decompress(doc[INDEX]), dtype='uint64'))
    doc_length = len(rtn[INDEX])
    column_set.update(doc[COLUMNS].keys())

    # get the mask for the columns we're about to load
    union_mask = np.zeros((doc_length + 7) // 8, dtype='uint8')
    for c in column_set:
        try:
            coldata = doc[COLUMNS][c]
            # the 'or' below will make a copy of this read-only array
            mask = np.frombuffer(decompress(coldata[ROWMASK]), dtype='uint8')
            union_mask = union_mask | mask
        except KeyError:
            rtn[c] = None
    union_mask = np.unpackbits(union_mask)[:doc_length].astype('bool')
    rtn_length = np.sum(union_mask)

    rtn[INDEX] = rtn[INDEX][union_mask]
    if include_symbol:
        rtn['SYMBOL'] = [doc[SYMBOL], ] * rtn_length

    # Unpack each requested column in turn
    for c in column_set:
        try:
            coldata = doc[COLUMNS][c]
            dtype = np.dtype(coldata[DTYPE])
            # values ends up being copied by pandas before being returned to the user.
            # However, we copy it into a bytearray here for safety.
            values = np.frombuffer(bytearray(decompress(coldata[DATA])), dtype=dtype)
            self._set_or_promote_dtype(column_dtypes, c, dtype)
            rtn[c] = self._empty(rtn_length, dtype=column_dtypes[c])
            # unpackbits will make a copy of the read-only array created by frombuffer
            rowmask = np.unpackbits(np.frombuffer(decompress(coldata[ROWMASK]),
                                                  dtype='uint8'))[:doc_length].astype('bool')
            rowmask = rowmask[union_mask]
            rtn[c][rowmask] = values
        except KeyError:
            rtn[c] = None

    if include_images and doc.get(IMAGE_DOC, {}).get(IMAGE, {}):
        rtn = self._prepend_image(rtn, doc[IMAGE_DOC], rtn_length, column_dtypes, column_set, columns)

    return rtn

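# The read-only comments above rely on a numpy detail worth spelling out:
# np.frombuffer over immutable bytes yields a non-writeable view, while wrapping
# the payload in a bytearray first yields a writeable copy. A small sketch:
def sketch_frombuffer_writeability():
    import numpy as np

    raw = (123).to_bytes(8, 'little')
    readonly = np.frombuffer(raw, dtype='uint64')
    assert not readonly.flags.writeable        # in-place assignment would raise ValueError

    writeable = np.frombuffer(bytearray(raw), dtype='uint64')
    assert writeable.flags.writeable
    writeable[0] = 456                         # safe: backed by the mutable bytearray copy
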
def test_exceptions():
    data = c.compress(b'1010101010100000000000000000000000000000000000000000000000000000000011111111111111111111111111111')
    data = data[0:16]
    with pytest.raises(Exception) as e:
        c.decompress(data)
    assert ("decompressor wrote" in str(e.value).lower()
            or "corrupt input at" in str(e.value).lower()
            or "decompression failed: corrupt input" in str(e.value).lower())

    data = c.compress(b'1010101010100000000000000000000000000000000000000000000000000000000011111111111111111111111111111')
    data = [data[0:16] for x in (1, 2, 3)]
    with pytest.raises(Exception) as e:
        c.decompress_array(data)
    assert ("decompressor wrote" in str(e.value).lower()
            or "corrupt input at" in str(e.value).lower()
            or "decompression failed: corrupt input" in str(e.value).lower())

def test_exceptions():
    data = c.compress(b'1010101010100000000000000000000000000000000000000000000000000000000011111111111111111111111111111')
    data = data[0:16]
    with pytest.raises(Exception) as e:
        c.decompress(data)
    assert ("decompressor wrote" in str(e).lower()
            or "corrupt input at" in str(e).lower()
            or "decompression failed: corrupt input" in str(e).lower())

    data = c.compress(b'1010101010100000000000000000000000000000000000000000000000000000000011111111111111111111111111111')
    data = [data[0:16] for x in (1, 2, 3)]
    with pytest.raises(Exception) as e:
        c.decompress_array(data)
    assert ("decompressor wrote" in str(e).lower()
            or "corrupt input at" in str(e).lower()
            or "decompression failed: corrupt input" in str(e).lower())

def _read_bucket(self, doc, column_set, column_dtypes, include_symbol, include_images, columns):
    rtn = {}
    if doc[VERSION] != 3:
        raise ArcticException("Unhandled document version: %s" % doc[VERSION])
    rtn[INDEX] = np.cumsum(np.frombuffer(decompress(doc[INDEX]), dtype='uint64'))
    doc_length = len(rtn[INDEX])
    column_set.update(doc[COLUMNS].keys())

    # get the mask for the columns we're about to load
    union_mask = np.zeros((doc_length + 7) // 8, dtype='uint8')
    for c in column_set:
        try:
            coldata = doc[COLUMNS][c]
            mask = np.frombuffer(decompress(coldata[ROWMASK]), dtype='uint8')
            union_mask = union_mask | mask
        except KeyError:
            rtn[c] = None
    union_mask = np.unpackbits(union_mask)[:doc_length].astype('bool')
    rtn_length = np.sum(union_mask)

    rtn[INDEX] = rtn[INDEX][union_mask]
    if include_symbol:
        rtn['SYMBOL'] = [doc[SYMBOL], ] * rtn_length

    # Unpack each requested column in turn
    for c in column_set:
        try:
            coldata = doc[COLUMNS][c]
            dtype = np.dtype(coldata[DTYPE])
            values = np.frombuffer(decompress(coldata[DATA]), dtype=dtype)
            self._set_or_promote_dtype(column_dtypes, c, dtype)
            rtn[c] = self._empty(rtn_length, dtype=column_dtypes[c])
            rowmask = np.unpackbits(np.frombuffer(decompress(coldata[ROWMASK]),
                                                  dtype='uint8'))[:doc_length].astype('bool')
            rowmask = rowmask[union_mask]
            rtn[c][rowmask] = values
        except KeyError:
            rtn[c] = None

    if include_images and doc.get(IMAGE_DOC, {}).get(IMAGE, {}):
        rtn = self._prepend_image(rtn, doc[IMAGE_DOC], rtn_length, column_dtypes, column_set, columns)

    return rtn

def test_tickstore_pandas_to_bucket_image():
    symbol = 'SYM'
    tz = 'UTC'
    initial_image = {'index': dt(2014, 1, 1, 0, 0, tzinfo=mktz(tz)), 'A': 123, 'B': 54.4, 'C': 'DESC'}
    data = [{'A': 120, 'D': 1}, {'A': 122, 'B': 2.0}, {'A': 3, 'B': 3.0, 'D': 1}]
    tick_index = [dt(2014, 1, 2, 0, 0, tzinfo=mktz(tz)),
                  dt(2014, 1, 3, 0, 0, tzinfo=mktz(tz)),
                  dt(2014, 1, 4, 0, 0, tzinfo=mktz(tz))]
    data = pd.DataFrame(data, index=tick_index)
    bucket, final_image = TickStore._pandas_to_bucket(data, symbol, initial_image)
    assert final_image == {'index': dt(2014, 1, 4, 0, 0, tzinfo=mktz(tz)), 'A': 3, 'B': 3.0, 'C': 'DESC', 'D': 1}
    assert IMAGE_DOC in bucket
    assert bucket[COUNT] == 3
    assert bucket[START] == dt(2014, 1, 1, 0, 0, tzinfo=mktz(tz))
    assert bucket[END] == dt(2014, 1, 4, 0, 0, tzinfo=mktz(tz))
    assert set(bucket[COLUMNS]) == set(('A', 'B', 'D'))
    assert set(bucket[COLUMNS]['A']) == set((ROWMASK, DTYPE, DATA))
    assert get_coldata(bucket[COLUMNS]['A']) == ([120, 122, 3], [1, 1, 1, 0, 0, 0, 0, 0])
    values, rowmask = get_coldata(bucket[COLUMNS]['B'])
    assert np.isnan(values[0]) and values[1:] == [2.0, 3.0]
    assert rowmask == [1, 1, 1, 0, 0, 0, 0, 0]
    values, rowmask = get_coldata(bucket[COLUMNS]['D'])
    assert np.isnan(values[1])
    assert values[0] == 1 and values[2] == 1
    assert rowmask == [1, 1, 1, 0, 0, 0, 0, 0]
    index = [dt.fromtimestamp(int(i / 1000)).replace(tzinfo=mktz(tz))
             for i in list(np.cumsum(np.frombuffer(decompress(bucket[INDEX]), dtype='uint64')))]
    assert index == tick_index
    assert bucket[COLUMNS]['A'][DTYPE] == 'int64'
    assert bucket[COLUMNS]['B'][DTYPE] == 'float64'
    assert bucket[SYMBOL] == symbol
    assert bucket[IMAGE_DOC] == {IMAGE: initial_image, IMAGE_TIME: initial_image['index']}

def test_tickstore_to_bucket_with_image():
    symbol = 'SYM'
    tz = 'UTC'
    initial_image = {'index': dt(2014, 1, 1, 0, 0, tzinfo=mktz(tz)), 'A': 123, 'B': 54.4, 'C': 'DESC'}
    data = [{'index': dt(2014, 1, 1, 0, 1, tzinfo=mktz(tz)), 'A': 124, 'D': 0},
            {'index': dt(2014, 1, 1, 0, 2, tzinfo=mktz(tz)), 'A': 125, 'B': 27.2}]
    bucket, final_image = TickStore._to_bucket(data, symbol, initial_image)
    assert bucket[COUNT] == 2
    assert bucket[END] == dt(2014, 1, 1, 0, 2, tzinfo=mktz(tz))
    assert set(bucket[COLUMNS]) == set(('A', 'B', 'D'))
    assert set(bucket[COLUMNS]['A']) == set((ROWMASK, DTYPE, DATA))
    assert get_coldata(bucket[COLUMNS]['A']) == ([124, 125], [1, 1, 0, 0, 0, 0, 0, 0])
    assert get_coldata(bucket[COLUMNS]['B']) == ([27.2], [0, 1, 0, 0, 0, 0, 0, 0])
    assert get_coldata(bucket[COLUMNS]['D']) == ([0], [1, 0, 0, 0, 0, 0, 0, 0])
    index = [dt.fromtimestamp(int(i / 1000)).replace(tzinfo=mktz(tz))
             for i in list(np.cumsum(np.frombuffer(decompress(bucket[INDEX]), dtype='uint64')))]
    assert index == [i['index'] for i in data]
    assert bucket[COLUMNS]['A'][DTYPE] == 'int64'
    assert bucket[COLUMNS]['B'][DTYPE] == 'float64'
    assert bucket[SYMBOL] == symbol
    assert bucket[START] == initial_image['index']
    assert bucket[IMAGE_DOC][IMAGE] == initial_image
    assert bucket[IMAGE_DOC] == {IMAGE: initial_image, IMAGE_TIME: initial_image['index']}
    assert final_image == {'index': data[-1]['index'], 'A': 125, 'B': 27.2, 'C': 'DESC', 'D': 0}

def _do_read(self, backing_store, library_name, version, symbol, index_range=None):
    '''
    index_range is a 2-tuple of integers - a [from, to) range of segments to be read.
    Either from or to can be None, indicating no bound.
    '''
    from_index = index_range[0] if index_range else None
    to_index = version['up_to']
    if index_range and index_range[1] and index_range[1] < version['up_to']:
        to_index = index_range[1]
    segment_keys = version['segment_keys']
    filtered_segment_keys = []
    for i, segment_index in enumerate(version['raw_segment_index']):
        if (from_index is None or segment_index >= from_index) and \
                (to_index is None or segment_index <= to_index):
            filtered_segment_keys.append(segment_keys[i])
    data = bytearray()
    for segment in backing_store.read_segments(library_name, filtered_segment_keys):
        data.extend(decompress(segment))
    dtype = self._dtype(version['dtype'], version.get('dtype_metadata', {}))
    rtn = np.frombuffer(data, dtype=dtype).reshape(version.get('shape', (-1)))
    return rtn

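# The reassembly step above (decompress each segment, extend one bytearray, then a
# single frombuffer) can be shown in isolation. A sketch assuming lz4.block as the
# codec and two int64 segments in place of backing_store.read_segments():
def sketch_segment_reassembly():
    import numpy as np
    from lz4.block import compress, decompress

    segments = [compress(np.arange(0, 5, dtype='int64').tobytes()),
                compress(np.arange(5, 10, dtype='int64').tobytes())]

    data = bytearray()
    for segment in segments:
        data.extend(decompress(segment))

    assert np.frombuffer(data, dtype='int64').tolist() == list(range(10))
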
def test_exceptions():
    data = c.compress(b'1010101010100000000000000000000000000000000000000000000000000000000011111111111111111111111111111')
    data = data[0:16]
    with pytest.raises(Exception) as e:
        c.decompress(data)
    assert ("Decompressor wrote" in str(e) or "Corrupt input at" in str(e))

    data = c.compress(b'1010101010100000000000000000000000000000000000000000000000000000000011111111111111111111111111111')
    data = [data[0:16] for x in (1, 2, 3)]
    with pytest.raises(Exception) as e:
        c.decompress_array(data)
    assert ("Decompressor wrote" in str(e) or "Corrupt input at" in str(e))

def test_compress_decompress_no_parallel():
    with patch('arctic._compression.clz4', sentinel.clz4), \
            patch('arctic._compression.ENABLE_PARALLEL', False), \
            patch('arctic._compression.lz4', wraps=lz4) as patch_lz4:
        # patching clz4 with sentinel will make accessing any clz4 function explode
        assert decompress(compress(b'Foo')) == b'Foo'
        assert patch_lz4.compress.call_args_list == [call(b'Foo')]
        assert patch_lz4.decompress.call_args_list == [call(compress(b'Foo'))]

def test_compress_decompress_no_parallel():
    with patch("arctic._compression.clz4", sentinel.clz4), patch("arctic._compression.ENABLE_PARALLEL", False), patch(
        "arctic._compression.lz4", wraps=lz4
    ) as patch_lz4:
        # patching clz4 with sentinel will make accessing any clz4 function explode
        assert decompress(compress("Foo")) == "Foo"
        assert patch_lz4.compress.call_args_list == [call("Foo")]
        assert patch_lz4.decompress.call_args_list == [call(compress("Foo"))]

def _read_bucket(self, doc, column_set, column_dtypes, include_symbol, include_images, columns):
    rtn = {}
    if doc[VERSION] != 3:
        raise ArcticException("Unhandled document version: %s" % doc[VERSION])
    rtn[INDEX] = np.cumsum(np.fromstring(decompress(doc[INDEX]), dtype='uint64'))
    doc_length = len(rtn[INDEX])
    column_set.update(doc[COLUMNS].keys())

    # get the mask for the columns we're about to load
    union_mask = np.zeros((doc_length + 7) // 8, dtype='uint8')
    for c in column_set:
        try:
            coldata = doc[COLUMNS][c]
            mask = np.fromstring(decompress(coldata[ROWMASK]), dtype='uint8')
            union_mask = union_mask | mask
        except KeyError:
            rtn[c] = None
    union_mask = np.unpackbits(union_mask)[:doc_length].astype('bool')
    rtn_length = np.sum(union_mask)

    rtn[INDEX] = rtn[INDEX][union_mask]
    if include_symbol:
        rtn['SYMBOL'] = [doc[SYMBOL], ] * rtn_length

    # Unpack each requested column in turn
    for c in column_set:
        try:
            coldata = doc[COLUMNS][c]
            dtype = np.dtype(coldata[DTYPE])
            values = np.fromstring(decompress(coldata[DATA]), dtype=dtype)
            self._set_or_promote_dtype(column_dtypes, c, dtype)
            rtn[c] = self._empty(rtn_length, dtype=column_dtypes[c])
            rowmask = np.unpackbits(np.fromstring(decompress(coldata[ROWMASK]),
                                                  dtype='uint8'))[:doc_length].astype('bool')
            rowmask = rowmask[union_mask]
            rtn[c][rowmask] = values
        except KeyError:
            rtn[c] = None

    if include_images and doc.get(IMAGE_DOC, {}).get(IMAGE, {}):
        rtn = self._prepend_image(rtn, doc[IMAGE_DOC], rtn_length, column_dtypes, column_set, columns)

    return rtn

def test_performance_sequential(n, length):
    _str = random_string(length)
    _strarr = [_str for _ in range(n)]
    now = dt.now()
    [c.decompress(y) for y in [c.compressHC(x) for x in _strarr]]
    clz4_time = (dt.now() - now).total_seconds()
    now = dt.now()
    c.decompress_array(c.compressHC_array(_strarr))
    clz4_time_p = (dt.now() - now).total_seconds()
    now = dt.now()
    [lz4_decompress(y) for y in [lz4_compress(x) for x in _strarr]]
    lz4_time = (dt.now() - now).total_seconds()
    print()
    print("LZ4 Test %sx len:%s" % (n, length))
    print("  LZ4 HC %s s" % clz4_time)
    print("  LZ4 HC Parallel %s s" % clz4_time_p)
    print("  LZ4 %s s" % lz4_time)

def _segment_index(self, recarr, existing_index, start, new_segments):
    """
    Generate index of datetime64 -> item offset.

    Parameters:
    -----------
    recarr: new data being written (or appended)
    existing_index: index field from the versions document of the previous version
    start: first (0-based) offset of the new data
    new_segments: list of offsets. Each offset is the row index of the last row
                  of a particular chunk relative to the start of the _original_ item.
                  array(new_data) - segments = array(offsets in item)

    Returns:
    --------
    Binary(compress(array([(index, datetime)])))
    where index is the 0-based index of the datetime in the DataFrame
    """
    # find the index of the first datetime64 column
    idx_col = self._datetime64_index(recarr)
    # if one exists let's create the index on it
    if idx_col is not None:
        new_segments = np.array(new_segments, dtype='i8')
        last_rows = recarr[new_segments - start]
        # create numpy index
        index = np.core.records.fromarrays([last_rows[idx_col]] + [new_segments, ], dtype=INDEX_DTYPE)
        # append to existing index if it exists
        if existing_index:
            # existing_index_arr is read-only but it's never written to
            existing_index_arr = np.frombuffer(decompress(existing_index), dtype=INDEX_DTYPE)
            if start > 0:
                existing_index_arr = existing_index_arr[existing_index_arr['index'] < start]
            index = np.concatenate((existing_index_arr, index))
        return Binary(compress(index.tostring()))
    elif existing_index:
        raise ArcticException("Could not find datetime64 index in item but existing data contains one")
    return None

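# A sketch of the (datetime, offset) index round-trip that _segment_index() writes
# and the readers below decode. The two-field dtype is an assumption standing in
# for the library's INDEX_DTYPE, and lz4.block stands in for compress/decompress:
def sketch_segment_index_roundtrip():
    import numpy as np
    from lz4.block import compress, decompress

    SKETCH_INDEX_DTYPE = np.dtype([('datetime', 'M8[ns]'), ('index', 'i8')])

    index = np.empty(2, dtype=SKETCH_INDEX_DTYPE)
    index['datetime'] = np.array(['2014-01-02', '2014-01-03'], dtype='M8[ns]')
    index['index'] = [99, 199]              # last row offset covered by each chunk

    blob = compress(index.tobytes())        # the bytes that get wrapped in Binary(...) above
    restored = np.frombuffer(decompress(blob), dtype=SKETCH_INDEX_DTYPE)
    assert restored['index'].tolist() == [99, 199]
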
def _index_range(self, version, symbol, date_range=None, **kwargs):
    """
    Given a version, read the segment_index and return the chunks associated
    with the date_range. As the segment index is (id -> last datetime)
    we need to take care in choosing the correct chunks.
    """
    if date_range and 'segment_index' in version:
        # index is read-only but it's never written to
        index = np.frombuffer(decompress(version['segment_index']), dtype=INDEX_DTYPE)
        dtcol = self._datetime64_index(index)
        if dtcol and len(index):
            dts = index[dtcol]
            start, end = _start_end(date_range, dts)
            if start > dts[-1]:
                return -1, -1
            idxstart = min(np.searchsorted(dts, start), len(dts) - 1)
            idxend = min(np.searchsorted(dts, end, side='right'), len(dts) - 1)
            return int(index['index'][idxstart]), int(index['index'][idxend] + 1)
    return super(PandasStore, self)._index_range(version, symbol, **kwargs)

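# How the searchsorted calls above map a date range onto chunk offsets, sketched
# with plain arrays instead of the compressed structured index (values assumed):
def sketch_index_range_lookup():
    import numpy as np

    dts = np.array(['2014-01-02', '2014-01-05', '2014-01-09'], dtype='M8[ns]')  # last datetime per chunk
    offsets = np.array([99, 199, 299])                                          # last row offset per chunk

    start = np.datetime64('2014-01-03', 'ns')
    end = np.datetime64('2014-01-06', 'ns')

    idxstart = min(np.searchsorted(dts, start), len(dts) - 1)
    idxend = min(np.searchsorted(dts, end, side='right'), len(dts) - 1)
    # rows [offsets[idxstart], offsets[idxend] + 1) cover the queried range
    assert (int(offsets[idxstart]), int(offsets[idxend] + 1)) == (199, 300)
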
def test_roundtrip_multi(n):
    _str = random_string(n)
    cstr = c.compress(_str)
    assert _str == c.decompress(cstr)

def test_decompress():
    assert decompress(compress(b"foo")) == b"foo"

def test_decompress():
    assert decompress(compress("foo")) == "foo"

def test_roundtrip(compress):
    _str = b"hello world"
    cstr = compress(_str)
    assert _str == c.decompress(cstr)

def test_compress_empty_string():
    assert decompress(compress(b'')) == b''
