def _pandas_to_bucket(self, df, symbol):
    start = to_dt(df.index[0].to_datetime())
    end = to_dt(df.index[-1].to_datetime())  # bucket end is the last row's timestamp
    rtn = {START: start, END: end, SYMBOL: symbol}
    rtn[VERSION] = CHUNK_VERSION_NUMBER
    rtn[COUNT] = len(df)
    rtn[COLUMNS] = {}
    logger.warn("NB treating all values as 'exists' - no longer sparse")
    rowmask = Binary(lz4.compressHC(np.packbits(np.ones(len(df), dtype="uint8"))))

    recs = df.to_records(convert_datetime64=False)
    for col in df:
        array = self._ensure_supported_dtypes(recs[col])
        col_data = {}
        col_data[DATA] = Binary(lz4.compressHC(array.tostring()))
        col_data[ROWMASK] = rowmask
        col_data[DTYPE] = self._str_dtype(array.dtype)
        rtn[COLUMNS][col] = col_data
    # Index is delta-encoded: first timestamp in ms, then successive diffs, lz4-compressed.
    rtn[INDEX] = Binary(lz4.compressHC(np.concatenate(
        ([recs["index"][0].astype("datetime64[ms]").view("uint64")],
         np.diff(recs["index"].astype("datetime64[ms]").view("uint64")))).tostring()))
    return rtn
def _pandas_to_bucket(self, df, symbol):
    start = self._to_dt(df.index[0].to_datetime())
    end = self._to_dt(df.index[-1].to_datetime())  # bucket end is the last row's timestamp
    rtn = {START: start, END: end, SYMBOL: symbol}
    rtn[VERSION] = CHUNK_VERSION_NUMBER
    rtn[COUNT] = len(df)
    rtn[COLUMNS] = {}
    logger.warn("NB treating all values as 'exists' - no longer sparse")
    rowmask = Binary(lz4.compressHC(np.packbits(np.ones(len(df), dtype='uint8'))))

    recs = df.to_records(convert_datetime64=False)
    for col in df:
        array = self._ensure_supported_dtypes(recs[col])
        col_data = {}
        col_data[DATA] = Binary(lz4.compressHC(array.tostring()))
        col_data[ROWMASK] = rowmask
        col_data[DTYPE] = self._str_dtype(array.dtype)
        rtn[COLUMNS][col] = col_data
    rtn[INDEX] = Binary(lz4.compressHC(np.concatenate(
        ([recs['index'][0].astype('datetime64[ms]').view('uint64')],
         np.diff(recs['index'].astype('datetime64[ms]').view('uint64')))).tostring()))
    return rtn
def _pandas_to_bucket(df, symbol, initial_image):
    rtn = {SYMBOL: symbol, VERSION: CHUNK_VERSION_NUMBER, COLUMNS: {}, COUNT: len(df)}
    end = to_dt(df.index[-1].to_datetime())
    if initial_image:
        if 'index' in initial_image:
            start = min(to_dt(df.index[0].to_datetime()), initial_image['index'])
        else:
            start = to_dt(df.index[0].to_datetime())
        image_start = initial_image.get('index', start)
        image = {k: v for k, v in initial_image.items() if k != 'index'}
        rtn[IMAGE_DOC] = {IMAGE_TIME: image_start, IMAGE: initial_image}
        final_image = TickStore._pandas_compute_final_image(df, initial_image, end)
    else:
        start = to_dt(df.index[0].to_datetime())
        final_image = {}
    rtn[END] = end
    rtn[START] = start

    logger.warning("NB treating all values as 'exists' - no longer sparse")
    rowmask = Binary(lz4.compressHC(np.packbits(np.ones(len(df), dtype='uint8'))))

    recs = df.to_records(convert_datetime64=False)
    for col in df:
        array = TickStore._ensure_supported_dtypes(recs[col])
        col_data = {}
        col_data[DATA] = Binary(lz4.compressHC(array.tostring()))
        col_data[ROWMASK] = rowmask
        col_data[DTYPE] = TickStore._str_dtype(array.dtype)
        rtn[COLUMNS][col] = col_data
    rtn[INDEX] = Binary(lz4.compressHC(np.concatenate(
        ([recs['index'][0].astype('datetime64[ms]').view('uint64')],
         np.diff(recs['index'].astype('datetime64[ms]').view('uint64')))).tostring()))
    return rtn, final_image
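# A minimal read-side sketch (illustration only, not the TickStore reader): the INDEX
# blob written by the bucket functions above is an lz4-compressed uint64 array holding
# the first timestamp in epoch-milliseconds followed by successive diffs, so the row
# timestamps come back with a cumulative sum.
import lz4
import numpy as np

def _decode_index(index_blob):
    raw = lz4.decompress(bytes(index_blob))
    deltas = np.frombuffer(raw, dtype='uint64')
    return np.cumsum(deltas)  # absolute epoch-ms timestamp per row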
def test_write_object():
    arctic_lib = Mock()
    self = create_autospec(PickleStore)
    version = {"_id": ObjectId()}
    PickleStore.write(self, arctic_lib, version, "sentinel.symbol", sentinel.item, sentinel.previous_version)
    assert "data" not in version
    assert version["blob"] == "__chunked__"
    coll = arctic_lib.get_top_level_collection.return_value
    assert coll.update_one.call_args_list == [
        call(
            {
                "sha": checksum(
                    "sentinel.symbol",
                    {"data": Binary(lz4.compressHC(cPickle.dumps(sentinel.item, cPickle.HIGHEST_PROTOCOL)))},
                ),
                "symbol": "sentinel.symbol",
            },
            {
                "$set": {
                    "segment": 0,
                    "data": Binary(lz4.compressHC(cPickle.dumps(sentinel.item, cPickle.HIGHEST_PROTOCOL)), 0),
                },
                "$addToSet": {"parent": version["_id"]},
            },
            upsert=True,
        )
    ]
def _to_bucket(ticks, symbol, initial_image):
    rtn = {SYMBOL: symbol, VERSION: CHUNK_VERSION_NUMBER, COLUMNS: {}, COUNT: len(ticks)}
    data = {}
    rowmask = {}
    start = to_dt(ticks[0]['index'])
    end = to_dt(ticks[-1]['index'])
    final_image = copy.copy(initial_image) if initial_image else {}
    for i, t in enumerate(ticks):
        if initial_image:
            final_image.update(t)
        for k, v in iteritems(t):
            try:
                if k != 'index':
                    rowmask[k][i] = 1
                else:
                    v = TickStore._to_ms(v)
                    if data[k][-1] > v:
                        raise UnorderedDataException("Timestamps out-of-order: %s > %s" %
                                                     (ms_to_datetime(data[k][-1]), t))
                data[k].append(v)
            except KeyError:
                if k != 'index':
                    rowmask[k] = np.zeros(len(ticks), dtype='uint8')
                    rowmask[k][i] = 1
                data[k] = [v]

    rowmask = dict([(k, Binary(lz4.compressHC(np.packbits(v).tostring())))
                    for k, v in iteritems(rowmask)])
    for k, v in iteritems(data):
        if k != 'index':
            v = np.array(v)
            v = TickStore._ensure_supported_dtypes(v)
            rtn[COLUMNS][k] = {DATA: Binary(lz4.compressHC(v.tostring())),
                               DTYPE: TickStore._str_dtype(v.dtype),
                               ROWMASK: rowmask[k]}

    if initial_image:
        image_start = initial_image.get('index', start)
        if image_start > start:
            raise UnorderedDataException("Image timestamp is after first tick: %s > %s" %
                                         (image_start, start))
        start = min(start, image_start)
        rtn[IMAGE_DOC] = {IMAGE_TIME: image_start, IMAGE: initial_image}
    rtn[END] = end
    rtn[START] = start
    rtn[INDEX] = Binary(lz4.compressHC(np.concatenate(([data['index'][0]],
                                                       np.diff(data['index']))).tostring()))
    return rtn, final_image
def zip_compress(plain, level=9):
    if not USE_LZ4:
        compressed = zlib.compress(plain, level)
        return compressed[2:]   # drop the 2-byte zlib header
    else:
        compressed = lz4.compress(plain) if level < 9 else lz4.compressHC(plain)
        return compressed[4:]   # drop the 4-byte size prefix the legacy lz4 binding prepends
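# A minimal round-trip sketch for the lz4 branch above (illustration only; it assumes
# the legacy `lz4` binding, where compress()/compressHC() prepend the uncompressed size
# as a little-endian uint32, which zip_compress() strips):
import struct
import lz4

def lz4_roundtrip(plain):
    stripped = lz4.compressHC(plain)[4:]
    prefixed = struct.pack("<I", len(plain)) + stripped  # re-attach the size prefix
    return lz4.decompress(prefixed) == plain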
def dump_cache(self):
    batch = leveldb.WriteBatch()
    for pattern, triple_id_pairs in self.cache.iteritems():
        try:
            pattern_triples = self.leveldb.Get(pattern)
            pattern_triples = lz4.decompress(pattern_triples)
            pattern_triples = pattern_triples.split(MERGING_INDEX_TRIPLE_LINE_DELIMITER)
        except KeyError:
            pattern_triples = []
        logging.info("Merging bin from %d to %d (%d new)." % (
            len(pattern_triples),
            len(pattern_triples) + len(triple_id_pairs),
            len(triple_id_pairs),
        ))
        for triple_id_pair in triple_id_pairs:
            pattern_triples.append(MERGING_INDEX_TRIPLE_ID_DELIMITER.join(triple_id_pair))
        pattern_triples_dump = MERGING_INDEX_TRIPLE_LINE_DELIMITER.join(pattern_triples)
        batch.Put(pattern, lz4.compressHC(pattern_triples_dump))
    self.leveldb.Write(batch)
    logging.info("Dump %d bins." % len(self.cache))
    self.cache = {}
    self.cache_size = 0
    gc.collect()
def test_read_backward_compatibility(): """Test backwards compatibility with a pickled file that's created with Python 2.7.3, Numpy 1.7.1_ahl2 and Pandas 0.14.1 """ fname = path.join(path.dirname(__file__), "data", "test-data.pkl") # For newer versions; verify that unpickling fails when using cPickle if PANDAS_VERSION >= LooseVersion("0.16.1"): if sys.version_info[0] >= 3: with pytest.raises(UnicodeDecodeError), open(fname) as fh: cPickle.load(fh) else: with pytest.raises(TypeError), open(fname) as fh: cPickle.load(fh) # Verify that PickleStore() uses a backwards compatible unpickler. store = PickleStore() with open(fname) as fh: # PickleStore compresses data with lz4 version = {'blob': lz4.compressHC(fh.read())} df = store.read(sentinel.arctic_lib, version, sentinel.symbol) expected = pd.DataFrame(range(4), pd.date_range(start="20150101", periods=4)) assert (df == expected).all().all()
def to_response(self, save=True, enable_debug=False):
    if self.task_error_code != 0:
        if self.response_body_blob is None:
            response_body = self.request_body
            self.response_body = response_body
        else:
            response_body = self.response_body
    else:
        if self.response_body_blob is None:
            response_body = json.dumps({
                "error_code": self.task_error_code,
                "error_message": self.task_error_message,
            })
            self.response_body = response_body
        else:
            response_body = self.response_body
    if save:
        log_str = self.log.getvalue()
        self.task_log_blob = lz4.compressHC(log_str)
        self.response_time = datetime.now()
        self.save()
    if enable_debug:
        response_body = json.dumps({
            "log": self.log_body,
            "henry": self.henry_out,
            "parse": self.parse_out,
            "graph": self.dot_out,
            "response": json.loads(response_body),
        }, indent=4)
    return HttpResponse(response_body,
                        content_type="application/json",
                        status=self.response_status)
def open_db(self):
    self.terms_ldb = leveldb.LevelDB(self.terms_fl)
    self.docs_ldb = leveldb.LevelDB(self.docs_fl)
    self.doc_buffer_size = 0
    self.term_buffer_size = 0
    # self.doc_flush_buffer = np.empty(self.max_doc_flush_buffer_size, dtype="S%d" % self.max_doc_size)
    self.doc_flush_buffer = [None] * self.max_doc_flush_buffer_size
    self.term_flush_buffer = np.empty(self.max_term_flush_buffer_size, dtype="S%d" % self.max_term_size)
    self.doc_id_flush_buffer = np.empty(self.max_doc_flush_buffer_size, dtype=np.int64)
    self.term_id_flush_buffer = np.empty(self.max_term_flush_buffer_size, dtype=np.int64)
    if self.compression == COMPRESSION.NONE:
        self.compress = lambda string: string
        self.decompress = lambda string: string
    elif self.compression == COMPRESSION.ZLIB:
        import zlib
        self.compress = lambda string: zlib.compress(string, self.compression_level)
        self.decompress = lambda string: zlib.decompress(string)
    elif self.compression == COMPRESSION.LZMA:
        import backports.lzma as lzma
        self.compress = lambda string: lzma.compress(bytearray(string), format=lzma.FORMAT_RAW)
        self.decompress = lambda data: lzma.decompress(data, format=lzma.FORMAT_RAW)
    elif self.compression == COMPRESSION.LZ4R:
        import lz4
        self.compress = lambda string: lz4.compress(string)
        self.decompress = lambda string: lz4.decompress(string)
    elif self.compression == COMPRESSION.LZ4H:
        import lz4
        self.compress = lambda string: lz4.compressHC(string)
        self.decompress = lambda string: lz4.decompress(string)
    else:
        raise Exception("Wrong compression type %r" % self.compression)
def test_read_backward_compatibility(): """Test backwards compatibility with a pickled file that's created with Python 2.7.3, Numpy 1.7.1_ahl2 and Pandas 0.14.1 """ fname = path.join(path.dirname(__file__), "data", "test-data.pkl") # For newer versions; verify that unpickling fails when using cPickle if PANDAS_VERSION >= LooseVersion("0.16.1"): if sys.version_info[0] >= 3: with pytest.raises(UnicodeDecodeError), open(fname) as fh: pickle.load(fh) else: with pytest.raises(TypeError), open(fname) as fh: pickle.load(fh) # Verify that PickleStore() uses a backwards compatible unpickler. store = PickleStore() with open(fname) as fh: # PickleStore compresses data with lz4 version = {'blob': lz4.compressHC(fh.read())} df = store.read(sentinel.arctic_lib, version, sentinel.symbol) expected = pd.DataFrame(range(4), pd.date_range(start="20150101", periods=4)) assert (df == expected).all().all()
def test_pickle_chunk_V1_read():
    data = {'foo': b'abcdefghijklmnopqrstuvwxyz'}
    version = {'_id': sentinel._id, 'blob': '__chunked__'}
    coll = Mock()
    arctic_lib = Mock()
    datap = lz4.compressHC(cPickle.dumps(data, protocol=cPickle.HIGHEST_PROTOCOL))
    data_1 = datap[0:5]
    data_2 = datap[5:]
    coll.find.return_value = [
        {'data': Binary(data_1), 'symbol': 'sentinel.symbol', 'segment': 0},
        {'data': Binary(data_2), 'symbol': 'sentinel.symbol', 'segment': 1},
    ]
    arctic_lib.get_top_level_collection.return_value = coll

    ps = PickleStore()
    assert data == ps.read(arctic_lib, version, sentinel.symbol)
def write(self, arctic_lib, version, symbol, item, previous_version):
    try:
        # If it's encodeable, then ship it
        b = bson.BSON.encode({'data': item})
        if len(b) < _MAX_BSON_ENCODE:
            version['data'] = item
            return
    except InvalidDocument:
        pass

    # Pickle, chunk and store the data
    collection = arctic_lib.get_top_level_collection()
    # Try to pickle it. This is best effort
    version['blob'] = _MAGIC_CHUNKED
    pickled = lz4.compressHC(cPickle.dumps(item, protocol=cPickle.HIGHEST_PROTOCOL))

    for i in xrange(int(len(pickled) / _CHUNK_SIZE + 1)):
        segment = {'data': Binary(pickled[i * _CHUNK_SIZE:(i + 1) * _CHUNK_SIZE])}
        sha = checksum(symbol, segment)
        segment['segment'] = i
        collection.update_one({'symbol': symbol, 'sha': sha},
                              {'$set': segment, '$addToSet': {'parent': version['_id']}},
                              upsert=True)
def test_pickle_store_future_version():
    data = {'foo': b'abcdefghijklmnopqrstuvwxyz'}
    version = {'_id': sentinel._id, 'blob': '__chunked__VERSION_ONE_MILLION'}
    coll = Mock()
    arctic_lib = Mock()
    datap = lz4.compressHC(cPickle.dumps(data, protocol=cPickle.HIGHEST_PROTOCOL))
    data_1 = datap[0:5]
    data_2 = datap[5:]
    coll.find.return_value = [
        {'data': Binary(data_1), 'symbol': 'sentinel.symbol', 'segment': 0},
        {'data': Binary(data_2), 'symbol': 'sentinel.symbol', 'segment': 1},
    ]
    arctic_lib.get_top_level_collection.return_value = coll

    ps = PickleStore()
    with pytest.raises(UnsupportedPickleStoreVersion) as e:
        ps.read(arctic_lib, version, sentinel.symbol)
    assert 'unsupported version of pickle store' in str(e)
def loadCOCOAndOverSeg(im_set="test", detector="sf", N_SPIX=1000, fold=0):
    from pickle import dumps, loads
    try:
        import lz4, pickle
        decompress = lambda s: pickle.loads(lz4.decompress(s))
        compress = lambda o: lz4.compressHC(pickle.dumps(o))
    except:
        compress = lambda x: x
        decompress = lambda x: x
    from gop import contour, dataset, segmentation
    FILE_NAME = '/tmp/coco_%s_%s_%d_%d.dat' % (im_set, detector, N_SPIX, fold)
    try:
        with open(FILE_NAME, 'rb') as f:
            over_segs, segmentations = loads(f.read())
            f.close()
            over_seg = segmentation.ImageOverSegmentationVec()
            for i in over_segs:
                over_seg.append(decompress(i))
            return over_seg, [decompress(i) for i in segmentations], []
            # return over_segs, segmentations, []
    except FileNotFoundError:
        pass

    # Load the dataset
    data = dataset.loadCOCO2014(im_set == "train", im_set == "valid", fold)

    # COCO has some pretty gray scale images (WTF!!!)
    images = [e['image'] if e['image'].C == 3 else e['image'].tileC(3) for e in data]
    try:
        segmentations = [e['segmentation'] for e in data]
    except:
        segmentations = []

    # Do the over-segmentation
    if detector == 'sf':
        detector = contour.StructuredForest()
        detector.load('../data/sf.dat')
    elif detector == "mssf":
        detector = contour.MultiScaleStructuredForest()
        detector.load("../data/sf.dat")
    elif detector == 'st':
        detector = contour.SketchTokens()
        detector.load('../data/st_full_c.dat')
    else:
        detector = contour.DirectedSobel()

    if detector != None:
        over_segs = segmentation.generateGeodesicKMeans(detector, images, N_SPIX)
    with open(FILE_NAME, 'wb') as f:
        # f.write(dumps((over_segs, segmentations)))
        f.write(dumps(([compress(i) for i in over_segs],
                       [compress(i) for i in segmentations])))
        f.close()

    return over_segs, segmentations, []
def tostring(self):
    return lz4.compressHC(json.dumps({
        "u": self.url,
        "t": self.title,
        "c": self.content,
        "i": self.idx_terms,
    }))
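# Hypothetical inverse of tostring() above (illustration only; it returns the raw
# field mapping rather than guessing this class's constructor):
import json
import lz4

def fromstring(blob):
    doc = json.loads(lz4.decompress(blob))
    return {"url": doc["u"], "title": doc["t"], "content": doc["c"], "idx_terms": doc["i"]}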
def roundtrip(size=None):
    if size is None:
        size = struct.unpack(">I", b"\0" + os.urandom(3))[0]
    data = os.urandom(size)
    assert rustlz4.decompress(pylz4.compress(data)) == data
    assert pylz4.decompress(buffer(rustlz4.compress(data))) == data
    assert rustlz4.decompress(pylz4.compressHC(data)) == data
    assert pylz4.decompress(buffer(rustlz4.compresshc(data))) == data
def loadVOCAndOverSeg(im_set="test", detector="sf", N_SPIX=1000, EVAL_DIFFICULT=False, year="2012"):
    from pickle import dumps, loads
    try:
        import lz4, pickle
        decompress = lambda s: pickle.loads(lz4.decompress(s))
        compress = lambda o: lz4.compressHC(pickle.dumps(o))
    except:
        compress = lambda x: x
        decompress = lambda x: x
    from gop import contour, dataset, segmentation
    FILE_NAME = '/tmp/%s_%s_%d_%d_%s.dat' % (im_set, detector, N_SPIX, EVAL_DIFFICULT, year)
    try:
        with open(FILE_NAME, 'rb') as f:
            over_segs, segmentations, boxes = loads(f.read())
            f.close()
            over_seg = segmentation.ImageOverSegmentationVec()
            for i in over_segs:
                over_seg.append(decompress(i))
            return over_seg, [decompress(i) for i in segmentations], [decompress(i) for i in boxes]
    except IOError:
        pass

    # Load the dataset
    # data = eval("dataset.loadVOC2012_small")(im_set == "train", im_set == "valid", im_set == "test")
    data = eval("dataset.loadVOC%s" % year)(im_set == "train", im_set == "valid", im_set == "test")

    images = [e['image'] for e in data]
    try:
        segmentations = [e['segmentation'] for e in data]
    except:
        segmentations = []
    boxes = [[a['bbox'] for a in e['annotation'] if not a['difficult'] or EVAL_DIFFICULT] for e in data]

    # Do the over-segmentation
    if detector == 'sf':
        detector = contour.StructuredForest()
        detector.load('../data/sf.dat')
    elif detector == "mssf":
        detector = contour.MultiScaleStructuredForest()
        detector.load("../data/sf.dat")
    elif detector == 'st':
        detector = contour.SketchTokens()
        detector.load('../data/st_full_c.dat')
    else:
        detector = contour.DirectedSobel()

    if detector != None:
        over_segs = segmentation.generateGeodesicKMeans(detector, images, N_SPIX)
    # try:
    with open(FILE_NAME, 'wb') as f:
        f.write(dumps(([compress(i) for i in over_segs],
                       [compress(i) for i in segmentations],
                       [compress(i) for i in boxes])))
        f.close()
    # except FileNotFoundError:
    #     pass

    return over_segs, segmentations, boxes
def lz4_compress(packet, level):
    if level >= 9:
        return level | LZ4_FLAG, compressHC(packet)
    if level <= 2:
        # clamp it: 0->17, 1->12, 2->7, 3->2, >=4->1
        accel = max(1, 17 - level * 5)
        return level | LZ4_FLAG, LZ4_compress_fast(packet, accel)
    return level | LZ4_FLAG, LZ4_compress(packet)
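# Hedged receive-side sketch for the compressors above (illustration only:
# LZ4_decompress is a hypothetical stand-in for whichever lz4 decompress function
# the wrapper imports, and LZ4_FLAG is assumed to be a single bit OR'd into the
# level byte, as the return values above imply):
def lz4_decompress(flagged_level, data):
    assert flagged_level & LZ4_FLAG, "not an lz4-flagged packet"
    level = flagged_level & ~LZ4_FLAG
    return level, LZ4_decompress(data)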
def streamer():
    fin = proto.fin
    opener = repo.sopener

    cachepath = repo.ui.config("remotefilelog", "servercachepath")
    if not cachepath:
        cachepath = os.path.join(repo.path, "remotefilelogcache")

    # everything should be user & group read/writable
    oldumask = os.umask(0o002)
    try:
        while True:
            request = fin.readline()[:-1]
            if not request:
                break

            node = bin(request[:40])
            if node == nullid:
                yield '0\n'
                continue

            path = request[40:]

            filecachepath = os.path.join(cachepath, path, hex(node))
            if not os.path.exists(filecachepath):
                filectx = repo.filectx(path, fileid=node)
                if filectx.node() == nullid:
                    repo.changelog = changelog.changelog(repo.sopener)
                    filectx = repo.filectx(path, fileid=node)

                text = createfileblob(filectx)
                text = lz4.compressHC(text)

                dirname = os.path.dirname(filecachepath)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                try:
                    with open(filecachepath, "w") as f:
                        f.write(text)
                except IOError:
                    # Don't abort if the user only has permission to read,
                    # and not write.
                    pass
            else:
                with open(filecachepath, "r") as f:
                    text = f.read()

            yield '%d\n%s' % (len(text), text)

            # it would be better to only flush after processing a whole batch
            # but currently we don't know if there are more requests coming
            proto.fout.flush()
    finally:
        os.umask(oldumask)
def test_unpickle_highest_protocol(): """Pandas version 0.14.1 fails to unpickle a pandas.Series() in compat mode if the container has been pickled with HIGHEST_PROTOCOL. """ version = {"blob": lz4.compressHC(cPickle.dumps(pd.Series(), protocol=cPickle.HIGHEST_PROTOCOL))} store = PickleStore() ps = store.read(sentinel.arctic_lib, version, sentinel.symbol) expected = pd.Series() assert (ps == expected).all()
def _to_bucket(ticks, symbol, initial_image):
    rtn = {SYMBOL: symbol, VERSION: CHUNK_VERSION_NUMBER, COLUMNS: {}, COUNT: len(ticks)}
    data = {}
    rowmask = {}
    start = to_dt(ticks[0]['index'])
    end = to_dt(ticks[-1]['index'])
    final_image = copy.copy(initial_image) if initial_image else {}
    for i, t in enumerate(ticks):
        if initial_image:
            final_image.update(t)
        for k, v in iteritems(t):
            try:
                if k != 'index':
                    rowmask[k][i] = 1
                else:
                    v = TickStore._to_ms(v)
                data[k].append(v)
            except KeyError:
                if k != 'index':
                    rowmask[k] = np.zeros(len(ticks), dtype='uint8')
                    rowmask[k][i] = 1
                data[k] = [v]

    rowmask = dict([(k, Binary(lz4.compressHC(np.packbits(v).tostring())))
                    for k, v in iteritems(rowmask)])
    for k, v in iteritems(data):
        if k != 'index':
            v = np.array(v)
            v = TickStore._ensure_supported_dtypes(v)
            rtn[COLUMNS][k] = {DATA: Binary(lz4.compressHC(v.tostring())),
                               DTYPE: TickStore._str_dtype(v.dtype),
                               ROWMASK: rowmask[k]}

    if initial_image:
        image_start = initial_image.get('index', start)
        start = min(start, image_start)
        rtn[IMAGE_DOC] = {IMAGE_TIME: image_start, IMAGE: initial_image}
    rtn[END] = end
    rtn[START] = start
    rtn[INDEX] = Binary(lz4.compressHC(np.concatenate(([data['index'][0]],
                                                       np.diff(data['index']))).tostring()))
    return rtn, final_image
def _to_bucket(self, ticks, symbol):
    data = {}
    rowmask = {}
    start = self._to_dt(ticks[0]['index'])
    end = self._to_dt(ticks[-1]['index'])
    for i, t in enumerate(ticks):
        for k, v in t.iteritems():
            try:
                if k != 'index':
                    rowmask[k][i] = 1
                else:
                    v = self._to_ms(v)
                data[k].append(v)
            except KeyError:
                if k != 'index':
                    rowmask[k] = np.zeros(len(ticks), dtype='uint8')
                    rowmask[k][i] = 1
                data[k] = [v]

    rowmask = dict([(k, Binary(lz4.compressHC(np.packbits(v).tostring())))
                    for k, v in rowmask.iteritems()])

    rtn = {START: start, END: end, SYMBOL: symbol}
    rtn[VERSION] = CHUNK_VERSION_NUMBER
    rtn[COUNT] = len(ticks)
    rtn[COLUMNS] = {}
    for k, v in data.iteritems():
        if k != 'index':
            v = np.array(v)
            v = self._ensure_supported_dtypes(v)
            rtn[COLUMNS][k] = {DATA: Binary(lz4.compressHC(v.tostring())),
                               DTYPE: self._str_dtype(v.dtype),
                               ROWMASK: rowmask[k]}
    rtn[INDEX] = Binary(lz4.compressHC(np.concatenate(([data['index'][0]],
                                                       np.diff(data['index']))).tostring()))
    return rtn
def test_read_object_2(): self = create_autospec(PickleStore) version = {"_id": sentinel._id, "blob": "__chunked__"} coll = Mock() arctic_lib = Mock() coll.find.return_value = [{"data": Binary(lz4.compressHC(cPickle.dumps(object))), "symbol": "sentinel.symbol"}] arctic_lib.get_top_level_collection.return_value = coll assert PickleStore.read(self, arctic_lib, version, sentinel.symbol) == object assert coll.find.call_args_list == [ call({"symbol": sentinel.symbol, "parent": sentinel._id}, sort=[("segment", 1)]) ]
def _to_bucket(self, ticks, symbol):
    data = {}
    rowmask = {}
    start = to_dt(ticks[0]["index"])
    end = to_dt(ticks[-1]["index"])
    for i, t in enumerate(ticks):
        for k, v in t.iteritems():
            try:
                if k != "index":
                    rowmask[k][i] = 1
                else:
                    v = self._to_ms(v)
                data[k].append(v)
            except KeyError:
                if k != "index":
                    rowmask[k] = np.zeros(len(ticks), dtype="uint8")
                    rowmask[k][i] = 1
                data[k] = [v]

    rowmask = dict([(k, Binary(lz4.compressHC(np.packbits(v).tostring())))
                    for k, v in rowmask.iteritems()])

    rtn = {START: start, END: end, SYMBOL: symbol}
    rtn[VERSION] = CHUNK_VERSION_NUMBER
    rtn[COUNT] = len(ticks)
    rtn[COLUMNS] = {}
    for k, v in data.iteritems():
        if k != "index":
            v = np.array(v)
            v = self._ensure_supported_dtypes(v)
            rtn[COLUMNS][k] = {DATA: Binary(lz4.compressHC(v.tostring())),
                               DTYPE: self._str_dtype(v.dtype),
                               ROWMASK: rowmask[k]}
    rtn[INDEX] = Binary(lz4.compressHC(np.concatenate(([data["index"][0]],
                                                       np.diff(data["index"]))).tostring()))
    return rtn
def test_unpickle_highest_protocol(): """Pandas version 0.14.1 fails to unpickle a pandas.Series() in compat mode if the container has been pickled with HIGHEST_PROTOCOL. """ version = { 'blob': lz4.compressHC( cPickle.dumps(pd.Series(), protocol=cPickle.HIGHEST_PROTOCOL)), } store = PickleStore() ps = store.read(sentinel.arctic_lib, version, sentinel.symbol) expected = pd.Series() assert (ps == expected).all()
def compressGr(dat, version):
    if ord(dat[1]) < version:
        dat = dat[0] + chr(version) + dat[2:]
    datc = lz4.compressHC(dat[:-4])[4:]    # strip initial length and last 4 bytes
    # now find the final tuple
    end = len(datc)
    start = 0
    curr = lz4tuple(start)
    while curr.end < end:
        start = curr.end
        curr = parseTuple(datc, start, end)
    if curr.end > end:
        print "Sync error: %s" % (curr)
    newend = write_literal(curr.literal_len + 4, 4) + datc[curr.literal:curr.literal + curr.literal_len + 1] + dat[-4:]
    lz4hdr = struct.pack(">L", (1 << 27) + (len(dat) & 0x7FFFFFF))
    return dat[0:4] + lz4hdr + datc[0:curr.start] + newend
def test_performance_sequential(n, length):
    _str = random_string(length)
    _strarr = [_str for _ in range(n)]

    now = dt.now()
    [c.decompress(y) for y in [c.compressHC(x) for x in _strarr]]
    clz4_time = (dt.now() - now).total_seconds()

    now = dt.now()
    c.decompressarr(c.compressarrHC(_strarr))
    clz4_time_p = (dt.now() - now).total_seconds()

    now = dt.now()
    [lz4.decompress(y) for y in [lz4.compressHC(x) for x in _strarr]]
    lz4_time = (dt.now() - now).total_seconds()

    print()
    print("LZ4 Test %sx len:%s" % (n, length))
    print("    Cython LZ4 %s s" % clz4_time)
    print("    Cython LZ4 Parallel %s s" % clz4_time_p)
    print("    LZ4 %s s" % lz4_time)
def incominghook(ui, repo, node, source, url, **kwargs):
    """Server hook that produces the shallow file blobs immediately after a
    commit, in anticipation of them being requested soon.
    """
    cachepath = repo.ui.config("remotefilelog", "servercachepath")
    if not cachepath:
        cachepath = os.path.join(repo.path, "remotefilelogcache")

    heads = repo.revs("heads(%s::)" % node)

    # everything should be user & group read/writable
    oldumask = os.umask(0o002)
    try:
        count = 0
        for head in heads:
            mf = repo[head].manifest()

            for filename, filenode in mf.iteritems():
                filecachepath = os.path.join(cachepath, filename, hex(filenode))
                if os.path.exists(filecachepath):
                    continue

                # This can be a bit slow. Don't block the commit returning
                # for large commits.
                if count > 500:
                    break
                count += 1

                filectx = repo.filectx(filename, fileid=filenode)

                text = createfileblob(filectx)
                text = lz4.compressHC(text)

                dirname = os.path.dirname(filecachepath)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)

                f = open(filecachepath, "w")
                try:
                    f.write(text)
                finally:
                    f.close()
    finally:
        os.umask(oldumask)
def compress_array(str_list):
    """
    Compress an array of strings.

    By default LZ4 mode is standard in interactive mode,
    and high compression in applications/scripts.
    """
    if _should_use_lz4hc():
        # Below LZ4HC_N_PARALLEL chunks it's quicker to compress sequentially..
        if len(str_list) > LZ4HC_N_PARALLEL:
            return clz4.compressarrHC(str_list)
        else:
            return [clz4.compressHC(s) for s in str_list]
    else:
        # Below LZ4_N_PARALLEL chunks it's quicker to compress sequentially..
        if len(str_list) > LZ4_N_PARALLEL:
            return clz4.compressarr(str_list)
        else:
            return [clz4.compress(s) for s in str_list]
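# Hedged sketch of the symmetric read path (illustration only; it assumes the same
# `clz4` helpers used above, whose decompress/decompressarr counterparts appear in
# the benchmark snippet elsewhere in this collection):
def decompress_array(str_list):
    """Decompress an array of lz4-compressed strings."""
    return clz4.decompressarr(str_list)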
def flush(self, keep_closed=False):
    buffered_bytes = self.buffer.getvalue()
    self.buffer.truncate(0)

    if len(buffered_bytes) == 0:
        # we don't want to write an empty file because decompressing it does weird things
        if keep_closed:
            name = self.outputstream.name
            self.outputstream.close()
            os.unlink(name)
            return
        else:
            # if we're still writing, this can be a noop
            return

    compressed_bytes = lz4.compressHC(buffered_bytes)
    self.outputstream.write(compressed_bytes)
    self.outputstream.close()
    if not keep_closed:
        self.outputstream = open(self._get_next_file(), 'w')
def write(self, arctic_lib, version, symbol, item, previous_version):
    try:
        # If it's encodeable, then ship it
        bson.BSON.encode({'data': item})
        version['data'] = item
        return
    except InvalidDocument:
        pass

    # Pickle, chunk and store the data
    collection = arctic_lib.get_top_level_collection()
    # Try to pickle it. This is best effort
    version['blob'] = _MAGIC_CHUNKED
    pickled = lz4.compressHC(cPickle.dumps(item, protocol=cPickle.HIGHEST_PROTOCOL))

    for i in xrange(len(pickled) / _CHUNK_SIZE + 1):
        segment = {'data': Binary(pickled[i * _CHUNK_SIZE:(i + 1) * _CHUNK_SIZE])}
        sha = checksum(symbol, segment)
        segment['segment'] = i
        collection.update_one({'symbol': symbol, 'sha': sha},
                              {'$set': segment, '$addToSet': {'parent': version['_id']}},
                              upsert=True)
def test_read_object_backwards_compat():
    self = create_autospec(PickleStore)
    version = {'blob': Binary(lz4.compressHC(cPickle.dumps(object)))}
    assert PickleStore.read(self, sentinel.arctic_lib, version, sentinel.symbol) == object
def encode_posting_list(plist):
    return lz4.compressHC(numencode.encode_plist(plist))


def update_posting_list(old_plist_blob, new_plist):
    plist_blob = lz4.decompress(old_plist_blob)
    updated_plist = numencode.update_plist(plist_blob, new_plist)
    return lz4.compressHC(updated_plist)
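# Hypothetical read-side counterpart (illustration only; `numencode.decode_plist` is
# an assumed name for the inverse of encode_plist used above):
def decode_posting_list(plist_blob):
    return numencode.decode_plist(lz4.decompress(plist_blob))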
def lz4_compress(packet, level):
    if level >= 9:
        return level | LZ4_FLAG, compressHC(packet)
    return level | LZ4_FLAG, LZ4_compress(packet)
def parse_out(self, value):
    self.parse_out_blob = lz4.compressHC(value)


def dot_out(self, value):
    self.dot_out_blob = lz4.compressHC(json.dumps(value))


def response_body(self, value):
    self.response_body_blob = lz4.compressHC(value)
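# Hedged read-side counterparts for the setters above (illustration only; the
# attribute names come from the *_blob fields those setters write, and dot_out is
# assumed to round-trip through JSON per its setter):
def parse_out(self):
    return lz4.decompress(self.parse_out_blob)


def dot_out(self):
    return json.loads(lz4.decompress(self.dot_out_blob))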
def lz4_compress(packet, level):
    if level >= 9:
        return level | LZ4_FLAG, compressHC(packet)
    # clamp it: 0->17, 1->12, 2->7, 3->2, >=4->1
    accel = max(1, 17 - level * 5)
    return level | LZ4_FLAG, LZ4_compress_fast(packet, accel)
def _add_indicator(self, score, indicator, value):
    type_ = value['type']
    type_mapper = _TYPE_MAPPING.get(type_, None)
    if type_mapper is None:
        LOG.error('%s - Unsupported indicator type: %s', self.name, type_)
        return

    nsdict = {}
    nsdict[self.namespaceuri] = self.namespace
    stix.utils.set_id_namespace(nsdict)

    spid = '{}:indicator-{}'.format(
        self.namespace,
        uuid.uuid4()
    )
    sp = stix.core.STIXPackage(id_=spid)

    observables = type_mapper['mapper'](self.namespace, indicator, value)

    for o in observables:
        id_ = '{}:indicator-{}'.format(
            self.namespace,
            uuid.uuid4()
        )

        if value['type'] == 'URL':
            eindicator = werkzeug.urls.iri_to_uri(indicator, safe_conversion=True)
        else:
            eindicator = indicator

        sindicator = stix.indicator.indicator.Indicator(
            id_=id_,
            title='{}: {}'.format(
                value['type'],
                eindicator
            ),
            description='{} indicator from {}'.format(
                value['type'],
                ', '.join(value['sources'])
            ),
            timestamp=datetime.datetime.utcnow().replace(tzinfo=pytz.utc)
        )

        confidence = value.get('confidence', None)
        if confidence is None:
            LOG.error('%s - indicator without confidence', self.name)
            sindicator.confidence = "Unknown"  # We shouldn't be here
        elif confidence < 50:
            sindicator.confidence = "Low"
        elif confidence < 75:
            sindicator.confidence = "Medium"
        else:
            sindicator.confidence = "High"

        sindicator.add_indicator_type(type_mapper['indicator_type'])
        sindicator.add_observable(o)

        sp.add_indicator(sindicator)

    spackage = 'lz4' + lz4.compressHC(sp.to_json())
    with self.SR.pipeline() as p:
        p.multi()
        p.zadd(self.redis_skey, score, spid)
        p.hset(self.redis_skey_value, spid, spackage)
        result = p.execute()[0]

    self.statistics['added'] += result
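# Hedged read-side sketch for the storage step above (illustration only; it relies
# only on the literal 'lz4' prefix added to spackage and assumes self.SR is the same
# redis client used in the pipeline):
def _get_indicator_package(self, spid):
    spackage = self.SR.hget(self.redis_skey_value, spid)
    if spackage is None or not spackage.startswith('lz4'):
        return None
    return lz4.decompress(spackage[3:])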
   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
   ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from __future__ import print_function
from sys import stdout

try:
    import lz4, pickle
    decompress = lambda s: pickle.loads(lz4.decompress(s))
    compress = lambda o: lz4.compressHC(pickle.dumps(o))
except:
    compress = lambda x: x
    decompress = lambda x: x


def getDetector(detector="sf"):
    from lpo import contour
    from os import path
    basedir = path.dirname(path.dirname(path.abspath(__file__)))
    if detector == 'sf':
        r = contour.StructuredForest()
        r.load(path.join(basedir, 'data', 'sf.dat'))
    elif detector == "mssf":
        r = contour.MultiScaleStructuredForest()
        r.load(path.join(basedir, 'data', 'sf.dat'))