def compute(self, split):
    buffers = [list() for i in self.fields]
    remain_size = STRIPE_DATA_SIZE
    path = os.path.join(self.path, '%04d.dt' % split.index)
    indices = dict((i, AdaptiveIndex()) for i in self.indices)

    def write_stripe(f, compressed, header, padding=True):
        h = compress(marshal.dumps(header))
        assert len(h) < STRIPE_HEADER_SIZE
        f.write(struct.pack('I', len(h)))
        f.write(h)
        padding_size = STRIPE_SIZE - len(h) - 4
        for c in compressed:
            f.write(c)
            padding_size -= len(c)
        if padding:
            f.write('\0' * padding_size)

    with atomic_file(path) as f:
        stripe_id = 0
        for it in chain(self.prev.iterator(sp) for sp in split.splits):
            row = it[:len(self.fields)]
            size = len(marshal.dumps(tuple(row)))
            if size > STRIPE_DATA_SIZE:
                raise RuntimeError('Row too big')
            if size > remain_size:
                compressed = [compress(marshal.dumps(tuple(b))) for b in buffers]
                _sizes = tuple(map(len, compressed))
                _remain_size = STRIPE_DATA_SIZE - sum(_sizes)
                if size > _remain_size:
                    write_stripe(f, compressed, _sizes)
                    buffers = [list() for i in self.fields]
                    remain_size = STRIPE_DATA_SIZE
                    stripe_id += 1
                else:
                    remain_size = _remain_size

            remain_size -= size
            for i, value in enumerate(row):
                buffers[i].append(value)
                field = self.fields[i]
                if field in self.indices:
                    indices[field].add(value, stripe_id)

        if any(buffers):
            compressed = [compress(marshal.dumps(tuple(b))) for b in buffers]
            _sizes = tuple(map(len, compressed))
            write_stripe(f, compressed, _sizes, False)

        footer_indices = zlib.compress(cPickle.dumps(indices, -1))
        footer_fields = compress(marshal.dumps(self.fields))
        f.write(footer_indices)
        f.write(footer_fields)
        f.write(struct.pack('II', len(footer_fields), len(footer_indices)))

    yield path
def compress_array(str_list):
    """
    Compress an array of strings.

    By default LZ4 mode is standard in interactive mode,
    and high compression in applications/scripts.
    """
    if not ENABLE_PARALLEL:
        return [lz4.compress(s) for s in str_list]

    # With fewer than 50 chunks it's quicker to compress sequentially.
    if len(str_list) > LZ4_N_PARALLEL:
        return clz4.compressarr(str_list)
    else:
        return [clz4.compress(s) for s in str_list]
def _put(self, item):
    hash_key = item.__hash__().to_bytes(length=20, byteorder=sys.byteorder, signed=True)
    data = compress(pickle.dumps(
        item, protocol=4)) if self.compression else pickle.dumps(
        item, protocol=4)
    with self._sem:
        try:
            with self._lmdb.begin(write=True) as txn:
                if not txn.replace(hash_key, data, db=self._hashes_db):
                    key = next(self._idx).to_bytes(length=511, byteorder='big', signed=False)
                    self.logger.debug(
                        "Queuing new task with SERIAL {sn}".format(
                            sn=int.from_bytes(key, byteorder='big', signed=False)))
                    txn.put(key, hash_key, append=True, db=self._queue_db)
                else:
                    self.logger.debug("Updating already queued task")
        except lmdb.MapFullError:
            self.logger.critical(
                "Database file {path} reached maximum size!".format(path=self.path))
            raise Full()
def scanSubvolume(self, subvolume):
    hex = hashlib.sha1(subvolume).hexdigest()
    file = self.folder + "/" + hex + ".lz4"
    completed = self.folder + "/" + hex + ".complete"
    print "scanning: " + subvolume + " (" + hex + ")"
    if os.path.exists(completed):
        return

    p = Popen('btrfs subvolume find-new ' + subvolume + ' 0',
              shell=True, stdout=PIPE, stderr=STDOUT)
    list = []
    for line in p.stdout.readlines():
        match = self.matchAllocation.match(line)
        if match is None:
            continue
        list.append(match.group(0))

    with open(file, "w") as text_file:
        text_file.write(lz4.compress("\n".join(list)))
    open(completed, 'a').close()
def open_db(self):
    self.terms_ldb = leveldb.LevelDB(self.terms_fl)
    self.docs_ldb = leveldb.LevelDB(self.docs_fl)
    self.doc_buffer_size = 0
    self.term_buffer_size = 0
    #self.doc_flush_buffer = np.empty(self.max_doc_flush_buffer_size, dtype="S%d" % self.max_doc_size)
    self.doc_flush_buffer = [None] * self.max_doc_flush_buffer_size
    self.term_flush_buffer = np.empty(self.max_term_flush_buffer_size, dtype="S%d" % self.max_term_size)
    self.doc_id_flush_buffer = np.empty(self.max_doc_flush_buffer_size, dtype=np.int64)
    self.term_id_flush_buffer = np.empty(self.max_term_flush_buffer_size, dtype=np.int64)

    if self.compression == COMPRESSION.NONE:
        self.compress = lambda string: string
        self.decompress = lambda string: string
    elif self.compression == COMPRESSION.ZLIB:
        import zlib
        self.compress = lambda string: zlib.compress(string, self.compression_level)
        self.decompress = lambda string: zlib.decompress(string)
    elif self.compression == COMPRESSION.LZMA:
        import backports.lzma as lzma
        self.compress = lambda string: lzma.compress(bytearray(string), format=lzma.FORMAT_RAW)
        self.decompress = lambda data: lzma.decompress(data, format=lzma.FORMAT_RAW)
    elif self.compression == COMPRESSION.LZ4R:
        import lz4
        self.compress = lambda string: lz4.compress(string)
        self.decompress = lambda string: lz4.decompress(string)
    elif self.compression == COMPRESSION.LZ4H:
        import lz4
        self.compress = lambda string: lz4.compressHC(string)
        self.decompress = lambda string: lz4.decompress(string)
    else:
        raise Exception("Wrong compression type %r" % self.compression)
def append(self, eventId, data):
    try:
        with self._sem:
            with self.lmdb.begin(write=True) as txn:
                eventId = eventId.encode('utf-8')
                data = compress(json.dumps(data))
                mtime = int(time.time() * 1000).to_bytes(
                    length=6, byteorder=sys.byteorder, signed=False)
                delta_idx = next(self.delta_idx).to_bytes(
                    length=511, byteorder='big', signed=False)

                index_db = self.lmdb.open_db(key=self.index_name, txn=txn, dupsort=True)
                mtimes_db = self.lmdb.open_db(key=self.mtimes_name, txn=txn)
                deltas_db = self.lmdb.open_db(key=self.deltas_name, txn=txn)

                txn.put(delta_idx, data, append=True, db=deltas_db)
                txn.put(eventId, delta_idx, dupdata=True, db=index_db)
                txn.put(eventId, mtime, overwrite=True, db=mtimes_db)
    except lmdb.MapFullError:
        raise DeltaStoreFull(
            "Database file at {path} has reached its maximum size!".format(
                path=self.db_path))
def zip_compress(plain, level=9):
    if not USE_LZ4:
        compressed = zlib.compress(plain, level)
        return compressed[2:]
    else:
        compressed = lz4.compress(plain) if level < 9 else lz4.compressHC(plain)
        return compressed[4:]
def compress(file):
    data = open(file, "r+b")
    if data.name.endswith('.tar'):
        compressed_data = open(data.name + ".lz4", "w")
        compressed_data.write(lz4.compress(data.read()))
    else:
        print "The file type is not the expected one."
def do_test_rountrip_method(compress, i_data, c_data=None):
    from lz4 import LZ4_uncompress  #@UnresolvedImport
    c = compress(i_data)
    if c_data is not None:
        assert c_data == c, "expected compressed data to look like %s, but got %s" % (hl(c_data), hl(c))
    d = LZ4_uncompress(c)
    assert d == i_data, "expected decompressed data to look like original %s, but got %s" % (hl(i_data), hl(d))
def run(self, cb_upload, cb_hash):
    if not self._block_size:
        raise Exception("Blocksize is not defined")

    block_cnt = 0
    block_new_cnt = 0
    bs = self._block_size * 1024 * 1024

    with open(self._device, "rb") as f:
        while True:
            # Read block and check for EOF
            block = f.read(bs)
            if not block:
                self.log.info("Input EOF reached")
                break
            block_cnt += 1

            # Create hash
            h = xxhash.xxh64(block).hexdigest()

            # Hash doesn't exist. Upload!
            if h not in self.blocks:
                block_new_cnt += 1
                if self._compression:
                    block = lz4.compress(block)
                cb_upload(h, block)

            # Add hash to sync table
            cb_hash(h)

    self.log.info(
        "Dedup::run() completed. Uploaded {} new blocks, {} blocks in total"
        .format(block_new_cnt, block_cnt))
def send(self, message):
    version = (0 & 0xf) << 28
    msg_id = message.msg_id
    if not msg_id:
        msg_id = next(self.msg_ids)
        message.msg_id = msg_id
    msg_id = (msg_id & 0xfff) << 16
    msg_type = (message._MESSAGE_TYPE & 0xff) << 8

    compress = False
    compression = 0
    if self.compress:
        compress = True
    elif self.compress is False and message._MESSAGE_TYPE != RESPONSE:
        compress = True

    msg = message.pack()
    if compress and len(msg) >= COMPRESSION_THREASHOLD:
        compression = 1
        msg = lz4.compress(msg)

    header = version + msg_id + msg_type + compression
    data = _HEADER.pack(header, len(msg)) + msg
    self.sock.sendall(data)
    self.last_send = datetime.datetime.now()
def _compress(self, data):
    # a compression function like lrzip in spirit: lz4 > lzo > zlib > bz2 > lzma
    if self.shuffle == True:
        try:
            print "shuffling..."
            data = buff_shuffle(data)  # shuffling will work for a filter < 1GB
            print "data shuffled..."
        except:
            pass

    print "Compressing..."
    try:
        data = lz4.compress(data)  # will fail if filter > 1GB
        print "lz4 ok"
    except:
        pass
    try:
        data = lzo.compress(data)  # will fail if filter > 1GB
        print "lzo ok"
    except:
        pass
    #data = data.encode('zlib')
    #data = zlib.compress(data,1)
    #data = zlib.compress(data,9)
    data = zlib.compress(data)
    data = bz2.compress(data)
    data = lzma.compress(data)
    return data
def benchmark(data, hcdata=None):
    number = 100
    size = len(data)
    hcdata = hcdata or data

    for modname, func in [("pylz4", pylz4.compress), ("rustlz4", rustlz4.compress)]:
        timer = timeit.Timer(functools.partial(func, data))
        elapsed = timer.timeit(number=number)
        perf = size * number / elapsed / 1e6
        name = "%s.%s" % (modname, func.__name__)
        print("%24s: %8.2f MB/s" % (name, perf))

    for modname, func in [("pylz4", pylz4.compressHC), ("rustlz4", rustlz4.compresshc)]:
        timer = timeit.Timer(functools.partial(func, hcdata))
        elapsed = timer.timeit(number=number)
        perf = size * number / elapsed / 1e6
        name = "%s.%s" % (modname, func.__name__)
        print("%24s: %8.2f MB/s" % (name, perf))

    data = pylz4.compress(data)
    for modname, func in [("pylz4", pylz4.decompress), ("rustlz4", rustlz4.decompress)]:
        timer = timeit.Timer(functools.partial(func, data))
        elapsed = timer.timeit(number=number)
        perf = size * number / elapsed / 1e6
        name = "%s.%s" % (modname, func.__name__)
        print("%24s: %8.2f MB/s" % (name, perf))
def fset(self, inst, value):
    nprow = getattr(inst, 'NumpyArrayTable__' + self.name)
    #~ print 'fset', self.name, nprow, value
    if nprow is None:
        nprow = self.NumpyArrayTableClass()
        setattr(inst, 'NumpyArrayTable__' + self.name, nprow)

    if value is None:
        if hasattr(inst, self.name + '_array'):
            delattr(inst, self.name + '_array')
        nprow.shape = None
        nprow.dtype = None
        nprow.blob = None
        nprow.units = None
        nprow.compress = None
        return

    if self.arraytype == np.ndarray:
        assert (type(value) == np.ndarray) or (type(value) == np.memmap), \
            'Value is not np.array or np.memmap but {}'.format(type(value))
    if self.arraytype == pq.Quantity:
        assert type(value) == pq.Quantity, \
            '{} {} {} value is not pq.Quantity'.format(inst.__class__.__name__, self.name, value)

    shape = ('{},' * value.ndim)[:-1].format(*value.shape)
    if shape.endswith(','):
        shape = shape[:-1]
    nprow.shape = shape

    nprow.dtype = value.dtype.str

    if self.compress == 'blosc':
        blob = blosc.compress(value.tostring(), typesize=value.dtype.itemsize, clevel=9)
    else:
        if not value.flags['C_CONTIGUOUS']:
            buf = np.getbuffer(np.array(value, copy=True))
        else:
            buf = np.getbuffer(value)
        if self.compress == 'zlib':
            blob = zlib.compress(buf)
        elif self.compress == 'lz4':
            blob = lz4.compress(buf)
        elif self.compress == 'snappy':
            blob = snappy.compress(buf)
        else:
            blob = buf
    nprow.compress = self.compress
    nprow.blob = blob

    if self.arraytype == pq.Quantity:
        nprow.units = value.dimensionality.string

    setattr(inst, self.name + '_array', value)
def chunks(inp):
    sr = dill.dumps(inp)
    inp = compress(sr)
    bs = 2**14
    out = []
    for x in range((len(inp) / bs) + 1):
        out.append(inp[(bs * x):(bs) * (x + 1)])
    return out
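# Hedged sketch (assumed counterpart, not in the original source): reassemble the
# 16 KiB chunks produced by chunks() above and restore the object. Assumes
# `decompress` is the lz4 decompress matching the `compress` used above, and that
# dill is the serializer, as in chunks().
def unchunks(chunk_list):
    return dill.loads(decompress(''.join(chunk_list)))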
def compute_probabilities(replacement_idx, a, conditional_binary):
    """
    Compute the compressed length of the bit string obtained by replacing
    the bit at position `replacement_idx` with `a`.
    """
    conditional_binary[replacement_idx] = a
    return len(lz4.compress(''.join(conditional_binary)))  # 54.65
def roundtrip(size=None):
    if size is None:
        size = struct.unpack(">I", b"\0" + os.urandom(3))[0]
    data = os.urandom(size)
    assert rustlz4.decompress(pylz4.compress(data)) == data
    assert pylz4.decompress(buffer(rustlz4.compress(data))) == data
    assert rustlz4.decompress(pylz4.compressHC(data)) == data
    assert pylz4.decompress(buffer(rustlz4.compresshc(data))) == data
def process_document(self, session, doc):
    data = doc.get_raw(session)
    new = lz4.compress(data)
    return StringDocument(new, self.id, doc.processHistory,
                          parent=doc.parent, filename=doc.filename)
def __init__(self, data_dir, obj_to_terms, obj_to_str, str_to_obj):
    self.data_dir = data_dir
    self.obj_to_terms = obj_to_terms
    self.obj_to_str = obj_to_str
    self.str_to_obj = str_to_obj
    self.id_term_map = None
    self.term_id_map = None
    self.objnum = 0
    try:
        import lz4 as compressor
        self.compress = compressor.compress
        self.compressHC = compressor.compressHC
        self.decompress = compressor.decompress
    except ImportError:
        import zlib as compressor
        self.compress = lambda data: compressor.compress(data, 3)
        self.compressHC = lambda data: compressor.compress(data, 9)
        self.decompress = lambda data: compressor.decompress(data)
def test_val_to_store_info_compress(self):
    """_val_to_store_info() should compress large values."""
    value = 'foo' * 32
    compressed_value = lz4.compress(value)
    client = memcache.Client('127.0.0.1', 11211, client_driver=NoopDriver)
    result = client._val_to_store_info(value, min_compress_len=1)
    self.assertEqual(result, (memcache.Client._FLAG_COMPRESSED, compressed_value))
def _dump2stor(store, bucketname, data, compress):
    if len(data) == 0:
        return ""
    key = j.data.hash.md5_string(data)
    if key not in objects or key not in new_objects:
        if compress:
            data = lz4.compress(data)
        store.set_object(bucketname, key, data)
        new_objects.append(key)
    return key
def compress(scheme, data):
    hdr = data[:4] + struct.pack(">L", (scheme << 27) + (len(data) & 0x07ffffff))
    if scheme == 0:
        return data
    elif scheme == 1 and lz4:
        res = lz4.compress(hdr + data)
        return res
    else:
        warnings.warn("Table failed to compress by unsupported compression scheme")
        return data
def serialize_subarray(cls, subarray):
    if not subarray.flags['C_CONTIGUOUS']:
        subarray = subarray.copy(order='C')

    # Buffers larger than 1 GB would overflow
    # We could fix this by slicing each slice into smaller pieces...
    assert subarray.nbytes <= cls.MAX_LZ4_BUFFER_SIZE, \
        "FIXME: This class doesn't support compression of arrays whose slices are each > 1 GB"
    return lz4.compress(subarray)
def _put(self, item):
    key = next(self._idx).to_bytes(length=511, byteorder=sys.byteorder, signed=False)
    data = compress(pickle.dumps(
        item, protocol=4)) if self.compression else pickle.dumps(
        item, protocol=4)
    with self._sem:
        with self._lmdb.begin(write=True) as txn:
            txn.put(key, data, append=True, db=self._queue_db)
def fset(self, inst, value):
    nprow = getattr(inst, 'NumpyArrayTable__' + self.name)
    #~ print 'fset', self.name, nprow, value
    if nprow is None:
        nprow = self.NumpyArrayTableClass()
        setattr(inst, 'NumpyArrayTable__' + self.name, nprow)

    if value is None:
        if hasattr(inst, self.name + '_array'):
            delattr(inst, self.name + '_array')
        nprow.shape = None
        nprow.dtype = None
        nprow.blob = None
        nprow.units = None
        nprow.compress = None
        return

    if self.arraytype == np.ndarray:
        assert (type(value) == np.ndarray) or (type(value) == np.memmap), \
            'Value is not np.array or np.memmap but {}'.format(type(value))
    if self.arraytype == pq.Quantity:
        assert type(value) == pq.Quantity, \
            '{} {} {} value is not pq.Quantity'.format(inst.__class__.__name__, self.name, value)

    shape = ('{},' * value.ndim)[:-1].format(*value.shape)
    if shape.endswith(','):
        shape = shape[:-1]
    nprow.shape = shape

    nprow.dtype = value.dtype.str

    if self.compress == 'blosc':
        blob = blosc.compress(value.tostring(), typesize=value.dtype.itemsize, clevel=9)
    else:
        if not value.flags['C_CONTIGUOUS']:
            #~ buf = np.getbuffer(np.array(value, copy = True))
            buf = np.array(value, copy=True).data
        else:
            #~ buf = np.getbuffer(value)
            buf = value.data
        if self.compress == 'zlib':
            blob = zlib.compress(buf)
        elif self.compress == 'lz4':
            blob = lz4.compress(buf)
        elif self.compress == 'snappy':
            blob = snappy.compress(buf)
        else:
            blob = buf
    nprow.compress = self.compress
    nprow.blob = blob

    if self.arraytype == pq.Quantity:
        nprow.units = value.dimensionality.string

    setattr(inst, self.name + '_array', value)
def test_value_to_store_info_compress_length(self):
    """
    _val_to_store_info() should not use compressed values if too long.

    That is, if the compressed value is longer than the original value,
    use the original value instead.
    """
    value = '...'
    compressed_value = lz4.compress(value)
    self.assertGreater(len(compressed_value), len(value))
    client = memcache.Client('127.0.0.1', 11211, client_driver=NoopDriver)
    result = client._val_to_store_info(value, min_compress_len=1)
    self.assertEqual(result, (0, value))
def write_stripe(f, compressed, header, padding=True):
    h = compress(marshal.dumps(header))
    assert len(h) < STRIPE_HEADER_SIZE
    f.write(struct.pack('I', len(h)))
    f.write(h)
    padding_size = STRIPE_SIZE - len(h) - 4
    for c in compressed:
        f.write(c)
        padding_size -= len(c)
    if padding:
        f.write('\0' * padding_size)
def _compress_as_lz4(self):
    if self._lz4_items is None:
        self._uncompress()  # Ensure not currently compressed as draco
        compressed = []

        flat_vertices = self._vertices_zyx.reshape(-1)
        compressed.append(lz4.compress(flat_vertices))  #@UndefinedVariable
        self._vertices_zyx = None

        flat_normals = self._normals_zyx.reshape(-1)
        compressed.append(lz4.compress(flat_normals))  #@UndefinedVariable
        self._normals_zyx = None

        flat_faces = self._faces.reshape(-1)
        compressed.append(lz4.compress(flat_faces))  #@UndefinedVariable
        self._faces = None

        # Compress twice: still fast, even smaller
        self._lz4_items = list(map(lz4.compress, compressed))  #@UndefinedVariable

    return sum(map(len, self._lz4_items))
def send(self, data, host, port):
    """Send raw data through a socket"""
    if not isinstance(data, dict) and not isinstance(data, list):
        raise TypeError("data must be either a list or a dictionary")

    # Get the raw data
    if self._use_lz4:
        data_raw = lz4.compress(self._codec.encode(data))
    else:
        data_raw = self._codec.encode(data)

    # Put the data on the wire as a UTF-8 JSON string
    self.sock.sendto(data_raw, (host, port))
def __init__(self, numpy_array):
    """Serializes and compresses the numpy array with LZ4"""
    self.raw_buffer = None  # only used if we can't compress
    self.compressed_label_blocks = None  # only used for label arrays of suitable shape
    self.compressed_mask_array = None  # only used for binary masks
    self.serialized_subarrays = []

    if numpy_array.flags['F_CONTIGUOUS']:
        self.layout = 'F'
    else:
        self.layout = 'C'

    if self.layout == 'F':
        numpy_array = numpy_array.transpose()

    self.dtype = numpy_array.dtype
    self.shape = numpy_array.shape

    # TODO: Also support compression of bool arrays via the special DVID binary compression
    if self.is_labels(numpy_array):
        self.compressed_label_blocks = serialize_uint64_blocks(numpy_array)
    elif self.dtype == np.bool and numpy_array.ndim == 3:
        # It turns out that encode_mask_array + lz4.compress is better than
        # lz4 compression alone (even multiple rounds of lz4 alone)
        self.compressed_mask_array = lz4.compress(encode_mask_array(numpy_array))
    else:
        if numpy_array.ndim <= 1:
            slice_bytes = numpy_array.nbytes
        else:
            slice_bytes = numpy_array[0].nbytes

        if slice_bytes > CompressedNumpyArray.MAX_LZ4_BUFFER_SIZE:
            warnings.warn("Array is too large to compress -- not compressing.")
            if not numpy_array.flags['C_CONTIGUOUS']:
                numpy_array = numpy_array.copy(order='C')
            self.raw_buffer = bytearray(numpy_array)
        else:
            # For 2D, 1D or 0D arrays, serialize everything in one buffer.
            if numpy_array.ndim <= 2:
                self.serialized_subarrays.append(self.serialize_subarray(numpy_array))
            else:
                # For ND arrays, serialize each slice independently, to ease RAM usage
                for subarray in numpy_array:
                    self.serialized_subarrays.append(self.serialize_subarray(subarray))
def __init__(self, data_dir):
    # term = str()
    # triple = str()
    # args(triple) = (int)
    self.data_dir = data_dir
    # table: id(term) -> term
    self.term_id_map = None
    # table: id(triple) -> args(triple)
    self.triple_id_map = None
    # table: id(term) -> args(triple)
    self.arg_cache = None
    self.rel_id_map = REL_NAME_ID_MAP
    self.id_rel_map = REL_ID_NAME_MAP
    try:
        import lz4 as compressor
        self.compress = compressor.compress
        self.compressHC = compressor.compressHC
        self.decompress = compressor.decompress
    except ImportError:
        import zlib as compressor
        self.compress = lambda data: compressor.compress(data, 3)
        self.compressHC = lambda data: compressor.compress(data, 9)
        self.decompress = lambda data: compressor.decompress(data)
def save_book_raw_data(book_raw_data_obj):
    if use_compression:
        f = open(
            join("/Volumes/NewVolume/Emotional-Arcs/database/cache",
                 str(book_raw_data_obj.this_Book.pk) + ".p.lz4"), "wb")
        f.write(lz4.compress(pickle.dumps(book_raw_data_obj, pickle.HIGHEST_PROTOCOL)))
        f.close()
    else:
        f = open(
            join("/Volumes/NewVolume/Emotional-Arcs/database/cache",
                 str(book_raw_data_obj.this_Book.pk) + ".p"), "wb")
        f.write(pickle.dumps(book_raw_data_obj, pickle.HIGHEST_PROTOCOL))
        f.close()
def save_book_raw_data(book_raw_data_obj):
    if use_compression:
        f = open(
            join("/Users/andyreagan/projects/2014/09-books/data/cache",
                 str(book_raw_data_obj.this_Book.pk) + ".p.lz4"), "wb")
        f.write(lz4.compress(pickle.dumps(book_raw_data_obj, pickle.HIGHEST_PROTOCOL)))
        f.close()
    else:
        f = open(
            join("/Users/andyreagan/projects/2014/09-books/data/cache",
                 str(book_raw_data_obj.this_Book.pk) + ".p"), "wb")
        f.write(pickle.dumps(book_raw_data_obj, pickle.HIGHEST_PROTOCOL))
        f.close()
def push_to_s3(self, compress=True):
    """Upload object to Amazon s3 bucket"""
    s3 = self.connect_and_check_bucket()

    # Archive
    logging.info('Archiving {}...'.format(self.name))
    tar_path = os.path.join(self.PATH, '{}.tar'.format(self.name))
    lz4_path = os.path.join(self.PATH, '{}.tar.lz4'.format(self.name))
    with tarfile.open(tar_path, 'w') as in_file:
        for filename in glob.glob(
                os.path.join(self.PATH, '{stem}.*'.format(stem=self.name))):
            in_file.add(filename, arcname=os.path.split(filename)[1])

    # Compress
    if compress:
        logging.info('{} LZ4 compressing...'.format(self.name))
        with open(tar_path, 'rb') as in_file:
            with open(lz4_path, 'wb') as out_file:
                out_file.write(lz4.compress(in_file.read()))
        upload_path = lz4_path
    else:
        upload_path = tar_path

    # Get key
    key = self.get_key(compress=compress)

    # Upload
    logging.info('Uploading {} on s3...'.format(self.name))
    s3.meta.client.upload_file(upload_path, self.BUCKET_NAME, key,
                               Callback=self.ProgressPercentage(
                                   float(os.path.getsize(upload_path))))

    # Clean
    os.remove(tar_path)
    if compress:
        os.remove(lz4_path)
    logging.info('Upload {} DONE'.format(self.name))
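# Hedged sketch (hypothetical counterpart, not in the original class): download and
# unpack an archive uploaded by push_to_s3() above. `pull_from_s3` is an assumed name;
# connect_and_check_bucket(), get_key(), PATH and BUCKET_NAME are reused from the
# method above, and boto3's s3.meta.client.download_file(bucket, key, filename) does
# the transfer. Assumes the old python-lz4 block API, so lz4.decompress() reverses
# the lz4.compress() call used during upload.
def pull_from_s3(self, compress=True):
    """Download the object from the Amazon s3 bucket and unpack it"""
    s3 = self.connect_and_check_bucket()
    key = self.get_key(compress=compress)
    tar_path = os.path.join(self.PATH, '{}.tar'.format(self.name))
    lz4_path = os.path.join(self.PATH, '{}.tar.lz4'.format(self.name))

    # Download
    download_path = lz4_path if compress else tar_path
    s3.meta.client.download_file(self.BUCKET_NAME, key, download_path)

    # Decompress back to a plain tar archive
    if compress:
        with open(lz4_path, 'rb') as in_file:
            with open(tar_path, 'wb') as out_file:
                out_file.write(lz4.decompress(in_file.read()))

    # Unpack
    with tarfile.open(tar_path, 'r') as archive:
        archive.extractall(self.PATH)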
def compress(data, clibs=[]):
    if len(clibs) == 0:
        return (False, None, data)
    for c in clibs:
        try:
            if c == "zlib":
                import zlib
                return (True, "zlib", zlib.compress(data))
            elif c == "lz4":
                import lz4
                return (True, "lz4", lz4.compress(data))
            elif c == "snappy":
                import pysnappy
                return (True, "snappy", pysnappy.compress(data))
            else:
                import zlib
                return (True, "zlib", zlib.compress(data))
        except:
            return (False, None, data)
    return (False, None, data)
def compress_fixture(self, fixture_lz4_path, with_header):
    """
    Compress json fixture with lz4.

    fixture_lz4_path: The path to the compressed fixture
    with_header: Specify if you want to have the mozilla header at the
                 start of the compressed file
    """
    fixture_json_path = 'fixtures/fixture.json'
    with open(fixture_json_path, mode='rb') as json_file:
        data = json_file.read()

    compressed_data = lz4.compress(data)
    with open(fixture_lz4_path, mode='wb') as lz4_file:
        if with_header:
            header = b'mozLz40\0'
            lz4_file.write(header)
        lz4_file.write(compressed_data)
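# Hedged sketch (assumed counterpart, not part of the original test helper): read back
# a fixture written by compress_fixture() above. Assumes the same old python-lz4 block
# API, whose lz4.compress() output already carries the 4-byte little-endian size prefix
# that lz4.decompress() expects, and the 8-byte b'mozLz40\0' magic written above.
def decompress_fixture(self, fixture_lz4_path, with_header):
    with open(fixture_lz4_path, mode='rb') as lz4_file:
        data = lz4_file.read()
    if with_header:
        assert data[:8] == b'mozLz40\0'
        data = data[8:]
    return lz4.decompress(data)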
def _val_to_store_info(self, val, min_compress_len):
    """
    Transform val to a storable representation, returning a tuple of the
    flags, the length of the new value, and the new value itself.
    """
    flags = 0
    if isinstance(val, str):
        pass
    elif isinstance(val, int):
        flags |= Client._FLAG_INTEGER
        val = "%d" % val
        # maxint is pretty tiny. just return
        return (flags, val)
    elif isinstance(val, long):
        flags |= Client._FLAG_LONG
        val = "%d" % val
        # longs can be huge, so check length and compress if long enough
    else:
        if self.pickle:
            flags |= Client._FLAG_PICKLE
            val = pickle.dumps(val, self.pickle_proto)

    lv = len(val)

    # do not store if value length exceeds maximum
    if self.max_value_length and lv > self.max_value_length:
        raise MemcacheValueError(
            "Value is larger than configured max_value_length. %d > %d" %
            (lv, self.max_value_length))

    # We should try to compress if min_compress_len > 0 and this
    # string is longer than min threshold.
    if min_compress_len and lv > min_compress_len:
        comp_val = lz4.compress(val)
        # Only actually compress if the compressed result is smaller
        # than the original.
        if len(comp_val) < lv:
            flags |= Client._FLAG_COMPRESSED
            val = comp_val

    return (flags, val)
def run():
    fileDir = '/root/thermite/target/release/thermite_test_Neil'
    ratioFile_path = sys.argv[1]
    ratioList = []
    openData = open(fileDir, "rb")
    ratioFile = open(ratioFile_path, "w")
    vvfd = os.open(fileDir, os.O_RDONLY)
    vvData = os.lseek(vvfd, 0, os.SEEK_END)
    upper_limit = vvData / 4096
    for x in xrange(0, upper_limit, 1):
        openData.seek(4096 * x, 0)
        data = openData.read(4096)
        compOutput = lz4.compress(data)
        sys.getsizeof(compOutput)
        ratio = sys.getsizeof(compOutput) / float(sys.getsizeof(data))
        ratioFile.write(str(ratio) + '\n')
    ratioFile.close()
def pack(fname):
    global total, compressed
    f = open(fname)
    data = f.read()
    f.close()
    checksum = siphashc.siphash('\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0', data)
    fsize = len(data)
    data = lz4.compress(data)
    # size of packet(4), checksum(8), fnamelen(2)+fname, uncompressed size(4), compressed data
    l = len(data)
    pktlen = 4 + 8 + 2 + len(fname) + 4 + l
    total += fsize
    compressed += len(data)
    sys.stderr.write("%s: %d -> %d\n" % (fname, fsize, len(data)))
    sys.stdout.write(
        struct.pack('<IQH%dsI%ds' % (len(fname), l),
                    pktlen, checksum, len(fname), fname, fsize, data))
    sys.stdout.flush()
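# Hedged sketch (assumed counterpart, not in the original source): read back one packet
# written by pack() above from a binary stream. `unpack_one` is a hypothetical name; the
# field layout follows the comment in pack(), the siphash key is the same 16 zero bytes,
# and the old python-lz4 block API is assumed, so the compressed payload still carries
# its own size prefix for lz4.decompress().
def unpack_one(stream):
    pktlen, checksum, fnamelen = struct.unpack('<IQH', stream.read(4 + 8 + 2))
    fname = stream.read(fnamelen)
    fsize = struct.unpack('<I', stream.read(4))[0]
    data = lz4.decompress(stream.read(pktlen - 4 - 8 - 2 - fnamelen - 4))
    assert len(data) == fsize
    assert checksum == siphashc.siphash('\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0', data)
    return fname, data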
for line in bz2_fh:
    line = line.decode('utf-8')
    title_match = title_re.match(line)
    if title_match:
        if 'lat_d' in old_coords and 'long_d' in old_coords:
            lat, lng = normalize_coords(**old_coords)
            j += 1
            if lat and lng:
                abstract, img = extract_abstract(text)
                rank = len(text)
                #print("\t".join(map(str, (title, lat, lng))))
                print(title)
                try:
                    POI(name=title, at=[lng, lat],
                        abstract=lz4.compress(abstract),
                        rank=rank, img=img).save()
                    #POI(name=title, at=[lng, lat], abstract=abstract).save()
                except Exception as e:
                    print("Insert error:", str(e), title, lat, lng, file=sys.stderr)
                    # raise
                # print("Begin abstract")
                # print(abstract)
                # print("End abstract")
        title = title_match.group(1)
        coords, old_coords, lat, lng, text, in_text = None, {}, None, None, '', False
        continue

    coord_match = coord_re.match(line)
    if coord_match:
        try:
            lat, lng = coord2latlng(line)
            # print("\t".join(map(str, (title, lat, lng))))
import uuid
import timeit
import lz4
import snappy
import os
from timeit import Timer

DATA = open("../src/lz4.c", "rb").read()
LZ4_DATA = lz4.compress(DATA)
SNAPPY_DATA = snappy.compress(DATA)
LOOPS = 200000

print("Data Size:")
print(" Input: %d" % len(DATA))
print(" LZ4: %d (%.2f)" % (len(LZ4_DATA), len(LZ4_DATA) / float(len(DATA))))
print(" Snappy: %d (%.2f)" % (len(SNAPPY_DATA), len(SNAPPY_DATA) / float(len(DATA))))
print(" LZ4 / Snappy: %f" % (float(len(LZ4_DATA)) / float(len(SNAPPY_DATA))))

print("Benchmark: %d calls" % LOOPS)
print(" LZ4 Compression: %fs" %
      Timer("lz4.compress(DATA)",
            "from __main__ import DATA; import lz4").timeit(number=LOOPS))
print(" Snappy Compression: %fs" %
      Timer("snappy.compress(DATA)",
            "from __main__ import DATA; import snappy").timeit(number=LOOPS))
print(" LZ4 Decompression: %fs" %
      Timer("lz4.uncompress(LZ4_DATA)",
            "from __main__ import LZ4_DATA; import lz4").timeit(number=LOOPS))
print(" Snappy Decompression : %fs" %
      Timer("snappy.uncompress(SNAPPY_DATA)",
            "from __main__ import SNAPPY_DATA; import snappy").timeit(number=LOOPS))
def test_roundtripLZ4():
    _str = "hello world"
    cstr = lz4.compress(_str)
    assert _str == c.decompress(cstr)
def lz4_pickle_dump(data, filename):
    path = pathlib.Path(filename)
    with path.open('wb') as f:
        f.write(lz4.compress(pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)))
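# Hedged sketch (assumed counterpart, not in the original source): load an object
# written by lz4_pickle_dump() above. Uses the same module-level imports as that
# function (pathlib, pickle, lz4) and assumes the old python-lz4 block API, so
# lz4.decompress() reverses lz4.compress().
def lz4_pickle_load(filename):
    path = pathlib.Path(filename)
    with path.open('rb') as f:
        return pickle.loads(lz4.decompress(f.read()))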
def _test_block(labels, test_name):
    # labelarray
    with Timer() as timer:
        dvid_encoded_list = serialize_uint64_blocks(labels)
    dvid_encoded_bytes = sum(map(len, dvid_encoded_list))
    dvid_enc_time = timer.seconds
    dvid_enc_throughput = (labels.nbytes / dvid_enc_time) / 1e6

    with Timer() as timer:
        decoded = deserialize_uint64_blocks(dvid_encoded_list, labels.shape)
        assert (decoded == labels).all()
    dvid_dec_time = timer.seconds
    dvid_dec_throughput = (labels.nbytes / dvid_dec_time) / 1e6

    # DVID + gzip
    with Timer() as timer:
        gzipped_dvid_encoded_list = list(map(gzip.compress, dvid_encoded_list))
    gzipped_dvid_enc_time = timer.seconds + dvid_enc_time
    gzipped_dvid_enc_throughput = (labels.nbytes / gzipped_dvid_enc_time) / 1e6
    gzipped_dvid_encoded_bytes = sum(map(len, gzipped_dvid_encoded_list))
    print("+ GZIP:", gzipped_dvid_encoded_bytes)
    print(f"Compression ratio: {labels.nbytes/gzipped_dvid_encoded_bytes:.1f}x")
    print(f"DVID+GZIP encode throughput: {gzipped_dvid_enc_throughput} MB/s")

    with Timer() as timer:
        unzippped = list(map(gzip.decompress, gzipped_dvid_encoded_list))
        assert (decoded == labels).all()
    gzipped_dvid_dec_time = timer.seconds + dvid_dec_time
    gzipped_dvid_dec_throughput = (labels.nbytes / gzipped_dvid_dec_time) / 1e6
    print(f"DVID+GZIP decode throughput: {gzipped_dvid_dec_throughput} MB/s")

    # DVID + LZ4
    with Timer() as timer:
        lz4_dvid_encoded_list = list(map(lz4.compress, dvid_encoded_list))
    lz4_dvid_enc_time = timer.seconds + dvid_enc_time
    lz4_dvid_enc_throughput = (labels.nbytes / lz4_dvid_enc_time) / 1e6
    lz4_dvid_encoded_bytes = sum(map(len, lz4_dvid_encoded_list))
    print("+ LZ4:", lz4_dvid_encoded_bytes)
    print(f"Compression ratio: {labels.nbytes/lz4_dvid_encoded_bytes:.1f}x")
    print(f"DVID+LZ4 encode throughput: {lz4_dvid_enc_throughput} MB/s")

    with Timer() as timer:
        unzippped = list(map(lz4.decompress, lz4_dvid_encoded_list))
        assert (decoded == labels).all()
    lz4_dvid_dec_time = timer.seconds + dvid_dec_time
    lz4_dvid_dec_throughput = (labels.nbytes / lz4_dvid_dec_time) / 1e6
    print(f"DVID+LZ4 decode throughput: {lz4_dvid_dec_throughput} MB/s")

    # lz4
    with Timer() as timer:
        lz4_encoded = lz4.compress(labels)
    lz4_encoded_bytes = len(lz4_encoded)
    lz4_enc_time = timer.seconds
    lz4_enc_throughput = (labels.nbytes / lz4_enc_time) / 1e6

    with Timer() as timer:
        lz4_decoded = lz4.decompress(lz4_encoded)
        decoded_labels = np.frombuffer(lz4_decoded, np.uint64).reshape(labels.shape)
        assert (decoded_labels == labels).all()
    lz4_dec_time = timer.seconds
    lz4_dec_throughput = (labels.nbytes / lz4_dec_time) / 1e6

    global HEADER_PRINTED
    if not HEADER_PRINTED:
        print(f"{'':>20s} {'______ ENCODED BYTES ______ ':^41s} | {'______ ENCODING TIME ______ ':^77s} | {'______ DECODING TIME ______ ':^77s} |")
        print(f"{'':>20s} {'LZ4':>10s} {'DVID':>10s} {'D+G':>10s} {'DECREASE':>9s} |"
              f"{'------- LZ4 -------':>22s} {'------ DVID ------':>22s} {'---- DVID+GZIP ----':>22s} {'SLOWDOWN':>9s} |"
              f"{'------- LZ4 -------':>22s} {'------ DVID ------':>22s} {'---- DVID+GZIP ----':>22s} {'SLOWDOWN':>9s} |")
        HEADER_PRINTED = True

    print(f"{test_name:>19s}: {lz4_encoded_bytes: 10d} {dvid_encoded_bytes: 10d} {lz4_encoded_bytes/dvid_encoded_bytes:8.1f}x |"
          f"{lz4_enc_time:6.2f}s ({lz4_enc_throughput:7.1f} MB/s) {dvid_enc_time:6.2f}s ({dvid_enc_throughput:7.1f} MB/s) {dvid_enc_time/lz4_enc_time:8.1f}x |"
          f"{lz4_dec_time:6.2f}s ({lz4_dec_throughput:7.1f} MB/s) {dvid_dec_time:6.2f}s ({dvid_dec_throughput:7.1f} MB/s) {dvid_dec_time/lz4_dec_time:8.1f}x |")
def test_lz4_4_2():
    lz4.compress(garbage100k)
def lz4_compress(byts):
    # write length in big-endian instead of little-endian
    return int32_pack(len(byts)) + lz4.compress(byts)[4:]
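# Hedged sketch (assumed counterpart, not in the original source): the inverse of
# lz4_compress() above. The old python-lz4 block API prefixes lz4.compress() output
# with the uncompressed length as a little-endian 32-bit integer; lz4_compress()
# swapped that prefix for a big-endian one, so rebuild the little-endian prefix
# before calling lz4.decompress(). Uses struct directly instead of the int32_pack
# helper referenced above.
def lz4_decompress(byts):
    uncompressed_size = struct.unpack('>i', byts[:4])[0]
    return lz4.decompress(struct.pack('<i', uncompressed_size) + byts[4:])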
from mokujin.resource import StopList
from mokujin.misc import transliterate_ru
from mokujin.resource import ConceptNetList
from mokujin.index import TripleSearchEngine
from mokujin.sourcesearch import TripleStoreExplorer
from mokujin.patternsearch import PatternCollection

try:
    import lz4 as comp
    comp_format = "lz4"
    compress = comp.compressHC
    decompress = comp.decompress
except ImportError:
    import zlib as comp
    comp_format = "zip"
    compress = lambda string: comp.compress(string, 9)
    decompress = comp.decompress


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--index", default="data/index",
                        help="Triple store index directory", type=str)
    parser.add_argument("-o", "--outputdir", default="output",
                        help="Directory where the script's output will be placed", type=str)
    parser.add_argument("-q", "--queryterm", default=None,
                        help="Query term", type=str)
    parser.add_argument("-qf", "--queryterms_file", default=None,
                        help="File with query terms", type=str)
    parser.add_argument("-s", "--stoplist", default="resources/word.freq.ru.csv",
                        help="Stop list file", type=str)
    parser.add_argument("-ts", "--t_stop", default=500,
                        help="Stop words frequency threshold", type=float)
    parser.add_argument("-tt", "--t_triple", default=5,
                        help="Min frequency threshold for target triples", type=float)