def testDecompress4G(self, size):
    # "Test BZ2Decompressor.decompress() with >4GiB input"
    blocksize = 10 * 1024 * 1024
    block = random.randbytes(blocksize)
    try:
        data = block * (size // blocksize + 1)
        compressed = bz2.compress(data)
        bz2d = BZ2Decompressor()
        decompressed = bz2d.decompress(compressed)
        self.assertTrue(decompressed == data)
    finally:
        data = None
        compressed = None
        decompressed = None
def test_decompress_chunks_10(self):
    from bz2 import BZ2Decompressor
    bz2d = BZ2Decompressor()
    decompressed_data = b""
    n = 0
    while True:
        temp = self.DATA[n * 10:(n + 1) * 10]
        if not temp:
            break
        decompressed_data += bz2d.decompress(temp)
        n += 1
    assert decompressed_data == self.TEXT
def __init__(self, hasher, inputStream, outFile, start=0, length=None,
             notify=None, decompress=None, decFile=None):
    """Initializes the files.

    @type hasher: hashing object, e.g. C{sha1}
    @param hasher: the hash object for the data
    @type inputStream: L{twisted.web2.stream.IByteStream}
    @param inputStream: the input stream to read from
    @type outFile: C{file}
    @param outFile: the open file to write to
    @type start: C{int}
    @param start: the file position to start writing at
        (optional, defaults to the start of the file)
    @type length: C{int}
    @param length: the maximum amount of data to write to the file
        (optional, defaults to not limiting the writing to the file)
    @param notify: a method that will be notified of the length of
        received data (optional)
    @type decompress: C{string}
    @param decompress: also decompress the file as this type
        (currently only '.gz' and '.bz2' are supported)
    @type decFile: C{twisted.python.FilePath}
    @param decFile: the file to write the decompressed data to
    """
    self.stream = inputStream
    self.outFile = outFile
    self.hasher = hasher
    self.gzfile = None
    self.bz2file = None
    if decompress == ".gz":
        self.gzheader = True
        self.gzfile = decFile.open('w')
        self.gzdec = decompressobj(-MAX_WBITS)
    elif decompress == ".bz2":
        self.bz2file = decFile.open('w')
        self.bz2dec = BZ2Decompressor()
    self.position = start
    self.length = None
    if length is not None:
        self.length = start + length
    self.notify = notify
    self.doneDefer = None
def try_decompress_at(input_file: bytes, offset: int) -> bytes:
    decoded = None
    try:
        if Signature.check(input_file, offset, Signature.DTB_Appended_Qualcomm):
            # Merely unpack a Qualcomm kernel file containing a magic and DTB
            # offset at the start (so that offsets aren't wrong).
            dtb_offset_le = int.from_bytes(input_file[offset + 16:offset + 20], 'little')
            dtb_offset_be = int.from_bytes(input_file[offset + 16:offset + 20], 'big')
            decoded = input_file[offset + 20:offset + 20 + min(dtb_offset_le, dtb_offset_be)]
        elif Signature.check(input_file, offset, Signature.Compressed_GZIP):
            # GZIP - will stop reading after the GZip footer thanks to our
            # modification above.
            decoded = SingleGzipReader(BytesIO(input_file[offset:])).read(-1)
        elif (Signature.check(input_file, offset, Signature.Compressed_XZ) or
                Signature.check(input_file, offset, Signature.Compressed_LZMA)):
            try:
                # XZ/LZMA - will discard the extra bytes and put them in an attribute.
                decoded = LZMADecompressor().decompress(input_file[offset:])
            except Exception:
                # pylzma format compatibility
                decoded = LZMADecompressor().decompress(
                    input_file[offset:offset + 5] + b'\xff' * 8 + input_file[offset + 5:])
        elif Signature.check(input_file, offset, Signature.Compressed_BZ2):
            # BZ2 - will discard the extra bytes and put them in an attribute.
            decoded = BZ2Decompressor().decompress(input_file[offset:])
        elif Signature.check(input_file, offset, Signature.Compressed_LZ4):  # LZ4 support
            try:
                LZ4Decompressor = importlib.import_module('lz4.frame')
            except ModuleNotFoundError:
                logging.error('ERROR: This kernel requires LZ4 decompression,')
                logging.error('       but the "lz4" Python package was not found.')
                logging.error('       Example installation command: "sudo pip3 install lz4"')
                logging.error('')
                return
            context = LZ4Decompressor.create_decompression_context()
            decoded, bytes_read, end_of_frame = LZ4Decompressor.decompress_chunk(
                context, input_file[offset:])
    except Exception:
        pass
    if decoded and len(decoded) > 0x1000:
        logging.info('[+] Kernel successfully decompressed in-memory (the offsets that '
                     'follow will be given relative to the decompressed binary)')
        return decoded
def getArticle(self, byte, pageID):
    bz2d = BZ2Decompressor()
    byte_string = bz2d.decompress(byte)
    doc = etree.parse(io.BytesIO(b'<root> ' + byte_string + b' </root>'))
    r = self.getReadable(doc.xpath("*/id"))
    index = r.index(str(pageID))
    r = self.getReadable(doc.xpath("*/revision/text"))[index]
    # print(r)
    article = r.find("'''")
    r = r[article:]
    articleE = r.find("==")
    r = r[:articleE]
    return r
def _download(self, filename):
    """Download requested file."""
    # GET request from TLOPO download servers for the file; all files are
    # downloaded w/ a .bz2 extension
    data = requests.get(self.downloadServer + filename + '.bz2', stream=True)
    file = pathlib.Path(self.currdir, self.location, filename)
    # If the file path does not exist, create it
    if not os.path.exists(os.path.dirname(file)):
        os.makedirs(os.path.dirname(file))
    # Write the file from downloaded content, decompressing the bz2 stream
    # directly from memory before writing
    decompressor = BZ2Decompressor()
    with open(file, 'wb') as content:
        for chunk in data.iter_content(chunk_size=128):
            content.write(decompressor.decompress(chunk))
def test_decompressor_inputbuf_3(self):
    # Test reusing input buffer after extending it
    bzd = BZ2Decompressor()
    out = []
    # Create almost full input buffer
    out.append(bzd.decompress(self.DATA[:200], 5))
    # Add even more data to it, requiring resize
    out.append(bzd.decompress(self.DATA[200:300], 5))
    # Decompress rest of data
    out.append(bzd.decompress(self.DATA[300:]))
    self.assertEqual(b''.join(out), self.TEXT)
def test_decompress_max_length(self):
    from bz2 import BZ2Decompressor
    bz2d = BZ2Decompressor()
    decomp = []
    length = len(self.DATA)
    decomp.append(bz2d.decompress(self.DATA, max_length=100))
    assert len(decomp[-1]) == 100
    while not bz2d.eof:
        decomp.append(bz2d.decompress(b"", max_length=200))
        assert len(decomp[-1]) <= 200
    assert b''.join(decomp) == self.TEXT
def _releases(self) -> Dict[str, Dict[str, Dict[str, Any]]]:
    all_deps = defaultdict(dict)
    for channel in self._channels:
        cache = JSONCache('conda.anaconda.org', 'releases', channel,
                          ttl=config['cache']['ttl'])
        channel_deps = cache.load()
        if channel_deps is not None:
            for dep, releases in channel_deps.items():
                all_deps[dep].update(releases)
            continue

        channel_deps = defaultdict(dict)
        for url in self._get_urls(channel=channel):
            with requests_session() as session:
                response = session.get(url)
                response.raise_for_status()
            content = BZ2Decompressor().decompress(response.content).decode('utf-8')
            base_url = url.rsplit('/', 1)[0]
            for fname, info in json.loads(content)['packages'].items():
                # release info
                name = canonicalize_name(info.pop('name'))
                version = info.pop('version')
                if version not in channel_deps[name]:
                    channel_deps[name][version] = dict(
                        depends=set(),
                        timestamp=info.get('timestamp', 0) // 1000,
                        files=[],
                    )
                # file info
                channel_deps[name][version]['depends'].update(info['depends'])
                channel_deps[name][version]['files'].append(dict(
                    url=base_url + '/' + fname,
                    sha256=info.get('sha256', None),
                    size=info['size'],
                ))

        for dep, releases in channel_deps.items():
            for release in releases.values():
                release['depends'] = list(release['depends'])
            all_deps[dep].update(releases)
        cache.dump(channel_deps)

    return dict(all_deps)
def scan_bzip_content(file_, substr=None, _chunksize=25 * (1024**2)):
    # bunzip2 -c RC_20##-##.bz2 | grep '"author":"$AUTHOR"' | jq .id -r
    with open(file_, "rb") as fin:
        chunk = fin.read(_chunksize)
        decompressor = BZ2Decompressor()
        extra = None
        while chunk:
            data = decompressor.decompress(chunk)
            if data:
                lines = data.split(b"\n")
                if extra:
                    lines[0] = extra + lines[0]
                extra = lines.pop(-1)
                for line in lines:
                    if substr is None or substr in line:
                        yield loads(line.strip())
            chunk = fin.read(_chunksize)
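# Hedged usage sketch for scan_bzip_content() above, not part of the original
# source: the file name, author substring, and the "id" field access are
# hypothetical, mirroring the shell pipeline quoted in the function's comment.
if __name__ == "__main__":
    for comment in scan_bzip_content("RC_2015-01.bz2",
                                     substr=b'"author":"example_user"'):
        print(comment["id"])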
def get_X_y(dataset, compressed_path, multilabel, replace=False):
    """Load a LIBSVM dataset as sparse X and observation y/Y.
    If X and y already exist as npz and npy, they are not redownloaded
    unless replace=True."""
    ext = '.npz' if multilabel else '.npy'
    y_path = pjoin(CELER_PATH, "%s_target%s" % (NAMES[dataset], ext))
    X_path = pjoin(CELER_PATH, "%s_data.npz" % NAMES[dataset])
    if replace or not os.path.isfile(y_path) or not os.path.isfile(X_path):
        tmp_path = pjoin(CELER_PATH, "%s" % NAMES[dataset])

        decompressor = BZ2Decompressor()
        print("Decompressing...")
        with open(tmp_path, "wb") as f, open(compressed_path, "rb") as g:
            for data in iter(lambda: g.read(100 * 1024), b''):
                f.write(decompressor.decompress(data))

        n_features_total = N_FEATURES[dataset]
        print("Loading svmlight file...")
        with open(tmp_path, 'rb') as f:
            X, y = load_svmlight_file(f, n_features_total, multilabel=multilabel)

        os.remove(tmp_path)
        X = sparse.csc_matrix(X)
        X.sort_indices()
        sparse.save_npz(X_path, X)

        if multilabel:
            indices = np.array([lab for labels in y for lab in labels])
            indptr = np.cumsum([0] + [len(labels) for labels in y])
            data = np.ones_like(indices)
            Y = sparse.csr_matrix((data, indices, indptr))
            sparse.save_npz(y_path, Y)
            return X, Y
        else:
            np.save(y_path, y)
    else:
        X = sparse.load_npz(X_path)
        y = np.load(y_path)
    return X, y
def testDecompressorChunksMaxsize(self):
    bzd = BZ2Decompressor()
    max_length = 100
    out = []

    # Feed some input
    len_ = len(self.BIG_DATA) - 64
    out.append(bzd.decompress(self.BIG_DATA[:len_], max_length=max_length))
    self.assertFalse(bzd.needs_input)
    self.assertEqual(len(out[-1]), max_length)

    # Retrieve more data without providing more input
    out.append(bzd.decompress(b'', max_length=max_length))
    self.assertFalse(bzd.needs_input)
    self.assertEqual(len(out[-1]), max_length)

    # Retrieve more data while providing more input
    out.append(bzd.decompress(self.BIG_DATA[len_:], max_length=max_length))
    self.assertLessEqual(len(out[-1]), max_length)

    # Retrieve the remaining uncompressed data
    while not bzd.eof:
        out.append(bzd.decompress(b'', max_length=max_length))
        self.assertLessEqual(len(out[-1]), max_length)

    out = b''.join(out)
    self.assertEqual(out, self.BIG_TEXT)
    self.assertEqual(bzd.unused_data, b'')
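# A minimal sketch (not from any source above) of the incremental pattern the
# max_length tests exercise: feed compressed bytes only when needs_input is
# true, otherwise keep draining the decompressor's buffered output in bounded
# slices, so memory stays capped for highly compressible input. `fileobj` is
# assumed to be a binary file containing one complete bz2 stream.
import bz2

def iter_decompress(fileobj, max_length=64 * 1024, read_size=16 * 1024):
    decomp = bz2.BZ2Decompressor()
    while not decomp.eof:
        data = fileobj.read(read_size) if decomp.needs_input else b''
        out = decomp.decompress(data, max_length=max_length)
        if out:
            yield out
        if decomp.needs_input and not data:
            # Input exhausted before the end-of-stream marker.
            raise EOFError("truncated bz2 stream")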
def __init__(self, fpatch, compression):
    if compression == 'lzma':
        self._decompressor = LZMADecompressor()
    elif compression == 'bz2':
        self._decompressor = BZ2Decompressor()
    elif compression == 'crle':
        self._decompressor = CrleDecompressor(patch_data_length(fpatch))
    elif compression == 'none':
        self._decompressor = NoneDecompressor(patch_data_length(fpatch))
    elif compression == 'heatshrink':
        self._decompressor = HeatshrinkDecompressor(patch_data_length(fpatch))
    elif compression == 'zstd':
        self._decompressor = ZstdDecompressor(patch_data_length(fpatch))
    elif compression == 'lz4':
        self._decompressor = Lz4Decompressor()
    else:
        raise Error(format_bad_compression_string(compression))
    self._fpatch = fpatch
def test_decompressor_inputbuf_2(self):
    # Test reusing input buffer by appending data at the
    # end right away
    bzd = BZ2Decompressor()
    out = []
    # Create input buffer and empty it
    self.assertEqual(bzd.decompress(self.DATA[:200], max_length=0), b'')
    out.append(bzd.decompress(b''))
    # Fill buffer with new data
    out.append(bzd.decompress(self.DATA[200:280], 2))
    # Append some more data, not enough to require resize
    out.append(bzd.decompress(self.DATA[280:300], 2))
    # Decompress rest of data
    out.append(bzd.decompress(self.DATA[300:]))
    self.assertEqual(b''.join(out), self.TEXT)
def httpFinished(self):
    self.outFile.close()
    if self.httpRequestAborted or self._reply.error():
        self.outFile.remove()
    self._reply.deleteLater()
    del self._reply
    # Download finished: decompress the landmarks file, then start the camera
    self.setText("Decompressing data...")
    try:
        bz = BZ2Decompressor()
        data = bz.decompress(
            open('D:/access55/shape_predictor_68_face_landmarks.dat.bz2', 'rb').read())
        open('D:/access55/shape_predictor_68_face_landmarks.dat', 'wb').write(data)
    except Exception as e:
        self.setText('Decompression failed: ' + str(e))
        return
    self.setText('Starting camera...')
    self.startCapture()
def test_decompressor_inputbuf_1(self):
    # Test reusing input buffer after moving existing
    # contents to beginning
    bzd = BZ2Decompressor()
    out = []
    # Create input buffer and fill it
    self.assertEqual(bzd.decompress(self.DATA[:100], max_length=0), b'')
    # Retrieve some results, freeing capacity at beginning
    # of input buffer
    out.append(bzd.decompress(b'', 2))
    # Add more data that fits into input buffer after
    # moving existing data to beginning
    out.append(bzd.decompress(self.DATA[100:105], 15))
    # Decompress rest of data
    out.append(bzd.decompress(self.DATA[105:]))
    self.assertEqual(b''.join(out), self.TEXT)
def run(self):
    while True:
        delay = self.config.get("phishtank_update_delay")
        try:
            delay = int(delay)
        except (ValueError, TypeError):
            delay = 0
        if delay < 1:
            self.logger.info("update not set or <1, not running")
            time.sleep(60)
            continue
        if delay < 3600:
            self.logger.info(
                "specified delay of {} too short, forcing to 1 hour".format(delay))
            delay = 3600
        apikey = self.config.get("phishtank_api_key")
        if not apikey:
            self.logger.info("apikey not configured, not running")
            continue
        self.logger.info("running phishtank update operation")
        res = requests.get(PhishTank.FILE_URL.format(apikey))
        if res.status_code != requests.codes.ok:
            self.logger.info("error fetching file: {} - {}".format(
                res.status_code, res.text))
            continue
        # let any error bubble up
        archive = StringIO()
        decompressor = BZ2Decompressor()
        data = decompressor.decompress(res.content)
        archive.write(data.decode("us-ascii"))
        dec = JSONDecoder()
        json_data = dec.decode(archive.getvalue())
        for item in json_data:
            url = item.get("url")
            if url:
                self.cache.setex(url, delay, "suspicious")
        self.logger.info("phishtank update operation completed")
        time.sleep(delay)
def try_decompress_at(input_file: bytes, offset: int) -> bytes:
    decoded = None
    try:
        if input_file[offset:offset + 3] == b'\x1f\x8b\x08':  # GZIP signature
            # Will stop reading after the GZip footer thanks to our
            # modification above.
            decoded = SingleGzipReader(BytesIO(input_file[offset:])).read(-1)
        elif (input_file[offset:offset + 6] == b'\xfd7zXZ\x00' or
                input_file[offset:offset + 3] == b']\x00\x00'):  # XZ/LZMA signature
            try:
                # Will discard the extra bytes and put them in an attribute.
                decoded = LZMADecompressor().decompress(input_file[offset:])
            except Exception:
                # pylzma format compatibility
                decoded = LZMADecompressor().decompress(
                    input_file[offset:offset + 5] + b'\xff' * 8 + input_file[offset + 5:])
        elif input_file[offset:offset + 3] == b'BZh':  # BZ2 signature
            # Will discard the extra bytes and put them in an attribute.
            decoded = BZ2Decompressor().decompress(input_file[offset:])
    except Exception:
        pass
    if decoded and len(decoded) > 0x1000:
        print('[+] Kernel successfully decompressed in-memory (the offsets that '
              'follow will be given relative to the decompressed binary)')
        return decoded
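# Hedged companion sketch (not from the original tool): scan a blob for the
# magic bytes the function above dispatches on, yielding candidate offsets to
# pass to try_decompress_at(). Only the signatures shown above are covered.
def find_candidate_offsets(blob: bytes):
    for magic in (b'\x1f\x8b\x08', b'\xfd7zXZ\x00', b']\x00\x00', b'BZh'):
        pos = blob.find(magic)
        while pos != -1:
            yield pos
            pos = blob.find(magic, pos + 1)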
def get_metadata(self):
    """Get the namespace correspondence from the beginning of the bz2 file,
    and get the number of pages."""
    # uncompressing data
    with open(self.path_index) as index:
        max_byte = int(index.readline().split(':')[0])
    with open(self.path_data, 'rb') as data:
        byte_file = data.read(565)
    uncompressed_data = BZ2Decompressor().decompress(byte_file).decode()
    # getting namespaces
    st = uncompressed_data + "</mediawiki>"
    m = re.search(r'xmlns=[^ ]+? ', st)
    st = st[:m.start()] + st[m.end():]
    root_header = ET.fromstring(st)
    namespaces_dict = {i.attrib['key']: i.text for i in root_header[0][5]}
    # get size of index
    with os.popen('cat {} | wc -l'.format(self.path_index)) as cmd:
        total = int(cmd.read())
    return namespaces_dict, total
def get_data(self, url):
    try:
        tmp_handle = tempfile.NamedTemporaryFile()
        response = requests.get(url, stream=True)
        if not response.ok:
            raise Exception('Failed to download from url "{0}".'.format(
                self.cached_url))
        # represents if decompression should be applied
        decompress = self.decompress
        if decompress:
            # create sequential decompressor
            decompressor = BZ2Decompressor()
        # iterate over the image data in chunks
        for chunk in response.iter_content(1024 * 1024):
            if not chunk:
                break
            if decompress:
                # send data through decompressor if necessary
                tmp_handle.write(decompressor.decompress(chunk))
            else:
                # write data directly into tmp file
                tmp_handle.write(chunk)
        # clean up your open connections
        response.connection.close()
        # reset file descriptor to position 0 before returning it
        tmp_handle.seek(0)
    except Exception as e:
        raise Exception('Failed to get image: "{0}".'.format(str(e)))
    return tmp_handle
def _fill_buffer(self):
    if self._mode == _MODE_READ_EOF:
        return False
    # Depending on the input data, our call to the decompressor may not
    # return any data. In this case, try again after reading another block.
    while self._buffer_offset == len(self._buffer):
        rawblock = (self._decompressor.unused_data or
                    self._fp.read(_BUFFER_SIZE))
        if not rawblock:
            try:
                self._decompressor.decompress(b"")
            except EOFError:
                # End-of-stream marker and end of file. We're good.
                self._mode = _MODE_READ_EOF
                self._size = self._pos
                return False
            else:
                # Problem - we were expecting more compressed data.
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
        try:
            self._buffer = self._decompressor.decompress(rawblock)
        except EOFError:
            # Continue to next stream.
            self._decompressor = BZ2Decompressor()
            try:
                self._buffer = self._decompressor.decompress(rawblock)
            except IOError:
                # Trailing data isn't a valid bzip2 stream. We're done here.
                self._mode = _MODE_READ_EOF
                self._size = self._pos
                return False
        self._buffer_offset = 0
    return True
def _fill_buffer(self):
    if self._buffer:
        return True

    if self._decompressor.unused_data:
        rawblock = self._decompressor.unused_data
    else:
        rawblock = self._fp.read(_BUFFER_SIZE)

    if not rawblock:
        if self._decompressor.eof:
            self._mode = _MODE_READ_EOF
            self._size = self._pos
            return False
        else:
            raise EOFError("Compressed file ended before the "
                           "end-of-stream marker was reached")

    # Continue to next stream.
    if self._decompressor.eof:
        self._decompressor = BZ2Decompressor()

    self._buffer = self._decompressor.decompress(rawblock)
    return True
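# Self-contained sketch (an illustration, not taken from either reader above)
# of the multi-stream handling both _fill_buffer() variants implement: when
# one bz2 stream reaches eof, a fresh BZ2Decompressor is created and fed the
# previous one's unused_data. `raw` is assumed to hold one or more
# concatenated bz2 streams; invalid trailing bytes raise OSError here, which
# the readers above instead treat as end of data.
import bz2

def decompress_multistream(raw: bytes) -> bytes:
    out = []
    decomp = bz2.BZ2Decompressor()
    data = raw
    while data:
        out.append(decomp.decompress(data))
        if not decomp.eof:
            break  # stream incomplete; return what was recovered
        data = decomp.unused_data  # trailing bytes may start another stream
        decomp = bz2.BZ2Decompressor()
    return b''.join(out)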
def testEOFError(self):
    # "Calling BZ2Decompressor.decompress() after EOS must raise EOFError"
    bz2d = BZ2Decompressor()
    text = bz2d.decompress(self.DATA)
    self.assertRaises(EOFError, bz2d.decompress, "anything")
    self.assertRaises(EOFError, bz2d.decompress, "")
def testDecompress(self):
    # "Test BZ2Decompressor.decompress()"
    bz2d = BZ2Decompressor()
    self.assertRaises(TypeError, bz2d.decompress)
    text = bz2d.decompress(self.DATA)
    self.assertEqual(text, self.TEXT)
def testPickle(self):
    for proto in range(pickle.HIGHEST_PROTOCOL + 1):
        with self.assertRaises(TypeError):
            pickle.dumps(BZ2Decompressor(), proto)
def testEOFError(self):
    bz2d = BZ2Decompressor()
    text = bz2d.decompress(self.DATA)
    self.assertRaises(EOFError, bz2d.decompress, b"anything")
    self.assertRaises(EOFError, bz2d.decompress, b"")
def testDecompressUnusedData(self):
    bz2d = BZ2Decompressor()
    unused_data = b"this is unused data"
    text = bz2d.decompress(self.DATA + unused_data)
    self.assertEqual(text, self.TEXT)
    self.assertEqual(bz2d.unused_data, unused_data)
def test_failure(self):
    bzd = BZ2Decompressor()
    self.assertRaises(Exception, bzd.decompress, self.BAD_DATA * 30)
    self.assertRaises(Exception, bzd.decompress, self.BAD_DATA * 30)
def apply_patch_bsdiff(ffrom, fpatch, fto):
    """Apply given bsdiff patch `fpatch` to `ffrom` to create `fto`.
    Returns the size of the created to-data. All arguments are
    file-like objects.

    >>> ffrom = open('foo.mem', 'rb')
    >>> fpatch = open('foo-bsdiff.patch', 'rb')
    >>> fto = open('foo.new', 'wb')
    >>> apply_patch_bsdiff(ffrom, fpatch, fto)
    2780

    """
    ctrl_size, diff_size, to_size = read_header_bsdiff(fpatch)
    ctrl_decompressor = BZ2Decompressor()
    diff_decompressor = BZ2Decompressor()
    extra_decompressor = BZ2Decompressor()
    ctrl_decompressor.decompress(fpatch.read(ctrl_size), 0)
    diff_decompressor.decompress(fpatch.read(diff_size), 0)
    extra_decompressor.decompress(fpatch.read(), 0)
    to_pos = 0

    while to_pos < to_size:
        # Control data.
        diff_size = offtin(ctrl_decompressor.decompress(b'', 8))
        extra_size = offtin(ctrl_decompressor.decompress(b'', 8))
        adjustment = offtin(ctrl_decompressor.decompress(b'', 8))

        # Diff data.
        if to_pos + diff_size > to_size:
            raise Error("Patch diff data too long.")

        if diff_size > 0:
            diff_data = diff_decompressor.decompress(b'', diff_size)
            from_data = ffrom.read(diff_size)
            fto.write(bsdiff.add_bytes(diff_data, from_data))
            to_pos += diff_size

        # Extra data.
        if to_pos + extra_size > to_size:
            raise Error("Patch extra data too long.")

        if extra_size > 0:
            extra_data = extra_decompressor.decompress(b'', extra_size)
            fto.write(extra_data)
            to_pos += extra_size

        # Adjustment.
        ffrom.seek(adjustment, os.SEEK_CUR)

    if not ctrl_decompressor.eof:
        raise Error('End of control data not found.')

    if not diff_decompressor.eof:
        raise Error('End of diff data not found.')

    if not extra_decompressor.eof:
        raise Error('End of extra data not found.')

    return to_size
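# Hedged demonstration (with a made-up payload, not from the patch sources)
# of the trick apply_patch_bsdiff() relies on: pre-feeding a decompressor its
# whole input with max_length=0 turns later decompress(b'', n) calls into an
# exact-size reader over the decompressed stream.
import bz2

payload = bz2.compress(bytes(range(32)))
reader = bz2.BZ2Decompressor()
assert reader.decompress(payload, 0) == b''          # buffer input, emit nothing yet
assert reader.decompress(b'', 8) == bytes(range(8))  # exactly 8 decompressed bytes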
def test_failure(self):
    bzd = BZ2Decompressor()
    self.assertRaises(Exception, bzd.decompress, self.BAD_DATA * 30)
    # Previously, a second call could crash due to internal inconsistency
    self.assertRaises(Exception, bzd.decompress, self.BAD_DATA * 30)