def _extract_ole(self, data: bytearray) -> str:
    """
    Extract the plain text from a legacy OLE-based Word document.

    The WordDocument stream is read and byte 11 of its header is inspected:
    bit 1 selects whether the piece table lives in the "0Table" or "1Table"
    stream (fWhichTblStm flag per the MS-DOC FIB layout — TODO confirm against
    spec). The offset and length of the piece table are read from fixed
    position 0x1A2, the table bytes are loaded from the selected table stream,
    and the decoded text is assembled by _get_text.
    """
    stream = MemoryFile(data)
    with self._olefile.OleFileIO(stream) as ole:
        doc = ole.openstream('WordDocument').read()
        with StructReader(doc) as reader:
            # bit 1 of byte 11 picks the table stream name: "0Table" / "1Table"
            table_name = F'{(doc[11]>>1)&1}Table'
            # fixed offset holding the piece table location and size
            reader.seek(0x1A2)
            offset = reader.u32()
            length = reader.u32()
        with StructReader(ole.openstream(table_name).read()) as reader:
            reader.seek(offset)
            table = reader.read(length)
        piece_table = self._load_piece_table(table)
        return self._get_text(doc, piece_table)
def process(self, data):
    """
    Decompress an LZSS-style stream. Each control byte supplies eight flag
    bits, consumed LSB first: a clear bit copies one literal byte to the
    output, a set bit encodes a big-endian 6-bit length / 10-bit distance
    back-reference into the output produced so far.
    """
    reader = StructReader(data)
    output = bytearray()
    while not reader.eof:
        flags = reader.read_byte()
        for k in range(8):
            if reader.eof:
                break
            if not flags >> k & 1:
                # literal: copy one byte through unchanged
                output.append(reader.read_byte())
                continue
            if not output:
                raise ValueError('copy requested against empty buffer')
            with reader.be:
                length = reader.read_integer(6) + _MATCH_MIN
                distance = reader.read_integer(10)
            if not distance or distance > len(output):
                raise RuntimeError(F'invalid match offset at position {reader.tell()}')
            cursor = len(output) - distance
            # copy in rounds so that overlapping matches repeat correctly
            while length > 0:
                chunk = output[cursor:cursor + length]
                output.extend(chunk)
                cursor += len(chunk)
                length -= len(chunk)
    return output
def process(self, data: bytearray):
    """
    Decompress a buffer with an optional B'JC' header. A valid header carries
    the decompressed size and a checksum; when the magic is absent, the input
    is assumed to be a headerless stream and decompressed from offset 0.

    Fixes: the truncation log message had a typo ('tuncating'), and the
    undersize warning reported len(data) instead of the number of bytes that
    were actually decompressed, len(output).
    """
    with MemoryFile() as output, StructReader(data) as reader:
        if reader.read(2) != B'JC':
            self.log_warn(
                'data does not begin with magic sequence, assuming that header is missing'
            )
            reader.seek(0)
            size = checksum = None
        else:
            size = reader.u32()
            checksum = reader.u32()
        if self.args.ignore_header:
            size = None
        self._decompress(output, reader, size)
        if size is not None:
            if len(output) > size:
                self.log_info(F'truncating to size {size}')
                output.truncate(size)
            elif len(output) < size:
                self.log_warn(
                    F'header size was {size}, but only {len(output)} bytes were decompressed'
                )
        data = output.getvalue()
        if checksum:
            c = self._checksum(data)
            if c != checksum:
                self.log_warn(
                    F'header checksum was {checksum:08X}, computed value is {c:08X}'
                )
        return data
def __init__(self, buffer: Union[bytearray, StructReader], bits_per_read: int = 32):
    """
    Set up a bit-level reader over *buffer*. An existing StructReader is used
    as-is; a raw buffer is wrapped in a little-endian StructReader first.
    """
    if isinstance(buffer, StructReader):
        reader = buffer
    else:
        reader = StructReader(memoryview(buffer), bigendian=False)
    self._reader: StructReader[memoryview] = reader
    # holds bits already pulled from the stream but not yet handed out
    self._bit_buffer_data: int = 0
    self._bit_buffer_size: int = 0
    self._bits_per_read = bits_per_read
def unpack(self, data):
    """
    Walk the members of a CPIO archive, yielding one packed item per file.
    Iteration stops at the archive trailer entry or when the input runs out.
    """
    reader = StructReader(memoryview(data))
    while True:
        try:
            entry = CPIOEntry(reader)
        except EOF:
            break
        if entry.name == 'TRAILER!!!':
            break
        yield self._pack(entry.name, entry.mtime, entry.data)
def __init__(self, reader: StructReader):
    """
    Parse a JVM Code attribute from *reader*: the stack and locals limits,
    the bytecode (disassembled opcode by opcode against the constant pool),
    then the exception table and any nested attributes. The read order is
    fixed by the class file format and must not change.
    """
    reader.bigendian = True
    self.max_stack = reader.u16()
    self.max_locals = reader.u16()
    self.disassembly: List[JvOpCode] = []
    code_size = reader.u32()
    with StructReader(reader.read(code_size)) as bytecode:
        bytecode.bigendian = True
        while not bytecode.eof:
            opcode = JvOpCode(bytecode, pool=self.pool)
            self.disassembly.append(opcode)
    exception_count = reader.u16()
    self.exceptions = [JvException(reader) for _ in range(exception_count)]
    attribute_count = reader.u16()
    self.attributes = [JvAttribute(reader) for _ in range(attribute_count)]
def _load_piece_table(self, table: bytes) -> bytes:
    """
    Scan the table data for the piece table payload: type-1 entries are
    skipped over by their own length byte, and the payload of the first
    type-2 entry is returned. Any other entry type is unsupported.
    """
    with StructReader(table) as reader:
        while not reader.eof:
            kind = reader.read_byte()
            if kind == 2:
                size = reader.u32()
                return reader.read(size)
            elif kind == 1:
                skip = reader.read_byte()
                reader.seekrel(skip)
            else:
                raise NotImplementedError(
                    F'Unsupported table entry type value 0x{kind:X}.')
def process(self, data):
    """
    Decompress an SZDD (MS-DOS COMPRESS/EXPAND) archive: LZSS with a 4096-byte
    ring buffer pre-filled with spaces. Each control byte holds eight flags;
    a set bit copies one literal byte through the window, a clear bit encodes
    a 12-bit window position and a 4-bit match length (+3).

    Improvement: the window is now initialized idiomatically in one step
    instead of a manual per-index assignment loop.
    """
    with StructReader(data) as archive:
        if archive.read(8) != b'SZDD\x88\xF0\x27\x33':
            if not self.args.lenient:
                raise ValueError('signature missing')
            self.log_fail(
                'the header signature is invalid, this is likely not an SZDD archive'
            )
        if archive.read_byte() != 0x41:
            raise ValueError('Unsupported compression mode')
        # ignore the missing file extension letter:
        archive.seekrel(1)
        output_len = archive.u32()
        window_pos = 0x1000 - 0x10
        output_pos = 0
        output = bytearray(output_len)
        # the ring buffer starts out filled with 0x20 (space) bytes
        window = bytearray(B'\x20' * 0x1000)
        while not archive.eof:
            control = archive.read_byte()
            for cb in (0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80):
                if archive.eof:
                    break
                if control & cb:
                    # literal byte: written to output and into the window
                    output[output_pos] = window[window_pos] = archive.read_byte()
                    output_pos += 1
                    window_pos += 1
                    window_pos &= 0xFFF
                else:
                    match_pos = archive.read_byte()
                    match_len = archive.read_byte()
                    # high nibble of the length byte extends the position to 12 bits
                    match_pos |= (match_len & 0xF0) << 4
                    match_len = (match_len & 0x0F) + 3
                    match_pos &= 0xFFF
                    for _ in range(match_len):
                        window[window_pos] = window[match_pos]
                        output[output_pos] = window[window_pos]
                        output_pos += 1
                        window_pos += 1
                        match_pos += 1
                        window_pos &= 0xFFF
                        match_pos &= 0xFFF
        return output
def test_bitreader_le(self):
    # Verify little-endian (default) bit-level reads: bits are consumed LSB
    # first, so the rightmost underscore-separated groups of the literal are
    # read back first.
    data = 0b10010100111010100100001111101_11_00000000_0101010101010010010111100000101001010101100000001110010111110100_111_000_100
    size, remainder = divmod(data.bit_length(), 8)
    self.assertEqual(remainder, 0)
    data = memoryview(data.to_bytes(size, 'little'))
    sr = StructReader(data)
    self.assertEqual(sr.read_integer(3), 0b100)
    self.assertEqual(sr.read_integer(3), 0b000)
    self.assertEqual(sr.read_integer(3), 0b111)
    self.assertEqual(
        sr.u64(), 0b0101010101010010010111100000101001010101100000001110010111110100)
    self.assertFalse(any(sr.read_flags(8, reverse=True)))
    self.assertEqual(sr.read_bit(), 1)
    # an empty struct format is invalid and must raise without consuming bits
    self.assertRaises(ValueError, lambda: sr.read_struct(''))
    self.assertEqual(sr.read_bit(), 1)
    self.assertEqual(sr.read_integer(29), 0b10010100111010100100001111101)
    self.assertTrue(sr.eof)
def test_bitreader_be(self):
    # Verify big-endian bit-level reads via the `be` context: bits are
    # consumed MSB first, so the leftmost groups of the literal come first.
    # The literal's bit length is deliberately not byte-aligned (7 spare bits).
    data = 0b01010_10011101_0100100001_1111_0111101010000101010101010010010111100000101001010101100000001110010111110100111000_101
    size, remainder = divmod(data.bit_length(), 8)
    self.assertEqual(remainder, 7)
    data = memoryview(data.to_bytes(size + 1, 'big'))
    sr = StructReader(data)
    with sr.be:
        self.assertEqual(sr.read_bit(), 0)
        self.assertEqual(sr.read_bit(), 1)
        self.assertEqual(sr.read_bit(), 0)
        self.assertEqual(sr.read_bit(), 1)
        self.assertEqual(sr.read_bit(), 0)
        self.assertEqual(sr.read_byte(), 0b10011101)
        self.assertEqual(sr.read_integer(10), 0b100100001)
        self.assertTrue(all(sr.read_flags(4)))
        self.assertEqual(
            sr.read_integer(82),
            0b0111101010000101010101010010010111100000101001010101100000001110010111110100111000
        )
        # fewer than 16 bits remain, so a u16 read must fail
        self.assertRaises(EOF, sr.u16)
def process(self, data: bytearray):
    """
    Decompress one or more concatenated LZIP streams. For each member, the
    header (magic, version, coded dictionary size) is validated, the member is
    decoded, and the trailer (CRC32, data size, member size) is checked against
    the decoder state. Trailing garbage after at least one complete member is
    ignored with a log message.

    Fix: the checksum-mismatch warning referenced the nonexistent attribute
    ``decoder.crc``; it now uses ``decoder.crc32``, the attribute that is
    actually compared, so the warning no longer raises AttributeError.
    """
    view = memoryview(data)
    with MemoryFile() as output, StructReader(view) as reader:
        for k in count(1):
            if reader.eof:
                break
            trailing_size = len(data) - reader.tell()
            try:
                ID, VN, DS = reader.read_struct('4sBB')
                if ID != B'LZIP':
                    if k > 1:
                        raise EOF
                    else:
                        self.log_warn(F'ignoring invalid LZIP signature: {ID.hex()}')
                if VN != 1:
                    self.log_warn(F'ignoring invalid LZIP version: {VN}')
                # dictionary size: base power of two minus a fractional amount
                dict_size = 1 << (DS & 0x1F)
                dict_size -= (dict_size // 16) * ((DS >> 5) & 7)
                if dict_size not in range(_MIN_DICT_SIZE, _MAX_DICT_SIZE + 1):
                    raise ValueError(
                        F'The dictionary size {dict_size} is out of the valid range '
                        F'[{_MIN_DICT_SIZE}, {_MAX_DICT_SIZE}]; unable to proceed.'
                    )
                decoder = MemberDecoder(dict_size, reader, output)
                if not decoder():
                    raise ValueError(F'Data error in stream {k}.')
                crc32, data_size, member_size = reader.read_struct('<LQQ')
                if crc32 != decoder.crc32:
                    self.log_warn(F'checksum in stream {k} was {decoder.crc32:08X}, should have been {crc32:08X}.')
                # trailer member size includes the 20 trailer bytes themselves
                if member_size - 20 != decoder.member_position:
                    self.log_warn(F'member size in stream {k} was {decoder.member_position}, should have been {member_size}.')
                if data_size != decoder.data_position:
                    self.log_warn(F'data size in stream {k} was {decoder.data_position}, should have been {data_size}.')
            except EOF:
                if k <= 1:
                    raise
                self.log_info(F'silently ignoring {trailing_size} bytes of trailing data')
                break
        return output.getvalue()
def test_bitreader_structured(self):
    # Pack a mix of signed/unsigned integers and floats with struct, then
    # verify that StructReader decodes each one back in order, and that a
    # multi-byte read on a bit-misaligned reader raises Unaligned.
    items = (
        0b1100101,      # noqa
        -0x1337,        # noqa
        0xDEFACED,      # noqa
        0xC0CAC01A,     # noqa
        -0o1337,        # noqa
        2076.171875,    # noqa
        math.pi         # noqa
    )
    data = struct.pack('<bhiLqfd', *items)
    sr = StructReader(data)
    self.assertEqual(sr.read_nibble(), 0b101)
    # after consuming a nibble the cursor is mid-byte: byte reads must fail
    self.assertRaises(sr.Unaligned, lambda: sr.read_exactly(2))
    sr.seek(0)
    self.assertEqual(sr.read_byte(), 0b1100101)
    self.assertEqual(sr.i16(), -0x1337)
    self.assertEqual(sr.i32(), 0xDEFACED)
    self.assertEqual(sr.u32(), 0xC0CAC01A)
    self.assertEqual(sr.i64(), -0o1337)
    self.assertAlmostEqual(sr.read_struct('f', True), 2076.171875)
    self.assertAlmostEqual(sr.read_struct('d', True), math.pi)
    self.assertTrue(sr.eof)
def _get_text(self, doc: bytes, piece_table: bytes) -> str:
    """
    Decode the document text from the WordDocument stream bytes using the
    piece table: each piece maps a character-position range to a file offset
    and an encoding (cp1252 when the ANSI bit is set, UTF-16 otherwise).
    Carriage returns are normalized to newlines.
    """
    # the table holds piece_count character positions followed by
    # (piece_count - 1) descriptors of 8 bytes each, hence this formula
    piece_count: int = 1 + (len(piece_table) - 4) // 12
    with StringIO() as text:
        with StructReader(piece_table) as reader:
            character_positions = [
                reader.u32() for _ in range(piece_count)
            ]
            for i in range(piece_count - 1):
                cp_start = character_positions[i]
                cp_end = character_positions[i + 1]
                # descriptor layout: 2 bytes skipped, 4-byte fc field, 2 bytes skipped
                fc_value = reader.read_struct('xxLxx', unwrap=True)
                # bit 30 marks a compressed (ANSI) piece
                is_ansi = bool((fc_value >> 30) & 1)
                # mask out the compression flag bit to obtain the file offset
                fc = fc_value & 0xBFFFFFFF
                cb = cp_end - cp_start
                if is_ansi:
                    encoding = 'cp1252'
                    # ANSI pieces store the offset doubled
                    fc = fc // 2
                else:
                    encoding = 'utf16'
                    # UTF-16 characters occupy two bytes each
                    cb *= 2
                raw = doc[fc:fc + cb]
                text.write(raw.decode(encoding).replace('\r', '\n'))
        return text.getvalue()
def decompress_stream(self, data: ByteString, LZOv1: bool = False) -> bytearray:
    """
    An implementation of LZO decompression. We use the article
    "[LZO stream format as understood by Linux's LZO decompressor](https://www.kernel.org/doc/html/latest/staging/lzo.html)"
    as a reference since no proper specification is available.
    """
    def integer() -> int:
        # Read a run-length integer: each zero byte adds 0xFF, and the first
        # nonzero byte terminates the run and is added to the total.
        length = 0
        while True:
            byte = src.read_byte()
            if byte:
                return length + byte
            length += 0xFF
            if length > 0x100000:
                raise LZOError('Too many zeros in integer encoding.')

    def literal(count):
        # copy *count* bytes verbatim from input to output
        dst.write(src.read_bytes(count))

    def copy(distance: int, length: int):
        # Copy *length* bytes from *distance* bytes back in the output;
        # when distance <= length the copied block repeats (overlapping match).
        if distance > len(dst):
            raise LZOError(F'Distance {distance} > bufsize {len(dst)}')
        buffer = dst.getbuffer()
        if distance > length:
            start = len(buffer) - distance
            end = start + length
            dst.write(buffer[start:end])
        else:
            block = buffer[-distance:]
            while len(block) < length:
                block += block[:length - len(block)]
            if len(block) > length:
                block[length:] = ()
            dst.write(block)

    src = StructReader(memoryview(data))
    dst = MemoryFile()
    state = 0
    first = src.read_byte()
    # the first byte selects how the stream starts (see kernel documentation)
    if first == 0x10:
        raise LZOError('Invalid first stream byte 0x10.')
    elif first <= 0x12:
        src.seekrel(-1)
    elif first <= 0x15:
        state = first - 0x11
        literal(state)
    else:
        state = 4
        literal(first - 0x11)
    while True:
        instruction = src.read_byte()
        if instruction < 0x10:
            if state == 0:
                # NB: `or` binds after `+`, so this is `instruction` when it
                # is nonzero and `integer() + 15` otherwise
                length = instruction or integer() + 15
                state = length + 3
                if state < 4:
                    raise LZOError('Literal encoding is too short.')
            else:
                state = instruction & 0b0011
                D = (instruction & 0b1100) >> 2
                H = src.read_byte()
                distance = (H << 2) + D + 1
                if state >= 4:
                    distance += 0x800
                    length = 3
                else:
                    length = 2
                copy(distance, length)
        elif instruction < 0x20:
            L = instruction & 0b0111
            H = instruction & 0b1000
            length = L or integer() + 7
            argument = src.u16()
            state = argument & 3
            distance = (H << 11) + (argument >> 2)
            if not distance:
                # a zero distance in this instruction class ends the stream
                return dst.getbuffer()
            if LZOv1 and distance & 0x803F == 0x803F and length in range(261, 265):
                raise LZOError('Compressed data contains sequence that is banned in LZOv1.')
            if LZOv1 and distance == 0xBFFF:
                # LZOv1 special case: an explicit run of zero bytes
                X = src.read_byte()
                count = ((X << 3) | L) + 4
                self.log_debug(F'Writing run of {X} zero bytes according to LZOv1.')
                dst.write(B'\0' * count)
            else:
                copy(distance + 0x4000, length + 2)
        elif instruction < 0x40:
            L = instruction & 0b11111
            length = L or integer() + 31
            argument = src.u16()
            state = argument & 3
            distance = (argument >> 2) + 1
            copy(distance, length + 2)
        else:
            if instruction < 0x80:
                length = 3 + ((instruction >> 5) & 1)
            else:
                length = 5 + ((instruction >> 5) & 3)
            H = src.read_byte()
            D = (instruction & 0b11100) >> 2
            state = instruction & 3
            distance = (H << 3) + D + 1
            copy(distance, length)
        # trailing literals carried in the low bits of the last instruction
        if state:
            literal(state)
def _begin(self, data):
    """
    Reset the codec state for a new input: a structured reader over *data*
    and a fresh in-memory output buffer. Returns self to allow chaining.
    """
    self._dst = MemoryFile(bytearray())
    self._src = StructReader(memoryview(data))
    return self
def process(self, data: bytearray):
    """
    Parse structured records from the input according to the format spec in
    the unit arguments. Each field parsed from the spec is stored as metadata;
    one labelled chunk is emitted per output template for every record, until
    the input is exhausted, the record count limit is hit, or the optional
    `until` expression becomes falsy.
    """
    formatter = string.Formatter()
    until = self.args.until
    until = until and PythonExpression(until, all_variables_allowed=True)
    reader = StructReader(memoryview(data))
    mainspec = self.args.spec
    # an optional leading byte-order character applies to every field spec
    byteorder = mainspec[:1]
    if byteorder in '<!=@>':
        mainspec = mainspec[1:]
    else:
        byteorder = '='

    def fixorder(spec):
        # prepend the global byte order unless the spec already carries one
        if spec[0] not in '<!=@>':
            spec = byteorder + spec
        return spec

    # in multi mode, keep parsing records until input or count runs out
    it = itertools.count() if self.args.multi else (0, )
    for index in it:
        if reader.eof:
            break
        if index >= self.args.count:
            break
        meta = metavars(data, ghost=True)
        meta['index'] = index
        args = []
        last = None
        checkpoint = reader.tell()
        try:
            for prefix, name, spec, conversion in formatter.parse(mainspec):
                if prefix:
                    # literal text between fields is itself a struct format
                    args.extend(reader.read_struct(fixorder(prefix)))
                if name is None:
                    continue
                if conversion:
                    # the conversion slot is repurposed for byte alignment
                    reader.byte_align(
                        PythonExpression.evaluate(conversion, meta))
                if spec:
                    spec = meta.format_str(spec, self.codec, args)
                if spec != '':
                    # a spec that evaluates to an integer means a raw byte count
                    try:
                        spec = PythonExpression.evaluate(spec, meta)
                    except ParserError:
                        pass
                if spec == '':
                    last = value = reader.read()
                elif isinstance(spec, int):
                    last = value = reader.read_bytes(spec)
                else:
                    value = reader.read_struct(fixorder(spec))
                    if not value:
                        self.log_warn(F'field {name} was empty, ignoring.')
                        continue
                    if len(value) > 1:
                        self.log_info(
                            F'parsing field {name} produced {len(value)} items reading a tuple'
                        )
                    else:
                        value = value[0]
                args.append(value)
                if name == _SHARP:
                    raise ValueError(
                        'Extracting a field with name # is forbidden.')
                elif name.isdecimal():
                    # a decimal name re-assigns a previous positional argument
                    index = int(name)
                    limit = len(args) - 1
                    if index > limit:
                        self.log_warn(
                            F'cannot assign index field {name}, the highest index is {limit}'
                        )
                    else:
                        args[index] = value
                    continue
                elif name:
                    meta[name] = value
            # NOTE(review): reconstructed nesting places the `until` check after
            # the field loop, aborting the outer record loop — confirm intent
            if until and not until(meta):
                self.log_info(
                    F'the expression ({until}) evaluated to zero; aborting.'
                )
                break
            # re-read the full span of the record for the {#} placeholder
            with StreamDetour(reader, checkpoint) as detour:
                full = reader.read(detour.cursor - checkpoint)
            if last is None:
                last = full
            outputs = []
            for template in self.args.outputs:
                used = set()
                outputs.append(
                    meta.format(template, self.codec, [full, *args],
                                {_SHARP: last}, True, used=used))
                # variables consumed by a template are removed from metadata
                for key in used:
                    meta.pop(key, None)
            for output in outputs:
                chunk = self.labelled(output, **meta)
                chunk.set_next_batch(index)
                yield chunk
        except EOF:
            leftover = repr(SizeInt(len(reader) - checkpoint)).strip()
            self.log_info(F'discarding {leftover} left in buffer')
            break
def process(self, data):
    """
    Decompress data in a chunked container format. A 24-byte header (magic,
    header size, CRC check byte, algorithm code, final size, chunk size) is
    parsed and validated; then chunks are read and decompressed with the
    selected handler until the advertised final size is reached. When the
    magic is absent and an algorithm was given, the whole input is treated
    as one raw stream instead.
    """
    mode: MODE = self.args.mode
    with StructReader(memoryview(data)) as reader, MemoryFile() as writer:
        reader: StructReader[memoryview]
        # the CRC check covers the header bytes around the check byte itself
        check = zlib.crc32(reader.peek(6))
        magic = reader.read(4)
        if magic != self._SIGNATURE:
            if mode is None:
                self.log_warn(
                    F'data starts with {magic.hex().upper()} rather than the expected sequence '
                    F'{self._SIGNATURE.hex().upper()}; this could be a raw stream.'
                )
            else:
                # no header: decompress the entire input as a raw stream
                reader.seek(0)
                handler = self._get_handler(mode)
                handler(reader, writer, None)
                return writer.getbuffer()
        header_size = reader.u16()
        if header_size != 24:
            self.log_warn(
                F'the header size {header_size} was not equal to 24')
        crc32byte = reader.u8()
        # only the low byte of the running CRC32 is stored in the header
        check = zlib.crc32(reader.peek(0x11), check) & 0xFF
        if check != crc32byte:
            self.log_warn(
                F'the CRC32 check byte was {crc32byte}, computed value was {check}'
            )
        _mode_code = reader.u8()
        try:
            _mode = MODE(_mode_code)
        except ValueError:
            msg = F'header contains unknown compression type code {_mode_code}'
            if mode is None:
                raise ValueError(msg)
            else:
                self.log_warn(msg)
        else:
            # a user-supplied algorithm that disagrees with the header is
            # noteworthy; otherwise just log the header value
            if mode is not None and mode != _mode:
                logger = self.log_warn
            else:
                logger = self.log_info
            mode = _mode
            logger(F'header specifies algorithm {_mode.name}')
        self.log_info(F'using algorithm {mode.name}')
        decompress = self._get_handler(mode)
        final_size = reader.u32()
        _unknown_1 = reader.u32()
        chunk_size = reader.u32()
        _unknown_2 = reader.u32()
        if _unknown_1 != 0:
            self.log_warn(
                F'unknown value 1 was unexpectedly nonzero: 0x{_unknown_1:08X}'
            )
        if _unknown_2 != 0:
            self.log_warn(
                F'unknown value 2 was unexpectedly nonzero: 0x{_unknown_2:08X}'
            )
        self.log_debug(F'final size: 0x{final_size:08X}')
        self.log_debug(F'chunk size: 0x{chunk_size:08X}')
        if chunk_size > COMPRESS_MAX_CHUNK:
            raise ValueError(
                'the header chunk size is greater than the maximum value')
        while len(writer) < final_size:
            # each chunk is prefixed with its compressed size
            src_size = reader.u32()
            src_data = reader.read(src_size)
            if len(src_data) != src_size:
                raise IndexError(
                    F'Attempted to read {src_size} bytes, but got only {len(src_data)}.'
                )
            if src_size + len(writer) == final_size:
                # a final chunk that exactly fills the output is stored raw
                self.log_debug(
                    F'final chunk is uncompressed, appending {src_size} raw bytes to output'
                )
                writer.write(src_data)
                break
            self.log_debug(F'reading chunk of size {src_size}')
            start = writer.tell()
            chunk = StructReader(src_data)
            target = min(chunk_size, final_size - len(writer))
            decompress(chunk, writer, target)
            writer.flush()
            written = writer.tell() - start
            if written != target:
                raise RuntimeError(
                    F'decompressed output had unexpected size {written} instead of {chunk_size}'
                )
        if not reader.eof:
            self.log_info(
                F'compression complete with {reader.remaining_bytes} bytes remaining in input'
            )
        return writer.getbuffer()