def read_secondary_header(self):
    """Parse the secondary header and record the fields of its blocks.

    The secondary header is read from just past the primary header and the
    piece table (``hdr_len + num_pieces * PIECE_SIZE``) and is
    ``sec_hdr_len`` bytes long.  The 32-bit value at byte 4 is the offset
    of the first block; blocks are then walked in order:

    * ``CAOL`` (version 2) sets ``creator_id``, ``entry_chunklen``,
      ``count_chunklen``, ``entry_unknown`` and ``count_unknown``.
    * ``ITSF`` (version 4) sets ``content_offset``, ``timestamp`` and
      ``language_id``.

    Raises:
        LitError: on an unsupported block version, a content offset whose
            upper 32 bits are non-zero, or when no ``ITSF`` block supplied
            a content offset.
    """
    start = self.hdr_len + (self.num_pieces * self.PIECE_SIZE)
    data = self.read_raw(start, self.sec_hdr_len)
    offset = int32(data[4:])
    while offset < len(data):
        blocktype = data[offset:offset + 4]
        blockver = u32(data[offset + 4:])
        # Bytes literals: the header is binary data.  On Python 2 they are
        # plain ``str`` and compare exactly as before.
        if blocktype == b'CAOL':
            if blockver != 2:
                raise LitError(
                    'Unknown CAOL block format %d' % blockver)
            self.creator_id = u32(data[offset + 12:])
            self.entry_chunklen = u32(data[offset + 20:])
            self.count_chunklen = u32(data[offset + 24:])
            self.entry_unknown = u32(data[offset + 28:])
            self.count_unknown = u32(data[offset + 32:])
            offset += 48
        elif blocktype == b'ITSF':
            if blockver != 4:
                raise LitError(
                    'Unknown ITSF block format %d' % blockver)
            # The content offset is stored as 64 bits; the upper half
            # (at +20) must be zero for it to fit in 32 bits.
            if u32(data[offset + 4 + 16:]):
                raise LitError('This file has a 64bit content offset')
            self.content_offset = u32(data[offset + 16:])
            self.timestamp = u32(data[offset + 24:])
            self.language_id = u32(data[offset + 28:])
            offset += 48
        else:
            # The size of an unrecognised block is unknown, so the scan
            # cannot step over it.  Previously ``offset`` was never
            # advanced here and the loop spun forever; stop scanning and
            # let the check below decide whether enough was found.
            break
    if not hasattr(self, 'content_offset'):
        raise LitError('Could not figure out the content offset')
def read_manifest(self):
    """Parse the ``/manifest`` entry into ``self.manifest`` and ``self.paths``.

    The manifest is a sequence of groups.  Each group starts with a
    length-prefixed UTF-8 root string, followed by four file lists (one
    per state: ``'spine'``, ``'not spine'``, ``'css'``, ``'images'``).
    Every file record yields a ``ManifestItem`` stored in
    ``self.manifest`` under its internal name.

    Afterwards the longest directory prefix shared by all item paths is
    stripped, any path still starting with ``/`` is reduced to its base
    name, and ``self.paths`` maps each resulting path to its item (it also
    maps ``self.opf_path`` to ``None``).

    Raises:
        LitError: if the manifest entry is missing or truncated.
    """
    if '/manifest' not in self.entries:
        raise LitError('Lit file does not have a valid manifest')
    raw = self.get_file('/manifest')
    self.manifest = {}
    self.paths = {self.opf_path: None}
    while raw:
        # Slice rather than index: indexing bytes gives an int on
        # Python 3, which ord() rejects; a 1-byte slice works on both.
        slen, raw = ord(raw[:1]), raw[1:]
        if slen == 0:
            break
        root, raw = raw[:slen].decode('utf8'), raw[slen:]
        if not raw:
            raise LitError('Truncated manifest')
        for state in ['spine', 'not spine', 'css', 'images']:
            num_files, raw = int32(raw), raw[4:]
            if num_files == 0:
                continue
            for _ in range(num_files):
                if len(raw) < 5:
                    raise LitError('Truncated manifest')
                offset, raw = u32(raw), raw[4:]
                internal, raw = consume_sized_utf8_string(raw)
                original, raw = consume_sized_utf8_string(raw)
                # The path should be stored unquoted, but not always
                original = urlunquote(original)
                # Is this last one UTF-8 or ASCIIZ?
                mime_type, raw = consume_sized_utf8_string(raw, zpad=True)
                self.manifest[internal] = ManifestItem(
                    original, internal, mime_type, offset, root, state)
    # Materialise the values: the code below indexes and slices them,
    # which a Python 3 dict view does not support.
    mlist = list(self.manifest.values())
    # Remove any common path elements
    if len(mlist) > 1:
        shared = mlist[0].path
        for item in mlist[1:]:
            path = item.path
            while shared and not path.startswith(shared):
                try:
                    # Drop the last directory component, keeping the
                    # trailing slash of the parent.
                    shared = shared[:shared.rindex("/", 0, -2) + 1]
                except ValueError:
                    shared = None
            if not shared:
                break
        if shared:
            slen = len(shared)
            for item in mlist:
                item.path = item.path[slen:]
    # Fix any straggling absolute paths
    for item in mlist:
        # startswith() instead of path[0] so an empty path cannot raise
        # IndexError.
        if item.path.startswith('/'):
            item.path = os.path.basename(item.path)
        self.paths[item.path] = item
def read_section_names(self):
    """Load the section name table from ``::DataSpace/NameList``.

    The 16-bit value at byte 2 is the number of sections; records start
    at byte 4.  Each record is a 16-bit length followed by
    ``length * 2 + 2`` bytes of UTF-16-LE text (presumably the name plus a
    NUL terminator, which is stripped).  The names are stored in
    ``self.section_names`` and ``self.section_data`` is initialised to a
    list of ``None`` of the same length.

    Raises:
        LitError: if the NameList entry is missing or too short for the
            records it declares.
    """
    if '::DataSpace/NameList' not in self.entries:
        raise LitError('Lit file does not have a valid NameList')
    raw = self.get_file('::DataSpace/NameList')
    if len(raw) < 4:
        raise LitError('Invalid Namelist section')
    pos = 4
    num_sections = u16(raw[2:pos])
    self.section_names = [""] * num_sections
    self.section_data = [None] * num_sections
    for section in range(num_sections):
        # Validate the length prefix itself before decoding it, so a
        # truncated table is reported as LitError like the payload check
        # below rather than as a low-level unpacking error.
        if pos + 2 > len(raw):
            raise LitError('Invalid Namelist section')
        size = u16(raw[pos:pos + 2])
        pos += 2
        size = size * 2 + 2
        if pos + size > len(raw):
            raise LitError('Invalid Namelist section')
        name = raw[pos:pos + size].decode('utf-16-le')
        self.section_names[section] = name.rstrip('\000')
        pos += size
def __init__(self, filename_or_stream, log):
    """Open a LIT file and parse its headers, directory and manifest.

    Args:
        filename_or_stream: either an object with a ``read`` attribute,
            used as the stream as-is, or a path that is opened in binary
            mode.
        log: object whose ``warn`` callable is kept as ``self._warn``.

    ``self.opf_path`` is derived from the stream's ``name`` (base name
    with the extension replaced by ``.opf``), falling back to
    ``'content.opf'`` when no usable name is available.

    Raises:
        LitError: if the magic or version is wrong, or any of the parsing
            steps rejects the file.
    """
    self._warn = log.warn
    opened_here = not hasattr(filename_or_stream, 'read')
    if opened_here:
        self.stream = open(filename_or_stream, 'rb')
    else:
        self.stream = filename_or_stream
    try:
        try:
            self.opf_path = os.path.splitext(
                os.path.basename(self.stream.name))[0] + '.opf'
        except (AttributeError, TypeError):
            # No ``name`` attribute, or a non-path name such as the
            # integer file descriptor of an fd-backed file object.
            self.opf_path = 'content.opf'
        if self.magic != 'ITOLITLS':
            raise LitError('Not a valid LIT file')
        if self.version != 1:
            raise LitError('Unknown LIT version %d' % (self.version, ))
        self.read_secondary_header()
        self.read_header_pieces()
        self.read_section_names()
        self.read_manifest()
        self.read_drm()
    except Exception:
        # Construction failed, so the caller never receives an object it
        # could close.  Release the handle if this constructor opened it;
        # a caller-supplied stream stays the caller's responsibility.
        if opened_here:
            self.stream.close()
        raise
def read_directory(self, piece):
    """Parse the main directory piece into ``self.entries``.

    ``piece`` must start with the ``IFCM`` tag.  The chunk size is read
    from bytes 8-12 and the chunk count from bytes 24-28; chunks follow a
    32-byte header.  Only chunks tagged ``AOLL`` are parsed; each holds a
    run of entries made of a variable-length-encoded name length, the
    UTF-8 name, and encoded section, offset and size values.  Every entry
    becomes a ``DirectoryEntry`` stored under its name.

    Raises:
        LitError: if the tag or overall length is wrong, or an entry is
            malformed.
    """
    # Bytes literals: ``piece`` is binary data.  On Python 3 a str
    # argument to bytes.startswith() raises TypeError and a str never
    # equals bytes; on Python 2 these literals are plain ``str``.
    if not piece.startswith(b'IFCM'):
        raise LitError('Header piece #1 is not main directory.')
    chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28])
    if (32 + (num_chunks * chunk_size)) != len(piece):
        raise LitError('IFCM header has incorrect length')
    self.entries = {}
    for i in range(num_chunks):
        start = 32 + (i * chunk_size)
        chunk = piece[start:start + chunk_size]
        tag, chunk = chunk[:4], chunk[4:]
        if tag != b'AOLL':
            continue
        remaining, chunk = int32(chunk[:4]), chunk[4:]
        if remaining >= chunk_size:
            raise LitError('AOLL remaining count is negative')
        # Convert the stored count into the number of entry bytes left.
        remaining = chunk_size - (remaining + 48)
        entries = u16(chunk[-2:])
        if entries == 0:
            # Hopefully will work even without a correct entries count
            entries = (2**16) - 1
        chunk = chunk[40:]
        for _ in range(entries):
            if remaining <= 0:
                break
            namelen, chunk, remaining = encint(chunk, remaining)
            if namelen != (namelen & 0x7fffffff):
                raise LitError('Directory entry had 64bit name length.')
            # At least three more encoded values must follow the name.
            if namelen > remaining - 3:
                raise LitError('Read past end of directory chunk')
            try:
                name = chunk[:namelen].decode('utf-8')
            except UnicodeDecodeError:
                # An undecodable name ends parsing of this chunk.
                break
            chunk = chunk[namelen:]
            remaining -= namelen
            section, chunk, remaining = encint(chunk, remaining)
            offset, chunk, remaining = encint(chunk, remaining)
            size, chunk, remaining = encint(chunk, remaining)
            self.entries[name] = DirectoryEntry(name, section, offset, size)
def read_utf8_char(bytes, pos):
    """Decode one UTF-8 encoded character starting at ``bytes[pos]``.

    The sequence length is taken from the number of leading 1 bits of the
    first byte; the continuation bytes must each match ``10xxxxxx``.

    Args:
        bytes: the binary buffer to read from.
        pos: index of the first byte of the character.

    Returns:
        A ``(character, next_pos)`` tuple, where ``next_pos`` is the index
        just past the decoded sequence.

    Raises:
        LitError: if the lead byte is a continuation byte or has no zero
            bit in its upper seven bits, the sequence runs past the end
            of the buffer, or a continuation byte is malformed.
    """
    # Indexing yields a 1-char str on Python 2 but an int on Python 3;
    # normalise to an int either way.
    lead = bytes[pos]
    c = lead if isinstance(lead, int) else ord(lead)
    mask = 0x80
    if c & mask:
        elsize = 0
        while c & mask:
            mask >>= 1
            elsize += 1
        # mask == 0x40: a bare continuation byte (10xxxxxx).
        # mask <= 1: 0xFE/0xFF, which never start a valid sequence.
        if (mask <= 1) or (mask == 0x40):
            raise LitError(
                'Invalid UTF8 character: %s' % repr(bytes[pos:pos + 1]))
    else:
        elsize = 1
    if elsize > 1:
        if elsize + pos > len(bytes):
            raise LitError(
                'Invalid UTF8 character: %s' % repr(bytes[pos:pos + 1]))
        # Keep only the payload bits of the lead byte.
        c &= (mask - 1)
        for i in range(1, elsize):
            cont = bytes[pos + i]
            b = cont if isinstance(cont, int) else ord(cont)
            if (b & 0xC0) != 0x80:
                raise LitError(
                    'Invalid UTF8 character: %s' % repr(bytes[pos:pos + i]))
            c = (c << 6) | (b & 0x3F)
    return codepoint_to_chr(c), pos + elsize
def decompress(self, content, control, reset_table):
    """LZX-decompress ``content`` using its control data and reset table.

    Args:
        content: the compressed section data.
        control: the ControlData blob; must be at least 32 bytes, carry
            the ``LZXC`` tag at ``CONTROL_TAG`` and encode the window
            size at ``CONTROL_WINDOW_SIZE``.
        reset_table: the reset table; supplies the header length, the
            uncompressed length and the reset interval, followed by
            8-byte entries giving compressed offsets.

    Returns:
        The decompressed data as a single byte string.  Chunks that fail
        with ``lzx.LZXError`` are skipped with a warning.

    Raises:
        LitError: if the control data or reset table is invalid, or the
            declared uncompressed length could not be consumed.
    """
    if len(control) < 32 or control[CONTROL_TAG:CONTROL_TAG + 4] != b"LZXC":
        raise LitError("Invalid ControlData tag value")
    if len(reset_table) < (RESET_INTERVAL + 8):
        raise LitError("Reset table is too short")
    if u32(reset_table[RESET_UCLENGTH + 4:]) != 0:
        raise LitError("Reset table has 64bit value for UCLENGTH")

    result = []

    # Window size is 14 plus the bit length of the encoded value.
    window_size = 14
    u = u32(control[CONTROL_WINDOW_SIZE:])
    while u > 0:
        u >>= 1
        window_size += 1
    if window_size < 15 or window_size > 21:
        raise LitError("Invalid window in ControlData")
    lzx.init(window_size)

    ofs_entry = int32(reset_table[RESET_HDRLEN:]) + 8
    uclength = int32(reset_table[RESET_UCLENGTH:])
    # Loop-invariant: read the reset interval once.
    interval = int32(reset_table[RESET_INTERVAL:])
    accum = interval
    bytes_remaining = uclength
    window_bytes = (1 << window_size)
    base = 0

    while ofs_entry < len(reset_table):
        # Only entries that land on a full-window boundary delimit a
        # chunk to decompress.
        if accum >= window_bytes:
            accum = 0
            size = int32(reset_table[ofs_entry:])
            u = int32(reset_table[ofs_entry + 4:])
            if u != 0:
                raise LitError("Reset table entry greater than 32 bits")
            if size >= len(content):
                self._warn("LZX reset table entry out of bounds")
            if bytes_remaining >= window_bytes:
                lzx.reset()
                try:
                    result.append(
                        lzx.decompress(content[base:size], window_bytes))
                except lzx.LZXError:
                    # Use the same logger attribute as above; ``_warn``
                    # is the one set in ``__init__``.
                    self._warn("LZX decompression error; skipping chunk")
                bytes_remaining -= window_bytes
                base = size
        accum += interval
        ofs_entry += 8

    # Trailing partial window, if any.
    if 0 < bytes_remaining < window_bytes:
        lzx.reset()
        try:
            result.append(lzx.decompress(content[base:], bytes_remaining))
        except lzx.LZXError:
            self._warn("LZX decompression error; skipping chunk")
        bytes_remaining = 0
    if bytes_remaining > 0:
        raise LitError("Failed to completely decompress section")
    return b''.join(result)
def get_section_uncached(self, section):
    """Return the fully transformed content of section number ``section``.

    The section's ``Transform/List``, ``Content`` and ``ControlData``
    entries are fetched from ``::DataSpace/Storage/<name>``.  The
    transform list is consumed 16 bytes at a time; each GUID selects
    either decryption or LZX decompression, and the matching control
    record (``(int32(control) + 1) * 4`` bytes) is consumed with it.

    Raises:
        LitError: if the control data is too short for a transform or a
            transform GUID is not recognised.
    """
    section_name = self.section_names[section]
    storage = '::DataSpace/Storage/' + section_name
    transform = self.get_file(storage + '/Transform/List')
    content = self.get_file(storage + '/Content')
    control = self.get_file(storage + '/ControlData')

    while len(transform) >= 16:
        csize = (int32(control) + 1) * 4
        if csize <= 0 or csize > len(control):
            raise LitError("ControlData is too short")

        guid = msguid(transform)
        if guid == DESENCRYPT_GUID:
            content = self.decrypt(content)
        elif guid == LZXCOMPRESS_GUID:
            table_path = '/'.join((
                '::DataSpace/Storage', section_name, 'Transform',
                LZXCOMPRESS_GUID, 'InstanceData/ResetTable'))
            reset_table = self.get_file(table_path)
            content = self.decompress(content, control, reset_table)
        else:
            raise LitError("Unrecognized transform: %s." % repr(guid))

        # Both transforms consume one control record and one GUID.
        control = control[csize:]
        transform = transform[16:]

    return content
def read_header_pieces(self):
    """Walk the piece table that follows the primary header.

    Each ``PIECE_SIZE``-byte record holds a 64-bit offset and a 64-bit
    size whose upper halves must be zero.  Every piece is read with
    ``read_raw``; piece 1 is checked against the secondary header and
    handed to ``read_directory``, piece 2 is only checked, and pieces 3
    and 4 are stored as ``piece3_guid`` / ``piece4_guid``.

    Raises:
        LitError: if a record uses 64-bit values or piece 1 / piece 2
            disagree with the secondary header.
    """
    table = self.header[self.hdr_len:]
    for index in range(self.num_pieces):
        begin = index * self.PIECE_SIZE
        record = table[begin:begin + self.PIECE_SIZE]
        if u32(record[4:]) != 0 or u32(record[12:]) != 0:
            raise LitError('Piece %s has 64bit value' % repr(record))
        offset = u32(record)
        size = int32(record[8:])
        # Every piece is read, including those whose data is unused.
        piece = self.read_raw(offset, size)

        if index == 1:
            mismatch = (u32(piece[8:]) != self.entry_chunklen or
                        u32(piece[12:]) != self.entry_unknown)
            if mismatch:
                raise LitError('Secondary header does not match piece')
            self.read_directory(piece)
        elif index == 2:
            mismatch = (u32(piece[8:]) != self.count_chunklen or
                        u32(piece[12:]) != self.count_unknown)
            if mismatch:
                raise LitError('Secondary header does not match piece')
        elif index == 3:
            self.piece3_guid = piece
        elif index == 4:
            self.piece4_guid = piece
def read_drm(self):
    """Determine the DRM level and, where possible, recover the book key.

    ``self.drmlevel`` is set from the directory entries present:
    5 for ``/DRMStorage/Licenses/EUL``, 3 for
    ``/DRMStorage/DRMBookplate``, 1 for ``/DRMStorage/DRMSealed`` and 0
    when none exist.  For levels below 5 the sealed key is decrypted with
    the calculated DES key and bytes 1-9 of the result are stored as
    ``self.bookkey``.

    Raises:
        LitError: if the decrypted key does not start with a NUL byte.
        DRMError: if the book is at DRM level 5.
    """
    self.drmlevel = 0
    if '/DRMStorage/Licenses/EUL' in self.entries:
        self.drmlevel = 5
    elif '/DRMStorage/DRMBookplate' in self.entries:
        self.drmlevel = 3
    elif '/DRMStorage/DRMSealed' in self.entries:
        self.drmlevel = 1
    else:
        return
    if self.drmlevel < 5:
        msdes.deskey(self.calculate_deskey(), msdes.DE1)
        bookkey = msdes.des(self.get_file('/DRMStorage/DRMSealed'))
        # Compare a 1-byte slice with a bytes literal: on Python 3
        # ``bookkey[0]`` is an int and would never equal a str, so every
        # key would be rejected.  Identical to the old test on Python 2.
        if bookkey[0:1] != b'\x00':
            raise LitError('Unable to decrypt title key!')
        self.bookkey = bookkey[1:9]
    else:
        raise DRMError("Cannot access DRM-protected book")
def binary_to_text_inner(self, bin, buf, stack):
    """Run the binary-markup state machine for one stack frame.

    Pops one frame from ``stack``, resumes decoding ``bin`` at
    ``self.cpos`` and writes the textual markup to ``buf``.  A frame is
    the tuple ``(depth, tag_name, current_map, dynamic_tag, errors,
    in_censorship, is_goingdown, state, flags)``.

    When an element with children is opened, two frames are pushed (one
    to close the tag later, one for the children) and the method
    returns; it also returns when a closing-tag marker is read or the
    input is exhausted.  The caller is expected to keep calling while
    frames remain.

    Args:
        bin: the binary markup being decoded.
        buf: output object with a ``write`` method.
        stack: list of pending frames; modified in place.

    Raises:
        LitError: on malformed input (unknown atom tag or attribute,
            invalid length, unbalanced closing tag).
    """
    (depth, tag_name, current_map, dynamic_tag, errors,
     in_censorship, is_goingdown, state, flags) = stack.pop()

    if state == 'close tag':
        if not tag_name:
            raise LitError('Tag ends before it begins.')
        buf.write(encode(u''.join(('</', tag_name, '>'))))
        dynamic_tag = 0
        tag_name = None
        state = 'text'

    while self.cpos < len(bin):
        c, self.cpos = read_utf8_char(bin, self.cpos)
        oc = ord(c)

        if state == 'text':
            if oc == 0:
                state = 'get flags'
                continue
            elif c == '\v':
                c = '\n'
            elif c == '>':
                c = '>>'
            elif c == '<':
                c = '<<'
            buf.write(encode(c))

        elif state == 'get flags':
            if oc == 0:
                state = 'text'
                continue
            flags = oc
            state = 'get tag'

        elif state == 'get tag':
            state = 'text' if oc == 0 else 'get attr'
            if flags & FLAG_OPENING:
                tag = oc
                buf.write('<')
                if not (flags & FLAG_CLOSING):
                    is_goingdown = True
                # 0x8000 introduces a custom tag name stored inline.
                if tag == 0x8000:
                    state = 'get custom length'
                    continue
                if flags & FLAG_ATOM:
                    if not self.tag_atoms or tag not in self.tag_atoms:
                        raise LitError(
                            "atom tag %d not in atom tag list" % tag)
                    tag_name = self.tag_atoms[tag]
                    current_map = self.attr_atoms
                elif tag < len(self.tag_map):
                    tag_name = self.tag_map[tag]
                    current_map = self.tag_to_attr_map[tag]
                else:
                    dynamic_tag += 1
                    errors += 1
                    tag_name = '?' + codepoint_to_chr(tag) + '?'
                    # NOTE(review): ``tag`` is beyond ``tag_map`` here;
                    # confirm ``tag_to_attr_map`` can be indexed by it.
                    current_map = self.tag_to_attr_map[tag]
                    print('WARNING: tag %s unknown' % codepoint_to_chr(tag))
                buf.write(encode(tag_name))
            elif flags & FLAG_CLOSING:
                if depth == 0:
                    raise LitError('Extra closing tag %s at %d' % (
                        tag_name, self.cpos))
                break

        elif state == 'get attr':
            in_censorship = False
            if oc == 0:
                state = 'text'
                if not is_goingdown:
                    tag_name = None
                    dynamic_tag = 0
                    buf.write(' />')
                else:
                    buf.write('>')
                    # Resume here later to emit the closing tag ...
                    frame = (depth, tag_name, current_map,
                             dynamic_tag, errors, in_censorship, False,
                             'close tag', flags)
                    stack.append(frame)
                    # ... after the children have been processed.
                    frame = (depth + 1, None, None, 0, 0,
                             False, False, 'text', 0)
                    stack.append(frame)
                    break
            else:
                # 0x8000 introduces a custom attribute name stored inline.
                if oc == 0x8000:
                    state = 'get attr length'
                    continue
                attr = None
                if current_map and oc in current_map and current_map[oc]:
                    attr = current_map[oc]
                elif oc in self.attr_map:
                    attr = self.attr_map[oc]
                if not attr or not isinstance(attr, string_or_bytes):
                    raise LitError(
                        'Unknown attribute %d in tag %s' % (oc, tag_name))
                # Attributes whose name starts with '%' are consumed but
                # not written to the output.
                if attr.startswith('%'):
                    in_censorship = True
                    state = 'get value length'
                    continue
                buf.write(' ' + encode(attr) + '=')
                if attr in ['href', 'src']:
                    state = 'get href length'
                else:
                    state = 'get value length'

        elif state == 'get value length':
            if not in_censorship:
                buf.write('"')
            count = oc - 1
            if count == 0:
                if not in_censorship:
                    buf.write('"')
                in_censorship = False
                state = 'get attr'
                continue
            state = 'get value'
            # 0xffff (count == 0xfffe): the next character is a number,
            # not a string; handled in 'get value'.
            if oc == 0xffff:
                continue
            if count < 0 or count > (len(bin) - self.cpos):
                raise LitError('Invalid character count %d' % count)

        elif state == 'get value':
            if count == 0xfffe:
                if not in_censorship:
                    buf.write('%s"' % (oc - 1))
                in_censorship = False
                state = 'get attr'
            elif count > 0:
                if not in_censorship:
                    # The value is written inside double quotes, so a
                    # literal quote or '<' must be escaped.  These
                    # branches were previously no-ops that re-assigned
                    # the same character.
                    if c == '"':
                        c = '&quot;'
                    elif c == '<':
                        c = '&lt;'
                    buf.write(c.encode('ascii', 'xmlcharrefreplace'))
                count -= 1
            if count == 0:
                if not in_censorship:
                    buf.write('"')
                in_censorship = False
                state = 'get attr'

        elif state == 'get custom length':
            count = oc - 1
            if count <= 0 or count > len(bin) - self.cpos:
                raise LitError('Invalid character count %d' % count)
            dynamic_tag += 1
            state = 'get custom'
            tag_name = ''

        elif state == 'get custom':
            tag_name += c
            count -= 1
            if count == 0:
                buf.write(encode(tag_name))
                state = 'get attr'

        elif state == 'get attr length':
            count = oc - 1
            if count <= 0 or count > (len(bin) - self.cpos):
                raise LitError('Invalid character count %d' % count)
            buf.write(' ')
            state = 'get custom attr'

        elif state == 'get custom attr':
            buf.write(encode(c))
            count -= 1
            if count == 0:
                buf.write('=')
                state = 'get value length'

        elif state == 'get href length':
            count = oc - 1
            if count <= 0 or count > (len(bin) - self.cpos):
                raise LitError('Invalid character count %d' % count)
            href = ''
            state = 'get href'

        elif state == 'get href':
            href += c
            count -= 1
            if count == 0:
                # The first character of the stored href is dropped
                # before the target is resolved to an item path.
                doc, frag = urldefrag(href[1:])
                path = self.item_path(doc)
                if frag:
                    path = '#'.join((path, frag))
                path = urlnormalize(path)
                buf.write(encode(u'"%s"' % path))
                state = 'get attr'