def parsetag(self, s):
    p = 1
    tname = None
    ttype = None
    tattr = dict_()
    while s[p:p+1] == ' ':
        p += 1
    if s[p:p+1] == '/':
        ttype = 'end'
        p += 1
        while s[p:p+1] == ' ':
            p += 1
    b = p
    while s[p:p+1] not in ('>', '/', ' ', '"', "'", '\r', '\n'):
        p += 1
    tname = s[b:p].lower()
    # some special cases
    if tname == '?xml':
        tname = 'xml'
    if tname == '!--':
        ttype = 'single'
        comment = s[p:-3].strip()
        tattr['comment'] = comment
    if ttype is None:
        # parse any attributes of begin or single tags
        while s.find('=', p) != -1:
            while s[p:p+1] == ' ':
                p += 1
            b = p
            while s[p:p+1] != '=':
                p += 1
            aname = s[b:p].lower().rstrip(' ')
            p += 1
            while s[p:p+1] == ' ':
                p += 1
            if s[p:p+1] in ('"', "'"):
                p += 1
                b = p
                while s[p:p+1] not in ('"', "'"):
                    p += 1
                val = s[b:p]
                p += 1
            else:
                b = p
                while s[p:p+1] not in ('>', '/', ' '):
                    p += 1
                val = s[b:p]
            tattr[aname] = val
    if ttype is None:
        ttype = 'begin'
        if s.find('/', p) >= 0:
            ttype = 'single'
    return ttype, tname, tattr
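# Usage sketch (not from the source): it assumes `dict_` is an insertion-ordered
# dict alias such as collections.OrderedDict. parsetag() never touches `self`,
# so it can be exercised directly on a raw tag string.
from collections import OrderedDict as dict_

ttype, tname, tattr = parsetag(None, '<item id="cover" href="cover.jpg"/>')
assert ttype == 'single' and tname == 'item'
assert tattr == {'id': 'cover', 'href': 'cover.jpg'}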
def __init__(self, fpath, debug):
    self.fpath = fpath
    self.f = None
    self.debug = debug
    self.type = b''
    self.sec_offset = 0
    self.sec_count = 0
    self.header = b''
    self.header_offset = 0
    self.header_size = 0
    self.mobi_header_offset = 0x10
    self.version = 0
    self.codepage = 1252
    self.codec = 'windows-1252'
    self.first_resc_offset = 0
    # EXTH record defaults
    self.exth = b''
    self.exth_offset = 0
    self.exth_size = 0
    # parsed metadata and image data
    self.meta_data = dict_()
    self.image_data = dict_()
def __init__(self, sect, sectNumber):
    self.sect = sect
    self.start = sectNumber
    self.header = self.sect.loadSection(self.start)
    if len(self.header) > 20 and self.header[16:20] == b'MOBI':
        self.sect.setsectiondescription(0, "Mobipocket Header")
        self.palm = False
    elif self.sect.ident == b'TEXtREAd':
        self.sect.setsectiondescription(0, "PalmDOC Header")
        self.palm = True
    else:
        raise unpackException('Unknown File Format')

    self.records, = struct.unpack_from(b'>H', self.header, 0x8)

    # set defaults in case this is a PalmDOC
    self.title = self.sect.palmname.decode('latin-1', errors='replace')
    self.length = len(self.header) - 16
    self.type = 3
    self.codepage = 1252
    self.codec = 'windows-1252'
    self.unique_id = 0
    self.version = 0
    self.hasExth = False
    self.exth = b''
    self.exth_offset = self.length + 16
    self.exth_length = 0
    self.crypto_type = 0
    self.firstnontext = self.start + self.records + 1
    self.firstresource = self.start + self.records + 1
    self.ncxidx = 0xffffffff
    self.metaOrthIndex = 0xffffffff
    self.metaInflIndex = 0xffffffff
    self.skelidx = 0xffffffff
    self.fragidx = 0xffffffff
    self.guideidx = 0xffffffff
    self.fdst = 0xffffffff
    self.mlstart = self.sect.loadSection(self.start + 1)[:4]
    self.rawSize = 0
    self.metadata = dict_()

    # set up for decompression/unpacking
    self.compression, = struct.unpack_from(b'>H', self.header, 0x0)
    if self.compression == 0x4448:
        reader = HuffcdicReader()
        huffoff, huffnum = struct.unpack_from(b'>LL', self.header, 0x70)
        huffoff = huffoff + self.start
        self.sect.setsectiondescription(huffoff, "Huffman Compression Seed")
        reader.loadHuff(self.sect.loadSection(huffoff))
        for i in range(1, huffnum):
            self.sect.setsectiondescription(huffoff + i, "Huffman CDIC Compression Seed %d" % i)
            reader.loadCdic(self.sect.loadSection(huffoff + i))
        self.unpack = reader.unpack
    elif self.compression == 2:
        self.unpack = PalmdocReader().unpack
    elif self.compression == 1:
        self.unpack = UncompressedReader().unpack
    else:
        raise unpackException('invalid compression type: 0x%4x' % self.compression)

    if self.palm:
        return

    self.length, self.type, self.codepage, self.unique_id, self.version = struct.unpack(
        b'>LLLLL', self.header[20:40])
    codec_map = {
        1252: 'windows-1252',
        65001: 'utf-8',
    }
    if self.codepage in codec_map:
        self.codec = codec_map[self.codepage]

    # title
    toff, tlen = struct.unpack(b'>II', self.header[0x54:0x5c])
    tend = toff + tlen
    self.title = self.header[toff:tend].decode(self.codec, errors='replace')

    exth_flag, = struct.unpack(b'>L', self.header[0x80:0x84])
    self.hasExth = exth_flag & 0x40
    self.exth_offset = self.length + 16
    self.exth_length = 0
    if self.hasExth:
        self.exth_length, = struct.unpack_from(b'>L', self.header, self.exth_offset + 4)
        self.exth_length = ((self.exth_length + 3) >> 2) << 2  # round to next 4 byte boundary
        self.exth = self.header[self.exth_offset:self.exth_offset + self.exth_length]

    # parse the exth / metadata
    self.parseMetaData()

    # self.mlstart = self.sect.loadSection(self.start+1)
    # self.mlstart = self.mlstart[0:4]
    self.crypto_type, = struct.unpack_from(b'>H', self.header, 0xC)

    # Start sector for additional files such as images, fonts, resources, etc.
    # Can be missing, so fall back to the default set previously
    ofst, = struct.unpack_from(b'>L', self.header, 0x6C)
    if ofst != 0xffffffff:
        self.firstresource = ofst + self.start
    ofst, = struct.unpack_from(b'>L', self.header, 0x50)
    if ofst != 0xffffffff:
        self.firstnontext = ofst + self.start

    if self.isPrintReplica():
        return

    if self.version < 8:
        # Dictionary metaOrthIndex
        self.metaOrthIndex, = struct.unpack_from(b'>L', self.header, 0x28)
        if self.metaOrthIndex != 0xffffffff:
            self.metaOrthIndex += self.start

        # Dictionary metaInflIndex
        self.metaInflIndex, = struct.unpack_from(b'>L', self.header, 0x2C)
        if self.metaInflIndex != 0xffffffff:
            self.metaInflIndex += self.start

    # handle older headers without any ncxindex info and later
    # specifically 0xe4 headers
    if self.length + 16 < 0xf8:
        return

    # NCX Index
    self.ncxidx, = struct.unpack(b'>L', self.header[0xf4:0xf8])
    if self.ncxidx != 0xffffffff:
        self.ncxidx += self.start

    # K8 specific Indexes
    if self.start != 0 or self.version == 8:
        # Index into <xml> file skeletons in RawML
        self.skelidx, = struct.unpack_from(b'>L', self.header, 0xfc)
        if self.skelidx != 0xffffffff:
            self.skelidx += self.start

        # Index into <div> sections in RawML
        self.fragidx, = struct.unpack_from(b'>L', self.header, 0xf8)
        if self.fragidx != 0xffffffff:
            self.fragidx += self.start

        # Index into Other files
        self.guideidx, = struct.unpack_from(b'>L', self.header, 0x104)
        if self.guideidx != 0xffffffff:
            self.guideidx += self.start

        # dictionaries do not seem to use the same approach in K8's
        # so disable them
        self.metaOrthIndex = 0xffffffff
        self.metaInflIndex = 0xffffffff

        # need to use the FDST record to find out how to properly unpack
        # the rawML into pieces
        # it is simply a table of start and end locations for each flow piece
        self.fdst, = struct.unpack_from(b'>L', self.header, 0xc0)
        self.fdstcnt, = struct.unpack_from(b'>L', self.header, 0xc4)
        # if cnt is 1 or less, fdst section number can be garbage
        if self.fdstcnt <= 1:
            self.fdst = 0xffffffff
    if self.fdst != 0xffffffff:
        self.fdst += self.start
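# Note on the EXTH length rounding used above: ((n + 3) >> 2) << 2 rounds n up
# to the next multiple of 4. A quick check with a hypothetical helper (not part
# of the source):
def _round_up_to_4(n):
    return ((n + 3) >> 2) << 2

assert _round_up_to_4(13) == 16
assert _round_up_to_4(16) == 16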
def number_to_field_sorted(self) -> "dict_[int, MessageField]":
    """Dict version of sorted_fields(), mapping field number to field."""
    return dict_((field.number, field) for field in self.sorted_fields())
def number_to_field(self) -> "dict_[int, MessageField]":
    """Returns the dict mapping field number to field."""
    return dict_((field.number, field) for field in self.fields())
def value_to_names(self) -> "dict_[int, str]":
    """Returns the dict of field value to field name."""
    return dict_((field.value, field.name) for field in self.fields())
def name_to_values(self) -> "dict_[str, int]":
    """Returns the dict of field name to field value."""
    return dict_((field.name, field.value) for field in self.fields())
def options_as_dict(self) -> "dict_[str, Option]":
    """Returns the dict of option name to option."""
    return dict_((name, option) for name, option in self.options())
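# Minimal, self-contained sketch of the lookup helpers above. FakeField and
# FakeEnum are hypothetical scaffolding, not part of the source; the sketch only
# assumes `dict_` behaves like collections.OrderedDict and that fields() yields
# objects with .name and .value attributes.
from collections import OrderedDict as dict_, namedtuple

FakeField = namedtuple('FakeField', 'name value')

class FakeEnum:
    def fields(self):
        return [FakeField('FIRST', 1), FakeField('SECOND', 2)]

    def value_to_names(self):
        return dict_((field.value, field.name) for field in self.fields())

    def name_to_values(self):
        return dict_((field.name, field.value) for field in self.fields())

enum = FakeEnum()
assert enum.value_to_names() == {1: 'FIRST', 2: 'SECOND'}
assert enum.name_to_values() == {'FIRST': 1, 'SECOND': 2}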