Пример #1
0
    def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
        '''
        Parse the header fields stored at fixed offsets in ``raw``.

        :param raw: Byte string holding the header record. When it is 16
            bytes or shorter only the leading fields are read and every
            other attribute gets a fixed default (``self.ancient`` is True).
        :param ident: Container identifier. ``'TEXTREAD'`` sets
            ``self.codepage`` to 1252 up front and forces ``extra_flags``
            to 0.
        :param user_encoding: Codec name used when the codepage stored in
            the header is neither 1252 nor 65001; ``'cp1252'`` is used when
            this is falsy.
        :param log: Logger-like object; ``warn()`` and ``exception()`` are
            called on it.
        :param try_extra_data_fix: When true, a header whose length is
            exactly 0xE4 gets ``extra_flags`` forced to 0.
        :raises struct.error: If ``raw`` is too short for one of the
            fixed-size fields that are unpacked from a slice.
        '''
        self.log = log
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
        self.encryption_type, = struct.unpack('>H', raw[12:14])
        if ident == 'TEXTREAD':
            self.codepage = 1252
        if len(raw) <= 16:
            # Nothing beyond the 16 leading bytes is available, so use fixed
            # defaults instead of parsing.
            self.codec = 'cp1252'
            self.extra_flags = 0
            self.title = _('Unknown')
            self.language = 'ENGLISH'
            self.sublanguage = 'NEUTRAL'
            self.exth_flag, self.exth = 0, None
            self.ancient = True
            self.first_image_index = -1
            self.mobi_version = 1
        else:
            self.ancient = False
            self.doctype = raw[16:20]
            self.length, self.type, self.codepage, self.unique_id, \
                self.version = struct.unpack('>LLLLL', raw[20:40])

            try:
                self.codec = {
                    1252: 'cp1252',
                    65001: 'utf-8',
                    }[self.codepage]
            except KeyError:
                self.codec = 'cp1252' if not user_encoding else user_encoding
                log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
                    self.codec))
            # Some KF8 files have header length == 256 (generated by kindlegen
            # 2.7?). See https://bugs.launchpad.net/bugs/1067310
            # Others have header length == 264 (generated by kindlegen
            # 2.9?). See https://bugs.launchpad.net/bugs/1179144
            # A limit of 0x100 rejected the latter and dropped their
            # extra_flags, so allow up to 500 to leave headroom.
            max_header_length = 500

            if (ident == 'TEXTREAD' or self.length < 0xE4 or
                    self.length > max_header_length or
                    (try_extra_data_fix and self.length == 0xE4)):
                self.extra_flags = 0
            else:
                self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])

            if self.compression_type == 'DH':
                self.huff_offset, self.huff_number = struct.unpack('>LL',
                        raw[0x70:0x78])

            # Title offset and length are stored as two consecutive 32-bit
            # big-endian integers.
            toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
            tend = toff + tlen
            # NOTE(review): a title ending exactly at len(raw) is rejected
            # by the strict comparison below -- confirm that is intended.
            self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
            langcode  = struct.unpack('!L', raw[0x5C:0x60])[0]
            # Low byte is the language id; the sub-language id is the byte
            # starting at bit 10.
            langid    = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            self.language = main_language.get(langid, 'ENGLISH')
            self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
            self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
            self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]

            self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
            self.exth = None
            if not isinstance(self.title, unicode):
                self.title = self.title.decode(self.codec, 'replace')
            if self.exth_flag & 0x40:
                # The EXTH block is parsed from the bytes that follow the
                # header (16 leading bytes + declared header length).
                # Failures are logged and the flag cleared; only Exception
                # is caught so KeyboardInterrupt/SystemExit still propagate.
                try:
                    self.exth = EXTHHeader(raw[16 + self.length:], self.codec,
                            self.title)
                    self.exth.mi.uid = self.unique_id
                    if self.exth.mi.is_null('language'):
                        try:
                            self.exth.mi.language = mobi2iana(langid, sublangid)
                        except Exception:
                            self.log.exception('Unknown language code')
                except Exception:
                    self.log.exception('Invalid EXTH header')
                    self.exth_flag = 0

            self.ncxidx = NULL_INDEX
            if len(raw) >= 0xF8:
                self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)

            # Ancient PRC files from Baen can have random values for
            # mobi_version, so be conservative
            if self.mobi_version == 8 and len(raw) >= (0xF8 + 16):
                self.dividx, self.skelidx, self.datpidx, self.othidx = \
                        struct.unpack_from(b'>4L', raw, 0xF8)

                # need to use the FDST record to find out how to properly
                # unpack the raw_ml into pieces it is simply a table of start
                # and end locations for each flow piece
                self.fdstidx, self.fdstcnt = struct.unpack_from(b'>2L', raw, 0xC0)
                # if cnt is 1 or less, fdst section number can be garbage
                if self.fdstcnt <= 1:
                    self.fdstidx = NULL_INDEX
            else: # Null values
                # datpidx is included so the same index attributes exist on
                # both branches.
                self.skelidx = self.dividx = self.datpidx = self.othidx = \
                        self.fdstidx = NULL_INDEX
Пример #2
0
    def __init__(self,
                 raw,
                 ident,
                 user_encoding,
                 log,
                 try_extra_data_fix=False):
        '''
        Parse the header fields stored at fixed offsets in ``raw``.

        :param raw: Byte string holding the header record. When it is 16
            bytes or shorter only the leading fields are read and every
            other attribute gets a fixed default (``self.ancient`` is True).
        :param ident: Container identifier. ``'TEXTREAD'`` sets
            ``self.codepage`` to 1252 up front and forces ``extra_flags``
            to 0.
        :param user_encoding: Codec name used when the codepage stored in
            the header is neither 1252 nor 65001; ``'cp1252'`` is used when
            this is falsy.
        :param log: Logger-like object; ``warn()`` and ``exception()`` are
            called on it.
        :param try_extra_data_fix: When true, a header whose length is
            exactly 0xE4 gets ``extra_flags`` forced to 0.
        :raises struct.error: If ``raw`` is too short for one of the
            fixed-size fields that are unpacked from a slice.
        '''
        self.log = log
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
        self.encryption_type, = struct.unpack('>H', raw[12:14])
        if ident == 'TEXTREAD':
            self.codepage = 1252
        if len(raw) <= 16:
            # Nothing beyond the 16 leading bytes is available, so use fixed
            # defaults instead of parsing.
            self.codec = 'cp1252'
            self.extra_flags = 0
            self.title = _('Unknown')
            self.language = 'ENGLISH'
            self.sublanguage = 'NEUTRAL'
            self.exth_flag, self.exth = 0, None
            self.ancient = True
            self.first_image_index = -1
            self.mobi_version = 1
        else:
            self.ancient = False
            self.doctype = raw[16:20]
            self.length, self.type, self.codepage, self.unique_id, \
                self.version = struct.unpack('>LLLLL', raw[20:40])

            try:
                self.codec = {
                    1252: 'cp1252',
                    65001: 'utf-8',
                }[self.codepage]
            except KeyError:
                self.codec = 'cp1252' if not user_encoding else user_encoding
                log.warn('Unknown codepage %d. Assuming %s' %
                         (self.codepage, self.codec))
            # Some KF8 files have header length == 264 (generated by kindlegen
            # 2.9?). See https://bugs.launchpad.net/bugs/1179144
            max_header_length = 500  # We choose 500 for future versions of kindlegen

            if (ident == 'TEXTREAD' or self.length < 0xE4
                    or self.length > max_header_length
                    or (try_extra_data_fix and self.length == 0xE4)):
                self.extra_flags = 0
            else:
                self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])

            if self.compression_type == 'DH':
                self.huff_offset, self.huff_number = struct.unpack(
                    '>LL', raw[0x70:0x78])

            # Title offset and length are stored as two consecutive 32-bit
            # big-endian integers.
            toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
            tend = toff + tlen
            # NOTE(review): a title ending exactly at len(raw) is rejected
            # by the strict comparison below -- confirm that is intended.
            self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
            langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
            # Low byte is the language id; the sub-language id is the byte
            # starting at bit 10.
            langid = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            self.language = main_language.get(langid, 'ENGLISH')
            self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
            self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
            self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]

            self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
            self.exth = None
            if not isinstance(self.title, unicode):
                self.title = self.title.decode(self.codec, 'replace')
            if self.exth_flag & 0x40:
                # The EXTH block is parsed from the bytes that follow the
                # header (16 leading bytes + declared header length).
                # Failures are logged and the flag cleared; only Exception
                # is caught so KeyboardInterrupt/SystemExit still propagate.
                try:
                    self.exth = EXTHHeader(raw[16 + self.length:], self.codec,
                                           self.title)
                    self.exth.mi.uid = self.unique_id
                    if self.exth.mi.is_null('language'):
                        try:
                            self.exth.mi.language = mobi2iana(
                                langid, sublangid)
                        except Exception:
                            self.log.exception('Unknown language code')
                except Exception:
                    self.log.exception('Invalid EXTH header')
                    self.exth_flag = 0

            self.ncxidx = NULL_INDEX
            if len(raw) >= 0xF8:
                self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)

            # Ancient PRC files from Baen can have random values for
            # mobi_version, so be conservative
            if self.mobi_version == 8 and len(raw) >= (0xF8 + 16):
                self.dividx, self.skelidx, self.datpidx, self.othidx = \
                        struct.unpack_from(b'>4L', raw, 0xF8)

                # need to use the FDST record to find out how to properly
                # unpack the raw_ml into pieces it is simply a table of start
                # and end locations for each flow piece
                self.fdstidx, self.fdstcnt = struct.unpack_from(
                    b'>2L', raw, 0xC0)
                # if cnt is 1 or less, fdst section number can be garbage
                if self.fdstcnt <= 1:
                    self.fdstidx = NULL_INDEX
            else:  # Null values
                # datpidx is included so the same index attributes exist on
                # both branches.
                self.skelidx = self.dividx = self.datpidx = self.othidx = \
                        self.fdstidx = NULL_INDEX
Пример #3
0
    def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
        """
        Parse the header fields stored at fixed offsets in ``raw``.

        :param raw: Byte string holding the header record. When it is 16
            bytes or shorter only the leading fields are read and every
            other attribute gets a fixed default (``self.ancient`` is True).
        :param ident: Container identifier. ``"TEXTREAD"`` sets
            ``self.codepage`` to 1252 up front and forces ``extra_flags``
            to 0.
        :param user_encoding: Codec name used when the codepage stored in
            the header is neither 1252 nor 65001; ``"cp1252"`` is used when
            this is falsy.
        :param log: Logger-like object; ``warn()`` and ``exception()`` are
            called on it.
        :param try_extra_data_fix: When true, a header whose length is
            exactly 0xE4 gets ``extra_flags`` forced to 0.
        :raises struct.error: If ``raw`` is too short for one of the
            fixed-size fields that are unpacked from a slice.
        """
        self.log = log
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack(">HH", raw[8:12])
        self.encryption_type, = struct.unpack(">H", raw[12:14])
        if ident == "TEXTREAD":
            self.codepage = 1252
        if len(raw) <= 16:
            # Nothing beyond the 16 leading bytes is available, so use fixed
            # defaults instead of parsing.
            self.codec = "cp1252"
            self.extra_flags = 0
            self.title = _("Unknown")
            self.language = "ENGLISH"
            self.sublanguage = "NEUTRAL"
            self.exth_flag, self.exth = 0, None
            self.ancient = True
            self.first_image_index = -1
            self.mobi_version = 1
        else:
            self.ancient = False
            self.doctype = raw[16:20]
            self.length, self.type, self.codepage, self.unique_id, self.version = struct.unpack(">LLLLL", raw[20:40])

            try:
                self.codec = {1252: "cp1252", 65001: "utf-8"}[self.codepage]
            except KeyError:
                self.codec = "cp1252" if not user_encoding else user_encoding
                log.warn("Unknown codepage %d. Assuming %s" % (self.codepage, self.codec))
            # Some KF8 files have header length == 264 (generated by kindlegen
            # 2.9?). See https://bugs.launchpad.net/bugs/1179144
            max_header_length = 500  # We choose 500 for future versions of kindlegen

            if (
                ident == "TEXTREAD"
                or self.length < 0xE4
                or self.length > max_header_length
                or (try_extra_data_fix and self.length == 0xE4)
            ):
                self.extra_flags = 0
            else:
                self.extra_flags, = struct.unpack(">H", raw[0xF2:0xF4])

            if self.compression_type == "DH":
                self.huff_offset, self.huff_number = struct.unpack(">LL", raw[0x70:0x78])

            # Title offset and length are stored as two consecutive 32-bit
            # big-endian integers.
            toff, tlen = struct.unpack(">II", raw[0x54:0x5C])
            tend = toff + tlen
            # NOTE(review): a title ending exactly at len(raw) is rejected
            # by the strict comparison below -- confirm that is intended.
            self.title = raw[toff:tend] if tend < len(raw) else _("Unknown")
            langcode = struct.unpack("!L", raw[0x5C:0x60])[0]
            # Low byte is the language id; the sub-language id is the byte
            # starting at bit 10.
            langid = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            self.language = main_language.get(langid, "ENGLISH")
            self.sublanguage = sub_language.get(sublangid, "NEUTRAL")
            self.mobi_version = struct.unpack(">I", raw[0x68:0x6C])[0]
            self.first_image_index = struct.unpack(">L", raw[0x6C : 0x6C + 4])[0]

            self.exth_flag, = struct.unpack(">L", raw[0x80:0x84])
            self.exth = None
            if not isinstance(self.title, unicode):
                self.title = self.title.decode(self.codec, "replace")
            if self.exth_flag & 0x40:
                # The EXTH block is parsed from the bytes that follow the
                # header (16 leading bytes + declared header length).
                # Failures are logged and the flag cleared; only Exception
                # is caught so KeyboardInterrupt/SystemExit still propagate.
                try:
                    self.exth = EXTHHeader(raw[16 + self.length :], self.codec, self.title)
                    self.exth.mi.uid = self.unique_id
                    if self.exth.mi.is_null("language"):
                        try:
                            self.exth.mi.language = mobi2iana(langid, sublangid)
                        except Exception:
                            self.log.exception("Unknown language code")
                except Exception:
                    self.log.exception("Invalid EXTH header")
                    self.exth_flag = 0

            self.ncxidx = NULL_INDEX
            if len(raw) >= 0xF8:
                self.ncxidx, = struct.unpack_from(b">L", raw, 0xF4)

            # Ancient PRC files from Baen can have random values for
            # mobi_version, so be conservative
            if self.mobi_version == 8 and len(raw) >= (0xF8 + 16):
                self.dividx, self.skelidx, self.datpidx, self.othidx = struct.unpack_from(b">4L", raw, 0xF8)

                # need to use the FDST record to find out how to properly
                # unpack the raw_ml into pieces it is simply a table of start
                # and end locations for each flow piece
                self.fdstidx, self.fdstcnt = struct.unpack_from(b">2L", raw, 0xC0)
                # if cnt is 1 or less, fdst section number can be garbage
                if self.fdstcnt <= 1:
                    self.fdstidx = NULL_INDEX
            else:  # Null values
                # datpidx is included so the same index attributes exist on
                # both branches.
                self.skelidx = self.dividx = self.datpidx = self.othidx = self.fdstidx = NULL_INDEX