Exemplo n.º 1
0
    def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
        """Parse the fixed-offset header fields stored in ``raw``.

        :param raw: Byte string holding the header record. Fields are read
            from it at fixed offsets with ``struct``. When it is 16 bytes
            or shorter only the leading fields are read and the remaining
            attributes get fixed defaults (``ancient`` is set to True).
        :param ident: Container identifier. ``'TEXTREAD'`` presets the
            codepage to 1252 (a longer header overwrites it with its own
            value) and disables the extra data flags.
        :param user_encoding: Codec name used when the stored codepage is
            not 1252 or 65001; ``'cp1252'`` is used if this is empty.
        :param log: Logger object; ``warn`` and ``exception`` are called
            on it.
        :param try_extra_data_fix: When true, a header whose length field
            is exactly 0xE4 is treated as having no extra data flags.
        :raises struct.error: If ``raw`` is too short for a field that is
            unpacked from a slice.
        """
        self.log = log
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
        self.encryption_type, = struct.unpack('>H', raw[12:14])
        if ident == 'TEXTREAD':
            self.codepage = 1252
        if len(raw) <= 16:
            # Nothing follows the 16 leading bytes: use fixed defaults.
            self.codec = 'cp1252'
            self.extra_flags = 0
            self.title = _('Unknown')
            self.language = 'ENGLISH'
            self.sublanguage = 'NEUTRAL'
            self.exth_flag, self.exth = 0, None
            self.ancient = True
            self.first_image_index = -1
            self.mobi_version = 1
        else:
            self.ancient = False
            self.doctype = raw[16:20]
            self.length, self.type, self.codepage, self.unique_id, \
                self.version = struct.unpack('>LLLLL', raw[20:40])

            try:
                self.codec = {
                    1252: 'cp1252',
                    65001: 'utf-8',
                    }[self.codepage]
            except KeyError:
                self.codec = 'cp1252' if not user_encoding else user_encoding
                log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
                    self.codec))
            # Some KF8 files have header length == 256 (generated by kindlegen
            # 2.7?). See https://bugs.launchpad.net/bugs/1067310
            # Others have header length == 264 (generated by kindlegen 2.9?).
            # See https://bugs.launchpad.net/bugs/1179144
            # A limit of 0x100 rejected those 264 byte headers and dropped
            # their extra data flags, so allow some headroom.
            max_header_length = 500

            if (ident == 'TEXTREAD' or self.length < 0xE4 or
                    self.length > max_header_length or
                    (try_extra_data_fix and self.length == 0xE4)):
                self.extra_flags = 0
            else:
                self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])

            if self.compression_type == 'DH':
                self.huff_offset, self.huff_number = struct.unpack('>LL',
                        raw[0x70:0x78])

            toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
            tend = toff + tlen
            # tend is an exclusive slice bound, so a title that ends exactly
            # at the end of raw is still fully contained in it.
            self.title = raw[toff:tend] if tend <= len(raw) else _('Unknown')
            langcode  = struct.unpack('!L', raw[0x5C:0x60])[0]
            langid    = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            self.language = main_language.get(langid, 'ENGLISH')
            self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
            self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
            self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]

            self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
            self.exth = None
            if not isinstance(self.title, unicode):
                self.title = self.title.decode(self.codec, 'replace')
            if self.exth_flag & 0x40:
                # Best effort: a broken EXTH block is logged and ignored.
                # Catch Exception rather than everything so that
                # KeyboardInterrupt/SystemExit still propagate.
                try:
                    self.exth = EXTHHeader(raw[16 + self.length:], self.codec,
                            self.title)
                    self.exth.mi.uid = self.unique_id
                    if self.exth.mi.is_null('language'):
                        try:
                            self.exth.mi.language = mobi2iana(langid, sublangid)
                        except Exception:
                            self.log.exception('Unknown language code')
                except Exception:
                    self.log.exception('Invalid EXTH header')
                    self.exth_flag = 0

            self.ncxidx = NULL_INDEX
            if len(raw) >= 0xF8:
                self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)

            # Ancient PRC files from Baen can have random values for
            # mobi_version, so be conservative
            if self.mobi_version == 8 and len(raw) >= (0xF8 + 16):
                self.dividx, self.skelidx, self.datpidx, self.othidx = \
                        struct.unpack_from(b'>4L', raw, 0xF8)

                # need to use the FDST record to find out how to properly
                # unpack the raw_ml into pieces it is simply a table of start
                # and end locations for each flow piece
                self.fdstidx, self.fdstcnt = struct.unpack_from(b'>2L', raw, 0xC0)
                # if cnt is 1 or less, fdst section number can be garbage
                if self.fdstcnt <= 1:
                    self.fdstidx = NULL_INDEX
            else: # Null values
                self.skelidx = self.dividx = self.othidx = self.fdstidx = \
                        NULL_INDEX
                # Give the remaining KF8-only attributes defaults as well so
                # they exist on every non-ancient header.
                self.datpidx = NULL_INDEX
                self.fdstcnt = 0
Exemplo n.º 2
0
    def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
        """Parse the fixed-offset header fields stored in ``raw``.

        :param raw: Byte string holding the header record. Fields are read
            from it at fixed offsets with ``struct``. When it is 16 bytes
            or shorter only the leading fields are read and the remaining
            attributes get fixed defaults (``ancient`` is set to True).
        :param ident: Container identifier. ``"TEXTREAD"`` presets the
            codepage to 1252 (a longer header overwrites it with its own
            value) and disables the extra data flags.
        :param user_encoding: Codec name used when the stored codepage is
            not 1252 or 65001; ``"cp1252"`` is used if this is empty.
        :param log: Logger object; ``warn`` and ``exception`` are called
            on it.
        :param try_extra_data_fix: When true, a header whose length field
            is exactly 0xE4 is treated as having no extra data flags.
        :raises struct.error: If ``raw`` is too short for a field that is
            unpacked from a slice.
        """
        self.log = log
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack(">HH", raw[8:12])
        self.encryption_type, = struct.unpack(">H", raw[12:14])
        if ident == "TEXTREAD":
            self.codepage = 1252
        if len(raw) <= 16:
            # Nothing follows the 16 leading bytes: use fixed defaults.
            self.codec = "cp1252"
            self.extra_flags = 0
            self.title = _("Unknown")
            self.language = "ENGLISH"
            self.sublanguage = "NEUTRAL"
            self.exth_flag, self.exth = 0, None
            self.ancient = True
            self.first_image_index = -1
            self.mobi_version = 1
        else:
            self.ancient = False
            self.doctype = raw[16:20]
            self.length, self.type, self.codepage, self.unique_id, self.version = struct.unpack(">LLLLL", raw[20:40])

            try:
                self.codec = {1252: "cp1252", 65001: "utf-8"}[self.codepage]
            except KeyError:
                self.codec = "cp1252" if not user_encoding else user_encoding
                log.warn("Unknown codepage %d. Assuming %s" % (self.codepage, self.codec))
            # Some KF8 files have header length == 264 (generated by kindlegen
            # 2.9?). See https://bugs.launchpad.net/bugs/1179144
            max_header_length = 500  # We choose 500 for future versions of kindlegen

            if (
                ident == "TEXTREAD"
                or self.length < 0xE4
                or self.length > max_header_length
                or (try_extra_data_fix and self.length == 0xE4)
            ):
                self.extra_flags = 0
            else:
                self.extra_flags, = struct.unpack(">H", raw[0xF2:0xF4])

            if self.compression_type == "DH":
                self.huff_offset, self.huff_number = struct.unpack(">LL", raw[0x70:0x78])

            toff, tlen = struct.unpack(">II", raw[0x54:0x5C])
            tend = toff + tlen
            # tend is an exclusive slice bound, so a title that ends exactly
            # at the end of raw is still fully contained in it.
            self.title = raw[toff:tend] if tend <= len(raw) else _("Unknown")
            langcode = struct.unpack("!L", raw[0x5C:0x60])[0]
            langid = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            self.language = main_language.get(langid, "ENGLISH")
            self.sublanguage = sub_language.get(sublangid, "NEUTRAL")
            self.mobi_version = struct.unpack(">I", raw[0x68:0x6C])[0]
            self.first_image_index = struct.unpack(">L", raw[0x6C : 0x6C + 4])[0]

            self.exth_flag, = struct.unpack(">L", raw[0x80:0x84])
            self.exth = None
            if not isinstance(self.title, unicode):
                self.title = self.title.decode(self.codec, "replace")
            if self.exth_flag & 0x40:
                # Best effort: a broken EXTH block is logged and ignored.
                # Catch Exception rather than everything so that
                # KeyboardInterrupt/SystemExit still propagate.
                try:
                    self.exth = EXTHHeader(raw[16 + self.length :], self.codec, self.title)
                    self.exth.mi.uid = self.unique_id
                    if self.exth.mi.is_null("language"):
                        try:
                            self.exth.mi.language = mobi2iana(langid, sublangid)
                        except Exception:
                            self.log.exception("Unknown language code")
                except Exception:
                    self.log.exception("Invalid EXTH header")
                    self.exth_flag = 0

            self.ncxidx = NULL_INDEX
            if len(raw) >= 0xF8:
                self.ncxidx, = struct.unpack_from(b">L", raw, 0xF4)

            # Ancient PRC files from Baen can have random values for
            # mobi_version, so be conservative
            if self.mobi_version == 8 and len(raw) >= (0xF8 + 16):
                self.dividx, self.skelidx, self.datpidx, self.othidx = struct.unpack_from(b">4L", raw, 0xF8)

                # need to use the FDST record to find out how to properly
                # unpack the raw_ml into pieces it is simply a table of start
                # and end locations for each flow piece
                self.fdstidx, self.fdstcnt = struct.unpack_from(b">2L", raw, 0xC0)
                # if cnt is 1 or less, fdst section number can be garbage
                if self.fdstcnt <= 1:
                    self.fdstidx = NULL_INDEX
            else:  # Null values
                self.skelidx = self.dividx = self.othidx = self.fdstidx = NULL_INDEX
                # Give the remaining KF8-only attributes defaults as well so
                # they exist on every non-ancient header.
                self.datpidx = NULL_INDEX
                self.fdstcnt = 0
Exemplo n.º 3
0
    def __init__(self, record0, offset):
        """Decode the header held in ``record0.raw`` into attributes.

        :param record0: Object whose ``raw`` attribute is the byte string
            of the header record.
        :param offset: Index of the header record. It is stored as
            ``header_offset`` and added to every record index listed in
            ``relative_records`` that is not ``NULL_INDEX``.
        :raises ValueError: If bytes 16-20 are not ``b'MOBI'``, or if the
            header is at least 248 bytes long and ``meta_orth_indx`` is
            neither ``NULL_INDEX`` nor equal to ``sect_idx``.
        """
        self.raw = record0.raw
        self.header_offset = offset

        self.compression_raw = self.raw[:2]
        self.compression = {
            1: 'No compression',
            2: 'PalmDoc compression',
            17480: 'HUFF/CDIC compression'
        }.get(
            struct.unpack(b'>H', self.compression_raw)[0],
            repr(self.compression_raw))
        self.unused = self.raw[2:4]
        self.text_length, = struct.unpack(b'>I', self.raw[4:8])
        self.number_of_text_records, self.text_record_size = \
                struct.unpack(b'>HH', self.raw[8:12])
        self.encryption_type_raw, = struct.unpack(b'>H', self.raw[12:14])
        self.encryption_type = {
            0: 'No encryption',
            1: 'Old mobipocket encryption',
            2: 'Mobipocket encryption'
        }.get(self.encryption_type_raw, repr(self.encryption_type_raw))
        self.unknown = self.raw[14:16]

        self.identifier = self.raw[16:20]
        if self.identifier != b'MOBI':
            raise ValueError('Identifier %r unknown' % self.identifier)

        self.length, = struct.unpack(b'>I', self.raw[20:24])
        self.type_raw, = struct.unpack(b'>I', self.raw[24:28])
        self.type = {
            2: 'Mobipocket book',
            3: 'PalmDOC book',
            4: 'Audio',
            257: 'News',
            258: 'News Feed',
            259: 'News magazine',
            513: 'PICS',
            514: 'Word',
            515: 'XLS',
            516: 'PPT',
            517: 'TEXT',
            518: 'HTML',
        }.get(self.type_raw, repr(self.type_raw))

        self.encoding_raw, = struct.unpack(b'>I', self.raw[28:32])
        self.encoding = {
            1252: 'cp1252',
            65001: 'utf-8',
        }.get(self.encoding_raw, repr(self.encoding_raw))
        self.uid = self.raw[32:36]
        self.file_version, = struct.unpack(b'>I', self.raw[36:40])
        self.meta_orth_indx, self.meta_infl_indx = struct.unpack(
            b'>II', self.raw[40:48])
        self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52])
        self.reserved = self.raw[52:80]
        self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84])
        self.fullname_offset, = struct.unpack(b'>I', self.raw[84:88])
        self.fullname_length, = struct.unpack(b'>I', self.raw[88:92])
        self.locale_raw, = struct.unpack(b'>I', self.raw[92:96])
        # Low byte selects the main language; the sub-language is read from
        # the bits starting at bit 10.
        langcode = self.locale_raw
        langid = langcode & 0xFF
        sublangid = (langcode >> 10) & 0xFF
        self.language = main_language.get(langid, 'ENGLISH')
        self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')

        self.input_language = self.raw[96:100]
        self.output_langauage = self.raw[100:104]
        self.min_version, = struct.unpack(b'>I', self.raw[104:108])
        self.first_image_index, = struct.unpack(b'>I', self.raw[108:112])
        self.huffman_record_offset, = struct.unpack(b'>I', self.raw[112:116])
        self.huffman_record_count, = struct.unpack(b'>I', self.raw[116:120])
        self.datp_record_offset, = struct.unpack(b'>I', self.raw[120:124])
        self.datp_record_count, = struct.unpack(b'>I', self.raw[124:128])
        self.exth_flags, = struct.unpack(b'>I', self.raw[128:132])
        self.has_exth = bool(self.exth_flags & 0x40)
        self.has_drm_data = self.length >= 174 and len(self.raw) >= 184
        if self.has_drm_data:
            self.unknown3 = self.raw[132:168]
            self.drm_offset, self.drm_count, self.drm_size, self.drm_flags = \
                    struct.unpack(b'>4I', self.raw[168:184])
        self.has_extra_data_flags = self.length >= 232 and len(
            self.raw) >= 232 + 16
        self.has_fcis_flis = False
        self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False
        self.extra_data_flags = 0
        if self.has_extra_data_flags:
            self.unknown4 = self.raw[184:192]
            if self.file_version < 8:
                self.first_text_record, self.last_text_record = \
                    struct.unpack_from(b'>HH', self.raw, 192)
                # unpack_from always returns a tuple; unpack it so that
                # fdst_count is an int here, as it is in the branch below.
                self.fdst_count, = struct.unpack_from(b'>L', self.raw, 196)
            else:
                self.fdst_idx, self.fdst_count = struct.unpack_from(
                    b'>LL', self.raw, 192)
                if self.fdst_count <= 1:
                    self.fdst_idx = NULL_INDEX
            (self.fcis_number, self.fcis_count, self.flis_number,
             self.flis_count) = struct.unpack(b'>IIII', self.raw[200:216])
            self.unknown6 = self.raw[216:224]
            self.srcs_record_index = struct.unpack(b'>I', self.raw[224:228])[0]
            self.num_srcs_records = struct.unpack(b'>I', self.raw[228:232])[0]
            self.unknown7 = self.raw[232:240]
            self.extra_data_flags = struct.unpack(b'>I', self.raw[240:244])[0]
            self.has_multibytes = bool(self.extra_data_flags & 0b1)
            self.has_indexing_bytes = bool(self.extra_data_flags & 0b10)
            self.has_uncrossable_breaks = bool(self.extra_data_flags & 0b100)
            self.primary_index_record, = struct.unpack(b'>I',
                                                       self.raw[244:248])

        if self.length >= 248:
            (self.sect_idx, self.skel_idx, self.datp_idx,
             self.oth_idx) = struct.unpack_from(b'>4L', self.raw, 248)
            self.unknown9 = self.raw[264:self.length + 16]
            if self.meta_orth_indx not in {NULL_INDEX, self.sect_idx}:
                raise ValueError('KF8 header has different Meta orth and '
                                 'section indices')

        # The following are all relative to the position of the header record
        # make them absolute for ease of debugging
        self.relative_records = {
            'sect_idx', 'skel_idx', 'datp_idx', 'oth_idx', 'meta_orth_indx',
            'huffman_record_offset', 'first_non_book_record',
            'datp_record_offset', 'fcis_number', 'flis_number',
            'primary_index_record', 'fdst_idx', 'first_image_index'
        }
        for x in self.relative_records:
            # Some of these attributes only exist for longer headers.
            if hasattr(self, x) and getattr(self, x) != NULL_INDEX:
                setattr(self, x, self.header_offset + getattr(self, x))

        # Try to find the first non-text record
        self.first_resource_record = offset + 1 + self.number_of_text_records  # Default to first record after all text records
        pointer = min(getattr(self, 'first_non_book_record', NULL_INDEX),
                      getattr(self, 'first_image_index', NULL_INDEX))
        if pointer != NULL_INDEX:
            self.first_resource_record = max(pointer,
                                             self.first_resource_record)
        self.last_resource_record = NULL_INDEX

        if self.has_exth:
            # The EXTH block starts right after the header, whose length
            # field does not count the 16 leading bytes.
            self.exth_offset = 16 + self.length

            self.exth = EXTHHeader(self.raw[self.exth_offset:])

            self.end_of_exth = self.exth_offset + self.exth.length
            self.bytes_after_exth = self.raw[self.end_of_exth:self.
                                             fullname_offset]

            if self.exth.kf8_header_index is not None and offset == 0:
                # MOBI 6 header in a joint file, adjust self.last_resource_record
                self.last_resource_record = self.exth.kf8_header_index - 2
Exemplo n.º 4
0
    def __init__(self, record0, offset):
        """Decode the header held in ``record0.raw`` into attributes.

        :param record0: Object whose ``raw`` attribute is the byte string
            of the header record.
        :param offset: Index of the header record. It is stored as
            ``header_offset`` and added to every record index listed in
            ``relative_records`` that is not ``NULL_INDEX``.
        :raises ValueError: If bytes 16-20 are not ``b'MOBI'``, or if the
            header is at least 248 bytes long and ``meta_orth_indx`` is
            neither ``NULL_INDEX`` nor equal to ``sect_idx``.
        """
        self.raw = record0.raw
        self.header_offset = offset

        self.compression_raw = self.raw[:2]
        self.compression = {1: 'No compression', 2: 'PalmDoc compression',
                17480: 'HUFF/CDIC compression'}.get(struct.unpack(b'>H',
                    self.compression_raw)[0],
                    repr(self.compression_raw))
        self.unused = self.raw[2:4]
        self.text_length, = struct.unpack(b'>I', self.raw[4:8])
        self.number_of_text_records, self.text_record_size = \
                struct.unpack(b'>HH', self.raw[8:12])
        self.encryption_type_raw, = struct.unpack(b'>H', self.raw[12:14])
        self.encryption_type = {
                0: 'No encryption',
                1: 'Old mobipocket encryption',
                2: 'Mobipocket encryption'
            }.get(self.encryption_type_raw, repr(self.encryption_type_raw))
        self.unknown = self.raw[14:16]

        self.identifier = self.raw[16:20]
        if self.identifier != b'MOBI':
            raise ValueError('Identifier %r unknown'%self.identifier)

        self.length, = struct.unpack(b'>I', self.raw[20:24])
        self.type_raw, = struct.unpack(b'>I', self.raw[24:28])
        self.type = {
                2 : 'Mobipocket book',
                3 : 'PalmDOC book',
                4 : 'Audio',
                257 : 'News',
                258 : 'News Feed',
                259 : 'News magazine',
                513 : 'PICS',
                514 : 'Word',
                515 : 'XLS',
                516 : 'PPT',
                517 : 'TEXT',
                518 : 'HTML',
            }.get(self.type_raw, repr(self.type_raw))

        self.encoding_raw, = struct.unpack(b'>I', self.raw[28:32])
        self.encoding = {
                1252 : 'cp1252',
                65001: 'utf-8',
            }.get(self.encoding_raw, repr(self.encoding_raw))
        self.uid = self.raw[32:36]
        self.file_version, = struct.unpack(b'>I', self.raw[36:40])
        self.meta_orth_indx, self.meta_infl_indx = struct.unpack(
                b'>II', self.raw[40:48])
        self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52])
        self.reserved = self.raw[52:80]
        self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84])
        self.fullname_offset, = struct.unpack(b'>I', self.raw[84:88])
        self.fullname_length, = struct.unpack(b'>I', self.raw[88:92])
        self.locale_raw, = struct.unpack(b'>I', self.raw[92:96])
        # Low byte selects the main language; the sub-language is read from
        # the bits starting at bit 10.
        langcode = self.locale_raw
        langid    = langcode & 0xFF
        sublangid = (langcode >> 10) & 0xFF
        self.language = main_language.get(langid, 'ENGLISH')
        self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')

        self.input_language = self.raw[96:100]
        self.output_langauage = self.raw[100:104]
        self.min_version, = struct.unpack(b'>I', self.raw[104:108])
        self.first_image_index, = struct.unpack(b'>I', self.raw[108:112])
        self.huffman_record_offset, = struct.unpack(b'>I', self.raw[112:116])
        self.huffman_record_count, = struct.unpack(b'>I', self.raw[116:120])
        self.datp_record_offset, = struct.unpack(b'>I', self.raw[120:124])
        self.datp_record_count, = struct.unpack(b'>I', self.raw[124:128])
        self.exth_flags, = struct.unpack(b'>I', self.raw[128:132])
        self.has_exth = bool(self.exth_flags & 0x40)
        self.has_drm_data = self.length >= 174 and len(self.raw) >= 184
        if self.has_drm_data:
            self.unknown3 = self.raw[132:168]
            self.drm_offset, self.drm_count, self.drm_size, self.drm_flags = \
                    struct.unpack(b'>4I', self.raw[168:184])
        self.has_extra_data_flags = self.length >= 232 and len(self.raw) >= 232+16
        self.has_fcis_flis = False
        self.has_multibytes = self.has_indexing_bytes = self.has_uncrossable_breaks = False
        self.extra_data_flags = 0
        if self.has_extra_data_flags:
            self.unknown4 = self.raw[184:192]
            if self.file_version < 8:
                self.first_text_record, self.last_text_record = \
                    struct.unpack_from(b'>HH', self.raw, 192)
                # unpack_from always returns a tuple; unpack it so that
                # fdst_count is an int here, as it is in the branch below.
                self.fdst_count, = struct.unpack_from(b'>L', self.raw, 196)
            else:
                self.fdst_idx, self.fdst_count = struct.unpack_from(b'>LL',
                        self.raw, 192)
                if self.fdst_count <= 1:
                    self.fdst_idx = NULL_INDEX
            (self.fcis_number, self.fcis_count, self.flis_number,
                    self.flis_count) = struct.unpack(b'>IIII',
                            self.raw[200:216])
            self.unknown6 = self.raw[216:224]
            self.srcs_record_index = struct.unpack(b'>I',
                self.raw[224:228])[0]
            self.num_srcs_records = struct.unpack(b'>I',
                self.raw[228:232])[0]
            self.unknown7 = self.raw[232:240]
            self.extra_data_flags = struct.unpack(b'>I',
                self.raw[240:244])[0]
            self.has_multibytes = bool(self.extra_data_flags & 0b1)
            self.has_indexing_bytes = bool(self.extra_data_flags & 0b10)
            self.has_uncrossable_breaks = bool(self.extra_data_flags & 0b100)
            self.primary_index_record, = struct.unpack(b'>I',
                    self.raw[244:248])

        if self.length >= 248:
            (self.sect_idx, self.skel_idx, self.datp_idx, self.oth_idx
                    ) = struct.unpack_from(b'>4L', self.raw, 248)
            self.unknown9 = self.raw[264:self.length+16]
            if self.meta_orth_indx not in {NULL_INDEX, self.sect_idx}:
                raise ValueError('KF8 header has different Meta orth and '
                        'section indices')

        # The following are all relative to the position of the header record
        # make them absolute for ease of debugging
        self.relative_records = {'sect_idx', 'skel_idx', 'datp_idx', 'oth_idx',
                'meta_orth_indx', 'huffman_record_offset',
                'first_non_book_record', 'datp_record_offset', 'fcis_number',
                'flis_number', 'primary_index_record', 'fdst_idx',
                'first_image_index'}
        for x in self.relative_records:
            # Some of these attributes only exist for longer headers.
            if hasattr(self, x) and getattr(self, x) != NULL_INDEX:
                setattr(self, x, self.header_offset+getattr(self, x))

        # Try to find the first non-text record
        self.first_resource_record = offset + 1 + self.number_of_text_records  # Default to first record after all text records
        pointer = min(getattr(self, 'first_non_book_record', NULL_INDEX), getattr(self, 'first_image_index', NULL_INDEX))
        if pointer != NULL_INDEX:
            self.first_resource_record = max(pointer, self.first_resource_record)
        self.last_resource_record = NULL_INDEX

        if self.has_exth:
            # The EXTH block starts right after the header, whose length
            # field does not count the 16 leading bytes.
            self.exth_offset = 16 + self.length

            self.exth = EXTHHeader(self.raw[self.exth_offset:])

            self.end_of_exth = self.exth_offset + self.exth.length
            self.bytes_after_exth = self.raw[self.end_of_exth:self.fullname_offset]

            if self.exth.kf8_header_index is not None and offset == 0:
                # MOBI 6 header in a joint file, adjust self.last_resource_record
                self.last_resource_record = self.exth.kf8_header_index - 2
Exemplo n.º 5
0
    def __init__(self,
                 raw,
                 ident,
                 user_encoding,
                 log,
                 try_extra_data_fix=False):
        """Parse the fixed-offset header fields stored in ``raw``.

        :param raw: Byte string holding the header record. Fields are read
            from it at fixed offsets with ``struct``. When it is 16 bytes
            or shorter only the leading fields are read and the remaining
            attributes get fixed defaults (``ancient`` is set to True).
        :param ident: Container identifier. ``'TEXTREAD'`` presets the
            codepage to 1252 (a longer header overwrites it with its own
            value) and disables the extra data flags.
        :param user_encoding: Codec name used when the stored codepage is
            not 1252 or 65001; ``'cp1252'`` is used if this is empty.
        :param log: Logger object; ``warn`` and ``exception`` are called
            on it.
        :param try_extra_data_fix: When true, a header whose length field
            is exactly 0xE4 is treated as having no extra data flags.
        :raises struct.error: If ``raw`` is too short for a field that is
            unpacked from a slice.
        """
        self.log = log
        self.compression_type = raw[:2]
        self.records, self.records_size = struct.unpack('>HH', raw[8:12])
        self.encryption_type, = struct.unpack('>H', raw[12:14])
        if ident == 'TEXTREAD':
            self.codepage = 1252
        if len(raw) <= 16:
            # Nothing follows the 16 leading bytes: use fixed defaults.
            self.codec = 'cp1252'
            self.extra_flags = 0
            self.title = _('Unknown')
            self.language = 'ENGLISH'
            self.sublanguage = 'NEUTRAL'
            self.exth_flag, self.exth = 0, None
            self.ancient = True
            self.first_image_index = -1
            self.mobi_version = 1
        else:
            self.ancient = False
            self.doctype = raw[16:20]
            self.length, self.type, self.codepage, self.unique_id, \
                self.version = struct.unpack('>LLLLL', raw[20:40])

            try:
                self.codec = {
                    1252: 'cp1252',
                    65001: 'utf-8',
                }[self.codepage]
            except KeyError:
                self.codec = 'cp1252' if not user_encoding else user_encoding
                log.warn('Unknown codepage %d. Assuming %s' %
                         (self.codepage, self.codec))
            # Some KF8 files have header length == 264 (generated by kindlegen
            # 2.9?). See https://bugs.launchpad.net/bugs/1179144
            max_header_length = 500  # We choose 500 for future versions of kindlegen

            if (ident == 'TEXTREAD' or self.length < 0xE4
                    or self.length > max_header_length
                    or (try_extra_data_fix and self.length == 0xE4)):
                self.extra_flags = 0
            else:
                self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])

            if self.compression_type == 'DH':
                self.huff_offset, self.huff_number = struct.unpack(
                    '>LL', raw[0x70:0x78])

            toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
            tend = toff + tlen
            # tend is an exclusive slice bound, so a title that ends exactly
            # at the end of raw is still fully contained in it.
            self.title = raw[toff:tend] if tend <= len(raw) else _('Unknown')
            langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
            langid = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            self.language = main_language.get(langid, 'ENGLISH')
            self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
            self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
            self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]

            self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
            self.exth = None
            if not isinstance(self.title, unicode):
                self.title = self.title.decode(self.codec, 'replace')
            if self.exth_flag & 0x40:
                # Best effort: a broken EXTH block is logged and ignored.
                # Catch Exception rather than everything so that
                # KeyboardInterrupt/SystemExit still propagate.
                try:
                    self.exth = EXTHHeader(raw[16 + self.length:], self.codec,
                                           self.title)
                    self.exth.mi.uid = self.unique_id
                    if self.exth.mi.is_null('language'):
                        try:
                            self.exth.mi.language = mobi2iana(
                                langid, sublangid)
                        except Exception:
                            self.log.exception('Unknown language code')
                except Exception:
                    self.log.exception('Invalid EXTH header')
                    self.exth_flag = 0

            self.ncxidx = NULL_INDEX
            if len(raw) >= 0xF8:
                self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)

            # Ancient PRC files from Baen can have random values for
            # mobi_version, so be conservative
            if self.mobi_version == 8 and len(raw) >= (0xF8 + 16):
                self.dividx, self.skelidx, self.datpidx, self.othidx = \
                        struct.unpack_from(b'>4L', raw, 0xF8)

                # need to use the FDST record to find out how to properly
                # unpack the raw_ml into pieces it is simply a table of start
                # and end locations for each flow piece
                self.fdstidx, self.fdstcnt = struct.unpack_from(
                    b'>2L', raw, 0xC0)
                # if cnt is 1 or less, fdst section number can be garbage
                if self.fdstcnt <= 1:
                    self.fdstidx = NULL_INDEX
            else:  # Null values
                self.skelidx = self.dividx = self.othidx = self.fdstidx = \
                        NULL_INDEX
                # Give the remaining KF8-only attributes defaults as well so
                # they exist on every non-ancient header.
                self.datpidx = NULL_INDEX
                self.fdstcnt = 0