Пример #1
0
    def get_pages_fast(self, mobi_file_path):
        '''
        2300 characters of uncompressed text per page. This is
        not meant to map 1 to 1 to a print book but to be a
        close enough measure.

        A test book was chosen and the characters were counted
        on one page. This number was round to 2240 then 60
        characters of markup were added to the total giving
        2300.

        Uncompressed text length is used because it's easily
        accessible in MOBI files (part of the header). Also,
        It's faster to work off of the length then to
        decompress and parse the actual text.
        '''
        text_length = 0
        pages = []
        count = 0

        with lopen(mobi_file_path, 'rb') as mf:
            phead = PdbHeaderReader(mf)
            r0 = phead.section_data(0)
            text_length = struct.unpack('>I', r0[4:8])[0]

        while count < text_length:
            pages.append(count)
            count += 2300

        return pages
Пример #2
0
def get_metadata(stream, extract_cover=True):
    """
    Return metadata as a L{MetaInfo} object
    """
    mi = MetaInformation(None, [_('Unknown')])
    stream.seek(0)

    pheader = PdbHeaderReader(stream)

    # Only Dropbook produced 132 byte record0 files are supported
    if len(pheader.section_data(0)) == 132:
        hr = HeaderRecord(pheader.section_data(0))

        if hr.compression in (2, 10) and hr.has_metadata == 1:
            try:
                mdata = pheader.section_data(hr.metadata_offset)

                mdata = mdata.split('\x00')
                mi.title = re.sub(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]', '', mdata[0])
                mi.authors = [re.sub(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]', '', mdata[1])]
                mi.publisher = re.sub(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]', '', mdata[3])
                mi.isbn = re.sub(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]', '', mdata[4])
            except:
                pass

            if extract_cover:
                mi.cover_data = get_cover(pheader, hr)

    if not mi.title:
        mi.title = pheader.title if pheader.title else _('Unknown')

    return mi
Пример #3
0
def get_metadata(stream, extract_cover=True):
    """
    Return metadata as a L{MetaInfo} object
    """
    mi = MetaInformation(None, [_('Unknown')])
    stream.seek(0)

    pheader = PdbHeaderReader(stream)

    # Only Dropbook produced 132 byte record0 files are supported
    if len(pheader.section_data(0)) == 132:
        hr = HeaderRecord(pheader.section_data(0))

        if hr.compression in (2, 10) and hr.has_metadata == 1:
            try:
                mdata = pheader.section_data(hr.metadata_offset)

                mdata = mdata.decode('cp1252', 'replace').split('\x00')
                mi.title = re.sub(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]', '', mdata[0])
                mi.authors = [re.sub(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]', '', mdata[1])]
                mi.publisher = re.sub(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]', '', mdata[3])
                mi.isbn = re.sub(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]', '', mdata[4])
            except Exception:
                pass

            if extract_cover:
                mi.cover_data = get_cover(pheader, hr)

    if not mi.title:
        mi.title = pheader.title if pheader.title else _('Unknown')

    return mi
Пример #4
0
    def get_pages_fast(self, mobi_file_path):
        '''
        2300 characters of uncompressed text per page. This is
        not meant to map 1 to 1 to a print book but to be a
        close enough measure.

        A test book was chosen and the characters were counted
        on one page. This number was round to 2240 then 60
        characters of markup were added to the total giving
        2300.

        Uncompressed text length is used because it's easily
        accessible in MOBI files (part of the header). Also,
        It's faster to work off of the length then to
        decompress and parse the actual text.
        '''
        text_length = 0
        pages = []
        count = 0

        with lopen(mobi_file_path, 'rb') as mf:
            phead = PdbHeaderReader(mf)
            r0 = phead.section_data(0)
            text_length = struct.unpack('>I', r0[4:8])[0]

        while count < text_length:
            pages.append(count)
            count += 2300

        return pages
Пример #5
0
    def write_apnx(self, mobi_file_path, apnx_path, accurate=True, page_count=0):
        '''
        If you want a fixed number of pages (such as from a custom column) then
        pass in a value to page_count, otherwise a count will be estimated
        using either the fast or accurate algorithm.
        '''
        import uuid
        apnx_meta = { 'guid': str(uuid.uuid4()).replace('-', '')[:8], 'asin':
                '', 'cdetype': 'EBOK', 'format': 'MOBI_7', 'acr': '' }

        with open(mobi_file_path, 'rb') as mf:
            ident = PdbHeaderReader(mf).identity()
            if ident != 'BOOKMOBI':
                # Check that this is really a MOBI file.
                raise Exception(_('Not a valid MOBI file. Reports identity of %s') % ident)
            apnx_meta['acr'] = str(PdbHeaderReader(mf).name())

        # We'll need the PDB name, the MOBI version, and some metadata to make FW 3.4 happy with KF8 files...
        with open(mobi_file_path, 'rb') as mf:
            mh = MetadataHeader(mf, default_log)
            if mh.mobi_version == 8:
                apnx_meta['format'] = 'MOBI_8'
            else:
                apnx_meta['format'] = 'MOBI_7'
            if mh.exth is None or not mh.exth.cdetype:
                apnx_meta['cdetype'] = 'EBOK'
            else:
                apnx_meta['cdetype'] = str(mh.exth.cdetype)
            if mh.exth is None or not mh.exth.uuid:
                apnx_meta['asin'] = ''
            else:
                apnx_meta['asin'] = str(mh.exth.uuid)

        # Get the pages depending on the chosen parser
        pages = []
        if page_count:
            pages = self.get_pages_exact(mobi_file_path, page_count)
        else:
            if accurate:
                try:
                    pages = self.get_pages_accurate(mobi_file_path)
                except:
                    # Fall back to the fast parser if we can't
                    # use the accurate one. Typically this is
                    # due to the file having DRM.
                    pages = self.get_pages_fast(mobi_file_path)
            else:
                pages = self.get_pages_fast(mobi_file_path)

        if not pages:
            raise Exception(_('Could not generate page mapping.'))

        # Generate the APNX file from the page mapping.
        apnx = self.generate_apnx(pages, apnx_meta)

        # Write the APNX.
        with open(apnx_path, 'wb') as apnxf:
            apnxf.write(apnx)
            fsync(apnxf)
Пример #6
0
def get_metadata(stream, extract_cover=True):
    '''
    Return metadata as a L{MetaInfo} object
    '''
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)

    pheader = PdbHeaderReader(stream)
    section_data = None
    for i in range(1, pheader.num_sections):
        raw_data = pheader.section_data(i)
        section_header = SectionHeader(raw_data)
        if section_header.type == DATATYPE_METADATA:
            section_data = raw_data[8:]
            break

    if not section_data:
        return mi

    default_encoding = 'latin-1'
    record_count, = struct.unpack('>H', section_data[0:2])
    adv = 0
    title = None
    author = None
    pubdate = 0
    for i in xrange(record_count):
        try:
            type, length = struct.unpack_from('>HH', section_data, 2 + adv)
        except struct.error:
            break

        # CharSet
        if type == 1:
            val, = struct.unpack('>H', section_data[6+adv:8+adv])
            default_encoding = MIBNUM_TO_NAME.get(val, 'latin-1')
        # Author
        elif type == 4:
            author = section_data[6+adv+(2*length)]
        # Title
        elif type == 5:
            title = section_data[6+adv+(2*length)]
        # Publication Date
        elif type == 6:
            pubdate, = struct.unpack('>I', section_data[6+adv:6+adv+4])

        adv += 2*length

    if title:
        mi.title = title.replace('\0', '').decode(default_encoding, 'replace')
    if author:
        author = author.replace('\0', '').decode(default_encoding, 'replace')
        mi.author = author.split(',')
    mi.pubdate = datetime.fromtimestamp(pubdate)

    return mi
Пример #7
0
def get_metadata(stream, extract_cover=True):
    '''
    Return metadata as a L{MetaInfo} object
    '''
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)

    pheader = PdbHeaderReader(stream)
    section_data = None
    for i in range(1, pheader.num_sections):
        raw_data = pheader.section_data(i)
        section_header = SectionHeader(raw_data)
        if section_header.type == DATATYPE_METADATA:
            section_data = raw_data[8:]
            break

    if not section_data:
        return mi

    default_encoding = 'latin-1'
    record_count, = struct.unpack('>H', section_data[0:2])
    adv = 0
    title = None
    author = None
    pubdate = 0
    for i in range(record_count):
        try:
            type, length = struct.unpack_from('>HH', section_data, 2 + adv)
        except struct.error:
            break

        # CharSet
        if type == 1:
            val, = struct.unpack('>H', section_data[6+adv:8+adv])
            default_encoding = MIBNUM_TO_NAME.get(val, 'latin-1')
        # Author
        elif type == 4:
            author = section_data[6+adv+(2*length)]
        # Title
        elif type == 5:
            title = section_data[6+adv+(2*length)]
        # Publication Date
        elif type == 6:
            pubdate, = struct.unpack('>I', section_data[6+adv:6+adv+4])

        adv += 2*length

    if title:
        mi.title = title.replace('\0', '').decode(default_encoding, 'replace')
    if author:
        author = author.replace('\0', '').decode(default_encoding, 'replace')
        mi.author = author.split(',')
    mi.pubdate = datetime.fromtimestamp(pubdate)

    return mi
Пример #8
0
def set_metadata(stream, mi):
    pheader = PdbHeaderReader(stream)

    # Only Dropbook produced 132 byte record0 files are supported
    if pheader.section_data(0) != 132:
        return

    sections = [
        pheader.section_data(x) for x in range(0, pheader.section_count())
    ]
    hr = HeaderRecord(sections[0])

    if hr.compression not in (2, 10):
        return

    # Create a metadata record for the file if one does not already exist
    if not hr.has_metadata:
        sections += [b'', b'MeTaInFo\x00']
        last_data = len(sections) - 1

        for i in range(0, 132, 2):
            val, = struct.unpack('>H', sections[0][i:i + 2])
            if val >= hr.last_data_offset:
                sections[0][i:i + 2] = struct.pack('>H', last_data)

        sections[0][24:26] = struct.pack('>H', 1)  # Set has metadata
        sections[0][44:46] = struct.pack('>H', last_data -
                                         1)  # Set location of metadata
        sections[0][52:54] = struct.pack(
            '>H', last_data)  # Ensure last data offset is updated

    # Merge the metadata into the file
    file_mi = get_metadata(stream, False)
    file_mi.smart_update(mi)
    sections[hr.metadata_offset] = (
        '%s\x00%s\x00%s\x00%s\x00%s\x00' %
        (file_mi.title, authors_to_string(file_mi.authors), '',
         file_mi.publisher, file_mi.isbn)).encode('cp1252', 'replace')

    # Rebuild the PDB wrapper because the offsets have changed due to the
    # new metadata.
    pheader_builder = PdbHeaderBuilder(pheader.ident, pheader.title)
    stream.seek(0)
    stream.truncate(0)
    pheader_builder.build_header([len(x) for x in sections], stream)

    # Write the data back to the file
    for item in sections:
        stream.write(item)
Пример #9
0
    def get_metadata(self, stream, ftype):
        header = PdbHeaderReader(stream)
        if header.ident not in (UPDB_IDENT, BPDB_IDENT):
            stream.seek(0)
            return super(HaoDooPdb, self).get_metadata(stream, ftype)
        reader = Reader(header, stream, None, None)

        return reader.get_metadata()
Пример #10
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.pdb.header import PdbHeaderReader
        from calibre.ebooks.azw4.reader import Reader

        header = PdbHeaderReader(stream)
        reader = Reader(header, stream, log, options)
        opf = reader.extract_content(os.getcwd())

        return opf
Пример #11
0
    def convert(self, stream, options, file_ext, log, accelerators):
        header = PdbHeaderReader(stream)
        if header.ident not in (UPDB_IDENT, BPDB_IDENT):
            return super(HaoDooPdb, self).convert(stream, options, file_ext,
                                                  log, accelerators)
        reader = Reader(header, stream, log, options)
        opf = reader.extract_content(os.getcwd())

        return opf
Пример #12
0
def set_metadata(stream, mi):
    pheader = PdbHeaderReader(stream)

    # Only Dropbook produced 132 byte record0 files are supported
    if pheader.section_data(0) != 132:
        return

    sections = [pheader.section_data(x) for x in range(0, pheader.section_count())]
    hr = HeaderRecord(sections[0])

    if hr.compression not in (2, 10):
        return

    # Create a metadata record for the file if one does not alreay exist
    if not hr.has_metadata:
        sections += ['', 'MeTaInFo\x00']
        last_data = len(sections) - 1

        for i in range(0, 132, 2):
            val, = struct.unpack('>H', sections[0][i:i + 2])
            if val >= hr.last_data_offset:
                sections[0][i:i + 2] = struct.pack('>H', last_data)

        sections[0][24:26] = struct.pack('>H', 1)  # Set has metadata
        sections[0][44:46] = struct.pack('>H', last_data - 1)  # Set location of metadata
        sections[0][52:54] = struct.pack('>H', last_data)  # Ensure last data offset is updated

    # Merge the metadata into the file
    file_mi = get_metadata(stream, False)
    file_mi.smart_update(mi)
    sections[hr.metadata_offset] = '%s\x00%s\x00%s\x00%s\x00%s\x00' % \
        (file_mi.title, authors_to_string(file_mi.authors), '', file_mi.publisher, file_mi.isbn)

    # Rebuild the PDB wrapper because the offsets have changed due to the
    # new metadata.
    pheader_builder = PdbHeaderBuilder(pheader.ident, pheader.title)
    stream.seek(0)
    stream.truncate(0)
    pheader_builder.build_header([len(x) for x in sections], stream)

    # Write the data back to the file
    for item in sections:
        stream.write(item)
Пример #13
0
def get_metadata(stream, extract_cover=True):
    '''
    Return metadata as a L{MetaInfo} object
    '''
    stream.seek(0)

    pheader = PdbHeaderReader(stream)
    reader = Reader(pheader, stream, None, None)

    return reader.get_metadata()
Пример #14
0
def set_metadata(stream, mi):
    stream.seek(0)

    pheader = PdbHeaderReader(stream)

    MetadataWriter = MWRITER.get(pheader.ident, None)

    if MetadataWriter:
        MetadataWriter(stream, mi)

    stream.seek(0)
    stream.write(re.sub('[^-A-Za-z0-9 ]+', '_', mi.title).ljust(31, '\x00')[:31].encode('ascii', 'replace') + b'\x00')
Пример #15
0
def get_metadata(stream, extract_cover=True):
    """
    Return metadata as a L{MetaInfo} object
    """

    pheader = PdbHeaderReader(stream)

    MetadataReader = MREADER.get(pheader.ident, None)

    if MetadataReader is None:
        return MetaInformation(pheader.title, [_('Unknown')])

    return MetadataReader(stream, extract_cover)
Пример #16
0
def main(args=sys.argv):
    if len(args) < 2:
        print('Error: requires input file.')
        return 1

    f = open(sys.argv[1], 'rb')

    pheader = PdbHeaderReader(f)

    pdb_header_info(pheader)
    ereader_header_info(pheader)
    section_lengths(pheader)

    return 0
Пример #17
0
    def get_pages_exact(self, mobi_file_path, page_count):
        '''
        Given a specified page count (such as from a custom column),
        create our array of pages for the apnx file by dividing by
        the content size of the book.
        '''
        pages = []
        count = 0

        with lopen(mobi_file_path, 'rb') as mf:
            phead = PdbHeaderReader(mf)
            r0 = phead.section_data(0)
            text_length = struct.unpack('>I', r0[4:8])[0]

        chars_per_page = int(text_length // page_count)
        while count < text_length:
            pages.append(count)
            count += chars_per_page

        if len(pages) > page_count:
            # Rounding created extra page entries
            pages = pages[:page_count]

        return pages
Пример #18
0
    def get_pages_exact(self, mobi_file_path, page_count):
        '''
        Given a specified page count (such as from a custom column),
        create our array of pages for the apnx file by dividing by
        the content size of the book.
        '''
        pages = []
        count = 0

        with lopen(mobi_file_path, 'rb') as mf:
            phead = PdbHeaderReader(mf)
            r0 = phead.section_data(0)
            text_length = struct.unpack('>I', r0[4:8])[0]

        chars_per_page = int(text_length / page_count)
        while count < text_length:
            pages.append(count)
            count += chars_per_page

        if len(pages) > page_count:
            # Rounding created extra page entries
            pages = pages[:page_count]

        return pages
Пример #19
0
def get_metadata(stream, extract_cover=True):
    """
    Return metadata as a L{MetaInfo} object
    """

    pheader = PdbHeaderReader(stream)

    MetadataReader = MREADER.get(pheader.ident, None)

    if MetadataReader is None:
        t = pheader.title
        if isinstance(t, bytes):
            t = t.decode('utf-8', 'replace')
        return MetaInformation(t, [_('Unknown')])

    return MetadataReader(stream, extract_cover)
Пример #20
0
    def convert(self, stream, options, file_ext, log,
                accelerators):
        from calibre.ebooks.pdb.header import PdbHeaderReader
        from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader

        header = PdbHeaderReader(stream)
        Reader = get_reader(header.ident)

        if Reader is None:
            raise PDBError('No reader available for format within container.\n Identity is %s. Book type is %s' % (header.ident, IDENTITY_TO_NAME.get(header.ident, _('Unknown'))))

        log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident))

        reader = Reader(header, stream, log, options)
        opf = reader.extract_content(os.getcwdu())

        return opf
Пример #21
0
    def write_apnx(self,
                   mobi_file_path,
                   apnx_path,
                   accurate=True,
                   page_count=0):
        '''
        If you want a fixed number of pages (such as from a custom column) then
        pass in a value to page_count, otherwise a count will be estimated
        using either the fast or accurate algorithm.
        '''
        # Check that this is really a MOBI file.
        with open(mobi_file_path, 'rb') as mf:
            ident = PdbHeaderReader(mf).identity()
        if ident != 'BOOKMOBI':
            raise Exception(
                _('Not a valid MOBI file. Reports identity of %s') % ident)

        # Get the pages depending on the chosen parser
        pages = []
        if page_count:
            pages = self.get_pages_exact(mobi_file_path, page_count)
        else:
            if accurate:
                try:
                    pages = self.get_pages_accurate(mobi_file_path)
                except:
                    # Fall back to the fast parser if we can't
                    # use the accurate one. Typically this is
                    # due to the file having DRM.
                    pages = self.get_pages_fast(mobi_file_path)
            else:
                pages = self.get_pages_fast(mobi_file_path)

        if not pages:
            raise Exception(_('Could not generate page mapping.'))

        # Generate the APNX file from the page mapping.
        apnx = self.generate_apnx(pages)

        # Write the APNX.
        with open(apnx_path, 'wb') as apnxf:
            apnxf.write(apnx)