def get_pages_fast(self, mobi_file_path):
    '''
    Estimate page start offsets at 2300 characters of uncompressed text
    per page.

    This is not meant to map 1 to 1 to a print book but to be a close
    enough measure. A test book was chosen and the characters on one page
    were counted. That number was rounded to 2240, then 60 characters of
    markup were added, giving 2300.

    The uncompressed text length is used because it is easily accessible
    in MOBI files (part of the header). It is also faster to work off of
    the length than to decompress and parse the actual text.

    Returns a list of offsets 0, 2300, 4600, ... that are smaller than
    the text length read from the file.
    '''
    with lopen(mobi_file_path, 'rb') as mobi:
        record0 = PdbHeaderReader(mobi).section_data(0)
        # Bytes 4-8 of record 0 are unpacked as a big-endian unsigned int
        # and used as the text length.
        total_chars, = struct.unpack('>I', record0[4:8])

    return list(range(0, total_chars, 2300))
def get_metadata(stream, extract_cover=True):
    """
    Return metadata as a L{MetaInfo} object

    The stream is rewound and parsed with PdbHeaderReader. Title, author,
    publisher and ISBN are only read when record 0 is 132 bytes long, the
    compression is 2 or 10 and the header says a metadata record exists.
    In that same case the cover is fetched with get_cover() when
    extract_cover is true. If no title was obtained, the PDB header title
    (or a translated 'Unknown') is used.
    """
    mi = MetaInformation(None, [_('Unknown')])
    stream.seek(0)

    pheader = PdbHeaderReader(stream)
    record0 = pheader.section_data(0)

    # Only Dropbook produced 132 byte record0 files are supported
    if len(record0) == 132:
        hr = HeaderRecord(record0)

        if hr.compression in (2, 10) and hr.has_metadata == 1:
            # Anything outside this whitelist is stripped from every field.
            unwanted = re.compile(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]')
            try:
                mdata = pheader.section_data(hr.metadata_offset)
                # The record is raw bytes; decode it before splitting on
                # the NUL separators so text and bytes are never mixed.
                if isinstance(mdata, bytes):
                    mdata = mdata.decode('cp1252', 'replace')
                fields = mdata.split('\x00')

                mi.title = unwanted.sub('', fields[0])
                mi.authors = [unwanted.sub('', fields[1])]
                mi.publisher = unwanted.sub('', fields[3])
                mi.isbn = unwanted.sub('', fields[4])
            except Exception:
                # Best effort: a short or malformed metadata record simply
                # leaves the remaining fields unset. Unlike a bare except
                # this lets KeyboardInterrupt/SystemExit propagate.
                pass

            if extract_cover:
                mi.cover_data = get_cover(pheader, hr)

    if not mi.title:
        mi.title = pheader.title if pheader.title else _('Unknown')

    return mi
def get_metadata(stream, extract_cover=True):
    """
    Return metadata as a L{MetaInfo} object

    Metadata fields are read only from files whose record 0 is 132 bytes
    long, whose compression is 2 or 10 and whose header flags a metadata
    record. The cover is fetched in that same case when extract_cover is
    true. The title falls back to the PDB header title, then to a
    translated 'Unknown'.
    """
    def scrub(text):
        # Drop every character that is not on the whitelist.
        return re.sub(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]', '', text)

    mi = MetaInformation(None, [_('Unknown')])
    stream.seek(0)
    pheader = PdbHeaderReader(stream)

    # Only Dropbook produced 132 byte record0 files are supported
    record0 = pheader.section_data(0)
    hr = HeaderRecord(record0) if len(record0) == 132 else None

    if hr is not None and hr.compression in (2, 10) and hr.has_metadata == 1:
        try:
            raw = pheader.section_data(hr.metadata_offset)
            parts = raw.decode('cp1252', 'replace').split('\x00')
            # Assigned one at a time so that a short record keeps the
            # fields that were already parsed.
            mi.title = scrub(parts[0])
            mi.authors = [scrub(parts[1])]
            mi.publisher = scrub(parts[3])
            mi.isbn = scrub(parts[4])
        except Exception:
            pass

        if extract_cover:
            mi.cover_data = get_cover(pheader, hr)

    if not mi.title:
        mi.title = pheader.title or _('Unknown')

    return mi
def write_apnx(self, mobi_file_path, apnx_path, accurate=True, page_count=0):
    '''
    Build an APNX page map for the MOBI file at mobi_file_path and write
    it to apnx_path.

    If you want a fixed number of pages (such as from a custom column)
    then pass in a value to page_count, otherwise a count will be
    estimated using either the fast or accurate algorithm.

    Raises Exception when the file does not identify itself as BOOKMOBI
    or when no page mapping could be produced.
    '''
    import uuid
    apnx_meta = {
        # First 8 hex digits of a random UUID.
        'guid': str(uuid.uuid4()).replace('-', '')[:8],
        'asin': '',
        'cdetype': 'EBOK',
        'format': 'MOBI_7',
        'acr': '',
    }

    with open(mobi_file_path, 'rb') as mf:
        ident = PdbHeaderReader(mf).identity()
        if ident != 'BOOKMOBI':
            # Check that this is really a MOBI file.
            raise Exception(_('Not a valid MOBI file. Reports identity of %s') % ident)
        apnx_meta['acr'] = str(PdbHeaderReader(mf).name())

    # We'll need the PDB name, the MOBI version, and some metadata to make
    # FW 3.4 happy with KF8 files...
    with open(mobi_file_path, 'rb') as mf:
        mh = MetadataHeader(mf, default_log)
        apnx_meta['format'] = 'MOBI_8' if mh.mobi_version == 8 else 'MOBI_7'

        if mh.exth is None or not mh.exth.cdetype:
            apnx_meta['cdetype'] = 'EBOK'
        else:
            apnx_meta['cdetype'] = str(mh.exth.cdetype)

        if mh.exth is None or not mh.exth.uuid:
            apnx_meta['asin'] = ''
        else:
            apnx_meta['asin'] = str(mh.exth.uuid)

    # Get the pages depending on the chosen parser
    if page_count:
        pages = self.get_pages_exact(mobi_file_path, page_count)
    elif accurate:
        try:
            pages = self.get_pages_accurate(mobi_file_path)
        except Exception:
            # Fall back to the fast parser if we can't use the accurate
            # one. Typically this is due to the file having DRM.
            # Only Exception is caught so that KeyboardInterrupt and
            # SystemExit are not swallowed by the fallback.
            pages = self.get_pages_fast(mobi_file_path)
    else:
        pages = self.get_pages_fast(mobi_file_path)

    if not pages:
        raise Exception(_('Could not generate page mapping.'))

    # Generate the APNX file from the page mapping.
    apnx = self.generate_apnx(pages, apnx_meta)

    # Write the APNX.
    with open(apnx_path, 'wb') as apnxf:
        apnxf.write(apnx)
        fsync(apnxf)
def get_metadata(stream, extract_cover=True):
    '''
    Return metadata as a L{MetaInfo} object

    Scans sections 1..num_sections-1 for the first one whose SectionHeader
    type is DATATYPE_METADATA and parses the sub-records that follow its
    8 byte header. Each sub-record is a big-endian (type, length) pair of
    16 bit values followed by 2*length bytes of payload. Handled types:
    1 = character set, 4 = author, 5 = title, 6 = publication date.

    extract_cover is accepted for signature compatibility with the other
    metadata readers; this function never reads it.
    '''
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)

    pheader = PdbHeaderReader(stream)
    section_data = None
    for i in range(1, pheader.num_sections):
        raw_data = pheader.section_data(i)
        section_header = SectionHeader(raw_data)
        if section_header.type == DATATYPE_METADATA:
            section_data = raw_data[8:]
            break

    # No metadata section, or one too short to hold the record count.
    if not section_data or len(section_data) < 2:
        return mi

    default_encoding = 'latin-1'
    record_count, = struct.unpack('>H', section_data[0:2])
    title = None
    author = None
    pubdate = 0

    offset = 2  # Start of the current sub-record header.
    for i in range(record_count):
        try:
            rec_type, length = struct.unpack_from('>HH', section_data, offset)
            start = offset + 4
            end = start + 2 * length
            payload = section_data[start:end]

            # CharSet
            if rec_type == 1:
                val, = struct.unpack('>H', payload[:2])
                default_encoding = MIBNUM_TO_NAME.get(val, 'latin-1')
            # Author
            elif rec_type == 4:
                author = payload
            # Title
            elif rec_type == 5:
                title = payload
            # Publication Date
            elif rec_type == 6:
                pubdate, = struct.unpack('>I', payload[:4])
        except struct.error:
            # Truncated record: keep what has been parsed so far.
            break

        # Step over the 4 byte header as well as the payload so the next
        # header is read from the right place.
        offset = end

    if title:
        mi.title = title.replace(b'\0', b'').decode(default_encoding, 'replace')
    if author:
        author = author.replace(b'\0', b'').decode(default_encoding, 'replace')
        # NOTE(review): other readers in this code base assign
        # ``mi.authors``; confirm that ``author`` is the intended attribute.
        mi.author = author.split(',')
    mi.pubdate = datetime.fromtimestamp(pubdate)

    return mi
def get_metadata(stream, extract_cover=True):
    '''
    Return metadata as a L{MetaInfo} object

    Scans sections 1..num_sections-1 for the first one whose SectionHeader
    type is DATATYPE_METADATA and parses the sub-records that follow its
    8 byte header. Each sub-record is a big-endian (type, length) pair of
    16 bit values followed by 2*length bytes of payload. Handled types:
    1 = character set, 4 = author, 5 = title, 6 = publication date.

    extract_cover is accepted for signature compatibility with the other
    metadata readers; this function never reads it.
    '''
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)

    pheader = PdbHeaderReader(stream)
    section_data = None
    for i in range(1, pheader.num_sections):
        raw_data = pheader.section_data(i)
        section_header = SectionHeader(raw_data)
        if section_header.type == DATATYPE_METADATA:
            section_data = raw_data[8:]
            break

    # No metadata section, or one too short to hold the record count.
    if not section_data or len(section_data) < 2:
        return mi

    default_encoding = 'latin-1'
    record_count, = struct.unpack('>H', section_data[0:2])
    title = None
    author = None
    pubdate = 0

    offset = 2  # Start of the current sub-record header.
    for i in range(record_count):
        try:
            rec_type, length = struct.unpack_from('>HH', section_data, offset)
            start = offset + 4
            end = start + 2 * length
            # A slice, not a single index: the payload is 2*length bytes.
            payload = section_data[start:end]

            # CharSet
            if rec_type == 1:
                val, = struct.unpack('>H', payload[:2])
                default_encoding = MIBNUM_TO_NAME.get(val, 'latin-1')
            # Author
            elif rec_type == 4:
                author = payload
            # Title
            elif rec_type == 5:
                title = payload
            # Publication Date
            elif rec_type == 6:
                pubdate, = struct.unpack('>I', payload[:4])
        except struct.error:
            # Truncated record: keep what has been parsed so far.
            break

        # Step over the 4 byte header as well as the payload so the next
        # header is read from the right place.
        offset = end

    if title:
        mi.title = title.replace(b'\0', b'').decode(default_encoding, 'replace')
    if author:
        author = author.replace(b'\0', b'').decode(default_encoding, 'replace')
        # NOTE(review): other readers in this code base assign
        # ``mi.authors``; confirm that ``author`` is the intended attribute.
        mi.author = author.split(',')
    mi.pubdate = datetime.fromtimestamp(pubdate)

    return mi
def set_metadata(stream, mi):
    '''
    Merge the metadata in ``mi`` into the eReader PDB file in ``stream``.

    Nothing is written unless record 0 is 132 bytes long and the header
    compression is 2 or 10. Otherwise every section is read into memory,
    a metadata record is appended when the header reports none, the
    existing file metadata is merged with ``mi`` via smart_update(), and
    the stream is truncated and rewritten with a rebuilt PDB header
    followed by all sections.
    '''
    pheader = PdbHeaderReader(stream)

    # Only Dropbook produced 132 byte record0 files are supported.
    # The *length* of record 0 must be checked; comparing the record
    # itself with 132 is always unequal and made this function a no-op.
    if len(pheader.section_data(0)) != 132:
        return

    sections = [pheader.section_data(x) for x in range(0, pheader.section_count())]
    hr = HeaderRecord(sections[0])

    if hr.compression not in (2, 10):
        return

    metadata_index = hr.metadata_offset

    # Create a metadata record for the file if one does not already exist
    if not hr.has_metadata:
        sections += [b'', b'MeTaInFo\x00']
        last_data = len(sections) - 1
        # hr was parsed before the header is patched, so remember where
        # the new metadata record lives instead of trusting hr below.
        metadata_index = last_data - 1

        # Section data is immutable; patch a mutable copy of the header.
        header = bytearray(sections[0])
        for i in range(0, 132, 2):
            val, = struct.unpack('>H', bytes(header[i:i + 2]))
            if val >= hr.last_data_offset:
                header[i:i + 2] = struct.pack('>H', last_data)

        header[24:26] = struct.pack('>H', 1)  # Set has metadata
        header[44:46] = struct.pack('>H', last_data - 1)  # Set location of metadata
        header[52:54] = struct.pack('>H', last_data)  # Ensure last data offset is updated
        sections[0] = bytes(header)

    # Merge the metadata into the file
    file_mi = get_metadata(stream, False)
    file_mi.smart_update(mi)
    # Missing publisher/isbn are written as empty fields rather than as
    # the text "None".
    sections[metadata_index] = (
        '%s\x00%s\x00%s\x00%s\x00%s\x00' % (
            file_mi.title, authors_to_string(file_mi.authors), '',
            file_mi.publisher or '', file_mi.isbn or '')
    ).encode('cp1252', 'replace')

    # Rebuild the PDB wrapper because the offsets have changed due to the
    # new metadata.
    pheader_builder = PdbHeaderBuilder(pheader.ident, pheader.title)
    stream.seek(0)
    stream.truncate(0)
    pheader_builder.build_header([len(x) for x in sections], stream)

    # Write the data back to the file
    for item in sections:
        stream.write(item)
def get_metadata(self, stream, ftype):
    '''
    Read metadata from ``stream``.

    When the PDB identity is UPDB_IDENT or BPDB_IDENT the metadata comes
    from Reader. Any other identity rewinds the stream and defers to the
    parent class implementation.
    '''
    pdb_header = PdbHeaderReader(stream)

    if pdb_header.ident in (UPDB_IDENT, BPDB_IDENT):
        return Reader(pdb_header, stream, None, None).get_metadata()

    # Not one of our formats: hand a rewound stream to the base class.
    stream.seek(0)
    return super(HaoDooPdb, self).get_metadata(stream, ftype)
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Extract the book in ``stream`` into the current working directory
    using the AZW4 Reader and return whatever extract_content() returns.
    '''
    from calibre.ebooks.azw4.reader import Reader
    from calibre.ebooks.pdb.header import PdbHeaderReader

    pdb_header = PdbHeaderReader(stream)
    return Reader(pdb_header, stream, log, options).extract_content(os.getcwd())
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Convert the book in ``stream``.

    Identities other than UPDB_IDENT and BPDB_IDENT are delegated to the
    parent class. Otherwise the content is extracted into the current
    working directory with Reader and the result of extract_content()
    is returned.
    '''
    pdb_header = PdbHeaderReader(stream)

    if pdb_header.ident in (UPDB_IDENT, BPDB_IDENT):
        extractor = Reader(pdb_header, stream, log, options)
        return extractor.extract_content(os.getcwd())

    return super(HaoDooPdb, self).convert(stream, options, file_ext, log, accelerators)
def set_metadata(stream, mi):
    '''
    Merge the metadata in ``mi`` into the eReader PDB file in ``stream``.

    Nothing is written unless record 0 is 132 bytes long and the header
    compression is 2 or 10. Otherwise every section is read into memory,
    a metadata record is appended when the header reports none, the
    existing file metadata is merged with ``mi`` via smart_update(), and
    the stream is truncated and rewritten with a rebuilt PDB header
    followed by all sections.
    '''
    pheader = PdbHeaderReader(stream)

    # Only Dropbook produced 132 byte record0 files are supported.
    # The *length* of record 0 must be checked; comparing the record
    # itself with 132 is always unequal and made this function a no-op.
    if len(pheader.section_data(0)) != 132:
        return

    sections = [pheader.section_data(x) for x in range(0, pheader.section_count())]
    hr = HeaderRecord(sections[0])

    if hr.compression not in (2, 10):
        return

    metadata_index = hr.metadata_offset

    # Create a metadata record for the file if one does not already exist
    if not hr.has_metadata:
        sections += [b'', b'MeTaInFo\x00']
        last_data = len(sections) - 1
        # hr was parsed before the header is patched, so remember where
        # the new metadata record lives instead of trusting hr below.
        metadata_index = last_data - 1

        # Section data is immutable; patch a mutable copy of the header.
        header = bytearray(sections[0])
        for i in range(0, 132, 2):
            val, = struct.unpack('>H', bytes(header[i:i + 2]))
            if val >= hr.last_data_offset:
                header[i:i + 2] = struct.pack('>H', last_data)

        header[24:26] = struct.pack('>H', 1)  # Set has metadata
        header[44:46] = struct.pack('>H', last_data - 1)  # Set location of metadata
        header[52:54] = struct.pack('>H', last_data)  # Ensure last data offset is updated
        sections[0] = bytes(header)

    # Merge the metadata into the file
    file_mi = get_metadata(stream, False)
    file_mi.smart_update(mi)
    # Missing publisher/isbn are written as empty fields rather than as
    # the text "None".
    record = '%s\x00%s\x00%s\x00%s\x00%s\x00' % \
        (file_mi.title, authors_to_string(file_mi.authors), '',
         file_mi.publisher or '', file_mi.isbn or '')
    # Sections are written to a binary stream, so text must be encoded.
    if not isinstance(record, bytes):
        record = record.encode('cp1252', 'replace')
    sections[metadata_index] = record

    # Rebuild the PDB wrapper because the offsets have changed due to the
    # new metadata.
    pheader_builder = PdbHeaderBuilder(pheader.ident, pheader.title)
    stream.seek(0)
    stream.truncate(0)
    pheader_builder.build_header([len(x) for x in sections], stream)

    # Write the data back to the file
    for item in sections:
        stream.write(item)
def get_metadata(stream, extract_cover=True):
    '''
    Return metadata as a L{MetaInfo} object

    The stream is rewound, its PDB header parsed, and the work handed to
    Reader.get_metadata(). extract_cover is not read here.
    '''
    stream.seek(0)
    pdb_header = PdbHeaderReader(stream)
    return Reader(pdb_header, stream, None, None).get_metadata()
def set_metadata(stream, mi):
    '''
    Write metadata from ``mi`` into the PDB file in ``stream``.

    A format specific writer is looked up in MWRITER by the PDB identity
    and called when one exists. Afterwards the first 32 bytes of the
    stream are always overwritten with a sanitised, NUL padded copy of
    the title.
    '''
    stream.seek(0)
    writer = MWRITER.get(PdbHeaderReader(stream).ident, None)
    if writer:
        writer(stream, mi)

    stream.seek(0)
    # Runs of characters other than letters, digits, space and '-'
    # collapse to a single underscore.
    safe_title = re.sub('[^-A-Za-z0-9 ]+', '_', mi.title)
    # Pad or cut to exactly 31 characters, then add the terminating NUL.
    name_field = safe_title.ljust(31, '\x00')[:31]
    stream.write(name_field.encode('ascii', 'replace') + b'\x00')
def get_metadata(stream, extract_cover=True):
    """
    Return metadata as a L{MetaInfo} object

    The reader is chosen from MREADER by the PDB identity. When no reader
    is registered for the identity, a MetaInformation carrying only the
    PDB header title and an 'Unknown' author is returned.
    """
    pheader = PdbHeaderReader(stream)

    MetadataReader = MREADER.get(pheader.ident, None)

    if MetadataReader is None:
        title = pheader.title
        # The header title may be raw bytes; hand text to MetaInformation.
        if isinstance(title, bytes):
            title = title.decode('utf-8', 'replace')
        return MetaInformation(title, [_('Unknown')])

    return MetadataReader(stream, extract_cover)
def main(args=sys.argv):
    '''
    Print PDB header, eReader header and section length information for
    the file named by ``args[1]``.

    Returns 1 when no input file was given, 0 otherwise.
    '''
    if len(args) < 2:
        print('Error: requires input file.')
        return 1

    # Use the validated ``args`` (not sys.argv) and close the file
    # deterministically once the reports have been printed.
    with open(args[1], 'rb') as f:
        pheader = PdbHeaderReader(f)
        pdb_header_info(pheader)
        ereader_header_info(pheader)
        section_lengths(pheader)

    return 0
def get_pages_exact(self, mobi_file_path, page_count):
    '''
    Given a specified page count (such as from a custom column),
    create our array of pages for the apnx file by dividing by
    the content size of the book.

    Returns a list of at most page_count character offsets. When
    page_count is larger than the text length there is one page per
    character, so fewer than page_count entries are returned.

    Raises ValueError for a negative page_count. A page_count of 0 still
    raises ZeroDivisionError, as it always has.
    '''
    if page_count < 0:
        # A negative step would never reach the end of the text.
        raise ValueError('page_count must not be negative')

    with lopen(mobi_file_path, 'rb') as mf:
        phead = PdbHeaderReader(mf)
        r0 = phead.section_data(0)
        # Bytes 4-8 of record 0 are unpacked as the text length.
        text_length = struct.unpack('>I', r0[4:8])[0]

    # With more pages than characters the quotient is 0; clamp it to 1,
    # otherwise the offsets never advance and page generation never ends.
    chars_per_page = max(1, int(text_length // page_count))

    pages = list(range(0, text_length, chars_per_page))

    if len(pages) > page_count:
        # Rounding created extra page entries
        pages = pages[:page_count]

    return pages
def get_pages_exact(self, mobi_file_path, page_count):
    '''
    Given a specified page count (such as from a custom column),
    create our array of pages for the apnx file by dividing by
    the content size of the book.

    Returns a list of at most page_count character offsets. When
    page_count is larger than the text length there is one page per
    character, so fewer than page_count entries are returned.

    Raises ValueError for a negative page_count. A page_count of 0 still
    raises ZeroDivisionError, as it always has.
    '''
    if page_count < 0:
        # A negative step would never reach the end of the text.
        raise ValueError('page_count must not be negative')

    with lopen(mobi_file_path, 'rb') as mf:
        phead = PdbHeaderReader(mf)
        r0 = phead.section_data(0)
        # Bytes 4-8 of record 0 are unpacked as the text length.
        text_length = struct.unpack('>I', r0[4:8])[0]

    # With more pages than characters the quotient truncates to 0; clamp
    # it to 1, otherwise the offsets never advance and page generation
    # never ends.
    chars_per_page = max(1, int(text_length / page_count))

    pages = list(range(0, text_length, chars_per_page))

    if len(pages) > page_count:
        # Rounding created extra page entries
        pages = pages[:page_count]

    return pages
def get_metadata(stream, extract_cover=True):
    """
    Return metadata as a L{MetaInfo} object

    Dispatches to the reader registered in MREADER for the PDB identity.
    Without a registered reader the result holds only the PDB header
    title (decoded as UTF-8 when it is bytes) and an 'Unknown' author.
    """
    pheader = PdbHeaderReader(stream)
    reader = MREADER.get(pheader.ident, None)

    if reader is not None:
        return reader(stream, extract_cover)

    title = pheader.title
    if isinstance(title, bytes):
        title = title.decode('utf-8', 'replace')
    return MetaInformation(title, [_('Unknown')])
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Detect the ebook format inside the PDB container in ``stream`` and
    extract it into the current working directory.

    Returns whatever the format specific reader's extract_content()
    returns. Raises PDBError when get_reader() has no reader for the
    container identity.
    '''
    from calibre.ebooks.pdb.header import PdbHeaderReader
    from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader

    header = PdbHeaderReader(stream)
    Reader = get_reader(header.ident)

    if Reader is None:
        raise PDBError('No reader available for format within container.\n Identity is %s. Book type is %s' %
                       (header.ident, IDENTITY_TO_NAME.get(header.ident, _('Unknown'))))

    # .get() so that an identity missing from the name table cannot make
    # a debug message abort the conversion.
    log.debug('Detected ebook format as: %s with identity: %s' %
              (IDENTITY_TO_NAME.get(header.ident, _('Unknown')), header.ident))

    # os.getcwdu() only exists on Python 2; fall back to os.getcwd(),
    # which already returns text on Python 3.
    getcwd = getattr(os, 'getcwdu', os.getcwd)

    reader = Reader(header, stream, log, options)
    opf = reader.extract_content(getcwd())

    return opf
def write_apnx(self, mobi_file_path, apnx_path, accurate=True, page_count=0):
    '''
    Build an APNX page map for the MOBI file at mobi_file_path and write
    it to apnx_path.

    If you want a fixed number of pages (such as from a custom column)
    then pass in a value to page_count, otherwise a count will be
    estimated using either the fast or accurate algorithm.

    Raises Exception when the file does not identify itself as BOOKMOBI
    or when no page mapping could be produced.
    '''
    # Check that this is really a MOBI file.
    with open(mobi_file_path, 'rb') as mf:
        ident = PdbHeaderReader(mf).identity()
    if ident != 'BOOKMOBI':
        raise Exception(
            _('Not a valid MOBI file. Reports identity of %s') % ident)

    # Get the pages depending on the chosen parser
    if page_count:
        pages = self.get_pages_exact(mobi_file_path, page_count)
    elif accurate:
        try:
            pages = self.get_pages_accurate(mobi_file_path)
        except Exception:
            # Fall back to the fast parser if we can't use the accurate
            # one. Typically this is due to the file having DRM.
            # Only Exception is caught so that KeyboardInterrupt and
            # SystemExit are not swallowed by the fallback.
            pages = self.get_pages_fast(mobi_file_path)
    else:
        pages = self.get_pages_fast(mobi_file_path)

    if not pages:
        raise Exception(_('Could not generate page mapping.'))

    # Generate the APNX file from the page mapping.
    apnx = self.generate_apnx(pages)

    # Write the APNX.
    with open(apnx_path, 'wb') as apnxf:
        apnxf.write(apnx)