def __init__(self, oeb, opts, resources):
    # Drive the entire KF8 (Kindle Format 8) build: set up state, transform
    # the book markup, then build the binary record tables.
    #
    # oeb: parsed book object (supplies manifest/spine/toc and the log)
    # opts: conversion options namespace
    # resources: image/font resource records built earlier in the pipeline
    #
    # Fix: the two progress messages below used ``self.log(...)`` while the
    # message above uses ``self.log.info(...)``; normalized all three to
    # ``.info`` for consistency (matches the other revision of this
    # constructor present in this file).
    self.oeb, self.opts, self.log = oeb, opts, oeb.log
    self.compress = not self.opts.dont_compress
    self.has_tbs = False  # set True later if trailing byte sequences are added
    self.log.info('Creating KF8 output')

    # Create an inline ToC if one does not already exist
    self.toc_adder = TOCAdder(oeb, opts)
    self.used_images = set()
    self.resources = resources
    self.flows = [None]  # First flow item is reserved for the text
    self.records = [None]  # Placeholder for zeroth record

    self.log.info('\tGenerating KF8 markup...')
    self.dup_data()
    self.cleanup_markup()
    self.replace_resource_links()
    self.extract_css_into_flows()
    self.extract_svg_into_flows()
    self.replace_internal_links_with_placeholders()
    self.insert_aid_attributes()
    self.chunk_it_up()
    # Dump the cloned data as it is no longer needed
    del self._data_cache
    self.create_text_records()
    self.log.info('\tCreating indices...')
    self.create_fdst_records()
    self.create_indices()
    self.create_guide()
    # We do not want to use this ToC for MOBI 6, so remove it
    self.toc_adder.remove_generated_toc()
def __init__(self, oeb, opts, resources):
    # Drive the entire KF8 (Kindle Format 8) build: set up state, transform
    # the book markup, then build the binary record tables.
    #
    # oeb: parsed book object (supplies manifest/spine/toc and the log)
    # opts: conversion options namespace
    # resources: image/font resource records built earlier in the pipeline
    self.oeb, self.opts, self.log = oeb, opts, oeb.log
    self.compress = not self.opts.dont_compress
    self.has_tbs = False  # set True later if trailing byte sequences are added
    self.log.info('Creating KF8 output')

    # Create an inline ToC if one does not already exist
    self.toc_adder = TOCAdder(oeb, opts)
    self.used_images = set()
    self.resources = resources
    self.flows = [None]  # First flow item is reserved for the text
    self.records = [None]  # Placeholder for zeroth record

    # Markup transformation passes; order matters — each pass feeds the next
    self.log.info('\tGenerating KF8 markup...')
    self.dup_data()
    self.cleanup_markup()
    self.replace_resource_links()
    self.extract_css_into_flows()
    self.extract_svg_into_flows()
    self.replace_internal_links_with_placeholders()
    self.insert_aid_attributes()
    self.chunk_it_up()
    # Dump the cloned data as it is no longer needed
    del self._data_cache
    self.create_text_records()
    self.log.info('\tCreating indices...')
    self.create_fdst_records()
    self.create_indices()
    self.create_guide()
    # We do not want to use this ToC for MOBI 6, so remove it
    self.toc_adder.remove_generated_toc()
class KF8Writer(object):
    '''Generate the KF8 (Kindle Format 8) records for a book.

    The constructor runs the whole pipeline: the book markup is cloned,
    cleaned up, rewritten to use kindle:embed/kindle:flow/kindle:pos URLs,
    chunked, and finally serialized into the text/FDST/index/guide records
    that the container writer assembles into the output file.
    '''

    def __init__(self, oeb, opts, resources):
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.compress = not self.opts.dont_compress
        self.has_tbs = False  # set True later if trailing byte sequences are added
        self.log.info('Creating KF8 output')

        # Create an inline ToC if one does not already exist
        self.toc_adder = TOCAdder(oeb, opts)
        self.used_images = set()
        self.resources = resources
        self.flows = [None]  # First flow item is reserved for the text
        self.records = [None]  # Placeholder for zeroth record

        self.log('\tGenerating KF8 markup...')
        self.dup_data()
        self.cleanup_markup()
        self.replace_resource_links()
        self.extract_css_into_flows()
        self.extract_svg_into_flows()
        self.replace_internal_links_with_placeholders()
        self.insert_aid_attributes()
        self.chunk_it_up()
        # Dump the cloned data as it is no longer needed
        del self._data_cache
        self.create_text_records()
        self.log('\tCreating indices...')
        self.create_fdst_records()
        self.create_indices()
        self.create_guide()
        # We do not want to use this ToC for MOBI 6, so remove it
        self.toc_adder.remove_generated_toc()

    def dup_data(self):
        ''' Duplicate data so that any changes we make to markup/CSS only
        affect KF8 output and not MOBI 6 output '''
        self._data_cache = {}
        # Suppress cssutils logging output as it is duplicated anyway earlier
        # in the pipeline
        cssutils.log.setLevel(logging.CRITICAL)
        for item in self.oeb.manifest:
            if item.media_type in XML_DOCS:
                self._data_cache[item.href] = copy.deepcopy(item.data)
            elif item.media_type in OEB_STYLES:
                # I can't figure out how to make an efficient copy of the
                # in-memory CSSStylesheet, as deepcopy doesn't work (raises an
                # exception)
                self._data_cache[item.href] = cssutils.parseString(
                    item.data.cssText, validate=False)

    def data(self, item):
        '''Return the cloned data for a manifest item, falling back to the
        original item data when no clone was made.'''
        return self._data_cache.get(item.href, item.data)

    def cleanup_markup(self):
        '''Strip markup that is useless or would conflict with the aid/cid
        attributes this writer inserts later.'''
        for item in self.oeb.spine:
            root = self.data(item)

            # Remove empty script tags as they are pointless
            for tag in XPath('//h:script')(root):
                if not tag.text and not tag.get('src', False):
                    tag.getparent().remove(tag)

            # Remove [ac]id attributes as they are used by this code for anchor
            # to offset mapping
            for tag in XPath('//*[@aid or @cid]')(root):
                # Tuple expression used purely for the two pop() side effects
                tag.attrib.pop('aid', None), tag.attrib.pop('cid', None)

    def replace_resource_links(self):
        ''' Replace links to resources (raster images/fonts) with pointers to
        the MOBI record containing the resource. The pointers are of the form:
        kindle:embed:XXXX?mime=image/* The ?mime= is apparently optional and
        not used for fonts. '''

        def pointer(item, oref):
            # Map a resource reference to its kindle:embed URL; unknown
            # references are passed through unchanged.
            ref = urlnormalize(item.abshref(oref))
            idx = self.resources.item_map.get(ref, None)
            if idx is not None:
                # Font records start with b'FONT'; everything else is an image
                is_image = self.resources.records[idx - 1][:4] not in {b'FONT'}
                idx = to_ref(idx)
                if is_image:
                    self.used_images.add(ref)
                    return 'kindle:embed:%s?mime=%s' % (
                        idx, self.resources.mime_map[ref])
                else:
                    return 'kindle:embed:%s' % idx
            return oref

        for item in self.oeb.manifest:
            if item.media_type in XML_DOCS:
                root = self.data(item)
                # src/href attributes on images (namespace-stripped match)
                for tag in XPath('//h:img|//svg:image')(root):
                    for attr, ref in tag.attrib.iteritems():
                        if attr.split('}')[-1].lower() in {'src', 'href'}:
                            tag.attrib[attr] = pointer(item, ref)

                # url(...) references inside inline <style> blocks
                for tag in XPath('//h:style')(root):
                    if tag.text:
                        sheet = cssutils.parseString(tag.text, validate=False)
                        replacer = partial(pointer, item)
                        cssutils.replaceUrls(sheet, replacer,
                                             ignoreImportRules=True)
                        repl = sheet.cssText
                        if isbytestring(repl):
                            repl = repl.decode('utf-8')
                        tag.text = '\n' + repl + '\n'

            elif item.media_type in OEB_STYLES:
                # url(...) references in standalone stylesheets
                sheet = self.data(item)
                replacer = partial(pointer, item)
                cssutils.replaceUrls(sheet, replacer, ignoreImportRules=True)

    def extract_css_into_flows(self):
        '''Move all CSS (stylesheet files and inline <style> blocks) into
        flows and rewrite references to them as kindle:flow URLs.'''
        inlines = defaultdict(list)  # Ensure identical <style>s not repeated

        sheets = {}  # stylesheet href -> flow index
        passthrough = getattr(self.opts, 'mobi_passthrough', False)

        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                sheet = self.data(item)
                if not passthrough and not self.opts.expand_css and hasattr(
                        item.data, 'cssText'):
                    condense_sheet(sheet)
                sheets[item.href] = len(self.flows)
                self.flows.append(sheet)

        def fix_import_rules(sheet):
            # Point @import rules at the flow that now holds the sheet.
            # NOTE: reads `item` from the enclosing loop scope at call time.
            changed = False
            for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
                if rule.href:
                    href = item.abshref(rule.href)
                    idx = sheets.get(href, None)
                    if idx is not None:
                        idx = to_ref(idx)
                        rule.href = 'kindle:flow:%s?mime=text/css' % idx
                        changed = True
            return changed

        for item in self.oeb.spine:
            root = self.data(item)

            # <link rel=stylesheet> elements -> kindle:flow URLs
            for link in XPath('//h:link[@href]')(root):
                href = item.abshref(link.get('href'))
                idx = sheets.get(href, None)
                if idx is not None:
                    idx = to_ref(idx)
                    link.set('href', 'kindle:flow:%s?mime=text/css' % idx)

            # Inline <style> blocks are replaced by <link> placeholders; the
            # CSS text itself is deduplicated via the `inlines` mapping
            for tag in XPath('//h:style')(root):
                p = tag.getparent()
                idx = p.index(tag)
                raw = tag.text
                if not raw or not raw.strip():
                    extract(tag)
                    continue
                sheet = cssutils.parseString(raw, validate=False)
                if fix_import_rules(sheet):
                    raw = force_unicode(sheet.cssText, 'utf-8')

                repl = etree.Element(XHTML('link'), type='text/css',
                                     rel='stylesheet')
                repl.tail = '\n'
                p.insert(idx, repl)
                extract(tag)
                inlines[raw].append(repl)

        # One flow per distinct inline stylesheet text
        for raw, elems in inlines.iteritems():
            idx = to_ref(len(self.flows))
            self.flows.append(raw)
            for link in elems:
                link.set('href', 'kindle:flow:%s?mime=text/css' % idx)

        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                sheet = self.data(item)
                if hasattr(sheet, 'cssRules'):
                    fix_import_rules(sheet)

        # Serialize any remaining stylesheet objects into unicode text
        for i, sheet in enumerate(tuple(self.flows)):
            if hasattr(sheet, 'cssText'):
                self.flows[i] = force_unicode(sheet.cssText, 'utf-8')

    def extract_svg_into_flows(self):
        '''Move SVG documents and inline <svg> islands into flows and point
        <img> tags at the corresponding kindle:flow URLs.'''
        images = {}  # SVG manifest href -> flow index

        for item in self.oeb.manifest:
            if item.media_type == SVG_MIME:
                data = self.data(item)
                images[item.href] = len(self.flows)
                self.flows.append(
                    etree.tostring(data, encoding='UTF-8', with_tail=True,
                                   xml_declaration=True))

        for item in self.oeb.spine:
            root = self.data(item)

            # Inline <svg> elements become flows referenced by an <img>
            for svg in XPath('//svg:svg')(root):
                raw = etree.tostring(svg, encoding=unicode, with_tail=False)
                idx = len(self.flows)
                self.flows.append(raw)
                p = svg.getparent()
                pos = p.index(svg)
                img = etree.Element(XHTML('img'),
                                    src="kindle:flow:%s?mime=image/svg+xml" % to_ref(idx))
                p.insert(pos, img)
                extract(svg)

            # <img> tags pointing at SVG files -> flow URLs
            for img in XPath('//h:img[@src]')(root):
                src = img.get('src')
                abshref = item.abshref(src)
                idx = images.get(abshref, None)
                if idx is not None:
                    img.set('src', 'kindle:flow:%s?mime=image/svg+xml' %
                            to_ref(idx))

    def replace_internal_links_with_placeholders(self):
        '''Replace links within the book by kindle:pos placeholders; the real
        position values are filled in later by the chunker.'''
        self.link_map = {}  # placeholder -> (href, fragment)
        count = 0
        hrefs = {item.href for item in self.oeb.spine}
        for item in self.oeb.spine:
            root = self.data(item)

            for a in XPath('//h:a[@href]')(root):
                count += 1
                ref = item.abshref(a.get('href'))
                href, _, frag = ref.partition('#')
                try:
                    href = urlnormalize(href)
                except ValueError:
                    # a non utf-8 quoted url? Since we cannot interpret it, pass it through.
                    pass
                if href in hrefs:
                    placeholder = 'kindle:pos:fid:0000:off:%s' % to_href(count)
                    self.link_map[placeholder] = (href, frag)
                    a.set('href', placeholder)

    def insert_aid_attributes(self):
        '''Tag elements with aid (or cid, inside tables) attributes so link
        anchors can later be mapped to byte offsets.'''
        self.id_map = {}  # (spine href, anchor id) -> aid/cid value
        cid = 0
        for i, item in enumerate(self.oeb.spine):
            root = self.data(item)
            aidbase = i * int(1e6)  # keeps aids unique across spine items
            j = 0

            def in_table(elem):
                # True if elem has a <table> ancestor
                p = elem.getparent()
                if p is None:
                    return False
                if barename(p.tag).lower() == 'table':
                    return True
                return in_table(p)

            for tag in root.iterdescendants(etree.Element):
                id_ = tag.attrib.get('id', None)
                if id_ is None and tag.tag == XHTML('a'):
                    # Can happen during tweaking
                    id_ = tag.attrib.get('name', None)
                    if id_ is not None:
                        tag.attrib['id'] = id_
                tagname = barename(tag.tag).lower()
                if id_ is not None or tagname in aid_able_tags:
                    if tagname == 'table' or in_table(tag):
                        # The Kindle renderer barfs on large tables that have
                        # aid on any of their tags. See
                        # https://bugs.launchpad.net/bugs/1489495
                        if id_:
                            cid += 1
                            val = 'c%d' % cid
                            self.id_map[(item.href, id_)] = val
                            tag.set('cid', val)
                    else:
                        aid = to_base(aidbase + j, base=32)
                        tag.set('aid', aid)
                        if tag.tag == XHTML('body'):
                            # Empty fragment maps to the document body
                            self.id_map[(item.href, '')] = aid
                        if id_ is not None:
                            self.id_map[(item.href, id_)] = aid
                        j += 1

    def chunk_it_up(self):
        '''Run the chunker over the spine, producing the skeleton/chunk
        tables, the aid->offset map and the final text flow.'''
        placeholder_map = {}
        for placeholder, x in self.link_map.iteritems():
            href, frag = x
            aid = self.id_map.get(x, None)
            if aid is None:
                # Fall back to the target document's body anchor
                aid = self.id_map.get((href, ''))
            placeholder_map[placeholder] = aid
        chunker = Chunker(self.oeb, self.data, placeholder_map)

        for x in ('skel_table', 'chunk_table', 'aid_offset_map'):
            setattr(self, x, getattr(chunker, x))

        self.flows[0] = chunker.text

    def create_text_records(self):
        '''Serialize all flows into (optionally compressed) text records.'''
        self.flows = [
            x.encode('utf-8') if isinstance(x, unicode) else x
            for x in self.flows
        ]
        text = b''.join(self.flows)
        self.text_length = len(text)
        text = BytesIO(text)
        nrecords = 0
        records_size = 0
        self.uncompressed_record_lengths = []

        if self.compress:
            self.oeb.logger.info('\tCompressing markup...')

        while text.tell() < self.text_length:
            data, overlap = create_text_record(text)
            self.uncompressed_record_lengths.append(len(data))
            if self.compress:
                data = compress_doc(data)

            # Trailing overlap bytes plus a one-byte overlap length
            data += overlap
            data += pack(b'>B', len(overlap))

            self.records.append(data)
            records_size += len(data)
            nrecords += 1

        self.last_text_record_idx = nrecords
        self.first_non_text_record_idx = nrecords + 1
        # Pad so that the next records starts at a 4 byte boundary
        # NOTE(review): appending records_size % 4 zero bytes only lands on a
        # 4-byte boundary when the remainder is 2; 4 - records_size % 4 would
        # pad exactly — confirm against the format spec before changing.
        if records_size % 4 != 0:
            self.records.append(b'\x00' * (records_size % 4))
            self.first_non_text_record_idx += 1

    def create_fdst_records(self):
        '''Build the FDST record: (start, end) byte ranges for every flow.'''
        FDST = namedtuple('Flow', 'start end')
        entries = []
        self.fdst_table = []
        for i, flow in enumerate(self.flows):
            start = 0 if i == 0 else self.fdst_table[-1].end
            self.fdst_table.append(FDST(start, start + len(flow)))
            entries.extend(self.fdst_table[-1])
        # Header: b'FDST', offset to entries (12), number of entries
        rec = (b'FDST' + pack(b'>LL', 12, len(self.fdst_table)) +
               pack(b'>%dL' % len(entries), *entries))
        self.fdst_records = [rec]
        self.fdst_count = len(self.fdst_table)

    def create_indices(self):
        '''Build the skeleton, chunk and NCX (ToC) index records.'''
        self.skel_records = SkelIndex(self.skel_table)()
        self.chunk_records = ChunkIndex(self.chunk_table)()
        self.ncx_records = []
        toc = self.oeb.toc
        entries = []
        is_periodical = self.opts.mobi_periodical
        if toc.count() < 1:
            self.log.warn('Document has no ToC, MOBI will have no NCX index')
            return

        # Flatten the ToC into a depth first list
        fl = toc.iterdescendants()
        for i, item in enumerate(fl):
            entry = {
                'id': id(item),
                'index': i,
                'label': (item.title or _('Unknown')),
                'children': []
            }
            entry['depth'] = getattr(item, 'ncx_hlvl', 0)
            p = getattr(item, 'ncx_parent', None)
            if p is not None:
                entry['parent_id'] = p
            for child in item:
                # Annotate children so their entries know depth and parent
                child.ncx_parent = entry['id']
                child.ncx_hlvl = entry['depth'] + 1
                entry['children'].append(id(child))
            if is_periodical:
                if item.author:
                    entry['author'] = item.author
                if item.description:
                    entry['description'] = item.description
            entries.append(entry)
            href = item.href or ''
            href, frag = href.partition('#')[0::2]
            aid = self.id_map.get((href, frag), None)
            if aid is None:
                aid = self.id_map.get((href, ''), None)
            if aid is None:
                # Unresolvable anchor: fall back to the first chunk
                pos, fid = 0, 0
                chunk = self.chunk_table[pos]
                offset = chunk.insert_pos + fid
            else:
                pos, fid, offset = self.aid_offset_map[aid]
            entry['pos_fid'] = (pos, fid)
            entry['offset'] = offset

        # The Kindle requires entries to be sorted by (depth, playorder)
        # However, I cannot figure out how to deal with non linear ToCs, i.e.
        # ToCs whose nth entry at depth d has an offset after its n+k entry at
        # the same depth, so we sort on (depth, offset) instead. This re-orders
        # the ToC to be linear. A non-linear ToC causes section to section
        # jumping to not work. kindlegen somehow handles non-linear tocs, but I
        # cannot figure out how.
        original = sorted(entries,
                          key=lambda entry: (entry['depth'], entry['index']))
        linearized = sorted(entries,
                            key=lambda entry: (entry['depth'], entry['offset']))
        is_non_linear = original != linearized
        entries = linearized
        is_non_linear = False  # False as we are using the linearized entries

        if is_non_linear:
            # Dead branch while is_non_linear is forced False above
            for entry in entries:
                entry['kind'] = 'chapter'

        # Re-index after the sort, then resolve ids to indices
        for i, entry in enumerate(entries):
            entry['index'] = i
        id_to_index = {entry['id']: entry['index'] for entry in entries}

        # Write the hierarchical information
        for entry in entries:
            children = entry.pop('children')
            if children:
                entry['first_child'] = id_to_index[children[0]]
                entry['last_child'] = id_to_index[children[-1]]
            if 'parent_id' in entry:
                entry['parent'] = id_to_index[entry.pop('parent_id')]

        # Write the lengths
        def get_next_start(entry):
            # Offset of the next entry at the same or shallower depth, or the
            # end of the text flow
            enders = [
                e['offset'] for e in entries
                if e['depth'] <= entry['depth'] and e['offset'] > entry['offset']
            ]
            if enders:
                return min(enders)
            return len(self.flows[0])

        for entry in entries:
            entry['length'] = get_next_start(entry) - entry['offset']

        self.has_tbs = apply_trailing_byte_sequences(
            entries, self.records, self.uncompressed_record_lengths)
        idx_type = NonLinearNCXIndex if is_non_linear else NCXIndex
        self.ncx_records = idx_type(entries)()

    def create_guide(self):
        '''Build guide records (cover, start-of-text, etc.) from the OEB
        guide, resolving each reference to a (pos, fid) location.'''
        self.start_offset = None
        self.guide_table = []
        self.guide_records = []
        GuideRef = namedtuple('GuideRef', 'title type pos_fid')
        for ref in self.oeb.guide.values():
            href, frag = ref.href.partition('#')[0::2]
            aid = self.id_map.get((href, frag), None)
            if aid is None:
                aid = self.id_map.get((href, ''))
            if aid is None:
                # Reference points outside the book text; skip it
                continue
            pos, fid, offset = self.aid_offset_map[aid]
            if is_guide_ref_start(ref):
                self.start_offset = offset
            self.guide_table.append(
                GuideRef(ref.title or _('Unknown'), ref.type, (pos, fid)))

        if self.guide_table:
            self.guide_table.sort(key=lambda x: x.type)  # Needed by the Kindle
            self.guide_records = GuideIndex(self.guide_table)()
class KF8Writer(object):
    '''Generate the KF8 (Kindle Format 8) records for a book.

    The constructor runs the whole pipeline: the book markup is cloned,
    cleaned up, rewritten to use kindle:embed/kindle:flow/kindle:pos URLs,
    chunked, and finally serialized into the text/FDST/index/guide records
    that the container writer assembles into the output file.

    Review fixes applied (both align this revision with the newer revision
    of the class present earlier in this file):
      * replace_internal_links_with_placeholders: urlnormalize() is now
        guarded against ValueError raised by non-UTF-8 quoted URLs.
      * create_indices: the no-ToC guard is now ``toc.count() < 1`` — the
        previous ``< 2`` silently dropped a valid single-entry ToC while
        warning that the document had "no ToC".
    '''

    def __init__(self, oeb, opts, resources):
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.compress = not self.opts.dont_compress
        self.has_tbs = False  # set True later if trailing byte sequences are added
        self.log.info('Creating KF8 output')

        # Create an inline ToC if one does not already exist
        self.toc_adder = TOCAdder(oeb, opts)
        self.used_images = set()
        self.resources = resources
        self.flows = [None]  # First flow item is reserved for the text
        self.records = [None]  # Placeholder for zeroth record

        self.log.info('\tGenerating KF8 markup...')
        self.dup_data()
        self.cleanup_markup()
        self.replace_resource_links()
        self.extract_css_into_flows()
        self.extract_svg_into_flows()
        self.replace_internal_links_with_placeholders()
        self.insert_aid_attributes()
        self.chunk_it_up()
        # Dump the cloned data as it is no longer needed
        del self._data_cache
        self.create_text_records()
        self.log.info('\tCreating indices...')
        self.create_fdst_records()
        self.create_indices()
        self.create_guide()
        # We do not want to use this ToC for MOBI 6, so remove it
        self.toc_adder.remove_generated_toc()

    def dup_data(self):
        ''' Duplicate data so that any changes we make to markup/CSS only
        affect KF8 output and not MOBI 6 output '''
        self._data_cache = {}
        # Suppress cssutils logging output as it is duplicated anyway earlier
        # in the pipeline
        cssutils.log.setLevel(logging.CRITICAL)
        for item in self.oeb.manifest:
            if item.media_type in XML_DOCS:
                self._data_cache[item.href] = copy.deepcopy(item.data)
            elif item.media_type in OEB_STYLES:
                # I can't figure out how to make an efficient copy of the
                # in-memory CSSStylesheet, as deepcopy doesn't work (raises an
                # exception)
                self._data_cache[item.href] = cssutils.parseString(
                    item.data.cssText, validate=False)

    def data(self, item):
        '''Return the cloned data for a manifest item, falling back to the
        original item data when no clone was made.'''
        return self._data_cache.get(item.href, item.data)

    def cleanup_markup(self):
        '''Strip markup that serves no purpose in the output.'''
        for item in self.oeb.spine:
            root = self.data(item)

            # Remove empty script tags as they are pointless
            for tag in XPath('//h:script')(root):
                if not tag.text and not tag.get('src', False):
                    tag.getparent().remove(tag)

    def replace_resource_links(self):
        ''' Replace links to resources (raster images/fonts) with pointers to
        the MOBI record containing the resource. The pointers are of the form:
        kindle:embed:XXXX?mime=image/* The ?mime= is apparently optional and
        not used for fonts. '''

        def pointer(item, oref):
            # Map a resource reference to its kindle:embed URL; unknown
            # references are passed through unchanged.
            ref = urlnormalize(item.abshref(oref))
            idx = self.resources.item_map.get(ref, None)
            if idx is not None:
                # Font records start with b'FONT'; everything else is an image
                is_image = self.resources.records[idx-1][:4] not in {b'FONT'}
                idx = to_ref(idx)
                if is_image:
                    self.used_images.add(ref)
                    return 'kindle:embed:%s?mime=%s'%(idx, self.resources.mime_map[ref])
                else:
                    return 'kindle:embed:%s'%idx
            return oref

        for item in self.oeb.manifest:
            if item.media_type in XML_DOCS:
                root = self.data(item)
                # src/href attributes on images (namespace-stripped match)
                for tag in XPath('//h:img|//svg:image')(root):
                    for attr, ref in tag.attrib.iteritems():
                        if attr.split('}')[-1].lower() in {'src', 'href'}:
                            tag.attrib[attr] = pointer(item, ref)

                # url(...) references inside inline <style> blocks
                for tag in XPath('//h:style')(root):
                    if tag.text:
                        sheet = cssutils.parseString(tag.text, validate=False)
                        replacer = partial(pointer, item)
                        cssutils.replaceUrls(sheet, replacer, ignoreImportRules=True)
                        repl = sheet.cssText
                        if isbytestring(repl):
                            repl = repl.decode('utf-8')
                        tag.text = '\n'+ repl + '\n'

            elif item.media_type in OEB_STYLES:
                # url(...) references in standalone stylesheets
                sheet = self.data(item)
                replacer = partial(pointer, item)
                cssutils.replaceUrls(sheet, replacer, ignoreImportRules=True)

    def extract_css_into_flows(self):
        '''Move all CSS (stylesheet files and inline <style> blocks) into
        flows and rewrite references to them as kindle:flow URLs.'''
        inlines = defaultdict(list)  # Ensure identical <style>s not repeated

        sheets = {}  # stylesheet href -> flow index

        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                data = self.data(item).cssText
                sheets[item.href] = len(self.flows)
                self.flows.append(force_unicode(data, 'utf-8'))

        for item in self.oeb.spine:
            root = self.data(item)

            # <link rel=stylesheet> elements -> kindle:flow URLs
            for link in XPath('//h:link[@href]')(root):
                href = item.abshref(link.get('href'))
                idx = sheets.get(href, None)
                if idx is not None:
                    idx = to_ref(idx)
                    link.set('href', 'kindle:flow:%s?mime=text/css'%idx)

            # Inline <style> blocks are replaced by <link> placeholders; the
            # CSS text itself is deduplicated via the `inlines` mapping
            for tag in XPath('//h:style')(root):
                p = tag.getparent()
                idx = p.index(tag)
                raw = tag.text
                if not raw or not raw.strip():
                    extract(tag)
                    continue
                repl = etree.Element(XHTML('link'), type='text/css',
                        rel='stylesheet')
                repl.tail='\n'
                p.insert(idx, repl)
                extract(tag)
                inlines[raw].append(repl)

        # One flow per distinct inline stylesheet text
        for raw, elems in inlines.iteritems():
            idx = to_ref(len(self.flows))
            self.flows.append(raw)
            for link in elems:
                link.set('href', 'kindle:flow:%s?mime=text/css'%idx)

    def extract_svg_into_flows(self):
        '''Move SVG documents and inline <svg> islands into flows and point
        <img> tags at the corresponding kindle:flow URLs.'''
        images = {}  # SVG manifest href -> flow index

        for item in self.oeb.manifest:
            if item.media_type == SVG_MIME:
                data = self.data(item)
                images[item.href] = len(self.flows)
                self.flows.append(etree.tostring(data, encoding='UTF-8',
                    with_tail=True, xml_declaration=True))

        for item in self.oeb.spine:
            root = self.data(item)

            # Inline <svg> elements become flows referenced by an <img>
            for svg in XPath('//svg:svg')(root):
                raw = etree.tostring(svg, encoding=unicode, with_tail=False)
                idx = len(self.flows)
                self.flows.append(raw)
                p = svg.getparent()
                pos = p.index(svg)
                img = etree.Element(XHTML('img'),
                        src="kindle:flow:%s?mime=image/svg+xml"%to_ref(idx))
                p.insert(pos, img)
                extract(svg)

            # <img> tags pointing at SVG files -> flow URLs
            for img in XPath('//h:img[@src]')(root):
                src = img.get('src')
                abshref = item.abshref(src)
                idx = images.get(abshref, None)
                if idx is not None:
                    img.set('src', 'kindle:flow:%s?mime=image/svg+xml'%
                            to_ref(idx))

    def replace_internal_links_with_placeholders(self):
        '''Replace links within the book by kindle:pos placeholders; the real
        position values are filled in later by the chunker.'''
        self.link_map = {}  # placeholder -> (href, fragment)
        count = 0
        hrefs = {item.href for item in self.oeb.spine}
        for item in self.oeb.spine:
            root = self.data(item)

            for a in XPath('//h:a[@href]')(root):
                count += 1
                ref = item.abshref(a.get('href'))
                href, _, frag = ref.partition('#')
                try:
                    href = urlnormalize(href)
                except ValueError:
                    # FIX: a non utf-8 quoted url — we cannot interpret it,
                    # so pass it through unchanged instead of crashing
                    pass
                if href in hrefs:
                    placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count)
                    self.link_map[placeholder] = (href, frag)
                    a.set('href', placeholder)

    def insert_aid_attributes(self):
        '''Tag elements with aid attributes so link anchors can later be
        mapped to byte offsets.'''
        self.id_map = {}  # (spine href, anchor id) -> aid value
        for i, item in enumerate(self.oeb.spine):
            root = self.data(item)
            aidbase = i * int(1e6)  # keeps aids unique across spine items
            j = 0
            for tag in root.iterdescendants(etree.Element):
                id_ = tag.attrib.get('id', None)
                if id_ is None and tag.tag == XHTML('a'):
                    # Can happen during tweaking
                    id_ = tag.attrib.get('name', None)
                    if id_ is not None:
                        tag.attrib['id'] = id_
                if id_ is not None or barename(tag.tag).lower() in aid_able_tags:
                    aid = aidbase + j
                    tag.attrib['aid'] = to_base(aid, base=32)
                    if tag.tag == XHTML('body'):
                        # Empty fragment maps to the document body
                        self.id_map[(item.href, '')] = tag.attrib['aid']
                    if id_ is not None:
                        self.id_map[(item.href, id_)] = tag.attrib['aid']
                    j += 1

    def chunk_it_up(self):
        '''Run the chunker over the spine, producing the skeleton/chunk
        tables, the aid->offset map and the final text flow.'''
        placeholder_map = {}
        for placeholder, x in self.link_map.iteritems():
            href, frag = x
            aid = self.id_map.get(x, None)
            if aid is None:
                # Fall back to the target document's body anchor
                aid = self.id_map.get((href, ''))
            placeholder_map[placeholder] = aid
        chunker = Chunker(self.oeb, self.data, placeholder_map)

        for x in ('skel_table', 'chunk_table', 'aid_offset_map'):
            setattr(self, x, getattr(chunker, x))

        self.flows[0] = chunker.text

    def create_text_records(self):
        '''Serialize all flows into (optionally compressed) text records.'''
        self.flows = [x.encode('utf-8') if isinstance(x, unicode) else x
                for x in self.flows]
        text = b''.join(self.flows)
        self.text_length = len(text)
        text = BytesIO(text)
        nrecords = 0
        records_size = 0
        self.uncompressed_record_lengths = []

        if self.compress:
            self.oeb.logger.info('\tCompressing markup...')

        while text.tell() < self.text_length:
            data, overlap = create_text_record(text)
            self.uncompressed_record_lengths.append(len(data))
            if self.compress:
                data = compress_doc(data)

            # Trailing overlap bytes plus a one-byte overlap length
            data += overlap
            data += pack(b'>B', len(overlap))

            self.records.append(data)
            records_size += len(data)
            nrecords += 1

        self.last_text_record_idx = nrecords
        self.first_non_text_record_idx = nrecords + 1
        # Pad so that the next records starts at a 4 byte boundary
        # NOTE(review): appending records_size % 4 zero bytes only lands on a
        # 4-byte boundary when the remainder is 2; left as-is deliberately —
        # confirm against the format spec before changing.
        if records_size % 4 != 0:
            self.records.append(b'\x00'*(records_size % 4))
            self.first_non_text_record_idx += 1

    def create_fdst_records(self):
        '''Build the FDST record: (start, end) byte ranges for every flow.'''
        FDST = namedtuple('Flow', 'start end')
        entries = []
        self.fdst_table = []
        for i, flow in enumerate(self.flows):
            start = 0 if i == 0 else self.fdst_table[-1].end
            self.fdst_table.append(FDST(start, start + len(flow)))
            entries.extend(self.fdst_table[-1])
        # Header: b'FDST', offset to entries (12), number of entries
        rec = (b'FDST' + pack(b'>LL', 12, len(self.fdst_table)) +
                pack(b'>%dL'%len(entries), *entries))
        self.fdst_records = [rec]
        self.fdst_count = len(self.fdst_table)

    def create_indices(self):
        '''Build the skeleton, chunk and NCX (ToC) index records.'''
        self.skel_records = SkelIndex(self.skel_table)()
        self.chunk_records = ChunkIndex(self.chunk_table)()
        self.ncx_records = []
        toc = self.oeb.toc
        entries = []
        is_periodical = self.opts.mobi_periodical
        # FIX: was `< 2`, which silently dropped a single-entry ToC while
        # claiming the document had no ToC
        if toc.count() < 1:
            self.log.warn('Document has no ToC, MOBI will have no NCX index')
            return

        # Flatten the ToC into a depth first list
        fl = toc.iterdescendants()
        for i, item in enumerate(fl):
            entry = {'id': id(item), 'index': i,
                    'label':(item.title or _('Unknown')), 'children':[]}
            entry['depth'] = getattr(item, 'ncx_hlvl', 0)
            p = getattr(item, 'ncx_parent', None)
            if p is not None:
                entry['parent_id'] = p
            for child in item:
                # Annotate children so their entries know depth and parent
                child.ncx_parent = entry['id']
                child.ncx_hlvl = entry['depth'] + 1
                entry['children'].append(id(child))
            if is_periodical:
                if item.author:
                    entry['author'] = item.author
                if item.description:
                    entry['description'] = item.description
            entries.append(entry)
            href = item.href or ''
            href, frag = href.partition('#')[0::2]
            aid = self.id_map.get((href, frag), None)
            if aid is None:
                aid = self.id_map.get((href, ''), None)
            if aid is None:
                # Unresolvable anchor: fall back to the first chunk
                pos, fid = 0, 0
                chunk = self.chunk_table[pos]
                offset = chunk.insert_pos + fid
            else:
                pos, fid, offset = self.aid_offset_map[aid]
            entry['pos_fid'] = (pos, fid)
            entry['offset'] = offset

        # The Kindle requires entries to be sorted by (depth, playorder)
        # However, I cannot figure out how to deal with non linear ToCs, i.e.
        # ToCs whose nth entry at depth d has an offset after its n+k entry at
        # the same depth, so we sort on (depth, offset) instead. This re-orders
        # the ToC to be linear. A non-linear ToC causes section to section
        # jumping to not work. kindlegen somehow handles non-linear tocs, but I
        # cannot figure out how.
        original = sorted(entries,
                key=lambda entry: (entry['depth'], entry['index']))
        linearized = sorted(entries,
                key=lambda entry: (entry['depth'], entry['offset']))
        is_non_linear = original != linearized
        entries = linearized
        is_non_linear = False  # False as we are using the linearized entries

        if is_non_linear:
            # Dead branch while is_non_linear is forced False above
            for entry in entries:
                entry['kind'] = 'chapter'

        # Re-index after the sort, then resolve ids to indices
        for i, entry in enumerate(entries):
            entry['index'] = i
        id_to_index = {entry['id']:entry['index'] for entry in entries}

        # Write the hierarchical information
        for entry in entries:
            children = entry.pop('children')
            if children:
                entry['first_child'] = id_to_index[children[0]]
                entry['last_child'] = id_to_index[children[-1]]
            if 'parent_id' in entry:
                entry['parent'] = id_to_index[entry.pop('parent_id')]

        # Write the lengths
        def get_next_start(entry):
            # Offset of the next entry at the same or shallower depth, or the
            # end of the text flow
            enders = [e['offset'] for e in entries if e['depth'] <=
                    entry['depth'] and e['offset'] > entry['offset']]
            if enders:
                return min(enders)
            return len(self.flows[0])

        for entry in entries:
            entry['length'] = get_next_start(entry) - entry['offset']

        self.has_tbs = apply_trailing_byte_sequences(entries, self.records,
                self.uncompressed_record_lengths)
        idx_type = NonLinearNCXIndex if is_non_linear else NCXIndex
        self.ncx_records = idx_type(entries)()

    def create_guide(self):
        '''Build guide records (cover, start-of-text, etc.) from the OEB
        guide, resolving each reference to a (pos, fid) location.'''
        self.start_offset = None
        self.guide_table = []
        self.guide_records = []
        GuideRef = namedtuple('GuideRef', 'title type pos_fid')
        for ref in self.oeb.guide.values():
            href, frag = ref.href.partition('#')[0::2]
            aid = self.id_map.get((href, frag), None)
            if aid is None:
                aid = self.id_map.get((href, ''))
            if aid is None:
                # Reference points outside the book text; skip it
                continue
            pos, fid, offset = self.aid_offset_map[aid]
            if is_guide_ref_start(ref):
                self.start_offset = offset
            self.guide_table.append(GuideRef(ref.title or _('Unknown'),
                ref.type, (pos, fid)))

        if self.guide_table:
            self.guide_table.sort(key=lambda x:x.type)  # Needed by the Kindle
            self.guide_records = GuideIndex(self.guide_table)()
def convert(self, oeb, output_path, input_plugin, opts, log):
    # Convert the in-memory OEB book into an EPUB file at output_path.
    #
    # oeb: parsed book; input_plugin: source format plugin (may carry
    # encrypted_fonts); opts: conversion options; log: logger.
    #
    # Fix: the UUID-scheme check below previously called .lower() directly on
    # the result of x.get(OPF('scheme'), None); when an identifier had no
    # scheme attribute that raised AttributeError before the urn:uuid:
    # fallback test could run. The scheme is now None-guarded.
    self.log, self.opts, self.oeb = log, opts, oeb

    # Optional inline ToC generation (reuses the MOBI writer's TOCAdder)
    if self.opts.epub_inline_toc:
        from calibre.ebooks.mobi.writer8.toc import TOCAdder
        opts.mobi_toc_at_start = not opts.epub_toc_at_end
        opts.mobi_passthrough = False
        opts.no_inline_toc = False
        TOCAdder(oeb, opts, replace_previous_inline_toc=True,
                 ignore_existing_toc=True)

    # Normalize internal filenames (flatten hierarchy or just de-duplicate)
    if self.opts.epub_flatten:
        from calibre.ebooks.oeb.transforms.filenames import FlatFilenames
        FlatFilenames()(oeb, opts)
    else:
        from calibre.ebooks.oeb.transforms.filenames import UniqueFilenames
        UniqueFilenames()(oeb, opts)

    # Renderer workarounds and markup/image normalization
    self.workaround_ade_quirks()
    self.workaround_webkit_quirks()
    self.upshift_markup()
    from calibre.ebooks.oeb.transforms.rescale import RescaleImages
    RescaleImages(check_colorspaces=True)(oeb, opts)

    # Split over-large flows so readers with size limits can cope
    from calibre.ebooks.oeb.transforms.split import Split
    split = Split(not self.opts.dont_split_on_page_breaks,
                  max_flow_size=self.opts.flow_size * 1024)
    split(self.oeb, self.opts)

    from calibre.ebooks.oeb.transforms.cover import CoverManager
    cm = CoverManager(
        no_default_cover=self.opts.no_default_epub_cover,
        no_svg_cover=self.opts.no_svg_cover,
        preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
    cm(self.oeb, self.opts, self.log)

    self.workaround_sony_quirks()

    # Guarantee at least a minimal ToC pointing at the first spine item
    if self.oeb.toc.count() == 0:
        self.log.warn('This EPUB file has no Table of Contents. '
                      'Creating a default TOC')
        first = next(iter(self.oeb.spine))
        self.oeb.toc.add(_('Start'), first.href)

    # Locate (or synthesize) a UUID identifier; needed for font obfuscation
    from calibre.ebooks.oeb.base import OPF
    identifiers = oeb.metadata['identifier']
    uuid = None
    for x in identifiers:
        scheme = x.get(OPF('scheme'), None)
        if (scheme is not None and scheme.lower() == 'uuid') or \
                str(x).startswith('urn:uuid:'):
            uuid = str(x).split(':')[-1]
            break
    encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

    if uuid is None:
        self.log.warn('No UUID identifier found')
        from uuid import uuid4
        uuid = str(uuid4())
        oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)

    if encrypted_fonts and not uuid.startswith('urn:uuid:'):
        # Apparently ADE requires this value to start with urn:uuid:
        # for some absurd reason, or it will throw a hissy fit and refuse
        # to use the obfuscated fonts.
        for x in identifiers:
            if str(x) == uuid:
                x.content = 'urn:uuid:' + uuid

    # Serialize to a temp dir via the OEB output plugin, then zip up
    with TemporaryDirectory('_epub_output') as tdir:
        from calibre.customize.ui import plugin_for_output_format
        metadata_xml = None
        extra_entries = []
        if self.is_periodical:
            if self.opts.output_profile.epub_periodical_format == 'sony':
                from calibre.ebooks.epub.periodical import sony_metadata
                metadata_xml, atom_xml = sony_metadata(oeb)
                extra_entries = [('atom.xml', 'application/atom+xml',
                                  atom_xml)]
        oeb_output = plugin_for_output_format('oeb')
        oeb_output.convert(oeb, tdir, input_plugin, opts, log)
        opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
        self.condense_ncx([
            os.path.join(tdir, x) for x in os.listdir(tdir)
            if x.endswith('.ncx')
        ][0])
        encryption = None
        if encrypted_fonts:
            encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)

        from calibre.ebooks.epub import initialize_container
        with initialize_container(output_path, os.path.basename(opf),
                                  extra_entries=extra_entries) as epub:
            epub.add_dir(tdir)
            if encryption is not None:
                epub.writestr('META-INF/encryption.xml', encryption)
            if metadata_xml is not None:
                epub.writestr('META-INF/metadata.xml',
                              metadata_xml.encode('utf-8'))

        # Optionally also unpack the finished EPUB for inspection
        if opts.extract_to is not None:
            from calibre.utils.zipfile import ZipFile
            if os.path.exists(opts.extract_to):
                if os.path.isdir(opts.extract_to):
                    shutil.rmtree(opts.extract_to)
                else:
                    os.remove(opts.extract_to)
            os.mkdir(opts.extract_to)
            with ZipFile(output_path) as zf:
                zf.extractall(path=opts.extract_to)
            self.log.info('EPUB extracted to', opts.extract_to)