def test_toc_detection(self):
    """Verify that get_toc() finds a ToC in both EPUB 2 and EPUB 3 containers."""
    ep = os.path.join(self.tdir, 'book.epub')
    create_book(Metadata('Test ToC'), ep)
    c = get_container(ep, tdir=os.path.join(self.tdir, 'container'), tweak_mode=True)
    self.assertEqual(2, c.opf_version_parsed.major)
    self.assertTrue(len(get_toc(c)))
    c.opf.set('version', '3.0')
    self.assertEqual(3, c.opf_version_parsed.major)
    self.assertTrue(len(get_toc(c)))  # detect NCX toc even in epub 3 files
    # FIX: the second literal must also be a bytes literal — implicitly
    # concatenating bytes and str literals is a SyntaxError on Python 3
    c.add_file(
        'nav.html',
        b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">'
        b'<body><nav epub:type="toc"><ol><li><a href="start.xhtml">EPUB 3 nav</a></li></ol></nav></body></html>',
        process_manifest_item=lambda item: item.set('properties', 'nav'))
    toc = get_toc(c)
    self.assertTrue(len(toc))
    self.assertEqual(toc.as_dict['children'][0]['title'], 'EPUB 3 nav')
def __init__(self, path_to_ebook, tdir, log=None, book_hash=None):
    # Explode path_to_ebook into tdir and write the JSON manifest
    # ("calibre-book-manifest.json") that the in-browser viewer reads.
    log = log or default_log
    book_fmt, opfpath, input_fmt = extract_book(path_to_ebook, tdir, log=log)
    ContainerBase.__init__(self, tdir, opfpath, log)
    # Files the viewer must not serve: the OPF itself, NCX tocs,
    # META-INF entries and the zip mimetype marker.
    excluded_names = {
        name for name, mt in self.mime_map.iteritems() if
        name == self.opf_name or mt == guess_type('a.ncx') or
        name.startswith('META-INF/') or name == 'mimetype'
    }
    self.book_render_data = data = {
        'version': RENDER_VERSION,
        'toc':get_toc(self).as_dict,
        'spine':[name for name, is_linear in self.spine_names],
        'link_uid': uuid4(),
        'book_hash': book_hash,
        'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
    }
    # Mark the spine as dirty since we have to ensure it is normalized
    for name in data['spine']:
        self.parsed(name), self.dirty(name)
    self.virtualized_names = set()
    self.virtualize_resources()

    def manifest_data(name):
        # Per-file entry in the manifest: size, virtualization flag, mimetype.
        return {'size':os.path.getsize(self.name_path_map[name]), 'is_virtualized': name in self.virtualized_names, 'mimetype':self.mime_map.get(name)}
    data['files'] = {name:manifest_data(name) for name in set(self.name_path_map) - excluded_names}
    # Commit (writes dirtied spine files to disk) before deleting the
    # excluded files, whose paths are still needed by the container.
    self.commit()
    for name in excluded_names:
        os.remove(self.name_path_map[name])
    with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
        f.write(json.dumps(self.book_render_data, ensure_ascii=False).encode('utf-8'))
def __init__(self, path_to_ebook, tdir, log=None):
    # Explode path_to_ebook into tdir and write a JSON manifest for the
    # viewer. Earlier revision of this class: carries the manifest as a
    # flat 'manifest' name list rather than a per-file 'files' map.
    log = log or default_log
    book_fmt, opfpath, input_fmt = extract_book(path_to_ebook, tdir, log=log)
    ContainerBase.__init__(self, tdir, opfpath, log)
    # OPF, NCX tocs and META-INF entries are excluded from the manifest
    excluded_names = {
        name for name, mt in self.mime_map.iteritems() if
        name == self.opf_name or mt == guess_type('a.ncx') or
        name.startswith('META-INF/')
    }
    self.book_render_data = data = {
        'version': self.RENDER_VERSION,
        'toc':get_toc(self).as_dict,
        'spine':[name for name, is_linear in self.spine_names],
        'link_uid': uuid4(),
        'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
        'manifest': list(set(self.name_path_map) - excluded_names),
    }
    # Mark the spine as dirty since we have to ensure it is normalized
    for name in data['spine']:
        self.parsed(name), self.dirty(name)
    self.virtualize_resources()
    # Commit before deleting excluded files so dirtied content is flushed
    self.commit()
    for name in excluded_names:
        os.remove(self.name_path_map[name])
    with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
        f.write(json.dumps(self.book_render_data, ensure_ascii=False).encode('utf-8'))
def test_toc_detection(self):
    """Verify ToC detection for EPUB 2/3 and ToC generation from XPaths."""
    ep = os.path.join(self.tdir, 'book.epub')
    create_book(Metadata('Test ToC'), ep)
    c = get_container(ep, tdir=os.path.join(self.tdir, 'container'), tweak_mode=True)
    self.assertEqual(2, c.opf_version_parsed.major)
    self.assertTrue(len(get_toc(c)))
    c.opf.set('version', '3.0')
    self.assertEqual(3, c.opf_version_parsed.major)
    self.assertTrue(len(get_toc(c)))  # detect NCX toc even in epub 3 files
    c.add_file(
        'nav.html', b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">'
        b'<body><nav epub:type="toc"><ol><li><a href="start.xhtml">EPUB 3 nav</a></li></ol></nav></body></html>',
        process_manifest_item=lambda item: item.set('properties', 'nav'))
    toc = get_toc(c)
    self.assertTrue(len(toc))
    self.assertEqual(toc.as_dict['children'][0]['title'], 'EPUB 3 nav')

    def tfx(linear, expected):
        # Write a document whose heading tags t1..t4 occur in the order
        # given by `linear`, then check that toc_from_xpaths() nests them
        # as described by the bracket string `expected`.
        items = ['<t{0}>{0}</t{0}>'.format(x) for x in linear]
        html = '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">'
        html += '<body>%s</body></html>' % '\n'.join(items)
        with c.open('nav.html', 'wb') as f:
            f.write(html.encode('utf-8'))
        toc = toc_from_xpaths(c, ['//h:t' + x for x in sorted(set(linear))])

        def p(node):
            # Serialize a ToC subtree as a nested bracket string.
            ans = ''
            if node.children:
                ans += '['
                # FIX: loop variable renamed from `c`, which shadowed the
                # enclosing container variable `c`.
                for child in node.children:
                    ans += child.title + p(child)
                ans += ']'
            return ans
        self.assertEqual('[%s]' % expected, p(toc))

    tfx('121333', '1[2]1[333]')
    tfx('1223424', '1[22[3[4]]2[4]]')
    tfx('32123', '321[2[3]]')
    tfx('123123', '1[2[3]]1[2[3]]')
def test_toc_detection(self):
    """Verify that get_toc() finds a ToC in both EPUB 2 and EPUB 3 containers."""
    ep = os.path.join(self.tdir, 'book.epub')
    create_book(Metadata('Test ToC'), ep)
    c = get_container(ep, tdir=os.path.join(self.tdir, 'container'), tweak_mode=True)
    self.assertEqual(2, c.opf_version_parsed.major)
    self.assertTrue(len(get_toc(c)))
    c.opf.set('version', '3.0')
    self.assertEqual(3, c.opf_version_parsed.major)
    self.assertTrue(len(get_toc(c)))  # detect NCX toc even in epub 3 files
    # FIX: the continuation literal must also be bytes — implicitly
    # concatenating bytes and str literals is a SyntaxError on Python 3
    c.add_file(
        'nav.html',
        b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">'
        b'<body><nav epub:type="toc"><ol><li><a href="start.xhtml">EPUB 3 nav</a></li></ol></nav></body></html>',
        process_manifest_item=lambda item: item.set('properties', 'nav'))
    toc = get_toc(c)
    self.assertTrue(len(toc))
    self.assertEqual(toc.as_dict['children'][0]['title'], 'EPUB 3 nav')
def __init__(self, path_to_ebook, tdir, log=None, book_hash=None):
    # Explode path_to_ebook into tdir, create a cover page, and write the
    # JSON manifest the viewer reads. This revision tracks per-file length
    # and maths usage for HTML spine members.
    log = log or default_log
    book_fmt, opfpath, input_fmt = extract_book(path_to_ebook, tdir, log=log)
    ContainerBase.__init__(self, tdir, opfpath, log)
    # OPF, NCX tocs, META-INF entries and the zip mimetype marker are
    # never served to the viewer.
    excluded_names = {
        name for name, mt in self.mime_map.iteritems() if
        name == self.opf_name or mt == guess_type('a.ncx') or
        name.startswith('META-INF/') or name == 'mimetype'
    }
    raster_cover_name, titlepage_name = self.create_cover_page(input_fmt.lower())
    self.book_render_data = data = {
        'version': RENDER_VERSION,
        'toc':get_toc(self).as_dict,
        'spine':[name for name, is_linear in self.spine_names],
        'link_uid': uuid4(),
        'book_hash': book_hash,
        'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
        'raster_cover_name': raster_cover_name,
        'title_page_name': titlepage_name,
        'has_maths': False,      # updated below while building 'files'
        'total_length': 0,       # accumulated text length of all HTML files
        'spine_length': 0,       # accumulated text length of spine files only
    }
    # Mark the spine as dirty since we have to ensure it is normalized
    for name in data['spine']:
        self.parsed(name), self.dirty(name)
    self.transform_css()
    self.virtualized_names = set()
    self.virtualize_resources()

    def manifest_data(name):
        # Per-file manifest entry; HTML files additionally accumulate the
        # running totals in self.book_render_data as a side effect.
        mt = (self.mime_map.get(name) or 'application/octet-stream').lower()
        ans = {
            'size':os.path.getsize(self.name_path_map[name]),
            'is_virtualized': name in self.virtualized_names,
            'mimetype':mt,
            'is_html': mt in OEB_DOCS,
        }
        if ans['is_html']:
            root = self.parsed(name)
            ans['length'] = l = get_length(root)
            self.book_render_data['total_length'] += l
            if name in data['spine']:
                self.book_render_data['spine_length'] += l
            ans['has_maths'] = hm = check_for_maths(root)
            if hm:
                self.book_render_data['has_maths'] = True
        return ans
    data['files'] = {name:manifest_data(name) for name in set(self.name_path_map) - excluded_names}
    # Commit before removing excluded files so dirtied content is flushed
    self.commit()
    for name in excluded_names:
        os.remove(self.name_path_map[name])
    with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
        f.write(json.dumps(self.book_render_data, ensure_ascii=False).encode('utf-8'))
def test_toc_detection(self):
    """Verify ToC detection for EPUB 2/3 and ToC generation from XPaths."""
    ep = os.path.join(self.tdir, 'book.epub')
    create_book(Metadata('Test ToC'), ep)
    c = get_container(ep, tdir=os.path.join(self.tdir, 'container'), tweak_mode=True)
    self.assertEqual(2, c.opf_version_parsed.major)
    self.assertTrue(len(get_toc(c)))
    c.opf.set('version', '3.0')
    self.assertEqual(3, c.opf_version_parsed.major)
    self.assertTrue(len(get_toc(c)))  # detect NCX toc even in epub 3 files
    # FIX: the continuation literal must also be bytes — implicitly
    # concatenating bytes and str literals is a SyntaxError on Python 3
    c.add_file(
        'nav.html',
        b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">'
        b'<body><nav epub:type="toc"><ol><li><a href="start.xhtml">EPUB 3 nav</a></li></ol></nav></body></html>',
        process_manifest_item=lambda item:item.set('properties', 'nav'))
    toc = get_toc(c)
    self.assertTrue(len(toc))
    self.assertEqual(toc.as_dict['children'][0]['title'], 'EPUB 3 nav')

    def tfx(linear, expected):
        # Build a document with heading tags t1..t4 in the order of
        # `linear` and check the generated ToC nesting equals `expected`.
        items = ['<t{0}>{0}</t{0}>'.format(x) for x in linear]
        html = '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">'
        html += '<body>%s</body></html>' % '\n'.join(items)
        with c.open('nav.html', 'wb') as f:
            f.write(html.encode('utf-8'))
        toc = toc_from_xpaths(c, ['//h:t'+x for x in sorted(set(linear))])

        def p(node):
            # Serialize a ToC subtree as a nested bracket string.
            ans = ''
            if node.children:
                ans += '['
                # FIX: loop variable renamed from `c`, which shadowed the
                # enclosing container variable `c`.
                for child in node.children:
                    ans += child.title + p(child)
                ans += ']'
            return ans
        self.assertEqual('[%s]'%expected, p(toc))

    tfx('121333', '1[2]1[333]')
    tfx('1223424', '1[22[3[4]]2[4]]')
    tfx('32123', '321[2[3]]')
    tfx('123123', '1[2[3]]1[2[3]]')
def epub_2_to_3(container, report, previous_nav=None):
    """Upgrade an EPUB 2 container in place to EPUB 3.

    Removes any legacy NCX toc and <guide> elements, generates an EPUB 3
    nav document from the existing ToC and landmarks, and bumps the OPF
    version to 3.0.
    """
    upgrade_metadata(container.opf)
    collect_properties(container)
    existing_toc = get_toc(container)
    ncx_name = find_existing_ncx_toc(container)
    if ncx_name:
        # Drop the legacy NCX file and the spine's reference to it
        container.remove_item(ncx_name)
        spine = container.opf_xpath('./opf:spine')[0]
        spine.attrib.pop('toc', None)
    # Landmarks must be collected before the <guide> elements are removed
    landmarks = get_landmarks(container)
    for guide_elem in container.opf_xpath('./opf:guide'):
        guide_elem.getparent().remove(guide_elem)
    create_nav(container, existing_toc, landmarks, previous_nav)
    container.opf.set('version', '3.0')
    fix_font_mime_types(container)
    container.dirty(container.opf_name)
def build(self):
    """Rebuild the tree widget from the current container's ToC."""
    container = current_container()
    if container is None:
        return
    root_toc = get_toc(container, verify_destinations=False)

    def add_children(toc_node, parent_item):
        # Recursively mirror the ToC tree as QTreeWidgetItems
        for child in toc_node:
            item = QTreeWidgetItem(parent_item)
            item.setText(0, child.title or '')
            item.setData(0, DEST_ROLE, child.dest or '')
            item.setData(0, FRAG_ROLE, child.frag or '')
            tooltip = _('File: {0}\nAnchor: {1}').format(
                child.dest or '', child.frag or _('Top of file'))
            item.setData(0, Qt.ToolTipRole, tooltip)
            add_children(child, item)

    self.view.clear()
    add_children(root_toc, self.view.invisibleRootItem())
def __call__(self, ebook):
    """Load *ebook* into the widget and rebuild its ToC tree."""
    self.ebook = ebook
    if not isinstance(ebook, AZW3Container):
        self.item_view.hide_azw3_warning()
    self.toc = get_toc(self.ebook)
    self.toc_lang = self.toc.lang
    self.toc_uid = self.toc.uid
    # Status icons keyed by verification state: None=unknown, True=ok, False=error
    self.blank = QIcon(I('blank.png'))
    self.ok = QIcon(I('ok.png'))
    self.err = QIcon(I('dot_red.png'))
    self.icon_map = {None: self.blank, True: self.ok, False: self.err}

    def populate(toc_node, parent_item):
        # Depth-first creation of tree items mirroring the ToC
        for entry in toc_node:
            populate(entry, self.create_item(parent_item, entry))

    root = self.root = self.tocw.invisibleRootItem()
    root.setData(0, Qt.UserRole, self.toc)
    populate(self.toc, root)
    self.tocw.model().dataChanged.connect(self.data_changed)
    self.tocw.currentItemChanged.connect(self.current_item_changed)
    self.tocw.setCurrentItem(None)
def __call__(self, ebook):
    """Display the table of contents of *ebook* in the tree view."""
    self.ebook = ebook
    if not isinstance(ebook, AZW3Container):
        self.item_view.hide_azw3_warning()
    self.toc = get_toc(self.ebook)
    self.toc_lang, self.toc_uid = self.toc.lang, self.toc.uid
    # Icons for the three verification states of a ToC entry
    self.blank, self.ok, self.err = (
        QIcon(I('blank.png')), QIcon(I('ok.png')), QIcon(I('dot_red.png')))
    self.icon_map = {None: self.blank, True: self.ok, False: self.err}

    def build_subtree(toc_node, parent):
        for child in toc_node:
            item = self.create_item(parent, child)
            build_subtree(child, item)

    root = self.root = self.tocw.invisibleRootItem()
    root.setData(0, Qt.UserRole, self.toc)
    build_subtree(self.toc, root)
    self.tocw.model().dataChanged.connect(self.data_changed)
    self.tocw.currentItemChanged.connect(self.current_item_changed)
    self.tocw.setCurrentItem(None)
def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None, report_progress=lambda x, y: None):
    # Render the book described by opf_path to PDF. Returns the PDF bytes
    # when output_path is None, otherwise writes them to output_path.
    container = Container(opf_path, log)
    fix_markup(container)
    report_progress(0.05, _('Parsed all content for markup transformation'))
    if opts.pdf_hyphenate:
        from calibre.ebooks.oeb.polish.hyphenation import add_soft_hyphens
        add_soft_hyphens(container)
    has_maths = add_maths_script(container)
    fix_fullscreen_images(container)
    name_anchor_map = make_anchors_unique(container, log)
    margin_files = tuple(create_margin_files(container))
    toc = get_toc(container, verify_destinations=False)
    has_toc = toc and len(toc)
    links_page_uuid = add_all_links(container, margin_files)
    container.commit()
    report_progress(0.1, _('Completed markup transformation'))
    manager = RenderManager(opts, log, container.root)
    page_layout = get_page_layout(opts)
    pdf_doc = None
    anchor_locations = {}
    jobs = []
    for margin_file in margin_files:
        jobs.append(job_for_name(container, margin_file.name, margin_file.margins, page_layout))
    results = manager.convert_html_files(jobs, settle_time=1, has_maths=has_maths)
    num_pages = 0
    page_margins_map = []
    # Stitch the per-file PDFs together in spine order, recording anchor
    # locations and per-page margins as we go.
    for margin_file in margin_files:
        name = margin_file.name
        data = results[name]
        if not isinstance(data, bytes):
            # A non-bytes result is an error message from the worker
            raise SystemExit(data)
        doc = data_as_pdf_doc(data)
        anchor_locations.update(get_anchor_locations(name, doc, num_pages + 1, links_page_uuid, log))
        doc_pages = doc.page_count()
        page_margins_map.extend(repeat(resolve_margins(margin_file.margins, page_layout), doc_pages))
        num_pages += doc_pages
        if pdf_doc is None:
            pdf_doc = doc
        else:
            pdf_doc.append(doc)
    page_number_display_map = get_page_number_display_map(manager, opts, num_pages, log)
    if has_toc:
        annotate_toc(toc, anchor_locations, name_anchor_map, log)
        if opts.pdf_add_toc:
            # Render an inline ToC as extra pages appended to the PDF
            tocname = create_skeleton(container)
            root = container.parsed(tocname)
            add_pagenum_toc(root, toc, opts, page_number_display_map)
            container.commit()
            jobs = [job_for_name(container, tocname, None, page_layout)]
            results = manager.convert_html_files(jobs, settle_time=1)
            tocdoc = data_as_pdf_doc(results[tocname])
            page_margins_map.extend(repeat(resolve_margins(None, page_layout), tocdoc.page_count()))
            pdf_doc.append(tocdoc)
    report_progress(0.7, _('Rendered all HTML as PDF'))
    fix_links(pdf_doc, anchor_locations, name_anchor_map, opts.pdf_mark_links, log)
    if toc and len(toc):
        add_toc(PDFOutlineRoot(pdf_doc), toc)
    report_progress(0.75, _('Added links to PDF content'))
    pdf_metadata = PDFMetadata(metadata)
    add_header_footer(
        manager, opts, pdf_doc, container, page_number_display_map,
        page_layout, page_margins_map, pdf_metadata, report_progress,
        toc if has_toc else None)
    # Post-processing: shrink the PDF by merging/removing font data
    merge_fonts(pdf_doc, log)
    num_removed = dedup_type3_fonts(pdf_doc)
    if num_removed:
        log('Removed', num_removed, 'duplicated Type3 glyphs')
    num_removed = remove_unused_fonts(pdf_doc)
    if num_removed:
        log('Removed', num_removed, 'unused fonts')
    # Needed because of https://bugreports.qt.io/browse/QTBUG-88976
    subset_fonts(pdf_doc, log)
    num_removed = pdf_doc.dedup_images()
    if num_removed:
        log('Removed', num_removed, 'duplicate images')
    if opts.pdf_odd_even_offset:
        # Shift the CropBox horizontally, alternating direction for
        # odd/even pages (gutter offset for double-sided printing).
        for i in range(1, pdf_doc.page_count()):
            margins = page_margins_map[i]
            mult = -1 if i % 2 else 1
            val = opts.pdf_odd_even_offset
            if abs(val) < min(margins.left, margins.right):
                box = list(pdf_doc.get_page_box("CropBox", i))
                box[0] += val * mult
                pdf_doc.set_page_box("CropBox", i, *box)
    if cover_data:
        add_cover(pdf_doc, cover_data, page_layout, opts)
    if metadata is not None:
        update_metadata(pdf_doc, pdf_metadata)
    report_progress(1, _('Updated metadata in PDF'))
    if opts.uncompressed_pdf:
        pdf_doc.uncompress()
    pdf_data = pdf_doc.write()
    if output_path is None:
        return pdf_data
    with open(output_path, 'wb') as f:
        f.write(pdf_data)
def process_exploded_book(
    book_fmt, opfpath, input_fmt, tdir, render_manager, log=None, book_hash=None,
    save_bookmark_data=False, book_metadata=None, virtualize_resources=True
):
    # Prepare an already-exploded book (rooted at tdir) for the viewer:
    # render/virtualize its resources via render_manager workers and write
    # the JSON manifest. Returns (container, bookmark_data).
    log = log or default_log
    container = SimpleContainer(tdir, opfpath, log)
    input_plugin = plugin_for_input_format(input_fmt)
    is_comic = bool(getattr(input_plugin, 'is_image_collection', False))

    def needs_work(mt):
        # Only styles, HTML documents and SVG need processing by workers
        return mt in OEB_STYLES or mt in OEB_DOCS or mt == 'image/svg+xml'

    def work_priority(name):
        # ensure workers with large files or stylesheets
        # have the less names
        size = os.path.getsize(container.name_path_map[name]),
        is_html = container.mime_map.get(name) in OEB_DOCS
        return (0 if is_html else 1), size

    if not is_comic:
        render_manager.launch_workers(
            tuple(n for n, mt in iteritems(container.mime_map) if needs_work(mt)), container)

    bookmark_data = None
    if save_bookmark_data:
        bm_file = 'META-INF/calibre_bookmarks.txt'
        if container.exists(bm_file):
            with container.open(bm_file, 'rb') as f:
                bookmark_data = f.read()

    # We do not add zero byte sized files as the IndexedDB API in the
    # browser has no good way to distinguish between zero byte files and
    # load failures.
    excluded_names = {
        name for name, mt in iteritems(container.mime_map) if
        name == container.opf_name or mt == guess_type('a.ncx') or
        name.startswith('META-INF/') or name == 'mimetype' or
        not container.has_name_and_is_not_empty(name)
    }
    raster_cover_name, titlepage_name = create_cover_page(
        container, input_fmt.lower(), is_comic, book_metadata)

    toc = get_toc(container, verify_destinations=False).to_dict(count())
    if not toc or not toc.get('children'):
        # No declared ToC: fall back to generating one from headings
        toc = from_xpaths(container, ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
    spine = [name for name, is_linear in container.spine_names]
    spineq = frozenset(spine)
    landmarks = [l for l in get_landmarks(container) if l['dest'] in spineq]

    book_render_data = {
        'version': RENDER_VERSION,
        'toc': toc,
        'book_format': book_fmt,
        'spine': spine,
        'link_uid': uuid4(),
        'book_hash': book_hash,
        'is_comic': is_comic,
        'raster_cover_name': raster_cover_name,
        'title_page_name': titlepage_name,
        'has_maths': False,      # filled in below from worker results
        'total_length': 0,       # accumulated length of all HTML files
        'spine_length': 0,       # accumulated length of spine files only
        'toc_anchor_map': toc_anchor_map(toc),
        'landmarks': landmarks,
        'link_to_map': {},
    }
    names = sorted(
        (n for n, mt in iteritems(container.mime_map) if needs_work(mt)),
        key=work_priority)
    results = render_manager(
        names, (tdir, opfpath, virtualize_resources, book_render_data['link_uid'], container.data_for_clone()),
        container)
    ltm = book_render_data['link_to_map']
    html_data = {}
    virtualized_names = set()

    def merge_ltm(dest, src):
        # Union per-anchor link sets from a worker into the global map
        for k, v in iteritems(src):
            if k in dest:
                dest[k] |= v
            else:
                dest[k] = v

    for link_to_map, hdata, vnames in results:
        html_data.update(hdata)
        virtualized_names |= vnames
        for k, v in iteritems(link_to_map):
            if k in ltm:
                merge_ltm(ltm[k], v)
            else:
                ltm[k] = v

    def manifest_data(name):
        # Per-file manifest entry; HTML files also accumulate the running
        # totals in book_render_data as a side effect.
        mt = (container.mime_map.get(name) or 'application/octet-stream').lower()
        ans = {
            'size': os.path.getsize(container.name_path_map[name]),
            'is_virtualized': name in virtualized_names,
            'mimetype': mt,
            'is_html': mt in OEB_DOCS,
        }
        if ans['is_html']:
            data = html_data[name]
            ans['length'] = l = data['length']
            book_render_data['total_length'] += l
            if name in book_render_data['spine']:
                book_render_data['spine_length'] += l
            ans['has_maths'] = hm = data['has_maths']
            if hm:
                book_render_data['has_maths'] = True
            ans['anchor_map'] = data['anchor_map']
        return ans

    book_render_data['files'] = {
        name: manifest_data(name) for name in set(container.name_path_map) - excluded_names
    }
    # Commit before deleting excluded files so dirtied content is flushed
    container.commit()
    for name in excluded_names:
        os.remove(container.name_path_map[name])
    ltm = book_render_data['link_to_map']
    for name, amap in iteritems(ltm):
        for k, v in tuple(iteritems(amap)):
            amap[k] = tuple(v)  # needed for JSON serialization
    data = as_bytes(json.dumps(book_render_data, ensure_ascii=False))
    with lopen(os.path.join(container.root, 'calibre-book-manifest.json'), 'wb') as f:
        f.write(data)
    return container, bookmark_data
def set_metadata_toc(container, language, criteria, changed_files, converter):
    """Convert the OPF metadata and ToC text of *container* using *converter*.

    Returns True if either the metadata or TOC files changed.
    changed_files is updated in place with the names of the dirtied files.
    """
    opfChanged = False
    tocChanged = False
    # List of dc items in OPF file that get a simple text replacement.
    # Add more items to this list if needed.
    # FIX: a missing comma between dc:subject and dc:contributor caused
    # implicit string concatenation, fusing them into one bogus XPath so
    # neither field was ever converted.
    dc_list = ['//opf:metadata/dc:title',
               '//opf:metadata/dc:description',
               '//opf:metadata/dc:publisher',
               '//opf:metadata/dc:subject',
               '//opf:metadata/dc:contributor',
               '//opf:metadata/dc:coverage',
               '//opf:metadata/dc:rights']
    # Update the OPF metadata.
    # The language and creator fields are special.
    # Only update the dc language if the original language was a Chinese
    # type and the book is an epub.
    if container.book_type == u'epub':
        items = container.opf_xpath('//opf:metadata/dc:language')
        if len(items) > 0:
            for item in items:
                old_item = item.text
                if re.search(r'zh-\w+|zh', item.text, flags=re.IGNORECASE) is not None:
                    item.text = language
                if item.text != old_item:
                    opfChanged = True
    # Update the creator text and file-as attribute
    items = container.opf_xpath('//opf:metadata/dc:creator')
    if len(items) > 0:
        for item in items:
            old_item = item.text
            if item.text is not None:
                item.text = converter.convert(item.text)
                if item.text != old_item:
                    opfChanged = True
            for attribute in item.attrib:  # update file-as attribute
                item.attrib[attribute] = converter.convert(item.attrib[attribute])
    # Update the remaining dc items using a loop
    for dc_item in dc_list:
        items = container.opf_xpath(dc_item)
        if len(items) > 0:
            for item in items:
                old_item = item.text
                if item.text is not None:
                    item.text = converter.convert(item.text)
                    if item.text != old_item:
                        opfChanged = True
    # Update the TOC - Do this after modifying the OPF data.
    # Just grab all <text> fields (AKA "title" attribute in a TOC object)
    # and convert to the desired Chinese. Let Calibre set the title and
    # language automatically from the OPF file modified earlier.
    book_toc = get_toc(container)
    for item in book_toc.iterdescendants():
        if item.title is not None:
            old_title = item.title
            item.title = converter.convert(item.title)
            if old_title != item.title:
                tocChanged = True
    # Update the files with the changes
    if tocChanged:
        commit_toc(container, book_toc)
        container.dirty(book_toc.toc_file_name)
        changed_files.append(book_toc.toc_file_name)
    if opfChanged:
        container.dirty(container.opf_name)
        changed_files.append(container.opf_name)
    return tocChanged or opfChanged
def __init__(self, path_to_ebook, tdir, log=None, book_hash=None):
    # Explode path_to_ebook into tdir, create a cover page, and write the
    # JSON manifest for the viewer; this revision also records per-file
    # anchor maps, a toc_anchor_map and guide landmarks.
    log = log or default_log
    book_fmt, opfpath, input_fmt = extract_book(path_to_ebook, tdir, log=log)
    ContainerBase.__init__(self, tdir, opfpath, log)
    # OPF, NCX tocs, META-INF entries and the zip mimetype marker are
    # never served to the viewer.
    excluded_names = {
        name for name, mt in self.mime_map.iteritems() if
        name == self.opf_name or mt == guess_type('a.ncx') or
        name.startswith('META-INF/') or name == 'mimetype'
    }
    raster_cover_name, titlepage_name = self.create_cover_page(
        input_fmt.lower())
    toc = get_toc(self).to_dict(count())
    spine = [name for name, is_linear in self.spine_names]
    spineq = frozenset(spine)
    # Only keep landmarks whose destination is actually in the spine
    landmarks = [l for l in get_landmarks(self) if l['dest'] in spineq]
    self.book_render_data = data = {
        'version': RENDER_VERSION,
        'toc': toc,
        'spine': spine,
        'link_uid': uuid4(),
        'book_hash': book_hash,
        'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
        'raster_cover_name': raster_cover_name,
        'title_page_name': titlepage_name,
        'has_maths': False,      # updated below while building 'files'
        'total_length': 0,       # accumulated length of all HTML files
        'spine_length': 0,       # accumulated length of spine files only
        'toc_anchor_map': toc_anchor_map(toc),
        'landmarks': landmarks,
        'link_to_map': {},
    }
    # Mark the spine as dirty since we have to ensure it is normalized
    for name in data['spine']:
        self.parsed(name), self.dirty(name)
    self.transform_css()
    self.virtualized_names = set()
    self.virtualize_resources()

    def manifest_data(name):
        # Per-file manifest entry; HTML files also accumulate the running
        # totals in self.book_render_data as a side effect.
        mt = (self.mime_map.get(name) or 'application/octet-stream').lower()
        ans = {
            'size': os.path.getsize(self.name_path_map[name]),
            'is_virtualized': name in self.virtualized_names,
            'mimetype': mt,
            'is_html': mt in OEB_DOCS,
        }
        if ans['is_html']:
            root = self.parsed(name)
            ans['length'] = l = get_length(root)
            self.book_render_data['total_length'] += l
            if name in data['spine']:
                self.book_render_data['spine_length'] += l
            ans['has_maths'] = hm = check_for_maths(root)
            if hm:
                self.book_render_data['has_maths'] = True
            ans['anchor_map'] = anchor_map(root)
        return ans
    data['files'] = {
        name: manifest_data(name) for name in set(self.name_path_map) - excluded_names
    }
    # Commit before deleting excluded files so dirtied content is flushed
    self.commit()
    for name in excluded_names:
        os.remove(self.name_path_map[name])
    with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
        f.write(
            json.dumps(self.book_render_data, ensure_ascii=False).encode('utf-8'))
def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, cover_data=None, report_progress=lambda x, y: None):
    # Render the book described by opf_path to PDF. Returns the PDF bytes
    # when output_path is None, otherwise writes them to output_path.
    # Earlier revision: no markup fixing/hyphenation/font subsetting or
    # odd/even page offset handling.
    container = Container(opf_path, log)
    report_progress(0.05, _('Parsed all content for markup transformation'))
    has_maths = add_maths_script(container)
    fix_fullscreen_images(container)
    name_anchor_map = make_anchors_unique(container, log)
    margin_files = tuple(create_margin_files(container))
    toc = get_toc(container, verify_destinations=False)
    has_toc = toc and len(toc)
    links_page_uuid = add_all_links(container, margin_files)
    container.commit()
    report_progress(0.1, _('Completed markup transformation'))
    manager = RenderManager(opts, log, container.root)
    page_layout = get_page_layout(opts)
    pdf_doc = None
    anchor_locations = {}
    jobs = []
    for margin_file in margin_files:
        jobs.append(job_for_name(container, margin_file.name, margin_file.margins, page_layout))
    results = manager.convert_html_files(jobs, settle_time=1, has_maths=has_maths)
    num_pages = 0
    page_margins_map = []
    # Stitch the per-file PDFs together in spine order, recording anchor
    # locations and per-page margins as we go.
    for margin_file in margin_files:
        name = margin_file.name
        data = results[name]
        if not isinstance(data, bytes):
            # A non-bytes result is an error message from the worker
            raise SystemExit(data)
        doc = data_as_pdf_doc(data)
        anchor_locations.update(get_anchor_locations(doc, num_pages + 1, links_page_uuid))
        doc_pages = doc.page_count()
        page_margins_map.extend(repeat(resolve_margins(margin_file.margins, page_layout), doc_pages))
        num_pages += doc_pages
        if pdf_doc is None:
            pdf_doc = doc
        else:
            pdf_doc.append(doc)
    page_number_display_map = get_page_number_display_map(manager, opts, num_pages, log)
    if has_toc:
        annotate_toc(toc, anchor_locations, name_anchor_map, log)
        if opts.pdf_add_toc:
            # Render an inline ToC as extra pages appended to the PDF
            tocname = create_skeleton(container)
            root = container.parsed(tocname)
            add_pagenum_toc(root, toc, opts, page_number_display_map)
            container.commit()
            jobs = [job_for_name(container, tocname, None, page_layout)]
            results = manager.convert_html_files(jobs, settle_time=1)
            tocdoc = data_as_pdf_doc(results[tocname])
            page_margins_map.extend(repeat(resolve_margins(None, page_layout), tocdoc.page_count()))
            pdf_doc.append(tocdoc)
    report_progress(0.7, _('Rendered all HTML as PDF'))
    fix_links(pdf_doc, anchor_locations, name_anchor_map, opts.pdf_mark_links, log)
    if toc and len(toc):
        add_toc(PDFOutlineRoot(pdf_doc), toc)
    report_progress(0.75, _('Added links to PDF content'))
    pdf_metadata = PDFMetadata(metadata)
    add_header_footer(
        manager, opts, pdf_doc, container, page_number_display_map,
        page_layout, page_margins_map, pdf_metadata, report_progress,
        toc if has_toc else None)
    # Post-processing: shrink the PDF by merging/removing font data
    merge_fonts(pdf_doc)
    num_removed = dedup_type3_fonts(pdf_doc)
    if num_removed:
        log('Removed', num_removed, 'duplicated Type3 glyphs')
    num_removed = remove_unused_fonts(pdf_doc)
    if num_removed:
        log('Removed', num_removed, 'unused fonts')
    num_removed = pdf_doc.dedup_images()
    if num_removed:
        log('Removed', num_removed, 'duplicate images')
    if cover_data:
        add_cover(pdf_doc, cover_data, page_layout, opts)
    if metadata is not None:
        update_metadata(pdf_doc, pdf_metadata)
    report_progress(1, _('Updated metadata in PDF'))
    if opts.uncompressed_pdf:
        pdf_doc.uncompress()
    pdf_data = pdf_doc.write()
    if output_path is None:
        return pdf_data
    with open(output_path, 'wb') as f:
        f.write(pdf_data)
def __init__(self, path_to_ebook, tdir, log=None, book_hash=None):
    # Explode path_to_ebook into tdir, create a cover page, and write the
    # JSON manifest for the viewer. This revision skips empty files and
    # uses the polyglot iteritems() helper.
    log = log or default_log
    book_fmt, opfpath, input_fmt = extract_book(path_to_ebook, tdir, log=log)
    ContainerBase.__init__(self, tdir, opfpath, log)
    # We do not add zero byte sized files as the IndexedDB API in the
    # browser has no good way to distinguish between zero byte files and
    # load failures.
    excluded_names = {
        name for name, mt in iteritems(self.mime_map) if
        name == self.opf_name or mt == guess_type('a.ncx') or
        name.startswith('META-INF/') or name == 'mimetype' or
        not self.has_name_and_is_not_empty(name)}
    raster_cover_name, titlepage_name = self.create_cover_page(input_fmt.lower())
    toc = get_toc(self).to_dict(count())
    spine = [name for name, is_linear in self.spine_names]
    spineq = frozenset(spine)
    # Only keep landmarks whose destination is actually in the spine
    landmarks = [l for l in get_landmarks(self) if l['dest'] in spineq]
    self.book_render_data = data = {
        'version': RENDER_VERSION,
        'toc':toc,
        'spine':spine,
        'link_uid': uuid4(),
        'book_hash': book_hash,
        'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
        'raster_cover_name': raster_cover_name,
        'title_page_name': titlepage_name,
        'has_maths': False,      # updated below while building 'files'
        'total_length': 0,       # accumulated length of all HTML files
        'spine_length': 0,       # accumulated length of spine files only
        'toc_anchor_map': toc_anchor_map(toc),
        'landmarks': landmarks,
        'link_to_map': {},
    }
    # Mark the spine as dirty since we have to ensure it is normalized
    for name in data['spine']:
        self.parsed(name), self.dirty(name)
    self.transform_css()
    self.virtualized_names = set()
    self.virtualize_resources()

    def manifest_data(name):
        # Per-file manifest entry; HTML files also accumulate the running
        # totals in self.book_render_data as a side effect.
        mt = (self.mime_map.get(name) or 'application/octet-stream').lower()
        ans = {
            'size':os.path.getsize(self.name_path_map[name]),
            'is_virtualized': name in self.virtualized_names,
            'mimetype':mt,
            'is_html': mt in OEB_DOCS,
        }
        if ans['is_html']:
            root = self.parsed(name)
            ans['length'] = l = get_length(root)
            self.book_render_data['total_length'] += l
            if name in data['spine']:
                self.book_render_data['spine_length'] += l
            ans['has_maths'] = hm = check_for_maths(root)
            if hm:
                self.book_render_data['has_maths'] = True
            ans['anchor_map'] = anchor_map(root)
        return ans
    data['files'] = {name:manifest_data(name) for name in set(self.name_path_map) - excluded_names}
    # Commit before deleting excluded files so dirtied content is flushed
    self.commit()
    for name in excluded_names:
        os.remove(self.name_path_map[name])
    data = json.dumps(self.book_render_data, ensure_ascii=False)
    if not isinstance(data, bytes):
        # Python 2/3 compatibility: json.dumps returns text on Python 3
        data = data.encode('utf-8')
    with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
        f.write(data)
def __init__(self, path_to_ebook, tdir, log=None, book_hash=None, save_bookmark_data=False, book_metadata=None):
    # Explode path_to_ebook into tdir and write the JSON manifest for the
    # viewer. This revision can optionally preserve calibre bookmark data
    # and falls back to generating a ToC from headings when none exists.
    log = log or default_log
    book_fmt, opfpath, input_fmt = extract_book(path_to_ebook, tdir, log=log)
    ContainerBase.__init__(self, tdir, opfpath, log)
    self.book_metadata = book_metadata
    if save_bookmark_data:
        bm_file = 'META-INF/calibre_bookmarks.txt'
        self.bookmark_data = None
        if self.exists(bm_file):
            with self.open(bm_file, 'rb') as f:
                self.bookmark_data = f.read()
    # We do not add zero byte sized files as the IndexedDB API in the
    # browser has no good way to distinguish between zero byte files and
    # load failures.
    excluded_names = {
        name for name, mt in iteritems(self.mime_map) if
        name == self.opf_name or mt == guess_type('a.ncx') or
        name.startswith('META-INF/') or name == 'mimetype' or
        not self.has_name_and_is_not_empty(name)
    }
    raster_cover_name, titlepage_name = self.create_cover_page(
        input_fmt.lower())
    toc = get_toc(self).to_dict(count())
    if not toc or not toc.get('children'):
        # No declared ToC: fall back to generating one from headings
        toc = from_xpaths(self, ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
    spine = [name for name, is_linear in self.spine_names]
    spineq = frozenset(spine)
    # Only keep landmarks whose destination is actually in the spine
    landmarks = [l for l in get_landmarks(self) if l['dest'] in spineq]
    self.book_render_data = data = {
        'version': RENDER_VERSION,
        'toc': toc,
        'book_format': book_fmt,
        'spine': spine,
        'link_uid': uuid4(),
        'book_hash': book_hash,
        'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
        'raster_cover_name': raster_cover_name,
        'title_page_name': titlepage_name,
        'has_maths': False,      # updated below while building 'files'
        'total_length': 0,       # accumulated length of all HTML files
        'spine_length': 0,       # accumulated length of spine files only
        'toc_anchor_map': toc_anchor_map(toc),
        'landmarks': landmarks,
        'link_to_map': {},
    }
    # Mark the spine as dirty since we have to ensure it is normalized
    for name in data['spine']:
        self.parsed(name), self.dirty(name)
    self.transform_css()
    self.virtualized_names = set()
    self.virtualize_resources()

    def manifest_data(name):
        # Per-file manifest entry; HTML files also accumulate the running
        # totals in self.book_render_data as a side effect.
        mt = (self.mime_map.get(name) or 'application/octet-stream').lower()
        ans = {
            'size': os.path.getsize(self.name_path_map[name]),
            'is_virtualized': name in self.virtualized_names,
            'mimetype': mt,
            'is_html': mt in OEB_DOCS,
        }
        if ans['is_html']:
            root = self.parsed(name)
            ans['length'] = l = get_length(root)
            self.book_render_data['total_length'] += l
            if name in data['spine']:
                self.book_render_data['spine_length'] += l
            ans['has_maths'] = hm = check_for_maths(root)
            if hm:
                self.book_render_data['has_maths'] = True
            ans['anchor_map'] = anchor_map(root)
        return ans
    data['files'] = {
        name: manifest_data(name) for name in set(self.name_path_map) - excluded_names
    }
    # Commit before deleting excluded files so dirtied content is flushed
    self.commit()
    for name in excluded_names:
        os.remove(self.name_path_map[name])
    data = json.dumps(self.book_render_data, ensure_ascii=False)
    if not isinstance(data, bytes):
        # Python 2/3 compatibility: json.dumps returns text on Python 3
        data = data.encode('utf-8')
    with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
        f.write(data)