def filter_chapter(self, html, new_css_item):
    '''Each chapter is a single HTML document.'''
    # add the CSS file to every HTML document
    rel_css_dir = os.path.relpath('.', os.path.dirname(html.get_name()))
    rel_css_file_name = os.path.join(rel_css_dir, self.new_css_filename)
    html.add_item(epub.EpubItem(file_name=rel_css_file_name, media_type='text/css'))
    rel_image_dir = os.path.relpath(self.font_image_dir, os.path.dirname(html.get_name()))

    # generate the <link> entries for this HTML document
    self.init_links_of_html(html)

    # only process the text inside the body
    html_tree = parse_html_string(html.get_body_content())
    root = html_tree.getroottree()
    build_text_list = etree.XPath("//text()")
    text_list = build_text_list(root)

    for text in text_list:
        # find the positions of uncommon characters in this text node
        pos_list = self.find_uncommon_words_in_one_text(text)
        self.add_image_tag_for_uncommon_words_in_one_text(text, pos_list, rel_image_dir)

    # write root back into html.content, otherwise the changes are not saved
    ori_root = parse_html_string(html.content)
    # remove the old body
    body = ori_root.find('body')
    ori_root.remove(body)
    # append the new body
    ori_root.append(root.find('body'))
    html.content = etree.tostring(ori_root, pretty_print=True, encoding='utf-8', xml_declaration=True)
def html_after_read(self, book, chapter):
    try:
        tree = parse_html_string(chapter.content)
    except:
        return

    root = tree.getroottree()

    if len(root.find("head")) != 0:
        head = tree.find("head")

        title = head.find("title")

        if title is not None:
            chapter.title = title.text

    if len(root.find("body")) != 0:
        body = tree.find("body")

        # todo:
        # - fix <a href="">
        # - fix ....

        for _item in body.iter():
            for t in self.remove_attributes:
                if t in _item.attrib:
                    del _item.attrib[t]

    chapter.content = etree.tostring(tree, pretty_print=True, encoding="utf-8", xml_declaration=True)
def _parse_nav(self, data, base_path):
    html_node = parse_html_string(data)
    nav_node = html_node.xpath("//nav[@*='toc']")[0]

    def parse_list(list_node):
        items = []

        for item_node in list_node.findall("li"):
            sublist_node = item_node.find("ol")
            link_node = item_node.find("a")

            if sublist_node is not None:
                title = item_node[0].text
                children = parse_list(sublist_node)

                if link_node is not None:
                    href = zip_path.normpath(zip_path.join(base_path, link_node.get("href")))
                    items.append((Section(title, href=href), children))
                else:
                    items.append((Section(title), children))
            elif link_node is not None:
                title = link_node.text
                href = zip_path.normpath(zip_path.join(base_path, link_node.get("href")))

                items.append(Link(href, title))

        return items

    self.book.toc = parse_list(nav_node.find("ol"))
def get_body_content(self):
    """
    Returns content of BODY element for this HTML document. Content will be of type 'str' (Python 2)
    or 'bytes' (Python 3).

    :Returns:
      Returns content of this document.
    """
    content = self.get_content()

    try:
        html_tree = parse_html_string(self.content)
    except:
        return ''

    html_root = html_tree.getroottree()

    if len(html_root.find('body')) != 0:
        body = html_tree.find('body')

        tree_str = etree.tostring(body, pretty_print=True, encoding='utf-8', xml_declaration=False)

        # this is so stupid
        if tree_str.startswith(six.b('<body>')):
            n = tree_str.rindex(six.b('</body>'))

            return tree_str[7:n]

        return tree_str

    return ''
def _reformat_endnotes(content):
    try:
        tree = parse_html_string(content.encode('utf-8'))
    except Exception as err:
        logger.error('Error parsing chapter content {err}'.format(err=err))
        return content

    for elem in tree.iter():
        # remove endnotes without reference
        if elem.tag == 'ol' and elem.get('class') == 'endnotes':
            for li in elem.xpath("//li[@class='orphan-endnote']"):
                li.drop_tree()

        # insert internal link to endnote's body into the sup
        elif elem.tag == 'sup' and elem.get('data-id'):
            a = etree.Element("a")
            a.set('href', '#endnote-{0}'.format(elem.get('data-id')))
            a.text = elem.text

            elem.text = ''
            elem.insert(0, a)

    content = etree.tostring(tree, method='html', encoding='utf-8', xml_declaration=False)
    content = content.replace('<html><body>', '').replace('</body></html>', '')

    return content
def remove_unknown_tags(html_content):
    """
    Remove unknown tags from a given html content string.
    This method is based on a method of Cleaner class on lxml.html module
    """
    from lxml.html import defs

    try:
        tree = parse_html_string(html_content)
    except Exception as err:
        logger.error("RemoveUnknownTags: Problem while trying to parse content %s" % err)
        # nothing to clean up if the content could not be parsed
        return html_content

    allow_tags = set(defs.tags)

    if allow_tags:
        bad = []

        for el in tree.iter():
            if el.tag not in allow_tags:
                bad.append(el)

        if bad:
            if bad[0] is tree:
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()

            for el in bad:
                el.drop_tag()

    return etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)
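# A minimal usage sketch for remove_unknown_tags above. It assumes the function's module
# already provides parse_html_string (ebooklib.utils), etree (lxml) and a module-level
# `logger`; the sample markup and the <custom> tag are invented for illustration only.
sample = '<p>kept text <custom data-x="1">unknown tag is unwrapped, text is kept</custom></p>'
cleaned = remove_unknown_tags(sample)
print(cleaned)
# expected: the <custom> element is unwrapped while its text stays inside the paragraph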
def get_body_content(self):
    content = self.get_content()

    try:
        html_tree = parse_html_string(self.content)
    except:
        return ''

    html_root = html_tree.getroottree()

    if len(html_root.find('body')) != 0:
        body = html_tree.find('body')

        if sys.version_info >= (3, 0):
            tree_str = etree.tostring(body, encoding='unicode')
        else:
            tree_str = etree.tostring(body, pretty_print=True, encoding='utf-8', xml_declaration=False)

        # this is so stupid
        if tree_str.startswith('<body>'):
            n = tree_str.rindex('</body>')

            return tree_str[7:n]

        return tree_str

    return ''
def html_after_read(self, book, chapter):
    try:
        tree = parse_html_string(chapter.content)
    except:
        return

    root = tree.getroottree()

    if len(root.find('head')) != 0:
        head = tree.find('head')

        title = head.find('title')

        if title is not None:
            chapter.title = title.text

    if len(root.find('body')) != 0:
        body = tree.find('body')

        # todo:
        # - fix <a href="">
        # - fix ....

        for _item in body.iter():
            if _item.tag == 'img':
                _name = _item.get('src')

                # this is not a good check
                if _name and not _name.lower().startswith('http'):
                    _item.set('src', 'static/%s' % _convert_file_name(_name))

            for t in self.remove_attributes:
                if t in _item.attrib:
                    del _item.attrib[t]

    chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)
def html_after_read(self, book, chapter):
    if not chapter.is_chapter():
        return

    from lxml import etree
    from ebooklib.utils import parse_html_string

    try:
        tree = parse_html_string(chapter.content)
    except:
        return

    root = tree.getroottree()

    if len(root.find('head')) != 0:
        head = tree.find('head')

        title = head.find('title')

        if title is not None:
            chapter.title = title.text

    chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)
def _parse_nav(self, data, base_path):
    html_node = parse_html_string(data)

    nav_node = html_node.xpath("//nav[@*='toc']")[0]

    def parse_list(list_node):
        items = []

        for item_node in list_node.findall("li"):
            sublist_node = item_node.find("ol")
            link_node = item_node.find("a")

            if sublist_node is not None:
                title = item_node[0].text
                children = parse_list(sublist_node)

                items.append((Section(title), children))
            elif link_node is not None:
                title = link_node.text
                href = os.path.normpath(os.path.join(base_path, link_node.get("href")))

                items.append(Link(href, title))

        return items

    self.book.toc = parse_list(nav_node.find("ol"))
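# A sketch of the kind of EPUB3 nav document the two _parse_nav variants above expect;
# the file names and titles are invented for illustration. Each <li> either wraps a
# nested <ol> (parsed as a Section with children) or a plain <a> (parsed as a Link).
SAMPLE_NAV = """
<nav epub:type="toc" xmlns:epub="http://www.idpf.org/2007/ops">
  <ol>
    <li><a href="intro.xhtml">Introduction</a></li>
    <li>
      <span>Part One</span>
      <ol>
        <li><a href="chapter1.xhtml">Chapter 1</a></li>
      </ol>
    </li>
  </ol>
</nav>
"""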
def _create_toc(self):
    """
    Create table of contents

    :Args:
      - self (:class:`ExportBook`): current class instance
    """
    self.toc = OrderedDict()
    self.spine = ['nav']

    self.hold_chapters_urls = [i.url_title for i in self.book_version.get_hold_chapters()]

    for chapter in self.book_version.get_toc():
        if chapter.chapter:
            c1 = epub.EpubHtml(
                title=chapter.chapter.title,
                file_name='%s.xhtml' % (chapter.chapter.url_title, )
            )

            # hook for some extra customizations
            cont = self._chapter_content_hook(chapter.chapter.content)

            try:
                tree = parse_html_string(cont.encode('utf-8'))
            except Exception as err:
                logger.error('Error parsing chapter content %s' % err)
                continue

            # hook for some extra customizations
            self._chapter_tree_hook(tree)

            for elem in tree.iter():
                self._handle_chapter_element(elem)

            c1.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

            # hook for some extra customizations
            self._epub_chapter_hook(c1)

            self.epub_book.add_item(c1)
            self.spine.append(c1)

            if chapter.parent:
                self.toc[chapter.parent.id][1].append(c1)
            else:
                if chapter.has_children():
                    self.toc[chapter.id] = [c1, []]
                else:
                    self.toc[chapter.id] = c1
        else:
            epub_sec = epub.Section(chapter.name)

            if chapter.parent:
                self.toc[chapter.parent.id][1].append(epub_sec)
            else:
                self.toc[chapter.id] = [epub_sec, []]
def get_content(self, default=None):
    tree = parse_string(self.book.get_template(self._template_name))
    tree_root = tree.getroot()

    tree_root.set("lang", self.lang or self.book.language)
    tree_root.attrib["{%s}lang" % NAMESPACES["XML"]] = self.lang or self.book.language

    # add to the head also
    #  <meta charset="utf-8" />

    try:
        html_tree = parse_html_string(self.content)
    except:
        return ""

    html_root = html_tree.getroottree()

    # create and populate head

    _head = etree.SubElement(tree_root, "head")

    if self.title != "":
        _title = etree.SubElement(_head, "title")
        _title.text = self.title

    if hasattr(self, "img_width") and hasattr(self, "img_height"):
        opts = {"name": "viewport",
                "content": "width=%d, height=%d" % (self.img_width, self.img_height)}
        _meta = etree.SubElement(_head, "meta", opts)

    for lnk in self.links:
        if lnk.get("type") == "text/javascript":
            _lnk = etree.SubElement(_head, "script", lnk)
            # force <script></script>
            _lnk.text = ""
        else:
            _lnk = etree.SubElement(_head, "link", lnk)

    # this should not be like this
    # head = html_root.find('head')
    # if head is not None:
    #     for i in head.getchildren():
    #         if i.tag == 'title' and self.title != '':
    #             continue
    #         _head.append(i)

    # create and populate body

    _body = etree.SubElement(tree_root, "body")

    body = html_tree.find("body")
    if body is not None:
        for i in body.getchildren():
            _body.append(i)

    tree_str = etree.tostring(tree, pretty_print=True, encoding="utf-8", xml_declaration=True)

    return tree_str
def html_before_write(self, book, chapter):
    if not chapter.content:
        return None

    tree = parse_html_string(chapter.get_content())

    # remove comments reference bubble from the chapter content
    for commentsBubble in tree.xpath(".//a[@class='comment-link']"):
        commentsBubble.drop_tree()

    chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)
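# A sketch of how a hook like html_before_write above is typically registered when writing
# an EPUB with ebooklib, assuming the method belongs to a plugin class like the one below;
# the class name and the input/output paths are placeholders, while BasePlugin, read_epub
# and write_epub are the real ebooklib APIs.
from lxml import etree
from ebooklib import epub
from ebooklib.plugins.base import BasePlugin
from ebooklib.utils import parse_html_string


class RemoveCommentLinksPlugin(BasePlugin):
    NAME = 'Remove comment links'

    def html_before_write(self, book, chapter):
        if not chapter.content:
            return None

        tree = parse_html_string(chapter.get_content())

        # drop every comment-reference bubble before the chapter is serialized
        for bubble in tree.xpath(".//a[@class='comment-link']"):
            bubble.drop_tree()

        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)


book = epub.read_epub('input.epub')
epub.write_epub('output.epub', book, {'plugins': [RemoveCommentLinksPlugin()]})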
def get_content(self, default=None):
    tree = parse_string(self.book.get_template(self._template_name))
    tree_root = tree.getroot()

    tree_root.set('lang', self.lang or self.book.language)
    tree_root.attrib['{%s}lang' % NAMESPACES['XML']] = self.lang or self.book.language

    # add to the head also
    #  <meta charset="utf-8" />

    try:
        html_tree = parse_html_string(self.content)
    except:
        return ''

    html_root = html_tree.getroottree()

    # create and populate head

    _head = etree.SubElement(tree_root, 'head')

    if self.title != '':
        _title = etree.SubElement(_head, 'title')
        _title.text = self.title

    for lnk in self.links:
        if lnk.get("type") == "text/javascript":
            _lnk = etree.SubElement(_head, 'script', lnk)
            # force <script></script>
            _lnk.text = ''
        else:
            _lnk = etree.SubElement(_head, 'link', lnk)

    # this should not be like this
    head = html_root.find('head')
    if head is not None:
        for i in head.getchildren():
            if i.tag == 'title' and self.title != '':
                continue
            _head.append(i)

    # create and populate body

    _body = etree.SubElement(tree_root, 'body')

    body = html_tree.find('body')
    if body is not None:
        for i in body.getchildren():
            _body.append(i)

    tree_str = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

    return tree_str
def html_before_write(self, book, chapter):
    from lxml import etree

    try:
        from urlparse import urlparse, urljoin
    except ImportError:
        from urllib.parse import urlparse, urljoin

    try:
        tree = parse_html_string(chapter.content)
    except:
        return

    root = tree.getroottree()

    if len(root.find('body')) != 0:
        body = tree.find('body')

        # should also be aware to handle
        #  ../chapter/
        #  ../chapter/#reference
        #  ../chapter#reference

        for _link in body.xpath('//a'):
            # This is just temporary for the footnotes
            if _link.get('href', '').find('InsertNoteID') != -1:
                _ln = _link.get('href', '')
                i = _ln.find('#')
                _link.set('href', _ln[i:])

                continue

            _u = urlparse(_link.get('href', ''))

            # Let us care only for internal links at the moment
            if _u.scheme == '':
                if _u.path != '':
                    _link.set('href', '%s.xhtml' % _u.path)

                if _u.fragment != '':
                    _link.set('href', urljoin(_link.get('href'), '#%s' % _u.fragment))

            if _link.get('name') != None:
                _link.set('id', _link.get('name'))
                etree.strip_attributes(_link, 'name')

    chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
def html_before_write(self, book, chapter):
    from lxml import etree, html
    from pygments import highlight
    from pygments.formatters import HtmlFormatter
    from ebooklib import epub

    try:
        tree = parse_html_string(chapter.content)
    except:
        return

    root = tree.getroottree()

    had_source = False

    if len(root.find('body')) != 0:
        body = tree.find('body')

        # check for embeded source
        for source in body.xpath('//pre[contains(@class,"source-")]'):
            css_class = source.get('class')
            source_text = (source.text or '') + ''.join([html.tostring(child) for child in source.iterchildren()])

            if 'source-python' in css_class:
                from pygments.lexers import PythonLexer

                # _text = highlight(source_text, PythonLexer(), HtmlFormatter(linenos="inline"))
                _text = highlight(source_text, PythonLexer(), HtmlFormatter())

            if 'source-css' in css_class:
                from pygments.lexers import CssLexer

                _text = highlight(source_text, CssLexer(), HtmlFormatter())

            _parent = source.getparent()
            _parent.replace(source, etree.XML(_text))

            had_source = True

    if had_source:
        chapter.add_link(href="style/code.css", rel="stylesheet", type="text/css")

    chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
def html_before_write(self, book, chapter):
    if chapter.get_type() != ebooklib.ITEM_DOCUMENT or isinstance(chapter, ebooklib.epub.EpubNav):
        return True

    tags_allowed_to_be_empty = config.get_configuration('ALLOWED_EMPTY_TAGS')
    tags_to_remove_on_cleanup = config.get_configuration('TAGS_TO_REMOVE_ON_CLEANUP')
    attrs_to_remove_on_cleanup = config.get_configuration('ATTRS_TO_REMOVE_ON_CLEANUP')
    allowed_empty_by_classes = config.get_configuration('ALLOWED_EMPTY_BY_CLASSES')

    root = parse_html_string(chapter.get_content())

    # let's remove all the tags we don't want to have on export
    # this will affect all the converters since they use the generated
    # epub as base for converting process
    for tag in tags_to_remove_on_cleanup:
        for node in root.iter(tag):
            node.drop_tree()

    # walk over all elements in the tree and remove all
    # nodes that are recursively empty
    body = root.find('body')

    for elem in body.xpath("//body//*"):
        # remove not wanted attributes
        for attr in attrs_to_remove_on_cleanup:
            if attr in elem.attrib:
                del elem.attrib[attr]

        klasses = elem.get('class', '').split()
        allowed_by_class = any([x in allowed_empty_by_classes for x in klasses])

        if recursively_empty(elem) and elem.tag not in tags_allowed_to_be_empty and not allowed_by_class:
            # just in case if text contains spaces or tabs, because drop_tag removes only tag
            elem.text = ''
            elem.drop_tag()

    chapter.content = etree.tostring(root, pretty_print=True, encoding="utf-8", xml_declaration=True)

    return True
def html_before_write(self, book, chapter):
    try:
        html_tree = parse_html_string(chapter.content)
    except:
        return

    for img_elem in html_tree.iterfind(".//img"):
        href = img_elem.attrib["src"]
        split_href = os.path.splitext(img_elem.attrib["src"])

        # We can just slugify the original URL to determine the new URL
        img_local_filename = slugify(split_href[0]) + split_href[1]

        book.add_item(
            epub.EpubItem(uid=img_local_filename,
                          file_name=img_local_filename,
                          content=requests.get(href).content)
        )

        # Alter the HTML element to point at the local resource
        img_elem.attrib["src"] = img_local_filename

    chapter.content = etree.tostring(html_tree, pretty_print=True, encoding="utf-8")
def ice_cleanup(content, **kwargs):
    """
    This method removes "inserted" content and remove tags of "deleted"
    changes of the tracking engine trail. For example:

    <span class="ins">content and tag will be deleted</span> -> cause means it's not approved yet.
    <span class="del">content will be kept and tag removed</span> -> cause is previous content state.
    """
    tree = parse_html_string(content)

    # remove tags and content of inserted changes (not approved)
    spans_with_inserts = tree.xpath("//%(tag)s[contains(@class, '%(insert_class)s')]" % kwargs)
    for span in spans_with_inserts:
        span.drop_tree()

    # remove tag, but keep content of deleted changes
    spans_with_deletes = tree.xpath("//%(tag)s[contains(@class, '%(delete_class)s')]" % kwargs)
    for span in spans_with_deletes:
        span.drop_tag()

    return tree
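# A usage sketch for ice_cleanup above, assuming parse_html_string is available in its
# module and that the tracking markup uses <span class="ins"> / <span class="del">; those
# class names are only an assumption for the example, pass whatever your tracking engine emits.
from lxml import etree

tracked = '<p>kept <span class="ins">pending insert</span><span class="del">old text</span></p>'
cleaned_tree = ice_cleanup(tracked, tag='span', insert_class='ins', delete_class='del')
print(etree.tostring(cleaned_tree, encoding='unicode'))
# expected: the "ins" span disappears entirely, the "del" span keeps its text but loses the tag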
def html_before_write(self, book, chapter):
    from lxml import etree
    from ebooklib import epub

    try:
        tree = parse_html_string(chapter.content)
    except:
        return

    root = tree.getroottree()

    if len(root.find('body')) != 0:
        body = tree.find('body')

        # <span id="InsertNoteID_1_marker1" class="InsertNoteMarker"><sup><a href="#InsertNoteID_1">1</a></sup><span>
        # <ol id="InsertNote_NoteList"><li id="InsertNoteID_1">first footnote <span id="InsertNoteID_1_LinkBacks"><sup><a href="#InsertNoteID_1_marker1">^</a></sup></span></li>

        # <a epub:type="noteref" href="#n1">1</a></p>
        # <aside epub:type="footnote" id="n1"><p>These have been corrected in this EPUB3 edition.</p></aside>

        for footnote in body.xpath('//span[@class="InsertNoteMarker"]'):
            footnote_id = footnote.get('id')[:-8]
            a = footnote.getchildren()[0].getchildren()[0]

            footnote_text = body.xpath('//li[@id="%s"]' % footnote_id)[0]

            a.attrib['{%s}type' % epub.NAMESPACES['EPUB']] = 'noteref'

            ftn = etree.SubElement(body, 'aside', {'id': footnote_id})
            ftn.attrib['{%s}type' % epub.NAMESPACES['EPUB']] = 'footnote'
            ftn_p = etree.SubElement(ftn, 'p')
            ftn_p.text = footnote_text.text

        old_footnote = body.xpath('//ol[@id="InsertNote_NoteList"]')
        if len(old_footnote) > 0:
            body.remove(old_footnote[0])

    chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
def export_book(input_file, filename):
    """Reads content of book in Booki.zip format and converts it to EPUB format.

    This function reads content of the book in Booki.zip file, creates new book in EPUB format
    and converts entire content into it. There are some things which are different in new EPUB
    format. One of them is how links and interlinks are handled.
    """
    epub_book = ExportEpubBook()

    # Creating new EPUB file
    epub_book.add_prefix("bkterms", "http://booktype.org/")

    # Read old Booki.zip format
    bookizip = BookiZip(input_file)

    _toc, _section, _section_name = [], [], None
    spine = ["nav"]

    # Get filenames of all the chapters/sections
    file_names = [file_name[6:-5] for _, file_name, _ in bookizip.get_toc()]

    x = 0
    for typ, file_name, title in bookizip.get_toc():
        # Ignore sections
        if typ == 1:
            if _section_name is None and len(_section) > 0:
                _toc.append(_section)
            elif len(_section) > 0:
                _toc.append((epub.Section(_section_name), _section[:]))

            _section_name = title
            _section = []
            continue

        # Create new chapter with new filename
        c1 = epub.EpubHtml(title=title, file_name="{}.xhtml".format(file_name[6:-5]))
        cont = unicode(bookizip.read(file_name), "utf-8")

        _section.append(c1)

        try:
            tree = parse_html_string(cont.encode("utf-8"))
        except:
            # Just ignore everything if we can not parse the chapter
            continue

        # Change all the links in the document
        for elem in tree.iter():
            if elem.tag == "a":
                href = elem.get("href")

                if href:
                    urlp = urlparse.urlparse(href)
                    url_title = urlp.path

                    if urlp.scheme == "":
                        if url_title and url_title in file_names:
                            fixed_href = url_title + ".xhtml"

                            if urlp.fragment:
                                fixed_href = "{}#{}".format(fixed_href, urlp.fragment)

                            elem.set("href", fixed_href)
                        else:
                            # this drops everything that is external; it should not happen
                            elem.drop_tag()

        c1.content = etree.tostring(tree, pretty_print=True, encoding="utf-8", xml_declaration=True)

        epub_book.add_item(c1)
        spine.append(c1)

        x += 1

    if _section_name is None and len(_section) > 0:
        _toc.append(_section)
    elif len(_section) > 0:
        _toc.append((epub.Section(_section_name), _section[:]))

    # Add all of the attachments
    for att_name in bookizip.get_attachments():
        try:
            blob = bookizip.read(att_name)
        except (IOError, OSError):
            continue
        else:
            itm = epub.EpubImage()
            itm.file_name = att_name
            itm.content = blob
            epub_book.add_item(itm)

    epub_book.set_title("Title", "main")
    epub_book.set_language("en")
    epub_book.add_author("Author", role="aut", uid="author")

    epub_book.toc = _toc
    epub_book.spine = spine

    epub_book.add_item(epub.EpubNcx())
    epub_book.add_item(epub.EpubNav())

    opts = {"plugins": [TidyPlugin(), standard.SyntaxPlugin()]}

    epub.write_epub(filename, epub_book, opts)
def html_before_write(self, book, chapter):
    from lxml import etree

    try:
        tree = parse_html_string(chapter.content)
    except:
        return

    root = tree.getroottree()

    # delete deprecated tags
    # i should really have a list of allowed tags
    for tag in DEPRECATED_TAGS:
        etree.strip_tags(root, tag)

    head = tree.find('head')

    if head is not None and len(head) != 0:
        for _item in head:
            if _item.tag == 'base':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'target'])
            elif _item.tag == 'link':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'crossorigin', 'rel', 'media', 'hreflang', 'type', 'sizes'])
            elif _item.tag == 'title':
                if _item.text == '':
                    head.remove(_item)
            elif _item.tag == 'meta':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['name', 'http-equiv', 'content', 'charset'])
                # just remove for now, but really should not be like this
                head.remove(_item)
            elif _item.tag == 'script':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'charset', 'async', 'defer', 'crossorigin'])
            elif _item.tag == 'source':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'media'])
            elif _item.tag == 'style':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['media', 'type', 'scoped'])
            else:
                leave_only(_item, ATTRIBUTES_GLOBAL)

    if len(root.find('body')) != 0:
        body = tree.find('body')

        for _item in body.iter():
            # it is not
            # <a class="indexterm" href="ch05.html#ix_epub:trigger_element">
            if _item.tag == 'a':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['href', 'target', 'download', 'rel', 'hreflang', 'type'])
            elif _item.tag == 'area':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['alt', 'coords', 'shape', 'href', 'target', 'download', 'rel', 'hreflang', 'type'])
            elif _item.tag == 'audio':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'crossorigin', 'preload', 'autoplay', 'mediagroup', 'loop', 'muted', 'controls'])
            elif _item.tag == 'blockquote':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['cite'])
            elif _item.tag == 'button':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['autofocus', 'disabled', 'form', 'formaction', 'formenctype', 'formmethod',
                                                       'formnovalidate', 'formtarget', 'name', 'type', 'value', 'menu'])
            elif _item.tag == 'canvas':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['width', 'height'])
            elif _item.tag == 'del':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['cite', 'datetime'])
            elif _item.tag == 'details':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['open'])
            elif _item.tag == 'embed':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'type', 'width', 'height'])
            elif _item.tag == 'fieldset':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['disabled', 'form', 'name'])
            elif _item.tag == 'form':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['accept-charset', 'action', 'autocomplete', 'enctype', 'method', 'name', 'novalidate', 'target'])
            elif _item.tag == 'iframe':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'srcdoc', 'name', 'sandbox', 'seamless', 'allowfullscreen', 'width', 'height'])
            elif _item.tag == 'img':
                _src = _item.get('src', '').lower()

                if _src.startswith('http://') or _src.startswith('https://'):
                    if 'remote-resources' not in chapter.properties:
                        chapter.properties.append('remote-resources')

                    # THIS DOES NOT WORK, ONLY VIDEO AND AUDIO FILES CAN BE REMOTE RESOURCES
                    # THAT MEANS I SHOULD ALSO CATCH <SOURCE TAG
                    from ebooklib import epub

                    _img = epub.EpubImage(file_name=_item.get('src'))
                    book.add_item(_img)

                leave_only(_item, ATTRIBUTES_GLOBAL + ['alt', 'src', 'crossorigin', 'usemap', 'ismap', 'width', 'height'])
            elif _item.tag == 'input':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['accept', 'alt', 'autocomplete', 'autofocus', 'checked', 'dirname', 'disabled',
                                                       'form', 'formaction', 'formenctype', 'formmethod', 'formnovalidate', 'formtarget',
                                                       'height', 'inputmode', 'list', 'max', 'maxlength', 'min', 'multiple', 'name',
                                                       'pattern', 'placeholder', 'readonly', 'required', 'size', 'src', 'step', 'type',
                                                       'value', 'width'])
            elif _item.tag == 'ins':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['cite', 'datetime'])
            elif _item.tag == 'keygen':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['autofocus', 'challenge', 'disabled', 'form', 'keytype', 'name'])
            elif _item.tag == 'label':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['form', 'for'])
            elif _item.tag == 'map':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['name'])
            elif _item.tag == 'menu':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['type', 'label'])
            elif _item.tag == 'object':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['data', 'type', 'typemustmatch', 'name', 'usemap', 'form', 'width', 'height'])
            elif _item.tag == 'ol':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['reversed', 'start', 'type'])
            elif _item.tag == 'optgroup':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['disabled', 'label'])
            elif _item.tag == 'option':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['disabled', 'label', 'selected', 'value'])
            elif _item.tag == 'output':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['for', 'form', 'name'])
            elif _item.tag == 'param':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['name', 'value'])
            elif _item.tag == 'progress':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['value', 'max'])
            elif _item.tag == 'q':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['cite'])
            elif _item.tag == 'select':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['autofocus', 'disabled', 'form', 'multiple', 'name', 'required', 'size'])
            elif _item.tag == 'table':
                if _item.get('border', None):
                    if _item.get('border') == '0':
                        _item.set('border', '')

                if _item.get('summary', None):
                    _caption = etree.Element('caption', {})
                    _caption.text = _item.get('summary')
                    _item.insert(0, _caption)  # add it as caption
                    del _item.attrib['summary']

                leave_only(_item, ATTRIBUTES_GLOBAL + ['border', 'sortable'])
            elif _item.tag == 'dl':
                _d = _item.find('dd')

                if _d is not None and len(_d) == 0:
                    pass

                # http://html5doctor.com/the-dl-element/
                # should be like this really
                # some of the elements can be missing
                # dl
                #   dt
                #   dd
                #   dt
                #   dd
            elif _item.tag == 'td':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['colspan', 'rowspan', 'headers'])
            elif _item.tag == 'textarea':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['autocomplete', 'autofocus', 'cols', 'dirname', 'disabled', 'form', 'inputmode',
                                                       'maxlength', 'name', 'placeholder', 'readonly', 'required', 'rows', 'wrap'])
            elif _item.tag in ['col', 'colgroup']:
                leave_only(_item, ATTRIBUTES_GLOBAL + ['span'])
            elif _item.tag == 'th':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['colspan', 'rowspan', 'headers', 'scope', 'abbr', 'sorted'])
            elif _item.tag in ['time']:
                leave_only(_item, ATTRIBUTES_GLOBAL + ['datetime'])
            elif _item.tag in ['track']:
                leave_only(_item, ATTRIBUTES_GLOBAL + ['kind', 'src', 'srclang', 'label', 'default'])
            elif _item.tag == 'video':
                leave_only(_item, ATTRIBUTES_GLOBAL + ['src', 'crossorigin', 'poster', 'preload', 'autoplay', 'mediagroup', 'loop',
                                                       'muted', 'controls', 'width', 'height'])
            elif _item.tag == 'svg':
                # We need to add property "svg" in case we have embeded svg file
                if 'svg' not in chapter.properties:
                    chapter.properties.append('svg')

                if _item.get('viewbox', None):
                    del _item.attrib['viewbox']

                if _item.get('preserveaspectratio', None):
                    del _item.attrib['preserveaspectratio']
            else:
                for _attr in six.iterkeys(_item.attrib):
                    if _attr not in ATTRIBUTES_GLOBAL:
                        del _item.attrib[_attr]

    chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

    return chapter.content
def _import_old_epub(self, lyrics_path):
    new_verse_pattern = re.compile(r"^\s*(\d)\.\s+(.+)$")
    no_and_title_pattern = re.compile(r"^\s*(\d+)\s+(.+)$")

    if not self.epubs_page.old_epub:
        return

    book = epub.read_epub(self.epubs_page.old_epub)

    for item in list(filter(lambda i: isinstance(i, epub.EpubHtml), book.items)):
        tree = parse_html_string(item.content).getroottree()
        titles = tree.xpath("//title/text()")

        if titles:
            title = titles[0]
            m = no_and_title_pattern.match(title)
            if m is None:
                continue
            no, title = m.groups()

            markers = []
            marker = None

            for line_element in tree.xpath("//div[@class='pGroup']/*"):
                if line_element.tag == 'p':
                    while line_element.getchildren():
                        line_element.getchildren()[0].drop_tag()
                    line_text = line_element.text

                    m = new_verse_pattern.match(line_text)
                    if m is not None:
                        verse_no, line_text = m.groups()
                        if marker is not None:
                            markers.append(marker)
                        marker = {
                            'name': str(verse_no),
                            'text': line_text,
                        }
                    else:
                        marker['text'] += "\n{}".format(line_text)
                elif "chorus" in line_element.attrib['class']:
                    if marker is not None:
                        markers.append(marker)
                    marker = {
                        'name': line_element.getchildren()[0].text.strip()
                                .replace('(', '').replace(')', '').lower().capitalize(),
                        'text': "",
                    }
                    for chorus_line_element in line_element.getchildren()[1:]:
                        marker['text'] += "{}\n".format(chorus_line_element.text)
                    marker['text'] = marker['text'][:-1]

            markers.append(marker)

            with open(os.path.join(lyrics_path, "{}.json".format(no)), "w") as f:
                json.dump({
                    'title': title,
                    'markers': markers,
                }, f, indent=2)
def _import_new_epub(self, lyrics_path):
    if not self.epubs_page.new_epub:
        return

    book = epub.read_epub(self.epubs_page.new_epub)

    for item in filter(lambda i: isinstance(i, epub.EpubHtml), book.items):
        tree = parse_html_string(item.content).getroottree()
        title = tree.xpath("//h1/strong/text()")

        if title:
            title = title[0]
            try:
                song_no = int(tree.xpath("//head/title/text()")[0].split(" ", 1)[0])

                markers = []
                marker = None

                for verse_no, verse_element in enumerate(tree.xpath("//div[@class='pGroup']/ol/li"), 1):
                    marker = {
                        'name': str(verse_no),
                        'text': '',
                    }
                    for line_element in verse_element.getchildren():
                        if line_element.tag == 'p' and not 'se' in line_element.attrib.get('class', ''):
                            while line_element.getchildren():
                                line_element.getchildren()[0].drop_tag()
                            line_text = line_element.text.strip()
                            marker['text'] += "{}\n".format(line_text)
                        elif "chorus" in line_element.attrib['class']:
                            if marker is not None:
                                marker['text'] = marker['text'][:-1]
                                markers.append(marker)
                            marker = {
                                'name': line_element.getchildren()[0].text.strip()
                                        .replace('(', '').replace(')', '').lower().capitalize(),
                                'text': "",
                            }
                            for chorus_line_element in line_element.getchildren()[1:]:
                                marker['text'] += "{}\n".format(chorus_line_element.text)
                        else:
                            if marker is not None:
                                marker['text'] = marker['text'][:-1]
                                markers.append(marker)
                            marker = {
                                'name': line_element.text.strip().replace('(', '').replace(')', '').lower().capitalize(),
                                'text': "",
                            }

                    marker['text'] = marker['text'][:-1]
                    markers.append(marker)

                if markers:
                    with open(os.path.join(lyrics_path, "{}.json".format(song_no)), "w") as f:
                        json.dump({
                            'title': title,
                            'markers': markers,
                        }, f, indent=2)
            except ValueError:
                pass
def import_book_from_file(epub_file, user, **kwargs):
    import uuid

    from django.utils.timezone import utc
    from lxml import etree
    from ebooklib.utils import parse_html_string

    from .book import create_book

    opts = {'plugins': [TidyPlugin(), ImportPlugin()]}
    epub_book = epub.read_epub(epub_file, opts)

    chapters = {}
    toc = []

    def _parse_toc(elements, parent=None):
        for _elem in elements:
            # used later to get parent of an elem
            unique_id = uuid.uuid4().hex

            if isinstance(_elem, tuple):
                toc.append((1, _elem[0].title, unique_id, parent))
                _parse_toc(_elem[1], unique_id)
            elif isinstance(_elem, epub.Section):
                pass
            elif isinstance(_elem, epub.Link):
                _u = urlparse.urlparse(_elem.href)
                _name = urllib.unquote(os.path.basename(_u.path))

                if not _name:
                    _name = _elem.title

                if _name not in chapters:
                    chapters[_name] = _elem.title

                toc.append((0, _name, unique_id, parent))

    _parse_toc(epub_book.toc)

    epub_book_name = epub_book.metadata[epub.NAMESPACES['DC']]['title'][0][0]
    title = kwargs.get('book_title', epub_book_name)
    book_url = kwargs.get('book_url', None)

    # must check if title already exists
    book = create_book(user, title, book_url=book_url)

    now = datetime.datetime.utcnow().replace(tzinfo=utc)

    stat = models.BookStatus.objects.filter(book=book, name="new")[0]

    for attach in epub_book.get_items_of_type(ebooklib.ITEM_IMAGE):
        att = models.Attachment(
            book=book,
            version=book.version,
            status=stat
        )

        s = attach.get_content()
        f = StringIO.StringIO(s)
        f2 = File(f)
        f2.size = len(s)
        att.attachment.save(attach.file_name, f2, save=False)
        att.save()
        f.close()

    _imported = {}

    # TODO: ask about importing empty sections

    for chap in epub_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        # Nav and Cover are not imported
        if not chap.is_chapter():
            continue

        # check if this chapter name already exists
        name = urllib.unquote(os.path.basename(chap.file_name))
        content = chap.get_body_content()

        # maybe this part has to go to the plugin
        # but you can not get title from <title>
        if name in chapters:
            name = chapters[name]
        else:
            name = _convert_file_name(name)

            if name.rfind('.') != -1:
                name = name[:name.rfind('.')]

            name = name.replace('.', '')

        chapter = models.Chapter(
            book=book,
            version=book.version,
            url_title=booktype_slugify(unicode(name)),
            title=name,
            status=stat,
            content=content,
            created=now,
            modified=now
        )
        chapter.save()

        _imported[urllib.unquote(os.path.basename(chap.file_name))] = chapter

    # fix links
    for chap in epub_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        if not chap.is_chapter():
            continue

        content = chap.get_content()

        try:
            tree = parse_html_string(content)
        except:
            # skip chapters whose content can not be parsed
            continue

        root = tree.getroottree()

        if len(root.find('body')) != 0:
            body = tree.find('body')

            to_save = False

            for _item in body.iter():
                if _item.tag == 'a':
                    _href = _item.get('href')

                    if _href:
                        _u = urlparse.urlparse(_href)
                        pth = urllib.unquote(os.path.basename(_u.path))

                        if pth in _imported:
                            _name = _imported[pth].url_title
                            _u2 = urlparse.urljoin(_href, '../' + _name + '/')

                            _item.set('href', _u2)

                            to_save = True

            if to_save:
                chap.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

                _imported[urllib.unquote(os.path.basename(chap.file_name))].content = chap.content
                _imported[urllib.unquote(os.path.basename(chap.file_name))].save()

    n = len(toc) + 1
    parents = {}

    for _elem in toc:
        if _elem[0] == 1:  # section
            toc_item = models.BookToc(
                book=book,
                version=book.version,
                name=_elem[1],
                chapter=None,
                weight=n,
                typeof=2
            )
        else:
            if not _elem[1] in _imported:
                continue

            chap = _imported[_elem[1]]

            toc_item = models.BookToc(
                book=book,
                version=book.version,
                name=chap.title,
                chapter=chap,
                weight=n,
                typeof=1
            )

        # check if elem has parent
        if _elem[3]:
            toc_item.parent = parents.get(_elem[3], None)

        toc_item.save()

        # decrease weight
        n -= 1

        # save temporarily the toc_item in parent
        parents[_elem[2]] = toc_item

    return book
def get_content(self, default=None):
    """
    Returns content for this document as HTML string. Content will be of type 'str' (Python 2)
    or 'bytes' (Python 3).

    :Args:
      - default: Default value for the content if it is not defined.

    :Returns:
      Returns content of this document.
    """
    tree = parse_string(self.book.get_template(self._template_name))
    tree_root = tree.getroot()

    tree_root.set('lang', self.lang or self.book.language)
    tree_root.attrib['{%s}lang' % NAMESPACES['XML']] = self.lang or self.book.language

    # add to the head also
    #  <meta charset="utf-8" />

    try:
        html_tree = parse_html_string(self.content)
    except:
        return ''

    html_root = html_tree.getroottree()

    # create and populate head

    _head = etree.SubElement(tree_root, 'head')

    if self.title != '':
        _title = etree.SubElement(_head, 'title')
        _title.text = self.title

    for lnk in self.links:
        if lnk.get("type") == "text/javascript":
            _lnk = etree.SubElement(_head, 'script', lnk)
            # force <script></script>
            _lnk.text = ''
        else:
            _lnk = etree.SubElement(_head, 'link', lnk)

    # this should not be like this
    # head = html_root.find('head')
    # if head is not None:
    #     for i in head.getchildren():
    #         if i.tag == 'title' and self.title != '':
    #             continue
    #         _head.append(i)

    # create and populate body

    _body = etree.SubElement(tree_root, 'body')

    if self.direction:
        _body.set('dir', self.direction)

    body = html_tree.find('body')
    if body is not None:
        for i in body.getchildren():
            _body.append(i)

    tree_str = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

    return tree_str
def init_links_of_html(self, html_item):
    html_tree = parse_html_string(html_item.content)

    for link in html_tree.getroottree().xpath('//*[local-name()="link"]'):
        item = epub.EpubItem(file_name=link.get('href'), media_type='text/css')
        html_item.add_item(item)
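# A self-contained sketch of the common pattern shared by the snippets above: read an EPUB,
# parse each document item with parse_html_string, tweak the body, and serialize it back.
# 'sample.epub' and the style-stripping edit are placeholders chosen for illustration; only
# read_epub, write_epub, ITEM_DOCUMENT, parse_html_string and lxml's tostring are the real APIs.
import ebooklib
from ebooklib import epub
from ebooklib.utils import parse_html_string
from lxml import etree

book = epub.read_epub('sample.epub')

for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
    tree = parse_html_string(item.get_content())
    body = tree.find('body')

    if body is not None:
        # example edit: strip inline style attributes from every element in the body
        for elem in body.iter():
            elem.attrib.pop('style', None)

    item.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

epub.write_epub('sample-clean.epub', book)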