def _clean_opf(self, opf):
    """Build a fresh OPF 2 ``package`` tree from a parsed OPF document.

    Elements with no namespace or in the OPF 1 namespace (whose bare name
    holds no colon) are re-tagged into the ``opf`` namespace. Metadata
    elements in one of the Dublin Core namespaces, or whose tag literally
    starts with ``dc:``, are re-tagged into the DC 1.1 namespace with a
    lower-cased name. The metadata elements and the manifest, spine, tours
    and guide sections are then moved under a newly created root.

    @opf: Root element of the parsed OPF. It is modified in place: tags are
          rewritten and elements are moved out of it.
    @return: The newly created ``package`` root element.
    """
    # Collect every namespace declaration found on any element so the new
    # <metadata> element can carry all of them.
    declared = {}
    for node in opf.iter(tag=etree.Element):
        declared.update(node.nsmap)

    legacy_namespaces = ('', const.OPF1_NS)
    for node in opf.iter(tag=etree.Element):
        if parse_utils.namespace(node.tag) not in legacy_namespaces:
            continue
        bare = parse_utils.barename(node.tag)
        if ':' not in bare:
            node.tag = base.tag('opf', bare)
    declared.update(const.OPF2_NSMAP)

    new_root = etree.Element(base.tag('opf', 'package'),
                             nsmap={None: const.OPF2_NS},
                             attrib=dict(opf.attrib))
    metadata = etree.SubElement(new_root, base.tag('opf', 'metadata'),
                                nsmap=declared)

    # The wrapper elements themselves are not copied, but the '//*' query
    # still yields their descendants.
    wrappers = (base.tag('opf', 'dc-metadata'),
                base.tag('opf', 'x-metadata'))
    for node in base.xpath(opf, 'o2:metadata//*'):
        if node.tag in wrappers:
            continue
        if parse_utils.namespace(node.tag) in const.DC_NSES:
            node.tag = '{%s}%s' % (const.DC11_NS,
                                   parse_utils.barename(node.tag).lower())
        if node.tag.startswith('dc:'):
            node.tag = '{%s}%s' % (const.DC11_NS,
                                   node.tag.partition(':')[-1].lower())
        metadata.append(node)

    for meta in base.xpath(opf, 'o2:metadata//o2:meta'):
        metadata.append(meta)

    for query in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
        for section in base.xpath(opf, query):
            new_root.append(section)

    return new_root
def __call__(self, oeb, opts):
    """Detect the structure of a book and update its TOC and page breaks.

    Stores ``oeb`` and ``opts`` on the instance, runs chapter detection,
    optionally rebuilds the table of contents, filters TOC entries by the
    ``toc_filter`` regular expression, adds ``page-break-before:always`` to
    the elements matched by ``page_breaks_before`` and gives untitled TOC
    entries the title ``'Unnamed'``.

    @oeb: The book being processed; provides ``log``, ``toc`` and ``spine``.
    @opts: Conversion options read by this method.
    """
    self.log = oeb.log
    self.oeb = oeb
    self.opts = opts
    self.log('Detecting structure...')

    self.detect_chapters()
    if self.oeb.auto_generated_toc or opts.use_auto_toc:
        previous_toc = self.oeb.toc
        self.oeb.toc = base.TOC()
        self.create_level_based_toc()
        if self.oeb.toc.count() < 1:
            if not opts.no_chapters_in_toc and self.detected_chapters:
                self.create_toc_from_chapters()
            if self.oeb.toc.count() < opts.toc_threshold:
                self.create_toc_from_links()
        if self.oeb.toc.count() >= 2 or previous_toc.count() <= 2:
            self.oeb.auto_generated_toc = True
            self.log('Auto generated TOC with %d entries.' %
                     self.oeb.toc.count())
        else:
            # The generated TOC is nearly empty while the original one was
            # not, so keep the original.
            self.oeb.toc = previous_toc

    if opts.toc_filter is not None:
        pattern = re.compile(opts.toc_filter)
        # Iterate over a snapshot because entries are removed on the way.
        for entry in list(self.oeb.toc.iter()):
            if not entry.title or pattern.search(entry.title) is not None:
                self.log('Filtering',
                         entry.title if entry.title else 'empty node',
                         'from TOC')
                self.oeb.toc.remove(entry)

    if opts.page_breaks_before is not None:
        headings = {'h1', 'h2'}
        find_targets = XPath(opts.page_breaks_before)
        for item in oeb.spine:
            for target in find_targets(item.data):
                previous = next(
                    target.itersiblings(tag=etree.Element, preceding=True),
                    None)
                if (previous is not None
                        and parse_utils.barename(target.tag) in headings
                        and parse_utils.barename(previous.tag) in headings
                        and (not previous.tail
                             or not previous.tail.split())):
                    # We have two adjacent headings, do not put a page
                    # break on the second one
                    continue

                css = target.get('style', '')
                if css:
                    css += '; '
                target.set('style', css+'page-break-before:always')

    for entry in self.oeb.toc.iter():
        if not entry.title or not entry.title.strip():
            entry.title = 'Unnamed'

    if self.opts.start_reading_at:
        self.detect_start_reading()
def dump_text(self, elem, stylizer, page):
    '''
    Serialise an element and its descendants as a list of markup
    fragments, dropping any ``style`` attribute.

    @elem: The element in the etree that we are working on. Its ``style``
           attribute, if present, is deleted.
    @stylizer: The style information attached to the element. Only passed
               on to the recursive calls.
    @page: Only passed on to the recursive calls.
    @return: List of strings making up the markup for the element.
    '''
    supported = (const.XHTML_NS, const.SVG_NS)

    # Only XHTML/SVG tags can be written. For anything else keep just the
    # text that follows it, and only when the parent is such a tag.
    if (not isinstance(elem.tag, (str, bytes))
            or parse_utils.namespace(elem.tag) not in supported):
        parent = elem.getparent()
        if (parent is not None
                and isinstance(parent.tag, (str, bytes))
                and parse_utils.namespace(parent.tag) in supported
                and elem.tail):
            return [elem.tail]
        return ['']

    # <body> is written out as a <div>.
    name = parse_utils.barename(elem.tag)
    if name == 'body':
        name = 'div'

    attributes = elem.attrib
    if 'style' in attributes:
        del attributes['style']
    rendered_attributes = ''.join(
        ' %s="%s"' % (key, prepare_string_for_xml(value, attribute=True))
        for key, value in attributes.items())

    # Opening tag.
    closer = ' />' if name in SELF_CLOSING_TAGS else '>'
    out = ['', '<%s%s' % (name, rendered_attributes), closer]

    # Text directly inside the tag.
    if hasattr(elem, 'text') and elem.text:
        out.append(self.prepare_string_for_html(elem.text))

    # Children.
    for child in elem:
        out.extend(self.dump_text(child, stylizer, page))

    # Closing tag.
    if name not in SELF_CLOSING_TAGS:
        out.append('</%s>' % name)

    # Text that follows the tag.
    if hasattr(elem, 'tail') and elem.tail:
        out.append(self.prepare_string_for_html(elem.tail))

    return out
def add_block_tag(self, tagname, html_tag, tag_style, stylizer, is_table_cell=False, float_spec=None, is_list_item=False):
    """Start a new block for ``html_tag`` and add its leading content.

    The ``id`` (or failing that ``name``) attribute of the tag becomes a
    bookmark on the block. An ``img`` tag is added as a block level image.
    Otherwise the text of the tag is added to the block. A list item
    without text whose first child is a non-empty ``ul``/``ol`` is marked
    so that the block is not treated as empty.

    @tagname: Bare name of ``html_tag``.
    @html_tag: The element the block is created for.
    @tag_style: Style passed to the new block and to the added text.
    @stylizer: Passed to the images manager for ``img`` tags.
    """
    block = self.blocks.start_new_block(
        html_tag, tag_style, is_table_cell=is_table_cell,
        float_spec=float_spec, is_list_item=is_list_item)

    anchor = html_tag.get('id') or html_tag.get('name')
    if anchor:
        block.bookmarks.add(self.bookmark_for_anchor(anchor, html_tag))

    if tagname == 'img':
        self.images_manager.add_image(html_tag, block, stylizer,
                                      as_block=True)
        return

    leading_text = html_tag.text
    if leading_text:
        block.add_text(leading_text, tag_style,
                       ignore_leading_whitespace=True, is_parent_style=True,
                       link=self.current_link, lang=self.current_lang)
        return

    if tagname != 'li' or not len(html_tag):
        return
    first_child = html_tag[0]
    if (parse_utils.barename(first_child.tag) in ('ul', 'ol')
            and len(first_child)):
        block.force_not_empty = True
def find_levels(self):
    """Group the ``p`` and ``div`` elements of the book by nesting depth.

    Every ``p``/``div`` below a ``body`` is stored in ``self.levels`` under
    the key ``'<tag>_<depth>'``, where depth 1 means a direct child of the
    body. Groups considered too small are then dropped from
    ``self.levels``, and shallow ``div`` groups keep only the elements that
    contain at least five ``p``/``div`` descendants.
    """

    def depth_below(node, ancestor):
        # Number of parent links between node and ancestor.
        depth = 1
        parent = node.getparent()
        while parent is not ancestor:
            depth += 1
            parent = parent.getparent()
        return depth

    paras = XPath('descendant::h:p|descendant::h:div')
    find_body = XPath('//h:body')

    for item in self.oeb.spine:
        bodies = find_body(item.data)
        if not bodies:
            continue
        body = bodies[0]

        for node in paras(body):
            key = '%s_%d' % (parse_utils.barename(node.tag),
                             depth_below(node, body))
            self.levels.setdefault(key, []).append(node)

    discarded = set()
    for key, members in self.levels.items():
        count = len(members)
        self.log.debug('Found %d items of level:'%count, key)
        parts = key.split('_')
        name = parts[0]
        depth = int(parts[-1])
        if name == 'p' and count < 25:
            discarded.add(key)
        elif name == 'div':
            if depth > 2 and count < 25:
                discarded.add(key)
            elif depth < 3:
                # Check each level < 3 element and only keep those
                # that have many child paras
                members[:] = [member for member in members
                              if len(paras(member)) >= 5]

    for key in discarded:
        self.levels.pop(key)
        self.log.debug('Ignoring level', key)
def adjust_split_point(split_point, log):
    '''
    Move the split point up its ancestor chain if it has no content before
    it. This handles the common case:
    <div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
    h2.

    Climbing stops at the first ancestor that is a body or html tag, that
    has non-whitespace leading text, or in which the current candidate is
    not the first child. Returns the element to split at, which is
    split_point itself when no move was possible.
    '''
    candidate = split_point
    parent = candidate.getparent()
    while parent is not None:
        if parse_utils.barename(parent.tag) in {'body', 'html'}:
            break
        if parent.text and parent.text.strip():
            break
        if parent.index(candidate) > 0:
            break
        candidate = parent
        parent = candidate.getparent()

    if candidate is not split_point:
        log.debug('Adjusted split point to ancestor')

    return candidate
def pretty_block(parent, level=1, indent=' '):
    '''
    Surround block tags with blank lines and recurse into child block tags
    that contain only other block tags.

    The text of parent and the tail of each of its children are rewritten
    in place. Whitespace-only text is replaced, other text is kept and the
    separator is added after it. tr, td and th tags are separated by a
    single newline, everything else by a blank line.
    '''
    leading = parent.text
    if not leading or isspace(leading):
        leading = ''

    is_table_part = (hasattr(parent.tag, 'strip')
                     and parse_utils.barename(parent.tag) in {'tr', 'td',
                                                              'th'})
    separator = '\n' if is_table_part else '\n\n'
    parent.text = leading + separator + (indent * level)

    for position, child in enumerate(parent):
        if isblock(child) and has_only_blocks(child):
            pretty_block(child, level=level + 1, indent=indent)
        elif child.tag == base.tag('svg', 'svg'):
            pretty_xml_tree(child, level=level, indent=indent)

        # What follows the last child is the closing tag of parent, which
        # sits one level further out.
        if position == len(parent) - 1:
            closing_level = level - 1
        else:
            closing_level = level

        trailing = child.tail
        if not trailing or isspace(trailing):
            trailing = ''
        child.tail = trailing + separator + (indent * closing_level)
def pretty_html_tree(container, root):
    """Pretty print an HTML tree in place.

    The children of ``root`` are separated by blank lines, a ``head`` child
    is formatted with ``pretty_xml_tree`` and every ``body`` with
    ``pretty_block``. When ``container`` is not None, ``script`` and
    ``style`` tags are handed to ``pretty_script_or_style``.
    """
    blank_line = '\n\n'
    root.text = blank_line
    for section in root:
        section.tail = blank_line
        tag = section.tag
        if hasattr(tag, 'endswith') and tag.endswith('}head'):
            pretty_xml_tree(section)

    for body in root.findall('h:body', namespaces=const.XPNSMAP):
        pretty_block(body)
        # Special case the handling of a body that contains a single block tag
        # with all content. In this case we prettify the containing block tag
        # even if it has non block children.
        if len(body) != 1:
            continue
        wrapper = body[0]
        if callable(wrapper.tag):
            continue
        if not isblock(wrapper) or has_only_blocks(wrapper):
            continue
        if parse_utils.barename(wrapper.tag) in ('pre', 'p', 'h1', 'h2',
                                                 'h3', 'h4', 'h5', 'h6'):
            continue
        if len(wrapper) > 0:
            pretty_block(wrapper, level=2)

    if container is None:
        return
    # Handle <script> and <style> tags
    query = '//*[local-name()="script" or local-name()="style"]'
    for node in root.xpath(query):
        pretty_script_or_style(container, node)
def __init__(self, namespace, styles_manager, links_manager, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False, parent_bg=None):
    """Create an empty block for the element ``html_block``.

    @namespace: Stored as ``self.namespace``.
    @styles_manager: Used to create the block style; also stored.
    @links_manager: Stored as ``self.links_manager``.
    @html_block: The source element this block represents.
    @style: Style of ``html_block``; its ``page-break-before`` and
            ``page-break-inside`` values are read here.
    @is_table_cell: Passed on to ``create_block_style``.
    @float_spec: When not None, this block is appended to
                 ``float_spec.blocks``.
    @is_list_item: When true, ``(html_block, style)`` is remembered in
                   ``self.list_tag``.
    @parent_bg: Passed on to ``create_block_style``.
    """
    # Collaborators and the source element.
    self.namespace = namespace
    self.styles_manager = styles_manager
    self.links_manager = links_manager
    self.html_block = html_block
    self.html_style = style
    self.list_tag = (html_block, style) if is_list_item else None
    self.html_tag = parse_utils.barename(html_block.tag)

    self.float_spec = float_spec
    if float_spec is not None:
        float_spec.blocks.append(self)

    self.style = styles_manager.create_block_style(
        style, html_block, is_table_cell=is_table_cell,
        parent_bg=parent_bg)

    # Content collected for the block.
    self.bookmarks = set()
    self.runs = []

    # Flags and links, all starting out unset.
    self.force_not_empty = False
    self.is_first_block = False
    self.keep_next = False
    self.skipped = False
    self.page_break_after = False
    self.numbering_id = None
    self.parent_items = None
    self.linked_style = None
    self.block_lang = None

    # Pagination hints taken from the style.
    self.page_break_before = style['page-break-before'] == 'always'
    self.keep_lines = style['page-break-inside'] == 'avoid'
def dump_text(self, elem, stylizer, page):
    '''
    Convert an element and its descendants to a list of plain text
    fragments. Updates self.last_was_heading.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @page: OEB page used to determine absolute urls.
    @return: List of strings with the text of the element.
    '''
    if (not isinstance(elem.tag, (str, bytes))
            or parse_utils.namespace(elem.tag) != const.XHTML_NS):
        # Not an XHTML tag: keep only the text that follows it, and only
        # when it sits inside an XHTML tag.
        parent = elem.getparent()
        if (parent is not None
                and isinstance(parent.tag, (str, bytes))
                and parse_utils.namespace(parent.tag) == const.XHTML_NS
                and elem.tail):
            return [elem.tail]
        return ['']

    out = ['']
    style = stylizer.style(elem)

    # Hidden elements contribute nothing but the text that follows them.
    if (style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot')
            or style['visibility'] == 'hidden'):
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return ['']

    tag = parse_utils.barename(elem.tag)
    tag_id = elem.attrib.get('id', None)

    # Are we in a heading?
    # This can either be a heading tag or a TOC item.
    in_heading = (tag in HEADING_TAGS
                  or '%s#%s' % (page.href, tag_id) in self.toc_ids)
    if in_heading and not self.last_was_heading:
        out.append('\n\n\n\n\n\n')

    # Are we in a paragraph block?
    in_block = tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES
    if in_block and self.opts.remove_paragraph_spacing and not in_heading:
        out.append('\t')

    if tag in SPACE_TAGS:
        out.append(' ')

    # Hard scene breaks.
    if tag == 'hr':
        out.append('\n\n* * *\n\n')

    # Soft scene breaks.
    try:
        ems = int(round((float(style.marginTop) / style.fontSize) - 1))
        if ems >= 1:
            out.append('\n' * ems)
    except Exception:
        pass

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        out.append(elem.text)

    # Recurse down into tags within the tag we are in.
    for child in elem:
        out.extend(self.dump_text(child, stylizer, page))

    if in_block:
        out.append('\n\n')

    if in_heading:
        out.append('\n')
    self.last_was_heading = in_heading

    if hasattr(elem, 'tail') and elem.tail:
        out.append(elem.tail)

    return out
def dump_text(self, elem_tree, stylizer, page, tag_stack=None):
    """
    Convert an XHTML element, and recursively its children, to FB2 markup.

    This function is intended to be used in a recursive manner. dump_text
    will run through all elements in the elem_tree and call itself on each
    element.

    self.image_hrefs will be populated by calling this function.
    self.section_level and self.toc are updated when sections are opened.

    @param elem_tree: etree representation of XHTML content to be
                      transformed.
    @param stylizer: Used to track the style of elements within the tree.
    @param page: OEB page used to determine absolute urls.
    @param tag_stack: List of open FB2 tags to take into account. Defaults
                      to no open tags. The list is not modified.
    @return: List of string representing the XHTML converted to FB2 markup.
    """
    # Use a fresh list per call rather than a shared mutable default.
    if tag_stack is None:
        tag_stack = []

    elem = elem_tree

    # Ensure what we are converting is not a string and that the first tag
    # is part of the XHTML namespace.
    if (not isinstance(elem_tree.tag, (str, bytes)) or
            parse_utils.namespace(elem_tree.tag) != const.XHTML_NS):
        p = elem.getparent()
        if (p is not None and isinstance(p.tag, (str, bytes)) and
                parse_utils.namespace(p.tag) == const.XHTML_NS and
                elem.tail):
            return [elem.tail]
        return []

    style = stylizer.style(elem_tree)
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return []

    # FB2 generated output.
    fb2_out = []
    # FB2 tags in the order they are opened. This will be used to close
    # the tags.
    tags = []
    # First tag in tree
    tag = parse_utils.barename(elem_tree.tag)
    # Number of blank lines above tag
    try:
        ems = int(round((float(style.marginTop) / style.fontSize) - 1))
        if ems < 0:
            ems = 0
    except Exception:
        ems = 0

    # Convert TOC entries to <title>s and add <section>s
    if self.opts.sectionize == 'toc':
        # A section cannot be a child of any other element than another
        # section, so leave the tag alone if there are parents
        if not tag_stack:
            # There are two reasons to start a new section here: the TOC
            # pointed to this page (then we use the first non-<body> on
            # the page as a <title>), or the TOC pointed to a specific
            # element
            newlevel = 0
            toc_entry = self.toc.get(page.href, None)
            if toc_entry is not None:
                if None in toc_entry:
                    if (tag != 'body' and hasattr(elem_tree, 'text') and
                            elem_tree.text):
                        newlevel = 1
                        self.toc[page.href] = None
                if (not newlevel and
                        elem_tree.attrib.get('id', None) is not None):
                    newlevel = toc_entry.get(
                        elem_tree.attrib.get('id', None), None)

            # Start a new section if necessary
            if newlevel:
                while newlevel <= self.section_level:
                    fb2_out.append('</section>')
                    self.section_level -= 1
                fb2_out.append('<section>')
                self.section_level += 1
                fb2_out.append('<title>')
                tags.append('title')
        if self.section_level == 0:
            # If none of the prior processing made a section, make one now
            # to be FB2 spec compliant
            fb2_out.append('<section>')
            self.section_level += 1

    # Process the XHTML tag and styles. Converted to an FB2 tag.
    # Use individual if statement not if else. There can be only one XHTML
    # tag but it can have multiple styles.
    if tag == 'img' and elem_tree.attrib.get('src', None):
        # Only write the image tag if it is in the manifest.
        ihref = base.urlnormalize(page.abshref(elem_tree.attrib['src']))
        if ihref in self.oeb_book.manifest.hrefs:
            if ihref not in self.image_hrefs:
                self.image_hrefs[ihref] = 'img_%s' % len(self.image_hrefs)
            p_txt, p_tag = self.ensure_p()
            fb2_out += p_txt
            tags += p_tag
            fb2_out.append('<image l:href="#%s"/>' %
                           self.image_hrefs[ihref])
        else:
            self.log.warn(u'Ignoring image not in manifest: %s' % ihref)
    if tag in ('br', 'hr') or ems >= 1:
        if ems < 1:
            multiplier = 1
        else:
            multiplier = ems
        if self.in_p:
            # Close every open tag up to and including the innermost <p>,
            # write the empty lines, then reopen the same tags.
            closed_tags = []
            open_tags = tag_stack + tags
            open_tags.reverse()
            for t in open_tags:
                fb2_out.append('</%s>' % t)
                closed_tags.append(t)
                if t == 'p':
                    break
            fb2_out.append('<empty-line/>' * multiplier)
            closed_tags.reverse()
            for t in closed_tags:
                fb2_out.append('<%s>' % t)
        else:
            fb2_out.append('<empty-line/>' * multiplier)
    if tag in ('div', 'li', 'p'):
        p_text, added_p = self.close_open_p(tag_stack + tags)
        fb2_out += p_text
        if added_p:
            tags.append('p')
    if tag == 'a' and elem_tree.attrib.get('href', None):
        # Handle only external links for now
        if urllib.parse.urlparse(elem_tree.attrib['href']).netloc:
            p_txt, p_tag = self.ensure_p()
            fb2_out += p_txt
            tags += p_tag
            fb2_out.append('<a l:href="%s">' %
                           base.urlnormalize(elem_tree.attrib['href']))
            tags.append('a')
    if tag == 'b' or style['font-weight'] in ('bold', 'bolder'):
        s_out, s_tags = self.handle_simple_tag('strong', tag_stack + tags)
        fb2_out += s_out
        tags += s_tags
    if tag == 'i' or style['font-style'] == 'italic':
        s_out, s_tags = self.handle_simple_tag('emphasis',
                                               tag_stack + tags)
        fb2_out += s_out
        tags += s_tags
    if (tag in ('del', 'strike') or
            style['text-decoration'] == 'line-through'):
        s_out, s_tags = self.handle_simple_tag('strikethrough',
                                               tag_stack + tags)
        fb2_out += s_out
        tags += s_tags
    if tag == 'sub':
        s_out, s_tags = self.handle_simple_tag('sub', tag_stack + tags)
        fb2_out += s_out
        tags += s_tags
    if tag == 'sup':
        s_out, s_tags = self.handle_simple_tag('sup', tag_stack + tags)
        fb2_out += s_out
        tags += s_tags

    # Process element text.
    if hasattr(elem_tree, 'text') and elem_tree.text:
        if not self.in_p:
            fb2_out.append('<p>')
        fb2_out.append(prepare_string_for_xml(elem_tree.text))
        if not self.in_p:
            fb2_out.append('</p>')

    # Process sub-elements.
    for item in elem_tree:
        fb2_out += self.dump_text(item, stylizer, page, tag_stack + tags)

    # Close open FB2 tags.
    tags.reverse()
    fb2_out += self.close_tags(tags)

    # Process element text that comes after the close of the XHTML tag but
    # before the next XHTML tag.
    if hasattr(elem_tree, 'tail') and elem_tree.tail:
        if not self.in_p:
            fb2_out.append('<p>')
        fb2_out.append(prepare_string_for_xml(elem_tree.tail))
        if not self.in_p:
            fb2_out.append('</p>')

    return fb2_out
def mobimlize_elem(self, elem, stylizer, bstate, istates, ignore_valign=False):
    """Convert one XHTML element, and recursively its children, to MobiML.

    Elements that are not XHTML tags are ignored. A hidden element is
    skipped unless it has an id, in which case it is reduced to an empty
    ``a`` anchor, or carries
    ``data-calibre-jacket-searchable-tags="1"``. A copy of the innermost
    format state is pushed on ``istates``, filled from the computed style
    and used while the text of the element and of its children is handed
    to ``mobimlize_content``.

    @elem: The element to convert. It may be modified in place (text, tail,
           tag and attributes).
    @stylizer: Provides the computed style through ``stylizer.style(elem)``.
    @bstate: Block level output state; updated in place.
    @istates: Stack (list) of inline format states; the last entry is the
              innermost one.
    @ignore_valign: When true the element is not wrapped in ``sup``/``sub``
                    even if its vertical alignment is not the baseline.
    """
    if not isinstance(elem.tag, (str, bytes)) \
            or parse_utils.namespace(elem.tag) != const.XHTML_NS:
        return
    style = stylizer.style(elem)
    # <mbp:frame-set/> does not exist lalalala
    if ((style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or
         style['visibility'] == 'hidden') and
            elem.get('data-calibre-jacket-searchable-tags', None) != '1'):
        id_ = elem.get('id', None)
        if id_:
            # Keep anchors so people can use display:none
            # to generate hidden TOCs
            tail = elem.tail
            elem.clear()
            elem.text = None
            elem.set('id', id_)
            elem.tail = tail
            elem.tag = base.tag('xhtml', 'a')
        else:
            return
    tag = parse_utils.barename(elem.tag)
    istate = copy.copy(istates[-1])
    istate.rendered = False
    istate.list_num = 0
    if tag == 'ol' and 'start' in elem.attrib:
        try:
            istate.list_num = int(elem.attrib['start'])-1
        except Exception:
            # Best effort: an unusable start value keeps numbering at 0.
            pass
    istates.append(istate)
    left = 0
    display = style['display']
    if display == 'table-cell':
        display = 'inline'
    elif display.startswith('table'):
        display = 'block'
    isblock = (not display.startswith('inline') and
               style['display'] != 'none')
    isblock = isblock and style['float'] == 'none'
    isblock = isblock and tag != 'br'
    if isblock:
        bstate.para = None
        istate.halign = style['text-align']
        rawti = style._get('text-indent')
        istate.indent = style['text-indent']
        if hasattr(rawti, 'strip') and '%' in rawti:
            # We have a percentage text indent, these can come out looking
            # too large if the user chooses a wide output profile like
            # tablet
            istate.indent = min(style._unit_convert(rawti, base=500),
                                istate.indent)
        if style['margin-left'] == 'auto' \
                and style['margin-right'] == 'auto':
            istate.halign = 'center'
        margin = asfloat(style['margin-left'])
        padding = asfloat(style['padding-left'])
        if tag != 'body':
            left = margin + padding
        istate.left += left
        vmargin = asfloat(style['margin-top'])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-top'])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    elif not istate.href:
        # Inline element outside a link: horizontal margins and padding
        # are approximated with non-breaking spaces.
        margin = asfloat(style['margin-left'])
        padding = asfloat(style['padding-left'])
        lspace = margin + padding
        if lspace > 0:
            spaces = int(round((lspace * 3) / style['font-size']))
            elem.text = ('\xa0' * spaces) + (elem.text or '')
        margin = asfloat(style['margin-right'])
        padding = asfloat(style['padding-right'])
        rspace = margin + padding
        if rspace > 0:
            spaces = int(round((rspace * 3) / style['font-size']))
            if len(elem) == 0:
                elem.text = (elem.text or '') + ('\xa0' * spaces)
            else:
                last = elem[-1]
                last.text = (last.text or '') + ('\xa0' * spaces)
    if bstate.content and style['page-break-before'] in PAGE_BREAKS:
        bstate.pbreak = True
    istate.fsize = self.mobimlize_font(style['font-size'])
    istate.italic = True if style['font-style'] == 'italic' else False
    weight = style['font-weight']
    istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
    istate.preserve = style['white-space'] == 'pre'
    istate.pre_wrap = style['white-space'] == 'pre-wrap'
    istate.bgcolor = style['background-color']
    istate.fgcolor = style['color']
    istate.strikethrough = style.effective_text_decoration == 'line-through'
    istate.underline = style.effective_text_decoration == 'underline'
    ff = style['font-family'].lower() if hasattr(style['font-family'], 'lower') else ''
    if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
        istate.family = 'monospace'
    elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or
          'arial' in ff or 'helvetica' in ff):
        istate.family = 'sans-serif'
    else:
        istate.family = 'serif'
    if 'id' in elem.attrib:
        istate.ids.add(elem.attrib['id'])
    if 'name' in elem.attrib:
        istate.ids.add(elem.attrib['name'])
    if tag == 'a' and 'href' in elem.attrib:
        istate.href = elem.attrib['href']
    istate.attrib.clear()
    if tag == 'img' and 'src' in elem.attrib:
        istate.attrib['src'] = elem.attrib['src']
        istate.attrib['align'] = 'baseline'
        cssdict = style.cssdict()
        valign = cssdict.get('vertical-align', None)
        if valign in ('top', 'bottom', 'middle'):
            istate.attrib['align'] = valign
        for prop in ('width', 'height'):
            if cssdict[prop] != 'auto':
                value = style[prop]
                if value == getattr(self.profile, prop):
                    result = '100%'
                else:
                    # Amazon's renderer does not support
                    # img sizes in units other than px
                    # See #7520 for test case
                    try:
                        pixs = int(round(float(value) /
                                         (72/self.profile.dpi)))
                    except Exception:
                        continue
                    result = str(pixs)
                istate.attrib[prop] = result
        if 'width' not in istate.attrib or 'height' not in istate.attrib:
            # At least one dimension is missing: derive it from the image
            # data itself.
            href = self.current_spine_item.abshref(elem.attrib['src'])
            try:
                item = self.oeb.manifest.hrefs[base.urlnormalize(href)]
            except Exception:
                self.oeb.logger.warn('Failed to find image:', href)
            else:
                try:
                    width, height = identify(item.data)[1:]
                except Exception:
                    self.oeb.logger.warn('Invalid image:', href)
                else:
                    if 'width' not in istate.attrib and 'height' not in \
                            istate.attrib:
                        istate.attrib['width'] = str(width)
                        istate.attrib['height'] = str(height)
                    else:
                        # Only one dimension is known: scale the other
                        # one with the aspect ratio of the image.
                        ar = width / height
                        if 'width' not in istate.attrib:
                            try:
                                width = int(istate.attrib['height'])*ar
                            except Exception:
                                # Keep the width read from the image.
                                pass
                            istate.attrib['width'] = str(int(width))
                        else:
                            try:
                                height = int(istate.attrib['width'])/ar
                            except Exception:
                                # Keep the height read from the image.
                                pass
                            istate.attrib['height'] = str(int(height))
                    item.unload_data_from_memory()
    elif tag == 'hr' and asfloat(style['width']) > 0 and style._get('width') not in {'100%', 'auto'}:
        raww = style._get('width')
        if hasattr(raww, 'strip') and '%' in raww:
            istate.attrib['width'] = raww
        else:
            prop = style['width'] / self.profile.width
            istate.attrib['width'] = "%d%%" % int(round(prop * 100))
    elif display == 'table':
        tag = 'table'
    elif display == 'table-row':
        tag = 'tr'
    elif display == 'table-cell':
        tag = 'td'
    if tag in TABLE_TAGS and self.ignore_tables:
        tag = 'span' if tag == 'td' else 'div'

    if tag in ('table', 'td', 'tr'):
        col = style.backgroundColor
        if col:
            elem.set('bgcolor', col)
        css = style.cssdict()
        if 'border' in css or 'border-width' in css:
            elem.set('border', '1')
    if tag in TABLE_TAGS:
        for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
                     'bgcolor'):
            if attr in elem.attrib:
                istate.attrib[attr] = elem.attrib[attr]
    if tag == 'q':
        # Wrap the quotation in curly double quotes.
        t = elem.text
        if not t:
            t = ''
        elem.text = '\u201c' + t
        t = elem.tail
        if not t:
            t = ''
        elem.tail = '\u201d' + t
    text = None
    if elem.text:
        if istate.preserve or istate.pre_wrap:
            text = elem.text
        elif (len(elem) > 0 and isspace(elem.text) and
              hasattr(elem[0].tag, 'rpartition') and
              elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
            text = None
        else:
            text = COLLAPSE.sub(' ', elem.text)
    valign = style['vertical-align']
    not_baseline = valign in ('super', 'sub', 'text-top',
                              'text-bottom', 'top', 'bottom') or (
        isinstance(valign, numbers.Number) and abs(valign) != 0)
    issup = valign in ('super', 'text-top', 'top') or (
        isinstance(valign, numbers.Number) and valign > 0)
    vtag = 'sup' if issup else 'sub'
    if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
        # Render the element into a scratch tree first, then move the
        # result into a <sup>/<sub><small> wrapper in the real output.
        nroot = etree.Element(base.tag('xhtml', 'html'), nsmap=MOBI_NSMAP)
        vbstate = BlockState(etree.SubElement(nroot,
                                              base.tag('xhtml', 'body')))
        vbstate.para = etree.SubElement(vbstate.body,
                                        base.tag('xhtml', 'p'))
        self.mobimlize_elem(elem, stylizer, vbstate, istates,
                            ignore_valign=True)
        if len(istates) > 0:
            istates.pop()
        if len(istates) == 0:
            istates.append(FormatState())
        at_start = bstate.para is None
        if at_start:
            self.mobimlize_content('span', '', bstate, istates)
        parent = bstate.para if bstate.inline is None else bstate.inline
        if parent is not None:
            vtag = etree.SubElement(parent, base.tag('xhtml', vtag))
            vtag = etree.SubElement(vtag, base.tag('xhtml', 'small'))
            # Add anchors
            for child in vbstate.body:
                if child is not vbstate.para:
                    vtag.append(child)
                else:
                    break
            if vbstate.para is not None:
                if vbstate.para.text:
                    vtag.text = vbstate.para.text
                for child in vbstate.para:
                    vtag.append(child)
            return

    if tag == 'blockquote':
        # Margins are always honoured inside a blockquote; the option is
        # restored once the children have been processed.
        old_mim = self.opts.mobi_ignore_margins
        self.opts.mobi_ignore_margins = False

    if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
            # We have an id but no text and no children, the id should still
            # be added.
            istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and
            len(elem) == 0)):
        if tag == 'li' and len(istates) > 1 and 'value' in elem.attrib:
            try:
                value = int(elem.attrib['value'])
                istates[-2].list_num = value - 1
            except Exception:
                # Best effort: ignore an unusable value attribute.
                pass
        self.mobimlize_content(tag, text, bstate, istates)
    for child in elem:
        self.mobimlize_elem(child, stylizer, bstate, istates)
        tail = None
        if child.tail:
            if istate.preserve or istate.pre_wrap:
                tail = child.tail
            elif bstate.para is None and isspace(child.tail):
                tail = None
            else:
                tail = COLLAPSE.sub(' ', child.tail)
        if tail:
            self.mobimlize_content(tag, tail, bstate, istates)

    if tag == 'blockquote':
        self.opts.mobi_ignore_margins = old_mim

    if bstate.content and style['page-break-after'] in PAGE_BREAKS:
        bstate.pbreak = True
    if isblock:
        para = bstate.para
        if para is not None and para.text == '\xa0' and len(para) < 1:
            if style.height > 2:
                para.getparent().replace(
                    para, etree.Element(base.tag('xhtml', 'br')))
            else:
                # This is too small to be rendered effectively, drop it
                para.getparent().remove(para)
        bstate.para = None
        bstate.istate = None
        vmargin = asfloat(style['margin-bottom'])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-bottom'])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    if bstate.nested and bstate.nested[-1].tag == elem.tag:
        bstate.nested.pop()
    istates.pop()
def mobimlize_content(self, tag, text, bstate, istates):
    '''
    Convert text content

    Adds ``text`` to the output tree held by ``bstate``, creating the
    enclosing paragraph-like element and the inline formatting elements
    described by the innermost format state when needed.

    @tag: Bare name of the tag the text belongs to.
    @text: The text to add; may be empty or None, in which case only the
           structural elements and pending anchors are created.
    @bstate: Block level output state; updated in place.
    @istates: Stack (list) of inline format states; ``istates[-1]`` is the
              one used here.
    '''
    if text or tag != 'br':
        bstate.content = True
    istate = istates[-1]
    para = bstate.para
    if tag in SPECIAL_TAGS and not text:
        para = para if para is not None else bstate.body
    elif para is None or tag in ('td', 'th'):
        # No paragraph is open (or a new table cell starts): create the
        # block level element that will hold the content.
        body = bstate.body
        if bstate.pbreak:
            etree.SubElement(body, MBP('pagebreak'))
            bstate.pbreak = False
        bstate.istate = None
        bstate.anchor = None
        parent = bstate.nested[-1] if bstate.nested else bstate.body
        indent = istate.indent
        left = istate.left
        if isinstance(indent, (str, bytes)):
            indent = 0
        if indent < 0 and abs(indent) < left:
            left += indent
            indent = 0
        elif indent != 0 and abs(indent) < self.profile.fbase:
            # Keep the sign of the indent but make it at least fbase big.
            indent = (indent / abs(indent)) * self.profile.fbase
        if tag in NESTABLE_TAGS and not istate.rendered:
            para = wrapper = etree.SubElement(
                parent, base.tag('xhtml', tag), attrib=istate.attrib)
            bstate.nested.append(para)
            if tag == 'li' and len(istates) > 1:
                istates[-2].list_num += 1
                para.attrib['value'] = str(istates[-2].list_num)
        elif tag in NESTABLE_TAGS and istate.rendered:
            para = wrapper = bstate.nested[-1]
        elif not self.opts.mobi_ignore_margins and left > 0 and indent >= 0:
            # The left margin is expressed as nested blockquotes.
            ems = self.profile.mobi_ems_per_blockquote
            para = wrapper = etree.SubElement(parent,
                                              base.tag('xhtml',
                                                       'blockquote'))
            para = wrapper
            emleft = int(round(left / self.profile.fbase)) - ems
            emleft = min((emleft, 10))
            while emleft > ems / 2:
                para = etree.SubElement(para,
                                        base.tag('xhtml', 'blockquote'))
                emleft -= ems
        else:
            para = wrapper = etree.SubElement(parent,
                                              base.tag('xhtml', 'p'))
        bstate.inline = bstate.para = para
        # Pending vertical space is consumed by this block.
        vspace = bstate.vpadding + bstate.vmargin
        bstate.vpadding = bstate.vmargin = 0
        if tag not in TABLE_TAGS:
            if tag in ('ul', 'ol') and vspace > 0:
                wrapper.addprevious(etree.Element(
                    base.tag('xhtml', 'div'),
                    height=self.mobimlize_measure(vspace)))
            else:
                wrapper.attrib['height'] = self.mobimlize_measure(vspace)
            para.attrib['width'] = self.mobimlize_measure(indent)
        elif tag == 'table' and vspace > 0:
            vspace = int(round(vspace / self.profile.fbase))
            while vspace > 0:
                wrapper.addprevious(etree.Element(base.tag('xhtml', 'br')))
                vspace -= 1
        if istate.halign != 'auto' and isinstance(istate.halign, (bytes, str)):
            if isinstance(istate.halign, bytes):
                istate.halign = istate.halign.decode('utf-8')
            para.attrib['align'] = istate.halign
    istate.rendered = True
    pstate = bstate.istate
    if tag in CONTENT_TAGS:
        bstate.inline = para
        pstate = bstate.istate = None
        try:
            etree.SubElement(para, base.tag('xhtml', tag),
                             attrib=istate.attrib)
        except:
            # Print some context, then let the error propagate.
            print('Invalid subelement:', para, tag, istate.attrib)
            raise
    elif tag in TABLE_TAGS:
        para.attrib['valign'] = 'top'
    if istate.ids:
        # Emit an <a id="..."> anchor for every id collected so far.
        for id_ in istate.ids:
            anchor = etree.Element(base.tag('xhtml', 'a'),
                                   attrib={'id': id_})
            if tag == 'li':
                try:
                    last = bstate.body[-1][-1]
                # NOTE(review): bare except; presumably only an IndexError
                # is expected here - confirm before narrowing.
                except:
                    break
                last.insert(0, anchor)
                anchor.tail = last.text
                last.text = None
            else:
                last = bstate.body[-1]
                # We use append instead of addprevious so that inline
                # anchors in large blocks point to the correct place. See
                # https://bugs.launchpad.net/calibre/+bug/899831
                # This could potentially break if inserting an anchor at
                # this point in the markup is illegal, but I cannot think
                # of such a case offhand.
                if parse_utils.barename(last.tag) in LEAF_TAGS:
                    last.addprevious(anchor)
                else:
                    last.append(anchor)

        istate.ids.clear()
    if not text:
        return
    if not pstate or istate != pstate:
        # The formatting differs from that of the previous text run:
        # build a new chain of nested inline elements.
        inline = para
        fsize = istate.fsize
        href = istate.href
        if not href:
            bstate.anchor = None
        elif pstate and pstate.href == href:
            inline = bstate.anchor
        else:
            inline = etree.SubElement(inline, base.tag('xhtml', 'a'),
                                      href=href)
            bstate.anchor = inline

        if fsize != 3:
            inline = etree.SubElement(inline, base.tag('xhtml', 'font'),
                                      size=str(fsize))
        if istate.family == 'monospace':
            inline = etree.SubElement(inline, base.tag('xhtml', 'tt'))
        if istate.italic:
            inline = etree.SubElement(inline, base.tag('xhtml', 'i'))
        if istate.bold:
            inline = etree.SubElement(inline, base.tag('xhtml', 'b'))
        if istate.bgcolor is not None and istate.bgcolor != 'transparent' :
            inline = etree.SubElement(
                inline, base.tag('xhtml', 'span'),
                bgcolor=convert_color_for_font_tag(istate.bgcolor))
        if istate.fgcolor != 'black':
            inline = etree.SubElement(
                inline, base.tag('xhtml', 'font'),
                color=convert_color_for_font_tag(istate.fgcolor))
        if istate.strikethrough:
            inline = etree.SubElement(inline, base.tag('xhtml', 's'))
        if istate.underline:
            inline = etree.SubElement(inline, base.tag('xhtml', 'u'))
        bstate.inline = inline
    bstate.istate = istate
    inline = bstate.inline
    content = self.preize_text(text, pre_wrap=istate.pre_wrap) if istate.preserve or istate.pre_wrap else [text]
    for item in content:
        if isinstance(item, (str, bytes)):
            # Strings go after the last child, or into the text of the
            # element when it has no children yet.
            if len(inline) == 0:
                inline.text = (inline.text or '') + item
            else:
                last = inline[-1]
                last.tail = (last.tail or '') + item
        else:
            inline.append(item)
def dckey(x):
    """Return a sort key for an element based on the bare name of its tag.

    ``title`` yields 0, ``creator`` yields 1 and every other name 2.
    """
    name = parse_utils.barename(x.tag)
    if name == 'title':
        return 0
    if name == 'creator':
        return 1
    return 2
def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id, recurse=True):
    """Flatten the style of ``node`` into a generated CSS class.

    Presentational attributes (``align``, ``valign``, ``size``, ``face``,
    ``color``, ``bgcolor``) are folded into the CSS dictionary obtained from
    ``stylizer``, the font size is rescaled relative to ``psize``, and the
    resulting declarations are registered in ``styles`` / ``pseudo_styles``.
    The element's ``class`` attribute is replaced by the generated class
    name(s) and its ``style`` attribute is removed.

    :param node: Element to process. Anything that is not an XHTML element
        (comments, processing instructions, foreign namespaces) is ignored.
    :param stylizer: Object whose ``style(elem)`` returns the computed style.
    :param names: Mapping of class-name stem to the number of classes
        already generated from it; updated in place.
    :param styles: Mapping of serialized CSS text to generated class name;
        updated in place.
    :param pseudo_styles: Mapping of pseudo selector to a mapping of
        serialized CSS text to generated class name; updated in place.
    :param psize: Font size of the parent, used as the divisor when
        expressing sizes in ``em``.
    :param item_id: Id of the item being processed; ``'calibre_jacket'``
        gets special treatment when paragraph spacing is removed.
    :param recurse: When true, the children of ``node`` are processed too.
    """
    if not isinstance(node.tag, (str, bytes)) \
            or parse_utils.namespace(node.tag) != const.XHTML_NS:
        return
    tag = parse_utils.barename(node.tag)
    style = stylizer.style(node)
    cssdict = style.cssdict()
    try:
        font_size = style['font-size']
    except Exception:
        # Best effort: fall back to the base size, but do not swallow
        # KeyboardInterrupt/SystemExit like a bare ``except`` would.
        font_size = self.sbase if self.sbase is not None else \
            self.context.source.fbase
    if tag == 'body' and isinstance(font_size, numbers.Number):
        stylizer.body_font_size = font_size
    if 'align' in node.attrib:
        if tag != 'img':
            cssdict['text-align'] = node.attrib['align']
            if cssdict['text-align'] == 'center':
                # align=center causes tables to be center aligned,
                # which text-align does not. And the ever trustworthy Word
                # uses this construct in its HTML output. See
                # https://bugs.launchpad.net/bugs/1569583
                if tag == 'table':
                    if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
                        cssdict['margin-left'] = cssdict['margin-right'] = 'auto'
                else:
                    for table in node.iterchildren(base.tag('xhtml', "table")):
                        ts = stylizer.style(table)
                        if ts.get('margin-left') is None and ts.get('margin-right') is None:
                            ts.set('margin-left', 'auto')
                            ts.set('margin-right', 'auto')
        else:
            val = node.attrib['align']
            if val in ('middle', 'bottom', 'top'):
                cssdict['vertical-align'] = val
            elif val in ('left', 'right'):
                cssdict['float'] = val
        del node.attrib['align']
    if 'valign' in node.attrib and tag == 'td':
        if cssdict.get('vertical-align') == 'inherit':
            cssdict['vertical-align'] = node.attrib['valign']
        del node.attrib['valign']
    if node.tag == base.tag('xhtml', 'font'):
        tags = ['descendant::h:%s' % x
                for x in ('p', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5',
                          'h6', 'ol', 'ul', 'dl', 'blockquote')]
        # TODO(gryf): this rebinds ``tag`` (computed from node.tag at the top
        # of this function), so every later check sees 'div'/'span' instead
        # of 'font'. On purpose?
        tag = 'div' if base.XPath('|'.join(tags))(node) else 'span'
        node.tag = base.tag('xhtml', tag)
        if 'size' in node.attrib:
            def force_int(raw):
                return int(re.search(r'([0-9+-]+)', raw).group(1))
            size = node.attrib['size'].strip()
            if size:
                fnums = self.context.source.fnums
                if size[0] in ('+', '-'):
                    # Oh, the warcrimes
                    try:
                        esize = 3 + force_int(size)
                    except Exception:
                        esize = 3
                    if esize < 1:
                        esize = 1
                    if esize > 7:
                        esize = 7
                    font_size = fnums[esize]
                else:
                    try:
                        font_size = fnums[force_int(size)]
                    except Exception:
                        font_size = fnums[3]
                cssdict['font-size'] = '%.1fpt' % font_size
            del node.attrib['size']
        if 'face' in node.attrib:
            cssdict['font-family'] = node.attrib['face']
            del node.attrib['face']
    if 'color' in node.attrib:
        try:
            cssdict['color'] = cp_css.Property('color', node.attrib['color']).value
        except (ValueError, dom.SyntaxErr):
            pass
        del node.attrib['color']
    if 'bgcolor' in node.attrib:
        try:
            cssdict['background-color'] = cp_css.Property(
                'background-color', node.attrib['bgcolor']).value
        except (ValueError, dom.SyntaxErr):
            pass
        del node.attrib['bgcolor']
    if tag == 'ol' and 'type' in node.attrib:
        del node.attrib['type']
    if cssdict.get('font-weight', '').lower() == 'medium':
        cssdict['font-weight'] = 'normal'  # ADE chokes on font-weight medium

    fsize = font_size
    is_drop_cap = (
        cssdict.get('float', None) == 'left' and 'font-size' in cssdict
        and len(node) == 0 and node.text
        and (len(node.text) == 1
             or (len(node.text) == 2 and 0x2000 <= ord(node.text[0]) <= 0x206f)))
    # Detect drop caps generated by the docx input plugin
    if node.tag and node.tag.endswith('}p') and len(node) == 0 and node.text \
            and len(node.text.strip()) == 1 and not node.tail \
            and 'line-height' in cssdict and 'font-size' in cssdict:
        dp = node.getparent()
        # A parentless node cannot sit inside a floated wrapper <div>.
        if dp is not None and dp.tag and dp.tag.endswith('}div') \
                and len(dp) == 1 and not dp.text:
            if stylizer.style(dp).cssdict().get('float', None) == 'left':
                is_drop_cap = True
    if not self.context.disable_font_rescaling and not is_drop_cap:
        _sbase = self.sbase if self.sbase is not None else \
            self.context.source.fbase
        dyn_rescale = node.attrib.pop('data-calibre-rescale', None)
        if dyn_rescale is not None:
            try:
                dyn_rescale = float(dyn_rescale) / 100
            except Exception:
                dyn_rescale = 1
            fsize = self.fmap[_sbase]
            fsize *= dyn_rescale
            cssdict['font-size'] = '%0.5fem' % (fsize / psize)
            psize = fsize
        elif 'font-size' in cssdict or tag == 'body':
            fsize = self.fmap[font_size]
            try:
                cssdict['font-size'] = "%0.5fem" % (fsize / psize)
            except ZeroDivisionError:
                cssdict['font-size'] = '%.1fpt' % fsize
            psize = fsize

    try:
        minlh = self.context.minimum_line_height / 100.
        slh = style['line-height']
        if not is_drop_cap and isinstance(slh, numbers.Number) \
                and slh < minlh * fsize:
            cssdict['line-height'] = str(minlh)
    except Exception:
        self.oeb.logger.exception('Failed to set minimum line-height')

    if cssdict:
        for x in self.filter_css:
            popval = cssdict.pop(x, None)
            # A filtered font-family is put back when its first entry
            # matches the first entry of the body font family.
            if self.body_font_family and popval and x == 'font-family' \
                    and popval.partition(',')[0][1:-1] == self.body_font_family.partition(',')[0][1:-1]:
                cssdict[x] = popval

    if cssdict:
        if self.lineh and self.fbase and tag != 'body':
            self.clean_edges(cssdict, style, psize)
        if 'display' in cssdict and cssdict['display'] == 'in-line':
            cssdict['display'] = 'inline'
        if self.unfloat and 'float' in cssdict \
                and cssdict.get('display', 'none') != 'none':
            del cssdict['display']
        if self.untable and 'display' in cssdict \
                and cssdict['display'].startswith('table'):
            display = cssdict['display']
            if display == 'table-cell':
                cssdict['display'] = 'inline'
            else:
                cssdict['display'] = 'block'
        if 'vertical-align' in cssdict \
                and cssdict['vertical-align'] == 'sup':
            cssdict['vertical-align'] = 'super'
    if self.lineh and 'line-height' not in cssdict:
        lineh = self.lineh / psize
        cssdict['line-height'] = "%0.5fem" % lineh

    if (self.context.remove_paragraph_spacing or self.context.insert_blank_line) \
            and tag in ('p', 'div'):
        if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
            for prop in ('margin', 'padding', 'border'):
                for edge in ('top', 'bottom'):
                    cssdict['%s-%s' % (prop, edge)] = '0pt'
        if self.context.insert_blank_line:
            cssdict['margin-top'] = cssdict['margin-bottom'] = \
                '%fem' % self.context.insert_blank_line_size
        indent_size = self.context.remove_paragraph_spacing_indent_size
        keep_indents = indent_size < 0.0
        if (self.context.remove_paragraph_spacing and not keep_indents
                and cssdict.get('text-align', None) not in ('center', 'right')):
            cssdict['text-indent'] = "%1.1fem" % indent_size

    pseudo_classes = style.pseudo_classes(self.filter_css)
    if cssdict or pseudo_classes:
        # An insertion-ordered list (instead of a set) keeps the generated
        # class attribute identical from run to run; joining a set of
        # strings depends on hash randomisation.
        keep_classes = []

        if cssdict:
            items = sorted(cssdict.items())
            css = ';\n'.join(u'%s: %s' % (key, val) for key, val in items)
            classes = node.get('class', '').strip() or 'calibre'
            classes_list = classes.split()
            # lower() because otherwise if the document uses the same class
            # name with different case, both cases will apply, leading
            # to incorrect results.
            klass = ascii_text(STRIPNUM.sub(
                '', classes_list[0])).lower().strip().replace(' ', '_')
            if css in styles:
                match = styles[css]
            else:
                match = klass + str(names[klass] or '')
                styles[css] = match
                names[klass] += 1
            node.attrib['class'] = match
            keep_classes.append(match)

        for psel, pseudo_cssdict in pseudo_classes.items():
            items = sorted(pseudo_cssdict.items())
            css = ';\n'.join('%s: %s' % (key, val) for key, val in items)
            pstyles = pseudo_styles[psel]
            if css in pstyles:
                match = pstyles[css]
            else:
                # We have to use a different class for each psel as
                # otherwise you can have incorrect styles for a situation
                # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                # If the pcalibre class for a:hover and a:link is the same,
                # then the class attribute for a.x tags will contain both
                # that class and the class for a.x:hover, which is wrong.
                klass = 'pcalibre'
                match = klass + str(names[klass] or '')
                pstyles[css] = match
                names[klass] += 1
            if match not in keep_classes:
                keep_classes.append(match)
        node.attrib['class'] = ' '.join(keep_classes)

    elif 'class' in node.attrib:
        del node.attrib['class']
    if 'style' in node.attrib:
        del node.attrib['style']
    if recurse:
        for child in node:
            self.flatten_node(child, stylizer, names, styles, pseudo_styles,
                              psize, item_id)
def workaround_ade_quirks(self):  # {{{
    """
    Perform various markup transforms to get the output to render
    correctly in the quirky ADE.

    The TOC entries, every spine item's tree and the main stylesheet (when
    there is one) are modified in place. Nothing is returned.
    """
    stylesheet = self.oeb.manifest.main_stylesheet

    # Compiled once instead of once per spine item.
    frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
    special_chars = re.compile('[\u200b\u00ad]')
    svg_types = {'image/svg+xml', 'application/svg+xml'}

    def remove_keeping_tail(elem):
        # Removing an element from an lxml tree also discards its tail,
        # i.e. the text that follows it inside the parent. That text is
        # real content, so hand it to whatever precedes the element first.
        parent = elem.getparent()
        if elem.tail:
            previous = elem.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + elem.tail
            else:
                parent.text = (parent.text or '') + elem.tail
        parent.remove(elem)

    # ADE cries big wet tears when it encounters an invalid fragment
    # identifier in the NCX toc.
    for node in self.oeb.toc.iter():
        href = getattr(node, 'href', None)
        if hasattr(href, 'partition'):
            _base, _, frag = href.partition('#')
            frag = urllib.parse.unquote(frag)
            if frag and frag_pat.match(frag) is None:
                self.log.warn(
                    'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
                node.href = _base

    for item in self.oeb.spine:
        root = item.data
        body = base.XPath('//h:body')(root)
        if body:
            body = body[0]

        if hasattr(body, 'xpath'):
            # remove <img> tags with empty src elements
            bad = []
            for img in base.XPath('//h:img')(body):
                src = img.get('src', '').strip()
                if src in ('', '#') or src.startswith('http:'):
                    bad.append(img)
            for img in bad:
                remove_keeping_tail(img)

            # Add id attribute to <a> tags that have name
            for anchor in base.XPath('//h:a[@name]')(body):
                if not anchor.get('id', False):
                    anchor.set('id', anchor.get('name'))
                # The delightful epubcheck has started complaining about
                # <a> tags that have name attributes.
                anchor.attrib.pop('name')

            # Replace <br> that are children of <body> as ADE doesn't
            # handle them
            for br in base.XPath('./h:br')(body):
                if br.getparent() is None:
                    continue
                try:
                    prior = next(br.itersiblings(preceding=True))
                    priortag = parse_utils.barename(prior.tag)
                    priortext = prior.tail
                except Exception:
                    # No usable preceding sibling: treat <body> itself as
                    # what comes before this <br>.
                    priortag = 'body'
                    priortext = body.text
                if priortext:
                    priortext = priortext.strip()
                br.tag = base.tag('xhtml', 'p')
                br.text = '\u00a0'
                style = [part.strip() for part in br.get('style', '').split(';')]
                style = [part for part in style if part]
                style.append('margin:0pt; border:0pt')
                # If the prior tag is a block (including a <br> we replaced)
                # then this <br> replacement should have a 1-line height.
                # Otherwise it should have no height.
                if not priortext and priortag in block_level_tags:
                    style.append('height:1em')
                else:
                    style.append('height:0pt')
                br.set('style', '; '.join(style))

        for tag in base.XPath('//h:embed')(root):
            remove_keeping_tail(tag)
        for tag in base.XPath('//h:object')(root):
            if tag.get('type', '').lower().strip() in svg_types:
                continue
            remove_keeping_tail(tag)

        for tag in base.XPath('//h:title|//h:style')(root):
            if not tag.text:
                remove_keeping_tail(tag)
        for tag in base.XPath('//h:script')(root):
            if (not tag.text and not tag.get('src', False)
                    and tag.get('type', None) != 'text/x-mathjax-config'):
                remove_keeping_tail(tag)
        for tag in base.XPath('//h:body/descendant::h:script')(root):
            remove_keeping_tail(tag)

        formchildren = base.XPath('./h:input|./h:button|./h:textarea|'
                                  './h:label|./h:fieldset|./h:legend')
        for tag in base.XPath('//h:form')(root):
            if formchildren(tag):
                remove_keeping_tail(tag)
            else:
                # Not a real form
                tag.tag = base.tag('xhtml', 'div')

        for tag in base.XPath('//h:center')(root):
            tag.tag = base.tag('xhtml', 'div')
            tag.set('style', 'text-align:center')
        # ADE can't handle & in an img url
        for tag in base.XPath('//h:img[@src]')(root):
            tag.set('src', tag.get('src', '').replace('&', ''))

        # ADE whimpers in fright when it encounters a <td> outside a
        # <table>
        in_table = base.XPath('ancestor::h:table')
        for tag in base.XPath('//h:td|//h:tr|//h:th')(root):
            if not in_table(tag):
                tag.tag = base.tag('xhtml', 'div')

        # ADE fails to render non breaking hyphens/soft hyphens/zero width
        # spaces
        for elem in root.iterdescendants('*'):
            if elem.text:
                elem.text = special_chars.sub('', elem.text)
                elem.text = elem.text.replace('\u2011', '-')
            if elem.tail:
                elem.tail = special_chars.sub('', elem.tail)
                elem.tail = elem.tail.replace('\u2011', '-')

        if stylesheet is not None:
            # ADE doesn't render lists correctly if they have left margins
            from css_parser.css import CSSRule
            for lb in base.XPath('//h:ul[@class]|//h:ol[@class]')(root):
                sel = '.'+lb.get('class')
                for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                    if sel == rule.selectorList.selectorText:
                        rule.style.removeProperty('margin-left')
                        # padding-left breaks rendering in webkit and gecko
                        rule.style.removeProperty('padding-left')
            # Change whitespace:pre to pre-wrap to accommodate readers that
            # cannot scroll horizontally
            for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                style = rule.style
                ws = style.getPropertyValue('white-space')
                if ws == 'pre':
                    style.setProperty('white-space', 'pre-wrap')
def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None):
    """Convert ``html_tag``, its children and its tail text into blocks.

    The element is dispatched on its ``display`` value to an inline tag, a
    list item, a table part or an ordinary block. Its children are then
    processed recursively, and finally the text following the element is
    added to a block created from the element's parent.

    :param html_tag: Element to convert.
    :param stylizer: Object whose ``style(elem)`` returns the element style.
    :param is_first_tag: When true the element is never treated as floating
        and its tail text is ignored.
    :param float_spec: Float specification handed down from an ancestor, or
        ``None`` when there is none.
    """
    name = parse_utils.barename(html_tag.tag)
    style = stylizer.style(html_tag)
    skip_contents = (name in {'script', 'style', 'title', 'meta'}
                     or style.is_hidden)
    display = style._get('display')
    opened_block = False

    if not skip_contents:
        # Link and language are scoped to this element; both are restored
        # once its children have been handled.
        saved_link = self.current_link
        if name == 'a' and html_tag.get('href'):
            self.current_link = (self.current_item, html_tag.get('href'),
                                 html_tag.get('title'))
        saved_lang = self.current_lang
        lang = lang_for_tag(html_tag)
        if lang:
            self.current_lang = lang

        floats = style['float'] in {'left', 'right'} and not is_first_tag
        if floats and float_spec is None:
            float_spec = FloatSpec(self.docx.namespace, html_tag, style)

        # <br> has display:block but we dont want to start a new paragraph
        inline_like = display in {'inline', 'inline-block'} or name == 'br'
        if inline_like:
            if floats and float_spec.is_dropcaps:
                self.add_block_tag(name, html_tag, style, stylizer,
                                   float_spec=float_spec)
                float_spec = None
            else:
                self.add_inline_tag(name, html_tag, style, stylizer)
        elif display == 'list-item':
            self.add_block_tag(name, html_tag, style, stylizer,
                               is_list_item=True)
        elif display.startswith('table') or display == 'inline-table':
            if display == 'table-cell':
                self.blocks.start_new_cell(html_tag, style)
                self.add_block_tag(name, html_tag, style, stylizer,
                                   is_table_cell=True)
            elif display == 'table-row':
                self.blocks.start_new_row(html_tag, style)
            elif display in {'table', 'inline-table'}:
                self.blocks.end_current_block()
                self.blocks.start_new_table(html_tag, style)
        elif name == 'img' and floats:
            # Image is floating so dont start a new paragraph for it
            self.add_inline_tag(name, html_tag, style, stylizer)
        else:
            if name == 'hr':
                for edge in ('right', 'bottom', 'left'):
                    style.set('border-%s-style' % edge, 'none')
            self.add_block_tag(name, html_tag, style, stylizer,
                               float_spec=float_spec)

        for child in html_tag.iterchildren():
            if not isinstance(getattr(child, 'tag', None), (str, bytes)):
                # Comment/PI/etc.: only the text following it matters.
                trailing = getattr(child, 'tail', None)
                if trailing:
                    block = self.create_block_from_parent(html_tag, stylizer)
                    block.add_text(trailing, style, is_parent_style=False,
                                   link=self.current_link,
                                   lang=self.current_lang)
                continue
            self.process_tag(child, stylizer, float_spec=float_spec)

        opened_block = html_tag in self.blocks.open_html_blocks
        self.blocks.finish_tag(html_tag)
        if opened_block and style['page-break-after'] == 'avoid':
            self.blocks.all_blocks[-1].keep_next = True

        self.current_link = saved_link
        self.current_lang = saved_lang

    # Now, process the tail if any

    if display == 'table-row':
        # We ignore the tail for these tags
        return

    drop_blank_tail = opened_block or display.startswith('table')
    tail = html_tag.tail
    if is_first_tag or not tail:
        return
    if drop_blank_tail and tail.isspace():
        # Ignore trailing space after a block tag, as otherwise it will
        # become a new empty paragraph
        return
    block = self.create_block_from_parent(html_tag, stylizer)
    block.add_text(tail, stylizer.style(html_tag.getparent()),
                   is_parent_style=True, link=self.current_link,
                   lang=self.current_lang)
def dump_text(self, elem, stylizer, page):
    '''
    Serialize ``elem`` and its descendants to HTML fragments, writing the
    style of every element into a ``style`` attribute.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @page: Passed through unchanged to the recursive calls.

    Returns a list of strings. The ``class`` and ``style`` attributes of
    ``elem`` are deleted from the tree as a side effect.
    '''

    # We can only processes tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, (str, bytes)) \
            or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS):
        p = elem.getparent()
        if p is not None and isinstance(p.tag, (str, bytes)) \
                and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \
                and elem.tail:
            # The tail is plain text that lands in the markup, so it needs
            # the same escaping as every other text node written below.
            return [self.prepare_string_for_html(elem.tail)]
        return ['']

    # Setup our variables.
    text = ['']
    style = stylizer.style(elem)
    tag = parse_utils.barename(elem.tag)
    attribs = elem.attrib

    style_a = '%s' % style
    if tag == 'body':
        # Change the body to a div so we can merge multiple files.
        tag = 'div'
        # Add page-break-before: always because renderers typically treat a
        # new file (we're merging files) as a page break, and remove all
        # other page break types that might be set.
        style_a = 'page-break-before: always; %s' % re.sub(
            'page-break-[^:]+:[^;]+;?', '', style_a)
    # Remove unnecessary spaces.
    style_a = re.sub(r'\s{2,}', ' ', style_a).strip()

    # Remove attributes we won't want.
    if 'class' in attribs:
        del attribs['class']
    if 'style' in attribs:
        del attribs['style']

    # Turn the rest of the attributes into a string we can write with the
    # tag.
    at = ''.join(
        ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
        for k, v in attribs.items())

    # Turn style into strings for putting in the tag.
    style_t = ''
    if style_a:
        style_t = ' style="%s"' % style_a.replace('"', "'")

    # Write the tag.
    text.append('<%s%s%s' % (tag, at, style_t))
    if tag in SELF_CLOSING_TAGS:
        text.append(' />')
    else:
        text.append('>')

    # Process tags that contain text.
    if elem.text:
        text.append(self.prepare_string_for_html(elem.text))

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer, page)

    # Close the tag we opened.
    if tag not in SELF_CLOSING_TAGS:
        text.append('</%s>' % tag)

    # Add the text that is outside of the tag.
    if elem.tail:
        text.append(self.prepare_string_for_html(elem.tail))

    return text
def dump_text(self, elem, stylizer, page):
    '''
    Serialize ``elem`` and its descendants to HTML fragments, expressing
    bold, italic, underline and line-through styling as ``<b>``, ``<i>``,
    ``<u>`` and ``<s>`` tags.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @page: Passed through unchanged to the recursive calls.

    Returns a list of strings. The ``class`` and ``style`` attributes of
    ``elem`` are deleted from the tree as a side effect.
    '''

    # We can only processes tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, (str, bytes)) \
            or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS):
        p = elem.getparent()
        if p is not None and isinstance(p.tag, (str, bytes)) \
                and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \
                and elem.tail:
            # The tail is plain text that lands in the markup, so it needs
            # the same escaping as every other text node written below.
            return [self.prepare_string_for_html(elem.tail)]
        return ['']

    style = stylizer.style(elem)

    # Ignore anything that is set to not be displayed.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        # Only the element itself is hidden. The text following it belongs
        # to the parent and must still be written out.
        if elem.tail:
            return [self.prepare_string_for_html(elem.tail)]
        return ['']

    # Setup our variables.
    text = ['']
    tag = parse_utils.barename(elem.tag)
    if tag == 'body':
        tag = 'div'
    # Every tag opened for this element, in opening order.
    tags = [tag]
    attribs = elem.attrib

    # Remove attributes we won't want.
    if 'class' in attribs:
        del attribs['class']
    if 'style' in attribs:
        del attribs['style']

    # Turn the rest of the attributes into a string we can write with the
    # tag.
    at = ''.join(
        ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
        for k, v in attribs.items())

    # Write the tag.
    text.append('<%s%s' % (tag, at))
    if tag in SELF_CLOSING_TAGS:
        text.append(' />')
    else:
        text.append('>')

    # Turn styles into tags.
    if style['font-weight'] in ('bold', 'bolder'):
        text.append('<b>')
        tags.append('b')
    if style['font-style'] == 'italic':
        text.append('<i>')
        tags.append('i')
    if style['text-decoration'] == 'underline':
        text.append('<u>')
        tags.append('u')
    if style['text-decoration'] == 'line-through':
        text.append('<s>')
        tags.append('s')

    # Process tags that contain text.
    if elem.text:
        text.append(self.prepare_string_for_html(elem.text))

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer, page)

    # Close all open tags, innermost first.
    for t in reversed(tags):
        if t not in SELF_CLOSING_TAGS:
            text.append('</%s>' % t)

    # Add the text that is outside of the tag.
    if elem.tail:
        text.append(self.prepare_string_for_html(elem.tail))

    return text