Exemplo n.º 1
0
 def _clean_opf(self, opf):
     """Build and return a new OPF ``package`` tree from ``opf``.

     Elements are appended to the new tree directly from ``opf`` (lxml
     moves an element when it is appended elsewhere), and tags inside
     ``opf`` are rewritten in place, so ``opf`` is modified by this call.

     :param opf: root element of the parsed OPF document.
     :return: the newly created ``package`` root element.
     """
     # Collect every prefix -> namespace mapping visible on any element.
     nsmap = {}
     for elem in opf.iter(tag=etree.Element):
         nsmap.update(elem.nsmap)
     # Re-tag elements that have no namespace or the OPF1 namespace using
     # OPF(); names that still contain a literal ':' are left untouched.
     # This is a separate pass from the one above on purpose: the nsmap
     # snapshot is taken before any tag is changed.
     for elem in opf.iter(tag=etree.Element):
         if namespace(elem.tag) in ('', OPF1_NS) and ':' not in barename(elem.tag):
             elem.tag = OPF(barename(elem.tag))
     # Entries from OPF2_NSMAP override any colliding prefixes collected above.
     nsmap.update(OPF2_NSMAP)
     attrib = dict(opf.attrib)
     nroot = etree.Element(OPF('package'),
         nsmap={None: OPF2_NS}, attrib=attrib)
     metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap)
     # Wrapper elements that are dropped; their descendants are still
     # visited individually by the loop below.
     ignored = (OPF('dc-metadata'), OPF('x-metadata'))
     # Every descendant of the metadata element is appended directly under
     # the new metadata element, which flattens any nesting.
     # (presumably the 'o2' prefix is bound to the OPF 2 namespace by the
     # xpath() helper - confirm against its definition)
     for elem in xpath(opf, 'o2:metadata//*'):
         if elem.tag in ignored:
             continue
         # Normalise any recognised Dublin Core namespace to DC11_NS with a
         # lower-cased local name.
         if namespace(elem.tag) in DC_NSES:
             tag = barename(elem.tag).lower()
             elem.tag = '{%s}%s' % (DC11_NS, tag)
         # Tags written with a literal, unresolved 'dc:' prefix get the same
         # treatment.
         if elem.tag.startswith('dc:'):
             tag = elem.tag.partition(':')[-1].lower()
             elem.tag = '{%s}%s' % (DC11_NS, tag)
         metadata.append(elem)
     # NOTE(review): the loop above has already moved every descendant of the
     # old metadata element, so this query probably matches nothing - confirm.
     for element in xpath(opf, 'o2:metadata//o2:meta'):
         metadata.append(element)
     # Carry the remaining top-level sections over, in this fixed order.
     for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
         for element in xpath(opf, tag):
             nroot.append(element)
     return nroot
Exemplo n.º 2
0
    def __call__(self, oeb, opts):
        """Detect the structure of ``oeb`` according to ``opts``.

        Steps, in order: detect chapters; optionally rebuild the TOC; drop
        TOC entries matching ``opts.toc_filter``; add page breaks before the
        elements selected by ``opts.page_breaks_before``; give untitled TOC
        entries a placeholder title; detect the start-reading position when
        ``opts.start_reading_at`` is set.
        """
        self.log = oeb.log
        self.oeb = oeb
        self.opts = opts
        self.log('Detecting structure...')

        self.detect_chapters()
        if self.oeb.auto_generated_toc or opts.use_auto_toc:
            previous_toc = self.oeb.toc
            self.oeb.toc = TOC()
            self.create_level_based_toc()
            if self.oeb.toc.count() < 1:
                # Fall back to chapters first, then to links.
                if not opts.no_chapters_in_toc and self.detected_chapters:
                    self.create_toc_from_chapters()
                if self.oeb.toc.count() < opts.toc_threshold:
                    self.create_toc_from_links()
            generated_too_small = self.oeb.toc.count() < 2
            if generated_too_small and previous_toc.count() > 2:
                # The original TOC was richer than the generated one: keep it.
                self.oeb.toc = previous_toc
            else:
                self.oeb.auto_generated_toc = True
                self.log('Auto generated TOC with %d entries.' %
                         self.oeb.toc.count())

        if opts.toc_filter is not None:
            pattern = re.compile(opts.toc_filter)
            # Iterate over a snapshot because entries are removed on the fly.
            for entry in list(self.oeb.toc.iter()):
                if entry.title and pattern.search(entry.title) is None:
                    continue
                self.log('Filtering', entry.title or 'empty node', 'from TOC')
                self.oeb.toc.remove(entry)

        if opts.page_breaks_before is not None:
            select = XPath(opts.page_breaks_before)
            heading_names = {'h1', 'h2'}
            for item in oeb.spine:
                for elem in select(item.data):
                    prev = next(
                        elem.itersiblings(tag=etree.Element, preceding=True),
                        None)
                    if (prev is not None
                            and barename(elem.tag) in heading_names
                            and barename(prev.tag) in heading_names
                            and not (prev.tail and prev.tail.split())):
                        # Two adjacent headings: no page break on the
                        # second one.
                        continue

                    css = elem.get('style', '')
                    if css:
                        css += '; '
                    elem.set('style', css + 'page-break-before:always')

        for entry in self.oeb.toc.iter():
            if not (entry.title and entry.title.strip()):
                entry.title = _('Unnamed')

        if self.opts.start_reading_at:
            self.detect_start_reading()
Exemplo n.º 3
0
 def postprocess_book(self, oeb, opts, log):
     """Simplify the markup inside h1-h6 headings of every spine item.

     Two clean-ups are applied to each heading that has children:

     * a leading empty ``span`` that only carries an ``id`` is removed and
       its ``id`` is transferred to the heading (when the heading has none);
     * a single centred ``div`` child is unwrapped and the heading itself is
       given ``text-align: center``.
     """
     from calibre.ebooks.oeb.base import XHTML, barename
     heading_names = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
     for item in oeb.spine:
         if not hasattr(item.data, 'xpath'):
             continue
         wanted = [XHTML(name) for name in heading_names]
         for heading in item.data.iterdescendants(*wanted):
             if len(heading) == 0:
                 continue

             first = heading[0]
             anchor_only_span = (not heading.text and not first.text
                                 and not len(first)
                                 and barename(first.tag) == 'span')
             if anchor_only_span:
                 if not heading.get('id') and first.get('id'):
                     heading.set('id', first.get('id'))
                     # Keep the text that followed the span before removing
                     # the span itself.
                     heading.text = first.tail
                     heading.remove(first)

             if len(heading) != 1:
                 continue
             only = heading[0]
             if only.get('style') != 'text-align: center; margin: auto;':
                 continue
             unwrappable = (barename(only.tag) == 'div' and not len(only)
                            and not only.get('id')
                            and not heading.get('style'))
             if unwrappable:
                 pieces = (heading.text or '', only.text or '',
                           only.tail or '')
                 heading.text = ''.join(pieces)
                 heading.remove(only)
                 heading.set('style', 'text-align: center')
Exemplo n.º 4
0
def count_chars_in_html_tag(tag, counter, file_name, parent_locale, locale):
    """Count characters in the text, ``alt``/``title`` attributes and tail of ``tag``.

    The element's own text is counted with ``locale`` unless the element's
    bare tag name is in ``html_spell_tags``. The tail is counted with
    ``parent_locale`` unless the element has no parent or the parent's bare
    tag name is in ``html_spell_tags``.
    """
    if tag.text is not None:
        if barename(tag.tag) not in html_spell_tags:
            count_chars_in_text(tag, 'text', counter, file_name, locale)

    for attr_name in {'alt', 'title'}:
        count_chars_in_attr(tag, attr_name, counter, file_name, locale)

    if tag.tail is None:
        return
    parent = tag.getparent()
    if parent is None:
        return
    if barename(parent.tag) not in html_spell_tags:
        count_chars_in_text(tag, 'tail', counter, file_name, parent_locale)
Exemplo n.º 5
0
def read_words_from_html_tag(tag, words, file_name, parent_locale, locale):
    """Collect words from the text, ``alt``/``title`` attributes and tail of ``tag``.

    The element's own text is read with ``locale`` unless the element's bare
    tag name is in ``html_spell_tags``. The tail is read with
    ``parent_locale`` unless the element has no parent or the parent's bare
    tag name is in ``html_spell_tags``.
    """
    if tag.text is not None:
        if barename(tag.tag) not in html_spell_tags:
            add_words_from_text(tag, 'text', words, file_name, locale)

    for attr_name in {'alt', 'title'}:
        add_words_from_attr(tag, attr_name, words, file_name, locale)

    if tag.tail is None:
        return
    parent = tag.getparent()
    if parent is None:
        return
    if barename(parent.tag) not in html_spell_tags:
        add_words_from_text(tag, 'tail', words, file_name, parent_locale)
Exemplo n.º 6
0
def read_words_from_opf(root, words, file_name, book_locale):
    """Collect words from every descendant element of an OPF ``root``.

    For elements whose bare tag name is in ``opf_spell_tags`` and whose text
    is not ``None``, the text is read: ``description`` text is passed to
    ``add_words_from_escaped_html``, any other text to
    ``add_words_from_text``. The ``_opf_file_as`` attribute is read for
    every element regardless of its tag.
    """
    for node in root.iterdescendants('*'):
        if node.text is not None:
            name = barename(node.tag)
            if name in opf_spell_tags:
                if name == 'description':
                    add_words_from_escaped_html(
                        node.text, words, file_name, node, 'text', book_locale)
                else:
                    add_words_from_text(
                        node, 'text', words, file_name, book_locale)
        add_words_from_attr(node, _opf_file_as, words, file_name, book_locale)
Exemplo n.º 7
0
def count_chars_in_opf(root, counter, file_name, book_locale):
    """Count characters in every descendant element of an OPF ``root``.

    For elements whose bare tag name is in ``opf_spell_tags`` and whose text
    is not ``None``, the text is counted: ``description`` text is passed to
    ``count_chars_in_escaped_html``, any other text to
    ``count_chars_in_text``. The ``_opf_file_as`` attribute is counted for
    every element regardless of its tag.
    """
    for node in root.iterdescendants('*'):
        if node.text is not None:
            name = barename(node.tag)
            if name in opf_spell_tags:
                if name == 'description':
                    count_chars_in_escaped_html(
                        node.text, counter, file_name, node, 'text',
                        book_locale)
                else:
                    count_chars_in_text(
                        node, 'text', counter, file_name, book_locale)
        count_chars_in_attr(node, _opf_file_as, counter, file_name, book_locale)
Exemplo n.º 8
0
    def __call__(self, oeb, opts):
        """Detect the structure of ``oeb`` according to ``opts``.

        Runs chapter detection, optionally regenerates the TOC, filters TOC
        entries by ``opts.toc_filter``, inserts page breaks before elements
        matched by ``opts.page_breaks_before``, names untitled TOC entries
        and finally detects the start-reading position when requested.
        """
        self.log = oeb.log
        self.oeb = oeb
        self.opts = opts
        self.log('Detecting structure...')

        self.detect_chapters()
        if self.oeb.auto_generated_toc or opts.use_auto_toc:
            saved_toc = self.oeb.toc
            self.oeb.toc = TOC()
            self.create_level_based_toc()
            if self.oeb.toc.count() < 1:
                # Level-based TOC was empty: try chapters, then links.
                if not opts.no_chapters_in_toc and self.detected_chapters:
                    self.create_toc_from_chapters()
                if self.oeb.toc.count() < opts.toc_threshold:
                    self.create_toc_from_links()
            new_is_tiny = self.oeb.toc.count() < 2
            if new_is_tiny and saved_toc.count() > 2:
                # Prefer the pre-existing TOC over a near-empty generated one.
                self.oeb.toc = saved_toc
            else:
                self.oeb.auto_generated_toc = True
                self.log('Auto generated TOC with %d entries.' %
                        self.oeb.toc.count())

        if opts.toc_filter is not None:
            matcher = re.compile(opts.toc_filter)
            # Work on a copy of the node list since nodes are removed below.
            for toc_node in list(self.oeb.toc.iter()):
                title = toc_node.title
                if title and matcher.search(title) is None:
                    continue
                self.log('Filtering', title or 'empty node', 'from TOC')
                self.oeb.toc.remove(toc_node)

        if opts.page_breaks_before is not None:
            find_targets = XPath(opts.page_breaks_before)
            top_headings = {'h1', 'h2'}
            for item in oeb.spine:
                for elem in find_targets(item.data):
                    before = next(
                        elem.itersiblings(tag=etree.Element, preceding=True),
                        None)
                    if (before is not None
                            and barename(elem.tag) in top_headings
                            and barename(before.tag) in top_headings
                            and not (before.tail and before.tail.split())):
                        # We have two adjacent headings, do not put a page
                        # break on the second one
                        continue

                    existing = elem.get('style', '')
                    if existing:
                        existing += '; '
                    elem.set('style', existing + 'page-break-before:always')

        for toc_node in self.oeb.toc.iter():
            if not (toc_node.title and toc_node.title.strip()):
                toc_node.title = _('Unnamed')

        if self.opts.start_reading_at:
            self.detect_start_reading()
Exemplo n.º 9
0
    def dump_text(self, elem, stylizer, page):
        '''
        Serialise ``elem`` and its descendants as a list of HTML fragments.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: Passed through unchanged to the recursive calls.

        Note: the ``style`` attribute is deleted from ``elem`` itself.
        '''

        def is_xhtml(node):
            return isinstance(node.tag, string_or_bytes) \
                and namespace(node.tag) == XHTML_NS

        # Anything that is not an XHTML element contributes at most its tail,
        # and only when it sits inside an XHTML element.
        if not is_xhtml(elem):
            parent = elem.getparent()
            if parent is not None and is_xhtml(parent) and elem.tail:
                return [elem.tail]
            return ['']

        # <body> is written out as a <div>.
        tag = barename(elem.tag)
        if tag == 'body':
            tag = 'div'

        # Drop the attribute we do not want, then render the remaining ones.
        attribs = elem.attrib
        if 'style' in attribs:
            del attribs['style']
        rendered_attrs = ''.join(
            ' %s="%s"' % (name, prepare_string_for_xml(value, attribute=True))
            for name, value in attribs.items())

        # Opening tag.
        self_closing = tag in SELF_CLOSING_TAGS
        pieces = ['', '<%s%s' % (tag, rendered_attrs)]
        pieces.append(' />' if self_closing else '>')

        # Leading text of the element.
        leading = getattr(elem, 'text', None)
        if leading:
            pieces.append(self.prepare_string_for_html(leading))

        # Children, depth first.
        for child in elem:
            pieces.extend(self.dump_text(child, stylizer, page))

        # Closing tag.
        if not self_closing:
            pieces.append('</%s>' % tag)

        # Text that follows the element.
        trailing = getattr(elem, 'tail', None)
        if trailing:
            pieces.append(self.prepare_string_for_html(trailing))

        return pieces
Exemplo n.º 10
0
 def add_block_tag(self, tagname, html_tag, tag_style, stylizer,
                   is_table_cell=False, float_spec=None, is_list_item=False):
     """Start a new block for ``html_tag`` and add its leading content.

     A bookmark is registered when the tag has an ``id`` or ``name``.
     ``img`` tags are handed to the images manager as block images; for any
     other tag the tag's own text, if any, is added to the new block. A
     text-less ``li`` whose first child is a non-empty ``ul``/``ol`` is
     flagged with ``force_not_empty``.
     """
     block = self.blocks.start_new_block(
         html_tag, tag_style, is_table_cell=is_table_cell,
         float_spec=float_spec, is_list_item=is_list_item)

     anchor = html_tag.get('id') or html_tag.get('name')
     if anchor:
         bookmark = self.bookmark_for_anchor(anchor, html_tag)
         block.bookmarks.add(bookmark)

     if tagname == 'img':
         self.images_manager.add_image(
             html_tag, block, stylizer, as_block=True)
         return

     text = html_tag.text
     if text:
         block.add_text(
             text, tag_style, ignore_leading_whitespace=True,
             is_parent_style=True, link=self.current_link,
             lang=self.current_lang)
         return

     if tagname != 'li' or not len(html_tag):
         return
     first_child = html_tag[0]
     if barename(first_child.tag) in ('ul', 'ol') and len(first_child):
         block.force_not_empty = True
Exemplo n.º 11
0
 def unsmarten(self, root):
     """Run ``unsmarten_text`` over the text and tail of every HTML tag.

     Tags whose bare name is ``pre`` are skipped entirely (both their text
     and their tail are left untouched).
     """
     for node in self.html_tags(root):
         if barename(node.tag) == 'pre':
             continue
         text = getattr(node, 'text', None)
         if text:
             node.text = unsmarten_text(text)
         tail = getattr(node, 'tail', None)
         if tail:
             node.tail = unsmarten_text(tail)
Exemplo n.º 12
0
 def __iter__(self):
     """Yield a ``{'name', 'value'}`` dict for each DC11-namespaced metadata item.

     Items whose term is in any other namespace are skipped.
     """
     from calibre.ebooks.oeb.base import namespace, barename, DC11_NS
     metadata = self.meta
     for key in metadata.items:
         for entry in metadata[key]:
             if namespace(entry.term) != DC11_NS:
                 continue
             yield {'name': barename(entry.term), 'value': entry.value}
Exemplo n.º 13
0
 def __init__(self, namespace, styles_manager, links_manager, html_block,
              style, is_table_cell=False, float_spec=None,
              is_list_item=False):
     """Initialise a block built from ``html_block`` and its ``style``.

     When ``float_spec`` is given, this block registers itself in
     ``float_spec.blocks``. The block style is obtained from
     ``styles_manager.create_block_style``.
     """
     self.namespace = namespace
     self.bookmarks = set()
     if is_list_item:
         self.list_tag = (html_block, style)
     else:
         self.list_tag = None
     self.is_first_block = False
     self.numbering_id = None
     self.parent_items = None

     # The source element and its bare tag name.
     self.html_block = html_block
     self.html_tag = barename(html_block.tag)

     self.float_spec = float_spec
     if float_spec is not None:
         float_spec.blocks.append(self)

     self.html_style = style
     self.style = styles_manager.create_block_style(
         style, html_block, is_table_cell=is_table_cell)
     self.styles_manager = styles_manager
     self.links_manager = links_manager

     self.keep_next = False
     self.runs = []
     self.skipped = False
     self.linked_style = None

     # Pagination flags read from the CSS style.
     self.page_break_before = style['page-break-before'] == 'always'
     self.keep_lines = style['page-break-inside'] == 'avoid'
     self.page_break_after = False
     self.block_lang = None
Exemplo n.º 14
0
    def adjust_split_point(self, root, path):
        '''
        Move the split point up its ancestor chain if it has no textual content
        before it. This handles the common case:
        <div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
        h2.

        :param root: tree on which ``path`` is evaluated with ``xpath()``.
        :param path: XPath expression selecting the split point; its first
            match is used.
        :return: the path of the adjusted split point, as reported by
            ``getroottree().getpath()`` (equal to the path of the original
            element when no move was possible).
        '''
        sp = root.xpath(path)[0]
        while True:
            parent = sp.getparent()
            if parent is None:
                # sp is the document root, there is nothing left to climb to
                break
            if barename(parent.tag) in ('body', 'html'):
                break
            if parent.text and parent.text.strip():
                # Text precedes the first child, so the parent is not an
                # equivalent split point
                break
            if parent.index(sp) > 0:
                # sp is not the first child of its parent
                break
            sp = parent

        npath = sp.getroottree().getpath(sp)

        if self.opts.verbose > 3 and npath != path:
            self.log.debug('\t\t\tMoved split point %s to %s'%(path, npath))

        return npath
Exemplo n.º 15
0
    def process_block(self, html_block, docx_block, stylizer, ignore_tail=False):
        """Convert ``html_block`` and its children into DOCX blocks.

        Hidden elements are skipped entirely. Children with
        ``display: block`` (other than ``br``) start a new ``Block`` and are
        processed recursively; all other children are processed inline into
        the most recently created block. Unless ``ignore_tail`` is set,
        non-blank tail text is added using the parent's style.
        """
        block_style = stylizer.style(html_block)
        if block_style.is_hidden:
            return

        leading = html_block.text
        if leading:
            docx_block.add_text(leading, block_style, ignore_leading_whitespace=True, is_parent_style=True)

        for child in html_block.iterchildren(etree.Element):
            tag = barename(child.tag)
            style = stylizer.style(child)
            display = style._get('display')
            # TODO: Handle images (img currently gets no special treatment)
            if display != 'block' or tag == 'br':
                self.process_inline(child, self.blocks[-1], stylizer)
                continue
            new_block = Block(self.styles_manager, child, style)
            self.blocks.append(new_block)
            self.process_block(child, new_block, stylizer)

        tail = html_block.tail
        if ignore_tail is False and tail and tail.strip():
            target = docx_block
            if target is not self.blocks[-1]:
                # Other blocks were created since: the tail needs its own.
                target = Block(self.styles_manager, html_block, block_style)
                self.blocks.append(target)
            parent_style = stylizer.style(html_block.getparent())
            target.add_text(tail, parent_style, is_parent_style=True)

        if block_style['page-break-after'] == 'avoid':
            self.blocks[-1].keep_next = True
Exemplo n.º 16
0
    def adjust_split_point(self, root, path):
        '''
        Move the split point up its ancestor chain if it has no textual content
        before it. This handles the common case:
        <div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
        h2.

        :param root: tree on which ``path`` is evaluated with ``xpath()``.
        :param path: XPath expression selecting the split point; its first
            match is used.
        :return: the path of the adjusted split point, as reported by
            ``getroottree().getpath()`` (equal to the path of the original
            element when no move was possible).
        '''
        sp = root.xpath(path)[0]
        while True:
            parent = sp.getparent()
            if parent is None:
                # sp is the document root, there is nothing left to climb to
                break
            if barename(parent.tag) in ('body', 'html'):
                break
            if parent.text and parent.text.strip():
                # Text precedes the first child, so the parent is not an
                # equivalent split point
                break
            if parent.index(sp) > 0:
                # sp is not the first child of its parent
                break
            sp = parent

        npath = sp.getroottree().getpath(sp)

        if self.opts.verbose > 3 and npath != path:
            self.log.debug('\t\t\tMoved split point %s to %s'%(path, npath))

        return npath
Exemplo n.º 17
0
    def process_block(self, html_block, docx_block, stylizer, ignore_tail=False):
        """Convert ``html_block`` and its children into DOCX blocks.

        Hidden elements are skipped entirely. An ``img`` element is handed to
        the images manager. For other elements, the leading text is added and
        each child is either processed as a new block (``display: block``,
        not ``br``, not a floating image) or processed inline into the most
        recent block. Unless ``ignore_tail`` is set, non-blank tail text goes
        into a new block that uses the parent's style.
        """
        block_style = stylizer.style(html_block)
        if block_style.is_hidden:
            return

        if html_block.tag.endswith('}img'):
            self.images_manager.add_image(html_block, docx_block, stylizer)
        else:
            leading = html_block.text
            if leading:
                docx_block.add_text(leading, block_style, ignore_leading_whitespace=True, is_parent_style=True)

            for child in html_block.iterchildren(etree.Element):
                tag = barename(child.tag)
                style = stylizer.style(child)
                display = style._get('display')
                starts_block = display == 'block' and tag != 'br'
                # A floating image does not start a new paragraph.
                if starts_block and not (
                        tag == 'img' and style['float'] in {'left', 'right'}):
                    new_block = Block(self.styles_manager, child, style)
                    self.blocks.append(new_block)
                    self.process_block(child, new_block, stylizer)
                else:
                    self.process_inline(child, self.blocks[-1], stylizer)

        if block_style['page-break-after'] == 'avoid':
            self.blocks[-1].keep_next = True

        tail = html_block.tail
        if ignore_tail is False and tail and tail.strip():
            parent = html_block.getparent()
            parent_style = stylizer.style(parent)
            tail_block = Block(self.styles_manager, parent, parent_style)
            self.blocks.append(tail_block)
            tail_block.add_text(tail, parent_style, is_parent_style=True)
Exemplo n.º 18
0
    def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None):
        """Convert ``html_tag`` and, recursively, its children.

        The tag is dispatched on its CSS ``display`` value to an inline tag,
        a list item, a table/row/cell or a plain block. Its tail text is then
        added to the current (or a new) block of its parent.

        :param html_tag: the element to convert.
        :param stylizer: object whose ``style(element)`` returns the computed
            style of an element.
        :param is_first_tag: when true, float handling and tail handling are
            skipped for this tag.
        :param float_spec: float specification inherited from an ancestor, or
            ``None``; it is passed on to child tags.
        """
        tagname = barename(html_tag.tag)
        # These tags and hidden tags are skipped together with their subtree
        # and their tail text.
        if tagname in {'script', 'style', 'title', 'meta'}:
            return
        tag_style = stylizer.style(html_tag)
        if tag_style.is_hidden:
            return

        # Remember the enclosing link so it can be restored once this tag and
        # its children have been processed.
        previous_link = self.current_link
        if tagname == 'a' and html_tag.get('href'):
            self.current_link = (self.current_item, html_tag.get('href'), html_tag.get('title'))

        display = tag_style._get('display')
        is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag
        # Only create a new float spec when none was inherited.
        if float_spec is None and is_float:
            float_spec = FloatSpec(self.docx.namespace, html_tag, tag_style)

        if display in {'inline', 'inline-block'} or tagname == 'br':  # <br> has display:block but we dont want to start a new paragraph
            if is_float and float_spec.is_dropcaps:
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)
                # Reset so that the children below do not inherit the
                # drop-caps float spec.
                float_spec = None
            else:
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
        elif display == 'list-item':
            self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_list_item=True)
        elif display.startswith('table') or display == 'inline-table':
            # Other table-* display values (not cell, row or table) create
            # nothing here; only their children are processed.
            if display == 'table-cell':
                self.blocks.start_new_cell(html_tag, tag_style)
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_table_cell=True)
            elif display == 'table-row':
                self.blocks.start_new_row(html_tag, tag_style)
            elif display in {'table', 'inline-table'}:
                self.blocks.end_current_block()
                self.blocks.start_new_table(html_tag, tag_style)
        else:
            if tagname == 'img' and is_float:
                # Image is floating so dont start a new paragraph for it
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            else:
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)

        for child in html_tag.iterchildren('*'):
            self.process_tag(child, stylizer, float_spec=float_spec)

        # Must be read before finish_tag() is called.
        # NOTE(review): presumably finish_tag() removes the tag from
        # open_html_blocks - confirm against the blocks implementation.
        is_block = html_tag in self.blocks.open_html_blocks
        self.blocks.finish_tag(html_tag)
        if is_block and tag_style['page-break-after'] == 'avoid':
            self.blocks.all_blocks[-1].keep_next = True

        self.current_link = previous_link

        if display == 'table-row':
            return  # We ignore the tail for these tags

        ignore_whitespace_tail = is_block or display.startswith('table')
        if not is_first_tag and html_tag.tail and (not ignore_whitespace_tail or not html_tag.tail.isspace()):
            # Ignore trailing space after a block tag, as otherwise it will
            # become a new empty paragraph
            block = self.blocks.current_or_new_block(html_tag.getparent(), stylizer.style(html_tag.getparent()))
            block.add_text(html_tag.tail, stylizer.style(html_tag.getparent()), is_parent_style=True, link=self.current_link)
Exemplo n.º 19
0
    def dump_text(self, elem, stylizer, page):
        '''
        Serialise ``elem`` and its descendants as a list of HTML fragments.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: Passed through unchanged to the recursive calls.

        Note: the ``style`` attribute is deleted from ``elem`` itself.
        '''

        def is_xhtml(node):
            return isinstance(node.tag, string_or_bytes) \
                and namespace(node.tag) == XHTML_NS

        # Anything that is not an XHTML element contributes at most its tail,
        # and only when it sits inside an XHTML element.
        if not is_xhtml(elem):
            parent = elem.getparent()
            if parent is not None and is_xhtml(parent) and elem.tail:
                return [elem.tail]
            return ['']

        # <body> is written out as a <div>.
        tag = barename(elem.tag)
        if tag == 'body':
            tag = 'div'

        # Drop the attribute we do not want, then render the remaining ones.
        attribs = elem.attrib
        if 'style' in attribs:
            del attribs['style']
        rendered_attrs = ''.join(
            ' %s="%s"' % (name, prepare_string_for_xml(value, attribute=True))
            for name, value in attribs.items())

        # Opening tag.
        self_closing = tag in SELF_CLOSING_TAGS
        pieces = ['', '<%s%s' % (tag, rendered_attrs)]
        pieces.append(' />' if self_closing else '>')

        # Leading text of the element.
        leading = getattr(elem, 'text', None)
        if leading:
            pieces.append(self.prepare_string_for_html(leading))

        # Children, depth first.
        for child in elem:
            pieces.extend(self.dump_text(child, stylizer, page))

        # Closing tag.
        if not self_closing:
            pieces.append('</%s>' % tag)

        # Text that follows the element.
        trailing = getattr(elem, 'tail', None)
        if trailing:
            pieces.append(self.prepare_string_for_html(trailing))

        return pieces
Exemplo n.º 20
0
 def unsmarten(self, root):
     """Run ``unsmarten_text`` over the text and tail of every HTML tag.

     Tags whose bare name is ``pre`` are skipped entirely (both their text
     and their tail are left untouched).
     """
     for node in self.html_tags(root):
         if barename(node.tag) == 'pre':
             continue
         text = getattr(node, 'text', None)
         if text:
             node.text = unsmarten_text(text)
         tail = getattr(node, 'tail', None)
         if tail:
             node.tail = unsmarten_text(tail)
Exemplo n.º 21
0
    def process_inline(self, html_child, docx_block, stylizer):
        """Add the inline content of ``html_child`` to the DOCX output.

        Hidden elements are skipped entirely, including their tail. A ``br``
        becomes a break unless it is the last child of its parent and has no
        tail. Images are not rendered yet. For any other tag the text is
        added to ``docx_block`` and the children are processed: children with
        ``display: block`` start a new ``Block``, others are processed inline
        into the most recent block.

        The tail text of ``html_child`` is always added to the most recent
        block using the parent's style. This includes the tail of an ``img``,
        which used to be dropped because the image branch returned early.

        :param html_child: the inline element to process.
        :param docx_block: block receiving the element's own text or break.
        :param stylizer: object whose ``style(element)`` returns the computed
            style of an element.
        """
        tag = barename(html_child.tag)
        style = stylizer.style(html_child)
        if style.is_hidden:
            return
        if tag == 'br':
            # A trailing <br> with nothing after it would only add an empty line
            if html_child.tail or html_child is not html_child.getparent()[-1]:
                docx_block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(style['clear'], 'none'))
        elif tag == 'img':
            # TODO: Handle images
            # Deliberately no return here: the text following the image must
            # still be emitted by the tail handling below.
            pass
        else:
            if html_child.text:
                docx_block.add_text(html_child.text, style, html_parent=html_child)
            for child in html_child.iterchildren(etree.Element):
                child_style = stylizer.style(child)
                display = child_style.get('display', 'inline')
                if display == 'block':
                    b = Block(self.styles_manager, child, child_style)
                    self.blocks.append(b)
                    self.process_block(child, b, stylizer)
                else:
                    self.process_inline(child, self.blocks[-1], stylizer)

        if html_child.tail:
            # The tail belongs to the parent, so it is styled like the parent
            self.blocks[-1].add_text(html_child.tail, stylizer.style(html_child.getparent()), html_parent=html_child.getparent(), is_parent_style=True)
Exemplo n.º 22
0
    def process_tag(self, html_tag, stylizer, is_first_tag=False):
        """Convert ``html_tag`` and, recursively, its children.

        The tag is dispatched on its CSS ``display`` value to an inline tag,
        a table/row/cell or a plain block. Its tail text is then added to the
        current (or a new) block of its parent.

        :param html_tag: the element to convert.
        :param stylizer: object whose ``style(element)`` returns the computed
            style of an element.
        :param is_first_tag: when true, the tail text of this tag is not
            emitted.
        """
        tagname = barename(html_tag.tag)
        # These tags and hidden tags are skipped together with their subtree
        # and their tail text.
        if tagname in {'script', 'style', 'title', 'meta'}:
            return
        tag_style = stylizer.style(html_tag)
        if tag_style.is_hidden:
            return
        display = tag_style._get('display')
        if display in {
                'inline', 'inline-block'
        } or tagname == 'br':  # <br> has display:block but we dont want to start a new paragraph
            self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
        elif display == 'list-item':
            # TODO: Implement this
            # Until then list items are emitted like ordinary blocks.
            self.add_block_tag(tagname, html_tag, tag_style, stylizer)
        elif display.startswith('table') or display == 'inline-table':
            # Other table-* display values (not cell, row or table) create
            # nothing here; only their children are processed.
            if display == 'table-cell':
                self.blocks.start_new_cell(html_tag, tag_style)
                self.add_block_tag(tagname,
                                   html_tag,
                                   tag_style,
                                   stylizer,
                                   is_table_cell=True)
            elif display == 'table-row':
                self.blocks.start_new_row(html_tag, tag_style)
            elif display in {'table', 'inline-table'}:
                self.blocks.end_current_block()
                self.blocks.start_new_table(html_tag, tag_style)
        else:
            if tagname == 'img' and tag_style['float'] in {'left', 'right'}:
                # Image is floating so dont start a new paragraph for it
                self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            else:
                self.add_block_tag(tagname, html_tag, tag_style, stylizer)

        for child in html_tag.iterchildren('*'):
            self.process_tag(child, stylizer)

        # Must be read before finish_tag() is called.
        # NOTE(review): presumably finish_tag() removes the tag from
        # open_html_blocks - confirm against the blocks implementation.
        is_block = html_tag in self.blocks.open_html_blocks
        self.blocks.finish_tag(html_tag)
        if is_block and tag_style['page-break-after'] == 'avoid':
            self.blocks.all_blocks[-1].keep_next = True

        if display == 'table-row':
            return  # We ignore the tail for these tags

        ignore_whitespace_tail = is_block or display.startswith('table')
        if not is_first_tag and html_tag.tail and (
                not ignore_whitespace_tail or not html_tag.tail.isspace()):
            # Ignore trailing space after a block tag, as otherwise it will
            # become a new empty paragraph
            block = self.blocks.current_or_new_block(
                html_tag.getparent(), stylizer.style(html_tag.getparent()))
            block.add_text(html_tag.tail,
                           stylizer.style(html_tag.getparent()),
                           is_parent_style=True)
Exemplo n.º 23
0
def count_chars_in_opf(root, counter, file_name, book_locale):
    """Count characters in every descendant element of an OPF ``root``.

    For elements whose bare tag name is in ``opf_spell_tags`` both the
    element's own text and the tail text of each of its children are
    counted. ``description`` content is passed to
    ``count_chars_in_escaped_html``; all other content goes to
    ``count_chars_in_text``. The ``_opf_file_as`` attribute is counted for
    every element regardless of its tag.

    :param root: element whose descendants are scanned.
    :param counter: accumulator handed to the ``count_chars_in_*`` helpers.
    :param file_name: name of the file being scanned, passed through.
    :param book_locale: locale passed through to the helpers.
    """
    for tag in root.iterdescendants('*'):
        name = barename(tag.tag)
        if name in opf_spell_tags:
            if name == 'description':
                if tag.text:
                    count_chars_in_escaped_html(tag.text, counter, file_name,
                                                tag, 'text', book_locale)
                for child in tag:
                    if child.tail:
                        # The tail belongs to the child, so the child is the
                        # node reported alongside it (same convention as
                        # read_words_from_opf).
                        count_chars_in_escaped_html(child.tail, counter,
                                                    file_name, child, 'tail',
                                                    book_locale)
            else:
                if tag.text:
                    count_chars_in_text(tag, 'text', counter, file_name,
                                        book_locale)
                for child in tag:
                    if child.tail:
                        # Pass the child, not the parent: the helper reads
                        # the named attribute of the node it is given, so
                        # passing ``tag`` would count the parent's own tail
                        # once per child instead of each child's tail.
                        count_chars_in_text(child, 'tail', counter, file_name,
                                            book_locale)
        count_chars_in_attr(tag, _opf_file_as, counter, file_name, book_locale)
Exemplo n.º 24
0
def read_words_from_opf(root, words, file_name, book_locale):
    '''
    Collect words from the spell-checkable parts of an OPF tree.

    For every descendant of ``root`` whose bare tag name is in
    ``opf_spell_tags`` the element text and the tail of each child are
    processed (``description`` elements as escaped HTML, everything else as
    plain text). The ``_opf_file_as`` attribute of every descendant is
    processed as well.
    '''
    for node in root.iterdescendants('*'):
        name = barename(node.tag)
        if name in opf_spell_tags:
            holds_html = name == 'description'
            # Text directly inside the element
            if node.text:
                if holds_html:
                    add_words_from_escaped_html(
                        node.text, words, file_name, node, 'text', book_locale)
                else:
                    add_words_from_text(
                        node, 'text', words, file_name, book_locale)
            # Text following each child element
            for kid in node:
                if not kid.tail:
                    continue
                if holds_html:
                    add_words_from_escaped_html(
                        kid.tail, words, file_name, kid, 'tail', book_locale)
                else:
                    add_words_from_text(
                        kid, 'tail', words, file_name, book_locale)
        add_words_from_attr(node, _opf_file_as, words, file_name, book_locale)
Exemplo n.º 25
0
    def find_levels(self):
        '''
        Group the <p> and <div> elements of every spine item by tag name and
        nesting depth below <body>, storing them in self.levels under keys of
        the form ``<tag>_<depth>``, then discard the groups that are too small.
        '''
        paras = XPath('descendant::h:p|descendant::h:div')

        def depth_below(node, body):
            # A direct child of body has depth 1
            depth = 1
            ancestor = node.getparent()
            while ancestor is not body:
                depth += 1
                ancestor = ancestor.getparent()
            return depth

        for item in self.oeb.spine:
            bodies = XPath('//h:body')(item.data)
            if not bodies:
                continue
            body = bodies[0]

            for p in paras(body):
                key = '%s_%d'%(barename(p.tag), depth_below(p, body))
                if key not in self.levels:
                    self.levels[key] = []
                self.levels[key].append(p)

        doomed = set()
        for key, elems in iteritems(self.levels):
            count = len(elems)
            self.log.debug('Found %d items of level:'%count, key)
            parts = key.split('_')
            tag, depth = parts[0], int(parts[-1])
            if tag == 'p' and count < 25:
                doomed.add(key)
            if tag != 'div':
                continue
            if depth > 2 and count < 25:
                doomed.add(key)
            elif depth < 3:
                # For shallow divs, keep only the ones that contain
                # many paragraphs
                for elem in list(elems):
                    if len(paras(elem)) < 5:
                        elems.remove(elem)

        for key in doomed:
            self.levels.pop(key)
            self.log.debug('Ignoring level', key)
Exemplo n.º 26
0
    def find_levels(self):
        '''
        Populate self.levels with the <p> and <div> elements found in the
        spine, grouped under keys of the form ``<tag>_<depth>`` where depth is
        the nesting level below <body> (direct children have depth 1).

        Afterwards, groups that are too small are dropped:

            * <p> groups with fewer than 25 elements
            * <div> groups deeper than level 2 with fewer than 25 elements

        For <div> groups at level 1 or 2, individual elements with fewer than
        5 descendant <p>/<div> elements are removed from the group.
        '''

        def level_of(elem, body):
            ans = 1
            while elem.getparent() is not body:
                ans += 1
                elem = elem.getparent()
            return ans

        paras = XPath('descendant::h:p|descendant::h:div')

        for item in self.oeb.spine:
            body = XPath('//h:body')(item.data)
            if not body:
                continue
            body = body[0]

            for p in paras(body):
                level = level_of(p, body)
                level = '%s_%d'%(barename(p.tag), level)
                if level not in self.levels:
                    self.levels[level] = []
                self.levels[level].append(p)

        remove = set()
        # items() exists on both Python 2 and Python 3 mappings, whereas
        # iteritems() is Python 2 only. Only the value lists are mutated
        # inside the loop, never the set of keys, so iterating a view is safe.
        for k, v in self.levels.items():
            num = len(v)
            self.log.debug('Found %d items of level:'%num, k)
            level = int(k.split('_')[-1])
            tag = k.split('_')[0]
            if tag == 'p' and num < 25:
                remove.add(k)
            if tag == 'div':
                if level > 2 and num < 25:
                    remove.add(k)
                elif level < 3:
                    # Check each level < 3 element and only keep those
                    # that have many child paras
                    for elem in list(v):
                        children = len(paras(elem))
                        if children < 5:
                            v.remove(elem)

        # Keys are dropped only after the iteration above has finished
        for k in remove:
            self.levels.pop(k)
            self.log.debug('Ignoring level', k)
Exemplo n.º 27
0
def pretty_block(parent, level=1, indent='  '):
    ''' Put line breaks and indentation around the children of a block tag,
    recursing into child block tags that themselves contain only block tags.
    Table rows and cells get a single newline, everything else a blank line. '''
    lead = parent.text
    if not lead or isspace(lead):
        lead = ''
    parent.text = lead
    in_table = hasattr(parent.tag, 'strip') and barename(parent.tag) in {'tr', 'td', 'th'}
    separator = '\n' if in_table else '\n\n'
    parent.text = parent.text + separator + (indent * level)
    for pos, child in enumerate(parent):
        if isblock(child) and has_only_blocks(child):
            pretty_block(child, level=level+1, indent=indent)
        elif child.tag == SVG_TAG:
            pretty_xml_tree(child, level=level, indent=indent)
        # The last child is followed by the closing tag of parent, which sits
        # one indentation level further out
        depth = level - 1 if pos == len(parent) - 1 else level
        trailing = child.tail
        if not trailing or isspace(trailing):
            trailing = ''
        child.tail = trailing
        child.tail = child.tail + separator + (indent * depth)
Exemplo n.º 28
0
def pretty_block(parent, level=1, indent='  '):
    ''' Put line breaks and indentation around the children of a block tag,
    recursing into child block tags that themselves contain only block tags.
    Table rows and cells get a single newline, everything else a blank line. '''
    lead = parent.text
    if not lead or isspace(lead):
        lead = ''
    parent.text = lead
    in_table = hasattr(parent.tag, 'strip') and barename(parent.tag) in {'tr', 'td', 'th'}
    separator = '\n' if in_table else '\n\n'
    parent.text = parent.text + separator + (indent * level)
    for pos, child in enumerate(parent):
        if isblock(child) and has_only_blocks(child):
            pretty_block(child, level=level+1, indent=indent)
        elif child.tag == SVG_TAG:
            pretty_xml_tree(child, level=level, indent=indent)
        # The last child is followed by the closing tag of parent, which sits
        # one indentation level further out
        depth = level - 1 if pos == len(parent) - 1 else level
        trailing = child.tail
        if not trailing or isspace(trailing):
            trailing = ''
        child.tail = trailing
        child.tail = child.tail + separator + (indent * depth)
Exemplo n.º 29
0
    def process_tag(self, html_tag, stylizer, is_first_tag=False):
        '''
        Convert html_tag and, recursively, its element children into blocks
        and runs, dispatching on the CSS display value of the tag. The tail
        text of the tag is added to a block styled after the parent tag,
        except for table rows and for the first tag.
        '''
        tagname = barename(html_tag.tag)
        if tagname in {'script', 'style', 'title', 'meta'}:
            return
        tag_style = stylizer.style(html_tag)
        if tag_style.is_hidden:
            return
        display = tag_style._get('display')

        # <br> has display:block but we dont want to start a new paragraph
        if display in {'inline', 'inline-block'} or tagname == 'br':
            self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
        elif display == 'list-item':
            # TODO: Implement this
            self.add_block_tag(tagname, html_tag, tag_style, stylizer)
        elif display.startswith('table') or display == 'inline-table':
            if display == 'table-cell':
                self.blocks.start_new_cell(html_tag, tag_style)
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_table_cell=True)
            elif display == 'table-row':
                self.blocks.start_new_row(html_tag, tag_style)
            elif display in {'table', 'inline-table'}:
                self.blocks.start_new_table(html_tag, tag_style)
        elif tagname == 'img' and tag_style['float'] in {'left', 'right'}:
            # Image is floating so dont start a new paragraph for it
            self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
        else:
            self.add_block_tag(tagname, html_tag, tag_style, stylizer)

        for child in html_tag.iterchildren('*'):
            self.process_tag(child, stylizer)

        # Must be checked before finish_tag() is called for this tag
        is_block = html_tag in self.blocks.open_html_blocks
        self.blocks.finish_tag(html_tag)
        if is_block and tag_style['page-break-after'] == 'avoid':
            self.blocks.all_blocks[-1].keep_next = True

        if display == 'table-row':
            return  # We ignore the tail for these tags

        tail = html_tag.tail
        if is_first_tag or not tail:
            return
        if is_block and tail.isspace():
            # Ignore trailing space after a block tag, as otherwise it will
            # become a new empty paragraph
            return
        parent = html_tag.getparent()
        block = self.blocks.current_or_new_block(parent, stylizer.style(parent))
        block.add_text(tail, stylizer.style(parent), is_parent_style=True)
Exemplo n.º 30
0
def pretty_html_tree(container, root):
    ''' Pretty print an HTML tree in place: blank lines between the children
    of root, <head> formatted as XML, <body> formatted as blocks and, when a
    container is given, <script>/<style> tags formatted as well. '''
    blank_line = '\n\n'
    root.text = blank_line
    for child in root:
        child.tail = blank_line
        tag = child.tag
        if hasattr(tag, 'endswith') and tag.endswith('}head'):
            pretty_xml_tree(child)

    for body in root.findall('h:body', namespaces=XPNSMAP):
        pretty_block(body)
        # A body whose entire content lives in one block tag is special
        # cased: that wrapper is prettified even though it has non block
        # children.
        if len(body) != 1:
            continue
        wrapper = body[0]
        if callable(wrapper.tag) or not isblock(wrapper):
            continue
        if has_only_blocks(wrapper) or barename(wrapper.tag) == 'pre' or len(wrapper) == 0:
            continue
        pretty_block(wrapper, level=2)

    if container is None:
        return
    # Handle <script> and <style> tags
    for child in root.xpath('//*[local-name()="script" or local-name()="style"]'):
        pretty_script_or_style(container, child)
Exemplo n.º 31
0
def pretty_html_tree(container, root):
    ''' Pretty print an HTML tree in place: blank lines between the children
    of root, <head> formatted as XML, <body> formatted as blocks and, when a
    container is given, <script>/<style> tags formatted as well. '''
    blank_line = '\n\n'
    root.text = blank_line
    for child in root:
        child.tail = blank_line
        tag = child.tag
        if hasattr(tag, 'endswith') and tag.endswith('}head'):
            pretty_xml_tree(child)

    for body in root.findall('h:body', namespaces=XPNSMAP):
        pretty_block(body)
        # A body whose entire content lives in one block tag is special
        # cased: that wrapper is prettified even though it has non block
        # children. <pre>, <p> and heading wrappers are left alone.
        if len(body) != 1:
            continue
        wrapper = body[0]
        if callable(wrapper.tag) or not isblock(wrapper):
            continue
        if has_only_blocks(wrapper):
            continue
        if barename(wrapper.tag) in (
                'pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6') or len(wrapper) == 0:
            continue
        pretty_block(wrapper, level=2)

    if container is None:
        return
    # Handle <script> and <style> tags
    for child in root.xpath('//*[local-name()="script" or local-name()="style"]'):
        pretty_script_or_style(container, child)
Exemplo n.º 32
0
def adjust_split_point(split_point, log):
    '''
    Walk up from the split point while it is the very first thing inside its
    parent (first child, no non-whitespace text before it) and return the
    highest such ancestor. The walk never goes up into <body> or <html>.
    Typical case: <div id="chapter1"><h2>Chapter 1</h2>...</div> with a page
    break on the h2, where the split should happen at the div.
    '''
    sp = split_point
    parent = sp.getparent()
    while (parent is not None and barename(parent.tag) not in {'body', 'html'}
            and not (parent.text and parent.text.strip())
            and not parent.index(sp) > 0):
        sp = parent
        parent = sp.getparent()

    if sp is not split_point:
        log.debug('Adjusted split point to ancestor')

    return sp
Exemplo n.º 33
0
 def __init__(self, namespace, styles_manager, links_manager, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False, parent_bg=None):
     ''' Initialise the state of a block created for ``html_block``.

     When a ``float_spec`` is given the new block is appended to its list of
     blocks. The block style is obtained from ``styles_manager``. '''
     # Collaborators
     self.namespace = namespace
     self.styles_manager = styles_manager
     self.links_manager = links_manager

     # The source element and its computed style
     self.html_block = html_block
     self.html_style = style
     self.html_tag = barename(html_block.tag)
     self.list_tag = None
     if is_list_item:
         self.list_tag = (html_block, style)

     # Float handling
     self.float_spec = float_spec
     if float_spec is not None:
         float_spec.blocks.append(self)

     self.style = styles_manager.create_block_style(style, html_block, is_table_cell=is_table_cell, parent_bg=parent_bg)

     # Content and bookkeeping, filled in later
     self.runs = []
     self.bookmarks = set()
     self.numbering_id = None
     self.parent_items = None
     self.linked_style = None
     self.block_lang = None
     self.is_first_block = False
     self.skipped = False

     # Pagination flags
     self.keep_next = False
     self.page_break_before = style['page-break-before'] == 'always'
     self.keep_lines = style['page-break-inside'] == 'avoid'
     self.page_break_after = False
Exemplo n.º 34
0
def adjust_split_point(split_point, log):
    '''
    Walk up from the split point while it is the very first thing inside its
    parent (first child, no non-whitespace text before it) and return the
    highest such ancestor. The walk never goes up into <body> or <html>.
    Typical case: <div id="chapter1"><h2>Chapter 1</h2>...</div> with a page
    break on the h2, where the split should happen at the div.
    '''
    sp = split_point
    parent = sp.getparent()
    while (
        parent is not None and
        barename(parent.tag) not in {'body', 'html'} and
        not (parent.text and parent.text.strip()) and
        not parent.index(sp) > 0
    ):
        sp = parent
        parent = sp.getparent()

    if sp is not split_point:
        log.debug('Adjusted split point to ancestor')

    return sp
Exemplo n.º 35
0
    def dump_text(self, elem, stylizer, page):
        '''
        Serialise elem and its descendants as a list of markup fragments.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: Passed through unchanged to the recursive calls.
        '''

        def is_xhtml(node):
            return isinstance(node.tag, string_or_bytes) and namespace(node.tag) == XHTML_NS

        # Only XHTML tags are processed. For anything else, keep just the
        # tail text, and only when the parent is an XHTML tag.
        if not is_xhtml(elem):
            parent = elem.getparent()
            if parent is not None and is_xhtml(parent) and elem.tail:
                return [elem.tail]
            return ['']

        style = stylizer.style(elem)
        tag = barename(elem.tag)
        attribs = elem.attrib

        # <body> is written out as a <div>
        if tag == 'body':
            tag = 'div'
        open_tags = [tag]

        # Nothing is emitted for elements that are not displayed.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            return ['']

        # class and style attributes are dropped, the remaining attributes
        # are written with the tag.
        for unwanted in ('class', 'style'):
            if unwanted in attribs:
                del attribs[unwanted]
        at = ''.join(
            ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
            for k, v in attribs.items())

        text = ['', '<%s%s' % (tag, at)]
        text.append(' />' if tag in SELF_CLOSING_TAGS else '>')

        def open_style_tag(name):
            text.append('<%s>' % name)
            open_tags.append(name)

        # Express a few style properties as tags.
        if style['font-weight'] in ('bold', 'bolder'):
            open_style_tag('b')
        if style['font-style'] == 'italic':
            open_style_tag('i')
        decoration = style['text-decoration']
        if decoration == 'underline':
            open_style_tag('u')
        if decoration == 'line-through':
            open_style_tag('s')

        # Text inside the tag, before the first child.
        if hasattr(elem, 'text') and elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Children, recursively.
        for item in elem:
            text.extend(self.dump_text(item, stylizer, page))

        # Close whatever was opened, innermost first.
        for name in reversed(open_tags):
            if name not in SELF_CLOSING_TAGS:
                text.append('</%s>' % name)

        # Text that follows the closing tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
Exemplo n.º 36
0
    def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None):
        '''
        Convert html_tag and, recursively, its children into blocks and runs.

        The contents of <script>, <style>, <title> and <meta> tags and of
        hidden tags are ignored, but their tail text is still processed.
        Otherwise the tag is dispatched on its CSS display value to
        add_inline_tag(), add_block_tag() or the table handling of
        self.blocks, its children are processed and the tag is finished.

        self.current_link and self.current_lang are updated while the tag's
        contents are processed and restored before its tail is handled.

        :param html_tag: The element to process
        :param stylizer: Provides the computed style via stylizer.style(tag)
        :param is_first_tag: When True the tag is never treated as a float and
            its tail text is not processed
        :param float_spec: The float specification passed down from an
            ancestor tag, if any. A new FloatSpec is created when none is
            passed in and this tag floats left or right.
        '''
        tagname = barename(html_tag.tag)
        tag_style = stylizer.style(html_tag)
        ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'} or tag_style.is_hidden
        display = tag_style._get('display')
        is_block = False

        if not ignore_tag_contents:
            # Remember the current link and language so they can be restored
            # once the contents of this tag have been processed
            previous_link = self.current_link
            if tagname == 'a' and html_tag.get('href'):
                self.current_link = (self.current_item, html_tag.get('href'), html_tag.get('title'))
            previous_lang = self.current_lang
            tag_lang = lang_for_tag(html_tag)
            if tag_lang:
                self.current_lang = tag_lang

            is_float = tag_style['float'] in {'left', 'right'} and not is_first_tag
            if float_spec is None and is_float:
                float_spec = FloatSpec(self.docx.namespace, html_tag, tag_style)

            if display in {'inline', 'inline-block'} or tagname == 'br':  # <br> has display:block but we dont want to start a new paragraph
                if is_float and float_spec.is_dropcaps:
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)
                    # The float spec is not passed on to the children of a drop cap
                    float_spec = None
                else:
                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            elif display == 'list-item':
                self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_list_item=True)
            elif display.startswith('table') or display == 'inline-table':
                # Other table-* display values fall through without starting anything
                if display == 'table-cell':
                    self.blocks.start_new_cell(html_tag, tag_style)
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, is_table_cell=True)
                elif display == 'table-row':
                    self.blocks.start_new_row(html_tag, tag_style)
                elif display in {'table', 'inline-table'}:
                    self.blocks.end_current_block()
                    self.blocks.start_new_table(html_tag, tag_style)
            else:
                if tagname == 'img' and is_float:
                    # Image is floating so dont start a new paragraph for it
                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
                else:
                    if tagname == 'hr':
                        # Only the top border style of an <hr> is left untouched
                        for edge in 'right bottom left'.split():
                            tag_style.set('border-%s-style' % edge, 'none')
                    self.add_block_tag(tagname, html_tag, tag_style, stylizer, float_spec=float_spec)

            for child in html_tag.iterchildren():
                if isinstance(getattr(child, 'tag', None), basestring):
                    self.process_tag(child, stylizer, float_spec=float_spec)
                else:  # Comment/PI/etc.
                    # The node itself is skipped, but the text following it
                    # is added using the style of this tag
                    tail = getattr(child, 'tail', None)
                    if tail:
                        block = self.create_block_from_parent(html_tag, stylizer)
                        block.add_text(tail, tag_style, is_parent_style=False, link=self.current_link, lang=self.current_lang)

            # Has to be checked before finish_tag() is called for this tag
            is_block = html_tag in self.blocks.open_html_blocks
            self.blocks.finish_tag(html_tag)
            if is_block and tag_style['page-break-after'] == 'avoid':
                self.blocks.all_blocks[-1].keep_next = True

            self.current_link = previous_link
            self.current_lang = previous_lang

        # Now, process the tail if any

        if display == 'table-row':
            return  # We ignore the tail for these tags

        ignore_whitespace_tail = is_block or display.startswith('table')
        if not is_first_tag and html_tag.tail and (not ignore_whitespace_tail or not html_tag.tail.isspace()):
            # Ignore trailing space after a block tag, as otherwise it will
            # become a new empty paragraph
            block = self.create_block_from_parent(html_tag, stylizer)
            block.add_text(html_tag.tail, stylizer.style(html_tag.getparent()), is_parent_style=True, link=self.current_link, lang=self.current_lang)
Exemplo n.º 37
0
    def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id):
        '''
        Replace the styling of node by a single generated CSS class and
        recurse into its children.

        Nodes that are not XHTML elements are skipped. Presentational
        attributes (align, valign on <td>, color, bgcolor, size and face on
        <font>, type on <ol>) are folded into the CSS of the node and removed,
        and <font> elements are renamed to <div> or <span>. Font sizes are
        rescaled relative to psize unless rescaling is disabled or the node
        looks like a drop cap. The final declarations are serialised to CSS
        text, which is used to look up or create a class name that is then
        written to the class attribute; the style attribute is removed.

        :param node: The element to process
        :param stylizer: Provides the computed style via stylizer.style(node)
        :param names: Maps a base class name to the number of classes
            generated from it so far (presumably a defaultdict(int) -- TODO
            confirm against the caller)
        :param styles: Maps CSS text to the class name generated for it
        :param pseudo_styles: Maps a pseudo selector to a mapping of CSS text
            to class name
        :param psize: The font size of the parent, used as the base for the
            em values generated here
        :param item_id: The id of the item being processed, used only to
            special case 'calibre_jacket'
        '''
        if not isinstance(node.tag, string_or_bytes) \
           or namespace(node.tag) != XHTML_NS:
            return
        tag = barename(node.tag)
        style = stylizer.style(node)
        cssdict = style.cssdict()
        try:
            font_size = style['font-size']
        except:
            # Fall back to the base font size
            font_size = self.sbase if self.sbase is not None else \
                self.context.source.fbase
        if tag == 'body' and isinstance(font_size, numbers.Number):
            stylizer.body_font_size = font_size
        # Translate the align attribute into CSS
        if 'align' in node.attrib:
            if tag != 'img':
                cssdict['text-align'] = node.attrib['align']
                if cssdict['text-align'] == 'center':
                    # align=center causes tables to be center aligned,
                    # which text-align does not. And the ever trustworthy Word
                    # uses this construct in its HTML output. See
                    # https://bugs.launchpad.net/bugs/1569583
                    if tag == 'table':
                        if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
                            cssdict['margin-left'] = cssdict['margin-right'] = 'auto'
                    else:
                        for table in node.iterchildren(XHTML("table")):
                            ts = stylizer.style(table)
                            if ts.get('margin-left') is None and ts.get('margin-right') is None:
                                ts.set('margin-left', 'auto')
                                ts.set('margin-right', 'auto')
            else:
                # On images align maps to vertical-align or float
                val = node.attrib['align']
                if val in ('middle', 'bottom', 'top'):
                    cssdict['vertical-align'] = val
                elif val in ('left', 'right'):
                    cssdict['float'] = val
            del node.attrib['align']
        if 'valign' in node.attrib and tag == 'td':
            # The attribute is only used when vertical-align is 'inherit'
            if cssdict.get('vertical-align') == 'inherit':
                cssdict['vertical-align'] = node.attrib['valign']
            del node.attrib['valign']
        if node.tag == XHTML('font'):
            # <font> becomes a <div> if it contains any of the tags below,
            # otherwise a <span>
            tags = ['descendant::h:%s'%x for x in ('p', 'div', 'table', 'h1',
                'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')]
            tag = 'div' if XPath('|'.join(tags))(node) else 'span'
            node.tag = XHTML(tag)
            if 'size' in node.attrib:
                def force_int(raw):
                    return int(re.search(r'([0-9+-]+)', raw).group(1))
                size = node.attrib['size'].strip()
                if size:
                    fnums = self.context.source.fnums
                    if size[0] in ('+', '-'):
                        # Oh, the warcrimes
                        # Relative size: offset from 3, clamped to 1..7
                        try:
                            esize = 3 + force_int(size)
                        except:
                            esize = 3
                        if esize < 1:
                            esize = 1
                        if esize > 7:
                            esize = 7
                        font_size = fnums[esize]
                    else:
                        # Absolute size, with size 3 as the fallback
                        try:
                            font_size = fnums[force_int(size)]
                        except:
                            font_size = fnums[3]
                    cssdict['font-size'] = '%.1fpt'%font_size
                del node.attrib['size']
            if 'face' in node.attrib:
                cssdict['font-family'] = node.attrib['face']
                del node.attrib['face']
        if 'color' in node.attrib:
            # Values that fail to parse are dropped together with the attribute
            try:
                cssdict['color'] = Property('color', node.attrib['color']).value
            except (ValueError, SyntaxErr):
                pass
            del node.attrib['color']
        if 'bgcolor' in node.attrib:
            try:
                cssdict['background-color'] = Property('background-color', node.attrib['bgcolor']).value
            except (ValueError, SyntaxErr):
                pass
            del node.attrib['bgcolor']
        if tag == 'ol' and 'type' in node.attrib:
            del node.attrib['type']
        if cssdict.get('font-weight', '').lower() == 'medium':
            cssdict['font-weight'] = 'normal'  # ADE chokes on font-weight medium

        fsize = font_size
        # A drop cap is a left floated, childless node with an explicit font
        # size whose text is a single character, or two characters the first
        # of which lies in the range U+2000 to U+206F
        is_drop_cap = (cssdict.get('float', None) == 'left' and 'font-size' in cssdict and len(node) == 0 and node.text and (
            len(node.text) == 1 or (len(node.text) == 2 and 0x2000 <= ord(node.text[0]) <= 0x206f)))
        # Detect drop caps generated by the docx input plugin
        if node.tag and node.tag.endswith('}p') and len(node) == 0 and node.text and len(node.text.strip()) == 1 and \
                not node.tail and 'line-height' in cssdict and 'font-size' in cssdict:
            dp = node.getparent()
            if dp.tag and dp.tag.endswith('}div') and len(dp) == 1 and not dp.text:
                if stylizer.style(dp).cssdict().get('float', None) == 'left':
                    is_drop_cap = True
        # Rescale the font size and express it in em relative to the parent
        if not self.context.disable_font_rescaling and not is_drop_cap:
            _sbase = self.sbase if self.sbase is not None else \
                self.context.source.fbase
            dyn_rescale = dynamic_rescale_factor(node)
            if dyn_rescale is not None:
                fsize = self.fmap[_sbase]
                fsize *= dyn_rescale
                cssdict['font-size'] = '%0.5fem'%(fsize/psize)
                psize = fsize
            elif 'font-size' in cssdict or tag == 'body':
                fsize = self.fmap[font_size]
                try:
                    cssdict['font-size'] = "%0.5fem" % (fsize / psize)
                except ZeroDivisionError:
                    # No usable parent size, fall back to an absolute size
                    cssdict['font-size'] = '%.1fpt'%fsize
                psize = fsize

        # Enforce the minimum line height, failures are logged and ignored
        try:
            minlh = self.context.minimum_line_height / 100.
            if not is_drop_cap and style['line-height'] < minlh * fsize:
                cssdict['line-height'] = str(minlh)
        except:
            self.oeb.logger.exception('Failed to set minimum line-height')

        if cssdict:
            # Remove the filtered properties. A filtered font-family is put
            # back when its first entry, with the first and last characters
            # stripped, matches that of the body font family.
            for x in self.filter_css:
                popval = cssdict.pop(x, None)
                if self.body_font_family and popval and x == 'font-family' \
                    and popval.partition(',')[0][1:-1] == self.body_font_family.partition(',')[0][1:-1]:
                    cssdict[x] = popval

        if cssdict:
            if self.lineh and self.fbase and tag != 'body':
                self.clean_edges(cssdict, style, psize)
            if 'display' in cssdict and cssdict['display'] == 'in-line':
                cssdict['display'] = 'inline'
            if self.unfloat and 'float' in cssdict \
               and cssdict.get('display', 'none') != 'none':
                del cssdict['display']
            if self.untable and 'display' in cssdict \
               and cssdict['display'].startswith('table'):
                display = cssdict['display']
                if display == 'table-cell':
                    cssdict['display'] = 'inline'
                else:
                    cssdict['display'] = 'block'
            if 'vertical-align' in cssdict \
               and cssdict['vertical-align'] == 'sup':
                cssdict['vertical-align'] = 'super'
        if self.lineh and 'line-height' not in cssdict:
            lineh = self.lineh / psize
            cssdict['line-height'] = "%0.5fem" % lineh

        # Paragraph spacing and indentation options, applied to <p> and <div>
        if (self.context.remove_paragraph_spacing or self.context.insert_blank_line) and tag in ('p', 'div'):
            if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
                for prop in ('margin', 'padding', 'border'):
                    for edge in ('top', 'bottom'):
                        cssdict['%s-%s'%(prop, edge)] = '0pt'
            if self.context.insert_blank_line:
                cssdict['margin-top'] = cssdict['margin-bottom'] = \
                    '%fem'%self.context.insert_blank_line_size
            indent_size = self.context.remove_paragraph_spacing_indent_size
            # A negative indent size means existing indents are kept
            keep_indents = indent_size < 0.0
            if (self.context.remove_paragraph_spacing and not keep_indents and cssdict.get('text-align', None) not in ('center', 'right')):
                cssdict['text-indent'] =  "%1.1fem" % indent_size

        pseudo_classes = style.pseudo_classes(self.filter_css)
        if cssdict or pseudo_classes:
            keep_classes = set()

            if cssdict:
                # Identical CSS text always maps to the same class name
                items = sorted(iteritems(cssdict))
                css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
                classes = node.get('class', '').strip() or 'calibre'
                # lower() because otherwise if the document uses the same class
                # name with different case, both cases will apply, leading
                # to incorrect results.
                klass = ascii_text(STRIPNUM.sub('', classes.split()[0])).lower().strip().replace(' ', '_')
                if css in styles:
                    match = styles[css]
                else:
                    match = klass + str(names[klass] or '')
                    styles[css] = match
                    names[klass] += 1
                node.attrib['class'] = match
                keep_classes.add(match)

            # Note that the loop variable below rebinds cssdict
            for psel, cssdict in iteritems(pseudo_classes):
                items = sorted(iteritems(cssdict))
                css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
                pstyles = pseudo_styles[psel]
                if css in pstyles:
                    match = pstyles[css]
                else:
                    # We have to use a different class for each psel as
                    # otherwise you can have incorrect styles for a situation
                    # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                    # If the pcalibre class for a:hover and a:link is the same,
                    # then the class attribute for a.x tags will contain both
                    # that class and the class for a.x:hover, which is wrong.
                    klass = 'pcalibre'
                    match = klass + str(names[klass] or '')
                    pstyles[css] = match
                    names[klass] += 1
                keep_classes.add(match)
                node.attrib['class'] = ' '.join(keep_classes)

        elif 'class' in node.attrib:
            # Nothing to style, so the class attribute is dropped
            del node.attrib['class']
        if 'style' in node.attrib:
            del node.attrib['style']
        # Children are sized relative to the font size computed for this node
        for child in node:
            self.flatten_node(child, stylizer, names, styles, pseudo_styles, psize, item_id)
Exemplo n.º 38
0
    def mobimlize_content(self, tag, text, bstate, istates):
        'Convert text content'
        # Adds ``text`` for the element named ``tag`` to the tree rooted at
        # ``bstate.body``, creating a block container and inline formatting
        # wrappers as needed.
        #
        # tag:     bare tag name (compared against literals such as 'br', 'li').
        # text:    text to append; when falsy only structure and anchors are
        #          emitted and the method returns before the inline stage.
        # bstate:  block state; this method reads and updates its para, inline,
        #          istate, anchor, nested, pbreak, content, vmargin and
        #          vpadding attributes.
        # istates: stack of inline format states; the top entry is the one
        #          applied (the entry below it supplies list numbering).
        # Returns None.

        # Everything except an empty <br> counts as real content for the block.
        if text or tag != 'br':
            bstate.content = True
        istate = istates[-1]
        para = bstate.para
        if tag in SPECIAL_TAGS and not text:
            # Text-less special tags reuse the open paragraph, or hang
            # directly off the body when there is none.
            para = para if para is not None else bstate.body
        elif para is None or tag in ('td', 'th'):
            # No open paragraph (or a table cell): build a new block container.
            body = bstate.body
            if bstate.pbreak:
                # Flush a pending page break before the new block.
                etree.SubElement(body, MBP('pagebreak'))
                bstate.pbreak = False
            bstate.istate = None
            bstate.anchor = None
            parent = bstate.nested[-1] if bstate.nested else bstate.body
            indent = istate.indent
            left = istate.left
            # A string-valued indent is treated as no indent.
            if isinstance(indent, basestring):
                indent = 0
            if indent < 0 and abs(indent) < left:
                # A negative indent smaller than the left margin is absorbed
                # into the margin.
                left += indent
                indent = 0
            elif indent != 0 and abs(indent) < self.profile.fbase:
                # Indents smaller in magnitude than profile.fbase are bumped
                # to exactly fbase, keeping their sign.
                indent = (indent / abs(indent)) * self.profile.fbase
            if tag in NESTABLE_TAGS and not istate.rendered:
                # First content for a nestable tag: create it and remember it
                # as the parent for blocks nested inside it.
                para = wrapper = etree.SubElement(parent,
                                                  XHTML(tag),
                                                  attrib=istate.attrib)
                bstate.nested.append(para)
                if tag == 'li' and len(istates) > 1:
                    # The running item number lives on the enclosing state.
                    istates[-2].list_num += 1
                    para.attrib['value'] = str(istates[-2].list_num)
            elif tag in NESTABLE_TAGS and istate.rendered:
                # Already created earlier: keep writing into it.
                para = wrapper = bstate.nested[-1]
            elif not self.opts.mobi_ignore_margins and left > 0 and indent >= 0:
                # A left margin is rendered as one or more nested blockquotes.
                ems = self.profile.mobi_ems_per_blockquote
                para = wrapper = etree.SubElement(parent, XHTML('blockquote'))
                para = wrapper
                emleft = int(round(left / self.profile.fbase)) - ems
                # Cap the remaining margin so nesting depth stays bounded.
                emleft = min((emleft, 10))
                while emleft > ems / 2.0:
                    para = etree.SubElement(para, XHTML('blockquote'))
                    emleft -= ems
            else:
                para = wrapper = etree.SubElement(parent, XHTML('p'))
            bstate.inline = bstate.para = para
            # Consume the vertical space accumulated since the last block.
            vspace = bstate.vpadding + bstate.vmargin
            bstate.vpadding = bstate.vmargin = 0
            if tag not in TABLE_TAGS:
                if tag in ('ul', 'ol') and vspace > 0:
                    # For lists the space goes on a preceding empty <div>.
                    wrapper.addprevious(
                        etree.Element(XHTML('div'),
                                      height=self.mobimlize_measure(vspace)))
                else:
                    wrapper.attrib['height'] = self.mobimlize_measure(vspace)
                # The first-line indent is carried by the 'width' attribute.
                para.attrib['width'] = self.mobimlize_measure(indent)
            elif tag == 'table' and vspace > 0:
                # Tables get the space as a run of <br> elements, one per
                # profile.fbase unit of space.
                vspace = int(round(vspace / self.profile.fbase))
                while vspace > 0:
                    wrapper.addprevious(etree.Element(XHTML('br')))
                    vspace -= 1
            if istate.halign != 'auto' and isinstance(istate.halign,
                                                      (str, unicode)):
                para.attrib['align'] = istate.halign
        istate.rendered = True
        # Inline state used for the previous piece of text in this block.
        pstate = bstate.istate
        if tag in CONTENT_TAGS:
            # Content tags are emitted as elements of their own and reset the
            # inline formatting context.
            bstate.inline = para
            pstate = bstate.istate = None
            try:
                etree.SubElement(para, XHTML(tag), attrib=istate.attrib)
            except:
                # Report the offending combination, then propagate the error.
                print 'Invalid subelement:', para, tag, istate.attrib
                raise
        elif tag in TABLE_TAGS:
            para.attrib['valign'] = 'top'
        if istate.ids:
            # Emit an <a id=...> anchor for every id collected on this state.
            for id_ in istate.ids:
                anchor = etree.Element(XHTML('a'), attrib={'id': id_})
                if tag == 'li':
                    try:
                        last = bstate.body[-1][-1]
                    except:
                        # Nothing to attach to; give up on remaining ids.
                        break
                    # Put the anchor first and move the existing leading text
                    # behind it.
                    last.insert(0, anchor)
                    anchor.tail = last.text
                    last.text = None
                else:
                    last = bstate.body[-1]
                    # We use append instead of addprevious so that inline
                    # anchors in large blocks point to the correct place. See
                    # https://bugs.launchpad.net/calibre/+bug/899831
                    # This could potentially break if inserting an anchor at
                    # this point in the markup is illegal, but I cannot think
                    # of such a case offhand.
                    if barename(last.tag) in LEAF_TAGS:
                        last.addprevious(anchor)
                    else:
                        last.append(anchor)

            istate.ids.clear()
        if not text:
            return
        if not pstate or istate != pstate:
            # Formatting changed (or there was none): rebuild the chain of
            # inline wrapper elements, outermost first.
            inline = para
            fsize = istate.fsize
            href = istate.href
            if not href:
                bstate.anchor = None
            elif pstate and pstate.href == href:
                # Same link as the previous text: keep using its <a>.
                inline = bstate.anchor
            else:
                inline = etree.SubElement(inline, XHTML('a'), href=href)
                bstate.anchor = inline

            # Size 3 is emitted without a <font size=...> wrapper.
            if fsize != 3:
                inline = etree.SubElement(inline,
                                          XHTML('font'),
                                          size=str(fsize))
            if istate.family == 'monospace':
                inline = etree.SubElement(inline, XHTML('tt'))
            if istate.italic:
                inline = etree.SubElement(inline, XHTML('i'))
            if istate.bold:
                inline = etree.SubElement(inline, XHTML('b'))
            if istate.bgcolor is not None and istate.bgcolor != 'transparent':
                inline = etree.SubElement(inline,
                                          XHTML('span'),
                                          bgcolor=istate.bgcolor)
            if istate.fgcolor != 'black':
                inline = etree.SubElement(inline,
                                          XHTML('font'),
                                          color=unicode(istate.fgcolor))
            if istate.strikethrough:
                inline = etree.SubElement(inline, XHTML('s'))
            if istate.underline:
                inline = etree.SubElement(inline, XHTML('u'))
            bstate.inline = inline
        bstate.istate = istate
        inline = bstate.inline
        # With white-space preservation the text is first split by
        # preize_text into a sequence of strings and non-string items.
        content = self.preize_text(text) if istate.preserve else [text]
        for item in content:
            if isinstance(item, basestring):
                # Strings extend the element text, or the tail of the last
                # child when the element already has children.
                if len(inline) == 0:
                    inline.text = (inline.text or '') + item
                else:
                    last = inline[-1]
                    last.tail = (last.tail or '') + item
            else:
                inline.append(item)
Exemplo n.º 39
0
    def mobimlize_elem(self,
                       elem,
                       stylizer,
                       bstate,
                       istates,
                       ignore_valign=False):
        """Convert ``elem`` and its descendants, emitting output through
        ``mobimlize_content``.

        Elements whose tag is not a string in the XHTML namespace are
        skipped. A copy of the top of ``istates`` is pushed for this element,
        filled in from the computed style, and removed again at the end of
        the method (or inside the vertical-align path).

        :param elem: source element; its tag, text, tail, attributes and
            children are read, and some of them are modified in place.
        :param stylizer: object whose ``style(elem)`` gives the computed style.
        :param bstate: block state shared with ``mobimlize_content``.
        :param istates: stack (list) of inline format states.
        :param ignore_valign: when true the sup/sub handling is skipped; the
            method passes ``True`` in the recursive call it makes to render
            vertically aligned content.
        """
        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            return
        style = stylizer.style(elem)
        # <mbp:frame-set/> does not exist lalalala
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            id_ = elem.get('id', None)
            if id_:
                # Keep anchors so people can use display:none
                # to generate hidden TOCs
                # The element is emptied and turned into a bare <a id=...>,
                # keeping its tail text.
                tail = elem.tail
                elem.clear()
                elem.text = None
                elem.set('id', id_)
                elem.tail = tail
                elem.tag = XHTML('a')
            else:
                return
        tag = barename(elem.tag)
        # Each element works on its own copy of the enclosing format state.
        istate = copy.copy(istates[-1])
        istate.rendered = False
        istate.list_num = 0
        if tag == 'ol' and 'start' in elem.attrib:
            # list_num is incremented before use, hence the "- 1".
            try:
                istate.list_num = int(elem.attrib['start']) - 1
            except:
                pass
        istates.append(istate)
        left = 0
        # Normalise table display values: cells behave as inline, every other
        # table-* value as block.
        display = style['display']
        if display == 'table-cell':
            display = 'inline'
        elif display.startswith('table'):
            display = 'block'
        # Block elements are non-inline, non-floated and not <br>.
        isblock = (not display.startswith('inline')
                   and style['display'] != 'none')
        isblock = isblock and style['float'] == 'none'
        isblock = isblock and tag != 'br'
        if isblock:
            # A block closes the current paragraph and records its own
            # alignment, indent, left offset and top spacing.
            bstate.para = None
            istate.halign = style['text-align']
            istate.indent = style['text-indent']
            if style['margin-left'] == 'auto' \
               and style['margin-right'] == 'auto':
                istate.halign = 'center'
            margin = asfloat(style['margin-left'])
            padding = asfloat(style['padding-left'])
            if tag != 'body':
                left = margin + padding
            istate.left += left
            # The larger of the pending and the new margin wins; padding
            # converts the pending margin into padding and adds to it.
            vmargin = asfloat(style['margin-top'])
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-top'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        elif not istate.href:
            # Inline element outside a link: horizontal margin/padding is
            # emulated with non-breaking spaces, round(3 * space / font-size)
            # of them on each side.
            margin = asfloat(style['margin-left'])
            padding = asfloat(style['padding-left'])
            lspace = margin + padding
            if lspace > 0:
                spaces = int(round((lspace * 3) / style['font-size']))
                elem.text = (u'\xa0' * spaces) + (elem.text or '')
            margin = asfloat(style['margin-right'])
            padding = asfloat(style['padding-right'])
            rspace = margin + padding
            if rspace > 0:
                spaces = int(round((rspace * 3) / style['font-size']))
                if len(elem) == 0:
                    elem.text = (elem.text or '') + (u'\xa0' * spaces)
                else:
                    # NOTE(review): this extends the last child's text, not
                    # its tail, so the padding lands inside that child --
                    # confirm this is intended.
                    last = elem[-1]
                    last.text = (last.text or '') + (u'\xa0' * spaces)
        if bstate.content and style['page-break-before'] in PAGE_BREAKS:
            bstate.pbreak = True
        # Copy the character-level formatting from the computed style.
        istate.fsize = self.mobimlize_font(style['font-size'])
        istate.italic = True if style['font-style'] == 'italic' else False
        weight = style['font-weight']
        istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
        istate.preserve = (style['white-space'] in ('pre', 'pre-wrap'))
        istate.bgcolor = style['background-color']
        istate.fgcolor = style['color']
        istate.strikethrough = style.effective_text_decoration == 'line-through'
        istate.underline = style.effective_text_decoration == 'underline'
        # Reduce the font-family value to monospace / sans-serif / serif by
        # substring matching.
        ff = style['font-family'].lower() if style['font-family'] else ''
        if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
            istate.family = 'monospace'
        elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff
              or 'arial' in ff or 'helvetica' in ff):
            istate.family = 'sans-serif'
        else:
            istate.family = 'serif'
        # Both id and name become anchor targets.
        if 'id' in elem.attrib:
            istate.ids.add(elem.attrib['id'])
        if 'name' in elem.attrib:
            istate.ids.add(elem.attrib['name'])
        if tag == 'a' and 'href' in elem.attrib:
            istate.href = elem.attrib['href']
        istate.attrib.clear()
        if tag == 'img' and 'src' in elem.attrib:
            istate.attrib['src'] = elem.attrib['src']
            istate.attrib['align'] = 'baseline'
            cssdict = style.cssdict()
            valign = cssdict.get('vertical-align', None)
            if valign in ('top', 'bottom', 'middle'):
                istate.attrib['align'] = valign
            for prop in ('width', 'height'):
                if cssdict[prop] != 'auto':
                    value = style[prop]
                    # A size equal to the profile's own dimension is written
                    # as 100%.
                    if value == getattr(self.profile, prop):
                        result = '100%'
                    else:
                        # Amazon's renderer does not support
                        # img sizes in units other than px
                        # See #7520 for test case
                        try:
                            pixs = int(
                                round(float(value) / (72. / self.profile.dpi)))
                        except:
                            continue
                        result = str(pixs)
                    istate.attrib[prop] = result
            if 'width' not in istate.attrib or 'height' not in istate.attrib:
                # At least one dimension is missing: read the real image size
                # from the manifest item.
                href = self.current_spine_item.abshref(elem.attrib['src'])
                try:
                    item = self.oeb.manifest.hrefs[urlnormalize(href)]
                except:
                    self.oeb.logger.warn('Failed to find image:', href)
                else:
                    try:
                        width, height = identify_data(item.data)[:2]
                    except:
                        self.oeb.logger.warn('Invalid image:', href)
                    else:
                        if 'width' not in istate.attrib and 'height' not in \
                                    istate.attrib:
                            istate.attrib['width'] = str(width)
                            istate.attrib['height'] = str(height)
                        else:
                            # Only one dimension is known: derive the other
                            # from the image's aspect ratio, falling back to
                            # the image's own value if the known one is not
                            # an integer string (e.g. '100%').
                            ar = float(width) / float(height)
                            if 'width' not in istate.attrib:
                                try:
                                    width = int(istate.attrib['height']) * ar
                                except:
                                    pass
                                istate.attrib['width'] = str(int(width))
                            else:
                                try:
                                    height = int(istate.attrib['width']) / ar
                                except:
                                    pass
                                istate.attrib['height'] = str(int(height))
                        item.unload_data_from_memory()
        elif tag == 'hr' and asfloat(style['width']) > 0:
            # Rule width is expressed as a percentage of the profile width.
            prop = style['width'] / self.profile.width
            istate.attrib['width'] = "%d%%" % int(round(prop * 100))
        # NOTE(review): ``display`` was rewritten above to 'inline' or
        # 'block' for every table-* value, so the three branches below look
        # unreachable -- confirm whether style['display'] was meant here.
        elif display == 'table':
            tag = 'table'
        elif display == 'table-row':
            tag = 'tr'
        elif display == 'table-cell':
            tag = 'td'
        if tag in TABLE_TAGS and self.ignore_tables:
            tag = 'span' if tag == 'td' else 'div'

        if tag in ('table', 'td', 'tr'):
            # Translate CSS background and border into HTML attributes on the
            # source element; they are copied to the output state just below.
            col = style.backgroundColor
            if col:
                elem.set('bgcolor', col)
            css = style.cssdict()
            if 'border' in css or 'border-width' in css:
                elem.set('border', '1')
        if tag in TABLE_TAGS:
            for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
                         'bgcolor'):
                if attr in elem.attrib:
                    istate.attrib[attr] = elem.attrib[attr]
        if tag == 'q':
            # Surround the quotation with curly double quotes.
            t = elem.text
            if not t:
                t = ''
            elem.text = u'\u201c' + t
            t = elem.tail
            if not t:
                t = ''
            elem.tail = u'\u201d' + t
        text = None
        if elem.text:
            if istate.preserve:
                text = elem.text
            elif (len(elem) > 0 and isspace(elem.text)
                  and hasattr(elem[0].tag, 'rpartition')
                  and elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
                # Whitespace-only text before a non-inline first child is
                # dropped.
                text = None
            else:
                text = COLLAPSE.sub(' ', elem.text)
        valign = style['vertical-align']
        not_baseline = valign in ('super', 'sub', 'text-top', 'text-bottom',
                                  'top', 'bottom') or (isinstance(
                                      valign,
                                      (float, int)) and abs(valign) != 0)
        issup = valign in ('super', 'text-top',
                           'top') or (isinstance(valign,
                                                 (float, int)) and valign > 0)
        vtag = 'sup' if issup else 'sub'
        if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
            # Vertically shifted inline content is rendered into a scratch
            # tree first and then moved under <sup>/<sub><small>.
            nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
            vbstate = BlockState(etree.SubElement(nroot, XHTML('body')))
            vbstate.para = etree.SubElement(vbstate.body, XHTML('p'))
            self.mobimlize_elem(elem,
                                stylizer,
                                vbstate,
                                istates,
                                ignore_valign=True)
            # Drop this element's state; keep the stack non-empty for the
            # mobimlize_content call below.
            if len(istates) > 0:
                istates.pop()
            if len(istates) == 0:
                istates.append(FormatState())
            at_start = bstate.para is None
            if at_start:
                # Force creation of a paragraph to hold the sup/sub element.
                self.mobimlize_content('span', '', bstate, istates)
            parent = bstate.para if bstate.inline is None else bstate.inline
            if parent is not None:
                vtag = etree.SubElement(parent, XHTML(vtag))
                vtag = etree.SubElement(vtag, XHTML('small'))
                # Add anchors
                for child in vbstate.body:
                    if child is not vbstate.para:
                        vtag.append(child)
                    else:
                        break
                if vbstate.para is not None:
                    if vbstate.para.text:
                        vtag.text = vbstate.para.text
                    for child in vbstate.para:
                        vtag.append(child)
                return

        if tag == 'blockquote':
            # Margins are always honoured inside a blockquote; the option is
            # restored after the children have been processed.
            old_mim = self.opts.mobi_ignore_margins
            self.opts.mobi_ignore_margins = False

        if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
                # We have an id but no text and no children, the id should still
                # be added.
                istate.ids and tag in ('a', 'span', 'i', 'b', 'u')
                and len(elem) == 0)):
            self.mobimlize_content(tag, text, bstate, istates)
        for child in elem:
            self.mobimlize_elem(child, stylizer, bstate, istates)
            # Text following the child belongs to this element.
            tail = None
            if child.tail:
                if istate.preserve:
                    tail = child.tail
                elif bstate.para is None and isspace(child.tail):
                    # Whitespace between blocks is dropped.
                    tail = None
                else:
                    tail = COLLAPSE.sub(' ', child.tail)
            if tail:
                self.mobimlize_content(tag, tail, bstate, istates)

        if tag == 'blockquote':
            self.opts.mobi_ignore_margins = old_mim

        if bstate.content and style['page-break-after'] in PAGE_BREAKS:
            bstate.pbreak = True
        if isblock:
            para = bstate.para
            # A paragraph holding nothing but a single non-breaking space is
            # replaced by <br> or removed, depending on the styled height.
            if para is not None and para.text == u'\xa0' and len(para) < 1:
                if style.height > 2:
                    para.getparent().replace(para, etree.Element(XHTML('br')))
                else:
                    # This is too small to be rendered effectively, drop it
                    para.getparent().remove(para)
            bstate.para = None
            bstate.istate = None
            # Bottom spacing is accumulated the same way as top spacing.
            vmargin = asfloat(style['margin-bottom'])
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-bottom'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        # Leave the nesting level opened for this element, if any.
        if bstate.nested and bstate.nested[-1].tag == elem.tag:
            bstate.nested.pop()
        istates.pop()
Exemplo n.º 40
0
def iterhtmllinks(container, name):
    """Yield ``(element, attribute, link)`` for links in the parsed file
    ``name`` that sit on an element other than ``<a>`` and for which
    ``is_external(link)`` is true.
    """
    root = container.parsed(name)
    for el, attr, link, _pos in iterlinks(root):
        # Links on <a> elements are never reported.
        if barename(el.tag).lower() == 'a':
            continue
        if is_external(link):
            yield el, attr, link
Exemplo n.º 41
0
    def dump_text(self, elem, stylizer, tag_stack=None):
        """Recursively render ``elem`` and its descendants as RTF markup.

        :param elem: element to convert; anything that is not an XHTML
            element contributes only its tail text (and only when its parent
            is an XHTML element).
        :param stylizer: object whose ``style(elem)`` gives the computed style.
        :param tag_stack: list of RTF groups currently open, shared down the
            recursion; the marker ``'block'`` records that a paragraph block
            is open. Callers normally omit it.
        :return: the RTF text for ``elem``, including its tail.
        """
        from calibre.ebooks.oeb.base import (XHTML_NS, namespace, barename,
                urlnormalize)

        # Use a fresh stack per top-level call. A shared ``[]`` default would
        # keep leftover entries if an exception interrupted an earlier dump,
        # silently changing the output of every later call.
        if tag_stack is None:
            tag_stack = []

        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return elem.tail
            return u''

        text = u''
        style = stylizer.style(elem)

        # Hidden elements produce no output of their own, only their tail.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return elem.tail
            return u''

        tag = barename(elem.tag)
        # Number of entries this call pushes onto tag_stack; exactly that
        # many are popped again below.
        tag_count = 0

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            if 'block' not in tag_stack:
                tag_count += 1
                tag_stack.append('block')

        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument
        if tag == 'img':
            src = elem.get('src')
            if src:
                src = urlnormalize(self.currently_dumping_item.abshref(src))
                block_start = ''
                block_end = ''
                # An image outside any block gets a paragraph of its own.
                if 'block' not in tag_stack:
                    block_start = '{\\par\\pard\\hyphpar '
                    block_end = '}'
                # Placeholder text carrying the image href.
                text += '%s SPECIAL_IMAGE-%s-REPLACE_ME %s' % (block_start, src, block_end)

        single_tag = SINGLE_TAGS.get(tag, None)
        if single_tag:
            text += single_tag

        # Open an RTF group for the tag unless the same group is already open.
        rtf_tag = TAGS.get(tag, None)
        if rtf_tag and rtf_tag not in tag_stack:
            tag_count += 1
            text += '{%s\n' % rtf_tag
            tag_stack.append(rtf_tag)

        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag and style_tag not in tag_stack:
                tag_count += 1
                text += '{%s\n' % style_tag
                tag_stack.append(style_tag)

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text += txt2rtf(elem.text)

        for item in elem:
            text += self.dump_text(item, stylizer, tag_stack)

        # Close everything this call opened, innermost first. The 'block'
        # marker has no RTF group of its own, so it emits nothing.
        for _ in range(tag_count):
            end_tag = tag_stack.pop()
            if end_tag != 'block':
                if tag in BLOCK_TAGS:
                    text += u'\\par\\pard\\plain\\hyphpar}'
                else:
                    text += u'}'

        # Tail text outside any block is wrapped in its own paragraph.
        if hasattr(elem, 'tail') and elem.tail:
            if 'block' in tag_stack:
                text += '%s' % txt2rtf(elem.tail)
            else:
                text += '{\\par\\pard\\hyphpar %s}' % txt2rtf(elem.tail)

        return text
Exemplo n.º 42
0
    def dump_text(self, elem, stylizer):
        '''
        Recursively convert ``elem`` and its children to Markdown fragments.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.

        Returns a list of strings that make up the Markdown for ``elem``,
        its descendants and its tail text. Formatting state that must
        survive across elements (open emphasis, code/pre mode, blockquote
        depth, list nesting) is kept on ``self`` and updated as tags are
        opened and closed.
        '''
        headings = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, string_or_bytes) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = []
        style = stylizer.style(elem)
        # Closing markers for what this element opens, in opening order.
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib

        # Ignore anything that is set to not be displayed.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        # Soft scene breaks.
        if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
            ems = int(round(float(style.marginTop) / style.fontSize) - 1)
            if ems >= 1:
                text.append(u'\n\n' * ems)

        bq = '> ' * self.blockquotes
        # Block level elements
        if tag in headings or tag in ('p', 'div'):
            h_tag = ''
            if tag in headings:
                # The digit in the tag name is the number of '#' characters.
                h_tag = '#' * int(tag[1]) + ' '
            text.append('\n' + bq + h_tag)
            tags.append('\n')
            self.remove_space_after_newline = True

        # Emphasis is opened only once; nested italic/bold elements do not
        # emit further markers while the outer one is still open.
        if style['font-style'] == 'italic' or tag in ('i', 'em'):
            if tag not in headings and tag != 'cite':
                if self.style_italic == False:  # noqa
                    text.append('*')
                    tags.append('*')
                    self.style_italic = True
        if style['font-weight'] in ('bold', 'bolder') or tag in ('b',
                                                                 'strong'):
            if tag not in headings and tag != 'th':
                if self.style_bold == False:  # noqa
                    text.append('**')
                    tags.append('**')
                    self.style_bold = True
        if tag == 'br':
            text.append('  \n')
            self.remove_space_after_newline = True
        if tag == 'blockquote':
            self.blockquotes += 1
            tags.append('>')
            text.append('> ' * self.blockquotes)
        elif tag == 'code':
            if not self.in_pre and not self.in_code:
                text.append('`')
                tags.append('`')
                self.in_code = True
        elif tag == 'pre':
            if not self.in_pre:
                text.append('\n')
                tags.append('pre')
                self.in_pre = True
        elif tag == 'hr':
            text.append('\n* * *')
            tags.append('\n')
        elif tag == 'a':
            # Only write links with absolute (external) urls.
            if self.opts.keep_links and 'href' in attribs and '://' in attribs[
                    'href']:
                title = ''
                if 'title' in attribs:
                    title = ' "' + attribs['title'] + '"'
                    # remove_newlines may change this flag; it is restored
                    # because the title is not part of the running text.
                    remove_space = self.remove_space_after_newline
                    title = self.remove_newlines(title)
                    self.remove_space_after_newline = remove_space
                text.append('[')
                tags.append('](' + attribs['href'] + title + ')')
        elif tag == 'img':
            # An <img> without a src has nothing to reference; skipping it
            # avoids a KeyError on such markup.
            if self.opts.keep_image_references and 'src' in attribs:
                txt = '!'
                if 'alt' in attribs:
                    remove_space = self.remove_space_after_newline
                    txt += '[' + self.remove_newlines(attribs['alt']) + ']'
                    self.remove_space_after_newline = remove_space
                txt += '(' + attribs['src'] + ')'
                text.append(txt)
        elif tag in ('ol', 'ul'):
            tags.append(tag)
            # Add the list to our lists of lists so we can track
            # nested lists.
            self.list.append({'name': tag, 'num': 0})
        elif tag == 'li':
            # Get the last list from our list of lists
            if self.list:
                li = self.list[-1]
            else:
                # An item outside any list is treated as an unordered item.
                li = {'name': 'ul', 'num': 0}
            # Add a new line to start the item
            text.append('\n')
            # Add indent if we have nested lists.
            list_count = len(self.list)
            # We only care about indenting nested lists.
            if (list_count - 1) > 0:
                text.append('\t' * (list_count - 1))
            # Add blockquote if we have a blockquote in a list item.
            text.append(bq)
            # Write the proper sign for ordered and unordered lists.
            if li['name'] == 'ul':
                text.append('+ ')
            elif li['name'] == 'ol':
                li['num'] += 1
                text.append(unicode_type(li['num']) + '. ')

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            txt = elem.text
            if self.in_pre:
                txt = self.prepare_string_for_pre(txt)
            elif self.in_code:
                txt = self.remove_newlines(txt)
            else:
                txt = self.prepare_string_for_markdown(
                    self.remove_newlines(txt))
            text.append(txt)

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            if t in ('pre', 'ul', 'ol', '>'):
                # Structural markers only reset state; they are never written
                # out themselves.
                if t == 'pre':
                    self.in_pre = False
                    text.append('\n')
                elif t == '>':
                    self.blockquotes -= 1
                elif t in ('ul', 'ol'):
                    if self.list:
                        self.list.pop()
                    text.append('\n')
            else:
                if t == '**':
                    self.style_bold = False
                elif t == '*':
                    self.style_italic = False
                elif t == '`':
                    self.in_code = False
                text.append('%s' % t)

        # Soft scene breaks.
        if 'margin-bottom' in style.cssdict(
        ) and style['margin-bottom'] != 'auto':
            ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
            if ems >= 1:
                text.append(u'\n\n' * ems)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            tail = elem.tail
            if self.in_pre:
                tail = self.prepare_string_for_pre(tail)
            elif self.in_code:
                tail = self.remove_newlines(tail)
            else:
                tail = self.prepare_string_for_markdown(
                    self.remove_newlines(tail))
            text.append(tail)

        return text
Exemplo n.º 43
0
    def workaround_ade_quirks(self):  # {{{
        '''
        Perform various markup transforms to get the output to render correctly
        in the quirky ADE.
        '''
        from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote

        stylesheet = self.oeb.manifest.main_stylesheet

        # ADE cries big wet tears when it encounters an invalid fragment
        # identifier in the NCX toc.
        frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
        for node in self.oeb.toc.iter():
            href = getattr(node, 'href', None)
            if hasattr(href, 'partition'):
                base, _, frag = href.partition('#')
                frag = urlunquote(frag)
                if frag and frag_pat.match(frag) is None:
                    self.log.warn(
                        'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'
                        % frag)
                    node.href = base

        for x in self.oeb.spine:
            root = x.data
            body = XPath('//h:body')(root)
            if body:
                body = body[0]

            if hasattr(body, 'xpath'):
                # remove <img> tags with empty src elements
                bad = []
                for x in XPath('//h:img')(body):
                    src = x.get('src', '').strip()
                    if src in ('', '#') or src.startswith('http:'):
                        bad.append(x)
                for img in bad:
                    img.getparent().remove(img)

                # Add id attribute to <a> tags that have name
                for x in XPath('//h:a[@name]')(body):
                    if not x.get('id', False):
                        x.set('id', x.get('name'))
                    # The delightful epubcheck has started complaining about <a> tags that
                    # have name attributes.
                    x.attrib.pop('name')

                # Replace <br> that are children of <body> as ADE doesn't handle them
                for br in XPath('./h:br')(body):
                    if br.getparent() is None:
                        continue
                    try:
                        prior = next(br.itersiblings(preceding=True))
                        priortag = barename(prior.tag)
                        priortext = prior.tail
                    except:
                        priortag = 'body'
                        priortext = body.text
                    if priortext:
                        priortext = priortext.strip()
                    br.tag = XHTML('p')
                    br.text = '\u00a0'
                    style = br.get('style', '').split(';')
                    style = [_f for _f in [x.strip() for x in style] if _f]
                    style.append('margin:0pt; border:0pt')
                    # If the prior tag is a block (including a <br> we replaced)
                    # then this <br> replacement should have a 1-line height.
                    # Otherwise it should have no height.
                    if not priortext and priortag in block_level_tags:
                        style.append('height:1em')
                    else:
                        style.append('height:0pt')
                    br.set('style', '; '.join(style))

            for tag in XPath('//h:embed')(root):
                tag.getparent().remove(tag)
            for tag in XPath('//h:object')(root):
                if tag.get('type', '').lower().strip() in {
                        'image/svg+xml', 'application/svg+xml'
                }:
                    continue
                tag.getparent().remove(tag)

            for tag in XPath('//h:title|//h:style')(root):
                if not tag.text:
                    tag.getparent().remove(tag)
            for tag in XPath('//h:script')(root):
                if (not tag.text and not tag.get('src', False)
                        and tag.get('type', None) != 'text/x-mathjax-config'):
                    tag.getparent().remove(tag)
            for tag in XPath('//h:body/descendant::h:script')(root):
                tag.getparent().remove(tag)

            formchildren = XPath('./h:input|./h:button|./h:textarea|'
                                 './h:label|./h:fieldset|./h:legend')
            for tag in XPath('//h:form')(root):
                if formchildren(tag):
                    tag.getparent().remove(tag)
                else:
                    # Not a real form
                    tag.tag = XHTML('div')

            for tag in XPath('//h:center')(root):
                tag.tag = XHTML('div')
                tag.set('style', 'text-align:center')
            # ADE can't handle &amp; in an img url
            for tag in XPath('//h:img[@src]')(root):
                tag.set('src', tag.get('src', '').replace('&', ''))

            # ADE whimpers in fright when it encounters a <td> outside a
            # <table>
            in_table = XPath('ancestor::h:table')
            for tag in XPath('//h:td|//h:tr|//h:th')(root):
                if not in_table(tag):
                    tag.tag = XHTML('div')

            # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
            special_chars = re.compile('[\u200b\u00ad]')
            for elem in root.iterdescendants('*'):
                if elem.text:
                    elem.text = special_chars.sub('', elem.text)
                    elem.text = elem.text.replace('\u2011', '-')
                if elem.tail:
                    elem.tail = special_chars.sub('', elem.tail)
                    elem.tail = elem.tail.replace('\u2011', '-')

            if stylesheet is not None:
                # ADE doesn't render lists correctly if they have left margins
                from cssutils.css import CSSRule
                for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
                    sel = '.' + lb.get('class')
                    for rule in stylesheet.data.cssRules.rulesOfType(
                            CSSRule.STYLE_RULE):
                        if sel == rule.selectorList.selectorText:
                            rule.style.removeProperty('margin-left')
                            # padding-left breaks rendering in webkit and gecko
                            rule.style.removeProperty('padding-left')
                # Change whitespace:pre to pre-wrap to accommodate readers that
                # cannot scroll horizontally
                for rule in stylesheet.data.cssRules.rulesOfType(
                        CSSRule.STYLE_RULE):
                    style = rule.style
                    ws = style.getPropertyValue('white-space')
                    if ws == 'pre':
                        style.setProperty('white-space', 'pre-wrap')
Exemplo n.º 44
0
    def process_tag(self,
                    html_tag,
                    stylizer,
                    is_first_tag=False,
                    float_spec=None):
        """Convert ``html_tag``, its children and its tail text.

        The tag is dispatched on its CSS ``display`` value to
        ``add_inline_tag``, ``add_block_tag`` or the table helpers on
        ``self.blocks``; every child element is then processed recursively
        and finally the text following the tag (its tail) is added.

        :param html_tag: element to convert.
        :param stylizer: object whose ``style(elem)`` gives the style of an
            element.
        :param is_first_tag: when true the tag is never treated as floating
            and its tail text is not emitted.
        :param float_spec: float specification handed down from the caller,
            or ``None``. One is created here when this tag floats and none
            was supplied; it is passed on to the children.
        """
        tagname = barename(html_tag.tag)
        tag_style = stylizer.style(html_tag)
        # For these tags, and for hidden tags, the tag itself and its
        # children are skipped; the tail text is still handled at the bottom.
        ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'
                                          } or tag_style.is_hidden
        display = tag_style._get('display')
        is_block = False

        if not ignore_tag_contents:
            # Remember the current link/language so that they can be
            # restored once this tag and its children are done.
            previous_link = self.current_link
            if tagname == 'a' and html_tag.get('href'):
                self.current_link = (self.current_item, html_tag.get('href'),
                                     html_tag.get('title'))
            previous_lang = self.current_lang
            tag_lang = lang_for_tag(html_tag)
            if tag_lang:
                self.current_lang = tag_lang

            is_float = tag_style['float'] in {'left', 'right'
                                              } and not is_first_tag
            # Only create a float spec when the caller did not pass one.
            if float_spec is None and is_float:
                float_spec = FloatSpec(self.docx.namespace, html_tag,
                                       tag_style)

            if display in {
                    'inline', 'inline-block'
            } or tagname == 'br':  # <br> has display:block but we don't want to start a new paragraph
                if is_float and float_spec.is_dropcaps:
                    self.add_block_tag(tagname,
                                       html_tag,
                                       tag_style,
                                       stylizer,
                                       float_spec=float_spec)
                    # The float spec is dropped after the drop-caps block, so
                    # the children below do not receive it.
                    float_spec = None
                else:
                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
            elif display == 'list-item':
                self.add_block_tag(tagname,
                                   html_tag,
                                   tag_style,
                                   stylizer,
                                   is_list_item=True)
            elif display.startswith('table') or display == 'inline-table':
                # Only cells, rows and tables are acted upon; any other
                # table-* display value falls through without adding a block.
                if display == 'table-cell':
                    self.blocks.start_new_cell(html_tag, tag_style)
                    self.add_block_tag(tagname,
                                       html_tag,
                                       tag_style,
                                       stylizer,
                                       is_table_cell=True)
                elif display == 'table-row':
                    self.blocks.start_new_row(html_tag, tag_style)
                elif display in {'table', 'inline-table'}:
                    self.blocks.end_current_block()
                    self.blocks.start_new_table(html_tag, tag_style)
            else:
                if tagname == 'img' and is_float:
                    # Image is floating so don't start a new paragraph for it
                    self.add_inline_tag(tagname, html_tag, tag_style, stylizer)
                else:
                    if tagname == 'hr':
                        # Turn off every border of the <hr> except the top one.
                        for edge in 'right bottom left'.split():
                            tag_style.set('border-%s-style' % edge, 'none')
                    self.add_block_tag(tagname,
                                       html_tag,
                                       tag_style,
                                       stylizer,
                                       float_spec=float_spec)

            for child in html_tag.iterchildren():
                # Real elements have a string tag; anything else is handled
                # in the else branch, where only the tail text is kept.
                if isinstance(getattr(child, 'tag', None), string_or_bytes):
                    self.process_tag(child, stylizer, float_spec=float_spec)
                else:  # Comment/PI/etc.
                    tail = getattr(child, 'tail', None)
                    if tail:
                        block = self.create_block_from_parent(
                            html_tag, stylizer)
                        block.add_text(tail,
                                       tag_style,
                                       is_parent_style=False,
                                       link=self.current_link,
                                       lang=self.current_lang)

            # Must be read before finish_tag() is called on this tag.
            is_block = html_tag in self.blocks.open_html_blocks
            self.blocks.finish_tag(html_tag)
            if is_block and tag_style['page-break-after'] == 'avoid':
                self.blocks.all_blocks[-1].keep_next = True

            self.current_link = previous_link
            self.current_lang = previous_lang

        # Now, process the tail if any

        if display == 'table-row':
            return  # We ignore the tail for these tags

        ignore_whitespace_tail = is_block or display.startswith('table')
        if not is_first_tag and html_tag.tail and (
                not ignore_whitespace_tail or not html_tag.tail.isspace()):
            # Ignore trailing space after a block tag, as otherwise it will
            # become a new empty paragraph
            block = self.create_block_from_parent(html_tag, stylizer)
            # The tail belongs to the parent element, so it is styled with
            # the parent's style rather than this tag's.
            block.add_text(html_tag.tail,
                           stylizer.style(html_tag.getparent()),
                           is_parent_style=True,
                           link=self.current_link,
                           lang=self.current_lang)
Exemplo n.º 45
0
    def dump_text(self, elem, stylizer, page, tag_stack=None):
        '''
        Recursively convert ``elem`` and its descendants to PML markup.

        self.image_hrefs and self.link_hrefs are filled in as images and
        internal links are encountered.

        @param elem: etree element to convert.
        @param stylizer: object whose style(elem) gives the style of an element.
        @param page: OEB page; supplies href and abshref() for resolving URLs.
        @param tag_stack: list of PML tags already opened by the callers.
            Defaults to an empty list; it is never modified here.

        @return: list of strings making up the PML markup for ``elem``,
            including the text that follows it (its tail).
        '''
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

        if tag_stack is None:
            tag_stack = []

        if not isinstance(elem.tag, string_or_bytes) or namespace(elem.tag) != XHTML_NS:
            # Not an XHTML element: only the text after it can be kept, and
            # only when its parent is an XHTML element.
            # NOTE(review): this tail is returned without going through
            # prepare_string_for_pml(), unlike the tail handling at the end
            # of this method - confirm that is intended.
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return []

        text = []
        tags = []
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            # Hidden element: drop it and its children, keep the tail.
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return []

        tag = barename(elem.tag)

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            tags.append('block')

        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument.
        if tag in IMAGE_TAGS:
            src = elem.attrib.get('src', None)
            if src:
                img_href = page.abshref(src)
                if img_href not in self.image_hrefs:
                    if not self.image_hrefs:
                        # The first image encountered is named cover.png.
                        self.image_hrefs[img_href] = 'cover.png'
                    else:
                        self.image_hrefs[img_href] = image_name(
                            '%s.png' % len(self.image_hrefs), self.image_hrefs.keys()).strip('\x00')
                text.append('\\m="%s"' % self.image_hrefs[img_href])
        elif tag == 'hr':
            w = r'\w'
            width = elem.get('width')
            if width:
                if not width.endswith('%'):
                    width += '%'
                w += '="%s"' % width
            else:
                w += '="50%"'
            text.append(w)
        elif tag == 'br':
            text.append('\n\\c \n\\c\n')

        # TOC markers.
        toc_name = elem.attrib.get('name', None)
        toc_id = elem.attrib.get('id', None)
        open_tags = tag_stack + tags
        in_heading = any(
            marker in open_tags for marker in ('x', 'X0', 'X1', 'X2', 'X3', 'X4'))
        # Only write the TOC marker if the tag isn't a heading and we aren't in one.
        if (toc_id or toc_name) and tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6') \
                and not in_heading:
            toc_page = page.href
            if self.toc.get(toc_page, None):
                for toc_x in (toc_name, toc_id):
                    toc_title, toc_depth = self.toc[toc_page].get(toc_x, (None, 0))
                    if toc_title:
                        # Clamp the depth to the 0..4 range.
                        toc_depth = max(min(toc_depth, 4), 0)
                        text.append(fr'\C{toc_depth}="{toc_title}"')

        # Page break before the element.
        # NOTE(review): an earlier comment said this was commented out because
        # every page in an OEB book starts with this style, but the check is
        # live - confirm the intent.
        if style['page-break-before'] == 'always':
            text.append(r'\p')

        # Process basic PML tags.
        pml_tag = TAG_MAP.get(tag, None)
        if pml_tag and pml_tag not in tag_stack + tags:
            text.append(r'\%s' % pml_tag)
            tags.append(pml_tag)

        # Special processing of tags that require an argument.
        # Anchors links
        if tag in LINK_TAGS and 'q' not in tag_stack + tags:
            href = elem.get('href')
            if href:
                href = page.abshref(href)
                # Only links without a URL scheme are turned into PML links.
                if '://' not in href:
                    if '#' not in href:
                        href += '#'
                    if href not in self.link_hrefs:
                        self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs)
                    href = '#%s' % self.link_hrefs[href]
                    text.append(r'\q="%s"' % href)
                    tags.append('q')

        # Anchor ids
        for name_x in (elem.get('id'), elem.get('name')):
            if name_x:
                text.append(self.get_anchor(page, name_x))

        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag and style_tag not in tag_stack + tags:
                text.append(r'\%s' % style_tag)
                tags.append(style_tag)

        # margin left; best effort, a value that cannot be computed is skipped.
        try:
            mms = int(float(style['margin-left']) * 100 / style.height)
            if mms:
                text.append(r'\T="%s%%"' % mms)
        except Exception:
            pass

        # Soft scene breaks; best effort as above.
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems >= 1:
                text.append('\n\\c \n\\c\n')
        except Exception:
            pass

        # Process text within this tag.
        if hasattr(elem, 'text') and elem.text:
            text.append(self.prepare_string_for_pml(elem.text))

        # Process inner tags
        for item in elem:
            text += self.dump_text(item, stylizer, page, tag_stack + tags)

        # Close opened tags, innermost first.
        tags.reverse()
        text += self.close_tags(tags)

        if style['page-break-after'] == 'always':
            text.append(r'\p')

        # Process text after this tag but not within another.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(self.prepare_string_for_pml(elem.tail))

        return text
Exemplo n.º 46
0
 def dckey(x):
     # Sort key for metadata elements: title first, creator second and
     # every other tag after them.
     priority = {'title': 0, 'creator': 1}
     name = barename(x.tag)
     return priority.get(name, 2)
Exemplo n.º 47
0
    def dump_text(self, elem_tree, stylizer, page, tag_stack=None):
        '''
        This function is intended to be used in a recursive manner. dump_text will
        run through all elements in the elem_tree and call itself on each element.

        self.image_hrefs will be populated by calling this function.

        @param elem_tree: etree representation of XHTML content to be transformed.
        @param stylizer: Used to track the style of elements within the tree.
        @param page: OEB page used to determine absolute urls.
        @param tag_stack: List of open FB2 tags to take into account. Defaults
            to an empty list; it is never modified here.

        @return: List of string representing the XHTML converted to FB2 markup.
        '''
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace
        elem = elem_tree
        if tag_stack is None:
            tag_stack = []

        # Ensure what we are converting is not a string and that the first tag is part of the XHTML namespace.
        if not isinstance(elem_tree.tag, basestring) or namespace(elem_tree.tag) != XHTML_NS:
            # NOTE(review): this tail is returned without going through
            # prepare_string_for_xml(), unlike the tail handling at the end
            # of this method - confirm that is intended.
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return []

        style = stylizer.style(elem_tree)
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            # Hidden element: drop it and its children, keep the tail.
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return []

        # FB2 generated output.
        fb2_out = []
        # FB2 tags in the order they are opened. This will be used to close the tags.
        tags = []
        # First tag in tree
        tag = barename(elem_tree.tag)
        # Number of blank lines above tag; 0 when it cannot be computed.
        try:
            ems = max(0, int(round((float(style.marginTop) / style.fontSize) - 1)))
        except Exception:
            ems = 0

        # Convert TOC entries to <title>s and add <section>s
        if self.opts.sectionize == 'toc':
            # A section cannot be a child of any other element than another section,
            # so leave the tag alone if there are parents
            if not tag_stack:
                # There are two reasons to start a new section here: the TOC pointed to
                # this page (then we use the first non-<body> on the page as a <title>), or
                # the TOC pointed to a specific element
                newlevel = 0
                toc_entry = self.toc.get(page.href, None)
                if toc_entry is not None:
                    if None in toc_entry:
                        if tag != 'body' and hasattr(elem_tree, 'text') and elem_tree.text:
                            newlevel = 1
                            # Clear the entry so later elements of this page
                            # do not start another section.
                            self.toc[page.href] = None
                    if not newlevel and elem_tree.attrib.get('id', None) is not None:
                        newlevel = toc_entry.get(elem_tree.attrib.get('id', None), None)

                # Start a new section if necessary
                if newlevel:
                    # Close sections until the new one fits at its level.
                    while newlevel <= self.section_level:
                        fb2_out.append('</section>')
                        self.section_level -= 1
                    fb2_out.append('<section>')
                    self.section_level += 1
                    fb2_out.append('<title>')
                    tags.append('title')
            if self.section_level == 0:
                # If none of the prior processing made a section, make one now to be FB2 spec compliant
                fb2_out.append('<section>')
                self.section_level += 1

        # Process the XHTML tag and styles. Converted to an FB2 tag.
        # Use individual if statement not if else. There can be
        # only one XHTML tag but it can have multiple styles.
        if tag == 'img':
            if elem_tree.attrib.get('src', None):
                # Only write the image tag if it is in the manifest.
                ihref = urlnormalize(page.abshref(elem_tree.attrib['src']))
                if ihref in self.oeb_book.manifest.hrefs:
                    if ihref not in self.image_hrefs:
                        self.image_hrefs[ihref] = '_%s.jpg' % len(self.image_hrefs)
                    p_txt, p_tag = self.ensure_p()
                    fb2_out += p_txt
                    tags += p_tag
                    fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[ihref])
                else:
                    self.log.warn(u'Ignoring image not in manifest: %s'%ihref)
        if tag in ('br', 'hr') or ems >= 1:
            # At least one empty line, more when the top margin asks for it.
            multiplier = max(ems, 1)
            if self.in_p:
                # Close the open tags up to and including the <p>, emit the
                # empty lines, then reopen the same tags in their original
                # order.
                closed_tags = []
                for t in reversed(tag_stack+tags):
                    fb2_out.append('</%s>' % t)
                    closed_tags.append(t)
                    if t == 'p':
                        break
                fb2_out.append('<empty-line />' * multiplier)
                closed_tags.reverse()
                for t in closed_tags:
                    fb2_out.append('<%s>' % t)
            else:
                fb2_out.append('<empty-line />' * multiplier)
        if tag in ('div', 'li', 'p'):
            p_text, added_p = self.close_open_p(tag_stack+tags)
            fb2_out += p_text
            if added_p:
                tags.append('p')
        if tag == 'b' or style['font-weight'] in ('bold', 'bolder'):
            s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
            fb2_out += s_out
            tags += s_tags
        if tag == 'i' or style['font-style'] == 'italic':
            s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags)
            fb2_out += s_out
            tags += s_tags
        if tag in ('del', 'strike') or style['text-decoration'] == 'line-through':
            s_out, s_tags = self.handle_simple_tag('strikethrough', tag_stack+tags)
            fb2_out += s_out
            tags += s_tags
        if tag in ('sub', 'sup'):
            # The FB2 tag has the same name as the XHTML one.
            s_out, s_tags = self.handle_simple_tag(tag, tag_stack+tags)
            fb2_out += s_out
            tags += s_tags

        # Process element text.
        if hasattr(elem_tree, 'text') and elem_tree.text:
            if not self.in_p:
                fb2_out.append('<p>')
            fb2_out.append(prepare_string_for_xml(elem_tree.text))
            if not self.in_p:
                fb2_out.append('</p>')

        # Process sub-elements.
        for item in elem_tree:
            fb2_out += self.dump_text(item, stylizer, page, tag_stack+tags)

        # Close open FB2 tags, innermost first.
        tags.reverse()
        fb2_out += self.close_tags(tags)

        # Process element text that comes after the close of the XHTML tag but before the next XHTML tag.
        if hasattr(elem_tree, 'tail') and elem_tree.tail:
            if not self.in_p:
                fb2_out.append('<p>')
            fb2_out.append(prepare_string_for_xml(elem_tree.tail))
            if not self.in_p:
                fb2_out.append('</p>')

        return fb2_out
Exemplo n.º 48
0
    def workaround_ade_quirks(self):  # {{{
        '''
        Perform various markup transforms to get the output to render correctly
        in the quirky ADE.

        The TOC nodes, the parsed document of every spine item and the main
        stylesheet (when there is one) are modified in place. Nothing is
        returned.
        '''
        from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote

        stylesheet = self.oeb.manifest.main_stylesheet

        # ADE cries big wet tears when it encounters an invalid fragment
        # identifier in the NCX toc.
        frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
        for node in self.oeb.toc.iter():
            href = getattr(node, 'href', None)
            if hasattr(href, 'partition'):
                base, _, frag = href.partition('#')
                frag = urlunquote(frag)
                if frag and frag_pat.match(frag) is None:
                    self.log.warn(
                            'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
                    node.href = base

        # These do not depend on the spine item, so build them only once.
        formchildren = XPath('./h:input|./h:button|./h:textarea|'
                './h:label|./h:fieldset|./h:legend')
        in_table = XPath('ancestor::h:table')
        special_chars = re.compile(u'[\u200b\u00ad]')

        for item in self.oeb.spine:
            root = item.data
            body = XPath('//h:body')(root)
            if body:
                body = body[0]

            # body is an element here only when a <body> was found.
            if hasattr(body, 'xpath'):
                # remove <img> tags with empty src elements
                # NOTE(review): lxml's remove() also discards the tail text of
                # the removed element - confirm that is acceptable for the
                # removals in this method.
                bad = []
                for img in XPath('//h:img')(body):
                    src = img.get('src', '').strip()
                    if src in ('', '#') or src.startswith('http:'):
                        bad.append(img)
                for img in bad:
                    img.getparent().remove(img)

                # Add id attribute to <a> tags that have name
                for anchor in XPath('//h:a[@name]')(body):
                    if not anchor.get('id', False):
                        anchor.set('id', anchor.get('name'))
                    # The delightful epubcheck has started complaining about <a> tags that
                    # have name attributes.
                    anchor.attrib.pop('name')

                # Replace <br> that are children of <body> as ADE doesn't handle them
                for br in XPath('./h:br')(body):
                    if br.getparent() is None:
                        continue
                    try:
                        # next() works on both Python 2 and 3; it raises
                        # StopIteration when there is no preceding sibling.
                        prior = next(br.itersiblings(preceding=True))
                        priortag = barename(prior.tag)
                        priortext = prior.tail
                    except Exception:
                        # No usable preceding sibling: the <br> is preceded
                        # only by the text of <body> itself.
                        priortag = 'body'
                        priortext = body.text
                    if priortext:
                        priortext = priortext.strip()
                    br.tag = XHTML('p')
                    br.text = u'\u00a0'
                    # A real list is needed because it is appended to below.
                    style = [part.strip() for part in br.get('style', '').split(';')]
                    style = [part for part in style if part]
                    style.append('margin:0pt; border:0pt')
                    # If the prior tag is a block (including a <br> we replaced)
                    # then this <br> replacement should have a 1-line height.
                    # Otherwise it should have no height.
                    if not priortext and priortag in block_level_tags:
                        style.append('height:1em')
                    else:
                        style.append('height:0pt')
                    br.set('style', '; '.join(style))

            for tag in XPath('//h:embed')(root):
                tag.getparent().remove(tag)
            for tag in XPath('//h:object')(root):
                # SVG objects are kept, every other <object> is removed.
                if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}:
                    continue
                tag.getparent().remove(tag)

            # Remove empty <title> and <style> tags.
            for tag in XPath('//h:title|//h:style')(root):
                if not tag.text:
                    tag.getparent().remove(tag)
            # Remove <script> tags that have neither content nor src, except
            # for MathJax configuration blocks.
            for tag in XPath('//h:script')(root):
                if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'):
                    tag.getparent().remove(tag)
            # Scripts inside <body> are always removed.
            for tag in XPath('//h:body/descendant::h:script')(root):
                tag.getparent().remove(tag)

            for tag in XPath('//h:form')(root):
                if formchildren(tag):
                    tag.getparent().remove(tag)
                else:
                    # Not a real form
                    tag.tag = XHTML('div')

            for tag in XPath('//h:center')(root):
                tag.tag = XHTML('div')
                tag.set('style', 'text-align:center')
            # ADE can't handle &amp; in an img url
            for tag in XPath('//h:img[@src]')(root):
                tag.set('src', tag.get('src', '').replace('&', ''))

            # ADE whimpers in fright when it encounters a <td> outside a
            # <table>
            for tag in XPath('//h:td|//h:tr|//h:th')(root):
                if not in_table(tag):
                    tag.tag = XHTML('div')

            # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
            for elem in root.iterdescendants('*'):
                if elem.text:
                    elem.text = special_chars.sub('', elem.text)
                    elem.text = elem.text.replace(u'\u2011', '-')
                if elem.tail:
                    elem.tail = special_chars.sub('', elem.tail)
                    elem.tail = elem.tail.replace(u'\u2011', '-')

            if stylesheet is not None:
                # ADE doesn't render lists correctly if they have left margins
                from cssutils.css import CSSRule
                for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
                    sel = '.'+lb.get('class')
                    for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                        if sel == rule.selectorList.selectorText:
                            rule.style.removeProperty('margin-left')
                            # padding-left breaks rendering in webkit and gecko
                            rule.style.removeProperty('padding-left')
                # Change whitespace:pre to pre-wrap to accommodate readers that
                # cannot scroll horizontally. Repeating this for every spine
                # item is harmless: a rule already set to pre-wrap no longer
                # matches.
                for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                    style = rule.style
                    ws = style.getPropertyValue('white-space')
                    if ws == 'pre':
                        style.setProperty('white-space', 'pre-wrap')
Exemplo n.º 49
0
    def dump_text(self, elem, stylizer, page, tag_stack=[]):
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            return []

        text = []
        tags = []
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            return []

        tag = barename(elem.tag)

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            tags.append('block')

        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument.
        if tag in IMAGE_TAGS:
            if elem.attrib.get('src', None):
                if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys():
                    if len(self.image_hrefs.keys()) == 0:
                        self.image_hrefs[page.abshref(elem.attrib['src'])] = 'cover.png'
                    else:
                        self.image_hrefs[page.abshref(elem.attrib['src'])] = image_name('%s.png' % len(self.image_hrefs.keys()), self.image_hrefs.keys()).strip('\x00')
                text.append('\\m="%s"' % self.image_hrefs[page.abshref(elem.attrib['src'])])
        elif tag == 'hr':
            w = '\\w'
            width = elem.get('width')
            if width:
                if not width.endswith('%'):
                    width += '%'
                w += '="%s"' % width
            else:
                w += '="50%"'
            text.append(w)
        elif tag == 'br':
            text.append('\n\\c \n\\c\n')

        # TOC markers.
        toc_name = elem.attrib.get('name', None)
        toc_id = elem.attrib.get('id', None)
        # Only write the TOC marker if the tag isn't a heading and we aren't in one.
        if (toc_id or toc_name) and tag not in ('h1', 'h2','h3','h4','h5','h6') and \
            'x' not in tag_stack+tags and 'X0' not in tag_stack+tags and \
            'X1' not in tag_stack+tags and 'X2' not in tag_stack+tags and \
            'X3' not in tag_stack+tags and 'X4' not in tag_stack+tags:

            toc_page = page.href
            if self.toc.get(toc_page, None):
                for toc_x in (toc_name, toc_id):
                    toc_title, toc_depth = self.toc[toc_page].get(toc_x, (None, 0))
                    if toc_title:
                        toc_depth = max(min(toc_depth, 4), 0)
                        text.append('\\C%s="%s"' % (toc_depth, toc_title))

        # Process style information that needs holds a single tag.
        # Commented out because every page in an OEB book starts with this style.
        if style['page-break-before'] == 'always':
            text.append('\\p')

        # Process basic PML tags.
        pml_tag = TAG_MAP.get(tag, None)
        if pml_tag and pml_tag not in tag_stack+tags:
            text.append('\\%s' % pml_tag)
            tags.append(pml_tag)

        # Special processing of tags that require an argument.
        # Anchors links
        if tag in LINK_TAGS and 'q' not in tag_stack+tags:
            href = elem.get('href')
            if href:
                href = page.abshref(href)
                if '://' not in href:
                    if '#' not in href:
                        href += '#'
                    if href not in self.link_hrefs.keys():
                        self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys())
                    href = '#%s' % self.link_hrefs[href]
                    text.append('\\q="%s"' % href)
                    tags.append('q')

        # Anchor ids
        id_name = elem.get('id')
        name_name = elem.get('name')
        for name_x in (id_name, name_name):
            if name_x:
                text.append(self.get_anchor(page, name_x))

        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag and style_tag not in tag_stack+tags:
                text.append('\\%s' % style_tag)
                tags.append(style_tag)

        # margin left
        try:
            mms = int(float(style['margin-left']) * 100 / style.height)
            if mms:
                text.append('\\T="%s%%"' % mms)
        except:
            pass

        # Soft scene breaks.
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems >= 1:
                text.append('\n\\c \n\\c\n')
        except:
            pass

        # Proccess text within this tag.
        if hasattr(elem, 'text') and elem.text:
            text.append(self.prepare_string_for_pml(elem.text))

        # Process inner tags
        for item in elem:
            text += self.dump_text(item, stylizer, page, tag_stack+tags)

        # Close opened tags.
        tags.reverse()
        text += self.close_tags(tags)

        #if tag in SEPARATE_TAGS:
        #    text.append('\n\n')

        if style['page-break-after'] == 'always':
            text.append('\\p')

        # Process text after this tag but not within another.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(self.prepare_string_for_pml(elem.tail))

        return text
Exemplo n.º 50
0
    def workaround_ade_quirks(self):  # {{{
        """
        Perform various markup transforms to get the output to render correctly
        in the quirky ADE.

        The TOC nodes, the parsed tree of every spine item and (when present)
        the main stylesheet of ``self.oeb`` are all modified in place; nothing
        is returned.
        """
        from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote

        stylesheet = self.oeb.manifest.main_stylesheet

        # ADE cries big wet tears when it encounters an invalid fragment
        # identifier in the NCX toc.
        frag_pat = re.compile(r"[-A-Za-z0-9_:.]+$")
        for node in self.oeb.toc.iter():
            href = getattr(node, "href", None)
            if hasattr(href, "partition"):
                base, _, frag = href.partition("#")
                frag = urlunquote(frag)
                if frag and frag_pat.match(frag) is None:
                    self.log.warn("Removing invalid fragment identifier %r from TOC" % frag)
                    node.href = base

        # Loop-invariant helpers, compiled once instead of once per spine item.
        formchildren = XPath("./h:input|./h:button|./h:textarea|" "./h:label|./h:fieldset|./h:legend")
        in_table = XPath("ancestor::h:table")
        special_chars = re.compile(u"[\u200b\u00ad]")

        for item in self.oeb.spine:
            root = item.data
            body = XPath("//h:body")(root)
            if body:
                body = body[0]

            # When no <body> was found, ``body`` is still the (empty) XPath
            # result list, which has no ``xpath`` attribute, so this is skipped.
            if hasattr(body, "xpath"):
                # remove <img> tags with empty src elements
                bad = []
                for img in XPath("//h:img")(body):
                    src = img.get("src", "").strip()
                    if src in ("", "#") or src.startswith("http:"):
                        bad.append(img)
                for img in bad:
                    img.getparent().remove(img)

                # Add id attribute to <a> tags that have name
                for anchor in XPath("//h:a[@name]")(body):
                    if not anchor.get("id", False):
                        anchor.set("id", anchor.get("name"))
                    # The delightful epubcheck has started complaining about <a> tags that
                    # have name attributes.
                    anchor.attrib.pop("name")

                # Replace <br> that are children of <body> as ADE doesn't handle them
                for br in XPath("./h:br")(body):
                    if br.getparent() is None:
                        continue
                    try:
                        # The next() builtin works on Python 2 and 3 alike; the
                        # iterator's .next() method only exists on Python 2.
                        prior = next(br.itersiblings(preceding=True))
                        priortag = barename(prior.tag)
                        priortext = prior.tail
                    except Exception:
                        # No preceding sibling (StopIteration), or a sibling
                        # whose tag cannot be handled: fall back to the body.
                        priortag = "body"
                        priortext = body.text
                    if priortext:
                        priortext = priortext.strip()
                    br.tag = XHTML("p")
                    br.text = u"\u00a0"
                    # Build a real list: filter() returns an iterator on
                    # Python 3, which has no append().
                    style = [
                        part for part in
                        (chunk.strip() for chunk in br.get("style", "").split(";"))
                        if part
                    ]
                    style.append("margin:0pt; border:0pt")
                    # If the prior tag is a block (including a <br> we replaced)
                    # then this <br> replacement should have a 1-line height.
                    # Otherwise it should have no height.
                    if not priortext and priortag in block_level_tags:
                        style.append("height:1em")
                    else:
                        style.append("height:0pt")
                    br.set("style", "; ".join(style))

            for tag in XPath("//h:embed")(root):
                tag.getparent().remove(tag)
            for tag in XPath("//h:object")(root):
                # SVG objects are kept; every other <object> is dropped.
                if tag.get("type", "").lower().strip() in {"image/svg+xml", "application/svg+xml"}:
                    continue
                tag.getparent().remove(tag)

            # Drop empty <title>/<style> elements, and <script> elements that
            # carry neither inline code nor a src (MathJax config excepted).
            for tag in XPath("//h:title|//h:style")(root):
                if not tag.text:
                    tag.getparent().remove(tag)
            for tag in XPath("//h:script")(root):
                if not tag.text and not tag.get("src", False) and tag.get("type", None) != "text/x-mathjax-config":
                    tag.getparent().remove(tag)
            for tag in XPath("//h:body/descendant::h:script")(root):
                tag.getparent().remove(tag)

            for tag in XPath("//h:form")(root):
                if formchildren(tag):
                    tag.getparent().remove(tag)
                else:
                    # Not a real form
                    tag.tag = XHTML("div")

            for tag in XPath("//h:center")(root):
                tag.tag = XHTML("div")
                tag.set("style", "text-align:center")
            # ADE can't handle &amp; in an img url
            for tag in XPath("//h:img[@src]")(root):
                tag.set("src", tag.get("src", "").replace("&", ""))

            # ADE whimpers in fright when it encounters a <td> outside a
            # <table>
            for tag in XPath("//h:td|//h:tr|//h:th")(root):
                if not in_table(tag):
                    tag.tag = XHTML("div")

            # Strip zero-width spaces and soft hyphens, and replace the
            # non-breaking hyphen with a plain one, in all text and tails.
            for elem in root.iterdescendants():
                if getattr(elem, "text", False):
                    elem.text = special_chars.sub("", elem.text)
                    elem.text = elem.text.replace(u"\u2011", "-")
                if getattr(elem, "tail", False):
                    elem.tail = special_chars.sub("", elem.tail)
                    elem.tail = elem.tail.replace(u"\u2011", "-")

            if stylesheet is not None:
                # ADE doesn't render lists correctly if they have left margins
                from cssutils.css import CSSRule

                for lb in XPath("//h:ul[@class]|//h:ol[@class]")(root):
                    sel = "." + lb.get("class")
                    for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                        if sel == rule.selectorList.selectorText:
                            rule.style.removeProperty("margin-left")
                            # padding-left breaks rendering in webkit and gecko
                            rule.style.removeProperty("padding-left")
                # Change whitespace:pre to pre-wrap to accommodate readers that
                # cannot scroll horizontally
                for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                    style = rule.style
                    ws = style.getPropertyValue("white-space")
                    if ws == "pre":
                        style.setProperty("white-space", "pre-wrap")
Exemplo n.º 51
0
    def dump_text(self, elem, stylizer, page):
        '''
        Serialize elem, and recursively its children, as HTML markup.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: Handed on unchanged to the recursive calls.

        Returns a list of string fragments. The class and style attributes
        are deleted from elem.attrib as a side effect; the style computed by
        the stylizer is written out as an inline style attribute instead.
        '''

        def in_xhtml(node):
            # True only for real tags (not comments etc.) in the XHTML namespace.
            return isinstance(node.tag, string_or_bytes) \
                and namespace(node.tag) == XHTML_NS

        # Anything that is not an XHTML tag contributes at most its tail text,
        # and only when it sits directly inside an XHTML element.
        if not in_xhtml(elem):
            parent = elem.getparent()
            if parent is not None and in_xhtml(parent) and elem.tail:
                return [elem.tail]
            return ['']

        style = stylizer.style(elem)
        name = barename(elem.tag)

        css = ('%s' % style) or ''
        if name == 'body':
            # Bodies become divs so that several files can be merged. Renderers
            # typically treat a new file as a page break, so force one here and
            # discard any other page-break-* declarations.
            name = 'div'
            css = 'page-break-before: always; %s' % re.sub('page-break-[^:]+:[^;]+;?', '', css)
        # Collapse runs of whitespace.
        css = re.sub(r'\s{2,}', ' ', css).strip()

        # The class and style attributes are superseded by the computed style.
        attributes = elem.attrib
        for unwanted in ('class', 'style'):
            if unwanted in attributes:
                del attributes[unwanted]

        # Render the remaining attributes and the computed style.
        attr_markup = ''.join(
            ' %s="%s"' % (key, prepare_string_for_xml(value, attribute=True))
            for key, value in attributes.items())
        style_markup = ' style="%s"' % css.replace('"', "'") if css else ''

        # Opening tag.
        is_void = name in SELF_CLOSING_TAGS
        out = [
            '',
            '<%s%s%s' % (name, attr_markup, style_markup),
            ' />' if is_void else '>',
        ]

        # Text directly inside the tag.
        if getattr(elem, 'text', None):
            out.append(self.prepare_string_for_html(elem.text))

        # Children.
        for child in elem:
            out.extend(self.dump_text(child, stylizer, page))

        # Closing tag, unless the tag was written as self-closing.
        if not is_void:
            out.append('</%s>' % name)

        # Text that follows the tag.
        if getattr(elem, 'tail', None):
            out.append(self.prepare_string_for_html(elem.tail))

        return out
Exemplo n.º 52
0
    def dump_text(self, elem_tree, stylizer, page, tag_stack=None):
        '''
        This function is intended to be used in a recursive manner. dump_text will
        run through all elements in the elem_tree and call itself on each element.

        self.image_hrefs will be populated by calling this function. When
        self.opts.sectionize is 'toc', self.section_level and self.toc are
        updated as well.

        @param elem_tree: etree representation of XHTML content to be transformed.
        @param stylizer: Used to track the style of elements within the tree.
        @param page: OEB page used to determine absolute urls.
        @param tag_stack: List of open FB2 tags to take into account. Defaults
            to no open tags. The list is never modified.

        @return: List of string representing the XHTML converted to FB2 markup.
        '''
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace
        # A fresh list per call: a literal [] default would be a single list
        # object shared by every call that omits the argument.
        if tag_stack is None:
            tag_stack = []
        elem = elem_tree

        # Ensure what we are converting is not a string and that the first tag is part of the XHTML namespace.
        if not isinstance(elem_tree.tag, string_or_bytes) or namespace(
                elem_tree.tag) != XHTML_NS:
            p = elem.getparent()
            # NOTE(review): this tail is returned without prepare_string_for_xml(),
            # unlike the tail handling at the end of this method - confirm
            # whether that is intended.
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return []

        # Elements that are not displayed contribute only their tail text.
        style = stylizer.style(elem_tree)
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return []

        # FB2 generated output.
        fb2_out = []
        # FB2 tags in the order they are opened. This will be used to close the tags.
        tags = []
        # First tag in tree
        tag = barename(elem_tree.tag)
        # Number of blank lines above tag
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems < 0:
                ems = 0
        except Exception:
            # Best effort: a margin that cannot be computed means no blank lines.
            ems = 0

        # Convert TOC entries to <title>s and add <section>s
        if self.opts.sectionize == 'toc':
            # A section cannot be a child of any other element than another section,
            # so leave the tag alone if there are parents
            if not tag_stack:
                # There are two reasons to start a new section here: the TOC pointed to
                # this page (then we use the first non-<body> on the page as a <title>), or
                # the TOC pointed to a specific element
                newlevel = 0
                toc_entry = self.toc.get(page.href, None)
                if toc_entry is not None:
                    if None in toc_entry:
                        if tag != 'body' and hasattr(
                                elem_tree, 'text') and elem_tree.text:
                            newlevel = 1
                            # Clear the page's entry so later elements of this
                            # page find no TOC entry.
                            self.toc[page.href] = None
                    if not newlevel and elem_tree.attrib.get('id',
                                                             None) is not None:
                        newlevel = toc_entry.get(
                            elem_tree.attrib.get('id', None), None)

                # Start a new section if necessary
                if newlevel:
                    # Close open sections until the new one fits at its level.
                    while newlevel <= self.section_level:
                        fb2_out.append('</section>')
                        self.section_level -= 1
                    fb2_out.append('<section>')
                    self.section_level += 1
                    fb2_out.append('<title>')
                    tags.append('title')
            if self.section_level == 0:
                # If none of the prior processing made a section, make one now to be FB2 spec compliant
                fb2_out.append('<section>')
                self.section_level += 1

        # Process the XHTML tag and styles. Converted to an FB2 tag.
        # Use individual if statement not if else. There can be
        # only one XHTML tag but it can have multiple styles.
        if tag == 'img' and elem_tree.attrib.get('src', None):
            # Only write the image tag if it is in the manifest.
            ihref = urlnormalize(page.abshref(elem_tree.attrib['src']))
            if ihref in self.oeb_book.manifest.hrefs:
                if ihref not in self.image_hrefs:
                    self.image_hrefs[ihref] = 'img_%s' % len(self.image_hrefs)
                p_txt, p_tag = self.ensure_p()
                fb2_out += p_txt
                tags += p_tag
                fb2_out.append('<image l:href="#%s"/>' %
                               self.image_hrefs[ihref])
            else:
                self.log.warn(u'Ignoring image not in manifest: %s' % ihref)
        if tag in ('br', 'hr') or ems >= 1:
            # A <br>/<hr> counts as one empty line even without a top margin.
            multiplier = ems if ems >= 1 else 1
            if self.in_p:
                # Close every open tag up to and including the enclosing <p>,
                # emit the empty lines, then reopen the same tags in their
                # original order.
                closed_tags = []
                open_tags = tag_stack + tags
                open_tags.reverse()
                for t in open_tags:
                    fb2_out.append('</%s>' % t)
                    closed_tags.append(t)
                    if t == 'p':
                        break
                fb2_out.append('<empty-line/>' * multiplier)
                closed_tags.reverse()
                for t in closed_tags:
                    fb2_out.append('<%s>' % t)
            else:
                fb2_out.append('<empty-line/>' * multiplier)
        if tag in ('div', 'li', 'p'):
            p_text, added_p = self.close_open_p(tag_stack + tags)
            fb2_out += p_text
            if added_p:
                tags.append('p')
        if tag == 'a' and elem_tree.attrib.get('href', None):
            # Handle only external links for now
            if urlparse(elem_tree.attrib['href']).netloc:
                p_txt, p_tag = self.ensure_p()
                fb2_out += p_txt
                tags += p_tag
                fb2_out.append('<a l:href="%s">' %
                               urlnormalize(elem_tree.attrib['href']))
                tags.append('a')
        if tag == 'b' or style['font-weight'] in ('bold', 'bolder'):
            s_out, s_tags = self.handle_simple_tag('strong', tag_stack + tags)
            fb2_out += s_out
            tags += s_tags
        if tag == 'i' or style['font-style'] == 'italic':
            s_out, s_tags = self.handle_simple_tag('emphasis',
                                                   tag_stack + tags)
            fb2_out += s_out
            tags += s_tags
        if tag in ('del',
                   'strike') or style['text-decoration'] == 'line-through':
            s_out, s_tags = self.handle_simple_tag('strikethrough',
                                                   tag_stack + tags)
            fb2_out += s_out
            tags += s_tags
        if tag == 'sub':
            s_out, s_tags = self.handle_simple_tag('sub', tag_stack + tags)
            fb2_out += s_out
            tags += s_tags
        if tag == 'sup':
            s_out, s_tags = self.handle_simple_tag('sup', tag_stack + tags)
            fb2_out += s_out
            tags += s_tags

        # Process element text.
        if hasattr(elem_tree, 'text') and elem_tree.text:
            # Text outside any paragraph gets a <p> of its own.
            if not self.in_p:
                fb2_out.append('<p>')
            fb2_out.append(prepare_string_for_xml(elem_tree.text))
            if not self.in_p:
                fb2_out.append('</p>')

        # Process sub-elements.
        for item in elem_tree:
            fb2_out += self.dump_text(item, stylizer, page, tag_stack + tags)

        # Close open FB2 tags.
        tags.reverse()
        fb2_out += self.close_tags(tags)

        # Process element text that comes after the close of the XHTML tag but before the next XHTML tag.
        if hasattr(elem_tree, 'tail') and elem_tree.tail:
            if not self.in_p:
                fb2_out.append('<p>')
            fb2_out.append(prepare_string_for_xml(elem_tree.tail))
            if not self.in_p:
                fb2_out.append('</p>')

        return fb2_out