Python prepare_string_for_xml示例，ebook_converter.prepare_string_for_xml Python示例

示例#1

0

显示文件

    def extract_content(self, output_dir):
        txt = ''

        self.log.info(u'Decompressing text...')
        for i in range(1, self.header_record.num_records + 1):
            self.log.debug(u'\tDecompressing text section %i' % i)
            title = self.header_record.chapter_titles[i - 1]
            lines = []
            title_added = False
            for line in self.decompress_text(i).splitlines():
                line = fix_punct(line)
                line = line.strip()
                if not title_added and title in line:
                    line = '<h1 class="chapter">' + line + '</h1>\n'
                    title_added = True
                else:
                    line = prepare_string_for_xml(line)
                lines.append('<p>%s</p>' % line)
            if not title_added:
                lines.insert(0, '<h1 class="chapter">' + title + '</h1>\n')
            txt += '\n'.join(lines)

        self.log.info(u'Converting text to OEB...')
        html = HTML_TEMPLATE % (self.header_record.title, txt)
        with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
            index.write(html.encode('utf-8'))

        mi = self.get_metadata()
        manifest = [('index.html', None)]
        spine = ['index.html']
        opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)

        return os.path.join(output_dir, 'metadata.opf')

示例#2

0

显示文件

文件： oeb2html.py 项目： keshavbhatt/ebook-converter

    def dump_text(self, elem, stylizer, page):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        '''

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, (str, bytes)) \
           or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS):
            p = elem.getparent()
            if p is not None and isinstance(p.tag, (str, bytes)) and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = ['']
        tags = []
        tag = parse_utils.barename(elem.tag)
        attribs = elem.attrib

        if tag == 'body':
            tag = 'div'
        tags.append(tag)

        # Remove attributes we won't want.
        if 'style' in attribs:
            del attribs['style']

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''
        for k, v in attribs.items():
            at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))

        # Write the tag.
        text.append('<%s%s' % (tag, at))
        if tag in SELF_CLOSING_TAGS:
            text.append(' />')
        else:
            text.append('>')

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            if t not in SELF_CLOSING_TAGS:
                text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text

示例#3

0

显示文件

文件： oeb2html.py 项目： keshavbhatt/ebook-converter

 def prepare_string_for_html(self, raw):
     raw = prepare_string_for_xml(raw)
     raw = raw.replace(u'\u00ad', '&shy;')
     raw = raw.replace(u'\u2014', '&mdash;')
     raw = raw.replace(u'\u2013', '&ndash;')
     raw = raw.replace(u'\u00a0', '&nbsp;')
     return raw

示例#4

0

显示文件

文件： oeb2html.py 项目： keshavbhatt/ebook-converter

 def mlize_spine(self, oeb_book):
     output = [
         u'<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" /><title>%s</title></head><body>'
         % (prepare_string_for_xml(self.book_title))
     ]
     for item in oeb_book.spine:
         self.log.debug('Converting %s to HTML...' % item.href)
         self.rewrite_ids(item.data, item)
         base.rewrite_links(item.data, partial(self.rewrite_link,
                                               page=item))
         stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
         output += self.dump_text(item.data.find(base.tag('xhtml', 'body')),
                                  stylizer, item)
         output.append('\n\n')
     output.append('</body></html>')
     return ''.join(output)

示例#5

0

显示文件

文件： oeb2html.py 项目： keshavbhatt/ebook-converter

 def mlize_spine(self, oeb_book):
     output = []
     for item in oeb_book.spine:
         self.log.debug('Converting %s to HTML...' % item.href)
         self.rewrite_ids(item.data, item)
         base.rewrite_links(item.data, partial(self.rewrite_link,
                                               page=item))
         stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
         output += self.dump_text(item.data.find(base.tag('xhtml', 'body')),
                                  stylizer, item)
         output.append('\n\n')
     if self.opts.htmlz_class_style == 'external':
         css = u'<link href="style.css" rel="stylesheet" type="text/css" />'
     else:
         css = u'<style type="text/css">' + self.get_css(
             oeb_book) + u'</style>'
     title = u'<title>%s</title>' % prepare_string_for_xml(self.book_title)
     output = [u'<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" />'] + \
         [css] + [title, u'</head><body>'] + output + [u'</body></html>']
     return ''.join(output)

示例#6

0

显示文件

def convert_basic(txt, title='', epub_split_size_kb=0):
    '''
    Converts plain text to html by putting all paragraphs in
    <p> tags. It condense and retains blank lines when necessary.

    Requires paragraphs to be in single line format.
    '''
    txt = clean_txt(txt)
    txt = split_txt(txt, epub_split_size_kb)

    lines = []
    blank_count = 0
    # Split into paragraphs based on having a blank line between text.
    for line in txt.split('\n'):
        if line.strip():
            blank_count = 0
            lines.append(u'<p>%s</p>' % prepare_string_for_xml(line.replace('\n', ' ')))
        else:
            blank_count += 1
            if blank_count == 2:
                lines.append(u'<p>&nbsp;</p>')

    return HTML_TEMPLATE % (title, u'\n'.join(lines))

示例#7

0

显示文件

文件： objects.py 项目： keshavbhatt/ebook-converter

    def __str__(self):
        s = ''
        open_containers = collections.deque()
        for c in self.content:
            if isinstance(c, str):
                s += prepare_string_for_xml(c).replace('\0', '')
            elif c is None:
                if open_containers:
                    p = open_containers.pop()
                    s += '</%s>' % (p.name, )
            else:
                s += str(c)
                if not c.self_closing:
                    open_containers.append(c)

        if len(open_containers) > 0:
            if len(open_containers) == 1:
                s += '</%s>' % (open_containers[0].name, )
            else:
                raise LRFParseError('Malformed text stream %s' % ([
                    i.name
                    for i in open_containers if isinstance(i, Text.TextTag)
                ], ))
        return s

示例#8

0

显示文件

文件： fb2ml.py 项目： keshavbhatt/ebook-converter

    def dump_text(self, elem_tree, stylizer, page, tag_stack=[]):
        """
        This function is intended to be used in a recursive manner. dump_text
        will run though all elements in the elem_tree and call itself on each
        element.

        self.image_hrefs will be populated by calling this function.

        @param elem_tree: etree representation of XHTML content to be
            transformed.
        @param stylizer: Used to track the style of elements within the tree.
        @param page: OEB page used to determine absolute urls.
        @param tag_stack: List of open FB2 tags to take into account.

        @return: List of string representing the XHTML converted to FB2 markup.
        """
        elem = elem_tree

        # Ensure what we are converting is not a string and that the fist tag
        # is part of the XHTML namespace.
        if (not isinstance(elem_tree.tag, (str, bytes))
                or parse_utils.namespace(elem_tree.tag) != const.XHTML_NS):
            p = elem.getparent()
            if (p is not None and isinstance(p.tag, (str, bytes))
                    and parse_utils.namespace(p.tag) == const.XHTML_NS
                    and elem.tail):
                return [elem.tail]
            return []

        style = stylizer.style(elem_tree)
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return []

        # FB2 generated output.
        fb2_out = []
        # FB2 tags in the order they are opened. This will be used to close
        # the tags.
        tags = []
        # First tag in tree
        tag = parse_utils.barename(elem_tree.tag)
        # Number of blank lines above tag
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems < 0:
                ems = 0
        except Exception:
            ems = 0

        # Convert TOC entries to <title>s and add <section>s
        if self.opts.sectionize == 'toc':
            # A section cannot be a child of any other element than another
            # section, so leave the tag alone if there are parents
            if not tag_stack:
                # There are two reasons to start a new section here: the TOC
                # pointed to this page (then we use the first non-<body> on
                # the page as a <title>), or the TOC pointed to a specific
                # element
                newlevel = 0
                toc_entry = self.toc.get(page.href, None)
                if toc_entry is not None:
                    if None in toc_entry:
                        if (tag != 'body' and hasattr(elem_tree, 'text')
                                and elem_tree.text):
                            newlevel = 1
                            self.toc[page.href] = None
                    if (not newlevel
                            and elem_tree.attrib.get('id', None) is not None):
                        newlevel = toc_entry.get(
                            elem_tree.attrib.get('id', None), None)

                # Start a new section if necessary
                if newlevel:
                    while newlevel <= self.section_level:
                        fb2_out.append('</section>')
                        self.section_level -= 1
                    fb2_out.append('<section>')
                    self.section_level += 1
                    fb2_out.append('<title>')
                    tags.append('title')
            if self.section_level == 0:
                # If none of the prior processing made a section, make one now
                # to be FB2 spec compliant
                fb2_out.append('<section>')
                self.section_level += 1

        # Process the XHTML tag and styles. Converted to an FB2 tag.
        # Use individual if statement not if else. There can be only one XHTML
        # tag but it can have multiple styles.
        if tag == 'img' and elem_tree.attrib.get('src', None):
            # Only write the image tag if it is in the manifest.
            ihref = base.urlnormalize(page.abshref(elem_tree.attrib['src']))
            if ihref in self.oeb_book.manifest.hrefs:
                if ihref not in self.image_hrefs:
                    self.image_hrefs[ihref] = 'img_%s' % len(self.image_hrefs)
                p_txt, p_tag = self.ensure_p()
                fb2_out += p_txt
                tags += p_tag
                fb2_out.append('<image l:href="#%s"/>' %
                               self.image_hrefs[ihref])
            else:
                self.log.warn(u'Ignoring image not in manifest: %s' % ihref)
        if tag in ('br', 'hr') or ems >= 1:
            if ems < 1:
                multiplier = 1
            else:
                multiplier = ems
            if self.in_p:
                closed_tags = []
                open_tags = tag_stack + tags
                open_tags.reverse()
                for t in open_tags:
                    fb2_out.append('</%s>' % t)
                    closed_tags.append(t)
                    if t == 'p':
                        break
                fb2_out.append('<empty-line/>' * multiplier)
                closed_tags.reverse()
                for t in closed_tags:
                    fb2_out.append('<%s>' % t)
            else:
                fb2_out.append('<empty-line/>' * multiplier)
        if tag in ('div', 'li', 'p'):
            p_text, added_p = self.close_open_p(tag_stack + tags)
            fb2_out += p_text
            if added_p:
                tags.append('p')
        if tag == 'a' and elem_tree.attrib.get('href', None):
            # Handle only external links for now
            if urllib.parse.urlparse(elem_tree.attrib['href']).netloc:
                p_txt, p_tag = self.ensure_p()
                fb2_out += p_txt
                tags += p_tag
                fb2_out.append('<a l:href="%s">' %
                               base.urlnormalize(elem_tree.attrib['href']))
                tags.append('a')
        if tag == 'b' or style['font-weight'] in ('bold', 'bolder'):
            s_out, s_tags = self.handle_simple_tag('strong', tag_stack + tags)
            fb2_out += s_out
            tags += s_tags
        if tag == 'i' or style['font-style'] == 'italic':
            s_out, s_tags = self.handle_simple_tag('emphasis',
                                                   tag_stack + tags)
            fb2_out += s_out
            tags += s_tags
        if (tag in ('del', 'strike')
                or style['text-decoration'] == 'line-through'):
            s_out, s_tags = self.handle_simple_tag('strikethrough',
                                                   tag_stack + tags)
            fb2_out += s_out
            tags += s_tags
        if tag == 'sub':
            s_out, s_tags = self.handle_simple_tag('sub', tag_stack + tags)
            fb2_out += s_out
            tags += s_tags
        if tag == 'sup':
            s_out, s_tags = self.handle_simple_tag('sup', tag_stack + tags)
            fb2_out += s_out
            tags += s_tags

        # Process element text.
        if hasattr(elem_tree, 'text') and elem_tree.text:
            if not self.in_p:
                fb2_out.append('<p>')
            fb2_out.append(prepare_string_for_xml(elem_tree.text))
            if not self.in_p:
                fb2_out.append('</p>')

        # Process sub-elements.
        for item in elem_tree:
            fb2_out += self.dump_text(item, stylizer, page, tag_stack + tags)

        # Close open FB2 tags.
        tags.reverse()
        fb2_out += self.close_tags(tags)

        # Process element text that comes after the close of the XHTML tag but
        # before the next XHTML tag.
        if hasattr(elem_tree, 'tail') and elem_tree.tail:
            if not self.in_p:
                fb2_out.append('<p>')
            fb2_out.append(prepare_string_for_xml(elem_tree.tail))
            if not self.in_p:
                fb2_out.append('</p>')

        return fb2_out

示例#9

0

显示文件

文件： fb2ml.py 项目： keshavbhatt/ebook-converter

    def fb2_header(self):
        metadata = {}
        metadata['title'] = self.oeb_book.metadata.title[0].value
        metadata['appname'] = __appname__
        metadata['version'] = __version__
        metadata['date'] = '%i.%i.%i' % (
            datetime.now().day, datetime.now().month, datetime.now().year)
        if self.oeb_book.metadata.language:
            lc = lang_as_iso639_1(self.oeb_book.metadata.language[0].value)
            if not lc:
                lc = self.oeb_book.metadata.language[0].value
            metadata['lang'] = lc or 'en'
        else:
            metadata['lang'] = u'en'
        metadata['id'] = None
        metadata['cover'] = self.get_cover()
        metadata['genre'] = self.opts.fb2_genre

        metadata['author'] = ''
        for auth in self.oeb_book.metadata.creator:
            author_first = ''
            author_middle = ''
            author_last = ''
            author_parts = auth.value.split(' ')
            if len(author_parts) == 1:
                author_last = author_parts[0]
            elif len(author_parts) == 2:
                author_first = author_parts[0]
                author_last = author_parts[1]
            else:
                author_first = author_parts[0]
                author_middle = ' '.join(author_parts[1:-1])
                author_last = author_parts[-1]
            metadata['author'] += '<author>'
            metadata['author'] += ('<first-name>%s</first-name>' %
                                   prepare_string_for_xml(author_first))
            if author_middle:
                metadata['author'] += ('<middle-name>%s</middle-name>' %
                                       prepare_string_for_xml(author_middle))
            metadata['author'] += ('<last-name>%s</last-name>' %
                                   prepare_string_for_xml(author_last))
            metadata['author'] += '</author>'
        if not metadata['author']:
            metadata['author'] = ('<author><first-name></first-name>'
                                  '<last-name></last-name></author>')

        metadata['keywords'] = ''
        tags = list(map(str, self.oeb_book.metadata.subject))
        if tags:
            tags = ', '.join(prepare_string_for_xml(x) for x in tags)
            metadata['keywords'] = '<keywords>%s</keywords>' % tags

        metadata['sequence'] = ''
        if self.oeb_book.metadata.series:
            index = '1'
            if self.oeb_book.metadata.series_index:
                index = self.oeb_book.metadata.series_index[0]
            seq = prepare_string_for_xml(str(self.oeb_book.metadata.series[0]))
            metadata['sequence'] = ('<sequence name="%s" number="%s"/>' %
                                    (seq, index))

        year = publisher = isbn = ''
        identifiers = self.oeb_book.metadata['identifier']
        for x in identifiers:
            if (x.get(base.tag('opf', 'scheme'), None).lower() == 'uuid'
                    or str(x).startswith('urn:uuid:')):
                metadata['id'] = str(x).split(':')[-1]
                break
        if metadata['id'] is None:
            self.log.warn('No UUID identifier found')
            metadata['id'] = str(uuid.uuid4())

        try:
            date = self.oeb_book.metadata['date'][0]
        except IndexError:
            pass
        else:
            year = ('<year>%s</year>' %
                    prepare_string_for_xml(date.value.partition('-')[0]))

        try:
            publisher = self.oeb_book.metadata['publisher'][0]
        except IndexError:
            pass
        else:
            publisher = ('<publisher>%s</publisher>' %
                         prepare_string_for_xml(publisher.value))

        for x in identifiers:
            if x.get(base.tag('opf', 'scheme'), None).lower() == 'isbn':
                isbn = '<isbn>%s</isbn>' % prepare_string_for_xml(x.value)

        metadata['year'] = year
        metadata['isbn'] = isbn
        metadata['publisher'] = publisher
        for key, value in metadata.items():
            if key not in ('author', 'cover', 'sequence', 'keywords', 'year',
                           'publisher', 'isbn'):
                metadata[key] = prepare_string_for_xml(value)

        try:
            comments = self.oeb_book.metadata['description'][0]
        except Exception:
            metadata['comments'] = ''
        else:
            from ebook_converter.utils.html2text import html2text
            annot = prepare_string_for_xml(html2text(comments.value).strip())
            metadata['comments'] = f'<annotation><p>{annot}</p></annotation>'

        # Keep the indentation level of the description the same as the body.
        header = textwrap.dedent('''\
            <FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">
            <description>
                <title-info>
                    <genre>%(genre)s</genre>
                    %(author)s
                    <book-title>%(title)s</book-title>
                    %(cover)s
                    <lang>%(lang)s</lang>
                    %(keywords)s
                    %(sequence)s
                    %(comments)s
                </title-info>
                <document-info>
                    %(author)s
                    <program-used>%(appname)s %(version)s</program-used>
                    <date>%(date)s</date>
                    <id>%(id)s</id>
                    <version>1.0</version>
                </document-info>
                <publish-info>
                    %(publisher)s
                    %(year)s
                    %(isbn)s
                </publish-info>
            </description>''') % metadata

        # Remove empty lines.
        return '\n'.join(filter(str.strip, header.splitlines()))

示例#10

0

显示文件

文件： comments.py 项目： keshavbhatt/ebook-converter

def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p> returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()

    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, str):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [
            u'<p class="description">%s</p>' % x.replace(u'\n', u'<br />')
            for x in comments.split('\n\n')
        ]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(
        lambda m: m.group().replace('.', '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(
            lost_cr.group(), '%s%s\n\n%s' %
            (lost_cr.group(1), lost_cr.group(2), lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '&mdash;')

    soup = html5_parser('<div>' + comments + '</div>').find('div')
    result = html5_parser('<div>')
    container = result.find('div')
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    inline_tags = ('br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a', 'hr')
    for token in all_tokens:
        if isinstance(token, (bs4.CData, bs4.Comment, bs4.Declaration,
                              bs4.ProcessingInstruction)):
            continue
        if isinstance(token, bs4.NavigableString):
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        elif token.name in inline_tags:
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                container.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            container.insert(rtc, token)
            rtc += 1

    if open_pTag:
        container.insert(rtc, pTag)

    for p in container.findAll('p'):
        p['class'] = 'description'

    return container.decode_contents()

示例#11

0

显示文件

文件： oeb2html.py 项目： keshavbhatt/ebook-converter

    def dump_text(self, elem, stylizer, page):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        '''

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, (str, bytes)) \
           or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS):
            p = elem.getparent()
            if p is not None and isinstance(p.tag, (str, bytes)) and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = parse_utils.barename(elem.tag)
        attribs = elem.attrib

        style_a = '%s' % style
        style_a = style_a if style_a else ''
        if tag == 'body':
            # Change the body to a div so we can merge multiple files.
            tag = 'div'
            # Add page-break-brefore: always because renders typically treat a new file (we're merging files)
            # as a page break and remove all other page break types that might be set.
            style_a = 'page-break-before: always; %s' % re.sub(
                'page-break-[^:]+:[^;]+;?', '', style_a)
        # Remove unnecessary spaces.
        style_a = re.sub(r'\s{2,}', ' ', style_a).strip()
        tags.append(tag)

        # Remove attributes we won't want.
        if 'class' in attribs:
            del attribs['class']
        if 'style' in attribs:
            del attribs['style']

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''
        for k, v in attribs.items():
            at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))

        # Turn style into strings for putting in the tag.
        style_t = ''
        if style_a:
            style_t = ' style="%s"' % style_a.replace('"', "'")

        # Write the tag.
        text.append('<%s%s%s' % (tag, at, style_t))
        if tag in SELF_CLOSING_TAGS:
            text.append(' />')
        else:
            text.append('>')

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            if t not in SELF_CLOSING_TAGS:
                text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text

示例#12

0

显示文件

文件： oeb2html.py 项目： keshavbhatt/ebook-converter

    def dump_text(self, elem, stylizer, page):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        '''

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, (str, bytes)) \
           or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS):
            p = elem.getparent()
            if p is not None and isinstance(p.tag, (str, bytes)) and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = parse_utils.barename(elem.tag)
        attribs = elem.attrib

        if tag == 'body':
            tag = 'div'
        tags.append(tag)

        # Ignore anything that is set to not be displayed.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            return ['']

        # Remove attributes we won't want.
        if 'class' in attribs:
            del attribs['class']
        if 'style' in attribs:
            del attribs['style']

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''
        for k, v in attribs.items():
            at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))

        # Write the tag.
        text.append('<%s%s' % (tag, at))
        if tag in SELF_CLOSING_TAGS:
            text.append(' />')
        else:
            text.append('>')

        # Turn styles into tags.
        if style['font-weight'] in ('bold', 'bolder'):
            text.append('<b>')
            tags.append('b')
        if style['font-style'] == 'italic':
            text.append('<i>')
            tags.append('i')
        if style['text-decoration'] == 'underline':
            text.append('<u>')
            tags.append('u')
        if style['text-decoration'] == 'line-through':
            text.append('<s>')
            tags.append('s')

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            if t not in SELF_CLOSING_TAGS:
                text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text