import collections
import functools
import os
import re
import urllib.parse

from lxml import etree

# Internal helpers used below (base.xml2text, XPath, TOC, parse_html,
# create_ncx, find_existing_ncx_toc, get_lang, canonicalize_lang,
# lang_as_iso639_1, JACKET_XPATH, at_start, ...) are assumed to be provided
# by the surrounding package and are not imported here.


def commit_ncx_toc(container, toc, lang=None, uid=None):
    tocname = find_existing_ncx_toc(container)
    if tocname is None:
        # No NCX in the book yet: create one and point the spine at it
        item = container.generate_item('toc.ncx', id_prefix='toc')
        tocname = container.href_to_name(item.get('href'),
                                         base=container.opf_name)
        ncx_id = item.get('id')
        for s in container.opf_xpath('//opf:spine'):
            s.set('toc', ncx_id)
    if not lang:
        lang = get_lang()
        for _l in container.opf_xpath('//dc:language'):
            _l = canonicalize_lang(base.xml2text(_l).strip())
            if _l:
                lang = _l
                break
    # Prefer the two-letter ISO 639-1 form when one exists
    lang = lang_as_iso639_1(lang) or lang
    if not uid:
        uid = base.uuid_id()
        eid = container.opf.get('unique-identifier', None)
        if eid:
            m = container.opf_xpath('//*[@id="%s"]' % eid)
            if m:
                uid = base.xml2text(m[0])
    title = 'Table of Contents'
    m = container.opf_xpath('//dc:title')
    if m:
        x = base.xml2text(m[0]).strip()
        title = x or title
    to_href = functools.partial(container.name_to_href, base=tocname)
    root = create_ncx(toc, to_href, title, lang, uid)
    container.replace(tocname, root)
    container.pretty_print.add(tocname)
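# Hedged sketch of the language normalisation above, assuming the helpers
# behave like calibre's calibre.utils.localization functions:
#
#   canonicalize_lang('English')  # -> 'eng' (canonical ISO 639-2 code)
#   lang_as_iso639_1('eng')       # -> 'en'  (2-letter form, when one exists)
#   lang_as_iso639_1('und')       # -> None, so 'or lang' keeps the original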
def remove_first_image(self):
    deleted_item = None
    for item in self.oeb.spine:
        if XPath(JACKET_XPATH)(item.data):
            continue  # Never strip images from the metadata jacket
        removed = self.remove_images(item)
        if removed > 0:
            self.log('Removed first image')
            body = XPath('//h:body')(item.data)
            if body:
                raw = xml2text(body[0]).strip()
                imgs = XPath('//h:img|//svg:svg')(item.data)
                if not raw and not imgs:
                    # The item held nothing but the image: drop it entirely
                    self.log('Removing %s as it has no content' % item.href)
                    self.oeb.manifest.remove(item)
                    deleted_item = item
            break
    else:  # no spine item contained an image
        self.log.warn('Could not find first image to remove')
    if deleted_item is not None:
        # Scrub ToC and guide references to the deleted item
        for item in list(self.oeb.toc):
            href = urllib.parse.urldefrag(item.href)[0]
            if href == deleted_item.href:
                self.oeb.toc.remove(item)
        self.oeb.guide.remove_by_href(deleted_item.href)
def _toc_from_html(self, opf):
    if 'toc' not in self.oeb.guide:
        return False
    self.log.debug('Reading TOC from HTML...')
    itempath, frag = urllib.parse.urldefrag(self.oeb.guide['toc'].href)
    item = self.oeb.manifest.hrefs[itempath]
    html = item.data
    if frag:
        elems = base.xpath(html, './/*[@id="%s"]' % frag)
        if not elems:
            elems = base.xpath(html, './/*[@name="%s"]' % frag)
        elem = elems[0] if elems else html
        while elem != html and not base.xpath(elem, './/h:a[@href]'):
            elem = elem.getparent()
        html = elem
    titles = collections.defaultdict(list)
    order = []
    for anchor in base.xpath(html, './/h:a[@href]'):
        href = anchor.attrib['href']
        href = item.abshref(base.urlnormalize(href))
        path, frag = urllib.parse.urldefrag(href)
        if path not in self.oeb.manifest.hrefs:
            continue
        title = base.xml2text(anchor)
        title = base.COLLAPSE_RE.sub(' ', title.strip())
        if href not in titles:
            order.append(href)
        titles[href].append(title)
    toc = self.oeb.toc
    for href in order:
        toc.add(' '.join(titles[href]), href)
    return True
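# Hedged illustration of the title merging above: if the HTML ToC contains
#   <a href="ch1.html">Chapter</a> ... <a href="ch1.html">One</a>
# both anchors share one href, so a single entry titled 'Chapter One' is
# added, and 'order' preserves the first-seen document order of the hrefs.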
def create_toc_from_links(self):
    num = 0
    for item in self.oeb.spine:
        for a in XPath('//h:a[@href]')(item.data):
            href = a.get('href')
            try:
                purl = urllib.parse.urlparse(href)
            except ValueError:
                self.log.warning('Ignoring malformed URL:', href)
                continue
            # Only book-internal links (no scheme, or file:) become entries
            if not purl[0] or purl[0] == 'file':
                href, frag = purl.path, purl.fragment
                href = item.abshref(href)
                if frag:
                    href = '#'.join((href, frag))
                if not self.oeb.toc.has_href(href):
                    text = base.xml2text(a)
                    text = text[:100].strip()
                    if (not self.opts.duplicate_links_in_toc and
                            self.oeb.toc.has_text(text)):
                        continue
                    try:
                        self.oeb.toc.add(
                            text, href,
                            play_order=self.oeb.toc.next_play_order())
                        num += 1
                    except ValueError:
                        # Most likely an incorrectly URL encoded link
                        self.oeb.log.exception(
                            'Failed to process link: %r' % href)
                        continue
                    if (self.opts.max_toc_links > 0 and
                            num >= self.opts.max_toc_links):
                        self.log('Maximum TOC links reached, stopping.')
                        return
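# Hedged note on the scheme check above: for book-internal links
# urllib.parse.urlparse returns an empty scheme, e.g.
#
#   import urllib.parse
#   p = urllib.parse.urlparse('chap1.html#sec2')
#   (p.scheme, p.path, p.fragment)  # -> ('', 'chap1.html', 'sec2')
#
# so purl[0] is falsy and the link is kept; http/https links are skipped.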
def elem_to_toc_text(elem):
    text = base.xml2text(elem).strip()
    if not text:
        text = elem.get('title', '')
    if not text:
        text = elem.get('alt', '')
    text = re.sub(r'\s+', ' ', text.strip())
    text = text[:1000].strip()
    if not text:
        text = '(Untitled)'
    return text
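# Hedged usage sketch (lxml and a namespace-free document assumed, for
# illustration only):
#
#   from lxml import etree
#   elem_to_toc_text(etree.fromstring('<h1> Chapter\n One </h1>'))
#   # -> 'Chapter One' (whitespace collapsed)
#   elem_to_toc_text(etree.fromstring('<img alt="Cover"/>'))
#   # -> 'Cover' (falls back to title, then alt, then '(Untitled)')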
def detect_chapters(self):
    self.detected_chapters = []
    self.chapter_title_attribute = None

    def find_matches(expr, doc):
        try:
            ans = XPath(expr)(doc)
            len(ans)  # a non-node result (e.g. count()) raises TypeError
            return ans
        except Exception:
            self.log.warn('Invalid chapter expression, ignoring: %s' % expr)
            return []

    if self.opts.chapter:
        chapter_path, title_attribute = self.get_toc_parts_for_xpath(
            self.opts.chapter)
        self.chapter_title_attribute = title_attribute
        for item in self.oeb.spine:
            for x in find_matches(chapter_path, item.data):
                self.detected_chapters.append((item, x))

        chapter_mark = self.opts.chapter_mark
        page_break_before = 'display: block; page-break-before: always'
        page_break_after = 'display: block; page-break-after: always'
        c = collections.Counter()
        for item, elem in self.detected_chapters:
            c[item] += 1
            text = base.xml2text(elem).strip()
            text = re.sub(r'\s+', ' ', text.strip())
            self.log('\tDetected chapter:', text[:50])
            if chapter_mark == 'none':
                continue
            if chapter_mark == 'rule':
                mark = elem.makeelement(base.tag('xhtml', 'hr'))
            elif chapter_mark == 'pagebreak':
                if c[item] < 3 and at_start(elem):
                    # For the first two elements in this item, check if
                    # they are at the start of the file, in which case
                    # inserting a page break is unnecessary and can lead
                    # to extra blank pages in the PDF Output plugin. We
                    # need to use two as feedbooks epubs match both a
                    # heading tag and its containing div with the default
                    # chapter expression.
                    continue
                mark = elem.makeelement(base.tag('xhtml', 'div'),
                                        style=page_break_after)
            else:  # chapter_mark == 'both'
                mark = elem.makeelement(base.tag('xhtml', 'hr'),
                                        style=page_break_before)
            try:
                elem.addprevious(mark)
            except TypeError:
                self.log.exception('Failed to mark chapter')
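# Hedged summary of the chapter_mark modes handled above:
#   'none'      -> log the detection only, insert nothing
#   'rule'      -> insert a plain <hr/> before the chapter element
#   'pagebreak' -> insert a <div> styled 'page-break-after: always'
#   'both'      -> insert an <hr/> styled 'page-break-before: always'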
def find_text(node):
    LIMIT = 200
    pat = re.compile(r'\s+')
    for child in node:
        if isinstance(child, etree._Element):
            text = base.xml2text(child).strip()
            text = pat.sub(' ', text)
            if len(text) < 1:
                continue
            if len(text) > LIMIT:
                # Look for less text in a child of this node, recursively
                ntext = find_text(child)
                return ntext or (text[:LIMIT] + '...')
            else:
                return text
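# Hedged sketch of find_text's recursion: a child whose collapsed text fits
# within LIMIT is returned as-is; an over-long child is descended into, and
# only if no shorter descendant text exists is it truncated, e.g.
#   <div><p>short</p></div>                   -> 'short'
#   <div><p>{300 chars, no children}</p></div> -> first 200 chars + '...'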
def elem_to_link(self, item, elem, title_attribute, counter):
    text = ''
    if title_attribute is not None:
        text = elem.get(title_attribute, '')
    if not text:
        text = base.xml2text(elem).strip()
    if not text:
        text = elem.get('title', '')
    if not text:
        text = elem.get('alt', '')
    text = re.sub(r'\s+', ' ', text.strip())
    text = text[:1000].strip()
    # Ensure the element has an id we can link to
    elem_id = elem.get('id', 'calibre_toc_%d' % counter)
    elem.set('id', elem_id)
    href = '#'.join((item.href, elem_id))
    return text, href
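# Hedged example: for <h2 id="c3">Three</h2> inside a file at 'chap.html',
# elem_to_link returns ('Three', 'chap.html#c3'); an element with no id is
# assigned a synthetic one such as 'calibre_toc_7' when counter == 7.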
def read_inline_toc(self, href, frag):
    ans = TOC()
    base_href = '/'.join(href.split('/')[:-1])
    with open(href.replace('/', os.sep), 'rb') as f:
        raw = f.read().decode(self.header.codec)
    root = parse_html(raw, log=self.log)
    body = base.XPath('//h:body')(root)
    reached = False
    if body:
        start = body[0]
    else:
        start = None
        reached = True
    if frag:
        elems = base.XPath('//*[@id="%s"]' % frag)(root)
        if elems:
            start = elems[0]

    def node_depth(elem):
        ans = 0
        parent = elem.getparent()
        while parent is not None:
            parent = parent.getparent()
            ans += 1
        return ans

    # Layer the ToC based on nesting order in the source HTML
    current_depth = None
    parent = ans
    seen = set()
    links = []
    for elem in root.iterdescendants(etree.Element):
        if reached and elem.tag == base.tag('xhtml', 'a') and elem.get(
                'href', False):
            href = elem.get('href')
            href, frag = urllib.parse.urldefrag(href)
            href = base_href + '/' + href
            text = base.xml2text(elem).strip()
            if (text, href, frag) in seen:
                continue
            seen.add((text, href, frag))
            links.append((text, href, frag, node_depth(elem)))
        elif elem is start:
            # Only anchors after the body (or the fragment target) count
            reached = True

    # Remap raw DOM depths onto contiguous ToC levels
    depths = sorted(set(x[-1] for x in links))
    depth_map = {x: i for i, x in enumerate(depths)}
    for text, href, frag, depth in links:
        depth = depth_map[depth]
        if current_depth is None:
            current_depth = 0
            parent.add_item(href, frag, text)
        elif current_depth == depth:
            parent.add_item(href, frag, text)
        elif current_depth < depth:
            # Descend one level: nest under the most recent sibling
            parent = parent[-1] if len(parent) > 0 else parent
            parent.add_item(href, frag, text)
            current_depth += 1
        else:
            # Climb back up by the depth difference
            delta = current_depth - depth
            while delta > 0 and parent.parent is not None:
                parent = parent.parent
                delta -= 1
            parent.add_item(href, frag, text)
            current_depth = depth
    return ans
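# Hedged sketch of the depth normalisation used above: raw DOM depths are
# remapped onto contiguous ToC levels before nesting, e.g.
#
#   depths = sorted({4, 7, 9})
#   depth_map = {x: i for i, x in enumerate(depths)}
#   # depth_map == {4: 0, 7: 1, 9: 2}
#
# so anchors at uneven nesting depths still produce adjacent ToC levels.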