Example #1
def processWebpage(self, webpage, dump_xhtml=False):
    from copy import copy
    # Parse the page lazily if it has not been parsed yet.
    if not hasattr(webpage, 'tree'):
        webpage.tree = webpage._get_parse_tree()
    self.remapLinks(webpage)
    self.tree_processor = TreeProcessor()
    #self.tree_processor.getMetaInfo(webpage)
    self.tree_processor.annotateNodes(webpage)
    self.tree_processor.clean(webpage)
    # Serialize a copy so the original tree is left untouched.
    webpage.xml = self.serializeArticle(copy(webpage.tree))
    self.container.addArticle(webpage)
    if dump_xhtml:
        return webpage.xml
    # Free memory once the article has been written to the container.
    del webpage.tree
    del webpage.xml
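
A minimal sketch of how this method might be driven by hand, assuming an
EpubWriter instance like the one in the class example below and a prebuilt
coll object (the output path and coll are placeholder assumptions, and the
collection import mirrors the mwlib.epub imports used in that example;
renderColl is the usual entry point and also handles cover, title, and
meta pages):

from mwlib.epub import collection

writer = EpubWriter('out.epub', coll)  # 'out.epub' and coll are placeholders
writer.initContainer()
for lvl, item in coll.outline.walk():
    if isinstance(item, collection.WebPage):
        # Remaps links, cleans the tree, and writes the article
        # into the epub container.
        writer.processWebpage(item)
writer.closeContainer()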
Example #2
class EpubWriter(object):

    def __init__(self, output, coll, status_callback=None, cover_img=None):
        self.output = output
        self.target_dir = os.path.dirname(output)
        self.coll = coll
        self.scaled_images = {}
        self.status_callback = status_callback
        self.cover_img = cover_img

    def initContainer(self):
        if not os.path.exists(self.target_dir):
            os.makedirs(self.target_dir)
            print 'created dir %r' % self.target_dir
        self.container = EpubContainer(self.output, self.coll)
        self.container.link_file(os.path.join(os.path.dirname(__file__), 'wp.css'),
                                 'OPS/wp.css')

    def closeContainer(self):
        self.container.close()

    def renderColl(self, dump_xhtml=False):
        xhtml = None
        self.initContainer()
        self.processCoverImage()
        self.processTitlePage()
        # Guard against an empty outline to avoid a ZeroDivisionError.
        progress_inc = 100.0 / max(len(self.coll.outline.items), 1)
        for n, (lvl, webpage) in enumerate(self.coll.outline.walk()):
            if isinstance(webpage, collection.WebPage):
                xhtml = self.processWebpage(webpage, dump_xhtml=dump_xhtml)
            elif isinstance(webpage, collection.Chapter):
                self.processChapter(webpage)
            if self.status_callback:
                self.status_callback(progress=n*progress_inc)
        self.processMetaInfo()
        self.closeContainer()
        if dump_xhtml:
            return xhtml

    def processCoverImage(self):
        if not self.cover_img:
            return
        content = [E.div(dict(style='width:100%;height:100%;'),
                         E.img(dict(src='images/cover' + os.path.splitext(self.cover_img)[1],
                                    alt='',
                                    #width='100%',
                                    height='100%',
                                    style='max-height:100%;max-width:100%;margin:auto;',
                                    ))),
                   ]
        xml = misc.xhtml_page(title='cover',
                              body_content=content,
                              flatten=True)
        self.container.addCover(xml, os.path.abspath(self.cover_img))

    def processTitlePage(self):
        if not any(txt != '' for txt in [self.coll.title,
                                         self.coll.subtitle,
                                         self.coll.editor]):
            return
        titlepage = collection.Chapter(self.coll.title)
        titlepage.id = 'titlepage'
        body_content = [E.h1(self.coll.title,
                             style="margin-top:20%;font-size:200%;text-align:center;"),
                        E.h2(self.coll.subtitle,
                             style="margin-top:1em;font-size:150%;text-align:center;"),
                        E.h3(self.coll.editor,
                             style="margin-top:1em;font-size:100%;text-align:center;"),
                        ]
        if any('wikipedia.org' in url for url in self.coll.url2webpage):
            img_src = 'wikipedia_logo.jpg'
            titlepage.images = {img_src:
                                os.path.join(os.path.dirname(__file__), img_src)}
            body_content.append(E.div(E.img(src='images/'+img_src,
                                            width='50%', alt='',
                                            ),
                                      style='text-align:center;margin-top:4em;'
                                      ))
        tree = misc.xhtml_page(title=self.coll.title,
                               body_content=body_content,
                               flatten=False)
        titlepage.tree = tree
        titlepage.xml = misc.flatten_tree(tree)
        self.container.addArticle(titlepage)

    def processMetaInfo(self):
        from mwlib.epub import metainfo
        chapter = collection.Chapter(_('Article Sources and Contributors'))
        chapter.id = '_articlesources'
        chapter.xml = metainfo.getArticleMetainfo(chapter, self.coll)
        self.container.addArticle(chapter)

        chapter = collection.Chapter(_('Image Sources, Licenses and Contributors'))
        chapter.id = '_imagesources'
        chapter.xml = metainfo.getImageMetainfo(chapter, self.coll)
        self.container.addArticle(chapter)

    def processChapter(self, chapter):
        self.num_chapters = getattr(self, 'num_chapters', 0) + 1
        chapter.id = 'chapter_%02d' % self.num_chapters
        title = xmlescape(chapter.title)
        chapter.xml = misc.xhtml_page(
            title=title,
            body_content=[E.h1({'style':
                                'margin-top:15%;font-size:200%;text-align:center;'},
                               title)]
            )
        self.container.addArticle(chapter)

    def processWebpage(self, webpage, dump_xhtml=False):
        from copy import copy
        # Parse the page lazily if it has not been parsed yet.
        if not hasattr(webpage, 'tree'):
            webpage.tree = webpage._get_parse_tree()
        self.remapLinks(webpage)
        self.tree_processor = TreeProcessor()
        #self.tree_processor.getMetaInfo(webpage)
        self.tree_processor.annotateNodes(webpage)
        self.tree_processor.clean(webpage)
        # Serialize a copy so the original tree is left untouched.
        webpage.xml = self.serializeArticle(copy(webpage.tree))
        self.container.addArticle(webpage)
        if dump_xhtml:
            return webpage.xml
        # Free memory once the article has been written to the container.
        del webpage.tree
        del webpage.xml

    def remapLinks(self, webpage):
        # Point image sources at their location inside the epub archive;
        # drop images that have no locally stored file.
        for img in webpage.tree.findall('.//img'):
            img_fn = webpage.images.get(img.attrib['src'])
            if img_fn:
                zip_rel_path = os.path.join(config.img_rel_path, os.path.basename(img_fn))
                img.attrib['src'] = zip_rel_path
            else:
                remove_node(img)

        target_ids = [safe_xml_id(_id) for _id in webpage.tree.xpath('.//@id')]
        for a in webpage.tree.findall('.//a'):
            href = a.get('href')
            if not href: # this link is probably just an anchor
                continue
            if href.startswith('#'):
                target_id = safe_xml_id(href)[1:]
                # If the target does not exist, turn this anchor itself into
                # the target so the internal link does not dangle.
                if target_id not in target_ids:
                    a.set('id', target_id)
                    target_ids.append(target_id)
                a.set('href', '#'+target_id)
            else:
                url = clean_url(urlparse.urljoin(webpage.url, href))
                linked_wp = webpage.coll.url2webpage.get(url)
                if linked_wp:
                    a.set('href', linked_wp.id + '.xhtml')
                else:
                    a.set('href', url)

    def serializeArticle(self, node):
        # lxml elements with no children are falsy, so compare against None
        # explicitly; "not node.find(...)" would pass for an empty <body>.
        assert node.find('.//body') is None, 'error: node contains BODY tag'

        html = E.html({'xmlns':"http://www.w3.org/1999/xhtml"},
                      E.head(E.meta({'http-equiv':"Content-Type",
                                     'content': "application/xhtml+xml; charset=utf-8"})
                             ),
                      )

        # Move the article's head contents into the new document head, then
        # drop the now-empty head element from the article tree.
        head = html.find('.//head')
        node_head = node.find('.//head')
        for head_content in node_head.iterchildren():
            head.append(head_content)
        node_head.getparent().remove(node_head)

        body = E.body()
        html.append(body)
        body.extend(node)

        return misc.flatten_tree(html)
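
And a hedged end-to-end sketch of the class above; the output path, the
coll object, and the callback are illustrative assumptions, since these
examples do not show how the collection itself is built:

def report(progress=0):
    # renderColl reports progress as a percentage via status_callback.
    print 'progress: %d%%' % progress

writer = EpubWriter('/tmp/book.epub', coll,   # coll: a prebuilt mwlib.epub collection
                    status_callback=report,
                    cover_img='cover.jpg')    # cover image is optional
writer.renderColl()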