Example 1
    def reflow_pre(xhtml):
        """ make <pre> reflowable.

        This helps a lot with readers like Sony's that cannot
        scroll horizontally.

        """
        def nbsp(matchobj):
            # Replace a run of spaces with non-breaking spaces,
            # keeping only the last one breakable.
            return ('\u00a0' * (len(matchobj.group(0)) - 1)) + ' '

        for pre in xpath(xhtml, "//xhtml:pre"):
            # white-space: pre-wrap would do fine
            # but it is not supported by OEB
            try:
                pre.tag = NS.xhtml.div
                writers.HTMLishWriter.add_class(pre, 'pgmonospaced')
                m = parsers.RE_GUTENBERG.search(pre.text or '')
                if m:
                    writers.HTMLishWriter.add_class(pre, 'pgheader')

                tail = pre.tail
                s = etree.tostring(pre,
                                   encoding=six.text_type,
                                   with_tail=False)
                s = s.replace('>\n', '>')  # eliminate that empty first line
                s = s.replace('\n', '<br/>')
                s = re.sub('  +', nbsp, s)
                div = etree.fromstring(s)
                div.tail = tail

                pre.getparent().replace(pre, div)

            except etree.XMLSyntaxError as what:
                exception("%s\n%s" % (s, what))
                raise
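
A minimal, self-contained sketch of the space-run substitution above, run on a hypothetical sample line:

    import re

    def nbsp(matchobj):
        # One non-breaking space for each space in the run but the last,
        # so a reader can wrap the line only at the final plain space.
        return ('\u00a0' * (len(matchobj.group(0)) - 1)) + ' '

    line = 'CHAPTER I.     Down the Rabbit-Hole'
    print(repr(re.sub('  +', nbsp, line)))
    # 'CHAPTER I.\xa0\xa0\xa0\xa0 Down the Rabbit-Hole'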
Example 2
    def shipout(self, job, parsers, ncx):
        """ Build the zip file. """

        ocf = None
        try:
            ocf = OEBPSContainer(
                os.path.join(job.outputdir, job.outputfile),
                ('%d/' % options.ebook if options.ebook else None))

            opf = ContentOPF()

            opf.metadata_item(job.dc)

            # write out parsers

            for p in parsers:
                try:
                    ocf.add_bytes(self.url2filename(p.attribs.url),
                                  p.serialize(), p.mediatype())
                    if p.mediatype() == mt.xhtml:
                        opf.spine_item_from_parser(p)
                    else:
                        opf.manifest_item_from_parser(p)
                except Exception as what:
                    error("Could not process file %s: %s" %
                          (p.attribs.url, what))

            # toc

            for t in ncx.toc:
                if t[1].lower().strip(' .') in TOC_HEADERS:
                    opf.guide_item(t[0], 'toc', t[1])
                    break

            opf.toc_item('toc.ncx')
            ocf.add_unicode('toc.ncx', six.text_type(ncx))

            for p in parsers:
                if 'coverpage' in p.attribs.rel:
                    opf.add_coverpage(ocf, p.attribs.url)
                    break

            # Adobe page-map

            # opf.pagemap_item('page-map.xml')
            # ocf.add_unicode('page-map.xml', six.text_type(AdobePageMap(ncx)))

            # content.opf

            # debug(etree.tostring(opf.manifest, encoding=six.text_type, pretty_print=True))

            opf.rewrite_links(self.url2filename)
            ocf.add_unicode('content.opf', six.text_type(opf))

            ocf.commit()

        except Exception as what:
            exception("Error building Epub: %s" % what)
            if ocf is not None:
                ocf.rollback()
            raise
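
One way to sanity-check the committed container is to list its members. The path below is hypothetical, and the mimetype and META-INF/container.xml entries are the standard OCF bookkeeping that OEBPSContainer is assumed to write for us; only content.opf and toc.ncx are added explicitly above:

    import zipfile

    # shipout() writes to os.path.join(job.outputdir, job.outputfile)
    with zipfile.ZipFile('out/12345.epub') as zf:
        print(zf.namelist())
    # e.g. ['mimetype', 'META-INF/container.xml',
    #       'content.opf', 'toc.ncx', ...]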
Example 3
    def build(self, job):
        """ Build HTML file. """

        htmlfilename = os.path.join(job.outputdir, job.outputfile)
        try:
            os.remove(htmlfilename)
        except OSError:
            pass

        try:
            info("Creating HTML file: %s" % htmlfilename)

            for p in job.spider.parsers:
                # Do html only. The images were copied earlier by PicsDirWriter.

                xhtml = None
                if hasattr(p, 'rst2html'):
                    xhtml = p.rst2html(job)
                elif hasattr(p, 'xhtml'):
                    p.parse()
                    xhtml = copy.deepcopy(p.xhtml)

                if xhtml is not None:
                    self.make_links_relative(xhtml, p.attribs.url)

                    self.add_dublincore(job, xhtml)

                    # makes iphones zoom in
                    self.add_meta(xhtml, 'viewport', 'width=device-width')
                    self.add_meta_generator(xhtml)

                    # This writer currently has to deal only with RST
                    # input.  The RST writer has a workaround that
                    # avoids writing empty elements, so we don't need
                    # the ugly workaround used by the EPUB writer,
                    # which has to deal with HTML input too.
                    html = etree.tostring(xhtml,
                                          method='xml',
                                          doctype=gg.XHTML_DOCTYPE,
                                          encoding='utf-8',
                                          pretty_print=True,
                                          xml_declaration=True)

                    self.write_with_crlf(htmlfilename, html)

            # self.copy_aux_files (job.outputdir)

            info("Done HTML file: %s" % htmlfilename)

        except Exception as what:
            exception("Error building HTML %s: %s" % (htmlfilename, what))
            if os.access(htmlfilename, os.W_OK):
                os.remove(htmlfilename)
            raise
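
A sketch of the serialization step in isolation, on a toy tree; gg.XHTML_DOCTYPE is assumed here to be an XHTML 1.1 doctype string:

    from lxml import etree

    XHTML_DOCTYPE = ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
                     '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">')

    xhtml = etree.fromstring(
        '<html xmlns="http://www.w3.org/1999/xhtml"><body><p>text</p></body></html>')
    html = etree.tostring(xhtml, method='xml', doctype=XHTML_DOCTYPE,
                          encoding='utf-8', pretty_print=True,
                          xml_declaration=True)
    # html is bytes: XML declaration, doctype, then the pretty-printed tree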
Example 4
    def build(self, job):
        """ Build epub """

        ncx = TocNCX(job.dc)
        parsers = []
        css_count = 0

        # add CSS parser
        self.add_external_css(job.spider, None, PRIVATE_CSS, 'pgepub.css')

        try:
            chunker = HTMLChunker.HTMLChunker()
            coverpage_url = None

            # do images early as we need the new dimensions later
            for p in job.spider.parsers:
                if hasattr(p, 'resize_image'):
                    if 'coverpage' in p.attribs.rel:
                        if job.maintype == 'kindle':
                            np = p.resize_image(MAX_IMAGE_SIZE_KINDLE,
                                                MAX_COVER_DIMEN_KINDLE, 'jpeg')
                        else:
                            np = p.resize_image(MAX_IMAGE_SIZE,
                                                MAX_COVER_DIMEN)
                        np.id = p.attribs.get('id', 'coverpage')
                        coverpage_url = p.attribs.url
                    else:
                        if job.maintype == 'kindle':
                            np = p.resize_image(MAX_IMAGE_SIZE_KINDLE,
                                                MAX_IMAGE_DIMEN_KINDLE)
                        else:
                            np = p.resize_image(MAX_IMAGE_SIZE,
                                                MAX_IMAGE_DIMEN)
                        np.id = p.attribs.get('id')
                    parsers.append(np)

            for p in job.spider.parsers:
                if p.mediatype() in OPS_CONTENT_DOCUMENTS:
                    debug("URL: %s" % p.attribs.url)

                    if hasattr(p, 'rst2epub2'):
                        xhtml = p.rst2epub2(job)

                        if options.verbose >= 2:
                            # write html to disk for debugging
                            debugfilename = os.path.join(
                                job.outputdir, job.outputfile)
                            debugfilename = os.path.splitext(debugfilename)[0] + \
                                '.' + job.maintype + '.debug.html'
                            with open(debugfilename, 'wb') as fp:
                                fp.write(
                                    etree.tostring(xhtml, encoding='utf-8'))

                    else:
                        # make a copy so we can mess around
                        p.parse()
                        xhtml = copy.deepcopy(p.xhtml)

                    strip_classes = self.get_classes_that_float(xhtml)
                    strip_classes = strip_classes.intersection(STRIP_CLASSES)
                    if strip_classes:
                        self.strip_pagenumbers(xhtml, strip_classes)

                    # build up TOC
                    # has side effects on xhtml
                    ncx.toc += p.make_toc(xhtml)

                    self.insert_root_div(xhtml)
                    self.fix_charset(xhtml)
                    self.fix_style_elements(xhtml)
                    self.reflow_pre(xhtml)

                    # strip all links to items not in manifest
                    p.strip_links(xhtml, job.spider.dict_urls_mediatypes())
                    self.strip_links(xhtml, job.spider.dict_urls_mediatypes())

                    self.strip_noepub(xhtml)
                    # self.strip_rst_dropcaps (xhtml)

                    self.fix_html_image_dimensions(xhtml)
                    if coverpage_url:
                        self.remove_coverpage(xhtml, coverpage_url)

                    # externalize and fix CSS
                    for style in xpath(xhtml, '//xhtml:style'):
                        self.add_external_css(job.spider, xhtml, style.text,
                                              "%d.css" % css_count)
                        css_count += 1
                        style.drop_tree()

                    self.add_external_css(job.spider, xhtml, None,
                                          'pgepub.css')

                    self.add_meta_generator(xhtml)

                    debug("Splitting %s ..." % p.attribs.url)
                    chunker.next_id = 0
                    chunker.split(xhtml, p.attribs)

            for p in job.spider.parsers:
                if hasattr(p, 'sheet'):
                    self.fix_incompatible_css(p.sheet)
                    p.rewrite_links(self.url2filename)
                    parsers.append(p)

            # after splitting html into chunks we have to rewrite all
            # internal links in HTML
            chunker.rewrite_internal_links()
            # also in the TOC
            if not ncx.toc:
                ncx.toc.append([job.spider.parsers[0].attribs.url, 'Start', 1])
            chunker.rewrite_internal_links_toc(ncx.toc)

            # make absolute links zip-filename-compatible
            chunker.rewrite_links(self.url2filename)
            ncx.rewrite_links(self.url2filename)

            # Do away with the chunker and copy all chunks into new parsers.
            # These are fake parsers that never actually parsed anything;
            # we just use them to hold our data.
            for chunk, attribs in chunker.chunks:
                p = ParserFactory.ParserFactory.get(attribs)
                p.xhtml = chunk
                parsers.append(p)

            self.shipout(job, parsers, ncx)

        except Exception as what:
            exception("Error building Epub: %s" % what)
            raise
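
The deepcopy above matters because almost every later step mutates the tree (stripping page numbers, dropping <style> elements, chunking); a minimal illustration:

    import copy
    from lxml import etree

    orig = etree.fromstring('<div><p>one</p><p>two</p></div>')
    work = copy.deepcopy(orig)
    work.remove(work[0])  # mutate the copy only
    assert len(orig) == 2 and len(work) == 1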
Example 5
def do_job(job):
    """ Do one job. """

    log_handler = None
    Logger.ebook = job.ebook
    if job.logfile:
        log_handler = open_log(
            os.path.join(os.path.abspath(job.outputdir), job.logfile))

    debug('=== Building %s ===' % job.type)
    start_time = datetime.datetime.now()
    try:
        if job.url:
            spider = Spider.Spider()
            dirpath = os.path.dirname(job.url)  # platform native path
            spider.include_urls += (options.include_urls or
                                    [parsers.webify_url(dirpath) + '/*'])  # use for parser only

            spider.include_mediatypes += options.include_mediatypes
            if job.subtype == '.images' or job.type == 'rst.gen':
                spider.include_mediatypes.append('image/*')

            spider.exclude_urls += options.exclude_urls

            spider.exclude_mediatypes += options.exclude_mediatypes

            spider.max_depth = options.max_depth or six.MAXSIZE

            for rewrite in options.rewrite:
                from_url, to_url = rewrite.split('>')
                spider.add_redirection(from_url, to_url)

            attribs = parsers.ParserAttributes()
            attribs.url = parsers.webify_url(job.url)
            attribs.id = 'start'

            if options.input_mediatype:
                attribs.orig_mediatype = attribs.HeaderElement.from_str(
                    options.input_mediatype)

            spider.recursive_parse(attribs)
            elect_coverpage(spider, job.url)
            job.url = spider.redirect(job.url)
            job.base_url = job.url
            job.spider = spider

        writer = WriterFactory.create(job.maintype)
        writer.build(job)

        if options.validate:
            writer.validate(job)

        packager = PackagerFactory.create(options.packager, job.type)
        if packager:
            packager.package(job)

        if job.type == 'html.images':
            # FIXME: hack for push packager
            options.html_images_list = list(job.spider.aux_file_iter())

    except SkipOutputFormat as what:
        warning("%s" % what)

    except Exception as what:
        exception("%s" % what)

    end_time = datetime.datetime.now()
    info(' %s made in %s' % (job.type, end_time - start_time))

    if log_handler:
        close_log(log_handler)
        log_handler = None
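
The rewrite handling above expects 'from>to' pairs (presumably supplied as a command-line option); a hypothetical value:

    # the URLs are made up for illustration
    rewrite = 'https://example.org/old/*>https://example.org/new/*'
    from_url, to_url = rewrite.split('>')
    # from_url == 'https://example.org/old/*'
    # to_url   == 'https://example.org/new/*'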