示例#1
0
    def package(self, job):
        """ Collect the generated files of one ebook into a push zip file.

        ``job.outputfile`` is used as the name of the zip file and must
        start with the ebook number.  If it does not, an error is logged
        and nothing is packaged.

        The zip file gets the plain text and zip variants directly under
        ``<ebook_no>/`` and the html and rst files, each with the image
        files, under ``<ebook_no>/<ebook_no>-h/`` and
        ``<ebook_no>/<ebook_no>-rst/``.

        """
        self.setup(job)
        zipfilename = job.outputfile  # filename is zipfile

        m = re.match(r'\d+', zipfilename)
        if not m:
            error('Invalid filename %s for push packager.' % zipfilename)
            return
        ebook_no = m.group(0)

        # The image list may be missing if no job has stored it on the
        # options object in this run; package without images then.
        image_urls = getattr(options, 'html_images_list', None) or []

        zip_ = self.create(zipfilename)
        try:
            for suffix in '.txt -8.txt -0.txt .zip -8.zip -0.zip -rst.zip -h.zip'.split():
                filename = '%s%s' % (ebook_no, suffix)
                memberfilename = '%s/%s' % (ebook_no, filename)
                self.add(zip_, filename, memberfilename)

            for suffix, ext in (('-h', 'html'), ('-rst', 'rst')):
                filename = '%s%s.%s' % (ebook_no, suffix, ext)
                memberfilename = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, filename)
                self.add(zip_, filename, memberfilename)

                # image files, added once per html / rst directory
                for url in image_urls:
                    rel_url = gg.make_url_relative(job.base_url, url)
                    filename = os.path.join(self.path, rel_url)
                    memberfilename = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, rel_url)
                    self.add(zip_, filename, memberfilename)
        finally:
            # always close the archive, even if adding a member failed
            zip_.close()

        info('Done Zip file: %s' % zipfilename)
示例#2
0
    def __init__(self, filename, oebps_path=None):
        """ Open a new epub container for writing.

        Creates the zip file *filename* and immediately stores the
        mimetype entry and the container.xml file in it.

        *oebps_path* is remembered on the instance; 'OEBPS/' is used if
        it is empty or not given.

        """

        self.zipfilename = filename
        self.oebps_path = oebps_path or 'OEBPS/'

        info('Creating Epub file: %s' % filename)

        zipfile.ZipFile.__init__(self, filename, 'w', zipfile.ZIP_DEFLATED)

        # The OCF spec says the mimetype entry must come first and must
        # be stored uncompressed.
        mimetype_entry = self.zi()
        mimetype_entry.filename = 'mimetype'
        mimetype_entry.compress_type = zipfile.ZIP_STORED
        self.writestr(mimetype_entry, 'application/epub+zip')

        self.add_container_xml('content.opf')

        # counter used to generate unique filenames for wrappers
        self.wrappers = 0
示例#3
0
    def tidy(html):
        """ Pipe html thru w3c tidy.

        The html string is cleaned with three regular expressions, fed
        to an external ``tidy`` process as utf-8 and the xhtml written
        by tidy is returned as string.

        Lines on tidy's stderr that carry an ``Info:``, ``Warning:`` or
        ``Error:`` marker are forwarded to the matching log function.

        Raises ValueError, with tidy's decoded stderr output as message,
        if tidy exits with return code 2.

        """

        html = parsers.RE_RESTRICTED.sub('', html)
        html = RE_XMLDECL.sub('', html)
        html = parsers.RE_HTML_CHARSET.sub('; charset=utf-8', html)

        # convert to xhtml
        proc = subprocess.Popen(
            ["tidy",
             "-utf8",
             "-clean",
             "--wrap",             "0",
             "--output-xhtml",     "y",
             "--numeric-entities", "y",
             "--merge-divs",       "n",  # keep poetry indentation
             "--merge-spans",      "n",
             "--add-xml-decl",     "n",
             "--doctype",          "strict",
             "--anchor-as-name",   "n",
             "--enclose-text",     "y"],

            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)

        (html, stderr) = proc.communicate(html.encode('utf-8'))

        regex = re.compile(r'(Info:|Warning:|Error:)\s*', re.I)
        loggers = {'info:': info, 'warning:': warning, 'error:': error}

        # sys.stderr may have been replaced by an object without a usable
        # encoding, and tidy's output may not be valid in that encoding:
        # never let the diagnostics themselves raise.
        encoding = getattr(sys.stderr, 'encoding', None) or 'utf-8'
        msg = stderr.decode(encoding, 'replace').strip()
        for line in msg.splitlines():
            match = regex.search(line)
            if match:
                sline = regex.sub("", line)
                log = loggers.get(match.group(1).lower())
                if log is None:
                    # marker that does not lowercase to a known level
                    error(line)
                else:
                    log("tidy: %s" % sline)

        if proc.returncode == 2:
            raise ValueError(msg)

        return html.decode('utf-8')
示例#4
0
    def build(self, job):
        """ Build Pics file.

        Copies the auxiliary files of *job* into the absolute form of
        ``job.outputdir`` and logs start and end of the operation.

        """

        pics_dir = os.path.abspath(job.outputdir)
        info("Creating Pics directory in: %s" % pics_dir)

        self.copy_aux_files(job, pics_dir)

        info("Done Pics directory in: %s" % pics_dir)
示例#5
0
    def build(self, job):
        """ Build HTML file.

        Removes a stale output file, then converts every parser of
        ``job.spider`` that offers ``rst2html`` or ``xhtml`` into an
        xhtml tree, adds metadata to it and writes it to the output
        file.

        On any error the error is logged, a partially written output
        file is removed on a best-effort basis and the original
        exception is re-raised.

        """

        htmlfilename = os.path.join(job.outputdir, job.outputfile)
        try:
            os.remove(htmlfilename)
        except OSError:
            # nothing to remove, or not removable: go on and overwrite
            pass

        try:
            info("Creating HTML file: %s" % htmlfilename)

            for p in job.spider.parsers:
                # Do html only. The images were copied earlier by PicsDirWriter.

                xhtml = None
                if hasattr(p, 'rst2html'):
                    xhtml = p.rst2html(job)
                elif hasattr(p, 'xhtml'):
                    p.parse()
                    # work on a copy so the tree held by the parser stays
                    # untouched
                    xhtml = copy.deepcopy(p.xhtml)

                if xhtml is not None:
                    self.make_links_relative(xhtml, p.attribs.url)

                    self.add_dublincore(job, xhtml)

                    # makes iphones zoom in
                    self.add_meta(xhtml, 'viewport', 'width=device-width')
                    self.add_meta_generator(xhtml)

                    # This writer has currently to deal only with RST
                    # input.  The RST writer has a workaround that
                    # avoids writing empty elements.  So we don't need
                    # the same ugly workaround as the EPUB writer,
                    # that has to deal with HTML input too.
                    html = etree.tostring(xhtml,
                                          method='xml',
                                          doctype=gg.XHTML_DOCTYPE,
                                          encoding='utf-8',
                                          pretty_print=True,
                                          xml_declaration=True)

                    self.write_with_crlf(htmlfilename, html)

            info("Done HTML file: %s" % htmlfilename)

        except Exception as what:
            exception("Error building HTML %s: %s" % (htmlfilename, what))
            # Best-effort cleanup of a partial file.  A failure to remove
            # it must not replace the error that brought us here.
            try:
                os.remove(htmlfilename)
            except OSError:
                pass
            raise
示例#6
0
    def get_charset_from_meta(self):
        """ Look for a charset hint in the raw document bytes.

        Returns the charset name found by ``parsers.REB_PG_CHARSET`` as
        string, or None if the pattern does not match.

        """

        found = parsers.REB_PG_CHARSET.search(self.bytes_content())
        if not found:
            return None

        charset = found.group(1).decode('ascii')
        info('Got charset %s from pg header' % charset)
        return charset
示例#7
0
    def package(self, job):
        """ Zip the single output file of *job*.

        The zip file is written next to the output file, with the same
        name but a '.zip' extension, and contains the output file as its
        only member.

        """
        self.setup(job)
        filename = self.path_name_ext
        zipfilename = os.path.join(self.path, self.name) + '.zip'
        memberfilename = self.name + self.ext

        zip_ = self.create(zipfilename)
        try:
            self.add(zip_, filename, memberfilename)
        finally:
            # always close the archive, even if adding the member failed
            zip_.close()

        info('Done Zip file: %s' % zipfilename)
示例#8
0
    def add(zip_, filename, memberfilename):
        """ Add one file to the zip.

        *filename* is stored in *zip_* under the name *memberfilename*.
        Files with a '.zip' or '.png' extension are stored as they are,
        all other files are deflated.  If the file cannot be accessed a
        warning is logged and the file is skipped.

        """

        try:
            # raises OSError before anything is logged or written
            os.stat(filename)

            extension = os.path.splitext(filename)[1]
            if extension in ('.zip', '.png'):
                compression = zipfile.ZIP_STORED
            else:
                compression = zipfile.ZIP_DEFLATED

            info('  Adding file: %s as %s' % (filename, memberfilename))
            zip_.write(filename, memberfilename, compression)
        except OSError:
            warning('ZipPackager: Cannot add file %s', filename)
示例#9
0
    def build(self, job):
        """ Build RST file.

        Writes the preprocessed source of ``job.url`` as utf-8 to the
        output file.  Raises SkipOutputFormat if the parser created for
        the url has no ``rst2nroff`` attribute.

        """

        outputdir = os.path.abspath(job.outputdir)
        filename = os.path.join(outputdir, job.outputfile)

        info("Creating RST file: %s" % filename)

        parser = ParserFactory.ParserFactory.create(job.url)
        if not hasattr(parser, 'rst2nroff'):
            error('RSTWriter can only work on a RSTParser.')
            raise SkipOutputFormat

        text = parser.preprocess('utf-8')
        self.write_with_crlf(filename, text.encode('utf-8'))

        info("Done RST file: %s" % filename)
示例#10
0
    def pre_parse(self):
        """
        Pre-parse a html ebook.

        Does a full parse because a lightweight parse would be almost
        as much work.

        The parsed tree is stored in ``self.xhtml``.  If a tree is
        already there the method returns without doing anything.
        Content that starts with an xml declaration is parsed as it is
        first; if that is not the case or the parse fails, the content
        is piped thru tidy and parsed again.  An error in that second
        parse is not caught.

        """

        # cache
        if self.xhtml is not None:
            return

        debug("HTMLParser.pre_parse () ...")

        html = self.unicode_content()

        if html.startswith('<?xml'):
            # Try a naive parse. This might fail because of errors in
            # the html or because we have no dtd loaded.  We do not
            # load dtds because that makes us dependent on network and
            # the w3c site being up.  Having all users of ebookmaker
            # install local dtds is unrealistic.
            try:
                self.xhtml = self.__parse(html)
            except etree.ParseError:
                # fall thru to the tidy branch below
                pass

        if self.xhtml is None:
            # no naive parse was attempted or it failed, try tidy
            info("Running html thru tidy.")
            html = self.tidy(html)
            self.xhtml = self.__parse(html)  # let exception bubble up

        # The order of the following steps matters: the anchors are fixed
        # while the links are still relative, only then are the links
        # made absolute against the url of this document.
        self._fix_anchors()  # needs relative paths

        self.xhtml.make_links_absolute(base_url=self.attribs.url)

        self._to_xhtml11()

        self._make_coverpage_link()

        debug("Done parsing %s" % self.attribs.url)
示例#11
0
    def build(self, job):
        """ Build TXT file.

        The target encoding is ``job.subtype`` without its dots.  RST
        input is run thru nroff and groff, html input makes the method
        return without writing anything, any other input is written out
        re-encoded.  utf-8 output is written with a signature.

        """

        filename = os.path.join(job.outputdir, job.outputfile)
        encoding = job.subtype.strip('.')

        info("Creating plain text file: %s" % filename)

        parser = ParserFactory.ParserFactory.create(job.url)

        if hasattr(parser, 'rst2nroff'):
            nroff = parser.rst2nroff(job, encoding)
            data = self.groff(job, nroff, encoding)
        elif getattr(parser, 'xhtml', None) is not None:
            info("Plain text file %s aborted due to html input" % filename)
            return
        else:
            data = parser.unicode_content()

        codec = 'utf_8_sig' if encoding == 'utf-8' else encoding
        self.write_with_crlf(filename, data.encode(codec, 'unitame'))

        info("Done plain text file: %s" % filename)
示例#12
0
    def unicode_content(self):
        """ Return the document content as unicode string.

        The result is cached in ``self.unicode_buffer``.  On the first
        call the content is decoded with the first charset that yields a
        non-empty result: the charset from the content type, from the
        meta data, the one guessed from the body, utf-8 and at last
        windows-1252.  Line endings are normalized to newlines if the
        text contains a carriage return or a U+2028 line separator.

        Raises UnicodeError if no charset worked and the last decoding
        attempt did not return an empty string.

        """

        if self.unicode_buffer is not None:
            return self.unicode_buffer

        # Charset candidates in order of preference.  They are callables
        # so that a later guess is only computed if all earlier ones
        # failed.
        charset_sources = (
            self.get_charset_from_content_type,
            self.get_charset_from_meta,
            self.guess_charset_from_body,
            lambda: 'utf-8',
            lambda: 'windows-1252',
        )

        data = None
        for get_charset in charset_sources:
            data = self.decode(get_charset())
            if data:
                break

        if not data:
            if data != '':
                raise UnicodeError(
                    "Text in Klingon encoding ... giving up.")
            info('Continuing parse despite missing file')
            self.unicode_buffer = ''

        # normalize line-endings
        if '\r' in data or '\u2028' in data:
            data = '\n'.join(data.splitlines())
        self.unicode_buffer = data

        return self.unicode_buffer
示例#13
0
    def package(self, job):
        """ Zip the output file of *job* together with its image files.

        All members are put into a directory named like the output file
        without extension.  The image files are those returned by
        ``job.spider.aux_file_iter()``; if that is not available only
        the output file is packaged.

        """
        self.setup(job)

        try:
            aux_file_list = list(job.spider.aux_file_iter())
        except AttributeError:
            # no spider on this job: package without images
            aux_file_list = []

        filename = job.outputfile
        zipfilename = os.path.join(self.path, self.name) + '.zip'
        memberfilename = os.path.join(self.name, self.name) + self.ext

        zip_ = self.create(zipfilename)
        try:
            self.add(zip_, filename, memberfilename)

            # now images
            for url in aux_file_list:
                rel_url = gg.make_url_relative(job.base_url, url)
                filename = os.path.join(self.path, rel_url)
                memberfilename = os.path.join(self.name, rel_url)
                self.add(zip_, filename, memberfilename)
        finally:
            # always close the archive, even if adding a member failed
            zip_.close()

        info('Done Zip file: %s' % zipfilename)
示例#14
0
    def package(self, job):
        """ Gzip the output file of *job*.

        The compressed file is written next to the output file, with
        GZIP_EXTENSION appended to its name.  An IOError while reading
        or writing is logged and not raised.

        """
        # local import, the module may not be imported at file level
        import shutil

        self.setup(job)
        filename = self.path_name_ext
        gzfilename = filename + GZIP_EXTENSION

        try:
            info('Creating Gzip file: %s' % gzfilename)
            info('  Adding file: %s' % filename)
            with open(filename, 'rb') as fp, gzip.open(gzfilename, 'wb') as fpgz:
                # Copy in fixed-size chunks.  Iterating a binary file by
                # "lines" can load huge pieces of it into memory.
                shutil.copyfileobj(fp, fpgz)
            info('Done Gzip file: %s' % gzfilename)
        except IOError as what:
            error(what)
示例#15
0
    def create(zipfilename):
        """ Open a new zip file for writing.

        Returns a ``zipfile.ZipFile`` opened in write mode that deflates
        its members by default.

        """

        info('Creating Zip file: %s' % zipfilename)
        archive = zipfile.ZipFile(zipfilename, 'w', zipfile.ZIP_DEFLATED)
        return archive
示例#16
0
    def build(self, job):
        """ Build kindle file from epub using amazon kindlegen.

        Runs the program configured as ``options.config.MOBIGEN`` inside
        ``job.outputdir`` with ``job.url`` as input.  If the program
        exits with a return code greater than zero its output is
        forwarded to the log.

        Raises SkipOutputFormat if the output directory cannot be
        entered or the program cannot be started.  The working directory
        of this process is restored in any case.

        """

        outputfilename = os.path.join(job.outputdir, job.outputfile)

        info("Creating Kindle file: %s" % outputfilename)
        info("            ... from: %s" % job.url)

        cwd = os.getcwd()
        try:
            try:
                os.chdir(job.outputdir)

                kindlegen = subprocess.Popen(
                    [
                        options.config.MOBIGEN,
                        '-o', os.path.basename(job.outputfile),
                        job.url
                    ],
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE)

            except OSError as what:
                error("KindleWriter: %s %s" % (options.config.MOBIGEN, what))
                raise SkipOutputFormat

            (stdout, stderr) = kindlegen.communicate()
        finally:
            # never leave the process in the output directory
            os.chdir(cwd)

        if kindlegen.returncode > 0:
            regex = re.compile(r'^(\w+)\(prcgen\):')

            # The standard streams may have no usable encoding and the
            # output may not be valid in it: fall back and replace.
            err_encoding = getattr(sys.stderr, 'encoding', None) or 'utf-8'
            out_encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'

            msg = stderr.rstrip()
            if msg:
                error(msg.decode(err_encoding, 'replace'))

            msg = stdout.rstrip().decode(out_encoding, 'replace')
            for line in msg.splitlines():
                match = regex.match(line)
                if match:
                    sline = regex.sub("", line)
                    g = match.group(1).lower()
                    if g == 'info':
                        # info lines are deliberately not logged
                        continue
                    elif g == 'warning':
                        if sline.startswith('Cover is too small'):
                            continue
                        if sline == 'Cover not specified':
                            continue
                        warning("kindlegen: %s" % sline)
                    elif g == 'error':
                        error("kindlegen: %s" % sline)
                    else:
                        error(line)

        info("Done Kindle file: %s" % outputfilename)
示例#17
0
    def build(self, job):
        """ Build kindle file from epub using amazon kindlegen or calibre.

        The converter is ``options.config.MOBILANG`` if the first
        language of the book is in ``no_kindlegen_langs``, else
        ``options.config.MOBIGEN``, which is also used for books without
        any language.  If the selected converter is not configured the
        method logs that and returns.

        The converter runs inside the output directory.  If it exits
        with a return code greater than zero its output is forwarded to
        the log.

        Raises SkipOutputFormat if the output directory cannot be
        entered or the converter cannot be started.  The working
        directory of this process is restored in any case.

        """

        languages = job.dc.languages
        if languages and languages[0].id in no_kindlegen_langs:
            mobimaker = options.config.MOBILANG
        else:
            # also taken for books without language, so that mobimaker is
            # always bound
            mobimaker = options.config.MOBIGEN
        if not mobimaker:
            info('no mobimaker available')
            return

        # kindlegen needs localized paths
        outputdir = os.path.abspath(job.outputdir)
        outputfilename = os.path.join(outputdir, job.outputfile)

        info("Creating Kindle file: %s" % outputfilename)
        info("            ... from: %s" % job.url)

        if 'ebook-convert' in mobimaker:
            # NOTE(review): no shell is involved, so the double quotes are
            # passed to the program literally -- confirm this is intended.
            command = [
                mobimaker,
                job.url,
                os.path.basename(job.outputfile),
                '--personal-doc="[EBOK]"',
            ]
        else:
            command = [
                mobimaker,
                '-o', os.path.basename(job.outputfile),
                job.url
            ]

        cwd = os.getcwd()
        try:
            try:
                os.chdir(outputdir)
                kindlegen = subprocess.Popen(
                    command,
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE
                )
            except OSError as what:
                error("KindleWriter: %s %s" % (mobimaker, what))
                raise SkipOutputFormat

            (stdout, stderr) = kindlegen.communicate()
        finally:
            # never leave the process in the output directory
            os.chdir(cwd)

        if kindlegen.returncode > 0:
            regex = re.compile(r'^(\w+)\(prcgen\):')

            # The standard streams may have no usable encoding and the
            # output may not be valid in it: fall back and replace.
            err_encoding = getattr(sys.stderr, 'encoding', None) or 'utf-8'
            out_encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'

            msg = stderr.rstrip()
            if msg:
                error(msg.decode(err_encoding, 'replace'))

            msg = stdout.rstrip().decode(out_encoding, 'replace')
            for line in msg.splitlines():
                match = regex.match(line)
                if match:
                    sline = regex.sub("", line)
                    g = match.group(1).lower()
                    if g == 'info':
                        # info lines are deliberately not logged
                        continue
                    elif g == 'warning':
                        if sline.startswith('Cover is too small'):
                            continue
                        if sline == 'Cover not specified':
                            continue
                        warning("kindlegen: %s" % sline)
                    elif g == 'error':
                        error("kindlegen: %s" % sline)
                    else:
                        error(line)

        info("Done Kindle file: %s" % outputfilename)
示例#18
0
def main():
    """ Main program.

    Reads the configuration, builds the queue of jobs -- unpickled from
    stdin if ``options.is_job_queue`` is set, else one job per requested
    type -- runs every job and finally runs the push packager, if one is
    configured and there was at least one job.

    Returns 1 if the configuration file is in error, else 0.

    """

    try:
        config()
    except configparser.Error as what:
        error("Error in configuration file: %s", str(what))
        return 1

    Logger.set_log_level(options.verbose)

    options.types = options.types or ['all']
    options.types = CommonCode.add_dependencies(options.types, DEPENDENCIES,
                                                BUILD_ORDER)
    debug("Building types: %s" % ' '.join(options.types))
    start_time = datetime.datetime.now()

    ParserFactory.load_parsers()
    WriterFactory.load_writers()
    PackagerFactory.load_packagers()

    if options.is_job_queue:
        # NOTE(security): unpickling can execute arbitrary code.  Feed
        # this only with job queues from a trusted producer.
        job_queue = cPickle.load(sys.stdin.buffer)  # read bytes
    else:
        options.dc = get_dc(
            options.url)  # this is when doc at url gets parsed!
        job_queue = []
        output_files = {}
        for type_ in options.types:
            job = CommonCode.Job(type_)
            job.url = options.url
            job.ebook = options.ebook
            job.dc = options.dc
            job.outputdir = options.outputdir
            job.outputfile = options.outputfile or make_output_filename(
                type_, options.dc)
            output_files[type_] = job.outputfile
            absoutputdir = os.path.abspath(job.outputdir)
            # the kindle jobs take the epub file built earlier as input
            if job.type == 'kindle.images':
                job.url = os.path.join(absoutputdir,
                                       output_files['epub.images'])
            elif job.type == 'kindle.noimages':
                job.url = os.path.join(absoutputdir,
                                       output_files['epub.noimages'])

            job_queue.append(job)

    for j in job_queue:
        options.dc = j.dc
        options.outputdir = j.outputdir
        do_job(j)

    packager = PackagerFactory.create(options.packager, 'push')
    # an empty queue leaves nothing to package
    if packager and job_queue:
        # HACK: the WWers ever only convert one ebook at a time
        job = job_queue[0]
        job.outputfile = '%d-final.zip' % (options.dc.project_gutenberg_id)
        packager.package(job)

    end_time = datetime.datetime.now()
    info(' Finished jobs. Total time: %s' % (end_time - start_time))
    return 0
示例#19
0
def do_job(job):
    """ Do one job.

    If the job has an url the url is spidered first.  Then the writer
    for the main type of the job builds the output, which is validated
    if ``options.validate`` is set and handed to the packager configured
    for the job type, if there is one.

    SkipOutputFormat is logged as warning, any other Exception is logged
    with traceback; neither is raised to the caller.  The log file of
    the job, if there is one, is closed in any case.

    """

    log_handler = None
    Logger.ebook = job.ebook
    if job.logfile:
        log_handler = open_log(
            os.path.join(os.path.abspath(job.outputdir), job.logfile))

    try:
        debug('=== Building %s ===' % job.type)
        start_time = datetime.datetime.now()
        try:
            if job.url:
                spider = Spider.Spider()
                dirpath = os.path.dirname(job.url)  # platform native path
                spider.include_urls += (options.include_urls
                                        or [parsers.webify_url(dirpath) + '/*']
                                        )  # use for parser only

                spider.include_mediatypes += options.include_mediatypes
                if job.subtype == '.images' or job.type == 'rst.gen':
                    spider.include_mediatypes.append('image/*')

                spider.exclude_urls += options.exclude_urls

                spider.exclude_mediatypes += options.exclude_mediatypes

                spider.max_depth = options.max_depth or six.MAXSIZE

                for rewrite in options.rewrite:
                    from_url, to_url = rewrite.split('>')
                    spider.add_redirection(from_url, to_url)

                attribs = parsers.ParserAttributes()
                attribs.url = parsers.webify_url(job.url)
                attribs.id = 'start'

                if options.input_mediatype:
                    attribs.orig_mediatype = attribs.HeaderElement.from_str(
                        options.input_mediatype)

                spider.recursive_parse(attribs)
                elect_coverpage(spider, job.url)
                job.url = spider.redirect(job.url)
                job.base_url = job.url
                job.spider = spider

            writer = WriterFactory.create(job.maintype)
            writer.build(job)

            if options.validate:
                writer.validate(job)

            packager = PackagerFactory.create(options.packager, job.type)
            if packager:
                packager.package(job)

            if job.type == 'html.images':
                # FIXME: hack for push packager
                options.html_images_list = list(job.spider.aux_file_iter())

        except SkipOutputFormat as what:
            warning("%s" % what)

        except Exception as what:
            exception("%s" % what)

        end_time = datetime.datetime.now()
        info(' %s made in %s' % (job.type, end_time - start_time))

    finally:
        # Close the log file even if something that is not an Exception,
        # e.g. a KeyboardInterrupt, ends the job.
        if log_handler:
            close_log(log_handler)
            log_handler = None
示例#20
0
    def build(self, job):
        """ Build PDF file.

        Converts the input to a tex file, runs the program configured as
        ``options.config.XELATEX`` on it inside the output directory and
        forwards the errors, and with ``options.verbose >= 1`` the
        warnings, found in the log file of the run.  The tex, log and
        aux files are removed unless ``options.verbose`` is 2 or more.

        Raises SkipOutputFormat if the parser for the input offers no
        ``rst2xetex``, if the output directory cannot be entered or if
        the program cannot be started.  The working directory of this
        process is restored in any case.

        """

        inputfilename = job.url
        outputdir = os.path.abspath(job.outputdir)
        outputfilename = os.path.join(outputdir, job.outputfile)

        debug("Inputfile: %s" % inputfilename)
        info("Creating PDF file: %s" % outputfilename)

        parser = ParserFactory.ParserFactory.create(inputfilename)

        if not hasattr(parser, 'rst2xetex'):
            warning('Skipping PDF Output because input mediatype is %s' %
                    parser.mediatype())
            raise SkipOutputFormat

        # Brain-dead xetex doesn't understand unix pipes
        # so we have to write a temp file

        basename = os.path.splitext(outputfilename)[0]
        texfilename = basename + '.tex'
        auxfilename = basename + '.aux'
        logfilename = basename + '.log'

        try:
            os.remove(auxfilename)
        except OSError:
            # no stale aux file around
            pass

        tex = parser.rst2xetex(job)
        with open(texfilename, 'wb') as fp:
            fp.write(tex)

        cwd = os.getcwd()
        try:
            try:
                os.chdir(outputdir)

                # Pass the absolute directory: a relative job.outputdir
                # would be resolved against the directory just entered.
                _xetex = subprocess.Popen(
                    [
                        options.config.XELATEX,
                        "-output-directory", outputdir,
                        "-interaction", "nonstopmode",
                        texfilename
                    ],
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE)
            except OSError as what:
                error("PDFWriter: %s %s" % (options.config.XELATEX, what))
                raise SkipOutputFormat

            _xetex.communicate()

            # the log may contain bytes that are not valid utf-8
            with open(logfilename, encoding='utf-8', errors='replace') as fp:
                for line in fp:
                    line = line.strip()
                    if 'Error:' in line:
                        error("xetex: %s" % line)
                    if options.verbose >= 1:
                        if 'Warning:' in line:
                            warning("xetex: %s" % line)

            if options.verbose < 2:
                # remove each file on its own, so that one missing file
                # does not keep the others around
                for tempfilename in (texfilename, logfilename, auxfilename):
                    try:
                        os.remove(tempfilename)
                    except OSError:
                        pass
        finally:
            # never leave the process in the output directory
            os.chdir(cwd)

        info("Done PDF file: %s" % outputfilename)
示例#21
0
 def commit(self):
     """ Close OCF Container.

     The container is closed before the success message is logged, so
     that the message is not written if closing fails.

     """
     self.close()
     info("Done Epub file: %s" % self.zipfilename)