Example #1
    def open_file(cls, url, attribs):
        """ Open a local file for parsing. """
        def open_file_from_path(path):
            try:
                return open(path, 'rb')
            except FileNotFoundError:
                error('Missing file: %s' % path)
            except IsADirectoryError:
                error('File is a directory: %s' % path)
            return None

        if re.search(r'^([a-zA-Z]:|/)', url):
            fp = open_file_from_path(url)
        else:
            try:
                # handles all the flavors of file: urls, including on windows
                fp = urllib.request.urlopen(url)
            except urllib.error.URLError as what:
                fp = None
                error('Missing file: %s' % what.reason)
            except ValueError:  # just a relative path?
                fp = open_file_from_path(url)

        attribs.orig_mediatype = attribs.HeaderElement(
            MediaTypes.guess_type(url))

        debug("... got mediatype %s from guess_type" %
              str(attribs.orig_mediatype))
        attribs.orig_url = attribs.url = url
        return fp
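
A note on the dispatch above: the regex r'^([a-zA-Z]:|/)' treats anything starting with a drive letter or a slash as a filesystem path, and everything else as a URL; plain relative paths only reach open_file_from_path() through the ValueError fallback. A minimal sketch of the same test (is_local_path is a hypothetical helper, not part of the source):

    import re

    def is_local_path(url):
        # a windows drive letter or a leading slash means filesystem path
        return re.search(r'^([a-zA-Z]:|/)', url) is not None

    assert is_local_path('/home/user/book.html')
    assert is_local_path('C:\\books\\book.html')
    assert not is_local_path('file:///home/user/book.html')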
Example #2
    def validate(self, job):
        """ Validate generated epub using external tools. """

        debug("Validating %s ..." % job.outputfile)

        filename = os.path.join(job.outputdir, job.outputfile)

        for validator in (options.config.EPUB_VALIDATOR,
                          options.config.EPUB_PREFLIGHT):
            if validator is not None:
                params = validator.split() + [filename]
                checker = subprocess.Popen(params,
                                           stdin=subprocess.PIPE,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)

                (dummy_stdout, stderr) = checker.communicate()
                if stderr:
                    error(stderr)
                    return 1
                    #raise AssertionError (
                    #    "%s does not validate." % job.outputfile)

        debug("%s validates ok." % job.outputfile)
        return 0
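
The check above treats any output on stderr as a validation failure and returns a non-zero status. A hedged equivalent using subprocess.run (the validator command line is an assumed example, not taken from the source):

    import subprocess

    def validates(validator_cmd, filename):
        result = subprocess.run(validator_cmd.split() + [filename],
                                capture_output=True)
        return not result.stderr

    # validates('java -jar epubcheck.jar', 'book.epub')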
Example #3
    def package(self, job):
        self.setup(job)
        zipfilename = job.outputfile  # filename is zipfile

        m = re.match(r'\d+', zipfilename)
        if m:
            ebook_no = m.group(0)
        else:
            error('Invalid filename %s for push packager.' % zipfilename)
            return

        zip_ = self.create(zipfilename)

        for suffix in '.txt -8.txt -0.txt .zip -8.zip -0.zip -rst.zip -h.zip'.split():
            filename = '%s%s' % (ebook_no, suffix)
            memberfilename = '%s/%s' % (ebook_no, filename)
            self.add(zip_, filename, memberfilename)

        for suffix, ext in (('-h', 'html'), ('-rst', 'rst')):
            filename = '%s%s.%s' % (ebook_no, suffix, ext)
            memberfilename = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, filename)
            self.add(zip_, filename, memberfilename)

            # image files
            for url in options.html_images_list:
                rel_url = gg.make_url_relative(job.base_url, url)
                filename = os.path.join(self.path, rel_url)
                memberfilename = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, rel_url)
                self.add(zip_, filename, memberfilename)

        zip_.close()
        info('Done Zip file: %s' % zipfilename)
Example #4
    def copy_aux_files(self, job, dest_dir):
        """ Copy image files to dest_dir. Use image data cached in parsers. """

        for p in job.spider.parsers:
            if hasattr(p, 'resize_image'):
                src_uri = p.attribs.url
                fn_dest = gg.make_url_relative(webify_url(job.base_url),
                                               src_uri)
                fn_dest = os.path.join(dest_dir, fn_dest)

                # debug ('base_url =  %s, src_uri = %s' % (job.base_url, src_uri))

                if gg.is_same_path(src_uri, fn_dest):
                    debug('Not copying %s to %s: same file' %
                          (src_uri, fn_dest))
                    continue
                debug('Copying %s to %s' % (src_uri, fn_dest))

                fn_dest = gg.normalize_path(fn_dest)
                gg.mkdir_for_filename(fn_dest)
                try:
                    with open(fn_dest, 'wb') as fp_dest:
                        fp_dest.write(p.serialize())
                except IOError as what:
                    error('Cannot copy %s to %s: %s' %
                          (src_uri, fn_dest, what))
Example #5
    def decode(self, charset):
        """ Try to decode document contents to unicode. """
        if charset is None:
            return None

        charset = charset.lower().strip()

        if charset in BOGUS_CHARSET_NAMES:
            charset = BOGUS_CHARSET_NAMES[charset]

        if charset == 'utf-8':
            charset = 'utf_8_sig'

        try:
            debug("Trying to decode document with charset %s ..." % charset)
            buffer = self.bytes_content()
            buffer = REB_PG_CHARSET.sub(b'', buffer)
            buffer = buffer.decode(charset)
            self.attribs.orig_mediatype.params['charset'] = charset
            return buffer
        except LookupError as what:
            # unknown charset,
            error("Invalid charset name: %s (%s)" % (charset, what))
        except UnicodeError as what:
            # mis-stated charset, did not decode
            error("Text not in charset %s (%s)" % (charset, what))
        return None
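
The utf_8_sig switch above makes the decoder transparently skip a UTF-8 byte order mark if one is present. A small sketch of the charset normalization (the BOGUS_CHARSET_NAMES entries here are invented for illustration):

    BOGUS_CHARSET_NAMES = {'iso-latin-1': 'iso-8859-1'}  # assumed entries

    def normalize_charset(charset):
        charset = charset.lower().strip()
        charset = BOGUS_CHARSET_NAMES.get(charset, charset)
        return 'utf_8_sig' if charset == 'utf-8' else charset

    assert b'\xef\xbb\xbfhello'.decode(normalize_charset('UTF-8')) == 'hello'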
Example #6
    def __parse(self, html):
        # remove xml decl and doctype, we will add the correct one before serializing
        # html = re.compile ('^.*<html ', re.I | re.S).sub ('<html ', html)
        # FIXME: do not remove doctype because we need it to load the dtd

        # remove xml declaration because of parser error: "Unicode
        # strings with encoding declaration are not supported. Please
        # use bytes input or XML fragments without declaration."
        re_xml_decl = re.compile(r'^.*?<\?xml.*?\?>', re.S | re.U)
        html = re_xml_decl.sub('', html)
        try:
            return etree.fromstring(html,
                                    lxml.html.XHTMLParser(huge_tree=True),
                                    base_url=self.attribs.url)
        except etree.ParseError as what:
            # cannot try HTML parser because we depend on correct xhtml namespace
            m = re.search(r"Entity '([^']+)'", str(what))
            if m:
                warning("Missing entity: '%s'" % m.group(1))
            else:
                error("Failed to parse file because: %s" % what)
            m = re.search(r'line\s(\d+),', str(what))
            if m:
                lineno = int(m.group(1))
                error("Line %d: %s" % (lineno, html.splitlines()[lineno - 1]))
            raise
Example #7
    def translate(self):
        visitor = self.translator_class(self.document)
        del Unitame.unhandled_chars[:]
        self.document.walkabout(visitor)
        self.output = visitor.astext()
        if Unitame.unhandled_chars:
            error("unitame: unhandled chars: %s" % ", ".join(set(Unitame.unhandled_chars)))
Example #8
    def shipout(self, job, parsers, ncx):
        """ Build the zip file. """

        try:
            ocf = OEBPSContainer(
                os.path.join(job.outputdir, job.outputfile),
                ('%d/' % options.ebook if options.ebook else None))

            opf = ContentOPF()

            opf.metadata_item(job.dc)

            # write out parsers

            for p in parsers:
                try:
                    ocf.add_bytes(self.url2filename(p.attribs.url),
                                  p.serialize(), p.mediatype())
                    if p.mediatype() == mt.xhtml:
                        opf.spine_item_from_parser(p)
                    else:
                        opf.manifest_item_from_parser(p)
                except Exception as what:
                    error("Could not process file %s: %s" %
                          (p.attribs.url, what))

            # toc

            for t in ncx.toc:
                if t[1].lower().strip(' .') in TOC_HEADERS:
                    opf.guide_item(t[0], 'toc', t[1])
                    break

            opf.toc_item('toc.ncx')
            ocf.add_unicode('toc.ncx', six.text_type(ncx))

            for p in parsers:
                if 'coverpage' in p.attribs.rel:
                    opf.add_coverpage(ocf, p.attribs.url)
                    break

            # Adobe page-map

            # opf.pagemap_item ('page-map.xml')
            # ocf.add_unicode ('page-map.xml', six.text_type (AdobePageMap (ncx)))

            # content.opf

            # debug (etree.tostring (opf.manifest, encoding=six.text_type, pretty_print=True))

            opf.rewrite_links(self.url2filename)
            ocf.add_unicode('content.opf', six.text_type(opf))

            ocf.commit()

        except Exception as what:
            exception("Error building Epub: %s" % what)
            ocf.rollback()
            raise
Example #9
def open_file_from_path(path):
    try:
        return open(path, 'rb')
    except FileNotFoundError:
        error('Missing file: %s' % path)
    except IsADirectoryError:
        error('File is a directory: %s' % path)
    return None
Example #10
def main():
    """ Main program. """

    try:
        config()
    except configparser.Error as what:
        error("Error in configuration file: %s", str(what))
        return 1

    Logger.set_log_level(options.verbose)

    options.types = options.types or ['all']
    options.types = CommonCode.add_dependencies(options.types, DEPENDENCIES,
                                                BUILD_ORDER)
    debug("Building types: %s" % ' '.join(options.types))

    ParserFactory.load_parsers()
    WriterFactory.load_writers()
    PackagerFactory.load_packagers()

    if options.is_job_queue:
        job_queue = cPickle.load(sys.stdin.buffer)  # read bytes
    else:
        options.dc = get_dc(options.url)
        job_queue = []
        output_files = dict()
        for type_ in options.types:
            job = CommonCode.Job(type_)
            job.url = options.url
            job.ebook = options.ebook
            job.dc = options.dc
            job.outputdir = options.outputdir
            job.outputfile = options.outputfile or make_output_filename(
                type_, options.dc)
            output_files[type_] = job.outputfile

            if job.type == 'kindle.images':
                job.url = os.path.join(job.outputdir,
                                       output_files['epub.images'])
            elif job.type == 'kindle.noimages':
                job.url = os.path.join(job.outputdir,
                                       output_files['epub.noimages'])

            job_queue.append(job)

    for j in job_queue:
        do_job(j)

    packager = PackagerFactory.create(options.packager, 'push')
    if packager:
        # HACK: the WWers only ever convert one ebook at a time
        job = job_queue[0]
        job.outputfile = '%d-final.zip' % (options.dc.project_gutenberg_id)
        packager.package(job)

    return 0
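
The kindle jobs read the epub files produced earlier in the same run, so add_dependencies has to pull in the required types and keep them in build order. A sketch of what that expansion presumably looks like (the DEPENDENCIES and BUILD_ORDER shapes are assumptions, not the real structures):

    def add_dependencies(types, dependencies, build_order):
        """ Pull in required types, then sort by build order. """
        wanted = set(types)
        for type_ in list(wanted):
            wanted.update(dependencies.get(type_, []))
        return sorted(wanted, key=build_order.index)

    deps = {'kindle.images': ['epub.images']}
    order = ['epub.images', 'kindle.images']
    assert add_dependencies(['kindle.images'], deps, order) == order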
Example #11
    def tidy(html):
        """ Pipe html through w3c tidy. """

        html = parsers.RE_RESTRICTED.sub('', html)
        html = RE_XMLDECL.sub('', html)
        html = parsers.RE_HTML_CHARSET.sub('; charset=utf-8', html)

        # convert to xhtml
        tidy = subprocess.Popen(
            ["tidy",
             "-utf8",
             "-clean",
             "--wrap",             "0",
             # "--drop-font-tags",   "y",
             # "--drop-proprietary-attributes", "y",
             # "--add-xml-space",    "y",
             "--output-xhtml",     "y",
             "--numeric-entities", "y",
             "--merge-divs",       "n",  # keep poetry indentation
             "--merge-spans",      "n",
             "--add-xml-decl",     "n",
             "--doctype",          "strict",
             "--anchor-as-name",   "n",
             "--enclose-text",     "y"],

            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)

        (html, stderr) = tidy.communicate(html.encode('utf-8'))

        regex = re.compile(r'(Info:|Warning:|Error:)\s*', re.I)

        # pylint: disable=E1103
        msg = stderr.decode(sys.stderr.encoding).strip()
        for line in msg.splitlines():
            match = regex.search(line)
            if match:
                sline = regex.sub("", line)
                g = match.group(1).lower()
                if g == 'info:':
                    info("tidy: %s" % sline)
                elif g == 'warning:':
                    warning("tidy: %s" % sline)
                elif g == 'error:':
                    error("tidy: %s" % sline)
                else:
                    error(line)

        if tidy.returncode == 2:
            raise ValueError(stderr)

        return html.decode('utf-8')
Example #12
def generate_cover(dir):
    try:
        cover_image = Cover.draw(options.dc, cover_width=1200, cover_height=1800)
        cover_url = os.path.join(dir, make_output_filename('cover', options.dc))
        with open(cover_url, 'wb+') as cover:
            cover_image.save(cover)
        return cover_url
    except OSError:
        error("OSError, Cairo not installed or couldn't write file.")
        return None
Example #13
    def bytes_content(self):
        """ Get document content as raw bytes. """

        if self.buffer is None:
            try:
                debug("Fetching %s ..." % self.attribs.url)
                self.buffer = self.fp.read()
                self.fp.close()
            except IOError as what:
                error(what)

        return self.buffer
Example #14
    def package(self, job):
        self.setup(job)
        filename = self.path_name_ext
        gzfilename = filename + GZIP_EXTENSION

        try:
            info('Creating Gzip file: %s' % gzfilename)
            info('  Adding file: %s' % filename)
            with open(filename, 'rb') as fp:
                with gzip.open(gzfilename, 'wb') as fpgz:
                    fpgz.writelines(fp)
            info('Done Gzip file: %s' % gzfilename)
        except IOError as what:
            error(what)
Example #15
    def to_xhtml(self, html, base_url):
        html = html.replace('&nbsp;', ' ')
        html = html.replace('&mdash;', '—')

        try:
            xhtml = etree.fromstring(html,
                                     lxml.html.XHTMLParser(),
                                     base_url=base_url)
        except etree.ParseError as what:
            error("etree.fromstring says %s" % what)
            raise

        xhtml.make_links_absolute(base_url=base_url)

        return xhtml
Example #16
    def apply(self, **kwargs):
        if self.document.settings.encoding != 'utf-8':
            charset = self.document.settings.encoding
            del Unitame.unhandled_chars[:]

            for n in self.document.traverse(nodes.Text):
                text = n.astext()
                text2 = text.encode(charset, 'unitame').decode(charset)
                if text != text2:
                    n.parent.replace(
                        n, nodes.Text(text2))  # cannot change text nodes

            if Unitame.unhandled_chars:
                error("unitame: unhandled chars: %s" %
                      ", ".join(set(Unitame.unhandled_chars)))
Example #17
def __load_writers_from(package_name):
    """ See what types we can write. """

    for fn in resource_listdir(package_name, ''):
        modulename, ext = os.path.splitext(fn)
        if ext == '.py' and modulename.endswith('Writer'):
            type_ = modulename.lower().replace('writer', '')
            try:
                debug("Loading writer type %s from module %s" %
                      (type_, modulename))
                module = __import__(package_name + '.' + modulename,
                                    fromlist=[modulename])
                writers[type_] = module
            except ImportError as what:
                error("Could not load writer type %s from module %s. %s" %
                      (type_, modulename, what))
Example #18
    def rewrite_internal_links_toc(self, toc):
        """ Rewrite links to point into right chunks.

        Because we split the HTML into chunks, all internal links need
        to be rewritten to become links into the right chunk.
        Rewrite all links in the passed toc.

        """

        bad_entries = []
        for entry in toc:
            try:
                entry[0] = self.idmap[normalize_uri(entry[0])]
            except KeyError:
                error("HTMLChunker: Cannot rewrite toc entry '%s'" % entry[0])
                error(repr(self.idmap))
                bad_entries.append(entry)

        # 'del entry' would only unbind the loop variable; remove the
        # unresolvable entries from the toc instead
        for entry in bad_entries:
            toc.remove(entry)
Example #19
    def build(self, job):
        """ Build RST file. """

        filename = os.path.join(os.path.abspath(job.outputdir), job.outputfile)

        info("Creating RST file: %s" % filename)

        parser = ParserFactory.ParserFactory.create(job.url)

        if not hasattr(parser, 'rst2nroff'):
            error('RSTWriter can only work on an RSTParser.')
            raise SkipOutputFormat

        data = parser.preprocess('utf-8').encode('utf-8')

        self.write_with_crlf(filename, data)

        info("Done RST file: %s" % filename)
Example #20
    def rewrite_internal_links(self):
        """ Rewrite links to point into right chunks.

        Because we split the HTML into chunks, all internal links need
        to be rewritten to become links into the right chunk.
        Rewrite all internal links in all chunks.

        """
        for chunk in self.chunks:
            for a in xpath(chunk[0], "//xhtml:*[@href]"):
                try:
                    uri = normalize_uri(a.get('href'))
                    a.set('href', self.idmap[uri])
                except KeyError:
                    ur, dummy_frag = urllib.parse.urldefrag(uri)
                    if ur in self.idmap:
                        error(
                            "HTMLChunker: Cannot rewrite internal link '%s'" %
                            uri)
Example #21
    def enqueue(self, queue, depth, attribs, is_doc):
        """ Enqueue url for parsing."""
        if is_doc:
            if not self.is_included_url(attribs):
                warning('External link in %s: %s' %
                        (attribs.referrer, attribs.url))
                return
            if depth >= self.max_depth:
                error('Omitted file %s due to depth > max_depth' % attribs.url)
                return
        if not self.is_included_mediatype(
                attribs) and not self.is_included_relation(attribs):
            return
        elif not self.is_included_url(
                attribs) and not self.is_included_relation(attribs):
            error(
                'Failed for embedded media in %s from disallowed location: %s'
                % (attribs.referrer, attribs.url))
            return

        queue.append((depth, attribs))
Example #22
    def apply(self, **kwargs):
        iter_ = self.startnode.traverse(nodes.paragraph, siblings=1)

        if len(iter_):
            para = iter_[0]
            iter_ = para.traverse(nodes.Text)
            details = self.startnode.details

            if len(iter_):
                textnode = iter_[0]
                charnode = spannode = restnode = None

                char = details['char']
                if not textnode.startswith(char):
                    error("Dropcap: next paragraph doesn't start with: '%s'." %
                          char)
                    return

                span = details.get('span', '')
                if not textnode.startswith(span):
                    error("Dropcap: next paragraph doesn't start with: '%s'." %
                          span)
                    return
                if span and not span.startswith(char):
                    error("Dropcap: span doesn't start with: '%s'." % char)
                    return
                if span == char:
                    span = ''

                if span:
                    # split into char/span/rest
                    restnode = nodes.Text(textnode.astext()[len(span):])
                    spannode = nodes.inline()
                    spannode.append(
                        nodes.Text(textnode.astext()[len(char):len(span)]))
                    spannode['classes'].append('dropspan')
                else:
                    # split into char/rest
                    restnode = nodes.Text(textnode.astext()[len(char):])
                    spannode = nodes.inline('', '')
                    spannode['classes'].append('dropspan')

                if 'image' in details:
                    charnode = nodes.image()
                    charnode['uri'] = details['image']
                    charnode['alt'] = char
                    # debug ("Inserting image %s as dropcap." % uri)
                else:
                    charnode = nodes.inline()
                    charnode.append(nodes.Text(char))
                    # debug ("Inserting char %s as dropcap." % char)

                charnode['classes'].append('dropcap')
                charnode.attributes.update(details)

                para.replace(textnode, [charnode, spannode, restnode])

        self.startnode.parent.remove(self.startnode)
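
The three-way split above relies on span being a prefix of the paragraph text and char being a prefix of span. The slicing itself is simple string arithmetic (sample values invented):

    text = 'Once upon a time'
    char, span = 'O', 'Once'

    dropcap = text[:len(char)]             # 'O'   -> charnode
    dropspan = text[len(char):len(span)]   # 'nce' -> spannode
    rest = text[len(span):]                # ' upon a time' -> restnode
    assert dropcap + dropspan + rest == text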
Example #23
    def resize_image(self, max_size, max_dimen, output_format=None):
        """ Create a new parser with a resized image. """

        new_parser = Parser()

        try:
            image = Image.open(six.BytesIO(self.image_data))

            format_ = image.format.lower()
            if output_format:
                format_ = output_format
            if format_ == 'gif':
                format_ = 'png'
            if format_ == 'jpeg' and image.mode.lower() != 'rgb':
                image = image.convert('RGB')

            if 'dpi' in image.info:
                del image.info['dpi']

            # maybe resize image

            # find scaling factor
            scale = 1.0
            scale = min(scale, max_dimen[0] / float(image.size[0]))
            scale = min(scale, max_dimen[1] / float(image.size[1]))

            was = ''
            if scale < 1.0:
                dimen = (int(image.size[0] * scale),
                         int(image.size[1] * scale))
                was = "(was %d x %d scale=%.2f) " % (image.size[0],
                                                     image.size[1], scale)
                image = image.resize(dimen, Image.ANTIALIAS)

            # find best quality that fits into max_size
            data = self.image_data
            if (scale < 1.0) or (len(self.image_data) > max_size):
                for quality in (90, 85, 80, 70, 60, 50, 40, 30, 20, 10):
                    buf = six.BytesIO()
                    image.save(buf, format_, quality=quality)
                    data = buf.getvalue()
                    if len(data) <= max_size:
                        was += 'q=%d' % quality
                        break

            comment = "Image: %d x %d size=%d %s" % (
                image.size[0], image.size[1], len(data), was)
            debug(comment)

            new_parser.image_data = data
            new_parser.dimen = tuple(image.size)

            new_parser.attribs = copy.copy(self.attribs)
            new_parser.attribs.comment = comment
            new_parser.fp = self.fp

        except IOError as what:
            error("Could not resize image: %s" % what)
            return ParserFactory.create(BROKEN)

        return new_parser
Example #24
    def _fix_anchors(self):
        """ Move name to id and fix hrefs and ids. """

        # move anchor name to id
        # 'id' values are more strict than 'name' values
        # try to fix ill-formed ids

        seen_ids = set()

        for anchor in (xpath(self.xhtml, "//xhtml:a[@name]") +
                       xpath(self.xhtml, "//xhtml:*[@id]")):
            id_ = anchor.get('id') or anchor.get('name')

            if 'name' in anchor.attrib:
                del anchor.attrib['name']
            if 'id' in anchor.attrib:
                del anchor.attrib['id']
            if NS.xml.id in anchor.attrib:
                del anchor.attrib[NS.xml.id]

            id_ = self._fix_id(id_)

            if not parsers.RE_XML_NAME.match(id_):
                error("Dropping ill-formed id '%s' in %s" %
                      (id_, self.attribs.url))
                continue

            # well-formed id
            if id_ in seen_ids:
                error("Dropping duplicate id '%s' in %s" %
                      (id_, self.attribs.url))
                continue

            seen_ids.add(id_)
            anchor.set('id', id_)

        # try to fix bogus fragment ids
        # 1. fragments point to xml:id, so must be well-formed ids
        # 2. the ids they point to must exist

        for link in xpath(self.xhtml, "//xhtml:*[@href]"):
            href = link.get('href')
            hre, frag = urllib.parse.urldefrag(href)
            if frag:
                frag = self._fix_internal_frag(frag)

                if not frag:
                    # non-recoverable ill-formed frag
                    del link.attrib['href']
                    self.add_class(link, 'pgkilled')
                    error('Dropping ill-formed frag in %s' % href)
                    continue

                # well-formed frag
                if hre:
                    # we have url + frag
                    link.set(
                        'href', "%s#%s" %
                        (hre, urllib.parse.quote(frag.encode('utf-8'))))
                    self.add_class(link, 'pgexternal')
                elif frag in seen_ids:
                    # we have only frag
                    link.set('href',
                             "#%s" % urllib.parse.quote(frag.encode('utf-8')))
                    self.add_class(link, 'pginternal')
                else:
                    del link.attrib['href']
                    self.add_class(link, 'pgkilled')
                    error("Dropping frag to non-existing id in %s" % href)
Example #25
    def build(self, job):
        """ Build PDF file. """

        inputfilename = job.url
        outputfilename = os.path.join(os.path.abspath(job.outputdir),
                                      job.outputfile)

        debug("Inputfile: %s" % inputfilename)
        info("Creating PDF file: %s" % outputfilename)

        parser = ParserFactory.ParserFactory.create(inputfilename)

        if not hasattr(parser, 'rst2xetex'):
            warning('Skipping PDF Output because input mediatype is %s' %
                    parser.mediatype())
            raise SkipOutputFormat

        # Brain-dead xetex doesn't understand unix pipes
        # so we have to write a temp file

        texfilename = os.path.splitext(outputfilename)[0] + '.tex'
        auxfilename = os.path.splitext(outputfilename)[0] + '.aux'
        logfilename = os.path.splitext(outputfilename)[0] + '.log'

        try:
            os.remove(auxfilename)
        except OSError:
            pass

        tex = parser.rst2xetex(job)
        with open(texfilename, 'wb') as fp:
            fp.write(tex)

        try:
            cwd = os.getcwd()
            os.chdir(os.path.abspath(job.outputdir))

            _xetex = subprocess.Popen([
                options.config.XELATEX, "-output-directory", job.outputdir,
                "-interaction", "nonstopmode", texfilename
            ],
                                      stdin=subprocess.PIPE,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE)
        except OSError as what:
            os.chdir(cwd)
            error("PDFWriter: %s %s" % (options.config.XELATEX, what))
            raise SkipOutputFormat

        (dummy_stdout, dummy_stderr) = _xetex.communicate()

        with open(logfilename, encoding='utf-8') as fp:
            for line in fp:
                line = line.strip()
                if 'Error:' in line:
                    error("xetex: %s" % line)
                if options.verbose >= 1:
                    if 'Warning:' in line:
                        warning("xetex: %s" % line)

        if options.verbose < 2:
            try:
                os.remove(texfilename)
                os.remove(logfilename)
                os.remove(auxfilename)
            except OSError:
                pass

        os.chdir(cwd)

        info("Done PDF file: %s" % outputfilename)
Example #26
    def groff(self, job, nroff, encoding='utf-8'):
        """ Process thru groff.

        Takes and returns unicode strings!

        """

        device = {
            'utf-8': 'utf8',
            'iso-8859-1': 'latin1',
            'us-ascii': 'ascii'
        }[encoding]

        nroff = nroff.encode(encoding)
        nrofffilename = os.path.join(
            os.path.abspath(job.outputdir),
            os.path.splitext(job.outputfile)[0] + '.nroff')

        # write nroff file for debugging
        if options.verbose >= 2:
            with open(nrofffilename, 'wb') as fp:
                fp.write(nroff)
        else:
            try:
                # remove debug files from previous runs
                os.remove(nrofffilename)
            except OSError:
                pass

        # call groff
        try:
            _groff = subprocess.Popen(
                [
                    options.config.GROFF,
                    "-t",  # preprocess with tbl
                    "-K",
                    device,  # input encoding
                    "-T",
                    device
                ],  # output device
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
        except OSError:
            error("TxtWriter: executable not found: %s" % options.config.GROFF)
            raise SkipOutputFormat

        (txt, stderr) = _groff.communicate(nroff)

        # pylint: disable=E1103
        for line in stderr.splitlines():
            line = line.decode(sys.stderr.encoding)
            line = line.strip()
            if 'error' in line:
                error("groff: %s" % line)
            elif 'warn' in line:
                if options.verbose >= 1:
                    warning("groff: %s" % line)

        txt = txt.decode(encoding)
        return txt.translate(u2u)  # fix nroff idiosyncrasies
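
The u2u table at the end is a str.translate() mapping from code point to replacement text. A hedged sketch of how such a table might be built (the specific groff output characters mapped here are assumptions):

    # groff's utf8 device emits e.g. U+2010 HYPHEN where plain text wants '-'
    u2u = str.maketrans({
        '\u2010': '-',    # hyphen -> hyphen-minus
        '\u2019': "'",    # right single quote -> apostrophe
    })

    assert 'it\u2019s a hyphen\u2010test'.translate(u2u) == "it's a hyphen-test"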
Example #27
    def resize_image(self, max_size, max_dimen, output_format=None):
        """ Create a new parser with a resized image. """
        def scale_image(image, scale):
            was = ''
            if scale < 1.0:
                dimen = (int(image.size[0] * scale),
                         int(image.size[1] * scale))
                was = "(was %d x %d scale=%.2f) " % (image.size[0],
                                                     image.size[1], scale)
                image = image.resize(dimen, Image.ANTIALIAS)
            return was, image

        def get_image_data(image, format_, quality=95):
            buf = six.BytesIO()
            if format_ == 'png':
                image.save(buf, 'png', optimize=True)
            else:
                image.save(buf, 'jpeg', quality=quality)
            return buf.getvalue()

        new_parser = Parser()

        try:
            unsized_image = Image.open(six.BytesIO(self.image_data))

            format_ = unsized_image.format.lower()
            if output_format:
                format_ = output_format
            if format_ == 'gif':
                format_ = 'png'
            if format_ == 'jpeg' and unsized_image.mode.lower() not in ('rgb',
                                                                        'l'):
                unsized_image = unsized_image.convert('RGB')

            if 'dpi' in unsized_image.info:
                del unsized_image.info['dpi']

            # maybe resize image

            # find scaling factor
            scale = 1.0
            scale = min(scale, max_dimen[0] / float(unsized_image.size[0]))
            scale = min(scale, max_dimen[1] / float(unsized_image.size[1]))

            was, image = scale_image(unsized_image, scale)
            data = get_image_data(image, format_)

            if format_ == 'png':
                # scale it till it fits into max_size
                while len(data) > max_size and scale > 0.01:
                    scale = scale * 0.8
                    was, image = scale_image(unsized_image, scale)
                    data = get_image_data(image, format_)
            else:
                # find best quality that fits into max_size
                if len(data) > max_size:
                    for quality in (90, 85, 80, 70, 60, 50, 40, 30, 20, 10):
                        data = get_image_data(image, format_, quality=quality)
                        if len(data) <= max_size:
                            break

                    was += 'q=%d' % quality
            comment = "Image: %d x %d size=%d %s" % (
                image.size[0], image.size[1], len(data), was)
            debug(comment)

            new_parser.image_data = data
            new_parser.dimen = tuple(image.size)

            new_parser.attribs = copy.copy(self.attribs)
            new_parser.attribs.comment = comment
            new_parser.fp = self.fp

        except IOError as what:
            error("Could not resize image: %s" % what)
            return ParserFactory.create(BROKEN)

        return new_parser
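
Two shrink strategies appear above: PNG has no lossy quality knob, so the image is rescaled by a factor of 0.8 per round until the encoded file fits, while JPEG keeps its dimensions and steps down the quality instead. The PNG loop in isolation (rendered_size is a stand-in for scale_image() plus get_image_data()):

    max_size = 100

    def rendered_size(scale):
        # pretend the encoded size falls with the scaled area
        return int(1000 * scale * scale)

    scale = 1.0
    while rendered_size(scale) > max_size and scale > 0.01:
        scale *= 0.8  # each round shrinks the area to 64 %

    assert rendered_size(scale) <= max_size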
Example #28
    def build(self, job):
        """ Build kindle file from epub using amazon kindlegen. """

        info("Creating Kindle file: %s" %
             os.path.join(job.outputdir, job.outputfile))
        info("            ... from: %s" % job.url)

        try:
            cwd = os.getcwd()
            os.chdir(job.outputdir)

            kindlegen = subprocess.Popen([
                options.config.MOBIGEN, '-o',
                os.path.basename(job.outputfile), job.url
            ],
                                         stdin=subprocess.PIPE,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE)

        except OSError as what:
            os.chdir(cwd)
            error("KindleWriter: %s %s" % (options.config.MOBIGEN, what))
            raise SkipOutputFormat

        (stdout, stderr) = kindlegen.communicate()

        os.chdir(cwd)

        if kindlegen.returncode > 0:
            regex = re.compile(r'^(\w+)\(prcgen\):')

            # pylint: disable=E1103
            msg = stderr.rstrip()
            if msg:
                msg = msg.decode(sys.stderr.encoding)
                error(msg)
            msg = stdout.rstrip()
            msg = msg.decode(sys.stdout.encoding)
            for line in msg.splitlines():
                match = regex.match(line)
                if match:
                    sline = regex.sub("", line)
                    g = match.group(1).lower()
                    if g == 'info':
                        if sline == 'MOBI File generated with WARNINGS!':
                            # we knew that already
                            continue
                        # info ("kindlegen: %s" % sline)
                    elif g == 'warning':
                        if sline.startswith('Cover is too small'):
                            continue
                        if sline == 'Cover not specified':
                            continue
                        warning("kindlegen: %s" % sline)
                    elif g == 'error':
                        error("kindlegen: %s" % sline)
                    else:
                        error(line)

        info("Done Kindle file: %s" %
             os.path.join(job.outputdir, job.outputfile))
Example #29
    def build(self, job):
        """ Build kindle file from epub using amazon kindlegen or calibre. """

        mobimaker = None
        if job.dc.languages:
            if job.dc.languages[0].id in no_kindlegen_langs:
                mobimaker = options.config.MOBILANG
            else:
                mobimaker = options.config.MOBIGEN
        if not mobimaker:
            info('no mobimaker available')
            return

        # kindlegen needs localized paths
        outputdir = os.path.abspath(job.outputdir)

        info("Creating Kindle file: %s" % os.path.join(outputdir, job.outputfile))
        info("            ... from: %s" % job.url)

        try:
            cwd = os.getcwd()
            os.chdir(outputdir)
            if 'ebook-convert' in mobimaker:
                kindlegen = subprocess.Popen(
                    [
                        mobimaker,
                        job.url,
                        os.path.basename(job.outputfile),
                        '--personal-doc="[EBOK]"',
                    ],
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE
                )
            else:
                kindlegen = subprocess.Popen(
                    [
                        mobimaker,
                        '-o', os.path.basename(job.outputfile),
                        job.url
                    ],
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE
                )

        except OSError as what:
            os.chdir(cwd)
            error("KindleWriter: %s %s" % (mobimaker, what))
            raise SkipOutputFormat

        (stdout, stderr) = kindlegen.communicate()

        os.chdir(cwd)

        if kindlegen.returncode > 0:
            regex = re.compile(r'^(\w+)\(prcgen\):')

            # pylint: disable=E1103
            msg = stderr.rstrip()
            if msg:
                msg = msg.decode(sys.stderr.encoding)
                error(msg)
            msg = stdout.rstrip()
            msg = msg.decode(sys.stdout.encoding)
            for line in msg.splitlines():
                match = regex.match(line)
                if match:
                    sline = regex.sub("", line)
                    g = match.group(1).lower()
                    if g == 'info':
                        if sline == 'MOBI File generated with WARNINGS!':
                            # we knew that already
                            continue
                        # info("kindlegen: %s" % sline)
                    elif g == 'warning':
                        if sline.startswith('Cover is too small'):
                            continue
                        if sline == 'Cover not specified':
                            continue
                        warning("kindlegen: %s" % sline)
                    elif g == 'error':
                        error("kindlegen: %s" % sline)
                    else:
                        error(line)

        info("Done Kindle file: %s" % os.path.join(outputdir, job.outputfile))
Example #30
    def resize_image(self, max_size, max_dimen, output_format=None):
        """ Create a new parser with a resized image. """
        def scale_image(image, scale):
            was = ''
            if scale < 1.0:
                dimen = (int(image.size[0] * scale),
                         int(image.size[1] * scale))
                was = "(was %d x %d scale=%.2f) " % (image.size[0],
                                                     image.size[1], scale)
                image = image.resize(dimen, Image.ANTIALIAS)
            return was, image

        def get_image_data(image, format_, quality='keep'):
            """ Format is the output format, not necessarily the input format """
            buf = six.BytesIO()
            if image.format != 'JPEG' and quality == 'keep':
                quality = 90
            if format_ == 'png':
                image.save(buf, 'png', optimize=True)
            else:
                try:
                    image.save(buf, 'jpeg', quality=quality)
                except ValueError as e:
                    if quality == 'keep' and 'quantization' in str(e):
                        image.save(buf, 'jpeg', quality=90)
                    else:
                        raise e
            return buf.getvalue()

        new_parser = Parser()

        try:
            unsized_image = Image.open(six.BytesIO(self.image_data))

            format_ = unsized_image.format.lower()
            if output_format:
                format_ = output_format
            if format_ == 'gif':
                format_ = 'png'
            if format_ == 'jpeg' and unsized_image.mode.lower() not in ('rgb',
                                                                        'l'):
                unsized_image = unsized_image.convert('RGB')

            if 'dpi' in unsized_image.info:
                del unsized_image.info['dpi']

            # maybe resize image

            # find scaling factor
            scale = 1.0
            scale = min(scale, max_dimen[0] / float(unsized_image.size[0]))
            scale = min(scale, max_dimen[1] / float(unsized_image.size[1]))

            was, image = scale_image(unsized_image, scale)
            data = get_image_data(image, format_)

            if format_ == 'png':
                # scale it till it fits into max_size
                while len(data) > max_size and scale > 0.01:
                    scale = scale * 0.8
                    was, image = scale_image(unsized_image, scale)
                    data = get_image_data(image, format_)
            else:
                # find best quality that fits into max_size
                if len(data) > max_size:
                    for quality in (90, 85, 80, 70, 60, 50, 40, 30, 20, 10):
                        data = get_image_data(image, format_, quality=quality)
                        if len(data) <= max_size:
                            break

                    was += 'q=%d' % quality
            comment = "Image: %d x %d size=%d %s" % (
                image.size[0], image.size[1], len(data), was)
            debug(comment)

            new_parser.image_data = data
            new_parser.dimen = tuple(image.size)

            new_parser.attribs = copy.copy(self.attribs)
            new_parser.attribs.comment = comment
            new_parser.fp = self.fp

        except IOError as what:
            error("Could not resize image: %s" % what)
            new_parser.attribs = copy.copy(self.attribs)
            fp = resource_stream('ebookmaker.parsers', 'broken.png')
            new_parser.image_data = fp.read()
            fp.close()

        return new_parser