Example #1
    def open_file(cls, url, attribs):
        """ Open a local file for parsing. """
        def open_file_from_path(path):
            try:
                return open(path, 'rb')
            except FileNotFoundError:
                error('Missing file: %s' % path)
            except IsADirectoryError:
                error('Missing file is a directory: %s' % path)
            return None

        if re.search(r'^([a-zA-Z]:|/)', url):
            fp = open_file_from_path(url)
        else:
            try:
                # handles all the flavors of file: urls, including on windows
                fp = urllib.request.urlopen(url)
            except urllib.error.URLError as what:
                fp = None
                error('Missing file: %s' % what.reason)
            except ValueError:  # just a relative path?
                fp = open_file_from_path(url)

        attribs.orig_mediatype = attribs.HeaderElement(
            MediaTypes.guess_type(url))

        debug("... got mediatype %s from guess_type" %
              str(attribs.orig_mediatype))
        attribs.orig_url = attribs.url = url
        return fp
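
The `^([a-zA-Z]:|/)` test above routes anything that starts with a drive
letter or a slash to the filesystem, and everything else to urllib.request.
A minimal standalone sketch of that dispatch (the sample inputs are made up):

import re

def looks_like_path(s):
    # drive letter (windows) or leading slash (posix) => filesystem path
    return re.search(r'^([a-zA-Z]:|/)', s) is not None

assert looks_like_path('/home/user/book.html')
assert looks_like_path('C:\\books\\book.html')
assert not looks_like_path('file:///home/user/book.html')  # a file: url
assert not looks_like_path('images/cover.jpg')             # relative path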
Example #2
    def pre_parse(self):
        """ Parse a RST file as link list. """

        debug("RSTParser: Pre-parsing %s" % self.attribs.url)

        default_style = self.get_resource('mydocutils.parsers',
                                          'default_style.rst')

        source = docutils.io.StringInput(default_style +
                                         self.unicode_content())
        reader = docutils.readers.standalone.Reader()
        parser = gutenberg_parsers.Parser()

        overrides = {
            'get_resource': self.get_resource,
            'get_image_size': self.get_image_size_from_parser,
            'base_url': self.attribs.url,
        }

        doc = reader.read(source, parser,
                          self.get_settings((reader, parser), overrides))
        self.document1 = doc

        self.rewrite_links(partial(urllib.parse.urljoin, self.attribs.url))

        debug("RSTParser: Done pre-parsing %s" % self.attribs.url)
Example #3
    def _make_coverpage_link(self):
        """ Insert a <link rel="coverpage"> in the html head.

        First we determine the coverpage url.  In HTML we find the
        coverpage by applying these rules:

          1. the image specified in <link rel='coverpage'>,
          2. the image with an id of 'coverpage',
          3. the image with a url containing 'cover', or
          4. the image with a url containing 'title'.

        If one rule returns images we take the first one in document
        order, else we proceed with the next rule.
        """

        coverpages = xpath(self.xhtml, "//xhtml:link[@rel='coverpage']")
        for coverpage in coverpages:
            # <link> elements carry their url in href, not src
            url = coverpage.get('href')
            debug("Found link to coverpage %s." % url)
            return  # already provided by user

        # look for a suitable candidate
        coverpages = xpath(self.xhtml, "//xhtml:img[@id='coverpage']")
        if not coverpages:
            coverpages = xpath(self.xhtml, "//xhtml:img[contains(@src, 'cover')]")
        if not coverpages:
            coverpages = xpath(self.xhtml, "//xhtml:img[contains(@src, 'title')]")

        for coverpage in coverpages:
            for head in xpath(self.xhtml, "/xhtml:html/xhtml:head"):
                url = coverpage.get('src')
                head.append(parsers.em.link(rel='coverpage', href=url))
                debug("Inserted link to coverpage %s." % url)
            return  # take only the first image in document order
Example #4
    def _full_parse(self, writer, overrides):
        """ Full parse from scratch. """

        debug("RSTParser: Full-parsing %s" % self.attribs.url)

        default_style = self.get_resource('mydocutils.parsers',
                                          'default_style.rst')

        source = docutils.io.StringInput(
            default_style + self.unicode_content(), self.attribs.url,
            'unicode')
        reader = docutils.readers.standalone.Reader()
        parser = gutenberg_parsers.Parser()

        doc = reader.read(
            source, parser,
            self.get_settings((reader, parser, writer), overrides))
        self.document1 = doc

        self.rewrite_links(partial(urllib.parse.urljoin, self.attribs.url))

        doc.transformer.populate_from_components(
            (source, reader, parser, writer))
        doc.transformer.apply_transforms()
        debug("RSTParser: Done full-parsing %s" % self.attribs.url)

        return doc
Example #5
    def __unicode__(self):
        """ Serialize content.opf as unicode string. """

        assert len(self.manifest), 'No manifest item in content.opf.'
        assert len(self.spine), 'No spine item in content.opf.'
        assert 'toc' in self.spine.attrib, 'No TOC item in content.opf.'

        package = self.opf.package(**{
            'version': '2.0',
            'unique-identifier': 'id'
        })
        package.append(self.metadata)
        package.append(self.manifest)
        package.append(self.spine)
        if len(self.guide):
            package.append(self.guide)

        content_opf = "%s\n\n%s" % (
            gg.XML_DECLARATION,
            etree.tostring(package, encoding=six.text_type, pretty_print=True))

        # FIXME: remove this when lxml is fixed
        # now merge xmlns:opf and xmlns:
        content_opf = content_opf.replace('lxml-bug-workaround', '')

        if options.verbose >= 2:
            debug(content_opf)
        return content_opf
Example #6
    def run(self):
        debug('Endsection directive state: %s' % self.state)
        # back out of lists, etc.
        if isinstance(self.state, states.SpecializedBody):
            debug('Backing out of list')
            self.state_machine.previous_line(2)  # why do we need 2 ???
        raise EOFError
Example #7
    def copy_aux_files(self, job, dest_dir):
        """ Copy image files to dest_dir. Use image data cached in parsers. """

        for p in job.spider.parsers:
            if hasattr(p, 'resize_image'):
                src_uri = p.attribs.url
                fn_dest = gg.make_url_relative(webify_url(job.base_url),
                                               src_uri)
                fn_dest = os.path.join(dest_dir, fn_dest)

                # debug ('base_url =  %s, src_uri = %s' % (job.base_url, src_uri))

                if gg.is_same_path(src_uri, fn_dest):
                    debug('Not copying %s to %s: same file' %
                          (src_uri, fn_dest))
                    continue
                debug('Copying %s to %s' % (src_uri, fn_dest))

                fn_dest = gg.normalize_path(fn_dest)
                gg.mkdir_for_filename(fn_dest)
                try:
                    with open(fn_dest, 'wb') as fp_dest:
                        fp_dest.write(p.serialize())
                except IOError as what:
                    error('Cannot copy %s to %s: %s' %
                          (src_uri, fn_dest, what))
Example #8
    def decode(self, charset):
        """ Try to decode document contents to unicode. """
        if charset is None:
            return None

        charset = charset.lower().strip()

        if charset in BOGUS_CHARSET_NAMES:
            charset = BOGUS_CHARSET_NAMES[charset]

        if charset == 'utf-8':
            charset = 'utf_8_sig'

        try:
            debug("Trying to decode document with charset %s ..." % charset)
            buffer = self.bytes_content()
            buffer = REB_PG_CHARSET.sub(b'', buffer)
            buffer = buffer.decode(charset)
            self.attribs.orig_mediatype.params['charset'] = charset
            return buffer
        except LookupError as what:
            # unknown charset,
            error("Invalid charset name: %s (%s)" % (charset, what))
        except UnicodeError as what:
            # mis-stated charset, did not decode
            error("Text not in charset %s (%s)" % (charset, what))
        return None
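
The utf-8 -> utf_8_sig substitution above makes the decoder strip a leading
byte order mark if one is present; a plain 'utf-8' decode would keep it in
the text as U+FEFF. A quick illustration:

bom_text = b'\xef\xbb\xbfHello'
print(bom_text.decode('utf-8'))      # '\ufeffHello' -- BOM survives
print(bom_text.decode('utf_8_sig'))  # 'Hello'       -- BOM stripped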
Example #9
    def f(url):
        """ Remap function. """
        ur, frag = urllib.parse.urldefrag(url)
        if ur in url_map:
            debug("Rewriting redirected url: %s to %s" % (ur, url_map[ur]))
            ur = url_map[ur]
        return "%s#%s" % (ur, frag) if frag else ur
Example #10
    def validate(self, job):
        """ Validate generated epub using external tools. """

        debug("Validating %s ..." % job.outputfile)

        filename = os.path.join(job.outputdir, job.outputfile)

        for validator in (options.config.EPUB_VALIDATOR,
                          options.config.EPUB_PREFLIGHT):
            if validator is not None:
                params = validator.split() + [filename]
                checker = subprocess.Popen(params,
                                           stdin=subprocess.PIPE,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)

                (dummy_stdout, stderr) = checker.communicate()
                if stderr:
                    error(stderr)
                    return 1
                    #raise AssertionError (
                    #    "%s does not validate." % job.outputfile)

        debug("%s validates ok." % job.outputfile)
        return 0
Example #11
    def is_included_mediatype(self, attribs):
        """ Return True if this document is eligible. """

        mediatype = self.get_mediatype(attribs)
        if not mediatype:
            warning('Mediatype could not be determined from url %s' %
                    attribs.url)
            return True  # always include if mediatype unknown

        included = any([
            fnmatch.fnmatch(mediatype, pattern)
            for pattern in self.include_mediatypes
        ])
        excluded = any([
            fnmatch.fnmatch(mediatype, pattern)
            for pattern in self.exclude_mediatypes
        ])

        if included and not excluded:
            return True

        if excluded:
            debug("Dropping excluded mediatype %s" % mediatype)
        if not included:
            debug("Dropping not included mediatype %s" % mediatype)

        return False
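
The include/exclude test matches the mediatype against shell-style glob
patterns via fnmatch. A standalone sketch with made-up pattern lists:

import fnmatch

include_mediatypes = ['text/*', 'image/*']   # hypothetical
exclude_mediatypes = ['image/x-icon']        # hypothetical

mediatype = 'image/png'
included = any(fnmatch.fnmatch(mediatype, p) for p in include_mediatypes)
excluded = any(fnmatch.fnmatch(mediatype, p) for p in exclude_mediatypes)
print(included and not excluded)  # True: image/png is kept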
Example #12
    def setUp(self):
        config()
        Logger.set_log_level(options.verbose)
        options.types = options.types or ['all']
        options.types = CommonCode.add_dependencies(options.types,
                                                    DEPENDENCIES, BUILD_ORDER)
        debug("Building types: %s" % ' '.join(options.types))
Example #13
    def __unicode__(self):
        """ Serialize toc.ncx as unicode string. """
        ncx = self.ncx
        tocdepth = 1

        if self.toc:
            # normalize toc so that it starts with an h1 and doesn't jump down more than one
            # level at a time
            fixer = OutlineFixer()
            for t in self.toc:
                t[2] = fixer.level(t[2])

            # flatten toc if it contains only one top-level entry
            top_level_entries = sum(t[2] == 1 for t in self.toc)
            if top_level_entries < 2:
                for t in self.toc:
                    if t[2] != -1:
                        t[2] = max(1, t[2] - 1)

            tocdepth = max(t[2] for t in self.toc)

        head = ncx.head(
            ncx.meta(name='dtb:uid', content=self.dc.opf_identifier),
            ncx.meta(name='dtb:depth', content=str(tocdepth)),
            ncx.meta(name='dtb:generator', content=GENERATOR % VERSION),
            ncx.meta(name='dtb:totalPageCount', content='0'),
            ncx.meta(name='dtb:maxPageNumber', content='0'))

        doc_title = ncx.docTitle(ncx.text(self.dc.title))

        self.seen_urls = {}
        has_pages = False
        for url, dummy_title, depth in self.toc:
            # navPoints and pageTargets referencing the same element
            # must have the same playOrder
            if url not in self.seen_urls:
                self.seen_urls[url] = str(len(self.seen_urls) + 1)
            if depth == -1:
                has_pages = True

        params = {'version': '2005-1'}
        if self.dc.languages:
            params[NS.xml.lang] = self.dc.languages[0].id

        ncx = ncx.ncx(head, doc_title, self._make_navmap(self.toc), **params)

        if has_pages:
            ncx.append(self._make_pagelist(self.toc))

        # Ugly workaround for error: "Serialisation to unicode must not
        # request an XML declaration"

        toc_ncx = "%s\n\n%s" % (gg.XML_DECLARATION,
                                etree.tostring(ncx,
                                               doctype=gg.NCX_DOCTYPE,
                                               encoding=six.text_type,
                                               pretty_print=True))
        if options.verbose >= 3:
            debug(toc_ncx)
        return toc_ncx
Example #14
    def shipout_chunk(self, attribs, chunk_id=None, comment=None):
        """ ready chunk to be shipped """

        attribs = copy.copy(attribs)

        if self.chunk_size > MAX_CHUNK_SIZE:
            self.split(self.chunk, attribs)
            return

        url = normalize_uri(attribs.url)
        chunk_name = self._make_name(url)

        # the url of the whole page
        if url not in self.idmap:
            self.idmap[url] = chunk_name

        # fragments of the page
        for e in xpath(self.chunk, '//xhtml:*[@id]'):
            id_ = e.attrib['id']
            old_id = "%s#%s" % (url, id_)
            # key is unicode string,
            # value is uri-escaped byte string
            # if ids get cloned while chunking, map to the first one only
            if old_id not in self.idmap:
                self.idmap[old_id] = "%s#%s" % (chunk_name,
                                                urllib.parse.quote(id_))

        attribs.url = chunk_name
        attribs.id = chunk_id
        attribs.comment = comment
        self.chunks.append((self.chunk, attribs))

        debug("Adding chunk %s (%d bytes) %s" %
              (chunk_name, self.chunk_size, chunk_id))
Example #15
    def is_included_mediatype(self, attribs):
        """ Return True if this document is eligible. """

        if attribs.orig_mediatype is None:
            mediatype = MediaTypes.guess_type(attribs.url)
            if mediatype:
                attribs.orig_mediatype = attribs.HeaderElement(mediatype)
            else:
                return True  # always include if mediatype unknown

        mediatype = attribs.orig_mediatype.value

        included = any([
            fnmatch.fnmatch(mediatype, pattern)
            for pattern in self.include_mediatypes
        ])
        excluded = any([
            fnmatch.fnmatch(mediatype, pattern)
            for pattern in self.exclude_mediatypes
        ])

        if included and not excluded:
            return True

        if excluded:
            debug("Dropping excluded mediatype %s" % mediatype)
        if not included:
            debug("Dropping not included mediatype %s" % mediatype)

        return False
Example #16
    def get_charset_from_content_type(self):
        """ Get charset from server content-type. """

        charset = self.attribs.orig_mediatype.params.get('charset')
        if charset:
            debug('Got charset %s from server' % charset)
            return charset
        return None
Example #17
    def unpack_media_handheld(sheet):
        """ Unpack a @media handheld rule. """
        for rule in sheet:
            if rule.type == rule.MEDIA_RULE:
                if rule.media.mediaText.find('handheld') > -1:
                    debug("Unpacking CSS @media handheld rule.")
                    rule.media.mediaText = 'all'
                    rule.insertRule(cssutils.css.CSSComment('/* was @media handheld */'), 0)
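
A hedged usage sketch, assuming cssutils is installed and that the sheet
object iterates over its rules, as the function above expects:

import cssutils

sheet = cssutils.parseString("@media handheld { p { margin: 0 } }")
unpack_media_handheld(sheet)
print(sheet.cssText)
# the media query should now read '@media all', with the
# '/* was @media handheld */' comment inserted as its first rule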
Example #18
    def is_included_relation(self, attribs):
        """ Return True if this document is eligible. """

        keep = attribs.rel.intersection(('coverpage', 'important'))
        if keep:
            debug("Not dropping after all because of rel.")

        return keep
Example #19
def main():
    """ Main program. """

    try:
        config()
    except configparser.Error as what:
        error("Error in configuration file: %s", str(what))
        return 1

    Logger.set_log_level(options.verbose)

    options.types = options.types or ['all']
    options.types = CommonCode.add_dependencies(options.types, DEPENDENCIES,
                                                BUILD_ORDER)
    debug("Building types: %s" % ' '.join(options.types))

    ParserFactory.load_parsers()
    WriterFactory.load_writers()
    PackagerFactory.load_packagers()

    if options.is_job_queue:
        job_queue = cPickle.load(sys.stdin.buffer)  # read bytes
    else:
        options.dc = get_dc(options.url)
        job_queue = []
        output_files = dict()
        for type_ in options.types:
            job = CommonCode.Job(type_)
            job.url = options.url
            job.ebook = options.ebook
            job.dc = options.dc
            job.outputdir = options.outputdir
            job.outputfile = options.outputfile or make_output_filename(
                type_, options.dc)
            output_files[type_] = job.outputfile

            if job.type == 'kindle.images':
                job.url = os.path.join(job.outputdir,
                                       output_files['epub.images'])
            elif job.type == 'kindle.noimages':
                job.url = os.path.join(job.outputdir,
                                       output_files['epub.noimages'])

            job_queue.append(job)

    for j in job_queue:
        do_job(j)

    packager = PackagerFactory.create(options.packager, 'push')
    if packager:
        # HACK: the WWers only ever convert one ebook at a time
        job = job_queue[0]
        job.outputfile = '%d-final.zip' % (options.dc.project_gutenberg_id)
        packager.package(job)

    return 0
Example #20
    def _make_coverpage_link(self, coverpage_url=None):
        """ Insert a <link rel="coverpage"> in the html head
        using the image specified by the --cover command-line option
        """

        if coverpage_url:
            for head in xpath(self.xhtml, "/xhtml:html/xhtml:head"):
                head.append(
                    parsers.em.link(rel='coverpage', href=coverpage_url))
                debug("Inserted link to coverpage %s." % coverpage_url)
            return
Example #21
    def guess_charset_from_body(self):
        """ Guess charset from text. """

        # http://chardet-matthickford.readthedocs.org/en/latest/usage.html

        result = chardet.detect(self.bytes_content())
        charset = result.get('encoding')
        if charset:
            debug('Got charset %s from text sniffing' % charset)
            return charset
        return None
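
For reference, chardet.detect takes raw bytes and returns a dict with the
guessed encoding and a confidence score; the exact guess depends on the
input and on the chardet version:

import chardet

result = chardet.detect('héllo wörld, ça va bien'.encode('latin-1'))
print(result)  # e.g. {'encoding': 'ISO-8859-1', 'confidence': 0.7, ...}
print(result.get('encoding'))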
Example #22
    def remove_coverpage(self, xhtml, url):
        """ Remove coverpage from flow.

        EPUB readers will display the coverpage from the manifest and
        if we don't remove it from flow it will be displayed twice.

        """
        for img in xpath(xhtml, '//xhtml:img[@src = $url]', url=url):
            debug("remove_coverpage: dropping <img> %s from flow" % url)
            img.drop_tree()
            return  # only the first one though
Example #23
    def bytes_content(self):
        """ Get document content as raw bytes. """

        if self.buffer is None:
            try:
                debug("Fetching %s ..." % self.attribs.url)
                self.buffer = self.fp.read()
                self.fp.close()
            except IOError as what:
                error(what)

        return self.buffer
Example #24
    def get_charset_from_meta(self):
        """ Parse text for hints about charset. """
        # .. -*- coding: utf-8 -*-

        charset = None
        rst = self.bytes_content()

        match = REB_EMACS_CHARSET.search(rst)
        if match:
            charset = match.group(1).decode('ascii')
            debug('Got charset %s from emacs comment' % charset)

        return charset
Example #25
    def fix_incompatible_css(sheet):
        """ Strip CSS properties and values that are not EPUB compatible. """

        # debug("enter fix_incompatible_css")

        for rule in sheet:
            if rule.type == rule.STYLE_RULE:
                for p in list(rule.style):
                    if p.name == 'float':
                        debug("Dropping property %s" % p.name)
                        rule.style.removeProperty('float')
                        rule.style.removeProperty('width')
                        rule.style.removeProperty('height')
                    elif p.name == 'position':
                        debug("Dropping property %s" % p.name)
                        rule.style.removeProperty('position')
                        rule.style.removeProperty('left')
                        rule.style.removeProperty('right')
                        rule.style.removeProperty('top')
                        rule.style.removeProperty('bottom')
                    elif p.name in ('background-image', 'background-position',
                                    'background-attachment',
                                    'background-repeat'):
                        debug("Dropping property %s" % p.name)
                        rule.style.removeProperty(p.name)
                    elif 'border' not in p.name and 'px' in p.value:
                        debug("Dropping property with px value %s" % p.name)
                        rule.style.removeProperty(p.name)
Example #26
    def open_file(cls, orig_url, attribs):
        """ Open a local file for parsing. """

        url = orig_url
        if url.startswith('file://'):
            fp = open(url[7:], "rb")
        else:
            fp = open(url, "rb")
        attribs.orig_mediatype = attribs.HeaderElement(MediaTypes.guess_type(url))

        debug("... got mediatype %s from guess_type" % str(attribs.orig_mediatype))
        attribs.orig_url = orig_url
        attribs.url = url
        return fp
Example #27
    def fix_incompatible_css(sheet):
        """ Strip CSS properties and values that are not EPUB compatible. """

        cssclass = re.compile(r'\.(-?[_a-zA-Z]+[_a-zA-Z0-9-]*)')

        for rule in sheet:
            if rule.type == rule.STYLE_RULE:
                ruleclasses = list(
                    cssclass.findall(rule.selectorList.selectorText))
                for p in list(rule.style):
                    if p.name == 'float' and "x-ebookmaker" not in ruleclasses:
                        debug("Dropping property %s" % p.name)
                        rule.style.removeProperty('float')
                        rule.style.removeProperty('width')
                        rule.style.removeProperty('height')
                    elif p.name == 'position':
                        debug("Dropping property %s" % p.name)
                        rule.style.removeProperty('position')
                        rule.style.removeProperty('left')
                        rule.style.removeProperty('right')
                        rule.style.removeProperty('top')
                        rule.style.removeProperty('bottom')
                    elif p.name in ('background-image', 'background-position',
                                    'background-attachment',
                                    'background-repeat'):
                        debug("Dropping property %s" % p.name)
                        rule.style.removeProperty(p.name)
                    elif 'border' not in p.name and 'px' in p.value:
                        debug("Dropping property with px value %s" % p.name)
                        rule.style.removeProperty(p.name)
Example #28
    def fix_style_elements(xhtml):
        """ Fix CSS style elements.  Make sure they are utf-8. """

        # debug ("enter fix_style_elements")

        for style in xpath(xhtml, "//xhtml:style"):
            p = parsers.CSSParser.Parser()
            p.parse_string(style.text)
            try:
                # pylint: disable=E1103
                style.text = p.sheet.cssText.decode('utf-8')
            except (ValueError, UnicodeError):
                debug("CSS:\n%s" % p.sheet.cssText)
                raise
Example #29
def load_parsers():
    """ See what types we can parse. """

    for fn in resource_listdir('ebookmaker.parsers', ''):
        modulename, ext = os.path.splitext(fn)
        if ext == '.py':
            if modulename.endswith('Parser'):
                module = __import__('ebookmaker.parsers.' + modulename, fromlist=[modulename])
                debug("Loading parser from module: %s for mediatypes: %s" % (
                    modulename, ', '.join(module.mediatypes)))
                for mediatype in module.mediatypes:
                    parser_modules[mediatype] = module

    return parser_modules.keys()
Example #30
    def _full_parse_2(self, writer, destination, overrides):
        """ Full parser from pickled doctree.

        Doesn't work yet. It turned out pickling a doctree is much
        harder than I thought. """

        debug("Full-parsing %s" % self.attribs.url)

        source = docutils.io.StringInput(self.unicode_content())
        reader = docutils.readers.standalone.Reader()
        parser = gutenberg_parsers.Parser()

        doc = reader.read(
            source, parser,
            self.get_settings((reader, parser, writer), overrides))
        self.document1 = doc

        self.rewrite_links(partial(urllib.parse.urljoin, self.attribs.url))

        # make it picklable
        reporter = doc.reporter  #  = None
        # doc.reporter = None
        transformer = doc.transformer
        doc.settings = None
        from docutils.parsers.rst.directives.html import MetaBody

        #for metanode in doc.traverse (MetaBody.meta):
        for pending in doc.traverse(nodes.pending):
            # pending.transform = None
            # docutils' meta nodes aren't picklable because the class is nested
            # in pending['nodes']
            if 'nodes' in pending.details:
                if isinstance(pending.details['nodes'][0], MetaBody.meta):
                    pending.details['nodes'][0].__class__ = mynodes.meta
        from six.moves import cPickle as pickle
        pickled = pickle.dumps(doc)

        doc = pickle.loads(pickled)

        #doc.transformer.populate_from_components (
        #    (source, reader, parser, writer))

        doc.transformer = transformer
        doc.reporter = reporter
        doc.settings = self.get_settings((reader, parser, writer), overrides)

        doc.transformer.apply_transforms()

        return writer.write(doc, destination)