Пример #1
0
    def copy_aux_files(self, job, dest_dir):
        """ Copy image files to dest_dir. Use image data cached in parsers. """

        for parser in job.spider.parsers:
            # only image parsers carry a resize_image method
            if not hasattr(parser, 'resize_image'):
                continue

            src_uri = parser.attribs.url
            dest = gg.make_url_relative(webify_url(job.base_url), src_uri)
            dest = os.path.join(dest_dir, dest)

            if gg.is_same_path(src_uri, dest):
                debug('Not copying %s to %s: same file' % (src_uri, dest))
                continue
            debug('Copying %s to %s' % (src_uri, dest))

            dest = gg.normalize_path(dest)
            gg.mkdir_for_filename(dest)
            try:
                with open(dest, 'wb') as fp_dest:
                    fp_dest.write(parser.serialize())
            except IOError as what:
                error('Cannot copy %s to %s: %s' % (src_uri, dest, what))
Пример #2
0
    def format_date(date):
        """ Format a date.

        Returns an ISO 8601 string in UTC, or '' when date is None. """

        if date is None:
            return ''

        try:
            # datetime: pin to UTC and drop sub-second precision
            aware = date.replace(tzinfo=gg.UTC(), microsecond=0)
            return aware.isoformat()
        except TypeError:
            # plain date objects reject the tzinfo/microsecond keywords;
            # promote to a midnight datetime in UTC instead
            midnight = datetime.time(tzinfo=gg.UTC())
            return datetime.datetime.combine(date, midnight).isoformat()
Пример #3
0
    def package(self, job):
        """ Pack all output formats of one ebook into a zip for pushing. """
        self.setup(job)
        zipfilename = job.outputfile  # filename is zipfile

        m = re.match(r'\d+', zipfilename)
        if not m:
            error('Invalid filename %s for push packager.' % zipfilename)
            return
        ebook_no = m.group(0)

        zip_ = self.create(zipfilename)

        # plain-text and pre-zipped formats live directly in the ebook dir
        for suffix in ('.txt', '-8.txt', '-0.txt', '.zip',
                       '-8.zip', '-0.zip', '-rst.zip', '-h.zip'):
            filename = '%s%s' % (ebook_no, suffix)
            memberfilename = '%s/%s' % (ebook_no, filename)
            self.add(zip_, filename, memberfilename)

        # html and rst formats live in their own subdirectories
        for suffix, ext in (('-h', 'html'), ('-rst', 'rst')):
            filename = '%s%s.%s' % (ebook_no, suffix, ext)
            memberfilename = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, filename)
            self.add(zip_, filename, memberfilename)

            # image files referenced by the html/rst build
            for url in options.html_images_list:
                rel_url = gg.make_url_relative(job.base_url, url)
                filename = os.path.join(self.path, rel_url)
                memberfilename = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, rel_url)
                self.add(zip_, filename, memberfilename)

        zip_.close()
        info('Done Zip file: %s' % zipfilename)
Пример #4
0
    def add_meta(xhtml, name, content):
        """ Add a meta tag.

        Appends <meta name=... content=...> to every <head> in xhtml. """

        for head in gg.xpath(xhtml, '//xhtml:head'):
            meta = em.meta(**{'name': name, 'content': content})
            meta.tail = '\n'
            head.append(meta)
Пример #5
0
 def format_title_filing(row):
     """ Format a book title for display in results. """
     title = gg.cut_at_newline(row.get('filing') or 'No Title')
     # tag non-English titles with the localized language name
     for lang_id in row.get('fk_langs') or []:
         if lang_id == 'en':
             continue
         lang_name = cherrypy.response.i18n.locale.languages.get(lang_id, lang_id)
         title += " (%s)" % lang_name
     return title
Пример #6
0
def make_output_filename(type_, dc):
    """ Make a suitable filename for output type. """

    if dc.project_gutenberg_id:
        # PG book: use PG naming convention
        return FILENAMES[type_].format(id=dc.project_gutenberg_id)

    # not a PG ebook: derive a slug from the title, capped at 65 chars
    slug = gg.string_to_filename(dc.title)[:65]
    return FILENAMES[type_].format(id=slug)
Пример #7
0
        def get_header_text(header):
            """ clean header text """
            serialized = etree.tostring(header,
                                        method="text",
                                        encoding=six.text_type,
                                        with_tail=False)
            text = gg.normalize(serialized)
            # an explicit title attribute overrides the element's text
            return header.get('title', text).strip()
Пример #8
0
    def add_internal_css(xhtml, css_as_string):
        """ Add internal stylesheet to html. """

        if not css_as_string or xhtml is None:
            return

        # normalize surrounding whitespace to exactly one newline each side
        css_as_string = '\n' + css_as_string.strip(' \n') + '\n'
        for head in gg.xpath(xhtml, '//xhtml:head'):
            style = em.style(css_as_string, type='text/css')
            style.tail = '\n'
            head.append(style)
Пример #9
0
    def add_dublincore(self, job, tree):
        """ Add dublin core metadata to <head>. """
        source = gg.archive2files(options.ebook, job.url)

        # rewrite the local files path into a public PG url
        if hasattr(options.config, 'FILESDIR'):
            job.dc.source = source.replace(options.config.FILESDIR,
                                           options.config.PGURL)

        for head in xpath(tree, '//xhtml:head'):
            for element in job.dc.to_html():
                element.tail = '\n'
                head.append(element)
Пример #10
0
    def serve(books, size, session):
        """ Output a gallery of coverpages. """

        cherrypy.response.headers['Content-Type'] = 'text/html; charset=utf-8'
        cherrypy.response.headers['Content-Language'] = 'en'

        def author_name(author):
            """ Pretty-print one author name. """
            return DublinCore.DublinCore.make_pretty_name(author.name)

        s = ''
        for book_id in books:
            dc = DublinCoreMapping.DublinCoreObject(session=session,
                                                    pooled=True)
            dc.load_from_database(book_id)

            # look up the cover image file for this book at the wanted size
            cover = session.execute(
                select(Models.File.archive_path).where(
                    Models.File.fk_books == book_id,
                    Models.File.fk_filetypes == size)).scalars().first()
            if not cover:
                continue

            url = '/' + cover
            href = '/ebooks/%d' % book_id

            if dc.title:
                title = gg.xmlspecialchars(dc.title)  # handles <,>,&
                # title lands in attribute values: escape quotes too
                title = title.replace('"', '&quot;')
                title = title.replace("'", '&apos;')
            else:
                title = '!! missing title !!'

            short_title = dc.make_pretty_title()
            authors = ', '.join(author_name(a) for a in dc.authors)

            s += f"""
                <a href="{href}" title="{title}" authors="{authors}" target="_top">
                    <div class="cover_image">
                        <div class="cover_img">
                            <img src="{url}" alt="{title}, {authors}" title="{title}"
                             authors="{authors}" draggable="false">
                        </div>
                        <div class="cover_title">
                            <h5>{short_title}</h5>
                        </div>
                    </div>
                </a>
                """

        return s.encode('utf-8')
Пример #11
0
    def __init__(self):
        """ Create an empty OPF skeleton: metadata, manifest, spine, guide. """
        self.nsmap = gg.build_nsmap('opf dc dcterms xsi')

        # FIXME: remove this when lxml is fixed
        # workaround for lxml fat-fingering the default attribute namespaces
        self.nsmap[None] = str(NS.opf) + 'lxml-bug-workaround'

        self.opf = ElementMaker(namespace=self.nsmap[None], nsmap=self.nsmap)

        # top-level OPF container elements, filled in by later calls
        self.metadata = self.opf.metadata()
        self.manifest = self.opf.manifest()
        self.spine = self.opf.spine()
        self.guide = self.opf.guide()
        # counter used to generate unique manifest item ids
        self.item_id = 0
Пример #12
0
    def add_external_css(self, spider, xhtml, css_as_string, url):
        """ Add external stylesheet to html. """

        # register the stylesheet body with the spider so it gets written out
        if css_as_string:
            attribs = parsers.ParserAttributes()
            attribs.orig_mediatype = attribs.HeaderElement('text/css')
            attribs.url = attribs.orig_url = url
            css_parser = ParserFactory.ParserFactory.get(attribs)
            css_parser.parse_string(css_as_string)
            spider.parsers.append(css_parser)

        # reference the stylesheet from every <head>
        if xhtml is not None:
            for head in gg.xpath(xhtml, '//xhtml:head'):
                link = em.link(href=url, rel='stylesheet', type='text/css')
                link.tail = '\n'
                head.append(link)
Пример #13
0
    def topological_sort(self):
        """ Do a topological sort of documents using <link rel='next'>.

        Reorders self.parsers in place.  Documents that do not appear in
        the 'next' chain sort to the end.  A failed sort (e.g. a cycle in
        the links) leaves the current order untouched.
        """

        # edges (referrer -> url) taken from rel='next' links
        relnext = [(p.attribs.referrer, p.attribs.url) for p in self.parsers
                   if 'next' in p.attribs.rel]
        if not relnext:
            return

        try:
            order_by_url = {}
            for order, url in enumerate(gg.topological_sort(relnext)):
                order_by_url[url] = order
                debug("%s order %d" % (url, order))
            for parser in self.parsers:
                # documents without an explicit order sort last
                parser.order = order_by_url.get(parser.attribs.url, 999999)
            self.parsers.sort(key=lambda p: p.order)

        except Exception as what:
            # best effort: a broken 'next' chain must not abort the build,
            # but don't swallow the error silently either
            debug("topological_sort failed: %s" % what)
Пример #14
0
    def serve(rows, size):
        """ Output a gallery of coverpages. """

        cherrypy.response.headers['Content-Type'] = 'text/html; charset=utf-8'
        cherrypy.response.headers['Content-Language'] = 'en'

        s = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en" xml:base="http://www.gutenberg.org">
<head>
<title>Cover Flow</title>
<style>
.cover-thumb {
        display: inline-block;
        background-position: center;
        background-repeat: no-repeat;
}
.cover-thumb-small {
	width:   76px;
	height: 110px;
}
.cover-thumb-medium {
	width:  210px;
	height: 310px;
}
</style>
</head>
<body><div>"""

        for row in rows:
            url = '/' + row.filename
            href = '/ebooks/%d' % row.pk
            title = gg.xmlspecialchars(row.title)
            title = title.replace('"', '&quot;')

            s += """<a href="{href}"
                       title="{title}"
                       class="cover-thumb cover-thumb-{size}" target="_top"
                       style="background-image: url({url})"> </a>\n""".format(
                url=url, href=href, title=title, size=size)

        return (s + '</div></body></html>\n').encode('utf-8')
Пример #15
0
    def serve(rows, size):
        """ Output a gallery of coverpages. """

        cherrypy.response.headers['Content-Type'] = 'text/html; charset=utf-8'
        cherrypy.response.headers['Content-Language'] = 'en'
        s = ''
        for row in rows:
            url = '/' + row.filename
            href = '/ebooks/%d' % row.pk

            if not row.title:
                title = '!! missing title !!'
            else:
                title = gg.xmlspecialchars(row.title)  # handles <,>,&
                # title lands in attribute values: escape quotes too
                title = title.replace('"', '&quot;')
                title = title.replace("'", '&apos;')

            # display title: dashes become spaces, first line only,
            # long titles wrapped down to their first 80-char chunk
            short_title = re.sub(r"\-+", " ", title)
            short_title = short_title.splitlines()[0]
            if len(title) > 80:
                short_title = textwrap.wrap(short_title, 80)[0]

            s += """
                <a href="{href}" title="{title}" target="_top">
                    <div class="cover_image">
                        <div class="cover_img">
                            <img src="{url}" alt="{title}" title="{title}" draggable="false">
                        </div>
                        <div class="cover_title">
                            <h5>{short_title}</h5>
                        </div>
                    </div>
                </a>
                """.format(url=url,
                           href=href,
                           title=title,
                           short_title=short_title,
                           size=size)

        return s.encode('utf-8')
Пример #16
0
    def package(self, job):
        """ Zip the main output file plus any auxiliary (image) files. """
        self.setup(job)

        try:
            aux_file_list = list(job.spider.aux_file_iter())
        except AttributeError:
            # no spider / no aux files: package just the main output file
            aux_file_list = []

        zipfilename = os.path.join(self.path, self.name) + '.zip'
        zip_ = self.create(zipfilename)

        # the main output file
        self.add(zip_, job.outputfile,
                 os.path.join(self.name, self.name) + self.ext)

        # now images
        for url in aux_file_list:
            rel_url = gg.make_url_relative(job.base_url, url)
            self.add(zip_,
                     os.path.join(self.path, rel_url),
                     os.path.join(self.name, rel_url))

        zip_.close()
        info('Done Zip file: %s' % zipfilename)
Пример #17
0
 def format_suggestion(self, row):
     """ Format a suggestion for display in results. """
     # keep everything the user typed except the last (partial) word
     prefix = ' '.join(self.query.split()[:-1])
     if prefix:
         prefix += ' '
     return prefix + gg.cut_at_newline(row.get('title') or '')
Пример #18
0
    def add_body_class(xhtml, classname):
        """ Add a class to the body element. """

        if not classname or xhtml is None:
            return
        for body in gg.xpath(xhtml, '//xhtml:body'):
            HTMLishWriter.add_class(body, classname)
Пример #19
0
    def index(self, **dummy_kwargs):
        """ A bibrec page. """

        # NOTE(review): local name 'os' shadows the stdlib os module inside
        # this method; here it is an OpenSearch request/response object.
        os = BaseSearcher.OpenSearch()

        os.log_request('bibrec')

        dc = BaseSearcher.DC(cherrypy.engine.pool)

        # the bulk of the work is done here
        dc.load_from_database(os.id)
        if not dc.files:
            # NOTE: Error message
            cherrypy.tools.rate_limiter.e404()
            raise cherrypy.HTTPError(404, _('No ebook by that number.'))

        # add these fields so we won't have to test for their existence later
        dc.extra_info = None
        dc.url = None

        dc.translate()
        dc.header = gg.cut_at_newline(dc.title)
        os.title = dc.make_pretty_title()
        dc.extra_info = ''
        dc.class_ = BaseSearcher.ClassAttr()
        dc.order = 10
        dc.icon = 'book'
        if 'Sound' in dc.categories:
            dc.icon = 'audiobook'
        os.title_icon = dc.icon
        os.twit = os.title
        os.qrcode_url = '//%s/cache/epub/%d/pg%d.qrcode.png' % (os.file_host,
                                                                os.id, os.id)

        os.entries.append(dc)

        # remember this book in the session's visit history
        s = cherrypy.session
        last_visited = s.get('last_visited', [])
        last_visited.append(os.id)
        s['last_visited'] = last_visited

        # can we find some meaningful breadcrumbs ?
        for a in dc.authors:
            if a.marcrel in ('aut', 'cre'):
                book_cnt = BaseSearcher.sql_get(
                    "select count (*) from mn_books_authors where fk_authors = %(aid)s",
                    aid=a.id)
                if book_cnt > 1:
                    # only link to the author page if it lists more books
                    os.breadcrumbs.append(
                        (__('One by {author}', '{count} by {author}',
                            book_cnt).format(count=book_cnt,
                                             author=dc.make_pretty_name(
                                                 a.name)),
                         _('Find more ebooks by the same author.'),
                         os.url('author', id=a.id)))

        if os.format in ('html', 'mobile'):
            # "readers also downloaded" teaser entry
            cat = BaseSearcher.Cat()
            cat.header = _('Similar Books')
            cat.title = _('Readers also downloaded…')
            cat.rel = 'related'
            cat.url = os.url('also', id=os.id)
            cat.class_ += 'navlink grayed noprint'
            cat.icon = 'suggestion'
            cat.order = 30
            os.entries.append(cat)

            # one navigation entry per bookshelf this book is on
            for bookshelf in dc.bookshelves:
                cat = BaseSearcher.Cat()
                cat.title = _('In {bookshelf}').format(
                    bookshelf=bookshelf.bookshelf)
                cat.rel = 'related'
                cat.url = os.url('bookshelf', id=bookshelf.id)
                cat.class_ += 'navlink grayed'
                cat.icon = 'bookshelf'
                cat.order = 33
                os.entries.append(cat)

        if os.format in ('mobile', ):
            # mobile additionally gets author and subject navigation entries
            for author in dc.authors:
                cat = BaseSearcher.Cat()
                cat.title = _('By {author}').format(
                    author=author.name_and_dates)
                cat.rel = 'related'
                cat.url = os.url('author', id=author.id)
                cat.class_ += 'navlink grayed'
                cat.icon = 'author'
                cat.order = 31
                os.entries.append(cat)

            for subject in dc.subjects:
                cat = BaseSearcher.Cat()
                cat.title = _('On {subject}').format(subject=subject.subject)
                cat.rel = 'related'
                cat.url = os.url('subject', id=subject.id)
                cat.class_ += 'navlink grayed'
                cat.icon = 'subject'
                cat.order = 32
                os.entries.append(cat)

        os.total_results = 1

        os.template = 'results' if os.format == 'mobile' else 'bibrec'
        os.page = 'bibrec'
        os.og_type = 'book'
        os.finalize()

        return self.format(os)
Пример #20
0
    def make_toc(self, xhtml):
        """ Build a TOC from HTML headers.

        Return a list of tuples (url, text, depth).

        Page numbers are also inserted because DTBook NCX needs the
        play_order to be sequential.

        """
        def id_generator(i=0):
            """ Generate an id for the TOC to link to. """
            while True:
                yield 'pgepubid%05d' % i
                i += 1

        idg = id_generator()

        def get_id(elem):
            """ Get the id of the element or generate and set one. """
            if not elem.get('id'):
                elem.set('id', six.next(idg))
            return elem.get('id')

        toc = []
        last_depth = 0

        # collect h1-h4 headers, DP page numbers and DocUtils topic titles
        # in document order
        for header in xpath(
                xhtml,
                '//xhtml:h1|//xhtml:h2|//xhtml:h3|//xhtml:h4|'
                # DP page number
                '//xhtml:*[contains (@class, "pageno")]|'
                # DocUtils contents header
                '//xhtml:p[contains (@class, "topic-title")]'):

            text = gg.normalize(
                etree.tostring(header,
                               method="text",
                               encoding=six.text_type,
                               with_tail=False))

            # an explicit title attribute overrides the element text
            text = header.get('title', text).strip()

            if not text:
                # so <h2 title=""> may be used to suppress TOC entry
                continue

            if header.get('class', '').find('pageno') > -1:
                # page numbers get the sentinel depth -1
                toc.append(
                    ["%s#%s" % (self.attribs.url, get_id(header)), text, -1])
            else:
                # header
                if text.lower().startswith('by '):
                    # common error in PG: <h2>by Lewis Carroll</h2> should
                    # yield no TOC entry
                    continue

                try:
                    # depth from the tag name: h1 -> 1, h2 -> 2, ...
                    depth = int(header.tag[-1:])
                except ValueError:
                    depth = 2  # avoid top level

                # fix bogus header numberings
                if depth > last_depth + 1:
                    depth = last_depth + 1

                last_depth = depth

                # if <h*> is first element of a <div> use <div> instead
                parent = header.getparent()
                if (parent.tag == NS.xhtml.div and parent[0] == header
                        and parent.text and parent.text.strip() == ''):
                    header = parent

                toc.append([
                    "%s#%s" % (self.attribs.url, get_id(header)), text, depth
                ])

        return toc
Пример #21
0
    def parse(self):
        """ Parse the plain text.

        Try to find semantic units in the character soup. """

        debug("GutenbergTextParser.parse () ...")

        if self.xhtml is not None:
            return  # already parsed

        text = self.unicode_content()
        text = parsers.RE_RESTRICTED.sub('', text)
        text = gg.xmlspecialchars(text)  # escape <,>,& for xml output

        lines = [line.rstrip() for line in text.splitlines()]
        # sentinel blank line flushes the last paragraph below
        lines.append("")
        del text

        blanks = 0
        par = Par()

        # split lines into paragraphs at runs of blank lines, recording
        # the number of blank lines before/after each paragraph
        for line in lines:
            if len(line) == 0:
                blanks += 1
            else:
                if blanks and par.lines:  # don't append empty pars
                    par.after = blanks
                    self.pars.append(par)
                    if self.body == 1:
                        # NOTE(review): presumably body == 1 marks the main
                        # text body -- confirm against the enclosing class
                        self.max_blanks = max(blanks, self.max_blanks)
                    par = Par()
                    par.before = blanks
                    blanks = 0

                par.lines.append(line)

        par.after = blanks
        if par.lines:
            self.pars.append(par)

        lines = None  # release the line list before the analysis pass

        self.analyze()

        # build xhtml tree

        em = parsers.em
        self.xhtml = em.html(
            em.head(
                em.title(' '),
                # pylint: disable=W0142
                em.meta(**{
                    'http-equiv': 'Content-Style-Type',
                    'content': 'text/css'
                }),
                em.meta(
                    **{
                        'http-equiv': 'Content-Type',
                        'content': mt.xhtml + '; charset=utf-8'
                    })),
            em.body())

        # ship out each paragraph as a parsed xhtml fragment into <body>
        for body in xpath(self.xhtml, '//xhtml:body'):
            xhtmlparser = lxml.html.XHTMLParser()
            for par in self.pars:
                p = etree.fromstring(self.ship_out(par), xhtmlparser)
                p.tail = '\n\n'
                body.append(p)

        self.pars = []
Пример #22
0
    def strip_pagenumbers(xhtml, strip_classes):
        """ Strip dp page numbers.

        Rationale: DP implements page numbers either with float or
        with absolute positioning. Float is not supported by Kindle.
        Absolute positioning is not allowed in epub.

        If we'd leave these in, they would show up as numbers in the
        middle of the text.

        To still keep links working, we replace all page number
        contraptions we can find with empty <a>'s.

        """

        # look for elements with a class that is in strip_classes

        for class_ in strip_classes:
            # match class_ as a whole word inside the (possibly
            # multi-valued) class attribute
            xp = "//xhtml:*[@class and contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % class_

            count = 0
            for elem in xpath(xhtml, xp):

                # save textual content
                text = gg.normalize(
                    etree.tostring(elem,
                                   method="text",
                                   encoding=six.text_type,
                                   with_tail=False))
                if len(text) > 10:
                    # safeguard against removing things that are not pagenumbers
                    continue

                if not text:
                    text = elem.get('title')

                # look for id anywhere inside element
                id_ = elem.xpath(".//@id")

                # transmogrify element into empty <a>
                # (clear() drops the tail too, so save and restore it)
                tail = elem.tail
                elem.clear()
                elem.tag = NS.xhtml.a
                if id_:
                    # some blockheaded PPers include more than
                    # one page number in one span. take the last id
                    # because the others represent empty pages.
                    elem.set('id', id_[-1])

                if class_ in DP_PAGENUMBER_CLASSES:
                    # mark element as rewritten pagenumber. we
                    # actually don't use this class for styling
                    # because it is on an empty element
                    elem.set('class', 'x-ebookmaker-pageno')

                if text:
                    # keep the page number visible as a tooltip
                    elem.set('title', text)
                elem.tail = tail
                count += 1

                # The OPS Spec 2.0 is very clear: "Reading Systems
                # must be XML processors as defined in XML 1.1."
                # Nevertheless many browser-plugin ebook readers use
                # the HTML parsers of the browser.  But HTML parsers
                # don't grok the minimized form of empty elements.
                #
                # This will force lxml to output the non-minimized form
                # of the element.
                elem.text = ''

            if count:
                warning("%d elements having class %s have been rewritten." %
                        (count, class_))
Пример #23
0
    def metadata_item(self, dc):
        """ Build metadata from DublinCore struct.

        Appends the generated elements to self.metadata.

        Example of metadata:

  <metadata xmlns:dcterms='http://purl.org/dc/terms/'
            xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'
            xmlns:opf='http://www.idpf.org/2007/opf'>

    <dcterms:identifier opf:scheme='URI' id='id'>http://www.gutenberg.org/ebooks/29000</dcterms:identifier>
    <dcterms:creator opf:file-as='Trollope, Anthony'>Anthony Trollope</dcterms:creator>
    <dcterms:title>The Macdermots of Ballycloran</dcterms:title>
    <dcterms:language xsi:type='dcterms:RFC3066'>en</dcterms:language>
    <dcterms:subject>Domestic fiction</dcterms:subject>
    <dcterms:subject>Ireland -- Fiction</dcterms:subject>
    <dcterms:created>1890</dcterms:created>
    <dcterms:publisher>Project Gutenberg</dcterms:publisher>
    <dcterms:date opf:event='publication'>2009-05-31</dcterms:date>
    <dcterms:date opf:event='conversion'>2009-08-26T21:11:14Z</dcterms:date>
    <dcterms:rights>Public domain</dcterms:rights>
    <dcterms:source>29000-h.htm</dcterms:source>

    <meta name='cover' content='item0' />
  </metadata>
    """

        # OPF 2.0 v1.0 specifies to use the
        # Dublin Core Metadata Element Set, Version 1.1
        # http://dublincore.org/documents/2004/12/20/dces/
        # but that has been superseded by DCMI Metadata Terms
        # http://dublincore.org/documents/dcmi-terms/
        # we use NS.dc for now but should switch to NS.dcterms later

        dcterms = ElementMaker(nsmap=self.nsmap, namespace=str(NS.dc))

        if dc.publisher:
            self.metadata.append(dcterms.publisher(dc.publisher))
        if dc.rights:
            self.metadata.append(dcterms.rights(dc.rights))

        self.metadata.append(
            dcterms.identifier(dc.opf_identifier, {
                NS.opf.scheme: 'URI',
                'id': 'id'
            }))  # should be NS.xml.id

        # authors (marcrel aut/cre) become dc:creator, everybody else
        # becomes dc:contributor with their marcrel code as role
        for author in dc.authors:
            pretty_name = dc.make_pretty_name(author.name)
            if author.marcrel == 'aut' or author.marcrel == 'cre':
                self.metadata.append(
                    dcterms.creator(pretty_name,
                                    {NS.opf['file-as']: author.name}))
            else:
                self.metadata.append(
                    dcterms.contributor(
                        pretty_name, {
                            NS.opf.role: author.marcrel,
                            NS.opf['file-as']: author.name
                        }))

        # replace newlines with /
        title = re.sub(r'\s*[\r\n]+\s*', ' / ', dc.title)
        self.metadata.append(dcterms.title(title))

        for language in dc.languages:
            self.metadata.append(
                dcterms.language(language.id,
                                 {NS.xsi.type: 'dcterms:RFC4646'}))

        for subject in dc.subjects:
            self.metadata.append(dcterms.subject(subject.subject))

        if dc.created:
            self.metadata.append(
                dcterms.date(dc.created, {NS.opf.event: 'creation'}))

        if dc.release_date:
            self.metadata.append(
                dcterms.date(dc.release_date.isoformat(),
                             {NS.opf.event: 'publication'}))

        # conversion timestamp: now, in UTC
        self.metadata.append(
            dcterms.date(
                datetime.datetime.now(gg.UTC()).isoformat(),
                {NS.opf.event: 'conversion'}))

        # rewrite a local files path into a public PG url
        source = dc.source
        if hasattr(options.config, 'FILESDIR'):
            if source.startswith(options.config.FILESDIR):
                source = source[len(options.config.FILESDIR):]
                source = urllib.parse.urljoin(options.config.PGURL, source)

        self.metadata.append(dcterms.source(source))