Example #1
    def _webkit_command(self, html_url, pdf, outline=False, outline_file=None, page_num=None):
        m = [str(x) for x in self.margins]
        outline_args = ['--outline', '--outline-depth', '2'] * outline  # list * bool gives [] when outline is False
        if outline_file is not None:
            outline_args += ['--dump-outline', outline_file]

        page_num_args = []
        if page_num:
            footer_url, header_url = self.get_boilerplate(page_num)
            if footer_url is not None:
                page_num_args += ['--footer-html', footer_url]
            if header_url is not None:
                page_num_args += ['--header-html', header_url]

        greyscale_args = ['-g'] * self.grey_scale
        quiet_args = ['-q']
        cmd = ([config.WKHTMLTOPDF] +
               quiet_args +
               ['--page-width', str(self.width * POINT_2_MM),
                '--page-height', str(self.height * POINT_2_MM),
                '-T', m[0], '-R', m[1], '-B', m[2], '-L', m[3],
                #'--disable-smart-shrinking',
                '-d', '100',
                #'--zoom', '1.2',
                '--encoding', 'UTF-8',
                '--javascript-delay', '2000',
                ] +
               page_num_args +
               outline_args +
               greyscale_args +
               config.WKHTMLTOPDF_EXTRA_COMMANDS +
               [html_url, pdf])
        log(' '.join(cmd))
        return cmd
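The returned list can be handed straight to subprocess. A minimal sketch, assuming a hypothetical `book` instance of the class this method belongs to, with placeholder paths:

import subprocess

# 'book' is a hypothetical instance; the URLs/paths are placeholders
cmd = book._webkit_command('file:///tmp/book.html', '/tmp/book.pdf',
                           outline=True)
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = p.communicate()
if p.returncode != 0:
    log("wkhtmltopdf failed with %s: %s" % (p.returncode, err))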
Example #2
    def _parse_credits(self, force=False):
        # open the Credits chapter that has a list of authors for each chapter.
        # each chapter is listed thus (linebreaks added):
        #   <i>CHAPTER TITLE</i><br/>&copy; First Author 2007<br/>
        #   Modifications:<br/>Second Author 2007, 2008<br/>
        #   Third Author 2008<br/>Fourth Author 2008<br/><hr/>
        #
        # where "CHAPTER TITLE" is as appears in TOC.txt, and "X
        # Author" are the names TWiki has for authors.  So the thing
        # to do is look for the <i> tags and match them to the toc.
        #
        # the chapter title is not guaranteed unique (but usually is).
        if self.credits is not None and not force:
            log("not reloading metadata")
            return

        self.credits = {}
        self.contributors = set()
        self.titles = []

        credits_html = self.get_chapter_html('Credits', wrapped=True)
        try:
            tree = lxml.html.document_fromstring(credits_html, parser=utf8_html_parser)
        except UnicodeDecodeError, e:
            log("book isn't unicode! (%s)" %(e,))
            encoding = config.SERVER_DEFAULTS[self.server]['toc-encoding']
            parser = lxml.html.HTMLParser(encoding=encoding)
            tree = lxml.html.document_fromstring(credits_html, parser=parser)
Example #3
def parse_args(arg_validators):
    """Read and validate CGI or commandline arguments, putting the
    good ones into the returned dictionary.  Command line arguments
    should be in the form --title='A Book'.

    arg_validators is a dictionary mapping keys to either 1) functions
    that validate their values; or 2) tuples of such functions and
    default values.  The default value will itself be validated and
    used in the case that no relevant argument is given.
    """
    query = cgi.FieldStorage()
    options, args = gnu_getopt(sys.argv[1:], '', [x + '=' for x in arg_validators])
    options = dict(options)
    log("Starting request for %s" % (os.environ.get('REQUEST_URI'),))
    log(query, debug='STARTUP')
    data = {}
    for key, validator in arg_validators.items():
        if isinstance(validator, tuple):
            validator, default = validator
        else:
            default = None
        value = query.getfirst(key, options.get('--' + key, default))
        log('%s: %s' % (key, value), debug='STARTUP')
        if value is not None:
            if validator is not None and not validator(value):
                log("argument '%s' is not valid ('%s')" % (key, value))
                continue
            data[key] = value
    log("effective query is:", data)
    return data
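For illustration, a hypothetical arg_validators mapping and call; the keys and rules are invented, but the shape (bare callables, or (callable, default) tuples) is the one the docstring describes:

# hypothetical keys and validation rules
arg_validators = {
    'server': (lambda v: '/' not in v, config.DEFAULT_SERVER),
    'book': lambda v: v.replace('_', '').isalnum(),
    'isbn': lambda v: v.isdigit() and len(v) == 13,
}
data = parse_args(arg_validators)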
Example #4
def get_book_list(server):
    """Ask the server for a list of books.  Floss Manual TWikis keep such a list at
    /bin/view/TWiki/WebLeftBarWebsList?skin=text but it needs a bit of processing

    If BOOK_LIST_CACHE is non-zero, the book list won't be re-fetched
    in that many seconds, rather it will be read from disk.
    """
    if config.BOOK_LIST_CACHE:
        cache_name = os.path.join(config.CACHE_DIR, '%s.booklist' % server)
        if (os.path.exists(cache_name) and
            os.stat(cache_name).st_mtime + config.BOOK_LIST_CACHE > time.time()):
            f = open(cache_name)
            s = f.read()
            f.close()
            return s.split()

    url = config.CHAPTER_URL % (server, 'TWiki', 'WebLeftBarWebsList')
    #url = 'http://%s/bin/view/TWiki/WebLeftBarWebsList?skin=text' % server
    #XXX should use lxml
    log('getting booklist: %s' % url)
    s = url_fetch(url)
    items = sorted(x for x in re.findall(r'/bin/view/([\w/]+)/WebHome', s)
                   if x not in config.IGNORABLE_TWIKI_BOOKS)
    if config.BOOK_LIST_CACHE:
        f = open(cache_name, 'w')
        f.write('\n'.join(items))
        f.close()
    return items
Example #5
def parse_manifest(manifest, pwd):
    """
    Only contains <item>s; each <item> has id, href, and media-type.

    It includes 'toc.ncx', but not 'META-INF/container.xml' or the pbf
    file (i.e., the files needed to get this far).

    The manifest can specify fallbacks for unrecognised documents, but
    Espri does not use that (nor do any of the test epub files).

    <manifest>
    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
    <item id="WHume_NatureC01" href="Hume_NatureC01.html" media-type="application/xhtml+xml" />
    <item id="cover" href="cover.jpg" media-type="image/jpeg" />
    </manifest>
    """
    items = {}
    ns = '{%s}' % manifest.nsmap[None]

    for t in manifest.iterchildren(ns + 'item'):
        id = t.get('id')
        href = os.path.join(pwd, t.get('href'))
        if isinstance(href, unicode):
            log('damn unicode: %r' % href)
            log(etree.tostring(t))
        media_type = t.get('media-type')
        items[id] = (href, media_type)  #XXX does media-type matter?

    return items
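A runnable sketch against the sample manifest from the docstring; the xmlns is the standard OPF 2.0 namespace, and 'OPS' stands in for the content directory:

from lxml import etree

xml = ('<manifest xmlns="http://www.idpf.org/2007/opf">'
       '<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>'
       '<item id="cover" href="cover.jpg" media-type="image/jpeg"/>'
       '</manifest>')
items = parse_manifest(etree.fromstring(xml), 'OPS')
# items == {'ncx': ('OPS/toc.ncx', 'application/x-dtbncx+xml'),
#           'cover': ('OPS/cover.jpg', 'image/jpeg')}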
Example #6
    def _parse_credits(self, force=False):
        # open the Credits chapter that has a list of authors for each chapter.
        # each chapter is listed thus (linebreaks added):
        #   <i>CHAPTER TITLE</i><br/>&copy; First Author 2007<br/>
        #   Modifications:<br/>Second Author 2007, 2008<br/>
        #   Third Author 2008<br/>Fourth Author 2008<br/><hr/>
        #
        # where "CHAPTER TITLE" is as appears in TOC.txt, and "X
        # Author" are the names TWiki has for authors.  So the thing
        # to do is look for the <i> tags and match them to the toc.
        #
        # the chapter title is not guaranteed unique (but usually is).
        if self.credits is not None and not force:
            log("not reloading metadata")
            return

        self.credits = {}
        self.contributors = set()
        self.titles = []

        credits_html = self.get_chapter_html('Credits', wrapped=True)
        try:
            tree = lxml.html.document_fromstring(credits_html,
                                                 parser=utf8_html_parser)
        except UnicodeDecodeError, e:
            log("book isn't unicode! (%s)" % (e, ))
            encoding = get_server_defaults(self.server)['toc-encoding']
            parser = lxml.html.HTMLParser(encoding=encoding)
            tree = lxml.html.document_fromstring(credits_html, parser=parser)
Example #7
def get_chapter_breaks(points, pwd):
    # First go was overly complex, trying to guess which sections were
    # really chapters.  Now, every ncx navpoint is a chapter break.
    serial_points = []

    def serialise(p, depth):
        serial_points.append((depth, p))
        #if p['class']:
        #    log("found class=='%s' at depth %s" % (p['class'], depth))
        if not p.get('points'):
            return
        for child in p['points']:
            serialise(child, depth + 1)

    for p in points:
        serialise(p, 1)

    splits = {}
    for depth, p in serial_points:
        url, ID = p['content_src'], None
        url = os.path.join(pwd, url)
        if '#' in url:
            log("GOT a fragment! %s" % url)
            url, ID = url.split('#', 1)
        s = splits.setdefault(url, [])
        s.append((depth, ID, p))

    return serial_points, splits
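To show the two return values, a sketch with a hand-built navpoint tree; the 'content_src' and 'points' keys follow the code above, the values are invented:

points = [
    {'content_src': 'ch1.html', 'points': [
        {'content_src': 'ch1.html#history', 'points': []},
    ]},
    {'content_src': 'ch2.html', 'points': []},
]
serial_points, splits = get_chapter_breaks(points, 'OPS')
# serial_points is a flat [(depth, point), ...] list;
# splits maps 'OPS/ch1.html' to [(1, None, <ch1>), (2, 'history', <sub>)]
# and 'OPS/ch2.html' to [(1, None, <ch2>)]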
Example #8
 def extract(expected, conv=_strip):
     line = lines.next()
     try:
         k, v = line.split(':', 1)
         if k == expected:
             return conv(v)
     except ValueError:
         log("trouble with line %r" %line)
Example #9
def font_links():
    """Links to various example pdfs."""
    links = []
    for script in os.listdir(config.FONT_EXAMPLE_SCRIPT_DIR):
        if not script.isalnum():
            log("warning: font-sample %s won't work; skipping" % script)
            continue
        links.append('<a href="%s?script=%s">%s</a>' % (config.FONT_LIST_URL, script, script))
    return links
Example #10
def get_default_css(server=config.DEFAULT_SERVER, mode='book'):
    """Get the default CSS text for the selected server"""
    log(server)
    cssfile = url2path(get_server_defaults(server)['css-%s' % mode])
    log(cssfile)
    f = open(cssfile)
    s = f.read()
    f.close()
    return s
Example #11
def get_default_css(server=config.DEFAULT_SERVER, mode='book'):
    """Get the default CSS text for the selected server"""
    log(server)
    cssfile = url2path(config.SERVER_DEFAULTS[server]['css-%s' % mode])
    log(cssfile)
    f = open(cssfile)
    s = f.read()
    f.close()
    return s
Example #12
def output_blob_and_shut_up(blob, content_type="application/octet-stream", filename=None):
    print 'Content-type: %s\nContent-length: %s' % (content_type, len(blob))
    if filename is not None:
        print 'Content-Disposition: attachment; filename="%s"' % filename
    print
    print blob
    sys.stdout.flush()
    devnull = open('/dev/null', 'w')
    os.dup2(devnull.fileno(), sys.stdout.fileno())
    log(sys.stdout)
Example #13
 def _loadtree(self, html):
     try:
         try:
             self.tree = lxml.html.document_fromstring(html, parser=self.parser)
         except UnicodeError, e:
             log("failed to parse tree as unicode, got %s %r" % (e, e), "trying again using default parser")
             self.tree = lxml.html.document_fromstring(html)
     except etree.XMLSyntaxError, e:
         log("Could not parse html file %r, string %r... exception %s" % (self.name, html[:40], e))
         self.tree = empty_html_tree()
Example #14
 def __init__(self, book, server, bookname=None):
     if bookname is None:
         bookname = make_book_name(book, server, '.zip')
     log("*** Extracting TWiki book %s ***" % bookname)
     self.bookname = bookname
     self.book = book
     self.server = server
     self.workdir = tempfile.mkdtemp(prefix=bookname, dir=config.TMPDIR)
     os.chmod(self.workdir, 0755)
     #probable text direction
     self.dir = guess_text_dir(self.server, self.book)
Example #15
def make_navpoint(parent, n, title, url):
    """Make the actual navpoint node"""
    log((parent, n, title, url))
    if url is None:
        url = ''
    navpoint = etree.SubElement(parent, 'navPoint',
                                id=(NAVPOINT_ID_TEMPLATE % (n - 1)),
                                playOrder=str(n))
    add_ncxtext(navpoint, 'navLabel', title)
    etree.SubElement(navpoint, 'content', src=url)
    return navpoint
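A sketch of a call, assuming the surrounding module supplies NAVPOINT_ID_TEMPLATE and add_ncxtext (both referenced above); the navMap here is bare, without the rest of the ncx boilerplate:

from lxml import etree

navmap = etree.Element('navMap')
navpoint = make_navpoint(navmap, 1, 'Introduction', 'intro.html')
print etree.tostring(navmap, pretty_print=True)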
Example #16
 def get_chapter_html(self, chapter, wrapped=False):
     url = config.CHAPTER_URL % (self.server, self.book, chapter)
     log('getting chapter: %s' % url)
     html = url_fetch(url)
     if wrapped:
         html = CHAPTER_TEMPLATE % {
             'title': '%s: %s' % (self.book, chapter),
             'text': html,
             'dir': self.dir
         }
     return html
Example #17
def parse_metadata(metadata):
    """metadata is an OPF metadata node, as defined at
    http://www.idpf.org/2007/opf/OPF_2.0_final_spec.html#Section2.2
    (or a dc-metadata or x-metadata child thereof).

    """
    # the node probably has at least 'dc', 'opf', and None namespace
    # prefixes.  None and opf probably map to the same thing. 'dc' is
    # Dublin Core.
    nsmap = metadata.nsmap
    nstags = dict((k, '{%s}' % v) for k, v in nsmap.iteritems())
    default_ns = nstags[None]

    # Collect element data in namespace-bins, and map prefixes to
    # those bins for convenience
    nsdict = dict((v, {}) for v in nsmap.values())

    def add_item(ns, tag, value, extra):
        #any key can be duplicate, so store in a list
        if ns not in nsdict:
            nsdict[ns] = {}
        values = nsdict[ns].setdefault(tag, [])
        values.append((value, extra))

    for t in metadata.iterdescendants():
        #look for special OPF tags
        if t.tag == default_ns + 'meta':
            #meta tags <meta name="" content="" />
            name = t.get('name')
            content = t.get('content')
            others = dict(
                (k, v) for k, v in t.items() if k not in ('name', 'content'))
            if ':' in name:
                # the meta tag is using xml namespaces in attribute values.
                prefix, name = name.split(':', 1)
            else:
                prefix = None
            add_item(t.nsmap[prefix], name, content, others)
            continue

        if t.tag in (default_ns + 'dc-metadata', default_ns + 'x-metadata'):
            # Subelements of these deprecated elements are in either
            # DC or non-DC namespace (respectively).  Of course, this
            # is true of any element anyway, so it is sufficient to
            # ignore this (unless we want to cause pedantic errors).
            log("found a live %s tag; descending into but otherwise ignoring it"
                % t.tag[len(default_ns):])
            continue

        tag = t.tag[t.tag.rfind('}') + 1:]
        add_item(t.nsmap[t.prefix], tag, t.text,
                 tuple((k.replace(default_ns, ''), v) for k, v in t.items()))

    return nsdict
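A small self-contained run against a metadata node like the one the docstring describes; the OPF and Dublin Core namespace URIs are the standard ones:

from lxml import etree

DC = 'http://purl.org/dc/elements/1.1/'
xml = ('<metadata xmlns="http://www.idpf.org/2007/opf" xmlns:dc="%s">'
       '<dc:title>A Book</dc:title>'
       '<meta name="cover" content="cover-image"/>'
       '</metadata>') % DC
nsdict = parse_metadata(etree.fromstring(xml))
# nsdict[DC]['title'] == [('A Book', ())]
# nsdict['http://www.idpf.org/2007/opf']['cover'] == [('cover-image', {})]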
Example #18
def parse_extracted_outline(outline_file, depth=config.CONTENTS_DEPTH):
    '''Extract outline data from an XML file structured as follows:

      <?xml version="1.0" encoding="UTF-8"?>
      <outline xmlns="http://code.google.com/p/wkhtmltopdf/outline">
        <item title="" page="0" link="__WKANCHOR_0" backLink="__WKANCHOR_1">
          <item title="1. ANONYMOUS" page="2" link="__WKANCHOR_2" backLink="__WKANCHOR_3"/>
          <item title="2. HOW THIS BOOK IS WRITTEN" page="4" link="__WKANCHOR_4" backLink="__WKANCHOR_5">
            <item title="WHAT IS A BOOK SPRINT?" page="4" link="__WKANCHOR_6" backLink="__WKANCHOR_7"/>
            <item title="HOW TO WRITE THIS BOOK" page="11" link="__WKANCHOR_c" backLink="__WKANCHOR_d">
              <item title="1. Register" page="11" link="__WKANCHOR_e" backLink="__WKANCHOR_f"/>
              <item title="2. Contribute!" page="11" link="__WKANCHOR_g" backLink="__WKANCHOR_h"/>
            </item>
          </item>
          <item title="3. ASSUMPTIONS" page="13" link="__WKANCHOR_i" backLink="__WKANCHOR_j">
            <item title="WHAT THIS BOOK IS NOT..." page="13" link="__WKANCHOR_k" backLink="__WKANCHOR_l"/>
          </item>
        </item>
      </outline>

     In other words:

     <!ELEMENT outline (item*)>
     <!ELEMENT item (item*)>
     and item has the following attributes:
       title:    url-escaped string
       page:     page number
       link:     link to here from the TOC
       backLink: link back to the TOC

    Title is utf-8 text that has been percent-encoded as described in
    section 2.1 of RFC 3986.
    '''
    from lxml import etree
    f = open(outline_file, 'r')
    tree = etree.parse(f)
    f.close()

    contents = []

    def parse_item(e, depth):
        title = urllib.unquote(e.get('title')).strip()
        pageno = int(e.get('page'))
        if depth:
            contents.append((title, depth, pageno))
        for x in e.iterchildren(config.WKTOCNS + 'item'):
            parse_item(x, depth + 1)

    for x in tree.getroot().iterchildren(config.WKTOCNS + 'item'):
        parse_item(x, 0)

    log(contents)
    return contents
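A sketch that feeds the parser a tiny outline file; it assumes config.WKTOCNS is the Clark-notation form of the namespace shown above, i.e. '{http://code.google.com/p/wkhtmltopdf/outline}':

import tempfile

xml = '''<?xml version="1.0" encoding="UTF-8"?>
<outline xmlns="http://code.google.com/p/wkhtmltopdf/outline">
  <item title="" page="0">
    <item title="1.%20ANONYMOUS" page="2"/>
  </item>
</outline>'''
f = tempfile.NamedTemporaryFile(suffix='.xml', delete=False)
f.write(xml)
f.close()
contents = parse_extracted_outline(f.name)
# contents == [('1. ANONYMOUS', 1, 2)]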
Example #19
def parse_outline(pdf, level_threshold, debug_filename=None):
    """Create a structure reflecting the outline of a PDF.
    A chapter heading looks like this:

    BookmarkTitle: 2. What is sound?
    BookmarkLevel: 1
    BookmarkPageNumber: 3
    """
    cmd = ('pdftk', pdf, 'dump_data')
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    outline, err = p.communicate()
    #log("OUTLINE:", outline)
    if debug_filename is not None:
        try:
            f = open(debug_filename, 'w')
            f.write(outline)
            f.close()
        except IOError:
            log("could not write to %s!" % debug_filename)

    lines = (x.strip() for x in outline.split('\n') if x.strip())
    contents = []

    def _strip(s):
        return s.strip(config.WHITESPACE_AND_NULL)

    def extract(expected, conv=_strip):
        line = lines.next()
        try:
            k, v = line.split(':', 1)
            if k == expected:
                return conv(v)
        except ValueError:
            log("trouble with line %r" %line)

    #There are a few useless variables, then the pagecount, then the contents.
    #The pagecount is useful, so pick it up first.
    page_count = None
    while page_count is None:
        page_count = extract('NumberOfPages', int)

    try:
        while True:
            title = extract('BookmarkTitle')
            if title is not None:
                level = extract('BookmarkLevel', int)
                pagenum = extract('BookmarkPageNumber', int)
                if level <= level_threshold and None not in (level, pagenum):
                    contents.append((title, level, pagenum))
    except StopIteration:
        pass

    return contents, page_count
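Typical use is just unpacking the pair; the PDF path here is hypothetical, and pdftk must be on the PATH:

contents, page_count = parse_outline('/tmp/book.pdf', 2)
for title, level, pagenum in contents:
    print '%s%s .... %s' % ('  ' * (level - 1), title, pagenum)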
Example #20
def _find_tag(doc, tag):
    # log(lxml.etree.tostring(doc, encoding='utf-8', method='html').replace('&#13;', ''))
    try:
        doc = doc.getroot()
    except AttributeError:
        pass
    if doc.nsmap:
        try:
            return doc.iter(XHTMLNS + tag).next()
        except StopIteration:
            log("doc had nsmap %s, but did not seem to be xhtml (looking for %s)" % (doc.nsmap, tag))
    return doc.iter(tag).next()
Example #21
def localise_local_links(doc, old_filename=''):
    """Xinha produces document local links (e.g., for footnotes) in
    the form 'filename#local_anchor', which are broken if the filename
    changes.  In practice the filename changes at least twice during
    processing -- once from 'filename' to 'filename.html', when Booki
    makes the bookizip, and again to 'body.html' when all the chapters
    get concatenated.

    Additionally, Xinha will reuse the same IDs in each chapter, so
    when the chapters are all concatenated the IDs are no longer
    unique and the links won't work properly.

    This function will replace links in the form 'filename#id' with
    '#filename_id', and change the target IDs accordingly.  It avoids
    altering the ID of elements that aren't locally linked, as these
    might be used for CSS or external links.
    """
    old_prefix = (old_filename + '#').encode('utf-8')
    targets = []
    transformed_ids = {}

    #loop 1: find links and shortlist elements with ID
    for e in doc.iter():
        if e.tag == 'a':
            href = e.get('href')
            if href and href.startswith(old_prefix):
                old_id = href[len(old_prefix):]
                new_id = '%s_%s' % (old_filename, old_id)
                e.set('href', '#' + new_id)
                transformed_ids[old_id] = new_id
            name = e.get('name')
            if name:
                targets.append(e)
                continue
        ID = e.get('id')
        if ID is not None:
            targets.append(e)

    log("transforming these IDs in chapter %s: %s" %
        (old_filename, transformed_ids))

    for e in targets:
        old_id = e.get('id')
        if old_id is None and e.tag == 'a':
            old_id = e.get('name')
        if old_id is None:
            continue
        if old_id in transformed_ids:
            new_id = transformed_ids[old_id]
            e.set('id', new_id)
            if e.tag == 'a':
                e.set('name', new_id)
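A minimal before/after demonstration, with a hypothetical chapter filename:

import lxml.html

doc = lxml.html.fromstring(
    '<div><a href="intro#fn1">see the note</a>'
    '<p id="fn1">the note itself</p></div>')
localise_local_links(doc, 'intro')
# the link is now href="#intro_fn1" and the <p> id is "intro_fn1"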
Example #22
    def parse_opf(self):
        """
        The opf file is arranged like this:
        <package>
        <metadata />
        <manifest />
        <spine />
        <guide />
        </package>

        Metadata, manifest and spine are parsed in separate helper
        functions.
        """
        self.opfdir = os.path.dirname(self.opf_file)  # needed for manifest parsing
        tree = self.gettree(self.opf_file)
        root = tree.getroot()
        metadata = root.find(OPFNS + 'metadata')
        manifest = root.find(OPFNS + 'manifest')
        spine = root.find(OPFNS + 'spine')

        self.metadata = parse_metadata(metadata)
        self.manifest = parse_manifest(manifest, self.opfdir)
        # mapping of filenames to new filenames.  This needs to be
        # done early to detect clashes (e.g. '/images/hello.jpg' and
        # '/images/big/hello.jpg' would both reduce to
        # 'static/hello.jpg').
        self.media_map = {}
        for k, v in self.manifest.items():
            fn, mimetype = v
            if isinstance(fn, unicode):
                log('Stupid unicode: %r' % fn)

            if mimetype not in MARKUP_TYPES:
                oldfn = fn
                if '/' in fn:
                    fn = fn.rsplit('/', 1)[1]
                while fn in self.media_map.values():
                    fn = '_' + fn
                newfn = 'static/%s' % fn
                self.media_map[oldfn] = newfn

        ncxid, self.spine = parse_spine(spine)
        self.ncxfile = self.manifest[ncxid][0]

        #there is also an optional guide section, which we ignore
        guide = root.find(OPFNS + 'guide')
        if guide is not None:
            self.guide = parse_guide(guide)
        else:
            self.guide = None
Example #23
 def load(self, src):
     # Zip is a variable format, and zipfile is limited.  If that
     # becomes a problem we will have to use an `unzip` subprocess,
     # but it hasn't been so far.
     if isinstance(src, str):
         # Should end with PK<05><06> + 18 more.
         # Some zips contain 'comments' after that, which breaks ZipFile
         zipend = src.rfind("PK\x05\x06") + 22
         if len(src) != zipend:
             log("Bad zipfile?")
             src = src[:zipend]
         src = StringIO(src)
     self.zip = zipfile.ZipFile(src, "r", compression=zipfile.ZIP_DEFLATED, allowZip64=True)
     self.names = self.zip.namelist()
     self.info = self.zip.infolist()
     self.origin = src
Example #24
def get_book_list(server):
    """Ask the server for a list of books.  Booki offers this list as
    json at /list-books.json.
    """
    url = 'http://%s/list-books.json' % server
    log('getting booklist: %s' % url)
    f = urlopen(url)
    books = json.load(f)

    items = []
    for book in books:
        url = book['fields']['url_title']
        title = book['fields']['title']
        items.append((url, title))

    f.close()
    return items
Example #25
    def find_language(self):
        opflang = [x[0].lower() for x in self.metadata.get(DC, {}).get("language", ())]

        # XXX Should the ncx language enter into it? Being xml:lang,
        # it is in theory just the language of the ncx document
        # itself.  But if the metadata lacks language, should it be
        # used instead? At present, NO.
        # ncxlang = self.ncxdata['headers'].get('lang', ())

        # XXX also, for now, ignoring case of badly formed language
        # codes, conflicting or supplementary languages, etc.
        opflang = [x for x in opflang if x not in ("und", "")]
        if not opflang:
            return None
        if len(set(opflang)) > 1:
            log("%s metadata has more than one language: %s -- using first one" % (self.origin, opflang))
        return opflang[0]
Example #26
def espri(epuburl, bookid, src_id=None):
    """Make a bookizip from the epub at <epuburl> and save it as
    <bookid>.zip."""
    log("starting espri", epuburl, bookid)
    f = urlopen(epuburl)
    s = f.read()
    f.close()
    e = epub.Epub()
    e.load(s)
    if src_id is not None:
        #so that booki knows where the book came from, so e.g. archive.org can find it again
        e.register_source_id(src_id)
    e.parse_meta()
    e.parse_opf()
    e.parse_ncx()
    zipfile = '%s/%s.zip' % (config.BOOKI_BOOK_DIR, bookid)
    e.make_bookizip(zipfile)
Example #27
def make_ncx(toc, filemap, ID, title):
    log(filemap)
    tree = etree.parse(StringIO(BARE_NCX))
    root = tree.getroot()
    head = etree.SubElement(root, 'head')
    add_ncxtext(root, 'docTitle', title)
    navmap = etree.SubElement(root, 'navMap')
    counter, maxdepth = 0, 0
    for subtoc in toc:
        counter, maxdepth = write_navtree(navmap, subtoc, counter, 1, maxdepth, filemap)

    for name, content in (('dtb:uid', ID),
                          ('dtb:depth', str(maxdepth)),
                          ('dtb:totalPageCount', '0'),
                          ('dtb:maxPageNumber', '0')
                          ):
        etree.SubElement(head, 'meta', name=name, content=content)
    return etree.tostring(tree, pretty_print=True, encoding='utf-8')
Example #28
    def fetch_if_necessary(self, url, target=None, use_cache=True):
        if url in self._fetched:
            return self._fetched[url]

        if target is None:
            target = url_to_filename(url, self.prefix)

        if use_cache and os.path.exists(self.cache_dir + target):
            log("used cache for %s" % target)
            return target

        try:
            data = url_fetch(url)
        except HTTPError, e:
            # if it is missing, assume it will be missing every time
            # after, otherwise, you can get into endless waiting
            self._fetched[url] = None
            log("Wanting '%s', got error %s" % (url, e))
            return None
Example #29
    def make_barcode_pdf(self, isbn, pdf, corner='br'):
        """Put an ISBN barcode in a corner of a single blank page."""

        position = '%s,%s,%s,%s,%s' % (corner, self.width, self.height, self.side_margin, self.bottom_margin)
        cmd1 = [config.BOOKLAND,
                '--position', position,
                str(isbn)]
        cmd2 = ['ps2pdf',
                '-dFIXEDMEDIA',
                '-dDEVICEWIDTHPOINTS=%s' % self.width,
                '-dDEVICEHEIGHTPOINTS=%s' % self.height,
                '-', pdf]

        p1 = Popen(cmd1, stdout=PIPE)
        p2 = Popen(cmd2, stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
        out, err = p2.communicate()

        log('ran:\n%s | %s' % (' '.join(cmd1), ' '.join(cmd2)))
        log("return: %s and %s \nstdout:%s \nstderr:%s" % (p1.poll(), p2.poll(), out, err))
Example #30
        def write_toc(point, section):
            tocpoint = {}
            title = find_good_label(point['labels'], lang),  # trailing comma: title is a 1-tuple
            if title and title[0]:
                tocpoint['title'] = title[0]
            ID = point['id']
            if ID in spine:
                tocpoint['url'] = self.manifest.get(ID, ID + '.html')
                while deferred_urls:
                    tp = deferred_urls.pop()
                    tp['url'] = tocpoint['url']
                    log('%r has deferred url: %r' % (tp['title'], tp['url']))
            else:
                deferred_urls.append(tocpoint)
            if point['points']:
                tocpoint['children'] = []
                for child in point['points']:
                    write_toc(child, tocpoint['children'])

            section.append(tocpoint)
Example #31
def toc_iterator(server, book):
    """TOC.txt has 3 lines per chapter.  Fetch them and yield them in
    triples.
    """
    url = config.TOC_URL % (server, book)
    log('getting TOC: %s' % url)
    f = urlopen(url)
    encoding = config.SERVER_DEFAULTS[server]['toc-encoding']
    while True:
        try:
            if encoding is not None:
                yield TocItem(f.next().decode(encoding).strip().encode('utf-8'),
                              f.next().decode(encoding).strip().encode('utf-8'),
                              f.next().decode(encoding).strip().encode('utf-8'))
            else:
                yield TocItem(f.next().strip(),
                              f.next().strip(),
                              f.next().strip())
        except StopIteration:
            break
    f.close()
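Usage is plain iteration; the server and book names here are hypothetical:

for item in toc_iterator('en.flossmanuals.net', 'FirefoxManual'):
    print item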
Example #32
def wikibooks_espri(wiki_url):
    """Wikibooks import using the wikibooks2epub script by Jan Gerber
    to first convert the wikibook to an epub, which can then be turned
    into a bookizip via the espri function.
    """
    os.environ['oxCACHE'] = os.path.abspath(config.WIKIBOOKS_CACHE)
    os.environ['LANG'] = 'en_NZ.UTF-8'
    tainted_name = unquote(os.path.basename(urlsplit(wiki_url).path))
    bookid = "%s-%s" % (super_bleach(tainted_name),
                        time.strftime('%Y.%m.%d-%H.%M.%S'))
    workdir = tempfile.mkdtemp(prefix=bookid, dir=os.path.join(config.DATA_ROOT, "tmp"))
    os.chmod(workdir, 0755)
    epub_file = os.path.join(workdir, bookid + '.epub')
    epub_url = path2url(epub_file)

    #the wikibooks importer is a separate process, so run that, then collect the epub.
    cmd = [config.TIMEOUT_CMD, config.WIKIBOOKS_TIMEOUT,
           config.WIKIBOOKS_CMD,
           '-i', wiki_url,
           '-o', epub_file
           ]
    log(cmd)
    log(os.environ)
    log(os.getcwd())

    try:
        check_call(cmd)
    except CalledProcessError, e:
        if e.returncode == 137:
            raise TimeoutError('Wikibooks took too long (over %s seconds)' % config.WIKIBOOKS_TIMEOUT)
        raise
Example #33
def toc_iterator(server, book):
    """TOC.txt has 3 lines per chapter.  Fetch them and yield them in
    triples.
    """
    url = config.TOC_URL % (server, book)
    log('getting TOC: %s' % url)
    f = urlopen(url)
    encoding = get_server_defaults(server)['toc-encoding']
    while True:
        try:
            if encoding is not None:
                yield TocItem(
                    f.next().decode(encoding).strip().encode('utf-8'),
                    f.next().decode(encoding).strip().encode('utf-8'),
                    f.next().decode(encoding).strip().encode('utf-8'))
            else:
                yield TocItem(f.next().strip(),
                              f.next().strip(),
                              f.next().strip())
        except StopIteration:
            break
    f.close()