def _webkit_command(self, html_url, pdf, outline=False, outline_file=None, page_num=None):
    m = [str(x) for x in self.margins]
    outline_args = ['--outline', '--outline-depth', '2'] * outline
    if outline_file is not None:
        outline_args += ['--dump-outline', outline_file]

    page_num_args = []
    if page_num:
        footer_url, header_url = self.get_boilerplate(page_num)
        if footer_url is not None:
            page_num_args += ['--footer-html', footer_url]
        if header_url is not None:
            page_num_args += ['--header-html', header_url]

    greyscale_args = ['-g'] * self.grey_scale
    quiet_args = ['-q']
    cmd = ([config.WKHTMLTOPDF] +
           quiet_args +
           ['--page-width', str(self.width * POINT_2_MM),
            '--page-height', str(self.height * POINT_2_MM),
            '-T', m[0], '-R', m[1], '-B', m[2], '-L', m[3],
            #'--disable-smart-shrinking',
            '-d', '100',
            #'--zoom', '1.2',
            '--encoding', 'UTF-8',
            '--javascript-delay', '2000',
            ] +
           page_num_args + outline_args + greyscale_args +
           config.WKHTMLTOPDF_EXTRA_COMMANDS +
           [html_url, pdf])
    log(' '.join(cmd))
    return cmd
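# Side note (illustrative, not part of the original module): the list-multiplication
# idiom in _webkit_command relies on bool being a subclass of int, so multiplying a
# list of flags by True keeps the whole group and multiplying by False drops it.
_kept_flags = ['--outline', '--outline-depth', '2'] * True    # -> all three flags
_dropped_flags = ['-g'] * False                               # -> []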
def _parse_credits(self, force=False):
    # open the Credits chapter that has a list of authors for each chapter.
    # each chapter is listed thus (linebreaks added):
    #
    #   <i>CHAPTER TITLE</i><br/>© First Author 2007<br/>
    #   Modifications:<br/>Second Author 2007, 2008<br/>
    #   Third Author 2008<br/>Fourth Author 2008<br/><hr/>
    #
    # where "CHAPTER TITLE" is as appears in TOC.txt, and "X Author" are
    # the names TWiki has for authors.  So the thing to do is look for
    # the <i> tags and match them to the toc.
    #
    # the chapter title is not guaranteed unique (but usually is).
    if self.credits is not None and not force:
        log("not reloading metadata")
        return

    self.credits = {}
    self.contributors = set()
    self.titles = []

    credits_html = self.get_chapter_html('Credits', wrapped=True)
    try:
        tree = lxml.html.document_fromstring(credits_html, parser=utf8_html_parser)
    except UnicodeDecodeError, e:
        log("book isn't unicode! (%s)" % (e,))
        encoding = config.SERVER_DEFAULTS[self.server]['toc-encoding']
        parser = lxml.html.HTMLParser(encoding=encoding)
        tree = lxml.html.document_fromstring(credits_html, parser=parser)
def parse_args(arg_validators):
    """Read and validate CGI or commandline arguments, putting the
    good ones into the returned dictionary.  Command line arguments
    should be in the form --title='A Book'.

    arg_validators is a dictionary mapping keys to either 1) functions
    that validate their values; or 2) tuples of such functions and
    default values.  The default value will itself be validated and
    used in the case that no relevant argument is given.
    """
    query = cgi.FieldStorage()
    options, args = gnu_getopt(sys.argv[1:], '', [x + '=' for x in arg_validators])
    options = dict(options)
    log("Starting request for %s" % (os.environ.get('REQUEST_URI'),))
    log(query, debug='STARTUP')

    data = {}
    for key, validator in arg_validators.items():
        if isinstance(validator, tuple):
            validator, default = validator
        else:
            default = None
        value = query.getfirst(key, options.get('--' + key, default))
        log('%s: %s' % (key, value), debug='STARTUP')
        if value is not None:
            if validator is not None and not validator(value):
                log("argument '%s' is not valid ('%s')" % (key, value))
                continue
            data[key] = value

    log("effective query is:", data)
    return data
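# Illustrative sketch (the option names and validators here are assumptions, not the
# real set): parse_args wants a mapping from option name to either a validator
# callable, or a (validator, default) tuple whose default is itself validated.
EXAMPLE_ARG_VALIDATORS = {
    'book': lambda s: s.replace('_', '').isalnum(),        # e.g. --book=MyBook
    'max-age': (lambda s: s.lstrip('-').isdigit(), '-1'),  # validator plus default
}
# data = parse_args(EXAMPLE_ARG_VALIDATORS)
# -> e.g. {'book': 'MyBook', 'max-age': '-1'} when run as ./script --book=MyBook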
def get_book_list(server):
    """Ask the server for a list of books.  Floss Manual TWikis keep
    such a list at /bin/view/TWiki/WebLeftBarWebsList?skin=text but
    it needs a bit of processing.

    If BOOK_LIST_CACHE is non-zero, the book list won't be re-fetched
    for that many seconds; instead it will be read from disk.
    """
    if config.BOOK_LIST_CACHE:
        cache_name = os.path.join(config.CACHE_DIR, '%s.booklist' % server)
        if (os.path.exists(cache_name) and
            os.stat(cache_name).st_mtime + config.BOOK_LIST_CACHE > time.time()):
            f = open(cache_name)
            s = f.read()
            f.close()
            return s.split()

    url = config.CHAPTER_URL % (server, 'TWiki', 'WebLeftBarWebsList')
    #url = 'http://%s/bin/view/TWiki/WebLeftBarWebsList?skin=text' % server
    #XXX should use lxml
    log('getting booklist: %s' % url)
    s = url_fetch(url)
    items = sorted(x for x in re.findall(r'/bin/view/([\w/]+)/WebHome', s)
                   if x not in config.IGNORABLE_TWIKI_BOOKS)
    if config.BOOK_LIST_CACHE:
        f = open(cache_name, 'w')
        f.write('\n'.join(items))
        f.close()
    return items
def parse_manifest(manifest, pwd):
    """
    Only contains <item>s; each <item> has id, href, and media-type.

    It includes 'toc.ncx', but not 'META-INF/container.xml' or the
    opf file itself (i.e., the files needed to get this far).

    The manifest can specify fallbacks for unrecognised documents, but
    Espri does not use that (nor do any of the test epub files).

    <manifest>
      <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
      <item id="WHume_NatureC01" href="Hume_NatureC01.html" media-type="application/xhtml+xml" />
      <item id="cover" href="cover.jpg" media-type="image/jpeg" />
    </manifest>
    """
    items = {}
    ns = '{%s}' % manifest.nsmap[None]

    for t in manifest.iterchildren(ns + 'item'):
        id = t.get('id')
        href = os.path.join(pwd, t.get('href'))
        if isinstance(href, unicode):
            log('damn unicode: %r' % href)
            log(etree.tostring(t))
        media_type = t.get('media-type')
        items[id] = (href, media_type)  #XXX does media-type matter?

    return items
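# Hedged usage sketch for parse_manifest: hand it an lxml <manifest> element in the
# default OPF namespace plus a working directory.  The sample XML and the 'OEBPS'
# directory are invented for illustration.
from lxml import etree

_SAMPLE_MANIFEST = '''<manifest xmlns="http://www.idpf.org/2007/opf">
  <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
  <item id="cover" href="cover.jpg" media-type="image/jpeg" />
</manifest>'''

# parse_manifest(etree.fromstring(_SAMPLE_MANIFEST), 'OEBPS')
# -> {'ncx': ('OEBPS/toc.ncx', 'application/x-dtbncx+xml'),
#     'cover': ('OEBPS/cover.jpg', 'image/jpeg')}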
def _parse_credits(self, force=False):
    # open the Credits chapter that has a list of authors for each chapter.
    # each chapter is listed thus (linebreaks added):
    #
    #   <i>CHAPTER TITLE</i><br/>© First Author 2007<br/>
    #   Modifications:<br/>Second Author 2007, 2008<br/>
    #   Third Author 2008<br/>Fourth Author 2008<br/><hr/>
    #
    # where "CHAPTER TITLE" is as appears in TOC.txt, and "X Author" are
    # the names TWiki has for authors.  So the thing to do is look for
    # the <i> tags and match them to the toc.
    #
    # the chapter title is not guaranteed unique (but usually is).
    if self.credits is not None and not force:
        log("not reloading metadata")
        return

    self.credits = {}
    self.contributors = set()
    self.titles = []

    credits_html = self.get_chapter_html('Credits', wrapped=True)
    try:
        tree = lxml.html.document_fromstring(credits_html, parser=utf8_html_parser)
    except UnicodeDecodeError, e:
        log("book isn't unicode! (%s)" % (e,))
        encoding = get_server_defaults(self.server)['toc-encoding']
        parser = lxml.html.HTMLParser(encoding=encoding)
        tree = lxml.html.document_fromstring(credits_html, parser=parser)
def get_chapter_breaks(points, pwd):
    # First go was overly complex, trying to guess which sections were
    # really chapters.  Now, every ncx navpoint is a chapter break.
    serial_points = []

    def serialise(p, depth):
        serial_points.append((depth, p))
        #if p['class']:
        #    log("found class=='%s' at depth %s" % (p['class'], depth))
        if not p.get('points'):
            return
        for child in p['points']:
            serialise(child, depth + 1)

    for p in points:
        serialise(p, 1)

    splits = {}
    for depth, p in serial_points:
        url, ID = p['content_src'], None
        url = os.path.join(pwd, url)
        if '#' in url:
            log("GOT a fragment! %s" % url)
            url, ID = url.split('#', 1)
        s = splits.setdefault(url, [])
        s.append((depth, ID, p))

    return serial_points, splits
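# Hedged sketch of the navpoint dicts get_chapter_breaks expects (shape inferred from
# the code above; the filenames are made up): each point has a 'content_src' and an
# optional list of child 'points'.
_SAMPLE_POINTS = [
    {'content_src': 'ch01.html',
     'points': [{'content_src': 'ch01.html#intro', 'points': []}]},
    {'content_src': 'ch02.html', 'points': []},
]
# serial, splits = get_chapter_breaks(_SAMPLE_POINTS, 'OEBPS')
# serial -> [(1, <ch01>), (2, <ch01#intro>), (1, <ch02>)]
# splits -> {'OEBPS/ch01.html': [(1, None, <ch01>), (2, 'intro', <ch01#intro>)],
#            'OEBPS/ch02.html': [(1, None, <ch02>)]}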
def font_links():
    """Links to various example pdfs."""
    links = []
    for script in os.listdir(config.FONT_EXAMPLE_SCRIPT_DIR):
        if not script.isalnum():
            log("warning: font-sample %s won't work; skipping" % script)
            continue
        links.append('<a href="%s?script=%s">%s</a>' % (config.FONT_LIST_URL, script, script))
    return links
def get_default_css(server=config.DEFAULT_SERVER, mode='book'):
    """Get the default CSS text for the selected server"""
    log(server)
    cssfile = url2path(get_server_defaults(server)['css-%s' % mode])
    log(cssfile)
    f = open(cssfile)
    s = f.read()
    f.close()
    return s
def get_default_css(server=config.DEFAULT_SERVER, mode='book'):
    """Get the default CSS text for the selected server"""
    log(server)
    cssfile = url2path(config.SERVER_DEFAULTS[server]['css-%s' % mode])
    log(cssfile)
    f = open(cssfile)
    s = f.read()
    f.close()
    return s
def output_blob_and_shut_up(blob, content_type="application/octet-stream", filename=None):
    print 'Content-type: %s\nContent-length: %s' % (content_type, len(blob))
    if filename is not None:
        print 'Content-Disposition: attachment; filename="%s"' % filename
    print
    print blob
    sys.stdout.flush()
    devnull = open('/dev/null', 'w')
    os.dup2(devnull.fileno(), sys.stdout.fileno())
    log(sys.stdout)
def _loadtree(self, html):
    try:
        try:
            self.tree = lxml.html.document_fromstring(html, parser=self.parser)
        except UnicodeError, e:
            log("failed to parse tree as unicode, got %s %r" % (e, e),
                "trying again using default parser")
            self.tree = lxml.html.document_fromstring(html)
    except etree.XMLSyntaxError, e:
        log("Could not parse html file %r, string %r... exception %s" %
            (self.name, html[:40], e))
        self.tree = empty_html_tree()
def __init__(self, book, server, bookname=None):
    if bookname is None:
        bookname = make_book_name(book, server, '.zip')
    log("*** Extracting TWiki book %s ***" % bookname)

    self.bookname = bookname
    self.book = book
    self.server = server
    self.workdir = tempfile.mkdtemp(prefix=bookname, dir=config.TMPDIR)
    os.chmod(self.workdir, 0755)

    #probable text direction
    self.dir = guess_text_dir(self.server, self.book)
def make_navpoint(parent, n, title, url):
    """Make the actual navpoint node"""
    log((parent, n, title, url))
    if url is None:
        url = ''
    navpoint = etree.SubElement(parent, 'navPoint',
                                id=(NAVPOINT_ID_TEMPLATE % (n - 1)),
                                playOrder=str(n))
    add_ncxtext(navpoint, 'navLabel', title)
    etree.SubElement(navpoint, 'content', src=url)
    return navpoint
def get_chapter_html(self, chapter, wrapped=False):
    url = config.CHAPTER_URL % (self.server, self.book, chapter)
    log('getting chapter: %s' % url)
    html = url_fetch(url)
    if wrapped:
        html = CHAPTER_TEMPLATE % {
            'title': '%s: %s' % (self.book, chapter),
            'text': html,
            'dir': self.dir
        }
    return html
def parse_metadata(metadata):
    """metadata is an OPF metadata node, as defined at
    http://www.idpf.org/2007/opf/OPF_2.0_final_spec.html#Section2.2
    (or a dc-metadata or x-metadata child thereof).
    """
    # the node probably has at least 'dc', 'opf', and None namespace
    # prefixes.  None and opf probably map to the same thing.  'dc' is
    # Dublin Core.
    nsmap = metadata.nsmap
    nstags = dict((k, '{%s}' % v) for k, v in nsmap.iteritems())
    default_ns = nstags[None]

    # Collect element data in namespace-bins, and map prefixes to
    # those bins for convenience
    nsdict = dict((v, {}) for v in nsmap.values())

    def add_item(ns, tag, value, extra):
        #any key can be duplicate, so store in a list
        if ns not in nsdict:
            nsdict[ns] = {}
        values = nsdict[ns].setdefault(tag, [])
        values.append((value, extra))

    for t in metadata.iterdescendants():
        #look for special OPF tags
        if t.tag == default_ns + 'meta':
            #meta tags <meta name="" content="" />
            name = t.get('name')
            content = t.get('content')
            others = dict((k, v) for k, v in t.items() if k not in ('name', 'content'))
            if ':' in name:
                # the meta tag is using xml namespaces in attribute values.
                prefix, name = name.split(':', 1)
            else:
                prefix = None
            add_item(t.nsmap[prefix], name, content, others)
            continue

        if t.tag in (default_ns + 'dc-metadata', default_ns + 'x-metadata'):
            # Subelements of these deprecated elements are in either
            # DC or non-DC namespace (respectively).  Of course, this
            # is true of any element anyway, so it is sufficient to
            # ignore this (unless we want to cause pedantic errors).
            log("found a live %s tag; descending into but otherwise ignoring it"
                % t.tag[len(default_ns):])
            continue

        tag = t.tag[t.tag.rfind('}') + 1:]
        add_item(t.nsmap[t.prefix], tag, t.text,
                 tuple((k.replace(default_ns, ''), v) for k, v in t.items()))

    return nsdict
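# Hedged usage sketch for parse_metadata: a minimal OPF <metadata> block (contents
# invented for illustration).  The result is keyed by namespace URI, then by tag,
# with each value a list of (text, extra-attributes) pairs.
from lxml import etree

_SAMPLE_METADATA = '''<metadata xmlns="http://www.idpf.org/2007/opf"
    xmlns:dc="http://purl.org/dc/elements/1.1/">
  <dc:title>An Example Book</dc:title>
  <dc:language>en</dc:language>
  <meta name="cover" content="cover-image" />
</metadata>'''

# md = parse_metadata(etree.fromstring(_SAMPLE_METADATA))
# md['http://purl.org/dc/elements/1.1/']['title'] -> [('An Example Book', ())]
# md['http://www.idpf.org/2007/opf']['cover']     -> [('cover-image', {})]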
def parse_extracted_outline(outline_file, depth=config.CONTENTS_DEPTH):
    '''Extract outline data from an XML file structured as follows:

    <?xml version="1.0" encoding="UTF-8"?>
    <outline xmlns="http://code.google.com/p/wkhtmltopdf/outline">
      <item title="" page="0" link="__WKANCHOR_0" backLink="__WKANCHOR_1">
        <item title="1. ANONYMOUS" page="2" link="__WKANCHOR_2" backLink="__WKANCHOR_3"/>
        <item title="2. HOW THIS BOOK IS WRITTEN" page="4" link="__WKANCHOR_4" backLink="__WKANCHOR_5">
          <item title="WHAT IS A BOOK SPRINT?" page="4" link="__WKANCHOR_6" backLink="__WKANCHOR_7"/>
          <item title="HOW TO WRITE THIS BOOK" page="11" link="__WKANCHOR_c" backLink="__WKANCHOR_d">
            <item title="1. Register" page="11" link="__WKANCHOR_e" backLink="__WKANCHOR_f"/>
            <item title="2. Contribute!" page="11" link="__WKANCHOR_g" backLink="__WKANCHOR_h"/>
          </item>
        </item>
        <item title="3. ASSUMPTIONS" page="13" link="__WKANCHOR_i" backLink="__WKANCHOR_j">
          <item title="WHAT THIS BOOK IS NOT..." page="13" link="__WKANCHOR_k" backLink="__WKANCHOR_l"/>
        </item>
      </item>
    </outline>

    In other words:

    <!ELEMENT outline (item*)>
    <!ELEMENT item (item*)>

    and item has the following attributes:

    title:    url-escaped string
    page:     page number
    link:     link to here from the TOC
    backLink: link back to the TOC

    Title is utf-8 text that has been percent-encoded, as described in
    section 2.1 of RFC 3986.
    '''
    from lxml import etree
    f = open(outline_file, 'r')
    tree = etree.parse(f)
    f.close()

    contents = []

    def parse_item(e, depth):
        title = urllib.unquote(e.get('title')).strip()
        pageno = int(e.get('page'))
        if depth:
            contents.append((title, depth, pageno))
        for x in e.iterchildren(config.WKTOCNS + 'item'):
            parse_item(x, depth + 1)

    for x in tree.getroot().iterchildren(config.WKTOCNS + 'item'):
        parse_item(x, 0)

    log(contents)
    return contents
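# Hedged sketch: write a tiny wkhtmltopdf outline dump to disk and parse it.  This
# assumes config.WKTOCNS is the Clark-notation form of the namespace shown in the
# docstring ('{http://code.google.com/p/wkhtmltopdf/outline}') and that urllib is
# imported at module level, since parse_extracted_outline uses urllib.unquote.
import tempfile

_SAMPLE_OUTLINE = '''<?xml version="1.0" encoding="UTF-8"?>
<outline xmlns="http://code.google.com/p/wkhtmltopdf/outline">
  <item title="" page="0" link="__WKANCHOR_0" backLink="__WKANCHOR_1">
    <item title="1.%20ANONYMOUS" page="2" link="__WKANCHOR_2" backLink="__WKANCHOR_3"/>
  </item>
</outline>'''

# _tmp = tempfile.NamedTemporaryFile(suffix='.xml', delete=False)
# _tmp.write(_SAMPLE_OUTLINE)
# _tmp.close()
# parse_extracted_outline(_tmp.name)   # -> [('1. ANONYMOUS', 1, 2)]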
def parse_outline(pdf, level_threshold, debug_filename=None):
    """Create a structure reflecting the outline of a PDF.
    A chapter heading looks like this:

    BookmarkTitle: 2. What is sound?
    BookmarkLevel: 1
    BookmarkPageNumber: 3
    """
    cmd = ('pdftk', pdf, 'dump_data')
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    outline, err = p.communicate()
    #log("OUTLINE:", outline)
    if debug_filename is not None:
        try:
            f = open(debug_filename, 'w')
            f.write(outline)
            f.close()
        except IOError:
            log("could not write to %s!" % debug_filename)

    lines = (x.strip() for x in outline.split('\n') if x.strip())
    contents = []

    def _strip(s):
        return s.strip(config.WHITESPACE_AND_NULL)

    def extract(expected, conv=_strip):
        line = lines.next()
        try:
            k, v = line.split(':', 1)
            if k == expected:
                return conv(v)
        except ValueError:
            log("trouble with line %r" % line)

    #There are a few useless variables, then the pagecount, then the contents.
    #The pagecount is useful, so pick it up first.
    page_count = None
    while page_count is None:
        page_count = extract('NumberOfPages', int)

    try:
        while True:
            title = extract('BookmarkTitle')
            if title is not None:
                level = extract('BookmarkLevel', int)
                pagenum = extract('BookmarkPageNumber', int)
                if level <= level_threshold and None not in (level, pagenum):
                    contents.append((title, level, pagenum))
    except StopIteration:
        pass

    return contents, page_count
def _find_tag(doc, tag):
    # log(lxml.etree.tostring(doc, encoding='utf-8', method='html').replace(' ', ''))
    try:
        doc = doc.getroot()
    except AttributeError:
        pass
    if doc.nsmap:
        try:
            return doc.iter(XHTMLNS + tag).next()
        except StopIteration:
            log("doc had nsmap %s, but did not seem to be xhtml (looking for %s)"
                % (doc.nsmap, tag))
    return doc.iter(tag).next()
def localise_local_links(doc, old_filename=''):
    """Xinha produces document local links (e.g., for footnotes) in the
    form 'filename#local_anchor', which are broken if the filename
    changes.  In practice the filename changes at least twice during
    processing -- once from 'filename' to 'filename.html', when Booki
    makes the bookizip, and again to 'body.html' when all the chapters
    get concatenated.

    Additionally, Xinha will reuse the same IDs in each chapter, so
    when the chapters are all concatenated the IDs are no longer unique
    and the links won't work properly.

    This function will replace links in the form 'filename#id' with
    '#filename_id', and change the target IDs accordingly.  It avoids
    altering the ID of elements that aren't locally linked, as these
    might be used for CSS or external links.
    """
    old_prefix = (old_filename + '#').encode('utf-8')
    targets = []
    transformed_ids = {}

    #loop 1: find links and shortlist elements with ID
    for e in doc.iter():
        if e.tag == 'a':
            href = e.get('href')
            if href and href.startswith(old_prefix):
                old_id = href[len(old_prefix):]
                new_id = '%s_%s' % (old_filename, old_id)
                e.set('href', '#' + new_id)
                transformed_ids[old_id] = new_id
            name = e.get('name')
            if name:
                targets.append(e)
            continue
        ID = e.get('id')
        if ID is not None:
            targets.append(e)

    log("transforming these IDs in chapter %s: %s" % (old_filename, transformed_ids))

    #loop 2: rewrite the IDs (and <a name>s) that are actually linked to
    for e in targets:
        old_id = e.get('id')
        if old_id is None and e.tag == 'a':
            old_id = e.get('name')
        if old_id is None:
            continue
        if old_id in transformed_ids:
            new_id = transformed_ids[old_id]
            e.set('id', new_id)
            if e.tag == 'a':
                e.set('name', new_id)
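# Hedged before/after sketch of localise_local_links (the chapter name 'intro' and the
# markup are invented): a footnote link 'intro#note1' and its target id 'note1' both
# become 'intro_note1', so the link survives concatenation into body.html.
_FOOTNOTE_FRAGMENT = '<div><a href="intro#note1">1</a><p id="note1">the note</p></div>'
# frag = lxml.html.fromstring(_FOOTNOTE_FRAGMENT)
# localise_local_links(frag, 'intro')
# lxml.html.tostring(frag) ->
#   '<div><a href="#intro_note1">1</a><p id="intro_note1">the note</p></div>'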
def parse_opf(self):
    """
    The opf file is arranged like this:

    <package>
      <metadata />
      <manifest />
      <spine />
      <guide />
    </package>

    Metadata, manifest and spine are parsed in separate helper functions.
    """
    self.opfdir = os.path.dirname(self.opf_file)  #needed for manifest parsing
    tree = self.gettree(self.opf_file)
    root = tree.getroot()
    metadata = root.find(OPFNS + 'metadata')
    manifest = root.find(OPFNS + 'manifest')
    spine = root.find(OPFNS + 'spine')

    self.metadata = parse_metadata(metadata)
    self.manifest = parse_manifest(manifest, self.opfdir)

    # mapping of filenames to new filenames.  This needs to be
    # done early to detect clashes (e.g. '/images/hello.jpg' and
    # '/images/big/hello.jpg' would both reduce to
    # 'static/hello.jpg').
    self.media_map = {}
    for k, v in self.manifest.items():
        fn, mimetype = v
        if isinstance(fn, unicode):
            log('Stupid unicode: %r' % fn)
        if mimetype not in MARKUP_TYPES:
            oldfn = fn
            if '/' in fn:
                fn = fn.rsplit('/', 1)[1]
            while fn in self.media_map.values():
                fn = '_' + fn
            newfn = 'static/%s' % fn
            self.media_map[oldfn] = newfn

    ncxid, self.spine = parse_spine(spine)
    self.ncxfile = self.manifest[ncxid][0]

    #there is also an optional guide section, which we ignore
    guide = root.find(OPFNS + 'guide')
    if guide is not None:
        self.guide = parse_guide(guide)
    else:
        self.guide = None
def load(self, src):
    # Zip is a variable format, and zipfile is limited.  If that
    # becomes a problem we will have to use an `unzip` subprocess,
    # but it hasn't been so far.
    if isinstance(src, str):
        # Should end with PK<05><06> + 18 more.
        # Some zips contain 'comments' after that, which breaks ZipFile
        zipend = src.rfind('PK\x05\x06') + 22
        if len(src) != zipend:
            log('Bad zipfile?')
            src = src[:zipend]
        src = StringIO(src)
    self.zip = zipfile.ZipFile(src, 'r', compression=zipfile.ZIP_DEFLATED, allowZip64=True)
    self.names = self.zip.namelist()
    self.info = self.zip.infolist()
    self.origin = src
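# Hedged illustration of the trim in load() above (filenames and content invented):
# the end-of-central-directory record starts with 'PK\x05\x06' and is 22 bytes long
# when there is no archive comment, so cutting at rfind(...) + 22 drops trailing junk.
from StringIO import StringIO
import zipfile

_buf = StringIO()
_z = zipfile.ZipFile(_buf, 'w')
_z.writestr('mimetype', 'application/epub+zip')
_z.close()
_blob = _buf.getvalue() + 'trailing junk that some zips carry'
_blob = _blob[:_blob.rfind('PK\x05\x06') + 22]
# zipfile.ZipFile(StringIO(_blob)).namelist() -> ['mimetype']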
def get_book_list(server):
    """Ask the server for a list of books.  Booki offers this list as
    json at /list-books.json.
    """
    url = 'http://%s/list-books.json' % server
    log('getting booklist: %s' % url)
    f = urlopen(url)
    books = json.load(f)
    items = []
    for book in books:
        url = book['fields']['url_title']
        title = book['fields']['title']
        items.append((url, title))
    f.close()
    return items
def find_language(self):
    opflang = [x[0].lower() for x in
               self.metadata.get(DC, {}).get("language", ())]

    # XXX Should the ncx language enter into it?  Being xml:lang,
    # it is in theory just the language of the ncx document
    # itself.  But if the metadata lacks language, should it be
    # used instead?  At present, NO.
    #ncxlang = self.ncxdata['headers'].get('lang', ())

    # XXX also, for now, ignoring case of badly formed language
    # codes, conflicting or supplementary languages, etc.
    opflang = [x for x in opflang if x not in ("und", "")]
    if not opflang:
        return None
    if len(set(opflang)) > 1:
        log("%s metadata has more than one language: %s -- using first one"
            % (self.origin, opflang))
    return opflang[0]
def espri(epuburl, bookid, src_id=None):
    """Make a bookizip from the epub at <epuburl> and save it as
    <bookid>.zip."""
    log("starting espri", epuburl, bookid)
    f = urlopen(epuburl)
    s = f.read()
    f.close()
    e = epub.Epub()
    e.load(s)
    if src_id is not None:
        #so that booki knows where the book came from,
        #so e.g. archive.org can find it again
        e.register_source_id(src_id)

    e.parse_meta()
    e.parse_opf()
    e.parse_ncx()
    zipfile = '%s/%s.zip' % (config.BOOKI_BOOK_DIR, bookid)
    e.make_bookizip(zipfile)
def make_ncx(toc, filemap, ID, title):
    log(filemap)
    tree = etree.parse(StringIO(BARE_NCX))
    root = tree.getroot()
    head = etree.SubElement(root, 'head')
    add_ncxtext(root, 'docTitle', title)
    navmap = etree.SubElement(root, 'navMap')
    counter, maxdepth = 0, 0
    for subtoc in toc:
        counter, maxdepth = write_navtree(navmap, subtoc, counter, 1, maxdepth, filemap)

    for name, content in (('dtb:uid', ID),
                          ('dtb:depth', str(maxdepth)),
                          ('dtb:totalPageCount', '0'),
                          ('dtb:maxPageNumber', '0')):
        etree.SubElement(head, 'meta', name=name, content=content)

    return etree.tostring(tree, pretty_print=True, encoding='utf-8')
def fetch_if_necessary(self, url, target=None, use_cache=True):
    if url in self._fetched:
        return self._fetched[url]

    if target is None:
        target = url_to_filename(url, self.prefix)

    if use_cache and os.path.exists(self.cache_dir + target):
        log("used cache for %s" % target)
        return target

    try:
        data = url_fetch(url)
    except HTTPError, e:
        # if it is missing, assume it will be missing every time
        # after, otherwise, you can get into endless waiting
        self._fetched[url] = None
        log("Wanting '%s', got error %s" % (url, e))
        return None
def make_barcode_pdf(self, isbn, pdf, corner='br'):
    """Put an ISBN barcode in a corner of a single blank page."""
    position = '%s,%s,%s,%s,%s' % (corner, self.width, self.height,
                                   self.side_margin, self.bottom_margin)
    cmd1 = [config.BOOKLAND, '--position', position, str(isbn)]
    cmd2 = ['ps2pdf', '-dFIXEDMEDIA',
            '-dDEVICEWIDTHPOINTS=%s' % self.width,
            '-dDEVICEHEIGHTPOINTS=%s' % self.height,
            '-', pdf]

    p1 = Popen(cmd1, stdout=PIPE)
    p2 = Popen(cmd2, stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
    out, err = p2.communicate()

    log('ran:\n%s | %s' % (' '.join(cmd1), ' '.join(cmd2)))
    log("return: %s and %s \nstdout:%s \nstderr:%s" % (p1.poll(), p2.poll(), out, err))
def write_toc(point, section):
    tocpoint = {}
    title = find_good_label(point['labels'], lang)
    if title:
        tocpoint['title'] = title
    ID = point['id']
    if ID in spine:
        tocpoint['url'] = self.manifest.get(ID, ID + '.html')
        while deferred_urls:
            tp = deferred_urls.pop()
            tp['url'] = tocpoint['url']
            log('%r has deferred url: %r' % (tp['title'], tp['url']))
    else:
        deferred_urls.append(tocpoint)
    if point['points']:
        tocpoint['children'] = []
        for child in point['points']:
            write_toc(child, tocpoint['children'])
    section.append(tocpoint)
def toc_iterator(server, book):
    """TOC.txt has 3 lines per chapter.  Fetch them and yield them in
    triples.
    """
    url = config.TOC_URL % (server, book)
    log('getting TOC: %s' % url)
    f = urlopen(url)
    encoding = config.SERVER_DEFAULTS[server]['toc-encoding']
    while True:
        try:
            if encoding is not None:
                yield TocItem(f.next().decode(encoding).strip().encode('utf-8'),
                              f.next().decode(encoding).strip().encode('utf-8'),
                              f.next().decode(encoding).strip().encode('utf-8'))
            else:
                yield TocItem(f.next().strip(),
                              f.next().strip(),
                              f.next().strip())
        except StopIteration:
            break
    f.close()
def wikibooks_espri(wiki_url):
    """Wikibooks import using the wikibooks2epub script by Jan Gerber
    to first convert the wikibook to an epub, which can then be turned
    into a bookizip via the espri function.
    """
    os.environ['oxCACHE'] = os.path.abspath(config.WIKIBOOKS_CACHE)
    os.environ['LANG'] = 'en_NZ.UTF-8'
    tainted_name = unquote(os.path.basename(urlsplit(wiki_url).path))
    bookid = "%s-%s" % (super_bleach(tainted_name),
                        time.strftime('%Y.%m.%d-%H.%M.%S'))
    workdir = tempfile.mkdtemp(prefix=bookid,
                               dir=os.path.join(config.DATA_ROOT, "tmp"))
    os.chmod(workdir, 0755)
    epub_file = os.path.join(workdir, bookid + '.epub')
    epub_url = path2url(epub_file)

    #the wikibooks importer is a separate process, so run that, then collect the epub.
    cmd = [config.TIMEOUT_CMD, config.WIKIBOOKS_TIMEOUT,
           config.WIKIBOOKS_CMD,
           '-i', wiki_url,
           '-o', epub_file]
    log(cmd)
    log(os.environ)
    log(os.getcwd())

    try:
        check_call(cmd)
    except CalledProcessError, e:
        if e.returncode == 137:
            raise TimeoutError('Wikibooks took too long (over %s seconds)'
                               % config.WIKIBOOKS_TIMEOUT)
        raise
def toc_iterator(server, book):
    """TOC.txt has 3 lines per chapter.  Fetch them and yield them in
    triples.
    """
    url = config.TOC_URL % (server, book)
    log('getting TOC: %s' % url)
    f = urlopen(url)
    encoding = get_server_defaults(server)['toc-encoding']
    while True:
        try:
            if encoding is not None:
                yield TocItem(f.next().decode(encoding).strip().encode('utf-8'),
                              f.next().decode(encoding).strip().encode('utf-8'),
                              f.next().decode(encoding).strip().encode('utf-8'))
            else:
                yield TocItem(f.next().strip(),
                              f.next().strip(),
                              f.next().strip())
        except StopIteration:
            break
    f.close()