def _make_navmap(self, toc):
    """ Build the toc. """

    ncx = self.ncx
    root = ncx.navMap()

    last_np_with_depth = {0: root}
    count = 0

    for url, title, depth in toc:
        if depth > -1:
            count += 1
            np = ncx.navPoint(
                ncx.navLabel(ncx.text(title)),
                ncx.content(src=url),
                **{'id': "np-%d" % count,
                   'playOrder': self.seen_urls[url]})

            try:
                parent = last_np_with_depth[depth - 1]
                parent.append(np)
                last_np_with_depth[depth] = np
            except KeyError:
                warning("Bogus depth %d in TOC" % depth)

    return root
def __parse(self, html):
    # remove xml decl and doctype, we will add the correct one before serializing
    # html = re.compile('^.*<html ', re.I | re.S).sub('<html ', html)
    # FIXME: do not remove doctype because we need it to load the dtd

    # remove xml declaration because of parser error: "Unicode
    # strings with encoding declaration are not supported. Please
    # use bytes input or XML fragments without declaration."
    re_xml_decl = re.compile(r'^.*?<\?xml.*?\?>', re.S | re.U)
    html = re_xml_decl.sub('', html)

    try:
        return etree.fromstring(
            html,
            lxml.html.XHTMLParser(huge_tree=True),
            base_url=self.attribs.url)
    except etree.ParseError as what:
        # cannot try HTML parser because we depend on correct xhtml namespace
        m = re.search(r"Entity '([^']+)'", str(what))
        if m:
            warning("Missing entity: '%s'" % m.group(1))
        else:
            error("Failed to parse file because: %s" % what)
        m = re.search(r'line\s(\d+),', str(what))
        if m:
            lineno = int(m.group(1))
            error("Line %d: %s" % (lineno, html.splitlines()[lineno - 1]))
        raise
def is_included_mediatype(self, attribs):
    """ Return True if this document is eligible. """

    mediatype = self.get_mediatype(attribs)
    if not mediatype:
        warning('Mediatype could not be determined from url %s' % attribs.url)
        return True  # always include if mediatype unknown

    included = any(fnmatch.fnmatch(mediatype, pattern)
                   for pattern in self.include_mediatypes)
    excluded = any(fnmatch.fnmatch(mediatype, pattern)
                   for pattern in self.exclude_mediatypes)

    if included and not excluded:
        return True

    if excluded:
        debug("Dropping excluded mediatype %s" % mediatype)
    if not included:
        debug("Dropping not included mediatype %s" % mediatype)

    return False
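# A minimal standalone sketch of the include/exclude matching used above,
# assuming glob-style mediatype patterns such as 'text/*' or 'image/*'.
# The helper name and the sample patterns below are illustrative only, not
# the spider's actual configuration.
import fnmatch

def _matches_any(mediatype, patterns):
    """ Return True if mediatype matches any glob pattern. """
    return any(fnmatch.fnmatch(mediatype, pattern) for pattern in patterns)

# 'image/png' matches 'image/*' and would be kept;
# 'application/pdf' matches neither pattern and would be dropped.
assert _matches_any('image/png', ['text/*', 'image/*'])
assert not _matches_any('application/pdf', ['text/*', 'image/*'])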
def tidy(html):
    """ Pipe html thru w3c tidy. """

    html = parsers.RE_RESTRICTED.sub('', html)
    html = RE_XMLDECL.sub('', html)
    html = parsers.RE_HTML_CHARSET.sub('; charset=utf-8', html)

    # convert to xhtml
    tidy = subprocess.Popen(
        ["tidy",
         "-utf8",
         "-clean",
         "--wrap", "0",
         # "--drop-font-tags", "y",
         # "--drop-proprietary-attributes", "y",
         # "--add-xml-space", "y",
         "--output-xhtml", "y",
         "--numeric-entities", "y",
         "--merge-divs", "n",  # keep poetry indentation
         "--merge-spans", "n",
         "--add-xml-decl", "n",
         "--doctype", "strict",
         "--anchor-as-name", "n",
         "--enclose-text", "y"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)

    # print(html.encode('utf-8'))
    # sys.exit()

    (html, stderr) = tidy.communicate(html.encode('utf-8'))

    regex = re.compile(r'(Info:|Warning:|Error:)\s*', re.I)

    # pylint: disable=E1103
    msg = stderr.decode(sys.stderr.encoding).strip()
    for line in msg.splitlines():
        match = regex.search(line)
        if match:
            sline = regex.sub("", line)
            g = match.group(1).lower()
            if g == 'info:':
                info("tidy: %s" % sline)
            elif g == 'warning:':
                warning("tidy: %s" % sline)
            elif g == 'error:':
                error("tidy: %s" % sline)
        else:
            error(line)

    if tidy.returncode == 2:
        raise ValueError(stderr)

    return html.decode('utf-8')
def add(zip_, filename, memberfilename):
    """ Add one file to the zip. """

    try:
        os.stat(filename)
        dummy_name, ext = os.path.splitext(filename)
        info(' Adding file: %s as %s' % (filename, memberfilename))
        zip_.write(filename, memberfilename,
                   zipfile.ZIP_STORED if ext in ['.zip', '.png']
                   else zipfile.ZIP_DEFLATED)
    except OSError:
        warning('ZipPackager: Cannot add file %s', filename)
def elect_coverpage(spider):
    """ Find first coverpage candidate that is not too small. """

    coverpage_found = False
    for p in spider.parsers:
        if 'coverpage' in p.attribs.rel:
            if coverpage_found:
                # keep the first one found, reset all others
                p.attribs.rel.remove('coverpage')
                continue
            if hasattr(p, 'get_image_dimen'):
                dimen = p.get_image_dimen()
                if (dimen[0] * dimen[1]) < COVERPAGE_MIN_AREA:
                    p.attribs.rel.remove('coverpage')
                    warning("removed coverpage candidate %s because too small (%d x %d)"
                            % (p.url, dimen[0], dimen[1]))
                    continue
            coverpage_found = True
def get_default_width(self, uri):
    """ Calculate a sensible default width for images.

    Assume images are processed for a viewport 980px wide, the
    same as the iPhone browser assumes.

    """

    if (self.document.settings.get_image_size and
            six.callable(self.document.settings.get_image_size)):
        size = self.document.settings.get_image_size(uri)
        if size is not None:
            w = int(float(size[0]) / (980.0 * 0.8) * 100.0 + 0.5)
            width = "%d%%" % min(100, w)
            debug('Got dimension of image: %s: %s' % (uri, width))
            return width

    warning('Could not get dimension of image: %s' % uri)
    return '100%'
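# A worked example of the percentage calculation above (illustrative values,
# not part of the writer): the effective text width is taken as 80% of the
# assumed 980px viewport, i.e. 784px, and the result is capped at 100%.
# A 392px wide image therefore maps to round(392 / 784 * 100) = 50%.
assert int(float(392) / (980.0 * 0.8) * 100.0 + 0.5) == 50
assert min(100, int(float(2000) / (980.0 * 0.8) * 100.0 + 0.5)) == 100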
def enqueue(self, queue, depth, attribs, is_doc):
    """ Enqueue url for parsing. """

    if is_doc:
        if not self.is_included_url(attribs):
            warning('External link in %s: %s' % (attribs.referrer, attribs.url))
            return
        if depth >= self.max_depth:
            error('Omitted file %s due to depth > max_depth' % attribs.url)
            return
        if not self.is_included_mediatype(attribs) and \
           not self.is_included_relation(attribs):
            return
    elif not self.is_included_url(attribs) and \
         not self.is_included_relation(attribs):
        error('Failed for embedded media in %s from disallowed location: %s'
              % (attribs.referrer, attribs.url))
        return

    queue.append((depth, attribs))
def elect_coverpage(spider, url):
    """ Find first coverpage candidate that is not too small. """

    coverpage_found = False
    for p in spider.parsers:
        if 'coverpage' in p.attribs.rel:
            if coverpage_found:
                # keep the first one found, reset all others
                p.attribs.rel.remove('coverpage')
                continue
            if hasattr(p, 'get_image_dimen'):
                dimen = p.get_image_dimen()
                if (dimen[0] * dimen[1]) < COVERPAGE_MIN_AREA:
                    p.attribs.rel.remove('coverpage')
                    p_url = p.url if hasattr(p, 'url') else ''
                    warning("removed coverpage candidate %s because too small (%d x %d)"
                            % (p_url, dimen[0], dimen[1]))
                    continue
            coverpage_found = True

    if spider.parsers and not coverpage_found and options.generate_cover:
        if not hasattr(Cover, 'cairo'):
            warning('Cairo not installed, cover generation disabled')
            return
        if options.outputdir:
            dir = options.outputdir
        elif url.startswith('file://'):
            dir = os.path.dirname(os.path.abspath(url[7:]))
        elif url.startswith('file:'):
            dir = os.path.dirname(os.path.abspath(url[5:]))
        else:
            dir = os.path.dirname(os.path.abspath(url))
        debug('generating cover in %s' % dir)
        cover_url = generate_cover(dir)
        if cover_url:
            cover_parser = ParserFactory.ParserFactory.create(cover_url)
            cover_parser.attribs.rel.add('coverpage')
            cover_parser.pre_parse()
            spider.parsers.append(cover_parser)
def build(self, job):
    """ Build PDF file. """

    inputfilename = job.url
    outputfilename = os.path.join(os.path.abspath(job.outputdir), job.outputfile)

    debug("Inputfile: %s" % inputfilename)
    info("Creating PDF file: %s" % outputfilename)

    parser = ParserFactory.ParserFactory.create(inputfilename)

    if not hasattr(parser, 'rst2xetex'):
        warning('Skipping PDF Output because input mediatype is %s' % parser.mediatype())
        raise SkipOutputFormat

    # Brain-dead xetex doesn't understand unix pipes
    # so we have to write a temp file
    texfilename = os.path.splitext(outputfilename)[0] + '.tex'
    auxfilename = os.path.splitext(outputfilename)[0] + '.aux'
    logfilename = os.path.splitext(outputfilename)[0] + '.log'

    try:
        os.remove(auxfilename)
    except OSError:
        pass

    tex = parser.rst2xetex(job)
    with open(texfilename, 'wb') as fp:
        fp.write(tex)

    try:
        cwd = os.getcwd()
        os.chdir(os.path.abspath(job.outputdir))

        _xetex = subprocess.Popen(
            [options.config.XELATEX,
             "-output-directory", job.outputdir,
             "-interaction", "nonstopmode",
             texfilename],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
    except OSError as what:
        os.chdir(cwd)
        error("PDFWriter: %s %s" % (options.config.XELATEX, what))
        raise SkipOutputFormat

    (dummy_stdout, dummy_stderr) = _xetex.communicate()

    with open(logfilename, encoding='utf-8') as fp:
        for line in fp:
            line = line.strip()
            if 'Error:' in line:
                error("xetex: %s" % line)
            if options.verbose >= 1:
                if 'Warning:' in line:
                    warning("xetex: %s" % line)

    if options.verbose < 2:
        try:
            os.remove(texfilename)
            os.remove(logfilename)
            os.remove(auxfilename)
        except OSError:
            pass

    os.chdir(cwd)

    info("Done PDF file: %s" % outputfilename)
def build(self, job):
    """ Build kindle file from epub using amazon kindlegen. """

    info("Creating Kindle file: %s" % os.path.join(job.outputdir, job.outputfile))
    info(" ... from: %s" % job.url)

    try:
        cwd = os.getcwd()
        os.chdir(job.outputdir)

        kindlegen = subprocess.Popen(
            [options.config.MOBIGEN,
             '-o', os.path.basename(job.outputfile),
             job.url],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
    except OSError as what:
        os.chdir(cwd)
        error("KindleWriter: %s %s" % (options.config.MOBIGEN, what))
        raise SkipOutputFormat

    (stdout, stderr) = kindlegen.communicate()
    os.chdir(cwd)

    if kindlegen.returncode > 0:
        regex = re.compile(r'^(\w+)\(prcgen\):')

        # pylint: disable=E1103
        msg = stderr.rstrip()
        if msg:
            msg = msg.decode(sys.stderr.encoding)
            error(msg)
        msg = stdout.rstrip()
        msg = msg.decode(sys.stdout.encoding)
        for line in msg.splitlines():
            match = regex.match(line)
            if match:
                sline = regex.sub("", line)
                g = match.group(1).lower()
                if g == 'info':
                    if sline == 'MOBI File generated with WARNINGS!':
                        # we knew that already
                        continue
                    # info("kindlegen: %s" % sline)
                elif g == 'warning':
                    if sline.startswith('Cover is too small'):
                        continue
                    if sline == 'Cover not specified':
                        continue
                    warning("kindlegen: %s" % sline)
                elif g == 'error':
                    error("kindlegen: %s" % sline)
            else:
                error(line)

    info("Done Kindle file: %s" % os.path.join(job.outputdir, job.outputfile))
def groff(self, job, nroff, encoding='utf-8'):
    """ Process thru groff.

    Takes and returns unicode strings!

    """

    device = {'utf-8': 'utf8',
              'iso-8859-1': 'latin1',
              'us-ascii': 'ascii'}[encoding]

    nroff = nroff.encode(encoding)
    nrofffilename = os.path.join(
        os.path.abspath(job.outputdir),
        os.path.splitext(job.outputfile)[0] + '.nroff')

    # write nroff file for debugging
    if options.verbose >= 2:
        with open(nrofffilename, 'wb') as fp:
            fp.write(nroff)
    else:
        try:
            # remove debug files from previous runs
            os.remove(nrofffilename)
        except OSError:
            pass

    # call groff
    try:
        _groff = subprocess.Popen(
            [options.config.GROFF,
             "-t",          # preprocess with tbl
             "-K", device,  # input encoding
             "-T", device], # output device
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
    except OSError:
        error("TxtWriter: executable not found: %s" % options.config.GROFF)
        raise SkipOutputFormat

    (txt, stderr) = _groff.communicate(nroff)

    # pylint: disable=E1103
    for line in stderr.splitlines():
        line = line.decode(sys.stderr.encoding)
        line = line.strip()
        if 'error' in line:
            error("groff: %s" % line)
        elif 'warn' in line:
            if options.verbose >= 1:
                warning("groff: %s" % line)

    txt = txt.decode(encoding)
    return txt.translate(u2u)  # fix nroff idiosyncrasies
class ParagraphMetrics(object):
    """ Calculates some metrics. """

    words = None
    try:
        fn = options.config.RHYMING_DICT
        if fn is not None:
            from six.moves import dbm_gnu
            words = dbm_gnu.open(fn)
    except ImportError:
        warning("No gnu dbm support found. Rhyming dictionary not used.")
    except dbm_gnu.error:
        warning("File containing rhyming dictionary not found: %s" % fn)

    def __init__(self, par):
        """ Calculate metrics about this paragraph. """

        lines = par.lines

        self.cnt_lines = len(lines)
        self.lengths = list(map(len, lines))
        self.centers = list(map(self._center, lines))
        self.indents = list(map(self._indent, lines))
        self.titles = list(map(self._istitle, lines))
        self.uppers = list(map(six.text_type.isupper, lines))

        # skip last line, which is almost always shorter
        self.length = MinMaxAvg(self.lengths[:-1])
        self.length.last = self.lengths[-1]

        # skip first line, which sometimes is indented on every par
        self.indent = MinMaxAvg(self.indents[1:])
        self.indent.first = self.indents[0]

        # all lines must be centered
        self.center = MinMaxAvg(self.centers)

        self.stems = None
        self.rhymes = None
        if self.words:
            self._init_rhymes(par)

    @staticmethod
    def _indent(line):
        """ Find out how much a line is left-indented. """
        return len(line) - len(line.lstrip())

    @staticmethod
    def _center(line):
        """ Find the center pos of a line. """
        len_ = len(line)
        indent = len_ - len(line.lstrip())
        return (len_ + indent) / 2

    @staticmethod
    def _istitle(line):
        """ Return True if the first char is uppercase. """
        m = re.search(r'\w', line)
        return m and m.group(0).isupper()

    def _rhyme_stemmer(self, line):
        """ Return the stem of the rhyme.

        See comments in: rhyme_compiler.py

        """
        line = re.sub(r'\W*$', '', line)
        words = re.split('[- ]+', line)
        try:
            last_word = words[-1].lower()
            return self.words[last_word.encode('utf-8')]
        except (IndexError, KeyError):
            last_word = re.sub('^(un|in)', '', last_word)
            try:
                return self.words[last_word.encode('utf-8')]
            except (IndexError, KeyError):
                return None

    def _init_rhymes(self, par):
        """ Get rhyme stems and see which lines do rhyme. """

        self.stems = list(map(self._rhyme_stemmer, par.lines))
        self.rhymes = len(self.stems) * [0]

        go_back = 8  # how many lines to consider
        for i, stem in enumerate(self.stems):
            if stem is None:
                continue
            try:
                j = self.stems.index(stem, max(0, i - go_back), i)
                self.rhymes[j] = 1
                self.rhymes[i] = 1
            except ValueError:
                pass
def strip_pagenumbers(xhtml, strip_classes):
    """ Strip dp page numbers.

    Rationale: DP implements page numbers either with float or with
    absolute positioning. Float is not supported by Kindle. Absolute
    positioning is not allowed in epub.

    If we'd leave these in, they would show up as numbers in the
    middle of the text.

    To still keep links working, we replace all page number
    contraptions we can find with empty <a>'s.

    """

    # look for elements with a class that is in strip_classes
    for class_ in strip_classes:
        xp = ("//xhtml:*[@class and contains(concat(' ', normalize-space(@class), ' '), ' %s ')]"
              % class_)

        count = 0
        for elem in xpath(xhtml, xp):
            # save textual content
            text = gg.normalize(etree.tostring(elem,
                                               method="text",
                                               encoding=six.text_type,
                                               with_tail=False))
            if len(text) > 10:
                # safeguard against removing things that are not pagenumbers
                continue

            if not text:
                text = elem.get('title')

            # look for id anywhere inside element
            id_ = elem.xpath(".//@id")

            # transmogrify element into empty <a>
            tail = elem.tail
            elem.clear()
            elem.tag = NS.xhtml.a
            if id_:
                # some blockheaded PPers include more than
                # one page number in one span. take the last id
                # because the others represent empty pages.
                elem.set('id', id_[-1])

            if class_ in DP_PAGENUMBER_CLASSES:
                # mark element as rewritten pagenumber. we
                # actually don't use this class for styling
                # because it is on an empty element
                elem.set('class', 'x-ebookmaker-pageno')

            if text:
                elem.set('title', text)
            elem.tail = tail
            count += 1

            # The OPS Spec 2.0 is very clear: "Reading Systems
            # must be XML processors as defined in XML 1.1."
            # Nevertheless many browser-plugin ebook readers use
            # the HTML parsers of the browser. But HTML parsers
            # don't grok the minimized form of empty elements.
            #
            # This will force lxml to output the non-minimized form
            # of the element.
            elem.text = ''

        if count:
            warning("%d elements having class %s have been rewritten."
                    % (count, class_))
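# A minimal standalone sketch of the class-containment XPath predicate used
# above, written without the xhtml namespace prefix. The sample markup and
# the 'pagenum' class are made-up illustrations, not DP's actual output.
from lxml import etree as _etree

_doc = _etree.fromstring('<div><span class="pagenum foo" id="pg12">12</span></div>')
_xp = "//*[@class and contains(concat(' ', normalize-space(@class), ' '), ' pagenum ')]"
# matches the span because ' pagenum ' occurs in ' pagenum foo '
assert [e.get('id') for e in _doc.xpath(_xp)] == ['pg12']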
def recursive_parse(self, root_attribs):
    """ Do a recursive parse starting from url.

    Do a breadth-first traversal. Assuming the first page contains
    a linked TOC, this will get us a more natural ordering of the
    pages than a depth-first traversal.

    """

    queue = []

    debug("Start of retrieval")

    # enqueue root url
    self.enqueue(queue, 0, root_attribs, True)

    while queue:
        depth, attribs = queue.pop(0)

        url = self.redirect(attribs.url)
        if url in self.parsed_urls:
            continue

        parser = ParserFactory.create(url, attribs)

        # Maybe the url was redirected to something we already have?
        url = parser.attribs.url
        if url in self.parsed_urls:
            continue
        self.parsed_urls.add(url)
        self.add_redirection(parser.attribs.orig_url, url)

        parser.pre_parse()
        self.parsers.append(parser)

        # look for more documents to add to the queue
        debug("Requesting iterlinks for: %s ..." % url)

        for url, elem in parser.iterlinks():
            if elem.get('rel') == 'nofollow':
                # remove link to content not followed
                elem.tag = 'span'
                elem.set('data-nofolllow-href', elem.get('href'))
                del elem.attrib['href']
                del elem.attrib['rel']
                warning('not followed: %s' % url)
                continue

            new_attribs = parsers.ParserAttributes()
            new_attribs.url = urllib.parse.urldefrag(url)[0]
            new_attribs.referrer = parser.attribs.url

            for k, v in elem.items():
                if k in ('id', 'title'):
                    setattr(new_attribs, k, v)
                elif k == 'type':
                    new_attribs.orig_mediatype = new_attribs.HeaderElement.from_str(v)
                elif k == 'rel':
                    new_attribs.rel.update(v.lower().split())

            tag = elem.tag
            if tag == NS.xhtml.a:
                if self.is_image(new_attribs) and self.is_included_url(new_attribs) and \
                   self.is_included_mediatype(new_attribs):
                    # need to wrap an image
                    wrapper_parser = parsers.WrapperParser.Parser(new_attribs)
                    if wrapper_parser.attribs.url not in self.parsed_urls:
                        ParserFactory.parsers[wrapper_parser.attribs.url] = wrapper_parser
                        self.parsers.append(wrapper_parser)
                        self.parsed_urls.add(wrapper_parser.attribs.url)

                    elem.set('href', wrapper_parser.attribs.url)
                    new_attribs.referrer = wrapper_parser.attribs.url
                    elem.set('title', wrapper_parser.attribs.title)

                    self.enqueue(queue, depth + 1, new_attribs, False)
                else:
                    self.enqueue(queue, depth + 1, new_attribs, True)

            elif tag == NS.xhtml.img:
                self.enqueue(queue, depth, new_attribs, False)

            elif tag == NS.xhtml.link:
                if new_attribs.rel.intersection(('stylesheet', 'coverpage')):
                    self.enqueue(queue, depth, new_attribs, False)
                else:
                    self.enqueue(queue, depth + 1, new_attribs, True)

            elif tag == NS.xhtml.object:
                self.enqueue(queue, depth, new_attribs, False)

    debug("End of retrieval")

    # rewrite redirected urls
    if self.redirection_map:
        for parser in self.parsers:
            parser.remap_links(self.redirection_map)

    self.topological_sort()
def build(self, job):
    """ Build kindle file from epub using amazon kindlegen or calibre. """

    # default to kindlegen; use the alternative converter for languages
    # kindlegen can't handle
    if job.dc.languages and job.dc.languages[0].id in no_kindlegen_langs:
        mobimaker = options.config.MOBILANG
    else:
        mobimaker = options.config.MOBIGEN
    if not mobimaker:
        info('no mobimaker available')
        return

    # kindlegen needs localized paths
    outputdir = os.path.abspath(job.outputdir)

    info("Creating Kindle file: %s" % os.path.join(outputdir, job.outputfile))
    info(" ... from: %s" % job.url)

    try:
        cwd = os.getcwd()
        os.chdir(outputdir)

        if 'ebook-convert' in mobimaker:
            kindlegen = subprocess.Popen(
                [mobimaker,
                 job.url,
                 os.path.basename(job.outputfile),
                 '--personal-doc="[EBOK]"'],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
        else:
            kindlegen = subprocess.Popen(
                [mobimaker,
                 '-o', os.path.basename(job.outputfile),
                 job.url],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
    except OSError as what:
        os.chdir(cwd)
        error("KindleWriter: %s %s" % (mobimaker, what))
        raise SkipOutputFormat

    (stdout, stderr) = kindlegen.communicate()
    os.chdir(cwd)

    if kindlegen.returncode > 0:
        regex = re.compile(r'^(\w+)\(prcgen\):')

        # pylint: disable=E1103
        msg = stderr.rstrip()
        if msg:
            msg = msg.decode(sys.stderr.encoding)
            error(msg)
        msg = stdout.rstrip()
        msg = msg.decode(sys.stdout.encoding)
        for line in msg.splitlines():
            match = regex.match(line)
            if match:
                sline = regex.sub("", line)
                g = match.group(1).lower()
                if g == 'info':
                    if sline == 'MOBI File generated with WARNINGS!':
                        # we knew that already
                        continue
                    # info("kindlegen: %s" % sline)
                elif g == 'warning':
                    if sline.startswith('Cover is too small'):
                        continue
                    if sline == 'Cover not specified':
                        continue
                    warning("kindlegen: %s" % sline)
                elif g == 'error':
                    error("kindlegen: %s" % sline)
            else:
                error(line)

    info("Done Kindle file: %s" % os.path.join(outputdir, job.outputfile))
def do_job(job):
    """ Do one job. """

    log_handler = None
    Logger.ebook = job.ebook
    if job.logfile:
        log_handler = open_log(os.path.join(os.path.abspath(job.outputdir), job.logfile))

    debug('=== Building %s ===' % job.type)
    start_time = datetime.datetime.now()

    try:
        if job.url:
            spider = Spider.Spider()
            dirpath = os.path.dirname(job.url)  # platform native path
            spider.include_urls += (options.include_urls or
                                    [parsers.webify_url(dirpath) + '/*'])  # use for parser only

            spider.include_mediatypes += options.include_mediatypes
            if job.subtype == '.images' or job.type == 'rst.gen':
                spider.include_mediatypes.append('image/*')

            spider.exclude_urls += options.exclude_urls
            spider.exclude_mediatypes += options.exclude_mediatypes

            spider.max_depth = options.max_depth or six.MAXSIZE

            for rewrite in options.rewrite:
                from_url, to_url = rewrite.split('>')
                spider.add_redirection(from_url, to_url)

            attribs = parsers.ParserAttributes()
            attribs.url = parsers.webify_url(job.url)
            attribs.id = 'start'

            if options.input_mediatype:
                attribs.orig_mediatype = attribs.HeaderElement.from_str(options.input_mediatype)

            spider.recursive_parse(attribs)
            elect_coverpage(spider, job.url)
            job.url = spider.redirect(job.url)
            job.base_url = job.url
            job.spider = spider

        writer = WriterFactory.create(job.maintype)
        writer.build(job)

        if options.validate:
            writer.validate(job)

        packager = PackagerFactory.create(options.packager, job.type)
        if packager:
            packager.package(job)

        if job.type == 'html.images':
            # FIXME: hack for push packager
            options.html_images_list = list(job.spider.aux_file_iter())

    except SkipOutputFormat as what:
        warning("%s" % what)

    except Exception as what:
        exception("%s" % what)

    end_time = datetime.datetime.now()
    info(' %s made in %s' % (job.type, end_time - start_time))

    if log_handler:
        close_log(log_handler)
        log_handler = None