def open_file(cls, url, attribs):
    """ Open a local file for parsing. """

    def open_file_from_path(path):
        try:
            return open(path, 'rb')
        except FileNotFoundError:
            error('Missing file: %s' % path)
        except IsADirectoryError:
            error('Missing file is a directory: %s' % path)
        return None

    if re.search(r'^([a-zA-Z]:|/)', url):
        # an absolute path, with an optional drive letter on windows
        fp = open_file_from_path(url)
    else:
        try:
            # handles all the flavors of file: urls, including on windows
            fp = urllib.request.urlopen(url)
        except urllib.error.URLError as what:
            fp = None
            error('Missing file: %s' % what.reason)
        except ValueError:
            # just a relative path?
            fp = open_file_from_path(url)

    attribs.orig_mediatype = attribs.HeaderElement(
        MediaTypes.guess_type(url))
    debug("... got mediatype %s from guess_type" % str(attribs.orig_mediatype))
    attribs.orig_url = attribs.url = url
    return fp

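# Quick illustration of the three branches above (hypothetical paths):
# an absolute path matches the drive-letter/slash regex and is opened
# directly; a file: url goes through urllib; a bare relative path makes
# urlopen raise ValueError and falls back to a plain open.
#
#   open_file('/books/12345-0.txt', attribs)          # regex branch
#   open_file('file:///books/12345-0.txt', attribs)   # urllib branch
#   open_file('images/cover.jpg', attribs)            # ValueError fallback
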
def validate(self, job):
    """ Validate generated epub using external tools. """

    debug("Validating %s ..." % job.outputfile)

    filename = os.path.join(job.outputdir, job.outputfile)
    for validator in (options.config.EPUB_VALIDATOR,
                      options.config.EPUB_PREFLIGHT):
        if validator is not None:
            params = validator.split() + [filename]
            checker = subprocess.Popen(params,
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            (dummy_stdout, stderr) = checker.communicate()
            if stderr:
                error(stderr)
                return 1
                # raise AssertionError (
                #     "%s does not validate." % job.outputfile)

    debug("%s validates ok." % job.outputfile)
    return 0

def package(self, job):
    self.setup(job)
    zipfilename = job.outputfile  # filename is zipfile

    m = re.match(r'\d+', zipfilename)
    if m:
        ebook_no = m.group(0)
    else:
        error('Invalid filename %s for push packager.' % zipfilename)
        return

    zip_ = self.create(zipfilename)

    for suffix in '.txt -8.txt -0.txt .zip -8.zip -0.zip -rst.zip -h.zip'.split():
        filename = '%s%s' % (ebook_no, suffix)
        memberfilename = '%s/%s' % (ebook_no, filename)
        self.add(zip_, filename, memberfilename)

    for suffix, ext in (('-h', 'html'), ('-rst', 'rst')):
        filename = '%s%s.%s' % (ebook_no, suffix, ext)
        memberfilename = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, filename)
        self.add(zip_, filename, memberfilename)

    # image files belong to the html version, so they go into
    # the -h member directory
    for url in options.html_images_list:
        rel_url = gg.make_url_relative(job.base_url, url)
        filename = os.path.join(self.path, rel_url)
        memberfilename = '%s/%s-h/%s' % (ebook_no, ebook_no, rel_url)
        self.add(zip_, filename, memberfilename)

    zip_.close()
    info('Done Zip file: %s' % zipfilename)

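# Resulting archive layout for ebook no. 12345 (illustrative, derived
# from the member name patterns above):
#
#   12345/12345.txt, 12345/12345-8.zip, ...   (flat formats)
#   12345/12345-h/12345-h.html                (html version)
#   12345/12345-h/images/cover.jpg            (html image files)
#   12345/12345-rst/12345-rst.rst             (rst version)
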
def copy_aux_files(self, job, dest_dir):
    """ Copy image files to dest_dir. Use image data cached in parsers. """

    for p in job.spider.parsers:
        if hasattr(p, 'resize_image'):
            src_uri = p.attribs.url
            fn_dest = gg.make_url_relative(webify_url(job.base_url), src_uri)
            fn_dest = os.path.join(dest_dir, fn_dest)
            # debug ('base_url = %s, src_uri = %s' % (job.base_url, src_uri))

            if gg.is_same_path(src_uri, fn_dest):
                debug('Not copying %s to %s: same file' % (src_uri, fn_dest))
                continue
            debug('Copying %s to %s' % (src_uri, fn_dest))

            fn_dest = gg.normalize_path(fn_dest)
            gg.mkdir_for_filename(fn_dest)
            try:
                with open(fn_dest, 'wb') as fp_dest:
                    fp_dest.write(p.serialize())
            except IOError as what:
                error('Cannot copy %s to %s: %s' % (src_uri, fn_dest, what))

def decode(self, charset):
    """ Try to decode document contents to unicode. """

    if charset is None:
        return None

    charset = charset.lower().strip()
    if charset in BOGUS_CHARSET_NAMES:
        charset = BOGUS_CHARSET_NAMES[charset]
    if charset == 'utf-8':
        charset = 'utf_8_sig'  # also strips a BOM if present

    try:
        debug("Trying to decode document with charset %s ..." % charset)
        buffer = self.bytes_content()
        buffer = REB_PG_CHARSET.sub(b'', buffer)
        buffer = buffer.decode(charset)
        self.attribs.orig_mediatype.params['charset'] = charset
        return buffer
    except LookupError as what:
        # unknown charset
        error("Invalid charset name: %s (%s)" % (charset, what))
    except UnicodeError as what:
        # mis-stated charset, did not decode
        error("Text not in charset %s (%s)" % (charset, what))
    return None

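# Why 'utf-8' is swapped for 'utf_8_sig' above (runnable illustration):
# plain 'utf-8' leaves a leading BOM in the decoded text, while
# 'utf_8_sig' consumes it.
#
#   >>> b'\xef\xbb\xbfHello'.decode('utf-8')
#   '\ufeffHello'
#   >>> b'\xef\xbb\xbfHello'.decode('utf_8_sig')
#   'Hello'
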
def __parse(self, html):
    # remove xml decl and doctype, we will add the correct one before serializing
    # html = re.compile ('^.*<html ', re.I | re.S).sub ('<html ', html)
    # FIXME: do not remove doctype because we need it to load the dtd

    # remove xml declaration because of parser error: "Unicode
    # strings with encoding declaration are not supported. Please
    # use bytes input or XML fragments without declaration."
    re_xml_decl = re.compile(r'^.*?<\?xml.*?\?>', re.S | re.U)
    html = re_xml_decl.sub('', html)
    try:
        return etree.fromstring(
            html,
            lxml.html.XHTMLParser(huge_tree=True),
            base_url=self.attribs.url)
    except etree.ParseError as what:
        # cannot try HTML parser because we depend on correct xhtml namespace
        m = re.search(r"Entity '([^']+)'", str(what))
        if m:
            warning("Missing entity: '%s'" % m.group(1))
        else:
            error("Failed to parse file because: %s" % what)
        m = re.search(r'line\s(\d+),', str(what))
        if m:
            lineno = int(m.group(1))
            error("Line %d: %s" % (lineno, html.splitlines()[lineno - 1]))
        raise

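# Minimal reproduction of the lxml error that the regex above works
# around (assumes lxml is installed):
#
#   >>> from lxml import etree
#   >>> etree.fromstring('<?xml version="1.0" encoding="utf-8"?><html/>')
#   Traceback (most recent call last):
#     ...
#   ValueError: Unicode strings with encoding declaration are not supported. ...
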
def translate(self):
    visitor = self.translator_class(self.document)
    del Unitame.unhandled_chars[:]
    self.document.walkabout(visitor)
    self.output = visitor.astext()
    if Unitame.unhandled_chars:
        error("unitame: unhandled chars: %s" %
              ", ".join(set(Unitame.unhandled_chars)))

def shipout(self, job, parsers, ncx):
    """ Build the zip file. """

    try:
        ocf = OEBPSContainer(
            os.path.join(job.outputdir, job.outputfile),
            ('%d/' % options.ebook if options.ebook else None))

        opf = ContentOPF()
        opf.metadata_item(job.dc)

        # write out parsers
        for p in parsers:
            try:
                ocf.add_bytes(self.url2filename(p.attribs.url),
                              p.serialize(), p.mediatype())
                if p.mediatype() == mt.xhtml:
                    opf.spine_item_from_parser(p)
                else:
                    opf.manifest_item_from_parser(p)
            except Exception as what:
                error("Could not process file %s: %s" % (p.attribs.url, what))

        # toc
        for t in ncx.toc:
            if t[1].lower().strip(' .') in TOC_HEADERS:
                opf.guide_item(t[0], 'toc', t[1])
                break

        opf.toc_item('toc.ncx')
        ocf.add_unicode('toc.ncx', six.text_type(ncx))

        for p in parsers:
            if 'coverpage' in p.attribs.rel:
                opf.add_coverpage(ocf, p.attribs.url)
                break

        # Adobe page-map
        # opf.pagemap_item ('page-map.xml')
        # ocf.add_unicode ('page-map.xml', six.text_type (AdobePageMap (ncx)))

        # content.opf
        # debug (etree.tostring (opf.manifest, encoding=six.text_type, pretty_print=True))
        opf.rewrite_links(self.url2filename)
        ocf.add_unicode('content.opf', six.text_type(opf))

        ocf.commit()

    except Exception as what:
        exception("Error building Epub: %s" % what)
        ocf.rollback()
        raise

def main():
    """ Main program. """

    try:
        config()
    except configparser.Error as what:
        error("Error in configuration file: %s" % str(what))
        return 1

    Logger.set_log_level(options.verbose)

    options.types = options.types or ['all']
    options.types = CommonCode.add_dependencies(options.types, DEPENDENCIES, BUILD_ORDER)
    debug("Building types: %s" % ' '.join(options.types))

    ParserFactory.load_parsers()
    WriterFactory.load_writers()
    PackagerFactory.load_packagers()

    if options.is_job_queue:
        job_queue = cPickle.load(sys.stdin.buffer)  # read bytes
    else:
        options.dc = get_dc(options.url)
        job_queue = []
        output_files = dict()
        for type_ in options.types:
            job = CommonCode.Job(type_)
            job.url = options.url
            job.ebook = options.ebook
            job.dc = options.dc
            job.outputdir = options.outputdir
            job.outputfile = options.outputfile or make_output_filename(type_, options.dc)
            output_files[type_] = job.outputfile

            # the kindle formats are built from the epub output files
            if job.type == 'kindle.images':
                job.url = os.path.join(job.outputdir, output_files['epub.images'])
            elif job.type == 'kindle.noimages':
                job.url = os.path.join(job.outputdir, output_files['epub.noimages'])

            job_queue.append(job)

    for j in job_queue:
        do_job(j)

    packager = PackagerFactory.create(options.packager, 'push')
    if packager:
        # HACK: the WWers only ever convert one ebook at a time
        job = job_queue[0]
        job.outputfile = '%d-final.zip' % (options.dc.project_gutenberg_id)
        packager.package(job)

    return 0

def tidy(html):
    """ Pipe html thru w3c tidy. """

    html = parsers.RE_RESTRICTED.sub('', html)
    html = RE_XMLDECL.sub('', html)
    html = parsers.RE_HTML_CHARSET.sub('; charset=utf-8', html)

    # convert to xhtml
    tidy = subprocess.Popen(
        ["tidy",
         "-utf8",
         "-clean",
         "--wrap", "0",
         # "--drop-font-tags", "y",
         # "--drop-proprietary-attributes", "y",
         # "--add-xml-space", "y",
         "--output-xhtml", "y",
         "--numeric-entities", "y",
         "--merge-divs", "n",  # keep poetry indentation
         "--merge-spans", "n",
         "--add-xml-decl", "n",
         "--doctype", "strict",
         "--anchor-as-name", "n",
         "--enclose-text", "y"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)

    # print (html.encode ('utf-8'))
    # sys.exit ()

    (html, stderr) = tidy.communicate(html.encode('utf-8'))

    regex = re.compile(r'(Info:|Warning:|Error:)\s*', re.I)

    # pylint: disable=E1103
    msg = stderr.decode(sys.stderr.encoding).strip()
    for line in msg.splitlines():
        match = regex.search(line)
        if match:
            sline = regex.sub("", line)
            g = match.group(1).lower()
            if g == 'info:':
                info("tidy: %s" % sline)
            elif g == 'warning:':
                warning("tidy: %s" % sline)
            elif g == 'error:':
                error("tidy: %s" % sline)
        else:
            error(line)

    if tidy.returncode == 2:
        raise ValueError(stderr)

    return html.decode('utf-8')

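# Rough command-line equivalent of the pipeline above (illustrative):
#
#   tidy -utf8 -clean --wrap 0 --output-xhtml y --numeric-entities y \
#        --merge-divs n --merge-spans n --add-xml-decl n --doctype strict \
#        --anchor-as-name n --enclose-text y < input.html > output.xhtml
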
def generate_cover(dir):
    try:
        cover_image = Cover.draw(options.dc, cover_width=1200, cover_height=1800)
        cover_url = os.path.join(dir, make_output_filename('cover', options.dc))
        with open(cover_url, 'wb+') as cover:
            cover_image.save(cover)
        return cover_url
    except OSError:
        error("OSError: Cairo not installed or could not write file.")
        return None

def bytes_content(self):
    """ Get document content as raw bytes. """

    if self.buffer is None:
        try:
            debug("Fetching %s ..." % self.attribs.url)
            self.buffer = self.fp.read()
            self.fp.close()
        except IOError as what:
            error(what)
    return self.buffer

def package(self, job):
    self.setup(job)
    filename = self.path_name_ext
    gzfilename = filename + GZIP_EXTENSION

    try:
        info('Creating Gzip file: %s' % gzfilename)
        info('  Adding file: %s' % filename)
        with open(filename, 'rb') as fp:
            with gzip.open(gzfilename, 'wb') as fpgz:
                fpgz.writelines(fp)
        info('Done Gzip file: %s' % gzfilename)
    except IOError as what:
        error(what)

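# A sketch of the same copy using shutil.copyfileobj, which streams
# fixed-size chunks instead of scanning the binary input for newlines as
# writelines() does; assumed behavior-equivalent, not the project's code:
import gzip
import shutil

def gzip_copy(filename, gzfilename):
    with open(filename, 'rb') as fp, gzip.open(gzfilename, 'wb') as fpgz:
        shutil.copyfileobj(fp, fpgz)
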
def to_xhtml(self, html, base_url):
    # an XML parser knows nothing about HTML named entities,
    # so replace the common ones with literal characters
    html = html.replace('&nbsp;', ' ')
    html = html.replace('&mdash;', '—')

    try:
        xhtml = etree.fromstring(
            html,
            lxml.html.XHTMLParser(),
            base_url=base_url)
    except etree.ParseError as what:
        error("etree.fromstring says %s" % what)
        raise

    xhtml.make_links_absolute(base_url=base_url)
    return xhtml

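# Why the entity substitutions above are needed (assumes lxml installed):
# an XML parser only knows the five predefined entities, so a named HTML
# entity like &nbsp; is a fatal parse error.
#
#   >>> from lxml import etree
#   >>> etree.fromstring('<p>&nbsp;</p>')
#   Traceback (most recent call last):
#     ...
#   lxml.etree.XMLSyntaxError: Entity 'nbsp' not defined, ...
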
def apply(self, **kwargs):
    if self.document.settings.encoding != 'utf-8':
        charset = self.document.settings.encoding
        del Unitame.unhandled_chars[:]

        for n in self.document.traverse(nodes.Text):
            text = n.astext()
            text2 = text.encode(charset, 'unitame').decode(charset)
            if text != text2:
                # cannot change text nodes, replace instead
                n.parent.replace(n, nodes.Text(text2))

        if Unitame.unhandled_chars:
            error("unitame: unhandled chars: %s" %
                  ", ".join(set(Unitame.unhandled_chars)))

def __load_writers_from(package_name):
    """ See what types we can write. """

    for fn in resource_listdir(package_name, ''):
        modulename, ext = os.path.splitext(fn)
        if ext == '.py' and modulename.endswith('Writer'):
            type_ = modulename.lower().replace('writer', '')
            try:
                debug("Loading writer type %s from module %s" % (type_, modulename))
                module = __import__(package_name + '.' + modulename,
                                    fromlist=[modulename])
                writers[type_] = module
            except ImportError as what:
                error("Could not load writer type %s from module %s. %s" %
                      (type_, modulename, what))

def rewrite_internal_links_toc(self, toc):
    """ Rewrite links to point into right chunks.

    Because we split the HTML into chunks, all internal links
    need to be rewritten to become links into the right chunk.
    Rewrite all links in the passed toc.

    """

    for entry in list(toc):
        try:
            entry[0] = self.idmap[normalize_uri(entry[0])]
        except KeyError:
            error("HTMLChunker: Cannot rewrite toc entry '%s'" % entry[0])
            error(repr(self.idmap))
            toc.remove(entry)  # drop entries we cannot rewrite

def build(self, job):
    """ Build RST file. """

    filename = os.path.join(os.path.abspath(job.outputdir), job.outputfile)

    info("Creating RST file: %s" % filename)

    parser = ParserFactory.ParserFactory.create(job.url)

    if not hasattr(parser, 'rst2nroff'):
        error('RSTWriter can only work on a RSTParser.')
        raise SkipOutputFormat

    data = parser.preprocess('utf-8').encode('utf-8')
    self.write_with_crlf(filename, data)

    info("Done RST file: %s" % filename)

def rewrite_internal_links(self):
    """ Rewrite links to point into right chunks.

    Because we split the HTML into chunks, all internal links
    need to be rewritten to become links into the right chunk.
    Rewrite all internal links in all chunks.

    """

    for chunk in self.chunks:
        for a in xpath(chunk[0], "//xhtml:*[@href]"):
            try:
                uri = normalize_uri(a.get('href'))
                a.set('href', self.idmap[uri])
            except KeyError:
                ur, dummy_frag = urllib.parse.urldefrag(uri)
                if ur in self.idmap:
                    error("HTMLChunker: Cannot rewrite internal link '%s'" % uri)

def enqueue(self, queue, depth, attribs, is_doc):
    """ Enqueue url for parsing. """

    if is_doc:
        if not self.is_included_url(attribs):
            warning('External link in %s: %s' % (attribs.referrer, attribs.url))
            return
        if depth >= self.max_depth:
            error('Omitted file %s due to depth > max_depth' % attribs.url)
            return
        if not self.is_included_mediatype(attribs) and \
           not self.is_included_relation(attribs):
            return
    elif not self.is_included_url(attribs) and \
         not self.is_included_relation(attribs):
        error('Failed for embedded media in %s from disallowed location: %s' %
              (attribs.referrer, attribs.url))
        return

    queue.append((depth, attribs))

def apply(self, **kwargs):
    iter_ = self.startnode.traverse(nodes.paragraph, siblings=1)
    if len(iter_):
        para = iter_[0]
        iter_ = para.traverse(nodes.Text)
        details = self.startnode.details
        if len(iter_):
            textnode = iter_[0]
            charnode = spannode = restnode = None

            char = details['char']
            if not textnode.startswith(char):
                error("Dropcap: next paragraph doesn't start with: '%s'." % char)
                return
            span = details.get('span', '')
            if not textnode.startswith(span):
                error("Dropcap: next paragraph doesn't start with: '%s'." % span)
                return
            if span and not span.startswith(char):
                error("Dropcap: span doesn't start with: '%s'." % char)
                return
            if span == char:
                span = ''

            if span:
                # split into char/span/rest
                restnode = nodes.Text(textnode.astext()[len(span):])
                spannode = nodes.inline()
                spannode.append(nodes.Text(textnode.astext()[len(char):len(span)]))
                spannode['classes'].append('dropspan')
            else:
                # split into char/rest
                restnode = nodes.Text(textnode.astext()[len(char):])
                spannode = nodes.inline('', '')
                spannode['classes'].append('dropspan')

            if 'image' in details:
                charnode = nodes.image()
                charnode['uri'] = details['image']
                charnode['alt'] = char
                # debug ("Inserting image %s as dropcap." % uri)
            else:
                charnode = nodes.inline()
                charnode.append(nodes.Text(char))
                # debug ("Inserting char %s as dropcap." % char)

            charnode['classes'].append('dropcap')
            charnode.attributes.update(details)

            para.replace(textnode, [charnode, spannode, restnode])

    self.startnode.parent.remove(self.startnode)

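# How the char/span/rest split above works on a concrete paragraph
# (illustrative values): with char='T' and span='Th', 'The end.' is cut
# into 'T' (dropcap) + 'h' (dropspan) + 'e end.' (rest).
text, char, span = 'The end.', 'T', 'Th'
print(text[:len(char)], '|', text[len(char):len(span)], '|', text[len(span):])
# T | h | e end.
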
def resize_image(self, max_size, max_dimen, output_format=None):
    """ Create a new parser with a resized image. """

    new_parser = Parser()
    try:
        image = Image.open(six.BytesIO(self.image_data))

        format_ = image.format
        if output_format:
            format_ = output_format
        if format_ == 'gif':
            format_ = 'png'
        if format_ == 'jpeg' and image.mode.lower() != 'rgb':
            image = image.convert('RGB')

        if 'dpi' in image.info:
            del image.info['dpi']

        # maybe resize image
        # find scaling factor
        scale = 1.0
        scale = min(scale, max_dimen[0] / float(image.size[0]))
        scale = min(scale, max_dimen[1] / float(image.size[1]))

        was = ''
        if scale < 1.0:
            dimen = (int(image.size[0] * scale), int(image.size[1] * scale))
            was = "(was %d x %d scale=%.2f) " % (image.size[0], image.size[1], scale)
            image = image.resize(dimen, Image.ANTIALIAS)

        # find best quality that fits into max_size
        data = self.image_data
        if (scale < 1.0) or (len(self.image_data) > max_size):
            for quality in (90, 85, 80, 70, 60, 50, 40, 30, 20, 10):
                buf = six.BytesIO()
                image.save(buf, format_, quality=quality)
                data = buf.getvalue()
                if len(data) <= max_size:
                    was += 'q=%d' % quality
                    break

        comment = "Image: %d x %d size=%d %s" % (
            image.size[0], image.size[1], len(data), was)
        debug(comment)

        new_parser.image_data = data
        new_parser.dimen = tuple(image.size)
        new_parser.attribs = copy.copy(self.attribs)
        new_parser.attribs.comment = comment
        new_parser.fp = self.fp

    except IOError as what:
        error("Could not resize image: %s" % what)
        return ParserFactory.create(BROKEN)

    return new_parser

def _fix_anchors(self):
    """ Move name to id and fix hrefs and ids. """

    # move anchor name to id
    # 'id' values are more strict than 'name' values
    # try to fix ill-formed ids

    seen_ids = set()

    for anchor in (xpath(self.xhtml, "//xhtml:a[@name]") +
                   xpath(self.xhtml, "//xhtml:*[@id]")):
        id_ = anchor.get('id') or anchor.get('name')

        if 'name' in anchor.attrib:
            del anchor.attrib['name']
        if 'id' in anchor.attrib:
            del anchor.attrib['id']
        if NS.xml.id in anchor.attrib:
            del anchor.attrib[NS.xml.id]

        id_ = self._fix_id(id_)

        if not parsers.RE_XML_NAME.match(id_):
            error("Dropping ill-formed id '%s' in %s" % (id_, self.attribs.url))
            continue

        # well-formed id
        if id_ in seen_ids:
            error("Dropping duplicate id '%s' in %s" % (id_, self.attribs.url))
            continue

        seen_ids.add(id_)
        anchor.set('id', id_)

    # try to fix bogus fragment ids
    # 1. fragments point to xml:id, so must be well-formed ids
    # 2. the ids they point to must exist

    for link in xpath(self.xhtml, "//xhtml:*[@href]"):
        href = link.get('href')
        hre, frag = urllib.parse.urldefrag(href)
        if frag:
            frag = self._fix_internal_frag(frag)

            if not frag:
                # non-recoverable ill-formed frag
                del link.attrib['href']
                self.add_class(link, 'pgkilled')
                error('Dropping ill-formed frag in %s' % href)
                continue

            # well-formed frag
            if hre:
                # we have url + frag
                link.set('href', "%s#%s" %
                         (hre, urllib.parse.quote(frag.encode('utf-8'))))
                self.add_class(link, 'pgexternal')
            elif frag in seen_ids:
                # we have only frag
                link.set('href', "#%s" % urllib.parse.quote(frag.encode('utf-8')))
                self.add_class(link, 'pginternal')
            else:
                del link.attrib['href']
                self.add_class(link, 'pgkilled')
                error("Dropping frag to non-existing id in %s" % href)

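# How urldefrag splits href values as used above (runnable illustration):
#
#   >>> import urllib.parse
#   >>> urllib.parse.urldefrag('chapter1.html#note-3')
#   DefragResult(url='chapter1.html', fragment='note-3')
#   >>> urllib.parse.urldefrag('chapter1.html')
#   DefragResult(url='chapter1.html', fragment='')
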
def build(self, job):
    """ Build PDF file. """

    inputfilename = job.url
    outputfilename = os.path.join(os.path.abspath(job.outputdir), job.outputfile)

    debug("Inputfile: %s" % inputfilename)
    info("Creating PDF file: %s" % outputfilename)

    parser = ParserFactory.ParserFactory.create(inputfilename)

    if not hasattr(parser, 'rst2xetex'):
        warning('Skipping PDF Output because input mediatype is %s' % parser.mediatype())
        raise SkipOutputFormat

    # Brain-dead xetex doesn't understand unix pipes
    # so we have to write a temp file
    texfilename = os.path.splitext(outputfilename)[0] + '.tex'
    auxfilename = os.path.splitext(outputfilename)[0] + '.aux'
    logfilename = os.path.splitext(outputfilename)[0] + '.log'

    try:
        os.remove(auxfilename)
    except OSError:
        pass

    tex = parser.rst2xetex(job)
    with open(texfilename, 'wb') as fp:
        fp.write(tex)

    try:
        cwd = os.getcwd()
        os.chdir(os.path.abspath(job.outputdir))

        _xetex = subprocess.Popen(
            [options.config.XELATEX,
             "-output-directory", job.outputdir,
             "-interaction", "nonstopmode",
             texfilename],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
    except OSError as what:
        os.chdir(cwd)
        error("PDFWriter: %s %s" % (options.config.XELATEX, what))
        raise SkipOutputFormat

    (dummy_stdout, dummy_stderr) = _xetex.communicate()

    with open(logfilename, encoding='utf-8') as fp:
        for line in fp:
            line = line.strip()
            if 'Error:' in line:
                error("xetex: %s" % line)
            if options.verbose >= 1:
                if 'Warning:' in line:
                    warning("xetex: %s" % line)

    if options.verbose < 2:
        try:
            os.remove(texfilename)
            os.remove(logfilename)
            os.remove(auxfilename)
        except OSError:
            pass

    os.chdir(cwd)

    info("Done PDF file: %s" % outputfilename)

def groff(self, job, nroff, encoding='utf-8'):
    """ Process thru groff.

    Takes and returns unicode strings!

    """

    device = {'utf-8': 'utf8',
              'iso-8859-1': 'latin1',
              'us-ascii': 'ascii'}[encoding]

    nroff = nroff.encode(encoding)
    nrofffilename = os.path.join(
        os.path.abspath(job.outputdir),
        os.path.splitext(job.outputfile)[0] + '.nroff')

    # write nroff file for debugging
    if options.verbose >= 2:
        with open(nrofffilename, 'wb') as fp:
            fp.write(nroff)
    else:
        try:
            # remove debug files from previous runs
            os.remove(nrofffilename)
        except OSError:
            pass

    # call groff
    try:
        _groff = subprocess.Popen(
            [options.config.GROFF,
             "-t",           # preprocess with tbl
             "-K", device,   # input encoding
             "-T", device],  # output device
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
    except OSError:
        error("TxtWriter: executable not found: %s" % options.config.GROFF)
        raise SkipOutputFormat

    (txt, stderr) = _groff.communicate(nroff)

    # pylint: disable=E1103
    for line in stderr.splitlines():
        line = line.decode(sys.stderr.encoding)
        line = line.strip()
        if 'error' in line:
            error("groff: %s" % line)
        elif 'warn' in line:
            if options.verbose >= 1:
                warning("groff: %s" % line)

    txt = txt.decode(encoding)
    return txt.translate(u2u)  # fix nroff idiosyncrasies

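# Rough command-line equivalent of the groff call above for utf-8 input
# (illustrative):
#
#   groff -t -K utf8 -T utf8 < book.nroff > book.txt
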
def resize_image(self, max_size, max_dimen, output_format=None):
    """ Create a new parser with a resized image. """

    def scale_image(image, scale):
        was = ''
        if scale < 1.0:
            dimen = (int(image.size[0] * scale), int(image.size[1] * scale))
            was = "(was %d x %d scale=%.2f) " % (image.size[0], image.size[1], scale)
            image = image.resize(dimen, Image.ANTIALIAS)
        return was, image

    def get_image_data(image, format_, quality=95):
        buf = six.BytesIO()
        if format_ == 'png':
            image.save(buf, 'png', optimize=True)
        else:
            image.save(buf, 'jpeg', quality=quality)
        return buf.getvalue()

    new_parser = Parser()
    try:
        unsized_image = Image.open(six.BytesIO(self.image_data))

        format_ = unsized_image.format.lower()
        if output_format:
            format_ = output_format
        if format_ == 'gif':
            format_ = 'png'
        if format_ == 'jpeg' and unsized_image.mode.lower() not in ('rgb', 'l'):
            unsized_image = unsized_image.convert('RGB')

        if 'dpi' in unsized_image.info:
            del unsized_image.info['dpi']

        # maybe resize image
        # find scaling factor
        scale = 1.0
        scale = min(scale, max_dimen[0] / float(unsized_image.size[0]))
        scale = min(scale, max_dimen[1] / float(unsized_image.size[1]))

        was, image = scale_image(unsized_image, scale)
        data = get_image_data(image, format_)

        if format_ == 'png':
            # scale it till it fits into max_size
            while len(data) > max_size and scale > 0.01:
                scale = scale * 0.8
                was, image = scale_image(unsized_image, scale)
                data = get_image_data(image, format_)
        else:
            # find best quality that fits into max_size
            if len(data) > max_size:
                for quality in (90, 85, 80, 70, 60, 50, 40, 30, 20, 10):
                    data = get_image_data(image, format_, quality=quality)
                    if len(data) <= max_size:
                        break
                was += 'q=%d' % quality

        comment = "Image: %d x %d size=%d %s" % (
            image.size[0], image.size[1], len(data), was)
        debug(comment)

        new_parser.image_data = data
        new_parser.dimen = tuple(image.size)
        new_parser.attribs = copy.copy(self.attribs)
        new_parser.attribs.comment = comment
        new_parser.fp = self.fp

    except IOError as what:
        error("Could not resize image: %s" % what)
        return ParserFactory.create(BROKEN)

    return new_parser

def build(self, job):
    """ Build kindle file from epub using amazon kindlegen. """

    info("Creating Kindle file: %s" % os.path.join(job.outputdir, job.outputfile))
    info("  ... from: %s" % job.url)

    try:
        cwd = os.getcwd()
        os.chdir(job.outputdir)

        kindlegen = subprocess.Popen(
            [options.config.MOBIGEN,
             '-o', os.path.basename(job.outputfile),
             job.url],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
    except OSError as what:
        os.chdir(cwd)
        error("KindleWriter: %s %s" % (options.config.MOBIGEN, what))
        raise SkipOutputFormat

    (stdout, stderr) = kindlegen.communicate()
    os.chdir(cwd)

    if kindlegen.returncode > 0:
        regex = re.compile(r'^(\w+)\(prcgen\):')

        # pylint: disable=E1103
        msg = stderr.rstrip()
        if msg:
            msg = msg.decode(sys.stderr.encoding)
            error(msg)
        msg = stdout.rstrip()
        msg = msg.decode(sys.stdout.encoding)
        for line in msg.splitlines():
            match = regex.match(line)
            if match:
                sline = regex.sub("", line)
                g = match.group(1).lower()
                if g == 'info':
                    if sline == 'MOBI File generated with WARNINGS!':
                        # we knew that already
                        continue
                    # info ("kindlegen: %s" % sline)
                elif g == 'warning':
                    if sline.startswith('Cover is too small'):
                        continue
                    if sline == 'Cover not specified':
                        continue
                    warning("kindlegen: %s" % sline)
                elif g == 'error':
                    error("kindlegen: %s" % sline)
                else:
                    error(line)

    info("Done Kindle file: %s" % os.path.join(job.outputdir, job.outputfile))

def build(self, job):
    """ Build kindle file from epub using amazon kindlegen or calibre. """

    # default to kindlegen; use calibre for languages kindlegen
    # cannot handle (initialize mobimaker first so it is always bound)
    mobimaker = options.config.MOBIGEN
    if job.dc.languages and job.dc.languages[0].id in no_kindlegen_langs:
        mobimaker = options.config.MOBILANG
    if not mobimaker:
        info('no mobimaker available')
        return

    # kindlegen needs localized paths
    outputdir = os.path.abspath(job.outputdir)

    info("Creating Kindle file: %s" % os.path.join(outputdir, job.outputfile))
    info("  ... from: %s" % job.url)

    try:
        cwd = os.getcwd()
        os.chdir(outputdir)
        if 'ebook-convert' in mobimaker:
            kindlegen = subprocess.Popen(
                [mobimaker,
                 job.url,
                 os.path.basename(job.outputfile),
                 '--personal-doc="[EBOK]"'],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
        else:
            kindlegen = subprocess.Popen(
                [mobimaker,
                 '-o', os.path.basename(job.outputfile),
                 job.url],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
    except OSError as what:
        os.chdir(cwd)
        error("KindleWriter: %s %s" % (mobimaker, what))
        raise SkipOutputFormat

    (stdout, stderr) = kindlegen.communicate()
    os.chdir(cwd)

    if kindlegen.returncode > 0:
        regex = re.compile(r'^(\w+)\(prcgen\):')

        # pylint: disable=E1103
        msg = stderr.rstrip()
        if msg:
            msg = msg.decode(sys.stderr.encoding)
            error(msg)
        msg = stdout.rstrip()
        msg = msg.decode(sys.stdout.encoding)
        for line in msg.splitlines():
            match = regex.match(line)
            if match:
                sline = regex.sub("", line)
                g = match.group(1).lower()
                if g == 'info':
                    if sline == 'MOBI File generated with WARNINGS!':
                        # we knew that already
                        continue
                    # info("kindlegen: %s" % sline)
                elif g == 'warning':
                    if sline.startswith('Cover is too small'):
                        continue
                    if sline == 'Cover not specified':
                        continue
                    warning("kindlegen: %s" % sline)
                elif g == 'error':
                    error("kindlegen: %s" % sline)
                else:
                    error(line)

    info("Done Kindle file: %s" % os.path.join(outputdir, job.outputfile))

def resize_image(self, max_size, max_dimen, output_format=None):
    """ Create a new parser with a resized image. """

    def scale_image(image, scale):
        was = ''
        if scale < 1.0:
            dimen = (int(image.size[0] * scale), int(image.size[1] * scale))
            was = "(was %d x %d scale=%.2f) " % (image.size[0], image.size[1], scale)
            image = image.resize(dimen, Image.ANTIALIAS)
        return was, image

    def get_image_data(image, format_, quality='keep'):
        """ Format is the output format, not necessarily the input format. """
        buf = six.BytesIO()
        if image.format != 'JPEG' and quality == 'keep':
            quality = 90
        if format_ == 'png':
            image.save(buf, 'png', optimize=True)
        else:
            try:
                image.save(buf, 'jpeg', quality=quality)
            except ValueError as e:
                if quality == 'keep' and 'quantization' in str(e):
                    image.save(buf, 'jpeg', quality=90)
                else:
                    raise e
        return buf.getvalue()

    new_parser = Parser()
    try:
        unsized_image = Image.open(six.BytesIO(self.image_data))

        format_ = unsized_image.format.lower()
        if output_format:
            format_ = output_format
        if format_ == 'gif':
            format_ = 'png'
        if format_ == 'jpeg' and unsized_image.mode.lower() not in ('rgb', 'l'):
            unsized_image = unsized_image.convert('RGB')

        if 'dpi' in unsized_image.info:
            del unsized_image.info['dpi']

        # maybe resize image
        # find scaling factor
        scale = 1.0
        scale = min(scale, max_dimen[0] / float(unsized_image.size[0]))
        scale = min(scale, max_dimen[1] / float(unsized_image.size[1]))

        was, image = scale_image(unsized_image, scale)
        data = get_image_data(image, format_)

        if format_ == 'png':
            # scale it till it fits into max_size
            while len(data) > max_size and scale > 0.01:
                scale = scale * 0.8
                was, image = scale_image(unsized_image, scale)
                data = get_image_data(image, format_)
        else:
            # find best quality that fits into max_size
            if len(data) > max_size:
                for quality in (90, 85, 80, 70, 60, 50, 40, 30, 20, 10):
                    data = get_image_data(image, format_, quality=quality)
                    if len(data) <= max_size:
                        break
                was += 'q=%d' % quality

        comment = "Image: %d x %d size=%d %s" % (
            image.size[0], image.size[1], len(data), was)
        debug(comment)

        new_parser.image_data = data
        new_parser.dimen = tuple(image.size)
        new_parser.attribs = copy.copy(self.attribs)
        new_parser.attribs.comment = comment
        new_parser.fp = self.fp

    except IOError as what:
        error("Could not resize image: %s" % what)
        new_parser.attribs = copy.copy(self.attribs)
        fp = resource_stream('ebookmaker.parsers', 'broken.png')
        new_parser.image_data = fp.read()
        fp.close()

    return new_parser

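# Usage sketch for the resizers above; the size and dimension limits are
# made-up example values, not project defaults:
#
#   small = parser.resize_image(max_size=128 * 1024,    # 128 kB budget
#                               max_dimen=(1280, 920))  # max w x h in px
#   # small.attribs.comment then reads e.g. "Image: 640 x 460 size=58777 ..."
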