def package(self, job):
    """ Package one ebook's generated files into the push zip.

    The zip layout mirrors the Project Gutenberg file layout:
    plain files live under <ebook_no>/, while the html and rst
    editions live under <ebook_no>/<ebook_no>-h/ and
    <ebook_no>/<ebook_no>-rst/ respectively.
    """
    self.setup(job)
    zipfilename = job.outputfile  # filename is zipfile

    # the ebook number is the leading digits of the zip filename
    m = re.match(r'\d+', zipfilename)
    if m:
        ebook_no = m.group(0)
    else:
        error('Invalid filename %s for push packager.' % zipfilename)
        return

    zip_ = self.create(zipfilename)

    # flat files directly under the ebook-number directory
    for suffix in '.txt -8.txt -0.txt .zip -8.zip -0.zip -rst.zip -h.zip'.split():
        filename = '%s%s' % (ebook_no, suffix)
        memberfilename = '%s/%s' % (ebook_no, filename)
        self.add(zip_, filename, memberfilename)

    # editions that live in their own subdirectory
    for suffix, ext in (('-h', 'html'), ('-rst', 'rst')):
        filename = '%s%s.%s' % (ebook_no, suffix, ext)
        memberfilename = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, filename)
        self.add(zip_, filename, memberfilename)

    # image files accompany the html edition, so they belong in the
    # '-h' subdirectory.  BUGFIX: the original reused the leaked loop
    # variable `suffix` here, which after the loop above was always
    # '-rst', placing images in the wrong member directory.
    for url in options.html_images_list:
        rel_url = gg.make_url_relative(job.base_url, url)
        filename = os.path.join(self.path, rel_url)
        memberfilename = '%s/%s-h/%s' % (ebook_no, ebook_no, rel_url)
        self.add(zip_, filename, memberfilename)

    zip_.close()
    info('Done Zip file: %s' % zipfilename)
def __init__(self, filename, oebps_path=None):
    """ Create the epub zip container.

    Writes the 'mimetype' entry first and uncompressed, as the OCF
    spec requires, then adds a container.xml pointing at content.opf.
    """
    self.zipfilename = filename
    self.oebps_path = oebps_path or 'OEBPS/'
    info('Creating Epub file: %s' % filename)

    # open the underlying zipfile for writing
    zipfile.ZipFile.__init__(self, filename, 'w', zipfile.ZIP_DEFLATED)

    # the OCF spec says mimetype must be first and uncompressed
    mime_info = self.zi()
    mime_info.compress_type = zipfile.ZIP_STORED
    mime_info.filename = 'mimetype'
    self.writestr(mime_info, 'application/epub+zip')

    self.add_container_xml('content.opf')

    # counter used to generate unique filenames for wrappers
    self.wrappers = 0
def tidy(html):
    """ Pipe html thru w3c tidy and return the cleaned xhtml. """
    html = parsers.RE_RESTRICTED.sub('', html)
    html = RE_XMLDECL.sub('', html)
    html = parsers.RE_HTML_CHARSET.sub('; charset=utf-8', html)

    # convert to xhtml
    tidy_args = [
        "tidy",
        "-utf8",
        "-clean",
        "--wrap", "0",
        # "--drop-font-tags", "y",
        # "--drop-proprietary-attributes", "y",
        # "--add-xml-space", "y",
        "--output-xhtml", "y",
        "--numeric-entities", "y",
        "--merge-divs", "n",  # keep poetry indentation
        "--merge-spans", "n",
        "--add-xml-decl", "n",
        "--doctype", "strict",
        "--anchor-as-name", "n",
        "--enclose-text", "y",
    ]
    proc = subprocess.Popen(tidy_args,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

    # print (html.encode ('utf-8'))
    # sys.exit ()

    (html, stderr) = proc.communicate(html.encode('utf-8'))

    # relay tidy's diagnostics through our own logging
    severity_re = re.compile(r'(Info:|Warning:|Error:)\s*', re.I)

    # pylint: disable=E1103
    msg = stderr.decode(sys.stderr.encoding).strip()
    for line in msg.splitlines():
        match = severity_re.search(line)
        if match:
            stripped = severity_re.sub("", line)
            severity = match.group(1).lower()
            if severity == 'info:':
                info("tidy: %s" % stripped)
            elif severity == 'warning:':
                warning("tidy: %s" % stripped)
            elif severity == 'error:':
                error("tidy: %s" % stripped)
            else:
                error(line)

    # returncode 2 means tidy hit errors
    if proc.returncode == 2:
        raise ValueError(stderr)

    return html.decode('utf-8')
def build(self, job):
    """ Build the Pics directory by copying the job's auxiliary files. """
    target_dir = os.path.abspath(job.outputdir)
    info("Creating Pics directory in: %s" % target_dir)
    self.copy_aux_files(job, target_dir)
    info("Done Pics directory in: %s" % target_dir)
def build(self, job):
    """ Build HTML file.

    Converts every parser the spider collected (RST or pre-parsed
    xhtml) into a single standalone HTML file.  On any failure the
    partial output file is removed and the exception re-raised.
    """
    htmlfilename = os.path.join(job.outputdir, job.outputfile)
    # start from a clean slate; a missing file is fine
    try:
        os.remove(htmlfilename)
    except OSError:
        pass
    try:
        info("Creating HTML file: %s" % htmlfilename)
        for p in job.spider.parsers:
            # Do html only. The images were copied earlier by PicsDirWriter.
            xhtml = None
            if hasattr(p, 'rst2html'):
                xhtml = p.rst2html(job)
            elif hasattr(p, 'xhtml'):
                p.parse()
                xhtml = copy.deepcopy(p.xhtml)
            if xhtml is not None:
                self.make_links_relative(xhtml, p.attribs.url)
                self.add_dublincore(job, xhtml)
                # makes iphones zoom in
                self.add_meta(xhtml, 'viewport', 'width=device-width')
                self.add_meta_generator(xhtml)
                # This writer has currently to deal only with RST
                # input. The RST writer has a workaround that
                # avoids writing empty elements. So we don't need
                # the same ugly workaround as the EPUB writer,
                # that has to deal with HTML input too.
                html = etree.tostring(xhtml,
                                      method='xml',
                                      doctype=gg.XHTML_DOCTYPE,
                                      encoding='utf-8',
                                      pretty_print=True,
                                      xml_declaration=True)
                self.write_with_crlf(htmlfilename, html)
        info("Done HTML file: %s" % htmlfilename)
    except Exception as what:
        exception("Error building HTML %s: %s" % (htmlfilename, what))
        if os.access(htmlfilename, os.W_OK):
            os.remove(htmlfilename)
        # bare raise re-raises the active exception with its original
        # traceback (was `raise what`, which is less idiomatic)
        raise
def get_charset_from_meta(self):
    """ Look for a charset hint in the PG header of the raw bytes.

    Returns the charset name as a str, or None if no hint was found.
    """
    match = parsers.REB_PG_CHARSET.search(self.bytes_content())
    if match is None:
        return None
    charset = match.group(1).decode('ascii')
    info('Got charset %s from pg header' % charset)
    return charset
def package(self, job):
    """ Zip the job's single output file. """
    self.setup(job)
    source = self.path_name_ext
    zipfilename = '%s.zip' % os.path.join(self.path, self.name)
    zip_ = self.create(zipfilename)
    self.add(zip_, source, self.name + self.ext)
    zip_.close()
    info('Done Zip file: %s' % zipfilename)
def add(zip_, filename, memberfilename):
    """ Add one file to the zip.

    Already-compressed members (.zip, .png) are stored uncompressed;
    everything else is deflated.  A missing file is logged as a
    warning, not an error.
    """
    try:
        # raises OSError if the file is missing
        os.stat(filename)
        dummy_name, ext = os.path.splitext(filename)
        info(' Adding file: %s as %s' % (filename, memberfilename))
        zip_.write(
            filename, memberfilename,
            zipfile.ZIP_STORED if ext in ['.zip', '.png'] else zipfile.ZIP_DEFLATED)
    except OSError:
        # %-format the message here for consistency with the rest of
        # the file (was lazy-args style `warning(fmt, filename)`)
        warning('ZipPackager: Cannot add file %s' % filename)
def build(self, job):
    """ Build RST file from an RST parser's preprocessed output. """
    filename = os.path.join(os.path.abspath(job.outputdir), job.outputfile)
    info("Creating RST file: %s" % filename)

    parser = ParserFactory.ParserFactory.create(job.url)
    if not hasattr(parser, 'rst2nroff'):
        error('RSTWriter can only work on a RSTParser.')
        raise SkipOutputFormat

    self.write_with_crlf(filename, parser.preprocess('utf-8').encode('utf-8'))
    info("Done RST file: %s" % filename)
def pre_parse(self): """ Pre-parse a html ebook. Does a full parse because a lightweight parse would be almost as much work. """ # cache if self.xhtml is not None: return debug("HTMLParser.pre_parse () ...") html = self.unicode_content() if html.startswith('<?xml'): # Try a naive parse. This might fail because of errors in # the html or because we have no dtd loaded. We do not # load dtds because that makes us dependent on network and # the w3c site being up. Having all users of ebookmaker # install local dtds is unrealistic. try: self.xhtml = self.__parse(html) except etree.ParseError: pass if self.xhtml is None: # previous parse failed, try tidy info("Running html thru tidy.") html = self.tidy(html) self.xhtml = self.__parse(html) # let exception bubble up self._fix_anchors() # needs relative paths self.xhtml.make_links_absolute(base_url=self.attribs.url) self._to_xhtml11() self._make_coverpage_link() debug("Done parsing %s" % self.attribs.url)
def build(self, job):
    """ Build plain text file.

    RST input is rendered via groff; html input is refused; any other
    input is passed through as unicode text.
    """
    filename = os.path.join(job.outputdir, job.outputfile)
    encoding = job.subtype.strip('.')

    info("Creating plain text file: %s" % filename)

    parser = ParserFactory.ParserFactory.create(job.url)

    if hasattr(parser, 'rst2nroff'):
        data = self.groff(job, parser.rst2nroff(job, encoding), encoding)
    elif hasattr(parser, 'xhtml') and parser.xhtml is not None:
        info("Plain text file %s aborted due to html input" % filename)
        return
    else:
        data = parser.unicode_content()

    # utf-8 output gets a BOM; 'unitame' degrades unmappable chars
    if encoding == 'utf-8':
        data = data.encode('utf_8_sig', 'unitame')
    else:
        data = data.encode(encoding, 'unitame')

    self.write_with_crlf(filename, data)
    info("Done plain text file: %s" % filename)
def unicode_content(self):
    """ Get document content as unicode string (cached).

    Tries charset sources in order of decreasing confidence; the
    first decode that yields non-empty text wins.  Raises
    UnicodeError if nothing decodes.
    """
    if self.unicode_buffer is not None:
        return self.unicode_buffer

    text = (self.decode(self.get_charset_from_content_type())
            or self.decode(self.get_charset_from_meta())
            or self.decode(self.guess_charset_from_body())
            or self.decode('utf-8')
            or self.decode('windows-1252'))

    if not text:
        if text == '':
            # empty file: keep going with empty content
            info('Continuing parse despite missing file')
            self.unicode_buffer = ''
        else:
            raise UnicodeError(
                "Text in Klingon encoding ... giving up.")

    # normalize line-endings
    if '\r' in text or '\u2028' in text:
        text = '\n'.join(text.splitlines())

    self.unicode_buffer = text
    return self.unicode_buffer
def package(self, job):
    """ Zip the job's output file together with its auxiliary image files. """
    self.setup(job)

    try:
        aux_files = list(job.spider.aux_file_iter())
    except AttributeError:
        aux_files = []

    zipfilename = os.path.join(self.path, self.name) + '.zip'
    zip_ = self.create(zipfilename)

    # the main document goes into a subdirectory named after the ebook
    self.add(zip_, job.outputfile,
             os.path.join(self.name, self.name) + self.ext)

    # now images
    for url in aux_files:
        rel_url = gg.make_url_relative(job.base_url, url)
        self.add(zip_,
                 os.path.join(self.path, rel_url),
                 os.path.join(self.name, rel_url))

    zip_.close()
    info('Done Zip file: %s' % zipfilename)
def package(self, job):
    """ Gzip the job's single output file.

    Writes <outputfile><GZIP_EXTENSION> next to the original;
    I/O errors are logged, not raised.
    """
    self.setup(job)
    filename = self.path_name_ext
    gzfilename = filename + GZIP_EXTENSION

    try:
        info('Creating Gzip file: %s' % gzfilename)
        info(' Adding file: %s' % filename)
        with open(filename, 'rb') as fp:
            with gzip.open(gzfilename, 'wb') as fpgz:
                fpgz.writelines(fp)
        # BUGFIX: message said 'Done Zip file' for a gzip packager
        info('Done Gzip file: %s' % gzfilename)
    except IOError as what:
        error(what)
def create(zipfilename):
    """ Create a zip file.

    Returns the open ZipFile, in write mode with deflate compression.
    """
    info('Creating Zip file: %s' % zipfilename)
    zip_ = zipfile.ZipFile(zipfilename, 'w', zipfile.ZIP_DEFLATED)
    return zip_
def build(self, job):
    """ Build kindle file from epub using amazon kindlegen.

    Runs kindlegen inside the job's output directory and relays its
    diagnostics to our log, filtering out known noise.  Raises
    SkipOutputFormat if kindlegen cannot be started.
    """
    info("Creating Kindle file: %s" % os.path.join(job.outputdir, job.outputfile))
    info(" ... from: %s" % job.url)

    try:
        cwd = os.getcwd()
        # kindlegen resolves its output relative to the cwd
        os.chdir(job.outputdir)
        kindlegen = subprocess.Popen([
            options.config.MOBIGEN, '-o', os.path.basename(job.outputfile), job.url
        ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as what:
        os.chdir(cwd)
        error("KindleWriter: %s %s" % (options.config.MOBIGEN, what))
        raise SkipOutputFormat

    (stdout, stderr) = kindlegen.communicate()
    os.chdir(cwd)

    # kindlegen returns > 0 on warnings as well as errors; parse its
    # output to decide what to relay
    if kindlegen.returncode > 0:
        # messages look like "Info(prcgen): ...", "Error(prcgen): ..."
        regex = re.compile(r'^(\w+)\(prcgen\):')

        # pylint: disable=E1103
        msg = stderr.rstrip()
        if msg:
            msg = msg.decode(sys.stderr.encoding)
            error(msg)
        msg = stdout.rstrip()
        msg = msg.decode(sys.stdout.encoding)
        for line in msg.splitlines():
            match = regex.match(line)
            if match:
                sline = regex.sub("", line)
                g = match.group(1).lower()
                if g == 'info':
                    if sline == 'MOBI File generated with WARNINGS!':
                        # we knew that already
                        continue
                    # info ("kindlegen: %s" % sline)
                elif g == 'warning':
                    # cover-size complaints are expected and harmless
                    if sline.startswith('Cover is too small'):
                        continue
                    if sline == 'Cover not specified':
                        continue
                    warning("kindlegen: %s" % sline)
                elif g == 'error':
                    error("kindlegen: %s" % sline)
                else:
                    error(line)

    info("Done Kindle file: %s" % os.path.join(job.outputdir, job.outputfile))
def build(self, job):
    """ Build kindle file from epub using amazon kindlegen or calibre.

    Languages kindlegen cannot handle are routed to the MOBILANG
    converter (calibre's ebook-convert); everything else uses
    MOBIGEN.  Diagnostics from the converter are relayed to our log
    with known noise filtered out.  Raises SkipOutputFormat if the
    converter cannot be started.
    """
    if job.dc.languages:
        if job.dc.languages[0].id in no_kindlegen_langs:
            mobimaker = options.config.MOBILANG
        else:
            mobimaker = options.config.MOBIGEN
    # NOTE(review): if job.dc.languages is empty, `mobimaker` is never
    # assigned and the next line raises NameError — confirm whether
    # languages is guaranteed non-empty upstream.
    if not mobimaker:
        info('no mobimaker available')
        return

    # kindlegen needs localized paths
    outputdir = os.path.abspath(job.outputdir)

    info("Creating Kindle file: %s" % os.path.join(outputdir, job.outputfile))
    info(" ... from: %s" % job.url)

    try:
        cwd = os.getcwd()
        os.chdir(outputdir)
        # calibre and kindlegen take different command lines
        if 'ebook-convert' in mobimaker:
            kindlegen = subprocess.Popen(
                [
                    mobimaker,
                    job.url,
                    os.path.basename(job.outputfile),
                    '--personal-doc="[EBOK]"',
                ],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
        else:
            kindlegen = subprocess.Popen(
                [
                    mobimaker,
                    '-o', os.path.basename(job.outputfile),
                    job.url
                ],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
    except OSError as what:
        os.chdir(cwd)
        error("KindleWriter: %s %s" % (mobimaker, what))
        raise SkipOutputFormat

    (stdout, stderr) = kindlegen.communicate()
    os.chdir(cwd)

    # returncode > 0 covers warnings as well as hard errors
    if kindlegen.returncode > 0:
        # messages look like "Info(prcgen): ...", "Error(prcgen): ..."
        regex = re.compile(r'^(\w+)\(prcgen\):')

        # pylint: disable=E1103
        msg = stderr.rstrip()
        if msg:
            msg = msg.decode(sys.stderr.encoding)
            error(msg)
        msg = stdout.rstrip()
        msg = msg.decode(sys.stdout.encoding)
        for line in msg.splitlines():
            match = regex.match(line)
            if match:
                sline = regex.sub("", line)
                g = match.group(1).lower()
                if g == 'info':
                    if sline == 'MOBI File generated with WARNINGS!':
                        # we knew that already
                        continue
                    # info("kindlegen: %s" % sline)
                elif g == 'warning':
                    # cover-size complaints are expected and harmless
                    if sline.startswith('Cover is too small'):
                        continue
                    if sline == 'Cover not specified':
                        continue
                    warning("kindlegen: %s" % sline)
                elif g == 'error':
                    error("kindlegen: %s" % sline)
                else:
                    error(line)

    info("Done Kindle file: %s" % os.path.join(outputdir, job.outputfile))
def main():
    """ Main program.

    Loads configuration and the parser/writer/packager factories,
    builds the job queue (either from a pickled queue on stdin or
    from the command-line options), runs each job, and finally runs
    the optional 'push' packager over the first job's output.
    Returns 0 on success, 1 on configuration error.
    """
    try:
        config()
    except configparser.Error as what:
        error("Error in configuration file: %s", str(what))
        return 1

    Logger.set_log_level(options.verbose)

    # expand requested types through their build dependencies
    options.types = options.types or ['all']
    options.types = CommonCode.add_dependencies(options.types, DEPENDENCIES, BUILD_ORDER)
    debug("Building types: %s" % ' '.join(options.types))

    start_time = datetime.datetime.now()

    ParserFactory.load_parsers()
    WriterFactory.load_writers()
    PackagerFactory.load_packagers()

    if options.is_job_queue:
        job_queue = cPickle.load(sys.stdin.buffer)  # read bytes
    else:
        options.dc = get_dc(
            options.url)  # this is when doc at url gets parsed!
        job_queue = []
        output_files = dict()
        for type_ in options.types:
            job = CommonCode.Job(type_)
            job.url = options.url
            job.ebook = options.ebook
            job.dc = options.dc
            job.outputdir = options.outputdir
            job.outputfile = options.outputfile or make_output_filename(
                type_, options.dc)
            output_files[type_] = job.outputfile

            # kindle jobs consume the epub built earlier in the queue
            absoutputdir = os.path.abspath(job.outputdir)
            if job.type == 'kindle.images':
                job.url = os.path.join(absoutputdir, output_files['epub.images'])
            elif job.type == 'kindle.noimages':
                job.url = os.path.join(absoutputdir, output_files['epub.noimages'])

            job_queue.append(job)

    for j in job_queue:
        # do_job reads these through the global options
        options.dc = j.dc
        options.outputdir = j.outputdir
        do_job(j)

    packager = PackagerFactory.create(options.packager, 'push')
    if packager:
        # HACK: the WWers ever only convert one ebook at a time
        job = job_queue[0]
        job.outputfile = '%d-final.zip' % (options.dc.project_gutenberg_id)
        packager.package(job)

    end_time = datetime.datetime.now()
    info(' Finished jobs. Total time: %s' % (end_time - start_time))
    return 0
def do_job(job):
    """ Do one job.

    Crawls the source document (when the job has a url), then builds
    the output with the appropriate writer, optionally validates and
    packages it.  All errors are logged, never propagated, so one
    failing job does not stop the queue.
    """
    log_handler = None
    Logger.ebook = job.ebook
    if job.logfile:
        log_handler = open_log(
            os.path.join(os.path.abspath(job.outputdir), job.logfile))

    debug('=== Building %s ===' % job.type)
    start_time = datetime.datetime.now()

    try:
        if job.url:
            # configure and run the spider over the source document
            spider = Spider.Spider()
            dirpath = os.path.dirname(job.url)  # platform native path
            spider.include_urls += (options.include_urls
                                    or [parsers.webify_url(dirpath) + '/*']
                                    )  # use for parser only

            spider.include_mediatypes += options.include_mediatypes
            if job.subtype == '.images' or job.type == 'rst.gen':
                spider.include_mediatypes.append('image/*')

            spider.exclude_urls += options.exclude_urls
            spider.exclude_mediatypes += options.exclude_mediatypes

            spider.max_depth = options.max_depth or six.MAXSIZE

            # rewrites are given as 'from_url>to_url' pairs
            for rewrite in options.rewrite:
                from_url, to_url = rewrite.split('>')
                spider.add_redirection(from_url, to_url)

            attribs = parsers.ParserAttributes()
            attribs.url = parsers.webify_url(job.url)
            attribs.id = 'start'

            if options.input_mediatype:
                attribs.orig_mediatype = attribs.HeaderElement.from_str(
                    options.input_mediatype)

            spider.recursive_parse(attribs)
            elect_coverpage(spider, job.url)
            job.url = spider.redirect(job.url)
            job.base_url = job.url
            job.spider = spider

        writer = WriterFactory.create(job.maintype)
        writer.build(job)

        if options.validate:
            writer.validate(job)

        packager = PackagerFactory.create(options.packager, job.type)
        if packager:
            packager.package(job)

        if job.type == 'html.images':
            # FIXME: hack for push packager
            options.html_images_list = list(job.spider.aux_file_iter())

    except SkipOutputFormat as what:
        warning("%s" % what)

    except Exception as what:
        exception("%s" % what)

    end_time = datetime.datetime.now()
    info(' %s made in %s' % (job.type, end_time - start_time))

    if log_handler:
        close_log(log_handler)
        log_handler = None
def build(self, job):
    """ Build PDF file.

    Renders the RST source to xetex, runs xelatex on it, relays
    errors and (in verbose mode) warnings from the xetex log, and
    cleans up the intermediate .tex/.log/.aux files unless running
    very verbosely.  Raises SkipOutputFormat if the input is not RST
    or xelatex cannot be started.
    """
    inputfilename = job.url
    outputfilename = os.path.join(os.path.abspath(job.outputdir), job.outputfile)

    debug("Inputfile: %s" % inputfilename)
    info("Creating PDF file: %s" % outputfilename)

    parser = ParserFactory.ParserFactory.create(inputfilename)
    if not hasattr(parser, 'rst2xetex'):
        warning('Skipping PDF Output because input mediatype is %s' % parser.mediatype())
        raise SkipOutputFormat

    # Brain-dead xetex doesn't understand unix pipes
    # so we have to write a temp file
    texfilename = os.path.splitext(outputfilename)[0] + '.tex'
    auxfilename = os.path.splitext(outputfilename)[0] + '.aux'
    logfilename = os.path.splitext(outputfilename)[0] + '.log'

    # remove a stale .aux from a previous run; missing is fine
    try:
        os.remove(auxfilename)
    except OSError:
        pass

    tex = parser.rst2xetex(job)
    with open(texfilename, 'wb') as fp:
        fp.write(tex)

    try:
        cwd = os.getcwd()
        os.chdir(os.path.abspath(job.outputdir))
        _xetex = subprocess.Popen([
            options.config.XELATEX,
            "-output-directory", job.outputdir,
            "-interaction", "nonstopmode",
            texfilename
        ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as what:
        os.chdir(cwd)
        error("PDFWriter: %s %s" % (options.config.XELATEX, what))
        raise SkipOutputFormat

    (dummy_stdout, dummy_stderr) = _xetex.communicate()

    # xelatex reports problems in its log file, not on stderr
    with open(logfilename, encoding='utf-8') as fp:
        for line in fp:
            line = line.strip()
            if 'Error:' in line:
                error("xetex: %s" % line)
            if options.verbose >= 1:
                if 'Warning:' in line:
                    warning("xetex: %s" % line)

    # keep the intermediate files only when debugging (-vv)
    if options.verbose < 2:
        try:
            os.remove(texfilename)
            os.remove(logfilename)
            os.remove(auxfilename)
        except OSError:
            pass

    os.chdir(cwd)

    info("Done PDF file: %s" % outputfilename)
def commit(self):
    """ Close the OCF container, flushing the epub to disk. """
    info("Done Epub file: %s" % self.zipfilename)
    self.close()