def extract_images(self, processed_records, output_dir): self.log.debug('Extracting images...') output_dir = os.path.abspath(os.path.join(output_dir, 'images')) if not os.path.exists(output_dir): os.makedirs(output_dir) image_index = 0 self.image_names = [] start = getattr(self.book_header, 'first_image_index', -1) if start > self.num_sections or start < 0: # BAEN PRC files have bad headers start = 0 for i in range(start, self.num_sections): if i in processed_records: continue processed_records.append(i) data = self.sections[i][0] image_index += 1 if data[:4] in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}: # This record is a known non image type, not need to try to # load the image continue path = os.path.join(output_dir, '%05d.jpg' % image_index) try: if what(None, data) not in {'jpg', 'jpeg', 'gif', 'png', 'bmp'}: continue save_cover_data_to(data, path, minify_to=(10000, 10000)) except Exception: continue self.image_names.append(os.path.basename(path))
def main(args=sys.argv): parser = option_parser() opts, args = parser.parse_args(args) buf = BytesIO() log = create_log(buf) abort = Event() patch_plugins() authors = [] if opts.authors: authors = string_to_authors(opts.authors) identifiers = {} if opts.isbn: identifiers['isbn'] = opts.isbn allowed_plugins = frozenset(opts.allowed_plugin) results = identify(log, abort, title=opts.title, authors=authors, identifiers=identifiers, timeout=int(opts.timeout), allowed_plugins=allowed_plugins or None) if not results: print(log, file=sys.stderr) prints('No results found', file=sys.stderr) raise SystemExit(1) result = results[0] cf = None if opts.cover and results: cover = download_cover(log, title=opts.title, authors=authors, identifiers=result.identifiers, timeout=int(opts.timeout)) if cover is None and not opts.opf: prints('No cover found', file=sys.stderr) else: save_cover_data_to(cover[-1], opts.cover) result.cover = cf = opts.cover log = buf.getvalue() result = (metadata_to_opf(result) if opts.opf else unicode(result).encode('utf-8')) if opts.verbose: print(log, file=sys.stderr) print(result) if not opts.opf and opts.cover: prints('Cover :', cf) return 0
def main(args=sys.argv): parser = option_parser() opts, args = parser.parse_args(args) buf = BytesIO() log = create_log(buf) abort = Event() patch_plugins() authors = [] if opts.authors: authors = string_to_authors(opts.authors) identifiers = {} for idspec in opts.identifier: k, v = idspec.partition(':')[::2] if not k or not v: raise SystemExit('Not a valid identifier: {}'.format(idspec)) identifiers[k] = v if opts.isbn: identifiers['isbn'] = opts.isbn allowed_plugins = frozenset(opts.allowed_plugin) results = identify(log, abort, title=opts.title, authors=authors, identifiers=identifiers, timeout=int(opts.timeout), allowed_plugins=allowed_plugins or None) if not results: print(log, file=sys.stderr) prints('No results found', file=sys.stderr) raise SystemExit(1) result = results[0] cf = None if opts.cover and results: cover = download_cover(log, title=opts.title, authors=authors, identifiers=result.identifiers, timeout=int(opts.timeout)) if cover is None and not opts.opf: prints('No cover found', file=sys.stderr) else: save_cover_data_to(cover[-1], opts.cover) result.cover = cf = opts.cover log = buf.getvalue() result = (metadata_to_opf(result) if opts.opf else unicode_type(result).encode('utf-8')) if opts.verbose: print(log, file=sys.stderr) print(result) if not opts.opf and opts.cover: prints('Cover :', cf) return 0
def main(args=sys.argv): parser = option_parser() opts, args = parser.parse_args(args) buf = BytesIO() log = create_log(buf) abort = Event() patch_plugins() authors = [] if opts.authors: authors = string_to_authors(opts.authors) identifiers = {} for idspec in opts.identifier: k, v = idspec.partition(':')[::2] if not k or not v: raise SystemExit('Not a valid identifier: {}'.format(idspec)) identifiers[k] = v if opts.isbn: identifiers['isbn'] = opts.isbn allowed_plugins = frozenset(opts.allowed_plugin) results = identify(log, abort, title=opts.title, authors=authors, identifiers=identifiers, timeout=int(opts.timeout), allowed_plugins=allowed_plugins or None) if not results: prints(buf.getvalue(), file=sys.stderr) prints('No results found', file=sys.stderr) raise SystemExit(1) result = results[0] cf = None if opts.cover and results: cover = download_cover(log, title=opts.title, authors=authors, identifiers=result.identifiers, timeout=int(opts.timeout)) if cover is None: if not opts.opf: prints('No cover found', file=sys.stderr) else: save_cover_data_to(cover[-1], opts.cover) result.cover = cf = opts.cover if opts.verbose: prints(buf.getvalue(), file=sys.stderr) if opts.opf: getattr(sys.stdout, 'buffer', sys.stdout).write(metadata_to_opf(result)) print() else: prints(str(result)) if not opts.opf and opts.cover: prints('Cover :', cf) return 0
def extract_images(self, processed_records, output_dir): self.log.debug('Extracting images...') output_dir = os.path.abspath(os.path.join(output_dir, 'images')) if not os.path.exists(output_dir): os.makedirs(output_dir) image_index = 0 self.image_names = [] image_name_map = {} start = getattr(self.book_header, 'first_image_index', -1) if start > self.num_sections or start < 0: # BAEN PRC files have bad headers start = 0 for i in range(start, self.num_sections): if i in processed_records: continue processed_records.append(i) data = self.sections[i][0] image_index += 1 if data[:4] in { b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE' }: # This record is a known non image type, no need to try to # load the image continue try: imgfmt = what(None, data) except Exception: continue if imgfmt not in {'jpg', 'jpeg', 'gif', 'png', 'bmp'}: continue if imgfmt == 'jpeg': imgfmt = 'jpg' if imgfmt == 'gif': try: data = gif_data_to_png_data(data) imgfmt = 'png' except AnimatedGIF: pass path = os.path.join(output_dir, '%05d.%s' % (image_index, imgfmt)) image_name_map[image_index] = os.path.basename(path) if imgfmt == 'png': with open(path, 'wb') as f: f.write(data) else: try: save_cover_data_to(data, path, minify_to=(10000, 10000)) except Exception: continue self.image_names.append(os.path.basename(path)) return image_name_map
def main(args=sys.argv): parser = option_parser() opts, args = parser.parse_args(args) buf = BytesIO() log = create_log(buf) abort = Event() authors = [] if opts.authors: authors = string_to_authors(opts.authors) identifiers = {} if opts.isbn: identifiers['isbn'] = opts.isbn results = identify(log, abort, title=opts.title, authors=authors, identifiers=identifiers, timeout=int(opts.timeout)) if not results: print (log, file=sys.stderr) prints('No results found', file=sys.stderr) raise SystemExit(1) result = results[0] cf = None if opts.cover and results: cover = download_cover(log, title=opts.title, authors=authors, identifiers=result.identifiers, timeout=int(opts.timeout)) if cover is None and not opts.opf: prints('No cover found', file=sys.stderr) else: save_cover_data_to(cover[-1], opts.cover) result.cover = cf = opts.cover log = buf.getvalue() result = (metadata_to_opf(result) if opts.opf else unicode(result).encode('utf-8')) if opts.verbose: print (log, file=sys.stderr) print (result) if not opts.opf and opts.cover: prints('Cover :', cf) return 0
def fb2mlize_images(self): ''' This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function. ''' from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES images = [] for item in self.oeb_book.manifest: # Don't write the image if it's not referenced in the document's text. if item.href not in self.image_hrefs: continue if item.media_type in OEB_RASTER_IMAGES: try: if item.media_type != 'image/jpeg': imdata = save_cover_data_to(item.data, compression_quality=70) raw_data = b64encode(imdata) else: raw_data = b64encode(item.data) # Don't put the encoded image on a single line. data = '' col = 1 for char in raw_data: if col == 72: data += '\n' col = 1 col += 1 data += char images.append('<binary id="%s" content-type="image/jpeg">%s\n</binary>' % (self.image_hrefs[item.href], data)) except Exception as e: self.log.error('Error: Could not include file %s because ' '%s.' % (item.href, e)) return ''.join(images)
def fb2mlize_images(self): ''' This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function. ''' from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES images = [] for item in self.oeb_book.manifest: # Don't write the image if it's not referenced in the document's text. if item.href not in self.image_hrefs: continue if item.media_type in OEB_RASTER_IMAGES: try: if item.media_type != 'image/jpeg': imdata = save_cover_data_to(item.data, compression_quality=70) raw_data = b64encode(imdata) else: raw_data = b64encode(item.data) # Don't put the encoded image on a single line. data = '' col = 1 for char in raw_data: if col == 72: data += '\n' col = 1 col += 1 data += char images.append( '<binary id="%s" content-type="image/jpeg">%s\n</binary>' % (self.image_hrefs[item.href], data)) except Exception as e: self.log.error('Error: Could not include file %s because ' '%s.' % (item.href, e)) return ''.join(images)
def get_metadata(stream): from calibre.ebooks.metadata import MetaInformation from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.mobi.reader.headers import MetadataHeader from calibre.ebooks.mobi.reader.mobi6 import MobiReader from calibre.utils.img import save_cover_data_to from calibre import CurrentDir stream.seek(0) try: raw = stream.read(3) except Exception: raw = b'' stream.seek(0) if raw == b'TPZ': from calibre.ebooks.metadata.topaz import get_metadata return get_metadata(stream) from calibre.utils.logging import Log log = Log() try: mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')]) except: mi = MetaInformation(_('Unknown'), [_('Unknown')]) mh = MetadataHeader(stream, log) if mh.title and mh.title != _('Unknown'): mi.title = mh.title if mh.exth is not None: if mh.exth.mi is not None: mi = mh.exth.mi else: size = 1024**3 if hasattr(stream, 'seek') and hasattr(stream, 'tell'): pos = stream.tell() stream.seek(0, 2) size = stream.tell() stream.seek(pos) if size < 4 * 1024 * 1024: with TemporaryDirectory('_mobi_meta_reader') as tdir: with CurrentDir(tdir): mr = MobiReader(stream, log) parse_cache = {} mr.extract_content(tdir, parse_cache) if mr.embedded_mi is not None: mi = mr.embedded_mi if hasattr(mh.exth, 'cover_offset'): cover_index = mh.first_image_index + mh.exth.cover_offset data = mh.section_data(int(cover_index)) else: try: data = mh.section_data(mh.first_image_index) except Exception: data = b'' if data and what(None, data) in {'jpg', 'jpeg', 'gif', 'png', 'bmp', 'webp'}: try: mi.cover_data = ('jpg', save_cover_data_to(data)) except Exception: log.exception('Failed to read MOBI cover') return mi
def fb2mlize_images(self): ''' This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function. ''' from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES images = [] for item in self.oeb_book.manifest: # Don't write the image if it's not referenced in the document's text. if item.href not in self.image_hrefs: continue if item.media_type in OEB_RASTER_IMAGES: try: if item.media_type not in ('image/jpeg', 'image/png'): imdata = save_cover_data_to(item.data, compression_quality=70) raw_data = as_base64_unicode(imdata) content_type = 'image/jpeg' else: raw_data = as_base64_unicode(item.data) content_type = item.media_type # Don't put the encoded image on a single line. step = 72 data = '\n'.join(raw_data[i:i + step] for i in range(0, len(raw_data), step)) images.append( '<binary id="%s" content-type="%s">%s</binary>' % (self.image_hrefs[item.href], content_type, data)) except Exception as e: self.log.error('Error: Could not include file %s because ' '%s.' % (item.href, e)) return '\n'.join(images)
def get_cover(self, id, thumbnail=False, thumb_width=60, thumb_height=80): try: cherrypy.response.headers['Content-Type'] = 'image/jpeg' cherrypy.response.timeout = 3600 cover = self.db.cover(id, index_is_id=True) if cover is None: cover = self.default_cover updated = self.build_time else: updated = self.db.cover_last_modified(id, index_is_id=True) cherrypy.response.headers['Last-Modified'] = self.last_modified(updated) if thumbnail: quality = tweaks['content_server_thumbnail_compression_quality'] if quality < 50: quality = 50 elif quality > 99: quality = 99 return scale_image(cover, thumb_width, thumb_height, compression_quality=quality)[-1] return save_cover_data_to(cover, None, minify_to=(self.max_cover_width, self.max_cover_height)) except Exception as err: import traceback cherrypy.log.error('Failed to generate cover:') cherrypy.log.error(traceback.print_exc()) raise cherrypy.HTTPError(404, 'Failed to generate cover: %r'%err)
def get_cover(self, id, thumbnail=False, thumb_width=60, thumb_height=80): try: cherrypy.response.headers['Content-Type'] = 'image/jpeg' cherrypy.response.timeout = 3600 cover = self.db.cover(id, index_is_id=True) if cover is None: cover = self.default_cover updated = self.build_time else: updated = self.db.cover_last_modified(id, index_is_id=True) cherrypy.response.headers['Last-Modified'] = self.last_modified( updated) if thumbnail: quality = tweaks[ 'content_server_thumbnail_compression_quality'] if quality < 50: quality = 50 elif quality > 99: quality = 99 return scale_image(cover, thumb_width, thumb_height, compression_quality=quality)[-1] return save_cover_data_to(cover, None, minify_to=(self.max_cover_width, self.max_cover_height)) except Exception as err: import traceback cherrypy.log.error('Failed to generate cover:') cherrypy.log.error(traceback.print_exc()) raise cherrypy.HTTPError(404, 'Failed to generate cover: %r' % err)
def get_metadata(stream): from calibre.ebooks.metadata import MetaInformation from calibre.ptempfile import TemporaryDirectory from calibre.ebooks.mobi.reader.headers import MetadataHeader from calibre.ebooks.mobi.reader.mobi6 import MobiReader from calibre.utils.img import save_cover_data_to from calibre import CurrentDir stream.seek(0) try: raw = stream.read(3) except: raw = '' stream.seek(0) if raw == b'TPZ': from calibre.ebooks.metadata.topaz import get_metadata return get_metadata(stream) from calibre.utils.logging import Log log = Log() try: mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')]) except: mi = MetaInformation(_('Unknown'), [_('Unknown')]) mh = MetadataHeader(stream, log) if mh.title and mh.title != _('Unknown'): mi.title = mh.title if mh.exth is not None: if mh.exth.mi is not None: mi = mh.exth.mi else: size = 1024**3 if hasattr(stream, 'seek') and hasattr(stream, 'tell'): pos = stream.tell() stream.seek(0, 2) size = stream.tell() stream.seek(pos) if size < 4*1024*1024: with TemporaryDirectory('_mobi_meta_reader') as tdir: with CurrentDir(tdir): mr = MobiReader(stream, log) parse_cache = {} mr.extract_content(tdir, parse_cache) if mr.embedded_mi is not None: mi = mr.embedded_mi if hasattr(mh.exth, 'cover_offset'): cover_index = mh.first_image_index + mh.exth.cover_offset data = mh.section_data(int(cover_index)) else: try: data = mh.section_data(mh.first_image_index) except: data = '' if data and what(None, data) in {'jpg', 'jpeg', 'gif', 'png', 'bmp', 'webp'}: try: mi.cover_data = ('jpg', save_cover_data_to(data)) except Exception: log.exception('Failed to read MOBI cover') return mi
def image_to_hexstring(self, data): # Images must be hex-encoded in 128 character lines data = save_cover_data_to(data) width, height = identify(data)[1:] lines = [] v = memoryview(data) for i in range(0, len(data), 64): lines.append(hexlify(v[i:i + 64])) hex_string = b'\n'.join(lines).decode('ascii') return hex_string, width, height
def process_result(log, result): plugin, data = result try: if getattr(plugin, 'auto_trim_covers', False): img = image_from_data(data) nimg = remove_borders_from_image(img) if nimg is not img: data = image_to_data(nimg) fmt, width, height = identify(data) if width < 0 or height < 0: raise ValueError('Could not read cover image dimensions') if width < 50 or height < 50: raise ValueError('Image too small') data = save_cover_data_to(data) except Exception: log.exception('Invalid cover from', plugin.name) return None return (plugin, width, height, fmt, data)
def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None): ''' Convert image setting all transparent pixels to white and changing format to JPEG. Ensure the resultant image has a byte size less than maxsizeb. If dimen is not None, generate a thumbnail of width=dimen, height=dimen or width, height = dimen (depending on the type of dimen) Returns the image as a bytestring ''' if dimen is not None: if hasattr(dimen, '__len__'): width, height = dimen else: width = height = dimen data = scale_image(data, width=width, height=height, compression_quality=90)[-1] else: # Replace transparent pixels with white pixels and convert to JPEG data = save_cover_data_to(data) if len(data) <= maxsizeb: return data orig_data = data # save it in case compression fails quality = 90 while len(data) > maxsizeb and quality >= 5: data = image_to_data(image_from_data(orig_data), compression_quality=quality) quality -= 5 if len(data) <= maxsizeb: return data orig_data = data scale = 0.9 while len(data) > maxsizeb and scale >= 0.05: img = image_from_data(data) w, h = img.width(), img.height() img = resize_image(img, int(scale * w), int(scale * h)) data = image_to_data(img, compression_quality=quality) scale -= 0.05 return data
def image_to_hexstring(self, data): data = save_cover_data_to(data) width, height = identify(data)[1:] raw_hex = '' for char in data: raw_hex += hex(ord(char)).replace('0x', '').rjust(2, '0') # Images must be broken up so that they are no longer than 129 chars # per line hex_string = '' col = 1 for char in raw_hex: if col == 129: hex_string += '\n' col = 1 col += 1 hex_string += char return (hex_string, width, height)
def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None): ''' Convert image setting all transparent pixels to white and changing format to JPEG. Ensure the resultant image has a byte size less than maxsizeb. If dimen is not None, generate a thumbnail of width=dimen, height=dimen or width, height = dimen (depending on the type of dimen) Returns the image as a bytestring ''' if dimen is not None: if hasattr(dimen, '__len__'): width, height = dimen else: width = height = dimen data = scale_image(data, width=width, height=height, compression_quality=90)[-1] else: # Replace transparent pixels with white pixels and convert to JPEG data = save_cover_data_to(data) if len(data) <= maxsizeb: return data orig_data = data # save it in case compression fails quality = 90 while len(data) > maxsizeb and quality >= 5: data = image_to_data(image_from_data(orig_data), compression_quality=quality) quality -= 5 if len(data) <= maxsizeb: return data orig_data = data scale = 0.9 while len(data) > maxsizeb and scale >= 0.05: img = image_from_data(data) w, h = img.width(), img.height() img = resize_image(img, int(scale*w), int(scale*h)) data = image_to_data(img, compression_quality=quality) scale -= 0.05 return data
def convert(self, oeb_book, output_path, input_plugin, opts, log): from lxml import etree from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf from calibre.utils.zipfile import ZipFile from calibre.utils.filenames import ascii_filename # HTML if opts.htmlz_css_type == 'inline': from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer OEB2HTMLizer = OEB2HTMLInlineCSSizer elif opts.htmlz_css_type == 'tag': from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer OEB2HTMLizer = OEB2HTMLNoCSSizer else: from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer with TemporaryDirectory(u'_htmlz_output') as tdir: htmlizer = OEB2HTMLizer(log) html = htmlizer.oeb2html(oeb_book, opts) fname = u'index' if opts.htmlz_title_filename: from calibre.utils.filenames import shorten_components_to fname = shorten_components_to(100, (ascii_filename( unicode_type(oeb_book.metadata.title[0])), ))[0] with open(os.path.join(tdir, fname + u'.html'), 'wb') as tf: if isinstance(html, unicode_type): html = html.encode('utf-8') tf.write(html) # CSS if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external': with open(os.path.join(tdir, u'style.css'), 'wb') as tf: tf.write(htmlizer.get_css(oeb_book)) # Images images = htmlizer.images if images: if not os.path.exists(os.path.join(tdir, u'images')): os.makedirs(os.path.join(tdir, u'images')) for item in oeb_book.manifest: if item.media_type in OEB_IMAGES and item.href in images: if item.media_type == SVG_MIME: data = unicode_type( etree.tostring(item.data, encoding=unicode_type)) else: data = item.data fname = os.path.join(tdir, u'images', images[item.href]) with open(fname, 'wb') as img: img.write(data) # Cover cover_path = None try: cover_data = None if oeb_book.metadata.cover: term = oeb_book.metadata.cover[0].term cover_data = oeb_book.guide[term].item.data if cover_data: from calibre.utils.img import save_cover_data_to cover_path = os.path.join(tdir, u'cover.jpg') with lopen(cover_path, 'w') as cf: cf.write('') save_cover_data_to(cover_data, cover_path) except: import traceback traceback.print_exc() # Metadata with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf: opf = OPF( io.BytesIO( etree.tostring(oeb_book.metadata.to_opf1(), encoding='UTF-8'))) mi = opf.to_book_metadata() if cover_path: mi.cover = u'cover.jpg' mdataf.write(metadata_to_opf(mi)) htmlz = ZipFile(output_path, 'w') htmlz.add_dir(tdir)
def serialize_cover_data(new_cdata, cpath): from calibre.utils.img import save_cover_data_to return save_cover_data_to(new_cdata, data_fmt=os.path.splitext(cpath)[1][1:])
def _encode_into_jpeg(data): data = save_cover_data_to(data) return as_base64_unicode(data)
def _encode_into_jpeg(data): data = save_cover_data_to(data) return b64encode(data)
def _write_new_cover(new_cdata, cpath): from calibre.utils.img import save_cover_data_to new_cover = PersistentTemporaryFile(suffix=os.path.splitext(cpath)[1]) new_cover.close() save_cover_data_to(new_cdata, new_cover.name) return new_cover
def extract_content(self, output_dir): # Each text record is independent (unless the continuation # value is set in the previous record). Put each converted # text recored into a separate file. We will reference the # home.html file as the first file and let the HTML input # plugin assemble the order based on hyperlinks. with CurrentDir(output_dir): for uid, num in self.uid_text_secion_number.items(): self.log.debug('Writing record with uid: %s as %s.html' % (uid, uid)) with open('%s.html' % uid, 'wb') as htmlf: html = u'<html><body>' section_header, section_data = self.sections[num] if section_header.type == DATATYPE_PHTML: html += self.process_phtml(section_data.data, section_data.header.paragraph_offsets) elif section_header.type == DATATYPE_PHTML_COMPRESSED: d = self.decompress_phtml(section_data.data) html += self.process_phtml(d, section_data.header.paragraph_offsets).decode(self.get_text_uid_encoding(section_header.uid), 'replace') html += '</body></html>' htmlf.write(html.encode('utf-8')) # Images. # Cache the image sizes in case they are used by a composite image. images = set() if not os.path.exists(os.path.join(output_dir, 'images/')): os.makedirs(os.path.join(output_dir, 'images/')) with CurrentDir(os.path.join(output_dir, 'images/')): # Single images. for uid, num in self.uid_image_section_number.items(): section_header, section_data = self.sections[num] if section_data: idata = None if section_header.type == DATATYPE_TBMP: idata = section_data elif section_header.type == DATATYPE_TBMP_COMPRESSED: if self.header_record.compression == 1: idata = decompress_doc(section_data) elif self.header_record.compression == 2: idata = zlib.decompress(section_data) try: save_cover_data_to(idata, '%s.jpg' % uid, compression_quality=70) images.add(uid) self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid)) except Exception as e: self.log.error('Failed to write image with uid %s: %s' % (uid, e)) else: self.log.error('Failed to write image with uid %s: No data.' % uid) # Composite images. # We're going to use the already compressed .jpg images here. for uid, num in self.uid_composite_image_section_number.items(): try: section_header, section_data = self.sections[num] # Get the final width and height. width = 0 height = 0 for row in section_data.layout: row_width = 0 col_height = 0 for col in row: if col not in images: raise Exception('Image with uid: %s missing.' % col) w, h = identify(lopen('%s.jpg' % col, 'rb'))[1:] row_width += w if col_height < h: col_height = h if width < row_width: width = row_width height += col_height # Create a new image the total size of all image # parts. Put the parts into the new image. with Canvas(width, height) as canvas: y_off = 0 for row in section_data.layout: x_off = 0 largest_height = 0 for col in row: im = image_from_data(lopen('%s.jpg' % col, 'rb').read()) canvas.compose(im, x_off, y_off) w, h = im.width(), im.height() x_off += w if largest_height < h: largest_height = h y_off += largest_height with lopen('%s.jpg' % uid) as out: out.write(canvas.export(compression_quality=70)) self.log.debug('Wrote composite image with uid %s to images/%s.jpg' % (uid, uid)) except Exception as e: self.log.error('Failed to write composite image with uid %s: %s' % (uid, e)) # Run the HTML through the html processing plugin. from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') for opt in html_input.options: setattr(self.options, opt.option.name, opt.recommended_value) self.options.input_encoding = 'utf-8' odi = self.options.debug_pipeline self.options.debug_pipeline = None # Determine the home.html record uid. This should be set in the # reserved values in the metadata recored. home.html is the first # text record (should have hyper link references to other records) # in the document. try: home_html = self.header_record.home_html if not home_html: home_html = self.uid_text_secion_number.items()[0][0] except: raise Exception('Could not determine home.html') # Generate oeb from html conversion. oeb = html_input.convert(open('%s.html' % home_html, 'rb'), self.options, 'html', self.log, {}) self.options.debug_pipeline = odi return oeb
def convert(self, oeb_book, output_path, input_plugin, opts, log): from lxml import etree from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf from calibre.utils.zipfile import ZipFile from calibre.utils.filenames import ascii_filename # HTML if opts.htmlz_css_type == 'inline': from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer OEB2HTMLizer = OEB2HTMLInlineCSSizer elif opts.htmlz_css_type == 'tag': from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer OEB2HTMLizer = OEB2HTMLNoCSSizer else: from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer with TemporaryDirectory(u'_htmlz_output') as tdir: htmlizer = OEB2HTMLizer(log) html = htmlizer.oeb2html(oeb_book, opts) fname = u'index' if opts.htmlz_title_filename: from calibre.utils.filenames import shorten_components_to fname = shorten_components_to(100, (ascii_filename(unicode_type(oeb_book.metadata.title[0])),))[0] with open(os.path.join(tdir, fname+u'.html'), 'wb') as tf: if isinstance(html, unicode_type): html = html.encode('utf-8') tf.write(html) # CSS if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external': with open(os.path.join(tdir, u'style.css'), 'wb') as tf: tf.write(htmlizer.get_css(oeb_book)) # Images images = htmlizer.images if images: if not os.path.exists(os.path.join(tdir, u'images')): os.makedirs(os.path.join(tdir, u'images')) for item in oeb_book.manifest: if item.media_type in OEB_IMAGES and item.href in images: if item.media_type == SVG_MIME: data = etree.tostring(item.data, encoding='unicode') else: data = item.data fname = os.path.join(tdir, u'images', images[item.href]) with open(fname, 'wb') as img: img.write(data) # Cover cover_path = None try: cover_data = None if oeb_book.metadata.cover: term = oeb_book.metadata.cover[0].term cover_data = oeb_book.guide[term].item.data if cover_data: from calibre.utils.img import save_cover_data_to cover_path = os.path.join(tdir, u'cover.jpg') with lopen(cover_path, 'w') as cf: cf.write('') save_cover_data_to(cover_data, cover_path) except: import traceback traceback.print_exc() # Metadata with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf: opf = OPF(io.BytesIO(etree.tostring(oeb_book.metadata.to_opf1(), encoding='UTF-8'))) mi = opf.to_book_metadata() if cover_path: mi.cover = u'cover.jpg' mdataf.write(metadata_to_opf(mi)) htmlz = ZipFile(output_path, 'w') htmlz.add_dir(tdir)
def extract_content(self, output_dir): # Each text record is independent (unless the continuation # value is set in the previous record). Put each converted # text recored into a separate file. We will reference the # home.html file as the first file and let the HTML input # plugin assemble the order based on hyperlinks. with CurrentDir(output_dir): for uid, num in self.uid_text_secion_number.items(): self.log.debug('Writing record with uid: %s as %s.html' % (uid, uid)) with open('%s.html' % uid, 'wb') as htmlf: html = u'<html><body>' section_header, section_data = self.sections[num] if section_header.type == DATATYPE_PHTML: html += self.process_phtml( section_data.data, section_data.header.paragraph_offsets) elif section_header.type == DATATYPE_PHTML_COMPRESSED: d = self.decompress_phtml(section_data.data) html += self.process_phtml( d, section_data.header.paragraph_offsets).decode( self.get_text_uid_encoding(section_header.uid), 'replace') html += '</body></html>' htmlf.write(html.encode('utf-8')) # Images. # Cache the image sizes in case they are used by a composite image. images = set() if not os.path.exists(os.path.join(output_dir, 'images/')): os.makedirs(os.path.join(output_dir, 'images/')) with CurrentDir(os.path.join(output_dir, 'images/')): # Single images. for uid, num in self.uid_image_section_number.items(): section_header, section_data = self.sections[num] if section_data: idata = None if section_header.type == DATATYPE_TBMP: idata = section_data elif section_header.type == DATATYPE_TBMP_COMPRESSED: if self.header_record.compression == 1: idata = decompress_doc(section_data) elif self.header_record.compression == 2: idata = zlib.decompress(section_data) try: save_cover_data_to(idata, '%s.jpg' % uid, compression_quality=70) images.add(uid) self.log.debug( 'Wrote image with uid %s to images/%s.jpg' % (uid, uid)) except Exception as e: self.log.error( 'Failed to write image with uid %s: %s' % (uid, e)) else: self.log.error( 'Failed to write image with uid %s: No data.' % uid) # Composite images. # We're going to use the already compressed .jpg images here. for uid, num in self.uid_composite_image_section_number.items(): try: section_header, section_data = self.sections[num] # Get the final width and height. width = 0 height = 0 for row in section_data.layout: row_width = 0 col_height = 0 for col in row: if col not in images: raise Exception('Image with uid: %s missing.' % col) w, h = identify(lopen('%s.jpg' % col, 'rb'))[1:] row_width += w if col_height < h: col_height = h if width < row_width: width = row_width height += col_height # Create a new image the total size of all image # parts. Put the parts into the new image. with Canvas(width, height) as canvas: y_off = 0 for row in section_data.layout: x_off = 0 largest_height = 0 for col in row: im = image_from_data( lopen('%s.jpg' % col, 'rb').read()) canvas.compose(im, x_off, y_off) w, h = im.width(), im.height() x_off += w if largest_height < h: largest_height = h y_off += largest_height with lopen('%s.jpg' % uid) as out: out.write(canvas.export(compression_quality=70)) self.log.debug( 'Wrote composite image with uid %s to images/%s.jpg' % (uid, uid)) except Exception as e: self.log.error( 'Failed to write composite image with uid %s: %s' % (uid, e)) # Run the HTML through the html processing plugin. from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') for opt in html_input.options: setattr(self.options, opt.option.name, opt.recommended_value) self.options.input_encoding = 'utf-8' odi = self.options.debug_pipeline self.options.debug_pipeline = None # Determine the home.html record uid. This should be set in the # reserved values in the metadata recored. home.html is the first # text record (should have hyper link references to other records) # in the document. try: home_html = self.header_record.home_html if not home_html: home_html = self.uid_text_secion_number.items()[0][0] except: raise Exception('Could not determine home.html') # Generate oeb from html conversion. oeb = html_input.convert(open('%s.html' % home_html, 'rb'), self.options, 'html', self.log, {}) self.options.debug_pipeline = odi return oeb