def read_image_data(self, fname, base=None):
    """Read the bytes of an image and derive a file name for it.

    ``fname`` is either a ``file://`` reference to a file on disk or a name
    that is looked up with ``self.docx.read()``.

    :param fname: image reference, optionally prefixed with ``file://``.
    :param base: preferred file name; when falsy it is derived from the last
        ``/``-separated component of ``fname`` via ``image_filename()``.
    :return: ``(raw, name)`` where ``raw`` is the image data and ``name`` is
        ``<stem>.<ext>``.
    :raises LinkedImageNotFound: if the ``file://`` target is empty or does
        not exist, or if ``self.docx.read()`` raises ``KeyError``.
    """
    if fname.startswith('file://'):
        src = fname[len('file://'):]
        if not src or not os.path.exists(src):
            raise LinkedImageNotFound(src)
        with open(src, 'rb') as rawsrc:
            raw = rawsrc.read()
    else:
        try:
            raw = self.docx.read(fname)
        except KeyError as err:
            raise LinkedImageNotFound(fname) from err

    base = base or image_filename(fname.rpartition('/')[-1]) or 'image'
    stem, dot, suffix = base.rpartition('.')
    if not dot:
        # rpartition() returns ('', '', base) when there is no dot; without
        # this guard the whole name would be mistaken for the extension
        # (e.g. 'logo' used to become 'image.logo').
        stem, suffix = base, ''
    # Prefer the sniffed format, then the name's own suffix, then 'jpeg'.
    ext = what(None, raw) or suffix or 'jpeg'

    if ext == 'emf':
        # For an example, see: https://bugs.launchpad.net/bugs/1224849
        self.log.info('Found an EMF image: %s, trying to extract '
                      'embedded raster image', fname)
        from ebook_converter.utils.wmf.emf import emf_unwrap
        try:
            raw = emf_unwrap(raw)
        except Exception:
            # Best effort: keep the EMF data and the 'emf' extension.
            self.log.exception('Failed to extract embedded raster image '
                               'from EMF')
        else:
            ext = 'png'

    return raw, (stem or 'image') + '.' + ext
def __call__(self, oeb, opts):
    """Replace ``data:image/...`` URIs on ``<img>`` elements with hrefs.

    Every spine item whose parsed data exposes ``xpath`` is scanned for
    ``img`` elements with a ``src`` attribute. Image data URIs are decoded,
    their format is sniffed with ``what()``, and the ``src`` attribute is
    rewritten to the item-relative form of whatever
    ``self.convert_image_data_uri()`` returns. URIs that cannot be decoded
    or whose format is not recognised are logged and left untouched.

    :param oeb: the book being processed; its ``log`` becomes ``self.log``.
    :param opts: conversion options (not used here).
    """
    from ebook_converter.utils.imghdr import what
    self.log = oeb.log
    attr_path = XPath('//h:img[@src]')
    for item in oeb.spine:
        root = item.data
        if not hasattr(root, 'xpath'):
            continue
        for img in attr_path(root):
            raw = img.get('src', '')
            if not raw.startswith('data:'):
                continue
            # "data:<mediatype>[;base64],<payload>"
            header, data = raw.partition(',')[0::2]
            if not header.startswith('data:image/') or not data:
                continue
            if ';base64' in header:
                data = re.sub(r'\s+', '', data)
                try:
                    data = from_base64_bytes(data)
                except Exception:
                    self.log.error('Found invalid base64 encoded data '
                                   'URI, ignoring it')
                    continue
            else:
                # Percent-decoding straight to bytes keeps binary payloads
                # intact; unquote() would decode them as text and substitute
                # replacement characters for bytes that are not valid UTF-8.
                data = urllib.parse.unquote_to_bytes(data)
            data = as_bytes(data)
            fmt = what(None, data)
            if not fmt:
                self.log.warn('Image encoded as data URL has unknown '
                              'format, ignoring')
                continue
            img.set(
                'src',
                item.relhref(self.convert_image_data_uri(data, fmt, oeb)))
def return_raster_image(path):
    """Return the contents of ``path`` if it is a recognised non-SVG image.

    ``None`` is returned when the file is not readable, when ``what()``
    cannot identify the data, or when the data is identified as ``'svg'``.
    """
    from ebook_converter.utils.imghdr import what
    if not os.access(path, os.R_OK):
        return None
    with open(path, 'rb') as stream:
        payload = stream.read()
    detected = what(None, payload)
    if detected is None or detected == 'svg':
        return None
    return payload
def extract_images(self, picts):
    """Write every ``{\\pict ...}`` group found in a file out as an image.

    The file at ``picts`` is read as bytes. For each group, everything that
    is not a hexadecimal digit is stripped, the remainder is hex-decoded and
    written to ``NNNN.<fmt>`` (relative to the current directory), where
    ``fmt`` is the sniffed format or ``'wmf'`` when sniffing fails.

    :param picts: path of the file to scan.
    :return: the result of ``self.convert_images()`` called with a mapping
        of 1-based image number to the written file name.
    """
    from ebook_converter.utils.imghdr import what
    from binascii import unhexlify
    self.log('Extracting images...')
    with open(picts, 'rb') as stream:
        contents = stream.read()

    non_hex = re.compile(br'[^a-fA-F0-9]')
    hex_blobs = [
        non_hex.sub(b'', group)
        for group in re.findall(br'\{\\pict([^}]+)\}', contents)
        if len(group)
    ]

    names_by_number = {}
    for number, hex_blob in enumerate(hex_blobs, start=1):
        if len(hex_blob) % 2:
            # unhexlify() needs an even number of digits
            hex_blob = hex_blob[:-1]
        payload = unhexlify(hex_blob)
        detected = what(None, payload)
        suffix = 'wmf' if detected is None else detected
        file_name = '%04d.%s' % (number, suffix)
        with open(file_name, 'wb') as out:
            out.write(payload)
        names_by_number[number] = file_name
    return self.convert_images(names_by_number)
def resource_adder(self, link_, base=None):
    """Add the local file a link points to into the manifest.

    :param link_: the link as it appears in the document.
    :param base: directory that relative links are resolved against.
    :return: the manifest href for the resource (with ``#fragment`` appended
        when the link had one); ``link_`` unchanged when the link is not a
        local path, cannot be resolved, is not readable or is a directory;
        ``None`` when the target is a ``text/plain`` file.
    """
    link, frag = self.link_to_local_path(link_, base=base)
    if link is None:
        return link_
    try:
        if base and not os.path.isabs(link):
            link = os.path.join(base, link)
        link = os.path.abspath(link)
    except Exception:
        # An unusable path is not fatal; leave the link as it was.
        return link_
    if not os.access(link, os.R_OK):
        return link_
    if os.path.isdir(link):
        # The message is the format string and the link is its argument,
        # matching the other log calls in this method.
        self.log.warning('%s is a link to a directory. Ignoring.', link_)
        return link_

    if link not in self.added_resources:
        bhref = os.path.basename(link)
        item_id, href = self.oeb.manifest.generate(
            id='added', href=sanitize_file_name(bhref))
        guessed = mimetypes.guess_type(href)[0]
        media_type = guessed or self.BINARY_MIME
        if media_type == 'text/plain':
            self.log.warning('Ignoring link to text file %r', link_)
            return None
        if media_type == self.BINARY_MIME:
            # Check for the common case, images
            try:
                img = what(link)
            except EnvironmentError:
                pass
            else:
                if img:
                    media_type = mimetypes.guess_type(
                        'dummy.' + img)[0] or self.BINARY_MIME

        self.oeb.log.debug('Added %s', link)
        self.oeb.container = self.DirContainer(os.path.dirname(link),
                                               self.oeb.log,
                                               ignore_opf=True)
        # Load into memory
        item = self.oeb.manifest.add(item_id, href, media_type)
        # bhref refers to an already existing file. The read() method of
        # DirContainer will call unquote on it before trying to read the
        # file, therefore we quote it here.
        # XXX(gryf): why the heck it was changed to bytes?
        item.html_input_href = urllib.parse.quote(bhref)
        if guessed in self.OEB_STYLES:
            item.override_css_fetch = functools.partial(
                self.css_import_handler, os.path.dirname(link))
        # Accessing .data is done for its side effect (see "Load into
        # memory" above); the value itself is not needed here.
        item.data
        self.added_resources[link] = href

    nlink = self.added_resources[link]
    if frag:
        nlink = '#'.join((nlink, frag))
    return nlink
def image_from_data(data):
    """Create an image object from data, which should be a bytestring.

    A ``QImage`` passed in is returned unchanged. Otherwise the data is
    loaded into a new ``QImage``; if that fails and ``what()`` reports
    ``'jxr'`` the result of ``load_jxr_data()`` is returned instead.

    :raises NotImage: if the data cannot be loaded and is not ``'jxr'``.
    """
    if isinstance(data, QImage):
        return data
    image = QImage()
    if image.loadFromData(data):
        return image
    detected = what(None, data)
    if detected != 'jxr':
        raise NotImage(
            'Not a valid image (detected type: {})'.format(detected))
    return load_jxr_data(data)
def extract_images(self, processed_records, output_dir):
    """Save the raster images found in the book's sections to disk.

    Images are written to ``<output_dir>/images`` as ``NNNNN.<fmt>``. The
    counter advances for every record visited, including skipped ones.
    GIFs are converted to PNG unless ``AnimatedGIF`` is raised. PNG data is
    written as-is; other formats go through ``save_cover_data_to()``.

    :param processed_records: list of record numbers already handled;
        visited record numbers are appended to it.
    :param output_dir: directory under which ``images`` is created.
    :return: mapping of image counter to file name. ``self.image_names`` is
        reset and filled with the names of the files actually written.
    """
    self.log.debug('Extracting images...')
    output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Leading bytes of records that are known not to be images.
    non_image_magic = {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'RESC',
                       b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}
    raster_formats = {'jpg', 'jpeg', 'gif', 'png', 'bmp'}

    self.image_names = []
    names_by_index = {}
    first = getattr(self.book_header, 'first_image_index', -1)
    if not 0 <= first <= self.num_sections:
        # BAEN PRC files have bad headers
        first = 0

    image_index = 0
    for record_no in range(first, self.num_sections):
        if record_no in processed_records:
            continue
        processed_records.append(record_no)
        payload = self.sections[record_no][0]
        image_index += 1
        if payload[:4] in non_image_magic:
            # No need to try to load a known non image record
            continue
        try:
            kind = what(None, payload)
        except Exception:
            continue
        if kind not in raster_formats:
            continue

        if kind == 'jpeg':
            kind = 'jpg'
        elif kind == 'gif':
            try:
                payload = gif_data_to_png_data(payload)
            except AnimatedGIF:
                pass
            else:
                kind = 'png'

        file_name = '%05d.%s' % (image_index, kind)
        target = os.path.join(output_dir, file_name)
        names_by_index[image_index] = file_name
        if kind == 'png':
            with open(target, 'wb') as out:
                out.write(payload)
        else:
            try:
                save_cover_data_to(payload, target, minify_to=(10000, 10000))
            except Exception:
                logging.exception('Exception has been thrown during '
                                  'transforming image')
                continue
        self.image_names.append(file_name)
    return names_by_index
def dataize_svg(self, item, svg=None):
    """Point ``xlink:href`` references in an SVG at temporary files.

    For every SVG element with an ``xlink:href`` that resolves (relative to
    ``item``) to an entry in the manifest, the entry's bytes are written to
    a ``PersistentTemporaryFile`` and the attribute is replaced with that
    file's name. The file names are also recorded in ``self.temp_files``.

    :param item: manifest item the SVG belongs to.
    :param svg: tree to process; defaults to ``item.data``.
    :return: the (modified in place) tree.
    """
    tree = item.data if svg is None else svg
    known = self.oeb.manifest.hrefs
    for node in xpath(tree, '//svg:*[@xl:href]'):
        href_attr = base.tag('xlink', 'href')
        target = urllib.parse.urldefrag(urlnormalize(node.attrib[href_attr]))[0]
        if not target:
            continue
        resolved = item.abshref(target)
        if resolved not in known:
            continue
        payload = known[resolved].bytes_representation
        suffix = '.' + (what(None, payload) or 'jpg')
        with PersistentTemporaryFile(suffix=suffix) as tmp:
            tmp.write(payload)
            self.temp_files.append(tmp.name)
        node.attrib[href_attr] = tmp.name
    return tree
def process_image(self, data):
    """Run image data through the configured image converter.

    When ``self.process_images`` is false the data is returned untouched.
    Otherwise ``mobify_image`` (if ``self.opts.mobi_keep_original_images``)
    or ``rescale_image`` is applied. If that fails:

    * GIF data is passed to ``mobify_image`` instead;
    * PNG data is written to a temporary file, optimised with
      ``optimize_png``, read back and converted again;
    * for anything else the original exception is re-raised.

    :param data: raw image bytes.
    :return: the processed image bytes.
    """
    if not self.process_images:
        return data
    func = (mobify_image if self.opts.mobi_keep_original_images
            else rescale_image)
    try:
        return func(data)
    except Exception:
        ext = what(None, data)
        if ext == 'gif':
            # No temporary copy is made here: nothing ever read it and it
            # was never removed.
            return mobify_image(data)
        if ext != 'png':
            raise
        with PersistentTemporaryFile(suffix='.png') as pt:
            pt.write(data)
        try:
            from ebook_converter.utils.img import optimize_png
            optimize_png(pt.name)
            # Close the handle before the file is removed below.
            with open(pt.name, 'rb') as optimized:
                data = optimized.read()
        finally:
            os.remove(pt.name)
        return func(data)
def add_resources(self, add_fonts):
    """Append image (and optionally font) records to ``self.records``.

    Record 0 is reserved for the masthead when the guide has one or when
    ``self.is_periodical`` is set (a default one is generated then). Every
    manifest item whose media type is in ``OEB_RASTER_IMAGES`` is run
    through ``self.process_image()`` and appended; ``self.item_map`` and
    ``self.mime_map`` are filled per href. For the cover item a thumbnail
    record is appended as well and ``self.cover_offset`` /
    ``self.thumbnail_offset`` are set.

    :param add_fonts: when true, ``.ttf``/``.otf`` items whose data is
        ``bytes`` are appended as font records and ``self.has_fonts`` is set.
    """
    oeb = self.oeb
    oeb.logger.info('Serializing resources...')
    # Number assigned to the next image via item_map; cover_offset and
    # thumbnail_offset are derived from it by subtracting one.
    index = 1

    mh_href = None
    if 'masthead' in oeb.guide and oeb.guide['masthead'].href:
        mh_href = oeb.guide['masthead'].href
        # Placeholder; replaced when the masthead item is met below.
        self.records.append(None)
        index += 1
        self.used_image_indices.add(0)
        self.image_indices.add(0)
    elif self.is_periodical:
        # Generate a default masthead
        data = generate_masthead(str(self.oeb.metadata['title'][0]))
        self.records.append(data)
        self.used_image_indices.add(0)
        self.image_indices.add(0)
        index += 1

    cover_href = self.cover_offset = self.thumbnail_offset = None
    if (oeb.metadata.cover and
            str(oeb.metadata.cover[0]) in oeb.manifest.ids):
        cover_id = str(oeb.metadata.cover[0])
        item = oeb.manifest.ids[cover_id]
        cover_href = item.href

    for item in self.oeb.manifest.values():
        if item.media_type not in OEB_RASTER_IMAGES:
            continue
        try:
            data = self.process_image(item.data)
        except Exception:
            # A broken image must not abort the whole book, but
            # KeyboardInterrupt/SystemExit are allowed to propagate.
            self.log.warn('Bad image file %r' % item.href)
            continue
        else:
            if mh_href and item.href == mh_href:
                self.records[0] = data
                continue

            self.image_indices.add(len(self.records))
            self.records.append(data)
            self.item_map[item.href] = index
            self.mime_map[item.href] = 'image/%s' % what(None, data)
            index += 1

            if cover_href and item.href == cover_href:
                self.cover_offset = self.item_map[item.href] - 1
                self.used_image_indices.add(self.cover_offset)
                try:
                    data = rescale_image(item.data, dimen=MAX_THUMB_DIMEN,
                                         maxsizeb=MAX_THUMB_SIZE)
                except Exception:
                    self.log.warn('Failed to generate thumbnail')
                else:
                    self.image_indices.add(len(self.records))
                    self.records.append(data)
                    self.thumbnail_offset = index - 1
                    self.used_image_indices.add(self.thumbnail_offset)
                    index += 1
        finally:
            # Runs on every path above, including the ``continue`` ones.
            item.unload_data_from_memory()

    if add_fonts:
        for item in self.oeb.manifest.values():
            if item.href and item.href.rpartition('.')[-1].lower() in {
                    'ttf', 'otf'} and isinstance(item.data, bytes):
                self.records.append(write_font_record(item.data))
                self.item_map[item.href] = len(self.records)
                self.has_fonts = True
def mobify_image(data):
    """Convert PNG images to GIF as the Kindle cannot display some PNG.

    Data that ``what()`` does not identify as ``'png'`` is returned
    unchanged.
    """
    if what(None, data) != 'png':
        return data
    return png_data_to_gif_data(data)
def find_imgtype(data):
    """Return the image type sniffed from ``data``, or ``'unknown'``."""
    detected = what(None, data)
    if detected:
        return detected
    return 'unknown'