def temporary_file(self, suffix):
    """
    Create a temporary file on disk and return a file-like object for it.

    Closing the returned object does not delete the file: it stays
    available and is only removed on interpreter shutdown. The full path
    of the created file is exposed through the ``name`` member of the
    returned object.

    :param suffix: The suffix that the temporary file will have.
    """
    persistent = PersistentTemporaryFile(suffix)
    return persistent
def process_image(self, data):
    """
    Run raw image bytes through the configured image conversion.

    When ``self.process_images`` is false the data is returned untouched.
    Otherwise ``mobify_image`` is used if
    ``self.opts.mobi_keep_original_images`` is set, else ``rescale_image``.

    If the conversion raises, the image type is sniffed with ``what``:

    * anything other than PNG or GIF re-raises the original exception;
    * GIF data is passed to ``mobify_image``;
    * PNG data is written to a temporary file, run through
      ``optimize_png``, read back and converted again.

    :param data: The raw image bytes.
    :return: The converted image data, or ``data`` itself when image
             processing is disabled.
    """
    if not self.process_images:
        return data

    if self.opts.mobi_keep_original_images:
        func = mobify_image
    else:
        func = rescale_image

    try:
        return func(data)
    except Exception:
        ext = what(None, data)
        if ext not in ('png', 'gif'):
            raise

        if ext == 'gif':
            # NOTE(review): this temporary file is written but never read
            # or removed here; kept to preserve existing behaviour.
            # Confirm whether it can be dropped.
            with PersistentTemporaryFile(suffix='.gif') as pt:
                pt.write(data)
            return mobify_image(data)

        with PersistentTemporaryFile(suffix='.png') as pt:
            pt.write(data)
        try:
            from ebook_converter.utils.img import optimize_png
            optimize_png(pt.name)
            # Close the handle explicitly before the file is removed in
            # the ``finally`` clause instead of relying on garbage
            # collection to do it.
            with open(pt.name, 'rb') as optimized:
                data = optimized.read()
        finally:
            os.remove(pt.name)
        return func(data)
def extract_content(self, output_dir):
    """
    Reassemble the embedded PDF and convert it with the PDF input plugin.

    Every section reported by ``self.header`` is written, in order, to a
    persistent temporary ``.pdf`` file. Options declared by the PDF input
    plugin that are missing from ``self.options`` are filled in with the
    plugin's recommended values, and the temporary file is then handed to
    the plugin's ``convert``.

    :param output_dir: Not used by this method.
    :return: Whatever the PDF input plugin's ``convert`` returns.
    """
    self.log.info('Extracting PDF...')

    # Only the path of the temporary file is needed; it is reopened by
    # name below. ``open`` must be given that path, not the temporary
    # file object itself.
    tmp = PersistentTemporaryFile('.pdf')
    tmp.close()
    pdf_path = tmp.name

    with open(pdf_path, 'wb') as pdf:
        for x in range(self.header.section_count()):
            pdf.write(self.header.section_data(x))

    from ebook_converter.customize.ui import plugin_for_input_format
    pdf_plugin = plugin_for_input_format('pdf')
    for opt in pdf_plugin.options:
        if not hasattr(self.options, opt.option.name):
            setattr(self.options, opt.option.name, opt.recommended_value)

    # NOTE(review): the stream is closed once convert() returns; this
    # assumes the plugin does not keep using it afterwards - confirm.
    with open(pdf_path, 'rb') as stream:
        return pdf_plugin.convert(stream, self.options, 'pdf', self.log, {})
def dataize_svg(self, item, svg=None):
    """
    Redirect ``xlink:href`` references in an SVG tree to temporary files.

    For each SVG element carrying an ``xlink:href`` whose target (fragment
    stripped) resolves to an entry of ``self.oeb.manifest.hrefs``, the
    entry's bytes are written to a persistent temporary file and the
    attribute is rewritten to that file's path. Each created path is also
    appended to ``self.temp_files``. References with an empty path or
    without a manifest entry are left untouched.

    :param item: The manifest item owning the SVG; used to resolve
                 relative hrefs and, when ``svg`` is ``None``, as the
                 source of the tree (``item.data``).
    :param svg: Optional SVG tree to process instead of ``item.data``.
    :return: The processed tree.
    """
    tree = item.data if svg is None else svg
    manifest_hrefs = self.oeb.manifest.hrefs

    for node in xpath(tree, '//svg:*[@xl:href]'):
        link = urlnormalize(node.attrib[base.tag('xlink', 'href')])
        target = urllib.parse.urldefrag(link)[0]
        if not target:
            continue

        resolved = item.abshref(target)
        if resolved not in manifest_hrefs:
            continue

        payload = manifest_hrefs[resolved].bytes_representation
        # Fall back to a .jpg suffix when the type cannot be detected.
        kind = what(None, payload) or 'jpg'
        with PersistentTemporaryFile(suffix='.' + kind) as tmp:
            tmp.write(payload)
            self.temp_files.append(tmp.name)
        node.attrib[base.tag('xlink', 'href')] = tmp.name

    return tree
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    """
    Convert a PDF into HTML (or XML) using the ``pdftohtml`` application.

    The PDF is copied to ``src.pdf`` inside ``output_dir`` and converted
    from within that directory. The result is written to ``index.html``
    (``index.xml`` when ``as_xml`` is true) in ``output_dir``. For HTML
    output the generated file is post-processed in place and a second
    ``pdftohtml`` run feeds ``parse_outline``. The copied ``src.pdf`` is
    removed on success (best effort).

    :param output_dir: Directory that receives the converted output.
    :param pdf_path: Path of the PDF file to convert.
    :param no_images: When true, ``-i`` is passed to ``pdftohtml``.
    :param as_xml: When true, ``-xml`` is passed and the HTML
                   post-processing and outline pass are skipped.
    :raises ConversionError: If ``pdftohtml`` cannot be found or exits
                             with a non-zero return code.
    :raises DRMError: If the output file is missing or smaller than 100
                      bytes.
    """
    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.' + ('xml' if as_xml else 'html'))

    with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):

        # pdftohtml runs inside output_dir, so it is given bare file names.
        def a(x):
            return os.path.basename(x)

        cmd = [
            'pdftohtml', '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
            '-nodrm', a(pdfsrc), a(index)
        ]

        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        # Both stdout and stderr of the child are captured in this log.
        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError('Could not find pdftohtml, check it is '
                                      'in your PATH')
            else:
                raise
        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
        # Read the log through a context manager so the handle is closed
        # deterministically.
        with open(logf.name, 'rb') as log_stream:
            out = log_stream.read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: '
                                  '%d\n%s' % (ret, out))
        if out:
            print("pdftohtml log:")
            print(out)
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with open(index, 'r+b') as i:
                raw = i.read().decode('utf-8', 'replace')
                raw = flip_images(raw)
                raw = raw.replace(
                    '<head', '<!-- created by ebook-converter\'s'
                    ' pdftohtml -->\n <head', 1)
                # Rewind and empty the file; the cleaned markup is written
                # back over it below.
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags,
                # this breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                # Turn numeric <a name=N> anchors into id="pN" and point
                # the index.html#N links at them.
                raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw,
                             flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw,
                             flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"',
                             r'<a href="#p\1"', raw, flags=re.I)
                raw = xml_replace_entities(raw)
                raw = raw.replace('\u00a0', ' ')

                i.write(raw.encode('utf-8'))

            # Second run: XML written to stdout, handed to parse_outline.
            cmd = [
                'pdftohtml', '-f', '1', '-l', '1', '-xml', '-i', '-enc',
                'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', '-q',
                '-stdout', a(pdfsrc)
            ]
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        # Best-effort cleanup of the copied source; failures are ignored
        # on purpose.
        try:
            os.remove(pdfsrc)
        except Exception:
            pass