def Book(options, logger, font_delta=0, header=None, profile=PRS500_PROFILE, **settings):
    """
    Create a ``_Book`` whose default page, text and block styles are derived
    from ``profile`` and ``options``.

    :param options: Object providing ``top_margin``, ``bottom_margin``,
        ``left_margin``, ``right_margin``, ``wordspace`` and (when a header is
        requested) ``header_separation``. It is also passed to
        ``find_custom_fonts``.
    :param logger: Passed through to ``find_custom_fonts``.
    :param font_delta: Added to the profile font size; each unit contributes
        20 to the computed ``fontsize``.
    :param header: If truthy, appended to a header text block that is
        installed in the page style.
    :param profile: Object providing the screen dimensions, ``fudge``, font
        metrics and ``default_fonts`` read below.
    :param settings: Extra keyword arguments forwarded to ``_Book``.
    :return: ``(book, fonts)`` where ``fonts`` maps each of ``serif``,
        ``sans`` and ``mono`` to a dictionary containing a ``normal`` entry.
    :raises ConversionError: if a custom font family was found but lacks a
        ``normal`` variant.
    """
    import uuid

    text_width = profile.screen_width - (options.left_margin + options.right_margin)
    page_style = {
        'topmargin': options.top_margin,
        'evensidemargin': options.left_margin,
        'oddsidemargin': options.left_margin,
        'textwidth': text_width,
        'textheight': profile.screen_height
        - (options.top_margin + options.bottom_margin)
        - profile.fudge,
    }

    if header:
        # Construction order of these objects is kept exactly as it was.
        page_header = Header()
        header_text_style = TextStyle(
            align='foot', fontsize=int(profile.header_font_size * 10))
        header_block_style = BlockStyle(blockwidth=page_style['textwidth'])
        header_block = TextBlock(
            textStyle=header_text_style, blockStyle=header_block_style)
        header_block.append(header)
        page_header.PutObj(header_block)

        page_style.update({
            'headheight': profile.header_height,
            'headsep': options.header_separation,
            'header': page_header,
            'topmargin': 0,
        })
        # With a header the top margin is zero and the header's own height
        # and separation are subtracted from the text area instead.
        page_style['textheight'] = (
            profile.screen_height
            - (options.bottom_margin + page_style['topmargin'])
            - page_style['headheight']
            - page_style['headsep']
            - profile.fudge
        )

    font_size = int(10 * profile.font_size + font_delta * 20)
    fonts = find_custom_fonts(options, logger)

    text_style = {
        'fontsize': font_size,
        'parindent': int(10 * profile.parindent),
        'linespace': int(10 * profile.line_space),
        'baselineskip': font_size + 20,
        'wordspace': 10 * options.wordspace,
    }
    serif = fonts['serif']
    if serif and 'normal' in serif:
        text_style['fontfacename'] = serif['normal'][1]

    book = _Book(
        textstyledefault=text_style,
        pagestyledefault=page_style,
        blockstyledefault={'blockwidth': page_style['textwidth']},
        bookid=uuid.uuid4().hex,
        **settings)

    # Embed every custom font that was found and record its file by face name.
    for family in fonts.keys():
        variants = fonts[family]
        if not variants:
            continue
        for font in variants.values():
            book.embed_font(*font)
            FONT_FILE_MAP[font[1]] = font[0]

    # Every family must end up with a 'normal' entry: fall back to the
    # profile default when no custom font exists, fail when only
    # non-normal variants were found.
    for family in ('serif', 'sans', 'mono'):
        variants = fonts[family]
        if not variants:
            fonts[family] = {'normal': (None, profile.default_fonts[family])}
            continue
        if 'normal' not in variants:
            raise ConversionError('Could not find the normal version of the ' + family + ' font')

    return book, fonts
def process_file(lrfpath, opts, logger=None):
    """
    Convert the LRF file at ``lrfpath`` by handing it to ``LRFConverter``.

    :param lrfpath: Path of the LRF file; it is opened in binary mode.
    :param opts: Options object. ``opts.verbose`` selects the log level when
        no logger is supplied. ``opts.out`` is the output directory and is
        rewritten in place to an absolute path (or to the current working
        directory when it is ``None``).
    :param logger: Optional logger. When ``None`` the ``'lrf2html'`` logger
        is fetched and configured with ``setup_cli_handlers``.
    :raises ConversionError: if ``opts.out`` exists but is not a directory.
    """
    if logger is None:
        level = logging.DEBUG if opts.verbose else logging.INFO
        logger = logging.getLogger('lrf2html')
        setup_cli_handlers(logger, level)

    if opts.out is None:
        # os.getcwdu() only exists on Python 2; use os.getcwd() elsewhere.
        opts.out = getattr(os, 'getcwdu', os.getcwd)()
    else:
        opts.out = os.path.abspath(opts.out)
        # Only reject paths that exist and are something other than a
        # directory. Rejecting missing paths as well would make the
        # makedirs() call below unreachable.
        if os.path.exists(opts.out) and not os.path.isdir(opts.out):
            raise ConversionError(opts.out + ' is not a directory')
    if not os.path.exists(opts.out):
        os.makedirs(opts.out)

    # Keep the stream open for the whole conversion, then close it
    # deterministically instead of leaking the handle.
    with open(lrfpath, 'rb') as stream:
        document = LRFDocument(stream)
        LRFConverter(document, opts, logger)
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir

    :param output_dir: Directory that receives a temporary copy of the PDF
        (``src.pdf``) and the generated ``index.html`` / ``index.xml``.
    :param pdf_path: Path of the PDF to convert.
    :param no_images: If true, ``-i`` is added to the pdftohtml command line.
    :param as_xml: If true, ``-xml`` is added, the output goes to
        ``index.xml`` and both the HTML post-processing and the outline
        extraction are skipped.
    :raises ConversionError: if the executable cannot be found or exits with
        a non-zero return code.
    :raises DRMError: if no index file was produced or it is smaller than
        100 bytes.
    '''
    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.' + ('xml' if as_xml else 'html'))

    with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):

        def a(x):
            # The command runs inside output_dir, so bare file names suffice.
            return os.path.basename(x)

        exe = PDFTOHTML
        cmd = [
            exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm',
            a(pdfsrc), a(index)
        ]
        if isbsd:
            cmd.remove('-nodrm')
        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise
        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
        # Read the log back through a context manager so the handle is
        # closed immediately rather than whenever it is garbage collected.
        with lopen(logf.name, 'rb') as log_stream:
            out = log_stream.read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: %d\n%s' % (ret, out))
        if out:
            prints("pdftohtml log:")
            prints(out)
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with lopen(index, 'r+b') as i:
                raw = i.read().decode('utf-8')
                raw = flip_images(raw)
                raw = raw.replace(
                    '<head', '<!-- created by calibre\'s pdftohtml -->\n <head', 1)
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                # Turn numeric anchor names into ids prefixed with "p" and
                # point the matching links at them.
                raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw, flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
                raw = replace_entities(raw)
                raw = raw.replace('\u00a0', ' ')
                # Truncate only once every transformation has succeeded, so a
                # failure above cannot leave an empty index file behind.
                i.seek(0)
                i.truncate()
                i.write(raw.encode('utf-8'))

            # Second run: XML written to stdout, limited with -f 1 -l 1,
            # whose output is handed to parse_outline().
            cmd = [
                exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8',
                '-noframes', '-p', '-nomerge', '-nodrm', '-q', '-stdout',
                a(pdfsrc)
            ]
            if isbsd:
                cmd.remove('-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        # Best-effort cleanup of the temporary PDF copy; only file system
        # errors are ignored.
        try:
            os.remove(pdfsrc)
        except OSError:
            pass
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir

    :param output_dir: Directory that receives a temporary copy of the PDF
        (``src.pdf``) and the generated ``index.html`` / ``index.xml``.
    :param pdf_path: Path of the PDF to convert.
    :param no_images: If true, ``-i`` is added to the pdftohtml command line.
    :param as_xml: If true, ``-xml`` is added, the output goes to
        ``index.xml`` and both the HTML post-processing and the outline
        extraction are skipped.
    :raises ConversionError: if the executable cannot be found or exits with
        a non-zero return code.
    :raises DRMError: if no index file was produced or it is smaller than
        100 bytes.
    '''
    pdfsrc = os.path.join(output_dir, u'src.pdf')
    index = os.path.join(output_dir, u'index.' + ('xml' if as_xml else 'html'))

    with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):
        # This is necessary as pdftohtml doesn't always (linux) respect
        # absolute paths. Also, it allows us to safely pass only bytestring
        # arguments to subprocess on windows

        # subprocess in python 2 cannot handle unicode arguments on windows
        # that cannot be encoded with mbcs. Ensure all args are
        # bytestrings.
        def a(x):
            return os.path.basename(x).encode('ascii')

        exe = PDFTOHTML.encode(filesystem_encoding) if isinstance(
            PDFTOHTML, unicode) else PDFTOHTML

        cmd = [
            exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
            b'-nodrm', b'-q', a(pdfsrc), a(index)
        ]
        if isbsd:
            cmd.remove(b'-nodrm')
        if no_images:
            cmd.append(b'-i')
        if as_xml:
            cmd.append('-xml')

        logf = PersistentTemporaryFile(u'pdftohtml_log')
        try:
            p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise

        # Retry the wait when it is interrupted by a signal.
        while True:
            try:
                ret = p.wait()
                break
            except OSError as e:
                if e.errno == errno.EINTR:
                    continue
                else:
                    raise
        logf.flush()
        logf.close()
        # Read the log back through a context manager so the handle is
        # closed immediately rather than whenever it is garbage collected.
        with open(logf.name, 'rb') as log_stream:
            out = log_stream.read().strip()
        if ret != 0:
            raise ConversionError(b'return code: %d\n%s' % (ret, out))
        if out:
            # Parenthesised single-argument form prints the same text
            # whether print is a statement or a function.
            print("pdftohtml log:")
            print(out)
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with open(index, 'r+b') as i:
                raw = i.read()
                raw = flip_images(raw)
                raw = '<!-- created by calibre\'s pdftohtml -->\n' + raw
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
                raw = raw.replace(b'<br/>', b'<br>')
                raw = re.sub(br'<a\s+name=(\d+)', br'<a id="\1"', raw, flags=re.I)
                i.write(raw)

            # Second run: XML written to stdout, limited with -f 1 -l 1,
            # whose output is handed to parse_outline().
            cmd = [
                exe, b'-f', b'1', '-l', '1', b'-xml', b'-i', b'-enc',
                b'UTF-8', b'-noframes', b'-p', b'-nomerge', b'-nodrm', b'-q',
                b'-stdout', a(pdfsrc)
            ]
            # -nodrm has to be dropped BEFORE the process is started;
            # removing it afterwards has no effect on the command that ran.
            if isbsd:
                cmd.remove(b'-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        # Best-effort cleanup of the temporary PDF copy; only file system
        # errors are ignored.
        try:
            os.remove(pdfsrc)
        except OSError:
            pass
def XPath(x):
    """
    Compile ``x`` into an ``etree.XPath`` object using the ``XPNSMAP``
    namespace map.

    :param x: The XPath expression to compile.
    :return: The compiled ``etree.XPath`` object.
    :raises ConversionError: if ``etree`` reports an ``XPathSyntaxError``
        for the expression.
    """
    try:
        compiled = etree.XPath(x, namespaces=XPNSMAP)
    except etree.XPathSyntaxError:
        message = 'The syntax of the XPath expression %s is invalid.' % repr(x)
        raise ConversionError(message)
    return compiled
def call_convert_cmd(log, output_dir, pdf_name, first=None, last=None):
    '''
    Convert the pdf into xml/txt using the pdftohtml/text app.
    This will write the output as index.xml/.txt into output_dir.
    pdftotext is often better than pdftohtml.

    :param log: Callable used to report the converter's log output.
    :param output_dir: Directory that contains ``pdf_name`` and receives the
        index file.
    :param pdf_name: File name of the PDF inside ``output_dir``.
    :param first: Optional value passed to the converter with ``-f``.
    :param last: Optional value passed to the converter with ``-l``.
    :return: The extracted text. In pdftohtml mode this is the concatenated
        text of every ``text`` element of the generated XML.
    :raises ConversionError: if the executable cannot be found or exits with
        a non-zero return code.
    :raises DRMError: if the converter produced no index file.
    '''
    from calibre.ebooks.pdf.pdftohtml import popen

    pdfsrc = os.path.join(output_dir, pdf_name)
    if USE_PDFTOTEXT:
        EXE = 'pdftotext'
        index_file = os.path.join(output_dir, 'index.txt')
    else:
        from calibre.ebooks.pdf.pdftohtml import PDFTOHTML as EXE
        index_file = os.path.join(output_dir, 'index.xml')

    # Remove any stale output so the existence check after the run is
    # meaningful.
    if os.path.exists(index_file):
        os.remove(index_file)

    with CurrentDir(output_dir):
        # This is necessary as pdftohtml doesn't always (linux) respect
        # absolute paths. Also, it allows us to safely pass only bytestring
        # arguments to subprocess on windows

        # subprocess in python 2 cannot handle unicode arguments on windows
        # that cannot be encoded with mbcs. Ensure all args are bytestrings.
        def a(x):
            return os.path.basename(x).encode('ascii')

        exe = EXE.encode(filesystem_encoding) if isinstance(EXE, str) else EXE

        if USE_PDFTOTEXT:
            cmd = [
                exe, b'-enc', b'UTF-8', b'-nopgbrk', b'-q', a(pdfsrc),
                a(index_file)
            ]
        else:
            cmd = [
                exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
                b'-nodrm', b'-q', a(pdfsrc), a(index_file), b'-xml', b'-i'
            ]
            if isbsd:
                cmd.remove(b'-nodrm')

        # Page limits are encoded as well so that every argv element is a
        # bytestring, as the comment above requires.
        if first is not None:
            cmd.append(b'-f')
            cmd.append(str(first).encode('ascii'))
        if last is not None:
            cmd.append(b'-l')
            cmd.append(str(last).encode('ascii'))

        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find %s, check it is in your PATH') % EXE)
            else:
                raise

        # Retry the wait when it is interrupted by a signal.
        while True:
            try:
                ret = p.wait()
                break
            except OSError as e:
                if e.errno == errno.EINTR:
                    continue
                else:
                    raise
        logf.flush()
        logf.close()
        # Close the log handle deterministically and decode the output so
        # the error message and the log receive text instead of raw bytes.
        with open(logf.name, 'rb') as log_stream:
            out = log_stream.read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError(out)
        if out:
            log('%s log:' % EXE)
            log(out)
        if not os.path.exists(index_file):
            raise DRMError()

    # Undecodable bytes are dropped while reading in either mode.
    with open(index_file, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()
    if USE_PDFTOTEXT:
        return content

    # avoid encoding problems
    parser = etree.XMLParser(recover=True)
    tree = etree.fromstring(clean_ascii_chars(content.encode('utf-8')), parser)
    return ''.join(e.text or '' for e in tree.iter('text'))