Пример #1
0
    def temporary_file(self, suffix):
        """
        Create a temporary file on disk and hand back its file-like object.

        Closing the returned object does not delete the file: it stays on the
        file system until the interpreter shuts down. The full path of the
        file is available through the ``name`` member of the returned object.

        :param suffix: The suffix given to the temporary file.
        :return: The newly created ``PersistentTemporaryFile`` object.
        """
        tmp_file = PersistentTemporaryFile(suffix)
        return tmp_file
Пример #2
0
    def process_image(self, data):
        """
        Convert raw image data for the output, if image processing is on.

        When ``self.process_images`` is false the data is returned untouched.
        Otherwise it is passed to ``mobify_image`` (when
        ``self.opts.mobi_keep_original_images`` is set) or ``rescale_image``.
        If that conversion raises, a fallback is attempted for GIF and PNG
        data only: GIF data is handed to ``mobify_image``; PNG data is first
        run through ``optimize_png`` and then converted again.

        :param data: The raw bytes of the image.
        :return: The converted image data, or ``data`` itself when image
                 processing is disabled.
        :raises Exception: The original conversion error is re-raised when
                           the data is neither PNG nor GIF.
        """
        if not self.process_images:
            return data
        func = (mobify_image if self.opts.mobi_keep_original_images
                else rescale_image)
        try:
            return func(data)
        except Exception:
            ext = what(None, data)
            if ext not in ('png', 'gif'):
                raise
            if ext == 'gif':
                # mobify_image() is given the in-memory bytes, so there is no
                # need to spill them to a temporary file that would then be
                # left behind until interpreter shutdown.
                return mobify_image(data)

            # optimize_png() is given a path, so the data is written to a
            # temporary file and read back from it afterwards.
            with PersistentTemporaryFile(suffix='.png') as pt:
                pt.write(data)
            try:
                from ebook_converter.utils.img import optimize_png
                optimize_png(pt.name)
                with open(pt.name, 'rb') as optimized:
                    data = optimized.read()
            finally:
                os.remove(pt.name)
            return func(data)
Пример #3
0
    def extract_content(self, output_dir):
        """
        Reassemble the embedded PDF and convert it with the PDF input plugin.

        Every section reported by ``self.header`` is written, in order, to a
        temporary ``.pdf`` file. Options of the PDF input plugin that are
        missing from ``self.options`` are set to their recommended values,
        and the temporary file is then passed to the plugin's ``convert``.

        :param output_dir: Not used directly by this method.
        :return: Whatever the PDF input plugin's ``convert`` returns.
        """
        self.log.info('Extracting PDF...')

        # The temporary file object is only used to reserve a path; open()
        # needs that path (``name``), not the file object itself.
        tmp = PersistentTemporaryFile('.pdf')
        tmp.close()
        pdf_path = tmp.name
        with open(pdf_path, 'wb') as pdf:
            for x in range(self.header.section_count()):
                pdf.write(self.header.section_data(x))

        from ebook_converter.customize.ui import plugin_for_input_format

        pdf_plugin = plugin_for_input_format('pdf')
        for opt in pdf_plugin.options:
            if not hasattr(self.options, opt.option.name):
                setattr(self.options, opt.option.name, opt.recommended_value)

        # NOTE(review): the stream is closed once convert() returns; this
        # assumes the plugin does not keep reading from it afterwards.
        with open(pdf_path, 'rb') as stream:
            return pdf_plugin.convert(stream, self.options, 'pdf', self.log,
                                      {})
Пример #4
0
 def dataize_svg(self, item, svg=None):
     """
     Redirect manifest-backed ``xlink:href`` links in an SVG to temp files.

     For every SVG element carrying an ``xlink:href`` attribute, the link
     (without its fragment) is resolved relative to ``item``. When the
     result is present in the manifest hrefs, the linked item's bytes are
     written to a persistent temporary file, the file's path is recorded
     in ``self.temp_files`` and the attribute is rewritten to that path.

     :param item: Item the links are resolved against; its ``data`` is
                  used as the SVG tree when ``svg`` is not given.
     :param svg: Optional SVG tree to process instead of ``item.data``.
     :return: The processed SVG tree.
     """
     tree = item.data if svg is None else svg
     manifest_hrefs = self.oeb.manifest.hrefs
     for node in xpath(tree, '//svg:*[@xl:href]'):
         href_key = base.tag('xlink', 'href')
         normalized = urlnormalize(node.attrib[href_key])
         path, _fragment = urllib.parse.urldefrag(normalized)
         if path:
             target = item.abshref(path)
             if target in manifest_hrefs:
                 payload = manifest_hrefs[target].bytes_representation
                 # Fall back to 'jpg' when the image type is not detected.
                 extension = what(None, payload) or 'jpg'
                 with PersistentTemporaryFile(suffix='.' + extension) as tmp:
                     tmp.write(payload)
                     self.temp_files.append(tmp.name)
                 node.attrib[href_key] = tmp.name
     return tree
Пример #5
0
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir

    :param output_dir: Directory that receives the output. The PDF is copied
                       into it as ``src.pdf`` while the conversion runs.
    :param pdf_path: Path of the PDF file to convert.
    :param no_images: When true, ``-i`` is passed to pdftohtml.
    :param as_xml: When true, ``-xml`` is passed to pdftohtml and the output
                   is written to ``index.xml`` instead of ``index.html``; the
                   html post-processing and the outline extraction are
                   skipped.
    :raises ConversionError: If pdftohtml cannot be found or exits with a
                             non-zero return code.
    :raises DRMError: If the index file is missing or smaller than 100 bytes
                      after the conversion.
    '''

    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.' + ('xml' if as_xml else 'html'))

    with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):

        # Files are handed to pdftohtml by base name only (presumably
        # because CurrentDir makes output_dir the working directory).
        def a(x):
            return os.path.basename(x)

        cmd = [
            'pdftohtml', '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
            '-nodrm',
            a(pdfsrc),
            a(index)
        ]

        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        # Both stdout and stderr of pdftohtml are collected in one log file.
        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd,
                      stderr=logf._fd,
                      stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError('Could not find pdftohtml, check it is '
                                      'in your PATH') from err
            else:
                raise
        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
        with open(logf.name, 'rb') as log_stream:
            out = log_stream.read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: '
                                  '%d\n%s' % (ret, out))
        if out:
            print("pdftohtml log:")
            print(out)
        # A missing or nearly empty index file is reported as a DRM problem.
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            # Rewrite the generated html in place.
            with open(index, 'r+b') as i:
                raw = i.read().decode('utf-8', 'replace')
                raw = flip_images(raw)
                raw = raw.replace(
                    '<head', '<!-- created by ebook-converter\'s'
                    ' pdftohtml -->\n  <head', 1)
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags,
                # this breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                # Turn numeric anchor names into ids prefixed with "p" and
                # point the links to index.html at those ids.
                raw = re.sub(r'<a\s+name=(\d+)',
                             r'<a id="\1"',
                             raw,
                             flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"',
                             r'<a href="#p\1"',
                             raw,
                             flags=re.I)
                raw = xml_replace_entities(raw)
                raw = raw.replace('\u00a0', ' ')

                i.write(raw.encode('utf-8'))

            # Second run: xml output of the first page only, written to
            # stdout, which is passed to parse_outline().
            cmd = [
                'pdftohtml', '-f', '1', '-l', '1', '-xml', '-i', '-enc',
                'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', '-q',
                '-stdout',
                a(pdfsrc)
            ]
            p = popen(cmd, stdout=subprocess.PIPE)
            try:
                raw = p.stdout.read().strip()
            finally:
                # Release the pipe instead of leaving it to the garbage
                # collector.
                p.stdout.close()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        try:
            os.remove(pdfsrc)
        except OSError:
            # Best-effort cleanup: failing to delete the copied PDF must not
            # fail an otherwise successful conversion.
            pass