Exemplo n.º 1
0
    def process_pml(self, pml_path, html_path, close_all=False):
        """Convert a PML document to HTML and return its table of contents.

        :param pml_path: Either a filesystem path or an object with a
            ``read`` method. A path is opened here in binary mode; an
            existing stream is rewound with ``seek(0)`` before reading.
        :param html_path: Either a filesystem path or an object with a
            ``write`` method. It is also handed unchanged to
            ``PML_HTMLizer.parse_pml`` as its second argument.
        :param close_all: Accepted for interface compatibility; this method
            does not read it.
        :return: The value of ``PML_HTMLizer.get_toc()`` after parsing.

        Only streams opened by this method are closed, and they are closed
        even when conversion fails. Caller-supplied streams are left open.
        """
        from calibre.ebooks.pml.pmlconverter import PML_HTMLizer

        pclose = False
        hclose = False
        pml_stream = None
        html_stream = None

        try:
            if not hasattr(pml_path, 'read'):
                pml_stream = open(pml_path, 'rb')
                pclose = True
            else:
                pml_stream = pml_path
                pml_stream.seek(0)

            if not hasattr(html_path, 'write'):
                html_stream = open(html_path, 'wb')
                hclose = True
            else:
                html_stream = html_path

            # Binary streams (e.g. the result of open(..., 'rb') on Python 3)
            # have no ``encoding`` attribute, so look it up defensively and
            # fall back to cp1252 when it is missing or empty.
            ienc = getattr(pml_stream, 'encoding', None) or 'cp1252'
            # An explicit user option always wins over the detected encoding.
            if self.options.input_encoding:
                ienc = self.options.input_encoding

            self.log.debug('Converting PML to HTML...')
            hizer = PML_HTMLizer()
            html = hizer.parse_pml(pml_stream.read().decode(ienc), html_path)
            html = '<html><head><title></title></head><body>%s</body></html>'%html
            html_stream.write(html.encode('utf-8', 'replace'))
        finally:
            # Release only the handles this method created.
            if pclose:
                pml_stream.close()
            if hclose:
                html_stream.close()

        return hizer.get_toc()
Exemplo n.º 2
0
    def process_pml(self, pml_path, html_path, close_all=False):
        """Convert a PML document to HTML and return its table of contents.

        :param pml_path: Either a filesystem path or an object with a
            ``read`` method. A path is opened here in binary mode; an
            existing stream is rewound with ``seek(0)`` before reading.
        :param html_path: Either a filesystem path or an object with a
            ``write`` method. It is also handed unchanged to
            ``PML_HTMLizer.parse_pml`` as its second argument.
        :param close_all: Accepted for interface compatibility; this method
            does not read it.
        :return: The value of ``PML_HTMLizer.get_toc()`` after parsing.

        Only streams opened by this method are closed, and they are closed
        even when conversion fails. Caller-supplied streams are left open.
        """
        from calibre.ebooks.pml.pmlconverter import PML_HTMLizer

        pclose = False
        hclose = False
        pml_stream = None
        html_stream = None

        try:
            if not hasattr(pml_path, 'read'):
                pml_stream = open(pml_path, 'rb')
                pclose = True
            else:
                pml_stream = pml_path
                pml_stream.seek(0)

            if not hasattr(html_path, 'write'):
                html_stream = open(html_path, 'wb')
                hclose = True
            else:
                html_stream = html_path

            # Binary streams (e.g. the result of open(..., 'rb') on Python 3)
            # have no ``encoding`` attribute, so look it up defensively and
            # fall back to cp1252 when it is missing or empty.
            ienc = getattr(pml_stream, 'encoding', None) or 'cp1252'
            # An explicit user option always wins over the detected encoding.
            if self.options.input_encoding:
                ienc = self.options.input_encoding

            self.log.debug('Converting PML to HTML...')
            hizer = PML_HTMLizer()
            html = hizer.parse_pml(pml_stream.read().decode(ienc), html_path)
            html = '<html><head><title></title></head><body>%s</body></html>'%html
            html_stream.write(html.encode('utf-8', 'replace'))
        finally:
            # Release only the handles this method created.
            if pclose:
                pml_stream.close()
            if hclose:
                html_stream.close()

        return hizer.get_toc()
Exemplo n.º 3
0
    def extract_content(self, output_dir):
        """Unpack the book into *output_dir* and return the OPF path.

        Writes ``index.html`` (the text pages converted from PML, followed by
        optional footnote and sidebar sections) and an ``images/``
        sub-directory with one file per image record, then returns the
        result of ``self.create_opf``.
        """
        from calibre.ebooks.pml.pmlconverter import (
            PML_HTMLizer, footnote_to_html, sidebar_to_html)

        header = self.header_record

        def note_fragments(id_record, record_count, log_template, to_html):
            # The record at ``id_record`` is scanned for runs of word
            # characters immediately followed by a NUL; those are used as
            # the ids of the note records that come after it.
            raw_ids = self.section_data(id_record)
            codec = 'cp1252' if self.encoding is None else self.encoding
            note_ids = re.findall('\\w+(?=\x00)', raw_ids.decode(codec))
            fragments = []
            records = range(id_record + 1, id_record + record_count)
            for position, record in enumerate(records):
                self.log.debug(log_template % record)
                # Notes beyond the end of the id list get an empty id.
                note_id = note_ids[position] if position < len(note_ids) else ''
                fragments.append(to_html(note_id, self.decompress_text(record)))
            return fragments

        output_dir = os.path.abspath(output_dir)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        book_title = self.mi.title
        if not isinstance(book_title, unicode_type):
            book_title = book_title.decode('utf-8', 'replace')
        parts = [u'<html><head><title>%s</title></head><body>' % book_title]

        # Gather every text page, then convert the whole PML in one pass.
        pages = []
        for page_no in range(1, header.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % page_no)
            pages.append(self.get_text_page(page_no))
        converter = PML_HTMLizer()
        parts.append(converter.parse_pml(u''.join(pages), 'index.html'))
        toc = converter.get_toc()

        if header.footnote_count > 0:
            parts.append('<br /><h1>%s</h1>' % _('Footnotes'))
            parts.extend(note_fragments(
                header.footnote_offset, header.footnote_count,
                'Extracting footnote page %i', footnote_to_html))

        if header.sidebar_count > 0:
            parts.append('<br /><h1>%s</h1>' % _('Sidebar'))
            parts.extend(note_fragments(
                header.sidebar_offset, header.sidebar_count,
                'Extracting sidebar page %i', sidebar_to_html))

        parts.append('</body></html>')
        html = u''.join(parts)

        with CurrentDir(output_dir), open('index.html', 'wb') as index_file:
            self.log.debug('Writing text to index.html')
            index_file.write(html.encode('utf-8'))

        images_dir = os.path.join(output_dir, 'images/')
        if not os.path.exists(images_dir):
            os.makedirs(images_dir)
        images = []
        with CurrentDir(images_dir):
            for index in range(header.num_image_pages):
                name, data = self.get_image(header.image_data_offset + index)
                images.append(name)
                with open(name, 'wb') as image_file:
                    self.log.debug('Writing image %s to images/' % name)
                    image_file.write(data)

        return self.create_opf(output_dir, images, toc)
Exemplo n.º 4
0
    def extract_content(self, output_dir):
        """Unpack the book into *output_dir* and return the OPF path.

        Writes ``index.html`` (the text pages converted from PML, followed by
        optional footnote and sidebar sections) and an ``images/``
        sub-directory with one file per image record.

        :param output_dir: Directory to write into; created if missing.
        :return: The value returned by ``self.create_opf``.
        """
        from calibre.ebooks.pml.pmlconverter import footnote_to_html, sidebar_to_html
        from calibre.ebooks.pml.pmlconverter import PML_HTMLizer

        output_dir = os.path.abspath(output_dir)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        title = self.mi.title
        # ``unicode`` does not exist on Python 3; checking for ``bytes``
        # (an alias of ``str`` on Python 2) works on both major versions.
        if isinstance(title, bytes):
            title = title.decode('utf-8', 'replace')
        html = u'<html><head><title>%s</title></head><body>' % title

        pml = u''
        for i in range(1, self.header_record.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % i)
            pml += self.get_text_page(i)
        hizer = PML_HTMLizer()
        html += hizer.parse_pml(pml, 'index.html')
        toc = hizer.get_toc()

        if self.header_record.footnote_count > 0:
            html += '<br /><h1>%s</h1>' % _('Footnotes')
            # The backslash before ``w`` is escaped so the literal contains no
            # invalid escape sequence; the resulting pattern is unchanged.
            footnoteids = re.findall(
                '\\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
            for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_count)):
                self.log.debug('Extracting footnote page %i' % i)
                # Footnotes beyond the end of the id list get an empty id.
                if fid < len(footnoteids):
                    fid = footnoteids[fid]
                else:
                    fid = ''
                html += footnote_to_html(fid, self.decompress_text(i))

        if self.header_record.sidebar_count > 0:
            html += '<br /><h1>%s</h1>' % _('Sidebar')
            sidebarids = re.findall(
                '\\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
            for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_count)):
                self.log.debug('Extracting sidebar page %i' % i)
                # Sidebars beyond the end of the id list get an empty id.
                if sid < len(sidebarids):
                    sid = sidebarids[sid]
                else:
                    sid = ''
                html += sidebar_to_html(sid, self.decompress_text(i))

        html += '</body></html>'

        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
                index.write(html.encode('utf-8'))

        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        images = []
        with CurrentDir(os.path.join(output_dir, 'images/')):
            for i in range(0, self.header_record.num_image_pages):
                name, img = self.get_image(self.header_record.image_data_offset + i)
                images.append(name)
                with open(name, 'wb') as imgf:
                    self.log.debug('Writing image %s to images/' % name)
                    imgf.write(img)

        opf_path = self.create_opf(output_dir, images, toc)

        return opf_path