Пример #1
0
 def postprocess_book(self, oeb, opts, log):
     from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
     for item in oeb.spine:
         root = item.data
         if not hasattr(root, 'xpath'): continue
         for bad in ('metadata', 'guide'):
             metadata = XPath('//h:'+bad)(root)
             if metadata:
                 for x in metadata:
                     x.getparent().remove(x)
         body = XPath('//h:body')(root)
         if body:
             body = body[0]
             if len(body) == 1 and body[0].tag == XHTML('pre'):
                 pre = body[0]
                 from calibre.ebooks.txt.processor import convert_basic, \
                     separate_paragraphs_single_line
                 from calibre.ebooks.chardet import xml_to_unicode
                 from lxml import etree
                 import copy
                 html = separate_paragraphs_single_line(pre.text)
                 html = convert_basic(html).replace('<html>',
                         '<html xmlns="%s">'%XHTML_NS)
                 html = xml_to_unicode(html, strip_encoding_pats=True,
                         resolve_entities=True)[0]
                 root = etree.fromstring(html)
                 body = XPath('//h:body')(root)
                 pre.tag = XHTML('div')
                 pre.text = ''
                 for elem in body:
                     ne = copy.deepcopy(elem)
                     pre.append(ne)
Пример #2
0
 def postprocess_book(self, oeb, opts, log):
     from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
     for item in oeb.spine:
         root = item.data
         if not hasattr(root, 'xpath'): continue
         for bad in ('metadata', 'guide'):
             metadata = XPath('//h:' + bad)(root)
             if metadata:
                 for x in metadata:
                     x.getparent().remove(x)
         body = XPath('//h:body')(root)
         if body:
             body = body[0]
             if len(body) == 1 and body[0].tag == XHTML('pre'):
                 pre = body[0]
                 from calibre.ebooks.txt.processor import convert_basic, \
                     separate_paragraphs_single_line
                 from calibre.ebooks.chardet import xml_to_unicode
                 from lxml import etree
                 import copy
                 html = separate_paragraphs_single_line(pre.text)
                 html = convert_basic(html).replace(
                     '<html>', '<html xmlns="%s">' % XHTML_NS)
                 html = xml_to_unicode(html,
                                       strip_encoding_pats=True,
                                       resolve_entities=True)[0]
                 root = etree.fromstring(html)
                 body = XPath('//h:body')(root)
                 pre.tag = XHTML('div')
                 pre.text = ''
                 for elem in body:
                     ne = copy.deepcopy(elem)
                     pre.append(ne)
Пример #3
0
 def postprocess_book(self, oeb, opts, log):
     from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
     for item in oeb.spine:
         root = item.data
         if not hasattr(root, 'xpath'):
             continue
         for bad in ('metadata', 'guide'):
             metadata = XPath('//h:' + bad)(root)
             if metadata:
                 for x in metadata:
                     x.getparent().remove(x)
         body = XPath('//h:body')(root)
         if body:
             body = body[0]
             if len(body) == 1 and body[0].tag == XHTML('pre'):
                 pre = body[0]
                 from calibre.ebooks.txt.processor import convert_basic, \
                     separate_paragraphs_single_line
                 from calibre.ebooks.chardet import xml_to_unicode
                 from calibre.utils.xml_parse import safe_xml_fromstring
                 import copy
                 self.log(
                     'LIT file with all text in singe <pre> tag detected')
                 html = separate_paragraphs_single_line(pre.text)
                 html = convert_basic(html).replace(
                     '<html>', '<html xmlns="%s">' % XHTML_NS)
                 html = xml_to_unicode(html,
                                       strip_encoding_pats=True,
                                       resolve_entities=True)[0]
                 if opts.smarten_punctuation:
                     # SmartyPants skips text inside <pre> tags
                     from calibre.ebooks.conversion.preprocess import smarten_punctuation
                     html = smarten_punctuation(html, self.log)
                 root = safe_xml_fromstring(html)
                 body = XPath('//h:body')(root)
                 pre.tag = XHTML('div')
                 pre.text = ''
                 for elem in body:
                     ne = copy.deepcopy(elem)
                     pre.append(ne)
Пример #4
0
 def txt_process(self, match):
     from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs_single_line
     content = match.group('text')
     content = separate_paragraphs_single_line(content)
     content = convert_basic(content, epub_split_size_kb=0)
     return content
Пример #5
0
 def txt_process(self, match):
     from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs_single_line
     content = match.group('text')
     content = separate_paragraphs_single_line(content)
     content = convert_basic(content, epub_split_size_kb=0)
     return content
Пример #6
0
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from calibre.ebooks.chardet import detect
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.txt.processor import (
            convert_basic, convert_markdown_with_metadata,
            separate_paragraphs_single_line,
            separate_paragraphs_print_formatted, preserve_spaces,
            detect_paragraph_type, detect_formatting_type,
            normalize_line_endings, convert_textile, remove_indents,
            block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = ''
        log.debug('Reading text from file...')
        length = 0

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            for x in walk('.'):
                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                    with open(x, 'rb') as tf:
                        txt += tf.read() + '\n\n'
        else:
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {
                    'md': 'markdown'
                }.get(file_ext, file_ext)
                log.info('File extension indicates particular formatting. '
                         'Forcing formatting type to: %s' %
                         options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
        else:
            det_encoding = detect(txt)
            det_encoding, confidence = det_encoding['encoding'], det_encoding[
                'confidence']
            if det_encoding and det_encoding.lower().replace(
                    '_',
                    '-').strip() in ('gb2312', 'chinese', 'csiso58gb231280',
                                     'euc-cn', 'euccn', 'eucgb2312-cn',
                                     'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug(
                'Detected input encoding as %s with a confidence of %s%%' %
                (ienc, confidence * 100))
        if not ienc:
            ienc = 'utf-8'
            log.debug(
                'No input encoding specified and could not auto detect using %s'
                % ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8,
                    codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug(
                    'Could not reliably determine paragraph type using block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s' %
                          options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s' %
                      options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print at transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options,
                                              log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(
                options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt, 'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the &nbsp; entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        html = ''
        input_mi = None
        if options.formatting_type == 'markdown':
            log.debug('Running text through markdown conversion...')
            try:
                input_mi, html = convert_markdown_with_metadata(
                    txt,
                    extensions=[
                        x.strip()
                        for x in options.markdown_extensions.split(',')
                        if x.strip()
                    ])
            except RuntimeError:
                raise ValueError(
                    'This txt file has malformed markup, it cannot be'
                    ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax'
                )
        elif options.formatting_type == 'textile':
            log.debug('Running text through textile conversion...')
            html = convert_textile(txt)
        else:
            log.debug('Running text through basic conversion...')
            flow_size = getattr(options, 'flow_size', 0)
            html = convert_basic(txt, epub_split_size_kb=flow_size)

        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwdu()
        if file_ext != 'txtz' and hasattr(stream, 'name'):
            base = os.path.dirname(stream.name)
        fname = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            fname = 'index%d.html' % c
        htmlfile = open(fname, 'wb')
        with htmlfile:
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html',
                                 log, {})
        options.debug_pipeline = odi
        os.remove(htmlfile.name)

        # Set metadata from file.
        if input_mi is None:
            from calibre.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        self.html_postprocess_title = input_mi.title

        return oeb
Пример #7
0
    def convert(self, stream, options, file_ext, log,
                accelerators):
        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from calibre.ebooks.chardet import detect
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.txt.processor import (convert_basic,
                convert_markdown_with_metadata, separate_paragraphs_single_line,
                separate_paragraphs_print_formatted, preserve_spaces,
                detect_paragraph_type, detect_formatting_type,
                normalize_line_endings, convert_textile, remove_indents,
                block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = ''
        log.debug('Reading text from file...')
        length = 0

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            for x in walk('.'):
                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                    with open(x, 'rb') as tf:
                        txt += tf.read() + '\n\n'
        else:
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
                log.info('File extension indicates particular formatting. '
                        'Forcing formatting type to: %s'%options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
        else:
            det_encoding = detect(txt[:4096])
            det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
            if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
        if not ienc:
            ienc = 'utf-8'
            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug('Could not reliably determine paragraph type using block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s' % options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print at transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt,'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the &nbsp; entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        html = ''
        input_mi = None
        if options.formatting_type == 'markdown':
            log.debug('Running text through markdown conversion...')
            try:
                input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
            except RuntimeError:
                raise ValueError('This txt file has malformed markup, it cannot be'
                    ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
        elif options.formatting_type == 'textile':
            log.debug('Running text through textile conversion...')
            html = convert_textile(txt)
        else:
            log.debug('Running text through basic conversion...')
            flow_size = getattr(options, 'flow_size', 0)
            html = convert_basic(txt, epub_split_size_kb=flow_size)

        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwdu()
        if file_ext != 'txtz' and hasattr(stream, 'name'):
            base = os.path.dirname(stream.name)
        fname = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            fname = 'index%d.html'%c
        htmlfile = open(fname, 'wb')
        with htmlfile:
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log,
                {})
        options.debug_pipeline = odi
        os.remove(htmlfile.name)

        # Set metadata from file.
        if input_mi is None:
            from calibre.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        self.html_postprocess_title = input_mi.title

        return oeb