def smarten_punctuation(self):
    preprocessor = HeuristicProcessor(log=self.log)
    for name in self.get_html_names():
        self.log.info("Smartening punctuation for file {0}".format(name))
        html = self.get_raw(name)
        html = html.encode("UTF-8")
        # Fix non-breaking space indents
        html = preprocessor.fix_nbsp_indents(html)
        # Smarten punctuation
        html = smartyPants(html)
        # Ellipsis to HTML entity
        html = re.sub(ur'(?u)(?<=\w)\s?(\.\s+?){2}\.', '…', html,
                      flags=re.UNICODE | re.MULTILINE)
        # Double-dash and unicode char code to em-dash
        html = string.replace(html, '---', ' – ')
        html = string.replace(html, u"\x97", ' – ')
        html = string.replace(html, '--', ' — ')
        html = string.replace(html, u"\u2014", ' — ')
        html = string.replace(html, u"\u2013", ' – ')
        # Fix comment nodes that got mangled
        html = string.replace(html, u'<! — ', u'<!-- ')
        html = string.replace(html, u' — >', u' -->')
        # Remove Unicode replacement characters
        html = string.replace(html, u"\uFFFD", "")
        self.dirty(name)
    self.flush_cache()

def clean_markup(self):
    preprocessor = HeuristicProcessor(log=self.log)
    for name in self.get_html_names():
        self.log.debug("Cleaning markup for file {0}".format(name))
        html = self.get_raw(name)
        html = html.encode("UTF-8")
        # Replace unicode dashes with ASCII representations - smarten
        # punctuation picks these up if asked for
        html = string.replace(html, u"\u2014", ' -- ')
        html = string.replace(html, u"\u2013", ' --- ')
        html = string.replace(html, u"\x97", ' --- ')
        # Get rid of Microsoft cruft
        html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
        html = re.sub(r'(?i)</?st1:\w+>', '', html)
        # Re-open self-closing paragraph tags
        html = re.sub(r'<p[^>/]*/>', '<p></p>', html)
        # Remove empty headings
        html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
        # Remove Unicode replacement characters
        html = string.replace(html, u"\uFFFD", "")
        self.set(name, html)

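# The dash handling in clean_markup() above is the inverse of the mapping used by the
# smarten_punctuation() variants: unicode dashes are flattened to ASCII double/triple
# dashes so that a later smarten pass can turn them back into proper dashes. A tiny
# self-contained sketch of that round trip (the helper names below are illustrative
# only, not calibre APIs):
def _flatten_dashes(text):
    # mirrors clean_markup: em-dash -> ' -- ', en-dash -> ' --- '
    return text.replace(u"\u2014", " -- ").replace(u"\u2013", " --- ")

def _smarten_dashes(text):
    # mirrors smarten_punctuation: '---' -> en-dash, then '--' -> em-dash
    # (longest match replaced first, as in the code above)
    return text.replace("---", u" \u2013 ").replace("--", u" \u2014 ")

assert u"\u2014" in _smarten_dashes(_flatten_dashes(u"wait\u2014no"))
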
def smarten_punctuation(self):
    preprocessor = HeuristicProcessor(log=self.log)
    for name in self.get_html_names():
        self.log.debug("Smartening punctuation for file {0}".format(name))
        html = self.get_raw(name)
        html = html.encode("UTF-8")
        # Fix non-breaking space indents
        html = preprocessor.fix_nbsp_indents(html)
        # Smarten punctuation
        html = smartyPants(html)
        # Ellipsis to HTML entity
        html = re.sub(r'(?u)(?<=\w)\s?(\.\s+?){2}\.', '…', html)
        # Double-dash and unicode char code to em-dash
        html = string.replace(html, '---', ' – ')
        html = string.replace(html, u"\x97", ' – ')
        html = string.replace(html, '--', ' — ')
        html = string.replace(html, u"\u2014", ' — ')
        html = string.replace(html, u"\u2013", ' – ')
        html = string.replace(html, u"...", "…")
        # Remove Unicode replacement characters
        html = string.replace(html, u"\uFFFD", "")
        self.set(name, html)

def __smarten_punctuation_impl(self, name):
    # type: (str) -> None
    """Convert standard punctuation to "smart" punctuation."""
    preprocessor = HeuristicProcessor(log=self.log)
    self.log.debug("Smartening punctuation for file {0}".format(name))
    html = self.raw_data(name, decode=True, normalize_to_nfc=True)
    if html is None:
        raise Exception(
            _(  # noqa: F821 - _ is defined in globals by calibre
                "No HTML content in file {0}").format(name))
    # Fix non-breaking space indents
    html = preprocessor.fix_nbsp_indents(html)
    # Smarten punctuation
    html = smartyPants(html)
    # Ellipsis to HTML entity
    html = ELLIPSIS_RE.sub("…", html)
    # Double-dash and unicode char code to em-dash
    html = html.replace("---", " – ")
    html = html.replace("\x97", " – ")
    html = html.replace("\u2013", " – ")
    html = html.replace("--", " — ")
    html = html.replace("\u2014", " — ")
    # Fix comment nodes that got mangled
    html = html.replace("<! — ", "<!-- ")
    html = html.replace(" — >", " -->")
    self.replace(name, self.parse_xhtml(html))
    self.flush_cache()

def smarten_punctuation(self):
    """Convert standard punctuation to "smart" punctuation."""
    preprocessor = HeuristicProcessor(log=self.log)
    for name in self.html_names():
        self.log.debug("Smartening punctuation for file {0}".format(name))
        html = self.raw_data(name, decode=True, normalize_to_nfc=True)
        if html is None:
            continue
        # Fix non-breaking space indents
        html = preprocessor.fix_nbsp_indents(html)
        # Smarten punctuation
        html = smartyPants(html)
        # Ellipsis to HTML entity
        html = ELLIPSIS_RE.sub("…", html)
        # Double-dash and unicode char code to em-dash
        html = string.replace(html, "---", " – ")
        html = string.replace(html, "\x97", " – ")
        html = string.replace(html, "\u2013", " – ")
        html = string.replace(html, "--", " — ")
        html = string.replace(html, "\u2014", " — ")
        # Fix comment nodes that got mangled
        html = string.replace(html, "<! — ", "<!-- ")
        html = string.replace(html, " — >", " -->")
        with self.open(name, "wb") as f:
            f.write(
                html.encode(self.encoding_map.get(name, self.used_encoding)))

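# ELLIPSIS_RE is used by the two variants above but is not defined in this excerpt.
# A minimal sketch of a plausible module-level definition, assuming it mirrors the
# inline ellipsis pattern used by the older variants; the exact constant in the real
# code base may differ:
import re

ELLIPSIS_RE = re.compile(r'(?u)(?<=\w)\s?(\.\s+?){2}\.')  # assumed: three spaced dots after a word character
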
def smarten_punctuation(html, log=None):
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    preprocessor = HeuristicProcessor(log=log)
    from uuid import uuid4
    start = 'calibre-smartypants-' + str(uuid4())
    stop = 'calibre-smartypants-' + str(uuid4())
    html = html.replace('<!--', start)
    html = html.replace('-->', stop)
    html = preprocessor.fix_nbsp_indents(html)
    html = smartyPants(html)
    html = html.replace(start, '<!--')
    html = html.replace(stop, '-->')
    return substitute_entites(html)

def smarten_punctuation(html, log):
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    preprocessor = HeuristicProcessor(log=log)
    from uuid import uuid4
    start = 'calibre-smartypants-' + str(uuid4())
    stop = 'calibre-smartypants-' + str(uuid4())
    html = html.replace('<!--', start)
    html = html.replace('-->', stop)
    html = preprocessor.fix_nbsp_indents(html)
    html = smartyPants(html)
    html = html.replace(start, '<!--')
    html = html.replace(stop, '-->')
    # convert ellipsis to entities to prevent wrapping
    html = re.sub(r'(?u)(?<=\w)\s?(\.\s?){2}\.', '…', html)
    # convert double dashes to em-dash
    html = re.sub(r'\s--\s', u'\u2014', html)
    return substitute_entites(html)

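# A minimal usage sketch for the standalone helper above (only runnable inside a
# calibre environment, since the helper imports calibre modules). The sample markup
# and the SimpleLogger stand-in are assumptions for illustration; the conversion
# pipeline normally supplies calibre's own log object.
class SimpleLogger(object):  # hypothetical stand-in for calibre's Log
    def debug(self, *args):
        print(' '.join(str(a) for a in args))
    info = warn = error = debug

sample = '<p>"Hello" -- she said . . .</p><!-- keep this comment intact -->'
# The comment markers are swapped for uuid placeholders so smartyPants cannot mangle
# them, then restored before entity substitution.
print(smarten_punctuation(sample, SimpleLogger()))
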
def __call__(self, html, remove_special_chars=None, get_preprocess_html=False):
    if remove_special_chars is not None:
        html = remove_special_chars.sub('', html)
    html = html.replace('\0', '')
    is_pdftohtml = self.is_pdftohtml(html)
    if self.is_baen(html):
        rules = []
    elif self.is_book_designer(html):
        rules = self.BOOK_DESIGNER
    elif is_pdftohtml:
        rules = self.PDFTOHTML
    else:
        rules = []

    start_rules = []

    if not getattr(self.extra_opts, 'keep_ligatures', False):
        html = _ligpat.sub(lambda m: LIGATURES[m.group()], html)

    user_sr_rules = {}

    # Function for processing search and replace
    def do_search_replace(search_pattern, replace_txt):
        from calibre.ebooks.conversion.search_replace import compile_regular_expression
        try:
            search_re = compile_regular_expression(search_pattern)
            if not replace_txt:
                replace_txt = ''
            rules.insert(0, (search_re, replace_txt))
            user_sr_rules[(search_re, replace_txt)] = search_pattern
        except Exception as e:
            self.log.error('Failed to parse %r regexp because %s' % (
                search_pattern, as_unicode(e)))

    # search / replace using the sr?_search / sr?_replace options
    for i in range(1, 4):
        search, replace = 'sr%d_search' % i, 'sr%d_replace' % i
        search_pattern = getattr(self.extra_opts, search, '')
        replace_txt = getattr(self.extra_opts, replace, '')
        if search_pattern:
            do_search_replace(search_pattern, replace_txt)

    # multi-search / replace using the search_replace option
    search_replace = getattr(self.extra_opts, 'search_replace', None)
    if search_replace:
        search_replace = json.loads(search_replace)
        for search_pattern, replace_txt in reversed(search_replace):
            do_search_replace(search_pattern, replace_txt)

    end_rules = []
    # delete soft hyphens - moved here so it's executed after header/footer removal
    if is_pdftohtml:
        # unwrap/delete soft hyphens
        end_rules.append((re.compile(u'[\xad](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: ''))
        # unwrap/delete soft hyphens with formatting
        end_rules.append((re.compile(u'[\xad]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))

    length = -1
    if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
        docanalysis = DocAnalysis('pdf', html)
        length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
        if length:
            # print "The pdf line length returned is " + str(length)
            # unwrap em/en dashes
            end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
            end_rules.append(
                # Un wrap using punctuation
                (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),  # noqa
            )

    for rule in self.PREPROCESS + start_rules:
        html = rule[0].sub(rule[1], html)

    if self.regex_wizard_callback is not None:
        self.regex_wizard_callback(self.current_href, html)

    if get_preprocess_html:
        return html

    def dump(raw, where):
        import os
        dp = getattr(self.extra_opts, 'debug_pipeline', None)
        if dp and os.path.exists(dp):
            odir = os.path.join(dp, 'input')
            if os.path.exists(odir):
                odir = os.path.join(odir, where)
                if not os.path.exists(odir):
                    os.makedirs(odir)
                name, i = None, 0
                while not name or os.path.exists(os.path.join(odir, name)):
                    i += 1
                    name = '%04d.html' % i
                with open(os.path.join(odir, name), 'wb') as f:
                    f.write(raw.encode('utf-8'))

    # dump(html, 'pre-preprocess')

    for rule in rules + end_rules:
        try:
            html = rule[0].sub(rule[1], html)
        except Exception as e:
            if rule in user_sr_rules:
                self.log.error(
                    'User supplied search & replace rule: %s -> %s '
                    'failed with error: %s, ignoring.' % (
                        user_sr_rules[rule], rule[1], e))
            else:
                raise

    if is_pdftohtml and length > -1:
        # Dehyphenate
        dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
        html = dehyphenator(html, 'html', length)

    if is_pdftohtml:
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        pdf_markup = HeuristicProcessor(self.extra_opts, None)
        totalwords = 0
        if pdf_markup.get_word_count(html) > 7000:
            html = pdf_markup.markup_chapters(html, totalwords, True)

    # dump(html, 'post-preprocess')

    # Handle broken XHTML w/ SVG (ugh)
    if 'svg:' in html and SVG_NS not in html:
        html = html.replace('<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
    if 'xlink:' in html and XLINK_NS not in html:
        html = html.replace('<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)

    html = XMLDECL_RE.sub('', html)

    if getattr(self.extra_opts, 'asciiize', False):
        from calibre.utils.localization import get_udc
        from calibre.utils.mreplace import MReplace
        unihandecoder = get_udc()
        mr = MReplace(data={u'«': u'<'*3, u'»': u'>'*3})
        html = mr.mreplace(html)
        html = unihandecoder.decode(html)

    if getattr(self.extra_opts, 'enable_heuristics', False):
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        preprocessor = HeuristicProcessor(self.extra_opts, self.log)
        html = preprocessor(html)

    if is_pdftohtml:
        html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')

    if getattr(self.extra_opts, 'smarten_punctuation', False):
        html = smarten_punctuation(html, self.log)

    try:
        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
    except AttributeError:
        unsupported_unicode_chars = u''
    if unsupported_unicode_chars:
        from calibre.utils.localization import get_udc
        unihandecoder = get_udc()
        for char in unsupported_unicode_chars:
            asciichar = unihandecoder.decode(char)
            html = html.replace(char, asciichar)

    return html

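# The 'search_replace' option consumed by __call__ above is parsed with json.loads()
# and iterated in reverse as (search_pattern, replace_txt) pairs, so the expected
# value is a JSON array of two-element arrays. A small illustration of building such
# a value (the patterns themselves are made-up examples):
import json

search_replace_opt = json.dumps([
    [r'\bteh\b', 'the'],                     # fix a common typo
    [r'<div class="advert">.*?</div>', ''],  # drop an unwanted block
])
# getattr(self.extra_opts, 'search_replace', None) would return this string; each
# pattern is then compiled and inserted at the front of the rule list.
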
def convert(self, stream, options, file_ext, log, accelerators):
    from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
    from calibre.ebooks.chardet import detect
    from calibre.utils.zipfile import ZipFile
    from calibre.ebooks.txt.processor import (
        convert_basic, convert_markdown_with_metadata,
        separate_paragraphs_single_line, separate_paragraphs_print_formatted,
        preserve_spaces, detect_paragraph_type, detect_formatting_type,
        normalize_line_endings, convert_textile, remove_indents,
        block_to_single_line, separate_hard_scene_breaks)

    self.log = log
    txt = ''
    log.debug('Reading text from file...')
    length = 0

    # Extract content from zip archive.
    if file_ext == 'txtz':
        zf = ZipFile(stream)
        zf.extractall('.')

        for x in walk('.'):
            if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                with open(x, 'rb') as tf:
                    txt += tf.read() + '\n\n'
    else:
        txt = stream.read()

    if file_ext in {'md', 'textile', 'markdown'}:
        options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
        log.info('File extension indicates particular formatting. '
                 'Forcing formatting type to: %s' % options.formatting_type)
        options.paragraph_type = 'off'

    # Get the encoding of the document.
    if options.input_encoding:
        ienc = options.input_encoding
        log.debug('Using user specified input encoding of %s' % ienc)
    else:
        det_encoding = detect(txt)
        det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
        if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
            # Microsoft Word exports to HTML with encoding incorrectly set to
            # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
            det_encoding = 'gbk'
        ienc = det_encoding
        log.debug('Detected input encoding as %s with a confidence of %s%%' % (
            ienc, confidence * 100))
    if not ienc:
        ienc = 'utf-8'
        log.debug('No input encoding specified and could not auto detect using %s' % ienc)

    # Remove BOM from start of txt as its presence can confuse markdown
    import codecs
    for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8,
                codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        if txt.startswith(bom):
            txt = txt[len(bom):]
            break

    txt = txt.decode(ienc, 'replace')

    # Replace entities
    txt = _ent_pat.sub(xml_entity_to_unicode, txt)

    # Normalize line endings
    txt = normalize_line_endings(txt)

    # Determine the paragraph type of the document.
    if options.paragraph_type == 'auto':
        options.paragraph_type = detect_paragraph_type(txt)
        if options.paragraph_type == 'unknown':
            log.debug('Could not reliably determine paragraph type using block')
            options.paragraph_type = 'block'
        else:
            log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

    # Detect formatting
    if options.formatting_type == 'auto':
        options.formatting_type = detect_formatting_type(txt)
        log.debug('Auto detected formatting as %s' % options.formatting_type)

    if options.formatting_type == 'heuristic':
        setattr(options, 'enable_heuristics', True)
        setattr(options, 'unwrap_lines', False)
        setattr(options, 'smarten_punctuation', True)

    # Reformat paragraphs to block formatting based on the detected type.
    # We don't check for block because the processor assumes block.
    # single and print are transformed to block for processing.
    if options.paragraph_type == 'single':
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'print':
        txt = separate_hard_scene_breaks(txt)
        txt = separate_paragraphs_print_formatted(txt)
        txt = block_to_single_line(txt)
    elif options.paragraph_type == 'unformatted':
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        # unwrap lines based on punctuation
        docanalysis = DocAnalysis('txt', txt)
        length = docanalysis.line_length(.5)
        preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
        txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'block':
        txt = separate_hard_scene_breaks(txt)
        txt = block_to_single_line(txt)

    if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
        docanalysis = DocAnalysis('txt', txt)
        if not length:
            length = docanalysis.line_length(.5)
        dehyphenator = Dehyphenator(options.verbose, log=self.log)
        txt = dehyphenator(txt, 'txt', length)

    # User requested transformation on the text.
    if options.txt_in_remove_indents:
        txt = remove_indents(txt)

    # Preserve spaces will replace multiple spaces to a space
    # followed by the &nbsp; entity.
    if options.preserve_spaces:
        txt = preserve_spaces(txt)

    # Process the text using the appropriate text processor.
    html = ''
    input_mi = None
    if options.formatting_type == 'markdown':
        log.debug('Running text through markdown conversion...')
        try:
            input_mi, html = convert_markdown_with_metadata(
                txt,
                extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
        except RuntimeError:
            raise ValueError(
                'This txt file has malformed markup, it cannot be'
                ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
    elif options.formatting_type == 'textile':
        log.debug('Running text through textile conversion...')
        html = convert_textile(txt)
    else:
        log.debug('Running text through basic conversion...')
        flow_size = getattr(options, 'flow_size', 0)
        html = convert_basic(txt, epub_split_size_kb=flow_size)

    # Run the HTMLized text through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'
    base = os.getcwdu()
    if file_ext != 'txtz' and hasattr(stream, 'name'):
        base = os.path.dirname(stream.name)
    fname = os.path.join(base, 'index.html')
    c = 0
    while os.path.exists(fname):
        c += 1
        fname = 'index%d.html' % c
    htmlfile = open(fname, 'wb')
    with htmlfile:
        htmlfile.write(html.encode('utf-8'))
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(htmlfile.name)

    # Set metadata from file.
    if input_mi is None:
        from calibre.customize.ui import get_file_type_metadata
        input_mi = get_file_type_metadata(stream, file_ext)
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
    self.html_postprocess_title = input_mi.title

    return oeb