Exemplos de DocAnalysis em Python, exemplos de calibre.ebooks.conversion.preprocess.DocAnalysis em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: processor.py Projeto: AEliu/calibre

def detect_paragraph_type(txt):
    '''
    Tries to determine the paragraph type of the document.

    block: Paragraphs are separated by a blank line.
    single: Each line is a paragraph.
    print: Each paragraph starts with a 2+ spaces or a tab
           and ends when a new paragraph is reached.
    unformatted: most lines have hard line breaks, few/no blank lines or indents

    returns block, single, print, unformatted
    '''
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
    txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))

    # Check for hard line breaks - true if 55% of the doc breaks in the same region
    docanalysis = DocAnalysis('txt', txt)
    hardbreaks = docanalysis.line_histogram(.55)

    if hardbreaks:
        # Determine print percentage
        tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
        print_percent = tab_line_count / float(txt_line_count)

        # Determine block percentage
        empty_line_count = len(re.findall('(?mu)^\s*$', txt))
        block_percent = empty_line_count / float(txt_line_count)

        # Compare the two types - the type with the larger number of instances wins
        # in cases where only one or the other represents the vast majority of the document neither wins
        if print_percent >= block_percent:
            if .15 <= print_percent <= .75:
                return 'print'
        elif .15 <= block_percent <= .75:
            return 'block'

        # Assume unformatted text with hardbreaks if nothing else matches
        return 'unformatted'

    # return single if hardbreaks is false
    return 'single'

Exemplo n.º 2

0

Exibir arquivo

Arquivo: utils.py Projeto: JimmXinu/calibre

    def __call__(self, html):
        self.log.debug("*********  Heuristic processing HTML  *********")
        # Count the words in the document to estimate how many chapters to look for and whether
        # other types of processing are attempted
        try:
            self.totalwords = self.get_word_count(html)
        except:
            self.log.warn("Can't get wordcount")

        if self.totalwords < 50:
            self.log.warn("flow is too short, not running heuristics")
            return html

        is_abbyy = self.is_abbyy(html)
        if is_abbyy:
            html = self.abbyy_processor(html)

        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
        html = self.arrange_htm_line_endings(html)
        # self.dump(html, 'after_arrange_line_endings')
        if self.cleanup_required():
            # ##### Check Markup ######
            #
            # some lit files don't have any <p> tags or equivalent (generally just plain text between
            # <pre> tags), check and  mark up line endings if required before proceeding
            # fix indents must run after this step
            if self.no_markup(html, 0.1):
                self.log.debug("not enough paragraph markers, adding now")
                # markup using text processing
                html = self.markup_pre(html)

        # Replace series of non-breaking spaces with text-indent
        if getattr(self.extra_opts, 'fix_indents', False):
            html = self.fix_nbsp_indents(html)

        if self.cleanup_required():
            # fix indents must run before this step, as it removes non-breaking spaces
            html = self.cleanup_markup(html)

        is_pdftohtml = self.is_pdftohtml(html)
        if is_pdftohtml:
            self.line_open = "<(?P<outer>p)[^>]*>(\\s*<[ibu][^>]*>)?\\s*"
            self.line_close = "\\s*(</[ibu][^>]*>\\s*)?</(?P=outer)>"

        # ADE doesn't render <br />, change to empty paragraphs
        # html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)

        # Determine whether the document uses interleaved blank lines
        self.blanks_between_paragraphs = self.analyze_blanks(html)

        # detect chapters/sections to match xpath or splitting logic

        if getattr(self.extra_opts, 'markup_chapter_headings', False):
            html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
        # self.dump(html, 'after_chapter_markup')

        if getattr(self.extra_opts, 'italicize_common_cases', False):
            html = self.markup_italicis(html)

        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
        # blank paragraphs then delete blank lines to clean up spacing
        if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
            self.log.debug("deleting blank lines")
            self.blanks_deleted = True
            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
            html = self.blankreg.sub('', html)

        # Determine line ending type
        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
        # that lines can be un-wrapped across page boundaries
        format = self.analyze_line_endings(html)

        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
        # more of the lines break in the same region of the document then unwrapping is required
        docanalysis = DocAnalysis(format, html)
        hardbreaks = docanalysis.line_histogram(.50)
        self.log.debug("Hard line breaks check returned "+unicode_type(hardbreaks))

        # Calculate Length
        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
        length = docanalysis.line_length(unwrap_factor)
        self.log.debug("Median line length is " + unicode_type(length) + ", calculated with " + format + " format")

        # ##### Unwrap lines ######
        if getattr(self.extra_opts, 'unwrap_lines', False):
            # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
            if hardbreaks or unwrap_factor < 0.4:
                self.log.debug("Unwrapping required, unwrapping Lines")
                # Dehyphenate with line length limiters
                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
                html = dehyphenator(html,'html', length)
                html = self.punctuation_unwrap(length, html, 'html')

        if getattr(self.extra_opts, 'dehyphenate', False):
            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
            self.log.debug("Fixing hyphenated content")
            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
            html = dehyphenator(html,'html_cleanup', length)
            html = dehyphenator(html, 'individual_words', length)

        # If still no sections after unwrapping mark split points on lines with no punctuation
        if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
            self.log.debug("Looking for more split points based on punctuation,"
                    " currently have " + unicode_type(self.html_preprocess_sections))
            chapdetect3 = re.compile(
                r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)  # noqa
            html = chapdetect3.sub(self.chapter_break, html)

        if getattr(self.extra_opts, 'renumber_headings', False):
            # search for places where a first or second level heading is immediately followed by another
            # top level heading.  demote the second heading to h3 to prevent splitting between chapter
            # headings and titles, images, etc
            doubleheading = re.compile(
                r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
            html = doubleheading.sub('\\g<firsthead>'+'\n<h3'+'\\g<secondhead>'+'</h3>', html)

        # If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks,
        # style it with the 'whitespace' class.  All remaining blank lines are styled as softbreaks.
        # Multiple sequential blank paragraphs are merged with appropriate margins
        # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
        if getattr(self.extra_opts, 'format_scene_breaks', False):
            self.log.debug('Formatting scene breaks')
            html = re.sub('(?i)<div[^>]*>\\s*<br(\\s?/)?>\\s*</div>', '<p></p>', html)
            html = self.detect_scene_breaks(html)
            html = self.detect_whitespace(html)
            html = self.detect_soft_breaks(html)
            blanks_count = len(self.any_multi_blank.findall(html))
            if blanks_count >= 1:
                html = self.merge_blanks(html, blanks_count)
            detected_scene_break = re.compile(r'<p class="scenebreak"[^>]*>.*?</p>')
            scene_break_count = len(detected_scene_break.findall(html))
            # If the user has enabled scene break replacement, then either softbreaks
            # or 'hard' scene breaks are replaced, depending on which is in use
            # Otherwise separator lines are centered, use a bit larger margin in this case
            replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
            if replacement_break:
                replacement_break = self.markup_user_break(replacement_break)
                if scene_break_count >= 1:
                    html = detected_scene_break.sub(replacement_break, html)
                    html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
                else:
                    html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)

        if self.deleted_nbsps:
            # put back non-breaking spaces in empty paragraphs so they render correctly
            html = self.anyblank.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
        return html

Exemplo n.º 3

0

Exibir arquivo

Arquivo: txt_input.py Projeto: Sabesan2000/SOFE-QUAILTY-FINAL

    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from calibre.ebooks.chardet import detect
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.txt.processor import (
            convert_basic, convert_markdown_with_metadata,
            separate_paragraphs_single_line,
            separate_paragraphs_print_formatted, preserve_spaces,
            detect_paragraph_type, detect_formatting_type,
            normalize_line_endings, convert_textile, remove_indents,
            block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = b''
        log.debug('Reading text from file...')
        length = 0
        base_dir = self.output_dir = getcwd()

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            for x in walk('.'):
                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                    with open(x, 'rb') as tf:
                        txt += tf.read() + b'\n\n'
        else:
            if getattr(stream, 'name', None):
                base_dir = os.path.dirname(stream.name)
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {
                    'md': 'markdown'
                }.get(file_ext, file_ext)
                log.info('File extension indicates particular formatting. '
                         'Forcing formatting type to: %s' %
                         options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
        else:
            det_encoding = detect(txt[:4096])
            det_encoding, confidence = det_encoding['encoding'], det_encoding[
                'confidence']
            if det_encoding and det_encoding.lower().replace(
                    '_',
                    '-').strip() in ('gb2312', 'chinese', 'csiso58gb231280',
                                     'euc-cn', 'euccn', 'eucgb2312-cn',
                                     'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug(
                'Detected input encoding as %s with a confidence of %s%%' %
                (ienc, confidence * 100))
        if not ienc:
            ienc = 'utf-8'
            log.debug(
                'No input encoding specified and could not auto detect using %s'
                % ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8,
                    codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug(
                    'Could not reliably determine paragraph type using block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s' %
                          options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s' %
                      options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print at transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options,
                                              log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(
                options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt, 'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the &nbsp; entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        self.shifted_files = []
        try:
            html = ''
            input_mi = None
            if options.formatting_type == 'markdown':
                log.debug('Running text through markdown conversion...')
                try:
                    input_mi, html = convert_markdown_with_metadata(
                        txt,
                        extensions=[
                            x.strip()
                            for x in options.markdown_extensions.split(',')
                            if x.strip()
                        ])
                except RuntimeError:
                    raise ValueError(
                        'This txt file has malformed markup, it cannot be'
                        ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax'
                    )
                html = self.fix_resources(html, base_dir)
            elif options.formatting_type == 'textile':
                log.debug('Running text through textile conversion...')
                html = convert_textile(txt)
                html = self.fix_resources(html, base_dir)
            else:
                log.debug('Running text through basic conversion...')
                flow_size = getattr(options, 'flow_size', 0)
                html = convert_basic(txt, epub_split_size_kb=flow_size)

            # Run the HTMLized text through the html processing plugin.
            from calibre.customize.ui import plugin_for_input_format
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            options.input_encoding = 'utf-8'
            htmlfile = self.shift_file('index.html', html.encode('utf-8'))
            odi = options.debug_pipeline
            options.debug_pipeline = None
            # Generate oeb from html conversion.
            oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html',
                                     log, {})
            options.debug_pipeline = odi
        finally:
            for x in self.shifted_files:
                os.remove(x)

        # Set metadata from file.
        if input_mi is None:
            from calibre.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        self.html_postprocess_title = input_mi.title

        return oeb

Exemplo n.º 4

0

Exibir arquivo

Arquivo: txt_input.py Projeto: MarioJC/calibre

    def convert(self, stream, options, file_ext, log,
                accelerators):
        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from calibre.ebooks.chardet import detect
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.txt.processor import (convert_basic,
                convert_markdown_with_metadata, separate_paragraphs_single_line,
                separate_paragraphs_print_formatted, preserve_spaces,
                detect_paragraph_type, detect_formatting_type,
                normalize_line_endings, convert_textile, remove_indents,
                block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = ''
        log.debug('Reading text from file...')
        length = 0

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            for x in walk('.'):
                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                    with open(x, 'rb') as tf:
                        txt += tf.read() + '\n\n'
        else:
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
                log.info('File extension indicates particular formatting. '
                        'Forcing formatting type to: %s'%options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
        else:
            det_encoding = detect(txt[:4096])
            det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
            if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
        if not ienc:
            ienc = 'utf-8'
            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug('Could not reliably determine paragraph type using block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s' % options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print at transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt,'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the &nbsp; entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        html = ''
        input_mi = None
        if options.formatting_type == 'markdown':
            log.debug('Running text through markdown conversion...')
            try:
                input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
            except RuntimeError:
                raise ValueError('This txt file has malformed markup, it cannot be'
                    ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
        elif options.formatting_type == 'textile':
            log.debug('Running text through textile conversion...')
            html = convert_textile(txt)
        else:
            log.debug('Running text through basic conversion...')
            flow_size = getattr(options, 'flow_size', 0)
            html = convert_basic(txt, epub_split_size_kb=flow_size)

        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwdu()
        if file_ext != 'txtz' and hasattr(stream, 'name'):
            base = os.path.dirname(stream.name)
        fname = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            fname = 'index%d.html'%c
        htmlfile = open(fname, 'wb')
        with htmlfile:
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log,
                {})
        options.debug_pipeline = odi
        os.remove(htmlfile.name)

        # Set metadata from file.
        if input_mi is None:
            from calibre.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        self.html_postprocess_title = input_mi.title

        return oeb

Exemplo n.º 5

0

Exibir arquivo

Arquivo: utils.py Projeto: Eskibear/kindle-feeder-rails

    def __call__(self, html):
        self.log.debug("*********  Heuristic processing HTML  *********")
        # Count the words in the document to estimate how many chapters to look for and whether
        # other types of processing are attempted
        try:
            self.totalwords = self.get_word_count(html)
        except:
            self.log.warn("Can't get wordcount")

        if self.totalwords < 50:
            self.log.warn("flow is too short, not running heuristics")
            return html

        is_abbyy = self.is_abbyy(html)
        if is_abbyy:
            html = self.abbyy_processor(html)

        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
        html = self.arrange_htm_line_endings(html)
        #self.dump(html, 'after_arrange_line_endings')
        if self.cleanup_required():
            ###### Check Markup ######
            #
            # some lit files don't have any <p> tags or equivalent (generally just plain text between
            # <pre> tags), check and  mark up line endings if required before proceeding
            # fix indents must run after this step
            if self.no_markup(html, 0.1):
                self.log.debug("not enough paragraph markers, adding now")
                # markup using text processing
                html = self.markup_pre(html)

        # Replace series of non-breaking spaces with text-indent
        if getattr(self.extra_opts, 'fix_indents', False):
            html = self.fix_nbsp_indents(html)

        if self.cleanup_required():
            # fix indents must run before this step, as it removes non-breaking spaces
            html = self.cleanup_markup(html)

        is_pdftohtml = self.is_pdftohtml(html)
        if is_pdftohtml:
            self.line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
            self.line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"

        # ADE doesn't render <br />, change to empty paragraphs
        #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)

        # Determine whether the document uses interleaved blank lines
        self.blanks_between_paragraphs = self.analyze_blanks(html)

        # detect chapters/sections to match xpath or splitting logic

        if getattr(self.extra_opts, 'markup_chapter_headings', False):
            html = self.markup_chapters(html, self.totalwords,
                                        self.blanks_between_paragraphs)
        #self.dump(html, 'after_chapter_markup')

        if getattr(self.extra_opts, 'italicize_common_cases', False):
            html = self.markup_italicis(html)

        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
        # blank paragraphs then delete blank lines to clean up spacing
        if self.blanks_between_paragraphs and getattr(
                self.extra_opts, 'delete_blank_paragraphs', False):
            self.log.debug("deleting blank lines")
            self.blanks_deleted = True
            html = self.multi_blank.sub(
                '\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>',
                html)
            html = self.blankreg.sub('', html)

        # Determine line ending type
        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
        # that lines can be un-wrapped across page boundaries
        format = self.analyze_line_endings(html)

        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
        # more of the lines break in the same region of the document then unwrapping is required
        docanalysis = DocAnalysis(format, html)
        hardbreaks = docanalysis.line_histogram(.50)
        self.log.debug("Hard line breaks check returned " +
                       unicode(hardbreaks))

        # Calculate Length
        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
        length = docanalysis.line_length(unwrap_factor)
        self.log.debug("Median line length is " + unicode(length) +
                       ", calculated with " + format + " format")

        ###### Unwrap lines ######
        if getattr(self.extra_opts, 'unwrap_lines', False):
            # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
            if hardbreaks or unwrap_factor < 0.4:
                self.log.debug("Unwrapping required, unwrapping Lines")
                # Dehyphenate with line length limiters
                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
                html = dehyphenator(html, 'html', length)
                html = self.punctuation_unwrap(length, html, 'html')

        if getattr(self.extra_opts, 'dehyphenate', False):
            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
            self.log.debug("Fixing hyphenated content")
            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
            html = dehyphenator(html, 'html_cleanup', length)
            html = dehyphenator(html, 'individual_words', length)

        # If still no sections after unwrapping mark split points on lines with no punctuation
        if self.html_preprocess_sections < self.min_chapters and getattr(
                self.extra_opts, 'markup_chapter_headings', False):
            self.log.debug(
                "Looking for more split points based on punctuation,"
                " currently have " + unicode(self.html_preprocess_sections))
            chapdetect3 = re.compile(
                r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)',
                re.IGNORECASE)
            html = chapdetect3.sub(self.chapter_break, html)

        if getattr(self.extra_opts, 'renumber_headings', False):
            # search for places where a first or second level heading is immediately followed by another
            # top level heading.  demote the second heading to h3 to prevent splitting between chapter
            # headings and titles, images, etc
            doubleheading = re.compile(
                r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>',
                re.IGNORECASE)
            html = doubleheading.sub(
                '\g<firsthead>' + '\n<h3' + '\g<secondhead>' + '</h3>', html)

        # If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks,
        # style it with the 'whitespace' class.  All remaining blank lines are styled as softbreaks.
        # Multiple sequential blank paragraphs are merged with appropriate margins
        # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
        if getattr(self.extra_opts, 'format_scene_breaks', False):
            self.log.debug('Formatting scene breaks')
            html = re.sub('(?i)<div[^>]*>\s*<br(\s?/)?>\s*</div>', '<p></p>',
                          html)
            html = self.detect_scene_breaks(html)
            html = self.detect_whitespace(html)
            html = self.detect_soft_breaks(html)
            blanks_count = len(self.any_multi_blank.findall(html))
            if blanks_count >= 1:
                html = self.merge_blanks(html, blanks_count)
            detected_scene_break = re.compile(
                r'<p class="scenebreak"[^>]*>.*?</p>')
            scene_break_count = len(detected_scene_break.findall(html))
            # If the user has enabled scene break replacement, then either softbreaks
            # or 'hard' scene breaks are replaced, depending on which is in use
            # Otherwise separator lines are centered, use a bit larger margin in this case
            replacement_break = getattr(self.extra_opts,
                                        'replace_scene_breaks', None)
            if replacement_break:
                replacement_break = self.markup_user_break(replacement_break)
                if scene_break_count >= 1:
                    html = detected_scene_break.sub(replacement_break, html)
                    html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>',
                                  replacement_break, html)
                else:
                    html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>',
                                  replacement_break, html)

        if self.deleted_nbsps:
            # put back non-breaking spaces in empty paragraphs so they render correctly
            html = self.anyblank.sub(
                '\n' + r'\g<openline>' + u'\u00a0' + r'\g<closeline>', html)
        return html