Exemplo n.º 1
0
def detect_paragraph_type(txt):
    '''
    Tries to determine the paragraph type of the document.

    block: Paragraphs are separated by a blank line.
    single: Each line is a paragraph.
    print: Each paragraph starts with a 2+ spaces or a tab
           and ends when a new paragraph is reached.
    unformatted: most lines have hard line breaks, few/no blank lines or indents

    returns block, single, print, unformatted
    '''
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
    txt_line_count = len(re.findall('(?mu)^\s*.+$', txt))

    # Check for hard line breaks - true if 55% of the doc breaks in the same region
    docanalysis = DocAnalysis('txt', txt)
    hardbreaks = docanalysis.line_histogram(.55)

    if hardbreaks:
        # Determine print percentage
        tab_line_count = len(re.findall('(?mu)^(\t|\s{2,}).+$', txt))
        print_percent = tab_line_count / float(txt_line_count)

        # Determine block percentage
        empty_line_count = len(re.findall('(?mu)^\s*$', txt))
        block_percent = empty_line_count / float(txt_line_count)

        # Compare the two types - the type with the larger number of instances wins
        # in cases where only one or the other represents the vast majority of the document neither wins
        if print_percent >= block_percent:
            if .15 <= print_percent <= .75:
                return 'print'
        elif .15 <= block_percent <= .75:
            return 'block'

        # Assume unformatted text with hardbreaks if nothing else matches
        return 'unformatted'

    # return single if hardbreaks is false
    return 'single'
Exemplo n.º 2
0
def detect_paragraph_type(txt):
    '''
    Tries to determine the paragraph type of the document.

    block: Paragraphs are separated by a blank line.
    single: Each line is a paragraph.
    print: Each paragraph starts with a 2+ spaces or a tab
           and ends when a new paragraph is reached.
    unformatted: most lines have hard line breaks, few/no blank lines or indents

    returns block, single, print, unformatted
    '''
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
    txt_line_count = len(re.findall(r'(?mu)^\s*.+$', txt))

    # Check for hard line breaks - true if 55% of the doc breaks in the same region
    docanalysis = DocAnalysis('txt', txt)
    hardbreaks = docanalysis.line_histogram(.55)

    if hardbreaks:
        # Determine print percentage
        tab_line_count = len(re.findall(r'(?mu)^(\t|\s{2,}).+$', txt))
        print_percent = tab_line_count / float(txt_line_count)

        # Determine block percentage
        empty_line_count = len(re.findall(r'(?mu)^\s*$', txt))
        block_percent = empty_line_count / float(txt_line_count)

        # Compare the two types - the type with the larger number of instances wins
        # in cases where only one or the other represents the vast majority of the document neither wins
        if print_percent >= block_percent:
            if .15 <= print_percent <= .75:
                return 'print'
        elif .15 <= block_percent <= .75:
            return 'block'

        # Assume unformatted text with hardbreaks if nothing else matches
        return 'unformatted'

    # return single if hardbreaks is false
    return 'single'
Exemplo n.º 3
0
    def __call__(self, html):
        self.log.debug("*********  Heuristic processing HTML  *********")
        # Count the words in the document to estimate how many chapters to look for and whether
        # other types of processing are attempted
        try:
            self.totalwords = self.get_word_count(html)
        except:
            self.log.warn("Can't get wordcount")

        if self.totalwords < 50:
            self.log.warn("flow is too short, not running heuristics")
            return html

        is_abbyy = self.is_abbyy(html)
        if is_abbyy:
            html = self.abbyy_processor(html)

        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
        html = self.arrange_htm_line_endings(html)
        # self.dump(html, 'after_arrange_line_endings')
        if self.cleanup_required():
            # ##### Check Markup ######
            #
            # some lit files don't have any <p> tags or equivalent (generally just plain text between
            # <pre> tags), check and  mark up line endings if required before proceeding
            # fix indents must run after this step
            if self.no_markup(html, 0.1):
                self.log.debug("not enough paragraph markers, adding now")
                # markup using text processing
                html = self.markup_pre(html)

        # Replace series of non-breaking spaces with text-indent
        if getattr(self.extra_opts, 'fix_indents', False):
            html = self.fix_nbsp_indents(html)

        if self.cleanup_required():
            # fix indents must run before this step, as it removes non-breaking spaces
            html = self.cleanup_markup(html)

        is_pdftohtml = self.is_pdftohtml(html)
        if is_pdftohtml:
            self.line_open = "<(?P<outer>p)[^>]*>(\\s*<[ibu][^>]*>)?\\s*"
            self.line_close = "\\s*(</[ibu][^>]*>\\s*)?</(?P=outer)>"

        # ADE doesn't render <br />, change to empty paragraphs
        # html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)

        # Determine whether the document uses interleaved blank lines
        self.blanks_between_paragraphs = self.analyze_blanks(html)

        # detect chapters/sections to match xpath or splitting logic

        if getattr(self.extra_opts, 'markup_chapter_headings', False):
            html = self.markup_chapters(html, self.totalwords,
                                        self.blanks_between_paragraphs)
        # self.dump(html, 'after_chapter_markup')

        if getattr(self.extra_opts, 'italicize_common_cases', False):
            html = self.markup_italicis(html)

        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
        # blank paragraphs then delete blank lines to clean up spacing
        if self.blanks_between_paragraphs and getattr(
                self.extra_opts, 'delete_blank_paragraphs', False):
            self.log.debug("deleting blank lines")
            self.blanks_deleted = True
            html = self.multi_blank.sub(
                '\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>',
                html)
            html = self.blankreg.sub('', html)

        # Determine line ending type
        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
        # that lines can be un-wrapped across page boundaries
        format = self.analyze_line_endings(html)

        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
        # more of the lines break in the same region of the document then unwrapping is required
        docanalysis = DocAnalysis(format, html)
        hardbreaks = docanalysis.line_histogram(.50)
        self.log.debug("Hard line breaks check returned " +
                       unicode_type(hardbreaks))

        # Calculate Length
        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
        length = docanalysis.line_length(unwrap_factor)
        self.log.debug("Median line length is " + unicode_type(length) +
                       ", calculated with " + format + " format")

        # ##### Unwrap lines ######
        if getattr(self.extra_opts, 'unwrap_lines', False):
            # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
            if hardbreaks or unwrap_factor < 0.4:
                self.log.debug("Unwrapping required, unwrapping Lines")
                # Dehyphenate with line length limiters
                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
                html = dehyphenator(html, 'html', length)
                html = self.punctuation_unwrap(length, html, 'html')

        if getattr(self.extra_opts, 'dehyphenate', False):
            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
            self.log.debug("Fixing hyphenated content")
            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
            html = dehyphenator(html, 'html_cleanup', length)
            html = dehyphenator(html, 'individual_words', length)

        # If still no sections after unwrapping mark split points on lines with no punctuation
        if self.html_preprocess_sections < self.min_chapters and getattr(
                self.extra_opts, 'markup_chapter_headings', False):
            self.log.debug(
                "Looking for more split points based on punctuation,"
                " currently have " +
                unicode_type(self.html_preprocess_sections))
            chapdetect3 = re.compile(
                r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)'
                r'(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'
                r'.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*'
                r'(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)',
                re.IGNORECASE)
            html = chapdetect3.sub(self.chapter_break, html)

        if getattr(self.extra_opts, 'renumber_headings', False):
            # search for places where a first or second level heading is immediately followed by another
            # top level heading.  demote the second heading to h3 to prevent splitting between chapter
            # headings and titles, images, etc
            doubleheading = re.compile(
                r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>',
                re.IGNORECASE)
            html = doubleheading.sub(
                '\\g<firsthead>' + '\n<h3' + '\\g<secondhead>' + '</h3>', html)

        # If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks,
        # style it with the 'whitespace' class.  All remaining blank lines are styled as softbreaks.
        # Multiple sequential blank paragraphs are merged with appropriate margins
        # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
        if getattr(self.extra_opts, 'format_scene_breaks', False):
            self.log.debug('Formatting scene breaks')
            html = re.sub('(?i)<div[^>]*>\\s*<br(\\s?/)?>\\s*</div>',
                          '<p></p>', html)
            html = self.detect_scene_breaks(html)
            html = self.detect_whitespace(html)
            html = self.detect_soft_breaks(html)
            blanks_count = len(self.any_multi_blank.findall(html))
            if blanks_count >= 1:
                html = self.merge_blanks(html, blanks_count)
            detected_scene_break = re.compile(
                r'<p class="scenebreak"[^>]*>.*?</p>')
            scene_break_count = len(detected_scene_break.findall(html))
            # If the user has enabled scene break replacement, then either softbreaks
            # or 'hard' scene breaks are replaced, depending on which is in use
            # Otherwise separator lines are centered, use a bit larger margin in this case
            replacement_break = getattr(self.extra_opts,
                                        'replace_scene_breaks', None)
            if replacement_break:
                replacement_break = self.markup_user_break(replacement_break)
                if scene_break_count >= 1:
                    html = detected_scene_break.sub(replacement_break, html)
                    html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>',
                                  replacement_break, html)
                else:
                    html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>',
                                  replacement_break, html)

        if self.deleted_nbsps:
            # put back non-breaking spaces in empty paragraphs so they render correctly
            html = self.anyblank.sub(
                '\n' + r'\g<openline>' + '\u00a0' + r'\g<closeline>', html)
        return html
Exemplo n.º 4
0
    def __call__(self, html):
        self.log.debug("*********  Heuristic processing HTML  *********")
        # Count the words in the document to estimate how many chapters to look for and whether
        # other types of processing are attempted
        try:
            self.totalwords = self.get_word_count(html)
        except:
            self.log.warn("Can't get wordcount")

        if self.totalwords < 50:
            self.log.warn("flow is too short, not running heuristics")
            return html

        is_abbyy = self.is_abbyy(html)
        if is_abbyy:
            html = self.abbyy_processor(html)

        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
        html = self.arrange_htm_line_endings(html)
        # self.dump(html, 'after_arrange_line_endings')
        if self.cleanup_required():
            # ##### Check Markup ######
            #
            # some lit files don't have any <p> tags or equivalent (generally just plain text between
            # <pre> tags), check and  mark up line endings if required before proceeding
            # fix indents must run after this step
            if self.no_markup(html, 0.1):
                self.log.debug("not enough paragraph markers, adding now")
                # markup using text processing
                html = self.markup_pre(html)

        # Replace series of non-breaking spaces with text-indent
        if getattr(self.extra_opts, 'fix_indents', False):
            html = self.fix_nbsp_indents(html)

        if self.cleanup_required():
            # fix indents must run before this step, as it removes non-breaking spaces
            html = self.cleanup_markup(html)

        is_pdftohtml = self.is_pdftohtml(html)
        if is_pdftohtml:
            self.line_open = "<(?P<outer>p)[^>]*>(\\s*<[ibu][^>]*>)?\\s*"
            self.line_close = "\\s*(</[ibu][^>]*>\\s*)?</(?P=outer)>"

        # ADE doesn't render <br />, change to empty paragraphs
        # html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)

        # Determine whether the document uses interleaved blank lines
        self.blanks_between_paragraphs = self.analyze_blanks(html)

        # detect chapters/sections to match xpath or splitting logic

        if getattr(self.extra_opts, 'markup_chapter_headings', False):
            html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
        # self.dump(html, 'after_chapter_markup')

        if getattr(self.extra_opts, 'italicize_common_cases', False):
            html = self.markup_italicis(html)

        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
        # blank paragraphs then delete blank lines to clean up spacing
        if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
            self.log.debug("deleting blank lines")
            self.blanks_deleted = True
            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
            html = self.blankreg.sub('', html)

        # Determine line ending type
        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
        # that lines can be un-wrapped across page boundaries
        format = self.analyze_line_endings(html)

        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
        # more of the lines break in the same region of the document then unwrapping is required
        docanalysis = DocAnalysis(format, html)
        hardbreaks = docanalysis.line_histogram(.50)
        self.log.debug("Hard line breaks check returned "+unicode_type(hardbreaks))

        # Calculate Length
        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
        length = docanalysis.line_length(unwrap_factor)
        self.log.debug("Median line length is " + unicode_type(length) + ", calculated with " + format + " format")

        # ##### Unwrap lines ######
        if getattr(self.extra_opts, 'unwrap_lines', False):
            # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
            if hardbreaks or unwrap_factor < 0.4:
                self.log.debug("Unwrapping required, unwrapping Lines")
                # Dehyphenate with line length limiters
                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
                html = dehyphenator(html,'html', length)
                html = self.punctuation_unwrap(length, html, 'html')

        if getattr(self.extra_opts, 'dehyphenate', False):
            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
            self.log.debug("Fixing hyphenated content")
            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
            html = dehyphenator(html,'html_cleanup', length)
            html = dehyphenator(html, 'individual_words', length)

        # If still no sections after unwrapping mark split points on lines with no punctuation
        if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
            self.log.debug("Looking for more split points based on punctuation,"
                    " currently have " + unicode_type(self.html_preprocess_sections))
            chapdetect3 = re.compile(
                r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)  # noqa
            html = chapdetect3.sub(self.chapter_break, html)

        if getattr(self.extra_opts, 'renumber_headings', False):
            # search for places where a first or second level heading is immediately followed by another
            # top level heading.  demote the second heading to h3 to prevent splitting between chapter
            # headings and titles, images, etc
            doubleheading = re.compile(
                r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
            html = doubleheading.sub('\\g<firsthead>'+'\n<h3'+'\\g<secondhead>'+'</h3>', html)

        # If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks,
        # style it with the 'whitespace' class.  All remaining blank lines are styled as softbreaks.
        # Multiple sequential blank paragraphs are merged with appropriate margins
        # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
        if getattr(self.extra_opts, 'format_scene_breaks', False):
            self.log.debug('Formatting scene breaks')
            html = re.sub('(?i)<div[^>]*>\\s*<br(\\s?/)?>\\s*</div>', '<p></p>', html)
            html = self.detect_scene_breaks(html)
            html = self.detect_whitespace(html)
            html = self.detect_soft_breaks(html)
            blanks_count = len(self.any_multi_blank.findall(html))
            if blanks_count >= 1:
                html = self.merge_blanks(html, blanks_count)
            detected_scene_break = re.compile(r'<p class="scenebreak"[^>]*>.*?</p>')
            scene_break_count = len(detected_scene_break.findall(html))
            # If the user has enabled scene break replacement, then either softbreaks
            # or 'hard' scene breaks are replaced, depending on which is in use
            # Otherwise separator lines are centered, use a bit larger margin in this case
            replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
            if replacement_break:
                replacement_break = self.markup_user_break(replacement_break)
                if scene_break_count >= 1:
                    html = detected_scene_break.sub(replacement_break, html)
                    html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
                else:
                    html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)

        if self.deleted_nbsps:
            # put back non-breaking spaces in empty paragraphs so they render correctly
            html = self.anyblank.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
        return html