def remove_extra_linebreaks(self, result: MarkedUpText) -> None:
    """
    Replace single line breaks inside ordinary paragraphs with a space.

    Paragraph boundaries come from ``result.labels['paragraphs']``; when
    that label is missing or empty, the whole text is handled as a single
    paragraph. A paragraph that looks like a list keeps its line breaks.
    Tables should be preserved as well, but a paragraph cannot currently
    be recognized as a table (when the source is a PDF file).

    :param result: MarkedUpText holding the resulting plain text; the
        replacement is performed through its ``replace_by_regex`` method
    """
    spans = result.labels.get('paragraphs') or [(0, len(result.text))]
    for par_start, par_end in spans:
        chunk = result.text[par_start:par_end]
        # blank / whitespace-only lines do not take part in the decision
        rows = [row for row in chunk.split('\n') if row.strip()]
        if not rows:
            continue

        # count the lines that start like a list item
        list_rows = sum(1 for row in rows if self.re_list_start.match(row))
        # up to a third of the lines (rounded up) may be non-list lines
        # and the paragraph is still treated as a list
        tolerated_plain_rows = math.ceil(len(rows) / 3)
        if len(rows) - list_rows <= tolerated_plain_rows:
            # treated as a list: keep the line breaks as they are
            continue

        result.replace_by_regex(self.re_single_newline, ' ', par_start, par_end)
def test_replace_by_regex_extra_end(self):
    """
    Replace whitespace runs with one space in a text ending in whitespace
    and check both the resulting text and the 'p' label offsets.
    """
    # NOTE(review): the expected offsets below differ from the initial ones,
    # which implies the input literal holds runs of several spaces - confirm
    # the whitespace inside the literal has not been collapsed.
    source = 'A text with extra spaces. '
    marked = MarkedUpText(source, labels={'p': [(7, 12), (22, 29)]})
    whitespace_run = re.compile(r'\s+')

    marked.replace_by_regex(whitespace_run, ' ')

    self.assertEqual('A text with extra spaces. ', marked.text)
    spans = marked.labels['p']
    for index, expected in enumerate([(6, 10), (18, 25)]):
        self.assertEqual(expected, spans[index])
def test_replace_by_regex_extra_longer(self):
    """
    Replace whitespace runs with one space in a longer text carrying three
    'p' labels and check the resulting text and every label offset.
    """
    # NOTE(review): the expected offsets below differ from the initial ones,
    # which implies the input literal holds runs of several spaces - confirm
    # the whitespace inside the literal has not been collapsed.
    source = 'A text with extra spaces, and more spaces'
    marked = MarkedUpText(source,
                          labels={'p': [(7, 12), (22, 32), (41, 46)]})
    whitespace_run = re.compile(r'\s+')

    marked.replace_by_regex(whitespace_run, ' ')

    self.assertEqual('A text with extra spaces, and more spaces', marked.text)
    spans = marked.labels['p']
    for index, expected in enumerate([(6, 10), (18, 28), (35, 40)]):
        self.assertEqual(expected, spans[index])
def test_replace_by_regex_limited(self):
    """
    Check the optional range arguments of replace_by_regex: an explicit
    full range must give the same text as no range at all, while a range
    covering only the first half of the text must give a different text.
    """
    text = """ <p>Here (Improve text segmentation (section / page / paragraph / sentence), section 1.1 Use markup from document parser) I described Tika’s output in XHTML. In short: </p> """
    labels = {'p': [(7, 12), (22, 28)]}
    whitespace_run = re.compile(r'\s+')

    def fresh_markup():
        # every MarkedUpText gets its own copy of the label lists
        return MarkedUpText(
            text, labels={key: list(spans) for key, spans in labels.items()})

    # reference: no range given
    unbounded = fresh_markup()
    unbounded.replace_by_regex(whitespace_run, ' ')

    # explicit range spanning the whole text
    whole_range = fresh_markup()
    whole_range.replace_by_regex(whitespace_run, ' ', 0, len(text))
    self.assertEqual(unbounded.text, whole_range.text)

    # range limited to the first half of the text
    first_half = fresh_markup()
    first_half.replace_by_regex(whitespace_run, ' ', 0, len(text) // 2)
    self.assertNotEqual(unbounded.text, first_half.text)
def test_replace_by_regex_none(self):
    """
    A pattern that matches nothing in the text must leave the text as is.
    """
    source = 'A text with extra spaces.'
    marked = MarkedUpText(source)
    no_match = re.compile(r'AbC')

    marked.replace_by_regex(no_match, ' ')

    self.assertEqual(source, marked.text)