def test_strip_text_coords(self):
    """Exercise the coordinate-aware strip helpers of TextBeautifier.

    The same padded sample and the same (start, end) span are fed to the
    two-sided, left-only and right-only variants; each is expected to
    return the trimmed text together with the adjusted span.
    """
    # NOTE(review): the expected start offset (104) suggests more leading
    # whitespace than this literal shows - verify the fixture text.
    sample = ' (A) right of set-off; (B) '
    span_start, span_end = 100, 127

    # Two-sided strip: both coordinates are expected to move inwards.
    both_sides = TextBeautifier.strip_string_coords(
        sample, span_start, span_end)
    self.assertEqual(('(A) right of set-off; (B)', 104, 126), both_sides)

    # Left strip: only the start coordinate is expected to move.
    left_only = TextBeautifier.lstrip_string_coords(
        sample, span_start, span_end)
    self.assertEqual(('(A) right of set-off; (B) ', 104, 127), left_only)

    # Right strip: only the end coordinate is expected to move.
    right_only = TextBeautifier.rstrip_string_coords(
        sample, span_start, span_end)
    self.assertEqual((' (A) right of set-off; (B)', 100, 126), right_only)
def trim_defined_term(
        term: str,
        start: int,
        end: int) -> Tuple[str, int, int, bool]:
    """
    Clean up a candidate defined term and keep its coordinates in sync.

    Steps, in order:
      * if the phrase contains exactly one quoted fragment, keep only that
        fragment (without the surrounding quote characters);
      * remove a pair of quotes / brackets framing the text;
      * replace line breaks with spaces and squeeze runs of spaces;
      * strip punctuation listed in STRIP_PUNCT_SYMBOLS from both ends;
      * strip dots from both ends, or from the left end only when the term
        ends with an abbreviation.

    :param term: a phrase that may contain excess framing symbols
    :param start: the term's start position, may be changed
    :param end: the term's end position, may be changed
    :return: (cleaned term, start, end, flag telling whether the whole
             phrase was inside quotes)
    """
    def _is_quote(char):
        return char in TextBeautifier.QUOTES

    quoted = False

    # A single quoted fragment replaces the whole phrase.
    # NOTE(review): start / end are not adjusted in this branch even though
    # the term may shrink to the quoted fragment - confirm this is intended.
    quote_matches = list(QUOTED_TEXT_RE.finditer(term))
    if len(quote_matches) == 1:
        term = quote_matches[0].group().strip('''\"'“„''')
        quoted = True

    length_before = len(term)
    quotes_before = count_sequence_matches(term, _is_quote)
    term, start, end = TextBeautifier.strip_pair_symbols((term, start, end))

    if len(term) < length_before:
        # Something was cut off; losing more than one quote character
        # means the framing symbols were, most probably, quotes.
        quotes_after = count_sequence_matches(term, _is_quote)
        if quotes_before - quotes_after > 1:
            quoted = True

    # Normalize whitespace: line breaks first, then runs of spaces.
    term = SPACES_RE.sub(' ', term.replace('\n', ' '))

    term, start, end = TextBeautifier.strip_string_coords(
        term, start, end, STRIP_PUNCT_SYMBOLS)

    # A trailing dot belongs to an abbreviation and must survive, so in
    # that case only the left side is stripped of dots.
    strip_dots = (TextBeautifier.lstrip_string_coords
                  if ABBREVIATION_ENDING_RE.search(term)
                  else TextBeautifier.strip_string_coords)
    term, start, end = strip_dots(term, start, end, '.')

    return term, start, end, quoted