def test_strip_pair_symbols(self):
    """Outer quote+brace framing is removed while inner quoted chunks stay."""
    for source, expected in (
            ('"(A right of set-off; B)"', 'A right of set-off; B'),
            ('("A right" of set-off; "B")', '"A right" of set-off; "B"')):
        self.assertEqual(expected, TextBeautifier.strip_pair_symbols(source))
def test_unbalanced_braces(self):
    """Unbalanced braces are re-paired or removed by unify_quotes_braces."""
    samples = [
        ('{x + 3) *[(y-1)/(y+1^2]^3',
         '{x + 3} *(y-1)/(y+1^2)^3'),
        ('Ma tem ca nu cred ((in general) in legatura printre aceste fapte (',
         'Ma tem ca nu cred (in general) in legatura printre aceste fapte '),
    ]
    for source, expected in samples:
        self.assertEqual(expected, TextBeautifier.unify_quotes_braces(source))
def test_strip_pair_symbols_untouched(self):
    """Strings without a removable framing pair must come back unchanged."""
    for source in ('(A) right of set-off; (B)',
                   '(A ( right) of set-off; (B)',
                   '"(A ( right)" "of set-off; (B)"'):
        self.assertEqual(source, TextBeautifier.strip_pair_symbols(source))
def test_strip_pair_symbols_coords(self):
    """Stripping framing symbols shifts the (start, end) coordinates too."""
    cases = [
        ((' "(A right of set-off; B)"', 2, 23),
         ('A right of set-off; B', 5, 21)),
        (('("A right" of set-off; "B")', 100, 119),
         ('"A right" of set-off; "B"', 101, 118)),
        ((' "A right" of set-off; "B" ', 100, 119),
         ('"A right" of set-off; "B"', 102, 118)),
    ]
    for source, expected in cases:
        self.assertEqual(expected, TextBeautifier.strip_pair_symbols(source))
def test_negative(self):
    """Already balanced text must pass through unify_quotes_braces unchanged."""
    for source in ('(x + 3) *[(y-1)/(y+1)^2]^3',
                   '"Jupyter", Venus and "Mars"',
                   "Let' get loud"):
        self.assertEqual(source, TextBeautifier.unify_quotes_braces(source))
def test_strip_text_coords(self):
    """strip / lstrip / rstrip coordinate variants move start and end accordingly."""
    source = ' (A) right of set-off; (B) '
    self.assertEqual(('(A) right of set-off; (B)', 104, 126),
                     TextBeautifier.strip_string_coords(source, 100, 127))
    self.assertEqual(('(A) right of set-off; (B) ', 104, 127),
                     TextBeautifier.lstrip_string_coords(source, 100, 127))
    self.assertEqual((' (A) right of set-off; (B)', 100, 126),
                     TextBeautifier.rstrip_string_coords(source, 100, 127))
def test_mixshaped_quotes(self):
    """Mismatched quote shapes get unified; balanced quotes stay untouched."""
    source = 'Equity Interest upon the occurrence of an “asset sale" or a “change of control” '
    self.assertEqual(
        'Equity Interest upon the occurrence of an “asset sale” or a “change of control” ',
        TextBeautifier.unify_quotes_braces(source))

    # the text remains untouched because double quotes are balanced
    source = 'На "бобах\' одна старушка погадала бы, да \'жалко": "померла"'
    self.assertEqual(source, TextBeautifier.unify_quotes_braces(source))

    source = 'called "champerty\''
    self.assertEqual('called "champerty"',
                     TextBeautifier.unify_quotes_braces(source))
def get_token_spans(txt: str) -> \
        Generator[Tuple[str, str, int, int], None, None]:
    """
    Tokenize *txt* with NLTK and locate every token inside the source string.

    Yields flat 4-tuples ``(word, pos_tag, start, end)`` where *start* and
    *end* are inclusive character offsets of *word* within *txt*.
    NOTE(review): despite the name, the second item is the POS tag produced
    by ``nltk.pos_tag``, not a token string.
    """
    words = nltk.word_tokenize(txt)
    tokens = nltk.pos_tag(words)  # list of (word, tag) pairs
    offset = 0
    last_symbol = len(txt) - 1
    for word, token in tokens:
        # tokenizer may alter the surface form (e.g. quotes), so a plain
        # find() can fail; fall back to the "transformed word" search
        next_offset = txt.find(word, offset)
        if next_offset < 0:
            transf_word = TextBeautifier.find_transformed_word(
                txt, word, offset)
            if transf_word:
                word, next_offset = transf_word
        # if still not found, skip one character forward rather than stall
        offset = next_offset if next_offset >= 0 else offset + 1
        # clamp so the yielded span never points past the end of txt
        offset = min(offset, last_symbol)
        right_margin = offset + len(word)
        yield word, token, offset, right_margin - 1
        offset = right_margin
def trim_defined_term(term: str, start: int, end: int) -> \
        Tuple[str, int, int, bool]:
    """
    Remove pair of quotes / brackets framing text
    Replace N-grams of spaces with single spaces
    Replace line breaks with spaces
    :param term: a phrase that may contain excess framing symbols
    :param start: original term's start position, may be changed
    :param end: original term's end position, may be changed
    :return: updated term, start, end and the flag indicating that
             the whole phrase was inside quotes
    """
    was_quoted = False

    # if the whole term is one quoted chunk, unwrap it and remember the fact
    quoted_parts = [m.group() for m in QUOTED_TEXT_RE.finditer(term)]
    if len(quoted_parts) == 1:
        term = quoted_parts[0].strip('''\"'“„''')
        was_quoted = True

    # count quotes BEFORE stripping so we can detect that the strip below
    # removed a framing quote pair (and not just brackets)
    orig_term_len = len(term)
    orig_term_quotes = count_sequence_matches(
        term, lambda c: c in TextBeautifier.QUOTES)
    term, start, end = TextBeautifier.strip_pair_symbols((term, start, end))

    if len(term) < orig_term_len:
        # probably we removed quotes
        updated_term_quotes = count_sequence_matches(
            term, lambda c: c in TextBeautifier.QUOTES)
        was_quoted = was_quoted or orig_term_quotes - updated_term_quotes > 1

    # normalize whitespace: line breaks -> spaces, space runs -> one space
    term = term.replace('\n', ' ')
    term = SPACES_RE.sub(' ', term)

    term, start, end = TextBeautifier.strip_string_coords(
        term, start, end, STRIP_PUNCT_SYMBOLS)

    # strip all dots or just left one (if ends with abbreviation)
    ends_with_abbr = ABBREVIATION_ENDING_RE.search(term)
    if not ends_with_abbr:
        term, start, end = TextBeautifier.strip_string_coords(
            term, start, end, '.')
    else:
        term, start, end = TextBeautifier.lstrip_string_coords(
            term, start, end, '.')
    return term, start, end, was_quoted
def get_definition_list_in_sentence(
        sentence_coords: Tuple[int, int, str],
        decode_unicode=True) -> List[DefinitionCaught]:
    """
    Find possible definitions in natural language in a single sentence.
    :param sentence_coords: (sentence start, sentence end, sentence text)
    :param decode_unicode: transliterate the sentence to ASCII with
                           unidecode before matching (e.g. replaces “ with ")
    :return: all definitions detected in the sentence with their coordinates
    """
    definitions = []  # type: List[DefinitionCaught]
    sentence = sentence_coords[2]

    # unify quotes and braces
    # replace excess braces with ' ' so the str length will remain the same
    sentence = TextBeautifier.unify_quotes_braces(sentence,
                                                  empty_replacement=' ')
    sent_start = sentence_coords[0]
    # candidate terms as (text, start, end); a set to de-duplicate overlaps
    result = set()  # type: Set[Tuple[str, int, int]]

    # it really transforms string, e.g. replaces “ with "
    if decode_unicode:
        sentence = unidecode.unidecode(sentence)
        sentence_coords = sentence_coords[0], sentence_coords[1], sentence

    # case 1: definitions announced by trigger words
    for item in TRIGGER_WORDS_PTN_RE.finditer(sentence):
        result.update(
            regex_matches_to_word_coords(EXTRACT_PTN_RE, item.group(),
                                         item.start() + sent_start))

    # case 3: noun-phrase candidates, filtered by anti-pattern and pronouns
    mts = regex_matches_to_word_coords(NOUN_PTN_RE, sentence, sent_start)
    mts = [i for i in mts if not NOUN_ANTI_PTN_RE.fullmatch(i[0])]
    mts = [
        m for m in mts
        if m[0].lower().strip(' ,;.') not in EnLanguageTokens.pronouns
    ]
    if len(mts) > 0:
        result.update(mts)

    # cases 2, 4, 5, 6: quoted definitions; one trigger match is enough,
    # hence the break after the first hit
    for _ in TRIGGER_QUOTED_DEFINITION_RE.finditer(sentence):
        for quoted_definition_re in QUOTED_DEFINITION_RE:
            result.update(
                regex_matches_to_word_coords(quoted_definition_re,
                                             sentence, sent_start))
        break

    # make definitions out of entries
    for term, start, end in result:
        term_cleared = TextBeautifier.strip_pair_symbols((term, start, end))
        term_cleared = trim_defined_term(term_cleared[0], term_cleared[1],
                                         term_cleared[2])
        was_quoted = term_cleared[3]
        if PICK_DEFINITION_FROM_QUOTES:
            term, start, end = term_cleared[0], term_cleared[1], term_cleared[
                2]
        if not term_cleared[0]:
            continue

        term, start, end = TextBeautifier.unify_quotes_braces_coords(
            term, start, end)

        # check the term is not empty
        if len(term.strip(PUNCTUATION_STRIP_STR)) == 0:
            continue

        # term_pos: [(word, pos_tag, start, end), ...]
        term_pos = list(SpanTokenizer.get_token_spans(term))
        if does_term_are_service_words(term_pos):
            continue

        # drop introductory words ("hereinafter", etc.) if any were found
        term_wo_intro = IntroductoryWordsDetector.remove_term_introduction(
            term, term_pos)
        if term_wo_intro != term:
            term = TextBeautifier.strip_pair_symbols(term_wo_intro)
        if not term:
            continue

        # check the term is not too long
        max_words_per_definition = MAX_TERM_TOKENS
        if was_quoted:
            max_words_per_definition = MAX_QUOTED_TERM_TOKENS

        words_in_term = sum(
            1 for w in word_processor.split_text_on_words(term_cleared[0])
            if not w.is_separator)
        quotes_in_text = get_quotes_count_in_string(term_cleared[0])
        # several quoted chunks may hold several definitions, so the word
        # budget scales with the number of quote pairs
        possible_definitions = quotes_in_text // 2 if quotes_in_text > 1 else 1
        possible_tokens_count = max_words_per_definition * possible_definitions
        if words_in_term > possible_tokens_count:
            continue

        split_definitions_lst = split_definitions_inside_term(
            term, sentence_coords, start, end)

        for definition, s, e in split_definitions_lst:
            definition, s, e = TextBeautifier.strip_pair_symbols(
                (definition, s, e))
            definitions.append(DefinitionCaught(definition, sentence, (
                s,
                e,
            )))
    return definitions
def find_phrase_in_source_text(
        text: str,
        phrases: List[str],
        pos_start: int = 0,
        pos_end: int = 0) -> List[Tuple[str, int, int]]:
    """
    Though phrase is taken from text, it could be changed - e.g.,
    extra or removed spaces...
    Returns a list of (phrase, phrase_start, phrase_end) tuples
    :param text: text where to find phrases ([phrases])
    :param phrases: words or phrases to be found inside text
    :param pos_start: where to start (in source text)
    :param pos_end: where to stop searching
    :return: [('phrase A', 10, 18), ... ]
    """
    text = TextBeautifier.normalize_smb_preserve_len(text)

    # build a "condensed" copy of text with all space symbols removed,
    # plus index maps between the two representations
    condensed = ''
    ctos = []  # condensed-to-source indices
    stoc = [0] * len(text)  # source-to-condensed indices
    cindex = 0
    end_index = len(text)
    if pos_end:
        end_index = min(pos_end, end_index)
    for i in range(pos_start, end_index):
        a = text[i]
        if a not in PhrasePositionFinder.space_symbols:
            stoc[i] = cindex
            ctos.append(i)
            cindex += 1
            condensed += a
            continue
        stoc[i] = cindex

    phrases = [(p, 0, 0) for p in phrases]
    start = 0  # search cursor in the source text; phrases are found in order
    for i, phrase in enumerate(phrases):
        if start >= len(stoc):
            break
        word = TextBeautifier.normalize_smb_preserve_len(phrase[0])
        src_word = word
        # fast path: the phrase occurs verbatim in the source text
        pstart = text.find(word, start)
        if pstart < 0:
            transf_word = TextBeautifier.find_transformed_word(
                text, word, start)
            if transf_word:
                word, pstart = transf_word
        if pstart >= 0:
            start = pstart + len(word)
            phrases[i] = (phrase[0], pstart, start)
            continue

        # phrase is modified = extra spaces were added or removed;
        # search the space-free phrase in the condensed text instead
        word = PhrasePositionFinder.reg_space.sub('', word)
        cstart = stoc[start]
        con_word_start = condensed.find(word, cstart)
        # NOTE(review): when the condensed search also fails we fall back to
        # cstart, i.e. the phrase is assumed to start at the cursor — verify
        con_word_start = con_word_start if con_word_start >= 0 else cstart
        src_index = ctos[con_word_start]
        w_end = src_index + len(src_word)
        if w_end < len(ctos):
            w_end = ctos[w_end]
        else:
            w_end = ctos[-1]
        start = src_index + len(src_word)
        phrases[i] = (phrase[0], src_index, w_end)
    return phrases
def test_misplaced_quotes(self):
    """Swapped opening/closing curly quotes are put back in the right order."""
    fixed = TextBeautifier.unify_quotes_braces('”Consolidated EBITDA“')
    self.assertEqual('“Consolidated EBITDA”', fixed)
def test_quotes_coords(self):
    """Removing an unbalanced leading quote shifts the start coordinate."""
    source = '"Consolidated EBITDA means, for any period'
    self.assertEqual(
        ('Consolidated EBITDA means, for any period', 11, 93),
        TextBeautifier.unify_quotes_braces_coords(source, 10, 93))
def test_doubled_quotes(self):
    """A doubled opening quote is collapsed down to a single one."""
    source = '""Consolidated EBITDA" means, for any period'
    self.assertEqual('"Consolidated EBITDA" means, for any period',
                     TextBeautifier.unify_quotes_braces(source))
def test_braces_shape(self):
    """A brace closed with the wrong shape is corrected to match its opener."""
    source = '{x + 3) *[(y-1)/(y+1]^2]^3'
    self.assertEqual('{x + 3} *[(y-1)/(y+1)^2]^3',
                     TextBeautifier.unify_quotes_braces(source))
def test_find_transformed_word(self):
    """An ASCII-quoted word has no transformed match in curly-quoted text."""
    text = '(each an “Obligation” and collectively, the “Obligations”)'
    found = TextBeautifier.find_transformed_word(text, '"Obligation"', 0)
    self.assertEqual(None, found)