def test_unbalanced_braces(self):
    # Each pair is (raw input, text expected back from unify_quotes_braces)
    # for inputs whose braces do not pair up.
    cases = (
        (
            '{x + 3) *[(y-1)/(y+1^2]^3',
            '{x + 3} *(y-1)/(y+1^2)^3',
        ),
        (
            'Ma tem ca nu cred ((in general) in legatura printre aceste fapte (',
            'Ma tem ca nu cred (in general) in legatura printre aceste fapte ',
        ),
    )
    for source, expected in cases:
        actual = TextBeautifier.unify_quotes_braces(source)
        self.assertEqual(expected, actual)
def test_negative(self):
    # Inputs that unify_quotes_braces is expected to hand back exactly as
    # they were passed in.
    untouched_samples = (
        '(x + 3) *[(y-1)/(y+1)^2]^3',
        '"Jupyter", Venus and "Mars"',
        "Let' get loud",
    )
    for sample in untouched_samples:
        actual = TextBeautifier.unify_quotes_braces(sample)
        self.assertEqual(sample, actual)
def test_mixshaped_quotes(self):
    # Quotes whose opening and closing characters are of different shapes.
    russian_sample = 'На "бобах\' одна старушка погадала бы, да \'жалко": "померла"'
    cases = (
        (
            'Equity Interest upon the occurrence of an “asset sale" or a “change of control” ',
            "Equity Interest upon the occurrence of an “asset sale” or a “change of control” ",
        ),
        # the text remains untouched because dub. quotes are balanced
        (russian_sample, russian_sample),
        (
            'called "champerty\'',
            'called "champerty"',
        ),
    )
    for source, expected in cases:
        actual = TextBeautifier.unify_quotes_braces(source)
        self.assertEqual(expected, actual)
def get_definition_list_in_sentence(
        sentence_coords: Tuple[int, int, str],
        decode_unicode=True) -> List[DefinitionCaught]:
    """
    Find possible definitions in natural language in a single sentence.

    :param sentence_coords: tuple of three items; item 0 is used as the
        sentence start offset (added to match positions), item 2 is the
        sentence text. Item 1 is only passed through unchanged
        (presumably the sentence end offset - confirm with callers).
    :param decode_unicode: when true, the sentence is passed through
        ``unidecode.unidecode`` before any pattern is applied
    :return: list of DefinitionCaught, each built from the definition text,
        the (possibly transformed) sentence and an ``(s, e)`` pair
    """
    definitions = []  # type: List[DefinitionCaught]
    sentence = sentence_coords[2]
    # unify quotes and braces
    # replace excess braces with ' ' so the str length will remain the same
    sentence = TextBeautifier.unify_quotes_braces(sentence,
                                                  empty_replacement=' ')
    sent_start = sentence_coords[0]
    # candidate (term, start, end) entries; a set, so duplicates found by
    # several patterns collapse and iteration order is not defined
    result = set()  # type: Set[Tuple[str, int, int]]

    # it really transforms string, e.g. replaces “ with "
    if decode_unicode:
        sentence = unidecode.unidecode(sentence)
        # rebuild the tuple so the transformed text is what gets passed to
        # split_definitions_inside_term further down
        sentence_coords = sentence_coords[0], sentence_coords[1], sentence

    # case 1
    # EXTRACT_PTN_RE is applied to the matched fragment only, hence the
    # offset is the fragment start plus the sentence start
    for item in TRIGGER_WORDS_PTN_RE.finditer(sentence):
        result.update(
            regex_matches_to_word_coords(EXTRACT_PTN_RE, item.group(),
                                         item.start() + sent_start))

    # case 3
    mts = regex_matches_to_word_coords(NOUN_PTN_RE, sentence, sent_start)
    # drop candidates that are entirely matched by the anti-pattern
    mts = [i for i in mts if not NOUN_ANTI_PTN_RE.fullmatch(i[0])]
    # drop candidates that are just a pronoun (ignoring case and
    # surrounding ' ,;.' characters)
    mts = [
        m for m in mts
        if m[0].lower().strip(' ,;.') not in EnLanguageTokens.pronouns
    ]
    if len(mts) > 0:
        result.update(mts)

    # cases 2, 4, 5, 6
    # NOTE(review): the loop variable is unused and the quoted-definition
    # patterns run over the whole sentence, so the trigger appears to act as
    # an "at least one match" check with `break` ending the outer loop after
    # the first hit - confirm the intended nesting of `break`.
    for _ in TRIGGER_QUOTED_DEFINITION_RE.finditer(sentence):
        for quoted_definition_re in QUOTED_DEFINITION_RE:
            result.update(
                regex_matches_to_word_coords(quoted_definition_re, sentence,
                                             sent_start))
        break

    # make definitions out of entries
    for term, start, end in result:
        term_cleared = TextBeautifier.strip_pair_symbols((term, start, end))
        # trim_defined_term's result is indexed as
        # [0] text, [1] start, [2] end, [3] "was quoted" flag
        term_cleared = trim_defined_term(term_cleared[0], term_cleared[1],
                                         term_cleared[2])
        was_quoted = term_cleared[3]

        if PICK_DEFINITION_FROM_QUOTES:
            term, start, end = term_cleared[0], term_cleared[1], term_cleared[
                2]

        # nothing left after trimming - skip the candidate
        if not term_cleared[0]:
            continue

        term, start, end = TextBeautifier.unify_quotes_braces_coords(
            term, start, end)

        # check the term is not empty
        if len(term.strip(PUNCTUATION_STRIP_STR)) == 0:
            continue

        # returns [('word', 'token', (word_start, word_end)), ...] ...
        term_pos = list(SpanTokenizer.get_token_spans(term))
        if does_term_are_service_words(term_pos):
            continue

        term_wo_intro = IntroductoryWordsDetector.remove_term_introduction(
            term, term_pos)
        # only re-strip paired symbols when an introduction was removed;
        # note start / end are not recomputed here
        if term_wo_intro != term:
            term = TextBeautifier.strip_pair_symbols(term_wo_intro)
        if not term:
            continue

        # check the term is not too long
        max_words_per_definition = MAX_TERM_TOKENS
        if was_quoted:
            max_words_per_definition = MAX_QUOTED_TERM_TOKENS

        # the length check counts words in the trimmed text
        # (term_cleared[0]), not in `term` after introduction removal
        words_in_term = sum(
            1 for w in word_processor.split_text_on_words(term_cleared[0])
            if not w.is_separator)
        quotes_in_text = get_quotes_count_in_string(term_cleared[0])
        # every pair of quotes allows one more definition in the same
        # candidate, so the token budget scales with the number of pairs
        possible_definitions = quotes_in_text // 2 if quotes_in_text > 1 else 1
        possible_tokens_count = max_words_per_definition * possible_definitions
        if words_in_term > possible_tokens_count:
            continue

        split_definitions_lst = split_definitions_inside_term(
            term, sentence_coords, start, end)

        for definition, s, e in split_definitions_lst:
            definition, s, e = TextBeautifier.strip_pair_symbols(
                (definition, s, e))
            definitions.append(DefinitionCaught(definition, sentence, (
                s,
                e,
            )))

    return definitions
def test_misplaced_quotes(self):
    # Input starts with a closing-shaped quote and ends with an
    # opening-shaped one; the expected output has them the other way round.
    source = '”Consolidated EBITDA“'
    expected = '“Consolidated EBITDA”'
    actual = TextBeautifier.unify_quotes_braces(source)
    self.assertEqual(expected, actual)
def test_doubled_quotes(self):
    # Input opens with two double quotes; the expected output keeps one.
    source = '""Consolidated EBITDA" means, for any period'
    expected = '"Consolidated EBITDA" means, for any period'
    actual = TextBeautifier.unify_quotes_braces(source)
    self.assertEqual(expected, actual)
def test_braces_shape(self):
    # Closing braces whose shape differs from their opener are expected to
    # be replaced with the matching shape.
    source = '{x + 3) *[(y-1)/(y+1]^2]^3'
    expected = '{x + 3} *[(y-1)/(y+1)^2]^3'
    actual = TextBeautifier.unify_quotes_braces(source)
    self.assertEqual(expected, actual)