def test_strip_pair_symbols(self):
        """A framing pair of quotes or braces is stripped from the phrase."""
        src = '"(A right of set-off; B)"'
        self.assertEqual('A right of set-off; B',
                         TextBeautifier.strip_pair_symbols(src))

        # inner quotes are preserved when only the outer braces frame the text
        src = '("A right" of set-off; "B")'
        self.assertEqual('"A right" of set-off; "B"',
                         TextBeautifier.strip_pair_symbols(src))
    def test_unbalanced_braces(self):
        """Unbalanced braces get reshaped or dropped to restore balance."""
        src = '{x + 3) *[(y-1)/(y+1^2]^3'
        self.assertEqual('{x + 3} *(y-1)/(y+1^2)^3',
                         TextBeautifier.unify_quotes_braces(src))

        src = 'Ma tem ca nu cred ((in general) in legatura printre aceste fapte ('
        self.assertEqual(
            'Ma tem ca nu cred (in general) in legatura printre aceste fapte ',
            TextBeautifier.unify_quotes_braces(src))
    def test_strip_pair_symbols_untouched(self):
        """Pairs that do not frame the whole phrase must survive intact."""
        samples = (
            '(A) right of set-off; (B)',
            '(A ( right) of set-off; (B)',
            '"(A ( right)" "of set-off; (B)"',
        )
        for src in samples:
            self.assertEqual(src, TextBeautifier.strip_pair_symbols(src))
    def test_strip_pair_symbols_coords(self):
        """Stripping framing symbols shifts the (start, end) coordinates too."""
        src = ' "(A right of set-off; B)"'
        self.assertEqual(('A right of set-off; B', 5, 21),
                         TextBeautifier.strip_pair_symbols((src, 2, 23)))

        src = '("A right" of set-off; "B")'
        self.assertEqual(('"A right" of set-off; "B"', 101, 118),
                         TextBeautifier.strip_pair_symbols((src, 100, 119)))

        # leading / trailing whitespace is consumed as well
        src = '  "A right" of set-off; "B" '
        self.assertEqual(('"A right" of set-off; "B"', 102, 118),
                         TextBeautifier.strip_pair_symbols((src, 100, 119)))
    def test_negative(self):
        """Already-balanced text passes through unify_quotes_braces unchanged."""
        samples = (
            '(x + 3) *[(y-1)/(y+1)^2]^3',
            '"Jupyter", Venus and "Mars"',
            "Let' get loud",
        )
        for src in samples:
            self.assertEqual(src, TextBeautifier.unify_quotes_braces(src))
Exemplo n.º 6
0
    def test_strip_text_coords(self):
        """strip / lstrip / rstrip variants each adjust coordinates accordingly."""
        src = '    (A) right of set-off; (B) '

        # both ends stripped: start and end both move inward
        self.assertEqual(('(A) right of set-off; (B)', 104, 126),
                         TextBeautifier.strip_string_coords(src, 100, 127))

        # left strip only: end coordinate stays put
        self.assertEqual(('(A) right of set-off; (B) ', 104, 127),
                         TextBeautifier.lstrip_string_coords(src, 100, 127))

        # right strip only: start coordinate stays put
        self.assertEqual(('    (A) right of set-off; (B)', 100, 126),
                         TextBeautifier.rstrip_string_coords(src, 100, 127))
    def test_mixshaped_quotes(self):
        """Quotes of mismatched shapes are unified; balanced ones are kept."""
        src = 'Equity Interest upon the occurrence of an “asset sale" or a “change of control” '
        self.assertEqual(
            "Equity Interest upon the occurrence of an “asset sale” or a “change of control” ",
            TextBeautifier.unify_quotes_braces(src))

        src = 'На "бобах\' одна старушка погадала бы, да \'жалко": "померла"'
        # double quotes are balanced here, so the text remains untouched
        self.assertEqual(src, TextBeautifier.unify_quotes_braces(src))

        src = 'called "champerty\''
        self.assertEqual('called "champerty"',
                         TextBeautifier.unify_quotes_braces(src))
Exemplo n.º 8
0
    def get_token_spans(txt: str) -> \
            Generator[Tuple[str, str, int, int], None, None]:
        """
        Yield (word, POS tag, start, end) for every token of *txt*.

        `end` is the index of the token's last character (inclusive).
        """
        tagged = nltk.pos_tag(nltk.word_tokenize(txt))
        pos = 0
        max_index = len(txt) - 1

        for word, tag in tagged:
            found = txt.find(word, pos)
            if found < 0:
                # the tokenizer may have transformed the word (e.g. quote
                # shapes); try to locate the transformed variant instead
                located = TextBeautifier.find_transformed_word(
                    txt, word, pos)
                if located:
                    word, found = located

            # when the word still can't be found, just advance by one
            pos = found if found >= 0 else pos + 1
            pos = min(pos, max_index)

            span_end = pos + len(word)
            yield word, tag, pos, span_end - 1
            pos = span_end
Exemplo n.º 9
0
def trim_defined_term(term: str, start: int, end: int) -> \
        Tuple[str, int, int, bool]:
    """
    Remove a pair of quotes / brackets framing the text, collapse runs of
    spaces into single spaces and replace line breaks with spaces.

    :param term: a phrase that may contain excess framing symbols
    :param start: original term's start position, may be changed
    :param end: original term's end position, may be changed
    :return: (term, start, end, was_quoted) where was_quoted indicates that
        the whole phrase was inside quotes
    """
    # when exactly one quoted fragment makes up the term, unwrap it
    quoted_parts = [m.group() for m in QUOTED_TEXT_RE.finditer(term)]
    was_quoted = False
    if len(quoted_parts) == 1:
        term = quoted_parts[0].strip('''\"'“„''')
        was_quoted = True

    len_before = len(term)
    quotes_before = count_sequence_matches(
        term, lambda c: c in TextBeautifier.QUOTES)
    term, start, end = TextBeautifier.strip_pair_symbols((term, start, end))
    if len(term) < len_before:
        # something was stripped — check whether a quote pair went away
        quotes_after = count_sequence_matches(
            term, lambda c: c in TextBeautifier.QUOTES)
        was_quoted = was_quoted or quotes_before - quotes_after > 1

    term = SPACES_RE.sub(' ', term.replace('\n', ' '))

    term, start, end = TextBeautifier.strip_string_coords(
        term, start, end, STRIP_PUNCT_SYMBOLS)

    # keep the trailing dot when the term ends with an abbreviation,
    # otherwise strip dots from both ends
    if ABBREVIATION_ENDING_RE.search(term):
        term, start, end = TextBeautifier.lstrip_string_coords(
            term, start, end, '.')
    else:
        term, start, end = TextBeautifier.strip_string_coords(
            term, start, end, '.')

    return term, start, end, was_quoted
Exemplo n.º 10
0
def get_definition_list_in_sentence(
        sentence_coords: Tuple[int, int, str],
        decode_unicode=True) -> List[DefinitionCaught]:
    """
    Find possible definitions in natural language in a single sentence.

    :param sentence_coords: (sentence start, sentence end, sentence text) —
        the code reads the text from index 2 and the start offset from index 0
    :param decode_unicode: when True, transliterate the sentence via
        unidecode (e.g. “ becomes ") before pattern matching
    :return: list of DefinitionCaught entries found in the sentence
    """
    definitions = []  # type: List[DefinitionCaught]
    sentence = sentence_coords[2]
    # unify quotes and braces
    # replace excess braces with ' ' so the str length will remain the same
    sentence = TextBeautifier.unify_quotes_braces(sentence,
                                                  empty_replacement=' ')
    sent_start = sentence_coords[0]
    # candidate (term, start, end) triples, deduplicated via the set
    result = set()  # type: Set[Tuple[str, int, int]]

    # it really transforms string, e.g. replaces “ with "
    if decode_unicode:
        sentence = unidecode.unidecode(sentence)
        sentence_coords = sentence_coords[0], sentence_coords[1], sentence

    # case 1: extract candidates from the regions matched by the
    # trigger-word pattern
    for item in TRIGGER_WORDS_PTN_RE.finditer(sentence):
        result.update(
            regex_matches_to_word_coords(EXTRACT_PTN_RE, item.group(),
                                         item.start() + sent_start))

    # case 3: noun-phrase candidates, filtered against the anti-pattern
    # and the English pronoun list
    mts = regex_matches_to_word_coords(NOUN_PTN_RE, sentence, sent_start)
    mts = [i for i in mts if not NOUN_ANTI_PTN_RE.fullmatch(i[0])]
    mts = [
        m for m in mts
        if m[0].lower().strip(' ,;.') not in EnLanguageTokens.pronouns
    ]
    if len(mts) > 0:
        result.update(mts)

    # cases 2, 4, 5, 6: quoted-definition regexes are applied only when at
    # least one trigger occurs in the sentence (hence the immediate break)
    for _ in TRIGGER_QUOTED_DEFINITION_RE.finditer(sentence):
        for quoted_definition_re in QUOTED_DEFINITION_RE:
            result.update(
                regex_matches_to_word_coords(quoted_definition_re, sentence,
                                             sent_start))
        break

    # make definitions out of entries
    for term, start, end in result:
        # strip framing symbols / quotes, tracking coordinate shifts
        term_cleared = TextBeautifier.strip_pair_symbols((term, start, end))
        term_cleared = trim_defined_term(term_cleared[0], term_cleared[1],
                                         term_cleared[2])
        was_quoted = term_cleared[3]

        if PICK_DEFINITION_FROM_QUOTES:
            term, start, end = term_cleared[0], term_cleared[1], term_cleared[
                2]

        if not term_cleared[0]:
            continue

        term, start, end = TextBeautifier.unify_quotes_braces_coords(
            term, start, end)

        # check the term is not empty
        if len(term.strip(PUNCTUATION_STRIP_STR)) == 0:
            continue

        # returns [('word', 'token', (word_start, word_end)), ...] ...
        term_pos = list(SpanTokenizer.get_token_spans(term))
        if does_term_are_service_words(term_pos):
            continue

        # drop introductory words from the term head — see
        # IntroductoryWordsDetector for the exact behavior
        term_wo_intro = IntroductoryWordsDetector.remove_term_introduction(
            term, term_pos)
        if term_wo_intro != term:
            term = TextBeautifier.strip_pair_symbols(term_wo_intro)
        if not term:
            continue

        # check the term is not too long; quoted terms get their own limit
        max_words_per_definition = MAX_TERM_TOKENS
        if was_quoted:
            max_words_per_definition = MAX_QUOTED_TERM_TOKENS

        # allow proportionally more tokens when several quoted definitions
        # appear glued together inside one candidate term
        words_in_term = sum(
            1 for w in word_processor.split_text_on_words(term_cleared[0])
            if not w.is_separator)
        quotes_in_text = get_quotes_count_in_string(term_cleared[0])
        possible_definitions = quotes_in_text // 2 if quotes_in_text > 1 else 1
        possible_tokens_count = max_words_per_definition * possible_definitions
        if words_in_term > possible_tokens_count:
            continue

        # one candidate term may hold several separate definitions
        split_definitions_lst = split_definitions_inside_term(
            term, sentence_coords, start, end)

        for definition, s, e in split_definitions_lst:
            definition, s, e = TextBeautifier.strip_pair_symbols(
                (definition, s, e))
            definitions.append(DefinitionCaught(definition, sentence, (
                s,
                e,
            )))

    return definitions
    def find_phrase_in_source_text(
            text: str,
            phrases: List[str],
            pos_start: int = 0,
            pos_end: int = 0) -> List[Tuple[str, int, int]]:
        """
        Locate each of *phrases* inside *text*.

        Though a phrase is taken from the text, it could have been changed
        along the way - e.g., extra spaces added or removed - so a plain
        `str.find` may fail; a space-insensitive fallback search over a
        "condensed" (space-less) copy of the text is used in that case.

        :param text: text where to find phrases ([phrases])
        :param phrases: words or phrases to be found inside text
        :param pos_start: where to start (in source text)
        :param pos_end: where to stop searching (0 means "to the end")
        :return: [('phrase A', 10, 18), ... ] - each phrase with its start
            and end offsets in the source text
        """

        text = TextBeautifier.normalize_smb_preserve_len(text)
        # build the condensed copy plus index maps between the two forms
        condensed = ''
        ctos = []  # condensed-to-source indices
        stoc = [0] * len(text)  # source-to-condensed indices
        cindex = 0
        end_index = len(text)
        if pos_end:
            end_index = min(pos_end, end_index)

        # NOTE(review): stoc entries outside [pos_start, end_index) stay 0
        for i in range(pos_start, end_index):
            a = text[i]
            if a not in PhrasePositionFinder.space_symbols:
                stoc[i] = cindex
                ctos.append(i)
                cindex += 1
                condensed += a
                continue
            stoc[i] = cindex

        # turn each phrase into a (phrase, start, end) triple; searching is
        # sequential - each phrase is looked up after the previous match
        phrases = [(p, 0, 0) for p in phrases]
        start = 0
        for i, phrase in enumerate(phrases):
            if start >= len(stoc):
                break
            word = TextBeautifier.normalize_smb_preserve_len(phrase[0])
            src_word = word
            # fast path: phrase occurs verbatim in the source text
            pstart = text.find(word, start)
            if pstart < 0:
                # the word may have been transformed (e.g. quote shapes);
                # try to locate the transformed variant instead
                transf_word = TextBeautifier.find_transformed_word(
                    text, word, start)
                if transf_word:
                    word, pstart = transf_word
            if pstart >= 0:
                start = pstart + len(word)
                phrases[i] = (phrase[0], pstart, start)
                continue
            # phrase is modified = extra spaces were added or removed:
            # search the space-less phrase in the condensed text and map
            # the hit back to source coordinates
            word = PhrasePositionFinder.reg_space.sub('', word)
            cstart = stoc[start]
            con_word_start = condensed.find(word, cstart)
            con_word_start = con_word_start if con_word_start >= 0 else cstart
            src_index = ctos[con_word_start]
            # end offset is estimated from the original word length and
            # mapped back / clamped via ctos
            w_end = src_index + len(src_word)
            if w_end < len(ctos):
                w_end = ctos[w_end]
            else:
                w_end = ctos[-1]
            start = src_index + len(src_word)
            phrases[i] = (phrase[0], src_index, w_end)

        return phrases
 def test_misplaced_quotes(self):
     """Quotes in reversed order are flipped into the correct shapes."""
     src = '”Consolidated EBITDA“'
     self.assertEqual('“Consolidated EBITDA”',
                      TextBeautifier.unify_quotes_braces(src))
 def test_quotes_coords(self):
     """Removing an unpaired quote shifts the start coordinate."""
     src = '"Consolidated EBITDA means, for any period'
     self.assertEqual(
         ('Consolidated EBITDA means, for any period', 11, 93),
         TextBeautifier.unify_quotes_braces_coords(src, 10, 93))
 def test_doubled_quotes(self):
     """A doubled opening quote collapses into a single one."""
     src = '""Consolidated EBITDA" means, for any period'
     self.assertEqual('"Consolidated EBITDA" means, for any period',
                      TextBeautifier.unify_quotes_braces(src))
 def test_braces_shape(self):
     """Mismatched brace shapes are fixed to match their opening pair."""
     src = '{x + 3) *[(y-1)/(y+1]^2]^3'
     self.assertEqual('{x + 3} *[(y-1)/(y+1)^2]^3',
                      TextBeautifier.unify_quotes_braces(src))
 def test_find_transformed_word(self):
     """No transformed variant should be reported for this input."""
     src = '(each an “Obligation” and collectively, the “Obligations”)'
     self.assertEqual(
         None, TextBeautifier.find_transformed_word(src, '"Obligation"', 0))