    def test_strip_pair_symbols(self):
        text = '"(A right of set-off; B)"'
        cleared = TextBeautifier.strip_pair_symbols(text)
        self.assertEqual('A right of set-off; B', cleared)

        text = '("A right" of set-off; "B")'
        cleared = TextBeautifier.strip_pair_symbols(text)
        self.assertEqual('"A right" of set-off; "B"', cleared)

    def test_strip_pair_symbols_untouched(self):
        text = '(A) right of set-off; (B)'
        cleared = TextBeautifier.strip_pair_symbols(text)
        self.assertEqual(text, cleared)

        text = '(A ( right) of set-off; (B)'
        cleared = TextBeautifier.strip_pair_symbols(text)
        self.assertEqual(text, cleared)

        text = '"(A ( right)" "of set-off; (B)"'
        cleared = TextBeautifier.strip_pair_symbols(text)
        self.assertEqual(text, cleared)

    def test_strip_pair_symbols_coords(self):
        text = ' "(A right of set-off; B)"'
        cleared = TextBeautifier.strip_pair_symbols((text, 2, 23))
        self.assertEqual(('A right of set-off; B', 5, 21), cleared)

        text = '("A right" of set-off; "B")'
        cleared = TextBeautifier.strip_pair_symbols((text, 100, 119))
        self.assertEqual(('"A right" of set-off; "B"', 101, 118), cleared)

        text = '  "A right" of set-off; "B" '
        cleared = TextBeautifier.strip_pair_symbols((text, 100, 119))
        self.assertEqual(('"A right" of set-off; "B"', 102, 118), cleared)
Example #4
def trim_defined_term(term: str, start: int, end: int) -> \
        Tuple[str, int, int, bool]:
    """
    Remove pair of quotes / brackets framing text
    Replace N-grams of spaces with single spaces
    Replace line breaks with spaces
    :param term: a phrase that may contain excess framing symbols
    :param start: original term's start position, may be changed
    :param end: original term's end position, may be changed
    :return: updated term, start, end and the flag indicating that the whole phrase was inside quotes
    """
    was_quoted = False

    # pick text from quotes
    quoted_parts = [m.group() for m in QUOTED_TEXT_RE.finditer(term)]
    if len(quoted_parts) == 1:
        term = quoted_parts[0].strip('''\"'“„''')
        was_quoted = True

    orig_term_len = len(term)
    orig_term_quotes = count_sequence_matches(
        term, lambda c: c in TextBeautifier.QUOTES)
    term, start, end = TextBeautifier.strip_pair_symbols((term, start, end))
    if len(term) < orig_term_len:
        # probably we removed quotes
        updated_term_quotes = count_sequence_matches(
            term, lambda c: c in TextBeautifier.QUOTES)
        was_quoted = was_quoted or orig_term_quotes - updated_term_quotes > 1

    term = term.replace('\n', ' ')
    term = SPACES_RE.sub(' ', term)

    term, start, end = TextBeautifier.strip_string_coords(
        term, start, end, STRIP_PUNCT_SYMBOLS)

    # strip all dots or just left one (if ends with abbreviation)
    ends_with_abbr = ABBREVIATION_ENDING_RE.search(term)
    if not ends_with_abbr:
        term, start, end = TextBeautifier.strip_string_coords(
            term, start, end, '.')
    else:
        term, start, end = TextBeautifier.lstrip_string_coords(
            term, start, end, '.')

    return term, start, end, was_quoted
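
# A minimal usage sketch of trim_defined_term (illustrative only: it assumes
# QUOTED_TEXT_RE picks up the single quoted fragment, and the exact start/end
# values depend on TextBeautifier's stripping rules):
term, start, end, was_quoted = trim_defined_term('"Lender"', 100, 108)
# term is now 'Lender' with the framing quotes removed, start/end are
# narrowed to the unquoted span, and was_quoted is True because the whole
# phrase sat inside quotes.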
Example #5
def get_definition_list_in_sentence(
        sentence_coords: Tuple[int, int, str],
        decode_unicode=True) -> List[DefinitionCaught]:
    """
        Find possible definitions in natural language in a single sentence.
        :param sentence_coords: sentence, sentence start, end
        :param decode_unicode:
        :return:
        """
    definitions = []  # type: List[DefinitionCaught]
    sentence = sentence_coords[2]
    # unify quotes and braces
    # replace excess braces with ' ' so the str length will remain the same
    sentence = TextBeautifier.unify_quotes_braces(sentence,
                                                  empty_replacement=' ')
    sent_start = sentence_coords[0]
    result = set()  # type: Set[Tuple[str, int, int]]

    # it really transforms string, e.g. replaces “ with "
    if decode_unicode:
        sentence = unidecode.unidecode(sentence)
        sentence_coords = sentence_coords[0], sentence_coords[1], sentence

    # case 1
    for item in TRIGGER_WORDS_PTN_RE.finditer(sentence):
        result.update(
            regex_matches_to_word_coords(EXTRACT_PTN_RE, item.group(),
                                         item.start() + sent_start))

    # case 3
    mts = regex_matches_to_word_coords(NOUN_PTN_RE, sentence, sent_start)
    mts = [i for i in mts if not NOUN_ANTI_PTN_RE.fullmatch(i[0])]
    mts = [
        m for m in mts
        if m[0].lower().strip(' ,;.') not in EnLanguageTokens.pronouns
    ]
    if len(mts) > 0:
        result.update(mts)

    # cases 2, 4, 5, 6
    for _ in TRIGGER_QUOTED_DEFINITION_RE.finditer(sentence):
        for quoted_definition_re in QUOTED_DEFINITION_RE:
            result.update(
                regex_matches_to_word_coords(quoted_definition_re, sentence,
                                             sent_start))
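        # a single trigger match is enough: the quoted-definition patterns are
        # applied to the whole sentence once, then the loop stops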
        break

    # make definitions out of entries
    for term, start, end in result:
        term_cleared = TextBeautifier.strip_pair_symbols((term, start, end))
        term_cleared = trim_defined_term(term_cleared[0], term_cleared[1],
                                         term_cleared[2])
        was_quoted = term_cleared[3]

        if PICK_DEFINITION_FROM_QUOTES:
            term, start, end = term_cleared[:3]

        if not term_cleared[0]:
            continue

        term, start, end = TextBeautifier.unify_quotes_braces_coords(
            term, start, end)

        # check the term is not empty
        if len(term.strip(PUNCTUATION_STRIP_STR)) == 0:
            continue

        # returns [('word', 'token', (word_start, word_end)), ...] ...
        term_pos = list(SpanTokenizer.get_token_spans(term))
        if does_term_are_service_words(term_pos):
            continue

        term_wo_intro = IntroductoryWordsDetector.remove_term_introduction(
            term, term_pos)
        if term_wo_intro != term:
            term = TextBeautifier.strip_pair_symbols(term_wo_intro)
        if not term:
            continue

        # check the term is not too long
        max_words_per_definition = MAX_TERM_TOKENS
        if was_quoted:
            max_words_per_definition = MAX_QUOTED_TERM_TOKENS

        words_in_term = sum(
            1 for w in word_processor.split_text_on_words(term_cleared[0])
            if not w.is_separator)
        quotes_in_text = get_quotes_count_in_string(term_cleared[0])
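        # a term holding several pairs of quotes may pack several definitions,
        # so widen the allowed token count proportionally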
        possible_definitions = quotes_in_text // 2 if quotes_in_text > 1 else 1
        possible_tokens_count = max_words_per_definition * possible_definitions
        if words_in_term > possible_tokens_count:
            continue

        split_definitions_lst = split_definitions_inside_term(
            term, sentence_coords, start, end)

        for definition, s, e in split_definitions_lst:
            definition, s, e = TextBeautifier.strip_pair_symbols(
                (definition, s, e))
            definitions.append(
                DefinitionCaught(definition, sentence, (s, e)))

    return definitions
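
# Usage sketch (the sample sentence and printed output are illustrative only;
# sentence_coords follows the (start, end, text) layout described above):
sample = 'The term "Borrower" shall mean the party receiving the loan.'
for caught in get_definition_list_in_sentence((0, len(sample), sample)):
    # each DefinitionCaught is built as DefinitionCaught(definition, sentence,
    # (start, end)), i.e. the defined name plus its coordinates in the text
    print(caught)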