Python TextBeautifier.unify_quotes_braces示例

编程语言: Python

命名空间/包名称: lexnlp.extract.common.text_beautifier

类/类型: TextBeautifier

方法/功能: unify_quotes_braces

hotexamples.com的示例: 7

Python TextBeautifier.unify_quotes_braces - 已找到7个示例。以下是从开源项目中提取的、评价最高的 lexnlp.extract.common.text_beautifier.TextBeautifier.unify_quotes_braces 真实Python示例。您可以为示例评分，以帮助我们提高示例质量。

常用方法

显示/隐藏

unify_quotes_braces(7)

strip_pair_symbols(5)

find_transformed_word(3)

lstrip_string_coords(2)

strip_string_coords(2)

unify_quotes_braces_coords(2)

normalize_smb_preserve_len(1)

rstrip_string_coords(1)

示例#1

显示文件

文件： test_text_beautifier.py 项目： denmonz/Orrick-Flashcards

    def test_unbalanced_braces(self):
        """Check the expected output for inputs whose braces do not pair up."""
        # (input, expected result of unify_quotes_braces)
        cases = (
            ('{x + 3) *[(y-1)/(y+1^2]^3',
             '{x + 3} *(y-1)/(y+1^2)^3'),
            ('Ma tem ca nu cred ((in general) in legatura printre aceste fapte (',
             'Ma tem ca nu cred (in general) in legatura printre aceste fapte '),
        )
        for source, expected in cases:
            self.assertEqual(expected,
                             TextBeautifier.unify_quotes_braces(source))

示例#2

显示文件

文件： test_text_beautifier.py 项目： denmonz/Orrick-Flashcards

    def test_negative(self):
        """Check that each of these inputs is returned unchanged."""
        untouched = (
            '(x + 3) *[(y-1)/(y+1)^2]^3',
            '"Jupyter", Venus and "Mars"',
            "Let' get loud",
        )
        for source in untouched:
            self.assertEqual(source,
                             TextBeautifier.unify_quotes_braces(source))

示例#3

显示文件

文件： test_text_beautifier.py 项目： denmonz/Orrick-Flashcards

    def test_mixshaped_quotes(self):
        """Check the expected output when opening and closing quotes differ in shape."""
        balanced_double_quotes = (
            'На "бобах\' одна старушка погадала бы, да \'жалко": "померла"')
        # (input, expected result of unify_quotes_braces)
        cases = (
            ('Equity Interest upon the occurrence of an “asset sale" or a “change of control” ',
             "Equity Interest upon the occurrence of an “asset sale” or a “change of control” "),
            # the text remains untouched because the double quotes are balanced
            (balanced_double_quotes, balanced_double_quotes),
            ('called "champerty\'',
             'called "champerty"'),
        )
        for source, expected in cases:
            self.assertEqual(expected,
                             TextBeautifier.unify_quotes_braces(source))

示例#4

显示文件

def get_definition_list_in_sentence(
        sentence_coords: Tuple[int, int, str],
        decode_unicode=True) -> List[DefinitionCaught]:
    """
    Collect candidate term definitions found in a single sentence.

    :param sentence_coords: 3-tuple; item 0 is used as the sentence start
        offset for match coordinates and item 2 is the sentence text
    :param decode_unicode: when true, the sentence is passed through
        unidecode.unidecode before the patterns are applied
    :return: list of DefinitionCaught objects built from the matched terms
    """
    definitions = []  # type: List[DefinitionCaught]
    sent_start = sentence_coords[0]

    # unify quotes and braces; excess braces become ' ' so that the string
    # length (and therefore every character offset) stays the same
    sentence = TextBeautifier.unify_quotes_braces(sentence_coords[2],
                                                  empty_replacement=' ')

    # it really transforms string, e.g. replaces “ with "
    if decode_unicode:
        sentence = unidecode.unidecode(sentence)
        sentence_coords = (sent_start, sentence_coords[1], sentence)

    candidates = set()  # type: Set[Tuple[str, int, int]]

    # case 1: trigger words, then extract the term inside each match
    for trigger in TRIGGER_WORDS_PTN_RE.finditer(sentence):
        candidates.update(
            regex_matches_to_word_coords(EXTRACT_PTN_RE, trigger.group(),
                                         trigger.start() + sent_start))

    # case 3: noun patterns, minus anti-pattern hits and bare pronouns
    candidates.update([
        match
        for match in regex_matches_to_word_coords(NOUN_PTN_RE, sentence,
                                                  sent_start)
        if not NOUN_ANTI_PTN_RE.fullmatch(match[0])
        and match[0].lower().strip(' ,;.') not in EnLanguageTokens.pronouns
    ])

    # cases 2, 4, 5, 6: run the quoted-definition patterns once, and only
    # when the trigger pattern occurs at least once in the sentence
    first_trigger = next(TRIGGER_QUOTED_DEFINITION_RE.finditer(sentence), None)
    if first_trigger is not None:
        for quoted_definition_re in QUOTED_DEFINITION_RE:
            candidates.update(
                regex_matches_to_word_coords(quoted_definition_re, sentence,
                                             sent_start))

    # make definitions out of entries
    for term, start, end in candidates:
        stripped = TextBeautifier.strip_pair_symbols((term, start, end))
        trimmed = trim_defined_term(stripped[0], stripped[1], stripped[2])
        trimmed_text = trimmed[0]
        was_quoted = trimmed[3]

        if PICK_DEFINITION_FROM_QUOTES:
            term, start, end = trimmed_text, trimmed[1], trimmed[2]

        if not trimmed_text:
            continue

        term, start, end = TextBeautifier.unify_quotes_braces_coords(
            term, start, end)

        # check the term is not empty
        if not term.strip(PUNCTUATION_STRIP_STR):
            continue

        # returns [('word', 'token', (word_start, word_end)), ...] ...
        term_pos = list(SpanTokenizer.get_token_spans(term))
        if does_term_are_service_words(term_pos):
            continue

        term_wo_intro = IntroductoryWordsDetector.remove_term_introduction(
            term, term_pos)
        if term_wo_intro != term:
            term = TextBeautifier.strip_pair_symbols(term_wo_intro)
        if not term:
            continue

        # check the term is not too long; the limit is measured on the
        # trimmed text and scaled by the number of quote pairs in it
        max_words_per_definition = (
            MAX_QUOTED_TERM_TOKENS if was_quoted else MAX_TERM_TOKENS)
        words_in_term = sum(
            not word.is_separator
            for word in word_processor.split_text_on_words(trimmed_text))
        quotes_in_text = get_quotes_count_in_string(trimmed_text)
        if quotes_in_text > 1:
            possible_definitions = quotes_in_text // 2
        else:
            possible_definitions = 1
        if words_in_term > max_words_per_definition * possible_definitions:
            continue

        for part in split_definitions_inside_term(term, sentence_coords,
                                                  start, end):
            definition, def_start, def_end = part
            definition, def_start, def_end = TextBeautifier.strip_pair_symbols(
                (definition, def_start, def_end))
            definitions.append(
                DefinitionCaught(definition, sentence, (def_start, def_end)))

    return definitions

示例#5

显示文件

文件： test_text_beautifier.py 项目： denmonz/Orrick-Flashcards

 def test_misplaced_quotes(self):
     """Check the expected output when the closing-shaped quote comes first."""
     expected = '“Consolidated EBITDA”'
     actual = TextBeautifier.unify_quotes_braces('”Consolidated EBITDA“')
     self.assertEqual(expected, actual)

示例#6

显示文件

文件： test_text_beautifier.py 项目： denmonz/Orrick-Flashcards

 def test_doubled_quotes(self):
     """Check the expected output when the opening quote is doubled."""
     expected = '"Consolidated EBITDA" means, for any period'
     actual = TextBeautifier.unify_quotes_braces(
         '""Consolidated EBITDA" means, for any period')
     self.assertEqual(expected, actual)

示例#7

显示文件

文件： test_text_beautifier.py 项目： denmonz/Orrick-Flashcards

 def test_braces_shape(self):
     """Check the expected output when closing braces have the wrong shape."""
     expected = '{x + 3} *[(y-1)/(y+1)^2]^3'
     actual = TextBeautifier.unify_quotes_braces('{x + 3) *[(y-1)/(y+1]^2]^3')
     self.assertEqual(expected, actual)