Example #1
0
    def _produce_separator_split_token(
            self, remainder: int, word: str, mo: regex, prefix: str,
            offset: int) -> Generator[Token, None, str]:
        """Helper method to handle alnum words with `_separation` patterns.

        Yields tokens for the text preceding the separator match (and for
        the separator itself when it is emittable), with a special case for
        the English "n't" contraction.

        :param remainder: index into ``word`` where unconsumed text begins
        :param word: the word currently being split
        :param mo: separator match object found within ``word``
        :param prefix: prefix to attach to the first token yielded
        :param offset: offset of ``word`` in the underlying text; used to
            compute absolute token positions
        :return: leftover text (prefix, possibly with the suppressed
            separator folded in) that the caller must carry over to the
            next token; "" when everything was consumed here
        """
        if mo.start() > remainder:
            # There is plain text between the last consumed position and the
            # separator match; it must be emitted before the separator.
            if Tokenizer._apostrophe_t.fullmatch(
                    mo.group(0)) and word[mo.start() - 1] == 'n':
                # Separator is "'t" preceded by 'n' -> an "n't" contraction
                # (e.g. "can't"): split as stem + "n't" instead of at the
                # apostrophe.
                if remainder < mo.start() - 1:
                    # Emit the stem before the 'n' (e.g. "ca" of "can't").
                    yield Token(prefix, word[remainder:mo.start() - 1],
                                offset + remainder)
                    prefix = ""

                # Emit either the normalized "not" or the literal "n't",
                # depending on configuration.
                yield Token(
                    prefix, "not" if self.replace_not_contraction else 'n' +
                    mo.group(0), offset + mo.start())
                return ""

            yield Token(prefix, word[remainder:mo.start()], offset + remainder)
            prefix = ""

        separator = mo.group(0)

        if separator and self._can_emit(separator):
            # The separator is a token in its own right.
            yield Token(prefix, separator, offset + mo.start())
            return ""
        else:
            # Separator is suppressed: fold it (and any unconsumed prefix)
            # into the carry-over value returned to the caller.
            return prefix + separator
Example #2
0
    def collect_regex_matches_with_quoted_chunks(
            phrase: str, reg: re, prob: int,
            quoted_def_start: Callable[[str, Match, Match], int],
            quoted_def_end: Callable[[str, Match, Match],
                                     int], def_start: Callable[[str, Match],
                                                               int],
            def_end: Callable[[str, Match], int]) -> List[PatternFound]:
        """
        Find all matches of 'reg' in 'phrase', then refine each match:
        if a set of quoted words can be extracted from the match, those
        quoted chunks become the results; otherwise the whole match is
        wrapped in a single PatternFound.
        :param quoted_def_start: (phrase, match, quoted_match) -> definition's start
        :param quoted_def_end: (phrase, match, quoted_match) -> definition's end
        :param def_start: (phrase, match) -> definition's start
        :param def_end: (phrase, match) -> definition's end
        :return:
        """
        found = []
        for mtch in reg.finditer(phrase):
            # Prefer quoted sub-chunks of the match when they exist.
            quoted = CommonDefinitionPatterns.peek_quoted_part(
                phrase, mtch, quoted_def_start, quoted_def_end, prob)
            if quoted:
                found.extend(quoted)
            else:
                # Fall back to the whole regex match.
                ptrn = PatternFound()
                ptrn.name = mtch.group()
                ptrn.start = def_start(phrase, mtch)
                ptrn.end = def_end(phrase, mtch)
                ptrn.probability = prob
                found.append(ptrn)

        return found
Example #3
0
    def collect_regex_matches(
            phrase: str, reg: re, prob: int, def_start: Callable[[str, Match],
                                                                 int],
            def_end: Callable[[str, Match], int]) -> List[PatternFound]:
        """
        Find all matches of 'reg' in 'phrase' and wrap each one in a
        PatternFound record.
        :param phrase: the text to search
        :param reg: compiled regular expression to run over 'phrase'
        :param prob: probability (confidence) assigned to every match
        :param def_start: (phrase, match) -> definition's start
        :param def_end: (phrase, match) -> definition's end
        :return: one PatternFound per regex match, in match order
        """
        defs = []
        for match in reg.finditer(phrase):
            df = PatternFound()
            df.name = match.group()
            df.start = def_start(phrase, match)
            df.end = def_end(phrase, match)
            df.probability = prob
            defs.append(df)

        return defs