Пример #1
0
    def _char_set_for(self, codepoint: int) -> CharSet:
        """
        Build the CharSet that matches the character at ``codepoint``.

        When ``self.case_insensitive`` is set, the returned set also holds the
        ``str.lower()`` and ``str.upper()`` variants of that character;
        otherwise it holds the character alone.

        :param codepoint: Unicode codepoint of the character to match.
        """
        literal = chr(codepoint)

        # Case-sensitive matching: the character only stands for itself
        if not self.case_insensitive:
            return CharSet(literal)

        return CharSet(literal, literal.lower(), literal.upper())
Пример #2
0
    def _parse_range(self, stream: SequenceReader) -> RegexpCollection.Parser:
        """
        Parse a regular expression for a character range.

        The stream must be positioned on the opening ``[``. An optional ``^``
        right after it negates the resulting set. Items are then read up to
        the closing ``]``: each one is either a single character (possibly
        written as an escape sequence) or a ``low-high`` range. When
        ``self.case_insensitive`` is set, the lowercase and uppercase variants
        of every character covered by the ranges are added to the set.

        Malformed input (dangling dash, missing closing bracket) is reported
        through ``check_source_language``.

        :param stream: Input regexp stream.
        :rtype: RegexpCollection.Parser
        """
        # Consume the opening bracket outside of the assertion: asserted
        # expressions are not evaluated when Python runs with -O, which would
        # leave the bracket unread.
        opening = stream.read()
        assert opening == '['
        ranges: List[Tuple[int, int]] = []

        # First, determine if this range must be negated
        negate = False
        if stream.next_is('^'):
            negate = True
            stream.read()

        # Now, read ranges...
        #
        # TODO: handle '-' and ']' in first position.
        #
        # ``in_range`` is true between a dash and the character that closes
        # the range it introduces.
        in_range = False
        while not stream.eof and not stream.next_is(']'):
            if stream.next_is('-'):
                # A dash needs a previous character to start the range from,
                # and cannot directly follow another dash.
                check_source_language(bool(ranges and not in_range),
                                      'dangling dash')
                in_range = True
                stream.read()
            else:
                codepoint = (self._read_escape(stream)
                             if stream.next_is('\\') else ord(stream.read()))
                if in_range:
                    # Turn the previously read single character into the lower
                    # bound of a range. If the previous item is already a
                    # complete range (as in "[a-b-c]"), the dash has nothing
                    # to start from: this is a user error, so report it as a
                    # diagnostic rather than through an assertion.
                    low, high = ranges.pop()
                    check_source_language(low == high, 'dangling dash')
                    ranges.append((low, codepoint))
                else:
                    ranges.append((codepoint, codepoint))
                in_range = False

        check_source_language(not in_range, 'dangling dash')
        check_source_language(stream.next_is(']'), 'unbalanced square bracket')

        # Same as for the opening bracket: keep the read out of the assertion
        closing = stream.read()
        assert closing == ']'

        # In case insensitivity is enabled, make sure both lowercase and
        # uppercase variants of all characters in ranges are present.
        if self.case_insensitive:
            char_set = CharSet()
            for low, high in ranges:
                for codepoint in range(low, high + 1):
                    char = chr(codepoint)
                    # NOTE(review): str.lower()/str.upper() can return several
                    # characters for some codepoints; confirm that
                    # CharSet.add copes with such strings.
                    for c in (char, char.lower(), char.upper()):
                        char_set.add(c)
        else:
            char_set = CharSet.from_int_ranges(*ranges)

        if negate:
            char_set = char_set.negation
        return self.Range(char_set)
Пример #3
0
    def _parse_range(cls, stream):
        """
        Parse a regular expression for a character range.

        The stream must be positioned on the opening ``[``. An optional ``^``
        right after it negates the resulting set. Items are then read up to
        the closing ``]``: each one is either a single character (possibly
        written as an escape sequence) or a ``low-high`` range.

        Malformed input (dangling dash, missing closing bracket) is reported
        through ``check_source_language``.

        :param stream: Input regexp stream.
        :rtype: RegexpCollection.Parser
        """
        # Consume the opening bracket outside of the assertion: asserted
        # expressions are not evaluated when Python runs with -O, which would
        # leave the bracket unread.
        opening = stream.read()
        assert opening == '['
        ranges = []

        # First, determine if this range must be negated
        negate = False
        if stream.next_is('^'):
            negate = True
            stream.read()

        # Now, read ranges...
        #
        # TODO: handle '-' and ']' in first position.
        #
        # ``in_range`` is true between a dash and the character that closes
        # the range it introduces.
        in_range = False
        while not stream.eof and not stream.next_is(']'):
            if stream.next_is('-'):
                # A dash needs a previous character to start the range from,
                # and cannot directly follow another dash. Pass a real boolean
                # rather than a list as the predicate.
                check_source_language(bool(ranges and not in_range),
                                      'dangling dash')
                in_range = True
                stream.read()
            else:
                char = (cls._read_escape(stream)
                        if stream.next_is('\\') else ord(stream.read()))
                if in_range:
                    # Turn the previously read single character into the lower
                    # bound of a range. If the previous item is already a
                    # complete range (as in "[a-b-c]"), the dash has nothing
                    # to start from: this is a user error, so report it as a
                    # diagnostic rather than through an assertion.
                    low, high = ranges.pop()
                    check_source_language(low == high, 'dangling dash')
                    ranges.append((low, char))
                else:
                    ranges.append((char, char))
                in_range = False

        check_source_language(not in_range, 'dangling dash')
        check_source_language(stream.next_is(']'), 'unbalanced square bracket')

        # Same as for the opening bracket: keep the read out of the assertion
        closing = stream.read()
        assert closing == ']'

        char_set = CharSet.from_int_ranges(*ranges)
        if negate:
            char_set = char_set.negation
        return cls.Range(char_set)
Пример #4
0
    def add_transition(self, chars: CharSet, next_state: DFAState) -> None:
        """
        Register a new outgoing transition on this state.

        :param chars: Set of input characters that trigger the transition:
            reading any one character from this set moves to ``next_state``.
        :param next_state: State reached when the transition is taken.
        """
        assert isinstance(chars, CharSet)
        assert isinstance(next_state, DFAState)

        # The new character set must not overlap with the character set of
        # any transition already registered on this state.
        for registered_chars, _target in self.transitions:
            assert not chars.overlaps_with(registered_chars), (
                'Overlapping input char sets: {} and {}'.format(
                    chars, registered_chars))

        new_transition = (chars, next_state)
        self.transitions.append(new_transition)
Пример #5
0
from langkit.lexer.char_set import CharSet


def check_ranges(label, cs):
    """
    Print the ranges of ``cs`` under a ``== label ==`` header.

    Every range is written as ``low-high``, ranges are separated by single
    spaces, and the listing is followed by an empty line.

    :param label: Title of the printed section.
    :param cs: Object whose ``ranges`` attribute yields ``(low, high)`` pairs
        of integer codepoints.
    """
    def format_char(char):
        # Visible ASCII characters (space excluded) are printed as is; any
        # other codepoint is printed as a \U+XXXX hexadecimal escape.
        if ord(' ') < char <= ord('~'):
            return chr(char)
        return '\\U+{:04X}'.format(char)

    print('== {} =='.format(label))
    formatted = ['{}-{}'.format(format_char(low), format_char(high))
                 for low, high in cs.ranges]
    print(' '.join(formatted))
    print('')


# Each entry pairs a section label with the positional arguments passed to the
# CharSet constructor: single characters and/or (low, high) character pairs.
# Sets are built one at a time, right before being printed.
for label, members in [
    ('Single', ('a',)),
    ('Adjacent 2 singles', ('a', 'b')),
    ('Non-adjacent 2 singles', ('a', 'c')),
    ('Reverted non-adjacent 2 singles', ('c', 'a')),
    ('Adjacent 3 singles', ('a', 'c', 'b')),
    ('Empty range', (('a', 'c'), ('d', 'c'))),
    ('Redundant single', (('a', 'c'), 'b')),
    ('Non-adjacent ranges', (('i', 'o'), ('a', 'c'))),
]:
    check_ranges(label, CharSet(*members))

# Move the upper bound of the second range towards, onto and past the lower
# bound of the first one.
for c in ('h', 'i', 'j', 'k'):
    second_range = ('a', c)
    check_ranges('Adjacent ranges - {}'.format(c),
                 CharSet(('i', 'o'), second_range))

for label, members in [
    ('Overlapping ranges (1)', (('i', 'o'), ('a', 'o'))),
    ('Nested range', (('i', 'o'), ('a', 'p'))),
]:
    check_ranges(label, CharSet(*members))
Пример #6
0
    def _parse_sequence(cls, stream):
        """
        Parse a sequence of regexps. Stop at the first unmatched parenthesis or
        at the first top-level pipe character.

        Each item of the sequence is one of: a parenthesized group, a
        character range (``[...]``), a reference to a named pattern
        (``{name}``), a repetition operator applied to the previous item
        (``*``, ``+``, ``?``), the ``.`` wildcard, an escape sequence
        (including ``\\p{...}``/``\\P{...}`` Unicode categories) or a literal
        character. Invalid input is reported through
        ``check_source_language``.

        :param stream: Input regexp stream.
        :rtype: RegexpCollection.Parser
        """
        subparsers = []
        while True:
            if stream.eof or stream.next_is('|', ')'):
                # Leave the delimiter in the stream: the caller handles it
                break

            elif stream.next_is('('):
                # Nested group: recursively parse alternatives
                stream.read()
                subparsers.append(cls._parse_or(stream))
                check_source_language(stream.next_is(')'),
                                      'unbalanced parenthesis')
                stream.read()

            elif stream.next_is('['):
                # Parse a range of characters
                subparsers.append(cls._parse_range(stream))

            elif stream.next_is('{'):
                # Parse a reference to a named pattern
                stream.read()
                name = ''
                while not stream.eof and not stream.next_is('}'):
                    name += stream.read()
                check_source_language(stream.next_is('}'),
                                      'unbalanced bracket')
                stream.read()

                # Pass a real boolean (not a match object) as the predicate
                check_source_language(rule_name_re.match(name) is not None,
                                      'invalid rule name: {}'.format(name))
                subparsers.append(cls.Defer(name))

            elif stream.next_is('*', '+', '?'):
                # Repeat the previous sequence item. Pass a real boolean (not
                # the list itself) as the predicate.
                check_source_language(bool(subparsers), 'nothing to repeat')
                check_source_language(
                    not isinstance(subparsers[-1], cls.Repeat),
                    'multiple repeat')

                # Reading the operator both consumes it and selects how the
                # previous item gets wrapped.
                wrapper = {
                    '*': lambda p: cls.Repeat(p),
                    '+': lambda p: cls.Sequence([p, cls.Repeat(p)]),
                    '?': lambda p: cls.Opt(p)
                }[stream.read()]
                subparsers[-1] = wrapper(subparsers[-1])

            elif stream.next_is('.'):
                # Generally, "." designates any character *except* newlines. Do
                # the same here.
                stream.read()
                subparsers.append(cls.Range(CharSet('\n').negation))

            elif stream.next_is('^', '$'):
                # NOTE(review): the anchor is not consumed here, so this
                # relies on check_source_language aborting the parsing when
                # its predicate is false -- confirm.
                check_source_language(
                    False, 'matching beginning or ending is unsupported')

            elif stream.next_is('\\'):
                # Parse an escape sequence. It can be a Unicode character, a
                # Unicode property or a simple escape sequence.
                stream.read()

                # \p and \P refer to character sets from Unicode general
                # categories.
                if stream.next_is('p', 'P'):
                    action = stream.read()

                    # Read the category name, which must appear between curly
                    # brackets.
                    category = ''
                    check_source_language(
                        stream.next_is('{'),
                        'incomplete Unicode category matcher')
                    stream.read()
                    while not stream.eof and not stream.next_is('}'):
                        category += stream.read()
                    check_source_language(
                        stream.next_is('}'),
                        'incomplete Unicode category matcher')
                    stream.read()

                    try:
                        char_set = CharSet.for_category(category)
                    except KeyError:
                        check_source_language(
                            False,
                            'invalid Unicode category: {}'.format(category))

                    # \P matches everything that is *not* in the category
                    if action == 'P':
                        char_set = char_set.negation
                    subparsers.append(cls.Range(char_set))

                else:
                    # Not a category matcher: un-read the backslash and let
                    # the escape reader process the whole sequence.
                    stream.go_back()
                    subparsers.append(
                        cls.Range(CharSet.from_int(cls._read_escape(stream))))

            else:
                # Any other character matches itself
                subparsers.append(cls.Range(CharSet(stream.read())))

        return cls.Sequence(subparsers)
Пример #7
0
    def _parse_sequence(self,
                        stream: SequenceReader) -> RegexpCollection.Parser:
        """
        Parse a sequence of regexps. Stop at the first unmatched parenthesis or
        at the first top-level pipe character.

        Each item of the sequence is one of: a parenthesized group, a
        character range (``[...]``), a reference to a named pattern
        (``{name}``), a repetition operator applied to the previous item
        (``*``, ``+``, ``?``), the ``.`` wildcard, an escape sequence
        (including ``\\p{...}``/``\\P{...}`` Unicode categories) or a literal
        character. Invalid input is reported through
        ``check_source_language``.

        :param stream: Input regexp stream.
        """
        subparsers = []
        while True:
            if stream.eof or stream.next_is('|', ')'):
                # Leave the delimiter in the stream: the caller handles it
                break

            elif stream.next_is('('):
                # Nested group: recursively parse alternatives
                stream.read()
                subparsers.append(self._parse_or(stream))
                check_source_language(stream.next_is(')'),
                                      'unbalanced parenthesis')
                stream.read()

            elif stream.next_is('['):
                # Parse a range of characters
                subparsers.append(self._parse_range(stream))

            elif stream.next_is('{'):
                # Parse a reference to a named pattern
                stream.read()
                name = ''
                while not stream.eof and not stream.next_is('}'):
                    name += stream.read()
                check_source_language(stream.next_is('}'),
                                      'unbalanced bracket')
                stream.read()
                check_source_language(
                    rule_name_re.match(name) is not None,
                    'invalid rule name: {}'.format(name))
                subparsers.append(self.Defer(name))

            elif stream.next_is('*', '+', '?'):
                # Repeat the previous sequence item
                check_source_language(bool(subparsers), 'nothing to repeat')
                check_source_language(
                    not isinstance(subparsers[-1], self.Repeat),
                    'multiple repeat')

                # Reading the operator both consumes it and selects how the
                # previous item gets wrapped.
                wrapper = {
                    '*': lambda p: self.Repeat(p),
                    '+': lambda p: self.Sequence([p, self.Repeat(p)]),
                    '?': lambda p: self.Opt(p)
                }[stream.read()]
                subparsers[-1] = wrapper(subparsers[-1])

            elif stream.next_is('.'):
                # Generally, "." designates any character *except* newlines. Do
                # the same here.
                stream.read()
                subparsers.append(self.Range(CharSet('\n').negation))

            elif stream.next_is('^', '$'):
                # NOTE(review): the anchor is not consumed here, so this
                # relies on check_source_language aborting the parsing when
                # its predicate is false -- confirm.
                check_source_language(
                    False, 'matching beginning or ending is unsupported')

            elif stream.next_is('\\'):
                # Parse an escape sequence. It can be a Unicode character, a
                # Unicode property or a simple escape sequence.
                stream.read()

                # \p and \P refer to character sets from Unicode general
                # categories.
                if stream.next_is('p', 'P'):
                    action = stream.read()

                    # Read the category name, which must appear between curly
                    # brackets.
                    category = ''
                    check_source_language(
                        stream.next_is('{'),
                        'incomplete Unicode category matcher')
                    stream.read()
                    while not stream.eof and not stream.next_is('}'):
                        category += stream.read()
                    check_source_language(
                        stream.next_is('}'),
                        'incomplete Unicode category matcher')
                    stream.read()

                    # If case insensitivity is enabled, the presence of either
                    # the Ll, Lu or Lt categories automatically enable the
                    # presence of the others.
                    #
                    # This is because X.upper() can turn codepoints from Ll or
                    # Lt into codepoints from Lu and X.lower() can turn
                    # codepoints from Lu or Lt into codepoints from Ll.
                    #
                    # In case-sensitive mode, each category must only match
                    # its own codepoints, hence the check on
                    # ``self.case_insensitive``.
                    if (self.case_insensitive
                            and category in ("Ll", "Lu", "Lt")):
                        char_set = (CharSet.for_category("Ll")
                                    | CharSet.for_category("Lu")
                                    | CharSet.for_category("Lt"))
                    else:
                        try:
                            char_set = CharSet.for_category(category)
                        except KeyError:
                            check_source_language(
                                False, f'invalid Unicode category: {category}')

                    # \P matches everything that is *not* in the category
                    if action == 'P':
                        char_set = char_set.negation
                    subparsers.append(self.Range(char_set))

                else:
                    # Not a category matcher: un-read the backslash and let
                    # the escape parser process the whole sequence.
                    stream.go_back()
                    subparsers.append(self.Range(self._parse_escape(stream)))

            else:
                # Any other character matches itself (and its case variants
                # when case insensitivity is enabled).
                char_set = self._char_set_for(ord(stream.read()))
                subparsers.append(self.Range(char_set))

        return self.Sequence(subparsers)