def __init__(self, state, parent):
    '''
    Create a builder holding an empty character set.

    `state` is handed to the base class initialiser and also supplies the
    alphabet for the (initially empty) `Character` node; `parent` is
    remembered on the instance.
    '''
    super(CharacterBuilder, self).__init__(state)
    self._parent = parent
    empty_set = Character([], alphabet=state.alphabet)
    self._charset = empty_set
    # Neither the inversion flag nor the pending character is known yet.
    self._invert = self._queue = None
    # No '-' range is in progress.
    self._range = False
def append_character(self, character, escaped=False):
    '''
    Consume one character of the pattern and return the builder that
    should receive the next one.

    Unescaped special characters either hand over to a nested builder
    (backslash, '{', '(', '['), add a node to the current sequence
    ('.', '^', '$'), start a new alternative ('|'), or wrap the most
    recent node in a repeat ('+', '?', '*' when the sequence is not
    empty).  Any other non-empty character that is escaped, or that the
    parser state reports as significant, is added as a literal.

    Returns the nested builder when one is created, otherwise `self`.
    '''
    if not escaped:
        # Characters that delegate to a nested builder.
        if character == '\\':
            return ComplexEscapeBuilder(self._state, self)
        if character == '{':
            return CountBuilder(self._state, self)
        if character == '(':
            return GroupEscapeBuilder(self._state, self)
        if character == '[':
            return CharacterBuilder(self._state, self)

        # Characters that are handled in place.
        if character == '.':
            self._sequence.append(
                Dot(self._state.flags & ParserState.DOTALL))
            return self
        if character == '^':
            self._sequence.append(
                StartOfLine(self._state.flags & ParserState.MULTILINE))
            return self
        if character == '$':
            self._sequence.append(
                EndOfLine(self._state.flags & ParserState.MULTILINE))
            return self
        if character == '|':
            self.__start_new_alternative()
            return self

        # A repeat applies to the most recent node, which is removed from
        # the sequence and handed to the repeat builder.
        if character and self._sequence and character in '+?*':
            return RepeatBuilder(self._state, self,
                                 self._sequence.pop(), character)

    if character and (escaped or self._state.significant(character)):
        is_pair, value = self._state.alphabet.unpack(character,
                                                     self._state.flags)
        if is_pair:
            # The alphabet supplied two values: match either of them.
            first, second = value[0], value[1]
            node = Character([(first, first), (second, second)],
                             self._state.alphabet)
        else:
            node = String(value)
        self._sequence.append(node)

    return self
def test_contains(self):
    '''
    Check membership of 'a', 'b' and 'c' in character sets built from
    single, adjacent, overlapping and disjoint intervals (Ascii alphabet).
    '''
    # Each entry: (intervals, letters expected to be members).
    cases = [
        ([('b', 'b')], 'b'),
        ([('a', 'b')], 'ab'),
        ([('a', 'c')], 'abc'),
        ([('a', 'b'), ('b', 'c')], 'abc'),
        ([('a', 'a'), ('c', 'c')], 'ac'),
    ]
    for intervals, members in cases:
        for letter in 'abc':
            # A fresh set (and a fresh interval list) for every assertion.
            charset = Character(list(intervals), Ascii())
            if letter in members:
                assert letter in charset
            else:
                assert letter not in charset
def do_test_str(self, intervals, target):
    '''
    Assert that a `Character` built from `intervals` over the Ascii
    alphabet has `target` as its string form; the actual string is used
    as the failure message.
    '''
    charset = Character(intervals, Ascii())
    text = str(charset)
    assert text == target, text
class CharacterBuilder(Builder):
    '''
    Parse a character range - expressions of the form [...].
    These can include character classes (\\s for example), which we handle
    in the alphabet as functions rather than character code ranges, so the
    final graph node can be quite complex.

    Characters are supplied one at a time through `append_character`.
    A literal is held in `_queue` until the next character shows whether
    it starts a range; `_range` records that an unescaped '-' has been
    seen; `_invert` stays None until the first character has been
    processed.
    '''

    def __init__(self, state, parent):
        '''
        Create a builder holding an empty character set.

        `state` is handed to the base class initialiser and supplies the
        alphabet for the `Character` node; `parent` is the builder whose
        sequence receives the finished set when ']' is reached.
        '''
        super(CharacterBuilder, self).__init__(state)
        self._parent = parent
        self._charset = Character([], alphabet=state.alphabet)
        self._invert = None
        self._queue = None
        self._range = False

    def append_character(self, character, escaped=False):
        '''
        Consume one character of the set.

        Returns a `SimpleEscapeBuilder` for an unescaped backslash, the
        parent builder once ']' closes the set (after the simplified set
        has been appended to the parent's sequence), and `self` otherwise.

        Raises `RxpyException` for a range that has no start
        ('Incomplete range') and for any character not handled by the
        branches below, such as an empty or missing character
        ('Syntax error in character set').
        '''

        def unpack(character):
            # The alphabet returns (is_charset, value); when the character
            # is not reported as a charset it stands for both of the pair.
            (is_charset, value) = self._state.alphabet.unpack(
                character, self._state.flags)
            if not is_charset:
                value = (character, character)
            return value

        def append(character=character):
            if self._range:
                if self._queue is None:
                    raise RxpyException('Incomplete range')
                else:
                    # Close the range: queued character .. this character,
                    # once for each element of the unpacked pairs.
                    (alo, ahi) = unpack(self._queue)
                    (blo, bhi) = unpack(character)
                    self._charset.append_interval((alo, blo))
                    self._charset.append_interval((ahi, bhi))
                    self._queue = None
                    self._range = False
            else:
                # Commit the previously queued literal, then hold the new
                # one back in case it turns out to start a range.
                if self._queue:
                    (lo, hi) = unpack(self._queue)
                    self._charset.append_interval((lo, lo))
                    self._charset.append_interval((hi, hi))
                self._queue = character

        # Only an unescaped '^' in first position negates the set; an
        # escaped caret is an ordinary literal and is handled below.
        if self._invert is None and not escaped and character == '^':
            self._invert = True
        elif not escaped and character == '\\':
            return SimpleEscapeBuilder(self._state, self)
        elif escaped and character in 'dD':
            self._charset.append_class(self._state.alphabet.digit,
                                       character, character == 'D')
        elif escaped and character in 'wW':
            self._charset.append_class(self._state.alphabet.word,
                                       character, character == 'W')
        elif escaped and character in 'sS':
            self._charset.append_class(self._state.alphabet.space,
                                       character, character == 'S')
        # not charset allows first character to be unescaped - or ]
        elif character and \
                ((not self._charset and not self._queue)
                 or escaped or character not in "-]"):
            append()
        elif character == '-':
            if self._range:
                # repeated - is range to -?
                append()
            else:
                self._range = True
        elif character == ']':
            if self._queue:
                if self._range:
                    self._range = False
                    # convert open range to '-'
                    append('-')
                # Flush whatever literal is still queued.
                append(None)
            if self._invert:
                self._charset.invert()
            self._parent._sequence.append(self._charset.simplify())
            return self._parent
        else:
            raise RxpyException('Syntax error in character set')

        # after first character this must be known
        if self._invert is None:
            self._invert = False

        return self
def test_contains(self):
    '''
    Check membership of 0, 1 and 2 in character sets built from single,
    adjacent, overlapping and disjoint intervals (Digits alphabet).
    '''
    # Each entry: (intervals, digits expected to be members).
    cases = [
        ([('1', '1')], (1,)),
        ([('0', '1')], (0, 1)),
        ([('0', '2')], (0, 1, 2)),
        ([('0', '1'), ('1', '2')], (0, 1, 2)),
        ([('0', '0'), ('2', '2')], (0, 2)),
    ]
    for intervals, members in cases:
        for digit in (0, 1, 2):
            # A fresh set (and a fresh interval list) for every assertion.
            charset = Character(list(intervals), Digits())
            if digit in members:
                assert digit in charset
            else:
                assert digit not in charset
def do_test_str(self, intervals, target):
    '''
    Assert that a `Character` built from `intervals` over the Digits
    alphabet has `target` as its string form; the actual string is used
    as the failure message.
    '''
    charset = Character(intervals, alphabet=Digits())
    text = str(charset)
    assert text == target, text