def test_explicit(self):
    #basicConfig(level=DEBUG)
    number = Token(Digit())
    letter = Token(Letter())
    block = Delayed()
    line = Or(Line(number),
              Line(letter) & block) > list
    block += Block(line[1:])
    program = Trace(line[1:])
    text = '''1
2
a
 3
 b
  4
  5
 6
'''
    program.config.lines(block_policy=explicit)
    parser = program.get_parse_string()
    result = parser(text)
    assert result == [['1'], ['2'], ['a', ['3'], ['b', ['4'], ['5']], ['6']]], result

def test_bline(self):
    '''
    Test a simple example: letters introduce numbers in an indented block.
    '''
    #basicConfig(level=DEBUG)
    number = Token(Digit())
    letter = Token(Letter())
    # the simplest whitespace grammar i can think of - lines are either
    # numbers (which are single, simple statements) or letters (which
    # mark the start of a new, indented block).
    block = Delayed()
    line = Or(BLine(number),
              BLine(letter) & block) > list
    # and a block is simply a collection of lines, as above
    block += Block(line[1:])
    program = Trace(line[1:])
    text = '''1
2
a
 3
 b
  4
  5
 6
'''
    program.config.default_line_aware(block_policy=1)
    parser = program.get_parse_string()
    result = parser(text)
    assert result == [['1'], ['2'], ['a', ['3'], ['b', ['4'], ['5']], ['6']]], result

def test_literal(self):
    '''
    Simple literal should compile directly.
    '''
    token = Token(Literal('abc'))
    token.compile()
    assert token.regexp == 'abc', repr(token.regexp)

def test_continued_explicit(self):
    number = Token(Digit())
    letter = Token(Letter())
    block = Delayed()
    bline = ContinuedLineFactory(r'x')
    line = Or(bline(number),
              bline(letter) & block) > list
    block += Block(line[1:])
    program = Trace(line[1:])
    text = '''1
2
a
 3
 b
  4
  5
 6
'''
    program.config.lines(block_policy=explicit)
    parser = program.get_parse_string()
    result = parser(text)
    assert result == [['1'], ['2'], ['a', ['3'], ['b', ['4'], ['5']], ['6']]], result

def test_string_arg(self):
    '''
    Skip anything (not just spaces).
    '''
    words = Token('[a-z]+')[:]
    words.config.lexer(discard='.')
    parser = words.get_parse()
    results = parser('abc defXghi')
    assert results == ['abc', 'def', 'ghi'], results

def test_real(self):
    '''
    A real is more complex, but still compiles.
    '''
    token = Token(Real(exponent='Ee'))
    token.compile()
    assert token.regexp == \
        '(?:[\\+\\-])?(?:(?:[0-9](?:[0-9])*)?\\.[0-9](?:[0-9])*|' \
        '[0-9](?:[0-9])*(?:\\.)?)(?:[Ee](?:[\\+\\-])?[0-9](?:[0-9])*)?', \
        repr(token.regexp)

def test_impossible(self):
    '''
    Cannot compile arbitrary functions.
    '''
    try:
        token = Token(Real() > (lambda x: x))
        token.compile()
        assert False, 'Expected error'
    except LexerError:
        pass

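# The usual fix -- a sketch based on an assumption, not taken from this
# suite -- is to apply the function *outside* the Token, so the lexer only
# has to compile Real() itself.
def example_compile_outside_token(self):
    '''
    Hypothetical companion to test_impossible: the transformation is
    applied to the Token's results, after lexing.
    '''
    token = Token(Real())
    token.compile()                  # succeeds: the regexp is Real() alone
    fixed = token > (lambda x: x)    # function applied after lexing
    return fixed
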
def test_good_error_msg(self):
    '''
    Better error message with streams.
    '''
    #basicConfig(level=DEBUG)
    words = Token('[a-z]+')[:]
    words.config.lexer()
    parser = words.get_parse_string()
    try:
        parser('abc defXghi')
        assert False, 'expected error'
    except RuntimeLexerError as err:
        assert str(err) == "No token for 'Xghi' at line 1, character 8 " \
            "of 'abc defXghi'.", str(err)

def test_bad_error_msg(self):
    '''
    An ugly error message.
    '''
    #basicConfig(level=DEBUG)
    words = Token('[a-z]+')[:]
    words.config.lexer()
    parser = words.get_parse_sequence()
    try:
        parser('abc defXghi')
        assert False, 'expected error'
    except RuntimeLexerError as err:
        assert str(err) == "No token for 'Xghi' at offset 7, value 'X' " \
            "of 'abc defXghi'.", str(err)

def test_bad_space(self):
    '''
    An unexpected character fails to match.
    '''
    token = Token('a')
    token.config.clear().lexer(discard='b')
    parser = token.get_parse()
    assert parser('a') == ['a'], parser('a')
    assert parser('b') == None, parser('b')
    try:
        parser('c')
        assert False, 'expected failure'
    except RuntimeLexerError as err:
        assert str(err) == "No token for 'c' at line 1, character 1 of 'c'.", str(err)

def test_file(self):
    if version[0] == '3':
        f = TemporaryFile('w+', encoding='utf8')
    else:
        f = TemporaryFile('w+')
    print("hello world\n", file=f)
    f.flush()
    #f.seek(0)
    #print(f.readlines())
    f.seek(0)
    w = Token('[a-z]+')
    s = Token(' +')
    v = w & s & w
    v.parse_iterable(f)

def test_invert_bug_4(self):
    #basicConfig(level=DEBUG)
    bad = BLine(Token('[^a]*'))
    bad.config.line_aware(block_policy=2).left_memoize()
    parser = bad.get_parse_string()
    result = parser('123')
    assert result == ['123'], result

def test_mixed(self):
    '''
    Cannot mix tokens and non-tokens at same level.
    '''
    bad = Token(Any()) & Any()
    try:
        bad.get_parse()
        assert False, 'expected failure'
    except LexerError as err:
        assert str(err) == 'The grammar contains a mix of Tokens and ' \
            'non-Token matchers at the top level. If ' \
            'Tokens are used then non-token matchers ' \
            'that consume input must only appear "inside" ' \
            'Tokens. The non-Token matchers include: ' \
            'Any(None).', str(err)

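# For contrast, a sketch (not from the original suite) of the structure the
# error message asks for: every matcher that consumes input is wrapped in a
# Token, so only tokens appear at the top level.
def example_tokens_only(self):
    '''
    Hypothetical companion to test_mixed: all input-consuming matchers
    are Tokens, so the lexer is built without raising LexerError.
    '''
    good = Token(Letter()) & Token(Digit())
    good.get_parse()
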
def test_line(self):
    #basicConfig(level=DEBUG)
    text = Token('[^\n\r]+')
    quoted = Regexp("'[^']'")
    line = BLine(text(quoted))
    line.config.default_line_aware(block_start=0)
    parser = line.get_parse_string()
    assert parser("'a'") == ["'a'"]

def test_incomplete(self):
    '''
    A token is not completely consumed (this doesn't raise error
    messages, it just fails to match).
    '''
    token = Token('[a-z]+')(Any())
    token.config.no_full_first_match()
    parser = token.get_parse_string()
    assert parser('a') == ['a'], parser('a')
    # even though this matches the token, the Any() sub-matcher doesn't
    # consume all the contents
    assert parser('ab') == None, parser('ab')
    token = Token('[a-z]+')(Any(), complete=False)
    token.config.no_full_first_match()
    parser = token.get_parse_string()
    assert parser('a') == ['a'], parser('a')
    # whereas this is fine, since complete=False
    assert parser('ab') == ['a'], parser('ab')

def test_invert_bug_6(self):
    #basicConfig(level=DEBUG)
    bad = BLine(Token(str('[^(*SOL)(*EOL)a]*')))
    bad.config.default_line_aware(block_policy=2,
                                  parser_factory=make_str_parser)
    bad.config.trace(True)
    parser = bad.get_parse_string()
    result = parser(str('123'))
    assert result == [str('123')], result

def simple_grammar(self):
    '''
    Test a simple example: letters introduce numbers in an indented block.
    '''
    #basicConfig(level=DEBUG)
    number = Token(Digit())
    letter = Token(Letter())
    # the simplest whitespace grammar i can think of - lines are either
    # numbers (which are single, simple statements) or letters (which
    # mark the start of a new, indented block).
    block = Delayed()
    line = Or(Line(number),
              Line(letter) & block) > list
    # and a block is simply a collection of lines, as above
    block += Block(line[1:])
    program = Trace(line[1:])
    program.config.lines(block_policy=1)
    return program

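# A hypothetical caller (not in the original file), assuming the same text
# layout as test_bline above and that simple_grammar() is available on the
# same class.
def example_simple_grammar(self):
    '''
    Sketch: parse the indented program from test_bline with the grammar
    returned by simple_grammar().
    '''
    parser = self.simple_grammar().get_parse_string()
    result = parser('1\n2\na\n 3\n b\n  4\n  5\n 6\n')
    assert result == [['1'], ['2'], ['a', ['3'], ['b', ['4'], ['5']], ['6']]], result
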
def ContinuedLineFactory(matcher):
    '''
    Create a replacement for ``Line()`` that can match multiple lines
    if they end in the given character/matcher.
    '''
    matcher = coerce_(matcher, lambda regexp: Token(regexp))
    restricted = RestrictTokensBy(matcher, LineEnd(), LineStart())
    def factory(matcher, indent=True):
        return restricted(Line(matcher, indent=indent))
    return factory

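# A minimal usage sketch; the backslash continuation character below is an
# assumption for illustration (test_continued_explicit above passes r'x'
# instead).
def example_continued_line():
    '''
    Hypothetical: build a Line() replacement where a trailing backslash
    joins a physical line to the next, as in Python source.
    '''
    CLine = ContinuedLineFactory(r'\\')
    return CLine(Token(Digit())[1:]) > list
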
def test_offset(self):
    #basicConfig(level=DEBUG)
    text = Token('[^\n\r]+')
    line = BLine(text(~Literal('aa') & Regexp('.*')))
    line.config.default_line_aware(block_start=0)
    parser = line.get_parse_string()
    assert parser('aabc') == ['bc']
    # what happens with an empty match?
    check = ~Literal('aa') & Regexp('.*')
    check.config.no_full_first_match()
    assert check.parse('aa') == ['']
    assert parser('aa') == ['']

def test_bad_config(self):
    #basicConfig(level=DEBUG)
    text = Token('[^\n\r]+')
    quoted = Regexp("'[^']'")
    line = BLine(text(quoted))
    line.config.default_line_aware()
    parser = line.get_parse_string()
    try:
        parser("'a'")
        assert False, 'Expected error'
    except OffsideError as error:
        assert str(error).startswith('No initial indentation has been set.')

def test_indent(self):
    '''
    Test simple matches against leading spaces.
    '''
    #basicConfig(level=DEBUG)
    text = '''
left
 four'''
    word = Token(Word(Letter()))
    indent = LineStart()
    line1 = indent('') + LineEnd()
    line2 = indent('') & word('left') + LineEnd()
    line3 = indent(' ') & word('four') + LineEnd()
    expr = (line1 & line2 & line3)
    expr.config.lines(block_start=NO_BLOCKS)
    parser = expr.get_parse_string()
    result = parser(text)
    assert result == ['', '', 'left', ' ', 'four'], result

def test_tabs(self):
    '''
    Use block_policy here so that the regexp parser that excludes SOL
    and EOL is used; otherwise Any()[:] matches those and we end up
    with a single monster token.
    '''
    line = Indent() & Token(Any()) & Eol()
    line.config.default_line_aware(tabsize=8, block_policy=0).trace(True)
    result = line.parse('a')
    assert result == ['', 'a', ''], result
    result = line.parse('\ta')
    assert result == ['        ', 'a', ''], result
    line.config.default_line_aware(tabsize=None, block_policy=0)
    result = line.parse('\ta')
    assert result == ['\t', 'a', ''], result
    line.config.default_line_aware(block_policy=0)
    result = line.parse('\ta')
    assert result == ['        ', 'a', ''], result

def test_indent(self):
    '''
    Test simple matches against leading spaces.
    '''
    #basicConfig(level=DEBUG)
    text = '''
left
 four'''
    word = Token(Word(Letter()))
    indent = Indent()
    line1 = indent('') + Eol()
    line2 = indent('') & word('left') + Eol()
    line3 = indent(' ') & word('four') + Eol()
    expr = (line1 & line2 & line3)
    expr.config.default_line_aware()
    parser = expr.get_parse_string()
    result = parser(text)
    assert result == ['', '', 'left', ' ', 'four'], result

def test_indent(self):
    '''
    Test simple matches against leading spaces.
    '''
    #basicConfig(level=DEBUG)
    text = '''
 onespace
\tspaceandtab'''
    word = Token(Word(Letter()))
    indent = LineStart()
    line1 = indent('') & ~LineEnd()
    line2 = indent(' ') & word('onespace') & ~LineEnd()
    line3 = indent('    ') & word('spaceandtab') & ~LineEnd()
    expr = line1 & line2 & line3
    expr.config.lines(tabsize=4, block_start=NO_BLOCKS).trace_stack(True)
    parser = expr.get_parse_string()
    result = parser(text)
    #print(result)
    assert result == ['', ' ', 'onespace', '    ', 'spaceandtab'], result

def test_indent(self):
    '''
    Test simple matches against leading spaces.
    '''
    #basicConfig(level=DEBUG)
    text = '''
 onespace
\tspaceandtab'''
    word = Token(Word(Letter()))
    indent = Indent()
    line1 = indent('') & ~Eol()
    line2 = indent(' ') & word('onespace') & ~Eol()
    line3 = indent('    ') & word('spaceandtab') & ~Eol()
    expr = line1 & line2 & line3
    expr.config.default_line_aware(tabsize=4).trace(True)
    parser = expr.get_parse_string()
    result = parser(text)
    #print(result)
    assert result == ['', ' ', 'onespace', '    ', 'spaceandtab'], result

def test_default(self):
    w = Token('[a-z]+')
    s = Token(' +')
    v = w & s & w
    v.parse_string("hello world\n")

def test_bug(self):
    #basicConfig(level=DEBUG)
    t = Token(Word())(Any()[2] & Eos())
    t.match("ab cd")