def __init__(self, code):
    """Store *code* and register every token rule for the language lexer.

    Bug fix: the PLUS/MINUS patterns were swapped (PLUS matched '-' and
    MINUS matched '+'); each now matches its own operator.
    """
    self.code = code
    self.lg = LexerGenerator()
    # Runs of whitespace between tokens are skipped entirely.
    self.lg.ignore(r'\s+')
    self.lg.add('COMMENT', r';')
    # Greedy string patterns: these match to the LAST quote on the line.
    # NOTE(review): two strings on one line would merge -- confirm whether
    # r'"[^"]*"' is wanted instead.
    self.lg.add('STRING', r'".*"')
    self.lg.add('STRING', r'\'.*\'')
    # NOTE(review): keyword rules precede NAME, so an identifier such as
    # 'iffy' lexes as IF + NAME('fy') -- confirm this is intended.
    self.lg.add('IF', r'if')
    self.lg.add('ELSE', r'else')
    self.lg.add('LPAREN', r'\(')
    self.lg.add('RPAREN', r'\)')
    self.lg.add('LBRACE', r'\{')
    self.lg.add('RBRACE', r'\}')
    # '==' must be registered before '=' so the two-character form wins.
    self.lg.add('IS_EQUAL_TO', r'==')
    self.lg.add('EQUAL', r'=')
    self.lg.add('GREATER_EQUAL', r'>=')
    self.lg.add('LESSER_EQUAL', r'<=')
    self.lg.add('LESSER', r'<')
    self.lg.add('GREATER', r'>')
    self.lg.add('PLUS', r'\+')   # was r'-' (swapped with MINUS)
    self.lg.add('MINUS', r'-')   # was r'\+' (swapped with PLUS)
    self.lg.add('COMMA', r',')
    self.lg.add('NUMBER', r'\d+')
    self.lg.add('PRINT', r'print')
    self.lg.add('NAME', r'[a-zA-Z_][a-zA-Z0-9_]*')
    self.lexer = self.lg.build()
def test_arithmetic(self):
    """Lex and parse "3*4+5" through a tiny precedence-aware grammar."""
    lg = LexerGenerator()
    lg.add("NUMBER", r"\d+")
    lg.add("PLUS", r"\+")
    lg.add("TIMES", r"\*")

    pg = ParserGenerator(
        ["NUMBER", "PLUS", "TIMES"],
        precedence=[("left", ["PLUS"]), ("left", ["TIMES"])],
    )

    @pg.production("main : expr")
    def main(p):
        return p[0]

    @pg.production("expr : expr PLUS expr")
    @pg.production("expr : expr TIMES expr")
    def expr_binop(p):
        ops = {"+": operator.add, "*": operator.mul}
        fn = ops[p[1].getstr()]
        return BoxInt(fn(p[0].getint(), p[2].getint()))

    @pg.production("expr : NUMBER")
    def expr_num(p):
        return BoxInt(int(p[0].getstr()))

    lexer = lg.build()
    parser = pg.build()
    assert parser.parse(lexer.lex("3*4+5"))
def test_regex_flags(self):
    """A DOTALL rule consumes the entire multi-line input as one token."""
    lg = LexerGenerator()
    lg.add("ALL", r".*", re.DOTALL)
    lexer = lg.build()

    def f(n):
        tokens = lexer.lex("%d\n%d" % (n, n))
        token = tokens.next()
        if token.name != "ALL":
            return -1
        # The single ALL token must exhaust the stream.
        try:
            tokens.next()
        except StopIteration:
            return 1
        return -2

    assert self.run(f, [3]) == 1
def get_lexer():
    r"""Build a lexer from the module-level RULES table.

    Fix: the whitespace-ignore pattern is now a raw string; '\s' inside
    a plain string literal is an invalid escape sequence (a
    SyntaxWarning on modern Python, an error in the future).
    """
    lg = LexerGenerator()
    for name, rule in RULES:
        lg.add(name, rule)
    lg.ignore(r'\s+')
    return lg.build()
def test_basic_lexer(self):
    """Lex "n+n+n" and sum the NUMBER tokens, checking each PLUS too."""
    lg = LexerGenerator()
    lg.add("NUMBER", r"\d+")
    lg.add("PLUS", r"\+")
    lexer = lg.build()

    def f(n):
        tokens = lexer.lex("%d+%d+%d" % (n, n, n))
        total = 0
        for i in range(5):
            t = tokens.next()
            if i % 2 == 0:
                if t.name != "NUMBER":
                    return -1
                total += int(t.value)
            else:
                if t.name != "PLUS":
                    return -2
                if t.value != "+":
                    return -3
        # The stream must be exhausted after five tokens.
        if tokens.next() is not None:
            return -4
        return total

    assert self.run(f, [14]) == 42
def Lexer():
    r"""Build the lexer for the two-letter-keyword toy language.

    Fix: the whitespace-ignore pattern is now a raw string; '\s' inside
    a plain string literal is an invalid escape sequence.
    """
    lexer = LexerGenerator()
    # Keywords (two-letter codes) -- registered before IDENTIFIER so
    # they win at each position.
    lexer.add('WHILE', r'wh')
    lexer.add('PRINTF', r'pf')
    lexer.add('IF', r'if')
    lexer.add('ELSE', r'el')
    lexer.add('MAIN', r'mn')
    lexer.add('RETURN', r'rt')
    lexer.add('LEFT_PAREN', r'\(')
    lexer.add('RIGHT_PAREN', r'\)')
    lexer.add('SEMI_COLON', r'\;')
    lexer.add('COMMA', r'\,')
    lexer.add('PLUS', r'\+')
    lexer.add('MINUS', r'\-')
    lexer.add('MULT', r'\*')
    lexer.add('DIV', r'\/')
    lexer.add('RIGHT_BRACKETS', r'\}')
    lexer.add('LEFT_BRACKETS', r'\{')
    lexer.add('EQUAL', r'=')
    lexer.add('E_EQUAL', r'sm')
    lexer.add('BT', r'bt')
    lexer.add('LT', r'lt')
    lexer.add('OR', r'or')
    lexer.add('AND', r'and')
    lexer.add('NOT', r'not')
    lexer.add('INT', r'\d+')
    lexer.add('IDENTIFIER', r"[a-zA-Z_][a-zA-Z0-9_]*")
    lexer.ignore(r'\s+')
    return lexer.build()
def test_ignore_recursion(self):
    """Ignoring 2000 consecutive spaces must not recurse/blow the stack."""
    lg = LexerGenerator()
    lg.ignore(r"\s")
    lexer = lg.build()
    tokens = list(lexer.lex(" " * 2000))
    assert tokens == []
def test_position(self):
    """Tokens carry (lineno, colno) positions, tracked across newlines."""
    lg = LexerGenerator()
    lg.add("NUMBER", r"\d+")
    lg.add("PLUS", r"\+")
    lg.ignore(r"\s+")
    lexer = lg.build()

    # (source, [(lineno, colno) per token in order])
    cases = [
        ("2 + 3", [(1, 1), (1, 3), (1, 5)]),
        ("2 +\n 37", [(1, 1), (1, 3), (2, 5)]),
    ]
    for source, positions in cases:
        stream = lexer.lex(source)
        for lineno, colno in positions:
            t = stream.next()
            assert t.source_pos.lineno == lineno
            assert t.source_pos.colno == colno
        with raises(StopIteration):
            stream.next()
def tokenize():
    r"""Build the lexer for the toy Portuguese-keyword language.

    Fixes:
    - 'EQUALS' (==) is now registered before 'EQUAL' (=). Rules are
      tried in insertion order, so '==' used to lex as two EQUAL tokens.
    - Word keywords get a (?!\w) lookahead so they stop matching the
      prefix of identifiers (e.g. 'segundo' used to lex as SE + rest,
      and any identifier starting with 'e' lost its first letter to E).
    - The whitespace-ignore pattern is a raw string (invalid escape
      sequence otherwise).
    """
    lg = LexerGenerator()
    lg.add('NUMBER', r'\d+')
    lg.add('PLUS', r'\+')
    lg.add('MINUS', r'-')
    lg.add('MUL', r'\*')
    lg.add('DIV', r'/')
    lg.add('OPEN_PAR', r'\(')
    lg.add('CLOSE_PAR', r'\)')
    lg.add('OPEN_BLOCK', r'\{')
    lg.add('CLOSE_BLOCK', r'\}')
    lg.add('COMMA', r',')
    lg.add('IMPRIME', r'imprime(?!\w)')
    lg.add('ESCANEIA', r'escaneia(?!\w)')
    lg.add('CMD_END', r';')
    lg.add('GE', r'>=')
    lg.add('LE', r'<=')
    lg.add('EQUALS', r'==')   # must precede EQUAL so '==' wins
    lg.add('EQUAL', r'=')
    lg.add('GREATER', r'>')
    lg.add('LESS', r'<')
    lg.add('ENQUANTO', r'enquanto(?!\w)')
    lg.add('E', r'e(?!\w)')
    lg.add('OU', r'ou(?!\w)')
    lg.add('SENAO', r'senao(?!\w)')
    lg.add('NAO', r'nao(?!\w)')
    lg.add('SE', r'se(?!\w)')
    lg.add('IDENTIFIER', r"[a-zA-Z_][a-zA-Z0-9_]*")
    lg.ignore(r'\s+')
    return lg.build()
def _lex(source_code):
    """Assemble a lexer from the module's matcher table and lex *source_code*."""
    generator = LexerGenerator()
    _ignore_whitespace(generator)
    for matcher in _matchers():
        generator.add(*matcher)
    return generator.build().lex(source_code)
def _lispy_lexer_generator():
    """Create a rply lexer generator for the lispy token set."""
    lg = LexerGenerator()
    # Skip whitespace and ';;' line comments in a single ignore rule.
    lg.ignore(r"\s+|(;;.*?(\n|$))")
    for token_name, pattern in TOKENS.items():
        lg.add(token_name, pattern)
    return lg
def get_lexer():
    r"""Build the language lexer.

    Fixes over the previous version:
    - FLOAT escapes the decimal point (was '-?\d+.\d+', where the bare
      '.' matched any character, so e.g. '1x2' lexed as a FLOAT).
    - STRING bodies use '.*?' (was '.?', which allowed at most one
      character between the quotes).
    - SYMBOL/IDENTIFIER tails use '*' (was '+', which rejected
      one-character names such as 'x').
    - Regex literals with backslashes are raw strings (invalid escape
      sequences otherwise).
    """
    lg = LexerGenerator()
    # Constants
    lg.add('FLOAT', r'-?\d+\.\d+')
    lg.add('INTEGER', r'-?\d+')
    # NOTE(review): without re.DOTALL the triple-quoted form still cannot
    # span newlines -- confirm whether multi-line strings are required.
    lg.add('STRING', r'(""".*?""")|(".*?")|(\'.*?\')')
    lg.add('BOOLEAN', r"true(?!\w)|false(?!\w)")
    # Language keywords
    lg.add('PRINT', r'print(?!\w)')
    lg.add('PASS', r'pass(?!\w)')
    lg.add('IF', r'if(?!\w)')
    lg.add('ELSE', r'else(?!\w)')
    lg.add('AND', r"and(?!\w)")
    lg.add('OR', r"or(?!\w)")
    lg.add('NOT', r"not(?!\w)")
    lg.add('DEF', r'def(?!\w)')
    lg.add('IMPORT', r'import(?!\w)')
    # User identifiers
    lg.add('SYMBOL', r":[a-zA-Z_][a-zA-Z0-9_]*")
    lg.add('IDENTIFIER', r"[a-zA-Z_][a-zA-Z0-9_]*")
    # Operators (multi-character rules precede their prefixes)
    lg.add('==', r'==')
    lg.add('!=', r'!=')
    lg.add('>=', r'>=')
    lg.add('<=', r'<=')
    lg.add('>', r'>')
    lg.add('<', r'<')
    lg.add('=', r'=')
    lg.add('[', r'\[')
    lg.add(']', r'\]')
    lg.add('{', r'\{')
    lg.add('}', r'\}')
    lg.add('|', r'\|')
    lg.add(',', r',')
    lg.add('.', r'\.')
    lg.add(':', r':')
    lg.add('+', r'\+')
    lg.add('-', r'\-')
    lg.add('*', r'\*')
    lg.add('/', r'/')
    lg.add('%', r'%')
    # Separators
    lg.add('(', r'\(')
    lg.add(')', r'\)')
    lg.add('NEWLINE', r'\n')
    # Ignore horizontal whitespace only; newlines are significant tokens.
    lg.ignore(r'[ \t\r\f\v]+')
    return lg.build()
def build_lexer(self):
    """Register every (name, pattern) rule from name_with_pattern and build."""
    lg = LexerGenerator()
    for name, pattern in name_with_pattern:
        lg.add(name, pattern)
    # Whitespace between tokens is skipped.
    lg.ignore(r'\s+')
    return lg.build()
def test_error_line_number(self):
    """A LexingError carries the line number where lexing failed."""
    lg = LexerGenerator()
    lg.add("NEW_LINE", r"\n")
    stream = lg.build().lex("\nfail")
    stream.next()  # consume the NEW_LINE token
    with raises(LexingError) as excinfo:
        stream.next()  # 'fail' matches nothing
    assert excinfo.value.source_pos.lineno == 2
def test_transitions(self):
    """Lexer states: '(#' pushes the 'comment' state, '#)' pops it."""
    lg = LexerGenerator()
    lg.add('NUMBER', r'\d+')
    lg.add('ADD', r'\+')
    lg.add('COMMENT_START', r'\(#', transition='push', target='comment')
    lg.ignore(r'\s+')

    comment = lg.add_state('comment')
    comment.add('COMMENT_START', r'\(#', transition='push', target='comment')
    comment.add('COMMENT_END', r'#\)', transition='pop')
    comment.add('COMMENT', r'([^(#]|#(?!\))|\)(?!#))+')

    stream = lg.build().lex('(# this is (# a nested comment #)#) 1 + 1 (# 1 # 1 #)')
    expected = [
        ('COMMENT_START', '(#'),
        ('COMMENT', ' this is '),
        ('COMMENT_START', '(#'),
        ('COMMENT', ' a nested comment '),
        ('COMMENT_END', '#)'),
        ('COMMENT_END', '#)'),
        ('NUMBER', '1'),
        ('ADD', '+'),
        ('NUMBER', '1'),
        ('COMMENT_START', '(#'),
        ('COMMENT', ' 1 # 1 '),
        ('COMMENT_END', '#)'),
    ]
    for name, value in expected:
        t = stream.next()
        assert t.name == name
        assert t.value == value
def __init__(self):
    """Build the calculator lexer: numbers, +, -, and the 'exit' keyword."""
    generator = LexerGenerator()
    for name, pattern in [
        ('NUMBER', r'\d+'),
        ('PLUS', r'\+'),
        ('MINUS', r'-'),
        ('EXIT', r'exit'),
    ]:
        generator.add(name, pattern)
    generator.ignore(r'\s+')
    self.lexer = generator.build()
def test_regex_flags_ignore(self):
    """An ignore rule with re.DOTALL swallows the whole multi-line input."""
    lg = LexerGenerator()
    lg.add("ALL", r".*", re.DOTALL)
    lg.ignore(r".*", re.DOTALL)
    stream = lg.build().lex("test\ndotall")
    # Everything was ignored, so the stream is empty from the start.
    with raises(StopIteration):
        stream.next()
def __init__(self):
    r"""Build the mini-js lexer.

    Fix: the comment-ignoring pattern was malformed. The old third
    alternative '(\/\*(.*)|(\s*)\*\/)' ignored '/*…' to end of line and
    a stray whitespace+'*/' separately, so a '/* ... */' block spanning
    lines was never skipped as a unit. It now uses a non-greedy
    '[\s\S]*?' span. Also, '//' comments no longer require a trailing
    newline (a comment on the last line of a file used to fail).
    """
    lg = LexerGenerator()
    # One ignore rule covers: whitespace, '//' line comments, and
    # '/* ... */' block comments (non-greedy so adjacent comments stay
    # separate; [\s\S] also matches newlines without re.DOTALL).
    lg.ignore(r"(\s+)|(//.*)|(/\*[\s\S]*?\*/)")

    # Debug-only rules: whitespace is consumed by the ignore rule above,
    # so these can never fire in normal operation.
    lg.add("NEWLINE", r"\n")
    lg.add("WHITESPACE", r"\s")

    # Multi-character operators are registered before their prefixes.
    lg.add("BOOLEAN_OR", r"\|\|")
    lg.add("BOOLEAN_AND", r"\&\&")
    lg.add("GREATER_EQUAL", r"\>\=")
    lg.add("LESS_EQUAL", r"\<\=")
    lg.add("GREATER", r"\>")
    lg.add("LESS", r"\<")
    lg.add("EQUAL_EQUAL", r"\=\=")
    lg.add("NOT_EQUAL", r"\!\=")

    # Keywords -- registered before IDENTIFIER so they win.
    lg.add("WHILE", r"while")
    lg.add("FOR", r"for")
    lg.add("IF", r"if")
    lg.add("ELSE", r"else")
    lg.add("PRINTLN", r"println")
    lg.add("PRINT", r"print")

    lg.add("LPAREN", r"\(")
    lg.add("RPAREN", r"\)")
    lg.add("LBRACE", r"\{")
    lg.add("RBRACE", r"\}")
    lg.add("EQUAL", r"=")
    lg.add("PLUS", r"\+")
    lg.add("MINUS", r"-")
    lg.add("MULTIPLY", r"\*")
    lg.add("DIVIDE", r"/")
    lg.add("SEMICOLON", r";")
    lg.add("NUMBER", r"\d+")
    lg.add("STRING", r"\".*\"")
    lg.add("BOOLEAN", r"true|false")
    lg.add("IDENTIFIER", r"[a-zA-Z_][a-zA-Z0-9_]*")
    self.lexer = lg.build()
def test_error(self):
    """The repr of a LexingError includes its SourcePosition."""
    lg = LexerGenerator()
    lg.add("NUMBER", r"\d+")
    lg.add("PLUS", r"\+")
    stream = lg.build().lex('fail')
    with raises(LexingError) as excinfo:
        stream.next()
    assert 'SourcePosition(' in repr(excinfo.value)
def test_regex_flags(self):
    """With re.DOTALL, '.*' captures the newline: one token for everything."""
    lg = LexerGenerator()
    lg.add("ALL", r".*", re.DOTALL)
    stream = lg.build().lex("test\ndotall")

    token = stream.next()
    assert token.source_pos.lineno == 1
    assert token.source_pos.colno == 1
    assert token.getstr() == "test\ndotall"

    with raises(StopIteration):
        stream.next()
def __init__(self, input=None):
    """Set up the lexer and, when *input* is given, lex it immediately."""
    # Build the underlying rply lexer from the token definitions.
    self.lexer = LexerGenerator()
    self._initialize_tokens()
    self.built_lexer = self.lexer.build()

    # Lexing state, reset for every run.
    self.tokens = None
    self.valid_tokens = []
    self.char = 0
    self.line = 0
    self.token_pos = 0

    # Lex right away when input was supplied up front.
    if input:
        self.input(input)
def test_error_column_number(self):
    """A LexingError carries the column where lexing failed."""
    lg = LexerGenerator()
    lg.add("NUMBER", r"\d+")
    lg.add("PLUS", r"\+")
    stream = lg.build().lex("1+2+fail")
    for _ in range(4):
        stream.next()  # consume 1, +, 2, +
    with raises(LexingError) as excinfo:
        stream.next()
    assert excinfo.value.source_pos.colno == 4
class Lexer(object):
    """Lexes a source string into PLUS/MINUS/NUMBER tokens.

    Bug fix: __init__ built the lexer into a local variable and never
    stored it on the instance, so lex() crashed with AttributeError
    when it read ``self.lexer``.
    """

    def __init__(self, source):
        self.source = source
        lg = LexerGenerator()
        lg.add("PLUS", r"\+")
        lg.add("MINUS", r"-")
        lg.add("NUMBER", r"\d+")
        lg.ignore(r"\s+")
        # Was `lexer = lg.build()` -- the built lexer was dropped.
        self.lexer = lg.build()

    def lex(self):
        """Return a LexerWrapper around the token stream for self.source."""
        stream = self.lexer.lex(self.source)
        return LexerWrapper(stream)
def test_repr(self):
    """The token stream has a usable str() before and during iteration."""
    lg = LexerGenerator()
    lg.add("NUMBER", r"\d+")
    lg.add("PLUS", r"\+")
    lg.ignore(r"\s+")
    stream = lg.build().lex("2 + 3")

    assert str(stream) is not None
    token = stream.next()
    assert token.name == "NUMBER"
    assert token.value == "2"

    assert str(stream) is not None
    token = stream.next()
    assert token.name == "PLUS"
def test_newline_position(self):
    """Columns advance within a line and reset to 1 after a newline."""
    lg = LexerGenerator()
    lg.add("NEWLINE", r"\n")
    lg.add("SPACE", r" ")
    stream = lg.build().lex(" \n ")

    for expected_line, expected_col in [(1, 1), (1, 2), (2, 1)]:
        t = stream.next()
        assert t.source_pos.lineno == expected_line
        assert t.source_pos.colno == expected_col
def lex():
    r"""Build the Karel-Python lexer.

    Fixes:
    - Regex literals containing backslashes are now raw strings ('\('
      in a plain string is an invalid escape sequence).
    - Word-like keywords/commands/conditions get a (?!\w) lookahead so
      they stop matching the prefix of longer identifiers (e.g.
      'movement' used to lex as MOVE + IDENTIFIER('ment')).
    """
    lg = LexerGenerator()
    # Token names and the regexes they match.
    lg.add('WHITESPACE', r'[ ]+')
    lg.add('INTEGER', r'-?\d+')
    lg.add('IF', r'if(?!\w)')
    lg.add('ELSE', r'else(?!\w)')
    lg.add('WHILE', r'while(?!\w)')
    lg.add('FOR', r'for i in range')
    lg.add('FUNCTION', r'def(?!\w)')
    lg.add('COLON', r':')
    lg.add('OPENPAREN', r'\(')
    lg.add('CLOSEPAREN', r'\)')
    lg.add('NEWLINE', r'\n')
    lg.add('IMPORT', r'from karel import \*')
    lg.add('BEGIN', r'begin_karel_program(?!\w)')
    lg.add('END', r'end_karel_program(?!\w)')
    lg.add('NOT', r'not(?!\w)')
    # Commands
    lg.add('MOVE', r'move(?!\w)')
    lg.add('LEFTTURN', r'turn_left(?!\w)')
    lg.add('PUTBEEPER', r'put_beeper(?!\w)')
    lg.add('PICKBEEPER', r'pick_beeper(?!\w)')
    # Conditions
    lg.add('FACENORTH', r'facing_north(?!\w)')
    lg.add('FACESOUTH', r'facing_south(?!\w)')
    lg.add('FACEWEST', r'facing_west(?!\w)')
    lg.add('FACEEAST', r'facing_east(?!\w)')
    lg.add('FRONTCLEAR', r'front_is_clear(?!\w)')
    lg.add('LEFTCLEAR', r'left_is_clear(?!\w)')
    lg.add('RIGHTCLEAR', r'right_is_clear(?!\w)')
    lg.add('PRESENT', r'beepers_present(?!\w)')
    lg.add('INBAG', r'beepers_in_bag(?!\w)')
    # NOTE(review): dead rule -- NOT above already matches 'not', so
    # NOTCHECK can never fire. Kept to preserve the token name set.
    lg.add('NOTCHECK', r'not(?!\w)')
    lg.add('IDENTIFIER', r'[a-zA-Z_][a-zA-Z0-9_]*')
    # Ignore '#' comments and triple-quoted docstrings.
    lg.ignore(r'#.*\n')
    lg.ignore(r'"""(.|\n)*?"""')
    lg.ignore(r"'''(.|\n)*?'''")
    lexer = lg.build()
    return lexer
def build_lexer():
    r"""Build the LOLCODE lexer.

    Fixes:
    - MULTI_COMMENT: '[.*|\n]' was a character class matching a single
      '.', '*', '|' or newline (the old comment admitted it never
      worked). It is now a non-greedy multi-line span, closed by the
      LOLCODE-spec keyword 'TLDR' (was misspelled 'TDLR').
    - TROOF_LITERAL: '[WIN|FAIL]' was a character class matching one of
      the letters W,I,N,|,F,A,L; now a real alternation.
    - NUMBAR_LITERAL: the decimal point is escaped (bare '.' matched
      any character).
    - YARN_LITERAL: quotes are a plain character class and the body is
      non-greedy, so two strings on one line no longer merge into one.
    """
    lexer = LexerGenerator()
    # Lexer analysis rules -- longer/more specific rules come first.
    lexer.ignore(' ')
    lexer.add("WHATEVR", r"WHATEVR")
    lexer.add("VISIBLE", r"VISIBLE")
    lexer.add("KTHXBAI", r"KTHXBAI")
    lexer.add("GIMME", r"GIMME")
    lexer.add("MKAY", r"MKAY")
    lexer.add("HAS", r"HAS")
    lexer.add("HAI", r"HAI")
    lexer.add("ITZ", r"ITZ")
    lexer.add("OF", r"OF")
    lexer.add("BANG", r"!")
    lexer.add("BY", r"BY")
    lexer.add("AN", r"AN")
    lexer.add("A", r"A")
    lexer.add("R", r"R")
    lexer.add("I", r"I")
    # Non-greedy so adjacent OBTW/TLDR blocks stay separate.
    lexer.add("MULTI_COMMENT", r"OBTW(.|\n)*?TLDR")
    lexer.add("NEWLINE", "\n")
    lexer.add("PRIMITIVE_TYPE", r"NUMBR|NUMBAR|LETTR|TROOF")
    lexer.add("NUMBAR_LITERAL", r"-?\d+\.\d+")
    lexer.add("NUMBR_LITERAL", r"-?\d+")
    lexer.add("TROOF_LITERAL", r"WIN|FAIL")
    lexer.add("YARN_LITERAL", r"[\"'].*?[\"']")
    lexer.add("MATH_BINARY_OPERATOR", r"SUM|DIFF|PRODUKT|QUOSHUNT|BIGGR|SMALLR")
    lexer.add("MATH_UNARY_OPERATOR", r"FLIP|SQUAR")
    lexer.add("LOGICAL_BINARY_OPERATOR", r"BOTH|EIHER|WON")
    lexer.add("LOGICAL_UNARY_OPERATOR", r"NOT")
    lexer.add("LOGICAL_VARIABLE_OPERATOR", r"ALL|ANY")
    lexer.add("COMPARISON_BINARY_OPERATOR", r"SAEM|DIFFRINT|FURSTSMALLR|FURSTBIGGR")
    lexer.add("ASSIGNMENT_OPERATOR", r"CORRECT_THIS")
    # New line required to be added to tokens list prior!
    lexer.add("SINGLE_COMMENT", r"BTW.*\n")
    lexer.add("IDENTIFIER", r"[a-zA-Z][a-zA-Z_]*")
    lexer.add("LETTR_LITERAL", r".")
    lexer.add("ERROR", r"^[.]*")
    return lexer.build()
def __init__(self):
    """Register every token rule for the Turkish-keyword toy language."""
    self.lg = LexerGenerator()
    # Skipped entirely: whitespace and '//' line comments.
    self.lg.ignore(r"\s+")
    self.lg.ignore(r"//.*")
    # Rule order matters: keywords and multi-character operators are
    # registered before their prefixes and before IDENTIFIER.
    token_rules = [
        ("PRINT", r"yazdır"),
        ("LOOP", r"tekrar"),
        ("READ", r"oku"),
        ("IF", r"ise"),
        ("ELSE", r"değil"),
        ("==", r"=="),
        ("!=", r"!="),
        ("<=", r"<="),
        (">=", r">="),
        (">", r">"),
        ("<", r"<"),
        ("+=", r"\+="),
        ("-=", r"\-="),
        ("=", r"="),
        ("STRING", r"'.*'"),
        ("FLOAT", r"\d+(\.\d+)"),   # sign is handled by the parser, not here
        ("INTEGER", r"\d+"),
        ("BOOLEAN", r"(doğru|yanlış)"),
        ("ADD", r"\+"),
        ("SUB", r"-"),
        ("MUL", r"\*"),
        ("DIV", r"\/"),
        ("MOD", r"\%"),
        ("(", r"\("),
        (")", r"\)"),
        ("[", r"\["),
        ("]", r"\]"),
        ("{", r"\{"),
        ("}", r"\}"),
        (",", r","),
        ("IDENTIFIER", r"[_\w]*[_\w0-9]+"),
    ]
    for name, pattern in token_rules:
        self.lg.add(name, pattern)
def test_simple(self):
    """Lexing "2+3" yields NUMBER, PLUS, NUMBER and then None."""
    lg = LexerGenerator()
    lg.add("NUMBER", r"\d+")
    lg.add("PLUS", r"\+")
    stream = lg.build().lex("2+3")

    for expected_name, expected_value in [("NUMBER", "2"), ("PLUS", "+")]:
        t = stream.next()
        assert t.name == expected_name
        assert t.value == expected_value

    t = stream.next()
    assert t.name == "NUMBER"
    assert t.value == "3"
    assert t.source_pos.idx == 2

    # This lexer version signals exhaustion with None, not StopIteration.
    assert stream.next() is None
def test_ignore(self):
    """Whitespace between tokens is skipped; positions still account for it."""
    lg = LexerGenerator()
    lg.add("NUMBER", r"\d+")
    lg.add("PLUS", r"\+")
    lg.ignore(r"\s+")
    stream = lg.build().lex("2 + 3")

    for expected_name, expected_value in [("NUMBER", "2"), ("PLUS", "+")]:
        t = stream.next()
        assert t.name == expected_name
        assert t.value == expected_value

    t = stream.next()
    assert t.name == "NUMBER"
    assert t.value == "3"
    assert t.source_pos.idx == 4

    with raises(StopIteration):
        stream.next()