def error(self):
    """Raise a ``LexerError`` describing the current lexer position.

    The message reports ``self.current_char`` together with ``self.lineno``
    and ``self.column``.

    Raises:
        LexerError: always.
    """
    template = "Lexer error on '{lexeme}' line: {lineno} column: {column}"
    details = {
        "lexeme": self.current_char,
        "lineno": self.lineno,
        "column": self.column,
    }
    raise LexerError(message=template.format(**details))
def comment(self):
    """Skip a brace-delimited comment starting at the current position.

    Advances until a '}' is found and then steps past it, calling
    ``self.next_line()`` for every newline crossed on the way.

    Raises:
        LexerError: "Invalid Comment" if the current character is already
            '}' on entry, or if the input ends before a '}' is seen.
    """
    ch = self.curr_char()

    # A '}' on entry means there is no comment to close.
    if ch == '}':
        raise LexerError("Invalid Comment", self.line, self.char_pos)

    while True:
        if not ch or ch == '}':
            break
        if ch == '\n':
            self.next_line()
        self.forward()
        ch = self.curr_char()

    # Ran out of input without meeting the closing brace.
    if self.pos >= self.end:
        raise LexerError("Invalid Comment", self.line, self.char_pos)

    # Step over the closing '}'.
    self.forward()
def decimal(self, beg):
    """Scan the fractional part of a real constant.

    Expects the current character to be the '.' of the constant; it is
    skipped, then at least one digit must follow.

    Args:
        beg: Index into ``self.buff`` where the constant begins.

    Returns:
        A ``REALCONSTANT`` token whose text is ``self.buff[beg:self.pos]``,
        or whatever ``self.big_e(beg)`` returns when an 'E' with at least
        one character after it follows the digits.

    Raises:
        LexerError: "Invalid constant" if no digit follows the dot, or if a
            letter directly follows the digits (including an 'E' with
            nothing after it).
    """
    self.forward()  # skip '.'
    curr_char = self.curr_char()

    # A digit must follow the dot.
    if not curr_char or not curr_char.isdigit():
        raise LexerError("Invalid constant", self.line, self.char_pos)

    while curr_char and curr_char.isdigit():
        self.forward()
        curr_char = self.curr_char()

    # Truthy only when there is a character after the current one.
    next_next = self.next_char()

    if curr_char == 'E' and next_next:
        return self.big_e(beg)

    # curr_char can be falsy here when the digits run to the end of the
    # input (the loop above already allows for that), so guard before
    # calling a str method on it.
    if curr_char and curr_char.isalpha():
        raise LexerError("Invalid constant", self.line, self.char_pos)

    self.addop_flag = True
    return Token("REALCONSTANT", self.buff[beg:self.pos], self.line)
def big_e(self, beg):
    """Scan the exponent part of a real constant.

    Expects the current character to be the 'E'; it is skipped, then an
    optional '+' or '-' and at least one digit must follow.

    Args:
        beg: Index into ``self.buff`` where the constant begins.

    Returns:
        A ``REALCONSTANT`` token whose text is ``self.buff[beg:self.pos]``.

    Raises:
        LexerError: "Invalid constant" if the character after 'E' is not a
            sign or digit, if a sign is not followed by a digit (reported
            at ``self.char_pos + 1``), or if a letter directly follows the
            exponent digits.
    """
    self.forward()  # skip 'E'
    curr_char = self.curr_char()
    has_sign = curr_char == '+' or curr_char == '-'

    # Guard against a falsy curr_char so a truncated exponent is reported
    # as a LexerError instead of failing on a str method call.
    if not (has_sign or (curr_char and curr_char.isdigit())):
        raise LexerError("Invalid constant", self.line, self.char_pos)

    if has_sign:
        after_sign = self.next_char()
        # A sign must be followed by a digit; nothing at all after the
        # sign is equally invalid.
        if not after_sign or not after_sign.isdigit():
            raise LexerError("Invalid constant", self.line, self.char_pos + 1)

    self.forward()  # skip preliminary +, - or digit
    curr_char = self.curr_char()
    while curr_char and curr_char.isdigit():
        self.forward()
        curr_char = self.curr_char()

    # Alphabetical characters are not allowed as part of a number.
    # curr_char may be falsy when the exponent ends the input.
    if curr_char and curr_char.isalpha():
        raise LexerError("Invalid constant", self.line, self.char_pos)

    self.addop_flag = True
    return Token("REALCONSTANT", self.buff[beg:self.pos], self.line)
def digits(self):
    """Scan a numeric constant starting at the current position.

    Consumes a run of digits, then hands off to ``self.big_e`` or
    ``self.decimal`` when an exponent or fractional part follows.

    Returns:
        An ``INTCONSTANT`` token whose text is ``self.buff[beg:self.pos]``,
        or the token produced by ``self.big_e`` / ``self.decimal``.

    Raises:
        LexerError: "Invalid constant" if a letter directly follows the
            digits (an 'E' counts unless another character follows it).
    """
    beg = self.pos  # beginning of constant
    curr_char = self.curr_char()
    while curr_char and curr_char.isdigit():
        self.forward()
        curr_char = self.curr_char()

    # Truthy only when there are more characters after the current one.
    next_next = self.next_char()

    if curr_char == 'E' and next_next:
        return self.big_e(beg)

    # A '.' only starts a fraction when it is followed by something other
    # than a second '.'; otherwise the integer ends here and the '.' is
    # left unconsumed.
    if curr_char == '.' and next_next and next_next != '.':
        return self.decimal(beg)

    # curr_char can be falsy when the digits run to the end of the input
    # (the loop above already allows for that), so guard before calling a
    # str method on it.
    if curr_char and curr_char.isalpha():  # not including E
        raise LexerError("Invalid constant", self.line, self.char_pos)

    self.addop_flag = True
    return Token("INTCONSTANT", self.buff[beg:self.pos], self.line)
def letters(self):
    """Scan a run of letters and digits and build a token from it.

    Returns:
        ``Token(lexeme, None, self.line, 1)`` for the scanned lexeme.
        NOTE(review): the meaning of the trailing ``1`` is defined by
        ``Token`` and is not visible here — confirm.

    Raises:
        LexerError: "Identifier too long" if the lexeme has more than
            ``self.max_id_len`` characters.

    Side effects:
        Sets ``self.addop_flag`` to True when the token's type is
        "IDENTIFIER" and to False otherwise.
    """
    # Collect characters in a list and join once, rather than growing a
    # string with += (and avoid shadowing the builtin ``id``).
    chars = []
    curr_char = self.curr_char()
    while curr_char and (curr_char.isalpha() or curr_char.isdigit()):
        chars.append(curr_char)
        self.forward()
        curr_char = self.curr_char()
    lexeme = "".join(chars)

    if len(lexeme) > self.max_id_len:
        raise LexerError("Identifier too long", self.line, self.char_pos)

    tk = Token(lexeme, None, self.line, 1)
    # The flag is consumed after one token: it stays set only when this
    # token is an identifier and is cleared for anything else.
    self.addop_flag = tk.type() == "IDENTIFIER"
    return tk
def next_token(self):
    """Return the next token, skipping whitespace and comments.

    Returns:
        ``None`` when ``self.pos`` is already past ``self.end``; an
        ``ENDOFFILE`` token when ``self.pos == self.end`` (after calling
        ``self.forward()``); otherwise the token produced by
        ``self.punct()``, ``self.letters()`` or ``self.digits()``.

    Raises:
        LexerError: "Invalid character" if the current character is not
            whitespace, a brace, punctuation, a letter or a digit. Errors
            raised by ``self.comment()`` and the scanners propagate.
    """
    # Loop instead of recursing after each skipped run of whitespace or
    # comment, so long stretches of them cannot exhaust the call stack.
    while True:
        if self.pos > self.end:
            return None
        if self.pos == self.end:
            self.forward()
            return Token("ENDOFFILE", None, self.line)

        curr_char = self.curr_char()

        if curr_char in (" ", "\t", "\n"):
            self.space()
            continue

        # Both braces go to comment(), which rejects a '}' on entry.
        if curr_char in ("{", "}"):
            self.comment()
            continue

        if is_punct(curr_char):
            return self.punct()
        if curr_char.isalpha():
            return self.letters()
        if curr_char.isdigit():
            return self.digits()

        raise LexerError("Invalid character", self.line, self.char_pos)
def tokenize(text):
    """Split ``text`` into a list of tokens ending with an ENDMARKER token.

    Runs of spaces, ``--`` comments and newlines produce no tokens. Tabs,
    symbols, digit runs and alphabetic runs each produce one token.

    Args:
        text: The source string to tokenize. May be empty.

    Returns:
        A list of ``Token`` objects; the last one is
        ``Token(ENDMARKER, '')``.

    Raises:
        SystemExit: with status 1, after printing the error message, when
            a character matches none of the recognised categories.
    """
    # Symbols with a dedicated token type; any other member of SYMBOLS is
    # a generic SYMBOL.
    symbol_types = {'.': PERIOD, '(': LPAREN, ')': RPAREN, ',': COMMA}

    size = len(text)

    def lexeme(start, stop):
        # The scan helpers may hand back a negative position once the
        # input is exhausted (the main loop stops on it); slicing with
        # that value would drop trailing characters, so clamp to the end.
        return text[start:stop if stop >= 0 else size]

    tokens = []
    position = 0

    # Stop on a negative position, as before, and also when the position
    # reaches the end of the text, so empty input or a helper returning
    # len(text) cannot index out of range.
    while 0 <= position < size:
        char = text[position]

        # whitespace
        if char == ' ':
            position = skip_until_nonspace(text, position)
            continue

        # comments: skipped, no token is emitted
        if char == '-' and next_char(text, position) == '-':
            position = skip_until_newline(text, position)
            continue

        # tabs
        if char == '\t':
            tokens.append(Token(TAB, char))
            position = advance(text, position)
            continue

        # symbols
        if char in SYMBOLS:
            tokens.append(Token(symbol_types.get(char, SYMBOL), char))
            position = advance(text, position)
            continue

        # newlines: skipped, no token is emitted
        if char == '\n':
            position = advance(text, position)
            continue

        # numbers
        if char.isdigit():
            start = position
            position = skip_until_nondigit(text, position)
            tokens.append(Token(NUMBER, lexeme(start, position)))
            continue

        # words
        if char in ALPHABET:
            start = position
            position = skip_until_nonalpha(text, position)
            characters = lexeme(start, position)
            if len(characters) > 1:
                token_type = WORD
            elif characters.isupper():
                token_type = LETTER_UPPER
            else:
                token_type = LETTER_LOWER
            tokens.append(Token(token_type, characters))
            continue

        e = LexerError(message=f"Invalid character: {char}")
        print(e.message)
        # ``exit`` is a convenience installed by the ``site`` module and is
        # not guaranteed to exist; raising SystemExit is the same exit path.
        raise SystemExit(1)

    tokens.append(Token(ENDMARKER, ''))
    return tokens