def tokenize_string(lexer, char):
    """Tokenize a double-quoted string literal at the lexer position.

    Scans ``lexer.input`` from ``lexer.position`` up to and including the
    first unescaped double quote that follows the opening one. The token
    value keeps both quotes and every escape character as written.

    Args:
        lexer: Object providing ``input``, ``position``, ``line``,
            ``column`` and ``move_right_n``.
        char: Character at the current position; must classify as
            ``CharType.DOUBLE_QUOTE``.

    Returns:
        A ``TokenType.STRING`` token, or ``None`` when no closing quote is
        found (``move_right_n`` is not called in that case).

    Raises:
        ValueError: If ``char`` is not a double quote.
    """
    if CharUtils.classify(char) is not CharType.DOUBLE_QUOTE:
        raise ValueError(
            'Strings must begin and end with DOUBLE_QUOTE (").')

    start_line = lexer.line
    start_column = lexer.column
    start = lexer.position

    consumed = []
    escaped = False
    terminated = False
    for position in range(start, len(lexer.input)):
        c = lexer.input[position]
        consumed.append(c)
        if position == start:
            # The opening quote can never close the string.
            continue

        char_type = CharUtils.classify(c)
        if escaped:
            # This character is consumed by the preceding escape, so an
            # escaped quote (or an escaped escape character) is plain text.
            escaped = False
        elif char_type is CharType.ESCAPE_CHARACTER:
            escaped = True
        elif char_type is CharType.DOUBLE_QUOTE:
            terminated = True
            break

    # Rely on an explicit flag rather than inspecting the last character:
    # a lone opening quote or a trailing escaped quote also ends in '"'
    # without the string actually being closed.
    if not terminated:
        return None

    token_value = ''.join(consumed)
    lexer.move_right_n(len(token_value))
    return Token(TokenType.STRING, token_value, start_line, start_column)
def tokenize_identifier(lexer, char):
    """Tokenize an identifier or reserved word at the lexer position.

    Consumes characters from ``lexer.input`` starting at ``lexer.position``
    for as long as they classify into ``CharUtils.IDENTIFIER_TYPES``, then
    advances the lexer by the number of characters consumed.

    Args:
        lexer: Object providing ``input``, ``position``, ``line``,
            ``column`` and ``move_right_n``.
        char: Character at the current position; must classify as
            ``CharType.LETTER`` or ``CharType.UNDERSCORE``.

    Returns:
        A token whose type is the matching keyword ``TokenType`` when the
        text is a reserved word, otherwise ``TokenType.IDENTIFIER``.

    Raises:
        ValueError: If ``char`` cannot start an identifier.
    """
    leading_types = [CharType.LETTER, CharType.UNDERSCORE]
    if CharUtils.classify(char) not in leading_types:
        raise ValueError(
            'Identifiers must begin with LETTER or UNDERSCORE (_).')

    origin = (lexer.line, lexer.column)

    accepted = []
    for candidate in lexer.input[lexer.position:]:
        if CharUtils.classify(candidate) not in CharUtils.IDENTIFIER_TYPES:
            break
        accepted.append(candidate)
    text = ''.join(accepted)

    lexer.move_right_n(len(text))

    # Every reserved word maps to the TokenType member with the same name
    # in upper case (e.g. 'while' -> TokenType.WHILE).
    reserved_words = (
        'abstract', 'as', 'class', 'else', 'extends', 'false', 'final',
        'func', 'for', 'if', 'in', 'lazy', 'let', 'new', 'null',
        'override', 'private', 'protected', 'return', 'super', 'to',
        'this', 'true', 'var', 'while',
    )
    keyword_types = {
        word: getattr(TokenType, word.upper()) for word in reserved_words
    }

    if text in keyword_types:
        token_type = keyword_types[text]
    else:
        token_type = TokenType.IDENTIFIER
    return Token(token_type, text, *origin)
def _get_initial_transition(self, char):
    """Return the state reached from the initial state on ``char``.

    A dot leads to ``FRACTIONAL_BEGINNING``, a character classified in
    ``CharUtils.DIGIT_TYPES`` leads to ``INTEGER``, and anything else
    leads to ``INVALID``.
    """
    if char != '.':
        is_digit = CharUtils.classify(char) in CharUtils.DIGIT_TYPES
        return self.State.INTEGER if is_digit else self.State.INVALID
    return self.State.FRACTIONAL_BEGINNING
def tokenize_operator(lexer, char):
    """Tokenize a one- or two-character operator at the lexer position.

    A dot directly followed by a digit is handed to
    ``Tokenizer.tokenize_number``. Otherwise the operator is widened by
    the look-ahead character when the pair forms ``<op>=``, ``<-``,
    ``->``, ``&&`` or ``||``.

    Args:
        lexer: Object providing ``look_ahead``, ``line``, ``column`` and
            ``move_right_n``.
        char: Character at the current position.

    Returns:
        The operator token, the result of ``Tokenizer.tokenize_number``
        for a dot followed by a digit, or ``None`` for an ampersand or
        pipe that is not doubled.
    """
    char_type = CharUtils.classify(char)
    operator_type = OperatorType.map_operator_to_type(char)
    next_char = lexer.look_ahead()
    next_operator_type = OperatorType.map_operator_to_type(next_char)
    start_line, start_column = lexer.line, lexer.column

    # Operators that may be followed by `=` to form a compound operator.
    may_take_equal = (
        OperatorType.PLUS,
        OperatorType.MINUS,
        OperatorType.MOD,
        OperatorType.DIV,
        OperatorType.TIMES,
        OperatorType.LESS,
        OperatorType.EQUAL,
        OperatorType.GREATER,
        OperatorType.NOT,
    )

    if char_type is CharType.DOT:
        if CharUtils.classify(lexer.look_ahead()) in CharUtils.DIGIT_TYPES:
            return Tokenizer.tokenize_number(lexer, char)
        token_value = char
    else:
        pairs_with_next = (
            (operator_type in may_take_equal
             and next_operator_type is OperatorType.EQUAL)
            # `<-` arrow
            or (operator_type is OperatorType.LESS
                and next_operator_type is OperatorType.MINUS)
            # `->` arrow
            or (operator_type is OperatorType.MINUS
                and next_operator_type is OperatorType.GREATER)
        )
        if pairs_with_next:
            token_value = char + next_char
        elif (operator_type is OperatorType.AMP
              or operator_type is OperatorType.PIPE):
            # `&` and `|` are only accepted doubled.
            if next_operator_type != operator_type:
                return None
            token_value = char + next_char
        else:
            token_value = char

    lexer.move_right_n(len(token_value))
    token_type = TokenType.get_operator_token_type(token_value)
    return Token(token_type, token_value, start_line, start_column)
def _get_integer_transition(self, char):
    """Return the state reached from ``INTEGER`` on ``char``.

    A dot leads to ``FRACTIONAL_BEGINNING``, a member of
    ``self.EXPONENTIAL_PREFIXES`` leads to ``EXPONENTIAL_BEGINNING``, a
    character classified in ``CharUtils.DIGIT_TYPES`` stays in
    ``INTEGER``, and anything else leads to ``INVALID``.
    """
    if char == '.':
        return self.State.FRACTIONAL_BEGINNING
    if char in self.EXPONENTIAL_PREFIXES:
        return self.State.EXPONENTIAL_BEGINNING
    is_digit = CharUtils.classify(char) in CharUtils.DIGIT_TYPES
    return self.State.INTEGER if is_digit else self.State.INVALID
def tokenize_delimiter(lexer, char):
    """Tokenize a single delimiter character at the lexer position.

    Args:
        lexer: Object providing ``line``, ``column`` and ``move_right``.
        char: Character at the current position; must classify as
            ``CharType.DELIMITER``.

    Returns:
        A token typed by ``TokenType.get_delimiter_token_type(char)`` and
        positioned where the delimiter started.

    Raises:
        ValueError: If ``char`` is not a delimiter.
    """
    if CharUtils.classify(char) is CharType.DELIMITER:
        # Record the position before advancing past the delimiter.
        line = lexer.line
        column = lexer.column
        lexer.move_right()
        token_type = TokenType.get_delimiter_token_type(char)
        return Token(token_type, char, line, column)

    raise ValueError(
        'Character must be one of [, ], {, }, (, ), :, or comma.')
def tokenize_newline(lexer, char):
    """Tokenize a newline character at the lexer position.

    Args:
        lexer: Object providing ``line``, ``column`` and ``next_line``.
        char: Character at the current position; must classify as
            ``CharType.NEWLINE``.

    Returns:
        A ``TokenType.NEWLINE`` token positioned where the newline was
        found; ``lexer.next_line()`` is called before it is built.

    Raises:
        ValueError: If ``char`` is not a newline.
    """
    if CharUtils.classify(char) is not CharType.NEWLINE:
        # The backslash is escaped so the message shows the two characters
        # `\n` instead of breaking the message across two lines.
        raise ValueError('Character must be newline (\\n).')

    start_line, start_column = lexer.line, lexer.column
    lexer.next_line()
    return Token(
        TokenType.NEWLINE,
        char,
        start_line,
        start_column,
    )
def next_token(self):
    """Return the next token from the input.

    Whitespace is skipped first; the character found afterwards selects a
    tokenizer from ``self.tokenizer_map`` by its ``CharUtils`` class.

    Returns:
        A ``TokenType.END_OF_INPUT`` token (value, line and column all
        ``None``) when nothing but whitespace remains, otherwise whatever
        the selected tokenizer returns.

    Raises:
        ValueError: If no tokenizer is registered for the character class.
    """
    # Skip whitespace before testing for end of input: when only trailing
    # whitespace remains, the position lands on len(self.input) and
    # indexing it would raise IndexError.
    self.skip_whitespace()
    if self.position >= len(self.input):
        return Token(TokenType.END_OF_INPUT, None, None, None)

    char = self.input[self.position]
    tokenizer_func = self.tokenizer_map.get(CharUtils.classify(char), None)
    if not tokenizer_func:
        raise ValueError('Token not in tokenizer map.')
    return tokenizer_func(self, char)
def tokenize_number(lexer, char):
    """Tokenize a numeric literal at the lexer position.

    The rest of the input, from ``lexer.position`` on, is run through a
    fresh ``NumberFSM``; the lexer is advanced only when the machine
    reports a token type.

    Args:
        lexer: Object providing ``input``, ``position``, ``line``,
            ``column`` and ``move_right_n``.
        char: Character at the current position; must classify into
            ``CharUtils.DIGIT_TYPES`` or be ``CharType.DOT``.

    Returns:
        A token carrying the type and text reported by the state machine,
        or ``None`` when the machine reports no token type.

    Raises:
        ValueError: If ``char`` cannot start a number.
    """
    leading_type = CharUtils.classify(char)
    if leading_type not in (CharUtils.DIGIT_TYPES + [CharType.DOT]):
        raise ValueError(
            'Numbers must begin with ZERO, POSITIVE_DIGIT, or dot (.).')

    number_fsm = NumberFSM()
    result = number_fsm.run(lexer.input[lexer.position:])
    token_type, token_value = result
    if not token_type:
        return None

    # Valid number: remember where it started, then move past its text.
    line, column = lexer.line, lexer.column
    lexer.move_right_n(len(token_value))
    return Token(token_type, token_value, line, column)
def run(self, input):
    """Feed ``input`` through the state machine.

    Characters are consumed until one fails
    ``CharUtils.is_valid_number_character`` or the input is exhausted.
    Each consumed character advances the state via ``self.next_state``.

    Args:
        input: Iterable of characters to consume.

    Returns:
        ``(token_type, text)`` when the final state is in
        ``self.accepting_states``; ``token_type`` is looked up in
        ``self.ACCEPTING_STATE_TO_TOKEN_MAP`` (``None`` if absent) and
        ``text`` is the consumed characters. ``(None, None)`` otherwise.
    """
    state = self.initial_state
    consumed = []
    for symbol in input:
        if not CharUtils.is_valid_number_character(symbol):
            # First character that cannot belong to a number ends the scan.
            break
        consumed.append(symbol)
        state = self.next_state(state, symbol)

    if state not in self.accepting_states:
        return (None, None)

    token_type = self.ACCEPTING_STATE_TO_TOKEN_MAP.get(state, None)
    return (token_type, ''.join(consumed))
def _get_exponential_number_transition(self, char):
    """Return the state reached from ``EXPONENTIAL_NUMBER`` on ``char``.

    A character classified in ``CharUtils.DIGIT_TYPES`` stays in
    ``EXPONENTIAL_NUMBER``; anything else leads to ``INVALID``.
    """
    is_digit = CharUtils.classify(char) in CharUtils.DIGIT_TYPES
    return self.State.EXPONENTIAL_NUMBER if is_digit else self.State.INVALID
def _get_signed_exponential_transition(self, char):
    """Return the state reached from ``SIGNED_EXPONENTIAL`` on ``char``.

    Only a character classified as ``CharType.POSITIVE_DIGIT`` leads to
    ``EXPONENTIAL_NUMBER``; anything else leads to ``INVALID``.
    """
    if CharUtils.classify(char) is not CharType.POSITIVE_DIGIT:
        return self.State.INVALID
    return self.State.EXPONENTIAL_NUMBER
def _get_exponential_beginning_transition(self, char):
    """Return the state reached from ``EXPONENTIAL_BEGINNING`` on ``char``.

    A member of ``self.SIGNS`` leads to ``SIGNED_EXPONENTIAL``, a
    character classified as ``CharType.POSITIVE_DIGIT`` leads to
    ``EXPONENTIAL_NUMBER``, and anything else leads to ``INVALID``.
    """
    if char not in self.SIGNS:
        if CharUtils.classify(char) is CharType.POSITIVE_DIGIT:
            return self.State.EXPONENTIAL_NUMBER
        return self.State.INVALID
    return self.State.SIGNED_EXPONENTIAL
def _get_fractional_number_transition(self, char):
    """Return the state reached from ``FRACTIONAL_NUMBER`` on ``char``.

    A member of ``self.EXPONENTIAL_PREFIXES`` leads to
    ``EXPONENTIAL_BEGINNING``, a character classified in
    ``CharUtils.DIGIT_TYPES`` stays in ``FRACTIONAL_NUMBER``, and
    anything else leads to ``INVALID``.
    """
    if char in self.EXPONENTIAL_PREFIXES:
        return self.State.EXPONENTIAL_BEGINNING
    is_digit = CharUtils.classify(char) in CharUtils.DIGIT_TYPES
    return self.State.FRACTIONAL_NUMBER if is_digit else self.State.INVALID
def _get_fractional_beginning_transition(self, char):
    """Return the state reached from ``FRACTIONAL_BEGINNING`` on ``char``.

    A character classified in ``CharUtils.DIGIT_TYPES`` leads to
    ``FRACTIONAL_NUMBER``; anything else leads to ``INVALID``.
    """
    if CharUtils.classify(char) not in CharUtils.DIGIT_TYPES:
        return self.State.INVALID
    return self.State.FRACTIONAL_NUMBER
def skip_whitespace(self):
    """Advance past whitespace at the current position.

    Calls ``self.move_right()`` repeatedly until the position reaches the
    end of ``self.input`` or the current character fails
    ``CharUtils.is_whitespace``.
    """
    while self.position < len(self.input):
        if not CharUtils.is_whitespace(self.input[self.position]):
            break
        self.move_right()