Example #1
File: lexer.py Project: ischaojie/nan
    def error(self):
        s = "Lexer error on '{lexeme}' line: {lineno} column: {column}".format(
            lexeme=self.current_char,
            lineno=self.lineno,
            column=self.column,
        )
        raise LexerError(message=s)
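The examples on this page raise a LexerError, but none of them include its definition, and the snippets come from different projects: Examples #1 and #8 pass a message keyword argument and later read .message, while Examples #2-#7 also pass a line number and a character position. A minimal sketch of an exception class compatible with both call shapes (hypothetical, not taken from any of these projects) could look like this:

class LexerError(Exception):
    """Raised when the lexer cannot turn the current input into a token.

    Hypothetical sketch: the real definitions behind these examples are
    not shown on this page.
    """

    def __init__(self, message=None, line=None, char_pos=None):
        super().__init__(message)
        self.message = message      # read back in Example #8 as e.message
        self.line = line            # source line, as passed in Examples #2-#7
        self.char_pos = char_pos    # character position within that line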
Example #2
    def comment(self):
        curr_char = self.curr_char()

        if curr_char == '}':
            raise LexerError("Invalid Comment", self.line, self.char_pos)

        while curr_char and curr_char != '}':
            if curr_char == '\n':
                self.next_line()

            self.forward()
            curr_char = self.curr_char()

        # Unclosed Comment
        if self.pos >= self.end:
            raise LexerError("Invalid Comment", self.line, self.char_pos)

        # Exit Comment
        self.forward()
Example #3
    def decimal(self, beg):
        self.forward()  # skip '.'
        curr_char = self.curr_char()

        # A digit must follow the dot
        if not curr_char or not curr_char.isdigit():
            raise LexerError("Invalid constant", self.line, self.char_pos)

        while curr_char and curr_char.isdigit():
            self.forward()
            curr_char = self.curr_char()

        next_next = self.next_char()
        if curr_char == 'E' and next_next:
            return self.big_e(beg)
        elif curr_char.isalpha():
            raise LexerError("Invalid constant", self.line, self.char_pos)
        else:
            self.addop_flag = True
            return Token("REALCONSTANT", self.buff[beg:self.pos], self.line)
Example #4
    def big_e(self, beg):
        self.forward()  # skip 'E'
        curr_char = self.curr_char()
        if not (curr_char == '+' or curr_char == '-' or curr_char.isdigit()):
            raise LexerError("Invalid constant", self.line, self.char_pos)
        if (curr_char == '+'
                or curr_char == '-') and not self.next_char().isdigit():
            raise LexerError("Invalid constant", self.line, self.char_pos + 1)

        self.forward()  # skip preliminary +, - or digit
        curr_char = self.curr_char()
        while curr_char and curr_char.isdigit():
            self.forward()
            curr_char = self.curr_char()

        # Alphabetical characters are not allowed as part of a number
        if curr_char.isalpha():
            raise LexerError("Invalid constant", self.line, self.char_pos)

        self.addop_flag = True
        return Token("REALCONSTANT", self.buff[beg:self.pos], self.line)
Example #5
    def digits(self):
        beg = self.pos  # beginning of constant
        curr_char = self.curr_char()
        while curr_char and curr_char.isdigit():
            self.forward()
            curr_char = self.curr_char()

        # ensures there are more characters after the current one
        next_next = self.next_char()
        if curr_char == 'E' and next_next:
            return self.big_e(beg)
        elif curr_char == '.' and next_next and next_next != '.':
            return self.decimal(beg)
        elif curr_char.isalpha():  # not including E
            raise LexerError("Invalid constant", self.line, self.char_pos)
        else:
            self.addop_flag = True
            return Token("INTCONSTANT", self.buff[beg:self.pos], self.line)
Example #6
    def letters(self):
        id = ""
        curr_char = self.curr_char()
        while curr_char and (curr_char.isalpha() or curr_char.isdigit()):
            id += curr_char
            self.forward()
            curr_char = self.curr_char()

        if len(id) > self.max_id_len:
            raise LexerError("Identifier too long", self.line, self.char_pos)

        tk = Token(id, None, self.line, 1)
        if tk.type() == "IDENTIFIER":
            self.addop_flag = True
        else:
            self.addop_flag = False  # consumed, only valid after one token

        return tk
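Examples #3-#7 build tokens with calls such as Token("REALCONSTANT", self.buff[beg:self.pos], self.line) and Token(id, None, self.line, 1), and Example #6 then inspects tk.type(). The Token class itself is not part of these listings; the following is only a hypothetical sketch of an interface that would satisfy those calls, reading the trailing 1 as a "classify this word as keyword or identifier" flag, with KEYWORDS as an invented placeholder table rather than the project's real keyword set.

KEYWORDS = {"PROGRAM", "BEGIN", "END", "IF", "THEN", "ELSE", "WHILE"}  # illustrative only

class Token:
    def __init__(self, type_or_lexeme, value=None, line=None, classify=0):
        if classify:
            # A word was just read: keywords keep their own token type,
            # anything else becomes an IDENTIFIER carrying the lexeme.
            word = type_or_lexeme.upper()
            if word in KEYWORDS:
                self._type, self._value = word, None
            else:
                self._type, self._value = "IDENTIFIER", type_or_lexeme
        else:
            self._type, self._value = type_or_lexeme, value
        self.line = line

    def type(self):
        return self._type

    def value(self):
        return self._value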
Example #7
    def next_token(self):
        if self.pos > self.end:
            return None
        elif self.pos == self.end:
            self.forward()
            return Token("ENDOFFILE", None, self.line)

        curr_char = self.curr_char()
        if curr_char == " " or curr_char == '\t' or curr_char == '\n':
            self.space()
            return self.next_token()
        elif curr_char == '{' or curr_char == '}':
            self.comment()
            return self.next_token()
        elif is_punct(curr_char):
            return self.punct()
        elif curr_char.isalpha():
            return self.letters()
        elif curr_char.isdigit():
            return self.digits()
        else:
            raise LexerError("Invalid character", self.line, self.char_pos)
Example #8
def tokenize(text):
    position = 0
    tokens = []

    while True:
        # the position helpers appear to return -1 at end of input,
        # which is what terminates this loop
        if position < 0:
            break
        char = text[position]

        # whitespace
        if char == ' ':
            position = skip_until_nonspace(text, position)
            continue

        # comments
        if char == '-' and next_char(text, position) == '-':
            start = position
            position = skip_until_newline(text, position)
            # tokens.append(Token(COMMENT, text[start:position]))
            continue

        # tabs
        if char == '\t':
            tokens.append(Token(TAB, char))
            position = advance(text, position)
            continue

        # symbols
        if char in SYMBOLS:
            if char == '.':
                token_type = PERIOD
            elif char == '(':
                token_type = LPAREN
            elif char == ')':
                token_type = RPAREN
            elif char == ',':
                token_type = COMMA
            else:
                token_type = SYMBOL
            tokens.append(Token(token_type, char))
            position = advance(text, position)
            continue

        # newlines
        if char == '\n':
            # tokens.append(Token(NEWLINE, char))
            position = advance(text, position)
            continue

        # numbers
        if char.isdigit():
            start = position
            position = skip_until_nondigit(text, position)
            tokens.append(Token(NUMBER, text[start:position]))
            continue

        # words
        if char in ALPHABET:
            start = position
            position = skip_until_nonalpha(text, position)
            characters = text[start:position]
            if len(characters) > 1:
                token_type = WORD
            elif characters.isupper():
                token_type = LETTER_UPPER
            else:
                token_type = LETTER_LOWER
            tokens.append(Token(token_type, characters))
            continue

        e = LexerError(message=f"Invalid character: {char}")
        print(e.message)
        exit(1)

    tokens.append(Token(ENDMARKER, ''))
    return tokens
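tokenize() in Example #8 leans on several position helpers (advance, next_char, skip_until_nonspace, and friends) that are not part of the listing. Judging by the position < 0 check at the top of the loop, they appear to signal end of input by returning -1; the sketches below are assumptions written to that convention, not the project's actual code.

def advance(text, position):
    # Step one character forward; assumed to return -1 once past the end,
    # which is what breaks tokenize()'s main loop.
    position += 1
    return position if position < len(text) else -1

def next_char(text, position):
    # Peek at the character after `position`; '' when there is none.
    return text[position + 1] if position + 1 < len(text) else ''

def skip_until_nonspace(text, position):
    # Advance past a run of spaces, using the same -1-at-end convention.
    while position < len(text) and text[position] == ' ':
        position += 1
    return position if position < len(text) else -1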