示例#1
0
class Lexer:
    """Turn source text into a flat list of tokens.

    The lexer walks ``source`` one character at a time. ``current_symbol``
    holds the character under the cursor and is ``None`` once the cursor has
    moved past the end of the text.
    """

    def __init__(self, fname, source):
        """Remember the input and load its first character.

        Args:
            fname: Name of the file ``source`` came from; handed to
                ``Position`` (presumably for error reporting).
            source: The text to tokenize.
        """
        self.source = source
        self.fname = fname
        # Presumably the cursor starts just before the text (index -1); the
        # advance() call below is what loads the first character, if any.
        self.pos = Position(-1, 0, -1, fname, source)
        self.current_symbol = None
        self.advance()

    def advance(self):
        """Step the cursor forward and refresh ``current_symbol``.

        ``current_symbol`` becomes ``None`` when the cursor index is no
        longer inside ``source``.
        """
        self.pos.advance(self.current_symbol)
        if self.pos.index < len(self.source):
            self.current_symbol = self.source[self.pos.index]
        else:
            self.current_symbol = None

    def make_tokens(self):
        """Tokenize the whole source.

        Returns:
            ``(tokens, None)`` on success, where ``tokens`` always ends with
            a ``TT_EOF`` token, or ``([], error)`` as soon as an illegal
            symbol or a ``!`` that is not followed by ``=`` is met.
        """
        # Symbols that form a complete token on their own. The table is built
        # here (not at class level) so the token-type names are only looked
        # up when tokenizing actually runs.
        single_symbol_types = {
            "+": TT_ADD,
            "-": TT_SUB,
            "*": TT_MUL,
            "/": TT_DIV,
            "^": TT_POW,
            "(": TT_LPAREN,
            ")": TT_RPAREN,
        }
        # Symbols that may be followed by "=" to form a two-symbol token.
        comparison_makers = {
            "=": self.make_equals,
            "<": self.make_less_than,
            ">": self.make_greater_than,
        }
        tokens = []

        while self.current_symbol is not None:
            symbol = self.current_symbol
            if symbol in " \t":
                self.advance()
            elif symbol in DIGITS:
                tokens.append(self.make_number())
            elif symbol in LETTERS + "_":
                tokens.append(self.make_identifier())
            elif symbol in single_symbol_types:
                tokens.append(Token(single_symbol_types[symbol], pos_start=self.pos))
                self.advance()
            elif symbol == "!":
                token, error = self.make_not_equals()
                if error:
                    return [], error
                tokens.append(token)
            elif symbol in comparison_makers:
                tokens.append(comparison_makers[symbol]())
            else:
                pos_start = self.pos.copy()
                self.advance()
                return [], IllegalTokenError(pos_start, self.pos, f"'{symbol}'")

        tokens.append(Token(TT_EOF, pos_start=self.pos))
        return tokens, None

    def make_number(self):
        """Read an integer or decimal literal starting at the cursor.

        Digits are consumed together with at most one ``.``; a second ``.``
        ends the literal and is left for the caller.

        Returns:
            A ``TT_INT`` token (value is an ``int``) when no dot was read,
            otherwise a ``TT_DEC`` token (value is a ``float``).
        """
        num_s = ""
        has_dot = False
        pos_start = self.pos.copy()

        while self.current_symbol is not None and self.current_symbol in DIGITS + ".":
            if self.current_symbol == ".":
                if has_dot:
                    break
                has_dot = True
            num_s += self.current_symbol
            self.advance()

        if has_dot:
            return Token(TT_DEC, float(num_s), pos_start, self.pos)
        return Token(TT_INT, int(num_s), pos_start, self.pos)

    def make_identifier(self):
        """Read a run of letters, digits and underscores.

        Returns:
            A ``TT_KEY`` token when the text is listed in ``KEYWORDS``,
            otherwise a ``TT_ID`` token; the value is the text read.
        """
        id_str = ""
        pos_start = self.pos.copy()

        while self.current_symbol is not None and self.current_symbol in LETTERS_DIGITS + "_":
            id_str += self.current_symbol
            self.advance()

        token_type = TT_KEY if id_str in KEYWORDS else TT_ID
        return Token(token_type, id_str, pos_start, self.pos)

    def make_not_equals(self):
        """Read ``!=`` starting at the ``!`` under the cursor.

        Returns:
            ``(token, None)`` with a ``TT_NE`` token when ``!`` is followed
            by ``=``; otherwise ``(None, error)`` with an
            ``ExpectedSymbolError`` naming the symbol found instead. In the
            error case the offending symbol is consumed as well.
        """
        pos_start = self.pos.copy()
        self.advance()

        if self.current_symbol == "=":
            self.advance()
            return Token(TT_NE, pos_start=pos_start, pos_end=self.pos), None

        invalid_symbol = self.current_symbol
        self.advance()
        return None, ExpectedSymbolError(
            pos_start, self.pos, f"Unexpected symbol '{invalid_symbol}', expected '='"
        )

    def _make_optional_equals(self, single_type, double_type):
        """Read one symbol, plus a directly following ``=`` if present.

        Returns a token of ``single_type`` for the lone symbol or of
        ``double_type`` when the ``=`` was consumed too. The token spans from
        the first symbol to the cursor position after the last one read.
        """
        pos_start = self.pos.copy()
        self.advance()

        token_type = single_type
        if self.current_symbol == "=":
            self.advance()
            token_type = double_type

        return Token(token_type, pos_start=pos_start, pos_end=self.pos)

    def make_equals(self):
        """Read ``=`` (``TT_EQ``) or ``==`` (``TT_EE``)."""
        return self._make_optional_equals(TT_EQ, TT_EE)

    def make_less_than(self):
        """Read ``<`` (``TT_LT``) or ``<=`` (``TT_LTE``)."""
        return self._make_optional_equals(TT_LT, TT_LTE)

    def make_greater_than(self):
        """Read ``>`` (``TT_GT``) or ``>=`` (``TT_GTE``)."""
        return self._make_optional_equals(TT_GT, TT_GTE)
示例#2
0
class Lexer(object):
    """
    Lexer: performs lexical analysis, turning source text into tokens.

    ``current_char`` is the character under the cursor and is ``None`` once
    the cursor has moved past the end of ``text``.
    """

    def __init__(self, fn, text):
        """
        Store the input and load its first character.

        :param fn: origin of the text (a file name), used to locate errors
        :param text: the source text to tokenize
        """
        self.fn = fn  # where the text came from, so errors can be located
        self.text = text
        self.pos = Position(-1, 0, -1, fn, text)  # cursor position
        self.current_char = None  # character under the cursor
        # self.pos starts at -1, so advance right away onto the first character
        self.advance()

    def advance(self):
        """Move the cursor one step and read the character found there."""
        self.pos.advance(self.current_char)
        if self.pos.idx < len(self.text):
            self.current_char = self.text[self.pos.idx]
        else:
            self.current_char = None

    def make_tokens(self):
        """
        Tokenize the whole text.

        :return: ``(tokens, None)`` on success, with ``tokens`` ending in a
            ``TT_EOF`` token; ``([], error)`` as soon as an unknown character
            or a ``!`` not followed by ``=`` is met
        """
        # Characters that are a complete token by themselves. Built per call
        # so the token-type names are only resolved when tokenizing runs.
        single_char_types = {
            '^': TT_POW,  # power: x^y => x to the power of y
            '+': TT_PLUS,
            '*': TT_MUL,
            '/': TT_DIV,
            '(': TT_LPAREN,
            ')': TT_RPAREN,
            '[': TT_LSQUARE,
            ']': TT_RSQUARE,
            ',': TT_COMMA,
            ';': TT_NEWLINE,  # statement separator
            '\n': TT_NEWLINE,  # line break
        }
        # Characters that start a token needing more than one look.
        multi_char_makers = {
            '=': self.make_equals,
            '<': self.make_less_than,
            '>': self.make_greater_than,
            '-': self.make_minus_or_arrow,
            '"': self.make_string,
        }
        tokens = []

        while self.current_char is not None:
            char = self.current_char
            if char in (' ', '\t'):
                # spaces and tabs carry no meaning: skip them
                self.advance()
            elif char == '#':
                # skip comments
                self.skip_comment()
            elif char in DIGITS:  # number literal
                tokens.append(self.make_number())
            elif char in LETTERS:  # identifier or keyword
                tokens.append(self.make_identifier())
            elif char == '!':
                token, error = self.make_not_equals()
                if error:
                    return [], error
                tokens.append(token)
            elif char in multi_char_makers:
                tokens.append(multi_char_makers[char]())
            elif char in single_char_types:
                tokens.append(Token(single_char_types[char], pos_start=self.pos))
                self.advance()
            else:
                # nothing matched: report the offending character
                pos_start = self.pos.copy()
                self.advance()
                return [], IllegalCharError(pos_start, self.pos, f"'{char}'")

        tokens.append(Token(TT_EOF, pos_start=self.pos))
        return tokens, None

    def skip_comment(self):
        """
        Skip a ``#`` comment: everything up to and including the line break.

        A comment that runs to the end of the text (no trailing line break)
        simply ends there.
        """
        self.advance()
        # Stop at end of input too; otherwise a comment on the last line
        # without a trailing newline would never terminate this loop.
        while self.current_char is not None and self.current_char != '\n':
            self.advance()
        if self.current_char == '\n':
            self.advance()

    def make_number(self):
        """
        Read an integer or float literal.

        At most one ``.`` is consumed; a second one ends the literal.

        :return: a ``TT_INT`` token (int value) when no dot was read,
            otherwise a ``TT_FLOAT`` token (float value)
        """
        num_str = ''
        dot_count = 0  # number of decimal points read so far
        pos_start = self.pos.copy()  # copy, so later cursor moves do not affect it

        while self.current_char is not None and self.current_char in DIGITS + '.':
            if self.current_char == '.':
                if dot_count == 1:
                    break  # only one decimal point is allowed
                dot_count += 1
            num_str += self.current_char
            self.advance()

        if dot_count == 0:  # integer
            return Token(TT_INT, int(num_str), pos_start, self.pos)
        return Token(TT_FLOAT, float(num_str), pos_start, self.pos)

    def make_string(self):
        """
        Read a double-quoted string literal starting at the opening quote.

        A backslash escapes the next character: backslash-n and backslash-t
        become a newline and a tab, any other escaped character stands for
        itself (so an escaped quote does not end the string).

        :return: a ``TT_STRING`` token whose value is the unescaped text
        """
        string = ''
        pos_start = self.pos.copy()
        escape_character = False  # True right after a backslash
        escape_characters = {'n': '\n', 't': '\t'}

        self.advance()
        # Continue until the input ends or an unescaped closing quote is met.
        while self.current_char is not None and (self.current_char != '"'
                                                 or escape_character):
            if escape_character:
                # escaped character: translate it, or keep it as written
                string += escape_characters.get(self.current_char,
                                                self.current_char)
                escape_character = False
            elif self.current_char == '\\':
                escape_character = True  # the next character is escaped
            else:
                string += self.current_char  # ordinary character

            self.advance()

        # NOTE(review): if the input ended before a closing quote was found,
        # the text read so far is still returned as a string token and no
        # error is reported - confirm whether that is intended.
        self.advance()
        return Token(TT_STRING, string, pos_start, self.pos)

    def make_identifier(self):
        """
        Read an identifier or keyword (letters, digits and underscores).

        :return: a ``TT_KEYWORD`` token when the text is in ``KEYWORDS``,
            otherwise a ``TT_IDENTIFIER`` token
        """
        variable_str = ''
        pos_start = self.pos.copy()

        # underscores are allowed inside names
        while self.current_char is not None and self.current_char in LETTERS_DIGITS + '_':
            variable_str += self.current_char
            self.advance()

        # text listed in KEYWORDS is a keyword, anything else a variable name
        tok_type = TT_KEYWORD if variable_str in KEYWORDS else TT_IDENTIFIER
        return Token(tok_type, variable_str, pos_start, self.pos)

    def make_not_equals(self):
        """
        Match ``!=``.

        :return: ``(token, None)`` with a ``TT_NE`` token, or
            ``(None, error)`` with an ``ExpectedCharError`` when the ``!`` is
            not followed by ``=``
        """
        pos_start = self.pos.copy()

        self.advance()
        if self.current_char == '=':  # != means "not equal"
            self.advance()
            return Token(TT_NE, pos_start=pos_start, pos_end=self.pos), None

        self.advance()
        return None, ExpectedCharError(pos_start, self.pos, "'=' (after '!')")

    def _make_with_optional_suffix(self, tok_type, suffix, extended_type):
        """
        Read one character plus a directly following ``suffix`` if present.

        :return: a token of ``tok_type`` for the lone character, or of
            ``extended_type`` when ``suffix`` was consumed as well
        """
        pos_start = self.pos.copy()

        self.advance()
        if self.current_char == suffix:
            self.advance()
            tok_type = extended_type
        return Token(tok_type, pos_start=pos_start, pos_end=self.pos)

    def make_equals(self):
        """
        Match ``=`` or ``==``.

        :return: a ``TT_EQ`` or ``TT_EE`` token
        """
        return self._make_with_optional_suffix(TT_EQ, '=', TT_EE)

    def make_less_than(self):
        """
        Match ``<`` or ``<=``.

        :return: a ``TT_LT`` or ``TT_LTE`` token
        """
        return self._make_with_optional_suffix(TT_LT, '=', TT_LTE)

    def make_greater_than(self):
        """
        Match ``>`` or ``>=``.

        :return: a ``TT_GT`` or ``TT_GTE`` token
        """
        return self._make_with_optional_suffix(TT_GT, '=', TT_GTE)

    def make_minus_or_arrow(self):
        """
        Match ``-`` or ``->``.

        :return: a ``TT_MINUS`` or ``TT_ARROW`` token
        """
        return self._make_with_optional_suffix(TT_MINUS, '>', TT_ARROW)
示例#3
0
class Lexer:
    """
    Lexer class that handles lexical analysis of the source code.

    Attributes:
        error: Error or None, set as soon as a problem is detected
        filename: string, Name of the file currently being lexed
        source_code: string, The source code
        position: Position, cursor into the source code (its ``index`` and
            ``indent`` attributes are read and updated by this class)
        current_character: string or None, the character under the cursor;
            None once the end of the source code has been reached
        tokens: list, A list of all lexed tokens
    """
    def __init__(self, source_code="", filename="CLI"):
        """
        Inits class with source code and filename, inits a Position object and gets first character.

        If either argument is not a ``str``, ``self.error`` is set and the
        lexer is left empty (no position, no current character).
        """

        self.error = None
        # Safe defaults first, so that a lexer whose arguments were rejected
        # is still a complete object: make_tokens() on it simply does nothing
        # instead of failing on a missing attribute.
        self.filename = ""
        self.source_code = ""
        self.position = None
        self.current_character = None
        self.tokens = []

        if not isinstance(source_code, str):
            self.error = Error("Error: expected 'str' as source_code")
            return
        if not isinstance(filename, str):
            self.error = Error("Error: expected 'str' as filename")
            return

        self.filename = filename
        self.source_code = source_code
        self.position = Position(-1, 0, -1, 0, filename)
        self.advance()

    def advance(self, n=1):
        """
        Advances current_character to the next character in the source code.
        Changes the Position-object to the new position in the code.

        PARAMS:
            n: int - How many characters to step forward
        RETURNS:
            bool: False if the end of the source code was reached while
            stepping (current_character is then None), True otherwise
        """
        for _ in range(n):
            self.position.advance(self.current_character)

            if self.position.index < len(self.source_code):
                self.current_character = self.source_code[self.position.index]
            else:
                self.current_character = None
                return False
        return True

    def look_ahead(self, count=1):
        """
        Looks ahead on coming characters in the source code without moving.

        PARAMS:
            count: int - Specifies how many characters to fetch
        RETURNS:
            str: the following ``count`` characters, or None when fewer than
            ``count`` characters remain or ``count`` is invalid (in the
            latter case ``self.error`` is set as well)
        """
        if not isinstance(count, int):
            self.error = Error("Error: count is expected to be an int")
            return None

        if count < 1:
            self.error = Error(
                "Error: count is expected be a positive integer")
            return None

        first = self.position.index + 1
        last = first + count

        if len(self.source_code) < last:
            return None
        return self.source_code[first:last]

    def allowed_character(self, allowed_characters):
        """
        Tells whether the current character is one of ``allowed_characters``.
        The comparison ignores case.

        RETURNS:
            bool: True on a match. False otherwise; ``self.error`` is set
            when there is no current character (end of source code) or when
            ``allowed_characters`` is None or empty.
        """
        if self.current_character is None:
            self.error = Error("Error: Unexpected end of source code.")
            return False
        if allowed_characters is None or allowed_characters == "":
            self.error = Error("Error: No characters to allow entered.")
            return False
        return self.current_character.lower() in allowed_characters.lower()

    def make_tokens(self):
        """
        Performs the lexical analysis on the source code and breaks it down to terminal tokens.

        The tokens are appended to ``self.tokens``. Lexing stops at the first
        problem, which is reported through ``self.error``.

        RETURNS:
            None
        """
        while self.current_character:
            if self.allowed_character("0123456789"):
                self.tokens.append(self.make_number())
            elif self.current_character == "\n":
                self._handle_newline()
            elif self.allowed_character("'\""):
                self.tokens.append(self.make_string())
            elif self.is_operator():
                self.tokens.append(self.make_operator())
            else:
                self._handle_other_character()
            # Every branch reports failure through self.error.
            if self.error:
                return

    def _handle_newline(self):
        """Emits a NEWLINE token and any INDENT/DEDENT tokens for the next line."""
        start = self.position.copy()
        self.advance()
        self.tokens.append(
            Token(tt._NEWLINE, '\n', start, self.position.copy()))

        upcoming = self.current_character
        if upcoming is None or not upcoming.isspace():
            # The next line starts at column 0 (or the input has ended).
            # check_indent() only accepts a line break or spaces and would
            # report such a line as an indentation error, so level 0 is
            # decided here.
            indent = 0
        else:
            indent = self.check_indent()
            if self.error:
                return
        if indent != self.position.indent:
            self.change_indent(indent)

    def _handle_other_character(self):
        """Lexes an identifier/keyword, skips blanks, or flags an invalid character."""
        is_letter, error = isLetter(self.current_character)
        if error:
            self.error = error
        elif is_letter:
            self.tokens.append(self.make_symbol())
        elif self.allowed_character(" \t"):
            self.advance()
        else:
            start = self.position.copy()
            char = self.current_character
            self.advance()
            end = self.position.copy()
            self.tokens.append(Token(tt._INVALID, char, start, end))
            self.error = Error("ValueError: Unexpected character")

    def make_number(self):
        """
        Reads characters from source code and returns a number token.
        The method can parse integers, floats, octal, hexadecimal and binary numbers.

        A leading ``0`` followed by ``b``, ``o`` or ``x`` (any case) selects
        the binary, octal or hexadecimal reader; everything else is read as a
        decimal number.
        """
        if self.current_character == '0':
            next_character = self.look_ahead()
            if next_character:
                next_character = next_character.lower()
                if next_character == 'b':
                    return self.make_binary()
                elif next_character == 'o':
                    return self.make_octodecimal()
                elif next_character == 'x':
                    return self.make_hexadecimal()
        return self.make_decimal()

    def _make_prefixed_number(self, prefix, allowed_chars, base, token_type):
        """
        Reads a number written as a two character prefix followed by digits.

        PARAMS:
            prefix: str - Expected lower-case prefix, e.g. '0b'
            allowed_chars: str - Digits accepted after the prefix
            base: int - Numeric base handed to int()
            token_type: Token type used for a successfully read number
        RETURNS:
            Token: a ``token_type`` token holding the int value, or an
            INVALID token (with ``self.error`` set) when the prefix is wrong
            or no digit follows it
        """
        literal = ""
        start_position = self.position.copy()

        # Read the prefix. Stopping at the end of the source code keeps a
        # too short input on the regular error path below.
        for _ in range(2):
            if self.current_character is None:
                break
            literal += self.current_character
            self.advance()

        if literal.lower() != prefix:
            end_position = self.position.copy()
            self.error = Error("ValueError: Can not convert to a number")
            return Token(tt._INVALID, literal, start_position, end_position)

        while self.current_character and self.allowed_character(allowed_chars):
            literal += self.current_character
            self.advance()

        end_position = self.position.copy()

        if len(literal) < 3:
            # Only the prefix was found, there is nothing to convert.
            self.error = Error("ValueError: Can not convert to a number")
            return Token(tt._INVALID, literal, start_position, end_position)

        # int() accepts the prefix itself when the matching base is given.
        return Token(token_type, int(literal, base=base), start_position,
                     end_position)

    def make_binary(self):
        """
        Reads binary characters until a not allowed character appears.
        Returns a binary token
        """
        return self._make_prefixed_number('0b', "01", 2, tt._BIN)

    def make_octodecimal(self):
        """
        Reads octal characters until a not allowed character appears.
        Returns an octal token
        """
        return self._make_prefixed_number('0o', "01234567", 8, tt._OCT)

    def make_hexadecimal(self):
        """
        Reads hexadecimal characters until a not allowed character appears.
        Returns a hexadecimal token
        """
        return self._make_prefixed_number('0x', "0123456789abcdef", 16,
                                          tt._HEX)

    def make_decimal(self):
        """
        Reads decimal characters until a not allowed character appears.
        Returns an integer or float token

        At most one dot is consumed; a second dot ends the number and is left
        in the source code. An INVALID token is returned (and ``self.error``
        set) when the current character can not start a number or when no
        digit was read at all.
        """
        number_string = ""
        has_dot = False
        allowed_chars = "1234567890."

        start_position = self.position.copy()

        first = self.current_character
        if first is None or first not in allowed_chars:
            self.error = Error("ValueError: Expected a digit or dot '.'")
            self.advance()
            end_position = self.position.copy()
            return Token(tt._INVALID, first, start_position, end_position)

        while (self.current_character is not None
               and self.allowed_character(allowed_chars)):
            if self.current_character == ".":
                if has_dot:
                    break
                has_dot = True
            number_string += self.current_character
            self.advance()

        end_position = self.position.copy()

        if number_string == ".":
            # A lone dot holds no digits and can not be converted.
            self.error = Error("ValueError: Can not convert to a number")
            return Token(tt._INVALID, number_string, start_position,
                         end_position)

        if has_dot:
            return Token(tt._FLOAT, float(number_string), start_position,
                         end_position)
        return Token(tt._INT, int(number_string), start_position,
                     end_position)

    def make_symbol(self):
        """
        Creates identifiers and keywords.
        Reads characters from the source code until getting to a non allowed
        character, decides if it's a keyword or identifier and returns a token.

        RETURNS:
            Token: the type is whatever isKeyword() reports for the text; an
            INVALID token (with ``self.error`` set) when the first character
            is not a letter or underscore or when isKeyword() reports an error
        """
        allowed_chars = "1234567890_abcdefghijklmnopqrstuvwxyz"
        symbol = ""
        start = self.position.copy()

        # A symbol must not start with a digit: skip the ten digits.
        if not self.allowed_character(allowed_chars[10:]):
            symbol = self.current_character
            self.advance()
            end = self.position.copy()
            self.error = Error(
                "ValueError: Unexpected illegal character {}".format(symbol))
            return Token(tt._INVALID, symbol, start, end)

        while self.allowed_character(allowed_chars):
            symbol += self.current_character
            if not self.advance():
                break

        end = self.position.copy()

        symbol_type, error = isKeyword(symbol)

        if error:
            self.error = error
            return Token(tt._INVALID, symbol, start, end)

        return Token(symbol_type, symbol, start, end)

    def make_string(self):
        """
        Reads a string literal starting at its opening quotation mark.

        The string ends at the same kind of quotation mark that opened it.
        Inside the string a backslash starts an escape sequence:
        backslash + n gives a newline, backslash + t a tab, backslash + quote
        the quote itself, backslash + backslash a single backslash, and a
        backslash directly before a line break continues the string on the
        next line. Any other backslash is kept as written.

        RETURNS:
            Token: a STRING token holding the unescaped text, or an INVALID
            token (with ``self.error`` set) when a line break or the end of
            the source code is met before the closing quotation mark
        """
        start = self.position.copy()
        qm = self.current_character
        not_allowed_chars = qm + "\n"
        # Escape sequences that stand for exactly one character.
        simple_escapes = {'n': '\n', 't': '\t', "'": "'", '"': '"'}

        string = ""
        while self.advance() and not self.allowed_character(not_allowed_chars):
            if self.current_character == '\\':
                upcoming = self.look_ahead()
                if upcoming == '\n':
                    # Line continuation: drop both backslash and line break.
                    self.advance()
                    continue
                if upcoming in simple_escapes:
                    self.advance()
                    string += simple_escapes[upcoming]
                    continue
                if upcoming == '\\':
                    # Step onto the second backslash; it is added below.
                    self.advance()

            string += self.current_character

        if self.current_character == qm:
            self.advance()
            end = self.position.copy()
            return Token(tt._STRING, string, start, end)

        if self.current_character == "\n":
            self.advance()
            self.error = Error("StringError: Incorrect line break in string")
        else:
            # The source code ended before the closing quotation mark.
            self.error = Error("StringError: Unterminated string")
        end = self.position.copy()
        return Token(tt._INVALID, string, start, end)

    def check_indent(self):
        """
        Checks indentation level.

        Expects the current character to be a line break (which is skipped)
        or a space. Four spaces make one level.

        Returns
            int - Level of indentation, or None (with ``self.error`` set and
            an INVALID token appended) when the current character is neither
            a line break nor a space, or when the number of spaces is not a
            multiple of four
        """
        if self.current_character == "\n":
            self.advance()
        elif self.current_character != " ":
            start = self.position.copy()
            char = self.current_character
            self.advance()
            end = self.position.copy()
            self.error = Error("IndentationError: Unexpected character")
            self.tokens.append(Token(tt._INVALID, char, start, end))
            return None

        count = 0
        start = None
        while self.current_character == " ":
            if count % 4 == 0:
                # Remember where the current group of four spaces begins.
                start = self.position.copy()
            count += 1
            self.advance()

        if count % 4 == 0:
            return count // 4

        self.error = Error("IndentationError: Invalid indentation")
        end = self.position.copy()
        self.tokens.append(
            Token(tt._INVALID, " " * (count % 4), start, end))
        return None

    def change_indent(self, indent):
        """
        Generates indent and dedent tokens to change indentation level.

        One INDENT or DEDENT token is appended per level until
        ``self.position.indent`` equals ``indent``. ``self.error`` is set
        when ``indent`` is not a non-negative int.
        """
        if not isinstance(indent, int) or indent < 0:
            self.error = Error("ValueError: Positive integer expected")
            return

        while self.position.indent < indent:
            self.position.indent += 1
            # Copies, like every other token in this class: the live
            # position object keeps changing as lexing continues.
            self.tokens.append(
                Token(tt._INDENT, "    ", self.position.copy(),
                      self.position.copy()))

        while self.position.indent > indent:
            self.position.indent -= 1
            self.tokens.append(
                Token(tt._DEDENT, "    ", self.position.copy(),
                      self.position.copy()))

    def is_operator(self):
        """Tells whether the current character can start an operator or delimiter."""
        return self.current_character in (
            '=', '+', '-', '*', '/', '%', '&', '|', '^', '<', '>',
            '(', ')', '[', ']', '{', '}', '.', ',', ':',
        )

    def make_operator(self):
        """
        Reads the longest operator or delimiter starting at the current character.

        Up to three characters are considered; the longest spelling found in
        the operator table wins.

        RETURNS:
            Token: the operator token, or an INVALID token (with
            ``self.error`` set) when no operator starts here
        """
        n = self.look_ahead(2)
        if not n:
            n = self.look_ahead()

        possible_op = self.current_character
        if possible_op is None:
            self.error = Error("LexicalError: No characters in buffer")
            return Token(tt._INVALID, None, None, None)
        if n:
            possible_op += n
        start = self.position.copy()

        values = {
            '=': tt._ASSIGN,
            '==': tt._BITWISE_EQ,
            '+': tt._PLUS,
            '++': tt._INCR,
            '+=': tt._PLUS_ASSIGN,
            '-': tt._MINUS,
            '--': tt._DECR,
            '-=': tt._MINUS_ASSIGN,
            '*': tt._MULT,
            '*=': tt._MULT_ASSIGN,
            '**': tt._EXP,
            '**=': tt._POWER_ASSIGN,
            '/': tt._DIV,
            '/=': tt._DIV_ASSIGN,
            '//': tt._FLOOR,
            '//=': tt._FLOOR_ASSIGN,
            '%': tt._MOD,
            '%=': tt._MOD_ASSIGN,
            '&=': tt._AND_ASSIGN,
            '&': tt._BITWISE_AND,
            '|=': tt._OR_ASSIGN,
            '|': tt._BITWISE_OR,
            '^': tt._BITWISE_XOR,
            '^=': tt._XOR_ASSIGN,
            '<': tt._BITWISE_LT,
            '<=': tt._BITWISE_LTE,
            '<<': tt._BITWISE_LSHIFT,
            '<<=': tt._LSHIFT_ASSIGN,
            '>': tt._BITWISE_GT,
            '>=': tt._BITWISE_GTE,
            '>>': tt._BITWISE_RSHIFT,
            '>>=': tt._RSHIFT_ASSIGN,
            '(': tt._LPARAN,
            ')': tt._RPARAN,
            '[': tt._LSQBRACK,
            ']': tt._RSQBRACK,
            '{': tt._LCURLBRACK,
            '}': tt._RCURLBRACK,
            '.': tt._DOT,
            ',': tt._COMMA,
            ':': tt._COLON,
        }

        # Longest match first: three characters, then two, then one.
        for length in (3, 2, 1):
            candidate = possible_op[:length]
            if len(candidate) != length:
                continue
            token_type = values.get(candidate)
            if token_type:
                self.advance(length)
                end = self.position.copy()
                return Token(token_type, candidate, start, end)

        self.error = Error("ValueError: Token not a operator")
        char = self.current_character
        self.advance()
        end = self.position.copy()
        return Token(tt._INVALID, char, start, end)
示例#4
0
class Lexer:
    """Scan a text character by character and emit arithmetic tokens."""

    def __init__(self, file_name, text):
        self.file_name = file_name
        self.text = text
        self.position = Position(-1, 0, -1, file_name, text)
        self.current_char = None
        # advance() sets current_char from the text at the new position.
        self.advance()

    def advance(self):
        """Advance ``position`` and reload ``current_char``.

        ``current_char`` becomes ``None`` once ``position.index`` is no
        longer a valid index into ``text``.
        """
        self.position.advance(self.current_char)
        index = self.position.index
        if index < len(self.text):
            self.current_char = self.text[index]
        else:
            self.current_char = None

    def make_tokens(self):
        """Tokenize the whole text.

        Returns:
            ``(tokens, None)`` on success, where the last token is the EOF
            token, or ``([], IllegalCharError)`` at the first character that
            is not whitespace, a digit, or a known operator.
        """
        # Single-character operators and the token type each one produces.
        single_char_types = {
            '+': TOKENTYPE_PLUS,
            '-': TOKENTYPE_MINUS,
            '*': TOKENTYPE_MUL,
            '/': TOKENTYPE_DIV,
            '(': TOKENTYPE_LEFTPARENTESIS,
            ')': TOKENTYPE_RIGHTPARENTESIS,
        }
        tokens = []

        while self.current_char is not None:
            symbol = self.current_char

            if symbol in ' \t':
                self.advance()
                continue

            if symbol in DIGITS:
                # make_number() consumes the whole literal itself.
                tokens.append(self.make_number())
                continue

            if symbol in single_char_types:
                tokens.append(
                    Token(single_char_types[symbol], position_start=self.position))
                self.advance()
                continue

            # Anything else is illegal: report the span of the offending char.
            position_start = self.position.copy()
            self.advance()
            return [], IllegalCharError(position_start, self.position,
                                        ">>" + symbol + "<<")

        tokens.append(Token(TOKENTYPE_EOF, position_start=self.position))
        return tokens, None

    def make_number(self):
        """Consume a numeric literal starting at the current character.

        Reads digits and at most one '.'; a second '.' ends the literal and
        is left unconsumed. Returns an INT token when no '.' was read,
        otherwise a FLOAT token.
        """
        position_start = self.position.copy()
        pieces = []
        seen_dot = False

        while self.current_char is not None and self.current_char in DIGITS + '.':
            if self.current_char == '.':
                if seen_dot:
                    break
                seen_dot = True
            pieces.append(self.current_char)
            self.advance()

        literal = ''.join(pieces)
        if seen_dot:
            return Token(TOKENTYPE_FLOAT, float(literal), position_start, self.position)
        return Token(TOKENTYPE_INT, int(literal), position_start, self.position)
示例#5
0
class Lexer:
    """Character-level scanner that produces tokens for arithmetic input."""

    def __init__(self, fn, text):
        self.fn = fn
        self.text = text
        self.pos = Position(-1, 0, -1, fn, text)
        self.current_char = None
        self.advance()  # populate current_char from the text

    def advance(self):
        """Move ``pos`` forward and refresh ``current_char``.

        ``current_char`` is ``None`` when ``pos.idx`` is past the end of
        ``text``.
        """
        self.pos.advance(self.current_char)
        has_more = self.pos.idx < len(self.text)
        self.current_char = self.text[self.pos.idx] if has_more else None

    def make_tokens(self):
        """Scan the entire text into tokens.

        Returns:
            ``(tokens, None)`` with a trailing EOF token on success, or
            ``([], IllegalCharError)`` for the first unsupported character.
        """
        # Operator character -> token type.
        operators = {
            '+': TT_PLUS,
            '-': TT_MINUS,
            '*': TT_MUL,
            '/': TT_DIV,
            '^': TT_POW,
            '(': TT_LPAREN,
            ')': TT_RPAREN,
        }
        tokens = []

        while self.current_char is not None:
            cc = self.current_char
            if cc in ' \t':
                self.advance()
            elif cc in DIGITS:
                # The number scanner advances past the literal on its own.
                tokens.append(self.make_number())
            elif cc in operators:
                tokens.append(Token(operators[cc], pos_start=self.pos))
                self.advance()
            else:
                pos_start = self.pos.copy()
                self.advance()
                quoted = "'" + cc + "'"
                return [], IllegalCharError(pos_start, self.pos, quoted)

        eof = Token(TT_EOF, pos_start=self.pos)
        tokens.append(eof)
        return tokens, None

    def make_number(self):
        """Read an integer or float literal beginning at the current character.

        Stops at the first character that is neither a digit nor '.', or at
        a second '.', which is left unconsumed. Returns a ``TT_INT`` token
        when no '.' was read and a ``TT_FLOAT`` token otherwise.
        """
        pos_start = self.pos.copy()
        digits_and_dot = DIGITS + '.'
        chars = []
        has_dot = False

        while self.current_char is not None:
            ch = self.current_char
            if ch not in digits_and_dot:
                break
            if ch == '.':
                if has_dot:
                    break
                has_dot = True
            chars.append(ch)
            self.advance()

        num_str = ''.join(chars)
        if not has_dot:
            return Token(TT_INT, int(num_str), pos_start, self.pos)
        return Token(TT_FLOAT, float(num_str), pos_start, self.pos)
示例#6
0
class Lexer:
    """Scan source lines into one token list per line.

    ``lines`` is indexed first by ``pos.line`` and then by ``pos.column``
    (see ``advance``), i.e. it is treated as a sequence of line strings.
    """

    def __init__(self, file_name, lines):
        self.lines = lines
        self.file_name = file_name
        self.pos = Position(-1, 0, -1, file_name, lines)
        self.cur_symbol = None
        self.advance()

    def advance(self):
        """Advance ``pos`` and load the symbol found there.

        ``cur_symbol`` becomes ``None`` when ``pos.line`` or ``pos.column``
        falls outside ``lines``.
        """
        self.pos.advance(self.cur_symbol)
        next_symbol = None
        if self.pos.line < len(self.lines):
            if self.pos.column < len(self.lines[self.pos.line]):
                next_symbol = self.lines[self.pos.line][self.pos.column]
        self.cur_symbol = next_symbol

    def generate_tokens(self):
        """Tokenize every line.

        Returns:
            ``(all_tokens, None)`` on success. ``all_tokens`` is a list of
            per-line token lists (lines without tokens are omitted) followed
            by a final list holding only the EOF token.
            ``([], IllegalSymbolError)`` at the first unsupported symbol,
            including a '.' that has no digit attached to it.
        """
        all_tokens = []  # Token lists for the entire file, one per line
        tokens = []  # Tokens for the line currently being scanned
        while self.cur_symbol is not None:
            if self.cur_symbol in IRRELEVENT_SYMBOLS:
                self.advance()
            elif self.cur_symbol in DIGITS + '.':
                pos_begin = self.pos.copy()
                number = self.generate_number()
                if number is None:
                    # A bare '.' is not a number; report it instead of
                    # letting float() raise ValueError.
                    return ([],
                            IllegalSymbolError(pos_begin, self.pos, "'.'"))
                tokens.append(number)
            elif self.cur_symbol in LETTERS:
                # Guarantees the first symbol is a letter; the rest can be
                # letters, digits or '_'. The word is either a keyword,
                # variable, boolean or word operator.
                tokens.append(self.generate_word())
            elif self.cur_symbol == '+':
                tokens.append(Token(TT_PLUS, start_pos=self.pos))
                self.advance()
            elif self.cur_symbol == '-':
                tokens.append(Token(TT_MINUS, start_pos=self.pos))
                self.advance()
            elif self.cur_symbol == '*':
                tokens.append(Token(TT_MULT, start_pos=self.pos))
                self.advance()
            elif self.cur_symbol == '/':
                # '/=' is the not-equal operator, a lone '/' is division.
                tokens.append(self.generate_compare(TT_NE, TT_DIV))
            elif self.cur_symbol == '^':
                tokens.append(Token(TT_EXP, start_pos=self.pos))
                self.advance()
            elif self.cur_symbol == '%':
                tokens.append(Token(TT_MOD, start_pos=self.pos))
                self.advance()
            elif self.cur_symbol == '(':
                tokens.append(Token(TT_L_PAREN, start_pos=self.pos))
                self.advance()
            elif self.cur_symbol == ')':
                tokens.append(Token(TT_R_PAREN, start_pos=self.pos))
                self.advance()
            elif self.cur_symbol == '{':
                tokens.append(Token(TT_L_C_BRACK, start_pos=self.pos))
                self.advance()
            elif self.cur_symbol == '}':
                tokens.append(Token(TT_R_C_BRACK, start_pos=self.pos))
                self.advance()
            elif self.cur_symbol == '=':
                tokens.append(self.generate_compare(TT_EQT, TT_EQ))
            elif self.cur_symbol == '<':
                tokens.append(self.generate_compare(TT_LTE, TT_LT))
            elif self.cur_symbol == '>':
                tokens.append(self.generate_compare(TT_GTE, TT_GT))
            elif self.cur_symbol == ',':
                tokens.append(Token(TT_COMMA, start_pos=self.pos))
                self.advance()
            elif self.cur_symbol in ('\n', '#'):
                # Store tokens for the previous line when a new line appears.
                # If there are no tokens for the line, do not append an
                # empty list.
                # The '#' is for comments in the code.
                # NOTE(review): only the '#' itself is consumed here; any
                # text after it on the same line is still tokenized. Confirm
                # whether comment text is stripped before lexing.
                if len(tokens) > 0:
                    all_tokens.append(tokens)
                tokens = []
                self.advance()
            else:
                pos_begin = self.pos.copy()
                illegal_symbol = self.cur_symbol
                self.advance()
                return ([],
                        IllegalSymbolError(pos_begin, self.pos,
                                           "'" + illegal_symbol + "'"))
        # Edge case for when there isn't a new line at the end of the
        # program.
        if len(tokens) > 0:
            all_tokens.append(tokens)
        all_tokens.append([Token(TT_EOF, start_pos=self.pos)])
        return (all_tokens, None)

    def generate_number(self):
        """Consume a numeric literal starting at the current symbol.

        Reads digits and at most one '.', which is kept in the literal so
        that ``1.5`` yields ``1.5`` and ``.5`` yields ``0.5``. A second '.'
        ends the literal and is left unconsumed.

        Returns:
            A ``TT_INT`` token when no '.' was read, a ``TT_FLOAT`` token
            otherwise, or ``None`` when the literal is a lone '.' with no
            digits.
        """
        number_str = ""
        decimal_cnt = 0
        start_pos = self.pos.copy()
        while self.cur_symbol is not None and self.cur_symbol in DIGITS + '.':
            if self.cur_symbol == '.':
                if decimal_cnt == 1:
                    break
                decimal_cnt += 1
            # The '.' must be part of the text handed to float().
            number_str += self.cur_symbol
            self.advance()
        if number_str == '.':
            return None
        if decimal_cnt == 0:
            return Token(TT_INT, int(number_str), start_pos, self.pos)
        return Token(TT_FLOAT, float(number_str), start_pos, self.pos)

    def generate_word(self):
        """Consume a run of letters, digits and '_' and classify it.

        The word becomes a keyword token, a boolean token (whose value is a
        Python ``bool``), an ``and``/``or``/``not`` operator token, or an
        identifier token, checked in that order.
        """
        id_str = ""
        pos_start = self.pos.copy()
        while self.cur_symbol is not None and self.cur_symbol in LETTERS_DIGITS + '_':
            id_str += self.cur_symbol
            self.advance()
        if id_str in KEYWORDS:
            tok_type = TT_KEYWORD
        elif id_str in BOOLEANS:
            tok_type = TT_BOOL
            id_str = id_str == "True"
        elif id_str in WORD_OPERATOR:
            if id_str == "and":
                tok_type = TT_AND
            elif id_str == "or":
                tok_type = TT_OR
            else:
                # Every other word operator is treated as "not".
                tok_type = TT_NOT
        else:
            tok_type = TT_ID
        return Token(tok_type, id_str, pos_start, self.pos)

    def generate_compare(self, cmp_tok_a, cmp_tok_b):
        """Consume the current symbol and an optional following '='.

        Returns a token of type ``cmp_tok_a`` when '=' follows the current
        symbol, otherwise a token of type ``cmp_tok_b``.
        """
        start_pos = self.pos.copy()
        self.advance()
        if self.cur_symbol == '=':
            self.advance()
            return Token(cmp_tok_a, start_pos=start_pos, end_pos=self.pos)
        return Token(cmp_tok_b, start_pos=start_pos, end_pos=self.pos)
示例#7
0
class Lexer:
    """Convert source text into a list of tokens.

    The constructor does not load the first character; ``make_tokens`` does
    that with its initial ``advance()`` call.
    """

    def __init__(self, text: str, file_name: str):
        self.text = text
        self.pos = Position(-1, 0, -1, file_name, text)
        self.current_char: Optional[str] = None

    def advance(self) -> None:
        """Advance ``pos`` and load the character there (``None`` past the end)."""
        self.pos.advance(self.current_char)
        self.current_char = (self.text[self.pos.index]
                             if self.pos.index < len(self.text) else None)

    def make_tokens(self) -> List[Token]:
        """Tokenize the whole text.

        Text from '#' up to the next '\\n' or '\\r' is skipped as a comment.

        Returns:
            The tokens followed by an EOF token, or ``[]`` when '!' is not
            followed by '='.

        Raises:
            IllegalCharacterError: for a non-whitespace character that
                starts no known token.
        """
        # Characters that map directly onto a value-less token.
        single_char = {
            "+": TT_PLUS,
            "*": TT_MUL,
            "^": TT_POW,
            "/": TT_DIV,
            "(": TT_LPAREN,
            ")": TT_RPAREN,
            "[": TT_LBRACKET,
            "]": TT_RBRACKET,
            "{": TT_LCURLY,
            "}": TT_RCURLY,
            ";": TT_SEMICOLON,
            ",": TT_COMA,
            ":": TT_COLON,
        }
        # Characters whose scanner consumes one or two characters itself.
        multi_char = {
            "-": self.make_minus_or_arrow,
            "=": self.make_equals,
            "<": self.make_less_than,
            ">": self.make_greater_than,
        }
        comment = False
        tokens: List[Token] = []

        self.advance()
        while self.current_char is not None:
            char = self.current_char
            if comment:
                if char in "\n\r":
                    comment = False
                else:
                    self.advance()
                    continue

            # Scanners below advance past their own token, so each branch
            # must skip the advance() at the bottom of the loop.
            if char in multi_char:
                tokens.append(multi_char[char]())
                continue
            if char == "!":
                token, error = self.make_not_equals()
                if error or token is None:
                    # NOTE(review): the error object is discarded here and
                    # callers only see an empty list; confirm that is
                    # intended.
                    return []
                tokens.append(token)
                # make_not_equals() already consumed "!="; without this
                # continue the character after it would be skipped.
                continue
            # isdecimal() rather than isdigit(): isdigit() also accepts
            # characters such as superscripts that int()/float() reject.
            if char.isdecimal():
                tokens.append(self.make_number())
                continue
            if char == "_" or (char.isalnum() and not char.isdigit()):
                tokens.append(self.make_identifier())
                continue

            if char == "#":
                comment = True
            elif char in single_char:
                tokens.append(EmptyToken(single_char[char], self.pos, self.pos))
            elif char == "|":
                tokens.append(
                    StringToken(TT_KEYWORD, self.pos, self.pos,
                                KEYWORDS["MATCH_OR"]))
            elif not char.isspace():
                post_start = self.pos.copy()
                self.advance()
                raise IllegalCharacterError(post_start, self.pos, char)
            self.advance()
        tokens.append(EmptyToken(TT_EOF, self.pos, self.pos))
        return tokens

    def make_number(self) -> Token:
        """Consume a numeric literal: decimal digits with at most one '.'.

        A second '.' ends the literal and is left unconsumed. Returns a
        ``TT_INT`` token when no '.' was read, otherwise ``TT_FLOAT``.
        """
        num = ""
        dot_count = 0
        pos_start = self.pos.copy()

        while self.current_char is not None:
            if self.current_char == ".":
                if dot_count:
                    break
                dot_count = 1
            elif not self.current_char.isdecimal():
                break
            num += self.current_char
            self.advance()
        if dot_count == 0:
            return NumberToken(TT_INT, pos_start, self.pos, int(num))
        return NumberToken(TT_FLOAT, pos_start, self.pos, float(num))

    def make_identifier(self) -> Token:
        """Consume alphanumerics and '_' and return a keyword or identifier token.

        The word is a keyword when it is one of ``KEYWORDS.values()``.
        """
        id_str = ""
        pos_start = self.pos.copy()

        while self.current_char and (self.current_char.isalnum()
                                     or self.current_char == "_"):
            id_str += self.current_char
            self.advance()

        tok_type = TT_KEYWORD if id_str in KEYWORDS.values() else TT_IDENTIFIER
        return StringToken(tok_type, pos_start, self.pos, id_str)

    def make_not_equals(self) -> Tuple[Optional[Token], Optional[Error]]:
        """Consume '!' and require '=' after it.

        Returns ``(token, None)`` for "!=", or ``(None, error)`` when the
        character after '!' is not '='; in that case only '!' is consumed.
        """
        pos_start = self.pos.copy()
        self.advance()
        if self.current_char == "=":
            self.advance()
            return EmptyToken(TT_NE, pos_start, self.pos), None
        return (
            None,
            UnexpectedCharError(pos_start, self.pos,
                                'after "!" should be "="'),
        )

    def make_equals(self) -> Token:
        """Consume '=' or '==' and return a ``TT_EQUALS`` or ``TT_EE`` token."""
        token_type = TT_EQUALS
        pos_start = self.pos.copy()

        self.advance()
        if self.current_char == "=":
            self.advance()
            token_type = TT_EE

        return EmptyToken(token_type, pos_start, self.pos)

    def make_greater_than(self) -> Token:
        """Consume '>' or '>=' and return a ``TT_GT`` or ``TT_GTE`` token."""
        token_type = TT_GT
        pos_start = self.pos.copy()

        self.advance()
        if self.current_char == "=":
            self.advance()
            token_type = TT_GTE

        return EmptyToken(token_type, pos_start, self.pos)

    def make_less_than(self) -> Token:
        """Consume '<' or '<=' and return a ``TT_LT`` or ``TT_LTE`` token."""
        token_type = TT_LT
        pos_start = self.pos.copy()

        self.advance()
        if self.current_char == "=":
            self.advance()
            token_type = TT_LTE

        return EmptyToken(token_type, pos_start, self.pos)

    def make_minus_or_arrow(self) -> Token:
        """Consume '-' or '->' and return a ``TT_MINUS`` or ``TT_ARROW`` token."""
        token_type = TT_MINUS
        pos_start = self.pos.copy()

        self.advance()
        if self.current_char == ">":
            self.advance()
            token_type = TT_ARROW

        return EmptyToken(token_type, pos_start, self.pos)
示例#8
0
class Lexer:
    """Scan text into tokens: numbers, identifiers/keywords and operators."""

    def __init__(self, fn, text):
        self.fn = fn
        self.text = text
        self.pos = Position(-1, 0, -1, fn, text)
        self.current_char = None
        self.advance()

    def advance(self):
        """Advance ``pos`` and load the character there (``None`` past the end)."""
        self.pos.advance(self.current_char)
        self.current_char = self.text[self.pos.idx] if self.pos.idx < len(
            self.text) else None

    def make_tokens(self):
        """Tokenize the whole text.

        Returns:
            ``(tokens, None)`` with a trailing EOF token on success, or
            ``([], error)`` where ``error`` is the error instance describing
            the first illegal character or a '!' not followed by '='.
        """
        # Operators that are always exactly one character long.
        single_char = {
            '+': TT_PLUS,
            '*': TT_MUL,
            '/': TT_DIV,
            '^': TT_POWER,
            '(': TT_LPAREN,
            ')': TT_RPAREN,
            ',': TT_COMMA,
        }
        tokens = []

        while self.current_char is not None:
            if self.current_char in ' \t':
                self.advance()
            elif self.current_char in DIGITS:
                tokens.append(self.make_number())
            elif self.current_char in LETTERS:
                tokens.append(self.make_indentifier())
            elif self.current_char in single_char:
                tokens.append(
                    Token(single_char[self.current_char], pos_start=self.pos))
                self.advance()
            elif self.current_char == '-':
                # make_minus_or_arrow() consumes '-' / '->' itself; an extra
                # advance() here would drop the character that follows.
                tokens.append(self.make_minus_or_arrow())
            elif self.current_char == '!':
                tok, error = self.make_not_equals()
                if error:
                    # Return the error instance, not the Error class.
                    return [], error
                tokens.append(tok)
            elif self.current_char == '=':
                tokens.append(self.make_equals())
            elif self.current_char == '<':
                tokens.append(self.make_less_than())
            elif self.current_char == '>':
                tokens.append(self.make_greater_than())
            else:
                pos_start = self.pos.copy()
                char = self.current_char
                self.advance()
                return [], IllegalCharError(pos_start, self.pos,
                                            "'" + char + "'")

        tokens.append(Token(TT_EOF, pos_start=self.pos))
        return tokens, None

    # The method name keeps its original spelling so existing callers still
    # resolve it.
    def make_indentifier(self):
        """Consume letters, digits and '_' and return a keyword or identifier token."""
        id_str = ''
        pos_start = self.pos.copy()

        while self.current_char is not None and self.current_char in LETTERS_DIGITS + '_':
            id_str += self.current_char
            self.advance()

        tok_type = TT_KEYWORD if id_str in KEYWORDS else TT_IDENTIFIER

        return Token(tok_type, id_str, pos_start, self.pos)

    def make_minus_or_arrow(self):
        """Consume '-' or '->' and return a ``TT_MINUS`` or ``TT_ARROW`` token."""
        tok_type = TT_MINUS
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '>':
            self.advance()
            tok_type = TT_ARROW

        return Token(tok_type, pos_start=pos_start, pos_end=self.pos)

    def make_not_equals(self):
        """Consume '!' and require '=' after it.

        Returns ``(token, None)`` for '!=', or ``(None, ExpectedCharError)``
        when the character after '!' is not '='.
        """
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '=':
            self.advance()
            return Token(TT_NE, pos_start=pos_start, pos_end=self.pos), None

        return None, ExpectedCharError(pos_start, self.pos,
                                       "'=' expected after '!'")

    def make_equals(self):
        """Consume '=' or '==' and return a ``TT_EQ`` or ``TT_EE`` token."""
        tok_type = TT_EQ
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '=':
            self.advance()
            tok_type = TT_EE

        return Token(tok_type, pos_start=pos_start, pos_end=self.pos)

    def make_less_than(self):
        """Consume '<' or '<=' and return a ``TT_LT`` or ``TT_LTE`` token."""
        tok_type = TT_LT
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '=':
            self.advance()
            tok_type = TT_LTE

        return Token(tok_type, pos_start=pos_start, pos_end=self.pos)

    def make_greater_than(self):
        """Consume '>' or '>=' and return a ``TT_GT`` or ``TT_GTE`` token."""
        tok_type = TT_GT
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '=':
            self.advance()
            tok_type = TT_GTE

        return Token(tok_type, pos_start=pos_start, pos_end=self.pos)

    def make_number(self):
        """Consume a numeric literal: digits with at most one '.'.

        A second '.' ends the literal and is left unconsumed. Returns a
        ``TT_INT`` token when no '.' was read, otherwise ``TT_FLOAT``.
        """
        num_str = ''
        dot_count = 0
        pos_start = self.pos.copy()

        while self.current_char is not None and self.current_char in DIGITS + '.':
            if self.current_char == '.':
                if dot_count == 1:
                    break
                dot_count += 1
                num_str += '.'
            else:
                num_str += self.current_char
            self.advance()

        if dot_count == 0:
            return Token(TT_INT, int(num_str), pos_start, self.pos)
        else:
            return Token(TT_FLOAT, float(num_str), pos_start, self.pos)