def assign(self) -> Node:
    """
    Parse one statement: an assignment, or an orphan expression.

    The left member is obtained from let(), which also reports the let token
    when the statement starts with the let keyword. The token left pending by
    the expression then tells what kind of statement this is:

    1. '<-' (left assign): the left member must be a name (or a let node). It
       is assigned the expression that follows the operator.

    2. '->' (right assign): the expression already parsed is assigned to the
       name that follows the operator. The token after that name must be an
       end of line.

    3. No assign operator: the left member is the statement. A let keyword
       leading anything but a class is rejected.

    Returns:
        The statement node. If a let token was reported, the statement node
        is wrapped in a node built from that let token.

    Raises:
        LythSyntaxError: LEFT_MEMBER_IS_EXPRESSION when the target of '<-' is
                         neither a name nor a let node, GARBAGE_CHARACTERS
                         when the token after the name of '->' is not an end
                         of line, LET_ON_EXPRESSION when let leads an orphan
                         expression.
    """
    # There can be an optional let in assign statement.
    node, let = self.let()
    pending = self.token

    if pending is not None and pending == Symbol.LASSIGN:
        if node.name not in (NodeType.Name, NodeType.Let):
            raise LythSyntaxError(node.info, msg=LythError.LEFT_MEMBER_IS_EXPRESSION)

        # The operator is consumed before the right member is parsed.
        self.token = None
        node = Node(pending, node, self.expression())

    elif pending is not None and pending == Symbol.RASSIGN:
        # The operator is consumed so that name() fetches a fresh token.
        self.token = None
        node = Node(pending, self.name(), node)

        if next(self.lexer) != Symbol.EOL:
            raise LythSyntaxError(node.info, msg=LythError.GARBAGE_CHARACTERS)

    elif let and node.name != NodeType.Class:
        raise LythSyntaxError(let.info, msg=LythError.LET_ON_EXPRESSION)

    if let is None:
        return node

    return Node(let, node)
def expression(self, end: Symbol = Symbol.EOL) -> Node:
    """
    Looking for a line that could lead to an expression, that is, a series of
    operations.

    There should be one expression per line, or one expression per pair of
    parentheses. This is why this method is not a while loop.

    Once the operations are parsed, the pending token decides the outcome:

    1. An assign operator ('<-' or '->') is left pending and the node is
       returned as is, for the assign method to run.

    2. A name followed by a colon or by the 'be' keyword starts a class
       definition, which is delegated to classdef.

    3. Any other pending token must be the expected end token, otherwise
       trailing characters are reported. The pending token is then consumed.

    Args:
        end: The symbol expected to stop the expression. An expression
             started by an opening parenthesis is called with the closing
             parenthesis; the default is the end of a line.

    Returns:
        The node of the expression, or the node of the class definition.

    Raises:
        LythSyntaxError: GARBAGE_CHARACTERS when the pending token is neither
                         an assign operator nor the expected end token.
    """
    node = self.addition()

    # The assign operator stays in self.token on purpose: assign() reads it.
    if self.token in (Symbol.LASSIGN, Symbol.RASSIGN):
        return node

    if node.name == NodeType.Name and self.token in (Symbol.COLON, Keyword.BE):
        return self.classdef(node)

    if self.token is not None and self.token.symbol is not end:
        raise LythSyntaxError(node.info, msg=LythError.GARBAGE_CHARACTERS)

    self.token = None
    return node
def classdef(self, name: Node, end: Symbol = Symbol.EOL) -> Node:
    """
    Looking for a class definition.

    Two forms are handled: 'name:' and 'name be type:'. Once the colon is
    found, the block of statements that follows is fetched and attached to
    the class node.

    Args:
        name: The node of the class name, already parsed by the caller.
        end: Not used by this method; kept so that the signature stays
             compatible with existing callers.

    Returns:
        The class node, built from the name, the optional type node (None
        when there is no 'be' clause) and the statements of the block.

    Raises:
        LythSyntaxError: GARBAGE_CHARACTERS when the token expected to be a
                         colon is something else. The error points at that
                         offending token.
    """
    token = self.token or next(self.lexer)

    if token == Keyword.BE:
        # name() reads the pending token first, so the token following 'be'
        # is stored in self.token before the type name is requested.
        self.token = next(self.lexer)
        type_node = Node.typedef(self.name())
        token = next(self.lexer)
    else:
        type_node = None

    if token != Symbol.COLON:
        raise LythSyntaxError(token.info, msg=LythError.GARBAGE_CHARACTERS)

    # The colon is consumed before the block is parsed.
    self.token = None
    return Node.classdef(name, type_node, *self.block())
def __init__(self, lexeme: str, scan: Scanner, force_literal=False) -> None:
    """
    Instantiate a new Token from the first lexeme scanned.

    The symbol of the token is resolved in this order: a known symbol, a
    literal value if the lexeme is made of digits, a literal string if the
    lexeme is alphabetical or an underscore.

    Args:
        lexeme: The text starting the token.
        scan: The scanner, read for its filename, lineno, offset and line in
              order to record where the token was found.
        force_literal: Stored as the literal attribute of the token.

    Raises:
        LythSyntaxError: The character being scanned could not lead to a
                         token.
    """
    symbol = Symbol.as_value(lexeme)

    if symbol is None:
        if lexeme.isdigit():
            symbol = Literal.VALUE
        elif lexeme.isalpha() or lexeme == '_':
            symbol = Literal.STRING
        else:
            raise LythSyntaxError(scan, msg=LythError.INVALID_CHARACTER)

    self.symbol = symbol
    self.literal = force_literal
    self.info = TokenInfo(scan.filename, scan.lineno, scan.offset, scan.line)
    self.lexeme = lexeme

    # A token starting with a quote already counts one quote.
    if self.symbol is Symbol.QUOTE:
        self.quotes = 1
    else:
        self.quotes = 0
def name(self) -> Node:
    """
    Looking for a name token.

    The pending token is used if there is one, otherwise a new token is
    requested from the lexer. The pending token is not cleared by this
    method.

    Returns:
        A node built from the name token.

    Raises:
        LythSyntaxError: INCOMPLETE_LINE when the line or the source ends
                         where the name was expected, NAME_EXPECTED when the
                         token is anything but a literal string.
    """
    token = self.token or self.lexer()

    if token in (Symbol.EOF, Symbol.EOL):
        reason = LythError.INCOMPLETE_LINE
    elif token != Literal.STRING:
        reason = LythError.NAME_EXPECTED
    else:
        return Node(token)

    raise LythSyntaxError(token.info, msg=reason)
def literal(self) -> Node:
    """
    Looking for a literal token to make it a numeral, or a name.

    The pending token is used if there is one, otherwise a new token is
    requested from the lexer. In all cases the pending token is consumed
    before anything else happens.

    1. An opening parenthesis leads to a new expression, expected to stop on
       the closing parenthesis, rather than to a literal.

    2. A doc token leads to the docstring being processed, then to another
       attempt at finding a literal.

    3. A literal value or a literal string leads to a node.

    Returns:
        The node of the literal, or the node of the parenthesized expression.

    Raises:
        LythSyntaxError: INCOMPLETE_LINE when the line or the source ends
                         where the literal was expected, LITERAL_EXPECTED
                         when the token is of any other kind.
    """
    current = self.token or self.lexer()
    self.token = None

    if current in (Symbol.EOF, Symbol.EOL):
        raise LythSyntaxError(current.info, msg=LythError.INCOMPLETE_LINE)

    if current == Symbol.LPAREN:
        return self.expression(end=Symbol.RPAREN)

    if current == Symbol.DOC:
        self.docstring()
        return self.literal()

    if current in (Literal.VALUE, Literal.STRING):
        return Node(current)

    raise LythSyntaxError(current.info, msg=LythError.LITERAL_EXPECTED)
def block(self) -> List[Node]:
    """
    Processing a list of indented statements following a colon.

    The expected indent level is raised by one on entry. Each line is then
    expected to start with an indent token, whose lexeme is compared with the
    expected level:

    1. Empty lines are skipped.

    2. The end of file stops the block. The token is left pending.

    3. A lower indent stops the block. The expected level becomes the one of
       that indent, and the token is left pending.

    4. The same indent leads to a statement, parsed by assign.

    Returns:
        The list of statement nodes, in source order.

    Raises:
        LythSyntaxError: INCONSISTENT_INDENT when the line does not start
                         with an indent, or when the indent is deeper than
                         expected.
    """
    body = []
    self.indent += 1

    while True:
        head = self.token or self.lexer()

        if head == Symbol.EOL:
            self.token = None

        elif head == Symbol.EOF:
            self.token = head
            return body

        elif head != Symbol.INDENT:
            raise LythSyntaxError(head.info, msg=LythError.INCONSISTENT_INDENT)

        elif head.lexeme <= self.indent - 1:
            self.indent = head.lexeme
            self.token = head
            return body

        elif head.lexeme != self.indent:
            raise LythSyntaxError(head.info, msg=LythError.INCONSISTENT_INDENT)

        else:
            # self.indent is read again on every line: parsing a statement
            # may go through nested blocks, which update it.
            body.append(self.assign())
def visit_immutableassign(self, node, context: Context) -> None:
    """
    Record the name of an immutable assign into the symbol table.

    The left member is visited in a store context to get the name. The right
    member is visited in a load context, and only once it is known that the
    name is not in the symbol table for the current scope.

    Args:
        node: The assign node, read for its left, right and info attributes.
        context: Not used by this method.

    Raises:
        LythSyntaxError: REASSIGN_IMMUTABLE when the name is already present
                         in the symbol table for the current scope.
    """
    name = self.visit(node.left, Context.STORE)
    key = (name, self.scope)

    if self.table.get(key, None) is not None:
        raise LythSyntaxError(node.info, msg=LythError.REASSIGN_IMMUTABLE)

    self.table += Name(
        name, self.scope,
        SymbolType(Field.UNKNOWN, Field.IMMUTABLE, self.visit(node.right, Context.LOAD)))
def __call__(self) -> Token:
    """
    Finalizes the token.

    In some cases it is simpler to convert the string of the lexeme to the
    right type once the token is finished, rather than letting the analyzer
    do it.

    1. The lexeme of a literal value is converted to an integer.

    2. The lexeme of an indent, which is made of spaces, is replaced by half
       its length. The number of spaces must be even.

    Other tokens are left unchanged.

    Returns:
        This token.

    Raises:
        LythSyntaxError: UNEVEN_INDENT when the lexeme of an indent has an
                         odd length.
    """
    if self.symbol is Literal.VALUE:
        self.lexeme = int(self.lexeme)
        return self

    if self.symbol is Symbol.INDENT:
        width = len(self.lexeme)

        if width % 2:
            raise LythSyntaxError(self.info, msg=LythError.UNEVEN_INDENT)

        self.lexeme = width // 2

    return self
def visit_name(self, node: Node, context: Context) -> Union[str, int, Field]:
    """
    A variable requires its name, or what it refers to, to be returned.

    If the context is to store, the value of the node (the name) is returned
    without looking at the symbol table.

    In any other context, the name is looked up in the symbol table for the
    current scope, and the value of the type of the symbol is returned.

    Raises:
        LythSyntaxError: VARIABLE_REFERENCED_BEFORE_ASSIGNMENT when the name
                         is not in the symbol table for the current scope.
    """
    if context is Context.STORE:
        return node.value

    entry = self.table.get((node.value, self.scope), None)

    if entry is not None:
        return entry.type.value

    raise LythSyntaxError(
        node.info, msg=LythError.VARIABLE_REFERENCED_BEFORE_ASSIGNMENT)
def let(self) -> Tuple[Node, Optional[Token]]:
    """
    Is there any let keyword that wants to come out?

    The first token of the statement is fetched from the lexer, and three
    cases follow:

    1. Multiple statements let: 'let' followed by a colon. The colon must be
       followed by an end of line, then the block is parsed and wrapped in a
       node built from the let token.

    2. Single statement let: 'let' followed by anything else. That token is
       left pending and an expression is parsed.

    3. No let detected: the first token is left pending and an expression is
       parsed.

    Returns:
        The node parsed, and the let token in the second case only (None in
        the two other cases).

    Raises:
        LythSyntaxError: GARBAGE_CHARACTERS when 'let:' is not directly
                         followed by an end of line.
    """
    first = self.lexer()
    keyword = None

    if first == Keyword.LET:
        follower = self.lexer()

        if follower == Symbol.COLON:
            eol = self.lexer()

            if eol != Symbol.EOL:
                raise LythSyntaxError(eol.info, msg=LythError.GARBAGE_CHARACTERS)

            return Node(first, *self.block()), None

        keyword = first
        pending = follower

    else:
        pending = first

    # The token that is not a let keyword belongs to the expression.
    self.token = pending
    return self.expression(), keyword
def __add__(self, lexeme: str) -> Token:
    """
    Add a scanned character to an existing token.

    This method validates that the character appended keeps the integrity of
    the token, and changes the symbol of the token when the new lexeme calls
    for it. The checks are made in this order, the first that applies wins:

    0. A token flagged as literal accepts anything. A quote turns it into a
       quote token and increments its count of quotes, any other character
       turns it into a literal string. The character is appended.

    1. A space appended to an indent token is appended to its lexeme.

    2. If the current lexeme followed by the character is a symbol, the
       symbol is updated and the character appended ('<' then '-').

    3. A symbol appended to a literal is a missing space. Exceptions, such as
       '5!', are corrected by the lexer.

    4. A digit appended to a literal is appended to its lexeme.

    5. An alphanumerical character, or '_', appended to a literal string is
       appended to its lexeme. If the lexeme becomes a keyword, the symbol is
       changed to that keyword.

    6. An alphanumerical character, or '_', appended to a keyword is appended
       to its lexeme and the token is demoted back to a literal string.

    7. An alphanumerical character, or '_', appended to a symbol is a missing
       space. Exceptions, such as '-5', are corrected by the lexer.

    8. A quote appended to a quote increments the count of quotes; the lexeme
       is left unchanged. It is up to the lexer to count the quotes in order
       to build a docstring.

    Returns:
        This token.

    Raises:
        LythSyntaxError: MISSING_SPACE_BEFORE_OPERATOR (case 3),
                         MISSING_SPACE_AFTER_OPERATOR (case 7), SYNTAX_ERROR
                         when no case applies.
    """
    # 0. Forced literal
    if self.literal:
        if lexeme == '"':
            self.symbol = Symbol.QUOTE
            self.quotes += 1
        else:
            self.symbol = Literal.STRING

        self.lexeme += lexeme
        return self

    # 1. Growing indent
    if self.symbol is Symbol.INDENT and lexeme == ' ':
        self.lexeme += lexeme
        return self

    # 2. Longer symbol
    combined = Symbol.as_value(self.lexeme + lexeme)

    if combined is not None:
        self.symbol = combined
        self.lexeme += lexeme
        return self

    # 3. Symbol glued to a literal
    if Symbol.as_value(lexeme) is not None and self.symbol in Literal:
        raise LythSyntaxError(self.info, msg=LythError.MISSING_SPACE_BEFORE_OPERATOR)

    # 4. Digit appended to a literal
    if lexeme.isdigit() and self.symbol in Literal:
        self.lexeme += lexeme
        return self

    if lexeme.isalnum() or lexeme == '_':
        # 5. Growing name, possibly becoming a keyword
        if self.symbol is Literal.STRING:
            self.lexeme += lexeme
            self.symbol = Keyword.as_value(self.lexeme) or self.symbol
            return self

        # 6. Keyword demoted to a name
        if self.symbol in Keyword:
            self.lexeme += lexeme
            self.symbol = Literal.STRING
            return self

        # 7. Literal glued to a symbol
        if self.symbol in Symbol:
            raise LythSyntaxError(self.info, msg=LythError.MISSING_SPACE_AFTER_OPERATOR)

    # 8. One more quote
    if lexeme == '"' and self.symbol is Symbol.QUOTE:
        self.quotes += 1
        return self

    raise LythSyntaxError(self.info, msg=LythError.SYNTAX_ERROR)
def next(self) -> Token:
    """
    Get the next token in source being scanned.

    This method contains yield statements: calling it returns a generator
    that yields the tokens one after the other.
    NOTE(review): the return annotation says Token, not an iterator of
    tokens -- confirm what callers expect before changing it.

    This method assumes spaces as delimiters. Spaces in python comprise
    escape characters (feed line etc.) as well. It yields tokens upon space
    and successive spaces are ignored.

    There are multiple cases to consider here.

    1. A space is detected and a token is being built. The generator yields
       the token, effectively stopping its construction.

    2. A space is detected and an indent token is being built. The generator
       appends the space to this indent.

    3. If the space is a feed line character, the generator yields a new EOL
       token right after.

    4. If the space is in first column, this is the beginning of an indent
       and a corresponding token is instantiated.

    5. Other spaces following are ignored, looping through the while loop to
       retrieve another character (and so on)

    6. If it is not a space and it ends an indent, the generator yields first
       the indent.

    7. If it is not a space and no token is present, then we start creating
       one.

    8. If a colon is following directly another token, we stop building the
       token, return it, and generate a colon token.

    9. If it is not a space and a token is present, then we continue the
       construction of the current token.

    10. One quote leads to a quote token, two quotes lead to two quote
        tokens, three quotes lead to a doc token.

    When the end of file is reached:

    1. If the scanner reached the end of its source, and the last token is
       not an EOL located at the beginning of previous file, then the
       generator pedantically asks for an empty line.

    2. EOF is treated as an empty space, the generator yields the last token,
       effectively stopping its construction.

    3. The generator then adds an EOF token and leaves the while loop,
       causing the generator to raise StopIteration on future next() calls.

    Exceptions can be ignored:

    1. The scanner pedantically requires that symbols are separated with
       spaces. However '+5', '-5' and '5!' are examples of valid expressions.
       The generator yields current token and starts a new one.

    Raises:
        LythSyntaxError: TOO_MUCH_SPACE_BEFORE when a colon starts a new
                         token, MISSING_EMPTY_LINE when the source ends while
                         a token is pending, and any error raised while
                         building a token that is not one of the ignored
                         exceptions above.
    """
    # Token under construction, None between two tokens.
    token = None

    # Toggled each time a doc token is yielded; passed to every new token as
    # its force_literal argument.
    in_doc = False

    while True:
        try:
            char = self.scanner()

            if char.isspace():
                #
                # 1. A space is detected, and a token is being built.
                #
                if token is not None and token != Symbol.INDENT:
                    yield token()

                #
                # 2. A space is detected, and an indent token is being
                #    built.
                #
                elif token is not None and token == Symbol.INDENT:
                    token += ' '
                    continue

                #
                # 3. If the space is a feed line character, the generator
                #    inserts a new EOL token.
                #
                if char == '\n':
                    yield Token('\n', self.scanner, in_doc)

                # Whatever was being built has been yielded by now.
                token = None

                #
                # 4. If the space is in the first column, this is the
                #    beginning of an indent.
                #
                if self.scanner.offset == 0:
                    token = Token(' ', self.scanner, in_doc)
                    continue

                #
                # 5. Other spaces following are ignored.
                #
                continue

            #
            # 6. If it is not a space and it ends an indent, the generator
            #    returns the indent first
            #
            if token is not None and token == Symbol.INDENT:
                yield token()
                token = None

            #
            # 7. If it is not a space and no token is present, then we
            #    start defining a new token
            #
            if token is None:
                token = Token(char, self.scanner, in_doc)

                # A colon can only start a token if something (a space)
                # separated it from the previous token.
                if token == Symbol.COLON:
                    raise LythSyntaxError(
                        token.info, msg=LythError.TOO_MUCH_SPACE_BEFORE)

            #
            # 8. A colon token is following directly another token
            #
            elif token is not None and char == ':':
                # NOTE(review): this is the only place where the token is
                # yielded without being finalized by a call (token()) --
                # confirm whether this is intended.
                yield token
                token = Token(char, self.scanner, in_doc)

            #
            # 9. If it is not a space and a token is present, then we
            #    append the character to the token.
            #
            else:
                token += char

            # 10. One quote leads to a quote token, two quotes lead to two quote
            #     tokens, three quotes lead to a doc token.
            if token == Symbol.QUOTE and token.quotes == 3:
                yield Token('"""', self.scanner, in_doc)()
                in_doc = not in_doc
                token = None

        except StopIteration:
            # The scanner has no more characters to provide.
            # NOTE(review): this reads token.lineno, whereas the errors in
            # this method read the position from token.info -- confirm that
            # tokens expose lineno directly.
            if token is not None and (token.symbol is not Symbol.EOL or token.lineno != 0):
                raise LythSyntaxError(
                    token.info, msg=LythError.MISSING_EMPTY_LINE) from None

            # Token(None, ...) is presumably resolved as the EOF token --
            # verify against Symbol.as_value.
            yield Token(None, self.scanner, in_doc)
            break

        except LythSyntaxError as error:
            # Some missing spaces are tolerated: the current token is
            # yielded and a new token is started from the character that
            # caused the error. Anything else is raised again.
            if error.msg is LythError.MISSING_SPACE_AFTER_OPERATOR:
                if token is not None and token.symbol in (Symbol.ADD, Symbol.SUB, Symbol.LPAREN):
                    yield token()
                    token = Token(char, self.scanner, in_doc)
                    continue

            elif error.msg is LythError.MISSING_SPACE_BEFORE_OPERATOR:
                new_token = Token(char, self.scanner, in_doc)

                # Tolerated: a closing parenthesis after a literal, or an
                # opening parenthesis right after a literal string.
                if new_token.symbol is Symbol.RPAREN \
                        or token.symbol is Literal.STRING and new_token.symbol is Symbol.LPAREN:
                    yield token()
                    token = new_token
                    continue

            raise