def numeric_val(source_code, i, table, scanner_obj):
    """
    Processes numeric values in the source code

    Params
    ======
    source_code (str)         : The string containing simc source code
    i           (int)         : The current index in the source code
    table       (SymbolTable) : Symbol table constructed holding information about identifiers and constants
    scanner_obj (Scanner)     : Instance of Scanner class (its line_num attribute is read)

    Returns
    =======
    (Token) : The token generated for the numeric constant
    (int)   : Current position in source code (index of the first character not consumed)
    """

    start = i

    # Advance while is_digit accepts the character
    # (presumably is_digit also accepts ".", since the decimal point checks below rely on it)
    while is_digit(source_code[i]):
        i += 1

    # Take the whole literal in one slice instead of concatenating character by character
    numeric_constant = source_code[start:i]

    # If a numeric constant contains more than 1 decimal point (.) then that is invalid
    if numeric_constant.count(".") > 1:
        error(
            "Invalid numeric constant, cannot have more than one decimal point in a"
            " number!",
            scanner_obj.line_num,
        )

    # Number of digits after the first "." decides between float and double
    if "." in numeric_constant:
        fraction_len = len(numeric_constant.split(".")[1])
    else:
        fraction_len = 0

    # Determine type of numeric value: no fractional digits -> int,
    # up to 7 fractional digits -> float, anything longer -> double
    if fraction_len == 0:
        dtype = "int"
    elif fraction_len <= 7:
        dtype = "float"
    else:
        dtype = "double"

    # Make entry in symbol table
    symbol_id = table.entry(numeric_constant, dtype, "constant")

    # Return number token and current index in source code
    return Token("number", symbol_id, scanner_obj.line_num), i
def string_val(source_code, i, table, line_num, start_char='"'):
    """
    Processes string values in the source code

    Params
    ======
    source_code (string)      = The string containing simc source code
    i           (int)         = The current index in the source code (at the opening quote)
    table       (SymbolTable) = Symbol table constructed holding information about identifiers and constants
    line_num    (int)         = Line number
    start_char  (str) (Optional) = Quote character that opened the literal and must close it

    Returns
    =======
    Token, int: The token generated for the string constant and the index just past
                the closing quote
    """

    # Step over the opening quote
    cursor = i + 1
    collected = []

    # Gather characters until the matching closing quote shows up
    while True:
        current = source_code[cursor]
        if current == start_char:
            break

        # Reaching the "\0" character before the closing quote is reported as an error
        if current == "\0":
            error("Unterminated string!", line_num)

        collected.append(current)
        cursor += 1

    # Step over the closing quote so the caller resumes after the literal
    cursor += 1

    body = "".join(collected)

    # Literals longer than one character are strings, everything else is a char;
    # strings are stored in double quotes and chars in single quotes
    if len(body) > 1:
        dtype = "string"
        string_constant = '"' + body + '"'
    else:
        dtype = "char"
        string_constant = "'" + body + "'"

    # Make entry in symbol table
    symbol_id = table.entry(string_constant, dtype, "constant")

    # Return string token and current index in source code
    return Token("string", symbol_id, line_num), cursor
def string_val(source_code, i, table, scanner_obj, start_char='"'):
    """
    Processes string values in the source code

    Params
    ======
    source_code (str)         : The string containing simc source code
    i           (int)         : The current index in the source code (at the opening quote)
    table       (SymbolTable) : Symbol table constructed holding information about identifiers and constants
    scanner_obj (Scanner)     : Instance of Scanner class (its line_num attribute is read)
    start_char  (str) (Optional) : Quote character that opened the literal and must close it

    Returns
    =======
    (Token) : The token generated for the string constant
    (int)   : Current position in source code (just past the closing quote)
    """

    # Step over the opening "/' character
    cursor = i + 1
    collected = []

    # Gather characters until the matching closing quote shows up
    while True:
        current = source_code[cursor]
        if current == start_char:
            break

        # Reaching the "\0" character before the closing quote is reported as an error
        if current == "\0":
            error("Unterminated string!", scanner_obj.line_num)

        collected.append(current)
        cursor += 1

    # Step over the closing "/' character so the caller resumes after the literal
    cursor += 1

    # The constant is always stored wrapped in double quotes, whichever quote opened it
    string_constant = '"' + "".join(collected) + '"'

    # Make entry in symbol table
    symbol_id = table.entry(string_constant, "string", "constant")

    # Return string token and current index in source code
    return Token("string", symbol_id, scanner_obj.line_num), cursor
def check_if(given_type, should_be_types, msg, line_num):
    """
    Check if type matches what it should be otherwise throw an error and exit

    Params
    ======
    given_type      (string)      = Type of token to be checked
    should_be_types (string/list) = Type(s) to be compared with
    msg             (string)      = Error message to print in case some case fails
    line_num        (int)         = Line number

    Returns
    =======
    None: error(msg, line_num) is called when given_type is not an accepted type
    """

    # Wrap a single type name in a list; isinstance also covers str subclasses,
    # which would otherwise be searched as substrings by the membership test below
    if isinstance(should_be_types, str):
        should_be_types = [should_be_types]

    # If the given_type is not part of should_be_types then throw error and exit
    if given_type not in should_be_types:
        error(msg, line_num)
def assign_statement(tokens, i, table, func_ret_type):
    """
    Parse assignment statement

    Params
    ======
    tokens        (list)        = List of tokens
    i             (int)         = Current index in token (the assignment operator; the identifier is at i - 1)
    table         (SymbolTable) = Symbol table constructed holding information about identifiers and constants
    func_ret_type (dict)        = Passed through to expression() and returned as received from it

    Returns
    =======
    OpCode, int, dict: The opcode for the assign code, the index after parsing the
                       assign statement and func_ret_type as returned by expression()

    Grammar
    =======
    var_statement   -> var id [= expr]?
    expr            -> string | number | id | operator
    string          -> quote [a-zA-Z0-9`~!@#$%^&*()_-+={[]}:;,.?/|\\]+ quote
    quote           -> "
    number          -> [0-9]+
    id              -> [a-zA-Z_]?[a-zA-Z0-9_]*
    operator        -> + | - | * | /
    """

    # Count the "multiply" tokens directly before the identifier (pointer depth).
    # The scan is bounded at index 0 so it can never wrap around to the end of
    # the token list through negative indexing.
    count_ast = 0
    scan = i - 2
    while scan >= 0 and tokens[scan].type == "multiply":
        count_ast += 1
        scan -= 1
    is_ptr = count_ast > 0

    # Check if variable is declared or not
    value, var_type, _ = table.get_by_id(tokens[i - 1].val)
    if var_type == "var":
        error("Variable %s used before declaration" % value, tokens[i - 1].line_num)

    # Dictionary to convert tokens to their corresponding assignment types
    assignment_type = {
        "assignment": "=",
        "plus_equal": "+=",
        "minus_equal": "-=",
        "multiply_equal": "*=",
        "divide_equal": "/=",
        "modulus_equal": "%=",
    }

    # Check if assignment operator follows identifier name
    # (the accepted token types are exactly the keys of the mapping above)
    check_if(
        tokens[i].type,
        list(assignment_type),
        "Expected assignment operator after identifier",
        tokens[i].line_num,
    )

    # Convert the token to respective symbol
    converted_type = assignment_type[tokens[i].type]

    # Store the index of identifier
    id_idx = i - 1

    # Check if expression follows = in assign statement
    op_value, op_type, i, func_ret_type = expression(
        tokens,
        i + 1,
        table,
        "Required expression after assignment operator",
        expect_paren=False,
        func_ret_type=func_ret_type,
    )

    # Map datatype to appropriate datatype in C
    prec_to_type = {
        0: "string",
        1: "string",
        2: "char",
        3: "int",
        4: "float",
        5: "double",
    }

    # Opcode payload fields are joined with "---"
    op_value = converted_type + "---" + op_value

    # Modify datatype of the identifier
    table.symbol_table[tokens[id_idx].val][1] = prec_to_type[op_type]

    # Check if a pointer is being assigned
    if is_ptr:
        return (
            OpCode(
                "ptr_only_assign",
                table.symbol_table[tokens[id_idx].val][0] + "---" + op_value +
                "---" + str(count_ast),
                "",
            ),
            i,
            func_ret_type,
        )

    # Return the opcode and i (the token after assign statement)
    return (
        OpCode("assign",
               table.symbol_table[tokens[id_idx].val][0] + "---" + op_value, ""),
        i,
        func_ret_type,
    )
def var_statement(tokens, i, table, func_ret_type):
    """
    Parse variable declaration [/initialization] statement

    Params
    ======
    tokens        (list)        = List of tokens
    i             (int)         = Current index in token
    table         (SymbolTable) = Symbol table constructed holding information about identifiers and constants
    func_ret_type (string)      = Function return type

    Returns
    =======
    OpCode, int, func_ret_type: The opcode for the declaration (var_assign / ptr_assign /
                                var_no_assign / ptr_no_assign), the index after parsing the
                                var statement and func_ret_type

    Grammar
    =======
    var_statement   -> var id [= expr]?
    expr            -> string | number | id | operator
    string          -> quote [a-zA-Z0-9`~!@#$%^&*()_-+={[]}:;,.?/|\\]+ quote
    quote           -> "
    number          -> [0-9]+
    id              -> [a-zA-Z_]?[a-zA-Z0-9_]*
    operator        -> + | - | * | /
    """

    # check_ptr reports pointer-ness, pointer depth and the updated index
    is_ptr, count_ast, i = check_ptr(tokens, i)

    # An identifier has to follow the var keyword
    check_if(tokens[i].type, "id", "Expected id after var keyword",
             tokens[i].line_num)

    has_follower = i + 1 < len(tokens)

    # Declaration with initialization: var id = expr
    if has_follower and tokens[i + 1].type == "assignment":
        # Remember where the identifier is, expression() moves i forward
        ident_pos = i

        op_value, op_type, i, func_ret_type = expression(
            tokens,
            i + 2,
            table,
            "Required expression after assignment operator",
            expect_paren=False,
            func_ret_type=func_ret_type,
        )

        # Translate the expression's precedence code into the C datatype name
        c_type = {
            0: "string",
            1: "string",
            2: "char",
            3: "int",
            4: "float",
            5: "double",
        }[op_type]

        # Record the datatype on the identifier's symbol table entry
        table.symbol_table[tokens[ident_pos].val][1] = c_type

        payload = table.symbol_table[tokens[ident_pos].val][0] + "---" + op_value

        if is_ptr:
            opcode = OpCode("ptr_assign", payload + "---" + str(count_ast), c_type)
        else:
            opcode = OpCode("var_assign", payload, c_type)

        # Return the opcode and i (the token after var statement)
        return opcode, i, func_ret_type

    # Tokens that are not accepted directly after the declared identifier
    rejected_followers = (
        "plus_equal",
        "minus_equal",
        "divide_equal",
        "multiply_equal",
        "plus",
        "minus",
        "divide",
        "multiply",
        "modulus",
        "modulus_equal",
        "equal",
        "not_equal",
    )

    if has_follower and tokens[i + 1].type in rejected_followers:
        error("Invalid Syntax for declaration", tokens[i].line_num)
        # Nothing is produced for this case
        return None

    # Plain declaration: look the identifier up in the symbol table
    value, current_type, _ = table.get_by_id(tokens[i].val)

    # A type other than the placeholder means the name was declared before
    already_declared_types = (
        "declared",
        "int",
        "char",
        "float",
        "double",
        "string",
        "char *",
        "char*",
    )
    if current_type in already_declared_types:
        error("Variable %s already declared" % value, tokens[i].line_num)

    # Mark the identifier as declared
    table.symbol_table[tokens[i].val][1] = "declared"

    # Return the opcode and i+1 (the token after var statement)
    opcode_name = "ptr_no_assign" if is_ptr else "var_no_assign"
    return OpCode(opcode_name, value), i + 1, func_ret_type
def if_statement(tokens, i, table, func_ret_type):
    """
    Parse if statement

    Params
    ======
    tokens        (list)        = List of tokens
    i             (int)         = Current index in token (expected to be at the opening parenthesis)
    table         (SymbolTable) = Symbol table constructed holding information about identifiers and constants
    func_ret_type (dict)        = Passed through to expression() and returned as received from it

    Returns
    =======
    OpCode, int, dict: The opcode for the if code, the index of the token just before
                       the if body (the opening brace) and func_ret_type

    Grammar
    =======
    if_statement -> if(condition) { body }
    condition    -> expr
    expr         -> string | number | id | operator
    string       -> quote [a-zA-Z0-9`~!@#$%^&*()_-+={[]}:;,.?/|\\]+ quote
    quote        -> "
    number       -> [0-9]+
    id           -> [a-zA-Z_]?[a-zA-Z0-9_]*
    operator     -> + | - | * | /
    """

    # Check if ( follows if statement
    check_if(
        tokens[i].type,
        "left_paren",
        "Expected ( after if statement",
        tokens[i].line_num,
    )

    # check if expression follows ( in if statement
    op_value, _, i, func_ret_type = expression(
        tokens,
        i + 1,
        table,
        "Expected expression inside if statement",
        func_ret_type=func_ret_type,
    )

    # check if ) follows expression in if statement
    check_if(
        tokens[i - 1].type,
        "right_paren",
        "Expected ) after expression in if statement",
        tokens[i - 1].line_num,
    )

    # If \n follows ) then skip all the \n characters
    if tokens[i + 1].type == "newline":
        i += 1
        while tokens[i].type == "newline":
            i += 1
        # Step back so that tokens[i + 1] is the first non-newline token
        i -= 1

    # Check if { follows ) in if statement
    check_if(
        tokens[i + 1].type,
        "left_brace",
        "Expected { before if body",
        tokens[i + 1].line_num,
    )

    # Position just after the opening brace; the body is scanned from here
    i += 2
    ret_idx = i

    # Scan forward to the first right brace (or the end of the token list)
    while i < len(tokens) and tokens[i].type != "right_brace":
        i += 1

    # Running off the end means no } was found. Report it against the last
    # token, because tokens[i] does not exist at that point.
    if i >= len(tokens):
        error("Expected } after if body", tokens[-1].line_num)

    # op_value[:-1] drops the last character of the condition (presumably the
    # closing parenthesis collected by expression()); the returned index makes
    # the caller resume at the opening brace
    return OpCode("if", op_value[:-1]), ret_idx - 1, func_ret_type
def function_call_statement(tokens, i, table, func_ret_type):
    """
    Parse function calling statement

    Params
    ======
    tokens        (list)        = List of tokens
    i             (int)         = Current index in token (at the function name)
    table         (SymbolTable) = Symbol table constructed holding information about identifiers and constants
    func_ret_type (dict)        = Functions whose return type is not figured out yet

    Returns
    =======
    OpCode, int, dict: The opcode for the call, index after parsing function calling
                       statement and function return type

    Grammar
    =======
    function_call_statement -> id([actual_params,]*)
    actual_params           -> expr
    body                    -> statement
    expr                    -> string | number | id | operator
    string                  -> quote [a-zA-Z0-9`~!@#$%^&*()_-+={[]}:;,.?/|\\]+ quote
    quote                   -> "
    number                  -> [0-9]+
    id                      -> [a-zA-Z_]?[a-zA-Z0-9_]*
    operator                -> + | - | * | /
    """

    # Look the callee up in the symbol table
    func_name, _, metadata = table.get_by_id(tokens[i].val)

    # Formal params live in the typedata as <id>---<param 1>---...---<param n>;
    # [")"] is the marker used for "no parameters"
    if "---" in metadata:
        params = metadata.split("---")[1:]
    else:
        params = [")"]
    num_formal_params = 0 if params == [")"] else len(params)

    # Parse the actual params (i + 2 skips the function name and the opening parenthesis)
    op_value, op_type, i, func_ret_type = expression(
        tokens,
        i + 2,
        table,
        "",
        True,
        True,
        expect_paren=True,
        func_ret_type=func_ret_type,
    )

    # Split the comma separated actual params, an empty first item means none at all
    op_value_list = op_value.replace(" ", "").split(",")
    if not (len(op_value_list) > 0 and len(op_value_list[0]) > 0):
        op_value_list = []
    num_actual_params = 0 if op_value_list == [")"] else len(op_value_list)

    # Check if number of actual and formal parameters match
    if num_formal_params != num_actual_params:
        error(
            "Expected %d parameters but got %d parameters in function %s" %
            (num_formal_params, num_actual_params, func_name),
            tokens[i].line_num,
        )

    # Give every formal parameter the datatype of the matching actual parameter
    for position, formal in enumerate(params):
        # Marker for an empty parameter list
        if formal == ")":
            continue

        actual = op_value_list[position].replace(")", "")
        _, dtype, _ = table.get_by_id(table.get_by_symbol(actual))
        table.symbol_table[table.get_by_symbol(formal)][1] = dtype

    # Return type was left open at the return statement: evaluate the return
    # expression now (func_ret_type stores the token index where it begins)
    if func_name in func_ret_type:
        _, op_type, _, _ = expression(tokens, func_ret_type[func_name], table, "")

        # Map datatype to appropriate datatype in C
        prec_to_type = {
            0: "char*",
            1: "char*",
            2: "char",
            3: "int",
            4: "float",
            5: "double",
        }
        table.symbol_table[table.get_by_symbol(func_name)][1] = prec_to_type[op_type]
        del func_ret_type[func_name]

    # [:-1] drops the last character of the joined arguments
    call_payload = func_name + "---" + "&&&".join(op_value_list)[:-1]
    return OpCode("func_call", call_payload, ""), i + 1, func_ret_type
# Token types that may be part of an expression
_EXPRESSION_TOKEN_TYPES = frozenset([
    "number",
    "input",
    "string",
    "id",
    "plus",
    "minus",
    "multiply",
    "divide",
    "comma",
    "equal",
    "not_equal",
    "greater_than",
    "less_than",
    "greater_than_equal",
    "less_than_equal",
    "modulus",
    "increment",
    "decrement",
    "plus_equal",
    "minus_equal",
    "multiply_equal",
    "divide_equal",
    "modulus_equal",
    "and",
    "or",
    "left_paren",
    "exit",
    "right_paren",
    "newline",
    "call_end",
    "address_of",
    "right_shift",
    "left_shift",
])

# Text emitted for operator-like tokens
_WORD_TO_OP = {
    "plus": " + ",
    "minus": " - ",
    "multiply": " * ",
    "divide": " / ",
    "equal": " == ",
    "not_equal": " != ",
    "greater_than": " > ",
    "less_than": " < ",
    "greater_than_equal": " >= ",
    "less_than_equal": " <= ",
    "input": " scanf ",
    "modulus": " % ",
    "increment": " ++ ",
    "decrement": " -- ",
    "plus_equal": " += ",
    "minus_equal": " -= ",
    "multiply_equal": " *= ",
    "divide_equal": " /= ",
    "modulus_equal": " %= ",
    "and": " && ",
    "or": " || ",
    "comma": ",",
    "left_paren": "(",
    "right_paren": ")",
    "address_of": "&",
    "left_shift": " << ",
    "right_shift": " >> ",
}

# Mapping for precedence checking (double > float > int > char > char*)
_TYPE_TO_PREC = {"char*": 1, "char": 2, "int": 3, "float": 4, "double": 5}

# Format specifier used for each datatype inside f-strings
_TYPE_TO_FS = {
    "char": "%c",
    "string": "%s",
    "int": "%d",
    "float": "%f",
    "double": "%lf",
}


def expression(
    tokens,
    i,
    table,
    msg,
    accept_unkown=False,
    accept_empty_expression=False,
    expect_paren=True,
    func_ret_type=None,
):
    """
    Parse an expression from tokens

    Params
    ======
    tokens                  (list)        = List of tokens
    i                       (int)         = Current index in list of tokens
    table                   (SymbolTable) = Symbol table constructed holding information about identifiers and constants
    msg                     (string)      = Error message to print in case some case fails
    accept_unkown           (bool)        = Accept unknown type for variable or not
    accept_empty_expression (bool)        = Accept empty expression or not
    expect_paren            (bool)        = Expect parenthesis at the end
    func_ret_type           (dict)        = Functions whose return type is not figured out yet;
                                            a fresh dict is used when omitted

    Returns
    =======
    string, int, int, dict: The expression, precedence code of the expression's datatype
                            (-1 when unknown), the current index in tokens after parsing
                            and func_ret_type
    """

    # A new dict per call: a mutable default would be shared between calls
    if func_ret_type is None:
        func_ret_type = {}

    # Initial values
    op_value = ""
    op_type = -1

    # Loop until expression is not parsed completely
    while i < len(tokens) and tokens[i].type in _EXPRESSION_TOKEN_TYPES:
        token_type = tokens[i].type
        has_next = i + 1 < len(tokens)

        # Check for function call (an id as very last token cannot be one)
        if token_type == "id" and has_next and tokens[i + 1].type == "left_paren":
            fun_opcode, i, func_ret_type = function_call_statement(
                tokens, i, table, func_ret_type)

            # The opcode value has the form <name>---<arg>&&&<arg>...
            val = fun_opcode.val.split("---")
            params = val[1].split("&&&")
            op_value += val[0] + "(" + ", ".join(params) + ")"

            # Expression takes the return type recorded for the function
            op_type = _TYPE_TO_PREC[table.get_by_id(table.get_by_symbol(val[0]))[1]]

            # Compensate for the unconditional increment at the end of the loop
            i -= 1
        # If token is identifier or constant
        elif token_type in ("number", "string", "id"):
            # Fetch information from symbol table
            value, dtype, typedata = table.get_by_id(tokens[i].val)

            if dtype == "string":
                # If { in string then it is a f-string
                if "{" in value:
                    names = []
                    temp_var = ""
                    enter = False

                    # Collect the variable names
                    for char in value:
                        if char == "{":
                            enter = True
                        elif char == "}":
                            # temp_var still starts with the opening brace
                            names.append(temp_var[1:])
                            temp_var = ""
                            enter = False

                        if enter:
                            temp_var += char

                    # Determine the type of variables and append the name of variables at the end
                    for name in names:
                        _, var_type, _ = table.get_by_id(table.get_by_symbol(name))
                        if var_type == "var":
                            error("Unknown variable %s" % name, tokens[i].line_num)
                        value = value.replace(name, _TYPE_TO_FS[var_type])
                        value += ", " + name

                    # Replace all {} in string
                    value = value.replace("{", "").replace("}", "")

                op_value += value
                op_type = 0 if typedata == "constant" else 1
            elif dtype == "char":
                op_value += value
                op_type = 2
            elif dtype in ("int", "float", "double"):
                op_value += str(value)
                # Keep the highest precedence seen so far
                op_type = max(op_type, _TYPE_TO_PREC[dtype])
            elif dtype in ("var", "declared") and not accept_unkown:
                error("Cannot find the type of %s" % value, tokens[i].line_num)
            elif dtype == "var" and accept_unkown:
                op_value += str(value)
        elif token_type in ("newline", "call_end"):
            break
        else:
            # A ) directly followed by a newline or { closes the enclosing construct
            if (expect_paren and token_type == "right_paren" and has_next
                    and tokens[i + 1].type in ("newline", "left_brace")):
                break

            op_value += _WORD_TO_OP[token_type]

        i += 1

    # If expression is empty then throw an error
    if op_value == "" and not accept_empty_expression:
        # i may be one past the last token here, so clamp it for the line number
        error(msg, tokens[min(i, len(tokens) - 1)].line_num)

    # Check if statement is of type input
    if " scanf " in op_value:
        # Check if there exists a prompt message
        if '"' in op_value:
            i1 = op_value.index('"') + 1
            i2 = op_value.index('"', i1)

            # Extracting the prompt
            p_msg = op_value[i1:i2]

            # Checking if dtype is mentioned
            if "'" in op_value[i2 + 1:]:
                i1 = op_value.index("'", i2 + 1) + 1
                i2 = op_value.index("'", i1)
                dtype = op_value[i1:i2]
            else:
                # default dtype is string
                dtype = "s"
        else:
            p_msg = ""
            dtype = "s"

        dtype_to_prec = {"i": 3, "f": 4, "d": 5, "s": 1}
        op_value = str(p_msg) + "---" + str(dtype)
        op_type = dtype_to_prec[dtype]

    # Return the expression, type of expression, and current index in source codes
    return op_value, op_type, i, func_ret_type
def function_definition_statement(tokens, i, table, func_ret_type):
    """
    Parse function definition statement

    Params
    ======
    tokens        (list)        = List of tokens
    i             (int)         = Current index in token (expected to be at the function name)
    table         (SymbolTable) = Symbol table constructed holding information about identifiers and constants
    func_ret_type (dict)        = Passed through to expression() and returned as received from it

    Returns
    =======
    OpCode, int, string, dict: The opcode for the function declaration, the index of the
                               token just before the body (the opening brace), the name of
                               the function and func_ret_type

    Grammar
    =======
    function_definition_statement   -> fun id([formal_params,]*) { body }
    formal_params                   -> expr
    body                            -> statement
    expr                            -> string | number | id | operator
    string                          -> quote [a-zA-Z0-9`~!@#$%^&*()_-+={[]}:;,.?/|\\]+ quote
    quote                           -> "
    number                          -> [0-9]+
    id                              -> [a-zA-Z_]?[a-zA-Z0-9_]*
    operator                        -> + | - | * | /
    """

    # Check if identifier follows fun
    check_if(tokens[i].type, "id", "Expected function name", tokens[i].line_num)

    # Store the id of function name in symbol table
    func_idx = tokens[i].val

    # Get function name
    func_name, _, _ = table.get_by_id(func_idx)

    # Check if ( follows id in function
    check_if(
        tokens[i + 1].type,
        "left_paren",
        "Expected ( after function name",
        tokens[i + 1].line_num,
    )

    # Check if expression follows ( in function statement
    # (unknown types and an empty parameter list are both accepted)
    op_value, _, i, func_ret_type = expression(
        tokens, i + 2, table, "", True, True, func_ret_type=func_ret_type)

    # Formal parameter names without spaces and parentheses
    op_value_list = op_value.replace(" ", "").replace(")", "").split(",")

    # Check if ) follows expression in function
    check_if(
        tokens[i - 1].type,
        "right_paren",
        "Expected ) after function params list",
        tokens[i - 1].line_num,
    )

    # If \n follows ) then skip all the \n characters
    if tokens[i + 1].type == "newline":
        i += 1
        while tokens[i].type == "newline":
            i += 1
        # Step back so that tokens[i + 1] is the first non-newline token
        i -= 1

    # Check if { follows ) in function
    check_if(
        tokens[i + 1].type,
        "left_brace",
        "Expected { before function body",
        tokens[i + 1].line_num,
    )

    # Position just after the opening brace; the body is scanned from here
    i += 2
    ret_idx = i

    # Scan forward to the first right brace (or the end of the token list)
    while i < len(tokens) and tokens[i].type != "right_brace":
        i += 1

    # Running off the end means no } was found. Report it against the last
    # token, because tokens[i] does not exist at that point.
    if i >= len(tokens):
        error("Expected } after function body", tokens[-1].line_num)

    # Add the identifier types to function's typedata
    has_params = len(op_value_list) > 0 and len(op_value_list[0]) > 0
    table.symbol_table[func_idx][2] = (
        "function---" + "---".join(op_value_list) if has_params else "function")

    return (
        OpCode("func_decl", func_name + "---" + "&&&".join(op_value_list), ""),
        ret_idx - 1,
        func_name,
        func_ret_type,
    )
def parse(tokens, table):
    """
    Parse tokens and generate opcodes

    Walks the token list once and dispatches on the type of the current token.
    Each statement parser is handed the index after the keyword and returns the
    index where the walk continues, so the order of the branches and of the
    index updates matters.

    Params
    ======
    tokens (list)        = List of tokens
    table  (SymbolTable) = Symbol table, handed to every statement parser and updated
                           with function return types on return statements

    Returns
    =======
    list: The list of opcodes

    Grammar
    =======
    statement -> print_statement | var_statement | assign_statement | function_definition_statement
    """

    # List of opcodes
    op_codes = []

    # Current function's name
    func_name = ""

    # Do while started or not
    in_do = False

    # Count main functions
    main_fn_count = 0

    # Count if conditions
    if_count = 0

    # Brace count
    brace_count = 0

    # If function return type could not be figured out during return then do it while calling
    # (maps function name -> token index where its return expression begins)
    func_ret_type = {}

    # Loop through all the tokens
    i = 0
    while i <= len(tokens) - 1:
        # If token is of type print then generate print opcode
        if tokens[i].type == "print":
            print_opcode, i, func_ret_type = print_statement(
                tokens, i + 1, table, func_ret_type)
            op_codes.append(print_opcode)
        # If token is of type var then generate var opcode
        elif tokens[i].type == "var":
            var_opcode, i, func_ret_type = var_statement(
                tokens, i + 1, table, func_ret_type)
            op_codes.append(var_opcode)
        # If token is of type id then generate assign opcode
        elif tokens[i].type == "id":
            # If '(' follows id then it is function calling else variable assignment
            if tokens[i + 1].type == "left_paren":
                fun_opcode, i, func_ret_type = function_call_statement(
                    tokens, i, table, func_ret_type)
                op_codes.append(fun_opcode)
            # An increment/decrement directly after the id is a unary statement
            elif tokens[i + 1].type in ["increment", "decrement"]:
                unary_opcode, i, func_ret_type = unary_statement(
                    tokens, i, table, func_ret_type)
                op_codes.append(unary_opcode)
            else:
                # assign_statement receives the index after the identifier
                assign_opcode, i, func_ret_type = assign_statement(
                    tokens, i + 1, table, func_ret_type)
                op_codes.append(assign_opcode)
        # If token is of type fun then generate function opcode
        elif tokens[i].type == "fun":
            # func_name is remembered so a later return statement can set the return type
            fun_opcode, i, func_name, func_ret_type = function_definition_statement(
                tokens, i + 1, table, func_ret_type)
            op_codes.append(fun_opcode)
        # If token is of type left_brace then generate scope_begin opcode
        elif tokens[i].type == "left_brace":
            op_codes.append(OpCode("scope_begin", "", ""))
            brace_count += 1
            i += 1
        # If token is of type right_brace then generate scope_over opcode
        elif tokens[i].type == "right_brace":
            op_codes.append(OpCode("scope_over", "", ""))
            brace_count -= 1

            # More closing than opening braces so far
            if brace_count < 0:
                error(
                    "Closing brace doesn't match any previous opening brace",
                    tokens[i].line_num,
                )
            i += 1
        # If token is of type MAIN then generate MAIN opcode
        elif tokens[i].type == "MAIN":
            op_codes.append(OpCode("MAIN", "", ""))
            main_fn_count += 1

            # A second MAIN before the matching END_MAIN is an error
            if main_fn_count > 1:
                error("Presence of two MAIN in a single file", tokens[i].line_num)
            i += 1
        # If token is of type END_MAIN then generate END_MAIN opcode
        elif tokens[i].type == "END_MAIN":
            op_codes.append(OpCode("END_MAIN", "", ""))
            main_fn_count -= 1
            i += 1
        # If token is of type for then generate for code
        elif tokens[i].type == "for":
            for_opcode, i, func_ret_type = for_statement(
                tokens, i + 1, table, func_ret_type)
            op_codes.append(for_opcode)
        # If token is of type do then generate do_while code
        elif tokens[i].type == "do":
            check_if(
                tokens[i + 1].type,
                "left_brace",
                "Expected { after do statement",
                tokens[i + 1].line_num,
            )

            # Remember that the next while belongs to this do
            in_do = True
            op_codes.append(OpCode("do", "", ""))
            i += 1
        # If token is of type while then generate while opcode
        elif tokens[i].type == "while":
            while_opcode, i, func_ret_type = while_statement(
                tokens, i + 1, table, in_do, func_ret_type)

            # The while that closes a do-while resets the flag
            if in_do:
                in_do = False
            op_codes.append(while_opcode)
        # If token is of type if then generate if opcode
        elif tokens[i].type == "if":
            if_opcode, i, func_ret_type = if_statement(tokens, i + 1, table,
                                                       func_ret_type)
            op_codes.append(if_opcode)

            # Increment if count on encountering if
            if_count += 1
        # If token is of type exit then generate exit opcode
        elif tokens[i].type == "exit":
            exit_opcode, i, func_ret_type = exit_statement(
                tokens, i + 1, table, func_ret_type)
            op_codes.append(exit_opcode)
        # If token is of type else then check whether it is else if or else
        elif tokens[i].type == "else":
            # If the next token is if, then it is else if
            if tokens[i + 1].type == "if":
                # i + 2 skips both the else and the if token
                if_opcode, i, func_ret_type = if_statement(
                    tokens, i + 2, table, func_ret_type)
                if_opcode.type = "else_if"
                op_codes.append(if_opcode)
            # Otherwise it is else
            # NOTE(review): when neither `if` nor `{` follows `else`, no branch here
            # appears to advance i - confirm this cannot happen for valid input
            elif tokens[i + 1].type == "left_brace":
                op_codes.append(OpCode("else", "", ""))

                # Decrement if count on encountering else, to make sure there aren't extra else conditions
                if_count -= 1

                # If if_count is negative then the current else is extra
                if if_count < 0:
                    error("Else does not match any if!", tokens[i].line_num)

                i += 1
        # If token is of type return then generate return opcode
        elif tokens[i].type == "return":
            # Index where the returned expression begins
            beg_idx = i + 1

            # Nothing that can start a value follows: treat as returning nothing (6 maps to void below)
            if tokens[i + 1].type not in ["id", "number", "string"]:
                op_value = ""
                op_type = 6
                i += 2
            else:
                op_value, op_type, i, func_ret_type = expression(
                    tokens,
                    i + 1,
                    table,
                    "Expected expression after return",
                    True,
                    True,
                    expect_paren=False,
                    func_ret_type=func_ret_type,
                )
            if func_name == "":
                error("Return statement outside any function", tokens[i].line_num)
            else:
                # Map datatype to appropriate datatype in C
                prec_to_type = {
                    -1: "not_known",
                    0: "char*",
                    1: "char*",
                    2: "char",
                    3: "int",
                    4: "float",
                    5: "double",
                    6: "void",
                }

                # Type unknown for now: remember where the expression starts so it
                # can be evaluated again when the function is called
                if op_type == -1:
                    func_ret_type[func_name] = beg_idx

                # Change return type of function
                table.symbol_table[table.get_by_symbol(
                    func_name)][1] = prec_to_type[op_type]

                # Set func_name to an empty string after processing
                func_name = ""
            op_codes.append(OpCode("return", op_value, ""))
        # If token is of type break then generate break opcode
        elif tokens[i].type == "break":
            op_codes.append(OpCode("break", "", ""))
            i += 1
        # If token is of type continue then generate continue opcode
        elif tokens[i].type == "continue":
            op_codes.append(OpCode("continue", "", ""))
            i += 1
        # If token is of type single_line_comment then generate single_line_comment opcode
        elif tokens[i].type == "single_line_comment":
            op_codes.append(OpCode("single_line_comment", tokens[i].val, ""))
            i += 1
        # If token is of type multi_line_comment then generate multi_line_comment opcode
        elif tokens[i].type == "multi_line_comment":
            op_codes.append(OpCode("multi_line_comment", tokens[i].val, ""))
            i += 1
        # If token is of type switch then generate switch opcode
        elif tokens[i].type == "switch":
            switch_opcode, i, func_ret_type = switch_statement(
                tokens, i + 1, table, func_ret_type)
            op_codes.append(switch_opcode)
        # If token is of type case then generate case opcode
        elif tokens[i].type == "case":
            case_opcode, i, func_ret_type = case_statement(
                tokens, i + 1, table, func_ret_type)
            op_codes.append(case_opcode)
        # If token is of type default then generate default opcode
        elif tokens[i].type == "default":
            check_if(
                tokens[i + 1].type,
                "colon",
                "Expected : after default statement in switch",
                tokens[i + 1].line_num,
            )
            op_codes.append(OpCode("default", "", ""))

            # Skip the default keyword and the colon
            i += 2
        # If token is the type increment or decrement then generate unary_opcode
        elif tokens[i].type in ["increment", "decrement"]:
            unary_opcode, i, func_ret_type = unary_statement(
                tokens, i, table, func_ret_type)
            op_codes.append(unary_opcode)
        # Otherwise increment the index
        else:
            i += 1

    # Errors that may occur after parsing loop
    # (every MAIN has to be balanced by an END_MAIN)
    if main_fn_count != 0:
        error("MAIN not ended with END_MAIN", tokens[i - 1].line_num + 1)

    # Return opcodes
    return op_codes
def lexical_analyze(filename, table):
    """
    Generate tokens from source code

    Params
    ======
    filename (string)      = The string containing simc source code filename
    table    (SymbolTable) = Symbol table constructed holding information about identifiers and constants

    Returns
    ========
    list: A list of tokens of the source code, if the code is lexically correct, otherwise
          presents user with an error
    """

    # Line number; set up first because the extension check below reports it
    line_num = 1

    # Check if file extension is .simc or not
    if "." not in filename or filename.split(".")[-1] != "simc":
        error("Incorrect file extension", line_num)

    # Read the entire source code as a string; the context manager closes the file
    with open(filename, "r") as source_file:
        source_code = source_file.read()

    # "\0" marks the end of the input for the scanning loops
    source_code += "\0"

    # List of tokens
    tokens = []

    # Parantheses checker for detecting function call
    parantheses_count = 0

    # Loop through the source code character by character
    i = 0
    while source_code[i] != "\0":
        char = source_code[i]

        # Always in range: char is not the end marker, so something follows it
        next_char = source_code[i + 1]

        # NOTE(review): the numeric_val/string_val definitions visible in this file read
        # `.line_num` from their fourth argument, while an int is passed below -
        # confirm which helper versions are meant to be used here

        # If a digit appears, call numeric_val function and add the numeric token to list,
        # if it was correct
        if is_digit(char):
            token, i = numeric_val(source_code, i, table, line_num)
            tokens.append(token)

        # If double quote appears the value is a string token
        elif char == '"':
            token, i = string_val(source_code, i, table, line_num)
            tokens.append(token)

        # If single quote appears the value is a string token
        elif char == "'":
            token, i = string_val(source_code, i, table, line_num, start_char="'")
            tokens.append(token)

        # If alphabet or number appears then it might be either a keyword or an identifier
        elif is_alnum(char):
            token, i = keyword_identifier(source_code, i, table, line_num)
            tokens.append(token)

        # Identifying left paren token
        elif char == "(":
            # A ( right after an identifier opens a function call; nested parens are
            # counted too. `tokens` may still be empty when ( is the first character.
            if (tokens and tokens[-1].type == "id") or parantheses_count > 0:
                parantheses_count += 1
            tokens.append(Token("left_paren", "", line_num))
            i += 1

        # Identifying right paren token
        elif char == ")":
            if parantheses_count > 0:
                parantheses_count -= 1
            tokens.append(Token("right_paren", "", line_num))

            # No open call parentheses left
            if parantheses_count == 0:
                tokens.append(Token("call_end", "", line_num))
            i += 1

        # Identifying left brace token
        elif char == "{":
            tokens.append(Token("left_brace", "", line_num))
            i += 1

        # Identifying right brace token
        elif char == "}":
            tokens.append(Token("right_brace", "", line_num))
            i += 1

        # Identifying newline token
        elif char == "\n":
            tokens.append(Token("newline", "", line_num))
            line_num += 1
            i += 1

        # Identifying assignment token or equivalence token
        elif char == "=":
            if next_char != "=":
                tokens.append(Token("assignment", "", line_num))
                i += 1
            else:
                tokens.append(Token("equal", "", line_num))
                i += 2

        # Identifying plus_equal, increment or plus token
        elif char == "+":
            if next_char == "=":
                tokens.append(Token("plus_equal", "", line_num))
                i += 2
            elif next_char == "+":
                tokens.append(Token("increment", "", line_num))
                i += 2
            else:
                tokens.append(Token("plus", "", line_num))
                i += 1

        # Identifying minus_equal, decrement or minus token
        elif char == "-":
            if next_char == "=":
                tokens.append(Token("minus_equal", "", line_num))
                i += 2
            elif next_char == "-":
                tokens.append(Token("decrement", "", line_num))
                i += 2
            else:
                tokens.append(Token("minus", "", line_num))
                i += 1

        # Identifying multiply_equal or multiply token
        elif char == "*":
            if next_char == "=":
                tokens.append(Token("multiply_equal", "", line_num))
                i += 2
            else:
                tokens.append(Token("multiply", "", line_num))
                i += 1

        # Identifying 'address of' token
        elif char == "&":
            tokens.append(Token("address_of", "", line_num))
            i += 1

        # Identifying divide_equal or divide token
        elif char == "/":
            if next_char == "=":
                tokens.append(Token("divide_equal", "", line_num))
                i += 2
            # to check if it is a single line comment
            elif next_char == "/":
                i += 2
                comment_start = i

                # Stop at the end of the line, or at the end of the input when
                # the last line has no trailing newline
                while source_code[i] not in ("\n", "\0"):
                    i += 1
                tokens.append(
                    Token("single_line_comment", source_code[comment_start:i],
                          line_num))
            # to check if it is a multi line comment
            elif next_char == "*":
                comment_line = line_num
                i += 2
                comment_start = i

                # Consume everything up to the closing "*/" (or the end of input)
                while source_code[i] != "\0" and not (
                        source_code[i] == "*" and source_code[i + 1] == "/"):
                    # Keep line numbers of the following tokens correct
                    if source_code[i] == "\n":
                        line_num += 1
                    i += 1
                comment_str = source_code[comment_start:i]

                if source_code[i] == "\0":
                    error("Unterminated multi line comment", comment_line)
                else:
                    # Skip the closing "*/" so it is not tokenized as multiply and divide
                    i += 2
                tokens.append(
                    Token("multi_line_comment", comment_str, comment_line))
            else:
                tokens.append(Token("divide", "", line_num))
                i += 1

        # Identifying modulus_equal or modulus token
        elif char == "%":
            if next_char == "=":
                tokens.append(Token("modulus_equal", "", line_num))
                i += 2
            else:
                tokens.append(Token("modulus", "", line_num))
                i += 1

        # Identifying comma token
        elif char == ",":
            tokens.append(Token("comma", "", line_num))
            i += 1

        # Identifying not_equal token
        elif char == "!" and next_char == "=":
            tokens.append(Token("not_equal", "", line_num))
            i += 2

        # Identifying greater_than, greater_than_equal or right_shift token
        elif char == ">":
            if next_char == "=":
                tokens.append(Token("greater_than_equal", "", line_num))
                i += 2
            elif next_char == ">":
                tokens.append(Token("right_shift", "", line_num))
                i += 2
            else:
                tokens.append(Token("greater_than", "", line_num))
                i += 1

        # Identifying less_than, less_than_equal or left_shift token
        elif char == "<":
            if next_char == "=":
                tokens.append(Token("less_than_equal", "", line_num))
                i += 2
            elif next_char == "<":
                tokens.append(Token("left_shift", "", line_num))
                i += 2
            else:
                tokens.append(Token("less_than", "", line_num))
                i += 1

        # Identifiying colon token
        elif char == ":":
            tokens.append(Token("colon", "", line_num))
            i += 1

        # Otherwise increment the index
        else:
            i += 1

    # Return the generated tokens
    return tokens
def keyword_identifier(source_code, i, table, line_num):
    """
    Process keywords and identifiers in source code

    Params
    ======
    source_code (string)      = The string containing simc source code
    i           (int)         = The current index in the source code
    table       (SymbolTable) = Symbol table constructed holding information about identifiers and constants
    line_num    (int)         = Line number

    Returns
    =======
    Token, int: The token generated for the keyword or identifier and the current position
                in source code
    """

    # Consume characters for as long as is_alnum accepts them
    begin = i
    while is_alnum(source_code[i]):
        i += 1
    value = source_code[begin:i]

    # Keywords become a token whose type is the keyword itself
    if is_keyword(value):
        return Token(value, "", line_num), i

    # Look the identifier up in the symbol table
    symbol_id = table.get_by_symbol(value)

    # Names reserved by C cannot be used as identifiers
    reserved_in_c = (
        "break",
        "else",
        "long",
        "switch",
        "case",
        "enum",
        "register",
        "typedef",
        "char",
        "extern",
        "return",
        "union",
        "const",
        "float",
        "short",
        "unsigned",
        "continue",
        "for",
        "signed",
        "void",
        "default",
        "goto",
        "sizeof",
        "volatile",
        "do",
        "if",
        "static",
        "while",
    )
    if value in reserved_in_c:
        error("A keyword cannot be an identifier - %s" % value, line_num)

    # -1 means the identifier is not in the table yet: add it with placeholder datatype var
    if symbol_id == -1:
        symbol_id = table.entry(value, "var", "variable")

    # Return id token and current index in source code
    return Token("id", symbol_id, line_num), i