import logging
import re

log = logging.getLogger(__name__)


def lex(text, token_definitions):
    tokens = []
    i = 0
    while i < len(text):
        regex_match = None  # regex match object for the current position
        for token_tag, regex_pattern in token_definitions:  # try each token definition in order
            regex_obj = re.compile(regex_pattern)  # compile the pattern into a regex object
            # try to match this token expression against the input starting at position i
            regex_match = regex_obj.match(text, i)
            if regex_match:
                lexeme = regex_match.group(0)  # the full matched text
                if token_tag is not None:  # a None tag marks lexemes to discard (whitespace, comments, etc.)
                    tokens.append((token_tag, lexeme))
                break  # found a token
        # if any token expression matched at i, advance i past the matched characters
        if regex_match:
            j = regex_match.end(0)
            if i == j:  # empty match: i did not advance, so break to avoid looping forever
                break
            i = j
        else:
            log.error('Lexer Error: Invalid symbol: ' + text[i])
            break
    return tokens
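# A minimal usage sketch for lex(). The token definitions below are
# illustrative assumptions, not the original grammar's lexemes: each entry
# pairs a tag with a regex, and a None tag marks lexemes to discard.
EXAMPLE_TOKEN_DEFINITIONS = [
    (None, r'\s+'),              # whitespace: matched but not emitted
    ('NUMBER', r'[0-9]+'),       # integer literals
    ('IDENT', r'[A-Za-z_]\w*'),  # identifiers
    ('PLUS', r'\+'),             # the '+' operator
]

# lex('x + 42', EXAMPLE_TOKEN_DEFINITIONS)
# -> [('IDENT', 'x'), ('PLUS', '+'), ('NUMBER', '42')]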
def construct(self):
    for symbol in self.grammer.keys():
        self[symbol] = {}  # allocate the rule map for this symbol
        # find the first and follow sets of the symbol
        firsts = self.first_set(symbol)
        follows = self.follow_set(symbol)
        for first in firsts:
            # add each rule whose own first set contains this first
            for rule in self.grammer[symbol]:  # for each rule in the symbol's rule list
                if first in self.first_set(rule[0]):  # the first matches the first of the rule
                    if first not in self[symbol]:  # the (symbol, first) entry has not been added yet
                        if first == self.EPSILON:
                            # for an epsilon first, enter the rule under each
                            # symbol of the parent's follow set instead
                            for follow in follows:
                                self[symbol][follow] = [first]  # store epsilon as a one-symbol rule (list)
                        else:
                            self[symbol][first] = rule
                    else:
                        log.error('TABLE: Duplicate Rules for ' + symbol + ',' + first)
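# For intuition, a toy grammar and the table shape construct() would produce
# for it (an assumed example; table.EPSILON and table.EOI are written as the
# strings 'EPSILON' and 'EOI' for readability). self.grammer maps each
# nonterminal to a list of rules, each rule a list of symbols:
toy_grammer = {
    'E': [['T', 'E1']],
    'E1': [['PLUS', 'T', 'E1'], ['EPSILON']],
    'T': [['NUMBER']],
}
# After construct(), the table (self) contains entries such as:
#
#   self['E']  == {'NUMBER': ['T', 'E1']}
#   self['E1'] == {'PLUS': ['PLUS', 'T', 'E1'],
#                  'EOI': ['EPSILON']}   # epsilon stored under FOLLOW(E1)
#   self['T']  == {'NUMBER': ['NUMBER']}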
def parse(self, text):
    tokens = lexer.lex(text, self.lexemes)
    if len(tokens) <= 0:
        log.error('No TOKENS')
        return None
    # append the end-of-input token to the end of the token queue
    tokens.append((self.table.EOI, None))
    # reject the input up front if it does not validate against the table
    if not self.validate(tokens):
        return None
    # begin by parsing from the start symbol
    root = self.parse_token([self.table.START, None], tokens)
    return root
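# End-to-end usage sketch. The surrounding Parser class and its constructor
# are assumptions about code outside this excerpt; only parse(),
# parse_token(), and validate() are shown here.
#
#   parser = Parser(grammer, lexemes)   # hypothetical constructor
#   tree = parser.parse('x + 42')       # lex, validate, then build the tree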
def parse_token(self, root, tokens):
    if len(tokens) <= 0:
        return root  # done, no tokens left
    # tokens[0] is the next token, with its symbol tag at tokens[0][0]
    # and its value at tokens[0][1]
    if root is not None:
        root_tag = root[0]
        next_token = tokens[0]
        next_tag = next_token[0]
        if root_tag == next_tag:
            # root's tag matches the token's tag: matched a terminal
            tokens.pop(0)  # consume it and move to the next token
            return next_token  # return the matched token
        elif root_tag in self.table and next_tag in self.table[root_tag]:
            # root is a nonterminal and the current token selects one of its rules
            value = []
            if self.table[root_tag][next_tag] is None:
                # no production rule in the table from the root symbol to the next token
                log.error('ERROR: No Rule for ROOT:' + str(root_tag))
                return None
            # otherwise, parse the remaining tokens for each symbol in the rule
            for rule_tag in self.table[root_tag][next_tag]:
                if rule_tag != self.table.EPSILON:
                    rule_token = self.parse_token([rule_tag, None], tokens)
                    # the parsed rule token must be a valid (tag, value) pair
                    if rule_token is not None and len(rule_token) > 1:
                        value.append(rule_token)
                    else:
                        # the rule symbol could not be parsed
                        log.error('Could not parse RULE:' + str(rule_tag) +
                                  '\n\tTOKENS: ' + str(tokens))
                        return None
                else:
                    value.append(None)
            root[1] = value  # update root's value to the parsed children
    return root
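# Shape of the value parse_token() builds (illustrative, using the toy
# grammar above): a matched terminal comes back as its (tag, lexeme) token,
# a nonterminal comes back as [tag, children], and an epsilon expansion
# contributes None. Parsing T -> NUMBER against [('NUMBER', '42'), ...]
# yields:
#
#   ['T', [('NUMBER', '42')]]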
def validate(self, tokens):
    # nonterminal tokens contain null values
    token_stack = [(self.table.START, None)]  # push the start symbol on the stack
    index = 0  # current token index
    valid = True
    # the token stack holds the symbols currently being parsed
    while len(token_stack) > 0 and valid:
        top_token = token_stack[-1]
        if top_token[0] == tokens[index][0]:
            # generated a match: pop the terminal and advance to the next token
            token_stack.pop()
            index += 1
        # look up a rule for the top symbol (nonterminal) and the token tag (tokens[index][0])
        elif top_token[0] in self.table and tokens[index][0] in self.table[top_token[0]]:
            # copy the rule, so popping rule symbols does not mutate the table's rule
            rule = []  # rule symbol list
            for symbol in self.table[top_token[0]][tokens[index][0]]:
                rule.append(symbol)  # add a single symbol
            # pop the nonterminal for both the epsilon and non-epsilon cases
            token_stack.pop()
            if rule[0] != self.table.EPSILON:  # replace it if the rule is not epsilon
                # push the rule's symbols onto the stack in reverse order
                while len(rule) > 0:
                    # append null-valued tokens that are used only for their tags
                    token_stack.append((rule[-1], None))
                    rule.pop()
        else:  # no rule for the symbol
            valid = False  # reject the input
    # end of the parse loop
    if valid:
        log.debug('VALIDATION: SUCCESS')
    else:
        log.error('VALIDATION: FAILED, No RULE in TABLE[top_token][next_token]')
        log.error('TOP TOKEN: ' + str(top_token))
        log.error('NEXT TOKEN: ' + str(tokens[index]) + '\n')
    return valid
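# Hand trace of validate() on the toy grammar above for the token queue
# [('NUMBER', '42'), ('EOI', None)], with START == 'E' (an illustrative
# walk-through; only the stack's tags are shown):
#
#   stack ['E']             token NUMBER -> expand E:  push E1, T
#   stack ['E1', 'T']       token NUMBER -> expand T:  push NUMBER
#   stack ['E1', 'NUMBER']  token NUMBER -> match:     pop, index += 1
#   stack ['E1']            token EOI    -> epsilon:   pop, push nothing
#   stack []                loop ends with valid == True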