def parse_model(model):
    """Parse ``model.formula`` and return the operand ranges found in it.

    A fresh ``ExcelParser`` parses the formula, its pretty-printed token
    dump is written to stdout, and the parser's operand ranges are returned.

    :param model: Object exposing the Excel formula text as ``model.formula``.
    :return: The result of ``ExcelParser.getOperandRanges()`` for the formula.
    """
    excel_parser = ExcelParser()
    # The value returned by parse() is not used; later calls query the parser.
    excel_parser.parse(model.formula)
    print(excel_parser.prettyprint())
    return excel_parser.getOperandRanges()
def shunting_yard(expression, named_ranges, ref=None, tokenize_range=False):
    """
    Convert an Excel formula into a list of nodes in reverse polish notation.

    The core algorithm is the shunting-yard algorithm as described on
    wikipedia, with the variable-argument-count extension from
    http://www.kallisti.net.nz/blog/2008/02/extension-to-the-shunting-yard-algorithm-to-allow-variable-numbers-of-arguments-to-functions/

    ``ref`` is the address of the cell holding the formula. It is passed to
    ``create_node`` for every token so it can be written during
    OperatorNode.emit()
    (https://github.com/iOiurson/koala/blob/master/koala/ast/graph.py#L292).
    This is needed because Excel applies basic range operations
    (+, -, * ...) on matching cells.

    Example: cell C2 holds the formula 'A1:A3 + B1:B3'. The output is
    actually A2 + B2, because the formula is relative to cell C2.

    :param expression: Formula text; a single leading '=' is dropped.
    :param named_ranges: Container of names; an operand of subtype 'range'
        whose value is found in it is re-tagged as 'named_range'.
    :param ref: Cell address forwarded to ``create_node``.
    :param tokenize_range: Forwarded to ``ExcelParser``. When falsy, tokens
        around ':' are merged back into single 'pointer' operands.
    :return: List of the nodes built by ``create_node``, in RPN order.
    :raises Exception: "Mismatched or misplaced parentheses" when the
        parentheses / argument separators do not balance.
    """
    # Drop the leading '=' of a cell formula.
    formula = expression[1:] if expression.startswith('=') else expression

    parser = ExcelParser(tokenize_range=tokenize_range)
    parser.parse(formula)

    # Make '(' and ')' explicit tokens, which keeps the main loop simple:
    # a function start is followed by an 'arglist' '(' token, a function
    # stop is replaced by an 'arglist' ')' token.
    tokens = []
    for tok in parser.tokens.items:
        kind, sub = tok.ttype, tok.tsubtype
        if kind == "function":
            if sub == "start":
                tok.tsubtype = ""
                tokens.append(tok)
                tokens.append(f_token('(', 'arglist', 'start'))
                continue
            if sub == "stop":
                tokens.append(f_token(')', 'arglist', 'stop'))
                continue
        elif kind == "subexpression":
            if sub == "start":
                tok.tvalue = '('
            elif sub == "stop":
                tok.tvalue = ')'
        elif kind == "operand" and sub == "range" and tok.tvalue in named_ranges:
            tok.tsubtype = "named_range"
        tokens.append(tok)

    # http://office.microsoft.com/en-us/excel-help/calculation-operators-and-precedence-HP010078886.aspx
    # (lookup key, operator symbol, precedence); all are left-associative.
    operator_table = (
        (':', ':', 8),
        ('', ' ', 8),
        (',', ',', 8),
        ('u-', 'u-', 7),  # unary negation
        ('%', '%', 6),
        ('^', '^', 5),
        ('*', '*', 4),
        ('/', '/', 4),
        ('+', '+', 3),
        ('-', '-', 3),
        ('&', '&', 2),
        ('=', '=', 1),
        ('<', '<', 1),
        ('>', '>', 1),
        ('<=', '<=', 1),
        ('>=', '>=', 1),
        ('<>', '<>', 1),
    )
    operators = {
        key: Operator(symbol, level, 'left')
        for key, symbol, level in operator_table
    }

    def operator_for(op_token):
        # A prefix '-' is unary negation and lives under its own key.
        if op_token.ttype.endswith('-prefix') and op_token.tvalue == "-":
            return operators['u-']
        return operators[op_token.tvalue]

    output = collections.deque()
    stack = []
    were_values = []
    arg_count = []

    merged = []

    # Reconstruct expressions containing ':' and replace the tokens they
    # were split into by one 'pointer' operand holding the whole text.
    # NOTE: `tokens` is deliberately shrunk while it is being enumerated so
    # that the tokens swallowed by a reconstructed expression are skipped.
    if not tokenize_range:
        for index, token in enumerate(tokens):
            merged.append(token)

            value = token.tvalue
            if type(value) is not str:
                continue

            if value.startswith(':'):  # example -> :OFFSET( or simply :A10
                depth = 0
                expr = ''

                backwards = reversed(tokens[:index])
                for prev in backwards:
                    # going backwards, 'stop' starts, 'start' stops
                    if prev.tsubtype == 'stop':
                        depth += 1
                    elif depth > 0 and prev.tsubtype == 'start':
                        depth -= 1

                    expr = prev.tvalue + expr
                    merged.pop()

                    if depth == 0:
                        # these 2 pops are needed to remove INDEX()
                        merged.pop()
                        merged.pop()
                        expr = next(backwards).tvalue + expr
                        break

                expr += value

                depth = 0
                if value[1:] in ('OFFSET', 'INDEX'):
                    for following in tokens[(index + 1):]:
                        if following.tsubtype == 'start':
                            depth += 1
                        elif depth > 0 and following.tsubtype == 'stop':
                            depth -= 1

                        expr += following.tvalue
                        tokens.remove(following)

                        if depth == 0:
                            break

                merged.append(f_token(expr, 'operand', 'pointer'))

            elif ':OFFSET' in value or ':INDEX' in value:  # example -> A1:OFFSET(
                depth = 0
                expr = value

                for following in tokens[(index + 1):]:
                    if following.tsubtype == 'start':
                        depth += 1
                    elif following.tsubtype == 'stop':
                        depth -= 1

                    expr += following.tvalue
                    tokens.remove(following)

                    if depth == 0:
                        merged.pop()
                        break

                merged.append(f_token(expr, 'operand', 'pointer'))

    if merged:
        tokens = merged

    for tok in tokens:
        if tok.ttype == "operand":
            output.append(create_node(tok, ref))
            if were_values:
                # the enclosing function call has seen a value
                were_values[-1] = True

        elif tok.ttype == "function":
            stack.append(tok)
            arg_count.append(0)
            if were_values:
                were_values[-1] = True
            were_values.append(False)

        elif tok.ttype == "argument":
            # flush pending operators back to the opening parenthesis
            while stack and stack[-1].tsubtype != "start":
                output.append(create_node(stack.pop(), ref))

            if were_values.pop():
                arg_count[-1] += 1
            were_values.append(False)

            if not stack:
                raise Exception("Mismatched or misplaced parentheses")

        elif tok.ttype.startswith('operator'):
            incoming = operator_for(tok)

            while stack and stack[-1].ttype.startswith('operator'):
                on_stack = operator_for(stack[-1])
                pops_first = (
                    (incoming.associativity == "left"
                     and incoming.precedence <= on_stack.precedence)
                    or (incoming.associativity == "right"
                        and incoming.precedence < on_stack.precedence)
                )
                if not pops_first:
                    break
                output.append(create_node(stack.pop(), ref))

            stack.append(tok)

        elif tok.tsubtype == "start":
            stack.append(tok)

        elif tok.tsubtype == "stop":
            while stack and stack[-1].tsubtype != "start":
                output.append(create_node(stack.pop(), ref))

            if not stack:
                raise Exception("Mismatched or misplaced parentheses")

            stack.pop()  # discard the matching 'start'

            if stack and stack[-1].ttype == "function":
                func_node = create_node(stack.pop(), ref)
                n_args = arg_count.pop()
                if were_values.pop():
                    # the last argument slot held a value
                    n_args += 1
                func_node.num_args = n_args
                output.append(func_node)

    # Whatever is left on the stack must be operators.
    while stack:
        if stack[-1].tsubtype in ("start", "stop"):
            raise Exception("Mismatched or misplaced parentheses")
        output.append(create_node(stack.pop(), ref))

    # convert to list
    return list(output)
def shunting_yard(expression, named_ranges, ref=None, tokenize_range=False):
    """
    Tokenize an excel formula expression into reverse polish notation

    Core algorithm taken from wikipedia with varargs extensions from
    http://www.kallisti.net.nz/blog/2008/02/extension-to-the-shunting-yard-algorithm-to-allow-variable-numbers-of-arguments-to-functions/

    The ref is the cell address which is passed down to the actual compiled
    python code. Range basic operations signature require this reference, so
    it has to be written during OperatorNode.emit()
    https://github.com/iOiurson/koala/blob/master/koala/ast/graph.py#L292.

    This is needed because Excel range basic operations (+, -, * ...) are
    applied on matching cells.

    Example:
    Cell C2 has the following formula 'A1:A3 + B1:B3'.
    The output will actually be A2 + B2, because the formula is relative to
    cell C2.

    :param expression: formula text; one leading '=' is removed.
    :param named_ranges: container of names; 'range' operands whose value is
        in it get the subtype 'named_range'.
    :param ref: cell address handed to ``create_node`` for every token.
    :param tokenize_range: forwarded to ``ExcelParser``; when falsy, the
        tokens around ':' are merged back into single 'pointer' operands.
    :return: list of the nodes produced by ``create_node``, in RPN order.
    :raises Exception: "Mismatched or misplaced parentheses" when
        parentheses or argument separators do not balance.
    """

    # remove leading =
    if expression.startswith('='):
        expression = expression[1:]

    p = ExcelParser(tokenize_range=tokenize_range)
    p.parse(expression)

    # insert tokens for '(' and ')', to make things clearer below
    tokens = []
    for t in p.tokens.items:
        if t.ttype == "function" and t.tsubtype == "start":
            t.tsubtype = ""
            tokens.append(t)
            tokens.append(f_token('(', 'arglist', 'start'))
        elif t.ttype == "function" and t.tsubtype == "stop":
            tokens.append(f_token(')', 'arglist', 'stop'))
        elif t.ttype == "subexpression" and t.tsubtype == "start":
            t.tvalue = '('
            tokens.append(t)
        elif t.ttype == "subexpression" and t.tsubtype == "stop":
            t.tvalue = ')'
            tokens.append(t)
        elif t.ttype == "operand" and t.tsubtype == "range" and t.tvalue in named_ranges:
            t.tsubtype = "named_range"
            tokens.append(t)
        else:
            tokens.append(t)

    # http://office.microsoft.com/en-us/excel-help/calculation-operators-and-precedence-HP010078886.aspx
    operators = {}
    operators[':'] = Operator(':', 8, 'left')
    operators[''] = Operator(' ', 8, 'left')
    operators[','] = Operator(',', 8, 'left')
    operators['u-'] = Operator('u-', 7, 'left')  # unary negation
    operators['%'] = Operator('%', 6, 'left')
    operators['^'] = Operator('^', 5, 'left')
    operators['*'] = Operator('*', 4, 'left')
    operators['/'] = Operator('/', 4, 'left')
    operators['+'] = Operator('+', 3, 'left')
    operators['-'] = Operator('-', 3, 'left')
    operators['&'] = Operator('&', 2, 'left')
    operators['='] = Operator('=', 1, 'left')
    operators['<'] = Operator('<', 1, 'left')
    operators['>'] = Operator('>', 1, 'left')
    operators['<='] = Operator('<=', 1, 'left')
    operators['>='] = Operator('>=', 1, 'left')
    operators['<>'] = Operator('<>', 1, 'left')

    output = collections.deque()
    stack = []
    were_values = []
    arg_count = []

    new_tokens = []

    # reconstruct expressions with ':' and replace the corresponding tokens
    # by the reconstructed expression
    # NOTE: `tokens` is intentionally modified (tokens.remove) while it is
    # being enumerated, so the tokens absorbed into a reconstructed
    # expression are not visited again by the outer loop.
    if not tokenize_range:
        for index, token in enumerate(tokens):
            new_tokens.append(token)

            # `unicode` does not exist on Python 3, so an exact-type test
            # against it raises NameError for any non-str value there.
            # six.string_types is (basestring,) on Python 2 and (str,) on
            # Python 3, which covers both interpreters.
            if isinstance(token.tvalue, six.string_types):

                if token.tvalue.startswith(':'):  # example -> :OFFSET( or simply :A10
                    depth = 0
                    expr = ''

                    rev = reversed(tokens[:index])

                    for t in rev:  # going backwards, 'stop' starts, 'start' stops
                        if t.tsubtype == 'stop':
                            depth += 1
                        elif depth > 0 and t.tsubtype == 'start':
                            depth -= 1

                        expr = t.tvalue + expr

                        new_tokens.pop()

                        if depth == 0:
                            new_tokens.pop()  # these 2 lines are needed to remove INDEX()
                            new_tokens.pop()
                            expr = six.next(rev).tvalue + expr
                            break

                    expr += token.tvalue

                    depth = 0

                    if token.tvalue[1:] in ['OFFSET', 'INDEX']:
                        for t in tokens[(index + 1):]:
                            if t.tsubtype == 'start':
                                depth += 1
                            elif depth > 0 and t.tsubtype == 'stop':
                                depth -= 1

                            expr += t.tvalue

                            tokens.remove(t)

                            if depth == 0:
                                break

                    new_tokens.append(f_token(expr, 'operand', 'pointer'))

                elif ':OFFSET' in token.tvalue or ':INDEX' in token.tvalue:  # example -> A1:OFFSET(
                    depth = 0
                    expr = ''

                    expr += token.tvalue

                    for t in tokens[(index + 1):]:
                        if t.tsubtype == 'start':
                            depth += 1
                        elif t.tsubtype == 'stop':
                            depth -= 1

                        expr += t.tvalue

                        tokens.remove(t)

                        if depth == 0:
                            new_tokens.pop()
                            break

                    new_tokens.append(f_token(expr, 'operand', 'pointer'))

    tokens = new_tokens if new_tokens else tokens

    for t in tokens:

        if t.ttype == "operand":
            output.append(create_node(t, ref))
            if were_values:
                # mark that the enclosing function call has seen a value
                were_values.pop()
                were_values.append(True)

        elif t.ttype == "function":
            stack.append(t)
            arg_count.append(0)
            if were_values:
                were_values.pop()
                were_values.append(True)
            were_values.append(False)

        elif t.ttype == "argument":

            # flush pending operators back to the opening parenthesis
            while stack and (stack[-1].tsubtype != "start"):
                output.append(create_node(stack.pop(), ref))

            if were_values.pop():
                arg_count[-1] += 1
            were_values.append(False)

            if not stack:
                raise Exception("Mismatched or misplaced parentheses")

        elif t.ttype.startswith('operator'):

            # a prefix '-' is unary negation, stored under the key 'u-'
            if t.ttype.endswith('-prefix') and t.tvalue == "-":
                o1 = operators['u-']
            else:
                o1 = operators[t.tvalue]

            while stack and stack[-1].ttype.startswith('operator'):

                if stack[-1].ttype.endswith('-prefix') and stack[-1].tvalue == "-":
                    o2 = operators['u-']
                else:
                    o2 = operators[stack[-1].tvalue]

                if ((o1.associativity == "left" and o1.precedence <= o2.precedence)
                        or
                        (o1.associativity == "right" and o1.precedence < o2.precedence)):
                    output.append(create_node(stack.pop(), ref))
                else:
                    break
            stack.append(t)

        elif t.tsubtype == "start":
            stack.append(t)

        elif t.tsubtype == "stop":

            while stack and stack[-1].tsubtype != "start":
                output.append(create_node(stack.pop(), ref))

            if not stack:
                raise Exception("Mismatched or misplaced parentheses")

            stack.pop()  # discard the matching 'start' token

            if stack and stack[-1].ttype == "function":
                f = create_node(stack.pop(), ref)
                a = arg_count.pop()
                w = were_values.pop()
                if w:
                    # the last argument slot contained a value
                    a += 1
                f.num_args = a
                output.append(f)

    # anything still on the stack must be an operator
    while stack:
        if (stack[-1].tsubtype == "start" or stack[-1].tsubtype == "stop"):
            raise Exception("Mismatched or misplaced parentheses")

        output.append(create_node(stack.pop(), ref))

    # convert to list
    return list(output)