def get_parse_rules():
    """Build the pyparsing expression that tokenizes JavaScript source.

    pyparsing is used here mainly to:
      - recognise comments and quoted strings so they pass through intact,
      - split out line breaks together with the indentation (spaces/tabs)
        that immediately follows them,
      - spot the ``var`` keyword even when it does not start a command,
        e.g. ``for (var i=0; ..)``; text is also split on whitespace
        and on ``(`` so the keyword becomes a token of its own.

    The first character of each results name encodes the token category:
      ``+``  JavaScript-significant characters (whitespace included),
      ``/``  comments,
      ``|``  line breaks and the indentation after them.

    Returns:
        A pyparsing alternation of the token rules; alternatives are tried
        in the order listed in the ``return`` statement.
    """
    # Runs of blanks inside a line.
    blanks = Word(' \t')('+Space')

    # A line break followed by the optional blanks that indent the next line.
    # NOTE(review): Word('\r\n', exact=2) takes any two CR/LF characters, so
    # it presumably also accepts "\n\n" as a single break - confirm intended.
    newline = (Word('\r\n', exact=2) | Word('\n', exact=1))('|Linebreak')
    leading_blanks = Optional(Word(' \t'))('|Indent')
    newline_and_indent = newline + leading_blanks

    # Delimited literals are kept verbatim, delimiters included.  The
    # "/"-delimited form covers text written between two slashes.
    double_quoted = QuotedString('"', unquoteResults=False)
    single_quoted = QuotedString("'", unquoteResults=False)
    slash_quoted = QuotedString("/", unquoteResults=False)
    backtick_quoted = QuotedString('`', multiline=True, unquoteResults=False)
    literal_text = (double_quoted | single_quoted | slash_quoted
                    | backtick_quoted)('+Cmd')

    var_keyword = Keyword('var')('+CmVar')
    comment = cppStyleComment()('/Comment')

    # Any run of characters with no special meaning to the rules above ...
    plain_run = CharsNotIn(';"\'/` (\r\n\t')('+Cmd')
    # ... and, as a last resort, one special character on its own.
    lone_special = Word(';"\'/`(\r', exact=1)('+Cmd')

    return (blanks
            | comment
            | literal_text
            | newline_and_indent
            | var_keyword
            | plain_run
            | lone_special)
def getParseRules():
    """Build the pyparsing expression that splits source text into tokens.

    Results names used by the returned expression:
      ``Comment``    a C/C++-style comment,
      ``Quoted``     a double-, single- or backtick-quoted string, kept
                     verbatim with its delimiters,
      ``Linebreak``  a line break,
      ``Indent``     the spaces/tabs directly after a line break,
      ``CmdSep``     a single ``;``,
      ``Command``    a run of any other characters.

    Returns:
        A pyparsing alternation of the token rules; alternatives are tried
        in the order listed in the ``return`` statement.
    """
    # NOTE(review): Word('\r\n', exact=2) takes any two CR/LF characters, so
    # it presumably also accepts "\n\n" as a single break - confirm intended.
    newline = (Word('\r\n', exact=2) | Word('\n', exact=1))('Linebreak')
    leading_blanks = Optional(Word(' \t'))('Indent')
    newline_and_indent = newline + leading_blanks

    double_quoted = QuotedString('"', unquoteResults=False)
    single_quoted = QuotedString("'", unquoteResults=False)
    backtick_quoted = QuotedString('`', multiline=True, unquoteResults=False)
    quoted_text = (double_quoted | single_quoted | backtick_quoted)('Quoted')

    comment = cppStyleComment()('Comment')
    separator = Word(';', exact=1)('CmdSep')
    # Everything up to the next separator, quote, slash or line break.
    command_text = CharsNotIn(';"\'`/\r\n')('Command')

    return (comment
            | quoted_text
            | newline_and_indent
            | separator
            | command_text)
def preprocessor(source):
    """Divide raw source code into statements.

    Statements are found by locating the longest possible parenthesized
    expressions (nesting allowed); C/C++-style comments are ignored while
    parsing.

    Args:
        source: raw source text to split.

    Returns:
        A list holding the ``code_reassembly`` result for each token
        produced by the parse, in order.
    """
    # Every printable character except the two parentheses themselves.
    non_paren_chars = ''.join(
        ch for ch in pyparsing.printables if ch not in '()')
    word = pyparsing.Word(non_paren_chars)

    # A parenthesized group may contain words and further groups.
    nested = pyparsing.Forward()
    nested << pyparsing.nestedExpr(
        content=pyparsing.OneOrMore(nested | word))

    grammar = pyparsing.OneOrMore(nested)
    grammar.ignore(pyparsing.cppStyleComment())

    statements = []
    for token in grammar.parseString(source):
        statements.append(code_reassembly(token))
    return statements
def preprocessor(source):
    """Divide raw source code into statements.

    The source is parsed as a sequence of parenthesized expressions, each
    taken as long as possible and allowed to nest; C/C++-style comments are
    skipped.

    Args:
        source: raw source text to split.

    Returns:
        A list with one ``code_reassembly`` result per parsed token.
    """
    parens = '()'
    allowed = ''.join(c for c in pyparsing.printables if c not in parens)
    plain_word = pyparsing.Word(allowed)

    group = pyparsing.Forward()
    group_content = pyparsing.OneOrMore(group | plain_word)
    group << pyparsing.nestedExpr(content=group_content)

    parser = pyparsing.OneOrMore(group)
    skipped = pyparsing.cppStyleComment()
    parser.ignore(skipped)

    parsed = parser.parseString(source)
    return [code_reassembly(item) for item in parsed]
def get_parse_rules():
    """Build the pyparsing expression that tokenizes JavaScript source.

    pyparsing is used here mainly to:
      - recognise comments and quoted strings so they pass through intact,
      - split out line breaks together with the indentation (spaces/tabs)
        that immediately follows them.

    The first character of each results name encodes the token category:
      ``+``  JavaScript-significant characters (whitespace included),
      ``/``  comments,
      ``|``  line breaks and the indentation after them.

    Returns:
        A pyparsing alternation of the token rules; alternatives are tried
        in the order listed in the ``return`` statement.
    """
    # Runs of blanks inside a line.
    blanks = Word(' \t')('+Space')

    # A line break followed by the optional blanks that indent the next line.
    # NOTE(review): Word('\r\n', exact=2) takes any two CR/LF characters, so
    # it presumably also accepts "\n\n" as a single break - confirm intended.
    newline = (Word('\r\n', exact=2) | Word('\n', exact=1))('|Linebreak')
    leading_blanks = Optional(Word(' \t'))('|Indent')
    newline_and_indent = newline + leading_blanks

    # Delimited literals are kept verbatim, delimiters included.
    double_quoted = QuotedString('"', unquoteResults=False)
    single_quoted = QuotedString("'", unquoteResults=False)
    slash_quoted = QuotedString("/", unquoteResults=False)
    backtick_quoted = QuotedString('`', multiline=True, unquoteResults=False)
    literal_text = (double_quoted | single_quoted | slash_quoted
                    | backtick_quoted)('+Cmd')

    comment = cppStyleComment()('/Comment')

    # Any run of characters with no special meaning to the rules above ...
    plain_run = CharsNotIn(';"\'/` (\r\n\t')('+Cmd')
    # ... and, as a last resort, one special character on its own.
    lone_special = Word(';"\'/`(\r', exact=1)('+Cmd')

    return (blanks
            | comment
            | literal_text
            | newline_and_indent
            | plain_run
            | lone_special)
def get_parse_rules():
    """Build the pyparsing expression that tokenizes JavaScript source.

    pyparsing is used here mainly to:
      - recognise comments and quoted strings so they pass through intact,
      - split out line breaks together with the indentation (spaces/tabs)
        that immediately follows them.

    The first character of each results name encodes the token category:
      ``+``  JavaScript-significant characters (whitespace included),
      ``/``  comments,
      ``|``  line breaks and the indentation after them.

    Names are recognised over the whole Basic Multilingual Plane
    (code points 0..65535), not just ASCII.

    Returns:
        A pyparsing alternation of the token rules; alternatives are tried
        in the order listed in the ``return`` statement.
    """
    # Python 2 spells these unichr/xrange; fall back to the Python 3 names.
    try:
        to_char, code_points = unichr, xrange(65536)
    except NameError:
        to_char, code_points = chr, range(65536)

    # Characters that get a rule of their own (or, for "/", are left to the
    # comment rule) and therefore must not be swallowed by '+CmdOther'.
    excluded = u';{}"\'/`\r\n'

    # Classify every code point in a single pass.  The predicate is applied
    # directly: using ``filter(...)`` as a truth test only works on Python 2,
    # because on Python 3 it returns an iterator that is always truthy.
    alphas, alphanums, non_starters = [], [], []
    for code in code_points:
        ch = to_char(code)
        if ch.isalpha():
            alphas.append(ch)
        elif ch not in excluded:
            non_starters.append(ch)
        if ch.isalnum():
            alphanums.append(ch)
    unicodeAlphas = u''.join(alphas)
    unicodeAlphanums = u''.join(alphanums)
    unicodeNonStarters = u''.join(non_starters)

    # NOTE(review): Word('\r\n', exact=2) takes any two CR/LF characters, so
    # it presumably also accepts "\n\n" as a single break - confirm intended.
    lineBreak = (Word('\r\n', exact=2) | Word('\n', exact=1))('|Linebreak')
    breakIndent = lineBreak + Optional(Word(' \t'))('|Indent')

    # "/"-delimited text is deliberately not treated as a quoted string here.
    quotedString = (QuotedString('"', unquoteResults=False)
                    | QuotedString("'", unquoteResults=False)
                    | QuotedString('`', multiline=True, unquoteResults=False)
                    )('+CmdQuoted')

    # A name starts with a letter and continues with letters or digits.
    jsName = Word(unicodeAlphas, unicodeAlphanums)('+CmdName')
    # Runs of everything that cannot start a name (blanks, digits,
    # operators, ...), minus the excluded characters above.
    jsNotName = Word(unicodeNonStarters)('+CmdOther')

    # No separate whitespace rule: blanks are part of '+CmdOther' runs.
    return (cppStyleComment()('/Comment')
            | quotedString
            | breakIndent
            | jsName
            | Literal(u';')('+CmdSep')
            | Word(u'{}', exact=1)('+CmdBlock')
            # Characters excluded from unicodeNonStarters and not handled
            # by an earlier rule - all of them except "/".
            | Word(u'"\'`\r', exact=1)('+CmdOther')
            | jsNotName)
def prepare_grammar():
    """Build the pyparsing grammar for the declaration/statement language.

    The grammar accepts zero or more top-level declarations (variables,
    structs, templates and sequences), with C/C++-style comments ignored.

    Expressions are turned into nested lists by parse actions that work on a
    shared stack: operands are pushed as they are parsed and each operator
    replaces its operands with ``[operator, operand, ...]``.  Chained
    comparisons are flattened:

        a < b      -> ['<', a, b]
        a < b < c  -> ['<>', a, '<', b, '<', c]

    The stack is created once per call to this function and is not reset
    between parses done with the returned grammar.  The parse actions also
    print trace output, and the stack is printed when a parse completes.

    Returns:
        The pyparsing expression for a whole source text.
    """
    import pyparsing as pp

    # Expression trees under construction; shared by all parse actions.
    stack = []

    def pop(head, n, extra=None):
        """Replace the top *n* stack entries by ``[head] + entries + extra``."""
        print("pop(id={}, n={})".format(head, n))
        tail = stack[-n:]
        del stack[-n:]
        stack.append([head] + tail + (extra or []))
        return stack[-1]

    def RIGHT_UNARY(sym, arg):
        """Return ``arg`` optionally preceded by any number of ``sym``."""
        parser = pp.Forward()
        body = sym + parser
        body.setParseAction(lambda t: pop(t[0], 1))
        parser << (arg ^ body)
        return parser

    def LEFT_BINARY(sym, arg):
        """Return ``arg (sym arg)*``, folding operands pairwise left to right."""
        body = sym + arg
        body.setParseAction(lambda t: pop(t[0], 2))
        return arg + pp.ZeroOrMore(body)

    def RIGHT_BINARY(sym, arg):
        """Return ``arg (sym <same rule>)?``, folding operands right to left."""
        parser = pp.Forward()
        body = sym + parser
        body.setParseAction(lambda t: pop(t[0], 2))
        parser << (arg + pp.Optional(body))
        return parser

    def push(t):
        """Push a leaf (identifier or number) onto the stack."""
        print('push0')
        print(stack)
        print(t)
        stack.append(t[0])
        print(stack)
        return t[0]

    def push1(t):
        """Combine the two top stack entries under the operator ``t[0]``."""
        print('push1')
        print(stack)
        print(t)
        b = stack.pop()
        a = stack.pop()
        r = [t[0], a, b]
        stack.append(r)
        print(stack)
        return r

    # Punctuation that is matched but left out of the results.
    LBRA, RBRA, SCOLON, LPAR, RPAR, COMMA = [pp.Suppress(c) for c in '{};(),']
    ASSIGN, COLON = [pp.Suppress(c) for c in '=:']

    # Operators that are kept in the results (they label the tree nodes).
    PLUS, MINUS, NOT, MUL, DIV, MOD = [pp.Literal(c) for c in '+-!*/%']
    DOT = pp.Literal('.')
    LSPAR = pp.Literal('[')
    RSPAR = pp.Literal(']')

    ident = pp.Word(pp.alphas + '_', pp.alphanums + '_')
    # Identifier used inside expressions: same syntax, but pushed on the stack.
    eident = ident.copy()
    eident.setParseAction(push)

    dec_digit = pp.Regex(r'0|([1-9]\d*)').setParseAction(
        lambda toks: int(toks[0]))
    hex_digit = pp.Regex(r'0x[0-9a-fA-F]+').setParseAction(
        lambda toks: int(toks[0][2:], 16))
    bin_digit = pp.Regex(r'0b[01]+').setParseAction(
        lambda toks: int(toks[0][2:], 2))
    digit = dec_digit ^ hex_digit ^ bin_digit
    digit.setParseAction(push)

    expr = pp.Forward()

    # Member and index access: identifier followed by ".name" / "[expr]".
    struct_access = DOT + eident
    struct_access.setParseAction(push1)
    array_access = LSPAR + expr + RSPAR
    array_access.setParseAction(push1)
    access_expr = eident + pp.Group(
        pp.ZeroOrMore(struct_access ^ array_access))

    par_expr = LPAR + expr + RPAR

    # TODO: what about casts
    top_expr = digit ^ access_expr ^ par_expr

    # Operator levels, tightest binding first.
    unr_expr = RIGHT_UNARY(PLUS ^ MINUS ^ NOT, top_expr)
    mul_expr = LEFT_BINARY(MUL ^ DIV ^ MOD, unr_expr)
    add_expr = LEFT_BINARY(PLUS ^ MINUS, mul_expr)

    LT = pp.Literal("<")
    LE = pp.Literal("<=")
    EQ = pp.Literal("==")
    NEQ = pp.Literal("!=")
    GE = pp.Literal(">=")
    GT = pp.Literal(">")
    CMP = LT ^ LE ^ EQ ^ NEQ ^ GE ^ GT

    def cmp_tail2_merge(t):
        """Extend a comparison by one more ``op operand`` link."""
        print('cmp_tail2_merge')
        print(stack)
        print(t)
        z = stack.pop()
        print("z={}".format(z))
        h = stack.pop()
        print("h={}".format(h))
        if h[0] == '<>':
            # stack = ..., [<> ... op1 a], z
            #      -> [<> ... op1 a op2 z]
            ret = h + [t[0], z]
        else:
            # stack = ..., [op1 a b], z
            #      -> [<> a op1 b op2 z]
            ret = ['<>', h[1], h[0], h[2], t[0], z]
        print("ret={}".format(ret))
        stack.append(ret)
        return ret

    # The first comparison builds a plain binary node; every further one
    # converts it to / extends the flattened '<>' chain form.
    cmp_tail1 = CMP + add_expr
    cmp_tail1.setParseAction(lambda t: pop(t[0], 2))
    cmp_tail2 = CMP + add_expr
    cmp_tail2.setParseAction(cmp_tail2_merge)
    cmp_expr = add_expr + pp.Optional(cmp_tail1 + pp.ZeroOrMore(cmp_tail2))

    AND = pp.Literal('&&')
    and_expr = RIGHT_BINARY(AND, cmp_expr)

    OR = pp.Literal('||')
    or_expr = RIGHT_BINARY(OR, and_expr)

    IND = pp.Literal("==>")
    induc_expr = RIGHT_BINARY(IND, or_expr)

    equiv_expr = induc_expr + pp.Optional(
        (pp.Literal('<==>') + induc_expr).setParseAction(push1))

    # cond ? a : b  ->  ['?', cond, a, b]
    cond_expr = pp.Forward()
    QUEST = pp.Literal('?')
    cond_tail = QUEST + cond_expr + COLON + cond_expr
    cond_tail.setParseAction(lambda t: pop(t[0], 3))
    cond_expr << (equiv_expr + pp.Optional(cond_tail))

    expr << cond_expr

    buildin_type = pp.Keyword('int')

    # Optional 'extern' modifier, reported as a boolean.
    extern_mod = pp.Optional(pp.Keyword('extern'))
    extern_mod.setParseAction(lambda toks: len(toks) != 0)
    # setResultsName returns a named copy and leaves the original untouched,
    # so the result has to be rebound for the name to take effect.
    extern_mod = extern_mod.setResultsName('extern')

    var_decl_body = pp.Forward()
    stmt = pp.Forward()

    # Statements.
    block_stmt = LBRA + pp.Group(pp.ZeroOrMore(stmt)) + RBRA
    assign_stmt = access_expr + ASSIGN + expr + SCOLON

    RUN = pp.Suppress(pp.Keyword('run'))
    run_stmt = RUN + ident + SCOLON

    IF = pp.Suppress(pp.Keyword('if'))
    ELSE = pp.Suppress(pp.Keyword('else'))
    if_stmt = pp.Forward()
    if_stmt << (IF + LPAR + expr + RPAR + block_stmt
                + pp.Optional(ELSE + (block_stmt ^ if_stmt)))

    # Template invocation: name!(.param = value, ...);
    arg_bind = pp.Group(DOT + ident + ASSIGN + access_expr)
    arg_bind_list = pp.Group(pp.Optional(
        arg_bind + pp.ZeroOrMore(COMMA + arg_bind) + pp.Optional(COMMA)))
    template_stmt = ident + NOT + LPAR + arg_bind_list + RPAR + SCOLON

    stmt << pp.Group(
        block_stmt ^ if_stmt ^ run_stmt ^ assign_stmt ^ template_stmt)

    # Declarations.
    contracts_decl = pp.Empty()
    arg_list = pp.Group(pp.Optional(
        var_decl_body + pp.ZeroOrMore(COMMA + var_decl_body)
        + pp.Optional(COMMA)))

    TEMPLATE = pp.Suppress(pp.Keyword('template'))
    template_decl = (extern_mod + TEMPLATE + ident + LPAR + arg_list + RPAR
                     + contracts_decl + block_stmt)

    SEQUENCE = pp.Suppress(pp.Keyword('sequence'))
    sequence_decl = (extern_mod + SEQUENCE + ident + contracts_decl
                     + block_stmt)

    STRUCT = pp.Keyword('struct')
    struct_def = LBRA + pp.Group(pp.ZeroOrMore(var_decl_body + SCOLON)) + RBRA
    struct_decl = STRUCT + ident + struct_def + SCOLON
    struct_type = STRUCT + ident

    type_decl = buildin_type ^ struct_type
    var_decl_body << type_decl + ident
    # A missing initializer is reported as a None token.
    var_decl = (extern_mod + var_decl_body
                + pp.Optional(ASSIGN + expr, default=None) + SCOLON)

    decl = pp.Group(var_decl ^ struct_decl ^ template_decl ^ sequence_decl)

    grammar = pp.ZeroOrMore(decl)
    comment = pp.cppStyleComment()
    grammar.ignore(comment)

    def show(t):
        """Print the expression stack once the whole input has been parsed."""
        print(stack)

    grammar.setParseAction(show)
    return grammar