def tokenize_wrapper(input):
  """Tokenize a string, suppressing tokens that only encode layout.

  Yields the standard 5-tuples from tokenize.generate_tokens(), except
  NEWLINE / INDENT / DEDENT, which are dropped so callers see a "flat"
  token stream.
  """
  insignificant = (token.NEWLINE, token.INDENT, token.DEDENT)
  # Python 2: .next is the bound next() method of the line generator.
  readline = driver.generate_lines(input).next
  for tok in tokenize.generate_tokens(readline):
    if tok[0] not in insignificant:
      yield tok
def suite(self, text):
  """Parse source text as a Python suite and return the parse tree.

  Args:
    text: source code as a string.

  Returns:
    The tree produced by self.driver.parse_tokens().
  """
  if _READ_SOURCE_AS_UNICODE:
    f = io.StringIO(text)
  else:
    import cStringIO
    # BUG FIX: the buffer must be seeded with the source text.  Previously
    # cStringIO.StringIO() was created empty, so this branch tokenized
    # nothing instead of `text`.
    f = cStringIO.StringIO(text)
  tokens = tokenize.generate_tokens(f.readline)
  tree = self.driver.parse_tokens(tokens, start_symbol=self.start_symbol)
  return tree
def __init__(self, filename, stream=None):
  """Read and analyze a grammar description.

  Args:
    filename: name of the grammar file (also used for error messages).
    stream: optional open file object; when omitted, `filename` is opened
      here and closed again as soon as parsing finishes.  A caller-supplied
      stream is left open for the caller to manage.
  """
  we_opened_it = stream is None
  if we_opened_it:
    stream = open(filename)

  self.filename = filename
  self.stream = stream
  self.generator = tokenize.generate_tokens(stream.readline)
  self.gettoken()  # Initialize lookahead

  # parse() consumes the whole token stream, so the file can be closed
  # right afterwards (but only if we were the ones who opened it).
  self.dfas, self.startsymbol = self.parse()
  if we_opened_it:
    stream.close()

  self.first = {}  # map from symbol name to set of tokens
  self.addfirstsets()
def main(argv):
  """Entry point: load a pickled grammar, then parse or compile a file.

  Usage: main([prog, grammar_path, action, ...action args])
  Actions: stdlib-parse | parse | compile | compile2.

  Raises:
    RuntimeError: on an unknown action.
  """
  grammar_path = argv[1]
  # NOTE: This is cached as a pickle
  grammar = driver.load_grammar(grammar_path)
  FILE_INPUT = grammar.symbol2number['file_input']

  symbols = Symbols(grammar)
  pytree.Init(symbols)  # for type_repr() pretty printing
  transformer.Init(symbols)  # for _names and other dicts

  # In Python 2 code, always use from __future__ import print_function.
  try:
    del grammar.keywords["print"]
  except KeyError:
    pass

  do_glue = True  # Make it a flag

  if do_glue:
    # Emulating parser.st structures from parsermodule.c.
    # They have a totuple() method, which outputs tuples like this.
    def py2st(grammar, raw_node):
      type, value, context, children = raw_node

      # See pytree.Leaf
      if context:
        _, (lineno, column) = context
      else:
        lineno = 0  # default in Leaf
        column = 0

      if children:
        return (type,) + tuple(children)
      else:
        return (type, value, lineno, column)

    convert = py2st
  else:
    convert = pytree.convert

  d = driver.Driver(grammar, convert=convert)

  action = argv[2]

  if action == 'stdlib-parse':
    # This is what the compiler/ package was written against.
    import parser
    py_path = argv[3]
    with open(py_path) as f:
      st = parser.suite(f.read())

    tree = st.totuple()

    n = transformer.CountTupleTree(tree)
    log('COUNT %d', n)
    printer = transformer.TupleTreePrinter(HostStdlibNames())
    printer.Print(tree)

  elif action == 'parse':
    py_path = argv[3]
    # Parsing is lazy over the token stream, so keep the file open until
    # parse_tokens() finishes.
    with open(py_path) as f:
      tokens = tokenize.generate_tokens(f.readline)
      tree = d.parse_tokens(tokens, start_symbol=FILE_INPUT)

    if isinstance(tree, tuple):
      n = transformer.CountTupleTree(tree)
      log('COUNT %d', n)

      printer = transformer.TupleTreePrinter(transformer._names)
      printer.Print(tree)
    else:
      tree.PrettyPrint(sys.stdout)
      log('\tChildren: %d' % len(tree.children), file=sys.stderr)

  elif action == 'compile':
    py_path = argv[3]
    out_path = argv[4]

    if do_glue:
      py_parser = Pgen2PythonParser(d, FILE_INPUT)
      printer = transformer.TupleTreePrinter(transformer._names)
      tr = transformer.Pgen2Transformer(py_parser, printer)
    else:
      tr = transformer.Transformer()

    # for Python 2.7 compatibility:
    if _READ_SOURCE_AS_UNICODE:
      f = codecs.open(py_path, encoding='utf-8')
    else:
      f = open(py_path)

    # BUG FIX: the source file handle used to leak; close it as soon as
    # the contents are read.
    try:
      contents = f.read()
    finally:
      f.close()

    co = pycodegen.compile(contents, py_path, 'exec', transformer=tr)
    log("Code length: %d", len(co.co_code))

    # Write the .pyc file
    with open(out_path, 'wb') as out_f:
      h = pycodegen.getPycHeader(py_path)
      out_f.write(h)
      marshal.dump(co, out_f)

  elif action == 'compile2':
    in_path = argv[3]
    out_path = argv[4]

    from compiler2 import pycodegen as pycodegen2
    from misc import stdlib_compile

    stdlib_compile.compileAndWrite(in_path, out_path, pycodegen2.compile)

  else:
    raise RuntimeError('Invalid action %r' % action)
def OpyCommandMain(argv):
  """Dispatch to the right action.

  Args:
    argv: [subcommand, ...subcommand args].

  Returns:
    An integer exit status for some actions; None otherwise.

  Raises:
    error.Usage: on a missing/invalid subcommand or bad arguments.
  """
  # TODO: Use core/arg_def.
  #opts, argv = Options().parse_args(argv)

  try:
    action = argv[0]
  except IndexError:
    raise error.Usage('opy: Missing required subcommand.')

  argv = argv[1:]

  # TODO: Should I do input.ReadRequiredArg()?  That will shift the input.
  if action in (
      'parse', 'parse-with', 'compile', 'dis', 'ast', 'symbols', 'cfg',
      'compile-ovm', 'eval', 'repl', 'run', 'run-ovm'):
    loader = pyutil.GetResourceLoader()
    f = loader.open(GRAMMAR_REL_PATH)
    contents = f.read()
    f.close()

    gr = grammar.Grammar()
    gr.loads(contents)

    # In Python 2 code, always use from __future__ import print_function.
    try:
      del gr.keywords["print"]
    except KeyError:
      pass

    symbols = Symbols(gr)
    pytree.Init(symbols)  # for type_repr() pretty printing
    transformer.Init(symbols)  # for _names and other dicts

    compiler = skeleton.Compiler(gr)
  else:
    # e.g. pgen2 doesn't use any of these.  Maybe we should make a different
    # tool.
    compiler = None

  # TODO: Also have a run_spec for 'opyc run'.
  compile_spec = arg_def.OilFlags('opy')
  compile_spec.Flag('-emit-docstring', args.Bool, default=True,
                    help='Whether to emit docstrings')
  compile_spec.Flag('-fast-ops', args.Bool, default=True,
                    help='Whether to emit LOAD_FAST, STORE_FAST, etc.')
  compile_spec.Flag('-oil-subset', args.Bool, default=False,
                    help='Only allow the constructs necessary to implement'
                    'Oil. Example: using multiple inheritance will abort '
                    'compilation.')

  #
  # Actions
  #

  if action == 'pgen2':
    grammar_path = argv[0]
    marshal_path = argv[1]
    WriteGrammar(grammar_path, marshal_path)

  elif action == 'stdlib-parse':
    # This is what the compiler/ package was written against.
    import parser
    py_path = argv[1]
    with open(py_path) as f:
      st = parser.suite(f.read())

    tree = st.totuple()

    printer = TupleTreePrinter(HostStdlibNames())
    printer.Print(tree)
    n = CountTupleTree(tree)
    log('COUNT %d', n)

  elif action == 'lex':
    py_path = argv[0]
    with open(py_path) as f:
      tokens = tokenize.generate_tokens(f.readline)
      for typ, val, start, end, unused_line in tokens:
        print('%10s %10s %-10s %r' % (start, end, token.tok_name[typ], val))

  elif action == 'lex-names':  # Print all the NAME tokens.
    for py_path in argv:
      log('Lexing %s', py_path)
      with open(py_path) as f:
        tokens = tokenize.generate_tokens(f.readline)
        for typ, val, start, end, unused_line in tokens:
          if typ == token.NAME:
            print(val)

  elif action == 'parse':
    py_path = argv[0]
    with open(py_path) as f:
      tokens = tokenize.generate_tokens(f.readline)
      p = parse.Parser(gr)
      pnode = driver.PushTokens(p, tokens, gr, 'file_input')

    printer = ParseTreePrinter(transformer._names)  # print raw nodes
    printer.Print(pnode)

  # Parse with an arbitrary grammar, but the Python lexer.
  elif action == 'parse-with':
    grammar_path = argv[0]
    start_symbol = argv[1]
    code_str = argv[2]

    with open(grammar_path) as f:
      gr = pgen.MakeGrammar(f)

    f = cStringIO.StringIO(code_str)
    tokens = tokenize.generate_tokens(f.readline)
    p = parse.Parser(gr)  # no convert=
    try:
      pnode = driver.PushTokens(p, tokens, gr, start_symbol)
    except parse.ParseError as e:
      # Extract location information and show it.
      _, _, (lineno, offset) = e.opaque
      # extra line needed for '\n' ?
      lines = code_str.splitlines() + ['']
      line = lines[lineno-1]
      log(' %s', line)
      log(' %s^', ' '*offset)
      log('Parse Error: %s', e)
      return 1
    printer = ParseTreePrinter(transformer._names)  # print raw nodes
    printer.Print(pnode)

  elif action == 'ast':  # output AST
    opt, i = compile_spec.ParseArgv(argv)
    py_path = argv[i]
    with open(py_path) as f:
      graph = compiler.Compile(f, opt, 'exec', print_action='ast')

  elif action == 'symbols':  # output symbols
    opt, i = compile_spec.ParseArgv(argv)
    py_path = argv[i]
    with open(py_path) as f:
      graph = compiler.Compile(f, opt, 'exec', print_action='symbols')

  elif action == 'cfg':  # output Control Flow Graph
    opt, i = compile_spec.ParseArgv(argv)
    py_path = argv[i]
    with open(py_path) as f:
      graph = compiler.Compile(f, opt, 'exec', print_action='cfg')

  elif action == 'compile':  # 'opyc compile' is pgen2 + compiler2
    # spec.Arg('action', ['foo', 'bar'])
    # But that leads to some duplication.

    opt, i = compile_spec.ParseArgv(argv)

    py_path = argv[i]
    out_path = argv[i+1]

    with open(py_path) as f:
      co = compiler.Compile(f, opt, 'exec')

    log("Compiled to %d bytes of top-level bytecode", len(co.co_code))

    # Write the .pyc file
    with open(out_path, 'wb') as out_f:
      h = misc.getPycHeader(py_path)
      out_f.write(h)
      marshal.dump(co, out_f)

  elif action == 'compile-ovm':  # NOTE: obsolete
    from ovm2 import oheap2

    opt, i = compile_spec.ParseArgv(argv)
    py_path = argv[i]
    out_path = argv[i+1]

    # Compile to Python bytecode (TODO: remove ovm_codegen.py)
    mode = 'exec'
    with open(py_path) as f:
      co = compiler.Compile(f, opt, mode)

    if 1:
      with open(out_path, 'wb') as out_f:
        oheap2.Write(co, out_f)
      return 0

    log("Compiled to %d bytes of top-level bytecode", len(co.co_code))
    # Write the .pyc file
    with open(out_path, 'wb') as out_f:
      if 1:
        out_f.write(co.co_code)
      else:
        h = misc.getPycHeader(py_path)
        out_f.write(h)
        marshal.dump(co, out_f)

    log('Wrote only the bytecode to %r', out_path)

  elif action == 'eval':  # Like compile, but parses to a code object and prints it
    opt, i = compile_spec.ParseArgv(argv)
    py_expr = argv[i]

    f = skeleton.StringInput(py_expr, '<eval input>')
    co = compiler.Compile(f, opt, 'eval')

    v = dis_tool.Visitor()
    v.show_code(co)
    print()
    print('RESULT:')
    print(eval(co))

  elif action == 'repl':  # Like eval in a loop
    # BUG FIX: 'opt' was used below without ever being defined in this
    # branch, which raised NameError on the first REPL input.
    opt, _ = compile_spec.ParseArgv(argv)
    while True:
      py_expr = raw_input('opy> ')
      f = skeleton.StringInput(py_expr, '<REPL input>')

      # TODO: change this to 'single input'?  Why doesn't this work?
      co = compiler.Compile(f, opt, 'eval')

      v = dis_tool.Visitor()
      v.show_code(co)
      print(eval(co))

  elif action == 'dis-tables':
    out_dir = argv[0]
    pyc_paths = argv[1:]

    out = TableOutput(out_dir)

    for pyc_path in pyc_paths:
      # BUG FIX: .pyc files are binary; open in 'rb' mode.
      with open(pyc_path, 'rb') as f:
        magic, unixtime, timestamp, code = dis_tool.unpack_pyc(f)
        WriteDisTables(pyc_path, code, out)

    out.Close()

  elif action == 'dis':
    opt, i = compile_spec.ParseArgv(argv)
    path = argv[i]
    v = dis_tool.Visitor()

    if path.endswith('.py'):
      with open(path) as f:
        co = compiler.Compile(f, opt, 'exec')

      log("Compiled to %d bytes of top-level bytecode", len(co.co_code))
      v.show_code(co)
    else:  # assume pyc_path
      with open(path, 'rb') as f:
        v.Visit(f)

  elif action == 'dis-md5':
    pyc_paths = argv
    if not pyc_paths:
      raise error.Usage('dis-md5: At least one .pyc path is required.')

    for path in pyc_paths:
      h = hashlib.md5()
      # BUG FIX: .pyc files are binary; open in 'rb' mode so the digest
      # covers the exact bytes on disk.
      with open(path, 'rb') as f:
        magic = f.read(4)
        h.update(magic)
        ignored_timestamp = f.read(4)
        while True:
          b = f.read(64 * 1024)
          if not b:
            break
          h.update(b)
      print('%6d %s %s' % (os.path.getsize(path), h.hexdigest(), path))

  elif action == 'run':  # Compile and run, without writing pyc file
    # TODO: Add an option like -v in __main__
    #level = logging.DEBUG if args.verbose else logging.WARNING
    #logging.basicConfig(level=level)

    opt, i = compile_spec.ParseArgv(argv)

    py_path = argv[i]
    opy_argv = argv[i:]

    if py_path.endswith('.py'):
      with open(py_path) as f:
        co = compiler.Compile(f, opt, 'exec')
      num_ticks = execfile.run_code_object(co, opy_argv)

    elif py_path.endswith('.pyc') or py_path.endswith('.opyc'):
      # BUG FIX: open marshalled bytecode in binary mode.
      with open(py_path, 'rb') as f:
        f.seek(8)  # past header.  TODO: validate it!
        co = marshal.load(f)
      num_ticks = execfile.run_code_object(co, opy_argv)

    else:
      raise error.Usage('Invalid path %r' % py_path)

  elif action == 'run-ovm':  # Compile and run, without writing pyc file
    opt, i = compile_spec.ParseArgv(argv)

    py_path = argv[i]
    opy_argv = argv[i+1:]

    if py_path.endswith('.py'):
      #mode = 'exec'
      mode = 'ovm'  # OVM bytecode is different!
      with open(py_path) as f:
        co = compiler.Compile(f, opt, mode)
      log('Compiled to %d bytes of OVM code', len(co.co_code))
      num_ticks = ovm.run_code_object(co, opy_argv)

    elif py_path.endswith('.pyc') or py_path.endswith('.opyc'):
      # BUG FIX: open marshalled bytecode in binary mode.
      with open(py_path, 'rb') as f:
        f.seek(8)  # past header.  TODO: validate it!
        co = marshal.load(f)
      num_ticks = ovm.run_code_object(co, opy_argv)

    else:
      raise error.Usage('Invalid path %r' % py_path)

  else:
    raise error.Usage('Invalid action %r' % action)
def Compile(f, opt, gr, mode, print_action=None):
  """Run the full compiler pipeline.

  Args:
    f: file handle with input source code
    opt: Parsed command line flags
    gr: Grammar
    mode: 'exec', 'eval', or 'single', like Python's builtin compile()
    print_action: 'ast', 'symbols', or 'cfg'.  Print an intermediate
      representation and return early (returning None).

  Returns:
    A code object, or None when print_action short-circuits the pipeline.

  Raises:
    AssertionError: if mode is not one of the three supported values.
  """
  filename = f.name

  tokens = tokenize.generate_tokens(f.readline)

  p = parse.Parser(gr)
  if mode == 'single':
    start_symbol = 'single_input'
  elif mode == 'exec':
    start_symbol = 'file_input'
  elif mode == 'eval':
    start_symbol = 'eval_input'
  else:
    # BUG FIX: an invalid mode used to fall through here, leaving
    # start_symbol unbound and raising NameError below.  Fail fast with the
    # same exception type the codegen dispatch uses.
    raise AssertionError('Invalid mode %r' % mode)

  parse_tree = driver.PushTokens(p, tokens, gr, start_symbol)

  parse_tuples = _ParseTreeToTuples(parse_tree)

  tr = transformer.Transformer()
  as_tree = tr.transform(parse_tuples)

  if print_action == 'ast':
    print(as_tree)
    return

  # NOTE: This currently does nothing!
  v = syntax.SyntaxErrorChecker()
  v.Dispatch(as_tree)

  s = symbols.SymbolVisitor()
  s.Dispatch(as_tree)

  if print_action == 'symbols':
    _PrintScopes(s.scopes)
    return

  graph = pyassem.FlowGraph()  # Mutated by code generator

  if mode == "single":  # Not used now?
    ctx = _ModuleContext(filename, opt, s.scopes)
    # NOTE: the name of the Frame is a comment, not exposed to users.
    frame = pyassem.Frame("<interactive>", filename)  # mutated
    gen = pycodegen.InteractiveCodeGenerator(ctx, frame, graph)
    gen.set_lineno(as_tree)

  elif mode == "exec":
    # TODO: Does this need to be made more efficient?
    p1 = future.FutureParser()
    p2 = future.BadFutureParser()
    p1.Dispatch(as_tree)
    p2.Dispatch(as_tree)

    ctx = _ModuleContext(filename, opt, s.scopes, futures=p1.get_features())
    frame = pyassem.Frame("<module>", filename)  # mutated
    gen = pycodegen.TopLevelCodeGenerator(ctx, frame, graph)

  elif mode == "eval":
    ctx = _ModuleContext(filename, opt, s.scopes)
    frame = pyassem.Frame("<expression>", filename)  # mutated
    gen = pycodegen.TopLevelCodeGenerator(ctx, frame, graph)

  else:
    raise AssertionError('Invalid mode %r' % mode)

  # NOTE: There is no Start() or FindLocals() at the top level.
  gen.Dispatch(as_tree)  # mutates graph
  gen.Finish()

  if print_action == 'cfg':
    print(graph)
    return

  co = pyassem.MakeCodeObject(frame, graph, opt)

  # NOTE: Could call marshal.dump here?
  return co