def __preppy__vlhs__(s, NAME=token.NAME, ENDMARKER=token.ENDMARKER):
    L = []
    try:
        tokenize.tokenize(BytesIO(s.strip()).readline, lambda *a: L.append(a))
    except:
        return False
    return len(L) == 2 and L[0][0] == NAME and L[1][0] == ENDMARKER
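# Hedged Python 3 counterpart of the check above (a sketch, not the original
# project's code): the two-argument callback form of tokenize.tokenize() is
# Python 2 only, and on Python 3 the stream also yields NEWLINE/ENDMARKER
# tokens, so those are filtered out before comparing.
import io, token, tokenize

def _is_single_name(s):
    try:
        toks = list(tokenize.generate_tokens(io.StringIO(s.strip()).readline))
    except (tokenize.TokenError, IndentationError):
        return False
    kinds = [t.type for t in toks if t.type not in (token.NEWLINE, tokenize.NL)]
    return kinds == [token.NAME, token.ENDMARKER]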
def __call__(self):
    """ Parse and send the colored source. """
    # store line offsets in self.lines
    self.lines = [0, 0]
    pos = 0
    while True:
        pos = self.raw.find(b'\n', pos) + 1
        if not pos:
            break
        self.lines.append(pos)
    self.lines.append(len(self.raw))
    # parse the source and write it
    self.pos = 0
    text = BytesIO(self.raw)
    self.out.write(b'<pre class="python">\n')
    try:
        if six.PY2:
            tokenize.tokenize(text.readline, self.format_tokenizer)
        else:
            for args in tokenize.tokenize(text.readline):
                self.format_tokenizer(*args)
    except tokenize.TokenError as ex:
        msg = ex.args[0]
        line = ex.args[1][0]
        self.out.write(b"<h5 class='error'>ERROR: %s%s</h5>" % (
            msg, self.raw[self.lines[line]:]))
    self.out.write(b'\n</pre>\n')
    return safe_nativestring(self.out.getvalue())
def findUsages():
    global directory, objMap, sharedFolders
    suffixes = (".py", ".csv", ".tsv")
    directories = [directory]
    # avoid folders that will be processed anyhow
    for shared in sharedFolders:
        skip = False
        tmpS = shared + "/"
        for folder in directories:
            tmpD = folder + "/"
            if platform.system() in ('Microsoft', 'Windows'):
                tmpS = tmpS.lower()
                tmpD = tmpD.lower()
            if tmpS.startswith(tmpD):
                skip = True
                break
        if not skip:
            directories.append(shared)

    for directory in directories:
        for root, dirnames, filenames in os.walk(directory):
            for filename in filter(lambda x: x.endswith(suffixes), filenames):
                currentFile = open(os.path.join(root, filename))
                if filename.endswith(".py"):
                    tokenize.tokenize(currentFile.readline, handle_token)
                elif filename.endswith(".csv"):
                    handleDataFiles(currentFile, ",")
                elif filename.endswith(".tsv"):
                    handleDataFiles(currentFile, "\t")
                currentFile.close()

    currentFile = open(objMap)
    tokenize.tokenize(currentFile.readline, handle_token)
    currentFile.close()
def format(self): """ Parse and send the colored source. """ # store line offsets in self.lines self.lines = [0, 0] pos = 0 while 1: pos = string.find(self.raw, '\n', pos) + 1 if not pos: break self.lines.append(pos) self.lines.append(len(self.raw)) # parse the source and write it self.pos = 0 text = cStringIO.StringIO(self.raw) self.out.write(self.stylesheet) self.out.write('<pre class="code">\n') try: tokenize.tokenize(text.readline, self) except tokenize.TokenError, ex: msg = ex[0] line = ex[1][0] self.out.write("<h3>ERROR: %s</h3>%s\n" % ( msg, self.raw[self.lines[line]:])) if self.cover_flag: self.out.write('</span>') self.cover_flag = False
def format(self, filename): global HEADER # store line offsets in self.lines self.lines = [0, 0] pos = 0 while 1: pos = string.find(self.raw, '\n', pos) + 1 if not pos: break self.lines.append(pos) self.lines.append(len(self.raw)) # parse the source and write it self.pos = 0 text = cStringIO.StringIO(self.raw) HEADER = HEADER.replace("$FILE", filename) if LOCAL_CONVERT: HEADER = HEADER.replace("$HIDE_INFO", "display: none;") self.out.write(HEADER) try: tokenize.tokenize(text.readline, self) except tokenize.TokenError, ex: msg = ex[0] line = ex[1][0] self.out.write("<h3>ERROR: %s</h3>%s\n" % ( msg, self.raw[self.lines[line]:])) self.out.write('</font></pre>')
def filter(filename):
    global name, module_has_docstring

    path, name = os.path.split(filename)
    root, ext = os.path.splitext(name)

    output("namespace "+root+" {\n", (0, 0))

    # set module name for tok_eater to use if there's a module doc string
    name = root

    sys.stderr.write('Filtering "'+filename+'"...')
    f = open(filename)
    tokenize.tokenize(f.readline, tok_eater)
    f.close()
    print_comment((0, 0))

    output("\n", (0, 0))
    output("} // end of namespace\n", (0, 0))

    if not module_has_docstring:
        # Put in default namespace documentation
        output('/** \\namespace '+root+' \n', (0, 0))
        output(' \\brief Module "%s" */\n' % (root), (0, 0))

    for s in outbuffer:
        outfile.write(s)
def format(self): """ Parse and send the colorized source to output.""" # Store line offsets in self.lines self.lines = [0, 0] pos = 0 while 1: pos = string.find(self.raw, '\n', pos) + 1 if not pos: break self.lines.append(pos) self.lines.append(len(self.raw)) # Parse the source and write it self.pos = 0 text = cStringIO.StringIO(self.raw) self.out.write('<pre><font face="Lucida,Courier New">') try: tokenize.tokenize(text.readline, self) # self as handler callable except tokenize.TokenError, ex: msg = ex[0] line = ex[1][0] self.out.write("<h3>ERROR: %s</h3>%s\n" % (msg, self.raw[self.lines[line]:]))
def _read_block(input, startlineno): r"""Read an indented block of expressions startlineno is *zero* origined line number. pre:: input.readline # must have readline function Examples: #>>> _read_block(StringIO('\tfoo:\n'), 0) #0 >>> _read_block(StringIO('\tpost[]: True\n'), 0) ('post', [], [('True', 1)], 1) >>> _read_block(StringIO('\tpre: 5 + 6 > 10\n'), 0) ('pre', [], [('5 + 6 > 10', 1)], 1) >>> _read_block(StringIO('\tpost:\n\t\t5 + 6 < 12\n\t\t2 + 2 == 4\n'), 0) ('post', [], [('5 + 6 < 12', 2), ('2 + 2 == 4', 3)], 3) >>> _read_block(StringIO('\tpost[foo.bar]: # changes\n' \ ... '\t\tlen(foo.bar) > 0\n'), 0) ('post', [['foo', 'bar']], [('len ( foo . bar ) > 0', 2)], 2) Handles double colons (for re-structured text):: >>> _read_block(StringIO('\tpre:: 5 + 6 > 10\n'), 0) ('pre', [], [('5 + 6 > 10', 1)], 1) """ t = tokenizer(input, startlineno) try: tokenize.tokenize(input.readline, t.next) except Done: pass input.seek(t.offset) return (t.keyword, t.decls, t.exprs, t.endlineno)
def check_roundtrip(self, f): """ Test roundtrip for `untokenize`. `f` is an open file or a string. The source code in f is tokenized to both 5- and 2-tuples. Both sequences are converted back to source code via tokenize.untokenize(), and the latter tokenized again to 2-tuples. The test fails if the 3 pair tokenizations do not match. When untokenize bugs are fixed, untokenize with 5-tuples should reproduce code that does not contain a backslash continuation following spaces. A proper test should test this. """ # Get source code and original tokenizations if isinstance(f, str): code = f.encode('utf-8') else: code = f.read() f.close() readline = iter(code.splitlines(keepends=True)).__next__ tokens5 = list(tokenize(readline)) tokens2 = [tok[:2] for tok in tokens5] # Reproduce tokens2 from pairs bytes_from2 = untokenize(tokens2) readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__ tokens2_from2 = [tok[:2] for tok in tokenize(readline2)] self.assertEqual(tokens2_from2, tokens2) # Reproduce tokens2 from 5-tuples bytes_from5 = untokenize(tokens5) readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__ tokens2_from5 = [tok[:2] for tok in tokenize(readline5)] self.assertEqual(tokens2_from5, tokens2)
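# A minimal, hedged illustration of the 5-tuple vs 2-tuple round trips that the
# docstring above describes (this is not part of the test itself): full
# TokenInfo tuples reproduce the source exactly, while 2-tuples go through
# untokenize's compatibility mode, which regenerates equivalent but respaced code.
import io, tokenize

code = b"x = (1 + 2) * 3\n"
tokens5 = list(tokenize.tokenize(io.BytesIO(code).readline))
assert tokenize.untokenize(tokens5) == code                  # exact round trip
recoded = tokenize.untokenize(tok[:2] for tok in tokens5)    # respaced but equivalent
assert compile(recoded, "<roundtrip>", "exec") is not None   # still valid source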
def format(self, linenumber=True): """ Parse and send the colored source. """ # store line offsets in self.lines self.lines = [0, 0] pos = 0 while 1: pos = self.raw.find('\n', pos) + 1 if not pos: break self.lines.append(pos) self.lines.append(len(self.raw)) # write line numbers if linenumber: self.result.append('<table border="0"><tr><td align="right" valign="top">') self.result.append('<td align="right" valign="top"><pre><font face="Lucida,Courier New" color="%s">' % _colors[_TEXT]) for idx in range(1, len(self.lines)-1): self.result.append('%3d \n' % idx) self.result.append('</font></pre></td><td valign="top">') # parse the source and write it self.pos = 0 text = StringIO.StringIO(self.raw) self.result.append('<pre><font face="Lucida,Courier New">') try: tokenize.tokenize(text.readline, self) except tokenize.TokenError, ex: msg = ex[0] line = ex[1][0] self.result.append("<h3>ERROR: %s</h3>%s\n" % ( msg, self.raw[self.lines[line]:]))
def parse(self, source): """ Parse and send the colored source. """ self.source = string.expandtabs(source) self.tokenlist = [] # store line offsets in self.offset self.offset = [0, 0] self.lines = 0 pos = 0 while pos < len(self.source): self.lines = self.lines + 1 pos = string.find(self.source, '\n', pos) + 1 if not pos: break self.offset.append(pos) self.offset.append(len(self.source)) # parse the source self.pos = 0 text = cStringIO.StringIO(self.source) try: tokenize.tokenize(text.readline, self) except tokenize.TokenError, ex: msg = ex[0] line = ex[1][0] raise ParseError("ERROR %s\n%s" % ( msg, self.source[self.offset[line]:]))
def format(self, formatter): """ Parse and send the colored source. """ # store line offsets in self.lines self.lines = [0, 0] pos = 0 while 1: try: pos = self.raw.index('\n', pos) + 1 except ValueError: break self.lines.append(pos) self.lines.append(len(self.raw)) self.result = [] # collects output self._code_id = hash_new('sha1', self.raw.encode(config.charset)).hexdigest() self.result.append(formatter.code_area(1, self._code_id, 'ColorizedPython', self.show_num, self.num_start, self.num_step)) self.formatter = formatter self.result.append(formatter.code_line(1)) #len('%d' % (len(self.lines)-1, ))) # parse the source and write it self.pos = 0 text = StringIO.StringIO(self.raw) try: tokenize.tokenize(text.readline, self) except IndentationError, ex: msg = ex[0] errmsg = (self.formatter.linebreak() + self.formatter.strong(1) + "ERROR: %s" % msg + self.formatter.strong(0) + self.formatter.linebreak()) self.result.append(errmsg)
def colorize(self): """ Return an HTML string that renders the source code for the module that was specified in the constructor. """ # Initialize all our state variables self.pos = 0 self.cur_line = [] self.context = [] self.indents = [] self.lineno = 1 self.def_name = None # Load the module's text. self.text = open(self.module_filename).read() self.text = self.text.expandtabs().rstrip()+'\n' # Construct the line_offsets table. self.find_line_offsets() num_lines = self.text.count('\n')+1 self.linenum_size = len(`num_lines+1`) # Call the tokenizer, and send tokens to our `tokeneater()` # method. If anything goes wrong, then fall-back to using # the input text as-is (with no colorization). try: output = StringIO() self.out = output.write tokenize.tokenize(StringIO(self.text).readline, self.tokeneater) html = output.getvalue() except tokenize.TokenError, ex: html = self.text
def stringioize(self, string): """(internal) the following is really just a stupid hack to emulate the quirky behavior of the string tokenizer in java; it is a historical artifact that just isn't badly broken enough to require being removed yet. """ self.tokens = [] self._neg = None fd = StringIO.StringIO(string) tokenize.tokenize(fd.readline,self.eat) self.reset() sn = self.next() try: while sn.ttype != tokenize.ERRORTOKEN: sn = self.next() # this is the best part. It works completely by accident. # After 3 tries, you end up with a """ on the end of your # string, which is a multi-line string -- the tokenizer # will throw an exception for that (god knows why it # doesn't throw an exception for an EOF in a single-line # string...) self.stringioize(string+'"') except: pass # import traceback # traceback.print_exc() self.reset()
def __waiting(self, ttype, tstring, lineno): opts = self.__options # Do docstring extractions, if enabled if opts.docstrings and not opts.nodocstrings.get(self.__curfile): # module docstring? if self.__freshmodule: if ttype == tokenize.STRING: self.__addentry(safe_eval(tstring), lineno, isdocstring=1) self.__freshmodule = 0 elif ttype not in (tokenize.COMMENT, tokenize.NL): self.__freshmodule = 0 return # class docstring? if ttype == tokenize.NAME and tstring in ('class', 'def'): self.__state = self.__suiteseen return if ttype == tokenize.NAME and tstring in opts.keywords: self.__state = self.__keywordseen # In order to extract messages encapsulated in a string; for example, in view.mako: # var vLogTypeList = Array("${_('System')}", "${_('Traffic')}"); pattern = '(%s)\(.*\)' % '|'.join(opts.keywords) if ttype == tokenize.STRING and re.search(pattern, tstring): tstring = tstring.strip('\'"') tokenize.tokenize(StringIO.StringIO(tstring).readline, self)
def findMultiLineQuote(s):
    quotelist = []
    def eatToken(type, string, begin, end, _, quotelist=quotelist):
        if type == token.STRING and RE_MULTI_LINE_QUOTE_BEGIN.match(string):
            quotelist.append((string, begin, end))
    tokenize.tokenize(StringIO(s).readline, eatToken)
    return quotelist
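# Hedged Python 3 sketch of findMultiLineQuote above: the callback form of
# tokenize.tokenize() no longer exists, so this iterates generate_tokens()
# instead. RE_MULTI_LINE_QUOTE_BEGIN is assumed to be the regex used by the
# original snippet.
import io, token, tokenize

def find_multi_line_quote_py3(s):
    quotelist = []
    for tok in tokenize.generate_tokens(io.StringIO(s).readline):
        if tok.type == token.STRING and RE_MULTI_LINE_QUOTE_BEGIN.match(tok.string):
            quotelist.append((tok.string, tok.start, tok.end))
    return quotelist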
def format(self, showLineNums=0):
    """ Parse and send the colored source. """
    # store line offsets in self.lines
    self.lines = [0, 0]
    pos = 0
    while 1:
        pos = string.find(self.raw, '\n', pos) + 1
        if not pos:
            break
        self.lines.append(pos)
    self.lines.append(len(self.raw))
    # write line numbers
    if showLineNums:
        self.lineNums = cStringIO.StringIO()
        self.lineNums.write('<pre>')
        for idx in range(1, len(self.lines)-1):
            self.lineNums.write('%3d \n' % idx)
        self.lineNums.write('</pre>')
        #self.out.write('<pre>')
    # parse the source and write it
    self.pos = 0
    text = cStringIO.StringIO(self.raw)
    try:
        tokenize.tokenize(text.readline, self)
    except tokenize.TokenError, ex:
        msg = ex[0]
        line = ex[1][0]
        self.out.write('[ERROR: %s]<font color="red">%s</font>\n' % (
            msg, self.raw[self.lines[line]:]))
def format(self, formatter, form): ''' Parse and send the colored source. ''' # Store line offsets in self.lines self.lines = [0, 0] pos = 0 # Gather lines while 1: pos = string.find(self.raw, '\n', pos) + 1 if not pos: break self.lines.append(pos) self.lines.append(len(self.raw)) # Wrap text in a filelike object self.pos = 0 text = cStringIO.StringIO(self.raw) # Html start self.doPageStart() # Parse the source. ## Tokenize calls the __call__ ## function for each token till done. try: tokenize.tokenize(text.readline, self) except tokenize.TokenError, ex: msg = ex[0] line = ex[1][0] self.out.write("<h3>ERROR: %s</h3>%s\n" % ( msg, self.raw[self.lines[line]:]))
def format(self): """ Parse and send the colored source. """ # store line offsets in self.lines self.lines = [0, 0] pos = 0 while True: pos = string.find(self.raw, '\n', pos) + 1 if not pos: break self.lines.append(pos) self.lines.append(len(self.raw)) # parse the source and write it self.pos = 0 text = StringIO(self.raw) self.out.write('<pre class="python">\n') try: tokenize.tokenize(text.readline, self) except tokenize.TokenError as ex: msg = ex[0] line = ex[1][0] self.out.write("<h5 class='error>'ERROR: %s%s</h5>" % ( msg, self.raw[self.lines[line]:])) self.out.write('\n</pre>\n')
def readFile(file, state): """ readFile( filename, State-object) Open the config file 'filename' and pass file descriptor to the tokenizer. Returns: nothing """ # Get the directory name of the current config file #state.dir = file[:string.rfind(file, '/')]+'/' state.dir = os.path.dirname( file ) try: conf = open(file, 'r') except IOError: print "Error opening file '%s'" % file; log.log( "<parseConfig>readFile(), Error, Cannot open '%s' - skipping" % (file), 4 ) return # add this filename to the list of config files configfiles.append(file) # Let tokenize.tokenize() parse the file into tokens which it will pass to # state.tokeneater() which will parse the tokens and create something # meaningful. try: tokenize.tokenize(conf.readline, state.tokeneater) except tokenize.TokenError, msg: raise config.ParseFailure, "Syntax error, %s"%(msg)
def format(self, formatter): """ Parse and send the colored source. """ # store line offsets in self.lines self.lines = [0, 0] pos = 0 while 1: pos = self.raw.find('\n', pos) + 1 if not pos: break self.lines.append(pos) self.lines.append(len(self.raw)) self._code_id = sha.new(self.raw.encode(config.charset)).hexdigest() self.request.write(formatter.code_area(1, self._code_id, 'ColorizedPython', self.show_num, self.num_start, self.num_step)) self.formatter = formatter self.request.write(formatter.code_line(1)) #len('%d' % (len(self.lines)-1, ))) # parse the source and write it self.pos = 0 text = StringIO.StringIO(self.raw) try: tokenize.tokenize(text.readline, self) except tokenize.TokenError, ex: msg = ex[0] line = ex[1][0] self.request.write("<b>ERROR: %s</b><br>%s\n" % ( msg, self.formatter.text(self.raw[self.lines[line]:])))
def inspect_traceback(tb): """Inspect a traceback and its frame, returning source for the expression where the exception was raised, with simple variable replacement performed and the line on which the exception was raised marked with '>>' """ log.debug('inspect traceback %s', tb) # we only want the innermost frame, where the exception was raised while tb.tb_next: tb = tb.tb_next frame = tb.tb_frame lines, exc_line = tbsource(tb) # figure out the set of lines to grab. inspect_lines, mark_line = find_inspectable_lines(lines, exc_line) src = StringIO(textwrap.dedent(''.join(inspect_lines))) exp = Expander(frame.f_locals, frame.f_globals) while inspect_lines: try: tokenize.tokenize(src.readline, exp) except tokenize.TokenError, e: # this can happen if our inspectable region happens to butt up # against the end of a construct like a docstring with the closing # """ on separate line log.debug("Tokenizer error: %s", e) inspect_lines.pop(0) mark_line -= 1 src = StringIO(textwrap.dedent(''.join(inspect_lines))) exp = Expander(frame.f_locals, frame.f_globals) continue break
def format(self, formatter, form): """ Parse and send the colored source. """ # store line offsets in self.lines self.lines = [0, 0] pos = 0 while 1: pos = string.find(self.raw, '\n', pos) + 1 if not pos: break self.lines.append(pos) self.lines.append(len(self.raw)) # parse the source and write it self.pos = 0 text = cStringIO.StringIO(self.raw) self.out.write("""<?xml version="1.0" encoding="utf-8" ?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> <head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <meta name="generator" content="Color.py" /> <title>%s</title> <link rel="stylesheet" href="../doc.css" type="text/css" /> </head> <body> <pre class="literal-block">""" % self.title) try: tokenize.tokenize(text.readline, self) except tokenize.TokenError, ex: msg = ex[0] line = ex[1][0] self.out.write("<h3>ERROR: %s</h3>%s\n" % ( msg, self.raw[self.lines[line]:]))
def check(file):
    if os.path.isdir(file) and not os.path.islink(file):
        names = os.listdir(file)
        for name in names:
            fullname = os.path.join(file, name)
            if (os.path.isdir(fullname) and
                    not os.path.islink(fullname) or
                    os.path.normcase(name[-3:]) == ".py"):
                check(fullname)
        return

    f = open(file)
    reset_globals()
    try:
        tokenize.tokenize(f.readline, tokeneater)
    except NannyNag, nag:
        badline = nag.get_lineno()
        return badline
def getTokens(command): """Return list of token tuples for command.""" # In case the command is unicode try encoding it if isinstance(command, str): try: command = command.encode('utf-8') except UnicodeEncodeError: pass # otherwise leave it alone f = StringIO(command) # tokens is a list of token tuples, each looking like: # (type, string, (srow, scol), (erow, ecol), line) tokens = [] # Can't use list comprehension: # tokens = [token for token in tokenize.generate_tokens(f.readline)] # because of need to append as much as possible before TokenError. try: ## This code wasn't backward compatible with Python 2.1.3. ## ## for token in tokenize.generate_tokens(f.readline): ## tokens.append(token) # This works with Python 2.1.3 (with nested_scopes). if not PY3: def eater(*args): tokens.append(args) tokenize.tokenize_loop(f.readline, eater) else: tokenize.tokenize(f.readline) except tokenize.TokenError: # This is due to a premature EOF, which we expect since we are # feeding in fragments of Python code. pass return tokens
def extracts(self): # calculate escapes make_escapes(self.options.escape) # calculate all keywords self.options.keywords.extend(default_keywords) # slurp through all the files eater = TokenEater(self.options) fp = self.pythonCode closep = 1 try: # eater.set_filename(self.filename) try: tokenize.tokenize(fp.readline, eater) except tokenize.TokenError, e: print >> sys.stderr, '%s: %s, line %d, column %d' % ( e[0], filename, e[1][0], e[1][1]) finally: if closep: fp.close() # write the output fp = sys.stdout closep = 0 res=[] try: res=eater.write(fp) finally: if closep: fp.close() return res
def py_strings(dir, domain="none", exclude=()): """Retrieve all Python messages from `dir` that are in the `domain`. """ eater = TokenEater() make_escapes(0) for filename in find_files( # We want to include cpy and vpy scripts as well # dir, '*.py', exclude=('extract.py', 'pygettext.py')+tuple(exclude)): # noqa dir, '*.*py', exclude=('extract.py', 'pygettext.py') + tuple(exclude) ): fp = codecs.open(filename, 'r', DEFAULT_CHARSET) try: eater.set_filename(filename) try: tokenize.tokenize(fp.readline, eater) except tokenize.TokenError, e: print >> sys.stderr, '%s: %s, line %d, column %d' % ( e[0], filename, e[1][0], e[1][1]) finally: fp.close() # One limitation of the Python message extractor is that it cannot # determine the domain of the string, since it is not contained anywhere # directly. The only way this could be done is by loading the module and # inspect the '_' function. For now we simply assume that all the found # strings have the domain the user specified. return eater.getCatalog()
def __call__(self, raw): """ Parse and send the colored source. """ self.out = cStringIO.StringIO() self.raw = raw.expandtabs().strip() # store line offsets in self.lines self.lines = [0, 0] pos = 0 while 1: pos = self.raw.find('\n', pos) + 1 if not pos: break self.lines.append(pos) self.lines.append(len(self.raw)) # # parse the source and write it self.pos = 0 text = cStringIO.StringIO(self.raw) self.out.write("<table width=100% cellpadding=0 cellspacing=0 " + """onclick="toggle_hidden('pysrc%d','toggle%d');"><tr> <td rowspan="3"> """ % (self.pysrcid, self.pysrcid) ) self.out.write("""<div class="pysrc" id="pysrc%dinv" style="display: none">...</div>"""% self.pysrcid) self.out.write('<div class="pysrc" id="pysrc%d" style="display: block ">'% self.pysrcid) try: tokenize.tokenize(text.readline, self.format) except tokenize.TokenError, ex: msg = ex[0] line = ex[1][0] print >> self.out, ("<h3>ERROR: %s</h3>%s" % (msg, self.raw[self.lines[line]:]))
def getblock(lines):
    """Extract the block of code at the top of the given list of lines."""
    blockfinder = BlockFinder()
    try:
        tokenize.tokenize(iter(lines).next, blockfinder.tokeneater)
    except (EndOfBlock, IndentationError):
        pass
    return lines[:blockfinder.last]
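# Hedged Python 3 sketch of getblock above (the callback form of
# tokenize.tokenize() is Python 2 only): generate_tokens() accepts a readline
# returning str, so the same list-of-lines iterator still works. BlockFinder
# and EndOfBlock are assumed to be the same helpers used by the original.
import tokenize

def getblock_py3(lines):
    blockfinder = BlockFinder()
    try:
        for tok in tokenize.generate_tokens(iter(lines).__next__):
            blockfinder.tokeneater(*tok)
    except (EndOfBlock, IndentationError):
        pass
    return lines[:blockfinder.last]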
def filter(inp, out, writer=HTMLWriter):
    out.write('<pre>\n')
    printer = TokenPrinter(writer(out.write).write).printtoken
    try:
        tokenize.tokenize(inp.readline, printer)
    except tokenize.TokenError:
        pass
    out.write('</pre>\n')
if token.LPAR <= toktype and toktype <= token.OP: toktype = token.OP elif toktype == token.NAME and keyword.iskeyword(toktext): toktype = _KEYWORD elif toktype == token.NAME and toktext in _colors.keys(): toktype = toktext style = _styles.get(toktype, _styles[_TEXT]) # send text out.write('<span class="%s">' % style) out.write(cgi.escape(toktext)) out.write('</span>') try: tokenize.tokenize(text.readline, call) except tokenize.TokenError, ex: msg = ex[0] line = ex[1][0] out.write("<h3>ERROR: %s</h3>%s\n" % (msg, raw[lines[line]:])) return 1 out.write('</font></pre></table>') info = str(source[1]) if os.path.isfile(info): sout = open(info) progs = sout.read() sout.close()
def tokenize_python(code, keep_comments=False, process_strings=True): assert isinstance(code, str) code = code.replace(r"\r", "") code = code.replace("\r", "") tokens = [] try: iterator = tokenize.tokenize(BytesIO(code.encode("utf-8")).readline) except SyntaxError as excep: raise SyntaxError(excep) removed_docstr = 0 while True: try: toktype, tok, _, _, line = next(iterator) except ( tokenize.TokenError, IndentationError, SyntaxError, UnicodeDecodeError, ) as e: raise ValueError( f'Impossible to parse tokens because of incorrect source code "{e}" ...' ) except StopIteration: raise Exception(f"End of iterator before ENDMARKER token.") if toktype == tokenize.ENCODING or toktype == tokenize.NL: continue elif toktype == tokenize.NEWLINE: if removed_docstr == 1: removed_docstr = 0 continue tokens.append("NEW_LINE") elif toktype == tokenize.COMMENT: if keep_comments: com = process_string( tok, PYTHON_CHAR2TOKEN, PYTHON_TOKEN2CHAR, True, do_whole_processing=process_strings, ) if len(com) > 0: tokens.append(com) else: continue elif toktype == tokenize.STRING: if tok == line.strip(): # docstring if not keep_comments: removed_docstr = 1 continue else: coms = process_string( tok, PYTHON_CHAR2TOKEN, PYTHON_TOKEN2CHAR, False, do_whole_processing=process_strings, ) if len(coms) > 0: tokens.append(coms) else: removed_docstr = 1 else: tokens.append( process_string( tok, PYTHON_CHAR2TOKEN, PYTHON_TOKEN2CHAR, False, do_whole_processing=process_strings, )) elif toktype == tokenize.INDENT: tokens.append("INDENT") elif toktype == tokenize.DEDENT: # empty block if tokens[-1] == "INDENT": tokens = tokens[:-1] else: tokens.append("DEDENT") elif toktype == tokenize.ENDMARKER: tokens.append("ENDMARKER") break else: tokens.append(tok) assert tokens[-1] == "ENDMARKER", "Error, no end marker" return tokens[:-1]
def preprocess(lines): lines = [l.rstrip() for l in lines] data = [(lnr, l) for lnr, l in enumerate(lines)] # Handle line continuation. no_backslash = [] while data: lnr, l = data.pop(0) while l.endswith("\\"): l = l[:-1] try: _, nextl = data.pop(0) l += nextl except IndexError: break no_backslash.append((lnr, l.strip())) variables = dict(mem for mem in inspect.getmembers(math) if not mem[0].startswith("_")) variables.update({c: Reg(c) for c in string.ascii_lowercase}) variables["pow"] = pow variables["math"] = math variables["data"] = Data num_instructions = 0 # Label pass. for lnr, l in no_backslash: if l.startswith("#") or not l: continue parse = re.search("^([a-zA-Z_][a-zA-Z0-9_]+)\s*(.*)", l) if parse is None: raise SyntaxError("Syntax error on line {}:\n{}".format( lnr + 1, lines[lnr])) ident, rest = parse.groups() if rest.startswith(":"): rest = rest[1:].lstrip() if not (rest.startswith("#") or not rest): raise SyntaxError( "Trailing characters after label on line {}:\n{}".format( lnr + 1, lines[lnr])) if ident in variables and isinstance(variables[ident], Label): raise SyntaxError( "Duplicate label name on line {}:\n{}".format( lnr + 1, lines[lnr])) variables[ident] = Label(num_instructions, ident) elif not rest.startswith("="): num_instructions += 1 # Read instructions and assignments. instructions = [] for lnr, l in no_backslash: if l.startswith("#") or not l: continue # Syntax already checked last time. ident, rest = re.search("^([a-zA-Z_][a-zA-Z0-9_]+)\s*(.*)", l).groups() # Strip comments (reuse Python's tokenizer to correctly handle comments in strings, etc). tokens = tokenize.tokenize(io.BytesIO(rest.encode("utf-8")).readline) stripped_tokens = [] for typ, tok, _, _, _ in tokens: if typ == tokenize.COMMENT: continue stripped_tokens.append((typ, tok)) rest = tokenize.untokenize(stripped_tokens).decode("utf-8") # Assignment. if rest.startswith("="): if ident in variables and isinstance(variables[ident], Label): raise SyntaxError( "Overwriting label name on line {}:\n{}".format( lnr + 1, lines[lnr])) variables[ident] = eval(rest[1:], variables) # Instruction. elif not rest.startswith(":"): args = list(eval("(None, {})".format(rest), variables)[1:]) check_instr_arguments(ident, args, lnr, lines) instructions.append(Instr(lnr, ident, args)) return instructions
def _remake_command(self, cmd, selector=None, receiver=None): from tokenize import tokenize, untokenize, NAME, OP, STRING DOT = (OP, '.') COLON = (OP, ':') COMMA = (OP, ',') OBRAC = (OP, '[') CBRAC = (OP, ']') OPAR = (OP, '(') CPAR = (OP, ')') from io import BytesIO recommand = [] if receiver: recommand += [(NAME, receiver), OBRAC, COLON, CBRAC, (OP, '='), ] try: cmd_encode = cmd.encode('utf-8') except AttributeError: cmd_encode = str(cmd).encode('utf-8') dims = len(self.shape) g = tokenize(BytesIO(cmd_encode).readline) if selector is None: screen_tokens = [COLON,] else: # try: # slicer_encode = selector.encode('utf-8') # except AttributeError: # slicer_encode = str(selector).encode('utf-8') # screen_tokens = [(toknum, tokval) for toknum, tokval, _, _, _ in tokenize(BytesIO(slicer_encode).readline)] screen_tokens = [(NAME, 'selector'), ] for toknum, tokval, _, _, _ in g: if toknum == NAME and tokval in self.data: # replace NAME tokens partial = [(NAME, 'self'), DOT, (NAME, 'data'), DOT, (NAME, tokval), OBRAC, ] partial += screen_tokens if len(self._groupnode._v_children[tokval].shape)>1: partial += [COMMA, COLON, ] if len(self._groupnode._v_children[tokval].shape)>2: partial += [COMMA, COLON, ] if len(self._groupnode._v_children[tokval].shape)>3: partial += [COMMA, COLON, ] partial += [CBRAC,] recommand.extend(partial) elif toknum == NAME and tokval in self.lookup: # replace NAME tokens partial = [(NAME, 'self'), DOT, (NAME, 'lookup'), DOT, (NAME, tokval), OBRAC, ] partial += screen_tokens if len(self._groupnode._v_children[tokval].shape) > 1: partial += [COMMA, COLON, ] if len(self._groupnode._v_children[tokval].shape) > 2: partial += [COMMA, COLON, ] if len(self._groupnode._v_children[tokval].shape) > 3: partial += [COMMA, COLON, ] partial += [CBRAC, ] recommand.extend(partial) else: recommand.append((toknum, tokval)) # print("<recommand>") # print(recommand) # print("</recommand>") ret = untokenize(recommand).decode('utf-8') from .util.aster import asterize return asterize(ret, mode="exec" if receiver is not None else "eval"), ret
def split_to_lines(self, source): lines = [] current_line = 1 current_col = 0 buffer = "" current_type = None source_io = io.BytesIO(source.encode()) formatter = Formatter() def readline(): return formatter.format( formatter.escape(source_io.readline().decode())).encode() tokens = tokenize.tokenize(readline) line = "" for token_info in tokens: token_type, token_string, start, end, _ = token_info lineno = start[0] if lineno == 0: # Encoding line continue if token_type == tokenize.ENDMARKER: # End of source if current_type is None: current_type = self.TOKEN_DEFAULT line += "<{}>{}</>".format(self._theme[current_type], buffer) lines.append(line) break if lineno > current_line: if current_type is None: current_type = self.TOKEN_DEFAULT diff = lineno - current_line if diff > 1: lines += [""] * (diff - 1) line += "<{}>{}</>".format(self._theme[current_type], buffer.rstrip("\n")) # New line lines.append(line) line = "" current_line = lineno current_col = 0 buffer = "" if token_string in self.KEYWORDS: new_type = self.TOKEN_KEYWORD elif token_string in self.BUILTINS or token_string == "self": new_type = self.TOKEN_BUILTIN elif token_type == tokenize.STRING: new_type = self.TOKEN_STRING elif token_type == tokenize.NUMBER: new_type = self.TOKEN_NUMBER elif token_type == tokenize.COMMENT: new_type = self.TOKEN_COMMENT elif token_type == tokenize.OP: new_type = self.TOKEN_OP elif token_type == tokenize.NEWLINE: continue else: new_type = self.TOKEN_DEFAULT if current_type is None: current_type = new_type if start[1] > current_col: buffer += token_info.line[current_col:start[1]] if current_type != new_type: line += "<{}>{}</>".format(self._theme[current_type], buffer) buffer = "" current_type = new_type if lineno < end[0]: # The token spans multiple lines token_lines = token_string.split("\n") line += "<{}>{}</>".format(self._theme[current_type], token_lines[0]) lines.append(line) for token_line in token_lines[1:-1]: lines.append("<{}>{}</>".format(self._theme[current_type], token_line)) current_line = end[0] buffer = token_lines[-1][:end[1]] line = "" continue buffer += token_string current_col = end[1] current_line = lineno return lines
def build_code_comment_pairs(self): if not self.functions: code_path = self.base_path / "code.pkl" if isfile(code_path): self.functions = pickle.load(open(code_path, "rb")) else: raise RuntimeWarning("Function dataset has not been built!!") filepath = self.base_path / "{}.pkl".format(self.name) code_filepath = self.base_path / "clean_code_data.pkl" if isfile(filepath) and isfile(code_filepath): self.code_comment_pairs = pickle.load(open(filepath, "rb")) self.clean_code_data = pickle.load(open(code_filepath, "rb")) return num_docs = 0 for idx, (identifier, code) in enumerate(tqdm(self.functions.items())): found_doc = False clean_code, clean_doc = list(), "" try: token_code = list( tok.tokenize(BytesIO(code.encode('utf-8')).readline)) for tok_type, token, (line, _), _, full_line in token_code: if tok_type == tok.COMMENT or tok_type == tok.ENCODING: continue if tok_type == tok.STRING and ("\"\"\"" in token or "'''" in token): full_line = full_line.strip() if full_line.endswith("'''") or full_line.endswith( "\"\"\""): for tok_type2, token2, ( line2, _), _, full_line2 in token_code: if line2 == line - 1 and "def" in full_line2: found_doc = True break elif line2 >= line: break if found_doc: clean_token = token.strip("\"\"\"").strip( "'''").strip("r\"\"\"").strip() double_newline = clean_token.find("\n\n") if double_newline > 1: clean_token = clean_token[:double_newline] param_idx = clean_token.find("Parameters\n") param_colon = clean_token.find("Parameters:\n") arrow_idx = clean_token.find(">>>") long_line = clean_token.find("----------\n") example_colon = clean_token.find("Example::\n") examples_colon = clean_token.find( "Examples::\n") refs_colon = clean_token.find("References::\n") examples = clean_token.find("Examples\n") example_Usage = clean_token.find( "Example Usage:\n") example_usage = clean_token.find( "Example usage:\n") requirements = clean_token.find( "Requirements\n") see_also_idx = clean_token.find("See Also\n") indices = [ s for s in [ param_idx, param_colon, arrow_idx, long_line, example_colon, examples, examples_colon, refs_colon, example_usage, example_Usage, requirements, see_also_idx ] if s >= 0 ] if len(indices) > 0: clean_doc += clean_token[:min(indices)] else: clean_doc += clean_token # if "----------" in clean_doc or "Example" in clean_doc: # print(clean_token) clean_doc = clean_doc.strip() if len(clean_doc) > 1: num_docs += 1 else: found_doc = False else: clean_code.append("<STRING>") elif tok_type == tok.NEWLINE or tok_type == tok.NL: clean_code.append("<NEWLINE>") elif tok_type == tok.INDENT: clean_code.append("<TAB>") elif tok_type == tok.DEDENT: clean_code.append("<UNTAB>") elif tok_type == tok.ENDMARKER: clean_code.append("<END>") elif tok_type == tok.NUMBER: number_sequence = clean_number(token) clean_code.extend(number_sequence) elif tok_type == tok.STRING: clean_code.append("<STRING>") elif tok_type == tok.NAME: if token in RESERVED_WORDS or token in BUILTIN_FUNCTIONS: clean_code.append(token) else: identifier_sequence = clean_identifier(token) clean_code.extend(identifier_sequence) else: clean_code.extend(token.split()) self.clean_code_data[identifier] = clean_code if found_doc: clean_doc = word_tokenize(clean_doc) clean_doct_str = " ".join(clean_doc) first_period = clean_doct_str.find(" . ") if 0 < first_period < 5: second_period = clean_doct_str.find( " . 
", first_period + 3) clean_doct_str = clean_doct_str[:second_period + 3] elif first_period > 0: clean_doct_str = clean_doct_str[:first_period + 3] clean_doc = clean_doct_str.split() clean_doc = superclean_docstring(clean_doc) if len(clean_code) <= 3000 and len(clean_doc) <= 300: clean_code = ["<BoC>"] + clean_code + ["<EoC>"] clean_doc = ["<BoL>"] + clean_doc + ["<EoL>"] self.code_comment_pairs[identifier] = (clean_code, clean_doc) except tok.TokenError as e: print(e) # sys.exit() code = [(name, code) for name, (code, comm) in self.code_comment_pairs.items()] print("Sorting code") code.sort(key=lambda tup: tup[1]) print("Code is sorted") list_of_dup_lists = list() for idx, (name1, code1) in enumerate(tqdm(code, desc="Finding dups")): if idx < len(code): dup_list = list() for (name2, code2) in code[idx + 1:]: codestr1 = " ".join(code1) codestr2 = " ".join(code2) if codestr1 == codestr2: dup_list.extend([name1, name2]) else: break if len(dup_list) > 0: dup_list = list(set(dup_list)) dup_list.sort( key=lambda tup: (len(tup[0]), tup[1], tup[0])) list_of_dup_lists.append(dup_list) prev_length = len(self.code_comment_pairs.keys()) for dup_list in list_of_dup_lists: for key in dup_list[1:]: if key in self.code_comment_pairs: del self.code_comment_pairs[key] new_length = len(self.code_comment_pairs.keys()) print("Code/comm had {} examples, now has {} examples".format( prev_length, new_length)) code = [(name, code) for name, code in self.clean_code_data.items()] print("Sorting code") code.sort(key=lambda tup: tup[1]) print("Code is sorted") list_of_dup_lists = list() for idx, (name1, code1) in enumerate(tqdm(code, desc="Finding dups")): if idx < len(code): dup_list = list() for (name2, code2) in code[idx + 1:]: codestr1 = " ".join(code1) codestr2 = " ".join(code2) if codestr1 == codestr2: dup_list.extend([name1, name2]) else: break if len(dup_list) > 0: dup_list = list(set(dup_list)) dup_list.sort( key=lambda tup: (len(tup[0]), tup[1], tup[0])) list_of_dup_lists.append(dup_list) prev_length = len(self.clean_code_data.keys()) for dup_list in list_of_dup_lists: for key in dup_list[1:]: if key in self.clean_code_data: del self.clean_code_data[key] new_length = len(self.clean_code_data.keys()) print("Full code had {} examples, now has {} examples".format( prev_length, new_length)) pickle.dump(self.code_comment_pairs, open(filepath, "wb")) pickle.dump(self.clean_code_data, open(code_filepath, "wb"))
def set_by_str(self, f):
    tk_list = list(tokenize(BytesIO(f.strip().encode('utf-8')).readline))[1:-1]
    self.token_list = [tk.string for tk in tk_list]
    self.type_list = [token.tok_name[tk.type] for tk in tk_list]
def text(self, etype, evalue, etb, context=5): """Return a nice text document describing the traceback.""" # some locals try: etype = etype.__name__ except AttributeError: pass Colors = self.Colors # just a shorthand + quicker name lookup ColorsNormal = Colors.Normal # used a lot col_scheme = self.color_scheme_table.active_scheme_name indent = ' ' * INDENT_SIZE em_normal = '%s\n%s%s' % (Colors.valEm, indent, ColorsNormal) undefined = '%sundefined%s' % (Colors.em, ColorsNormal) exc = '%s%s%s' % (Colors.excName, etype, ColorsNormal) # some internal-use functions def text_repr(value): """Hopefully pretty robust repr equivalent.""" # this is pretty horrible but should always return *something* try: return pydoc.text.repr(value) except KeyboardInterrupt: raise except: try: return repr(value) except KeyboardInterrupt: raise except: try: # all still in an except block so we catch # getattr raising name = getattr(value, '__name__', None) if name: # ick, recursion return text_repr(name) klass = getattr(value, '__class__', None) if klass: return '%s instance' % text_repr(klass) except KeyboardInterrupt: raise except: return 'UNRECOVERABLE REPR FAILURE' def eqrepr(value, repr=text_repr): return '=%s' % repr(value) def nullrepr(value, repr=text_repr): return '' # meat of the code begins try: etype = etype.__name__ except AttributeError: pass if self.long_header: # Header with the exception type, python version, and date pyver = 'Python ' + string.split( sys.version)[0] + ': ' + sys.executable date = time.ctime(time.time()) head = '%s%s%s\n%s%s%s\n%s' % (Colors.topline, '-' * 75, ColorsNormal, exc, ' ' * (75 - len(str(etype)) - len(pyver)), pyver, string.rjust(date, 75)) head += "\nA problem occured executing Python code. Here is the sequence of function"\ "\ncalls leading up to the error, with the most recent (innermost) call last." else: # Simplified header head = '%s%s%s\n%s%s' % ( Colors.topline, '-' * 75, ColorsNormal, exc, string.rjust('Traceback (most recent call last)', 75 - len(str(etype)))) frames = [] # Flush cache before calling inspect. This helps alleviate some of the # problems with python 2.3's inspect.py. linecache.checkcache() # Drop topmost frames if requested try: # Try the default getinnerframes and Alex's: Alex's fixes some # problems, but it generates empty tracebacks for console errors # (5 blanks lines) where none should be returned. #records = inspect.getinnerframes(etb, context)[self.tb_offset:] #print 'python records:', records # dbg records = _fixed_getinnerframes(etb, context, self.tb_offset) #print 'alex records:', records # dbg except: # FIXME: I've been getting many crash reports from python 2.3 # users, traceable to inspect.py. If I can find a small test-case # to reproduce this, I should either write a better workaround or # file a bug report against inspect (if that's the real problem). # So far, I haven't been able to find an isolated example to # reproduce the problem. 
inspect_error() traceback.print_exc(file=Term.cerr) info( '\nUnfortunately, your original traceback can not be constructed.\n' ) return '' # build some color string templates outside these nested loops tpl_link = '%s%%s%s' % (Colors.filenameEm, ColorsNormal) tpl_call = 'in %s%%s%s%%s%s' % (Colors.vName, Colors.valEm, ColorsNormal) tpl_call_fail = 'in %s%%s%s(***failed resolving arguments***)%s' % \ (Colors.vName, Colors.valEm, ColorsNormal) tpl_local_var = '%s%%s%s' % (Colors.vName, ColorsNormal) tpl_global_var = '%sglobal%s %s%%s%s' % (Colors.em, ColorsNormal, Colors.vName, ColorsNormal) tpl_name_val = '%%s %s= %%s%s' % (Colors.valEm, ColorsNormal) tpl_line = '%s%%s%s %%s' % (Colors.lineno, ColorsNormal) tpl_line_em = '%s%%s%s %%s%s' % (Colors.linenoEm, Colors.line, ColorsNormal) # now, loop over all records printing context and info abspath = os.path.abspath for frame, file, lnum, func, lines, index in records: #print '*** record:',file,lnum,func,lines,index # dbg try: file = file and abspath(file) or '?' except OSError: # if file is '<console>' or something not in the filesystem, # the abspath call will throw an OSError. Just ignore it and # keep the original file string. pass link = tpl_link % file try: args, varargs, varkw, locals = inspect.getargvalues(frame) except: # This can happen due to a bug in python2.3. We should be # able to remove this try/except when 2.4 becomes a # requirement. Bug details at http://python.org/sf/1005466 inspect_error() traceback.print_exc(file=Term.cerr) info("\nIPython's exception reporting continues...\n") if func == '?': call = '' else: # Decide whether to include variable details or not var_repr = self.include_vars and eqrepr or nullrepr try: call = tpl_call % ( func, inspect.formatargvalues( args, varargs, varkw, locals, formatvalue=var_repr)) except KeyError: # Very odd crash from inspect.formatargvalues(). The # scenario under which it appeared was a call to # view(array,scale) in NumTut.view.view(), where scale had # been defined as a scalar (it should be a tuple). Somehow # inspect messes up resolving the argument list of view() # and barfs out. At some point I should dig into this one # and file a bug report about it. inspect_error() traceback.print_exc(file=Term.cerr) info("\nIPython's exception reporting continues...\n") call = tpl_call_fail % func # Initialize a list of names on the current line, which the # tokenizer below will populate. names = [] def tokeneater(token_type, token, start, end, line): """Stateful tokeneater which builds dotted names. The list of names it appends to (from the enclosing scope) can contain repeated composite names. This is unavoidable, since there is no way to disambguate partial dotted structures until the full list is known. The caller is responsible for pruning the final list of duplicates before using it.""" # build composite names if token == '.': try: names[-1] += '.' # store state so the next token is added for x.y.z names tokeneater.name_cont = True return except IndexError: pass if token_type == tokenize.NAME and token not in keyword.kwlist: if tokeneater.name_cont: # Dotted names names[-1] += token tokeneater.name_cont = False else: # Regular new names. We append everything, the caller # will be responsible for pruning the list later. It's # very tricky to try to prune as we go, b/c composite # names can fool us. The pruning at the end is easy # to do (or the caller can print a list with repeated # names if so desired. 
names.append(token) elif token_type == tokenize.NEWLINE: raise IndexError # we need to store a bit of state in the tokenizer to build # dotted names tokeneater.name_cont = False def linereader(file=file, lnum=[lnum], getline=linecache.getline): line = getline(file, lnum[0]) lnum[0] += 1 return line # Build the list of names on this line of code where the exception # occurred. try: # This builds the names list in-place by capturing it from the # enclosing scope. tokenize.tokenize(linereader, tokeneater) except IndexError: # signals exit of tokenizer pass except tokenize.TokenError, msg: _m = ("An unexpected error occurred while tokenizing input\n" "The following traceback may be corrupted or invalid\n" "The error message is: %s\n" % msg) error(_m) # prune names list of duplicates, but keep the right order unique_names = uniq_stable(names) # Start loop over vars lvals = [] if self.include_vars: for name_full in unique_names: name_base = name_full.split('.', 1)[0] if name_base in frame.f_code.co_varnames: if locals.has_key(name_base): try: value = repr(eval(name_full, locals)) except: value = undefined else: value = undefined name = tpl_local_var % name_full else: if frame.f_globals.has_key(name_base): try: value = repr(eval(name_full, frame.f_globals)) except: value = undefined else: value = undefined name = tpl_global_var % name_full lvals.append(tpl_name_val % (name, value)) if lvals: lvals = '%s%s' % (indent, em_normal.join(lvals)) else: lvals = '' level = '%s %s\n' % (link, call) if index is None: frames.append(level) else: frames.append('%s%s' % (level, ''.join( _formatTracebackLines(lnum, index, lines, Colors, lvals, col_scheme))))
def tokenize_module(module):
    with module.stream() as stream:
        readline = stream.readline
        return list(tokenize.tokenize(readline))
eater = TokenEater(options) for filename in args: if filename == '-': if options.verbose: print _('Reading standard input') fp = sys.stdin closep = 0 else: if options.verbose: print _('Working on %s') % filename fp = open(filename) closep = 1 try: eater.set_filename(filename) try: tokenize.tokenize(fp.readline, eater) except tokenize.TokenError, e: print >> sys.stderr, '%s: %s, line %d, column %d' % ( e[0], filename, e[1][0], e[1][1]) finally: if closep: fp.close() # write the output if options.outfile == '-': fp = sys.stdout closep = 0 else: if options.outpath: options.outfile = os.path.join(options.outpath, options.outfile) fp = open(options.outfile, 'w')
def make_new_code_method_from_source(source, func_name, cls_name): tokens = [] attributes = set() using_self = False g = tokenize(BytesIO(source.encode("utf-8")).readline) for toknum, tokval, _, _, _ in g: # logger.debug((tok_name[toknum], tokval)) if using_self == "self": if toknum == OP and tokval == ".": using_self = tokval continue elif toknum == OP and tokval in (",", ")"): tokens.append((NAME, "self")) using_self = False else: raise NotImplementedError( f"self{tokval} not supported by Transonic") if using_self == ".": if toknum == NAME: using_self = False tokens.append((NAME, "self_" + tokval)) attributes.add(tokval) continue else: raise NotImplementedError if toknum == NAME and tokval == "self": using_self = "self" continue tokens.append((toknum, tokval)) attributes = sorted(attributes) attributes_self = ["self_" + attr for attr in attributes] index_self = tokens.index((NAME, "self")) tokens_attr = [] for ind, attr in enumerate(attributes_self): tokens_attr.append((NAME, attr)) tokens_attr.append((OP, ",")) if tokens[index_self + 1] == (OP, ","): del tokens[index_self + 1] tokens = tokens[:index_self] + tokens_attr + tokens[index_self + 1:] index_func_name = tokens.index((NAME, func_name)) name_new_func = f"__for_method__{cls_name}__{func_name}" tokens[index_func_name] = (NAME, name_new_func) # change recursive calls if func_name in attributes: attributes.remove(func_name) index_rec_calls = [ index for index, (name, value) in enumerate(tokens) if value == "self_" + func_name ] # delete the occurrence of "self_" + func_name in function parameter del tokens[index_rec_calls[0] + 1] del tokens[index_rec_calls[0]] # consider the two deletes offset = -2 # adapt all recurrence calls for ind in index_rec_calls[1:]: # adapt the index to the inserts and deletes ind += offset tokens[ind] = (tokens[ind][0], name_new_func) # put the attributes in parameter for attr in reversed(attributes): tokens.insert(ind + 2, (1, ",")) tokens.insert(ind + 2, (1, "self_" + attr)) # consider the inserts offset += len(attributes) * 2 new_code = untokenize(tokens).decode("utf-8") return new_code, attributes, name_new_func
def split(s): """Splits one last token that needs to be autocompleted.""" # Treat magics specially, since they don't follow python syntax # and require '%%' symbols to be preserved magic_match = re.search(r'%%?\w+$', s) if magic_match: return magic_match.group(0) s2 = s.rstrip() if s != s2: # If there is whitespace at the end of the string # the completion token is empty return '' tokens = [] # Remove front whitespace, somehow it confuses tokenizer s = s.lstrip() try: # Convert input into readline analog lines = s.split('\n') # Add '\n to all lines except last one. lines[:-1] = [line + '\n' for line in lines[:-1]] # tokenize.tokenize has a different signature in python2 and python3. # # In both cases, it's important to gather tokens as we go: many inputs from # users are often incomplete python expressions, which will land us in the # `tokenize.TokenError` case below with an unexpected EOF. if six.PY3: # For python3, we need to yield lines of bytes, but our input is unicode, # so we decode each as we go. line_iterator = (line.encode('utf8') for line in lines) for out in tokenize.tokenize(line_iterator.__next__): tokens.append(out) else: readline = (e for e in lines).next accumulate = lambda *args: tokens.append(args) tokenize.tokenize(readline, accumulate) except tokenize.TokenError: # Tokenizer failed, usually an indication of not-terminated strings. # Remove all quotes and return the last sequence of not-spaces if not tokens: s = s.replace('"', ' ').replace("'", ' ').split() return s[-1] if s else '' except Exception: # pylint: disable=broad-except # If all else fails, use poor's man tokenizer s = s.split() return s[-1] if s else '' # First we check if there is unfinished quoted string. for each in reversed(tokens): if each[_TOKEN_TYPE] == tokenize.ERRORTOKEN and each[_TOKEN] in { "'", '"', '"""', "'''" }: line = each[_TOKEN_END][0] - 1 col = each[_TOKEN_END][1] return lines[line][col:] start_token = _find_expression_start(tokens) if start_token >= len(tokens): # This prevents us from generating random completions when there is # no completion to be generated return _last_real_token(tokens) start_pos = tokens[start_token][_TOKEN_START] first_line_index = start_pos[0] - 1 if first_line_index >= len(lines): return _last_real_token(tokens) first_line = lines[first_line_index][start_pos[1]:] result = first_line + ''.join(lines[first_line_index + 1:]) return result
def map_params(s, params): try: if s.split()[-1].startswith("http"): s = " ".join(s.split()[0:-1]) except: pass # TODO: this is discord specific if s.endswith(">"): s = s[0:s.rfind("<")] g = tokenize(BytesIO(s.strip().encode('utf-8')).readline) # Go through the parsed tokens and eliminate tokens used for string formatting input_toks = [] for toknum, tokval, _, _, _ in g: if toknum > token.N_TOKENS or toknum in [ token.ENDMARKER, token.ENCODING, token.NEWLINE ]: continue input_toks.append((tokval, toknum)) # Create a dictionary with the implicit parameter values output_vals = OrderedDict() for var in params: output_vals[var] = params[var]["default"] # Go through the input intoks = deque([None]) idx_in = 0 idx_out = 0 while idx_in < len(input_toks) and idx_out < len(output_vals): args = input_toks[idx_in][0] param = list(output_vals.keys())[idx_out] # If '-' operator is found, then look ahead of the inputs if args == "-" and idx_in != len(input_toks) - 1: idx_in += 1 intoks.append(args) continue # Check if it's a number num_sign = 1 if intoks[-1] == "-": if input_toks[idx_in][1] != NUMBER: args = intoks[-1] + args else: num_sign = -1 if params[param]["type"] == "int": output_vals[param] = int(args) * num_sign elif params[param]["type"] == "float": output_vals[param] = float(args) * num_sign else: output_vals[param] = args idx_in += 1 idx_out += 1 intoks.append(args) return output_vals
def update_event(self, inp=-1):
    self.set_output_val(0, tokenize.tokenize(self.input(0)))
def get_docstring_and_rest(filename): """Separate ``filename`` content between docstring and the rest Strongly inspired from ast.get_docstring. Parameters ---------- filename: str The path to the file containing the code to be read Returns ------- docstring: str docstring of ``filename`` category: list list of categories specified by the "# category:" comment rest: str ``filename`` content without the docstring lineno: int the line number on which the code starts Notes ----- This function adapted from the sphinx-gallery project; license: BSD-3 https://github.com/sphinx-gallery/sphinx-gallery/ """ node, content = _parse_source_file(filename) # Find the category comment find_category = re.compile('^#\s*category:\s*(.*)$', re.MULTILINE) match = find_category.search(content) if match is not None: category = match.groups()[0] # remove this comment from the content content = find_category.sub('', content) else: category = None if node is None: return SYNTAX_ERROR_DOCSTRING, category, content, 1 if not isinstance(node, ast.Module): raise TypeError("This function only supports modules. " "You provided {0}".format(node.__class__.__name__)) try: # In python 3.7 module knows its docstring. # Everything else will raise an attribute error docstring = node.docstring import tokenize from io import BytesIO ts = tokenize.tokenize(BytesIO(content).readline) ds_lines = 0 # find the first string according to the tokenizer and get # it's end row for tk in ts: if tk.exact_type == 3: ds_lines, _ = tk.end break # grab the rest of the file rest = '\n'.join(content.split('\n')[ds_lines:]) lineno = ds_lines + 1 except AttributeError: # this block can be removed when python 3.6 support is dropped if node.body and isinstance(node.body[0], ast.Expr) and \ isinstance(node.body[0].value, ast.Str): docstring_node = node.body[0] docstring = docstring_node.value.s # python2.7: Code was read in bytes needs decoding to utf-8 # unless future unicode_literals is imported in source which # make ast output unicode strings if hasattr(docstring, 'decode') and not isinstance(docstring, six.text_type): docstring = docstring.decode('utf-8') lineno = docstring_node.lineno # The last line of the string. # This get the content of the file after the docstring last line # Note: 'maxsplit' argument is not a keyword argument in python2 rest = content.split('\n', lineno)[-1] lineno += 1 else: docstring, rest = '', '' if not docstring: raise ValueError(('Could not find docstring in file "{0}". ' 'A docstring is required for the example gallery.') .format(filename)) return docstring, category, rest, lineno
#!/usr/bin/env python3 import sys import tokenize import nbformat.v4 as nbf4 import re FILE = sys.argv[1] nb_cells = [] flag_nl, buffer, pline = 0, list(), -1 for token in tokenize.tokenize(open(FILE, 'rb').readline): # print(token) if token.end == (0, 0): continue # auto encoding if token.start[0] == 1 and token.line.startswith('#!'): continue # auto encoding if token.type == 4: continue # line break if token.start[0] > pline: buffer.append(token.line) pline = token.start[0] if token.type == 57: # comment if re.search('^# \*+ ', token.line): buffer.pop() block = "".join(buffer).strip() if block: nb_cells.append(nbf4.new_code_cell(source=block)) heading = token.line.replace('*', '#') nb_cells.append(nbf4.new_markdown_cell(source=heading.strip())) buffer.clear()
def _get_tokens(source: bytes) -> Sequence[tokenize.TokenInfo]:
    return tuple(tokenize.tokenize(io.BytesIO(source).readline))
def _compile(s, fname):
    tokens = tokenize.tokenize(s)
    t = parse.parse(s, tokens)
    r = encode.encode(fname, s, t)
    return r
def _tokenize_string(s):
    return tokenize.tokenize(io.BytesIO(s.encode("utf-8")).readline)
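# Example use of the helper above (a sketch; the helper name is taken from the
# snippet it follows): tokenize.tokenize() yields TokenInfo namedtuples, the
# first of which is the ENCODING token.
import tokenize

for tok in _tokenize_string("total = price * 1.2  # add VAT"):
    print(tokenize.tok_name[tok.type], repr(tok.string))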
def tokens(readline, tokeneater):
    for token in tokenize.tokenize(readline):
        yield tokeneater(*token)
def replacetokens(tokens, fullname): """Transform a stream of tokens from raw to Python 3. It is called by the custom module loading machinery to rewrite source/tokens between source decoding and compilation. Returns a generator of possibly rewritten tokens. The input token list may be mutated as part of processing. However, its changes do not necessarily match the output token stream. REMEMBER TO CHANGE ``BYTECODEHEADER`` WHEN CHANGING THIS FUNCTION OR CACHED FILES WON'T GET INVALIDATED PROPERLY. """ futureimpline = False # The following utility functions access the tokens list and i index of # the for i, t enumerate(tokens) loop below def _isop(j, *o): """Assert that tokens[j] is an OP with one of the given values""" try: return tokens[j].type == token.OP and tokens[j].string in o except IndexError: return False def _findargnofcall(n): """Find arg n of a call expression (start at 0) Returns index of the first token of that argument, or None if there is not that many arguments. Assumes that token[i + 1] is '('. """ nested = 0 for j in range(i + 2, len(tokens)): if _isop(j, ')', ']', '}'): # end of call, tuple, subscription or dict / set nested -= 1 if nested < 0: return None elif n == 0: # this is the starting position of arg return j elif _isop(j, '(', '[', '{'): nested += 1 elif _isop(j, ',') and nested == 0: n -= 1 return None def _ensureunicode(j): """Make sure the token at j is a unicode string This rewrites a string token to include the unicode literal prefix so the string transformer won't add the byte prefix. Ignores tokens that are not strings. Assumes bounds checking has already been done. """ st = tokens[j] if st.type == token.STRING and st.string.startswith(("'", '"')): tokens[j] = st._replace(string='u%s' % st.string) for i, t in enumerate(tokens): # Convert most string literals to byte literals. String literals # in Python 2 are bytes. String literals in Python 3 are unicode. # Most strings in Mercurial are bytes and unicode strings are rare. # Rather than rewrite all string literals to use ``b''`` to indicate # byte strings, we apply this token transformer to insert the ``b`` # prefix nearly everywhere. if t.type == token.STRING: s = t.string # Preserve docstrings as string literals. This is inconsistent # with regular unprefixed strings. However, the # "from __future__" parsing (which allows a module docstring to # exist before it) doesn't properly handle the docstring if it # is b''' prefixed, leading to a SyntaxError. We leave all # docstrings as unprefixed to avoid this. This means Mercurial # components touching docstrings need to handle unicode, # unfortunately. if s[0:3] in ("'''", '"""'): yield t continue # If the first character isn't a quote, it is likely a string # prefixing character (such as 'b', 'u', or 'r'. Ignore. if s[0] not in ("'", '"'): yield t continue # String literal. Prefix to make a b'' string. yield t._replace(string='b%s' % t.string) continue # Insert compatibility imports at "from __future__ import" line. # No '\n' should be added to preserve line numbers. 
if (t.type == token.NAME and t.string == 'import' and all(u.type == token.NAME for u in tokens[i - 2:i]) and [u.string for u in tokens[i - 2:i]] == ['from', '__future__']): futureimpline = True if t.type == token.NEWLINE and futureimpline: futureimpline = False if fullname == 'mercurial.pycompat': yield t continue r, c = t.start l = (b'; from mercurial.pycompat import ' b'delattr, getattr, hasattr, setattr, xrange, ' b'open, unicode\n') for u in tokenize.tokenize(io.BytesIO(l).readline): if u.type in (tokenize.ENCODING, token.ENDMARKER): continue yield u._replace(start=(r, c + u.start[1]), end=(r, c + u.end[1])) continue # This looks like a function call. if t.type == token.NAME and _isop(i + 1, '('): fn = t.string # *attr() builtins don't accept byte strings to 2nd argument. if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and not _isop(i - 1, '.')): arg1idx = _findargnofcall(1) if arg1idx is not None: _ensureunicode(arg1idx) # .encode() and .decode() on str/bytes/unicode don't accept # byte strings on Python 3. elif fn in ('encode', 'decode') and _isop(i - 1, '.'): for argn in range(2): argidx = _findargnofcall(argn) if argidx is not None: _ensureunicode(argidx) # It changes iteritems/values to items/values as they are not # present in Python 3 world. elif fn in ('iteritems', 'itervalues'): yield t._replace(string=fn[4:]) continue # Emit unmodified token. yield t
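replacetokens rewrites Mercurial's token stream before compilation; the same general pattern, boiled down to a toy that re-emits modified tokens through tokenize.untokenize (an illustrative sketch, not Mercurial's code):

import io
import tokenize

def add_bytes_prefix(source: bytes) -> bytes:
    # Prefix unprefixed string literals with b'' and round-trip the tokens.
    out = []
    for tok in tokenize.tokenize(io.BytesIO(source).readline):
        if tok.type == tokenize.STRING and tok.string[0] in "'\"":
            tok = tok._replace(string="b" + tok.string)
        out.append(tok)
    return tokenize.untokenize(out)

print(add_bytes_prefix(b"x = 'hello'").decode("utf-8"))  # x = b'hello'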
import os
import re
import tokenize

matcher = re.compile(r'def .*\(.*\):|class .*\(.*\):')
source_file = os.path.join(os.getcwd(), '../snickers/main.py')
#proc = subprocess.Popen("egrep 'def|class' {}".format(source_file), stdout=subprocess.PIPE)
#readline = proc.readline
readline = os.popen("egrep 'def|class' {}".format(source_file)).readline
# source_file_handle = open(source_file, 'rb')
# readline = source_file_handle.readline

# Types = type: def, params: [], parent_node: <pointer>
# Types = type: class, children: [], parent_node: <pointer>
module_objects = []
# os.popen() yields text lines, so use generate_tokens(); tokenize.tokenize()
# expects a readline that returns bytes.
tokens_g = tokenize.generate_tokens(readline)
look_for = 'class_or_function'
current_node = module_objects
indentation_count = 0

def extract_interesting_lines(string):
    ss = string.split("\n")

def state_append(c_node, obj, current_indentation):
    obj['parent_node'] = c_node
    obj['current_indentation'] = current_indentation
    if type(c_node) == list:
        c_node.append(obj)
    elif type(c_node) == dict:
        if c_node['type'] in ['class', 'def'] and obj['type'] in ['def', 'class']:
            c_node['children'].append(obj)
def __setTokens__(self): # g = tokenize(BytesIO(self._codes.encode('utf-8')).readline) # tokenize the string prev_num = -1 prev_val = None prev_end = -1 self.__ReplaceReserved__() # Split _codes line by line and identify each line ss = self._processed_codes.splitlines() # pdb.set_trace() for line in ss: try: # call python tokenize.tokenize and get the returned generator g g = tokenize(BytesIO(line.encode('utf-8')).readline) # tokenize the string try: for toknum, tokval, starrt, eend, _ in g: # pdb.set_trace() chop_start = 0 chop_end = len(tokval) - 1 # pdb.set_trace() # if the token type is NAME / OP / NUMBER and not only consists of [,)\-\"';\[\]|..+]+ if (toknum in [NAME, OP, NUMBER, ERRORTOKEN] and re.compile( r"^(?<![a-zA-Z])([\ ,):\"';\[\]}\{]+|\.\.+)(?![a-zA-Z])$").search(tokval) == None): # pdb.set_trace() # Take xx( / < / > as one token, instead of two, eg. xx and ( if (((prev_num == NAME and tokval == '(') or ( prev_val == '&' and (tokval == 'lt' or tokval == 'gt'))) and prev_end == starrt): self._tokens[-1] = self._tokens[-1] + tokval elif (tokval == '('): pass elif (toknum == NUMBER and int(tokval) in self._reserve_codes): self._tokens.append(self._reserve_codes[int(tokval)]) else: self._tokens.append(tokval) # For comment / string, code elif (toknum in [COMMENT, STRING]): # pdb.set_trace() if (toknum == STRING): # remove starting and ending ' / " while ((tokval[chop_start] == '"' or tokval[chop_start] == "'") and chop_start < chop_end): chop_start += 1 while ((tokval[chop_end] == '"' or tokval[chop_end] == "'") and chop_start < chop_end): chop_end -= 1 else: # remove starting # / ''' / """ while ((tokval[chop_start] == '#' and chop_start < chop_end) or (chop_end >= chop_start + 3 and tokval[chop_start:chop_start + 3] == "'''") or (chop_end >= chop_start + 3 and tokval[chop_start:chop_start + 3] == '"""')): if (tokval[chop_start] == '#'): chop_start += 1 else: chop_start += 3 if (chop_start < chop_end or (tokval[chop_start] not in ['#', "'", '"'])): words = CodesTokenizer(tokval[chop_start:chop_end + 1])._tokens if (words): self._tokens.extend(words) prev_num = toknum prev_val = tokval prev_end = eend except Exception as e: # print("Error in __setTokens__", e, line) # pdb.set_trace() pass except Exception as e: print("Error in __setTokens__", e, line) pdb.set_trace() pass
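__setTokens__ feeds each source line to the tokenizer separately and swallows errors from constructs that are incomplete when seen in isolation; a compact sketch of that per-line pattern (illustrative only, not this class's logic):

import io
import tokenize

def tokens_for_line(line):
    # Tokenize a single source line, tolerating unterminated strings and
    # open brackets by returning an empty token list instead of raising.
    try:
        return [
            tok for tok in tokenize.generate_tokens(io.StringIO(line).readline)
            if tok.type not in (tokenize.NEWLINE, tokenize.NL, tokenize.ENDMARKER)
        ]
    except (tokenize.TokenError, IndentationError):
        return []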
def old_mark_text_ranges(node, source: bytes): """ Node is an AST, source is corresponding source as string. Function adds recursively attributes end_lineno and end_col_offset to each node which has attributes lineno and col_offset. """ def _extract_tokens(tokens, lineno, col_offset, end_lineno, end_col_offset): return list( filter( (lambda tok: tok.start[0] >= lineno and (tok.start[1] >= col_offset or tok.start[0] > lineno ) and tok.end[0] <= end_lineno and (tok.end[1] <= end_col_offset or tok.end[0] < end_lineno ) and tok.string != ""), tokens, )) def _mark_text_ranges_rec(node, tokens, prelim_end_lineno, prelim_end_col_offset): """ Returns the earliest starting position found in given tree, this is convenient for internal handling of the siblings """ # set end markers to this node if "lineno" in node._attributes and "col_offset" in node._attributes: tokens = _extract_tokens( tokens, node.lineno, node.col_offset, prelim_end_lineno, prelim_end_col_offset, ) try: tokens = _mark_end_and_return_child_tokens( node, tokens, prelim_end_lineno, prelim_end_col_offset) except Exception: logging.getLogger("thonny").warning("Problem with marking %s", node) # fallback to incorrect marking instead of exception node.incorrect_range = True node.end_lineno = node.lineno node.end_col_offset = node.col_offset + 1 # mark its children, starting from last one # NB! need to sort children because eg. in dict literal all keys come first and then all values children = list(_get_ordered_child_nodes(node)) for child in reversed(children): (prelim_end_lineno, prelim_end_col_offset) = _mark_text_ranges_rec( child, tokens, prelim_end_lineno, prelim_end_col_offset) if "lineno" in node._attributes and "col_offset" in node._attributes: # new "front" is beginning of this node prelim_end_lineno = node.lineno prelim_end_col_offset = node.col_offset return (prelim_end_lineno, prelim_end_col_offset) def _strip_trailing_junk_from_expressions(tokens): while (tokens[-1].type not in ( token.RBRACE, token.RPAR, token.RSQB, token.NAME, token.NUMBER, token.STRING, token.ELLIPSIS, ) and tokens[-1].string != "..." # See https://bugs.python.org/issue31394 and tokens[-1].string not in ")}]" or tokens[-1].string in [ "and", "as", "assert", "class", "def", "del", "elif", "else", "except", "finally", "for", "from", "global", "if", "import", "in", "is", "lambda", "not", "or", "try", "while", "with", "yield", ]): del tokens[-1] def _strip_trailing_extra_closers(tokens, remove_naked_comma): level = 0 for i in range(len(tokens)): if tokens[i].string in "({[": level += 1 elif tokens[i].string in ")}]": level -= 1 if level == 0 and tokens[i].string == "," and remove_naked_comma: tokens[:] = tokens[0:i] return if level < 0: tokens[:] = tokens[0:i] return def _strip_unclosed_brackets(tokens): level = 0 for i in range(len(tokens) - 1, -1, -1): if tokens[i].string in "({[": level -= 1 elif tokens[i].string in ")}]": level += 1 if level < 0: tokens[:] = tokens[0:i] level = 0 # keep going, there may be more unclosed brackets def _mark_end_and_return_child_tokens(node, tokens, prelim_end_lineno, prelim_end_col_offset): """ # shortcut node.end_lineno = prelim_end_lineno node.end_col_offset = prelim_end_col_offset return tokens """ # prelim_end_lineno and prelim_end_col_offset are the start of # next positioned node or end of source, ie. 
the suffix of given # range may contain keywords, commas and other stuff not belonging to current node # Function returns the list of tokens which cover all its children if isinstance(node, ast.stmt): # remove empty trailing lines while tokens[-1].type in ( tokenize.NL, tokenize.COMMENT, token.NEWLINE, token.INDENT, ) or tokens[-1].string in (":", "else", "elif", "finally", "except"): del tokens[-1] else: _strip_trailing_extra_closers( tokens, not isinstance(node, (ast.Tuple, ast.Lambda))) _strip_trailing_junk_from_expressions(tokens) _strip_unclosed_brackets(tokens) # set the end markers of this node node.end_lineno = tokens[-1].end[0] node.end_col_offset = tokens[-1].end[1] # Peel off some trailing tokens which can't be part any # positioned child node. # TODO: maybe cleaning from parent side is better than # _strip_trailing_junk_from_expressions # Remove trailing empty parens from no-arg call if isinstance(node, ast.Call) and _tokens_text(tokens[-2:]) == "()": del tokens[-2:] # Remove trailing full slice elif isinstance(node, ast.Subscript): if _tokens_text(tokens[-3:]) == "[:]": del tokens[-3:] elif _tokens_text(tokens[-4:]) == "[::]": del tokens[-4:] # Attribute name would confuse the "value" of Attribute elif isinstance(node, ast.Attribute): assert tokens[-1].type == token.NAME del tokens[-1] _strip_trailing_junk_from_expressions(tokens) return tokens all_tokens = list(tokenize.tokenize(io.BytesIO(source).readline)) source_lines = source.splitlines(True) _fix_ast_problems(node, source_lines, all_tokens) prelim_end_lineno = len(source_lines) prelim_end_col_offset = len(source_lines[len(source_lines) - 1]) _mark_text_ranges_rec(node, all_tokens, prelim_end_lineno, prelim_end_col_offset)
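The manual end-position marking above is only needed on older interpreters; since Python 3.8 the parser records end positions on AST nodes itself. A quick illustration:

import ast

tree = ast.parse("x = [1,\n     2]\n")
assign = tree.body[0]
# Every positioned node carries end_lineno / end_col_offset on Python 3.8+.
print(assign.lineno, assign.col_offset, assign.end_lineno, assign.end_col_offset)
# expected output: 1 0 2 7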
def main(): global default_keywords try: opts, args = getopt.getopt( sys.argv[1:], "ad:DEhk:Kno:p:S:Vvw:x:X:", [ "extract-all", "default-domain=", "escape", "help", "keyword=", "no-default-keywords", "add-location", "no-location", "output=", "output-dir=", "style=", "verbose", "version", "width=", "exclude-file=", "docstrings", "no-docstrings", ], ) except getopt.error as msg: usage(1, msg) # for holding option values class Options: # constants GNU = 1 SOLARIS = 2 # defaults extractall = 0 # FIXME: currently this option has no effect at all. escape = 0 keywords = [] outpath = "" outfile = "messages.pot" writelocations = 1 locationstyle = GNU verbose = 0 width = 78 excludefilename = "" docstrings = 0 nodocstrings = {} options = Options() locations = {"gnu": options.GNU, "solaris": options.SOLARIS} # parse options for opt, arg in opts: if opt in ("-h", "--help"): usage(0) elif opt in ("-a", "--extract-all"): options.extractall = 1 elif opt in ("-d", "--default-domain"): options.outfile = arg + ".pot" elif opt in ("-E", "--escape"): options.escape = 1 elif opt in ("-D", "--docstrings"): options.docstrings = 1 elif opt in ("-k", "--keyword"): options.keywords.append(arg) elif opt in ("-K", "--no-default-keywords"): default_keywords = [] elif opt in ("-n", "--add-location"): options.writelocations = 1 elif opt in ("--no-location", ): options.writelocations = 0 elif opt in ("-S", "--style"): options.locationstyle = locations.get(arg.lower()) if options.locationstyle is None: usage(1, _("Invalid value for --style: %s") % arg) elif opt in ("-o", "--output"): options.outfile = arg elif opt in ("-p", "--output-dir"): options.outpath = arg elif opt in ("-v", "--verbose"): options.verbose = 1 elif opt in ("-V", "--version"): print(_("pygettext.py (xgettext for Python) %s") % __version__) sys.exit(0) elif opt in ("-w", "--width"): try: options.width = int(arg) except ValueError: usage(1, _("--width argument must be an integer: %s") % arg) elif opt in ("-x", "--exclude-file"): options.excludefilename = arg elif opt in ("-X", "--no-docstrings"): fp = open(arg) try: while 1: line = fp.readline() if not line: break options.nodocstrings[line[:-1]] = 1 finally: fp.close() # calculate escapes make_escapes(not options.escape) # calculate all keywords options.keywords.extend(default_keywords) # initialize list of strings to exclude if options.excludefilename: try: fp = open(options.excludefilename) options.toexclude = fp.readlines() fp.close() except IOError: print(_("Can't read --exclude-file: %s") % options.excludefilename, file=sys.stderr) sys.exit(1) else: options.toexclude = [] # resolve args to module lists expanded = [] for arg in args: if arg == "-": expanded.append(arg) else: expanded.extend(getFilesForName(arg)) args = expanded # slurp through all the files eater = TokenEater(options) for filename in args: if filename == "-": if options.verbose: print(_("Reading standard input")) fp = sys.stdin.buffer closep = 0 else: if options.verbose: print(_("Working on %s") % filename) fp = open(filename, "rb") closep = 1 try: eater.set_filename(filename) try: tokens = tokenize.tokenize(fp.readline) for _token in tokens: eater(*_token) except tokenize.TokenError as e: print("%s: %s, line %d, column %d" % (e.args[0], filename, e.args[1][0], e.args[1][1]), file=sys.stderr) finally: if closep: fp.close() # write the output if options.outfile == "-": fp = sys.stdout closep = 0 else: if options.outpath: options.outfile = os.path.join(options.outpath, options.outfile) fp = open(options.outfile, "w") closep = 1 try: 
eater.write(fp) finally: if closep: fp.close()
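The loop above drives a TokenEater instance with eater(*_token), unpacking each TokenInfo into the old five-argument callback signature. A toy eater showing that calling convention (not pygettext's actual class):

import io
import tokenize

class StringCollector:
    # Records the literal argument of _( ... ) calls; illustrative only.
    def __init__(self):
        self.found = []
        self._want_string = False

    def __call__(self, ttype, tstring, start, end, line):
        if ttype == tokenize.NAME and tstring == "_":
            self._want_string = True
        elif self._want_string and ttype == tokenize.STRING:
            self.found.append(tstring)
            self._want_string = False
        elif ttype != tokenize.OP:
            self._want_string = False

eater = StringCollector()
for tok in tokenize.tokenize(io.BytesIO(b'print(_("hello"))\n').readline):
    eater(*tok)
print(eater.found)  # ['"hello"']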
def run(self): tokenize.tokenize(self.getline, self.tokeneater) # Remove trailing empty lines. lines = self.lines while lines and lines[-1] == "\n": lines.pop() # Sentinel. stats = self.stats stats.append((len(lines), 0)) # Map count of leading spaces to # we want. have2want = {} # Program after transformation. after = self.after = [] # Copy over initial empty lines -- there's nothing to do until # we see a line with *something* on it. i = stats[0][0] after.extend(lines[1:i]) for i in range(len(stats) - 1): thisstmt, thislevel = stats[i] nextstmt = stats[i + 1][0] have = getlspace(lines[thisstmt]) want = thislevel * 4 if want < 0: # A comment line. if have: # An indented comment line. If we saw the same # indentation before, reuse what it most recently # mapped to. want = have2want.get(have, -1) if want < 0: # Then it probably belongs to the next real stmt. for j in xrange(i + 1, len(stats) - 1): jline, jlevel = stats[j] if jlevel >= 0: if have == getlspace(lines[jline]): want = jlevel * 4 break if want < 0: # Maybe it's a hanging # comment like this one, # in which case we should shift it like its base # line got shifted. for j in xrange(i - 1, -1, -1): jline, jlevel = stats[j] if jlevel >= 0: want = have + getlspace(after[jline-1]) - \ getlspace(lines[jline]) break if want < 0: # Still no luck -- leave it alone. want = have else: want = 0 assert want >= 0 have2want[have] = want diff = want - have if diff == 0 or have == 0: after.extend(lines[thisstmt:nextstmt]) else: for line in lines[thisstmt:nextstmt]: if diff > 0: if line == "\n": after.append(line) else: after.append(" " * diff + line) else: remove = min(getlspace(line), -diff) after.append(line[remove:]) return self.raw != self.after
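run() relies on a getlspace helper that is not shown here; it presumably just counts a line's leading spaces, along these lines (assumed behaviour, not the original definition):

def getlspace(line):
    # number of leading space characters in the line
    i, n = 0, len(line)
    while i < n and line[i] == " ":
        i += 1
    return i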
def tokens(source_code: bytes) -> Iterator[TokenInfo]:
    """Return an iterator over the tokens in a python source string."""
    return tokenize(BytesIO(source_code).readline)
# python -m tokenize 180_language_tokenize.py
import tokenize

with tokenize.open('180_language_tokenize.py') as f:
    tokens = tokenize.generate_tokens(f.readline)
    for token in tokens:
        print(token)

import tokenize

with open('180_language_tokenize.py', 'rb') as f:
    tokens = tokenize.tokenize(f.readline)
    for token in tokens:
        print(token)
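Both variants rely on encoding detection: tokenize.open() opens the file using the encoding found by tokenize.detect_encoding(), which can also be called on its own. A small illustration:

import tokenize

with open('180_language_tokenize.py', 'rb') as f:
    encoding, first_lines = tokenize.detect_encoding(f.readline)
print(encoding)  # typically 'utf-8'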
def getblock(lines):
    """Extract the block of code at the top of the given list of lines."""
    try:
        tokenize.tokenize(ListReader(lines).readline, BlockFinder().tokeneater)
    except EndOfBlock, eob:
        return lines[:eob.args[0]]