def main():
    # Program data
    pvars = {'outputDir': None,
             'clobber': False,
             'executeScripts': False,
             'tokenFile': None,
             'inputScripts': [],
             'tokSuffix': '.tok',
             'breakOnError': False}
    tokenizedFileNames = []

    # Read input
    #try:
    readCommandLine(sys.argv[1:], pvars)
    tokens = readTokenFile(pvars['tokenFile'])

    # Apply tokens to scripts
    for scriptName in pvars['inputScripts']:
        tokenizedFileName, tokenizedFile = makeTokenizedFile(
            scriptName, pvars['clobber'], pvars['outputDir'], pvars['tokSuffix'])
        tokenizedFileNames.append(tokenizedFileName)
        tokenize(tokenizedFile, tokens, scriptName)
        tokenizedFile.close()

    # Run tokenized files
    if pvars['executeScripts']:
        for tokenizedFileName in tokenizedFileNames:
            ret = os.system(tokenizedFileName)
            if ret and pvars['breakOnError']:
                print >> sys.stderr, "Script " + tokenizedFileName + \
                    " failed with exit code " + str(ret) + ". Aborting."
                sys.exit(1)

def errors(self, input_str, pos):
    try:
        Parse(tokenize(input_str)).go()
        self.fail('ParseError not raised: {0}'.format(input_str))
    except ParseError as e:
        if e.position != pos:
            self.fail('ParseError at wrong position: expected {0}, got {1}'
                      .format(pos, e.position))

def tokenize_fn(c):
    try:
        tokens = list(tokenize(BytesIO(c.encode('utf-8')).readline))
    except Exception as e:
        # print(e)
        return None
    return tokens

def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s)  #doctest: +ELLIPSIS
    -3.217160342717258e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')

def batch_tokenize_process(source_list):
    tmp_sentences = []
    tmp_index = [ele[0] for ele in source_list]
    for index, ele in enumerate(source_list):
        sentence = ele[1]
        #print sentence
        tmp_sentences.append(tokenize(sentence).strip() + "\n")
    return tmp_index, tmp_sentences

def parse(program):
    ## Log("HERE:D.1", program = program)
    global token, next
    token = None
    next = None
    next = tokenize(program).next
    token = next()
    return expression()

def parse(v):
    r = tokenize(v)
    p = ParserCtx(r)
    x = do_prog(p)
    # except:
    if x == None:
        print(" at line " + str(p.token.pos) + " unknown error")
    return x

def source_to_code(self, data, path, *, _optimize=-1):
    print(path)
    source = importlib._bootstrap.decode_source(data)
    tokens = tokenize(io.BytesIO(source.encode('utf-8')).readline)
    tokens = retokenize(tokens)
    source = untokenize(tokens).decode('utf-8')
    return _call_with_frames_removed(compile, source, path, 'exec',
                                     dont_inherit=True, optimize=_optimize)

def parse(program):
    global token, next
    next = tokenize(program).next
    token = next()
    return expression()

def visit(self, featureset):
    try:
        _result = []
        for text in featureset.get_column_values(self._column):
            if isinstance(text, list):
                _preprocessed = []
                for word in text:
                    _preprocessed.append(tokenize(word))
                _result.append(_preprocessed)
            else:
                _preprocessed = tokenize(text)
                _result.append(_preprocessed)
        _new_result = np.asarray(list(_result))[:, np.newaxis]
        _new_result = _new_result.reshape(
            featureset.get_column_values(self._column).shape)
        featureset.set_featureset_column(self._column, _new_result)
    except Exception as error:
        util.print_error("Unable to tokenize column")
        util.print_error(error)

def parse(self, equa):
    self.token_generator = tokenize(equa, self.TOKENS_SPEC)
    self.current_token = None
    self.next_token = None
    self._next()
    self._tab(self._prob())
    if self.next_token:
        raise Exception(
            'Wrong token sequence busted. Processing stopped at : '
            + self.next_token.value)

def feature_extract(self, tweets):
    preproc_tweets = map(lambda t: tokenize(t), tweets)
    model = Word2Vec(preproc_tweets)
    # Keep the learned vectors on the instance so the averaging below
    # (and later calls) can use them.
    self.word2vec = dict(zip(model.wv.index2word, model.wv.syn0))
    self.dim = len(self.word2vec.itervalues().next())
    # Mean word embedding per tokenized tweet; falls back to an all-zero
    # vector when no token is in the vocabulary.
    mean_embeds = np.array([
        np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                or [np.zeros(self.dim)], axis=0)
        for words in preproc_tweets
    ])
    return mean_embeds

def parse(content):
    r = tokenize(content)
    p = ParserCtx(r, content)
    p.next()
    try:
        while p.token.type != 'eof':
            parse_block(p)
        x = p.tree
        # except:
        if x == None:
            p.error()
        return x
    except Exception as e:
        # print(e, v)
        compile_error("parse", content, p.token, str(e))
        raise e

def parse_file(fname):
    customize_symbols(
        ['function', 'var', 'end', 'append'],
        '-=[];,./!%*()+{}:<>@^$&',
        ['-', '+', '*', '**', '/', '%', '<<', '>>', '-=', '+=', '*=', '/=',
         '=', '==', '!=', '<', '>', '<=', '>=', '[', ']', '{', '}', '(', ')',
         '.', ':', ',', ';', '&', '|', '!', '@', '^', '$'])
    disable_tk_indent()
    list = tokenize(load(fname))
    list = list_to_chain(list)
    if len(list) == 0:
        return
    item = list[0]
    #while item != None:
        #print(item.val)
        #item = item.next
    parse_macro(item)

def read_input(f1_name, f2_name):
    # read line by line from file 1 and file 2
    with open(f1_name) as f1, open(f2_name) as f2:
        for line1, line2 in zip(f1, f2):
            # get the classes the line belongs to
            class1 = line1.strip()
            priors[class1] += 1
            # tokenize each line
            token_list = tokenize(line2)
            # store the tokens in a dictionary
            # update freq of token
            for token in token_list:
                if token not in tfreq_dict:
                    tfreq_dict[token] = {'1': 0, '-1': 0, '0': 0}
                tfreq_dict[token][class1] += 1

def cleanAndNormalizeText(data):
    tokens = tokenize(data)
    tokens = [token if emoticon_re.search(token) else token.lower()
              for token in tokens]
    filterText = [w for w in tokens if w not in stop]
    filterText = [w for w in filterText if not len(w) <= 1]
    # stem
    ps = PorterStemmer()
    for i in range(len(filterText) - 1):
        if len(filterText[i]) > 1:
            try:
                filterText[i] = ps.stem(filterText[i])
            except Exception as e:
                filterText[i] = filterText[i]
    return filterText

def main():
    file = get_source(argv[INPUT_INDEX])
    code = file.read()
    check_parens(code)
    tokenized = tokenize(code)
    func_map = make_function_map(iter(tokenized), tokenized)
    if False:
        for f in func_map:
            for t in func_map[f].def_block:
                print(t.symbol)
    # check if the parse succeeds;
    # is_parsed fails with sys.exit() and message if parse error
    if is_parsed(func_map):
        #compile(func_map)
        pass
    file.close()

def transform(fname):
    s = load(fname)
    tokenList = tokenize(s)
    indents = 0
    idx = 0
    lastisnl = False
    needspace = False
    while hasnext(tokenList, idx):
        i = tokenList[idx]
        next = getnext(tokenList, idx)
        idx += 1
        if i.type in _ws_after:
            printf("%s ", i.val)
        elif i.type in _ws_both:
            printf(" %s ", i.val)
        elif i.type == 'nl':
            printf('\n')
            if next == None:
                pass
            elif next.type == 'indent':
                printf(' ' * (indents + 4))
            elif next.type == 'dedent':
                #printf(' '*(indents - 4))
                pass
            else:
                printf(' ' * indents)
        elif i.type == 'indent':
            indents += 4
        elif i.type == 'dedent':
            indents -= 4
            if next == None:
                pass
            elif next.type != 'dedent':
                printf(' ' * indents)
        elif i.type == 'string':
            printf(get_printable_str(i.val))
        elif i.type == 'notin':
            printf(' not in ')
        else:
            printf(i.val)

def calc_sim(query, threshold=0):
    '''calculate similarity scores between documents and the query'''
    query = clean_token(query)
    file_list = get_file_names()
    documents = {}
    for i in range(len(file_list)):
        documents[file_list[i]] = tokenize(convert(file_list[i]))
    query_vec = vectorize(query)
    results = {}
    for name, doc in documents.items():
        doc_vec = vectorize(doc)
        sim_score = cos_sim(query_vec, doc_vec)
        if sim_score > threshold:
            results[name] = sim_score
    sort_result = sorted(results.items(), key=operator.itemgetter(1),
                         reverse=True)
    return sort_result

def test1():
    test("1")
    test("+1")
    test("-1")
    test("1+2")
    test("1+2+3")
    test("1+2*3")
    test("(1+2)*3")
    test("()")
    test("(1)")
    test("(1,)")
    test("(1, 2)")
    test("[1, 2, 3]")
    test("{}")
    test("{1: 'one', 2: 'two'}")
    test("1.0*2+3")
    test("'hello'+'world'")
    test("2**3**4")
    test("1 and 2")
    test("foo.bar")
    test("1 + hello")
    test("1 if 2 else 3")
    test("'hello'[0]")
    test("hello()")
    test("hello(1,2,3)")
    test("lambda: 1")
    test("lambda a, b, c: a+b+c")
    test("True")
    test("True or False")
    test("1 in 2")
    test("1 not in 2")
    test("1 is 2")
    test("1 is not 2")
    test("1 is (not 2)")
    print
    print list(tokenize("1 not in 2"))

        arg = arg.replace(bracks, '')
        arg = arg.strip()
        arg = re.sub(' +', ' ', arg)
        t = ' '.join(arg.split(' ')[:-1] + [bracks])
        n = arg.split(' ')[-1]
        types.append(t)
        names.append(n)
    return types, names


if __name__ == '__main__':
    # parser
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', default='',
                        help='The file to strip comments from.')
    parser.add_argument('--l', default='python', choices=['python', 'java'],
                        help='language of input code')
    args = parser.parse_args()
    assert args.input_file == '' or os.path.isfile(args.input_file)

    # read from standard input, or from input file
    if args.input_file == '':
        source = sys.stdin.read()
    else:
        with io.open(args.input_file, encoding='utf-8') as f:
            source = f.read()

    tokenize = globals()[f"tokenize_{args.l}"]

    # tokenize
    print(tokenize(source), end='')

def parse(string):
    tokenize(string)

def file_elements(filename, filtering='normal'):
    '''Take a Python file, return a dictionary of contents.

    Argument 'filtering' determines how much filtering is applied to
    symbols that may be uninteresting.  Possible values are 'minimal'
    or 'normal'.
    '''
    header = ''
    comments = []
    tmp_file = None
    full_path = os.path.join(os.getcwd(), filename)

    def cleanup():
        stream.close()
        if tmp_file:
            log.debug('closing {}'.format(tmp_file))
            tmp_file.close()

    # Set up the dictionary.  We may end up returning only part of this
    # filled out, if we encounter errors along the way.
    elements = {}
    elements['header'] = ''
    elements['comments'] = []
    elements['docstrings'] = []
    elements['imports'] = []
    elements['classes'] = []
    elements['functions'] = []
    elements['variables'] = []
    elements['strings'] = []
    elements['calls'] = []
    elements['parse_result'] = 'success'

    # Open the file for reading.  FileIO is needed for the Python 'ast' module.
    log = Logger('file_parser').get_log()
    log.info('parsing Python file {}'.format(full_path))
    stream = io.FileIO(filename)

    # Pass #0: account for Python 2 vs 3 syntax.
    # I haven't found another way to detect whether a script uses Python 2 or
    # 3 syntax other than to try to parse it and test for failure.  We need
    # to use ast later below, and if an input file needs Python 2, we have to
    # convert it first.  So we test first and convert at the beginning.
    if assumes_python2(stream):
        try:
            # This creates a temporary file that must be deleted later.
            log.debug('attempting to convert from Python 2')
            tmp_file = convert_python2_file(filename)
            if tmp_file:
                log.debug('conversion successful'.format(full_path))
                log.debug('closing file {}'.format(full_path))
                stream.close()
                log.debug('opening file {}'.format(tmp_file.name))
                stream = io.FileIO(tmp_file.name)
            else:
                # We thought it was Python 2 but couldn't convert it.
                # Something is wrong.  Bail.
                log.warn('conversion failed -- giving up on {}'.format(full_path))
                # At this point, we still have an empty elements dictionary.
                elements['parse_result'] = 'error'
                return elements
        except Exception as err:
            log.error('error trying to detect if {} uses Python 2'.format(full_path))
            log.error(err)
            elements['parse_result'] = 'error'
            cleanup()
            return elements

    # Pass #1: use tokenize to find and store headers and comments.
    log.debug('tokenizing {}'.format(full_path))
    try:
        tokens = tokenize(stream.readline)
    except Exception as err:
        log.error('error trying to tokenize {}'.format(full_path))
        log.error(err)
        elements['parse_result'] = 'error'
        cleanup()
        return elements

    # Look for a header at the top, if any.  There are two common forms in
    # Python: a string, and a comment block.  The heuristic used here is that
    # if the first thing after any ignorable comments is a string, it's
    # assumed to be the doc string; else, any initial comments (after certain
    # special case comments, such as Unix hash-bang lines) are taken to be
    # the header; else, no header.
    for kind, thing, _, _, line in tokens:
        if kind == ENCODING:
            continue
        if ignorable_comment(thing):
            continue
        if kind != COMMENT and kind != NL:
            break
        header += strip_comment_char(thing)

    # When the above ends, 'thing' & 'kind' will be the next values to examine.
    # If it's a string, it's assumed to be the file doc string.
    # Once we do this, we'll have read the header comment or the doc string and
    # the file position will be immediately after that point.  When we do our
    # 2nd pass, we don't want to read that stuff again.  Back up over the last
    # non-string/comment thing we read, and remember where we are.
    if kind == STRING:
        restart_point = stream.tell()
        header = header + ' ' + thing.replace('"', '')
        (kind, thing, _, _, line) = next(tokens)
    else:
        restart_point = stream.tell() - len(line)

    # Iterate through the rest of the file, looking for comments.
    # This gathers consecutive comment lines together, on the premise that
    # they may contain sentences split across multiple comment lines.
    chunk = ''
    while thing != ENDMARKER:
        try:
            if kind == NL:
                pass
            elif kind == COMMENT and not ignorable_comment(thing):
                chunk = chunk + strip_comment_char(thing) + '\n'
            elif chunk:
                comments.append(chunk.strip())
                chunk = ''
            (kind, thing, _, _, _) = next(tokens)
        except StopIteration:
            break
        except Exception:
            # Unicode decoding problems can cause exceptions.
            log.error('tokenization failed for {}'.format(full_path))
            break

    # This concludes what we gather without parsing the file into an AST.
    # Store the header and comments, if any.
    elements['header'] = clean_plain_text(header)
    elements['comments'] = clean_plain_text_list(comments)

    # Pass #2: pull out remaining elements separately using the AST.  This is
    # inefficient, because we're iterating over the file a 2nd time, but our
    # efforts right now are about getting things to work any way possible.
    # AST parsing failures are possible here, particularly if the file was
    # converted from Python 2.  Some programs do stuff you can't automatically
    # convert with 2to3.  If that happens, bail and return what we can.
    stream.seek(restart_point)
    try:
        log.debug('parsing into AST')
        tree = ast.parse(stream.read())
    except Exception as err:
        log.error('AST parsing failed; returning what we have so far'.format(full_path))
        cleanup()
        elements['parse_result'] = 'error'
        return elements

    # We were able to parse the file into an AST.
    try:
        collector = ElementCollector(filtering)
        collector.visit(tree)
    except Exception as err:
        log.error('internal AST code walking error'.format(full_path))
        cleanup()
        elements['parse_result'] = 'error'
        return elements

    # We store the names of variables we find temporarily as paths separated
    # by '|' so that we can find unique variable name assignments within each
    # function or class context.  E.g., variable x in function foo is "foo|x".
    # Remove the paths now, leaving just the variable names.
    # Also filter the variables to remove things we don't bother with.
    unique_var_paths = list(set(collector.variables))
    collector.variables = [x[x.rfind('|') + 1:] for x in unique_var_paths]
    filtered_calls = filter_variables(collector.calls, collector.variables)

    # We are done.  Do final cleanup and count up frequencies of some things.
    # Note that docstrings don't get frequencies associated with them.
    elements['docstrings'] = clean_plain_text_list(collector.docstrings)
    # The rest are turned into ('string', frequency) tuples.
    elements['imports'] = countify(collector.imports)
    elements['classes'] = countify(collector.classes)
    elements['functions'] = countify(collector.functions)
    elements['variables'] = countify(collector.variables)
    elements['strings'] = countify(clean_plain_text_list(collector.strings))
    elements['calls'] = countify(filtered_calls)
    cleanup()
    return elements

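# A minimal, self-contained sketch (not part of the function above) of the
# same two-pass idea: pass 1 uses the standard tokenize module to collect
# comments, pass 2 uses ast to collect structural elements.  Only the
# standard library is assumed; the helper name extract_comments_and_functions
# is hypothetical and chosen just for this illustration.
import ast
import io
import tokenize as std_tokenize


def extract_comments_and_functions(source):
    # Pass 1: run the tokenizer and keep COMMENT tokens.
    comments = []
    for tok in std_tokenize.generate_tokens(io.StringIO(source).readline):
        if tok.type == std_tokenize.COMMENT:
            comments.append(tok.string.lstrip('# ').rstrip())
    # Pass 2: parse into an AST and collect function definition names.
    tree = ast.parse(source)
    functions = [node.name for node in ast.walk(tree)
                 if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))]
    return comments, functions


if __name__ == '__main__':
    sample = "# a header comment\ndef foo():\n    return 1  # inline note\n"
    print(extract_comments_and_functions(sample))
    # -> (['a header comment', 'inline note'], ['foo'])
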
"multiline "multi " Func x does stuff " to x func xx() foo() end a += 3 >= 4 loop i in 1:10 foo() ''' ''' "a "b''' EXAMPLE = EXAMPLE1 print('py') for token in tokenize_py(EXAMPLE): print(repr(token)) print('zoof') for token in tokenize(EXAMPLE, __file__, 286): print(token)
import sys
from tokenize import *
from dictionary_words_2 import *
from stochastic_sampling import *
# [brian] Usually `import *` is bad form in python.
# One of the best things that distinguishes it from
# ruby is that it's always easy to tell where some
# behavior comes from. If you `import *` you'll
# later have a hard time figuring out which module
# a given function lives in.

if __name__ == '__main__':
    source = open(sys.argv[1]).read()
    tokens = tokenize(source)
    a_dictionary = list_to_dictionary(tokens)
    stochastic_list = new_list(a_dictionary)
    root_node = construct_tree(stochastic_list)
    first_word = random_word(root_node)

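# A small illustration of the reviewer's point above, using only the standard
# library: importing the module (or specific names) keeps the origin of each
# function visible, whereas `import *` hides it.  This is a sketch, not a
# change to the script above.
import io
import tokenize as std_tokenize


def count_names(source):
    # std_tokenize.generate_tokens and std_tokenize.NAME clearly come from
    # the standard tokenize module; nothing extra lands in this namespace.
    return sum(1 for tok in std_tokenize.generate_tokens(io.StringIO(source).readline)
               if tok.type == std_tokenize.NAME)


if __name__ == '__main__':
    print(count_names("x = spam(eggs)"))  # -> 3 (x, spam, eggs)
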
printf("%-10s%-10s:debug source code\n", "-debug", "[file]") printf("%-10s%-10s:disassemble builtin-func\n", "-dis-bf", "[file]") argc = len(ARGV) if argc == 1: if ARGV[0] == '-help': print_usage() else: print_usage() elif argc > 2: opt = ARGV[1] name = ARGV[2] if opt == '-tk': from tokenize import * r = tokenize(load(name)) for i in r: printf("%s := %s\n", i.type, i.val) elif opt == '-src': printSource(name) elif opt == '-p': _execute_file(name) input("press any key to quit") elif opt == '-dis': from dis import dissimple argv = ARGV.clone() del argv[0] dissimple(argv) elif opt == '-dump': compilefile(name, name + '.bin') elif opt == '-ast':
def parse(program):
    global curr, next_token
    next_token = tokenize(program).next
    curr = next_token()
    return expression()

import os  # needed for os.path.abspath below

from tokenize import *
from getLexicon import *
from symScoreClassify import *
from splitData import *
from naiveBayesClassify import *
import numpy as np

posindir = os.path.abspath('') + '\\POS'
negindir = os.path.abspath('') + '\\NEG'
posDocs = tokenize(posindir)
negDocs = tokenize(negindir)
nfold = 10

posLexicon, negLexicon, posLexiconWeights, negLexiconWeights = getLexicon()
len(posLexicon), len(negLexicon), len(posLexiconWeights), len(negLexiconWeights)

resultsBow = np.zeros((10, 8))
resultsSig2nonW = np.zeros((10, 198))
resultsSig2W = np.zeros((10, 198))

for iteration in range(0, nfold):
    print iteration
    trainPosDocs, trainNegDocs, testPosDocs, testNegDocs = splitData(
        posDocs, negDocs, nfold, iteration)
    resultsIteration = symScoreClassify(testPosDocs, posLexicon, negLexicon,
                                        posLexiconWeights, negLexiconWeights,
                                        True)
    print resultsIteration[4:12]
    resultsBow[iteration, :] += np.array(resultsIteration[4:12])

def tk_test(string, types, vals):
    r = tokenize(string)
    assert tk_types(r) == types
    assert tk_vals(r) == vals

printf("%-10s%-10s:print abstract syntax tree\n", "-printast", "[file]") printf("%-10s%-10s:debug source code\n", "-debug", "[file]") printf("%-10s%-10s:disassemble builtin-func\n", "-dis-bf", "[file]") argc = len(ARGV) if argc == 1: if ARGV[0] == '-help': print_usage() else: print_usage() elif argc > 2: opt = ARGV[1] name = ARGV[2] if opt == '-tk': from tokenize import * r = tokenize(load(name)) for i in r: printf("%s := %s\n", i.type, i.val) elif opt == '-src': printSource(name) elif opt == '-p': _execute_file(name) input("press any key to quit") elif opt == '-dis': from dis import dissimple argv = ARGV.clone() del argv[0] dissimple(argv) elif opt == '-dump': compilefile(name, name + '.bin') elif opt == '-ast':
pattern = r"\s*(?:(<=|>=|\W)|([a-zA-Z]\w*)|(\d+(?:\.\d*)?))" for operator, name, literal in re.findall(pattern, program): if operator: yield "(operator)", operator elif name: yield "(name)", name elif literal: yield "(literal)", literal else: raise SyntaxError yield "(end)", "(end)" import time print len(program), "bytes" print len(list(tokenize(program))), "tokens" def bench(name, func): t0 = time.clock() for i in xrange(1000): func(program) print name, time.clock() - t0 import parser, compiler program_list = list(tokenize_python(program)) bench("topdown", parse) bench("topdown pretokenized", lambda program: parse(program_list)) tokenize_python = custom_tokenize_python
def same(self, input_str1, input_str2):
    self.assertEqual(str(Parse(tokenize(input_str1)).go()),
                     str(Parse(tokenize(input_str2)).go()))

def matches(self, input_str, desired_function):
    self.assertEqual(str(Parse(tokenize(input_str)).go()),
                     str(desired_function))