def processFile(l):
    """Run the whole renaming-evaluation pipeline on one JavaScript file.

    Stages, in order: preprocess; fix the layout (Beautifier, then JSNice with
    ``--no-types --no-rename``, then Beautifier again); minify; lex both
    versions and require equal token counts; align; store the baseline
    (original and minified) files in ``output_path``; run Nice2Predict and
    JSNice and collect the names they report; run scope analysis on the
    minified file; finally feed five differently renamed variants of the
    minified file to Moses and collect the candidates produced by
    ``processTranslation``.

    Args:
        l: Indexable record of which only ``l[0]`` is read: the path of the
            input file, joined onto the module-level ``corpus_root``.

    Returns:
        ``(js_file_path, 'OK', candidates)`` on success, or
        ``(js_file_path, None, reason)`` where ``reason`` names the stage
        that failed (or is the flattened text of an unexpected exception).
    """

    def localCleanup(output_path, base_names):
        # Remove the given files from the output directory.
        for base_name in base_names:
            tryRemove(os.path.join(output_path, base_name))

    js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]
    # Temp file names are keyed on this id so that concurrently running
    # worker processes use distinct files.
    pid = int(multiprocessing.current_process().ident)
    candidates = []

    def fail(reason, stored=()):
        # Common failure exit: drop the temp files, then whichever output
        # files may already have been stored, and report the failing stage.
        cleanup(pid)
        if stored:
            localCleanup(output_path, stored)
        return (js_file_path, None, reason)

    def next_to_script(file_name):
        # ScopeAnalyst is handed paths anchored at this script's directory.
        return os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            file_name)

    try:
        # Temp files to be created during processing
        path_tmp = 'tmp_%d.js' % (pid)
        path_tmp_b = 'tmp_%d.b.js' % (pid)
        path_tmp_b_a = 'tmp_%d.b.a.js' % (pid)
        path_tmp_u = 'tmp_%d.u.js' % (pid)
        path_tmp_u_a = 'tmp_%d.u.a.js' % (pid)
        path_tmp_unugly = 'tmp_%d.n2p.js' % (pid)
        path_tmp_jsnice = 'tmp_%d.jsnice.js' % (pid)

        f2 = 'tmp_%d.no_renaming.js' % (pid)
        f3 = 'tmp_%d.basic_renaming.js' % (pid)
        f4 = 'tmp_%d.hash_renaming.js' % (pid)
        f5 = 'tmp_%d.hash_def_one_renaming.js' % (pid)
        f6 = 'tmp_%d.hash_def_two_renaming.js' % (pid)

        # Files stored in output_path
        path_orig = '%s.js' % (base_name)
        path_ugly = '%s.u.js' % (base_name)
        path_unugly = '%s.n2p.js' % (base_name)
        path_jsnice = '%s.jsnice.js' % (base_name)

        # Removal lists for the successive failure points below.
        stored_baseline = [path_ugly, path_orig]
        stored_n2p = stored_baseline + [path_unugly]
        stored_all = stored_n2p + [path_jsnice]

        # Strip comments, replace literals, etc
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(path_tmp)
        except Exception:
            return fail('Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        if not clear.run(path_tmp, path_tmp_b + '.tmp1'):
            return fail('Beautifier 1 fail')

        jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename'])
        (ok, _out, _err) = jsNiceBeautifier.run(path_tmp_b + '.tmp1',
                                                path_tmp_b + '.tmp2')
        if not ok:
            return fail('JSNice Beautifier 1 fail')

        if not clear.run(path_tmp_b + '.tmp2', path_tmp_b):
            return fail('Beautifier 1 fail')

        # Minify
        ugly = Uglifier()
        if not ugly.run(path_tmp_b, path_tmp_u):
            return fail('Uglifier fail')

        # Num tokens before vs after
        try:
            tok_clear = Lexer(path_tmp_b).tokenList
            tok_ugly = Lexer(path_tmp_u).tokenList
        except Exception:
            return fail('Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if len(tok_clear) != len(tok_ugly):
            return fail('Num tokens mismatch')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            # This is already the baseline corpus, no (smart) renaming yet
            # NOTE(review): presumably this is what produces the *.b.a.js and
            # *.u.a.js temp files read below -- confirm against Aligner.
            aligner.align(path_tmp_b, path_tmp_u)
        except Exception:
            return fail('Aligner fail')

        try:
            iBuilder_ugly = IndexBuilder(Lexer(path_tmp_u_a).tokenList)
        except Exception:
            return fail('IndexBuilder fail')

        # Store original and uglified versions
        if not clear.run(path_tmp_u_a, os.path.join(output_path, path_ugly)):
            return fail('Beautifier 2 fail', [path_ugly])

        if not clear.run(path_tmp_b_a, os.path.join(output_path, path_orig)):
            return fail('Beautifier 3 fail', stored_baseline)

        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, _out, _err) = unuglifyJS.run(path_tmp_b_a, path_tmp_unugly)
        if not ok:
            return fail('Nice2Predict fail', stored_baseline)

        if not clear.run(path_tmp_unugly, path_tmp_unugly + '.tmp1'):
            return fail('Beautifier 4 fail', stored_n2p)

        (ok, _out, _err) = jsNiceBeautifier.run(path_tmp_unugly + '.tmp1',
                                                path_tmp_unugly + '.tmp2')
        if not ok:
            # The baseline files are already in output_path at this point;
            # remove them like every neighbouring failure exit does.
            return fail('JSNice Beautifier 2 fail', stored_n2p)

        if not clear.run(path_tmp_unugly + '.tmp2',
                         os.path.join(output_path, path_unugly)):
            return fail('Beautifier 4 fail', stored_n2p)

        try:
            scopeAnalyst = ScopeAnalyst(next_to_script(path_tmp_unugly))
            nameOrigin = scopeAnalyst.nameOrigin
            for (name, def_scope) in nameOrigin.iterkeys():
                candidates.append(('Nice2Predict', def_scope, name, '', ''))
        except Exception:
            return fail('ScopeAnalyst fail', stored_n2p)

        # Run the JSNice from http://www.jsnice.org
        jsNice = JSNice()
        (ok, _out, _err) = jsNice.run(path_tmp_b_a, path_tmp_jsnice)
        if not ok:
            return fail('JSNice fail', stored_n2p)

        if not clear.run(path_tmp_jsnice,
                         os.path.join(output_path, path_jsnice)):
            return fail('Beautifier 5 fail', stored_all)

        try:
            scopeAnalyst = ScopeAnalyst(next_to_script(path_tmp_jsnice))
            nameOrigin = scopeAnalyst.nameOrigin
            for (name, def_scope) in nameOrigin.iterkeys():
                candidates.append(('JSNice', def_scope, name, '', ''))
        except Exception:
            return fail('ScopeAnalyst fail', stored_all)

        # Compute scoping: name2scope is a dictionary where keys
        # are (name, start_index) tuples and values are scope identifiers.
        # Note: start_index is a flat (unidimensional) index,
        # not a (line_chr_idx, col_chr_idx) index.
        try:
            scopeAnalyst = ScopeAnalyst(next_to_script(path_tmp_u_a))
            # The results are not used here; the calls are kept so that a
            # broken scope analysis is reported as a stage failure.
            _name2defScope = scopeAnalyst.resolve_scope()
            _isGlobal = scopeAnalyst.isGlobal
            _name2useScope = scopeAnalyst.resolve_use_scope()
        except Exception:
            return fail('ScopeAnalyst fail', stored_all)

        def translate(variant, tmp_path, lines):
            # Write one renamed variant to its temp file, decode it with the
            # Moses model tuned for that variant and return the candidates
            # extracted from the translation.
            with open(tmp_path, 'w') as f_variant:
                f_variant.writelines(lines)

            moses = MosesDecoder(ini_path=os.path.join(
                ini_path, 'train.' + variant, 'tuning', 'moses.ini'))
            (_moses_ok, translation, _err) = moses.run(tmp_path)

            return processTranslation(translation, iBuilder_ugly,
                                      scopeAnalyst, lm_path, tmp_path,
                                      output_path, base_name, clear)

        # No renaming: the minified tokens as they are, one line per row
        no_renaming = [' '.join([t for (_tt, t) in line]) + "\n"
                       for line in iBuilder_ugly.tokens]
        nc = translate('no_renaming', f2, no_renaming)
        if nc:
            candidates += nc

        # Simple renaming: disambiguate overloaded names using scope id
        nc = translate('basic_renaming', f3,
                       renameUsingScopeId(scopeAnalyst, iBuilder_ugly))
        if nc:
            candidates += nc

        # More complicated renaming: collect the context around
        # each name (global variables, API calls, punctuation)
        # and build a hash of the concatenation.
        nc = translate('hash_renaming', f4,
                       renameUsingHashAllPrec(scopeAnalyst, iBuilder_ugly,
                                              debug=False))
        if nc:
            candidates += nc

        nc = translate('hash_def_one_renaming', f5,
                       renameUsingHashDefLine(scopeAnalyst, iBuilder_ugly,
                                              twoLines=False, debug=False))
        if nc:
            candidates += nc

        nc = translate('hash_def_two_renaming', f6,
                       renameUsingHashDefLine(scopeAnalyst, iBuilder_ugly,
                                              twoLines=True, debug=False))
        if nc:
            candidates += nc

        cleanup(pid)
        cleanupRenamed(pid)

        return (js_file_path, 'OK', candidates)

    except Exception as e:
        # Anything not caught by a stage above: clean the temp files and
        # report the exception text on a single line.
        cleanup(pid)
        cleanupRenamed(pid)
        return (js_file_path, None, str(e).replace("\n", ""))
def processFile(js_file_path): # Load in the minified file minified = open(js_file_path).read() # Create lexer lexer = get_lexer_for_filename(js_file_path) # Tokenize input and compute mappings between the different # indices used: (line, col), flat, (l,c) in token list indexBuilder = IndexBuilder(lex(minified, lexer)) tokens = indexBuilder.tokens # print 'RUNNING IndexBuilder:', len(tokens)>0 # Compute scoping: name2scope is a dictionary where keys # are (name, start_index) tuples and values are scope identifiers. # Note: start_index is a flat (unidimensional) index, # not a (line_chr_idx, col_chr_idx) index. scopeAnalyst = ScopeAnalyst(js_file_path) name2defScope = scopeAnalyst.resolve_scope() isGlobal = scopeAnalyst.isGlobal name2useScope = scopeAnalyst.name2useScope name2pth = scopeAnalyst.name2pth nameOrigin = scopeAnalyst.nameOrigin scopes = set(name2useScope.values()) print print '=== FOUND %d SCOPES ===' % len(scopes) print for scope in scopes: print 'USE SCOPE:', scope lc_list = [ indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]] for (t, pos) in name2useScope.keys() if name2useScope[(t, pos)] == scope ] highlight(tokens, lc_list) print scopes = set(name2defScope.values()) print print '=== FOUND %d NAME SCOPES ===' % len(scopes) print for scope in scopes: print 'DEF SCOPE:', scope lc_list = [ indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]] for (t, pos) in name2defScope.keys() if name2defScope[(t, pos)] == scope ] highlight(tokens, lc_list) print # Discover the path to the source map map_path = sourcemap.discover(minified) # Read and parse our sourcemap if map_path: sourcemapIndex = sourcemap.load(open(map_path)) # Cluster names by scope nameScope2Positions = {} # Index data by (name,scope) for token, l in indexBuilder.name2CharPositions.iteritems(): for (line, col) in sorted(l, key=lambda (a, b): (a, b)): pos = indexBuilder.flatMap[(line, col)] if name2defScope.has_key((token, pos)): scope = name2defScope[(token, pos)] use_scope = 
name2useScope[(token, pos)] pth = name2pth[(token, pos)] glb = isGlobal[(token, pos)] nameScope2Positions.setdefault((token, scope, glb), []) nameScope2Positions[(token, scope, glb)].append((line, col)) # print token, pos # print 'def:', scope # print 'use:', use_scope # print 'pth:', pth # highlight(tokens, [indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]]) # print print print for (token,scope,glb), positions in sorted(nameScope2Positions.iteritems(), \ key=lambda (x,y):x[0]): pos = sorted(positions, key=lambda e: (e[0], e[1])) tt = [] line_tok_idxs = set([]) for (l, c) in pos: (tl, tc) = indexBuilder.revTokMap[(l, c)] line_tok_idxs.add(tl) p = indexBuilder.flatMap[(l, c)] if map_path: orig = sourcemapIndex.lookup(line=l, column=c).name else: orig = token print token, scope, (l, c), orig tt.append(((tl, tc), p, orig)) # t.append(orig) # if token == 'n': print '\nNAME:', token.encode( 'utf-8'), '( isGlobal =', glb, '; original =', orig, ')' # print scope # highlight(tokens, [indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]]) for ((tli, tci), p, orig) in tt: scope = name2defScope[(token, p)] use_scope = name2useScope[(token, p)] pth = name2pth[(token, p)] origin = nameOrigin[(token, scope)] # print token #, p, origin # print # print 'def:', scope # print 'use:', use_scope # print 'pth:', pth # print for tl in sorted(set([tli for ((tli, tci), p, orig) in tt])): l = list(tokens[tl]) for tc in [tci for ((tli, tci), p, orig) in tt if tli == tl]: l[tc] = (l[tc][0], unichr(0x2588) + token + unichr(0x2588)) # pos = indexBuilder.flatMap[(line,col)] print ' ', '%d:' % (tl + 1), ' '.join( [x[1].encode('utf-8') for x in l]) print return