def processFile(js_file_path):
    pid = int(multiprocessing.current_process().ident)

    try:
        signal.alarm(600)

        prepro = Preprocessor(js_file_path)
        prepro.write_temp_file('tmp_%d.js' % pid)

        beauty = Beautifier()
        ok = beauty.run('tmp_%d.js' % pid, 'tmp_%d.b.js' % pid)

        if ok:
            mc = MiniChecker('tmp_%d.b.js' % pid)
            try:
                isMini = mc.compare(keep_mini=False)
            except Exception as e:
                isMini = str(e)

            cleanup(pid)
            return [os.path.basename(js_file_path), isMini]

        else:
            cleanup(pid)
            return [os.path.basename(js_file_path), 'Beautifier failed']

    except TimeExceededError:
        cleanup(pid)
        return [os.path.basename(js_file_path), 'Timeout']
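
# --- A minimal sketch (an assumption, not part of the original file) of the
# SIGALRM wiring that the signal.alarm(600) call above presupposes:
# TimeExceededError is raised from an alarm handler installed once per
# worker process, so a stuck file aborts after 600 seconds.
import signal

class TimeExceededError(Exception):
    pass

def _alarm_handler(signum, frame):
    # Invoked by the OS when the alarm set in processFile() expires.
    raise TimeExceededError('processing exceeded the allotted time')

signal.signal(signal.SIGALRM, _alarm_handler)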
def summarizeUnscopedTranslation(renaming_map,
                                 f_path,
                                 translation_strategy,
                                 output_path,
                                 base_name,
                                 name_candidates,
                                 name_positions,
                                 iBuilder):

    nc = []

    f_base = os.path.basename(f_path)
    training_strategy = f_base.split('.')[1]
    tmp_path = '%s.%s.js' % (f_base[:-3], translation_strategy)

    o_path = '%s.%s.unscoped.%s.js' % (base_name,
                                       training_strategy,
                                       translation_strategy)

    # print f_path, f_base, training_strategy, tmp_path, o_path, base_name

    writeTmpLines(renameHashed(iBuilder, name_positions, renaming_map),
                  tmp_path)

    clear = Beautifier()
    ok = clear.run(tmp_path, os.path.join(output_path, o_path))
    if not ok:
        return False

    try:
        lexer = Lexer(os.path.join(output_path, o_path))
        iBuilder_local = IndexBuilder(lexer.tokenList)
        scopeAnalyst_local = ScopeAnalyst(os.path.join(output_path, o_path))
    except:
        return False

    nameOrigin = scopeAnalyst_local.nameOrigin
    isGlobal = scopeAnalyst_local.isGlobal

    for (name, def_scope) in nameOrigin.iterkeys():

        pos = scopeAnalyst_local.nameDefScope2pos[(name, def_scope)]

        # The isGlobal filter is currently disabled: every name is
        # summarized. Original guard: if not isGlobal.get((name, pos), True):
        (lin, col) = iBuilder_local.revFlatMat[pos]
        (tok_lin, tok_col) = iBuilder_local.revTokMap[(lin, col)]

        nc.append(('%s.unscoped.%s' % (training_strategy,
                                       translation_strategy),
                   def_scope,
                   tok_lin, tok_col,
                   isGlobal.get((name, pos), True),
                   name, '', ''))

    return nc
def processFile(l):
    js_file_path = l[0]
    pid = int(multiprocessing.current_process().ident)

    try:
        # Temp files to be created during processing
        path_tmp = 'tmp_%d.js' % pid
        path_tmp_b = 'tmp_%d.b.js' % pid

        # Strip comments, replace literals, etc
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(path_tmp)
        except:
            cleanup(pid)
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        ok = clear.run(path_tmp, path_tmp_b)
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'Beautifier fail')

        try:
            iBuilder_clear = IndexBuilder(Lexer(path_tmp_b).tokenList)
        except:
            cleanup(pid)
            return (js_file_path, None, 'IndexBuilder fail')

        n_lines = len(iBuilder_clear.tokens)
        max_line_len = max([len(line) for line in iBuilder_clear.tokens])

        cleanup(pid)
        return (js_file_path, n_lines, max_line_len)

    except Exception as e:
        cleanup(pid)
        return (js_file_path, None, str(e))
def processFile(js_file_path):
    try:
        js_text = open(os.path.join(files_root, js_file_path), 'r').read()

        # Strip comments, replace literals, etc
        try:
            prepro = WebLMPreprocessor(js_text)
            prepro_text = str(prepro)
        except:
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, beautified_text, _err) = clear.web_run(prepro_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        try:
            lex_clear = WebLexer(beautified_text)
            tok1 = lex_clear.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        try:
            iBuilder1 = IndexBuilder(tok1)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        orig = []
        for _line_idx, line in enumerate(iBuilder1.tokens):
            orig.append(' '.join([t for (_tt, t) in line]) + "\n")

        return (js_file_path, orig)

    except Exception as e:
        return (js_file_path, None, str(e))
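
# --- A minimal usage sketch (an assumption, not part of the original file)
# of how these processFile variants are driven: a multiprocessing pool
# mapped over a list of corpus-relative paths. 'file_list.txt' and the pool
# size are hypothetical.
if __name__ == '__main__':
    import multiprocessing

    with open('file_list.txt') as f:
        paths = [line.strip() for line in f if line.strip()]

    pool = multiprocessing.Pool(processes=4)
    for result in pool.imap_unordered(processFile, paths):
        # Each result is a (js_file_path, ...) status tuple.
        print result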
def summarizeFallbackTranslation(renaming_map,
                                 fallback_renaming_map,
                                 f_path,
                                 translation_strategy,
                                 output_path,
                                 base_name,
                                 name_candidates,
                                 name_positions,
                                 iBuilder,
                                 scopeAnalyst):

    nc = []

    f_base = os.path.basename(f_path)
    training_strategy = f_base.split('.')[1]
    tmp_path = '%s.%s.js' % (f_base[:-3], translation_strategy)

    o_path = '%s.%s.%s.js' % (base_name,
                              training_strategy,
                              translation_strategy)

    # print f_path, f_base, training_strategy, tmp_path, o_path, base_name

    isGlobal = scopeAnalyst.isGlobal

    for (name, def_scope), renaming in renaming_map.iteritems():

        pos = scopeAnalyst.nameDefScope2pos[(name, def_scope)]
        (lin, col) = iBuilder.revFlatMat[pos]
        (tok_lin, tok_col) = iBuilder.revTokMap[(lin, col)]

        nc.append(('%s.%s' % (training_strategy, translation_strategy),
                   def_scope,
                   tok_lin, tok_col,
                   isGlobal.get((name, pos), True),
                   renaming,
                   ','.join(name_candidates[(name, def_scope)])))

    writeTmpLines(renameHashedFallback(iBuilder,
                                       name_positions,
                                       renaming_map,
                                       fallback_renaming_map),
                  tmp_path)

    clear = Beautifier()
    ok = clear.run(tmp_path, os.path.join(output_path, o_path))
    if not ok:
        return False

    return nc
def processFile(row):
    js_file_path = os.path.join(corpus_root, row[0])
    pid = int(multiprocessing.current_process().ident)
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]

    # Temp files to be created during processing
    temp_files = {'path_tmp': 'tmp_%d.js' % pid,
                  'path_tmp_b': 'tmp_%d.b.js' % pid,
                  'path_tmp_b_a': 'tmp_%d.b.a.js' % pid,
                  'path_tmp_u': 'tmp_%d.u.js' % pid,
                  'path_tmp_u_a': 'tmp_%d.u.a.js' % pid}

    try:
        # Pass through beautifier to fix layout:
        #
        # - once through JSNice without renaming
        # jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename'])
        #
        # (ok, _out, _err) = jsNiceBeautifier.run(js_file_path,
        #                                         temp_files['path_tmp'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, False, 'JSNice Beautifier fail')
        #
        # # Weird JSNice renamings despite --no-rename
        # try:
        #     before = set([token for (token, token_type) in
        #                   Lexer(js_file_path).tokenList
        #                   if is_token_subtype(token_type, Token.Name)])
        #     after = set([token for (token, token_type) in
        #                  Lexer(temp_files['path_tmp']).tokenList
        #                  if is_token_subtype(token_type, Token.Name)])
        #
        #     if not before == after:
        #         return (js_file_path, False, 'Weird JSNice renaming')
        # except:
        #     cleanup(temp_files)
        #     return (js_file_path, False, 'Lexer fail')

        # Strip comments, replace literals, etc
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(temp_files['path_tmp'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Preprocessor fail')

        # - and another time through uglifyjs pretty print only
        clear = Beautifier()
        ok = clear.run(temp_files['path_tmp'], temp_files['path_tmp_b'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, False, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        ok = ugly.run(temp_files['path_tmp_b'], temp_files['path_tmp_u'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, False, 'Uglifier fail')

        # Num tokens before vs after
        try:
            tok_clear = Lexer(temp_files['path_tmp_b']).tokenList
            tok_ugly = Lexer(temp_files['path_tmp_u']).tokenList
        except:
            cleanup(temp_files)
            return (js_file_path, False, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            cleanup(temp_files)
            return (js_file_path, False, 'Num tokens mismatch')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            # This is already the baseline corpus, no (smart) renaming yet
            aligner.align(temp_files['path_tmp_b'], temp_files['path_tmp_u'])
        except:
            cleanup(temp_files)
            return (js_file_path, False, 'Aligner fail')

        # Check if minification resulted in any change;
        # it's not very interesting otherwise
        if open(temp_files['path_tmp_b_a']).read() == \
                open(temp_files['path_tmp_u_a']).read():
            cleanup(temp_files)
            return (js_file_path, False, 'Not minified')

        try:
            lex_ugly = Lexer(temp_files['path_tmp_u_a'])
            _iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, False, 'IndexBuilder fail')

        # Store original and uglified versions
        ok = clear.run(temp_files['path_tmp_b_a'],
                       os.path.join(output_path, '%s.js' % base_name))
        if not ok:
            cleanup(temp_files)
            cleanupProcessed(base_name)
            return (js_file_path, False, 'Beautifier fail')

        ok = clear.run(temp_files['path_tmp_u_a'],
                       os.path.join(output_path, '%s.u.js' % base_name))
        if not ok:
            cleanup(temp_files)
            cleanupProcessed(base_name)
            return (js_file_path, False, 'Beautifier fail')

        cleanup(temp_files)
        return (js_file_path, True, 'OK')

    except Exception as e:
        cleanup(temp_files)
        return (js_file_path, False, str(e))
def processFile(l):
    js_file_path = l[0]

    if js_file_path in seen:
        return (js_file_path, None, 'Skipped')

    pid = int(multiprocessing.current_process().ident)

    # Temp files to be created during processing
    temp_files = {'path_tmp': 'tmp_%d.js' % pid,
                  'path_tmp_b': 'tmp_%d.b.js' % pid,
                  'path_tmp_b_n': 'tmp_%d.b.n.js' % pid,
                  'path_tmp_u': 'tmp_%d.u.js' % pid,
                  'path_tmp_u_n': 'tmp_%d.u.n.js' % pid,
                  'path_tmp_b_a': 'tmp_%d.b.a.js' % pid,
                  'path_tmp_u_a': 'tmp_%d.u.a.js' % pid}

    try:
        # Strip comments, replace literals, etc
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(temp_files['path_tmp'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout:
        # - once through JSNice without renaming
        # jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename'])
        #
        # (ok, _out, _err) = jsNiceBeautifier.run(temp_files['path_tmp'],
        #                                         temp_files['path_tmp_b_n'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'JSNice Beautifier fail')
        #
        # - and another time through uglifyjs pretty print only
        # clear = Beautifier()
        # ok = clear.run(temp_files['path_tmp_b_n'],
        #                temp_files['path_tmp_b'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'Beautifier fail')
        #
        # JSNice is down!
        clear = Beautifier()
        ok = clear.run(temp_files['path_tmp'], temp_files['path_tmp_b_n'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

        # Normalize
        norm = Normalizer()
        ok = norm.run(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   temp_files['path_tmp_b_n']),
                      False,
                      temp_files['path_tmp_b'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Normalizer fail')

        # Minify
        ugly = Uglifier()
        ok = ugly.run(temp_files['path_tmp_b'], temp_files['path_tmp_u_n'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Uglifier fail')

        # Normalize
        norm = Normalizer()
        ok = norm.run(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   temp_files['path_tmp_u_n']),
                      False,
                      temp_files['path_tmp_u'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Normalizer fail')

        # Num tokens before vs after
        try:
            tok_clear = Lexer(temp_files['path_tmp_b']).tokenList
            tok_ugly = Lexer(temp_files['path_tmp_u']).tokenList
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            cleanup(temp_files)
            return (js_file_path, None, 'Num tokens mismatch')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            # This is already the baseline corpus, no (smart) renaming yet
            aligner.align(temp_files['path_tmp_b'], temp_files['path_tmp_u'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Aligner fail')

        try:
            lex_clear = Lexer(temp_files['path_tmp_b_a'])
            iBuilder_clear = IndexBuilder(lex_clear.tokenList)

            lex_ugly = Lexer(temp_files['path_tmp_u_a'])
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'IndexBuilder fail')

        # Normalize
        norm = Normalizer()
        ok = norm.run(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   temp_files['path_tmp_b']),
                      True,
                      temp_files['path_tmp_u_n'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Normalizer fail')

        try:
            lex_norm = Lexer(temp_files['path_tmp_u_n'])
            iBuilder_norm = IndexBuilder(lex_norm.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'IndexBuilder fail')

        normalized = []
        for line_idx, line in enumerate(iBuilder_norm.tokens):
            normalized.append(' '.join([t for (_tt, t) in line]) + "\n")

        # Compute scoping: name2scope is a dictionary where keys
        # are (name, start_index) tuples and values are scope identifiers.
        # Note: start_index is a flat (unidimensional) index,
        # not a (line_chr_idx, col_chr_idx) index.
        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                temp_files['path_tmp_u_a']))
            # _name2defScope = scopeAnalyst.resolve_scope()
            # _isGlobal = scopeAnalyst.isGlobal
            # _name2useScope = scopeAnalyst.resolve_use_scope()
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'ScopeAnalyst fail')

        orig = []
        no_renaming = []
        for line_idx, line in enumerate(iBuilder_ugly.tokens):
            orig.append(' '.join([t for (_tt, t) in
                                  iBuilder_clear.tokens[line_idx]]) + "\n")
            no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")

        # Simple renaming: disambiguate overloaded names using scope id
        basic_renaming = renameUsingScopeId(scopeAnalyst, iBuilder_ugly)

        # More complicated renaming: collect the context around
        # each name (global variables, API calls, punctuation)
        # and build a hash of the concatenation.
        # hash_renaming = renameUsingHashAllPrec(scopeAnalyst,
        #                                        iBuilder_ugly,
        #                                        debug=True)
        hash_def_one_renaming = renameUsingHashDefLine(scopeAnalyst,
                                                       iBuilder_ugly,
                                                       twoLines=False,
                                                       debug=False)
        hash_def_two_renaming = renameUsingHashDefLine(scopeAnalyst,
                                                       iBuilder_ugly,
                                                       twoLines=True,
                                                       debug=False)

        cleanup(temp_files)
        return (js_file_path,
                orig,
                no_renaming,
                basic_renaming,
                normalized,
                # hash_renaming,
                hash_def_one_renaming,
                hash_def_two_renaming)

    except Exception as e:
        cleanup(temp_files)
        return (js_file_path, None, str(e))
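
# --- A minimal sketch (an illustration, not the project's actual
# implementation) of the idea behind renameUsingHashDefLine: replace each
# local name with a short, deterministic hash of its definition line, so
# identical definition contexts always map to identical surrogate names.
import hashlib

def hashDefLineName(def_line_tokens):
    # def_line_tokens: the token strings on the line defining the name.
    digest = hashlib.sha1(' '.join(def_line_tokens).encode('utf8')).hexdigest()
    return '_%s_' % digest[:6]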
def processFile(js_file_path):
    try:
        # Num tokens before vs after
        try:
            tok1 = Lexer(os.path.join(files_root, 'orig',
                                      js_file_path)).tokenList
            tok2 = Lexer(os.path.join(files_root, 'no_renaming',
                                      js_file_path)).tokenList
            # tok3 = Lexer(os.path.join(files_root, 'basic_renaming',
            #                           js_file_path)).tokenList
            # tok4 = Lexer(os.path.join(files_root, 'normalized',
            #                           js_file_path)).tokenList
            tok5 = Lexer(os.path.join(files_root, 'hash_def_one_renaming',
                                      js_file_path)).tokenList
            tok6 = Lexer(os.path.join(files_root, 'hash_def_two_renaming',
                                      js_file_path)).tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(set([len(tok1), len(tok2), len(tok5), len(tok6)])) == 1:
            return (js_file_path, None, 'Num tokens mismatch')

        clear = Beautifier()

        # Align minified and clear files, in case the beautifier
        # did something weird
        aligner = Aligner()

        (aligned1, aligned2) = aligner.web_align(tok1, tok2)
        (ok, beautified1, _err) = clear.web_run(aligned1)
        tok11 = WebLexer(beautified1).tokenList
        (ok, beautified2, _err) = clear.web_run(aligned2)
        tok22 = WebLexer(beautified2).tokenList

        (aligned5, aligned2) = aligner.web_align(tok5, tok2)
        (ok, beautified5, _err) = clear.web_run(aligned5)
        tok55 = WebLexer(beautified5).tokenList

        (aligned6, aligned2) = aligner.web_align(tok6, tok2)
        (ok, beautified6, _err) = clear.web_run(aligned6)
        tok66 = WebLexer(beautified6).tokenList

        # try:
        #     aligner = Aligner()
        #     # This is already the baseline corpus, no (smart) renaming yet
        #     aligner.align(temp_files['path_tmp_b'],
        #                   temp_files['path_tmp_u'])
        # except:
        #     return (js_file_path, None, 'Aligner fail')

        try:
            iBuilder1 = IndexBuilder(tok11)
            iBuilder2 = IndexBuilder(tok22)
            # iBuilder3 = IndexBuilder(tok3)
            # iBuilder4 = IndexBuilder(tok4)
            iBuilder5 = IndexBuilder(tok55)
            iBuilder6 = IndexBuilder(tok66)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        # Check that at least one variable was renamed during minification
        orig_names = set([token for line in iBuilder1.tokens
                          for (token_type, token) in line
                          if is_token_subtype(token_type, Token.Name)])
        ugly_names = set([token for line in iBuilder2.tokens
                          for (token_type, token) in line
                          if is_token_subtype(token_type, Token.Name)])

        if not len(orig_names.difference(ugly_names)):
            return (js_file_path, None, 'Not minified')

        orig = []
        no_renaming = []
        # basic_renaming = []
        # normalized = []
        hash_def_one_renaming = []
        hash_def_two_renaming = []

        for _line_idx, line in enumerate(iBuilder1.tokens):
            orig.append(' '.join([t for (_tt, t) in line]) + "\n")

        for _line_idx, line in enumerate(iBuilder2.tokens):
            no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")

        # for _line_idx, line in enumerate(iBuilder3.tokens):
        #     basic_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")
        #
        # for _line_idx, line in enumerate(iBuilder4.tokens):
        #     normalized.append(' '.join([t for (_tt, t) in line]) + "\n")

        for _line_idx, line in enumerate(iBuilder5.tokens):
            hash_def_one_renaming.append(
                ' '.join([t for (_tt, t) in line]) + "\n")

        for _line_idx, line in enumerate(iBuilder6.tokens):
            hash_def_two_renaming.append(
                ' '.join([t for (_tt, t) in line]) + "\n")

        return (js_file_path,
                orig,
                no_renaming,
                # basic_renaming,
                # normalized,
                hash_def_one_renaming,
                hash_def_two_renaming)

    except Exception as e:
        return (js_file_path, None, str(e))
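
# --- A small illustrative sketch of the pygments token filtering the
# function above relies on (illustrative values; the pygments API calls are
# real): keep only identifier tokens from a lexed line.
from pygments.token import Token, is_token_subtype

example_line = [(Token.Keyword, 'var'), (Token.Name, 'x'),
                (Token.Operator, '='), (Token.Literal.Number.Integer, '1')]
names = [tok for (tok_type, tok) in example_line
         if is_token_subtype(tok_type, Token.Name)]
# names == ['x']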
def deobfuscateJS(self, obfuscatedCode, use_mix, transactionID,
                  debug_output=False, parallel=True, use_local=True):
    """
    Take a string representing minified javascript code and attempt
    to translate it into a version with better renamings.

    Parameters
    ----------
    obfuscatedCode: the minified javascript text.

    use_mix: True/False -> should we invoke JSNice and throw the names
    into the language model mix?

    transactionID: an ID for storing temp files - used currently only
    to identify the input to JSNice.

    debug_output: True/False -> should we print debugging output in
    this pass?

    parallel: enable parallelization performance enhancements, such as
    calling the moses servers in parallel.

    Returns
    -------
    A tuple:
    renamed_text - the renamed text
    jsnice_error - "" if no error, otherwise a message stating where
        the jsnice mixing failed
    The third element is a tuple of TIMING_COUNT performance times:
        preprocess time - total time to preprocess before invoking
            moses servers
        prepre time - how long the first step of the preprocessor takes
        jsnice time - part of the preprocessing: how long it takes to
            get and parse jsnice names
        renaming time - how long the hashing steps in preprocess took
        lex_total_time - how long all the lexers took
        builder_time - how long all the IndexBuilders took
        scoper_time - how long all the scopeAnalysts took
        moses time - how long the moses servers took
        moses_rn_parallel - total time for the parallel moses and
            renaming to complete
        postprocess time - how long the consistency resolution and
            language model queries took
    """
    RS = RenamingStrategies()
    CS = ConsistencyStrategies()
    r_strategy = RS.HASH_ONE
    # c_strategy = CS.FREQLEN  # or CS.LM? (CS.LM requires a language
    #                          # model + a querylm from moses)
    # c_strategy = CS.LM
    c_strategy = CS.LOGMODEL

    if (use_local == False):
        proxies = MosesProxy().web_proxies
    else:
        proxies = MosesProxy().web_local

    mosesParams = {}

    # lm_path = "/data/bogdanv/deobfuscator/experiments/corpora/corpus.lm.970k/js.blm.lm"
    # lm_path = "/data/bogdanv/deobfuscator/experiments/corpora/corpus.lm.500k/js.blm.lm"
    lm_path = "./phrase-tables/langmodels/js.blm.lm"
    # if socket.gethostname() == 'bogdan.mac':
    #     lm_path = "/Users/bogdanv/workspace2/deobfuscator/data/lm/js.blm.lm"
    # elif socket.gethostname() == "Caseys-MacBook-Pro.local" or \
    #         socket.gethostname() == "campus-019-136.ucdavis.edu":
    #     lm_path = "/Users/caseycas/jsnaughty_lms/js970k.blm.lm"

    # Hashed Name -> Minified Name (name, def_scope) -> (name, def_scope)
    hash_name_map = {}
    # Minified Name -> jsnice name (name, def_scope) -> (name, def_scope)
    jsnice_name_map = {}

    # Record of any errors we get in the js mixing.
    # If this feature is enabled (to be added as a switch on the website)
    # it should not crash the input if there is a failure. If the query
    # doesn't work for some reason, then we should just use the candidate
    # names provided by moses.
    jsnice_errors = []

    start = time.time()

    # Strip comments, replace literals, etc
    try:  # if True:
        prepro = WebLMPreprocessor(obfuscatedCode)
        prepro_text = str(prepro)
        if (debug_output):
            print("Prepro_text----------------------------------")
            print(prepro_text)
            print("Prepro_text----------------------------------")
    except:
        return ((prepro_error, "", (0,) * TIMING_COUNT))

    prepre_end = time.time()
    prepre_time = prepre_end - start

    clear = Beautifier()
    (ok, beautified_text, _err) = clear.web_run(prepro_text)
    if (debug_output):
        print("Beautified Text")
        print(beautified_text)

    if (not ok):
        return ((beaut_error, "", (0,) * TIMING_COUNT))

    # Due to a bug(?) in the jsnice web service, we need to save the
    # input text as a file.
    min_input_file = os.path.join(self.tmpDir, str(transactionID) + ".u.js")
    with open(min_input_file, 'w') as f:
        f.write(beautified_text)

    try:
        # lex_ugly = Lexer(beautFile)
        lex_ugly = WebLexer(beautified_text)
        if (debug_output):
            print("Lex_ugly---------------------")
            print(lex_ugly.tokenList)
            print("Lex_ugly---------------------")
        iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
    except:
        return ((ib_error, "", (0,) * TIMING_COUNT))

    # Do scope-related tasks on a raw text version
    try:
        # scopeAnalyst = ScopeAnalyst(beautFile)
        scopeAnalyst = WebScopeAnalyst(beautified_text)
    except:
        return ((sa_error, "", (0,) * TIMING_COUNT))

    # Cut short if no variables
    if (not scopeAnalyst.hasMinifiableVariables()):
        return ((beautified_text, "No Minifiable Variables",
                 (0,) * TIMING_COUNT))
    elif (debug_output):
        print("GLOBAL VAR MAP: " + str(scopeAnalyst.isGlobal))

    # lex_ugly.write_temp_file(tempFile)

    js_start = time.time()

    ########################
    # Nice2Predict start
    ########################

    # Don't want a crashing failure for the jsnice query.
    # BV: Next block left out until I figure out the pipe issue
    # BV: Update: I couldn't pipe input to N2P. TODO: FIX
    # Run the JSNice from http://www.nice2predict.org
    if (use_mix):
        unuglifyJS = UnuglifyJS()
        (ok, n2p_text, _err) = unuglifyJS.run(min_input_file)
        # ok = False  # Failure test
        if not ok:
            jsnice_errors.append('Nice2Predict fail')
            # return (js_file_path, None, 'Nice2Predict fail')

    if (use_mix and jsnice_errors == []):
        (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
        if not ok:
            jsnice_errors.append('Beautifier failed for JSNice.')
            # return (js_file_path, None, 'Beautifier fail')

        if (debug_output):
            print("JSNice Text")
            print(n2p_text_beautified)

        try:
            n2p_lexer = WebLexer(n2p_text_beautified)
            n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
            n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
        except:
            jsnice_errors.append(
                "IndexBuilder or ScopeAnalyst failed for JSNice.")
            # return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail')

    ########################
    # Nice2Predict End
    ########################

    js_end = time.time()
    js_time = js_end - js_start

    # Do scope-related tasks
    (name_positions,
     position_names,
     use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)

    # Map the jsnice names to the minified counterparts.
    if (use_mix and jsnice_errors == []):
        # Only attempt if we are error free for jsnice up to this point.
        try:
            orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(),
                                    key=lambda x: x[1])
            orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(),
                                    key=lambda x: x[1])

            if (len(orderedVarsNew) != len(orderedVarsN2p)):
                jsnice_errors.append(
                    "JSNice and minified name lists different lengths.")
                # raise IndexError("Length Mismatch")
                # Probably better to have our own defined error type,
                # but this will do for now
                # return ("JsNice and New Name lists different length")

            for i in range(0, len(orderedVarsNew)):
                name_new = orderedVarsNew[i][0]
                def_scope_new = scopeAnalyst.name2defScope[orderedVarsNew[i]]

                name_n2p = orderedVarsN2p[i][0]
                def_scope_n2p = scopeAnalyst.name2defScope[orderedVarsNew[i]]

                jsnice_name_map[(name_new, def_scope_new)] = (name_n2p,
                                                              def_scope_n2p)
        except:
            jsnice_errors.append(
                "JSNice to minified name map building failed.")

    (_name_positions,
     position_names,
     _use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)

    # Note: we want to put these in parallel once we've tested the
    # serial version...
    pre_outer_end = time.time()
    pre_time = pre_outer_end - start

    if (not parallel):
        # Get moses output for no_renaming
        (status, error_msg, translation_default, name_candidates_default,
         iBuilder_default, scopeAnalyst_default, name_positions_default,
         position_names_default, use_scopes_default, hash_name_map_default,
         rn_time_default, m_time_default, lex_time_default,
         post_start_default) = getMosesTranslation(proxies[RS.NONE],
                                                   RS.NONE, RS, clear,
                                                   iBuilder_ugly,
                                                   scopeAnalyst,
                                                   debug_output)
        # print("MOSES NO RENAMING: " + str(m_time_default))
        if (not status):
            return ((error_msg, "", (0,) * TIMING_COUNT))

        # Get moses output for hash_renaming
        (status, error_msg, translation, name_candidates,
         a_iBuilder, a_scopeAnalyst, a_name_positions, a_position_names,
         a_use_scopes, hash_name_map, rn_time, m_time, lex_time,
         post_start) = getMosesTranslation(proxies[r_strategy],
                                           r_strategy, RS, clear,
                                           iBuilder_ugly, scopeAnalyst,
                                           debug_output)
        # print("MOSES HASH RENAMING: " + str(m_time))
        if (not status):
            return ((error_msg, "", (0,) * TIMING_COUNT))

        m_parallel_time = 0
    else:
        # Parallel version
        none_wrapper = (RS.NONE, RS, clear, iBuilder_ugly,
                        scopeAnalyst, debug_output, use_local)
        hash_wrapper = (r_strategy, RS, clear, iBuilder_ugly,
                        scopeAnalyst, debug_output, use_local)
        wrappers = [none_wrapper, hash_wrapper]

        pool = multiprocessing.Pool(processes=2)
        m_parallel_start = time.time()

        for result in pool.imap(getMosesTranslationParallel, wrappers):
            if (result[0] == RS.NONE):
                # No renaming
                (status, error_msg, translation_default,
                 name_candidates_default, iBuilder_default,
                 scopeAnalyst_default, name_positions_default,
                 position_names_default, use_scopes_default,
                 hash_name_map_default, rn_time_default, m_time_default,
                 lex_time_default, post_start_default) = result[1]
                # print("MOSES NO RENAMING: " + str(m_time_default))
                if (not status):
                    return ((error_msg, "", (0,) * TIMING_COUNT))
            else:
                (status, error_msg, translation, name_candidates,
                 a_iBuilder, a_scopeAnalyst, a_name_positions,
                 a_position_names, a_use_scopes, hash_name_map,
                 rn_time, m_time, lex_time, post_start) = result[1]
                # print("MOSES HASH RENAMING: " + str(m_time))
                if (not status):
                    return ((error_msg, "", (0,) * TIMING_COUNT))

        m_parallel_time = time.time() - m_parallel_start

    pre_time += rn_time_default + rn_time
    if (debug_output):
        print("Serial: " + str(m_time + m_time_default +
                               rn_time + rn_time_default))
        print("Parallel: " + str(m_parallel_time))

    if translation is not None and translation_default is not None:
        for key_default, suggestions in \
                name_candidates_default.iteritems():
            # (name_default, def_scope_default) = key_default
            pos_default = scopeAnalyst_default.nameDefScope2pos[key_default]
            (lin, col) = iBuilder_default.revFlatMat[pos_default]
            (line_num, line_idx) = iBuilder_default.revTokMap[(lin, col)]
            (name, def_scope) = a_position_names[line_num][line_idx]
            key = (name, def_scope)

            for name_translation, lines in suggestions.iteritems():
                name_candidates.setdefault(key, {})
                name_candidates[key].setdefault(name_translation, set([]))
                name_candidates[key][name_translation].update(lines)

    # name_candidates is a dictionary of dictionaries:
    # keys are (name, None) (if scopeAnalyst=None) or
    # (name, def_scope) tuples (otherwise);
    # values are suggested translations with the sets
    # of line numbers on which they appear.

    # if(True):
    if (debug_output):
        print("Name_candidates")
        print(name_candidates)
        print("jsnice_name_map")
        print(jsnice_name_map)
        print("hash_name_map")
        print(hash_name_map)

    # **** BV: This might be all we need to combine Naughty & Nice
    if (use_mix and jsnice_errors == []):
        # Only attempt if we are error free for jsnice up to this point.
        try:
            name_candidates_copy = deepcopy(name_candidates)
            for key, suggestions in name_candidates_copy.iteritems():
                if (debug_output):
                    print("Key: " + str(key))
                    print("Suggestions: " + str(suggestions))
                if r_strategy == RS.NONE:
                    (name_n2p, def_scope_n2p) = jsnice_name_map[key]
                else:
                    (name_n2p, def_scope_n2p) = \
                        jsnice_name_map[hash_name_map.get(key, key)]

                for name_translation, lines in suggestions.iteritems():
                    name_candidates.setdefault(key, {})
                    name_candidates[key].setdefault(name_n2p, set([]))
                    name_candidates[key][name_n2p].update(lines)
        except:
            jsnice_errors.append(
                "Failure while adding jsnice names to candidate pool.")

    cr = ConsistencyController(debug_mode=debug_output)

    # An identifier may have been translated inconsistently
    # across different lines (Moses treats each line independently).
    # Try different strategies to resolve inconsistencies, if any.
    # Compute renaming map (x -> length, y -> width, ...)
    # Note that x, y here are names after renaming.
    # A hash error is occurring in here.
    try:
        (temp_renaming_map, seen) = cr.computeRenaming(c_strategy,
                                                       name_candidates,
                                                       a_name_positions,
                                                       a_use_scopes,
                                                       a_iBuilder,
                                                       lm_path,
                                                       {},
                                                       hash_name_map)
    except:
        return ("Compute renaming fail.", "", (0,) * TIMING_COUNT)

    if (debug_output):
        print("Temp renaming map")
        print(temp_renaming_map)

    # Fall back on original names in input, if
    # no translation was suggested.
    postRen = PostRenamer()
    renaming_map = postRen.updateRenamingMap(a_name_positions,
                                             position_names,
                                             a_use_scopes,
                                             temp_renaming_map,
                                             seen,
                                             r_strategy)
    if (debug_output):
        print("Renaming Map")
        print(renaming_map)

    # Apply renaming map and save output for future inspection
    renamed_text = postRen.applyRenaming(a_iBuilder,
                                         a_name_positions,
                                         renaming_map)
    (ok, beautified_renamed_text, _err) = clear.web_run_end(renamed_text)
    # print(name_candidates)
    # print("--------------")
    # print(renamed_text)
    # print("--------------")
    # print(beautified_renamed_text)
    # print("--------------")
    # print(" ".join(jsnice_errors))
    if not ok:
        return ((beaut_error, "", (0,) * TIMING_COUNT))

    if (debug_output):
        print("Renamed text")
        print(beautified_renamed_text)

    # Time calculations (will need to update for when it becomes parallel)
    post_end = time.time()
    post_time = post_end - post_start

    # Record any jsnice errors (but leave output blank if there are none).
    jsnice_error_string = ""
    if (jsnice_errors != []):
        jsnice_error_string = ("JSNice mixing attempt failed. Reporting "
                               "renaming with only our method. \n"
                               "JSNice Errors : \n")
        jsnice_error_string += "\n".join(jsnice_errors) + "\n"

    # Tally up the build times for the lexers, index builders and scopers.
    if (not use_mix):
        n2pLexTime = 0
        n2pBuildTime = 0
        n2pSATime = 0
    else:
        n2pLexTime = n2p_lexer.build_time
        n2pBuildTime = n2p_iBuilder.build_time
        n2pSATime = n2p_scopeAnalyst.build_time

    # Lexers
    lex_total_time = (lex_time + lex_time_default +
                      lex_ugly.build_time + n2pLexTime)
    # IndexBuilders
    builder_time = (iBuilder_ugly.build_time + n2pBuildTime +
                    a_iBuilder.build_time + iBuilder_default.build_time)
    # Scopers
    scoper_time = (n2pSATime + scopeAnalyst.build_time +
                   scopeAnalyst_default.build_time +
                   a_scopeAnalyst.build_time)

    # Change the presentation of this to return performance information
    # and error codes as separate elements in a tuple.
    # New return: translation, jsnice_error, preprocess time, js_time,
    # rename_time, m_time, post_time.
    return ((str(beautified_renamed_text),
             jsnice_error_string,
             (pre_time, prepre_time, js_time,
              rn_time + rn_time_default,
              lex_total_time, builder_time, scoper_time,
              m_time + m_time_default, m_parallel_time, post_time)))
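
# --- A short usage sketch (hypothetical caller; 'server' and 'minified_js'
# are placeholder names) showing how the deobfuscateJS return value unpacks.
# The third element is the timing tuple described in the docstring above,
# presumably with TIMING_COUNT == 10 entries.
(renamed, jsnice_err, timings) = server.deobfuscateJS(minified_js,
                                                      use_mix=False,
                                                      transactionID=42)
(pre_t, prepre_t, js_t, rn_t, lex_t,
 build_t, scope_t, moses_t, moses_par_t, post_t) = timings
print(renamed)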
def processFile(l):
    js_file_path = l[0]

    # if True:
    try:
        js_text = open(os.path.join(corpus_root, js_file_path), 'r').read()

        # Strip comments, replace literals, etc
        try:
            prepro = WebLMPreprocessor(js_text)
            prepro_text = str(prepro)
        except:
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text)
        if not ok:
            return (js_file_path, None, 'Uglifier fail')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            (aligned_clear, aligned_minified) = aligner.web_align(
                WebLexer(tmp_beautified_text).tokenList,
                WebLexer(tmp_minified_text).tokenList)
        except:
            return (js_file_path, None, 'Aligner fail')

        # Pass through beautifier to fix layout
        (ok, beautified_text, _err) = clear.web_run(aligned_clear)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        (ok, minified_text, _err) = clear.web_run(aligned_minified)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Num tokens before vs after
        try:
            lex_clear = WebLexer(beautified_text)
            tok_clear = lex_clear.tokenList
            lex_ugly = WebLexer(minified_text)
            tok_ugly = lex_ugly.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            return (js_file_path, None, 'Num tokens mismatch')

        if beautified_text == minified_text:
            return (js_file_path, None, 'Not minified')

        try:
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        try:
            scopeAnalyst = WebScopeAnalyst(minified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst fail')

        processed = []

        # Try different renaming strategies (hash, etc)
        for r_strategy in RS.all():
            try:  # if True:
                # Rename input prior to translation
                preRen = PreRenamer()
                after_text = preRen.rename(r_strategy,
                                           iBuilder_ugly,
                                           scopeAnalyst)

                (ok, beautified_after_text, _err) = clear.web_run(after_text)
                if not ok:
                    return (js_file_path, None, 'Beautifier fail')

                processed.append((r_strategy, beautified_after_text))
            except:
                return (js_file_path, None, 'Renaming fail')

        with open(os.path.join(output_path, 'orig', js_file_path), 'w') as f:
            f.write(beautified_text)

        for (r_strategy, text) in processed:
            with open(os.path.join(output_path, r_strategy,
                                   js_file_path), 'w') as f:
                f.write(text)

        return (js_file_path, 'OK', None)

    except Exception as e:
        return (js_file_path, None, str(e).replace("\n", ""))
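
# --- A minimal sketch (an assumption inferred from usage, not the project's
# actual class) of the RenamingStrategies interface that the RS.all() loops
# above rely on: named strategy constants plus an all() enumeration. The
# constant values mirror the corpus directory names seen elsewhere in this
# codebase.
class RenamingStrategies(object):
    NONE = 'no_renaming'
    HASH_ONE = 'hash_def_one_renaming'
    HASH_TWO = 'hash_def_two_renaming'

    def all(self):
        # NONE comes first; later code treats it as the default pass.
        return [self.NONE, self.HASH_ONE, self.HASH_TWO]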
def processFile(l):
    js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]
    pid = int(multiprocessing.current_process().ident)

    temp_files = {'path_tmp': 'tmp_%d.js' % pid,
                  'path_tmp_b': 'tmp_%d.b.js' % pid,
                  'path_tmp_b_1': 'tmp_%d.b.1.js' % pid,
                  'path_tmp_b_2': 'tmp_%d.b.2.js' % pid,
                  'path_tmp_b_a': 'tmp_%d.b.a.js' % pid,
                  'path_tmp_u': 'tmp_%d.u.js' % pid,
                  'path_tmp_u_a': 'tmp_%d.u.a.js' % pid,
                  'path_tmp_unugly': 'tmp_%d.n2p.js' % pid,
                  'path_tmp_unugly_1': 'tmp_%d.n2p.1.js' % pid,
                  'path_tmp_unugly_2': 'tmp_%d.n2p.2.js' % pid,
                  'path_tmp_jsnice': 'tmp_%d.jsnice.js' % pid,
                  'f2': 'tmp_%d.no_renaming.js' % pid,
                  # 'f3': 'tmp_%d.basic_renaming.js' % pid,
                  # 'f4': 'tmp_%d.hash_renaming.js' % pid,
                  'f5': 'tmp_%d.hash_def_one_renaming.js' % pid,
                  # 'f6': 'tmp_%d.hash_def_two_renaming.js' % pid,
                  'f7': 'tmp_%d.hash_def_one_renaming_fb.js' % pid,
                  'path_orig': os.path.join(output_path,
                                            '%s.js' % base_name),
                  'path_ugly': os.path.join(output_path,
                                            '%s.u.js' % base_name),
                  'path_unugly': os.path.join(output_path,
                                              '%s.n2p.js' % base_name),
                  'path_jsnice': os.path.join(output_path,
                                              '%s.jsnice.js' % base_name)}

    # for strategy in ['js', 'lm.js', 'len.js', 'freqlen.js']:
    #     for renaming in ['no_renaming', 'hash_def_one_renaming']:
    #         temp_files['path_tmp_%s_%s' % (renaming, strategy)] = \
    #             'tmp_%d.%s.%s' % (pid, renaming, strategy)

    candidates = []

    # if True:
    try:
        # Strip comments, replace literals, etc
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(temp_files['path_tmp'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        ok = clear.run(temp_files['path_tmp'], temp_files['path_tmp_b'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

        # # Pass through beautifier to fix layout
        # clear = Beautifier()
        # ok = clear.run(temp_files['path_tmp'],
        #                temp_files['path_tmp_b_1'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'Beautifier fail')
        #
        # jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename'])
        #
        # (ok, _out, _err) = jsNiceBeautifier.run(temp_files['path_tmp_b_1'],
        #                                         temp_files['path_tmp_b_2'])
        # if not ok:
        #     cleanup(temp_files)
        #     print js_file_path, _err
        #     return (js_file_path, None, 'JSNice Beautifier fail')
        #
        # ok = clear.run(temp_files['path_tmp_b_2'],
        #                temp_files['path_tmp_b'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'Beautifier fail')
        #
        # # Weird JSNice renamings despite --no-rename
        # try:
        #     before = set([token for (token, token_type) in
        #                   Lexer(temp_files['path_tmp_b_1']).tokenList
        #                   if is_token_subtype(token_type, Token.Name)])
        #     after = set([token for (token, token_type) in
        #                  Lexer(temp_files['path_tmp_b']).tokenList
        #                  if is_token_subtype(token_type, Token.Name)])
        #
        #     if not before == after:
        #         return (js_file_path, None, 'Weird JSNice renaming')
        # except:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'Lexer fail')

        # Minify
        ugly = Uglifier()
        ok = ugly.run(temp_files['path_tmp_b'], temp_files['path_tmp_u'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Uglifier fail')

        # Num tokens before vs after
        try:
            tok_clear = Lexer(temp_files['path_tmp_b']).tokenList
            tok_ugly = Lexer(temp_files['path_tmp_u']).tokenList
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            cleanup(temp_files)
            return (js_file_path, None, 'Num tokens mismatch')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            # This is already the baseline corpus, no (smart) renaming yet
            aligner.align(temp_files['path_tmp_b'], temp_files['path_tmp_u'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Aligner fail')

        if open(temp_files['path_tmp_b']).read() == \
                open(temp_files['path_tmp_u']).read():
            cleanup(temp_files)
            return (js_file_path, None, 'Not minified')

        try:
            lex_ugly = Lexer(temp_files['path_tmp_u_a'])
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'IndexBuilder fail')

        ############################################################
        # From now on only work with path_tmp_b_a and path_tmp_u_a
        ############################################################

        # Store original and uglified versions
        ok = clear.run(temp_files['path_tmp_b_a'], temp_files['path_orig'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

        ok = clear.run(temp_files['path_tmp_u_a'], temp_files['path_ugly'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, _out, _err) = unuglifyJS.run(temp_files['path_tmp_u_a'],
                                          temp_files['path_tmp_unugly'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Nice2Predict fail')

        ok = clear.run(temp_files['path_tmp_unugly'],
                       temp_files['path_unugly'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

        # ok = clear.run(temp_files['path_tmp_unugly'],
        #                temp_files['path_tmp_unugly_1'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'Beautifier fail')
        #
        # (ok, _out, _err) = jsNiceBeautifier.run(
        #     temp_files['path_tmp_unugly_1'],
        #     temp_files['path_tmp_unugly_2'])
        # if not ok:
        #     cleanup(temp_files)
        #     print js_file_path, _err
        #     return (js_file_path, None, 'JSNice Beautifier fail')
        #
        # ok = clear.run(temp_files['path_tmp_unugly_2'],
        #                temp_files['path_unugly'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'Beautifier fail')

        try:
            lexer = Lexer(temp_files['path_unugly'])
            iBuilder = IndexBuilder(lexer.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'IndexBuilder fail')

        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                temp_files['path_unugly']))
            nameOrigin = scopeAnalyst.nameOrigin
            isGlobal = scopeAnalyst.isGlobal

            for (name, def_scope) in nameOrigin.iterkeys():

                pos = scopeAnalyst.nameDefScope2pos[(name, def_scope)]
                (lin, col) = iBuilder.revFlatMat[pos]
                (tok_lin, tok_col) = iBuilder.revTokMap[(lin, col)]

                candidates.append(('Nice2Predict', def_scope,
                                   tok_lin, tok_col,
                                   isGlobal.get((name, pos), True),
                                   name, '', ''))
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'ScopeAnalyst fail')

        # # Run the JSNice from http://www.jsnice.org
        # jsNice = JSNice()
        # (ok, _out, _err) = jsNice.run(temp_files['path_tmp_u_a'],
        #                               temp_files['path_tmp_jsnice'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'JSNice fail')
        #
        # ok = clear.run(temp_files['path_tmp_jsnice'],
        #                temp_files['path_jsnice'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'Beautifier fail')
        #
        # try:
        #     lexer = Lexer(temp_files['path_jsnice'])
        #     iBuilder = IndexBuilder(lexer.tokenList)
        # except:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'IndexBuilder fail')
        #
        # try:
        #     scopeAnalyst = ScopeAnalyst(os.path.join(
        #         os.path.dirname(os.path.realpath(__file__)),
        #         temp_files['path_jsnice']))
        #     nameOrigin = scopeAnalyst.nameOrigin
        #     isGlobal = scopeAnalyst.isGlobal
        #
        #     for (name, def_scope) in nameOrigin.iterkeys():
        #
        #         pos = scopeAnalyst.nameDefScope2pos[(name, def_scope)]
        #         (lin, col) = iBuilder.revFlatMat[pos]
        #         (tok_lin, tok_col) = iBuilder.revTokMap[(lin, col)]
        #
        #         candidates.append(('JSNice', def_scope,
        #                            tok_lin, tok_col,
        #                            isGlobal.get((name, pos), True),
        #                            name, '', ''))
        # except:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'ScopeAnalyst fail')

        # Compute scoping: name2scope is a dictionary where keys
        # are (name, start_index) tuples and values are scope identifiers.
        # Note: start_index is a flat (unidimensional) index,
        # not a (line_chr_idx, col_chr_idx) index.
        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                temp_files['path_tmp_u_a']))
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'ScopeAnalyst fail')

        # Baseline translation: No renaming, no scoping
        no_renaming = []
        for _line_idx, line in enumerate(iBuilder_ugly.tokens):
            no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")

        with open(temp_files['f2'], 'w') as f_no_renaming:
            f_no_renaming.writelines(no_renaming)

        moses = MosesDecoder(ini_path=os.path.join(ini_path,
                                                   'train.no_renaming',
                                                   'tuning', 'moses.ini'))
        (_moses_ok,
         translation_no_renaming,
         _err) = moses.run(temp_files['f2'])

        nc = processTranslationUnscoped(translation_no_renaming,
                                        iBuilder_ugly,
                                        lm_path,
                                        temp_files['f2'],
                                        output_path,
                                        base_name)
        if nc:
            candidates += nc

        # Arguments: translation, iBuilder, lm_path,
        #            f_path, output_path, base_name

        # Default translation: No renaming
        # no_renaming = []
        # for _line_idx, line in enumerate(iBuilder_ugly.tokens):
        #     no_renaming.append(' '.join([t for (_tt,t) in line]) + "\n")
        #
        # with open(temp_files['f2'], 'w') as f_no_renaming:
        #     f_no_renaming.writelines(no_renaming)
        #
        # moses = MosesDecoder(ini_path=os.path.join(ini_path,
        #     'train.no_renaming', 'tuning', 'moses.ini'))
        # (_moses_ok, translation, _err) = moses.run(temp_files['f2'])

        nc = processTranslationScoped(translation_no_renaming,
                                      iBuilder_ugly,
                                      scopeAnalyst,
                                      lm_path,
                                      temp_files['f2'],
                                      output_path,
                                      base_name)
        if nc:
            candidates += nc

        # More complicated renaming: collect the context around
        # each name (global variables, API calls, punctuation)
        # and build a hash of the concatenation.
        hash_def_one_renaming = renameUsingHashDefLine(scopeAnalyst,
                                                       iBuilder_ugly,
                                                       twoLines=False,
                                                       debug=False)
        with open(temp_files['f5'], 'w') as f_hash_def_one_renaming:
            f_hash_def_one_renaming.writelines(hash_def_one_renaming)

        # moses = MosesDecoder(ini_path=os.path.join(ini_path,
        #     'train.hash_def_one_renaming', 'tuning', 'moses.ini'))
        # (_moses_ok,
        #  translation_hash_renaming,
        #  _err) = moses.run(temp_files['f5'])

        mosesParams = {}
        mosesParams["text"] = hash_def_one_renaming  # lex_ugly.collapsedText
        # mosesParams["align"] = "true"
        # mosesParams["report-all-factors"] = "true"

        mresults = proxy.translate(mosesParams)
        # __request("translate", mosesParams)
        rawText = Postprocessor(mresults["nbest"])
        translation_hash_renaming = rawText.getProcessedOutput()

        nc = processTranslationScoped(translation_hash_renaming,
                                      iBuilder_ugly,
                                      scopeAnalyst,
                                      lm_path,
                                      temp_files['f5'],
                                      output_path,
                                      base_name)
        if nc:
            candidates += nc

        # nc = processTranslationScopedFallback(translation_hash_renaming,
        #                                       translation_no_renaming,
        #                                       iBuilder_ugly,
        #                                       scopeAnalyst,
        #                                       lm_path,
        #                                       temp_files['f7'],
        #                                       output_path,
        #                                       base_name)
        # if nc:
        #     candidates += nc

        cleanup(temp_files)
        cleanupRenamed(pid)
        return (js_file_path, 'OK', candidates)

    except Exception as e:
        cleanup(temp_files)
        cleanupRenamed(pid)
        return (js_file_path, None, str(e).replace("\n", ""))
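
# --- A minimal sketch of the 'proxy' global used above (an assumption,
# grounded in the xmlrpclib usage elsewhere in this codebase): an XML-RPC
# client for a Moses translation server; the URL here is hypothetical.
import xmlrpclib

proxy = xmlrpclib.ServerProxy("http://localhost:8080/RPC2")
# proxy.translate({"text": ...}) returns a dict whose "nbest" entry is fed
# to Postprocessor, as in processFile above.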
def processFile(l):
    js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]

    temp_files = {'orig': '%s.js' % base_name,
                  'minified': '%s.u.js' % base_name,
                  'n2p': '%s.n2p.js' % base_name}

    for r_strategy in RS.all():
        temp_files['%s' % (r_strategy)] = \
            '%s.%s.js' % (base_name, r_strategy)
        for c_strategy in CS.all():
            temp_files['%s_%s' % (r_strategy, c_strategy)] = \
                '%s.%s.%s.js' % (base_name, r_strategy, c_strategy)

    for k, v in temp_files.iteritems():
        temp_files[k] = os.path.join(output_path, v)

    candidates = []

    # Minified Name -> Original Name (name, def_scope) -> (name, def_scope)
    min_name_map = {}
    # Hashed Name -> Minified Name (name, def_scope) -> (name, def_scope)
    hash_name_map = {}
    # Minified Name -> jsnice name (name, def_scope) -> (name, def_scope)
    jsnice_name_map = {}

    # Output lines for the suggestion_model.csv
    model_rows = []

    try:
        js_text = open(os.path.join(corpus_root, js_file_path), 'r').read()

        # Strip comments, replace literals, etc
        try:
            prepro = WebLMPreprocessor(js_text)
            prepro_text = str(prepro)
        except:
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text)
        if not ok:
            return (js_file_path, None, 'Uglifier fail')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            (aligned_clear, aligned_minified) = aligner.web_align(
                WebLexer(tmp_beautified_text).tokenList,
                WebLexer(tmp_minified_text).tokenList)
        except:
            return (js_file_path, None, 'Aligner fail')

        # Pass through beautifier to fix layout
        (ok, beautified_text, _err) = clear.web_run(aligned_clear)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        (ok, minified_text, _err) = clear.web_run(aligned_minified)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Num tokens before vs after
        try:
            lex_clear = WebLexer(beautified_text)
            tok_clear = lex_clear.tokenList
            lex_ugly = WebLexer(minified_text)
            tok_ugly = lex_ugly.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            return (js_file_path, None, 'Num tokens mismatch')

        if beautified_text == minified_text:
            return (js_file_path, None, 'Not minified')

        # try:
        #     iBuilder_clear = IndexBuilder(lex_clear.tokenList)
        # except:
        #     return (js_file_path, None,
        #             "IndexBuilder fail on original file.")

        try:
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        with open(temp_files['orig'], 'w') as f:
            f.write(beautified_text)

        with open(temp_files['minified'], 'w') as f:
            f.write(minified_text)

        # try:
        #     orig_lexer = WebLexer(beautified_text)
        #     orig_iBuilder = IndexBuilder(orig_lexer.tokenList)
        #     orig_scopeAnalyst = WebScopeAnalyst(beautified_text)
        # except:
        #     return (js_file_path, None,
        #             'IndexBuilder/Scoper fail on original')

        ########################
        # Nice2Predict
        ########################

        # BV: Next block left out until I figure out the pipe issue
        # BV: Update: I couldn't pipe input to N2P. TODO: FIX
        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, n2p_text, _err) = unuglifyJS.run(temp_files['minified'])
        if not ok:
            return (js_file_path, None, 'Nice2Predict fail')

        (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        with open(temp_files['n2p'], 'w') as f:
            f.write(n2p_text_beautified)

        try:
            n2p_lexer = WebLexer(n2p_text_beautified)
            n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
            n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
        except:
            return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail')

        # Save some translation stats to compare different methods
        ts = TranslationSummarizer()
        candidates += [['n2p', ''] + x
                       for x in ts.compute_summary_unscoped(n2p_iBuilder,
                                                            n2p_scopeAnalyst)]

        ################################################
        # All other JSNaughty variants
        ################################################

        try:
            scopeAnalyst = WebScopeAnalyst(minified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst minified fail')

        try:
            scopeAnalyst_clear = WebScopeAnalyst(beautified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst clear fail')

        # if (not check(iBuilder_clear, scopeAnalyst_clear,
        #               n2p_iBuilder, n2p_scopeAnalyst)):
        #     return (js_file_path, None,
        #             'JsNice restructured file. Skipping..')

        # Map the original names to the minified counterparts.
        orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsOld = sorted(scopeAnalyst_clear.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])

        if (len(orderedVarsOld) != len(orderedVarsNew)):
            return (js_file_path, None,
                    "Old and New Name lists different length")

        if (len(orderedVarsOld) != len(orderedVarsN2p)):
            return (js_file_path, None,
                    "JsNice and Old Name lists different length")

        for i in range(0, len(orderedVarsOld)):
            name_old = orderedVarsOld[i][0]
            def_scope_old = scopeAnalyst_clear.name2defScope[orderedVarsOld[i]]

            name_new = orderedVarsNew[i][0]
            def_scope_new = scopeAnalyst.name2defScope[orderedVarsNew[i]]

            min_name_map[(name_new, def_scope_new)] = (name_old,
                                                       def_scope_old)

            name_n2p = orderedVarsN2p[i][0]
            def_scope_n2p = scopeAnalyst.name2defScope[orderedVarsNew[i]]

            jsnice_name_map[(name_new, def_scope_new)] = (name_n2p,
                                                          def_scope_n2p)

        # Once we have the scopeAnalyst, iBuilder, and tokenlist for the
        # minified version, we can get the name properties
        # vm = VariableMetrics(scopeAnalyst, iBuilder_ugly,
        #                      lex_ugly.tokenList)
        # variableKeySet = vm.getVariables()
        # for variableKey in variableKeySet:
        #     name_features[variableKey] = vm.getNameMetrics(variableKey)

        (_name_positions,
         position_names,
         _use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)

        # Try different renaming strategies (hash, etc)
        for r_strategy, proxy in proxies:

            # Rename input prior to translation
            preRen = PreRenamer()
            after_text = preRen.rename(r_strategy,
                                       iBuilder_ugly,
                                       scopeAnalyst)

            (ok, beautified_after_text, _err) = clear.web_run(after_text)
            if not ok:
                return (js_file_path, None, 'Beautifier fail')

            # Save renamed input to disk for future inspection
            with open(temp_files['%s' % (r_strategy)], 'w') as f:
                f.write(beautified_after_text)

            a_lexer = WebLexer(beautified_after_text)
            a_iBuilder = IndexBuilder(a_lexer.tokenList)
            a_scopeAnalyst = WebScopeAnalyst(beautified_after_text)

            if (r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):
                # try:
                #     scopeAnalyst_hash = WebScopeAnalyst(
                #         beautified_after_text)
                #     # This should be beautified_after_text
                #     # instead of after_text
                # except:
                #     return (js_file_path, None, "ScopeAnalyst hash fail")

                # Map the hashed names to the minified counterparts.
                orderedVarsMin = sorted(scopeAnalyst.name2defScope.keys(),
                                        key=lambda x: x[1])
                orderedVarsHash = sorted(a_scopeAnalyst.name2defScope.keys(),
                                         key=lambda x: x[1])

                if (len(orderedVarsMin) != len(orderedVarsHash)):
                    return (js_file_path, None,
                            "Hash and Min lists different length")

                for i in range(0, len(orderedVarsHash)):
                    name_hash = orderedVarsHash[i][0]
                    def_scope_hash = a_scopeAnalyst.name2defScope[
                        orderedVarsHash[i]]

                    name_min = orderedVarsMin[i][0]
                    def_scope_min = scopeAnalyst.name2defScope[
                        orderedVarsMin[i]]

                    hash_name_map[(name_hash, def_scope_hash)] = \
                        (name_min, def_scope_min)

            # We can switch this back once we train models on a corpus
            # with literals
            # lx = WebLexer(a_iBuilder.get_text())
            lx = WebLexer(a_iBuilder.get_text_wo_literals())

            # Translate renamed input
            md = WebMosesDecoder(proxy)
            (ok, translation, _err) = md.run(lx.collapsedText)
            if not ok:
                return (js_file_path, None, 'Moses translation fail')

            (a_name_positions,
             a_position_names,
             a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

            nc = []
            if translation is not None:

                # Parse moses output
                mp = MosesParser()
                name_candidates = mp.parse(translation,
                                           a_iBuilder,
                                           a_position_names)

                # name_candidates is a dictionary of dictionaries:
                # keys are (name, def_scope) tuples;
                # values are suggested translations with the sets
                # of line numbers on which they appear.

                # Update name_candidates with some default values
                # (in this case the translation without any renaming)
                # if the translation is empty
                if r_strategy == RS.NONE:
                    # RS.NONE should always be first, by construction
                    name_candidates_default = name_candidates
                    scopeAnalyst_default = a_scopeAnalyst
                    iBuilder_default = a_iBuilder
                else:
                    for key_default, suggestions in \
                            name_candidates_default.iteritems():
                        # (name_default, def_scope_default) = key_default
                        pos_default = scopeAnalyst_default.nameDefScope2pos[
                            key_default]
                        (lin, col) = iBuilder_default.revFlatMat[pos_default]
                        (line_num, line_idx) = \
                            iBuilder_default.revTokMap[(lin, col)]
                        (name, def_scope) = \
                            a_position_names[line_num][line_idx]
                        key = (name, def_scope)

                        for name_translation, lines in \
                                suggestions.iteritems():
                            name_candidates.setdefault(key, {})
                            name_candidates[key].setdefault(name_translation,
                                                            set([]))
                            name_candidates[key][name_translation].update(
                                lines)

                # **** BV: This might be all we need to combine
                # Naughty & Nice
                name_candidates_copy = deepcopy(name_candidates)
                for key, suggestions in name_candidates_copy.iteritems():
                    if r_strategy == RS.NONE:
                        (name_n2p, def_scope_n2p) = jsnice_name_map[key]
                    else:
                        (name_n2p, def_scope_n2p) = \
                            jsnice_name_map[hash_name_map.get(key, key)]

                    for name_translation, lines in suggestions.iteritems():
                        name_candidates.setdefault(key, {})
                        name_candidates[key].setdefault(name_n2p, set([]))
                        name_candidates[key][name_n2p].update(lines)

                cc = ConsistencyController(debug_mode=False)
                ts = TranslationSummarizer()

                # An identifier may have been translated inconsistently
                # across different lines (Moses treats each line
                # independently). Try different strategies to resolve
                # inconsistencies, if any.
                for c_strategy in CS.all():

                    # Compute renaming map (x -> length, y -> width, ...)
                    # Note that x, y here are names after (hash) renaming
                    (temp_renaming_map, seen) = \
                        cc.computeRenaming(c_strategy,
                                           name_candidates,
                                           a_name_positions,
                                           a_use_scopes,
                                           a_iBuilder,
                                           lm_path,
                                           {},
                                           hash_name_map)

                    # After computeRenaming, we have both the entropies
                    # stored if we are in LMDrop strategy and have the
                    # suggestions frequency from name_candidates.
                    # Fill in suggestion_features:
                    # if (c_strategy == CS.LMDROP and
                    #         r_strategy not in suggestion_features):
                    #     assert(cc.suggestion_cache != None)
                    #     suggestion_features[r_strategy] = {}
                    #     """
                    #     name_candidates: dict
                    #     name_candidates[(name, def_scope)][name_translation]
                    #         = set of line numbers in the translation
                    #     """
                    #     for variableKey, suggestionDictionary in \
                    #             name_candidates.iteritems():
                    #         for suggestionName, linesSuggested in \
                    #                 suggestionDictionary.iteritems():
                    #
                    #             # I need to revert variableKey[0] in the
                    #             # suggestion from its hash to its
                    #             # original minified name.
                    #             if (r_strategy == RS.HASH_ONE or
                    #                     r_strategy == RS.HASH_TWO):
                    #                 unhashedKey = hash_name_map[variableKey]
                    #                 suggestionKey = (unhashedKey[0],
                    #                                  unhashedKey[1],
                    #                                  suggestionName)
                    #             else:
                    #                 suggestionKey = (variableKey[0],
                    #                                  variableKey[1],
                    #                                  suggestionName)
                    #
                    #             entropyVals = \
                    #                 cc.suggestion_cache.getEntropyStats(
                    #                     variableKey, suggestionName)
                    #             if (entropyVals != (ENTROPY_ERR,
                    #                                 ENTROPY_ERR,
                    #                                 ENTROPY_ERR,
                    #                                 ENTROPY_ERR)):
                    #                 suggestionValue = \
                    #                     [len(linesSuggested)] + \
                    #                     list(getSuggestionStats(
                    #                         suggestionName)) + \
                    #                     list(entropyVals)
                    #
                    #                 suggestion_features[r_strategy][
                    #                     suggestionKey] = suggestionValue

                    # Fall back on original names in input, if
                    # no translation was suggested
                    postRen = PostRenamer()
                    renaming_map = postRen.updateRenamingMap(
                        a_name_positions, position_names, a_use_scopes,
                        temp_renaming_map, seen, r_strategy)

                    # Apply renaming map and save output for future
                    # inspection
                    renamed_text = postRen.applyRenaming(a_iBuilder,
                                                         a_name_positions,
                                                         renaming_map)
                    (ok, beautified_renamed_text, _err) = \
                        clear.web_run(renamed_text)
                    if not ok:
                        return (js_file_path, None, 'Beautifier fail')

                    with open(temp_files['%s_%s' % (r_strategy,
                                                    c_strategy)], 'w') as f:
                        f.write(beautified_renamed_text)

                    # Save some stats about which names were renamed to
                    # what. This is what enables the comparison between
                    # the different methods.
                    r = [[c_strategy] + x
                         for x in ts.compute_summary_scoped(renaming_map,
                                                            name_candidates,
                                                            a_iBuilder,
                                                            a_scopeAnalyst)]
                    if not r:
                        return (js_file_path, None, 'Compute summary failed')
                    nc += r

            if nc:
                candidates += [[r_strategy] + x for x in nc]

        # Create the rows for the suggestion_model.csv
        # for r_strategy in RS.all():
        #     for suggestionKey, s_feat in \
        #             suggestion_features[r_strategy].iteritems():
        #         variableKey = (suggestionKey[0], suggestionKey[1])
        #         original_name = min_name_map[variableKey][0]
        #         js_nice_name = jsnice_name_map[variableKey][0]
        #         n_feat = list(name_features[variableKey])
        #         # Convert the def_scope to an equivalent, but smaller,
        #         # easier to read key: (line_num, token_num)
        #         newKey = scopeAnalyst.nameDefScope2pos[variableKey]
        #         (keyLine, keyToken) = iBuilder_ugly.revFlatMat[newKey]
        #         model_rows.append([original_name, r_strategy,
        #                            suggestionKey[0], keyLine, keyToken,
        #                            suggestionKey[2], js_nice_name] +
        #                           n_feat + s_feat)

        return (js_file_path, 'OK', candidates, model_rows)

    except Exception as e:
        return (js_file_path, None, str(e).replace("\n", ""), model_rows)
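
# --- A worked example (illustrative values only) of the name_candidates
# structure used above: a dict of dicts mapping (name, def_scope) keys to
# suggested translations and the sets of line numbers they appeared on.
name_candidates_example = {
    ('a', 'scope0'): {'length': set([3, 7]), 'len': set([12])},
    ('b', 'scope0'): {'width': set([3])},
}
# Consistency resolution must then pick one suggestion per key,
# e.g. 'length' for ('a', 'scope0'), resolving the 'len' conflict.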
def deobfuscateJS(self, obfuscatedCode, transactionID):
    proxy = xmlrpclib.ServerProxy("http://godeep.cs.ucdavis.edu:8080/RPC2")

    mosesParams = {}
    candidates = []

    baseDir = "/home/ccasal/temp/"
    tempFile = baseDir + str(transactionID) + "_temp.js"
    lm_path = "/data/bogdanv/deobfuscator/experiments/corpora/corpus.lm.970k/js.blm.lm"

    preproFile = baseDir + str(transactionID) + "_prepro.js"
    beautFile = baseDir + str(transactionID) + "_beaut.js"

    # Strip comments, replace literals, etc
    try:
        prepro = WebPreprocessor(obfuscatedCode)
        # TODO replace with: prepro = WebPreprocessor(text)
        prepro.write_temp_file(preproFile)
    except:
        cleanup([preproFile])
        print("Preprocessor failed")
        return "Preprocessor Failed"

    clear = Beautifier()
    # TODO: Need a text version of beautifier to avoid the file read and write.
    # (ok, beautText, err) = clear.webRun(preproText)
    ok = clear.run(preproFile, beautFile)
    print(ok)
    if not ok:
        cleanup([preproFile, beautFile])
        return "Beautifier Failed"
    # quit()

    try:
        lex_ugly = Lexer(beautFile)
        iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
    except:
        cleanup([preproFile, beautFile])
        print("IndexBuilder fail")
        return "IndexBuilder Failed"

    lex_ugly.write_temp_file(tempFile)

    # Do scope related tasks on a raw text version
    try:
        scopeAnalyst = ScopeAnalyst(tempFile)
    except:
        cleanup([preproFile, beautFile, tempFile])
        print("ScopeAnalyst Fail")
        return "ScopeAnalyst Failed"

    # Do rename related tasks.
    # In our case, I don't think we need to actually do anything
    # for no_renaming:
    # no_renaming = []
    # for _line_idx, line in enumerate(iBuilder_ugly.tokens):
    #     no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")

    # hash_def_one_renaming:
    # beautText = renameUsingHashDefLine(scopeAnalyst,
    #                                    iBuilder_ugly,
    #                                    twoLines=False,
    #                                    debug=False)

    print(lex_ugly.collapsedText)
    mosesParams["text"] = lex_ugly.collapsedText
    mosesParams["align"] = "true"
    mosesParams["report-all-factors"] = "true"

    results = proxy.translate(mosesParams)  # __request("translate", mosesParams)

    rawText = Postprocessor(results["nbest"])
    translation = rawText.getProcessedOutput()

    # Send to output:
    cleanup([preproFile, beautFile, tempFile])
    return translation
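# deobfuscateJS() assumes a list-based cleanup() helper. The project's actual
# implementation is not shown in this file; a minimal sketch consistent with
# the calls above (remove each temp file, ignoring ones that were never
# created):

import os

def cleanup(paths):
    for p in paths:
        try:
            os.remove(p)
        except OSError:
            # The file may not exist if an earlier step failed; nothing to do.
            pass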
    js_tmp.write('\n'.join([' '.join([token for (_token_type, token) in line])
                            for line in lines]).encode('utf8'))
    js_tmp.write('\n')
    js_tmp.close()


input_file = os.path.abspath(sys.argv[1])
output_file = os.path.abspath(sys.argv[2])
mode = int(sys.argv[3])

prepro = Preprocessor(input_file)
prepro.write_temp_file('tmp.js')

clear = Beautifier()
ok = clear.run('tmp.js', 'tmp.b.js')

lexer = Lexer('tmp.b.js')
iBuilder = IndexBuilder(lexer.tokenList)

scopeAnalyst = ScopeAnalyst(os.path.join(
    os.path.dirname(os.path.realpath(__file__)), 'tmp.b.js'))

hash_renaming = renameUsingHashDefLine(scopeAnalyst,
                                       iBuilder,
                                       twoLines=False,
                                       debug=mode)
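# Example invocation of the script above (the script file name is
# hypothetical; mode is the integer passed through to the debug flag of
# renameUsingHashDefLine):
#
#   python hash_rename.py input.js output.js 0
#
# Note that the intermediate files tmp.js and tmp.b.js are written to the
# current working directory, while input_file and output_file are resolved
# to absolute paths first.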
def processFile(js_file_path):
    js_file_path = os.path.abspath(js_file_path)
    print 'READING:', js_file_path

    acorn = Acorn()
    (_stdout, acorn_ok) = acorn.run(js_file_path)
    print 'RUNNING Acorn:', acorn_ok

    # Load in the minified file
    minified = open(js_file_path).read()

    b = Beautifier()
    (ok, out, err) = b.web_run(minified)
    # print out

    # Create lexer
    lexer = get_lexer_for_filename(js_file_path)

    # Tokenize input and compute mappings between the different
    # indices used: (line, col), flat, (l,c) in token list
    indexBuilder = IndexBuilder(lex(minified, lexer))
    tokens = indexBuilder.tokens
    print 'RUNNING IndexBuilder:', len(tokens) > 0

    # nice1 = JSNice()
    # (ok, _out, _err) = nice1.run(js_file_path)
    # print 'RUNNING JSNice:', ok

    # nice2 = UnuglifyJS()
    # (ok, _out, _err) = nice2.run(js_file_path)
    # print 'RUNNING UnuglifyJS:', ok

    _pid = multiprocessing.current_process().ident

    # Compute scoping: name2scope is a dictionary where keys
    # are (name, start_index) tuples and values are scope identifiers.
    # Note: start_index is a flat (unidimensional) index,
    # not a (line_chr_idx, col_chr_idx) index.
    # scopeAnalyst = ScopeAnalyst(js_file_path)
    # name2defScope = scopeAnalyst.resolve_scope()
    # isGlobal = scopeAnalyst.isGlobal
    scopeAnalyst = WebScopeAnalyst(minified)
    name2defScope = scopeAnalyst.resolve_scope()
    isGlobal = scopeAnalyst.isGlobal
    print 'RUNNING ScopeAnalyst:', len(name2defScope) > 0

    name2useScope = scopeAnalyst.name2useScope
    name2pth = scopeAnalyst.name2pth
    nameOrigin = scopeAnalyst.nameOrigin

    scopes = set(name2useScope.values())

    for scope in scopes:
        print scope

        lc_list = [indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]
                   for (t, pos) in name2useScope.keys()
                   if name2useScope[(t, pos)] == scope]

        highlight(tokens, lc_list)
        print

    # Discover the path to the source map
    _map_path = sourcemap.discover(minified)
    # Read and parse our sourcemap
    # sourcemapIndex = sourcemap.load(open(map_path))

    # Cluster names by scope
    nameScope2Positions = {}

    # Index data by (name, scope)
    for token, l in indexBuilder.name2CharPositions.iteritems():
        for (line, col) in sorted(l, key=lambda (a, b): (a, b)):
            pos = indexBuilder.flatMap[(line, col)]
            if name2defScope.has_key((token, pos)):
                scope = name2defScope[(token, pos)]
                use_scope = name2useScope[(token, pos)]
                pth = name2pth[(token, pos)]
                glb = isGlobal[(token, pos)]

                nameScope2Positions.setdefault((token, scope, glb), [])
                nameScope2Positions[(token, scope, glb)].append((line, col))

                # print token, pos
                # print 'def:', scope
                # print 'use:', use_scope
                # print 'pth:', pth
                # highlight(tokens, [indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]])
                # print

    for (token, scope, glb), positions in sorted(nameScope2Positions.iteritems(),
                                                 key=lambda (x, y): x[0]):
        if glb:
            continue

        pos = sorted(positions, key=lambda e: (e[0], e[1]))
        # t = []
        tt = []
        line_tok_idxs = set([])
        for (l, c) in pos:
            # orig = sourcemapIndex.lookup(line=l, column=c).name
            (tl, tc) = indexBuilder.revTokMap[(l, c)]
            line_tok_idxs.add(tl)
            p = indexBuilder.flatMap[(l, c)]
            tt.append(((tl, tc), p))
            # t.append(orig)

        # if token == 'n':
        print '\nNAME:', token.encode('utf-8'), 'isGlobal =', glb
        # print scope
        # highlight(tokens, [indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]])

        for ((tli, tci), p) in tt:
            scope = name2defScope[(token, p)]
            use_scope = name2useScope[(token, p)]
            pth = name2pth[(token, p)]
            origin = nameOrigin[(token, scope)]
            # print token  # , p, origin
            # print
            # print 'def:', scope
            # print 'use:', use_scope
            # print 'pth:', pth
            # print

        for tl in sorted(set([tli for ((tli, tci), p) in tt])):
            l = list(tokens[tl])
            for tc in [tci for ((tli, tci), p) in tt if tli == tl]:
                l[tc] = (l[tc][0], unichr(0x2588) + token + unichr(0x2588))
                # pos = indexBuilder.flatMap[(line,col)]
            print ' ', '%d:' % (tl + 1), ' '.join([x[1].encode('utf-8') for x in l])

        print

    return
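# highlight() is used above but not defined in this file. A minimal sketch of
# what it is assumed to do, mirroring the block-character marking at the end
# of processFile (the signature and marker choice are assumptions, not the
# project's actual helper):

def highlight(tokens, lc_list, marker=unichr(0x2588)):
    """tokens: list of lines, each a list of (token_type, token) pairs;
    lc_list: (line_idx, col_idx) pairs into tokens to be marked."""
    targets = set(lc_list)
    for tl in sorted(set([line for (line, _col) in targets])):
        line = [marker + tok + marker if (tl, tc) in targets else tok
                for tc, (_tt, tok) in enumerate(tokens[tl])]
        print ' ', '%d:' % (tl + 1), ' '.join(line).encode('utf-8')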
def processFile(js_file_path):
    # js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]

    if dbg:
        print js_file_path

    temp_files = {'orig': '%s.js' % base_name,
                  'minified': '%s.u.js' % base_name,
                  'n2p': '%s.n2p.js' % base_name}

    for r_strategy in RS.all():
        temp_files['%s' % (r_strategy)] = \
            '%s.%s.js' % (base_name, r_strategy)
        for c_strategy in CS.all():
            temp_files['%s_%s' % (r_strategy, c_strategy)] = \
                '%s.%s.%s.js' % (base_name, r_strategy, c_strategy)

    for k, v in temp_files.iteritems():
        temp_files[k] = os.path.join(output_path, v)

    candidates = []

    # Minified name -> original name: (name, def_scope) -> (name, def_scope)
    min_name_map = {}
    # Hashed name -> minified name: (name, def_scope) -> (name, def_scope)
    hash_name_map = {}
    # Minified name -> jsnice name: (name, def_scope) -> (name, def_scope)
    jsnice_name_map = {}

    # Data for the suggestion_model.csv:
    # map of variable (name, def_scope) -> results of the
    # variableMetrics features function
    name_features = {}

    # Map of maps of variable-suggestion
    # (name, def_scope, suggestion) -> suggestion line counts
    # + suggestionMetrics features function.
    # The first key is the renaming strategy.
    # Ultimately, we will iterate over this to get the keys out of
    # name_features and build model_rows.
    suggestion_features = {}

    # Output lines for the suggestion_model.csv
    model_rows = []

    if True:
    # try:
        # js_text = open(os.path.join(corpus_root, js_file_path), 'r').read()
        js_text = open(js_file_path, 'r').read()

        # Strip comments, replace literals, etc
        # try:
        prepro = WebLMPreprocessor(js_text)
        prepro_text = str(prepro)
        # except:
        #     return (js_file_path, None, 'Preprocessor fail')

        # print 'Preprocessor'
        # print prepro_text

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text)

        print '\nOK:', ok, 'ERR:', _err
        print tmp_beautified_text

        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text)
        # print '\nOK:', ok, 'ERR:', _err
        # print tmp_minified_text
        if not ok:
            return (js_file_path, None, 'Uglifier fail')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            (aligned_clear, aligned_minified) = aligner.web_align(
                WebLexer(tmp_beautified_text).tokenList,
                WebLexer(tmp_minified_text).tokenList)
        except:
            return (js_file_path, None, 'Aligner fail')

        # print '\nAligned clear'
        # print aligned_clear
        # print '\nAligned minified'
        # print aligned_minified
        # print

        # Pass through beautifier to fix layout
        (ok, beautified_text, _err) = clear.web_run(aligned_clear)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        (ok, minified_text, _err) = clear.web_run(aligned_minified)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # print beautified_text
        # print
        # print minified_text

        # Num tokens before vs after
        try:
            lex_clear = WebLexer(beautified_text)
            tok_clear = lex_clear.tokenList

            lex_ugly = WebLexer(minified_text)
            tok_ugly = lex_ugly.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            return (js_file_path, None, 'Num tokens mismatch')

        if beautified_text == minified_text:
            return (js_file_path, None, 'Not minified')

        try:
            iBuilder_clear = IndexBuilder(lex_clear.tokenList)
        except:
            return (js_file_path, None, 'IndexBuilder fail on original file.')

        try:
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            return (js_file_path, None, 'IndexBuilder fail on minified file.')

        # print 'Writing'
        with open(temp_files['orig'], 'w') as f:
            f.write(beautified_text)
        with open(temp_files['minified'], 'w') as f:
            f.write(minified_text)

        ########################
        # Nice2Predict
        ########################

        # BV: Next block left out until I figure out the pipe issue
        # BV: Update: I couldn't pipe input to N2P. TODO: FIX
        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, n2p_text, _err) = unuglifyJS.run(temp_files['minified'])
        if not ok:
            return (js_file_path, None, 'Nice2Predict fail')

        (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        with open(temp_files['n2p'], 'w') as f:
            f.write(n2p_text_beautified)

        if True:
        # try:
            n2p_lexer = WebLexer(n2p_text_beautified)
            n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
            n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
        # except:
        #     return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail')

        # print 'n2p'

        # Save some translation stats to compare different methods
        ts = TranslationSummarizer()
        candidates += [['n2p', ''] + x
                       for x in ts.compute_summary_unscoped(n2p_iBuilder,
                                                            n2p_scopeAnalyst)]

        ################################################
        # All other JSNaughty variants
        ################################################

        try:
            scopeAnalyst = WebScopeAnalyst(minified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst minified fail')

        try:
            scopeAnalyst_clear = WebScopeAnalyst(beautified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst clear fail')

        if not check(iBuilder_clear, scopeAnalyst_clear,
                     n2p_iBuilder, n2p_scopeAnalyst):
            return (js_file_path, None, 'JsNice restructured file. Skipping..')

        # Map the original names to the minified counterparts
        # and minified ones to jsnice renamings
        orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsOld = sorted(scopeAnalyst_clear.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])

        if len(orderedVarsOld) != len(orderedVarsNew):
            return (js_file_path, None, "Old and New Name lists different length")

        if len(orderedVarsOld) != len(orderedVarsN2p):
            return (js_file_path, None, "JsNice and Old Name lists different length")

        for i in range(0, len(orderedVarsOld)):
            name_old = orderedVarsOld[i][0]
            def_scope_old = scopeAnalyst_clear.name2defScope[orderedVarsOld[i]]

            name_new = orderedVarsNew[i][0]
            def_scope_new = scopeAnalyst.name2defScope[orderedVarsNew[i]]

            min_name_map[(name_new, def_scope_new)] = (name_old, def_scope_old)

            name_n2p = orderedVarsN2p[i][0]
            def_scope_n2p = n2p_scopeAnalyst.name2defScope[orderedVarsN2p[i]]

            jsnice_name_map[(name_new, def_scope_new)] = (name_n2p, def_scope_n2p)

        # Once we have the scopeAnalyst, iBuilder, and tokenlist for the
        # minified version, we can get the name properties
        vm = VariableMetrics(scopeAnalyst, iBuilder_ugly, lex_ugly.tokenList)
        variableKeySet = vm.getVariables()
        for variableKey in variableKeySet:
            name_features[variableKey] = vm.getNameMetrics(variableKey)

        (name_positions,
         position_names,
         use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)

        # print 'Helpers'

        # Try different renaming strategies (hash, etc)
        for r_strategy, proxy in proxies:

            if dbg:
                print '\n====================='
                print r_strategy
                print '=====================\n'

            # try:
            # Rename input prior to translation
            preRen = PreRenamer()
            after_text = preRen.rename(r_strategy,
                                       iBuilder_ugly,
                                       scopeAnalyst)

            # print 'After text:'
            # print after_text
            # print

            (ok, beautified_after_text, _err) = clear.web_run(after_text)
            if not ok:
                return (js_file_path, None, 'Beautifier fail')

            # print 'Beautified:'
            # print beautified_after_text
            # print

            if r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO:
                try:
                    scopeAnalyst_hash = WebScopeAnalyst(after_text)
                except:
                    return (js_file_path, None, "ScopeAnalyst hash fail")

                # Map the hashed names to the minified counterparts.
                orderedVarsMin = sorted(scopeAnalyst.name2defScope.keys(),
                                        key=lambda x: x[1])
                orderedVarsHash = sorted(scopeAnalyst_hash.name2defScope.keys(),
                                         key=lambda x: x[1])

                if len(orderedVarsMin) != len(orderedVarsHash):
                    return (js_file_path, None, "Hash and Min lists different length")

                for i in range(0, len(orderedVarsHash)):
                    name_hash = orderedVarsHash[i][0]
                    def_scope_hash = scopeAnalyst_hash.name2defScope[orderedVarsHash[i]]

                    name_min = orderedVarsMin[i][0]
                    def_scope_min = scopeAnalyst.name2defScope[orderedVarsMin[i]]

                    hash_name_map[(name_hash, def_scope_hash)] = \
                        (name_min, def_scope_min)

            # Save renamed input to disk for future inspection
            with open(temp_files['%s' % (r_strategy)], 'w') as f:
                f.write(beautified_after_text)

            a_lexer = WebLexer(beautified_after_text)
            a_iBuilder = IndexBuilder(a_lexer.tokenList)
            a_scopeAnalyst = WebScopeAnalyst(beautified_after_text)

            # except:
            #     return (js_file_path, None, 'Renaming fail')

            # print 'Lexing'

            # lx = WebLexer(a_iBuilder.get_text())
            lx = WebLexer(a_iBuilder.get_text_wo_literals())
            # print a_iBuilder.get_text_wo_literals()

            # Translate renamed input
            md = WebMosesDecoder(proxy)
            (ok, translation, _err) = md.run(lx.collapsedText)
            if not ok:
                return (js_file_path, None, 'Moses translation fail')

            # print '\ntranslation-------------'
            # print translation
            # if r_strategy == RS.HASH_ONE:
            #     exit()

            (a_name_positions,
             a_position_names,
             a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

            nc = []

            if translation is not None:

                # Parse moses output
                mp = MosesParser()

                if dbg:
                    print '\nr_strategy-----------', r_strategy

                name_candidates = mp.parse(translation,
                                           a_iBuilder,
                                           a_position_names)

                # name_candidates is a dictionary of dictionaries:
                # keys are (name, None) (if scopeAnalyst=None) or
                # (name, def_scope) tuples (otherwise);
                # values are suggested translations with the sets
                # of line numbers on which they appear.

                # print '\nname_candidates before ----------'
                # for key, suggestions in name_candidates.iteritems():
                #     print key[0], key[1][-50:]
                #     # for use_scope, suggestions in val.iteritems():
                #     #     print '\t...', use_scope[-50:]
                #     for name_translation, lines in suggestions.iteritems():
                #         print '\t', name_translation, lines

                # Update name_candidates with some default values
                # (in this case the translation without any renaming)
                # if the translation is empty
                if r_strategy == RS.NONE:
                    # RS.NONE should always be first, by construction
                    name_candidates_default = name_candidates
                    scopeAnalyst_default = a_scopeAnalyst
                    iBuilder_default = a_iBuilder
                else:
                    for key_default, suggestions in name_candidates_default.iteritems():
                        # (name_default, def_scope_default) = key_default
                        pos_default = scopeAnalyst_default.nameDefScope2pos[key_default]

                        (lin, col) = iBuilder_default.revFlatMat[pos_default]
                        (line_num, line_idx) = iBuilder_default.revTokMap[(lin, col)]

                        (name, def_scope) = a_position_names[line_num][line_idx]
                        key = (name, def_scope)

                        for name_translation, lines in suggestions.iteritems():
                            name_candidates.setdefault(key, {})
                            name_candidates[key].setdefault(name_translation, set([]))
                            name_candidates[key][name_translation].update(lines)

                        # for use_scope, suggestions in val.iteritems():
                        #     for name_translation, lines in suggestions.iteritems():
                        #         key = preRen.simple_direct_map.get(key_default, key_default)
                        #         name_candidates.setdefault(key, {})
                        #         name_candidates[key].setdefault(use_scope, {})
                        #         name_candidates[key][use_scope].setdefault(name_translation, set([]))
                        #         name_candidates[key][use_scope][name_translation].update(lines)

                # print '\nname_candidates after ----------'
                # for key, suggestions in name_candidates.iteritems():
                #     print key[0], key[1][-50:]
                #     # for use_scope, suggestions in val.iteritems():
                #     #     print '\t...', use_scope[-50:]
                #     for name_translation, lines in suggestions.iteritems():
                #         print '\t', name_translation, lines

                cc = ConsistencyController(debug_mode=True)
                ts = TranslationSummarizer()

                # An identifier may have been translated inconsistently
                # across different lines (Moses treats each line
                # independently). Try different strategies to resolve
                # inconsistencies, if any
                for c_strategy in CS.all():

                    if dbg:
                        print '\nc_strategy----------', c_strategy

                    # assert(hash_name_map != {})

                    # Compute renaming map (x -> length, y -> width, ...)
                    # Note that x,y here are names after renaming
                    (temp_renaming_map, seen) = cc.computeRenaming(
                        c_strategy,
                        name_candidates,
                        a_name_positions,
                        a_use_scopes,
                        a_iBuilder,
                        lm_path,
                        vm,
                        hash_name_map)

                    # After computeRenaming, we have both the entropies stored
                    # if we are in LMDrop strategy and have the suggestions
                    # frequency from name_candidates. Fill in suggestion_features
                    if c_strategy == CS.LMDROP and r_strategy not in suggestion_features:
                        assert cc.suggestion_cache is not None
                        suggestion_features[r_strategy] = {}

                        # Need some way of iterating over all
                        # name, suggestion groups...
                        """
                        name_candidates: dict
                        name_candidates[(name, def_scope)][name_translation]
                        = set of line numbers in the translation
                        """
                        for variableKey, suggestionDictionary in name_candidates.iteritems():
                            for suggestionName, linesSuggested in suggestionDictionary.iteritems():

                                # I need to revert variableKey[0] in the suggestion
                                # from its hash to its original minified name.
                                if r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO:
                                    unhashedKey = hash_name_map[variableKey]
                                    suggestionKey = (unhashedKey[0], unhashedKey[1], suggestionName)
                                else:
                                    suggestionKey = (variableKey[0], variableKey[1], suggestionName)

                                entropyVals = cc.suggestion_cache.getEntropyStats(variableKey, suggestionName)
                                if True:  # eval_dbg only
                                # if entropyVals != (ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR):
                                    suggestionValue = [len(linesSuggested)] + \
                                        list(getSuggestionStats(suggestionName)) + \
                                        list(entropyVals)

                                    suggestion_features[r_strategy][suggestionKey] = suggestionValue

                    if dbg:
                        print '\ntemp_renaming_map-------------'
                        for (name, def_scope), renaming in temp_renaming_map.iteritems():
                            print (name, def_scope[-50:]), renaming

                    # Fall back on original names in input, if
                    # no translation was suggested
                    postRen = PostRenamer()
                    renaming_map = postRen.updateRenamingMap(a_name_positions,
                                                             position_names,
                                                             a_use_scopes,
                                                             temp_renaming_map,
                                                             seen,
                                                             r_strategy)

                    # new_name_candidates = {}
                    #
                    # for (name, def_scope), renaming in temp_renaming_map.iteritems():
                    #     (line_num, line_idx) = a_name_positions[(name, def_scope)][0]
                    #     (old_name, old_def_scope) = position_names[line_num][line_idx]
                    #     new_name_candidates.setdefault((old_name, old_def_scope), {})
                    #     new_name_candidates[(old_name, old_def_scope)][renaming] = set([1])

                    # tmp_renamed_text = postRen.applyRenaming(a_iBuilder,
                    #                                          a_name_positions,
                    #                                          temp_renaming_map)
                    # (ok, tmp_beautified_renamed_text, _err) = clear.web_run(tmp_renamed_text)
                    # if not ok:
                    #     return (js_file_path, None, 'Beautifier fail')
                    #
                    # tmp_lexer = WebLexer(tmp_beautified_renamed_text)
                    # tmp_iBuilder = IndexBuilder(tmp_lexer.tokenList)
                    # tmp_scopeAnalyst = WebScopeAnalyst(tmp_beautified_renamed_text)
                    #
                    # (tmp_name_positions,
                    #  tmp_position_names,
                    #  tmp_use_scopes) = prepHelpers(tmp_iBuilder, tmp_scopeAnalyst)

                    # renaming_map = postRen.updateRenamingMap(tmp_name_positions,
                    #                                          position_names,
                    #                                          temp_renaming_map,
                    #                                          r_strategy)

                    # renaming_map = cc.computeRenaming(CS.FREQLEN,
                    #                                   new_name_candidates,
                    #                                   name_positions,
                    #                                   use_scopes,
                    #                                   iBuilder_ugly,
                    #                                   lm_path)

                    # # Fall back on original names in input, if
                    # # no translation was suggested
                    # postRen = PostRenamer()
                    # renaming_map = postRen.updateRenamingMap(a_name_positions,
                    #                                          position_names,
                    #                                          temp_renaming_map,
                    #                                          r_strategy)

                    if dbg:
                        print '\nrenaming_map-------------'
                        for (name, def_scope), renaming in renaming_map.iteritems():
                            print (name, def_scope[-50:]), renaming, \
                                '(%s)' % temp_renaming_map[(name, def_scope)]

                    # Apply renaming map and save output for future inspection
                    renamed_text = postRen.applyRenaming(a_iBuilder,
                                                         a_name_positions,
                                                         renaming_map)

                    print '\nrenamed_text--------------'
                    print renamed_text
                    print

                    (ok, beautified_renamed_text, _err) = clear.web_run(renamed_text)
                    if not ok:
                        return (js_file_path, None, 'Beautifier fail')

                    with open(temp_files['%s_%s' % (r_strategy, c_strategy)], 'w') as f:
                        f.write(beautified_renamed_text)

                    # Save some stats about which names were renamed to what.
                    # This is what enables the comparison between the
                    # different methods.
                    r = [[c_strategy] + x
                         for x in ts.compute_summary_scoped(renaming_map,
                                                            name_candidates,
                                                            a_iBuilder,
                                                            a_scopeAnalyst)]
                    if not r:
                        return (js_file_path, None, 'Compute summary failed')
                    nc += r

            if nc:
                candidates += [[r_strategy] + x for x in nc]

        # Create the rows for the suggestion_model.csv
        for r_strategy in RS.all():
            for suggestionKey, s_feat in suggestion_features[r_strategy].iteritems():
                variableKey = (suggestionKey[0], suggestionKey[1])
                original_name = min_name_map[variableKey][0]
                js_nice_name = jsnice_name_map[variableKey][0]

                if variableKey in name_features:  # eval_dbg only
                    n_feat = list(name_features[variableKey])

                    # Convert the def_scope to an equivalent, but smaller,
                    # easier to read key: (line_num, token_num)
                    newKey = scopeAnalyst.nameDefScope2pos[variableKey]
                    (keyLine, keyToken) = iBuilder_ugly.revFlatMat[newKey]

                    model_rows.append([original_name, r_strategy,
                                       suggestionKey[0], keyLine, keyToken,
                                       suggestionKey[2], js_nice_name] +
                                      n_feat + s_feat)

    return (js_file_path, 'OK', candidates, model_rows)
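# A minimal driver sketch for the processFile() variant above. Taking the
# file list from argv, the pool size, and the CSV name are illustrative
# assumptions, not the project's actual run scripts; dbg, proxies, lm_path,
# and output_path are module globals here, as in the function itself.

import csv
import sys
import multiprocessing

if __name__ == '__main__':
    file_list = sys.argv[1:]
    pool = multiprocessing.Pool(processes=4)

    with open('candidates.csv', 'wb') as f_out:
        writer = csv.writer(f_out)
        for result in pool.imap_unordered(processFile, file_list):
            # Failures come back as (path, None, reason); successes as
            # (path, 'OK', candidates, model_rows).
            if result[1] == 'OK':
                (js_file_path, _status, candidates, _model_rows) = result
                for row in candidates:
                    writer.writerow([js_file_path] + row)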
def processFile(l):

    def localCleanup(output_path, base_names):
        for base_name in base_names:
            tryRemove(os.path.join(output_path, base_name))

    js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]
    pid = int(multiprocessing.current_process().ident)

    candidates = []

    try:
    # if True:
        # Temp files to be created during processing
        path_tmp = 'tmp_%d.js' % (pid)
        path_tmp_b = 'tmp_%d.b.js' % (pid)
        path_tmp_b_a = 'tmp_%d.b.a.js' % (pid)
        path_tmp_u = 'tmp_%d.u.js' % (pid)
        path_tmp_u_a = 'tmp_%d.u.a.js' % (pid)
        path_tmp_unugly = 'tmp_%d.n2p.js' % (pid)
        path_tmp_jsnice = 'tmp_%d.jsnice.js' % (pid)

        f2 = 'tmp_%d.no_renaming.js' % (pid)
        f3 = 'tmp_%d.basic_renaming.js' % (pid)
        f4 = 'tmp_%d.hash_renaming.js' % (pid)
        f5 = 'tmp_%d.hash_def_one_renaming.js' % (pid)
        f6 = 'tmp_%d.hash_def_two_renaming.js' % (pid)

        path_orig = '%s.js' % (base_name)
        path_ugly = '%s.u.js' % (base_name)
        path_unugly = '%s.n2p.js' % (base_name)
        path_jsnice = '%s.jsnice.js' % (base_name)

        # Strip comments, replace literals, etc
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(path_tmp)
        except:
            cleanup(pid)
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        ok = clear.run(path_tmp, path_tmp_b + '.tmp1')
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'Beautifier 1 fail')

        jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename'])
        (ok, _out, _err) = jsNiceBeautifier.run(path_tmp_b + '.tmp1',
                                                path_tmp_b + '.tmp2')
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'JSNice Beautifier 1 fail')

        ok = clear.run(path_tmp_b + '.tmp2', path_tmp_b)
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'Beautifier 1 fail')

        # Minify
        ugly = Uglifier()
        ok = ugly.run(path_tmp_b, path_tmp_u)
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'Uglifier fail')

        # Num tokens before vs after
        try:
            tok_clear = Lexer(path_tmp_b).tokenList
            tok_ugly = Lexer(path_tmp_u).tokenList
        except:
            cleanup(pid)
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            cleanup(pid)
            return (js_file_path, None, 'Num tokens mismatch')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            # This is already the baseline corpus, no (smart) renaming yet
            aligner.align(path_tmp_b, path_tmp_u)
        except:
            cleanup(pid)
            return (js_file_path, None, 'Aligner fail')

        try:
            # iBuilder_clear = IndexBuilder(Lexer(path_tmp_b_a).tokenList)
            iBuilder_ugly = IndexBuilder(Lexer(path_tmp_u_a).tokenList)
        except:
            cleanup(pid)
            return (js_file_path, None, 'IndexBuilder fail')

        # Store original and uglified versions
        ok = clear.run(path_tmp_u_a, os.path.join(output_path, path_ugly))
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly])
            return (js_file_path, None, 'Beautifier 2 fail')

        ok = clear.run(path_tmp_b_a, os.path.join(output_path, path_orig))
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig])
            return (js_file_path, None, 'Beautifier 3 fail')

        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, _out, _err) = unuglifyJS.run(path_tmp_b_a, path_tmp_unugly)
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig])
            return (js_file_path, None, 'Nice2Predict fail')

        ok = clear.run(path_tmp_unugly, path_tmp_unugly + '.tmp1')
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, path_unugly])
            return (js_file_path, None, 'Beautifier 4 fail')

        (ok, _out, _err) = jsNiceBeautifier.run(path_tmp_unugly + '.tmp1',
                                                path_tmp_unugly + '.tmp2')
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'JSNice Beautifier 2 fail')

        ok = clear.run(path_tmp_unugly + '.tmp2',
                       os.path.join(output_path, path_unugly))
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, path_unugly])
            return (js_file_path, None, 'Beautifier 4 fail')

        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                path_tmp_unugly))
            nameOrigin = scopeAnalyst.nameOrigin

            for (name, def_scope) in nameOrigin.iterkeys():
                candidates.append(('Nice2Predict', def_scope, name, '', ''))
        except:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, path_unugly])
            return (js_file_path, None, 'ScopeAnalyst fail')

        # Run the JSNice from http://www.jsnice.org
        jsNice = JSNice()
        (ok, _out, _err) = jsNice.run(path_tmp_b_a, path_tmp_jsnice)
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, path_unugly])
            return (js_file_path, None, 'JSNice fail')

        ok = clear.run(path_tmp_jsnice, os.path.join(output_path, path_jsnice))
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig,
                                       path_unugly, path_jsnice])
            return (js_file_path, None, 'Beautifier 5 fail')

        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                path_tmp_jsnice))
            nameOrigin = scopeAnalyst.nameOrigin

            for (name, def_scope) in nameOrigin.iterkeys():
                candidates.append(('JSNice', def_scope, name, '', ''))
        except:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig,
                                       path_unugly, path_jsnice])
            return (js_file_path, None, 'ScopeAnalyst fail')

        # Compute scoping: name2scope is a dictionary where keys
        # are (name, start_index) tuples and values are scope identifiers.
        # Note: start_index is a flat (unidimensional) index,
        # not a (line_chr_idx, col_chr_idx) index.
        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                path_tmp_u_a))
            _name2defScope = scopeAnalyst.resolve_scope()
            _isGlobal = scopeAnalyst.isGlobal
            _name2useScope = scopeAnalyst.resolve_use_scope()
        except:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig,
                                       path_unugly, path_jsnice])
            return (js_file_path, None, 'ScopeAnalyst fail')

        no_renaming = []
        for _line_idx, line in enumerate(iBuilder_ugly.tokens):
            no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")

        with open(f2, 'w') as f_no_renaming:
            f_no_renaming.writelines(no_renaming)

        moses = MosesDecoder(ini_path=os.path.join(ini_path,
                                                   'train.no_renaming',
                                                   'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f2)

        nc = processTranslation(translation, iBuilder_ugly,
                                scopeAnalyst, lm_path, f2,
                                output_path, base_name, clear)
        if nc:
            candidates += nc

        # Simple renaming: disambiguate overloaded names using scope id
        basic_renaming = renameUsingScopeId(scopeAnalyst, iBuilder_ugly)
        with open(f3, 'w') as f_basic_renaming:
            f_basic_renaming.writelines(basic_renaming)

        moses = MosesDecoder(ini_path=os.path.join(ini_path,
                                                   'train.basic_renaming',
                                                   'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f3)

        nc = processTranslation(translation, iBuilder_ugly,
                                scopeAnalyst, lm_path, f3,
                                output_path, base_name, clear)
        if nc:
            candidates += nc

        # More complicated renaming: collect the context around
        # each name (global variables, API calls, punctuation)
        # and build a hash of the concatenation.
        hash_renaming = renameUsingHashAllPrec(scopeAnalyst,
                                               iBuilder_ugly,
                                               debug=False)
        # print hash_renaming
        with open(f4, 'w') as f_hash_renaming:
            f_hash_renaming.writelines(hash_renaming)

        moses = MosesDecoder(ini_path=os.path.join(ini_path,
                                                   'train.hash_renaming',
                                                   'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f4)

        nc = processTranslation(translation, iBuilder_ugly,
                                scopeAnalyst, lm_path, f4,
                                output_path, base_name, clear)
        if nc:
            candidates += nc

        hash_def_one_renaming = renameUsingHashDefLine(scopeAnalyst,
                                                       iBuilder_ugly,
                                                       twoLines=False,
                                                       debug=False)
        with open(f5, 'w') as f_hash_def_one_renaming:
            f_hash_def_one_renaming.writelines(hash_def_one_renaming)

        moses = MosesDecoder(ini_path=os.path.join(ini_path,
                                                   'train.hash_def_one_renaming',
                                                   'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f5)

        nc = processTranslation(translation, iBuilder_ugly,
                                scopeAnalyst, lm_path, f5,
                                output_path, base_name, clear)
        if nc:
            candidates += nc

        hash_def_two_renaming = renameUsingHashDefLine(scopeAnalyst,
                                                       iBuilder_ugly,
                                                       twoLines=True,
                                                       debug=False)
        with open(f6, 'w') as f_hash_def_two_renaming:
            f_hash_def_two_renaming.writelines(hash_def_two_renaming)

        moses = MosesDecoder(ini_path=os.path.join(ini_path,
                                                   'train.hash_def_two_renaming',
                                                   'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f6)

        nc = processTranslation(translation, iBuilder_ugly,
                                scopeAnalyst, lm_path, f6,
                                output_path, base_name, clear)
        if nc:
            candidates += nc

        cleanup(pid)
        cleanupRenamed(pid)
        return (js_file_path, 'OK', candidates)

    except Exception, e:
        cleanup(pid)
        cleanupRenamed(pid)
        return (js_file_path, None, str(e).replace("\n", ""))
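# The five write/translate/post-process blocks above share the same shape and
# differ only in the renamed corpus and the train.* directory. A possible
# helper that factors out the common steps (an editorial sketch, not project
# code; it uses the same module globals ini_path and lm_path as the function
# above):

def runStrategy(lines, tmp_path, train_dir,
                iBuilder_ugly, scopeAnalyst, clear,
                output_path, base_name):
    # Write the renamed corpus line by line, translate it with the
    # strategy-specific Moses model, and post-process the output.
    with open(tmp_path, 'w') as f:
        f.writelines(lines)
    moses = MosesDecoder(ini_path=os.path.join(ini_path, train_dir,
                                               'tuning', 'moses.ini'))
    (_moses_ok, translation, _err) = moses.run(tmp_path)
    return processTranslation(translation, iBuilder_ugly, scopeAnalyst,
                              lm_path, tmp_path, output_path,
                              base_name, clear)

# Each block would then reduce to one call, e.g.:
#   nc = runStrategy(hash_renaming, f4, 'train.hash_renaming',
#                    iBuilder_ugly, scopeAnalyst, clear,
#                    output_path, base_name)
#   if nc:
#       candidates += nc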