def processFile(js_file_path):

    try:
        js_text = open(os.path.join(files_root, js_file_path), 'r').read()

        # Strip comments, replace literals, etc.
        try:
            prepro = WebLMPreprocessor(js_text)
            prepro_text = str(prepro)
        except:
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, beautified_text, _err) = clear.web_run(prepro_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        try:
            lex_clear = WebLexer(beautified_text)
            tok1 = lex_clear.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        try:
            iBuilder1 = IndexBuilder(tok1)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        orig = []
        for _line_idx, line in enumerate(iBuilder1.tokens):
            orig.append(' '.join([t for (_tt, t) in line]) + "\n")

        return (js_file_path, orig)

    except Exception, e:
        return (js_file_path, None, str(e))
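# --- Usage sketch (added for illustration; not part of the original script) ---
# processFile above is designed to be mapped over a corpus in parallel; the
# surrounding script is assumed to define `files_root`. The file list and
# pool size below are hypothetical placeholders.
import multiprocessing
import sys

def _example_corpus_driver():
    # Hypothetical list of corpus-relative paths
    file_list = ['0001.js', '0002.js']
    pool = multiprocessing.Pool(processes=4)
    for result in pool.imap_unordered(processFile, file_list):
        if len(result) == 2:
            # Success: result[1] is the list of token lines
            sys.stdout.writelines(result[1])
        else:
            # Failure: result[2] is the reason string
            print result[0], result[2]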
def processFile(js_file_path):

    try:
        # Num tokens before vs after
        try:
            tok1 = Lexer(os.path.join(files_root, 'orig', js_file_path)).tokenList
            tok2 = Lexer(os.path.join(files_root, 'no_renaming', js_file_path)).tokenList
            # tok3 = Lexer(os.path.join(files_root, 'basic_renaming', js_file_path)).tokenList
            # tok4 = Lexer(os.path.join(files_root, 'normalized', js_file_path)).tokenList
            tok5 = Lexer(os.path.join(files_root, 'hash_def_one_renaming', js_file_path)).tokenList
            tok6 = Lexer(os.path.join(files_root, 'hash_def_two_renaming', js_file_path)).tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(set([len(tok1), len(tok2), len(tok5), len(tok6)])) == 1:
            return (js_file_path, None, 'Num tokens mismatch')

        clear = Beautifier()

        # Align minified and clear files, in case the beautifier
        # did something weird
        aligner = Aligner()

        (aligned1, aligned2) = aligner.web_align(tok1, tok2)
        (ok, beautified1, _err) = clear.web_run(aligned1)
        tok11 = WebLexer(beautified1).tokenList

        (ok, beautified2, _err) = clear.web_run(aligned2)
        tok22 = WebLexer(beautified2).tokenList

        (aligned5, aligned2) = aligner.web_align(tok5, tok2)
        (ok, beautified5, _err) = clear.web_run(aligned5)
        tok55 = WebLexer(beautified5).tokenList

        (aligned6, aligned2) = aligner.web_align(tok6, tok2)
        (ok, beautified6, _err) = clear.web_run(aligned6)
        tok66 = WebLexer(beautified6).tokenList

        # try:
        #     aligner = Aligner()
        #     # This is already the baseline corpus, no (smart) renaming yet
        #     aligner.align(temp_files['path_tmp_b'],
        #                   temp_files['path_tmp_u'])
        # except:
        #     return (js_file_path, None, 'Aligner fail')

        try:
            iBuilder1 = IndexBuilder(tok11)
            iBuilder2 = IndexBuilder(tok22)
            # iBuilder3 = IndexBuilder(tok3)
            # iBuilder4 = IndexBuilder(tok4)
            iBuilder5 = IndexBuilder(tok55)
            iBuilder6 = IndexBuilder(tok66)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        # Check that at least one variable was renamed during minification
        orig_names = set([token for line in iBuilder1.tokens
                          for (token_type, token) in line
                          if is_token_subtype(token_type, Token.Name)])
        ugly_names = set([token for line in iBuilder2.tokens
                          for (token_type, token) in line
                          if is_token_subtype(token_type, Token.Name)])

        if not len(orig_names.difference(ugly_names)):
            return (js_file_path, None, 'Not minified')

        orig = []
        no_renaming = []
        # basic_renaming = []
        # normalized = []
        hash_def_one_renaming = []
        hash_def_two_renaming = []

        for _line_idx, line in enumerate(iBuilder1.tokens):
            orig.append(' '.join([t for (_tt, t) in line]) + "\n")

        for _line_idx, line in enumerate(iBuilder2.tokens):
            no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")

        # for _line_idx, line in enumerate(iBuilder3.tokens):
        #     basic_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")

        # for _line_idx, line in enumerate(iBuilder4.tokens):
        #     normalized.append(' '.join([t for (_tt, t) in line]) + "\n")

        for _line_idx, line in enumerate(iBuilder5.tokens):
            hash_def_one_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")

        for _line_idx, line in enumerate(iBuilder6.tokens):
            hash_def_two_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")

        return (js_file_path,
                orig,
                no_renaming,
                # basic_renaming,
                # normalized,
                hash_def_one_renaming,
                hash_def_two_renaming)

    except Exception, e:
        return (js_file_path, None, str(e))
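# --- Usage sketch (illustrative only) ---
# On success, the tuple above holds one token-line list per renaming
# variant; a driver would typically append each list to that variant's
# corpus file for Moses training. The '.corpus' file naming here is a
# hypothetical example; the subdirectory names mirror the ones read above.
def _write_corpus_entry(result):
    if len(result) == 3:
        return  # (path, None, reason) -> failure, nothing to write
    (path, orig, no_renaming, hash_one, hash_two) = result
    for (subdir, lines) in [('orig', orig),
                            ('no_renaming', no_renaming),
                            ('hash_def_one_renaming', hash_one),
                            ('hash_def_two_renaming', hash_two)]:
        with open(os.path.join(files_root, subdir + '.corpus'), 'a') as f:
            f.writelines(lines)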
def processFile(l):
    js_file_path = l[0]

    # if True:
    try:
        js_text = open(os.path.join(corpus_root, js_file_path), 'r').read()

        # Strip comments, replace literals, etc.
        try:
            prepro = WebLMPreprocessor(js_text)
            prepro_text = str(prepro)
        except:
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text)
        if not ok:
            return (js_file_path, None, 'Uglifier fail')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            (aligned_clear, aligned_minified) = aligner.web_align(
                WebLexer(tmp_beautified_text).tokenList,
                WebLexer(tmp_minified_text).tokenList)
        except:
            return (js_file_path, None, 'Aligner fail')

        # Pass through beautifier to fix layout
        (ok, beautified_text, _err) = clear.web_run(aligned_clear)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        (ok, minified_text, _err) = clear.web_run(aligned_minified)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Num tokens before vs after
        try:
            lex_clear = WebLexer(beautified_text)
            tok_clear = lex_clear.tokenList
            lex_ugly = WebLexer(minified_text)
            tok_ugly = lex_ugly.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            return (js_file_path, None, 'Num tokens mismatch')

        if beautified_text == minified_text:
            return (js_file_path, None, 'Not minified')

        try:
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        try:
            scopeAnalyst = WebScopeAnalyst(minified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst fail')

        processed = []

        # Try different renaming strategies (hash, etc.)
        for r_strategy in RS.all():
            try:
            # if True:
                # Rename input prior to translation
                preRen = PreRenamer()
                after_text = preRen.rename(r_strategy,
                                           iBuilder_ugly,
                                           scopeAnalyst)

                (ok, beautified_after_text, _err) = clear.web_run(after_text)
                if not ok:
                    return (js_file_path, None, 'Beautifier fail')

                processed.append((r_strategy, beautified_after_text))
            except:
                return (js_file_path, None, 'Renaming fail')

        with open(os.path.join(output_path, 'orig', js_file_path), 'w') as f:
            f.write(beautified_text)

        for (r_strategy, text) in processed:
            with open(os.path.join(output_path, r_strategy, js_file_path), 'w') as f:
                f.write(text)

        return (js_file_path, 'OK', None)

    except Exception, e:
        return (js_file_path, None, str(e).replace("\n", ""))
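# --- Usage sketch (illustrative only) ---
# This variant takes a row (e.g. from a csv.reader over a file index)
# rather than a bare path, hence js_file_path = l[0]. A hypothetical
# serial driver:
def _example_index_driver(index_rows):
    for row in index_rows:
        (path, status, err) = processFile(row)
        if status != 'OK':
            print path, err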
def processFile(l):
    js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]

    temp_files = {'orig': '%s.js' % base_name,
                  'minified': '%s.u.js' % base_name,
                  'n2p': '%s.n2p.js' % base_name}

    for r_strategy in RS.all():
        temp_files['%s' % (r_strategy)] = \
            '%s.%s.js' % (base_name, r_strategy)
        for c_strategy in CS.all():
            temp_files['%s_%s' % (r_strategy, c_strategy)] = \
                '%s.%s.%s.js' % (base_name, r_strategy, c_strategy)

    for k, v in temp_files.iteritems():
        temp_files[k] = os.path.join(output_path, v)

    candidates = []

    # Minified Name -> Original Name: (name, def_scope) -> (name, def_scope)
    min_name_map = {}
    # Hashed Name -> Minified Name: (name, def_scope) -> (name, def_scope)
    hash_name_map = {}
    # Minified Name -> JSNice name: (name, def_scope) -> (name, def_scope)
    jsnice_name_map = {}

    # Output lines for the suggestion_model.csv
    model_rows = []

    try:
        js_text = open(os.path.join(corpus_root, js_file_path), 'r').read()

        # Strip comments, replace literals, etc.
        try:
            prepro = WebLMPreprocessor(js_text)
            prepro_text = str(prepro)
        except:
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text)
        if not ok:
            return (js_file_path, None, 'Uglifier fail')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            (aligned_clear, aligned_minified) = aligner.web_align(
                WebLexer(tmp_beautified_text).tokenList,
                WebLexer(tmp_minified_text).tokenList)
        except:
            return (js_file_path, None, 'Aligner fail')

        # Pass through beautifier to fix layout
        (ok, beautified_text, _err) = clear.web_run(aligned_clear)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        (ok, minified_text, _err) = clear.web_run(aligned_minified)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Num tokens before vs after
        try:
            lex_clear = WebLexer(beautified_text)
            tok_clear = lex_clear.tokenList
            lex_ugly = WebLexer(minified_text)
            tok_ugly = lex_ugly.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            return (js_file_path, None, 'Num tokens mismatch')

        if beautified_text == minified_text:
            return (js_file_path, None, 'Not minified')

        # try:
        #     iBuilder_clear = IndexBuilder(lex_clear.tokenList)
        # except:
        #     return (js_file_path, None, "IndexBuilder fail on original file.")

        try:
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        with open(temp_files['orig'], 'w') as f:
            f.write(beautified_text)

        with open(temp_files['minified'], 'w') as f:
            f.write(minified_text)

        # try:
        #     orig_lexer = WebLexer(beautified_text)
        #     orig_iBuilder = IndexBuilder(orig_lexer.tokenList)
        #     orig_scopeAnalyst = WebScopeAnalyst(beautified_text)
        # except:
        #     return (js_file_path, None, 'IndexBuilder/Scoper fail on original')

        ########################
        # Nice2Predict
        ########################

        # BV: Next block left out until I figure out the pipe issue
        # BV: Update: I couldn't pipe input to N2P. TODO: FIX

        # Run JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, n2p_text, _err) = unuglifyJS.run(temp_files['minified'])
        if not ok:
            return (js_file_path, None, 'Nice2Predict fail')

        (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        with open(temp_files['n2p'], 'w') as f:
            f.write(n2p_text_beautified)

        try:
            n2p_lexer = WebLexer(n2p_text_beautified)
            n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
            n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
        except:
            return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail')

        # Save some translation stats to compare different methods
        ts = TranslationSummarizer()
        candidates += [['n2p', ''] + x
                       for x in ts.compute_summary_unscoped(n2p_iBuilder,
                                                            n2p_scopeAnalyst)]

        ################################################
        # All other JSNaughty variants
        ################################################

        try:
            scopeAnalyst = WebScopeAnalyst(minified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst minified fail')

        try:
            scopeAnalyst_clear = WebScopeAnalyst(beautified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst clear fail')

        # if (not check(iBuilder_clear, scopeAnalyst_clear,
        #               n2p_iBuilder, n2p_scopeAnalyst)):
        #     return (js_file_path, None, 'JsNice restructured file. Skipping..')

        # Map the original names to the minified counterparts.
        orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsOld = sorted(scopeAnalyst_clear.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])

        if len(orderedVarsOld) != len(orderedVarsNew):
            return (js_file_path, None, "Old and New Name lists different length")

        if len(orderedVarsOld) != len(orderedVarsN2p):
            return (js_file_path, None, "JsNice and Old Name lists different length")

        for i in range(0, len(orderedVarsOld)):
            name_old = orderedVarsOld[i][0]
            def_scope_old = scopeAnalyst_clear.name2defScope[orderedVarsOld[i]]

            name_new = orderedVarsNew[i][0]
            def_scope_new = scopeAnalyst.name2defScope[orderedVarsNew[i]]

            min_name_map[(name_new, def_scope_new)] = (name_old, def_scope_old)

            name_n2p = orderedVarsN2p[i][0]
            def_scope_n2p = scopeAnalyst.name2defScope[orderedVarsNew[i]]

            jsnice_name_map[(name_new, def_scope_new)] = (name_n2p, def_scope_n2p)

        # Once we have the scopeAnalyst, iBuilder, and token list for the
        # minified version, we can get the name properties
        # vm = VariableMetrics(scopeAnalyst, iBuilder_ugly, lex_ugly.tokenList)
        # variableKeySet = vm.getVariables()
        # for variableKey in variableKeySet:
        #     name_features[variableKey] = vm.getNameMetrics(variableKey)

        (_name_positions,
         position_names,
         _use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)

        # Try different renaming strategies (hash, etc.)
        for r_strategy, proxy in proxies:

            # Rename input prior to translation
            preRen = PreRenamer()
            after_text = preRen.rename(r_strategy,
                                       iBuilder_ugly,
                                       scopeAnalyst)

            (ok, beautified_after_text, _err) = clear.web_run(after_text)
            if not ok:
                return (js_file_path, None, 'Beautifier fail')

            # Save renamed input to disk for future inspection
            with open(temp_files['%s' % (r_strategy)], 'w') as f:
                f.write(beautified_after_text)

            a_lexer = WebLexer(beautified_after_text)
            a_iBuilder = IndexBuilder(a_lexer.tokenList)
            a_scopeAnalyst = WebScopeAnalyst(beautified_after_text)

            if (r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):
                # try:
                #     # This should be beautified_after_text instead of after_text
                #     scopeAnalyst_hash = WebScopeAnalyst(beautified_after_text)
                # except:
                #     return (js_file_path, None, "ScopeAnalyst hash fail")

                # Map the hashed names to the minified counterparts.
                orderedVarsMin = sorted(scopeAnalyst.name2defScope.keys(),
                                        key=lambda x: x[1])
                orderedVarsHash = sorted(a_scopeAnalyst.name2defScope.keys(),
                                         key=lambda x: x[1])

                if len(orderedVarsMin) != len(orderedVarsHash):
                    return (js_file_path, None, "Hash and Min lists different length")

                for i in range(0, len(orderedVarsHash)):
                    name_hash = orderedVarsHash[i][0]
                    def_scope_hash = a_scopeAnalyst.name2defScope[orderedVarsHash[i]]

                    name_min = orderedVarsMin[i][0]
                    def_scope_min = scopeAnalyst.name2defScope[orderedVarsMin[i]]

                    hash_name_map[(name_hash, def_scope_hash)] = (name_min, def_scope_min)

            # We can switch this back once we train models on a corpus with literals
            # lx = WebLexer(a_iBuilder.get_text())
            lx = WebLexer(a_iBuilder.get_text_wo_literals())

            # Translate renamed input
            md = WebMosesDecoder(proxy)
            (ok, translation, _err) = md.run(lx.collapsedText)
            if not ok:
                return (js_file_path, None, 'Moses translation fail')

            (a_name_positions,
             a_position_names,
             a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

            nc = []

            if translation is not None:
                # Parse moses output
                mp = MosesParser()
                name_candidates = mp.parse(translation,
                                           a_iBuilder,
                                           a_position_names)

                # name_candidates is a dictionary of dictionaries:
                # keys are (name, def_scope) tuples;
                # values are suggested translations with the sets
                # of line numbers on which they appear.

                # Update name_candidates with some default values
                # (in this case the translation without any renaming)
                # if the translation is empty
                if r_strategy == RS.NONE:
                    # RS.NONE should always be first, by construction
                    name_candidates_default = name_candidates
                    scopeAnalyst_default = a_scopeAnalyst
                    iBuilder_default = a_iBuilder
                else:
                    for key_default, suggestions in name_candidates_default.iteritems():
                        # (name_default, def_scope_default) = key_default
                        pos_default = scopeAnalyst_default.nameDefScope2pos[key_default]

                        (lin, col) = iBuilder_default.revFlatMat[pos_default]
                        (line_num, line_idx) = iBuilder_default.revTokMap[(lin, col)]

                        (name, def_scope) = a_position_names[line_num][line_idx]
                        key = (name, def_scope)

                        for name_translation, lines in suggestions.iteritems():
                            name_candidates.setdefault(key, {})
                            name_candidates[key].setdefault(name_translation, set([]))
                            name_candidates[key][name_translation].update(lines)

                # **** BV: This might be all we need to combine Naughty & Nice
                name_candidates_copy = deepcopy(name_candidates)
                for key, suggestions in name_candidates_copy.iteritems():
                    if r_strategy == RS.NONE:
                        (name_n2p, def_scope_n2p) = jsnice_name_map[key]
                    else:
                        (name_n2p, def_scope_n2p) = \
                            jsnice_name_map[hash_name_map.get(key, key)]

                    for name_translation, lines in suggestions.iteritems():
                        name_candidates.setdefault(key, {})
                        name_candidates[key].setdefault(name_n2p, set([]))
                        name_candidates[key][name_n2p].update(lines)

                cc = ConsistencyController(debug_mode=False)
                ts = TranslationSummarizer()

                # An identifier may have been translated inconsistently
                # across different lines (Moses treats each line independently).
                # Try different strategies to resolve inconsistencies, if any
                for c_strategy in CS.all():

                    # Compute renaming map (x -> length, y -> width, ...)
                    # Note that x, y here are names after (hash) renaming
                    (temp_renaming_map, seen) = cc.computeRenaming(
                        c_strategy, name_candidates, a_name_positions,
                        a_use_scopes, a_iBuilder, lm_path, {}, hash_name_map)

                    # After computeRenaming, we have the entropies stored if we
                    # are in the LMDrop strategy and have the suggestion
                    # frequency from name_candidates. Fill in suggestion_features
                    # if (c_strategy == CS.LMDROP and r_strategy not in suggestion_features):
                    #     assert(cc.suggestion_cache != None)
                    #     suggestion_features[r_strategy] = {}
                    #     """
                    #     name_candidates: dict
                    #     name_candidates[(name, def_scope)][name_translation]
                    #         = set of line numbers in the translation
                    #     """
                    #     for variableKey, suggestionDictionary in name_candidates.iteritems():
                    #         for suggestionName, linesSuggested in suggestionDictionary.iteritems():
                    #
                    #             # I need to revert variableKey[0] in the suggestion
                    #             # from its hash to its original minified name.
                    #             if (r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):
                    #                 unhashedKey = hash_name_map[variableKey]
                    #                 suggestionKey = (unhashedKey[0], unhashedKey[1], suggestionName)
                    #             else:
                    #                 suggestionKey = (variableKey[0], variableKey[1], suggestionName)
                    #
                    #             entropyVals = cc.suggestion_cache.getEntropyStats(variableKey, suggestionName)
                    #             if (entropyVals != (ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR)):
                    #                 suggestionValue = [len(linesSuggested)] + \
                    #                     list(getSuggestionStats(suggestionName)) + \
                    #                     list(entropyVals)
                    #
                    #                 suggestion_features[r_strategy][suggestionKey] = suggestionValue

                    # Fall back on original names in input, if
                    # no translation was suggested
                    postRen = PostRenamer()
                    renaming_map = postRen.updateRenamingMap(a_name_positions,
                                                             position_names,
                                                             a_use_scopes,
                                                             temp_renaming_map,
                                                             seen,
                                                             r_strategy)

                    # Apply renaming map and save output for future inspection
                    renamed_text = postRen.applyRenaming(a_iBuilder,
                                                         a_name_positions,
                                                         renaming_map)
                    (ok, beautified_renamed_text, _err) = clear.web_run(renamed_text)
                    if not ok:
                        return (js_file_path, None, 'Beautifier fail')

                    with open(temp_files['%s_%s' % (r_strategy, c_strategy)], 'w') as f:
                        f.write(beautified_renamed_text)

                    # Save some stats about which names were renamed to what.
                    # This is what enables the comparison between the
                    # different methods.
                    r = [[c_strategy] + x
                         for x in ts.compute_summary_scoped(renaming_map,
                                                            name_candidates,
                                                            a_iBuilder,
                                                            a_scopeAnalyst)]
                    if not r:
                        return (js_file_path, None, 'Compute summary failed')

                    nc += r

            if nc:
                candidates += [[r_strategy] + x for x in nc]

        # Create the rows for the suggestion_model.csv
        # for r_strategy in RS.all():
        #     for suggestionKey, s_feat in suggestion_features[r_strategy].iteritems():
        #         variableKey = (suggestionKey[0], suggestionKey[1])
        #         original_name = min_name_map[variableKey][0]
        #         js_nice_name = jsnice_name_map[variableKey][0]
        #         n_feat = list(name_features[variableKey])
        #         # Convert the def_scope to an equivalent, but smaller,
        #         # easier to read key: (line_num, token_num)
        #         newKey = scopeAnalyst.nameDefScope2pos[variableKey]
        #         (keyLine, keyToken) = iBuilder_ugly.revFlatMat[newKey]
        #         model_rows.append([original_name, r_strategy,
        #                            suggestionKey[0], keyLine, keyToken,
        #                            suggestionKey[2], js_nice_name] + n_feat + s_feat)

        return (js_file_path, 'OK', candidates, model_rows)

    except Exception, e:
        return (js_file_path, None, str(e).replace("\n", ""), model_rows)
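# --- Usage sketch (illustrative only) ---
# Each entry in `candidates` is a summary row prefixed with the renaming
# and consistency strategies ('n2p' rows carry an empty consistency slot).
# A driver would typically flatten these into one CSV per corpus; the
# file name below is a hypothetical example.
import csv

def _write_candidates(results, csv_path='candidates.csv'):
    with open(csv_path, 'wb') as f:
        writer = csv.writer(f)
        for result in results:
            if result[1] == 'OK':
                for row in result[2]:
                    writer.writerow([result[0]] + row)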
def processFile(js_file_path):
    # js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]

    if dbg:
        print js_file_path

    temp_files = {'orig': '%s.js' % base_name,
                  'minified': '%s.u.js' % base_name,
                  'n2p': '%s.n2p.js' % base_name}

    for r_strategy in RS.all():
        temp_files['%s' % (r_strategy)] = \
            '%s.%s.js' % (base_name, r_strategy)
        for c_strategy in CS.all():
            temp_files['%s_%s' % (r_strategy, c_strategy)] = \
                '%s.%s.%s.js' % (base_name, r_strategy, c_strategy)

    for k, v in temp_files.iteritems():
        temp_files[k] = os.path.join(output_path, v)

    candidates = []

    # Minified Name -> Original Name: (name, def_scope) -> (name, def_scope)
    min_name_map = {}
    # Hashed Name -> Minified Name: (name, def_scope) -> (name, def_scope)
    hash_name_map = {}
    # Minified Name -> JSNice name: (name, def_scope) -> (name, def_scope)
    jsnice_name_map = {}

    # Data for the suggestion_model.csv.
    # Map of variable (name, def_scope) -> results of the VariableMetrics
    # features function
    name_features = {}
    # Map of maps of variable-suggestion (name, def_scope, suggestion) ->
    # suggestion line counts + suggestionMetrics features function.
    # The first key is the renaming strategy. Ultimately, we will iterate
    # over this to get the keys out of name_features and build model_rows
    suggestion_features = {}

    # Output lines for the suggestion_model.csv
    model_rows = []

    if True:
    # try:
        # js_text = open(os.path.join(corpus_root, js_file_path), 'r').read()
        js_text = open(js_file_path, 'r').read()

        # Strip comments, replace literals, etc.
        # if True:
        # try:
        prepro = WebLMPreprocessor(js_text)
        prepro_text = str(prepro)
        # except:
        #     return (js_file_path, None, 'Preprocessor fail')

        # print 'Preprocessor'
        # print prepro_text

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text)

        print '\nOK:', ok, 'ERR:', _err
        print tmp_beautified_text

        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text)

        # print '\nOK:', ok, 'ERR:', _err
        # print tmp_minified_text

        if not ok:
            return (js_file_path, None, 'Uglifier fail')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            (aligned_clear, aligned_minified) = aligner.web_align(
                WebLexer(tmp_beautified_text).tokenList,
                WebLexer(tmp_minified_text).tokenList)
        except:
            return (js_file_path, None, 'Aligner fail')

        # print '\nAligned clear'
        # print aligned_clear
        # print '\nAligned minified'
        # print aligned_minified
        # print

        # Pass through beautifier to fix layout
        (ok, beautified_text, _err) = clear.web_run(aligned_clear)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        (ok, minified_text, _err) = clear.web_run(aligned_minified)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # print beautified_text
        # print
        # print minified_text

        # Num tokens before vs after
        try:
            lex_clear = WebLexer(beautified_text)
            tok_clear = lex_clear.tokenList
            lex_ugly = WebLexer(minified_text)
            tok_ugly = lex_ugly.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            return (js_file_path, None, 'Num tokens mismatch')

        if beautified_text == minified_text:
            return (js_file_path, None, 'Not minified')

        try:
            iBuilder_clear = IndexBuilder(lex_clear.tokenList)
        except:
            return (js_file_path, None, "IndexBuilder fail on original file.")

        try:
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            return (js_file_path, None, 'IndexBuilder fail on minified file.')

        # print 'Writing'
        with open(temp_files['orig'], 'w') as f:
            f.write(beautified_text)

        with open(temp_files['minified'], 'w') as f:
            f.write(minified_text)

        ########################
        # Nice2Predict
        ########################

        # BV: Next block left out until I figure out the pipe issue
        # BV: Update: I couldn't pipe input to N2P. TODO: FIX

        # Run JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, n2p_text, _err) = unuglifyJS.run(temp_files['minified'])
        if not ok:
            return (js_file_path, None, 'Nice2Predict fail')

        (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        with open(temp_files['n2p'], 'w') as f:
            f.write(n2p_text_beautified)

        if (True):
        # try:
            n2p_lexer = WebLexer(n2p_text_beautified)
            n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
            n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
        # except:
        #     return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail')

        # print 'n2p'

        # Save some translation stats to compare different methods
        ts = TranslationSummarizer()
        candidates += [['n2p', ''] + x
                       for x in ts.compute_summary_unscoped(n2p_iBuilder,
                                                            n2p_scopeAnalyst)]

        ################################################
        # All other JSNaughty variants
        ################################################

        try:
            scopeAnalyst = WebScopeAnalyst(minified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst minified fail')

        try:
            scopeAnalyst_clear = WebScopeAnalyst(beautified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst clear fail')

        if (not check(iBuilder_clear, scopeAnalyst_clear,
                      n2p_iBuilder, n2p_scopeAnalyst)):
            return (js_file_path, None, 'JsNice restructured file. Skipping..')

        # Map the original names to the minified counterparts and the
        # minified ones to the JSNice renamings
        orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsOld = sorted(scopeAnalyst_clear.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])

        if len(orderedVarsOld) != len(orderedVarsNew):
            return (js_file_path, None, "Old and New Name lists different length")

        if len(orderedVarsOld) != len(orderedVarsN2p):
            return (js_file_path, None, "JsNice and Old Name lists different length")

        for i in range(0, len(orderedVarsOld)):
            name_old = orderedVarsOld[i][0]
            def_scope_old = scopeAnalyst_clear.name2defScope[orderedVarsOld[i]]

            name_new = orderedVarsNew[i][0]
            def_scope_new = scopeAnalyst.name2defScope[orderedVarsNew[i]]

            min_name_map[(name_new, def_scope_new)] = (name_old, def_scope_old)

            name_n2p = orderedVarsN2p[i][0]
            def_scope_n2p = scopeAnalyst.name2defScope[orderedVarsNew[i]]

            jsnice_name_map[(name_new, def_scope_new)] = (name_n2p, def_scope_n2p)

        # Once we have the scopeAnalyst, iBuilder, and token list for the
        # minified version, we can get the name properties
        vm = VariableMetrics(scopeAnalyst, iBuilder_ugly, lex_ugly.tokenList)
        variableKeySet = vm.getVariables()
        for variableKey in variableKeySet:
            name_features[variableKey] = vm.getNameMetrics(variableKey)

        (name_positions,
         position_names,
         use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)

        # print 'Helpers'

        # Try different renaming strategies (hash, etc.)
        for r_strategy, proxy in proxies:

            if dbg:
                print '\n====================='
                print r_strategy
                print '=====================\n'

            # try:
            # if True:

            # Rename input prior to translation
            preRen = PreRenamer()
            after_text = preRen.rename(r_strategy,
                                       iBuilder_ugly,
                                       scopeAnalyst)

            # print 'After text:'
            # print after_text
            # print

            (ok, beautified_after_text, _err) = clear.web_run(after_text)
            if not ok:
                return (js_file_path, None, 'Beautifier fail')

            # print 'Beautified:'
            # print beautified_after_text
            # print

            if (r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):
                try:
                    scopeAnalyst_hash = WebScopeAnalyst(after_text)
                except:
                    return (js_file_path, None, "ScopeAnalyst hash fail")

                # Map the hashed names to the minified counterparts.
                orderedVarsMin = sorted(scopeAnalyst.name2defScope.keys(),
                                        key=lambda x: x[1])
                orderedVarsHash = sorted(scopeAnalyst_hash.name2defScope.keys(),
                                         key=lambda x: x[1])

                if len(orderedVarsMin) != len(orderedVarsHash):
                    return (js_file_path, None, "Hash and Min lists different length")

                for i in range(0, len(orderedVarsHash)):
                    name_hash = orderedVarsHash[i][0]
                    def_scope_hash = scopeAnalyst_hash.name2defScope[orderedVarsHash[i]]

                    name_min = orderedVarsMin[i][0]
                    def_scope_min = scopeAnalyst.name2defScope[orderedVarsMin[i]]

                    hash_name_map[(name_hash, def_scope_hash)] = (name_min, def_scope_min)

            # Save renamed input to disk for future inspection
            with open(temp_files['%s' % (r_strategy)], 'w') as f:
                f.write(beautified_after_text)

            a_lexer = WebLexer(beautified_after_text)
            a_iBuilder = IndexBuilder(a_lexer.tokenList)
            a_scopeAnalyst = WebScopeAnalyst(beautified_after_text)

            # except:
            #     return (js_file_path, None, 'Renaming fail')

            # print 'Lexing'
            # lx = WebLexer(a_iBuilder.get_text())
            lx = WebLexer(a_iBuilder.get_text_wo_literals())
            # print a_iBuilder.get_text_wo_literals()

            # Translate renamed input
            md = WebMosesDecoder(proxy)
            (ok, translation, _err) = md.run(lx.collapsedText)
            if not ok:
                return (js_file_path, None, 'Moses translation fail')

            # print '\ntranslation-------------'
            # print translation
            # if r_strategy == RS.HASH_ONE:
            #     exit()

            (a_name_positions,
             a_position_names,
             a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

            nc = []

            if translation is not None:
                # Parse moses output
                mp = MosesParser()
                if dbg:
                    print '\nr_strategy-----------', r_strategy

                name_candidates = mp.parse(translation,
                                           a_iBuilder,
                                           a_position_names)

                # name_candidates is a dictionary of dictionaries:
                # keys are (name, None) (if scopeAnalyst=None) or
                # (name, def_scope) tuples (otherwise);
                # values are suggested translations with the sets
                # of line numbers on which they appear.

                # print '\nname_candidates before ----------'
                # for key, suggestions in name_candidates.iteritems():
                #     print key[0], key[1][-50:]
                #     # for use_scope, suggestions in val.iteritems():
                #     #     print '\t...', use_scope[-50:]
                #     for name_translation, lines in suggestions.iteritems():
                #         print '\t', name_translation, lines

                # Update name_candidates with some default values
                # (in this case the translation without any renaming)
                # if the translation is empty
                if r_strategy == RS.NONE:
                    # RS.NONE should always be first, by construction
                    name_candidates_default = name_candidates
                    scopeAnalyst_default = a_scopeAnalyst
                    iBuilder_default = a_iBuilder
                else:
                    for key_default, suggestions in name_candidates_default.iteritems():
                        # (name_default, def_scope_default) = key_default
                        pos_default = scopeAnalyst_default.nameDefScope2pos[key_default]

                        (lin, col) = iBuilder_default.revFlatMat[pos_default]
                        (line_num, line_idx) = iBuilder_default.revTokMap[(lin, col)]

                        (name, def_scope) = a_position_names[line_num][line_idx]
                        key = (name, def_scope)

                        for name_translation, lines in suggestions.iteritems():
                            name_candidates.setdefault(key, {})
                            name_candidates[key].setdefault(name_translation, set([]))
                            name_candidates[key][name_translation].update(lines)

                        # for use_scope, suggestions in val.iteritems():
                        #     for name_translation, lines in suggestions.iteritems():
                        #         # key = preRen.simple_direct_map.get(key_default, key_default)
                        #         # name_candidates.setdefault(key, {})
                        #         name_candidates[key].setdefault(use_scope, {})
                        #         name_candidates[key][use_scope].setdefault(name_translation, set([]))
                        #         name_candidates[key][use_scope][name_translation].update(lines)

                # print '\nname_candidates after ----------'
                # for key, suggestions in name_candidates.iteritems():
                #     print key[0], key[1][-50:]
                #     # for use_scope, suggestions in val.iteritems():
                #     #     print '\t...', use_scope[-50:]
                #     for name_translation, lines in suggestions.iteritems():
                #         print '\t', name_translation, lines

                cc = ConsistencyController(debug_mode=True)
                ts = TranslationSummarizer()

                # An identifier may have been translated inconsistently
                # across different lines (Moses treats each line independently).
                # Try different strategies to resolve inconsistencies, if any
                for c_strategy in CS.all():

                    if dbg:
                        print '\nc_strategy----------', c_strategy

                    # assert(hash_name_map != {})

                    # Compute renaming map (x -> length, y -> width, ...)
                    # Note that x, y here are names after renaming
                    (temp_renaming_map, seen) = cc.computeRenaming(c_strategy,
                                                                   name_candidates,
                                                                   a_name_positions,
                                                                   a_use_scopes,
                                                                   a_iBuilder,
                                                                   lm_path,
                                                                   vm,
                                                                   hash_name_map)

                    # After computeRenaming, we have the entropies stored if we
                    # are in the LMDrop strategy and have the suggestion
                    # frequency from name_candidates. Fill in suggestion_features
                    if (c_strategy == CS.LMDROP and r_strategy not in suggestion_features):
                        assert(cc.suggestion_cache != None)
                        suggestion_features[r_strategy] = {}

                        # Need some way of iterating over all name, suggestion groups...
                        """
                        name_candidates: dict
                        name_candidates[(name, def_scope)][name_translation]
                            = set of line numbers in the translation
                        """
                        for variableKey, suggestionDictionary in name_candidates.iteritems():
                            for suggestionName, linesSuggested in suggestionDictionary.iteritems():

                                # I need to revert variableKey[0] in the suggestion
                                # from its hash to its original minified name.
                                if (r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):
                                    unhashedKey = hash_name_map[variableKey]
                                    suggestionKey = (unhashedKey[0], unhashedKey[1], suggestionName)
                                else:
                                    suggestionKey = (variableKey[0], variableKey[1], suggestionName)

                                entropyVals = cc.suggestion_cache.getEntropyStats(variableKey, suggestionName)
                                if (True):  # eval_dbg only
                                # if (entropyVals != (ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR)):
                                    suggestionValue = [len(linesSuggested)] + \
                                        list(getSuggestionStats(suggestionName)) + \
                                        list(entropyVals)

                                    suggestion_features[r_strategy][suggestionKey] = suggestionValue

                    if dbg:
                        print '\ntemp_renaming_map-------------'
                        for (name, def_scope), renaming in temp_renaming_map.iteritems():
                            print (name, def_scope[-50:]), renaming

                    # Fall back on original names in input, if
                    # no translation was suggested
                    postRen = PostRenamer()
                    renaming_map = postRen.updateRenamingMap(a_name_positions,
                                                             position_names,
                                                             a_use_scopes,
                                                             temp_renaming_map,
                                                             seen,
                                                             r_strategy)

                    # new_name_candidates = {}
                    #
                    # for (name, def_scope), renaming in temp_renaming_map.iteritems():
                    #     (line_num, line_idx) = a_name_positions[(name, def_scope)][0]
                    #     (old_name, old_def_scope) = position_names[line_num][line_idx]
                    #     new_name_candidates.setdefault((old_name, old_def_scope), {})
                    #     new_name_candidates[(old_name, old_def_scope)][renaming] = set([1])

                    # tmp_renamed_text = postRen.applyRenaming(a_iBuilder,
                    #                                          a_name_positions,
                    #                                          temp_renaming_map)
                    # (ok, tmp_beautified_renamed_text, _err) = clear.web_run(tmp_renamed_text)
                    # if not ok:
                    #     return (js_file_path, None, 'Beautifier fail')
                    #
                    # tmp_lexer = WebLexer(tmp_beautified_renamed_text)
                    # tmp_iBuilder = IndexBuilder(tmp_lexer.tokenList)
                    # tmp_scopeAnalyst = WebScopeAnalyst(tmp_beautified_renamed_text)
                    #
                    # (tmp_name_positions,
                    #  tmp_position_names,
                    #  tmp_use_scopes) = prepHelpers(tmp_iBuilder, tmp_scopeAnalyst)

                    # renaming_map = postRen.updateRenamingMap(tmp_name_positions,
                    #                                          position_names,
                    #                                          temp_renaming_map,
                    #                                          r_strategy)
                    #
                    # renaming_map = cc.computeRenaming(CS.FREQLEN,
                    #                                   new_name_candidates,
                    #                                   name_positions,
                    #                                   use_scopes,
                    #                                   iBuilder_ugly,
                    #                                   lm_path)

                    # # Fall back on original names in input, if
                    # # no translation was suggested
                    # postRen = PostRenamer()
                    # renaming_map = postRen.updateRenamingMap(a_name_positions,
                    #                                          position_names,
                    #                                          temp_renaming_map,
                    #                                          r_strategy)

                    if dbg:
                        print '\nrenaming_map-------------'
                        for (name, def_scope), renaming in renaming_map.iteritems():
                            print (name, def_scope[-50:]), renaming, \
                                '(%s)' % temp_renaming_map[(name, def_scope)]

                    # Apply renaming map and save output for future inspection
                    renamed_text = postRen.applyRenaming(a_iBuilder,
                                                         a_name_positions,
                                                         renaming_map)

                    print '\nrenamed_text--------------'
                    print renamed_text
                    print

                    (ok, beautified_renamed_text, _err) = clear.web_run(renamed_text)
                    if not ok:
                        return (js_file_path, None, 'Beautifier fail')

                    with open(temp_files['%s_%s' % (r_strategy, c_strategy)], 'w') as f:
                        f.write(beautified_renamed_text)

                    # Save some stats about which names were renamed to what.
                    # This is what enables the comparison between the
                    # different methods.
                    r = [[c_strategy] + x
                         for x in ts.compute_summary_scoped(renaming_map,
                                                            name_candidates,
                                                            a_iBuilder,
                                                            a_scopeAnalyst)]
                    if not r:
                        return (js_file_path, None, 'Compute summary failed')

                    nc += r

            if nc:
                candidates += [[r_strategy] + x for x in nc]

        # Create the rows for the suggestion_model.csv
        for r_strategy in RS.all():
            for suggestionKey, s_feat in suggestion_features[r_strategy].iteritems():
                variableKey = (suggestionKey[0], suggestionKey[1])
                original_name = min_name_map[variableKey][0]
                js_nice_name = jsnice_name_map[variableKey][0]

                if (variableKey in name_features):  # eval_dbg only
                    n_feat = list(name_features[variableKey])
                    # Convert the def_scope to an equivalent, but smaller,
                    # easier to read key: (line_num, token_num)
                    newKey = scopeAnalyst.nameDefScope2pos[variableKey]
                    (keyLine, keyToken) = iBuilder_ugly.revFlatMat[newKey]

                    model_rows.append([original_name, r_strategy,
                                       suggestionKey[0], keyLine, keyToken,
                                       suggestionKey[2], js_nice_name] + n_feat + s_feat)

        return (js_file_path, 'OK', candidates, model_rows)
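# --- Usage sketch (illustrative only) ---
# model_rows layout, as built above: original name, renaming strategy,
# (possibly unhashed) minified name, definition line/token, suggested
# name, JSNice name, then the name-metric and suggestion-metric feature
# vectors. The column names and file name below are illustrative, not
# from the original.
import csv

def _write_model_rows(model_rows, csv_path='suggestion_model.csv'):
    with open(csv_path, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(['original', 'r_strategy', 'min_name',
                         'def_line', 'def_token', 'suggestion',
                         'jsnice_name'])  # feature columns follow
        writer.writerows(model_rows)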
def deobfuscateJS(self, obfuscatedCode, use_mix, transactionID,
                  debug_output=False, parallel=True, use_local=True):
    """
    Take a string representing minified javascript code and attempt
    to translate it into a version with better renamings.

    Parameters
    ----------
    obfuscatedCode: The minified javascript text.

    use_mix: True/False -> should we invoke JSNice and throw the names
    into the language model mix?

    transactionID: an ID for storing temp files - used currently only
    to identify the input to JSNice.

    debug_output: should we print debugging output in this pass?
    (True/False)

    parallel: enable parallelization performance enhancements, such as
    calling the moses servers in parallel.

    Returns
    -------
    A tuple:
    renamed_text - the renamed text
    jsnice_error - "" if no error, otherwise a message stating where
                   the jsnice mixing failed
    The third element is a tuple of TIMING_COUNT performance times:
        preprocess time - total time to preprocess before invoking the
                          moses servers
        prepre time - how long the first step of the preprocessor takes
        jsnice time - part of the preprocessing: how long it takes to
                      get and parse the jsnice names
        renaming time - how long the hashing steps in preprocessing took
        lex_total_time - how long all the lexers took
        builder_time - how long all the IndexBuilders took
        scoper_time - how long all the ScopeAnalysts took
        moses time - how long the moses servers took
        moses_rn_parallel - total time for the parallel moses and
                            renaming to complete
        postprocess time - how long the consistency resolution and
                           language model queries took
    """
    RS = RenamingStrategies()
    CS = ConsistencyStrategies()

    r_strategy = RS.HASH_ONE
    # c_strategy = CS.FREQLEN  # or CS.LM? (CS.LM requires a language
    #                          # model + a querylm from moses)
    # c_strategy = CS.LM
    c_strategy = CS.LOGMODEL

    if (use_local == False):
        proxies = MosesProxy().web_proxies
    else:
        proxies = MosesProxy().web_local

    mosesParams = {}

    # lm_path = "/data/bogdanv/deobfuscator/experiments/corpora/corpus.lm.970k/js.blm.lm"
    # lm_path = "/data/bogdanv/deobfuscator/experiments/corpora/corpus.lm.500k/js.blm.lm"
    lm_path = "./phrase-tables/langmodels/js.blm.lm"
    # if socket.gethostname() == 'bogdan.mac':
    #     lm_path = "/Users/bogdanv/workspace2/deobfuscator/data/lm/js.blm.lm"
    # elif socket.gethostname() == "Caseys-MacBook-Pro.local" or socket.gethostname() == "campus-019-136.ucdavis.edu":
    #     lm_path = "/Users/caseycas/jsnaughty_lms/js970k.blm.lm"

    # Hashed Name -> Minified Name: (name, def_scope) -> (name, def_scope)
    hash_name_map = {}
    # Minified Name -> jsnice name: (name, def_scope) -> (name, def_scope)
    jsnice_name_map = {}

    # Record of any errors we get in the js mixing.
    # If this feature is enabled (to be added as a switch on the website)
    # it should not crash the input if there is a failure. If the query
    # doesn't work for some reason, then we should just use the candidate
    # names provided by moses.
    jsnice_errors = []

    start = time.time()

    # Strip comments, replace literals, etc.
    try:
    # if True:
        prepro = WebLMPreprocessor(obfuscatedCode)
        prepro_text = str(prepro)
        if (debug_output):
            print("Prepro_text----------------------------------")
            print(prepro_text)
            print("Prepro_text----------------------------------")
    except:
        return ((prepro_error, "", (0,) * TIMING_COUNT))

    prepre_end = time.time()
    prepre_time = prepre_end - start

    clear = Beautifier()

    (ok, beautified_text, _err) = clear.web_run(prepro_text)
    if (debug_output):
        print("Beautified Text")
        print(beautified_text)

    if (not ok):
        return ((beaut_error, "", (0,) * TIMING_COUNT))

    # Due to a bug(?) in the jsnice web service, we need to save the
    # input text as a file.
    min_input_file = os.path.join(self.tmpDir, str(transactionID) + ".u.js")
    with open(min_input_file, 'w') as f:
        f.write(beautified_text)

    try:
        # lex_ugly = Lexer(beautFile)
        lex_ugly = WebLexer(beautified_text)
        if (debug_output):
            print("Lex_ugly---------------------")
            print(lex_ugly.tokenList)
            print("Lex_ugly---------------------")
        iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
    except:
        return ((ib_error, "", (0,) * TIMING_COUNT))

    # Do scope-related tasks on a raw text version
    try:
        # scopeAnalyst = ScopeAnalyst(beautFile)
        scopeAnalyst = WebScopeAnalyst(beautified_text)
    except:
        return ((sa_error, "", (0,) * TIMING_COUNT))

    # Cut short if there are no variables to rename
    if (not scopeAnalyst.hasMinifiableVariables()):
        return ((beautified_text, "No Minifiable Variables",
                 (0,) * TIMING_COUNT))
    elif (debug_output):
        print("GLOBAL VAR MAP: " + str(scopeAnalyst.isGlobal))

    # lex_ugly.write_temp_file(tempFile)

    js_start = time.time()

    ########################
    # Nice2Predict start
    ########################

    # We don't want a crashing failure for the jsnice query.

    # BV: Next block left out until I figure out the pipe issue
    # BV: Update: I couldn't pipe input to N2P. TODO: FIX
    # Run JSNice from http://www.nice2predict.org
    if (use_mix):
        unuglifyJS = UnuglifyJS()
        (ok, n2p_text, _err) = unuglifyJS.run(min_input_file)
        # ok = False  # Failure test
        if not ok:
            jsnice_errors.append('Nice2Predict fail')
            # return (js_file_path, None, 'Nice2Predict fail')

    if (use_mix and jsnice_errors == []):
        (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
        if not ok:
            jsnice_errors.append('Beautifier failed for JSNice.')
            # return (js_file_path, None, 'Beautifier fail')

        if (debug_output):
            print("JSNice Text")
            print(n2p_text_beautified)

        try:
            n2p_lexer = WebLexer(n2p_text_beautified)
            n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
            n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
        except:
            jsnice_errors.append("IndexBuilder or ScopeAnalyst failed for JSNice.")
            # return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail')

    ########################
    # Nice2Predict End
    ########################

    js_end = time.time()
    js_time = js_end - js_start

    # Do scope-related tasks
    (name_positions,
     position_names,
     use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)

    # Map the jsnice names to the minified counterparts.
    if (use_mix and jsnice_errors == []):
        # Only attempt if we are error free for jsnice up to this point.
        try:
            orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(),
                                    key=lambda x: x[1])
            orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(),
                                    key=lambda x: x[1])

            if (len(orderedVarsNew) != len(orderedVarsN2p)):
                jsnice_errors.append("JSNice and minified name lists different lengths.")
                # raise IndexError("Length Mismatch")
                # Probably better to have our own defined error type,
                # but this will do for now
                # return ("JsNice and New Name lists different length")

            for i in range(0, len(orderedVarsNew)):
                name_new = orderedVarsNew[i][0]
                def_scope_new = scopeAnalyst.name2defScope[orderedVarsNew[i]]

                name_n2p = orderedVarsN2p[i][0]
                def_scope_n2p = scopeAnalyst.name2defScope[orderedVarsNew[i]]

                jsnice_name_map[(name_new, def_scope_new)] = (name_n2p, def_scope_n2p)
        except:
            jsnice_errors.append("JSNice to minified name map building failed.")

    (_name_positions,
     position_names,
     _use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)

    # Note: we want to put these in parallel once we've tested the
    # serial version...
    pre_outer_end = time.time()
    pre_time = pre_outer_end - start

    if (not parallel):
        # Get moses output for no_renaming
        (status, error_msg, translation_default, name_candidates_default,
         iBuilder_default, scopeAnalyst_default, name_positions_default,
         position_names_default, use_scopes_default, hash_name_map_default,
         rn_time_default, m_time_default, lex_time_default,
         post_start_default) = getMosesTranslation(proxies[RS.NONE],
                                                   RS.NONE, RS, clear,
                                                   iBuilder_ugly,
                                                   scopeAnalyst,
                                                   debug_output)
        # print("MOSES NO RENAMING: " + str(m_time_default))

        if (not status):
            return ((error_msg, "", (0,) * TIMING_COUNT))

        # Get moses output for hash_renaming
        (status, error_msg, translation, name_candidates,
         a_iBuilder, a_scopeAnalyst, a_name_positions,
         a_position_names, a_use_scopes, hash_name_map,
         rn_time, m_time, lex_time,
         post_start) = getMosesTranslation(proxies[r_strategy],
                                           r_strategy, RS, clear,
                                           iBuilder_ugly, scopeAnalyst,
                                           debug_output)
        # print("MOSES HASH RENAMING: " + str(m_time))

        if (not status):
            return ((error_msg, "", (0,) * TIMING_COUNT))

        m_parallel_time = 0
    else:
        # Parallel version
        none_wrapper = (RS.NONE, RS, clear, iBuilder_ugly,
                        scopeAnalyst, debug_output, use_local)
        hash_wrapper = (r_strategy, RS, clear, iBuilder_ugly,
                        scopeAnalyst, debug_output, use_local)
        wrappers = [none_wrapper, hash_wrapper]

        pool = multiprocessing.Pool(processes=2)
        m_parallel_start = time.time()

        for result in pool.imap(getMosesTranslationParallel, wrappers):
            if (result[0] == RS.NONE):
                # No renaming
                (status, error_msg, translation_default, name_candidates_default,
                 iBuilder_default, scopeAnalyst_default, name_positions_default,
                 position_names_default, use_scopes_default, hash_name_map_default,
                 rn_time_default, m_time_default, lex_time_default,
                 post_start_default) = result[1]
                # print("MOSES NO RENAMING: " + str(m_time_default))

                if (not status):
                    return ((error_msg, "", (0,) * TIMING_COUNT))
            else:
                (status, error_msg, translation, name_candidates,
                 a_iBuilder, a_scopeAnalyst, a_name_positions,
                 a_position_names, a_use_scopes, hash_name_map,
                 rn_time, m_time, lex_time, post_start) = result[1]
                # print("MOSES HASH RENAMING: " + str(m_time))

                if (not status):
                    return ((error_msg, "", (0,) * TIMING_COUNT))

        m_parallel_time = time.time() - m_parallel_start

    pre_time += rn_time_default + rn_time

    if (debug_output):
        print("Serial: " + str(m_time + m_time_default +
                               rn_time + rn_time_default))
        print("Parallel: " + str(m_parallel_time))

    if translation is not None and translation_default is not None:

        for key_default, suggestions in name_candidates_default.iteritems():
            # (name_default, def_scope_default) = key_default
            pos_default = scopeAnalyst_default.nameDefScope2pos[key_default]

            (lin, col) = iBuilder_default.revFlatMat[pos_default]
            (line_num, line_idx) = iBuilder_default.revTokMap[(lin, col)]

            (name, def_scope) = a_position_names[line_num][line_idx]
            key = (name, def_scope)

            for name_translation, lines in suggestions.iteritems():
                name_candidates.setdefault(key, {})
                name_candidates[key].setdefault(name_translation, set([]))
                name_candidates[key][name_translation].update(lines)

    # name_candidates is a dictionary of dictionaries:
    # keys are (name, None) (if scopeAnalyst=None) or
    # (name, def_scope) tuples (otherwise);
    # values are suggested translations with the sets
    # of line numbers on which they appear.

    # if (True):
    if (debug_output):
        print("Name_candidates")
        print(name_candidates)
        print("jsnice_name_map")
        print(jsnice_name_map)
        print("hash_name_map")
        print(hash_name_map)

    # **** BV: This might be all we need to combine Naughty & Nice
    if (use_mix and jsnice_errors == []):
        # Only attempt if we are error free for jsnice up to this point.
        try:
            name_candidates_copy = deepcopy(name_candidates)
            for key, suggestions in name_candidates_copy.iteritems():
                if (debug_output):
                    print("Key: " + str(key))
                    print("Suggestions: " + str(suggestions))
                if r_strategy == RS.NONE:
                    (name_n2p, def_scope_n2p) = jsnice_name_map[key]
                else:
                    (name_n2p, def_scope_n2p) = \
                        jsnice_name_map[hash_name_map.get(key, key)]

                for name_translation, lines in suggestions.iteritems():
                    name_candidates.setdefault(key, {})
                    name_candidates[key].setdefault(name_n2p, set([]))
                    name_candidates[key][name_n2p].update(lines)
        except:
            jsnice_errors.append("Failure while adding jsnice names to candidate pool.")

    cr = ConsistencyController(debug_mode=debug_output)

    # An identifier may have been translated inconsistently
    # across different lines (Moses treats each line independently).
    # Try different strategies to resolve inconsistencies, if any.
    # Compute renaming map (x -> length, y -> width, ...)
    # Note that x, y here are names after renaming.
    # The hash error is occurring in here.
    try:
        (temp_renaming_map, seen) = cr.computeRenaming(c_strategy,
                                                       name_candidates,
                                                       a_name_positions,
                                                       a_use_scopes,
                                                       a_iBuilder,
                                                       lm_path,
                                                       {},
                                                       hash_name_map)
    except:
        return ("Compute renaming fail.", "", (0,) * TIMING_COUNT)

    if (debug_output):
        print("Temp renaming map")
        print(temp_renaming_map)

    # Fall back on original names in input, if
    # no translation was suggested
    postRen = PostRenamer()
    renaming_map = postRen.updateRenamingMap(a_name_positions,
                                             position_names,
                                             a_use_scopes,
                                             temp_renaming_map,
                                             seen,
                                             r_strategy)

    if (debug_output):
        print("Renaming Map")
        print(renaming_map)

    # Apply renaming map and save output for future inspection
    renamed_text = postRen.applyRenaming(a_iBuilder,
                                         a_name_positions,
                                         renaming_map)
    (ok, beautified_renamed_text, _err) = clear.web_run_end(renamed_text)
    # print(name_candidates)
    # print("--------------")
    # print(renamed_text)
    # print("--------------")
    # print(beautified_renamed_text)
    # print("--------------")
    # print(" ".join(jsnice_errors))
    if not ok:
        return ((beaut_error, "", (0,) * TIMING_COUNT))

    if (debug_output):
        print("Renamed text")
        print(beautified_renamed_text)

    # Time calculations... (will need updating when this becomes parallel)
    post_end = time.time()
    post_time = post_end - post_start

    # Record any jsnice errors (but leave the output blank if there are none).
    jsnice_error_string = ""
    if (jsnice_errors != []):
        jsnice_error_string = ("JSNice mixing attempt failed. Reporting "
                               "renaming with only our method. \n"
                               "JSNice Errors : \n")
        jsnice_error_string += "\n".join(jsnice_errors) + "\n"

    # Tally up the build times for the lexers, index builders and scopers.
    if (not use_mix):
        n2pLexTime = 0
        n2pBuildTime = 0
        n2pSATime = 0
    else:
        n2pLexTime = n2p_lexer.build_time
        n2pBuildTime = n2p_iBuilder.build_time
        n2pSATime = n2p_scopeAnalyst.build_time

    # Lexers
    lex_total_time = lex_time + lex_time_default + lex_ugly.build_time + n2pLexTime
    # IndexBuilders
    builder_time = (iBuilder_ugly.build_time + n2pBuildTime +
                    a_iBuilder.build_time + iBuilder_default.build_time)
    # Scopers
    scoper_time = (n2pSATime + scopeAnalyst.build_time +
                   scopeAnalyst_default.build_time + a_scopeAnalyst.build_time)

    # Change the presentation of this to return performance information
    # and error codes as separate elements in a tuple.
    # New return: translation, jsnice_error, preprocess time, js_time,
    # rename_time, m_time, post_time.
    return ((str(beautified_renamed_text),
             jsnice_error_string,
             (pre_time, prepre_time, js_time, rn_time + rn_time_default,
              lex_total_time, builder_time, scoper_time,
              m_time + m_time_default, m_parallel_time, post_time)))
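# --- Usage sketch (illustrative only) ---
# A minimal call, assuming `service` is the object this method belongs to,
# constructed elsewhere with a tmpDir and running moses servers; the
# snippet and transaction ID are hypothetical.
def _example_deobfuscate(service):
    (renamed, jsnice_err, times) = service.deobfuscateJS(
        'var a=function(b,c){return b+c;};',
        use_mix=False, transactionID='demo1',
        parallel=False, use_local=True)
    print(renamed)      # deobfuscated text
    print(jsnice_err)   # '' unless JSNice mixing failed
    print(times)        # TIMING_COUNT-element timing tuple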
def getMosesTranslation(proxy, r_strategy, RS, a_beautifier,
                        iBuilder_ugly, scopeAnalyst_ugly,
                        debug_mode=False):
    """
    A helper function so that we can run multiple different renaming
    strategies through moses in a more modular and hopefully
    parallelizable manner. It performs hashing/no-hashing preparation
    of the file for the renaming strategy specified by r_strategy, and
    then calls the appropriate moses server.

    Parameters
    ----------
    proxy: A pointer to which port the appropriate moses server is
    listening on for this particular renaming strategy.

    r_strategy: One of the renaming strategies from RenamingStrategies.

    RS: A RenamingStrategies object.

    a_beautifier: a Beautifier object to make sure the renamed text is
    cleanly formatted.

    iBuilder_ugly: IndexBuilder for the minified file.

    scopeAnalyst_ugly: ScopeAnalyst for the minified file.

    debug_mode: Print debug information? (True/False, defaults to False)

    Returns
    -------
    (status, error, translation, name_candidates, a_iBuilder,
     a_scopeAnalyst, a_name_positions, a_position_names, a_use_scopes,
     hash_name_map, rn_time, m_time, lex_time, post_start)

    status: Did this complete without error? If False, then the rest of
    the output besides error will be empty/null.

    error: What is the reason for the failure? If status is True
    (successful completion) this is "".

    translation: The raw Moses output.

    name_candidates: The set of Moses suggestions for this renaming.

    a_iBuilder, a_scopeAnalyst: IndexBuilder and ScopeAnalyst for this
    renaming.

    a_name_positions, a_position_names, a_use_scopes: Additional
    tracking info.

    hash_name_map: a map from the hashed names to the original minified
    names.

    rn_time, m_time, lex_time, post_start: The durations of the
    renaming, Moses translation, and lexing steps, along with the start
    time for the postprocessing of the Moses output.
    """
    rn_start = time.time()

    # We need both the base text and the hashed text.
    preRen = PreRenamer()

    if (debug_mode):
        print("Tokens-------------------")
        print(iBuilder_ugly.tokens)
        print("Tokens-------------------")

    # We always need the non-hashed names as a fallback.
    try:
        after_text = preRen.rename(r_strategy,
                                   iBuilder_ugly,
                                   scopeAnalyst_ugly)
    except:
        return (False, "Renaming failed for " + str(r_strategy), "", {},
                None, None, {}, {}, {}, {}, 0, 0, 0, 0)

    (ok, beautified_after_text, _err) = a_beautifier.web_run(after_text)
    if not ok:
        return (False,
                "Beautifier failed on the renamed text for " + str(r_strategy),
                "", {}, None, None, {}, {}, {}, {}, 0, 0, 0, 0)

    # Align hashed and non-hashed files, in case the beautifier
    # line-wrapped the extended lines.
    try:
        aligner = Aligner()
        (aligned_after, aligned_before) = aligner.web_align(
            WebLexer(beautified_after_text).tokenList,
            WebLexer(iBuilder_ugly.get_text()).tokenList)
    except:
        return (False,
                "Aligner failed on the renamed text for " + str(r_strategy),
                "", {}, None, None, {}, {}, {}, {}, 0, 0, 0, 0)

    # print("--------Aligned After-------")
    # print(aligned_after)
    # print("----------------------------")

    a_lexer = WebLexer(aligned_after)
    a_iBuilder = IndexBuilder(a_lexer.tokenList)
    a_scopeAnalyst = WebScopeAnalyst(aligned_after)

    hash_name_map = {}
    if (r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):
        # Something below here is buggy...
        orderedVarsMin = sorted(scopeAnalyst_ugly.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsHash = sorted(a_scopeAnalyst.name2defScope.keys(),
                                 key=lambda x: x[1])

        # print("Min len: " + str(len(orderedVarsMin)))
        # print("Hash len: " + str(len(orderedVarsHash)))

        if (len(orderedVarsMin) != len(orderedVarsHash)):
            return (False, "Mismatch between minified and hashed names.",
                    "", {}, a_iBuilder, a_scopeAnalyst, {}, {}, {}, {},
                    0, 0, 0, 0)

        for i in range(0, len(orderedVarsHash)):
            name_hash = orderedVarsHash[i][0]
            def_scope_hash = a_scopeAnalyst.name2defScope[orderedVarsHash[i]]

            name_min = orderedVarsMin[i][0]
            def_scope_min = scopeAnalyst_ugly.name2defScope[orderedVarsMin[i]]

            hash_name_map[(name_hash, def_scope_hash)] = (name_min, def_scope_min)

    if (debug_mode):
        print("HASH NAME MAP LEN: " + str(len(hash_name_map)))

    # We can switch this back once we train models on a corpus with literals
    # lx = WebLexer(a_iBuilder.get_text())
    lx = WebLexer(a_iBuilder.get_text_wo_literals())

    # print("-----------------Moses In ----------------------")
    # print(lx)
    # print("------------------------------------------------")
    # print(a_iBuilder.charPosition2Name)
    # print("------------------------------------------------")

    # line_subset = a_scopeAnalyst.getMinifiableLines(a_iBuilder)
    # line_list = sorted(list(line_subset))
    # line_map = {}
    # m_line = 0
    # for next_line in line_list:
    #     line_map[m_line] = next_line
    #     m_line += 1
    # lx = WebLexer(a_iBuilder.get_text_on_lines_wo_literals(line_subset))

    # Performance measures -> wrap up the preprocessing/renaming phases
    end = time.time()
    rn_time = end - rn_start

    m_start = time.time()

    # if (debug_mode):
    #     print("Invoking Moses.")
    #     print(lx.collapsedText)

    # Translate renamed input
    # md = WebMosesDecoder(proxy)
    # (ok, translation, _err) = md.run(lx.collapsedText)
    (ok, translation, _err) = segmentedTranslation(lx,
                                                   SEGMENTED_TRANS_SIZE,
                                                   proxy,
                                                   debug_mode)
    if not ok:
        return (False, "Moses server failed for " + str(r_strategy),
                translation, {}, a_iBuilder, a_scopeAnalyst, {}, {}, {},
                hash_name_map, 0, 0, 0, 0)

    m_end = time.time()
    m_time = m_end - m_start

    post_start = time.time()

    (a_name_positions,
     a_position_names,
     a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

    # Initialize so the return below is well defined even when the
    # decoder produced no translation (was previously unbound in that case).
    name_candidates = {}

    if translation is not None:
        # Parse moses output
        mp = MosesParser()
        if (debug_mode):
            print(translation)
        name_candidates = mp.parse(translation,
                                   a_iBuilder,
                                   a_position_names)
                                   # a_scopeAnalyst)
        # A slightly modified version of parse to remap the moses
        # output lines to the correct original lines.
        # name_candidates = mp.parse_subset(translation,
        #                                   a_iBuilder,
        #                                   a_position_names,
        #                                   line_map)

    lex_time = lx.build_time + a_lexer.build_time

    return (True, "", translation, name_candidates, a_iBuilder,
            a_scopeAnalyst, a_name_positions, a_position_names,
            a_use_scopes, hash_name_map, rn_time, m_time, lex_time,
            post_start)
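# --- Sketch (illustrative only) ---
# getMosesTranslationParallel, used by deobfuscateJS above, is assumed to
# be a thin multiprocessing-friendly wrapper around getMosesTranslation:
# it unpacks the argument tuple, looks up the proxy for the strategy, and
# tags the result so the caller can tell the RS.NONE and hashed runs
# apart. The name below is deliberately distinct from the real function.
def getMosesTranslationParallelSketch(wrapper):
    (r_strategy, RS, clear, iBuilder_ugly,
     scopeAnalyst, debug_output, use_local) = wrapper
    proxies = MosesProxy().web_local if use_local else MosesProxy().web_proxies
    result = getMosesTranslation(proxies[r_strategy], r_strategy, RS,
                                 clear, iBuilder_ugly, scopeAnalyst,
                                 debug_output)
    return (r_strategy, result)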