def processFile(js_file_path):
    """Run the full JSNaughty deminification pipeline on one JavaScript file.

    Pipeline (Python 2): read the file, preprocess and beautify it, minify it
    with UglifyJS, align minified/clear token streams, run the Nice2Predict
    (JSNice) baseline, then for every renaming strategy in ``proxies`` rename
    the minified input, translate it with a Moses server, and resolve
    translation inconsistencies with every consistency strategy in ``CS``.

    NOTE(review): a second ``def processFile(l)`` appears later in this file
    and shadows this one at import time — confirm which variant is intended
    to be live.

    Relies on module globals: dbg, RS, CS, output_path, proxies, lm_path and
    the project classes (WebLMPreprocessor, Beautifier, Uglifier, Aligner,
    WebLexer, IndexBuilder, WebScopeAnalyst, UnuglifyJS, VariableMetrics,
    PreRenamer, PostRenamer, MosesParser, WebMosesDecoder,
    ConsistencyController, TranslationSummarizer).

    Parameters
    ----------
    js_file_path: path to the input .js file (read directly from disk here).

    Returns
    -------
    On success: (js_file_path, 'OK', candidates, model_rows)
    On failure: (js_file_path, None, <reason string>) — note the failure
    tuple has 3 elements while the success tuple has 4; callers must handle
    both shapes.
    """
    # js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]
    if dbg:
        print js_file_path

    # Per-stage output file names, later joined onto output_path:
    # one per renaming strategy, plus one per (renaming, consistency) pair.
    temp_files = {'orig': '%s.js' % base_name,
                  'minified': '%s.u.js' % base_name,
                  'n2p': '%s.n2p.js' % base_name}
    for r_strategy in RS.all():
        temp_files['%s' % (r_strategy)] = \
            '%s.%s.js' % (base_name, r_strategy)
        for c_strategy in CS.all():
            temp_files['%s_%s' % (r_strategy, c_strategy)] = \
                '%s.%s.%s.js' % (base_name, r_strategy, c_strategy)
    for k, v in temp_files.iteritems():
        temp_files[k] = os.path.join(output_path, v)

    candidates = []

    # Minified Name -> Original Name; keys and values are (name, def_scope)
    min_name_map = {}
    # Hashed Name -> Minified Name; keys and values are (name, def_scope)
    hash_name_map = {}
    # Minified Name -> JSNice name; keys and values are (name, def_scope)
    jsnice_name_map = {}
    # Data for the suggestion model .csv:
    # variable (name, def_scope) -> results of VariableMetrics name features
    name_features = {}
    # Keyed first by renaming strategy, then by
    # (name, def_scope, suggestion) -> suggestion line counts + metrics.
    # Ultimately iterated together with name_features to build model_rows.
    suggestion_features = {}
    # Output lines for the suggestion_model.csv
    model_rows = []

    # Scaffolding left from a disabled try/except wrapper; kept so the body
    # indentation (and a trailing dedented return) stays unchanged.
    if True:
    # try:
        # js_text = open(os.path.join(corpus_root, js_file_path), 'r').read()
        js_text = open(js_file_path, 'r').read()

        # Strip comments, replace literals, etc
        # (exception guard disabled — a preprocessor failure will propagate)
        # try:
        prepro = WebLMPreprocessor(js_text)
        prepro_text = str(prepro)
        # except:
        #     return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text)
        # Debug output left enabled in this variant.
        print '\nOK:', ok, 'ERR:', _err
        print tmp_beautified_text
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text)
        if not ok:
            return (js_file_path, None, 'Uglifier fail')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            (aligned_clear, aligned_minified) = \
                aligner.web_align(WebLexer(tmp_beautified_text).tokenList,
                                  WebLexer(tmp_minified_text).tokenList)
        except:
            return (js_file_path, None, 'Aligner fail')

        # Pass through beautifier again to fix layout after alignment
        (ok, beautified_text, _err) = clear.web_run(aligned_clear)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        (ok, minified_text, _err) = clear.web_run(aligned_minified)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Num tokens before vs after
        try:
            lex_clear = WebLexer(beautified_text)
            tok_clear = lex_clear.tokenList
            lex_ugly = WebLexer(minified_text)
            tok_ugly = lex_ugly.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            return (js_file_path, None, 'Num tokens mismatch')
        if beautified_text == minified_text:
            return (js_file_path, None, 'Not minified')

        try:
            iBuilder_clear = IndexBuilder(lex_clear.tokenList)
        except:
            return (js_file_path, None, "IndexBuilder fail on original file.")
        try:
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            return (js_file_path, None, 'IndexBuilder fail on minified file.')

        # Persist the aligned clear/minified pair for later inspection.
        with open(temp_files['orig'], 'w') as f:
            f.write(beautified_text)
        with open(temp_files['minified'], 'w') as f:
            f.write(minified_text)

        ########################
        # Nice2Predict
        ########################

        # BV: Next block left out until I figure out the pipe issue
        # BV: Update: I couldn't pipe input to N2P. TODO: FIX
        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, n2p_text, _err) = unuglifyJS.run(temp_files['minified'])
        if not ok:
            return (js_file_path, None, 'Nice2Predict fail')
        (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        with open(temp_files['n2p'], 'w') as f:
            f.write(n2p_text_beautified)

        # Scaffolding for a disabled try/except (failures will propagate).
        if(True):
        #try:
            n2p_lexer = WebLexer(n2p_text_beautified)
            n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
            n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
        #except:
        #    return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail')

        # Save some translation stats to compare different methods
        ts = TranslationSummarizer()
        candidates += [['n2p', ''] + x
                       for x in ts.compute_summary_unscoped(n2p_iBuilder,
                                                            n2p_scopeAnalyst)]

        ################################################
        # All other JSNaughty variants
        ################################################

        try:
            scopeAnalyst = WebScopeAnalyst(minified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst minified fail')
        try:
            scopeAnalyst_clear = WebScopeAnalyst(beautified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst clear fail')

        # Bail out if JSNice changed the program structure (variable sets
        # would no longer line up positionally below).
        if(not check(iBuilder_clear, scopeAnalyst_clear,
                     n2p_iBuilder, n2p_scopeAnalyst)):
            return (js_file_path, None, 'JsNice restructured file. Skipping..')

        # Map the original names to the minified counterparts and the
        # minified ones to the JSNice renamings. Pairing is positional:
        # all three key lists are sorted by def_scope (x[1]) and matched
        # index-by-index, hence the length checks below.
        orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsOld = sorted(scopeAnalyst_clear.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])

        if(len(orderedVarsOld) != len(orderedVarsNew)):
            return (js_file_path, None,
                    "Old and New Name lists different length")
        if(len(orderedVarsOld) != len(orderedVarsN2p)):
            return (js_file_path, None,
                    "JsNice and Old Name lists different length")

        for i in range(0, len(orderedVarsOld)):
            name_old = orderedVarsOld[i][0]
            def_scope_old = scopeAnalyst_clear.name2defScope[orderedVarsOld[i]]

            name_new = orderedVarsNew[i][0]
            def_scope_new = scopeAnalyst.name2defScope[orderedVarsNew[i]]

            min_name_map[(name_new, def_scope_new)] = (name_old, def_scope_old)

            name_n2p = orderedVarsN2p[i][0]
            # NOTE(review): def_scope_n2p is looked up in the *minified*
            # scopeAnalyst via orderedVarsNew[i], not in n2p_scopeAnalyst via
            # orderedVarsN2p[i] — so it always equals def_scope_new. Possibly
            # intentional (keys are minified-scope based), but confirm.
            def_scope_n2p = scopeAnalyst.name2defScope[orderedVarsNew[i]]

            jsnice_name_map[(name_new, def_scope_new)] = (name_n2p,
                                                          def_scope_n2p)

        # Once we have the scopeAnalyst, iBuilder and token list for the
        # minified version, we can get the per-name feature metrics.
        vm = VariableMetrics(scopeAnalyst, iBuilder_ugly, lex_ugly.tokenList)
        variableKeySet = vm.getVariables()
        for variableKey in variableKeySet:
            name_features[variableKey] = vm.getNameMetrics(variableKey)

        (name_positions, \
         position_names, use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)

        # Try different renaming strategies (hash, etc)
        for r_strategy, proxy in proxies:

            if dbg:
                print '\n====================='
                print r_strategy
                print '=====================\n'

            # Rename input prior to translation
            preRen = PreRenamer()
            after_text = preRen.rename(r_strategy,
                                       iBuilder_ugly,
                                       scopeAnalyst)

            (ok, beautified_after_text, _err) = clear.web_run(after_text)
            if not ok:
                return (js_file_path, None, 'Beautifier fail')

            if(r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):
                # NOTE(review): the scope analysis here runs on after_text,
                # while everything below (file on disk, a_lexer/a_iBuilder)
                # uses beautified_after_text — if beautification changes
                # token positions the hash scopes may not match; confirm.
                try:
                    scopeAnalyst_hash = WebScopeAnalyst(after_text)
                except:
                    return (js_file_path, None, "ScopeAnalyst hash fail")

                # Map the hashed names to the minified counterparts
                # (same positional pairing scheme as above).
                orderedVarsMin = sorted(scopeAnalyst.name2defScope.keys(),
                                        key=lambda x: x[1])
                orderedVarsHash = sorted(scopeAnalyst_hash.name2defScope.keys(),
                                         key=lambda x: x[1])

                if(len(orderedVarsMin) != len(orderedVarsHash)):
                    return (js_file_path, None,
                            "Hash and Min lists different length")

                for i in range(0, len(orderedVarsHash)):
                    name_hash = orderedVarsHash[i][0]
                    def_scope_hash = scopeAnalyst_hash.name2defScope[orderedVarsHash[i]]

                    name_min = orderedVarsMin[i][0]
                    def_scope_min = scopeAnalyst.name2defScope[orderedVarsMin[i]]

                    hash_name_map[(name_hash, def_scope_hash)] = (name_min,
                                                                  def_scope_min)

            # Save renamed input to disk for future inspection
            with open(temp_files['%s' % (r_strategy)], 'w') as f:
                f.write(beautified_after_text)

            a_lexer = WebLexer(beautified_after_text)
            a_iBuilder = IndexBuilder(a_lexer.tokenList)
            a_scopeAnalyst = WebScopeAnalyst(beautified_after_text)

            # Moses models were trained without literals, so translate the
            # literal-stripped text.
            # lx = WebLexer(a_iBuilder.get_text())
            lx = WebLexer(a_iBuilder.get_text_wo_literals())

            # Translate renamed input
            md = WebMosesDecoder(proxy)
            (ok, translation, _err) = md.run(lx.collapsedText)
            if not ok:
                return (js_file_path, None, 'Moses translation fail')

            (a_name_positions,
             a_position_names,
             a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

            nc = []
            if translation is not None:

                # Parse moses output
                mp = MosesParser()

                if dbg:
                    print '\nr_strategy-----------', r_strategy
                name_candidates = mp.parse(translation,
                                           a_iBuilder,
                                           a_position_names)

                # name_candidates is a dictionary of dictionaries:
                # keys are (name, None) (if scopeAnalyst=None) or
                # (name, def_scope) tuples (otherwise);
                # values are suggested translations with the sets
                # of line numbers on which they appear.

                # Update name_candidates with some default values
                # (in this case the translation without any renaming)
                # if the translation is empty
                if r_strategy == RS.NONE:
                    # RS.NONE should always be first, by construction;
                    # later strategies reuse these *_default references.
                    name_candidates_default = name_candidates
                    scopeAnalyst_default = a_scopeAnalyst
                    iBuilder_default = a_iBuilder
                else:
                    # Fold the RS.NONE suggestions into this strategy's
                    # candidates: map each default key to its position in
                    # the default build, then to the corresponding renamed
                    # key in the current build.
                    for key_default, suggestions in name_candidates_default.iteritems():
                        # (name_default, def_scope_default) = key_default
                        pos_default = scopeAnalyst_default.nameDefScope2pos[key_default]
                        (lin, col) = iBuilder_default.revFlatMat[pos_default]
                        (line_num, line_idx) = iBuilder_default.revTokMap[(lin, col)]
                        (name, def_scope) = a_position_names[line_num][line_idx]
                        key = (name, def_scope)

                        for name_translation, lines in suggestions.iteritems():
                            name_candidates.setdefault(key, {})
                            name_candidates[key].setdefault(name_translation, set([]))
                            name_candidates[key][name_translation].update(lines)

                cc = ConsistencyController(debug_mode=True)
                ts = TranslationSummarizer()

                # An identifier may have been translated inconsistently
                # across different lines (Moses treats each line independently).
                # Try different strategies to resolve inconsistencies, if any
                for c_strategy in CS.all():
                    if dbg:
                        print '\nc_strategy----------', c_strategy
                    #assert(hash_name_map != {})

                    # Compute renaming map (x -> length, y -> width, ...)
                    # Note that x,y here are names after renaming
                    (temp_renaming_map, seen) = cc.computeRenaming(c_strategy,
                                                                   name_candidates,
                                                                   a_name_positions,
                                                                   a_use_scopes,
                                                                   a_iBuilder,
                                                                   lm_path,
                                                                   vm,
                                                                   hash_name_map)

                    # After computeRenaming we have the entropies cached
                    # (LMDrop strategy) plus the suggestion frequencies from
                    # name_candidates; fill in suggestion_features once per
                    # renaming strategy.
                    if(c_strategy == CS.LMDROP and r_strategy not in suggestion_features):
                        assert(cc.suggestion_cache != None)
                        suggestion_features[r_strategy] = {}
                        """
                        name_candidates: dict
                        name_candidates[(name, def_scope)][name_translation]
                        = set of line numbers in the translation
                        """
                        for variableKey, suggestionDictionary in name_candidates.iteritems():
                            for suggestionName, linesSuggested in suggestionDictionary.iteritems():
                                # Revert variableKey[0] from its hash to its
                                # original minified name so the feature key
                                # matches min_name_map / name_features.
                                if(r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):
                                    unhashedKey = hash_name_map[variableKey]
                                    suggestionKey = (unhashedKey[0], unhashedKey[1], suggestionName)
                                else:
                                    suggestionKey = (variableKey[0], variableKey[1], suggestionName)

                                entropyVals = cc.suggestion_cache.getEntropyStats(variableKey, suggestionName)
                                # eval_dbg only: entropy-error filter disabled
                                if(True):
                                #if(entropyVals != (ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR)):
                                    suggestionValue = [len(linesSuggested)] + \
                                        list(getSuggestionStats(suggestionName)) + \
                                        list(entropyVals)
                                    suggestion_features[r_strategy][suggestionKey] = suggestionValue

                    if dbg:
                        print '\ntemp_renaming_map-------------'
                        for (name, def_scope), renaming in temp_renaming_map.iteritems():
                            print (name, def_scope[-50:]), renaming

                    # Fall back on original names in input, if
                    # no translation was suggested
                    postRen = PostRenamer()
                    renaming_map = postRen.updateRenamingMap(a_name_positions,
                                                             position_names,
                                                             a_use_scopes,
                                                             temp_renaming_map,
                                                             seen,
                                                             r_strategy)

                    if dbg:
                        print '\nrenaming_map-------------'
                        for (name, def_scope), renaming in renaming_map.iteritems():
                            print (name, def_scope[-50:]), renaming, '(%s)' % temp_renaming_map[(name, def_scope)]

                    # Apply renaming map and save output for future inspection
                    renamed_text = postRen.applyRenaming(a_iBuilder,
                                                         a_name_positions,
                                                         renaming_map)
                    print '\nrenamed_text--------------'
                    print renamed_text
                    print
                    (ok, beautified_renamed_text, _err) = clear.web_run(renamed_text)
                    if not ok:
                        return (js_file_path, None, 'Beautifier fail')

                    with open(temp_files['%s_%s' % (r_strategy, c_strategy)], 'w') as f:
                        f.write(beautified_renamed_text)

                    # Save some stats about which names were renamed to what.
                    # This is what enables the comparison between the
                    # different methods.
                    r = [[c_strategy] + x
                         for x in ts.compute_summary_scoped(renaming_map,
                                                            name_candidates,
                                                            a_iBuilder,
                                                            a_scopeAnalyst)]
                    if not r:
                        return (js_file_path, None, 'Compute summary failed')
                    nc += r

            if nc:
                candidates += [[r_strategy] + x for x in nc]

        # Create the rows for the suggestion_model.csv, joining each
        # suggestion's features with the per-name features collected above.
        for r_strategy in RS.all():
            for suggestionKey, s_feat in suggestion_features[r_strategy].iteritems():
                variableKey = (suggestionKey[0], suggestionKey[1])
                original_name = min_name_map[variableKey][0]
                js_nice_name = jsnice_name_map[variableKey][0]
                # eval_dbg only: guard against keys missing from name_features
                if(variableKey in name_features):
                    n_feat = list(name_features[variableKey])
                    # Convert the def_scope to an equivalent, but smaller,
                    # easier to read key: (line_num, token_num)
                    newKey = scopeAnalyst.nameDefScope2pos[variableKey]
                    (keyLine, keyToken) = iBuilder_ugly.revFlatMat[newKey]
                    model_rows.append([original_name, r_strategy,
                                       suggestionKey[0], keyLine, keyToken,
                                       suggestionKey[2], js_nice_name]
                                      + n_feat + s_feat)

        return (js_file_path, 'OK', candidates, model_rows)
def processFile(l):
    """Run the JSNaughty deminification pipeline on one corpus entry.

    Newer variant of processFile (it shadows the earlier definition of the
    same name in this module). Differences from the earlier variant, visible
    in this body: the whole pipeline is wrapped in try/except Exception so
    any unexpected error is returned instead of raised; the input path is
    taken from l[0] and joined onto corpus_root; JSNice suggestions are
    merged into the Moses candidates (the deepcopy block below); the
    suggestion-model feature collection is disabled (commented out), so
    model_rows is always returned empty.

    Relies on module globals: RS, CS, output_path, corpus_root, proxies,
    lm_path and the project classes (WebLMPreprocessor, Beautifier, Uglifier,
    Aligner, WebLexer, IndexBuilder, WebScopeAnalyst, UnuglifyJS, PreRenamer,
    PostRenamer, MosesParser, WebMosesDecoder, ConsistencyController,
    TranslationSummarizer) plus copy.deepcopy.

    Parameters
    ----------
    l: sequence whose first element (l[0]) is the path of the .js file,
       relative to corpus_root.

    Returns
    -------
    On success: (js_file_path, 'OK', candidates, model_rows)
    On pipeline-stage failure: (js_file_path, None, <reason string>)
    On unexpected exception: (js_file_path, None, str(e), model_rows)
    Note the tuple length varies between these cases.
    """
    js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]

    # Per-stage output file names, later joined onto output_path:
    # one per renaming strategy, plus one per (renaming, consistency) pair.
    temp_files = {
        'orig': '%s.js' % base_name,
        'minified': '%s.u.js' % base_name,
        'n2p': '%s.n2p.js' % base_name
    }
    for r_strategy in RS.all():
        temp_files['%s' % (r_strategy)] = \
            '%s.%s.js' % (base_name, r_strategy)
        for c_strategy in CS.all():
            temp_files['%s_%s' % (r_strategy, c_strategy)] = \
                '%s.%s.%s.js' % (base_name, r_strategy, c_strategy)
    for k, v in temp_files.iteritems():
        temp_files[k] = os.path.join(output_path, v)

    candidates = []

    # Minified Name -> Original Name; keys and values are (name, def_scope)
    min_name_map = {}
    # Hashed Name -> Minified Name; keys and values are (name, def_scope)
    hash_name_map = {}
    # Minified Name -> JSNice name; keys and values are (name, def_scope)
    jsnice_name_map = {}
    # Output lines for the suggestion_model.csv (currently never filled in:
    # the feature-collection code below is commented out)
    model_rows = []

    try:
        js_text = open(os.path.join(corpus_root, js_file_path), 'r').read()

        # Strip comments, replace literals, etc
        try:
            prepro = WebLMPreprocessor(js_text)
            prepro_text = str(prepro)
        except:
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text)
        if not ok:
            return (js_file_path, None, 'Uglifier fail')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            (aligned_clear, aligned_minified) = aligner.web_align(
                WebLexer(tmp_beautified_text).tokenList,
                WebLexer(tmp_minified_text).tokenList)
        except:
            return (js_file_path, None, 'Aligner fail')

        # Pass through beautifier again to fix layout after alignment
        (ok, beautified_text, _err) = clear.web_run(aligned_clear)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        (ok, minified_text, _err) = clear.web_run(aligned_minified)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Num tokens before vs after
        try:
            lex_clear = WebLexer(beautified_text)
            tok_clear = lex_clear.tokenList
            lex_ugly = WebLexer(minified_text)
            tok_ugly = lex_ugly.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            return (js_file_path, None, 'Num tokens mismatch')
        if beautified_text == minified_text:
            return (js_file_path, None, 'Not minified')

        # The clear-file IndexBuilder (and the structural check that used it
        # further down) is disabled in this variant.
        #try:
        #    iBuilder_clear = IndexBuilder(lex_clear.tokenList)
        #except:
        #    return (js_file_path, None, "IndexBuilder fail on original file.")
        try:
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        # Persist the aligned clear/minified pair for later inspection.
        with open(temp_files['orig'], 'w') as f:
            f.write(beautified_text)
        with open(temp_files['minified'], 'w') as f:
            f.write(minified_text)

        # try:
        #     orig_lexer = WebLexer(beautified_text)
        #     orig_iBuilder = IndexBuilder(orig_lexer.tokenList)
        #     orig_scopeAnalyst = WebScopeAnalyst(beautified_text)
        # except:
        #     return (js_file_path, None, 'IndexBuilder/Scoper fail on original')

        ########################
        # Nice2Predict
        ########################

        # BV: Next block left out until I figure out the pipe issue
        # BV: Update: I couldn't pipe input to N2P. TODO: FIX
        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, n2p_text, _err) = unuglifyJS.run(temp_files['minified'])
        if not ok:
            return (js_file_path, None, 'Nice2Predict fail')

        (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        with open(temp_files['n2p'], 'w') as f:
            f.write(n2p_text_beautified)

        try:
            n2p_lexer = WebLexer(n2p_text_beautified)
            n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
            n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
        except:
            return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail')

        # Save some translation stats to compare different methods
        ts = TranslationSummarizer()
        candidates += [['n2p', ''] + x
                       for x in ts.compute_summary_unscoped(
                           n2p_iBuilder, n2p_scopeAnalyst)]

        ################################################
        # All other JSNaughty variants
        ################################################

        try:
            scopeAnalyst = WebScopeAnalyst(minified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst minified fail')
        try:
            scopeAnalyst_clear = WebScopeAnalyst(beautified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst clear fail')

        # Structural sanity check against JSNice output is disabled here.
        #if(not check(iBuilder_clear, scopeAnalyst_clear, n2p_iBuilder, n2p_scopeAnalyst)):
        #    return (js_file_path, None, 'JsNice restructured file. Skipping..')

        # Map the original names to the minified counterparts and the
        # minified ones to the JSNice renamings. Pairing is positional:
        # all three key lists are sorted by def_scope (x[1]) and matched
        # index-by-index, hence the length checks below.
        orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsOld = sorted(scopeAnalyst_clear.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])

        if (len(orderedVarsOld) != len(orderedVarsNew)):
            return (js_file_path, None,
                    "Old and New Name lists different length")
        if (len(orderedVarsOld) != len(orderedVarsN2p)):
            return (js_file_path, None,
                    "JsNice and Old Name lists different length")

        for i in range(0, len(orderedVarsOld)):
            name_old = orderedVarsOld[i][0]
            def_scope_old = scopeAnalyst_clear.name2defScope[orderedVarsOld[i]]

            name_new = orderedVarsNew[i][0]
            def_scope_new = scopeAnalyst.name2defScope[orderedVarsNew[i]]

            min_name_map[(name_new, def_scope_new)] = (name_old, def_scope_old)

            name_n2p = orderedVarsN2p[i][0]
            # NOTE(review): def_scope_n2p is looked up in the *minified*
            # scopeAnalyst via orderedVarsNew[i], not in n2p_scopeAnalyst via
            # orderedVarsN2p[i] — so it always equals def_scope_new. Possibly
            # intentional (keys are minified-scope based), but confirm.
            def_scope_n2p = scopeAnalyst.name2defScope[orderedVarsNew[i]]

            jsnice_name_map[(name_new, def_scope_new)] = (name_n2p,
                                                          def_scope_n2p)

        # Once we have the scopeAnalyst, iBuilder, and tokenlist for the
        # minified version, we could compute name metrics — disabled here.
        # vm = VariableMetrics(scopeAnalyst, iBuilder_ugly, lex_ugly.tokenList)
        # variableKeySet = vm.getVariables()
        # for variableKey in variableKeySet:
        #     name_features[variableKey] = vm.getNameMetrics(variableKey)

        (_name_positions, \
         position_names, _use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)

        # Try different renaming strategies (hash, etc)
        for r_strategy, proxy in proxies:

            # Rename input prior to translation
            preRen = PreRenamer()
            after_text = preRen.rename(r_strategy,
                                       iBuilder_ugly,
                                       scopeAnalyst)

            (ok, beautified_after_text, _err) = clear.web_run(after_text)
            if not ok:
                return (js_file_path, None, 'Beautifier fail')

            # Save renamed input to disk for future inspection
            with open(temp_files['%s' % (r_strategy)], 'w') as f:
                f.write(beautified_after_text)

            a_lexer = WebLexer(beautified_after_text)
            a_iBuilder = IndexBuilder(a_lexer.tokenList)
            a_scopeAnalyst = WebScopeAnalyst(beautified_after_text)

            if (r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):
                # Earlier code re-analyzed after_text here; a_scopeAnalyst is
                # now used instead because the text actually processed below
                # is the beautified one:
                # try:
                #     scopeAnalyst_hash = WebScopeAnalyst(beautified_after_text)
                # except:
                #     return (js_file_path, None, "ScopeAnalyst hash fail")

                # Map the hashed names to the minified counterparts
                # (same positional pairing scheme as above).
                orderedVarsMin = sorted(scopeAnalyst.name2defScope.keys(),
                                        key=lambda x: x[1])
                orderedVarsHash = sorted(a_scopeAnalyst.name2defScope.keys(),
                                         key=lambda x: x[1])

                if (len(orderedVarsMin) != len(orderedVarsHash)):
                    return (js_file_path, None,
                            "Hash and Min lists different length")

                for i in range(0, len(orderedVarsHash)):
                    name_hash = orderedVarsHash[i][0]
                    def_scope_hash = a_scopeAnalyst.name2defScope[
                        orderedVarsHash[i]]

                    name_min = orderedVarsMin[i][0]
                    def_scope_min = scopeAnalyst.name2defScope[
                        orderedVarsMin[i]]

                    hash_name_map[(name_hash, def_scope_hash)] = (name_min,
                                                                  def_scope_min)

            # We can switch this back once we train models on a corpus
            # with literals
            # lx = WebLexer(a_iBuilder.get_text())
            lx = WebLexer(a_iBuilder.get_text_wo_literals())

            # Translate renamed input
            md = WebMosesDecoder(proxy)
            (ok, translation, _err) = md.run(lx.collapsedText)
            if not ok:
                return (js_file_path, None, 'Moses translation fail')

            (a_name_positions,
             a_position_names,
             a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

            nc = []
            if translation is not None:

                # Parse moses output
                mp = MosesParser()
                name_candidates = mp.parse(translation,
                                           a_iBuilder,
                                           a_position_names)

                # name_candidates is a dictionary of dictionaries:
                # keys are (name, def_scope) tuples;
                # values are suggested translations with the sets
                # of line numbers on which they appear.

                # Update name_candidates with some default values
                # (in this case the translation without any renaming)
                # if the translation is empty
                if r_strategy == RS.NONE:
                    # RS.NONE should always be first, by construction;
                    # later strategies reuse these *_default references.
                    name_candidates_default = name_candidates
                    scopeAnalyst_default = a_scopeAnalyst
                    iBuilder_default = a_iBuilder
                else:
                    # Fold the RS.NONE suggestions into this strategy's
                    # candidates: map each default key to its position in
                    # the default build, then to the corresponding renamed
                    # key in the current build.
                    for key_default, suggestions in name_candidates_default.iteritems(
                    ):
                        # (name_default, def_scope_default) = key_default
                        pos_default = scopeAnalyst_default.nameDefScope2pos[
                            key_default]
                        (lin, col) = iBuilder_default.revFlatMat[pos_default]
                        (line_num, line_idx) = iBuilder_default.revTokMap[(lin,
                                                                           col)]
                        (name, def_scope) = a_position_names[line_num][line_idx]
                        key = (name, def_scope)

                        for name_translation, lines in suggestions.iteritems():
                            name_candidates.setdefault(key, {})
                            name_candidates[key].setdefault(
                                name_translation, set([]))
                            name_candidates[key][name_translation].update(
                                lines)

                # **** BV: This might be all we need to combine Naughty & Nice:
                # add each variable's JSNice suggestion as one more candidate
                # (iterating over a deepcopy since name_candidates is mutated).
                name_candidates_copy = deepcopy(name_candidates)
                for key, suggestions in name_candidates_copy.iteritems():
                    if r_strategy == RS.NONE:
                        (name_n2p, def_scope_n2p) = jsnice_name_map[key]
                    else:
                        # For hashed strategies, translate the hashed key
                        # back to the minified key first.
                        (name_n2p, def_scope_n2p) = jsnice_name_map[hash_name_map.get(
                            key, key)]
                    for name_translation, lines in suggestions.iteritems():
                        name_candidates.setdefault(key, {})
                        name_candidates[key].setdefault(name_n2p, set([]))
                        name_candidates[key][name_n2p].update(lines)

                cc = ConsistencyController(debug_mode=False)
                ts = TranslationSummarizer()

                # An identifier may have been translated inconsistently
                # across different lines (Moses treats each line independently).
                # Try different strategies to resolve inconsistencies, if any
                for c_strategy in CS.all():

                    # Compute renaming map (x -> length, y -> width, ...)
                    # Note that x,y here are names after (hash) renaming.
                    # (The VariableMetrics argument is {} here since name
                    # metrics are disabled in this variant.)
                    (temp_renaming_map, seen) = cc.computeRenaming(
                        c_strategy, name_candidates, a_name_positions,
                        a_use_scopes, a_iBuilder, lm_path, {}, hash_name_map)

                    # Suggestion-model feature collection (LMDrop entropies,
                    # suggestion stats -> suggestion_features) is disabled in
                    # this variant; see the earlier processFile for the live
                    # version of that code.

                    # Fall back on original names in input, if
                    # no translation was suggested
                    postRen = PostRenamer()
                    renaming_map = postRen.updateRenamingMap(
                        a_name_positions, position_names, a_use_scopes,
                        temp_renaming_map, seen, r_strategy)

                    # Apply renaming map and save output for future inspection
                    renamed_text = postRen.applyRenaming(
                        a_iBuilder, a_name_positions, renaming_map)
                    (ok, beautified_renamed_text,
                     _err) = clear.web_run(renamed_text)
                    if not ok:
                        return (js_file_path, None, 'Beautifier fail')

                    with open(temp_files['%s_%s' % (r_strategy, c_strategy)],
                              'w') as f:
                        f.write(beautified_renamed_text)

                    # Save some stats about which names were renamed to what.
                    # This is what enables the comparison between the
                    # different methods.
                    r = [[c_strategy] + x
                         for x in ts.compute_summary_scoped(
                             renaming_map, name_candidates, a_iBuilder,
                             a_scopeAnalyst)]
                    if not r:
                        return (js_file_path, None, 'Compute summary failed')
                    nc += r

            if nc:
                candidates += [[r_strategy] + x for x in nc]

        # Creating the rows for the suggestion_model.csv is disabled along
        # with the feature collection above, so model_rows stays empty.
        # for r_strategy in RS.all():
        #     for suggestionKey, s_feat in suggestion_features[r_strategy].iteritems():
        #         variableKey = (suggestionKey[0], suggestionKey[1])
        #         original_name = min_name_map[variableKey][0]
        #         js_nice_name = jsnice_name_map[variableKey][0]
        #         n_feat = list(name_features[variableKey])
        #         newKey = scopeAnalyst.nameDefScope2pos[variableKey]
        #         (keyLine, keyToken) = iBuilder_ugly.revFlatMat[newKey]
        #         model_rows.append([original_name, r_strategy, suggestionKey[0],
        #                            keyLine, keyToken, suggestionKey[2],
        #                            js_nice_name] + n_feat + s_feat)

        return (js_file_path, 'OK', candidates, model_rows)

    except Exception, e:
        # Catch-all boundary: report the error text (newlines stripped so it
        # fits one CSV/log field) instead of raising.
        return (js_file_path, None, str(e).replace("\n", ""), model_rows)
def getMosesTranslation(proxy, r_strategy, RS, a_beautifier,
                        iBuilder_ugly, scopeAnalyst_ugly,
                        debug_mode=False):
    """
    Run one renaming strategy through the Moses translation pipeline.

    Performs the hashing / no-hashing preparation of the minified file for
    the renaming strategy specified by r_strategy, then calls the
    appropriate moses server.  Designed so that multiple renaming
    strategies can be run in a modular, parallelizable manner.

    Parameters
    ----------
    proxy: Pointer to the port the moses server for this renaming
        strategy is listening on.
    r_strategy: One of the renaming strategies from RenamingStrategies.
    RS: A RenamingStrategies object (provides HASH_ONE / HASH_TWO).
    a_beautifier: Beautifier used to cleanly format the renamed text.
    iBuilder_ugly: IndexBuilder for the minified file.
    scopeAnalyst_ugly: ScopeAnalyst for the minified file.
    debug_mode: Print debug information? (defaults to False)

    Returns
    -------
    (status, error, translation, name_candidates, a_iBuilder,
     a_scopeAnalyst, a_name_positions, a_position_names, a_use_scopes,
     hash_name_map, rn_time, m_time, lex_time, post_start)

    status: True on success; on failure the remaining fields are
        empty/null placeholders.
    error: Reason for failure, or "" on success.
    translation: Raw Moses output.
    name_candidates: Moses suggestions for this renaming ({} if Moses
        returned no translation).
    a_iBuilder, a_scopeAnalyst: IndexBuilder / ScopeAnalyst for the
        renamed text.
    a_name_positions, a_position_names, a_use_scopes: Additional
        position/scope tracking info from prepHelpers.
    hash_name_map: Map from hashed names back to the original minified
        names ((name, def_scope) -> (name, def_scope)); empty for
        non-hashing strategies.
    rn_time, m_time, lex_time: Durations of the renaming, Moses
        translation, and lexing steps.
    post_start: Start time for postprocessing of the Moses output.
    """
    rn_start = time.time()

    # We need both the base text and the renamed (possibly hashed) text.
    preRen = PreRenamer()
    if debug_mode:
        print("Tokens-------------------")
        print(iBuilder_ugly.tokens)
        print("Tokens-------------------")

    # We always need the non-hashed names as a fallback.
    try:
        after_text = preRen.rename(r_strategy, iBuilder_ugly,
                                   scopeAnalyst_ugly)
    except Exception:
        return (False, "Renaming failed for " + str(r_strategy), "",
                {}, None, None, {}, {}, {}, {}, 0, 0, 0, 0)

    (ok, beautified_after_text, _err) = a_beautifier.web_run(after_text)
    if not ok:
        return (False,
                "Beautifier failed on the renamed text for " + str(r_strategy),
                "", {}, None, None, {}, {}, {}, {}, 0, 0, 0, 0)

    # Align renamed and original files, in case the beautifier
    # line-wrapped the extended lines.
    try:
        aligner = Aligner()
        (aligned_after, _aligned_before) = aligner.web_align(
            WebLexer(beautified_after_text).tokenList,
            WebLexer(iBuilder_ugly.get_text()).tokenList)
    except Exception:
        return (False,
                "Aligner failed on the renamed text for " + str(r_strategy),
                "", {}, None, None, {}, {}, {}, {}, 0, 0, 0, 0)

    a_lexer = WebLexer(aligned_after)
    a_iBuilder = IndexBuilder(a_lexer.tokenList)
    a_scopeAnalyst = WebScopeAnalyst(aligned_after)

    # For hashing strategies, recover the minified name behind each
    # hashed name by pairing the two variable lists sorted by def_scope.
    # NOTE(review): this assumes both sorts produce the same variable
    # order; the length check below guards the pairing.
    hash_name_map = {}
    if r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO:
        orderedVarsMin = sorted(scopeAnalyst_ugly.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsHash = sorted(a_scopeAnalyst.name2defScope.keys(),
                                 key=lambda x: x[1])

        if len(orderedVarsMin) != len(orderedVarsHash):
            return (False, "Mismatch between minified and hashed names.",
                    "", {}, a_iBuilder, a_scopeAnalyst,
                    {}, {}, {}, {}, 0, 0, 0, 0)

        for var_hash, var_min in zip(orderedVarsHash, orderedVarsMin):
            def_scope_hash = a_scopeAnalyst.name2defScope[var_hash]
            def_scope_min = scopeAnalyst_ugly.name2defScope[var_min]
            hash_name_map[(var_hash[0], def_scope_hash)] = \
                (var_min[0], def_scope_min)

    if debug_mode:
        print("HASH NAME MAP LEN: " + str(len(hash_name_map)))

    # We can switch this back once we train models on a corpus with
    # literals:
    #   lx = WebLexer(a_iBuilder.get_text())
    lx = WebLexer(a_iBuilder.get_text_wo_literals())

    # Performance measures: wrap up the preprocessing/renaming phases.
    rn_time = time.time() - rn_start

    # Translate renamed input through the Moses server, in segments.
    m_start = time.time()
    (ok, translation, _err) = segmentedTranslation(lx,
                                                   SEGMENTED_TRANS_SIZE,
                                                   proxy, debug_mode)
    if not ok:
        return (False, "Moses server failed for " + str(r_strategy),
                translation, {}, a_iBuilder, a_scopeAnalyst,
                {}, {}, {}, hash_name_map, 0, 0, 0, 0)
    m_time = time.time() - m_start

    post_start = time.time()

    (a_name_positions,
     a_position_names,
     a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

    # Parse moses output.  Initialize to {} so the return below is
    # well-defined even when Moses produced no translation (the original
    # code raised NameError in that case).
    name_candidates = {}
    if translation is not None:
        mp = MosesParser()
        if debug_mode:
            print(translation)
        name_candidates = mp.parse(translation,
                                   a_iBuilder,
                                   a_position_names)

    lex_time = lx.build_time + a_lexer.build_time

    return (True, "", translation, name_candidates,
            a_iBuilder, a_scopeAnalyst,
            a_name_positions, a_position_names, a_use_scopes,
            hash_name_map, rn_time, m_time, lex_time, post_start)