def get_text_on_lines_wo_literals(self, lines):
    """Render only the requested lines of the stored code, masking literals.

    Parameters
    ----------
    lines : set of int
        Line numbers (0-based) of the text we wish to return; all other
        lines are omitted.

    Returns
    -------
    str
        The selected lines joined by newlines, with string literals
        rendered as TOKEN_LITERAL_STRING and numeric literals as
        TOKEN_LITERAL_NUMBER.
    """
    rendered = []
    for idx, token_line in enumerate(self.tokens):
        if idx not in lines:
            # Skip any line that was not requested.
            continue
        words = [
            'TOKEN_LITERAL_STRING' if is_token_subtype(tok_type, String)
            else 'TOKEN_LITERAL_NUMBER' if is_token_subtype(tok_type, Number)
            else text
            for (tok_type, text) in token_line
        ]
        rendered.append(' '.join(words))
    return '\n'.join(rendered)
def handleUnicodeEscape(tokens, tokenType, technique="REMOVE"):
    '''Strip or replace unicode escape characters inside matching tokens.

    The subprocess module struggles with unicode escape characters inside
    string literals, so tokens whose type matches `tokenType` have them
    removed (via unidecode) or replaced with spaces.

    technique: "REMOVE" or "REPLACE"; any other value behaves as "REMOVE".
    '''
    replace_mode = (technique == "REPLACE")  # Not fully implemented yet.
    result = []
    for tok in tokens:
        tok_type, text = tok[0], tok[1]
        if is_token_subtype(tok_type, tokenType):
            if replace_mode:
                # Replace non-ASCII runs with a single space.
                tok = (tok_type,
                       re.sub(r'[^\x00-\x7F]+', ' ',
                              text.decode("unicode_escape")))
            else:
                tok = (tok_type, unidecode(text))
        result.append(tok)
    return result
def process_config(l_num, line):
    # Parse one source line looking for assignments of the form
    # $config['a']['b'] = <value>; and record the value into the
    # module-level `config` dict.  `l_num` is accepted but unused here.
    # NOTE(review): indentation reconstructed from mangled formatting.
    context = {'is_config': False,
               'config_group': [],
               'group_ended': False,
               'config_value': []}
    for token, text in inline_lexer.get_tokens(line):
        is_comment = is_token_subtype(token, Token.Comment)
        is_string = is_token_subtype(token, Token.Literal.String)
        if is_comment:
            # A comment carries no configuration; bail out entirely.
            return
        if context['is_config'] and token is Token.Punctuation and ';' in text:
            # Statement terminator ends the assignment.
            context['is_config'] = False
        if context['is_config'] and not context['group_ended'] and is_string:
            # String subscripts before '=' name the config group
            # (surrounding quotes stripped).
            context['config_group'].append(text[1:-1])
        if context['is_config'] and context['group_ended'] and token is not Token.Text:
            # Everything after '=' (except whitespace) is part of the value.
            context['config_value'].append((token, text))
        if context['is_config'] and token is Token.Operator and text == '=':
            context['group_ended'] = True
        if token is Token.Name.Variable and text == '$config':
            context['is_config'] = True
        # print token, (text,)
    if context['config_group']:
        # Convert the collected value tokens into Python values.
        value = []
        for token, val in context['config_value']:
            if token is Token.Literal.Number.Integer:
                value.append(int(val))
            elif token is Token.Literal.Number.Float:
                value.append(float(val))
            elif is_token_subtype(token, Token.Literal.String):
                value.append(val[1:-1])
            elif token is Token.Keyword:
                value.append({'true': True, 'false': False, 'null': None}.get(val))
            # else:
            #     print token, val
        # Walk down the nested config dict and store the value list
        # under the innermost group key.
        group = config
        for g in context['config_group'][:-1]:
            group = group[g]
        group[context['config_group'][-1]] = value
def hash_line(line):
    """Hash placeholder tokens in a line of (token_type, token_name) pairs.

    Each placeholder's hash mixes the names to its left (exclusive) with
    the names from its own position onward (inclusive), so position is
    encoded along with the contents of the rest of the line.  Placeholder
    names themselves appear as "@@PLACEHOLDER" in the mixed-in context.
    """
    masked = []
    for tok_type, tok_name in line:
        if is_token_subtype(tok_type, Token.Placeholder):
            masked.append("@@PLACEHOLDER")
        else:
            masked.append(tok_name)

    out = []
    for pos, (tok_type, tok_name) in enumerate(line):
        if not is_token_subtype(tok_type, Token.Placeholder):
            out.append((tok_type, tok_name))
            continue
        digest = sha256()
        digest.update(str(masked[:pos]).encode("utf-8"))
        # Fold the running digest in before the right-hand context
        # (mirrors the original hashing scheme exactly).
        digest.update(digest.digest())
        digest.update(str(masked[pos:]).encode("utf-8"))
        out.append((tok_type, digest.hexdigest()))
    return out
def test_functions(self):
    """Exercise is_token_subtype and string_to_tokentype."""
    self.assertTrue(token.is_token_subtype(token.String, token.String))
    self.assertTrue(token.is_token_subtype(token.String, token.Literal))
    self.assertFalse(token.is_token_subtype(token.Literal, token.String))
    self.assertIs(token.string_to_tokentype(token.String), token.String)
    self.assertIs(token.string_to_tokentype(""), token.Token)
    self.assertIs(token.string_to_tokentype("String"), token.String)
def test_functions(self):
    """Exercise is_token_subtype and string_to_tokentype.

    Uses assertTrue/assertFalse instead of the deprecated ``assert_`` and
    ``failIf`` aliases, which were removed in Python 3.12.
    """
    self.assertTrue(token.is_token_subtype(token.String, token.String))
    self.assertTrue(token.is_token_subtype(token.String, token.Literal))
    self.assertFalse(token.is_token_subtype(token.Literal, token.String))
    self.assertTrue(token.string_to_tokentype(token.String) is token.String)
    self.assertTrue(token.string_to_tokentype('') is token.Token)
    self.assertTrue(token.string_to_tokentype('String') is token.String)
def test_functions():
    """Check token subtype relations and name-to-type conversion."""
    assert token.is_token_subtype(token.String, token.String)
    assert token.is_token_subtype(token.String, token.Literal)
    assert not token.is_token_subtype(token.Literal, token.String)
    # string_to_tokentype must return the singleton token types.
    for name, expected in ((token.String, token.String),
                           ('', token.Token),
                           ('String', token.String)):
        assert token.string_to_tokentype(name) is expected
def test_functions(self):
    """Verify subtype checks and string-to-tokentype conversion."""
    # Subtype relation is reflexive and follows the Literal hierarchy,
    # but is not symmetric.
    self.assertTrue(token.is_token_subtype(token.String, token.String))
    self.assertTrue(token.is_token_subtype(token.String, token.Literal))
    self.assertFalse(token.is_token_subtype(token.Literal, token.String))
    # Conversion returns identical singleton objects.
    self.assertTrue(token.string_to_tokentype(token.String) is token.String)
    for name, expected in (('', token.Token), ('String', token.String)):
        self.assertTrue(token.string_to_tokentype(name) is expected)
def interpret_host(cls, host_node, symbol_table):
    """Resolve a host AST node to a concrete value.

    Variable nodes are looked up in ``symbol_table``; IPv4 literal nodes
    evaluate to their own value.  Any other token type yields None.
    """
    assert host_node
    assert isinstance(host_node, AST_T_Node)
    _pos, node_type, node_value = host_node.data
    if token.is_token_subtype(node_type, TT_VARIABLE):
        return symbol_table[node_value]
    if token.is_token_subtype(node_type, TT_IPv4):
        return node_value
def traversal(scopeAnalyst, iBuilder, context, condition):
    """Collect context tokens for each non-global name on lines where
    `condition` holds, accumulating into `context` in place.

    scopeAnalyst -- provides name2defScope/name2pth maps and isGlobal
    iBuilder     -- indexed token stream (tokens, tokMap, flatMap)
    context      -- dict updated in place: (name, def_scope) -> [tokens]
    condition    -- predicate(pth, scopeAnalyst, token, def_scope, seen)
    Returns the updated `context` dict.

    NOTE(review): indentation reconstructed from mangled formatting —
    verify nesting of the `seen` update against the original project.
    """
    seen = {}
    print("name2defScope---------------------------------------------------")
    print(scopeAnalyst.name2defScope)
    for line_idx, line in enumerate(iBuilder.tokens):
        print("Traversing: " + str(line_idx) + " ----- " + str(line))
        for token_idx, (token_type, token) in enumerate(line):
            # Translate (line, index-in-line) to a flat character position.
            (l, c) = iBuilder.tokMap[(line_idx, token_idx)]
            pos = iBuilder.flatMap[(l, c)]
            # if(True):
            try:
                if (is_token_subtype(token_type, Token.Name)):
                    print("NAME!!!!!!" + str(token))
                def_scope = scopeAnalyst.name2defScope[(token, pos)]
                # use_scope = scopeAnalyst.name2useScope[(token, pos)]
                pth = scopeAnalyst.name2pth[(token, pos)]
            except KeyError:
                # Name not tracked by the scope analysis; skip it.
                print("KEY ERROR! " + str(token_idx) + " -- " +
                      str(token_type) + " -- " + str(token))
                continue
            if not isValidContextToken((token_type, token)):
                continue
            if scopeAnalyst.isGlobal.get((token, pos), True):
                # Globals are skipped; unknown names default to global.
                continue
            context_tokens = []
            # If token is defined on the current line,
            # count this line towards token's context.
            if condition(pth, scopeAnalyst, token, def_scope, seen):
                for tidx, (tt, t) in enumerate(line):
                    (tl, tc) = iBuilder.tokMap[(line_idx, tidx)]
                    p = iBuilder.flatMap[(tl, tc)]
                    # Context = globals and non-name tokens on the line.
                    if scopeAnalyst.isGlobal.get((t, p), True) or \
                            not is_token_subtype(tt, Token.Name):
                        context_tokens.append(t)
                    # Mark the occurrence of the target token itself.
                    if t == token and p == pos and \
                            not scopeAnalyst.isGlobal.get((t, p), True):
                        context_tokens.append('#')
                seen[(token, def_scope)] = True
            context.setdefault((token, def_scope), [])
            context[(token, def_scope)] += context_tokens
    return context
class PreRenamer:
    """Helper that decides which tokens are usable as renaming context."""

    def __init__(self):
        self.RS = RenamingStrategies()
        # self.simple_direct_map = {}
        # self.simple_inverse_map = {}

    def __isValidContextToken(self, token_pair):
        """Return False for string/number literals, True otherwise.

        `token_pair` is a (token_type, token) tuple.  The original used
        Python 2 tuple parameter unpacking (`def f(self, (a, b))`), which
        is a SyntaxError in Python 3; the tuple is unpacked in the body
        instead, keeping the call signature unchanged.
        """
        token_type, _token = token_pair
        if is_token_subtype(token_type, String) or \
                is_token_subtype(token_type, Number):
            return False
        return True
def strip_literals(self, iBuilder):
    """Return iBuilder's token text with literal values masked.

    String literals become 'TOKEN_LITERAL_STRING' and numeric literals
    become 'TOKEN_LITERAL_NUMBER'; all other token text is kept as-is.
    Output is a list of lines, each a list of token texts.
    """
    def mask(token_type, token):
        # Map a single token to its masked text.
        if is_token_subtype(token_type, String):
            return 'TOKEN_LITERAL_STRING'
        if is_token_subtype(token_type, Number):
            return 'TOKEN_LITERAL_NUMBER'
        return token

    return [[mask(tok_type, tok) for (tok_type, tok) in line]
            for line in iBuilder.tokens]
def tokensExceptTokenType(tokens, tokenType, ignoreSubtypes=False):
    """Return `tokens` with every token of `tokenType` filtered out.

    @author: Naji Dmeiri
    :param tokens:         A list of `Token` objects as defined in `pygments.token`
    :param tokenType:      A `TokenType` object as defined in `pygments.token`
    :param ignoreSubtypes: When set to True, the returned list will include
                           subtypes of `tokenType`; default is `False`.
    :returns: An iterable of tuples that each hold information about a token.

    A removed single-line comment that ended with a newline is replaced
    by a bare newline text token, preserving line structure.
    """
    if tokenType not in STANDARD_TYPES:
        raise ValueError("%s is not a standard Pygments token type." % tokenType)
    kept = []
    for tok in tokens:
        if ignoreSubtypes:
            matches = tok[0] == tokenType
        else:
            matches = is_token_subtype(tok[0], tokenType)
        if not matches:
            kept.append(tok)
        elif tok[0] == Comment.Single and tok[1].endswith('\n'):
            # Keep the line break the removed comment carried.
            kept.append((Token.Text, u'\n'))
    return kept
def processTokenList(self):
    """Helper: scan self.tokens and record, per 0-based line number, where
    literal tokens, `for` loops, and `while` loops occur.

    Populates self.literalsOnLines, self.forOnLines and self.whileOnLines.
    """
    line_num = 0
    for token_type, token in self.tokens:
        if token_type == Token.Text and '\n' in token:
            # Newline-bearing text tokens advance the line counter.
            line_num += 1
            continue
        if is_token_subtype(token_type, Token.Literal):
            self.literalsOnLines.add(line_num)
        if token_type == Token.Keyword:
            keyword = token.strip()
            if keyword == u'for':
                self.forOnLines.add(line_num)
            elif keyword == u'while':
                self.whileOnLines.add(line_num)
def get_text_wo_literals(self):
    """Return the stored code as text with literal values masked.

    String literals render as TOKEN_LITERAL_STRING and numeric literals
    as TOKEN_LITERAL_NUMBER; tokens on a line are joined with single
    spaces and lines with newlines.
    """
    def _mask(tok_type, text):
        if is_token_subtype(tok_type, String):
            return 'TOKEN_LITERAL_STRING'
        if is_token_subtype(tok_type, Number):
            return 'TOKEN_LITERAL_NUMBER'
        return text

    return '\n'.join(
        ' '.join(_mask(tok_type, text) for (tok_type, text) in token_line)
        for token_line in self.tokens
    )
def test_highlight_block():
    """Tokenize a small Python snippet via highlight_block and compare the
    produced token stream against a hand-built reference, token by token.

    Both the rendered text and the token type are checked; types only need
    to be a subtype of the reference type.

    NOTE(review): whitespace inside the snippet literal and the reference
    Text tokens was reconstructed from mangled formatting (continuation
    lines assumed to align at 15 + 4 spaces) — verify against the original.
    """
    code = ("""def sandwich(bread, cheese=True):
                   result = []
                   result.append(bread.slice())
                   if cheese:
                       result.append('cheese')
                   return result""")
    indent = 15 * ' '
    result = highlight_block('python', code, None)
    reference = \
        [Token('def', Keyword),
         Token(' ', Text),
         Token('sandwich', Name.Function),
         Token('(', Punctuation),
         Token('bread', Name),
         Token(',', Punctuation),
         Token(' ', Text),
         Token('cheese', Name),
         Token('=', Operator),
         Token('True', Name.Builtin.Pseudo),
         Token('):', Punctuation),
         Token('\n' + indent + '    ', Text),
         Token('result', Name),
         Token(' ', Text),
         Token('=', Operator),
         Token(' ', Text),
         Token('[]', Punctuation),
         Token('\n' + indent + '    ', Text),
         Token('result', Name),
         Token('.', Operator),
         Token('append', Name),
         Token('(', Punctuation),
         Token('bread', Name),
         Token('.', Operator),
         Token('slice', Name),
         Token('())', Punctuation),
         Token('\n' + indent + '    ', Text),
         Token('if', Keyword),
         Token(' ', Text),
         Token('cheese', Name),
         Token(':', Punctuation),
         Token('\n' + indent + '    ' + '    ', Text),
         Token('result', Name),
         Token('.', Operator),
         Token('append', Name),
         Token('(', Punctuation),
         Token("'cheese'", Literal.String),
         Token(')', Punctuation),
         Token('\n' + indent + '    ', Text),
         Token('return', Keyword),
         Token(' ', Text),
         Token('result', Name),
         Token('\n', Text)]
    for res, ref in zip(result, reference):
        assert res.text(None) == ref.text(None)
        assert is_token_subtype(res.type, ref.type)
def formatTokens(tokenList):
    """Group a flat token stream into lines.

    Non-text tokens are stripped and appended to the current line; a text
    token containing a newline closes the line.  Note: a trailing line
    that is never followed by a newline token is not emitted (matches the
    original behaviour).
    """
    lines = []
    current = []
    for tok_type, text in tokenList:
        if not is_token_subtype(tok_type, Token.Text):
            current.append((tok_type, text.strip()))
        elif '\n' in text:
            lines.append(current)
            current = []
    return lines
def _match_N(self, s_top_N, _token):
    """Pick the production for nonterminal `s_top_N` given lookahead `_token`.

    Scans the LL prediction row for the nonterminal and returns the first
    rule whose terminal matches the lookahead's token type (`_token[1]`),
    or None when no entry applies.
    """
    for terminal, rule in self.predict[s_top_N].items():
        if token.is_token_subtype(_token[1], terminal):
            return rule
    return None
def prepareHelpers(iBuilder, scopeAnalyst=None):
    """Collect name locations in two complementary formats.

    Returns (name_positions, position_names) where
    name_positions maps (name, def_scope) -> [(line_num, line_idx), ...]
    and position_names maps line_num -> {line_idx: (name, def_scope)}.
    When no scopeAnalyst is given, def_scope is None for every name.
    """
    # Collect names and their locations in various formats
    # that will come in handy later:

    # Which locations [(line number, index within line)] does
    # a variable name appear at?
    name_positions = {}

    # Which variable name is at a location specified by
    # [line number][index within line]?
    position_names = {}

    for line_num, line in enumerate(iBuilder.tokens):
        position_names.setdefault(line_num, {})
        for line_idx, (token_type, token) in enumerate(line):
            if is_token_subtype(token_type, Token.Name):
                # Flat character position of this occurrence.
                (l, c) = iBuilder.tokMap[(line_num, line_idx)]
                p = iBuilder.flatMap[(l, c)]
                # cond = False
                if scopeAnalyst is not None:
                    name2defScope = scopeAnalyst.resolve_scope()
                    isGlobal = scopeAnalyst.isGlobal
                    # if not False: #isGlobal.get((token, p), True):
                    try:
                        def_scope = name2defScope[(token, p)]
                        name_positions.setdefault((token, def_scope), [])
                        name_positions[(token, def_scope)].append(
                            (line_num, line_idx))
                        position_names[line_num][line_idx] = (token, def_scope)
                    except KeyError:
                        # Name has no resolved scope; silently skipped.
                        pass
                    # cond = True
                    # print (token, def_scope), line_num, line_idx
                else:
                    # No scope analysis available: use None as the scope.
                    def_scope = None
                    name_positions.setdefault((token, def_scope), [])
                    name_positions[(token, def_scope)].append(
                        (line_num, line_idx))
                    position_names[line_num][line_idx] = (token, def_scope)
                    # cond = True
                    # if cond:
                    #     print (token, def_scope), line_num, line_idx
    return (name_positions, position_names)
def get_context(self, string):
    """ Assuming the cursor is at the end of the specified string, get the
        context (a list of names) for the symbol at cursor position.

        NOTE(review): nesting of the operator-branch `else` reconstructed
        from mangled formatting — verify against the original project.
    """
    context = []
    reversed_tokens = list(self._lexer.get_tokens(string))
    reversed_tokens.reverse()

    # Pygments often tacks on a newline when none is specified in the input.
    # Remove this newline.
    if reversed_tokens and reversed_tokens[0][1].endswith('\n') and \
            not string.endswith('\n'):
        reversed_tokens.pop(0)

    # Walk the tokens right-to-left, accumulating dotted-name parts.
    current_op = ''
    for token, text in reversed_tokens:
        if is_token_subtype(token, Token.Name):
            # Handle a trailing separator, e.g 'foo.bar.'
            if current_op in self._name_separators:
                if not context:
                    context.insert(0, '')
            # Handle non-separator operators and punction.
            elif current_op:
                break
            context.insert(0, text)
            current_op = ''
        # Pygments doesn't understand that, e.g., '->' is a single operator
        # in C++. This is why we have to build up an operator from
        # potentially several tokens.
        elif token is Token.Operator or token is Token.Punctuation:
            # Handle a trailing separator, e.g 'foo.bar.'
            if current_op in self._name_separators:
                if not context:
                    context.insert(0, '')
            else:
                current_op = text + current_op
        # Break on anything that is not a Operator, Punctuation, or Name.
        else:
            break
    return context
def renameUsingScopeId(scopeAnalyst, iBuilder_ugly):
    '''
    Simple renaming: disambiguate overloaded names
    with indices: n -> n_1, n_2, n_3.
    The index is the def_scope id.
    Returns a list of renamed source lines (each ending in a newline).
    '''
    name2defScope = scopeAnalyst.resolve_scope()
    isGlobal = scopeAnalyst.isGlobal

    # Figure out which _scope_idx suffixes are illegal
    # (numeric suffixes already taken by existing names, e.g. foo_2).
    except_ids = map(int, [name.split('_')[-1]
                           for name in scopeAnalyst.nameScopes.keys()
                           if name.split('_')[-1].isdigit()])

    # Compute shorter def_scope identifiers
    scopes = set(name2defScope.values())
    scope2id = dict(zip(scopes, generateScopeIds(len(scopes), except_ids)))

    renaming = []
    for line_idx, line in enumerate(iBuilder_ugly.tokens):
        new_line = []
        for token_idx, (token_type, token) in enumerate(line):
            try:
                # Flat character position of this token occurrence.
                (l, c) = iBuilder_ugly.tokMap[(line_idx, token_idx)]
                pos = iBuilder_ugly.flatMap[(l, c)]
                def_scope = name2defScope[(token, pos)]
            except KeyError:
                # No scope info (not a tracked name); keep as-is.
                new_line.append(token)
                continue
            if is_token_subtype(token_type, Token.Name) and \
                    scopeAnalyst.is_overloaded(token) and \
                    not isGlobal[(token, pos)]:
                # Must rename token to something else
                # Append def_scope id to name
                new_line.append('%s_%d' % (token, scope2id[def_scope]))
            else:
                new_line.append(token)
        renaming.append(' '.join(new_line) + "\n")
    return renaming
def get_tokens(self, var_names=Names.RAW):
    """Generate tokens from a raw_code string, skipping comments.

    Keyword arguments:
    var_names -- Which variable names to output (default RAW).

    Consecutive string tokens are collapsed into one string literal;
    comments, '::' operators and whitespace are skipped; placeholder
    tokens are rewritten according to `var_names`.
    """
    previous_string = None
    for (token_type, token) in self.tokens:
        # Pygments breaks up strings into individual tokens representing
        # things like opening quotes and escaped characters. We want to
        # collapse all of these into a single string literal token.
        if previous_string and not is_token_subtype(
                token_type, Token.String):
            yield (Token.String, previous_string)
            previous_string = None
        if is_token_subtype(token_type, Token.String):
            if previous_string:
                previous_string += token
            else:
                previous_string = token
        elif is_token_subtype(token_type, Token.Number):
            yield (token_type, token)
        # Skip comments
        elif is_token_subtype(token_type, Token.Comment):
            continue
        # Skip the :: token added by HexRays
        elif is_token_subtype(token_type, Token.Operator) and token == '::':
            continue
        # Replace the text of placeholder tokens
        elif is_token_subtype(token_type, Token.Placeholder):
            yield {
                Names.RAW: (token_type, token),
                Names.SOURCE: (token_type, token.split('@@')[2]),
                Names.TARGET: (token_type, token.split('@@')[3]),
            }[var_names]
        elif not is_token_subtype(token_type, Token.Text):
            yield (token_type, token.strip())
        # Skip whitespace
        elif is_token_subtype(token_type, Token.Text):
            continue
        else:
            # Unreachable (the two Text checks above are exhaustive);
            # kept as a guard against future edits.
            raise TokenError(f"No token ({token_type}, {token})")
    # Bug fix: a string literal ending the token stream used to be
    # dropped, because pending strings were only flushed when a
    # non-string token followed.  Emit the pending string here.
    if previous_string:
        yield (Token.String, previous_string)
def tokensReplaceTokenOfType(tokens, tokenType, replacementValue, ignoreSubtypes=False):
    """Replace the text of every `tokenType` token with `replacementValue`.

    :param tokens:           A list of `Token` objects as defined in `pygments.token`
    :param tokenType:        A `TokenType` object as defined in `pygments.token`
    :param replacementValue: Text substituted for each matching token.
    :param ignoreSubtypes:   When set to True, subtypes of `tokenType` are
                             left untouched (only exact matches replaced);
                             default is `False`.
    :returns: An iterable of tuples that each hold information about a token.
    """
    if tokenType not in STANDARD_TYPES:
        raise ValueError("%s is not a standard Pygments token type." % tokenType)
    if ignoreSubtypes:
        matches = lambda tt: tt == tokenType
    else:
        matches = lambda tt: is_token_subtype(tt, tokenType)
    result = []
    for tok in tokens:
        if matches(tok[0]):
            result.append((tok[0], replacementValue))
        else:
            result.append(tok)
    return result
def get_lines(self, var_naming=VarNaming.NONE, var_table=None):
    """Yield lines of (token_type, text) pairs built from self.tokens.

    A line is flushed when a comment token or a newline-bearing text
    token is seen.  Consecutive string tokens collapse to one
    <LITERAL_STRING>, numbers become <LITERAL_NUMBER>, and '::' operators
    are dropped.  With var_naming=HASH each flushed line is passed
    through hash_line(); with var_naming=TABLE placeholder variables are
    renamed through self.var_table.
    """
    line = []
    for (token_type, token) in self.tokens:
        if is_token_subtype(token_type, Token.Comment) and len(line) > 0:
            if var_naming == VarNaming.HASH:
                line = hash_line(line)
            yield line
            line = []
        elif is_token_subtype(token_type, Token.String):
            # Pygments breaks up strings into individual tokens representing
            # things like opening quotes and escaped characters. We want to
            # collapse all of these into a single string literal token.
            if line != [] and line[-1] == (
                    Token.String,
                    "<LITERAL_STRING>",
            ):
                continue
            line.append((Token.String, "<LITERAL_STRING>"))
        elif is_token_subtype(token_type, Token.Number):
            line.append((Token.String, "<LITERAL_NUMBER>"))
        # Skip the :: token
        elif is_token_subtype(token_type, Token.Operator) and token == "::":
            continue
        # Replace placeholders if using table renaming
        elif var_naming == VarNaming.TABLE and is_token_subtype(
                token_type, Token.Placeholder.Var):
            if not var_table:
                raise KeyError("var_table must be set with table renaming")
            # Remove the '@@VAR_' from the beginning of the placeholder
            var_id = token[6:]
            # NOTE(review): lookup goes through self.var_table (instance
            # table) keyed by the `var_table` argument — presumably a
            # column selector, not the table itself; confirm with callers.
            line.append(
                (Token.Placeholder.Var, self.var_table[var_id][var_table]))
        elif not is_token_subtype(token_type, Token.Text):
            line.append((token_type, token.strip()))
        elif "\n" in token and len(line) > 0:
            if var_naming == VarNaming.HASH:
                line = hash_line(line)
            yield line
            line = []
def processFile(js_file_path):
    """Lex, align, beautify and index one original/minified JS file pair.

    On failure returns (js_file_path, None, <error string>).  On success
    returns (js_file_path, orig, no_renaming, hash_def_one_renaming,
    hash_def_two_renaming), where each element after the path is a list
    of space-joined token lines (each ending in a newline).

    Fixes the Python 2-only `except Exception, e` syntax; behaviour is
    otherwise unchanged.
    """
    try:
        # Num tokens before vs after
        try:
            tok1 = Lexer(os.path.join(files_root, 'orig',
                                      js_file_path)).tokenList
            tok2 = Lexer(os.path.join(files_root, 'no_renaming',
                                      js_file_path)).tokenList
            tok5 = Lexer(os.path.join(files_root, 'hash_def_one_renaming',
                                      js_file_path)).tokenList
            tok6 = Lexer(os.path.join(files_root, 'hash_def_two_renaming',
                                      js_file_path)).tokenList
        except:
            # Best-effort: any lexer failure maps to one error result.
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(set([len(tok1), len(tok2), len(tok5), len(tok6)])) == 1:
            return (js_file_path, None, 'Num tokens mismatch')

        clear = Beautifier()

        # Align minified and clear files, in case the beautifier
        # did something weird
        aligner = Aligner()

        (aligned1, aligned2) = aligner.web_align(tok1, tok2)
        (ok, beautified1, _err) = clear.web_run(aligned1)
        tok11 = WebLexer(beautified1).tokenList
        (ok, beautified2, _err) = clear.web_run(aligned2)
        tok22 = WebLexer(beautified2).tokenList

        (aligned5, aligned2) = aligner.web_align(tok5, tok2)
        (ok, beautified5, _err) = clear.web_run(aligned5)
        tok55 = WebLexer(beautified5).tokenList

        (aligned6, aligned2) = aligner.web_align(tok6, tok2)
        (ok, beautified6, _err) = clear.web_run(aligned6)
        tok66 = WebLexer(beautified6).tokenList

        try:
            iBuilder1 = IndexBuilder(tok11)
            iBuilder2 = IndexBuilder(tok22)
            iBuilder5 = IndexBuilder(tok55)
            iBuilder6 = IndexBuilder(tok66)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        # Check that at least one variable was renamed during minification
        orig_names = set([token
                          for line in iBuilder1.tokens
                          for (token_type, token) in line
                          if is_token_subtype(token_type, Token.Name)])
        ugly_names = set([token
                          for line in iBuilder2.tokens
                          for (token_type, token) in line
                          if is_token_subtype(token_type, Token.Name)])
        if not len(orig_names.difference(ugly_names)):
            return (js_file_path, None, 'Not minified')

        orig = []
        no_renaming = []
        hash_def_one_renaming = []
        hash_def_two_renaming = []

        for _line_idx, line in enumerate(iBuilder1.tokens):
            orig.append(' '.join([t for (_tt, t) in line]) + "\n")
        for _line_idx, line in enumerate(iBuilder2.tokens):
            no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")
        for _line_idx, line in enumerate(iBuilder5.tokens):
            hash_def_one_renaming.append(
                ' '.join([t for (_tt, t) in line]) + "\n")
        for _line_idx, line in enumerate(iBuilder6.tokens):
            hash_def_two_renaming.append(
                ' '.join([t for (_tt, t) in line]) + "\n")

        return (js_file_path,
                orig,
                no_renaming,
                hash_def_one_renaming,
                hash_def_two_renaming)

    except Exception as e:
        # `except Exception, e` is Python 2-only; `as e` works everywhere.
        return (js_file_path, None, str(e))
def tokenize(program_path, raw=False):
    """Tokenize a Ruby source file into per-line token streams.

    With raw=True, comment/literal tokens are passed through
    replace_special_char; otherwise each token is mapped to a category
    tag such as <LITERAL>, <ID>, <FUNCTION>.  Leading pairs of spaces
    become indent markers encoded relative to the previous line.
    Returns (token_streams, num_of_lines).

    NOTE(review): indentation of the `last_indent_count` guard was
    reconstructed from mangled formatting — verify against the original.
    """
    lexer = RubyLexer()
    token_streams = []
    with open(program_path, "r") as f:
        program = f.readlines()
    num_of_lines = len(program)
    last_indent_count = 0
    for line in program:
        line_of_token = []
        for token_data in lexer.get_tokens(line):
            token_type = token_data[0]
            token = token_data[-1]
            if raw:
                if is_token_subtype(token_type, Comment) or is_token_subtype(
                        token_type, Literal):
                    arranged_token = replace_special_char(token, comment=True)
                else:
                    arranged_token = replace_special_char(token, comment=False)
            else:
                if is_token_subtype(token_type, Literal):
                    arranged_token = "<LITERAL>"
                elif is_token_subtype(token_type, String):
                    arranged_token = "<STRING>"
                elif is_token_subtype(token_type, Number):
                    arranged_token = "<NUMBER>"
                elif token_type == Token.Name.Operator:
                    arranged_token = "<OPERATOR>"
                elif token_type == Name and token not in reserved:
                    arranged_token = "<ID>"
                elif token_type == Name.Variable.Instance:
                    arranged_token = "<INSTANCE_VAL>"
                elif token_type == Name.Variable.Class:
                    arranged_token = "<CLASS_VAL>"
                elif token_type == Name.Constant:
                    arranged_token = "<CONSTANT_ID>"
                elif token_type == Name.Function:
                    arranged_token = "<FUNCTION>"
                elif token_type == Name.Class:
                    arranged_token = "<CLASS>"
                elif token_type == Name.Namespace:
                    arranged_token = "<NAMESPACE>"
                elif token_type == Token.Name.Variable.Global:
                    arranged_token = "<GLOBAL_VAL>"
                elif token_type == Token.Error:
                    # Token emitted when pygments fails to lex (e.g. emoji).
                    arranged_token = "<ERROR>"
                elif is_token_subtype(token_type, Comment):
                    arranged_token = "<COMMENT>"
                else:
                    arranged_token = replace_special_char(token)
            # (debug dump of unexpected tokens removed; it was commented out)
            # Append a trailing space: tokens are space-separated.
            line_of_token.append(arranged_token + " ")
        # Two leading spaces count as one indent marker.
        line_of_token[0] = line_of_token[0].replace("<SPACE> <SPACE> ",
                                                    "<INDENT> ")
        # Indent levels are recorded relative to the previous line.
        indent_count = len(re.findall("<INDENT>", line_of_token[0]))
        if indent_count != 0:
            # An indented blank line may carry both the indent and the
            # newline in element 0, so strip the indent markers first,
            # then prepend the relative indent tag.
            indent_char = "<INDENT{}> ".format(indent_count - last_indent_count)
            line_of_token[0] = line_of_token[0].replace("<INDENT> ", "")
            line_of_token[0] = indent_char + line_of_token[0]
        # Blank lines (a lone newline token) do not update the reference
        # indent level.
        if len(line_of_token) != 1:
            last_indent_count = indent_count
        token_streams.append(line_of_token)
    return token_streams, num_of_lines
def compare(self, mini_js_path=None, keep_mini=True):
    """Compare a JS file against its minified version.

    Returns True when the file looks already minified (it carries a
    source map, or its identifiers are no longer than the minified
    ones), False otherwise.  Raises Exception on uglifier failure or
    token-count mismatch.

    Fixes: Python 2-only `raise Exception, msg` syntax; files are now
    closed via context managers; `same` is a set so membership tests are
    O(1) instead of O(n).
    """
    pid = int(multiprocessing.current_process().ident)
    lexer = get_lexer_for_filename(self.js_path)

    # before
    with open(self.js_path, 'r') as f:
        tmp_b = f.read()
    tokens_b = list(lex(tmp_b, lexer))

    # Discover the path to the source map
    map_path = sourcemap.discover(tmp_b)
    if map_path is not None:
        # The file couldn't have a source map unless it is already minified
        return True

    # after
    if mini_js_path is None:
        uglifier = Uglifier()
        mini_js_path = os.path.abspath('tmp_%d.u.js' % pid)
        uglifyjs_ok = uglifier.run(self.js_path, mini_js_path)
        if not uglifyjs_ok:
            raise Exception('Uglifier failed')

    with open(mini_js_path, 'r') as f:
        uglified = f.read()
    tokens_u = list(lex(uglified, lexer))  # returns a generator of tuples

    if not len(tokens_b) == len(tokens_u):
        if not keep_mini:
            remove_file(mini_js_path)
        raise Exception('Different number of tokens')

    clean_names = [token for (token_type, token) in tokens_b
                   if is_token_subtype(token_type, Token.Name)]
    ugly_names = [token for (token_type, token) in tokens_u
                  if is_token_subtype(token_type, Token.Name)]

    # Positions where minification left the identifier unchanged.
    same = {idx for (idx, token) in enumerate(clean_names)
            if ugly_names[idx] == token}

    clean_names_n = [token for (idx, token) in enumerate(clean_names)
                     if idx not in same]
    ugly_names_n = [token for (idx, token) in enumerate(ugly_names)
                    if idx not in same]

    if not clean_names_n:
        if not keep_mini:
            remove_file(mini_js_path)
        return False

    # Already minified when the renamed identifiers are, in total, no
    # longer than the uglified ones.
    if sum([len(v) for v in clean_names_n]) <= \
            sum([len(v) for v in ugly_names_n]):
        if not keep_mini:
            remove_file(mini_js_path)
        return True

    if not keep_mini:
        remove_file(mini_js_path)
    return False
def intim_introspection():
    """Lex the user's script files, build a forest of dotted-name access
    paths, type each node by eval-ing it in the current session, and
    write Vim `syntax match` commands to the Intim syntax file.
    """
    # TODO: some of these are not ABSOLUTELY needed. Make user free not
    # to install them.
    from pygments.token import Token, is_token_subtype
    from pygments.lexers import python as pylex
    import os  # for module type
    from sys import stdout  # for default 'file'
    from types import ModuleType, MethodType  # to define particular types
    from numpy import ufunc as UFuncType  # yet other particular types
    import inspect  # to check for type types
    from enum import Enum  # for analysing enum types
    Example_enum = Enum("Example", 'value')
    filenames = {USERSCRIPTFILES}  # sed by vimscript, remove duplicates
    source = ''  # concat here all these files
    for filename in filenames:
        with open(filename, 'r') as file:
            source += '\n' + file.read()

    class Type(object):
        """Type class for typing nodes of the token forest
        Can iterate over its instances for convenience
        """
        _instances = set()

        def __init__(self, id, python_type):
            """
            id: string suffix appended to 'IntimPy' to form the Vim
                highlight group name.
            python_type: the actual Python type used as a key in
                types_map, or None when the check happens later while
                typing the node.
            """
            self.id = 'IntimPy' + id
            self._instances.add(self)
            self.type = python_type

        @classmethod
        def instances(cls):
            """Iterate over all instances
            """
            return iter(cls._instances)

    # Supported types
    Bool = Type("Bool", type(True))
    BuiltIn = Type("Builtin", type(dir))
    Class = Type("Class", None)  # checked while typing node
    EnumType = Type("EnumType", type(Example_enum))
    EnumValue = Type("EnumValue", type(Example_enum.value))
    Float = Type("Float", type(1.))
    Function = Type("Function", None)  # checked while typing node
    Method = Type("Method", None)  # checked while typing node
    Instance = Type("Instance", None)  # instance of user's custom class
    Int = Type("Int", type(1))
    Module = Type("Module", type(os))
    NoneType = Type("NoneType", type(None))
    String = Type("String", type('a'))
    Unexistent = Type("Unexistent", None)  # node yet undefined in the session

    # Store them so that they can easily be found from actual python types
    types_map = {}
    for cls in Type.instances():
        # All `None` keys override each other.. never mind.
        types_map[cls.type] = cls

    class Node(object):
        """Identifier and references to its parents and kids.
        It may have no parent, it is a root then.
        """

        def __init__(self, id, parent=None, type=Unexistent):
            """
            id: string the node's identifier: i.e. how it is written
                in the script.
            parent: Node its parent node in the graph, root node if None
            type: Type associated type with coloration etc
            """
            self.id = id
            self.parent = parent
            self._kids = {}  # {id: Node}
            self.type = type

        @property
        def leaf(self):
            """True if has no kids
            """
            return not bool(self._kids)

        @property
        def root(self):
            """True if parent is None or a Forest
            """
            return self.parent is None or isinstance(self.parent, Forest)

        def add_node(self, node):
            """basic procedure to add a node as a kid
            """
            node.parent = self
            self._kids[node.id] = node

        def add_id(self, id):
            """Create a new kid from a string id
            if it already exists, do not erase the existing one
            return the newly created node
            """
            node = self._kids.get(id)
            if node:
                return node
            node = Node(id=id, parent=self)
            self._kids[id] = node
            return node

        @property
        def parents(self):
            """iterate backwards until a root parent is found
            """
            yield self
            if self.parent:
                yield from self.parent.parents
            else:
                # NOTE(review): raising StopIteration inside a generator
                # becomes RuntimeError under PEP 479 (Python 3.7+) — this
                # should be a plain `return`.  Left untouched here.
                raise StopIteration()

        @property
        def path(self):
            """Use backward iteration to build the full path to this node
            """
            res = [parent.id for parent in self.parents]
            return '.'.join(reversed(res))

        @property
        def kids(self):
            """iterate over kids
            """
            return iter(self._kids.values())

        @property
        def leaves(self):
            """Iterate over all leaf kids
            """
            if self.leaf:
                yield self
            else:
                for kid in self.kids:
                    yield from kid.leaves

        def __iter__(self):
            """Iterate over all nodes, top-down
            """
            yield self
            for kid in self.kids:
                yield from kid

        def _repr(self, prefix):
            """Iterate over all nodes and print full paths
            """
            res = "{}{}: {}\n".format(prefix, self.id, self.type.id)
            for kid in self.kids:
                res += kid._repr(prefix + self.id + '.')
            return res

        def __repr__(self):
            return self._repr('')

        def __len__(self):
            """Number of nodes: ourselves as a node + the weight of our kids
            """
            return 1 + sum(len(kid) for kid in self.kids)

        def type_nodes(self, prefix=''):
            """Ultimate use of this forest: evaluate our id in the
            current context to retrieve information on the current state
            of this access path
            prefix: string previous path (context) of this node called
            by the parents
            """
            path = prefix + self.id
            # analyse type of this node:
            try:
                t = eval("type({})".format(path), globals())
            except (AttributeError, NameError) as e:
                # then all subsequent nodes are unexistent
                for node in self:
                    node.type = Unexistent
                return
            # is the type available, special?
            node_type = types_map.get(t)
            if node_type:
                self.type = node_type
            else:
                # then it is just a plain valid, known node, probably
                # instance of a custom class or a function, method
                # unelegant way to get these functions into eval scope:
                if eval("f({})".format(path), globals(),
                        {'f': inspect.ismethod}):
                    self.type = Method
                elif eval("f({})".format(path), globals(),
                          {'f': lambda x: inspect.isfunction(x)
                           or type(x) is UFuncType}):
                    # inspect misses this one
                    self.type = Function
                elif eval("f({})".format(path), globals(),
                          {'f': inspect.isclass}):
                    if eval("f({})".format(path), globals(),
                            {'f': lambda c: issubclass(c, Enum)}):
                        self.type = EnumType
                    else:
                        self.type = Class
                elif eval("f({})".format(path), globals(),
                          {'f': lambda i: isinstance(i, Enum)}):
                    self.type = EnumValue
                else:
                    self.type = Instance
            for kid in self.kids:
                kid.type_nodes(path + '.')

        def write(self, prefix, depth, file=stdout):
            """Build a vim syntax command to color this node, given
            information recursively given from above:
            prefix: string prefix to the command, build from above
            depth: int our depth within the forest, build from above
            file: send there the resulting commands: once on each node
            """
            # match expressions from the root, but only color the leaf:
            suffix = r"\>'hs=e-" + str(len(self.id) - 1)
            # allow any amount of whitespace around the '.' operator
            whitespace = r"[ \s\t\n]*\.[ \s\t\n]*"
            # for speed, provide Vim information about the items inclusions:
            if not self.root:
                suffix += " contained"
            if self.leaf:
                suffix += " contains=NONE"
            if not self.leaf:
                # watch out: here is an additional iteration on kids! **
                subgroups = {sub.type.id for sub in self.kids}
                suffix += " contains=" + ','.join(subgroups)
            # here is the full command:
            command = "syntax match " + self.type.id + prefix + suffix
            # throw it up
            print(command, file=file)
            # ask the kids to do so :)
            for kid in self.kids:
                # ** second iteration, could be the only one
                if kid.type is not Unexistent:
                    # to release burden a little bit
                    kid.write(prefix + whitespace + kid.id, depth + 1, file)

    class Forest(Node):
        """A Forest is a special Node with no parent, no id, and
        containing only root nodes.
        """

        def __init__(self):
            self._kids = {}

        @property
        def parents(self):
            """A forest has no parents
            """
            # NOTE(review): this property is not a generator, so
            # StopIteration escapes to the caller here — confirm intent.
            raise StopIteration()

        def __repr__(self):
            if self.leaf:
                return "empty Forest."
            res = ""
            for kid in self.kids:
                res += repr(kid)
            return res

        def __len__(self):
            """Total number of nodes in the forest:
            """
            return sum(len(kid) for kid in self.kids)

        def __iter__(self):
            """Iterate over all trees, not ourselves
            """
            for kid in self.kids:
                yield from kid

        def type_nodes(self):
            """Ask each tree to type itself
            """
            for kid in self.kids:
                kid.type_nodes()

        def write(self, file=stdout):
            """Visit the forest to build an ad-hoc vim syntax file and
            color the nodes in the source file.
            """
            # The root name starts without being a subname of something else.
            root_prefix = (
                r" '\("  # (exclude comments ending with a period)
                r"\n\s\{-}[^#].\{-}"  # previous identifier + period
                r"[a-zA-Z][0-9]\{-}[\s\n]\{-}\.[\s\n]\{-}"  # well, do not match all this.
                r"\)\@<!\<")
            for kid in self.kids:
                if kid.type is not Unexistent:
                    # to ease the file a little bit
                    kid.write(root_prefix + kid.id, 0, file=file)
            # signal to Intim: the syntax file may be read now!
            print('" end', file=file)

    # Start lexing!
    lx = pylex.Python3Lexer()
    g = lx.get_tokens(source)
    # gather names to color as a forest of '.' operators:
    forest = Forest()
    current = forest
    # flag to keep track of whether to add in depth or go back to the root
    last_was_a_name = True
    # Also gather misc immediate tokens.. just for fun and extensibility
    misc = {}
    for i in [Token.Name.Decorator,
              Token.Name.Namespace,
              Token.Name.Operator,
              Token.Name.Keyword,
              Token.Name.Literal,
              Token.Comment,
              ]:
        misc[i] = set()
    # iterate over type_of_token, string
    for t, i in g:
        in_misc = False
        for subtype, harvest in misc.items():
            if is_token_subtype(t, subtype):
                in_misc = True
                harvest.add(i)
                break
        if in_misc:
            # no need to go further: this token does not belong to the forest
            continue
        if is_token_subtype(t, Token.Name):
            # A name directly after a name starts a new root path;
            # after a '.', it extends the current path.
            node = forest if last_was_a_name else current
            current = node.add_id(i)
            last_was_a_name = True
        elif is_token_subtype(t, Token.Operator):
            if i == '.':
                last_was_a_name = False
    # gather information on node types
    forest.type_nodes()
    # Write the vimscript commands to the syntax file:
    filename = INTIMSYNTAXFILE  # sed by vimscript
    with open(filename, 'w') as file:
        forest.write(file)
def intim_introspection():
    """Introspect the user's python session and emit a Vim syntax file.

    Reads the user's script files (paths injected by vimscript into
    USERSCRIPTFILES), lexes them with pygments, gathers every dotted
    access path (`a.b.c`) into a forest of nodes, evaluates each path in
    the *current* interpreter globals to discover its runtime type, and
    writes one `syntax match` command per typed node to INTIMSYNTAXFILE
    so Vim can color identifiers by their live type.
    """
    # TODO: some of these are not ABSOLUTELY needed. Make user free not
    # to install them.
    from pygments.token import Token, is_token_subtype
    from pygments.lexers import python as pylex
    import os  # for module type
    from sys import stdout  # for default 'file'
    from types import ModuleType, MethodType  # to define particular types
    from numpy import ufunc as UFuncType  # yet other particular types
    import inspect  # to check for type types
    from enum import Enum  # for analysing enum types
    # Throwaway enum used only to capture the enum/enum-value types below.
    Example_enum = Enum("Example", 'value')
    # NOTE(review): `{USERSCRIPTFILES}` is a placeholder substituted by
    # vimscript (sed) before this file is executed; the set literal also
    # removes duplicate paths.
    filenames = {USERSCRIPTFILES}  # sed by vimscript, remove duplicates
    source = ''  # concat here all these files
    for filename in filenames:
        with open(filename, 'r') as file:
            source += '\n' + file.read()

    class Type(object):
        """Type class for typing nodes of the token forest.

        Can iterate over its instances for convenience.
        """

        # Registry of every Type ever constructed (class-level, shared).
        _instances = set()

        def __init__(self, id, python_type):
            """
            id: string, suffix of the Vim highlight group name.
            python_type: the actual Python type mapped to this Type, or
                None for kinds detected later while typing nodes
                (functions, methods, classes, plain instances).
            """
            self.id = 'IntimPy' + id
            self._instances.add(self)
            self.type = python_type

        @classmethod
        def instances(cls):
            """Iterate over all instances
            """
            return iter(cls._instances)

    # Supported types
    Bool = Type("Bool", type(True))
    BuiltIn = Type("Builtin", type(dir))
    Class = Type("Class", None)  # checked while typing node
    EnumType = Type("EnumType", type(Example_enum))
    EnumValue = Type("EnumValue", type(Example_enum.value))
    Float = Type("Float", type(1.))
    Function = Type("Function", None)  # checked while typing node
    Method = Type("Method", None)  # checked while typing node
    Instance = Type("Instance", None)  # instance of user's custom class
    Int = Type("Int", type(1))
    Module = Type("Module", type(os))
    NoneType = Type("NoneType", type(None))
    String = Type("String", type('a'))
    Unexistent = Type("Unexistent", None)  # node yet undefined in the session
    # Store them so that they can easily be found from actual python types
    types_map = {}
    for cls in Type.instances():  # All `None` keys override each other..
        types_map[cls.type] = cls  # .. never mind.

    class Node(object):
        """Identifier and references to its parents and kids.

        It may have no parent, it is a root then.
        """

        def __init__(self, id, parent=None, type=Unexistent):
            """
            id: string
                the node's identifier: i.e. how it is written in the script.
            parent: Node
                its parent node in the graph, root node if None
            type: Type
                associated type with coloration etc
            """
            self.id = id
            self.parent = parent
            self._kids = {}  # {id: Node}
            self.type = type

        @property
        def leaf(self):
            """True if has no kids
            """
            return not bool(self._kids)

        @property
        def root(self):
            """True if parent is None or a Forest
            """
            return self.parent is None or isinstance(self.parent, Forest)

        def add_node(self, node):
            """basic procedure to add a node as a kid
            """
            node.parent = self
            self._kids[node.id] = node

        def add_id(self, id):
            """Create a new kid from a string id
            if it already exists, do not erase the existing one
            return the newly created node
            """
            node = self._kids.get(id)
            if node:
                return node
            node = Node(id=id, parent=self)
            self._kids[id] = node
            return node

        @property
        def parents(self):
            """iterate backwards until a root parent is found
            """
            yield self
            if self.parent:
                yield from self.parent.parents
            else:
                # NOTE(review): raising StopIteration inside a generator is
                # deprecated and a RuntimeError since PEP 479 / Python 3.7;
                # a bare `return` is the modern equivalent — TODO confirm
                # the targeted Python version before changing.
                raise StopIteration()

        @property
        def path(self):
            """Use backward iteration to build the full path to this node
            """
            res = [parent.id for parent in self.parents]
            return '.'.join(reversed(res))

        @property
        def kids(self):
            """iterate over kids
            """
            return iter(self._kids.values())

        @property
        def leaves(self):
            """Iterate over all leaf kids
            """
            if self.leaf:
                yield self
            else:
                for kid in self.kids:
                    yield from kid.leaves

        def __iter__(self):
            """Iterate over all nodes, top-down
            """
            yield self
            for kid in self.kids:
                yield from kid

        def _repr(self, prefix):
            """Iterate over all nodes and print full paths
            """
            res = "{}{}: {}\n".format(prefix, self.id, self.type.id)
            for kid in self.kids:
                res += kid._repr(prefix + self.id + '.')
            return res

        def __repr__(self):
            return self._repr('')

        def __len__(self):
            """Number of nodes: ourselves as a node + the weight of our kids
            """
            return 1 + sum(len(kid) for kid in self.kids)

        def type_nodes(self, prefix=''):
            """Ultimate use of this forest: evaluate our id in the current
            context to retrieve information on the current state of this
            access path

            prefix: string
                previous path (context) of this node called by the parents
            """
            path = prefix + self.id
            # analyse type of this node:
            # NOTE: eval on identifiers harvested from the user's own
            # script, run in this interpreter's globals — intentional here
            # (introspection of the user's session), but inherently unsafe
            # on untrusted input.
            try:
                t = eval("type({})".format(path), globals())
            except (AttributeError, NameError) as e:
                # then all subsequent nodes are unexistent
                for node in self:
                    node.type = Unexistent
                return
            # is the type available, special?
            node_type = types_map.get(t)
            if node_type:
                self.type = node_type
            else:
                # then it is just a plain valid, known node, probably
                # instance of a custom class or a function, method
                # unelegant way to get these functions into eval scope:
                if eval("f({})".format(path), globals(),
                        {'f': inspect.ismethod}):
                    self.type = Method
                elif eval("f({})".format(path), globals(),
                          {'f': inspect.isfunction}):
                    self.type = Function
                elif eval("f({})".format(path), globals(),
                          {'f': inspect.isclass}):
                    if eval("f({})".format(path), globals(),
                            {'f': lambda c: issubclass(c, Enum)}):
                        self.type = EnumType
                    else:
                        self.type = Class
                elif eval("f({})".format(path), globals(),
                          {'f': lambda i: isinstance(i, Enum)}):
                    self.type = EnumValue
                else:
                    self.type = Instance
            # recurse: kids are attributes accessed below this path
            for kid in self.kids:
                kid.type_nodes(path + '.')

        def write(self, prefix, depth, file=stdout):
            """Build a vim syntax command to color this node, given
            information recursively given from above:

            prefix: string
                prefix to the command, build from above
            depth: int
                our depth within the forest, build from above
            file:
                send there the resulting commands: once on each node
            """
            # match expressions from the root, but only color the leaf:
            suffix = r"\>'hs=e-" + str(len(self.id) - 1)
            # allow any amount of whitespace around the '.' operator
            whitespace = r"[ \s\t\n]*\.[ \s\t\n]*"
            # for speed, provide Vim information about the items inclusions:
            if not self.root:
                suffix += " contained"
            if self.leaf:
                suffix += " contains=NONE"
            if not self.leaf:
                # watch out: here is an additional iteration on kids! **
                subgroups = {sub.type.id for sub in self.kids}
                suffix += " contains=" + ','.join(subgroups)
            # here is the full command:
            command = "syntax match " + self.type.id + prefix + suffix
            # throw it up
            print(command, file=file)
            # ask the kids to do so :)
            for kid in self.kids:  # ** second iteration, could be the only one
                if kid.type is not Unexistent:  # to release burden a little bit
                    kid.write(prefix + whitespace + kid.id, depth + 1, file)

    class Forest(Node):
        """A Forest is a special Node with no parent, no id, and containing
        only root nodes.
        """

        def __init__(self):
            # intentionally does NOT call Node.__init__: a forest has no
            # id, no parent and no type of its own.
            self._kids = {}

        @property
        def parents(self):
            """A forest has no parents
            """
            # NOTE(review): plain method raising StopIteration — callers
            # iterating `parents` rely on this ending the walk; see the
            # PEP 479 note on Node.parents.
            raise StopIteration()

        def __repr__(self):
            if self.leaf:
                return "empty Forest."
            res = ""
            for kid in self.kids:
                res += repr(kid)
            return res

        def __len__(self):
            """Total number of nodes in the forest:
            """
            return sum(len(kid) for kid in self.kids)

        def __iter__(self):
            """Iterate over all trees, not ourselves
            """
            for kid in self.kids:
                yield from kid

        def type_nodes(self):
            """Ask each tree to type itself
            """
            for kid in self.kids:
                kid.type_nodes()

        def write(self, file=stdout):
            """Visit the forest to build an ad-hoc vim syntax file and color
            the nodes in the source file.
            """
            # the root name starts without being a subname of something else:
            root_prefix = r" '\([a-zA-Z][a-zA-Z0-9]*[ \s\t\n]*\.[ \s\t\n]*\)\@<!\<"
            for kid in self.kids:
                if kid.type is not Unexistent:  # to ease the file a little bit
                    kid.write(root_prefix + kid.id, 0, file=file)
            # signal to Intim: the syntax file may be read now!
            print('" end', file=file)

    # Start lexing!
    lx = pylex.Python3Lexer()
    g = lx.get_tokens(source)
    # gather names to color as a forest of '.' operators:
    forest = Forest()
    current = forest
    # flag to keep track of whether to add in depth or go back to the root
    last_was_a_name = True
    # Also gather misc immediate tokens.. just for fun and extensibility
    # (pygments creates token subtypes on attribute access, so even unusual
    # ones like Token.Name.Keyword are valid keys here).
    misc = {}
    for i in [Token.Name.Decorator,
              Token.Name.Namespace,
              Token.Name.Operator,
              Token.Name.Keyword,
              Token.Name.Literal,
              Token.Comment,
              ]:
        misc[i] = set()
    # iterate over type_of_token, string
    for t, i in g:
        in_misc = False
        for subtype, harvest in misc.items():
            if is_token_subtype(t, subtype):
                in_misc = True
                harvest.add(i)
                break
        if in_misc:
            # no need to go further: this token does not belong to the forest
            continue
        if is_token_subtype(t, Token.Name):
            # a name right after a name starts a fresh root path; a name
            # right after '.' descends one level deeper
            node = forest if last_was_a_name else current
            current = node.add_id(i)
            last_was_a_name = True
        elif is_token_subtype(t, Token.Operator):
            if i == '.':
                last_was_a_name = False
    # gather information on node types
    forest.type_nodes()
    # Write the vimscript commands to the syntax file:
    filename = INTIMSYNTAXFILE  # sed by vimscript
    with open(filename, 'w') as file:
        forest.write(file)
# NOTE(review): Python 2 script (print statements) — will not run on Python 3.
# Lexer selection: the concrete filename only matters for its .js extension.
lexer = get_lexer_for_filename("jsFile.js")
# NOTE(review): this handle is never used and never closed (it is shadowed by
# the per-corpus open() below) — looks like leftover dead code; verify before
# removing.
f = open('really_big_file.dat')
# Every *.js file under the directory passed as the first CLI argument.
corpora = Folder(sys.argv[1]).fullFileNames("*.js")
# Restrict to the original / no_renaming / hash_def_one_renaming variants.
for path_corpus in [c for c in corpora
                    if 'orig' in c
                    or 'no_renaming' in c
                    or 'hash_def_one_renaming' in c]:
    print os.path.basename(path_corpus)
    # NOTE(review): handle opened per corpus file but never closed; a
    # with-block would be safer.
    f = open(path_corpus)
    names = set([])
    # Stream the (potentially huge) file chunk by chunk to bound memory.
    for piece in read_in_chunks(f):
        #process_data(piece)
        tokens = lex(piece, lexer).tokenList
        # keep every identifier (Token.Name and its subtypes)
        names.update([token for (token_type, token) in tokens
                      if is_token_subtype(token_type, Token.Name)])
    # NOTE(review): `names` is a set, so every Counter value is exactly 1 and
    # the min/max/mean/median statistics below are degenerate — presumably the
    # intent was to count occurrences over the raw token stream; confirm.
    cnt = Counter(names)
    print ' ', len(cnt.keys()), 'names'
    # scipy.stats.describe: (nobs, (min, max), mean, variance, ...)
    s = stats.describe(cnt.values())
    print ' min =', s[1][0]
    print ' max =', s[1][1]
    print ' mean =', s[2]
    print ' variance =', s[3]
    print ' median =', median(cnt.values())
    print
def processTranslation(translation, iBuilder_clear, scopeAnalyst,
                       lm_path, f, output_path, base_name, clear):
    """Post-process a moses n-best translation of a minified JS file.

    NOTE(review): Python 2 code (iteritems, has_key, dict.keys()[0],
    tuple-parameter lambdas).

    Parameters
    ----------
    translation : str or None
        Raw moses n-best output ('line_idx ||| translation ||| ...' lines).
    iBuilder_clear : indexer
        Token table for the input: .tokens (list of lines of
        (token_type, token) pairs), .tokMap and .flatMap position maps.
    scopeAnalyst : scope analysis object (resolve_scope(), isGlobal).
    lm_path : str — path to the language model used for LM-based renaming.
    f : str — temp file name; its second dot-component is the strategy tag.
    output_path, base_name : where/how to store the beautified results.
    clear : beautifier wrapper; .run(in, out) returns truthiness for success.

    Returns
    -------
    list of candidate-summary tuples on success (empty if translation is
    None), or False on scoping failure / beautifier failure.
    """
    nc = []

    def writeTmpLines(lines, out_file_path):
        # Serialize a list of token lines back to a whitespace-joined JS file.
        js_tmp = open(out_file_path, 'w')
        js_tmp.write('\n'.join([' '.join([token
                                          for (_token_type, token) in line])
                                for line in lines]).encode('utf8'))
        js_tmp.write('\n')
        js_tmp.close()

    if translation is not None:
        # Compute scoping
        try:
            # name2Xscope are dictionaries where keys are (name, start_index)
            # tuples and values are scope identifiers. Note: start_index is a
            # flat (unidimensional) index, not (line_chr_idx, col_chr_idx).
            name2defScope = scopeAnalyst.resolve_scope()
            # name2useScope = scopeAnalyst.resolve_use_scope()
            # isGlobal has similar structure and returns True/False
            isGlobal = scopeAnalyst.isGlobal
            # name2pth has similar structure and returns AST depths
            # name2pth = scopeAnalyst.resolve_path()
            # nameOrigin[(name, def_scope)] = depth
            # nameOrigin = scopeAnalyst.nameOrigin
        except:
            # NOTE(review): bare except silently maps any scoping failure to
            # False; deliberate best-effort behavior, kept as-is.
            return False

        name_candidates = {}

        # Collect names and their locations in various formats
        # that will come in handy later:

        # Which locations [(line number, index within line)] does
        # a variable name appear at?
        name_positions = {}

        # Which variable name is at a location specified by
        # [line number][index within line]?
        position_names = {}

        for line_num, line in enumerate(iBuilder_clear.tokens):
            position_names.setdefault(line_num, {})
            for line_idx, (token_type, token) in enumerate(line):
                if is_token_subtype(token_type, Token.Name):
                    # translate (line, index-in-line) to a flat char index
                    (l, c) = iBuilder_clear.tokMap[(line_num, line_idx)]
                    p = iBuilder_clear.flatMap[(l, c)]
                    # only track local (non-global) variables
                    if not isGlobal.get((token, p), True):
                        def_scope = name2defScope[(token, p)]
                        name_positions.setdefault((token, def_scope), [])
                        name_positions[(token, def_scope)].append(
                            (line_num, line_idx))
                        position_names[line_num][line_idx] = (token, def_scope)

        # Parse moses output.
        lines_translated = set([])
        translations = {}
        # NOTE(review): the loop below rebinds the parameter `translation`
        # to the per-line translated text; the original argument is no
        # longer reachable afterwards.
        for line in translation.split('\n'):
            parts = line.split('|||')
            if not len(parts[0]):
                continue
            # The index of the line in the input to which this
            # translated line corresponds, starting at 0:
            n = int(parts[0])
            lines_translated.add(n)
            # The translation:
            translation = parts[1].strip()
            translation_parts = translation.split(' ')
            # Only keep translations that have exactly the same
            # number of tokens as the input
            # If the translation has more tokens, copy the input
            if len(translation_parts) != len(iBuilder_clear.tokens[n]):
                translation_parts = [token for (token_type, token)
                                     in iBuilder_clear.tokens[n]]
                translation = ' '.join(translation_parts)
            # An input can have identical translations, but with
            # different scores (the number of different translations
            # per input is controlled by the -n-best-list decoder
            # parameter). Keep only unique translations.
            translations.setdefault(n, set([]))
            translations[n].add(translation)
            #print n, translation_parts
            # Which within-line indices have non-global var names?
            line_dict = position_names.get(n, {})
            # For each variable name, record its candidate translation
            # and on how many lines (among the -n-best-list) it appears on
            for line_idx in line_dict.keys():
                # The original variable name
                (name, def_scope) = line_dict[line_idx]
                # The translated variable name
                name_translation = translation_parts[line_idx]
                # Record the line number (we will give more weight
                # to names that appear on many translation lines)
                name_candidates.setdefault((name, def_scope), {})
                name_candidates[(name, def_scope)].setdefault(
                    name_translation, set([]))
                name_candidates[(name, def_scope)][name_translation].add(n)
        # for (name, def_scope), d in name_candidates.iteritems():
        #     nc.append( (def_scope, name, ','.join(d.keys())) )
        #print name, name_translation, n, def_scope

        def computeFreqLenRenaming(lines, name_candidates, name_positions):
            # Pick renamings by (line-frequency, name-length), both descending.
            renaming_map = {}
            seen = {}
            # There is no uncertainty about the translation for
            # variables that have a single candidate translation
            for ((name, def_scope), val) in \
                    [((name, def_scope), val)
                     for (name, def_scope), val in name_candidates.items()
                     if len(val.keys()) == 1]:
                candidate_name = val.keys()[0]
                # Don't use the same translation for different
                # variables within the same scope.
                if not seen.has_key((candidate_name, def_scope)):
                    renaming_map[(name, def_scope)] = candidate_name
                    seen[(candidate_name, def_scope)] = True
                else:
                    renaming_map[(name, def_scope)] = name
            # For the remaining variables, choose the translation
            # that has the longest name
            token_lines = []
            for (name, def_scope), pos in name_positions.iteritems():
                # pos is a list of tuples [(line_num, line_idx)]
                token_lines.append(((name, def_scope),
                                    len(set([line_num
                                             for (line_num, _line_idx) in pos]))))
            # Sort names by how many lines they appear
            # on in the input, descending
            token_lines = sorted(token_lines,
                                 key=lambda ((name, def_scope), num_lines): -num_lines)
            for (name, def_scope), _num_lines in token_lines:
                # Sort candidates by how many lines in the translation
                # they appear on, and by name length, both descending
                candidates = sorted([(name_translation, len(line_nums))
                                     for (name_translation, line_nums)
                                     in name_candidates[(name, def_scope)].items()],
                                    key=lambda e: (-e[1], -len(e[0])))
                if len(candidates) > 1:
                    unseen_candidates = [candidate_name
                                         for (candidate_name, _occurs) in candidates
                                         if not seen.has_key((candidate_name, def_scope))]
                    if len(unseen_candidates):
                        candidate_name = unseen_candidates[0]
                        renaming_map[(name, def_scope)] = candidate_name
                        seen[(candidate_name, def_scope)] = True
                    else:
                        # all candidates already taken in this scope:
                        # keep the original name
                        renaming_map[(name, def_scope)] = name
                        seen[(name, def_scope)] = True
            return renaming_map

        def computeLenRenaming(lines, name_candidates, name_positions):
            # Pick renamings purely by candidate name length, descending.
            renaming_map = {}
            seen = {}
            # There is no uncertainty about the translation for
            # variables that have a single candidate translation
            for ((name, def_scope), val) in \
                    [((name, def_scope), val)
                     for (name, def_scope), val in name_candidates.items()
                     if len(val.keys()) == 1]:
                candidate_name = val.keys()[0]
                if not seen.has_key((candidate_name, def_scope)):
                    renaming_map[(name, def_scope)] = candidate_name
                    seen[(candidate_name, def_scope)] = True
                else:
                    renaming_map[(name, def_scope)] = name
            # For the remaining variables, choose the translation that
            # has the longest name
            token_lines = []
            for (name, def_scope), pos in name_positions.iteritems():
                token_lines.append(((name, def_scope),
                                    len(set([line_num
                                             for (line_num, _line_idx) in pos]))))
            # Sort names by how many lines they appear
            # on in the input, descending
            token_lines = sorted(token_lines,
                                 key=lambda ((name, def_scope), num_lines): -num_lines)
            for (name, def_scope), _num_lines in token_lines:
                # Sort candidates by length of translation, descending
                candidates = sorted([(name_translation, len(line_nums))
                                     for (name_translation, line_nums)
                                     in name_candidates[(name, def_scope)].items()],
                                    key=lambda e: -len(e[0]))
                if len(candidates) > 1:
                    unseen_candidates = [candidate_name
                                         for (candidate_name, _occurs) in candidates
                                         if not seen.has_key((candidate_name, def_scope))]
                    if len(unseen_candidates):
                        candidate_name = unseen_candidates[0]
                        renaming_map[(name, def_scope)] = candidate_name
                        seen[(candidate_name, def_scope)] = True
                    else:
                        renaming_map[(name, def_scope)] = name
                        seen[(name, def_scope)] = True
            return renaming_map

        def computeLMRenaming(lines, name_candidates, name_positions, lm_path):
            # Pick renamings by language-model log probability of the
            # resulting draft lines.
            renaming_map = {}
            seen = {}
            #print name_candidates
            # There is no uncertainty about the translation for
            # variables that have a single candidate translation
            for ((name, def_scope), val) in \
                    [((name, def_scope), val)
                     for (name, def_scope), val in name_candidates.items()
                     if len(val.keys()) == 1]:
                candidate_name = val.keys()[0]
                if not seen.has_key((candidate_name, def_scope)):
                    renaming_map[(name, def_scope)] = candidate_name
                    seen[(candidate_name, def_scope)] = True
                else:
                    renaming_map[(name, def_scope)] = name
            # For the remaining variables, choose the translation that
            # gives the highest language model log probability
            token_lines = []
            for (name, def_scope), pos in name_positions.iteritems():
                token_lines.append(((name, def_scope),
                                    len(set([line_num
                                             for (line_num, _line_idx) in pos]))))
            # Sort names by how many lines they appear
            # on in the input, descending
            token_lines = sorted(token_lines,
                                 key=lambda ((name, def_scope), num_lines): -num_lines)
            for (name, def_scope), _num_lines in token_lines:
                # Sort candidates by how many lines in the translation
                # they appear on, and by name length, both descending
                candidates = sorted([(name_translation, len(line_nums))
                                     for (name_translation, line_nums)
                                     in name_candidates[(name, def_scope)].items()],
                                    key=lambda e: (-e[1], -len(e[0])))
                if len(candidates) > 1:
                    log_probs = []
                    unseen_candidates = [candidate_name
                                         for (candidate_name, _occurs) in candidates
                                         if not seen.has_key((candidate_name, def_scope))]
                    if len(unseen_candidates):
                        for candidate_name in unseen_candidates:
                            # all input lines where this variable occurs
                            line_nums = set([num
                                             for (num, idx) in name_positions[(name, def_scope)]])
                            draft_lines = []
                            for line_num in line_nums:
                                # substitute the candidate at every position
                                # of this variable on the line
                                draft_line = [token
                                              for (token_type, token) in lines[line_num]]
                                for line_idx in [idx
                                                 for (num, idx) in name_positions[(name, def_scope)]
                                                 if num == line_num]:
                                    draft_line[line_idx] = candidate_name
                                draft_lines.append(' '.join(draft_line))
                            # score each draft line with the LM; -9999999999
                            # acts as a sentinel "impossible" log probability
                            line_log_probs = []
                            for line in draft_lines:
                                lmquery = LMQuery(lm_path=lm_path)
                                (lm_ok, lm_log_prob, _lm_err) = lmquery.run(line)
                                #print _lm_err
                                if not lm_ok:
                                    lm_log_prob = -9999999999
                                line_log_probs.append(lm_log_prob)
                            if not len(line_log_probs):
                                lm_log_prob = -9999999999
                            else:
                                lm_log_prob = float(sum(line_log_probs) / len(line_log_probs))
                            log_probs.append((candidate_name, lm_log_prob))
                        #print candidate_name, log_probs
                        # best-scoring candidate wins
                        candidate_names = sorted(log_probs, key=lambda e: -e[1])
                        candidate_name = candidate_names[0][0]
                        renaming_map[(name, def_scope)] = candidate_name
                        seen[(candidate_name, def_scope)] = True
                    else:
                        renaming_map[(name, def_scope)] = name
                        seen[(name, def_scope)] = True
            #print renaming_map
            return renaming_map

        def rename(lines, renaming_map):
            # Apply a renaming map to a copy of the token lines; reads
            # name_positions from the enclosing scope.
            draft_translation = deepcopy(lines)
            for (name, def_scope), renaming in renaming_map.iteritems():
                for (line_num, line_idx) in name_positions[(name, def_scope)]:
                    # NOTE(review): this unpacking rebinds the loop variable
                    # `name`; harmless only because `name` is not read again
                    # after this point within the inner loop.
                    (token_type, name) = draft_translation[line_num][line_idx]
                    draft_translation[line_num][line_idx] = (token_type, renaming)
            return draft_translation

        # def replaceLiterals(lines, revLiteralsMap):
        #     draft_translation = deepcopy(lines)
        #     # Replace back literals
        #     lineLengths = [len(l) for l in lines]
        #     idx = 0
        #     sumIdx = 0
        #     for (flatIdx, literal) in revLiteralsMap:
        #         while flatIdx > sumIdx + lineLengths[idx]:
        #             sumIdx += lineLengths[idx]
        #             idx += 1
        #         (token_type, name) = draft_translation[idx][flatIdx-sumIdx]
        #         draft_translation[idx][flatIdx-sumIdx] = (token_type, literal)
        #     return draft_translation

        # strategy tag is the second dot-component of the temp file name
        strategy = f.split('.')[1]

        # --- LM-based renaming ---
        renaming_map = computeLMRenaming(iBuilder_clear.tokens,
                                         name_candidates,
                                         name_positions,
                                         lm_path)
        for (name, def_scope), renaming in renaming_map.iteritems():
            nc.append((strategy + '.lm', def_scope, renaming, name,
                       ','.join(name_candidates[(name, def_scope)])))
        lm_translation = rename(iBuilder_clear.tokens, renaming_map)
        writeTmpLines(lm_translation, f[:-3] + '.lm.js')
        ok = clear.run(f[:-3] + '.lm.js',
                       os.path.join(output_path,
                                    '%s.%s.lm.js' % (base_name, strategy)))
        if not ok:
            return False

        # --- length-based renaming ---
        renaming_map = computeLenRenaming(iBuilder_clear.tokens,
                                          name_candidates,
                                          name_positions)
        for (name, def_scope), renaming in renaming_map.iteritems():
            nc.append((strategy + '.len', def_scope, renaming, name,
                       ','.join(name_candidates[(name, def_scope)])))
        len_translation = rename(iBuilder_clear.tokens, renaming_map)
        writeTmpLines(len_translation, f[:-3] + '.len.js')
        ok = clear.run(f[:-3] + '.len.js',
                       os.path.join(output_path,
                                    '%s.%s.len.js' % (base_name, strategy)))
        if not ok:
            return False

        # --- frequency+length-based renaming ---
        renaming_map = computeFreqLenRenaming(iBuilder_clear.tokens,
                                              name_candidates,
                                              name_positions)
        for (name, def_scope), renaming in renaming_map.iteritems():
            nc.append((strategy + '.freqlen', def_scope, renaming, name,
                       ','.join(name_candidates[(name, def_scope)])))
        freqlen_translation = rename(iBuilder_clear.tokens, renaming_map)
        writeTmpLines(freqlen_translation, f[:-3] + '.freqlen.js')
        ok = clear.run(f[:-3] + '.freqlen.js',
                       os.path.join(output_path,
                                    '%s.%s.freqlen.js' % (base_name, strategy)))
        if not ok:
            return False

    return nc
def semantics(cls, tree_node, symbol_table, check=False):
    """Evaluate a host-expression subtree to a (yes, no) pair of host sets.

    Recursively interprets the grammar nodes <HOST_GRP>, <HOST_EXPR> and
    <HOST_PARENS_CONTD>: '!' negates a group by swapping the pair, sibling
    groups are unioned, and a bare <HOST_EXPR> resolves to a single host
    via cls.interpret_host.

    Parameters:
        tree_node: AST_N_Node (nonterminal) being interpreted.
        symbol_table: lookup used by cls.interpret_host.
        check: when True, assert that the yes/no sets of sibling groups
            are pairwise disjoint (cls.disjoint_sets).

    Returns:
        (yes_hosts, no_hosts) — two sets; empty pair for an empty
        <HOST_PARENS_CONTD>.
    """
    if isinstance(tree_node, AST_N_Node):
        if tree_node.n == '<HOST_GRP>':
            assert tree_node.data
            # if TT_EXCLMARK == tree_node.data[0][1]:
            # Negation: '!' <HOST_GRP> — swap the yes/no sets.
            if len(tree_node.data) == 2\
                    and isinstance(tree_node.data[0], AST_T_Node)\
                    and token.is_token_subtype(tree_node.data[0].data[1], TT_EXCLMARK):  # invert
                _yes, _no = cls.semantics(tree_node.data[1], symbol_table,
                                          check=check)
                return _no, _yes
            # Grouped form: union a group with its parenthesized continuation.
            elif len(tree_node.data) >= 3\
                    and tree_node.data[1].n == '<HOST_GRP>'\
                    and tree_node.data[2].n == '<HOST_PARENS_CONTD>':
                _gyes, _gno = cls.semantics(tree_node.data[1], symbol_table,
                                            check=check)
                _pyes, _pno = cls.semantics(tree_node.data[2], symbol_table,
                                            check=check)
                if check:
                    assert cls.disjoint_sets(_gyes, _gno, _pyes, _pno)
                return _gyes | _pyes, _gno | _pno
            # Single expression: delegate to <HOST_EXPR>.
            elif len(tree_node.data) == 1 and tree_node.data[0].n == '<HOST_EXPR>':
                return cls.semantics(tree_node.data[0], symbol_table,
                                     check=check)
            else:
                # grammar guarantees one of the three productions above
                assert False
        elif tree_node.n == '<HOST_EXPR>':
            assert tree_node.data
            assert len(tree_node.data) == 1
            # a single expression contributes exactly one positive host
            _host = cls.interpret_host(tree_node.data[0], symbol_table)
            return {_host}, set()
        elif tree_node.n == '<HOST_PARENS_CONTD>':
            if tree_node.data:
                assert tree_node.data[1].n == '<HOST_GRP>'
                _gyes, _gno = cls.semantics(tree_node.data[1], symbol_table,
                                            check=check)
                # optional further continuation to fold in
                if len(tree_node.data) == 3:
                    _pyes, _pno = cls.semantics(tree_node.data[2], symbol_table,
                                                check=check)
                    if check:
                        assert cls.disjoint_sets(_gyes, _gno, _pyes, _pno)
                    _gyes |= _pyes
                    _gno |= _pno
                return _gyes, _gno
    else:
        # terminal nodes are never interpreted directly here
        assert False
    # empty <HOST_PARENS_CONTD> (epsilon production): contributes nothing
    return set(), set()
def parse(self, tokens, ignore_ws=False):
    """Parse a token stream with table-driven LL(1) predictive expansion.

    Parameters:
        tokens: sequence of (value, type) lexed token pairs (the token
            type lives at index 1 — see _match_T). The input sequence is
            not mutated.
        ignore_ws: when True, whitespace tokens are filtered out before
            parsing.

    Returns:
        The root AST_N_Node of the constructed parse tree.

    Raises:
        SyntaxError via self.SyntaxError on a match/predict failure,
        or on stream overflow/underflow; Exception on unknown node kinds.
    """
    if ignore_ws:
        # Tokens are (value, type) pairs, so the type at index 1 must be
        # the first argument of is_token_subtype (same convention as
        # _match_T). The previous call had the arguments swapped, which
        # degenerated into a tuple-membership test rather than a
        # token-subtype test and missed Whitespace subtypes.
        tokens = [
            _t for _t in tokens
            if not token.is_token_subtype(_t[1], token.Token.Text.Whitespace)
        ]
    else:
        # Copy so appending the end-marker below cannot mutate the
        # caller's list (the original extend() modified it in place).
        tokens = list(tokens)
    tokens.append(self.__DOLLAR)
    t_it = iter(tokens)
    t = next(t_it)
    # Artificial parent holding the start symbol plus the end-marker.
    _root_parent = AST_N_Node('<$>', parent=AST_Node.TN_NO_PARENT)
    ast_root = AST_N_Node(self.S, parent=_root_parent)
    _root_parent.data = [ast_root, self.__DOLLAR]
    top_node = None
    # Walk the (growing) tree left-to-right; nonterminal nodes are
    # expanded in place, so iteration visits newly predicted children.
    for top_node in ast_root:
        try:
            if top_node is self.__DOLLAR or t is self.__DOLLAR:
                break
            elif isinstance(top_node, AST_T_Node):
                # Terminal -> Match
                term = top_node.data
                if self._match_T(term, t):
                    # replace the grammar symbol with the concrete token
                    top_node.data = t
                else:
                    self.SyntaxError(node_root=ast_root,
                                     node_ptr=top_node,
                                     tokens=tokens,
                                     got=t)
                t = next(t_it)
            elif isinstance(top_node, AST_N_Node):
                # Nonterminal -> Predict/Expand
                nonterm = top_node.n
                rule = self._match_N(nonterm, t)
                if not rule:
                    self.SyntaxError(node_root=ast_root,
                                     node_ptr=top_node,
                                     tokens=tokens,
                                     got=t)
                # copy the production so expansion cannot alias the grammar
                production = rule[1][:]
                if production:
                    top_node.expand_node(production, lambda p: p in self.T)
                else:
                    # epsilon production
                    top_node.expand_node([])
            else:
                raise Exception('Lexical Error: Unknown Lexem: {0}'.format(
                    type(top_node)))
        except StopIteration:
            break
    if top_node is self.__DOLLAR and t is self.__DOLLAR:
        # both tree walk and token stream ended together: accepted
        pass
    elif top_node is self.__DOLLAR:
        # Overflow: tokens left over after the tree was fully matched
        self.SyntaxError(node_root=ast_root,
                         node_ptr=top_node,
                         tokens=tokens,
                         got=t)
    elif t is self.__DOLLAR:
        # Underflow: tree expects more input than was provided
        self.SyntaxError(node_root=ast_root,
                         node_ptr=top_node,
                         tokens=tokens,
                         got=t)
    else:
        raise Exception('Unknown Error: It: {}, Node: {}'.format(
            t, str(top_node)))
    # NOTE(review): debug dump of every node — kept to preserve existing
    # output behavior; consider routing through logging instead.
    _nodes = ast_root.get_nodes(depth=None)
    for _n in _nodes:
        print(str(_n.n if isinstance(_n, AST_N_Node) else _n.data))
    return ast_root
def _match_T(self, s_top_T, _token):
    """Check a lexed token against the expected terminal symbol.

    ``_token`` is a (value, type) pair; only its type (index 1) is
    compared, via pygments' token-subtype relation, against ``s_top_T``.
    """
    _token_type = _token[1]
    return token.is_token_subtype(_token_type, s_top_T)