def tokenize(path):
    """Return a newline-joined string of normalized identifiers from *path*.

    Each identifier token is lowercased and stripped of underscores so that
    naming-style differences (snake_case vs camelCase) do not matter to
    downstream comparison.

    Args:
        path: file path handed to the project's Tokenizer.

    Returns:
        One normalized identifier per line, as a single string.
    """
    tokenizer = Tokenizer(path)
    idents = []
    for token in tokenizer.raw_tokenize():
        if token.kind.name == "IDENTIFIER":
            # str.replace is the idiomatic (and cheaper) way to delete a
            # fixed substring; the original used re.sub for no reason.
            idents.append(token.spelling.lower().replace("_", ""))
    return "\n".join(idents)
def tokenize(path):
    """Return a newline-joined summary of literal and comment tokens in *path*.

    String literals and comments are reduced to a short digest (first 10 hex
    chars of a SHA-256 over the mangled text); all other literals keep their
    exact spelling.
    """
    def _digest(text):
        # Short, stable fingerprint of the (mangled) token text.
        return sha256(mangle_text(text)).hexdigest()[:10]

    items = []
    for token in Tokenizer(path).raw_tokenize():
        kind_name = token.kind.name
        if kind_name == "LITERAL":
            if token.cursor.kind == clang.cindex.CursorKind.STRING_LITERAL:
                # Strings get extra processing: store a hash, not raw text.
                items.append(_digest(token.spelling))
            else:
                items.append(token.spelling)
        elif kind_name == "COMMENT":
            # Skip the two-character comment leader before hashing.
            items.append(_digest(token.spelling[2:]))
    return "\n".join(items)
def genStats(path, helpers):
    """Collect simple per-file statistics from the tokens and raw text of *path*.

    Args:
        path: file to analyze (read both via Tokenizer and io.readFile).
        helpers: unused here; kept for interface compatibility with callers.

    Returns:
        Dict with: numLines, numWhitespace, numComments, avgIdentLength,
        numFunctions (identifier followed by "(" — declarations and calls),
        numDefines, numMathOps, numReturns, lenLongestLine.
    """
    tokens = Tokenizer(path).raw_tokenize()
    text = io.readFile(path)

    # Set membership is O(1) and hoisted out of the token loop.
    mathOps = {"+", "-", "*", "/", "|", "&", "+=", "-=", "*=", "/=",
               ">>=", "<<=", "++", "--", "~", ">>", "!"}

    numComments = 0
    numFunctions = 0
    numMathOps = 0
    numReturns = 0
    idents = []
    lastWasIdent = False

    for token in tokens:
        if token.kind.name == "COMMENT":
            numComments += 1
        if token.spelling in mathOps:
            numMathOps += 1
        # An identifier immediately followed by "(" counts as a function
        # declaration or call.
        if lastWasIdent and token.spelling == "(":
            numFunctions += 1
        if token.spelling == "return":
            numReturns += 1
        if token.kind.name == "IDENTIFIER":
            idents.append(token.spelling)
            lastWasIdent = True
        else:
            lastWasIdent = False

    # BUG FIX: the original assigned "avgIdentLenth" (typo), so the reported
    # average identifier length was always 0.
    avgIdentLength = 0.0
    if idents:
        avgIdentLength = sum(len(ident) for ident in idents) / float(len(idents))

    # Raw string avoids the invalid "\s" escape warning of the original.
    numDefines = len(re.findall(r"#\s*define ", text.lower()))

    lines = text.split("\n")
    if len(lines) == 1:
        # Fall back to classic Mac/old-Windows "\r" line endings.
        lines = text.split("\r")
    numLines = len(lines)

    # text.split always yields at least one element, but default=0 keeps
    # this safe for an empty sequence anyway.
    lenLongestLine = max((len(line) for line in lines), default=0)

    numWhitespace = sum(1 for char in text if char in (" ", "\n", "\t", "\r"))

    return {
        "numLines": numLines,
        "numWhitespace": numWhitespace,
        "numComments": numComments,
        "avgIdentLength": avgIdentLength,
        "numFunctions": numFunctions,
        "numDefines": numDefines,
        "numMathOps": numMathOps,
        "numReturns": numReturns,
        "lenLongestLine": lenLongestLine,
    }
def genStats(path, helpers):
    """Compute basic statistics over a source file's tokens and raw text.

    Args:
        path: file to analyze (read via Tokenizer and io.readFile).
        helpers: unused; retained so the caller-facing signature is unchanged.

    Returns:
        Dict of counters: numLines, numWhitespace, numComments,
        avgIdentLength, numFunctions (identifier directly followed by "("),
        numDefines, numMathOps, numReturns, lenLongestLine.
    """
    tokenStream = Tokenizer(path).raw_tokenize()
    text = io.readFile(path)

    # Hoisted out of the loop; set gives O(1) membership tests.
    MATH_OPS = {"+", "-", "*", "/", "|", "&", "+=", "-=", "*=", "/=",
                ">>=", "<<=", "++", "--", "~", ">>", "!"}

    numComments = 0
    numFunctions = 0
    numMathOps = 0
    numReturns = 0
    identifiers = []
    prevWasIdent = False

    for token in tokenStream:
        spelling = token.spelling
        kindName = token.kind.name
        if kindName == "COMMENT":
            numComments += 1
        if spelling in MATH_OPS:
            numMathOps += 1
        # Identifier followed by "(" => function declaration or call.
        if prevWasIdent and spelling == "(":
            numFunctions += 1
        if spelling == "return":
            numReturns += 1
        if kindName == "IDENTIFIER":
            identifiers.append(spelling)
            prevWasIdent = True
        else:
            prevWasIdent = False

    # BUG FIX: original wrote the result to "avgIdentLenth" (misspelled),
    # leaving the returned average permanently at 0.
    avgIdentLength = 0.0
    if identifiers:
        avgIdentLength = sum(len(name) for name in identifiers) / float(len(identifiers))

    # Raw string prevents the "\s" invalid-escape warning.
    numDefines = len(re.findall(r"#\s*define ", text.lower()))

    lines = text.split("\n")
    if len(lines) == 1:
        # Only "\r" endings present — re-split on those instead.
        lines = text.split("\r")
    numLines = len(lines)

    lenLongestLine = max((len(line) for line in lines), default=0)

    numWhitespace = sum(1 for char in text if char in (" ", "\n", "\t", "\r"))

    return {
        "numLines": numLines,
        "numWhitespace": numWhitespace,
        "numComments": numComments,
        "avgIdentLength": avgIdentLength,
        "numFunctions": numFunctions,
        "numDefines": numDefines,
        "numMathOps": numMathOps,
        "numReturns": numReturns,
        "lenLongestLine": lenLongestLine,
    }