def bracket_strings(start, end, b_brack, e_brack):
    #Find the most common bracketed prefix (e.g. "[modname]") among the strings
    #referenced in the range start..end, returning (token, count)
    sep = "tzvlw"  #arbitrary separator unlikely to appear in the binary's strings
    t = basicutils.CompileTextFromRange(start, end, sep)
    tokens = [tk.lower() for tk in t.split(sep)]
    b = []
    for tk in tokens:
        tk = tk.strip()
        if tk.startswith(b_brack):
            b_contents = tk[1:tk.find(e_brack)]
            #Hack to get rid of [-], [+], [*] - could also try to remove non-alpha
            if len(b_contents) > 3:
                #Hack for debug prints that started with [0x%x]
                if b_contents != "0x%x":
                    b.append(b_contents)
    print "bracket_strings tokens:"
    print tokens
    print b
    u_gram = ""
    u_gram_score = 0
    if len(b) > 0:
        f = nltk.FreqDist(b)
        u_gram = f.most_common(1)[0][0]
        u_gram_score = f.most_common(1)[0][1]
    return (u_gram, u_gram_score)
def string_range_tokenize(start, end, sep):
    #Tokenize all string references in the range start..end into a filtered,
    #lowercased word list suitable for frequency analysis
    # get all string references in this range concatenated into a single string
    t = basicutils.CompileTextFromRange(start, end, sep)
    #Enable this if you already have a bunch of function names and want to include them in the mix
    #t += basicutils.CompileFuncNamesFromRangeAsText(start, end, sep)
    #print "string_range_tokenize: raw text:"
    #print t
    #remove printf/sprintf format strings
    tc = re.sub(r"%[0-9A-Za-z]+", " ", t)
    #convert dash to underscore
    tc = re.sub("-", "_", tc)
    #replace _ with space - may want to turn this off sometimes
    #this will break up snake case (and the next sub breaks up paths)
    #the problem is that a path used throughout the binary will probably dominate the results
    tc = re.sub("_", " ", tc)
    #replace / and \ with a space
    tc = re.sub(r"[/\\]", " ", tc)
    #remove anything except alphanumerics, spaces, . (for .c, .cpp, etc.) and _
    tc = re.sub(r"[^A-Za-z0-9_.\s]", " ", tc)
    #lowercase it - and store this as the original set of tokens to work with
    tokens = [tk.lower() for tk in tc.split()]
    #English stop words - this is the list from the MIT *bow project
    eng_stopw = {"about", "all", "am", "an", "and", "are", "as", "at", "be", "been", "but", "by",
                 "can", "cannot", "did", "do", "does", "doing", "done", "for", "from", "had", "has",
                 "have", "having", "if", "in", "is", "it", "its", "of", "on", "that", "the", "these",
                 "they", "this", "those", "to", "too", "want", "wants", "was", "what", "which",
                 "will", "with", "would"}
    #"code" stop words - e.g. common words in debugging strings
    code_sw = {"error", "err", "errlog", "log", "return", "returned", "byte", "bytes", "status",
               "len", "length", "size", "ok", "0x", "warning", "fail", "failed", "failure",
               "invalid", "illegal", "param", "parameter", "done", "complete", "assert",
               "assertion", "cant", "didnt", "class", "foundation", "cdecl", "stdcall", "thiscall"}
    stopw = eng_stopw.union(code_sw)
    #remove English and "code" stop words
    tokens_f = [tk for tk in tokens if tk not in stopw]
    return tokens_f
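# Illustrative sketch only (not part of the original code): one way the token list
# from string_range_tokenize() could be reduced to a single candidate name, mirroring
# the nltk.FreqDist scoring used in bracket_strings() and source_file_strings().
# The function name and the unigram-only scoring are assumptions for the example.
def most_common_token(start, end, sep="tzvlw"):
    #Return the most frequent filtered token in the range as (token, count)
    tokens = string_range_tokenize(start, end, sep)
    if len(tokens) == 0:
        return ("", 0)
    f = nltk.FreqDist(tokens)
    return f.most_common(1)[0]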
def source_file_strings(start, end):
    #Find the most common source file name (*.c, *.cpp, *.cc) among the strings
    #referenced in the range start..end, returning (filename, count)
    sep = "tzvlw"  #arbitrary separator unlikely to appear in the binary's strings
    t = basicutils.CompileTextFromRange(start, end, sep)
    #normally we would lowercase here to normalize, but we would lose camel case that way
    tokens = [tk for tk in t.split(sep)]
    #for each string, remove quotes and commas, then tokenize on spaces to generate the final list
    tokens2 = []
    for tk in tokens:
        tk = tk.strip()
        #strip punctuation, but leave in _ for filenames and / and \ for paths
        tk = re.sub("[\"',]", " ", tk)
        for tk2 in tk.split(" "):
            tokens2.append(tk2)
    b = []
    for tk in tokens2:
        tk = tk.strip()
        if tk.endswith(".c") or tk.endswith(".cpp") or tk.endswith(".cc"):
            #If there's a dir path, only use the trailing filename
            #This could be tweaked if the directory structure is part of the software architecture,
            #e.g. if there are multiple source directories with meaningful names
            if tk.rfind("/") != -1:
                ntk = tk[tk.rfind("/") + 1:]
            elif tk.rfind("\\") != -1:
                ntk = tk[tk.rfind("\\") + 1:]
            else:
                ntk = tk
            b.append(ntk)
    print "source_file_strings tokens:"
    #print tokens
    print b
    #a better way to handle multiple file names would be to sort, uniquify,
    #and then build a name like foo.c_and_bar.c
    u_gram = ""
    u_gram_score = 0
    if len(b) > 0:
        f = nltk.FreqDist(b)
        u_gram = f.most_common(1)[0][0]
        u_gram_score = f.most_common(1)[0][1]
    return (u_gram, u_gram_score)
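# Hypothetical usage sketch (assumes basicutils/nltk are available and that the
# range boundaries have already been chosen elsewhere): prefer a bracketed log tag
# such as "[modname]" for naming a range, and fall back to the most common source
# file name when no bracketed token is found. guess_module_name() and the
# fall-back policy are assumptions, not part of the original code.
def guess_module_name(start, end):
    name, score = bracket_strings(start, end, "[", "]")
    if score == 0:
        name, score = source_file_strings(start, end)
    return (name, score)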