def query_node_type():
    """Group the statement types of every function by source line.

    Connects to the Joern graph database at the hard-coded local URL, fetches
    all ``Function`` nodes and, for each of them, builds ``line_dict``: a
    mapping from line number (as a string) to the space-separated types of the
    statements located on that line.  The mapping is handed to
    ``cc.AST_type_clean`` and the base name of the file owning the function is
    looked up.

    NOTE(review): ``clean_type`` and ``file_name`` are computed but never used
    in the code visible here -- confirm whether the function is unfinished.
    """
    step = JoernSteps()
    step.setGraphDbURL('http://localhost:7474/db/data/')
    step.connectToDatabase()

    # get all functions in the database
    functions = step.runGremlinQuery("""getNodesWithType('Function')""")
    for function in functions:
        # for one function, collect the node types found on every line
        line_dict = dict()
        # presumably drops a 5-character prefix from the node reference to
        # obtain the numeric id -- verify against the JoernSteps result type
        function_node_id = int(function.ref[5:])

        # get the (type, location) pairs of the function's statements
        query = """queryNodeIndex("functionId:%i").as("x").statements().as("y").select{it.type}{it.location}""" % function_node_id
        function_nodes = step.runGremlinQuery(query)
        for node in function_nodes:
            node_type = str(node[0])
            location = str(node[1])
            if location == 'None':
                # statements without a location cannot be attributed to a line
                continue
            # the line number is the first ':'-separated field of the location
            loc = location.split(':')[0]
            if loc in line_dict:
                line_dict[loc] = line_dict[loc] + ' ' + node_type
            else:
                line_dict[loc] = node_type
        clean_type = cc.AST_type_clean(line_dict, True)

        # do another query to know which file this function belongs to
        query = """g.v(%d).in("IS_FILE_OF").filter{it.type=="File"}.filepath""" % function_node_id
        file_path = step.runGremlinQuery(query)
        file_name = str(file_path[0]).split('/')[-1]
class DBContentsProvider:
    """Small facade over a ``JoernSteps`` connection with the 'steps/' dir loaded."""

    def __init__(self):
        self._initDatabaseConnection()

    def _initDatabaseConnection(self):
        """Create the JoernSteps handle, connect it, then register 'steps/'."""
        self.j = JoernSteps()
        self.j.connectToDatabase()
        self.j.addStepsDir('steps/')

    def RunGremlinQuery(self, query):
        """Run ``query`` on the connection and return whatever it yields."""
        return self.j.runGremlinQuery(query)

    def GetCalleesInfo(self):
        """Return the result of the ``getCalleeListInfo()`` step."""
        return self.j.runGremlinQuery("getCalleeListInfo()")

    def generate(self, selector):
        """
        Generate contents for a given selector, overwriting the contents
        currently held in cndToQueries memory by the server.
        """
        query = """generateTaintLearnStructures(%s.id.toList()) _()""" % (selector)
        # The result is only iterated to exhaustion; its items are discarded.
        for _ in self.j.runGremlinQuery(query):
            pass
def runQueryChunk():
    """Annotate source files with per-line statement types, chunk by chunk.

    Fetches the ids of all ``Function`` nodes, processes them in chunks of
    ``CHUNK_SIZE`` and, for every function in a chunk, builds a mapping from
    line number (as a string) to the space-separated statement types on that
    line.  The mapping is cleaned with ``getCleanText`` and passed, together
    with the base name of the owning file, to ``addInfoToSourceFile``.

    A running counter is printed after every chunk.
    """
    j = JoernSteps()
    j.setGraphDbURL('http://localhost:7474/db/data/')
    j.connectToDatabase()

    function_ids = j.runGremlinQuery("""getNodesWithType('Function').id""")
    flag = 1  # progress counter; starts at 1 and is printed after each chunk
    CHUNK_SIZE = 51
    for chunk in j.chunks(function_ids, CHUNK_SIZE):
        # Render the ids as "(id1 id2 ...)" for the index query: the tuple
        # repr with commas and quotes removed.
        functionIdStr = str(tuple(chunk)).replace(',', '').replace('\'', '')

        # (type, location, functionId) of every statement in the chunk
        query = """queryNodeIndex("functionId:%s").as("x").statements().as("y").as("z").select{it.type}{it.location}{it.functionId}""" % functionIdStr
        stms = j.runGremlinQuery(query)

        # (function id, file path) pairs for the functions in the chunk
        query = """idListToNodes(%s).as("x").in("IS_FILE_OF").filepath.as("y").select{it.id}{it}""" % chunk
        stmsFiles = j.runGremlinQuery(query)

        # function id -> base name of the file containing it
        files = dict()
        for stmsFile in stmsFiles:
            files[int(stmsFile[0])] = str(stmsFile[1]).split('/')[-1]

        # function id -> list of [location, type] pairs
        codes = dict()
        for stm in stms:
            codes.setdefault(int(stm[2]), []).append([stm[1], str(stm[0])])

        for function_id, statements in codes.items():
            lineDict = dict()
            for raw_location, node_type in statements:
                location = str(raw_location)
                if location == u'None':
                    # no location: the statement cannot be tied to a line
                    continue
                loc = location.split(':')[0]
                if loc in lineDict:
                    lineDict[loc] = lineDict[loc] + ' ' + node_type
                else:
                    lineDict[loc] = node_type
            text = getCleanText(lineDict, False)
            fileName = files.get(function_id)
            addInfoToSourceFile(text, fileName)
        flag += 1
        print(flag)
def query_node_type_chunk():
    """Chunked variant of the per-line statement-type collection.

    Fetches the ids of all ``Function`` nodes and processes them in chunks of
    ``CHUNK_SIZE``.  For each chunk it resolves the file of every function and
    the (type, location, functionId) triple of every statement, then builds,
    per function, a mapping from line number (as a string) to the
    space-separated statement types on that line and hands it to
    ``cc.AST_type_clean``.

    NOTE(review): ``flag``, ``clean_type`` and ``fileName`` are assigned but
    never used in the code visible here -- confirm whether the function is
    unfinished.
    """
    step = JoernSteps()
    step.setGraphDbURL('http://localhost:7474/db/data/')
    step.connectToDatabase()

    # get function ids
    function_ids = step.runGremlinQuery("""getNodesWithType('Function').id""")
    flag = 1
    CHUNK_SIZE = 51
    for chunk in step.chunks(function_ids, CHUNK_SIZE):
        # "(id1 id2 ...)": the tuple repr with commas and quotes removed
        function_id_str = str(tuple(chunk)).replace(',', '').replace('\'', '')

        # to know which files these functions belong to
        query = """idListToNodes(%s).as("x").in("IS_FILE_OF").filepath.as("y").select{it.id}{it}""" % chunk
        stms_files = step.runGremlinQuery(query)
        files = dict()
        for stms_file in stms_files:
            files[int(stms_file[0])] = str(stms_file[1]).split('/')[-1]

        query = """queryNodeIndex("functionId:%s").as("x").statements().as("y").as("z").select{it.type}{it.location}{it.functionId}""" % function_id_str
        stms = step.runGremlinQuery(query)

        # function id -> list of [location, type] pairs
        codes = dict()
        for stm in stms:
            codes.setdefault(int(stm[2]), []).append([stm[1], str(stm[0])])

        for function_id, statements in codes.items():
            line_dict = dict()
            for raw_location, node_type in statements:
                location = str(raw_location)
                if location == u'None':
                    # no location: the statement cannot be tied to a line
                    continue
                loc = location.split(':')[0]
                if loc in line_dict:
                    line_dict[loc] = line_dict[loc] + ' ' + node_type
                else:
                    line_dict[loc] = node_type
            clean_type = cc.AST_type_clean(line_dict, True)
            fileName = files.get(function_id)
class DBInterface:
    """Minimal pass-through wrapper around a ``JoernSteps`` instance."""

    def connectToDatabase(self):
        """Create the JoernSteps handle, add the tools step dir and connect."""
        self.j = JoernSteps()
        self.j.addStepsDir(JOERN_TOOLS_STEPDIR)
        self.j.connectToDatabase()

    def runGremlinQuery(self, query):
        """Forward ``query`` to the underlying connection and return its result."""
        result = self.j.runGremlinQuery(query)
        return result

    def chunks(self, ids, chunkSize):
        """Forward to ``JoernSteps.chunks`` with the same arguments."""
        chunked = self.j.chunks(ids, chunkSize)
        return chunked
class DBContentsProvider:
    """Owns a ``JoernSteps`` connection that has the 'steps/' directory loaded."""

    def __init__(self):
        self.j = JoernSteps()
        self.init_database_connection()

    def init_database_connection(self):
        """Connect the already-created handle, then register 'steps/'."""
        connection = self.j
        connection.connectToDatabase()
        connection.addStepsDir('steps/')

    def run_gremlin_query(self, query_script):
        """Run ``query_script`` on the connection and return its result."""
        return self.j.runGremlinQuery(query_script)
def runQuery():
    """Annotate source files with per-line statement types, one function at a time.

    For every ``Function`` node in the database this builds a mapping from
    line number (as a string) to the space-separated types of the statements
    on that line, cleans it with ``getCleanText`` and passes the text plus the
    base name of the owning file to ``addInfoToSourceFile``.

    A running counter is printed after every function.
    """
    j = JoernSteps()
    j.setGraphDbURL('http://localhost:7474/db/data/')
    j.connectToDatabase()

    functions = j.runGremlinQuery("""getNodesWithType('Function')""")
    flag = 1  # progress counter; starts at 1 and is printed after each function
    for function in functions:
        lineDict = dict()
        # presumably drops a 5-character prefix from the node reference to
        # obtain the numeric id -- verify against the JoernSteps result type
        functionnodeid = int(function.ref[5:])

        # (type, location) pairs of the function's statements
        query = """queryNodeIndex("functionId:%i").as("x").statements().as("y").select{it.type}{it.location}""" % functionnodeid
        allNodesOfFunction = j.runGremlinQuery(query)
        for node in allNodesOfFunction:
            node_type = str(node[0])
            location = str(node[1])
            if location == 'None':
                # no location: the statement cannot be tied to a line
                continue
            loc = location.split(':')[0]
            if loc in lineDict:
                lineDict[loc] = lineDict[loc] + ' ' + node_type
            else:
                lineDict[loc] = node_type
        text = getCleanText(lineDict, False)

        # look up the file the function belongs to
        query = """g.v(%d).in("IS_FILE_OF").filter{it.type=="File"}.filepath""" % functionnodeid
        filepath = j.runGremlinQuery(query)
        fileName = str(filepath[0]).split('/')[-1]
        addInfoToSourceFile(text, fileName)
        flag += 1
        print(flag)
def getFunctionSimilarity():
    """Compute pairwise function-name similarity and record the strong pairs.

    Function names are fetched from the Joern database, stripped of their
    namespace, de-duplicated, and names starting with ``"operator "`` are
    dropped.  Every ordered pair of distinct names is scored with
    ``computeSim``; pairs scoring above 0.5 are written to the CSV file named
    by ``my_constant.FUNC_SIMILAIRTY_FILE_NAME`` and collected in the result.

    Returns:
        dict mapping ``(func_a, func_b)`` to the similarity of that pair, for
        pairs with similarity > 0.5.
    """
    func_similarity_dic = {}

    # The with-block guarantees the CSV file is closed even when the database
    # or computeSim raises.  'wb' is the mode the Python 2 csv module expects.
    with open(my_constant.FUNC_SIMILAIRTY_FILE_NAME, 'wb') as analysis:
        analyze_writer = csv.writer(analysis)
        analyze_writer.writerow(['func_a', 'func_b', 'similarity'])

        # initialize python-joern instance and connect to the database
        joern_instance = JoernSteps()
        joern_instance.addStepsDir("/data/joern-code/query/")
        joern_instance.setGraphDbURL("http://localhost:7474/db/data/")
        joern_instance.connectToDatabase()

        # fetch all function names; the names are the first item of the result
        functions_temp = joern_instance.runGremlinQuery('_().getFunctions()')[0]

        # Filter out operator overloads and duplicates.  Each kept name is
        # wrapped in a one-element list, the form passed to computeSim below.
        functions = []
        seen = set()  # O(1) duplicate check instead of scanning `functions`
        for function in functions_temp:
            # remove namespace before ::
            function = my_util.removeNamespace(function)
            if function == '' or function.startswith("operator "):
                continue
            if function in seen:
                continue
            seen.add(function)
            functions.append([function])

        # compute similarity for every ordered pair and write back into file
        word_list_dict = {}
        for i, func_a in enumerate(functions):
            for j, func_b in enumerate(functions):
                if i == j:
                    continue
                similarity, word_list_dict = computeSim(
                    func_a, func_b, word_list_dict)
                if similarity > 0.5:
                    analyze_writer.writerow([func_a[0], func_b[0], similarity])
                    func_similarity_dic[(func_a[0], func_b[0])] = similarity

    return func_similarity_dic
def createdb(coverage_db, json_dbname, joern_url='http://localhost:7474/db/data/'):
    """Combine coverage information with joern queries and create a json db.

    Args:
        coverage_db: path of the sqlite3 coverage database; the connection is
            stored in the module-level ``conn``.
        json_dbname: path of the JSON file to write.
        joern_url: URL of the Joern graph database; the connection is stored
            in the module-level ``j``.

    Every ``IfStatement`` and ``SwitchStatement`` node id is passed to
    ``get_conditional_info``; non-empty results are grouped by their
    ``"filename"`` entry, ordered by file name and then by ``"line"``, and
    dumped to ``json_dbname`` as one flat list.
    """
    global j, conn
    from joern.all import JoernSteps

    j = JoernSteps()
    j.setGraphDbURL(joern_url)
    j.connectToDatabase()

    conditionals = {}  # filename is key
    if_ids = j.runGremlinQuery('queryNodeIndex("type:IfStatement").id')
    print("Total number of IfStatements:%d" % len(if_ids))
    switch_ids = j.runGremlinQuery('queryNodeIndex("type:SwitchStatement").id')
    print("Total number of SwitchStatement:%d" % len(switch_ids))
    if_ids += switch_ids

    conn = sqlite3.connect(coverage_db)

    idx = 0  # number of conditionals for which info was found so far
    for node_id in if_ids:
        # iterate over each conditional and gather branch info
        conditional = get_conditional_info(node_id, idx)
        if conditional == {}:
            continue
        idx += 1
        sys.stdout.write("Processing conditional %d out of %d total.\r" % (idx, len(if_ids)))
        sys.stdout.flush()
        # group by file name
        conditionals.setdefault(conditional["filename"], []).append(conditional)

    # Now sort by file name and line number.  Iterating the dict directly
    # would visit the files in arbitrary order, so the keys are sorted.
    sorted_conditionals = []
    for filename in sorted(conditionals):
        conditionals[filename].sort(key=lambda c: c["line"])
        sorted_conditionals += conditionals[filename]

    # save as json; the with-block makes sure the output is flushed and closed
    with open(json_dbname, "wb") as json_file:
        json.dump(sorted_conditionals, json_file)
    print("\nDone!")
class DBInterface(object):
    """Provides database connection"""

    DATABASE_URL = "http://localhost:7474/db/data/"

    def __init__(self):
        self.connection = None

    def __getConnection(self):
        """Instantiate ``JoernSteps``; return True on success, False on error."""
        print("[+] Creating connection.")
        try:
            self.connection = JoernSteps()
        except Exception as e:
            print("[Error] Cannot instantiate Python-Joern database interface, DBInterface says: {}".format(
                e.args))
            return False
        else:
            return True

    def connectToDB(self):
        """Create the handle and connect it to ``DATABASE_URL``.

        Returns True when both steps succeed; errors are printed and reported
        as False instead of being raised.
        """
        created = self.__getConnection()
        if not created:
            return False

        print("[+] Connecting to the database.")
        self.connection.setGraphDbURL(DBInterface.DATABASE_URL)
        connected = True
        try:
            self.connection.connectToDatabase()
        except Exception as e:
            print("[Error] Cannot connect to the database, DBInterface says: {}".format(
                e.args))
            connected = False
        return connected

    def runQuery(self, code):
        """Run the Gremlin ``code``; return its result, or None on any error."""
        try:
            return self.connection.runGremlinQuery(code)
        except Exception as e:
            print("[Error] Error occured during query execution, DBInterface says: {}".format(
                e.args))
            return None
class DBContentsProvider:
    """Owns a ``JoernSteps`` connection and exposes user-level queries on it."""

    def __init__(self):
        self.j = JoernSteps()
        self.init_database_connection()

    def init_database_connection(self):
        """Connect the already-created handle, then register 'steps/'."""
        connection = self.j
        connection.connectToDatabase()
        connection.addStepsDir('steps/')

    def run_gremlin_query(self, query_script):
        """Run ``query_script`` on the connection and return its result."""
        return self.j.runGremlinQuery(query_script)

    # The following are the user's database query procedures.

    # Purpose: query the names of all called functions from the database.
    def query_allCallee_name(self):
        """Return the de-duplicated ``code`` of every node of type 'Callee'."""
        query = """ g.V.has('type','Callee').as('x').code.dedup().back('x').code.toList() """
        return self.run_gremlin_query(query)
def produce_file_function_location_triads(file):
    """Return ``(file base name, function name, start index)`` triads.

    One triad is produced per ``Function`` node in the database.  The start
    index is the first ':'-separated field of the node's ``location``
    property minus one, rendered as a string.
    """
    j = JoernSteps()
    j.setGraphDbURL('http://localhost:7474/db/data/')
    j.connectToDatabase()

    # First pass: pull (name, start index) out of every function node.
    names_and_starts = []
    for root_node in j.runGremlinQuery('queryNodeIndex("type:Function")'):
        first_line = root_node.properties['location'].split(":")[0]
        start_index = str(int(first_line) - 1)
        names_and_starts.append((root_node.properties['name'], start_index))

    # Second pass: prefix each pair with the base name of `file`.
    return [(file.split("/")[-1], name, start)
            for name, start in names_and_starts]
print "level,", level while len(queue) != 0: cur = queue[0] if cur._Node__id not in visited: visited.add(cur._id) if cur._id > -1: print cur.properties query = 'g.v(' + str(cur._id) + ').code' print j.runGremlinQuery(query) query = 'g.v(' + str(cur._id) + ').out' #print query s = j.runGremlinQuery(query) print len(s) neighbor_vs = [] for one in s: neighbor_vs.append(one._id) print neighbor_vs # print s temp_queue.extend(s) del queue[0] queue = temp_queue temp_queue = [] level = level + 1 print "Depth, ", level if __name__ == '__main__': main_nodes = j.runGremlinQuery('getFunctionsByName("main")') bfs(main_nodes[0])
return 0 ptr = "" for filename in glob.glob('*.txt'): #print filename with open(filename, 'r') as f: my_list = [line.rstrip('\n') for line in f] #print my_list ptr = filename.split("_")[1].split('.txt')[0] newfilename = filename.split('.txt')[0]+'.c' S_file=open('/home/hongfa/workspace/DependencyGraph/sphinx3_src/'+newfilename,'w+') CompoundStatement_query = """g.v("""+filename.split("_")[0]+""").out().filter{it.type == "FunctionDef"}.ithChildren("0").out().id""" CompoundStatements =j.runGremlinQuery(CompoundStatement_query) #print CompoundStatements duplicate = [] duplicate2 = [] klee_var = '' for S in CompoundStatements: query_sta = """g.v("""+str(S)+""").code""" query_sta_type = """g.v("""+str(S)+""").type""" query_sta_id = """g.v("""+str(S)+""").id""" query_id = """g.v("""+str(S)+""").astNodes().filter{it.type == "Identifier" }.code""" ids = j.runGremlinQuery(query_id) sta = j.runGremlinQuery(query_sta) sta_type = j.runGremlinQuery(query_sta_type) sta_id=j.runGremlinQuery(query_sta_id) lable = 0
class DBContentsProvider:
    """Facade over a ``JoernSteps`` connection with the 'steps/' dir loaded.

    Every ``get*`` method formats one Gremlin query, runs it and returns the
    results materialized as a list.
    """

    def __init__(self):
        self._initDatabaseConnection()

    def _initDatabaseConnection(self):
        """Create the JoernSteps handle, connect it, then register 'steps/'."""
        self.j = JoernSteps()
        self.j.connectToDatabase()
        self.j.addStepsDir('steps/')

    def _runToList(self, query):
        """Run ``query`` and return its results as a list."""
        return list(self.j.runGremlinQuery(query))

    def generate(self, selector):
        """
        Generate contents for a given selector, overwriting the contents
        currently held in cndToQueries memory by the server.
        """
        query = """generateTaintLearnStructures(%s.id.toList()) _()""" % (selector)
        # The result is only iterated to exhaustion; its items are discarded.
        for _ in self.j.runGremlinQuery(query):
            pass

    def generateChecksForInvocations(self, invocs):
        """Run ``generateChecksForInvocations`` for ``invocs``; results are discarded."""
        query = """generateChecksForInvocations(%s.toList()) _()""" % (invocs)
        for _ in self.j.runGremlinQuery(query):
            pass

    # Source Analysis

    def getSourceAPISymbols(self):
        """Return the results of the ``getSourceAPISymbols()`` step as a list."""
        return self._runToList("""_().transform{ getSourceAPISymbols() }.scatter() """)

    def getAllDefStmtsPerArg(self):
        """Return the results of the ``getAllDefStmtsPerArg()`` step as a list."""
        return self._runToList("""_().transform{ getAllDefStmtsPerArg() }.scatter() """)

    # Condition Analysis

    def getAllChecksPerArg(self):
        """Return the results of the ``getAllChecksPerArg()`` step as a list."""
        return self._runToList("""_().transform{ getAllChecksPerArg() }.scatter() """)

    def getAllConditions(self):
        """Return the results of the ``getAllConditions()`` step as a list."""
        return self._runToList("""_().transform{ getAllConditions() }.scatter() """)

    def getAllConditionsCode(self):
        """Return the results of the ``getAllConditionsCode()`` step as a list."""
        return self._runToList("""_().transform{ getAllConditionsCode() }.scatter() """)

    def getInvocationCallSiteIds(self):
        """Return the results of the ``getInvocationCallSites()`` step as a list."""
        return self._runToList("""_().transform{ getInvocationCallSites() }.scatter() """)

    def getSubConditions(self, nodeId):
        """Return the results of ``subConditions(nodeId)`` as a list."""
        query = """_().transform{ subConditions(%s) }.scatter() """ % (nodeId)
        return self._runToList(query)

    def getAllCndFeatureVectors(self, invocs=None, argNum=None):
        """Return condition feature vectors as a list.

        Args:
            invocs: text spliced into the query as the invocation list; when
                empty or omitted, ``getAllCndFeatureVectors`` is queried
                instead of ``getCndFeatureVectorsForInvocs``.
            argNum: optional integer argument index forwarded to the step.
        """
        # `invocs` defaults to None rather than a shared mutable list; any
        # falsy value selects the "all" variant, as before.
        if not invocs:
            if argNum is not None:
                query = """_().transform{ getAllCndFeatureVectors(%d) }.scatter() """ % (
                    argNum)
            else:
                query = """_().transform{ getAllCndFeatureVectors() }.scatter() """
        else:
            if argNum is not None:
                query = """_().transform{ getCndFeatureVectorsForInvocs(%s, %d) }.scatter() """ % (
                    invocs, argNum)
            else:
                query = """_().transform{ getCndFeatureVectorsForInvocs(%s) }.scatter() """ % (
                    invocs)
        return self._runToList(query)

    def getAllASTNodeLabels(self):
        """Return the results of the ``getAllASTNodeLabels()`` step as a list."""
        return self._runToList("""_().transform{ getAllASTNodeLabels() }.scatter() """)

    # Choosing sinks

    def getControlledSinks(self, nodeId):
        """Return the results of ``getControlledSinks(nodeId)`` as a list."""
        query = """_().transform{ getControlledSinks(%s) }.scatter() """ % (
            nodeId)
        return self._runToList(query)
# Connect to the local Joern graph database.
j = JoernSteps()
j.setGraphDbURL('http://localhost:7474/db/data')
j.connectToDatabase()

# Drop the first argv entry so that sys.argv holds only the arguments
# iterated over below.
sys.argv.pop(0)
print "Running tests:"


# Test hashes are encoded in the intermediate path names; this extracts them.
def extract_paths(paths):
    # Keep the last '/'-separated component of every path ...
    paths = map(lambda p: str.split(str(p), "/")[-1], paths)
    # ... then keep only the text before the first ".c" in it.
    return map(lambda p: str.split(str(p), ".c")[0], paths)


# Names extracted from the file paths of all 'File' nodes in the database.
all_tests = extract_paths(
    j.runGremlinQuery("getNodesWithType('File').filepath"))

for arg in sys.argv:
    # Each argument is opened as a file and parsed with `load`/`Loader`
    # (presumably PyYAML -- confirm against the file's imports).
    # NOTE(review): the file object is never closed explicitly.
    yaml = load(file(arg, 'r'), Loader)
    for idx, entry in enumerate(yaml):
        query = entry['QUERY']
        # Strip leading and trailing spaces from every line of the query.
        query = re.sub("^ +", "", query, flags=re.MULTILINE)
        query = re.sub(" +$", "", query, flags=re.MULTILINE)
        # Drop lines starting with '//' and join the rest with no separator.
        query = str.split(query, "\n")
        query = filter(lambda l: not re.match('//', l), query)
        query = str.join("", query)
        # Append a step mapping each result to the file path of its function.
        query = """%s .transform { g.v(it.functionId).functionToFile().filepath }.scatter() """ % (query)
class get_basic_blocks():
    """Compute basic blocks of functions from a Joern graph database.

    Blocks are derived from the ``CONTROLS`` and ``POST_DOM`` edges: nodes
    controlled by the same node are chained together along their ``POST_DOM``
    edges, and nodes of type ``"Parameter"`` are dropped from the chains.
    """

    def __init__(self):
        """Connect to the database at ``NEO4J_URL`` and load the function list."""
        self.JS = JoernSteps()
        self.JS.setGraphDbURL(NEO4J_URL)
        self.JS.connectToDatabase()
        # Initialise the containers first so that the loaded function list is
        # not overwritten by an empty dict afterwards.
        self.FUNCTION_LIST = {}
        self.BASIC_BLOCK_LIST = {}
        self.get_function_list(ChunkStartTool)

    def get_function_list(self, ChunkStartTool):
        """Store all functions reported by ``ListFuncs`` in ``FUNCTION_LIST``."""
        list_function = get_all_functions.ListFuncs(ChunkStartTool)
        list_function.run()
        self.FUNCTION_LIST = list_function.ALL_FUNCTIONS

    def get_cfg_graph(self, functionid):
        """Given a functionId, return the CONTROLS/POST_DOM edges of the function.

        Each edge is reported as ``[out node id, edge id, label, in node id]``.
        """
        query_get_cfg_graph = """queryNodeIndex('functionId:%s').outE .filter{it.label=="CONTROLS"||it.label=="POST_DOM"} .transform{[it.outV.id,it.id,it.label,it.inV.id]}.toList()""" % functionid
        return self.JS.runGremlinQuery(query_get_cfg_graph)

    def get_ENTRY_node(self, functionid):
        """Given a functionId, return the id of the function's CFGEntryNode."""
        query_from_entry = """queryNodeIndex('functionId:%s AND type:CFGEntryNode').id""" % functionid
        id_entry = self.JS.runGremlinQuery(query_from_entry)
        return id_entry[0]

    def get_code(self, node_id):
        """Given a node id, return the ``code`` property of the node."""
        query_get_code = """g.v(%s).getProperty("code")""" % node_id
        return self.JS.runGremlinQuery(query_get_code)

    def get_type(self, node_id):
        """Given a node id, return the ``type`` property of the node."""
        query_get_type = """g.v(%s).getProperty("type")""" % node_id
        return self.JS.runGremlinQuery(query_get_type)

    def get_control_nodes(self, node_id):
        """Given a node id, return the ids of the nodes it CONTROLS."""
        query_get_control_code = """g.v(%s).out("CONTROLS").id""" % node_id
        return list(self.JS.runGremlinQuery(query_get_control_code))

    def get_POST_DOM_nodes(self, node_id):
        """Return the id of the first outgoing POST_DOM neighbour, or ``[]`` if none."""
        query_get_POST_DOM_code = """g.v(%s).out("POST_DOM").id""" % node_id
        neighbours = list(self.JS.runGremlinQuery(query_get_POST_DOM_code))
        if not neighbours:
            return []
        return neighbours[0]

    def get_DOM_nodes(self, node_id):
        """Return the id of the first incoming POST_DOM neighbour, or ``[]`` if none."""
        query_get_POST_DOM_code = """g.v(%s).in("POST_DOM").id""" % node_id
        neighbours = list(self.JS.runGremlinQuery(query_get_POST_DOM_code))
        if not neighbours:
            return []
        return neighbours[0]

    def check_in_or_not(self, node, BBs):
        """Return True when ``node`` is in any sub-list of the 2-D list ``BBs``."""
        return any(node in block for block in BBs)

    def Dom_list_sort(self, node_list):
        """Chain the nodes of ``node_list`` into blocks along POST_DOM edges.

        Starting from every node not yet placed in a block, the block is
        extended backwards through incoming POST_DOM edges and forwards
        through outgoing ones for as long as the neighbour is itself in
        ``node_list``.  Nodes of type ``"Parameter"`` are then removed from
        every block.

        Returns:
            list of blocks, each a list of node ids in chain order.
        """
        BBs = []
        for node in node_list:
            if self.check_in_or_not(node, BBs):
                continue
            BBs_sorted = [node]

            # Walk backwards; each neighbour is queried once per step.
            last_node = node
            dom_node = self.get_DOM_nodes(last_node)
            while dom_node in node_list:
                BBs_sorted.insert(BBs_sorted.index(last_node), dom_node)
                last_node = dom_node
                dom_node = self.get_DOM_nodes(last_node)

            # Walk forwards.
            next_node = node
            post_dom_node = self.get_POST_DOM_nodes(next_node)
            while post_dom_node in node_list:
                BBs_sorted.append(post_dom_node)
                next_node = post_dom_node
                post_dom_node = self.get_POST_DOM_nodes(next_node)

            BBs.append(BBs_sorted)

        # Build a fresh list instead of removing from and appending to BBs
        # while iterating it, which skipped blocks and left them unfiltered.
        return [[node for node in block if self.get_type(node) != "Parameter"]
                for block in BBs]

    def get_BBs_of_node(self, node_id):
        """Given a node, return the basic blocks formed by the nodes it controls."""
        current_control_ids = self.get_control_nodes(node_id)
        if len(current_control_ids) == 0:
            return []
        return self.Dom_list_sort(current_control_ids)

    def function_basic_blocks(self, functionid):
        """Return the basic blocks of a function.

        Performs a breadth-first walk over CONTROLS edges starting at the
        function's entry node and collects the blocks of every non-empty set
        of controlled nodes.
        """
        basic_block_ids = []  # store the basic blocks' node ids
        queue = [self.get_ENTRY_node(functionid)]  # FIFO of nodes to expand
        while len(queue) > 0:
            control_nodes = self.get_control_nodes(queue.pop(0))
            if len(control_nodes) > 0:
                queue = queue + control_nodes
                basic_block_ids = basic_block_ids + self.Dom_list_sort(
                    control_nodes)
        return basic_block_ids

    def execute(self):
        """Reload the function list and fill ``BASIC_BLOCK_LIST`` for every function."""
        self.get_function_list(ChunkStartTool)
        for etem in self.FUNCTION_LIST:
            self.BASIC_BLOCK_LIST[etem] = self.function_basic_blocks(etem)
        print("ok")
class method_3():
    """Extract several per-function features from a Joern graph database.

    This method takes different features into one vector, but the different
    place in the vector is specified.  Layout noted by the original author:

        0      the return type of the function
        1      the number of the parameters
        2~20   the num of different property type nodes
        21~30  the num of different label edges
        31~40  the num of different kinds of API functions
        41~45  the num of different kinds of node types, like Callee

    NOTE(review): the code in this class writes each feature family to its
    own output directory; it does not assemble the vector described above.
    """

    def __init__(self):
        """Connect to the database, set output dirs and preload return types."""
        self.JS = JoernSteps()
        self.JS.setGraphDbURL(NEO4J_URL)
        self.JS.connectToDatabase()
        self.return_type_dataDir = os.path.join(FILE_PATH, 'return_type_data')
        self.parameter_dataDir = os.path.join(FILE_PATH, 'parameter_data')
        self.edge_dataDir = os.path.join(FILE_PATH, 'edge_data')
        self.node_type_dataDir = os.path.join(FILE_PATH, 'node_type_data')
        self.all_return_type = []
        self.all_node_type = []
        self.FUNCTION_LIST = {}
        self.get_function_list(ChunkStartTool)
        self.get_all_return_type()

    def get_function_list(self, ChunkStartTool):
        """Store all functions reported by ``ListFuncs`` in ``FUNCTION_LIST``."""
        list_function = get_all_functions.ListFuncs(ChunkStartTool)
        list_function.run()
        self.FUNCTION_LIST = list_function.ALL_FUNCTIONS

    @staticmethod
    def _relative_frequencies(labels):
        """Map each UTF-8 encoded label to its share of ``labels``.

        The count is converted to float *before* dividing; dividing two ints
        first truncates the ratio under Python 2.
        """
        total = len(labels)
        counts = {}
        for label in labels:
            key = label.encode("utf-8")
            counts[key] = counts.get(key, 0) + 1
        return dict((key, float(count) / total)
                    for key, count in counts.items())

    def _write_feature(self, output_dir, extract):
        """Write ``extract(function_id)`` for every function into ``output_dir``."""
        writer = Writer()
        writer.setOutputDirectory(output_dir)
        writer.run()
        for function_id in self.FUNCTION_LIST:
            writer._writeDataPoints(function_id, extract(function_id))
        writer._finalizeOutputDirectory()

    def return_type(self, function_id):
        """Get the return type code(s) of a function, UTF-8 encoded."""
        query_return_type = """queryNodeIndex('functionId:%s AND type:ReturnType').code""" % function_id
        return_type_code_u = self.JS.runGremlinQuery(query_return_type)
        return [type_code.encode("utf-8") for type_code in return_type_code_u]

    def get_all_return_type(self):
        """Collect the distinct return types of all functions in ``all_return_type``."""
        for function_id in self.FUNCTION_LIST:
            for current_return_type in self.return_type(function_id):
                if current_return_type not in self.all_return_type:
                    self.all_return_type.append(current_return_type)

    def get_parameter_type(self, function_id):
        """Get the ``ParameterType`` codes of the specified function."""
        query_parameter_type = """queryNodeIndex('functionId:%s AND type:ParameterType').code.toList()""" % function_id
        return self.JS.runGremlinQuery(query_parameter_type)

    def get_different_edge_num(self, function_id):
        """Get the share of each outgoing edge label in the specified function.

        Returns:
            dict mapping the UTF-8 encoded edge label to its count divided by
            the total number of edges.
        """
        query_edge_num = """queryNodeIndex('functionId:%s').outE.label""" % function_id
        edge_label = self.JS.runGremlinQuery(query_edge_num)
        return self._relative_frequencies(edge_label)

    def get_all_types(self):
        """Collect the distinct node types of all functions in ``all_node_type``."""
        for function_id in self.FUNCTION_LIST:
            query_node_type = """queryNodeIndex('functionId:%s').type""" % function_id
            node_types = self.JS.runGremlinQuery(query_node_type)
            for current_node_type in node_types:
                encoded = current_node_type.encode("utf-8")
                if encoded not in self.all_node_type:
                    self.all_node_type.append(encoded)
        print("ok")

    def get_different_type_num(self, function_id):
        """Get the share of each node type in the specified function.

        Returns:
            dict mapping the UTF-8 encoded node type to its count divided by
            the total number of nodes.
        """
        query_type_num = """queryNodeIndex('functionId:%s').type""" % function_id
        type_label = self.JS.runGremlinQuery(query_type_num)
        return self._relative_frequencies(type_label)

    def output_return_type(self):
        """Write the return types of every function to ``return_type_dataDir``."""
        self._write_feature(self.return_type_dataDir, self.return_type)

    def output_parameter_type(self):
        """Write the parameter types of every function to ``parameter_dataDir``."""
        self._write_feature(self.parameter_dataDir, self.get_parameter_type)

    def output_edge_type(self):
        """Write the edge label shares of every function to ``edge_dataDir``."""
        self._write_feature(self.edge_dataDir, self.get_different_edge_num)

    def output_node_type(self):
        """Write the node type shares of every function to ``node_type_dataDir``."""
        self._write_feature(self.node_type_dataDir, self.get_different_type_num)

    def execute(self):
        """Produce all four feature outputs in turn."""
        self.output_return_type()
        self.output_parameter_type()
        self.output_edge_type()
        self.output_node_type()
class Analysis(object):
    """Build and run the Gremlin queries used for static vulnerability search.

    Every static query starts with a Groovy preamble that defines the
    sink-function name lists below, followed by the traversal for one attack
    type.  Queries are executed through a ``JoernSteps`` connection.
    """

    # Groovy list definitions that are prepended to every static query.
    SQL_QUERY_FUNCS = """sql_query_funcs = [ "mysql_query", "mysqli_query", "pg_query", "sqlite_query" ]\n"""
    XSS_FUNCS = """xss_funcs = [ "print", "echo" ]\n"""
    OS_COMMAND_FUNCS = """os_command_funcs = [ "backticks", "exec" , "expect_popen","passthru","pcntl_exec", "popen","proc_open","shell_exec","system", "mail" ]\n"""

    # Directory that holds the per-attack-type exploit seed files.
    RESULTS_DIR = '/home/user/navex/results'

    # Attack type -> suffix of its seed file name ("sql" uses the bare name).
    SEED_FILE_SUFFIXES = {
        "sql": "",
        "xss": "_xss",
        "code": "_code",
        "os-command": "_os-command",
        "file-inc": "_file-inc",
        "ear": "_ear",
    }

    # Attack type -> traversal appended after the preamble.  The pieces are
    # adjacent string literals, so each entry is one single-line query.
    _STATIC_QUERIES = {
        "sql": (
            " queryMapList =[]; "
            "g.V().filter{sql_query_funcs.contains(it.code) && isCallExpression(it.nameToCall().next()) }.callexpressions() "
            ".sideEffect{m = start(it, [], 0, 'sql', false, queryMapList)} "
            ".sideEffect{ warnmessage = warning(it.toFileAbs().next().name, it.lineno, it.id, 'sql', '1')} "
            ".sideEffect{ reportmessage = report(it.toFileAbs().next().name, it.lineno, it.id)} "
            ".ifThenElse{m.isEmpty()} "
            "{it.transform{reportmessage}} "
            "{it.transform{findSinkLocation(m, warnmessage, 'sql', queryMapList, it)}}"
        ),
        "xss": (
            " queryMapList = []; "
            "g.V().filter{it.type == TYPE_ECHO || it.type == TYPE_PRINT} "
            ".sideEffect{m = start(it, [], 0, 'xss', false, queryMapList)} "
            ".sideEffect{ warnmessage = warning(it.toFileAbs().next().name, it.lineno, it.id, 'xss', '1')} "
            ".sideEffect{ reportmessage = report(it.toFileAbs().next().name, it.lineno, it.id)} "
            ".ifThenElse{m.isEmpty()} "
            "{it.transform{reportmessage}} "
            "{it.transform{findSinkLocation(m, warnmessage, 'xss', queryMapList, it)}}"
        ),
        "code": (
            "queryMapList =[]; "
            "g.V().filter{it.type == TYPE_INCLUDE_OR_EVAL && it.flags.contains(FLAG_EXEC_EVAL)} "
            ".sideEffect{m = start(it, [], 0, 'code', false, queryMapList )} "
            ".sideEffect{ warnmessage = warning(it.toFileAbs().next().name, it.lineno, it.id, 'code', '1')} "
            ".sideEffect{ reportmessage = report(it.toFileAbs().next().name, it.lineno, it.id)} "
            ".ifThenElse{m.isEmpty()} "
            "{it.transform{reportmessage}} "
            "{it.transform{findSinkLocation(m, warnmessage, 'code', queryMapList, it)}}"
        ),
        # Command execution; the sinks considered are the names listed in
        # OS_COMMAND_FUNCS.  The ';' after the list initialisation keeps the
        # two Groovy statements separate even on a single line.
        "os-command": (
            "queryMapList =[]; "
            "g.V().filter{os_command_funcs.contains(it.code) && isCallExpression(it.nameToCall().next()) }.callexpressions() "
            ".filter{os_command_funcs.contains(it.ithChildren(0).out.code.next())} "
            ".sideEffect{m = start(it, [], 0, 'os-command', false, queryMapList )} "
            ".sideEffect{ warnmessage = warning(it.toFileAbs().next().name, it.lineno, it.id, 'os-command', '1')} "
            ".sideEffect{ reportmessage = report(it.toFileAbs().next().name, it.lineno, it.id)} "
            ".ifThenElse{m.isEmpty()} "
            "{it.transform{reportmessage}} "
            "{it.transform{findSinkLocation(m, warnmessage, 'os-command', queryMapList, it)}}"
        ),
        "file-inc": (
            "queryMapList =[]; "
            "g.V().filter{it.type == TYPE_INCLUDE_OR_EVAL && !(it.flags.contains(FLAG_EXEC_EVAL))} "
            ".sideEffect{m = start(it, [], 0, 'file-inc', false, queryMapList)} "
            ".sideEffect{ warnmessage = warning(it.toFileAbs().next().name, it.lineno, it.id, 'file-inc', '1')} "
            ".sideEffect{ reportmessage = report(it.toFileAbs().next().name, it.lineno, it.id)} "
            ".ifThenElse{m.isEmpty()} "
            "{it.transform{reportmessage}} "
            "{it.transform{findSinkLocation(m, warnmessage, 'file-inc', queryMapList, it)}}"
        ),
        "ear": (
            ' g.V().filter{ "header" == it.code && isCallExpression(it.nameToCall().next()) }.callexpressions() '
            '.ithChildren(1).astNodes() '
            '.filter{it.code != null && it.code.startsWith("Location")} '
            ".callexpressions() "
            ".as('call') "
            ".out('FLOWS_TO') "
            '.filter{it.type != "AST_EXIT" && it.type != "NULL" } '
            ".or( "
            '_().filter{it.type == "AST_CALL"} '
            ".sideEffect{n = jumpToCallingFunction(it)} "
            '.filter{n.type != "AST_EXIT" && n.type != "NULL" && n.type != "AST_RETURN"} '
            ", "
            '_().filter{it.type == "AST_CALL"} '
            ".sideEffect{n = jumpToCallingFunction(it)} "
            '.filter{n.type == "AST_RETURN"} '
            ".out('FLOWS_TO') "
            '.filter{n.type != "AST_EXIT" && n.type != "NULL" } '
            ", "
            '_().filter{it.type != "AST_CALL"} '
            ", "
            "_().as('b') "
            '.filter{it.type == "AST_CALL"} '
            ".astNodes() "
            '.filter{it.code != null && it.code != "/home/user/log/codeCoverage.txt"} '
            ".back('b') "
            ") "
            ".back('call') "
            ".sideEffect{ warnmessage = warning(it.toFileAbs().next().name, it.lineno, it.id, 'ear', '1')} "
            ".transform{warnmessage}"
        ),
    }

    def __init__(self, port):
        """Connect to the graph database listening on localhost:``port``."""
        self.j = JoernSteps()
        self.j.setGraphDbURL('http://localhost:%d/db/data/' % (int(port)))
        self.j.connectToDatabase()

    def prepareQueryStatic(self, attackType):
        """Return the static-analysis Gremlin query for ``attackType``.

        Known types are "sql", "xss", "code", "os-command", "file-inc" and
        "ear".  For any other value only the preamble (the three function
        lists plus the ``m`` initialisation) is returned.
        """
        query = self.XSS_FUNCS + self.SQL_QUERY_FUNCS + self.OS_COMMAND_FUNCS
        query += " m =[]; "
        query += self._STATIC_QUERIES.get(attackType, "")
        return query

    def prepareFinalQuery(self, seed):
        """Build (and print) the navigation-sequence query for ``seed``.

        ``seed.get`` and ``seed.params`` are iterated as strings; entries
        without '=' are ignored.  Each GET name is emitted twice, once with a
        '?' prefix and once with '&'; each param keeps its name plus '='.
        """
        get = []
        for g in seed.get:
            if '=' in g:
                name = g[0:g.find('=')]
                get.append('?' + name + '=')
                get.append('&' + name + '=')

        params = [p[0:p.find('=')] + '=' for p in seed.params if '=' in p]

        # The Python list repr is pasted into the query as a Groovy list.
        # NOTE(review): with unicode entries on Python 2 the repr carries a
        # u'' prefix -- confirm the seeds are byte strings.
        query = "g.V('url', '%s') .findNavigationSeq(%s, %s, %s).dedup().path" % (
            seed.src, seed.dst, get, params)
        print(query)
        return query

    def runQuery(self, query):
        """Return ``query`` unchanged (nothing is executed here)."""
        return query

    def runTimedQuery(self, query):
        """Run ``query`` and report how long it took.

        Returns ``(res, timestr)``.  ``res`` is None when ``query`` is empty
        or when the database call raised; the exception is printed, not
        re-raised, so a failing query does not abort the caller.
        """
        start = time.time()
        res = None
        try:
            if query:
                res = self.j.runGremlinQuery(query)
        except Exception as err:
            print("Caught exception: %s %s" % (type(err), err))
        elapsed = time.time() - start
        timestr = "Query done in %f seconds." % (elapsed)
        return (res, timestr)

    def readExploitSeedsFile(self, attackType):
        """Return the stripped lines of the seed file for ``attackType``.

        The file lives in ``RESULTS_DIR`` and is named
        ``include_map_resolution_results<suffix>.txt``.

        Raises:
            ValueError: if ``attackType`` has no entry in SEED_FILE_SUFFIXES.
        """
        try:
            suffix = self.SEED_FILE_SUFFIXES[attackType]
        except KeyError:
            raise ValueError("unknown attack type: %r" % (attackType,))

        path = '%s/include_map_resolution_results%s.txt' % (
            self.RESULTS_DIR, suffix)
        # Announce the file that is actually opened.
        print('Reading Exploit Seeds File in ' + path)
        with open(path, 'r') as f:
            return [line.strip() for line in f]
exit(1) j = JoernSteps() j.setGraphDbURL("http://localhost:7474/db/data") j.connectToDatabase() sys.argv.pop(0) print "Running tests:" # tests hashes are encoded in the intermediate path names, this extracts them def extract_paths(paths): paths = map(lambda p: str.split(str(p), "/")[-1], paths) return map(lambda p: str.split(str(p), ".c")[0], paths) all_tests = extract_paths(j.runGremlinQuery("getNodesWithType('File').filepath")) for arg in sys.argv: yaml = load(file(arg, "r"), Loader) for idx, entry in enumerate(yaml): query = entry["QUERY"] query = re.sub("^ +", "", query, flags=re.MULTILINE) query = re.sub(" +$", "", query, flags=re.MULTILINE) query = str.split(query, "\n") query = filter(lambda l: not re.match("//", l), query) query = str.join("", query) query = """%s .transform { g.v(it.functionId).functionToFile().filepath }.scatter() """ % (
# Print every node returned by getFunctionsByName("main").
from joern.all import JoernSteps

GRAPH_DB_URL = 'http://localhost:7474/db/data/'
MAIN_FUNCTION_QUERY = 'getFunctionsByName("main")'

j = JoernSteps()
j.setGraphDbURL(GRAPH_DB_URL)
# j.addStepsDir('Use this to inject utility traversals')
j.connectToDatabase()

# res = j.runCypherQuery('...')
res = j.runGremlinQuery(MAIN_FUNCTION_QUERY)
for match in res:
    print(match)
#.filter{ # it.in('USES') # .filter{it.type == 'Condition'}.toList() == [] #} #""" #query = """ #getArguments('memcpy', '1') #""" #.sideEffect{ paramName = '.*len.*' } #.filter{ it.code.matches(paramName) } #.unsanitized{ it.isCheck( paramName ) }""" #.params( paramName ) query = "getFilesByName('*')" results = j.runGremlinQuery(query) print "[+] Number of files:", len(results) for i in range(len(results)): cur = results[i] print i, results[i], results[i].properties #query = 'g.v(' + str(cur._id) + ').out().{it.type == "Function"}' query = 'g.v(' + str(cur._id) + ').out' #().{it.type == "Function"}' s = j.runGremlinQuery(query) for node in s: if node['type'] == 'Function': print node # print s # if i > 0: break
from joern.all import JoernSteps import glob import sys import pickle import os j=JoernSteps() j.setGraphDbURL('http://localhost:7474/db/data/') # j.addStepsDir('Use this to inject utility traversals') j.connectToDatabase() FunctionName_query = """queryNodeIndex("type:File").out().filter{it.type == "Function"}.name""" FunctionName = j.runGremlinQuery(FunctionName_query) ori_duplicate = [] ptrlist_file=open('/home/hongfa/workspace/thttpd_test/ptrList','r') ptrs=ptrlist_file.readlines() ptrlist = [] # ori_functionId = ptr.split("functionId:")[1].split("\n")[0] # #print ori_functionId # if ori_functionId not in ori_duplicate: # # ori_duplicate.append(ori_functionId) # command =' echo "'+str(ori_functionId)+'" | joern-location | joern-code > /home/hongfa/workspace/DependencyGraph/sphinx3_or_src/'+ori_functionId+'.c ' # # os.system(command) for ptr in ptrs: nodeId = ptr.split(" ")[0]
# Report the locations returned for strcpy argument '1' by the 'unsanitized'
# traversal below.  The closure passed to it matches either a check on the
# argument's own code or any code containing "min".
from joern.all import JoernSteps

j = JoernSteps()
j.setGraphDbURL('http://localhost:7474/db/data/')
j.connectToDatabase()

# Adjacent literals form one single-line query.  The ')' after the closure
# closes the `.unsanitized(` call, which was previously left open.
query = (
    " getArguments('strcpy', '1') "
    ".sideEffect{ argument = it.code;} "
    ".unsanitized( {it._().or( "
    "_().isCheck('.*' + argument + '.*'), "
    "_().codeContains('.*min.*'))}) "
    ".locations() "
)

print("[+] Running query! ")
results = j.runGremlinQuery(query)

print("[+] Number of results: " + str(len(results)))
for r in results:
    print(r)
if not tm_params: return #process it's actual param list if len(ast_children) == 2: actual_params = j.runGremlinQuery("g.v(%d)" "outE('IS_AST_PARENT')" ".inV()"%(node2id(ast_children[1]))) #calculate the affected variables for param in tm_params: if param == 0: retval = find_retval(call_id) if retval: affected_vars.add(retval) else: vars = get_param_vars(node2id(actual_params[param-1])) for v in vars: affected_vars.add(v) print get_ifs(code_to_symbols(affected_vars)); j = JoernSteps() j.setGraphDbURL('http://localhost:7474/db/data/') j.connectToDatabase() #find out all call expressions and process them one by one call_ids = j.runGremlinQuery("g.V().filter{it.type == 'CallExpression'}.id()") for call_id in call_ids: process_call(call_id)
from joern.all import JoernSteps import glob import sys import pickle import os j = JoernSteps() j.setGraphDbURL('http://localhost:7474/db/data/') # j.addStepsDir('Use this to inject utility traversals') j.connectToDatabase() FunctionName_query = """queryNodeIndex("type:File").out().filter{it.type == "Function"}.name""" FunctionName = j.runGremlinQuery(FunctionName_query) ori_duplicate = [] functionlist = [] import glob for filename in glob.glob( '/home/hongfa/workspace/DependencyGraph/lbm_or_src/*'): #print filename nameid = filename.split(".c")[0].split( "/home/hongfa/workspace/DependencyGraph/lbm_or_src/")[1] print nameid command = ' echo id:"' + str( nameid ) + '" | joern-lookup -a name >>/home/hongfa/workspace/DependencyGraph/lbmx_or_src/list_or.txt'
class ManualCCSearch(object):
    """Hand-written Gremlin searches for code clones of vulnerable snippets.

    Queries are run through a ``JoernSteps`` connection and their results are
    wrapped in ``CodeCloneData`` objects.
    """

    # Groovy list definitions prepended to queries that need them.
    UNTRUSTED_DATA = """attacker_sources = [ "_GET", "_POST", "_COOKIE", "_REQUEST", "_ENV", "HTTP_ENV_VARS" ]\n"""
    SQL_QUERY_FUNCS = """sql_query_funcs = [ "mysql_query", "pg_query", "sqlite_query" ]\n"""

    # Gremlin operations
    ORDER_LN = ".order{it.a.lineno <=> it.b.lineno}"  # Order by linenumber

    def __init__(self, port):
        """Connect to the graph database on localhost:``port``.

        The custom Gremlin steps directory under the configured base
        directory is registered before connecting.
        """
        self.j = JoernSteps()
        self.j.setGraphDbURL('http://localhost:%d/db/data/' % (int(port)))
        # self.j.addStepsDir(
        #     Configurator.getPath(Configurator.KEY_PYTHON_JOERN) +
        #     "/joern/phpjoernsteps"
        # )
        self.j.addStepsDir(
            Configurator.getPath(Configurator.KEY_BASE_DIR) +
            "/custom_gremlin_steps"
        )
        self.j.connectToDatabase()
        # self.QUERIES_DIR = Configurator.getPath(Configurator.BASE_DIR) + \
        #     "/gremlin_queries"

    def searchCCOne(self):
        """
        Search for the first vulnerable tutorial (SQL injection from
        stackoverflow):

        $user_alcohol_permitted_selection = $_POST['alcohol_check'];
        //Value sent using jquery .load()
        $user_social_club_name_input = $_POST['name'];
        //Value sent using jquery .load()
        $query="SELECT * FROM social_clubs
                WHERE name = $user_social_club_name_input";
        if ($user_alcohol_permitted_selection != "???") {
            $query.= "AND WHERE alcohol_permitted =
                      $user_alcohol_permitted_selection";
        }

        The query is still a stub: only the vertex-set selector is returned.
        """
        # construct gremlin query step by step:
        # 1. Find variable name X of "variable = $_POST[..]"
        # 2. Go to next statement list.
        # (3. Find variable name Y of "variable = $_POST[..]"
        # (4. Go to next statement list.
        # 5. Find variable name Z and string str1 of "variable = string"
        # 6. Check if str1 contains regexp "WHERE any_word=$Y".
        # (7. Go to next statement list.)
        # (8. Check for if-statement with variable $X.)
        # 9. Check if variable $Z is extended using string with regexp
        #    "and where any_word=$X"
        # (10. Check for mysql_query($Z))

        # all nodes
        # query = "g.V(NODE_TYPE, TYPE_STMT_LIST).out"
        #
        # # AST_ASSIGN nodes' right side
        # query += ".rval"
        query = "g.V"
        return query

    def sqlNewIndirect(self):
        """Return the preamble followed by the stored sql_new_indirect query.

        NOTE(review): ``self.QUERIES_DIR`` is only assigned in a
        commented-out line of ``__init__``; it must be set (with a trailing
        path separator) before this method is called -- confirm.
        """
        query = self.UNTRUSTED_DATA + self.SQL_QUERY_FUNCS
        # Close the file deterministically instead of leaking the handle.
        with open(self.QUERIES_DIR + "sql_new_indirect.query", 'r') as query_file:
            query += query_file.read()
        return query

    def runQuery(self, query):
        """Return ``query`` unchanged (nothing is executed here)."""
        return query

    def runTimedQuery(self, myFunction, query=None):
        """Run the query built by ``myFunction`` and wrap its results.

        ``myFunction`` is called with ``query`` when that is truthy and
        without arguments otherwise; its return value is sent to the
        database.  A failing database call is printed, not re-raised.

        Returns ``(result, elapsed)`` where ``result`` is a list with one
        ``CodeCloneData`` per returned node and ``elapsed`` the query time
        in seconds.
        """
        start = time.time()
        res = None
        try:
            if query:
                res = self.j.runGremlinQuery(myFunction(query))
            else:
                res = self.j.runGremlinQuery(myFunction())
        except Exception as err:
            print("Caught exception: %s %s" % (type(err), err))
        elapsed = time.time() - start
        # print "Query done in %f seconds." % (elapsed)

        try:
            nodes = iter(res)
        except TypeError:
            # res is not iterable, because it is one/no node.
            nodes = None

        result = []
        if nodes is not None:
            for node in nodes:
                print(node)
                result.append(self._wrap(node, elapsed))
        else:
            if res:
                # Wrap the single result itself; no loop variable exists on
                # this path.
                result.append(self._wrap(res, elapsed))
            print(res)
        return (result, elapsed)

    def _wrap(self, output, elapsed):
        """Build a CodeCloneData from one query output and the query time."""
        data = CodeCloneData()
        data.stripDataFromOutput(output)
        data.setQueryTime(elapsed)
        return data
#!/usr/bin/env python
#############################################################
# A template for feature extraction for functions with joern.
# Author: Fabian Yamaguchi
#############################################################
from joern.all import JoernSteps

# Number of function ids sent to the database per query.
CHUNK_SIZE = 256

# Yields [id, name, feature vector] for every node of the given id list.
FEATURE_QUERY = """ idListToNodes(%s).transform{ [it.id, it.name, it.functionToFeatureVec() ] } """

j = JoernSteps()
j.connectToDatabase()
j.addStepsDir('steps/')

statementIds = j.runGremlinQuery("queryNodeIndex('type:Function').id")

for id_chunk in j.chunks(statementIds, CHUNK_SIZE):
    rows = j.runGremlinQuery(FEATURE_QUERY % (id_chunk))
    for row in rows:
        print('===')
        print('FunctionId: %d' % (row[0]))
        print('FunctionName: %s' % (row[1]))
        print('Features (list): %s' % (row[2]))
        print('===')
class get_source(): """""" #---------------------------------------------------------------------- def __init__(self): """Constructor""" self.JS = JoernSteps() self.JS.setGraphDbURL(NEO4J_URL) self.JS.connectToDatabase() #self.source_sink_path=[] #self.sink_source_tree=tree() """given a nodeId , this can get the type of the node ,result is utf string""" def get_type(self, node_id): query_get_type = """g.v(%s).getProperty("type")""" % node_id query_result = self.JS.runGremlinQuery(query_get_type) return query_result.encode('utf-8') '''given a node that means sink, this can get the sources in the function''' def get_source_within_func(self, node_id): get_source_query = """g.v(%s).sources().id""" % node_id #getArguments("printf", "1") query_result = self.JS.runGremlinQuery(get_source_query) source_node_list = [] #store the source nodes source_node_tmp = [] #store the source nodes temply for r in query_result: if self.get_type(r) != "Parameter": print "the source node %s is not the \"Parameter\" type" else: source_node_list.append(r) return source_node_list '''given a node that means sink, this can get the sources in the function''' def get_source_of_IdentifierDeclStatement(self, node_id): source_node_list = [] #store the source nodes source_node_tmp = [] #store the source nodes temply if self.get_type(node_id) == "IdentifierDeclStatement": get_source_query = """g.v(%s).out("USE").filter{it.type=="Symbol"}.in("DEF").id""" % node_id query_result = self.JS.runGremlinQuery(get_source_query) for r in query_result: if self.get_type(r) != "Parameter": print "the source node %s is not the \"Parameter\" type" else: source_node_list.append(r) return source_node_list '''get the source parameter's Identifier node''' def get_Identifier_source(self, node_id): get_Identifier_source = """g.v(%s).out("IS_AST_PARENT").filter{it.type=="Identifier"}.id""" % node_id #getArguments("printf", "1") source_Identifier_node = self.JS.runGremlinQuery(get_Identifier_source) return 
source_Identifier_node '''get the source parameter's Identifier node''' def get_ParameterType_source(self, node_id): get_ParameterType_source = """g.v(%s).out("IS_AST_PARENT").filter{it.type=="ParameterType"}.id""" % node_id #getArguments("printf", "1") source_ParameterType_node = self.JS.runGremlinQuery( get_ParameterType_source) return source_ParameterType_node '''given a node that means sink, this can get the sources out of the function''' def get_source_between_func(self, node_id): get_source_query = """g.v(%s).in("IS_ARG").id""" % node_id #getArguments("printf", "1") query_result = self.JS.runGremlinQuery(get_source_query) source_node_list = [] #store the source nodes #source_node_tmp=[]#store the source nodes temply for r in query_result: if r not in source_node_list: source_node_list.append(r) return source_node_list '''get the sink nodes of specified function''' def get_sink(self, function_name, arg_num): sink_query = """getArguments(\"%s\", \"%s\").id""" % (function_name, arg_num) query_result = self.JS.runGremlinQuery(sink_query) sink_node_list = [] for r in query_result: if r not in sink_node_list: sink_node_list.append(r) return sink_node_list '''decide whether the node is over''' def wether_is_over(self, node_id): leble = False '''the symbol has not def nodes''' if self.get_type(node_id) == "Symbol": query = """g.v(%s).in("DEF").id""" % node_id query_result = self.JS.runGremlinQuery(query) if len(query_result) == 0: leble = True '''the PARAMETER has not Identifier nodes''' if self.get_type(node_id) == "Parameter": query = """g.v(%s).out("IS_AST_PARENT").filter{it.type=="Identifier"}.in("IS_ARG").id""" % node_id query_result = self.JS.runGremlinQuery(query) if len(query_result) == 0: leble = True '''the PARAMETER has not Identifier nodes''' if self.get_type(node_id) == "IdentifierDeclStatement": query = """g.v(%s).out("USE").filter{it.type=="Symbol"}.in("DEF").id""" % node_id query_result = self.JS.runGremlinQuery(query) if len(query_result) == 0: leble 
= True '''the PARAMETER has not Identifier nodes''' if self.get_type(node_id) == "Identifier": query = """g.v(%s).out.id""" % node_id query_result = self.JS.runGremlinQuery(query) query_has_arg = """g.v(%s).in("IS_ARG").id""" % node_id query_result1 = self.JS.runGremlinQuery(query_has_arg) if len(query_result) == 0 and len(query_result1) == 0: leble = True return leble '''get the source node of specified sink node''' def get_source(self, node): #source_current_func=get_source_within_func(node_id) #node.add(node_id) node_id = node.getdata() last_source = node_id if not self.wether_is_over(last_source): ###sources() get the parameter nodes''' if self.get_type(last_source) == "Argument": last_source_tmp = self.get_source_within_func(last_source) last_sources = last_source_tmp for last_source in last_sources: node.add(tree.node(last_source)) for source_node in last_sources: identifier_node = self.get_Identifier_source(source_node) if identifier_node is not None: for neighbor_source in identifier_node: node.add(self.get_source(neighbor_source)) ###this should get the arguement nodes''' elif len(self.get_source_between_func(last_source)) != 0: last_source_tmp = self.get_source_between_func(last_source) last_source = last_source_tmp node.add(last_source) for source_node in last_source: identifier_node = self.get_Identifier_source(source_node) if identifier_node is not None: for neighbor_source in identifier_node: node.add(self.get_source(neighbor_source)) ###sources() get the IdentifierDeclStatement nodes''' elif len(self.get_source_of_IdentifierDeclStatement( last_source)) != 0: last_source_tmp = self.get_source_of_IdentifierDeclStatement( last_source) last_source = last_source_tmp node.add(last_source) for source_node in last_source: identifier_node = self.get_Identifier_source(source_node) if identifier_node is not None: for neighbor_source in identifier_node: node.add(self.get_source(neighbor_source)) def execute(self): sink_func = sys.argv[0] arg_num = sys.argv[1] 
#root_node = tree.node(sink_func) for node_id in self.get_sink(sink_func, arg_num): print "Now, the sink node id is ", node_id sink_node = tree.node(node_id) self.sink_source_tree._head.add(sink_node) current_path = [] self.get_source(sink_node) print "ok" '''current_node.add(node_id)
# j.addStepsDir('Use this to inject utility traversals') j.connectToDatabase() ptrlist=open('/home/hongfa/workspace/thttpd_workspace/ptrList','r') ptrs=ptrlist.readlines() for ptr in ptrs: #print ptr functionID = ptr.split("functionId:")[1] nodeID = ptr.split(" ")[0] print nodeID ptrname_query= """g.v("""+nodeID+""").out().astNodes().filter{it.type=="Identifier"}.code""" ptrname = j.runGremlinQuery(ptrname_query) statements_query="""g.v("""+functionID+""").out().filter{it.type == "FunctionDef"}.ithChildren("0").astNodes().filter{it.type == "Identifier" && it.code=="""+"\""+ptrname[0]+"\""+"""}.statements()""" #res = j.runCypherQuery('g.v(84).out().filter{it.type=="Identifier"}.code') statements = j.runGremlinQuery(statements_query) P_file=open(functionID+ptrname[0],'a') #print len(statements) statements_list=[] for s in statements: print s s = str(s) ID = s.split(" {childNum")[0].split("(n")[1] type = s.split("type:")[1].split("\"")[1] if ID in statements_list:
class get_basic_blocks():
    """Extract the basic blocks of every function in the graph database.

    Basic blocks are derived from the CONTROLS and POST_DOM edges of the
    graph; ``execute`` writes the code of each function's blocks to one file
    per function under ``FILE_PATH``.
    """

    def __init__(self, ):
        """Connect to the database and load the function list."""
        self.JS = JoernSteps()
        self.JS.setGraphDbURL(NEO4J_URL)
        self.JS.connectToDatabase()
        # Initialise the containers first so that loading the function list
        # is not undone by a later reset.
        self.FUNCTION_LIST = {}
        self.BASIC_BLOCK_LIST = {}
        self.get_function_list(ChunkStartTool)

    def get_function_list(self, ChunkStartTool):
        """Store the functions reported by ``ListFuncs`` in FUNCTION_LIST."""
        list_function = get_all_functions.ListFuncs(ChunkStartTool)
        list_function.run()
        self.FUNCTION_LIST = list_function.ALL_FUNCTIONS

    def get_cfg_graph(self, functionid):
        """Return the CONTROLS/POST_DOM edges of a function.

        Each edge is reported as [source id, edge id, label, target id].
        """
        query_get_cfg_graph = (
            "queryNodeIndex('functionId:%s').outE "
            '.filter{it.label=="CONTROLS"||it.label=="POST_DOM"} '
            ".transform{[it.outV.id,it.id,it.label,it.inV.id]}.toList()"
        ) % functionid
        return self.JS.runGremlinQuery(query_get_cfg_graph)

    def get_ENTRY_node(self, functionid):
        """Return the id of the function's CFGEntryNode (first match)."""
        query_from_entry = """queryNodeIndex('functionId:%s AND type:CFGEntryNode').id""" % functionid
        id_entry = self.JS.runGremlinQuery(query_from_entry)
        return id_entry[0]

    def get_code(self, node_id):
        """Return the node's "code" property encoded as UTF-8."""
        query_get_code = """g.v(%s).getProperty("code")""" % node_id
        query_result = self.JS.runGremlinQuery(query_get_code)
        return query_result.encode('utf-8')

    def get_type(self, node_id):
        """Return the node's "type" property encoded as UTF-8."""
        query_get_type = """g.v(%s).getProperty("type")""" % node_id
        query_result = self.JS.runGremlinQuery(query_get_type)
        return query_result.encode('utf-8')

    def get_control_nodes(self, node_id):
        """Return the ids reached from the node over outgoing CONTROLS edges."""
        query_get_control_code = """g.v(%s).out("CONTROLS").id""" % node_id
        return list(self.JS.runGremlinQuery(query_get_control_code))

    def get_POST_DOM_nodes(self, node_id):
        """Return the ids reached from the node over outgoing POST_DOM edges."""
        query_get_POST_DOM_code = """g.v(%s).out("POST_DOM").id""" % node_id
        return list(self.JS.runGremlinQuery(query_get_POST_DOM_code))

    def get_DOM_nodes(self, node_id):
        """Return the ids that reach the node over incoming POST_DOM edges."""
        query_get_POST_DOM_code = """g.v(%s).in("POST_DOM").id""" % node_id
        return list(self.JS.runGremlinQuery(query_get_POST_DOM_code))

    def check_in_or_not(self, node, BBs):
        """Return True if ``node`` occurs in any list of the 2-D list ``BBs``."""
        return any(node in block for block in BBs)

    def list_mix_list(self, list1, list2):
        """Return the first element of ``list1`` also in ``list2``, else None."""
        for etem in list1:
            if etem in list2:
                return etem
        return None

    def Dom_list_sort(self, node_list):
        """Group ``node_list`` into ordered chains along POST_DOM edges.

        Starting from each node not yet placed, the chain is grown backwards
        over incoming POST_DOM edges and forwards over outgoing ones, only
        through nodes that are themselves in ``node_list``.
        """
        BBs = []
        for node in node_list:
            if self.check_in_or_not(node, BBs):
                continue
            block = [node]

            # Grow towards the front.  Each neighbour query is issued once
            # per step and its result reused.
            first = node
            while True:
                pred = self.list_mix_list(self.get_DOM_nodes(first), node_list)
                if pred is None:
                    break
                block.insert(block.index(first), pred)
                first = pred

            # Grow towards the back.
            last = node
            while True:
                succ = self.list_mix_list(
                    self.get_POST_DOM_nodes(last), node_list)
                if succ is None:
                    break
                block.append(succ)
                last = succ

            BBs.append(block)
        return BBs

    def get_BBs_of_node(self, node_id):
        """Return the basic blocks formed by the nodes ``node_id`` controls."""
        current_control_ids = self.get_control_nodes(node_id)
        if not current_control_ids:
            return []
        return self.Dom_list_sort(current_control_ids)

    def function_basic_blocks(self, functionid):
        """Return the function's basic blocks as lists of node code strings.

        The CONTROLS edges are walked breadth-first from the entry node.
        Nodes of type "Parameter" are dropped from every block, and each
        block is emitted in reverse of the order ``Dom_list_sort`` produced.
        """
        basic_block_ids = []  # basic blocks as lists of node ids
        queue = [self.get_ENTRY_node(functionid)]  # FIFO of nodes to expand
        while queue:
            current = queue.pop(0)
            fresh = [
                control_node
                for control_node in self.get_control_nodes(current)
                if not self.check_in_or_not(control_node, basic_block_ids)
            ]
            if fresh:
                queue.extend(fresh)
                basic_block_ids.extend(self.Dom_list_sort(fresh))

        function_BB_code = []
        for block in basic_block_ids:
            # Build a filtered copy rather than editing the list being
            # iterated, so every block is filtered exactly once.
            kept = [n for n in block if self.get_type(n) != "Parameter"]
            kept.reverse()
            # get_code can be swapped for get_type to emit node types.
            function_BB_code.append([self.get_code(n) for n in kept])
        return function_BB_code

    def execute(self):
        """Write the basic blocks of every function to FILE_PATH/<function>.

        Functions whose output file already exists are skipped.  If
        extracting or writing a function fails, the function and the error
        are printed and the partial file is removed.
        """
        self.get_function_list(ChunkStartTool)
        for etem in self.FUNCTION_LIST:
            file_name = FILE_PATH + "/" + etem
            if os.path.exists(file_name):
                continue

            failure = None
            with open(file_name, mode='w') as f_tmp:
                try:
                    for BB_list in self.function_basic_blocks(etem):
                        for sentence in BB_list:
                            f_tmp.write(sentence + "\r")
                        f_tmp.write("\r\n")
                except Exception as ex:
                    failure = ex

            # Remove the file only after it has been closed.
            if failure is not None:
                print(etem)
                print(failure)
                os.remove(file_name)
        print("ok")
def produce_nodes_string():
    """Serialise the AST nodes of all functions and count their node types.

    Returns ``(all_nodes_string, ast_features)``:

    * ``all_nodes_string`` concatenates one record per AST node that is not
      itself a FunctionDef root.  A record is
      ``id,type,code,functionId,childNum,parent_id`` terminated by '¬';
      both ids are hashes over the node's type, comma-stripped code,
      functionId and childNum.
    * ``ast_features[i]`` is the number of AST nodes whose type equals
      ``global_node_types[i]``.  The list has at least 57 entries and grows
      if ``global_node_types`` is longer.
    """

    def queryParent(j, nodeId):
        # NOTE(review): the reconnect before every lookup is kept from the
        # original code; it looks redundant -- confirm before removing.
        j.connectToDatabase()
        return j.runGremlinQuery('g.v(' + str(nodeId) + ').parents()')

    def identityString(props):
        # Commas and the record terminator are stripped from the code so the
        # hashed string has a fixed number of comma-separated fields.
        code = str(props['code']).replace(',', '').replace('¬', '')
        return (props['type'] + "," + code + "," +
                str(props['functionId']) + "," + str(props['childNum']))

    def getStringForNode(node, parents):
        props = node.properties
        parent_identifier = hash(tuple(identityString(parents[0].properties)))
        node_identifier = hash(tuple(identityString(props)))
        # The record itself carries the code unmodified.
        return (str(node_identifier) + "," + str(props['type']) + "," +
                str(props['code']) + "," + str(props['functionId']) + "," +
                str(props['childNum']) + "," + str(parent_identifier) + "¬")

    j = JoernSteps()
    j.setGraphDbURL('http://localhost:7474/db/data/')
    j.connectToDatabase()

    root_nodes = j.runGremlinQuery('queryNodeIndex("type:FunctionDef")')
    all_ast_nodes = j.runGremlinQuery(
        'queryNodeIndex("type:FunctionDef").astNodes()')

    # Parents are looked up only for nodes that are serialised; records are
    # collected and joined once instead of growing a string in the loop.
    records = []
    for node in all_ast_nodes:
        if node in root_nodes:
            continue
        records.append(getStringForNode(node, queryParent(j, node._id)))
    all_nodes_string = "".join(records)

    # At least 57 slots as before, but never fewer than the known types.
    ast_features = [0] * max(57, len(global_node_types))
    for ast_node in all_ast_nodes:
        node_type = ast_node.properties['type']
        for index, known_type in enumerate(global_node_types):
            if known_type == node_type:
                ast_features[index] += 1

    return all_nodes_string, ast_features
class DBContentsProvider:
    """Issue Gremlin queries through a ``JoernSteps`` connection.

    Each public method builds one query string, runs it with
    ``self.j.runGremlinQuery`` and either discards the results
    (``generate*``) or returns them as a list (``get*``).

    Arguments are interpolated into the query text verbatim, so callers
    must only pass trusted values.
    """

    def __init__(self):
        self._initDatabaseConnection()

    def _initDatabaseConnection(self):
        """Create the JoernSteps client, connect, and register ``steps/``."""
        self.j = JoernSteps()
        self.j.connectToDatabase()
        self.j.addStepsDir('steps/')

    def _runAndDiscard(self, query):
        """Run ``query`` and iterate over every result without keeping it."""
        for _ in self.j.runGremlinQuery(query):
            pass

    def _runScattered(self, call):
        """Run ``call`` inside ``_().transform{ ... }.scatter()``.

        Returns the query results as a list.
        """
        query = """_().transform{ %s }.scatter() """ % (call,)
        return list(self.j.runGremlinQuery(query))

    def generate(self, selector):
        """
        Generate contents for a given selector, overwriting
        the contents currently held in cndToQueries memory by the server.
        """
        # A one-element tuple keeps '%' from unpacking a tuple argument.
        query = """generateTaintLearnStructures(%s.id.toList()) _()""" % (selector,)
        self._runAndDiscard(query)

    def generateChecksForInvocations(self, invocs):
        """Run ``generateChecksForInvocations`` for ``invocs``; no result."""
        query = """generateChecksForInvocations(%s.toList()) _()""" % (invocs,)
        self._runAndDiscard(query)

    # Source Analysis

    def getSourceAPISymbols(self):
        """Return the results of ``getSourceAPISymbols()`` as a list."""
        return self._runScattered("getSourceAPISymbols()")

    def getAllDefStmtsPerArg(self):
        """Return the results of ``getAllDefStmtsPerArg()`` as a list."""
        return self._runScattered("getAllDefStmtsPerArg()")

    # Condition Analysis

    def getAllChecksPerArg(self):
        """Return the results of ``getAllChecksPerArg()`` as a list."""
        return self._runScattered("getAllChecksPerArg()")

    def getAllConditions(self):
        """Return the results of ``getAllConditions()`` as a list."""
        return self._runScattered("getAllConditions()")

    def getAllConditionsCode(self):
        """Return the results of ``getAllConditionsCode()`` as a list."""
        return self._runScattered("getAllConditionsCode()")

    def getInvocationCallSiteIds(self):
        """Return the results of ``getInvocationCallSites()`` as a list."""
        return self._runScattered("getInvocationCallSites()")

    def getSubConditions(self, nodeId):
        """Return the results of ``subConditions(nodeId)`` as a list."""
        return self._runScattered("subConditions(%s)" % (nodeId,))

    def getAllCndFeatureVectors(self, invocs=None, argNum=None):
        """Return condition feature vectors as a list.

        Args:
            invocs: when truthy, the query is restricted to these
                invocations via ``getCndFeatureVectorsForInvocs``;
                otherwise ``getAllCndFeatureVectors`` is used.
            argNum: optional integer passed as the last query argument.
                ``0`` is a valid value; only ``None`` omits it.
        """
        if invocs:
            if argNum is not None:
                call = "getCndFeatureVectorsForInvocs(%s, %d)" % (invocs, argNum)
            else:
                call = "getCndFeatureVectorsForInvocs(%s)" % (invocs,)
        elif argNum is not None:
            call = "getAllCndFeatureVectors(%d)" % (argNum,)
        else:
            call = "getAllCndFeatureVectors()"
        return self._runScattered(call)

    def getAllASTNodeLabels(self):
        """Return the results of ``getAllASTNodeLabels()`` as a list."""
        return self._runScattered("getAllASTNodeLabels()")

    # Choosing sinks

    def getControlledSinks(self, nodeId):
        """Return the results of ``getControlledSinks(nodeId)`` as a list."""
        return self._runScattered("getControlledSinks(%s)" % (nodeId,))