def getVisibleNodes(projectName):
    """Return the ids of all vertices of a "visible" type in a project.

    A vertex is selected when its 'type' property is one of the visible
    statement types listed below and none of the vertices reachable by
    repeatedly following incoming AST_EDGE edges has the type 'ForInit'
    or 'StructUnionEnum'.

    :param projectName: name of the project database to connect to.
    :returns: whatever ``DBInterface.runGremlinQuery`` returns for the
        id query.
    """
    visibleStatementTypes = [
        'CustomNode', 'ClassDef', 'DeclByClass', 'DeclByType', 'FunctionDef',
        'CompoundStatement', 'Statement', 'DeclStmt', 'StructUnionEnum',
        'FunctionPointerDeclare', 'TryStatement', 'CatchStatement',
        'IfStatement', 'ElseStatement', 'SwitchStatement', 'ForStatement',
        'DoStatement', 'WhileStatement', 'BreakStatement',
        'ContinueStatement', 'GotoStatement', 'Label', 'ReturnStatement',
        'ThrowStatement', 'ExpressionStatement', 'IdentifierDeclStatement',
        'PreIfStatement', 'PreElIfStatement', 'PreElseStatement',
        'PreEndIfStatement', 'PreDefine', 'PreUndef', 'PreDiagnostic',
        'PreOther', 'PreInclude', 'PreIncludeNext', 'PreLine', 'PrePragma',
        'UsingDirective', 'BlockCloser', 'Comment', 'File', 'Directory',
    ]

    db = DBInterface()
    db.connectToDatabase(projectName)

    # Remove unneeded nodes (we need to exclude IdentifierDeclStatement that
    # have a ForInit or StructUnionEnum as parent).
    # The Python list is interpolated via its repr, i.e. as "['A', 'B', ...]".
    query = (
        "g.V().has('type', within(%s))"
        " .not(__.repeat(__.in(AST_EDGE)).emit().has('type', within('ForInit','StructUnionEnum')))"
        " .id()"
    ) % (visibleStatementTypes,)

    try:
        result = db.runGremlinQuery(query)
    finally:
        # Always close the db connection and release the shell, even when
        # the query itself fails.
        db.runGremlinQuery("quit")
    return result
class StartTool(CmdLineTool):
    """Command-line tool that sends one query to the database.

    Subclasses provide the query through ``_constructQuery`` and consume
    the answer through ``_handleResult``.
    """

    def __init__(self, DESCRIPTION):
        CmdLineTool.__init__(self, DESCRIPTION)

    # @Override
    def _constructQuery(self):
        """
        Build, from the arguments, the query that is passed to the database.
        """

    # @Override
    def _handleResult(self, res):
        """
        Consume the result that the database returned for the query.
        """

    def _runImpl(self):
        """Build the query, connect, run the query and hand over the result."""
        # The query is built before any connection is opened.
        queryText = self._constructQuery()

        interface = DBInterface()
        self.dbInterface = interface
        interface.connectToDatabase()

        self._handleResult(interface.runGremlinQuery(queryText))
class StartTool(CmdLineTool):
    """Base class for tools that run exactly one database query.

    ``_constructQuery`` and ``_handleResult`` are the two hooks meant to
    be overridden; ``_runImpl`` wires them together.
    """

    def __init__(self, DESCRIPTION):
        CmdLineTool.__init__(self, DESCRIPTION)

    # @Override
    def _constructQuery(self):
        """
        Produce the query (derived from the arguments) to send to the database.
        """
        return None

    # @Override
    def _handleResult(self, res):
        """
        Handle the outcome of the query.
        """
        return None

    def _runImpl(self):
        """Run the tool: construct the query, connect, execute, dispatch."""
        builtQuery = self._constructQuery()

        # The interface is stored on the instance as ``dbInterface``.
        self.dbInterface = DBInterface()
        self.dbInterface.connectToDatabase()

        outcome = self.dbInterface.runGremlinQuery(builtQuery)
        self._handleResult(outcome)
class ChunkStartTool(CmdLineTool):
    """Command-line tool that processes a set of ids chunk by chunk.

    ``_runImpl`` first runs the query from ``_constructIdQuery`` to obtain
    ids, splits them with ``DBInterface.chunks`` into pieces of
    ``CHUNK_SIZE``, and for each piece runs the query from
    ``_constructQueryForChunk`` and passes the result to
    ``_handleChunkResult``. ``_start`` and ``_stop`` bracket the whole run.
    """

    def __init__(self, DESCRIPTION):
        CmdLineTool.__init__(self, DESCRIPTION)

    # @Override
    def _constructIdQuery(self):
        """Return the query whose result is the list of ids to process."""
        pass

    # @Override
    def _constructQueryForChunk(self, chunk):
        """Return the query to run for one chunk of ids."""
        pass

    # @Override
    def handleChunkResult(self, res, chunk):
        """Process the result ``res`` obtained for ``chunk`` (legacy name)."""
        pass

    # @Override
    def _handleChunkResult(self, res, chunk):
        """Process the result ``res`` obtained for ``chunk``.

        ``_runImpl`` calls this name. Previously only ``handleChunkResult``
        was defined, so the base class raised AttributeError. The default
        delegates to ``handleChunkResult`` so that subclasses overriding
        either name keep working.
        """
        return self.handleChunkResult(res, chunk)

    # @Override
    def _start(self):
        """Hook called once, after connecting and before the id query."""
        pass

    def _stop(self):
        """Hook called once, after the last chunk has been handled."""
        pass

    def _runImpl(self):
        """Connect to the database and process all ids in chunks."""
        self.dbInterface = DBInterface()
        self.dbInterface.connectToDatabase()

        self._start()

        query = self._constructIdQuery()
        ids = self.dbInterface.runGremlinQuery(query)

        for chunk in self.dbInterface.chunks(ids, CHUNK_SIZE):
            query = self._constructQueryForChunk(chunk)
            res = self.dbInterface.runGremlinQuery(query)
            self._handleChunkResult(res, chunk)

        self._stop()
def run(self):
    """Read a script, run it as one Gremlin query and print each result.

    The script is read from ``self.args.file`` when that is set, otherwise
    from standard input. Its lines, as returned by ``_parseScript``, are
    joined with newlines to form the query. Each element of the result is
    printed as ``repr`` (``--raw``), pretty-printed (``--pretty``), or
    printed plainly. The database shell is always released with "quit".
    """
    if self.args.file is not None:
        # Join inside the ``with`` so the script is fully consumed before
        # the file is closed (``_parseScript`` might be lazy).
        with open(self.args.file, "r") as f:
            query = "\n".join(__class__._parseScript(f))
    else:
        # stdin is not owned by this method, so it is left open.
        query = "\n".join(__class__._parseScript(sys.stdin))

    db = DBInterface()
    if self.args.no_json:
        db.disable_json()
    db.connectToDatabase(self.args.project)

    try:
        result = db.runGremlinQuery(query)
        pp = pprint.PrettyPrinter(indent=4, compact=True)
        for x in result:
            if self.args.raw:
                print(repr(x))
            elif self.args.pretty:
                pp.pprint(x)
            else:
                print(x)
    finally:
        # Release the shell even if the query or the printing fails.
        db.runGremlinQuery("quit")
# List of all types that can use identifiers to do something (sorted by declarations) #Function: FunctionDef and CallExpression (need parent ExpressionStatement). Declares. #Macro: MacroDef and Callee or enywhere where we can identify a preMacroIdentifer? #Declares: ? #Enum:? prefix = "semanticUnit__" print("Adding prefixes...") # Connect to SU projectfile:///C:/Users/Lea/git/Joern_Advanced/testProjects/Collection/Plot.png db = DBInterface() db.connectToDatabase("EvoDiss.tar.gz") # Get the names of all functions query = """g.V().has('type', 'FunctionDef').out('IS_AST_PARENT').has('type', 'Identifier').values('code').as('function')""" functions = db.runGremlinQuery(query) # Get the names of all macros query = """g.V().has('type', 'PreDefine').out('IS_AST_PARENT').has('type','PreMacroIdentifier').values('code').as('macro')""" macros = db.runGremlinQuery(query) # Get the names of all declarations that can be declared on file scope query = """g.V().has('type', 'DeclStmt').out('DECLARES').has('type', 'Decl').values('identifier').as('declaration')""" declarations = db.runGremlinQuery(query) # Get the names of all StructUnionEnums query = """g.V().has('type', 'StructUnionEnum').out('IS_AST_PARENT').has('type', 'identifier').values('code').as('enum')""" enums = db.runGremlinQuery(query) functionResults = [] # Change the name of all FunctionDefs, CallExpressions and Declares of the respective function
class APIEmbedder(object):
    """Embed functions by their API symbols and write a LIBSVM file.

    ``run`` creates the output directory, fetches ``[funcId, symbols]``
    pairs from the database in chunks, records the function ids in a
    'TOC' file, builds a term-document matrix from the symbols and writes
    it to 'embedding.libsvm' inside the output directory.
    """

    def __init__(self):
        self._initializeDBConnection()

    def _initializeDBConnection(self):
        """Create the database interface (no connection is opened yet)."""
        self.dbInterface = DBInterface()

    def setOutputDirectory(self, directory):
        """Set the directory that ``run`` creates and writes into."""
        self.outputDirectory = directory

    def run(self, tfidf=True):
        """Produce the embedding; do nothing if the output directory exists.

        :param tfidf: when true, ``tfidf()`` is applied to the
            term-document matrix before it is written.
        """
        try:
            # Raises if the output directory already exists or cannot be
            # set up; in that case the run is silently skipped.
            self._initializeOutputDirectory()
        except (IOError, OSError):
            return

        try:
            self._connectToDatabase()
            functions = self._getAPISymbolsFromDatabase()
            featureArray = self._createFeatureArray(functions)
        finally:
            # Close the TOC file even when the database step fails.
            self._finalizeOutputDirectory()

        self.termDocMatrix = self._createTermDocumentMatrix(featureArray)
        if tfidf:
            self.termDocMatrix.tfidf()
        self._outputInLIBSVMFormat(self.outputDirectory)

    def _connectToDatabase(self):
        self.dbInterface.connectToDatabase()

    def _initializeOutputDirectory(self):
        """Create the output directory and open its 'TOC' file for writing.

        :raises OSError: if the output directory already exists.
        """
        directory = self.outputDirectory
        if os.path.exists(directory):
            # The original used a bare ``raise`` with no active exception.
            raise OSError("Output directory already exists: %s" % directory)

        os.makedirs(directory)
        self.tocFilename = os.path.join(directory, 'TOC')
        self.toc = open(self.tocFilename, 'w')

    def _finalizeOutputDirectory(self):
        self.toc.close()

    def _getAPISymbolsFromDatabase(self):
        """Return the concatenated per-chunk results for all function ids."""
        CHUNK_SIZE = 1024
        query = """queryNodeIndex('type:Function').id"""
        functionIds = self._runGremlinQuery(query)

        result = []
        for chunk in self.chunks(functionIds, CHUNK_SIZE):
            query = (
                " _().transform{ %s }.scatter().transform{g.v(it)}"
                " .sideEffect{funcId = it.id}"
                " .transform{ [funcId, it.functionToAPISymbolNodes().code.toList()] } "
            ) % (str(chunk))
            result.extend(self._runGremlinQuery(query))
        return result

    def chunks(self, l, n):
        """Yield successive slices of ``l`` of length ``n`` (last may be shorter)."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def _runGremlinQuery(self, query):
        return self.dbInterface.runGremlinQuery(query)

    def _createFeatureArray(self, functions):
        """Build a FeatureArray from ``(funcId, symbols)`` pairs.

        Each symbol gets a trailing newline (the lists are updated in
        place, as before) and each function id is appended to the TOC.
        """
        featureArray = FeatureArray()
        for index, (funcId, symbols) in enumerate(functions):
            symbols[:] = [symbol + '\n' for symbol in symbols]
            featureArray.add(index, symbols)  # label, items
            self.toc.write("%d\n" % (funcId))
            self.toc.flush()
        return featureArray

    def _createTermDocumentMatrix(self, featureArray):
        converter = FeatureArrayToMatrix()
        return converter.convertFeatureArray(featureArray)

    def _outputInLIBSVMFormat(self, directory):
        """Write one line per matrix column to 'embedding.libsvm'.

        Each line has the form ``<label> <row>:<value> ... #<label>``, where
        the label comes from ``termDocMatrix.index2Doc``. Nothing is
        written when the matrix is ``None``.
        """
        # ``is None`` rather than ``== None``: comparing an array or sparse
        # matrix with ``==`` is elementwise and not a valid truth test.
        if self.termDocMatrix.matrix is None:
            return

        from scipy.sparse import csc_matrix

        m = csc_matrix(self.termDocMatrix.matrix)
        nCols = m.shape[1]
        outFilename = os.path.join(directory, 'embedding.libsvm')

        with open(outFilename, 'w') as outFile:
            for colIndex in range(nCols):
                label = self.termDocMatrix.index2Doc[colIndex]
                col = m.getcol(colIndex)
                entries = sorted(
                    (rowIndex, col[rowIndex, 0]) for rowIndex in col.indices
                )
                features = " ".join('%d:%f' % e for e in entries)
                outFile.write('%s %s #%s\n' % (label, features, label))
#!/usr/bin/env python3 from octopus.server.DBInterface import DBInterface projectName = 'android.tar.gz' query = "queryNodeIndex('type:Function').id" db = DBInterface() db.connectToDatabase(projectName) ids = db.runGremlinQuery(query) CHUNK_SIZE = 256 LOCATION = '/home/sid/RABBIT_HOLE/CODE_ANALYSIS/joern/projects/octopus/data/projects/' for chunk in db.chunks(ids, CHUNK_SIZE): query = """ getCallsToRegex(".*read(Int|Uint)(32|64)") .statements() .out("REACHES") .has("code",textRegex(".*(malloc|memcpy).*")) .functions() .functionToLocationStr() """ query2 = """ getNodesWithTypeAndName(TYPE_FUNCTION, '*onTransact*') .out(FUNCTION_TO_AST_EDGE) .getArguments('(memcpy OR malloc)', '2') .out(USES_EDGE) .filter{
) )""" % (1, "functionName[0]", "functionName[0]", "functionName[0]") # Go to parent file # Follow IS_HEADER_OF to C file # 1. Look in AST children for functionDef with same functionName # 2. Get the include statement for the header file query = """g.V(%s).in('IS_FILE_OF').out('IS_HEADER_OF').union( __.out('IS_FILE_OF').has('type', 'Function').has('code', textContains('%s')).out('IS_FUNCTION_OF_AST'), __.out('IS_FILE_OF').has('type', 'PreInclude').has('code', textContains('%s')) ).id()""" % (184448, "bubblesort", "C.h") query = """g.V().has('type', 'MacroCall') """ # Execute equery result = db.runGremlinQuery(query) # Print results for x in result: print(x) ############################################################################################################ ##### NOT WORKING OR PROBLEMS ############################################################################################################ # Empty, because you need the full qualified name query = """getFunctionsByFilename("C.c")""" # Another regex problem query = """getCallsToRegex("bubblesor*").values('code')""" # No signature of method: org.apache.tinkerpop.gremlin.process.traversal.traverser.O_Traverser.codeContains() is applicable for argument types: (java.lang.String) values: [bubblesort] query = """g.V().sideEffect{it.codeContains("bubblesort");}"""
#!/usr/bin/env python3
from octopus.server.DBInterface import DBInterface

# Connect to project DB
projectName = 'EvoDiss.tar.gz'
db = DBInterface()
db.connectToDatabase(projectName)

# Each entry of ``result`` is a "# <query result> <description>" line.
result = []

# Directory and file vertices, looked up by their full path
query = "g.V().has('type', 'Directory').has('code', '/home/lea/Downloads/Joern_Advanced/projects/octopus/data/projects/EvoDiss.tar.gz/src/home/lea/Downloads/EvoDiss/src').id()"
result.append("# " + str(db.runGremlinQuery(query)) + " Directory src")

query = "g.V().has('type', 'File').has('code', '/home/lea/Downloads/Joern_Advanced/projects/octopus/data/projects/EvoDiss.tar.gz/src/home/lea/Downloads/EvoDiss/src/C_Test.c').id()"
result.append("# " + str(db.runGremlinQuery(query)) + " File C_Test.c")

# The FunctionDef vertex whose code contains 'compareResults'
query = "g.V().has('type', 'FunctionDef').has('code', textContains('compareResults')).id()"
result.append("# " + str(db.runGremlinQuery(query)) + " FunctionDef compareResults")

# functionId of that function (first hit); the name ``id`` is kept as-is
# because code outside this excerpt may rely on it.
query = "g.V().has('type', 'FunctionDef').has('code', textContains('compareResults')).values('functionId')"
id = db.runGremlinQuery(query)[0]

# Statement vertices that belong to compareResults, one query per type
for nodeType, label in (
    ('IfStatement', 'IfStatements'),
    ('ElseStatement', 'ElseStatement'),
    ('ForStatement', 'ForStatement'),
    ('Condition', 'Conditions'),
):
    query = "g.V().has('type', '%s').has('functionId', '%s').id()" % (nodeType, id)
    result.append("# " + str(db.runGremlinQuery(query)) + " " + label + " in compareResults")
class APIEmbedder(object):
    """Write the API symbols of each function to disk and embed them.

    ``run`` creates ``<outputDirectory>/data`` with one file per function
    (named 0, 1, 2, ...) holding one symbol per line, records the function
    ids in ``<outputDirectory>/TOC`` and finally calls
    ``Embedder.embed`` on the output directory.
    """

    def __init__(self):
        self._initializeDBConnection()

    def _initializeDBConnection(self):
        """Create the database interface (no connection is opened yet)."""
        self.dbInterface = DBInterface()

    def setOutputDirectory(self, directory):
        """Set the directory that ``run`` creates and writes into."""
        self.outputDirectory = directory

    def run(self):
        """Produce the embedding; do nothing if the output directory exists."""
        try:
            # Raises if the output directory already exists or cannot be
            # set up; in that case the run is silently skipped.
            self._initializeOutputDirectory()
        except (IOError, OSError):
            return

        try:
            self._connectToDatabase()
            functions = self._getAPISymbolsFromDatabase()
            self._writeDataPoints(functions)
        finally:
            # Close the TOC file even when the database step fails.
            self._finalizeOutputDirectory()

        self._embed()

    def _embed(self):
        # self.embedder = SallyBasedEmbedder()
        self.embedder = Embedder()
        self.embedder.embed(self.outputDirectory)

    def _connectToDatabase(self):
        self.dbInterface.connectToDatabase()

    def _writeDataPoints(self, functions):
        """Write a TOC line and a data-point file per ``(funcId, symbols)``."""
        for (funcId, symbols) in functions:
            self.toc.write("%d\n" % (funcId))
            self._addDataPoint(symbols)

    def _addDataPoint(self, symbols):
        """Write ``symbols`` one per line to the next numbered data file."""
        datapointFilename = os.path.join(self.dataDir, str(self.curDatapoint))
        # ``open`` instead of the Python-2-only ``file`` builtin; the
        # ``with`` block closes the handle even if writing fails.
        with open(datapointFilename, 'w') as f:
            f.writelines([x + "\n" for x in symbols])
        self.curDatapoint += 1

    def _initializeOutputDirectory(self):
        """Create the output tree and open its 'TOC' file for writing.

        :raises OSError: if the output directory already exists.
        """
        directory = self.outputDirectory
        if os.path.exists(directory):
            # The original used a bare ``raise`` with no active exception.
            raise OSError("Output directory already exists: %s" % directory)

        self.dataDir = os.path.join(directory, 'data')
        self.tocFilename = os.path.join(directory, 'TOC')
        os.makedirs(self.dataDir)
        self.toc = open(self.tocFilename, 'w')

        self.curDatapoint = 0

    def _finalizeOutputDirectory(self):
        self.toc.close()

    def _getAPISymbolsFromDatabase(self):
        """Return the concatenated per-chunk results for all function ids."""
        CHUNK_SIZE = 1024
        query = """queryNodeIndex('type:Function').id"""
        functionIds = self._runGremlinQuery(query)

        result = []
        for chunk in self.chunks(functionIds, CHUNK_SIZE):
            query = (
                " _().transform{ %s }.scatter().transform{g.v(it)}"
                " .sideEffect{funcId = it.id}"
                " .transform{ [funcId, it.functionToAPISymbolNodes().code.toList()] } "
            ) % (str(chunk))
            result.extend(self._runGremlinQuery(query))
        return result

    def chunks(self, l, n):
        """Yield successive slices of ``l`` of length ``n`` (last may be shorter)."""
        # ``range`` works on Python 2 and 3; ``xrange`` exists only on 2.
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def _runGremlinQuery(self, query):
        return self.dbInterface.runGremlinQuery(query)
def output(G):
    """Write ``G`` as 'DB/Complete.dot' and render it to 'DB/Complete.png'.

    The dot file consists of a '// DB' header line, ``str(G)`` and a
    '//###' trailer line. The 'DB' directory is created when missing and
    the PNG is produced by invoking the external ``dot`` program.

    :param G: object whose ``str()`` is the graph description to write.
    """
    # Formatting
    outputString = '// DB \n' + str(G) + '\n' + '//###' + '\n'

    # Create DB folder (if it is not already there)
    if not os.path.exists("DB"):
        os.makedirs("DB")

    filename = 'Complete.dot'

    # Write to file; ``with`` closes the handle even if the write fails
    print("Creating "+filename+" ...")
    with open("DB/"+filename, 'w') as dotFile:
        dotFile.write(outputString)

    # Use terminal output to convert .dot to .png
    status = os.system("dot -Tpng 'DB/"+filename+"' -o 'DB/Complete.png'")

    # Print status update; only claim success when dot actually succeeded
    if status == 0:
        print("Creation of plot was successfull!")
    else:
        print("Creation of plot failed (dot exited with status %s)" % status)


####################################### Plotting ###############################################
# Plot resulting graph
plotResults()

# Finally close db connection and release the shell
db.runGremlinQuery("quit")