class StartTool(CmdLineTool):
    def __init__(self, DESCRIPTION):
        CmdLineTool.__init__(self, DESCRIPTION)

    # @Override
    def _constructQuery(self):
        """Create a query from the arguments that will be passed to the database."""
        pass

    # @Override
    def _handleResult(self, res):
        """Process the result of the query."""
        pass

    def _runImpl(self):
        query = self._constructQuery()
        self.dbInterface = DBInterface()
        self.dbInterface.connectToDatabase()
        res = self.dbInterface.runGremlinQuery(query)
        self._handleResult(res)
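# Hypothetical usage sketch (not part of the original tool): a StartTool
# subclass that prints the code of all function nodes. The class name is
# illustrative; the query is taken from the query collection further below.
class ListFunctionCodeTool(StartTool):
    def _constructQuery(self):
        return "g.V().has('type', 'Function').values('code')"

    def _handleResult(self, res):
        for code in res:
            print(code)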
def initialize():
    # Get the ids from the SemanticUnit (the first line is the project name)
    idList = [line.rstrip('\n') for line in open('result.txt')]
    # Connect to the project DB
    projectName = idList.pop(0)
    db = DBInterface()
    db.connectToDatabase(projectName)
    return [db, idList]
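# Usage sketch (purely illustrative; assumes a result.txt written by the
# SemanticUnit step as described above):
db, idList = initialize()
print("Loaded %d node ids for the current project" % len(idList))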
def getVisibleNodes(projectName):
    db = DBInterface()
    db.connectToDatabase(projectName)
    visibleStatementTypes = ['CustomNode', 'ClassDef', 'DeclByClass', 'DeclByType', 'FunctionDef',
                             'CompoundStatement', 'DeclStmt', 'StructUnionEnum', 'TryStatement',
                             'CatchStatement', 'IfStatement', 'ElseStatement', 'SwitchStatement',
                             'ForStatement', 'DoStatement', 'WhileStatement', 'BreakStatement',
                             'ContinueStatement', 'GotoStatement', 'Label', 'ReturnStatement',
                             'ThrowStatement', 'ExpressionStatement', 'IdentifierDeclStatement',
                             'PreIfStatement', 'PreElIfStatement', 'PreElseStatement',
                             'PreEndIfStatement', 'PreDefine', 'PreUndef', 'PreDiagnostic',
                             'PreOther', 'PreInclude', 'PreIncludeNext', 'PreLine', 'PrePragma',
                             'UsingDirective', 'BlockCloser', 'Comment']
    # Remove unneeded nodes: exclude IdentifierDeclStatements that have a ForInit
    # or StructUnionEnum as parent
    query = """g.V().has('type', within(%s))
        .not(has('type', 'IdentifierDeclStatement').in(AST_EDGE).has('type', within('ForInit','StructUnionEnum')))
        .id()""" % (visibleStatementTypes)
    result = db.runGremlinQuery(query)
    return result
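# Usage sketch (the project name is a placeholder borrowed from other snippets
# in this collection):
if __name__ == '__main__':
    nodeIds = getVisibleNodes('EvoDiss.tar.gz')
    print("Found %d visible statement nodes" % len(nodeIds))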
class ChunkStartTool(CmdLineTool):
    def __init__(self, DESCRIPTION):
        CmdLineTool.__init__(self, DESCRIPTION)

    # @Override
    def _constructIdQuery(self):
        pass

    # @Override
    def _constructQueryForChunk(self, chunk):
        pass

    # @Override
    def _handleChunkResult(self, res, chunk):
        pass

    # @Override
    def _start(self):
        pass

    def _stop(self):
        pass

    def _runImpl(self):
        self.dbInterface = DBInterface()
        self.dbInterface.connectToDatabase()
        self._start()
        query = self._constructIdQuery()
        ids = self.dbInterface.runGremlinQuery(query)
        for chunk in self.dbInterface.chunks(ids, CHUNK_SIZE):
            query = self._constructQueryForChunk(chunk)
            res = self.dbInterface.runGremlinQuery(query)
            self._handleChunkResult(res, chunk)
        self._stop()
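# Hypothetical sketch of a ChunkStartTool subclass (class name and queries are
# illustrative, not part of the original code): it fetches all function ids and
# prints their code chunk by chunk.
class FunctionCodeChunkTool(ChunkStartTool):
    def _constructIdQuery(self):
        return "g.V().has('type', 'Function').id()"

    def _constructQueryForChunk(self, chunk):
        return "g.V(%s).values('code')" % (', '.join(str(i) for i in chunk))

    def _handleChunkResult(self, res, chunk):
        for code in res:
            print(code)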
def run(self):
    if self.args.file is not None:
        f = open(self.args.file, "r")
    else:
        f = sys.stdin

    lines = __class__._parseScript(f)
    query = "\n".join(lines)

    db = DBInterface()
    if self.args.no_json:
        db.disable_json()
    db.connectToDatabase(self.args.project)
    result = db.runGremlinQuery(query)

    pp = pprint.PrettyPrinter(indent=4, compact=True)
    for x in result:
        if self.args.raw:
            print(repr(x))
        elif self.args.pretty:
            pp.pprint(x)
        else:
            print(x)

    db.runGremlinQuery("quit")
class APIEmbedder(object):
    def __init__(self):
        self._initializeDBConnection()

    def _initializeDBConnection(self):
        self.dbInterface = DBInterface()

    def setOutputDirectory(self, directory):
        self.outputDirectory = directory

    def run(self, tfidf=True):
        try:
            # Will throw an error if the output directory already exists
            self._initializeOutputDirectory()
        except:
            return
        self._connectToDatabase()
        functions = self._getAPISymbolsFromDatabase()
        featureArray = self._createFeatureArray(functions)
        self._finalizeOutputDirectory()
        self.termDocMatrix = self._createTermDocumentMatrix(featureArray)
        if tfidf:
            self.termDocMatrix.tfidf()
        self._outputInLIBSVMFormat(self.outputDirectory)

    def _connectToDatabase(self):
        self.dbInterface.connectToDatabase()

    def _initializeOutputDirectory(self):
        directory = self.outputDirectory
        if os.path.exists(directory):
            raise OSError('output directory %s already exists' % directory)
        os.makedirs(directory)
        self.tocFilename = os.path.join(directory, 'TOC')
        self.toc = open(self.tocFilename, 'w')

    def _finalizeOutputDirectory(self):
        self.toc.close()

    def _getAPISymbolsFromDatabase(self):
        CHUNK_SIZE = 1024
        query = """queryNodeIndex('type:Function').id"""
        functionIds = self._runGremlinQuery(query)

        result = []
        for chunk in self.chunks(functionIds, CHUNK_SIZE):
            query = """
            _().transform{ %s }.scatter().transform{g.v(it)}
            .sideEffect{funcId = it.id}
            .transform{ [funcId, it.functionToAPISymbolNodes().code.toList()] }
            """ % (str(chunk))
            result.extend(self._runGremlinQuery(query))
        return result

    def chunks(self, l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def _runGremlinQuery(self, query):
        return self.dbInterface.runGremlinQuery(query)

    def _createFeatureArray(self, functions):
        featureArray = FeatureArray()
        for index, (funcId, symbols) in enumerate(functions):
            for i in range(len(symbols)):
                symbols[i] = symbols[i] + '\n'
            featureArray.add(index, symbols)  # label, items
            self.toc.write("%d\n" % (funcId))
            self.toc.flush()
        return featureArray

    def _createTermDocumentMatrix(self, featureArray):
        converter = FeatureArrayToMatrix()
        return converter.convertFeatureArray(featureArray)

    def _outputInLIBSVMFormat(self, directory):
        from scipy.sparse import csc_matrix

        if self.termDocMatrix.matrix is None:
            return

        m = csc_matrix(self.termDocMatrix.matrix)
        nCols = m.shape[1]

        outFilename = os.path.join(directory, 'embedding.libsvm')
        outFile = open(outFilename, 'w')
        for i in range(nCols):
            label = self.termDocMatrix.index2Doc[i]
            col = m.getcol(i)
            entries = [(j, col[j, 0]) for j in col.indices]
            entries.sort()
            features = " ".join(['%d:%f' % e for e in entries])
            row = '%s %s #%s\n' % (label, features, label)
            outFile.write(row)
        outFile.close()
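# Usage sketch (the output directory name is a placeholder; assumes the
# APIEmbedder class above together with its imports, e.g. os, FeatureArray,
# FeatureArrayToMatrix and DBInterface):
embedder = APIEmbedder()
embedder.setOutputDirectory('api_embedding')
embedder.run(tfidf=True)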
#!/usr/bin/env python3

from octopus.server.DBInterface import DBInterface

projectName = 'android.tar.gz'
query = "queryNodeIndex('type:Function').id"

db = DBInterface()
db.connectToDatabase(projectName)
ids = db.runGremlinQuery(query)

CHUNK_SIZE = 256
LOCATION = '/home/sid/RABBIT_HOLE/CODE_ANALYSIS/joern/projects/octopus/data/projects/'

for chunk in db.chunks(ids, CHUNK_SIZE):
    query = """
    getCallsToRegex(".*read(Int|Uint)(32|64)")
        .statements()
        .out("REACHES")
        .has("code", textRegex(".*(malloc|memcpy).*"))
        .functions()
        .functionToLocationStr()
    """
    query2 = """
    getNodesWithTypeAndName(TYPE_FUNCTION, '*onTransact*')
        .out(FUNCTION_TO_AST_EDGE)
        .getArguments('(memcpy OR malloc)', '2')
        .out(USES_EDGE)
        .filter{
class APIEmbedder(object):
    def __init__(self):
        self._initializeDBConnection()

    def _initializeDBConnection(self):
        self.dbInterface = DBInterface()

    def setOutputDirectory(self, directory):
        self.outputDirectory = directory

    def run(self):
        try:
            # Will throw an error if the output directory already exists
            self._initializeOutputDirectory()
        except:
            return
        self._connectToDatabase()
        functions = self._getAPISymbolsFromDatabase()
        self._writeDataPoints(functions)
        self._finalizeOutputDirectory()
        self._embed()

    def _embed(self):
        # self.embedder = SallyBasedEmbedder()
        self.embedder = Embedder()
        self.embedder.embed(self.outputDirectory)

    def _connectToDatabase(self):
        self.dbInterface.connectToDatabase()

    def _writeDataPoints(self, functions):
        for (funcId, symbols) in functions:
            self.toc.write("%d\n" % (funcId))
            self._addDataPoint(symbols)

    def _addDataPoint(self, symbols):
        datapointFilename = os.path.join(self.dataDir, str(self.curDatapoint))
        f = open(datapointFilename, 'w')
        f.writelines([x + "\n" for x in symbols])
        f.close()
        self.curDatapoint += 1

    def _initializeOutputDirectory(self):
        directory = self.outputDirectory
        if os.path.exists(directory):
            raise OSError('output directory %s already exists' % directory)
        self.dataDir = os.path.join(directory, 'data')
        self.tocFilename = os.path.join(directory, 'TOC')
        os.makedirs(self.dataDir)
        self.toc = open(self.tocFilename, 'w')
        self.curDatapoint = 0

    def _finalizeOutputDirectory(self):
        self.toc.close()

    def _getAPISymbolsFromDatabase(self):
        CHUNK_SIZE = 1024
        query = """queryNodeIndex('type:Function').id"""
        functionIds = self._runGremlinQuery(query)

        result = []
        for chunk in self.chunks(functionIds, CHUNK_SIZE):
            query = """
            _().transform{ %s }.scatter().transform{g.v(it)}
            .sideEffect{funcId = it.id}
            .transform{ [funcId, it.functionToAPISymbolNodes().code.toList()] }
            """ % (str(chunk))
            result.extend(self._runGremlinQuery(query))
        return result

    def chunks(self, l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def _runGremlinQuery(self, query):
        return self.dbInterface.runGremlinQuery(query)
# This is a collection of various queries

# Define the target project
#projectName = 'SPLC'
#projectName = 'Linux3'
#projectName = 'EvoDiss.tar.gz'
#projectName = 'JoernTest.tar.gz'
#projectName = 'Linux.tar.gz'
#projectName = 'Collection'
#projectName = 'expat'
#projectName = 'sample'
#projectName = 'PV_Current.tar.gz'
#projectName = 'DonorProject'
projectName = 'Ag'

# Connect to the database of the project
db = DBInterface()
db.connectToDatabase(projectName)

##### Normal Gremlin queries #####

# Get the vertex with id 147512
query = "g.V(147512)"

# Show the code of vertex 4256
query = "g.V(4256).values('code')"

# Show the code of all nodes of type Function
query = "g.V().has('type', 'Function').values('code')"

# Show the code of all file nodes
query = "g.V().has('type', 'File').values('code')"

# Get the ids of all argument vertices
query = "g.V().has('type', 'Argument').id()"

# Get all code vertices of a function
#!/usr/bin/env python3

from octopus.server.DBInterface import DBInterface

# Connect to project DB
projectName = 'EvoDiss.tar.gz'
db = DBInterface()
db.connectToDatabase(projectName)

query = "g.V().has('type', 'Directory').has('code', '/home/lea/Downloads/Joern_Advanced/projects/octopus/data/projects/EvoDiss.tar.gz/src/home/lea/Downloads/EvoDiss/src').id()"
result = ["# " + str(db.runGremlinQuery(query)) + " Directory src"]

query = "g.V().has('type', 'File').has('code', '/home/lea/Downloads/Joern_Advanced/projects/octopus/data/projects/EvoDiss.tar.gz/src/home/lea/Downloads/EvoDiss/src/C_Test.c').id()"
result.append("# " + str(db.runGremlinQuery(query)) + " File C_Test.c")

query = "g.V().has('type', 'FunctionDef').has('code', textContains('compareResults')).id()"
result.append("# " + str(db.runGremlinQuery(query)) + " FunctionDef compareResults")

query = "g.V().has('type', 'FunctionDef').has('code', textContains('compareResults')).values('functionId')"
id = db.runGremlinQuery(query)[0]

query = "g.V().has('type', 'IfStatement').has('functionId', '%s').id()" % (id)
result.append("# " + str(db.runGremlinQuery(query)) + " IfStatements in compareResults")

query = "g.V().has('type', 'ElseStatement').has('functionId', '%s').id()" % (id)
result.append("# " + str(db.runGremlinQuery(query)) + " ElseStatement in compareResults")

query = "g.V().has('type', 'ForStatement').has('functionId', '%s').id()" % (id)
result.append("# " + str(db.runGremlinQuery(query)) + " ForStatement in compareResults")

query = "g.V().has('type', 'Condition').has('functionId', '%s').id()" % (id)
result.append("# " + str(db.runGremlinQuery(query)) + " Conditions in compareResults")
class ProgramGraph(JoernTool):
    def __init__(self, DESCRIPTION):
        JoernTool.__init__(self, DESCRIPTION)

    # @Override
    def processLine(self, line):
        self.plot_configuration = PlotConfiguration()
        f = open(self.args.plot_config, "r")
        self.plot_configuration.parse(f)

        labels = self._getLabels()
        nodes = self._getNodes(int(line), labels)
        edges = self._getEdges(int(line), labels)

        G = pgv.AGraph(directed=True, strict=False)
        self._addNodes(G, nodes)
        self._addEdges(G, edges)
        self._outputGraph(G, line)

    def streamStart(self):
        self.dbInterface = DBInterface()
        self.dbInterface.connectToDatabase(self.args.project)

    def _addNodes(self, G, nodes):
        for v in nodes:
            nr = NodeResult(v)
            label = self._createGraphElementLabel(
                self.plot_configuration.getElementDisplayItems(nr))
            plot_properties = self.plot_configuration.getElementLayout(nr)
            if label:
                plot_properties['label'] = label
            G.add_node(nr.getId(), **plot_properties)

    def _addEdges(self, G, edges):
        for e in edges:
            er = EdgeResult(e)
            label = self._createGraphElementLabel(
                self.plot_configuration.getElementDisplayItems(er))
            plot_properties = self.plot_configuration.getElementLayout(er)
            plot_properties['label'] = label
            G.add_edge(er.getSrc(), er.getDest(), er.getId(), **plot_properties)

    def _createGraphElementLabel(self, labeldata):
        return "\n".join(
            [":".join([str(self._escape(e)) for e in d]) for d in labeldata])

    def _escape(self, label):
        return str(label).replace("\\", "\\\\")

    def _outputGraph(self, G, identifier):
        outputString = '//' + identifier + '\n'
        outputString += str(G) + '\n'
        outputString += '//###' + '\n'
        self.output(outputString)

    def _getLabels(self):
        labels = ["FLOWS_TO", "USE", "DEF", "IS_AST_PARENT"]
        if self.args.show_all:
            return labels
        if not self.args.show_control_flow:
            labels.remove("FLOWS_TO")
        if not self.args.show_data_flow:
            labels.remove("USE")
            labels.remove("DEF")
        if not self.args.show_ast:
            labels.remove("IS_AST_PARENT")
        return labels

    def _getStartNode(self, functionId):
        if self.args.id_property:
            startnode = """g.V().has('type', 'Function').has('_key', {})""".format(functionId)
        else:
            startnode = """g.V({})""".format(functionId)
        startnode += """.union( out('IS_FUNCTION_OF_CFG'), out('IS_FUNCTION_OF_AST') )"""
        return startnode

    def _getNodes(self, functionId, labels):
        query = """
        {}.repeat(outE({}).subgraph('sg').inV().dedup().simplePath()).cap('sg').next().traversal().V()
        """.format(self._getStartNode(functionId),
                   ','.join(map(lambda x: "'{}'".format(x), labels)))
        return self._runGremlinQuery(query)

    def _getEdges(self, functionId, labels):
        query = """
        {}.repeat(outE({}).subgraph('sg').inV().simplePath()).cap('sg').next().traversal().E()
        """.format(self._getStartNode(functionId),
                   ','.join(map(lambda x: "'{}'".format(x), labels)))
        return self._runGremlinQuery(query)
custom = False
evaluation = False

###############################################

# Connect to project DB
#projectName = 'EvoDiss.tar.gz'
#projectName = 'Revamp'
#projectName = 'JoernTest.tar.gz'
#projectName = 'SPLC'
#projectName = 'ICSE'
#projectName = 'expat'
#projectName = 'sample'
#projectName = 'Collection'
projectName = 'DonorProject'
#projectName = 'PV_Current.tar.gz'
db = DBInterface()

####################################### Plotting ###############################################

result = set()
resultIDs = set()
customStatementTypes = ['CustomNode', 'ClassDef', 'FunctionDef', 'CompoundStatement', 'DeclStmt',
                        'TryStatement', 'CatchStatement', 'IfStatement', 'ElseStatement',
                        'SwitchStatement', 'ForStatement', 'DoStatement', 'WhileStatement',
                        'BreakStatement', 'ContinueStatement', 'GotoStatement', 'Label',
                        'ReturnStatement', 'ThrowStatement', 'ExpressionStatement',
                        'IdentifierDeclStatement', 'PreIfStatement', 'PreElIfStatement',
                        'PreElseStatement', 'PreEndIfStatement', 'PreDefine', 'PreUndef',
                        'MacroCall', 'PreDiagnostic', 'PreOther', 'PreInclude', 'PreIncludeNext',
                        'PreLine', 'PrePragma', 'UsingDirective', 'BlockCloser', 'Symbol',
filename = 'SemanticUnit.dot'

# Write to file
print("Making of the graph finished, creating " + filename + " ...")
print("--------------------------------------------------------------------------------- \n")
file = open("SemanticUnit/SemanticUnit.dot", 'w')
file.write(outputString)
file.close()

# Use the terminal to convert the .dot file to .png
os.system("dot -Tpng 'SemanticUnit/SemanticUnit.dot' -o 'SemanticUnit/SemanticUnit.png'")

# Print status update
print("Creation of the plot was successful!")

################################################### Start of program #################################################################

# Initialize the DB interface
db = DBInterface()

# Input of entry points
if console:
    consoleInput()
else:
    # projectName must be set manually
    db.connectToDatabase(projectName)
    print("Project is set to: " + projectName)

# Start the identification process
identifySemanticUnits()
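# Optional hardening sketch (an assumption, not in the original): the dot -> png
# conversion above could use subprocess instead of os.system, so a missing
# Graphviz 'dot' binary fails loudly rather than silently.
import subprocess
subprocess.run(["dot", "-Tpng", "SemanticUnit/SemanticUnit.dot",
                "-o", "SemanticUnit/SemanticUnit.png"], check=True)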
#File -> #PreDefine -> PreMacroIdentifier -> Identifier
#File -> Function -IS_FUNCTION_OF_AST-> #FunctionDef -> Identifier
#File -> #StructUnionEnum -> Identifier
#File -> #DeclStatement -Declares-> Decl (the first word is the identifier?)

# List of all types that can use identifiers to do something (sorted by declarations)
#Function: FunctionDef and CallExpression (needs parent ExpressionStatement). Declares.
#Macro: MacroDef and Callee, or anywhere else where we can identify a PreMacroIdentifier?
#Declares: ?
#Enum: ?

prefix = "semanticUnit__"
print("Adding prefixes...")

# Connect to SU project
db = DBInterface()
db.connectToDatabase("EvoDiss.tar.gz")

# Get the names of all functions
query = """g.V().has('type', 'FunctionDef').out('IS_AST_PARENT').has('type', 'Identifier').values('code').as('function')"""
functions = db.runGremlinQuery(query)

# Get the names of all macros
query = """g.V().has('type', 'PreDefine').out('IS_AST_PARENT').has('type','PreMacroIdentifier').values('code').as('macro')"""
macros = db.runGremlinQuery(query)

# Get the names of all declarations that can be declared on file scope
query = """g.V().has('type', 'DeclStmt').out('DECLARES').has('type', 'Decl').values('identifier').as('declaration')"""
declarations = db.runGremlinQuery(query)

# Get the names of all StructUnionEnums
# Connect to project DB
#projectName = 'EvoDiss.tar.gz'
#projectName = 'Revamp'
#projectName = 'JoernTest.tar.gz'
#projectName = 'SPLC'
#projectName = 'ICSE'
#projectName = 'expat'
#projectName = 'sample'
#projectName = 'Collection'
projectName = 'DonorProject'
projectName = 'grep'
projectName = 'Test'
projectName = 'Example'
projectName = 'Test3'
#projectName = 'PV_Current.tar.gz'
db = DBInterface()

####################################### Plotting ###############################################

result = set()
resultIDs = set()
customStatementTypes = ['CustomNode', 'ClassDef', 'FunctionDef', 'CompoundStatement', 'DeclStmt',
                        'TryStatement', 'CatchStatement', 'IfStatement', 'ElseStatement',
                        'SwitchStatement', 'ForStatement', 'DoStatement', 'WhileStatement',
                        'BreakStatement', 'ContinueStatement', 'GotoStatement', 'Label',
                        'ReturnStatement', 'ThrowStatement', 'ExpressionStatement',
                        'IdentifierDeclStatement', 'PreIfStatement', 'PreElIfStatement',
                        'PreElseStatement', 'PreEndIfStatement', 'PreDefine', 'PreUndef',
                        'MacroCall', 'PreDiagnostic', 'PreOther', 'PreInclude', 'PreIncludeNext',
                        'PreLine', 'PrePragma', 'UsingDirective', 'BlockCloser', 'Symbol',
                        'CFGEntryNode', 'CFGExitNode', 'Comment']
cNodeIDs = set()
visibleStatementTypes = ['CustomNode', 'ClassDef', 'DeclByClass', 'DeclByType', 'FunctionDef',
                         'CompoundStatement', 'DeclStmt', 'StructUnionEnum', 'FunctionPointerDeclare',
                         'TryStatement', 'CatchStatement', 'IfStatement', 'ElseStatement',
                         'SwitchStatement', 'ForStatement', 'DoStatement', 'WhileStatement',
                         'BreakStatement', 'ContinueStatement', 'GotoStatement', 'Label',
                         'ReturnStatement', 'ThrowStatement', 'ExpressionStatement',
                         'IdentifierDeclStatement', 'PreIfStatement', 'PreElIfStatement',
                         'PreElseStatement', 'PreEndIfStatement', 'PreDefine', 'PreUndef',
                         'MacroCall', 'PreDiagnostic', 'PreOther', 'PreInclude', 'PreIncludeNext',
                         'PreLine', 'PrePragma', 'UsingDirective', 'BlockCloser', 'Comment',
                         'File', 'Directory']


# Plots the results
def plotResults():
    db.connectToDatabase(projectName)