Пример #1
0
def getVisibleNodes(projectName):
    """
    Return the ids of all visible nodes of a project.

    Connects to the database of ``projectName`` and selects every vertex
    whose ``type`` is one of the visible statement types listed below.
    Vertices from which a ``ForInit`` or ``StructUnionEnum`` vertex can be
    reached by repeatedly following incoming ``AST_EDGE`` edges are dropped.

    :param projectName: project name handed to ``connectToDatabase``
    :return: the result of ``runGremlinQuery`` for the id query
    """
    visibleStatementTypes = [
        'CustomNode', 'ClassDef', 'DeclByClass', 'DeclByType', 'FunctionDef',
        'CompoundStatement', 'Statement', 'DeclStmt', 'StructUnionEnum',
        'FunctionPointerDeclare', 'TryStatement', 'CatchStatement',
        'IfStatement', 'ElseStatement', 'SwitchStatement', 'ForStatement',
        'DoStatement', 'WhileStatement', 'BreakStatement', 'ContinueStatement',
        'GotoStatement', 'Label', 'ReturnStatement', 'ThrowStatement',
        'ExpressionStatement', 'IdentifierDeclStatement', 'PreIfStatement',
        'PreElIfStatement', 'PreElseStatement', 'PreEndIfStatement',
        'PreDefine', 'PreUndef', 'PreDiagnostic', 'PreOther', 'PreInclude',
        'PreIncludeNext', 'PrePragma', 'UsingDirective',
        'BlockCloser', 'Comment', 'File', 'Directory'
    ] if False else [
        'CustomNode', 'ClassDef', 'DeclByClass', 'DeclByType', 'FunctionDef',
        'CompoundStatement', 'Statement', 'DeclStmt', 'StructUnionEnum',
        'FunctionPointerDeclare', 'TryStatement', 'CatchStatement',
        'IfStatement', 'ElseStatement', 'SwitchStatement', 'ForStatement',
        'DoStatement', 'WhileStatement', 'BreakStatement', 'ContinueStatement',
        'GotoStatement', 'Label', 'ReturnStatement', 'ThrowStatement',
        'ExpressionStatement', 'IdentifierDeclStatement', 'PreIfStatement',
        'PreElIfStatement', 'PreElseStatement', 'PreEndIfStatement',
        'PreDefine', 'PreUndef', 'PreDiagnostic', 'PreOther', 'PreInclude',
        'PreIncludeNext', 'PreLine', 'PrePragma', 'UsingDirective',
        'BlockCloser', 'Comment', 'File', 'Directory'
    ]

    # Remove unneeded nodes (we need to exclude IdentifierDeclStatement that have a ForInit or StructUnionEnum as parent).
    # The Python list is rendered by %s as ['A', 'B', ...] and inserted verbatim into within(...).
    query = """g.V().has('type', within(%s))
                .not(__.repeat(__.in(AST_EDGE)).emit().has('type', within('ForInit','StructUnionEnum')))
                .id()""" % (visibleStatementTypes,)

    db = DBInterface()
    db.connectToDatabase(projectName)
    try:
        return db.runGremlinQuery(query)
    finally:
        # Always close the db connection and release the shell, even if the query failed
        db.runGremlinQuery("quit")
Пример #2
0
class StartTool(CmdLineTool):
    """
    Base class for command line tools that run a single database query.

    Subclasses supply the query via ``_constructQuery`` and consume the
    answer in ``_handleResult``; ``_runImpl`` wires both together.
    """

    def __init__(self, DESCRIPTION):
        CmdLineTool.__init__(self, DESCRIPTION)

    # @Override
    def _constructQuery(self):
        """
        Build the query that is sent to the database, based on the
        arguments of the tool. The default yields no query (None).
        """
        return None

    # @Override
    def _handleResult(self, res):
        """
        Consume the result of the query. The default ignores it.
        """
        return None

    def _runImpl(self):
        """
        Construct the query, connect to the database, run the query and
        pass its result to ``_handleResult``.
        """
        gremlinQuery = self._construct_and_keep() if False else self._constructQuery()

        # The interface is kept on the instance under the name dbInterface
        interface = DBInterface()
        self.dbInterface = interface
        interface.connectToDatabase()

        self._handleResult(interface.runGremlinQuery(gremlinQuery))
Пример #3
0
class StartTool(CmdLineTool):
    """
    Skeleton of a command line tool that issues exactly one query.

    ``_constructQuery`` and ``_handleResult`` are the two hooks meant to
    be overridden; ``_runImpl`` drives them.
    """

    def __init__(self, DESCRIPTION):
        CmdLineTool.__init__(self, DESCRIPTION)

    # @Override
    def _constructQuery(self):
        """
        Return the query, derived from the tool arguments, that will be
        passed to the database. Returns None unless overridden.
        """
        return None

    # @Override
    def _handleResult(self, res):
        """
        Process the result of the query. Does nothing unless overridden.
        """
        return None

    def _runImpl(self):
        """
        Run the tool: build the query first, then open the database
        connection, execute the query and forward the result.
        """
        queryText = self._constructQuery()

        # Keep the connection on the instance as dbInterface
        connection = DBInterface()
        self.dbInterface = connection
        connection.connectToDatabase()

        queryResult = connection.runGremlinQuery(queryText)
        self._handleResult(queryResult)
Пример #4
0
class ChunkStartTool(CmdLineTool):
    """
    Base class for command line tools that process ids chunk by chunk.

    ``_runImpl`` connects to the database, runs the id query, splits the
    returned ids into chunks of ``CHUNK_SIZE`` and runs one query per chunk.
    Subclasses override the hooks below to supply the queries and to consume
    the per-chunk results.
    """

    def __init__(self, DESCRIPTION):
        CmdLineTool.__init__(self, DESCRIPTION)

    # @Override
    def _constructIdQuery(self):
        """Return the query whose result is the list of ids to process."""
        pass

    # @Override
    def _constructQueryForChunk(self, chunk):
        """Return the query to run for the given chunk of ids."""
        pass

    # @Override
    def handleChunkResult(self, res, chunk):
        """Process the result ``res`` obtained for ``chunk``."""
        pass

    # @Override
    def _handleChunkResult(self, res, chunk):
        """
        Hook called by ``_runImpl`` for every chunk.

        ``_runImpl`` has always called this underscored name although only
        ``handleChunkResult`` was defined here, which raised AttributeError
        unless a subclass provided it. The default now forwards to
        ``handleChunkResult`` so that overriding either name works.
        """
        return self.handleChunkResult(res, chunk)

    # @Override
    def _start(self):
        """Hook called after connecting and before the id query is run."""
        pass

    def _stop(self):
        """Hook called after all chunks have been processed."""
        pass

    def _runImpl(self):
        """
        Connect to the database, fetch the ids and process them in chunks.
        """
        self.dbInterface = DBInterface()
        self.dbInterface.connectToDatabase()

        self._start()

        query = self._constructIdQuery()
        ids = self.dbInterface.runGremlinQuery(query)

        for chunk in self.dbInterface.chunks(ids, CHUNK_SIZE):
            query = self._constructQueryForChunk(chunk)
            res = self.dbInterface.runGremlinQuery(query)
            self._handleChunkResult(res, chunk)

        self._stop()
Пример #5
0
    def run(self):
        """
        Read a script, run it as one query on the project database and
        print every element of the result.

        The script is read from ``self.args.file`` when that is set and from
        standard input otherwise. The lines returned by ``_parseScript`` are
        joined with newlines to form the query. Each result element is
        printed with ``repr`` (``args.raw``), pretty-printed (``args.pretty``)
        or printed as is.
        """
        script = sys.stdin if self.args.file is None else open(self.args.file, "r")
        try:
            # Join inside the try block so that the file is still open in case
            # _parseScript yields its lines lazily
            query = "\n".join(__class__._parseScript(script))
        finally:
            # Only close what was opened here; stdin is left alone
            if script is not sys.stdin:
                script.close()

        db = DBInterface()
        if self.args.no_json:
            db.disable_json()
        db.connectToDatabase(self.args.project)

        try:
            result = db.runGremlinQuery(query)
            pp = pprint.PrettyPrinter(indent=4, compact=True)
            for x in result:
                if self.args.raw:
                    print(repr(x))
                elif self.args.pretty:
                    pp.pprint(x)
                else:
                    print(x)
        finally:
            # Send "quit" even when the query or the printing failed
            db.runGremlinQuery("quit")
Пример #6
0
# List of all types that can use identifiers to do something (sorted by declarations)
# Function: FunctionDef and CallExpression (needs parent ExpressionStatement). Declares.
# Macro: MacroDef and Callee, or anywhere where we can identify a PreMacroIdentifier?
# Declares: ?
# Enum: ?

# NOTE(review): presumably prepended to the identifier names; the renaming code is not visible here
prefix = "semanticUnit__"

print("Adding prefixes...")
# Connect to the SU project
db = DBInterface()
db.connectToDatabase("EvoDiss.tar.gz")

# Get the names of all functions ('code' of the Identifier children of FunctionDef nodes)
query = """g.V().has('type', 'FunctionDef').out('IS_AST_PARENT').has('type', 'Identifier').values('code').as('function')"""
functions = db.runGremlinQuery(query)

# Get the names of all macros ('code' of the PreMacroIdentifier children of PreDefine nodes)
query = """g.V().has('type', 'PreDefine').out('IS_AST_PARENT').has('type','PreMacroIdentifier').values('code').as('macro')"""
macros = db.runGremlinQuery(query)

# Get the names of all declarations that can be declared on file scope
query = """g.V().has('type', 'DeclStmt').out('DECLARES').has('type', 'Decl').values('identifier').as('declaration')"""
declarations = db.runGremlinQuery(query)

# Get the names of all StructUnionEnums
# NOTE(review): this filter uses lowercase 'identifier' while the function query above
# uses 'Identifier' - confirm which spelling the graph actually stores
query = """g.V().has('type', 'StructUnionEnum').out('IS_AST_PARENT').has('type', 'identifier').values('code').as('enum')"""
enums = db.runGremlinQuery(query)

functionResults = []
# Change the name of all FunctionDefs, CallExpressions and Declares of the respective function
Пример #7
0
class APIEmbedder(object):
    """
    Write an embedding of the functions in the database to a directory.

    For every function the code of its API symbol nodes is fetched, turned
    into a term-document matrix and written in LIBSVM format to
    ``embedding.libsvm``. The function ids are written, one per line, to a
    ``TOC`` file in the same directory.
    """

    def __init__(self):
        self._initializeDBConnection()

    def _initializeDBConnection(self):
        """Create the database interface (the connection is opened in run)."""
        self.dbInterface = DBInterface()

    def setOutputDirectory(self, directory):
        """Set the directory that ``run`` creates and writes to."""
        self.outputDirectory = directory

    def run(self, tfidf=True):
        """
        Create the output directory and write the embedding into it.

        Returns without doing anything if the output directory already
        exists or cannot be created.

        :param tfidf: when true, ``tfidf()`` is applied to the matrix
        """
        try:
            # Raises if the output directory already exists
            self._initializeOutputDirectory()
        except OSError:
            return

        try:
            self._connectToDatabase()
            functions = self._getAPISymbolsFromDatabase()
            featureArray = self._createFeatureArray(functions)
        finally:
            # Close the TOC file even when the database access failed
            self._finalizeOutputDirectory()

        self.termDocMatrix = self._createTermDocumentMatrix(featureArray)
        if tfidf:
            self.termDocMatrix.tfidf()
        self._outputInLIBSVMFormat(self.outputDirectory)

    def _connectToDatabase(self):
        self.dbInterface.connectToDatabase()

    def _initializeOutputDirectory(self):
        """
        Create the output directory and open its TOC file for writing.

        :raises FileExistsError: if the output directory already exists
        """
        directory = self.outputDirectory
        if os.path.exists(directory):
            # A bare ``raise`` without an active exception only worked by
            # accident (RuntimeError); state the actual problem instead
            raise FileExistsError(
                "output directory already exists: %s" % (directory,))
        os.makedirs(directory)
        self.tocFilename = os.path.join(directory, 'TOC')
        self.toc = open(self.tocFilename, 'w')

    def _finalizeOutputDirectory(self):
        """Close the TOC file."""
        self.toc.close()

    def _getAPISymbolsFromDatabase(self):
        """
        Fetch ``[funcId, symbolCodes]`` entries for all functions.

        The function ids are queried first and then processed in chunks of
        1024 ids per query.

        :return: list with the concatenated results of all chunk queries
        """
        CHUNK_SIZE = 1024

        query = """queryNodeIndex('type:Function').id"""
        functionIds = self._runGremlinQuery(query)

        result = []

        for chunk in self.chunks(functionIds, CHUNK_SIZE):
            query = """
            _().transform{ %s }.scatter().transform{g.v(it)}
            .sideEffect{funcId = it.id}
            .transform{ [funcId, it.functionToAPISymbolNodes().code.toList()] }
            """ % (str(chunk))

            result.extend(self._runGremlinQuery(query))

        return result

    def chunks(self, l, n):
        """Yield successive slices of ``l`` with at most ``n`` elements."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def _runGremlinQuery(self, query):
        return self.dbInterface.runGremlinQuery(query)

    def _createFeatureArray(self, functions):
        """
        Build the feature array and write the function ids to the TOC.

        Every symbol gets a trailing newline appended before it is added;
        the lists in ``functions`` themselves are left unmodified.

        :param functions: iterable of ``(funcId, symbols)`` pairs
        :return: the filled ``FeatureArray``
        """
        featureArray = FeatureArray()
        for index, (funcId, symbols) in enumerate(functions):
            items = [symbol + '\n' for symbol in symbols]
            featureArray.add(index, items)  # label, items
            self.toc.write("%d\n" % (funcId))
        self.toc.flush()
        return featureArray

    def _createTermDocumentMatrix(self, featureArray):
        converter = FeatureArrayToMatrix()
        return converter.convertFeatureArray(featureArray)

    def _outputInLIBSVMFormat(self, directory):
        """
        Write one line per matrix column to ``embedding.libsvm``.

        Each line has the form ``label idx:value ... #label`` where the
        label is looked up in ``termDocMatrix.index2Doc`` and the entries
        are the stored elements of the column sorted by row index. Nothing
        is written when the matrix is None.

        :param directory: directory the output file is created in
        """
        from scipy.sparse import csc_matrix

        # ``== None`` would be evaluated element-wise on array-like matrices
        if self.termDocMatrix.matrix is None:
            return

        m = csc_matrix(self.termDocMatrix.matrix)
        nCols = m.shape[1]

        outFilename = os.path.join(directory, 'embedding.libsvm')
        with open(outFilename, 'w') as outFile:
            for colIndex in range(nCols):
                label = self.termDocMatrix.index2Doc[colIndex]

                col = m.getcol(colIndex)
                entries = sorted(
                    (rowIndex, col[rowIndex, 0]) for rowIndex in col.indices)
                features = " ".join('%d:%f' % e for e in entries)
                outFile.write('%s %s #%s\n' % (label, features, label))
Пример #8
0
#!/usr/bin/env python3

from octopus.server.DBInterface import DBInterface

# Project whose database is queried
projectName = 'android.tar.gz'
# Query for the ids of all nodes indexed with type 'Function'
query = "queryNodeIndex('type:Function').id"

db = DBInterface()
db.connectToDatabase(projectName)

ids = db.runGremlinQuery(query)

# Number of ids per chunk, passed to db.chunks() by the loop below
CHUNK_SIZE = 256
# NOTE(review): machine-specific absolute path; its use is not visible in this part of the script
LOCATION = '/home/sid/RABBIT_HOLE/CODE_ANALYSIS/joern/projects/octopus/data/projects/'
for chunk in db.chunks(ids, CHUNK_SIZE):

    query = """
        getCallsToRegex(".*read(Int|Uint)(32|64)")
        .statements()
        .out("REACHES")
        .has("code",textRegex(".*(malloc|memcpy).*"))
        .functions()
        .functionToLocationStr()
    """

    query2 = """
       getNodesWithTypeAndName(TYPE_FUNCTION, '*onTransact*')
       .out(FUNCTION_TO_AST_EDGE)
       .getArguments('(memcpy OR malloc)', '2')
       .out(USES_EDGE)
       .filter{
Пример #9
0
            )
    )""" % (1, "functionName[0]", "functionName[0]", "functionName[0]")

# Starting at vertex 184448:
# Go to parent file
# Follow IS_HEADER_OF to C file
# 1. Look in AST children for functionDef with same functionName
# 2. Get the include statement for the header file
# NOTE: this query is never executed - it is overwritten by the assignment below
query = """g.V(%s).in('IS_FILE_OF').out('IS_HEADER_OF').union(
    __.out('IS_FILE_OF').has('type', 'Function').has('code', textContains('%s')).out('IS_FUNCTION_OF_AST'),
    __.out('IS_FILE_OF').has('type', 'PreInclude').has('code', textContains('%s'))
    ).id()""" % (184448, "bubblesort", "C.h")

# Select all vertices of type MacroCall; this is the query that actually gets executed
query = """g.V().has('type', 'MacroCall') """

# Execute query
result = db.runGremlinQuery(query)

# Print results
for x in result:
    print(x)

############################################################################################################
##### NOT WORKING OR PROBLEMS
############################################################################################################
# The queries below are only assigned, not executed, within the visible part of the script

# Empty, because you need the fully qualified name
query = """getFunctionsByFilename("C.c")"""
# Another regex problem
query = """getCallsToRegex("bubblesor*").values('code')"""
# No signature of method: org.apache.tinkerpop.gremlin.process.traversal.traverser.O_Traverser.codeContains() is applicable for argument types: (java.lang.String) values: [bubblesort]
query = """g.V().sideEffect{it.codeContains("bubblesort");}"""
Пример #10
0
#!/usr/bin/env python3
from octopus.server.DBInterface import DBInterface

# Open the database of the project
projectName = 'EvoDiss.tar.gz'
db = DBInterface()
db.connectToDatabase(projectName)

# Lookups that do not need the function id: (query, label) pairs, run in order.
# Every entry of result has the form "# <query result><label>".
result = []
for query, label in (
    ("g.V().has('type', 'Directory').has('code', '/home/lea/Downloads/Joern_Advanced/projects/octopus/data/projects/EvoDiss.tar.gz/src/home/lea/Downloads/EvoDiss/src').id()",
     " Directory src"),
    ("g.V().has('type', 'File').has('code', '/home/lea/Downloads/Joern_Advanced/projects/octopus/data/projects/EvoDiss.tar.gz/src/home/lea/Downloads/EvoDiss/src/C_Test.c').id()",
     " File C_Test.c"),
    ("g.V().has('type', 'FunctionDef').has('code', textContains('compareResults')).id()",
     " FunctionDef compareResults"),
):
    result.append("# " + str(db.runGremlinQuery(query)) + label)

# functionId of compareResults (first element of the query result)
query = "g.V().has('type', 'FunctionDef').has('code', textContains('compareResults')).values('functionId')"
id = db.runGremlinQuery(query)[0]

# Lookups restricted to that function: (query template, label) pairs, run in order
for template, label in (
    ("g.V().has('type', 'IfStatement').has('functionId', '%s').id()",
     " IfStatements in compareResults"),
    ("g.V().has('type', 'ElseStatement').has('functionId', '%s').id()",
     " ElseStatement in compareResults"),
    ("g.V().has('type', 'ForStatement').has('functionId', '%s').id()",
     " ForStatement in compareResults"),
    ("g.V().has('type', 'Condition').has('functionId', '%s').id()",
     " Conditions in compareResults"),
):
    query = template % (id)
    result.append("# " + str(db.runGremlinQuery(query)) + label)
Пример #11
0
class APIEmbedder(object):
    """
    Export the API symbols of all functions and embed them.

    For every function in the database one data point file is written to
    ``<outputDirectory>/data`` (one symbol per line, files named 0, 1, ...)
    and the function id is appended to ``<outputDirectory>/TOC``. Afterwards
    the embedder is run on the output directory.
    """

    def __init__(self):
        self._initializeDBConnection()

    def _initializeDBConnection(self):
        """Create the database interface (the connection is opened in run)."""
        self.dbInterface = DBInterface()

    def setOutputDirectory(self, directory):
        """Set the directory that ``run`` creates and writes to."""
        self.outputDirectory = directory

    def run(self):
        """
        Create the output directory, write the data points and embed them.

        Returns without doing anything if the output directory already
        exists or cannot be created.
        """
        try:
            # Raises if the output directory already exists
            self._initializeOutputDirectory()
        except OSError:
            return

        try:
            self._connectToDatabase()
            functions = self._getAPISymbolsFromDatabase()
            self._writeDataPoints(functions)
        finally:
            # Close the TOC file even when the database access failed
            self._finalizeOutputDirectory()

        self._embed()

    def _embed(self):
        # self.embedder = SallyBasedEmbedder()
        self.embedder = Embedder()
        self.embedder.embed(self.outputDirectory)

    def _connectToDatabase(self):
        self.dbInterface.connectToDatabase()

    def _writeDataPoints(self, functions):
        """
        Write a TOC line and a data point file for every function.

        :param functions: iterable of ``(funcId, symbols)`` pairs
        """
        for (funcId, symbols) in functions:
            self.toc.write("%d\n" % (funcId))
            self._addDataPoint(symbols)

    def _addDataPoint(self, symbols):
        """
        Write ``symbols``, one per line, to the next data point file.

        The file is named after the running counter ``curDatapoint``, which
        is incremented afterwards.
        """
        datapointFilename = os.path.join(self.dataDir, str(self.curDatapoint))
        # open() replaces the Python-2-only file() builtin
        with open(datapointFilename, 'w') as f:
            f.writelines([x + "\n" for x in symbols])
        self.curDatapoint += 1

    def _initializeOutputDirectory(self):
        """
        Create the output and data directories and open the TOC file.

        :raises OSError: if the output directory already exists
        """
        directory = self.outputDirectory

        if os.path.exists(directory):
            # A bare ``raise`` without an active exception only worked by
            # accident; state the actual problem instead
            raise OSError("output directory already exists: %s" % (directory,))

        self.dataDir = os.path.join(directory, 'data')
        self.tocFilename = os.path.join(directory, 'TOC')
        os.makedirs(self.dataDir)
        self.toc = open(self.tocFilename, 'w')

        self.curDatapoint = 0

    def _finalizeOutputDirectory(self):
        """Close the TOC file."""
        self.toc.close()

    def _getAPISymbolsFromDatabase(self):
        """
        Fetch ``[funcId, symbolCodes]`` entries for all functions.

        The function ids are queried first and then processed in chunks of
        1024 ids per query.

        :return: list with the concatenated results of all chunk queries
        """
        CHUNK_SIZE = 1024

        query = """queryNodeIndex('type:Function').id"""
        functionIds = self._runGremlinQuery(query)

        result = []

        for chunk in self.chunks(functionIds, CHUNK_SIZE):
            query = """
            _().transform{ %s }.scatter().transform{g.v(it)}
            .sideEffect{funcId = it.id}
            .transform{ [funcId, it.functionToAPISymbolNodes().code.toList()] }
            """ % (str(chunk))

            result.extend(self._runGremlinQuery(query))

        return result

    def chunks(self, l, n):
        """Yield successive slices of ``l`` with at most ``n`` elements."""
        # range() instead of xrange() so that this also runs on Python 3
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def _runGremlinQuery(self, query):
        return self.dbInterface.runGremlinQuery(query)
Пример #12
0
def output(G):
    """
    Write ``G`` to ``DB/Complete.dot`` and render it to ``DB/Complete.png``.

    The dot file consists of a ``// DB`` header line, ``str(G)`` and a
    ``//###`` footer line. The ``DB`` folder is created in the current
    working directory if needed. The png is produced by calling the ``dot``
    command line tool; success is only reported if it exits with status 0.

    :param G: object whose string representation is the graph in dot syntax
    """
    # Formatting
    outputString = '// DB \n' + str(G) + '\n' + '//###' + '\n'

    # Create DB folder (if it's not already there)
    os.makedirs("DB", exist_ok=True)
    filename = 'Complete.dot'

    # Write to file; the with block closes it even if the write fails
    print("Creating "+filename+" ...")
    with open("DB/"+filename, 'w') as dotFile:
        dotFile.write(outputString)

    # Use terminal output to convert .dot to .png
    status = os.system("dot -Tpng 'DB/"+filename+"' -o 'DB/Complete.png'")

    # Print status update
    if status == 0:
        print("Creation of plot was successfull!")
    else:
        print("Creation of plot failed: dot exited with status " + str(status))
    
####################################### Plotting ###############################################   


# Plot resulting graph
plotResults()

# Finally close db connection and release the shell
db.runGremlinQuery("quit")