def createXMLOutput(self):
    import os
    import sys
    import codecs
    path = os.path.normpath(Globals.getConfig().dumpFile)
    basename = os.path.basename(Globals.getConfig().inputFile)
    (filename, ext) = os.path.splitext(basename)
    storePath = "{0}{1}{2}_output.xml".format(path, os.sep, filename)
    # Temporarily redirect stdout into the output file
    old_stdout = sys.stdout
    handle = codecs.open(storePath, "w", "utf-8-sig")
    sys.stdout = handle
    print '<?xml version="1.0" encoding="utf-8" ?>'
    print "<protocolInformatics>"
    # Get the Discoverer XML result representation from the cluster collection object
    print self.getCCXMLRepresentation()
    # Get the statemachine XML result representation
    print self.env['sm'].getXMLRepresentation()
    print "</protocolInformatics>"
    handle.close()
    sys.stdout = old_stdout
    logging.info("Finished XML output. File size {0}".format(self.convert_bytes(os.path.getsize(storePath))))
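# Minimal sketch of the stdout redirection pattern used by createXMLOutput and
# createPeachOutput: everything print'ed between the swap and the restore lands
# in the file. The file name is hypothetical; the try/finally is an optional
# hardening not present in the methods above, which would keep stdout intact
# even if a print raises:
#
#   import sys
#   import codecs
#   handle = codecs.open("example_output.xml", "w", "utf-8-sig")
#   old_stdout = sys.stdout
#   sys.stdout = handle
#   try:
#       print '<?xml version="1.0" encoding="utf-8" ?>'
#       print "<protocolInformatics></protocolInformatics>"
#   finally:
#       # Always restore stdout and close the file
#       sys.stdout = old_stdout
#       handle.close()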
def do_load_state(self, args):
    # Restore a previously pickled analysis environment
    import cPickle
    handle = open(Globals.getConfig().dumpFile + "/disc_state", "rb")
    self.env = cPickle.load(handle)
    # Update config with settings from backup
    Globals.setConfig(self.env['config'])
    discoverer.Globals.setProtocolClassification(self.env['protocolType'])
    handle.close()
def go(self, sequences):
    if self.env['sequences'] is None:
        print "FATAL: No sequences loaded!"
        return
    import discoverer.statistics
    discoverer.statistics.reset_statistics()
    logging.info("Performing discoverer algorithm")
    start = time.time()
    # Perform the initial clustering
    self.setup(sequences)
    elapsed = (time.time() - start)
    logging.info("Setup took {:.3f} seconds".format(elapsed))
    # Combine server and client flows
    self.env['messageFlows'] = self.combineflows(self.env['cluster_collection'])
    # Create a linked list
    self.linkmessages(self.env['messageFlows'])
    start = time.time()
    # Perform format inference
    self.do_format_inference("")
    elapsed = (time.time() - start)
    logging.info("Format inference took {:.3f} seconds".format(elapsed))
    start = time.time()
    # Perform the semantic inference
    self.do_semantic_inference("")
    elapsed = (time.time() - start)
    logging.info("Semantic inference took {:.3f} seconds".format(elapsed))
    start = time.time()
    # Perform the recursive clustering step
    self.do_recursive_clustering("")
    elapsed = (time.time() - start)
    logging.info("Recursive clustering took {:.3f} seconds".format(elapsed))
    start = time.time()
    # Fix tokenization errors
    self.do_fix_tokenization_errors("")
    elapsed = (time.time() - start)
    logging.info("Fixing tokenization errors took {:.3f} seconds".format(elapsed))
    #self.print_clusterCollectionInfo()
    start = time.time()
    print "Merging..."
    # Merge while merging potential is present
    while self.env['cluster_collection'].mergeClustersWithSameFormat():
        pass
    elapsed = (time.time() - start)
    logging.info("Merging took {:.3f} seconds".format(elapsed))
    logging.info("Finished")
    # Perform one last format inference and semantic inference
    oldvalue = Globals.getConfig().considerOneMessageAsConstant
    Globals.getConfig().considerOneMessageAsConstant = True
    self.do_format_inference("")
    Globals.getConfig().considerOneMessageAsConstant = oldvalue
    self.do_semantic_inference("")
    if Globals.getConfig().debug:
        self.env['cluster_collection'].print_clusterCollectionInfo()
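# The merge loop in go() runs to a fixed point: mergeClustersWithSameFormat()
# is assumed to return True as long as it merged at least one pair of clusters,
# so looping until it returns False guarantees no two clusters with identical
# formats remain. A generic sketch of the idiom, with a hypothetical
# merge_once() standing in for the real method:
#
#   def merge_to_fixed_point(merge_once):
#       passes = 0
#       while merge_once():   # each call performs one merge round
#           passes += 1
#       return passes         # number of rounds until nothing merged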
def dump_sm_dot(self, filename=""): if filename=="": path = os.path.normpath(Globals.getConfig().dumpFile) file = os.path.basename(Globals.getConfig().inputFile) (filename,ext) = os.path.splitext(file) storePath = "{0}{1}{2}.dot".format(path,os.sep,filename) else: storePath = filename self.env['sm'].dump_dot(storePath)
def __init__(self, env, config):
    cmd.Cmd.__init__(self)
    self.env = env
    # Keep a copy in the environment so it is backed up with the state
    self.env['config'] = config
    self.config = config
    Globals.setConfig(config)
    self.__profile = collections.OrderedDict()
    self.__nextstate = 1
    logging.info("Discoverer CLI initialized")
def do_testsuite(self, args):
    basename = Globals.getConfig().testbasename
    if args == "":
        highloop = 4
    else:
        highloop = int(args)
    logging.info("Using {0} as highloop".format(highloop))
    for suffix in range(0, highloop):
        logging.info("Testing batch {0}".format(suffix))
        Globals.getConfig().testFile = basename + "_{0}".format(suffix)
        logging.info("Set config.testFile to {0}".format(Globals.getConfig().testFile))
        self.do_load_testdata("")
        # Perform the actual test
        self.do_statemachine_accepts("")
def do_dump_state(self, args):
    # Pickle the whole analysis environment so a session can be resumed later
    import cPickle
    handle = open(Globals.getConfig().dumpFile + "/disc_state", "wb")
    # The cluster/message graph is deeply nested, so raise the recursion
    # limit for pickling
    sys.setrecursionlimit(50000)
    self.env['protocolType'] = discoverer.Globals.getProtocolClassification()
    cPickle.dump(self.env, handle, 2)
    handle.close()
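# Usage sketch for the state round-trip (assuming `cli` is an instance of this
# command interpreter): do_dump_state() pickles self.env, including the config
# and protocol classification, and do_load_state() restores all of it, so an
# analysis session can be resumed without re-running go():
#
#   cli.do_go("")             # run the full analysis once
#   cli.do_dump_state("")     # writes <dumpFile>/disc_state
#   ...                       # later, possibly in a fresh process:
#   cli.do_load_state("")     # restores env, config and classification
#   cli.do_statemachine_accepts("")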
def createPeachOutput(self):
    import os
    import sys
    import codecs
    path = os.path.normpath(Globals.getConfig().dumpFile)
    basename = os.path.basename(Globals.getConfig().inputFile)
    (filename, ext) = os.path.splitext(basename)
    storePath = "{0}{1}{2}_peach.xml".format(path, os.sep, filename)
    # Temporarily redirect stdout into the output file
    old_stdout = sys.stdout
    handle = codecs.open(storePath, "w", "utf-8-sig")
    sys.stdout = handle
    print self.env['sm'].dumpPeachXML()
    handle.close()
    sys.stdout = old_stdout
    logging.info("Finished Peach output. File size {0}".format(self.convert_bytes(os.path.getsize(storePath))))
def setup(self, sequences):
    logging.info("Performing initial message analysis and clustering")
    if sequences is None:
        logging.error("FATAL: No sequences loaded yet!")
        return False
    # Perform initial token analysis
    setup = discoverer.setup.Setup(sequences, Globals.getConfig())
    self.env['cluster_collection'] = setup.get_cluster_collection()
    logging.info("Built {0} clusters".format(setup.get_cluster_collection().num_of_clusters()))
def linkmessages(self, messageFlows):
    # Chain the messages of every flow into a doubly linked list
    # (setPrevInFlow/setNextInFlow) and track flow length statistics
    maxFlowLength = 0
    minFlowLength = sys.maxint
    logging.info("Linking messages within flow")
    for flow in messageFlows:
        messages = messageFlows[flow]
        flowLength = len(messages)
        if flowLength > maxFlowLength:
            maxFlowLength = flowLength
        if flowLength < minFlowLength:
            minFlowLength = flowLength
        if len(messages) == 1:
            if Globals.getConfig().debug:
                print "Flow {0} has only 1 message. Skipping flow".format(flow)
            continue
        from discoverer.peekable import peekable
        iterator = peekable(messages.items())
        lastMsg = None
        (msg_id, message) = iterator.next()
        message = message[0]
        while not iterator.isLast():
            if lastMsg is not None:
                lastMsg.setNextInFlow(message)
                message.setPrevInFlow(lastMsg)
            lastMsg = message
            (msg_id, message) = iterator.next()
            message = message[0]
        # Link the final message of the flow
        if lastMsg != message:
            lastMsg.setNextInFlow(message)
            message.setPrevInFlow(lastMsg)
        if Globals.getConfig().debug:
            self.printflow(flow)
    logging.info("Linked flows. Min flow length: {0}, max flow length: {1}".format(minFlowLength, maxFlowLength))
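# Sketch of the linking pattern built by linkmessages() on plain Python
# objects; the Node class is hypothetical, but the traversal mirrors what
# setNextInFlow/setPrevInFlow produce for the real Message objects:
#
#   class Node(object):
#       def __init__(self, value):
#           self.value = value
#           self.prev = None
#           self.next = None
#
#   def link(nodes):
#       # Chain consecutive nodes into a doubly linked list
#       for left, right in zip(nodes, nodes[1:]):
#           left.next = right
#           right.prev = left
#
#   nodes = [Node(i) for i in range(3)]
#   link(nodes)
#   assert nodes[1].prev is nodes[0] and nodes[1].next is nodes[2]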
def do_dumpresult(self, string):
    if not self.env.has_key('cluster_collection'):
        return
    path = os.path.normpath(Globals.getConfig().dumpFile)
    basename = os.path.basename(Globals.getConfig().inputFile)
    (filename, ext) = os.path.splitext(basename)
    if Globals.getConfig().loadClientAndServerParts:
        # Dump the combined client/server collection
        storePath = "{0}{1}{2}_formats_dump.txt".format(path, os.sep, filename)
        self.dump2File(self.env['cluster_collection'], storePath)
        #storePath = "{0}{1}{2}_formats.xml".format(path,os.sep,filename)
        #self.dumpXML(self.env['cluster_collection'], storePath)
        #storePath = "{0}{1}{2}_server_dump.txt".format(path,os.sep,filename)
        #self.dump2File(self.env['cluster_collection_server'],storePath)
    else:
        # Dump only one file (client traffic)
        storePath = "{0}{1}{2}_dump.txt".format(path, os.sep, filename)
        self.dump2File(self.env['cluster_collection'], storePath)
def do_split_loaded(self, args):
    # Split the loaded test flows into chunks of <chunksize> flows, writing
    # client and server messages into separate per-chunk files
    if args == "":
        chunksize = 2000
    else:
        chunksize = int(args)
    if not self.env.has_key('testflows'):
        print "Error: No testflows loaded"
        return
    testflows = self.env['testflows']
    nr = 0
    outfilename = Globals.getConfig().testFile
    fdoutclient = open("{0}_{1}_{2}_client".format(outfilename, chunksize, nr), "w")
    fdoutserver = open("{0}_{1}_{2}_server".format(outfilename, chunksize, nr), "w")
    blockseparator = "******************************************"
    print "Opened output file {0}_{1}_{2}".format(outfilename, chunksize, nr)
    flowcnt = 0
    for flow in testflows:
        (has_no_gaps, is_alternating) = discoverer.common.flow_is_valid(testflows, flow)
        # Skip flows with gaps, non-alternating flows and single-message flows
        if not (has_no_gaps and is_alternating) or len(testflows[flow]) == 1:
            continue
        messages = testflows[flow]
        c_out = 1
        s_out = 1
        totalcnt = 1
        for m_key in sorted(messages.keys()):
            msg = messages[m_key]
            if msg[1] == "server2client":
                fdoutserver.write("{0} {1} {2} {3} {4} {5}\n".format(blockseparator, flow, s_out, totalcnt, msg[0].get_length()*2, msg[0].get_payload_as_string()))
                s_out += 1
            else:
                fdoutclient.write("{0} {1} {2} {3} {4} {5}\n".format(blockseparator, flow, c_out, totalcnt, msg[0].get_length()*2, msg[0].get_payload_as_string()))
                c_out += 1
            totalcnt += 1
        flowcnt += 1
        if flowcnt >= chunksize:
            # Chunk is full, open the next pair of output files
            fdoutclient.close()
            fdoutserver.close()
            nr += 1
            fdoutclient = open("{0}_{1}_{2}_client".format(outfilename, chunksize, nr), "w")
            fdoutserver = open("{0}_{1}_{2}_server".format(outfilename, chunksize, nr), "w")
            print "Opened output file {0}_{1}_{2}".format(outfilename, chunksize, nr)
            flowcnt = 0
    fdoutclient.close()
    fdoutserver.close()
def do_dumpflow(self, args):
    if not Globals.getConfig().loadClientAndServerParts:
        print "Flow dumping is only available when analyzing client and server flows"
        return
    # If an argument is given, redirect the dump into
    # <inputFile>_flow_dump.txt instead of the console
    dumpToFile = args != ""
    if dumpToFile:
        import os.path
        import sys
        path = os.path.normpath(Globals.getConfig().dumpFile)
        basename = os.path.basename(Globals.getConfig().inputFile)
        (filename, ext) = os.path.splitext(basename)
        storePath = "{0}{1}{2}_flow_dump.txt".format(path, os.sep, filename)
        old_stdout = sys.stdout
        handle = open(storePath, "w")
        sys.stdout = handle
    print "Dump of 'Discoverer' flows"
    for f in self.env['messageFlows']:
        print "Flow: %s" % f
        for entry in self.env['messageFlows'][f]:
            print "\t{0}:\t{1} - {2}".format(entry, self.env['messageFlows'][f][entry][0].get_message(), self.env['messageFlows'][f][entry][0].getCluster().getFormatHash())
    if dumpToFile:
        handle.close()
        sys.stdout = old_stdout
        print "Finished. File size {0}".format(self.convert_bytes(os.path.getsize(storePath)))
def do_load_testdata(self, args=""): if len(args)!=0: tok = args.split() fileName = tok[0] element = int(tok[1]) fileName = Globals.getConfig().testFile import common import cmdinterface client2server_file = "{0}_client".format(fileName) server2client_file = "{0}_server".format(fileName) logging.debug("Using: {0} & {1} as testdata".format(client2server_file, server2client_file)) logging.info("Memory usage before loading testdata: {0}".format(self.getMemoryUsage())) self.profile("BeforeLoadingTestdata") logging.info("Loading {0} entries from test data from {1}".format(Globals.getConfig().numOfTestEntries,client2server_file)) # Load the client flows sequences_client2server = sequences = common.input.Bro(client2server_file, Globals.getConfig().numOfTestEntries).getConnections() logging.info("Loading {0} entries from test data from {1}".format(Globals.getConfig().numOfTestEntries, server2client_file)) # load the server flows sequences_server2client = sequences = common.input.Bro(server2client_file, Globals.getConfig().numOfTestEntries).getConnections() sequences = [(sequences_client2server, Message.directionClient2Server),(sequences_server2client, Message.directionServer2Client)] # Keep it compatible with existing code TODO logging.info("Loaded {0} test sequences from input files".format(len(sequences[0][0])+len(sequences[1][0]))) logging.info("Memory usage after loading testdata: {0}".format(self.getMemoryUsage())) self.profile("AfterLoadingTestdata") # Create quick setup tmpMaxPrefix = Globals.getConfig().maxMessagePrefix Globals.getConfig().maxMessagePrefix = 2048 setup = discoverer.setup.Setup(sequences, performFullAnalysis=False) Globals.getConfig().maxMessagePrefix = tmpMaxPrefix logging.info("Memory usage after preparing testsequences: {0}".format(self.getMemoryUsage())) self.profile("AfterPreparingTestdata") testcluster = setup.get_cluster_collection() testflows = self.combineflows(testcluster) logging.info("Memory usage after combining testsequences: {0}".format(self.getMemoryUsage())) self.profile("AfterCombiningTestdata") self.linkmessages(testflows) logging.info("Memory usage after linking testsequences: {0}".format(self.getMemoryUsage())) self.profile("AfterLinkingTestdata") self.env['testflows']=testflows # Hand test flows over to statemachine if self.env.has_key('sm'): self.env['sm'].setTestFlows(testflows)
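# Usage sketch (hypothetical file names): with no argument the configured
# testFile is used; with an argument the given base name is used instead.
# Either way "<basename>_client" and "<basename>_server" must exist:
#
#   cli.do_load_testdata("")           # loads <config.testFile>_client/_server
#   cli.do_load_testdata("trace_0")    # loads trace_0_client and trace_0_server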
def do_statemachine_accepts(self, args=""):
    # Tries to load the input and reports whether the statemachine accepts it
    # Thoughts:
    # How do I map a single line of input to a transition?
    # A transition is the hash of a rich message format of a single message
    #
    # Basic task: Match a single message to the best matching format
    #
    # Idea: Tokenize our single message and create a Message object out of it
    # The transitions have linked information about the various messages that
    # are part of the cluster whose hash is the hash of the transition
    #
    # Idea: Compare the message format of our single message (distinguishing
    # only text/binary is sensible at this point) to the formats of the
    # various clusters.
    # First examine whether we have perfect matches with respect to text/binary
    # (this might not be the case if we've got rich cluster formats from merged
    # clusters or similar).
    # If yes, compare the matching clusters' const values with our message and
    # see whether our values match the const values exactly.
    # If yes and we've got only one matching cluster, that's our transition.
    # If yes and we've got multiple matches, we have to examine further.
    # If we've got no match regarding the const values, there are again two options:
    # - also consider variable cluster formats (in case our message indeed
    #   carried a variable value instead)
    # - also look for other cluster format combinations (e.g. merged tokens
    #   might change the length of the format, which would have sorted this
    #   one out in the first pass)
    # Furthermore there are more test possibilities, e.g. load only client
    # messages and see whether our app is able to answer with a server message,
    # or load a full new set of client and server flows and replay flow by flow.
    # Do it with flows
    import common
    import cmdinterface
    # Load the test data if needed
    if not self.env.has_key('testflows') or len(self.env['testflows']) == 0:
        self.do_load_testdata(args)
        if not self.env.has_key('testflows'):
            print "ERROR: Loading test data failed!"
            return
    if not self.env.has_key('sm'):
        print "ERROR: Statemachine not yet built"
        return
    testflows = self.env['testflows']
    # Prepare test statistic counters
    failedelements = []
    success = 0
    failures = 0
    not_in_testflows = 0
    only_one_msg = 0
    has_gaps = 0
    not_alternating = 0
    not_all_transitioned = 0
    not_ended_in_final = 0
    gotMultipleChoice = 0
    test2go = len(testflows.keys())
    totalflows = test2go
    self.env['sm'].setTestFlows(testflows)
    # Make room and clean up the loaded sequences ;-)
    self.env['sequences'] = None
    print "Memory usage before test: {0}".format(self.getMemoryUsage())
    self.profile("BeforeStartingTest")
    for elem in testflows.keys():
        print "{0} flows left to test ({1} failed so far, fail rate {2} %)".format(test2go, failures, (1.0*failures/totalflows)*100)
        # Test the current flow
        res = self.statemachine_accepts_flow(elem, printSteps=False)
        test2go -= 1
        # Delete tested flow to make room
        del testflows[elem]
        if res['testSuccessful']:
            success += 1
        else:
            failures += 1
            # Parse the failure reason
            if not res['isInTestFlows']:
                not_in_testflows += 1
            elif not res['hasMoreThanOneMessage']:
                only_one_msg += 1
            elif not res['has_no_gaps']:
                has_gaps += 1
            elif not res['is_alternating']:
                not_alternating += 1
            elif not res['did_all_transitions']:
                not_all_transitioned += 1
                if res['gotMultipleChoice']:
                    gotMultipleChoice += 1
                failedelements.append(elem)
            elif not res['finished_in_final']:
                not_ended_in_final += 1
                if res['gotMultipleChoice']:
                    gotMultipleChoice += 1
                failedelements.append(elem)
    print "Finished"
    print "Memory usage after statemachine test: {0}".format(self.getMemoryUsage())
    self.profile("AfterEndTests")
    logging.info("Testresults")
    logging.info("===========")
    logging.info("Number of flows: {0}, Success: {1}, Failures: {2}".format(success+failures, success, failures))
    self.printProfile()
    if failures > 0:
        print "Test flowID not in test flows: {0}".format(not_in_testflows)
        print "Flow had only one message: {0}".format(only_one_msg)
        print "Flow had gaps: {0}".format(has_gaps)
        print "Flow was not alternating: {0}".format(not_alternating)
        print "Flow rejected prematurely: {0}".format(not_all_transitioned)
        print "Flow did not end in final state: {0}".format(not_ended_in_final)
        print "Ran into multiple choice transitions when failing: {0}".format(gotMultipleChoice)
        print
        if len(failedelements) > 0:
            print "Failed test flows (only tested flows):"
            for elem in failedelements:
                print "{0}".format(elem)
    # Dump results to file
    import os
    import sys
    path = os.path.normpath(Globals.getConfig().dumpFile)
    testfile = os.path.basename(Globals.getConfig().testFile)
    (filename, ext) = os.path.splitext(testfile)
    storePath = "{0}{1}{2}_testresults.txt".format(path, os.sep, filename)
    old_stdout = sys.stdout
    handle = open(storePath, "w")
    sys.stdout = handle
    print "Testresults"
    print "==========="
    print "Number of flows: {0}, Success: {1}, Failures: {2}".format(success+failures, success, failures)
    self.printProfile()
    if failures > 0:
        print "Test flowID not in test flows: {0}".format(not_in_testflows)
        print "Flow had only one message: {0}".format(only_one_msg)
        print "Flow had gaps: {0}".format(has_gaps)
        print "Flow was not alternating: {0}".format(not_alternating)
        print "Flow rejected prematurely: {0}".format(not_all_transitioned)
        print "Flow did not end in final state: {0}".format(not_ended_in_final)
        print "Ran into multiple choice transitions when failing: {0}".format(gotMultipleChoice)
        print
        if len(failedelements) > 0:
            print "Failed test flows (only tested flows):"
            for elem in failedelements:
                print "{0}".format(elem)
            print "Rerunning failed tests and logging output"
            # Reload the test data, it was consumed during the first pass
            self.do_load_testdata(args)
            for elem in failedelements:
                print 100*"+"
                print "Failed flow: {0}".format(elem)
                # Run the test again, this time logging every transition
                self.statemachine_accepts_flow(elem, printSteps=True)
                print 100*"+"
    handle.close()
    sys.stdout = old_stdout
    logging.info("Finished. Test results written to file {0}, file size {1}".format(storePath, self.convert_bytes(os.path.getsize(storePath))))
def do_go(self, string):
    if self.env.has_key('cluster_collection'):
        del(self.env['cluster_collection'])
    if Globals.getConfig().loadClientAndServerParts:
        # Check if we want to constrain our maximum length based on configured confidence intervals
        if Globals.getConfig().calculateMaxMessageLength:
            maxPrefix = discoverer.setup.calcMaxMessageLengthConfidenceInterval(self.env['sequences'], 1-Globals.getConfig().maxMessageLengthConfidenceInterval)
            Globals.getConfig().maxMessagePrefix = maxPrefix
            logging.info("Calculated maximum message prefix based on confidence interval of {0}: {1}".format(Globals.getConfig().maxMessageLengthConfidenceInterval, maxPrefix))
        logging.info("Using maximum message prefix for training data: {0}".format(Globals.getConfig().maxMessagePrefix))
        # Perform the Discoverer analysis
        self.go(self.env['sequences'])
        # Write out the analysis results
        self.do_dumpresult("")
        # Build statemachine
        logging.info("Forcing regex rebuild")
        if self.env.has_key('cluster_collection'):
            self.env['cluster_collection'].updateClusterRegEx()
            logging.info("Performing sanity check over regexes")
            self.env['cluster_collection'].performSanityCheckForRegEx()
        logging.info("Flushing all messages in all clusters")
        # Construct statemachine
        sm = discoverer.statemachine.Statemachine(self.env['messageFlows'])
        self.env['sm'] = sm
        # Log time
        start = time.time()
        logging.info("Building statemachine")
        print "Memory usage w/o statemachine: {0}".format(self.getMemoryUsage())
        self.profile("BeforeBuildStatemachine")
        # Perform the build
        sm.build()
        duration = time.time() - start
        print "Statemachine building took {:.3f} seconds".format(duration)
        print "Memory usage with statemachine: {0}".format(self.getMemoryUsage())
        self.profile("AfterBuildStatemachine")
        # Save the statemachine's dot file
        path = os.path.normpath(Globals.getConfig().dumpFile)
        basename = os.path.basename(Globals.getConfig().inputFile)
        (filename, ext) = os.path.splitext(basename)
        storePath = "{0}{1}{2}.dot".format(path, os.sep, filename)
        logging.info("Dumping state machine")
        sm.dump_dot(storePath)
        sm.dumpTransitions()
        storePath = "{0}{1}{2}_statemachine.xml".format(path, os.sep, filename)
        # Save the calculation state for later use
        self.do_dump_state("")
        if Globals.getConfig().autoCreateXML:
            # Dump the XML files
            print "Memory usage before creating XML: {0}".format(self.getMemoryUsage())
            self.profile("BeforeBuildXML")
            self.createXMLOutput()
            self.createPeachOutput()
            print "Memory usage after creating XML: {0}".format(self.getMemoryUsage())
            self.profile("AfterBuildXML")
        # Perform the acceptance test
        self.do_statemachine_accepts("")
    else:
        # Perform Discoverer only for the client part
        self.go(self.env['sequences'])