def split(self):
    """
    Relocate the boundary between directly adjacent segments.

    For each segment that starts exactly where the previously emitted segment
    ends, ``self.lookupLeastFrequent()`` is asked for a shift of the common
    boundary. A negative shift moves the boundary to the left (the left
    neighbour shrinks or, if fully consumed, is removed), a positive shift
    moves it to the right (the left neighbour grows, the current segment
    shrinks). Shifts that would leave the two segments are ignored.

    :return: List of segments splitted from the input.
        An empty list if there are no input segments.
    """
    if not self.segments:
        # Nothing to relocate; indexing the first segment below would fail.
        return []

    segmentStack = list(reversed(self.segments[1:]))
    mangledSegments = [self.segments[0]]
    while segmentStack:
        segc = segmentStack.pop()
        segl = mangledSegments[-1]
        if segl.offset + segl.length == segc.offset:
            # compare byte pairs' frequency
            splitshift = self.lookupLeastFrequent(segc)
            if (0 > splitshift >= -segl.length) \
                    or (0 < splitshift <= segc.length):
                if segl.length != -splitshift:
                    mangledSegments[-1] = MessageSegment(
                        mangledSegments[-1].analyzer, mangledSegments[-1].offset,
                        mangledSegments[-1].length + splitshift)
                else:
                    # segment to the left completely used up in center
                    del mangledSegments[-1]
                if self._debug:
                    print("Recombined segments: \n{} and {} into ".format(segl, segc))
                segc = MessageSegment(segc.analyzer, segc.offset + splitshift,
                                      segc.length - splitshift)
                if self._debug:
                    print("{} and {}".format(
                        mangledSegments[-1] if mangledSegments else 'Empty', segc))
        mangledSegments.append(segc)
    return mangledSegments
def split(self):
    """
    Perform the splitting of the segments.

    Every segment whose bytes pass ``isPrintable()`` is extended into its
    directly adjacent neighbours: the trailing part of the left neighbour
    (from the position given by ``self.toTheLeft()``) and the leading part of
    the right neighbour (up to the position given by ``self.toTheRight()``)
    are moved into the current segment. Neighbours that are consumed
    completely are removed. Segments that are not printable are passed on
    unchanged.

    :return: List of segments splitted from the input.
        Inputs with fewer than two segments are returned as a new list with
        unchanged content, since there are no neighbours to recombine with.
    """
    if len(self.segments) < 2:
        # Previously an empty list was returned here, dropping a lone segment.
        return list(self.segments)

    segmentStack = list(reversed(self.segments))
    mangledSegments = list()
    while segmentStack:
        # TODO check for equal analyzer, requires equality-check in analyzer
        # from inference.MessageAnalyzer import MessageAnalyzer
        segc = segmentStack.pop()
        # TODO: this is char specific only!
        if not isPrintable(segc.bytes):
            mangledSegments.append(segc)
            continue

        if mangledSegments:
            segl = mangledSegments[-1]
            if segl.offset + segl.length == segc.offset:
                splitpos = self.toTheLeft(segl)
                # segment to the left ends with chars, add them to the center segment
                if splitpos < segl.length:
                    if splitpos > 0:
                        mangledSegments[-1] = MessageSegment(
                            mangledSegments[-1].analyzer, mangledSegments[-1].offset, splitpos)
                    else:
                        # segment to the left completely used up in center
                        del mangledSegments[-1]
                    restlen = segl.length - splitpos
                    if self._debug:
                        print("Recombined segments: \n{} and {} into ".format(segl, segc))
                    segc = MessageSegment(segc.analyzer, segc.offset - restlen,
                                          segc.length + restlen)
                    if self._debug:
                        print("{} and {}".format(
                            mangledSegments[-1] if mangledSegments else 'Empty', segc))

        if segmentStack:
            segr = segmentStack[-1]
            if segc.offset + segc.length == segr.offset:
                splitpos = self.toTheRight(segr)
                # segment to the right starts with chars, add them to the center segment
                if splitpos > 0:
                    if segr.length - splitpos > 0:
                        segmentStack[-1] = MessageSegment(
                            segr.analyzer, segr.offset + splitpos, segr.length - splitpos)
                    else:
                        # segment to the right completely used up in center
                        del segmentStack[-1]
                    if self._debug:
                        print("Recombined segments: \n{} and {} into ".format(segc, segr))
                    segc = MessageSegment(segc.analyzer, segc.offset,
                                          segc.length + splitpos)
                    if self._debug:
                        print("{} and {}".format(
                            segc, segmentStack[-1] if segmentStack else 'Empty'))

        mangledSegments.append(segc)
    return mangledSegments
def split(self):
    """
    Crop the first matching common feature out of each segment.

    The entries of ``self._moco`` are tried in order. The first one that is
    contained in a segment's bytes decides the outcome for that segment:
    if the segment already consists of exactly that feature it is kept,
    otherwise it is replaced by up to three new segments (the bytes before
    the feature, the feature itself, the bytes after the feature).
    Segments containing none of the features are kept unchanged.

    :return: New list of segments for the message.
    """
    result = []
    for seg in self.segments:
        for feature in self._moco:
            featureStart = seg.bytes.find(feature)
            if featureStart < 0:
                # this feature is not contained in the segment, try the next one
                continue

            featureLength = len(feature)
            if seg.length == featureLength:
                # the segment already is the concise frequent feature
                result.append(seg)
            else:
                if CropDistinct._debug:
                    print("\nReplaced {} by:".format(seg.bytes.hex()), end=" ")
                absoluteStart = seg.offset + featureStart

                if featureStart > 0:
                    before = MessageSegment(seg.analyzer, seg.offset, featureStart)
                    result.append(before)
                    if CropDistinct._debug:
                        print(before.bytes.hex(), end=" ")

                center = MessageSegment(seg.analyzer, absoluteStart, featureLength)
                result.append(center)
                if CropDistinct._debug:
                    print(center.bytes.hex(), end=" ")

                remaining = seg.length - featureStart - featureLength
                if remaining > 0:
                    after = MessageSegment(seg.analyzer, absoluteStart + featureLength, remaining)
                    result.append(after)
                    if CropDistinct._debug:
                        print(after.bytes.hex(), end=" ")

            # finish the debug output line of this segment
            if CropDistinct._debug:
                print()
            # only the first match is used: later features are not applied
            # to the parts created by this split
            break
        else:
            # none of the features matched
            result.append(seg)
    return result
def split(self, segmentID: int, chunkLength: int):
    """
    Split one segment into consecutive chunks of a fixed length.

    The last chunk is shorter if the segment length is not a multiple of
    ``chunkLength``.

    :param segmentID: Index into ``self.segments`` of the segment to split.
        Negative indices count from the end of the list.
    :param chunkLength: Length of the chunks to create.
    :return: A new list of segments with the selected segment replaced by its
        chunks, or ``self.segments`` itself if the selected segment is not
        longer than ``chunkLength``.
    :raises ValueError: If a split is required but ``chunkLength`` is not positive.
    :raises IndexError: If ``segmentID`` is out of range.
    """
    selSeg = self.segments[segmentID]
    if chunkLength >= selSeg.length:
        return self.segments

    if chunkLength < 1:
        # A negative step makes range() empty, which silently dropped the segment.
        raise ValueError(
            "chunkLength must be a positive number, not {}.".format(repr(chunkLength)))
    if segmentID < 0:
        # Normalise, otherwise the slices below duplicate parts of the list.
        segmentID += len(self.segments)

    newSegs = [
        MessageSegment(selSeg.analyzer, chunkoff,
                       min(selSeg.nextOffset - chunkoff, chunkLength))
        for chunkoff in range(selSeg.offset, selSeg.nextOffset, chunkLength)
    ]
    return self.segments[:segmentID] + newSegs + self.segments[segmentID + 1:]
def calcHexDist(hexA, hexB):
    """
    Calculate the distance between two byte sequences given as hex strings.

    Each hex string is wrapped into a RawMessage with a Value analyzer and
    one segment spanning the whole message. The two segments are then
    compared by a DistanceCalculator.

    :param hexA: Hex string of the first byte sequence.
    :param hexB: Hex string of the second byte sequence.
    :return: The result of ``DistanceCalculator.pairDistance`` for both segments.
    """
    from netzob.Model.Vocabulary.Messages.RawMessage import RawMessage
    from inference.analyzers import Value
    from inference.segments import MessageSegment
    from inference.templates import DistanceCalculator

    payloads = [bytes.fromhex(hexA), bytes.fromhex(hexB)]
    rawMessages = list(map(RawMessage, payloads))
    valueAnalyzers = list(map(Value, rawMessages))

    wholeSegments = []
    for valueAnalyzer in valueAnalyzers:
        wholeSegments.append(
            MessageSegment(valueAnalyzer, 0, len(valueAnalyzer.message.data)))

    calculator = DistanceCalculator(wholeSegments)
    segmentA, segmentB = wholeSegments
    return calculator.pairDistance(segmentA, segmentB)
def messageSegmentation(self) -> List[MessageSegment]:
    """
    Segment message by determining inflection points of sigma-s-gauss-filtered bit-congruence.
    The cut position is the delta max of the unsmoothed bcd in the scope of a min/max (rising) pair.

    Cut candidates are placed one byte before each position in the first
    element of the result of ``self.inflectionPoints()``. Candidates that
    would create a segment of only one byte are skipped. The segments always
    cover the message up to its end.

    :return: Segmentation of this message based on this analyzer's type.
    :raises ValueError: If neither values nor analysis parameters are set.
    """
    if not self.values:
        if not self._analysisArgs:
            raise ValueError('No values or analysis parameters set.')
        self.analyze()

    # cut one byte before the inflection
    inflectionPoints = self.inflectionPoints()
    inflectionCuts = [int(i) - 1 for i in inflectionPoints[0]]

    # Additionally cutting one byte before high plateaus (self.bcHighPlateaus())
    # has yielded mixed quality results (was better for dhcp, much worse for ntp and dns).
    # TODO probably having some kind of precedence whether inflection or plateau is to be kept
    #   if both cut positions are near to each other might make this worthwhile.

    # get candidates to cut segments from message, including the message end
    messageEnd = len(self._message.data)
    cutCandidates = [0] + inflectionCuts + [messageEnd]
    # cut only where a segment is of a length larger than 1
    cutPositions = [0] + [
        right for left, right in zip(cutCandidates[:-1], cutCandidates[1:])
        if right - left > 1
    ]
    # add the end of the message if its not already there
    if cutPositions[-1] != messageEnd:
        if len(cutPositions) > 1:
            # move the last cut to the end, merging the too short tail into the last segment
            cutPositions[-1] = messageEnd
        else:
            # Only the message start is left: keep it, otherwise no segment would be created.
            cutPositions.append(messageEnd)

    return [
        MessageSegment(self, cutCurr, cutNext - cutCurr)
        for cutCurr, cutNext in zip(cutPositions[:-1], cutPositions[1:])
    ]
def merge(self):
    """
    Merge consecutive segments that fulfil the merge condition.

    Each pair of neighbouring input segments is inspected. If the right one
    starts directly at the end of the left one and ``self.condition()`` holds
    for the pair, the most recently emitted segment is replaced by one that is
    extended by the length of the right segment. Otherwise the right segment
    is emitted unchanged.

    :return: a new set of segments after the input has been merged
    """
    result = self.segments[0:1]
    # TODO check for equal analyzer, requires implementing a suitable equality-check in analyzer
    for left, right in zip(self.segments, self.segments[1:]):
        isAdjacent = left.offset + left.length == right.offset
        if isAdjacent and self.condition(left, right):
            tail = result[-1]
            result[-1] = MessageSegment(tail.analyzer, tail.offset,
                                        tail.length + right.length)
            if self._debug:
                print("Merged segments: \n{} and \n{} into \n{}".format(left, right, result[-1]))
        else:
            result.append(right)
    return result
def messageSegmentation(self) -> List[MessageSegment]:
    """
    Segment the message by recursive splitting at deviation pivots.

    A segment spanning the whole message is created on a BitCongruence
    analyzer for the message and handed to ``self._recursivePivotVar()``.
    The resulting segments are returned ordered by their offset.

    If the debug flag of this analyzer is set, the method waits for user
    input before returning.

    :return: Segmentation of this message, sorted by segment offset.
    """
    wholeMessage = MessageSegment(BitCongruence(self.message), 0, len(self._message.data))
    orderedSegments = list(self._recursivePivotVar(wholeMessage))
    orderedSegments.sort(key=lambda seg: seg.offset)

    # NOTE: A per-byte list of the segments' deviations was formerly derived from
    # the sorted segments and stored as this analyzer's values. This is disabled.

    if self.__debug:
        input('next message: ')
    return orderedSegments
def messageSegmentation(self) -> List[MessageSegment]:
    """
    Segment the message at steep value changes and at borders of zero values.

    The cut positions are the union of ``self.steepChanges(200)`` and
    ``self.zeroBorders()``, each shifted by ``self._startskip``. The start
    and the end of the message are added as cut positions if they are missing,
    also when no other cut position was found at all.

    :return: Segmentation of this message based on this analyzer's type.
    """
    if not self.values:
        self.analyze()

    # value drop or rise more than 200 (?) in one step, split at highest abs(value)
    sc = self.steepChanges(200)  # TODO iterate best value
    # and value drop to or rise from 0, split at the non-zero value
    zb = self.zeroBorders()

    cutat = numpy.add(sorted(set(sc + zb)), self._startskip).tolist()
    messageEnd = len(self._message.data)
    # Guard against an empty list of cuts: indexing it would raise an IndexError.
    if not cutat or cutat[0] != 0:
        cutat = [0] + cutat
    if cutat[-1] != messageEnd:
        cutat = cutat + [messageEnd]  # add the message end

    return [
        MessageSegment(self, cutCurr, cutNext - cutCurr)
        for cutCurr, cutNext in zip(cutat[:-1], cutat[1:])
    ]
def generateTestSegments():
    """
    Create a fixed set of short test segments.

    Each of the hard-coded byte sequences is wrapped into a RawMessage with a
    Value analyzer and one segment spanning the whole message.

    :return: List of MessageSegments, one per byte sequence, in the order of definition.
    """
    from netzob.Model.Vocabulary.Messages.RawMessage import RawMessage
    from inference.analyzers import Value
    from inference.segments import MessageSegment

    byteValues = (
        (1, 2, 3, 4),
        (2, 3, 4),
        (1, 3, 4),
        (2, 4),
        (2, 3),
        (20, 30, 37, 50, 69, 2, 30),
        (37, 5, 69),
        (0, 0, 0, 0),
        (3, 2, 3, 4),
    )
    payloads = [bytes(values) for values in byteValues]
    rawMessages = list(map(RawMessage, payloads))
    valueAnalyzers = list(map(Value, rawMessages))

    wholeSegments = []
    for valueAnalyzer in valueAnalyzers:
        wholeSegments.append(
            MessageSegment(valueAnalyzer, 0, len(valueAnalyzer.message.data)))
    return wholeSegments
def messageSegmentation(self) -> List[MessageSegment]:
    """
    Segment the message at sudden changes of the progression delta steepness.

    Note from the original author: produces very bad/unusable results.

    The cut positions are the result of ``self.steepChanges(.3)`` shifted by
    ``self._startskip``, complemented by the start and the end of the message.

    :return: Segmentation of this message based on this analyzer's type.
    """
    if not self.values:
        self.analyze()

    # sudden drop (inversion?) in progression delta steepness.
    steepPositions = self.steepChanges(.3)  # TODO iterate best value
    boundaries = numpy.add(sorted(set(steepPositions)), self._startskip).tolist()

    # make sure the boundaries start at the beginning of the message ...
    if not boundaries or boundaries[0] != 0:
        boundaries = [0] + boundaries
    # ... and stop at its end
    messageEnd = len(self._message.data)
    if boundaries[-1] != messageEnd:
        boundaries = boundaries + [messageEnd]

    return [
        MessageSegment(self, start, stop - start)
        for start, stop in zip(boundaries[:-1], boundaries[1:])
    ]
def messageSegmentation(self) -> List[MessageSegment]:
    """
    Segment message by determining local extrema of sigma-1.5-gauss-filtered 2-byte-horizon bit-congruence.
    The cut candidates are the positions returned by ``self.pinpointMinima()``.

    >>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage
    >>> tstmsg = '19040aec0000027b000012850a6400c8d23d06a2535ed71ed23d09faa4673315d23d09faa1766325d23d09faa17b4b10'
    >>> l4m = L4NetworkMessage(bytes.fromhex(tstmsg))
    >>> hbg = HorizonBitcongruenceGauss(l4m)
    >>> hbg.setAnalysisParams()
    >>> hbg.analyze()
    >>> spm = hbg.messageSegmentation()
    >>> print(b''.join([seg.bytes for seg in spm]).hex() == spm[0].message.data.hex())
    True

    :return: Segmentation of this message based on this analyzer's type.
    :raises ValueError: If neither values nor analysis parameters are set.
    """
    if not self.values:
        if not self._analysisArgs:
            raise ValueError('No values or analysis parameters set.')
        self.analyze()

    bclmins = self.pinpointMinima()

    messageEnd = len(self._message.data)
    cutCandidates = [0] + [int(b) for b in bclmins] + [messageEnd]  # add the message end
    # prevent 1 byte segments, since they do not contain usable congruence!
    cutPositions = [0] + [
        right for left, right in zip(cutCandidates[:-1], cutCandidates[1:])
        if right - left > 1
    ]
    # make sure the last segment reaches the end of the message
    if cutPositions[-1] != messageEnd:
        if len(cutPositions) > 1:
            # move the last cut to the end, merging the too short tail into the last segment
            cutPositions[-1] = messageEnd
        else:
            # Only the message start is left: keep it, otherwise no segment would be created.
            cutPositions.append(messageEnd)

    return [
        MessageSegment(self, lmaxCurr, lmaxNext - lmaxCurr)
        for lmaxCurr, lmaxNext in zip(cutPositions[:-1], cutPositions[1:])
    ]
def _recursivePivotMean(self, segment: MessageSegment): """ Recursively split the segment in half, calculate the mean for the values of each of the two resulting sub-segments, and compare each of them to the original segments mean. If a sub-segment is sufficiently different from its parent (meanThreshold = .02) further split the sub-segment. :param segment: One message segment that should be segmented. :return: List of segments after the splitting. """ if not segment.values: segment.analyzer.analyze() mymean = segment.mean() if segment.length >= 4: # we need two bytes for each segment to get a bit congruence of them pivot = segment.length // 2 leftSegment = MessageSegment(segment.analyzer, segment.offset, pivot) rightSegment = MessageSegment(segment.analyzer, segment.offset + pivot, segment.length - pivot) # test for recursion conditions returnSegments = list() if abs(leftSegment.mean() - mymean) > self._meanThreshold: # still different returnSegments.extend(self._recursivePivotMean(leftSegment)) else: returnSegments.append(leftSegment) if abs(rightSegment.mean() - mymean) > self._meanThreshold: # still different returnSegments.extend(self._recursivePivotMean(rightSegment)) else: returnSegments.append(rightSegment) # if abs(lsm - rsm) > .1: # still different return returnSegments else: return [segment]
def cacheAndLoadDC(pcapfilename: str, analysisTitle: str, tokenizer: str, debug: bool,
                   analyzerType: type, analysisArgs: Tuple=None, sigma: float=None, filterTrivial=False,
                   refinementCallback:Union[Callable, None] = refinements,
                   disableCache=False) \
        -> Tuple[SpecimenLoader, MessageComparator, List[Tuple[MessageSegment]], DistanceCalculator, float, float]:
    """
    Cache or load the DistanceCalculator to or from the filesystem.

    If no cache file exists for the given combination of parameters (or if
    caching is disabled), the messages are loaded from the pcap file,
    segmented by the selected tokenizer, and the distances between all
    segments are calculated. The result is pickled into a file in
    ``cacheFolder``. Otherwise the segments, comparator, and distance
    calculator are unpickled from the existing cache file.

    :param pcapfilename: Path of the pcap file to load the messages from.
    :param analysisTitle: Title of the analysis; becomes part of the cache file name.
    :param tokenizer: Segmenter to use. The values handled here are
        "tshark", "4bytesfixed", and "nemesys". For any other value no
        segments are assigned and the function fails when they are first used.
    :param debug: Passed on as debug flag to the MessageComparator.
    :param analyzerType: Type of the analysis to use for the segments.
    :param analysisArgs: Arguments for the analysis method.
    :param sigma: Sigma passed to the NEMESYS segmentation. If not set, the value
        is looked up in ``sigmapertrace`` by the pcap's base name and falls back to 0.9.
    :param filterTrivial: Filter out **one-byte** segments and such just consisting of **zeros**.
    :param refinementCallback: Function to refine the NEMESYS segments, or None for
        no refinement. If it accepts more than one positional argument, a distance
        calculator for the unrefined segments is passed as second argument.
    :param disableCache: When experimenting with distances manipulation, deactivate caching!
    :return: Tuple of specimens, comparator, segmented messages, distance calculator,
        segmentation time, and distance calculation time. Both times are None
        if the result was loaded from the cache.
    """
    pcapbasename = os.path.basename(pcapfilename)
    # if refinementCallback == pcaMocoRefinements:
    #     sigma = pcamocoSigmapertrace[pcapbasename] if not sigma and pcapbasename in pcamocoSigmapertrace else \
    #         0.9 if not sigma else sigma
    # else:
    # use the trace specific sigma if none is given explicitly, 0.9 as the last resort
    sigma = sigmapertrace[pcapbasename] if not sigma and pcapbasename in sigmapertrace else \
        0.9 if not sigma else sigma
    pcapName = os.path.splitext(pcapbasename)[0]
    # for nemesys, sigma (times ten, without decimals) becomes part of the cache file name
    # noinspection PyUnboundLocalVariable
    tokenparm = tokenizer if tokenizer != "nemesys" else \
        "{}{:.0f}".format(tokenizer, sigma * 10)
    dccachefn = os.path.join(cacheFolder, 'cache-dc-{}-{}-{}-{}-{}.{}'.format(
        analysisTitle, tokenparm, "filtered" if filterTrivial else "all",
        refinementCallback.__name__ if refinementCallback is not None else "raw",
        pcapName, 'ddc'))
    # dccachefn = 'cache-dc-{}-{}-{}.{}'.format(analysisTitle, tokenizer, pcapName, 'dc')
    if disableCache or not os.path.exists(dccachefn):
        # dissect and label messages
        print("Load messages from {}...".format(pcapName))
        specimens = SpecimenLoader(pcapfilename, 2, True)
        comparator = MessageComparator(specimens, 2, True, debug=debug)

        print("Segmenting messages...", end=' ')
        segmentationTime = time.time()
        # select tokenizer by command line parameter
        if tokenizer == "tshark":
            # 1. segment messages according to true fields from the labels
            segmentedMessages = annotateFieldTypes(analyzerType, analysisArgs, comparator)
        elif tokenizer == "4bytesfixed":
            # 2. segment messages into fixed size chunks for testing
            segmentedMessages = segmentsFixed(4, comparator, analyzerType, analysisArgs)
        elif tokenizer == "nemesys":
            # 3. segment messages by NEMESYS
            segmentsPerMsg = bcDeltaGaussMessageSegmentation(specimens, sigma)

            # get analyzer requested by analyzerType/analysisArgs
            segmentedMessages = [[
                MessageSegment(MessageAnalyzer.findExistingAnalysis(
                    analyzerType, MessageAnalyzer.U_BYTE, seg.message, analysisArgs), seg.offset, seg.length)
                for seg in msg] for msg in segmentsPerMsg]

            # NOTE(review): the refinement is placed inside the nemesys branch here;
            #   the nesting was reconstructed from whitespace-collapsed source - confirm.
            if refinementCallback is not None:
                if refinementCallback.__code__.co_argcount > 1:
                    # assume the second argument is expected to be a distance calculator
                    chainedSegments = list(chain.from_iterable(segmentedMessages))
                    print("Refinement: Calculate distance for {} segments...".format(len(chainedSegments)))
                    # choose the calculator implementation by the size of the distance matrix
                    if len(chainedSegments)**2 > MemmapDC.maxMemMatrix:
                        refinementDC = MemmapDC(chainedSegments)
                    else:
                        refinementDC = DelegatingDC(chainedSegments)
                    segmentedMessages = refinementCallback(segmentedMessages, refinementDC)
                else:
                    segmentedMessages = refinementCallback(segmentedMessages)

        # segments = list(chain.from_iterable(segmentedMessages))

        segmentationTime = time.time() - segmentationTime
        print("done.")

        if filterTrivial:
            # keep only segments longer than one byte that do not consist of zeros only
            # noinspection PyUnboundLocalVariable
            chainedSegments = [seg for seg in chain.from_iterable(segmentedMessages) if
                               seg.length > 1 and set(seg.values) != {0}]
        else:
            # noinspection PyUnboundLocalVariable
            chainedSegments = list(chain.from_iterable(segmentedMessages))

        print("Calculate distance for {} segments...".format(len(chainedSegments)))
        # dc = DistanceCalculator(chainedSegments, reliefFactor=0.33)  # Pairwise similarity of segments: dc.distanceMatrix
        dist_calc_segmentsTime = time.time()
        # choose the calculator implementation by the size of the distance matrix
        if len(chainedSegments)**2 > MemmapDC.maxMemMatrix:
            dc = MemmapDC(chainedSegments)
        else:
            dc = DelegatingDC(chainedSegments)
        assert chainedSegments == dc.rawSegments
        dist_calc_segmentsTime = time.time() - dist_calc_segmentsTime
        try:
            with open(dccachefn, 'wb') as f:
                pickle.dump((segmentedMessages, comparator, dc), f, pickle.HIGHEST_PROTOCOL)
        except MemoryError as e:
            # the cache file is incomplete in this case, so it is removed again
            print("DC could not be cached due to a MemoryError. Removing", dccachefn, "and continuing.")
            os.remove(dccachefn)
    else:
        print("Load distances from cache file {}".format(dccachefn))
        # NOTE: unpickling can execute arbitrary code contained in the file;
        #   only use cache folders with trusted content.
        with open(dccachefn, 'rb') as f:
            segmentedMessages, comparator, dc = pickle.load(f)
        # terminate the whole process if the cache file did not contain the expected types
        if not (isinstance(comparator, MessageComparator)
                and isinstance(dc, DistanceCalculator)):
            print('Loading of cached distances failed.')
            exit(10)
        specimens = comparator.specimens
        # chainedSegments = list(chain.from_iterable(segmentedMessages))
        # no timing information is available for cached results
        segmentationTime, dist_calc_segmentsTime = None, None

    return specimens, comparator, segmentedMessages, dc, segmentationTime, dist_calc_segmentsTime
def messageSegmentation(self) -> List[MessageSegment]:
    """
    Segment message by determining local extrema of sigma-s-gauss-filtered sliding n-byte-mean bit-congruence.

    Cut candidates are collected from three sources, in this order:
    the local minima (``self.pinpointMinima()``), those local maxima
    (``self.pinpointMaxima()``) that pass a filter on the bit congruence delta,
    and the starts of plateaus of the bit congruences. A maximum or plateau
    start is only added if none of the positions directly before or after it
    is already a candidate. Candidates that would create a segment of only
    one byte are skipped. The segments always cover the message up to its end.

    >>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage
    >>> tstmsg = '19040aec0000027b000012850a6400c8d23d06a2535ed71ed23d09faa4673315d23d09faa1766325d23d09faa17b4b10'
    >>> l4m = L4NetworkMessage(bytes.fromhex(tstmsg))
    >>> hbg = HorizonBitcongruenceGauss(l4m)
    >>> hbg.setAnalysisParams()
    >>> hbg.analyze()
    >>> spm = hbg.messageSegmentation()
    >>> print(b''.join([seg.bytes for seg in spm]).hex() == spm[0].message.data.hex())
    True

    :return: Segmentation of this message based on this analyzer's type.
    :raises ValueError: If neither values nor analysis parameters are set.
    """
    if not self.values:
        if not self._analysisArgs:
            raise ValueError('No values or analysis parameters set.')
        self.analyze()

    bcd = MessageAnalyzer.findExistingAnalysis(BitCongruenceDelta, MessageAnalyzer.U_BYTE, self.message)

    # all local minima
    bclmins = self.pinpointMinima()
    # local maxima, if bc[e] < bc[e+1] or bc[e] > 2*s2mbc[e] for all e in cadidate indices
    bclmaxs = self.pinpointMaxima()
    bcdmaxs = [
        e for e in bclmaxs
        if bcd.values[e + 1] > bcd.values[e] or bcd.values[e] > 2 * self.bitcongruences[e]
    ]
    # Work on a copy: appending below must not alter the list returned by pinpointMinima().
    minmax = list(bclmins)
    for bdm in bcdmaxs:  # only keep bcdmaxs if not in scope if min
        if bdm + 1 not in minmax and bdm - 1 not in minmax:
            minmax.append(bdm)
    # starts of plateaus of bit congruences
    bcplats = MessageAnalyzer.plateouStart(self.bitcongruences)[0]  # bcd.values
    for bps in bcplats:  # only keep platoustarts if not in scope if min or max
        if bps + 1 not in minmax and bps - 1 not in minmax:
            minmax.append(bps)

    # # separate nan-values
    # nansep = MessageAnalyzer.separateNaNs(self.values)
    relevantPositions = sorted(minmax)
    # get candidates to cut segments from message, including the message end
    messageEnd = len(self._message.data)
    cutCandidates = [0] + [int(b) for b in relevantPositions if not numpy.isnan(b)] \
        + [messageEnd]
    # cut only where a segment is of a length larger than 1
    cutPositions = [0] + [
        right for left, right in zip(cutCandidates[:-1], cutCandidates[1:])
        if right - left > 1
    ]
    # cutPositions = list(sorted(cutPositions + nansep[0]))
    # add the end of the message if its not already there
    if cutPositions[-1] != messageEnd:
        if len(cutPositions) > 1:
            # move the last cut to the end, merging the too short tail into the last segment
            cutPositions[-1] = messageEnd
        else:
            # Only the message start is left: keep it, otherwise no segment would be created.
            cutPositions.append(messageEnd)

    return [
        MessageSegment(self, lmaxCurr, lmaxNext - lmaxCurr)
        for lmaxCurr, lmaxNext in zip(cutPositions[:-1], cutPositions[1:])
    ]
def _recursivePivotVar(self, segment: MessageSegment): """ Recursively split the segment at positions shifting from 2 to n-2, calculate the standard deviation for the values of each of the two resulting sub-segments, and compare each of them to the original segments deviation. If a sub-segment is sufficiently different from its parent (varThreshold = 0.5 parentvar * min(len(vl), len(vr))/(len(vl) + len(vr))) further split the sub-segment. :param segment: One message segment that should be segmented. :return: List of segments after the splitting. """ if not segment.values: segment.analyzer.analyze() myvar = segment.stdev() if segment.length >= 4: # we need two bytes for each segment to get a bit congruence of them # select a suitable pivot: find the one yielding the highest deviation-difference from parent segmentSplit = dict() for pivot in range(2, segment.length - 1): leftSegment = MessageSegment(segment.analyzer, segment.offset, pivot) rightSegment = MessageSegment(segment.analyzer, segment.offset + pivot, segment.length - pivot) # deviation needs to be higher towards the edges to be a probable splitting point lenweight = 2 * min(leftSegment.length, rightSegment.length) / segment.length # add splits: varDiff: (leftSegment, rightSegment) segmentSplit[abs(leftSegment.stdev() - rightSegment.stdev()) * lenweight] \ = (leftSegment, rightSegment) if self.__debug: from tabulate import tabulate print( tabulate( sorted([(wlrdiff, ls.offset, ls.stdev(), rs.offset, rs.stdev(), rs.offset + rs.length) for wlrdiff, (ls, rs) in segmentSplit.items()], key=lambda x: x[0]), headers=[ 'wlrdiff', 'l.o', 'lvar', 'r.o', 'rvar', 'r.b' ])) #abs(x[3] - x[4]) # use the segments splitted at selected pivot: search max varDiff in splits splitdiffmax = max(segmentSplit.keys()) leftSegment, rightSegment = segmentSplit[splitdiffmax] # weightedThresh = 0.5 * myvar * min(leftSegment.length, rightSegment.length) / segment.length weightedThresh = 0.1 * myvar if self.__debug: print('parent segment 
stdev:', myvar) print('weighted threshold:', weightedThresh) # test for recursion conditions: recurse if above weightedThresh returnSegments = list() if abs(leftSegment.stdev() - myvar) > weightedThresh: # still different if self.__debug: print('split left', leftSegment.offset) returnSegments.extend(self._recursivePivotVar(leftSegment)) else: if self.__debug: print('left finished', abs(rightSegment.stdev() - myvar)) returnSegments.append(leftSegment) if abs(rightSegment.stdev() - myvar) > weightedThresh: # still different if self.__debug: print('split right', rightSegment.offset) returnSegments.extend(self._recursivePivotVar(rightSegment)) else: if self.__debug: print('right finished', abs(rightSegment.stdev() - myvar)) returnSegments.append(rightSegment) # if abs(lsm - rsm) > .1: # still different return returnSegments else: return [segment]
def merge(self):
    """
    Merge runs of consecutive segments that together form a char sequence.

    Segments are accumulated until they hold at least six bytes. As long as
    the accumulated bytes pass ``isExtendedCharSeq()`` and contain no two
    consecutive zero bytes, further segments are added. The segment that ends
    such a run is put back for the next round and the segments before it are
    merged into one. If the accumulated bytes never were a char sequence,
    only the first accumulated segment is emitted unchanged and accumulation
    restarts at the segment following it.

    >>> from utils.loader import SpecimenLoader
    >>> from inference.segmentHandler import bcDeltaGaussMessageSegmentation
    >>> from inference.formatRefinement import CumulativeCharMerger
    >>> sl = SpecimenLoader('../input/dns_ictf2010_deduped-100.pcap', layer=0, relativeToIP=True)
    >>> segmentsPerMsg = bcDeltaGaussMessageSegmentation(sl)
    Segmentation by inflections of sigma-0.6-gauss-filtered bit-variance.
    >>> for messageSegments in segmentsPerMsg:
    ...     ccm = CumulativeCharMerger(messageSegments)
    ...     ccmmsg = ccm.merge()
    ...     if ccmmsg != messageSegments:
    ...         sgms = b''.join([m.bytes for m in ccmmsg])
    ...         sgss = b''.join([m.bytes for m in messageSegments])
    ...         if sgms != sgss:
    ...             print("Mismatch!")

    :return: a new set of segments after the input has been merged
    """
    from inference.segmentHandler import isExtendedCharSeq

    minLen = 6

    pending = list(reversed(self.segments))  # stack with the next input segment on top
    merged = []
    candidateOpen = False
    window = []
    while pending:
        window.append(pending.pop())
        if sum(len(member.bytes) for member in window) < minLen:
            continue

        # the window now holds at least minLen bytes
        windowBytes = b"".join(member.bytes for member in window)
        if isExtendedCharSeq(windowBytes) and b"\x00\x00" not in windowBytes:
            # the window is a (still growing) char candidate
            candidateOpen = True
            continue

        if candidateOpen:
            # the last segment ended the char candidate
            candidateOpen = False
            if len(window) > 2:
                mergedLength = sum(member.length for member in window[:-1])
                merged.append(MessageSegment(window[0].analyzer, window[0].offset, mergedLength))
            else:
                # retain the original segment (for equality test and to save creating a new object instance)
                merged.append(window[0])
            if len(window) > 1:
                pending.append(window[-1])
        else:
            # there was not a char candidate: emit the first segment, retry with the others
            merged.append(window[0])
            pending.extend(reversed(window[1:]))
        window = []

    if candidateOpen and len(window) > 1:
        # the message ended within a char candidate of multiple segments
        mergedLength = sum(member.length for member in window)
        merged.append(MessageSegment(window[0].analyzer, window[0].offset, mergedLength))
    else:
        # there was no char sequence and there are segments in the working stack left
        merged.extend(window)
    return merged
def segmentsFixed(length: int, comparator, analyzerType: type, analysisArgs: Union[Tuple, None],
                  unit=MessageAnalyzer.U_BYTE, padded=False) \
        -> List[Tuple[MessageSegment]]:
    """
    Segment messages into fixed size chunks.

    >>> from utils.loader import SpecimenLoader
    >>> from validation.dissectorMatcher import MessageComparator
    >>> from inference.analyzers import Value
    >>> from inference.segmentHandler import segmentsFixed
    >>> specimens = SpecimenLoader("../input/ntp_SMIA-20111010_deduped-100.pcap", 2, True)
    >>> comparator = MessageComparator(specimens, 2, True, debug=False)
    >>> segmentedMessages = segmentsFixed(4, comparator, Value, None)
    >>> areIdentical = True
    >>> for msgsegs in segmentedMessages:
    ...     msg = msgsegs[0].message
    ...     msgbytes = b"".join([seg.bytes for seg in msgsegs])
    ...     areIdentical = areIdentical and msgbytes == msg.data
    >>> print(areIdentical)
    True

    :param length: Fixed length for all segments.
    :param comparator: Comparator that contains the payload messages.
    :param analyzerType: Type of the analysis. Subclass of inference.analyzers.MessageAnalyzer.
    :param analysisArgs: Arguments for the analysis method.
    :param unit: Base unit for the analysis. Either MessageAnalyzer.U_BYTE or MessageAnalyzer.U_NIBBLE.
    :param padded: If True, the message data of an overhanging last segment is extended by zero bytes
        and its values by NANs to the requested fixed length. If False, the last segment is left
        shorter than length if the message length is not an exact multiple of the segment length.
    :return: One tuple of fixed size segments per message of the comparator.
    """
    segmentedMessages = []
    for l4msg, _ in comparator.messages.items():
        messageLength = len(l4msg.data)
        # offset up to which the message can be filled with complete chunks
        lastOffset = messageLength - messageLength % length

        originalAnalyzer = MessageAnalyzer.findExistingAnalysis(analyzerType, unit, l4msg, analysisArgs)
        sequence = []
        for offset in range(0, lastOffset, length):
            sequence.append(MessageSegment(originalAnalyzer, offset, length))

        if messageLength > lastOffset:  # append the overlap
            if not padded:
                sequence.append(
                    MessageSegment(originalAnalyzer, lastOffset, messageLength - lastOffset))
            else:
                # here are nasty hacks!
                # TODO Better define a new subclass of MessageSegment that internally padds values
                #  (and bytes? what are the guarantees?) to a given length that exceeds the message length
                residuepadd = lastOffset + length - messageLength
                paddedMessage = copy.copy(originalAnalyzer.message)
                paddedMessage.data = paddedMessage.data + b'\x00' * residuepadd
                paddedAnalyzer = type(originalAnalyzer)(
                    paddedMessage, originalAnalyzer.unit)  # type: MessageAnalyzer
                paddedAnalyzer.setAnalysisParams(*originalAnalyzer.analysisParams)
                paddedAnalyzer._values = originalAnalyzer.values + [numpy.nan] * residuepadd
                lastSegment = MessageSegment(paddedAnalyzer, lastOffset, length)
                # replace all previous analyzers to make the sequence homogeneous for this message
                for seg in sequence:
                    seg.analyzer = paddedAnalyzer
                sequence.append(lastSegment)

        segmentedMessages.append(tuple(sequence))
    return segmentedMessages