def test_semanticAlignment_bug1(self):
    """test_semanticAlignment_bug1:
    A bug on the semantic alignment has been identified which prevent the
    computation of a valid regex. This test verifies the bug is not comming back.
    @date 18/04/2013
    """
    # Two messages sharing the layout <digit><firstname><8 random chars><email>.
    name_a = "antoine"
    mail_a = "*****@*****.**"
    name_b = "luc"
    mail_b = "*****@*****.**"

    payload_a = TypeConvertor.stringToNetzobRaw("6" + name_a + "GAHFSHQS" + mail_a)
    payload_b = TypeConvertor.stringToNetzobRaw("3" + name_b + "CVSDHISD" + mail_b)
    message_a = RawMessage(uuid.uuid4(), None, payload_a)
    message_b = RawMessage(uuid.uuid4(), None, payload_b)

    project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
    nwEngine = NeedlemanAndWunsch(8, project, False, None)
    symbol = Symbol(uuid.uuid4(), "Test", project)
    symbol.addMessages([message_a, message_b])

    # Tag the semantic zones of each message. Offsets are doubled: presumably
    # they address half-byte positions in the netzob raw representation —
    # TODO confirm against addSemanticTag's contract.
    for message, name, mail in ((message_a, name_a, mail_a), (message_b, name_b, mail_b)):
        message.addSemanticTag("firstname", 2, 2 + len(name) * 2)
        message.addSemanticTag("email", 2 + len(name) * 2 + 16, 2 + len(name) * 2 + 16 + len(mail) * 2)

    nwEngine.alignField(symbol.getField())
    symbol.getField().setFormat(Format.STRING)

    print("Computed Regex : {0}".format(symbol.getRegex()))
    print(symbol.getCells(True))

    # The bug produced a single monolithic field; a valid alignment yields several.
    computedFields = symbol.getExtendedFields()
    self.assertTrue(len(computedFields) > 1, "Only one field has been computed which tells us something went wrong.")
def test_randomAlignmentsWithTwoCenteredMessages(self): workspace = self.getWorkspace() currentProject = workspace.getProjects()[0] doInternalSlick = currentProject.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_DO_INTERNAL_SLICK) defaultFormat = currentProject.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_GLOBAL_FORMAT) defaultUnitSize = 8 # We generate 1000 random couples of data and try to align them # Objectives: just test if it executes nb_data = 1000 nb_failed = 0 nb_success = 0 for i_test in range(0, nb_data): common_pattern = self.generateRandomString(30, 40) # Generate the content of two messages data1 = TypeConvertor.stringToNetzobRaw(self.generateRandomString(5, 100) + common_pattern + self.generateRandomString(5, 100)) data2 = TypeConvertor.stringToNetzobRaw(self.generateRandomString(5, 100) + common_pattern + self.generateRandomString(5, 100)) # Create the messages message1 = RawMessage(str(uuid.uuid4()), str(time.time()), data1) message2 = RawMessage(str(uuid.uuid4()), str(time.time()), data2) # Create the symbol symbol = Symbol(str(uuid.uuid4()), "test_randomAlignments#" + str(i_test), currentProject) symbol.addMessage(message1) symbol.addMessage(message2) field = symbol.getField() # Starts the alignment process alignmentProcess = NeedlemanAndWunsch(defaultUnitSize, currentProject, False, self.emptyAlignmentCB) alignmentProcess.alignField(field) if not TypeConvertor.stringToNetzobRaw(common_pattern[:]) in field.getAlignment(): if self.debug is True: print "Message 1: " + str(data1) print "Message 2: " + str(data2) print "Common pattern: " + TypeConvertor.stringToNetzobRaw(common_pattern) print "Alignment: " + field.getAlignment() nb_failed += 1 else: nb_success += 1 if nb_failed > 0: print "A number of " + str(nb_failed) + "/" + str(nb_data) + " alignment failed !" self.assertEqual(0, nb_failed) self.assertEqual(nb_success, nb_data)
def test_semanticAlignment_bug1(self):
    """test_semanticAlignment_bug1:
    A bug on the semantic alignment has been identified which prevent the
    computation of a valid regex. This test verifies the bug is not comming back.
    @date 18/04/2013
    """
    firstFirstname = "antoine"
    firstEmail = "*****@*****.**"
    secondFirstname = "luc"
    secondEmail = "*****@*****.**"

    # Layout of each message: <digit><firstname><8 chars><email>
    firstMessage = RawMessage(uuid.uuid4(), None, TypeConvertor.stringToNetzobRaw("6" + firstFirstname + "GAHFSHQS" + firstEmail))
    secondMessage = RawMessage(uuid.uuid4(), None, TypeConvertor.stringToNetzobRaw("3" + secondFirstname + "CVSDHISD" + secondEmail))

    project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
    alignmentEngine = NeedlemanAndWunsch(8, project, False, None)
    symbol = Symbol(uuid.uuid4(), "Test", project)
    symbol.addMessages([firstMessage, secondMessage])

    # Doubled offsets: presumably half-byte positions in the raw encoding —
    # TODO confirm.
    firstNameEnd = 2 + len(firstFirstname) * 2
    firstMessage.addSemanticTag("firstname", 2, firstNameEnd)
    firstMessage.addSemanticTag("email", firstNameEnd + 16, firstNameEnd + 16 + len(firstEmail) * 2)
    secondNameEnd = 2 + len(secondFirstname) * 2
    secondMessage.addSemanticTag("firstname", 2, secondNameEnd)
    secondMessage.addSemanticTag("email", secondNameEnd + 16, secondNameEnd + 16 + len(secondEmail) * 2)

    alignmentEngine.alignField(symbol.getField())
    symbol.getField().setFormat(Format.STRING)

    print("Computed Regex : {0}".format(symbol.getRegex()))
    print("=======")
    print(symbol.getCells(True))

    # A single computed field means the alignment collapsed: the bug is back.
    computedFields = symbol.getExtendedFields()
    self.assertTrue(len(computedFields) > 1, "Only one field has been computed which tells us something went wrong.")
def test_semanticAlignment_simple(self):
    """test_semanticAlignment_simple:
    Test that messages with embedded semantic are efficiently aligned.
    Format : <random 10 bytes><random username><random 5 ASCII><random email>

    Optimal Needleman & Wunsch Parameters :
        // Cost definitions for the alignment
        static const short int MATCH = 5;
        static const short int SEMANTIC_MATCH = 30;
        static const short int MISMATCH = -5;
        static const short int GAP = 0;
        static const short int BLEN = 10;
        // Consts for the definition of a mask
        static const unsigned char END = 2;
        static const unsigned char DIFFERENT = 1;
        static const unsigned char EQUAL = 0;
    """
    project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
    symbol = Symbol(uuid.uuid4(), "Test", project)
    nbMessage = 500
    usernames = []
    emails = []
    for iMessage in range(0, nbMessage):
        # Random username part
        str_username = self.generateRandomString(4, 10)
        username = TypeConvertor.stringToNetzobRaw(str_username)
        usernames.append(str_username)

        # Random email part: <prefix>@<domain>.<extension>
        email_prefix = self.generateRandomString(4, 10)
        email_domain = self.generateRandomString(4, 10)
        email_extension = self.generateRandomString(2, 3)
        str_email = "{0}@{1}.{2}".format(email_prefix, email_domain, email_extension)
        emails.append(str_email)
        email = TypeConvertor.stringToNetzobRaw(str_email)

        random10Bytes = self.generateRandomBytes(10, 10)
        random5ASCII = TypeConvertor.stringToNetzobRaw(self.generateRandomString(5, 5))

        data = "{0}{1}{2}{3}".format(random10Bytes, username, random5ASCII, email)
        message = RawMessage(uuid.uuid4(), None, data)
        # Tag the semantic zones by their offsets inside the raw data.
        message.addSemanticTag("username", len(random10Bytes), len(random10Bytes) + len(username))
        message.addSemanticTag("email", len(random10Bytes) + len(username) + len(random5ASCII), len(random10Bytes) + len(username) + len(random5ASCII) + len(email))
        symbol.addMessage(message)

    nwEngine = NeedlemanAndWunsch(8, project, False, None)
    nwEngine.alignField(symbol.getField())
    symbol.getField().setFormat(Format.STRING)

    # FIX: this format string was previously split across a physical line,
    # embedding a stray newline in the middle of the literal.
    print("Number of computed fields : {0}".format(len(symbol.getExtendedFields())))
    # 4 expected fields: random10Bytes / username / random5ASCII / email
    self.assertEqual(4, len(symbol.getExtendedFields()))

    nbValidMessages = 0
    for message in symbol.getMessages():
        isValid = symbol.getField().isRegexValidForMessage(message)
        if isValid:
            nbValidMessages += 1
        self.assertTrue(isValid)

    print(symbol.getCells())
    print("Computed regex is valid for {0}/{1} messages.".format(nbValidMessages, len(symbol.getMessages())))
class UPGMA(object):
    """This class provides the required methods to compute clustering
    between multiple symbols/messages using UPGMA algorithms (see
    U{http://en.wikipedia.org/wiki/UPGMA}). When processing, the matrix of
    scores is computed by the C extensions (L{_libScoreComputation}) and
    used to regroup messages and symbols into equivalent cluster."""

    def __init__(self, project, symbols, unitSize, cb_status=None, scores=None):
        """Constructor.
        @param project: the current project (provides the clustering configuration)
        @param symbols: the symbols whose messages will be clustered
        @param unitSize: the unit size used by the alignment engine
        @param cb_status: optional callback fed with (stage, donePercent, message)
        @param scores: optional pre-computed score matrix (uid -> uid -> score)"""
        self.project = project
        self.unitSize = unitSize
        self.cb_status = cb_status
        # FIX: the previous signature used 'scores={}', a mutable default
        # argument shared across every instance; None is used as sentinel.
        self.scores = scores if scores is not None else {}
        # Then we retrieve all the parameters of the CLUSTERING / ALIGNMENT
        self.defaultFormat = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_GLOBAL_FORMAT)
        self.nbIteration = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_NB_ITERATION)
        self.minEquivalence = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_EQUIVALENCE_THRESHOLD)
        self.doInternalSlick = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_DO_INTERNAL_SLICK)
        self.log = logging.getLogger('netzob.Inference.Vocabulary.UPGMA.py')
        self.path = []
        self.flagStop = False
        self.currentAlignment = None
        # Create a symbol for each message: clustering starts from singletons.
        self.symbols = []
        i_symbol = 1
        for symbol in symbols:
            for m in symbol.getMessages():
                tmpSymbol = Symbol(str(uuid.uuid4()), "Symbol " + str(i_symbol), project)
                tmpSymbol.addMessage(m)
                self.symbols.append(tmpSymbol)
                i_symbol += 1
        self.log.debug("A number of {0} already aligned symbols will be clustered.".format(str(len(symbols))))

    def cb_executionStatus(self, stage, donePercent, currentMessage):
        """Callback function called by the C extension to provide info on status
        @param donePercent: a float between 0 and 100 included
        @param currentMessage: a str which represents the current alignment status"""
        if self.cb_status is None:
            self.log.info("[UPGMA status]" + str(donePercent) + "% " + currentMessage)
        else:
            self.cb_status(stage, donePercent, currentMessage)

    def executeClustering(self):
        """Execute the clustering operation
        @return the new list of symbols, or None if the operation was stopped"""
        self.log.debug("Re-Organize the symbols (nbIteration={0}, min_equivalence={1})".format(self.nbIteration, self.minEquivalence))
        # Process the UPGMA on symbols
        if self.isFinish():
            return None
        self.cb_executionStatus(0, 0, "Clustering into symbols...")
        self.processUPGMA()
        self.cb_executionStatus(1, 100, None)
        # Retrieve the alignment of each symbol and the build the associated regular expression
        self.cb_executionStatus(2, 0, "Compute the definition for each cluster...")
        if self.isFinish():
            return None
        self.currentAlignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        self.currentAlignment.absoluteStage = 2
        self.currentAlignment.statusRatio = len(self.symbols)
        self.currentAlignment.statusRatioOffset = 0
        for symbol in self.symbols:
            if self.isFinish():
                return None
            self.currentAlignment.alignField(symbol.getField())
            self.currentAlignment.statusRatioOffset = self.currentAlignment.statusRatioOffset + 1
        return self.symbols

    def processUPGMA(self):
        """Computes the matrix of equivalences (in C) and reduce it iteratively.
        @return a tuple (i_max, j_max, maxScore), or (None, None, None) when stopped"""
        self.log.debug("Computing the associated matrix")
        # Execute the Clustering part in C
        debug = False
        wrapper = WrapperArgsFactory("_libScoreComputation.getHighestEquivalentGroup")
        wrapper.typeList[wrapper.function](self.symbols)
        (i_max, j_max, maxScore, listScores) = _libScoreComputation.getHighestEquivalentGroup(self.doInternalSlick, self.cb_executionStatus, self.isFinish, debug, wrapper)
        # Retrieve the scores for each association of symbols
        self.scores = {}
        for (iuid, juid, score) in listScores:
            if self.isFinish():
                return (None, None, None)
            if iuid not in self.scores.keys():
                self.scores[iuid] = {}
            if juid not in self.scores.keys():
                self.scores[juid] = {}
            self.scores[iuid][juid] = score
            if iuid not in self.scores[juid].keys():
                self.scores[juid][iuid] = score
        # Reduce the UPGMA matrix (merge symbols by similarity)
        self.computePhylogenicTree()
        return (i_max, j_max, maxScore)

    def computePhylogenicTree(self):
        """Compute the phylogenic tree
        @var max_i: uid of i_maximum
        @var max_j: uid of j_maximum
        @var maxScore: the highest global score"""
        maxScore = 0
        status = 0
        step = (float(100) - float(self.minEquivalence)) / float(100)
        if len(self.scores) > 1:
            # Locate the best-scoring pair (max_i, max_j) in the matrix.
            max_i = max(self.scores, key=lambda x: self.scores[x][max(self.scores[x], key=lambda y: self.scores[x][y])])
            max_j = max(self.scores[max_i], key=lambda y: self.scores[max_i][y])
            maxScore = self.scores[max_i][max_j]
        # Merge pairs until only one cluster remains or the best score drops
        # below the configured equivalence threshold.
        while len(self.scores) > 1 and maxScore >= self.minEquivalence:
            if self.isFinish():
                return
            symbols_uid = [s.getID() for s in self.symbols]  # List of the UID of symbols
            (i_maximum, j_maximum) = (symbols_uid.index(max_i), symbols_uid.index(max_j))
            size_i = len(self.symbols[i_maximum].getMessages())
            size_j = len(self.symbols[j_maximum].getMessages())
            infoMessage = "Clustering {0} with {1} (score = {2})".format(str(i_maximum), str(j_maximum), str(maxScore))
            status = (float(100) - float(maxScore)) / float(step)
            self.cb_executionStatus(1, status, infoMessage)
            newuid = self.mergeEffectiveRowCol(i_maximum, j_maximum)
            self.updateScore(max_i, max_j, newuid, size_i, size_j)
            # self.log.debug("Score après: {0}".format(str(self.scores)))
            if len(self.scores) > 1:
                max_i = max(self.scores, key=lambda x: self.scores[x][max(self.scores[x], key=lambda y: self.scores[x][y])])
                max_j = max(self.scores[max_i], key=lambda y: self.scores[max_i][y])
                maxScore = self.scores[max_i][max_j]

    def updateScore(self, iuid, juid, newuid, size_i, size_j):
        """Update the score of two merged clusters.
        @param iuid: id of the first cluster merged
        @param juid: id of the second cluster merged
        @param newuid: new id of the merged cluster
        @param size_i: size of the first cluster
        @param size_j: size of the second cluster"""
        total_size = size_i + size_j
        del self.scores[iuid]
        del self.scores[juid]
        self.scores[newuid] = {}
        for k in self.scores.keys():
            if k != newuid:
                # UPGMA weighted average of the two merged clusters' scores.
                self.scores[k][newuid] = (size_i * self.scores[k][iuid] + size_j * self.scores[k][juid]) * 1.0 / total_size
                del self.scores[k][iuid]
                del self.scores[k][juid]
                self.scores[newuid][k] = self.scores[k][newuid]

    def computePathTree(self):
        """TODO ?"""
        if self.path == []:
            # NOTE(review): clusterIndex is computed but unused (the first key
            # is taken instead); kept as-is to preserve the random module's
            # state sequence. Py2-only: dict.keys()[0] is not indexable in Py3.
            clusterIndex = int(random.random() * len(self.scores.keys()))
            self.path.append(self.scores.keys()[0])
        if len(self.path) > 1:
            # Check if Cl-1,Cl-2 minimum pair
            lastId = self.path[len(self.path) - 1]
            if max(self.scores[lastId], key=lambda x: self.scores[lastId][x]) == self.path[len(self.path) - 2]:
                return
        while True:
            lastId = self.path[len(self.path) - 1]
            juid = max(self.scores[lastId], key=lambda x: self.scores[lastId][x])
            self.path.append(juid)
            if max(self.scores[juid], key=lambda x: self.scores[juid][x]) == lastId:
                break

    def mergeEffectiveRowCol(self, i_maximum, j_maximum):
        """Merge the symbols i and j in the "symbols" structure
        @param i_maximum: id of the first symbol to merge
        @param j_maximum: id of the second symbol to merge
        @return the newly created symbol result of the merged process"""
        # Extract symbols i and j (pop the larger index first so the smaller
        # index stays valid)
        if i_maximum > j_maximum:
            symbol1 = self.symbols.pop(i_maximum)
            symbol2 = self.symbols.pop(j_maximum)
        else:
            symbol1 = self.symbols.pop(j_maximum)
            symbol2 = self.symbols.pop(i_maximum)
        # Merge the symbols i and j
        messages = []
        messages.extend(symbol1.getMessages())
        messages.extend(symbol2.getMessages())
        newSymbol = Symbol(str(uuid.uuid4()), symbol1.getName(), self.project)
        newSymbol.setMinEqu(self.minEquivalence)
        for message in messages:
            newSymbol.addMessage(message)
        # Append the new symbol to the "symbols" structure
        self.symbols.append(newSymbol)
        return newSymbol.getID()

    def executeOrphanReduction(self):
        """Execute the orphan reduction process by merging symbols
        which are progressively reduced in size.
        @return the resulting list of symbols"""
        leftReductionFactor = 0
        rightReductionFactor = 0
        currentReductionIsLeft = False
        increment = 10
        while leftReductionFactor < 80 and rightReductionFactor < 80:
            # First we retrieve the current orphans
            orphans = []
            tmp_symbols = []
            # extract orphans
            for i, symbol in zip(range(len(self.symbols)), self.symbols):
                if len(symbol.getMessages()) == 1:
                    orphans.append(symbol)
            # create a tmp symbols array where symbols will be added once computed
            for symbol in self.symbols:
                if len(symbol.getMessages()) > 1:
                    tmp_symbols.append(symbol)
            if len(orphans) <= 1:
                self.log.info("Number of orphan symbols: {0}. The orphan merging op. is finished!".format(len(orphans)))
                break
            self.symbols = orphans
            if currentReductionIsLeft:
                leftReductionFactor = leftReductionFactor + increment
                # Reduce the size of the messages by 50% from the left
                for orphan in self.symbols:
                    orphan.getMessages()[0].setLeftReductionFactor(leftReductionFactor)
                    orphan.getMessages()[0].setRightReductionFactor(0)
                self.log.info("Start to merge orphans reduced by {0}% from the left".format(str(leftReductionFactor)))
                self.executeClustering()
                currentReductionIsLeft = False
            # NOTE(review): deliberately a plain 'if' (not 'else'): after a left
            # reduction the right reduction also runs in the same iteration —
            # confirm this alternation is intended before changing it.
            if not currentReductionIsLeft:
                rightReductionFactor = rightReductionFactor + increment
                # Reduce the size of the messages from the right
                for orphan in self.symbols:
                    orphan.getMessages()[0].setRightReductionFactor(rightReductionFactor)
                    orphan.getMessages()[0].setLeftReductionFactor(0)
                self.log.info("Start to merge orphans reduced by {0}% from the right".format(str(rightReductionFactor)))
                self.executeClustering()
                currentReductionIsLeft = True
            # Reset the reduction factors and keep the computed symbols.
            for orphan in self.symbols:
                for message in orphan.getMessages():
                    message.setLeftReductionFactor(0)
                    message.setRightReductionFactor(0)
                tmp_symbols.append(orphan)
            self.symbols = tmp_symbols
        # FIX: this status literal was previously split across a physical line.
        self.cb_executionStatus(3, 50.0, "Executing last alignment...")
        alignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        # Compute the regex/alignment of each symbol
        for symbol in self.symbols:
            alignment.alignField(symbol.getField())
        return self.symbols

    def getScores(self):
        """@return: the dictionnary of scores"""
        return self.scores

    def stop(self):
        """Stop the current execution of any clustering operation"""
        self.flagStop = True
        if self.currentAlignment is not None:
            self.currentAlignment.stop()

    def isFinish(self):
        """Compute if we should finish the current clustering operation"""
        return self.flagStop
def executeOrphanReduction(self):
    """Execute the orphan reduction process by merging symbols which are progressively reduced in size."""
    # Reduction factors (percent) applied to each side of the orphans' messages.
    leftReductionFactor = 0
    rightReductionFactor = 0
    currentReductionIsLeft = False
    increment = 10
    while leftReductionFactor < 80 and rightReductionFactor < 80:
        # First we retrieve the current orphans
        orphans = []
        tmp_symbols = []
        # extract orphans (symbols holding a single message)
        for i, symbol in zip(range(len(self.symbols)), self.symbols):
            if len(symbol.getMessages()) == 1:
                orphans.append(symbol)
        # create a tmp symbols array where symbols will be added once computed
        for symbol in self.symbols:
            if len(symbol.getMessages()) > 1:
                tmp_symbols.append(symbol)
        if len(orphans) <= 1:
            # Nothing left to merge: at most one orphan remains.
            self.log.info("Number of orphan symbols: {0}. The orphan merging op. is finished!".format(len(orphans)))
            break
        self.symbols = orphans
        if currentReductionIsLeft:
            leftReductionFactor = leftReductionFactor + increment
            # Reduce the size of the messages by 50% from the left
            for orphan in self.symbols:
                orphan.getMessages()[0].setLeftReductionFactor(leftReductionFactor)
                orphan.getMessages()[0].setRightReductionFactor(0)
            self.log.info("Start to merge orphans reduced by {0}% from the left".format(str(leftReductionFactor)))
            self.executeClustering()
            currentReductionIsLeft = False
        # NOTE(review): plain 'if', not 'else' — after a left reduction the
        # right reduction also runs in the same loop iteration; presumably the
        # intended alternation, confirm before restructuring.
        if not currentReductionIsLeft:
            rightReductionFactor = rightReductionFactor + increment
            # Reduce the size of the messages from the right
            for orphan in self.symbols:
                orphan.getMessages()[0].setRightReductionFactor(rightReductionFactor)
                orphan.getMessages()[0].setLeftReductionFactor(0)
            self.log.info("Start to merge orphans reduced by {0}% from the right".format(str(rightReductionFactor)))
            self.executeClustering()
            currentReductionIsLeft = True
        # Clear the reduction factors and carry every symbol into the next round.
        for orphan in self.symbols:
            for message in orphan.getMessages():
                message.setLeftReductionFactor(0)
                message.setRightReductionFactor(0)
            tmp_symbols.append(orphan)
        self.symbols = tmp_symbols
    self.cb_executionStatus(3, 50.0, "Executing last alignment...")
    alignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
    # Compute the regex/alignment of each symbol
    for symbol in self.symbols:
        alignment.alignField(symbol.getField())
    return self.symbols
class UPGMA(object):
    """This class provides the required methods to compute clustering
    between multiple symbols/messages using UPGMA algorithms (see
    U{http://en.wikipedia.org/wiki/UPGMA}). When processing, the matrix of
    scores is computed by the C extensions (L{_libScoreComputation}) and
    used to regroup messages and symbols into equivalent cluster."""

    def __init__(self, project, symbols, unitSize, cb_status=None, scores=None):
        """Constructor.
        @param project: the current project (provides the clustering configuration)
        @param symbols: the symbols whose messages will be clustered
        @param unitSize: the unit size used by the alignment engine
        @param cb_status: optional callback fed with (stage, donePercent, message)
        @param scores: optional pre-computed score matrix (uid -> uid -> score)"""
        self.project = project
        self.unitSize = unitSize
        self.cb_status = cb_status
        # FIX: the previous signature used 'scores={}', a mutable default
        # argument shared across every instance; None is used as sentinel.
        self.scores = scores if scores is not None else {}
        # Then we retrieve all the parameters of the CLUSTERING / ALIGNMENT
        self.defaultFormat = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_GLOBAL_FORMAT)
        self.nbIteration = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_NB_ITERATION)
        self.minEquivalence = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_EQUIVALENCE_THRESHOLD)
        self.doInternalSlick = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_DO_INTERNAL_SLICK)
        self.log = logging.getLogger('netzob.Inference.Vocabulary.UPGMA.py')
        self.path = []
        self.flagStop = False
        self.currentAlignment = None
        # Create a symbol for each message: clustering starts from singletons.
        self.symbols = []
        i_symbol = 1
        for symbol in symbols:
            for m in symbol.getMessages():
                tmpSymbol = Symbol(str(uuid.uuid4()), "Symbol " + str(i_symbol), project)
                tmpSymbol.addMessage(m)
                self.symbols.append(tmpSymbol)
                i_symbol += 1
        self.log.debug("A number of {0} already aligned symbols will be clustered.".format(str(len(symbols))))

    def cb_executionStatus(self, stage, donePercent, currentMessage):
        """Callback function called by the C extension to provide info on status
        @param donePercent: a float between 0 and 100 included
        @param currentMessage: a str which represents the current alignment status"""
        if self.cb_status is None:
            self.log.info("[UPGMA status]" + str(donePercent) + "% " + currentMessage)
        else:
            self.cb_status(stage, donePercent, currentMessage)

    def executeClustering(self):
        """Execute the clustering operation
        @return the new list of symbols, or None if the operation was stopped"""
        self.log.debug("Re-Organize the symbols (nbIteration={0}, min_equivalence={1})".format(self.nbIteration, self.minEquivalence))
        # Process the UPGMA on symbols
        if self.isFinish():
            return None
        self.cb_executionStatus(0, 0, "Clustering into symbols...")
        self.processUPGMA()
        self.cb_executionStatus(1, 100, None)
        # Retrieve the alignment of each symbol and the build the associated regular expression
        self.cb_executionStatus(2, 0, "Compute the definition for each cluster...")
        if self.isFinish():
            return None
        self.currentAlignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        self.currentAlignment.absoluteStage = 2
        self.currentAlignment.statusRatio = len(self.symbols)
        self.currentAlignment.statusRatioOffset = 0
        for symbol in self.symbols:
            if self.isFinish():
                return None
            self.currentAlignment.alignField(symbol.getField())
            self.currentAlignment.statusRatioOffset = self.currentAlignment.statusRatioOffset + 1
        return self.symbols

    def processUPGMA(self):
        """Computes the matrix of equivalences (in C) and reduce it iteratively."""
        self.log.debug("Computing the associated matrix")
        # Execute the Clustering part in C
        debug = False
        wrapper = WrapperArgsFactory("_libScoreComputation.computeSimilarityMatrix")
        wrapper.typeList[wrapper.function](self.symbols)
        (listScores) = _libScoreComputation.computeSimilarityMatrix(self.doInternalSlick, self.cb_executionStatus, self.isFinish, debug, wrapper)
        # Retrieve the scores for each association of symbols
        self.scores = {}
        for (iuid, juid, score) in listScores:
            if self.isFinish():
                # Legacy early-exit value kept as-is (callers ignore the return).
                return (None, None, None)
            if iuid not in self.scores.keys():
                self.scores[iuid] = {}
            if juid not in self.scores.keys():
                self.scores[juid] = {}
            self.scores[iuid][juid] = score
            if iuid not in self.scores[juid].keys():
                self.scores[juid][iuid] = score
        # Reduce the UPGMA matrix (merge symbols by similarity)
        self.computePhylogenicTree()

    def computePhylogenicTree(self):
        """Compute the phylogenic tree
        @var max_i: uid of i_maximum
        @var max_j: uid of j_maximum
        @var maxScore: the highest global score"""
        maxScore = 0
        status = 0
        step = (float(100) - float(self.minEquivalence)) / float(100)
        if len(self.scores) > 1:
            # Locate the best-scoring pair (max_i, max_j) in the matrix.
            max_i = max(self.scores, key=lambda x: self.scores[x][max(self.scores[x], key=lambda y: self.scores[x][y])])
            max_j = max(self.scores[max_i], key=lambda y: self.scores[max_i][y])
            maxScore = self.scores[max_i][max_j]
        # Merge pairs until only one cluster remains or the best score drops
        # below the configured equivalence threshold.
        while len(self.scores) > 1 and maxScore >= self.minEquivalence:
            if self.isFinish():
                return
            symbols_uid = [s.getID() for s in self.symbols]  # List of the UID of symbols
            (i_maximum, j_maximum) = (symbols_uid.index(max_i), symbols_uid.index(max_j))
            size_i = len(self.symbols[i_maximum].getMessages())
            size_j = len(self.symbols[j_maximum].getMessages())
            infoMessage = "Clustering {0} with {1} (score = {2})".format(str(i_maximum), str(j_maximum), str(maxScore))
            status = (float(100) - float(maxScore)) / float(step)
            self.cb_executionStatus(1, status, infoMessage)
            newuid = self.mergeEffectiveRowCol(i_maximum, j_maximum)
            self.updateScore(max_i, max_j, newuid, size_i, size_j)
            # self.log.debug("Score après: {0}".format(str(self.scores)))
            if len(self.scores) > 1:
                max_i = max(self.scores, key=lambda x: self.scores[x][max(self.scores[x], key=lambda y: self.scores[x][y])])
                max_j = max(self.scores[max_i], key=lambda y: self.scores[max_i][y])
                maxScore = self.scores[max_i][max_j]

    def updateScore(self, iuid, juid, newuid, size_i, size_j):
        """Update the score of two merged clusters.
        @param iuid: id of the first cluster merged
        @param juid: id of the second cluster merged
        @param newuid: new id of the merged cluster
        @param size_i: size of the first cluster
        @param size_j: size of the second cluster"""
        total_size = size_i + size_j
        del self.scores[iuid]
        del self.scores[juid]
        self.scores[newuid] = {}
        for k in self.scores.keys():
            if k != newuid:
                # UPGMA weighted average of the two merged clusters' scores.
                self.scores[k][newuid] = (size_i * self.scores[k][iuid] + size_j * self.scores[k][juid]) * 1.0 / total_size
                del self.scores[k][iuid]
                del self.scores[k][juid]
                self.scores[newuid][k] = self.scores[k][newuid]

    def computePathTree(self):
        """TODO ?"""
        if self.path == []:
            # NOTE(review): clusterIndex is computed but unused (the first key
            # is taken instead); kept as-is to preserve the random module's
            # state sequence. Py2-only: dict.keys()[0] is not indexable in Py3.
            clusterIndex = int(random.random() * len(self.scores.keys()))
            self.path.append(self.scores.keys()[0])
        if len(self.path) > 1:
            # Check if Cl-1,Cl-2 minimum pair
            lastId = self.path[len(self.path) - 1]
            if max(self.scores[lastId], key=lambda x: self.scores[lastId][x]) == self.path[len(self.path) - 2]:
                return
        while True:
            lastId = self.path[len(self.path) - 1]
            juid = max(self.scores[lastId], key=lambda x: self.scores[lastId][x])
            self.path.append(juid)
            if max(self.scores[juid], key=lambda x: self.scores[juid][x]) == lastId:
                break

    def mergeEffectiveRowCol(self, i_maximum, j_maximum):
        """Merge the symbols i and j in the "symbols" structure
        @param i_maximum: id of the first symbol to merge
        @param j_maximum: id of the second symbol to merge
        @return the newly created symbol result of the merged process"""
        # Extract symbols i and j (pop the larger index first so the smaller
        # index stays valid)
        if i_maximum > j_maximum:
            symbol1 = self.symbols.pop(i_maximum)
            symbol2 = self.symbols.pop(j_maximum)
        else:
            symbol1 = self.symbols.pop(j_maximum)
            symbol2 = self.symbols.pop(i_maximum)
        # Merge the symbols i and j
        messages = []
        messages.extend(symbol1.getMessages())
        messages.extend(symbol2.getMessages())
        newSymbol = Symbol(str(uuid.uuid4()), symbol1.getName(), self.project)
        newSymbol.setMinEqu(self.minEquivalence)
        for message in messages:
            newSymbol.addMessage(message)
        # Append the new symbol to the "symbols" structure
        self.symbols.append(newSymbol)
        return newSymbol.getID()

    def executeOrphanReduction(self):
        """Execute the orphan reduction process by merging symbols
        which are progressively reduced in size.
        @return the resulting list of symbols"""
        leftReductionFactor = 0
        rightReductionFactor = 0
        currentReductionIsLeft = False
        increment = 10
        while leftReductionFactor < 80 and rightReductionFactor < 80:
            # First we retrieve the current orphans
            orphans = []
            tmp_symbols = []
            # extract orphans
            for i, symbol in zip(range(len(self.symbols)), self.symbols):
                if len(symbol.getMessages()) == 1:
                    orphans.append(symbol)
            # create a tmp symbols array where symbols will be added once computed
            for symbol in self.symbols:
                if len(symbol.getMessages()) > 1:
                    tmp_symbols.append(symbol)
            if len(orphans) <= 1:
                self.log.info("Number of orphan symbols: {0}. The orphan merging op. is finished!".format(len(orphans)))
                break
            self.symbols = orphans
            if currentReductionIsLeft:
                leftReductionFactor = leftReductionFactor + increment
                # Reduce the size of the messages by 50% from the left
                for orphan in self.symbols:
                    orphan.getMessages()[0].setLeftReductionFactor(leftReductionFactor)
                    orphan.getMessages()[0].setRightReductionFactor(0)
                self.log.info("Start to merge orphans reduced by {0}% from the left".format(str(leftReductionFactor)))
                self.executeClustering()
                currentReductionIsLeft = False
            # NOTE(review): deliberately a plain 'if' (not 'else'): after a left
            # reduction the right reduction also runs in the same iteration —
            # confirm this alternation is intended before changing it.
            if not currentReductionIsLeft:
                rightReductionFactor = rightReductionFactor + increment
                # Reduce the size of the messages from the right
                for orphan in self.symbols:
                    orphan.getMessages()[0].setRightReductionFactor(rightReductionFactor)
                    orphan.getMessages()[0].setLeftReductionFactor(0)
                self.log.info("Start to merge orphans reduced by {0}% from the right".format(str(rightReductionFactor)))
                self.executeClustering()
                currentReductionIsLeft = True
            # Reset the reduction factors and keep the computed symbols.
            for orphan in self.symbols:
                for message in orphan.getMessages():
                    message.setLeftReductionFactor(0)
                    message.setRightReductionFactor(0)
                tmp_symbols.append(orphan)
            self.symbols = tmp_symbols
        # FIX: this status literal was previously split across a physical line.
        self.cb_executionStatus(3, 50.0, "Executing last alignment...")
        alignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        # Compute the regex/alignment of each symbol
        for symbol in self.symbols:
            alignment.alignField(symbol.getField())
        return self.symbols

    def getScores(self):
        """@return: the dictionnary of scores"""
        return self.scores

    def stop(self):
        """Stop the current execution of any clustering operation"""
        self.flagStop = True
        if self.currentAlignment is not None:
            self.currentAlignment.stop()

    def isFinish(self):
        """Compute if we should finish the current clustering operation"""
        return self.flagStop