Example No. 1
    def test_alignmentOfAlmostEquivalentMessages(self):
        alignmentSolution = NeedlemanAndWunsch(8)
        nbTest = 1000
        for i_test in range(0, nbTest):
            common_pattern_before = self.generateRandomString(30, 40)
            common_pattern_after = self.generateRandomString(30, 40)
            # Generate the content of two messages
            data1 = TypeConvertor.stringToNetzobRaw(common_pattern_before +
                                                    "hercule" +
                                                    common_pattern_after)
            data2 = TypeConvertor.stringToNetzobRaw(common_pattern_before +
                                                    "thomas" +
                                                    common_pattern_after)
            # Create the messages
            message1 = RawMessage(str(uuid.uuid4()), str(time.time()), data1)
            message2 = RawMessage(str(uuid.uuid4()), str(time.time()), data2)

            (scores, alignment) = alignmentSolution.alignTwoMessages(
                False, message1, message2)
            (score1, score2, score3) = scores
            (scoresBis, alignment2) = alignmentSolution.alignTwoMessages(
                True, message1, message2)
            (scoreBis1, scoreBis2, scoreBis3) = scoresBis

            self.assertGreater(scoreBis1, score1)
            self.assertGreater(scoreBis1, 95)
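
For context, the scores compared above come from a Needleman-Wunsch global alignment. The following is a minimal, self-contained textbook sketch of that dynamic program (pure Python, independent of Netzob's C extension, with illustrative match/mismatch/gap costs); it shows why two messages that differ only in a short middle token still score high:

def needleman_wunsch_score(a, b, match=10, mismatch=-5, gap=-2):
    """Similarity percentage between two strings via a textbook
    Needleman-Wunsch global alignment (illustrative costs only)."""
    n, m = len(a), len(b)
    # score[i][j] = best score for aligning a[:i] with b[:j]
    score = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        score[i][0] = i * gap
    for j in range(1, m + 1):
        score[0][j] = j * gap
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            diag = score[i - 1][j - 1] + (match if a[i - 1] == b[j - 1] else mismatch)
            score[i][j] = max(diag, score[i - 1][j] + gap, score[i][j - 1] + gap)
    best = max(n, m) * match  # score of a perfect alignment
    return 100.0 * max(score[n][m], 0) / best

print(needleman_wunsch_score("AAAAAAAA", "AAAAAAAA"))               # 100.0, as in the equivalent-messages test
print(needleman_wunsch_score("AAAAherculeBBBB", "AAAAthomasBBBB"))  # high, thanks to the shared prefix/suffix
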
Example No. 2
    def test_alignmentOfEquivalentMessages(self):
        alignmentSolution = NeedlemanAndWunsch(8)
        nbTest = 1000
        for i_test in range(0, nbTest):
            common_pattern = self.generateRandomString(30, 40)
            # Generate the content of two messages
            data1 = TypeConvertor.stringToNetzobRaw(common_pattern)
            data2 = TypeConvertor.stringToNetzobRaw(common_pattern)
            # Create the messages
            message1 = RawMessage(str(uuid.uuid4()), str(time.time()), data1)
            message2 = RawMessage(str(uuid.uuid4()), str(time.time()), data2)

            (scores, alignment) = alignmentSolution.alignTwoMessages(
                False, message1, message2)
            (score1, score2, score3) = scores
            self.assertEqual(score1, 100.0)
            self.assertEqual(score2, 100.0)
            self.assertEqual(score3, 100.0)

            (scores, alignment) = alignmentSolution.alignTwoMessages(
                True, message1, message2)
            (score1, score2, score3) = scores
            self.assertEqual(score1, 100.0)
            self.assertEqual(score2, 100.0)
            self.assertEqual(score3, 100.0)
Example No. 3
    def executeOrphanReduction(self):
        leftReductionFactor = 0
        rightReductionFactor = 0
        currentReductionIsLeft = False
        increment = 10

        while leftReductionFactor < 80 and rightReductionFactor < 80:

            # First we retrieve the current orphans
            orphans = []
            tmp_symbols = []
            # extract orphans
            for symbol in self.symbols:
                if len(symbol.getMessages()) == 1:
                    orphans.append(symbol)
            # create a tmp symbols array where symbols will be added once computed
            for symbol in self.symbols:
                if len(symbol.getMessages()) > 1:
                    tmp_symbols.append(symbol)

            if len(orphans) <= 1:
                self.log.info("Number of orphan symbols : {0}. The orphan merging op. is finished !".format(len(orphans)))
                break

            self.symbols = orphans
            if currentReductionIsLeft:
                leftReductionFactor = leftReductionFactor + increment
                # Reduce the size of the messages from the left by the current reduction factor
                for orphan in self.symbols:
                    orphan.getMessages()[0].setLeftReductionFactor(leftReductionFactor)
                    orphan.getMessages()[0].setRightReductionFactor(0)

                self.log.info("Start to merge orphans reduced by {0}% from the left".format(str(leftReductionFactor)))
                self.executeClustering()
                currentReductionIsLeft = False

            if not currentReductionIsLeft:
                rightReductionFactor = rightReductionFactor + increment
                # Reduce the size of the messages from the right
                for orphan in self.symbols:
                    orphan.getMessages()[0].setRightReductionFactor(rightReductionFactor)
                    orphan.getMessages()[0].setLeftReductionFactor(0)

                self.log.info("Start to merge orphans reduced by {0}% from the right".format(str(rightReductionFactor)))
                self.executeClustering()
                currentReductionIsLeft = True

            for orphan in self.symbols:
                for message in orphan.getMessages():
                    message.setLeftReductionFactor(0)
                    message.setRightReductionFactor(0)
                tmp_symbols.append(orphan)
            self.symbols = tmp_symbols

        self.cb_executionStatus(50.0, "Executing last alignment...")
        alignment = NeedlemanAndWunsch(self.unitSize, self.cb_status)
        # Compute the regex/alignment of each symbol
        for symbol in self.symbols:
            alignment.alignSymbol(symbol, self.doInternalSlick, self.defaultFormat)
        return self.symbols
Example No. 4
    def test_semanticAlignment_bug1(self):
        """test_semanticAlignment_bug1:
        A bug in the semantic alignment was identified which prevented
        the computation of a valid regex. This test verifies the bug does not come back.
        @date 18/04/2013
        """

        firstname1 = "antoine"
        email1 = "*****@*****.**"

        firstname2 = "luc"
        email2 = "*****@*****.**"

        msg1 = RawMessage(uuid.uuid4(), None, TypeConvertor.stringToNetzobRaw("6" + firstname1 + "GAHFSHQS" + email1))
        msg2 = RawMessage(uuid.uuid4(), None, TypeConvertor.stringToNetzobRaw("3" + firstname2 + "CVSDHISD" + email2))

        project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
        nwEngine = NeedlemanAndWunsch(8, project, False, None)
        symbol = Symbol(uuid.uuid4(), "Test", project)

        symbol.addMessages([msg1, msg2])
        msg1.addSemanticTag("firstname", 2, 2 + len(firstname1) * 2)
        msg1.addSemanticTag("email", 2 + len(firstname1) * 2 + 16, 2 + len(firstname1) * 2 + 16 + len(email1) * 2)

        msg2.addSemanticTag("firstname", 2, 2 + len(firstname2) * 2)
        msg2.addSemanticTag("email", 2 + len(firstname2) * 2 + 16, 2 + len(firstname2) * 2 + 16 + len(email2) * 2)

        nwEngine.alignField(symbol.getField())
        symbol.getField().setFormat(Format.STRING)

        print("Computed Regex : {0}".format(symbol.getRegex()))
        print(symbol.getCells(True))

        computedFields = symbol.getExtendedFields()
        self.assertTrue(len(computedFields) > 1, "Only one field has been computed which tells us something went wrong.")
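
The semantic tag offsets above are doubled because, in these tests, TypeConvertor.stringToNetzobRaw produces a representation in which every ASCII character occupies two characters (a hex-like encoding). Under that assumption, the offset arithmetic can be sketched as follows (hex_tag_bounds is a hypothetical helper, not part of Netzob):

# Assumption drawn from the test above: each ASCII character of the original
# string occupies 2 characters in the Netzob raw form.
def hex_tag_bounds(prefix_len_chars, value):
    """Return (start, end) of 'value' inside the encoded message, given the
    number of ASCII characters that precede it."""
    start = prefix_len_chars * 2
    end = start + len(value) * 2
    return (start, end)

firstname1 = "antoine"
# "6" + firstname1 + "GAHFSHQS" + email1: the firstname starts after 1 character,
# the email after 1 + len(firstname1) + 8 characters.
print(hex_tag_bounds(1, firstname1))                      # (2, 16), as in addSemanticTag above
print(hex_tag_bounds(1 + len(firstname1) + 8, "a@b.cd"))  # (32, 44) for a 6-character email
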
Example No. 5
    def executeClustering(self):
        self.log.debug("Re-Organize the symbols (nbIteration={0}, min_equivalence={1})".format(self.nbIteration, self.minEquivalence))

        # Find equal messages. Useful for redundant protocols before doing heavy computations with Needleman-Wunsch (complexity O(N²), where N is the number of symbols)
        ll = len(self.symbols) - 1
        i_equ = 0
        while(ll > 0):
            currentMess = self.symbols[i_equ].getMessages()[0].getReducedStringData()
            for j in range(ll):
                if(currentMess == self.symbols[i_equ + j + 1].getMessages()[0].getReducedStringData()):
                    self.mergeEffectiveRowCol(i_equ, i_equ + j + 1)
                    self.log.debug("Merge the equal column/line {0} with the column/line {1}".format(str(i_equ), str(j + 1)))
                    i_equ -= 1
                    break
            ll -= 1
            i_equ += 1

        # Process the UPGMA on symbols
        self.processUPGMA()

        # Retrieve the alignment of each symbol and then build the associated regular expression
        self.cb_executionStatus(50.0, "Executing last alignment...")
        alignment = NeedlemanAndWunsch(self.unitSize, self.cb_status)
        for symbol in self.symbols:
            alignment.alignSymbol(symbol, self.doInternalSlick, self.defaultFormat)

        return self.symbols
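
The index-juggling loop above merges symbols whose single messages carry exactly the same reduced data before the quadratic UPGMA step runs. The same pre-grouping can be expressed more directly with a dictionary keyed by the reduced content; this is only a standalone sketch of the idea, not the Netzob API:

def group_equal_messages(items, get_data):
    """Group items that carry exactly the same comparable content."""
    groups = {}
    for item in items:
        groups.setdefault(get_data(item), []).append(item)
    # Each bucket can be merged directly, without paying the O(N^2)
    # Needleman-Wunsch comparison for its members.
    return list(groups.values())

print(group_equal_messages(["abc", "abc", "xyz"], get_data=lambda s: s))
# [['abc', 'abc'], ['xyz']]
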
Example No. 6
    def executeClustering(self):
        """Execute the clustering operation
        @return the new list of symbols"""
        self.log.debug("Re-Organize the symbols (nbIteration={0}, min_equivalence={1})".format(self.nbIteration, self.minEquivalence))
        # Process the UPGMA on symbols

        if self.isFinish():
            return None

        self.cb_executionStatus(0, 0, "Clustering into symbols...")
        self.processUPGMA()
        self.cb_executionStatus(1, 100, None)
        # Retrieve the alignment of each symbol and then build the associated regular expression
        self.cb_executionStatus(2, 0, "Compute the definition for each cluster...")

        if self.isFinish():
            return None

        self.currentAlignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        self.currentAlignment.absoluteStage = 2
        self.currentAlignment.statusRatio = len(self.symbols)
        self.currentAlignment.statusRatioOffset = 0

        for symbol in self.symbols:

            if self.isFinish():
                return None

            self.currentAlignment.alignField(symbol.getField())
            self.currentAlignment.statusRatioOffset = self.currentAlignment.statusRatioOffset + 1

        return self.symbols
Example No. 7
    def test_deserialisationMessages(self):
        nbTest = 10
        alignmentSolution = NeedlemanAndWunsch(8)

        for iTest in range(0, nbTest):
            messages = []
            # Generate a random number of messages to serialize
            nbMessage = random.randint(2, 500)
            for iMessage in range(0, nbMessage):
                data = TypeConvertor.stringToNetzobRaw(self.generateRandomString(5, 500))
                message = RawMessage(uuid.uuid4(), str(time.time()), data)
                messages.append(message)

            nbDeserializedTest = alignmentSolution.deserializeMessages(messages)
            self.assertEqual(nbMessage, nbDeserializedTest)
Example No. 8
    def executeClustering(self):
        """Execute the clustering operation
        @return the new list of symbols"""
        self.log.debug("Re-Organize the symbols (nbIteration={0}, min_equivalence={1})".format(self.nbIteration, self.minEquivalence))
        # Process the UPGMA on symbols

        if self.isFinish():
            return None

        self.cb_executionStatus(0, 0, "Clustering into symbols...")
        self.processUPGMA()
        self.cb_executionStatus(1, 100, None)
        # Retrieve the alignment of each symbol and then build the associated regular expression
        self.cb_executionStatus(2, 0, "Compute the definition for each cluster...")

        if self.isFinish():
            return None

        self.currentAlignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        self.currentAlignment.absoluteStage = 2
        self.currentAlignment.statusRatio = len(self.symbols)
        self.currentAlignment.statusRatioOffset = 0

        for symbol in self.symbols:

            if self.isFinish():
                return None

            self.currentAlignment.alignField(symbol.getField())
            self.currentAlignment.statusRatioOffset = self.currentAlignment.statusRatioOffset + 1

        return self.symbols
Example No. 9
    def sequence_execute_clicked_cb(self, widget):
        """Callback executed when the user request to start
        the alignment process"""

        self._view.sequence_cancel.set_sensitive(False)
        self._view.sequence_execute.set_sensitive(False)
        self._view.sequence_scale.set_sensitive(False)
        self._view.sequence_spinbutton.set_sensitive(False)
        self._view.radiobutton4bit.set_sensitive(False)
        self._view.radiobutton8bit.set_sensitive(False)
        self._view.orphanButton.set_sensitive(False)
        self._view.smoothButton.set_sensitive(False)

        # retrieves the alignment parameters
        similarityPercent = self._view.sequence_adjustment.get_value()
        if self._view.radiobutton8bit.get_mode():
            unitSize = 8
        else:
            unitSize = 4
        orphan = self._view.orphanButton.get_active()
        smooth = self._view.smoothButton.get_active()

        self.vocabularyController.getCurrentProject().getConfiguration().setVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_EQUIVALENCE_THRESHOLD, int(similarityPercent))
        self.vocabularyController.getCurrentProject().getConfiguration().setVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_ORPHAN_REDUCTION, orphan)
        self.vocabularyController.getCurrentProject().getConfiguration().setVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_DO_INTERNAL_SLICK, smooth)

        # Configure Needleman and Wunsch
        self.alignmentSolution = NeedlemanAndWunsch(unitSize, self.vocabularyController.getCurrentProject(), self.doUpgma, self.percentOfAlignmentProgessBar)

        # Define the alignment JOB
        self._view.sequence_stop.set_sensitive(True)
        Job(self.startSequenceAlignment(unitSize))
Example No. 10
    def test_deserialisationMessages(self):
        nbTest = 10
        alignmentSolution = NeedlemanAndWunsch(8)

        for iTest in range(0, nbTest):
            messages = []
            # Generate a random number of messages to serialize
            nbMessage = random.randint(2, 500)
            for iMessage in range(0, nbMessage):
                data = TypeConvertor.stringToNetzobRaw(
                    self.generateRandomString(5, 500))
                message = RawMessage(str(uuid.uuid4()), str(time.time()), data)
                messages.append(message)

            nbDeserializedTest = alignmentSolution.deserializeMessages(
                messages)
            self.assertEqual(nbMessage, nbDeserializedTest)
Example No. 11
    def test_randomAlignmentsWithTwoCenteredMessages(self):
        workspace = self.getWorkspace()
        currentProject = workspace.getProjects()[0]

        doInternalSlick = currentProject.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_DO_INTERNAL_SLICK)
        defaultFormat = currentProject.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_GLOBAL_FORMAT)
        defaultUnitSize = 8

        # We generate 1000 random couples of data and try to align them
        # Objectives: just test if it executes
        nb_data = 1000
        nb_failed = 0
        nb_success = 0
        for i_test in range(0, nb_data):

            common_pattern = self.generateRandomString(30, 40)
            # Generate the content of two messages
            data1 = TypeConvertor.stringToNetzobRaw(self.generateRandomString(5, 100) + common_pattern + self.generateRandomString(5, 100))
            data2 = TypeConvertor.stringToNetzobRaw(self.generateRandomString(5, 100) + common_pattern + self.generateRandomString(5, 100))
            # Create the messages
            message1 = RawMessage(str(uuid.uuid4()), str(time.time()), data1)
            message2 = RawMessage(str(uuid.uuid4()), str(time.time()), data2)
            # Create the symbol
            symbol = Symbol(str(uuid.uuid4()), "test_randomAlignments#" + str(i_test), currentProject)
            symbol.addMessage(message1)
            symbol.addMessage(message2)
            field = symbol.getField()

            # Starts the alignment process
            alignmentProcess = NeedlemanAndWunsch(defaultUnitSize, currentProject, False, self.emptyAlignmentCB)
            alignmentProcess.alignField(field)

            if not TypeConvertor.stringToNetzobRaw(common_pattern[:]) in field.getAlignment():
                if self.debug is True:
                    print "Message 1: " + str(data1)
                    print "Message 2: " + str(data2)
                    print "Common pattern: " + TypeConvertor.stringToNetzobRaw(common_pattern)
                    print "Alignment: " + field.getAlignment()
                nb_failed += 1
            else:
                nb_success += 1
        if nb_failed > 0:
            print "A number of " + str(nb_failed) + "/" + str(nb_data) + " alignment failed !"
        self.assertEqual(0, nb_failed)
        self.assertEqual(nb_success, nb_data)
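
The assertion above only checks that the raw form of the common pattern appears somewhere in the computed alignment. As a standalone illustration of the shared block the test plants in both messages, a longest-common-substring search over the two data strings would recover it (an independent sketch, not how NeedlemanAndWunsch itself works):

def longest_common_substring(a, b):
    """Dynamic-programming longest common substring."""
    best, best_end = 0, 0
    prev = [0] * (len(b) + 1)
    for i in range(1, len(a) + 1):
        cur = [0] * (len(b) + 1)
        for j in range(1, len(b) + 1):
            if a[i - 1] == b[j - 1]:
                cur[j] = prev[j - 1] + 1
                if cur[j] > best:
                    best, best_end = cur[j], i
        prev = cur
    return a[best_end - best:best_end]

print(longest_common_substring("xxTOKENyyy", "zzzzTOKENw"))  # TOKEN
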
Example No. 12
    def test_alignmentOfAlmostEquivalentMessages(self):
        alignmentSolution = NeedlemanAndWunsch(8)
        nbTest = 1000
        for i_test in range(0, nbTest):
            common_pattern_before = self.generateRandomString(30, 40)
            common_pattern_after = self.generateRandomString(30, 40)
            # Generate the content of two messages
            data1 = TypeConvertor.stringToNetzobRaw(common_pattern_before + "hercule" + common_pattern_after)
            data2 = TypeConvertor.stringToNetzobRaw(common_pattern_before + "thomas" + common_pattern_after)
            # Create the messages
            message1 = RawMessage(uuid.uuid4(), str(time.time()), data1)
            message2 = RawMessage(uuid.uuid4(), str(time.time()), data2)

            (scores, alignment) = alignmentSolution.alignTwoMessages(False, message1, message2)
            (score1, score2, score3) = scores
            (scoresBis, alignment2) = alignmentSolution.alignTwoMessages(True, message1, message2)
            (scoreBis1, scoreBis2, scoreBis3) = scoresBis

            self.assertGreater(scoreBis1, score1)
            self.assertGreater(scoreBis1, 95)
Example No. 13
    def test_AlignementOfMessages(self):
        alignmentSolution = NeedlemanAndWunsch(4)
        nbTest = 100

        for iTest in range(0, nbTest):
            messages = []
            # Generate a random number of messages to serialize
            nbMessage = random.randint(2, 50)
            for iMessage in range(0, nbMessage):
                data = TypeConvertor.stringToNetzobRaw("bonjour" + self.generateRandomString(5, 30) + ", tout va bien ?")
                message = RawMessage(uuid.uuid4(), str(time.time()), data)
                messages.append(message)

            (alignment, scores) = alignmentSolution.align(False, messages)
            (score1, score2, score3) = scores
            (alignmentBis, scoresBis) = alignmentSolution.align(True, messages)
            (scoreBis1, scoreBis2, scoreBis3) = scoresBis
            print(alignment)
            print(alignmentBis)

            self.assertGreaterEqual(scoreBis1, score1)
            self.assertGreaterEqual(scoreBis1, 90)
Example No. 14
    def clusterByTokenization(self, symbols):
        self.ServerInference()
    ################################### Cluster messages according to their tokens
        ll = len(self.symbols) - 1
        i_equ = 0
        while(ll > 0):
            currentPattern = self.symbols[i_equ].getMessages()[0].getPattern()[1]
            for j in range(ll):
                jnext = len(self.symbols) - j - 1
                cond = False
                for message in self.symbols[jnext].getMessages():
                    if currentPattern == message.getPattern()[1]:
#                        score = sum([p1 == p2 for p1, p2 in zip(currentPattern, message.getPattern()[1])])
                        score2 = self.computeSimilarities(currentPattern, message.getPattern()[1])
#                        if score >= min(len(currentPattern), len(message.getPattern()[1])):
                        minilength = min(len(message.getData()), len(self.symbols[i_equ].getMessages()[0].getData()))
                        if score2 * 2.0 / minilength >= 0.40:
                            cond = True
                        if cond:
                            break

                if(cond):
                    currentDst = self.symbols[i_equ].getPattern()[0]
                    otherDst = self.symbols[jnext].getPattern()[0]
                    if not self.server or (currentDst == otherDst) or (currentDst != self.server and otherDst != self.server):
                        self.mergeEffectiveRowCol(i_equ, jnext)
#                        self.log.debug("Merge the equal column/line {0} with the column/line {1}".format(str(i_equ), str(j + 1)))
                        i_equ -= 1
                        break
            ll -= 1
            i_equ += 1

    ################################## Align messages
        alignment = NeedlemanAndWunsch(self.unitSize, self.cb_status)
        tmpSymbols = []
        for symbol in self.symbols:

#            alignment.alignSymbols([symbol], self.project)
#            symbol.getFields()[0].setFormat(Format.STRING)
#            tmpSymbols.extend(alignment.getLastResult())
            try:
#                print "l"
                al = self.computeAlignment(symbol)
                symbol.getField().setAlignment(al)
                alignment.buildRegexFromAlignment(symbol, al, self.defaultFormat)

#                for (p, fields) in zip(symbol.getPattern()[1], symbol.getFields()):
#                    field.setFormat(p.getFormat())
            except:
                logging.warn("Partitionnement error: too much fields ( > 100) for the symbol '" + symbol.getName() + "' len=" + str(len(symbol.getExtendedFields())) + "len " + str(len(symbol.getPattern()[1])))
                symbol.getField().removeLocalFields()
                field = Field("Field 0", "(.{, })", symbol)
                symbol.addField(field)
                # Use the default protocol type for representation
                field.setFormat(self.defaultFormat)

        alignment.alignSymbols(self.symbols, self.project)
        self.symbols = alignment.getLastResult()
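
The merge condition above, score2 * 2.0 / minilength >= 0.40, behaves like a Dice-style overlap ratio except that it divides by the shorter message length rather than by the sum of both lengths. Since computeSimilarities is not shown in this excerpt, the shared amount below is only an assumed figure used to illustrate the threshold:

def overlap_ratio(shared, len_a, len_b):
    """Twice the shared amount divided by the shorter length, mirroring the
    40% merge threshold above (illustrative, not the Netzob routine)."""
    return shared * 2.0 / min(len_a, len_b)

# Two messages of 50 and 80 bytes assumed to share the equivalent of 12 bytes of tokens:
print(overlap_ratio(12, 50, 80))          # 0.48
print(overlap_ratio(12, 50, 80) >= 0.40)  # True -> the two symbols would be merged
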
Example No. 15
    def test_alignmentOfEquivalentMessages(self):
        alignmentSolution = NeedlemanAndWunsch(8)
        nbTest = 1000
        for i_test in range(0, nbTest):
            common_pattern = self.generateRandomString(30, 40)
            # Generate the content of two messages
            data1 = TypeConvertor.stringToNetzobRaw(common_pattern)
            data2 = TypeConvertor.stringToNetzobRaw(common_pattern)
            # Create the messages
            message1 = RawMessage(uuid.uuid4(), str(time.time()), data1)
            message2 = RawMessage(uuid.uuid4(), str(time.time()), data2)

            (scores, alignment) = alignmentSolution.alignTwoMessages(False, message1, message2)
            (score1, score2, score3) = scores
            self.assertEqual(score1, 100.0)
            self.assertEqual(score2, 100.0)
            self.assertEqual(score3, 100.0)

            (scores, alignment) = alignmentSolution.alignTwoMessages(True, message1, message2)
            (score1, score2, score3) = scores
            self.assertEqual(score1, 100.0)
            self.assertEqual(score2, 100.0)
            self.assertEqual(score3, 100.0)
Example No. 16
    def test_AlignementOfMessages(self):
        alignmentSolution = NeedlemanAndWunsch(4)
        nbTest = 100

        for iTest in range(0, nbTest):
            messages = []
            # Generate a random number of messages to serialize
            nbMessage = random.randint(2, 50)
            for iMessage in range(0, nbMessage):
                data = TypeConvertor.stringToNetzobRaw(
                    "bonjour" + self.generateRandomString(5, 30) +
                    ", tout va bien ?")
                message = RawMessage(str(uuid.uuid4()), str(time.time()), data)
                messages.append(message)

            (alignment, scores) = alignmentSolution.align(False, messages)
            (score1, score2, score3) = scores
            (alignmentBis, scoresBis) = alignmentSolution.align(True, messages)
            (scoreBis1, scoreBis2, scoreBis3) = scoresBis
            print(alignment)
            print(alignmentBis)

            self.assertGreaterEqual(scoreBis1, score1)
            self.assertGreaterEqual(scoreBis1, 90)
Example No. 17
    def test_semanticAlignment_bug1(self):
        """test_semanticAlignment_bug1:
        A bug in the semantic alignment was identified which prevented
        the computation of a valid regex. This test verifies the bug does not come back.
        @date 18/04/2013
        """

        firstname1 = "antoine"
        email1 = "*****@*****.**"

        firstname2 = "luc"
        email2 = "*****@*****.**"

        msg1 = RawMessage(uuid.uuid4(), None, TypeConvertor.stringToNetzobRaw("6" + firstname1 + "GAHFSHQS" + email1))
        msg2 = RawMessage(uuid.uuid4(), None, TypeConvertor.stringToNetzobRaw("3" + firstname2 + "CVSDHISD" + email2))

        project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
        nwEngine = NeedlemanAndWunsch(8, project, False, None)
        symbol = Symbol(uuid.uuid4(), "Test", project)

        symbol.addMessages([msg1, msg2])
        msg1.addSemanticTag("firstname", 2, 2 + len(firstname1) * 2)
        msg1.addSemanticTag("email", 2 + len(firstname1) * 2 + 16, 2 + len(firstname1) * 2 + 16 + len(email1) * 2)

        msg2.addSemanticTag("firstname", 2, 2 + len(firstname2) * 2)
        msg2.addSemanticTag("email", 2 + len(firstname2) * 2 + 16, 2 + len(firstname2) * 2 + 16 + len(email2) * 2)

        nwEngine.alignField(symbol.getField())
        symbol.getField().setFormat(Format.STRING)

        print("Computed Regex : {0}".format(symbol.getRegex()))
        print("=======")
        print(symbol.getCells(True))

        computedFields = symbol.getExtendedFields()
        self.assertTrue(len(computedFields) > 1, "Only one field has been computed which tells us something went wrong.")
Example No. 18
class UPGMA(object):
    """This class provides the required methods to compute clustering
    between multiple symbols/messages using UPGMA algorithms (see U{http://en.wikipedia.org/wiki/UPGMA}).
    When processing, the matrix of scores is computed by the C extensions (L{_libScoreComputation}
    and used to regroup messages and symbols into equivalent cluster."""

    def __init__(self, project, symbols, unitSize, cb_status=None, scores={}):
        self.project = project
        self.unitSize = unitSize
        self.cb_status = cb_status
        self.scores = scores

        # Then we retrieve all the parameters of the CLUSTERING / ALIGNMENT
        self.defaultFormat = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_GLOBAL_FORMAT)
        self.nbIteration = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_NB_ITERATION)
        self.minEquivalence = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_EQUIVALENCE_THRESHOLD)
        self.doInternalSlick = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_DO_INTERNAL_SLICK)

        self.log = logging.getLogger('netzob.Inference.Vocabulary.UPGMA.py')
        self.path = []
        self.flagStop = False
        self.currentAlignment = None

        # Create a symbol for each message
        self.symbols = []
        i_symbol = 1
        for symbol in symbols:
            for m in symbol.getMessages():
                tmpSymbol = Symbol(str(uuid.uuid4()), "Symbol " + str(i_symbol), project)
                tmpSymbol.addMessage(m)
                self.symbols.append(tmpSymbol)
                i_symbol += 1
        self.log.debug("A number of {0} already aligned symbols will be clustered.".format(str(len(symbols))))

    def cb_executionStatus(self, stage, donePercent, currentMessage):
        """Callback function called by the C extension to provide info on status
        @param donePercent: a float between 0 and 100, inclusive
        @param currentMessage: a str which represents the current alignment status"""
        if self.cb_status is None:
            self.log.info("[UPGMA status]" + str(donePercent) + "% " + currentMessage)
        else:
            self.cb_status(stage, donePercent, currentMessage)

    def executeClustering(self):
        """Execute the clustering operation
        @return the new list of symbols"""
        self.log.debug("Re-Organize the symbols (nbIteration={0}, min_equivalence={1})".format(self.nbIteration, self.minEquivalence))
        # Process the UPGMA on symbols

        if self.isFinish():
            return None

        self.cb_executionStatus(0, 0, "Clustering into symbols...")
        self.processUPGMA()
        self.cb_executionStatus(1, 100, None)
        # Retrieve the alignment of each symbol and then build the associated regular expression
        self.cb_executionStatus(2, 0, "Compute the definition for each cluster...")

        if self.isFinish():
            return None

        self.currentAlignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        self.currentAlignment.absoluteStage = 2
        self.currentAlignment.statusRatio = len(self.symbols)
        self.currentAlignment.statusRatioOffset = 0

        for symbol in self.symbols:

            if self.isFinish():
                return None

            self.currentAlignment.alignField(symbol.getField())
            self.currentAlignment.statusRatioOffset = self.currentAlignment.statusRatioOffset + 1

        return self.symbols

    def processUPGMA(self):
        """Computes the matrix of equivalences (in C) and reduce it
        iteratively."""
        self.log.debug("Computing the associated matrix")

        # Execute the Clustering part in C
        debug = False
        wrapper = WrapperArgsFactory("_libScoreComputation.getHighestEquivalentGroup")
        wrapper.typeList[wrapper.function](self.symbols)
        (i_max, j_max, maxScore, listScores) = _libScoreComputation.getHighestEquivalentGroup(self.doInternalSlick, self.cb_executionStatus, self.isFinish, debug, wrapper)

        # Retrieve the scores for each association of symbols
        self.scores = {}
        for (iuid, juid, score) in listScores:

            if self.isFinish():
                return (None, None, None)

            if iuid not in self.scores.keys():
                self.scores[iuid] = {}
            if juid not in self.scores.keys():
                self.scores[juid] = {}
            self.scores[iuid][juid] = score
            if iuid not in self.scores[juid].keys():
                self.scores[juid][iuid] = score

        # Reduce the UPGMA matrix (merge symbols by similarity)
        self.computePhylogenicTree()
        return (i_max, j_max, maxScore)

    def computePhylogenicTree(self):
        """Compute the phylogenic tree
        @var max_i: uid of i_maximum
        @var max_j: uid of j_maximum
        @var maxScore: the highest global score"""
        maxScore = 0
        status = 0
        step = (float(100) - float(self.minEquivalence)) / float(100)

        if len(self.scores) > 1:
            max_i = max(self.scores, key=lambda x: self.scores[x][max(self.scores[x], key=lambda y: self.scores[x][y])])
            max_j = max(self.scores[max_i], key=lambda y: self.scores[max_i][y])
            maxScore = self.scores[max_i][max_j]
        while len(self.scores) > 1 and maxScore >= self.minEquivalence:

            if self.isFinish():
                return

            symbols_uid = [s.getID() for s in self.symbols]  # List of the UIDs of the symbols
            (i_maximum, j_maximum) = (symbols_uid.index(max_i), symbols_uid.index(max_j))
            size_i = len(self.symbols[i_maximum].getMessages())
            size_j = len(self.symbols[j_maximum].getMessages())

            infoMessage = "Clustering {0} with {1} (score = {2})".format(str(i_maximum), str(j_maximum), str(maxScore))
            status = (float(100) - float(maxScore)) / float(step)
            self.cb_executionStatus(1, status, infoMessage)

            newuid = self.mergeEffectiveRowCol(i_maximum, j_maximum)
            self.updateScore(max_i, max_j, newuid, size_i, size_j)
#            self.log.debug("Score après: {0}".format(str(self.scores)))
            if len(self.scores) > 1:
                max_i = max(self.scores, key=lambda x: self.scores[x][max(self.scores[x], key=lambda y: self.scores[x][y])])
                max_j = max(self.scores[max_i], key=lambda y: self.scores[max_i][y])
                maxScore = self.scores[max_i][max_j]

    def updateScore(self, iuid, juid, newuid, size_i, size_j):
        """Update the score of two merged clusters.
        @param iuid: id of the first cluster merged
        @param juid: id of the second cluster merged
        @param newuid: new id of the merged cluster
        @param size_i: size of the first cluster
        @param size_j: size of the second cluster"""
        total_size = size_i + size_j
        del self.scores[iuid]
        del self.scores[juid]
        self.scores[newuid] = {}
        for k in self.scores.keys():
            if k != newuid:
                self.scores[k][newuid] = (size_i * self.scores[k][iuid] + size_j * self.scores[k][juid]) * 1.0 / total_size
                del self.scores[k][iuid]
                del self.scores[k][juid]
                self.scores[newuid][k] = self.scores[k][newuid]

    def computePathTree(self):
        """TODO ?"""
        if self.path == []:
            clusterIndex = int(random.random() * len(self.scores.keys()))
            self.path.append(self.scores.keys()[0])
        if len(self.path) > 1:  # Check if Cl-1,Cl-2 minimum pair
            lastId = self.path[len(self.path) - 1]
            if max(self.scores[lastId], key=lambda x: self.scores[lastId][x]) == self.path[len(self.path) - 2]:
                return
        while True:
            lastId = self.path[len(self.path) - 1]
            juid = max(self.scores[lastId], key=lambda x: self.scores[lastId][x])
            self.path.append(juid)
            if max(self.scores[juid], key=lambda x: self.scores[juid][x]) == lastId:
                break

    def mergeEffectiveRowCol(self, i_maximum, j_maximum):
        """Merge the symbols i and j in the "symbols" structure
        @param i_maximum: id of the first symbol to merge
        @param j_maximum: id of the second symbol to merge
        @return the newly created symbol result of the merged process"""
        # Extract symbols i and j
        if i_maximum > j_maximum:
            symbol1 = self.symbols.pop(i_maximum)
            symbol2 = self.symbols.pop(j_maximum)
        else:
            symbol1 = self.symbols.pop(j_maximum)
            symbol2 = self.symbols.pop(i_maximum)

        # Merge the symbols i and j
        messages = []
        messages.extend(symbol1.getMessages())
        messages.extend(symbol2.getMessages())

        newSymbol = Symbol(str(uuid.uuid4()), symbol1.getName(), self.project)
        newSymbol.setMinEqu(self.minEquivalence)
        for message in messages:
            newSymbol.addMessage(message)

        # Append the new symbol to the "symbols" structure
        self.symbols.append(newSymbol)

        return newSymbol.getID()

    def executeOrphanReduction(self):
        """Execute the orphan reduction process by merging symbols
        which are progressively reduced in size."""
        leftReductionFactor = 0
        rightReductionFactor = 0
        currentReductionIsLeft = False
        increment = 10

        while leftReductionFactor < 80 and rightReductionFactor < 80:

            # First we retrieve the current orphans
            orphans = []
            tmp_symbols = []
            # extract orphans
            for i, symbol in zip(range(len(self.symbols)), self.symbols):
                if len(symbol.getMessages()) == 1:
                    orphans.append(symbol)

            # create a tmp symbols array where symbols will be added once computed
            for symbol in self.symbols:
                if len(symbol.getMessages()) > 1:
                    tmp_symbols.append(symbol)

            if len(orphans) <= 1:
                self.log.info("Number of orphan symbols: {0}. The orphan merging op. is finished!".format(len(orphans)))
                break

            self.symbols = orphans
            if currentReductionIsLeft:
                leftReductionFactor = leftReductionFactor + increment
                # Reduce the size of the messages from the left by the current reduction factor
                for orphan in self.symbols:
                    orphan.getMessages()[0].setLeftReductionFactor(leftReductionFactor)
                    orphan.getMessages()[0].setRightReductionFactor(0)

                self.log.info("Start to merge orphans reduced by {0}% from the left".format(str(leftReductionFactor)))
                self.executeClustering()
                currentReductionIsLeft = False

            if not currentReductionIsLeft:
                rightReductionFactor = rightReductionFactor + increment
                # Reduce the size of the messages from the right
                for orphan in self.symbols:
                    orphan.getMessages()[0].setRightReductionFactor(rightReductionFactor)
                    orphan.getMessages()[0].setLeftReductionFactor(0)

                self.log.info("Start to merge orphans reduced by {0}% from the right".format(str(rightReductionFactor)))
                self.executeClustering()
                currentReductionIsLeft = True

            for orphan in self.symbols:
                for message in orphan.getMessages():
                    message.setLeftReductionFactor(0)
                    message.setRightReductionFactor(0)
                tmp_symbols.append(orphan)
            self.symbols = tmp_symbols

        self.cb_executionStatus(3, 50.0, "Executing last alignment...")
        alignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        # Compute the regex/alignment of each symbol
        for symbol in self.symbols:
            alignment.alignField(symbol.getField())
        return self.symbols

    def getScores(self):
        """@return: the dictionnary of scores"""
        return self.scores

    def stop(self):
        """Stop the current execution of any clustering operation"""
        self.flagStop = True
        if self.currentAlignment is not None:
            self.currentAlignment.stop()

    def isFinish(self):
        """Compute if we should finish the current clustering operation"""
        return self.flagStop
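
The updateScore method above is the heart of UPGMA's average linkage: when two clusters merge, their score against every remaining cluster becomes the size-weighted mean of the two previous scores. A minimal worked example of that formula (the numbers are made up, not produced by _libScoreComputation):

def upgma_merge_score(score_ki, score_kj, size_i, size_j):
    """Size-weighted average linkage, as in UPGMA.updateScore()."""
    return (size_i * score_ki + size_j * score_kj) * 1.0 / (size_i + size_j)

# Cluster k scored 90 against cluster i (3 messages) and 60 against cluster j (1 message);
# after i and j merge, k's score against the new cluster becomes:
print(upgma_merge_score(90, 60, 3, 1))  # 82.5
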
Example No. 19
    def test_semanticAlignment_simple(self):
        """test_semanticAlignment_simple:
        Test that messages with embedded semantic tags are efficiently aligned.
        Format: <random 10 bytes><random username><random 5 ASCII><random email>

        Optimal Needleman & Wunsch parameters:
        // Cost definitions for the alignment
        static const short int MATCH = 5;
        static const short int SEMANTIC_MATCH = 30;
        static const short int MISMATCH = -5;
        static const short int GAP = 0;
        static const short int BLEN = 10;
        // Consts for the definition of a mask
        static const unsigned char END = 2;
        static const unsigned char DIFFERENT = 1;
        static const unsigned char EQUAL = 0;
        """
        project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
        symbol = Symbol(uuid.uuid4(), "Test", project)

        nbMessage = 500
        usernames = []
        emails = []
        for iMessage in range(0, nbMessage):
            str_username = self.generateRandomString(4, 10)
            username = TypeConvertor.stringToNetzobRaw(str_username)
            usernames.append(str_username)

            email_prefix = self.generateRandomString(4, 10)
            email_domain = self.generateRandomString(4, 10)
            email_extension = self.generateRandomString(2, 3)
            str_email = "{0}@{1}.{2}".format(email_prefix, email_domain, email_extension)
            emails.append(str_email)
            email = TypeConvertor.stringToNetzobRaw(str_email)
            random10Bytes = self.generateRandomBytes(10, 10)
            random5ASCII = TypeConvertor.stringToNetzobRaw(self.generateRandomString(5, 5))
            data = "{0}{1}{2}{3}".format(random10Bytes, username, random5ASCII, email)

            message = RawMessage(uuid.uuid4(), None, data)
            message.addSemanticTag("username", len(random10Bytes), len(random10Bytes) + len(username))
            message.addSemanticTag("email", len(random10Bytes) + len(username) + len(random5ASCII), len(random10Bytes) + len(username) + len(random5ASCII) + len(email))

            symbol.addMessage(message)

        nwEngine = NeedlemanAndWunsch(8, project, False, None)
        nwEngine.alignField(symbol.getField())

        symbol.getField().setFormat(Format.STRING)

        print("Number of computed fields : {0}".format(len(symbol.getExtendedFields())))
        self.assertEqual(4, len(symbol.getExtendedFields()))
        nbValidMessages = 0

        for message in symbol.getMessages():
            isValid = symbol.getField().isRegexValidForMessage(message)
            if isValid:
                nbValidMessages += 1
            self.assertTrue(isValid)

        print(symbol.getCells())

        print("Computed regex is valid for {0}/{1} messages.".format(nbValidMessages, len(symbol.getMessages())))
Example No. 21
class UPGMA(object):
    """This class provides the required methods to compute clustering
    between multiple symbols/messages using UPGMA algorithms (see U{http://en.wikipedia.org/wiki/UPGMA}).
    When processing, the matrix of scores is computed by the C extensions (L{_libScoreComputation}
    and used to regroup messages and symbols into equivalent cluster."""

    def __init__(self, project, symbols, unitSize, cb_status=None, scores={}):
        self.project = project
        self.unitSize = unitSize
        self.cb_status = cb_status
        self.scores = scores

        # Then we retrieve all the parameters of the CLUSTERING / ALIGNMENT
        self.defaultFormat = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_GLOBAL_FORMAT)
        self.nbIteration = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_NB_ITERATION)
        self.minEquivalence = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_EQUIVALENCE_THRESHOLD)
        self.doInternalSlick = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_DO_INTERNAL_SLICK)

        self.log = logging.getLogger('netzob.Inference.Vocabulary.UPGMA.py')
        self.path = []
        self.flagStop = False
        self.currentAlignment = None

        # Create a symbol for each message
        self.symbols = []
        i_symbol = 1
        for symbol in symbols:
            for m in symbol.getMessages():
                tmpSymbol = Symbol(str(uuid.uuid4()), "Symbol " + str(i_symbol), project)
                tmpSymbol.addMessage(m)
                self.symbols.append(tmpSymbol)
                i_symbol += 1
        self.log.debug("A number of {0} already aligned symbols will be clustered.".format(str(len(symbols))))

    def cb_executionStatus(self, stage, donePercent, currentMessage):
        """Callback function called by the C extension to provide info on status
        @param donePercent: a float between 0 and 100, inclusive
        @param currentMessage: a str which represents the current alignment status"""
        if self.cb_status is None:
            self.log.info("[UPGMA status]" + str(donePercent) + "% " + currentMessage)
        else:
            self.cb_status(stage, donePercent, currentMessage)

    def executeClustering(self):
        """Execute the clustering operation
        @return the new list of symbols"""
        self.log.debug("Re-Organize the symbols (nbIteration={0}, min_equivalence={1})".format(self.nbIteration, self.minEquivalence))
        # Process the UPGMA on symbols

        if self.isFinish():
            return None

        self.cb_executionStatus(0, 0, "Clustering into symbols...")
        self.processUPGMA()
        self.cb_executionStatus(1, 100, None)
        # Retrieve the alignment of each symbol and then build the associated regular expression
        self.cb_executionStatus(2, 0, "Compute the definition for each cluster...")

        if self.isFinish():
            return None

        self.currentAlignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        self.currentAlignment.absoluteStage = 2
        self.currentAlignment.statusRatio = len(self.symbols)
        self.currentAlignment.statusRatioOffset = 0

        for symbol in self.symbols:

            if self.isFinish():
                return None

            self.currentAlignment.alignField(symbol.getField())
            self.currentAlignment.statusRatioOffset = self.currentAlignment.statusRatioOffset + 1

        return self.symbols

    def processUPGMA(self):
        """Computes the matrix of equivalences (in C) and reduce it
        iteratively."""
        self.log.debug("Computing the associated matrix")

        # Execute the Clustering part in C
        debug = False
        wrapper = WrapperArgsFactory("_libScoreComputation.computeSimilarityMatrix")
        wrapper.typeList[wrapper.function](self.symbols)
        (listScores) = _libScoreComputation.computeSimilarityMatrix(self.doInternalSlick, self.cb_executionStatus, self.isFinish, debug, wrapper)
        # Retrieve the scores for each association of symbols
        self.scores = {}
        for (iuid, juid, score) in listScores:
            if self.isFinish():
                return (None, None, None)

            if iuid not in self.scores.keys():
                self.scores[iuid] = {}
            if juid not in self.scores.keys():
                self.scores[juid] = {}
            self.scores[iuid][juid] = score
            if iuid not in self.scores[juid].keys():
                self.scores[juid][iuid] = score

        # Reduce the UPGMA matrix (merge symbols by similarity)
        self.computePhylogenicTree()

    def computePhylogenicTree(self):
        """Compute the phylogenic tree
        @var max_i: uid of i_maximum
        @var max_j: uid of j_maximum
        @var maxScore: the highest global score"""
        maxScore = 0
        status = 0
        step = (float(100) - float(self.minEquivalence)) / float(100)

        if len(self.scores) > 1:
            max_i = max(self.scores, key=lambda x: self.scores[x][max(self.scores[x], key=lambda y: self.scores[x][y])])
            max_j = max(self.scores[max_i], key=lambda y: self.scores[max_i][y])
            maxScore = self.scores[max_i][max_j]
        while len(self.scores) > 1 and maxScore >= self.minEquivalence:

            if self.isFinish():
                return

            symbols_uid = [s.getID() for s in self.symbols]  # List of the UIDs of the symbols
            (i_maximum, j_maximum) = (symbols_uid.index(max_i), symbols_uid.index(max_j))
            size_i = len(self.symbols[i_maximum].getMessages())
            size_j = len(self.symbols[j_maximum].getMessages())

            infoMessage = "Clustering {0} with {1} (score = {2})".format(str(i_maximum), str(j_maximum), str(maxScore))
            status = (float(100) - float(maxScore)) / float(step)
            self.cb_executionStatus(1, status, infoMessage)

            newuid = self.mergeEffectiveRowCol(i_maximum, j_maximum)
            self.updateScore(max_i, max_j, newuid, size_i, size_j)
#            self.log.debug("Score après: {0}".format(str(self.scores)))
            if len(self.scores) > 1:
                max_i = max(self.scores, key=lambda x: self.scores[x][max(self.scores[x], key=lambda y: self.scores[x][y])])
                max_j = max(self.scores[max_i], key=lambda y: self.scores[max_i][y])
                maxScore = self.scores[max_i][max_j]

    def updateScore(self, iuid, juid, newuid, size_i, size_j):
        """Update the score of two merged clusters.
        @param iuid: id of the first cluster merged
        @param juid: id of the second cluster merged
        @param newuid: new id of the merged cluster
        @param size_i: size of the first cluster
        @param size_j: size of the second cluster"""
        total_size = size_i + size_j
        del self.scores[iuid]
        del self.scores[juid]
        self.scores[newuid] = {}
        for k in self.scores.keys():
            if k != newuid:
                self.scores[k][newuid] = (size_i * self.scores[k][iuid] + size_j * self.scores[k][juid]) * 1.0 / total_size
                del self.scores[k][iuid]
                del self.scores[k][juid]
                self.scores[newuid][k] = self.scores[k][newuid]

    def computePathTree(self):
        """TODO ?"""
        if self.path == []:
            clusterIndex = int(random.random() * len(self.scores.keys()))
            self.path.append(self.scores.keys()[0])
        if len(self.path) > 1:  # Check if Cl-1,Cl-2 minimum pair
            lastId = self.path[len(self.path) - 1]
            if max(self.scores[lastId], key=lambda x: self.scores[lastId][x]) == self.path[len(self.path) - 2]:
                return
        while True:
            lastId = self.path[len(self.path) - 1]
            juid = max(self.scores[lastId], key=lambda x: self.scores[lastId][x])
            self.path.append(juid)
            if max(self.scores[juid], key=lambda x: self.scores[juid][x]) == lastId:
                break

    def mergeEffectiveRowCol(self, i_maximum, j_maximum):
        """Merge the symbols i and j in the "symbols" structure
        @param i_maximum: id of the first symbol to merge
        @param j_maximum: id of the second symbol to merge
        @return the newly created symbol result of the merged process"""
        # Extract symbols i and j
        if i_maximum > j_maximum:
            symbol1 = self.symbols.pop(i_maximum)
            symbol2 = self.symbols.pop(j_maximum)
        else:
            symbol1 = self.symbols.pop(j_maximum)
            symbol2 = self.symbols.pop(i_maximum)

        # Merge the symbols i and j
        messages = []
        messages.extend(symbol1.getMessages())
        messages.extend(symbol2.getMessages())

        newSymbol = Symbol(str(uuid.uuid4()), symbol1.getName(), self.project)
        newSymbol.setMinEqu(self.minEquivalence)
        for message in messages:
            newSymbol.addMessage(message)

        # Append the new symbol to the "symbols" structure
        self.symbols.append(newSymbol)

        return newSymbol.getID()

    def executeOrphanReduction(self):
        """Execute the orphan reduction process by merging symbols
        which are progressively reduced in size."""
        leftReductionFactor = 0
        rightReductionFactor = 0
        currentReductionIsLeft = False
        increment = 10

        while leftReductionFactor < 80 and rightReductionFactor < 80:

            # First we retrieve the current orphans
            orphans = []
            tmp_symbols = []
            # extract orphans
            for i, symbol in zip(range(len(self.symbols)), self.symbols):
                if len(symbol.getMessages()) == 1:
                    orphans.append(symbol)

            # create a tmp symbols array where symbols will be added once computed
            for symbol in self.symbols:
                if len(symbol.getMessages()) > 1:
                    tmp_symbols.append(symbol)

            if len(orphans) <= 1:
                self.log.info("Number of orphan symbols: {0}. The orphan merging op. is finished!".format(len(orphans)))
                break

            self.symbols = orphans
            if currentReductionIsLeft:
                leftReductionFactor = leftReductionFactor + increment
                # Reduce the size of the messages from the left by the current reduction factor
                for orphan in self.symbols:
                    orphan.getMessages()[0].setLeftReductionFactor(leftReductionFactor)
                    orphan.getMessages()[0].setRightReductionFactor(0)

                self.log.info("Start to merge orphans reduced by {0}% from the left".format(str(leftReductionFactor)))
                self.executeClustering()
                currentReductionIsLeft = False

            if not currentReductionIsLeft:
                rightReductionFactor = rightReductionFactor + increment
                # Reduce the size of the messages from the right
                for orphan in self.symbols:
                    orphan.getMessages()[0].setRightReductionFactor(rightReductionFactor)
                    orphan.getMessages()[0].setLeftReductionFactor(0)

                self.log.info("Start to merge orphans reduced by {0}% from the right".format(str(rightReductionFactor)))
                self.executeClustering()
                currentReductionIsLeft = True

            for orphan in self.symbols:
                for message in orphan.getMessages():
                    message.setLeftReductionFactor(0)
                    message.setRightReductionFactor(0)
                tmp_symbols.append(orphan)
            self.symbols = tmp_symbols

        self.cb_executionStatus(3, 50.0, "Executing last alignment...")
        alignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        # Compute the regex/alignment of each symbol
        for symbol in self.symbols:
            alignment.alignField(symbol.getField())
        return self.symbols

    def getScores(self):
        """@return: the dictionnary of scores"""
        return self.scores

    def stop(self):
        """Stop the current execution of any clustering operation"""
        self.flagStop = True
        if self.currentAlignment is not None:
            self.currentAlignment.stop()

    def isFinish(self):
        """Compute if we should finish the current clustering operation"""
        return self.flagStop
Example No. 22
    def clusterByTokenization(self, symbols):
        self.ServerInference()
        ################################### Cluster messages according to their tokens
        ll = len(self.symbols) - 1
        i_equ = 0
        while (ll > 0):
            currentPattern = self.symbols[i_equ].getMessages()[0].getPattern(
            )[1]
            for j in range(ll):
                jnext = len(self.symbols) - j - 1
                cond = False
                for message in self.symbols[jnext].getMessages():
                    if currentPattern == message.getPattern()[1]:
                        #                        score = sum([p1 == p2 for p1, p2 in zip(currentPattern, message.getPattern()[1])])
                        score2 = self.computeSimilarities(
                            currentPattern,
                            message.getPattern()[1])
                        #                        if score >= min(len(currentPattern), len(message.getPattern()[1])):
                        minilength = min(
                            len(message.getData()),
                            len(self.symbols[i_equ].getMessages()
                                [0].getData()))
                        if score2 * 2.0 / minilength >= 0.40:
                            cond = True
                        if cond:
                            break

                if (cond):
                    currentDst = self.symbols[i_equ].getPattern()[0]
                    otherDst = self.symbols[jnext].getPattern()[0]
                    if not self.server or (currentDst == otherDst) or (
                            currentDst != self.server
                            and otherDst != self.server):
                        self.mergeEffectiveRowCol(i_equ, jnext)
                        #                        self.log.debug("Merge the equal column/line {0} with the column/line {1}".format(str(i_equ), str(j + 1)))
                        i_equ -= 1
                        break
            ll -= 1
            i_equ += 1

    ################################## Align messages
        alignment = NeedlemanAndWunsch(self.unitSize, self.cb_status)
        tmpSymbols = []
        for symbol in self.symbols:

            #            alignment.alignSymbols([symbol], self.project)
            #            symbol.getFields()[0].setFormat(Format.STRING)
            #            tmpSymbols.extend(alignment.getLastResult())
            try:
                #                print "l"
                al = self.computeAlignment(symbol)
                symbol.getField().setAlignment(al)
                alignment.buildRegexFromAlignment(symbol, al,
                                                  self.defaultFormat)

#                for (p, fields) in zip(symbol.getPattern()[1], symbol.getFields()):
#                    field.setFormat(p.getFormat())
            except:
                logging.warn(
                    "Partitioning error: too many fields (> 100) for the symbol '"
                    + symbol.getName() + "', len=" +
                    str(len(symbol.getExtendedFields())) + ", pattern len=" +
                    str(len(symbol.getPattern()[1])))
                symbol.getField().removeLocalFields()
                field = Field("Field 0", "(.{, })", symbol)
                symbol.addLocalField(field)
                # Use the default protocol type for representation
                field.setFormat(self.defaultFormat)

        alignment.alignSymbols(self.symbols, self.project)
        self.symbols = alignment.getLastResult()
Example No. 23
    def executeOrphanReduction(self):
        """Execute the orphan reduction process by merging symbols
        which are progressively reduced in size."""
        leftReductionFactor = 0
        rightReductionFactor = 0
        currentReductionIsLeft = False
        increment = 10

        while leftReductionFactor < 80 and rightReductionFactor < 80:

            # First we retrieve the current orphans
            orphans = []
            tmp_symbols = []
            # extract orphans
            for i, symbol in zip(range(len(self.symbols)), self.symbols):
                if len(symbol.getMessages()) == 1:
                    orphans.append(symbol)

            # create a tmp symbols array where symbols will be added once computed
            for symbol in self.symbols:
                if len(symbol.getMessages()) > 1:
                    tmp_symbols.append(symbol)

            if len(orphans) <= 1:
                self.log.info("Number of orphan symbols: {0}. The orphan merging op. is finished!".format(len(orphans)))
                break

            self.symbols = orphans
            if currentReductionIsLeft:
                leftReductionFactor = leftReductionFactor + increment
                # Reduce the size of the messages from the left by the current reduction factor
                for orphan in self.symbols:
                    orphan.getMessages()[0].setLeftReductionFactor(leftReductionFactor)
                    orphan.getMessages()[0].setRightReductionFactor(0)

                self.log.info("Start to merge orphans reduced by {0}% from the left".format(str(leftReductionFactor)))
                self.executeClustering()
                currentReductionIsLeft = False

            if not currentReductionIsLeft:
                rightReductionFactor = rightReductionFactor + increment
                # Reduce the size of the messages from the right
                for orphan in self.symbols:
                    orphan.getMessages()[0].setRightReductionFactor(rightReductionFactor)
                    orphan.getMessages()[0].setLeftReductionFactor(0)

                self.log.info("Start to merge orphans reduced by {0}% from the right".format(str(rightReductionFactor)))
                self.executeClustering()
                currentReductionIsLeft = True

            for orphan in self.symbols:
                for message in orphan.getMessages():
                    message.setLeftReductionFactor(0)
                    message.setRightReductionFactor(0)
                tmp_symbols.append(orphan)
            self.symbols = tmp_symbols

        self.cb_executionStatus(3, 50.0, "Executing last alignment...")
        alignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        # Compute the regex/alignment of each symbol
        for symbol in self.symbols:
            alignment.alignField(symbol.getField())
        return self.symbols