Пример #1
0
    def test_semanticAlignment_bug1(self):
        """test_semanticAlignment_bug1:
        A bug on the semantic alignment has been identified which prevent
        the computation of a valid regex. This test verifies the bug is not comming back.
        @date 18/04/2013
        """

        firstname1 = "antoine"
        email1 = "*****@*****.**"

        firstname2 = "luc"
        email2 = "*****@*****.**"

        msg1 = RawMessage(uuid.uuid4(), None, TypeConvertor.stringToNetzobRaw("6" + firstname1 + "GAHFSHQS" + email1))
        msg2 = RawMessage(uuid.uuid4(), None, TypeConvertor.stringToNetzobRaw("3" + firstname2 + "CVSDHISD" + email2))

        project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
        nwEngine = NeedlemanAndWunsch(8, project, False, None)
        symbol = Symbol(uuid.uuid4(), "Test", project)

        symbol.addMessages([msg1, msg2])
        msg1.addSemanticTag("firstname", 2, 2 + len(firstname1) * 2)
        msg1.addSemanticTag("email", 2 + len(firstname1) * 2 + 16, 2 + len(firstname1) * 2 + 16 + len(email1) * 2)

        msg2.addSemanticTag("firstname", 2, 2 + len(firstname2) * 2)
        msg2.addSemanticTag("email", 2 + len(firstname2) * 2 + 16, 2 + len(firstname2) * 2 + 16 + len(email2) * 2)

        nwEngine.alignField(symbol.getField())
        symbol.getField().setFormat(Format.STRING)

        print("Computed Regex : {0}".format(symbol.getRegex()))
        print("=======")
        print(symbol.getCells(True))

        computedFields = symbol.getExtendedFields()
        self.assertTrue(len(computedFields) > 1, "Only one field has been computed which tells us something went wrong.")
Пример #2
0
    def test_AlignementOfMessages(self):
        alignmentSolution = NeedlemanAndWunsch(4)
        nbTest = 100

        for iTest in range(0, nbTest):
            messages = []
            # Generate a random number of message to serialize
            nbMessage = random.randint(2, 50)
            for iMessage in range(0, nbMessage):
                data = TypeConvertor.stringToNetzobRaw(
                    "bonjour" + self.generateRandomString(5, 30) +
                    ", tout va bien ?")
                message = RawMessage(str(uuid.uuid4()), str(time.time()), data)
                messages.append(message)

            (alignment, scores) = alignmentSolution.align(False, messages)
            (score1, score2, score3) = scores
            (alignmentBis, scoresBis) = alignmentSolution.align(True, messages)
            (scoreBis1, scoreBis2, scoreBis3) = scoresBis
            print(alignment)
            print(alignmentBis)

            self.assertGreaterEqual(scoreBis1, score1)
            self.assertGreaterEqual(scoreBis1, 90)
Пример #3
0
    def test_semanticAlignment_bug1(self):
        """test_semanticAlignment_bug1:
        A bug on the semantic alignment has been identified which prevent
        the computation of a valid regex. This test verifies the bug is not comming back.
        @date 18/04/2013
        """

        firstname1 = "antoine"
        email1 = "*****@*****.**"

        firstname2 = "luc"
        email2 = "*****@*****.**"

        msg1 = RawMessage(uuid.uuid4(), None, TypeConvertor.stringToNetzobRaw("6" + firstname1 + "GAHFSHQS" + email1))
        msg2 = RawMessage(uuid.uuid4(), None, TypeConvertor.stringToNetzobRaw("3" + firstname2 + "CVSDHISD" + email2))

        project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
        nwEngine = NeedlemanAndWunsch(8, project, False, None)
        symbol = Symbol(uuid.uuid4(), "Test", project)

        symbol.addMessages([msg1, msg2])
        msg1.addSemanticTag("firstname", 2, 2 + len(firstname1) * 2)
        msg1.addSemanticTag("email", 2 + len(firstname1) * 2 + 16, 2 + len(firstname1) * 2 + 16 + len(email1) * 2)

        msg2.addSemanticTag("firstname", 2, 2 + len(firstname2) * 2)
        msg2.addSemanticTag("email", 2 + len(firstname2) * 2 + 16, 2 + len(firstname2) * 2 + 16 + len(email2) * 2)

        nwEngine.alignField(symbol.getField())
        symbol.getField().setFormat(Format.STRING)

        print("Computed Regex : {0}".format(symbol.getRegex()))
        print(symbol.getCells(True))

        computedFields = symbol.getExtendedFields()
        self.assertTrue(len(computedFields) > 1, "Only one field has been computed which tells us something went wrong.")
Пример #4
0
    def test_semanticAlignment_simple(self):
        """test_semanticAlignment_simple:
        Test that messages with embedded semantic are efficiently aligned.
        Format : <random 10 bytes><random username><random 5 ASCII><random email>

        Optimal Needleman & Wunsch Parameters :
        // Cost definitions for the alignment
        static const short int MATCH = 5;
        static const short int SEMANTIC_MATCH = 30;
        static const short int MISMATCH = -5;
        static const short int GAP = 0;
        static const short int BLEN = 10;
        // Consts for the definition of a mask
        static const unsigned char END = 2;
        static const unsigned char DIFFERENT = 1;
        static const unsigned char EQUAL = 0;
        """
        project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
        symbol = Symbol(uuid.uuid4(), "Test", project)

        nbMessage = 500
        usernames = []
        emails = []
        for iMessage in range(0, nbMessage):
            str_username = self.generateRandomString(4, 10)
            username = TypeConvertor.stringToNetzobRaw(str_username)
            usernames.append(str_username)

            email_prefix = self.generateRandomString(4, 10)
            email_domain = self.generateRandomString(4, 10)
            email_extension = self.generateRandomString(2, 3)
            str_email = "{0}@{1}.{2}".format(email_prefix, email_domain, email_extension)
            emails.append(str_email)
            email = TypeConvertor.stringToNetzobRaw(str_email)
            random10Bytes = self.generateRandomBytes(10, 10)
            random5ASCII = TypeConvertor.stringToNetzobRaw(self.generateRandomString(5, 5))
            data = "{0}{1}{2}{3}".format(random10Bytes, username, random5ASCII, email)

            message = RawMessage(uuid.uuid4(), None, data)
            message.addSemanticTag("username", len(random10Bytes), len(random10Bytes) + len(username))
            message.addSemanticTag("email", len(random10Bytes) + len(username) + len(random5ASCII), len(random10Bytes) + len(username) + len(random5ASCII) + len(email))

            symbol.addMessage(message)

        nwEngine = NeedlemanAndWunsch(8, project, False, None)
        nwEngine.alignField(symbol.getField())

        symbol.getField().setFormat(Format.STRING)

        print("Number of computed fields : {0}".format(len(symbol.getExtendedFields())))
        self.assertEqual(4, len(symbol.getExtendedFields()))
        nbValidMessages = 0

        for message in symbol.getMessages():
            isValid = symbol.getField().isRegexValidForMessage(message)
            if isValid:
                nbValidMessages += 1
            self.assertTrue(isValid)

        print(symbol.getCells())

        print("Computed regex is valid for {0}/{1} messages.".format(nbValidMessages, len(symbol.getMessages())))
Пример #5
0
    def test_semanticAlignment_simple(self):
        """test_semanticAlignment_simple:
        Test that messages with embedded semantic are efficiently aligned.
        Format : <random 10 bytes><random username><random 5 ASCII><random email>

        Optimal Needleman & Wunsch Parameters :
        // Cost definitions for the alignment
        static const short int MATCH = 5;
        static const short int SEMANTIC_MATCH = 30;
        static const short int MISMATCH = -5;
        static const short int GAP = 0;
        static const short int BLEN = 10;
        // Consts for the definition of a mask
        static const unsigned char END = 2;
        static const unsigned char DIFFERENT = 1;
        static const unsigned char EQUAL = 0;
        """
        project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
        symbol = Symbol(uuid.uuid4(), "Test", project)

        nbMessage = 500
        usernames = []
        emails = []
        for iMessage in range(0, nbMessage):
            str_username = self.generateRandomString(4, 10)
            username = TypeConvertor.stringToNetzobRaw(str_username)
            usernames.append(str_username)

            email_prefix = self.generateRandomString(4, 10)
            email_domain = self.generateRandomString(4, 10)
            email_extension = self.generateRandomString(2, 3)
            str_email = "{0}@{1}.{2}".format(email_prefix, email_domain, email_extension)
            emails.append(str_email)
            email = TypeConvertor.stringToNetzobRaw(str_email)
            random10Bytes = self.generateRandomBytes(10, 10)
            random5ASCII = TypeConvertor.stringToNetzobRaw(self.generateRandomString(5, 5))
            data = "{0}{1}{2}{3}".format(random10Bytes, username, random5ASCII, email)

            message = RawMessage(uuid.uuid4(), None, data)
            message.addSemanticTag("username", len(random10Bytes), len(random10Bytes) + len(username))
            message.addSemanticTag("email", len(random10Bytes) + len(username) + len(random5ASCII), len(random10Bytes) + len(username) + len(random5ASCII) + len(email))

            symbol.addMessage(message)

        nwEngine = NeedlemanAndWunsch(8, project, False, None)
        nwEngine.alignField(symbol.getField())

        symbol.getField().setFormat(Format.STRING)

        print("Number of computed fields : {0}".format(len(symbol.getExtendedFields())))
        self.assertEqual(4, len(symbol.getExtendedFields()))
        nbValidMessages = 0

        for message in symbol.getMessages():
            isValid = symbol.getField().isRegexValidForMessage(message)
            if isValid:
                nbValidMessages += 1
            self.assertTrue(isValid)

        print(symbol.getCells())

        print("Computed regex is valid for {0}/{1} messages.".format(nbValidMessages, len(symbol.getMessages())))
Пример #6
0
    def test_executingClustering(self):

        # We create 6 messages of 2 group

        # group1
        originalSymbol1 = Symbol(str(uuid.uuid4()), "TestSymbol", None)
        message1 = RawMessage(
            str(uuid.uuid4()), str(time.time()),
            TypeConvertor.stringToNetzobRaw("bonjour " +
                                            self.generateRandomString(20, 30) +
                                            " comment vas-tu ?"))
        originalSymbol1.addMessage(message1)

        originalSymbol2 = Symbol(str(uuid.uuid4()), "TestSymbol2", None)
        message2 = RawMessage(
            str(uuid.uuid4()), str(time.time()),
            TypeConvertor.stringToNetzobRaw("bonjour " +
                                            self.generateRandomString(20, 30) +
                                            " comment vas-tu ?"))
        originalSymbol2.addMessage(message2)

        originalSymbol3 = Symbol(str(uuid.uuid4()), "TestSymbol3", None)
        message3 = RawMessage(
            str(uuid.uuid4()), str(time.time()),
            TypeConvertor.stringToNetzobRaw("bonjour " +
                                            self.generateRandomString(20, 30) +
                                            " comment vas-tu ?"))
        originalSymbol3.addMessage(message3)

        # group2
        originalSymbol4 = Symbol(str(uuid.uuid4()), "TestSymbol4", None)
        message4 = RawMessage(
            str(uuid.uuid4()), str(time.time()),
            TypeConvertor.stringToNetzobRaw("salut à toi " +
                                            self.generateRandomString(10, 15) +
                                            " what's up ?"))
        originalSymbol4.addMessage(message4)

        originalSymbol5 = Symbol(str(uuid.uuid4()), "TestSymbol5", None)
        message5 = RawMessage(
            str(uuid.uuid4()), str(time.time()),
            TypeConvertor.stringToNetzobRaw("salut à toi " +
                                            self.generateRandomString(10, 15) +
                                            " what's up ?"))
        originalSymbol5.addMessage(message5)

        originalSymbol6 = Symbol(str(uuid.uuid4()), "TestSymbol6", None)
        message6 = RawMessage(
            str(uuid.uuid4()), str(time.time()),
            TypeConvertor.stringToNetzobRaw("salut à toi " +
                                            self.generateRandomString(10, 15) +
                                            " what's up ?"))
        originalSymbol6.addMessage(message6)

        symbols = [
            originalSymbol1, originalSymbol2, originalSymbol3, originalSymbol4,
            originalSymbol5, originalSymbol6
        ]

        # Start the clustering
        clusteringSolution = UPGMA(None, symbols, True, 100, 90, True,
                                   Format.ASCII)
        result = clusteringSolution.executeClustering()

        for symbol in result:
            print("Symbol: " + str(symbol.getName()))
            for m in symbol.getMessages():
                print(" + " + str(m.getStringData()))
Пример #7
0
    def test_executingClusteringWithOrphanReduction(self):

        # We create 6 messages of 2 group

        # group1
        originalSymbol1 = Symbol(str(uuid.uuid4()), "TestSymbol", None)
        message1 = RawMessage(
            str(uuid.uuid4()), str(time.time()),
            TypeConvertor.stringToNetzobRaw(
                "bonjour " + self.generateRandomString(200, 1000)))
        originalSymbol1.addMessage(message1)

        originalSymbol2 = Symbol(str(uuid.uuid4()), "TestSymbol2", None)
        message2 = RawMessage(
            str(uuid.uuid4()), str(time.time()),
            TypeConvertor.stringToNetzobRaw(
                "bonjour " + self.generateRandomString(200, 1000)))
        originalSymbol2.addMessage(message2)

        originalSymbol3 = Symbol(str(uuid.uuid4()), "TestSymbol3", None)
        message3 = RawMessage(
            str(uuid.uuid4()), str(time.time()),
            TypeConvertor.stringToNetzobRaw(
                "bonjour " + self.generateRandomString(200, 1000)))
        originalSymbol3.addMessage(message3)

        # group2
        originalSymbol4 = Symbol(str(uuid.uuid4()), "TestSymbol4", None)
        message4 = RawMessage(
            str(uuid.uuid4()), str(time.time()),
            TypeConvertor.stringToNetzobRaw(
                "salut " + self.generateRandomString(200, 1000)))
        originalSymbol4.addMessage(message4)

        originalSymbol5 = Symbol(str(uuid.uuid4()), "TestSymbol5", None)
        message5 = RawMessage(
            str(uuid.uuid4()), str(time.time()),
            TypeConvertor.stringToNetzobRaw(
                "salut " + self.generateRandomString(200, 1000)))
        originalSymbol5.addMessage(message5)

        originalSymbol6 = Symbol(str(uuid.uuid4()), "TestSymbol6", None)
        message6 = RawMessage(
            str(uuid.uuid4()), str(time.time()),
            TypeConvertor.stringToNetzobRaw(
                "salut " + self.generateRandomString(200, 1000)))
        originalSymbol6.addMessage(message6)

        symbols = [
            originalSymbol1, originalSymbol2, originalSymbol3, originalSymbol4,
            originalSymbol5, originalSymbol6
        ]

        # Start the clustering
        clusteringSolution = UPGMA(None, symbols, True, 100, 80, True,
                                   Format.ASCII)
        resultBeforeOrphan = clusteringSolution.executeClustering()
        resultAfterOrphan = clusteringSolution.executeOrphanReduction()

        if (len(resultAfterOrphan) < len(resultBeforeOrphan)):
            print("Before Orphan Reduction: ")
            for symbol in resultBeforeOrphan:
                print("Symbol: " + str(symbol.getName()))
                for m in symbol.getMessages():
                    print(" + " + str(m.getStringData()))

            print("After Orphan Reduction: ")
            for symbol in resultAfterOrphan:
                print("Symbol: " + str(symbol.getName()))
                for m in symbol.getMessages():
                    print(" + " + str(m.getStringData()))

        self.assertGreaterEqual(len(resultBeforeOrphan),
                                len(resultAfterOrphan))