def test_semanticAlignment_bug1(self):
    """test_semanticAlignment_bug1:
    A bug on the semantic alignment has been identified which prevent the
    computation of a valid regex. This test verifies the bug is not comming back.
    @date 18/04/2013
    """
    # Two messages sharing the layout <digit><firstname><8 random chars><email>.
    name_a = "antoine"
    mail_a = "*****@*****.**"
    name_b = "luc"
    mail_b = "*****@*****.**"

    payload_a = TypeConvertor.stringToNetzobRaw("6" + name_a + "GAHFSHQS" + mail_a)
    payload_b = TypeConvertor.stringToNetzobRaw("3" + name_b + "CVSDHISD" + mail_b)
    message_a = RawMessage(uuid.uuid4(), None, payload_a)
    message_b = RawMessage(uuid.uuid4(), None, payload_b)

    project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
    nwEngine = NeedlemanAndWunsch(8, project, False, None)
    symbol = Symbol(uuid.uuid4(), "Test", project)
    symbol.addMessages([message_a, message_b])

    # Tag the semantic zones of each message. Offsets are doubled: presumably
    # they address half-byte positions in the netzob raw representation —
    # TODO confirm against addSemanticTag's contract.
    for message, name, mail in ((message_a, name_a, mail_a), (message_b, name_b, mail_b)):
        message.addSemanticTag("firstname", 2, 2 + len(name) * 2)
        message.addSemanticTag("email", 2 + len(name) * 2 + 16, 2 + len(name) * 2 + 16 + len(mail) * 2)

    nwEngine.alignField(symbol.getField())
    symbol.getField().setFormat(Format.STRING)

    print("Computed Regex : {0}".format(symbol.getRegex()))
    print(symbol.getCells(True))

    # The bug produced a single monolithic field; a valid alignment yields several.
    computedFields = symbol.getExtendedFields()
    self.assertTrue(len(computedFields) > 1, "Only one field has been computed which tells us something went wrong.")
def test_randomAlignmentsWithTwoCenteredMessages(self): workspace = self.getWorkspace() currentProject = workspace.getProjects()[0] doInternalSlick = currentProject.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_DO_INTERNAL_SLICK) defaultFormat = currentProject.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_GLOBAL_FORMAT) defaultUnitSize = 8 # We generate 1000 random couples of data and try to align them # Objectives: just test if it executes nb_data = 1000 nb_failed = 0 nb_success = 0 for i_test in range(0, nb_data): common_pattern = self.generateRandomString(30, 40) # Generate the content of two messages data1 = TypeConvertor.stringToNetzobRaw(self.generateRandomString(5, 100) + common_pattern + self.generateRandomString(5, 100)) data2 = TypeConvertor.stringToNetzobRaw(self.generateRandomString(5, 100) + common_pattern + self.generateRandomString(5, 100)) # Create the messages message1 = RawMessage(str(uuid.uuid4()), str(time.time()), data1) message2 = RawMessage(str(uuid.uuid4()), str(time.time()), data2) # Create the symbol symbol = Symbol(str(uuid.uuid4()), "test_randomAlignments#" + str(i_test), currentProject) symbol.addMessage(message1) symbol.addMessage(message2) field = symbol.getField() # Starts the alignment process alignmentProcess = NeedlemanAndWunsch(defaultUnitSize, currentProject, False, self.emptyAlignmentCB) alignmentProcess.alignField(field) if not TypeConvertor.stringToNetzobRaw(common_pattern[:]) in field.getAlignment(): if self.debug is True: print "Message 1: " + str(data1) print "Message 2: " + str(data2) print "Common pattern: " + TypeConvertor.stringToNetzobRaw(common_pattern) print "Alignment: " + field.getAlignment() nb_failed += 1 else: nb_success += 1 if nb_failed > 0: print "A number of " + str(nb_failed) + "/" + str(nb_data) + " alignment failed !" self.assertEqual(0, nb_failed) self.assertEqual(nb_success, nb_data)
def test_semanticAlignment_bug1(self):
    """test_semanticAlignment_bug1:
    A bug on the semantic alignment has been identified which prevent the
    computation of a valid regex. This test verifies the bug is not comming back.
    @date 18/04/2013
    """
    firstFirstname = "antoine"
    firstEmail = "*****@*****.**"
    secondFirstname = "luc"
    secondEmail = "*****@*****.**"

    # Layout of each message: <digit><firstname><8 chars><email>
    firstMessage = RawMessage(uuid.uuid4(), None, TypeConvertor.stringToNetzobRaw("6" + firstFirstname + "GAHFSHQS" + firstEmail))
    secondMessage = RawMessage(uuid.uuid4(), None, TypeConvertor.stringToNetzobRaw("3" + secondFirstname + "CVSDHISD" + secondEmail))

    project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
    alignmentEngine = NeedlemanAndWunsch(8, project, False, None)
    symbol = Symbol(uuid.uuid4(), "Test", project)
    symbol.addMessages([firstMessage, secondMessage])

    # Doubled offsets: presumably half-byte positions in the raw encoding —
    # TODO confirm.
    firstNameEnd = 2 + len(firstFirstname) * 2
    firstMessage.addSemanticTag("firstname", 2, firstNameEnd)
    firstMessage.addSemanticTag("email", firstNameEnd + 16, firstNameEnd + 16 + len(firstEmail) * 2)
    secondNameEnd = 2 + len(secondFirstname) * 2
    secondMessage.addSemanticTag("firstname", 2, secondNameEnd)
    secondMessage.addSemanticTag("email", secondNameEnd + 16, secondNameEnd + 16 + len(secondEmail) * 2)

    alignmentEngine.alignField(symbol.getField())
    symbol.getField().setFormat(Format.STRING)

    print("Computed Regex : {0}".format(symbol.getRegex()))
    print("=======")
    print(symbol.getCells(True))

    # A single computed field means the alignment collapsed: the bug is back.
    computedFields = symbol.getExtendedFields()
    self.assertTrue(len(computedFields) > 1, "Only one field has been computed which tells us something went wrong.")
def test_semanticAlignment_simple(self):
    """test_semanticAlignment_simple:
    Test that messages with embedded semantic are efficiently aligned.
    Format : <random 10 bytes><random username><random 5 ASCII><random email>

    Optimal Needleman & Wunsch Parameters :
        // Cost definitions for the alignment
        static const short int MATCH = 5;
        static const short int SEMANTIC_MATCH = 30;
        static const short int MISMATCH = -5;
        static const short int GAP = 0;
        static const short int BLEN = 10;
        // Consts for the definition of a mask
        static const unsigned char END = 2;
        static const unsigned char DIFFERENT = 1;
        static const unsigned char EQUAL = 0;
    """
    project = Project(uuid.uuid4(), "Experiment", datetime.now(), "")
    symbol = Symbol(uuid.uuid4(), "Test", project)
    nbMessage = 500
    usernames = []
    emails = []
    for iMessage in range(0, nbMessage):
        # Random username part
        str_username = self.generateRandomString(4, 10)
        username = TypeConvertor.stringToNetzobRaw(str_username)
        usernames.append(str_username)

        # Random email part: <prefix>@<domain>.<extension>
        email_prefix = self.generateRandomString(4, 10)
        email_domain = self.generateRandomString(4, 10)
        email_extension = self.generateRandomString(2, 3)
        str_email = "{0}@{1}.{2}".format(email_prefix, email_domain, email_extension)
        emails.append(str_email)
        email = TypeConvertor.stringToNetzobRaw(str_email)

        random10Bytes = self.generateRandomBytes(10, 10)
        random5ASCII = TypeConvertor.stringToNetzobRaw(self.generateRandomString(5, 5))

        data = "{0}{1}{2}{3}".format(random10Bytes, username, random5ASCII, email)
        message = RawMessage(uuid.uuid4(), None, data)
        # Tag the semantic zones by their offsets inside the raw data.
        message.addSemanticTag("username", len(random10Bytes), len(random10Bytes) + len(username))
        message.addSemanticTag("email", len(random10Bytes) + len(username) + len(random5ASCII), len(random10Bytes) + len(username) + len(random5ASCII) + len(email))
        symbol.addMessage(message)

    nwEngine = NeedlemanAndWunsch(8, project, False, None)
    nwEngine.alignField(symbol.getField())
    symbol.getField().setFormat(Format.STRING)

    # FIX: this format string was previously split across a physical line,
    # embedding a stray newline in the middle of the literal.
    print("Number of computed fields : {0}".format(len(symbol.getExtendedFields())))
    # 4 expected fields: random10Bytes / username / random5ASCII / email
    self.assertEqual(4, len(symbol.getExtendedFields()))

    nbValidMessages = 0
    for message in symbol.getMessages():
        isValid = symbol.getField().isRegexValidForMessage(message)
        if isValid:
            nbValidMessages += 1
        self.assertTrue(isValid)

    print(symbol.getCells())
    print("Computed regex is valid for {0}/{1} messages.".format(nbValidMessages, len(symbol.getMessages())))
class UPGMA(object):
    """This class provides the required methods to compute clustering
    between multiple symbols/messages using UPGMA algorithms (see
    U{http://en.wikipedia.org/wiki/UPGMA}). When processing, the matrix of
    scores is computed by the C extensions (L{_libScoreComputation}) and
    used to regroup messages and symbols into equivalent cluster."""

    def __init__(self, project, symbols, unitSize, cb_status=None, scores=None):
        """Constructor.
        @param project: the current project (provides the clustering configuration)
        @param symbols: the symbols whose messages will be clustered
        @param unitSize: the unit size used by the alignment engine
        @param cb_status: optional callback fed with (stage, donePercent, message)
        @param scores: optional pre-computed score matrix (uid -> uid -> score)"""
        self.project = project
        self.unitSize = unitSize
        self.cb_status = cb_status
        # FIX: the previous signature used 'scores={}', a mutable default
        # argument shared across every instance; None is used as sentinel.
        self.scores = scores if scores is not None else {}
        # Then we retrieve all the parameters of the CLUSTERING / ALIGNMENT
        self.defaultFormat = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_GLOBAL_FORMAT)
        self.nbIteration = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_NB_ITERATION)
        self.minEquivalence = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_EQUIVALENCE_THRESHOLD)
        self.doInternalSlick = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_DO_INTERNAL_SLICK)
        self.log = logging.getLogger('netzob.Inference.Vocabulary.UPGMA.py')
        self.path = []
        self.flagStop = False
        self.currentAlignment = None
        # Create a symbol for each message: clustering starts from singletons.
        self.symbols = []
        i_symbol = 1
        for symbol in symbols:
            for m in symbol.getMessages():
                tmpSymbol = Symbol(str(uuid.uuid4()), "Symbol " + str(i_symbol), project)
                tmpSymbol.addMessage(m)
                self.symbols.append(tmpSymbol)
                i_symbol += 1
        self.log.debug("A number of {0} already aligned symbols will be clustered.".format(str(len(symbols))))

    def cb_executionStatus(self, stage, donePercent, currentMessage):
        """Callback function called by the C extension to provide info on status
        @param donePercent: a float between 0 and 100 included
        @param currentMessage: a str which represents the current alignment status"""
        if self.cb_status is None:
            self.log.info("[UPGMA status]" + str(donePercent) + "% " + currentMessage)
        else:
            self.cb_status(stage, donePercent, currentMessage)

    def executeClustering(self):
        """Execute the clustering operation
        @return the new list of symbols, or None if the operation was stopped"""
        self.log.debug("Re-Organize the symbols (nbIteration={0}, min_equivalence={1})".format(self.nbIteration, self.minEquivalence))
        # Process the UPGMA on symbols
        if self.isFinish():
            return None
        self.cb_executionStatus(0, 0, "Clustering into symbols...")
        self.processUPGMA()
        self.cb_executionStatus(1, 100, None)
        # Retrieve the alignment of each symbol and the build the associated regular expression
        self.cb_executionStatus(2, 0, "Compute the definition for each cluster...")
        if self.isFinish():
            return None
        self.currentAlignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        self.currentAlignment.absoluteStage = 2
        self.currentAlignment.statusRatio = len(self.symbols)
        self.currentAlignment.statusRatioOffset = 0
        for symbol in self.symbols:
            if self.isFinish():
                return None
            self.currentAlignment.alignField(symbol.getField())
            self.currentAlignment.statusRatioOffset = self.currentAlignment.statusRatioOffset + 1
        return self.symbols

    def processUPGMA(self):
        """Computes the matrix of equivalences (in C) and reduce it iteratively.
        @return a tuple (i_max, j_max, maxScore), or (None, None, None) when stopped"""
        self.log.debug("Computing the associated matrix")
        # Execute the Clustering part in C
        debug = False
        wrapper = WrapperArgsFactory("_libScoreComputation.getHighestEquivalentGroup")
        wrapper.typeList[wrapper.function](self.symbols)
        (i_max, j_max, maxScore, listScores) = _libScoreComputation.getHighestEquivalentGroup(self.doInternalSlick, self.cb_executionStatus, self.isFinish, debug, wrapper)
        # Retrieve the scores for each association of symbols
        self.scores = {}
        for (iuid, juid, score) in listScores:
            if self.isFinish():
                return (None, None, None)
            if iuid not in self.scores.keys():
                self.scores[iuid] = {}
            if juid not in self.scores.keys():
                self.scores[juid] = {}
            self.scores[iuid][juid] = score
            if iuid not in self.scores[juid].keys():
                self.scores[juid][iuid] = score
        # Reduce the UPGMA matrix (merge symbols by similarity)
        self.computePhylogenicTree()
        return (i_max, j_max, maxScore)

    def computePhylogenicTree(self):
        """Compute the phylogenic tree
        @var max_i: uid of i_maximum
        @var max_j: uid of j_maximum
        @var maxScore: the highest global score"""
        maxScore = 0
        status = 0
        step = (float(100) - float(self.minEquivalence)) / float(100)
        if len(self.scores) > 1:
            # Locate the best-scoring pair (max_i, max_j) in the matrix.
            max_i = max(self.scores, key=lambda x: self.scores[x][max(self.scores[x], key=lambda y: self.scores[x][y])])
            max_j = max(self.scores[max_i], key=lambda y: self.scores[max_i][y])
            maxScore = self.scores[max_i][max_j]
        # Merge pairs until only one cluster remains or the best score drops
        # below the configured equivalence threshold.
        while len(self.scores) > 1 and maxScore >= self.minEquivalence:
            if self.isFinish():
                return
            symbols_uid = [s.getID() for s in self.symbols]  # List of the UID of symbols
            (i_maximum, j_maximum) = (symbols_uid.index(max_i), symbols_uid.index(max_j))
            size_i = len(self.symbols[i_maximum].getMessages())
            size_j = len(self.symbols[j_maximum].getMessages())
            infoMessage = "Clustering {0} with {1} (score = {2})".format(str(i_maximum), str(j_maximum), str(maxScore))
            status = (float(100) - float(maxScore)) / float(step)
            self.cb_executionStatus(1, status, infoMessage)
            newuid = self.mergeEffectiveRowCol(i_maximum, j_maximum)
            self.updateScore(max_i, max_j, newuid, size_i, size_j)
            # self.log.debug("Score après: {0}".format(str(self.scores)))
            if len(self.scores) > 1:
                max_i = max(self.scores, key=lambda x: self.scores[x][max(self.scores[x], key=lambda y: self.scores[x][y])])
                max_j = max(self.scores[max_i], key=lambda y: self.scores[max_i][y])
                maxScore = self.scores[max_i][max_j]

    def updateScore(self, iuid, juid, newuid, size_i, size_j):
        """Update the score of two merged clusters.
        @param iuid: id of the first cluster merged
        @param juid: id of the second cluster merged
        @param newuid: new id of the merged cluster
        @param size_i: size of the first cluster
        @param size_j: size of the second cluster"""
        total_size = size_i + size_j
        del self.scores[iuid]
        del self.scores[juid]
        self.scores[newuid] = {}
        for k in self.scores.keys():
            if k != newuid:
                # UPGMA weighted average of the two merged clusters' scores.
                self.scores[k][newuid] = (size_i * self.scores[k][iuid] + size_j * self.scores[k][juid]) * 1.0 / total_size
                del self.scores[k][iuid]
                del self.scores[k][juid]
                self.scores[newuid][k] = self.scores[k][newuid]

    def computePathTree(self):
        """TODO ?"""
        if self.path == []:
            # NOTE(review): clusterIndex is computed but unused (the first key
            # is taken instead); kept as-is to preserve the random module's
            # state sequence. Py2-only: dict.keys()[0] is not indexable in Py3.
            clusterIndex = int(random.random() * len(self.scores.keys()))
            self.path.append(self.scores.keys()[0])
        if len(self.path) > 1:
            # Check if Cl-1,Cl-2 minimum pair
            lastId = self.path[len(self.path) - 1]
            if max(self.scores[lastId], key=lambda x: self.scores[lastId][x]) == self.path[len(self.path) - 2]:
                return
        while True:
            lastId = self.path[len(self.path) - 1]
            juid = max(self.scores[lastId], key=lambda x: self.scores[lastId][x])
            self.path.append(juid)
            if max(self.scores[juid], key=lambda x: self.scores[juid][x]) == lastId:
                break

    def mergeEffectiveRowCol(self, i_maximum, j_maximum):
        """Merge the symbols i and j in the "symbols" structure
        @param i_maximum: id of the first symbol to merge
        @param j_maximum: id of the second symbol to merge
        @return the newly created symbol result of the merged process"""
        # Extract symbols i and j (pop the larger index first so the smaller
        # index stays valid)
        if i_maximum > j_maximum:
            symbol1 = self.symbols.pop(i_maximum)
            symbol2 = self.symbols.pop(j_maximum)
        else:
            symbol1 = self.symbols.pop(j_maximum)
            symbol2 = self.symbols.pop(i_maximum)
        # Merge the symbols i and j
        messages = []
        messages.extend(symbol1.getMessages())
        messages.extend(symbol2.getMessages())
        newSymbol = Symbol(str(uuid.uuid4()), symbol1.getName(), self.project)
        newSymbol.setMinEqu(self.minEquivalence)
        for message in messages:
            newSymbol.addMessage(message)
        # Append the new symbol to the "symbols" structure
        self.symbols.append(newSymbol)
        return newSymbol.getID()

    def executeOrphanReduction(self):
        """Execute the orphan reduction process by merging symbols
        which are progressively reduced in size.
        @return the resulting list of symbols"""
        leftReductionFactor = 0
        rightReductionFactor = 0
        currentReductionIsLeft = False
        increment = 10
        while leftReductionFactor < 80 and rightReductionFactor < 80:
            # First we retrieve the current orphans
            orphans = []
            tmp_symbols = []
            # extract orphans
            for i, symbol in zip(range(len(self.symbols)), self.symbols):
                if len(symbol.getMessages()) == 1:
                    orphans.append(symbol)
            # create a tmp symbols array where symbols will be added once computed
            for symbol in self.symbols:
                if len(symbol.getMessages()) > 1:
                    tmp_symbols.append(symbol)
            if len(orphans) <= 1:
                self.log.info("Number of orphan symbols: {0}. The orphan merging op. is finished!".format(len(orphans)))
                break
            self.symbols = orphans
            if currentReductionIsLeft:
                leftReductionFactor = leftReductionFactor + increment
                # Reduce the size of the messages by 50% from the left
                for orphan in self.symbols:
                    orphan.getMessages()[0].setLeftReductionFactor(leftReductionFactor)
                    orphan.getMessages()[0].setRightReductionFactor(0)
                self.log.info("Start to merge orphans reduced by {0}% from the left".format(str(leftReductionFactor)))
                self.executeClustering()
                currentReductionIsLeft = False
            # NOTE(review): deliberately a plain 'if' (not 'else'): after a left
            # reduction the right reduction also runs in the same iteration —
            # confirm this alternation is intended before changing it.
            if not currentReductionIsLeft:
                rightReductionFactor = rightReductionFactor + increment
                # Reduce the size of the messages from the right
                for orphan in self.symbols:
                    orphan.getMessages()[0].setRightReductionFactor(rightReductionFactor)
                    orphan.getMessages()[0].setLeftReductionFactor(0)
                self.log.info("Start to merge orphans reduced by {0}% from the right".format(str(rightReductionFactor)))
                self.executeClustering()
                currentReductionIsLeft = True
            # Reset the reduction factors and keep the computed symbols.
            for orphan in self.symbols:
                for message in orphan.getMessages():
                    message.setLeftReductionFactor(0)
                    message.setRightReductionFactor(0)
                tmp_symbols.append(orphan)
            self.symbols = tmp_symbols
        # FIX: this status literal was previously split across a physical line.
        self.cb_executionStatus(3, 50.0, "Executing last alignment...")
        alignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        # Compute the regex/alignment of each symbol
        for symbol in self.symbols:
            alignment.alignField(symbol.getField())
        return self.symbols

    def getScores(self):
        """@return: the dictionnary of scores"""
        return self.scores

    def stop(self):
        """Stop the current execution of any clustering operation"""
        self.flagStop = True
        if self.currentAlignment is not None:
            self.currentAlignment.stop()

    def isFinish(self):
        """Compute if we should finish the current clustering operation"""
        return self.flagStop
def executeOrphanReduction(self):
    """Execute the orphan reduction process by merging symbols which are progressively reduced in size."""
    # Reduction factors (percent) applied to each side of the orphans' messages.
    leftReductionFactor = 0
    rightReductionFactor = 0
    currentReductionIsLeft = False
    increment = 10
    while leftReductionFactor < 80 and rightReductionFactor < 80:
        # First we retrieve the current orphans
        orphans = []
        tmp_symbols = []
        # extract orphans (symbols holding a single message)
        for i, symbol in zip(range(len(self.symbols)), self.symbols):
            if len(symbol.getMessages()) == 1:
                orphans.append(symbol)
        # create a tmp symbols array where symbols will be added once computed
        for symbol in self.symbols:
            if len(symbol.getMessages()) > 1:
                tmp_symbols.append(symbol)
        if len(orphans) <= 1:
            # Nothing left to merge: at most one orphan remains.
            self.log.info("Number of orphan symbols: {0}. The orphan merging op. is finished!".format(len(orphans)))
            break
        self.symbols = orphans
        if currentReductionIsLeft:
            leftReductionFactor = leftReductionFactor + increment
            # Reduce the size of the messages by 50% from the left
            for orphan in self.symbols:
                orphan.getMessages()[0].setLeftReductionFactor(leftReductionFactor)
                orphan.getMessages()[0].setRightReductionFactor(0)
            self.log.info("Start to merge orphans reduced by {0}% from the left".format(str(leftReductionFactor)))
            self.executeClustering()
            currentReductionIsLeft = False
        # NOTE(review): plain 'if', not 'else' — after a left reduction the
        # right reduction also runs in the same loop iteration; presumably the
        # intended alternation, confirm before restructuring.
        if not currentReductionIsLeft:
            rightReductionFactor = rightReductionFactor + increment
            # Reduce the size of the messages from the right
            for orphan in self.symbols:
                orphan.getMessages()[0].setRightReductionFactor(rightReductionFactor)
                orphan.getMessages()[0].setLeftReductionFactor(0)
            self.log.info("Start to merge orphans reduced by {0}% from the right".format(str(rightReductionFactor)))
            self.executeClustering()
            currentReductionIsLeft = True
        # Clear the reduction factors and carry every symbol into the next round.
        for orphan in self.symbols:
            for message in orphan.getMessages():
                message.setLeftReductionFactor(0)
                message.setRightReductionFactor(0)
            tmp_symbols.append(orphan)
        self.symbols = tmp_symbols
    self.cb_executionStatus(3, 50.0, "Executing last alignment...")
    alignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
    # Compute the regex/alignment of each symbol
    for symbol in self.symbols:
        alignment.alignField(symbol.getField())
    return self.symbols
class UPGMA(object):
    """This class provides the required methods to compute clustering
    between multiple symbols/messages using UPGMA algorithms (see
    U{http://en.wikipedia.org/wiki/UPGMA}). When processing, the matrix of
    scores is computed by the C extensions (L{_libScoreComputation}) and
    used to regroup messages and symbols into equivalent cluster."""

    def __init__(self, project, symbols, unitSize, cb_status=None, scores=None):
        """Constructor.
        @param project: the current project (provides the clustering configuration)
        @param symbols: the symbols whose messages will be clustered
        @param unitSize: the unit size used by the alignment engine
        @param cb_status: optional callback fed with (stage, donePercent, message)
        @param scores: optional pre-computed score matrix (uid -> uid -> score)"""
        self.project = project
        self.unitSize = unitSize
        self.cb_status = cb_status
        # FIX: the previous signature used 'scores={}', a mutable default
        # argument shared across every instance; None is used as sentinel.
        self.scores = scores if scores is not None else {}
        # Then we retrieve all the parameters of the CLUSTERING / ALIGNMENT
        self.defaultFormat = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_GLOBAL_FORMAT)
        self.nbIteration = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_NB_ITERATION)
        self.minEquivalence = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_EQUIVALENCE_THRESHOLD)
        self.doInternalSlick = self.project.getConfiguration().getVocabularyInferenceParameter(ProjectConfiguration.VOCABULARY_DO_INTERNAL_SLICK)
        self.log = logging.getLogger('netzob.Inference.Vocabulary.UPGMA.py')
        self.path = []
        self.flagStop = False
        self.currentAlignment = None
        # Create a symbol for each message: clustering starts from singletons.
        self.symbols = []
        i_symbol = 1
        for symbol in symbols:
            for m in symbol.getMessages():
                tmpSymbol = Symbol(str(uuid.uuid4()), "Symbol " + str(i_symbol), project)
                tmpSymbol.addMessage(m)
                self.symbols.append(tmpSymbol)
                i_symbol += 1
        self.log.debug("A number of {0} already aligned symbols will be clustered.".format(str(len(symbols))))

    def cb_executionStatus(self, stage, donePercent, currentMessage):
        """Callback function called by the C extension to provide info on status
        @param donePercent: a float between 0 and 100 included
        @param currentMessage: a str which represents the current alignment status"""
        if self.cb_status is None:
            self.log.info("[UPGMA status]" + str(donePercent) + "% " + currentMessage)
        else:
            self.cb_status(stage, donePercent, currentMessage)

    def executeClustering(self):
        """Execute the clustering operation
        @return the new list of symbols, or None if the operation was stopped"""
        self.log.debug("Re-Organize the symbols (nbIteration={0}, min_equivalence={1})".format(self.nbIteration, self.minEquivalence))
        # Process the UPGMA on symbols
        if self.isFinish():
            return None
        self.cb_executionStatus(0, 0, "Clustering into symbols...")
        self.processUPGMA()
        self.cb_executionStatus(1, 100, None)
        # Retrieve the alignment of each symbol and the build the associated regular expression
        self.cb_executionStatus(2, 0, "Compute the definition for each cluster...")
        if self.isFinish():
            return None
        self.currentAlignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        self.currentAlignment.absoluteStage = 2
        self.currentAlignment.statusRatio = len(self.symbols)
        self.currentAlignment.statusRatioOffset = 0
        for symbol in self.symbols:
            if self.isFinish():
                return None
            self.currentAlignment.alignField(symbol.getField())
            self.currentAlignment.statusRatioOffset = self.currentAlignment.statusRatioOffset + 1
        return self.symbols

    def processUPGMA(self):
        """Computes the matrix of equivalences (in C) and reduce it iteratively."""
        self.log.debug("Computing the associated matrix")
        # Execute the Clustering part in C
        debug = False
        wrapper = WrapperArgsFactory("_libScoreComputation.computeSimilarityMatrix")
        wrapper.typeList[wrapper.function](self.symbols)
        (listScores) = _libScoreComputation.computeSimilarityMatrix(self.doInternalSlick, self.cb_executionStatus, self.isFinish, debug, wrapper)
        # Retrieve the scores for each association of symbols
        self.scores = {}
        for (iuid, juid, score) in listScores:
            if self.isFinish():
                # Legacy early-exit value kept as-is (callers ignore the return).
                return (None, None, None)
            if iuid not in self.scores.keys():
                self.scores[iuid] = {}
            if juid not in self.scores.keys():
                self.scores[juid] = {}
            self.scores[iuid][juid] = score
            if iuid not in self.scores[juid].keys():
                self.scores[juid][iuid] = score
        # Reduce the UPGMA matrix (merge symbols by similarity)
        self.computePhylogenicTree()

    def computePhylogenicTree(self):
        """Compute the phylogenic tree
        @var max_i: uid of i_maximum
        @var max_j: uid of j_maximum
        @var maxScore: the highest global score"""
        maxScore = 0
        status = 0
        step = (float(100) - float(self.minEquivalence)) / float(100)
        if len(self.scores) > 1:
            # Locate the best-scoring pair (max_i, max_j) in the matrix.
            max_i = max(self.scores, key=lambda x: self.scores[x][max(self.scores[x], key=lambda y: self.scores[x][y])])
            max_j = max(self.scores[max_i], key=lambda y: self.scores[max_i][y])
            maxScore = self.scores[max_i][max_j]
        # Merge pairs until only one cluster remains or the best score drops
        # below the configured equivalence threshold.
        while len(self.scores) > 1 and maxScore >= self.minEquivalence:
            if self.isFinish():
                return
            symbols_uid = [s.getID() for s in self.symbols]  # List of the UID of symbols
            (i_maximum, j_maximum) = (symbols_uid.index(max_i), symbols_uid.index(max_j))
            size_i = len(self.symbols[i_maximum].getMessages())
            size_j = len(self.symbols[j_maximum].getMessages())
            infoMessage = "Clustering {0} with {1} (score = {2})".format(str(i_maximum), str(j_maximum), str(maxScore))
            status = (float(100) - float(maxScore)) / float(step)
            self.cb_executionStatus(1, status, infoMessage)
            newuid = self.mergeEffectiveRowCol(i_maximum, j_maximum)
            self.updateScore(max_i, max_j, newuid, size_i, size_j)
            # self.log.debug("Score après: {0}".format(str(self.scores)))
            if len(self.scores) > 1:
                max_i = max(self.scores, key=lambda x: self.scores[x][max(self.scores[x], key=lambda y: self.scores[x][y])])
                max_j = max(self.scores[max_i], key=lambda y: self.scores[max_i][y])
                maxScore = self.scores[max_i][max_j]

    def updateScore(self, iuid, juid, newuid, size_i, size_j):
        """Update the score of two merged clusters.
        @param iuid: id of the first cluster merged
        @param juid: id of the second cluster merged
        @param newuid: new id of the merged cluster
        @param size_i: size of the first cluster
        @param size_j: size of the second cluster"""
        total_size = size_i + size_j
        del self.scores[iuid]
        del self.scores[juid]
        self.scores[newuid] = {}
        for k in self.scores.keys():
            if k != newuid:
                # UPGMA weighted average of the two merged clusters' scores.
                self.scores[k][newuid] = (size_i * self.scores[k][iuid] + size_j * self.scores[k][juid]) * 1.0 / total_size
                del self.scores[k][iuid]
                del self.scores[k][juid]
                self.scores[newuid][k] = self.scores[k][newuid]

    def computePathTree(self):
        """TODO ?"""
        if self.path == []:
            # NOTE(review): clusterIndex is computed but unused (the first key
            # is taken instead); kept as-is to preserve the random module's
            # state sequence. Py2-only: dict.keys()[0] is not indexable in Py3.
            clusterIndex = int(random.random() * len(self.scores.keys()))
            self.path.append(self.scores.keys()[0])
        if len(self.path) > 1:
            # Check if Cl-1,Cl-2 minimum pair
            lastId = self.path[len(self.path) - 1]
            if max(self.scores[lastId], key=lambda x: self.scores[lastId][x]) == self.path[len(self.path) - 2]:
                return
        while True:
            lastId = self.path[len(self.path) - 1]
            juid = max(self.scores[lastId], key=lambda x: self.scores[lastId][x])
            self.path.append(juid)
            if max(self.scores[juid], key=lambda x: self.scores[juid][x]) == lastId:
                break

    def mergeEffectiveRowCol(self, i_maximum, j_maximum):
        """Merge the symbols i and j in the "symbols" structure
        @param i_maximum: id of the first symbol to merge
        @param j_maximum: id of the second symbol to merge
        @return the newly created symbol result of the merged process"""
        # Extract symbols i and j (pop the larger index first so the smaller
        # index stays valid)
        if i_maximum > j_maximum:
            symbol1 = self.symbols.pop(i_maximum)
            symbol2 = self.symbols.pop(j_maximum)
        else:
            symbol1 = self.symbols.pop(j_maximum)
            symbol2 = self.symbols.pop(i_maximum)
        # Merge the symbols i and j
        messages = []
        messages.extend(symbol1.getMessages())
        messages.extend(symbol2.getMessages())
        newSymbol = Symbol(str(uuid.uuid4()), symbol1.getName(), self.project)
        newSymbol.setMinEqu(self.minEquivalence)
        for message in messages:
            newSymbol.addMessage(message)
        # Append the new symbol to the "symbols" structure
        self.symbols.append(newSymbol)
        return newSymbol.getID()

    def executeOrphanReduction(self):
        """Execute the orphan reduction process by merging symbols
        which are progressively reduced in size.
        @return the resulting list of symbols"""
        leftReductionFactor = 0
        rightReductionFactor = 0
        currentReductionIsLeft = False
        increment = 10
        while leftReductionFactor < 80 and rightReductionFactor < 80:
            # First we retrieve the current orphans
            orphans = []
            tmp_symbols = []
            # extract orphans
            for i, symbol in zip(range(len(self.symbols)), self.symbols):
                if len(symbol.getMessages()) == 1:
                    orphans.append(symbol)
            # create a tmp symbols array where symbols will be added once computed
            for symbol in self.symbols:
                if len(symbol.getMessages()) > 1:
                    tmp_symbols.append(symbol)
            if len(orphans) <= 1:
                self.log.info("Number of orphan symbols: {0}. The orphan merging op. is finished!".format(len(orphans)))
                break
            self.symbols = orphans
            if currentReductionIsLeft:
                leftReductionFactor = leftReductionFactor + increment
                # Reduce the size of the messages by 50% from the left
                for orphan in self.symbols:
                    orphan.getMessages()[0].setLeftReductionFactor(leftReductionFactor)
                    orphan.getMessages()[0].setRightReductionFactor(0)
                self.log.info("Start to merge orphans reduced by {0}% from the left".format(str(leftReductionFactor)))
                self.executeClustering()
                currentReductionIsLeft = False
            # NOTE(review): deliberately a plain 'if' (not 'else'): after a left
            # reduction the right reduction also runs in the same iteration —
            # confirm this alternation is intended before changing it.
            if not currentReductionIsLeft:
                rightReductionFactor = rightReductionFactor + increment
                # Reduce the size of the messages from the right
                for orphan in self.symbols:
                    orphan.getMessages()[0].setRightReductionFactor(rightReductionFactor)
                    orphan.getMessages()[0].setLeftReductionFactor(0)
                self.log.info("Start to merge orphans reduced by {0}% from the right".format(str(rightReductionFactor)))
                self.executeClustering()
                currentReductionIsLeft = True
            # Reset the reduction factors and keep the computed symbols.
            for orphan in self.symbols:
                for message in orphan.getMessages():
                    message.setLeftReductionFactor(0)
                    message.setRightReductionFactor(0)
                tmp_symbols.append(orphan)
            self.symbols = tmp_symbols
        # FIX: this status literal was previously split across a physical line.
        self.cb_executionStatus(3, 50.0, "Executing last alignment...")
        alignment = NeedlemanAndWunsch(self.unitSize, self.project, False, self.cb_status)
        # Compute the regex/alignment of each symbol
        for symbol in self.symbols:
            alignment.alignField(symbol.getField())
        return self.symbols

    def getScores(self):
        """@return: the dictionnary of scores"""
        return self.scores

    def stop(self):
        """Stop the current execution of any clustering operation"""
        self.flagStop = True
        if self.currentAlignment is not None:
            self.currentAlignment.stop()

    def isFinish(self):
        """Compute if we should finish the current clustering operation"""
        return self.flagStop