class BrownClusterModel:
    """Brown clustering over a sequence of words.

    Words are stored as tuples (one tuple per row of the numpy training
    arrays).  Clustering proceeds by greedy pairwise merges that maximally
    reduce the loss of average mutual information between adjacent-cluster
    bigrams (Brown et al., 1992).

    External collaborators (defined elsewhere in this project):
      TaskStack               -- deferred-work stack used by __resetDataStatistics
      SymmetricTable_Sparse   -- sparse 2-key table with get/set/delete
      SymmetricTable_FixedSize-- dense 2-key table for distances
      log                     -- module-level logger
      lgrthm                  -- logarithm function taking (value, base)
    """

    # Sentinel cluster ids: sequence start marker and "word has no cluster".
    START_CLUSTER_SYMBOL = -1
    NO_CLUSTER_SYMBOL = -2

    def __init__(self):
        self.wordSequence = []       # list of word tuples, all training data concatenated
        self.sequenceLength = 0      # len(self.wordSequence), maintained incrementally
        self.TaskStack = TaskStack() # deferred statistics-rebuild tasks

    #################### Data Addition Methods
    # TODO: Make more efficient. (You don't really need to redo the
    # statistics every time you add data.)
    def addTrainingData(self, wordSequence):
        """Append a 2-D numpy array of words and schedule a statistics rebuild.

        Each row of `wordSequence` is one word; all rows must have the same
        width as previously added data.  Raises Exception on width mismatch.
        """
        (wordDataLength, wordDataWidth) = wordSequence.shape  # assumes numpy array
        # Data Validation / Word Data Width
        if not hasattr(self, "wordDataWidth"):
            self.wordDataWidth = wordDataWidth
        if wordDataWidth != self.wordDataWidth:
            raise Exception("New words not of same length as old words")
        log.debug("Adding new word sequence of length %d...", wordDataLength)
        # Add Data. Simple Concatenation
        self.wordSequence = self.wordSequence + self.__addTrainingData_tuplefyWordSequence(wordSequence)
        # Update Sequence Length
        self.sequenceLength += wordDataLength
        # Reset Variables and Flags
        self.__resetDataStatistics()
        log.debug("Added new word sequence. Total word sequence is now %d.", self.sequenceLength)

    def __addTrainingData_tuplefyWordSequence(self, wordSequence):
        """Convert each row of a 2-D numpy array into a hashable tuple."""
        wordDataLength = len(wordSequence)
        newWordSequence = [None] * wordDataLength  # pre-allocate; probably not worthwhile
        for i in range(wordDataLength):
            newWordSequence[i] = tuple(wordSequence[i, :])
        return newWordSequence

    def __resetDataStatistics(self):
        """Drop all derived statistics and queue the tasks that rebuild them.

        Tasks are pushed in reverse execution order (it is a stack): the word
        cluster map is built first, the merge-cost shortcuts last.
        """
        # (Superfluous) Delete Existing Data
        self.sortedWords = []
        self.wordClusterMapping = {}
        self.clusterSequence = []
        self.clusters = []
        self.clusterUnigramCounts = {}
        self.clusterBigramCounts = None
        self.clusterCostTable = None
        self.mergeCostReductions = None
        self.mergeHistory = []
        self.binaryRepresentation = {}
        self.interClusterDistance = {}
        # Add New Tasks
        self.TaskStack.push(
            lambda: self.__resetData_definemergeCostReductions(),
            "definemergeCostReductions",
            "Finding Merge Cost Reduction Shortcuts",
        )
        self.TaskStack.push(
            lambda: self.__resetData_defineClusterCostTable(),
            "defineClusterCostTable",
            "Creating Cluster Cost Table (Finding Bigram Graph Edge Costs)",
        )
        self.TaskStack.push(
            lambda: self.__resetData_defineClusterCounts(),
            "defineClusterCounts",
            "Counting Cluster NGrams from Cluster Sequence",
        )
        self.TaskStack.push(
            lambda: self.__resetData_defineClusterSequence(),
            "defineClusterSequence",
            "Creating Cluster Sequence from Word-to-Cluster Mapping and Word Sequence",
        )
        self.TaskStack.push(
            lambda: self.__resetData_defineWordClusterMap(),
            "defineWordClusterMap",
            "Defining Word-to-Cluster Mapping",
        )
        # Clear The Tasks
        # NOTE(review): clear() immediately after pushing looks suspicious --
        # presumably TaskStack.clear() *runs* the queued tasks rather than
        # discarding them; confirm against the TaskStack implementation.
        self.TaskStack.clear()

    def __resetData_defineWordClusterMap(self, numInitialClusters=None):
        """Assign one initial cluster id per distinct word, most frequent first."""
        self.sortedWords = [x[0] for x in Counter(self.wordSequence).most_common()]
        if numInitialClusters is None:
            self.wordClusterMapping = dict(zip(self.sortedWords, range(0, len(self.sortedWords))))
        else:
            # FIX: the original had an unreachable mapping assignment after
            # this raise; the raise is the live behavior, dead code removed.
            raise NotImplementedError("Have not implemented non-used clusters yet.")
        log.debug("Defined Word Cluster Map for %d words.", self.sequenceLength)
        log.debug("%d words found.", len(self.sortedWords))
        log.debug("%d clusters created.", len(self.wordClusterMapping))

    def clusterCount_getBigramCount(self, cluster1, cluster2):
        """Return the (cluster1 -> cluster2) bigram count, 0.0 if unseen."""
        return self.clusterBigramCounts.get(cluster1, cluster2, 0.0)

    def clusterCount_getClusterCount(self, cluster1):
        """Return the unigram count of cluster1; KeyError if not found."""
        return self.clusterUnigramCounts[cluster1]

    def __resetData_defineClusterCounts(self):
        """Tally cluster unigram and (ordered) bigram counts from the sequence."""
        # clusterSequence = [self.START_CLUSTER_SYMBOL] + clusterSequence
        sequenceLength = len(self.clusterSequence)
        self.clusterUnigramCounts = {}
        self.clusterBigramCounts = SymmetricTable_Sparse(orderMatters=True)
        current = self.clusterSequence[0]
        self.clusterUnigramCounts[current] = 1.0
        for i in range(1, sequenceLength):
            previous = current
            current = self.clusterSequence[i]
            self.clusterUnigramCounts[current] = self.clusterUnigramCounts.get(current, 0.0) + 1.0
            self.clusterBigramCounts.set(
                current, previous, self.clusterBigramCounts.get(current, previous, 0.0) + 1.0
            )
        # CONSIDER: Does the concept of discounted probabilities mean anything here?
        # list() so the clusters are indexable (Python 3 keys() is a view).
        self.clusters = list(self.clusterUnigramCounts.keys())
        self.originalClusters = copy(self.clusters)
        log.debug("Found cluster sequence for %d words.", self.sequenceLength)
        log.debug("%d clusters were found.", len(self.clusterUnigramCounts))
        log.debug("%d bigrams were found.", len(self.clusterBigramCounts))
        if self.NO_CLUSTER_SYMBOL in self.clusterUnigramCounts:
            log.debug("Unclustered words remain in the dataset.")
        else:
            log.debug("No unclustered words remain in the dataset.")

    def __resetData_defineClusterSequence(self):
        """Map the stored word sequence to its cluster-id sequence."""
        self.clusterSequence = self.__resetData_defineClusterSequence_generic(self.wordSequence)

    def __resetData_defineClusterSequence_generic(self, wordSequence):
        """Map any word sequence to cluster ids; unknown words get NO_CLUSTER_SYMBOL."""
        sequenceLength = len(wordSequence)
        clusterSequence = [None] * sequenceLength
        for i in range(sequenceLength):
            clusterSequence[i] = self.wordClusterMapping.get(wordSequence[i], self.NO_CLUSTER_SYMBOL)
        return clusterSequence

    def __resetData_defineClusterCostTable(self):
        """Compute the pairwise mutual-information cost for every cluster pair."""
        numClusters = len(self.clusters)
        self.clusterCostTable = SymmetricTable_Sparse(orderMatters=False)
        for i in range(numClusters):
            c1 = self.clusters[i]
            cnt_c1 = self.clusterCount_getClusterCount(c1)  # hoisted: invariant over j
            for j in range(i, numClusters):
                c2 = self.clusters[j]
                cost = self.clusterCost_findSingleClusterPairCost(c1, c2, cnt_c1=cnt_c1)
                self.clusterCostTable.set(c1, c2, cost)

    def clusterCost_findSingleClusterPairCost(self, c1, c2, cnt_c1=None, cnt_c2=None):
        """Mutual information contributed by the (c1, c2) edge, both directions.

        For c1 == c2 only the self-bigram counts once; otherwise both
        orderings (c1->c2 and c2->c1) are summed.  Unigram counts may be
        passed in to avoid repeated lookups.
        """
        if cnt_c1 is None:
            cnt_c1 = self.clusterCount_getClusterCount(c1)
        if c1 == c2:
            bigramCount = self.clusterCount_getBigramCount(c1, c1)
            return self.mutualInfo_Equation(bigramCount, cnt_c1, cnt_c1)
        else:
            if cnt_c2 is None:
                cnt_c2 = self.clusterCount_getClusterCount(c2)
            bigramCount = self.clusterCount_getBigramCount(c1, c2)
            cost = self.mutualInfo_Equation(bigramCount, cnt_c1, cnt_c2)
            bigramCount = self.clusterCount_getBigramCount(c2, c1)
            cost += self.mutualInfo_Equation(bigramCount, cnt_c1, cnt_c2)
            return cost

    def mutualInfo_Equation(self, bigramCount, unigram1Count, unigram2Count):
        """Pointwise mutual-information term: p(1,2) * log2(p(1,2)/(p(1)p(2))).

        Returns 0.0 for unseen bigrams; raises if a unigram count is zero
        while its bigram count is positive (inconsistent statistics).
        """
        if bigramCount > 0:
            # I've been uncomfortable with logs being above/below 0, but what
            # I'm coming to realize is that the zero intercept of mutual
            # information is where information goes from useful to not..?
            # Sample probability? or Population probability? Population.
            # n = sequenceLength - 1  # Sample probability
            n = self.sequenceLength  # Population probability
            return bigramCount / n * lgrthm(n * bigramCount / unigram1Count / unigram2Count, 2)
        else:
            if unigram1Count == 0 or unigram2Count == 0:
                raise Exception("Erroneous clusters")
            return 0.0

    # This is the mutual information from treating two separate clusters as one.
    # If you want to treat 1 and 2 as A, vis-a-vis 3, then:
    # you need to account for 1->3, 2->3, 3->1, and 3->2 connections, and
    # treat them all as A->3 and 3->A;
    # you also need to account for P(A) = P(1)+P(2)
    def mutualInfo_PairVersusAnother(self, mc1, mc2, c3, cnt_mc1=None, cnt_mc2=None, cnt_c3=None):
        """MI between the merged cluster (mc1+mc2) and a third cluster c3."""
        if cnt_mc1 is None:
            # FIX: the original dropped this assignment, leaving cnt_mc1 None.
            cnt_mc1 = self.clusterCount_getClusterCount(mc1)
        if cnt_mc2 is None:
            cnt_mc2 = self.clusterCount_getClusterCount(mc2)
        if cnt_c3 is None:
            cnt_c3 = self.clusterCount_getClusterCount(c3)
        mutualInformation = 0.0
        bigram1Count = self.clusterCount_getBigramCount(c3, mc1)  # 3->1
        bigram2Count = self.clusterCount_getBigramCount(c3, mc2)  # 3->2
        mutualInformation += self.mutualInfo_Equation(
            (bigram1Count + bigram2Count), cnt_c3, (cnt_mc1 + cnt_mc2)
        )  # 3->A
        bigram1Count = self.clusterCount_getBigramCount(mc1, c3)  # 1->3
        bigram2Count = self.clusterCount_getBigramCount(mc2, c3)  # 2->3
        mutualInformation += self.mutualInfo_Equation(
            (bigram1Count + bigram2Count), cnt_c3, (cnt_mc1 + cnt_mc2)
        )  # A->3
        return mutualInformation

    # This is the mutual information from treating two separate clusters as one.
    # If you want to treat 1 and 2 as A, then:
    # you need to account for 1->2, 2->1, 1->1, and 2->2 connections, and
    # treat them all as A->A;
    # you need to account for P(A) = P(1)+P(2)
    def mutualInfo_PairIntoOne(self, c1, c2, cnt_c1=None, cnt_c2=None):
        """MI of the merged cluster (c1+c2) with itself (its self-loop term)."""
        if cnt_c1 is None:
            cnt_c1 = self.clusterCount_getClusterCount(c1)
        if cnt_c2 is None:
            cnt_c2 = self.clusterCount_getClusterCount(c2)
        bigram1Count = self.clusterCount_getBigramCount(c1, c2)  # 1->2
        bigram2Count = self.clusterCount_getBigramCount(c2, c1)  # 2->1
        bigram3Count = self.clusterCount_getBigramCount(c1, c1)  # 1<->1
        bigram4Count = self.clusterCount_getBigramCount(c2, c2)  # 2<->2
        totalBigramCount = bigram1Count + bigram2Count + bigram3Count + bigram4Count
        return self.mutualInfo_Equation(totalBigramCount, (cnt_c1 + cnt_c2), (cnt_c1 + cnt_c2))

    # This is the mutual information from treating 4 separate clusters as two.
    # If you want to treat 1 and 2 as A, and 3 and 4 as B, then:
    # you need to account for 1->3, 1->4, 2->3, and 2->4 connections, treated as A->B;
    # you need to account for 3->1, 4->1, 3->2, and 4->2 connections, treated as B->A;
    # you need to account for P(A) = P(1)+P(2)
    # you need to account for P(B) = P(3)+P(4)
    def mutualInfo_PairVersusPair(self, c1, c2, c3, c4, cnt_c1=None, cnt_c2=None, cnt_c3=None, cnt_c4=None):
        """MI between merged cluster (c1+c2) and merged cluster (c3+c4)."""
        if cnt_c1 is None:
            cnt_c1 = self.clusterCount_getClusterCount(c1)
        if cnt_c2 is None:
            cnt_c2 = self.clusterCount_getClusterCount(c2)
        if cnt_c3 is None:
            cnt_c3 = self.clusterCount_getClusterCount(c3)
        if cnt_c4 is None:
            cnt_c4 = self.clusterCount_getClusterCount(c4)
        mutualInformation = 0.0
        # Group 1 to Group 2
        bigram1Count = self.clusterCount_getBigramCount(c1, c3)
        bigram2Count = self.clusterCount_getBigramCount(c1, c4)
        bigram3Count = self.clusterCount_getBigramCount(c2, c3)
        bigram4Count = self.clusterCount_getBigramCount(c2, c4)
        totalBigramCount = bigram1Count + bigram2Count + bigram3Count + bigram4Count
        mutualInformation += self.mutualInfo_Equation(totalBigramCount, (cnt_c1 + cnt_c2), (cnt_c3 + cnt_c4))
        # Group 2 to Group 1
        bigram1Count = self.clusterCount_getBigramCount(c3, c1)
        bigram2Count = self.clusterCount_getBigramCount(c3, c2)
        bigram3Count = self.clusterCount_getBigramCount(c4, c1)
        bigram4Count = self.clusterCount_getBigramCount(c4, c2)
        totalBigramCount = bigram1Count + bigram2Count + bigram3Count + bigram4Count
        mutualInformation += self.mutualInfo_Equation(totalBigramCount, (cnt_c1 + cnt_c2), (cnt_c3 + cnt_c4))
        return mutualInformation

    # This function gives the merge reduction cost for a single cluster pair
    # using the naive algorithm.  This algorithm has been verified.
    def mergeCost_SinglePair(self, c1, c2):
        """Change in total MI from merging c1 and c2 (negative = MI lost)."""
        cnt_c1 = self.clusterCount_getClusterCount(c1)
        cnt_c2 = self.clusterCount_getClusterCount(c2)
        clusterCostReduction = 0.0
        clusterCostAddition = 0.0
        for c3 in self.clusters:
            if c3 == c1 or c3 == c2:
                continue  # deal with these separately.
            clusterCostReduction += self.clusterCostTable.get(c1, c3)  # 1<->3 (1->3 and 3->1)
            clusterCostReduction += self.clusterCostTable.get(c2, c3)  # 2<->3 (2->3 and 3->2)
            # This is the procedure you get if you try to combine two nodes
            # into one: P(c,c')*log(P(c,c')/P(c)/P(c'))
            cnt_c3 = self.clusterCount_getClusterCount(c3)
            clusterCostAddition += self.mutualInfo_PairVersusAnother(c1, c2, c3, cnt_c1, cnt_c2, cnt_c3)
        # Deal with connections among the pair
        clusterCostReduction += self.clusterCostTable.get(c1, c2)  # 1<->2 (1->2 and 2->1)
        clusterCostReduction += self.clusterCostTable.get(c1, c1)  # 1<->1
        clusterCostReduction += self.clusterCostTable.get(c2, c2)  # 2<->2
        clusterCostAddition += self.mutualInfo_PairIntoOne(c1, c2, cnt_c1, cnt_c2)
        return clusterCostAddition - clusterCostReduction

    def __resetData_definemergeCostReductions(self):
        """Precompute the merge cost for every unordered cluster pair."""
        self.mergeCostReductions = SymmetricTable_Sparse(orderMatters=False)
        numClusters = len(self.clusters)
        for i in range(numClusters):
            c1 = self.clusters[i]
            for j in range(i):
                c2 = self.clusters[j]
                mergeCost = self.mergeCost_SinglePair(c1, c2)
                self.mergeCostReductions.set(c1, c2, mergeCost)

    def mergeClusters_findMergeClusters(self):
        """Return (True, (c1, c2)) for the best merge pair, or (False, None)."""
        if len(self.mergeCostReductions) == 0:  # Necessary?
            return (False, None)  # Necessary?
        # max() is O(n); same tie-break as sorted(...)[0] (first encountered).
        return (True, max(self.mergeCostReductions.items(), key=itemgetter(1))[0])

    # Change NGram Counts from merging 1 into 2
    # We're deleting 2
    def mergeClusters_changeNGramCounts(self, mc1, mc2):
        """Fold cluster mc2's unigram/bigram counts into mc1 and delete mc2."""
        # Change Unigram Counts / Delete Cluster mc2
        self.clusterUnigramCounts[mc1] += self.clusterCount_getClusterCount(mc2)
        del self.clusterUnigramCounts[mc2]
        # Change Unigram Counts / Reset Saved Cluster Keys
        self.clusters = list(self.clusterUnigramCounts.keys())
        # Change Bigram Counts / Change Non-Merging Clusters
        for c3 in self.clusters:
            if c3 == mc1 or c3 == mc2:
                continue  # deal with these separately.
            else:
                bigramCount = self.clusterBigramCounts.delete(c3, mc1, 0)   # 3->1
                bigramCount += self.clusterBigramCounts.delete(c3, mc2, 0)  # 3->2
                self.clusterBigramCounts.set(c3, mc1, bigramCount)          # (3->2 + 3->1) => 3->1
                bigramCount = self.clusterBigramCounts.get(mc1, c3, 0)      # 1->3
                bigramCount += self.clusterBigramCounts.delete(mc2, c3, 0)  # 2->3
                self.clusterBigramCounts.set(mc1, c3, bigramCount)          # (1->3 + 2->3) => 1->3
        # Change Bigram Counts / Change Merging Clusters
        bigramCount = self.clusterBigramCounts.delete(mc1, mc1, 0)   # 1->1
        bigramCount += self.clusterBigramCounts.delete(mc2, mc1, 0)  # 2->1
        bigramCount += self.clusterBigramCounts.delete(mc1, mc2, 0)  # 1->2
        bigramCount += self.clusterBigramCounts.delete(mc2, mc2, 0)  # 2->2
        self.clusterBigramCounts.set(mc1, mc1, bigramCount)          # all => 1->1

    # Change Cluster Costs from merging 1 into 2
    # We're deleting 2
    # Depends on NEW Cluster Counts (i.e. NGram Counts)
    def mergeClusters_changeClusterCosts(self, mc1, mc2):
        """Recompute cost-table edges touching mc1 and drop mc2's edges."""
        new_cnt_mc1 = self.clusterCount_getClusterCount(mc1)  # Depends on NEW Cluster Counts
        # Change ClusterCost Table / Change Non-Merging Clusters
        for c3 in self.clusters:
            if c3 == mc1 or c3 == mc2:
                continue  # deal with these separately.
            else:
                cost = self.clusterCost_findSingleClusterPairCost(
                    mc1, c3, cnt_c1=new_cnt_mc1
                )  # Depends on NEW Cluster Counts
                self.clusterCostTable.set(mc1, c3, cost)
                self.clusterCostTable.delete(mc2, c3, silent=True)
        # Change ClusterCost Table / Change Merging Clusters
        cost = self.clusterCost_findSingleClusterPairCost(mc1, mc1, cnt_c1=new_cnt_mc1)
        self.clusterCostTable.set(mc1, mc1, cost)
        self.clusterCostTable.delete(mc2, mc2, silent=True)

    def mergeClusters_mergeTop(self, updateClusterSequence=False, verbose=False):
        """Perform one greedy merge step; return False when nothing is left to merge."""
        # 1) Find Merge Clusters
        (success, mergeClusters) = self.mergeClusters_findMergeClusters()
        if not success:
            return False
        (c1, c2) = mergeClusters
        if verbose:  # TODO: Remove
            cost = self.mergeCostReductions.get(c1, c2)
            print("Merging Cluster {1} into Cluster {0}, with clusterCostReduction={2}".format(c1, c2, cost))
        # 2) Change Merge Cost Reduction using OLD Ngram Counts
        self.mergeClusters_changeMergeCostReductions_UnmergedClusters(c1, c2)
        self.mergeClusters_changeMergeCostReductions_RemoveDeletedCluster(c2)
        # 3) Change Unigram and Bigram Counts
        self.mergeClusters_changeNGramCounts(c1, c2)
        # 4) Change Cluster Cost Table
        self.mergeClusters_changeClusterCosts(c1, c2)
        # 5) Change Merge Cost Reduction using NEW Ngram Counts
        self.mergeClusters_changeMergeCostReductions_MergedClusters(c1)
        # 6) Record Change in MergeHistory
        self.mergeHistory.append((c1, c2))
        # TODO: Skip mergeHistory; go straight to dictionaryrepresentation.
        # 7) Update Cluster Sequence (Optional)
        if updateClusterSequence:
            if not hasattr(self, "originalClusterSequence"):     # sloppy
                self.originalClusterSequence = self.clusterSequence  # sloppy
            self.mergeClusters_updateClusterSequence(c1, c2)
        return True

    # This updates the mergeCostReductions after mergeCluster1 and
    # mergeCluster2 are merged.
    # This relies on NEW (i.e. post-merge) NGram Counts
    # That means mc2 shouldn't be in any tables anymore
    def mergeClusters_changeMergeCostReductions_MergedClusters(self, mc1):
        """Recompute merge costs of every surviving cluster against mc1."""
        for c3 in self.clusters:
            if c3 == mc1:
                continue
            else:
                newMergeCostReduction = self.mergeCost_SinglePair(mc1, c3)
                self.mergeCostReductions.set(mc1, c3, newMergeCostReduction)

    def mergeClusters_changeMergeCostReductions_RemoveDeletedCluster(self, mc2):
        """Drop every merge-cost entry that involves the deleted cluster mc2."""
        for c3 in self.clusters:
            if c3 == mc2:
                continue
            else:
                self.mergeCostReductions.delete(mc2, c3, silent=False)  # Test. I think I want silent

    # This updates the mergeCostReductions after mergeCluster1 and
    # mergeCluster2 are merged.
    # This relies on OLD (i.e. pre-merge) NGram Counts
    def mergeClusters_changeMergeCostReductions_UnmergedClusters(self, mc1, mc2):
        """Adjust merge costs for all pairs (c3, c4) not involving mc1/mc2.

        Merging (mc1, mc2) changes what a hypothetical (c3, c4) merge would
        cost, because the merged pair's edges to c3 and c4 are now pooled.
        """
        cnt_mc1 = self.clusterCount_getClusterCount(mc1)
        cnt_mc2 = self.clusterCount_getClusterCount(mc2)
        numClusters = len(self.clusters)
        for i in range(numClusters):
            c3 = self.clusters[i]
            if c3 == mc1 or c3 == mc2:
                continue
            cnt_c3 = self.clusterCount_getClusterCount(c3)
            for j in range(i):
                c4 = self.clusters[j]
                if c4 == mc1 or c4 == mc2:
                    continue
                cnt_c4 = self.clusterCount_getClusterCount(c4)
                mergeCostAddition = 0
                mergeCostAddition += self.clusterCostTable.get(c3, mc1)  # c3<->mc1
                mergeCostAddition += self.clusterCostTable.get(c3, mc2)  # c3<->mc2
                mergeCostAddition += self.clusterCostTable.get(c4, mc1)  # c4<->mc1
                mergeCostAddition += self.clusterCostTable.get(c4, mc2)  # c4<->mc2
                mergeCostAddition += self.mutualInfo_PairVersusPair(
                    c3, c4, mc1, mc2, cnt_c3, cnt_c4, cnt_mc1, cnt_mc2
                )  # (c3+c4)<->(mc1+mc2)
                mergeCostReduction = 0
                mergeCostReduction += self.mutualInfo_PairVersusAnother(
                    mc1, mc2, c3, cnt_mc1, cnt_mc2, cnt_c3
                )  # c3<->(mc1+mc2)
                mergeCostReduction += self.mutualInfo_PairVersusAnother(
                    mc1, mc2, c4, cnt_mc1, cnt_mc2, cnt_c4
                )  # c4<->(mc1+mc2)
                mergeCostReduction += self.mutualInfo_PairVersusAnother(
                    c3, c4, mc1, cnt_c3, cnt_c4, cnt_mc1
                )  # mc1<->(c3+c4)
                mergeCostReduction += self.mutualInfo_PairVersusAnother(
                    c3, c4, mc2, cnt_c3, cnt_c4, cnt_mc2
                )  # mc2<->(c3+c4)
                mergeCostChange = mergeCostAddition - mergeCostReduction
                self.mergeCostReductions.set(
                    c3, c4, self.mergeCostReductions.get(c3, c4) + mergeCostChange
                )  # Do we want the 'get' to return 0?

    def performClustering_convertMergeHistoryToBinaryWords(self):
        """Replay the merge history backwards to assign each original cluster
        a binary path string (its position in the merge tree)."""
        self.binaryRepresentation = dict.fromkeys(self.originalClusters, "")
        for (mc1, mc2) in reversed(self.mergeHistory):
            self.binaryRepresentation[mc2] = self.binaryRepresentation[mc1] + "1"
            self.binaryRepresentation[mc1] = self.binaryRepresentation[mc1] + "0"
        # Deepest code length bounds inter-cluster distances.
        maxDepth = 0
        for binRep in self.binaryRepresentation.values():
            maxDepth = max(maxDepth, len(binRep))
        self.binaryRepresentationMaxLen = maxDepth

    def performClustering_convertMergeHistoryToDictionary(self):
        """Build a nested-dict tree of the merge history, leaves = (id, word)."""
        clusterWordMapping = dict(zip(self.wordClusterMapping.values(), self.wordClusterMapping.keys()))
        d = {}
        for (mc1, mc2) in self.mergeHistory:
            leftNode = d.get(mc1, (mc1, clusterWordMapping[mc1]))
            rightNode = d.get(mc2, (mc2, clusterWordMapping[mc2]))
            d[mc1] = {-2: leftNode, -1: rightNode}
        # NOTE(review): mc1 here is the last merge's survivor; this raises
        # NameError if mergeHistory is empty -- presumably never called then.
        self.dictionaryRepresentation = {"root": d[mc1]}

    def performClustering_findInterClusterDistances(self):
        """Distance between clusters = max code length minus shared-prefix length."""
        if len(self.binaryRepresentation) == 0:  # sloppy.
            self.performClustering_convertMergeHistoryToBinaryWords()  # sloppy. Use task stack instead.
        numClusters = len(self.originalClusters)
        self.interClusterDistance = SymmetricTable_FixedSize(numClusters, includeDiagonal=False)
        for i in range(numClusters):
            binRep_i = self.binaryRepresentation[i]
            for j in range(i + 1, numClusters):
                matchLength = self.performClustering_findInterClusterDistances_stringMatchLength(
                    binRep_i, self.binaryRepresentation[j]
                )
                self.interClusterDistance.set(i, j, self.binaryRepresentationMaxLen - matchLength)

    # Consider: could be done more efficiently? (i.e. don't use strings)
    def performClustering_findInterClusterDistances_stringMatchLength(self, string1, string2):
        """Length of the common prefix of two strings."""
        length = 0
        for i in range(min(len(string1), len(string2))):
            if string1[i] != string2[i]:
                break
            length += 1
        return length

    # TODO: Can this be done more efficiently?
    def performClustering_establishDistanceMeasure(self):
        """Pairwise distance between sequence positions via cluster distances."""
        if hasattr(self, "originalClusterSequence"):  # sloppy
            seq = self.originalClusterSequence        # sloppy
        else:                                         # sloppy
            seq = self.clusterSequence                # sloppy
        sequenceLength = len(seq)
        self.distanceTable = SymmetricTable_FixedSize(sequenceLength, includeDiagonal=False)
        for i in range(sequenceLength):
            c1 = seq[i]
            for j in range(i + 1, sequenceLength):
                c2 = seq[j]
                if c1 == c2:
                    self.distanceTable.set(i, j, 0)
                else:
                    self.distanceTable.set(i, j, self.interClusterDistance.get(c1, c2))

    def performBrownianClustering(self):
        """Run merges to completion, then derive codes and distance tables."""
        leftToMerge = True
        while leftToMerge:
            leftToMerge = self.mergeClusters_mergeTop()
        self.performClustering_convertMergeHistoryToBinaryWords()
        self.performClustering_findInterClusterDistances()
        self.performClustering_establishDistanceMeasure()

    def mergeClusters_updateClusterSequence(self, mc1, mc2):
        """Rewrite every occurrence of mc2 in the cluster sequence as mc1."""
        for i in range(self.sequenceLength):
            if self.clusterSequence[i] == mc2:
                self.clusterSequence[i] = mc1

    def findTotalClusteringCost(self):
        """Total average mutual information of the current clustering."""
        qualityCost = 0
        for c1 in self.clusters:
            cnt_c1 = self.clusterCount_getClusterCount(c1)
            for c2 in self.clusters:
                cnt_c2 = self.clusterCount_getClusterCount(c2)
                bigramCount = self.clusterCount_getBigramCount(c1, c2)
                qualityCost += self.mutualInfo_Equation(bigramCount, cnt_c1, cnt_c2)
        return qualityCost