import time

import numpy as np

import calculateDistance
import rhc

# module-level timing accumulators, updated via `global` in the code below
matrixCompTotal = 0.0
splitTotal = 0.0
modularityTotal = 0.0
diaTotal = 0.0
excluTotal = 0.0


def run(ngramPath, sid_seq, outPath):
    """A simpler version of the main function: a wrapper around runDiana
    intended to be used by rhcBootstrap (the testing script).

    :type ngramPath: str - path to the computed pattern dataset
    :type sid_seq: Dict{int: Dict{str: int}} - for each user, the patterns
        and their occurrence counts
    :type outPath: str - path for the output and temporary files
    """
    global matrixCompTotal
    startTime = time.time()
    idxToSid = [x + 1 for x in range(len(sid_seq))]
    idfMap = rhc.excludeFeatures(rhc.getIdf(sid_seq, idxToSid), [])
    matrix = calculateDistance.partialMatrix(
        idxToSid, idfMap, ngramPath,
        'tmp_%sroot' % int(time.time()), outPath, True)
    print('[LOG]: first matrixTime %f' % (time.time() - startTime))
    matrixCompTotal += time.time() - startTime
    hc = HCClustering(
        matrix, sid_seq, outPath, [], idxToSid,
        sizeThreshold=0.05 * len(sid_seq),
        idfMap=idfMap, ngramPath=ngramPath)
    result = hc.runDiana()
    print('[STAT]: total clustering time %f' % (time.time() - startTime))
    return result
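
# A minimal usage sketch, not part of the original pipeline: the dataset
# path and the sid_seq contents below are hypothetical placeholders.
def _exampleRun():
    # each user id maps to its pattern occurrence counts, as documented
    # in run() above
    sid_seq = {
        1: {'open|read': 3, 'read|close': 1},
        2: {'open|write': 2, 'read|close': 4},
        3: {'open|read': 1, 'open|write': 5},
    }
    # run() returns a nested tuple tree: ('t', children, info) for
    # internal nodes and ('l', sids, info) for leaves
    return run('data/patterns.ngram', sid_seq, 'out/run1_')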
class HCClustering(object):

    def __init__(self, matrix, sid_seq, outPath, exclusions, idxToSid,
                 sizeThreshold, idfMap=None, ngramPath=None):
        self.matrix = matrix
        self.sizeThreshold = sizeThreshold
        self.maxDistance = 100
        self.sid_seq = sid_seq
        self.outPath = outPath
        self.exclusions = exclusions
        # path to the pattern dataset; runDiana needs it when sub-clusters
        # recompute their distance matrices
        self.ngramPath = ngramPath
        if not idxToSid:
            idxToSid = [x + 1 for x in range(len(sid_seq))]
        self.idxToSid = idxToSid
        if not idfMap:
            idfMap = rhc.excludeFeatures(
                rhc.getIdf(sid_seq, idxToSid), exclusions)
        self.idfMap = idfMap
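
    # Bookkeeping note for runDiana() below: each working cluster is a
    # tuple (memberIndices, diameter, cid), and self.sumEntriesMap maps a
    # cid to the distance-matrix row sums of that cluster's members,
    # populated at the root and refreshed after every split.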
    def runDiana(self):
        """Perform recursive divisive hierarchical clustering (DIANA)."""
        global matrixCompTotal, splitTotal, modularityTotal, diaTotal
        global excluTotal
        self.modularityBasics()
        print('[LOG]: finished calculating modularityBasics')
        # split hierarchy: (parent cid, child A cid, child B cid)
        clusterHi = []
        # evaluation metrics, keyed by the number of clusters
        evalResults = {}
        cid = 1
        clusters = [(list(range(len(self.matrix))), self.maxDistance, cid)]
        # mapping from cid => row sums for all ids in the cluster
        self.sumEntriesMap = {}
        self.sumEntriesMap[cid] = np.sum(self.matrix, axis=1,
                                         dtype=np.float64)
        while clusters[-1][1] and len(clusters[-1][0]) > self.sizeThreshold:
            parentCid = clusters[-1][2]
            clusterHi.append((parentCid, cid + 1, cid + 2))
            curTime = time.time()
            (clusterA, clusterB, sumEntryA, sumEntryB, sumAB) = \
                self.splitCluster(clusters.pop())
            splitTotal += time.time() - curTime
            curTime = time.time()
            cid += 1
            self.sumEntriesMap[cid] = sumEntryA
            clusters.append((clusterA, self.getDia(cid, clusterA), cid))
            cid += 1
            self.sumEntriesMap[cid] = sumEntryB
            clusters.append((clusterB, self.getDia(cid, clusterB), cid))
            diaTotal += time.time() - curTime
            curTime = time.time()
            # keep the next cluster to split (largest diameter among those
            # above the size threshold) at the end of the list
            clusters = sorted(
                clusters,
                key=lambda x: (x[1], len(x[0]))
                if len(x[0]) > self.sizeThreshold else (0, 0))
            if len(clusters) == 2:
                # the first split: compute modularity from scratch
                evalResult = self.evaluateModularity(
                    (clusterA, clusterB), (sumEntryA, sumEntryB))
            else:
                # later splits: update the previous score incrementally
                evalResult = evalResults[len(clusters) - 1] + \
                    self.evaluateModularityShift((clusterA, clusterB), sumAB)
            modularityTotal += time.time() - curTime
            evalResults[len(clusters)] = evalResult
        # pick the cluster count with the best modularity, snapped to the
        # nearest count that was actually produced
        sweetSpot = rhc.getSweetSpot(evalResults, 5)
        sweetSpot = sorted(evalResults.keys(),
                           key=lambda x: abs(x - sweetSpot))[0]
        print('[LOG]: sweetSpot is %d, modularity %f'
              % (sweetSpot, evalResults[sweetSpot]))
        # merge the clusters back up to the sweet spot
        clusterMap = dict([(row[2], row) for row in clusters])
        cids = [row[2] for row in clusters]
        while len(cids) > sweetSpot:
            (parentCid, childACid, childBCid) = clusterHi.pop()
            # the diameter no longer matters, so store zero
            clusterMap[parentCid] = \
                (clusterMap[childACid][0] + clusterMap[childBCid][0],
                 0, parentCid)
            cids.append(parentCid)
            cids.remove(childACid)
            cids.remove(childBCid)
        # reconstruct the cluster list after merging
        clusters = [(x[0], x[1], None, x[2])
                    for cid, x in clusterMap.items() if cid in cids]
        # get the exclusion map according to the current clustering
        startTime = time.time()
        excludeMap, exclusionScoreMap, scoreMap = \
            rhc.getExclusionMap(clusters, self.sid_seq, self.idfMap,
                                self.idxToSid,
                                [row[3] for row in clusters],
                                self.exclusions)
        excluTotal += time.time() - startTime
        # for each cluster, start a new round of clustering
        results = []
        for cidx in range(len(clusters)):
            row = clusters[cidx]
            idxs = row[0]
            # the sorted list of all nodes in this cluster
            sids = sorted([self.idxToSid[nidx] for nidx in idxs])
            excludedFeatures = excludeMap[row[3]]
            excludedScores = exclusionScoreMap[row[3]]
            # if this subcluster is large enough, keep clustering it
            if len(sids) > self.sizeThreshold:
                newExclusions = self.exclusions + excludedFeatures
                # remove sids whose feature vectors are all zeros under
                # the new exclusions
                newExclusionSet = set(newExclusions)
                excludedSids = [
                    sid for sid in sids
                    if len(set(self.sid_seq[sid].keys())
                           - newExclusionSet) == 0]
                sids = [
                    sid for sid in sids
                    if len(set(self.sid_seq[sid].keys())
                           - newExclusionSet) > 0]
                # if the cluster is too small after feature selection,
                # don't cluster it further
                if not len(sids) > self.sizeThreshold:
                    result = ('l', sids + excludedSids,
                              {'exclusions': excludedFeatures,
                               'exclusionsScore': excludedScores})
                else:
                    matrixStart = time.time()
                    matrix = calculateDistance.partialMatrix(
                        sids,
                        rhc.excludeFeatures(
                            rhc.getIdf(self.sid_seq, sids), newExclusions),
                        self.ngramPath, 'tmp_%d' % row[3],
                        '%st%d_' % (self.outPath, row[-1]), True)
                    matrixCompTotal += time.time() - matrixStart
                    # handle a special case where every entry in the new
                    # matrix is zero (i.e. the first row of the matrix sums
                    # to zero); if so, do not split the cluster
                    if np.sum(matrix[0]) == 0:
                        result = ('l', sids + excludedSids,
                                  {'exclusions': excludedFeatures,
                                   'exclusionsScore': excludedScores})
                    else:
                        # with the new distance matrix, recurse into
                        # another round of clustering
                        result = HCClustering(
                            matrix, self.sid_seq,
                            '%sp%d_' % (self.outPath, row[-1]),
                            newExclusions, sids, self.sizeThreshold,
                            ngramPath=self.ngramPath).runDiana()
                        if len(result) > 2:
                            info = result[2]
                        else:
                            info = {}
                        # put the excluded sids back as a cluster
                        if len(excludedSids) > 0:
                            result[1].append(
                                ('l', excludedSids, {'isExclude': True}))
                        info['exclusions'] = excludedFeatures
                        info['exclusionsScore'] = excludedScores
                        # the gini coefficient could be computed from the
                        # score map, format {cid: [(feature, score)]}
                        # info['gini'] = getGini(
                        #     [x[1] for x in scoreMap[row[3]]])
                        result = (result[0], result[1], info)
            else:
                result = ('l', sids,
                          {'exclusions': excludedFeatures,
                           'exclusionsScore': excludedScores})
            results.append(result)
        return ('t', results, {'sweetspot': evalResults[sweetSpot]})
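
# The tree returned by run()/runDiana() is built from plain tuples, so
# consumers need a small walker. The helper below is a sketch, not part of
# the original module; it assumes the node shapes produced above:
# ('t', children, info) internal nodes, ('l', sids, info) leaves, and
# excluded-sid clusters appended as nested ('l', ...) tuples inside a
# node's member list.
def iterLeaves(node):
    kind, body, info = node
    if kind == 't':
        # internal node: body is a list of child 3-tuples (including any
        # appended excluded-sid clusters)
        for child in body:
            yield from iterLeaves(child)
    else:
        # leaf: separate plain sids from any nested excluded-sid clusters
        sids = [x for x in body if not isinstance(x, tuple)]
        yield sids, info
        for sub in body:
            if isinstance(sub, tuple):
                yield from iterLeaves(sub)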