def getClustersConnections(self):
    '''group the nodes linked by self.connections into connected clusters.

    only positions 0 and 1 of each connection are read; they are the two
    nodes that the connection joins. every connection merges the clusters
    holding its two nodes.

    returns the toLists() output of the unionFind built from those links.'''
    linked = unionfind2.unionFind()
    for conn in self.connections:
        # any further fields of the connection are ignored here
        linked.union(conn[0], conn[1])
    return linked.toLists()
def getRMSDclustersAll(self, rmsdCutoff=None, numClusters=1):
    '''cluster conformations by rmsd using ALL linkage, not single linkage.

    walks self.rmsdList once, in its stored order. for each (rmsd, conf, conf)
    entry whose two conformations are still in different clusters, the two
    clusters are joined only if no cross-cluster rmsd in self.rmsdTable is
    larger than the entry's own rmsd. entries that fail this test are not
    revisited later.

    rmsdCutoff -- stop at the first entry whose rmsd exceeds this value.
                  None means no entry is ever cut off.
    numClusters -- NOTE(review): accepted but never consulted in this method;
                  the run does not stop at a target cluster count.

    NOTE(review): the early break and the None-cutoff default both assume
    self.rmsdList is sorted by ascending rmsd -- confirm in getRMSDtable.

    returns clusters.toLists() for the final clustering. an empty
    self.rmsdList leaves every conformation as its own cluster.'''
    self.getRMSDtable()  # make the table, or ensure it is made
    # self.rmsdList holds tuples of (rmsd, conf, conf)
    # self.rmsdTable is a dict of [conf][conf] -> rmsd
    clusters = unionfind2.unionFind()
    for xyzCount in range(len(self.atomXyz)):
        clusters.find(xyzCount)  # init
    # guard the [-1] lookup: with no pairs there is nothing to cut off
    if rmsdCutoff is None and len(self.rmsdList) > 0:
        rmsdCutoff = self.rmsdList[-1][0] + 1.0  # make it never happen
    for rmsdTuple in self.rmsdList:
        pairRmsd, confOne, confTwo = rmsdTuple[0], rmsdTuple[1], rmsdTuple[2]
        if pairRmsd > rmsdCutoff:
            break  # quit combining things!
        if not clusters.different(confOne, confTwo):
            continue  # already joined
        clusterOne = clusters.getList(confOne)
        clusterTwo = clusters.getList(confTwo)
        # all linkage: one cross pair that is too far apart blocks the merge;
        # any() stops at the first such pair just like the nested breaks did
        tooFar = any(
            self.rmsdTable[clusterOneRep][clusterTwoRep] > pairRmsd
            for clusterOneRep in clusterOne
            for clusterTwoRep in clusterTwo)
        if not tooFar:
            clusters.union(confOne, confTwo)
    return clusters.toLists()
def _findConformations(self, atomBonds, xyzData):
    '''uses bond and xyzs to figure out what sets of neighboring atoms move
    together, and assigns each such set a conformation number plus the input
    xyz sets it belongs to.

    reads:
      self.atomsAssigned -- atom numbers of the rigid component
      self.atomsNotAssigned -- the rest of the atom numbers
      self.posClusterLists -- per atom, the lists of input indices that share
                              a position
    atomBonds[atomNum] is iterated as (otherNum, bondType) pairs.

    writes:
      self.confNums -- conformation numbers, starting with 1 for the rigid part
      self.confAtoms -- conformation number -> atom numbers
      self.confInput -- conformation number -> input xyz indices

    returns nothing.'''
    self.confNums = [1]  # rigid starts
    self.confAtoms = {1: list(self.atomsAssigned)}  # maps to atom numbers
    # maps to the input xyz lists; the rigid part belongs to every input
    self.confInput = {1: list(range(len(xyzData)))}
    confClusters = {}
    for atomNum in self.atomsNotAssigned:
        for listInputs in self.posClusterLists[atomNum]:
            tupleInputs = tuple(listInputs)  # can't use lists as keys
            # test the dict itself: .keys() would build a list on every pass
            if tupleInputs not in confClusters:
                confClusters[tupleInputs] = unionFind()
            inputCluster = confClusters[tupleInputs]
            inputCluster.find(atomNum)  # in case of singletons
            for otherNum, bondType in atomBonds[atomNum]:
                # bonded atoms move together only if the neighbor has exactly
                # the same set of inputs at one position
                if listInputs in self.posClusterLists[otherNum]:
                    inputCluster.union(atomNum, otherNum)
    for tupleInputs, clusters in confClusters.items():
        for atomLists in clusters.toLists():  # make a conf for each
            thisConf = self.confNums[-1] + 1
            self.confAtoms[thisConf] = atomLists
            self.confInput[thisConf] = tupleInputs
            self.confNums.append(thisConf)
def _countPositionsFewPoints(self, xyzData, tolerance):
    '''count how many distinct positions each atom takes across the inputs.

    xyzData is indexed [input][atom] -> xyz. for every atom, each pair of
    inputs is compared directly; two inputs share a position when
    geometry_basic.distL2Squared3 of their xyzs is below tolerance squared.
    sharing is transitive because the pairs are merged in a unionfind.

    writes, one entry per atom:
      self.posCount -- number of distinct positions
      self.posClusters -- the unionfind object itself
      self.posClusterLists -- its toLists() output

    returns nothing.'''
    self.posCount = []
    self.posClusters = []  # keep everything, it was already computed
    self.posClusterLists = []  # keep everything, it was already computed
    cutoffSquared = tolerance ** 2.  # distances are compared squared
    inputCount = len(xyzData)
    for atomIndex in range(len(xyzData[0])):  # one pass per atom
        positions = unionFind()
        atomXyzs = []
        for inputIndex in range(inputCount):
            positions.find(inputIndex)  # register every input up front
            atomXyzs.append(xyzData[inputIndex][atomIndex])
        # every unordered pair of inputs exactly once
        for firstIndex, firstXyz in enumerate(atomXyzs):
            for secondIndex in range(firstIndex + 1, inputCount):
                separation = geometry_basic.distL2Squared3(
                    firstXyz, atomXyzs[secondIndex])
                if separation < cutoffSquared:
                    positions.union(firstIndex, secondIndex)
        groupedInputs = positions.toLists()
        self.posCount.append(len(groupedInputs))
        self.posClusters.append(positions)
        self.posClusterLists.append(groupedInputs)
def _countPositions(self, xyzData, tolerance, verbose=False):
    '''for a list of list of xyz data, count the number of positions each
    atom takes. xyzData is indexed [input][atom] -> xyz.

    for every atom the xyzs from all inputs are handed, together with the
    tolerance exactly as given (not squared), to buckets.Bucket3d, whose
    getWithinCluster() is passed the unionfind for that atom.
    NOTE(review): presumably getWithinCluster unions the inputs that lie
    within tolerance of each other -- confirm in the buckets module.

    verbose -- accepted but not used by this method.

    writes, one entry per atom:
      self.posCount -- number of distinct positions
      self.posClusters -- the unionfind object itself
      self.posClusterLists -- its toLists() output

    returns nothing.'''
    self.posCount = []
    self.posClusters = []  # just save all the data since we made it
    self.posClusterLists = []  # just save all the data since we made it
    for oneSet in range(len(xyzData[0])):  # goes from 0 to atom count
        clusters = unionFind()
        xyzList = []
        for oneIndex, oneInput in enumerate(xyzData):  # each position (mol2#)
            clusters.find(oneIndex)  # initiate each position
            xyzList.append(oneInput[oneSet])
        bucket = buckets.Bucket3d(xyzList, tolerance)  # constructor to make fast
        bucket.getWithinCluster(clusters)
        tempLists = clusters.toLists()
        self.posCount.append(len(tempLists))
        self.posClusters.append(clusters)
        self.posClusterLists.append(tempLists)
def findBiggestDisjointSets(pointList, triList, pointNeighborList):
    '''split points and triangles into the biggest connected set and the rest
    (cavities).

    pointNeighborList -- rows whose entry 0 is a point and whose entries from
        index 2 onward are the points joined to it. entry 1 is skipped.
        NOTE(review): entry 1 is presumably a neighbor count -- confirm.
    pointList -- rows whose entry 0, converted with int(), is the point id.
    triList -- rows whose entry 0 is the triangle id and entry 1 a point id,
        both converted with int(). only entry 1 decides where the triangle
        goes (the original note says any triangle point is okay).

    returns (allPoints, allTris, cavPoints, cavTris), four sets of ints.
    points that never appear in a union end up in cavPoints.
    raises IndexError if the neighbor rows produce no sets at all.'''
    pointSetUF = unionfind2.unionFind()
    for nhbrPointsList in pointNeighborList:
        startPt = nhbrPointsList[0]
        for otherPt in nhbrPointsList[2:]:
            pointSetUF.union(startPt, otherPt)
    pointSets = pointSetUF.toLists()
    # find the biggest disjoint set; the first one wins on ties
    largest, size = 0, 0
    for index, pointSet in enumerate(pointSets):
        if len(pointSet) > size:
            largest, size = index, len(pointSet)
    # a set makes each membership test below constant time instead of a scan
    allowedPoints = set(pointSets[largest])
    allPoints, cavPoints = set(), set()
    for point in pointList:
        pointId = int(point[0])
        if pointId in allowedPoints:
            allPoints.add(pointId)
        else:
            cavPoints.add(pointId)
    allTris, cavTris = set(), set()
    for tri in triList:
        if int(tri[1]) in allPoints:  # any triangle point is okay
            allTris.add(int(tri[0]))
        else:
            cavTris.add(int(tri[0]))
    return allPoints, allTris, cavPoints, cavTris
def getRMSDclusters(self, rmsdCutoff=None, numClusters=1):
    '''cluster conformations by rmsd using single linkage.

    walks self.rmsdList, whose entries are (rmsd, conf, conf), in its stored
    order and joins the clusters of every pair until an entry's rmsd exceeds
    rmsdCutoff.

    rmsdCutoff -- stop at the first entry whose rmsd exceeds this value.
                  None means no entry is ever cut off.
    numClusters -- NOTE(review): accepted but never consulted in this method;
                  the run does not stop at a target cluster count.

    NOTE(review): the early break and the None-cutoff default both assume
    self.rmsdList is sorted by ascending rmsd -- confirm in getRMSDtable.

    returns clusters.toLists() for the final clustering. an empty
    self.rmsdList leaves every conformation as its own cluster.'''
    self.getRMSDtable()  # make the table, or ensure it is made
    clusters = unionfind2.unionFind()
    for xyzCount in range(len(self.atomXyz)):
        clusters.find(xyzCount)  # initialize all these to singleton clusters
    # guard the [-1] lookup: with no pairs there is nothing to cut off
    if rmsdCutoff is None and len(self.rmsdList) > 0:
        rmsdCutoff = self.rmsdList[-1][0] + 1.0  # make it never happen
    for rmsdTuple in self.rmsdList:
        if rmsdTuple[0] > rmsdCutoff:
            break  # quit combining things!
        clusters.union(rmsdTuple[1], rmsdTuple[2])
    return clusters.toLists()
def clusterAtoms(self, distanceCutoff=2.0, keepIsolated=False):
    '''breaks into distinct unions of atoms based on distance cutoff.

    two atoms are joined when geometry.distL2Squared of their coords is at
    most distanceCutoff squared; joins are transitive.

    distanceCutoff -- largest distance at which two atoms are joined.
    keepIsolated -- an atom with no other atom inside the cutoff is never
        passed to union(), so by default it appears in none of the returned
        objects. pass True to give each such atom its own one-atom result.
        NOTE(review): this relies on unionFind.find() registering an unseen
        item as its own set -- confirm in unionfind2.

    returns a list with one copy of self per cluster, each with the lines of
    every atom outside that cluster removed via removeLine().'''
    ligandClusters = unionfind2.unionFind()
    cutoffSquared = distanceCutoff ** 2.  # faster comparisons
    coords = list(self.coords)
    atomCount = len(coords)
    for index, coord in enumerate(coords):
        if keepIsolated:
            ligandClusters.find(index)  # make sure lone atoms are counted
        for index2 in range(index + 1, atomCount):  # each pair only once
            distBetweenSquared = geometry.distL2Squared(coord, coords[index2])
            if distBetweenSquared <= cutoffSquared:
                ligandClusters.union(index, index2)
    newPdbs = []  # list of pdbData objects to return
    for oneCluster in ligandClusters.toLists():
        keptAtoms = set(oneCluster)  # constant-time membership tests
        markedForRemoval = [
            index for index in range(atomCount) if index not in keptAtoms]
        newPdb = self.copy()
        # removal happens in ascending atom order, after the scan is complete
        for index in markedForRemoval:
            newPdb.removeLine(newPdb.atomToRaw[index])
        newPdbs.append(newPdb)
    return newPdbs
def findBiggestDisjointSetsBreakCavities(
        pointList, triList, pointNeighborList):
    '''separate the biggest connected set of points from each cavity.

    pointNeighborList rows give a point at entry 0 and the points joined to
    it from index 2 onward; entry 1 is skipped. pointList and triList are
    accepted but not read, and no triangles are returned.

    returns (allPoints, cavPoints, pointSets):
      allPoints -- set of the points in the biggest connected set
      cavPoints -- set of every point in any other connected set
      pointSets -- the remaining connected sets, one list per cavity'''
    components = unionfind2.unionFind()
    for neighborRow in pointNeighborList:
        anchor = neighborRow[0]
        for joined in neighborRow[2:]:
            components.union(anchor, joined)
    pointSets = components.toLists()
    # locate the biggest component; the earliest one wins on ties
    bestIndex, bestSize = 0, 0
    for position, members in enumerate(pointSets):
        if len(members) > bestSize:
            bestIndex, bestSize = position, len(members)
    biggest = pointSets[bestIndex]
    allPoints = set(biggest)
    pointSets.remove(biggest)  # what is left are the cavities
    cavPoints = set()
    for cavity in pointSets:
        cavPoints.update(cavity)
    return allPoints, cavPoints, pointSets
def _findRigidComponent(self, atomBonds):
    '''uses bond and position count information to find the largest set of
    bonded atoms that don't move. this is the rigid component.

    an atom counts as not moving when self.posCount[atomNum] is 1.
    atomBonds[atomNum] is iterated as (otherNum, bondType) pairs.

    writes:
      self.rigidComponent -- list of atom numbers in the rigid component
      self.atomsAssigned -- the same atom numbers as a set
      self.atomsNotAssigned -- set of all other atom numbers, for use later

    if no two bonded atoms are both fixed, the rigid component falls back to
    the first fixed atom on its own, or to an empty list when no atom is
    fixed, instead of failing on set(None).

    returns nothing.'''
    clusters = unionFind()
    atomCount = len(self.posCount)
    for atomNum in range(atomCount):
        if 1 != self.posCount[atomNum]:
            continue  # this atom moves
        for otherNum, bondType in atomBonds[atomNum]:
            if 1 == self.posCount[otherNum]:
                clusters.union(atomNum, otherNum)
    # keep the biggest cluster; the first one wins on ties
    maxSize = 0
    maxCluster = None
    for clusterList in clusters.toLists():
        if len(clusterList) > maxSize:
            maxSize = len(clusterList)
            maxCluster = clusterList
    if maxCluster is None:
        # union() was never called, so there is no bonded fixed pair
        fixedAtoms = [
            atomNum for atomNum in range(atomCount)
            if 1 == self.posCount[atomNum]]
        maxCluster = fixedAtoms[:1]
    self.rigidComponent = maxCluster
    self.atomsAssigned = set(self.rigidComponent)
    # complement via set difference rather than scanning the list per atom
    self.atomsNotAssigned = set(range(atomCount)) - self.atomsAssigned
def addSearchConnections(
        self, totalThreshold, remove=False, mst=False,
        maxConnCount=100000000000, lineMst=False, startNode=None,
        endNode=None, clusterOutput=False):
    '''adds the connections to self.connections if they meet the requirements.

    self.matchList entries are unpacked as
    (tmData, tmData2, node1, node2, totalScore) and visited in ascending
    totalScore order. self.connections is rebuilt from scratch and
    resetKeepers() is called on the subgraph of every entry in
    self.tmDataList.

    totalThreshold -- a match is added only when its totalScore is strictly
        below this value.
    remove -- when true, removeNonKeepers() is called on every subgraph in
        self.tmToSubgraph after all matches have been visited.
    mst -- when true, a match is considered only if its two nodes are still
        in different sets of a unionfind; the two sets are then merged.
    maxConnCount -- no match is added once self.connections has this many
        entries.
    lineMst -- like mst, but a node that already has two recorded partners
        is refused as well. only consulted when mst is false.
    startNode, endNode -- with lineMst, when both are given, startNode is
        recorded up front as already having endNode as a partner.
    clusterOutput -- when true, after every added connection the nodes are
        clustered and statistics are printed for each cluster whose overlap
        is >= 0.0, or for every cluster while there are fewer than 5.

    each added connection is the list
    [name of node1, name of node2, totalScore, node1, node2], and both names
    are flagged in the keepers dict of their subgraph.

    NOTE(review): in mst/lineMst mode the unionfind merge (and the lineMst
    partner bookkeeping) happens before the threshold and maxConnCount test,
    so it also applies to matches that are then not added.
    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source -- verify against the original file.

    returns nothing.'''
    tempConns = self.matchList[:]  # copy, since it is sorted in place below
    self.connections = []
    if clusterOutput:
        clusters = unionfind2.unionFind()
        overlapFunction = self.tmDataList[0].compareResidueIdentityMultipleNodes
        # both caches are keyed by the sorted tuple of a cluster's nodes
        overlapCache = {}
        treeCountCache = {}
    for tmData in self.tmDataList:
        self.tmToSubgraph[tmData].resetKeepers()
    if mst:  # init this data structure
        mstUF = unionfind2.unionFind()
    if lineMst:  # init this data structure
        mstUF = unionfind2.unionFind()
        # node -> list of partners accepted so far
        connsLimit2 = {}
        if startNode is not None and endNode is not None:  # limit endpoints
            connsLimit2[startNode] = [endNode]  # for lineMstEnds given hints
    tempConns.sort(key=operator.itemgetter(4))  # lowest totalScore first
    for aMatch in tempConns:
        tmData, tmData2, node1, node2, totalScore = aMatch  # unpack
        mstOkay = (not mst) and (not lineMst)  # iff both false, everything is ok
        if mst:  # do checks for mst
            if mstUF.different(node1, node2):  # calls find on node1+2 to init them
                mstOkay = True
                mstUF.union(node1, node2)
        elif lineMst:  # only reached when mst is false
            if mstUF.different(node1, node2):  # calls find on node1+2 to init them
                # each node must have no partner yet or exactly one
                if (node1 not in connsLimit2 or len(connsLimit2[node1]) == 1) and \
                        (node2 not in connsLimit2 or len(connsLimit2[node2]) == 1):
                    # only now we know it is completely okay
                    if node1 not in connsLimit2:
                        connsLimit2[node1] = []
                    connsLimit2[node1].append(node2)
                    if node2 not in connsLimit2:
                        connsLimit2[node2] = []
                    connsLimit2[node2].append(node1)
                    mstOkay = True
                    mstUF.union(node1, node2)
        if mstOkay:  # means either everything is fine or not mst
            if totalScore < totalThreshold and len(self.connections) < maxConnCount:
                subGraph1 = self.tmToSubgraph[tmData]
                subGraph2 = self.tmToSubgraph[tmData2]
                newConn = [
                    subGraph1.nodeToName[node1], subGraph2.nodeToName[node2], totalScore,
                    node1, node2]
                subGraph1.keepers[newConn[0]] = True
                subGraph2.keepers[newConn[1]] = True
                self.connections.append(newConn)
                if clusterOutput:
                    clusters.union(node1, node2)  # really is that simple
                    # all clusters are re-listed after every added connection
                    clustLists = clusters.toLists()
                    for aCluster in clustLists:
                        aCluster.sort()  # sorted so the tuple key is stable
                        tupleCluster = tuple(aCluster)
                        if tupleCluster not in overlapCache:
                            aOverlap = overlapFunction(aCluster)
                            overlapCache[tupleCluster] = aOverlap
                        else:
                            aOverlap = overlapCache[tupleCluster]
                        if tupleCluster not in treeCountCache:
                            # number of distinct .tree values among the nodes
                            treeSet = set()
                            for node in aCluster:
                                treeSet.add(node.tree)
                            treeCount = len(treeSet)
                            treeCountCache[tupleCluster] = treeCount
                        else:
                            treeCount = treeCountCache[tupleCluster]
                        if aOverlap >= 0.0 or len(clustLists) < 5:
                            # trailing commas keep all four prints on one line
                            print len(self.connections), len(clustLists),
                            print "len:", len(aCluster), "over:", aOverlap,
                            print "count:", treeCount,
                            print outputDrawStr(aCluster[0].tree.inputFileName, aCluster[0])
    if remove:  # sometimes may not want to do this here
        for subGraph in self.tmToSubgraph.values():
            subGraph.removeNonKeepers()