class RegCluster(BaseEstimator, ClusterMixin, TransformerMixin):
    """Regular-space clustering of a data set.

    RegCluster can be passed a radius or an approximate number of clusters.
    If only a number of clusters is passed, KCenter clustering is run first
    and the maximum of its ``distance`` attribute is used as the radius.
    The clustering itself is delegated to pyemma's
    ``RegularSpaceClustering``, with the radius passed as ``dmin``.

    Parameters
    ----------
    radius : float, optional
        Radius of the clusters.
    n_clusters : int, optional
        Desired (approximate) number of clusters. Only used to estimate a
        radius when `radius` is None.

    Raises
    ------
    RuntimeError
        If neither `radius` nor `n_clusters` is given.

    Examples
    --------
    >>> cluster = RegCluster(radius=5.1)
    >>> labels = cluster.fit(data).labels_

    Attributes
    ----------
    labels_ : np.ndarray
        Cluster index of each frame. An empty list until `fit` is called.
    cluster_centers_ : np.ndarray
        Centers of the clusters, as reported by the underlying
        ``RegularSpaceClustering`` object. Only available after `fit`.
    clusterSize : np.ndarray
        Number of frames in each cluster.
    """

    def __init__(self, radius=None, n_clusters=None):
        if radius is None and n_clusters is None:
            raise RuntimeError("radius or n_clusters needs to be set")

        self.radius = radius
        self.n_clusters = n_clusters
        self.labels_ = []

    def fit(self, data):
        """Cluster `data` and store the resulting labels in ``labels_``.

        If no radius was set, it is estimated from a KCenter clustering with
        ``n_clusters`` centers and stored in ``self.radius``; later calls to
        `fit` on the same instance therefore reuse that estimated radius.

        Parameters
        ----------
        data : np.ndarray
            Array of data points to cluster.

        Returns
        -------
        self : RegCluster
            The fitted estimator, so that calls can be chained.
        """
        # If n_clusters is given and no radius, estimate the radius.
        if self.radius is None:
            # Imported lazily so htmd's KCenter is only needed on this path.
            from htmd.clustering.kcenters import KCenter

            estClust = KCenter(n_clusters=self.n_clusters)
            estClust.fit(data)
            self.radius = estClust.distance.max()
            logger.info("Estimated radius = {}".format(self.radius))

        # Imported lazily so pyemma is only required once fit is called.
        from pyemma.coordinates.clustering.regspace import RegularSpaceClustering

        self._reg = RegularSpaceClustering(dmin=self.radius)
        self.labels_ = self._reg.fit_transform(data).flatten()
        # Return the estimator itself, as scikit-learn mixins and callers
        # chaining on fit() expect.
        return self

    @property
    def cluster_centers_(self):
        """Cluster centers of the fitted ``RegularSpaceClustering`` object."""
        return self._reg.clustercenters

    @property
    def clusterSize(self):
        """Number of frames assigned to each cluster label."""
        # Coerce to an integer array first: before fit, labels_ is an empty
        # list, which numpy would otherwise treat as float data that
        # bincount may reject.
        return np.bincount(np.asarray(self.labels_, dtype=int))