def _transform_array(self, X):
    """Assign each row of ``X`` to the closest entry of :attr:`clustercenters`.

    The input is first coerced to a C-contiguous ``float32`` array. A fresh
    ``ClusterModel`` is then built from the current cluster centers and metric
    and used to compute the assignments.

    :param X: array-like input handed to ``np.require``.
    :return: the assignments with an extra trailing axis (``[:, None]``),
        i.e. always a column vector.
    """
    frames = np.require(X, dtype=np.float32, requirements='C')

    # For minRMSD the centers are meant to be pre-centered once for performance;
    # this method only records that fact through the ``_precentered`` flag.
    if self.metric == 'minRMSD':
        if not self._precentered:
            self._precentered = True

    assignments = ClusterModel(cluster_centers=self.clustercenters,
                               metric=self.metric).transform(frames)
    # Callers of this function always get a column vector back.
    return assignments[:, None]
def test_ndim_assignment(ndim, njobs):
    """Check that ``ClusterModel.transform`` picks the nearest center.

    Fifteen random centers and fifty random samples are drawn with ``ndim``
    columns (squeezed, so ``ndim == 1`` yields flat arrays). Every assignment
    returned by the model is compared to a brute-force arg-min over the
    Euclidean distances to all centers.

    :param ndim: number of columns of the random centers and samples.
    :param njobs: forwarded to ``transform`` as ``n_jobs``.
    """
    model = ClusterModel(np.random.uniform(size=(15, ndim)).squeeze())
    assert_equal(model.dim, ndim)

    samples = np.random.uniform(size=(50, ndim)).squeeze()
    assignments = model.transform(samples, n_jobs=njobs)

    # Restore the feature axis that ``squeeze`` dropped in the 1-d case so the
    # reference computation below can treat every sample as a vector.
    if samples.ndim == 1:
        samples = samples[:, np.newaxis]

    for idx, sample in enumerate(samples):
        assigned = assignments[idx]
        distances = np.linalg.norm(model.cluster_centers - sample[None, :], axis=1)
        assert_equal(assigned, np.argmin(distances))
def cluster(self, n_bins):
    """Discretize ``data`` and ``data_lagged`` onto a regular 3D grid.

    A one-dimensional grid of ``n_bins`` points spanning the joint value range
    of ``self.data`` and ``self.data_lagged`` is built and combined into a mesh
    of ``n_bins ** 3`` three-dimensional grid points. Each frame is assigned to
    a grid point with a ``ClusterModel`` and encoded as a one-hot row.

    :param n_bins: number of grid points along each of the three axes.
    :return: a ``BickleyJetEndpointsDataset3DClustered`` holding the one-hot
        encoded instantaneous and time-lagged trajectories, each of shape
        ``(n_frames, n_bins ** 3)``.
    """
    from deeptime.clustering import ClusterModel

    lower = min(np.min(self.data), np.min(self.data_lagged))
    upper = max(np.max(self.data), np.max(self.data_lagged))
    grid = np.linspace(lower, upper, num=n_bins, endpoint=True)
    # One row per grid point, three columns (one per axis).
    mesh = np.vstack(np.meshgrid(grid, grid, grid)).reshape(3, -1).T

    # ClusterModel expects the cluster centers as its first argument; passing
    # the number of centers first would shift the mesh into the metric slot.
    cm = ClusterModel(cluster_centers=mesh)

    def one_hot(frames):
        # Assign every frame to a grid point and mark that column with 1.
        labels = cm.transform(frames.astype(np.float64))
        encoded = np.zeros((len(frames), mesh.shape[0]))
        encoded[np.arange(len(frames)), labels] = 1.
        return encoded

    return BickleyJetEndpointsDataset3DClustered(one_hot(self.data),
                                                 one_hot(self.data_lagged))
def test_minrmsd_assignments(self):
    """Check minRMSD assignments on randomly rotated and translated frames.

    For each of ``n_clusters`` reference configurations, noisy copies are
    created and every copy is rotated about the y axis and translated by a
    random vector. Under the minRMSD metric, all frames derived from the same
    reference must be assigned to a single center, and the centers used by the
    different references must be pairwise distinct.
    """
    # make sure impl is registered
    _ = KmeansClustering(n_clusters=5)
    # now we can import the impl
    impl = deeptime.clustering.metrics['minRMSD']
    from scipy.linalg import expm, norm

    n_clusters = 5
    n_particles = 3
    n_frames_per_cluster = 25

    def rotation_matrix(axis, theta):
        """
        rotation matrix
        :param axis: np.ndarray, axis around which to rotate
        :param theta: float, angle in radians
        :return: rotation matrix
        """
        return expm(np.cross(np.eye(3), axis / norm(axis) * theta))

    out = np.zeros((n_clusters * n_frames_per_cluster, 3 * n_particles))
    for i in range(n_clusters):
        # define `n_particles` random particle xyz positions,
        # repeat `n_frames_per_cluster` frames and add noise
        reference = np.random.choice(np.arange(3 * n_particles), size=3 * n_particles)
        pos = np.repeat(reference[None], n_frames_per_cluster, axis=0).astype(float)
        pos += np.random.normal(size=pos.shape, scale=.1)

        # add random rotation and translation for each frame
        rand_rot_trans = np.zeros_like(pos)
        for n, frame in enumerate(pos):
            r = rotation_matrix(np.array([0, 1, 0]), np.pi * np.random.rand())
            t = np.array([np.random.normal(), np.random.normal(), np.random.normal()])
            for m in range(n_particles):
                xyz = slice(3 * m, 3 * (m + 1))
                rand_rot_trans[n, xyz] = np.dot(r, frame[xyz]) - t

        out[n_frames_per_cluster * i:n_frames_per_cluster * (i + 1)] = rand_rot_trans

    cc = impl.kmeans.init_centers_kmpp(out, k=n_clusters, random_seed=-1, n_threads=1,
                                       callback=None)
    cl = ClusterModel(cc, metric='minRMSD', converged=True)
    assignments = cl.transform(out)

    unique = []
    for i in range(n_clusters):
        labels_in_interval = np.unique(
            assignments[n_frames_per_cluster * i:n_frames_per_cluster * (i + 1)])
        # assert that each interval is assigned correctly
        self.assertEqual(labels_in_interval.shape[0], 1)
        unique.append(labels_in_interval[0])

    # assert that all cluster centers were used, i.e. no two intervals
    # collapsed onto the same center
    self.assertEqual(np.unique(unique).shape[0], n_clusters)
def _estimate(self, iterable, **kwargs):
    """Run regular-space clustering over ``iterable`` and store the centers.

    Algorithm:
      1. choose first datapoint as centroid
      2. for all X: calc distances to all clustercenters
      3. add new centroid, if min(distance to all other clustercenters) >= dmin

    Chunks are streamed from ``iterable.iterator(...)`` and fed to a
    ``RegularSpace`` estimator via ``partial_fit``. Whatever centers were found
    are stored in a ``finally`` clause, so they are kept even when the
    estimation is aborted.

    :param iterable: data source providing ``iterator(...)`` and ``ndim``.
    :param kwargs: accepted but not used by this method.
    :return: ``self``.
    :raises NotConvergedWarning: if an exception whose class name contains
        ``'MaxCentersReachedException'`` is raised while fitting. Any other
        exception is re-raised unchanged.
    """
    from types import MethodType

    frames_seen = 0
    estimator = RegularSpace(dmin=self.dmin, max_centers=self.max_centers,
                             metric=self.metric, n_jobs=self.n_jobs)
    it = iterable.iterator(return_trajindex=False, stride=self.stride,
                           chunk=self.chunksize, skip=self.skip)
    try:
        with it:
            for chunk in it:
                estimator.partial_fit(chunk.astype(np.float32, order='C', copy=False),
                                      n_jobs=self.n_jobs)
                frames_seen += len(chunk)
        self._converged = True
    except Exception as e:
        # todo ugly workaround until maxcentersreached is placed not within metric subpackage but globally
        # somewhere
        if 'MaxCentersReachedException' not in e.__class__.__name__:
            raise

        self._converged = False
        msg = ('Maximum number of cluster centers reached.'
               ' Consider increasing max_centers or choose'
               ' a larger minimum distance, dmin.')
        self.logger.warning(msg)
        warnings.warn(msg)
        # pass amount of processed data
        percent_used = frames_seen / float(it.n_frames_total()) * 100.0
        raise NotConvergedWarning("Used data for centers: %.2f%%" % percent_used)
    finally:
        # even if not converged, we store the found centers.
        fitted = estimator.fetch_model()
        centers = fitted.cluster_centers.squeeze().reshape(-1, iterable.ndim)
        self._inst = ClusterModel(centers, metric=self.metric)

        def _assign(self, data, _, n_jobs):
            # Adapter exposing ``transform`` under the ``assign`` call signature;
            # the second positional argument is ignored.
            return self.transform(data, n_jobs=n_jobs)

        self._inst.assign = MethodType(_assign, self._inst)

        n_centers = len(centers)
        self.update_model_params(clustercenters=centers, n_clusters=n_centers)

        if n_centers == 1:
            self.logger.warning('Have found only one center according to '
                                'minimum distance requirement of %f' % self.dmin)

    return self