def setUpClass(cls):
    """Create the shared fixture: simulated data, reference matrices, TICA objects.

    All work happens under ``numpy_random_seed(123)``. Stored on ``cls``:

    * ``P``, ``T``, ``X``: transition matrix, number of frames and the 2-D
      trajectory whose frames are drawn from one of two Gaussians, selected
      by a hidden state sequence generated from ``P``.
    * ``lag``, ``X_mf``, ``X_mf_nr``: lag time and the data minus a mean taken
      over the first and last ``T - lag`` frames combined, respectively over
      the first ``T - lag`` frames only (``_nr``).
    * ``cov_ref``, ``cov_ref_nr``, ``cov_tau_ref``, ``cov_tau_ref_nr``:
      instantaneous and time-lagged reference matrices computed with numpy.
    * ``tica_obj``, ``tica_obj_nr``: results of ``api.tica`` with
      ``kinetic_map=False``; the second one also passes ``reversible=False``.
    """
    with numpy_random_seed(123):
        import msmtools.generation as msmgen

        # hidden two-state chain and the Gaussian parameters of each state
        cls.P = np.array([[0.99, 0.01], [0.01, 0.99]])
        cls.T = 40000
        centers = [np.array([-1, 1]), np.array([1, -1])]
        sigmas = [np.array([0.3, 2]), np.array([0.3, 2])]

        # observed trajectory, filled frame by frame from the hidden states
        cls.X = np.zeros((cls.T, 2))
        hidden = msmgen.generate_traj(cls.P, cls.T)
        for frame in range(cls.T):
            state = hidden[frame]
            mu, sigma = centers[state], sigmas[state]
            # one scalar draw per coordinate, column 0 before column 1
            cls.X[frame, 0] = sigma[0] * np.random.randn() + mu[0]
            cls.X[frame, 1] = sigma[1] * np.random.randn() + mu[1]

        cls.lag = 10
        n_pairs = float(cls.T - cls.lag)
        n_pairs_both = float(2 * (cls.T - cls.lag))

        # means used to centre the data
        head_sum = np.sum(cls.X[:-cls.lag, :], axis=0)
        tail_sum = np.sum(cls.X[cls.lag:, :], axis=0)
        mref = (head_sum + tail_sum) / n_pairs_both
        mref_nr = head_sum / n_pairs
        cls.X_mf = cls.X - mref[None, :]
        cls.X_mf_nr = cls.X - mref_nr[None, :]

        # frames at time t ("early") and at time t + lag ("late")
        early, late = cls.X_mf[:-cls.lag, :], cls.X_mf[cls.lag:, :]
        early_nr, late_nr = cls.X_mf_nr[:-cls.lag, :], cls.X_mf_nr[cls.lag:, :]

        # reference correlation matrices
        cls.cov_ref = (np.dot(early.T, early) + np.dot(late.T, late)) / n_pairs_both
        cls.cov_ref_nr = np.dot(early_nr.T, early_nr) / n_pairs
        cls.cov_tau_ref = (np.dot(early.T, late) + np.dot(late.T, early)) / n_pairs_both
        cls.cov_tau_ref_nr = np.dot(early_nr.T, late_nr) / n_pairs

        # unscaled TICA, once with default settings and once non-reversible
        reader = api.source(cls.X, chunksize=0)
        shared = dict(data=reader, lag=cls.lag, dim=1, kinetic_map=False)
        cls.tica_obj = api.tica(**shared)
        cls.tica_obj_nr = api.tica(reversible=False, **shared)
def test_fit_transform(self):
    """``fit_transform`` must agree with the first output array of ``api.tica``."""
    data = np.random.randn(100, 2)
    estimator = _internal_tica(1, 1)
    projected = estimator.fit_transform(data)
    expected = api.tica(data=data, lag=1, dim=1).get_output()[0]
    np.testing.assert_array_almost_equal(projected, expected)
def test_transformer_random_access_in_memory(self):
    """Check which random-access strategy backs ``_ra_jagged`` in each mode.

    The strategy is expected to be the streaming one by default, the
    in-memory one while ``in_memory`` is True, the streaming one again after
    switching back, and ``None`` once the data producer has been removed.
    """
    from pyerna.coordinates.data._base.transformer import StreamingTransformerRandomAccessStrategy
    from pyerna.coordinates.data.data_in_memory import DataInMemoryJaggedRandomAccessStrategy

    source_reader = self._get_reader_instance(1)
    transformer = coor.tica(source_reader)

    # default: streaming access
    assert transformer.is_random_accessible
    assert isinstance(transformer._ra_jagged, StreamingTransformerRandomAccessStrategy)

    # held in memory
    transformer.in_memory = True
    assert transformer.is_random_accessible
    assert isinstance(transformer._ra_jagged, DataInMemoryJaggedRandomAccessStrategy)

    # memory released again: falls back to streaming access
    transformer.in_memory = False
    assert transformer.is_random_accessible
    assert isinstance(transformer._ra_jagged, StreamingTransformerRandomAccessStrategy)

    # without a data source there is nothing to access
    transformer.data_producer = None
    assert not transformer.is_random_accessible
    assert transformer._ra_jagged is None
def test_duplicated_data_in_fit_transform(self):
    """Output for two identical trajectories must equal ``fit_transform`` on them."""
    traj = np.random.randn(100, 2)
    in_memory = DataInMemory([traj, traj])
    estimator = api.tica(data=in_memory, lag=1, dim=1)
    from_output = estimator.get_output()
    from_fit_transform = estimator.fit_transform([traj, traj])
    np.testing.assert_array_almost_equal(from_output, from_fit_transform)
def test_in_memory(self):
    """Smoke test: ``get_output`` must run after ``in_memory`` is switched on."""
    samples = np.random.random((100, 10))
    estimator = api.tica(api.source(samples), lag=10, dim=1)
    estimator.in_memory = True
    # no assertion on the values; the call itself is what is exercised
    estimator.get_output()
def test_with_pipeline_time_lagged(self):
    """Parametrize a discretizer built from a feature reader and a TICA object."""
    reader = api.source(self.trajfile, top=self.topfile)
    assert isinstance(reader, FeatureReader)

    transform = tica(dim=2, lag=1)
    disc = discretizer(reader, transform, chunksize=10)
    disc.parametrize()
def test_covariances_and_eigenvalues(self):
    """Check eigenvalue bounds and the statistics of the projected data.

    For every lag the eigenvalues must not exceed 1. A second TICA run on
    the projected output must then report an identity covariance, zero mean
    and a lagged covariance that is diagonal with the first run's
    eigenvalues on the diagonal.
    """
    reader = FeatureReader(self.trajnames, self.temppdb)
    n_dim = self.dim
    for tau in (1, 10, 100, 1000, 2000):
        trans = tica(lag=tau, dim=n_dim, kinetic_map=False)
        trans.estimate(reader)
        projected = trans.get_output()

        log.info('max. eigenvalue: %f' % np.max(trans.eigenvalues))
        self.assertTrue(np.all(trans.eigenvalues <= 1.0))

        # re-estimate on the independent components themselves
        check = tica(data=projected, lag=tau, dim=n_dim)
        np.testing.assert_allclose(np.eye(n_dim), check.cov, atol=1e-8)
        np.testing.assert_allclose(check.mean, 0.0, atol=1e-8)

        expected_cov_tau = np.zeros((n_dim, n_dim))
        expected_cov_tau[np.diag_indices(n_dim)] = trans.eigenvalues
        np.testing.assert_allclose(expected_cov_tau, check.cov_tau, atol=1e-8)
def test_partial_fit(self):
    """Feeding trajectories one by one must reproduce the one-shot estimate."""
    reader = FeatureReader(self.trajnames, self.temppdb, chunksize=10000)
    trajectories = reader.get_output()
    settings = dict(dim=self.dim, lag=1001)

    reference = api.tica(reader, **settings)
    incremental = api.tica(**settings)
    for chunk in trajectories:
        incremental.partial_fit(chunk)

    np.testing.assert_allclose(incremental.eigenvalues, reference.eigenvalues, atol=1e-3)

    # only compare first two eigenvectors, because we only have two metastable processes
    # NOTE(review): ``[:2]`` slices the first two rows; confirm this matches
    # how eigenvectors are laid out, or whether ``[:, :2]`` was intended.
    incremental_vecs = np.abs(incremental.eigenvectors[:2])
    reference_vecs = np.abs(reference.eigenvectors[:2])
    np.testing.assert_allclose(incremental_vecs, reference_vecs, rtol=1e-3, atol=1e-3)
def test_singular_zeros(self):
    """Data with an all-zero column must still give float64 eigenpairs."""
    # two random columns followed by one column of zeros
    random_part = np.random.randn(100, 2)
    zero_column = np.zeros((100, 1))
    data = np.hstack((random_part, zero_column))

    estimator = api.tica(data=data, lag=1, dim=1)
    assert estimator.eigenvectors.dtype == np.float64
    assert estimator.eigenvalues.dtype == np.float64
def test_no_cluster(self):
    """Smoke test for pipelines that contain no clustering stage."""
    reader_xtc = api.source(self.traj_files, top=self.pdb_file)

    # pipeline consisting of the reader alone
    api.pipeline(reader_xtc)
    reader_xtc.get_output()

    # reader followed by a single transformer: first TICA, then PCA
    transformers = [api.tica(), api.pca()]
    for stage in transformers:
        pipe = api.pipeline([reader_xtc, stage])
        pipe._chain[-1].get_output()
def test(self):
    """Transforming 100x10 data with ``dim=1`` must give a 100x1 float matrix."""
    # seeded so the input is reproducible
    with numpy_random_seed(0):
        samples = np.random.randn(100, 10)

    estimator = api.tica(data=samples, lag=10, dim=1)
    projected = estimator._transform_array(samples)

    assert types.is_float_matrix(projected)
    n_rows, n_cols = projected.shape[0], projected.shape[1]
    assert n_rows == 100
    assert n_cols == 1, n_cols
def test_partial_fit(self):
    """Check the ``_estimated`` flag during partial fitting and the final result.

    For each output scaling variant the flag must be False right after
    ``partial_fit`` and True once eigenvectors or eigenvalues were read.
    After all trajectories are fed in, ``cov``, ``cov_tau`` and
    ``eigenvalues`` must match an estimate made from the reader directly.
    """
    from pyerna.coordinates import source

    reader = source(self.trajnames, top=self.temppdb)
    trajectories = reader.get_output()

    variants = (
        {'kinetic_map': False},
        {'kinetic_map': True},
        {'kinetic_map': False, 'commute_map': True},
    )
    for variant in variants:
        params = dict(lag=10, dim=self.dim, **variant)
        incremental = tica(**params)

        incremental.partial_fit(trajectories[0])
        assert not incremental._estimated
        # access eigenvectors to force diagonalization
        _ = incremental.eigenvectors
        assert incremental._estimated

        incremental.partial_fit(trajectories[1])
        assert not incremental._estimated
        # reading the eigenvalues is followed by the same flag check
        _ = incremental.eigenvalues
        assert incremental._estimated

        for remaining in trajectories[2:]:
            incremental.partial_fit(remaining)

        # one-shot reference over the whole reader
        reference = tica(reader, **params)
        for attr in ('cov', 'cov_tau', 'eigenvalues'):
            np.testing.assert_allclose(
                getattr(incremental, attr), getattr(reference, attr), atol=1e-15)
def test_kinetic_map(self):
    """With ``kinetic_map=True`` the output variances must be close to eigenvalues**2."""
    estimator = api.tica(data=self.X, lag=self.lag, dim=-1, var_cutoff=1, kinetic_map=True)
    projected = estimator.get_output()[0]

    observed = np.var(projected, axis=0)
    expected = estimator.eigenvalues ** 2
    largest_deviation = np.max(np.abs(observed - expected))
    assert largest_deviation < 0.01
def test_duplicated_data(self):
    """Data with a repeated column must still give float64 eigenpairs."""
    # two random columns, then a copy of the first column appended as a third
    base = np.random.randn(100, 2)
    repeated = base[:, 0, np.newaxis]
    data = np.hstack((base, repeated))

    estimator = api.tica(data=DataInMemory(data), lag=1, dim=1)
    assert estimator.eigenvectors.dtype == np.float64
    assert estimator.eigenvalues.dtype == np.float64
def test_chunksize(self):
    """The pipeline's chunksize must be reported by the pipeline and every stage."""
    expected = 1001
    stages = [
        api.source(self.traj_files, top=self.pdb_file),
        api.tica(),
        api.cluster_mini_batch_kmeans(batch_size=0.3, k=3),
    ]
    # run=False: the pipeline is only assembled here
    pipe = api.pipeline(stages, chunksize=expected, run=False)

    assert pipe.chunksize == expected
    assert all(stage.chunksize == expected for stage in pipe._chain)
def test_discretizer(self):
    """Smoke test: discretizers with different stage combinations produce output."""
    reader_gen = DataInMemory(data=self.generated_data)

    # reader only
    bare = api.discretizer(reader_gen)
    bare._chain[-1].get_output()

    # reader + transform
    with_transform = api.discretizer(reader_gen, transform=api.tica())
    with_transform._chain[-1].get_output()

    # reader + clustering
    with_cluster = api.discretizer(reader_gen, cluster=api.cluster_uniform_time())
    with_cluster._chain[-1].get_output()

    # reader + transform + clustering
    complete = api.discretizer(
        reader_gen, transform=api.pca(), cluster=api.cluster_regspace(dmin=10))
    complete._chain[-1].get_output()
def test_dimension(self):
    """Check ``dimension()`` and the validation of ``dim`` / ``var_cutoff``."""
    assert types.is_int(self.tica_obj.dimension())
    # the fixture object was built with dim=1
    assert self.tica_obj.dimension() == 1

    # dimension chosen through the variance cutoff
    for cutoff, expected_dim in ((1.0, 2), (0.9, 1)):
        estimator = api.tica(data=self.X, lag=self.lag, dim=-1, var_cutoff=cutoff)
        assert estimator.dimension() == expected_dim

    rejected = (
        # trying to set both dim and subspace_variance is forbidden
        dict(data=self.X, lag=self.lag, dim=1, var_cutoff=0.9),
        # cutoff values 0 and 1.1 must be rejected as well
        dict(lag=self.lag, var_cutoff=0),
        dict(lag=self.lag, var_cutoff=1.1),
    )
    for kwargs in rejected:
        with self.assertRaises(ValueError):
            api.tica(**kwargs)
def test_is_parametrized(self):
    """A pipeline built with ``run=False`` is estimated only after ``parametrize``."""
    # one pipeline holding a reader, both transformers and three clusterings
    stages = [
        api.source(self.traj_files, top=self.pdb_file),
        api.tica(),
        api.pca(),
        api.cluster_kmeans(k=50),
        api.cluster_regspace(dmin=50),
        api.cluster_uniform_time(k=20),
    ]
    pipe = api.pipeline(stages, run=False)

    self.assertFalse(
        pipe._is_estimated(),
        "If run=false, the pipeline should not be parametrized.")

    pipe.parametrize()

    self.assertTrue(
        pipe._is_estimated(),
        "If parametrized was called, the pipeline should be parametrized.")
def test_set_element(self):
    """Replacing a stage must reset the estimated state and change the output."""
    reader = api.source(self.traj_files, top=self.pdb_file)
    old_stage = api.pca()
    pipe = api.pipeline([reader, old_stage])
    self.assertTrue(pipe._is_estimated())
    old_output = old_stage.get_output()

    # put TICA where PCA was (index 1)
    new_stage = api.tica(lag=self.generated_lag)
    pipe.set_element(1, new_stage)
    self.assertFalse(
        pipe._is_estimated(),
        "After replacing an element, the pipeline should not be parametrized.")

    pipe.parametrize()
    new_output = new_stage.get_output()

    # the first output arrays must differ if the replacement took effect
    unchanged = np.array_equal(old_output[0], new_output[0])
    self.assertFalse(
        unchanged,
        "The output should not be the same when the method got replaced.")
def test_covariances_and_eigenvalues(self):
    """Compare estimated covariances with the analytical expression.

    ``self.A``, ``self.phi`` and ``self.w`` come from the fixture; presumably
    amplitudes, phases and angular frequency of the test signals, as the
    formula below suggests (confirm against the fixture setup).
    """
    reader = FeatureReader(self.trajnames, self.temppdb, chunksize=10000)

    # analytical solution for C_ij(lag) is 0.5*A[i]*A[j]*cos(phi[i]-phi[j])*cos(w*lag);
    # the lag-independent part is computed once
    phase_diff = self.phi[:, np.newaxis] - self.phi
    ana_cov = 0.5 * self.A[:, np.newaxis] * self.A * np.cos(phase_diff)

    for lag in (1, 11, 101, 1001, 2001):  # avoid cos(w*tau)==0
        trans = api.tica(data=reader, dim=self.dim, lag=lag)
        log.info('number of trajectories reported by tica %d' % trans.number_of_trajectories())
        cycles = self.w * lag / (2.0 * np.pi)
        log.info('tau = %d corresponds to a number of %f cycles' % (lag, cycles))

        ana_cov_tau = ana_cov * np.cos(self.w * lag)
        self.assertTrue(np.allclose(ana_cov, trans.cov, atol=1.E-3))
        self.assertTrue(np.allclose(ana_cov_tau, trans.cov_tau, atol=1.E-3))

        log.info('max. eigenvalue: %f' % np.max(trans.eigenvalues))
        self.assertTrue(np.all(trans.eigenvalues <= 1.0))
def testChunksizeResultsTica(self): chunk = 40 lag = 100 np.random.seed(0) X = np.random.randn(23000, 3) # un-chunked d = DataInMemory(X) tica_obj = api.tica(data=d, lag=lag, dim=1) cov = tica_obj.cov.copy() mean = tica_obj.mean.copy() # ------- run again with new chunksize ------- d = DataInMemory(X) d.chunksize = chunk tica_obj = tica(data=d, lag=lag, dim=1) np.testing.assert_allclose(tica_obj.mean, mean) np.testing.assert_allclose(tica_obj.cov, cov)
def test_transfomer_random_access(self):
    """Every random-access view must return the same values as ``get_output``.

    Runs for both ``in_memory`` settings and for the reader instances 0 and 1
    provided by ``self._get_reader_instance``.
    """
    for in_memory in (True, False):
        for reader_idx in range(0, 2):
            reader = self._get_reader_instance(reader_idx)
            transformer = coor.tica(reader, dim=3)
            transformer.in_memory = in_memory
            out = transformer.get_output()

            # linear random access
            linear = np.squeeze(transformer.ra_linear[0:2, 0])
            np.testing.assert_array_equal(linear, out[0][0:2, 0])

            # linear itraj random access
            itraj_linear = np.squeeze(transformer.ra_itraj_linear[0, :12, 0])
            np.testing.assert_array_equal(itraj_linear, out[0][:12, 0])

            # jagged random access: every third frame, in reverse order
            jagged = transformer.ra_itraj_jagged[:, ::-3, 0]
            for traj_idx, frames in enumerate(jagged):
                np.testing.assert_array_equal(frames, out[traj_idx][::-3, 0])

            # cuboid random access: first frame of the first three trajectories
            cube = transformer.ra_itraj_cuboid[:, 0, 0]
            for traj_idx in range(3):
                np.testing.assert_array_equal(cube[traj_idx], out[traj_idx][0, 0])
def test_with_skip(self):
    """Smoke test: ``api.tica`` must accept the ``skip`` keyword."""
    samples = np.random.random((100, 10))
    # the result is not inspected; only the call is exercised
    tica_obj = api.tica(samples, lag=10, dim=1, skip=1)