def test_discretizer(self):
    """Discretizer construction and execution must not raise for any
    combination of default/explicit transform and cluster stages."""
    source = DataInMemory(data=self.generated_data)
    # each variant only needs to run exception-free
    api.discretizer(source)._chain[-1].get_output()
    api.discretizer(source, transform=api.tica())._chain[-1].get_output()
    api.discretizer(source, cluster=api.cluster_uniform_time())._chain[-1].get_output()
    api.discretizer(source, transform=api.pca(),
                    cluster=api.cluster_regspace(dmin=10))._chain[-1].get_output()
def test_singular_zeros(self):
    """TICA must stay numerically well-behaved when one input column
    is constantly zero (singular covariance matrix)."""
    # two random columns plus one all-zero column
    data = np.random.randn(100, 2)
    data = np.hstack((data, np.zeros((100, 1))))
    model = api.tica(data=data, lag=1, dim=1)
    # results must come back as proper float arrays, not NaN-polluted objects
    assert model.eigenvectors.dtype == np.float64
    assert model.eigenvalues.dtype == np.float64
def test_no_cluster(self):
    """Pipelines are valid without a clustering stage."""
    reader = api.source(self.traj_files, top=self.pdb_file)
    # a pipeline consisting of the reader alone
    api.pipeline(reader)
    reader.get_output()
    # reader followed by a single projection stage (tica / pca)
    tica_trans = api.tica()
    pca_trans = api.pca()
    api.pipeline([reader, tica_trans])._chain[-1].get_output()
    api.pipeline([reader, pca_trans])._chain[-1].get_output()
def test_duplicated_data(self):
    """TICA must cope with perfectly correlated (duplicated) columns."""
    # third column is an exact copy of the first
    data = np.random.randn(100, 2)
    data = np.hstack((data, data[:, 0, np.newaxis]))
    source = DataInMemory(data)
    model = api.tica(data=source, lag=1, dim=1)
    # duplicated column must not break the eigen-decomposition
    assert model.eigenvectors.dtype == np.float64
    assert model.eigenvalues.dtype == np.float64
def test(self):
    """Projecting data through a parametrized TICA yields an
    (n_frames, dim) float matrix."""
    np.random.seed(0)  # deterministic input
    data = np.random.randn(100, 10)
    model = api.tica(data=data, lag=10, dim=1)
    model.parametrize()
    projected = model._map_array(data)
    # right shape: all frames retained, reduced to the requested dimension
    assert types.is_float_matrix(projected)
    assert projected.shape[0] == 100
    assert projected.shape[1] == 1
def test(self):
    """Transforming data through a parametrized TICA yields an
    (n_frames, dim) float matrix."""
    # make the random input reproducible
    with numpy_random_seed(0):
        data = np.random.randn(100, 10)
    model = api.tica(data=data, lag=10, dim=1)
    model.parametrize()
    projected = model._transform_array(data)
    # all frames retained, reduced to the requested dimension
    assert types.is_float_matrix(projected)
    assert projected.shape[0] == 100
    assert projected.shape[1] == 1, projected.shape[1]
def test_kinetic_map(self):
    """With kinetic-map scaling the variance of each IC equals the
    squared eigenvalue it belongs to."""
    model = api.tica(data=self.X, lag=self.lag, dim=-1, var_cutoff=1,
                     kinetic_map=True)
    output = model.get_output()[0]
    variances = np.var(output, axis=0)
    expected = model.eigenvalues ** 2
    # allow a small statistical tolerance
    assert np.max(np.abs(variances - expected)) < 0.01
def test_covariances_and_eigenvalues(self):
    """For several lag times: eigenvalues are bounded by one, and the
    output ICs are mean-free, decorrelated, with time-lagged
    autocovariances equal to the eigenvalues."""
    reader = FeatureReader(self.trajnames, self.temppdb)
    for tau in [1, 10, 100, 1000, 2000]:
        model = tica(lag=tau, dim=self.dim, kinetic_map=False)
        model.data_producer = reader
        log.info('number of trajectories reported by tica %d' % model.number_of_trajectories())
        model.parametrize()
        output = model.get_output()
        log.info('max. eigenvalue: %f' % np.max(model.eigenvalues))
        self.assertTrue(np.all(model.eigenvalues <= 1.0))
        # re-estimate TICA on the ICs themselves and check their statistics
        check = tica(data=output, lag=tau, dim=self.dim)
        np.testing.assert_allclose(np.eye(self.dim), check.cov, atol=1e-8)
        np.testing.assert_allclose(check.mean, 0.0, atol=1e-8)
        # expected time-lagged covariance: diagonal of eigenvalues
        expected_cov_tau = np.zeros((self.dim, self.dim))
        expected_cov_tau[np.diag_indices(self.dim)] = model.eigenvalues
        np.testing.assert_allclose(expected_cov_tau, check.cov_tau, atol=1e-8)
def test(self):
    """Transforming data through a parametrized TICA yields an
    (n_frames, dim) float matrix."""
    # FIXME: this ugly workaround is necessary...
    np.random.seed(0)
    data = np.random.randn(100, 10)
    model = api.tica(data=data, lag=10, dim=1)
    model.parametrize()
    projected = model._transform_array(data)
    # all frames retained, reduced to the requested dimension
    assert types.is_float_matrix(projected)
    assert projected.shape[0] == 100
    assert projected.shape[1] == 1
def test_chunksize(self):
    """A pipeline chunksize must propagate to every chain element."""
    reader = api.source(self.traj_files, top=self.pdb_file)
    requested = 1001
    stages = [
        reader,
        api.tica(),
        api.cluster_mini_batch_kmeans(batch_size=0.3, k=3),
    ]
    pipe = api.pipeline(stages, chunksize=requested, run=False)
    assert pipe.chunksize == requested
    for stage in pipe._chain:
        assert stage.chunksize == requested
def setUpClass(cls):
    # Build a seeded 2-state Gaussian-HMM trajectory and precompute the
    # reference means and (time-lagged) covariance matrices the tests
    # compare against, then fit reversible and non-reversible TICA on it.
    # NOTE: the draw order inside the seeded block is behavior-critical.
    with numpy_random_seed(123):
        import msmtools.generation as msmgen

        # generate HMM with two Gaussians
        cls.P = np.array([[0.99, 0.01],
                          [0.01, 0.99]])
        cls.T = 40000
        means = [np.array([-1, 1]), np.array([1, -1])]
        widths = [np.array([0.3, 2]), np.array([0.3, 2])]
        # continuous trajectory
        cls.X = np.zeros((cls.T, 2))
        # hidden trajectory
        dtraj = msmgen.generate_traj(cls.P, cls.T)
        for t in range(cls.T):
            s = dtraj[t]
            cls.X[t, 0] = widths[s][0] * np.random.randn() + means[s][0]
            cls.X[t, 1] = widths[s][1] * np.random.randn() + means[s][1]
        # Set the lag time:
        cls.lag = 10
        # Compute mean free data:
        # reversible mean averages over both the head and tail segments
        mref = (np.sum(cls.X[:-cls.lag, :], axis=0) +
                np.sum(cls.X[cls.lag:, :], axis=0)) / float(2*(cls.T-cls.lag))
        # non-reversible mean uses the head segment only
        mref_nr = np.sum(cls.X[:-cls.lag, :], axis=0) / float(cls.T-cls.lag)
        cls.X_mf = cls.X - mref[None, :]
        cls.X_mf_nr = cls.X - mref_nr[None, :]
        # Compute correlation matrices:
        # instantaneous covariance (reversible: symmetrized over head/tail)
        cls.cov_ref = (np.dot(cls.X_mf[:-cls.lag, :].T, cls.X_mf[:-cls.lag, :]) +\
                       np.dot(cls.X_mf[cls.lag:, :].T, cls.X_mf[cls.lag:, :])) / float(2*(cls.T-cls.lag))
        cls.cov_ref_nr = np.dot(cls.X_mf_nr[:-cls.lag, :].T,
                                cls.X_mf_nr[:-cls.lag, :]) / float(cls.T - cls.lag)
        # time-lagged covariance (reversible: symmetrized forward/backward)
        cls.cov_tau_ref = (np.dot(cls.X_mf[:-cls.lag, :].T, cls.X_mf[cls.lag:, :]) +\
                           np.dot(cls.X_mf[cls.lag:, :].T, cls.X_mf[:-cls.lag, :])) / float(2*(cls.T-cls.lag))
        cls.cov_tau_ref_nr = np.dot(cls.X_mf_nr[:-cls.lag, :].T,
                                    cls.X_mf_nr[cls.lag:, :]) / float(cls.T - cls.lag)
        # do unscaled TICA
        reader = api.source(cls.X, chunk_size=0)
        cls.tica_obj = api.tica(data=reader, lag=cls.lag, dim=1, kinetic_map=False)
        # non-reversible TICA
        cls.tica_obj_nr = api.tica(data=reader, lag=cls.lag, dim=1,
                                   kinetic_map=False, reversible=False)
def test_discretizer(self):
    """All discretizer stage combinations must run without raising."""
    in_memory_reader = DataInMemory(data=self.generated_data)
    # default stages
    api.discretizer(in_memory_reader)._chain[-1].get_output()
    # explicit transform only
    api.discretizer(in_memory_reader,
                    transform=api.tica())._chain[-1].get_output()
    # explicit cluster only
    api.discretizer(in_memory_reader,
                    cluster=api.cluster_uniform_time())._chain[-1].get_output()
    # explicit transform and cluster
    api.discretizer(in_memory_reader,
                    transform=api.pca(),
                    cluster=api.cluster_regspace(dmin=10))._chain[-1].get_output()
def test_set_element(self):
    """Replacing a pipeline stage must invalidate the parametrization
    and produce different output after re-parametrizing."""
    reader = api.source(self.traj_files, top=self.pdb_file)
    pca = api.pca()
    pipe = api.pipeline([reader, pca])
    self.assertTrue(pipe._is_parametrized())
    pca_out = pca.get_output()
    tica = api.tica(lag=self.generated_lag)
    # swap the projection stage: pca -> tica
    pipe.set_element(1, tica)
    self.assertFalse(pipe._is_parametrized(),
                     "After replacing an element, the pipeline should not be parametrized.")
    pipe.parametrize()
    tica_out = tica.get_output()
    # the replacement must actually have taken effect
    self.assertFalse(np.array_equal(pca_out[0], tica_out[0]),
                     "The output should not be the same when the method got replaced.")
def test_covariances_and_eigenvalues(self):
    """Estimated cov / cov_tau must match the analytic solution for an
    oscillatory signal, and all eigenvalues must be bounded by one."""
    reader = FeatureReader(self.trajnames, self.temppdb, chunksize=10000)
    for lag in [1, 11, 101, 1001, 2001]:  # avoid cos(w*tau)==0
        model = api.tica(data=reader, dim=self.dim, lag=lag)
        log.info('number of trajectories reported by tica %d' % model.number_of_trajectories())
        log.info('tau = %d corresponds to a number of %f cycles' % (lag, self.w*lag/(2.0*np.pi)))
        # analytical solution for C_ij(lag) is 0.5*A[i]*A[j]*cos(phi[i]-phi[j])*cos(w*lag)
        expected_cov = 0.5 * self.A[:, np.newaxis] * self.A \
            * np.cos(self.phi[:, np.newaxis] - self.phi)
        expected_cov_tau = expected_cov * np.cos(self.w * lag)
        self.assertTrue(np.allclose(expected_cov, model.cov, atol=1.E-3))
        self.assertTrue(np.allclose(expected_cov_tau, model.cov_tau, atol=1.E-3))
        log.info('max. eigenvalue: %f' % np.max(model.eigenvalues))
        self.assertTrue(np.all(model.eigenvalues <= 1.0))
def test_is_parametrized(self):
    """With run=False the pipeline stays unparametrized until
    parametrize() is called explicitly."""
    # construct a pipeline with all available transformer kinds
    stages = [
        api.source(self.traj_files, top=self.pdb_file),
        api.tica(),
        api.pca(),
        api.cluster_kmeans(k=50),
        api.cluster_regspace(dmin=50),
        api.cluster_uniform_time(k=20),
    ]
    pipe = api.pipeline(stages, run=False)
    self.assertFalse(pipe._is_parametrized(),
                     "If run=false, the pipeline should not be parametrized.")
    pipe.parametrize()
    self.assertTrue(pipe._is_parametrized(),
                    "If parametrized was called, the pipeline should be parametrized.")
def test_dimension(self):
    """dimension() honors dim/var_cutoff and rejects invalid settings."""
    assert types.is_int(self.tica_obj.dimension())
    assert self.tica_obj.dimension() == 1
    # full variance keeps both dimensions
    model = api.tica(data=self.X, lag=self.lag, dim=-1, var_cutoff=1.0)
    assert model.dimension() == 2
    # a 90% cutoff reduces to one dimension
    model = api.tica(data=self.X, lag=self.lag, dim=-1, var_cutoff=0.9)
    assert model.dimension() == 1
    # setting both dim and var_cutoff at once is forbidden
    with self.assertRaises(ValueError):
        api.tica(data=self.X, lag=self.lag, dim=1, var_cutoff=0.9)
    # var_cutoff must lie in (0, 1]
    with self.assertRaises(ValueError):
        api.tica(lag=self.lag, var_cutoff=0)
    with self.assertRaises(ValueError):
        api.tica(lag=self.lag, var_cutoff=1.1)
def setUpClass(cls):
    """Generate a 2-state Gaussian-HMM trajectory and fit 1-D TICA."""
    import pyemma.msm.generation as msmgen

    # two-state HMM with two Gaussian output distributions
    cls.P = np.array([[0.99, 0.01],
                      [0.01, 0.99]])
    cls.T = 10000
    means = [np.array([-1, 1]), np.array([1, -1])]
    widths = [np.array([0.3, 2]), np.array([0.3, 2])]
    # continuous observable trajectory, driven by the hidden state path
    cls.X = np.zeros((cls.T, 2))
    dtraj = msmgen.generate_traj(cls.P, cls.T)
    for t in range(cls.T):
        state = dtraj[t]
        cls.X[t, 0] = widths[state][0] * np.random.randn() + means[state][0]
        cls.X[t, 1] = widths[state][1] * np.random.randn() + means[state][1]
    cls.lag = 10
    cls.tica_obj = api.tica(data=cls.X, lag=cls.lag, dim=1)
def test_covariances_and_eigenvalues(self):
    """Re-estimating one TICA object at several lag times must match the
    analytic cov / cov_tau and keep eigenvalues bounded by one."""
    reader = FeatureReader(self.trajnames, self.temppdb)
    model = api.tica(data=reader, dim=self.dim, lag=1)
    for lag in [1, 11, 101, 1001, 2001]:  # avoid cos(w*tau)==0
        log.info('number of trajectories reported by tica %d' % model.number_of_trajectories())
        log.info('tau = %d corresponds to a number of %f cycles' % (lag, self.w*lag/(2.0*np.pi)))
        # re-estimate the same object at the new lag time
        model.lag = lag
        model.parametrize()
        # analytical solution for C_ij(lag) is 0.5*A[i]*A[j]*cos(phi[i]-phi[j])*cos(w*lag)
        expected_cov = 0.5 * self.A[:, np.newaxis] * self.A \
            * np.cos(self.phi[:, np.newaxis] - self.phi)
        expected_cov_tau = expected_cov * np.cos(self.w * lag)
        self.assertTrue(np.allclose(expected_cov, model.cov, atol=1.E-3))
        self.assertTrue(np.allclose(expected_cov_tau, model.cov_tau, atol=1.E-3))
        log.info('max. eigenvalue: %f' % np.max(model.eigenvalues))
        self.assertTrue(np.all(model.eigenvalues <= 1.0))
def testChunksizeResultsTica(self):
    """Estimated mean and covariance must not depend on the chunk size."""
    chunk = 40
    lag = 100
    np.random.seed(0)
    data = np.random.randn(23000, 3)
    # reference: estimate on the un-chunked data
    reference = api.tica(data=DataInMemory(data), lag=lag, dim=1)
    ref_cov = reference.cov.copy()
    ref_mean = reference.mu.copy()
    # estimate again with an explicit (small) chunksize
    chunked = DataInMemory(data)
    chunked.chunksize = chunk
    model = tica(data=chunked, lag=lag, dim=1)
    np.testing.assert_allclose(model.mu, ref_mean)
    np.testing.assert_allclose(model.cov, ref_cov)
def testChunksizeResultsTica(self):
    """Estimated mean and covariance must not depend on the chunk size."""
    chunk = 40
    lag = 100
    np.random.seed(0)
    data = np.random.randn(23000, 3)
    # reference: estimate on the un-chunked data
    reference = api.tica(data=DataInMemory(data), lag=lag, dim=1)
    ref_cov = reference.cov.copy()
    ref_mean = reference.mean.copy()
    # estimate again with an explicit (small) chunksize
    chunked = DataInMemory(data)
    chunked.chunksize = chunk
    model = tica(data=chunked, lag=lag, dim=1)
    np.testing.assert_allclose(model.mean, ref_mean)
    np.testing.assert_allclose(model.cov, ref_cov)
def setUpClass(cls):
    # Build a seeded 2-state Gaussian-HMM trajectory and fit an unscaled
    # (kinetic_map=False) 1-D TICA on it for use by the test methods.
    # NOTE: the draw order inside the seeded block is behavior-critical.
    with numpy_random_seed(123):
        import msmtools.generation as msmgen

        # generate HMM with two Gaussians
        cls.P = np.array([[0.99, 0.01],
                          [0.01, 0.99]])
        cls.T = 40000
        means = [np.array([-1, 1]), np.array([1, -1])]
        widths = [np.array([0.3, 2]), np.array([0.3, 2])]
        # continuous trajectory
        cls.X = np.zeros((cls.T, 2))
        # hidden trajectory
        dtraj = msmgen.generate_traj(cls.P, cls.T)
        for t in range(cls.T):
            s = dtraj[t]
            cls.X[t, 0] = widths[s][0] * np.random.randn() + means[s][0]
            cls.X[t, 1] = widths[s][1] * np.random.randn() + means[s][1]
        cls.lag = 10
        # do unscaled TICA
        reader = api.source(cls.X, chunk_size=0)
        cls.tica_obj = api.tica(data=reader, lag=cls.lag, dim=1, kinetic_map=False)
def test_covariances_and_eigenvalues(self):
    """With eigenvalues forced <= 1: output ICs must be decorrelated with
    time-lagged autocovariances equal to the eigenvalues."""
    reader = FeatureReader(self.trajnames, self.temppdb)
    model = TICA(lag=1, output_dimension=self.dim, force_eigenvalues_le_one=True)
    model.data_producer = reader
    for tau in [1, 10, 100, 1000, 2000]:
        log.info('number of trajectories reported by tica %d' % model.number_of_trajectories())
        # re-estimate at the new lag time
        model.lag = tau
        model.parametrize()
        output = model.get_output()
        log.info('max. eigenvalue: %f' % np.max(model.eigenvalues))
        self.assertTrue(np.all(model.eigenvalues <= 1.0))
        # re-estimate TICA on the ICs and check their statistics
        check = tica(data=output, lag=tau, dim=self.dim,
                     force_eigenvalues_le_one=True)
        check.parametrize()
        self.assertTrue(np.allclose(np.eye(self.dim), check.cov))
        # expected time-lagged covariance: diagonal of eigenvalues
        expected_cov_tau = np.zeros((self.dim, self.dim))
        expected_cov_tau[np.diag_indices(self.dim)] = model.eigenvalues
        self.assertTrue(np.allclose(expected_cov_tau, check.cov_tau))
def test_transfomer_random_access(self):
    """All random-access indexing modes must agree with get_output(),
    both in-memory and streaming."""
    for in_memory in (True, False):
        for r in range(2):
            reader = self._get_reader_instance(r)
            model = coor.tica(reader, dim=3)
            model.in_memory = in_memory
            out = model.get_output()
            # linear random access
            np.testing.assert_array_equal(
                np.squeeze(model.ra_linear[0:2, 0]), out[0][0:2, 0])
            # linear itraj random access
            np.testing.assert_array_equal(
                np.squeeze(model.ra_itraj_linear[0, :12, 0]), out[0][:12, 0])
            # jagged random access (negative stride)
            jagged = model.ra_itraj_jagged[:, ::-3, 0]
            for i, frames in enumerate(jagged):
                np.testing.assert_array_equal(frames, out[i][::-3, 0])
            # cuboid random access
            cube = model.ra_itraj_cuboid[:, 0, 0]
            for i in range(3):
                np.testing.assert_array_equal(cube[i], out[i][0, 0])
def test_fit_transform(self):
    """fit_transform must agree with the pipeline-style API output."""
    data = np.random.randn(100, 2)
    model = _internal_tica(1, 1)
    transformed = model.fit_transform(data)
    reference = api.tica(data=data, lag=1, dim=1).get_output()[0]
    np.testing.assert_array_almost_equal(transformed, reference)
def test_with_pipeline_time_lagged(self):
    """TICA works as the transform stage of a discretizer pipeline."""
    reader = feature_reader(self.trajfile, self.topfile)
    transform = tica(dim=2, lag=1)
    pipe = discretizer(reader, transform)
    pipe.parametrize()
def test_with_skip(self):
    """The skip parameter must be accepted without raising."""
    data = np.random.random((100, 10))
    api.tica(data, lag=10, dim=1, skip=1)
def test_constant_column_tica(self):
    """With kinetic map and full variance cutoff, the TICA dimension
    equals the sparsifier's output dimension."""
    model = tica(self.sparsifier, kinetic_map=True, var_cutoff=1)
    self.assertEqual(model.dimension(), self.sparsifier.dimension())