def test_add_element(self):
    """Adding stages to a pipeline must leave earlier stages' output unchanged.

    A pipeline is grown stage by stage (reader, k-means, second k-means),
    parametrized after every addition, and the outputs of the reader and of
    the first k-means are compared before and after later additions.
    """
    # empty pipeline, auto-parametrization switched off
    pipe = api.pipeline([], run=False)

    # stage 1: the reader; fetch its output right away
    reader = api.source(self.traj_files, top=self.pdb_file)
    pipe.add_element(reader)
    pipe.parametrize()
    reader_before = reader.get_output()

    # stage 2: k-means; fetch its output and the reader output again
    kmeans = api.cluster_kmeans(k=15)
    pipe.add_element(kmeans)
    pipe.parametrize()
    kmeans_before = kmeans.get_output()
    reader_after = reader.get_output()

    # stage 3: another clustering; fetch the first k-means output again
    pipe.add_element(api.kmeans(k=2))
    pipe.parametrize()
    kmeans_after = kmeans.get_output()

    # check if add_element changes the intermediate results
    comparisons = (
        (reader_before, reader_after),
        (kmeans_before, kmeans_after),
    )
    for before, after in comparisons:
        for idx in (0, 1):
            np.testing.assert_array_equal(before[idx], after[idx])
def test_read_multiple_files_topology_file(self):
    """A reader built from several trajectories and a topology keeps both."""
    expected_top = self.pdb_file
    reader = api.source(self.traj_files, top=expected_top)

    self.assertIsNotNone(reader, "The reader should not be none.")
    self.assertEqual(
        reader.topfile, expected_top,
        "Reader topology file and input topology file should coincide.")
    self.assertListEqual(
        reader.trajfiles, self.traj_files,
        "Reader trajectories and input trajectories should coincide.")
    self.assertEqual(
        reader.featurizer.topologyfile, expected_top,
        "Featurizers topology file and input topology file should coincide.")
def test_no_cluster(self):
    """Pipelines without a clustering stage can be built and queried."""
    reader_xtc = api.source(self.traj_files, top=self.pdb_file)

    # the reader on its own
    api.pipeline(reader_xtc)
    reader_xtc.get_output()

    # the reader followed by a single transformer: TICA first, then PCA
    tica = api.tica()
    pca = api.pca()
    for transformer in (tica, pca):
        pipe = api.pipeline([reader_xtc, transformer])
        last_stage = pipe._chain[-1]
        last_stage.get_output()
def test_read_single_file_featurizer(self):
    """A reader built from one file and a featurizer exposes file and topology."""
    first_traj = self.traj_files[0]
    featurizer = MDFeaturizer(self.pdb_file)
    reader = api.source(first_traj, features=featurizer)

    self.assertIsNotNone(reader, "The reader should not be none.")
    self.assertEqual(
        reader.topfile, self.pdb_file,
        "Reader topology file and input topology file should coincide.")
    self.assertListEqual(
        reader.trajfiles, [first_traj],
        "Reader trajectories and input trajectories should coincide.")
    self.assertEqual(
        reader.featurizer.topologyfile, self.pdb_file,
        "Featurizers topology file and input topology file should coincide.")
def testIteratorAccess(self):
    """Iterating the reader yields all frames with the input coordinates."""
    reader = api.source(self.trajfile, top=self.topfile)

    # collect every chunk, then count the frames they contain
    chunks = [chunk for _, chunk in reader]
    n_frames = sum(chunk.shape[0] for chunk in chunks)

    # restore shape of input
    restored = np.array(chunks).reshape(self.xyz.shape)

    self.assertEqual(n_frames, reader.trajectory_lengths()[0])
    self.assertTrue(np.allclose(restored, self.xyz))
def test_set_element(self):
    """Replacing a stage un-parametrizes the pipeline and changes its output."""
    reader = api.source(self.traj_files, top=self.pdb_file)
    pca = api.pca()
    pipe = api.pipeline([reader, pca])
    self.assertTrue(pipe._is_parametrized())
    output_pca = pca.get_output()

    # replace pca with tica
    tica = api.tica(lag=self.generated_lag)
    pipe.set_element(1, tica)
    self.assertFalse(
        pipe._is_parametrized(),
        "After replacing an element, the pipeline should not be parametrized.")

    pipe.parametrize()
    output_tica = tica.get_output()

    # check if replacement actually happened
    outputs_identical = np.array_equal(output_pca[0], output_tica[0])
    self.assertFalse(
        outputs_identical,
        "The output should not be the same when the method got replaced.")
def test_replace_data_source(self):
    """Swapping the pipeline's data source must change the clustering result.

    A k-means stage is first fed by the file-based reader, then the reader
    is replaced by an in-memory source holding ``self.generated_data`` and
    the pipeline is parametrized again.
    """
    reader_xtc = api.source(self.traj_files, top=self.pdb_file)
    reader_gen = DataInMemory(data=self.generated_data)

    kmeans = api.kmeans(k=10)
    assert hasattr(kmeans, '_chunks')
    p = api.pipeline([reader_xtc, kmeans])
    out1 = kmeans.get_output()

    # replace source
    # print() call form: valid on Python 3 and prints the same text on
    # Python 2, unlike the former print statement.
    print(reader_gen)
    p.set_element(0, reader_gen)
    # the attribute must survive the replacement of the upstream element
    assert hasattr(kmeans, '_chunks')
    p.parametrize()
    out2 = kmeans.get_output()

    self.assertFalse(
        np.array_equal(out1, out2),
        "Data source changed, so should the resulting clusters.")
def test_is_parametrized(self):
    """A pipeline built with run=False is parametrized only after parametrize()."""
    # construct pipeline with all possible transformers
    stages = [
        api.source(self.traj_files, top=self.pdb_file),
        api.tica(),
        api.pca(),
        api.cluster_kmeans(k=50),
        api.cluster_regspace(dmin=50),
        api.cluster_uniform_time(k=20),
    ]
    pipe = api.pipeline(stages, run=False)

    self.assertFalse(
        pipe._is_parametrized(),
        "If run=false, the pipeline should not be parametrized.")

    pipe.parametrize()

    self.assertTrue(
        pipe._is_parametrized(),
        "If parametrized was called, the pipeline should be parametrized.")
def testIteratorAccess(self):
    """Iterating a FeatureReader yields ndarray chunks covering the input.

    Checks the reader type, the type of every chunk, the total frame count
    against ``trajectory_lengths()[0]`` and the stacked chunks against
    ``self.xyz`` flattened to 9 columns per frame.
    """
    reader = api.source(self.trajfile, top=self.topfile)
    assert isinstance(reader, FeatureReader)

    frames = 0
    data = []
    for _, X in reader:
        assert isinstance(X, np.ndarray)
        frames += X.shape[0]
        data.append(X)

    self.assertEqual(frames, reader.trajectory_lengths()[0])

    # The chunks stay stacked as 2-d rows; the reference is flattened to the
    # same layout. (A former ``data.reshape(...)`` call discarded its result
    # and therefore had no effect.)
    data = np.vstack(data)
    self.assertTrue(np.allclose(data, self.xyz.reshape(-1, 9)))
def test_lagged_access_small_files(self):
    """ itraj 0 should be skipped, since it is too short."""
    top = self.topfile
    # two trajectories of 10 and 20 frames; the lag used below is 11
    trajs = [
        create_traj(top=top, length=n_frames, format='.xtc', dir=self.tmpdir)[0]
        for n_frames in (10, 20)
    ]
    reader = source(trajs, top=top)

    shapes_by_traj = {}
    it = reader.iterator(lag=11, chunk=0)
    with it:
        for itraj, x, y in it:
            shapes_by_traj[itraj] = (x.shape, y.shape)

    self.assertNotIn(0, shapes_by_traj)
    self.assertIn(1, shapes_by_traj)
def test_is_parametrized(self):
    """A pipeline built with run=False is estimated only after parametrize()."""
    # construct pipeline with all possible transformers
    reader = api.source(self.traj_files, top=self.pdb_file)
    transformers = [
        api.tica(),
        api.pca(),
        api.cluster_kmeans(k=50),
        api.cluster_regspace(dmin=50),
        api.cluster_uniform_time(k=20),
    ]
    pipe = api.pipeline([reader] + transformers, run=False)

    estimated_before = pipe._is_estimated()
    self.assertFalse(
        estimated_before,
        "If run=false, the pipeline should not be parametrized.")

    pipe.parametrize()

    estimated_after = pipe._is_estimated()
    self.assertTrue(
        estimated_after,
        "If parametrized was called, the pipeline should be parametrized.")
def test_old_db_conversion(self):
    """Looking up a file in a version-0 database yields a full info entry."""
    # prior 2.1, database only contained lengths (int as string) entries
    # check conversion is happening
    with NamedTemporaryFile(suffix='.npy', delete=False) as tmp:
        db = TrajectoryInfoCache(None)
        npy_path = tmp.name
        np.save(npy_path, [1, 2, 3])
        tmp.close()  # windows sucks
        reader = api.source(npy_path)
        file_hash = db._get_file_hash(npy_path)

        from pyemma.coordinates.data.util.traj_info_backends import DictDB
        legacy_backend = DictDB()
        db._database = legacy_backend
        db._database.db_version = 0

        info = db[npy_path, reader]
        assert info.length == 3
        assert info.ndim == 1
        assert info.offsets == []
def test_lagged_stridden_access(self):
    """Lagged, strided iteration yields ``xyz[lag::stride]`` per trajectory."""
    reader = api.source([self.trajfile, self.trajfile2], top=self.topfile)
    reader.chunksize = 210
    err_msg = "not equal for stride=%i, lag=%i"

    for stride in [2, 3, 5, 7, 15]:
        for lag in [1, 3, 7, 10, 30]:
            # gather the lagged chunks of every trajectory
            n_traj = reader.number_of_trajectories()
            chunks = dict((itraj, []) for itraj in range(n_traj))
            for itraj, _, lagged in reader.iterator(stride=stride, lag=lag):
                chunks[itraj].append(lagged)

            # compare each trajectory against its flattened reference
            for itraj, xyz in enumerate((self.xyz, self.xyz2)):
                chunks[itraj] = np.vstack(chunks[itraj])
                expected = xyz.reshape(-1, 9)[lag::stride]
                np.testing.assert_almost_equal(
                    chunks[itraj], expected,
                    err_msg=err_msg % (stride, lag))
def test_fragmented_reader_random_access1(self):
    """Random-access striding over a fragmented reader returns the right frames.

    Three trajectories are created; the input mixes one two-fragment
    trajectory with two plain ones. For several chunk sizes the frames
    selected by ``self.stride`` are collected as trajectory objects, joined
    per trajectory and compared with frames loaded through mdtraj.
    """
    from collections import defaultdict

    with TemporaryDirectory() as td:
        trajfiles = [
            create_traj(start=i * 10, dir=td, length=20)[0] for i in range(3)
        ]
        topfile = get_top()
        trajfiles = [(trajfiles[0], trajfiles[1]), trajfiles[0], trajfiles[2]]

        source = coor.source(trajfiles, top=topfile)
        assert isinstance(source, FragmentedTrajectoryReader)

        for r in source._readers:
            # A bare reader has to be wrapped so the loop below can iterate
            # it; indexing it (the former ``r[0]``) cannot give a sequence
            # of readers.
            if not isinstance(r, (list, tuple)):
                r = [r]
            for _r in r:
                _r._return_traj_obj = True

        for chunksize in [0, 2, 3, 100000]:
            frames = defaultdict(list)
            with source.iterator(chunk=chunksize, return_trajindex=True,
                                 stride=self.stride) as it:
                for itraj, t in it:
                    frames[itraj].append(t)

            # join the chunks of every trajectory into one trajectory object
            dest = []
            for itraj in frames:
                dest.append(frames[itraj][0])
                for t in frames[itraj][1:]:
                    dest[-1] = dest[-1].join(t)

            # first column of self.stride is used as the trajectory index,
            # second column as the frame index
            keys = np.unique(self.stride[:, 0])
            for i, coords in enumerate(dest):
                if i in keys:
                    traj = mdtraj.load(trajfiles[i], top=topfile)
                    wanted = np.array(self.stride[self.stride[:, 0] == i][:, 1])
                    np.testing.assert_equal(
                        coords.xyz, traj.xyz[wanted],
                        err_msg="not equal for chunksize=%s" % chunksize)
def test_set_element(self):
    """Replacing a stage resets the estimated state and changes the output."""
    reader = api.source(self.traj_files, top=self.pdb_file)
    pca = api.pca()
    pipeline = api.pipeline([reader, pca])
    self.assertTrue(pipeline._is_estimated())
    before = pca.get_output()

    # replace pca with tica
    tica = api.tica(lag=self.generated_lag)
    pipeline.set_element(1, tica)
    still_estimated = pipeline._is_estimated()
    self.assertFalse(
        still_estimated,
        "After replacing an element, the pipeline should not be parametrized.")

    pipeline.parametrize()
    after = tica.get_output()

    # check if replacement actually happened
    self.assertFalse(
        np.array_equal(before[0], after[0]),
        "The output should not be the same when the method got replaced.")
def test_cols_with_features(self):
    """Column selection on a featurized reader matches mdtraj distances."""
    test_pkg = 'pyemma.coordinates.tests'
    xtc_pattern = pkg_resources.resource_filename(test_pkg, 'data/bpti_mini.xtc')
    trajs = glob.glob(xtc_pattern)
    top = pkg_resources.resource_filename(test_pkg, 'data/bpti_ca.pdb')

    reader = api.source(trajs, top=top)
    feat = reader.featurizer
    ca_atoms = feat.select('name CA')
    inds = feat.pairs(ca_atoms)
    reader.featurizer.add_distances(inds)

    # columns 0 and 2 of the distance features
    cols = np.array((0, 2))

    # reference: the same distances computed directly, restricted to cols
    loaded = mdtraj.load(trajs, top=top)
    ref = mdtraj.compute_distances(loaded, inds)[:, cols]

    it = reader.iterator(chunk=0, return_trajindex=False, cols=cols)
    with it:
        for chunk in it:
            np.testing.assert_equal(chunk, ref)
def setUpClass(cls):
    """Create a seeded two-state Gaussian trajectory and TICA references.

    Stores the trajectory, the lag time, mean-free copies of the data, the
    instantaneous and time-lagged reference matrices (symmetrized and
    non-symmetrized "nr" variants) and two TICA objects on the class.
    """
    with numpy_random_seed(123):
        import msmtools.generation as msmgen

        # generate HMM with two Gaussians
        cls.P = np.array([[0.99, 0.01], [0.01, 0.99]])
        cls.T = 40000
        means = [np.array([-1, 1]), np.array([1, -1])]
        widths = [np.array([0.3, 2]), np.array([0.3, 2])]

        # continuous trajectory, filled frame by frame from the hidden one
        cls.X = np.zeros((cls.T, 2))
        dtraj = msmgen.generate_traj(cls.P, cls.T)
        for t in range(cls.T):
            state = dtraj[t]
            # one draw per coordinate, first column before second
            for d in (0, 1):
                cls.X[t, d] = widths[state][d] * np.random.randn() + means[state][d]

        # Set the lag time:
        cls.lag = 10
        lag = cls.lag
        n_single = float(cls.T - cls.lag)
        n_double = float(2 * (cls.T - cls.lag))

        # Compute mean free data:
        head = cls.X[:-lag, :]
        tail = cls.X[lag:, :]
        mref = (np.sum(head, axis=0) + np.sum(tail, axis=0)) / n_double
        mref_nr = np.sum(head, axis=0) / n_single
        cls.X_mf = cls.X - mref[None, :]
        cls.X_mf_nr = cls.X - mref_nr[None, :]

        # Compute correlation matrices:
        mf_head = cls.X_mf[:-lag, :]
        mf_tail = cls.X_mf[lag:, :]
        nr_head = cls.X_mf_nr[:-lag, :]
        nr_tail = cls.X_mf_nr[lag:, :]
        cls.cov_ref = (np.dot(mf_head.T, mf_head) +
                       np.dot(mf_tail.T, mf_tail)) / n_double
        cls.cov_ref_nr = np.dot(nr_head.T, nr_head) / n_single
        cls.cov_tau_ref = (np.dot(mf_head.T, mf_tail) +
                           np.dot(mf_tail.T, mf_head)) / n_double
        cls.cov_tau_ref_nr = np.dot(nr_head.T, nr_tail) / n_single

        # do unscaled TICA
        reader = api.source(cls.X, chunk_size=0)
        cls.tica_obj = api.tica(
            data=reader, lag=cls.lag, dim=1, kinetic_map=False)
        # non-reversible TICA
        cls.tica_obj_nr = api.tica(
            data=reader, lag=cls.lag, dim=1, kinetic_map=False,
            reversible=False)
def setUpClass(cls):
    """Create a seeded two-state Gaussian trajectory and run unscaled TICA."""
    with numpy_random_seed(123):
        import msmtools.generation as msmgen

        # generate HMM with two Gaussians
        cls.P = np.array([[0.99, 0.01], [0.01, 0.99]])
        cls.T = 40000
        means = [np.array([-1, 1]), np.array([1, -1])]
        widths = [np.array([0.3, 2]), np.array([0.3, 2])]

        # continuous trajectory, filled frame by frame from the hidden one
        cls.X = np.zeros((cls.T, 2))
        dtraj = msmgen.generate_traj(cls.P, cls.T)
        for t in range(cls.T):
            hidden = dtraj[t]
            width, mean = widths[hidden], means[hidden]
            # one draw per coordinate, first column before second
            cls.X[t, 0] = width[0] * np.random.randn() + mean[0]
            cls.X[t, 1] = width[1] * np.random.randn() + mean[1]

        cls.lag = 10

        # do unscaled TICA
        reader = api.source(cls.X, chunk_size=0)
        cls.tica_obj = api.tica(
            data=reader, lag=cls.lag, dim=1, kinetic_map=False)
def test_RA_high_stride(self):
    """ ensure we use a random access pattern for high strides chunksize combinations to avoid memory issues."""
    # NOTE(review): this method was reconstructed from a whitespace-collapsed
    # line. The extent of the ``with patch(...)`` block is inferred (the
    # non-random-access checks are placed outside it) -- confirm against the
    # original file.
    from pyemma.coordinates.util.patches import iterload
    n = int(1e5)
    n_bytes = 3 * 3 * 8 * n # ~8Mb
    savable_formats_mdtra_18 = ('.xtc', '.trr', '.dcd', '.h5', '.binpos', '.nc', '.netcdf', '.ncdf', '.tng')
    # one trajectory file per format, each run through the same checks
    for ext in savable_formats_mdtra_18:
        traj = create_traj(length=n, dir=self.tmpdir, format=ext)[0]
        from unittest.mock import patch
        # temporarily overwrite the memory cutoff with a smaller value, to trigger the switch to RA stride.
        with patch( 'pyemma.coordinates.util.patches.iterload.MEMORY_CUTOFF', n_bytes - 1):
            r = coor.source(traj, top=get_top())
            it = r.iterator(stride=1000, chunk=100000)
            next(it)
            # either the optimization is globally switched off, or the
            # underlying iterator reports random access
            assert iterload._DEACTIVATE_RANDOM_ACCESS_OPTIMIZATION or it._mditer.is_ra_iter
            out_ra = r.get_output(stride=1000, chunk=10000)
        # stride=1 is expected not to use the random access iterator
        it = r.iterator(stride=1)
        next(it)
        assert iterload._DEACTIVATE_RANDOM_ACCESS_OPTIMIZATION or not it._mditer.is_ra_iter
        # both outputs use stride=1000 and must agree
        out = r.get_output(stride=1000)
        np.testing.assert_equal(out_ra, out)
        # check max stride exceeding
        it = r.iterator(stride=iterload.MAX_STRIDE_SWITCH_TO_RA + 1)
        next(it)
        assert iterload._DEACTIVATE_RANDOM_ACCESS_OPTIMIZATION or it._mditer.is_ra_iter
        # a stride exactly at the threshold must not switch
        it = r.iterator(stride=iterload.MAX_STRIDE_SWITCH_TO_RA)
        next(it)
        assert iterload._DEACTIVATE_RANDOM_ACCESS_OPTIMIZATION or not it._mditer.is_ra_iter
def testTimeLaggedAccess(self):
    """Lagged chunks equal the input coordinates shifted by the lag."""
    # each frame has 2 atoms with 3 coords = 6 coords per frame.
    # coords are sequential through all frames and start with 0.
    for lag in [2, 200]:
        for chunksize in [1, 100]:
            log.info("chunksize=%i\tlag=%i" % (chunksize, lag))

            reader = api.source(self.trajfile, top=self.topfile)
            reader.chunksize = chunksize
            lagged_chunks = [y for _, _, y in reader.iterator(lag=lag)]

            n_frames = self.xyz.shape[0]
            coords = self.xyz.reshape((n_frames, -1))

            for ii, chunk in enumerate(lagged_chunks[:-1]):
                # all despite last chunk shall have chunksize
                self.assertTrue(chunk.shape[0] <= chunksize)
                # chunk ii covers [ii * chunksize + lag, ... + chunksize)
                start = ii * chunksize + lag
                stop = start + chunksize
                np.testing.assert_allclose(chunk, coords[start:stop])

            # TODO: check last lagged frame
            # last lagged chunk should miss "lag" frames of input! e.g
            # padded to maintain chunksize
            last_chunk = lagged_chunks[-1]
def testTimeLaggedAccess(self):
    """Every lagged chunk but the last matches the lag-shifted input slice."""
    # each frame has 2 atoms with 3 coords = 6 coords per frame.
    # coords are sequential through all frames and start with 0.
    combinations = [(lag, chunksize)
                    for lag in [2, 200]
                    for chunksize in [1, 100]]
    for lag, chunksize in combinations:
        log.info("chunksize=%i\tlag=%i" % (chunksize, lag))

        reader = api.source(self.trajfile, top=self.topfile)
        reader.chunksize = chunksize
        lagged_chunks = []
        for _, _, lagged in reader.iterator(lag=lag):
            lagged_chunks.append(lagged)

        coords = self.xyz.reshape((self.xyz.shape[0], -1))

        offset = lag
        for c in lagged_chunks[:-1]:
            # all despite last chunk shall have chunksize
            self.assertTrue(c.shape[0] <= chunksize)
            # the expected window moves forward by chunksize per chunk,
            # starting at the lag
            np.testing.assert_allclose(c, coords[offset:offset + chunksize])
            offset += chunksize

        # TODO: check last lagged frame
        # last lagged chunk should miss "lag" frames of input! e.g
        # padded to maintain chunksize
        last_chunk = lagged_chunks[-1]
def test_exceptions(self):
    """api.source must reject missing, empty and corrupt input files.

    Missing and empty files are expected to raise ValueError with the file
    name in the message; a ``.npy`` file with invalid content is expected
    to raise IOError.
    """
    # inaccessible files
    not_existant = ''.join(
        chr(i) for i in np.random.randint(65, 90, size=10)) + '.npy'
    bad = [not_existant]  # should be unaccessible or non existent
    with self.assertRaises(ValueError) as cm:
        api.source(bad)
    # The message check has to follow the with-block: statements after the
    # raising call inside it never run. str() replaces ``.message``, which
    # exceptions do not have on Python 3.
    self.assertIn(bad[0], str(cm.exception))

    # empty files
    with NamedTemporaryFile(delete=False) as f:
        f.close()
    with self.assertRaises(ValueError) as cm:
        api.source(f.name)
    self.assertIn(f.name, str(cm.exception))

    # bogus files: a .npy suffix with content that is not a valid array.
    # The bytes are written through the temp file itself and the file is
    # closed before reading, so it is never opened twice at once.
    with NamedTemporaryFile(suffix='.npy', delete=False) as f:
        f.write(b'asdf')
    with self.assertRaises(IOError):
        api.source(f.name)
def test_obtain_csv_file_reader_csv(self):
    """api.source on the csv fixture must return a CSVReader."""
    reader = api.source(self.csv)
    self.assertIsNotNone(reader, "Reader object should not be none.")
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)).
    self.assertIsInstance(reader, CSVReader, "Should be a CSVReader.")
def setUpClass(cls):
    """Locate the packaged bpti test files and open a shared reader on them."""
    data_dir = pkg_resources.resource_filename(__name__, 'data') + os.path.sep
    pdb_file = os.path.join(data_dir, 'bpti_ca.pdb')
    xtc_file = os.path.join(data_dir, 'bpti_mini.xtc')

    cls.pdb_file = pdb_file
    cls.xtc_file = xtc_file
    cls.inp = api.source(xtc_file, top=pdb_file)
def test_format_loading_via_feature_reader(self):
    """Smoke test: build a reader for ``traj_file`` and load all its output."""
    # traj_file and top are taken from the enclosing scope
    source(traj_file, top=top, dir=self.tmpdir).get_output()
def setUpClass(cls):
    """Resolve the bpti test files next to this module and open a reader."""
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    cls.pdb_file = os.path.join(data_dir, 'bpti_ca.pdb')
    cls.xtc_file = os.path.join(data_dir, 'bpti_mini.xtc')
    cls.inp = api.source(cls.xtc_file, top=cls.pdb_file)
def test_pdb_traj_unsupported(self):
    """A .pdb file given as trajectory raises ValueError mentioning 'PDB'."""
    with self.assertRaises(ValueError) as c:
        with tempfile.NamedTemporaryFile(suffix='.pdb') as ntf:
            api.source([ntf.name], top=self.bpti_pdbfile)
    message = c.exception.args[0]
    assert 'PDB' in message
def test_no_transform(self):
    """Clustering stages can follow the reader directly, without a transform."""
    reader_xtc = api.source(self.traj_files, top=self.pdb_file)

    # k-means
    kmeans_pipe = api.pipeline([reader_xtc, api.cluster_kmeans(k=10)])
    kmeans_pipe._chain[-1].get_output()

    # regular-space clustering
    regspace_pipe = api.pipeline([reader_xtc, api.cluster_regspace(dmin=10)])
    regspace_pipe._chain[-1].get_output()

    # uniform-time clustering
    uniform_pipe = api.pipeline([reader_xtc, api.cluster_uniform_time()])
    uniform_pipe._chain[-1].get_output()
def test_obtain_csv_file_reader_csv(self):
    """api.source on the csv fixture must return a CSVReader."""
    reader = api.source(self.csv)
    self.assertIsNotNone(reader, "Reader object should not be none.")
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)).
    self.assertIsInstance(reader, CSVReader, "Should be a CSVReader.")
def test_obtain_numpy_file_reader_npz(self):
    """api.source on the npz fixture must return a NumPyFileReader."""
    reader = api.source(self.npz)
    self.assertIsNotNone(reader, "Reader object should not be none.")
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)).
    self.assertIsInstance(
        reader, NumPyFileReader, "Should be a NumPyFileReader.")
def test_obtain_numpy_file_reader_npz(self):
    """api.source on the npz fixture must return a NumPyFileReader."""
    reader = api.source(self.npz)
    self.assertIsNotNone(reader, "Reader object should not be none.")
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)).
    self.assertIsInstance(
        reader, NumPyFileReader, "Should be a NumPyFileReader.")
def test_data_in_mem(self):
    """Building a source from in-memory arrays adds no cache entries."""
    # make sure cache is not used for data in memory!
    block = np.empty((3, 3))
    data = [block, block, block]
    api.source(data)
    entries = self.db.num_entries
    self.assertEqual(entries, 0)
def test_data_in_memory_without_first_two_trajs(self):
    """With ``self.stride2``, output 2 holds only the first frame of data[2]."""
    reader = coor.source(self.data, chunksize=10)
    out = reader.get_output(stride=self.stride2)
    third_output = out[2]
    expected = [self.data[2][0]]
    np.testing.assert_array_almost_equal(third_output, expected)
def test_data_in_mem(self):
    """Building a source from in-memory arrays leaves the database at size 1."""
    # make sure cache is not used for data in memory!
    block = np.empty((3, 3))
    data = [block, block, block]
    api.source(data)
    n_entries = len(self.db._database)
    assert n_entries == 1
def __init__(self, trajectories, topologyfile=None, chunksize=1000, featurizer=None):
    """Create one reader per fragment and the per-trajectory length tables.

    Parameters
    ----------
    trajectories : list or tuple
        Either a flat sequence of inputs, which is treated as the fragments
        of one single trajectory, or a sequence whose items are inputs or
        lists/tuples of inputs; a bare item becomes a one-fragment
        trajectory.
    topologyfile : optional
        Passed as ``top`` to ``pyemma.coordinates.api.source`` for every
        fragment.
    chunksize : int, default 1000
        Passed to the parent constructor and to every fragment reader.
    featurizer : optional
        Passed as ``features`` to ``pyemma.coordinates.api.source``.

    Raises
    ------
    AssertionError
        If ``trajectories`` is not a list/tuple or is empty.
    ValueError
        If the fragment readers do not all report the same ``ndim``.
    """
    self._args = (trajectories, topologyfile, chunksize, featurizer)

    # sanity checks
    assert isinstance(trajectories, (list, tuple)), \
        "input trajectories should be of list or tuple type"
    # some trajectory should be provided. This has to be checked before the
    # normalisation below: an empty input would be wrapped into [[]] there
    # and could no longer be recognised as empty.
    assert len(trajectories) > 0, "no input trajectories provided"

    # if it contains no further list: treat as single trajectory
    if not any(isinstance(traj, (list, tuple)) for traj in trajectories):
        trajectories = [trajectories]
    # if not list of lists, treat as single-element-fragment-trajectory
    trajectories = [
        traj if isinstance(traj, (list, tuple)) else [traj]
        for traj in trajectories
    ]

    # call super
    super(FragmentedTrajectoryReader, self).__init__(chunksize=chunksize)
    self._is_reader = True

    # number of trajectories
    self._ntraj = len(trajectories)

    # store readers: one list of fragment readers per trajectory
    from pyemma.coordinates.api import source
    self._readers = [
        [source(input_item, features=featurizer, top=topologyfile,
                chunksize=chunksize)
         for input_item in fragments]
        for fragments in trajectories
    ]

    # check all readers have same dimension; the first reader's dimension
    # is the reference and the first deviating reader is reported
    all_readers = [
        frag_reader for readers in self._readers for frag_reader in readers
    ]
    if all_readers:
        expected_dim = all_readers[0].ndim
        for frag_reader in all_readers:
            if frag_reader.ndim != expected_dim:
                raise ValueError(
                    "%s has different dimension (%i) than expected (%i)"
                    % (frag_reader.describe(), frag_reader.ndim, expected_dim))

    # file name -> every fragment reader that lists this file name
    from collections import defaultdict
    self._reader_by_filename = defaultdict(list)
    for frag_reader in all_readers:
        for filename in frag_reader.filenames:
            self._reader_by_filename[filename].append(frag_reader)

    # lengths array per reader
    self._reader_lengths = [
        [reader.trajectory_length(0, 1) for reader in readers]
        for readers in self._readers
    ]
    # composite trajectory length
    self._lengths = [sum(lengths) for lengths in self._reader_lengths]
    # mapping reader_index -> cumulative length
    self._cumulative_lengths = [
        np.cumsum(lengths) for lengths in self._reader_lengths
    ]

    # store trajectory files
    self._trajectories = trajectories
    self._filenames = trajectories