def test_numpy_filereader_random_access(self):
    tmpfiles = [tempfile.mktemp(suffix='.npy') for _ in range(len(self.data))]
    try:
        for idx, tmp in enumerate(tmpfiles):
            np.save(tmp, self.data[idx])

        # large enough chunk size
        np_fr = coor.source(tmpfiles, chunksize=10)
        out1 = np_fr.get_output(stride=self.stride)

        # small chunk size
        np_fr = coor.source(tmpfiles, chunksize=1)
        out2 = np_fr.get_output(stride=self.stride)

        # full traj mode
        np_fr = coor.source(tmpfiles, chunksize=0)
        out3 = np_fr.get_output(stride=self.stride)

        for idx in np.unique(self.stride[:, 0]):
            np.testing.assert_array_almost_equal(
                self.data[idx][self.stride[self.stride[:, 0] == idx][:, 1]], out1[idx])
            np.testing.assert_array_almost_equal(out1[idx], out2[idx])
            np.testing.assert_array_almost_equal(out2[idx], out3[idx])
    finally:
        for tmp in tmpfiles:
            try:
                os.unlink(tmp)
            except EnvironmentError:
                pass
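# A minimal sketch of the random-access stride convention exercised above,
# assuming (as the indexing self.stride[:, 0] / self.stride[:, 1] suggests)
# an (N, 2) integer array of (trajectory_index, frame_index) pairs; the
# name ra_indices is hypothetical:
#
#     ra_indices = np.array([[0, 1], [0, 5],    # frames 1 and 5 of trajectory 0
#                            [2, 0], [2, 7]])   # frames 0 and 7 of trajectory 2
#     out = coor.source(tmpfiles, chunksize=10).get_output(stride=ra_indices)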
def test_bullshit_csv(self):
    # this file is not parseable as a tabulated float file
    with self.assertRaises(Exception) as r:
        api.source(self.bs)
    # depending on whether the traj info cache is switched on,
    # we get one of these exception types.
    self.assertIsInstance(r.exception, (IOError, ValueError))
    self.assertIn('could not parse', str(r.exception))
def test_source_set_chunksize(self):
    x = np.zeros(10)
    r = api.source(x, chunksize=1)
    assert r.chunksize == 1

    r2 = api.source(r, chunksize=2)
    assert r2 is r
    assert r2.chunksize == 2

    # reset to default chunk size.
    r3 = api.source(r, chunksize=None)
    assert r3.chunksize is not None
def test_in_memory_with_stride(self):
    # map "results" to memory
    reader = api.source(self.trajfile, top=self.topfile)
    reader.in_memory = True

    mem_it = reader.iterator(stride=2, chunk=0, return_trajindex=False)
    assert isinstance(mem_it, DataInMemoryIterator)
    mem_data = [X for X in mem_it]

    reader2 = api.source(self.trajfile, top=self.topfile)
    out = reader2.get_output(stride=2)

    np.testing.assert_equal(mem_data[0], out[0])
def test_fragmented_reader_random_access(self):
    with TemporaryDirectory() as td:
        trajfiles = []
        for i in range(3):
            trajfiles.append(create_traj(start=i * 10, dir=td, length=20)[0])
        topfile = get_top()
        # the tuple groups two files into one fragmented trajectory
        trajfiles = [trajfiles[0], (trajfiles[0], trajfiles[1]), trajfiles[2]]

        source = coor.source(trajfiles, top=topfile)
        assert isinstance(source, FragmentedTrajectoryReader)

        for chunksize in [0, 2, 3, 100000]:
            out = source.get_output(stride=self.stride, chunk=chunksize)
            keys = np.unique(self.stride[:, 0])
            for i, coords in enumerate(out):
                if i in keys:
                    traj = mdtraj.load(trajfiles[i], top=topfile)
                    np.testing.assert_equal(
                        coords,
                        traj.xyz[np.array(
                            self.stride[self.stride[:, 0] == i][:, 1])].reshape(-1, 3 * 3))
def test_lagged_stridden_access(self):
    reader = api.source([self.trajfile, self.trajfile2], top=self.topfile)
    reader.chunksize = 210
    strides = [2, 3, 5, 7, 15]
    lags = [1, 3, 7, 10, 30]
    err_msg = "not equal for stride=%i, lag=%i"
    for stride in strides:
        for lag in lags:
            chunks = {itraj: []
                      for itraj in range(reader.number_of_trajectories())}
            for itraj, _, Y in reader.iterator(stride=stride, lag=lag):
                chunks[itraj].append(Y)

            chunks[0] = np.vstack(chunks[0])
            np.testing.assert_almost_equal(
                chunks[0], self.xyz.reshape(-1, 9)[lag::stride],
                err_msg=err_msg % (stride, lag))

            chunks[1] = np.vstack(chunks[1])
            np.testing.assert_almost_equal(
                chunks[1], self.xyz2.reshape(-1, 9)[lag::stride],
                err_msg=err_msg % (stride, lag))
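# Note on the reference slices above: with lag l and stride s, the lagged
# iterator yields (itraj, X, Y) chunks whose Y stream visits frames
# l, l + s, l + 2*s, ... of each trajectory -- hence the comparison of the
# stacked Y chunks against xyz.reshape(-1, 9)[lag::stride].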
def test_in_memory(self):
    data = np.random.random((100, 10))
    reader = api.source(data)
    tica_obj = api.tica(reader, lag=10, dim=1)
    tica_obj.in_memory = True
    tica_obj.get_output()
def test_with_data_in_mem(self):
    import pyerna.coordinates as api

    data = [np.random.random((100, 50)),
            np.random.random((103, 50)),
            np.random.random((33, 50))]
    reader = source(data)
    assert isinstance(reader, DataInMemory)

    tpca = api.pca(dim=2)

    n_centers = 10
    km = api.cluster_kmeans(k=n_centers)

    disc = api.discretizer(reader, tpca, km)
    disc.parametrize()

    dtrajs = disc.dtrajs
    for dtraj in dtrajs:
        n_states = np.max(np.unique(dtraj))
        self.assertGreaterEqual(n_centers - 1, n_states,
                                "dtraj has more states than cluster centers")
def test_save_dtrajs(self):
    reader = source(self.trajfiles, top=self.topfile)
    cluster = cluster_kmeans(k=2)
    d = Discretizer(reader, cluster=cluster)
    d.parametrize()
    d.save_dtrajs(output_dir=self.dest_dir)

    dtrajs = os.listdir(self.dest_dir)
def _test_ra_with_format(format, stride):
    from pyerna.coordinates.tests.test_featurereader import create_traj

    topfile = pkg_resources.resource_filename(__name__, 'data/test.pdb')
    trajfiles = []
    for _ in range(3):
        f, _, _ = create_traj(topfile, format=format)
        trajfiles.append(f)
    try:
        source = coor.source(trajfiles, top=topfile)
        source.chunksize = 2

        out = source.get_output(stride=stride)
        keys = np.unique(stride[:, 0])
        for i, coords in enumerate(out):
            if i in keys:
                traj = mdtraj.load(trajfiles[i], top=topfile)
                np.testing.assert_equal(
                    coords,
                    traj.xyz[np.array(stride[stride[:, 0] == i][:, 1])].reshape(-1, 9))
    finally:
        for t in trajfiles:
            try:
                os.unlink(t)
            except EnvironmentError:
                pass
def test_with_pipeline_time_lagged(self):
    reader = api.source(self.trajfile, top=self.topfile)
    assert isinstance(reader, FeatureReader)

    t = tica(dim=2, lag=1)
    d = discretizer(reader, t, chunksize=10)
    d.parametrize()
def test_add_element(self):
    # start with an empty pipeline without auto-parametrization
    p = api.pipeline([], run=False)
    # add a reader
    reader = api.source(self.traj_files, top=self.pdb_file)
    p.add_element(reader)
    p.parametrize()

    # get the result immediately
    out1 = reader.get_output()

    # add kmeans
    kmeans = api.cluster_kmeans(k=15)
    p.add_element(kmeans)
    p.parametrize()

    # get the result immediately
    kmeans1 = kmeans.get_output()

    # get reader output again
    out2 = reader.get_output()

    p.add_element(api.cluster_kmeans(k=2))
    p.parametrize()

    # get kmeans output again
    kmeans2 = kmeans.get_output()

    # check that add_element does not change the intermediate results
    np.testing.assert_array_equal(out1[0], out2[0])
    np.testing.assert_array_equal(out1[1], out2[1])
    np.testing.assert_array_equal(kmeans1[0], kmeans2[0])
    np.testing.assert_array_equal(kmeans1[1], kmeans2[1])
def test_np_reader_in_pipeline(self):
    with TemporaryDirectory() as td:
        file_name = os.path.join(td, "test.npy")
        data = np.random.random((100, 3))
        np.save(file_name, data)
        reader = api.source(file_name)
        p = api.pipeline(reader, run=False, stride=2, chunksize=5)
        p.parametrize()
def test_no_transform(self):
    reader_xtc = api.source(self.traj_files, top=self.pdb_file)
    api.pipeline([reader_xtc, api.cluster_kmeans(k=10)])._chain[-1].get_output()
    api.pipeline([reader_xtc, api.cluster_regspace(dmin=10)])._chain[-1].get_output()
    api.pipeline([reader_xtc, api.cluster_uniform_time()])._chain[-1].get_output()
def test_in_memory(self):
    reader = api.source(self.trajfile, top=self.topfile)
    out1 = reader.get_output()
    # now map stuff to memory
    reader.in_memory = True

    reader2 = api.source(self.trajfile, top=self.topfile)
    out = reader2.get_output()

    assert len(out) == len(reader._Y) == 1
    np.testing.assert_equal(out1, out)
    np.testing.assert_equal(reader._Y[0], out[0])
    np.testing.assert_equal(reader.get_output(), out)

    # reset in_memory and check that the cached output gets deleted
    reader.in_memory = False
    assert reader._Y is None
def test_read_single_file_topology_file(self):
    reader = api.source(self.traj_files[0], top=self.pdb_file)
    self.assertIsNotNone(reader, "The reader should not be None.")
    self.assertEqual(reader.topfile, self.pdb_file,
                     "Reader topology file and input topology file should coincide.")
    self.assertListEqual(reader.filenames, [self.traj_files[0]],
                         "Reader trajectories and input trajectories should coincide.")
    self.assertEqual(reader.featurizer.topologyfile, self.pdb_file,
                     "Featurizer's topology file and input topology file should coincide.")
def test_read_multiple_files_featurizer(self):
    featurizer = MDFeaturizer(self.pdb_file)
    reader = api.source(self.traj_files, features=featurizer)
    self.assertIsNotNone(reader, "The reader should not be None.")
    self.assertEqual(reader.topfile, self.pdb_file,
                     "Reader topology file and input topology file should coincide.")
    self.assertListEqual(reader.filenames, self.traj_files,
                         "Reader trajectories and input trajectories should coincide.")
    self.assertEqual(reader.featurizer.topologyfile, self.pdb_file,
                     "Featurizer's topology file and input topology file should coincide.")
def test_data_in_memory_random_access(self):
    # access with a chunksize that is larger than the largest index list of stride
    data_in_memory = coor.source(self.data, chunksize=10)
    out1 = data_in_memory.get_output(stride=self.stride)

    # access with a chunksize that is smaller than the largest index list of stride
    data_in_memory = coor.source(self.data, chunksize=1)
    out2 = data_in_memory.get_output(stride=self.stride)

    # access in full trajectory mode
    data_in_memory = coor.source(self.data, chunksize=0)
    out3 = data_in_memory.get_output(stride=self.stride)

    for idx in np.unique(self.stride[:, 0]):
        np.testing.assert_array_almost_equal(
            self.data[idx][self.stride[self.stride[:, 0] == idx][:, 1]], out1[idx])
        np.testing.assert_array_almost_equal(out1[idx], out2[idx])
        np.testing.assert_array_almost_equal(out2[idx], out3[idx])
def test_no_cluster(self):
    reader_xtc = api.source(self.traj_files, top=self.pdb_file)
    # only reader
    api.pipeline(reader_xtc)
    reader_xtc.get_output()
    # reader + pca / tica
    tica = api.tica()
    pca = api.pca()
    api.pipeline([reader_xtc, tica])._chain[-1].get_output()
    api.pipeline([reader_xtc, pca])._chain[-1].get_output()
def test_in_memory_switch_stride_dim(self):
    reader = api.source(self.trajfile, top=self.topfile)
    reader.chunksize = 100
    reader.in_memory = True

    # now get output with different strides
    strides = [1, 2, 3, 4, 5]
    for s in strides:
        out = reader.get_output(stride=s)
        shape = (reader.trajectory_length(0, stride=s), reader.dimension())
        self.assertEqual(out[0].shape, shape, "not equal for stride=%i" % s)
def test_various_formats_source(self):
    chunksizes = [0, 13]
    X = None
    bpti_mini_previous = None
    for cs in chunksizes:
        for bpti_mini in self.bpti_mini_files:
            Y = api.source(bpti_mini, top=self.bpti_pdbfile).get_output(chunk=cs)
            if X is not None:
                np.testing.assert_array_almost_equal(
                    X, Y,
                    err_msg='Comparing %s to %s failed for chunksize %s'
                            % (bpti_mini, bpti_mini_previous, cs))
            X = Y
            bpti_mini_previous = bpti_mini
def test_flip_in_memory_exception(self):
    """ ensure in_memory behaves well during exceptions. """
    reader = api.source(self.trajfile, top=self.topfile)

    def dummy(x):
        raise ValueError("no")

    reader.featurizer.add_custom_func(dummy, 1)
    try:
        reader.in_memory = True
    except ValueError:
        assert not reader.in_memory
def test_chunksize(self):
    reader_xtc = api.source(self.traj_files, top=self.pdb_file)
    chunksize = 1001
    chain = [reader_xtc, api.tica(),
             api.cluster_mini_batch_kmeans(batch_size=0.3, k=3)]
    p = api.pipeline(chain, chunksize=chunksize, run=False)
    assert p.chunksize == chunksize
    for e in p._chain:
        assert e.chunksize == chunksize
@classmethod
def setUpClass(cls):
    with numpy_random_seed(123):
        import msmtools.generation as msmgen

        # generate HMM with two Gaussians
        cls.P = np.array([[0.99, 0.01],
                          [0.01, 0.99]])
        cls.T = 40000
        means = [np.array([-1, 1]), np.array([1, -1])]
        widths = [np.array([0.3, 2]), np.array([0.3, 2])]
        # continuous trajectory
        cls.X = np.zeros((cls.T, 2))
        # hidden trajectory
        dtraj = msmgen.generate_traj(cls.P, cls.T)
        for t in range(cls.T):
            s = dtraj[t]
            cls.X[t, 0] = widths[s][0] * np.random.randn() + means[s][0]
            cls.X[t, 1] = widths[s][1] * np.random.randn() + means[s][1]
        # set the lag time:
        cls.lag = 10
        # compute mean-free data:
        mref = (np.sum(cls.X[:-cls.lag, :], axis=0) +
                np.sum(cls.X[cls.lag:, :], axis=0)) / float(2 * (cls.T - cls.lag))
        mref_nr = np.sum(cls.X[:-cls.lag, :], axis=0) / float(cls.T - cls.lag)
        cls.X_mf = cls.X - mref[None, :]
        cls.X_mf_nr = cls.X - mref_nr[None, :]
        # compute correlation matrices:
        cls.cov_ref = (np.dot(cls.X_mf[:-cls.lag, :].T, cls.X_mf[:-cls.lag, :]) +
                       np.dot(cls.X_mf[cls.lag:, :].T, cls.X_mf[cls.lag:, :])) \
                      / float(2 * (cls.T - cls.lag))
        cls.cov_ref_nr = np.dot(cls.X_mf_nr[:-cls.lag, :].T,
                                cls.X_mf_nr[:-cls.lag, :]) / float(cls.T - cls.lag)
        cls.cov_tau_ref = (np.dot(cls.X_mf[:-cls.lag, :].T, cls.X_mf[cls.lag:, :]) +
                           np.dot(cls.X_mf[cls.lag:, :].T, cls.X_mf[:-cls.lag, :])) \
                          / float(2 * (cls.T - cls.lag))
        cls.cov_tau_ref_nr = np.dot(cls.X_mf_nr[:-cls.lag, :].T,
                                    cls.X_mf_nr[cls.lag:, :]) / float(cls.T - cls.lag)

        # unscaled TICA
        reader = api.source(cls.X, chunksize=0)
        cls.tica_obj = api.tica(data=reader, lag=cls.lag, dim=1, kinetic_map=False)
        # non-reversible TICA
        cls.tica_obj_nr = api.tica(data=reader, lag=cls.lag, dim=1,
                                   kinetic_map=False, reversible=False)
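# In formula form, the reference quantities computed above are the symmetrized
# (reversible) estimators, with T the trajectory length, l = lag,
# X0 = X_mf[:-l] and Xl = X_mf[l:]:
#
#     mean:  m    = ( sum_{t<T-l} x_t + sum_{t>=l} x_t ) / (2 (T - l))
#     C(0)        = ( X0.T @ X0 + Xl.T @ Xl ) / (2 (T - l))
#     C(l)        = ( X0.T @ Xl + Xl.T @ X0 ) / (2 (T - l))
#
# The '_nr' variants are the corresponding non-reversible estimates computed
# from the first T - l frames alone, without symmetrization.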
def test_cols(self):
    reader = api.source(self.trajfile, top=self.topfile)
    # select the first and third output columns
    cols = np.array((0, 2))
    ref = mdtraj.load(self.trajfile, top=self.topfile).xyz
    s = ref.shape
    new_shape = (s[0], s[1] * s[2])
    ref = ref.reshape(new_shape)
    ref = ref[:, cols]

    it = reader.iterator(chunk=0, return_trajindex=False, cols=cols)
    with it:
        for x in it:
            np.testing.assert_equal(x, ref)
def test_store_load_traj_info(self):
    x = np.random.random((10, 3))
    from pyerna.util._config import Config

    my_conf = Config()
    my_conf.cfg_dir = self.work_dir
    with mock.patch('pyerna.coordinates.data.util.traj_info_cache.config', my_conf):
        with NamedTemporaryFile(delete=False) as fh:
            np.savetxt(fh.name, x)
            reader = api.source(fh.name)
            info = self.db[fh.name, reader]
            self.db.close()
            self.db.__init__(self.db._database.filename)
            info2 = self.db[fh.name, reader]
            self.assertEqual(info2, info)
def test(self):
    reader = source(self.trajfiles, top=self.topfile)
    pcat = pca(dim=2)

    n_clusters = 2
    clustering = UniformTimeClustering(n_clusters=n_clusters)

    D = Discretizer(reader, transform=pcat, cluster=clustering)
    D.parametrize()

    self.assertEqual(len(D.dtrajs), len(self.trajfiles))

    for dtraj in clustering.dtrajs:
        unique = np.unique(dtraj)
        self.assertEqual(unique.shape[0], n_clusters)
def test_fragmented_reader(self):
    top_file = pkg_resources.resource_filename(__name__, 'data/test.pdb')
    trajfiles = []
    nframes = []
    with TemporaryDirectory() as wd:
        for _ in range(3):
            f, _, l = create_traj(top_file, dir=wd)
            trajfiles.append(f)
            nframes.append(l)
        # three trajectories: one consisting of all three files, one consisting
        # of the first, one consisting of the first and the last
        reader = api.source(
            [trajfiles, [trajfiles[0]], [trajfiles[0], trajfiles[2]]],
            top=top_file)
        np.testing.assert_equal(reader.trajectory_lengths(),
                                [sum(nframes), nframes[0], nframes[0] + nframes[2]])
def test_corrupted_db(self):
    with NamedTemporaryFile(mode='w', suffix='.dat', delete=False) as f:
        f.write("makes no sense!!!!")
        f.close()
        name = f.name

    import warnings
    with warnings.catch_warnings(record=True) as cm:
        warnings.simplefilter('always')
        db = TrajectoryInfoCache(name)
        assert len(cm) == 1
        assert "corrupted" in str(cm[-1].message)

        # ensure we can perform lookups on the broken db without exception.
        r = api.source(xtcfiles[0], top=pdbfile)
        db[xtcfiles[0], r]
def test_replace_data_source(self):
    reader_xtc = api.source(self.traj_files, top=self.pdb_file)
    reader_gen = DataInMemory(data=self.generated_data)

    kmeans = api.cluster_kmeans(k=10)
    assert hasattr(kmeans, '_chunks')
    p = api.pipeline([reader_xtc, kmeans])
    out1 = kmeans.get_output()

    # replace the data source
    p.set_element(0, reader_gen)
    assert hasattr(kmeans, '_chunks')
    p.parametrize()
    out2 = kmeans.get_output()

    self.assertFalse(np.array_equal(out1, out2),
                     "Data source changed, so should the resulting clusters.")