def test_full_trajectory_stridden(self):
    """Striding the fragmented reader's output must equal striding a vstack of the parts."""
    for step in (1, 3, 5, 7, 13, 20):
        frag_reader = FragmentedTrajectoryReader([self.d, self.d])
        frag_reader.chunksize = 0
        reference = np.vstack((self.d, self.d))[::step]
        produced = frag_reader.get_output(stride=step)[0]
        np.testing.assert_array_almost_equal(
            reference, produced, err_msg="Failed for stride=%s" % step)
def test_chunked_trajectory_random_access(self):
    """Random access with chunk size 1 must return exactly the requested frames."""
    frag_reader = FragmentedTrajectoryReader([self.d, self.d])
    ra_indices = np.asarray(
        [[0, 1], [0, 3], [0, 3], [0, 99], [0, 100], [0, 199]])
    produced = frag_reader.get_output(stride=ra_indices, chunk=1)
    np.testing.assert_array_equal(
        np.array(produced).squeeze(), np.array([1, 3, 3, 99, 0, 99]))
def test_chunked_trajectory_with_lag(self):
    """Chunked iteration, with and without lag, must reproduce strided slices of the data."""
    stacked = np.vstack((self.d, self.d))
    frag_reader = FragmentedTrajectoryReader([self.d, self.d])
    for tau in [0, 1, 3]:
        for step in [1, 3, 5]:
            for csize in [1, 34, 53, 72]:
                frag_reader.chunksize = csize
                if tau == 0:
                    # plain strided iteration yields (itraj, X) pairs
                    gathered = None
                    for itraj, chunk in frag_reader.iterator(stride=step):
                        gathered = chunk if gathered is None else np.vstack((gathered, chunk))
                    np.testing.assert_array_almost_equal(stacked[::step], gathered)
                else:
                    # lagged iteration yields (itraj, X, Y) triples;
                    # Y is shorter, so X is compared only up to len(Y)
                    gathered = None
                    gathered_lagged = None
                    for itraj, chunk, chunk_lagged in frag_reader.iterator(stride=step, lag=tau):
                        gathered = chunk if gathered is None else np.vstack((gathered, chunk))
                        gathered_lagged = (chunk_lagged if gathered_lagged is None
                                           else np.vstack((gathered_lagged, chunk_lagged)))
                    np.testing.assert_array_almost_equal(
                        stacked[::step][0:len(gathered_lagged)], gathered)
                    np.testing.assert_array_almost_equal(
                        stacked[tau::step], gathered_lagged)
def test_cols(self):
    """Column selection (cols=) must pick the same columns as fancy-indexing the raw data."""
    ncols = 5
    block = np.arange(60).reshape(-1, ncols)
    sources = [(block, block), block, (block, block, block)]
    frag_reader = FragmentedTrajectoryReader(sources)
    selection = (0, 3)
    for traj_idx, chunk in frag_reader.iterator(chunk=0, return_trajindex=True, cols=selection):
        source = sources[traj_idx]
        # fragmented inputs are tuples of arrays; concatenate them to a reference trajectory
        full = np.concatenate(source) if isinstance(source, tuple) else source
        np.testing.assert_equal(chunk, full[:, selection])
def test_multiple_input_trajectories_random_access(self):
    """Random access across several fragmented trajectories, for a range of chunk sizes."""
    ra_indices = np.asarray([
        [0, 1], [0, 3], [0, 3], [0, 99], [0, 100], [0, 199],
        [1, 0], [1, 5], [1, 99],
        [2, 5], [2, 7], [2, 23]
    ])
    reference = [np.array([1, 3, 3, 99, 0, 99]),
                 np.array([0, 5, 99]),
                 np.array([5, 7, 23])]
    for csize in [0, 1, 3, 5, 13]:
        frag_reader = FragmentedTrajectoryReader(
            [[self.d, self.d], self.d, [self.d, self.d]])
        produced = frag_reader.get_output(chunk=csize, stride=ra_indices)
        for traj_idx in range(3):
            np.testing.assert_array_equal(
                reference[traj_idx], produced[traj_idx].squeeze())
def test_index_to_reader_index(self):
    """Global frame indices must map to (reader, local-frame) pairs; out-of-range raises."""
    reader = FragmentedTrajectoryReader([self.d, self.d])
    expectations = [
        (0, (0, 0), "first frame is first frame of first reader"),
        (1, (0, 1), "second frame is second frame of first reader"),
        (100, (1, 0), "101'st frame is first frame of second reader"),
        (101, (1, 1), "102'nd frame is second frame of second reader"),
    ]
    for global_idx, mapped, msg in expectations:
        assert mapped == reader._index_to_reader_index(global_idx, 0), msg
    # indices outside [0, 200) must be rejected
    for bad_idx in (-1, 200):
        with self.assertRaises(ValueError):
            reader._index_to_reader_index(bad_idx, 0)
def test_full_trajectory_stridden_with_lag(self):
    """Unchunked strided + lagged iteration must match slicing the stacked data directly."""
    frag_reader = FragmentedTrajectoryReader([self.d, self.d])
    stacked = np.vstack((self.d, self.d))
    for tau in [1, 5, 7]:
        for step in [1, 3, 5, 7, 13, 20]:
            frag_reader.chunksize = 0
            chunk, lagged = None, None
            # chunksize 0 -> not chunked, so the loop body runs once with the full arrays
            for itraj, chunk, lagged in frag_reader.iterator(stride=step, lag=tau):
                pass
            np.testing.assert_array_almost_equal(stacked[::step][0:len(lagged)], chunk)
            np.testing.assert_array_almost_equal(stacked[tau::step], lagged)
def test_invalid_readers_in_frag_traj(self):
    """save_traj must refuse fragmented readers that are not FeatureReader-backed."""
    from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader
    bad_input = [np.array([[[1, 2], [3, 4]], [0, 1]])]
    reader = FragmentedTrajectoryReader(bad_input)
    with self.assertRaises(ValueError) as ctx:
        save_traj(reader, self.sets, None)
    self.assertIn("FeatureReader", ctx.exception.args[0])
def test_invalid_maximum_traj_index(self):
    """save_traj must raise when a requested frame index exceeds the trajectory length.

    Fix: the frame-index list was previously bound to a local named ``set``,
    shadowing the builtin; renamed to ``indices``.
    """
    frag_traj = [[self.trajfiles[0], self.trajfiles[1]], self.trajfiles[2], self.trajfiles[2]]
    # frame 42 of trajectory 2 is out of range and must trigger the error
    indices = [[0, 2], [0, 1], [2, 42]]
    from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader
    reader = FragmentedTrajectoryReader(frag_traj, topologyfile=self.pdbfile)
    with self.assertRaises(ValueError) as cm:
        save_traj(reader, indices, None)
    self.assertIn("larger than", cm.exception.args[0])
def test_multiple_input_trajectories(self):
    """Chunked (37) and unchunked (0) output must agree, and match the raw fragments."""
    frag_reader = FragmentedTrajectoryReader(
        [[self.d, self.d], self.d, [self.d, self.d]])
    frag_reader.chunksize = 37
    chunked = frag_reader.get_output()
    frag_reader.chunksize = 0
    unchunked = frag_reader.get_output()
    stacked = np.vstack((self.d, self.d))
    for traj_idx in range(0, 3):
        np.testing.assert_array_almost_equal(chunked[traj_idx], unchunked[traj_idx])
    np.testing.assert_array_almost_equal(chunked[0], stacked)
    np.testing.assert_array_almost_equal(chunked[1], self.d)
    np.testing.assert_array_almost_equal(chunked[2], stacked)
def test_full_trajectory(self):
    """Unchunked, stride-1 output equals the vstack of both fragments."""
    frag_reader = FragmentedTrajectoryReader([self.d, self.d])
    frag_reader.chunksize = 0
    stacked = np.vstack((self.d, self.d))
    np.testing.assert_array_almost_equal(stacked, frag_reader.get_output(stride=1)[0])
def test_raise_different_dims(self):
    """Fragments with mismatching dimensions must be rejected at construction time."""
    mismatched = [self.d, np.array([[1, 2, 3], [4, 5, 6]])]
    with self.assertRaises(ValueError):
        FragmentedTrajectoryReader(mismatched)
def test_is_random_accessible(self):
    """DataInMemory supports random access; FragmentedTrajectoryReader does not."""
    in_memory = DataInMemory(self.data)
    fragmented = FragmentedTrajectoryReader([[self.data]])
    assert in_memory.is_random_accessible is True
    assert fragmented.is_random_accessible is False
def create_file_reader(input_files, topology, featurizer, chunk_size=1000, **kw):
    r""" Creates a (possibly featured) file reader by a number of input files and either a topology file or a featurizer.

    Parameters
    ----------
    :param input_files:
        A single input file or a list of input files.
    :param topology:
        A topology file. If given, the featurizer argument can be None.
    :param featurizer:
        A featurizer. If given, the topology file can be None.
    :param chunk_size:
        The chunk size with which the corresponding reader gets initialized.
    :return: Returns the reader.

    :raises ValueError: if the input is empty, mixes types, points at
        non-existing files, or is neither a string nor a list of strings.
    """
    from pyemma.coordinates.data.numpy_filereader import NumPyFileReader
    from pyemma.coordinates.data.py_csv_reader import PyCSVReader
    from pyemma.coordinates.data import FeatureReader
    from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader

    # fragmented trajectories: any nested list/tuple marks the input as fragments
    if (isinstance(input_files, (list, tuple)) and len(input_files) > 0
            and any(isinstance(item, (list, tuple)) for item in input_files)):
        return FragmentedTrajectoryReader(input_files, topology, chunk_size, featurizer)

    # normal trajectories
    # FIX: compare with '== 0' instead of 'is 0' (identity on an int literal is
    # implementation-dependent and a SyntaxWarning on modern Python)
    if (isinstance(input_files, string_types)
            or (isinstance(input_files, (list, tuple))
                and (any(isinstance(item, string_types) for item in input_files)
                     or len(input_files) == 0))):
        reader = None
        # check: if single string create a one-element list
        if isinstance(input_files, string_types):
            input_list = [input_files]
        elif len(input_files) > 0 and all(
                isinstance(item, string_types) for item in input_files):
            input_list = input_files
        else:
            if len(input_files) == 0:
                raise ValueError("The passed input list should not be empty.")
            else:
                raise ValueError(
                    "The passed list did not exclusively contain strings or was a list of lists "
                    "(fragmented trajectory).")

        # TODO: this does not handle suffixes like .xyz.gz (rare)
        _, suffix = os.path.splitext(input_list[0])

        # check: do all files have the same file type? If not: raise ValueError.
        if all(item.endswith(suffix) for item in input_list):
            # do all the files exist? If not: Raise value error
            all_exist = True
            err_msg = ""
            for item in input_list:
                if not os.path.isfile(item):
                    err_msg += "\n" if len(err_msg) > 0 else ""
                    err_msg += "File %s did not exist or was no file" % item
                    all_exist = False
            if not all_exist:
                raise ValueError(
                    "Some of the given input files were directories"
                    " or did not exist:\n%s" % err_msg)
            if all_exist:
                from mdtraj.formats.registry import FormatRegistry
                # CASE 1.1: file types are MD files
                if suffix in list(FormatRegistry.loaders.keys()):
                    # check: do we either have a featurizer or a topology file name? If not: raise ValueError.
                    # create a MD reader with file names and topology
                    if not featurizer and not topology:
                        raise ValueError(
                            "The input files were MD files which makes it mandatory to have either a "
                            "featurizer or a topology file.")
                    reader = FeatureReader(input_list, featurizer=featurizer,
                                           topologyfile=topology, chunksize=chunk_size)
                else:
                    if suffix in ['.npy', '.npz']:
                        reader = NumPyFileReader(input_list, chunksize=chunk_size)
                    # otherwise we assume that given files are ascii tabulated data
                    else:
                        reader = PyCSVReader(input_list, chunksize=chunk_size, **kw)
        else:
            raise ValueError(
                "Not all elements in the input list were of the type %s!" % suffix)
    else:
        # FIX: interpolate the actual argument; previously this formatted the
        # *builtin* ``input`` function into the message instead of input_files.
        raise ValueError("Input \"%s\" was no string or list of strings." % input_files)
    return reader
def create_file_reader(input_files, topology, featurizer, chunksize=None, **kw):
    r""" Creates a (possibly featured) file reader by a number of input files and either a topology file or a featurizer.

    Parameters
    ----------
    :param input_files:
        A single input file or a list of input files.
    :param topology:
        A topology file. If given, the featurizer argument can be None.
    :param featurizer:
        A featurizer. If given, the topology file can be None.
    :param chunksize:
        The chunk size with which the corresponding reader gets initialized.
    :return: Returns the reader.

    :raises ValueError: if the input is empty, mixes file types, points at
        non-existing files, is a PDB trajectory, or is neither a path nor a
        list of paths.
    """
    from pyemma.coordinates.data.numpy_filereader import NumPyFileReader
    from pyemma.coordinates.data.py_csv_reader import PyCSVReader
    from pyemma.coordinates.data import FeatureReader
    from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader

    # fragmented trajectories: any nested list/tuple marks the input as fragments
    if (isinstance(input_files, (list, tuple)) and len(input_files) > 0
            and any(isinstance(item, (list, tuple)) for item in input_files)):
        return FragmentedTrajectoryReader(input_files, topology, chunksize, featurizer)

    # normal trajectories
    if (isinstance(input_files, (Path, str))
            or (isinstance(input_files, (list, tuple))
                and (any(isinstance(item, (Path, str)) for item in input_files)
                     or len(input_files) == 0))):
        # check: if single string create a one-element list
        if isinstance(input_files, (Path, str)):
            input_list = [input_files]
        elif len(input_files) > 0 and all(isinstance(item, (Path, str)) for item in input_files):
            input_list = input_files
        else:
            if len(input_files) == 0:
                raise ValueError("The passed input list should not be empty.")
            else:
                raise ValueError("The passed list did not exclusively contain strings or was a list of lists "
                                 "(fragmented trajectory).")

        # convert to list of paths
        input_list = [Path(f) for f in input_list]

        # TODO: this does not handle suffixes like .xyz.gz (rare)
        suffix = input_list[0].suffix

        # check: do all files have the same file type? If not: raise ValueError.
        if all(item.suffix == suffix for item in input_list):
            # do all the files exist? If not: Raise value error
            all_exist = True
            # FIX: this module already requires Python 3 (pathlib.Path above), so use
            # the stdlib io.StringIO instead of pulling in the six compat shim.
            from io import StringIO
            err_msg = StringIO()
            for item in input_list:
                if not item.is_file():
                    err_msg.write('\n' if err_msg.tell() > 0 else "")
                    err_msg.write('File %s did not exist or was no file' % item)
                    all_exist = False
            if not all_exist:
                raise ValueError('Some of the given input files were directories'
                                 ' or did not exist:\n%s' % err_msg.getvalue())
            featurizer_or_top_provided = featurizer is not None or topology is not None
            # we need to check for h5 first, because of mdtraj custom HDF5 traj format (which is deprecated).
            if suffix in ('.h5', '.hdf5') and not featurizer_or_top_provided:
                # This check is potentially expensive for lots of files, we also re-open the file twice
                # (causing atime updates etc.), so we simply require that no featurizer option is given.
                # and not all((_is_mdtraj_hdf5_file(f) for f in input_files)):
                from pyemma.coordinates.data.h5_reader import H5Reader
                # NOTE(review): the other branches pass the normalized ``input_list``
                # (list of Path); H5Reader receives the raw ``input_files`` — confirm
                # this asymmetry is intentional before changing it.
                reader = H5Reader(filenames=input_files, chunk_size=chunksize, **kw)
            # CASE 1.1: file types are MD files
            elif FeatureReader.supports_format(suffix):
                # check: do we either have a featurizer or a topology file name? If not: raise ValueError.
                # create a MD reader with file names and topology
                if not featurizer_or_top_provided:
                    raise ValueError('The input files were MD files which makes it mandatory to have either a '
                                     'Featurizer or a topology file.')
                if suffix in ('.pdb', '.pdb.gz'):
                    raise ValueError('PyEMMA can not read PDB-fake-trajectories. '
                                     'Please consider using a sane trajectory format (e.g. xtc, dcd).')
                reader = FeatureReader(input_list, featurizer=featurizer, topologyfile=topology,
                                       chunksize=chunksize)
            elif suffix in ('.npy', '.npz'):
                reader = NumPyFileReader(input_list, chunksize=chunksize)
            # otherwise we assume that given files are ascii tabulated data
            else:
                reader = PyCSVReader(input_list, chunksize=chunksize, **kw)
        else:
            raise ValueError('Not all elements in the input list were of the type %s!' % suffix)
    else:
        raise ValueError('Input "{}" was no string or list of strings.'.format(input_files))
    return reader