def test_full_trajectory_stridden(self):
    # the fragmented reader should behave like the concatenated array, strided
    for stride in [1, 3, 5, 7, 13, 20]:
        reader = FragmentedTrajectoryReader([self.d, self.d])
        reader.chunksize = 0
        expected = np.vstack((self.d, self.d))[::stride]
        out = reader.get_output(stride=stride)[0]
        np.testing.assert_array_almost_equal(expected, out, err_msg="Failed for stride=%s" % stride)
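Every snippet below references a fixture self.d that the listing omits. Judging from the random-access assertions (frame 1 yields 1, global frame 100 wraps to the second fragment's frame 0), a minimal setUp could look like this sketch; the exact shape (100, 1) is an assumption:

import unittest
import numpy as np
from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader

class TestFragmentedTrajectory(unittest.TestCase):
    def setUp(self):
        # hypothetical fixture: 100 frames of one dimension, with d[i] == i
        self.d = np.arange(100).reshape(-1, 1)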
Example #2
def test_chunked_trajectory_random_access(self):
    reader = FragmentedTrajectoryReader([self.d, self.d])
    # random-access indices: (trajectory index, frame index) pairs
    indices = np.asarray([[0, 1], [0, 3], [0, 3], [0, 99], [0, 100],
                          [0, 199]])
    out = reader.get_output(stride=indices, chunk=1)
    np.testing.assert_array_equal(
        np.array(out).squeeze(), np.array([1, 3, 3, 99, 0, 99]))
Example #3
def test_chunked_trajectory_with_lag(self):
    data = np.vstack((self.d, self.d))
    reader = FragmentedTrajectoryReader([self.d, self.d])
    for lag in [0, 1, 3]:
        for stride in [1, 3, 5]:
            for chunksize in [1, 34, 53, 72]:
                reader.chunksize = chunksize
                if lag > 0:
                    # lagged iteration yields (itraj, X, Y) with Y shifted by lag
                    collected = None
                    collected_lagged = None
                    for itraj, X, Y in reader.iterator(stride=stride, lag=lag):
                        collected = X if collected is None else np.vstack((collected, X))
                        collected_lagged = Y if collected_lagged is None else np.vstack((collected_lagged, Y))
                    np.testing.assert_array_almost_equal(
                        data[::stride][0:len(collected_lagged)], collected)
                    np.testing.assert_array_almost_equal(
                        data[lag::stride], collected_lagged)
                else:
                    collected = None
                    for itraj, X in reader.iterator(stride=stride):
                        collected = X if collected is None else np.vstack((collected, X))
                    np.testing.assert_array_almost_equal(
                        data[::stride], collected)
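The lag/stride alignment asserted above is easy to check on a toy array: Y collects data[lag::stride], while X is data[::stride] truncated to the length of Y. A minimal sketch:

import numpy as np

data = np.arange(10)
stride, lag = 3, 1
Y = data[lag::stride]        # array([1, 4, 7])
X = data[::stride][:len(Y)]  # array([0, 3, 6])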
Example #4
def test_cols(self):
    dim = 5
    arr = np.arange(60).reshape(-1, dim)
    data = [(arr, arr), arr, (arr, arr, arr)]
    reader = FragmentedTrajectoryReader(data)
    cols = (0, 3)
    for itraj, x in reader.iterator(chunk=0, return_trajindex=True, cols=cols):
        # fragments given as a tuple correspond to one concatenated trajectory
        if isinstance(data[itraj], tuple):
            syn_traj = np.concatenate(data[itraj])
        else:
            syn_traj = data[itraj]
        np.testing.assert_equal(x, syn_traj[:, cols])
Example #5
def test_multiple_input_trajectories_random_access(self):
    indices = np.asarray([
        [0, 1], [0, 3], [0, 3], [0, 99], [0, 100], [0, 199],
        [1, 0], [1, 5], [1, 99],
        [2, 5], [2, 7], [2, 23]
    ])
    expected = [np.array([1, 3, 3, 99, 0, 99]), np.array([0, 5, 99]), np.array([5, 7, 23])]
    for chunk_size in [0, 1, 3, 5, 13]:
        reader = FragmentedTrajectoryReader([[self.d, self.d], self.d, [self.d, self.d]])
        out = reader.get_output(chunk=chunk_size, stride=indices)
        for i in range(3):
            np.testing.assert_array_equal(expected[i], out[i].squeeze())
Example #6
def test_index_to_reader_index(self):
    reader = FragmentedTrajectoryReader([self.d, self.d])
    assert (0, 0) == reader._index_to_reader_index(
        0, 0), "first frame is first frame of first reader"
    assert (0, 1) == reader._index_to_reader_index(
        1, 0), "second frame is second frame of first reader"
    assert (1, 0) == reader._index_to_reader_index(
        100, 0), "101st frame is first frame of second reader"
    assert (1, 1) == reader._index_to_reader_index(
        101, 0), "102nd frame is second frame of second reader"
    with self.assertRaises(ValueError):
        reader._index_to_reader_index(-1, 0)
    with self.assertRaises(ValueError):
        reader._index_to_reader_index(200, 0)
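The mapping exercised above (global frame index to a (fragment, local frame) pair) can be implemented with cumulative fragment lengths; the sketch below is illustrative only and not the reader's actual code:

import numpy as np

def index_to_reader_index(index, fragment_lengths):
    # cumulative offsets, e.g. lengths [100, 100] -> offsets [0, 100, 200]
    offsets = np.cumsum([0] + list(fragment_lengths))
    if index < 0 or index >= offsets[-1]:
        raise ValueError("index %d out of bounds" % index)
    fragment = int(np.searchsorted(offsets, index, side='right')) - 1
    return fragment, int(index - offsets[fragment])

assert index_to_reader_index(100, [100, 100]) == (1, 0)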
Example #7
def test_full_trajectory_stridden_with_lag(self):
    reader = FragmentedTrajectoryReader([self.d, self.d])
    data = np.vstack((self.d, self.d))
    for lag in [1, 5, 7]:
        for stride in [1, 3, 5, 7, 13, 20]:
            reader.chunksize = 0

            X, Y = None, None
            # not chunked: a single iteration yields the full strided arrays
            for itraj, X, Y in reader.iterator(stride=stride, lag=lag):
                pass

            np.testing.assert_array_almost_equal(data[::stride][0:len(Y)], X)
            np.testing.assert_array_almost_equal(data[lag::stride], Y)
Example #8
def test_invalid_readers_in_frag_traj(self):
    # deliberately malformed (ragged) input that no reader can handle
    data = [np.array([[[1, 2], [3, 4]], [0, 1]])]
    from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader
    reader = FragmentedTrajectoryReader(data)
    with self.assertRaises(ValueError) as cm:
        save_traj(reader, self.sets, None)
    self.assertIn("FeatureReader", cm.exception.args[0])
Example #9
def test_invalid_maximum_traj_index(self):
    frag_traj = [[self.trajfiles[0], self.trajfiles[1]], self.trajfiles[2],
                 self.trajfiles[2]]
    # the last pair requests a frame beyond the trajectory length
    indices = [[0, 2], [0, 1], [2, 42]]
    from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader
    reader = FragmentedTrajectoryReader(frag_traj,
                                        topologyfile=self.pdbfile)
    with self.assertRaises(ValueError) as cm:
        save_traj(reader, indices, None)
    self.assertIn("larger than", cm.exception.args[0])
Example #10
def test_multiple_input_trajectories(self):
    reader = FragmentedTrajectoryReader([[self.d, self.d], self.d, [self.d, self.d]])
    reader.chunksize = 37
    out = reader.get_output()
    reader.chunksize = 0
    out2 = reader.get_output()
    expected0_2 = np.vstack((self.d, self.d))
    # chunked and unchunked output must agree
    for itraj in range(3):
        np.testing.assert_array_almost_equal(out[itraj], out2[itraj])
    np.testing.assert_array_almost_equal(out[0], expected0_2)
    np.testing.assert_array_almost_equal(out[1], self.d)
    np.testing.assert_array_almost_equal(out[2], expected0_2)
Example #11
def test_full_trajectory(self):
    reader = FragmentedTrajectoryReader([self.d, self.d])
    reader.chunksize = 0
    expected = np.vstack((self.d, self.d))
    np.testing.assert_array_almost_equal(expected,
                                         reader.get_output(stride=1)[0])
Example #12
def test_raise_different_dims(self):
    # fragments of one trajectory must share dimensionality
    data = [self.d, np.array([[1, 2, 3], [4, 5, 6]])]
    with self.assertRaises(ValueError):
        FragmentedTrajectoryReader(data)
Example #13
def test_is_random_accessible(self):
    dim = DataInMemory(self.data)
    frag = FragmentedTrajectoryReader([[self.data]])
    assert dim.is_random_accessible is True
    assert frag.is_random_accessible is False
Example #14
def create_file_reader(input_files,
                       topology,
                       featurizer,
                       chunk_size=1000,
                       **kw):
    r"""
    Creates a (possibly featured) file reader by a number of input files and either a topology file or a featurizer.
    Parameters
    ----------
    :param input_files:
        A single input file or a list of input files.
    :param topology:
        A topology file. If given, the featurizer argument can be None.
    :param featurizer:
        A featurizer. If given, the topology file can be None.
    :param chunk_size:
        The chunk size with which the corresponding reader gets initialized.
    :return: Returns the reader.
    """
    import os
    from six import string_types
    from pyemma.coordinates.data.numpy_filereader import NumPyFileReader
    from pyemma.coordinates.data.py_csv_reader import PyCSVReader
    from pyemma.coordinates.data import FeatureReader
    from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader

    # fragmented trajectories
    if (isinstance(input_files, (list, tuple)) and len(input_files) > 0
            and any(isinstance(item, (list, tuple)) for item in input_files)):
        return FragmentedTrajectoryReader(input_files, topology, chunk_size,
                                          featurizer)

    # normal trajectories
    if (isinstance(input_files, string_types)
            or (isinstance(input_files, (list, tuple)) and
                (any(isinstance(item, string_types)
                     for item in input_files) or len(input_files) == 0))):
        reader = None
        # check: if single string create a one-element list
        if isinstance(input_files, string_types):
            input_list = [input_files]
        elif len(input_files) > 0 and all(
                isinstance(item, string_types) for item in input_files):
            input_list = input_files
        else:
            if len(input_files) == 0:
                raise ValueError("The passed input list should not be empty.")
            else:
                raise ValueError(
                    "The passed list did not exclusively contain strings or was a list of lists "
                    "(fragmented trajectory).")

        # TODO: this does not handle suffixes like .xyz.gz (rare)
        _, suffix = os.path.splitext(input_list[0])

        # check: do all files have the same file type? If not: raise ValueError.
        if all(item.endswith(suffix) for item in input_list):

            # do all the files exist? If not: Raise value error
            all_exist = True
            err_msg = ""
            for item in input_list:
                if not os.path.isfile(item):
                    err_msg += "\n" if len(err_msg) > 0 else ""
                    err_msg += "File %s did not exist or was no file" % item
                    all_exist = False
            if not all_exist:
                raise ValueError(
                    "Some of the given input files were directories"
                    " or did not exist:\n%s" % err_msg)

            if all_exist:
                from mdtraj.formats.registry import FormatRegistry

                # CASE 1.1: file types are MD files
                if suffix in list(FormatRegistry.loaders.keys()):
                    # check: do we either have a featurizer or a topology file name? If not: raise ValueError.
                    # create a MD reader with file names and topology
                    if not featurizer and not topology:
                        raise ValueError(
                            "The input files were MD files which makes it mandatory to have either a "
                            "featurizer or a topology file.")

                    reader = FeatureReader(input_list,
                                           featurizer=featurizer,
                                           topologyfile=topology,
                                           chunksize=chunk_size)
                else:
                    if suffix in ['.npy', '.npz']:
                        reader = NumPyFileReader(input_list,
                                                 chunksize=chunk_size)
                    # otherwise we assume that given files are ascii tabulated data
                    else:
                        reader = PyCSVReader(input_list,
                                             chunksize=chunk_size,
                                             **kw)
        else:
            raise ValueError(
                "Not all elements in the input list were of the type %s!" %
                suffix)
    else:
        raise ValueError("Input \"%s\" was no string or list of strings." %
                         input)
    return reader
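A hedged usage sketch for the function above: with .npy inputs, neither a topology nor a featurizer is required, so the call should dispatch to NumPyFileReader (the file names are illustrative):

import numpy as np

np.save('traj0.npy', np.random.rand(100, 3))
np.save('traj1.npy', np.random.rand(100, 3))

reader = create_file_reader(['traj0.npy', 'traj1.npy'],
                            topology=None, featurizer=None, chunk_size=500)
output = reader.get_output()  # list with one (100, 3) array per input file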
Example #15
def create_file_reader(input_files, topology, featurizer, chunksize=None, **kw):
    r"""
    Creates a (possibly featured) file reader by a number of input files and either a topology file or a featurizer.
    Parameters
    ----------
    :param input_files:
        A single input file or a list of input files.
    :param topology:
        A topology file. If given, the featurizer argument can be None.
    :param featurizer:
        A featurizer. If given, the topology file can be None.
    :param chunksize:
        The chunk size with which the corresponding reader gets initialized.
    :return: Returns the reader.
    """
    from pathlib import Path
    from pyemma.coordinates.data.numpy_filereader import NumPyFileReader
    from pyemma.coordinates.data.py_csv_reader import PyCSVReader
    from pyemma.coordinates.data import FeatureReader
    from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader

    # fragmented trajectories
    if (isinstance(input_files, (list, tuple)) and len(input_files) > 0 and
            any(isinstance(item, (list, tuple)) for item in input_files)):
        return FragmentedTrajectoryReader(input_files, topology, chunksize, featurizer)

    # normal trajectories
    if (isinstance(input_files, (Path, str))
            or (isinstance(input_files, (list, tuple))
                and (any(isinstance(item, (Path, str)) for item in input_files)
                     or len(input_files) == 0))):
        # check: if single string create a one-element list
        if isinstance(input_files, (Path, str)):
            input_list = [input_files]
        elif len(input_files) > 0 and all(isinstance(item, (Path, str)) for item in input_files):
            input_list = input_files
        else:
            if len(input_files) == 0:
                raise ValueError("The passed input list should not be empty.")
            else:
                raise ValueError("The passed list did not exclusively contain strings or was a list of lists "
                                 "(fragmented trajectory).")

        # convert to list of paths
        input_list = [Path(f) for f in input_list]

        # TODO: this does not handle suffixes like .xyz.gz (rare)
        suffix = input_list[0].suffix

        # check: do all files have the same file type? If not: raise ValueError.
        if all(item.suffix == suffix for item in input_list):

            # do all the files exist? If not: Raise value error
            all_exist = True
            from io import StringIO
            err_msg = StringIO()
            for item in input_list:
                if not item.is_file():
                    err_msg.write('\n' if err_msg.tell() > 0 else "")
                    err_msg.write('File %s did not exist or was no file' % item)
                    all_exist = False
            if not all_exist:
                raise ValueError('Some of the given input files were directories'
                                 ' or did not exist:\n%s' % err_msg.getvalue())
            featurizer_or_top_provided = featurizer is not None or topology is not None
            # we need to check for h5 first, because of mdtraj custom HDF5 traj format (which is deprecated).
            if suffix in ('.h5', '.hdf5') and not featurizer_or_top_provided:
                # This check is potentially expensive for lots of files, we also re-open the file twice (causing atime updates etc.)
                # So we simply require that no featurizer option is given.
                # and not all((_is_mdtraj_hdf5_file(f) for f in input_files)):
                from pyemma.coordinates.data.h5_reader import H5Reader
                reader = H5Reader(filenames=input_files, chunk_size=chunksize, **kw)
            # CASE 1.1: file types are MD files
            elif FeatureReader.supports_format(suffix):
                # check: do we either have a featurizer or a topology file name? If not: raise ValueError.
                # create a MD reader with file names and topology
                if not featurizer_or_top_provided:
                    raise ValueError('The input files were MD files which makes it mandatory to have either a '
                                     'Featurizer or a topology file.')

                if suffix in ('.pdb', '.pdb.gz'):
                    raise ValueError('PyEMMA can not read PDB-fake-trajectories. '
                                     'Please consider using a sane trajectory format (e.g. xtc, dcd).')

                reader = FeatureReader(input_list, featurizer=featurizer, topologyfile=topology,
                                       chunksize=chunksize)
            elif suffix in ('.npy', '.npz'):
                reader = NumPyFileReader(input_list, chunksize=chunksize)
            # otherwise we assume that given files are ascii tabulated data
            else:
                reader = PyCSVReader(input_list, chunksize=chunksize, **kw)
        else:
            raise ValueError('Not all elements in the input list were of the type %s!' % suffix)
    else:
        raise ValueError('Input "{}" was no string or list of strings.'.format(input_files))
    return reader
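For comparison, a nested input list takes the fragmented-trajectory branch at the top of this function and returns a FragmentedTrajectoryReader directly; a sketch under the same illustrative file names as before:

# a list of lists marks fragments of one logical trajectory
reader = create_file_reader([['traj0.npy', 'traj1.npy']],
                            topology=None, featurizer=None, chunksize=1000)
output = reader.get_output()  # fragments are concatenated into a single output array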