Exemplo n.º 1
0
 def test_usecols(self):
     """Iterating with a column selection yields only the requested columns."""
     selected = (0, 2)
     reader = NumPyFileReader(self.f4)
     iterator = reader.iterator(chunk=0, return_trajindex=False, cols=selected)
     expected = self.d2[:, selected]
     with iterator:
         for chunk in iterator:
             np.testing.assert_equal(chunk, expected)
Exemplo n.º 2
0
    def test_stridden_access(self):
        """get_output with a stride matches slicing the raw array with [::stride]."""
        reader = NumPyFileReader(self.f1)
        reader.chunksize = 10

        reference = np.load(self.f1)

        for s in (2, 3, 5, 7, 15):
            strided = reader.get_output(stride=s)[0]
            np.testing.assert_equal(strided, reference[::s],
                                    "did not match for stride %i" % s)
Exemplo n.º 3
0
    def test_stridden_access(self):
        """Strided get_output equals slicing the fully loaded trajectory."""
        reader = NumPyFileReader(self.f1)
        reader.chunksize = 10

        full = np.load(self.f1)

        for stride in (2, 3, 5, 7, 15):
            out = reader.get_output(stride=stride)[0]
            np.testing.assert_equal(out, full[::stride],
                                    "did not match for stride %i" % stride)
Exemplo n.º 4
0
 def test_lagged_stridden_access(self):
     """Lagged + strided iteration reproduces self.d[lag::stride]."""
     reader = NumPyFileReader(self.f1)
     for stride in (2, 3, 5, 7, 15):
         for lag in (1, 3, 7, 10, 30):
             pieces = [Y for _, _, Y in reader.iterator(stride, lag)]
             np.testing.assert_equal(np.vstack(pieces), self.d[lag::stride])
Exemplo n.º 5
0
 def test_lagged_stridden_access(self):
     """Every (stride, lag) pair reproduces the reference slice self.d[lag::stride]."""
     reader = NumPyFileReader(self.f1)
     combos = ((s, l) for s in [2, 3, 5, 7, 15] for l in [1, 3, 7, 10, 30])
     for stride, lag in combos:
         collected = []
         for _, _, Y in reader.iterator(stride, lag):
             collected.append(Y)
         np.testing.assert_equal(np.vstack(collected), self.d[lag::stride])
Exemplo n.º 6
0
    def test_npz(self):
        """Every array stored in the .npz archive comes back unchanged."""
        reader = NumPyFileReader(self.npz)
        produced = reader.get_output()

        # NpzFile is a context manager; closing is handled by `with`.
        with np.load(self.npz) as archive:
            stored = [value for _, value in list(archive.items())]

        self.assertEqual(reader.number_of_trajectories(), len(stored))

        for produced_arr, stored_arr in zip(produced, stored):
            np.testing.assert_equal(produced_arr, stored_arr)
Exemplo n.º 7
0
    def test_only_npy(self):
        """A list of .npy files is read back file-by-file, with correct totals."""
        reader = NumPyFileReader(self.npy_files)

        loaded = [np.load(path) for path in self.npy_files]
        stacked = np.vstack(loaded)

        produced = reader.get_output()

        self.assertEqual(reader.number_of_trajectories(), len(self.npy_files))
        self.assertEqual(reader.n_frames_total(), stacked.shape[0])

        for got, want in zip(produced, loaded):
            np.testing.assert_array_almost_equal(got, want)
Exemplo n.º 8
0
    def test_npz(self):
        """Arrays read from a .npz archive equal the arrays stored in it."""
        reader = NumPyFileReader(self.npz)
        outputs = reader.get_output()

        archive = np.load(self.npz)
        try:
            contents = [pair[1] for pair in archive.items()]
        finally:
            archive.close()

        self.assertEqual(reader.number_of_trajectories(), len(contents))

        for got, want in zip(outputs, contents):
            np.testing.assert_equal(got, want)
Exemplo n.º 9
0
    def test_only_npy(self):
        """Reader output for .npy files matches np.load of each individual file."""
        reader = NumPyFileReader(self.npy_files)

        reference = list(map(np.load, self.npy_files))
        total_frames = np.vstack(reference).shape[0]

        produced = reader.get_output()

        self.assertEqual(reader.number_of_trajectories(), len(self.npy_files))
        self.assertEqual(reader.n_frames_total(), total_frames)

        for out, ref in zip(produced, reference):
            np.testing.assert_array_almost_equal(out, ref)
Exemplo n.º 10
0
    def load_from_files(cls, files):
        """Construct this class by loading all given files into memory.

        Parameters
        ----------
        files: str or list of str
            filenames to read from
        """
        # local import avoids a cyclic dependency at module load time
        from pyemma.coordinates.data.numpy_filereader import NumPyFileReader

        loaded = NumPyFileReader(files).get_output()
        return cls(loaded)
Exemplo n.º 11
0
    def load_from_files(cls, files):
        """Build an instance by reading every given file fully into memory.

        Parameters
        ----------
        files: str or list of str
            filenames to read from
        """
        # deferred import breaks the circular dependency with the reader module
        from pyemma.coordinates.data.numpy_filereader import NumPyFileReader

        return cls(NumPyFileReader(files).get_output())
Exemplo n.º 12
0
 def test_numpy_reader(self):
     """Round-trip the reader's construction parameters through self.compare."""
     data = np.random.random(10)
     from pyemma.util.files import TemporaryDirectory
     with TemporaryDirectory() as tmpdir:
         paths = [os.path.join(tmpdir, name) for name in ('1.npy', '2.npy')]
         for path in paths:
             np.save(path, data)
         kwargs = {'filenames': paths, 'chunksize': 23}
         self.compare(NumPyFileReader(**kwargs), kwargs)
Exemplo n.º 13
0
    def test_different_shapes_value_error(self):
        """Mixing files with different dimensionality raises ValueError."""
        with tempfile.NamedTemporaryFile(delete=False, suffix='.npy') as f:
            np.save(f.name, np.zeros((3, 42)))
            file_list = list(self.files2d)
            file_list.insert(1, f.name)

            with self.assertRaises(ValueError) as ctx:
                NumPyFileReader(file_list)
            self.assertIn("different dimensions", ctx.exception.args[0])
Exemplo n.º 14
0
    def test_lagged_stridden_access_multiple_files(self):
        """Lagged/strided chunks from several files match per-file slicing."""
        reader = NumPyFileReader(self.files2d)
        n_traj = reader.number_of_trajectories()
        for stride in (2, 3, 5, 7, 15):
            for lag in (1, 3, 7, 10, 30):
                # collect chunks per trajectory index
                per_traj = {idx: [] for idx in range(n_traj)}
                for itraj, _, Y in reader.iterator(stride, lag):
                    per_traj[itraj].append(Y)

                for i, pieces in enumerate(per_traj.values()):
                    reference = np.load(self.files2d[i])
                    np.testing.assert_equal(
                        np.vstack(pieces), reference[lag::stride],
                        "not equal for stride=%i"
                        " and lag=%i" % (stride, lag))
Exemplo n.º 15
0
 def test_skip(self):
     """Skipping the first rows equals slicing the unskipped output."""
     for skip in (0, 3, 13):
         skipped = NumPyFileReader(self.npy_files[0]).get_output(skip=skip)[0]
         full = NumPyFileReader(self.npy_files[0]).get_output()[0]
         message = ("The first %s rows were skipped, but that did not "
                    "match the rows with skip=0 and sliced by [%s::]" %
                    (skip, skip))
         np.testing.assert_almost_equal(skipped, full[skip::], err_msg=message)
Exemplo n.º 16
0
    def test_npy_reader(self):
        """Reader metadata cached in self.db matches the saved arrays."""
        shapes = [(7, 3), (23, 3), (27, 3)]
        arrays = [np.empty(shape) for shape in shapes]
        files = []
        with TemporaryDirectory() as td:
            for i, arr in enumerate(arrays):
                path = os.path.join(td, "%i.npy" % i)
                np.save(path, arr)
                files.append(path)

            reader = NumPyFileReader(files)

            # cache it and compare
            results = {}
            expected = {}
            for i, f in enumerate(files):
                info = self.db[f, reader]
                results[f] = (info.length, info.ndim, info.offsets)
                expected[f] = (len(arrays[i]), arrays[i].shape[1], [])
            np.testing.assert_equal(results, expected)
Exemplo n.º 17
0
 def test_skip_input_list(self):
     """skip applies to every file when a list of files is given."""
     for skip in (0, 3, 13):
         with_skip = NumPyFileReader(self.npy_files).get_output(skip=skip)
         without = NumPyFileReader(self.npy_files).get_output()
         for i in range(len(self.npy_files)):
             np.testing.assert_almost_equal(
                 with_skip[i],
                 without[i][skip::],
                 err_msg=
                 "The first %s rows of the %s'th file were skipped, but that did not "
                 "match the rows with skip=0 and sliced by [%s::]" %
                 (skip, i, skip))
Exemplo n.º 18
0
def create_file_reader(input_files, topology, featurizer, chunksize=None, **kw):
    r"""
    Creates a (possibly featured) file reader by a number of input files and either a topology file or a featurizer.

    Parameters
    ----------
    input_files : str, Path, or list thereof
        A single input file or a list of input files. A list of lists is
        treated as fragmented trajectories.
    topology : str or None
        A topology file. If given, the featurizer argument can be None.
    featurizer : object or None
        A featurizer. If given, the topology file can be None.
    chunksize : int or None
        The chunk size with which the corresponding reader gets initialized.

    Returns
    -------
    reader
        The reader matching the detected file type (H5Reader, FeatureReader,
        NumPyFileReader, PyCSVReader, or FragmentedTrajectoryReader).

    Raises
    ------
    ValueError
        If the input list is empty, mixes file types, contains missing files,
        or MD files are given without a topology/featurizer.
    """
    # local imports avoid cyclic dependencies at module load time
    from pyemma.coordinates.data.numpy_filereader import NumPyFileReader
    from pyemma.coordinates.data.py_csv_reader import PyCSVReader
    from pyemma.coordinates.data import FeatureReader
    from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader

    # fragmented trajectories: a list containing at least one nested list
    if (isinstance(input_files, (list, tuple)) and len(input_files) > 0 and
            any(isinstance(item, (list, tuple)) for item in input_files)):
        return FragmentedTrajectoryReader(input_files, topology, chunksize, featurizer)

    # normal trajectories
    if (isinstance(input_files, (Path, str))
            or (isinstance(input_files, (list, tuple))
                and (any(isinstance(item, (Path, str)) for item in input_files)
                     or len(input_files) == 0))):
        # check: if single string create a one-element list
        if isinstance(input_files, (Path, str)):
            input_list = [input_files]
        elif len(input_files) > 0 and all(isinstance(item, (Path, str)) for item in input_files):
            input_list = input_files
        else:
            if len(input_files) == 0:
                raise ValueError("The passed input list should not be empty.")
            else:
                raise ValueError("The passed list did not exclusively contain strings or was a list of lists "
                                 "(fragmented trajectory).")

        # convert to list of paths
        input_list = [Path(f) for f in input_list]

        # TODO: this does not handle suffixes like .xyz.gz (rare)
        suffix = input_list[0].suffix

        # check: do all files have the same file type? If not: raise ValueError.
        if all(item.suffix == suffix for item in input_list):

            # do all the files exist? If not: Raise value error
            all_exist = True
            # stdlib io.StringIO replaces the former third-party six shim
            from io import StringIO
            err_msg = StringIO()
            for item in input_list:
                if not item.is_file():
                    err_msg.write('\n' if err_msg.tell() > 0 else "")
                    err_msg.write('File %s did not exist or was no file' % item)
                    all_exist = False
            if not all_exist:
                raise ValueError('Some of the given input files were directories'
                                 ' or did not exist:\n%s' % err_msg.getvalue())
            featurizer_or_top_provided = featurizer is not None or topology is not None
            # we need to check for h5 first, because of mdtraj custom HDF5 traj format (which is deprecated).
            if suffix in ('.h5', '.hdf5') and not featurizer_or_top_provided:
                # This check is potentially expensive for lots of files, we also re-open the file twice (causing atime updates etc.)
                # So we simply require that no featurizer option is given.
                # and not all((_is_mdtraj_hdf5_file(f) for f in input_files)):
                from pyemma.coordinates.data.h5_reader import H5Reader
                reader = H5Reader(filenames=input_files, chunk_size=chunksize, **kw)
            # CASE 1.1: file types are MD files
            elif FeatureReader.supports_format(suffix):
                # check: do we either have a featurizer or a topology file name? If not: raise ValueError.
                # create a MD reader with file names and topology
                if not featurizer_or_top_provided:
                    raise ValueError('The input files were MD files which makes it mandatory to have either a '
                                     'Featurizer or a topology file.')

                if suffix in ('.pdb', '.pdb.gz'):
                    raise ValueError('PyEMMA can not read PDB-fake-trajectories. '
                                     'Please consider using a sane trajectory format (e.g. xtc, dcd).')

                reader = FeatureReader(input_list, featurizer=featurizer, topologyfile=topology,
                                       chunksize=chunksize)
            elif suffix in ('.npy', '.npz'):
                reader = NumPyFileReader(input_list, chunksize=chunksize)
            # otherwise we assume that given files are ascii tabulated data
            else:
                reader = PyCSVReader(input_list, chunksize=chunksize, **kw)
        else:
            raise ValueError('Not all elements in the input list were of the type %s!' % suffix)
    else:
        raise ValueError('Input "{}" was no string or list of strings.'.format(input_files))
    return reader
Exemplo n.º 19
0
def create_file_reader(input_files,
                       topology,
                       featurizer,
                       chunk_size=1000,
                       **kw):
    r"""
    Creates a (possibly featured) file reader by a number of input files and either a topology file or a featurizer.

    Parameters
    ----------
    input_files : str or list of str
        A single input file or a list of input files. A list of lists is
        treated as fragmented trajectories.
    topology : str or None
        A topology file. If given, the featurizer argument can be None.
    featurizer : object or None
        A featurizer. If given, the topology file can be None.
    chunk_size : int
        The chunk size with which the corresponding reader gets initialized.

    Returns
    -------
    reader
        The reader matching the detected file type (FeatureReader,
        NumPyFileReader, PyCSVReader, or FragmentedTrajectoryReader).

    Raises
    ------
    ValueError
        If the input list is empty, mixes file types, contains missing files,
        or MD files are given without a topology/featurizer.
    """
    # local imports avoid cyclic dependencies at module load time
    from pyemma.coordinates.data.numpy_filereader import NumPyFileReader
    from pyemma.coordinates.data.py_csv_reader import PyCSVReader
    from pyemma.coordinates.data import FeatureReader
    from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader

    # fragmented trajectories: a list containing at least one nested list
    if (isinstance(input_files, (list, tuple)) and len(input_files) > 0
            and any(isinstance(item, (list, tuple)) for item in input_files)):
        return FragmentedTrajectoryReader(input_files, topology, chunk_size,
                                          featurizer)

    # normal trajectories
    if (isinstance(input_files, string_types)
            or (isinstance(input_files, (list, tuple)) and
                (any(isinstance(item, string_types)
                     # `== 0`, not `is 0`: identity comparison with an int
                     # literal is implementation-dependent and a SyntaxWarning.
                     for item in input_files) or len(input_files) == 0))):
        reader = None
        # check: if single string create a one-element list
        if isinstance(input_files, string_types):
            input_list = [input_files]
        elif len(input_files) > 0 and all(
                isinstance(item, string_types) for item in input_files):
            input_list = input_files
        else:
            if len(input_files) == 0:
                raise ValueError("The passed input list should not be empty.")
            else:
                raise ValueError(
                    "The passed list did not exclusively contain strings or was a list of lists "
                    "(fragmented trajectory).")

        # TODO: this does not handle suffixes like .xyz.gz (rare)
        _, suffix = os.path.splitext(input_list[0])

        # check: do all files have the same file type? If not: raise ValueError.
        if all(item.endswith(suffix) for item in input_list):

            # do all the files exist? If not: Raise value error
            all_exist = True
            err_msg = ""
            for item in input_list:
                if not os.path.isfile(item):
                    err_msg += "\n" if len(err_msg) > 0 else ""
                    err_msg += "File %s did not exist or was no file" % item
                    all_exist = False
            if not all_exist:
                raise ValueError(
                    "Some of the given input files were directories"
                    " or did not exist:\n%s" % err_msg)

            if all_exist:
                from mdtraj.formats.registry import FormatRegistry

                # CASE 1.1: file types are MD files
                # membership test against the dict directly; no throwaway list
                if suffix in FormatRegistry.loaders:
                    # check: do we either have a featurizer or a topology file name? If not: raise ValueError.
                    # create a MD reader with file names and topology
                    if not featurizer and not topology:
                        raise ValueError(
                            "The input files were MD files which makes it mandatory to have either a "
                            "featurizer or a topology file.")

                    reader = FeatureReader(input_list,
                                           featurizer=featurizer,
                                           topologyfile=topology,
                                           chunksize=chunk_size)
                else:
                    if suffix in ['.npy', '.npz']:
                        reader = NumPyFileReader(input_list,
                                                 chunksize=chunk_size)
                    # otherwise we assume that given files are ascii tabulated data
                    else:
                        reader = PyCSVReader(input_list,
                                             chunksize=chunk_size,
                                             **kw)
        else:
            raise ValueError(
                "Not all elements in the input list were of the type %s!" %
                suffix)
    else:
        # bug fix: previously formatted the builtin `input` instead of the
        # `input_files` argument, producing a useless error message.
        raise ValueError("Input \"%s\" was no string or list of strings." %
                         input_files)
    return reader
Exemplo n.º 20
0
    def testSingleFile(self):
        """A reader over a single .npy file reports the correct frame total."""
        single = NumPyFileReader(self.npy_files[0])

        self.assertEqual(single.n_frames_total(), self.d.shape[0])
Exemplo n.º 21
0
 def test_describe(self):
     """describe() runs without raising on a multi-file reader."""
     multi_reader = NumPyFileReader(self.files2d)
     multi_reader.describe()
Exemplo n.º 22
0
    def testSingleFile(self):
        """n_frames_total of a one-file reader equals the source array length."""
        frames = NumPyFileReader(self.npy_files[0]).n_frames_total()

        self.assertEqual(frames, self.d.shape[0])