Example #1
 def test_stride(self):
     reader = DataInMemory(self.d)
     stride = [1, 2, 3, 4, 5, 6, 7, 10, 11, 21, 23]
     for s in stride:
         output = reader.get_output(stride=s)[0]
         expected = self.d[::s]
         np.testing.assert_allclose(output, expected,
                                    err_msg="not equal for stride=%i" % s)
Example #2
    def testDataArray(self):
        frames_per_traj = 100
        dim = 3

        data = np.random.random((frames_per_traj, dim))
        d = DataInMemory(data)

        np.testing.assert_equal(d.trajectory_lengths(), np.array([frames_per_traj]))
Example #3
    def test1dData(self):
        n = 3
        data = np.arange(n)
        reader = DataInMemory(data)

        self.assertEqual(reader.trajectory_lengths(), np.array([n]))
        self.assertEqual(reader.ndim, 1)
        self.assertEqual(reader.number_of_trajectories(), 1)
        self.assertEqual(reader.n_frames_total(), n)
Example #4
    def test1dDataList(self):
        n = 10
        data = [np.arange(n), np.arange(n)]
        reader = DataInMemory(data)

        np.testing.assert_equal(reader.trajectory_lengths(), np.array([n, n]))
        self.assertEqual(reader.ndim, 1)
        self.assertEqual(reader.number_of_trajectories(), 2)
        self.assertEqual(reader.n_frames_total(), 2 * n)
Example #5
    def testListOfArrays(self):

        frames_per_traj = 100
        dim = 3
        data = [np.random.random((frames_per_traj, dim)) for _ in range(3)]

        d = DataInMemory(data)

        self.assertEqual(d.dimension(), dim)

        np.testing.assert_equal(
                d.trajectory_lengths(), np.array([frames_per_traj for _ in range(3)]))
Example #6
 def test_lagged_stridden_access(self):
     data = np.random.random((1000, 2)).astype(np.float32)
     reader = DataInMemory(data)
     strides = [2, 3, 5, 7, 15]
     lags = [1, 3, 7, 10, 30]
     for stride in strides:
         for lag in lags:
             chunks = []
             for _, _, Y in reader.iterator(stride=stride, lag=lag):
                 chunks.append(Y)
             chunks = np.vstack(chunks)
             np.testing.assert_equal(chunks, data[lag::stride],
                                     err_msg="failed for stride=%s, lag=%s" % (stride, lag))
Example #7
 def test_duplicated_data_in_fit_transform(self):
     X = np.random.randn(100, 2)
     d = DataInMemory([X, X])
     tica = api.tica(data=d, lag=1, dim=1)
     out1 = tica.get_output()
     out2 = tica.fit_transform([X, X])
     np.testing.assert_array_almost_equal(out1, out2)
Example #8
    def test_big_k(self):
        x = np.random.random((300, 3))
        reader = DataInMemory(x)
        k = 151
        c = api.cluster_uniform_time(k=k)

        c.estimate(reader)
Example #9
    def test_2d_skip(self):
        x = np.random.random((300, 3))
        reader = DataInMemory(x)

        k = 2
        c = api.cluster_uniform_time(k=k, skip=100)

        c.estimate(reader)
Example #10
    def test_duplicated_data(self):
        # make some data in which the first column appears twice
        X = np.random.randn(100, 2)
        X = np.hstack((X, X[:, 0, np.newaxis]))

        d = DataInMemory(X)

        tica_obj = api.tica(data=d, lag=1, dim=1)

        assert tica_obj.eigenvectors.dtype == np.float64
        assert tica_obj.eigenvalues.dtype == np.float64
Example #11
    def test_time_lagged_chunked_access(self):
        n = 100
        data = [np.random.random((n, 3)), np.zeros((29, 3)),
                np.random.random((n - 50, 3))]
        reader = DataInMemory(data)
        self.assertEqual(reader.n_frames_total(), n + n - 50 + 29)

        # iterate over data
        it = reader.iterator(lag=30, return_trajindex=True)
        for itraj, X, Y in it:
            if itraj == 0:
                # self.assertEqual(X.shape, (100, 3)) <-- changed behavior: return only chunks of same size
                self.assertEqual(X.shape, (70, 3))
                self.assertEqual(Y.shape, (70, 3))
            elif itraj == 1:
                # the time lagged chunk can not be built due to lag time
                self.assertEqual(X.shape, (0, 3))
                self.assertEqual(Y.shape, (0, 3))
            elif itraj == 2:
                self.assertEqual(X.shape, (20, 3))
                self.assertEqual(Y.shape, (20, 3))
Example #12
    def testChunksizeResultsTica(self):
        chunk = 40
        lag = 100
        np.random.seed(0)
        X = np.random.randn(23000, 3)

        # un-chunked
        d = DataInMemory(X)

        tica_obj = api.tica(data=d, lag=lag, dim=1)

        cov = tica_obj.cov.copy()
        mean = tica_obj.mean.copy()

        # ------- run again with new chunksize -------
        d = DataInMemory(X)
        d.chunksize = chunk
        tica_obj = api.tica(data=d, lag=lag, dim=1)

        np.testing.assert_allclose(tica_obj.mean, mean)
        np.testing.assert_allclose(tica_obj.cov, cov)
Example #13
    def test_lagged_iterator_1d_legacy(self):
        n = 30
        chunksize = 5
        lag = 9
        stride = 2

        data = [np.arange(n), np.arange(50), np.arange(33)]
        input_lens = [x.shape[0] for x in data]
        reader = DataInMemory(data, chunksize=chunksize)
        it = reader.iterator(chunk=chunksize, stride=stride, lag=lag)
        # lag > chunksize, so we expect a _LegacyLaggedIterator
        from pyerna.coordinates.data._base.iterable import _LegacyLaggedIterator
        self.assertIsInstance(it, _LegacyLaggedIterator)
        assert reader.chunksize == chunksize

        self.assertEqual(reader.n_frames_total(), sum(input_lens))

        # store results by traj
        chunked_trajs = [[] for _ in range(len(data))]
        chunked_lagged_trajs = [[] for _ in range(len(data))]

        # iterate over data
        for itraj, X, Y in reader.iterator(lag=lag, stride=stride):
            chunked_trajs[itraj].append(X)
            chunked_lagged_trajs[itraj].append(Y)

        trajs = [np.vstack(ichunks) for ichunks in chunked_trajs]
        lagged_trajs = [np.vstack(ichunks) for ichunks in chunked_lagged_trajs]

        # unlagged data
        for idx, (traj, input_traj) in enumerate(zip(trajs, data)):
            # do not consider chunks that have no lagged counterpart
            np.testing.assert_equal(traj.T.squeeze(), input_traj[::stride][:len(lagged_trajs[idx])].squeeze(),
                                    err_msg="failed for traj=%s"%idx)

        # lagged data
        for idx, (traj, input_traj) in enumerate(zip(lagged_trajs, data)):
            np.testing.assert_equal(traj.T.squeeze(), input_traj[lag::stride].squeeze(),
                                    err_msg="failed for traj=%s" % idx)
Example #14
 def test_skip(self):
     for skip in [0, 3, 13]:
         r1 = DataInMemory(self.d)
         out_with_skip = r1.get_output(skip=skip)[0]
         r2 = DataInMemory(self.d)
         out = r2.get_output()[0]
         np.testing.assert_almost_equal(out_with_skip, out[skip::],
                                        err_msg="The first %s rows were skipped, but that did not "
                                                "match the rows with skip=0 and sliced by [%s::]" % (skip, skip))
Example #15
    def test_ndim_input(self):
        data = np.empty((4, 2, 2, 2))

        reader = DataInMemory(data)

        self.assertEqual(reader.ndim, 2 * 2 * 2)
        self.assertEqual(reader.number_of_trajectories(), 1)
        self.assertEqual(reader.n_frames_total(), 4)
        np.testing.assert_equal(
                reader.trajectory_lengths(), np.array([reader.n_frames_total()]))
Example #16
 def test_skip_input_list(self):
     for skip in [0, 3, 13]:
         r1 = DataInMemory([self.d, self.d])
         out_with_skip = r1.get_output(skip=skip)
         r2 = DataInMemory([self.d, self.d])
         out = r2.get_output()
         np.testing.assert_almost_equal(out_with_skip[0], out[0][skip::],
                                        err_msg="The first %s rows of the first file were skipped, but that did not "
                                                "match the rows with skip=0 and sliced by [%s::]" % (skip, skip))
         np.testing.assert_almost_equal(out_with_skip[1], out[1][skip::],
                                        err_msg="The first %s rows of the second file were skipped, but that did not"
                                                " match the rows with skip=0 and sliced by [%s::]" % (skip, skip))
Example #17
    def get_output(self,
                   dimensions=slice(0, None),
                   stride=1,
                   skip=0,
                   chunk=None):
        """Maps all input data of this transformer and returns it as an array or list of arrays

        Parameters
        ----------
        dimensions : list-like of indices or slice, default=all
            indices of the dimensions you would like to keep.
        stride : int, default=1
            only take every n'th frame.
        skip : int, default=0
            initially skip n frames of each file.
        chunk : int, default=None
            How many frames to process at once. If not given, the chunk size is
            obtained from the source.

        Returns
        -------
        output : list of ndarray(T_i, d)
            the mapped data, where T_i is the number of time steps of trajectory i or,
            if stride > 1, floor(T_i / stride). d is the output dimension of this
            transformer. If the input consists of a list of trajectories, the output
            will be a corresponding list of trajectories.

        """
        if isinstance(dimensions, int):
            ndim = 1
            dimensions = slice(dimensions, dimensions + 1)
        elif isinstance(dimensions, (list, np.ndarray, tuple, slice)):
            if hasattr(dimensions, 'ndim') and dimensions.ndim > 1:
                raise ValueError(
                    'dimension indices can\'t have more than one dimension')
            ndim = len(np.zeros(self.ndim)[dimensions])
        else:
            raise ValueError('unsupported type (%s) of "dimensions"' %
                             type(dimensions))

        assert ndim > 0, "ndim was zero in %s" % self.__class__.__name__

        if chunk is None:
            chunk = self.chunksize

        # create iterator
        if self.in_memory and not self._mapping_to_mem_active:
            from pyerna.coordinates.data.data_in_memory import DataInMemory
            assert self._Y is not None
            it = DataInMemory(self._Y)._create_iterator(skip=skip,
                                                        chunk=chunk,
                                                        stride=stride,
                                                        return_trajindex=True)
        else:
            it = self._create_iterator(skip=skip,
                                       chunk=chunk,
                                       stride=stride,
                                       return_trajindex=True)

        with it:
            # allocate memory
            try:
                from pyerna import config
                if config.coordinates_check_output:
                    trajs = [
                        np.full((l, ndim), np.nan, dtype=self.output_type())
                        for l in it.trajectory_lengths()
                    ]
                else:
                    # TODO: avoid having a copy here, if Y is already filled
                    trajs = [
                        np.empty((l, ndim), dtype=self.output_type())
                        for l in it.trajectory_lengths()
                    ]
            except MemoryError:
                self.logger.exception(
                    "Could not allocate enough memory to map all data."
                    " Consider using a larger stride.")
                return

            if self._logger_is_active(self._loglevel_DEBUG):
                self.logger.debug("get_output(): dimensions=%s" %
                                  str(dimensions))
                self.logger.debug(
                    "get_output(): created output trajs with shapes: %s" %
                    [x.shape for x in trajs])
                self.logger.debug("nchunks :%s, chunksize=%s" %
                                  (it.n_chunks, it.chunksize))
            # fetch data
            from pyerna._base.progress import ProgressReporter
            pg = ProgressReporter()
            pg.register(it.n_chunks,
                        description='getting output of %s' %
                        self.__class__.__name__)
            with pg.context(), it:
                for itraj, chunk in it:
                    i = slice(it.pos, it.pos + len(chunk))
                    assert i.stop - i.start > 0
                    trajs[itraj][i, :] = chunk[:, dimensions]
                    pg.update(1)

        if config.coordinates_check_output:
            for i, t in enumerate(trajs):
                finite = self._chunk_finite(t)
                if not np.all(finite):
                    # determine position
                    # np.where returns a tuple of index arrays; take the frame indices
                    frames = np.where(np.logical_not(finite))[0]
                    if not len(frames):
                        raise RuntimeError(
                            'nothing got assigned for traj {}'.format(i))
                    raise RuntimeError(
                        'unassigned sections in traj {i} in range [{frames}]'.
                        format(frames=frames, i=i))

        return trajs
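
A minimal usage sketch of the get_output API documented above, assuming a DataInMemory reader as in the tests and assuming that skip is applied before striding (consistent with the slicing semantics exercised in Examples #1 and #14):

    import numpy as np

    data = np.random.random((100, 3))
    reader = DataInMemory(data)
    # keep only column 0, take every 2nd remaining frame, skip the first 5 frames
    out = reader.get_output(dimensions=slice(0, 1), stride=2, skip=5)[0]
    # expected to be equivalent to data[5::2, 0:1]
    np.testing.assert_allclose(out, data[5::2, 0:1])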
Example #18
 def test_cols(self):
     reader = DataInMemory(self.d)
     cols=(2, 0)
     for x in reader.iterator(chunk=0, return_trajindex=False, cols=cols):
         np.testing.assert_equal(x, self.d[:, cols])
Example #19
 def testWrongArguments(self):
     with self.assertRaises(ValueError):
         reader = DataInMemory("foo")
Example #20
    def iterator(self,
                 stride=1,
                 lag=0,
                 chunk=None,
                 return_trajindex=True,
                 cols=None,
                 skip=0):
        """ creates an iterator to stream over the (transformed) data.

        If your data is too large to fit into memory and you want to incrementally compute
        some quantities on it, you can create an iterator on a reader or transformer (e.g. TICA)
        to avoid memory overflows.

        Parameters
        ----------

        stride : int, default=1
            Take only every stride'th frame.
        lag: int, default=0
            time-lag in frames; if lag > 0, the iterator additionally yields chunks of
            the data shifted forward by this many frames.
        chunk: int, default=None
            How many frames to process at once. If not given obtain the chunk size
            from the source.
        return_trajindex: boolean, default=True
            if False, yield only the chunk of data; otherwise yield tuples of (trajindex, data).
        cols: array like, default=None
            return only the given columns.
        skip: int, default=0
            skip the first 'n' frames of each trajectory.

        Returns
        -------
        iter : instance of DataSourceIterator
            an implementation of DataSourceIterator to stream over the data

        Examples
        --------

        >>> from pyerna.coordinates import source; import numpy as np
        >>> data = [np.arange(3), np.arange(4, 7)]
        >>> reader = source(data)
        >>> iterator = reader.iterator(chunk=1)
        >>> for array_index, chunk in iterator:
        ...     print(array_index, chunk)
        0 [[0]]
        0 [[1]]
        0 [[2]]
        1 [[4]]
        1 [[5]]
        1 [[6]]
        """
        if self.in_memory:
            from pyerna.coordinates.data.data_in_memory import DataInMemory
            return DataInMemory(self._Y).iterator(
                lag=lag,
                chunk=chunk,
                stride=stride,
                return_trajindex=return_trajindex,
                skip=skip)
        chunk = chunk if chunk is not None else self.chunksize
        if lag > 0:
            if chunk == 0 or lag <= chunk:
                it = self._create_iterator(skip=skip,
                                           chunk=chunk,
                                           stride=1,
                                           return_trajindex=return_trajindex,
                                           cols=cols)
                it.return_traj_index = True
                return _LaggedIterator(it, lag, return_trajindex, stride)
            else:
                it = self._create_iterator(skip=skip,
                                           chunk=chunk,
                                           stride=stride,
                                           return_trajindex=return_trajindex,
                                           cols=cols)
                it.return_traj_index = True
                it_lagged = self._create_iterator(skip=skip + lag,
                                                  chunk=chunk,
                                                  stride=stride,
                                                  return_trajindex=True,
                                                  cols=cols)
                return _LegacyLaggedIterator(it, it_lagged, return_trajindex)
        return self._create_iterator(skip=skip,
                                     chunk=chunk,
                                     stride=stride,
                                     return_trajindex=return_trajindex,
                                     cols=cols)
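
A short sketch of the lagged-iteration path implemented above, following the access pattern from Examples #6 and #11: with lag > 0 the iterator yields time-lagged chunk pairs, and the unlagged chunk is truncated to the length of its lagged counterpart.

    import numpy as np

    lag = 5
    data = np.random.random((50, 2))
    reader = DataInMemory(data)
    X_chunks, Y_chunks = [], []
    for itraj, X, Y in reader.iterator(lag=lag):
        X_chunks.append(X)  # unlagged chunk
        Y_chunks.append(Y)  # chunk shifted forward by `lag` frames
    # per Example #11, X is cut to match the length of the lagged stream
    np.testing.assert_equal(np.vstack(X_chunks), data[:-lag])
    np.testing.assert_equal(np.vstack(Y_chunks), data[lag:])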
Example #21
    def testNotEqualDims(self):
        """ should raise, since different dims can not be processed"""
        data = [np.zeros((10, 3)), np.zeros((10, 5))]

        with self.assertRaises(ValueError):
            DataInMemory(data)
Example #22
    def test_1d(self):
        x = np.random.random(1000)
        reader = DataInMemory(x)

        k = 2
        c = api.cluster_uniform_time(reader, k=k)