def test1dDataList(self):
    """A list of two equally long 1-d arrays is read as two 1-d trajectories."""
    length = 10
    n_trajs = 2
    source = DataInMemory([np.arange(length), np.arange(length)])

    self.assertEqual(source.trajectory_lengths(), [length] * n_trajs)
    self.assertEqual(source.dimension(), 1)
    self.assertEqual(source.number_of_trajectories(), n_trajs)
    self.assertEqual(source.n_frames_total(), n_trajs * length)
def testDataArray(self):
    """A single 2-d array is read as one trajectory with ``dim`` columns."""
    frames_per_traj = 100
    dim = 3
    data = np.random.random((frames_per_traj, dim))
    d = DataInMemory(data)

    # A plain literal replaces the Python-2-only ``xrange(1)`` construction.
    self.assertEqual(d.trajectory_lengths(), [frames_per_traj])
    # Also pin the dimension, as the sibling tests for other input shapes do.
    self.assertEqual(d.dimension(), dim)
def test1dData(self):
    """A single 1-d array is read as one 1-d trajectory."""
    length = 3
    source = DataInMemory(np.arange(length))

    self.assertEqual(source.trajectory_lengths(), [length])
    self.assertEqual(source.dimension(), 1)
    self.assertEqual(source.number_of_trajectories(), 1)
    self.assertEqual(source.n_frames_total(), length)
def testListOfArrays(self):
    """A list of equally shaped 2-d arrays yields one trajectory per array."""
    frames_per_traj = 100
    dim = 3
    n_trajs = 3
    # ``range`` works on Python 2 and 3; ``xrange`` only exists on Python 2.
    data = [np.random.random((frames_per_traj, dim)) for _ in range(n_trajs)]
    d = DataInMemory(data)

    self.assertEqual(d.dimension(), dim)
    self.assertEqual(d.trajectory_lengths(), [frames_per_traj] * n_trajs)
def testChunksizeResultsTica(self):
    """TICA mean and covariance must not depend on the reader's chunk size."""
    chunk = 40
    lag = 100
    np.random.seed(0)
    X = np.random.randn(23000, 3)

    # reference run with the reader's default chunk size
    d = DataInMemory(X)
    tica_obj = api.tica(data=d, lag=lag, dim=1)
    cov = tica_obj.cov.copy()
    mean = tica_obj.mu.copy()

    # ------- run again with new chunksize -------
    d = DataInMemory(X)
    d.chunksize = chunk
    # Go through ``api`` here as well, so both runs use the same entry point
    # and the test does not rely on a bare ``tica`` name being imported.
    tica_obj = api.tica(data=d, lag=lag, dim=1)

    np.testing.assert_allclose(tica_obj.mu, mean)
    np.testing.assert_allclose(tica_obj.cov, cov)
def test_lagged_iterator_2d(self):
    """Chunked lagged iteration reproduces the input and its lag-shifted copy."""
    chunksize = 10
    lag = 1

    data = [np.arange(300).reshape((100, 3)),
            np.arange(29 * 3).reshape((29, 3)),
            np.arange(150).reshape(50, 3)]
    input_lens = [x.shape[0] for x in data]

    reader = DataInMemory(data)
    reader.chunksize = chunksize

    self.assertEqual(reader.n_frames_total(), sum(input_lens))

    # store results by traj (``range`` instead of the Python-2-only ``xrange``)
    chunks = [[] for _ in range(len(data))]
    lagged_chunks = [[] for _ in range(len(data))]

    # iterate over data
    for itraj, X, Y in reader.iterator(lag=lag):
        chunks[itraj].append(X)
        lagged_chunks[itraj].append(Y)

    trajs = [np.vstack(ichunks) for ichunks in chunks]
    lagged_trajs = [np.vstack(ichunks) for ichunks in lagged_chunks]

    # unlagged data
    for traj, input_traj in zip(trajs, data):
        np.testing.assert_equal(traj.reshape(input_traj.shape), input_traj)

    # lagged data: each input trajectory with its first ``lag`` frames dropped
    lagged_0 = [d[lag:] for d in data]
    for traj, input_traj in zip(lagged_trajs, lagged_0):
        np.testing.assert_equal(traj.reshape(input_traj.shape), input_traj)
def test_time_lagged_chunked_access(self):
    """Check the shapes of (chunk, lagged chunk) pairs from ``_next_chunk``."""
    n = 100
    data = [np.random.random((n, 3)),
            np.zeros((29, 3)),
            np.random.random((n - 50, 3))]
    reader = DataInMemory(data)

    self.assertEqual(reader.n_frames_total(), n + n - 50 + 29)

    lag = 30
    # Expected (unlagged, lagged) shapes per trajectory. For trajectory 1 the
    # time lagged chunk can not be built due to lag time, hence zero frames.
    expected_shapes = {
        0: ((100, 3), (70, 3)),
        1: ((29, 3), (0, 3)),
        2: ((50, 3), (20, 3)),
    }

    # iterate over data
    traj_index = 0
    finished = False
    while not finished:
        offset = 0
        traj_done = False
        while not traj_done:
            X, Y = reader._next_chunk(lag=lag)
            if traj_index in expected_shapes:
                shape_x, shape_y = expected_shapes[traj_index]
                self.assertEqual(X.shape, shape_x)
                self.assertEqual(Y.shape, shape_y)

            n_read = np.shape(X)[0]
            # last chunk in traj?
            traj_done = offset + n_read >= reader.trajectory_length(traj_index)
            # last chunk?
            finished = (traj_done and
                        traj_index >= reader.number_of_trajectories() - 1)
            offset += n_read
        # increment trajectory
        traj_index += 1
def test_ndim_input(self):
    """Trailing axes of an n-d array are flattened into the feature dimension."""
    n_frames = 4
    source = DataInMemory(np.empty((n_frames, 2, 2, 2)))

    self.assertEqual(source.dimension(), 2 * 2 * 2)
    self.assertEqual(source.number_of_trajectories(), 1)
    self.assertEqual(source.n_frames_total(), n_frames)
    self.assertEqual(source.trajectory_lengths(), [source.n_frames_total()])
def iterator(self, stride=1, lag=0, chunk=None, return_trajindex=True, cols=None, skip=0):
    """ creates an iterator to stream over the (transformed) data.

    If your data is too large to fit into memory and you want to incrementally compute
    some quantities on it, you can create an iterator on a reader or transformer (eg. TICA)
    to avoid memory overflows.

    Parameters
    ----------
    stride : int, default=1
        Take only every stride'th frame.
    lag: int, default=0
        If greater than zero, a lagged iterator is returned which additionally
        provides the data shifted by ``lag`` frames.
    chunk: int, default=None
        How many frames to process at once. If not given obtain the chunk size
        from the source.
    return_trajindex: boolean, default=True
        If False, the iterator yields only a chunk of data, otherwise a tuple
        of (trajindex, data).
    cols: array like, default=None
        return only the given columns.
    skip: int, default=0
        skip 'n' first frames of each trajectory.

    Returns
    -------
    iter : instance of DataSourceIterator
        a implementation of a DataSourceIterator to stream over the data

    Examples
    --------

    >>> from pyemma.coordinates import source; import numpy as np
    >>> data = [np.arange(3), np.arange(4, 7)]
    >>> reader = source(data)
    >>> iterator = reader.iterator(chunk=1)
    >>> for array_index, chunk in iterator:
    ...     print(array_index, chunk)
    0 [[0]]
    0 [[1]]
    0 [[2]]
    1 [[4]]
    1 [[5]]
    1 [[6]]
    """
    if self.in_memory:
        # Output is already cached in self._Y: stream from memory instead of
        # recomputing. ``cols`` is forwarded so the column selection is
        # honoured on this path as well (it used to be silently dropped).
        from pyemma.coordinates.data.data_in_memory import DataInMemory
        return DataInMemory(self._Y).iterator(
            lag=lag, chunk=chunk, stride=stride,
            return_trajindex=return_trajindex, cols=cols, skip=skip)

    chunk = chunk if chunk is not None else self.chunksize

    if 0 < lag <= chunk:
        # The underlying iterator is created with stride=1; the requested
        # stride is handed to the lagged wrapper instead.
        it = self._create_iterator(skip=skip, chunk=chunk, stride=1,
                                   return_trajindex=return_trajindex, cols=cols)
        it.return_traj_index = True
        return _LaggedIterator(it, lag, return_trajindex, stride)
    elif lag > 0:
        # lag exceeds the chunk size (or chunk is 0): pair two independent
        # iterators, the second one starting ``lag`` frames later.
        it = self._create_iterator(skip=skip, chunk=chunk, stride=stride,
                                   return_trajindex=return_trajindex, cols=cols)
        it.return_traj_index = True
        it_lagged = self._create_iterator(skip=skip + lag, chunk=chunk, stride=stride,
                                          return_trajindex=True, cols=cols)
        return _LegacyLaggedIterator(it, it_lagged, return_trajindex)

    # no lag requested: plain iterator
    return self._create_iterator(skip=skip, chunk=chunk, stride=stride,
                                 return_trajindex=return_trajindex, cols=cols)
def get_output(self, dimensions=slice(0, None), stride=1, skip=0, chunk=None):
    """Maps all input data of this transformer and returns it as an array or list of arrays

    Parameters
    ----------
    dimensions : int, list-like of indexes or slice, default=all
        indices of dimensions you like to keep. A single int selects exactly
        that one dimension.
    stride : int, default=1
        only take every n'th frame.
    skip : int, default=0
        initially skip n frames of each file.
    chunk: int, default=None
        How many frames to process at once. If not given obtain the chunk size
        from the source.

    Returns
    -------
    output : list of ndarray(T_i, d)
        the mapped data, one array per trajectory, where T_i is the length
        reported by the iterator for trajectory i and d is the number of
        selected dimensions.
        ``None`` is returned if the output arrays could not be allocated;
        the MemoryError is logged, not re-raised.

    Raises
    ------
    ValueError
        if ``dimensions`` is an array with more than one dimension or is of an
        unsupported type.
    RuntimeError
        if ``config.coordinates_check_output`` is enabled and
        ``self._chunk_finite`` reports non-finite entries in an output array.
    """
    # Normalise ``dimensions`` and work out how many output columns it selects.
    if isinstance(dimensions, int):
        ndim = 1
        dimensions = slice(dimensions, dimensions + 1)
    elif isinstance(dimensions, (list, np.ndarray, tuple, slice)):
        if hasattr(dimensions, 'ndim') and dimensions.ndim > 1:
            raise ValueError(
                'dimension indices can\'t have more than one dimension')
        # Apply the selection to a dummy vector of length self.ndim and count
        # what is left.
        ndim = len(np.zeros(self.ndim)[dimensions])
    else:
        raise ValueError('unsupported type (%s) of "dimensions"' %
                         type(dimensions))

    assert ndim > 0, "ndim was zero in %s" % self.__class__.__name__

    if chunk is None:
        chunk = self.chunksize

    # create iterator
    if self.in_memory and not self._mapping_to_mem_active:
        # Output is already held in self._Y, so iterate over that directly.
        from pyemma.coordinates.data.data_in_memory import DataInMemory
        assert self._Y is not None
        it = DataInMemory(self._Y)._create_iterator(skip=skip,
                                                    chunk=chunk,
                                                    stride=stride,
                                                    return_trajindex=True)
    else:
        it = self._create_iterator(skip=skip,
                                   chunk=chunk,
                                   stride=stride,
                                   return_trajindex=True)

    with it:
        # allocate memory
        try:
            from pyemma import config
            if config.coordinates_check_output:
                # Pre-fill with NaN so the check at the end can detect
                # sections that never got written.
                trajs = [
                    np.full((l, ndim), np.nan, dtype=self.output_type())
                    for l in it.trajectory_lengths()
                ]
            else:
                # TODO: avoid having a copy here, if Y is already filled
                trajs = [
                    np.empty((l, ndim), dtype=self.output_type())
                    for l in it.trajectory_lengths()
                ]
        except MemoryError:
            self.logger.exception(
                "Could not allocate enough memory to map all data."
                " Consider using a larger stride.")
            # NOTE: callers receive None in this case.
            return

        if self._logger_is_active(self._loglevel_DEBUG):
            self.logger.debug("get_output(): dimensions=%s" % str(dimensions))
            self.logger.debug(
                "get_output(): created output trajs with shapes: %s" %
                [x.shape for x in trajs])
            self.logger.debug("nchunks :%s, chunksize=%s" %
                              (it.n_chunks, it.chunksize))

        # fetch data
        from pyemma._base.progress import ProgressReporter
        pg = ProgressReporter()
        pg.register(it.n_chunks,
                    description='getting output of %s' %
                    self.__class__.__name__)
        with pg.context(), it:
            for itraj, chunk in it:
                # Rows of the current trajectory covered by this chunk.
                i = slice(it.pos, it.pos + len(chunk))
                assert i.stop - i.start > 0
                trajs[itraj][i, :] = chunk[:, dimensions]
                pg.update(1)

    if config.coordinates_check_output:
        for i, t in enumerate(trajs):
            finite = self._chunk_finite(t)
            if not np.all(finite):
                # determine position
                # NOTE(review): np.where returns a tuple of index arrays, so
                # len(frames) looks like it is never 0 here - confirm whether
                # the 'nothing got assigned' branch is reachable.
                frames = np.where(np.logical_not(finite))
                if not len(frames):
                    raise RuntimeError(
                        'nothing got assigned for traj {}'.format(i))
                raise RuntimeError(
                    'unassigned sections in traj {i} in range [{frames}]'.
                    format(frames=frames, i=i))

    return trajs
def test_1d(self):
    """Run uniform-time clustering on a reader over 1-d data; no assertions."""
    samples = np.random.random(1000)
    source = DataInMemory(samples)
    n_centers = 2
    api.cluster_uniform_time(source, k=n_centers)
def testWrongArguments(self):
    """Constructing a reader from a plain string must raise ValueError."""
    with self.assertRaises(ValueError):
        DataInMemory("foo")
def test_cols(self):
    """Iterating with ``cols`` yields only the requested columns, in order."""
    selected = (2, 0)
    source = DataInMemory(self.d)
    chunks = source.iterator(chunk=0, return_trajindex=False, cols=selected)
    for block in chunks:
        np.testing.assert_equal(block, self.d[:, selected])
def testNotEqualDims(self):
    """ should raise, since different dims can not be processed"""
    mismatched = [np.zeros((10, width)) for width in (3, 5)]
    with self.assertRaises(ValueError):
        DataInMemory(mismatched)