def test_njobs_speedup(self):
    """Check that the parallel effective count matrix scales and matches the serial one.

    Ten random discrete trajectories are counted once without ``n_jobs`` and
    then with ``n_jobs`` in (2, 3, 4). Each parallel run must finish within
    ``serial_time / n_jobs + 0.5`` seconds and must reproduce the serial
    result up to an absolute tolerance of 1e-14.
    """
    import time

    class timing(object):
        """Context manager storing the duration of its ``with`` body in ``diff``."""

        def __enter__(self):
            # perf_counter() is monotonic and high resolution, whereas
            # time.time() follows the wall clock and may jump while the
            # measurement is running.
            self.start = time.perf_counter()
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            self.stop = time.perf_counter()
            self.diff = self.stop - self.start

    artificial_dtraj = [
        np.random.randint(0, 100, size=10000) for _ in range(10)
    ]
    lag = 100

    with timing() as serial:
        ceff = effective_count_matrix(artificial_dtraj, lag=lag)

    for n_jobs in (2, 3, 4):
        with timing() as parallel:
            ceff_parallel = effective_count_matrix(artificial_dtraj, lag=lag, n_jobs=n_jobs)
        # The 0.5 s slack is a fixed allowance added to the ideal 1/n_jobs scaling.
        self.assertLess(parallel.diff, serial.diff / n_jobs + 0.5,
                        msg='does not scale for njobs=%s' % n_jobs)
        np.testing.assert_allclose(
            ceff_parallel.toarray(), ceff.toarray(), atol=1e-14,
            err_msg='different result for njobs=%s' % n_jobs)
def test_singletraj(self):
    """Compare effective and plain count matrices of ``self.dtraj_long``.

    For lag times 1 and 100 the effective count matrix must have the same
    shape and the same nonzero pattern as the plain count matrix, and no
    entry may exceed the corresponding plain count.
    """
    for lag in (1, 100):
        plain = count_matrix(self.dtraj_long, lag)
        effective = effective_count_matrix(self.dtraj_long, lag)
        assert np.array_equal(effective.shape, plain.shape)
        assert np.array_equal(plain.nonzero(), effective.nonzero())
        assert np.all(effective.toarray() <= plain.toarray())
def test_multitraj(self):
    """Compare effective and plain count matrices for several short trajectories.

    For lag times 1 and 2 the effective count matrix must have the same
    shape and the same nonzero pattern as the plain count matrix, and no
    entry may exceed the corresponding plain count.
    """
    dtrajs = [[1, 0, 1, 0, 1, 1, 0, 0, 0, 1], [2], [0, 1, 0, 1]]
    for lag in (1, 2):
        plain = count_matrix(dtrajs, lag)
        effective = effective_count_matrix(dtrajs, lag)
        assert np.array_equal(effective.shape, plain.shape)
        assert np.array_equal(plain.nonzero(), effective.nonzero())
        assert np.all(effective.toarray() <= plain.toarray())
def count(count_mode: str, dtrajs: List[np.ndarray], lagtime: int, sparse: bool = False):
    r""" Builds a count matrix from discrete trajectories for a given counting mode and lagtime.

    Parameters
    ----------
    count_mode : str
        The counting mode to use. One of "sample", "sliding", "sliding-effective", and "effective".
        See :meth:`__init__` for a more detailed description.
    dtrajs : array_like or list of array_like
        Discrete trajectories, i.e., a list of arrays which contain non-negative integer values. A single ndarray
        can also be passed, which is then treated as if it was a list with that one ndarray in it.
    lagtime : int
        Distance between two frames in the discretized trajectories under which their potential change of state
        is considered a transition.
    sparse : bool, default=False
        Whether to use sparse matrices or dense matrices. Sparse matrices can make sense when dealing with a lot
        of states.

    Returns
    -------
    count_matrix : (N, N) ndarray or sparse array
        The computed count matrix. Can be ndarray or sparse depending on whether sparse was set to true or
        false. N is the number of encountered states, i.e., :code:`np.max(dtrajs)+1`.

    Raises
    ------
    ValueError
        If `count_mode` is not one of the supported modes.

    Example
    -------
    >>> dtrajs = [np.array([0,0,1,1]), np.array([0,0,1])]
    >>> count_matrix = TransitionCountEstimator.count(
    ...     count_mode="sliding", dtrajs=dtrajs, lagtime=1, sparse=False
    ... )
    >>> np.testing.assert_equal(count_matrix, np.array([[2, 2], [0, 1]]))
    """
    if count_mode in ('sliding', 'sliding-effective'):
        count_matrix = msmest.count_matrix(dtrajs, lagtime, sliding=True, sparse_return=sparse)
        if count_mode == 'sliding-effective':
            # sliding-effective: the sliding-window counts are divided by the lagtime
            count_matrix /= lagtime
        return count_matrix

    if count_mode == 'sample':
        return msmest.count_matrix(dtrajs, lagtime, sliding=False, sparse_return=sparse)

    if count_mode == 'effective':
        count_matrix = msmest.effective_count_matrix(dtrajs, lagtime)
        # a sparse result is densified when a dense return value was requested
        needs_dense = not sparse and issparse(count_matrix)
        if needs_dense:
            count_matrix = count_matrix.toarray()
        return count_matrix

    raise ValueError('Count mode {} is unknown.'.format(count_mode))
def test_compare_with_old_impl(self):
    """Compare the effective count matrix against stored reference values.

    The matrix is computed at lag 10 with ``average='row'`` and ``mact=1.0``,
    once with default ``n_jobs`` and once with ``n_jobs=2``; both results
    must match the reference within ``rtol=1e-8`` and ``atol=1e-15``.
    """
    # generated with v1.1@ from
    # pyemma.datasets.load_2well_discrete().dtraj_T100K_dt10_n6good
    Ceff_ref = np.array([
        [2.21353316e+04, 2.13659736e+03, 4.63558176e+02,
         1.56043628e+02, 3.88680098e+01, 1.14317676e+01],
        [1.84456322e+03, 3.74107190e+02, 1.79811199e+02,
         9.29024530e+01, 5.59412620e+01, 2.59727288e+01],
        [3.45678646e+02, 1.42148228e+02, 8.19775293e+01,
         7.75353971e+01, 5.73438875e+01, 8.19775293e+01],
        [9.08206988e+01, 6.53466003e+01, 7.82682445e+01,
         7.71606750e+01, 8.38060919e+01, 2.84276171e+02],
        [3.56219388e+01, 3.43186971e+01, 7.64568442e+01,
         1.13816439e+02, 2.51960055e+02, 1.33451946e+03],
        [1.57044024e+01, 3.26168358e+01, 1.12346879e+02,
         4.34287128e+02, 1.88573632e+03, 2.35837843e+04],
    ])
    ref_dtraj = deeptime.data.double_well_discrete().dtraj_n6good

    shared_kwargs = dict(lag=10, average='row', mact=1.0)
    default_jobs = effective_count_matrix(ref_dtraj, **shared_kwargs).toarray()
    two_jobs = effective_count_matrix(ref_dtraj, n_jobs=2, **shared_kwargs).toarray()

    for computed in (default_jobs, two_jobs):
        np.testing.assert_allclose(computed, Ceff_ref, atol=1e-15, rtol=1e-8)
def test_multitraj_njobs(self):
    """Check effective count matrices of several short trajectories across ``n_jobs``.

    At lag 1 the results for ``n_jobs=1`` and ``n_jobs=2`` must agree with
    each other and each must be consistent with the plain count matrix
    (same shape, same nonzero pattern, no entry larger than the plain
    count). At lag 2 the consistency check is repeated.
    """
    dtrajs = [[1, 0, 1, 0, 1, 1, 0, 0, 0, 1], [2], [0, 1, 0, 1]]

    def consistent_with_counts(effective, plain):
        # shape, sparsity pattern and the elementwise upper bound must all hold
        assert np.array_equal(effective.shape, plain.shape)
        assert np.array_equal(plain.nonzero(), effective.nonzero())
        assert np.all(effective.toarray() <= plain.toarray())

    # lag 1
    plain = count_matrix(dtrajs, 1)
    one_job = effective_count_matrix(dtrajs, 1, n_jobs=1)
    consistent_with_counts(one_job, plain)

    two_jobs = effective_count_matrix(dtrajs, 1, n_jobs=2)
    np.testing.assert_equal(two_jobs.toarray(), one_job.toarray())
    np.testing.assert_allclose(two_jobs.toarray(), one_job.toarray())
    consistent_with_counts(two_jobs, plain)

    # lag 2
    # NOTE(review): this section calls effective_count_matrix without n_jobs,
    # so only the default setting is exercised here -- confirm this is intended.
    plain = count_matrix(dtrajs, 2)
    two_jobs = effective_count_matrix(dtrajs, 2)
    consistent_with_counts(two_jobs, plain)
def effective_count_matrix(self):
    """Statistically uncorrelated transition counts, restricted to the active set.

    The effective count matrix is computed from the full discrete
    trajectories at this estimator's lag time and then reduced to the rows
    and columns of the active set. It can be used for Bayesian estimation
    or error perturbation.

    References
    ----------
    [1] Noe, F. (2015) Statistical inefficiency of Markov model count matrices
        http://publications.mi.fu-berlin.de/1699/1/autocorrelation_counts.pdf

    """
    self._check_is_estimated()
    # the name resolves to the module-level function, not to this method
    full_effective_counts = effective_count_matrix(self._dtrajs_full, self.lag)
    from pyemma.util.linalg import submatrix
    return submatrix(full_effective_counts, self.active_set)
def fit(self, dtrajs, **kw):
    r""" Fits an MSM to discrete trajectories using Koopman reweighting.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Discrete trajectories.
    **kw
        Accepted for API compatibility with sklearn; not used by the algorithm.

    Returns
    -------
    self
        This estimator. The fitted :class:`KoopmanReweightedMSM` is stored in ``self._model``.
    """
    dtrajs = ensure_dtraj_list(dtrajs)
    # trajectories with the last `lagtime` frames cut off
    truncated = [traj[:-self.lagtime] for traj in dtrajs]
    # state statistics are collected over the full trajectories
    state_hist = count_states(dtrajs, ignore_negative=True)

    # because of double counting, transitions are counted on the truncated trajectories only
    counts = TransitionCountEstimator.count(count_mode=self.count_mode,
                                            dtrajs=truncated,
                                            lagtime=self.lagtime,
                                            sparse=self.sparse)
    count_model = TransitionCountModel(
        counts, counting_mode=self.count_mode, lagtime=self.lagtime, state_histogram=state_hist
    ).submodel_largest(connectivity_threshold=self.connectivity_threshold, directed=True)

    # bootstrap statistics (mean and deviation) used for the rank decision
    if self.rank_mode == 'bootstrap_counts':
        effective_full = effective_count_matrix(truncated, self.lagtime)
        effective_active = submatrix(effective_full, count_model.state_symbols)
        smean, sdev = _impl.bootstrapping_count_matrix(effective_active, nbs=self.nbs)
    else:
        smean, sdev = _impl.bootstrapping_dtrajs(truncated, self.lagtime,
                                                 count_model.n_states_full,
                                                 nbs=self.nbs,
                                                 active_set=count_model.state_symbols)

    # two-step count matrices are estimated from the untruncated trajectories
    twostep_counts = _impl.twostep_count_matrix(dtrajs, self.lagtime, count_model.n_states_full)

    rank_ind = _impl.rank_decision(smean, sdev, tol=self.tol_rank)

    # the OOM components are estimated from a dense full count matrix
    full_counts = count_model.count_matrix_full
    dense_full_counts = full_counts.toarray() if issparse(full_counts) else full_counts
    components, omega, sigma, oom_eigvals = _impl.oom_components(
        dense_full_counts, twostep_counts, rank_ind=rank_ind, lcc=count_model.state_symbols)

    transition_matrix, lcc_new = _impl.equilibrium_transition_matrix(
        components, omega, sigma, reversible=self.reversible)

    # shrink the count model if the re-estimation reduced the active set
    if lcc_new.size < count_model.n_states:
        assert isinstance(count_model, TransitionCountModel)
        remaining_states = count_model.symbols_to_states(lcc_new)
        count_model = count_model.submodel(remaining_states)
        warnings.warn("Caution: Re-estimation of count matrix resulted in reduction of the active set.")

    self._model = KoopmanReweightedMSM(transition_matrix=transition_matrix,
                                       oom_eigenvalues=oom_eigvals,
                                       oom_evaluator=sigma,
                                       oom_information_state_vector=omega,
                                       count_model=count_model,
                                       oom_components=components,
                                       twostep_count_matrices=twostep_counts)
    return self
def _estimate(self, dtrajs):
    """ Estimate the OOM-based MSM from discrete trajectories and store the result on this estimator.

    Raises NotImplementedError if a core set is configured or if the
    connectivity mode is not 'largest', and RuntimeError if the active set
    is empty. Returns ``self``.
    """
    if self.core_set is not None:
        raise NotImplementedError(
            'Core set MSMs currently not compatible with {}.'.format(self.__class__.__name__))

    # trajectories with the last `lag` frames cut off
    dtrajs_lag = [traj[:-self.lag] for traj in dtrajs]

    # trajectory statistics provide the full count matrix and its number of states
    dtrajstats = self._get_dtraj_stats(dtrajs_lag)
    self._C_full = dtrajstats.count_matrix()
    self._nstates_full = self._C_full.shape[0]

    # the active set doubles as the mapping from active to full state indices
    if self.connectivity != 'largest':
        raise NotImplementedError(
            'OOM based MSM estimation is only implemented for connectivity=\'largest\'.')
    self.active_set = dtrajstats.largest_connected_set

    # FIXME: setting is_estimated before so that we can start using the parameters just set, but this is not clean!
    self._is_estimated = True

    # nothing can be estimated from an empty active set
    if _np.size(self.active_set) == 0:
        raise RuntimeError('Active set is empty. Cannot estimate MSM.')

    def refresh_active_quantities():
        # active count matrix, its number of states, and the full -> active
        # back-mapping (-1 marks states outside the active set)
        self._C_active = dtrajstats.count_matrix(subset=self.active_set)
        self._nstates = self._C_active.shape[0]
        self._full2active = -1 * _np.ones(dtrajstats.nstates, dtype=int)
        self._full2active[self.active_set] = _np.arange(len(self.active_set))

    refresh_active_quantities()

    # Estimate transition matrix
    if self.connectivity != 'largest':
        raise NotImplementedError(
            'OOM based MSM estimation is only implemented for connectivity=\'largest\'.')

    # bootstrap statistics (mean and deviation) used for the rank decision
    if self.rank_Ct == 'bootstrap_counts':
        Ceff_full = effective_count_matrix(dtrajs_lag, self.lag)
        from pyemma.util.linalg import submatrix
        Ceff = submatrix(Ceff_full, self.active_set)
        smean, sdev = bootstrapping_count_matrix(Ceff, nbs=self.nbs)
    else:
        smean, sdev = bootstrapping_dtrajs(dtrajs_lag, self.lag, self._nstates_full,
                                           nbs=self.nbs, active_set=self._active_set)
    # two-step count matrices are estimated from the untruncated trajectories
    twostep_counts = twostep_count_matrix(dtrajs, self.lag, self._nstates_full)
    rank_ind = rank_decision(smean, sdev, tol=self.tol_rank)
    # OOM components are computed from the dense full count matrix
    Xi, omega, sigma, oom_eigenvalues = oom_components(
        self._C_full.toarray(), twostep_counts, rank_ind=rank_ind, lcc=self.active_set)
    P, lcc_new = equilibrium_transition_matrix(Xi, omega, sigma, reversible=self.reversible)

    # shrink the active set and its derived quantities if the re-estimation reduced it
    if lcc_new.size < self._nstates:
        self._active_set = self._active_set[lcc_new]
        refresh_active_quantities()
        warnings.warn("Caution: Re-estimation of count matrix resulted in reduction of the active set.")

    # continue sparse or dense?
    if not self.sparse:
        # With dense count matrices the transition matrix and all subsequent
        # properties are computed using dense arrays and dense matrix algebra.
        self._C_full = self._C_full.toarray()
        self._C_active = self._C_active.toarray()

    # Done. We set our own model parameters, so this estimator is
    # equal to the estimated model.
    self._dtrajs_full = dtrajs
    self._connected_sets = connected_sets(self._C_full)
    self._Xi = Xi
    self._omega = omega
    self._sigma = sigma
    self._eigenvalues_OOM = oom_eigenvalues
    self._rank_ind = rank_ind
    self._oom_rank = self._sigma.size
    self._C2t = twostep_counts
    self.set_model_params(P=P, pi=None, reversible=self.reversible,
                          dt_model=self.timestep_traj.get_scaled(self.lag))

    return self