def test_connected_sets(self):
    """Compare computed connected sets with the stored reference sets.

    Both the directed and the undirected variant of ``connected_sets`` are
    run on ``self.C``; every returned set is sorted and compared
    element-wise with the reference at the same position.
    """
    # Directed
    directed_sets = connected_sets(self.C)
    for position, found in enumerate(directed_sets):
        expected = self.cc_directed[position]
        self.assertTrue(np.all(expected == np.sort(found)))

    # Undirected
    undirected_sets = connected_sets(self.C, directed=False)
    for position, found in enumerate(undirected_sets):
        expected = self.cc_undirected[position]
        self.assertTrue(np.all(expected == np.sort(found)))
def _compute_connected_sets(C, mincount_connectivity, strong=True):
    """ Computes the connected sets of C.

    Entries of C below ``mincount_connectivity`` are zeroed in a copy of C
    before the connected sets are computed; C itself is not modified.

    C : count matrix (dense array or scipy sparse matrix)
    mincount_connectivity : float
        Minimum count which counts as a connection.
    strong : boolean
        True: Seek strongly connected sets. False: Seek weakly connected sets.

    Returns
    -------
    S
        The result of ``msmtools.estimation.connected_sets`` on the
        thresholded copy.

    """
    import msmtools.estimation as msmest
    import scipy.sparse as scs

    if not scs.issparse(C):
        thresholded = C.copy()
        thresholded[thresholded < mincount_connectivity] = 0
    else:
        thresholded = C.tocsr(copy=True)
        below = thresholded.data < mincount_connectivity
        thresholded.data[below] = 0
        # drop the explicit zeros so they no longer count as edges
        thresholded.eliminate_zeros()

    # treat each connected set separately
    return msmest.connected_sets(thresholded, directed=strong)
def compute_connected_sets(C, connectivity_threshold, directed=True):
    """ Computes the connected sets of a count matrix C.

    C : (N, N) np.ndarray or scipy sparse matrix
        count matrix
    connectivity_threshold : float
        Minimum count required to be included in the connected set
        computation. If it is not positive, C is used as is.
    directed : boolean
        True: Seek connected sets in the directed graph.
        False: Seek connected sets in the undirected graph.

    Returns
    -------
    A list of arrays, each array representing a connected set by enumerating
    the respective states. The list is in descending order by size of
    connected set.

    """
    import msmtools.estimation as msmest
    import scipy.sparse as scs

    if not connectivity_threshold > 0:
        # nothing to filter: work on the input directly (no copy)
        return msmest.connected_sets(C, directed=directed)

    if scs.issparse(C):
        filtered = C.tocsr(copy=True)
        weak = filtered.data < connectivity_threshold
        filtered.data[weak] = 0
        filtered.eliminate_zeros()
    else:
        filtered = C.copy()
        filtered[filtered < connectivity_threshold] = 0

    # treat each connected set separately
    return msmest.connected_sets(filtered, directed=directed)
def count_lagged(self, lag, count_mode='sliding'):
    r""" Counts transitions at given lag time

    Stores the count matrix, its connected sets, the per-set sizes and
    sub-count-matrices, and the mapping from full state indices to indices
    in the first connected set on this object.

    Parameters
    ----------
    lag : int
        lagtime in trajectory steps

    count_mode : str, optional, default='sliding'
        mode to obtain count matrices from discrete trajectories. Should be one of:

        * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
          at time indexes

          .. math::
                (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

        * 'effective' : Uses an estimate of the transition counts that are
          statistically uncorrelated. Recommended when used with a
          Bayesian MSM.

        * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
          at time indexes

          .. math::
                (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

    Raises
    ------
    ValueError
        If ``count_mode`` is not one of the modes listed above.

    """
    # store lag time
    self._lag = lag

    # Compute count matrix
    count_mode = count_mode.lower()
    if count_mode == 'sliding':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=True)
    elif count_mode == 'sample':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=False)
    elif count_mode == 'effective':
        self._C = msmest.effective_count_matrix(self._dtrajs, lag)
    else:
        raise ValueError('Count mode ' + count_mode + ' is unknown.')

    # Compute reversibly connected sets
    self._connected_sets = msmest.connected_sets(self._C)
    n_sets = len(self._connected_sets)

    # set sizes and count matrices on reversibly connected sets
    self._connected_set_sizes = np.zeros(n_sets)
    # the builtin `object` is used because the `np.object` alias no longer
    # exists in current NumPy releases
    self._C_sub = np.empty(n_sets, dtype=object)
    for i, cset in enumerate(self._connected_sets):
        # set size
        self._connected_set_sizes[i] = len(cset)
        # submatrix
        self._C_sub[i] = submatrix(self._C, cset)

    # largest connected set (first entry of the connected-set list)
    lcs = self._connected_sets[0]

    # mapping from full to lcs; states outside the lcs stay at -1
    self._full2lcs = -1 * np.ones(self._nstates, dtype=int)
    self._full2lcs[lcs] = np.arange(len(lcs), dtype=int)

    # remember that this function was called
    self._counted_at_lag = True
def rdl_decomposition(P, reversible=True):
    """Block-wise R/D/L decomposition over the connected sets of P.

    For every connected set with more than one state the decomposition of
    the corresponding sub-matrix is written into the matching block of the
    full-size outputs. Single-state sets get a 1 in R, D and L.

    Parameters
    ----------
    P : (n, n) matrix
    reversible : bool
        True: outputs are float64 and the sub-decomposition uses
        norm='reversible'. False: outputs are complex and norm='standard'.

    Returns
    -------
    R, D, L : (n, n) ndarrays

    """
    # TODO: this treatment is probably not meaningful for weakly connected matrices.
    import msmtools.estimation as msmest
    import msmtools.analysis as msmana

    dtype, norm = (np.float64, 'reversible') if reversible else (complex, 'standard')

    # output matrices
    size = np.shape(P)[0]
    R = np.zeros((size, size), dtype=dtype)
    D = np.zeros((size, size), dtype=dtype)
    L = np.zeros((size, size), dtype=dtype)

    # treat each strongly connected set separately
    for members in msmest.connected_sets(P):
        block = np.ix_(members, members)
        if len(members) <= 1:
            # just one element. Write 1's
            R[block] = 1
            D[block] = 1
            L[block] = 1
            continue
        sub_r, sub_d, sub_l = msmana.rdl_decomposition(
            P[members, :][:, members], norm=norm)
        # write to full
        R[block] = sub_r
        D[block] = sub_d
        L[block] = sub_l

    # done
    return R, D, L
def transform_transition_matrix_connected(transition_matrix):
    """Restrict a transition matrix to its first connected set.

    Parameters
    ----------
    transition_matrix : (n, n) ndarray

    Returns
    -------
    connected_matrix : ndarray
        ``transition_matrix`` restricted (rows and columns) to the first set
        returned by ``connected_sets``.
    removed_nodes : list of int
        Indices of all states that are not in that set, in descending order.

    """
    connected_nodes = connected_sets(transition_matrix)[0]
    connected_matrix = np.take(transition_matrix, connected_nodes, axis=0)
    connected_matrix = np.take(connected_matrix, connected_nodes, axis=1)

    # A set gives O(1) membership tests; testing `in` against the index
    # array rescans the whole array for every state.
    kept = set(np.asarray(connected_nodes).tolist())
    n_states = transition_matrix.shape[0]
    # Walk the indices downwards so the result is already in descending order.
    removed_nodes = [
        node for node in range(n_states - 1, -1, -1) if node not in kept
    ]
    return connected_matrix, removed_nodes
def connected_sets(C, mincount_connectivity=0, strong=True):
    """ Computes the connected sets of C.

    Entries of C below ``mincount_connectivity`` are zeroed in a copy of C
    before the connected sets are computed; C itself is not modified.
    Dense arrays and scipy sparse matrices are both supported.

    C : count matrix
    mincount_connectivity : float
        Minimum count which counts as a connection.
    strong : boolean
        True: Seek strongly connected sets. False: Seek weakly connected sets.

    Returns
    -------
    S
        The result of ``msmtools.estimation.connected_sets`` on the
        thresholded copy.

    """
    import msmtools.estimation as msmest
    import scipy.sparse as scs

    if scs.issparse(C):
        # Boolean-mask assignment is not usable on a sparse matrix, so the
        # stored values are thresholded directly and then pruned.
        Cconn = C.tocsr(copy=True)
        Cconn.data[Cconn.data < mincount_connectivity] = 0
        Cconn.eliminate_zeros()
    else:
        Cconn = C.copy()
        Cconn[Cconn < mincount_connectivity] = 0

    # treat each connected set separately
    S = msmest.connected_sets(Cconn, directed=strong)
    return S
def transform_transition_matrix_connected(transition_matrix):
    """Disabled helper: always raises ``NotImplementedError``.

    The statements after the ``raise`` are the parked implementation and are
    unreachable until the ``raise`` is removed.

    Raises
    ------
    NotImplementedError
        Always.

    """
    #TODO: I would prefer to use msmtools because the functions there are unit-tested...
    raise NotImplementedError('consider using msmtools or remove this line.')

    # --- parked implementation (unreachable) ---
    largest = connected_sets(transition_matrix)[0]
    rows_kept = np.take(transition_matrix, largest, axis=0)
    connected_matrix = np.take(rows_kept, largest, axis=1)
    removed_nodes = [
        state for state in range(0, transition_matrix.shape[0])
        if state not in largest
    ]
    removed_nodes.sort()
    removed_nodes.reverse()
    return connected_matrix, removed_nodes
def stationary_distribution(C, P):
    """Stationary distribution assembled from the connected sets of C.

    For each connected set of C the stationary distribution of the
    corresponding sub-matrix of P is computed and scaled by that set's share
    of the total counts. The result is renormalized to sum to one.

    Parameters
    ----------
    C : (n, n) count matrix
    P : (n, n) transition matrix

    Returns
    -------
    pi : (n,) ndarray

    """
    import msmtools.estimation as msmest
    import msmtools.analysis as msmana

    total_counts = np.sum(C)
    pi = np.zeros(np.shape(C)[0])

    # treat each connected set separately
    for members in msmest.connected_sets(C):
        # weight of this set: fraction of all counts leaving its states
        weight = np.sum(C[members, :]) / total_counts
        local_pi = msmana.statdist(P[members, :][:, members])
        pi[members] = weight * local_pi

    # reinforce normalization
    pi /= np.sum(pi)
    return pi
def estimate_P(C, reversible=True, fixed_statdist=None):
    """Estimate a transition matrix block-wise on the connected sets of C.

    The output starts as the identity. For every connected set with more
    than one state a transition matrix is estimated from the corresponding
    sub-count-matrix and written into the matching block. Single-state sets
    keep their diagonal 1.

    Parameters
    ----------
    C : (n, n) count matrix
    reversible : bool
        Passed through to ``msmtools.estimation.transition_matrix``.
    fixed_statdist : array or None
        Passed through as ``mu`` to ``msmtools.estimation.transition_matrix``.

    Returns
    -------
    P : (n, n) ndarray of float64

    """
    import msmtools.estimation as msmest
    import scipy.sparse as scs

    # output matrix. Initially eye
    n = np.shape(C)[0]
    P = np.eye(n, dtype=np.float64)

    # treat each connected set separately
    for s in msmest.connected_sets(C):
        if len(s) <= 1:
            # only one state: nothing to estimate, keep diagonal 1
            continue
        # compute transition sub-matrix on s
        Cs = C[s, :][:, s]
        # NOTE(review): fixed_statdist is forwarded unrestricted; if it has
        # length n and C has several connected sets, it presumably does not
        # match the sub-matrix size - confirm against callers.
        Ps = msmest.transition_matrix(Cs, reversible=reversible, mu=fixed_statdist)
        if scs.issparse(Ps):
            Ps = Ps.toarray()
        # Write the block in one step. Chained indexing (P[s, :][:, s] = ...)
        # would assign into a temporary copy and leave P untouched.
        P[np.ix_(s, s)] = Ps

    # done
    return P
def rdl_decomposition(P, reversible=True):
    """Block-wise R/D/L decomposition over the connected sets of P.

    Each connected set with more than one state is decomposed separately and
    written into the matching block of the full-size outputs. In reversible
    mode only the real parts are stored. Single-state sets get a 1 in R, D
    and L.

    Parameters
    ----------
    P : (n, n) matrix
    reversible : bool
        True: float64 outputs, norm='reversible'. False: complex outputs,
        norm='standard'.

    Returns
    -------
    R, D, L : (n, n) ndarrays

    """
    # TODO: this treatment is probably not meaningful for weakly connected matrices.
    import msmtools.estimation as msmest
    import msmtools.analysis as msmana

    # output matrices
    size = np.shape(P)[0]
    dtype = np.float64 if reversible else complex
    norm = 'reversible' if reversible else 'standard'
    R = np.zeros((size, size), dtype=dtype)
    D = np.zeros((size, size), dtype=dtype)
    L = np.zeros((size, size), dtype=dtype)

    # treat each strongly connected set separately
    for members in msmest.connected_sets(P):
        block = np.ix_(members, members)
        if len(members) <= 1:
            # just one element. Write 1's
            R[block] = 1
            D[block] = 1
            L[block] = 1
            continue

        right, diag, left = msmana.rdl_decomposition(P[members, :][:, members], norm=norm)
        if reversible:
            # everything must be real-valued - this should rather be handled by msmtools
            right, diag, left = right.real, diag.real, left.real
        # write to full
        R[block] = right
        D[block] = diag
        L[block] = left

    # done
    return R, D, L
def _estimate(self, dtrajs):
    """ Estimate MSM from discrete trajectories using OOM components.

    Parameters
    ----------
    dtrajs : sequence of discrete trajectories
        Each element is sliced with ``traj[:-self.lag]``; presumably integer
        ndarrays - confirm against the caller.

    Returns
    -------
    self
        The estimator itself, after the model parameters have been set.

    Raises
    ------
    NotImplementedError
        If ``self.core_set`` is not None, or if ``self.connectivity`` is not
        'largest'.
    RuntimeError
        If the active set is empty.

    """
    if self.core_set is not None:
        raise NotImplementedError(
            'Core set MSMs currently not compatible with {}.'.format(
                self.__class__.__name__))
    # remove last lag steps from dtrajs:
    dtrajs_lag = [traj[:-self.lag] for traj in dtrajs]

    # get trajectory counts. This sets _C_full and _nstates_full
    dtrajstats = self._get_dtraj_stats(dtrajs_lag)
    self._C_full = dtrajstats.count_matrix()  # full count matrix
    self._nstates_full = self._C_full.shape[0]  # number of states

    # set active set. This is at the same time a mapping from active to full
    if self.connectivity == 'largest':
        self.active_set = dtrajstats.largest_connected_set
    else:
        raise NotImplementedError(
            'OOM based MSM estimation is only implemented for connectivity=\'largest\'.'
        )

    # FIXME: setting is_estimated before so that we can start using the parameters just set, but this is not clean!
    # is estimated
    self._is_estimated = True

    # if active set is empty, we can't do anything.
    if _np.size(self.active_set) == 0:
        raise RuntimeError('Active set is empty. Cannot estimate MSM.')

    # active count matrix and number of states
    self._C_active = dtrajstats.count_matrix(subset=self.active_set)
    self._nstates = self._C_active.shape[0]

    # computed derived quantities
    # back-mapping from full to lcs (states outside the active set stay at -1)
    self._full2active = -1 * _np.ones(dtrajstats.nstates, dtype=int)
    self._full2active[self.active_set] = _np.arange(len(self.active_set))

    # Estimate transition matrix
    if self.connectivity == 'largest':
        # Re-sampling:
        if self.rank_Ct == 'bootstrap_counts':
            Ceff_full = msmest.effective_count_matrix(dtrajs_lag, self.lag)
            # NOTE(review): 'pyerna' looks like a typo for 'pyemma' - confirm
            # that this package exists, otherwise this branch raises ImportError.
            from pyerna.util.linalg import submatrix
            Ceff = submatrix(Ceff_full, self.active_set)
            smean, sdev = bootstrapping_count_matrix(Ceff, nbs=self.nbs)
        else:
            # NOTE(review): reads self._active_set while the rest of the method
            # uses self.active_set - presumably a property pair; verify.
            smean, sdev = bootstrapping_dtrajs(dtrajs_lag,
                                               self.lag,
                                               self._nstates_full,
                                               nbs=self.nbs,
                                               active_set=self._active_set)
        # Estimate two step count matrices (uses the untruncated dtrajs):
        C2t = twostep_count_matrix(dtrajs, self.lag, self._nstates_full)
        # Rank decision:
        rank_ind = rank_decision(smean, sdev, tol=self.tol_rank)
        # Estimate OOM components:
        Xi, omega, sigma, l = oom_components(self._C_full.toarray(),
                                             C2t,
                                             rank_ind=rank_ind,
                                             lcc=self.active_set)
        # Compute transition matrix:
        P, lcc_new = equilibrium_transition_matrix(
            Xi, omega, sigma, reversible=self.reversible)
    else:
        raise NotImplementedError(
            'OOM based MSM estimation is only implemented for connectivity=\'largest\'.'
        )

    # Update active set and derived quantities:
    # (only when the returned lcc_new has fewer entries than the current
    # number of active states)
    if lcc_new.size < self._nstates:
        self._active_set = self._active_set[lcc_new]
        self._C_active = dtrajstats.count_matrix(subset=self.active_set)
        self._nstates = self._C_active.shape[0]
        self._full2active = -1 * _np.ones(dtrajstats.nstates, dtype=int)
        self._full2active[self.active_set] = _np.arange(
            len(self.active_set))
        warnings.warn(
            "Caution: Re-estimation of count matrix resulted in reduction of the active set."
        )

    # continue sparse or dense?
    if not self.sparse:
        # converting count matrices to arrays. As a result the
        # transition matrix and all subsequent properties will be
        # computed using dense arrays and dense matrix algebra.
        self._C_full = self._C_full.toarray()
        self._C_active = self._C_active.toarray()

    # Done. We set our own model parameters, so this estimator is
    # equal to the estimated model.
    self._dtrajs_full = dtrajs
    self._connected_sets = msmest.connected_sets(self._C_full)
    self._Xi = Xi
    self._omega = omega
    self._sigma = sigma
    self._eigenvalues_OOM = l
    self._rank_ind = rank_ind
    self._oom_rank = self._sigma.size
    self._C2t = C2t
    self.set_model_params(P=P,
                          pi=None,
                          reversible=self.reversible,
                          dt_model=self.timestep_traj.get_scaled(self.lag))

    return self
# so we want to put those into one single state and construct a # matrix with only 2 states, A and B, as follows: # old state -> new state # 0 -> A # 1 -> A # 2 -> B # the membership matrix chi defines this map # each row gives us a binary encoding for a given # state 0, 1, 2 to a state A, B # state goes to A, B chi = np.array([ [1, 0], # for state 0 [1, 0], # for state 1 [0, 1] ]) # for state 2 # this could be a 'fuzzy' state assignment as well. # the resulting 'coarse' transition matrix is this. # compare the probabilities to the above one. print(cg_transition_matrix(Tmat_full, chi)) C = np.array([[10, 1, 0], [2, 0, 3], [0, 0, 4]]) cc_directed = connected_sets(C) pdb.set_trace()
def pcca(P, m):
    """PCCA+ spectral clustering method with optimized memberships [1]_

    Clusters the first m eigenvectors of a transition matrix in order to cluster the states.
    This function does not assume that the transition matrix is fully connected. Disconnected sets
    will automatically define the first metastable states, with perfect membership assignments.

    Parameters
    ----------
    P : ndarray (n,n)
        Transition matrix.
    m : int
        Number of clusters to group to.

    Returns
    -------
    chi : ndarray (n x m)
        A matrix containing the probability or membership of each state to be assigned to each cluster.
        The rows sum to 1.

    Raises
    ------
    ValueError
        If m exceeds the number of states, if P is not a transition matrix,
        or if m is smaller than the number of closed components.
    RuntimeError
        If a closed component ends up with no PCCA set assigned to it.
    AssertionError
        If fewer than m metastable sets were produced.

    References
    ----------
    [1] S. Roeblitz and M. Weber, Fuzzy spectral clustering by PCCA+:
        application to Markov state models and data classification.
        Adv Data Anal Classif 7, 147-179 (2013).
    [2] F. Noe, multiset PCCA and HMMs, in preparation.

    """
    # imports
    from msmtools.estimation import connected_sets
    from msmtools.analysis import eigenvalues, is_transition_matrix, hitting_probability

    # validate input
    n = np.shape(P)[0]
    if (m > n):
        raise ValueError(
            "Number of metastable states m = " + str(m) +
            " exceeds number of states of transition matrix n = " + str(n))
    if not is_transition_matrix(P):
        raise ValueError("Input matrix is not a transition matrix.")

    # prepare output
    chi = np.zeros((n, m))

    # test connectivity
    components = connected_sets(P)

    # store components as closed (with positive equilibrium distribution)
    # or as transition states (with vanishing equilibrium distribution)
    closed_components = []
    transition_states = []
    all_states = set(range(n))
    for component in components:
        rest = list(all_states - set(component))
        # is component closed? (no probability flows out of it)
        if (np.sum(P[component, :][:, rest]) == 0):
            closed_components.append(component)
        else:
            transition_states.append(component)
    n_closed_components = len(closed_components)
    closed_states = np.concatenate(closed_components)
    if len(transition_states) == 0:
        transition_states = np.array([], dtype=int)
    else:
        transition_states = np.concatenate(transition_states)

    # check if we have enough clusters to support the disconnected sets
    if (m < len(closed_components)):
        raise ValueError("Number of metastable states m = " + str(m) +
                         " is too small. Transition matrix has " +
                         str(len(closed_components)) +
                         " disconnected components")

    # We collect eigenvalues in order to decide which components to cluster
    closed_components_Psub = []
    closed_components_ev = []
    closed_components_enum = []
    for i, component in enumerate(closed_components):
        # compute eigenvalues in submatrix
        Psub = P[component, :][:, component]
        closed_components_Psub.append(Psub)
        closed_components_ev.append(eigenvalues(Psub))
        closed_components_enum.append(i * np.ones((component.size), dtype=int))

    # flatten. Components generally differ in size, so the per-component
    # arrays must be concatenated; np.array(...) on such a ragged list does
    # not give a flat numeric array.
    closed_components_ev_flat = np.concatenate(closed_components_ev)
    closed_components_enum_flat = np.concatenate(closed_components_enum)
    # which components should be clustered?
    component_indexes = closed_components_enum_flat[np.argsort(
        closed_components_ev_flat)][0:m]

    # cluster each component
    ipcca = 0
    for i, component in enumerate(closed_components):
        # how many PCCA states in this component?
        m_by_component = np.shape(np.argwhere(component_indexes == i))[0]

        # if 1, then the result is trivial
        if (m_by_component == 1):
            chi[component, ipcca] = 1.0
            ipcca += 1
        elif (m_by_component > 1):
            chi[component, ipcca:ipcca + m_by_component] = _pcca_connected(
                closed_components_Psub[i], m_by_component)
            ipcca += m_by_component
        else:
            raise RuntimeError("Component " + str(i) + " spuriously has " +
                               str(m_by_component) + " pcca sets")

    # finally assign all transition states
    if (transition_states.size > 0):
        # make all closed states absorbing, so we can see which closed state we hit first
        Pabs = P.copy()
        Pabs[closed_states, :] = 0.0
        Pabs[closed_states, closed_states] = 1.0
        for closed_state in closed_states:
            # hitting probability to each closed state
            h = hitting_probability(Pabs, closed_state)
            for transition_state in transition_states:
                # transition states belong to closed states with the hitting probability, and inherit their chi
                chi[transition_state] += h[transition_state] * chi[closed_state]

    # check if we have m metastable sets. If less than m, we must raise
    nmeta = np.count_nonzero(chi.sum(axis=0))
    assert m <= nmeta, str(m) + " metastable states requested, but transition matrix only has " + str(nmeta) \
                       + ". Consider using a prior or request less metastable states. "

    return chi
def _pcca_connected(P, n, return_rot=False):
    """PCCA+ spectral clustering method with optimized memberships [1]_

    Clusters the first n eigenvectors of a transition matrix in order to cluster the states.
    This function assumes that the transition matrix is fully connected.

    Parameters
    ----------
    P : ndarray (n,n)
        Transition matrix.
    n : int
        Number of clusters to group to.
    return_rot : bool, optional, default=False
        If True, also return the optimized rotation matrix.

    Returns
    -------
    chi by default, or (chi, rot_mat) if return_rot = True

    chi : ndarray (n x m)
        A matrix containing the probability or membership of each state to be assigned to each cluster.
        The rows sum to 1.
    rot_mat : ndarray (m x m)
        The rotation matrix applied to the dominant eigenvectors before the
        memberships are clipped to [0, 1] and row-normalized,
        i.e. the unclipped memberships are np.dot(evec, rot_mat)

    Raises
    ------
    ValueError
        If P is disconnected or does not fulfill detailed balance.

    References
    ----------
    [1] S. Roeblitz and M. Weber, Fuzzy spectral clustering by PCCA+:
        application to Markov state models and data classification.
        Adv Data Anal Classif 7, 147-179 (2013).

    """
    from msmtools.estimation import connected_sets
    from msmtools.analysis import stationary_distribution, is_reversible, eigenvectors

    # test connectivity
    n_components = len(connected_sets(P))
    if (n_components > 1):
        raise ValueError(
            "Transition matrix is disconnected. Cannot use pcca_connected.")

    pi = stationary_distribution(P)

    if not is_reversible(P, mu=pi):
        raise ValueError(
            "Transition matrix does not fulfill detailed balance. "
            "Make sure to call pcca with a reversible transition matrix estimate"
        )
    # TODO: Susanna mentioned that she has a potential fix for nonreversible matrices by replacing each complex conjugate
    #      pair by the real and imaginary components of one of the two vectors. We could use this but would then need to
    #      orthonormalize all eigenvectors e.g. using Gram-Schmidt orthonormalization. Currently there is no theoretical
    #      foundation for this, so I'll skip it for now.

    # right eigenvectors, ordered
    evecs = eigenvectors(P, n)

    # orthonormalize (norm weighted by the stationary distribution)
    for i in range(n):
        evecs[:, i] /= math.sqrt(np.dot(evecs[:, i] * pi, evecs[:, i]))
    # make first eigenvector positive
    evecs[:, 0] = np.abs(evecs[:, 0])

    # Is there a significant complex component?
    # (np.all replaces the np.alltrue alias, which NumPy 2.0 removed)
    if not np.all(np.isreal(evecs)):
        warnings.warn(
            "The given transition matrix has complex eigenvectors, so it doesn't exactly fulfill detailed balance "
            +
            "forcing eigenvectors to be real and continuing. Be aware that this is not theoretically solid."
        )
    evecs = np.real(evecs)

    # create initial solution using PCCA+. This could have negative memberships
    (chi, rot_matrix) = _pcca_connected_isa(evecs, n)

    # optimize the rotation matrix with PCCA++.
    rot_matrix = _opt_soft(evecs, rot_matrix, n)

    # These memberships should be nonnegative
    memberships = np.dot(evecs[:, :], rot_matrix)

    # We might still have numerical errors. Force memberships to be in [0,1]
    memberships = np.minimum(1.0, np.maximum(0.0, memberships))

    # renormalize every row so that it sums to 1
    memberships /= memberships.sum(axis=1, keepdims=True)

    if return_rot:
        return memberships, rot_matrix
    return memberships
def _estimate(self, dtrajs):
    """ Estimate the MSM from discrete trajectories.

    Parameters
    ----------
    dtrajs : list containing ndarrays(dtype=int) or ndarray(n, dtype=int) or :class:`pyemma.msm.util.dtraj_states.DiscreteTrajectoryStats`
        discrete trajectories, stored as integer ndarrays (arbitrary size)
        or a single ndarray for only one trajectory.

    Returns
    -------
    self
        The estimator itself, after the model parameters have been set.

    Raises
    ------
    ValueError
        If connectivity is 'none', reversible estimation is requested and
        the active count matrix is not connected.
    NotImplementedError
        If the connectivity mode is neither 'largest' nor 'none'.

    """
    # ensure right format
    dtrajs = ensure_dtraj_list(dtrajs)
    # harvest discrete statistics
    # NOTE(review): if ensure_dtraj_list always returns a list, the
    # isinstance branch below can never be taken - confirm.
    if isinstance(dtrajs, _DiscreteTrajectoryStats):
        dtrajstats = dtrajs
    else:
        # compute and store discrete trajectory statistics
        dtrajstats = _DiscreteTrajectoryStats(dtrajs)
        # check if this MSM seems too large to be dense
        if dtrajstats.nstates > 4000 and not self.sparse:
            # Logger.warn is a deprecated alias of Logger.warning
            self.logger.warning(
                'Building a dense MSM with ' + str(dtrajstats.nstates) +
                ' states. This can be '
                'inefficient or unfeasible in terms of both runtime and memory consumption. '
                'Consider using sparse=True.')

    # count lagged
    dtrajstats.count_lagged(self.lag, count_mode=self.count_mode)

    # full count matrix and number of states
    self._C_full = dtrajstats.count_matrix()
    self._nstates_full = self._C_full.shape[0]

    # set active set. This is at the same time a mapping from active to full
    if self.connectivity == 'largest':
        if self.statdist_constraint is None:
            # statdist not given - full connectivity on all states
            self.active_set = dtrajstats.largest_connected_set
        else:
            # statdist given - simple connectivity on all nonzero probability states
            nz = _np.nonzero(self.statdist_constraint)[0]
            Cnz = dtrajstats.count_matrix(subset=nz)
            self.active_set = nz[msmest.largest_connected_set(
                Cnz, directed=False)]
    else:
        # for 'None' and 'all' all visited states are active
        self.active_set = dtrajstats.visited_set

    # FIXME: setting is_estimated before so that we can start using the parameters just set, but this is not clean!
    # is estimated
    self._is_estimated = True

    # active count matrix and number of states
    self._C_active = dtrajstats.count_matrix(subset=self.active_set)
    self._nstates = self._C_active.shape[0]

    # computed derived quantities
    # back-mapping from full to lcs (states outside the active set stay at -1)
    self._full2active = -1 * _np.ones((dtrajstats.nstates), dtype=int)
    self._full2active[self.active_set] = _np.arange(len(self.active_set), dtype=int)

    # restrict stationary distribution to active set
    if self.statdist_constraint is None:
        statdist_active = None
    else:
        statdist_active = self.statdist_constraint[self.active_set]
        statdist_active /= statdist_active.sum()  # renormalize

    # Estimate transition matrix
    if self.connectivity not in ('largest', 'none'):
        # report the actual mode; the old message printed the literal text
        # 'self.connectivity'
        raise NotImplementedError(
            'MSM estimation with connectivity={!r} is currently not implemented.'
            .format(self.connectivity))
    if self.connectivity == 'none':
        # reversible mode only possible if active set is connected
        # - in this case all visited states are connected and thus
        # this mode is identical to 'largest'
        if self.reversible and not msmest.is_connected(self._C_active):
            raise ValueError(
                'Reversible MSM estimation is not possible with connectivity mode \'none\', '
                +
                'because the set of all visited states is not reversibly connected'
            )
    P = msmest.transition_matrix(self._C_active,
                                 reversible=self.reversible,
                                 mu=statdist_active,
                                 maxiter=self.maxiter,
                                 maxerr=self.maxerr)

    # continue sparse or dense?
    if not self.sparse:
        # converting count matrices to arrays. As a result the
        # transition matrix and all subsequent properties will be
        # computed using dense arrays and dense matrix algebra.
        self._C_full = self._C_full.toarray()
        self._C_active = self._C_active.toarray()
        P = P.toarray()

    # Done. We set our own model parameters, so this estimator is
    # equal to the estimated model.
    self._dtrajs_full = dtrajs
    self._connected_sets = msmest.connected_sets(self._C_full)
    self.set_model_params(P=P,
                          pi=statdist_active,
                          reversible=self.reversible,
                          dt_model=self.timestep_traj.get_scaled(self.lag))

    return self
def count_lagged(self, lag, count_mode='sliding', mincount_connectivity='1/n', show_progress=True):
    r""" Counts transitions at given lag time

    Stores the count matrix, its connected sets and their sizes, the largest
    connected set, and the mapping from full state indices to indices in the
    largest connected set on this object.

    Parameters
    ----------
    lag : int
        lagtime in trajectory steps

    count_mode : str, optional, default='sliding'
        mode to obtain count matrices from discrete trajectories. Should be one of:

        * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
          at time indexes

          .. math::
                (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

        * 'effective' : Uses an estimate of the transition counts that are
          statistically uncorrelated. Recommended when used with a
          Bayesian MSM.

        * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
          at time indexes

          .. math::
                (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

    mincount_connectivity : float or '1/n', optional, default='1/n'
        Minimum count for a transition to be considered when computing the
        connected sets. '1/n' is replaced by one over the number of rows of
        the count matrix. If the value is not positive, the connected sets
        are computed on the unfiltered count matrix.

    show_progress: bool, default=True
        show the progress for the expensive effective count mode computation.

    Raises
    ------
    ValueError
        If ``count_mode`` is not one of the modes listed above.

    """
    # store lag time
    self._lag = lag

    # Compute count matrix
    count_mode = count_mode.lower()
    if count_mode == 'sliding':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=True)
    elif count_mode == 'sample':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=False)
    elif count_mode == 'effective':
        from pyemma.util.reflection import getargspec_no_self
        argspec = getargspec_no_self(msmest.effective_count_matrix)
        kw = {}
        # only pass progress arguments if the installed function accepts them
        if show_progress and 'callback' in argspec.args:
            from pyemma._base.progress import ProgressReporter
            from pyemma._base.parallel import get_n_jobs

            pg = ProgressReporter()
            # this is a fast operation
            C_temp = msmest.count_matrix(self._dtrajs, lag, sliding=True)
            pg.register(C_temp.nnz, 'compute statistical inefficiencies')
            del C_temp
            callback = lambda: pg.update(1)
            kw['callback'] = callback
            kw['n_jobs'] = get_n_jobs()

        self._C = msmest.effective_count_matrix(self._dtrajs, lag, **kw)
    else:
        raise ValueError('Count mode ' + count_mode + ' is unknown.')

    # store mincount_connectivity
    if mincount_connectivity == '1/n':
        mincount_connectivity = 1.0 / np.shape(self._C)[0]
    self._mincount_connectivity = mincount_connectivity

    # Compute reversibly connected sets
    if self._mincount_connectivity > 0:
        self._connected_sets = \
            self._compute_connected_sets(self._C, mincount_connectivity=self._mincount_connectivity)
    else:
        self._connected_sets = msmest.connected_sets(self._C)
    n_sets = len(self._connected_sets)

    # set sizes and count matrices on reversibly connected sets
    self._connected_set_sizes = np.zeros(n_sets)
    # the builtin `object` is used because the `np.object` alias no longer
    # exists in current NumPy releases; the entries are left unfilled here
    self._C_sub = np.empty(n_sets, dtype=object)
    for i, cset in enumerate(self._connected_sets):
        # set size
        self._connected_set_sizes[i] = len(cset)
        # submatrix
        # self._C_sub[i] = submatrix(self._C, self._connected_sets[i])

    # largest connected set (first entry of the connected-set list)
    self._lcs = self._connected_sets[0]

    # if lcs has no counts, make lcs empty
    if submatrix(self._C, self._lcs).sum() == 0:
        self._lcs = np.array([], dtype=int)

    # mapping from full to lcs; states outside the lcs stay at -1
    self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
    self._full2lcs[self._lcs] = np.arange(len(self._lcs))

    # remember that this function was called
    self._counted_at_lag = True
def weakly_connected_sets(self):
    """Return the connected sets of ``self._Tij`` in the undirected graph."""
    matrix = self._Tij
    undirected_sets = msmest.connected_sets(matrix, directed=False)
    return undirected_sets
def _estimate(self, dtrajs): if self.E is None or self.w is None or self.m is None: raise ValueError("E, w or m was not specified. Stopping.") # get trajectory counts. This sets _C_full and _nstates_full dtrajstats = self._get_dtraj_stats(dtrajs) self._C_full = dtrajstats.count_matrix() # full count matrix self._nstates_full = self._C_full.shape[0] # number of states # set active set. This is at the same time a mapping from active to full if self.connectivity == 'largest': # statdist not given - full connectivity on all states self.active_set = dtrajstats.largest_connected_set else: # for 'None' and 'all' all visited states are active self.active_set = dtrajstats.visited_set # FIXME: setting is_estimated before so that we can start using the parameters just set, but this is not clean! # is estimated self._is_estimated = True # if active set is empty, we can't do anything. if _np.size(self.active_set) == 0: raise RuntimeError('Active set is empty. Cannot estimate AMM.') # active count matrix and number of states self._C_active = dtrajstats.count_matrix(subset=self.active_set) self._nstates = self._C_active.shape[0] # computed derived quantities # back-mapping from full to lcs self._full2active = -1 * _np.ones(dtrajstats.nstates, dtype=int) self._full2active[self.active_set] = _np.arange(len(self.active_set)) # slice out active states from E matrix _dset = list(set(_np.concatenate(self._dtrajs_full))) _rras = [_dset.index(s) for s in self.active_set] self.E_active = self.E[_rras] if not self.sparse: self._C_active = self._C_active.toarray() self._C_full = self._C_full.toarray() # reversibly counted self._C2 = 0.5 * (self._C_active + self._C_active.T) self._nz = _np.nonzero(self._C2) self._csum = _np.sum(self._C_active, axis=1) # row sums C # get ranges of Markov model expectation values if self.support_ci == 1: self.E_min = _np.min(self.E_active, axis=0) self.E_max = _np.max(self.E_active, axis=0) else: # PyEMMA confidence interval calculation fails sometimes with 
conf=1.0 self.E_min, self.E_max = _ci(self.E_active, conf=self.support_ci) # dimensions of E matrix self.n_mstates_active, self.n_exp_active = _np.shape(self.E_active) assert self.n_exp_active == len(self.w) assert self.n_exp_active == len(self.m) self.count_outside = [] self.count_inside = [] self._lls = [] i = 0 # Determine which experimental values are outside the support as defined by the Confidence interval for emi, ema, mm, mw in zip(self.E_min, self.E_max, self.m, self.w): if mm < emi or ema < mm: self.logger.info( "Experimental value %f is outside the support (%f,%f)" % (mm, emi, ema)) self.count_outside.append(i) else: self.count_inside.append(i) i = i + 1 self.logger.info( "Total experimental constraints outside support %d of %d" % (len(self.count_outside), len(self.E_min))) # A number of initializations self.P, self.pi = msmest.tmatrix(self._C_active, reversible=True, return_statdist=True) self.lagrange = _np.zeros(self.m.shape) self._pihat = self.pi.copy() self._update_mhat() self._dmhat = 1e-1 * _np.ones(_np.shape(self.mhat)) # Determine number of slices of R-tensors computable at once with the given cache size self._slicesz = _np.floor(self.max_cache / (self.P.nbytes / 1.e6)).astype(int) # compute first bundle of slices self._update_Rslices(0) self._ll_old = self._log_likelihood_biased(self._C_active, self.P, self.m, self.mhat, self.w) self._lls = [self._ll_old] # make sure everything is initialized self._update_pihat() self._update_mhat() self._update_Q() self._update_X_and_pi() self._ll_old = self._log_likelihood_biased(self._C_active, self.P, self.m, self.mhat, self.w) self._update_G() # # Main estimation algorithm # 2-step algorithm, lagrange multipliers and pihat have different convergence criteria # when the lagrange multipliers have converged, pihat is updated until the log-likelihood has converged (changes are smaller than 1e-3). # These do not always converge together, but usually within a few steps of each other. 
# A better heuristic for the latter may be necessary. For realistic cases (the two ubiquitin examples in [1]) # this yielded results very similar to those with more stringent convergence criteria (changes smaller than 1e-9) with convergence times # which are seconds instead of tens of minutes. # converged = False # Convergence flag for lagrange multipliers i = 0 die = False while i <= self.maxiter: pihat_old = self._pihat.copy() self._update_pihat() if not _np.all(self._pihat > 0): self._pihat = pihat_old.copy() die = True self.logger.warning( "pihat does not have a finite probability for all states, terminating" ) self._update_mhat() self._update_Q() if i > 1: X_old = self.X.copy() self._update_X_and_pi() if _np.any(self.X[self._nz] < 0) and i > 0: die = True self.logger.warning( "Warning: new X is not proportional to C... reverting to previous step and terminating" ) self.X = X_old.copy() if not converged: self._newton_lagrange() else: # once Lagrange multipliers are converged compute likelihood here P = self.X / self.pi[:, None] _ll_new = self._log_likelihood_biased(self._C_active, P, self.m, self.mhat, self.w) self._lls.append(_ll_new) # General case fixed-point iteration if len(self.count_outside) > 0: if i > 1 and _np.all( (_np.abs(self._dmhat) / self.sigmas) < self.eps) and not converged: self.logger.info( "Converged Lagrange multipliers after %i steps..." % i) converged = True # Special case else: if _np.abs(self._lls[-2] - self._lls[-1]) < 1e-8: self.logger.info( "Converged Lagrange multipliers after %i steps..." % i) converged = True # if Lagrange multipliers are converged, check whether log-likelihood has converged if converged and _np.abs(self._lls[-2] - self._lls[-1]) < 1e-8: self.logger.info("Converged pihat after %i steps..." % i) die = True if die: break if i == self.maxiter: self.logger.info("Failed to converge within %i iterations. 
" "Consider increasing max_iter(now=%i)" % (i, self.max_iter)) i += 1 _P = msmest.tmatrix(self._C_active, reversible=True, mu=self._pihat) self._connected_sets = msmest.connected_sets(self._C_full) self.set_model_params(P=_P, pi=self._pihat, reversible=True, dt_model=self.timestep_traj.get_scaled(self.lag)) return self
def count_lagged(self, lag, count_mode='sliding', mincount_connectivity='1/n',
                 show_progress=True, n_jobs=None, name='', core_set=None, milestoning_method='last_core'):
    r""" Counts transitions at given lag time

    Parameters
    ----------
    lag : int
        lagtime in trajectory steps

    count_mode : str, optional, default='sliding'
        mode to obtain count matrices from discrete trajectories
        (matched case-insensitively). Should be one of:

        * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
          at time indexes

          .. math::
              (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

        * 'effective' : Uses an estimate of the transition counts that are
          statistically uncorrelated. Recommended when used with a
          Bayesian MSM.

        * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
          at time indexes

          .. math::
              (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

    mincount_connectivity : float or '1/n', optional, default='1/n'
        Threshold handed to the connected-set computation. The string
        '1/n' is replaced by 1 / n, with n the number of rows of the count
        matrix. If the resulting value is not positive, the connected sets
        are computed on the unmodified count matrix.

    show_progress: bool, default=True
        show the progress for the expensive effective count mode computation.

    n_jobs: int or None
        Forwarded as ``n_jobs`` to the effective count matrix routine when
        that routine accepts a ``callback`` argument; ``None`` falls back to
        ``get_n_jobs()``.

    name : str, optional, default=''
        Prefix used in the progress label of the effective count mode.

    core_set : optional, default=None
        Only tested against ``None`` here. When given together with
        'sliding' or 'sample' counting, every ``-1`` entry of the stored
        discrete trajectories is overwritten **in place** by the state that
        precedes it before counting.

    milestoning_method : str, optional, default='last_core'
        Only 'last_core' is implemented.

    Raises
    ------
    ValueError
        If `count_mode` is unknown.
    RuntimeError
        If `core_set` is given together with 'effective' counting.
    NotImplementedError
        If `core_set` is given and `milestoning_method` is not 'last_core'.
    AssertionError
        If `core_set` is given and a trajectory starts with ``-1``.

    """
    # store lag time
    self._lag = lag

    # Compute count matrix
    count_mode = count_mode.lower()
    if core_set is not None and count_mode in ('sliding', 'sample'):
        if milestoning_method == 'last_core':

            # assign -1 frames to last visited core
            for d in self._dtrajs:
                # a leading -1 has no predecessor to copy from
                assert d[0] != -1
                # each pass copies the predecessor into every -1 slot; runs of
                # consecutive -1 entries therefore need several passes
                while -1 in d:
                    mask = (d == -1)
                    d[mask] = d[np.roll(mask, -1)]
            self._C = msmest.count_matrix(self._dtrajs, lag, sliding=count_mode == 'sliding')

        else:
            raise NotImplementedError('Milestoning method {} not implemented.'.format(milestoning_method))

    elif count_mode == 'sliding':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=True)
    elif count_mode == 'sample':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=False)
    elif count_mode == 'effective':
        if core_set is not None:
            raise RuntimeError('Cannot estimate core set MSM with effective counting.')
        # NOTE(review): the package is spelled 'pyerna' in the imports below -
        # confirm this is the intended module name and not a typo for 'pyemma'.
        from pyerna.util.reflection import getargspec_no_self
        argspec = getargspec_no_self(msmest.effective_count_matrix)
        kw = {}
        from pyerna.util.contexts import nullcontext
        ctx = nullcontext()
        if 'callback' in argspec.args:  # msmtools effective cmatrix ready for multiprocessing?
            from pyerna._base.progress import ProgressReporter
            from pyerna._base.parallel import get_n_jobs

            kw['n_jobs'] = get_n_jobs() if n_jobs is None else n_jobs

            if show_progress:
                pg = ProgressReporter()
                # this is a fast operation; it is only used to size the progress bar
                C_temp = msmest.count_matrix(self._dtrajs, lag, sliding=True)
                pg.register(C_temp.nnz, '{}: compute stat. inefficiencies'.format(name), stage=0)
                del C_temp
                kw['callback'] = pg.update
                ctx = pg.context(stage=0)
        with ctx:
            self._C = msmest.effective_count_matrix(self._dtrajs, lag, **kw)
    else:
        raise ValueError('Count mode ' + count_mode + ' is unknown.')

    # store mincount_connectivity
    if mincount_connectivity == '1/n':
        mincount_connectivity = 1.0 / np.shape(self._C)[0]
    self._mincount_connectivity = mincount_connectivity

    # Compute reversibly connected sets
    if self._mincount_connectivity > 0:
        self._connected_sets = self._compute_connected_sets(
            self._C, mincount_connectivity=self._mincount_connectivity)
    else:
        self._connected_sets = msmest.connected_sets(self._C)

    # set sizes on reversibly connected sets
    n_sets = len(self._connected_sets)
    self._connected_set_sizes = np.zeros(n_sets)
    # Per-set count submatrices are not precomputed here; the container is
    # only allocated. The builtin `object` is used because the `np.object`
    # alias no longer exists in current NumPy releases.
    self._C_sub = np.empty(n_sets, dtype=object)
    for i, cset in enumerate(self._connected_sets):
        self._connected_set_sizes[i] = len(cset)

    # largest connected set
    self._lcs = self._connected_sets[0]

    # if lcs has no counts, make lcs empty
    if submatrix(self._C, self._lcs).sum() == 0:
        self._lcs = np.array([], dtype=int)

    # mapping from full to lcs
    self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
    self._full2lcs[self._lcs] = np.arange(len(self._lcs))

    # remember that this function was called
    self._counted_at_lag = True
def estimate_pi_error(dtrajs, orig_msm, ntrails=10, conf_interval=0.68, return_samples=False):
    """
    Estimate bootstrap error for stationary probability

    Trajectories are resampled with replacement ``ntrails`` times; for each
    resample an MSM is estimated at the lag time of ``orig_msm`` and its
    stationary distribution is stored, with zeros inserted for states that
    are not in the largest connected set of the resampled count matrix.

    :param dtrajs: list of np.array, discrete trajectories
    :param orig_msm: pyemma.msm.MarkovModel
        Only used for reference of lag time and to incorporate ML
        stationary distribution to data frame
    :param ntrails: int, the number of bootstrap samples to draw.
    :param conf_interval: float 0 < conf_interval < 1
    :param return_samples: bool, if True the raw sample array of shape
        (ntrails, number of states of the full count matrix) is returned
        instead of the summary data frame
    :return: pandas.DataFrame instance containing ML MSM pi and bootstrap error
        (or the sample array, see ``return_samples``)
    """
    import warnings

    from pyemma.util.statistics import confidence_interval

    n_states = orig_msm.count_matrix_full.shape[0]
    pi_samples = np.zeros((ntrails, n_states))
    all_states = np.arange(start=0, stop=n_states, step=1)

    for trial in tqdm(range(ntrails)):
        try:
            bs_sample = np.random.choice(len(dtrajs), size=len(dtrajs), replace=True)
            # Pick trajectories by index instead of going through np.array():
            # trajectories of different lengths cannot be packed into one array.
            dtraj_sample = [dtrajs[k] for k in bs_sample]

            msm = pyemma.msm.estimate_markov_model(dtraj_sample, lag=orig_msm.lag)

            stationary_probs = msm.pi

            # compute the connected sets once per trial rather than once per state
            csets = connected_sets(msm.count_matrix_full)
            if len(csets) > 1:
                # ascending order matters: every insertion shifts later entries
                # so that each zero ends up at its final state index
                disconnected_states = np.setdiff1d(all_states, csets[0])
                for element in disconnected_states:
                    stationary_probs = np.insert(stationary_probs, element, 0)

            pi_samples[trial, all_states] = stationary_probs
        except Exception as e:
            # Best effort: a failed resample must not abort (or, as before,
            # drop into an interactive debugger). Its row keeps the initial zeros.
            warnings.warn('Bootstrap trial %d failed and is left as zeros: %s' % (trial, e))

    if return_samples:
        return pi_samples

    std = pi_samples.std(axis=0)
    lower_confidence, upper_confidence = confidence_interval(pi_samples, conf_interval)

    probabilities = pd.DataFrame(
        np.array([orig_msm.active_set, orig_msm.pi, std, lower_confidence, upper_confidence]).T,
        columns=['State', 'StatDist', 'Std', 'LowerConf', 'UpperConf'],
    )

    # type cast to int
    probabilities['State'] = probabilities['State'].astype(int)

    return probabilities
def _estimate(self, dtrajs):
    """Estimate the Markov state model from discrete trajectories.

    Counts transitions at ``self.lag``, selects the active set according to
    ``self.connectivity`` (and ``self.statdist_constraint``), estimates the
    transition matrix on the active count matrix and stores the result via
    ``set_model_params``.

    Parameters
    ----------
    dtrajs
        Discrete trajectories; normalised with ``ensure_dtraj_list``.

    Returns
    -------
    self

    Raises
    ------
    RuntimeError
        If the active set is empty.
    ValueError
        If connectivity is 'none', a reversible estimate is requested and
        the active count matrix is not connected.
    NotImplementedError
        If the connectivity mode is neither 'largest' nor 'none'.
    """
    # bring the input into list form
    dtrajs = ensure_dtraj_list(dtrajs)

    # reuse precomputed statistics when given, otherwise build them
    stats = dtrajs if isinstance(dtrajs, _DiscreteTrajectoryStats) else _DiscreteTrajectoryStats(dtrajs)

    # dense estimation on many states is expensive - tell the user
    if stats.nstates > 4000 and not self.sparse:
        self.logger.warning('Building a dense MSM with ' + str(stats.nstates) + ' states. This can be '
                            'inefficient or unfeasible in terms of both runtime and memory consumption. '
                            'Consider using sparse=True.')

    # transition counts at the configured lag
    stats.count_lagged(self.lag, count_mode=self.count_mode)

    # count matrix over all states, and its size
    self._C_full = stats.count_matrix()
    self._nstates_full = self._C_full.shape[0]

    # choose the active set; it doubles as the active -> full index map
    if self.connectivity != 'largest':
        # every visited state is active
        self.active_set = stats.visited_set
    elif self.statdist_constraint is None:
        # no stationary distribution supplied: use the largest connected set
        self.active_set = stats.largest_connected_set
    else:
        self.active_set = self._prepare_input_revpi(self._C_full, self.statdist_constraint)

    # FIXME: flagged as estimated early so the parameters set above are usable; not clean
    self._is_estimated = True

    # nothing can be estimated without active states
    if _np.size(self.active_set) == 0:
        raise RuntimeError('Active set is empty. Cannot estimate MSM.')

    # count matrix restricted to the active set
    self._C_active = stats.count_matrix(subset=self.active_set)
    self._nstates = self._C_active.shape[0]

    # inverse map full -> active; -1 marks inactive states
    self._full2active = -1 * _np.ones((stats.nstates), dtype=int)
    self._full2active[self.active_set] = _np.arange(len(self.active_set))

    # stationary distribution on the active set, renormalised
    statdist_active = None
    if self.statdist_constraint is not None:
        statdist_active = self.statdist_constraint[self.active_set]
        statdist_active /= statdist_active.sum()

    # validate the connectivity mode before estimating
    if self.connectivity == 'largest':
        pass
    elif self.connectivity == 'none':
        # a reversible estimate requires a connected active set; when all
        # visited states are connected this is the same as 'largest'
        if self.reversible and not msmest.is_connected(self._C_active):
            raise ValueError('Reversible MSM estimation is not possible with connectivity mode "none", '
                             'because the set of all visited states is not reversibly connected')
    else:
        raise NotImplementedError(
            'MSM estimation with connectivity=%s is currently not implemented.' % self.connectivity)

    P = msmest.transition_matrix(self._C_active, reversible=self.reversible,
                                 mu=statdist_active, maxiter=self.maxiter,
                                 maxerr=self.maxerr)

    if not self.sparse:
        # switch to dense arrays so that every downstream quantity is
        # computed with dense linear algebra
        self._C_full = self._C_full.toarray()
        self._C_active = self._C_active.toarray()
        P = P.toarray()

    # the estimator is its own model: store inputs and parameters on self
    self._dtrajs_full = dtrajs
    self._connected_sets = msmest.connected_sets(self._C_full)
    model_timestep = self.timestep_traj.get_scaled(self.lag)
    self.set_model_params(P=P, pi=statdist_active, reversible=self.reversible,
                          dt_model=model_timestep)
    return self
def strongly_connected_sets(self):
    """Return ``msmest.connected_sets`` of ``self._Tij`` with ``directed=True``."""
    matrix = self._Tij
    return msmest.connected_sets(matrix, directed=True)