import warnings

import numpy as np

# The helpers below are assumed to come from the surrounding package: `mdot`
# chains matrix products, `spd_inv_sqrt` computes the inverse square root of a
# symmetric positive-definite matrix, and `PCCAModel` wraps the coarse-grained
# results. Adjust the import paths to your package layout.
from .util import mdot, spd_inv_sqrt  # hypothetical location
from .pcca_model import PCCAModel  # hypothetical location


def _svd_sym_koopman(K, C00_train, Ctt_train):
    """ Computes the SVD of the symmetrized Koopman operator in the empirical distribution. """
    # reweight operator to empirical distribution
    C0t_re = mdot(C00_train, K)
    # symmetrized operator and SVD
    K_sym = mdot(spd_inv_sqrt(C00_train), C0t_re, spd_inv_sqrt(Ctt_train))
    U, S, Vt = np.linalg.svd(K_sym, compute_uv=True, full_matrices=False)
    # project back to singular functions of K
    U = mdot(spd_inv_sqrt(C00_train), U)
    Vt = mdot(Vt, spd_inv_sqrt(Ctt_train))
    return U, S, Vt.T
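

# Illustrative sketch (hypothetical, not part of the library): a quick numerical
# check that the back-projected singular functions returned by _svd_sym_koopman
# are orthonormal with respect to the empirical covariances, i.e.
# U^T C00 U = I and V^T Ctt V = I. Uses only numpy and the helpers above.
def _demo_svd_sym_koopman(seed=0, n=4, length=1000):
    rng = np.random.default_rng(seed)
    x = rng.normal(size=(length, n))
    y = rng.normal(size=(length, n))
    C00 = x.T @ x / length
    Ctt = y.T @ y / length
    C0t = x.T @ y / length
    K = np.linalg.solve(C00, C0t)  # plain empirical Koopman matrix
    U, S, V = _svd_sym_koopman(K, C00, Ctt)
    assert np.allclose(mdot(U.T, C00, U), np.eye(n), atol=1e-8)
    assert np.allclose(mdot(V.T, Ctt, V), np.eye(n), atol=1e-8)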


def pcca(P, m):
    """PCCA+ spectral clustering method with optimized memberships [1]_.

    Clusters the first m eigenvectors of a transition matrix in order to cluster the
    states. This function does not assume that the transition matrix is fully connected.
    Disconnected sets will automatically define the first metastable states, with
    perfect membership assignments.

    Parameters
    ----------
    P : ndarray (n,n)
        Transition matrix.
    m : int
        Number of clusters to group to.

    References
    ----------
    .. [1] S. Roeblitz and M. Weber, Fuzzy spectral clustering by PCCA+:
           application to Markov state models and data classification.
           Adv Data Anal Classif 7, 147-179 (2013).
    """
    assert 0 < m <= P.shape[0]
    from scipy.sparse import issparse
    if issparse(P):
        warnings.warn('PCCA is only implemented for dense matrices, '
                      'converting sparse transition matrix to dense ndarray.',
                      stacklevel=2)
        P = P.toarray()

    # memberships
    # TODO: can be improved. pcca computes the stationary distribution internally,
    # so we do not need to compute it twice.
    from msmtools.analysis.dense.pcca import pcca as _algorithm_impl
    M = _algorithm_impl(P, m)

    # stationary distribution
    # TODO: in msmtools we recomputed this from P; we actually want to use pi from
    # the msm object, but this caused #1208
    from msmtools.analysis import stationary_distribution
    pi = stationary_distribution(P)

    # coarse-grained stationary distribution
    pi_coarse = np.dot(M.T, pi)

    # HMM output matrix
    B = mdot(np.diag(1.0 / pi_coarse), M.T, np.diag(pi))
    # renormalize B to make it row-stochastic
    B /= B.sum(axis=1)[:, None]

    # coarse-grained transition matrix
    W = np.linalg.inv(np.dot(M.T, M))
    A = np.dot(np.dot(M.T, P), M)
    P_coarse = np.dot(W, A)

    # symmetrize and renormalize to eliminate numerical errors
    X = np.dot(np.diag(pi_coarse), P_coarse)
    P_coarse = X / X.sum(axis=1)[:, None]

    return PCCAModel(P_coarse, pi_coarse, M, B)
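

# Usage sketch (hypothetical, not from the library's documentation): coarse-grain
# a 4-state transition matrix with two metastable pairs into m=2 sets. The
# `memberships` attribute name on the returned PCCAModel is an assumption here.
def _demo_pcca():
    P = np.array([[0.90, 0.08, 0.01, 0.01],
                  [0.08, 0.90, 0.01, 0.01],
                  [0.01, 0.01, 0.90, 0.08],
                  [0.01, 0.01, 0.08, 0.90]])
    model = pcca(P, m=2)
    # memberships are row-stochastic: each fine state distributes over the 2 sets
    assert np.allclose(model.memberships.sum(axis=1), 1.0)  # assumed attribute
    return model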
def propagate(self, p0, k): r""" Propagates the initial distribution p0 k times Computes the product .. math:: p_k = p_0^T P^k If the lag time of transition matrix :math:`P` is :math:`\tau`, this will provide the probability distribution at time :math:`k \tau`. Parameters ---------- p0 : ndarray(n) Initial distribution. Vector of size of the active set. k : int Number of time steps Returns ---------- pk : ndarray(n) Distribution after k steps. Vector of size of the active set. """ # p0 = types.ensure_ndarray(p0, ndim=1, kind='numeric') # assert types.is_int(k) and k >= 0, 'k must be a non-negative integer' if k == 0: # simply return p0 normalized return p0 / p0.sum() micro = False # are we on microstates space? if len(p0) == self.n_states_obs: micro = True # project to hidden and compute p0 = np.dot(self.observation_probabilities, p0) self._ensure_eigendecomposition(self.n_states) # TODO: eigenvectors_right() and so forth call ensure_eigendecomp again with self.neig instead of self.n_states pk = mdot(p0.T, self.eigenvectors_right(), np.diag(np.power(self.eigenvalues(), k)), self.eigenvectors_left()) if micro: pk = np.dot(pk, self.observation_probabilities ) # convert back to microstate space # normalize to 1.0 and return return pk / pk.sum()
def propagate(self, p0, k): r""" Propagates the initial distribution p0 defined on observable space k times. Therefore computes the product .. math:: p_k = p_0^T P^k If the lag time of transition matrix :math:`P` is :math:`\tau`, this will provide the probability distribution at time :math:`k \tau`. Parameters ---------- p0 : ndarray(n) Initial distribution. Vector of size of the active set. k : int Number of time steps Returns ---------- pk : ndarray(n) Distribution after k steps """ if k == 0: # simply return p0 normalized return p0 / p0.sum() p0 = self._project_to_hidden(p0) ev_right = self.transition_model.eigenvectors_right( self.n_hidden_states) ev_left = self.transition_model.eigenvectors_left(self.n_hidden_states) pk = mdot(p0.T, ev_right, np.diag(np.power(self.transition_model.eigenvalues(), k)), ev_left) pk = np.dot( pk, self.output_probabilities) # convert back to microstate space # normalize to 1.0 and return return pk / pk.sum()


def vamp_2_score(K, C00_train, C0t_train, Ctt_train, C00_test, C0t_test, Ctt_test, k=None):
    r""" Computes the VAMP-2 score of a kinetic model.

    Ranks the kinetic model described by the estimation of covariances C00, C0t and Ctt,
    defined by:

        :math:`C_{00}^{train} = E_t[x_t x_t^T]`

        :math:`C_{0t}^{train} = E_t[x_t x_{t+\tau}^T]`

        :math:`C_{tt}^{train} = E_t[x_{t+\tau} x_{t+\tau}^T]`

    These model covariances might have been subject to symmetrization or reweighting,
    depending on the type of model used.

    The covariances C00, C0t and Ctt of the test data are direct empirical estimates.
    The score projects the test covariances onto the singular vectors U and V of the
    training model, which come from the SVD of the symmetrized transition matrix or
    Koopman matrix:

        :math:`(C_{00}^{train})^{-1/2} C_{0t}^{train} (C_{tt}^{train})^{-1/2} = U S V^T`

    Parameters
    ----------
    K : ndarray(n, n)
        Koopman matrix (or transition matrix) of the kinetic model, estimated from the
        training data; its symmetrized SVD is computed internally via
        :func:`_svd_sym_koopman`.
    C00_train : ndarray(n, n)
        covariance matrix of the training data, defined by
        :math:`C_{00}^{train} = (T-\tau)^{-1} \sum_{t=0}^{T-\tau} x_t x_t^T`
    C0t_train : ndarray(n, n)
        time-lagged covariance matrix of the training data, defined by
        :math:`C_{0t}^{train} = (T-\tau)^{-1} \sum_{t=0}^{T-\tau} x_t x_{t+\tau}^T`
    Ctt_train : ndarray(n, n)
        covariance matrix of the training data, defined by
        :math:`C_{tt}^{train} = (T-\tau)^{-1} \sum_{t=0}^{T-\tau} x_{t+\tau} x_{t+\tau}^T`
    C00_test : ndarray(n, n)
        covariance matrix of the test data, defined by
        :math:`C_{00}^{test} = (T-\tau)^{-1} \sum_{t=0}^{T-\tau} x_t x_t^T`
    C0t_test : ndarray(n, n)
        time-lagged covariance matrix of the test data, defined by
        :math:`C_{0t}^{test} = (T-\tau)^{-1} \sum_{t=0}^{T-\tau} x_t x_{t+\tau}^T`
    Ctt_test : ndarray(n, n)
        covariance matrix of the test data, defined by
        :math:`C_{tt}^{test} = (T-\tau)^{-1} \sum_{t=0}^{T-\tau} x_{t+\tau} x_{t+\tau}^T`
    k : int, optional, default=None
        number of slow processes to consider in the score; if None, all are used

    Returns
    -------
    vamp2 : float
        VAMP-2 score
    """
    # SVD of symmetrized operator in empirical distribution
    U, _, V = _svd_sym_koopman(K, C00_train, Ctt_train)
    if k is not None:
        U = U[:, :k]
        V = V[:, :k]
    A = spd_inv_sqrt(mdot(U.T, C00_test, U))
    B = mdot(U.T, C0t_test, V)
    C = spd_inv_sqrt(mdot(V.T, Ctt_test, V))

    # compute the squared Frobenius norm, equal to the sum of squared singular values
    score = np.linalg.norm(mdot(A, B, C), ord='fro') ** 2
    return score
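

# Usage sketch (hypothetical): score a linear model on a train/test split of a
# 2-dimensional time series. The covariances here are plain empirical estimates;
# real estimators may symmetrize or reweight them before scoring.
def _demo_vamp_2_score(tau=1):
    rng = np.random.default_rng(1)
    traj = rng.normal(size=(2000, 2))

    def covs(x):
        a, b = x[:-tau], x[tau:]
        n = len(a)
        return a.T @ a / n, a.T @ b / n, b.T @ b / n

    C00_tr, C0t_tr, Ctt_tr = covs(traj[:1000])
    C00_te, C0t_te, Ctt_te = covs(traj[1000:])
    K = np.linalg.solve(C00_tr, C0t_tr)  # Koopman matrix from the training data
    return vamp_2_score(K, C00_tr, C0t_tr, Ctt_tr, C00_te, C0t_te, Ctt_te, k=1)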


def vamp_e_score(K, C00_train, C0t_train, Ctt_train, C00_test, C0t_test, Ctt_test, k=None):
    r""" Computes the VAMP-E score of a kinetic model.

    Ranks the kinetic model described by the estimation of covariances C00, C0t and Ctt,
    defined by:

        :math:`C_{00}^{train} = E_t[x_t x_t^T]`

        :math:`C_{0t}^{train} = E_t[x_t x_{t+\tau}^T]`

        :math:`C_{tt}^{train} = E_t[x_{t+\tau} x_{t+\tau}^T]`

    These model covariances might have been subject to symmetrization or reweighting,
    depending on the type of model used.

    The covariances C00, C0t and Ctt of the test data are direct empirical estimates.
    The score projects the test covariances onto the singular vectors U and V of the
    training model, which come from the SVD of the symmetrized transition matrix or
    Koopman matrix:

        :math:`(C_{00}^{train})^{-1/2} C_{0t}^{train} (C_{tt}^{train})^{-1/2} = U S V^T`

    Parameters
    ----------
    K : ndarray(n, n)
        Koopman matrix (or transition matrix) of the kinetic model, estimated from the
        training data; its symmetrized SVD is computed internally via
        :func:`_svd_sym_koopman`.
    C00_train : ndarray(n, n)
        covariance matrix of the training data, defined by
        :math:`C_{00}^{train} = (T-\tau)^{-1} \sum_{t=0}^{T-\tau} x_t x_t^T`
    C0t_train : ndarray(n, n)
        time-lagged covariance matrix of the training data, defined by
        :math:`C_{0t}^{train} = (T-\tau)^{-1} \sum_{t=0}^{T-\tau} x_t x_{t+\tau}^T`
    Ctt_train : ndarray(n, n)
        covariance matrix of the training data, defined by
        :math:`C_{tt}^{train} = (T-\tau)^{-1} \sum_{t=0}^{T-\tau} x_{t+\tau} x_{t+\tau}^T`
    C00_test : ndarray(n, n)
        covariance matrix of the test data, defined by
        :math:`C_{00}^{test} = (T-\tau)^{-1} \sum_{t=0}^{T-\tau} x_t x_t^T`
    C0t_test : ndarray(n, n)
        time-lagged covariance matrix of the test data, defined by
        :math:`C_{0t}^{test} = (T-\tau)^{-1} \sum_{t=0}^{T-\tau} x_t x_{t+\tau}^T`
    Ctt_test : ndarray(n, n)
        covariance matrix of the test data, defined by
        :math:`C_{tt}^{test} = (T-\tau)^{-1} \sum_{t=0}^{T-\tau} x_{t+\tau} x_{t+\tau}^T`
    k : int, optional, default=None
        number of slow processes to consider in the score; if None, all are used

    Returns
    -------
    vampE : float
        VAMP-E score
    """
    # SVD of symmetrized operator in empirical distribution
    U, s, V = _svd_sym_koopman(K, C00_train, Ctt_train)
    if k is not None:
        U = U[:, :k]
        S = np.diag(s[:k])
        V = V[:, :k]
    else:
        S = np.diag(s)  # fix: S was previously undefined when k is None
    score = np.trace(2.0 * mdot(V, S, U.T, C0t_test)
                     - mdot(V, S, U.T, C00_test, U, S, V.T, Ctt_test))
    return score
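

# Consistency sketch (hypothetical): when the test covariances equal the training
# covariances, both VAMP-2 and VAMP-E reduce to the sum of squared singular values
# of the symmetrized Koopman matrix, so the two scores should agree numerically.
def _demo_vamp_e_equals_vamp_2():
    rng = np.random.default_rng(2)
    x = rng.normal(size=(2000, 3))
    a, b = x[:-1], x[1:]
    n = len(a)
    C00, C0t, Ctt = a.T @ a / n, a.T @ b / n, b.T @ b / n
    K = np.linalg.solve(C00, C0t)
    s2 = vamp_2_score(K, C00, C0t, Ctt, C00, C0t, Ctt)
    se = vamp_e_score(K, C00, C0t, Ctt, C00, C0t, Ctt)
    assert np.isclose(s2, se, rtol=1e-6, atol=1e-10)
    return s2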


def pcca(P, m, stationary_distribution=None):
    """PCCA+ spectral clustering method with optimized memberships.

    Implementation according to [1]_. Clusters the first m eigenvectors of a
    transition matrix in order to cluster the states. This function does not assume
    that the transition matrix is fully connected. Disconnected sets will
    automatically define the first metastable states, with perfect membership
    assignments.

    Parameters
    ----------
    P : ndarray (n,n)
        Transition matrix.
    m : int
        Number of clusters to group to.
    stationary_distribution : ndarray(n,), optional, default=None
        Stationary distribution over the full state space, can be given if
        already computed.

    References
    ----------
    .. [1] S. Roeblitz and M. Weber, Fuzzy spectral clustering by PCCA+:
           application to Markov state models and data classification.
           Adv Data Anal Classif 7, 147-179 (2013).
    """
    if m <= 0 or m > P.shape[0]:
        raise ValueError("Number of metastable sets must be larger than 0 and can be "
                         "at most as large as the number of states.")
    from scipy.sparse import issparse
    if issparse(P):
        warnings.warn('PCCA is only implemented for dense matrices, '
                      'converting sparse transition matrix to dense ndarray.',
                      stacklevel=2)
        P = P.toarray()

    # stationary distribution
    if stationary_distribution is None:
        from msmtools.analysis import stationary_distribution as statdist
        pi = statdist(P)
    else:
        pi = stationary_distribution

    # memberships
    # TODO: can be improved. pcca computes the stationary distribution internally,
    # so we do not need to compute it twice.
    from msmtools.analysis.dense.pcca import pcca as _algorithm_impl
    M = _algorithm_impl(P, m)

    # coarse-grained stationary distribution
    pi_coarse = np.dot(M.T, pi)

    # HMM output matrix
    B = mdot(np.diag(1.0 / pi_coarse), M.T, np.diag(pi))
    # renormalize B to make it row-stochastic
    B /= B.sum(axis=1)[:, None]

    # coarse-grained transition matrix
    W = np.linalg.inv(np.dot(M.T, M))
    A = np.dot(np.dot(M.T, P), M)
    P_coarse = np.dot(W, A)

    # symmetrize and renormalize to eliminate numerical errors
    X = np.dot(np.diag(pi_coarse), P_coarse)
    P_coarse = X / X.sum(axis=1)[:, None]

    return PCCAModel(P_coarse, pi_coarse, M, B)
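

# Usage sketch (hypothetical): pass a precomputed stationary distribution so that
# pcca() does not have to recompute it from P, as the keyword argument allows.
def _demo_pcca_with_pi():
    from msmtools.analysis import stationary_distribution as statdist
    P = np.array([[0.90, 0.10, 0.00],
                  [0.10, 0.80, 0.10],
                  [0.00, 0.10, 0.90]])
    pi = statdist(P)
    return pcca(P, m=2, stationary_distribution=pi)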


def score(self, test_model=None, score_method='VAMP2'):
    """Compute the VAMP score for this model, or the cross-validation score between
    self and a second model.

    Parameters
    ----------
    test_model : VAMPModel, optional, default=None
        If `test_model` is not None, this method computes the cross-validation score
        between self and `test_model`. It is assumed that self was estimated from
        the "training" data and `test_model` was estimated from the "test" data. The
        score is computed for one realization of self and `test_model`. Estimation
        of the average cross-validation score and partitioning of data into test and
        training parts is not performed by this method.
        If `test_model` is None, this method computes the VAMP score for the model
        contained in self.
    score_method : str, optional, default='VAMP2'
        Available scores are based on the variational approach for Markov
        processes [1]_:

        * 'VAMP1'  Sum of singular values of the half-weighted Koopman matrix [1]_.
          If the model is reversible, this is equal to the sum of Koopman matrix
          eigenvalues, also called Rayleigh quotient [1]_.
        * 'VAMP2'  Sum of squared singular values of the half-weighted Koopman
          matrix [1]_. If the model is reversible, this is equal to the kinetic
          variance [2]_.
        * 'VAMPE'  Approximation error of the estimated Koopman operator with
          respect to the true Koopman operator, up to an additive constant [1]_.

    Returns
    -------
    score : float
        If `test_model` is not None, returns the cross-validation VAMP score between
        self and `test_model`. Otherwise, returns the selected VAMP score of self.

    References
    ----------
    .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov
           processes from time series data. arXiv:1707.04659v1
    .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from
           molecular dynamics simulation. J. Chem. Theory Comput.
           doi:10.1021/acs.jctc.5b00553
    """
    # TODO: implement for TICA too
    if test_model is None:
        test_model = self
    Uk = self.singular_vectors_left[:, 0:self.dimension()]
    Vk = self.singular_vectors_right[:, 0:self.dimension()]
    res = None
    if score_method == 'VAMP1' or score_method == 'VAMP2':
        A = spd_inv_sqrt(Uk.T.dot(test_model.cov_00).dot(Uk))
        B = Uk.T.dot(test_model.cov_0t).dot(Vk)
        C = spd_inv_sqrt(Vk.T.dot(test_model.cov_tt).dot(Vk))
        ABC = mdot(A, B, C)
        if score_method == 'VAMP1':
            res = np.linalg.norm(ABC, ord='nuc')
        elif score_method == 'VAMP2':
            res = np.linalg.norm(ABC, ord='fro') ** 2
    elif score_method == 'VAMPE':
        Sk = np.diag(self.singular_values[0:self.dimension()])
        res = np.trace(2.0 * mdot(Vk, Sk, Uk.T, test_model.cov_0t)
                       - mdot(Vk, Sk, Uk.T, test_model.cov_00, Uk, Sk, Vk.T,
                              test_model.cov_tt))
    else:
        raise ValueError('score_method should be one of VAMP1, VAMP2 or VAMPE')
    # add the contribution (+1) of the constant singular functions to the result
    assert res is not None  # fix: a bare `assert res` would fail on a legitimate score of 0
    return res + 1
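

# Usage sketch (hypothetical API, shown as comments because the estimator class
# and its methods are not part of this snippet): estimate one model per half of
# the data and cross-validate, assuming a VAMP estimator that produces this
# model class.
#
#     model_train = VAMP(lag=10).fit(data_train).fetch_model()
#     model_test = VAMP(lag=10).fit(data_test).fetch_model()
#     cv_score = model_train.score(test_model=model_test, score_method='VAMP2')
#
# Scoring self against itself (test_model=None) reproduces the training score.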