def test_score_vs_MSM(self):
    """Compare MSM scores against VAMP scores for every VAMP score type.

    The continuous trajectories (``self.trajs``) and the discrete
    trajectories (``self.dtrajs``) are each split into two parts. A
    non-reversible MSM and a VAMP model are fitted on one part and scored
    on the other; both scores must agree to 3 decimal places (2 for
    'VAMPE').
    """
    from pyemma.util.contexts import numpy_random_seed

    # The seed is reset to the same value before each split, presumably so
    # that both calls to cvsplit_dtrajs select the same fragments.
    # NOTE(review): other callers unpack cvsplit_dtrajs as (train, test);
    # here the first element is used as the held-out part. The order is the
    # same for both splits, so the comparison stays consistent.
    with numpy_random_seed(32):
        held_out_trajs, fit_trajs = cvsplit_dtrajs(self.trajs)
    with numpy_random_seed(32):
        held_out_dtrajs, fit_dtrajs = cvsplit_dtrajs(self.dtrajs)

    for method in ('VAMP1', 'VAMP2', 'VAMPE'):
        msm = estimate_markov_model(dtrajs=fit_dtrajs, lag=self.lag, reversible=False)
        msm_score = msm.score(held_out_dtrajs, score_method=method, score_k=None)

        vamp = pyemma_api_vamp(data=fit_trajs, lag=self.lag, dim=1.0)
        vamp_score = vamp.score(test_data=held_out_trajs, score_method=method)

        # 'VAMPE' is compared with a looser tolerance than the other scores.
        if method == 'VAMPE':
            decimals = 2
        else:
            decimals = 3
        self.assertAlmostEqual(msm_score, vamp_score, places=decimals, msg=method)
def test_blocksplit_dtrajs_cvsplit(self):
    """Check that block-splitting followed by a CV split leaves no empty set.

    For lag times 1 to 4, two short discrete trajectories are cut into
    blocks (non-sliding, shift 0) and divided into a training and a test
    set. Both sets must contain at least one trajectory.
    """
    trajectories = [
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
        np.array([0, 1, 9, 10]),
    ]
    for lag in range(1, 5):
        blocks = blocksplit_dtrajs(trajectories, lag=lag, sliding=False, shift=0)
        train_part, test_part = cvsplit_dtrajs(blocks)
        # Normalize both parts to lists before checking their lengths.
        train_list = ensure_dtraj_list(train_part)
        test_list = ensure_dtraj_list(test_part)
        assert len(train_list) > 0
        assert len(test_list) > 0
def score_cv(self, dtrajs, n=10, score_method=None, score_k=None):
    """ Cross-validated variational score of the MSM.

    Scores the MSM using the variational approach for Markov processes
    [1]_ [2]_ and crossvalidation [3]_ .

    In each of the `n` repetitions the data is split into training and test
    data, an MSM is fitted to the training data using the parameters of this
    estimator (a clone of it is used for fitting), and the fitted model is
    scored on the test data.

    Currently only one way of splitting is implemented, where for each n,
    the data is randomly divided into two approximately equally large sets
    of discrete trajectory fragments with lengths of at least the lagtime.

    Currently only implemented using dense matrices - will be slow for
    large state spaces.

    Parameters
    ----------
    dtrajs : list of arrays
        Discrete trajectories. They are split into training and test data
        in every repetition.
    n : int, optional, default=10
        Number of repetitions of the cross-validation. Use large n to get
        solid means of the score.
    score_method : str or None, optional, default=None
        Overwrite scoring method to be used if desired. If `None`, the
        estimators scoring method will be used.
        Available scores are based on the variational approach for Markov
        processes [1]_ [2]_ :

        *  'VAMP1'  Sum of singular values of the symmetrized transition
                    matrix [2]_ . If the MSM is reversible, this is equal
                    to the sum of transition matrix eigenvalues, also
                    called Rayleigh quotient [1]_ [3]_ .
        *  'VAMP2'  Sum of squared singular values of the symmetrized
                    transition matrix [2]_ . If the MSM is reversible, this
                    is equal to the kinetic variance [4]_ .

    score_k : int or None
        The maximum number of eigenvalues or singular values used in the
        score. If set to None, all available eigenvalues will be used.

    Returns
    -------
    scores : array
        One score per repetition, in the order they were computed.

    Raises
    ------
    ValueError
        If the count mode of this estimator is neither "sliding" nor
        "sample".

    References
    ----------
    .. [1] Noe, F. and F. Nueske: A variational approach to modeling slow
        processes in stochastic dynamical systems. SIAM Multiscale Model.
        Simul. 11, 635-655 (2013).
    .. [2] Wu, H and F. Noe: Variational approach for learning Markov
        processes from time series data (in preparation).
    .. [3] McGibbon, R and V. S. Pande: Variational cross-validation of
        slow dynamical modes in molecular kinetics, J. Chem. Phys. 142,
        124105 (2015).
    .. [4] Noe, F. and C. Clementi: Kinetic distance and kinetic maps from
        molecular dynamics simulation. J. Chem. Theory Comput. 11,
        5002-5011 (2015).

    """
    dtrajs = ensure_dtraj_list(dtrajs)  # ensure format
    from pyemma.msm.estimators._dtraj_stats import cvsplit_dtrajs

    supported_modes = ('sliding', 'sample')
    if self.count_mode not in supported_modes:
        raise ValueError('score_cv currently only supports count modes "sliding" and "sample"')
    use_sliding = self.count_mode == 'sliding'

    cv_scores = []
    from pyemma._ext.sklearn.base import clone
    # Fit on a clone so that this estimator itself is not refitted.
    worker = clone(self)
    for _ in range(n):
        blocks = self._blocksplit_dtrajs(dtrajs, use_sliding)
        train_part, test_part = cvsplit_dtrajs(blocks)
        worker.fit(train_part)
        cv_scores.append(
            worker.score(test_part, score_method=score_method, score_k=score_k)
        )
    return _np.array(cv_scores)