def setUpClass(cls):
    """Estimate the core-set MSMs shared by the tests of this class.

    Loads the ``dtraj_T100K_dt10`` trajectory of the 2-well discrete dataset,
    derives a reference distribution from the visit counts of the two core
    states and stores three estimates on the class: reversible
    (``msmrev``), reversible with fixed stationary distribution
    (``msmrevpi``) and non-reversible (``msm``).
    """
    import pyerna.datasets

    cls.core_set = [34, 65]
    cls.dtraj = pyerna.datasets.load_2well_discrete().dtraj_T100K_dt10

    # Visit counts of the core states, normalised to sum to one.
    nu = 1. * np.bincount(cls.dtraj)[cls.core_set]
    cls.statdist = nu / nu.sum()

    cls.tau = 10
    maxerr = 1e-12

    # Warnings are silenced only while the models are estimated. The filter
    # is installed inside catch_warnings() so it is rolled back on exit; an
    # unscoped warnings.filterwarnings("ignore") would stay active for the
    # rest of the process and hide warnings of unrelated tests.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        cls.msmrev = estimate_markov_model(cls.dtraj, cls.tau, maxerr=maxerr,
                                           core_set=cls.core_set)
        cls.msmrevpi = estimate_markov_model(cls.dtraj, cls.tau, maxerr=maxerr,
                                             statdist=cls.statdist,
                                             core_set=cls.core_set)
        cls.msm = estimate_markov_model(cls.dtraj, cls.tau, reversible=False,
                                        maxerr=maxerr, core_set=cls.core_set)
def test_oom(self):
    """Compare OOM-weighted MSMs estimated with two connectivity thresholds."""
    from pyerna import msm

    def _fit(threshold):
        # Same estimation settings for both models; only the threshold varies.
        return msm.estimate_markov_model(self.dtraj, lag=1,
                                         mincount_connectivity=threshold,
                                         weights='oom')

    default_threshold_model = _fit('1/n')
    # Restricting the connectivity to at least 6 counts means state 2 is lost.
    strict_threshold_model = _fit(6)
    self._test_connectivity(default_threshold_model, strict_threshold_model)
def test_valid_trajectory(self):
    """Check estimation with a fixed distribution that has a zero entry.

    With target distribution ``[0.1, 0.0, 0.9]`` a trajectory visiting all
    three states must yield the active set ``[0, 2]``, while a trajectory
    that only ever visits state 1 must raise ``ValueError``.
    """
    target_pi = np.array([0.1, 0.0, 0.9])
    only_state_one = np.array([1, 1, 1, 1, 1, 1, 1])
    visits_all_states = np.array([0, 2, 0, 2, 2, 0, 1, 1])

    model = estimate_markov_model(visits_all_states, 1, statdist=target_pi)
    expected_active = np.array([0, 2])
    self.assertTrue(np.all(model.active_set == expected_active))

    with self.assertRaises(ValueError):
        estimate_markov_model(only_state_one, 1, statdist=target_pi)
def test_valid_stationary_vector(self):
    """Check validation of the length of a user-supplied stationary vector.

    A three-entry vector is accepted for the three-state trajectory and
    yields the active set ``[0, 1]``; a two-entry vector must raise
    ``ValueError``.
    """
    observed = np.array([0, 0, 1, 0, 1, 2])
    three_entry_pi = np.array([0.1, 0.9, 0.0])
    two_entry_pi = np.array([0.1, 0.9])
    expected_active = np.array([0, 1])

    model = estimate_markov_model(observed, 1, statdist=three_entry_pi)
    self.assertTrue(np.all(model.active_set == expected_active))

    with self.assertRaises(ValueError):
        estimate_markov_model(observed, 1, statdist=two_entry_pi)
def setUpClass(cls):
    """Estimate reference MSMs and 2-state HMMs at lag 1 and lag 10.

    The observation trajectory is the ``dtraj_T100K_dt10`` trajectory of the
    2-well discrete dataset, shifted so that its smallest state label is 0.
    Results are stored on the class as ``msm_lag1``, ``hmsm_lag1``,
    ``msm_lag10`` and ``hmsm_lag10``.
    """
    # load observations
    import pyerna.datasets

    raw_obs = pyerna.datasets.load_2well_discrete().dtraj_T100K_dt10
    # Remove empty leading states by shifting the labels. A new array is
    # built instead of using ``-=`` so the array handed out by the dataset
    # loader is never modified in place.
    obs = raw_obs - np.min(raw_obs)

    # hidden states
    nstates = 2

    # run with lag 1 and 10
    cls.msm_lag1 = msm.estimate_markov_model([obs], 1, reversible=True,
                                             connectivity='largest')
    cls.hmsm_lag1 = msm.estimate_hidden_markov_model([obs], nstates, 1,
                                                     reversible=True,
                                                     observe_nonempty=True)
    cls.msm_lag10 = msm.estimate_markov_model([obs], 10, reversible=True,
                                              connectivity='largest')
    cls.hmsm_lag10 = msm.estimate_hidden_markov_model([obs], nstates, 10,
                                                      reversible=True,
                                                      observe_nonempty=True)
def test_rdl_recompute(self):
    """Regression test for issue 1301.

    After re-estimating with a different lag time, the left eigenvectors
    handed out by the model must not be the object returned before, i.e. the
    RDL decomposition has to be recomputed for the new transition matrix.
    """
    model = estimate_markov_model(self.dtraj, self.tau)
    left_before = model.eigenvectors_left(2)

    model.estimate(self.dtraj, lag=self.tau + 1)
    left_after = model.eigenvectors_left(2)

    assert left_after is not left_before
def setUpClass(cls):
    """Build the shared fixture: generated trajectories, a VAMP and an MSM.

    Twenty discrete trajectories of 10000 steps are generated from a 3-state
    transition matrix, one-hot encoded, and used to estimate a VAMP model
    (on the encoded data) and a non-reversible MSM (on the labels). The
    normalised state counts of the time-lagged start and end frames are kept
    as ``p0`` and ``p1``.
    """
    n_steps = 10000
    n_traj = 20
    lag = 1
    one_step = np.array([[0.7, 0.2, 0.1],
                         [0.1, 0.8, 0.1],
                         [0.1, 0.1, 0.8]])
    T = np.linalg.matrix_power(one_step, lag)
    n_states = T.shape[0]

    dtrajs = [generate(T, n_steps) for _ in range(n_traj)]

    def _one_hot(labels):
        # One row per frame with a single 1.0 in the column of the visited state.
        encoded = np.zeros((n_steps, n_states))
        encoded[np.arange(len(labels)), labels] = 1.0
        return encoded

    trajs = [_one_hot(labels) for labels in dtrajs]

    start_counts = np.zeros(3)
    end_counts = np.zeros(3)
    for encoded in trajs:
        start_counts += encoded[:-lag, :].sum(axis=0)
        end_counts += encoded[lag:, :].sum(axis=0)

    vamp = pyerna_api_vamp(trajs, lag=lag, scaling=None, dim=1.0)
    model = estimate_markov_model(dtrajs, lag=lag, reversible=False)

    cls.trajs = trajs
    cls.dtrajs = dtrajs
    cls.lag = lag
    cls.msm = model
    cls.vamp = vamp
    cls.p0 = start_counts / start_counts.sum()
    cls.p1 = end_counts / end_counts.sum()
    # Comparison tolerance: 1000 machine epsilons of the VAMP output type.
    cls.atol = np.finfo(vamp.output_type()).eps * 1000.0
def test_CK_covariances_against_MSM(self):
    """Compare the diagonals of VAMP CK-test covariances with MSM results.

    For each lag multiple, the CK-test prediction is checked against powers
    of the fixture MSM and the CK-test estimate against an MSM re-estimated
    at that lag; estimate and prediction must also agree within 0.006.
    """
    obs = np.eye(3)  # observe every state
    sta = np.eye(3)  # restrict p0 to every state
    cktest = self.vamp.cktest(observables=obs, statistics=sta, mlags=4,
                              show_progress=True)
    predictions = cktest.predictions[1:]
    estimates = cktest.estimates[1:]

    for multiple, (est_k, pred_k) in enumerate(zip(estimates, predictions),
                                               start=1):
        model_k = estimate_markov_model(dtrajs=self.dtrajs,
                                        lag=self.lag * multiple,
                                        reversible=False)
        msm_esti = (self.p0 * sta).T.dot(model_k.P).dot(obs).T
        propagated = np.linalg.matrix_power(self.msm.P, multiple)
        msm_pred = (self.p0 * sta).T.dot(propagated).dot(obs).T

        np.testing.assert_allclose(np.diag(pred_k), np.diag(msm_pred),
                                   atol=self.atol)
        np.testing.assert_allclose(np.diag(est_k), np.diag(msm_esti),
                                   atol=self.atol)
        np.testing.assert_allclose(np.diag(est_k), np.diag(pred_k),
                                   atol=0.006)
def test_score_vs_MSM(self):
    """Check that MSM and VAMP cross-validation scores agree.

    The one-hot and the discrete trajectories are split under the same seed,
    then for each of VAMP1, VAMP2 and VAMPE the MSM score and the VAMP score
    must match to 3 decimal places (2 for VAMPE).
    """
    from pyerna.util.contexts import numpy_random_seed

    # Identical seeds for both splits.
    with numpy_random_seed(32):
        onehot_test, onehot_train = cvsplit_dtrajs(self.trajs)
    with numpy_random_seed(32):
        labels_test, labels_train = cvsplit_dtrajs(self.dtrajs)

    for method in ('VAMP1', 'VAMP2', 'VAMPE'):
        msm_train = estimate_markov_model(dtrajs=labels_train, lag=self.lag,
                                          reversible=False)
        score_msm = msm_train.score(labels_test, score_method=method,
                                    score_k=None)

        vamp_train = pyerna_api_vamp(data=onehot_train, lag=self.lag, dim=1.0)
        score_vamp = vamp_train.score(test_data=onehot_test,
                                      score_method=method)

        if method == 'VAMPE':
            places = 2
        else:
            places = 3
        self.assertAlmostEqual(score_msm, score_vamp, places=places,
                               msg=method)
def test_valid_trajectory(self):
    """Check core-set estimation with a fixed stationary distribution.

    With core set ``[0, 2]`` the valid trajectory must give an active set
    equal to the core set and reproduce the supplied distribution; a
    trajectory that only visits state 1 must raise ``ValueError``.
    """
    core_set = [0, 2]
    target_pi = np.array([0.1, 0.9])
    only_state_one = np.array([1, 1, 1, 1, 1, 1, 1])
    visits_all_states = np.array([0, 2, 0, 2, 2, 0, 1, 1])

    model = estimate_markov_model(visits_all_states, 1, statdist=target_pi,
                                  core_set=core_set)
    active_matches = np.all(model.active_set == np.array(core_set))
    self.assertTrue(active_matches)
    np.testing.assert_array_equal(model.pi, target_pi)

    with self.assertRaises(ValueError):
        estimate_markov_model(only_state_one, 1, statdist=target_pi,
                              core_set=core_set)
def test_time_units(self):
    """Check MFPT and flux consistency when a trajectory time step is set.

    An MSM is estimated from a random 4-state trajectory with a physical
    time step; the MFPT from the MSM, from TPT and from the analysis
    function must agree, and the flux leaving set A must equal the total
    flux for every flux matrix, including after coarse-graining.
    """
    dtraj = np.random.randint(0, 4, 1000)
    tau = 12
    dt = 0.456
    model = estimate_markov_model(dtraj, lag=tau, dt_traj='%f ns' % dt)

    # check MFPT consistency
    mfpt_ref = model.mfpt([0], [1])
    flux = tpt(model, [0], [1])
    assert_allclose(flux.mfpt, mfpt_ref)
    assert_allclose(msmana.mfpt(model.P, [1], [0], tau=tau) * dt, mfpt_ref)
    committor_mfpt = (np.dot(model.stationary_distribution,
                             flux.backward_committor) / flux.total_flux)
    assert_allclose(committor_mfpt, mfpt_ref)

    # check flux consistency
    total_flux_ref = flux.total_flux
    source = flux.A
    target = flux.B
    intermediate = flux.I

    def _flux_out_of_source(matrix):
        # Flux from the source set into the target set plus the intermediates.
        to_target = matrix[source, :][:, target].sum()
        to_intermediate = matrix[source, :][:, intermediate].sum()
        return to_target + to_intermediate

    assert_allclose(_flux_out_of_source(flux.gross_flux), total_flux_ref)
    assert_allclose(_flux_out_of_source(flux.net_flux), total_flux_ref)
    assert_allclose(_flux_out_of_source(flux.flux), total_flux_ref)
    major = flux.major_flux(1.0)
    assert_allclose(_flux_out_of_source(major), total_flux_ref)

    # check that the coarse-grained version is consistent too
    _, coarse_flux = flux.coarse_grain([source, intermediate, target])
    assert_allclose(coarse_flux.total_flux, total_flux_ref)
    assert_allclose(coarse_flux.mfpt, mfpt_ref)
def test_MSM_sparse(self):
    """Check a sparse MSM estimate against the reference quantities.

    Trajectory, lag time, connected set, count matrices, transition matrix,
    stationary distribution and timescales are compared with the values
    stored on the test instance.
    """
    model = estimate_markov_model(self.dtraj, self.tau, sparse=True)

    def _dense_close(expected, actual):
        # Both operands are densified before the element-wise comparison.
        return np.allclose(expected.toarray(), actual.toarray())

    assert_allclose(self.dtraj, model.discrete_trajectories_full[0])
    self.assertEqual(self.tau, model.lagtime)
    assert_allclose(self.lcc_MSM, model.largest_connected_set)
    self.assertTrue(_dense_close(self.Ccc_MSM, model.count_matrix_active))
    self.assertTrue(_dense_close(self.C_MSM, model.count_matrix_full))
    self.assertTrue(_dense_close(self.P_MSM, model.transition_matrix))
    assert_allclose(self.mu_MSM, model.stationary_distribution)
    assert_allclose(self.ts[1:], model.timescales(self.k - 1))
def test_CK_expectation_against_MSM(self):
    """Compare VAMP CK-test expectations with MSM results.

    For each lag multiple, the CK-test prediction is checked against powers
    of the fixture MSM and the CK-test estimate against an MSM re-estimated
    at that lag; estimate and prediction must also agree within 0.006.
    """
    obs = np.eye(3)  # observe every state
    cktest = self.vamp.cktest(observables=obs, statistics=None, mlags=4)
    predictions = cktest.predictions[1:]
    estimates = cktest.estimates[1:]

    for multiple, (est_k, pred_k) in enumerate(zip(estimates, predictions),
                                               start=1):
        model_k = estimate_markov_model(dtrajs=self.dtrajs,
                                        lag=self.lag * multiple,
                                        reversible=False)
        msm_esti = self.p0.T.dot(model_k.P).dot(obs)
        propagated = np.linalg.matrix_power(self.msm.P, multiple)
        msm_pred = self.p0.T.dot(propagated).dot(obs)

        np.testing.assert_allclose(pred_k, msm_pred, atol=self.atol)
        np.testing.assert_allclose(est_k, msm_esti, atol=self.atol)
        np.testing.assert_allclose(est_k, pred_k, atol=0.006)
def test_ck_msm(self):
    """Run a 2-state CK test on an MSM and compare with stored references.

    The MSM is estimated at lag 40; the CK test is evaluated at lag
    multiples 0, 1 and 10. Estimates and predictions are compared with
    reference arrays, and the first two confidence entries of each must be
    ``None``.
    """
    trajectory = self.double_well_data.dtraj_T100K_dt10_n6good
    ml_model = msm.estimate_markov_model([trajectory], 40)
    self.ck = ml_model.cktest(2, mlags=[0, 1, 10])

    estref = np.array([[[1., 0.],
                        [0., 1.]],
                       [[0.89806859, 0.10193141],
                        [0.10003466, 0.89996534]],
                       [[0.64851782, 0.35148218],
                        [0.34411751, 0.65588249]]])
    predref = np.array([[[1., 0.],
                         [0., 1.]],
                        [[0.89806859, 0.10193141],
                         [0.10003466, 0.89996534]],
                        [[0.62613723, 0.37386277],
                         [0.3669059, 0.6330941]]])

    def _roughly_equal(actual, reference):
        # NOTE(review): atol=10.0 is far larger than any reference value
        # above (all within [0, 1]), so this looks like it can hardly fail
        # on values - confirm whether such a loose bound is intended.
        return np.allclose(actual, reference, rtol=0.1, atol=10.0)

    # rough agreement with MLE
    assert _roughly_equal(self.ck.estimates, estref)
    assert self.ck.estimates_conf[0] is None
    assert self.ck.estimates_conf[1] is None
    assert _roughly_equal(self.ck.predictions, predref)
    assert self.ck.predictions_conf[0] is None
    assert self.ck.predictions_conf[1] is None
def setUpClass(cls):
    """Estimate dense and sparse MSMs shared by the tests of this class.

    For the ``dtraj_T100K_dt10`` trajectory of the 2-well discrete dataset,
    three variants are estimated at lag 10 - reversible, reversible with
    fixed stationary distribution and non-reversible - once with the default
    storage and once with ``sparse=True``.
    """
    import pyerna.datasets

    cls.dtraj = pyerna.datasets.load_2well_discrete().dtraj_T100K_dt10
    # Normalised visit counts serve as the fixed stationary distribution.
    counts = 1. * np.bincount(cls.dtraj)
    cls.statdist = counts / counts.sum()
    cls.tau = 10
    maxerr = 1e-12

    def _fit(**options):
        # Every model shares trajectory, lag time and maxerr.
        return estimate_markov_model(cls.dtraj, cls.tau, maxerr=maxerr,
                                     **options)

    cls.msmrev = _fit()
    cls.msmrevpi = _fit(statdist=cls.statdist)
    cls.msm = _fit(reversible=False)

    # Sparse
    cls.msmrev_sparse = _fit(sparse=True)
    cls.msmrevpi_sparse = _fit(statdist=cls.statdist, sparse=True)
    cls.msm_sparse = _fit(reversible=False, sparse=True)
def _estimate(self, dtrajs):
    """Estimate a discrete hidden Markov model and parametrize this object.

    The trajectories are validated against the lag time, lagged and strided,
    an initial HMM is built according to ``self.msm_init``, a maximum
    likelihood estimator from ``bhmm`` is fitted, and the results are stored
    on ``self``.

    Parameters
    ----------
    dtrajs
        Discrete trajectories; normalised with
        ``_types.ensure_dtraj_list`` before use.

    Returns
    -------
    The result of ``self.submodel(..., inplace=True)``, restricted according
    to ``self.connectivity`` and the chosen observation subset.

    Raises
    ------
    ValueError
        If ``self.lag`` is not smaller than the longest trajectory length,
        or if ``self.msm_init`` is not a recognised option.
    """
    import bhmm
    # ensure right format
    dtrajs = _types.ensure_dtraj_list(dtrajs)

    # CHECK LAG
    trajlengths = [_np.size(dtraj) for dtraj in dtrajs]
    if self.lag >= _np.max(trajlengths):
        raise ValueError('Illegal lag time ' + str(self.lag) + ' exceeds longest trajectory length')
    if self.lag > _np.mean(trajlengths):
        self.logger.warning(
            'Lag time ' + str(self.lag) + ' is on the order of mean trajectory length '
            + str(_np.mean(trajlengths)) + '. It is recommended to fit four lag times in each '
            + 'trajectory. HMM might be inaccurate.')

    # EVALUATE STRIDE
    # NOTE: this branch overwrites self.stride with a number, so the
    # 'effective' setting is no longer stored on the object afterwards.
    if self.stride == 'effective':
        # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
        # how many uncorrelated counts we can make
        self.stride = self.lag
        # get a quick estimate from the spectral radius of the non-reversible
        from pyerna.msm import estimate_markov_model
        msm_nr = estimate_markov_model(dtrajs, lag=self.lag, reversible=False, sparse=False,
                                       connectivity='largest', dt_traj=self.timestep_traj)
        # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
        # estimate of the decorrelation time
        if msm_nr.nstates > self.nstates:
            # because we use non-reversible msm, we want to silence the ImaginaryEigenvalueWarning
            import warnings
            from msmtools.util.exceptions import ImaginaryEigenValueWarning
            # The filter only applies inside this context manager.
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    'ignore', category=ImaginaryEigenValueWarning,
                    module='msmtools.analysis.dense.decomposition')
                # Lower bound of 1 on the correlation time estimate.
                corrtime = max(1, msm_nr.timescales()[self.nstates - 1])
            # use the smaller of these two pessimistic estimates
            self.stride = int(min(self.lag, 2 * corrtime))

    # LAG AND STRIDE DATA
    dtrajs_lagged_strided = bhmm.lag_observations(dtrajs, self.lag, stride=self.stride)

    # OBSERVATION SET
    if self.observe_nonempty:
        observe_subset = 'nonempty'
    else:
        observe_subset = None

    # INIT HMM
    # The data passed below is already lagged, hence lag=1 in the init calls.
    from bhmm import init_discrete_hmm
    from pyerna.msm.estimators import MaximumLikelihoodMSM
    from pyerna.msm.estimators import OOMReweightedMSM
    if self.msm_init == 'largest-strong':
        hmm_init = init_discrete_hmm(dtrajs_lagged_strided, self.nstates, lag=1,
                                     reversible=self.reversible, stationary=True, regularize=True,
                                     method='lcs-spectral', separate=self.separate)
    elif self.msm_init == 'all':
        hmm_init = init_discrete_hmm(dtrajs_lagged_strided, self.nstates, lag=1,
                                     reversible=self.reversible, stationary=True, regularize=True,
                                     method='spectral', separate=self.separate)
    elif isinstance(
            self.msm_init, (MaximumLikelihoodMSM, OOMReweightedMSM)):
        # initial MSM given.
        from bhmm.init.discrete import init_discrete_hmm_spectral
        p0, P0, pobs0 = init_discrete_hmm_spectral(
            self.msm_init.count_matrix_full, self.nstates,
            reversible=self.reversible, stationary=True,
            active_set=self.msm_init.active_set,
            P=self.msm_init.transition_matrix, separate=self.separate)
        hmm_init = bhmm.discrete_hmm(p0, P0, pobs0)
        observe_subset = self.msm_init.active_set  # override observe_subset.
    else:
        raise ValueError('Unknown MSM initialization option: ' + str(self.msm_init))

    # ---------------------------------------------------------------------------------------
    # Estimate discrete HMM
    # ---------------------------------------------------------------------------------------

    # run EM
    from bhmm.estimators.maximum_likelihood import MaximumLikelihoodEstimator as _MaximumLikelihoodEstimator
    hmm_est = _MaximumLikelihoodEstimator(dtrajs_lagged_strided, self.nstates, initial_model=hmm_init,
                                          output='discrete', reversible=self.reversible,
                                          stationary=self.stationary, accuracy=self.accuracy,
                                          maxit=self.maxit)
    # run
    hmm_est.fit()
    # package in discrete HMM
    self.hmm = bhmm.DiscreteHMM(hmm_est.hmm)

    # get model parameters
    self.initial_distribution = self.hmm.initial_distribution
    transition_matrix = self.hmm.transition_matrix
    observation_probabilities = self.hmm.output_probabilities

    # get estimation parameters
    self.likelihoods = hmm_est.likelihoods  # Likelihood history
    self.likelihood = self.likelihoods[-1]  # last entry of the history
    self.hidden_state_probabilities = hmm_est.hidden_state_probabilities  # gamma variables
    self.hidden_state_trajectories = hmm_est.hmm.hidden_state_trajectories  # Viterbi path
    self.count_matrix = hmm_est.count_matrix  # hidden count matrix
    self.initial_count = hmm_est.initial_count  # hidden init count
    self._active_set = _np.arange(self.nstates)

    # TODO: it can happen that we lose states due to striding. Should we lift the output probabilities afterwards?

    # parametrize self
    import msmtools.estimation as msmest
    self._dtrajs_full = dtrajs
    self._dtrajs_lagged = dtrajs_lagged_strided
    self._nstates_obs_full = msmest.number_of_states(dtrajs)
    self._nstates_obs = msmest.number_of_states(dtrajs_lagged_strided)
    self._observable_set = _np.arange(self._nstates_obs)
    self._dtrajs_obs = dtrajs
    # NOTE(review): dt_model presumably is the trajectory time step scaled
    # by the lag - confirm against the time-unit implementation.
    self.set_model_params(P=transition_matrix, pobs=observation_probabilities,
                          reversible=self.reversible,
                          dt_model=self.timestep_traj.get_scaled(self.lag))

    # TODO: perhaps remove connectivity and just rely on .submodel()?
    # deal with connectivity
    states_subset = None
    if self.connectivity == 'largest':
        states_subset = 'largest-strong'
    elif self.connectivity == 'populous':
        states_subset = 'populous-strong'

    # return submodel (will return self if all None)
    return self.submodel(states=states_subset, obs=observe_subset,
                         mincount_connectivity=self.mincount_connectivity, inplace=True)
def test_cktest_simple(self):
    """Smoke test: ``cktest()`` runs on a 2-state coarse-grained model."""
    random_labels = np.random.randint(0, 10, 100)
    fine_model = msm.estimate_markov_model(random_labels, 1)
    coarse_model = fine_model.coarse_grain(2)
    # Only checks that the call completes; the result is not inspected.
    coarse_model.cktest()
def test_pcca_recompute(self):
    """Check that PCCA is recomputed after the model is re-estimated.

    The object returned by ``pcca(2)`` after re-estimating with a different
    lag time must not be the one returned before.
    """
    model = estimate_markov_model(self.dtraj, self.tau)
    pcca_before = model.pcca(2)

    model.estimate(self.dtraj, lag=self.tau + 1)
    pcca_after = model.pcca(2)

    assert pcca_after is not pcca_before
def test_msm(self):
    """Compare MSMs estimated with two connectivity thresholds."""

    def _fit(threshold):
        # Same trajectory and lag for both models; only the threshold varies.
        return estimate_markov_model(self.dtraj, lag=1,
                                     mincount_connectivity=threshold)

    default_threshold_model = _fit('1/n')
    restricted_model = _fit(self.mincount_connectivity)
    self._test_connectivity(default_threshold_model, restricted_model)
def setUp(self):
    """Prepare an MSM and reference set probabilities for the tests.

    A trajectory is generated from a 7-state birth-death chain, an MSM is
    estimated at lag 1, and for lag multiples ``k = 0 .. K-1`` the
    probability of staying in set ``A`` / ``B`` is computed both from powers
    of the MSM (``p_MSM``) and from transition matrices estimated directly
    at lag ``k * tau`` (``p_MD``, with error estimate ``eps_MD``).
    """
    # Store the state of the rng, then reseed it to enforce 'deterministic'
    # behavior.
    # NOTE(review): the stored state is presumably restored in tearDown -
    # confirm.
    self.state = np.random.mtrand.get_state()
    np.random.mtrand.seed(42)

    # Meta-stable birth-death chain
    b = 2
    small = 10 ** (-b)
    q = np.zeros(7)
    p = np.zeros(7)
    q[1:] = 0.5
    p[0:-1] = 0.5
    q[2] = 1.0 - small
    q[4] = small
    p[2] = small
    p[4] = 1.0 - small

    chain = BirthDeathChain(q, p)
    P = chain.transition_matrix()
    dtraj = generate_traj(P, 10000, start=0)
    tau = 1

    # Estimate MSM
    MSM = estimate_markov_model(dtraj, tau)
    C_MSM = MSM.count_matrix_full
    lcc_MSM = MSM.largest_connected_set
    Ccc_MSM = MSM.count_matrix_active
    P_MSM = MSM.transition_matrix
    mu_MSM = MSM.stationary_distribution

    # Meta-stable sets
    A = [0, 1, 2]
    B = [4, 5, 6]
    metastable_sets = (A, B)

    # Row i holds the stationary distribution restricted to set i, normalised.
    w_MSM = np.zeros((2, mu_MSM.shape[0]))
    for row, members in enumerate(metastable_sets):
        w_MSM[row, members] = mu_MSM[members] / mu_MSM[members].sum()

    K = 10
    P_MSM_dense = P_MSM

    # Propagate the restricted distributions with the MSM, one step per k.
    p_MSM = np.zeros((K, 2))
    w_MSM_k = 1.0 * w_MSM
    for k in range(1, K):
        w_MSM_k = np.dot(w_MSM_k, P_MSM_dense)
        for row, members in enumerate(metastable_sets):
            p_MSM[k, row] = w_MSM_k[row, members].sum()

    # Assume that sets are equal, A(tau) = A(k tau) for all k
    w_MD = 1.0 * w_MSM
    p_MD = np.zeros((K, 2))
    eps_MD = np.zeros((K, 2))
    p_MSM[0, :] = 1.0
    p_MD[0, :] = 1.0
    eps_MD[0, :] = 0.0

    for k in range(1, K):
        # Build MSM at lagtime k*tau
        lag_k = k * tau
        C_MD = count_matrix(dtraj, lag_k, sliding=True) / lag_k
        lcc_MD = largest_connected_set(C_MD)
        Ccc_MD = largest_connected_submatrix(C_MD, lcc=lcc_MD)
        c_MD = Ccc_MD.sum(axis=1)
        P_MD = transition_matrix(Ccc_MD).toarray()
        w_MD_k = np.dot(w_MD, P_MD)

        # Set A is column 0, set B is column 1.
        for col, members in enumerate(metastable_sets):
            prob_MD = w_MD_k[col, members].sum()
            c = c_MD[members].sum()
            p_MD[k, col] = prob_MD
            eps_MD[k, col] = np.sqrt(k * (prob_MD - prob_MD ** 2) / c)

    # Input
    self.MSM = MSM
    self.K = K
    self.A = A
    self.B = B

    # Expected results
    self.p_MSM = p_MSM
    self.p_MD = p_MD
    self.eps_MD = eps_MD