def ChapmanKolmogorovTest(assignments, klist=None, lagtime=50, states=None):
    """Chapman-Kolmogorov test for an MSM built at `lagtime`.

    Compares state probabilities propagated from the base-lag MSM against
    MSMs estimated directly at k * lagtime for each k in `klist`.

    Parameters
    ----------
    assignments : state-assignment trajectories accepted by MarkovStateModel.fit
    klist : iterable of int, optional
        Lag-time multiples to test. Defaults to [1, 2, 3, 4, 5].
        (Was a mutable default argument; replaced with a None sentinel.)
    lagtime : int
        Base MSM lag time.
    states : "all", None, or iterable of state indices
        States to evaluate; "all"/None means every state of the base MSM.

    Returns
    -------
    (prob_tau_all, prob_ktau_all) : two lists, one entry per k in klist.
    """
    if klist is None:
        klist = [1, 2, 3, 4, 5]
    msm = MarkovStateModel(lag_time=lagtime, n_timescales=10)
    msm.fit(assignments)
    p_tau = msm.populations_
    T_tau = msm.transmat_
    mapping_tau = msm.mapping_
    prob_tau_all = []
    prob_ktau_all = []
    if states == "all" or states is None:
        states = range(len(p_tau))
    for k in klist:
        lagtime_long = k * lagtime
        # Python 2 print statement converted to a function call.
        print("long lagtime:", lagtime_long)
        msm = MarkovStateModel(lag_time=lagtime_long, n_timescales=10)
        msm.fit(assignments)
        p_ktau = msm.populations_
        T_ktau = msm.transmat_
        mapping_ktau = msm.mapping_
        probability_tau, probability_ktau = CalculateStatesProbability(
            T_tau, T_ktau, p_tau, p_ktau, mapping_tau, mapping_ktau, k, states)
        prob_tau_all.append(probability_tau)
        prob_ktau_all.append(probability_ktau)
    return prob_tau_all, prob_ktau_all
def test_both():
    """param_sweep and implied_timescales agree with individually fit MSMs."""
    model = MarkovStateModel(
        reversible_type='mle', lag_time=1, n_timescales=1)

    # note this might break it if we ask for more than 1 timescale
    sequences = np.random.randint(20, size=(10, 1000))
    lag_times = [1, 5, 10]

    reference_models = []
    for tau in lag_times:
        msm = MarkovStateModel(
            reversible_type='mle', lag_time=tau, n_timescales=10)
        msm.fit(sequences)
        reference_models.append(msm)
    reference_timescales = [m.timescales_ for m in reference_models]

    models = param_sweep(msm, sequences, {'lag_time': lag_times}, n_jobs=2)
    timescales = implied_timescales(sequences, lag_times, msm=msm,
                                    n_timescales=10, n_jobs=2)

    print(timescales)
    print(reference_timescales)

    # sanity check: different lag times must give different transition matrices
    if np.abs(models[0].transmat_ - models[1].transmat_).sum() < 1E-6:
        raise Exception("you wrote a bad test.")

    for idx, tau in enumerate(lag_times):
        models[idx].lag_time = tau
        npt.assert_array_almost_equal(models[idx].transmat_,
                                      reference_models[idx].transmat_)
        npt.assert_array_almost_equal(reference_timescales[idx],
                                      timescales[idx])
def test_ergodic_cutoff():
    """MSM and BayesianMSM expose identical ergodic_cutoff handling."""
    plain = MarkovStateModel(lag_time=10)
    bayes = BayesianMarkovStateModel(lag_time=10)
    assert plain.ergodic_cutoff == bayes.ergodic_cutoff
    assert plain._parse_ergodic_cutoff() == bayes._parse_ergodic_cutoff()

    for cut_off in (0.01, 'on', 'off'):
        assert (MarkovStateModel(ergodic_cutoff=cut_off).ergodic_cutoff ==
                BayesianMarkovStateModel(ergodic_cutoff=cut_off).ergodic_cutoff)
def test_from_msm():
    """PCCA and PCCA+ can both be built from a freshly fit MSM."""
    assignments, _ = _metastable_system()

    msm = MarkovStateModel()
    msm.fit(assignments)
    pcca = PCCA.from_msm(msm, 2)

    msm = MarkovStateModel()
    msm.fit(assignments)
    pccaplus = PCCAPlus.from_msm(msm, 2)
def test_counts_3():
    """Counts-matrix scaling is identical across the windowing options."""
    seq = [1] * 4 + [2] * 4 + [1] * 4
    variants = [
        dict(sliding_window=True),
        dict(sliding_window=False),
        dict(ergodic_cutoff='off'),
    ]
    fitted = [
        MarkovStateModel(reversible_type=None, lag_time=2, **kw).fit([seq])
        for kw in variants
    ]
    eq(fitted[0].countsmat_, fitted[1].countsmat_)
    eq(fitted[0].countsmat_, fitted[2].countsmat_)
    eq(fitted[1].countsmat_, fitted[2].countsmat_)
def test_9():
    """NaN entries (and None on Python 2) in the input are ignored."""
    model = MarkovStateModel(ergodic_cutoff=0)
    model.fit([0, 1, 0, 1, np.nan])
    assert model.n_states_ == 2
    assert model.mapping_ == {0: 0, 1: 1}

    if not PY3:
        model = MarkovStateModel()
        model.fit([0, 1, 0, None, 0, 1])
        assert model.n_states_ == 2
        assert model.mapping_ == {0: 0, 1: 1}
def plot_timescales(clusterer_dir, n_clusters, tica_dir, main="",
                    lag_times=None):
    """Plot MSM implied timescales versus lag time and save to a PDF.

    Parameters
    ----------
    clusterer_dir : str
        Path to a pickled clusterer readable by verboseload.
    n_clusters : int
        Used only in the output file name.
    tica_dir : str
        Directory where the PDF is written.
    main : str
        Plot title; also embedded in the output file name.
    lag_times : list of int, optional
        Lag times to scan. Defaults to 1..49.
        (Was a mutable default argument; replaced with a None sentinel.)
    """
    if lag_times is None:
        lag_times = list(range(1, 50))
    clusterer = verboseload(clusterer_dir)
    print(clusterer)
    sequences = clusterer.labels_
    n_timescales = 5
    msm_timescales = implied_timescales(
        sequences, lag_times, n_timescales=n_timescales,
        msm=MarkovStateModel(verbose=True, prior_counts=1e-5,
                             ergodic_cutoff='off'))
    print(msm_timescales)
    for i in range(n_timescales):
        plt.plot(lag_times, msm_timescales[:, i])
    plt.xlabel("Lag time (ns)")
    plt.ylabel("Implied Timescales (ns)")
    plt.title(main)
    plt.semilogy()
    pp = PdfPages("%s/%s_n_clusters%d_implied_timescales.pdf"
                  % (tica_dir, main, n_clusters))
    pp.savefig()
    pp.close()
    plt.clf()
def test_cond_committors():
    # depends on tpt.committors
    msm = MarkovStateModel(lag_time=1)
    assignments = np.random.randint(4, size=(10, 1000))
    msm.fit(assignments)
    tprob = msm.transmat_

    for_committors = tpt.committors(0, 3, msm)
    cond_committors = tpt.conditional_committors(0, 3, 2, msm)

    # The committor for state 1 splits into paths that do and do not pass
    # through state 2. Paths avoiding state 2 must look like 1, 1, ..., 1, 3,
    # so their probability is approximated by a (truncated) geometric series.
    # Subtracting that from the forward committor leaves the conditional part.
    avoid_prob = np.power(tprob[1, 1], np.arange(5000)).sum() * tprob[1, 3]
    expected = [0, for_committors[1] - avoid_prob, for_committors[2], 0]

    npt.assert_array_almost_equal(expected, cond_committors)
def test_harder_hubscore():
    """Hub scores match a direct sum over conditional committors.

    Depends on tpt.committors and tpt.conditional_committors.
    """
    assignments = np.random.randint(10, size=(10, 1000))
    msm = MarkovStateModel(lag_time=1)
    msm.fit(assignments)

    hub_scores = tpt.hub_scores(msm)

    ref_hub_scores = np.zeros(10)
    # Python-2-only `xrange` replaced with `range` (rest of file is Py3 style).
    for A in range(10):
        for B in range(10):
            committors = tpt.committors(A, B, msm)
            denom = msm.transmat_[A, :].dot(committors)  # + msm.transmat_[A, B]
            for C in range(10):
                if A == B or A == C or B == C:
                    continue
                cond_committors = tpt.conditional_committors(A, B, C, msm)
                temp = 0.0
                for i in range(10):
                    if i in [A, B]:
                        continue
                    temp += cond_committors[i] * msm.transmat_[A, i]
                temp /= denom
                ref_hub_scores[C] += temp
    ref_hub_scores /= (9 * 8)

    npt.assert_array_almost_equal(ref_hub_scores, hub_scores)
def test_fluxes():
    """Reactive fluxes match Eq. 2.24 of Metzner et al. (depends on
    tpt.committors)."""
    msm = MarkovStateModel(lag_time=1)
    assignments = np.random.randint(3, size=(10, 1000))
    msm.fit(assignments)

    tprob = msm.transmat_
    pop = msm.populations_
    # forward committors
    qplus = tpt.committors(0, 2, msm)

    ref_fluxes = np.zeros((3, 3))
    ref_net_fluxes = np.zeros((3, 3))
    # Python-2-only `xrange` replaced with `range` (rest of file is Py3 style).
    for i in range(3):
        for j in range(3):
            if i != j:
                # Eq. 2.24 in Metzner et al. Transition Path Theory.
                # Multiscale Model. Simul. 2009, 7, 1192-1219.
                ref_fluxes[i, j] = (pop[i] * tprob[i, j] *
                                    (1 - qplus[i]) * qplus[j])
    for i in range(3):
        for j in range(3):
            ref_net_fluxes[i, j] = np.max(
                [0, ref_fluxes[i, j] - ref_fluxes[j, i]])

    fluxes = tpt.fluxes(0, 2, msm)
    net_fluxes = tpt.net_fluxes(0, 2, msm)

    npt.assert_array_almost_equal(ref_fluxes, fluxes)
    npt.assert_array_almost_equal(ref_net_fluxes, net_fluxes)
def case1():
    """10-fold cross-validated MSM populations and implied timescales.

    For each project id, loads fixed macrostate assignments, fits an MSM on
    each training fold, and saves populations/timescales for the folds that
    retain all 40 macrostates.
    """
    map_id = 40
    for p_id in range(6383, 6391):
        assignments = np.load('Assignments-%d.fixed.Map%d.npy'
                              % (p_id, map_id))
        cv = KFold(len(assignments), n_folds=10)
        lagtime = 50
        msm = MarkovStateModel(lag_time=lagtime)
        pops = []
        msmts = []
        for fold, (train_index, test_index) in enumerate(cv):
            assignments_train = assignments[train_index]
            msm.fit(assignments_train)
            # keep only folds where no macrostate was trimmed away
            if len(msm.populations_) == 40:
                pops.append(msm.populations_)
                msmts.append(msm.timescales_)
        output_dir = "Data-%d-macro%d" % (p_id, map_id)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        fn_populations = os.path.join(output_dir, "Populations-10fold.npy")
        fn_msmts = os.path.join(output_dir, "ImpliedTimescales-10fold.npy")
        np.save(fn_populations, pops)
        np.save(fn_msmts, msmts)
        # Python 2 print statement converted to a function call.
        print("Saved: {},{}".format(fn_populations, fn_msmts))
def test_13():
    """Eigenvector normalization conventions of the fitted MSM."""
    model = MarkovStateModel(n_timescales=2)
    model.fit([[0, 0, 0, 1, 2, 1, 0, 0, 0, 1, 3, 3, 3, 1, 1, 2, 2, 0, 0]])

    phi = model.left_eigenvectors_
    psi = model.right_eigenvectors_
    pi = model.populations_

    # left and right eigenvectors are biorthonormal
    np.testing.assert_array_almost_equal(np.dot(phi.T, psi), np.eye(3))

    # the stationary left eigenvector is normalized to sum to 1
    np.testing.assert_almost_equal(phi[:, 0].sum(), 1)

    # the left eigenvectors satisfy <phi_i, phi_i>_{mu^-1} = 1
    for i in range(3):
        np.testing.assert_almost_equal(np.dot(phi[:, i], phi[:, i] / pi), 1)

    # and the right eigenvectors satisfy <psi_i, psi_i>_{mu} = 1
    for i in range(3):
        np.testing.assert_almost_equal(np.dot(psi[:, i], psi[:, i] * pi), 1)
def test_10():
    """inverse_transform maps integer labels back to the original names."""
    model = MarkovStateModel(reversible_type=None, ergodic_cutoff=0)
    model.fit([['a', 'b', 'c', 'a', 'a', 'b']])

    recovered = model.inverse_transform([[0, 1, 2]])
    assert len(recovered) == 1
    np.testing.assert_array_equal(recovered[0], ['a', 'b', 'c'])
def test_counts_no_trim():
    """Without trimming, a single self-looping state keeps all 8 counts."""
    model = MarkovStateModel(reversible_type=None, ergodic_cutoff=0)
    model.fit([[1] * 9])
    eq(model.countsmat_, np.array([[8.0]]))
    eq(model.mapping_, {1: 0})
def test_counts_2():
    """Ergodic trimming drops the state visited only once at the end."""
    model = MarkovStateModel(reversible_type=None, ergodic_cutoff=1)
    model.fit([[1] * 9 + [2]])
    eq(model.mapping_, {1: 0})
    eq(model.countsmat_, np.array([[8]]))
def test_partial_transform():
    """partial_transform under the default, 'clip', and 'fill' modes."""
    model = MarkovStateModel()
    model.fit([['a', 'a', 'b', 'b', 'c', 'c', 'a', 'a']])
    assert model.mapping_ == {'a': 0, 'b': 1, 'c': 2}

    # np.int / np.float were deprecated aliases for the builtins and were
    # removed in NumPy 1.24; np.dtype(int) / np.dtype(float) are the exact
    # dtypes those comparisons resolved to.
    v = model.partial_transform(['a', 'b', 'c'])
    assert isinstance(v, list)
    assert len(v) == 1
    assert v[0].dtype == np.dtype(int)
    np.testing.assert_array_equal(v[0], [0, 1, 2])

    # 'clip' drops unknown trailing states
    v = model.partial_transform(['a', 'b', 'c', 'd'], 'clip')
    assert isinstance(v, list)
    assert len(v) == 1
    assert v[0].dtype == np.dtype(int)
    np.testing.assert_array_equal(v[0], [0, 1, 2])

    # 'fill' keeps length, marking unknown states as NaN (hence float dtype)
    v = model.partial_transform(['a', 'b', 'c', 'd'], 'fill')
    assert isinstance(v, np.ndarray)
    assert len(v) == 4
    assert v.dtype == np.dtype(float)
    np.testing.assert_array_equal(v, [0, 1, 2, np.nan])

    # an interior unknown state splits the trajectory in 'clip' mode
    v = model.partial_transform(['a', 'a', 'SPLIT', 'b', 'b', 'b'], 'clip')
    assert isinstance(v, list)
    assert len(v) == 2
    assert v[0].dtype == np.dtype(int)
    assert v[1].dtype == np.dtype(int)
    np.testing.assert_array_equal(v[0], [0, 0])
    np.testing.assert_array_equal(v[1], [1, 1, 1])
def test_ntimescales_3():
    # see issue #603
    trajs = [np.random.randint(0, 30, size=500) for _ in range(5)]
    msm = MarkovStateModel(n_timescales=10).fit(trajs)
    lumped = PCCAPlus.from_msm(msm, 11).transform(trajs)
    assert len(np.unique(lumped)) == 11
def test_pipeline():
    """NDGrid discretization and an MSM compose in a sklearn Pipeline."""
    trajs = DoubleWell(random_state=0).get_cached().trajectories
    pipeline = Pipeline([
        ('ndgrid', NDGrid(n_bins_per_feature=100)),
        ('msm', MarkovStateModel(lag_time=100)),
    ])
    pipeline.fit(trajs)
    pipeline.named_steps['msm'].summarize()
def test_pcca_plus_1():
    assignments, ref_macrostate_assignments = _metastable_system()
    pipeline = Pipeline([('msm', MarkovStateModel()), ('pcca+', PCCAPlus(2))])
    macro_assignments = pipeline.fit_transform(assignments)[0]

    # Macrostate labels are only defined up to permutation; with two states,
    # logical_not produces the only alternative labeling.
    flipped = np.logical_not(ref_macrostate_assignments)
    assert (np.all(macro_assignments == ref_macrostate_assignments)
            or np.all(macro_assignments == flipped))
def test_6():
    """score_ll is -inf for sequences containing unseen states."""
    model = MarkovStateModel(reversible_type='mle')
    model.fit([['a', 'a', 'b', 'b', 'a', 'a', 'b', 'b']])

    for novel in (['c'], ['c', 'c'], ['a', 'c']):
        assert not np.isfinite(model.score_ll([novel]))
def test_51():
    """score_ll of a single known transition equals its log probability."""
    sequence = ['a', 'a', 'b', 'b', 'a', 'a',
                'b', 'b', 'c', 'c', 'c', 'a', 'a']
    model = MarkovStateModel(reversible_type='mle')
    model.fit([sequence])
    assert model.mapping_ == {'a': 0, 'b': 1, 'c': 2}

    assert model.score_ll([['a', 'c']]) == np.log(model.transmat_[0, 2])
def estimate_mle_populations(matrix):
    """Estimate equilibrium state populations from a count matrix via MLE.

    Dispatches on the module-level `msmb_version` string to use the matching
    MSMBuilder API. NOTE(review): `kwargs` in the 2.8.2 branch is not defined
    in this function — presumably a module-level dict; verify before use.
    NOTE(review): for any other `msmb_version` value this falls through and
    implicitly returns None — confirm callers handle that.
    """
    if msmb_version == '2.8.2':
        # MSMBuilder 2.x API: build the transition matrix, then take the
        # first (stationary) eigenvector.
        t_matrix = estimate_transition_matrix(matrix)
        populations = get_eigenvectors(t_matrix, 1, **kwargs)[1][:, 0]
        return populations
    elif msmb_version == '3.2.0':
        # MSMBuilder 3.x API: _fit_mle returns (transmat, populations).
        obj = MarkovStateModel()
        populations = obj._fit_mle(matrix)[1]
        return populations
def test_ntimescales_2():
    # see issue #603
    trajs = [random.randint(0, 100, size=500) for _ in range(15)]
    msm = MarkovStateModel().fit(trajs)
    lumped = PCCAPlus.from_msm(msm, 11).transform(trajs)
    observed_macros = len(np.unique(lumped))
    assert observed_macros == 11, observed_macros
def at_lagtime(lt):
    """Fit an MSM at lag time `lt` on the module-level `ktrajs` and return a
    dict with the lag time, percent of counts retained, and the leading
    implied timescales."""
    msm = MarkovStateModel(lag_time=lt, n_timescales=10, verbose=False)
    msm.fit(list(ktrajs.values()))

    result = {
        'lag_time': lt,
        'percent_retained': msm.percent_retained_,
    }
    for idx in range(msm.n_timescales):
        result['timescale_{}'.format(idx)] = msm.timescales_[idx]
    return result
def test_plot_implied_timescales():
    """plot_implied_timescales returns a matplotlib axes object."""
    msm_objs = []
    for lag in (1, 50, 100, 250, 500, 1000, 5000):
        # Construct MSM
        msm = MarkovStateModel(lag_time=lag, n_timescales=5)
        msm.fit(data)
        msm_objs.append(msm)

    ax = plot_implied_timescales(msm_objs)
    assert isinstance(ax, SubplotBase)
def generate_msm(self, clustered): """ Generates a MSM from the current cluster data Returns: Msm """ # Generate microstate MSM self.currtime = time.time() msm = MarkovStateModel(lag_time=self.config.getint("model", "msm_lag"), reversible_type="transpose", ergodic_cutoff="off", prior_counts=0.000001) msm.fit(clustered) print("TIME\tmicromsm:\t%f" % (time.time() - self.currtime)) utils.dump(msm, "msm_G%d.pkl" % self.generation) # Lump into macrostates self.currtime = time.time() pcca = PCCAPlus.from_msm(msm, n_macrostates=self.config.getint( "model", "macrostates")) mclustered = pcca.transform(clustered, mode="fill") if any(any(np.isnan(x) for x in m) for m in mclustered): #pylint: disable=no-member print( "WARNING: Unassignable clusters in PCCA with %d macrostates!" % self.config.getint("model", "macrostates")) print("TIME\tpccaplus:\t%f" % (time.time() - self.currtime)) if self.save_extras: utils.dump(pcca, "macrostater.pkl") # Generate macrostate MSM self.currtime = time.time() mmsm = MarkovStateModel(lag_time=self.config.getint( "model", "msm_lag"), reversible_type="transpose", ergodic_cutoff="off", prior_counts=0.000001) mmsm.fit(mclustered) print("TIME\tmacromsm\t%f" % (time.time() - self.currtime)) utils.dump(mmsm, "mmsm_G%d.pkl" % self.generation) return mmsm, mclustered
def test_bace():
    assignments, ref_macrostate_assignments = _metastable_system()
    pipeline = Pipeline([('msm', MarkovStateModel()),
                         ('bace', BACE(n_macrostates=2))])
    macro_assignments = pipeline.fit_transform(assignments)[0]

    # Labels are arbitrary up to permutation; for a 2-state lumping,
    # logical_not produces the only other labeling.
    opposite = np.logical_not(ref_macrostate_assignments)
    assert (np.all(macro_assignments == ref_macrostate_assignments)
            or np.all(macro_assignments == opposite))
def test_score_1():
    """GMRQ on the training set equals the sum of the top eigenvalues."""
    sequence = [0, 0, 0, 1, 1, 1, 2, 2, 2, 1, 1, 1,
                0, 0, 0, 1, 2, 2, 2, 1, 1, 1, 0, 0]
    for n_ts in (0, 1, 2):
        model = MarkovStateModel(verbose=False, n_timescales=n_ts)
        model.fit([sequence])
        assert_approx_equal(model.score([sequence]),
                            model.eigenvalues_.sum())
        assert_approx_equal(model.score([sequence]), model.score_)
def test_fit_1():
    """Compare ContinuousTimeMSM.fit against the discrete-time MSM."""
    sequence = [0, 0, 0, 1, 1, 1, 0, 0, 2, 2, 0, 1, 1, 1, 2, 2, 2, 2, 2]

    ct_model = ContinuousTimeMSM(verbose=False)
    ct_model.fit([sequence])
    dt_model = MarkovStateModel(verbose=False)
    dt_model.fit([sequence])

    # they shouldn't be equal in general, but for this input they seem to be
    np.testing.assert_array_almost_equal(ct_model.transmat_,
                                         dt_model.transmat_)
def test_7():
    """timescales_ length/finiteness and population normalization."""
    two_state_traj = [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1]
    three_state_traj = [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 2, 2, 0, 0]

    model = MarkovStateModel()
    model.fit([two_state_traj])
    assert np.all(np.isfinite(model.timescales_))
    assert len(model.timescales_) == 1

    model.fit([three_state_traj])
    assert np.all(np.isfinite(model.timescales_))
    assert len(model.timescales_) == 2
    assert model.n_states_ == 3

    # an explicit n_timescales caps the number reported
    model = MarkovStateModel(n_timescales=1)
    model.fit([three_state_traj])
    assert len(model.timescales_) == 1

    # asking for more timescales than available is silently truncated
    model = MarkovStateModel(n_timescales=100)
    model.fit([three_state_traj])
    assert len(model.timescales_) == 2
    assert np.sum(model.populations_) == 1.0