def test_harder_hubscore():
    # Cross-check tpt.hub_scores against a brute-force reference assembled
    # from tpt.committors and tpt.conditional_committors.
    n_states = 10
    assignments = np.random.randint(10, size=(10, 1000))
    msm = MarkovStateModel(lag_time=1)
    msm.fit(assignments)

    hub_scores = tpt.hub_scores(msm)

    expected = np.zeros(n_states)
    for source in range(n_states):
        for sink in range(n_states):
            # Computed for every (source, sink) pair, even the ones that
            # are skipped in the innermost loop.
            forward = tpt.committors(source, sink, msm)
            denom = msm.transmat_[source, :].dot(forward)

            for hub in range(n_states):
                # Only triples of three distinct states contribute.
                if len({source, sink, hub}) < 3:
                    continue

                cond = tpt.conditional_committors(source, sink, hub, msm)
                weighted = 0.0
                for i in range(n_states):
                    if i != source and i != sink:
                        weighted += cond[i] * msm.transmat_[source, i]
                expected[hub] += weighted / denom

    # Each hub is averaged over the 9 * 8 ordered (source, sink) pairs
    # that exclude it.
    expected /= (9 * 8)

    npt.assert_array_almost_equal(expected, hub_scores)
def test_fluxes_1():
    # depends on tpt.committors
    msm = MarkovStateModel(lag_time=1)
    assignments = np.random.randint(3, size=(10, 1000))
    msm.fit(assignments)

    tprob, pop = msm.transmat_, msm.populations_

    # forward committors
    qplus = tpt.committors(0, 2, msm)

    # Eq. 2.24 in Metzner et al. Transition Path Theory.
    # Multiscale Model. Simul. 2009, 7, 1192-1219.
    ref_fluxes = np.zeros((3, 3))
    for i in range(3):
        for j in range(3):
            if i == j:
                continue
            ref_fluxes[i, j] = (pop[i] * tprob[i, j] *
                                (1 - qplus[i]) * qplus[j])

    # Net flux keeps only the positive part of the antisymmetric difference.
    ref_net_fluxes = np.maximum(0, ref_fluxes - ref_fluxes.T)

    fluxes = tpt.fluxes(0, 2, msm)
    net_fluxes = tpt.net_fluxes(0, 2, msm)

    npt.assert_array_almost_equal(ref_fluxes, fluxes)
    npt.assert_array_almost_equal(ref_net_fluxes, net_fluxes)
def test_13():
    # Normalization conventions of the fitted eigenvectors.
    model = MarkovStateModel(n_timescales=2)
    model.fit([[0, 0, 0, 1, 2, 1, 0, 0, 0, 1, 3, 3, 3, 1, 1, 2, 2, 0, 0]])

    # check biorthonormal
    overlap = np.dot(model.left_eigenvectors_.T, model.right_eigenvectors_)
    np.testing.assert_array_almost_equal(overlap, np.eye(3))

    # check that the stationary left eigenvector is normalized to be 1
    stationary = model.left_eigenvectors_[:, 0]
    np.testing.assert_almost_equal(stationary.sum(), 1)

    # the left eigenvectors satisfy <\phi_i, \phi_i>_{\mu^{-1}} = 1
    for i in range(3):
        phi = model.left_eigenvectors_[:, i]
        norm = np.dot(phi, model.left_eigenvectors_[:, i] / model.populations_)
        np.testing.assert_almost_equal(norm, 1)

    # and that the right eigenvectors satisfy <\psi_i, \psi_i>_{\mu} = 1
    for i in range(3):
        psi = model.right_eigenvectors_[:, i]
        norm = np.dot(psi, model.right_eigenvectors_[:, i] * model.populations_)
        np.testing.assert_almost_equal(norm, 1)
def build_msm(clusterer_dir, lag_time):
    """Fit an MSM to a saved clusterer's labels and export the result.

    Loads the clusterer stored at ``clusterer_dir``, fits a
    ``MarkovStateModel`` with the given ``lag_time`` to its ``labels_``,
    dumps the fitted model with ``verbosedump`` and writes the transition
    matrix as a ``;``-separated table (one header row, then one row per
    state).

    Parameters
    ----------
    clusterer_dir : path handed to ``verboseload``; the loaded object must
        expose ``cluster_centers_`` and ``labels_``.
    lag_time : lag time passed to ``MarkovStateModel`` and embedded in the
        output file names.
    """
    clusterer = verboseload(clusterer_dir)
    n_clusters = np.shape(clusterer.cluster_centers_)[0]
    labels = clusterer.labels_

    msm_modeler = MarkovStateModel(lag_time=lag_time)
    print("fitting msm to trajectories with %d clusters and lag_time %d" %(n_clusters, lag_time))
    msm_modeler.fit_transform(labels)
    verbosedump(msm_modeler, "/scratch/users/enf/b2ar_analysis/msm_model_%d_clusters_t%d" %(n_clusters, lag_time))
    print("fitted msm to trajectories with %d states" %(msm_modeler.n_states_))

    transmat = msm_modeler.transmat_
    mapping = msm_modeler.mapping_
    n_states = msm_modeler.n_states_

    edgelist_path = "/scratch/users/enf/b2ar_analysis/msm_%d_clusters_t%d_edgelist.csv" %(n_clusters, lag_time)
    # Text mode: only str is written, which fails on a binary handle under
    # Python 3. The context manager closes the file even if a write raises.
    with open(edgelist_path, "w") as edges:
        for i in range(n_states):
            if i == 0:
                # Header row: an empty first cell followed by one label per
                # column.
                # NOTE(review): mapping is indexed by 0..n_states-1 here;
                # confirm those are valid keys of mapping_.
                for j in range(n_states):
                    edges.write(";")
                    edges.write("%d" %mapping[j])
                edges.write("\n")

            edges.write("%d" %(mapping[i]))
            for j in range(n_states):
                prob = transmat[i][j]
                edges.write(";")
                # Probabilities at or below 1e-6 are written as a literal 0.
                if prob > 0.000001:
                    edges.write("%f" %prob)
                else:
                    edges.write("0")
            edges.write("\n")
def test_cond_committors():
    # depends on tpt.committors
    msm = MarkovStateModel(lag_time=1)
    assignments = np.random.randint(4, size=(10, 1000))
    msm.fit(assignments)
    tprob = msm.transmat_

    forward = tpt.committors(0, 3, msm)
    conditional = tpt.conditional_committors(0, 3, 2, msm)

    # The forward committor of state 1 splits into paths that pass through
    # state 2 and paths that do not. Paths that avoid state 2 must look
    # like 1, 1, ..., 1, 3, so their weight is a (truncated) geometric
    # series in tprob[1, 1] times tprob[1, 3]. Removing that weight from
    # the forward committor leaves the part that does visit state 2.
    stay_then_exit = np.power(tprob[1, 1], np.arange(5000)).sum() * tprob[1, 3]
    ref_state_1 = forward[1] - stay_then_exit

    ref = [0, ref_state_1, forward[2], 0]
    npt.assert_array_almost_equal(ref, conditional)
def test_both():
    # param_sweep and implied_timescales must agree with models fitted
    # one lag time at a time.
    sequences = [np.random.randint(20, size=1000) for _ in range(10)]
    lag_times = [1, 5, 10]

    # Reference: fit each lag time by hand.
    models_ref = []
    for tau in lag_times:
        reference = MarkovStateModel(reversible_type='mle', lag_time=tau,
                                     n_timescales=10)
        reference.fit(sequences)
        models_ref.append(reference)
    timescales_ref = [ref.timescales_ for ref in models_ref]

    model = MarkovStateModel(reversible_type='mle', lag_time=1,
                             n_timescales=10)
    models = param_sweep(model, sequences, {'lag_time': lag_times}, n_jobs=2)
    timescales = implied_timescales(sequences, lag_times, msm=model,
                                    n_timescales=10, n_jobs=2)
    print(timescales)
    print(timescales_ref)

    # Sanity check on the test itself: different lag times have to give
    # different transition matrices.
    difference = np.abs(models[0].transmat_ - models[1].transmat_).sum()
    if difference < 1E-6:
        raise Exception("you wrote a bad test.")

    for i, _ in enumerate(lag_times):
        npt.assert_array_almost_equal(models[i].transmat_,
                                      models_ref[i].transmat_)
        npt.assert_array_almost_equal(timescales_ref[i], timescales[i])
def test_1():
    # test counts matrix without trimming: nine frames in state 1 give
    # eight self-transitions
    trajectory = [1] * 9
    model = MarkovStateModel(reversible_type=None, ergodic_cutoff=0)
    model.fit([trajectory])

    expected_counts = np.array([[8.0]])
    eq(model.countsmat_, expected_counts)
    eq(model.mapping_, {1: 0})
def test_both():
    # param_sweep and implied_timescales must agree with models fitted
    # one lag time at a time.
    # note this might break it if we ask for more than 1 timescale
    model = MarkovStateModel(reversible_type="mle", lag_time=1,
                             n_timescales=1)
    sequences = np.random.randint(20, size=(10, 1000))
    lag_times = [1, 5, 10]

    models_ref = []
    for tau in lag_times:
        msm = MarkovStateModel(reversible_type="mle", lag_time=tau,
                               n_timescales=10)
        msm.fit(sequences)
        models_ref.append(msm)
    timescales_ref = [ref.timescales_ for ref in models_ref]

    # NOTE(review): the sweep is seeded with `msm`, the last reference
    # model from the loop above, not with `model` -- confirm intent.
    sweep_grid = {"lag_time": lag_times}
    models = param_sweep(msm, sequences, sweep_grid, n_jobs=2)
    timescales = implied_timescales(sequences, lag_times, msm=msm,
                                    n_timescales=10, n_jobs=2)
    print(timescales)
    print(timescales_ref)

    # Sanity check on the test itself: different lag times have to give
    # different transition matrices.
    difference = np.abs(models[0].transmat_ - models[1].transmat_).sum()
    if difference < 1e-6:
        raise Exception("you wrote a bad test.")

    for i, tau in enumerate(lag_times):
        models[i].lag_time = tau
        npt.assert_array_almost_equal(models[i].transmat_,
                                      models_ref[i].transmat_)
        npt.assert_array_almost_equal(timescales_ref[i], timescales[i])
def test_10():
    # test inverse transform: internal indices map back to the labels
    # the model was fitted on
    labelled = ['a', 'b', 'c', 'a', 'a', 'b']
    model = MarkovStateModel(reversible_type=None, ergodic_cutoff=0)
    model.fit([labelled])

    recovered = model.inverse_transform([[0, 1, 2]])

    assert len(recovered) == 1
    np.testing.assert_array_equal(recovered[0], ['a', 'b', 'c'])
def test_counts_2():
    # test counts matrix with trimming: the single visit to state 2 is
    # expected to be dropped from the mapping and the counts
    trajectory = [1] * 9 + [2]
    model = MarkovStateModel(reversible_type=None, ergodic_cutoff=1)
    model.fit([trajectory])

    eq(model.mapping_, {1: 0})
    expected_counts = np.array([[8]])
    eq(model.countsmat_, expected_counts)
def test_51():
    # test score_ll: the log-likelihood of a single a -> c step is the
    # log of the matching transition-matrix entry
    sequence = list('aabbaabbcccaa')
    model = MarkovStateModel(reversible_type='mle')
    model.fit([sequence])

    assert model.mapping_ == {'a': 0, 'b': 1, 'c': 2}

    score_ac = model.score_ll([['a', 'c']])
    expected = np.log(model.transmat_[0, 2])
    assert score_ac == expected
def test_mle_eq():
    # The bootstrap model's MLE must match a plain MSM on the same data.
    seq = [[0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1]]

    mle_mdl = MarkovStateModel(lag_time=1)
    b_mdl = BootStrapMarkovStateModel(n_samples=10, n_procs=2,
                                      msm_args={'lag_time': 1})
    mle_mdl.fit(seq)
    b_mdl.fit(seq)

    # make sure we have good model
    bootstrap_mle = b_mdl.mle_
    eq(mle_mdl.populations_, bootstrap_mle.populations_)
    eq(mle_mdl.timescales_, b_mdl.mle_.timescales_)
def test_6():
    # test score_ll with novel entries: any sequence containing the
    # unseen label 'c' must score as non-finite
    sequence = list('aabbaabb')
    model = MarkovStateModel(reversible_type='mle')
    model.fit([sequence])

    for unseen in (['c'], ['c', 'c'], ['a', 'c']):
        assert not np.isfinite(model.score_ll([unseen]))
def test_mfpt_match():
    # Per-sink MFPTs, stacked column-wise, must equal the all-pairs matrix.
    assignments = np.random.randint(10, size=(10, 2000))
    msm = MarkovStateModel(lag_time=1)
    msm.fit(assignments)

    # these two do different things
    per_sink = [tpt.mfpts(msm, sink) for sink in range(10)]
    mfpts0 = np.vstack(per_sink).T
    mfpts1 = tpt.mfpts(msm)

    npt.assert_array_almost_equal(mfpts0, mfpts1)
def at_lagtime(lt, clustered_trajs):
    """Fit an MSM at lag time ``lt`` and summarise it as a flat dict.

    Parameters
    ----------
    lt : lag time passed to ``MarkovStateModel``.
    clustered_trajs : sequences passed to ``MarkovStateModel.fit``.

    Returns
    -------
    dict with keys ``'lag_time'``, ``'percent_retained'`` and
    ``'timescale_0'`` ... ``'timescale_{n_timescales - 1}'``. Timescales
    the fitted model does not provide are reported as NaN.
    """
    msm = MarkovStateModel(lag_time=lt, n_timescales=20, verbose=False)
    msm.fit(clustered_trajs)
    ret = {
        'lag_time': lt,
        'percent_retained': msm.percent_retained_,
    }
    timescales = msm.timescales_
    n_available = len(timescales)
    for i in range(msm.n_timescales):
        # The model can return fewer timescales than requested; pad with
        # NaN so every result carries the same keys instead of raising
        # IndexError.
        ret['timescale_{}'.format(i)] = (
            timescales[i] if i < n_available else float('nan'))
    return ret
def at_lagtime(lt):
    """Fit an MSM at lag time ``lt`` and summarise it as a flat dict.

    The model is fitted on the values of the module-level ``ktrajs``
    mapping.

    Parameters
    ----------
    lt : lag time passed to ``MarkovStateModel``.

    Returns
    -------
    dict with keys ``'lag_time'``, ``'percent_retained'`` and
    ``'timescale_0'`` ... ``'timescale_{n_timescales - 1}'``. Timescales
    the fitted model does not provide are reported as NaN.
    """
    msm = MarkovStateModel(lag_time=lt, n_timescales=10, verbose=False)
    msm.fit(list(ktrajs.values()))
    ret = {
        'lag_time': lt,
        'percent_retained': msm.percent_retained_,
    }
    timescales = msm.timescales_
    n_available = len(timescales)
    for i in range(msm.n_timescales):
        # The model can return fewer timescales than requested; pad with
        # NaN so every result carries the same keys instead of raising
        # IndexError.
        ret['timescale_{}'.format(i)] = (
            timescales[i] if i < n_available else float('nan'))
    return ret
def build_msm(clusterer_dir, lag_time): clusterer = verboseload(clusterer_dir) n_clusters = np.shape(clusterer.cluster_centers_)[0] labels = clusterer.labels_ msm_modeler = MarkovStateModel(lag_time=lag_time) print("fitting msm to trajectories with %d clusters and lag_time %d" %(n_clusters, lag_time)) msm_modeler.fit_transform(labels) verbosedump(msm_modeler, "/scratch/users/enf/b2ar_analysis/msm_model_%d_clusters_t%d" %(n_clusters, lag_time)) print("fitted msm to trajectories with %d states" %(msm_modeler.n_states_)) '''
def test_plot_implied_timescales(self):
    # One MSM per lag time, fitted on the module-level `data`, then handed
    # to plot_implied_timescales as a list.
    msm_objs = []
    for lag in (1, 50, 100, 250, 500, 1000, 5000):
        # Construct MSM
        fitted = MarkovStateModel(lag_time=lag, n_timescales=5)
        fitted.fit(data)
        msm_objs.append(fitted)

    ax = plot_implied_timescales(msm_objs)

    assert isinstance(ax, SubplotBase)
def post(self):
    """Build an MSM from the posted matrix and write back its JSON view.

    Reads the ``matrix`` argument with ``sio.mmread``, derives the
    transition matrix and populations with ``_transmat_mle_prinz``, and
    writes either the paths JSON (``mode`` is a non-zero integer) or the
    graph JSON (``mode`` is zero).
    """
    buffer = StringIO(self.get_argument('matrix'))
    matrix = sio.mmread(buffer)

    msm = MarkovStateModel()
    msm.transmat_, msm.populations_ = _transmat_mle_prinz(matrix)
    msm.n_states_ = msm.populations_.shape[0]

    want_paths = bool(int(self.get_argument('mode')))
    if not want_paths:
        self.write(make_json_graph(msm, self))  # MSM
        return
    self.write(make_json_paths(msm, self))  # TP
def test_score_1():
    # test that GMRQ is equal to the sum of the first n eigenvalues,
    # when testing and training on the same dataset.
    sequence = [0, 0, 0, 1, 1, 1, 2, 2, 2, 1, 1, 1,
                0, 0, 0, 1, 2, 2, 2, 1, 1, 1, 0, 0]
    training_set = [sequence]

    for n_timescales in (0, 1, 2):
        model = MarkovStateModel(verbose=False, n_timescales=n_timescales)
        model.fit(training_set)

        eigenvalue_sum = model.eigenvalues_.sum()
        assert_approx_equal(model.score([sequence]), eigenvalue_sum)
        assert_approx_equal(model.score([sequence]), model.score_)
def test_fit_1():
    # call fit, compare to MSM
    sequence = [0, 0, 0, 1, 1, 1, 0, 0, 2, 2, 0, 1, 1, 1, 2, 2, 2, 2, 2]
    dataset = [sequence]

    model = ContinuousTimeMSM(verbose=False)
    model.fit(dataset)

    msm = MarkovStateModel(verbose=False)
    msm.fit(dataset)

    # they shouldn't be equal in general, but for this input they seem to be
    np.testing.assert_array_almost_equal(model.transmat_, msm.transmat_)
def test_11():
    # test sample: a long sampled trajectory should visit states roughly
    # in proportion to the model populations
    model = MarkovStateModel()
    model.fit([[0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 2, 2, 0, 0]])

    sample = model.sample_discrete(n_steps=1000, random_state=0)
    assert isinstance(sample, np.ndarray)
    assert len(sample) == 1000

    visits = np.bincount(sample)
    empirical = visits / np.sum(visits)
    total_deviation = np.sum(np.abs(model.populations_ - empirical))
    assert total_deviation < 0.1
def test_doublewell():
    # Smoke test: print the timescale uncertainties of both model types
    # for the first three double-well trajectories (no assertions).
    trajectories = load_doublewell(random_state=0)['trajectories']

    for index in range(3):
        single = [trajectories[index]]
        Y = NDGrid(n_bins_per_feature=10).fit_transform(single)

        discrete = MarkovStateModel(verbose=False).fit(Y)
        continuous = ContinuousTimeMSM().fit(Y)

        print('MSM uncertainty timescales:')
        print(discrete.uncertainty_timescales())
        print('ContinuousTimeMSM uncertainty timescales:')
        print(continuous.uncertainty_timescales())
        print()
def test_eigtransform_2():
    # eigtransform in 'clip' and 'fill' modes on a trajectory whose first
    # two frames fall outside the fitted state space.
    traj = [4, 3, 0, 0, 0, 1, 2, 1, 0, 0, 0, 1, 0, 1, 1, 2, 2, 0, 0]
    n_frames = len(traj)

    model = MarkovStateModel(n_timescales=2)
    model.fit([traj])

    # clip off the first two states (not ergodic)
    clipped = model.eigtransform([traj], mode='clip')
    assert clipped[0].shape == (n_frames - 2, model.n_timescales)

    # 'fill' keeps every frame and marks the two leading ones with NaN
    filled = model.eigtransform([traj], mode='fill')
    first = filled[0]
    assert first.shape == (n_frames, model.n_timescales)
    assert np.all(np.isnan(first[:2, :]))
    assert not np.any(np.isnan(first[2:]))
def test_12():
    # test eigtransform: frames in states 0 and 1 are projected onto
    # column 1 of the right (or left) eigenvectors
    model = MarkovStateModel(n_timescales=1)
    model.fit([[4, 3, 0, 0, 0, 1, 2, 1, 0, 0, 0, 1, 0, 1, 1, 2, 2, 0, 0]])

    assert model.mapping_ == {0: 0, 1: 1, 2: 2}
    assert len(model.eigenvalues_) == 2

    probe = [[0, 1]]

    right_proj = model.eigtransform(probe, right=True)[0]
    assert right_proj[0] == model.right_eigenvectors_[0, 1]
    assert right_proj[1] == model.right_eigenvectors_[1, 1]

    left_proj = model.eigtransform(probe, right=False)[0]
    assert left_proj[0] == model.left_eigenvectors_[0, 1]
    assert left_proj[1] == model.left_eigenvectors_[1, 1]
def test_hessian():
    # Smoke test: fit both model types on ten gridded double-well
    # trajectories and print the summary and uncertainties (no assertions).
    grid = NDGrid(n_bins_per_feature=10, min=-np.pi, max=np.pi)
    trajectories = load_doublewell(random_state=0)['trajectories']
    gridded = grid.fit_transform(trajectories)
    seqs = [gridded[i] for i in range(10)]

    lag_time = 10
    model = ContinuousTimeMSM(verbose=True, lag_time=lag_time)
    model.fit(seqs)
    msm = MarkovStateModel(verbose=False, lag_time=lag_time)

    print(model.summarize())
    # The discrete-time model is fitted here, inside the print call.
    print('MSM timescales\n', msm.fit(seqs).timescales_)
    print('Uncertainty K\n', model.uncertainty_K())
    print('Uncertainty pi\n', model.uncertainty_pi())
def cluster_msm(sequences, n_states, lag_times):
    """Cluster with KMeans and fit MSMs over a grid of sizes and lag times.

    For every ``n`` in ``n_states`` a ``KMeans`` model is fitted to
    ``sequences`` and dumped; for every lag time an MSM is fitted to the
    cluster labels and dumped; finally the stacked timescales for that
    ``n`` are dumped. Returns nothing.
    """
    for n in n_states:
        prefix = str(n)

        states = KMeans(n_clusters=n)
        states.fit(sequences)
        io.dump(states, prefix+'n_cl.pkl')

        # Placeholder first row so vstack has something to stack onto;
        # it is removed again after the lag-time loop.
        stacked = np.zeros(5)
        for lag_time in lag_times:
            msm = MarkovStateModel(lag_time=lag_time, verbose=False,
                                   n_timescales=5)
            msm.fit(states.labels_)
            stacked = np.vstack((stacked, msm.timescales_))
            io.dump(msm, prefix+'n_'+str(lag_time)+'lt_msm.pkl')

        stacked = np.delete(stacked, (0), axis=0)
        io.dump(stacked, prefix+'n_timescales.pkl')
def test_5():
    # Compare the mean Bayesian transition matrix with the MLE one on the
    # gridded double-well trajectories.
    trjs = DoubleWell(random_state=0).get_cached().trajectories
    clusterer = NDGrid(n_bins_per_feature=5)
    mle_msm = MarkovStateModel(lag_time=100, verbose=False)
    b_msm = BayesianMarkovStateModel(lag_time=100, n_samples=1000,
                                     n_chains=8, n_steps=1000,
                                     random_state=0)

    states = clusterer.fit_transform(trjs)
    b_msm.fit(states)
    mle_msm.fit(states)

    # this is a pretty silly test. it checks that the mean transition
    # matrix is not so dissimilar from the MLE transition matrix.
    # This shouldn't necessarily be the case anyways -- the likelihood is
    # not "symmetric". And the cutoff chosen is just heuristic.
    mean_transmat = b_msm.all_transmats_.mean(axis=0)
    distance = np.linalg.norm(mean_transmat - mle_msm.transmat_)
    assert distance < 1e-2
def test_3():
    # Counts matrix, normalized populations and pickle round trip of an
    # 'mle' model.
    model = MarkovStateModel(reversible_type='mle')
    model.fit([[0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0]])
    counts = np.array([[8, 1, 1], [1, 3, 0], [1, 0, 3]])
    eq(model.countsmat_, counts)

    # Tolerance-based check: a floating-point sum of normalized
    # populations need not be bit-exactly 1.0.
    np.testing.assert_almost_equal(np.sum(model.populations_), 1.0)

    # Touch the attribute before pickling.
    model.timescales_

    # test pickleable
    temp_path = 'test-msm-temp.npy'
    try:
        dump(model, temp_path, compress=1)
        model2 = load(temp_path)
        eq(model2.timescales_, model.timescales_)
    finally:
        # If dump failed there is no file; do not raise a second error
        # on top of the real one.
        if os.path.exists(temp_path):
            os.unlink(temp_path)
def test_hubscore():
    # Make an actual hub! States 0, 1, 3 and 4 only connect to each other
    # through state 2, so state 2 must score exactly 1.
    hub_transmat = np.array([
        [0.8, 0.0, 0.2, 0.0, 0.0],
        [0.0, 0.8, 0.2, 0.0, 0.0],
        [0.1, 0.1, 0.6, 0.1, 0.1],
        [0.0, 0.0, 0.2, 0.8, 0.0],
        [0.0, 0.0, 0.2, 0.0, 0.8],
    ])

    # The model is populated by hand instead of being fitted.
    msm = MarkovStateModel(lag_time=1)
    msm.transmat_ = hub_transmat
    msm.n_states_ = 5

    scores = tpt.hub_scores(msm, 2)
    assert scores[0] == 1.0
def test_eigtransform_1():
    # test eigtransform: frames in states 0 and 1 are projected onto
    # column 1 of the right (or left) eigenvectors
    model = MarkovStateModel(n_timescales=1)
    model.fit([[4, 3, 0, 0, 0, 1, 2, 1, 0, 0, 0, 1, 0, 1, 1, 2, 2, 0, 0]])

    assert model.mapping_ == {0: 0, 1: 1, 2: 2}
    assert len(model.eigenvalues_) == 2

    probe = [[0, 1]]

    right_proj = model.eigtransform(probe, right=True)[0]
    assert right_proj[0] == model.right_eigenvectors_[0, 1]
    assert right_proj[1] == model.right_eigenvectors_[1, 1]

    left_proj = model.eigtransform(probe, right=False)[0]
    assert left_proj[0] == model.left_eigenvectors_[0, 1]
    assert left_proj[1] == model.left_eigenvectors_[1, 1]
def plot_timescales(clusterer_dir, n_clusters, lag_time):
    """Plot implied timescales for a saved clustering and save them as PDF.

    Loads the clusterer at ``clusterer_dir``, computes five implied
    timescales of its ``labels_`` for lag times 1, 6, ..., 146, plots
    them on a log y-axis and saves the current figure.

    Parameters
    ----------
    clusterer_dir : path handed to ``verboseload``.
    n_clusters, lag_time : used only to build the output file name.
    """
    clusterer = verboseload(clusterer_dir)
    sequences = clusterer.labels_
    lag_times = list(np.arange(1, 150, 5))
    n_timescales = 5

    msm_timescales = implied_timescales(sequences, lag_times,
                                        n_timescales=n_timescales,
                                        msm=MarkovStateModel(verbose=False))
    print(msm_timescales)

    # One curve per timescale (columns of the returned array).
    for i in range(n_timescales):
        plt.plot(lag_times, msm_timescales[:, i])
    plt.semilogy()

    pp = PdfPages(
        "/scratch/users/enf/b2ar_analysis/kmeans_%d_%d_implied_timescales.pdf"
        % (n_clusters, lag_time))
    try:
        pp.savefig()
    finally:
        # Close the PDF even when saving the figure fails.
        pp.close()
def test_0():
    # Verify that the partial derivatives of the ith eigenvalue of the
    # transition matrix with respect to the entries of the transition matrix
    # is given by the outer product of the left and right eigenvectors
    # corresponding to that eigenvalue.
    # \frac{\partial \lambda_k}{\partial T_{ij}} = U_{i,k} V_{j,k}
    X = load_doublewell(random_state=0)['trajectories']
    Y = NDGrid(n_bins_per_feature=10).fit_transform(X)
    model = MarkovStateModel(verbose=False).fit(Y)
    n = model.n_states_
    # u: eigenvalues; lv / rv: left / right eigenvectors, one per column.
    u, lv, rv = _solve_msm_eigensystem(model.transmat_, n)

    # first, compute forward difference numerical derivatives
    h = 1e-7
    # dLambda_dP_numeric[eigenvalue_index, i, j]
    dLambda_dP_numeric = np.zeros((n, n, n))
    for i in range(n):
        for j in range(n):
            # perturb the (i,j) entry of transmat
            H = np.zeros((n, n))
            H[i, j] = h
            # sort the eigenvalues of the perturbed matrix in descending
            # order, to be consistent w/ _solve_msm_eigensystem
            u_perturbed = sorted(np.real(eigvals(model.transmat_ + H)), reverse=True)

            # compute the forward difference approx. derivative of each
            # of the eigenvalues
            for k in range(n):
                dLambda_dP_numeric[k, i, j] = (u_perturbed[k] - u[k]) / h

    # Compare each numerical derivative with the analytic outer product.
    for k in range(n):
        analytic = np.outer(lv[:, k], rv[:, k])
        np.testing.assert_almost_equal(dLambda_dP_numeric[k], analytic, decimal=5)
def calculate_its(kcenters_sequences, lag_times, n_timescales, outfile_name, ergodic_cutoff_option):
    """Compute implied timescales, then write them to a table and a plot.

    Parameters
    ----------
    kcenters_sequences : sequences passed to ``implied_timescales``.
    lag_times : lag times to evaluate; one output row per entry.
    n_timescales : number of timescales per lag time; one output column
        and one plotted curve per timescale.
    outfile_name : path prefix; ``.dat`` (table) and ``.png`` (figure)
        are appended.
    ergodic_cutoff_option : forwarded as ``ergodic_cutoff`` to
        ``MarkovStateModel``.
    """
    msm_timescales = implied_timescales(
        kcenters_sequences, lag_times, n_timescales=n_timescales,
        msm=MarkovStateModel(verbose=True, reversible_type='transpose',
                             ergodic_cutoff=ergodic_cutoff_option))

    for k in range(n_timescales):
        plt.plot(lag_times, msm_timescales[:, k], 'o-')

    # The context manager closes the table even if a write fails.
    with open(outfile_name + '.dat', 'w') as f2:
        for i, lag in enumerate(lag_times):
            f2.write("%d " % (lag))
            for j in range(n_timescales):
                f2.write("%f " % (msm_timescales[i, j]))
            f2.write('\n')

    plt.title('Discrete-time MSM Relaxation Timescales')
    plt.semilogy()
    plt.savefig(outfile_name + '.png')
    plt.close()
def test_multi_params():
    # A 3 x 2 parameter grid must yield six models, one per combination.
    msm = MarkovStateModel()
    param_grid = {
        'lag_time': [1, 2, 3],
        'reversible_type': ['mle', 'transpose'],
    }
    sequences = np.random.randint(20, size=(10, 1000))

    models = param_sweep(msm, sequences, param_grid, n_jobs=2)
    assert len(models) == 6

    # I don't know what the order should be, so I'm just going
    # to check that there are no duplicates
    params = ['%s%d' % (m.reversible_type, m.lag_time) for m in models]

    for lag in param_grid['lag_time']:
        for kind in param_grid['reversible_type']:
            label = '%s%d' % (kind, lag)
            assert label in params

    # this is redundant, but w/e
    assert len(set(params)) == 6
def test_countsmat():
    # Sample transition matrices row-wise from Dirichlet distributions
    # parameterized by a fixed counts matrix, and collect the eigenvalues
    # and timescales of every sample.
    model = MarkovStateModel(verbose=False)
    C = np.array([[4380, 153, 15, 2, 0, 0],
                  [211, 4788, 1, 0, 0, 0],
                  [169, 1, 4604, 226, 0, 0],
                  [3, 13, 158, 4823, 3, 0],
                  [0, 0, 0, 4, 4978, 18],
                  [7, 5, 0, 0, 62, 4926]], dtype=float)
    # Added to every entry, so all Dirichlet parameters are positive.
    C = C + (1.0 / 6.0)
    model.n_states_ = C.shape[0]
    model.countsmat_ = C
    model.transmat_, model.populations_ = model._fit_mle(C)

    n_trials = 5000
    random = np.random.RandomState(0)
    all_timescales = np.zeros((n_trials, model.n_states_ - 1))
    all_eigenvalues = np.zeros((n_trials, model.n_states_))
    for trial in range(n_trials):
        # Iterate the rows directly: reusing the trial index inside the
        # comprehension would overwrite it on Python 2 and send every
        # sample to the same output row.
        T = np.vstack([random.dirichlet(row) for row in C])
        u = _solve_msm_eigensystem(T, k=6)[0]
        u = np.real(u)  # quiet warning. Don't know if this is legit
        all_eigenvalues[trial] = u
        # The leading eigenvalue is skipped.
        all_timescales[trial] = -1 / np.log(u[1:])
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel
import numpy as np
import msmexplorer as msme

# Seeded random state, passed to the clusterer below.
rs = np.random.RandomState(42)

# Load Fs Peptide Data
trajs = FsPeptide().get().trajectories

# Extract Backbone Dihedrals
featurizer = DihedralFeaturizer(types=['phi', 'psi'])
diheds = featurizer.fit_transform(trajs)

# Perform Dimensionality Reduction
tica_model = tICA(lag_time=2, n_components=2)
tica_trajs = tica_model.fit_transform(diheds)

# Perform Clustering
clusterer = MiniBatchKMeans(n_clusters=100, random_state=rs)
clustered_trajs = clusterer.fit_transform(tica_trajs)

# Construct MSM
msm = MarkovStateModel(lag_time=2)
msm.fit(clustered_trajs)

# Plot the fitted MSM with msmexplorer's plot_pop_resids
msme.plot_pop_resids(msm, color='tarragon')
# Globals num_procs = 5 # Should pick this up from Slurm E-V. #traj_dir = '/mnt/storage/home/ra15808/scratch/train' traj_dir = '/panfs/panasas01/chem/ra15808/Datasets/DHFR/train' # traj_dir = '/Users/robert_arbon/Datasets/DHFR/train' trial_db = 'best_trials.pickl' output_db = trial_db.split('.')[0]+'-'+str(new_n_timescales)+'.pickl' # Pipelines pipe = Pipeline([ ('variance_cut', VarianceThreshold()), ('tica', tICA(kinetic_mapping=True)), ('cluster', MiniBatchKMeans()), ('msm', MarkovStateModel(n_timescales=2, lag_time=50, verbose=True))]) # Get old results best = pd.read_pickle(trial_db) best.sort_values(by='feature', inplace=True) # Setup results dictionary results = {'id': [], 'strategy': [], 'test_scores-{}'.format(new_n_timescales): []} # Loop cv = ShuffleSplit(n_splits=5, test_size=0.5, random_state=42) old_feature = 'none' for i, row in best.iterrows(): print('---Running {}---'.format(i)) # Get dataset if row['feature'] != old_feature:
this_seq = util.featurize_RawPos(inds_N, [this_sim]) sequences_all.extend(this_seq) seq_path = '/home/shenglan/TryMSMbuilder/output/ten_ligands/sequences' + '_s' + str( LOAD_STRIDE) + '.out' pickle.dump(sequences_all, open(seq_path, 'wb')) clustering = KCenters(n_clusters=N_CLUSTER) geo_assign = clustering.fit_predict(sequences_all) centers = clustering.cluster_centers_ geo_assign_path = '/home/shenglan/TryMSMbuilder/output/ten_ligands/KC_geoassign_c' \ +str(N_CLUSTER)+'_s'+str(LOAD_STRIDE)+'.out' pickle.dump(geo_assign, open(geo_assign_path, 'wb')) micro_msm = MarkovStateModel(lag_time=1, reversible_type='transpose', ergodic_cutoff='off', verbose=True).fit(geo_assign) msm_path = '/home/shenglan/TryMSMbuilder/output/ten_ligands/KC_msm_c'+str(N_CLUSTER)+ \ '_s'+str(LOAD_STRIDE)+'.out' pickle.dump(micro_msm, open(msm_path, 'wb')) # map assignments print('There are %d microstates in msm' % micro_msm.n_states_) raw_clusters = [] for this_assign in geo_assign: raw_clusters.extend(np.unique(this_assign)) raw_clusters = np.unique(np.array(raw_clusters)) print('There are %d clusters in the original geometric clustering.' % len(raw_clusters))
txx, delimiter=',') # clustering from msmbuilder.cluster import MiniBatchKMeans clusterer = MiniBatchKMeans(n_clusters=num_clusters) #100 for camodulin clustered_trajs = tica_trajs.fit_transform_with(clusterer, 'kmeans/', fmt='dir-npy') # msm builder from msmbuilder.msm import MarkovStateModel from msmbuilder.utils import dump if which_dataset == 'fspeptide': msm = MarkovStateModel(lag_time=2, n_timescales=20, ergodic_cutoff='on') if which_dataset == 'apo_calmodulin': msm = MarkovStateModel(lag_time=20, n_timescales=20, ergodic_cutoff='on') msm.fit(clustered_trajs) # Concatenate the trajectories in cluster indices cluster_indices = np.concatenate(clustered_trajs) # Compile X if feature == 'XYZ': temp = xyz[0] _, num_atoms, num_axis = temp.xyz.shape reference_frame = temp.slice(0, copy=True) num_features = num_atoms * num_axis pre_X = [
rs = np.random.RandomState(42) # Load Fs Peptide Data trajs = FsPeptide().get().trajectories # Extract Backbone Dihedrals featurizer = DihedralFeaturizer(types=['chi1']) diheds = featurizer.fit_transform(trajs) # Perform Dimensionality Reduction tica_model = tICA(lag_time=2, n_components=2) tica_trajs = tica_model.fit_transform(diheds) # Perform Clustering clusterer = MiniBatchKMeans(n_clusters=12, random_state=rs) clustered_trajs = clusterer.fit_transform(tica_trajs) # Construct MSM msm = MarkovStateModel(lag_time=2) assignments = msm.fit_transform(clustered_trajs) # Plot Stacked Distributions a = np.concatenate(assignments, axis=0) d = np.concatenate(diheds, axis=0) # Plot Stacked Distributions of the sine of each Chi1 angle # within an arbitrary set of states {2, 5, 0} path_data = [d[a == i][:, ::2] for i in [2, 5, 0]] msme.plot_stackdist(path_data)
def test_bace_2():
    # Run an MSM -> BACE pipeline on the metastable test system.
    assignments, ref_macrostate_assignments = _metastable_system()

    steps = [('msm', MarkovStateModel()),
             ('bace', BACE(n_macrostates=2))]
    pipeline = Pipeline(steps)
    macro_assignments = pipeline.fit_transform(assignments)[0]

    # NOTE(review): this checks the input assignments, not
    # macro_assignments or the reference labels -- confirm intent.
    lowest = np.min(assignments)
    assert (lowest >= 0)
def test_7():
    # test timescales: the number of timescales follows the number of
    # states and is capped by n_timescales
    two_state = [[0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1]]
    three_state = [[0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 2, 2, 0, 0]]

    model = MarkovStateModel()
    model.fit(two_state)
    assert np.all(np.isfinite(model.timescales_))
    assert len(model.timescales_) == 1

    # Refit the same instance on data with a third state.
    model.fit(three_state)
    assert np.all(np.isfinite(model.timescales_))
    assert len(model.timescales_) == 2
    assert model.n_states_ == 3

    model = MarkovStateModel(n_timescales=1)
    model.fit(three_state)
    assert len(model.timescales_) == 1

    # Asking for more timescales than exist returns only what exists.
    model = MarkovStateModel(n_timescales=100)
    model.fit(three_state)
    assert len(model.timescales_) == 2

    # Tolerance-based check: a floating-point sum of normalized
    # populations need not be bit-exactly 1.0.
    np.testing.assert_almost_equal(np.sum(model.populations_), 1.0)
# rapidly. Note that we transform our trajectories from the n_components-dimensional # tICA space into a 1-dimensional cluster index txx = np.concatenate(tica_trajs) #_ = msme.plot_histogram(txx) clusterer = MiniBatchKMeans(n_clusters=int(args.clusters), random_state=42) clustered_trajs = tica_trajs.fit_transform_with(clusterer, 'kmeans/', fmt='dir-npy') #plt.figure() #plt.hexbin(txx[:,0], txx[:,1], bins='log', mincnt=1, cmap='viridis') #plt.scatter(clusterer.cluster_centers_[:,0], clusterer.cluster_centers_[:,1], s=100, c='w') #plt.savefig('microstate_clusters.png') # We can construct an MSM from the labeled trajectories msm = MarkovStateModel(lag_time=int(args.lag), n_timescales=20) msm.fit(clustered_trajs) assignments = clusterer.partial_transform(txx) assignments = msm.partial_transform(assignments) #msme.plot_free_energy(txx, obs=(0, 1), n_samples=10000, # pi=msm.populations_[assignments], # xlabel='tIC 1', ylabel='tIC 2') #plt.figure() #plt.scatter(clusterer.cluster_centers_[msm.state_labels_, 0], # clusterer.cluster_centers_[msm.state_labels_, 1], # s=1e4 * msm.populations_, # size by population # c=msm.left_eigenvectors_[:, 1], # color by eigenvector # cmap="coolwarm", # zorder=3) #plt.colorbar(label='First dynamical eigenvector') #plt.tight_layout()
def calculate_fitness(population_dihedral, diheds, score_global, i, lock):
    """Score one feature subset by cross-validated MSM performance.

    Selects the columns ``population_dihedral`` from every array in
    ``diheds``, runs tICA -> MiniBatchKMeans -> MSM with 5-fold cross
    validation, and stores the median test score in ``score_global[i]``
    while holding ``lock``. Returns nothing.

    Parameters
    ----------
    population_dihedral : column indexer applied to each array in
        ``diheds``.
    diheds : sequence of 2-D feature arrays.
    score_global : mapping updated in place with ``{i: score}``.
    i : key under which the score is stored.
    lock : object with ``acquire()`` / ``release()`` guarding
        ``score_global``.
    """
    import pandas as pd
    import numpy as np

    pop_index = i
    new_diheds = [X[:, population_dihedral] for X in diheds]

    from msmbuilder.preprocessing import RobustScaler
    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(new_diheds)
    # NOTE(review): the scaled result is discarded right away, so the
    # unscaled features are what tICA sees -- confirm this is deliberate.
    scaled_diheds = new_diheds

    from msmbuilder.decomposition import tICA
    tica_model = tICA(lag_time=2, n_components=5)
    tica_model.fit(scaled_diheds)
    tica_trajs = tica_model.transform(scaled_diheds)

    from msmbuilder.cluster import MiniBatchKMeans
    clusterer = MiniBatchKMeans(n_clusters=200, random_state=42)
    clustered_trajs = clusterer.fit_transform(tica_trajs)

    from msmbuilder.msm import MarkovStateModel
    msm = MarkovStateModel(lag_time=50, n_timescales=5)

    from sklearn.cross_validation import KFold
    n_states = [4]
    cv = KFold(len(clustered_trajs), n_folds=5)
    results = []
    for n in n_states:
        msm.n_states_ = n
        for fold, (train_index, test_index) in enumerate(cv):
            train_data = [clustered_trajs[idx] for idx in train_index]
            test_data = [clustered_trajs[idx] for idx in test_index]
            msm.fit(train_data)
            train_score = msm.score(train_data)
            test_score = msm.score(test_data)
            time_score = msm.timescales_[0]
            time_test_score = time_score + test_score
            print(time_score)
            print(test_score)
            # Mean of the first timescale and the test score.
            av_score = time_test_score / 2
            results.append({
                'train_score': train_score,
                'test_score': test_score,
                'time_score': time_score,
                'av_score': av_score,
                'n_states': n,
                'fold': fold
            })
            print(msm.timescales_)

    results = pd.DataFrame(results)
    avgs = (results.groupby('n_states').aggregate(np.median).drop('fold', axis=1))
    best_nt = avgs['test_score'].idxmax()
    best_n = avgs['av_score'].idxmax()
    best_score = avgs.loc[best_n, 'av_score']
    best_scorent = avgs.loc[best_nt, 'test_score']
    print(best_scorent)

    lock.acquire()
    try:
        score_global.update({pop_index: best_scorent})
    finally:
        # Release even if the update raises, so other workers are not
        # left blocked.
        lock.release()
kcenters_sequences = kcenters.fit_predict( tica_sequences) #here it is ground state tica sequences print "begin to plot the microstate implied timescale into the objective dir" #plot implied timescale lag_times = range(10, 100, 10) #adjust variables n_timescales = 5 #adjust variables msm_timescales = implied_timescales(kcenters_sequences, lag_times, n_timescales=n_timescales, msm=MarkovStateModel( verbose=True, reversible_type='transpose')) outfile_name = "%s/GS_ITS_tic%d_lagtime%d_clustersize%d.dat" % ( outputdir, num_tics_for_clustering, tic_lag_time, nMicro) print msm_timescales print msm_timescales.shape for k in range(n_timescales): plt.plot(lag_times, msm_timescales[:, k], 'o-') f2 = open(outfile_name, 'w') for i in range(len(lag_times)): f2.write("%d " % (lag_times[i])) for j in range(n_timescales): f2.write("%f " % (msm_timescales[i, j])) f2.write('\n')
# TIMESCALES # # The data will be loaded with a stride of 10 frames. Each fame is 50ps, so the time per frame will be # 500ps/frame or 0.5ns/frame. # Each trajectory is 1000 frames long # Lag time will be 40 frames (20 ns) based on a visual inspection of /Misc/MSM_lag_time.ipynb features = tica_unstructured_features to_ns = 0.5 msm_lag = int(40/to_ns) # # MODEL # pipe = Pipeline([('features', FeatureSelector(features=tica_unstructured_features)), ('variance_cut', VarianceThreshold()), ('scaling', RobustScaler()), ('tica', tICA(kinetic_mapping=True)), ('cluster', MiniBatchKMeans()), ('msm', MarkovStateModel(lag_time=msm_lag, verbose=False, n_timescales=2))]) # # SAVE MODEL # savedir = 'rand-tica-all' save_generic(pipe, '{}/model.pickl'.format(savedir)) print_feature_names(features, join(savedir, 'feature_list.txt'))
from msmbuilder.msm import MarkovStateModel from sklearn.pipeline import Pipeline import os from ..adaptive import create_folder logging.disable(logging.CRITICAL) parser = NumberedRunsParser(traj_fmt='run-{run}.nc', top_fn='data_app/runs/structure.prmtop', step_ps=200) meta = gather_metadata('/'.join(['data_app/runs/', '*nc']), parser) model = Pipeline([('feat', DihedralFeaturizer()), ('scaler', MinMaxScaler()), ('tICA', tICA(lag_time=1, n_components=4)), ('clusterer', MiniBatchKMeans(n_clusters=5)), ('msm', MarkovStateModel(lag_time=1, n_timescales=4))]) spawns = [ (0, 1), ] epoch = 1 class TestAppBase: def __init__(self): self.app = App(generator_folder='data_app/generators', data_folder='data_app/runs', input_folder='data_app/inputs', filtered_folder='data_app/filtered_trajs', model_folder='data_app/model', build_folder='data_app/build',
# # TIMESCALES # # The data will be loaded with a stride of 10 frames. Each fame is 50ps, so the time per frame will be # 500ps/frame or 0.5ns/frame. # Each trajectory is 1000 frames long # Lag time will be 40 frames (20 ns) based on a visual inspection of /Misc/MSM_lag_time.ipynb to_ns = 0.5 msm_lag = int(40 / to_ns) # # FEATURE INDICES # all_idx = np.load('indices_all.npy') # # OTHER PARAMETERS # ref_traj = md.load('../Data/data/trajectory-1.xtc', top='../Data/data/fs-peptide.pdb') featurizer = FeatureSelector(features=feats) pipe = Pipeline([('features', featurizer), ('variance_cut', VarianceThreshold()), ('scaling', RobustScaler()), ('cluster', MiniBatchKMeans()), ('msm', MarkovStateModel(lag_time=msm_lag, verbose=False))]) save_generic(pipe, 'model.pickl')
from multiprocessing import Pool
import pandas as pd
from msmbuilder.featurizer import DihedralFeaturizer, KappaAngleFeaturizer
from sklearn.model_selection import cross_val_score, cross_val_predict

# Globals
num_procs = 5
traj_dir = '/mnt/storage/home/ra15808/scratch/train'
# traj_dir = '/Users/robert_arbon/Datasets/DHFR/train'

# Two pipelines that differ only in the MSM step: a fixed number of
# timescales versus use_gap='timescales'.
pipe_fixed = Pipeline([('variance_cut', VarianceThreshold()),
                       ('tica', tICA(kinetic_mapping=True)),
                       ('cluster', MiniBatchKMeans()),
                       ('msm', MarkovStateModel(n_timescales=2, lag_time=50, verbose=True))])
pipe_csp = Pipeline([('variance_cut', VarianceThreshold()),
                     ('tica', tICA(kinetic_mapping=True)),
                     ('cluster', MiniBatchKMeans()),
                     ('msm', MarkovStateModel(use_gap='timescales', lag_time=50, verbose=True))])

# Previously saved trials, ordered by their 'rank' column.
best = pd.read_pickle('best_trials.pickl')
best.sort_values(by='rank', inplace=True)

# Accumulator for the new results, one list per output column.
results = {'id': [], 'new_test_scores': [], 'strategy': []}
#print len(sequences_all) #print sequences_all[-1].shape #average position of Asp113 #res_pos_ave = np.mean(res_pos_A_1[0],axis = 0) # time_step = util.calc_time_step(times_path,stride = LOAD_STRIDE) # clustering = KCenters(n_clusters = 10) assignments = clustering.fit_predict(sequences_all) centers = clustering.cluster_centers_ #print len(assignments) #print assignments[1].shape msm = MarkovStateModel(lag_time=180, verbose=True).fit(assignments) countsmat = msm.countsmat_ transmat = msm.transmat_ #print np.sum(countsmat) #np.savetxt('/home/shenglan/TryMSMbuilder/output/assignments.out',assignments, fmt = '%3.0f') np.savetxt('/home/shenglan/TryMSMbuilder/output/countsmat.out',countsmat,fmt = '%8.4g') np.savetxt('/home/shenglan/TryMSMbuilder/output/transmat.out',transmat,fmt = '%10.4g') #try different lag_times msmts0 = {} lag_times = [1,20,40,60,80,100,120,140,160,180] n_states = [5,10,15,30] for n in n_states:
def test_from_msm_2():
    """PCCAPlus.from_msm should keep the objective function it was given."""
    # Only the first return value is used; the second is discarded.
    sequences, _ = _metastable_system()

    model = MarkovStateModel()
    model.fit(sequences)

    objective = 'crispness'
    lumped = PCCAPlus.from_msm(model, 2, objective)
    assert lumped.objective_function == objective
from nose.plugins.skip import SkipTest
import numpy as np
from msmbuilder.msm import MarkovStateModel, BayesianMarkovStateModel
from matplotlib.axes import SubplotBase

from ..plots import plot_tpaths
from . import PlotTestCase

# Module-level fixtures shared by the tests below: one seeded random sequence
# of integers in [0, 10) and two models fitted on it at import time.
rs = np.random.RandomState(42)
data = rs.randint(low=0, high=10, size=100000)

msm = MarkovStateModel()
msm.fit(data)

bmsm = BayesianMarkovStateModel()
bmsm.fit(data)


class TestTPTPlot(PlotTestCase):
    """Tests for the function(s) that draw transition paths (TPT)."""

    def test_plot_tpaths_msm(self):
        # Paths are requested between states 0 and 9 of the plain MSM.
        assert isinstance(plot_tpaths(msm, 0, 9), SubplotBase)

    # NOTE(review): SkipTest is applied directly as a decorator here --
    # confirm this disables the test as intended under the runner in use.
    @SkipTest
    def test_plot_tpaths_bmsm(self):
        # Same check as above, using the Bayesian model.
        assert isinstance(plot_tpaths(bmsm, 0, 9), SubplotBase)
def test_from_msm():
    """Smoke-test ``from_msm`` on each lumping class with a freshly fitted MSM."""
    sequences, _ = _metastable_system()

    # Each lumper gets its own newly fitted model, in the same order as the
    # original copy-pasted stanzas; the constructed lumpers are not inspected.
    for lumper_cls in (PCCA, PCCAPlus, MVCA, BACE):
        model = MarkovStateModel()
        model.fit(sequences)
        lumper_cls.from_msm(model, 2)
plt.semilogy() plt.yticks(fontsize=18) plt.xlabel('Lag times ', fontsize=22) plt.ylabel('Implied times ', fontsize=22) plt.savefig(outname) plt.close() implied_times() msm_timescales_d = implied_timescales(sequences, lag_times, n_timescales=n_timescales, n_jobs=1, msm=MarkovStateModel( verbose=True, reversible_type='transpose', ergodic_cutoff=0), verbose=1) plot(msm_timescales_d, 'Discrete-time MSM Relaxation Timescales', 'imp_times_t_erg_off.png') msm_timescales_d_mle = implied_timescales(sequences, lag_times, n_timescales=n_timescales, n_jobs=1, msm=MarkovStateModel(verbose=True), verbose=1) plot(msm_timescales_d_mle, 'Discrete-time MSM Relaxation Timescales MLE', 'imp_times_mle.png') msm_timescales_c = implied_timescales(sequences,
# --- Raw dihedral features (featurizer built with sincos=False) -------------
f = DihedralFeaturizer(sincos=False)
dump(f, "raw_featurizer.pkl")
feat = f.transform(trj_list)
dump(feat, "raw_features.pkl")

# --- Features from the featurizer stored on disk ----------------------------
# `f` and `feat` are deliberately rebound: from here on they refer to the
# loaded featurizer and its output, not the raw ones above.
f = load("./featurizer.pkl")
dump(f, "featurizer.pkl")

# Feature descriptions are generated from the first trajectory only.
df1 = pd.DataFrame(f.describe_features(trj_list[0]))
dump(df1, "feature_descriptor.pkl")

feat = f.transform(trj_list)
dump(feat, "features.pkl")

# --- tICA projection ---------------------------------------------------------
t = tICA(lag_time=100, n_components=2, kinetic_mapping=False)
tica_feat = t.fit_transform(feat)
dump(t, "tica_mdl.pkl")
dump(tica_feat, "tica_features.pkl")

# --- Clustering and MSM ------------------------------------------------------
# Both constructors receive a single positional argument; presumably the
# cluster count and the lag time respectively -- confirm against their
# signatures.
kmeans_mdl = KMeans(50)
ass = kmeans_mdl.fit_predict(tica_feat)

msm_mdl = MarkovStateModel(100)
msm_mdl.fit(ass)

# Written only after both models have been fitted, in the original order.
for _obj, _fname in (
    (kmeans_mdl, "kmeans_mdl.pkl"),
    (ass, "assignments.pkl"),
    (msm_mdl, "msm_mdl.pkl"),
):
    dump(_obj, _fname)
traj_num = traj_num + 1 temp = np.loadtxt(line.strip()) kcenters_sequences.append(temp.tolist()) microstate_lagtime = 50 reversible = 'none' initial = 10 ending = 400 interval = pow(ending * 1.0 / initial, 1.0 / 20) lag_times = [] for j in range(20): lag_times.append(initial * pow(interval, j)) #lag_times=range(10,100,10) msm = MarkovStateModel(verbose=True, lag_time=microstate_lagtime, reversible_type=reversible, ergodic_cutoff='on') msm.fit(kcenters_sequences) print msm.mapping_ print("for microstate lag time = ", microstate_lagtime, ",", msm.n_states_, " states are left") np.savetxt("kcenters_microstate_%s_transmat_.txt" % (reversible), msm.transmat_) np.savetxt("kcenters_%s_stationary_population" % (reversible), msm.populations_) #plot implied timescale n_timescales = 10 print "lagtime list is:", lag_times msm_timescales = implied_timescales(kcenters_sequences,
import msmexplorer as msme
from msmexplorer.example_datasets import FsPeptide

# Seeded generator, handed to the clusterer below.
rs = np.random.RandomState(42)

# Load Fs Peptide Data
dataset = FsPeptide().get()
trajs = dataset.trajectories

# Extract Backbone Dihedrals
featurizer = DihedralFeaturizer(types=['phi', 'psi'])
diheds = featurizer.fit_transform(trajs)

# Perform Dimensionality Reduction
tica_model = tICA(lag_time=2, n_components=4)
tica_trajs = tica_model.fit_transform(diheds)

# Perform Clustering
clusterer = MiniBatchKMeans(n_clusters=100, random_state=rs)
clustered_trajs = clusterer.fit_transform(tica_trajs)

# Construct MSM
msm = MarkovStateModel(lag_time=2, n_timescales=5)
msm.fit(clustered_trajs)

# Plot Timescales
# Five palette names, the same count as n_timescales above.
colors = [
    'pomegranate',
    'beryl',
    'tarragon',
    'rawdenim',
    'carbon',
]
msme.plot_timescales(
    msm,
    ylabel='Implied Timescales ($ns$)',
    color_palette=colors,
)
# traj_load is mapped over the metadata rows and must yield (key, trajectory)
# pairs, since its output is fed straight into dict().
traj_dict = dict(map(traj_load, meta.iterrows()))

# Keep only trajectories with more than 1000 frames.
trajs = [t for t in traj_dict.values() if t.n_frames > 1000]
print(len(trajs))

num_clust = 20
# `totframes` is not assigned in this section; it has to be in scope already.
n_landmarks = int(totframes / 100)
cluster = LandmarkAgglomerative(
    n_clusters=num_clust,
    n_landmarks=n_landmarks,
    linkage='ward',
    metric='rmsd',
)
ctrajs = cluster.fit_transform(trajs)

# Disabled alternative that labelled every trajectory via partial_predict:
# print('Fitting cluster labels for MSM')
# ctraj = {}
# count = 0
# for k, v in traj_dict.items():
#     print(k, count)
#     count +=1
#     ctraj[k] = cluster.partial_predict(v)
#
# ctrajs = [traj for traj in ctraj.values() if traj.shape[0] > 1000]

print('Fitting MSM')
lag = 4000
msm = MarkovStateModel(lag_time=lag, n_timescales=50)
msm.fit(ctrajs)

# save_trajs(ctraj, 'results/nclusters-{0}-ctraj'.format(num_clust), meta)
clusterer_path = 'results/clusterer-nclusters-{0}.pickle'.format(num_clust)
save_generic(cluster, clusterer_path)

msm_path = 'results/msm-lag-{0}-nclusters-{1}.pickl'.format(lag, num_clust)
save_generic(msm, msm_path)