示例#1
0
    def setUpClass(cls):
        """Build the fixtures shared by the tests of this class.

        Creates a temporary directory (``cls.dtraj_dir``), draws a 2-D
        trajectory ``cls.X`` from a five-component Gaussian mixture (1000
        consecutive frames per component, ``cls.T`` frames in total) and
        clusters it with k-means, regular-space and uniform-time clustering.
        The clustering objects are stored as ``cls.km``, ``cls.rs`` and
        ``cls.rt`` and collected in ``cls.cl``.
        """
        super(TestCluster, cls).setUpClass()
        cls.dtraj_dir = tempfile.mkdtemp()

        # generate Gaussian mixture
        means = [
            np.array([-3, 0]),
            np.array([-1, 1]),
            np.array([0, 0]),
            np.array([1, -1]),
            np.array([4, 2])
        ]
        widths = [
            np.array([0.3, 2]),
            np.array([0.3, 2]),
            np.array([0.3, 2]),
            np.array([0.3, 2]),
            np.array([0.3, 2])
        ]
        # continuous trajectory
        nsample = 1000
        cls.T = len(means) * nsample
        cls.X = np.zeros((cls.T, 2))
        for i in range(len(means)):
            frames = slice(i * nsample, (i + 1) * nsample)
            # Draw one deviate per frame. A bare randn() returns a single
            # scalar that gets broadcast over the slice, which would collapse
            # each mixture component to one repeated point.
            cls.X[frames, 0] = (widths[i][0] * np.random.randn(nsample)
                                + means[i][0])
            cls.X[frames, 1] = (widths[i][1] * np.random.randn(nsample)
                                + means[i][1])
        # cluster in different ways
        cls.km = coor.cluster_kmeans(data=cls.X, k=100)
        cls.rs = coor.cluster_regspace(data=cls.X, dmin=0.5)
        cls.rt = coor.cluster_uniform_time(data=cls.X, k=100)
        cls.cl = [cls.km, cls.rs, cls.rt]
示例#2
0
    def setUpClass(cls):
        """Build the fixtures shared by the tests of this class.

        Creates a temporary directory (``cls.dtraj_dir``), samples a 2-D
        trajectory ``cls.X`` from a five-component Gaussian mixture (1000
        consecutive frames per component, ``cls.T`` frames in total) and
        clusters it three ways; the results are kept in ``cls.km``,
        ``cls.rs``, ``cls.rt`` and the list ``cls.cl``.
        """
        super(TestCluster, cls).setUpClass()
        cls.dtraj_dir = tempfile.mkdtemp()

        # generate Gaussian mixture
        means = [np.array([-3,0]),
                 np.array([-1,1]),
                 np.array([0,0]),
                 np.array([1,-1]),
                 np.array([4,2])]
        widths = [np.array([0.3,2]),
                  np.array([0.3,2]),
                  np.array([0.3,2]),
                  np.array([0.3,2]),
                  np.array([0.3,2])]
        # continuous trajectory
        nsample = 1000
        cls.T = len(means)*nsample
        cls.X = np.zeros((cls.T, 2))
        for i, (mean, width) in enumerate(zip(means, widths)):
            # One independent deviate per frame and coordinate. The scalar
            # returned by a bare randn() would be broadcast over the whole
            # slice and turn each component into a single repeated point.
            noise = np.random.randn(nsample, 2)
            cls.X[i*nsample:(i+1)*nsample, :] = width * noise + mean
        # cluster in different ways
        cls.km = coor.cluster_kmeans(data = cls.X, k = 100)
        cls.rs = coor.cluster_regspace(data = cls.X, dmin=0.5)
        cls.rt = coor.cluster_uniform_time(data = cls.X, k = 100)
        cls.cl = [cls.km, cls.rs, cls.rt]
示例#3
0
    def test_with_data_in_mem(self):
        """Discretize in-memory arrays through a PCA + k-means pipeline.

        Three random trajectories with 50 columns each are wrapped by
        ``source``, projected to two dimensions and clustered into ten
        centers. Every resulting discrete trajectory must only contain state
        indices below the number of centers.
        """
        import pyemma.coordinates as api

        traj_lengths = (100, 103, 33)
        data = [np.random.random((length, 50)) for length in traj_lengths]
        reader = source(data)
        assert isinstance(reader, DataInMemory)

        n_centers = 10
        tpca = api.pca(dim=2)
        km = api.cluster_kmeans(k=n_centers)

        disc = api.discretizer(reader, tpca, km)
        disc.parametrize()

        for dtraj in disc.dtrajs:
            # largest state index that occurs in this trajectory
            n_states = np.max(np.unique(dtraj))
            self.assertGreaterEqual(
                n_centers - 1,
                n_states,
                "dtraj has more states than cluster centers",
            )
示例#4
0
def project_and_cluster(trajfiles,
                        featurizer,
                        sparsify=False,
                        tica=True,
                        lag=100000,
                        scale=True,
                        var_cutoff=1.0,
                        ncluster=100,
                        cluster=True):
    """Load features, project them with TICA or PCA and optionally cluster.

    Parameters
    ----------
    trajfiles
        Trajectory files handed to ``coor.load``.
    featurizer
        Featurizer handed to ``coor.load``.
    sparsify : bool
        If true, pass the loaded data through ``remove_constant`` first.
    tica : bool
        Project with ``coor.tica`` when true, otherwise with ``coor.pca``.
    lag : int
        Lag time forwarded to ``coor.tica`` (unused for PCA).
    scale : bool
        If true, multiply every output trajectory in place by the leading
        eigenvalues of the transformation.
    var_cutoff : float
        Forwarded to the transformation as ``var_cutoff``.
    ncluster : int
        Number of k-means centers.
    cluster : bool
        Run k-means on the projected data. Previously the body tested an
        undefined local name; the flag is now an explicit keyword that
        defaults to the documented three-value return.

    Returns
    -------
    trans_obj, Y, clustering
        The transformation object, its output and the k-means object.  When
        ``cluster`` is false only ``trans_obj, Y`` is returned.

    """
    X = coor.load(trajfiles, featurizer)
    if sparsify:
        X = remove_constant(X)
    if tica:
        trans_obj = coor.tica(X, lag=lag, var_cutoff=var_cutoff)
    else:
        trans_obj = coor.pca(X, dim=-1, var_cutoff=var_cutoff)
    Y = trans_obj.get_output()
    if scale:
        for y in Y:
            # in-place scaling of each output trajectory
            y *= trans_obj.eigenvalues[:trans_obj.dimension()]
    if not cluster:
        return trans_obj, Y
    cl_obj = coor.cluster_kmeans(Y,
                                 k=ncluster,
                                 max_iter=3,
                                 fixed_seed=True)
    return trans_obj, Y, cl_obj
示例#5
0
def clusterTrajectories(trajectories, numClusters, stride=1):
    """Run k-means on ``trajectories`` and return the clustering object.

    ``numClusters`` centers are requested, at most 20 iterations are
    allowed and ``stride`` is forwarded unchanged to
    ``coor.cluster_kmeans``.
    """
    kmeans_options = dict(k=numClusters, max_iter=20, stride=stride)
    return coor.cluster_kmeans(data=trajectories, **kmeans_options)
示例#6
0
    def test_clustering_kmeans(self):
        """Check that a k-means estimator exposes the parameters it was given."""
        params = dict(k=10, init_strategy='uniform', max_iter=42,
                      metric='minRMSD', stride=1)
        cl = coor.cluster_kmeans([np.random.random((100, 3))], **params)
        # compare() is handed the center count under 'n_clusters', not 'k'
        params['n_clusters'] = params.pop('k')
        # clustercenters is a model parameter, so it is expected in the output
        params['clustercenters'] = cl.clustercenters

        self.compare(cl, params)
示例#7
0
 def cluster(self, trajectories):
     """Run k-means on ``trajectories`` and return the clustering object.

     The number of centers and the stride are read from ``self.numClusters``
     and ``self.stride``; at most 500 iterations are allowed.
     """
     kmeans_options = dict(k=self.numClusters,
                           max_iter=500,
                           stride=self.stride)
     return coor.cluster_kmeans(data=trajectories, **kmeans_options)
示例#8
0
文件: test_cli.py 项目: nd1511/PyEMMA
    def setUpClass(cls):
        """Fit source -> TICA -> k-means on the BPTI test data and save it.

        The fitted k-means object is saved, together with its streaming
        chain, to a temporary path stored in ``cls.model_file``.
        """
        from pyemma.datasets import get_bpti_test_data

        bpti = get_bpti_test_data()
        reader = source(bpti['trajs'], top=bpti['top'])
        tica_obj = tica(reader, lag=1)
        kmeans = cluster_kmeans(tica_obj)

        # mktemp only returns a name; the file is created by save() below
        cls.model_file = tempfile.mktemp()
        kmeans.save(cls.model_file, save_streaming_chain=True)
def lengthVsNtrajs(data, nruns, lagtime, clusters, outputFilename, cache, m,
                   stride):
    """Score MSMs over a grid of cluster counts and lag times.

    For every entry of ``clusters`` the data is clustered with k-means, and
    for every entry of ``lagtime`` an MSM is estimated and scored (training
    score and the mean of ``nruns`` cross-validated scores, both with
    ``score_k=m``). Pairs found in ``cache`` are taken from there instead.
    Each successfully obtained pair of values is appended to
    ``outputFilename`` as ``"<clusters> <lag> <score> <cv score>"``.

    Returns the two ``(len(clusters), len(lagtime))`` arrays
    ``results, results_cv``; entries whose computation failed are 0.
    """
    grid_shape = (len(clusters), len(lagtime))
    results = np.zeros(grid_shape)
    results_cv = np.zeros(grid_shape)
    failure_msg = "Estimation error in %d clusters, %d lagtime"
    for row, cl in enumerate(clusters):
        clustering = coor.cluster_kmeans(data=data,
                                         k=cl,
                                         max_iter=500,
                                         stride=stride)
        for col, lag in enumerate(lagtime):
            key = (cl, lag)
            if key in cache:
                print(
                    "Loading cached computation for %d clusters and %d lagtime"
                    % key)
                results[row][col], results_cv[row][col] = cache[key]
            else:
                print("Computing for %d clusters and %d lagtime" % key)
                # A failure while estimating or while computing the training
                # score zeroes both entries and skips the file output.
                try:
                    MSM = msm.estimate_markov_model(clustering.dtrajs, lag)
                    print("MSM estimated on %d states" % MSM.nstates)
                    results[row][col] = np.mean(
                        MSM.score(MSM.dtrajs_full, score_k=m))
                except Exception:
                    print(failure_msg % key)
                    results[row][col] = 0.0
                    results_cv[row][col] = 0.0
                    continue
                # A failing cross-validation only zeroes the CV entry; the
                # line is still written.
                try:
                    results_cv[row][col] = np.mean(
                        MSM.score_cv(MSM.dtrajs_full, score_k=m, n=nruns))
                except Exception:
                    print(failure_msg % key)
                    results_cv[row][col] = 0.0

            with open(outputFilename, 'a') as f:
                f.write("%d %d %f %f\n" %
                        (cl, lag, results[row][col], results_cv[row][col]))
    return results, results_cv
示例#10
0
def estimateDG(data, nruns, cl, lag, ntraj, len_traj, skipFirstSnaphots,
               cluster_each_iteration):
    """Estimate a free-energy value from repeated trajectory subsamples.

    In each of ``nruns`` iterations a subset of trajectories is chosen with
    ``select_iteration_data``, cut to ``[skipFirstSnaphots:len_traj]``,
    discretized with k-means (``cl`` centers) and used to estimate an MSM at
    lag ``lag``. Depending on ``cluster_each_iteration`` the clustering is
    either redone on every subset or computed once on all of ``data`` and
    only used to assign the subset. Iterations whose MSM estimation fails
    are skipped.

    Returns the mean and the standard deviation of the collected values.
    """
    def run_kmeans(trajs):
        return coor.cluster_kmeans(data=trajs,
                                   k=cl,
                                   max_iter=500,
                                   stride=1)

    deltaG = []
    if not cluster_each_iteration:
        clustering = run_kmeans(data)
    d = 0.75
    for _ in range(nruns):
        chosen = select_iteration_data(data, ntraj)
        data_it = [data[j][skipFirstSnaphots:len_traj] for j in chosen]
        if cluster_each_iteration:
            clustering = run_kmeans(data_it)
            dtrajs = clustering.dtrajs
        else:
            dtrajs = clustering.assign(data_it)
        try:
            MSM = msm.estimate_markov_model(dtrajs, lag)
            print("MSM estimated on %d states" % MSM.nstates)
        except Exception:
            print(
                "Estimation error in %d clusters, %d lagtime, %d trajectories of %d steps"
                % (cl, lag, ntraj, len_traj))
            continue
        pi, cl_centers = compute.ensure_connectivity(
            MSM, clustering.clustercenters)
        bins = compute.create_box(cl_centers, data_it, d)
        microstateVolume = compute.calculate_microstate_volumes_new(
            cl_centers, data_it, bins, d)
        _, pmf_text = compute.calculate_pmf(microstateVolume, pi)
        # the value of interest is the second whitespace-separated token
        deltaG.append(float(pmf_text.split()[1]))
    return np.mean(deltaG), np.std(deltaG)
示例#11
0
def multi_temperature_tram(feat, trajfiles, temperatures, dtrajs=None, stride=1, tica_lag=100,
        keep_tica_dims=20, n_clusters=100, tram_lag=100, engfile="Etot.dat", usecols=(1,), kb=0.0083145):
    """Estimate a multi-temperature TRAM model from trajectories and energies.

    Parameters
    ----------
    feat : obj, pyemma.coor.featurizer
        Featurizer object that already has the appropriate features added.
    trajfiles : list
        Names of trajectories to include in estimation.
    temperatures : list
        Temperatures of corresponding trajectories.
    dtrajs : list, optional
        Precomputed discrete trajectories. When None they are obtained by
        TICA followed by k-means on ``trajfiles``.
    stride : int
        Number of frames to skip in tica and clustering.
    tica_lag : int
        Lagtime to use for constructing tica.
    keep_tica_dims : int
        Number of dimensions to keep from tica. Somewhat ambiguous.
    n_clusters : int
        Number of clusters for kmeans. Somewhat ambiguous.
    tram_lag : int
        Lag time passed to the TRAM estimator.
    engfile : str
        Name of the energy file looked up in the directory of every
        trajectory. Files ending in ``npy`` are read with ``np.load``, all
        others with ``np.loadtxt``.
    usecols : tuple
        Columns read by ``np.loadtxt`` (ignored for ``npy`` files).
    kb : float
        Boltzmann constant used to form ``1 / (kb * T)``; presumably in
        kJ/(mol K) -- confirm against the units of the energy files.

    Returns
    -------
    dirs, dtrajs, tram
        Directory of each trajectory file, the discrete trajectories and the
        estimated model.
    """

    dirs = [ os.path.dirname(x) for x in trajfiles ]
    beta = [ 1./(kb*x) for x in temperatures ]

    if dtrajs is None:
        inp = coor.source(trajfiles, feat)

        tica_obj = coor.tica(inp, lag=tica_lag, dim=keep_tica_dims, stride=stride)
        Y = tica_obj.get_output()

        cl = coor.cluster_kmeans(data=Y, k=n_clusters, stride=stride)
        dtrajs = cl.dtrajs

    # os.path.join keeps a bare file name relative when the trajectory has no
    # directory part; plain "{}/{}" formatting would yield "/<engfile>".
    energy_files = [ os.path.join(d, engfile) for d in dirs ]

    # dimensionless energy
    if engfile.endswith("npy"):
        energy_trajs = [ beta[i]*np.load(f) for i, f in enumerate(energy_files) ]
    else:
        energy_trajs = [ beta[i]*np.loadtxt(f, usecols=usecols) for i, f in enumerate(energy_files) ]
    # one constant kb*T value per frame of the matching energy trajectory
    temp_trajs = [ kb*temperatures[i]*np.ones(energy_trajs[i].shape[0], float) for i in range(len(dirs)) ]

    # TRAM estimation
    tram = thermo.estimate_multi_temperature(energy_trajs, temp_trajs,
            dtrajs, energy_unit='kT', temp_unit='kT', estimator='tram',
            lag=tram_lag, maxiter=2000000, maxerr=1e-10)

    return dirs, dtrajs, tram
示例#12
0
def model_file():
    """Yield the path of a saved k-means model fitted on the BPTI test data.

    The model (source -> TICA -> k-means) is saved with its streaming chain
    to a temporary path. Whatever was created at that path is removed once
    the consumer is done, also when setup or the consumer raised.
    """
    import os

    path = None
    try:
        from pyemma.datasets import get_bpti_test_data
        d = get_bpti_test_data()
        trajs, top = d['trajs'], d['top']
        s = source(trajs, top=top)

        t = tica(s, lag=1)

        c = cluster_kmeans(t)
        path = tempfile.mktemp()
        c.save(path, save_streaming_chain=True)

        yield path
    finally:
        if path is not None:
            if os.path.isdir(path):
                shutil.rmtree(path, ignore_errors=True)
            else:
                # rmtree cannot delete a regular file; with ignore_errors it
                # would fail silently and leave the model behind.
                try:
                    os.remove(path)
                except OSError:
                    pass  # best-effort cleanup, e.g. save() never ran
示例#13
0
    dtraj_phi_3.append(dtraj_rama_3[i][:,1])
    dtraj_phi_4.append(dtraj_rama_4[i][:,1])
    dtraj_phi_5.append(dtraj_rama_5[i][:,1])
    dtraj_phi_6.append(dtraj_rama_6[i][:,1])


# **simple clustering along psi only for discretization**

# In[7]:

n_clusters = 2     # number of k-means clusters


# In[8]:

# One independent k-means run per data set (2..6), all with the same
# settings: n_clusters centers, at most 100 iterations, tolerance 1e-12.
# NOTE(review): the heading above this cell says "psi" while the inputs are
# named dtraj_phi_* -- confirm which angle column 1 actually holds.
clustering_rama_2 = coor.cluster_kmeans(dtraj_phi_2,k=n_clusters,max_iter=100, tolerance=1e-12, fixed_seed=True)
clustering_rama_3 = coor.cluster_kmeans(dtraj_phi_3,k=n_clusters,max_iter=100, tolerance=1e-12, fixed_seed=True)
clustering_rama_4 = coor.cluster_kmeans(dtraj_phi_4,k=n_clusters,max_iter=100, tolerance=1e-12, fixed_seed=True)
clustering_rama_5 = coor.cluster_kmeans(dtraj_phi_5,k=n_clusters,max_iter=100, tolerance=1e-12, fixed_seed=True)
clustering_rama_6 = coor.cluster_kmeans(dtraj_phi_6,k=n_clusters,max_iter=100, tolerance=1e-12, fixed_seed=True)


# In[9]:

# First (and, for 1-D input, presumably only) coordinate of every center.
cc_rama_2 = clustering_rama_2.clustercenters[:,0]
cc_rama_3 = clustering_rama_3.clustercenters[:,0]
cc_rama_4 = clustering_rama_4.clustercenters[:,0]
cc_rama_5 = clustering_rama_5.clustercenters[:,0]
cc_rama_6 = clustering_rama_6.clustercenters[:,0]

示例#14
0
# Reader over the trajectory files, featurized with the given featurizer.
coordinates_source = coor.source(trajectory_files,featurizer)
print("There are %d frames total in %d trajectories." % (coordinates_source.n_frames_total(), coordinates_source.number_of_trajectories()))

################################################################################
# Do tICA
################################################################################

print('tICA...')
# Created without data; it receives its input through the pipeline below.
running_tica = coor.tica(lag=100, dim=100)

################################################################################
# Cluster
################################################################################

print('Clustering...')
# Also created without data; only every 50th frame is used (stride=50).
clustering = coor.cluster_kmeans(k=100, stride=50)
# Chain reader -> tICA -> k-means. Presumably this also runs the estimation,
# since dtrajs is read right afterwards -- confirm against coor.pipeline.
coor.pipeline([coordinates_source,running_tica,clustering])

dtrajs = clustering.dtrajs

# Save discrete trajectories.
clustering.save_dtrajs(output_format='npy', extension='.npy')

################################################################################
# Make tics plot
################################################################################
# Only the first trajectory's tICA output is histogrammed.
tics = running_tica.get_output()[0]

# 50x50 histogram over the first two tICA coordinates.
z,x,y = np.histogram2d(tics[:,0],tics[:,1], bins=50)
# The +1 keeps empty bins finite (log(0) would be -inf).
F = -np.log(z+1)
# Outer bin edges: [x_min, x_max, y_min, y_max].
extent = [x[0], x[-1], y[0], y[-1]]
# Persist the fitted transformation object.
save_object('pca_obj.pkl', pca_obj)

#plt.plot(tica_obj.eigenvalues,marker='x')
#plt.xlim([-1,20])
#plt.ylim([0.5,1])

# here we do a little trick to ensure that eigenvectors always have the same sign structure.
# That's irrelevant to the analysis and just nicer plots - you can ignore it.
#for i in range(2):
#    if tica_obj.eigenvectors[0, i] > 0:
#        tica_obj.eigenvectors[:, i] *= -1

Y = pca_obj.get_output()  # projected coordinates produced by pca_obj
np.save('Y.npy', Y)

# Now, do the clustering
# Keep only the first clust_dim columns of every projected trajectory.
Y_clust = []
for i in range(len(Y)):
    Y_clust.append(Y[i][:, 0:clust_dim])
clustering = coor.cluster_kmeans(data=Y_clust,
                                 k=n_clusters,
                                 max_iter=50,
                                 tolerance=1e-05,
                                 stride=1)
# The file name records the number of clusters and the dimensions used.
save_object(
    'clustering_kmeans_nclust-' + str(clustering.n_clusters) + '_clustdim-' +
    str(clust_dim) + '.pkl', clustering)
#clustering = coor.cluster_regspace(Y_clust,max_centers=n_clusters,dmin=dmin)
#save_object('clustering_regspace_nclust-'+str(clustering.n_clusters)+'_clustdim-'+str(clust_dim)+'.pkl', clustering)
#print 'n_clusters = '+str(clustering.n_clusters)
示例#16
0
    trajfiles = [ x + "/" + trajname for x in tempdirs ]

    # add features
    feat = coor.featurizer(topfile)
    feat, feature_info = util.sbm_contact_features(feat, pairwise_file, n_native_pairs)

    if not os.path.exists("msm"):
        os.mkdir("msm")

    if (not os.path.exists("msm/dtrajs.pkl")) or recluster:
        # cluster if necessary
        inp = coor.source(trajfiles, feat)
        tica_obj = coor.tica(inp, dim=tica_dims, lag=tica_lag, stride=stride)
        Y = tica_obj.get_output()
        cl = coor.cluster_kmeans(data=Y, k=n_clusters)
        dtrajs = cl.dtrajs

        os.chdir("msm")
        dirs = [ os.path.basename(os.path.dirname(x)) for x in trajfiles ]

        if not dontsavemsm:
            dtraj_info = { dirs[x]:dtrajs[x] for x in range(len(dirs)) }
            dtraj_info["dirs"] = dirs
            with open("dtrajs.pkl", 'wb') as fhandle:
                pickle.dump(dtraj_info, fhandle)
    else:
        os.chdir("msm")
        with open("dtrajs.pkl", 'rb') as fhandle:
            dtraj_pkl = pickle.load(fhandle)
            dirs = dtraj_pkl["dirs"]
示例#17
0
        plt.plot(x, Y[ij][:, 1])
        plt.ylabel('IC 2')
        plt.xticks([])

        ax1 = plt.subplot(313)
        plt.plot(x, Y[ij][:, 2])
        plt.ylabel('IC 3')
        plt.xlabel('time (frames)')
        plt.xticks([])
        plt.savefig("traj_%d_ICs.png" % (ij + 1))
        # if we have many trajectories having them all open might consume a lot of
        # memory
        plt.close()
else:
    Y = trajs
# Discretize the trajectories in Y with k-means (numClusters centers).
clustering = coor.cluster_kmeans(Y, k=numClusters, max_iter=100)

dtrajs = clustering.dtrajs
# First three coordinates of the cluster centers; reading column 2 requires
# the clustered data to have at least three columns.
cc_x = clustering.clustercenters[:, 0]
cc_y = clustering.clustercenters[:, 1]
cc_z = clustering.clustercenters[:, 2]
# All frames of all trajectories stacked, first two coordinates only.
xall = np.vstack(Y)[:, 0]
yall = np.vstack(Y)[:, 1]
plt.figure(figsize=(8, 5))
mplt.plot_free_energy(xall, yall, cmap="Spectral")
# Overlay the cluster centers as unconnected black dots.
plt.plot(cc_x, cc_y, linewidth=0, marker='o', markersize=5, color='black')
plt.xlabel("IC 1")
plt.ylabel("IC 2")
plt.title("FES IC1-2")
plt.savefig("fes_IC1-2.png")
示例#18
0
    zfile = open(
        'Intermediate_pickle_files/wt-h70a-d66a_cattraj_contourmap.pickle',
        'w')
    pickle.dump(F, zfile)
    zfile.close()

    test = []
    for t in combined:
        test.append(t)
    np.shape(test)

    # ###  100 K-means clusters
    nclusters = 100

    kmean_cluster100 = coor.cluster_kmeans(data=test,
                                           k=nclusters,
                                           max_iter=1000,
                                           tolerance=1e-6)

    print "Done!"
    print "Saving cluster centers..."
    ccenters100 = kmean_cluster100.clustercenters
    f = open(
        'Intermediate_pickle_files/wt-h70a-d66a_cattraj_dirrmsd_ccenter-100.pickle',
        'w')
    pickle.dump(ccenters100, f)
    f.close()

    wt_dtrajs = coor.assign_to_centers(data=wt_dir_rmsd, centers=ccenters100)
    f = open(
        'Intermediate_pickle_files/cypa_wt-d66a_cattraj_dirrmsd_dtrajs.pickle',
        'w')
示例#19
0
文件: msm.py 项目: erb24/dApdA
# 1- Clustering
#cl = coor.cluster_uniform_time(data=data, k=100, stride=10)
#cl = coor.cluster_kmeans(data=data, k=250, stride=10)
# for later use we save the discrete trajectories and cluster center coordinates:
#dtrajs = cl.dtrajs
#cc_x = cl.clustercenters[:,0]
#cc_y = cl.clustercenters[:,1]

# Reuse a previous clustering when both result files exist. The original
# check looked for the misspelled 'clusterenters_kmeans.npy', which is never
# written, so the clustering was recomputed on every run.
if (os.path.isfile('clustercenters_kmeans.npy')
        and os.path.isfile('dtrajs_kmeans.npy')):
    dtrajs = np.load('dtrajs_kmeans.npy')
    # flattened to one 1-D state sequence
    dtrajs = np.ravel(dtrajs)
    dummy = np.load('clustercenters_kmeans.npy')
    cc_x = dummy[:, 0]
    cc_y = dummy[:, 1]
else:
    cl = coor.cluster_kmeans(data=data, k=100, stride=25)
    # for later use we save the discrete trajectories and cluster center coordinates:
    dtrajs = cl.dtrajs
    cc_x = cl.clustercenters[:, 0]
    cc_y = cl.clustercenters[:, 1]
    np.save('dtrajs_kmeans.npy', dtrajs)
    np.save('clustercenters_kmeans.npy', np.column_stack([cc_x, cc_y]))

# 2- Lag time: Note that the xtc files are saved every 0.2 ps.

# Making the Markov model
M = msm.estimate_markov_model(dtrajs, 2500)
print('fraction of states used = ', M.active_state_fraction)
print('fraction of counts used = ', M.active_count_fraction)
print('transition matrix', M.transition_matrix)  # doctest: +SKIP
示例#20
0
plt.savefig('tic1_feature_corr.png')

# Bar chart of the absolute feature correlation with the second tIC
# (column 1 of feature_TIC_correlation), one bar per feature index.
plt.clf()
plt.title('Feature correlation to tIC 2')
plt.bar(range(len(tica.feature_TIC_correlation[:, 1])),
        abs(tica.feature_TIC_correlation[:, 1]),
        align='center')
plt.xlabel('Index within feature vector')
plt.ylabel('Correlation')
plt.tight_layout()

plt.savefig('tic2_feature_corr.png')

print('running kmeans')

# The positional 300 is the second argument of cluster_kmeans -- presumably
# the number of centers k; confirm against the installed PyEMMA signature.
clkmeans = coor.cluster_kmeans(Y, 300, max_iter=300)

plt.clf()
plt.figure(figsize=(8, 5))
# Cluster centers as black dots without connecting lines (' ok').
plt.plot(clkmeans.clustercenters[:, 0], clkmeans.clustercenters[:, 1], ' ok')
mplt.plot_free_energy(np.hstack(Y1), np.hstack(Y2))
plt.xlabel('tic 1')
plt.ylabel('tic 2')

plt.savefig('kmeans_cluster-on_tic1tic2.png')

# Persist the discrete trajectories and the centers for later steps.
np.save('clkmeans_dtrajs.npy', clkmeans.dtrajs)
np.save('clkmeans_clustercenters.npy', clkmeans.clustercenters)

print('running MSM')
示例#21
0
n_sets = 3

print 'feat dimension'
print feat.dimension()




dataset = []
nlist = []

# Always-true guard: the discretization below runs unconditionally.
if 1:
    n_clusters = 200
    # Both estimators are created without data and receive their input
    # through the discretizer below.
    tica_obj = coor.tica( dim=2, lag=tica_lagtime, kinetic_map=True)

    # NOTE: despite its name, input_data is the k-means clustering object.
    input_data = coor.cluster_kmeans( k=n_clusters, max_iter=50)

    disc = coor.discretizer(inp, tica_obj, input_data, stride=1, chunksize=10)
    disc.parametrize()
print tica_obj.cumvar
#TICA output is Y
Y = tica_obj.get_output()
print np.shape(Y)
#print 'Y[0]'
#print Y[0]
print 'number of trajetories = ', np.shape(Y)[0]
#

#mapped_data is the TICA clustered data mapped to the microstates (so integer valued)
mapped_data =input_data.dtrajs
示例#22
0
# Lag time handed to coor.tica below.
tica_lagtime = 400

#number of PCCA clusters
n_sets = 3

print 'feat dimension'
print feat.dimension()

dataset = []
nlist = []

# Always-true guard: the discretization below runs unconditionally.
if 1:
    n_clusters = 200
    # Both estimators are created without data and receive their input
    # through the discretizer below.
    tica_obj = coor.tica(dim=2, lag=tica_lagtime, kinetic_map=True)

    # NOTE: despite its name, input_data is the k-means clustering object.
    input_data = coor.cluster_kmeans(k=n_clusters, max_iter=50)

    disc = coor.discretizer(inp, tica_obj, input_data, stride=1, chunksize=10)
    disc.parametrize()
print tica_obj.cumvar
#TICA output is Y
Y = tica_obj.get_output()
print np.shape(Y)
#print 'Y[0]'
#print Y[0]
print 'number of trajetories = ', np.shape(Y)[0]
#

#mapped_data is the TICA clustered data mapped to the microstates (so integer valued)
mapped_data = input_data.dtrajs
示例#23
0
      (coordinates_source.n_frames_total(),
       coordinates_source.number_of_trajectories()))

################################################################################
# Do tICA
################################################################################

print('tICA...')
# Created without data; it receives its input through the pipeline below.
running_tica = coor.tica(lag=100, dim=100)

################################################################################
# Cluster
################################################################################

print('Clustering...')
# Also created without data; only every 50th frame is used (stride=50).
clustering = coor.cluster_kmeans(k=100, stride=50)
# Chain reader -> tICA -> k-means. Presumably this also runs the estimation,
# since dtrajs is read right afterwards -- confirm against coor.pipeline.
coor.pipeline([coordinates_source, running_tica, clustering])

dtrajs = clustering.dtrajs

# Save discrete trajectories.
clustering.save_dtrajs(output_format='npy', extension='.npy')

################################################################################
# Make tics plot
################################################################################
# Only the first trajectory's tICA output is histogrammed.
tics = running_tica.get_output()[0]

# 50x50 histogram over the first two tICA coordinates.
z, x, y = np.histogram2d(tics[:, 0], tics[:, 1], bins=50)
# The +1 keeps empty bins finite (log(0) would be -inf).
F = -np.log(z + 1)
# Outer bin edges: [x_min, x_max, y_min, y_max].
extent = [x[0], x[-1], y[0], y[-1]]
#cluster tica data into clusters
import pyemma.coordinates as coor
import numpy as np

# System label used in the output file name.
# NOTE(review): this name shadows the standard-library module `sys` if that
# is ever imported in this script.
sys = 'fdis'
tica_data = coor.load('tica_data_05/fdis_tica_data.h5')

n_clusters = 100

# k-means on the loaded data, at most 50 iterations.
cl = coor.cluster_kmeans(tica_data, k=n_clusters, max_iter=50)

#cl.save(f'cluster_data/{sys}_{n_clusters}_mini_cluster_object.h5', overwrite=True)

# Write the result to an HDF5 file named after the system and cluster count.
cl.write_to_hdf5(f'cluster_data_11/{sys}_{n_clusters}_cluster_dtrajs22.h5')
示例#25
0
# Every pickle file is opened in a `with` block so the handle is flushed and
# closed deterministically instead of being left to the garbage collector.
with open('clust_col_skip_dtraj_cl_full_try2.pickle', 'wb') as fh:
    pickle.dump(clust_col_skip_dtraj, fh, protocol=pickle.HIGHEST_PROTOCOL)

print('length of clust_col_skip_dtraj is ', len(clust_col_skip_dtraj))
print('length of clust_col_skip_dtraj[0] is ', len(clust_col_skip_dtraj[0]))

#Y = tica.get_output()
# NOTE: pickle.load executes arbitrary code from the file; only load files
# produced by this workflow.
with open('mix_tica_full.pickle', 'rb') as fh:
    tica = pickle.load(fh)
Y = tica.get_output()
print('shape of tica is ', len(Y[0]))
#cl_f = pickle.load(open('pg_cl_full_ax1.pickle', 'rb'))

# Columns 1 and 2 of every output trajectory (column 0 is left out).
Y2 = [i[:, 1:3] for i in Y]

cluster_tic = coor.cluster_kmeans(Y2, k=10, max_iter=100)

clust_out = cluster_tic.dtrajs
#clust_out =  pickle.load(open('tic_cl_full_dtraj.pickle', 'rb'))
with open('tic_cl_full_dtraj3_try2.pickle', 'wb') as fh:
    pickle.dump(clust_out, fh, protocol=pickle.HIGHEST_PROTOCOL)

with open('tic_cl_full_centers3_try2.pickle', 'wb') as fh:
    pickle.dump(cluster_tic.clustercenters, fh,
                protocol=pickle.HIGHEST_PROTOCOL)

D = [0] * len(clust_col_skip_dtraj)

for i in range(len(clust_col_skip_dtraj)):
    D[i] = [0] * len(clust_col_skip_dtraj[0])
示例#26
0
    # Choose parameters to be used in the task

    config.show_progress_bars = False

    lag = args.tica_lag

    feat = coor.featurizer(topfile)
    feat.add_backbone_torsions()

    inp = coor.source(trajfiles, feat)
    dim = args.tica_dim

    tica_obj = coor.tica(inp, lag=lag, dim=dim, kinetic_map=False)
    Y = tica_obj.get_output()

    cl = coor.cluster_kmeans(data=Y, k=args.msm_states, stride=args.stride)
    M = msm.estimate_markov_model(cl.dtrajs, args.msm_lag)

    # with open("model.dtraj", "w") as f:
    #     f.write("\n".join(" ".join(map(str, x)) for x in cl.dtrajs))
    #
    # # np.savetxt("model.dtraj", cl.dtrajs, delimiter=" ", fmt='%d')
    # np.savetxt("model.msm", M.P, delimiter=",")

    data = {
        'input': {
            'frames': inp.n_frames_total(),
            'dimension': inp.dimension(),
            'trajectories': inp.number_of_trajectories(),
            'lengths': inp.trajectory_lengths().tolist(),
        },
示例#27
0
plt.plot(x, Y[0][:, 1])
plt.ylabel('IC 2')
plt.xticks([])
plt.yticks(np.arange(-2, 4))
ax1 = plt.subplot(313)
plt.plot(x, Y[0][:, 2])
plt.xlabel('time / ns')
plt.ylabel('IC 3')
plt.yticks(np.arange(-4, 6, 2))

# for shorter trajectory, ideal number of clusters is 100
# optimal lag_time = 750?

# optimal lag_time = 1000 timesteps

# Discretize with k-means, estimate an MSM at lag 380 and plot its
# Chapman-Kolmogorov test.
clustering = coor.cluster_kmeans(Y, k=100)
dtrajs = clustering.dtrajs
msm = pyemma.msm.estimate_markov_model(dtrajs, 380)
pyemma.plots.plot_cktest(msm.cktest(3, err_est=True), marker='.')

# TRIALS - reg_space clustering and kmeans comparison - kmeans by far better
clustering_reg = coor.cluster_regspace(Y, dmin=2, max_centers=100)
cr_x = clustering_reg.clustercenters[:, 0]
# Second coordinate (column 1), like cc_y below. Column 0 was read twice
# before, which put every regspace center on the diagonal of the plot.
cr_y = clustering_reg.clustercenters[:, 1]
cc_x = clustering.clustercenters[:, 0]
cc_y = clustering.clustercenters[:, 1]
c_reg = [cr_x, cr_y]
c = [cc_x, cc_y]
print(len(clustering_reg.clustercenters))
fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharex=True, sharey=True)
for ax, cls in zip(axes.flat, [c, c_reg]):
示例#28
0
    plt.xlim([xmin, xmax])

    #Trace la carte d'énergie libre, abscisse : rayon de gyration, ordonnee : RMSD
    plt.xlim([xmin, xmax])  #Borne
    plt.ylim([ymin, ymax])
    mplt.plot_free_energy(gyrateArray, rms)
    #plt.plot([Refgyrate],[ymin], '+')
    plt.ylabel('RMSD (A)')
    plt.xlabel('Radius of gyration (A)')
    save_figure('free' + peptide_name + '.pdf',
                PathOut + "/")  #Par défaut, image au format pdf

    #Mise en place du k-means
    n_clusters = args.kmeans
    Y = np.vstack((gyrateArray, rms))
    X = np.transpose(Y)
    clustering = coor.cluster_kmeans(X, k=n_clusters, max_iter=100)
    dtrajs = clustering.dtrajs

    cc_x = clustering.clustercenters[:, 0]
    cc_y = clustering.clustercenters[:, 1]
    ind_clust = clustering.index_clusters

    plt.plot(cc_x, cc_y, linewidth=0, marker='o', markersize=5, color='black')
    for i in range(len(cc_x)):
        plt.text(cc_x[i], cc_y[i], str(i + 1), color='grey', fontsize=12)

    save_figure('free' + peptide_name + '_clusters.pdf', PathOut)
    compute_effectif_cluster(ind_clust, int(traj.time[0]), int(traj.timestep),
                             PathOut, PathOut + "ex_md.xtc", struct)
示例#29
0
        if i == 3:
            for j in range(4):
                axes[i][j].set_xlabel("TIC " + str(j + 2), fontsize=20)

    axes[0][0].annotate("TICA  " + f_str,
                        fontsize=24,
                        xy=(0, 0),
                        xytext=(1.8, 1.1),
                        xycoords="axes fraction",
                        textcoords="axes fraction")
    fig.savefig(msm_savedir + "/tic_hist_grid.pdf")

    n_clusters = 300
    msm_lags = [1, 10, 20, 50, 100, 200]

    cluster = coor.cluster_kmeans(k=n_clusters)
    coor.pipeline([reader, tica, cluster])
    its = msm.its(cluster.dtrajs, lags=msm_lags)

    plt.figure()
    mplt.plot_implied_timescales(its)
    plt.title(msm_savedir)
    plt.savefig(msm_savedir + "/its_vs_lag_ylog.pdf")

    #plt.figure()
    #plt.plot(np.arange(1,21), M.timescales()[:20], 'o')
    #ymin, ymax = plt.ylim()
    #plt.ylim(0, ymax)
    #plt.savefig("msm_ti.pdf")
示例#30
0
def main(lagtimes, clusters, m, tica_lag, tica, output_path):
    """Score MSMs over lag times and cluster counts; save scores and plots.

    Trajectories are loaded from ``allTrajs/traj*`` and, when ``tica`` is
    true, projected with TICA at lag ``tica_lag``. For every lag time in
    ``lagtimes`` and every cluster count in ``clusters`` the data is
    clustered with k-means, an MSM is estimated and its training score and
    5-fold cross-validated scores (``score_k=m``) are collected; failed
    computations are recorded as zeros. Per lag time the scores are saved
    under ``<output_path>/scores`` and a plot of score against number of
    states is written to ``output_path``.

    ``output_path`` may be empty or None, in which case the current
    directory is used.
    """
    trajectoryFolder = "allTrajs"
    trajectoryBasename = "traj*"
    stride = 1
    n_cv = 5
    # os.path.join rejects None, although the guard below tolerates any
    # falsy path; normalize it to the empty string first.
    output_path = output_path or ""
    if output_path and not os.path.exists(output_path):
        os.makedirs(output_path)
    scores_path = os.path.join(output_path, "scores")
    if not os.path.exists(scores_path):
        os.makedirs(scores_path)
    data, _ = cluster.loadTrajFiles(trajectoryFolder, trajectoryBasename)
    if tica:
        tica_obj = coor.tica(data,
                             lag=tica_lag,
                             var_cutoff=0.9,
                             kinetic_map=True)
        print('TICA dimension ', tica_obj.dimension())
        data = tica_obj.get_output()
    for tau in lagtimes:
        scores = []
        scores_cv = []
        print("Estimating MSM with %d lagtime" % tau)
        for k in clusters:
            print("Calculating scores with %d clusters" % k)
            # cluster data
            cl = coor.cluster_kmeans(data=data,
                                     k=k,
                                     max_iter=500,
                                     stride=stride)
            # A failure while estimating or computing the training score
            # records zeros for both the training and the CV entry.
            try:
                MSM = msm.estimate_markov_model(cl.dtrajs, tau)
                print("MSM estimated on %d states" % MSM.nstates)
                scores.append(MSM.score(MSM.dtrajs_full, score_k=m))
            except Exception:
                print("Estimation error in %d clusters, %d lagtime" % (k, tau))
                scores.append(0)
                scores_cv.append(np.zeros(n_cv, dtype=int))
                continue
            try:
                scores_cv.append(
                    MSM.score_cv(MSM.dtrajs_full, score_k=m, n=n_cv))
            except Exception:
                print("Estimation error in %d clusters, %d lagtime" % (k, tau))
                scores_cv.append(np.zeros(n_cv, dtype=int))
        np.save(os.path.join(scores_path, "scores_lag_%d.npy" % tau), scores)
        np.save(os.path.join(scores_path, "scores_cv_lag_%d.npy" % tau),
                scores_cv)
        mean_scores = [sc.mean() for sc in scores_cv]
        std_scores = [sc.std() for sc in scores_cv]
        plt.figure()
        plt.plot(clusters, scores, label="Training")
        plt.errorbar(clusters,
                     mean_scores,
                     yerr=std_scores,
                     fmt='k',
                     label="Testing")
        plt.xlabel("Number of states")
        plt.ylabel("Score")
        plt.legend()
        plt.savefig(os.path.join(output_path, "scores_cv_lag_%d.png" % tau))
        # One figure is created per lag time; close it once saved so they do
        # not pile up in memory.
        plt.close()
示例#31
0
# Lag times 1, then 10..90, 100..900 and 1000..9000: nine values per decade,
# spaced by the decade itself.
MSMlags = np.array([1])
for i in range(1, 4):
    nmin = dn = 10**i
    nmax = 10**(i + 1)
    MSMlags1 = np.arange(nmin, nmax, dn)
    MSMlags = np.concatenate((MSMlags, MSMlags1))

nlagsMSM = MSMlags.shape[0]
print('number of different lag times chosen = ' + str(nlagsMSM))
print('lag time values used = ' + str(MSMlags))
sys.stdout.flush()

n_clusters = 1000
# k-means over all trajectories in Ys (copied into a list first).
clustering = coor.cluster_kmeans(list(Ys), k=n_clusters, max_iter=10000)
dtrajs = clustering.dtrajs

# Collect the clustering results under the keys stored in the archive.
my_dict = {}

my_dict['n_clusters'] = n_clusters
my_dict['micro_membership'] = dtrajs
my_dict['centers'] = clustering.clustercenters

# Each dictionary key becomes one named array in the compressed .npz file.
np.savez_compressed('2F4K_MSM_10TICA_clusters_1000.npz', **my_dict)

#--------------------------------------------------------------------
# Build Markov State Model out of clustered data and save Markov transition matrices to file

print('Building Markov Model at different lag times...')
sys.stdout.flush()
示例#32
0
                  show_titles=True,
                  title_kwargs={"fontsize": 12})

    plt.savefig('%s/corner.png' % mutant)

    plt.clf()
    plt.figure(figsize=(8, 5))
    mplt.plot_free_energy(np.hstack(Y1_otherpro), np.hstack(Y2_otherpro))
    plt.xlabel('tic 1')
    plt.ylabel('tic 2')

    plt.savefig('%s/tic1-tic2.png' % mutant)

    print('running %s kmeans' % mutant)

    clkmeans = coor.cluster_kmeans(Y_otherpro, 300, max_iter=300)

    plt.clf()
    plt.figure(figsize=(8, 5))
    plt.plot(clkmeans.clustercenters[:, 0], clkmeans.clustercenters[:, 1],
             ' ok')
    mplt.plot_free_energy(np.hstack(Y1), np.hstack(Y2))
    plt.xlabel('tic 1')
    plt.ylabel('tic 2')

    plt.savefig('%s/kmeans_cluster-on_tic1tic2.png' % mutant)

    np.save('%s/clkmeans_dtrajs.npy' % mutant, clkmeans.dtrajs)
    np.save('%s/clkmeans_clustercenters.npy' % mutant, clkmeans.clustercenters)

    print('running %s MSM' % mutant)