示例#1
0
    def test_provided_means(self):
        """An explicitly supplied mean must match the internally computed one."""
        samples = np.random.random((300, 3))
        expected_mean = samples.mean(axis=0)

        est_with_mean = pca(samples, mean=expected_mean)
        est_auto_mean = pca(samples)

        # Both estimators must report the empirical mean ...
        np.testing.assert_allclose(expected_mean, est_auto_mean.mean)
        np.testing.assert_allclose(expected_mean, est_with_mean.mean)
        # ... and agree on the covariance matrix.
        np.testing.assert_allclose(est_with_mean.cov, est_auto_mean.cov)
示例#2
0
    def test_api_cs(self):
        """Chunksize handling: explicit value, fallback, and derived from dim."""
        est = pca(chunksize=1, dim=10)
        assert est.chunksize == 1

        # Without a fixed dimension the estimator cannot derive a proper
        # chunksize and must fall back to its class-level default.
        est = pca(chunksize=None)
        assert est.chunksize == est._FALLBACK_CHUNKSIZE

        # With a known dimension a real (non-fallback) chunksize is derived.
        est = pca(chunksize=None, dim=10)
        assert est.chunksize != 0
        assert est.chunksize != est._FALLBACK_CHUNKSIZE
示例#3
0
    def test_partial_fit(self):
        """Incremental fitting over two chunks must match one batch fit."""
        chunks = [np.random.random((100, 3)), np.random.random((100, 3))]

        incremental = pca()
        for chunk in chunks:
            incremental.partial_fit(chunk)

        batch = pca(chunks)
        np.testing.assert_allclose(incremental.mean, batch.mean)
        np.testing.assert_allclose(incremental.eigenvalues, batch.eigenvalues)
        np.testing.assert_allclose(incremental.eigenvectors, batch.eigenvectors)
示例#4
0
File: test_pca.py  Project: nd1511/PyEMMA
 def test_dimension(self):
     """dimension() returns an int and honours dim / var_cutoff settings."""
     assert types.is_int(self.pca_obj.dimension())
     # The fixture estimator was built with dim=1.
     assert self.pca_obj.dimension() == 1
     # var_cutoff=1.0 keeps the full two-dimensional space ...
     full = pca(data=self.X, dim=-1, var_cutoff=1.0)
     assert full.dimension() == 2
     # ... while a tighter cutoff truncates to a single component.
     truncated = pca(data=self.X, dim=-1, var_cutoff=0.8)
     assert truncated.dimension() == 1
     # Passing both dim and var_cutoff at once is forbidden.
     with self.assertRaises(ValueError):
         pca(data=self.X, dim=1, var_cutoff=0.8)
示例#5
0
    def test_with_data_in_mem(self):
        """Full discretizer pipeline on in-memory data stays within k centers."""
        import pyemma.coordinates as api

        trajs = [np.random.random((length, 50)) for length in (100, 103, 33)]
        reader = source(trajs)
        assert isinstance(reader, DataInMemory)

        projector = api.pca(dim=2)
        n_centers = 10
        kmeans = api.cluster_kmeans(k=n_centers)

        pipeline = api.discretizer(reader, projector, kmeans)
        pipeline.parametrize()

        # No discrete trajectory may reference a state index >= n_centers.
        for dtraj in pipeline.dtrajs:
            highest_state = np.max(np.unique(dtraj))
            self.assertGreaterEqual(
                n_centers - 1, highest_state,
                "dtraj has more states than cluster centers")
示例#6
0
def project_and_cluster(trajfiles,
                        featurizer,
                        sparsify=False,
                        tica=True,
                        lag=100000,
                        scale=True,
                        var_cutoff=1.0,
                        ncluster=100,
                        cluster=True):
    """Load, featurize and project trajectories, optionally clustering them.

    Parameters
    ----------
    trajfiles : list of str
        Trajectory files passed to ``coor.load``.
    featurizer
        Featurizer describing which coordinates to extract.
    sparsify : bool, default False
        If True, remove constant features before projecting.
    tica : bool, default True
        Project with TICA; otherwise with PCA (``dim=-1``).
    lag : int, default 100000
        TICA lag time (ignored for the PCA branch).
    scale : bool, default True
        Scale each projected trajectory (in place) by the transformer's
        eigenvalues.
    var_cutoff : float, default 1.0
        Fraction of variance to retain in the projection.
    ncluster : int, default 100
        Number of k-means cluster centers.
    cluster : bool, default True
        If True, additionally run k-means and return the clustering object.
        Fix: this flag was previously read from an undefined global name
        (``if cluster:``), which either raised NameError or silently
        evaluated an unrelated module as truthy; it is now an explicit
        keyword parameter defaulting to the always-on behavior.

    Returns
    -------
    trans_obj, Y, clustering  (clustering only when ``cluster`` is True)
    """
    X = coor.load(trajfiles, featurizer)
    if sparsify:
        X = remove_constant(X)
    # Both branches need the projected output; compute it once afterwards.
    if tica:
        trans_obj = coor.tica(X, lag=lag, var_cutoff=var_cutoff)
    else:
        trans_obj = coor.pca(X, dim=-1, var_cutoff=var_cutoff)
    Y = trans_obj.get_output()
    if scale:
        # Weight every retained dimension by its eigenvalue (in place).
        for y in Y:
            y *= trans_obj.eigenvalues[:trans_obj.dimension()]
    if cluster:
        cl_obj = coor.cluster_kmeans(Y,
                                     k=ncluster,
                                     max_iter=3,
                                     fixed_seed=True)
        return trans_obj, Y, cl_obj
    return trans_obj, Y
示例#7
0
    def test_feature_correlation_MD(self):
        """feature_PC_correlation must reproduce np.corrcoef on real MD data."""
        # Locate the bundled BPTI test data.
        path = pkg_resources.resource_filename(__name__, 'data') + os.path.sep
        self.pdb_file = os.path.join(path, 'bpti_ca.pdb')
        self.xtc_file = os.path.join(path, 'bpti_mini.xtc')
        reader = source(self.xtc_file, top=self.pdb_file)
        estimator = pca(reader)

        features = estimator.data_producer.get_output()[0]
        n_features = features.shape[1]
        projections = estimator.get_output()[0]
        n_components = projections.shape[1]

        # Cross-correlation block between input features and PCs.
        reference = np.corrcoef(features.T, projections.T)[:n_features, -n_components:]
        np.testing.assert_allclose(estimator.feature_PC_correlation, reference, atol=1.E-8)
示例#8
0
File: test_pca.py  Project: nd1511/PyEMMA
    def test_feature_correlation_data(self):
        """feature_PC_correlation must match np.corrcoef on synthetic features."""
        n_frames = 100
        # Three features: a linear ramp, a noisy copy of it, and pure noise.
        feature_traj = np.zeros((n_frames, 3))
        feature_traj[:, 0] = np.linspace(-.5, .5, n_frames)
        feature_traj[:, 1] = feature_traj[:, 0] + np.random.randn(n_frames) * .5
        feature_traj[:, 2] = np.random.randn(n_frames)
        nfeat = feature_traj.shape[1]

        # Full-rank PCA so all components survive.
        pca_obj = pca(data=feature_traj, dim=3)
        pca_traj = pca_obj.get_output()[0]
        npcs = pca_traj.shape[1]

        # Cross-correlation block between features and principal components.
        expected = np.corrcoef(feature_traj.T, pca_traj.T)[:nfeat, -npcs:]
        np.testing.assert_allclose(pca_obj.feature_PC_correlation, expected, atol=1.E-8)
示例#9
0
    def setUpClass(cls):
        """Generate a two-state HMM trajectory with Gaussian output; fit 1-D PCA."""
        import pyemma.msm.generation as msmgen

        # Slowly switching two-state transition matrix.
        cls.P = np.array([[0.99, 0.01], [0.01, 0.99]])
        cls.T = 10000
        centers = [np.array([-1, 1]), np.array([1, -1])]
        spreads = [np.array([0.3, 2]), np.array([0.3, 2])]

        # Observable trajectory, filled below frame by frame.
        cls.X = np.zeros((cls.T, 2))
        hidden = msmgen.generate_traj(cls.P, cls.T)
        # One Gaussian sample per frame, conditioned on the hidden state.
        for t, state in enumerate(hidden):
            cls.X[t, 0] = spreads[state][0] * np.random.randn() + centers[state][0]
            cls.X[t, 1] = spreads[state][1] * np.random.randn() + centers[state][1]
        cls.lag = 10
        cls.pca_obj = pca(data=cls.X, dim=1)
示例#10
0
    def setUpClass(cls):
        """Deterministically (seed 0) generate HMM data and fit a 1-D PCA."""
        # Remember the global RNG state so tearDownClass can restore it.
        cls.old_state = np.random.get_state()
        np.random.seed(0)

        # Two-state Markov chain with Gaussian emissions per state.
        cls.P = np.array([[0.99, 0.01], [0.01, 0.99]])
        cls.T = 10000
        centers = [np.array([-1, 1]), np.array([1, -1])]
        spreads = [np.array([0.3, 2]), np.array([0.3, 2])]

        # Observable trajectory, filled below frame by frame (RNG draw order
        # kept identical to remain reproducible under the fixed seed).
        cls.X = np.zeros((cls.T, 2))
        hidden = MarkovStateModel(cls.P).simulate(cls.T)
        for t, state in enumerate(hidden):
            cls.X[t, 0] = spreads[state][0] * np.random.randn() + centers[state][0]
            cls.X[t, 1] = spreads[state][1] * np.random.randn() + centers[state][1]
        cls.pca_obj = pca(data=cls.X, dim=1)
示例#11
0
    def setUpClass(cls):
        """Generate a two-state HMM trajectory and fit a 1-D PCA on it."""
        import pyemma.msm.generation as msmgen

        # Slowly switching two-state transition matrix.
        cls.P = np.array([[0.99, 0.01],
                          [0.01, 0.99]])
        cls.T = 10000
        centers = [np.array([-1, 1]), np.array([1, -1])]
        spreads = [np.array([0.3, 2]), np.array([0.3, 2])]

        # Observable trajectory, filled below frame by frame.
        cls.X = np.zeros((cls.T, 2))
        hidden = msmgen.generate_traj(cls.P, cls.T)
        # One Gaussian sample per frame, conditioned on the hidden state.
        for t, state in enumerate(hidden):
            cls.X[t, 0] = spreads[state][0] * np.random.randn() + centers[state][0]
            cls.X[t, 1] = spreads[state][1] * np.random.randn() + centers[state][1]
        cls.lag = 10
        cls.pca_obj = pca(data=cls.X, dim=1)
示例#12
0
    def test_with_data_in_mem(self):
        """Discretizer pipeline via memory_reader: states stay below k centers."""
        import pyemma.coordinates as api

        trajs = [np.random.random((length, 50)) for length in (100, 103, 33)]
        reader = api.memory_reader(trajs)

        projector = api.pca(dim=2)
        n_centers = 10
        kmeans = api.kmeans(k=n_centers)

        pipeline = api.discretizer(reader, projector, kmeans)
        pipeline.parametrize()

        # No discrete trajectory may reference a state index >= n_centers.
        for dtraj in pipeline.dtrajs:
            highest_state = np.max(np.unique(dtraj))
            self.assertGreaterEqual(n_centers - 1, highest_state,
                                    "dtraj has more states than cluster centers")
示例#13
0
 def test_variances(self):
     """Per-component output variances must equal the PCA eigenvalues."""
     estimator = pca(data=self.X)
     projected = estimator.get_output()[0]
     empirical = np.var(projected, axis=0)
     # Eigenvalues of the covariance are the variances along each PC.
     assert np.max(np.abs(empirical - estimator.eigenvalues)) < 0.01
dtraj_Vfit_ind = np.array(dtraj_Vfit_ind)

# Re-assemble one transposed array per trajectory column: stack the rows for
# each column on top of each other, then transpose.  The incremental vstack
# is kept so that a single-row input keeps its original (1-D) shape.
dtraj_Vfit = []
n_rows = dtraj_Vfit_ind.shape[0]
for col in range(dtraj_Vfit_ind.shape[1]):
    stacked = dtraj_Vfit_ind[0, col]
    for row in range(1, n_rows):
        stacked = np.vstack((stacked, dtraj_Vfit_ind[row, col]))
    dtraj_Vfit.append(stacked.T)

# In[6]:

np.array(dtraj_Vfit).shape

# **TICA**

# PCA on the re-assembled trajectories, keeping 95% of the total variance,
# and persist the fitted estimator for later sessions.
pca_obj = coor.pca(dtraj_Vfit, var_cutoff=0.95)
save_object('pca_obj.pkl', pca_obj)

# Diagnostic plots and eigenvector sign-fixing (cosmetic only), disabled:
#plt.plot(tica_obj.eigenvalues,marker='x')
#plt.xlim([-1,20])
#plt.ylim([0.5,1])
#for i in range(2):
#    if tica_obj.eigenvectors[0, i] > 0:
#        tica_obj.eigenvectors[:, i] *= -1

# Project every trajectory and save the coordinates to disk.
Y = pca_obj.get_output()
np.save('Y.npy', Y)
示例#15
0
def dotltsne(infilename='', intopname='', nofit=0, lagtime=1, pcadim=2, ticadim=2,
             maxpcs=50, ncomp=2, perplex1=10.0, perplex2=10.0, exag=12.0,
             rate=200.0, niter=1000, command='', ofilename='out.txt'):
  """Project a trajectory with PCA, TICA, t-SNE and time-lagged t-SNE.

  Loads trajectory `infilename` with topology `intopname` via mdtraj,
  optionally superimposes it onto the topology structure, flattens the
  coordinates to a (frames x 3*atoms) matrix, computes the four embeddings
  and writes all projections to `ofilename`, one whitespace-separated row
  per frame preceded by a commented header.

  Parameters
  ----------
  infilename : str
      Trajectory file readable by mdtraj.
  intopname : str
      Topology (PDB) file; also used as the superposition reference.
  nofit : int
      If 1, skip superposition onto the reference structure.
  lagtime : int
      Lag (frames) for TICA and time-lagged t-SNE; must not exceed the
      number of frames.
  pcadim, ticadim : int
      Number of PCA / TICA components written to the output file.
  maxpcs : int
      Number of top components fed into time-lagged t-SNE.
  ncomp : int
      Output dimension of t-SNE and time-lagged t-SNE.
  perplex1, perplex2 : float
      Perplexities for t-SNE and time-lagged t-SNE respectively.
  exag, rate, niter
      t-SNE early exaggeration, learning rate and iteration count.
  command : str
      Command line echoed into the output header.
  ofilename : str
      Output text file path.
  """
  # Reading and superimposing the trajectory
  try:
    print("Loading trajectory")
    refpdb = md.load_pdb(intopname)
    X = md.load(infilename, top=intopname)
    print("Fitting trajectory")
    if nofit!=1:
      X.superpose(refpdb)
  except IOError:
    print("Cannot load %s or %s, exiting." % (infilename, intopname))
    exit(0)
  else:
    print("%s succesfully loaded and fitted" % X)
  print("")

  # Conversion of trajectory into matrix
  # Flatten xyz coordinates to shape (n_frames, 3*n_atoms):
  # columns are x, y, z per atom, in atom order.
  Xt = sp.zeros((X.n_frames, 3*X.n_atoms))
  for i in range(X.n_frames):
    for j in range(X.n_atoms):
      Xt[i,3*j] = X.xyz[i,j,0]
      Xt[i,3*j+1] = X.xyz[i,j,1]
      Xt[i,3*j+2] = X.xyz[i,j,2]

  # PCA
  print("Runing PCA")
  T = X.n_frames
  if lagtime > T:
    print("Lag time higher than the number of frames, exiting.")
    exit(0)
  # NOTE(review): pcadim is NOT passed to coor.pca here -- the estimator
  # runs with its default dimension selection.  If it retains fewer than
  # pcadim components, the output loop below will raise an IndexError;
  # confirm whether dim=pcadim was intended.
  pca = coor.pca(data = Xt)
  projs_pca = pca.get_output()

  # TICA
  print("Runing TICA")
  tica = coor.tica(data = Xt, lag=lagtime, dim=ticadim)
  projs_tica = tica.get_output()

  # t-SNE
  print("Runing t-SNE")
  Xembtsne = sk.TSNE(n_components=ncomp, perplexity=perplex1,
                     early_exaggeration=exag, learning_rate=rate, n_iter=niter,
                     metric="euclidean").fit_transform(Xt)

  # time-lagged t-SNE
  print("Runing time-lagged t-SNE")
  # Whiten the data: center, diagonalize the covariance, project onto the
  # eigenvectors (sorted by decreasing eigenvalue) and scale by 1/sqrt(eva).
  Xm = Xt-sp.mean(Xt, axis=0)
  Xc = sp.cov(sp.transpose(Xm))
  eva, eve = sp.linalg.eig(Xc)
  order=sp.argsort(eva)[::-1]
  eve = eve[:,order]
  eva = eva[order]
  projs = Xm.dot(eve)
  projs = projs/sp.sqrt(eva)
  # Symmetrized time-lagged covariance of the whitened projections.
  C1 = sp.transpose(projs[:-lagtime,]).dot(projs[lagtime:,])/(T-lagtime-1)
  C1 = (C1+sp.transpose(C1))/2
  eva2, eve2 = sp.linalg.eig(C1)
  order=sp.argsort(eva2)[::-1]
  eve2 = eve2[:,order]
  eva2 = eva2[order]
  # Keep the top maxpcs components, weight them by their (real parts of the)
  # eigenvalues, then run t-SNE on the resulting pairwise distance matrix.
  projs = projs.dot(eve2[:,:maxpcs])
  projs = projs*sp.real(eva2[:maxpcs])
  Xd = spat.distance_matrix(projs, projs)
  Xembtltsne = sk.TSNE(n_components=ncomp, perplexity=perplex2,
                       early_exaggeration=exag, learning_rate=rate, n_iter=niter,
                       metric="precomputed").fit_transform(Xd)

  # Saving results
  print("Saving results")
  ofile = open(ofilename, 'w')
  # Commented header documenting all run parameters.
  ofile.write("# Command: %s\n" % command)
  if(nofit==0):
    ofile.write("# structures were superimposed onto reference structure\n")
  else:
    ofile.write("# structures were NOT superimposed onto reference structure\n")
  ofile.write("# lag time set to %i frames\n" % lagtime)
  ofile.write("# output dimension for PCA set to %i\n" % pcadim)
  ofile.write("# output dimension for TICA set to %i\n" % ticadim)
  ofile.write("# number of top principle components passed to time-lagged t-SNE set to %i\n" % maxpcs)
  ofile.write("# output dimension for t-SNE and time-lagged t-SNE set to %i\n" % ncomp)
  ofile.write("# perplexity of t-SNE set to %f\n" % perplex1)
  ofile.write("# perplexity of time-lagged t-SNE set to %f\n" % perplex2)
  ofile.write("# early_exaggeration set to %f\n" % exag)
  # Column-label row: one label per projection dimension.
  ofile.write("# structure_ID")
  for j in range(pcadim):
    ofile.write(" PCA%i" % (j+1))
  for j in range(ticadim):
    ofile.write(" TICA%i" % (j+1))
  for j in range(ncomp):
    ofile.write(" tSNE%i" % (j+1))
  for j in range(ncomp):
    ofile.write(" tltSNE%i" % (j+1))
  ofile.write("\n")
  # One data row per frame: 1-based frame index followed by all projections.
  for i in range(T):
    output = " %i" % (i+1)
    for j in range(pcadim):
      output = output + " %f" % projs_pca[0][i,j]
    for j in range(ticadim):
      output = output + " %f" % projs_tica[0][i,j]
    for j in range(ncomp):
      output = output + " %f" % Xembtsne[i,j]
    for j in range(ncomp):
      output = output + " %f" % Xembtltsne[i,j]
    ofile.write("%s\n" % output)
  ofile.close()

# In[6]:

# Transpose each dihedral trajectory in place so frames run along axis 0.
for idx in range(len(dtraj_rama_2)):
    dtraj_dih[idx] = dtraj_dih[idx].T


# In[7]:

dtraj_dih[0].shape


# In[8]:

# PCA keeping 95% of the variance across all dihedral trajectories.
pca_obj = coor.pca(dtraj_dih, dim=-1, var_cutoff=0.95, stride=1, mean=None)


# In[9]:

Y = pca_obj.get_output()


# In[10]:

# NB: concatenating introduces errors into the clustering, but that only
# matters for the mpp part -- ignored for now.  Stack the first five PCs of
# every trajectory into one long array.
from copy import deepcopy
dtraj_conc = deepcopy(Y[0][:, 0:5])
for idx in range(1, len(dtraj_rama_2)):
    dtraj_conc = np.vstack((dtraj_conc, Y[idx][:, 0:5]))