def test_provided_means(self):
    """A user-supplied mean must give the same mean and covariance as an estimated one."""
    samples = np.random.random((300, 3))
    expected_mean = samples.mean(axis=0)

    with_given_mean = pca(samples, mean=expected_mean)
    with_estimated_mean = pca(samples)

    # both estimators have to report the plain column mean of the data
    for model in (with_estimated_mean, with_given_mean):
        np.testing.assert_allclose(expected_mean, model.mean)
    np.testing.assert_allclose(with_given_mean.cov, with_estimated_mean.cov)
def test_api_cs(self):
    """Exercise the ``chunksize`` argument of ``pca`` with and without a fixed ``dim``."""
    explicit = pca(chunksize=1, dim=10)
    assert explicit.chunksize == 1

    # Without a fixed dimension pca can not calculate a proper chunksize,
    # so the fallback value is expected.
    without_dim = pca(chunksize=None)
    assert without_dim.chunksize == without_dim._FALLBACK_CHUNKSIZE

    # With a fixed dimension a real (non-zero, non-fallback) chunksize is expected.
    with_dim = pca(chunksize=None, dim=10)
    assert with_dim.chunksize != 0
    assert with_dim.chunksize != with_dim._FALLBACK_CHUNKSIZE
def test_partial_fit(self):
    """Feeding the data chunk by chunk must match a single fit on all chunks."""
    chunks = [np.random.random((100, 3)), np.random.random((100, 3))]

    incremental = pca()
    for chunk in chunks:
        incremental.partial_fit(chunk)

    reference = pca(chunks)

    for attribute in ("mean", "eigenvalues", "eigenvectors"):
        np.testing.assert_allclose(getattr(incremental, attribute),
                                   getattr(reference, attribute))
def test_dimension(self):
    """Check ``dimension()`` for a fixed ``dim`` and for several ``var_cutoff`` values."""
    assert types.is_int(self.pca_obj.dimension())
    # The class-level estimator is expected to report a single dimension.
    assert self.pca_obj.dimension() == 1

    # Other variants: let the variance cutoff decide how many dimensions are kept.
    for cutoff, expected_dim in ((1.0, 2), (0.8, 1)):
        obj = pca(data=self.X, dim=-1, var_cutoff=cutoff)
        assert obj.dimension() == expected_dim

    # Setting both dim and the variance cutoff at once is forbidden.
    with self.assertRaises(ValueError):
        pca(data=self.X, dim=1, var_cutoff=0.8)
def test_with_data_in_mem(self):
    """Run an in-memory reader -> PCA -> k-means pipeline and check the state indices."""
    import pyemma.coordinates as api

    shapes = ((100, 50), (103, 50), (33, 50))
    data = [np.random.random(shape) for shape in shapes]

    reader = source(data)
    assert isinstance(reader, DataInMemory)

    n_centers = 10
    tpca = api.pca(dim=2)
    km = api.cluster_kmeans(k=n_centers)

    disc = api.discretizer(reader, tpca, km)
    disc.parametrize()

    # state indices are zero-based, so the largest one may be at most n_centers - 1
    for dtraj in disc.dtrajs:
        highest_state = np.max((np.unique(dtraj)))
        self.assertGreaterEqual(
            n_centers - 1, highest_state,
            "dtraj has more states than cluster centers")
def project_and_cluster(trajfiles, featurizer, sparsify=False, tica=True, lag=100000, scale=True,
                        var_cutoff=1.0, ncluster=100, cluster=True):
    """Load features, project them with TICA or PCA and optionally cluster the result.

    Parameters
    ----------
    trajfiles
        Passed unchanged to ``coor.load`` as the trajectory input.
    featurizer
        Passed unchanged to ``coor.load`` as the second argument.
    sparsify : bool
        If true, the loaded data is passed through ``remove_constant`` first.
    tica : bool
        If true, project with ``coor.tica`` (using ``lag``), otherwise with
        ``coor.pca``.
    lag : int
        Lag time handed to ``coor.tica``; unused for PCA.
    scale : bool
        If true, every output array is multiplied in place by the leading
        ``trans_obj.dimension()`` eigenvalues.
    var_cutoff : float
        Variance cutoff handed to the projection.
    ncluster : int
        Number of k-means centers.
    cluster : bool
        If true (default), run k-means on the projection and return the
        clustering object as well.  This used to be read from an undefined
        name, which raised ``NameError`` unless a global of that name existed.

    Returns
    -------
    trans_obj, Y, clustering
        ``clustering`` is only part of the result when ``cluster`` is true;
        otherwise ``(trans_obj, Y)`` is returned.
    """
    X = coor.load(trajfiles, featurizer)
    if sparsify:
        X = remove_constant(X)

    if tica:
        trans_obj = coor.tica(X, lag=lag, var_cutoff=var_cutoff)
    else:
        trans_obj = coor.pca(X, dim=-1, var_cutoff=var_cutoff)
    Y = trans_obj.get_output()

    if scale:
        # the weights do not change between trajectories, so slice them only once
        weights = trans_obj.eigenvalues[:trans_obj.dimension()]
        for y in Y:
            y *= weights

    if cluster:
        cl_obj = coor.cluster_kmeans(Y, k=ncluster, max_iter=3, fixed_seed=True)
        return trans_obj, Y, cl_obj
    return trans_obj, Y
def test_feature_correlation_MD(self):
    """Compare ``feature_PC_correlation`` with ``np.corrcoef`` on the bundled BPTI data."""
    # Input files are set up the same way as in the test_MD_data test
    data_dir = pkg_resources.resource_filename(__name__, 'data') + os.path.sep
    self.pdb_file = os.path.join(data_dir, 'bpti_ca.pdb')
    self.xtc_file = os.path.join(data_dir, 'bpti_mini.xtc')

    reader = source(self.xtc_file, top=self.pdb_file)
    pcamini = pca(reader)

    # input features and their projection, first trajectory only
    feature_traj = pcamini.data_producer.get_output()[0]
    pca_traj = pcamini.get_output()[0]
    nfeat = feature_traj.shape[1]
    npcs = pca_traj.shape[1]

    test_corr = pcamini.feature_PC_correlation
    # corrcoef of the stacked variables; keep the feature-vs-PC block only
    full_corr = np.corrcoef(feature_traj.T, pca_traj.T)
    true_corr = full_corr[:nfeat, -npcs:]

    np.testing.assert_allclose(test_corr, true_corr, atol=1.E-8)
def test_feature_correlation_data(self):
    """Compare ``feature_PC_correlation`` with ``np.corrcoef`` on synthetic features."""
    # Three features: a linear ramp, a noisy copy of the ramp, and pure noise
    feature_traj = np.zeros((100, 3))
    n_frames = len(feature_traj)
    feature_traj[:, 0] = np.linspace(-.5, .5, n_frames)
    noise = np.random.randn(n_frames) * .5
    feature_traj[:, 1] = (feature_traj[:, 0] + noise)**1
    feature_traj[:, 2] = np.random.randn(n_frames)
    nfeat = feature_traj.shape[1]

    # PCA keeping all three components
    pca_obj = pca(data=feature_traj, dim=3)
    pca_traj = pca_obj.get_output()[0]
    npcs = pca_traj.shape[1]

    # Correlations: estimator result versus the feature-vs-PC block of corrcoef
    test_corr = pca_obj.feature_PC_correlation
    full_corr = np.corrcoef(feature_traj.T, pca_traj.T)
    true_corr = full_corr[:nfeat, -npcs:]

    np.testing.assert_allclose(test_corr, true_corr, atol=1.E-8)
def setUpClass(cls):
    """Generate two-state HMM data with Gaussian output and fit a 1-d PCA on it."""
    import pyemma.msm.generation as msmgen

    # HMM with two Gaussians: transition matrix, length and per-state output parameters
    cls.P = np.array([[0.99, 0.01], [0.01, 0.99]])
    cls.T = 10000
    means = [np.array([-1, 1]), np.array([1, -1])]
    widths = [np.array([0.3, 2]), np.array([0.3, 2])]

    # continuous trajectory, filled frame by frame below
    cls.X = np.zeros((cls.T, 2))
    # hidden trajectory
    dtraj = msmgen.generate_traj(cls.P, cls.T)
    for t in range(cls.T):
        state = dtraj[t]
        # one normal draw per coordinate, first coordinate drawn first
        for d in range(2):
            cls.X[t, d] = widths[state][d] * np.random.randn() + means[state][d]

    cls.lag = 10
    cls.pca_obj = pca(data=cls.X, dim=1)
def setUpClass(cls):
    """Seed numpy, generate two-state HMM data with Gaussian output and fit a 1-d PCA."""
    # remember the old random state (to be set back in tearDownClass), then seed
    cls.old_state = np.random.get_state()
    np.random.seed(0)

    # HMM with two Gaussians: transition matrix, length and per-state output parameters
    cls.P = np.array([[0.99, 0.01], [0.01, 0.99]])
    cls.T = 10000
    means = [np.array([-1, 1]), np.array([1, -1])]
    widths = [np.array([0.3, 2]), np.array([0.3, 2])]

    # continuous trajectory, filled frame by frame below
    cls.X = np.zeros((cls.T, 2))
    # hidden trajectory
    dtraj = MarkovStateModel(cls.P).simulate(cls.T)
    for t in range(cls.T):
        state = dtraj[t]
        # one normal draw per coordinate, first coordinate drawn first
        for d in range(2):
            cls.X[t, d] = widths[state][d] * np.random.randn() + means[state][d]

    cls.pca_obj = pca(data=cls.X, dim=1)
def setUpClass(cls):
    """Build the shared fixture: two-state HMM observations and a 1-d PCA of them."""
    import pyemma.msm.generation as msmgen

    # generate HMM with two Gaussians
    cls.P = np.array([[0.99, 0.01], [0.01, 0.99]])
    cls.T = 10000
    means = [np.array([-1, 1]), np.array([1, -1])]
    widths = [np.array([0.3, 2]), np.array([0.3, 2])]

    # continuous trajectory
    cls.X = np.zeros((cls.T, 2))
    # hidden trajectory
    dtraj = msmgen.generate_traj(cls.P, cls.T)
    for t in range(cls.T):
        hidden = dtraj[t]
        mu, sigma = means[hidden], widths[hidden]
        # draw x first, then y, so the random stream is consumed in a fixed order
        cls.X[t, 0] = sigma[0] * np.random.randn() + mu[0]
        cls.X[t, 1] = sigma[1] * np.random.randn() + mu[1]

    cls.lag = 10
    cls.pca_obj = pca(data=cls.X, dim=1)
def test_with_data_in_mem(self):
    """Run a memory reader -> PCA -> k-means pipeline and check the state indices."""
    import pyemma.coordinates as api

    shapes = ((100, 50), (103, 50), (33, 50))
    data = [np.random.random(shape) for shape in shapes]
    reader = api.memory_reader(data)

    n_centers = 10
    tpca = api.pca(dim=2)
    km = api.kmeans(k=n_centers)

    disc = api.discretizer(reader, tpca, km)
    disc.parametrize()

    # state indices are zero-based, so the largest one may be at most n_centers - 1
    for dtraj in disc.dtrajs:
        highest_state = np.max((np.unique(dtraj)))
        self.assertGreaterEqual(n_centers - 1, highest_state,
                                "dtraj has more states than cluster centers")
def test_variances(self):
    """Variances of the projected data must match the reported eigenvalues."""
    model = pca(data=self.X)
    projected = model.get_output()[0]

    empirical = np.var(projected, axis=0)
    deviation = np.abs(empirical - model.eigenvalues)

    assert np.max(deviation) < 0.01
dtraj_Vfit_ind = np.array(dtraj_Vfit_ind)
n_sets, n_trajs = dtraj_Vfit_ind.shape[0], dtraj_Vfit_ind.shape[1]

# For every trajectory, stack the entries of all sets on top of each other
# and store the transposed stack.
dtraj_Vfit = []
for traj in range(n_trajs):
    stacked = dtraj_Vfit_ind[0, traj]
    for ss in range(1, n_sets):
        stacked = np.vstack((stacked, dtraj_Vfit_ind[ss, traj]))
    dtraj_Vfit.append(stacked.T)


# In[6]:

np.array(dtraj_Vfit).shape


# **PCA**
pca_obj = coor.pca(dtraj_Vfit, var_cutoff=0.95)
save_object('pca_obj.pkl', pca_obj)

# Leftovers from the TICA version of this notebook, kept for reference:
#plt.plot(tica_obj.eigenvalues,marker='x')
#plt.xlim([-1,20])
#plt.ylim([0.5,1])
# here we do a little trick to ensure that eigenvectors always have the same sign structure.
# That's irrelevant to the analysis and just nicer plots - you can ignore it.
#for i in range(2):
#    if tica_obj.eigenvectors[0, i] > 0:
#        tica_obj.eigenvectors[:, i] *= -1

Y = pca_obj.get_output()  # get PCA coordinates
np.save('Y.npy', Y)
def dotltsne(infilename='', intopname='', nofit=0, lagtime=1, pcadim=2, ticadim=2, maxpcs=50,
             ncomp=2, perplex1=10.0, perplex2=10.0, exag=12.0, rate=200.0, niter=1000,
             command='', ofilename='out.txt'):
    """Embed a trajectory with PCA, TICA, t-SNE and time-lagged t-SNE and write a text table.

    Parameters
    ----------
    infilename : str
        Trajectory file, loaded with ``md.load``.
    intopname : str
        Topology/reference file; loaded with ``md.load_pdb`` and used as ``top``.
    nofit : int
        Superposition onto the reference is skipped only when this equals 1.
    lagtime : int
        Lag in frames used for TICA and time-lagged t-SNE.
    pcadim, ticadim : int
        Number of PCA / TICA coordinates computed and written per frame.
    maxpcs : int
        Number of leading time-lagged components used for the distance matrix.
    ncomp : int
        ``n_components`` of both t-SNE embeddings.
    perplex1, perplex2 : float
        Perplexity of plain and time-lagged t-SNE respectively.
    exag, rate, niter
        ``early_exaggeration``, ``learning_rate`` and ``n_iter`` of both t-SNE runs.
    command : str
        Free text written into the header of the output file.
    ofilename : str
        Path of the output text file.

    Returns
    -------
    None
        Results are written to ``ofilename``.  The function calls ``exit(0)``
        when loading fails with ``IOError`` or when ``lagtime`` exceeds the
        number of frames.
    """
    # Single source of truth for "was the trajectory superimposed", so the
    # header written below always agrees with what was actually done.
    superpose = nofit != 1

    # Reading and superimposing the trajectory
    try:
        print("Loading trajectory")
        refpdb = md.load_pdb(intopname)
        X = md.load(infilename, top=intopname)
        print("Fitting trajectory")
        if superpose:
            X.superpose(refpdb)
    except IOError:
        print("Cannot load %s or %s, exiting." % (infilename, intopname))
        exit(0)
    else:
        print("%s succesfully loaded and fitted" % X)
    print("")

    # Conversion of trajectory into matrix: row i holds x, y, z of atom 0,
    # then of atom 1, ... which is exactly a C-order reshape of X.xyz.
    Xt = X.xyz.reshape(X.n_frames, 3 * X.n_atoms).astype(float)

    # PCA
    print("Runing PCA")
    T = X.n_frames
    if lagtime > T:
        print("Lag time higher than the number of frames, exiting.")
        exit(0)
    # Request exactly pcadim components; relying on the default variance
    # cutoff can keep fewer columns than are written out below.
    pca = coor.pca(data=Xt, dim=pcadim)
    projs_pca = pca.get_output()

    # TICA
    print("Runing TICA")
    tica = coor.tica(data=Xt, lag=lagtime, dim=ticadim)
    projs_tica = tica.get_output()

    # t-SNE
    print("Runing t-SNE")
    Xembtsne = sk.TSNE(n_components=ncomp, perplexity=perplex1,
                       early_exaggeration=exag, learning_rate=rate, n_iter=niter,
                       metric="euclidean").fit_transform(Xt)

    # time-lagged t-SNE
    # NOTE(review): the sp.* calls below rely on NumPy aliases exposed in the
    # scipy namespace; confirm the pinned SciPy version still provides them.
    print("Runing time-lagged t-SNE")
    # whiten the centered coordinates with the covariance eigen-decomposition
    Xm = Xt - sp.mean(Xt, axis=0)
    Xc = sp.cov(sp.transpose(Xm))
    eva, eve = sp.linalg.eig(Xc)
    order = sp.argsort(eva)[::-1]
    eve = eve[:, order]
    eva = eva[order]
    projs = Xm.dot(eve)
    projs = projs / sp.sqrt(eva)
    # symmetrized time-lagged covariance of the whitened coordinates
    C1 = sp.transpose(projs[:-lagtime, ]).dot(projs[lagtime:, ]) / (T - lagtime - 1)
    C1 = (C1 + sp.transpose(C1)) / 2
    eva2, eve2 = sp.linalg.eig(C1)
    order = sp.argsort(eva2)[::-1]
    eve2 = eve2[:, order]
    eva2 = eva2[order]
    # keep the leading maxpcs components, weighted by their eigenvalues
    projs = projs.dot(eve2[:, :maxpcs])
    projs = projs * sp.real(eva2[:maxpcs])
    Xd = spat.distance_matrix(projs, projs)
    Xembtltsne = sk.TSNE(n_components=ncomp, perplexity=perplex2,
                         early_exaggeration=exag, learning_rate=rate, n_iter=niter,
                         metric="precomputed").fit_transform(Xd)

    # Saving results
    print("Saving results")
    # the context manager closes the file even if writing a row fails
    with open(ofilename, 'w') as ofile:
        ofile.write("# Command: %s\n" % command)
        if superpose:
            ofile.write("# structures were superimposed onto reference structure\n")
        else:
            ofile.write("# structures were NOT superimposed onto reference structure\n")
        ofile.write("# lag time set to %i frames\n" % lagtime)
        ofile.write("# output dimension for PCA set to %i\n" % pcadim)
        ofile.write("# output dimension for TICA set to %i\n" % ticadim)
        ofile.write("# number of top principle components passed to time-lagged t-SNE set to %i\n" % maxpcs)
        ofile.write("# output dimension for t-SNE and time-lagged t-SNE set to %i\n" % ncomp)
        ofile.write("# perplexity of t-SNE set to %f\n" % perplex1)
        ofile.write("# perplexity of time-lagged t-SNE set to %f\n" % perplex2)
        ofile.write("# early_exaggeration set to %f\n" % exag)

        # column header
        ofile.write("# structure_ID")
        for j in range(pcadim):
            ofile.write(" PCA%i" % (j + 1))
        for j in range(ticadim):
            ofile.write(" TICA%i" % (j + 1))
        for j in range(ncomp):
            ofile.write(" tSNE%i" % (j + 1))
        for j in range(ncomp):
            ofile.write(" tltSNE%i" % (j + 1))
        ofile.write("\n")

        # one row per frame: 1-based frame id followed by all coordinates
        for i in range(T):
            row = [projs_pca[0][i, j] for j in range(pcadim)]
            row += [projs_tica[0][i, j] for j in range(ticadim)]
            row += [Xembtsne[i, j] for j in range(ncomp)]
            row += [Xembtltsne[i, j] for j in range(ncomp)]
            ofile.write(" %i%s\n" % (i + 1, "".join(" %f" % value for value in row)))
# In[6]:

# transpose every dihedral trajectory in place
for traj in range(len(dtraj_rama_2)):
    dtraj_dih[traj] = dtraj_dih[traj].T


# In[7]:

dtraj_dih[0].shape


# In[8]:

pca_obj = coor.pca(dtraj_dih, dim=-1, var_cutoff=0.95, stride=1, mean=None)


# In[9]:

Y = pca_obj.get_output()


# In[10]:

# nb - this will introduce errors into the clustering but that only matter for the mpp part, let's ignore for now
from copy import deepcopy  # no longer used in this cell; kept in case later cells rely on it

# Concatenate the first five PCA coordinates of all trajectories with a single
# vstack; growing the array inside a loop re-copies everything on each step.
dtraj_conc = np.vstack([Y[traj][:, 0:5] for traj in range(len(dtraj_rama_2))])