def test_pca_noref_nofit(self):
    '''PCA with no reference structure and no fitting at all.

    from drroe: " Also, not fitting at all should be considered a
    legitimate option - you may want to include global rotational and
    translational motion in your eigenvectors."

    pytraj: pt.pca(traj, mask, n_vecs=2, fit=False)
    '''
    heavy_atoms = '!@H='

    # Equivalent cpptraj workflow on the raw (unfitted) coordinates; its
    # last two datasets are used as the expected projection below.
    cpptraj_input = '''
    parm data/tz2.parm7
    trajin data/tz2.nc

    matrix covar name MyMatrix !@H=
    createcrd CRD1
    run

    # Step three. Diagonalize matrix.
    runanalysis diagmatrix MyMatrix vecs 2 name MyEvecs

    # Step four. Project saved fit coordinates along eigenvectors 1 and 2
    crdaction CRD1 projection evecs MyEvecs !@H= out project.dat beg 1 end 2
    '''

    traj = pt.load("data/tz2.nc", "data/tz2.parm7")

    # no reference
    cpptraj_state = pt.load_cpptraj_state(cpptraj_input)
    cpptraj_state.run()

    result = pt.pca(traj, heavy_atoms, n_vecs=2, fit=False)
    result_with_ref = pt.pca(traj,
                             heavy_atoms,
                             n_vecs=2,
                             fit=False,
                             ref=3,
                             ref_mask='@CA')
    expected = cpptraj_state.data[-2:].values

    # Compare magnitudes only: the sign of an eigenvector is arbitrary.
    aa_eq(np.abs(result[0]), np.abs(expected), decimal=3)

    # With fit=False the ref/ref_mask arguments must not change the result,
    # so the same no-fit cpptraj output is the expectation here too.
    aa_eq(np.abs(result_with_ref[0]), np.abs(expected), decimal=3)
def test_traj_on_disk_fit_to_given_reference(self):
    """PCA fitted to a given reference: on-disk vs in-memory trajectory.

    The same files are opened once with ``pt.iterload`` and once with
    ``pt.load``; each trajectory is fitted to its own first frame and the
    absolute projections are expected to agree.
    """
    pca_options = dict(mask='@CA', n_vecs=2, fit=True)

    traj_on_disk = pt.iterload(fn('tz2.nc'), fn('tz2.parm7'))
    traj_on_mem = pt.load(fn('tz2.nc'), fn('tz2.parm7'))
    disk_ref = traj_on_disk[0]
    mem_ref = traj_on_mem[0]

    disk_proj, _ = pt.pca(traj_on_disk, ref=disk_ref, **pca_options)
    mem_proj, _ = pt.pca(traj_on_mem, ref=mem_ref, **pca_options)
    aa_eq(np.abs(disk_proj), np.abs(mem_proj))

    # Run again on the same on-disk trajectory; the result must not change.
    # https://github.com/Amber-MD/pytraj/issues/1452
    disk_proj_again, _ = pt.pca(traj_on_disk, ref=disk_ref, **pca_options)
    aa_eq(np.abs(disk_proj), np.abs(disk_proj_again))
def test_pca_with_ref_with_different_mask_from_matrix(self):
    '''PCA with a reference, using different masks for fit and matrix.

    Fitting uses !@H= (ref_mask) while the covariance matrix and the
    projection use * (all atoms).

    from drroe: "You should be able to supply separate masks for fitting
    and creating the covariance matrix. It is common enough for example to
    only perform rms-fitting on heavy atoms while still wanting all atoms
    in eigenvectors."

    pytraj: pt.pca(traj, mask=mask_matrix, n_vecs=2, ref=ref, ref_mask=mask_ref)
    '''
    fit_mask = '!@H='
    matrix_mask = '*'

    # cpptraj workflow producing the expected projection.
    cpptraj_input = '''
    parm data/tz2.parm7
    trajin data/tz2.nc
    reference data/tz2.rst7

    # only perform fitting on heavy atoms
    rms reference !@H=

    # all atoms
    matrix covar name MyMatrix *
    createcrd CRD1
    run

    # Step three. Diagonalize matrix.
    runanalysis diagmatrix MyMatrix vecs 2 name MyEvecs

    # Step four. Project saved fit coordinates along eigenvectors 1 and 2
    # all atoms
    crdaction CRD1 projection evecs MyEvecs * out project.dat beg 1 end 2
    '''

    traj = pt.load("data/tz2.nc", "data/tz2.parm7")
    reference = pt.load('data/tz2.rst7', traj.top)

    cpptraj_state = pt.load_cpptraj_state(cpptraj_input)
    cpptraj_state.run()

    result = pt.pca(traj,
                    mask=matrix_mask,
                    n_vecs=2,
                    ref=reference,
                    ref_mask=fit_mask)
    expected = cpptraj_state.data[-2:].values

    # Compare magnitudes only: the sign of an eigenvector is arbitrary.
    aa_eq(np.abs(result[0]), np.abs(expected), decimal=3)
def test_pca_noref(self):
    '''test_pca_noref: PCA without a user-supplied reference.

    pytraj: pt.pca(traj, mask, n_vecs=2)
    '''
    heavy_atoms = '!@H='

    # Reference cpptraj workflow; topology and trajectory paths are filled
    # in from the module-level tz2_top / tz2_trajin names.
    cpptraj_template = '''
    # Step one. Generate average structure.
    # RMS-Fit to first frame to remove global translation/rotation.
    parm {tz2_top}
    trajin {tz2_trajin}
    rms first !@H=
    average crdset AVG
    run

    # Step two. RMS-Fit to average structure. Calculate covariance matrix.
    # Save the fit coordinates.
    rms ref AVG !@H=
    matrix covar name MyMatrix !@H=
    createcrd CRD1
    run

    # Step three. Diagonalize matrix.
    runanalysis diagmatrix MyMatrix vecs 2 name MyEvecs

    # Step four. Project saved fit coordinates along eigenvectors 1 and 2
    crdaction CRD1 projection evecs MyEvecs !@H= out project.dat beg 1 end 2
    '''
    cpptraj_input = cpptraj_template.format(tz2_top=tz2_top,
                                            tz2_trajin=tz2_trajin)

    traj = pt.load(fn('tz2.nc'), fn('tz2.parm7'))

    # no reference
    cpptraj_state = pt.load_cpptraj_state(cpptraj_input)
    cpptraj_state.run()

    result = pt.pca(traj, heavy_atoms, n_vecs=2)
    expected = cpptraj_state.data[-2:].values

    # Compare magnitudes only: the sign of an eigenvector is arbitrary.
    aa_eq(np.abs(result[0]), np.abs(expected), decimal=3)
def test_pca_noref(self):
    '''PCA without a user-supplied reference.

    pytraj: pt.pca(traj, mask, n_vecs=2)
    '''
    heavy_atoms = '!@H='

    # Reference cpptraj workflow: fit to the first frame, average, fit to
    # the average, build the covariance matrix, diagonalize and project.
    cpptraj_input = '''
    # Step one. Generate average structure.
    # RMS-Fit to first frame to remove global translation/rotation.
    parm data/tz2.parm7
    trajin data/tz2.nc
    rms first !@H=
    average crdset AVG
    run

    # Step two. RMS-Fit to average structure. Calculate covariance matrix.
    # Save the fit coordinates.
    rms ref AVG !@H=
    matrix covar name MyMatrix !@H=
    createcrd CRD1
    run

    # Step three. Diagonalize matrix.
    runanalysis diagmatrix MyMatrix vecs 2 name MyEvecs

    # Step four. Project saved fit coordinates along eigenvectors 1 and 2
    crdaction CRD1 projection evecs MyEvecs !@H= out project.dat beg 1 end 2
    '''

    traj = pt.load("data/tz2.nc", "data/tz2.parm7")

    # no reference
    cpptraj_state = pt.load_cpptraj_state(cpptraj_input)
    cpptraj_state.run()

    result = pt.pca(traj, heavy_atoms, n_vecs=2)
    expected = cpptraj_state.data[-2:].values

    # Compare magnitudes only: the sign of an eigenvector is arbitrary.
    aa_eq(np.abs(result[0]), np.abs(expected), decimal=3)
def test_pca_with_ref(self):
    '''PCA with a user-supplied reference structure.

    from drroe: "If the user provides their own reference structure, do
    not create an average structure"

    pytraj: pt.pca(traj, mask, n_vecs=2, ref=ref)
    '''
    heavy_atoms = '!@H='

    # Reference cpptraj workflow: fit directly to the given reference
    # (no averaging step), then covariance matrix, diagonalize, project.
    cpptraj_input = '''
    parm data/tz2.parm7
    trajin data/tz2.nc
    reference data/tz2.rst7

    rms reference !@H=

    matrix covar name MyMatrix !@H=
    createcrd CRD1
    run

    # Step three. Diagonalize matrix.
    runanalysis diagmatrix MyMatrix vecs 2 name MyEvecs

    # Step four. Project saved fit coordinates along eigenvectors 1 and 2
    crdaction CRD1 projection evecs MyEvecs !@H= out project.dat beg 1 end 2
    '''

    traj = pt.load("data/tz2.nc", "data/tz2.parm7")
    reference = pt.load('data/tz2.rst7', traj.top)

    cpptraj_state = pt.load_cpptraj_state(cpptraj_input)
    cpptraj_state.run()

    result = pt.pca(traj, heavy_atoms, n_vecs=2, ref=reference)
    expected = cpptraj_state.data[-2:].values

    # Compare magnitudes only: the sign of an eigenvector is arbitrary.
    aa_eq(np.abs(result[0]), np.abs(expected), decimal=3)
def test_pca_raise(self):
    """pt.pca on an iterload-ed trajectory is expected to raise ValueError."""
    lazy_traj = pt.iterload('data/tz2.nc', 'data/tz2.parm7')

    with self.assertRaises(ValueError):
        pt.pca(lazy_traj, n_vecs=2, mask='@CA')
def test_raises(self):
    """pt.pca given a single frame instead of a trajectory must raise ValueError."""
    lazy_traj = pt.iterload(fn('tz2.nc'), fn('tz2.parm7'))
    first_frame = lazy_traj[0]

    with pytest.raises(ValueError):
        pt.pca(first_frame, mask='@CA')
# Minimal pt.pca usage example: load the tz2 test trajectory, run PCA on
# the '@CA' mask asking for 3 eigenvectors, then print the function's
# docstring followed by the returned data.
import pytraj as pt

# Paths are relative to the directory the script is run from.
traj = pt.load('../tests/data/tz2.nc', '../tests/data/tz2.parm7')
data = pt.pca(traj, mask='@CA', n_vecs=3)

print(pt.pca.__doc__)
print('##################')
print('output')
print(data)
def _subspace_overlap(vecs_a, vecs_b, n_vecs):
    """Return the summed squared cosine between two eigenvector sets, divided by n_vecs.

    Every one of the first ``n_vecs`` vectors of ``vecs_a`` is compared
    with every one of the first ``n_vecs`` vectors of ``vecs_b``.
    """
    total = 0.0
    for i in range(n_vecs):
        for j in range(n_vecs):
            cosine = np.dot(vecs_a[i], vecs_b[j]) / (
                np.linalg.norm(vecs_a[i]) * np.linalg.norm(vecs_b[j]))
            total += cosine**2
    return total / n_vecs


def main():
    """Run PCA on a trajectory, plot it, and optionally compare a second one.

    Reads the module-level ``args`` namespace: ``traj``, ``parm``, ``mask``,
    ``mask_proj``, ``evec``, ``riter``, ``start``, ``prefix`` and, for the
    optional comparison, ``traj_proj``, ``parm_proj``, ``start_proj`` and
    ``zscore``.

    Files written, all named after ``args.prefix``:
      * ``<prefix>_sele.dat``       index and atom of every selected atom
      * ``<prefix>_eigen_vec.dat``  first three eigenvectors as columns
      * ``<prefix>_pcadata.dat``    projection data, transposed
      * PNG plots of PC pairs, eigenvector components, explained variance
      * with a second trajectory: ``<prefix>_pcadata_proj.dat`` and the
        projection plots

    When a second trajectory is given, the overlap between the two
    eigenvector sets is printed. If ``args.zscore`` is also set, a z-score
    against atom-shuffled versions of the first trajectory is printed.

    The first three eigenvectors are always accessed, so at least three
    must be computed.
    """
    traj = pt.load(args.traj, args.parm)
    rnd_iter = args.riter
    rnd_vecs = args.evec

    if args.mask_proj is None:
        args.mask_proj = args.mask

    # Single pre-formatted string so the call prints the same text under
    # both Python 2 and Python 3.
    print("Mask :  %s" % (args.mask,))
    print("Mask proj:  %s" % (args.mask_proj,))

    if rnd_vecs < 1:
        # No usable count requested: fall back to 3 * n_selected_atoms - 6.
        rnd_vecs = 3 * traj[args.mask].xyz.shape[1] - 6

    # All index pairs (i, j) with i < j, in ascending order.
    pairs = [(n_i, n_j)
             for n_i in range(rnd_vecs)
             for n_j in range(n_i + 1, rnd_vecs)]

    # Record which atoms the mask selected.
    selected = pt.select(traj.top, args.mask)
    with open("%s_sele.dat" % args.prefix, "w") as sele_file:
        sele_file.write("".join(
            "%d %s\n" % (s_i, traj.top.atomlist[s])
            for s_i, s in enumerate(selected)))

    n_vecs = rnd_vecs
    pca_data, eigen = pt.pca(traj[args.start:], mask=args.mask, n_vecs=n_vecs)
    eigen_val = eigen[0]
    eigen_vec = eigen[1]

    np.savetxt("%s_eigen_vec.dat" % args.prefix,
               np.c_[eigen_vec[0], eigen_vec[1], eigen_vec[2]])
    np.savetxt("%s_pcadata.dat" % args.prefix, pca_data.T)

    # Plot PCA
    for pc_i, pc_j in pairs:
        plt.scatter(pca_data[pc_i], pca_data[pc_j], marker='o', c="r", alpha=0.5)
        # Raw strings: "\A" is not a valid escape; the bytes are unchanged.
        plt.xlabel(r"PC%d [$\AA$]" % pc_i)
        plt.ylabel(r"PC%d [$\AA$]" % pc_j)
        plt.title("PCA PC%d vs. PC%d" % (pc_i, pc_j))
        plt.savefig("PC%d-vs-PC%s_%s.png" % (pc_i, pc_j, args.prefix))
        plt.close('all')

    # Plot atom contribution
    for pc_i in range(3):
        n_coords = eigen_vec[pc_i].shape[0]
        # Floor division keeps the shape an int on Python 3 as well.
        n_atoms = n_coords // 3
        contribution = np.linalg.norm(
            eigen_vec[pc_i].reshape((n_atoms, 3)), axis=1)
        atom_ids = np.arange(n_atoms) + 1
        plt.plot(atom_ids, contribution, label="PC%s" % pc_i, alpha=0.5)

    plt.legend()
    plt.xlim(0, n_atoms + 1)
    plt.xlabel("Atom ID")
    plt.ylabel("Eigenvector components")
    plt.title("Eigenvectors")
    plt.savefig("Eigenvectors_%s.png" % args.prefix)
    plt.close('all')

    # Explained variance, cumulative and per eigenvector.
    total_var = np.sum(eigen_val)
    plt.scatter(range(1, n_vecs + 1),
                (np.cumsum(eigen_val) / total_var) * 100,
                label="Cumulative Variance")
    plt.plot(range(1, n_vecs + 1),
             (eigen_val / total_var) * 100,
             "g--",
             label="Variance")
    plt.legend()
    plt.xlabel("Eigenvector #")
    plt.ylabel("Variance explained [%]")
    plt.title("Variance explained by PC Eigenvectors")
    plt.savefig("Variance_%s.png" % args.prefix, dpi=1000)
    plt.close('all')

    if args.traj_proj is not None and args.parm_proj is not None:
        traj_proj = pt.load(args.traj_proj, args.parm_proj)
        # Return value intentionally unused.
        # NOTE(review): presumably called for its in-place superposition of
        # traj_proj - confirm against the pytraj rmsd documentation.
        pt.rmsd(traj_proj, mask=args.mask_proj)

        projection_data = pt.projection(traj_proj[args.start_proj:],
                                        args.mask_proj,
                                        eigenvalues=eigen_val,
                                        eigenvectors=eigen_vec,
                                        scalar_type='covar')
        np.savetxt("%s_pcadata_proj.dat" % args.prefix, projection_data.T)

        for pc_i, pc_j in pairs:
            # First trajectory (red) together with the projection (green).
            plt.scatter(pca_data[pc_i], pca_data[pc_j],
                        marker='o', c="r", alpha=0.5)
            plt.scatter(projection_data[pc_i], projection_data[pc_j],
                        marker='o', c="g", alpha=0.5)
            plt.xlabel(r"PC%d [$\AA$]" % pc_i)
            plt.ylabel(r"PC%d [$\AA$]" % pc_j)
            plt.title("PCA PC%d vs. PC%d with projection" % (pc_i, pc_j))
            plt.savefig("PC%d-vs-PC%s_%s_projection.png" %
                        (pc_i, pc_j, args.prefix))
            plt.close('all')

            # Projection on its own.
            plt.scatter(projection_data[pc_i], projection_data[pc_j],
                        marker='o', c="g", alpha=0.5)
            plt.xlabel(r"PC%d [$\AA$]" % pc_i)
            plt.ylabel(r"PC%d [$\AA$]" % pc_j)
            plt.title("PCA PC%d vs. PC%d only projection" % (pc_i, pc_j))
            plt.savefig("PC%d-vs-PC%d_%s_only_projection.png" %
                        (pc_i, pc_j, args.prefix))
            plt.close('all')

        # Independent PCA of the second trajectory for the overlap measure.
        _, eigen_2 = pt.pca(traj_proj[args.start_proj:],
                            mask=args.mask_proj,
                            n_vecs=n_vecs)
        eigen_vec_2 = eigen_2[1]

        overlap = _subspace_overlap(eigen_vec, eigen_vec_2, rnd_vecs)
        print("Vector space spanned by traj-1 overlap with traj-2 subspace "
              "(%d vecs): %6.3f" % (rnd_vecs, overlap))

        # The z-score needs `overlap`, so it only runs when a second
        # trajectory was supplied.
        if args.zscore is not None:
            overlap_rnd = np.zeros(rnd_iter)

            for r in range(rnd_iter):
                # Make a random trajectory by shuffling atom order per frame.
                # t1_rnd aliases traj, so traj itself is shuffled in place;
                # traj is not used for anything else after this point.
                t1_rnd = traj
                n_frames = t1_rnd.xyz[args.start:].shape[0]
                for f in range(n_frames):
                    frame_idx = args.start + f
                    order = np.random.permutation(
                        t1_rnd.xyz[frame_idx].shape[0])
                    # Write back to the frame that was read. The original
                    # wrote to frame `f`, which is a different frame
                    # whenever args.start != 0.
                    t1_rnd[frame_idx] = t1_rnd.xyz[frame_idx][order]

                _, eigen_t1_rnd = pt.pca(t1_rnd[args.start:],
                                         mask=args.mask,
                                         n_vecs=n_vecs)
                overlap_rnd[r] = _subspace_overlap(eigen_vec,
                                                   eigen_t1_rnd[1],
                                                   n_vecs)

            z_score = (overlap - np.mean(overlap_rnd)) / np.std(overlap_rnd)
            print("Z-score : %6.3f" % z_score)
def main():
    """Cluster trajectory frames with KMeans and plot silhouette diagrams.

    Reads the module-level ``args`` namespace: ``traj``, ``parm``,
    ``stride``, ``mask``, ``start``, ``prefix`` and ``pca``.

    * ``args.pca == "no"``: the Cartesian coordinates of the masked atoms
      are flattened to one row per frame and clustered directly.
    * otherwise: PCA is run first. The first three eigenvectors, the PC
      pair plots, the eigenvector component plot and the variance plot are
      written, and the PCA projection data is clustered.

    For every cluster count from 2 to 19 a silhouette figure is saved as
    ``<prefix>_silhouette_n=<k>.png``. When ``args.pca == "yes"`` the
    figure gets a second panel showing the clustered data and centers.
    """
    X = pt.load(args.traj, args.parm, stride=args.stride)

    if args.pca == "no":
        X = X[args.mask].xyz[args.start:]
        shape = X.shape
        # One row per frame, x/y/z of every selected atom as columns.
        X = X.reshape((shape[0], shape[1] * 3))
    else:
        n_vecs = 3 * X[args.mask].xyz[args.start:].shape[1] - 6
        pca_data, eigen = pt.pca(X[args.start:], n_vecs=n_vecs, mask=args.mask)
        eigen_val = eigen[0]
        eigen_vec = eigen[1]

        np.savetxt("%s_eigen_vec.dat" % args.prefix,
                   np.c_[eigen_vec[0], eigen_vec[1], eigen_vec[2]])

        # All index pairs (i, j) with i < j among the first three PCs.
        pairs = [(n_i, n_j) for n_i in range(3) for n_j in range(n_i + 1, 3)]

        # Plot PCA
        for pc_i, pc_j in pairs:
            plt.scatter(pca_data[pc_i], pca_data[pc_j],
                        marker='o', c="r", alpha=0.5)
            # Raw strings: "\A" is not a valid escape; the bytes are unchanged.
            plt.xlabel(r"PC%d [$\AA$]" % pc_i)
            plt.ylabel(r"PC%d [$\AA$]" % pc_j)
            plt.title("PCA PC%d vs. PC%d" % (pc_i, pc_j))
            plt.savefig("PC%d-vs-PC%s_%s.png" % (pc_i, pc_j, args.prefix),
                        dpi=1000)
            plt.close('all')

        # Plot atom contribution
        for pc_i in range(3):
            n_coords = eigen_vec[pc_i].shape[0]
            # Floor division: a float shape is rejected by reshape on Python 3.
            n_atoms = n_coords // 3
            contribution = np.linalg.norm(
                eigen_vec[pc_i].reshape((n_atoms, 3)), axis=1)
            atom_ids = np.arange(n_atoms) + 1
            plt.plot(atom_ids, contribution, label="PC%s" % pc_i, alpha=0.5)

        plt.legend()
        plt.xlim(0, n_atoms + 1)
        plt.xlabel("Atom ID")
        plt.ylabel("Eigenvector components")
        plt.title("Eigenvectors")
        plt.savefig("Eigenvectors_%s.png" % args.prefix, dpi=1000)
        plt.close('all')

        total_var = np.sum(eigen_val)
        plt.scatter(range(1, n_vecs + 1),
                    (np.cumsum(eigen_val) / total_var) * 100,
                    label="Cumulative Variance")
        plt.plot(range(1, n_vecs + 1),
                 (eigen_val / total_var) * 100,
                 "g--",
                 label="Eigenvector Variance")
        plt.legend()
        plt.xlabel("Eigenvector #")
        plt.ylabel("Fractional of Variance explained [%]")
        plt.title("Explained total variance explained by PCA")
        plt.savefig("Variance_%s.png" % args.prefix, dpi=1000)
        plt.close('all')

        # NOTE(review): the plots above index pca_data by component
        # (pca_data[pc_i]), so KMeans sees components, not frames, as
        # samples here, unlike the non-PCA branch. Confirm whether
        # pca_data.T was intended.
        X = pca_data

    # `cm.spectral` was removed from newer matplotlib releases; prefer the
    # replacement name and fall back to the old one when it is absent.
    spectral = getattr(cm, "nipy_spectral", None)
    if spectral is None:
        spectral = cm.spectral

    range_n_clusters = range(2, 20)

    for n_clusters in range_n_clusters:
        if args.pca == "yes":
            # One row, two panels: silhouette plot and cluster scatter.
            fig, (ax1, ax2) = plt.subplots(1, 2)
            fig.set_size_inches(18, 7)
        else:
            # plt.subplots(1, 1) returns a single Axes, not a pair; only
            # the silhouette panel is drawn in this mode.
            fig, ax1 = plt.subplots(1, 1)

        # The 1st subplot is the silhouette plot
        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of rand for reproducibility.
        rand = np.random.randint(99999)
        print("Random seed is %d." % rand)
        clusterer = KMeans(n_clusters=n_clusters, random_state=rand)
        cluster_labels = clusterer.fit_predict(X)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              ith_cluster_silhouette_values,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        if args.pca == "yes":
            # 2nd Plot showing the actual clusters formed
            colors = spectral(cluster_labels.astype(float) / n_clusters)
            ax2.scatter(X[:, 0], X[:, 1],
                        marker='.', s=30, lw=0, alpha=0.7, c=colors)

            # Labeling the clusters
            centers = clusterer.cluster_centers_
            # Draw white circles at cluster centers
            ax2.scatter(centers[:, 0], centers[:, 1],
                        marker='o', c="white", alpha=1, s=200)

            for i, c in enumerate(centers):
                ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

            ax2.set_title("The visualization of the clustered data.")
            ax2.set_xlabel("Feature space for the 1st feature")
            ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(
            ("Silhouette analysis for KMeans clustering on sample data "
             "with n_clusters = %d" % n_clusters),
            fontsize=14,
            fontweight='bold')

        plt.savefig("%s_silhouette_n=%d.png" % (args.prefix, n_clusters),
                    dpi=1000)
        plt.close('all')