def k_centers(X, n_clusters=8, metric='rmsd', random_state=None):
    """Greedy K-Centers clustering.

    Starting from one seed sample, repeatedly promote the sample that is
    farthest from every center chosen so far to a new center, and relabel
    the samples that are closer to the new center than to their previous
    one. This is a heuristic for minimising the maximum distance from any
    sample to its assigned center and needs one distance sweep per center.

    Parameters
    ----------
    X : sequence
        The dataset; only ``len(X)`` is used here, everything else is
        delegated to ``pairwise_distances``.
        NOTE(review): presumably an ``md.Trajectory`` for ``metric="rmsd"``
        and an ``np.ndarray`` otherwise - confirm against
        ``pairwise_distances``.
    n_clusters : int, optional, default: 8
        Maximum number of centers to pick (including the seed).
    metric : str, optional, default: 'rmsd'
        Metric name forwarded unchanged to ``pairwise_distances``.
    random_state : int or None, optional
        Index of the sample used as the first center. ``None`` (the
        default) or ``-1`` picks the first center uniformly at random with
        the global numpy random number generator.

    Returns
    -------
    cluster_centers_ : list of int
        Indices into ``X`` of the chosen centers, in selection order.
    labels_ : np.ndarray of int32, shape (len(X),)
        Position (within ``cluster_centers_``) of the center each sample
        was assigned to.

    References
    ----------
    .. [1] Gonzalez, Teofilo F. "Clustering to minimize the maximum
       intercluster distance." Theor. Comput. Sci. 38 (1985): 293-306.
    .. [2] Beauchamp, Kyle A., et al. "MSMBuilder2: modeling conformational
       dynamics on the picosecond to millisecond scale." J. Chem. Theory.
       Comput. 7.10 (2011): 3412-3419.
    """
    n_samples = len(X)

    # Compare by value, not identity: ``is -1`` only worked through
    # CPython's small-int cache. ``None`` previously fell through and was
    # handed to pairwise_distances as if it were a sample index.
    if random_state is None or random_state == -1:
        seed = check_random_state(None).randint(0, n_samples)
    else:
        seed = random_state
    print("seed=", seed)

    cluster_centers_ = [seed]
    # Distance from every sample to its nearest center chosen so far.
    distances_ = pairwise_distances(X, index=seed, metric=metric)
    labels_ = np.zeros(n_samples, dtype=np.int32)

    for i in range(1, n_clusters):
        # The farthest sample from all existing centers becomes a center.
        max_index = np.argmax(distances_)
        cluster_centers_.append(max_index)
        # Kept from the original: stop on a negative maximum distance.
        # NOTE(review): the candidate is appended before this check, and
        # distances are presumably never negative - confirm the intent.
        if distances_[max_index] < 0:
            break
        new_distance_list = pairwise_distances(X, index=max_index,
                                               metric=metric)
        # Samples that are strictly closer to the new center move to it.
        updated_indices = np.where(new_distance_list < distances_)[0]
        distances_[updated_indices] = new_distance_list[updated_indices]
        labels_[updated_indices] = i

    return cluster_centers_, labels_
def k_centers_assign(X, centers=None, n_clusters=8, metric='rmsd',
                     random_state=None):
    """Assign every sample of ``X`` to its nearest cluster center.

    For each sample index ``i`` the distances returned by
    ``pairwise_distances(X=centers, Y=X, index=i, metric=metric)`` are
    computed and the position of the smallest one becomes the label.

    Parameters
    ----------
    X : sequence
        Samples to assign.
        NOTE(review): presumably an ``md.Trajectory`` for ``metric="rmsd"``
        and an ``np.ndarray`` otherwise - confirm against
        ``pairwise_distances``.
    centers : sequence
        The cluster centers. Required despite the ``None`` default.
    n_clusters : int, optional
        Unused; kept for signature compatibility.
    metric : str, optional, default: 'rmsd'
        Metric name forwarded unchanged to ``pairwise_distances``.
    random_state : optional
        Unused; kept for signature compatibility.

    Returns
    -------
    labels_ : np.ndarray of int32, shape (len(X),)
        Index into ``centers`` of the nearest center for each sample.

    Raises
    ------
    TypeError
        If ``centers`` is None. The original printed a message and then
        failed on ``len(None)`` with the same exception type; the error is
        now raised up front with an explicit message.
    """
    if centers is None:
        raise TypeError(
            "No Cluster Centers found! `centers` must be provided.")

    n_samples = len(X)
    n_centers = len(centers)
    print("N_Centers:", n_centers)
    print("N_samples:", n_samples)

    labels_ = np.zeros(n_samples, dtype=np.int32)
    for i in range(n_samples):
        # One distance sweep per sample: sample i against every center.
        distances_ = pairwise_distances(X=centers, Y=X, index=i,
                                        metric=metric)
        labels_[i] = np.argmin(distances_)
    return labels_
def k_centers_assign(X, centers=None, n_clusters=8, metric='rmsd',
                     random_state=None):
    """Assign every sample of ``X`` to its nearest cluster center.

    For each sample index ``i`` the distances returned by
    ``pairwise_distances(X=centers, Y=X, index=i, metric=metric)`` are
    computed and the position of the smallest one becomes the label.

    Parameters
    ----------
    X : sequence
        Samples to assign.
        NOTE(review): presumably an ``md.Trajectory`` for ``metric="rmsd"``
        and an ``np.ndarray`` otherwise - confirm against
        ``pairwise_distances``.
    centers : sequence
        The cluster centers. Required despite the ``None`` default.
    n_clusters : int, optional
        Unused; kept for signature compatibility.
    metric : str, optional, default: 'rmsd'
        Metric name forwarded unchanged to ``pairwise_distances``.
    random_state : optional
        Unused; kept for signature compatibility.

    Returns
    -------
    labels_ : np.ndarray of int32, shape (len(X),)
        Index into ``centers`` of the nearest center for each sample.

    Raises
    ------
    TypeError
        If ``centers`` is None. The original printed a message and then
        failed on ``len(None)`` with the same exception type; the error is
        now raised up front with an explicit message.
    """
    if centers is None:
        raise TypeError(
            "No Cluster Centers found! `centers` must be provided.")

    n_samples = len(X)
    n_centers = len(centers)
    # print() calls instead of Python 2 print statements, which are a
    # SyntaxError on Python 3.
    print("N_Centers:", n_centers)
    print("N_samples:", n_samples)

    labels_ = np.zeros(n_samples, dtype=np.int32)
    for i in range(n_samples):
        # One distance sweep per sample: sample i against every center.
        distances_ = pairwise_distances(X=centers, Y=X, index=i,
                                        metric=metric)
        labels_[i] = np.argmin(distances_)
    return labels_
def run_knn(X, n_neighbors=100, n_samples=1000, metric='rmsd',
            algorithm='vp_tree'):
    """Compute a sampled distance matrix and the k nearest neighbours of X.

    Two independent results are produced:

    1. ``n_samples`` entries of ``X`` are drawn without replacement and
       their full pairwise distance matrix is computed.
    2. For every entry of ``X`` the ``n_neighbors`` nearest neighbours are
       queried. Each point is queried against the set that contains it, so
       callers must account for the point itself among its neighbours.

    Parameters
    ----------
    X : sequence
        The dataset. For ``metric == 'rmsd'`` it must expose an ``xyz``
        array with at least three axes and its entries must support ``+``.
        NOTE(review): presumably an ``md.Trajectory`` - confirm. Otherwise
        it is treated as a 2-D array.
    n_neighbors : int, optional, default: 100
        Number of neighbours returned per point.
    n_samples : int, optional, default: 1000
        Number of entries drawn for the sampled distance matrix.
    metric : str, optional, default: 'rmsd'
        Metric name for ``pairwise_distances`` and the neighbour search.
    algorithm : str, optional, default: 'vp_tree'
        ``'vp_tree'`` uses ``knnn.vp_tree_parallel``; any other value is
        forwarded to ``NearestNeighbors``. Ignored when ``metric`` is
        ``'rmsd'``, which always uses the vp-tree.

    Returns
    -------
    sample_dist_metric
        Result of ``pairwise_distances`` on the drawn samples.
    distances_, indices
        Neighbour distances and indices as returned by the selected
        backend.
    """
    # ``reduce`` is a builtin on Python 2 only.
    from functools import reduce

    # Sample positions rather than X itself: random.sample needs a real
    # Sequence on Python 3. Selection depends only on the population
    # length, so the same entries are drawn as with random.sample(X, ...).
    sample_ids = random.sample(range(len(X)), n_samples)
    samples = [X[i] for i in sample_ids]

    # Strings are compared by value; ``is`` only worked through interning.
    if metric == "rmsd":
        # Join the individual entries back into one object via ``+``.
        whole_samples = reduce(operator.add, samples)
    else:
        whole_samples = samples
    sample_dist_metric = pairwise_distances(whole_samples, whole_samples,
                                            metric=metric)

    # Neighbourhoods for all points; the original point is left in.
    if metric == 'rmsd':
        xyz = X.xyz
        shape_x = np.shape(xyz)
        # The vp-tree takes flattened coordinates plus the number of
        # values per entry (atoms * 3).
        knn = knnn.vp_tree_parallel(
            np.reshape(xyz, (shape_x[0] * shape_x[1] * shape_x[2])),
            shape_x[1] * 3, "rmsd_serial")
        distances_, indices = knn.query(
            np.arange(len(xyz), dtype='int'), n_neighbors)
    elif algorithm == 'vp_tree':
        shape_x = np.shape(X)
        knn = knnn.vp_tree_parallel(
            np.reshape(X, (shape_x[0] * shape_x[1])),
            shape_x[1], "euclidean_serial")
        distances_, indices = knn.query(
            np.arange(len(X), dtype='int'), n_neighbors)
    else:
        neighbors_model = NearestNeighbors(n_neighbors=n_neighbors,
                                           algorithm=algorithm,
                                           metric=metric)
        neighbors_model.fit(X)
        distances_, indices = neighbors_model.kneighbors(
            X, n_neighbors=n_neighbors, return_distance=True)

    return sample_dist_metric, distances_, indices
def main():
    """Cluster phi/psi dihedral angles with DBSCAN at several resolutions.

    Steps:

    1. Parse the command line.
    2. Load ``./phi_angles.txt`` and ``./psi_angles.txt`` when both exist;
       otherwise read the trajectories, compute the angles and cache them
       in those two files.
    3. Draw up to 1000 random frames, sort their pairwise euclidean
       distances and print the distances found at a few small quantiles
       (informational only; the schedule below is hard-coded).
    4. Run DBSCAN once with the first ``eps`` of the schedule, print the
       ten most populated states and plot the result.
    5. Run DBSCAN for every ``(eps, min_samples)`` pair of the schedule,
       plot each result and store the labels.

    Files written in the current directory: ``results.npy``,
    ``eps_list.txt``, ``min_samples_list.txt`` (plus the angle caches and
    whatever ``plot_cluster`` writes).
    """
    import random
    from metrics.pairwise import pairwise_distances

    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-t', '--trajListFns', default='trajlist',
        help='List of trajectory files to read in, separated by spaces.')
    cli.add_argument(
        '-a', '--atomListFns', default='atom_indices',
        help='List of atom index files to read in, separated by spaces.')
    cli.add_argument('-g', '--topology', default='native.pdb',
                     help='topology file.')
    cli.add_argument('-o', '--homedir', help='Home dir.', default=".",
                     type=str)
    cli.add_argument('-e', '--iext',
                     help='The file extension of input trajectory files. '
                          'Must be a filetype that mdtraj.load() can '
                          'recognize.',
                     default="xtc", type=str)
    cli.add_argument('-n', '--n_clusters', help='''n_clusters.''',
                     default=100, type=int)
    cli.add_argument('-m', '--n_macro_states', help='''n_macro_states.''',
                     default=6, type=int)
    cli.add_argument('-s', '--stride', help='stride.', default=None,
                     type=int)
    args = cli.parse_args()

    # =======================================================================
    # Load cached dihedral angles, or compute and cache them.
    if os.path.isfile("./phi_angles.txt") and os.path.isfile(
            "./psi_angles.txt"):
        phi_angles = np.loadtxt("./phi_angles.txt", dtype=np.float32)
        psi_angles = np.loadtxt("./psi_angles.txt", dtype=np.float32)
    else:
        # The reader construction was commented out, so this branch died
        # with a NameError on `trajreader`. It is only needed on a cache
        # miss, so it is built here.
        # NOTE(review): assumes XTCReader is imported at module level with
        # the signature used in the previously commented-out call - confirm.
        trajreader = XTCReader(args.trajListFns, args.atomListFns,
                               args.homedir, args.iext, args.topology,
                               nSubSample=args.stride)
        trajs = trajreader.trajs
        phi_angles, psi_angles = trajreader.get_phipsi(
            trajs, psi=[5, 7, 13, 15], phi=[3, 5, 7, 13])
        np.savetxt("./phi_angles.txt", phi_angles, fmt="%f")
        np.savetxt("./psi_angles.txt", psi_angles, fmt="%f")

    phi_psi = np.column_stack((phi_angles, psi_angles))
    len_frames = len(phi_psi)

    # =======================================================================
    # Estimate distance scales from a random subset of frames.
    # Cap at the number of frames so short data sets do not make
    # random.sample raise.
    n_samples = min(1000, len_frames)
    whole_samples = random.sample(list(phi_psi), n_samples)
    sample_dist_metric = pairwise_distances(whole_samples, whole_samples,
                                            metric='euclidean')
    print(sample_dist_metric.shape)

    # Strict upper triangle = every unordered pair exactly once.
    upper = np.triu_indices(n_samples, k=1)
    sorted_sample_dist = np.sort(np.asarray(sample_dist_metric)[upper])
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    estimated_eps = []
    len_samples = len(sorted_sample_dist)
    for percent in [0.05, 0.025, 0.008]:
        # The listed values are divided by ten before use.
        index = int(round(len_samples * (percent / 10.0)))
        if index == len_samples:
            index -= 1
        estimated_eps.append(sorted_sample_dist[index])
    print(estimated_eps)

    # =======================================================================
    # Clustering using the MR-DBSCAN method.
    clustering_name = "mr-dbscan_iter_"
    potential = True

    # Hard-coded schedule used instead of the estimates printed above.
    eps_list = [9.376904, 3.3741567, 0.87675905]
    min_samples_list = [1, 20, 20]

    eps = 9.376904
    min_samples = 1
    print("Total frames:", len_frames)
    print("Running first calculation")
    db = DBSCAN(eps=eps, min_samples=min_samples,
                algorithm='kd_tree').fit(phi_psi)
    old_assignments = db.labels_
    # Label -1 marks noise and is not counted as a state.
    n_microstates = len(set(old_assignments)) - (
        1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)

    # Population of each state, most populated first (noise excluded).
    frame_bincount = np.bincount(old_assignments[old_assignments >= 0])
    frame_freq_index_sorted = np.argsort(frame_bincount)[::-1]
    frame_freq_percent_sorted = frame_bincount[
        frame_freq_index_sorted] / np.float32(len_frames)
    print(frame_freq_percent_sorted[0:10])
    print(frame_freq_index_sorted[0:10])

    iter_name = (clustering_name + '0' + '_eps_' + str(eps) +
                 '_min_samples_' + str(min_samples) + '_n_states_' +
                 str(n_microstates))
    plot_cluster(labels=old_assignments, phi_angles=phi_angles,
                 psi_angles=psi_angles, name=iter_name, potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)

    # One row of labels per resolution. A stray bare `r` expression (left
    # over from a mangled comment) used to raise NameError right here.
    results = np.zeros((n_iterations, len_frames), dtype=np.int32)
    for i in range(n_iterations):
        eps = eps_list[i]
        min_samples = min_samples_list[i]
        db = DBSCAN(eps=eps, min_samples=min_samples).fit(phi_psi)
        assignments = db.labels_
        n_microstates = len(set(assignments)) - (
            1 if -1 in assignments else 0)
        results[i, :] = np.array(assignments)
        print("Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_sampes:',
              min_samples, 'Estimated number of clusters:', n_microstates)
        iter_name = (clustering_name + str(i) + '_eps_' + str(eps) +
                     '_min_samples_' + str(min_samples) + '_n_states_' +
                     str(n_microstates))
        plot_cluster(labels=assignments, phi_angles=phi_angles,
                     psi_angles=psi_angles, name=iter_name,
                     potential=potential)

    print(results)
    np.save("results.npy", results)
    np.savetxt("eps_list.txt", eps_list, fmt="%f", delimiter=",")
    np.savetxt("min_samples_list.txt", min_samples_list, fmt="%d",
               delimiter=",")
def k_centers(X, n_clusters=8, metric='rmsd', random_state=None):
    """Greedy K-Centers clustering.

    Starting from one seed sample, repeatedly promote the sample that is
    farthest from every center chosen so far to a new center, and relabel
    the samples that are closer to the new center than to their previous
    one. This is a heuristic for minimising the maximum distance from any
    sample to its assigned center and needs one distance sweep per center.

    Parameters
    ----------
    X : sequence
        The dataset; only ``len(X)`` is used here, everything else is
        delegated to ``pairwise_distances``.
        NOTE(review): presumably an ``md.Trajectory`` for ``metric="rmsd"``
        and an ``np.ndarray`` otherwise - confirm against
        ``pairwise_distances``.
    n_clusters : int, optional, default: 8
        Maximum number of centers to pick (including the seed).
    metric : str, optional, default: 'rmsd'
        Metric name forwarded unchanged to ``pairwise_distances``.
    random_state : int or None, optional
        Index of the sample used as the first center. ``None`` (the
        default) or ``-1`` picks the first center uniformly at random with
        the global numpy random number generator.

    Returns
    -------
    cluster_centers_ : list of int
        Indices into ``X`` of the chosen centers, in selection order.
    labels_ : np.ndarray of int32, shape (len(X),)
        Position (within ``cluster_centers_``) of the center each sample
        was assigned to.

    References
    ----------
    .. [1] Gonzalez, Teofilo F. "Clustering to minimize the maximum
       intercluster distance." Theor. Comput. Sci. 38 (1985): 293-306.
    .. [2] Beauchamp, Kyle A., et al. "MSMBuilder2: modeling conformational
       dynamics on the picosecond to millisecond scale." J. Chem. Theory.
       Comput. 7.10 (2011): 3412-3419.
    """
    n_samples = len(X)

    # Compare by value, not identity: ``is -1`` only worked through
    # CPython's small-int cache. ``None`` previously fell through and was
    # handed to pairwise_distances as if it were a sample index.
    if random_state is None or random_state == -1:
        seed = check_random_state(None).randint(0, n_samples)
    else:
        seed = random_state
    print("seed=", seed)

    cluster_centers_ = [seed]
    # Distance from every sample to its nearest center chosen so far.
    distances_ = pairwise_distances(X, index=seed, metric=metric)
    labels_ = np.zeros(n_samples, dtype=np.int32)

    for i in range(1, n_clusters):
        # The farthest sample from all existing centers becomes a center.
        max_index = np.argmax(distances_)
        cluster_centers_.append(max_index)
        # Kept from the original: stop on a negative maximum distance.
        # NOTE(review): the candidate is appended before this check, and
        # distances are presumably never negative - confirm the intent.
        if distances_[max_index] < 0:
            break
        new_distance_list = pairwise_distances(X, index=max_index,
                                               metric=metric)
        # Samples that are strictly closer to the new center move to it.
        updated_indices = np.where(new_distance_list < distances_)[0]
        distances_[updated_indices] = new_distance_list[updated_indices]
        labels_[updated_indices] = i

    return cluster_centers_, labels_
def main():
    """Run multi-resolution Faiss DBSCAN on the alanine dipeptide example.

    Steps:

    1. Parse the command line (first, so bad arguments or ``--help`` do
       not wait for the dataset and tICA work).
    2. Fetch the msmbuilder ``AlanineDipeptide`` example, featurise it
       with phi/psi dihedrals and project it with a 2-component tICA
       model; only the first trajectory is clustered.
    3. Draw up to 1000 random frames, sort their pairwise ``l2`` distances
       and take the 30%, 20% and 10% quantiles as the ``eps`` schedule.
    4. Cluster once with the largest ``eps`` and ``min_samples=1``, print
       the ten most populated states and plot the result.
    5. For each remaining ``eps`` cluster again and merge the new labels
       with the first-pass labels via ``merge_assignments``; outliers are
       removed on the last iteration only.

    Files written in the current directory: ``eps_list.txt`` and
    ``min_samples_list.txt`` (plus whatever ``plot_cluster`` writes).
    """
    import random
    from metrics.pairwise import pairwise_distances

    cli = argparse.ArgumentParser()
    cli.add_argument('-e', '--eps', help='eps', default=1, type=float)
    cli.add_argument('-m', '--min_samples', help='min_samples', default=5,
                     type=int)
    cli.add_argument('-l', '--nlist', help='nlist', default=1000, type=int)
    cli.add_argument('-p', '--nprobe', help='nprob', default=10, type=int)
    args = cli.parse_args()
    eps = args.eps
    min_samples = args.min_samples
    nlist = args.nlist
    nprobe = args.nprobe
    IVFFlat = True

    # Download example dataset
    from msmbuilder.example_datasets import AlanineDipeptide
    ala2 = AlanineDipeptide(verbose=False)
    xyz = ala2.get().trajectories
    print(ala2.description())
    print("{} trajectories".format(len(xyz)))
    # msmbuilder does not keep track of units; the timestep is tracked by
    # hand here.
    to_ns = 0.5
    print("with length {} ns".format(set(len(x) * to_ns for x in xyz)))

    from msmbuilder.featurizer import DihedralFeaturizer
    featurizer = DihedralFeaturizer(types=['phi', 'psi'])
    diheds = featurizer.fit_transform(xyz)
    print(xyz[0].xyz.shape)
    print(diheds[0].shape)

    from msmbuilder.preprocessing import RobustScaler
    scaler = RobustScaler()
    # Only the shape of the scaled features is reported; tICA below is
    # fitted on `diheds`.
    scaled_diheds = scaler.fit_transform(diheds)
    print(diheds[0].shape)
    print(scaled_diheds[0].shape)

    from msmbuilder.decomposition import tICA
    tica_model = tICA(lag_time=2, n_components=2)
    tica_model.fit(diheds)
    tica_trajs = tica_model.transform(diheds)

    # Second featurisation with raw angles (no sin/cos), used for plotting.
    featurizer = DihedralFeaturizer(types=['phi', 'psi'], sincos=False)
    diheds = featurizer.fit_transform(xyz)
    print(diheds[0].shape)
    print(tica_trajs[0].shape)

    phi_angles = np.degrees(diheds[0][:, 0])
    psi_angles = np.degrees(diheds[0][:, 1])
    print(phi_angles)
    X = tica_trajs[0].astype(np.float32)
    n_size = X.shape[0]
    dimension = X.shape[1]
    print('n_size = %d,\t dimension = %d,\t eps = %f, min_samples = %d' %
          (n_size, dimension, eps, min_samples))

    # =======================================================================
    # Derive the eps schedule from a random subset of frames.
    len_frames = len(X)
    # Cap at the number of frames so short data sets do not make
    # random.sample raise.
    n_samples = min(1000, len_frames)
    whole_samples = random.sample(list(X), n_samples)
    sample_dist_metric = pairwise_distances(whole_samples, whole_samples,
                                            metric='l2')
    print(sample_dist_metric.shape)

    # Strict upper triangle = every unordered pair exactly once.
    upper = np.triu_indices(n_samples, k=1)
    sorted_sample_dist = np.sort(np.asarray(sample_dist_metric)[upper])
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [0.30, 0.20, 0.10]:
        index = int(round(len_samples * percent))
        if index == len_samples:
            index -= 1
        eps_list.append(sorted_sample_dist[index])
    print(eps_list)

    # =======================================================================
    # Clustering using the MR-DBSCAN method.
    clustering_name = "mr-dbscan_iter_"
    potential = False
    # The command-line eps/min_samples are only reported above; the first
    # pass uses the coarsest estimated eps.
    eps = eps_list[0]
    min_samples = 1
    print("Total frames:", len_frames)
    print("Running first calculation")
    db = Faiss_DBSCAN(eps=eps, min_samples=min_samples, nlist=nlist,
                      nprobe=nprobe, metric="l2", GPU=False,
                      IVFFlat=IVFFlat)
    db.fit(X)
    old_assignments = db.labels_
    # Label -1 marks noise and is not counted as a state.
    n_microstates = len(set(old_assignments)) - (
        1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)

    # Population of each state, most populated first (noise excluded).
    frame_bincount = np.bincount(old_assignments[old_assignments >= 0])
    frame_freq_index_sorted = np.argsort(frame_bincount)[::-1]
    frame_freq_percent_sorted = frame_bincount[
        frame_freq_index_sorted] / np.float32(len_frames)
    print(frame_freq_percent_sorted[0:10])
    print(frame_freq_index_sorted[0:10])
    # The cluster count is printed a second time, as in the original.
    print('Estimated number of clusters: %d' % n_microstates)

    iter_name = (clustering_name + '0' + '_eps_' + str(eps) +
                 '_min_samples_' + str(min_samples) + '_n_states_' +
                 str(n_microstates))
    plot_cluster(labels=old_assignments, phi_angles=phi_angles,
                 psi_angles=psi_angles, name=iter_name, potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)
    min_samples_list = [50, 30, 10]

    for i in range(1, n_iterations):
        eps = eps_list[i]
        min_samples = min_samples_list[i]
        db = Faiss_DBSCAN(eps=eps, min_samples=min_samples, nlist=nlist,
                          nprobe=nprobe, metric="l2", GPU=False,
                          IVFFlat=IVFFlat)
        # fit() is called separately, as in the first pass, so this does
        # not depend on fit() returning the estimator.
        db.fit(X)
        new_assignments = db.labels_
        # Outliers are dropped only on the final iteration. Compared by
        # value: ``is`` on ints relies on CPython's small-int cache.
        remove_outliers = (i == n_iterations - 1)
        # Every resolution is merged against the first-pass labels;
        # `old_assignments` is never updated inside the loop.
        assignments = merge_assignments(new_assignments, old_assignments,
                                        remove_outliers=remove_outliers)
        n_microstates = len(set(assignments)) - (
            1 if -1 in assignments else 0)
        print("Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_sampes:',
              min_samples, 'Estimated number of clusters:', n_microstates)
        iter_name = (clustering_name + str(i) + '_eps_' + str(eps) +
                     '_min_samples_' + str(min_samples) + '_n_states_' +
                     str(n_microstates))
        plot_cluster(labels=assignments, phi_angles=phi_angles,
                     psi_angles=psi_angles, name=iter_name,
                     potential=potential)

    np.savetxt("eps_list.txt", eps_list, fmt="%f", delimiter=",")
    np.savetxt("min_samples_list.txt", min_samples_list, fmt="%d",
               delimiter=",")
def main():
    """Run multi-resolution Faiss DBSCAN on cached phi/psi angles.

    Steps:

    1. Parse the command line.
    2. Load ``./phi_angles.txt`` and ``./psi_angles.txt`` and stack them
       into a two-column sample matrix.
    3. Draw up to 1000 random frames, sort their pairwise ``l2`` distances
       and take the 20%, 5% and 2% quantiles as the ``eps`` schedule.
    4. Cluster once with the largest ``eps`` and ``min_samples=1``, print
       the ten most populated states and plot the result.
    5. For each remaining ``eps`` cluster again and merge the new labels
       with the first-pass labels via ``merge_assignments``; outliers are
       removed on the last iteration only.

    Files written in the current directory: ``eps_list.txt`` and
    ``min_samples_list.txt`` (plus whatever ``plot_cluster`` writes).

    Raises
    ------
    IOError
        If either angle file is missing. Previously this surfaced as an
        ``UnboundLocalError`` on ``phi_angles``.
    """
    import random
    from metrics.pairwise import pairwise_distances

    cli = argparse.ArgumentParser()
    cli.add_argument('-e', '--eps', help='eps', default=1, type=float)
    cli.add_argument('-m', '--min_samples', help='min_samples', default=5,
                     type=int)
    cli.add_argument('-l', '--nlist', help='nlist', default=1000, type=int)
    cli.add_argument('-p', '--nprobe', help='nprob', default=10, type=int)
    args = cli.parse_args()
    eps = args.eps
    min_samples = args.min_samples
    nlist = args.nlist
    nprobe = args.nprobe
    IVFFlat = True

    # =======================================================================
    # This variant has no way to compute the angles, so both cache files
    # are required.
    if not (os.path.isfile("./phi_angles.txt") and
            os.path.isfile("./psi_angles.txt")):
        raise IOError("./phi_angles.txt and ./psi_angles.txt must both "
                      "exist in the current directory")
    phi_angles = np.loadtxt("./phi_angles.txt", dtype=np.float32)
    psi_angles = np.loadtxt("./psi_angles.txt", dtype=np.float32)

    X = np.column_stack((phi_angles, psi_angles))
    print(X.shape)
    n_size = X.shape[0]
    dimension = X.shape[1]
    print('n_size = %d,\t dimension = %d,\t eps = %f, min_samples = %d' %
          (n_size, dimension, eps, min_samples))

    # =======================================================================
    # Derive the eps schedule from a random subset of frames.
    len_frames = len(X)
    # Cap at the number of frames so short data sets do not make
    # random.sample raise.
    n_samples = min(1000, len_frames)
    whole_samples = random.sample(list(X), n_samples)
    sample_dist_metric = pairwise_distances(whole_samples, whole_samples,
                                            metric='l2')
    print(sample_dist_metric.shape)

    # Strict upper triangle = every unordered pair exactly once.
    upper = np.triu_indices(n_samples, k=1)
    sorted_sample_dist = np.sort(np.asarray(sample_dist_metric)[upper])
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [0.20, 0.05, 0.020]:
        index = int(round(len_samples * percent))
        if index == len_samples:
            index -= 1
        eps_list.append(sorted_sample_dist[index])
    print(eps_list)

    # =======================================================================
    # Clustering using the MR-DBSCAN method.
    clustering_name = "mr-dbscan_iter_"
    potential = True
    # The command-line eps/min_samples are only reported above; the first
    # pass uses the coarsest estimated eps.
    eps = eps_list[0]
    min_samples = 1
    print("Total frames:", len_frames)
    print("Running first calculation")
    db = Faiss_DBSCAN(eps=eps, min_samples=min_samples, nlist=nlist,
                      nprobe=nprobe, metric="l2", GPU=False,
                      IVFFlat=IVFFlat)
    db.fit(X)
    old_assignments = db.labels_
    # Label -1 marks noise and is not counted as a state.
    n_microstates = len(set(old_assignments)) - (
        1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)

    # Population of each state, most populated first (noise excluded).
    frame_bincount = np.bincount(old_assignments[old_assignments >= 0])
    frame_freq_index_sorted = np.argsort(frame_bincount)[::-1]
    frame_freq_percent_sorted = frame_bincount[
        frame_freq_index_sorted] / np.float32(len_frames)
    print(frame_freq_percent_sorted[0:10])
    print(frame_freq_index_sorted[0:10])

    iter_name = (clustering_name + '0' + '_eps_' + str(eps) +
                 '_min_samples_' + str(min_samples) + '_n_states_' +
                 str(n_microstates))
    plot_cluster(labels=old_assignments, phi_angles=phi_angles,
                 psi_angles=psi_angles, name=iter_name, potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)
    min_samples_list = [10, 20, 20]

    for i in range(1, n_iterations):
        eps = eps_list[i]
        min_samples = min_samples_list[i]
        db = Faiss_DBSCAN(eps=eps, min_samples=min_samples, nlist=nlist,
                          nprobe=nprobe, metric="l2", GPU=False,
                          IVFFlat=IVFFlat)
        # fit() is called separately, as in the first pass, so this does
        # not depend on fit() returning the estimator.
        db.fit(X)
        new_assignments = db.labels_
        # Outliers are dropped only on the final iteration. Compared by
        # value: ``is`` on ints relies on CPython's small-int cache.
        remove_outliers = (i == n_iterations - 1)
        # Every resolution is merged against the first-pass labels;
        # `old_assignments` is never updated inside the loop.
        assignments = merge_assignments(new_assignments, old_assignments,
                                        remove_outliers=remove_outliers)
        n_microstates = len(set(assignments)) - (
            1 if -1 in assignments else 0)
        print("Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_sampes:',
              min_samples, 'Estimated number of clusters:', n_microstates)
        iter_name = (clustering_name + str(i) + '_eps_' + str(eps) +
                     '_min_samples_' + str(min_samples) + '_n_states_' +
                     str(n_microstates))
        plot_cluster(labels=assignments, phi_angles=phi_angles,
                     psi_angles=psi_angles, name=iter_name,
                     potential=potential)

    np.savetxt("eps_list.txt", eps_list, fmt="%f", delimiter=",")
    np.savetxt("min_samples_list.txt", min_samples_list, fmt="%d",
               delimiter=",")
def main():
    """Cluster phi/psi dihedral angles with iterated RMSD-metric DBSCAN.

    Steps:

    1. Parse the command line.
    2. Load ``./phi_angles.txt`` and ``./psi_angles.txt`` when both exist;
       otherwise read the trajectories, compute the angles and cache them
       in those two files.
    3. Draw up to 1000 random frames, sort their pairwise ``rmsd``
       distances and read off a decreasing ``eps`` schedule at a list of
       small quantiles.
    4. Cluster with the largest ``eps``, then for each following ``eps``
       cluster again and fold the new labels into the running assignment
       via ``merge_assignments``; every step is plotted.
    5. Save the final labels to ``assignments_mr-dbscan_n_<N>.txt`` and
       plot them.
    """
    import random
    from metrics.pairwise import pairwise_distances

    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-t', '--trajListFns', default='trajlist',
        help='List of trajectory files to read in, separated by spaces.')
    cli.add_argument(
        '-a', '--atomListFns', default='atom_indices',
        help='List of atom index files to read in, separated by spaces.')
    cli.add_argument('-g', '--topology', default='native.pdb',
                     help='topology file.')
    cli.add_argument('-o', '--homedir', help='Home dir.', default=".",
                     type=str)
    cli.add_argument('-e', '--iext',
                     help='The file extension of input trajectory files. '
                          'Must be a filetype that mdtraj.load() can '
                          'recognize.',
                     default="xtc", type=str)
    cli.add_argument('-n', '--n_clusters', help='''n_clusters.''',
                     default=100, type=int)
    cli.add_argument('-m', '--n_macro_states', help='''n_macro_states.''',
                     default=6, type=int)
    cli.add_argument('-s', '--stride', help='stride.', default=None,
                     type=int)
    args = cli.parse_args()

    # =======================================================================
    # Load cached dihedral angles, or compute and cache them.
    if os.path.isfile("./phi_angles.txt") and os.path.isfile(
            "./psi_angles.txt"):
        phi_angles = np.loadtxt("./phi_angles.txt", dtype=np.float32)
        psi_angles = np.loadtxt("./psi_angles.txt", dtype=np.float32)
    else:
        # The reader construction was commented out, so this branch died
        # with a NameError on `trajreader`. It is only needed on a cache
        # miss, so it is built here.
        # NOTE(review): assumes XTCReader is imported at module level with
        # the signature used in the previously commented-out call - confirm.
        trajreader = XTCReader(args.trajListFns, args.atomListFns,
                               args.homedir, args.iext, args.topology,
                               nSubSample=args.stride)
        trajs = trajreader.trajs
        phi_angles, psi_angles = trajreader.get_phipsi(
            trajs, psi=[6, 8, 14, 16], phi=[4, 6, 8, 14])
        np.savetxt("./phi_angles.txt", phi_angles, fmt="%f")
        np.savetxt("./psi_angles.txt", psi_angles, fmt="%f")

    phi_psi = np.column_stack((phi_angles, psi_angles))

    # =======================================================================
    # Derive the eps schedule from a random subset of frames.
    # random.sample needs a real sequence on Python 3, hence list(); the
    # size is capped so short data sets do not make it raise.
    n_samples = min(1000, len(phi_psi))
    whole_samples = random.sample(list(phi_psi), n_samples)
    sample_dist_metric = pairwise_distances(whole_samples, whole_samples,
                                            metric='rmsd')
    print(sample_dist_metric.shape)

    # Strict upper triangle = every unordered pair exactly once.
    upper = np.triu_indices(n_samples, k=1)
    sorted_sample_dist = np.sort(np.asarray(sample_dist_metric)[upper])
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [
            0.40, 0.35, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05, 0.025, 0.010,
            0.008, 0.005, 0.003, 0.001, 0.0005, 0.0003, 0.0001, 0.00005,
            0.00001
    ]:
        # The listed values are divided by ten before use.
        index = int(round(len_samples * (percent / 10.0)))
        if index == len_samples:
            index -= 1
        eps_list.append(sorted_sample_dist[index])
    print(eps_list)

    # =======================================================================
    # Clustering using the MR-DBSCAN method.
    clustering_name = "mr-dbscan_iter_"
    potential = True
    eps = eps_list[0]
    # The same min_samples is used at every resolution.
    min_samples = 5
    print("Running first calculation")
    db = DBSCAN(eps=eps, min_samples=min_samples,
                metric='rmsd').fit(phi_psi)
    old_assignments = db.labels_
    # Label -1 marks noise and is not counted as a state.
    n_microstates = len(set(old_assignments)) - (
        1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)
    iter_name = (clustering_name + '0' + '_eps_' + str(eps) +
                 '_min_samples_' + str(min_samples) + '_n_states_' +
                 str(n_microstates))
    plot_cluster(labels=old_assignments, phi_angles=phi_angles,
                 psi_angles=psi_angles, name=iter_name, potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)
    for i in range(1, n_iterations):
        eps = eps_list[i]
        db = DBSCAN(eps=eps, min_samples=min_samples,
                    metric='rmsd').fit(phi_psi)
        print("Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_sampes:',
              min_samples)
        new_assignments = db.labels_
        # The running assignment is refined at every resolution.
        old_assignments = merge_assignments(new_assignments,
                                            old_assignments)
        n_microstates = len(set(old_assignments)) - (
            1 if -1 in old_assignments else 0)
        print('Estimated number of clusters: %d' % n_microstates)
        iter_name = (clustering_name + str(i) + '_eps_' + str(eps) +
                     '_min_samples_' + str(min_samples) + '_n_states_' +
                     str(n_microstates))
        plot_cluster(labels=old_assignments, phi_angles=phi_angles,
                     psi_angles=psi_angles, name=iter_name,
                     potential=potential)

    labels = old_assignments
    print(labels)
    n_microstates = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters: %d' % n_microstates)

    # Save and plot the final micro states.
    clustering_name = "mr-dbscan_n_" + str(n_microstates)
    np.savetxt("assignments_" + clustering_name + ".txt", labels, fmt="%d")
    plot_cluster(labels=labels, phi_angles=phi_angles,
                 psi_angles=psi_angles, name=clustering_name,
                 potential=potential)