def core_clusters(dynamic_clustering, pdb_file, dcd_pkl_filename):
    """Compute a core radius and a representative point for every cluster.

    Frames are grouped by cluster id.  For each cluster the representative
    point returned by ``misc.most_probable_structure_in_cluster`` is
    obtained, the distance (``misc.distance``) of every member frame to it
    is measured, and the radius is the sorted distance at position
    ``n // 2 + 1`` (just above the median), clamped to the largest distance
    for clusters with only one or two members.

    Args:
        dynamic_clustering: mapping of frame index (anything ``int()``
            accepts, e.g. ``"12"``) to cluster id.
        pdb_file: forwarded to ``misc.most_probable_structure_in_cluster``.
        dcd_pkl_filename: forwarded to
            ``misc.most_probable_structure_in_cluster``.

    Returns:
        Tuple ``(radius_by_cluster, representative_by_cluster)``; both dicts
        are keyed by cluster id only.
    """
    frames_by_cluster = {}
    for frame_key in dynamic_clustering:
        cluster_id = dynamic_clustering[frame_key]
        frames_by_cluster.setdefault(cluster_id, []).append(int(frame_key))

    frames = preprocessing.load_residues('reduced_dimensions.pkl')

    distances_of_frames_in_cluster = {}  # cluster id -> core radius
    avg_structure_in_cluster = {}  # cluster id -> representative point
    for cluster_id, member_frames in frames_by_cluster.items():
        representative = misc.most_probable_structure_in_cluster(
            member_frames, frames, pdb_file, cluster_id, "dynamic",
            dcd_pkl_filename)

        # Distances are collected per cluster.  Sharing one dict across
        # clusters (and with the returned radii) would mix distances of
        # previously processed clusters into this cluster's ranking.
        member_distances = sorted(
            misc.distance(frames[frame], representative)
            for frame in member_frames)

        # Integer division keeps the index valid on Python 3; the clamp
        # keeps it in range for one- and two-member clusters.
        cutoff_position = min(len(member_distances) // 2 + 1,
                              len(member_distances) - 1)
        distances_of_frames_in_cluster[cluster_id] = (
            member_distances[cutoff_position])
        avg_structure_in_cluster[cluster_id] = representative

    return distances_of_frames_in_cluster, avg_structure_in_cluster
def get_dynamic_cluster_sequence():
    """Build the row-normalised cluster transition matrix of the trajectory.

    Reads the per-frame cluster assignment from the "dynamic_clustering"
    shelf (keys are frame indices as strings) and counts transitions between
    the clusters of consecutive frames, for as many frames as
    'reduced_dimensions.pkl' has rows.  Each row is then divided by its
    total.

    Returns:
        Tuple ``(sequence, transition_matrix)`` where ``sequence`` is the
        result of ``get_most_probable_path`` on the assignment and
        ``transition_matrix[i][j]`` is the fraction of transitions leaving
        cluster ``i`` that enter cluster ``j``.  A cluster with no outgoing
        transition keeps an all-zero row.
    """
    frames = preprocessing.load_residues('reduced_dimensions.pkl')

    # Work on an in-memory snapshot so the shelf is closed even when one of
    # the later steps raises.
    shelf = shelve.open("dynamic_clustering")
    try:
        d = dict(shelf)
    finally:
        shelf.close()

    cluster_ids = set(d.values())

    transition_matrix = ds.Autovivification()
    for source in cluster_ids:
        for target in cluster_ids:
            transition_matrix[source][target] = 0
    for frame in range(frames.shape[0] - 1):
        source = int(d[str(frame)])
        target = int(d[str(frame + 1)])
        transition_matrix[source][target] += 1

    # Normalise row-wise.
    for source in cluster_ids:
        row_total = sum(transition_matrix[source][target]
                        for target in cluster_ids)
        if row_total == 0:
            # Happens when a cluster is only seen on the last frame;
            # dividing would raise ZeroDivisionError.
            continue
        row_total = float(row_total)
        for target in cluster_ids:
            transition_matrix[source][target] /= row_total

    sequence = get_most_probable_path(d)
    return sequence, transition_matrix
def tsne(load=True, jump=9):
    """Fit a 2-D t-SNE model on the strided trajectory, or reload it.

    Args:
        load: when falsy, a ``TSNE(n_components=2, random_state=0,
            perplexity=40)`` model is fitted on every ``jump``-th row of
            'reduced_dimensions.pkl' and dumped to 'tsne_model.pkl'.
            Otherwise 'tsne_model.pkl' is loaded and its ``embedding_`` is
            read, which only verifies that a fitted model is available.
        jump: stride applied to the rows before fitting; unused when
            ``load`` is truthy.

    Returns:
        None.  The fitted model on disk is the result.
    """
    if load:
        # Nothing is returned; touching ``embedding_`` makes a missing or
        # unfitted model fail here rather than in a later step.  The
        # trajectory is not needed on this path, so it is not loaded.
        joblib.load('tsne_model.pkl').embedding_
        return

    X = preprocessing.load_residues('reduced_dimensions.pkl')[::jump]
    gc.collect()

    model = TSNE(n_components=2, random_state=0, perplexity=40)
    model.fit_transform(X)
    joblib.dump(model, 'tsne_model.pkl')
    return
def get_path_probability(transition_matrix, path, cluster_representative_index, jump):
    """Map a path onto dynamic clusters and compute its probability.

    Every node ``p`` of ``path`` is translated to the dynamic cluster stored
    in the "dynamic_clustering" shelf for frame
    ``cluster_representative_index[p] * jump``.

    Args:
        transition_matrix: nested mapping ``transition_matrix[a][b]`` giving
            the transition probability between dynamic clusters.
        path: sequence of keys of ``cluster_representative_index``.
        cluster_representative_index: mapping from path node to a frame
            index in the strided trajectory.
        jump: stride used to convert that index into a trajectory frame.

    Returns:
        Tuple ``(converted_clusters, probability)``.
        ``converted_clusters`` is the list of dynamic clusters visited with
        consecutive repeats collapsed.  ``probability`` is the fraction of
        frames in the cluster of frame 0, multiplied by the transition
        probability of every step of the path.
    """
    frames = preprocessing.load_residues('reduced_dimensions.pkl')
    total_frames = frames.shape[0]

    d = shelve.open("dynamic_clustering")
    try:
        cluster_membership = {}
        for frame in range(total_frames):
            cluster = int(d[str(frame)])
            cluster_membership[cluster] = cluster_membership.get(cluster, 0) + 1

        def cluster_of(node):
            # Dynamic cluster of the frame representing ``node``.
            return d[str(cluster_representative_index[node] * jump)]

        initial_val = cluster_membership[int(d[str(0)])] / float(total_frames)

        # NOTE(review): the list is seeded from key 0 of
        # cluster_representative_index, not from path[0]; kept as in the
        # original - confirm that this is intended.
        converted_clusters = [cluster_of(0)]
        for step in range(1, len(path)):
            first = cluster_of(path[step - 1])
            second = cluster_of(path[step])
            initial_val *= transition_matrix[first][second]
            # Keep the converted path free of consecutive duplicates.
            if second != converted_clusters[-1]:
                converted_clusters.append(second)
    finally:
        d.close()

    return converted_clusters, initial_val
def cluster_trajectory_kmeans(fit=True):
    """Fit the k-means model, or label every frame with a fitted one.

    Args:
        fit: when truthy, ``KMeans(n_clusters=1000)`` is fitted on
            'reduced_dimensions.pkl' and dumped to "KMEANS.pkl".  Otherwise
            "KMEANS.pkl" is loaded and the predicted cluster id of every
            frame is written to the "kmeans_trajectory_clustering" shelf
            under the frame index as a string.

    Returns:
        None.

    NOTE(review): with ``fit=True`` only the model file is written; the
    per-frame labels are produced by a separate ``fit=False`` call.  Confirm
    that this split is what callers expect.
    """
    X = preprocessing.load_residues('reduced_dimensions.pkl')

    if fit:
        model = KMeans(n_clusters=1000)
        model.fit(X)
        joblib.dump(model, "KMEANS.pkl")
        return

    model = joblib.load("KMEANS.pkl")
    # One batched prediction instead of one call per frame.
    labels = model.predict(X)

    # This shelf stores the cluster id of each frame of the trajectory.
    mean_cluster_ids = shelve.open("kmeans_trajectory_clustering")
    try:
        for frame, label in enumerate(labels):
            mean_cluster_ids[str(frame)] = int(label)
    finally:
        mean_cluster_ids.close()
def markov_chain(dcd, pdb, filename):
    """Extract the most probable path through the dynamic-cluster chain.

    Uses the clustering stored in the "dynamic_clustering" shelf; the calls
    that create it (``mpp.cluster_trajectory_kmeans`` and
    ``mpp.dynamic_cluster_trajectory``) are not made here and must have been
    run beforehand.  The transition graph is constructed, the start and end
    clusters are looked up, and the resulting path is printed and written
    out through ``misc.write_dcd`` and ``misc.write_pdb`` with the
    "dynamic" tag.

    Args:
        dcd: unused.
        pdb: unused.
        filename: trajectory pickle passed to ``pp.load_residues``.

    Returns:
        None.
    """
    sequence, transition_matrix = mpp.get_dynamic_cluster_sequence()
    distribution = mpp.equilibrium_distribution(transition_matrix)
    mpp.construct_transition_graph(sequence, transition_matrix, distribution)

    # The matrix is rebuilt before the path search, as in the original flow.
    # NOTE(review): presumably because the calls above may modify it -
    # confirm before removing.
    sequence, transition_matrix = mpp.get_dynamic_cluster_sequence()

    dynamic_clustering = shelve.open("dynamic_clustering")
    try:
        start, end = misc.get_cluster_ids_for_start_and_end(
            dynamic_clustering["0"], set(dynamic_clustering.values()),
            "dynamic")
    finally:
        dynamic_clustering.close()

    path = mpp.get_most_probable_path_in_markov_chain(
        transition_matrix, start, end)
    print(path)

    dcd_array = pp.load_residues(filename)
    misc.write_dcd(dcd_array, path, "dynamic")
    misc.write_pdb(dcd_array, path, "dynamic")
    return
def get_individual_transitions(cluster_representative_point, jump):
    """Compute per-structure transition probabilities between clusters.

    For every parent cluster ``i`` the log-density of its own members and of
    the members of every cluster ``j`` is evaluated under component ``i`` of
    the mixture stored in the "EM_params_with_full_covariance" shelf.  For a
    parent structure with log-density ``p`` and a child structure with
    log-density ``c`` the raw weight is ``exp(c) / (exp(c) + exp(p))``,
    computed in a numerically stable way.  Weights exactly equal to 0.5 are
    zeroed, and the weights of each parent structure are normalised over all
    children of all clusters.

    Args:
        cluster_representative_point: mapping whose keys are the cluster
            labels to process.
        jump: stride relating the clustered samples to the rows of
            'reduced_dimensions.pkl' and of the energy matrix.

    Returns:
        Nested mapping ``result[sample_index][j]`` holding an array with one
        normalised probability per member of cluster ``j``;
        ``sample_index`` is the parent structure's index in the strided
        data.
    """
    # Snapshot the parameters once: a shelf re-reads a value from disk on
    # every key access, and it was never closed.
    shelf = shelve.open("EM_params_with_full_covariance")
    try:
        mixture_params = dict(shelf)
    finally:
        shelf.close()

    data_points = preprocessing.load_residues('reduced_dimensions.pkl')[::jump]
    energies = np.sum(
        preprocessing.create_energy_matrix(get_from_file=1), axis=1)[::jump]
    dbscan = joblib.load('dbscan_model.pkl')

    transition_matrix = aux.Autovivification()  # T[i][j]: going from i to j
    for i in cluster_representative_point.keys():
        covar_matrix = mixture_params["variances"][i]
        prec_chol, log_det = prob_dis.compute_precisions_chol(
            np.array([covar_matrix]))

        parent_temp_indices = misc.find_indices_of_clusters(dbscan.labels_, i)
        parent_members = np.array(parent_temp_indices)
        parent_log_probabilities = prob_dis.log_pdf(
            data_points[parent_members], mixture_params["means"][i],
            covar_matrix, mixture_params["beta"], energies[parent_members],
            mixture_params["energy_cluster"][i], prec_chol[0], log_det[0])

        parent_temp_transition_probs = aux.Autovivification()
        for j in cluster_representative_point.keys():
            child_members = np.array(
                misc.find_indices_of_clusters(dbscan.labels_, j))
            child_log_probabilities = prob_dis.log_pdf(
                data_points[child_members], mixture_params["means"][i],
                covar_matrix, mixture_params["beta"], energies[child_members],
                mixture_params["energy_cluster"][i], prec_chol[0], log_det[0])

            for index in range(parent_log_probabilities.shape[0]):
                parent_value = np.array([parent_log_probabilities[index]])
                # Subtract the element-wise maximum before exponentiating so
                # that neither exponential can overflow.
                max_val = np.maximum(child_log_probabilities, parent_value)
                trans_prob = np.exp(
                    child_log_probabilities - max_val
                    - np.log(np.exp(child_log_probabilities - max_val)
                             + np.exp(parent_value - max_val)))
                trans_prob[trans_prob == 0.5] = 0.0
                # ``trans_prob`` is a fresh array on every iteration, so it
                # can be stored without copying.
                parent_temp_transition_probs[index][j] = trans_prob

        # Normalise the probabilities of each individual parent structure.
        for index in parent_temp_transition_probs.keys():
            per_child = parent_temp_transition_probs[index]
            denom = 0.0
            for j in per_child.keys():
                denom += np.sum(per_child[j])
            for j in per_child.keys():
                per_child[j] /= denom
                transition_matrix[parent_temp_indices[index]][j] = per_child[j]

    return transition_matrix
def evaluate_metastable_states(jump):
    """Score the DBSCAN clustering against the dynamic clustering.

    The dynamic cluster id of every ``jump``-th frame is read from the
    "dynamic_clustering" shelf and compared with the labels of the DBSCAN
    model in 'dbscan_model.pkl'.  Samples DBSCAN labelled as noise (-1) are
    left out of the comparison.

    Args:
        jump: stride between the frames that were clustered by DBSCAN.

    Returns:
        The value of ``metrics.adjusted_mutual_info_score`` with the dynamic
        clusters as ``labels_true`` and the DBSCAN labels as
        ``labels_pred``.
    """
    X = preprocessing.load_residues('reduced_dimensions.pkl')

    d = shelve.open("dynamic_clustering")
    try:
        clusters = np.array(
            [int(d[str(frame)]) for frame in range(0, X.shape[0], jump)])
    finally:
        d.close()

    model = joblib.load('dbscan_model.pkl')
    not_noise_indices = np.where(model.labels_ != -1)
    labels_true = clusters[not_noise_indices]
    labels_pred = model.labels_[not_noise_indices]
    return metrics.adjusted_mutual_info_score(labels_true, labels_pred)
def density_clustering(jump=9, min_number_of_samples=5):
    """Cluster the t-SNE embedding with DBSCAN and store the model.

    ``eps`` is chosen from the data: the distance of every embedded point to
    its 4th nearest neighbour is computed and the value at the 99 % position
    of the sorted distances is used.  The sorted distance curve is also
    drawn on the current matplotlib figure (it is neither shown nor saved
    here).  The fitted model is dumped to 'dbscan_model.pkl'.

    Args:
        jump: unused; kept so existing callers keep working.
        min_number_of_samples: ``min_samples`` passed to ``DBSCAN``.

    Returns:
        None.
    """
    y = joblib.load('tsne_model.pkl').embedding_

    # Determine eps - the data is assumed to be of uniform density.
    # ``n_neighbors`` is passed by keyword: current scikit-learn releases do
    # not accept it positionally.
    neigh = NearestNeighbors(n_neighbors=4)
    neigh.fit(y)
    plt.clf()
    distances = np.sort(neigh.kneighbors()[0][:, 3])
    plt.plot(np.arange(distances.shape[0]), distances)
    eps_cutoff = distances[int(0.99 * distances.shape[0])]

    model = DBSCAN(eps=eps_cutoff, min_samples=min_number_of_samples)
    model.fit(y)
    joblib.dump(model, 'dbscan_model.pkl')
    return
def most_probable_structure_in_cluster(frame_indices, frames, pdb, cluster_id, type_of_cluster, dcd_pkl_filename, jump=1):
    """Locate the most probable point of a cluster and export its structure.

    For clusters with more than three members a kernel density estimate is
    fitted (bandwidth chosen by grid search over ``np.logspace(-1, 0, 20)``)
    and the mean of 10000 samples drawn from it is used; smaller clusters
    use the plain mean of their members.  The member closest to that point,
    as chosen by ``get_closest_structure_index``, is taken from the strided
    trajectory and handed to ``write_pdb_file``.  One "mol addfile" line per
    member is printed along the way.

    Args:
        frame_indices: indices into ``frames`` of the cluster members.
        frames: array of points, indexed by ``frame_indices``.
        pdb: forwarded to ``write_pdb_file``.
        cluster_id: forwarded to ``write_pdb_file``.
        type_of_cluster: forwarded to ``write_pdb_file``.
        dcd_pkl_filename: trajectory pickle; its first three characters are
            also used as the dcd file prefix in the printed lines.
        jump: stride between trajectory frames and rows of ``frames``.

    Returns:
        The most probable point of the cluster.
    """
    members = frames[frame_indices]
    if members.shape[0] > 3:
        # Grid-search cross-validation picks the KDE bandwidth.
        bandwidth_grid = {'bandwidth': np.logspace(-1, 0, 20)}
        search = GridSearchCV(KernelDensity(), bandwidth_grid)
        search.fit(members)
        # The best estimator provides the density that is sampled.
        best_kde = search.best_estimator_
        samples = best_kde.sample(n_samples=10000, random_state=20)
        mean_point = np.mean(samples, axis=0)
    else:
        mean_point = np.mean(members, axis=0)

    closest_structure_index = get_closest_structure_index(
        frame_indices, frames, mean_point)

    dcd_prefix = dcd_pkl_filename[:3]
    for member in frame_indices:
        source_frame = str(member * jump)
        print("mol addfile " + dcd_prefix + ".dcd first " + source_frame
              + " last " + source_frame + " waitfor all")
    print("")

    trajectory = preprocessing.load_residues(dcd_pkl_filename)[::jump]
    write_pdb_file(trajectory[closest_structure_index], pdb, cluster_id,
                   type_of_cluster)
    return mean_point
def find_cluster_centres(jump=18, pdb_file="", dcd_pkl_filename="", load=False):
    """Find the representative point and frame of every DBSCAN cluster.

    Args:
        jump: stride between trajectory frames and the samples DBSCAN was
            fitted on.
        pdb_file: forwarded to ``misc.most_probable_structure_in_cluster``.
        dcd_pkl_filename: forwarded likewise; its first three characters
            are also used as the dcd file prefix in the printed lines.
        load: when falsy, the representative points are computed from the
            core samples of each cluster and dumped to
            "tsne_cluster_representative_point.pkl"; otherwise they are
            loaded from that file.

    Returns:
        Tuple ``(cluster_representative_point,
        cluster_representative_index)``, both keyed by cluster label.  The
        second maps each label to the result of
        ``misc.get_closest_structure_index`` over all members of the
        cluster.
    """
    dbscan = joblib.load('dbscan_model.pkl')
    # Very important: t-SNE skipped frames according to ``jump``, so the
    # DBSCAN sample indices refer to the strided data.
    X = preprocessing.load_residues('reduced_dimensions.pkl')[::jump]

    if load:
        cluster_representative_point = joblib.load(
            "tsne_cluster_representative_point.pkl")
    else:
        # Group the core samples by label, recording each one by its sample
        # index.  These indices are later used to index ``X`` and the
        # strided trajectory, so the position inside the core-sample array
        # must not be used here.
        cluster_frames_dict = {}
        for sample_index in dbscan.core_sample_indices_:
            label = dbscan.labels_[sample_index]
            cluster_frames_dict.setdefault(label, []).append(int(sample_index))

        cluster_representative_point = {}
        for key, members in cluster_frames_dict.items():
            cluster_representative_point[key] = (
                misc.most_probable_structure_in_cluster(
                    members, X, pdb_file, key, "tsne", dcd_pkl_filename,
                    jump))
        joblib.dump(cluster_representative_point,
                    "tsne_cluster_representative_point.pkl")

    cluster_representative_index = {}
    for key in cluster_representative_point.keys():
        frame_indices = misc.find_indices_of_clusters(dbscan.labels_, key)
        print("mol new rna.psf")
        for x in frame_indices:
            print("mol addfile " + dcd_pkl_filename[:3]
                  + "-recenter-solute.dcd first " + str(x * jump)
                  + " last " + str(x * jump) + " waitfor all")
        print("")
        cluster_representative_index[key] = misc.get_closest_structure_index(
            np.array(frame_indices), X, cluster_representative_point[key])

    return cluster_representative_point, cluster_representative_index
def graph_based_method(dcd, pdb, filename, jump, min_number_of_samples):
    """Run the t-SNE / DBSCAN pipeline and write out the most probable path.

    Steps, in order: fit t-SNE, cluster the embedding, find the cluster
    centres, compute the log transition matrix from the stored mixture
    parameters (no retraining), search the most probable path between the
    start and end clusters, and write it with ``misc.write_dcd`` and
    ``misc.write_pdb`` under the "tsne" tag.

    Args:
        dcd: unused.
        pdb: forwarded to ``gbu.find_cluster_centres``.
        filename: trajectory pickle.
        jump: frame stride used throughout the pipeline.
        min_number_of_samples: forwarded to ``gbu.density_clustering``.

    Returns:
        None.
    """
    gbu.tsne(False, jump=jump)
    gbu.density_clustering(jump, min_number_of_samples)
    centres = gbu.find_cluster_centres(jump, pdb, filename, False)
    cluster_representative_point, cluster_representative_index = centres

    log_transition_matrix = gbu.get_transition_probabilities(
        cluster_representative_point,
        cluster_representative_index,
        temp=411,
        jump=jump,
        iterations=1000,
        train_model=False)

    labels = joblib.load('dbscan_model.pkl').labels_
    first_label = labels[0]
    # The result is not used below; the call is kept as in the original.
    sequence = gbu.get_most_probable_path(labels)

    start, end = misc.get_cluster_ids_for_start_and_end(
        first_label, set(labels), "tsne")
    # The path search gets its own copy of the matrix.
    path = mpp.get_most_probable_path_in_markov_chain(
        copy.deepcopy(log_transition_matrix), start, end, is_log=True)

    dcd_array = pp.load_residues(filename)
    for writer in (misc.write_dcd, misc.write_pdb):
        writer(dcd_array, path, "tsne")

    print("The most reactive path is saved in your current working directory as 'tsne_unfolded_traj.dcd'")
    return
def dynamic_cluster_trajectory(meta_stability_criteria=0.9, pdb_file="", dcd_pkl_filename=""):
    """Merge k-means micro-clusters into dynamic clusters until stable.

    Starting from the per-frame k-means labels (read from the
    "kmeans_trajectory_clustering" shelf, which is filled from "KMEANS.pkl"
    first when empty), the row-normalised transition matrix is built and the
    clusters are merged through ``dfs_markov`` and ``ds.disjoint``.  This
    repeats until the number of clusters no longer changes.  A core
    correction is then applied: a frame farther from its cluster's
    representative point than the cluster's core radius takes the cluster of
    the previous frame.  Every intermediate and the final assignment is
    written to the "dynamic_clustering" shelf.

    Args:
        meta_stability_criteria: forwarded to ``dfs_markov``.
        pdb_file: forwarded to ``core_clusters``.
        dcd_pkl_filename: forwarded to ``core_clusters``.

    Returns:
        Tuple ``(sequence, transition_matrix)``.  ``sequence`` is the result
        of ``get_most_probable_path`` on the corrected assignment.
        ``transition_matrix`` is the one computed in the last iteration,
        i.e. before the core correction.
    """
    model = joblib.load("KMEANS.pkl")
    X = preprocessing.load_residues('reduced_dimensions.pkl')
    n_frames = X.shape[0]

    # This shelf stores the k-means cluster id of each trajectory frame.
    mean_cluster_ids = shelve.open("kmeans_trajectory_clustering")
    try:
        if len(mean_cluster_ids) == 0:
            # One batched prediction instead of one call per frame.
            for frame, label in enumerate(model.predict(X)):
                mean_cluster_ids[str(frame)] = int(label)
        d = dict(mean_cluster_ids)
    finally:
        mean_cluster_ids.close()
    number_of_current_clusters = model.cluster_centers_.shape[0]

    while True:
        transition_matrix, cluster_probability = _row_normalised_transitions(
            d, n_frames)

        # Fresh sets are handed to the helpers, as in the original, in case
        # they modify their argument.
        dynamic_clusters = ds.disjoint(set(d.values()))
        visited = {}
        for cluster in set(d.values()):
            visited = copy.deepcopy(dfs_markov(
                cluster, set(d.values()), transition_matrix,
                dynamic_clusters, visited, meta_stability_criteria))
        dynamic_clusters.compress()
        dynamic_clusters.save_structure()
        cluster_ids = dynamic_clusters.get_clusters(cluster_probability)
        merged_count = len(set(cluster_ids.values()))

        if merged_count != number_of_current_clusters:
            # Not stable yet: relabel every frame and iterate again.
            number_of_current_clusters = merged_count
            d = dict((str(frame), cluster_ids[int(d[str(frame)])])
                     for frame in range(n_frames))
            _save_dynamic_clustering(d)
            continue

        # Stable: apply the cluster core correction.
        core_radius, core_structure = core_clusters(
            d, pdb_file, dcd_pkl_filename)
        corrected = {"0": d["0"]}
        for index in range(1, n_frames):
            key = str(index)
            cluster = d[key]
            if misc.distance(X[index], core_structure[cluster]) > core_radius[cluster]:
                corrected[key] = corrected[str(index - 1)]
            else:
                corrected[key] = cluster
        d = corrected
        _save_dynamic_clustering(d)

        sequence = get_most_probable_path(d)
        return sequence, transition_matrix


def _row_normalised_transitions(assignments, n_frames):
    """Return (row-normalised transition matrix, row totals) for a labelling."""
    cluster_ids = set(assignments.values())

    transition_matrix = ds.Autovivification()
    for source in cluster_ids:
        for target in cluster_ids:
            transition_matrix[source][target] = 0
    for frame in range(n_frames - 1):
        source = int(assignments[str(frame)])
        target = int(assignments[str(frame + 1)])
        transition_matrix[source][target] += 1

    row_totals = {}
    for source in cluster_ids:
        total = sum(transition_matrix[source][target]
                    for target in cluster_ids)
        row_totals[source] = total
        if total == 0:
            # A cluster seen only on the last frame has no outgoing
            # transition; its row stays zero instead of dividing by zero.
            continue
        total = float(total)
        for target in cluster_ids:
            transition_matrix[source][target] /= total
    return transition_matrix, row_totals


def _save_dynamic_clustering(assignments):
    """Replace the contents of the "dynamic_clustering" shelf."""
    shelf = shelve.open("dynamic_clustering")
    try:
        shelf.clear()
        for key in assignments:
            shelf[key] = assignments[key]
    finally:
        shelf.close()
def get_transition_probabilities(cluster_representative_point, cluster_representative_index, temp, jump, iterations, train_model=True):
    """Compute the log transition matrix between DBSCAN clusters.

    ``result[i][j]`` is the log of the summed density, under mixture
    component ``i``, of all members of cluster ``j`` (a log-sum-exp over the
    members).  The mixture parameters are read from "EM_params.pkl".  With
    ``train_model`` truthy they are first re-estimated by
    ``em.Expectation_Maximization`` and that file is rewritten.

    Args:
        cluster_representative_point: mapping from cluster label to the
            representative point used as the initial mean of its component.
        cluster_representative_index: unused.
        temp: temperature used for ``beta = 1 / (0.0019872041 * temp)``.
            NOTE(review): the constant matches the gas constant in
            kcal/(mol K), so ``temp`` is presumably in kelvin - confirm.
        jump: stride relating the clustered samples to the rows of
            'reduced_dimensions.pkl' and of the energy matrix.
        iterations: forwarded to ``em.Expectation_Maximization``.
        train_model: whether to re-estimate the mixture parameters.

    Returns:
        Nested mapping ``log_transition_matrix[i][j]``.
    """
    data_points = preprocessing.load_residues('reduced_dimensions.pkl')
    energies = np.sum(
        preprocessing.create_energy_matrix(get_from_file=1), axis=1)
    dbscan = joblib.load('dbscan_model.pkl')
    beta_val = 1.0 / (0.0019872041 * temp)

    # DBSCAN sample indices refer to every jump-th frame.
    strided_points = data_points[::jump]
    strided_energies = energies[::jump]

    # Members of each cluster, looked up once instead of once per (i, j).
    members = {}
    for key in cluster_representative_point.keys():
        members[key] = np.array(
            misc.find_indices_of_clusters(dbscan.labels_, key))

    # EM to estimate the parameters of the mixture model.
    if train_model:
        init_cov = np.cov(data_points, rowvar=False)
        # Components are indexed by cluster label further down, so they are
        # created in label order rather than in dict iteration order.
        component_keys = sorted(cluster_representative_point.keys())
        cov = []
        coefs = []
        means = []
        energies_cluster = []
        for key in component_keys:
            # Grid search picks the bandwidth of the KDE over the energies
            # of the cluster members.  The strided energies are used because
            # the member indices refer to the strided data.
            params = {'bandwidth': np.logspace(-1, 0, 20)}
            grid = GridSearchCV(KernelDensity(), params)
            grid.fit(strided_energies[members[key]].reshape(-1, 1))
            kde = grid.best_estimator_
            sampling_points = kde.sample(n_samples=10000, random_state=20)

            cov.append(init_cov)
            means.append(cluster_representative_point[key])
            energies_cluster.append(float(np.mean(sampling_points)))
            coefs.append(1.0 / len(component_keys))

        param_grid = {"means": means,
                      "data": data_points,
                      "beta": beta_val,
                      "covariances": cov,
                      "energy_data": energies,
                      "energy_cluster": energies_cluster,
                      "coef": coefs}
        model = em.Expectation_Maximization(
            param_grid, threshold=1e-4, reg_covar=1e-6, iterations=iterations)
        model.fit()
        model.get_params()  # saves the parameters in a shelve file

        shelf = shelve.open("EM_params_with_full_covariance")
        try:
            trained_params = dict(shelf)
        finally:
            shelf.close()
        joblib.dump(trained_params, "EM_params.pkl")

    mixture_params = joblib.load("EM_params.pkl")

    log_transition_matrix = aux.Autovivification()  # T[i][j]: from i to j
    for i in cluster_representative_point.keys():
        covar_matrix = mixture_params["variances"][i]
        prec_chol, log_det = prob_dis.compute_precisions_chol(
            np.array([covar_matrix]))
        for j in cluster_representative_point.keys():
            log_probabilites = prob_dis.log_pdf(
                strided_points[members[j]], mixture_params["means"][i],
                covar_matrix, mixture_params["beta"],
                strided_energies[members[j]],
                mixture_params["energy_cluster"][i], prec_chol[0], log_det[0])
            # log-sum-exp, shifted by the maximum for numerical stability.
            max_val = np.amax(log_probabilites)
            log_transition_matrix[i][j] = (
                np.log(np.sum(np.exp(log_probabilites - max_val))) + max_val)

    return log_transition_matrix
def get_metastable_states(filename):
    """Write the path returned by ``gbu.map_PES`` as a "tsne" trajectory.

    Args:
        filename: trajectory pickle passed to ``pp.load_residues``.

    Returns:
        None.
    """
    pes_path = gbu.map_PES()
    trajectory = pp.load_residues(filename)
    misc.write_dcd(trajectory, pes_path, "tsne")