def transform_protein_tica(yaml_file):
    mdl_dir = yaml_file["mdl_dir"]
    tica_obj_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    protein_tica_mdl = verboseload(tica_obj_path)
    for protein in yaml_file["protein_list"]:
        print("Transforming protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            if os.path.exists("./normalized_features"):
                featurized_traj = sorted(
                    glob.glob("./normalized_features/*.jl"), key=keynat)
            else:
                print('Warning: features have not been scaled')
                featurized_traj = sorted(
                    glob.glob("./%s/*.jl" % yaml_file["feature_dir"]),
                    key=keynat)
            tica_data = {}
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    tica_data[os.path.basename(f)] = \
                        protein_tica_mdl.partial_transform(featurized_path)
                except Exception:
                    pass
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(tica_data, 'tica_data.pkl')
                print("Done transforming protein %s" % protein)
    # dump the tica_mdl again since the eigenspectrum might have been calculated
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
def featurize_project(proj_folder, top_folder, featurizer_object, stride, view):
    # if already featurized, don't bother (should add a warning about this)
    if os.path.exists(proj_folder + "/featurized_traj.pkl"):
        return verboseload(proj_folder + "/featurized_traj.pkl")

    if featurizer_object is None:
        featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
    else:
        try:
            featurizer = verboseload(featurizer_object)
        except Exception:
            sys.exit("Can't load featurizer using msmbuilder verboseload")

    feature_dict = {}
    traj_list = glob.glob(proj_folder + "/trajectories/*.dcd")
    jobs = [(proj_folder, top_folder, featurizer, traj, stride)
            for traj in traj_list]
    results = view.map_sync(featurize_traj, jobs)
    for result in results:
        feature_dict[result[0]] = result[1]

    verbosedump(feature_dict, proj_folder + "/featurized_traj.pkl")
    return feature_dict
def load_current_protein_model(yaml_file, protein, sanity=True):
    """
    :param yaml_file: The yaml file (or path to it) for the project
    :param protein: Protein for which to load the model
    :param sanity: Whether or not to run sanity tests
    :return: base_dir, mdl_dir, msm_mdl, tica_mdl, tica_data, kmeans_mdl,
    fixed_assignments for the model currently stored in mdl_dir and
    mdl_dir/protein
    """
    yaml_file = load_yaml_file(yaml_file)
    base_dir = yaml_file["base_dir"]
    mdl_dir = yaml_file["mdl_dir"]
    prot_mdl_dir = os.path.join(mdl_dir, protein)

    # load the project-level information first
    kmeans_mdl = verboseload(os.path.join(mdl_dir, "kmeans_mdl.pkl"))
    tica_mdl = verboseload(os.path.join(mdl_dir, "tica_mdl.pkl"))

    # now load the protein-level information
    tica_data = verboseload(os.path.join(prot_mdl_dir, "tica_data.pkl"))
    # need the fixed assignments because otherwise we will have issues
    assignments = verboseload(os.path.join(
        prot_mdl_dir, "fixed_assignments.pkl"))
    msm_mdl = verboseload(os.path.join(prot_mdl_dir, "msm_mdl.pkl"))

    # some sanity tests
    if sanity:
        _sanity_test(base_dir, protein, msm_mdl, tica_data,
                     kmeans_mdl, assignments)
    return base_dir, mdl_dir, msm_mdl, tica_mdl, tica_data, \
        kmeans_mdl, assignments
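# A minimal usage sketch for load_current_protein_model. The yaml path and
# protein name below ("project.yaml", "kinase_1") are placeholders for
# illustration, not values assumed by the function itself.
def example_load_current_protein_model():
    (base_dir, mdl_dir, msm_mdl, tica_mdl, tica_data,
     kmeans_mdl, assignments) = load_current_protein_model(
        "project.yaml", "kinase_1", sanity=True)
    print("MSM with %d states" % msm_mdl.n_states_)
    print("tICA data for %d trajectories" % len(tica_data))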
def transform_protein_tica(yaml_file):
    mdl_dir = yaml_file["mdl_dir"]
    tica_obj_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    protein_tica_mdl = verboseload(tica_obj_path)
    for protein in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, protein):
            print("Transforming protein %s" % protein)
            featurized_traj = sorted(
                glob.glob("./%s/*.jl" % yaml_file["feature_dir"]), key=keynat)
            tica_data = {}
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    tica_data[os.path.basename(f)] = \
                        protein_tica_mdl.partial_transform(featurized_path)
                except Exception:
                    pass
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(tica_data, 'tica_data.pkl')
                print("Done transforming protein %s" % protein)
    # dump the tica_mdl again since the eigenspectrum might have been calculated
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
def transform_protein_pca(yaml_file):
    mdl_dir = yaml_file["mdl_dir"]
    pca_obj_path = os.path.join(mdl_dir, "pca_mdl.pkl")
    protein_pca_mdl = verboseload(pca_obj_path)
    for protein in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, protein):
            print("Transforming protein %s" % protein)
            featurized_traj = sorted(
                glob.glob("./%s/*.jl" % yaml_file["feature_dir"]), key=keynat)
            pca_data = {}
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    pca_data[os.path.basename(f)] = \
                        protein_pca_mdl.partial_transform(featurized_path)
                except Exception:
                    print("Error transforming %s" % f)
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(pca_data, 'pca_data.pkl')
                print("Done transforming protein %s" % protein)
    # dump the pca_mdl again since the eigenspectrum might have been calculated
    pca_mdl_path = os.path.join(mdl_dir, "pca_mdl.pkl")
    verbosedump(protein_pca_mdl, pca_mdl_path)
    return
def fit_protein_kmeans(yaml_file, mini=True, pca=False):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params = {}
    for i in mdl_params.keys():
        if i.startswith("cluster__"):
            current_mdl_params[i.split("cluster__")[1]] = mdl_params[i]

    if mini:
        current_mdl_params["batch_size"] = 100 * current_mdl_params["n_clusters"]
        kmeans_mdl = MiniBatchKMeans(**current_mdl_params)
    else:
        kmeans_mdl = KMeans(**current_mdl_params)

    data = []
    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            if pca:
                tica_data = verboseload("pca_data.pkl")
            else:
                tica_data = verboseload("tica_data.pkl")
            # get all trajectories in a fixed order
            sorted_list = sorted(tica_data.keys(), key=keynat)
            data.extend([tica_data[i] for i in sorted_list])

    kmeans_mdl.fit(data)
    kmeans_mdl_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    verbosedump(kmeans_mdl, kmeans_mdl_path)
    return
def cos_to_means(clusterer_dir, features_dir):
    clusterer = verboseload(clusterer_dir)
    clusters_map = make_clusters_map(clusterer)
    features = verboseload(features_dir)

    feature_distances = {}
    for i in range(len(clusters_map)):
        indices = clusters_map[i]
        k_mean = clusterer.cluster_centers_[i]
        print(k_mean)
        find_cos_partial = partial(find_cos, k_mean=k_mean, features=features)
        feature_distances[i] = list(map(find_cos_partial, indices))

    print(feature_distances[0][0:10])
    sorted_map = {}
    print(list(feature_distances.keys()))
    print(len(feature_distances))
    for i in range(len(feature_distances)):
        sorted_map[i] = sorted(feature_distances[i],
                               key=lambda x: x[2], reverse=True)

    print(sorted_map[0][0:10])
    return sorted_map
def cluster_minikmeans(tica_dir, data_dir, traj_dir, n_clusters,
                       clusterer_dir=None, tICs=None):
    # guard against the default clusterer_dir=None, which would make
    # os.path.exists raise a TypeError
    if clusterer_dir is not None and os.path.exists(clusterer_dir):
        reduced_data = load_file(data_dir)
        clusterer = verboseload(clusterer_dir)
        clusterer.labels_ = clusterer.transform(reduced_data)
        verbosedump(clusterer, clusterer_dir)
    else:
        print("Clustering by KMeans")
        try:
            reduced_data = verboseload(data_dir)
        except Exception:
            reduced_data = load_dataset(data_dir)

        if tICs is not None:
            X = [traj[:, tICs] for traj in reduced_data]
        else:
            X = reduced_data

        clusterer = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
        clusterer.fit_transform(X)
        verbosedump(clusterer, clusterer_dir)
def plot_tica_and_clusters(tica_dir, transformed_data_dir, clusterer_dir,
                           lag_time, component_i=0, component_j=1):
    transformed_data = verboseload(transformed_data_dir)
    clusterer = verboseload(clusterer_dir)

    trajs = np.concatenate(transformed_data)
    plt.hexbin(trajs[:, component_i], trajs[:, component_j],
               bins='log', mincnt=1)

    centers = clusterer.cluster_centers_
    for i in range(np.shape(centers)[0]):
        center = centers[i, :]
        # annotate at the plotted components (the original always used
        # center[0], center[1], which is wrong for component_i/j != 0, 1)
        plt.annotate('%d' % i,
                     xy=(center[component_i], center[component_j]),
                     xytext=(center[component_i], center[component_j]),
                     size=6)

    pp = PdfPages("%s/c%d_c%d_clusters%d.pdf"
                  % (tica_dir, component_i, component_j,
                     np.shape(centers)[0]))
    pp.savefig()
    pp.close()
def fit_and_transform(features_directory, model_dir, stride=5,
                      lag_time=10, n_components=5):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    projected_data_filename = "%s/phi_psi_chi2_allprot_projected.h5" % model_dir
    fit_model_filename = "%s/phi_psi_chi2_allprot_tica_coords.h5" % model_dir
    #active_pdb_file = "/scratch/users/enf/b2ar_analysis/renamed_topologies/A-00.pdb"

    tica_model = tICA(n_components=n_components, lag_time=lag_time)

    if not os.path.exists(projected_data_filename):
        print("loading feature files")
        feature_files = get_trajectory_files(features_directory, ext=".h5")
        pool = mp.Pool(mp.cpu_count())
        features = pool.map(load_features, feature_files)
        pool.terminate()
        if not os.path.exists(fit_model_filename):
            print("fitting data to tICA model")
            fit_model = tica_model.fit(features)
            verbosedump(fit_model, fit_model_filename)
            transformed_data = fit_model.transform(features)
            verbosedump(transformed_data, projected_data_filename)
        else:
            print("loading tICA model")
            fit_model = verboseload(fit_model_filename)
            print("transforming")
            transformed_data = fit_model.transform(features)
            verbosedump(transformed_data, projected_data_filename)
    else:
        fit_model = verboseload(fit_model_filename)
        transformed_data = verboseload(projected_data_filename)

    print(fit_model.summarize())
def dist_to_means(clusterer_dir, features_dir):
    clusterer = verboseload(clusterer_dir)
    clusters_map = make_clusters_map(clusterer)
    features = verboseload(features_dir)
    feature_distances = {}

    def find_cos(index, k_mean):
        traj = index[0]
        frame = index[1]
        conformation = features[traj][frame]
        a = conformation
        b = k_mean
        return (traj, frame,
                np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    for i in range(len(clusters_map)):
        indices = clusters_map[i]
        k_mean = clusterer.cluster_centers_[i]
        print(k_mean)
        find_cos_partial = partial(find_cos, k_mean=k_mean)
        feature_distances[i] = list(map(find_cos_partial, indices))

    print(feature_distances[0][0:10])
    sorted_map = {}
    print(list(feature_distances.keys()))
    print(len(feature_distances))
    for i in range(len(feature_distances)):
        sorted_map[i] = sorted(feature_distances[i],
                               key=lambda x: x[2], reverse=True)

    print(sorted_map[0][0:10])
    return sorted_map
def landmark_ktica(features_dir, combined_features_file=None,
                   feature_ext=".dataset", use_clusters_as_landmarks=True,
                   clusters_map_file="", landmarks_dir="",
                   nystroem_components=1000, n_components=10, lag_time=5,
                   nystroem_data_filename="", fit_model_filename="",
                   projected_data_filename="", landmark_subsample=10,
                   sparse=False, shrinkage=0.05, wolf=False, rho=0.01):
    '''
    features_dir: directory where your featurized trajectories are kept.
    combined_features_file: if you have a single file containing all
        featurized trajectories (i.e. a list of np arrays), this is it.
    feature_ext: if the features are instead in separate files, the
        extension of those feature files.
    use_clusters_as_landmarks: for a composition of
        tICA --> clustering --> Nystroem --> tICA. If True, you need to
        feed it a json file containing a dictionary that maps cluster
        name --> list of 2-tuples, each (trajectory_id, frame_number).
        This way, instead of choosing landmark points at random for the
        Nystroem approximation, you use regular linear tICA-driven
        clustering to choose landmark points more efficiently.
    landmarks_dir: where the landmarks will be saved; a file containing
        a list of 1d np arrays or a 2d array.
    nystroem_components: the number of landmarks to use.
    n_components: the number of ktICA components to compute.
    lag_time: lag time of tICA.
    nystroem_data_filename: where the Nystroem object will be saved.
    fit_model_filename: filename of the ktICA object to save.
    projected_data_filename: where the kernel-tICA-projected features
        will be saved.
    landmark_subsample: how frequently to subsample the landmarks when
        using use_clusters_as_landmarks.
    sparse: keep this as False.
    shrinkage: same as gamma in the old version of tICA; worth tuning.
    wolf: defaults to False; set to True only if you're using Robert's
        branch of msmbuilder.
    rho: ignore this.
    '''
    # ktica below is assumed to reload from the saved nystroem data when
    # it already exists, in which case features/landmarks stay None
    features = None
    landmarks = None
    if not os.path.exists(nystroem_data_filename):
        if combined_features_file is not None:
            features = verboseload(combined_features_file)
        else:
            features = load_file_list(
                get_trajectory_files(features_dir, ext=feature_ext))

        if os.path.exists(landmarks_dir):
            landmarks = verboseload(landmarks_dir)
            print(np.shape(landmarks))
        else:
            if use_clusters_as_landmarks:
                with open(clusters_map_file) as f:
                    clusters_map = json.load(f)
                clusters_map = {int(k): v for k, v in clusters_map.items()}
                landmarks = []
                for cluster_id, sample_list in clusters_map.items():
                    for sample in sample_list:
                        traj = sample[0]
                        frame = sample[1]
                        landmark = features[traj][frame]
                        landmarks.append(landmark)
                landmarks = [landmarks[i]
                             for i in range(np.shape(landmarks)[0])
                             if i % landmark_subsample == 0]
                verbosedump(landmarks, landmarks_dir)
            else:
                n = np.shape(features)[0]
                indices = np.random.choice(n, nystroem_components)
                features_concatenated = np.concatenate(features)
                landmarks = features_concatenated[indices, :]
                verbosedump(landmarks, landmarks_dir)

    ktica(features, landmarks, projected_data_filename,
          nystroem_data_filename, fit_model_filename,
          sparse, shrinkage, wolf, rho)
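# Hedged usage sketch for landmark_ktica. Every path below is a
# hypothetical placeholder; it assumes a previous tICA -> clustering run
# wrote clusters_map.json mapping cluster id -> (traj, frame) pairs, which
# is then used to pick the Nystroem landmarks.
def example_landmark_ktica():
    landmark_ktica(
        "features/",
        use_clusters_as_landmarks=True,
        clusters_map_file="clusters_map.json",
        landmarks_dir="landmarks.pkl",
        nystroem_components=1000,
        n_components=10,
        lag_time=5,
        nystroem_data_filename="nystroem_data.pkl",
        fit_model_filename="ktica_mdl.pkl",
        projected_data_filename="ktica_projected.pkl")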
def dist_to_means(clusterer_dir, features_dir, n_samples=False,
                  n_components=False, tica_coords_csv=False,
                  kmeans_csv=False):
    clusterer = verboseload(clusterer_dir)
    clusters_map = make_clusters_map(clusterer)
    try:
        features = verboseload(features_dir)
    except Exception:
        features = load_dataset(features_dir)

    feature_distances = {}
    for i in range(len(clusters_map)):
        indices = clusters_map[i]
        k_mean = clusterer.cluster_centers_[i]
        print(k_mean)
        find_dist_partial = partial(find_dist, k_mean=k_mean,
                                    features=features)
        feature_distances[i] = list(map(find_dist_partial, indices))

    print(feature_distances[0][0:10])
    sorted_map = {}
    print(list(feature_distances.keys()))
    print(len(feature_distances))
    for i in range(len(feature_distances)):
        sorted_map[i] = sorted(feature_distances[i],
                               key=lambda x: x[2], reverse=False)

    if n_samples is not False and n_components is not False \
            and tica_coords_csv is not False:
        tica_coords_map = {}
        for cluster_id in sorted_map.keys():
            for j in range(0, n_samples):
                sample = "cluster%d_sample%d" % (cluster_id, j)
                sample_tuple = sorted_map[cluster_id][j][0:2]
                sample_coords = features[sample_tuple[0]][sample_tuple[1]]
                tica_coords_map[sample] = sample_coords
        titles = ["sample"]
        for k in range(0, n_components):
            titles.append("component_%d" % k)
        print(list(tica_coords_map.keys())[0])
        print(tica_coords_map[list(tica_coords_map.keys())[0]])
        write_map_to_csv(tica_coords_csv, tica_coords_map, titles)

    if kmeans_csv is not False:
        kmeans_map = {}
        for cluster in range(0, clusterer.n_clusters):
            k_mean = clusterer.cluster_centers_[cluster]
            cluster_id = "cluster%d" % cluster
            kmeans_map[cluster_id] = k_mean
        titles = ["cluster"]
        for k in range(0, n_components):
            titles.append("component_%d" % k)
        write_map_to_csv(kmeans_csv, kmeans_map, titles)

    print(sorted_map[0][0:10])
    return sorted_map
def plot_all_tics_and_clusters(tica_dir, transformed_data_dir, clusterer_dir,
                               lag_time, tic_range=None, main="", label="dot",
                               active_cluster_ids=[],
                               intermediate_cluster_ids=[],
                               inactive_cluster_ids=[],
                               inactive_subsample=5, intermediate_subsample=5,
                               custom_cluster_centers=None, concatenate=True,
                               axes=None):
    try:
        transformed_data = verboseload(transformed_data_dir)
    except Exception:
        transformed_data = load_dataset(transformed_data_dir)

    if custom_cluster_centers is None:
        clusterer = verboseload(clusterer_dir)
        centers = clusterer.cluster_centers_
    else:
        # the original never assigned centers in this case
        centers = custom_cluster_centers

    if not concatenate:
        num_tics = np.shape(transformed_data)[1]
    else:
        num_tics = np.shape(transformed_data[0])[1]
    if tic_range is None:
        tic_range = range(0, num_tics)

    for i in tic_range:
        js = [j for j in tic_range if j > i]
        plot_partial = partial(
            plot_tica_and_clusters, n_clusters=len(centers),
            tica_dir=tica_dir, main=main, transformed_data=transformed_data,
            lag_time=lag_time, label=label,
            active_cluster_ids=active_cluster_ids,
            intermediate_cluster_ids=intermediate_cluster_ids,
            inactive_cluster_ids=inactive_cluster_ids,
            inactive_subsample=inactive_subsample,
            intermediate_subsample=intermediate_subsample,
            component_i=i, centers=centers, concatenate=concatenate,
            axes=axes)
        pool = mp.Pool(mp.cpu_count())
        pool.map(plot_partial, js)
        pool.terminate()

    print("Printed all tICA coords and all requested clusters")
def plot_col(transformed_data_file, figure_directory, colors_file):
    transformed_data = verboseload(transformed_data_file)
    trajs = np.concatenate(transformed_data)
    colors = np.concatenate(verboseload(colors_file))

    sc = plt.scatter(trajs[:, 0], trajs[:, 1], c=colors, s=50,
                     cmap=mpl.cm.RdYlBu_r)
    plt.colorbar(sc)
    plt.show()

    pp = PdfPages(figure_directory)
    pp.savefig()
    pp.close()
    return
def landmark_ktica_ticaTraj(tica_dir, clusterer_dir, ktica_dir,
                            clusters_map_file="", landmarks_dir="",
                            nystroem_components=1000, n_components=10,
                            lag_time=5, nystroem_data_filename="",
                            fit_model_filename="", projected_data_filename="",
                            landmark_subsample=1, sparse=False, wolf=True,
                            rho=0.01, shrinkage=None):
    if not os.path.exists(ktica_dir):
        os.makedirs(ktica_dir)

    if not sparse:
        if shrinkage is None:
            tica_model = tICA(n_components=n_components, lag_time=lag_time)
        else:
            tica_model = tICA(n_components=n_components, lag_time=lag_time,
                              shrinkage=shrinkage)
    else:
        if shrinkage is None:
            tica_model = SparseTICA(n_components=n_components,
                                    lag_time=lag_time, rho=rho)
        else:
            tica_model = SparseTICA(n_components=n_components,
                                    lag_time=lag_time, rho=rho,
                                    shrinkage=shrinkage)

    if not os.path.exists(nystroem_data_filename):
        clusterer = verboseload(clusterer_dir)
        tica = verboseload(tica_dir)
        features = tica
        clusters = clusterer.cluster_centers_
        landmarks = clusters

        print("here's what goes into the combined class:")
        print(np.shape(landmarks))
        print(type(landmarks))
        nys = Nystroem(n_components=np.shape(landmarks)[0], basis=landmarks)
        nyx = nys.fit_transform(features)
        del features
        del landmarks

        try:
            save_dataset(nyx, nystroem_data_filename)
        except Exception:
            os.system("rm -rf %s" % nystroem_data_filename)
            save_dataset(nyx, nystroem_data_filename)
    else:
        nyx = load_dataset(nystroem_data_filename)

    print(np.shape(nyx))
    print(dir(nyx))

    if not os.path.exists(projected_data_filename):
        fit_model = tica_model.fit(nyx)
        verbosedump(fit_model, fit_model_filename)
        transformed_data = fit_model.transform(nyx)
        del nyx
        try:
            save_dataset(transformed_data, projected_data_filename)
        except Exception:
            os.system("rm -rf %s" % projected_data_filename)
            save_dataset(transformed_data, projected_data_filename)
    else:
        print("Already performed landmark kernel tICA.")
def __init__(self, yaml_file, relative_loc=None):
    self.yaml_file = load_yaml_file(yaml_file)
    self.base_dir = self.yaml_file["base_dir"]
    self.mdl_dir = self.yaml_file["mdl_dir"]

    if relative_loc is None:
        self.relative_loc = self.mdl_dir
    else:
        self.relative_loc = os.path.join(relative_loc,
                                         os.path.split(self.mdl_dir)[1])
    self.kmeans_mdl = verboseload(
        os.path.join(self.relative_loc, "kmeans_mdl.pkl"))
    self.tica_mdl = verboseload(
        os.path.join(self.relative_loc, "tica_mdl.pkl"))
def _test_protein_with_project(prj):
    p1 = Protein(prj, "kinase_1")
    p2 = Protein(prj, "kinase_2")
    assert isinstance(p1, Protein)
    assert isinstance(p1.msm, MarkovStateModel)
    assert (p1.msm.left_eigenvectors_ ==
            verboseload(os.path.join(prj.mdl_dir, "kinase_1", "msm_mdl.pkl"))
            .left_eigenvectors_).all()
    assert (p1.bootstrap_msm.mle_.left_eigenvectors_ ==
            verboseload(os.path.join(prj.mdl_dir, "kinase_1", "msm_mdl.pkl"))
            .left_eigenvectors_).all()
    assert (p2.msm.left_eigenvectors_ ==
            verboseload(os.path.join(prj.mdl_dir, "kinase_2", "msm_mdl.pkl"))
            .left_eigenvectors_).all()
    assert (p2.bootstrap_msm.mle_.left_eigenvectors_ ==
            verboseload(os.path.join(prj.mdl_dir, "kinase_2", "msm_mdl.pkl"))
            .left_eigenvectors_).all()
    return True
def _slice_file(job_tuple):
    inp_file, feature_ind, output_folder = job_tuple
    featurized_file = verboseload(inp_file)
    sliced_file = featurized_file[:, feature_ind]
    sliced_file_out = os.path.join(output_folder, os.path.basename(inp_file))
    verbosedump(sliced_file, sliced_file_out)
    return
def build_msm(clusterer_dir, lag_time):
    clusterer = verboseload(clusterer_dir)
    n_clusters = np.shape(clusterer.cluster_centers_)[0]
    labels = clusterer.labels_

    msm_modeler = MarkovStateModel(lag_time=lag_time)
    print("fitting msm to trajectories with %d clusters and lag_time %d"
          % (n_clusters, lag_time))
    msm_modeler.fit_transform(labels)
    verbosedump(msm_modeler,
                "/scratch/users/enf/b2ar_analysis/msm_model_%d_clusters_t%d"
                % (n_clusters, lag_time))
    print("fitted msm to trajectories with %d states" % msm_modeler.n_states_)

    transmat = msm_modeler.transmat_
    mapping = msm_modeler.mapping_

    # write a semicolon-delimited edge list; open in text mode since we
    # write strings (the original used "wb", which fails on Python 3)
    edges = open("/scratch/users/enf/b2ar_analysis/"
                 "msm_%d_clusters_t%d_edgelist.csv"
                 % (n_clusters, lag_time), "w")
    for i in range(0, msm_modeler.n_states_):
        if i == 0:
            # header row of mapped state labels
            for j in range(0, msm_modeler.n_states_):
                edges.write(";")
                edges.write("%d" % mapping[j])
            edges.write("\n")

        edges.write("%d" % mapping[i])
        for j in range(0, msm_modeler.n_states_):
            prob = transmat[i][j]
            edges.write(";")
            if prob > 0.000001:
                edges.write("%f" % prob)
            else:
                edges.write("0")
        edges.write("\n")
    edges.close()
def plot_tics_gmm_R(save_dir, data_file, gmm_dir, titles=None, tICA=False,
                    scale=1.0, refcoords_file=None):
    data = verboseload(data_file)
    data = np.concatenate(data)
    data[:, 0] *= scale

    if refcoords_file is not None:
        refcoords = load_file(refcoords_file)
    else:
        refcoords = None
    print(np.shape(refcoords))
    print(refcoords)

    gmm_means = []
    for j in range(0, np.shape(data)[1]):
        with gzip.open("%s/tIC%d_gmm.pkl.gz" % (gmm_dir, j)) as f:
            gmm = pickle.load(f)
        gmm_means.append(gmm.means_)

    num_columns = np.shape(data)[1]
    plot_column_pair_partial = partial(
        plot_column_pair, num_columns=num_columns, save_dir=save_dir,
        titles=titles, data=data, gmm_means=gmm_means, refcoords=refcoords)
    pool = mp.Pool(mp.cpu_count())
    pool.map(plot_column_pair_partial, range(0, num_columns))
    pool.terminate()

    print("Done plotting columns")
    return
def macrostate_pcca(msm_file, clusterer_file, n_macrostates, macrostate_dir):
    msm = verboseload(msm_file)
    clusterer = verboseload(clusterer_file)

    # lump microstates into macrostates with PCCA, using the passed-in
    # n_macrostates (the original hard-coded 10)
    pcca_object = lumping.PCCA(n_macrostates=n_macrostates)
    pcca_object.fit(sequences=clusterer.labels_)

    print(pcca_object)
    print(pcca_object.microstate_mapping_)
    verbosedump(pcca_object, macrostate_dir)
def test_dihedral_feat():
    print(base_dir)
    pool = Pool(6)
    yaml_file = load_yaml_file(os.path.join(base_dir, "mdl_dir",
                                            "project.yaml"))
    for prt in ["kinase_1", "kinase_2"]:
        print(prt)
        prj = yaml_file["project_dict"][prt][0]
        featurize_project_wrapper(yaml_file, prt, feat=None, stride=1,
                                  view=pool)

        feat = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
        flist = glob.glob(os.path.join(base_dir, prt,
                                       yaml_file["protein_dir"], "*.hdf5"))
        for i in np.random.choice(flist, 3):
            trj = mdt.load(i)
            my_feat = feat.partial_transform(trj)
            expected_fname = os.path.join(
                base_dir, prt, yaml_file["feature_dir"],
                os.path.splitext(os.path.basename(i))[0] + ".jl")
            calc_feat = verboseload(expected_fname)
            assert np.allclose(my_feat, calc_feat)

    return True
def fit_protein_pca(yaml_file):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params = {}
    for i in mdl_params.keys():
        if i.startswith("pca__"):
            current_mdl_params[i.split("pca__")[1]] = mdl_params[i]

    protein_pca_mdl = PCA(**current_mdl_params)
    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            featurized_traj = sorted(
                glob.glob("./%s/*.jl" % yaml_file["feature_dir"]), key=keynat)
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_pca_mdl.partial_fit(featurized_path)
                except Exception:
                    # skip trajectories that fail to fit
                    pass
            print("Done partial fitting to protein %s" % protein)

    # dump the pca_mdl
    pca_mdl_path = os.path.join(mdl_dir, "pca_mdl.pkl")
    verbosedump(protein_pca_mdl, pca_mdl_path)
    return
def fit_protein_tica(yaml_file, sparse=False):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params = {}
    for i in mdl_params.keys():
        if i.startswith("tica__"):
            current_mdl_params[i.split("tica__")[1]] = mdl_params[i]

    if sparse:
        protein_tica_mdl = SparseTICA(**current_mdl_params)
    else:
        protein_tica_mdl = tICA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            featurized_traj = sorted(
                glob.glob("./%s/*.jl" % yaml_file["feature_dir"]), key=keynat)
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_tica_mdl.partial_fit(featurized_path)
                except Exception:
                    # skip trajectories that fail to fit
                    pass
            print("Done partial fitting to protein %s" % protein)

    # dump the tica_mdl
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
def transform_protein_kmeans(yaml_file):
    mdl_dir = yaml_file["mdl_dir"]
    kmeans_mdl_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    kmeans_mdl = verboseload(kmeans_mdl_path)
    for protein in yaml_file["protein_list"]:
        print("Assigning protein %s" % protein)
        with enter_protein_mdl_dir(yaml_file, protein):
            tica_data = verboseload("tica_data.pkl")
            # do assignments
            assignments = {}
            for i in tica_data.keys():
                assignments[i] = kmeans_mdl.predict([tica_data[i]])[0]
            verbosedump(assignments, 'assignments.pkl')
        print("Done assigning %s" % protein)
    return
def cluster(data_dir, traj_dir, n_clusters):
    reduced_data = verboseload(data_dir)
    trajs = np.concatenate(reduced_data)
    plt.hexbin(trajs[:, 0], trajs[:, 1], bins='log', mincnt=1)

    clusterer = MiniBatchKMedoids(n_clusters=n_clusters)
    clusterer.fit_transform(reduced_data)

    centers = clusterer.cluster_centers_
    for i in range(np.shape(centers)[0]):
        center = centers[i, :]
        plt.scatter(center[0], center[1])
        plt.annotate('C%d' % i, xy=(center[0], center[1]),
                     xytext=(center[0] + 0.1, center[1] + 0.1),
                     arrowprops=dict(facecolor='black', shrink=0.05))

        location = clusterer.cluster_ids_[i, :]
        print(location)
        traj = get_trajectory_files(traj_dir)[location[0]]
        print("traj = %s" % traj)
        print("frame = %d" % location[1])
        conformation = md.load_frame(traj, location[1])
        conformation.save_pdb(
            "/scratch/users/enf/b2ar_analysis/cluster_%d.pdb" % i)

    plt.show()
def plot_timescales(clusterer_dir, n_clusters, tica_dir, main="",
                    lag_times=list(range(1, 50))):
    clusterer = verboseload(clusterer_dir)
    print(clusterer)
    sequences = clusterer.labels_

    n_timescales = 5
    msm_timescales = implied_timescales(
        sequences, lag_times, n_timescales=n_timescales,
        msm=MarkovStateModel(verbose=True, prior_counts=1e-5,
                             ergodic_cutoff='off'))
    print(msm_timescales)

    for i in range(n_timescales):
        plt.plot(lag_times, msm_timescales[:, i])
    plt.xlabel("Lag time (ns)")
    plt.ylabel("Implied timescales (ns)")
    plt.title(main)
    plt.semilogy()

    pp = PdfPages("%s/%s_n_clusters%d_implied_timescales.pdf"
                  % (tica_dir, main, n_clusters))
    pp.savefig()
    pp.close()
    plt.clf()
def test_map_tic_component():
    yaml_file = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(yaml_file)
    fit_pipeline(yaml_file["base_dir"])

    with enter_protein_data_dir(yaml_file, "kinase_1"):
        df = pd.DataFrame(verboseload(
            os.path.join(yaml_file["feature_dir"], "feature_descriptor.h5")))
        trj = mdt.load(os.path.join(yaml_file["protein_dir"],
                                    "fake_proj1_0_0.hdf5"))

    ser = ProteinSeries(yaml_file, base_dir)
    prt = Protein(ser, "kinase_1")

    tica_mdl = prt.tica_mdl
    tic_index = 0
    t_c = tica_mdl.components_[tic_index, :]
    a_i, r_i = _map_tic_component(t_c, df, trj)

    assert len(a_i[0]) == trj.n_atoms
    assert len(r_i[0]) == trj.n_residues

    # spot check residue 0
    df2 = pd.DataFrame([i[1] for i in df.iterrows() if 0 in i[1]["resids"]])
    r0_imp = np.sum(abs(t_c[df2.index]))
    assert r0_imp == r_i[0, 0]
def fit_protein_kmeans(yaml_file, mini=True):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params = {}
    for i in mdl_params.keys():
        if i.startswith("cluster__"):
            current_mdl_params[i.split("cluster__")[1]] = mdl_params[i]

    if mini:
        current_mdl_params["batch_size"] = 100 * current_mdl_params["n_clusters"]
        kmeans_mdl = MiniBatchKMeans(**current_mdl_params)
    else:
        kmeans_mdl = KMeans(**current_mdl_params)

    data = []
    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            tica_data = verboseload("tica_data.pkl")
            # get all trajectories in a fixed order
            sorted_list = sorted(tica_data.keys(), key=keynat)
            data.extend([tica_data[i] for i in sorted_list])

    kmeans_mdl.fit(data)
    kmeans_mdl_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    verbosedump(kmeans_mdl, kmeans_mdl_path)
    return
def fit_bootstrap(yaml_file, pool=None):
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params = {}
    for i in mdl_params.keys():
        if i.startswith("msm__"):
            current_mdl_params[i.split("msm__")[1]] = mdl_params[i]

    if "bootstrap__n_samples" in mdl_params.keys():
        bootstrap__n_samples = mdl_params["bootstrap__n_samples"]
    else:
        bootstrap__n_samples = 100

    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            print(protein)
            assignments = verboseload("assignments.pkl")
            msm_mdl = BootStrapMarkovStateModel(
                n_samples=bootstrap__n_samples, n_procs=2,
                msm_args=current_mdl_params)
            msm_mdl.fit([assignments[i] for i in assignments.keys()],
                        pool=pool)
            verbosedump(msm_mdl, "bootstrap_msm_mdl.pkl")
            verbosedump(msm_mdl.mle_, "msm_mdl.pkl")

            fixed_assignments = {}
            for i in assignments.keys():
                fixed_assignments[i] = msm_mdl.mle_.transform(
                    assignments[i], mode='fill')[0]
            verbosedump(fixed_assignments, 'fixed_assignments.pkl')
    return
def fit_bayes_msms(yaml_file):
    mdl_params = yaml_file["mdl_params"]
    msm__lag_time = mdl_params["msm__lag_time"]

    if "bayesmsm__n_samples" in mdl_params.keys():
        bayesmsm__n_samples = mdl_params["bayesmsm__n_samples"]
    else:
        bayesmsm__n_samples = 800
    if "bayesmsm__n_steps" in mdl_params.keys():
        bayesmsm__n_steps = mdl_params["bayesmsm__n_steps"]
    else:
        bayesmsm__n_steps = 1000000

    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            print(protein)
            assignments = verboseload("assignments.pkl")
            msm_mdl = BayesianMarkovStateModel(
                n_samples=bayesmsm__n_samples,
                n_steps=bayesmsm__n_steps,
                lag_time=msm__lag_time,
                ergodic_cutoff=1.0 / msm__lag_time,
                verbose=True).fit(
                    [assignments[i] for i in assignments.keys()])
            _ = msm_mdl.all_eigenvalues_
            verbosedump(msm_mdl, "bayesmsm_mdl.pkl")

            fixed_assignments = {}
            for i in assignments.keys():
                fixed_assignments[i] = msm_mdl.transform(
                    assignments[i], mode='fill')[0]
            verbosedump(fixed_assignments, 'fixed_assignments.pkl')
    return
def find_most_important_residues_in_tIC(traj_file, tica_object,
                                        tic_features_csv, contact_residues,
                                        tic_residue_csv, feature_coefs_csv,
                                        duplicated_feature_coefs_csv, cutoff):
    try:
        tica = verboseload(tica_object)
    except Exception:
        tica = load_dataset(tica_object)
    print(traj_file)
    traj = md.load_frame(traj_file, 0)
    top = traj.topology

    #residue_pairs = compute_contacts_below_cutoff([traj_file, [0]], cutoff=cutoff, contact_residues=contact_residues, anton=True)
    residue_pairs = generate_features(tic_features_csv)
    new_residue_pairs = []
    for pair in residue_pairs:
        # append a single 2-tuple (the original passed two arguments to
        # list.append, which raises a TypeError)
        new_residue_pairs.append(
            ("%s%d.%d" % (pair[0][2], pair[0][1], pair[0][0]),
             "%s%d.%d" % (pair[1][2], pair[1][1], pair[1][0])))
    residue_pairs = new_residue_pairs

    top_indices_per_tIC = {}
    feature_coefs_per_tIC = {}
    duplicated_feature_coefs_per_tIC = {}

    # for each tIC: record the absolute coefficient of every feature,
    # duplicate each coefficient for the per-residue analysis, then sort
    # features by absolute coefficient value
    for i in range(0, np.shape(tica.components_)[0]):
        print(i)
        index_components = [(j, abs(tica.components_[i][j]))
                            for j in range(0, np.shape(tica.components_)[1])]
        feature_coefs_per_tIC[i] = [component[1]
                                    for component in index_components]
        duplicated_feature_coefs_per_tIC[i] = [
            j for k in feature_coefs_per_tIC[i] for j in (k, k)]

        index_components = sorted(index_components,
                                  key=lambda x: x[1], reverse=True)
        print(index_components[0:10])
        list_i = [index_components[j][0]
                  for j in range(0, len(index_components))]
        top_indices_per_tIC[i] = list_i

    top_residues_per_tIC = {}
    for i in range(0, np.shape(tica.components_)[0]):
        top_residues_per_tIC[i] = []
        for index in top_indices_per_tIC[i]:
            residues = residue_pairs[index]
            top_residues_per_tIC[i].append(residues)
        top_residues_per_tIC[i] = [
            item for sublist in top_residues_per_tIC[i] for item in sublist]

    residue_list = residue_pairs
    feature_coefs_per_tIC["residues_0"] = [pair[0] for pair in residue_list]
    feature_coefs_per_tIC["residues_1"] = [pair[1] for pair in residue_list]
    duplicated_feature_coefs_per_tIC["residues"] = [
        residue for residue_pair in residue_list for residue in residue_pair]

    write_map_to_csv(tic_residue_csv, top_residues_per_tIC, [])
    write_map_to_csv(feature_coefs_csv, feature_coefs_per_tIC, [])
    write_map_to_csv(duplicated_feature_coefs_csv,
                     duplicated_feature_coefs_per_tIC, [])
    return
def plot_all_tics_and_clusters(tica_dir, transformed_data_dir, clusterer_dir,
                               lag_time, label="dot", active_cluster_ids=[],
                               intermediate_cluster_ids=[],
                               inactive_cluster_ids=[]):
    try:
        transformed_data = verboseload(transformed_data_dir)
    except Exception:
        transformed_data = load_dataset(transformed_data_dir)
    clusterer = verboseload(clusterer_dir)

    num_tics = np.shape(transformed_data[0])[1]
    print("Looking at %d tICS" % num_tics)
    for i in range(0, num_tics):
        js = range(i + 1, num_tics)
        # pass the label argument through (the original hard-coded "dot")
        plot_partial = partial(
            plot_tica_and_clusters, tica_dir=tica_dir,
            transformed_data=transformed_data, clusterer=clusterer,
            lag_time=lag_time, label=label,
            active_cluster_ids=active_cluster_ids,
            intermediate_cluster_ids=intermediate_cluster_ids,
            inactive_cluster_ids=inactive_cluster_ids, component_i=i)
        pool = mp.Pool(mp.cpu_count())
        pool.map(plot_partial, js)
        pool.terminate()

    print("Printed all tICA coords and all requested clusters")
def fit_bootstrap(yaml_file, pool=None):
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params = {}
    bootstrap_mdl_params = {}
    for i in mdl_params.keys():
        if i.startswith("msm__"):
            current_mdl_params[i.split("msm__")[1]] = mdl_params[i]
        if i.startswith("bootstrap__"):
            bootstrap_mdl_params[i.split("bootstrap__")[1]] = mdl_params[i]

    if "n_samples" not in bootstrap_mdl_params.keys():
        bootstrap_mdl_params["n_samples"] = 100

    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            print(protein)
            assignments = verboseload("assignments.pkl")
            msm_mdl = BootStrapMarkovStateModel(
                n_procs=2, msm_args=current_mdl_params,
                **bootstrap_mdl_params)
            msm_mdl.fit([assignments[i] for i in assignments.keys()],
                        pool=pool)
            verbosedump(msm_mdl, "bootstrap_msm_mdl.pkl")
            verbosedump(msm_mdl.mle_, "msm_mdl.pkl")

            fixed_assignments = {}
            for i in assignments.keys():
                fixed_assignments[i] = msm_mdl.mle_.transform(
                    assignments[i], mode='fill')[0]
            verbosedump(fixed_assignments, 'fixed_assignments.pkl')
    return
def plot_tica(transformed_data_dir, lag_time):
    transformed_data = verboseload(transformed_data_dir)
    trajs = np.concatenate(transformed_data)
    plt.hexbin(trajs[:, 0], trajs[:, 1], bins='log', mincnt=1)

    pp = PdfPages(
        "/scratch/users/enf/b2ar_analysis/tica_phi_psi_chi2_t%d.pdf"
        % lag_time)
    pp.savefig()
    pp.close()
def plot_pnas_vs_tics(pnas_dir, tic_dir, pnas_names, directory,
                      scale=7.14, refcoords_file=None):
    pnas = np.concatenate(load_file(pnas_dir))
    pnas[:, 0] *= scale
    print(np.shape(pnas))
    print(len(pnas_names))

    if "ktICA" in tic_dir:
        tics = load_dataset(tic_dir)
    else:
        tics = verboseload(tic_dir)
    print(np.shape(tics))
    tics = np.concatenate(tics)
    print(np.shape(tics))

    if len(pnas_names) != np.shape(pnas)[1]:
        print("Invalid pnas names")
        return

    for i in range(0, np.shape(pnas)[1]):
        for j in range(0, np.shape(tics)[1]):
            tic = tics[:, j]
            pnas_coord = pnas[:, i]
            plt.hexbin(tic, pnas_coord, bins='log', mincnt=1)
            coord_name = pnas_names[i]
            tic_name = "tIC.%d" % (j + 1)
            plt.xlabel(tic_name)
            plt.ylabel(coord_name)
            pp = PdfPages("%s/%s_%s_hexbin.pdf"
                          % (directory, tic_name, coord_name))
            pp.savefig()
            pp.close()
            plt.clf()
    return
def normalize_project_series(yaml_file, output_folder="normalized_features",
                             stride=40, nrm=None):
    """
    Routine to take a set of protein features stored in the feature_dir
    and normalize them by removing the mean and setting the variance to 1
    using the standard scaler. The normalizer is dumped into the mdl dir.
    :param yaml_file: The yaml file to work with.
    :param output_folder: The name of the output folder to dump
    normalized features in.
    :param stride: The initial stride in files used to fit the normalizer.
    This is necessary to prevent memory errors; defaults to every 40th file.
    :param nrm: Previously fit normalizer. Otherwise the standard scaler
    from scikit-learn is used.
    :return:
    """
    yaml_file = load_yaml_file(yaml_file)

    # set up the normalizer
    if nrm is None:
        nrm = preprocessing.StandardScaler()

    all_data = {}
    for prt in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, prt):
            print(prt)
            flist = glob.glob("./%s/*.jl" % yaml_file["feature_dir"])[::stride]
            for f in flist:
                all_data[f] = verboseload(f)

    seq = []
    for i in all_data.keys():
        seq.extend(all_data[i])

    # fit it
    nrm.fit(seq)
    # dump it into the mdl dir
    verbosedump(nrm, "%s/nrm.h5" % yaml_file["mdl_dir"])

    for prt in yaml_file["protein_list"]:
        _check_output_folder_exists(yaml_file, prt, output_folder)
        with enter_protein_data_dir(yaml_file, prt):
            output_folder_path = os.path.abspath(output_folder)
            flist = glob.glob("./%s/*.jl" % yaml_file["feature_dir"])
            for f in flist:
                res = verboseload(f)
                res = nrm.transform(res)
                verbosedump(res, "%s/%s" % (output_folder_path,
                                            os.path.basename(f)))
    return
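# Hedged usage sketch for normalize_project_series: normalize all features
# once, after which transform_protein_tica above automatically prefers the
# ./normalized_features folder when it exists. "project.yaml" is a
# hypothetical placeholder path.
def example_normalize_project():
    normalize_project_series("project.yaml",
                             output_folder="normalized_features",
                             stride=40)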
def normalize_project_series(yaml_file, output_folder="normalized_features",
                             stride=1, nrm=None):
    """
    Routine to take a set of protein features stored in the feature_dir
    and normalize them by centering and scaling with the robust scaler.
    The normalizer is dumped into the mdl dir.
    :param yaml_file: The yaml file to work with.
    :param output_folder: The name of the output folder to dump
    normalized features in.
    :param stride: The initial stride in files used to fit the normalizer.
    This is necessary to prevent memory errors; defaults to every file.
    :param nrm: Previously fit normalizer. Otherwise the robust scaler
    from scikit-learn is used.
    :return:
    """
    yaml_file = load_yaml_file(yaml_file)

    # set up the normalizer
    if nrm is None:
        nrm = preprocessing.RobustScaler()

    all_data = {}
    for prt in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, prt):
            print(prt)
            flist = glob.glob("./%s/*.jl" % yaml_file["feature_dir"])[::stride]
            for f in flist:
                all_data[f] = verboseload(f)

    seq = []
    for i in all_data.keys():
        seq.extend(all_data[i])

    # fit it
    nrm.fit(seq)
    # dump it into the mdl dir
    verbosedump(nrm, "%s/nrm.h5" % yaml_file["mdl_dir"])

    for prt in yaml_file["protein_list"]:
        _check_output_folder_exists(yaml_file, prt, output_folder)
        with enter_protein_data_dir(yaml_file, prt):
            output_folder_path = os.path.abspath(output_folder)
            flist = glob.glob("./%s/*.jl" % yaml_file["feature_dir"])
            for f in flist:
                res = verboseload(f)
                res = nrm.transform(res)
                verbosedump(res, "%s/%s" % (output_folder_path,
                                            os.path.basename(f)))
    return
def reseed_from_clusterer(clusterer_file, main, tica_dir,
                          projected_features_dir, traj_files):
    clusterer = verboseload(clusterer_file)
    n_clusters = len(clusterer.cluster_centers_)
    print(n_clusters)
    clusters_map = make_clusters_map(verboseload(clusterer_file))

    # find the 16 least-populated clusters
    count_tuples = []
    for i in range(0, n_clusters):
        count_tuples.append((i, len(clusters_map[i])))
    count_tuples.sort(key=operator.itemgetter(1))
    min_populated_clusters = [count_tuples[i][0] for i in range(0, 16)]
    print(min_populated_clusters)

    plot_all_tics_and_clusters(tica_dir, projected_features_dir,
                               clusterer_file, None, tic_range=[0],
                               main=main, label="cluster_id",
                               active_cluster_ids=min_populated_clusters)

    traj_index_frame_pairs = list(find_closest_indices_to_cluster_center(
        projected_features_dir, clusterer_file))
    traj_index_frame_pairs = [tuple(pair) for pair in traj_index_frame_pairs]

    for i, traj_index_frame_pair in enumerate(traj_index_frame_pairs):
        traj_index, frame = traj_index_frame_pair
        if i in min_populated_clusters:
            print("Looking at cluster %d" % i)
            print("Snapshot in: %s" % str(traj_index_frame_pair))
            snapshot = md.load_frame(traj_files[traj_index], index=frame)
            snapshot.save("%s/%smincount_snapshot_cluster%d.rst7"
                          % (tica_dir, main, i))
            snapshot.save("%s/%smincount_snapshot_cluster%d.pdb"
                          % (tica_dir, main, i))
            protein_indices = [
                a.index for a in snapshot.topology.atoms
                if a.residue.is_protein or "LIG" in str(a.residue)
            ]
            snapshot_protein = snapshot.atom_slice(protein_indices)
            snapshot_protein.save(
                "%s/%smincount_snapshot_cluster%d_protein.pdb"
                % (tica_dir, main, i))

    return min_populated_clusters
def cluster_project_wrapper(proj_folder, feature_dict, n_states):
    if os.path.exists(proj_folder + "/assignments.pkl"):
        return (verboseload(proj_folder + "/cluster_mdl.pkl"),
                verboseload(proj_folder + "/assignments.pkl"))
    elif os.path.exists(proj_folder + "/cluster_mdl.pkl"):
        cluster_mdl = verboseload(proj_folder + "/cluster_mdl.pkl")
    else:
        cluster_mdl = KMeans(n_clusters=n_states)
        cluster_mdl.fit([feature_dict[i] for i in feature_dict.keys()])

    assignments = {}
    for i in feature_dict.keys():
        assignments[i] = cluster_mdl.transform([feature_dict[i]])

    verbosedump(cluster_mdl, proj_folder + "/cluster_mdl.pkl")
    verbosedump(assignments, proj_folder + "/assignments.pkl")
    return cluster_mdl, assignments
def fit_and_transform(directory, stride=5):
    projected_data_filename = ("/scratch/users/enf/b2ar_analysis/"
                               "phi_psi_chi_stride%d_projected.h5" % stride)
    fit_model_filename = ("/scratch/users/enf/b2ar_analysis/"
                          "phi_psi_chi2_stride%s_tica_coords.h5" % stride)
    #active_pdb_file = "/scratch/users/enf/b2ar_analysis/3P0G_pymol_prepped.pdb"
    active_pdb_file = "/scratch/users/enf/b2ar_analysis/system_B.pdb"

    tica_model = tICA(n_components=4)

    if not os.path.exists(projected_data_filename):
        print("loading feature files")
        feature_files = get_trajectory_files(directory)
        pool = mp.Pool(mp.cpu_count())
        features = pool.map(load_features, feature_files)
        pool.terminate()
        if not os.path.exists(fit_model_filename):
            print("fitting data to tICA model")
            fit_model = tica_model.fit(features)
            verbosedump(fit_model, fit_model_filename)
            transformed_data = fit_model.transform(features)
            verbosedump(transformed_data, projected_data_filename)
        else:
            print("loading tICA model")
            fit_model = verboseload(fit_model_filename)
            transformed_data = fit_model.transform(features)
            verbosedump(transformed_data, projected_data_filename)
    else:
        fit_model = verboseload(fit_model_filename)
        transformed_data = verboseload(projected_data_filename)

    active_pdb = md.load(active_pdb_file)
    top = active_pdb.topology
    atom_indices = [
        a.index for a in top.atoms
        if a.residue.is_protein and a.residue.resSeq != 341
        and a.residue.name[0:2] != "HI" and a.residue.resSeq != 79
        and a.residue.resSeq != 296 and a.residue.resSeq != 269
        and a.residue.resSeq != 178 and a.residue.resSeq != 93
        and a.residue.name != "NMA" and a.residue.name != "NME"
        and a.residue.name != "ACE"
    ]
    active_pdb = md.load(active_pdb_file, atom_indices=atom_indices)

    featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi2'])
    active_pdb_features = featurizer.transform(active_pdb)
    active_pdb_projected = fit_model.transform(active_pdb_features)
    print(active_pdb_projected[0:4])
def cluster_kmeans(tica_dir, data_dir, traj_dir, n_clusters, lag_time):
    clusterer_dir = "%s/clusterer_%dclusters.h5" % (tica_dir, n_clusters)
    if os.path.exists(clusterer_dir):
        print("Already clustered")
    else:
        print("Clustering by KMeans")
        reduced_data = verboseload(data_dir)
        trajs = np.concatenate(reduced_data)
        clusterer = KMeans(n_clusters=n_clusters, n_jobs=-1)
        clusterer.fit_transform(reduced_data)
        verbosedump(clusterer, clusterer_dir)
def transform_protein_kmeans(yaml_file, pca=False):
    mdl_dir = yaml_file["mdl_dir"]
    kmeans_mdl_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    kmeans_mdl = verboseload(kmeans_mdl_path)
    for protein in yaml_file["protein_list"]:
        print("Assigning protein %s" % protein)
        with enter_protein_mdl_dir(yaml_file, protein):
            if pca:
                tica_data = verboseload("pca_data.pkl")
            else:
                tica_data = verboseload("tica_data.pkl")
            # do assignments
            assignments = {}
            for i in tica_data.keys():
                assignments[i] = kmeans_mdl.predict([tica_data[i]])[0]
            verbosedump(assignments, 'assignments.pkl')
        print("Done assigning %s" % protein)
    return
def main():
    args = parse_commandline()
    traj_file = args.t
    top_file = args.p
    tica_file = args.c
    tic_index = args.i
    out_file = args.o
    stride = args.s
    describer = args.d
    cutoff = args.u

    # load everything
    trj = mdt.load(traj_file, top=top_file, stride=stride)
    tica_mdl = verboseload(tica_file)
    df = pd.DataFrame(verboseload(describer))

    dat_fn = "importances_{}.txt".format(out_file)
    tcl_fn = "{}.tcl".format(out_file)

    tica_to_vmd(df, tica_mdl, tic_index, traj_file, top_file,
                trj, stride, dat_fn, tcl_fn, cutoff)
def plot_all_tics(tica_dir, transformed_data_dir, lag_time):
    transformed_data = verboseload(transformed_data_dir)
    num_tics = np.shape(transformed_data[0])[1]
    print("Looking at %d tICS" % num_tics)
    for i in range(0, num_tics):
        for j in range(i + 1, num_tics):
            plot_tica_component_i_j(tica_dir, transformed_data_dir, lag_time,
                                    component_i=i, component_j=j)
    print("Printed all tICA coords")
def __init__(self, series, name):
    if not isinstance(series, ProteinSeries):
        raise Exception("We need a project series to be associated "
                        "with this kinase")

    self.name = name
    self.project = series
    self.kmeans_mdl = self.project.kmeans_mdl
    self.tica_mdl = self.project.tica_mdl
    self.protein_mdl_dir = os.path.join(self.project.relative_loc, self.name)

    if os.path.isfile("%s/bootstrap_msm_mdl.pkl" % self.protein_mdl_dir):
        self.bootstrap_msm = verboseload(
            "%s/bootstrap_msm_mdl.pkl" % self.protein_mdl_dir)
    if os.path.isfile("%s/msm_mdl.pkl" % self.protein_mdl_dir):
        self.msm = verboseload("%s/msm_mdl.pkl" % self.protein_mdl_dir)
    if os.path.isfile("%s/bayesmsm_mdl.pkl" % self.protein_mdl_dir):
        self.bayesmsm = verboseload(
            "%s/bayesmsm_mdl.pkl" % self.protein_mdl_dir)

    self.tica_data = verboseload("%s/tica_data.pkl" % self.protein_mdl_dir)
    self.assignments = verboseload(
        "%s/assignments.pkl" % self.protein_mdl_dir)
    self.fixed_assignments = verboseload(
        "%s/fixed_assignments.pkl" % self.protein_mdl_dir)

    self.n_states_ = self.msm.n_states_
    self.n_tics_ = self.kmeans_mdl.cluster_centers_.shape[1]

    self._computed = False
    self._tic_dict = None
    self._tic_min = None
    self._tic_max = None
    self._mlpt_fct = 0.4