def main(num_clusters, output_folder, ligand_resname, atom_ids, traj_folder):
    """Cluster ligand coordinates extracted from the trajectories with KMeans
    and write the cluster centers and representative structures to disk.

    :param num_clusters: number of KMeans clusters to build
    :param output_folder: subfolder (created inside traj_folder) for the output
        PDB files, or None to write into the working directory
    :param ligand_resname: residue name of the ligand whose coordinates are
        extracted
    :param atom_ids: atom ids passed to the coordinate extraction
    :param traj_folder: folder containing the simulation output to process
    :returns: tuple (trajectory folder, discretized-trajectories folder name)
    """
    extractCoords.main(folder_name=traj_folder, lig_resname=ligand_resname,
                       non_Repeat=True, atom_Ids=atom_ids)
    trajectoryFolder = "allTrajs"
    trajectoryBasename = "*trajectory*"
    stride = 1
    clusterCountsThreshold = 0
    clusteringObject = cluster.Cluster(num_clusters, trajectoryFolder,
                                       trajectoryBasename, alwaysCluster=False,
                                       stride=stride)
    clusteringObject.clusterTrajectories()
    clusteringObject.eliminateLowPopulatedClusters(clusterCountsThreshold)
    clusterCenters = clusteringObject.clusterCenters
    centersInfo = get_centers_info(trajectoryFolder, trajectoryBasename,
                                   num_clusters, clusterCenters)
    # BUG FIX: xrange is Python 2 only (NameError on Python 3); the sibling
    # functions in this file already use range
    COMArray = [centersInfo[i]['center'] for i in range(num_clusters)]
    if output_folder is not None:
        outputFolder = os.path.join(traj_folder, output_folder)
        if not os.path.exists(outputFolder):
            os.makedirs(outputFolder)
    else:
        outputFolder = ""
    writePDB(COMArray,
             os.path.join(outputFolder,
                          "clusters_%d_KMeans_allSnapshots.pdb" % num_clusters))
    writeInitialStructures(centersInfo, outputFolder, traj_folder)
    return trajectoryFolder, "discretized"
def main(ligand, clusters_file, conf_folder, topology=None):
    """Assign every snapshot of the simulation to a pre-computed set of
    cluster centers and write each snapshot's structure and ligand position
    into a per-cluster folder.

    :param ligand: ligand identifier (currently unused in the body; kept for
        interface compatibility)
    :param clusters_file: text file with the cluster centers (numpy format)
    :param conf_folder: folder containing the epoch subfolders with the
        extracted coordinates and trajectories
    :param topology: optional topology file, needed for non-PDB trajectories
    """
    trajFolder = "allTrajs_nonRepeat"
    cluster_centers = np.loadtxt(clusters_file)
    if not os.path.exists("discretized"):
        os.makedirs("discretized")
    if not os.path.exists(trajFolder):
        os.makedirs(trajFolder)
    stride = 1
    clusterCountsThreshold = 0
    trajBasename = "coord*"
    if topology is not None:
        topology_contents = utilities.getTopologyFile(topology)
    else:
        topology_contents = None
    epoch_folders = utilities.get_epoch_folders(conf_folder)
    numClusters = cluster_centers.shape[0]
    coordinates = [[] for cl in range(numClusters)]
    # Gather the extracted coordinates of every epoch into a single folder so
    # they can be discretized in one clustering pass
    for it in epoch_folders:
        files = glob.glob(conf_folder + "%s/extractedCoordinates/coord*" % it)
        for f in files:
            traj = os.path.splitext(f)[0].split("_")[-1]
            shutil.copy(f, trajFolder + "/coord_%s_%s.dat" % (it, traj))
    clusteringObject = cluster.Cluster(numClusters, trajFolder, trajBasename,
                                       alwaysCluster=False, stride=stride)
    clusteringObject.clusterTrajectories()
    clusteringObject.eliminateLowPopulatedClusters(clusterCountsThreshold)
    for i in range(numClusters):
        if not os.path.exists("cluster_%d" % i):
            os.makedirs("cluster_%d/allStructures" % i)
    dtrajs_files = glob.glob("discretized/*.disctraj")
    for dtraj in dtrajs_files:
        print(dtraj)
        traj = np.loadtxt(dtraj)
        # discretized trajectory files are named *_<epoch>_<traj>.disctraj
        epoch, traj_num = map(int, os.path.splitext(dtraj)[0].split("_", 3)[1:])
        trajPositions = np.loadtxt(trajFolder + "/coord_%d_%d.dat" % (epoch, traj_num))
        trajFile = glob.glob(
            os.path.join(conf_folder + "%d/trajectory_%d*" % (epoch, traj_num)))[0]
        snapshots = utilities.getSnapshots(trajFile, topology=topology)
        for nSnap, cluster_num in enumerate(traj):
            coordinates[int(cluster_num)].append(trajPositions[nSnap])
            filename = "cluster_%d/allStructures/conf_%d_%d_%d.pdb" % (
                cluster_num, epoch, traj_num, nSnap)
            # BUG FIX: basestring is Python 2 only (NameError on Python 3);
            # str is the correct check for text snapshots here
            if isinstance(snapshots[nSnap], str):
                with open(filename, "w") as fw:
                    fw.write(snapshots[nSnap])
            else:
                # mdtraj object: delegate PDB writing to the utilities helper
                utilities.write_mdtraj_object_PDB(snapshots[nSnap], filename,
                                                  topology_contents)
    for cl in range(numClusters):
        np.savetxt("cluster_%d/positions.dat" % cl, coordinates[cl])
def cluster_TICA_space(numClusters, trajectoryBasename_folder=None, *args, **kwargs):
    """placeholder"""
def main(lagtimes, clusters_file, disctraj, trajs, n_clusters, plots_path,
         save_plot, show_plot, lagtime_resolution=20):
    """Estimate, for each lagtime, the fraction of self-transition counts per
    cluster (a metastability proxy Q), report the clusters above a 0.01
    threshold, write them to a PDB and plot the results.

    :param lagtimes: list of lagtimes at which to estimate the Markov model
    :param clusters_file: file with cluster centers (or None to cluster anew)
    :param disctraj: folder with pre-discretized trajectories, or None
    :param trajs: folder with the raw trajectories (used when disctraj is None)
    :param n_clusters: number of clusters
    :param plots_path: folder for the plots, or None for the working directory
    :param save_plot: whether to save the generated plots
    :param show_plot: whether to display the generated plots
    :param lagtime_resolution: unused here; kept for interface compatibility
    """
    if disctraj is not None:
        # Discretization already on disk: load it together with the centers
        dtraj_paths = glob.glob(os.path.join(disctraj, "*traj*.disctraj"))
        dtrajs = [np.loadtxt(path, dtype=int) for path in dtraj_paths]
        clusterCenters = np.loadtxt(clusters_file)
    else:
        clusteringObject = cluster.Cluster(n_clusters, trajs, "traj*",
                                           alwaysCluster=False,
                                           discretizedPath=disctraj)
        if clusters_file is not None:
            # only assign
            clusteringObject.clusterCentersFile = clusters_file
        clusteringObject.clusterTrajectories()
        clusterCenters = clusteringObject.clusterCenters
        dtrajs = clusteringObject.dtrajs
    # Q[l, i] = diagonal (self-transition) counts of cluster i at lagtime l,
    # normalized by the total transition counts
    Q = []
    for lag in lagtimes:
        counts = msm.estimate_markov_model(dtrajs, lag).count_matrix_full
        Q.append(counts.diagonal() / counts.sum())
    Q = np.array(Q)
    print("Clusters over 0.01 metastability")
    correlation_limit = 0.01
    states2 = np.where(Q[-1] > correlation_limit)[0]
    size2 = states2.size
    if len(states2):
        print(" ".join(map(str, states2)))
    print("Number of clusters:", size2,
          ", %.2f%% of the total" % (100 * size2 / float(n_clusters)))
    utilities.write_PDB_clusters(np.hstack((clusterCenters, Q[:-1].T)),
                                 use_beta=True, title="cluster_Q.pdb")
    if plots_path is None:
        plots_path = ""
    else:
        utilities.makeFolder(plots_path)
    create_plots(Q, plots_path, save_plot, show_plot, n_clusters, lagtimes,
                 threshold=2.0)
def getRepresentativePDBs(filesWildcard, run):
    """Assign the trajectories matching *filesWildcard* to the existing
    cluster centers and write a table with the representative structure
    (epoch, trajectory, snapshot) of each cluster.

    :param filesWildcard: glob pattern matching the coordinate files
    :param run: run index, used to name the output file
    """
    files = glob.glob(filesWildcard)
    # Drop the first column (snapshot index) of every coordinates file
    trajs = [utilities.loadtxtfile(f)[:, 1:] for f in files]
    cl = cluster.Cluster(0, "", "")
    cl.clusterCenters = utilities.loadtxtfile(cl.clusterCentersFile)
    dtrajs = cl.assignNewTrajectories(trajs)
    numClusters = cl.clusterCenters.shape[0]
    centersInfo = getCentersInfo(cl.clusterCenters, trajs, files, dtrajs)
    if not os.path.exists("representative_structures"):
        os.makedirs("representative_structures")
    outname = "representative_structures/representative_structures_%d.dat" % run
    with open(outname, "w") as fw:
        fw.write("Cluster\tEpoch\tTrajectory\tSnapshot\n")
        for clNum in range(numClusters):
            row = "\t".join(centersInfo[clNum]["structure"])
            fw.write("%d\t" % clNum + row + "\n")
def main(num_clusters, output_folder, ligand_resname, atom_ids,
         folder_name=".", topology=None):
    """Extract ligand coordinates, cluster them with KMeans and write the
    cluster centers plus one initial structure per cluster.

    :param num_clusters: number of KMeans clusters to build
    :param output_folder: folder for the output PDBs, or None for the
        working directory
    :param ligand_resname: residue name of the ligand to extract
    :param atom_ids: atom ids used for the coordinate extraction
    :param folder_name: folder with the simulation output (default ".")
    :param topology: optional topology file for non-PDB trajectories
    """
    extractCoords.main(folder_name, lig_resname=ligand_resname,
                       non_Repeat=True, atom_Ids=atom_ids)
    trajectoryFolder = "allTrajs"
    trajectoryBasename = "traj*"
    stride = 1
    clusterCountsThreshold = 0
    folders = utilities.get_epoch_folders(folder_name)
    folders.sort(key=int)
    # If there is a previous clustering, remove to cluster again
    if os.path.exists("discretized"):
        shutil.rmtree("discretized")
    clustering = cluster.Cluster(num_clusters, trajectoryFolder,
                                 trajectoryBasename, alwaysCluster=False,
                                 stride=stride)
    clustering.clusterTrajectories()
    clustering.eliminateLowPopulatedClusters(clusterCountsThreshold)
    centersInfo = get_centers_info(trajectoryFolder, trajectoryBasename,
                                   num_clusters, clustering.clusterCenters)
    # Keep only the x, y, z components of each center
    COMArray = [centersInfo[i]['center'][:3] for i in range(num_clusters)]
    if output_folder is None:
        outputFolder = ""
    else:
        outputFolder = os.path.join(output_folder, "")
        if not os.path.exists(outputFolder):
            os.makedirs(outputFolder)
    writePDB(COMArray,
             outputFolder + "clusters_%d_KMeans_allSnapshots.pdb" % num_clusters)
    writeInitialStructures(centersInfo, outputFolder + "initial_%d.pdb",
                           topology=topology)
def main(control_file):
    """Cluster the trajectories and estimate a Markov state model using the
    settings read from *control_file*.

    :param control_file: path to the control file parsed by readParams
    """
    # parameters
    (trajectoryFolder, trajectoryBasename, numClusters, stride, lagtimes,
     _, _, numberOfITS, _, _, lagtime, clusterCountsThreshold) = readParams(control_file)
    # program
    clustering = cluster.Cluster(numClusters, trajectoryFolder,
                                 trajectoryBasename, alwaysCluster=False,
                                 stride=stride)
    clustering.clusterTrajectories()
    clustering.eliminateLowPopulatedClusters(clusterCountsThreshold)
    calculateMSM = estimate.MSM(error=False, dtrajs=clustering.dtrajs)
    calculateMSM.estimate(lagtime=lagtime, lagtimes=lagtimes,
                          numberOfITS=numberOfITS)
def main(lagtime, clusters_file, disctraj, trajs, n_clusters, plots_path,
         save_plot, show_plot, lagtime_resolution=20):
    """Compute the autocorrelation of the discretized trajectories over a
    range of lagtimes, report the clusters whose autocorrelation has not
    decayed, write them to a PDB and plot the results.

    :param lagtime: maximum lagtime to explore
    :param clusters_file: file with cluster centers (or None to cluster anew)
    :param disctraj: folder with pre-discretized trajectories, or None
    :param trajs: folder with the raw trajectories (used when disctraj is None)
    :param n_clusters: number of clusters
    :param plots_path: folder for the plots, or None for the working directory
    :param save_plot: whether to save the generated plots
    :param show_plot: whether to display the generated plots
    :param lagtime_resolution: step between the explored lagtimes
    """
    lagtimes = list(range(1, lagtime, lagtime_resolution))
    n_lags = len(lagtimes)
    if disctraj is None:
        clusteringObject = cluster.Cluster(n_clusters, trajs, "traj*",
                                           alwaysCluster=False)
        if clusters_file is not None:
            # only assign
            utilities.makeFolder(clusteringObject.discretizedFolder)
            clusteringObject.clusterCentersFile = clusters_file
        clusteringObject.clusterTrajectories()
        disctraj = clusteringObject.discretizedFolder
        clusterCenters = clusteringObject.clusterCenters
    else:
        clusterCenters = utilities.loadtxtfile(clusters_file)
        if len(clusterCenters) != n_clusters:
            raise ValueError(
                "Number of clusters specified in the -n parameter does not match the provided clusters"
            )
    print("Calculating autocorrelation...")
    dtrajs = glob.glob(os.path.join(disctraj, "traj*"))
    dtrajs_loaded = [
        utilities.loadtxtfile(dtraj, dtype=int) for dtraj in dtrajs
    ]
    autoCorr = utils.calculateAutoCorrelation(lagtimes, dtrajs_loaded,
                                              n_clusters, n_lags)
    np.save("autoCorr.npy", autoCorr)
    # __cleanupFiles(parameters.trajWildcard, False)
    # Store |autocorrelation at the longest lagtime| in the beta column
    utilities.write_PDB_clusters(np.vstack(
        (clusterCenters.T, np.abs(autoCorr[:, -1]))).T,
                                 use_beta=True, title="cluster_autoCorr.pdb")
    print("Clusters over correlation time limit")
    correlation_limit = np.exp(-1)
    states2 = np.where(autoCorr[:, -1] > correlation_limit)[0]
    size2 = states2.size
    if len(states2):
        print(" ".join(map(str, states2)))
    print("Number of clusters:", size2,
          ", %.2f%% of the total" % (100 * size2 / float(n_clusters)))
    print("Clusters with more than 0.1 autocorrelation")
    states1 = np.where(autoCorr[:, -1] > 0.1)[0]
    size1 = states1.size
    if len(states1):
        print(" ".join(map(str, states1)))
    print("Number of clusters:", size1,
          ", %.2f%% of the total" % (100 * size1 / float(n_clusters)))
    if size2 > 0:
        print("Correlation time not achieved at lagtime %d" % lagtime)
    else:
        # Walk backwards from the longest lagtime to find the last one at
        # which some state was still above the 1/e limit
        for i in range(len(lagtimes)):
            states = np.where(autoCorr[:, -i - 1] > correlation_limit)[0]
            if len(states):
                string_states = ", ".join(map(str, states))
                # BUG FIX: the original printed lagtimes[-i], but -0 == 0, so
                # at i == 0 it reported the FIRST lagtime while the column
                # inspected (autoCorr[:, -i - 1]) corresponds to the LAST;
                # lagtimes[-i - 1] matches the inspected column for every i
                print("Correlation time %d, for states: %s" %
                      (lagtimes[-i - 1], string_states))
                break
    if plots_path is None:
        plots_path = ""
    else:
        utilities.makeFolder(plots_path)
    create_plots(autoCorr, plots_path, save_plot, show_plot, n_clusters,
                 lagtimes, threshold=2.0)
def main(num_clusters, criteria1, criteria2, ligand_resname, output_folder="ClusterCentroids", atom_ids="", cpus=2, topology=None, report="report_", traj="trajectory_", use_pdb=False, png=False, CA=0, sidechains=0, restart="all"):
    """Cluster the simulation snapshots by ligand COM, collect the report
    metrics into a CSV, and write the representative structure of each
    cluster annotated with two report metrics.

    :param num_clusters: number of KMeans clusters to build
    :param criteria1: first report column/criterion used to label clusters
    :param criteria2: second report column/criterion used to label clusters
    :param ligand_resname: residue name of the ligand to extract
    :param output_folder: folder for the output PDBs (None -> working dir)
    :param atom_ids: atom ids used for the coordinate extraction
    :param cpus: number of worker processes for the multiprocessing pool
    :param topology: optional topology file for non-PDB trajectories
    :param report: basename prefix of the report files
    :param traj: basename prefix of the trajectory files
    :param use_pdb: passed through to the structure-writing helper
    :param png: whether plotClusters saves PNG output
    :param CA: protein_CA flag forwarded to the coordinate extraction
    :param sidechains: sidechains flag forwarded to the coordinate extraction
    :param restart: if falsy, cluster and rebuild Simulation.csv; if truthy
        (default "all"), reuse Simulation.csv and clustercenters.dat from disk
    """
    # Create multiprocess pool
    if cpus > 1:
        pool = mp.Pool(cpus)
    else:
        pool = mp.Pool(1)
    # Extract COM ligand for each snapshot
    # (skipped when the extracted coordinates already exist on disk)
    if not glob.glob("allTrajs/traj*"):
        extractCoords.main(lig_resname=ligand_resname, non_Repeat=True, atom_Ids=atom_ids, nProcessors=cpus, parallelize=True, topology=topology, protein_CA=CA, sidechains=sidechains)
    print("Clusterize trajectories by RMSD of COM")
    trajectoryFolder = "allTrajs"
    trajectoryBasename = "*traj*"
    stride = 1
    clusterCountsThreshold = 0
    folders = utilities.get_epoch_folders(".")
    folders.sort(key=int)
    if not restart:
        # Fresh run: cluster from scratch and persist the centers
        clusteringObject = cluster.Cluster(num_clusters, trajectoryFolder, trajectoryBasename, alwaysCluster=True, stride=stride)
        clusteringObject.clusterTrajectories()
        clusteringObject.eliminateLowPopulatedClusters(clusterCountsThreshold)
        clusterCenters = clusteringObject.clusterCenters
        np.savetxt("clustercenters.dat", clusterCenters)
        dtrajs = clusteringObject.dtrajs
        print("Extract metrics for each snapshot")
        min_metric_trajs = {}  # NOTE(review): never populated or read — dead variable
        # NOTE(review): glob.glob("./*/") yields entries like "./0/", which can
        # never satisfy str.isdigit(), so epochs is always empty; it is also
        # unused below — confirm whether this line can be removed
        epochs = [folder for folder in glob.glob("./*/") if folder.isdigit()]
        reports = simulationToCsv.gather_reports()
        fields = simulationToCsv.retrieve_fields(reports[0])
        df = simulationToCsv.init_df(fields)
        df = simulationToCsv.fill_data(reports, df, pool)
        print("Update data with metrics and clusters")
        df.index = range(df.shape[0])
        df["Cluster"] = [None] * df.shape[0]
        # One work item per trajectory: (shared df, trajectory filename,
        # its discretized assignment)
        input_list = [[
            df, Traj, d
        ] for d, Traj in zip(dtrajs, clusteringObject.trajFilenames)]
        results = pool.map(save_to_df, input_list)
        for data in results:
            for df_tmp in data:
                df.update(df_tmp)
        df.to_csv("Simulation.csv", index=False)
    if restart:
        # Restart run: reuse the CSV and centers produced by a previous run
        df = pd.read_csv("Simulation.csv")
        clusterCenters = utilities.loadtxtfile("clustercenters.dat")
        print(clusterCenters)
    centersInfo = get_centers_info(trajectoryFolder, trajectoryBasename, num_clusters, clusterCenters)
    COMArray = [centersInfo[i]['center'] for i in range(num_clusters)]
    print("Retrieve clusters and metric")
    fields1 = []
    fields2 = []
    print(centersInfo)
    # Look up the two requested metrics for the representative snapshot
    # (epoch, trajectory, snapshot) of every cluster
    for cluster_num in centersInfo:
        epoch_num, traj_num, snap_num = map(
            int, centersInfo[cluster_num]['structure'])
        field1, crit1_name = get_metric(criteria1, epoch_num, traj_num,
                                        snap_num, report)
        field2, crit2_name = get_metric(criteria2, epoch_num, traj_num,
                                        snap_num, report)
        fields1.append(field1)
        fields2.append(field2)
    if output_folder is not None:
        outputFolder = os.path.join(output_folder, "")
        if not os.path.exists(outputFolder):
            os.makedirs(outputFolder)
    else:
        outputFolder = ""
    print("Output structures")
    writePDB(
        COMArray,
        outputFolder + "clusters_%d_KMeans_allSnapshots.pdb" % num_clusters)
    writeInitialStructures(fields1, fields2, crit1_name, crit2_name,
                           centersInfo, outputFolder + "cluster_{}_{}_{}_{}_{}.pdb",
                           traj, topology=topology, use_pdb=use_pdb)
    plotClusters(fields1, fields2, crit1_name, crit2_name, outputFolder,
                 png=png)
    assesClusterConvergence(df, num_clusters, traj, topology)
    return
def main(num_clusters, criteria1, criteria2, output_folder, ligand_resname,
         atom_ids, cpus=2, topology=None, report="report_", traj="trajectory_",
         use_pdb=False):
    """Cluster the extracted ligand coordinates with KMeans, retrieve two
    report metrics for each cluster's representative snapshot and write the
    centers, the representative structures and a plot of the clusters.

    :param num_clusters: number of KMeans clusters to build
    :param criteria1: first report column/criterion used to label clusters
    :param criteria2: second report column/criterion used to label clusters
    :param output_folder: folder for the output PDBs (None -> working dir)
    :param ligand_resname: residue name of the ligand to extract
    :param atom_ids: atom ids used for the coordinate extraction
    :param cpus: number of processors for the coordinate extraction
    :param topology: optional topology file for non-PDB trajectories
    :param report: basename prefix of the report files
    :param traj: basename prefix of the trajectory files
    :param use_pdb: passed through to extraction and structure writing
    """
    # Extract coordinates only if no previous extraction is on disk
    if not glob.glob("*/extractedCoordinates/coord_*"):
        extractCoords.main(lig_resname=ligand_resname, non_Repeat=True,
                           atom_Ids=atom_ids, nProcessors=cpus,
                           parallelize=False, topology=topology,
                           use_pdb=use_pdb)
    trajectoryFolder = "allTrajs"
    trajectoryBasename = "*traj*"
    stride = 1
    clusterCountsThreshold = 0
    folders = utilities.get_epoch_folders(".")
    folders.sort(key=int)
    clusteringObject = cluster.Cluster(num_clusters, trajectoryFolder,
                                       trajectoryBasename, alwaysCluster=True,
                                       stride=stride)
    clusteringObject.clusterTrajectories()
    clusteringObject.eliminateLowPopulatedClusters(clusterCountsThreshold)
    clusterCenters = clusteringObject.clusterCenters
    centersInfo = get_centers_info(trajectoryFolder, trajectoryBasename,
                                   num_clusters, clusterCenters)
    # BUG FIX: xrange is Python 2 only (NameError on Python 3); the sibling
    # functions in this file already use range
    COMArray = [centersInfo[i]['center'] for i in range(num_clusters)]
    fields1 = []
    fields2 = []
    # Look up the two requested metrics for the representative snapshot
    # (epoch, trajectory, snapshot) of every cluster
    for cluster_num in centersInfo:
        epoch_num, traj_num, snap_num = map(
            int, centersInfo[cluster_num]['structure'])
        field1, crit1_name = get_metric(criteria1, epoch_num, traj_num,
                                        snap_num, report)
        field2, crit2_name = get_metric(criteria2, epoch_num, traj_num,
                                        snap_num, report)
        fields1.append(field1)
        fields2.append(field2)
    if output_folder is not None:
        outputFolder = os.path.join(output_folder, "")
        if not os.path.exists(outputFolder):
            os.makedirs(outputFolder)
    else:
        outputFolder = ""
    writePDB(
        COMArray,
        outputFolder + "clusters_%d_KMeans_allSnapshots.pdb" % num_clusters)
    writeInitialStructures(fields1, fields2, crit1_name, crit2_name,
                           centersInfo, outputFolder + "cluster_{}_{}_{}_{}_{}.pdb",
                           traj, topology=topology, use_pdb=use_pdb)
    plotClusters(fields1, fields2, crit1_name, crit2_name, outputFolder)