def get_rmsd_matrix(self, align, symmetry):
    """Return the full (square) RMSD matrix for the two sampled test RMFs.

    Coordinates are read from the input files ``SampledA.rmf3`` and
    ``SampledC.rmf3`` via
    ``rmsd_calculation.get_rmfs_coordinates_one_rmf``, the pairwise RMSDs
    are computed with ``rmsd_calculation.get_rmsds_matrix`` and the result
    is then reloaded from ``Distances_Matrix.data`` in the current
    directory.

    :param align: forwarded unchanged to ``get_rmsds_matrix``.
    :param symmetry: forwarded unchanged to
        ``get_rmfs_coordinates_one_rmf``.
    :return: the condensed distance data expanded with
        ``scipy.spatial.distance.squareform``.
    """
    rmf_first = self.get_input_file_name("SampledA.rmf3")
    rmf_second = self.get_input_file_name("SampledC.rmf3")

    # Only the coordinates and the symmetry groups are used below; the
    # remaining fields of the returned 7-tuple are unpacked and ignored.
    (_particles, _masses, _radii, coords, groups, _model_names,
     _model_counts) = rmsd_calculation.get_rmfs_coordinates_one_rmf(
        "./", rmf_first, rmf_second, None, symmetry, None, 1)

    # The return value is not needed here.
    # NOTE(review): presumably this call is what writes
    # "Distances_Matrix.data", which is loaded just below - confirm.
    _ = rmsd_calculation.get_rmsds_matrix(  # noqa
        coords, 'cpu_omp', align, 2, groups)
    del coords

    handler = MatrixHandler()
    handler.loadMatrix("Distances_Matrix.data")
    condensed = handler.getMatrix().get_data()
    return sp.spatial.distance.squareform(condensed)
args.extension = "rmf3" ps_names, masses, radii, conforms, models_name = get_rmfs_coordinates( args.path, idfile_A, idfile_B, args.subunit) print "Size of conformation matrix", conforms.shape if not args.skip_sampling_precision: inner_data = get_rmsds_matrix(conforms, args.mode, args.align, args.cores) print "Size of RMSD matrix (flattened):", inner_data.shape import pyRMSD.RMSDCalculator from pyRMSD.matrixHandler import MatrixHandler mHandler = MatrixHandler() mHandler.loadMatrix("Distances_Matrix.data") rmsd_matrix = mHandler.getMatrix() distmat = rmsd_matrix.get_data() distmat_full = sp.spatial.distance.squareform(distmat) print "Size of RMSD matrix (unpacked, N x N):", distmat_full.shape # Get model lists sampleA_all_models, sampleB_all_models = get_sample_identity( idfile_A, idfile_B) total_num_models = len(sampleA_all_models) + len(sampleB_all_models) all_models = sampleA_all_models + sampleB_all_models print "Size of Sample A:", len( sampleA_all_models), " ; Size of Sample B: ", len( sampleB_all_models), "; Total", total_num_models if not args.skip_sampling_precision:
def main():
    """Run the sampling-exhaustiveness analysis from the command line.

    The steps, in order:

    0. Read the two score files (``args.scoreA`` / ``args.scoreB`` under
       ``args.path``) and run the score convergence tests.
    1. Read model coordinates (PDB files, a pair of RMF files, or the
       models listed in ``Identities_A.txt`` / ``Identities_B.txt``),
       compute the pairwise RMSD matrix unless
       ``args.skip_sampling_precision`` is set, and load the matrix from
       ``Distances_Matrix.data`` in the current directory.
    2. Unless skipped, cluster at a grid of thresholds to determine the
       sampling precision and write
       ``<sysname>.Sampling_Precision_Stats.txt``; otherwise use
       ``args.cluster_threshold``.
    3. Cluster at the final threshold, write
       ``<sysname>.Cluster_Population.txt`` and
       ``<sysname>.Cluster_Precision.txt``, and for every retained cluster
       create ``cluster.<i>/`` (densities and the cluster center model)
       plus the ``cluster.<i>.{all,sample_A,sample_B}.txt`` member lists.
    4. If ``args.gnuplot`` is set, run every ``*.plt`` script found in the
       ``gnuplot_scripts`` data directory of ``IMP.sampcon``.

    All output is written relative to the current working directory.
    An existing ``cluster.<i>`` directory is deleted and recreated.
    """
    args = parse_args()

    # Imports are done after argument parsing, as in the original code
    # (presumably so that usage errors do not pay the import cost).
    import os
    import shutil
    import numpy
    import scipy as sp

    import IMP.sampcon
    from IMP.sampcon import scores_convergence, clustering_rmsd
    from IMP.sampcon import rmsd_calculation, precision_rmsd

    import IMP

    idfile_A = "Identities_A.txt"
    idfile_B = "Identities_B.txt"

    # Step 0: Compute Score convergence
    # Each score file holds one floating point score per line.
    with open(os.path.join(args.path, args.scoreA), 'r') as f:
        score_A = [float(line.strip("\n")) for line in f]

    with open(os.path.join(args.path, args.scoreB), 'r') as f:
        score_B = [float(line.strip("\n")) for line in f]

    scores = score_A + score_B

    # Get the convergence of the best score
    scores_convergence.get_top_scorings_statistics(scores, 0, args.sysname)

    # Check if the two score distributions are similar
    scores_convergence.get_scores_distributions_KS_Stats(
        score_A, score_B, 100, args.sysname)

    # Step 1: Compute RMSD matrix
    if args.extension == "pdb":
        ps_names = []  # bead names are not stored in PDB files
        symm_groups = None
        conforms, masses, radii, models_name = \
            rmsd_calculation.get_pdbs_coordinates(
                args.path, idfile_A, idfile_B)
    else:
        args.extension = "rmf3"
        # If we have a single RMF file, read conformations from that
        if args.rmf_A is not None:
            (ps_names, masses, radii, conforms, symm_groups, models_name,
             n_models) = rmsd_calculation.get_rmfs_coordinates_one_rmf(
                args.path, args.rmf_A, args.rmf_B, args.subunit,
                args.symmetry_groups)
        # If not, default to the Identities.txt file
        else:
            symm_groups = None
            (ps_names, masses, radii, conforms,
             models_name) = rmsd_calculation.get_rmfs_coordinates(
                args.path, idfile_A, idfile_B, args.subunit)

    print("Size of conformation matrix", conforms.shape)

    if not args.skip_sampling_precision:
        # get_rmsds_matrix modifies conforms, so save it to a file and restore
        # afterwards (so that we retain the original IMP orientation)
        numpy.save("conforms", conforms)
        inner_data = rmsd_calculation.get_rmsds_matrix(
            conforms, args.mode, args.align, args.cores, symm_groups)
        print("Size of RMSD matrix (flattened):", inner_data.shape)
        del conforms
        conforms = numpy.load("conforms.npy")
        os.unlink('conforms.npy')

    # The distance matrix is always read back from disk, so when the
    # sampling precision step is skipped "Distances_Matrix.data" must
    # already exist in the current directory.
    from pyRMSD.matrixHandler import MatrixHandler
    mHandler = MatrixHandler()
    mHandler.loadMatrix("Distances_Matrix.data")

    rmsd_matrix = mHandler.getMatrix()
    distmat = rmsd_matrix.get_data()

    distmat_full = sp.spatial.distance.squareform(distmat)
    print("Size of RMSD matrix (unpacked, N x N):", distmat_full.shape)

    # Get model lists
    # NOTE(review): n_models is only assigned in the single-RMF branch
    # above; combining the "pdb" extension with rmf_A would fail here -
    # confirm that parse_args rules that combination out.
    if args.rmf_A is not None:
        sampleA_all_models = list(range(n_models[0]))
        sampleB_all_models = list(range(n_models[0],
                                        n_models[1] + n_models[0]))
        total_num_models = n_models[1] + n_models[0]
    else:
        (sampleA_all_models,
         sampleB_all_models) = clustering_rmsd.get_sample_identity(
            idfile_A, idfile_B)
        total_num_models = len(sampleA_all_models) + len(sampleB_all_models)
    all_models = list(sampleA_all_models) + list(sampleB_all_models)
    print("Size of Sample A:", len(sampleA_all_models),
          " ; Size of Sample B: ", len(sampleB_all_models),
          "; Total", total_num_models)

    if not args.skip_sampling_precision:
        print("Calculating sampling precision")

        # Step 2: Cluster at intervals of grid size to get the
        # sampling precision
        gridSize = args.gridsize

        # Get cutoffs for clustering
        cutoffs_list = clustering_rmsd.get_cutoffs_list(distmat, gridSize)
        print("Clustering at thresholds:", cutoffs_list)

        # Do clustering at each cutoff
        pvals, cvs, percents = clustering_rmsd.get_clusters(
            cutoffs_list, distmat_full, all_models, total_num_models,
            sampleA_all_models, sampleB_all_models, args.sysname)

        # Now apply the rule for selecting the right precision based
        # on population of contingency table, pvalue and cramersv
        (sampling_precision, pval_converged, cramersv_converged,
         percent_converged) = clustering_rmsd.get_sampling_precision(
            cutoffs_list, pvals, cvs, percents)

        # Output test statistics
        with open("%s.Sampling_Precision_Stats.txt"
                  % args.sysname, 'w+') as fpv:
            print(
                "The sampling precision is defined as the largest allowed "
                "RMSD between the cluster centroid and a ", args.sysname,
                "model within any cluster in the finest clustering for "
                "which each sample contributes models proportionally to "
                "its size (considering both significance and magnitude of "
                "the difference) and for which a sufficient proportion of "
                "all models occur in sufficiently large clusters. The "
                "sampling precision for our ", args.sysname,
                " modeling is %.3f" % (sampling_precision), " A.", file=fpv)

            print(
                "Sampling precision, P-value, Cramer's V and percentage "
                "of clustered models below:", file=fpv)
            print("%.3f\t%.3f\t%.3f\t%.3f"
                  % (sampling_precision, pval_converged, cramersv_converged,
                     percent_converged), file=fpv)
            print("", file=fpv)

        final_clustering_threshold = sampling_precision
    else:
        final_clustering_threshold = args.cluster_threshold

    # Perform final clustering at the required precision
    print("Clustering at threshold %.3f" % final_clustering_threshold)
    (cluster_centers, cluster_members) = clustering_rmsd.precision_cluster(
        distmat_full, total_num_models, final_clustering_threshold)

    (ctable, retained_clusters) = clustering_rmsd.get_contingency_table(
        len(cluster_centers), cluster_members, all_models,
        sampleA_all_models, sampleB_all_models)
    print("Contingency table:", ctable)

    # Output the number of models in each cluster and each sample
    with open("%s.Cluster_Population.txt" % args.sysname, 'w+') as fcp:
        for row_index, row in enumerate(ctable):
            print(row_index, row[0], row[1], file=fcp)

    # Obtain the subunits for which we need to calculate densities
    density_custom_ranges = precision_rmsd.parse_custom_ranges(args.density)

    # Set for constant-time "is this model from sample A?" checks inside
    # the per-member loop (the list itself is still passed to the helpers).
    sampleA_lookup = set(sampleA_all_models)

    # Output cluster precisions; the context manager guarantees the file is
    # closed even if processing one of the clusters raises.
    with open("%s.Cluster_Precision.txt" % args.sysname, 'w+') as fpc:
        # For each cluster, output the models in the cluster
        # Also output the densities for the cluster models
        for i, clus in enumerate(retained_clusters):
            members = cluster_members[clus]
            n_members = len(members)

            # The cluster centroid is the first conformation.
            # We use this as to align and compute RMSD/precision
            conform_0 = conforms[all_models[members[0]]]

            # create a (fresh) directory for the cluster
            cluster_dir = "./cluster.%s" % i
            if os.path.exists(cluster_dir):
                shutil.rmtree(cluster_dir)
            os.mkdir(cluster_dir)
            os.mkdir("./cluster.%s/Sample_A/" % i)
            os.mkdir("./cluster.%s/Sample_B/" % i)

            # Create densities for all subunits for both sample A and
            # sample B as well as separately.
            gmd1 = precision_rmsd.GetModelDensity(
                custom_ranges=density_custom_ranges,
                resolution=args.density_threshold, voxel=args.voxel,
                bead_names=ps_names)
            gmd2 = precision_rmsd.GetModelDensity(
                custom_ranges=density_custom_ranges,
                resolution=args.density_threshold, voxel=args.voxel,
                bead_names=ps_names)
            gmdt = precision_rmsd.GetModelDensity(
                custom_ranges=density_custom_ranges,
                resolution=args.density_threshold, voxel=args.voxel,
                bead_names=ps_names)

            # Also output the identities of cluster members
            with open('cluster.' + str(i) + '.all.txt', 'w') as both_file, \
                    open('cluster.' + str(i) + '.sample_A.txt',
                         'w') as sampleA_file, \
                    open('cluster.' + str(i) + '.sample_B.txt',
                         'w') as sampleB_file:

                # Create a model with just the cluster_member particles
                model = IMP.Model()
                ps = []  # particle list to be updated by each RMF frame
                for pi in range(len(conform_0)):
                    p = IMP.Particle(model, "%s" % str(pi))
                    IMP.core.XYZ.setup_particle(p, (0, 0, 0))
                    IMP.core.XYZR.setup_particle(p, float(radii[pi]))
                    IMP.atom.Mass.setup_particle(p, float(masses[pi]))
                    ps.append(p)

                # Obtain cluster precision by obtaining average RMSD of
                # each model to the cluster center
                cluster_precision = 0.0

                # transformation from internal pyRMSD orientation
                trans = None
                # for each model in the cluster
                for mem in members:
                    model_index = all_models[mem]

                    # get superposition of each model to cluster center
                    # and the RMSD between the two
                    if args.symmetry_groups:
                        rmsd, superposed_ps, trans = \
                            precision_rmsd.get_particles_from_superposed_amb(
                                conforms[model_index], conform_0, args.align,
                                ps, trans, symm_groups)
                    else:
                        rmsd, superposed_ps, trans = \
                            precision_rmsd.get_particles_from_superposed(
                                conforms[model_index], conform_0, args.align,
                                ps, trans)

                    model.update()  # why not?

                    cluster_precision += rmsd

                    # Add the superposed particles to the respective
                    # density maps
                    gmdt.add_subunits_density(superposed_ps)  # total map
                    print(model_index, file=both_file)

                    if model_index in sampleA_lookup:
                        # density map for sample A
                        gmd1.add_subunits_density(superposed_ps)
                        print(model_index, file=sampleA_file)
                    else:
                        # density map for sample B
                        gmd2.add_subunits_density(superposed_ps)
                        print(model_index, file=sampleB_file)

            # Average over the members other than the centroid itself.
            # A single-member cluster has no such members, so the division
            # is skipped rather than dividing by zero.
            if n_members > 1:
                cluster_precision /= float(n_members - 1.0)

            print(
                "Cluster precision (average distance to cluster centroid) "
                "of cluster ", str(i), " is %.3f" % cluster_precision, "A",
                file=fpc)

            # Output density files for the cluster
            density = gmdt.write_mrc(path="./cluster.%s" % i,
                                     file_prefix="LPD")
            gmd1.write_mrc(path="./cluster.%s/Sample_A/" % i,
                           file_prefix="LPD")
            gmd2.write_mrc(path="./cluster.%s/Sample_B/" % i,
                           file_prefix="LPD")

            # Add the cluster center model RMF to the cluster directory
            cluster_center_index = members[0]
            if args.rmf_A is not None:
                # Indices below n_models[0] are frames of rmf_A; the rest
                # are frames of rmf_B, offset by the size of sample A.
                if cluster_center_index < n_models[0]:
                    center_rmf = args.rmf_A
                    center_frame = cluster_center_index
                else:
                    center_rmf = args.rmf_B
                    center_frame = cluster_center_index - n_models[0]
                make_cluster_centroid(
                    os.path.join(args.path, center_rmf),
                    center_frame,
                    os.path.join("cluster.%d" % i,
                                 "cluster_center_model.rmf3"),
                    i, n_members, cluster_precision, density, args.path)
            else:
                # index to Identities file.
                cluster_center_model_id = all_models[cluster_center_index]
                outfname = os.path.join(
                    "cluster.%d" % i,
                    "cluster_center_model." + args.extension)
                if 'rmf' in args.extension:
                    make_cluster_centroid(
                        models_name[cluster_center_model_id], 0, outfname,
                        i, n_members, cluster_precision, density, args.path)
                else:
                    shutil.copy(models_name[cluster_center_model_id],
                                outfname)

    # generate plots for the score and structure tests
    if args.gnuplot:
        import subprocess
        import glob

        gnuplotdir = IMP.sampcon.get_data_path("gnuplot_scripts")
        for filename in sorted(glob.glob(os.path.join(gnuplotdir, "*.plt"))):
            cmd = ['gnuplot', '-e', 'sysname="%s"' % args.sysname, filename]
            print(" ".join(cmd))
            subprocess.check_call(cmd)