#Compute the pairwise eRMSDs between all configurations in the trajectory
if not parallel:
    eRMSDs = get_eRMSDs(r1, r2, inputfile, traj_file, top_file, num_confs)
if parallel:
    out = parallelize_lorenzo_onefile.fire_multiprocess(traj_file, top_file, get_eRMSDs, num_confs, n_cpus, r2, inputfile, traj_file, top_file, matrix=True)
    eRMSDs = np.sum([i for i in out], axis=0)
#eRMSDs = pickle.load(open('tmp_eRMSDs', 'rb'))

#the eRMSD calculation only fills in the upper triangle of the matrix, so mirror it to get a full symmetric matrix with a zero diagonal
for ni, i in enumerate(eRMSDs):
    for nj, j in enumerate(i):
        eRMSDs[nj][ni] = j
        if ni == nj:
            eRMSDs[ni][nj] = 0

#since calculating the eRMSDs is so time-consuming, pickle the matrix so the DBSCAN parameters can be iterated on later without recomputing it
with open("tmp_eRMSDs", "wb") as file:
    pickle.dump(eRMSDs, file)

###############################################################################################################
#Next, perform a DBSCAN on the matrix of eRMSDs to find clusters of similar structures
perform_DBSCAN(eRMSDs, num_confs, traj_file, inputfile, "precomputed")
###############################################################################################################
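#A minimal sketch, not part of the original script, of how the pickled matrix can be
#reloaded to tune the clustering without recomputing the eRMSDs. It calls scikit-learn's
#DBSCAN directly rather than the perform_DBSCAN wrapper above; the helper name and the
#eps/min_samples values are illustrative assumptions, not project defaults.
def sweep_dbscan_eps(eps_values=(0.6, 0.8, 1.0), min_samples=4):
    import pickle
    import numpy as np
    from sklearn.cluster import DBSCAN
    with open("tmp_eRMSDs", "rb") as f:
        matrix = pickle.load(f)
    for eps in eps_values:
        #metric="precomputed" tells DBSCAN the input is already a distance matrix
        labels = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed").fit(matrix).labels_
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        print("eps={}: {} clusters, {} noise points".format(eps, n_clusters, np.sum(labels == -1)))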
if outfile.split(".")[1] != "json": raise Exception f = outfile.split(".")[0] + str(i) + "." + outfile.split(".")[1] except: print( "ERROR: oxView overlays must have a '.json' extension. No overlays will be produced", file=stderr) break out = np.sqrt(evalues[i]) * evectors[i] with catch_warnings( ): #this produces an annoying warning about casting complex values to real values that is not relevant simplefilter("ignore") output_vectors = out.reshape(int(out.shape[0] / 3), 3).astype(float) with open(f, "w+") as file: file.write(dumps({"pca": prep_pos_for_json(output_vectors)})) #If we're running clustering, feed the linear terms into the clusterer if cluster: print("INFO: Mapping configurations to component space...", file=stderr) #If you want to cluster on only some of the components, uncomment this #out = out[:,0:3] from clustering import perform_DBSCAN labs = perform_DBSCAN(coordinates, num_confs, traj_file, inputfile, "euclidean")
if lineplt == True:
    if hist == True:
        #clear the histogram plot
        plt.clf()
        #if making two plots, automatically append the plot type to the output file name
        out = outfile[:outfile.find(".")] + "_traj" + outfile[outfile.find("."):]
    else:
        out = outfile
    graph_count = 0
    for traj_set in distances:
        for dist_list in traj_set:
            plt.plot(dist_list, alpha=0.5, label=names[graph_count])
            graph_count += 1
    plt.xlabel("Simulation Steps")
    plt.ylabel("Distance (nm)")
    plt.legend()
    #plt.show()
    print("INFO: Writing trajectory plot to file {}".format(out), file=stderr)
    plt.savefig("{}".format(out))

if cluster == True:
    if not all([x == trajectories[0] for x in trajectories]):
        print("ERROR: Clustering can only be run on a single trajectory", file=stderr)
        exit(1)
    from clustering import perform_DBSCAN
    labs = perform_DBSCAN(distances[0].T, len(distances[0][0]), trajectories[0], input_files[0], "euclidean")
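#A minimal sketch, not part of the original script, of the shape convention the clustering
#call above relies on: distances[0] is (n_distances, n_confs), so the transpose turns each
#configuration into one point in distance space, which is what a euclidean DBSCAN expects.
#'confs_as_points' is a hypothetical helper name.
def confs_as_points(dist_lists):
    import numpy as np
    d = np.asarray(dist_lists) #shape (n_distances, n_confs): one row per measured distance
    return d.T                 #shape (n_confs, n_distances): one row per configuration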
def main():
    parser = argparse.ArgumentParser(prog=path.basename(__file__), description="Calculates a principal component analysis of nucleotide deviations over a trajectory")
    parser.add_argument('inputfile', type=str, nargs=1, help="The inputfile used to run the simulation")
    parser.add_argument('trajectory', type=str, nargs=1, help='the trajectory file you wish to analyze')
    parser.add_argument('meanfile', type=str, nargs=1, help='The mean structure .json file from compute_mean.py')
    parser.add_argument('outfile', type=str, nargs=1, help='the name of the .json file where the PCA will be written')
    parser.add_argument('-p', metavar='num_cpus', nargs=1, type=int, dest='parallel', help="(optional) How many cores to use")
    parser.add_argument('-c', metavar='cluster', dest='cluster', action='store_const', const=True, default=False, help="Run the clusterer on each configuration's position in PCA space?")
    args = parser.parse_args()

    check_dependencies(["python", "numpy", "Bio"])

    traj_file = args.trajectory[0]
    inputfile = args.inputfile[0]
    mean_file = args.meanfile[0]
    outfile = args.outfile[0]
    parallel = args.parallel
    if parallel:
        n_cpus = args.parallel[0]
    #-c runs the clusterer on the output
    cluster = args.cluster

    top_file = get_input_parameter(inputfile, "topology")
    if "RNA" in get_input_parameter(inputfile, "interaction_type"):
        environ["OXRNA"] = "1"
    else:
        environ["OXRNA"] = "0"
    import UTILS.base #this needs to be imported after the model type is set

    num_confs = cal_confs(traj_file)

    #the mean structure can come either from compute_mean.py (.json) or be a regular configuration (.dat)
    if mean_file.split(".")[-1] == "json":
        with open(mean_file) as file:
            align_conf = load(file)['g_mean']
    elif mean_file.split(".")[-1] == "dat":
        fetch_np = lambda conf: np.array([n.cm_pos for n in conf._nucleotides])
        with LorenzoReader2(mean_file, top_file) as reader:
            s = reader._get_system()
            align_conf = fetch_np(s)

    cms = np.mean(align_conf, axis=0) #all structures must have the same center of mass
    align_conf -= cms

    #Compute the deviations of each configuration from the mean
    if not parallel:
        r = LorenzoReader2(traj_file, top_file)
        deviations_matrix = get_pca(r, align_conf, num_confs)
    if parallel:
        out = parallelize_lorenzo_onefile.fire_multiprocess(traj_file, top_file, get_pca, num_confs, n_cpus, align_conf)
        deviations_matrix = np.concatenate([i for i in out])

    #now that we have the deviations matrix, extract its principal components
    #note that in the future we might want a switch for covariance vs correlation matrix, because correlation (cov/stdev, so all diagonals are 1) is better for really floppy structures
    pca = PCA(n_components=3)
    pca.fit(deviations_matrix)
    transformed = pca.transform(deviations_matrix) #configurations projected onto the components
    evalues = pca.explained_variance_ #eigenvalues of the covariance matrix
    evectors = pca.components_ #the corresponding eigenvectors, one per row

    import matplotlib.pyplot as plt
    print("INFO: Saving scree plot to scree.png", file=stderr)
    plt.scatter(range(0, len(evalues)), evalues, s=25)
    plt.xlabel("component")
    plt.ylabel("eigenvalue")
    plt.savefig("scree.png")
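    #A minimal sketch, not part of the original script, of the equivalence the comment above
    #relies on: fitting sklearn's PCA on the mean-centered deviations matches eigendecomposing
    #their covariance matrix. 'demo_pca_equals_eig' is a hypothetical helper name.
    def demo_pca_equals_eig(dev_matrix):
        cov = np.cov(dev_matrix, rowvar=False) #(3N, 3N) covariance of the deviations
        evals, evecs = np.linalg.eigh(cov) #eigh because cov is symmetric
        order = np.argsort(evals)[::-1] #sort by decreasing eigenvalue
        evals, evecs = evals[order], evecs[:, order]
        #the top eigenvalues match PCA's explained_variance_ (both use the n-1 normalization)
        print(evals[:3], PCA(n_components=3).fit(dev_matrix).explained_variance_)
        return evals, evecs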
Saving to coordinates.png", file=stderr) #if you want to weight the components by their eigenvectors #mul = np.einsum('ij,i->ij',evectors[0:3], evalues[0:3]) mul = evectors #reconstruct configurations in component space out = np.dot(deviations_matrix, mul).astype(float) #make a quick plot from the first three components from mpl_toolkits.mplot3d import Axes3D fig = plt.figure() ax = fig.gca(projection='3d') ax.scatter(out[:, 0], out[:, 1], out[:, 2], c='g', s=25) plt.savefig("coordinates.png") #Create an oxView overlay showing the first SUM components SUM = 1 print( "INFO: Change the number of eigenvalues to sum and display by modifying the SUM variable in the script. Current value: {}" .format(SUM), file=stderr) weighted_sum = np.zeros_like(evectors[0]) for i in range(0, SUM): #how many eigenvalues do you want? weighted_sum += evalues[i] * evectors[i] prep_pos_for_json = lambda conf: list(list(p) for p in conf) with catch_warnings( ): #this produces an annoying warning about casting complex values to real values that is not relevant simplefilter("ignore") output_vectors = weighted_sum.reshape(int(weighted_sum.shape[0] / 3), 3).astype(float) with open(outfile, "w+") as file: file.write(dumps({"pca": prep_pos_for_json(output_vectors)})) #If we're running clustering, feed the linear terms into the clusterer if cluster: print("INFO: Mapping configurations to component space...", file=stderr) #If you want to cluster on only some of the components, uncomment this #out = out[:,0:3] from clustering import perform_DBSCAN labs = perform_DBSCAN(out, num_confs, traj_file, inputfile, "euclidean", 12, 8)