import mdtraj as md
import numpy as np

from enspara.util import array as ra
from enspara.util.load import load_as_concatenated


def run(self):
    # load and concatenate trajectories
    if self.mem_efficient:
        trj_lengths, xyzs = load_as_concatenated(
            filenames=self.trj_filenames, processes=self.n_procs,
            top=self.base_struct_md, atom_indices=self.atom_indices_vals)
        trjs_sub = md.Trajectory(
            xyzs,
            self.base_struct_md.atom_slice(self.atom_indices_vals).topology)
    else:
        trj_lengths, xyzs = load_as_concatenated(
            filenames=self.trj_filenames, processes=self.n_procs,
            top=self.base_struct_md)
        trjs = md.Trajectory(xyzs, self.base_struct_md.topology)
        trjs_sub = trjs.atom_slice(self.atom_indices_vals)

    # determine if rebuilding all msm stuff
    if self.build_full:
        base_struct_centers = self.base_struct_md.atom_slice(
            self.atom_indices_vals)
        base_struct_centers.save_pdb("./centers.pdb")
        self.base_struct_md.save_pdb("./prot_masses.pdb")
        init_centers = None
    else:
        init_centers = md.load("./data/centers.xtc", top="./centers.pdb")

    # fit data with base clustering object
    self.base_clust_obj.fit(trjs_sub, init_centers=init_centers)
    center_indices, distances, assignments, centers = \
        self.base_clust_obj.result_.partition(trj_lengths)

    # save data
    ra.save("./data/assignments.h5", assignments)
    ra.save("./data/distances.h5", distances)
    trjs_sub = trjs_sub[self.base_clust_obj.center_indices_]
    trjs_sub.superpose(trjs_sub[0])
    trjs_sub.save_xtc("./data/centers.xtc")
    if not self.mem_efficient:
        full_centers = trjs[self.base_clust_obj.center_indices_]
        full_centers.superpose(self.base_struct_md)
        full_centers.save_xtc("./data/full_centers.xtc")

    # save the state numbers that are new this round (all of them if
    # clustering started from scratch)
    n_states = len(self.base_clust_obj.center_indices_)
    unique_states = np.arange(n_states)
    if init_centers is not None:
        unique_states = unique_states[-(n_states - len(init_centers)):]
    np.save("./data/unique_states.npy", unique_states)
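# ---------------------------------------------------------------------------
# A minimal usage sketch for run() above. The owning class is not shown, so
# everything here is an assumption: rather than guess the real constructor
# signature, we attach run() to a hypothetical stand-in class and set the
# attributes the method reads directly.
import glob
import os

from enspara import cluster


class _ClusterJob:  # hypothetical stand-in for the real wrapper class
    run = run


os.makedirs("./data", exist_ok=True)
job = _ClusterJob()
job.trj_filenames = np.sort(glob.glob("./trajectories/*.xtc"))
job.base_struct_md = md.load("./prot_masses.pdb")
job.atom_indices_vals = job.base_struct_md.topology.select("backbone")
job.base_clust_obj = cluster.KCenters(metric=md.rmsd, cluster_radius=0.01)
job.n_procs = 4
job.mem_efficient = True  # never materialize the all-atom trajectory
job.build_full = True     # first generation: write centers.pdb/prot_masses.pdb
job.run()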
import glob

import mdtraj as md
import numpy as np

from enspara.util import array as ra
from enspara.util.load import load_as_concatenated
# save_states is assumed importable from FAST's msm_gen package, as in the
# standalone script later in this section.
from fast.msm_gen.save_states import save_states


def run(self, msm_dir='.'):
    if self.centers != 'none':
        assignments = ra.load(msm_dir + "/data/assignments.h5")
        distances = ra.load(msm_dir + "/data/distances.h5")
        if self.centers == 'auto':
            # only states discovered this generation
            state_nums = np.load(msm_dir + "/data/unique_states.npy")
        elif self.centers == 'all':
            state_nums = None
        elif self.centers == 'restarts':
            states_to_simulate_file = \
                msm_dir + "/rankings/states_to_simulate_gen" + \
                str(self.gen_num) + ".npy"
            state_nums = np.load(states_to_simulate_file)
        save_states(
            assignments, distances, state_nums=state_nums,
            n_procs=self.n_procs, largest_center=self.largest_center,
            save_routine=self.save_routine, msm_dir=msm_dir)
    if self.save_xtc_centers:
        center_filenames = np.sort(
            glob.glob("%s/centers_masses/*.pdb" % msm_dir))
        trj_lengths, xyzs = load_as_concatenated(
            center_filenames, processes=self.n_procs)
        centers = md.Trajectory(
            xyzs,
            topology=md.load("%s/prot_masses.pdb" % msm_dir).top)
        centers.save_xtc("%s/data/full_centers.xtc" % msm_dir)
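# ---------------------------------------------------------------------------
# Hedged usage sketch, mirroring the one above: the owning class is not
# shown, so a hypothetical stand-in carries the attributes this run() reads.
class _SaveJob:
    run = run


saver = _SaveJob()
saver.centers = 'auto'        # 'auto' | 'all' | 'restarts' | 'none'
saver.gen_num = 0             # only consulted when centers == 'restarts'
saver.n_procs = 4
saver.largest_center = 0.01   # cluster radius forwarded to save_states
saver.save_routine = 'masses'
saver.save_xtc_centers = True
saver.run(msm_dir='.')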
def write_struct_ctrs(
        trajectoryfiles, topology, ctr_inds_file,
        ctr_structs_file, run_after=None):

    import os
    import pickle

    import mdtraj as md
    from enspara.util.load import load_as_concatenated

    print("Loading center indices at", ctr_inds_file)

    if os.path.isfile(ctr_structs_file):
        print("Overwriting", ctr_structs_file)
        # print("Refusing to overwrite", ctr_structs_file)
        # return ctr_structs_file

    try:
        with open(ctr_inds_file, 'rb') as f:
            ctr_inds = pickle.load(f)
    except pickle.UnpicklingError:
        import numpy as np
        ctr_inds = np.load(ctr_inds_file)

    print("Loaded %s center indices." % len(ctr_inds))

    top = md.load(topology).top

    try:
        lengths, xyz = load_as_concatenated(
            filenames=[trajectoryfiles[tr] for tr, fr in ctr_inds],
            args=[{'frame': fr, 'top': top} for tr, fr in ctr_inds],
            processes=4)
    except IndexError:
        print(len(trajectoryfiles), len(ctr_inds),
              max([tr for tr, fr in ctr_inds]))
        raise

    ctr_structs = md.Trajectory(xyz=xyz, topology=top)
    ctr_structs.save(ctr_structs_file)

    return ctr_structs_file
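# ---------------------------------------------------------------------------
# Example call. Paths are placeholders; the function expects ctr_inds_file to
# hold (trajectory_index, frame_index) pairs, pickled (with an np.load
# fallback).
import glob

write_struct_ctrs(
    trajectoryfiles=sorted(glob.glob("./trajectories/*.xtc")),
    topology="./prot_masses.pdb",
    ctr_inds_file="./data/center_indices.pkl",
    ctr_structs_file="./data/centers.xtc")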
import logging
import resource
import time
from functools import partial

import mdtraj as md
import psutil

import enspara
from enspara.util.load import load_as_concatenated

# Assumed to be provided by the surrounding module: timed (a timing/logging
# context manager), assign_to_nearest_center, partition_list, and the batch
# helpers determine_batch_size / compute_batches.

logger = logging.getLogger(__name__)


def batch_reassign(targets, centers, lengths, frac_mem, n_procs=None):

    example_center = centers[0]
    DTYPE_BYTES = 4

    batch_size, batch_gb = determine_batch_size(
        example_center.n_atoms, DTYPE_BYTES, frac_mem)
    logger.info(
        'Batch max size set to %s frames (~%.2f GB, %.1f%% of total RAM).',
        batch_size, batch_gb, frac_mem * 100)

    if batch_size < max(lengths):
        raise enspara.exception.ImproperlyConfigured(
            'Batch size of %s was smaller than largest file (size %s).' %
            (batch_size, max(lengths)))

    batches = compute_batches(lengths, batch_size)

    assignments = []
    distances = []
    for i, batch_indices in enumerate(batches):
        tick = time.perf_counter()
        logger.info("Starting batch %s of %s", i + 1, len(batches))
        batch_targets = [targets[j] for j in batch_indices]

        with timed("Loaded frames for batch in %.1f seconds", logger.info):
            batch_lengths, xyz = load_as_concatenated(
                [tfile for tfile, top, aids in batch_targets],
                lengths=[lengths[j] for j in batch_indices],
                args=[{'top': top, 'atom_indices': aids}
                      for t, top, aids in batch_targets],
                processes=n_procs)

        # mdtraj loads as float32, and load_as_concatenated should thus also
        # load as float32. This should _never_ be hit, but there might be
        # some platform-specific situation where double != float64?
        assert xyz.dtype.itemsize == DTYPE_BYTES

        trj = md.Trajectory(xyz, topology=example_center.top)

        with timed("Precentered trajectories in %.1f seconds", logger.debug):
            trj.center_coordinates()

        with timed("Assigned trajectories in %.1f seconds", logger.debug):
            batch_assignments, batch_distances = assign_to_nearest_center(
                trj, centers, partial(md.rmsd, precentered=True))

        # clear memory of xyz and trj to allow cleanup to deallocate these
        # large arrays; may help with memory high-water mark
        with timed("Cleared array from memory in %.1f seconds",
                   logger.debug):
            xyz_size = xyz.size
            del trj, xyz

        assignments.extend(partition_list(batch_assignments, batch_lengths))
        distances.extend(partition_list(batch_distances, batch_lengths))

        logger.info(
            "Finished batch %s of %s in %.1f seconds. Coordinates array had "
            "memory footprint of %.2f GB (of memory high-water mark "
            "%.2f/%.2f GB).",
            i + 1, len(batches), time.perf_counter() - tick,
            xyz_size * DTYPE_BYTES / 1024**3,
            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024**2,
            psutil.virtual_memory().total / 1024**3)

    return assignments, distances
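# ---------------------------------------------------------------------------
# Hedged sketch of calling batch_reassign(). From the comprehensions above,
# each target is a (trajectory_file, topology, atom_indices) triple and
# `lengths` holds per-file frame counts; counting frames via md.open() is
# just one way to obtain them.
import glob

prot = md.load("./prot_masses.pdb")
aids = prot.topology.select("backbone")
tfiles = sorted(glob.glob("./trajectories/*.xtc"))
frame_counts = [len(md.open(f)) for f in tfiles]
center_structs = md.load("./data/centers.xtc",
                         top=prot.atom_slice(aids).topology)

assigns, dists = batch_reassign(
    targets=[(f, prot.topology, aids) for f in tfiles],
    centers=center_structs,
    lengths=frame_counts,
    frac_mem=0.5,
    n_procs=4)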
import glob

import mdtraj as md
import numpy as np

from fast.msm_gen import save_states as ss
from enspara.util import array as ra
from enspara.util.load import load_as_concatenated

dist_cutoff = 0.01
assignments = ra.load("./data/assignments.h5")
distances = ra.load("./data/distances.h5")
ss.save_states(assignments, distances, save_routine='masses',
               largest_center=dist_cutoff, n_confs=1, n_procs=64)
print("Saved the states!")

prot_masses = md.load("./prot_masses.pdb")
pdb_names = np.sort(glob.glob("./centers_masses/*.pdb"))
trj_lengths, xyzs = load_as_concatenated(
    pdb_names, processes=64, top=prot_masses)
centers_full = md.Trajectory(xyz=xyzs, topology=prot_masses.top)
centers_full.save_xtc("./data/full_centers.xtc")
print("Wrote Full Cluster Centers!")
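# ---------------------------------------------------------------------------
# Optional sanity check (an addition, not part of the original script):
# reload the concatenated centers and confirm one frame per center PDB.
reloaded = md.load("./data/full_centers.xtc", top=prot_masses)
assert reloaded.n_frames == len(pdb_names)
print("full_centers.xtc holds %d frames" % reloaded.n_frames)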
import glob
import os
from functools import partial

import mdtraj as md
import numpy as np

from enspara import cluster
from enspara.msm import MSM, builders
from enspara.util import array as ra
from enspara.util.load import load_as_concatenated


def entry_point():
    # trajectory filenames
    filenames = np.sort([
        os.path.abspath(pathname)
        for pathname in glob.glob("./trajectories/*.xtc")])
    print("obtained filenames!")

    # load atom indices
    pdb = md.load("prot_masses.pdb")
    iis = pdb.topology.select("backbone and resid 72 to 87")
    # iis = np.loadtxt("./atom-indices-bb.dat", dtype=int)

    # topology
    prot_masses = md.load("./prot_masses.pdb")

    # load trajectories, keeping only the selected atoms
    print("about to load!!")
    centers = prot_masses.atom_slice(iis)
    trj_lengths, xyzs = load_as_concatenated(
        filenames=filenames, atom_indices=iis,
        processes=48, top=prot_masses)
    trjs_sub = md.Trajectory(xyz=xyzs, topology=centers.top)
    del xyzs

    # cluster with k-centers
    n_clusters = 10000
    # n_clusters = None
    dist_cutoff = 0.01
    clusterer = cluster.KCenters(
        metric=md.rmsd, cluster_radius=dist_cutoff, n_clusters=n_clusters)
    # clusterer = cluster.KHybrid(
    #     metric=md.rmsd, cluster_radius=dist_cutoff,
    #     n_clusters=n_clusters, kmedoids_updates=2)
    clusterer.fit(trjs_sub)
    center_indices, distances, assignments, centers = \
        clusterer.result_.partition(trj_lengths)
    ra.save("./data/assignments.h5", assignments)
    ra.save("./data/distances.h5", distances)
    trjs_sub[clusterer.center_indices_].save_xtc("./data/centers.xtc")
    np.save("./data/center_indices.npy", clusterer.center_indices_)
    print("Done clustering!")

    # build the MSM
    lag_time = 10  # frames; at 20 ps/frame this is 200 ps
    # lag_time = 1
    assignments = ra.load("./data/assignments.h5")
    unique_states = np.unique(np.concatenate(assignments))
    b = partial(builders.normalize,
                prior_counts=1 / unique_states.shape[0])
    msm_obj = MSM(lag_time=lag_time, method=b)
    msm_obj.fit(assignments)
    np.save("./data/tcounts.npy", msm_obj.tcounts_)
    np.save("./data/tprobs.npy", msm_obj.tprobs_)
    np.save("./data/populations.npy", msm_obj.eq_probs_)
    print("Done MSM!")
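# ---------------------------------------------------------------------------
# Small addition: the original snippet defines entry_point() but never calls
# it; run the full pipeline when invoked as a script.
if __name__ == "__main__":
    entry_point()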