def run(self, msm_dir='.'):
    if self.centers != 'none':
        assignments = ra.load(msm_dir + "/data/assignments.h5")
        distances = ra.load(msm_dir + "/data/distances.h5")
        if self.centers == 'auto':
            state_nums = np.load(msm_dir + "/data/unique_states.npy")
        elif self.centers == 'all':
            state_nums = None
        elif self.centers == 'restarts':
            states_to_simulate_file = \
                msm_dir + "/rankings/states_to_simulate_gen" + \
                str(self.gen_num) + ".npy"
            state_nums = np.load(states_to_simulate_file)
        save_states(
            assignments, distances, state_nums=state_nums,
            n_procs=self.n_procs, largest_center=self.largest_center,
            save_routine=self.save_routine, msm_dir=msm_dir)
    if self.save_xtc_centers:
        center_filenames = np.sort(
            glob.glob("%s/centers_masses/*.pdb" % msm_dir))
        trj_lengths, xyzs = load_as_concatenated(
            center_filenames, processes=self.n_procs)
        centers = md.Trajectory(
            xyzs, topology=md.load("%s/prot_masses.pdb" % msm_dir).top)
        centers.save_xtc("%s/data/full_centers.xtc" % msm_dir)
def load_assignments(assignments):
    from enspara.util import array as ra
    from tables import NoSuchNodeError

    if not hasattr(assignments, 'shape'):
        print('loading msm assignments from', assignments)
        try:
            assignments = ra.load(assignments, keys=None)
        except NoSuchNodeError:
            assignments = ra.load(assignments, keys=...)
    return assignments
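# A minimal usage sketch for load_assignments, assuming the function above is
# in scope; 'assignments.h5' is a hypothetical example path.
import numpy as np

assigns_from_file = load_assignments('assignments.h5')  # path: loaded via ra.load
assigns_in_memory = load_assignments(np.zeros((5, 100), dtype=int))  # has .shape: returned unchanged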
def main(argv=None):
    args = process_command_line(argv)

    try:
        assignments = ra.load(args.assignments)
    except NoSuchNodeError:
        assignments = ra.load(args.assignments, keys=...)

    if args.trj_ids is not None:
        assignments = assignments[args.trj_ids]

    tscales = implied_timescales(
        assignments, args.lag_times, n_times=args.n_eigenvalues,
        sliding_window=True, trim=args.trim,
        method=args.symmetrization, n_procs=args.processes)

    import matplotlib as mpl
    mpl.use('Agg')
    from matplotlib import pyplot as plt

    unit_factor, unit_str = process_units(args.timestep, args.infer_timestep)

    # scale x and y axes to the chosen time unit
    lag_times = np.array(args.lag_times) / unit_factor
    tscales /= unit_factor

    for i in range(args.n_eigenvalues):
        plt.plot(lag_times, tscales[:, i],
                 label=r'$\lambda_{i}$'.format(i=i + 1))

    if args.logscale:
        plt.yscale('log')
    plt.ylabel('Eigenmotion Speed [{u}]'.format(u=unit_str))
    plt.xlabel('Lag Time [{u}]'.format(u=unit_str))
    plt.legend(frameon=False)
    plt.savefig(args.plot, dpi=300)

    return 0
def main(argv=None):
    args = process_command_line(argv)

    try:
        features = ra.load(args.features, keys=...)
    except exception.DataInvalid:
        features = ra.load(args.features)
    logger.info("Loaded data from %s with shape %s",
                args.features, features.shape)

    if args.cluster_algorithm == 'khybrid':
        clustering = KHybrid(
            metric=args.cluster_distance,
            cluster_radius=args.cluster_radius,
            kmedoids_updates=args.kmedoids_updates)
    elif args.cluster_algorithm == 'kcenters':
        clustering = KCenters(
            cluster_radius=args.cluster_radius,
            metric=args.cluster_distance)

    logger.info("Clustering with %s", clustering)
    clustering.fit(features._data)

    result = clustering.result_.partition(features.lengths)
    del features

    ra.save(args.distances, result.distances)
    logger.info("Wrote distances with shape %s to %s",
                result.distances.shape, args.distances)

    ra.save(args.assignments, result.assignments)
    logger.info("Wrote assignments with shape %s to %s",
                result.assignments.shape, args.assignments)

    ra.save(args.cluster_centers, result.centers)
    logger.info("Wrote cluster_centers with shape %s to %s",
                result.centers.shape, args.cluster_centers)

    pickle.dump(result.center_indices, open(args.center_indices, 'wb'))
    logger.info("Wrote %s center_indices to %s",
                len(result.center_indices), args.center_indices)

    return 0
def load_features(features, stride):
    if len(features) == 1:
        with timed("Loading features took %.1f s.", logger.info):
            try:
                data = ra.load(features[0])
            except tables.exceptions.NoSuchNodeError:
                data = ra.load(features[0], keys=...)
        lengths = data.lengths
        data = data._data
    else:  # len(features) > 1
        with timed("Loading features took %.1f s.", logger.info):
            lengths, data = mpi.io.load_npy_as_striped(features, stride)

        with timed("Turned over array in %.2f min", logger.info):
            tmp_data = data.copy()
            del data
            data = tmp_data

    return lengths, data
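# Usage sketch for load_features, assuming the function above is in scope.
# Filenames are hypothetical: a single file takes the ragged-array branch,
# while a list of several .npy files takes the MPI-striped branch.
lengths, data = load_features(['features.h5'], stride=1)
# lengths, data = load_features(['trj0_feats.npy', 'trj1_feats.npy'], stride=10)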
def check_clustering(self, msm_dir, gen_num, n_kids, verbose=True):
    correct_clustering = True
    total_assignments = (gen_num + 1) * n_kids
    assignments = ra.load(msm_dir + '/data/assignments.h5')
    n_assignments = len(assignments)
    if total_assignments != n_assignments:
        correct_clustering = False
        logging.info(
            "inconsistent number of trajectories between assignments "
            "and data!")
    return correct_clustering
def _prop_msm(msm_dir, msm_obj):
    """Propagate MSM files."""
    t0 = time.time()

    # load assignments and build MSM
    assignments = ra.load(msm_dir + '/data/assignments.h5')
    msm_obj.fit(assignments)

    # write counts, probs, and populations (if applicable)
    scipy.io.mmwrite(msm_dir + '/data/tcounts.mtx', msm_obj.tcounts_)
    scipy.io.mmwrite(msm_dir + '/data/tprobs.mtx', msm_obj.tprobs_)
    if msm_obj.eq_probs_ is not None:
        np.save(msm_dir + '/data/populations.npy', msm_obj.eq_probs_)

    t1 = time.time()
    logging.info("building MSM took %0.4f seconds" % (t1 - t0))
    return msm_obj
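# Hedged sketch of constructing an MSM object to hand to _prop_msm, following
# the enspara API used elsewhere in these snippets; the lag time, normalization
# method, and './msm' directory are illustrative assumptions.
from enspara.msm import MSM, builders

msm_obj = MSM(lag_time=10, method=builders.normalize)
msm_obj = _prop_msm('./msm', msm_obj)  # writes tcounts.mtx, tprobs.mtx, and populations.npy when eq_probs_ is set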
def main(argv=None):
    '''Run the driver script for this module. This code only runs if we're
    being run as a script. Otherwise, it's silent and just exposes methods.'''
    args = process_command_line(argv)

    try:
        assignments = ra.load(args.assignments)
    except NoSuchNodeError:
        assignments = ra.load(args.assignments, keys=...)

    if args.trj_ids is not None:
        assignments = assignments[args.trj_ids]

    tscales = implied_timescales(
        assignments, args.lag_times, n_times=args.n_eigenvalues,
        sliding_window=True, trim=args.trim,
        method=args.symmetrization, n_procs=args.processes)

    unit_factor, unit_str = process_units(args.timestep, args.infer_timestep)

    # scale x and y axes to the chosen time unit
    lag_times = np.array(args.lag_times) / unit_factor
    tscales /= unit_factor

    for i in range(args.n_eigenvalues):
        plt.plot(lag_times, tscales[:, i],
                 label=r'$\lambda_{i}$'.format(i=i + 1))

    if args.logscale:
        plt.yscale('log')
    plt.ylabel('Eigenmotion Speed [{u}]'.format(u=unit_str))
    plt.xlabel('Lag Time [{u}]'.format(u=unit_str))
    plt.legend(frameon=False)
    plt.savefig(args.plot, dpi=300)

    return 0
def _perform_analysis(
        analysis_obj, msm_dir, gen_num, sub_obj, q_check_obj, update_data):
    """Performs analysis of cluster centers.

    Inputs
    ----------
    analysis_obj : object,
        The object used for analysis.
    msm_dir : str,
        MSM directory where analysis is performed.
    gen_num : int,
        Generation number.
    sub_obj : object,
        Submission wrapper object.
    q_check_obj : object,
        Queueing system wrapper to determine if a submission is still running.
    update_data : bool,
        Flag for rebuilding the whole analysis or analyzing only a subset of
        structures.
    """
    t0 = time.time()

    # determine if there is an analysis object
    if analysis_obj is None:
        state_rankings = None
    else:
        # set the object's output
        analysis_obj.set_output(msm_dir, gen_num)

        # optionally set rebuild or continue analysis
        if hasattr(analysis_obj, 'build_full'):
            analysis_obj.build_full = update_data

        # if the output doesn't exist, pickle and submit the analysis
        if not os.path.exists(analysis_obj.output_name):
            _pickle_submit(
                msm_dir, analysis_obj, sub_obj, q_check_obj, gen_num,
                'analysis')

        # get rankings
        state_rankings = analysis_obj.state_rankings

        # check that everything went well: the number of state rankings
        # should equal the number of states in the assignments
        n_states_ranked = len(state_rankings)
        n_states = len(np.unique(ra.load(msm_dir + '/data/assignments.h5')))
        if n_states_ranked != n_states:
            raise DataInvalid(
                'The number of state rankings does not match the number '
                'of states in the assignments! Analysis may have failed!')

    t1 = time.time()
    logging.info("analysis took %0.4f seconds" % (t1 - t0))
    return state_rankings
def check_save_states(self, msm_dir):
    assigns = ra.load(msm_dir + '/data/assignments.h5')
    unique_states = np.unique(assigns)
    n_states = unique_states.shape[0]
    correct_save = True

    save_masses = False
    save_restarts = False
    if (self.save_routine == 'masses') or (self.save_routine == 'full'):
        save_masses = True
    if (self.save_routine == 'restarts') or (self.save_routine == 'full'):
        save_restarts = True

    if (self.centers == 'none') or (self.centers == 'restarts'):
        pass
    else:
        if save_masses:
            n_masses = len(glob.glob(msm_dir + '/centers_masses/*.pdb'))
            if n_masses != n_states:
                correct_save = False
        if save_restarts:
            n_restarts = len(glob.glob(msm_dir + '/centers_restarts/*.gro'))
            if n_restarts != n_states:
                correct_save = False
    return correct_save
import glob
import mdtraj as md
import numpy as np
import os
from fast.msm_gen import save_states as ss
from functools import partial
from multiprocessing import Pool
from enspara import cluster
from enspara.msm import MSM, builders
from enspara.util.load import load_as_concatenated
from enspara.util import array as ra

dist_cutoff = 0.01

assignments = ra.load("./data/assignments.h5")
distances = ra.load("./data/distances.h5")
ss.save_states(
    assignments, distances, save_routine='masses',
    largest_center=dist_cutoff, n_confs=1, n_procs=64)
print("Saving the states!")

prot_masses = "./prot_masses.pdb"
prot_masses = md.load(prot_masses)

pdb_names = np.sort(glob.glob("./centers_masses/*.pdb"))
trj_lengths, xyzs = load_as_concatenated(
    pdb_names, processes=64, top=prot_masses)
centers_full = md.Trajectory(xyz=xyzs, topology=prot_masses.top)
centers_full.save_xtc("./data/full_centers.xtc")
def entry_point():
    if True:
        # filenames
        filenames = np.sort([
            os.path.abspath(pathname)
            for pathname in glob.glob("./trajectories/*.xtc")])
        print("obtained filenames!")

        # load atom indices
        pdb = md.load("prot_masses.pdb")
        iis = pdb.topology.select("backbone and resid 72 to 87")
        # iis = np.loadtxt("./atom-indices-bb.dat", dtype=int)

        # topology filename
        prot_masses = "./prot_masses.pdb"
        prot_masses = md.load(prot_masses)

        # load trjs
        print("about to load!!")
        centers = prot_masses.atom_slice(iis)
        trj_lengths, xyzs = load_as_concatenated(
            filenames=filenames, atom_indices=iis, processes=48,
            top=prot_masses)
        trjs_sub = md.Trajectory(xyz=xyzs, topology=centers.top)
        del xyzs

    if True:
        # get subset
        n_clusters = 10000
        # n_clusters = None
        dist_cutoff = 0.01
        clusterer = cluster.KCenters(
            metric=md.rmsd, cluster_radius=dist_cutoff,
            n_clusters=n_clusters)
        # clusterer = cluster.KHybrid(
        #     metric=md.rmsd, cluster_radius=dist_cutoff,
        #     n_clusters=n_clusters, kmedoids_updates=2)
        clusterer.fit(trjs_sub)
        center_indices, distances, assignments, centers = \
            clusterer.result_.partition(trj_lengths)
        ra.save("./data/assignments.h5", assignments)
        ra.save("./data/distances.h5", distances)
        trjs_sub[clusterer.center_indices_].save_xtc("./data/centers.xtc")
        np.save("./data/center_indices.npy", clusterer.center_indices_)
        print("Done clustering!")

    if True:
        lag_time = 10  # 20ps * 200 = 4 ns
        # lag_time = 1  # 20ps * 200 = 4 ns
        assignments = ra.load("./data/assignments.h5")
        unique_states = np.unique(np.concatenate(assignments))
        b = partial(builders.normalize, prior_counts=1 / unique_states.shape[0])
        msm_obj = MSM(lag_time=lag_time, method=b)
        msm_obj.fit(assignments)
        np.save("./data/tcounts.npy", msm_obj.tcounts_)
        np.save("./data/tprobs.npy", msm_obj.tprobs_)
        np.save("./data/populations.npy", msm_obj.eq_probs_)
        print("Done MSM!")