def generate_next_inpcrds(self, msm, clusters):
    """
    Writes the input coordinate files for the next generation. Each file
    is in its own numbered directory and called just "inpcrd"
    """
    # Check if inpcrds are already made
    sysdir = os.path.join(self.dir, "systems", str(self.generation + 1))
    if len(glob(os.path.join(sysdir, "*.inpcrd"))) == self.nreps:
        print(" Already have samplers... skipping inpcrd_generation")
        return

    # Make directory to contain topologies and inpcrds
    if not os.path.isdir(os.path.join(self.dir, "systems")):
        os.mkdir(os.path.join(self.dir, "systems"))
    gendir = os.path.join(self.dir, "systems", str(self.generation + 1))
    if not os.path.isdir(gendir):
        os.mkdir(gendir)

    scores = hub_scores(msm)
    utils.dump(scores, "mmsm_scores.pkl")

    gen = InpcrdGenerator(prodfiles=self.prodfiles,
                          clusters=clusters,
                          msm=msm,
                          scores=scores,
                          config=self.config,
                          criteria=self.config.get("model", "criteria",
                                                   fallback="hub_scores"))
    gen.run()
def update_features(self):
    """
    Uses the current trajectories to update the features.

    Returns:
        List of featurized trajectories, ready for tICA
    """
    # Check feature string has correct format (space for generation)
    if "%d" not in self.featurized:
        print("ERROR: Need format string %d in featurized option")
        quit(1)

    # Featurize this generation
    if not os.path.isfile(self.featurized % self.generation) and \
       not os.path.isfile("%s.pkl" % self.featurized % self.generation) and \
       not os.path.isfile("%s.h5" % self.featurized % self.generation):

        featr = MultiligandContactFeaturizer(ligands=self.ligands,
                                             scheme="closest-heavy",
                                             protein=None,
                                             scaling_function=None,
                                             log=True)
        feated = []
        for traj in self.new_prodfiles:
            topo = utils.get_topology(traj, self.dir)
            if not os.path.isfile(topo):
                topo = os.path.abspath(
                    self.config.get("system", "topologypsf"))

            featme = md.load(traj, top=topo, stride=1)
            # Hilariously this requires a list to be the right output shape
            feated.extend(featr.transform([featme]))

        # Save this feature set, with backwards compatibility for pickle runs
        if ".pkl" in self.featurized:
            utils.dump(feated, self.featurized % self.generation)
        else:
            utils.save_features_h5(
                feated, "%s.h5" % self.featurized % self.generation)

    else:
        print("Already have features for generation %d" % self.generation)
        if os.path.isfile("%s.h5" % self.featurized % self.generation):
            feated = utils.load_features_h5("%s.h5" % self.featurized
                                            % self.generation)
        else:
            feated = utils.load("%s.pkl" % self.featurized % self.generation)

    # Check the feature file isn't empty. If it is, delete whichever feature
    # file is on disk and recurse so it gets regenerated
    if len(feated) == 0:
        print("Empty features generation %d... Regenerating" % self.generation)
        for fname in (self.featurized % self.generation,
                      "%s.pkl" % self.featurized % self.generation,
                      "%s.h5" % self.featurized % self.generation):
            if os.path.isfile(fname):
                os.remove(fname)
        feated = self.update_features()

    # We only need to update tica with new features this generation
    return feated
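
# Note on the "featurized" naming convention used above (the values here are
# purely illustrative, not from any real config): the option is a filename
# template containing "%d", which update_features() fills with the generation
# number, e.g.
#   featurized = "features_%d"
#   "%s.h5" % featurized % 3   ->  "features_3.h5"
#   "%s.pkl" % featurized % 3  ->  "features_3.pkl"   (older pickle-based runs)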
def generate_clusters(self, ticad):
    """
    Updates the cluster data. Needs to be re-done each iteration, as the
    cluster assignments of previous trajectories may change as we get
    more data.

    Returns:
        clustered dataset
    """
    clustr = MiniBatchKMeans(
        n_clusters=self.config.getint("model", "num_clusters"))
    clustered = clustr.fit_transform(ticad)

    if self.save_extras:
        utils.dump(clustr, "microstater.pkl")
    return clustered
def generate_msm(self, clustered):
    """
    Generates an MSM from the current cluster data

    Returns:
        (macrostate MSM, macrostate-assigned cluster labels)
    """
    # Generate microstate MSM
    self.currtime = time.time()
    msm = MarkovStateModel(lag_time=self.config.getint("model", "msm_lag"),
                           reversible_type="transpose",
                           ergodic_cutoff="off",
                           prior_counts=0.000001)
    msm.fit(clustered)
    print("TIME\tmicromsm:\t%f" % (time.time() - self.currtime))
    utils.dump(msm, "msm_G%d.pkl" % self.generation)

    # Lump into macrostates
    self.currtime = time.time()
    pcca = PCCAPlus.from_msm(msm,
                             n_macrostates=self.config.getint("model",
                                                              "macrostates"))
    mclustered = pcca.transform(clustered, mode="fill")
    if any(any(np.isnan(x) for x in m) for m in mclustered): #pylint: disable=no-member
        print("WARNING: Unassignable clusters in PCCA with %d macrostates!"
              % self.config.getint("model", "macrostates"))
    print("TIME\tpccaplus:\t%f" % (time.time() - self.currtime))

    if self.save_extras:
        utils.dump(pcca, "macrostater.pkl")

    # Generate macrostate MSM
    self.currtime = time.time()
    mmsm = MarkovStateModel(lag_time=self.config.getint("model", "msm_lag"),
                            reversible_type="transpose",
                            ergodic_cutoff="off",
                            prior_counts=0.000001)
    mmsm.fit(mclustered)
    print("TIME\tmacromsm\t%f" % (time.time() - self.currtime))
    utils.dump(mmsm, "mmsm_G%d.pkl" % self.generation)

    return mmsm, mclustered
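
# Illustrative sketch (not called anywhere in this module) of the micro ->
# macro lumping that generate_msm() performs, shown on two tiny hand-made
# label sequences. The sequences, lag time, and macrostate count below are
# made up; it reuses the module-level np / MarkovStateModel / PCCAPlus
# imports that generate_msm() already requires.
def _example_macro_lumping():
    # Two fake "trajectories" of microstate labels, one array per trajectory
    clustered = [np.array([0, 0, 1, 1, 2, 2, 3, 3, 2, 1]),
                 np.array([3, 3, 2, 2, 1, 0, 0, 1, 2, 3])]

    # Microstate MSM with the same settings generate_msm() uses
    msm = MarkovStateModel(lag_time=1, reversible_type="transpose",
                           ergodic_cutoff="off", prior_counts=0.000001)
    msm.fit(clustered)

    # Lump the microstates into 2 macrostates and relabel the trajectories
    pcca = PCCAPlus.from_msm(msm, n_macrostates=2)
    mclustered = pcca.transform(clustered, mode="fill")

    # Macrostate MSM built on the relabeled trajectories
    mmsm = MarkovStateModel(lag_time=1, reversible_type="transpose",
                            ergodic_cutoff="off", prior_counts=0.000001)
    mmsm.fit(mclustered)
    return mmsm, mclustered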
def run(self):
    """
    Actually does the hard work of building the MSM and creating the
    next generation
    """
    # Check if a pickled msm already exists. If so, use it to save time
    if os.path.isfile("mmsm_G%d.pkl" % self.generation) and \
       os.path.isfile("testing.cluster.pkl"):
        print("Loading MSM for generation %d:" % self.generation)
        msm = utils.load("mmsm_G%d.pkl" % self.generation)
        if os.path.isfile("testing.mcluster.pkl"):
            mclusters = utils.load("testing.mcluster.pkl")
        else:
            mclusters = utils.load("testing.cluster.pkl")

    else:
        print("GENERATION %d:" % self.generation)

        # Check if tica exists already
        if os.path.isfile("testing.tica.pkl"):
            print("Loading tics...")
            tics = utils.load("testing.tica.pkl")
        elif os.path.isfile("ticad_%d.h5" % self.generation):
            tics = utils.load_features_h5("ticad_%d.h5" % self.generation)
        else:
            # Featurize new trajectories
            print(" Featurizing...")
            sys.stdout.flush()
            self.currtime = time.time()
            features = self.update_features()
            print("TIME:\tfeaturize\t%f" % (time.time() - self.currtime))
            print("Files: %d Features: %d" % (len(self.new_prodfiles),
                                              len(features)))

            # Regenerate tics
            print(" tICing...")
            sys.stdout.flush()
            self.currtime = time.time()
            tics = self.generate_tics(features)
            print("TIME:\tticaing\t%f" % (time.time() - self.currtime))

        print(" Clustering...")
        sys.stdout.flush()
        self.currtime = time.time()
        clusters = self.generate_clusters(tics)
        print("TIME:\tcluster\t%f" % (time.time() - self.currtime))
        utils.dump(clusters, "testing.cluster.pkl")

        print(" MSMing...")
        sys.stdout.flush()
        msm, mclusters = self.generate_msm(clusters)
        utils.dump(mclusters, "testing.mcluster.pkl") # DEBUG

    # Resample, if we haven't reached max generation
    if self.generation < self.config.getint("production", "max_generation",
                                            fallback=1000000):
        print(" Sampling and starting...")
        sys.stdout.flush()
        self.currtime = time.time()
        self.generate_next_inpcrds(msm, mclusters)
        print("TIME:\tinpcrd:\t%f" % (time.time() - self.currtime))
    else:
        self.finish_run()

    # The generation is incremented by the last ligand_adder.
    # Indicate that the model has completed successfully.
    # This isn't really necessary but whatever
    self.config["model"]["JobID"] = "0"
    with open(self.configfile, 'w') as configfile:
        self.config.write(configfile)

    # Save representative clusters last
    self.currtime = time.time()
    self.save_cluster_means(mclusters)
    print("TIME:\taggregate:\t%f" % (time.time() - self.currtime))
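
# For reference, the per-generation files that run() and its helpers read or
# write (all relative to the working directory unless noted):
#   msm_G<gen>.pkl / mmsm_G<gen>.pkl    micro-/macrostate MSMs (generate_msm)
#   testing.tica.pkl or ticad_<gen>.h5  tICA-projected features, if present
#   testing.cluster.pkl                 microstate labels (generate_clusters)
#   testing.mcluster.pkl                macrostate labels after PCCA+
#   mmsm_scores.pkl                     hub scores used to pick new samplers
#   <featurized>.h5 or .pkl             per-generation features (update_features)
#   microstater.pkl / macrostater.pkl   clusterer and lumper, if save_extras
#   <self.dir>/systems/<gen+1>/         inpcrds for the next generation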