def _algorithm(self):
    """Build the Markov model for the data gathered so far and write new inputs.

    The method collects the simulations found under ``self.datapath`` and
    ``self.inputpath``, optionally filters them, projects them with
    ``self.projection``, optionally reduces the projection with TICA,
    clusters the result and builds a Markov model that is stored in
    ``self._model``. Spawn frames are then chosen with
    ``self._getSpawnFrames`` and handed to ``self._writeInputs``.

    The model is saved to ``adapt_model_e<epoch>.dat`` when ``self.save`` is
    set.
    """
    logger.info('Postprocessing new data')
    sims = simlist(
        glob(path.join(self.datapath, '*', '')),
        glob(path.join(self.inputpath, '*', 'structure.pdb')),
        glob(path.join(self.inputpath, '*', '')),
    )
    if self.filter:
        sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)

    metr = Metric(sims, skip=self.skip)
    metr.set(self.projection)

    # NOTE(review): contact symmetry (self.contactsym) used to be applied
    # here but the call is disabled; confirm whether it is still wanted.

    # Project once and drop trajectories *before* any dimensionality
    # reduction, so that corrupted simulations cannot influence the TICA
    # fit. TICA is given the already projected data because letting it
    # project the Metric object itself was reported to be far too slow.
    datadr = metr.project()
    datadr.dropTraj()
    if self.ticadim > 0:
        ticalag = int(max(2, np.ceil(self.ticalag)))
        datadr = TICA(datadr, ticalag).project(self.ticadim)

    datadr.cluster(
        self.clustmethod(n_clusters=self._numClusters(datadr.numFrames)))
    self._model = Model(datadr)
    self._model.markovModel(self.lag, self._numMacrostates(datadr))
    if self.save:
        self._model.save('adapt_model_e' + str(self._getEpoch()) + '.dat')

    relFrames = self._getSpawnFrames(self._model, datadr)
    self._writeInputs(datadr.rel2sim(np.concatenate(relFrames)))
def _algorithm(self):
    """Build the Markov model, score its states and write new inputs.

    The simulations are collected, optionally filtered, projected with
    ``self.projection`` and optionally reduced with TICA before being
    clustered into a Markov model (stored in ``self._model``). Each state
    then receives a reward made of two feature-scaled parts:

    * an undirected component derived from the negated state counts
      ``model.data.N``;
    * a directed component returned by
      ``self._calculateDirectedComponent``.

    The reward ``dc + self.ucscale * uc`` is passed to
    ``self._getSpawnFrames`` and the selected frames are written with
    ``self._writeInputs``.
    """
    logger.info('Postprocessing new data')
    sims = simlist(
        glob(path.join(self.datapath, '*', '')),
        glob(path.join(self.inputpath, '*', 'structure.pdb')),
        glob(path.join(self.inputpath, '*', '')),
    )
    if self.filter:
        sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)

    metr = Metric(sims, skip=self.skip)
    metr.set(self.projection)

    # NOTE(review): contact symmetry (self.contactsym) used to be applied
    # here but the call is disabled; confirm whether it is still wanted.

    # Project once and drop trajectories *before* any dimensionality
    # reduction, so that corrupted simulations cannot influence the TICA
    # fit. TICA is given the already projected data because letting it
    # project the Metric object itself was reported to be far too slow.
    datadr = metr.project()
    datadr.dropTraj()
    if self.ticadim > 0:
        ticalag = int(max(2, np.ceil(self.ticalag)))
        datadr = TICA(datadr, ticalag).project(self.ticadim)

    datadr.cluster(
        self.clustmethod(n_clusters=self._numClusters(datadr.numFrames)))
    model = Model(datadr)
    self._model = model
    self._model.markovModel(self.lag, self._numMacrostates(datadr))
    if self.save:
        self._model.save('adapt_model_e' + str(self._getEpoch()) + '.dat')

    # Undirected component: lower counts should give a higher score, hence
    # the sign flip.
    uc = -model.data.N
    if self.statetype == 'micro':
        uc = uc[model.cluster_ofmicro]
    elif self.statetype == 'macro':
        uc = macroAccumulate(model, uc[model.cluster_ofmicro])

    # Directed component, mapped onto the same kind of states as ``uc``.
    dc = self._calculateDirectedComponent(sims, model.data.St, model.data.N)
    if self.statetype == 'micro':
        dc = dc[model.cluster_ofmicro]
    elif self.statetype == 'macro':
        dc = macroAccumulate(model, dc[model.cluster_ofmicro])

    # Bring both components to a common scale before mixing them.
    uc = self._featScale(uc)
    dc = self._featScale(dc)
    reward = dc + self.ucscale * uc

    relFrames = self._getSpawnFrames(reward, self._model, datadr)
    self._writeInputs(datadr.rel2sim(np.concatenate(relFrames)))
def _algorithm(self):
    """Build a Markov model from the filtered simulations and write new inputs.

    The simulations are filtered, projected with a distance metric
    (``MetricDistance`` when ``self.metricsel2`` is set, otherwise
    ``MetricSelfDistance``), optionally reduced with TICA and clustered.
    The number of macrostates is derived from the implied timescales that
    exceed ``self.lag``. Spawn counts obtained from ``self._criteria`` and
    ``self._spawn`` decide how many frames are sampled from each microstate,
    and the sampled frames are written with ``self._writeInputs``.
    """
    logger.info('Postprocessing new data')
    datalist = simlist(
        glob(path.join(self.datapath, '*', '')),
        glob(path.join(self.inputpath, '*', 'structure.pdb')),
        glob(path.join(self.inputpath, '*', '')),
    )
    filtlist = simfilter(datalist, self.filteredpath, filtersel=self.filtersel)

    if hasattr(self, 'metricsel2') and self.metricsel2 is not None:
        proj = MetricDistance(self.metricsel1, self.metricsel2,
                              metric=self.metrictype)
    else:
        proj = MetricSelfDistance(self.metricsel1, metric=self.metrictype)
    metr = Metric(filtlist, skip=self.skip)
    metr.projection(proj)
    data = metr.project()

    # NOTE(review): contact symmetry (self.contactsym) used to be applied
    # here but the call is disabled; confirm whether it is still wanted.

    data.dropTraj()
    if self.ticadim > 0:
        tica = TICA(data, int(max(2, np.ceil(20 / self.skip))))
        datadr = tica.project(self.ticadim)
    else:
        datadr = data

    # Heuristic for the number of clusters, capped at a third of the frames.
    K = int(max(np.round(0.6 * np.log10(datadr.numFrames / 1000) * 1000 + 50), 100))
    if K > datadr.numFrames / 3:
        K = int(datadr.numFrames / 3)

    datadr.cluster(self.clustmethod(n_clusters=K), mergesmall=5)
    replacement = False
    if datadr.K < 10:
        # Too few clusters survived the merge of small ones: recluster
        # without merging and allow sampling with replacement later on.
        datadr.cluster(self.clustmethod(n_clusters=K))
        replacement = True

    model = Model(datadr)
    macronum = self.macronum
    if datadr.K < macronum:
        # Cast to int: this value is used as a count (``nits``) below, and a
        # float such as 3.0 is not a valid number of timescales.
        macronum = int(np.ceil(datadr.K / 2))
        logger.warning('Using less macrostates than requested due to lack of microstates. macronum = ' + str(macronum))

    from pyemma.msm import timescales_msm
    timesc = timescales_msm(datadr.St.tolist(), lags=self.lag,
                            nits=macronum).get_timescales()
    # Keep one macrostate per timescale above the lag time, at least two and
    # never more than requested.
    macronum = int(min(self.macronum, max(np.sum(timesc > self.lag), 2)))

    model.markovModel(self.lag, macronum)
    p_i = self._criteria(model, self.method)
    spawncounts, _ = self._spawn(p_i, self.nmax - self.running)
    logger.debug('spawncounts {}'.format(spawncounts))
    stateIdx = np.where(spawncounts > 0)[0]
    _, relFrames = model.sampleStates(stateIdx, spawncounts[stateIdx],
                                      statetype='micro',
                                      replacement=replacement)
    logger.debug('relFrames {}'.format(relFrames))

    self._writeInputs(datadr.rel2sim(np.concatenate(relFrames)))
def _getData(self, sims):
    """Project ``sims`` with ``self.projection`` and return the projected data.

    When ``self.ticadim`` is positive the projection is additionally reduced
    to ``self.ticadim`` dimensions with TICA. Trajectories are dropped with
    ``dropTraj()`` before TICA is fitted and once more on the data that is
    returned.
    """
    metric = Metric(sims, skip=self.skip)
    metric.set(self.projection)
    projected = metric.project()

    if self.ticadim > 0:
        # Drop before TICA so broken trajectories do not enter the fit.
        projected.dropTraj()
        # Keep the lag within 1 < lag < (shortest trajectory length / 2).
        half_shortest = np.min(projected.trajLengths) / 2
        lag = int(np.ceil(max(2, min(half_shortest, self.ticalag))))
        # TICA receives already projected data; projecting the Metric object
        # inside TICA was reported to be far too slow.
        projected = TICA(projected, lag).project(self.ticadim)

    projected.dropTraj()
    return projected
def _algorithm(self):
    """Run one bandit epoch: build the MSM, score action states and respawn.

    The simulations are projected with ``self.projection`` (and optionally
    reduced with TICA) to build the Markov model through ``self._createMSM``.
    The action space selected by ``self.actionspace`` is then scored:
    per-state value estimates are updated from ``self.getRewards`` and turned
    into UCB (or PUCB) values, from which the states to respawn are drawn.

    Returns
    -------
    bool
        ``False`` when ``self._checkNFrames`` rejects the data, ``True`` once
        new inputs have been written.

    Raises
    ------
    RuntimeError
        If the goal function projected a different number of simulations
        than the MSM projection.
    """
    from htmd.kinetics import Kinetics

    sims = self._getSimlist()
    metr = Metric(sims, skip=self.skip)
    metr.set(self.projection)

    data = metr.project()
    data.dropTraj()  # Drop before TICA to avoid broken trajectories

    if self.goalfunction is not None:
        goaldata = self._getGoalData(data.simlist)
        if len(data.simlist) != len(goaldata.simlist):
            raise RuntimeError('The goal function was not able to project all trajectories that the MSM projection could. Check for possible errors in the goal function.')
        goaldataconcat = np.concatenate(goaldata.dat)
        if self.save:
            makedirs('saveddata', exist_ok=True)
            goaldata.save(path.join('saveddata', 'e{}_goaldata.dat'.format(self._getEpoch())))

    # TICA receives already projected data; projecting the Metric object
    # inside TICA was reported to be far too slow.
    if self.ticadim > 0:
        # Keep the lag within 1 < ticalag < (shortest trajectory length / 2).
        ticalag = int(np.ceil(max(2, min(np.min(data.trajLengths) / 2, self.ticalag))))
        tica = TICA(data, ticalag)
        datatica = tica.project(self.ticadim)
        if not self._checkNFrames(datatica):
            return False
        self._createMSM(datatica)
    else:
        if not self._checkNFrames(data):
            return False
        self._createMSM(data)

    confstatdist = self.conformationStationaryDistribution(self._model)

    # Choose the data set whose clusters act as the bandit's action states.
    # NOTE(review): 'goal' needs self.goalfunction and 'tica'/'ticapcca' need
    # self.ticadim > 0, otherwise goaldata/datatica are undefined here.
    if self.actionspace == 'metric':
        if not data.K:
            data.cluster(self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
        data_q = data.copy()
    elif self.actionspace == 'goal':
        data_q = goaldata.copy()
    elif self.actionspace == 'tica':
        data_q = datatica.copy()
    elif self.actionspace == 'ticapcca':
        data_q = datatica.copy()
        # Relabel every frame with the macrostate of its cluster.
        for traj in data_q.trajectories:
            traj.cluster = self._model.macro_ofcluster[traj.cluster]
        data_q.K = self._model.macronum

    if self.recluster:
        print('Reclustering with {}'.format(self.reclusterMethod))
        data_q.cluster(self.reclusterMethod)

    numstates = data_q.K
    print('Numstates: {}'.format(numstates))
    currepoch = self._getEpoch()
    q_values = np.zeros(numstates, dtype=np.float32)
    n_values = np.zeros(numstates, dtype=np.int32)

    if self.random:  # If random mode respawn from random action states
        action_sel = np.zeros(numstates, dtype=int)
        N = self.nmax - self._running
        randomactions = np.bincount(np.random.randint(numstates, size=N))
        action_sel[:len(randomactions)] = randomactions
        if self.save_qval:
            makedirs('saveddata', exist_ok=True)
            np.save(path.join('saveddata', 'e{}_actions.npy'.format(currepoch)), action_sel)
        relFrames = self._getSpawnFrames_UCB(action_sel, data_q)
        self._writeInputs(data.rel2sim(np.concatenate(relFrames)))
        return True

    if self.goalfunction is not None:
        # For every cluster in data_q, get the max score and initialize
        if self.goal_preprocess is not None:
            goaldataconcat = self.goal_preprocess(goaldataconcat)
        qstconcat = np.concatenate(data_q.St)
        statemaxes = np.zeros(numstates)
        np.maximum.at(statemaxes, qstconcat, np.squeeze(goaldataconcat))
        if not self.pucb:
            goalenergies = -Kinetics._kB * self.temperature * np.log(1 - statemaxes)
            q_values = goalenergies
            # Needs self.nframes to be set properly!
            n_values += int((self.nframes / self._numClusters(self.nframes)) * self.goal_init)

    rewardtraj = np.arange(data_q.numTrajectories)  # Recalculate reward for all states
    rewards = self.getRewards(rewardtraj, data_q, confstatdist, numstates,
                              self.reward_method, self.reward_mode, self.reward_window)
    for i in range(numstates):
        if len(rewards[i]) == 0:
            continue
        q_values[i] = updatingMean(q_values[i], n_values[i], rewards[i])
    n_values += np.array([len(x) for x in rewards])

    if self.save_qval:
        makedirs('saveddata', exist_ok=True)
        np.save(path.join('saveddata', 'e{}_qval.npy'.format(currepoch)), q_values)
        np.save(path.join('saveddata', 'e{}_nval.npy'.format(currepoch)), n_values)

    # NOTE(review): the PUCB branch reads statemaxes, which only exists when
    # self.goalfunction is set.
    if self.pucb:
        ucb_values = np.array([self.count_pucb(q_values[clust], self.exploration, statemaxes[clust], currepoch + 1, n_values[clust]) for clust in range(numstates)])
    else:
        ucb_values = np.array([self.count_ucb(q_values[clust], self.exploration, currepoch + 1, n_values[clust]) for clust in range(numstates)])

    if self.save_qval:
        makedirs('saveddata', exist_ok=True)
        np.save(path.join('saveddata', 'e{}_ucbvals.npy'.format(currepoch)), ucb_values)

    N = self.nmax - self._running
    if self.actionpool <= 0:
        self.actionpool = N
    topactions = np.argsort(-ucb_values)[:self.actionpool]

    # Sampling without replacement cannot return more states than the pool
    # holds, so cap the draw; the loop below tops the selection up to N by
    # repeating the drawn actions.
    ndraw = min(N, len(topactions))
    action = np.random.choice(topactions, ndraw, replace=False)

    action_sel = np.zeros(numstates, dtype=int)
    action_sel[action] += 1
    # When fewer states than N were drawn, repeat some actions. The size
    # check avoids spinning forever when nothing could be drawn at all.
    while len(action) > 0 and np.sum(action_sel) < N:
        for a in action:
            action_sel[a] += 1
            if np.sum(action_sel) == N:
                break

    if self.save_qval:
        np.save(path.join('saveddata', 'e{}_actions.npy'.format(currepoch)), action_sel)
    relFrames = self._getSpawnFrames_UCB(action_sel, data_q)
    self._writeInputs(data.rel2sim(np.concatenate(relFrames)))
    return True
def _algorithm(self):
    """Build a Markov model from the filtered simulations and write new inputs.

    The simulations are filtered, projected with a distance metric
    (``MetricDistance`` when ``self.metricsel2`` is set, otherwise
    ``MetricSelfDistance``), optionally reduced with TICA and clustered.
    The number of macrostates is derived from the implied timescales that
    exceed ``self.lag``. Spawn counts obtained from ``self._criteria`` and
    ``self._spawn`` decide how many frames are sampled from each microstate,
    and the sampled frames are written with ``self._writeInputs``.
    """
    logger.info('Postprocessing new data')
    datalist = simlist(
        glob(path.join(self.datapath, '*', '')),
        glob(path.join(self.inputpath, '*', 'structure.pdb')),
        glob(path.join(self.inputpath, '*', '')))
    filtlist = simfilter(datalist,
                         self.filteredpath,
                         filtersel=self.filtersel)

    if hasattr(self, 'metricsel2') and self.metricsel2 is not None:
        proj = MetricDistance(self.metricsel1,
                              self.metricsel2,
                              metric=self.metrictype)
    else:
        proj = MetricSelfDistance(self.metricsel1, metric=self.metrictype)
    metr = Metric(filtlist, skip=self.skip)
    metr.projection(proj)
    data = metr.project()

    # NOTE(review): contact symmetry (self.contactsym) used to be applied
    # here but the call is disabled; confirm whether it is still wanted.

    data.dropTraj()
    if self.ticadim > 0:
        tica = TICA(data, int(max(2, np.ceil(20 / self.skip))))
        datadr = tica.project(self.ticadim)
    else:
        datadr = data

    # Heuristic for the number of clusters, capped at a third of the frames.
    K = int(
        max(np.round(0.6 * np.log10(datadr.numFrames / 1000) * 1000 + 50),
            100))
    if K > datadr.numFrames / 3:
        K = int(datadr.numFrames / 3)

    datadr.cluster(self.clustmethod(n_clusters=K), mergesmall=5)
    replacement = False
    if datadr.K < 10:
        # Too few clusters survived the merge of small ones: recluster
        # without merging and allow sampling with replacement later on.
        datadr.cluster(self.clustmethod(n_clusters=K))
        replacement = True

    model = Model(datadr)
    macronum = self.macronum
    if datadr.K < macronum:
        # Cast to int: this value is used as a count (``nits``) below, and a
        # float such as 3.0 is not a valid number of timescales.
        macronum = int(np.ceil(datadr.K / 2))
        logger.warning(
            'Using less macrostates than requested due to lack of microstates. macronum = ' + str(macronum))

    from pyemma.msm import timescales_msm
    timesc = timescales_msm(datadr.St.tolist(),
                            lags=self.lag,
                            nits=macronum).get_timescales()
    # Keep one macrostate per timescale above the lag time, at least two and
    # never more than requested.
    macronum = int(min(self.macronum, max(np.sum(timesc > self.lag), 2)))

    model.markovModel(self.lag, macronum)
    p_i = self._criteria(model, self.method)
    spawncounts, _ = self._spawn(p_i, self.nmax - self.running)
    logger.debug('spawncounts {}'.format(spawncounts))
    stateIdx = np.where(spawncounts > 0)[0]
    _, relFrames = model.sampleStates(stateIdx,
                                      spawncounts[stateIdx],
                                      statetype='micro',
                                      replacement=replacement)
    logger.debug('relFrames {}'.format(relFrames))

    self._writeInputs(datadr.rel2sim(np.concatenate(relFrames)))
def analyze_folder(folder=None, out_folder="/tmp", skip=1, metrics=None,
                   clu=500, tica=True, ticadim=5, tica_lag=20, model_lag=10,
                   model_units='ns', macro_N=10, bulk_split=False, fes=True,
                   rg_analysis=True, save=True, data_fstep=None):
    """Analysis script to create a Markov State Model.

    Creates and returns a Markov State Model given a data folder. Intended to
    follow up the evolution of an adaptive sampling run. Allows saving the
    model and several informative plots.

    A model already stored as ``<out_folder>/model.dat`` is loaded instead of
    being rebuilt.

    Parameters
    ----------
    folder : str
        Data folder where adaptive is running. A ``simlist.npy`` file is
        loaded from it when available; otherwise the simulation list is
        built from its ``filtered`` sub-folder.
    out_folder : str
        Output folder to store derived data
    skip : int
        Number of frames to skip while projecting the MD data
    metrics : [:class: `Metric` object]
        Metric array used to project the data
    clu : int
        Number of clusters to create using the MiniBatchKMeans method.
    tica : bool
        Whether to use TICA or GWPCA for dimensionality reduction
    ticadim : int
        Number of dimensions to project the data onto. If falsy, the model
        is created using the raw projected data
    tica_lag : int, optional
        Lag passed to TICA (or to GWPCA when ``tica`` is false)
    model_lag : int
        Lag time used to create the model, expressed in ``model_units``
    model_units : str, optional
        Units of ``model_lag``
    macro_N : int
        Number of macrostates to split the final Markov State Model into.
        If falsy, no Markov model is computed.
    bulk_split : optional
        If truthy, it is forwarded to ``create_bulk`` to split the bulk
        state. A failure there is reported and otherwise ignored.
    fes : bool, optional
        Currently unused: the free-energy-surface plot is disabled.
    rg_analysis : bool, optional
        If true, a plot with information relative to the radius of gyration
        of the molecule will be created.
    save : bool, optional
        If true, the model will be saved in the output folder
    data_fstep : optional
        If truthy, assigned to the ``fstep`` attribute of the projected data
        before clustering.

    Returns
    -------
    :class:`Model`
        Final model
    """
    from htmd.model import Model
    from htmd.molecule.molecule import Molecule
    from htmd.simlist import simlist
    from htmd.projections.metric import Metric
    from sklearn.cluster import MiniBatchKMeans
    from IDP_htmd.IDP_model import plot_RG
    from IDP_htmd.model_utils import create_bulk
    from glob import glob
    import os

    try:
        os.makedirs(out_folder)
    except FileExistsError:
        print("Folder already exists")

    try:
        fsims = np.load(f"{folder}/simlist.npy", allow_pickle=True)
        print(f"Loaded {folder}/simlist.npy")
    except Exception:
        # No usable cached simulation list: rebuild it from the filtered
        # trajectories. os.path.join makes this work whether or not
        # ``folder`` ends with a path separator.
        print("Creating simlist")
        sims = glob(os.path.join(folder, 'filtered', '*', ''))
        fsims = simlist(sims, os.path.join(folder, 'filtered', 'filtered.pdb'))
    metr = Metric(fsims, skip=skip)
    metr.set(metrics)

    # Check if this gives problems to ITS
    try:
        model = Model(file=f"{out_folder}/model.dat")
        out_data = model.data
        print(f"Loading model: {out_folder}/model.dat")
    except Exception:
        # No stored model could be loaded: build it from the projection.
        if tica and ticadim:
            from htmd.projections.tica import TICA
            print("Projecting TICA")
            tica_proj = TICA(metr, tica_lag)
            out_data = tica_proj.project(ticadim)
        elif not tica and ticadim:
            from htmd.projections.gwpca import GWPCA
            data = metr.project()
            data.dropTraj()
            print("using GWPCA")
            gwpca = GWPCA(data, tica_lag)
            out_data = gwpca.project(ticadim)
        else:
            print("Not using TICA")
            data = metr.project()
            data.dropTraj()
            out_data = data

        if data_fstep:
            out_data.fstep = data_fstep

        try:
            out_data.cluster(MiniBatchKMeans(n_clusters=clu), mergesmall=5)
        except Exception as e:
            raise Exception("Error " + str(e)) from e

        model = Model(out_data)

    model.plotTimescales(plot=False, save=f"{out_folder}/1_its.png")

    if macro_N:
        model.markovModel(model_lag, macro_N, units=model_units)

    if bulk_split:
        # Best effort: the analysis continues when the split fails.
        try:
            print("Starting bulk splitting")
            create_bulk(model, bulk_split)
        except Exception as e:
            print("Could not perform the bulk splitting")
            print(e)

    model.eqDistribution(plot=False,
                         save=f"{out_folder}/1.2_eqDistribution.png")

    if rg_analysis:
        # Imported under another name so the boolean parameter is not
        # rebound to the function.
        from IDP_htmd.IDP_analysis import rg_analysis as compute_rg
        mol = Molecule(model.data.simlist[0].molfile)
        rg_data = compute_rg(model, skip=skip)
        plot_RG(rg_data, mol, save=f"{out_folder}/1.4_rg.png")

    if save:
        model.save(f"{out_folder}/model.dat")

    return model
def _createMSM(self, epoch, output_folder, basedata=None, skip=1, clusters=0,
               ticadim=0, ticalag=20, macronum=2, modellag=5,
               modelunits="frames", fstep=None, data2combine=None):
    """Load or build the Markov model for ``epoch`` and refresh associated data.

    A model stored as ``<output_folder>/<first letter of
    self.analysis_type><epoch>_model.dat`` is loaded when possible (and its
    Markov model recomputed if ``macronum`` or ``modellag`` differ).
    Otherwise the model is built from the trajectories selected for
    ``epoch``, clustered with MiniBatchKMeans, and saved to that file.

    Afterwards every metric in ``self.associated_metrics`` that has no entry
    in ``self.associated_data`` is computed, and a copy of each associated
    data set restricted to the model's simulations is stored in
    ``self.tmp_associated_data``.

    Parameters
    ----------
    epoch : int
        Epoch (or number of simulations, depending on
        ``self.analysis_type``) up to which trajectories are included.
    output_folder : str
        Folder the model file is read from and written to.
    basedata : optional
        If truthy, the data is replaced by the result of
        ``self._fitBaseline(data, basedata)`` instead of being reduced
        with TICA.
    skip : int
        Forwarded to ``self._precalculateData`` and, in low-memory mode, to
        ``Metric``.
    clusters : int
        Number of clusters. If falsy, ``self._numClusters`` decides.
    ticadim : int
        Number of TICA dimensions. If falsy, TICA is not applied.
    ticalag : int
        Lag passed to TICA.
    macronum : int
        Number of macrostates of the Markov model.
    modellag : int
        Lag of the Markov model, expressed in ``modelunits``.
    modelunits : str
        Units of ``modellag``.
    fstep : optional
        Forwarded to ``self._precalculateData``.
    data2combine : optional
        If truthy, a copy restricted to the simulations in the data is
        combined into it before clustering. A failure is reported and
        otherwise ignored.

    Returns
    -------
    Model
        The loaded or newly built model.
    """
    from htmd.projections.tica import TICA
    from sklearn.cluster import MiniBatchKMeans
    from htmd.model import Model

    model_file = f"{output_folder}/{self.analysis_type[0]}{epoch}_model.dat"

    def _epoch_sim_indexes():
        # Indexes of all simulations belonging to epochs 1..epoch. The list
        # is handed to np.concatenate directly: wrapping it in np.array
        # fails on recent NumPy when epochs differ in size (ragged array).
        return np.concatenate([
            self.epoch_sim_indexes[i] for i in range(1, epoch + 1)
            if i in self.epoch_sim_indexes
        ])

    try:
        model = Model(file=model_file)
        if (model.macronum != macronum or model.lag != modellag):
            model.markovModel(modellag, macronum, units=modelunits)
        print("Model loaded")
    except Exception:
        # No usable stored model: build it from scratch.
        if not self.precalculated_data and not self.low_memory_usage:
            print("Calculating PRECALC DATA")
            precalc_data = self._precalculateData(self.precalc_metric,
                                                  self.input_folder,
                                                  fstep=fstep,
                                                  skip=skip)
            self.precalc_data = precalc_data
            self.precalculated_data = True

        # Work out which precalculated trajectories fall outside ``epoch``.
        if self.analysis_type == "epoch" and not self.low_memory_usage:
            epoch_sim = _epoch_sim_indexes()
            drop_traj_idx = np.ones(self.precalc_data.numTrajectories)
            drop_traj_idx[epoch_sim] = 0
            drop_idx = np.where(drop_traj_idx == 1)[0]
        elif self.analysis_type == "sims" and not self.low_memory_usage:
            drop_traj_idx = np.ones(self.precalc_data.numTrajectories)
            # NOTE(review): this keeps indexes 1..epoch-1 while the
            # low-memory branch below uses 0..epoch-1 - confirm which one is
            # intended.
            no_drop_idx = np.arange(1, epoch)
            drop_traj_idx[no_drop_idx] = 0
            drop_idx = np.where(drop_traj_idx == 1)[0]

        if not self.low_memory_usage:
            # NOTE(review): drop_idx is only defined for analysis_type
            # "epoch" or "sims".
            data = self.precalc_data.copy()
            data.dropTraj(idx=drop_idx)
            data.dropTraj()

        # NOTE(review): in low-memory mode ``data`` only gets defined by the
        # TICA branch; basedata or ticadim=0 would fail here.
        if basedata:
            from htmd.projections.metric import MetricData
            r_fit = self._fitBaseline(data, basedata)
            data = MetricData(dat=r_fit, simlist=data.simlist)
        elif ticadim and not self.low_memory_usage:
            tica = TICA(data, ticalag)
            data = tica.project(ticadim)
        elif ticadim and self.low_memory_usage:
            from htmd.projections.metric import Metric
            if self.analysis_type == "epoch":
                epoch_sim = _epoch_sim_indexes()
            else:
                epoch_sim = range(0, epoch)
            metr = Metric(self._sims[epoch_sim], skip=skip)
            metr.set(self.precalc_metric)
            tica = TICA(metr, ticalag)
            data = tica.project(ticadim)

        if not clusters:
            clusters = self._numClusters(data.numFrames)

        if data2combine:
            # Best effort: the model is still built when combining fails.
            try:
                print("Adding extra dimension")
                data2combine_copy = data2combine.copy()
                data2combine_copy.dropTraj(keepsims=data.simlist)
                data.combine(data2combine_copy)
            except Exception as e:
                print("Could not combined data", str(e))

        data.cluster(MiniBatchKMeans(clusters), mergesmall=5)
        model = Model(data)
        model.markovModel(modellag, macronum, units=modelunits)
        model.save(model_file)

    # Compute associated data sets that have not been calculated yet.
    for name, met in self.associated_metrics.items():
        try:
            self.associated_data[name]
        except KeyError:
            print(f"Calcualtion associted data - {name.upper()}")
            assoc_data = self._precalculateData(met,
                                                self.input_folder,
                                                fstep=fstep,
                                                skip=skip)
            self.associated_data[name] = assoc_data

    # Restrict every associated data set to the simulations of the model.
    for name, assoc in self.associated_data.items():
        tmp_data = assoc.copy()
        tmp_data.dropTraj(keepsims=model.data.simlist)
        self.tmp_associated_data[name] = tmp_data

    return model