def _start_inline(self, directories):
    bar = ProgressBar(len(directories), description="Running QM Calculations")
    if self.code == Code.Gaussian:
        cmd = self.gaussian_binary + ' < input.gjf > output.gau 2>&1'
    elif self.code == Code.PSI4:
        cmd = self.psi4_binary + " -i psi4.in -o psi4.out 2>&1"
    elif self.code == Code.TeraChem:
        cmd = self.terachem_binary + " -i terachem.in > terachem.out 2>&1"

    for d in directories:
        f = open(os.path.join(d, "run.sh"), "w")
        print("#!/bin/sh\n%s\n" % (cmd), file=f)
        f.close()
        os.chmod(os.path.join(d, "run.sh"), 0o700)

    for directory in directories:
        cwd = os.getcwd()
        try:
            os.chdir(directory)
            if self.code == Code.Gaussian:
                if not os.path.exists("output.gau"):
                    subprocess.call(cmd, shell=True)
            elif self.code == Code.PSI4:
                if not os.path.exists("psi4.out"):
                    subprocess.call(cmd, shell=True)
            elif self.code == Code.TeraChem:
                if not os.path.exists("terachem.out"):
                    subprocess.call(cmd, shell=True)
        except:
            os.chdir(cwd)
            raise
        os.chdir(cwd)
        bar.progress()
    bar.stop()
def progressbar(seq, description=None, total=None):
    p = ProgressBar(total, description=description)
    while True:
        try:
            yield next(seq)
            p.progress()  # Had to put progress after yield because last call goes over the total and then I can't decrement in stop()
        except StopIteration:
            p.stop()
            raise
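# Hedged usage sketch for the progressbar() wrapper above. It assumes ProgressBar
# (from htmd.progress.progress) is importable in this module, as the other snippets
# in this file do, and that the wrapped object is an iterator, since the wrapper
# calls next() on it directly. The helper name below is illustrative only.
def _progressbar_usage_example():
    # Wrap a plain iterator; total must be passed explicitly because an iterator
    # has no len() for the ProgressBar to use.
    squares = []
    for value in progressbar(iter(range(100)), description='Squaring numbers', total=100):
        squares.append(value * value)
    return squares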
def tileMembrane(memb, xmin, ymin, xmax, ymax, buffer=1.5):
    """ Tile a membrane in the X and Y dimensions to reach a specific size.

    Parameters
    ----------
    memb : :class:`Molecule <htmd.molecule.molecule.Molecule>` object
        The membrane to be tiled
    xmin : float
        Minimum x coordinate
    ymin : float
        Minimum y coordinate
    xmax : float
        Maximum x coordinate
    ymax : float
        Maximum y coordinate
    buffer : float
        Buffer distance between tiles

    Returns
    -------
    megamemb :
        A big membrane Molecule
    """
    from htmd.progress.progress import ProgressBar
    memb = memb.copy()
    memb.resid = sequenceID(memb.resid)

    minmemb = np.min(memb.get('coords', 'water'), axis=0).flatten()

    size = np.max(memb.get('coords', 'water'), axis=0) - np.min(memb.get('coords', 'water'), axis=0)
    size = size.flatten()
    xreps = int(np.ceil((xmax - xmin) / size[0]))
    yreps = int(np.ceil((ymax - ymin) / size[1]))

    logger.info('Replicating Membrane {}x{}'.format(xreps, yreps))

    from htmd.molecule.molecule import Molecule
    megamemb = Molecule()
    bar = ProgressBar(xreps * yreps, description='Replicating Membrane')
    k = 0
    for x in range(xreps):
        for y in range(yreps):
            tmpmemb = memb.copy()
            xpos = xmin + x * (size[0] + buffer)
            ypos = ymin + y * (size[1] + buffer)

            tmpmemb.moveBy([-float(minmemb[0]) + xpos, -float(minmemb[1]) + ypos, 0])
            tmpmemb.remove('same resid as (x > {} or y > {})'.format(xmax, ymax), _logger=False)
            tmpmemb.set('segid', 'M{}'.format(k))

            megamemb.append(tmpmemb)
            k += 1
            bar.progress()
    bar.stop()

    # Membranes don't tile perfectly. Need to remove waters that clash with lipids of other tiles
    # Some clashes will still occur between periodic images however
    megamemb.remove('same fragment as water and within 1.5 of not water', _logger=False)
    return megamemb
def __init__(self, data, lag):
    from pyemma.coordinates import tica
    # data.dat.tolist() might be better?
    self.data = data
    if isinstance(data, Metric):
        from pyemma.coordinates.transform.tica import TICA
        self.tic = TICA(lag)
        p = ProgressBar(len(data.simulations))
        for i in range(len(data.simulations)):
            # Fix for pyemma bug. Remove eventually:
            d, _, _ = data._projectSingle(i)
            if d is None or d.shape[0] < lag:
                continue
            self.tic.partial_fit(d)
            p.progress()
        p.stop()
    else:
        self.tic = tica(data.dat.tolist(), lag=lag)
def __call__(self, index):
    if CallBack.bars[self.parallel] == 0:  # Oh man, the race conditions possible here... kill me
        CallBack.bars[self.parallel] = ProgressBar(self.parallel.n_dispatched, description='Projecting trajectories')
    CallBack.bars[self.parallel].progress()
    if self.parallel._original_iterable:
        self.parallel.dispatch_next()
def _run_qm_jobs_inline(self, directory):
    cmd = self._env['BIN_G09']
    fni = []
    fno = []
    for root, dirs, files in os.walk(directory):
        for f in files:
            if (f.endswith(".gjf")):
                op = f.replace(".gjf", ".out")
                if not os.path.exists(os.path.join(root, op)):
                    fni.append(os.path.join(root, f))
                    fno.append(os.path.join(root, op))
    if (len(fni)):
        bar = ProgressBar(len(fni), description="Running QM Calculations")
        for i in range(len(fni)):
            subprocess.check_output([cmd, fni[i], fno[i]])
            bar.progress()
        bar.stop()
def __init__(self, data, lag, units='frames'):
    from pyemma.coordinates import tica
    # data.dat.tolist() might be better?
    self.data = data
    if isinstance(data, Metric):
        if units != 'frames':
            raise RuntimeError('Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.')
        metr = data
        from pyemma.coordinates.transform.tica import TICA
        self.tic = TICA(lag)
        p = ProgressBar(len(metr.simulations))
        for proj in _projectionGenerator(metr, _getNcpus()):
            for pro in proj:
                self.tic.partial_fit(pro[0])
            p.progress(len(proj))
        p.stop()
    else:
        lag = unitconvert(units, 'frames', lag, data.fstep)
        if lag == 0:
            raise RuntimeError('Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.')
        self.tic = tica(data.dat.tolist(), lag=lag)
def _start(self, directories):
    bar = ProgressBar(len(directories), description="Running QM Calculations")
    for directory in directories:
        cwd = os.getcwd()
        try:
            if self.execution == Execution.Inline:
                os.chdir(directory)
                if self.code == Code.Gaussian:
                    if not os.path.exists("output.gau"):
                        subprocess.call('"' + self.gaussian_binary + '" < input.gjf > output.gau 2>&1', shell=True)
                elif self.code == Code.PSI4:
                    if not os.path.exists("psi4.out"):
                        subprocess.call([self.psi4_binary, "-i", "psi4.in", "-o", "psi4.out"])
        except:
            os.chdir(cwd)
            raise
        os.chdir(cwd)
        bar.progress()
    bar.stop()
def __init__(self, data, lag, units='frames', dimensions=None):
    from pyemma.coordinates.transform.tica import TICA as TICApyemma

    self.data = data
    self.dimensions = dimensions

    if isinstance(data, Metric):  # Memory efficient TICA projecting trajectories on the fly
        if units != 'frames':
            raise RuntimeError('Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.')
        self.tic = TICApyemma(lag)
        metr = data

        p = ProgressBar(len(metr.simulations))
        for proj in _projectionGenerator(metr, _getNcpus()):
            for pro in proj:
                if pro is None:
                    continue
                if self.dimensions is None:
                    self.tic.partial_fit(pro[0])
                else:  # Sub-select dimensions for fitting
                    self.tic.partial_fit(pro[0][:, self.dimensions])
            p.progress(len(proj))
        p.stop()
    else:  # In-memory TICA
        lag = unitconvert(units, 'frames', lag, data.fstep)
        if lag == 0:
            raise RuntimeError('Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.')
        self.tic = TICApyemma(lag)
        if self.dimensions is None:
            datalist = data.dat.tolist()
        else:  # Sub-select dimensions for fitting
            datalist = [x[:, self.dimensions].copy() for x in data.dat]
        self.tic.fit(datalist)
def tileMembrane(memb, xmin, ymin, xmax, ymax):
    """ Tile the membrane in the X and Y dimensions to reach a specific size.

    Returns
    -------
    megamemb :
        A big membrane Molecule
    """
    from htmd.progress.progress import ProgressBar
    memb = memb.copy()
    memb.resid = sequenceID(memb.resid)

    minmemb = np.min(memb.get('coords', 'water'), axis=0).flatten()

    size = np.max(memb.get('coords', 'water'), axis=0) - np.min(memb.get('coords', 'water'), axis=0)
    size = size.flatten()
    xreps = int(np.ceil((xmax - xmin) / size[0]))
    yreps = int(np.ceil((ymax - ymin) / size[1]))

    logger.info('Replicating Membrane {}x{}'.format(xreps, yreps))

    from htmd.molecule.molecule import Molecule
    megamemb = Molecule()
    bar = ProgressBar(xreps * yreps, description='Replicating Membrane')
    k = 0
    for x in range(xreps):
        for y in range(yreps):
            tmpmemb = memb.copy()
            xpos = xmin + x * size[0]
            ypos = ymin + y * size[1]

            tmpmemb.moveBy([-float(minmemb[0]) + xpos, -float(minmemb[1]) + ypos, 0])
            sel = 'same resid as (x > {} or y > {})'.format(xmax, ymax)
            tmpmemb.remove(sel, _logger=False)
            tmpmemb.set('segid', 'M{}'.format(k))

            megamemb.append(tmpmemb)
            k += 1
            bar.progress()
    bar.stop()
    return megamemb
def project(self, ndim=None):
    """ Projects the data object given to the constructor onto `ndim` dimensions

    Parameters
    ----------
    ndim : int
        The number of dimensions we want to project the data on.

    Returns
    -------
    dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
        A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the projected data

    Example
    -------
    >>> gw = GWPCA(data)
    >>> dataproj = gw.project(5)
    """
    from sklearn.decomposition import IncrementalPCA
    from htmd.progress.progress import ProgressBar
    from htmd.metricdata import MetricData

    pca = IncrementalPCA(n_components=ndim, batch_size=10000)
    p = ProgressBar(len(self.data.dat))
    for d in self.data.dat:
        pca.partial_fit(d * self.weights)
        p.progress()
    p.stop()

    projdata = self.data.copy()
    p = ProgressBar(len(self.data.dat))
    for i, d in enumerate(self.data.dat):
        projdata.dat[i] = pca.transform(d * self.weights)
        p.progress()
    p.stop()

    # projdataconc = pca.fit_transform(self.weighedconcat)
    # projdata.dat = projdata.deconcatenate(projdataconc)
    return projdata
def fit(self, data):
    """ Compute the centroids of data.

    Parameters
    ----------
    data : np.ndarray
        A 2D array of data. Columns are features and rows are data examples.
    """
    if len(self.cluster_centers_) != 0:
        logger.warning('Clustering already exists. Reclustering data!')
        self.cluster_centers_ = []
        self.centerFrames = []
        self.clusterSize = []

    # Initialization: select a random point and assign all points to cluster 0
    numpoints = np.size(data, 0)
    idxCenter = np.random.randint(numpoints)
    self.cluster_centers_.append(data[idxCenter, :])
    self.centerFrames.append(idxCenter)

    self.labels_ = np.zeros(numpoints, dtype=int)
    dist = self._dist(self.cluster_centers_, data)

    countCluster = 1
    p = ProgressBar(self.n_clusters)
    while len(self.cluster_centers_) < self.n_clusters:
        if np.max(dist) == 0:
            break
        # find the point furthest away from all existing centers
        newCenterIdx = np.argmax(dist)
        newCenter = data[newCenterIdx, :]
        self.centerFrames.append(newCenterIdx)
        self.cluster_centers_.append(newCenter)

        # find all points closer to the new center than to their old center
        newdist = self._dist(newCenter, data)
        switchIdx = dist > newdist

        # assign them to the new cluster
        self.labels_[switchIdx] = countCluster
        dist[switchIdx] = newdist[switchIdx]
        countCluster += 1
        p.progress()
    p.stop()

    # update clusterSize
    self.clusterSize = np.bincount(self.labels_)
    self.distance = dist
    self.cluster_centers_ = np.array(self.cluster_centers_)
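# Hedged usage sketch for the fit() method above: it implements a k-centers /
# max-min style clustering step (the point furthest from all existing centers
# becomes the next center). The class name KCenters below is an assumption, not
# taken from this snippet; only n_clusters, labels_ and cluster_centers_ are
# attributes actually used by fit().
#
#     import numpy as np
#     clu = KCenters(n_clusters=10)        # hypothetical constructor
#     clu.fit(np.random.rand(1000, 3))     # 1000 examples, 3 features
#     print(clu.cluster_centers_.shape)    # expected: (10, 3)
#     print(np.bincount(clu.labels_))      # points per cluster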
def _run_qm_jobs_inline(self, directory):
    cmd = self._env['BIN_G09']
    fni = []
    fno = []
    for root, dirs, files in os.walk(directory):
        for f in files:
            if f.endswith(".gjf"):
                op = f.replace(".gjf", ".out")
                if not os.path.exists(os.path.join(root, op)):
                    fni.append(os.path.join(root, f))
                    fno.append(os.path.join(root, op))
    if len(fni):
        bar = ProgressBar(len(fni), description="Running QM Calculations")
        for i in range(len(fni)):
            subprocess.check_output([cmd, fni[i], fno[i]])
            bar.progress()
        bar.stop()
def __init__(self, data, lag, units='frames'):
    from pyemma.coordinates import tica
    # data.dat.tolist() might be better?
    self.data = data
    if isinstance(data, Metric):
        from pyemma.coordinates.transform.tica import TICA
        lag = unitconvert(units, 'frames', lag, data.fstep)
        self.tic = TICA(lag)
        p = ProgressBar(len(data.simulations))
        for i in range(len(data.simulations)):
            # Fix for pyemma bug. Remove eventually:
            d, _, _ = data._projectSingle(i)
            if d is None or d.shape[0] < lag:
                continue
            self.tic.partial_fit(d)
            p.progress()
        p.stop()
    else:
        self.tic = tica(data.dat.tolist(), lag=lag)
def _run_qm_jobs_lsf(self, directory):
    cmd = self._env['BIN_G09']

    fni = []
    fno = []
    for root, dirs, files in os.walk(directory):
        for f in files:
            if (f.endswith(".gjf")):
                op = f.replace(".gjf", ".out")
                if not os.path.exists(op):
                    fni.append(os.path.join(root, f))
                    fno.append(os.path.join(root, op))

    if (len(fni)):
        # Make an LSF script
        for i in range(len(fni)):
            fpbs = fni[i] + ".lsf"
            if not os.path.exists(fpbs):
                f = open(fpbs, "w")
                print("#BSUB -n %d" % (self._config.NCORES), file=f)
                print("#BSUB -R \"span[ptile=%d]\"" % (self._config.NCORES), file=f)
                print("#BSUB -W 24:00", file=f)
                print("#BSUB -J gaussian", file=f)
                print("#BSUB -app gaussian", file=f)
                print("#BSUB -o /dev/null", file=f)
                print("#BSUB -M %d000" % (self._config.MEMORY), file=f)
                print("\nmodule load gaussian\n", file=f)
                print("cd \"%s\"" % (directory), file=f)
                print("\"%s\" %s %s" % (cmd, fni[i], fno[i]), file=f)
                f.close()

            # Look to see if there is already a job submitted
            # If not, qsub it
            fpbsstate = fni[i] + ".jobid"
            if not os.path.exists(fpbsstate):
                # Qsub, saving jobid to file
                subprocess.check_output("\"" + self._env['BIN_BSUB'] + "\" < \"" + fpbs + "\" > \"" + fpbsstate + "\"", shell=True)

        # Finally monitor progress. Continue until all jobs have produced an output
        # NB TODO FIXME: should also poll qstat to see if job is still live
        bar = ProgressBar(len(fni), description="Running QM Calculations")
        complete = False
        lastcount = 0
        while not complete:
            count = 0
            complete = True
            for i in fno:
                # print(" Checking [" + i +"]" )
                # print( os.path.exists(i) )
                # print( os.access(i, os.R_OK) )
                try:
                    os.stat(i)  # Try to flush any cache (for NFS)
                except:
                    pass
                if os.access(i, os.R_OK):
                    # print("FOUND")
                    count = count + 1
                else:
                    complete = False
            # print( str(count) + " completed of " + str(len(fno)) )
            while lastcount < count:
                bar.progress()
                lastcount = lastcount + 1
            time.sleep(10)
        bar.stop()
        time.sleep(5)  # A bit of time for any outputfile to complete writing
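# For reference, a sketch of the LSF script that _run_qm_jobs_lsf() writes next to
# each .gjf input, reconstructed from the print statements above. The numeric
# values (NCORES=8, MEMORY=4) and the bracketed placeholders are assumptions:
#
#     #BSUB -n 8
#     #BSUB -R "span[ptile=8]"
#     #BSUB -W 24:00
#     #BSUB -J gaussian
#     #BSUB -app gaussian
#     #BSUB -o /dev/null
#     #BSUB -M 4000
#
#     module load gaussian
#
#     cd "<directory>"
#     "<BIN_G09>" <input.gjf> <output.out>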
def project(self, ndim=None):
    """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

    Parameters
    ----------
    ndim : int
        The number of TICA dimensions we want to project the data on. If None is given it will choose a number
        of dimensions to cover 95% of the kinetic variance.

    Returns
    -------
    dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
        A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> tica = TICA(data,20)
    >>> dataTica = tica.project(5)
    """
    if ndim is not None:
        self.tic.set_params(dim=ndim)

    keepdata = []
    keepdim = None
    keepdimdesc = None
    if isinstance(self.data, Metric):  # Memory efficient TICA projecting trajectories on the fly
        proj = []
        refs = []
        fstep = None

        metr = self.data
        p = ProgressBar(len(metr.simulations))
        k = -1
        droppedsims = []
        for projecteddata in _projectionGenerator(metr, _getNcpus()):
            for pro in projecteddata:
                k += 1
                if pro is None:
                    droppedsims.append(k)
                    continue
                if self.dimensions is not None:
                    numDimensions = pro[0].shape[1]
                    keepdim = np.setdiff1d(range(numDimensions), self.dimensions)
                    keepdata.append(pro[0][:, keepdim])
                    # Sub-select dimensions for projecting
                    proj.append(self.tic.transform(pro[0][:, self.dimensions]).astype(np.float32))
                else:
                    proj.append(self.tic.transform(pro[0]).astype(np.float32))
                refs.append(pro[1])
                if fstep is None:
                    fstep = pro[2]
            p.progress(len(projecteddata))
        p.stop()

        simlist = self.data.simulations
        simlist = np.delete(simlist, droppedsims)
        ref = np.array(refs, dtype=object)
        parent = None
        if self.dimensions is not None:
            from htmd.projections.metric import _singleMolfile
            from htmd.molecule.molecule import Molecule
            (single, molfile) = _singleMolfile(metr.simulations)
            if single:
                keepdimdesc = metr.getMapping(Molecule(molfile))
                keepdimdesc = keepdimdesc.iloc[keepdim]
    else:
        if ndim is not None and self.data.numDimensions < ndim:
            raise RuntimeError('TICA cannot increase the dimensionality of your data. Your data has {} dimensions '
                               'and you requested {} TICA dimensions'.format(self.data.numDimensions, ndim))

        if self.dimensions is not None:
            keepdim = np.setdiff1d(range(self.data.numDimensions), self.dimensions)
            keepdata = [x[:, keepdim] for x in self.data.dat]
            if self.data.description is not None:
                keepdimdesc = self.data.description.iloc[keepdim]
        proj = self.tic.get_output()
        simlist = self.data.simlist
        ref = self.data.ref
        fstep = self.data.fstep
        parent = self.data

    # If TICA is done on a subset of dimensions, combine non-projected data with projected data
    if self.dimensions is not None:
        newproj = []
        for k, t in zip(keepdata, proj):
            newproj.append(np.hstack((k, t)))
        proj = newproj

    if ndim is None:
        ndim = self.tic.dimension()
        logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(ndim))

    from htmd.metricdata import MetricData
    datatica = MetricData(dat=np.array(proj), simlist=simlist, ref=ref, fstep=fstep, parent=parent)
    from pandas import DataFrame
    # TODO: Make this messy pandas creation cleaner. I'm sure I can append rows to DataFrame
    types = []
    indexes = []
    description = []
    for i in range(ndim):
        types += ['tica']
        indexes += [-1]
        description += ['TICA dimension {}'.format(i + 1)]
    datatica.description = DataFrame({'type': types, 'atomIndexes': indexes, 'description': description})

    if self.dimensions is not None and keepdimdesc is not None:  # If TICA is done on a subset of dims
        datatica.description = keepdimdesc.append(datatica.description, ignore_index=True)

    return datatica
def project(self, ndim=None):
    """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

    Parameters
    ----------
    ndim : int
        The number of TICA dimensions we want to project the data on. If None is given it will choose a number
        of dimensions to cover 95% of the kinetic variance.

    Returns
    -------
    dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
        A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> tica = TICA(data,20)
    >>> dataTica = tica.project(5)
    """
    if ndim is not None:
        # self.tic._dim = ndim  # Old way of doing it. Deprecated since pyEMMA 2.1
        self.tic.set_params(dim=ndim)  # Change to this in 2.1 pyEMMA version

    if isinstance(self.data, Metric):  # Doesn't project on correct number of dimensions
        proj = []
        refs = []
        fstep = None

        '''from htmd.config import _config
        from joblib import Parallel, delayed
        results = Parallel(n_jobs=_config['ncpus'], verbose=11)(
            delayed(_test)(self.data, self.tic, i) for i in range(len(self.data.simulations)))

        for i in range(len(results)):
            proj.append(results[i][0])
            refs.append(results[i][1])
            fstep.append(results[i][2])'''

        droppedsims = []
        p = ProgressBar(len(self.data.simulations))
        for i in range(len(self.data.simulations)):
            d, r, f = self.data._projectSingle(i)
            if d is None:
                droppedsims.append(i)
                continue
            if fstep is None:
                fstep = f
            refs.append(r)
            proj.append(self.tic.transform(d))
            p.progress()
        p.stop()
        simlist = self.data.simulations
        simlist = np.delete(simlist, droppedsims)
        ref = np.array(refs, dtype=object)
        #fstep = 0
        parent = None
    else:
        proj = self.tic.get_output()
        simlist = self.data.simlist
        ref = self.data.ref
        fstep = self.data.fstep
        parent = self.data

    if ndim is None:
        logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(self.tic.dimension()))

    #print(np.shape(proj))
    from htmd.metricdata import MetricData
    datatica = MetricData(dat=np.array(proj, dtype=object), simlist=simlist, ref=ref, fstep=fstep, parent=parent)

    '''datatica = self.data.copy()
    #datatica.dat = self.data.deconcatenate(np.squeeze(proj))
    datatica.dat = np.array(proj, dtype=object)
    datatica.parent = self.data
    datatica.St = None
    datatica.Centers = None
    datatica.N = None
    datatica.K = None
    datatica._dataid = random.random()
    datatica._clusterid = None'''
    return datatica
def simlist(datafolders, molfiles, inputfolders=None):
    """Creates a list of simulations

    Parameters
    ----------
    datafolders : str list
        A list of directories, each containing a single trajectory
    molfiles : str list
        A list of pdb files corresponding to the trajectories in dataFolders. Can also be a single string to a
        single structure which corresponds to all trajectories.
    inputfolders : optional, str list
        A list of directories, each containing the input files used to produce the trajectories in dataFolders

    Return
    ------
    sims : np.ndarray of :class:`Sim <htmd.simlist.Sim>` objects
        A list of simulations

    Examples
    --------
    >>> simlist(glob('./test/data/*/'), glob('./test/input/*/*.pdb'), glob('./test/input/*/'))
    """
    if not datafolders:
        raise NameError('No data folders were given, check your arguments.')
    if not molfiles:
        raise NameError('No molecule files were given, check your arguments.')
    if isinstance(molfiles, str):
        molfiles = [molfiles]
    if isinstance(datafolders, str):
        datafolders = [datafolders]
    if inputfolders and isinstance(inputfolders, str):
        inputfolders = [inputfolders]

    #Sim = namedtuple('Sim', ['id', 'parent', 'input', 'trajectory', 'molfile'])

    # I need to match the simulation names inside the globs given. The
    # reason is that there can be more input folders in the glob than in
    # the data glob due to not having been retrieved. Hence I need to match
    # the folder names.

    # Create a hash map of data folder names
    datanames = dict()
    for folder in datafolders:
        if _simName(folder) in datanames:
            raise NameError('Duplicate simulation name detected. Cannot name-match directories.')
        datanames[_simName(folder)] = folder

    molnames = dict()
    for mol in molfiles:
        molnames[_simName(mol)] = mol

    if inputfolders:
        inputnames = dict()
        for inputf in inputfolders:
            inputnames[_simName(inputf)] = inputf

    logger.info('Starting listing of simulations.')
    sims = []

    keys = natsort.natsorted(datanames.keys())
    i = 0
    bar = ProgressBar(len(keys), description='Creating simlist')
    for k in keys:
        trajectories = _listXTCs(datanames[k])

        if not trajectories:
            bar.progress()
            continue

        if len(molfiles) > 1:
            if k not in molnames:
                raise NameError('Did not find molfile with folder name ' + k + ' in the given glob')
            molfile = molnames[k]
        else:
            molfile = molfiles[0]

        inputf = []
        if inputfolders:
            if k not in inputnames:
                raise NameError('Did not find input with folder name ' + k + ' in the given glob')
            inputf = inputnames[k]

        sims.append(Sim(simid=i, parent=None, input=inputf, trajectory=trajectories, molfile=molfile))
        i += 1
        bar.progress()
    bar.stop()
    logger.info('Finished listing of simulations.')
    return np.array(sims, dtype=object)
def simlist(datafolders, topologies, inputfolders=None):
    """Creates a list of simulations

    Parameters
    ----------
    datafolders : str list
        A list of directories, each containing a single trajectory
    topologies : str list
        A list of topology files or folders containing a topology file corresponding to the trajectories in
        dataFolders. Can also be a single string to a single structure which corresponds to all trajectories.
    inputfolders : optional, str list
        A list of directories, each containing the input files used to produce the trajectories in dataFolders

    Return
    ------
    sims : np.ndarray of :class:`Sim <htmd.simlist.Sim>` objects
        A list of simulations

    Examples
    --------
    >>> simlist(glob('./test/data/*/'), glob('./test/input/*/'), glob('./test/input/*/'))
    >>> simlist(glob('./test/data/*/'), glob('./test/input/*/*.pdb'), glob('./test/input/*/'))
    """
    from htmd.util import ensurelist
    import natsort
    if not datafolders:
        raise FileNotFoundError('No data folders were given, check your arguments.')
    if not topologies:
        raise FileNotFoundError('No molecule files were given, check your arguments.')
    topologies = ensurelist(topologies)
    datafolders = ensurelist(datafolders)
    for folder in datafolders:
        if not os.path.isdir(folder):
            raise NotADirectoryError('{}'.format(folder))
    if inputfolders:
        inputfolders = ensurelist(inputfolders)
        for folder in inputfolders:
            if not os.path.isdir(folder):
                raise NotADirectoryError('{}'.format(folder))

    # I need to match the simulation names inside the globs given. The
    # reason is that there can be more input folders in the glob than in
    # the data glob due to not having been retrieved. Hence I need to match
    # the folder names.

    # Create a hash map of data folder names
    datanames = dict()
    for folder in datafolders:
        if _simName(folder) in datanames:
            raise RuntimeError('Duplicate simulation name detected. Cannot name-match directories.')
        datanames[_simName(folder)] = folder

    molnames = dict()
    for mol in topologies:
        if not os.path.exists(mol):
            raise FileNotFoundError('File {} does not exist'.format(mol))
        molnames[_simName(mol)] = mol

    if inputfolders:
        inputnames = dict()
        for inputf in inputfolders:
            inputnames[_simName(inputf)] = inputf

    logger.debug('Starting listing of simulations.')
    sims = []

    keys = natsort.natsorted(datanames.keys())
    i = 0
    from htmd.progress.progress import ProgressBar
    bar = ProgressBar(len(keys), description='Creating simlist')
    for k in keys:
        trajectories = _autoDetectTrajectories(datanames[k])

        if not trajectories:
            bar.progress()
            continue

        if len(topologies) > 1:
            if k not in molnames:
                raise FileNotFoundError('Did not find molfile with folder name ' + k + ' in the given glob')
            molfile = molnames[k]
        else:
            molfile = topologies[0]

        if os.path.isdir(molfile):
            molfile = _autoDetectTopology(molfile)

        inputf = []
        if inputfolders:
            if k not in inputnames:
                raise FileNotFoundError('Did not find input with folder name ' + k + ' in the given glob')
            inputf = inputnames[k]

        numframes = [_readNumFrames(f) for f in trajectories]
        sims.append(Sim(simid=i, parent=None, input=inputf, trajectory=trajectories, molfile=molfile,
                        numframes=numframes))
        i += 1
        bar.progress()
    bar.stop()
    logger.debug('Finished listing of simulations.')
    return np.array(sims, dtype=object)
def simlist(datafolders, molfiles, inputfolders=None):
    """Creates a list of simulations

    Parameters
    ----------
    datafolders : str list
        A list of directories, each containing a single trajectory
    molfiles : str list
        A list of pdb files corresponding to the trajectories in dataFolders. Can also be a single string to a
        single structure which corresponds to all trajectories.
    inputfolders : optional, str list
        A list of directories, each containing the input files used to produce the trajectories in dataFolders

    Return
    ------
    sims : np.ndarray of :class:`Sim <htmd.simlist.Sim>` objects
        A list of simulations

    Examples
    --------
    >>> simlist(glob('./test/data/*/'), glob('./test/input/*/*.pdb'), glob('./test/input/*/'))
    """
    if not datafolders:
        raise NameError('No data folders were given, check your arguments.')
    if not molfiles:
        raise NameError('No molecule files were given, check your arguments.')
    if isinstance(molfiles, str):
        molfiles = [molfiles]
    if isinstance(datafolders, str):
        datafolders = [datafolders]
    if inputfolders and isinstance(inputfolders, str):
        inputfolders = [inputfolders]

    #Sim = namedtuple('Sim', ['id', 'parent', 'input', 'trajectory', 'molfile'])

    # I need to match the simulation names inside the globs given. The
    # reason is that there can be more input folders in the glob than in
    # the data glob due to not having been retrieved. Hence I need to match
    # the folder names.

    # Create a hash map of data folder names
    datanames = dict()
    for folder in datafolders:
        if _simName(folder) in datanames:
            raise NameError('Duplicate simulation name detected. Cannot name-match directories.')
        datanames[_simName(folder)] = folder

    molnames = dict()
    for mol in molfiles:
        molnames[_simName(mol)] = mol

    if inputfolders:
        inputnames = dict()
        for inputf in inputfolders:
            inputnames[_simName(inputf)] = inputf

    logger.info('Starting listing of simulations.')
    sims = []

    keys = natsort.natsorted(datanames.keys())
    i = 0
    bar = ProgressBar(len(keys), description='Creating simlist')
    for k in keys:
        trajectories = _listTrajectories(datanames[k])

        if not trajectories:
            bar.progress()
            continue

        if len(molfiles) > 1:
            if k not in molnames:
                raise NameError('Did not find molfile with folder name ' + k + ' in the given glob')
            molfile = molnames[k]
        else:
            molfile = molfiles[0]

        inputf = []
        if inputfolders:
            if k not in inputnames:
                raise NameError('Did not find input with folder name ' + k + ' in the given glob')
            inputf = inputnames[k]

        sims.append(Sim(simid=i, parent=None, input=inputf, trajectory=trajectories, molfile=molfile))
        i += 1
        bar.progress()
    bar.stop()
    logger.info('Finished listing of simulations.')
    return np.array(sims, dtype=object)
def solvate(mol, pad=None, minmax=None, negx=0, posx=0, negy=0, posy=0, negz=0, posz=0, buffer=2.4,
            watsize=65.4195, prefix='WT', keysel='name OH2', rotate=False, rotsel='all', rotinc=36,
            spdb=None, spsf=None, stop=None):
    """ Solvates the system in a water box

    Parameters
    ----------
    mol : :class:`Molecule <htmd.molecule.molecule.Molecule>` object
        The molecule object we want to solvate
    pad : float
        The padding to add to the minmax in all dimensions. You can specify different padding in each dimension
        using the negx, negy, negz, posx, posy, posz options. This option will override any values in the neg and
        pos options.
    minmax : list
        Min and max dimensions. Should be a 2D matrix of the form [[minx, miny, minz], [maxx, maxy, maxz]].
        If none is given, it is calculated from the minimum and maximum coordinates in the mol.
    negx : float
        The padding in the -x dimension
    posx : float
        The padding in the +x dimension
    negy : float
        The padding in the -y dimension
    posy : float
        The padding in the +y dimension
    negz : float
        The padding in the -z dimension
    posz : float
        The padding in the +z dimension
    buffer : float
        How much buffer space to leave empty between waters and other molecules
    watsize : float
        The size of the water box
    prefix : str
        The prefix used for water segments
    keysel : str
        The key selection for water atoms
    rotate : bool
        Enable automated rotation of molecule to fit best in box
    rotsel : str
        The selection of atoms to rotate
    rotinc : float
        The increment in degrees to rotate
    spdb : str
        The path to the water pdb file
    spsf : str
        The path to the water psf file
    stop : str
        The path to the water topology file

    Returns
    -------
    mol : :class:`Molecule <htmd.molecule.molecule.Molecule>` object
        A solvated molecule

    Examples
    --------
    >>> smol = solvate(mol, pad=10)
    >>> smol = solvate(mol, minmax=[[-20, -20, -20],[20, 20, 20]])
    """
    mol = mol.copy()
    if mol.numFrames > 1:
        logger.warning('Multiple frames in Molecule. Solvate keeps only frame 0 and discards the rest.')
        mol.coords = np.atleast_3d(mol.coords[:, :, 0])

    if spdb is None:
        spdb = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'wat.pdb')

    if os.path.isfile(spdb):
        logger.info('Using water pdb file at: ' + spdb)
        water = Molecule(spdb)
    else:
        raise NameError('No solvent pdb file found in ' + spdb)

    if pad is not None:
        negx = pad
        posx = pad
        negy = pad
        posy = pad
        negz = pad
        posz = pad

    if rotate:
        raise NameError('Rotation not implemented yet')

    # Calculate min max coordinates from molecule
    if mol.numAtoms > 0:
        minmol = np.min(mol.get('coords'), axis=0)
        maxmol = np.max(mol.get('coords'), axis=0)
    else:
        minmol = [np.inf, np.inf, np.inf]
        maxmol = [-np.inf, -np.inf, -np.inf]

    if minmax is None:
        minc = minmol
        maxc = maxmol
    else:
        if isinstance(minmax, list):
            minmax = np.array(minmax)
        minc = minmax[0, :]
        maxc = minmax[1, :]

    xmin = float(minc[0] - negx)
    xmax = float(maxc[0] + posx)
    ymin = float(minc[1] - negy)
    ymax = float(maxc[1] + posy)
    zmin = float(minc[2] - negz)
    zmax = float(maxc[2] + posz)

    dx = xmax - xmin
    dy = ymax - ymin
    dz = zmax - zmin

    nx = int(np.ceil((dx + 2 * buffer) / watsize))
    ny = int(np.ceil((dy + 2 * buffer) / watsize))
    nz = int(np.ceil((dz + 2 * buffer) / watsize))

    # Calculate number of preexisting water segments with given prefix
    if mol.numAtoms > 0:
        preexist = len(np.unique(mol.get('segid', sel='segid "{}.*"'.format(prefix))))
    else:
        preexist = 0
    numsegs = nx * ny * nz
    logger.info('Replicating ' + str(numsegs) + ' water segments, ' + str(nx) + ' by ' + str(ny) + ' by ' + str(nz))

    # Check that we won't run out of segment name characters, and switch to
    # using hexadecimal or alphanumeric naming schemes in cases where decimal
    # numbered segnames won't fit into the field width.
    testsegname = '{0}{1:d}'.format(prefix, numsegs + preexist)
    testsegnamehex = '{0}{1:X}'.format(prefix, numsegs + preexist)
    writemode = 'decimal'
    if len(testsegname) > 4 and len(testsegnamehex) <= 4:
        writemode = 'hex'
        logger.warning('Warning: decimal naming would overrun segname field. Using hexadecimal segnames instead...')
    elif len(testsegnamehex) > 4:
        writemode = 'alphanum'
        logger.warning('Warning: decimal or hex naming would overrun segname field. Using alphanumeric segnames instead...')

    minx = minmol[0] - buffer
    miny = minmol[1] - buffer
    minz = minmol[2] - buffer
    maxx = maxmol[0] + buffer
    maxy = maxmol[1] + buffer
    maxz = maxmol[2] + buffer

    bar = ProgressBar(nx * ny * nz, description='Solvating')
    waterboxes = np.empty(numsegs, dtype=object)
    n = preexist
    w = 0
    for i in range(nx):
        movex = xmin + i * watsize
        movexmax = movex + watsize
        xoverlap = True
        if movex > maxx or movexmax < minx:
            xoverlap = False
        for j in range(ny):
            movey = ymin + j * watsize
            moveymax = movey + watsize
            yoverlap = True
            if movey > maxy or moveymax < miny:
                yoverlap = False
            for k in range(nz):
                movez = zmin + k * watsize
                movezmax = movez + watsize
                zoverlap = True
                if movez > maxz or movezmax < minz:
                    zoverlap = False

                if writemode == 'decimal':
                    segname = '{0}{1:d}'.format(prefix, n)
                elif writemode == 'hex':
                    segname = '{0}{1:x}'.format(prefix, n)
                elif writemode == 'alphanum':
                    segname = '{0}{1:c}{2:c}{3:c}'.format(prefix,
                                                          int(np.floor(np.floor(n / 26) / 26) + 65),
                                                          int(np.mod(np.floor(n / 26), 26) + 65),
                                                          int(np.mod(n, 26) + 65))

                waterboxes[w] = water.copy()
                waterboxes[w].moveBy([movex, movey, movez])
                waterboxes[w].set('segid', segname)
                mol.append(waterboxes[w])
                watsel = mol.segid == segname

                selover = np.zeros(len(watsel), dtype=bool)
                if xoverlap and yoverlap and zoverlap:
                    # Remove water overlapping with other segids
                    selover = _overlapWithOther(mol, segname, buffer)
                # Remove water outside the boundaries
                selout = _outOfBoundaries(mol, segname, xmin, xmax, ymin, ymax, zmin, zmax)
                sel = selover | selout

                #mol.write('temp.pdb')
                mol.filter(mol.segid != segname, _logger=False)
                waterboxes[w].filter(np.invert(sel[watsel]), _logger=False)
                #waterboxes[w].write('wat' + str(w) + '.pdb')

                n += 1
                w += 1
                bar.progress()
    bar.stop()

    waters = 0
    for i in range(numsegs):
        waters += waterboxes[i].numAtoms
        if waterboxes[i].numAtoms != 0:
            mol.append(waterboxes[i])

    logger.info('{} water molecules were added to the system.'.format(int(waters / 3)))
    return mol
def fitSoftTorsion(self, angle, geomopt=True):
    bkp_coords = self.coords.copy()
    phi_to_fit = None
    frozens = []
    for d in self._soft_dihedrals:
        if (d.atoms == angle).all():
            phi_to_fit = d
            frozens.append(d.atoms)
        else:
            if not geomopt:
                frozens.append(d.atoms)
    if not phi_to_fit:
        raise ValueError("specified phi is not a recognised soft dihedral")
    self._makeDihedralUnique(phi_to_fit)

    atoms = phi_to_fit.atoms
    equivs = phi_to_fit.equivalents

    # Number of rotamers for each dihedral to compute
    nrotamer = 36

    # Create a copy of molecule with nrotamer frames
    mol = self.copy()
    for _ in range(nrotamer - 1):
        mol.appendFrames(self)
    assert mol.numFrames == nrotamer

    # Set rotamer coordinates
    angles = np.linspace(-np.pi, np.pi, num=nrotamer, endpoint=False)
    for frame, angle in enumerate(angles):
        mol.frame = frame
        mol.setDihedral(atoms, angle, bonds=mol.bonds)

    dirname = "dihedral-single-point"
    if geomopt:
        dirname = "dihedral-opt"

    dih_name = "%s-%s-%s-%s" % (self.name[atoms[0]], self.name[atoms[1]], self.name[atoms[2]], self.name[atoms[3]])
    fitdir = os.path.join(self.outdir, dirname, dih_name, self.output_directory_name())
    try:
        os.makedirs(fitdir, exist_ok=True)
    except:
        raise OSError('Directory {} could not be created. Check if you have permissions.'.format(fitdir))

    qmset = QMCalculation(mol, charge=self.netcharge, directory=fitdir, frozen=frozens, optimize=geomopt,
                          theory=self.theory, solvent=self.solvent, basis=self.basis, execution=self.execution,
                          code=self.qmcode)

    ret = self._makeDihedralFittingSetFromQMResults(atoms, qmset.results())

    # Get the initial parameters of the dihedral we are going to fit
    param = self._prm.dihedralParam(self._rtf.type_by_index[atoms[0]],
                                    self._rtf.type_by_index[atoms[1]],
                                    self._rtf.type_by_index[atoms[2]],
                                    self._rtf.type_by_index[atoms[3]])

    # Save these parameters as the best fit (fit to beat)
    best_param = np.zeros((13))
    for t in range(6):
        best_param[t] = param[t].k0
        best_param[t + 6] = param[t].phi0
    best_param[12] = 0.

    # Evaluate the mm potential with this dihedral zeroed out.
    # The objective function will try to fit to the delta between
    # the QM potential and this modified mm potential.
    for t in param:
        t.k0 = t.phi0 = 0.
        # t.e14 = 1.  # Use whatever e14 has been inherited for the type
    self._prm.updateDihedral(param)

    ffeval = FFEvaluate(self)

    # Now evaluate the ff without the dihedral being fitted
    for t in range(ret.N):
        mm_zeroed = ffeval.run(ret.coords[t][:, :, 0])['total']
        ret.mm_delta.append(ret.qm[t] - mm_zeroed)
        ret.mm_zeroed.append(mm_zeroed)
    mmin1 = min(ret.mm_zeroed)
    mmin2 = min(ret.mm_delta)
    for t in range(ret.N):
        ret.mm_zeroed[t] = ret.mm_zeroed[t] - mmin1
        ret.mm_delta[t] = ret.mm_delta[t] - mmin2

    self._fitDihedral_results = ret
    self._fitDihedral_phi = param

    # Now measure all of the soft dihedral phis that are mapped to this dihedral
    ret.phis = []
    for iframe in range(ret.N):
        ret.phis.append([ret.phi[iframe]])
        for atoms in equivs:
            angle = dihedralAngle(ret.coords[iframe][atoms, :, 0])
            ret.phis[iframe].append(angle)

    best_chisq = self._fitDihedral_objective(best_param)

    bar = ProgressBar(64, description="Fitting")
    for iframe in range(64):
        (bounds, start) = self._fitDihedral_make_bounds(iframe)
        xopt = optimize.minimize(self._fitDihedral_objective, start, method="L-BFGS-B", bounds=bounds,
                                 options={'disp': False})
        chisq = self._fitDihedral_objective(xopt.x)
        if chisq < best_chisq:
            best_chisq = chisq
            best_param = xopt.x
        bar.progress()
    bar.stop()

    # Update the target dihedral with the optimized parameters
    for iframe in range(6):
        param[iframe].k0 = best_param[0 + iframe]
        param[iframe].phi0 = best_param[6 + iframe]
    self._prm.updateDihedral(param)

    param = self._prm.dihedralParam(self._rtf.type_by_index[atoms[0]],
                                    self._rtf.type_by_index[atoms[1]],
                                    self._rtf.type_by_index[atoms[2]],
                                    self._rtf.type_by_index[atoms[3]])

    # Finally evaluate the fitted potential
    ffeval = FFEvaluate(self)
    for t in range(ret.N):
        ret.mm_fitted.append(ffeval.run(ret.coords[t][:, :, 0])['total'])
    mmin = min(ret.mm_fitted)
    chisq = 0.
    for t in range(ret.N):
        ret.mm_fitted[t] = ret.mm_fitted[t] - mmin
        delta = ret.mm_fitted[t] - ret.qm[t]
        chisq = chisq + (delta * delta)
    ret.chisq = chisq

    # TODO Score it
    self.coords = bkp_coords
    return ret
def fitSoftTorsion(self, phi, geomopt=True):
    found = False
    phi_to_fit = None
    frozens = []
    dih_index = 0
    i = 0
    bkp_coords = self.coords.copy()
    for d in self._soft_dihedrals:
        if (d.atoms == phi).all():
            phi_to_fit = d
            dih_index = i
            frozens.append(d.atoms)
        else:
            if not geomopt:
                frozens.append(d.atoms)
        i += 1
    if not phi_to_fit:
        raise ValueError("specified phi is not a recognised soft dihedral")
    self._makeDihedralUnique(phi_to_fit)

    atoms = phi_to_fit.atoms
    left = phi_to_fit.left
    right = phi_to_fit.right
    equivs = phi_to_fit.equivalents

    step = 10  # degrees
    nstep = int(360 / step)
    cset = np.zeros((self.natoms, 3, nstep))

    i = 0
    for phi in range(-180, 180, step):
        cset[:, :, i] = setPhi(self.coords[:, :, 0], atoms, left, right, phi)
        i += 1

    mol = self.copy()
    mol.coords = cset

    dirname = "dihedral-single-point"
    if geomopt:
        dirname = "dihedral-opt"

    dih_name = "%s-%s-%s-%s" % (self.name[atoms[0]], self.name[atoms[1]], self.name[atoms[2]], self.name[atoms[3]])
    fitdir = os.path.join(self.outdir, dirname, dih_name, self.output_directory_name())
    try:
        os.makedirs(fitdir, exist_ok=True)
    except:
        raise OSError('Directory {} could not be created. Check if you have permissions.'.format(fitdir))

    qmset = QMCalculation(mol, charge=self.netcharge, directory=fitdir, frozen=frozens, optimize=geomopt,
                          theory=self.theory, solvent=self.solvent, basis=self.basis, execution=self.execution,
                          code=self.qmcode)
    ret = self._makeDihedralFittingSetFromQMResults(atoms, qmset.results())

    # Get the initial parameters of the dihedral we are going to fit
    param = self._prm.dihedralParam(self._rtf.type_by_index[atoms[0]],
                                    self._rtf.type_by_index[atoms[1]],
                                    self._rtf.type_by_index[atoms[2]],
                                    self._rtf.type_by_index[atoms[3]])

    # Save these parameters as the best fit (fit to beat)
    best_param = np.zeros((13))
    for t in range(6):
        best_param[t] = param[t].k0
        best_param[t + 6] = param[t].phi0
    best_param[12] = 0.
    # print(param)

    # Evaluate the mm potential with this dihedral zeroed out.
    # The objective function will try to fit to the delta between
    # the QM potential and this modified mm potential.
    for t in param:
        t.k0 = t.phi0 = 0.
        t.e14 = 1.  # Always fit with e14 scaling of 1. per CHARMM
    self._prm.updateDihedral(param)

    ffe = FFEvaluate(self)
    # print(ffe.evaluate( ret.coords[0] ) )
    # input

    # Now evaluate the ff without the dihedral being fitted
    for t in range(ret.N):
        mm_zeroed = (ffe.evaluate(ret.coords[t])["total"])
        ret.mm_delta.append(ret.qm[t] - mm_zeroed)
        ret.mm_zeroed.append(mm_zeroed)
    mmin1 = min(ret.mm_zeroed)
    mmin2 = min(ret.mm_delta)
    for t in range(ret.N):
        ret.mm_zeroed[t] = ret.mm_zeroed[t] - mmin1
        ret.mm_delta[t] = ret.mm_delta[t] - mmin2

    self._fitDihedral_results = ret
    self._fitDihedral_phi = param

    # Now measure all of the soft dihedral phis that are mapped to this dihedral
    ret.phis = []
    for i in range(ret.N):
        ret.phis.append([ret.phi[i]])
        for e in equivs:
            ret.phis[i].append(getPhi(ret.coords[i], e))
    # print ("EQUIVALENT DIHEDRALS FOR THIS DIHEDRAL" )
    # print(equivs)
    # print ("PHI VALUES TO FIT")
    # print (ret.phis)

    # Set up the NOLOPT fit
    # There are 13 parameters, k,phi for n=1,2,3,4,5,6 and a shift
    N = 13
    # initial guess,
    st = np.zeros(13)
    # bounds

    best_chisq = self._fitDihedral_objective(best_param)
    # print("CHISQ of initial = %f" % ( best_chisq ) )

    # Now zero out the terms of the dihedral we are going to fit
    bar = ProgressBar(64, description="Fitting")
    for i in range(64):
        (bounds, start) = self._fitDihedral_make_bounds(i)
        xopt = optimize.minimize(self._fitDihedral_objective, start, method="L-BFGS-B", bounds=bounds,
                                 options={'disp': False})
        chisq = self._fitDihedral_objective(xopt.x)
        # print( "CHISQ of fit = %f " % (chisq) )
        if chisq < best_chisq:
            best_chisq = chisq
            best_param = xopt.x
        bar.progress()
    bar.stop()
    # print("Best ChiSQ = %f" %(best_chisq) )

    # Update the target dihedral with the optimized parameters
    # print(param)
    # print(best_param )
    for i in range(6):
        param[i].k0 = best_param[0 + i]
        param[i].phi0 = best_param[6 + i]
    self._prm.updateDihedral(param)
    # print(param)
    param = self._prm.dihedralParam(self._rtf.type_by_index[atoms[0]],
                                    self._rtf.type_by_index[atoms[1]],
                                    self._rtf.type_by_index[atoms[2]],
                                    self._rtf.type_by_index[atoms[3]])
    # print(param)

    # Finally evaluate the fitted potential
    ffe = FFEvaluate(self)
    for t in range(ret.N):
        ret.mm_fitted.append(ffe.evaluate(ret.coords[t])["total"])
    mmin = min(ret.mm_fitted)
    chisq = 0.

    # print( "QM energies" )
    # print( ret.qm )

    for t in range(ret.N):
        ret.mm_fitted[t] = ret.mm_fitted[t] - mmin
        delta = ret.mm_fitted[t] - ret.qm[t]
        chisq = chisq + (delta * delta)
    ret.chisq = chisq

    # TODO Score it
    self.coords = bkp_coords
    return ret
def project(self, ndim=None):
    """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

    Parameters
    ----------
    ndim : int
        The number of TICA dimensions we want to project the data on. If None is given it will choose a number
        of dimensions to cover 95% of the kinetic variance.

    Returns
    -------
    dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
        A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> tica = TICA(data,20)
    >>> dataTica = tica.project(5)
    """
    if ndim is not None:
        # self.tic._dim = ndim  # Old way of doing it. Deprecated since pyEMMA 2.1
        self.tic.set_params(dim=ndim)  # Change to this in 2.1 pyEMMA version

    if isinstance(self.data, Metric):  # Doesn't project on correct number of dimensions
        proj = []
        refs = []
        fstep = None

        metr = self.data
        p = ProgressBar(len(metr.simulations))
        k = -1
        droppedsims = []
        for projecteddata in _projectionGenerator(metr, _getNcpus()):
            for pro in projecteddata:
                k += 1
                if pro is None:
                    droppedsims.append(k)
                    continue
                proj.append(self.tic.transform(pro[0]))
                refs.append(pro[1])
                if fstep is None:
                    fstep = pro[2]
            p.progress(len(projecteddata))
        p.stop()

        simlist = self.data.simulations
        simlist = np.delete(simlist, droppedsims)
        ref = np.array(refs, dtype=object)
        #fstep = 0
        parent = None
    else:
        proj = self.tic.get_output()
        simlist = self.data.simlist
        ref = self.data.ref
        fstep = self.data.fstep
        parent = self.data

    if ndim is None:
        # Use the dimensionality chosen by pyEMMA so that the DataFrame below can be built
        ndim = self.tic.dimension()
        logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(ndim))

    from htmd.metricdata import MetricData
    datatica = MetricData(dat=np.array(proj, dtype=object), simlist=simlist, ref=ref, fstep=fstep, parent=parent)
    from pandas import DataFrame
    types = []
    indexes = []
    description = []
    for i in range(ndim):
        types += ['tica']
        indexes += [-1]
        description += ['TICA dimension {}'.format(i + 1)]
    datatica.map = DataFrame({'type': types, 'indexes': indexes, 'description': description})

    return datatica
def tileMembrane(memb, xmin, ymin, xmax, ymax, buffer=1.5):
    """ Tile a membrane in the X and Y dimensions to reach a specific size.

    Parameters
    ----------
    memb : :class:`Molecule <htmd.molecule.molecule.Molecule>` object
        The membrane to be tiled
    xmin : float
        Minimum x coordinate
    ymin : float
        Minimum y coordinate
    xmax : float
        Maximum x coordinate
    ymax : float
        Maximum y coordinate
    buffer : float
        Buffer distance between tiles

    Returns
    -------
    megamemb : :class:`Molecule <htmd.molecule.molecule.Molecule>` object
        The tiled membrane
    """
    from htmd.progress.progress import ProgressBar
    memb = memb.copy()
    memb.resid = sequenceID((memb.resid, memb.insertion, memb.chain, memb.segid))

    minmemb = np.min(memb.get('coords', 'water'), axis=0).flatten()

    size = np.max(memb.get('coords', 'water'), axis=0) - np.min(memb.get('coords', 'water'), axis=0)
    size = size.flatten()
    xreps = int(np.ceil((xmax - xmin) / size[0]))
    yreps = int(np.ceil((ymax - ymin) / size[1]))

    logger.info('Replicating Membrane {}x{}'.format(xreps, yreps))

    from htmd.molecule.molecule import Molecule
    megamemb = Molecule()
    bar = ProgressBar(xreps * yreps, description='Replicating Membrane')
    k = 0
    for x in range(xreps):
        for y in range(yreps):
            tmpmemb = memb.copy()
            xpos = xmin + x * (size[0] + buffer)
            ypos = ymin + y * (size[1] + buffer)

            tmpmemb.moveBy([-float(minmemb[0]) + xpos, -float(minmemb[1]) + ypos, 0])
            tmpmemb.remove('same resid as (x > {} or y > {})'.format(xmax, ymax), _logger=False)
            if tmpmemb.numAtoms == 0:
                continue

            tmpmemb.set('segid', 'M{}'.format(k), sel='not water')
            tmpmemb.set('segid', 'MW{}'.format(k), sel='water')

            megamemb.append(tmpmemb)
            k += 1
            bar.progress()
    bar.stop()

    # Membranes don't tile perfectly. Need to remove waters that clash with lipids of other tiles.
    # Some clashes will still occur between periodic images however
    megamemb.remove('same resid as water and within 1.5 of not water', _logger=False)
    return megamemb
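# Usage sketch for tileMembrane. The input file name and the box extents below are hypothetical,
# not taken from this module; coordinates follow the units of the Molecule object (Angstrom in HTMD).
def _example_tileMembrane():
    from htmd.molecule.molecule import Molecule
    memb = Molecule('membrane.pdb')             # hypothetical pre-equilibrated membrane patch
    big = tileMembrane(memb, 0, 0, 100, 100)    # replicate it to cover a 100 x 100 area in X/Y
    return big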
def fitSoftDihedral(self, phi, geomopt=True):
    phi_to_fit = None
    frozens = []
    dih_index = 0
    i = 0
    bkp_coords = self.coords.copy()

    # Find the soft dihedral to fit. If we are not geometry-optimizing, freeze all other soft dihedrals
    for d in self._soft_dihedrals:
        if (d.atoms == phi).all():
            phi_to_fit = d
            dih_index = i
            frozens.append(d.atoms)
        else:
            if not geomopt:
                frozens.append(d.atoms)
        i = i + 1
    if not phi_to_fit:
        raise ValueError("specified phi is not a recognised soft dihedral")
    self._makeDihedralUnique(phi_to_fit)

    atoms = phi_to_fit.atoms
    left = phi_to_fit.left
    right = phi_to_fit.right
    equivs = phi_to_fit.equivalents

    # Generate a conformation every `step` degrees around the dihedral
    step = 10  # degrees
    nstep = int(360 / step)
    cset = np.zeros((self.natoms, 3, nstep))

    i = 0
    for phi in range(-180, 180, step):
        cset[:, :, i] = setPhi(self.coords[:, :, 0], atoms, left, right, phi)
        i = i + 1

    mol = self.copy()
    mol.coords = cset

    dirname = "dihedral-single-point"
    if geomopt:
        dirname = "dihedral-opt"
    try:
        os.mkdir(dirname)
    except OSError:
        pass

    dih_name = "%s-%s-%s-%s" % (self.name[atoms[0]], self.name[atoms[1]], self.name[atoms[2]], self.name[atoms[3]])

    qmset = QMCalculation(mol, charge=self.netcharge, directory=os.path.join(dirname, dih_name),
                          frozen=frozens, optimize=geomopt)
    ret = self._makeDihedralFittingSetFromQMResults(atoms, qmset.results())

    # Get the initial parameters of the dihedral we are going to fit
    param = self._prm.dihedralParam(self._rtf.type_by_index[atoms[0]],
                                    self._rtf.type_by_index[atoms[1]],
                                    self._rtf.type_by_index[atoms[2]],
                                    self._rtf.type_by_index[atoms[3]])

    # Save these parameters as the best fit (the fit to beat)
    best_param = np.zeros(13)
    for t in range(6):
        best_param[t] = param[t].k0
        best_param[t + 6] = param[t].phi0
    best_param[12] = 0.

    # Evaluate the MM potential with this dihedral zeroed out.
    # The objective function will try to fit to the delta between
    # the QM potential and this modified MM potential
    for t in param:
        t.k0 = t.phi0 = 0.
        t.e14 = 1.  # Always fit with e14 scaling of 1, as per CHARMM
    self._prm.updateDihedral(param)

    ffe = FFEvaluate(self)

    # Now evaluate the force field without the dihedral being fitted
    for t in range(ret.N):
        mm_zeroed = ffe.evaluate(ret.coords[t])["total"]
        ret.mm_delta.append(ret.qm[t] - mm_zeroed)
        ret.mm_zeroed.append(mm_zeroed)
    mmin1 = min(ret.mm_zeroed)
    mmin2 = min(ret.mm_delta)
    for t in range(ret.N):
        ret.mm_zeroed[t] = ret.mm_zeroed[t] - mmin1
        ret.mm_delta[t] = ret.mm_delta[t] - mmin2

    self._fitDihedral_results = ret
    self._fitDihedral_phi = param

    # Now measure all of the soft dihedral phis that are mapped to this dihedral
    ret.phis = []
    for i in range(ret.N):
        ret.phis.append([ret.phi[i]])
        for e in equivs:
            ret.phis[i].append(getPhi(ret.coords[i], e))

    # Set up the fit: 13 parameters, k and phi0 for n = 1..6 plus a constant shift
    best_chisq = self._fitDihedral_objective(best_param)

    # Try 64 different bound/start combinations and keep the best fit
    bar = ProgressBar(64, description="Fitting")
    for i in range(64):
        (bounds, start) = self._fitDihedral_make_bounds(i)
        xopt = optimize.minimize(self._fitDihedral_objective, start, method="L-BFGS-B",
                                 bounds=bounds, options={'disp': False})
        chisq = self._fitDihedral_objective(xopt.x)
        if chisq < best_chisq:
            best_chisq = chisq
            best_param = xopt.x
        bar.progress()
    bar.stop()

    # Update the target dihedral with the optimized parameters
    for i in range(6):
        param[i].k0 = best_param[0 + i]
        param[i].phi0 = best_param[6 + i]
    self._prm.updateDihedral(param)
    param = self._prm.dihedralParam(self._rtf.type_by_index[atoms[0]],
                                    self._rtf.type_by_index[atoms[1]],
                                    self._rtf.type_by_index[atoms[2]],
                                    self._rtf.type_by_index[atoms[3]])

    # Finally evaluate the fitted potential and compute its chi-squared against the QM energies
    ffe = FFEvaluate(self)
    for t in range(ret.N):
        ret.mm_fitted.append(ffe.evaluate(ret.coords[t])["total"])
    mmin = min(ret.mm_fitted)
    chisq = 0.
    for t in range(ret.N):
        ret.mm_fitted[t] = ret.mm_fitted[t] - mmin
        delta = ret.mm_fitted[t] - ret.qm[t]
        chisq = chisq + (delta * delta)
    ret.chisq = chisq  # TODO: score it

    self.coords = bkp_coords
    return ret
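# The 13-parameter vector optimized above packs k_n (n = 1..6), phi0_n (n = 1..6) and a constant
# offset. The actual _fitDihedral_objective is not shown in this file; the helper below is an
# assumed sketch of the CHARMM-style dihedral energy it evaluates for a single dihedral angle
# chi (in degrees), summed over the six multiplicities. It uses the module's `numpy as np` import.
def _dihedral_energy_sketch(param, chi):
    # E(chi) = sum_n k_n * (1 + cos(n*chi - phi0_n)) + offset
    e = param[12]
    for n in range(1, 7):
        k = param[n - 1]
        phi0 = param[n + 5]
        e += k * (1. + np.cos(np.radians(n * chi - phi0)))
    return e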
def _run_qm_jobs_lsf(self, directory):
    cmd = self._env['BIN_G09']

    # Collect all Gaussian input files that do not yet have an output file next to them
    fni = []
    fno = []
    for root, dirs, files in os.walk(directory):
        for f in files:
            if f.endswith(".gjf"):
                op = f.replace(".gjf", ".out")
                if not os.path.exists(os.path.join(root, op)):  # check in the input's directory, not the cwd
                    fni.append(os.path.join(root, f))
                    fno.append(os.path.join(root, op))

    if len(fni):
        # Make an LSF script for each input
        for i in range(len(fni)):
            fpbs = fni[i] + ".lsf"
            if not os.path.exists(fpbs):
                f = open(fpbs, "w")
                print("#BSUB -n %d" % self._config.NCORES, file=f)
                print("#BSUB -R \"span[ptile=%d]\"" % self._config.NCORES, file=f)
                print("#BSUB -W 24:00", file=f)
                print("#BSUB -J gaussian", file=f)
                print("#BSUB -app gaussian", file=f)
                print("#BSUB -o /dev/null", file=f)
                print("#BSUB -M %d000" % self._config.MEMORY, file=f)
                print("\nmodule load gaussian\n", file=f)
                print("cd \"%s\"" % directory, file=f)
                print("\"%s\" %s %s" % (cmd, fni[i], fno[i]), file=f)
                f.close()

            # Look to see if there is already a job submitted. If not, submit it
            fpbsstate = fni[i] + ".jobid"
            if not os.path.exists(fpbsstate):
                # Submit with bsub, saving the job id to file
                subprocess.check_output("\"" + self._env['BIN_BSUB'] + "\" < \"" + fpbs + "\" > \"" + fpbsstate + "\"", shell=True)

        # Finally monitor progress. Continue until all jobs have produced an output.
        # NB TODO FIXME: should also poll the queue (bjobs) to see if the job is still live
        bar = ProgressBar(len(fni), description="Running QM Calculations")
        complete = False
        lastcount = 0
        while not complete:
            count = 0
            complete = True
            for i in fno:
                try:
                    os.stat(i)  # Try to flush any cache (for NFS)
                except OSError:
                    pass
                if os.access(i, os.R_OK):
                    count += 1
                else:
                    complete = False
            while lastcount < count:
                bar.progress()
                lastcount += 1
            time.sleep(10)
        bar.stop()

    time.sleep(5)  # A bit of time for any output file to complete writing
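# The submit loop above writes one LSF script per Gaussian input. For reference, the helper below
# reproduces that script as a string. It is a sketch: the argument names are hypothetical and
# mirror self._config.NCORES, self._config.MEMORY and self._env['BIN_G09'] used above.
def _render_lsf_script_sketch(gaussian_binary, directory, infile, outfile, ncores, memory_gb):
    lines = [
        "#BSUB -n %d" % ncores,
        "#BSUB -R \"span[ptile=%d]\"" % ncores,
        "#BSUB -W 24:00",
        "#BSUB -J gaussian",
        "#BSUB -app gaussian",
        "#BSUB -o /dev/null",
        "#BSUB -M %d000" % memory_gb,
        "\nmodule load gaussian\n",
        "cd \"%s\"" % directory,
        "\"%s\" %s %s" % (gaussian_binary, infile, outfile),
    ]
    return "\n".join(lines) + "\n"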