def getResnums(self, gaps=False):
    """Return list of residue numbers associated with non-gapped *seq*.

    When *gaps* is **True**, return a list containing the residue numbers
    with gaps appearing as **None**.

    Residue numbers are inferred from the full label.  When the label does
    not contain usable residue number information (start/end values cannot
    be converted to integers, or their span does not match the number of
    residues), a range of numbers starting from 1 is returned instead."""

    title, start, end = splitSeqLabel(self.getLabel(True))
    n_res = self.numResidues()
    try:
        start, end = int(start), int(end)
    except (TypeError, ValueError):
        # only conversion failures mean "no residue numbers in the label";
        # anything else (e.g. KeyboardInterrupt) must propagate
        LOGGER.info('Cannot parse label start, end values, Setting '
                    'resnums 1 to {0:d}'.format(n_res))
        start, end = 1, n_res
    else:
        if (end - start + 1) != n_res:
            LOGGER.info('Label start-end position does not match '
                        'length of ungapped sequence. Setting '
                        'resnums 1 to {0:d}'.format(n_res))
            start, end = 1, n_res

    resnums = iter(range(start, end + 1))
    if gaps:
        # consume one residue number per alphabetic character, None for gaps
        return [next(resnums) if torf else None
                for torf in char.isalpha(self._array)]
    return list(resnums)
def deformAtoms(atoms, mode, rmsd=None):
    """Append to *atoms* a coordinate set displaced along *mode*.

    *atoms* must be an :class:`.AtomGroup` instance and *mode* a 3-d
    :class:`.VectorBase` instance with the same number of atoms.  The new
    coordinate set is the active coordinate set plus the mode array.  When
    *rmsd* is given, the mode array is first multiplied by a scalar chosen
    so that the new coordinate set has that RMSD from the active one."""

    if not isinstance(atoms, AtomGroup):
        raise TypeError('atoms must be an AtomGroup, not {0}'
                        .format(type(atoms)))
    if not isinstance(mode, VectorBase):
        raise TypeError('mode must be a Mode or Vector instance, '
                        'not {0}'.format(type(mode)))
    if not mode.is3d():
        raise ValueError('mode must be from a 3-dimensional model.')
    if atoms.numAtoms() != mode.numAtoms():
        raise ValueError('number of atoms do not match')

    displacement = mode.getArrayNx3()
    if rmsd is None:
        atoms.addCoordset(atoms.getCoords() + displacement)
        return

    rmsd = float(rmsd)
    # rmsd = ( ((scalar * array)**2).sum() / n_atoms )**0.5
    sum_of_squares = (displacement**2).sum()
    scalar = (atoms.numAtoms() * rmsd**2 / sum_of_squares)**0.5
    LOGGER.info('Mode is scaled by {0}.'.format(scalar))
    atoms.addCoordset(atoms.getCoords() + displacement * scalar)
def _superpose(self, **kwargs):
    """Superpose conformations onto the reference and update coordinates.

    Transformations are calculated with :func:`getTransformation`, using
    only the selected atoms when ``self._indices`` is set, and applied to
    all atoms of each conformation in place.  When keyword *trans* is true,
    the 4x4 transformation matrices are stored in ``self._trans``;
    otherwise ``self._trans`` is set to **None**."""

    if kwargs.get('trans', False):
        if self._trans is not None:
            LOGGER.info('Existing transformations will be overwritten.')
        trans = np.zeros((self._n_csets, 4, 4))
    else:
        trans = None

    confs = self._confs
    indices = self._indices
    if indices is None:
        weights = self._weights
        reference = self._coords
        selected = confs
    else:
        weights = self._weights[:, indices]
        reference = self._coords[indices]
        selected = confs[:, indices]

    for i, conf in enumerate(selected):
        rmat, tvec = getTransformation(conf, reference, weights[i])
        if trans is not None:
            # rotation in the upper-left 3x3, translation in the 4th column
            trans[i][:3, :3] = rmat
            trans[i][:3, 3] = tvec
        confs[i] = tvec + np.dot(confs[i], rmat.T)
    self._trans = trans
def parsePDBs(self, **kwargs):
    """Load PDB files into memory using :func:`.parsePDB` and select the
    residue ranges given by CATH.

    All keyword arguments are passed to :func:`.parsePDB`.  Unless
    ``model=0`` is given, each parsed structure is replaced in place by
    the selection made with the matching selection string.  The value
    returned by :func:`.parsePDB` is returned."""

    pdbs = self.getPDBs(True)
    selstrs = self.getSelStrs()
    header = kwargs.get('header', False)
    model = kwargs.get('model', None)

    LOGGER.timeit('_cath_parsePDB')
    LOGGER.info('Parsing {0} PDB files...'.format(len(pdbs)))
    ret = parsePDB(*pdbs, **kwargs)

    if model != 0:
        # with header=True the result is a (structures, headers) pair
        if header:
            prots, _ = ret
        else:
            prots = ret

        LOGGER.info('Extracting domains...')
        for i, prot in enumerate(prots):
            prots[i] = prot.select(selstrs[i])

    LOGGER.report('CATH domains are parsed and extracted in %.2fs',
                  '_cath_parsePDB')
    return ret
def iterpose(self, rmsd=0.0001):
    """Run :meth:`Ensemble.iterpose` with *rmsd*, restore the
    conformations as they were before that call, and perform a final
    :meth:`superpose` to calculate transformations."""

    # keep a copy, Ensemble.iterpose works on self._confs
    original_confs = self._confs.copy()
    Ensemble.iterpose(self, rmsd)
    self._confs = original_confs

    LOGGER.info('Final superposition to calculate transformations.')
    self.superpose()
def loadPDBClusters(sqid=None):
    """Load previously fetched PDB sequence clusters from disk to memory.

    :arg sqid: sequence identity level to load, by default all levels in
        ``PDB_CLUSTERS`` are loaded
    :type sqid: int

    Cluster files that are not found on disk are fetched with
    :func:`fetchPDBClusters` first.  File contents are stored in
    ``PDB_CLUSTERS``.  A warning is issued (at most once) when a cluster
    file is more than one week old.

    :raises ValueError: when *sqid* is not a key of ``PDB_CLUSTERS``"""

    global PDB_CLUSTERS_UPDATE_WARNING

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if sqid is None:
        sqid_list = list(PDB_CLUSTERS)
        LOGGER.info('Loading all PDB sequence clusters.')
    else:
        assert isinstance(sqid, int), 'sqid must be an integer'
        if sqid not in PDB_CLUSTERS:
            raise ValueError('PDB cluster data is not available for sequence '
                             'identity {0}%, try one of {1}'
                             .format(sqid, PDB_CLUSTERS_SQID_STR))
        LOGGER.info('Loading PDB sequence clusters for sequence identity '
                    '{0}.'.format(sqid))
        sqid_list = [sqid]

    for level in sqid_list:
        filename = os.path.join(PDB_CLUSTERS_PATH,
                                'bc-{0}.out.gz'.format(level))
        if not os.path.isfile(filename):
            fetchPDBClusters(level)

        if PDB_CLUSTERS_UPDATE_WARNING:
            import time
            # file age in weeks (604800 seconds per week)
            diff = (time.time() - os.path.getmtime(filename)) / 604800.
            if diff > 1.:
                LOGGER.warning('PDB sequence clusters are {0:.1f} week(s) old,'
                               ' call `fetchPDBClusters` to receive updates.'
                               .format(diff))
                PDB_CLUSTERS_UPDATE_WARNING = False

        inp = openFile(filename)
        try:
            PDB_CLUSTERS[level] = inp.read()
        finally:
            # release the handle even when reading fails
            inp.close()
def psiBlastRun(sequence, cycles=2, filename=None, **kwargs):
    """Returns the results from a full PSI-BLAST run (multiple cycles).
    All arguments are the same as psiBlastCycle and are passed to it
    except for cycles.

    :arg cycles: the number of cycles to run
        default is 2
    :type cycles: int

    Keywords *previousjobid* and *selectedHits* seed the first cycle; in
    later cycles they are derived from the job ID of the preceding cycle.

    Returns a tuple of the job IDs, the results of each cycle, and the
    sequence returned by the last cycle.
    """

    # These keywords are passed to psiBlastCycle explicitly below, so they
    # must be removed from kwargs; leaving them in would pass them twice and
    # raise "got multiple values for keyword argument".
    job_id = kwargs.pop('previousjobid', '')
    selectedHits = kwargs.pop('selectedHits', '')
    kwargs.pop('cycle', None)  # the cycle index is managed by this loop

    cycles_done = 0
    results_list = []
    job_ids = []
    while cycles_done < cycles:
        if cycles_done > 0:
            selectedHits = ('http://www.ebi.ac.uk/Tools/services/rest/'
                            'psiblast/result/' + job_id + '/preselected_seq')
            sequence = None
        job_id, results, sequence = psiBlastCycle(sequence, filename,
                                                  previousjobid=job_id,
                                                  selectedHits=selectedHits,
                                                  cycle=cycles_done,
                                                  **kwargs)
        results_list.append(results)
        job_ids.append(job_id)
        cycles_done += 1
        LOGGER.info('Finished cycle {0} with job ID {1}.'
                    .format(cycles_done, job_id))

    return job_ids, results_list, sequence
def pathPDBMirror(path=None, format=None):
    """Returns or specify PDB mirror path to be used by :func:`.fetchPDB`.

    Called without *path*, the stored mirror path is returned (as a
    ``(path, format)`` tuple when a format is stored); **None** is returned
    when no path is stored or the stored path is not a directory.

    Called with a *path* that is a directory, the path and *format* are
    saved in settings.  To release the current mirror, pass an invalid
    path, e.g. ``path=''``; :exc:`IOError` is raised when there is no
    mirror to release.

    If you are keeping a partial mirror, such as PDB files in
    :file:`/data/structures/divided/pdb/` folder, specify *format*, which
    is ``'pdb'`` in this case."""

    if path is None:
        # query mode
        path = SETTINGS.get('pdb_mirror_path')
        format = SETTINGS.get('pdb_mirror_format', None)
        if not path:
            return None
        if not isdir(path):
            LOGGER.warning('PDB mirror path {0} is not a accessible.'
                           .format(repr(path)))
            return None
        if format is None:
            return path
        return path, format

    if isdir(path):
        # set mode
        path = abspath(path)
        LOGGER.info('Local PDB mirror path is set: {0}'
                    .format(repr(path)))
        SETTINGS['pdb_mirror_path'] = path
        SETTINGS['pdb_mirror_format'] = format
        SETTINGS.save()
        return None

    # release mode
    current = SETTINGS.pop('pdb_mirror_path')
    if not current:
        raise IOError('{0} is not a valid path.'.format(repr(path)))
    LOGGER.info('PDB mirror {0} is released.'
                .format(repr(current)))
    SETTINGS.save()
def wwPDBServer(*key):
    """Set/get `wwPDB`_ FTP/HTTP server location used for downloading PDB
    structures.  Use one of the following keywords for setting a server:

    +---------------------------+-----------------------------+
    | wwPDB FTP server          | *Key* (case insensitive)    |
    +===========================+=============================+
    | RCSB PDB (USA) (default)  | RCSB, USA, US               |
    +---------------------------+-----------------------------+
    | PDBe (Europe)             | PDBe, Europe, Euro, EU      |
    +---------------------------+-----------------------------+
    | PDBj (Japan)              | PDBj, Japan, Jp             |
    +---------------------------+-----------------------------+

    Without arguments, the stored server key (or **None**) is returned.
    :exc:`TypeError` is raised for a non-string key or more than one
    argument, :exc:`ValueError` for an unknown key.

    .. _wwPDB: http://www.wwpdb.org/"""

    if not key:
        return SETTINGS.get('wwpdb', None)

    if len(key) != 1:
        raise TypeError('one wwPDB server identifier is expected, {0} given'
                        .format(len(key)))

    try:
        server = key[0].lower()
    except AttributeError:
        raise TypeError('key must be a string')

    if server not in WWPDB_FTP_SERVERS:
        raise ValueError('{0} is not a valid wwPDB server identifier'
                         .format(repr(server)))

    SETTINGS['wwpdb'] = server
    SETTINGS.save()
    LOGGER.info('wwPDB server is set to {}.'
                .format(WWPDB_FTP_SERVERS[server][0]))
def calcCrossProjection(ensemble, mode1, mode2, scale=None, **kwargs):
    """Return projection of conformational deviations onto modes from
    different models.

    :arg ensemble: ensemble for which deviations will be projected
    :type ensemble: :class:`.Ensemble`

    :arg mode1: normal mode to project conformations onto
    :type mode1: :class:`.Mode`, :class:`.Vector`

    :arg mode2: normal mode to project conformations onto
    :type mode2: :class:`.Mode`, :class:`.Vector`

    :arg scale: scale width of the projection onto mode ``x`` or ``y``;
        unless keyword *scalar* is given, the scaling factor is calculated
        and logged: its absolute value makes the widths of the two
        projections the same, and its sign makes the projections yield a
        positive correlation

    Keyword *rmsd* (default **True**) is passed to :func:`calcProjection`.
    Returns a tuple of the two projections."""

    if not isinstance(ensemble, (Ensemble, Conformation, Vector, TrajBase)):
        raise TypeError('ensemble must be Ensemble, Conformation, Vector, '
                        'or a Trajectory, not {0}'.format(type(ensemble)))
    if not isinstance(mode1, VectorBase):
        raise TypeError('mode1 must be a Mode instance, not {0}'
                        .format(type(mode1)))
    if not mode1.is3d():
        raise ValueError('mode1 must be 3-dimensional')
    if not isinstance(mode2, VectorBase):
        raise TypeError('mode2 must be a Mode instance, not {0}'
                        .format(type(mode2)))
    if not mode2.is3d():
        raise ValueError('mode2 must be 3-dimensional')

    if scale is not None:
        assert isinstance(scale, str), 'scale must be a string'
        scale = scale.lower()
        assert scale in ('x', 'y'), 'scale must be x or y'

    rmsd = kwargs.pop('rmsd', True)
    xcoords = calcProjection(ensemble, mode1, rmsd)
    ycoords = calcProjection(ensemble, mode2, rmsd)

    if not scale:
        return xcoords, ycoords

    scalar = kwargs.get('scalar', None)
    if scalar:
        assert isinstance(scalar, (float, int)), 'scalar must be a number'
    else:
        y_width = ycoords.max() - ycoords.min()
        x_width = xcoords.max() - xcoords.min()
        scalar = (y_width / x_width) * np.sign(np.dot(xcoords, ycoords))
        if scale == 'x':
            LOGGER.info('Projection onto {0} is scaled by {1:.2f}'
                        .format(mode1, scalar))
        else:
            # the computed factor maps x onto y, invert it to scale y
            scalar = 1 / scalar
            LOGGER.info('Projection onto {0} is scaled by {1:.2f}'
                        .format(mode2, scalar))

    if scale == 'x':
        xcoords = xcoords * scalar
    else:
        ycoords = ycoords * scalar
    return xcoords, ycoords
def pathVMD(*path):
    """Return VMD path, or set it to be a user specified *path*.

    Without arguments, the path stored in settings is returned when it is
    executable.  Otherwise VMD is looked for (in the registry on Windows,
    with :func:`which` elsewhere); when an executable is found it is saved
    with :func:`setVMDpath` and returned, and **None** is returned when it
    is not found.

    With a single argument, the path is saved in settings.

    :raises OSError: when the given *path* is not executable
    :raises ValueError: when more than one argument is given"""

    if len(path) > 1:
        raise ValueError('specify a single path string')

    if path:
        path = path[0]
        if not isExecutable(path):
            raise OSError('{0} is not executable.'.format(str(path)))
        SETTINGS['vmd'] = path
        SETTINGS.save()
        LOGGER.info("VMD path is set to '{0}'.".format(path))
        return None

    path = SETTINGS.get('vmd', None)
    if isExecutable(path):
        return path

    LOGGER.warning('VMD path is not set by user, looking for it.')
    vmdbin = None
    if PLATFORM == 'Windows':
        if PY3K:
            import winreg
        else:
            import _winreg as winreg  # PY3K: OK
        registry_roots = (
            'Software\\University of Illinois\\VMD\\',
            'Software\\WOW6432node\\University of Illinois\\VMD\\',
        )
        for vmdversion in ('1.8.7', '1.9', '1.9.1'):
            for root in registry_roots:
                try:
                    key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE,
                                         root + vmdversion)
                    vmddir = winreg.QueryValueEx(key, 'VMDDIR')[0]
                except OSError:
                    # this version is not registered under this root
                    continue
                vmdbin = join(vmddir, 'vmd.exe')
    else:
        vmdbin = which('vmd')

    if isExecutable(vmdbin):
        setVMDpath(vmdbin)
        return vmdbin
    return None
def setVMDpath(path):
    """Save *path* in settings as the path to the VMD executable.

    :raises OSError: when *path* is not executable"""

    if not isExecutable(path):
        raise OSError("{0:s} is not executable.".format(str(path)))

    SETTINGS["vmd"] = path
    SETTINGS.save()
    LOGGER.info("VMD path is set to '{0:s}'.".format(path))
def _evalAltlocs(atomgroup, altloc, chainids, resnums, resnames, atomnames):
    """Append a coordinate set to *atomgroup* for each alternate location.

    *altloc* maps an altloc identifier to a list of ``(line, index)``
    pairs, where *line* is an atom record and *index* its zero-based line
    number.  For each identifier, in sorted order, a copy of the active
    coordinates is updated with the coordinates read from the lines, and
    appended with label ``'altloc ' + key`` when at least one line was
    parsed.  Lines whose residue or atom cannot be matched in *chainids*,
    *resnums*, *resnames*, and *atomnames*, or whose coordinates cannot be
    read, are skipped with a warning."""

    # cache of per-chain atom indices (keyed by chain identifier) and of
    # per-residue (resname, indices, atomnames) (keyed by (chain, resnum))
    indices = {}
    for key in sorted(altloc):
        xyz = atomgroup.getCoords()
        success = 0
        lines = altloc[key]
        for line, i in lines:
            aan = line[12:16].strip()
            arn = line[17:21].strip()
            ach = line[21]
            ari = int(line[22:26].split()[0])
            rn, ids, ans = indices.get((ach, ari), (None, None, None))
            if ids is None:
                ids = indices.get(ach, None)
                if ids is None:
                    ids = (chainids == ach).nonzero()[0]
                    indices[ach] = ids
                ids = ids[resnums[ids] == ari]
                if len(ids) == 0:
                    LOGGER.warn("failed to parse altloc {0} at line {1}, "
                                "residue not present for altloc 'A'".format(
                                    repr(key), i+1))
                    continue
                rn = resnames[ids[0]]
                ans = atomnames[ids]
                indices[(ach, ari)] = (rn, ids, ans)
            if rn != arn:
                LOGGER.warn("failed to parse altloc {0} at line {1}, "
                            "residue name mismatch (expected {2}, "
                            "parsed {3})".format(repr(key), i+1, repr(rn),
                                                 repr(arn)))
                continue
            index = ids[(ans == aan).nonzero()[0]]
            if len(index) != 1:
                LOGGER.warn("failed to parse altloc {0} at line {1}, atom"
                            " {2} not found in the residue"
                            .format(repr(key), i+1, repr(aan)))
                continue
            try:
                # parse all three values before touching xyz, so that a
                # malformed field cannot leave an atom partially updated
                new_xyz = (float(line[30:38]), float(line[38:46]),
                           float(line[46:54]))
            except ValueError:
                LOGGER.warn('failed to parse altloc {0} at line {1}, could'
                            ' not read coordinates'.format(repr(key), i+1))
                continue
            xyz[index[0]] = new_xyz
            success += 1
        LOGGER.info('{0} out of {1} altloc {2} lines were parsed.'
                    .format(success, len(lines), repr(key)))
        if success > 0:
            LOGGER.info('Altloc {0} is appended as a coordinate set to '
                        'atomgroup {1}.'.format(repr(key),
                                                atomgroup.getTitle()))
            atomgroup.addCoordset(xyz, label='altloc ' + key)
def showMeanMechStiff(model, coords, header, chain='A', *args, **kwargs):
    """Show mean value of effective spring constant with secondary
    structure taken from MechStiff.

    :arg model: object whose ``getStiffness`` method returns the stiffness
        matrix
    :arg coords: object whose ``getResnums`` method returns residue numbers
    :arg header: mapping with ``'sheet_range'`` and ``'helix_range'`` items,
        needed to obtain secondary structure ranges
    :arg chain: identifier of the chain whose secondary structure is drawn
    :type chain: str

    Remaining arguments are passed to :func:`matplotlib.pyplot.imshow`.
    Keyword ``jet_r`` selects the reversed jet color map (similar to VMD
    program coding) and keyword ``nearest`` selects nearest interpolation;
    both are translated to the corresponding ``imshow`` keywords.
    """

    meanStiff = np.array([np.mean(model.getStiffness(), axis=0)])
    stiff = meanStiff[0]

    import matplotlib
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches
    fig = plt.figure(figsize=[18, 6], facecolor='w', dpi=100)

    # Translate the convenience flags into real imshow keywords; passing
    # them through unchanged would make imshow fail on unknown keywords.
    if 'jet_r' in kwargs:
        kwargs.pop('jet_r')
        kwargs.setdefault('cmap', 'jet_r')
    if 'nearest' in kwargs:
        kwargs.pop('nearest')
        kwargs.setdefault('interpolation', 'nearest')

    resnums = coords.getResnums()
    first_resnum = resnums[0]
    stiff_min, stiff_max = np.min(stiff), np.max(stiff)

    with plt.style.context('fivethirtyeight'):
        ax = fig.add_subplot(111)
        matplotlib.rcParams['font.size'] = '24'
        plt.plot(np.arange(len(stiff)) + first_resnum, stiff, 'k-',
                 linewidth=3)
        plt.xlim(first_resnum, resnums[-1])
        # leave a third of the value range free above the curve
        ax_top = round(stiff_max + ((stiff_max - stiff_min) / 3))
        ax_bottom = np.floor(stiff_min)
        LOGGER.info('The range of mean effective force constant is: '
                    '{0} to {1}.'.format(stiff_min, stiff_max))
        plt.ylim(ax_bottom, ax_top)
        plt.xlabel('residue', fontsize='22')
        plt.ylabel(r'mean $\kappa$ [a.u.]', fontsize='22')

    ax = fig.add_subplot(411, aspect='equal')
    plt.imshow(meanStiff, *args, **kwargs)

    header_ss = header['sheet_range'] + header['helix_range']
    for ss in header_ss:
        if ss[1] != chain:
            continue
        beg = int(ss[-2]) - first_resnum
        end = int(ss[-1]) - first_resnum
        add_beg = end - beg
        if ss[0] == 'H':
            ax.add_patch(patches.Rectangle((beg-1, -0.7), add_beg, 1.4,
                                           fill=False, linestyle='solid',
                                           edgecolor='#b22683', linewidth=2))
        elif ss[0] == 'E':
            if ss[2] == -1:
                ax.add_patch(patches.Arrow(beg-1, 0, add_beg, 0, width=4.65,
                                           fill=False, linestyle='solid',
                                           edgecolor='black', linewidth=2))
            else:
                ax.add_patch(patches.Arrow(end-1, 0, add_beg*(-1), 0,
                                           width=4.65, fill=False,
                                           linestyle='solid',
                                           edgecolor='black', linewidth=2))
    plt.axis('off')
    ax.set_ylim(-1.7, 1.7)
    if SETTINGS['auto_show']:
        showFigure()
    # the function object (not its result) is returned, as before
    return plt.show
def setPDBMirrorPath(path):
    """Save absolute *path* of a local PDB mirror directory in settings.

    :raises TypeError: when *path* is not a string
    :raises IOError: when *path* is not a directory"""

    if not isinstance(path, str):
        raise TypeError('path must be a string')
    if not isdir(path):
        raise IOError('No such directory: {0:s}'.format(repr(path)))

    path = abspath(path)
    LOGGER.info('Local PDB mirror path is set: {0:s}'.format(repr(path)))
    SETTINGS['pdb_mirror_path'] = path
    SETTINGS.save()
def calcRankorder(matrix, zscore=False, **kwargs):
    """Returns indices of elements and corresponding values sorted in
    descending order, if *descend* is **True** (default).

    Can apply a zscore normalization; by default along *axis* 0 such that
    each column has mean=0 and std=1, with a true *axis* value each row is
    normalized instead.  If *zscore* analysis is used, return value
    contains the zscores.

    The matrix is considered symmetric when the largest difference between
    the matrix and its transpose is less than *threshold* (default 0.0001;
    the misspelled keyword *thredhold* is still accepted).  If matrix is
    symmetric only lower triangle indices will be returned, with diagonal
    elements if *diag* is **True** (default).

    :return: tuple of row indices, column indices, and values
    :raises TypeError: when *matrix* has no ``ndim``/``shape`` attributes
    :raises ValueError: when *matrix* is not 2-dimensional"""

    try:
        ndim, shape = matrix.ndim, matrix.shape
    except AttributeError:
        raise TypeError('matrix must be a 2D array')
    if ndim != 2:
        raise ValueError('matrix must be a 2D array')

    # 'thredhold' is the historical (misspelled) name of this keyword
    threshold = kwargs.get('threshold', kwargs.get('thredhold', 0.0001))
    try:
        symm = abs((matrix.transpose() - matrix).max()) < threshold
    except (TypeError, ValueError):
        # e.g. non-square matrix: transpose cannot be subtracted
        symm = False

    if zscore:
        axis = int(bool(kwargs.get('axis', 0)))
        # keepdims makes the statistics broadcast along the chosen axis
        matrix = ((matrix - matrix.mean(axis, keepdims=True)) /
                  matrix.std(axis, keepdims=True))
        LOGGER.info('Zscore normalization has been applied.')

    descend = kwargs.get('descend', True)
    if not symm:
        sorted_index = matrix.argsort(axis=None)
        if descend:
            sorted_index = sorted_index[::-1]
        all_rows, all_columns = indices(shape)
        row = all_rows.flatten()[sorted_index]
        column = all_columns.flatten()[sorted_index]
    else:
        LOGGER.info('Matrix is symmetric, only lower triangle indices '
                    'will be returned.')
        k = 0 if kwargs.get('diag', True) else -1
        ind_row, ind_column = tril_indices(shape[0], k=k)
        matrix_lt = matrix[ind_row, ind_column]
        sorted_index = matrix_lt.argsort(axis=None)
        if descend:
            sorted_index = sorted_index[::-1]
        row = ind_row[sorted_index]
        column = ind_column[sorted_index]

    return (row, column, matrix[row, column])
def calcMechStiff(modes, coords, kbt=1.):
    """Calculate stiffness matrix calculated using :class:`.ANM` instance.
    Method described in [EB08]_.

    :arg modes: 3-dimensional modes providing ``is3d``, ``getArray``,
        ``getEigvals``, ``numAtoms``, and ``numModes`` methods
    :type modes: NMA or ModeSet object

    :arg coords: a coordinate set or an object with ``getCoords`` method
    :type coords: :class:`numpy.ndarray`.

    :arg kbt: value converted to float and passed to ``calcSM``, default
        is 1
    :type kbt: float

    :return: stiffness matrix with one row and one column per atom
    :raises TypeError: when *coords* or *modes* are not of a supported
        type, or *modes* are not 3-dimensional

    Author: Mustafa Tekpinar & Karolina Mikulska-Ruminska & Cihan Kaya
    """

    try:
        coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                  coords.getCoords())
    except AttributeError:
        try:
            checkCoords(coords)
        except TypeError:
            raise TypeError('coords must be a Numpy array or an object '
                            'with `getCoords` method')

    try:
        is3d = modes.is3d()
        eigvecs = modes.getArray().T.flatten()
        eigvals = modes.getEigvals()
    except AttributeError:
        # a missing accessor means modes is not an NMA/ModeSet-like object;
        # other errors are left to propagate unchanged
        raise TypeError('modes must be either an NMA or ModeSet object')

    if not is3d:
        raise TypeError('modes must be 3-dimensional')

    n_atoms = modes.numAtoms()
    n_modes = modes.numModes()

    LOGGER.timeit('_sm')

    # filled in place by calcSM
    sm = np.zeros((n_atoms, n_atoms), np.double)
    from .smtools import calcSM
    LOGGER.info('Calculating stiffness matrix.')

    calcSM(coords, sm, eigvecs, eigvals, n_atoms, n_modes, float(kbt))

    LOGGER.report('Stiffness matrix calculated in %.2lfs.', label='_sm')

    LOGGER.info('The range of effective force constant is: {0} to {1}.'
                .format(*calcStiffnessRange(sm)))

    return sm
def buildMechStiff(self, coords, n_modes=None, kbt=1.):
    """Calculate stiffness matrix calculated using :class:`.ANM` instance.
    Method described in [EB08]_.

    .. [EB08] Eyal E., Bahar I. Toward a Molecular Understanding of
        the Anisotropic Response of Proteins to External Forces:
        Insights from Elastic Network Models. *Biophys J* **2008**
        94:3424-34355.

    :arg coords: a coordinate set or an object with ``getCoords`` method
    :type coords: :class:`numpy.ndarray`.

    :arg n_modes: currently not used: all modes are calculated (including
        zero modes) and 3x number of atoms is passed to ``calcSM``
    :type n_modes: int or ``None``

    :arg kbt: value converted to float and passed to ``calcSM``
    :type kbt: float

    The matrix is stored in ``self._stiffness``.

    Author: Mustafa Tekpinar & Karolina Mikulska-Ruminska & Cihan Kaya
    """

    try:
        if hasattr(coords, '_getCoords'):
            coords = coords._getCoords()
        else:
            coords = coords.getCoords()
    except AttributeError:
        try:
            checkCoords(coords)
        except TypeError:
            raise TypeError('coords must be a Numpy array or an object '
                            'with `getCoords` method')

    n_atoms = self._n_atoms
    n_modes = 3 * n_atoms

    self.calcModes(n_modes=None, zeros=True)

    LOGGER.timeit('_sm')
    eigvecs = np.transpose(self._array).flatten()
    eigvals = np.transpose(self._eigvals)

    # filled in place by calcSM
    stiffness = np.zeros((n_atoms, n_atoms), np.double)
    from .smtools import calcSM
    LOGGER.info('Calculating stiffness matrix.')

    calcSM(coords, stiffness, eigvecs, eigvals, n_atoms, n_modes,
           float(kbt))

    LOGGER.report('Stiffness matrix calculated in %.2lfs.', label='_sm')

    self._stiffness = stiffness

    smallest = np.min(stiffness[np.nonzero(stiffness)])
    largest = np.amax(stiffness)
    LOGGER.info('The range of effective force constant is: {0} to {1}.'
                .format(smallest, largest))
def calcModes(self, n_modes=20, turbo=True):
    """Calculate principal (or essential) modes.  This method uses
    :func:`scipy.linalg.eigh`, or :func:`numpy.linalg.eigh`, function
    to diagonalize the covariance matrix.

    :arg n_modes: number of non-zero eigenvalues/vectors to calculate,
        default is 20, if **None** or ``'all'`` is given, all modes will
        be calculated
    :type n_modes: int

    :arg turbo: when available, use a memory intensive but faster way to
        calculate modes, default is **True**
    :type turbo: bool

    Eigenvalues are stored in descending order, and only those greater
    than 1e-8 (with their eigenvectors) are kept.

    :raises ValueError: when covariance matrix is not built or set"""

    # NOTE(review): the *turbo* and *eigvals* keywords of scipy.linalg.eigh
    # were deprecated in newer SciPy releases -- confirm supported versions

    linalg = importLA()
    if self._cov is None:
        raise ValueError('covariance matrix is not built or set')

    start = time.time()
    dof = self._dof
    self._clear()

    if str(n_modes).lower() == 'all':
        n_modes = None

    if linalg.__package__.startswith('scipy'):
        # range of eigenvalue indices to calculate, None for all of them
        eigvals = None
        if n_modes is not None:
            n_modes = int(n_modes)
            if n_modes < self._dof:
                eigvals = (dof - n_modes, dof - 1)
        values, vectors = linalg.eigh(self._cov, turbo=turbo,
                                      eigvals=eigvals)
    else:
        if n_modes is not None:
            LOGGER.info('Scipy is not found, all modes are calculated.')
        values, vectors = linalg.eigh(self._cov)

    # Order by descending SV
    descending = list(reversed(range(len(values))))
    values = values[descending]
    vectors = vectors[:, descending]

    nonzero = values > 1e-8
    self._eigvals = values[nonzero]
    self._array = vectors[:, nonzero]
    self._vars = self._eigvals
    self._n_modes = len(self._eigvals)

    elapsed = time.time() - start
    LOGGER.debug('{0} modes were calculated in {1:.2f}s.'
                 .format(self._n_modes, elapsed))
def getNormDistFluct(self, coords):
    """Return normalized distance fluctuation matrix.

    :arg coords: a coordinate set or an object with ``getCoords`` method
    :type coords: :class:`numpy.ndarray`

    Each off-diagonal element is ``sqrt(|2 - 2 * C_ij|)``, where ``C`` is
    the result of :func:`calcCrossCorr` for the model, divided by the
    distance between atoms *i* and *j*.  Diagonal elements are zero.  When
    ``self.getModel()`` is not an :class:`NMA` instance, a new
    :class:`GNM` model is built from *coords*.

    :raises TypeError: when *coords* is not of a supported type
    """

    model = self.getModel()

    # chain information is available only from atomic objects, a plain
    # coordinate array is a supported input too
    if hasattr(coords, 'getChids'):
        chids = list(set(coords.getChids()))
        LOGGER.info('Number of chains: {0}, chains: {1}.'
                    .format(len(chids), chids))

    try:
        coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                  coords.getCoords())
    except AttributeError:
        try:
            checkCoords(coords)
        except TypeError:
            raise TypeError('coords must be a Numpy array or an object '
                            'with `getCoords` method')

    if not isinstance(model, NMA):
        LOGGER.info('Calculating new model')
        model = GNM('prot analysis')
        model.buildKirchhoff(coords)
        model.calcModes()

    n_atoms = model.numAtoms()
    LOGGER.timeit('_ndf')

    from .analysis import calcCrossCorr
    from numpy import linalg as LA

    # <dRi, dRi>, <dRj, dRj> = 1
    crossC = abs(2 - 2 * calcCrossCorr(model))

    # all pairwise distances at once instead of a double Python loop
    xyz = np.asarray(coords, dtype=float)[:n_atoms]
    r_ij_n = LA.norm(xyz[np.newaxis, :, :] - xyz[:, np.newaxis, :], axis=2)
    r_ij_n[np.diag_indices_from(r_ij_n)] = 1e-5  # div by 0

    normdistfluct = np.divide(np.sqrt(crossC), r_ij_n)
    LOGGER.report('NDF calculated in %.2lfs.', label='_ndf')
    normdistfluct[np.diag_indices_from(normdistfluct)] = 0  # div by 0
    return normdistfluct
def parseDCD(filename, start=None, stop=None, step=None):
    """Parse CHARMM format DCD files (also NAMD 2.1 and later).  Returns an
    :class:`Ensemble` instance. Conformations in the ensemble will be ordered
    as they appear in the trajectory file.  Use :class:`DCDFile` class for
    parsing coordinates of a subset of atoms.

    :arg filename: DCD filename
    :type filename: str

    :arg start: index of first frame to read
    :type start: int

    :arg stop: index of the frame that stops reading
    :type stop: int

    :arg step: steps between reading frames, default is 1 meaning every frame
    :type step: int

    Logged size and rates are based on the total number of frames in the
    file."""

    dcd = DCDFile(filename)
    try:
        time_ = time()
        n_frames = dcd.numFrames()
        LOGGER.info('DCD file contains {0:d} coordinate sets for {1:d} '
                    'atoms.'.format(n_frames, dcd.numAtoms()))
        ensemble = dcd[slice(start, stop, step)]
        bytes_per_frame = dcd._bytes_per_frame
    finally:
        # close the file even when reading fails
        dcd.close()

    # avoid division by zero for very short parsing times
    time_ = time() - time_ or 0.01
    dcd_size = 1.0 * n_frames * bytes_per_frame / (1024*1024)
    LOGGER.info('DCD file was parsed in {0:.2f} seconds.'.format(time_))
    LOGGER.info('{0:.2f} MB parsed at input rate {1:.2f} MB/s.'
                .format(dcd_size, dcd_size/time_))
    LOGGER.info('{0:d} coordinate sets parsed at input rate {1:d} frame/s.'
                .format(n_frames, int(n_frames/time_)))
    return ensemble
def parseEMDStream(stream, **kwargs):
    """Parse EMD data from a stream of EMD file.

    :arg stream: Any object with the method ``readlines``
        (e.g. :class:`file`, buffer, stdin)

    Keywords *cutoff*, *n_nodes* (default 1000), *num_iter* (default 20),
    *map* (default **True**), and *make_nodes* (default **False**) are
    passed to ``_parseEMDLines``; *title* and *title_suffix* set the title
    of the :class:`.AtomGroup`.  When both *map* and *make_nodes* are
    **False**, *make_nodes* is set to **True**.

    Returns the map when only *map* is true, the :class:`.AtomGroup` when
    only *make_nodes* is true, and a tuple of both when both are true."""

    cutoff = kwargs.get('cutoff', None)
    if cutoff is not None:
        cutoff = float(cutoff)

    n_nodes = int(kwargs.get('n_nodes', 1000))
    num_iter = int(kwargs.get('num_iter', 20))
    want_map = kwargs.get('map', True)
    make_nodes = kwargs.get('make_nodes', False)

    if want_map is False and make_nodes is False:
        LOGGER.warn('At least one of map and make_nodes should be True. '
                    'Setting map to False was an intentional change from the default '
                    'behaviour so make_nodes has been set to True.')
        make_nodes = True

    title = str(kwargs.get('title', 'Unknown')) + kwargs.get('title_suffix', '')
    atomgroup = AtomGroup(title)
    atomgroup._n_atoms = n_nodes

    parser_kwargs = dict(cutoff=cutoff, n_nodes=n_nodes, num_iter=num_iter,
                         map=want_map, make_nodes=make_nodes)

    if not make_nodes:
        return _parseEMDLines(atomgroup, stream, **parser_kwargs)

    LOGGER.info('Building coordinates from electron density map. This may take a while.')
    LOGGER.timeit()

    if want_map:
        emd, atomgroup = _parseEMDLines(atomgroup, stream, **parser_kwargs)
    else:
        atomgroup = _parseEMDLines(atomgroup, stream, **parser_kwargs)

    LOGGER.report('{0} atoms and {1} coordinate sets were '
                  'parsed in %.2fs.'.format(atomgroup.numAtoms(),
                                            atomgroup.numCoordsets()))

    if want_map:
        return emd, atomgroup
    return atomgroup
def getNormDistFluct(self, coords):
    """Return normalized distance fluctuation matrix.

    :arg coords: a coordinate set or an object with ``getCoords`` method
    :type coords: :class:`numpy.ndarray`

    Each off-diagonal element is ``sqrt(|2 - 2 * C_ij|)``, where ``C`` is
    the result of :func:`calcCrossCorr` for the model, divided by the
    distance between atoms *i* and *j*.  Diagonal elements are zero.  When
    ``self.getModel()`` is not an :class:`NMA` instance, a new
    :class:`GNM` model is built from *coords*.

    :raises TypeError: when *coords* is not of a supported type
    """

    model = self.getModel()

    # chain information is available only from atomic objects, a plain
    # coordinate array is a supported input too
    if hasattr(coords, 'getChids'):
        chids = list(set(coords.getChids()))
        LOGGER.info('Number of chains: {0}, chains: {1}.'
                    .format(len(chids), chids))

    try:
        coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                  coords.getCoords())
    except AttributeError:
        try:
            checkCoords(coords)
        except TypeError:
            raise TypeError('coords must be a Numpy array or an object '
                            'with `getCoords` method')

    if not isinstance(model, NMA):
        LOGGER.info('Calculating new model')
        model = GNM('prot analysis')
        model.buildKirchhoff(coords)
        model.calcModes()

    LA = importLA()
    n_atoms = model.numAtoms()
    LOGGER.timeit('_ndf')

    from .analysis import calcCrossCorr

    # <dRi, dRi>, <dRj, dRj> = 1
    crossC = abs(2 - 2 * calcCrossCorr(model))

    # all pairwise distances at once instead of a double Python loop
    xyz = np.asarray(coords, dtype=float)[:n_atoms]
    r_ij_n = LA.norm(xyz[np.newaxis, :, :] - xyz[:, np.newaxis, :], axis=2)
    r_ij_n[np.diag_indices_from(r_ij_n)] = ZERO  # div by 0

    normdistfluct = np.divide(np.sqrt(crossC), r_ij_n)
    LOGGER.report('NDF calculated in %.2lfs.', label='_ndf')
    normdistfluct[np.diag_indices_from(normdistfluct)] = 0  # div by 0
    return normdistfluct
def buildMechStiff(self, coords, n_modes=None, kbt=1.):
    """Calculate stiffness matrix calculated using :class:`.ANM` instance.
    Method described in [EB08]_.

    .. [EB08] Eyal E., Bahar I. Toward a Molecular Understanding of
        the Anisotropic Response of Proteins to External Forces:
        Insights from Elastic Network Models. *Biophys J* **2008**
        94:3424-34355.

    :arg coords: a coordinate set or an object with ``getCoords`` method
    :type coords: :class:`numpy.ndarray`.

    :arg n_modes: currently not used: all modes are calculated (including
        zero modes) and 3x number of atoms is passed to ``calcSM``
    :type n_modes: int or ``None``

    :arg kbt: value converted to float and passed to ``calcSM``
    :type kbt: float

    The matrix is stored in ``self._stiffness``.

    Author: Mustafa Tekpinar & Karolina Mikulska-Ruminska & Cihan Kaya
    """

    try:
        if hasattr(coords, '_getCoords'):
            coords = coords._getCoords()
        else:
            coords = coords.getCoords()
    except AttributeError:
        try:
            checkCoords(coords)
        except TypeError:
            raise TypeError('coords must be a Numpy array or an object '
                            'with `getCoords` method')

    n_atoms = self._n_atoms
    n_modes = 3 * n_atoms

    self.calcModes(n_modes=None, zeros=True)

    LOGGER.timeit('_sm')
    eigvecs = np.transpose(self._array).flatten()
    eigvals = np.transpose(self._eigvals)

    # filled in place by calcSM
    stiffness = np.zeros((n_atoms, n_atoms), np.double)
    from .smtools import calcSM
    LOGGER.info('Calculating stiffness matrix.')

    calcSM(coords, stiffness, eigvecs, eigvals, n_atoms, n_modes,
           float(kbt))

    LOGGER.report('Stiffness matrix calculated in %.2lfs.', label='_sm')

    self._stiffness = stiffness

    smallest = np.min(stiffness[np.nonzero(stiffness)])
    largest = np.amax(stiffness)
    LOGGER.info(
        'The range of effective force constant is: {0} to {1}.'.format(
            smallest, largest))
def evol_refine(msa, **kwargs):
    """Refine the MSA in file *msa* and write the result to a file.

    The alignment is parsed with :func:`parseMSA`, refined with
    :func:`refineMSA`, and written with :func:`writeMSA`; all keyword
    arguments are passed to both :func:`refineMSA` and :func:`writeMSA`.

    Keyword *outname* sets the output filename.  By default ``_refined``
    is inserted before the extension of *msa*, e.g. ``aln.fasta`` gives
    ``aln_refined.fasta`` and ``aln.fasta.gz`` gives
    ``aln_refined.fasta.gz``."""

    from os.path import splitext
    from prody import parseMSA, refineMSA, writeMSA, LOGGER

    outname = kwargs.get('outname')
    if outname is None:
        base, ext = splitext(msa)
        if ext.lower() == '.gz':
            # compressed file: the format extension precedes '.gz', keep
            # both after the '_refined' suffix
            base, format_ext = splitext(base)
            ext = format_ext + ext
        outname = base + '_refined' + ext

    writeMSA(outname, refineMSA(parseMSA(msa), **kwargs), **kwargs)
    LOGGER.info('Refined MSA is written in file: ' + outname)
def savePickle(self, filename=None, folder=None, store_custom_PDBs=False):
    """Store a pickle of the current class instance and return its path.

    :arg filename: name of the pickle, by default
        ``'UniprotMap-[uniq_acc].pkl'``
    :type filename: str

    :arg folder: folder where the pickle is saved, by default the
        ``'rhapsody_local_folder'`` setting, or ``'.'`` when it is not set
    :type folder: str

    :arg store_custom_PDBs: unless **True**, ``customPDBmappings`` is
        pickled as an empty list
    :type store_custom_PDBs: bool

    :return: pickle path
    :rtype: str
    """
    if folder is None:
        folder = SETTINGS.get('rhapsody_local_folder', '.')
    if filename is None:
        filename = 'UniprotMap-' + self.uniq_acc + '.pkl'
    pickle_path = os.path.join(folder, filename)

    cache = self.customPDBmappings
    if store_custom_PDBs is not True:
        # do not store alignments of custom PDBs
        self.customPDBmappings = []
    try:
        # save pickle, closing the file even when dumping fails
        with open(pickle_path, "wb") as pickle_file:
            pickle.dump(self, pickle_file)
    finally:
        # the instance keeps its custom mappings whatever happens
        self.customPDBmappings = cache

    LOGGER.info("Pickle '{}' saved.".format(filename))
    return pickle_path
def evol_refine(msa, **kwargs):
    """Refine the MSA in file *msa* and write the result to a file.

    The alignment is parsed with :func:`parseMSA`, refined with
    :func:`refineMSA`, and written with :func:`writeMSA`; all keyword
    arguments are passed to both :func:`refineMSA` and :func:`writeMSA`.

    Keyword *outname* sets the output filename.  By default ``_refined``
    is inserted before the extension of *msa*, e.g. ``aln.fasta`` gives
    ``aln_refined.fasta`` and ``aln.fasta.gz`` gives
    ``aln_refined.fasta.gz``."""

    from os.path import splitext
    from prody import parseMSA, refineMSA, writeMSA, LOGGER

    outname = kwargs.get("outname")
    if outname is None:
        base, ext = splitext(msa)
        if ext.lower() == ".gz":
            # compressed file: the format extension precedes '.gz', keep
            # both after the '_refined' suffix
            base, format_ext = splitext(base)
            ext = format_ext + ext
        outname = base + "_refined" + ext

    writeMSA(outname, refineMSA(parseMSA(msa), **kwargs), **kwargs)
    LOGGER.info("Refined MSA is written in file: " + outname)
def calcModes(self, n_modes=20, turbo=True):
    """Calculate principal (or essential) modes.  This method uses
    :func:`scipy.linalg.eigh`, or :func:`numpy.linalg.eigh`, function
    to diagonalize the covariance matrix.

    :arg n_modes: number of non-zero eigenvalues/vectors to calculate,
        default is 20, for **None** all modes will be calculated
    :type n_modes: int

    :arg turbo: when available, use a memory intensive but faster way to
        calculate modes, default is **True**
    :type turbo: bool

    Eigenvalues are stored in descending order, and only those greater
    than 1e-8 (with their eigenvectors) are kept.

    :raises ValueError: when covariance matrix is not built or set"""

    # NOTE(review): the *turbo* and *eigvals* keywords of scipy.linalg.eigh
    # were deprecated in newer SciPy releases -- confirm supported versions

    linalg = importLA()
    if self._cov is None:
        raise ValueError('covariance matrix is not built or set')

    start = time.time()
    dof = self._dof

    if linalg.__package__.startswith('scipy'):
        # range of eigenvalue indices to calculate, None for all of them
        eigvals = None
        if n_modes is not None:
            n_modes = int(n_modes)
            if n_modes < self._dof:
                eigvals = (dof - n_modes, dof - 1)
        values, vectors = linalg.eigh(self._cov, turbo=turbo,
                                      eigvals=eigvals)
    else:
        if n_modes is not None:
            LOGGER.info('Scipy is not found, all modes are calculated.')
        values, vectors = linalg.eigh(self._cov)

    # Order by descending SV
    descending = list(reversed(range(len(values))))
    values = values[descending]
    vectors = vectors[:, descending]

    nonzero = values > 1e-8
    self._eigvals = values[nonzero]
    self._array = vectors[:, nonzero]
    self._vars = self._eigvals
    self._n_modes = len(self._eigvals)

    elapsed = time.time() - start
    LOGGER.debug('{0} modes were calculated in {1:.2f}s.'.format(
        self._n_modes, elapsed))
def parseEMDStream(stream, **kwargs):
    """Parse EMD data from a stream of EMD file.

    :arg stream: Anything that implements the method ``readlines``
        (e.g. :class:`file`, buffer, stdin)

    Keywords *cutoff*, *n_nodes* (default 1000), *num_iter* (default 20),
    *map* (default **False**), and *make_nodes* (default **True**) are
    passed to ``_parseEMDLines``; *title* and *title_suffix* set the title
    of the :class:`.AtomGroup`.  When both *map* and *make_nodes* are
    **False**, *map* is set to **True**.

    Returns the map when only *map* is true, the :class:`.AtomGroup` when
    only *make_nodes* is true, and a tuple of both when both are true."""

    cutoff = kwargs.get('cutoff', None)
    if cutoff is not None:
        cutoff = float(cutoff)

    n_nodes = int(kwargs.get('n_nodes', 1000))
    num_iter = int(kwargs.get('num_iter', 20))
    want_map = kwargs.get('map', False)
    make_nodes = kwargs.get('make_nodes', True)

    if want_map is False and make_nodes is False:
        LOGGER.warn('At least one of map and make_nodes should be True. '
                    'Setting make_nodes to False was an intentional change from the default '
                    'behaviour so map has been set to True.')
        want_map = True

    title = str(kwargs.get('title', 'Unknown')) + kwargs.get('title_suffix', '')
    atomgroup = AtomGroup(title)

    parser_kwargs = dict(cutoff=cutoff, n_nodes=n_nodes, num_iter=num_iter,
                         map=want_map, make_nodes=make_nodes)

    if not make_nodes:
        return _parseEMDLines(atomgroup, stream, **parser_kwargs)

    LOGGER.info('Building coordinates from electron density map. This may take a while.')
    LOGGER.timeit()

    if want_map:
        emd, atomgroup = _parseEMDLines(atomgroup, stream, **parser_kwargs)
    else:
        atomgroup = _parseEMDLines(atomgroup, stream, **parser_kwargs)

    LOGGER.report('{0} atoms and {1} coordinate sets were '
                  'parsed in %.2fs.'.format(atomgroup.numAtoms(),
                                            atomgroup.numCoordsets()))

    if want_map:
        return emd, atomgroup
    return atomgroup
def recoverPickle(self, filename=None, folder=None, days=30, **kwargs):
    """Restore the state of this instance from a previously saved pickle.

    :arg filename: name of the pickle; by default
        ``'UniprotMap-[accession].pkl'`` is tried first with ``self.acc``
        and then with the accession returned by :func:`queryUniprot`
    :type filename: str

    :arg folder: folder containing the pickle, by default the
        ``'rhapsody_local_folder'`` setting (or ``'.'``)
    :type folder: str

    :arg days: maximum accepted age of the pickle, in days
    :type days: int

    :raises IOError: when the pickle file does not exist
    :raises ValueError: when the accession number in the pickle does not
        match
    :raises RuntimeError: when the pickle is older than *days*"""

    acc = self.uniq_acc
    if acc is None:
        # assume acc is equal to uniq_acc
        acc = self.acc
    if folder is None:
        folder = SETTINGS.get('rhapsody_local_folder', '.')
    if filename is None:
        # assume acc is equal to uniq_acc
        acc = self.acc
        filename = 'UniprotMap-' + acc + '.pkl'
        pickle_path = os.path.join(folder, filename)
        if not os.path.isfile(pickle_path):
            # import unique accession number
            acc = queryUniprot(self.acc)['accession 0']
            filename = 'UniprotMap-' + acc + '.pkl'
            pickle_path = os.path.join(folder, filename)
    else:
        pickle_path = os.path.join(folder, filename)

    # check if pickle exists
    if not os.path.isfile(pickle_path):
        raise IOError("File '{}' not found".format(filename))

    # load pickle; the context manager guarantees the handle is released
    # NOTE(review): unpickling executes arbitrary code, so only pickles
    # from a trusted folder should be recovered
    with open(pickle_path, "rb") as handle:
        recovered_self = pickle.load(handle)
    if acc not in [recovered_self.acc, recovered_self.uniq_acc]:
        raise ValueError('Accession number in recovered pickle (%s) '
                         % recovered_self.uniq_acc + 'does not match.')

    # check timestamp and ignore pickles that are too old
    date_format = "%Y-%m-%d %H:%M:%S.%f"
    t_old = datetime.datetime.strptime(recovered_self._timestamp, date_format)
    t_now = datetime.datetime.utcnow()
    Delta_t = datetime.timedelta(days=days)
    if t_old + Delta_t < t_now:
        raise RuntimeError(
            'Pickle {} was too old and was ignored.'.format(filename))

    # import recovered data
    self.fullRecord = recovered_self.fullRecord
    self.uniq_acc = recovered_self.uniq_acc
    self.sequence = recovered_self.sequence
    self.PDBrecords = recovered_self.PDBrecords
    self.PDBmappings = recovered_self.PDBmappings
    self.customPDBmappings = recovered_self.customPDBmappings
    self._align_algo_args = recovered_self._align_algo_args
    self._align_algo_kwargs = recovered_self._align_algo_kwargs
    self._timestamp = recovered_self._timestamp
    self.Pfam = recovered_self.Pfam
    LOGGER.info("Pickle '{}' recovered.".format(filename))
    return
def savePickle(self, folder=None, filename=None):
    """Stores a pickle of the current class instance. The pickle will
    contain all information and precomputed features, but not GNM and ANM
    models. In case a PDBID is missing, the parsed PDB :class:`AtomGroup`
    is stored as well.

    :arg folder: path of the folder where the pickle will be saved. If not
        specified, the local Rhapsody installation folder will be used.
    :type folder: str

    :arg filename: name of the pickle. By default, the pickle will be
        saved as ``'PDBfeatures-[PDBID].pkl'``. If a PDBID is not defined,
        the user must provide a filename.
    :type filename: str

    :return: pickle path
    :rtype: str

    :raises ValueError: when neither a filename nor a PDBID is available
    """
    if folder is None:
        # define folder where to look for pickles
        folder = SETTINGS.get('rhapsody_local_folder')
        if folder is None:
            folder = '.'
        else:
            folder = os.path.join(folder, 'pickles')
    if filename is None:
        # use the default filename, if possible
        if self.PDBID is None:
            # when a custom structure is used, there is no
            # default filename: the user should provide it
            raise ValueError('Please provide a filename.')
        filename = 'PDBfeatures-' + self.PDBID + '.pkl'
    pickle_path = os.path.join(folder, filename)

    # do not store GNM and ANM instances.
    # If a valid PDBID is present, do not store parsed PDB
    # as well, since it can be easily fetched again
    cache = (self._pdb, self._gnm, self._anm)
    try:
        if self.PDBID is not None:
            self._pdb = None
        self._gnm = {}
        self._anm = {}
        for env in ['chain', 'reduced', 'sliced']:
            self._gnm[env] = {chID: None for chID in self.chids}
            self._anm[env] = {chID: None for chID in self.chids}
        # write pickle
        with open(pickle_path, "wb") as handle:
            pickle.dump(self, handle)
    finally:
        # restore temporarily cached data, also when writing failed, so
        # the instance is never left stripped of its models
        self._pdb, self._gnm, self._anm = cache
    LOGGER.info("Pickle '{}' saved.".format(filename))
    return pickle_path
def __add__(self, other):
    """Return a new :class:`AtomGroup` holding the atoms of *self* followed
    by the atoms of *other*.

    Coordinate sets are all copied when both groups have the same number
    of them; otherwise only the active ones are joined.  Data fields that
    exist in only one operand are zero-filled for the other, read-only
    fields are skipped, and bond indices of *other* are shifted by the
    number of atoms in *self*.

    :raises TypeError: when *other* is not an :class:`AtomGroup`"""

    if not isinstance(other, AtomGroup):
        raise TypeError('unsupported operand type(s) for +: {0!r} and '
                        '{1!r}'.format(type(self).__name__,
                                       type(other).__name__))

    new = AtomGroup(self._title + ' + ' + other._title)

    # --- coordinates ---
    if self._n_csets:
        if self._n_csets == other._n_csets:
            joined = np.concatenate((self._coords, other._coords), 1)
            new.setCoords(joined)
            if self._n_csets > 1:
                LOGGER.info('All {0} coordinate sets are copied to '
                            '{1}.'.format(self._n_csets, new.getTitle()))
        else:
            joined = np.concatenate((self._getCoords(),
                                     other._getCoords()))
            new.setCoords(joined)
            LOGGER.info('Active coordinate sets are copied to {0}.'
                        .format(new.getTitle()))
    elif other._n_csets:
        LOGGER.warn('No coordinate sets are copied to {0}'
                    .format(new.getTitle()))

    # --- per-atom data ---
    for key in set(list(self._data) + list(other._data)):
        if key in ATOMIC_FIELDS and ATOMIC_FIELDS[key].readonly:
            continue
        mine = self._data.get(key)
        theirs = other._data.get(key)
        if mine is None and theirs is None:
            continue
        if mine is None:
            # pad with zeros shaped like the other operand's array
            dims = list(theirs.shape)
            dims[0] = len(self)
            mine = np.zeros(dims, theirs.dtype)
        if theirs is None:
            dims = list(mine.shape)
            dims[0] = len(other)
            theirs = np.zeros(dims, mine.dtype)
        new._data[key] = np.concatenate((mine, theirs))

    # --- bonds ---
    has_mine = self._bonds is not None
    has_theirs = other._bonds is not None
    if has_mine and has_theirs:
        new.setBonds(np.concatenate([self._bonds,
                                     other._bonds + self._n_atoms]))
    elif has_mine:
        new.setBonds(self._bonds.copy())
    elif has_theirs:
        new.setBonds(other._bonds + self._n_atoms)

    return new
def queryUniprot(*args, n_attempts=3, dt=1, **kwargs):
    """
    Redefine prody function to check for no internet connection.

    The Uniprot home page is contacted up to *n_attempts* times, sleeping
    ``(attempt + 1) * dt`` seconds after each failure.  When every attempt
    fails, one last unguarded attempt is made so that the underlying error
    propagates to the caller.  All other arguments are forwarded to
    ``pd.queryUniprot``.

    :arg n_attempts: number of guarded connection attempts
    :type n_attempts: int

    :arg dt: base waiting time between attempts, in seconds
    :type dt: float
    """
    attempt = 0
    while attempt < n_attempts:
        try:
            _ = openURL('http://www.uniprot.org/')
            break
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit still abort the retry loop
            LOGGER.info(f'Attempt {attempt} to contact www.uniprot.org failed')
            attempt += 1
            time.sleep((attempt + 1) * dt)
    else:
        # all guarded attempts failed: let the error surface
        _ = openURL('http://www.uniprot.org/')
    return pd.queryUniprot(*args, **kwargs)
def setAtoms(self, atoms, pH=7.0):
    '''
    Sets atoms.

    Hetero atoms are discarded, and for nucleotide chains the phosphate
    atoms named P, OP1, OP2 and OP3 are removed before the atoms are stored
    or, when the object is not built yet, passed on to be fixed.

    :arg atoms: *atoms* parsed by parsePDB

    :arg pH: pH based on which to select protonation states for adding
        missing hydrogens, default is 7.0.
    :type pH: float
    '''
    atoms = atoms.select('not hetatm')
    self._nuc = atoms.select('nucleotide')

    if self._nuc is not None:
        phosphate_names = ['P', 'OP1', 'OP2', 'OP3']
        idx_p = [atom.getIndex()
                 for chid in self._nuc.getChids()
                 for atom in self._nuc[chid].iterAtoms()
                 if atom.getName() in phosphate_names]
        if idx_p:
            atoms = atoms.select('not index ' + ' '.join(map(str, idx_p)))

    if self._isBuilt():
        super(ClustENM, self).setAtoms(atoms)
        return

    LOGGER.info('Fixing the structure ...')
    LOGGER.timeit('_clustenm_fix')
    self._ph = pH
    self._fix(atoms)
    LOGGER.report('The structure was fixed in %.2fs.',
                  label='_clustenm_fix')

    # coarse-grained nodes: CA atoms only, or CA plus nucleotide atoms
    if self._nuc is None:
        self._idx_cg = self._atoms.ca.getIndices()
        self._n_cg = self._atoms.ca.numAtoms()
    else:
        self._idx_cg = self._atoms.select("name CA C2 C4' P").getIndices()
        self._n_cg = self._atoms.select("name CA C2 C4' P").numAtoms()

    self._n_atoms = self._atoms.numAtoms()
    self._indices = None
def update(self, source=None):
    """Update data and files from CATH.

    :arg source: an URL, the name of a local XML file, a file handle, or
        XML text; a source already stored on the instance takes precedence
        over this argument.  Nothing is done when no source is available.

    :raises TypeError: when *source* is neither a string nor an object
        with a ``read`` method"""

    self._source = source = self._source or source
    self.reset()
    if source is None:
        return

    LOGGER.timeit('_cath_update')

    # 0: remote (URL), 1: local file or handle, 2: XML text
    type_ = 0
    tree = None
    if isinstance(source, str):
        if isfile(source):
            type_ = 1
        elif isURL(source):
            type_ = 0
        else:
            type_ = 2
    elif hasattr(source, 'read'):
        type_ = 1
    else:
        raise TypeError(
            'source must be either an url, file name, file handle, '
            'or text in xml format')

    if type_ == 0:
        LOGGER.info('Fetching data from CATH...')
        self._fetch()
        LOGGER.info('Parsing CATH files...')
        self._parse()
    elif type_ == 1:
        LOGGER.info('Reading data from the local xml file...')
        tree = ET.parse(source)
    elif type_ == 2:
        LOGGER.info('Parsing input string...')
        tree = ET.fromstring(source)

    # post-processing
    if type_ > 0:
        # ET.parse returns an ElementTree while ET.fromstring returns the
        # root Element itself, which has no getroot()
        root = tree.getroot() if hasattr(tree, 'getroot') else tree

        # remove prefix from node tags; str.lstrip('id.') would strip any
        # leading 'i', 'd' or '.' characters rather than the exact prefix
        prefix = 'id.'
        for node in root.iter():
            if node.tag.startswith(prefix):
                node.tag = node.tag[len(prefix):]

        # convert length attributes from str to int
        for node in root.findall('.//*[@length]'):
            node.attrib['length'] = int(node.attrib['length'])

        copy2(root, self.root)
        self._update_map()

    LOGGER.report('CATH local database built in %.2fs.', '_cath_update')
def prody_select(selstr, *pdbs, **kwargs):
    """Write selected atoms from a PDB file in PDB format.

    :arg selstr: atom selection string, see :ref:`selections`

    :arg pdbs: PDB identifier(s) or filename(s)

    :arg output: output filename, default is :file:`pdb_selected.pdb`

    :arg prefix: prefix for output file, default is PDB filename

    :arg suffix: output filename suffix, default is :file:`_selected`

    :arg altloc: passed on to :func:`parsePDB`, default is **None**

    :raises ValueError: when no PDB argument is given"""

    from os.path import isfile
    from prody import LOGGER, parsePDB, writePDB

    if not pdbs:
        raise ValueError('pdb argument must be provided')

    # backwards compatibility: a lone (pdb, selstr) pair in the old order
    looks_like_pdb = (isfile(selstr) or
                      (len(selstr) == 4 and selstr[0].isdigit()))
    if looks_like_pdb and len(pdbs) == 1 and not isfile(pdbs[0]):
        pdbs, selstr = selstr, pdbs[0]
        LOGGER.warn('The order of selstr and pdb arguments have switched '
                    'to support multiple files, old order will be supported '
                    'until v1.4.')
        pdbs = [pdbs]

    prefix = kwargs.get('prefix', None)
    suffix = kwargs.get('suffix', '_selected')
    output = kwargs.get('output', None)
    altloc = kwargs.get('altloc', None)

    for pdb in pdbs:
        pdb = parsePDB(pdb, altloc=altloc)

        pdbselect = pdb.select(selstr)
        if pdbselect is None:
            LOGGER.warn('Selection {0} did not match any atoms.'.format(
                repr(selstr)))
            # move on to the next structure instead of abandoning the
            # remaining files
            continue
        LOGGER.info('Selection {0} matched {1} atoms.'.format(
            repr(selstr), len(pdbselect)))

        outname = output or ((prefix or pdb.getTitle()) + suffix)
        LOGGER.info('Selection is written into: ' +
                    writePDB(outname, pdbselect))
def recoverPickle(self, folder=None, filename=None, days=30, **kwargs):
    """Restore precomputed features from a previously saved pickle.

    :arg folder: folder containing the pickle, by default the
        ``'rhapsody_local_folder'`` setting (or ``'.'``)
    :type folder: str

    :arg filename: name of the pickle, by default
        ``'PDBfeatures-[PDBID].pkl'``; mandatory when no PDBID is defined
    :type filename: str

    :arg days: maximum accepted age of the pickle, in days
    :type days: int

    :raises ValueError: when no filename can be determined, or when the
        structure, PDBID or number of modes in the pickle do not match
    :raises IOError: when the pickle file does not exist
    :raises RuntimeError: when the pickle is older than *days*"""

    if folder is None:
        # define folder where to look for pickles
        folder = SETTINGS.get('rhapsody_local_folder', '.')
    if filename is None:
        # use the default filename, if possible
        if self.PDBID is not None:
            filename = 'PDBfeatures-' + self.PDBID + '.pkl'
        else:
            # when a custom structure is used, there is no
            # default filename: the user should provide it
            raise ValueError('Please provide a filename.')
    pickle_path = os.path.join(folder, filename)
    if not os.path.isfile(pickle_path):
        raise IOError("File '{}' not found".format(filename))

    # the context manager guarantees the handle is released
    # NOTE(review): unpickling executes arbitrary code, so only pickles
    # from a trusted folder should be recovered
    with open(pickle_path, "rb") as handle:
        recovered_self = pickle.load(handle)

    # check consistency of recovered data
    if self.PDBID is None:
        if self._pdb != recovered_self._pdb:
            raise ValueError(
                'Incompatible PDB structure in recovered pickle.')
    elif self.PDBID != recovered_self.PDBID:
        raise ValueError(
            'PDBID in recovered pickle ({}) does not match.'.format(
                recovered_self.PDBID))
    if self.n_modes != recovered_self.n_modes:
        raise ValueError(
            'Num. of modes in recovered pickle ({}) does not match.'.
            format(recovered_self.n_modes))

    # check timestamp and ignore pickles that are too old
    date_format = "%Y-%m-%d %H:%M:%S.%f"
    t_old = datetime.datetime.strptime(recovered_self.timestamp, date_format)
    t_now = datetime.datetime.utcnow()
    Delta_t = datetime.timedelta(days=days)
    if t_old + Delta_t < t_now:
        raise RuntimeError('Pickle was too old and was ignored.')

    # import recovered data
    self.chids = recovered_self.chids
    self.resids = recovered_self.resids
    self.feats = recovered_self.feats
    self._gnm = recovered_self._gnm
    self._anm = recovered_self._anm
    self.timestamp = recovered_self.timestamp
    LOGGER.info("Pickle '{}' recovered.".format(filename))
    return
def __add__(self, other):
    """Concatenate ensembles. The reference coordinates, atoms,
    and weights of *self* is used in the resulting ensemble.

    Data stored under a key that exists in only one of the two ensembles
    is padded with zeros for the conformations of the other one.

    :raises TypeError: when *other* is not an :class:`Ensemble`
    :raises ValueError: when the numbers of atoms differ"""

    if not isinstance(other, Ensemble):
        raise TypeError('an Ensemble instance cannot be added to an {0} '
                        'instance'.format(type(other)))
    elif self._n_atoms != other._n_atoms:
        raise ValueError('Ensembles must have same number of atoms.')

    ensemble = type(self)('{0} + {1}'.format(self.getTitle(),
                                             other.getTitle()))
    if self._coords is not None:
        ensemble.setCoords(self._coords.copy())
    if self._confs is not None:
        ensemble.addCoordset(self._confs.copy())
    if other._confs is not None:
        ensemble.addCoordset(other._confs.copy())

    all_keys = set(list(self._data.keys()) + list(other._data.keys()))
    for key in all_keys:
        if key in self._data and key in other._data:
            self_data = self._data[key]
            other_data = other._data[key]
        elif key in self._data:
            self_data = self._data[key]
            other_data = zeros((other.numConfs(),) + self_data.shape[1:],
                               dtype=self_data.dtype)
        elif key in other._data:
            other_data = other._data[key]
            # the padding stands in for the conformations of *self*, so it
            # must be sized by self.numConfs(), not other.numConfs()
            self_data = zeros((self.numConfs(),) + other_data.shape[1:],
                              dtype=other_data.dtype)
        ensemble._data[key] = concatenate((self_data, other_data), axis=0)

    if self._weights is not None:
        LOGGER.info('Atom weights from {0} are used in {1}.'.format(
            repr(self._title), repr(ensemble.getTitle())))
        ensemble.setWeights(self._weights.copy())
    elif other._weights is not None:
        ensemble.setWeights(other._weights.copy())

    if self._atoms is not None:
        ensemble.setAtoms(self._atoms)
        ensemble._indices = self._indices
    else:
        ensemble.setAtoms(other._atoms)
        ensemble._indices = other._indices
    return ensemble
def calcProjection(self, coords, blocks, **kwargs):
    """Build the block projection matrix and project the Hessian onto it.

    *blocks* holds one block label per atom.  Labels are renumbered through
    an :class:`Increment`-backed mapping, block sizes are counted, and
    ``calc_projection`` fills the projection matrix that is stored in
    ``self._project``.  ``self._hessian`` and ``self._dof`` are replaced by
    their projected counterparts.

    :raises ValueError: when ``len(blocks)`` differs from the number of
        atoms"""

    natoms = self._n_atoms
    if natoms != len(blocks):
        raise ValueError('len(blocks) must match number of atoms')

    LOGGER.timeit('_rtb')
    from collections import defaultdict

    # map arbitrary block labels onto consecutive integers
    relabel = defaultdict(Increment())
    blocks = np.array([relabel[label] for label in blocks], dtype='int32')

    try:
        from collections import Counter
    except ImportError:
        counter = defaultdict(int)
        for label in blocks:
            counter[label] += 1
    else:
        counter = Counter(blocks)

    sizes = list(counter.values())
    nblocks = len(sizes)
    nones = sum(1 for size in sizes if size == 1)   # single-unit blocks
    maxsize = max([1] + sizes)

    LOGGER.info('System has {0} blocks largest with {1} of {2} units.'
                .format(nblocks, maxsize, natoms))

    # single-unit blocks contribute 3 columns instead of 6
    nb6 = nblocks * 6 - nones * 3

    coords = coords.T.astype(float, order='C')
    full_hessian = self._hessian
    project = np.zeros((natoms * 3, nb6), float)
    self._project = project

    from .rtbtools import calc_projection
    calc_projection(coords, blocks, project, natoms, nblocks, nb6, maxsize)

    self._hessian = project.T.dot(full_hessian).dot(project)
    self._dof = self._hessian.shape[0]
    LOGGER.report('Block Hessian and projection matrix were calculated in %.2fs.',
                  label='_rtb')
def prody_select(selstr, *pdbs, **kwargs):
    """Write selected atoms from a PDB file in PDB format.

    :arg selstr: atom selection string, see :ref:`selections`

    :arg pdbs: :term:`PDB` identifier(s) or filename(s)

    :arg output: output filename, default is :file:`pdb_selected.pdb`

    :arg prefix: prefix for output file, default is PDB filename

    :arg suffix: output filename suffix, default is :file:`_selected`

    :raises ValueError: when no PDB argument is given"""

    from os.path import isfile
    from prody import LOGGER, parsePDB, writePDB

    if not pdbs:
        raise ValueError('pdb argument must be provided')

    # backwards compatibility: a lone (pdb, selstr) pair in the old order
    looks_like_pdb = (isfile(selstr) or
                      (len(selstr) == 4 and selstr[0].isdigit()))
    if looks_like_pdb and len(pdbs) == 1 and not isfile(pdbs[0]):
        pdbs, selstr = selstr, pdbs[0]
        LOGGER.warn('The order of selstr and pdb arguments have switched '
                    'to support multiple files, old order will be supported '
                    'until v1.4.')
        pdbs = [pdbs]

    prefix = kwargs.get('prefix', None)
    suffix = kwargs.get('suffix', '_selected')
    output = kwargs.get('output', None)

    for pdb in pdbs:
        pdb = parsePDB(pdb)

        pdbselect = pdb.select(selstr)
        if pdbselect is None:
            LOGGER.warn('Selection {0:s} did not match any atoms.'
                        .format(repr(selstr)))
            # move on to the next structure instead of abandoning the
            # remaining files
            continue
        LOGGER.info('Selection {0:s} matched {1:d} atoms.'
                    .format(repr(selstr), len(pdbselect)))

        outname = output or ((prefix or pdb.getTitle()) + suffix)
        LOGGER.info('Selection is written into: ' +
                    writePDB(outname, pdbselect))
def __add__(self, other):
    """Concatenate ensembles. The reference coordinates, atoms,
    and weights of *self* is used in the resulting ensemble.

    Data stored under a key that exists in only one of the two ensembles
    is padded with zeros for the conformations of the other one.

    :raises TypeError: when *other* is not an :class:`Ensemble`
    :raises ValueError: when the numbers of atoms differ"""

    if not isinstance(other, Ensemble):
        raise TypeError('an Ensemble instance cannot be added to an {0} '
                        'instance'.format(type(other)))
    elif self._n_atoms != other._n_atoms:
        raise ValueError('Ensembles must have same number of atoms.')

    ensemble = Ensemble('{0} + {1}'.format(self.getTitle(),
                                           other.getTitle()))
    if self._coords is not None:
        ensemble.setCoords(self._coords.copy())
    if self._confs is not None:
        ensemble.addCoordset(self._confs.copy())
    if other._confs is not None:
        ensemble.addCoordset(other._confs.copy())

    all_keys = list(self._data.keys()) + list(other._data.keys())
    for key in all_keys:
        if key in self._data and key in other._data:
            self_data = self._data[key]
            other_data = other._data[key]
        elif key in self._data:
            self_data = self._data[key]
            other_data = zeros((other.numConfs(),) + self_data.shape[1:],
                               dtype=self_data.dtype)
        elif key in other._data:
            other_data = other._data[key]
            # the padding stands in for the conformations of *self*, so it
            # must be sized by self.numConfs(), not other.numConfs()
            self_data = zeros((self.numConfs(),) + other_data.shape[1:],
                              dtype=other_data.dtype)
        ensemble._data[key] = concatenate((self_data, other_data), axis=0)

    if self._weights is not None:
        LOGGER.info('Atom weights from {0} are used in {1}.'
                    .format(repr(self._title), repr(ensemble.getTitle())))
        ensemble.setWeights(self._weights.copy())
    elif other._weights is not None:
        ensemble.setWeights(other._weights.copy())

    if self._atoms is not None:
        ensemble.setAtoms(self._atoms)
        ensemble._indices = self._indices
    else:
        ensemble.setAtoms(other._atoms)
        ensemble._indices = other._indices
    return ensemble
def getFilterList(self):
    """Returns a list of chemicals for the entries that were filtered out.

    The number of entries removed by each criterion is logged before the
    stored list is returned.

    :raises ValueError: when no filtering has been done yet"""

    filterDict = self._filterDict
    if filterDict is None:
        raise ValueError(
            'You cannot obtain the list of filtered out entries before doing any filtering.'
        )

    criteria = ('lower_MW', 'upper_MW', 'conf_score')
    counts = [str(len(filterDict[name])) for name in criteria]
    LOGGER.info('Filtered out [' + ', '.join(counts) +
                '] for [lower weight, upper weight, confidence score]')
    return self._filterList
def update(self, source=None):
    """Update data and files from CATH.

    :arg source: an URL, the name of a local XML file, a file handle, or
        XML text; a source already stored on the instance takes precedence
        over this argument.  Nothing is done when no source is available.

    :raises TypeError: when *source* is neither a string nor an object
        with a ``read`` method"""

    self._source = source = self._source or source
    self.reset()
    if source is None:
        return

    LOGGER.timeit('_cath_update')

    # 0: remote (URL), 1: local file or handle, 2: XML text
    type_ = 0
    tree = None
    if isinstance(source, str):
        if isfile(source):
            type_ = 1
        elif isURL(source):
            type_ = 0
        else:
            type_ = 2
    elif hasattr(source, 'read'):
        type_ = 1
    else:
        raise TypeError('source must be either an url, file name, file handle, '
                        'or text in xml format')

    if type_ == 0:
        LOGGER.info('Fetching data from CATH...')
        self._fetch()
        LOGGER.info('Parsing CATH files...')
        self._parse()
    elif type_ == 1:
        LOGGER.info('Reading data from the local xml file...')
        tree = ET.parse(source)
    elif type_ == 2:
        LOGGER.info('Parsing input string...')
        tree = ET.fromstring(source)

    # post-processing
    if type_ > 0:
        # ET.parse returns an ElementTree while ET.fromstring returns the
        # root Element itself, which has no getroot()
        root = tree.getroot() if hasattr(tree, 'getroot') else tree

        # remove prefix from node tags; str.lstrip('id.') would strip any
        # leading 'i', 'd' or '.' characters rather than the exact prefix
        prefix = 'id.'
        for node in root.iter():
            if node.tag.startswith(prefix):
                node.tag = node.tag[len(prefix):]

        # convert length attributes from str to int
        for node in root.findall('.//*[@length]'):
            node.attrib['length'] = int(node.attrib['length'])

        copy2(root, self.root)
        self._update_map()

    LOGGER.report('CATH local database built in %.2fs.', '_cath_update')
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by
    blastclust.  They are available at FTP site:
    ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it after
    compressing in your home directory in :file:`.prody/pdbclusters`.
    Compressed files will be less than 4 Mb in size.  Cluster data can
    be loaded using :func:`loadPDBClusters` function and be accessed
    using :func:`listPDBCluster`.

    :arg sqid: sequence identity level to download, by default all levels
        in ``PDB_CLUSTERS`` are downloaded

    :raises ValueError: when *sqid* is not one of the known levels"""

    if sqid is not None:
        if sqid not in PDB_CLUSTERS:
            raise ValueError('sqid must be one of ' + PDB_CLUSTERS_SQID_STR)
        keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)

    # progress is measured against what was actually requested
    LOGGER.progress('Downloading sequence clusters', len(keys),
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.'.format(x))
            continue
        # close both streams even when reading or writing fails
        try:
            out = openFile(filename + '.gz', 'w', folder=PDB_CLUSTERS_PATH)
            try:
                out.write(inp.read())
            finally:
                out.close()
        finally:
            inp.close()
        count += 1
        LOGGER.update(i, '_prody_fetchPDBClusters')
    LOGGER.clear()
    if len(keys) == count:
        LOGGER.info('All PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warn('PDB clusters could not be downloaded.')
def iterpose(self, rmsd=0.0001, quiet=False):
    """Iteratively superpose the ensemble until convergence.  Initially,
    all conformations are aligned with the reference coordinates.  Then
    mean coordinates are calculated, and are set as the new reference
    coordinates.  This is repeated until reference coordinates do not
    change.  This is determined by the value of RMSD between the new and
    old reference coordinates.  Note that at the end of the iterative
    procedure the reference coordinate set will be average of conformations
    in the ensemble.

    :arg rmsd: change in reference coordinates to determine convergence,
        default is 0.0001 Å RMSD
    :type rmsd: float

    :arg quiet: forwarded to the superposition step, default is **False**
    :type quiet: bool

    :raises AttributeError: when reference coordinates or conformations
        are not set"""

    if self._coords is None:
        raise AttributeError('coordinates are not set, use `setCoords`')
    if self._confs is None or len(self._confs) == 0:
        raise AttributeError('conformations are not set, use'
                             '`addCoordset`')

    LOGGER.info('Starting iterative superposition:')
    LOGGER.timeit('_prody_ensemble')

    weights = self._weights
    n_confs = len(self)

    # normalisation used for the weighted mean
    if weights is not None:
        if weights.ndim == 3:
            weightsum = weights.sum(axis=0)
            weightsum[weightsum == 0.] = 1.  # add pseudocount to avoid nan
        else:
            weightsum = n_confs

    change = 1
    n_steps = 0
    while change > rmsd:
        self._superpose(quiet=quiet)
        if weights is None:
            mean_xyz = self._confs.sum(0) / n_confs
        else:
            mean_xyz = (self._confs * weights).sum(0) / weightsum
        change = getRMSD(self._coords, mean_xyz)
        self._coords = mean_xyz
        n_steps += 1
        LOGGER.info('Step #{0}: RMSD difference = {1:.4e}'.format(
            n_steps, change))

    LOGGER.report('Iterative superposition completed in %.2fs.',
                  '_prody_ensemble')
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by
    blastclust.  They are available at FTP site:
    ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it after
    compressing in your home directory in :file:`.prody/pdbclusters`.
    Compressed files will be less than 4 Mb in size.  Cluster data can
    be loaded using :func:`loadPDBClusters` function and be accessed
    using :func:`listPDBCluster`.

    :arg sqid: sequence identity level to download, by default all levels
        in ``PDB_CLUSTERS`` are downloaded

    :raises ValueError: when *sqid* is not one of the known levels"""

    if sqid is not None:
        if sqid not in PDB_CLUSTERS:
            raise ValueError('sqid must be one of ' + PDB_CLUSTERS_SQID_STR)
        keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)

    LOGGER.progress('Downloading sequence clusters', len(keys),
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.'.format(x))
            continue
        # close both streams even when reading or writing fails
        try:
            out = openFile(filename + '.gz', 'w', folder=PDB_CLUSTERS_PATH)
            try:
                out.write(inp.read())
            finally:
                out.close()
        finally:
            inp.close()
        count += 1
        LOGGER.update(i, label='_prody_fetchPDBClusters')
    LOGGER.finish()
    # success is judged against what was actually requested
    if len(keys) == count:
        LOGGER.info('All PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warn('PDB clusters could not be downloaded.')
def getFilterList(self):
    """Returns a list of PDB IDs and chains for the entries that were
    filtered out.

    The number of entries removed by each criterion is logged before the
    stored list is returned.

    :raises ValueError: when no filtering has been done yet"""

    filterDict = self._filterDict
    if filterDict is None:
        raise ValueError(
            'You cannot obtain the list of filtered out entries before doing any filtering.'
        )

    criteria = ('len', 'rmsd', 'Z', 'identity')
    counts = [str(len(filterDict[name])) for name in criteria]
    LOGGER.info('Filtered out [' + ', '.join(counts) +
                '] for [length, RMSD, Z, identity]')
    return self._filterList
def evol_filter(msa, *word, **kwargs):
    """Filter sequences of an MSA file by label and write the result.

    :arg msa: MSA filename

    :arg word: one or more words that labels are compared against

    Exactly one of the keyword flags *startswith*, *endswith*, *contains*
    or *equals* selects how labels are matched; with several words a label
    is kept when it matches any of them.  *outname* sets the output name
    (default is the input name with ``_filtered`` appended) and
    *filter_full* is forwarded to :class:`MSAFile`.  All keyword arguments
    are also passed on to :func:`writeMSA`.

    :raises TypeError: when none of the matching flags is specified"""

    from prody import MSAFile, writeMSA, LOGGER
    from os.path import splitext

    outname = kwargs.get('outname')
    if outname is None:
        outname, ext = splitext(msa)
        if ext.lower() == '.gz':
            # NOTE(review): this repeats the split above and therefore does
            # not strip the inner extension of e.g. ``x.fasta.gz``; kept
            # as-is so default output names do not change
            outname, _ = splitext(msa)
        outname += '_filtered' + ext

    single = len(word) == 1
    if single:
        word = word[0]

    if kwargs.get('startswith', False):
        # str.startswith/endswith accept a tuple of alternatives
        prefixes = word if single else tuple(word)
        label_filter = lambda label, seq, word=prefixes: label.startswith(word)
    elif kwargs.get('endswith', False):
        suffixes = word if single else tuple(word)
        label_filter = lambda label, seq, word=suffixes: label.endswith(word)
    elif kwargs.get('contains', False):
        if single:
            label_filter = lambda label, seq, word=word: word in label
        else:
            label_filter = (lambda label, seq, word=tuple(word):
                            any(w in label for w in word))
    elif kwargs.get('equals', False):
        if single:
            label_filter = lambda label, seq, word=word: word == label
        else:
            label_filter = lambda label, seq, word=set(word): label in word
    else:
        raise TypeError('one of startswith, endswith, contains, or equals '
                        'must be specified')

    msa = MSAFile(msa, filter=label_filter,
                  filter_full=kwargs.get('filter_full', False))

    LOGGER.info('Filtered MSA is written in file: ' +
                writeMSA(outname, msa, **kwargs))
def searchDali(pdbId, chainId, daliURL=None, subset='fullPDB', **kwargs):
    """Search Dali server with input of PDB ID and chain ID.
    Dali server: http://ekhidna2.biocenter.helsinki.fi/dali/

    :arg pdbId: PDB identifier, it is lower-cased before submission
    :type pdbId: str

    :arg chainId: chain identifier appended to *pdbId*
    :type chainId: str

    :arg daliURL: submission URL, by default the Dali ``dump.cgi`` endpoint
    :type daliURL: str

    :arg subset: fullPDB, PDB25, PDB50, PDB90
    :type subset: str

    :arg timeout: forwarded to :class:`DaliRecord`, default is 120

    Returns the :class:`DaliRecord` when its ``isSuccess`` attribute is
    true, **None** otherwise.
    """

    LOGGER.timeit('_dali')
    timeout = kwargs.pop('timeout', 120)

    if daliURL is None:
        daliURL = "http://ekhidna2.biocenter.helsinki.fi/cgi-bin/sans/dump.cgi"
    pdbId = pdbId.lower()
    pdb_chain = pdbId + chainId
    parameters = {'cd1': pdb_chain, 'method': 'search',
                  'title': 'Title_' + pdb_chain, 'address': ''}
    enc_params = urllib.urlencode(parameters).encode('utf-8')
    request = urllib2.Request(daliURL, enc_params)

    try_error = 3
    while try_error >= 0:
        try:
            url = urllib2.urlopen(request).url
            break
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit still abort the retry loop
            try_error -= 1
            if try_error >= 0:
                LOGGER.sleep(2, '. Connection error happened. Trying to reconnect...')
                continue
            else:
                # retries exhausted: one last attempt whose error propagates
                url = urllib2.urlopen(request).url
                break

    if url.split('.')[-1].lower() in ['html', 'php']:
        # keep only the directory part of the result URL
        url = url.replace(url.split('/')[-1], '')
    LOGGER.debug('Submitted Dali search for PDB and chain "{0} and {1}".'.format(pdbId, chainId))
    LOGGER.info(url)
    LOGGER.clear()

    obj = DaliRecord(url, pdbId, chainId, subset=subset, timeout=timeout,
                     **kwargs)
    if obj.isSuccess:
        return obj
    return None
def parsePolyPhen2output(pph2_output):
    '''Import PolyPhen-2 results directly from the output of
    'queryPolyPhen2' or from a file (in 'full' format).

    A dictionary is read through its ``'full'`` entry, a string is treated
    as the path of a file.  Blank lines and lines starting with ``#`` are
    skipped.  Rows with one column less than expected get a ``'?'``
    appended.  A structured array with one ``'U25'`` field per entry of
    ``pph2_columns`` is returned.

    Raises RuntimeError when no valid line is left and ValueError when a
    row has an unexpected number of columns.
    '''
    assert type(pph2_output) in [dict, str]

    if type(pph2_output) is dict:
        raw_lines = pph2_output['full'].split('\n')
    else:
        with open(pph2_output, 'r') as handle:
            raw_lines = handle.readlines()

    # discard invalid lines
    lines = []
    for raw in raw_lines:
        if raw.strip() and raw[0] != '#':
            lines.append(raw)

    if not lines:
        msg = (
            "PolyPhen-2's output is empty. Please check file 'pph2-log.txt' "
            "in the output folder for error messages from PolyPhen-2. \n"
            "Typical errors include: \n"
            "1) query contains *non-human* variants \n"
            "2) variants' format is incorrect (e.g. "
            '"UniprotID pos wt_aa mut_aa") \n'
            "3) wild-type amino acids are in the wrong position on the "
            "sequence (please refer to Uniprot's canonical isoform) \n"
            "4) Uniprot accession number is not recognized by PolyPhen-2. \n")
        raise RuntimeError(msg)

    # define a structured array
    n_cols = len(pph2_columns)
    pl_dtype = np.dtype([(col, 'U25') for col in pph2_columns])
    parsed_lines = np.zeros(len(lines), dtype=pl_dtype)

    # fill structured array
    for row, line in enumerate(lines):
        fields = [field.strip() for field in line.split('\t')]
        n_fields = len(fields)
        if n_fields == n_cols - 1:
            # manually insert null 'other' column
            fields.append('?')
        elif n_fields != n_cols:
            raise ValueError(
                'Incorrect number of columns: {}'.format(n_fields))
        parsed_lines[row] = tuple(fields)

    LOGGER.info("PolyPhen-2's output parsed.")
    return parsed_lines
def fetch(self, url=None, localFile=False, **kwargs):
    """Read Emsurfer results and store them on the instance.

    :arg url: address of the results, by default ``self._url``; when
        *localFile* is true this is the path of a local file instead
    :type url: str

    :arg localFile: read the results from a local file, default is
        **False**
    :type localFile: bool

    The first three and last two lines of the text are skipped and the
    remaining tab-separated rows are stored as a structured array in
    ``self._emsurferInfo``, as a tuple of EMDB IDs in ``self._emdListAll``
    and ``self._emdList``, and as a dictionary keyed by EMDB ID in
    ``self._alignEMD``.  Returns **True**."""

    if localFile:
        with open(url, 'r') as emsurfer_file:
            text = emsurfer_file.read()
    else:
        import requests

        if url is None:
            url = self._url
        text = requests.get(url).content
        if PY3K:
            text = text.decode()
        LOGGER.clear()
        LOGGER.report('Emsurfer results were fetched in %.1fs.', '_emsurfer')

    # split into lines for both sources; previously text read from a local
    # file was left unsplit and then iterated character by character
    data = text.strip().split('\n')

    data_list = [tuple(line.split('\t')) for line in data[3:-2]]

    # Rank    EMDB_ID     EUC_D    RESOLUTION
    emsurferInfo = np.array(data_list, dtype=[('Rank', '<i4'),
                                              ('EMDB_ID', '<U70'),
                                              ('EUC_D', '<f4'),
                                              ('RESOLUTION', '<f4')])
    self._emsurferInfo = emsurferInfo

    emdListAll = []
    emsurfer_temp_dict = dict()
    for temp in self._emsurferInfo:
        emdbId = temp[1]
        emsurfer_temp_dict[emdbId] = {'Rank': temp[0],
                                      'EMDB_ID': emdbId,
                                      'EUC_D': temp[2],
                                      'RESOLUTION': temp[3]}
        emdListAll.append(emdbId)

    self._emdListAll = tuple(emdListAll)
    self._emdList = self._emdListAll
    self._alignEMD = emsurfer_temp_dict
    LOGGER.info('Obtained ' + str(len(emdListAll)) +
                ' EMD matches from Emsurfer for ' + self._emdId + '.')
    return True
def print_feat_imp_figure(filename, feat_imp, featset):
    """Save a bar plot of feature importances as a PNG file.

    :arg filename: output name, its extension is replaced by ``.png``
    :type filename: str

    :arg feat_imp: one importance value per feature

    :arg featset: feature names used as tick labels

    Nothing is done when matplotlib cannot be imported."""

    assert isinstance(filename, str), 'filename must be a string'
    filename = os.path.splitext(filename)[0] + '.png'

    if _try_import_matplotlib() is None:
        return
    from matplotlib import pyplot as plt

    fig = plt.figure(figsize=(7, 7))
    positions = range(len(feat_imp))
    plt.bar(positions, feat_imp, align='center', tick_label=featset)
    plt.xticks(rotation='vertical')
    plt.ylabel('feat. importance')
    fig.savefig(filename, format='png', bbox_inches='tight')
    plt.close()
    # reset matplotlib settings to their defaults
    plt.rcParams.update(plt.rcParamsDefault)
    LOGGER.info(f'Feat. importance plot saved to {filename}')
def close(self):
    """Close the file.  This method will not affect a stream.

    When the object was opened from a filename in a non-reading mode and
    the format is Stockholm, the terminal ``//`` line is written first.
    Errors raised while closing the underlying stream are ignored."""

    if self._filename is not None:
        writing = not self._mode.startswith('r')
        if writing and self._format == STOCKHOLM:
            try:
                self._write('//\n')
            except ValueError:
                LOGGER.info('Failed to write terminal slash characters to '
                            'closed file.')

        # best effort: a stream that cannot be closed is not an error here
        try:
            self._stream.close()
        except Exception:
            pass

    self._closed = True
def parseChainsList(filename):
    """
    Parse a set of PDBs and extract chains based on a list in a text file.

    Each line is split on whitespace: the first field has the form
    ``<pdb id>_<subset>`` and the second field names the chain to extract.
    Every PDB is parsed once, the first time its identifier is met.

    :arg filename: the name of the file to be read
    :type filename: str

    Returns: lists containing an :class:'.AtomGroup' for each PDB,
    the headers for those PDBs, and the requested :class:`.Chain` objects
    """
    with open(filename, 'r') as fi:
        lines = fi.readlines()

    ags = []
    headers = []
    chains = []
    # pdb id -> parsed structure, so that a chain is always taken from its
    # own PDB even when the identifiers in the list are interleaved
    parsed = {}

    verb = LOGGER.verbosity
    LOGGER.verbosity = 'info'
    try:
        LOGGER.progress('Starting', len(lines))
        for i, line in enumerate(lines):
            LOGGER.update(i, 'Parsing lines...')
            fields = line.split()
            entry = fields[0].split('_')
            pdb_id = entry[0]
            if pdb_id not in parsed:
                ag, header = parsePDB(pdb_id, compressed=False,
                                      subset=entry[1], header=True)
                parsed[pdb_id] = ag
                ags.append(ag)
                headers.append(header)
            chains.append(parsed[pdb_id].getHierView()[fields[1]])
    finally:
        # restore the caller's verbosity even when parsing fails
        LOGGER.verbosity = verb

    LOGGER.info('{0} PDBs have been parsed and {1} chains have been '
                'extracted.'.format(len(ags), len(chains)))
    return ags, headers, chains
def save(self, filename='cath.xml'):
    """Write local CATH database to an XML file. *filename* can either be a
    file name or a handle.  The handle is closed after writing.

    :raises ValueError: when the local database has not been built"""

    LOGGER.timeit('_cath_write')

    if not isinstance(filename, str):
        try:
            fn = filename.name
        except AttributeError:
            fn = repr(filename)
        f = filename
    else:
        fn = filename

    LOGGER.info('Writing data to {0}...'.format(fn))

    if not len(self.root):
        raise ValueError('local database has not been built, '
                         'please call update() first')

    # work on a copy so that the in-memory tree keeps its own tags and
    # attribute types
    tree = self.copy()
    root = tree.getroot()

    # convert int to str
    for node in root.findall('.//*[@length]'):
        node.attrib['length'] = str(node.attrib['length'])

    # add prefix to node tags
    for node in root.iter():
        node.tag = 'id.' + node.tag

    # add indentation to nodes
    indentElement(root)

    if isinstance(filename, str):
        f = open(filename, 'wb')
    try:
        tree.write(f, encoding='utf-8')
    finally:
        # release the file even when serialisation fails
        f.close()

    LOGGER.report('CATH local database saved in %.2fs.', '_cath_write')
def __add__(self, other):
    """Concatenate ensembles. The reference coordinates and weights of
    *self* is used in the resulting ensemble.

    :raises TypeError: when *other* is not an :class:`Ensemble`
    :raises ValueError: when the numbers of atoms differ"""

    if not isinstance(other, Ensemble):
        raise TypeError('an Ensemble instance cannot be added to an {0} '
                        'instance'.format(type(other)))
    elif self.numAtoms() != other.numAtoms():
        raise ValueError('Ensembles must have same number of atoms.')

    title = '{0} + {1}'.format(self.getTitle(), other.getTitle())
    combined = Ensemble(title)

    # reference coordinates come from *self*; conformations of *self*
    # precede those of *other*
    combined.setCoords(self._coords.copy())
    combined.addCoordset(self._confs.copy())
    combined.addCoordset(other.getCoordsets())

    weights = self._weights
    if weights is not None:
        LOGGER.info('Atom weights from {0!r} are used in {1!r}.'.format(
            self._title, combined.getTitle()))
        combined.setWeights(weights)
    return combined
def _sliceMSA(self, msa):
    """Reduce *msa* to the columns occupied by this protein's sequence(s).

    Sequences are looked up in *msa* by the ``'name 0'`` entry of
    ``self.fullRecord``.  A column is kept when at least one of the
    matching sequences has an alphabetic character in it.

    Returns the sliced :class:`MSA` and the list of matching row indexes.

    :raises RuntimeError: when no sequence is found for the name"""

    acc_name = self.fullRecord['name 0']

    # find sequences in MSA related to the given Uniprot name
    indexes = msa.getIndex(acc_name)
    if indexes is None:
        raise RuntimeError(
            'No sequence found in MSA for {}'.format(acc_name))
    elif type(indexes) is not list:
        indexes = [indexes]

    # slice MSA to include only columns from selected sequences
    arr = msa._getArray()
    occupied = [np.array([], dtype=int)]
    for row in indexes:
        occupied.append(np.char.isalpha(arr[row]).nonzero()[0])
    cols = np.unique(np.concatenate(occupied))

    sliced_msa = MSA(arr.take(cols, 1), title='refined', labels=msa._labels)
    LOGGER.info('Number of columns in MSA reduced to {}.'.format(
        sliced_msa.numResidues()))
    return sliced_msa, indexes