def buildSCAMatrix(msa, turbo=True, **kwargs):
    """Returns SCA matrix calculated for *msa*, which may be an
    :class:`.MSA` instance or a 2D Numpy character array.

    Implementation is case insensitive and handles ambiguous amino acids
    as follows:

      * **B** (Asx) count is allocated to *D* (Asp) and *N* (Asn)
      * **Z** (Glx) count is allocated to *E* (Glu) and *Q* (Gln)
      * **J** (Xle) count is allocated to *I* (Ile) and *L* (Leu)
      * **X** (Xaa) count is allocated to the twenty standard amino acids
      * Joint probability of observing a pair of ambiguous amino acids is
        allocated to all potential combinations, e.g. probability of **XX**
        is allocated to 400 combinations of standard amino acids, similarly
        probability of **XB** is allocated to 40 combinations of *D* and *N*
        with the standard amino acids.

    Selenocysteine (**U**, Sec) and pyrrolysine (**O**, Pyl) are considered
    as distinct amino acids.  When *ambiguity* is set **False**, all alphabet
    characters are considered as distinct types.  All non-alphabet characters
    are considered as gaps."""

    msa = getMSA(msa)
    from .msatools import msasca
    LOGGER.timeit('_sca')
    length = msa.shape[1]
    sca = zeros((length, length), float)
    sca = msasca(msa, sca, turbo=bool(turbo))
    LOGGER.report('SCA matrix was calculated in %.2fs.', '_sca')
    return sca
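# Illustrative usage sketch only; the Pfam accession below is a placeholder
# and any alignment readable by parseMSA would work the same way.
from prody import fetchPfamMSA, parseMSA, buildSCAMatrix

msa = parseMSA(fetchPfamMSA('PF00074'))   # hypothetical Pfam family
sca_matrix = buildSCAMatrix(msa)          # (length, length) coupling matrix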
def calcMeff(msa, seqid=0.8, refine=False, weight=False, **kwargs):
    """Returns the Meff for *msa*, which may be an :class:`.MSA` instance or
    a 2D Numpy character array.

    Since similar sequences in an *msa* decrease its diversity, *Meff* assigns
    a weight to each sequence.  For example, if a sequence has 5 similar
    sequences in the MSA (itself included), its weight is 1/5 = 0.2.  Meff is
    the sum of all sequence weights; in other words, Meff can be understood as
    the effective number of independent sequences.

    Sequences sharing sequence identity of *seqid* or more with another
    sequence are regarded as similar sequences when calculating Meff.

    Sequences are not refined by default.  When *refine* is set **True**, the
    MSA will be refined by the first sequence.

    The weight for each sequence is also returned when *weight* is
    **True**."""

    msa = getMSA(msa)
    from .msatools import msameff
    LOGGER.timeit('_meff')
    refine = 1 if refine else 0
    # msameff's *meff_only* flag is the inverse of the *weight* argument:
    # when weights are requested, a weight array must be allocated and passed
    meff_only = 0 if weight else 1
    if weight:
        w = zeros((msa.shape[0]), float)
        meff = msameff(msa, theta=1. - seqid, meff_only=meff_only,
                       refine=refine, w=w)
    else:
        meff = msameff(msa, theta=1. - seqid, meff_only=meff_only,
                       refine=refine)
    LOGGER.report('Meff was calculated in %.2fs.', '_meff')
    return meff
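# Sketch of Meff in practice; the alignment file name is a placeholder.
from prody import parseMSA, calcMeff

msa = parseMSA('PF00074_full.sth')        # hypothetical alignment file
meff = calcMeff(msa, seqid=0.8)           # effective number of sequences
# calcMeff(msa, weight=True) additionally returns per-sequence weights,
# as described in the docstring above.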
def calcMBSfromSim(simMatrix, nEvals=20, remove_outliers=True, remove_offset=True, **kwargs): LOGGER.timeit('_MBS') n = simMatrix.shape[0] mbs = np.zeros(n) for i in range(n): try: # cut "non-covalent" bonds around atom 'i' modSim = MBSPointMutation(simMatrix, i) # compute laplacian's spectrum of eigvals laplacian = sparse.csgraph.laplacian(modSim, normed=True) evals = sparse.linalg.eigsh(laplacian, k=min(nEvals, n-1), which='SM', return_eigenvectors=False) # sort eigvals in ascending order evals = np.sort(evals) # compute MBS at site i mbs[i] = np.sum(1./evals[1:]) except Exception as err: LOGGER.warn('Unable to compute MBS at position ' '{0}. {1}'.format(i, err)) mbs[i] = np.nan if any(~np.isnan(mbs)): # remove outliers if remove_outliers is True: mbs = _removeOutliers(mbs, **kwargs) # remove offset if remove_offset is True: offset = min(mbs[~np.isnan(mbs)]) mbs = mbs - offset LOGGER.report('MBS computed in %.1fs.', '_MBS') return mbs
def parsePDBs(self, **kwargs): """Load PDB into memory as :class:`.AtomGroup` instances using :func:`.parsePDB` and perform selection based on residue ranges given by CATH.""" pdbs = self.getPDBs(True) selstrs = self.getSelStrs() header = kwargs.get('header', False) model = kwargs.get('model', None) LOGGER.timeit('_cath_parsePDB') LOGGER.info('Parsing {0} PDB files...'.format(len(pdbs))) ret = parsePDB(*pdbs, **kwargs) if model != 0: if header: prots, _ = ret else: prots = ret LOGGER.info('Extracting domains...') for i in range(len(prots)): sel = prots[i].select(selstrs[i]) prots[i] = sel LOGGER.report('CATH domains are parsed and extracted in %.2fs', '_cath_parsePDB') return ret
def parsePQR(filename, **kwargs):
    """Returns an :class:`.AtomGroup` containing data parsed from PQR lines.

    :arg filename: a PQR filename
    :type filename: str"""

    title = kwargs.get('title', kwargs.get('name'))
    model = 1
    header = False
    chain = kwargs.get('chain')
    subset = kwargs.get('subset')
    altloc = kwargs.get('altloc', 'A')
    max_n_atoms = kwargs.get('max_n_atoms', 1e5)
    if not os.path.isfile(filename):
        raise IOError('No such file: {0}'.format(repr(filename)))
    if title is None:
        fn, ext = os.path.splitext(os.path.split(filename)[1])
        if ext == '.gz':
            fn, ext = os.path.splitext(fn)
        title = fn.lower()
    title_suffix = ''
    if subset:
        try:
            subset = _PDBSubsets[subset.lower()]
        except AttributeError:
            raise TypeError('subset must be a string')
        except KeyError:
            raise ValueError('{0} is not a valid subset'
                             .format(repr(subset)))
        title_suffix = '_' + subset
    if chain is not None:
        if not isinstance(chain, str):
            raise TypeError('chain must be a string')
        elif len(chain) == 0:
            raise ValueError('chain must not be an empty string')
        title_suffix = '_' + chain + title_suffix
    if 'ag' in kwargs:
        ag = kwargs['ag']
        if not isinstance(ag, AtomGroup):
            raise TypeError('ag must be an AtomGroup instance')
        n_csets = ag.numCoordsets()
    else:
        ag = AtomGroup(title + title_suffix)
        n_csets = 0

    pqr = openFile(filename, 'rt')
    lines = pqr.readlines()
    pqr.close()
    LOGGER.timeit()
    ag = _parsePDBLines(ag, lines, split=0, model=1, chain=chain,
                        subset=subset, altloc_torf=False, format='pqr',
                        max_n_atoms=max_n_atoms)
    if ag.numAtoms() > 0:
        LOGGER.report('{0} atoms and {1} coordinate sets were '
                      'parsed in %.2fs.'.format(ag.numAtoms(),
                                                ag.numCoordsets() - n_csets))
        return ag
    else:
        return None
def update(self, source=None):
    """Update data and files from CATH."""

    self._source = source = self._source or source
    self.reset()
    if source is None:
        return

    LOGGER.timeit('_cath_update')

    type_ = 0
    tree = None
    if isinstance(source, str):
        if isfile(source):
            type_ = 1
        elif isURL(source):
            type_ = 0
        else:
            type_ = 2
    elif hasattr(source, 'read'):
        type_ = 1
    else:
        raise TypeError('source must be either a URL, file name, file '
                        'handle, or text in XML format')

    if type_ == 0:
        LOGGER.info('Fetching data from CATH...')
        self._fetch()

        LOGGER.info('Parsing CATH files...')
        self._parse()
    elif type_ == 1:
        LOGGER.info('Reading data from the local XML file...')
        tree = ET.parse(source)
    elif type_ == 2:
        LOGGER.info('Parsing input string...')
        # ET.fromstring returns an Element; wrap it so getroot() below works
        tree = ET.ElementTree(ET.fromstring(source))

    # post-processing
    if type_ > 0:
        root = tree.getroot()
        nodes = root.iter()

        # remove prefix from node tags
        for node in nodes:
            node.tag = node.tag.lstrip('id.')

        # convert length attributes from str to int
        length_nodes = root.findall('.//*[@length]')
        for node in length_nodes:
            node.attrib['length'] = int(node.attrib['length'])

        copy2(root, self.root)
        self._update_map()

    LOGGER.report('CATH local database built in %.2fs.', '_cath_update')
def loadAtoms(filename): """Returns :class:`.AtomGroup` instance loaded from *filename* using :func:`numpy.load` function. See also :func:`saveAtoms`.""" LOGGER.timeit('_prody_loadatoms') attr_dict = load(filename) files = set(attr_dict.files) if not 'n_atoms' in files: raise ValueError('{0} is not a valid atomic data file' .format(repr(filename))) title = str(attr_dict['title']) if 'coordinates' in files: coords = attr_dict['coordinates'] ag = AtomGroup(title) ag._n_csets = int(attr_dict['n_csets']) ag._coords = coords ag._n_atoms = int(attr_dict['n_atoms']) ag._setTimeStamp() if 'flagsts' in files: ag._flagsts = int(attr_dict['flagsts']) if 'bonds' in files and 'bmap' in files and 'numbonds' in files: ag._bonds = attr_dict['bonds'] ag._bmap = attr_dict['bmap'] ag._data['numbonds'] = attr_dict['numbonds'] skip_flags = set() for label, data in attr_dict.items(): if label in SKIPLOAD: continue if data.ndim == 1 and data.dtype == bool: if label in skip_flags: continue else: ag._setFlags(label, data) skip_flags.update(flags.ALIASES.get(label, [label])) else: ag.setData(label, data) for label in ['segindex', 'chindex', 'resindex']: if label in attr_dict: ag._data[label] = attr_dict[label] if ag.numCoordsets() > 0: ag._acsi = 0 if 'cslabels' in files: ag.setCSLabels(list(attr_dict['cslabels'])) LOGGER.report('Atom group was loaded in %.2fs.', '_prody_loadatoms') return ag
def superpose(self): """Superpose the ensemble onto the reference coordinates.""" if self._coords is None: raise ValueError('coordinates are not set, use `setCoords`') if self._confs is None or len(self._confs) == 0: raise ValueError('conformations are not set, use `addCoordset`') LOGGER.timeit('_prody_ensemble') self._superpose(trans=True) # trans kwarg is used by PDBEnsemble LOGGER.report('Superposition completed in %.2f seconds.', '_prody_ensemble')
def buildSeqidMatrix(msa, turbo=True):
    """Returns sequence identity matrix for *msa*."""

    msa = getMSA(msa)

    LOGGER.timeit('_seqid')
    from .seqtools import msaeye

    dim = msa.shape[0]
    seqid = msaeye(msa, ones((dim, dim), float), turbo=bool(turbo))

    LOGGER.report('Sequence identity matrix was calculated in %.2fs.',
                  '_seqid')

    return seqid
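# Pairwise identity sketch; the alignment file name is a placeholder.
from prody import parseMSA, buildSeqidMatrix

msa = parseMSA('PF00074_full.sth')        # hypothetical alignment file
seqid_matrix = buildSeqidMatrix(msa)
# seqid_matrix[i, j] is the fractional identity between sequences i and j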
def calcMechStiff(modes, coords, kbt=1.):
    """Calculate the stiffness matrix for given *modes* using the method
    described in [EB08]_.

    :arg modes: 3-dimensional normal modes, e.g. an :class:`.ANM` instance
        or a :class:`.ModeSet`; all modes in *modes* are used
    :type modes: :class:`.NMA`, :class:`.ModeSet`

    :arg coords: a coordinate set or an object with ``getCoords`` method
    :type coords: :class:`numpy.ndarray`

    Author: Mustafa Tekpinar & Karolina Mikulska-Ruminska & Cihan Kaya
    """

    try:
        coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                  coords.getCoords())
    except AttributeError:
        try:
            checkCoords(coords)
        except TypeError:
            raise TypeError('coords must be a Numpy array or an object '
                            'with `getCoords` method')

    try:
        is3d = modes.is3d()
        eigvecs = modes.getArray().T.flatten()
        eigvals = modes.getEigvals()
    except AttributeError:
        raise TypeError('modes must be either an NMA or ModeSet object')

    if not is3d:
        raise TypeError('modes must be 3-dimensional')

    n_atoms = modes.numAtoms()
    n_modes = modes.numModes()

    LOGGER.timeit('_sm')

    sm = np.zeros((n_atoms, n_atoms), np.double)
    from .smtools import calcSM
    LOGGER.info('Calculating stiffness matrix.')

    calcSM(coords, sm, eigvecs, eigvals, n_atoms, n_modes, float(kbt))

    LOGGER.report('Stiffness matrix calculated in %.2fs.', label='_sm')

    LOGGER.info('The range of effective force constant is: {0} to {1}.'
                .format(*calcStiffnessRange(sm)))

    return sm
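# Stiffness-matrix sketch; the PDB identifier is a placeholder and the ANM
# setup mirrors standard usage.
from prody import parsePDB, ANM, calcMechStiff

calphas = parsePDB('1ubi').select('protein and name CA')
anm = ANM('stiffness demo')
anm.buildHessian(calphas)
anm.calcModes(n_modes=None)               # None -> all non-zero modes
stiffness = calcMechStiff(anm, calphas)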
def getNormDistFluct(self, coords): """Normalized distance fluctuation """ model = self.getModel() LOGGER.info('Number of chains: {0}, chains: {1}.' .format(len(list(set(coords.getChids()))), \ list(set(coords.getChids())))) try: #coords = coords.select('protein and name CA') coords = (coords._getCoords() if hasattr(coords, '_getCoords') else coords.getCoords()) except AttributeError: try: checkCoords(coords) except TypeError: raise TypeError('coords must be a Numpy array or an object ' 'with `getCoords` method') if not isinstance(model, NMA): LOGGER.info('Calculating new model') model = GNM('prot analysis') model.buildKirchhoff(coords) model.calcModes() linalg = importLA() n_atoms = model.numAtoms() n_modes = model.numModes() LOGGER.timeit('_ndf') from .analysis import calcCrossCorr from numpy import linalg as LA # <dRi, dRi>, <dRj, dRj> = 1 crossC = 2-2*calcCrossCorr(model) r_ij = np.zeros((n_atoms,n_atoms,3)) for i in range(n_atoms): for j in range(i+1,n_atoms): r_ij[i][j] = coords[j,:] - coords[i,:] r_ij[j][i] = r_ij[i][j] r_ij_n = LA.norm(r_ij, axis=2) #with np.errstate(divide='ignore'): r_ij_n[np.diag_indices_from(r_ij_n)] = 1e-5 # div by 0 crossC=abs(crossC) normdistfluct = np.divide(np.sqrt(crossC),r_ij_n) LOGGER.report('NDF calculated in %.2lfs.', label='_ndf') normdistfluct[np.diag_indices_from(normdistfluct)] = 0 # div by 0 return normdistfluct
def buildMechStiff(self, coords, n_modes=None, kbt=1.):
    """Build the stiffness matrix using the method described in [EB08]_.

    .. [EB08] Eyal E., Bahar I. Toward a Molecular Understanding of
       the Anisotropic Response of Proteins to External Forces:
       Insights from Elastic Network Models.
       *Biophys J* **2008** 94:3424-3435.

    :arg coords: a coordinate set or an object with ``getCoords`` method
    :type coords: :class:`numpy.ndarray`

    :arg n_modes: number of non-zero eigenvalues/vectors to calculate;
        currently all 3N modes are calculated and used for the stiffness
        matrix regardless of this argument
    :type n_modes: int or **None**, default is **None**

    Author: Mustafa Tekpinar & Karolina Mikulska-Ruminska & Cihan Kaya
    """

    try:
        coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                  coords.getCoords())
    except AttributeError:
        try:
            checkCoords(coords)
        except TypeError:
            raise TypeError('coords must be a Numpy array or an object '
                            'with `getCoords` method')

    n_atoms = natoms = self._n_atoms
    n_modes = 3 * n_atoms

    # all 3N modes (zeros included) are needed for the stiffness calculation
    self.calcModes(n_modes=None, zeros=True)

    LOGGER.timeit('_sm')
    eigvecs = (np.transpose(self._array)).flatten()
    eigvals = np.transpose(self._eigvals)

    sm = np.zeros((n_atoms, n_atoms), np.double)
    from .smtools import calcSM
    LOGGER.info('Calculating stiffness matrix.')

    calcSM(coords, sm, eigvecs, eigvals, natoms, n_modes, float(kbt))

    LOGGER.report('Stiffness matrix calculated in %.2fs.', label='_sm')

    self._stiffness = sm

    LOGGER.info('The range of effective force constant is: {0} to {1}.'
                .format(np.min(sm[np.nonzero(sm)]), np.amax(sm)))
def parseEMDStream(stream, **kwargs): """ Returns an :class:`.AtomGroup` containing EMD data parsed from a stream of EMD file. :arg stream: Any object with the method ``readlines`` (e.g. :class:`file`, buffer, stdin)""" cutoff = kwargs.get('cutoff', None) if cutoff is not None: cutoff = float(cutoff) n_nodes = int(kwargs.get('n_nodes', 1000)) num_iter = int(kwargs.get('num_iter', 20)) map = kwargs.get('map',True) make_nodes = kwargs.get('make_nodes',False) if map is False and make_nodes is False: LOGGER.warn('At least one of map and make_nodes should be True. ' 'Setting map to False was an intentional change from the default ' 'behaviour so make_nodes has been set to True.') make_nodes = True title_suffix = kwargs.get('title_suffix','') atomgroup = AtomGroup(str(kwargs.get('title', 'Unknown')) + title_suffix) atomgroup._n_atoms = n_nodes if make_nodes: LOGGER.info('Building coordinates from electron density map. This may take a while.') LOGGER.timeit() if map: emd, atomgroup = _parseEMDLines(atomgroup, stream, cutoff=cutoff, n_nodes=n_nodes, \ num_iter=num_iter, map=map, make_nodes=make_nodes) else: atomgroup = _parseEMDLines(atomgroup, stream, cutoff=cutoff, n_nodes=n_nodes, \ num_iter=num_iter, map=map, make_nodes=make_nodes) LOGGER.report('{0} atoms and {1} coordinate sets were ' 'parsed in %.2fs.'.format(atomgroup.numAtoms(), atomgroup.numCoordsets())) else: emd = _parseEMDLines(atomgroup, stream, cutoff=cutoff, n_nodes=n_nodes, \ num_iter=num_iter, map=map, make_nodes=make_nodes) if make_nodes: if map: return emd, atomgroup else: return atomgroup else: return emd
def save(self, filename='cath.xml'):
    """Write local CATH database to an XML file. *filename* can either be a
    file name or a handle."""

    LOGGER.timeit('_cath_write')

    if not isinstance(filename, str):
        try:
            fn = filename.name
        except AttributeError:
            fn = repr(filename)
        f = filename
    else:
        fn = filename

    LOGGER.info('Writing data to {0}...'.format(fn))

    if not len(self.root):
        raise ValueError('local database has not been built, '
                         'please call update() first')

    tree = self.copy()
    root = tree.getroot()

    # convert length attributes from int to str so they can be serialized
    length_nodes = root.findall('.//*[@length]')
    for node in length_nodes:
        node.attrib['length'] = str(node.attrib['length'])

    # add prefix to node tags
    nodes = root.iter()
    for node in nodes:
        node.tag = 'id.' + node.tag

    # add indentation to nodes
    indentElement(root)

    if isinstance(filename, str):
        f = open(filename, 'wb')
        tree.write(f, encoding='utf-8')
        f.close()
    else:
        # *filename* is an already-open handle; write to it directly
        tree.write(f, encoding='utf-8')

    LOGGER.report('CATH local database saved in %.2fs.', '_cath_write')
def buildMutinfoMatrix(msa, ambiguity=True, turbo=True, **kwargs):
    """Returns mutual information matrix calculated for *msa*, which may be
    an :class:`.MSA` instance or a 2D Numpy character array.

    Implementation is case insensitive and handles ambiguous amino acids
    as follows:

      * **B** (Asx) count is allocated to *D* (Asp) and *N* (Asn)
      * **Z** (Glx) count is allocated to *E* (Glu) and *Q* (Gln)
      * **J** (Xle) count is allocated to *I* (Ile) and *L* (Leu)
      * **X** (Xaa) count is allocated to the twenty standard amino acids
      * Joint probability of observing a pair of ambiguous amino acids is
        allocated to all potential combinations, e.g. probability of **XX**
        is allocated to 400 combinations of standard amino acids, similarly
        probability of **XB** is allocated to 40 combinations of *D* and *N*
        with the standard amino acids.

    Selenocysteine (**U**, Sec) and pyrrolysine (**O**, Pyl) are considered
    as distinct amino acids.  When *ambiguity* is set **False**, all alphabet
    characters are considered as distinct types.  All non-alphabet characters
    are considered as gaps.

    Mutual information matrix can be normalized or corrected using
    :func:`applyMINormalization` and :func:`applyMICorrection` methods,
    respectively.  Normalization by joint entropy can be performed using this
    function with the *norm* option set **True**."""

    msa = getMSA(msa)

    from .msatools import msamutinfo
    LOGGER.timeit('_mutinfo')
    length = msa.shape[1]
    mutinfo = empty((length, length), float)
    mutinfo = msamutinfo(msa, mutinfo,
                         ambiguity=bool(ambiguity), turbo=bool(turbo),
                         norm=bool(kwargs.get('norm', False)),
                         debug=bool(kwargs.get('debug', False)))
    LOGGER.report('Mutual information matrix was calculated in %.2fs.',
                  '_mutinfo')

    return mutinfo
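# Mutual-information sketch; the *norm* keyword follows the docstring above
# and the alignment file name is a placeholder.
from prody import parseMSA, buildMutinfoMatrix

msa = parseMSA('PF00074_full.sth')        # hypothetical alignment file
mutinfo = buildMutinfoMatrix(msa)
mutinfo_norm = buildMutinfoMatrix(msa, norm=True)   # joint-entropy normalized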
def buildDirectInfoMatrix(msa, seqid=0.8, pseudo_weight=0.5, refine=False,
                          **kwargs):
    """Returns direct information matrix calculated for *msa*, which may be
    an :class:`.MSA` instance or a 2D Numpy character array.

    Sequences sharing sequence identity of *seqid* or more with another
    sequence are regarded as similar sequences for calculating their weights
    using :func:`.calcMeff`.

    *pseudo_weight* is the weight for pseudocount probability.

    Sequences are not refined by default. When *refine* is set **True**,
    the MSA will be refined by the first sequence and the shape of the direct
    information matrix will be smaller."""

    msa = getMSA(msa)
    from .msatools import msadipretest, msadirectinfo1, msadirectinfo2
    from numpy import matrix

    LOGGER.timeit('_di')
    if msa.shape[0] < 250:
        LOGGER.warning('DI performs best with a large number of sequences; '
                       'at least 250 sequences are recommended.')
    refine = 1 if refine else 0
    # msadipretest gets some parameters from msa to set matrix sizes
    length, q = msadipretest(msa, refine=refine)
    c = matrix.dot(matrix(zeros((length*q, 1), float)),
                   matrix(zeros((1, length*q), float)))
    prob = zeros((length, q+1), float)
    # msadirectinfo1 returns c to be inverted and prob to be used
    meff, n, length, c, prob = msadirectinfo1(msa, c, prob, theta=1. - seqid,
                                              pseudocount_weight=pseudo_weight,
                                              refine=refine, q=q+1)

    c = c.I

    di = zeros((length, length), float)
    # get the final DI
    di = msadirectinfo2(n, length, c, prob, di, q+1)

    del prob, c
    LOGGER.report('DI matrix was calculated in %.2fs.', '_di')
    return di
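# Direct-information sketch; arguments mirror the documented defaults, and
# the docstring above recommends at least ~250 sequences.
from prody import parseMSA, buildDirectInfoMatrix

msa = parseMSA('PF00074_full.sth')        # hypothetical alignment file
di = buildDirectInfoMatrix(msa, seqid=0.8, pseudo_weight=0.5)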
def buildHessian(self, coords, blocks, cutoff=15., gamma=1., **kwargs):
    """Build Hessian matrix for given coordinate set.

    :arg coords: a coordinate set or an object with ``getCoords`` method
    :type coords: :class:`numpy.ndarray`

    :arg blocks: a list or array of block identifiers, one per atom
    :type blocks: list, :class:`numpy.ndarray`

    :arg cutoff: cutoff distance (Å) for pairwise interactions,
        default is 15.0 Å
    :type cutoff: float

    :arg gamma: spring constant, default is 1.0
    :type gamma: float"""

    try:
        coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                  coords.getCoords())
    except AttributeError:
        try:
            checkCoords(coords)
        except TypeError:
            raise TypeError('coords must be a Numpy array or an object '
                            'with `getCoords` method')

    LOGGER.timeit('_rtb')
    natoms = coords.shape[0]

    if (natoms,) != blocks.shape:
        raise ValueError('blocks.shape must be (natoms,)')

    nblocks = len(set(blocks))
    nb6 = nblocks * 6

    coords = coords.T.copy()

    self._hessian = hessian = np.zeros((nb6, nb6), float)
    self._project = project = np.zeros((natoms * 3, nb6), float)

    from .rtbtools import buildhessian
    buildhessian(coords, blocks, hessian, project,
                 natoms, nblocks, float(cutoff), float(gamma))

    LOGGER.report('Hessian was built in %.2fs.', label='_rtb')
def calcModes(self, n_modes=20, zeros=False, turbo=True, hinges=True):
    """Calculate normal modes.  This method uses :func:`scipy.linalg.eigh`
    function to diagonalize the Kirchhoff matrix. When Scipy is not found,
    :func:`numpy.linalg.eigh` is used.

    :arg n_modes: number of non-zero eigenvalues/vectors to calculate.
        If **None** or ``'all'`` is given, all modes will be calculated.
    :type n_modes: int or None, default is 20

    :arg zeros: If **True**, modes with zero eigenvalues will be kept.
    :type zeros: bool, default is **False**

    :arg turbo: Use a memory intensive, but faster way to calculate modes.
    :type turbo: bool, default is **True**

    :arg hinges: Identify hinge sites after modes are computed.
    :type hinges: bool, default is **True**
    """

    if self._kirchhoff is None:
        raise ValueError('Kirchhoff matrix is not built or set')
    if str(n_modes).lower() == 'all':
        n_modes = None
    assert n_modes is None or isinstance(n_modes, int) and n_modes > 0, \
        'n_modes must be a positive integer'
    assert isinstance(zeros, bool), 'zeros must be a boolean'
    assert isinstance(turbo, bool), 'turbo must be a boolean'
    self._clear()
    LOGGER.timeit('_gnm_calc_modes')
    values, vectors, vars = solveEig(self._kirchhoff, n_modes=n_modes,
                                     zeros=zeros, turbo=turbo, is3d=False)
    self._eigvals = values
    self._array = vectors
    self._vars = vars
    self._trace = self._vars.sum()

    self._n_modes = len(self._eigvals)
    if hinges:
        self.calcHinges()
    LOGGER.report('{0} modes were calculated in %.2fs.'
                  .format(self._n_modes), label='_gnm_calc_modes')
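# Typical GNM workflow around calcModes; '1p38' is a placeholder identifier.
from prody import parsePDB, GNM

calphas = parsePDB('1p38').select('protein and name CA')
gnm = GNM('GNM demo')
gnm.buildKirchhoff(calphas, cutoff=10.0)
gnm.calcModes(n_modes=20, hinges=True)
hinges = gnm.getHinges()                  # populated because hinges=True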
def iterpose(self, rmsd=0.0001): """Iteratively superpose the ensemble until convergence. Initially, all conformations are aligned with the reference coordinates. Then mean coordinates are calculated, and are set as the new reference coordinates. This is repeated until reference coordinates do not change. This is determined by the value of RMSD between the new and old reference coordinates. Note that at the end of the iterative procedure the reference coordinate set will be average of conformations in the ensemble. :arg rmsd: change in reference coordinates to determine convergence, default is 0.0001 Å RMSD :type rmsd: float""" if self._coords is None: raise AttributeError('coordinates are not set, use `setCoords`') if self._confs is None or len(self._confs) == 0: raise AttributeError('conformations are not set, use' '`addCoordset`') LOGGER.info('Starting iterative superposition:') LOGGER.timeit('_prody_ensemble') rmsdif = 1 step = 0 weights = self._weights if weights is not None and weights.ndim == 3: weightsum = weights.sum(axis=0) length = len(self) while rmsdif > rmsd: self._superpose() if weights is None: newxyz = self._confs.sum(0) / length else: newxyz = (self._confs * weights).sum(0) / weightsum rmsdif = getRMSD(self._coords, newxyz) self._coords = newxyz step += 1 LOGGER.info('Step #{0}: RMSD difference = {1:.4e}' .format(step, rmsdif)) LOGGER.report('Iterative superposition completed in %.2fs.', '_prody_ensemble')
def buildOMESMatrix(msa, ambiguity=True, turbo=True, **kwargs):
    """Returns OMES (Observed Minus Expected Squared) covariance matrix
    calculated for *msa*, which may be an :class:`.MSA` instance or a 2D
    NumPy character array.  OMES is defined as::

                          (N_OBS - N_EX)^2              (f_i,j - f_i * f_j)^2
        OMES_(i,j) = sum(------------------) = N * sum(-----------------------)
                               N_EX                           f_i * f_j

    Implementation is case insensitive and handles ambiguous amino acids
    as follows:

      * **B** (Asx) count is allocated to *D* (Asp) and *N* (Asn)
      * **Z** (Glx) count is allocated to *E* (Glu) and *Q* (Gln)
      * **J** (Xle) count is allocated to *I* (Ile) and *L* (Leu)
      * **X** (Xaa) count is allocated to the twenty standard amino acids
      * Joint probability of observing a pair of ambiguous amino acids is
        allocated to all potential combinations, e.g. probability of **XX**
        is allocated to 400 combinations of standard amino acids, similarly
        probability of **XB** is allocated to 40 combinations of *D* and *N*
        with the standard amino acids.

    Selenocysteine (**U**, Sec) and pyrrolysine (**O**, Pyl) are considered
    as distinct amino acids.  When *ambiguity* is set **False**, all alphabet
    characters are considered as distinct types.  All non-alphabet characters
    are considered as gaps."""

    msa = getMSA(msa)

    from .msatools import msaomes
    LOGGER.timeit('_omes')
    length = msa.shape[1]
    omes = empty((length, length), float)
    omes = msaomes(msa, omes, ambiguity=bool(ambiguity), turbo=bool(turbo),
                   debug=bool(kwargs.get('debug', False)))
    LOGGER.report('OMES matrix was calculated in %.2fs.', '_omes')

    return omes
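# OMES sketch; input handling (MSA instance or 2D character array) is the
# same as for the other coevolution matrices above.
from prody import parseMSA, buildOMESMatrix

msa = parseMSA('PF00074_full.sth')        # hypothetical alignment file
omes = buildOMESMatrix(msa)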
def calcPredictions(feat_matrix, clsf, SAV_coords=None): assert SAV_coords is None or len(SAV_coords) == len(feat_matrix) # import classifier and other info if isinstance(clsf, dict): clsf_dict = clsf else: LOGGER.timeit('_import_clsf') clsf_dict = pickle.load(open(clsf, 'rb')) LOGGER.report('Random Forest classifier imported in %.1fs.', '_import_clsf') classifier = clsf_dict['trained RF'] opt_cutoff = clsf_dict['CV summary']['optimal cutoff'] path_curve = clsf_dict['CV summary']['path. probability'] train_data = clsf_dict['training dataset'] LOGGER.timeit('_preds') # define a structured array for storing predictions pred_dtype = np.dtype([('score', 'f'), ('path. probability', 'f'), ('path. class', 'U12'), ('training info', 'U12')]) predictions = np.zeros(len(feat_matrix), dtype=pred_dtype) # select rows where all features are well-defined sel_rows = [i for i, r in enumerate(feat_matrix) if all(~np.isnan(r))] n_pred = len(sel_rows) if n_pred == 0: LOGGER.warning('No predictions could be computed.') proba = None else: # compute predictions sliced_feat_matrix = feat_matrix[sel_rows] proba = classifier.predict_proba(sliced_feat_matrix) # output J, err_bar = opt_cutoff Jminus = J - err_bar Jplus = J + err_bar k = 0 for i in range(len(feat_matrix)): # determine SAV status if SAV_coords is None: SAV_status = '?' elif SAV_coords[i] in train_data['del. SAVs']: SAV_status = 'known_del' elif SAV_coords[i] in train_data['neu. SAVs']: SAV_status = 'known_neu' else: SAV_status = 'new' # determine pathogenicity prob. and class if i not in sel_rows: predictions[i] = (np.nan, np.nan, '?', SAV_status) else: # retrieve score returned by RF score = proba[k, 1] # assign pathogenicity probability by interpolating # the pathogenicity profile computed during CV path_prob = np.interp(score, path_curve[0], path_curve[1]) # assign class of pathogenicity based on Youden's cutoff if score > Jplus: path_class = "deleterious" elif score > J: path_class = "prob.delet." elif score >= Jminus: path_class = "prob.neutral" else: path_class = "neutral" # store values predictions[i] = (score, path_prob, path_class, SAV_status) k = k + 1 LOGGER.report('{} predictions computed in %.1fs.'.format(n_pred), '_preds') return predictions
def calcPerturbResponse(model, **kwargs):
    """Returns a matrix of profiles from scanning the response of the
    structure to random perturbations at specific atom (or node) positions.
    The function implements the perturbation response scanning (PRS) method
    described in [CA09]_.  Rows of the matrix are the average magnitude of
    the responses obtained by perturbing the atom/node position at that row
    index, i.e. ``prs_profile[i,j]`` will give the response of residue/node
    *j* to perturbations in residue/node *i*.  PRS is performed using the
    covariance matrix from *model*, e.g. :class:`.ANM` instance.

    When an *atoms* instance is given, the PRS matrix will be added as data,
    which can be retrieved with ``atoms.getData('prs_matrix')``.

    *model* and *atoms* must have the same number of atoms. *atoms* must be
    an :class:`.AtomGroup` instance.

    .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning
       Reveals Ligand Entry-Exit Mechanisms of Ferric Binding Protein.
       *PLoS Comput Biol* **2009** 5(10):e1000544.

    The PRS matrix can be calculated and saved as follows::

      prs_matrix = calcPerturbResponse(p38_anm, saveMatrix=True)

    The PRS matrix can also be saved later as follows::

      writeArray('prs_matrix.txt', prs_matrix, format='%8.6f', delimiter='\t')

    :arg saveMatrix: whether to save the last matrix generated to a text
        file.  Default is False
    :type saveMatrix: bool

    :arg saveName: The file name for saved matrices.
        Default is 'response_matrix.txt'.
    :type saveName: str
    """

    if not isinstance(model, (NMA, ModeSet, Mode)):
        raise TypeError('model must be an NMA, ModeSet, or Mode instance')

    if isinstance(model, NMA) and len(model) == 0:
        raise ValueError('model must have normal modes calculated')

    atoms = kwargs.get('atoms', None)
    if atoms is not None:
        if isinstance(atoms, Selection):
            atoms = atoms.copy()
        if not isinstance(atoms, AtomGroup):
            raise TypeError('atoms must be an AtomGroup instance')
        elif atoms.numAtoms() != model.numAtoms():
            raise ValueError('model and atoms must have the same number '
                             'of atoms')

    n_atoms = model.numAtoms()
    LOGGER.timeit('_prody_prs_all')
    LOGGER.info('Calculating covariance matrix')
    LOGGER.timeit('_prody_cov')

    cov = calcCovariance(model)
    if cov is None:
        raise ValueError('model did not return a covariance matrix')

    LOGGER.clear()
    LOGGER.report('Covariance matrix calculated in %.1fs.', '_prody_cov')

    LOGGER.progress('Calculating perturbation response', n_atoms,
                    '_prody_prs_mat')

    if not model.is3d():
        prs_matrix = cov**2
    else:
        cov_squared = cov**2
        n_by_3n_cov_squared = np.zeros((n_atoms, 3 * n_atoms))
        prs_matrix = np.zeros((n_atoms, n_atoms))
        i3 = -3
        i3p3 = 0
        for i in range(n_atoms):
            i3 += 3
            i3p3 += 3
            n_by_3n_cov_squared[i, :] = (cov_squared[i3:i3p3, :]).sum(0)

        j3 = -3
        j3p3 = 0
        for j in range(n_atoms):
            j3 += 3
            j3p3 += 3
            prs_matrix[:, j] = (n_by_3n_cov_squared[:, j3:j3p3]).sum(1)

    LOGGER.clear()
    LOGGER.report('Perturbation response matrix calculated in %.1fs.',
                  '_prody_prs_mat')

    saveMatrix = kwargs.get('saveMatrix', False)
    suppressDiag = kwargs.get('suppressDiag', False)
    saveName = kwargs.get('saveName', 'response_matrix.txt')

    norm_prs_matrix = np.zeros((n_atoms, n_atoms))
    self_dp = np.diag(prs_matrix)
    self_dp = self_dp.reshape(n_atoms, 1)
    norm_prs_matrix = prs_matrix / np.repeat(self_dp, n_atoms, axis=1)

    if suppressDiag == True:
        # suppress the diagonal (self displacement) to facilitate
        # visualizing the response profile
        norm_prs_matrix = norm_prs_matrix - np.diag(np.diag(norm_prs_matrix))

    if saveMatrix == True:
        np.savetxt(saveName, norm_prs_matrix, delimiter='\t', fmt='%8.6f')

    LOGGER.report('Perturbation response scanning completed in %.1fs.',
                  '_prody_prs_all')

    if atoms is not None:
        atoms.setData('prs_matrix', norm_prs_matrix)
        return atoms, norm_prs_matrix
    else:
        return norm_prs_matrix
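# PRS sketch; '1z83' is a placeholder PDB identifier.
from prody import parsePDB, ANM, calcPerturbResponse

calphas = parsePDB('1z83').select('protein and name CA')
anm = ANM('PRS demo')
anm.buildHessian(calphas)
anm.calcModes()
prs_matrix = calcPerturbResponse(anm)
# prs_matrix[i, j]: response of residue j to a perturbation at residue i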
def refineMSA(msa, label=None, rowocc=None, seqid=None, colocc=None,
              **kwargs):
    """Refine *msa* by removing sequences (rows) and residues (columns) that
    contain gaps.

    :arg msa: multiple sequence alignment
    :type msa: :class:`.MSA`

    :arg label: remove columns that are gaps in the sequence matching label,
        ``msa.getIndex(label)`` must return a sequence index, a PDB
        identifier is also acceptable
    :type label: str

    :arg rowocc: row occupancy, sequences with less occupancy will be
        removed after *label* refinement is applied
    :type rowocc: float

    :arg seqid: keep unique sequences at specified sequence identity level,
        unique sequences are identified using :func:`.uniqueSequences`
    :type seqid: float

    :arg colocc: column occupancy, residue positions with less occupancy
        will be removed after other refinements are applied
    :type colocc: float

    :arg keep: keep columns corresponding to residues not resolved in the
        PDB structure, default is **False**, applies when *label* is a PDB
        identifier
    :type keep: bool

    For Pfam MSA data, *label* is UniProt entry name for the protein.  You
    may also use PDB structure and chain identifiers, e.g. ``'1p38'`` or
    ``'1p38A'``, for *label* argument and UniProt entry names will be parsed
    using :func:`.parsePDBHeader` function (see also :class:`.Polymer` and
    :class:`.DBRef`).

    Refinements are applied in the order of the arguments.  If *label* and
    *seqid* are both specified, the sequence matching *label* will be kept
    in the refined :class:`.MSA` even if it is similar to some other
    sequence."""

    # if msa is a char array, it will be refined but label won't work
    try:
        ndim, dtype_ = msa.ndim, msa.dtype
    except AttributeError:
        try:
            arr = msa._getArray()
        except AttributeError:
            raise TypeError('msa must be a character array or an MSA '
                            'instance')
        ndim, dtype_ = arr.ndim, arr.dtype
    else:
        arr, msa = msa, None

    if dtype('|S1') != dtype_:
        raise ValueError('msa must be a character array or an MSA instance')
    if ndim != 2:
        raise ValueError('msa must be a 2D array or an MSA instance')

    title = []
    cols = None
    index = None
    if label is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            upper, lower = label.upper(), label.lower()
        except AttributeError:
            raise TypeError('label must be a string')

        if msa is None:
            raise TypeError('msa must be an MSA instance, '
                            'label cannot be used')

        index = msa.getIndex(label)
        if index is None:
            index = msa.getIndex(upper)
        if index is None:
            index = msa.getIndex(lower)

        chain = None
        if index is None and (len(label) == 4 or len(label) == 5):
            from prody import parsePDB
            try:
                structure, header = parsePDB(label[:4], header=True)
            except Exception as err:
                raise IOError('failed to parse header for {0} ({1})'
                              .format(label[:4], str(err)))

            chid = label[4:].upper()
            for poly in header['polymers']:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if index is None:
                        index = msa.getIndex(dbref.idcode)
                        if index is not None:
                            LOGGER.info('{0} idcode {1} for {2}{3} '
                                        'is found in {4}.'.format(
                                            dbref.database, dbref.idcode,
                                            label[:4], poly.chid, str(msa)))
                            break
                    if index is None:
                        index = msa.getIndex(dbref.accession)
                        if index is not None:
                            LOGGER.info('{0} accession {1} for {2}{3} '
                                        'is found in {4}.'.format(
                                            dbref.database, dbref.accession,
                                            label[:4], poly.chid, str(msa)))
                            break
            if index is not None:
                chain = structure[poly.chid]

        if index is None:
            raise ValueError('label is not in msa, or msa is not indexed')
        try:
            len(index)
        except TypeError:
            pass
        else:
            raise ValueError('label {0} maps onto multiple sequences, '
                             'so cannot be used for refinement'.format(label))

        title.append('label=' + label)
        cols = char.isalpha(arr[index]).nonzero()[0]
        arr = arr.take(cols, 1)
        LOGGER.report('Label refinement reduced number of columns from {0} '
                      'to {1} in %.2fs.'.format(before, arr.shape[1]),
                      '_refine')

        if chain is not None and not kwargs.get('keep', False):
            before = arr.shape[1]
            LOGGER.timeit('_refine')
            from prody.proteins.compare import importBioPairwise2
            from prody.proteins.compare import MATCH_SCORE, MISMATCH_SCORE
            from prody.proteins.compare import GAP_PENALTY, GAP_EXT_PENALTY
            pw2 = importBioPairwise2()
            chseq = chain.getSequence()
            algn = pw2.align.localms(arr[index].tostring().upper(), chseq,
                                     MATCH_SCORE, MISMATCH_SCORE,
                                     GAP_PENALTY, GAP_EXT_PENALTY,
                                     one_alignment_only=1)
            torf = []
            for s, c in zip(*algn[0][:2]):
                if s == '-':
                    continue
                elif c != '-':
                    torf.append(True)
                else:
                    torf.append(False)
            torf = array(torf)
            tsum = torf.sum()
            assert tsum <= before, 'problem in mapping sequence to structure'
            if tsum < before:
                arr = arr.take(torf.nonzero()[0], 1)
                LOGGER.report('Structure refinement reduced number of '
                              'columns from {0} to {1} in %.2fs.'
                              .format(before, arr.shape[1]), '_refine')
            else:
                LOGGER.debug('All residues in the sequence are contained in '
                             'PDB structure {0}.'.format(label))

    from .analysis import calcMSAOccupancy, uniqueSequences

    rows = None
    if rowocc is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        try:
            rowocc = float(rowocc)
        except Exception as err:
            raise TypeError('rowocc must be a float ({0})'.format(str(err)))
        assert 0. <= rowocc <= 1., 'rowocc must be between 0 and 1'

        rows = calcMSAOccupancy(arr, 'row') >= rowocc
        if index is not None:
            index = rows[:index].sum()
        rows = (rows).nonzero()[0]
        arr = arr[rows]
        title.append('rowocc>=' + str(rowocc))
        LOGGER.report('Row occupancy refinement reduced number of rows from '
                      '{0} to {1} in %.2fs.'.format(before, arr.shape[0]),
                      '_refine')

    if seqid is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        unique = uniqueSequences(arr, seqid)
        if index is not None:
            unique[index] = True
        unique = unique.nonzero()[0]
        arr = arr[unique]
        title.append('seqid>=' + str(seqid))
        if rows is not None:
            rows = rows[unique]
        else:
            rows = unique
        LOGGER.report('Sequence identity refinement reduced number of rows '
                      'from {0} to {1} in %.2fs.'.format(before,
                                                         arr.shape[0]),
                      '_refine')

    if colocc is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            colocc = float(colocc)
        except Exception as err:
            raise TypeError('colocc must be a float ({0})'.format(str(err)))
        assert 0. <= colocc <= 1., 'colocc must be between 0 and 1'

        cols = (calcMSAOccupancy(arr, 'col') >= colocc).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('colocc>=' + str(colocc))
        LOGGER.report('Column occupancy refinement reduced number of columns '
                      'from {0} to {1} in %.2fs.'.format(before,
                                                         arr.shape[1]),
                      '_refine')

    if not title:
        raise ValueError('label, rowocc, colocc all cannot be None')

    # depending on slicing of rows, arr may not have its own memory
    if arr.base is not None:
        arr = arr.copy()

    if msa is None:
        return arr
    else:
        if rows is None:
            from copy import copy
            labels = copy(msa._labels)
            mapping = copy(msa._mapping)
        else:
            labels = msa._labels
            labels = [labels[i] for i in rows]
            mapping = None
        return MSA(arr, title=msa.getTitle() + ' refined ({0})'
                   .format(', '.join(title)), labels=labels, mapping=mapping)
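# Typical refinement chain for illustration; the alignment file and the
# '1p38' label are placeholders consistent with the docstring above.
from prody import parseMSA, refineMSA

msa = parseMSA('PF00069_full.sth')        # hypothetical alignment file
refined = refineMSA(msa, label='1p38', rowocc=0.8, seqid=0.98)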
def refineEnsemble(ensemble, lower=.5, upper=10., **kwargs): """Refine a :class:`.PDBEnsemble` based on RMSD criterions. :arg ensemble: the ensemble to be refined :type ensemble: :class:`.Ensemble`, :class:`.PDBEnsemble` :arg lower: the smallest allowed RMSD between two conformations with the exception of **protected** :type lower: float :arg upper: the highest allowed RMSD between two conformations with the exception of **protected** :type upper: float :keyword protected: a list of either the indices or labels of the conformations needed to be kept in the refined ensemble :type protected: list :arg ref: the index or label of the reference conformation which will also be kept. Default is 0 :type ref: int or str """ protected = kwargs.pop('protected', []) P = [] if len(protected): labels = ensemble.getLabels() for p in protected: if isinstance(p, Integral): i = p else: if p in labels: i = labels.index(p) else: LOGGER.warn( 'could not find any conformation with the label %s in the ensemble' % str(p)) P.append(i) LOGGER.timeit('_prody_refineEnsemble') from numpy import argsort ### obtain reference index # rmsd = ensemble.getRMSDs() # ref_i = np.argmin(rmsd) ref_i = kwargs.pop('ref', 0) if isinstance(ref_i, Integral): pass elif isinstance(ref_i, str): labels = ensemble.getLabels() ref_i = labels.index(ref_i) else: LOGGER.warn( 'could not find any conformation with the label %s in the ensemble' % str(ref_i)) if not ref_i in P: P = [ref_i] + P ### calculate pairwise RMSDs ### RMSDs = ensemble.getRMSDs(pairwise=True) def getRefinedIndices(A): deg = A.sum(axis=0) sorted_indices = list(argsort(deg)) # sorted_indices = P + [x for x in sorted_indices if x not in P] sorted_indices.remove(ref_i) sorted_indices.insert(0, ref_i) n_confs = ensemble.numConfs() isdel_temp = np.zeros(n_confs) for a in range(n_confs): i = sorted_indices[a] for b in range(n_confs): if a >= b: continue j = sorted_indices[b] if isdel_temp[i] or isdel_temp[j]: continue else: if A[i, j]: # isdel_temp[j] = 1 if not j in P: isdel_temp[j] = 1 elif not i in P: isdel_temp[i] = 1 temp_list = isdel_temp.tolist() ind_list = [] for i in range(n_confs): if not temp_list[i]: ind_list.append(i) return ind_list L = list(range(len(ensemble))) U = list(range(len(ensemble))) if lower is not None: A = RMSDs < lower L = getRefinedIndices(A) if upper is not None: B = RMSDs > upper U = getRefinedIndices(B) # find common indices from L and U I = list(set(L) - (set(L) - set(U))) # for p in P: # if p not in I: # I.append(p) I.sort() reens = ensemble[I] LOGGER.report('Ensemble was refined in %.2fs.', '_prody_refineEnsemble') LOGGER.info('%d conformations were removed from ensemble.' % (len(ensemble) - len(I))) return reens
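# Ensemble refinement sketch.  'ensemble' is assumed to be a PDBEnsemble
# built elsewhere (e.g. with buildPDBEnsemble); only the thresholds
# documented above are shown here.
from prody import refineEnsemble

refined_ens = refineEnsemble(ensemble, lower=0.5, upper=10.0)
print('%d conformations kept' % refined_ens.numConfs())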
def buildCovariance(self, coordsets, **kwargs):
    """Build a covariance matrix for *coordsets* using mean coordinates
    as the reference.  *coordsets* argument may be one of the following:

    * :class:`.Atomic`
    * :class:`.Ensemble`
    * :class:`.TrajBase`
    * :class:`numpy.ndarray` with shape ``(n_csets, n_atoms, 3)``

    For ensemble and trajectory objects, ``update_coords=True`` argument
    can be used to set the mean coordinates as the coordinates of the
    object.

    When *coordsets* is a trajectory object, such as :class:`.DCDFile`,
    covariance will be built by superposing frames onto the reference
    coordinate set (see :meth:`.Frame.superpose`).  If frames are already
    aligned, use ``aligned=True`` argument to skip this step.

    .. note:: If *coordsets* is a :class:`.PDBEnsemble` instance, coordinates
       are treated specially.  Let's say **C**\_ij is the element of the
       covariance matrix that corresponds to atoms *i* and *j*.  This super
       element is divided by number of coordinate sets (PDB models or
       structures) in which both of these atoms are observed together."""

    if not isinstance(coordsets, (Ensemble, Atomic, TrajBase, np.ndarray)):
        raise TypeError('coordsets must be an Ensemble, Atomic, Numpy '
                        'array instance')
    LOGGER.timeit('_prody_pca')
    mean = None
    weights = None
    ensemble = None
    if isinstance(coordsets, np.ndarray):
        if (coordsets.ndim != 3 or coordsets.shape[2] != 3 or
                coordsets.dtype not in (np.float32, float)):
            raise ValueError('coordsets is not a valid coordinate array')
    elif isinstance(coordsets, Atomic):
        coordsets = coordsets._getCoordsets()
    elif isinstance(coordsets, Ensemble):
        ensemble = coordsets
        if isinstance(coordsets, PDBEnsemble):
            weights = coordsets.getWeights() > 0
        coordsets = coordsets._getCoordsets()

    update_coords = bool(kwargs.get('update_coords', False))

    if isinstance(coordsets, TrajBase):
        nfi = coordsets.nextIndex()
        coordsets.reset()
        n_atoms = coordsets.numSelected()
        dof = n_atoms * 3
        cov = np.zeros((dof, dof))
        n_confs = 0
        n_frames = len(coordsets)
        LOGGER.info('Covariance will be calculated using {0} frames.'
                    .format(n_frames))
        coordsum = np.zeros(dof)
        LOGGER.progress('Building covariance', n_frames, '_prody_pca')
        align = not kwargs.get('aligned', False)
        for frame in coordsets:
            if align:
                frame.superpose()
            coords = frame._getCoords().flatten()
            coordsum += coords
            cov += np.outer(coords, coords)
            n_confs += 1
            LOGGER.update(n_confs, label='_prody_pca')
        LOGGER.finish()
        cov /= n_confs
        coordsum /= n_confs
        mean = coordsum
        cov -= np.outer(coordsum, coordsum)
        coordsets.goto(nfi)
        self._cov = cov
        if update_coords:
            coordsets.setCoords(mean.reshape((n_atoms, 3)))
    else:
        n_confs = coordsets.shape[0]
        if n_confs < 3:
            raise ValueError('coordsets must have more than 3 coordinate '
                             'sets')
        n_atoms = coordsets.shape[1]
        if n_atoms < 3:
            raise ValueError('coordsets must have more than 3 atoms')
        dof = n_atoms * 3
        LOGGER.info('Covariance is calculated using {0} coordinate sets.'
                    .format(len(coordsets)))
        s = (n_confs, dof)
        if weights is None:
            if coordsets.dtype == float:
                self._cov = np.cov(coordsets.reshape((n_confs, dof)).T,
                                   bias=1)
            else:
                cov = np.zeros((dof, dof))
                coordsets = coordsets.reshape((n_confs, dof))
                mean = coordsets.mean(0)
                LOGGER.progress('Building covariance', n_confs,
                                '_prody_pca')
                for i, coords in enumerate(coordsets.reshape(s)):
                    deviations = coords - mean
                    cov += np.outer(deviations, deviations)
                    LOGGER.update(n_confs, label='_prody_pca')
                LOGGER.finish()
                cov /= n_confs
                self._cov = cov
        else:
            # PDB ensemble case
            mean = np.zeros((n_atoms, 3))
            for i, coords in enumerate(coordsets):
                mean += coords * weights[i]
            mean /= weights.sum(0)
            d_xyz = ((coordsets - mean) * weights).reshape(s)
            divide_by = weights.astype(float).repeat(3, axis=2).reshape(s)
            self._cov = np.dot(d_xyz.T, d_xyz) / np.dot(divide_by.T,
                                                        divide_by)
        if update_coords and ensemble is not None:
            if mean is None:
                mean = coordsets.mean(0)
            ensemble.setCoords(mean)

    self._trace = self._cov.trace()
    self._dof = dof
    self._n_atoms = n_atoms
    LOGGER.report('Covariance matrix calculated in %.2fs.', '_prody_pca')
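# PCA sketch using the streaming (trajectory) branch of buildCovariance;
# the file names are placeholders.
from prody import parsePDB, DCDFile, PCA

structure = parsePDB('mdm2.pdb')
dcd = DCDFile('mdm2.dcd')
dcd.setCoords(structure)
dcd.setAtoms(structure.select('name CA'))

pca = PCA('MDM2 trajectory')
pca.buildCovariance(dcd)        # frames are superposed onto the reference
pca.calcModes()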
def calcPairDeformationDist(model, coords, ind1, ind2, kbt=1.):
    """Returns distribution of the deformations in the distance contributed
    by each mode for selected pair of residues *ind1* *ind2* using *model*
    from a :class:`.ANM`.  Method described in [EB08]_ equation (10) and
    figure (2).

    .. [EB08] Eyal E., Bahar I. Toward a Molecular Understanding of
       the Anisotropic Response of Proteins to External Forces:
       Insights from Elastic Network Models.
       *Biophys J* **2008** 94:3424-3435.

    :arg model: a 3-dimensional NMA instance from :class:`.ANM` calculations
    :type model: :class:`.ANM`

    :arg coords: a coordinate set or an object with ``getCoords`` method.
        Recommended:
        ``coords = parsePDB('pdbfile').select('protein and name CA')``
    :type coords: :class:`numpy.ndarray`

    :arg ind1: first residue number
    :type ind1: int

    :arg ind2: second residue number
    :type ind2: int
    """

    try:
        resnum_list = coords.getResnums()
        resnam_list = coords.getResnames()
        coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                  coords.getCoords())
    except AttributeError:
        try:
            checkCoords(coords)
        except TypeError:
            raise TypeError('coords must be a Numpy array or an object '
                            'with `getCoords` method')

    if not isinstance(model, NMA):
        raise TypeError('model must be a NMA instance')
    elif not model.is3d():
        raise TypeError('model must be a 3-dimensional NMA instance')
    elif len(model) == 0:
        raise ValueError('model must have normal modes calculated')
    elif model.getStiffness() is None:
        raise ValueError('model must have stiffness matrix calculated')

    linalg = importLA()
    n_atoms = model.numAtoms()
    n_modes = model.numModes()
    LOGGER.timeit('_pairdef')

    r_ij = np.zeros((n_atoms, n_atoms, 3))
    r_ij_norm = np.zeros((n_atoms, n_atoms, 3))

    for i in range(n_atoms):
        for j in range(i+1, n_atoms):
            r_ij[i][j] = coords[j, :] - coords[i, :]
            r_ij[j][i] = r_ij[i][j]
            r_ij_norm[i][j] = r_ij[i][j]/linalg.norm(r_ij[i][j])
            r_ij_norm[j][i] = r_ij_norm[i][j]

    eigvecs = model.getEigvecs()
    eigvals = model.getEigvals()

    D_pair_k = []
    mode_nr = []
    ind1 = ind1 - resnum_list[0]
    ind2 = ind2 - resnum_list[0]

    # range() is used so the code also runs under Python 3 (was xrange)
    for m in range(6, n_modes):
        U_ij_k = [(eigvecs[ind1*3][m] - eigvecs[ind2*3][m]),
                  (eigvecs[ind1*3+1][m] - eigvecs[ind2*3+1][m]),
                  (eigvecs[ind1*3+2][m] - eigvecs[ind2*3+2][m])]
        D_ij_k = abs(np.sqrt(kbt/eigvals[m]) *
                     (np.vdot(r_ij_norm[ind1][ind2], U_ij_k)))
        D_pair_k.append(D_ij_k)
        mode_nr.append(m)

    LOGGER.report('Deformation was calculated in %.2fs.', label='_pairdef')

    return mode_nr, D_pair_k
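# Sketch of a pair-deformation calculation; residue numbers 10 and 50 and
# the PDB id are placeholders.  buildMechStiff (the method shown earlier)
# populates the stiffness matrix that getStiffness() is checked for above.
from prody import parsePDB, ANM, calcPairDeformationDist

calphas = parsePDB('1ubi').select('protein and name CA')
anm = ANM('pair deformation demo')
anm.buildHessian(calphas)
anm.calcModes(n_modes=None)
anm.buildMechStiff(calphas)
modes, d_pair = calcPairDeformationDist(anm, calphas, 10, 50)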
def parseMMCIFStream(stream, **kwargs): """Returns an :class:`.AtomGroup` and/or a class:`.StarDict` containing header data parsed from a stream of CIF lines. :arg stream: Anything that implements the method ``readlines`` (e.g. :class:`file`, buffer, stdin) """ model = kwargs.get('model') subset = kwargs.get('subset') chain = kwargs.get('chain') altloc = kwargs.get('altloc', 'A') header = kwargs.get('header', False) if model is not None: if isinstance(model, int): if model < 0: raise ValueError('model must be greater than 0') else: raise TypeError('model must be an integer, {0} is invalid' .format(str(model))) title_suffix = '' if subset: try: subset = _PDBSubsets[subset.lower()] except AttributeError: raise TypeError('subset must be a string') except KeyError: raise ValueError('{0} is not a valid subset' .format(repr(subset))) title_suffix = '_' + subset if chain is not None: if not isinstance(chain, str): raise TypeError('chain must be a string') elif len(chain) == 0: raise ValueError('chain must not be an empty string') title_suffix = '_' + chain + title_suffix ag = None if 'ag' in kwargs: ag = kwargs['ag'] if not isinstance(ag, AtomGroup): raise TypeError('ag must be an AtomGroup instance') n_csets = ag.numCoordsets() elif model != 0: ag = AtomGroup(str(kwargs.get('title', 'Unknown')) + title_suffix) n_csets = 0 if model != 0: LOGGER.timeit() try: lines = stream.readlines() except AttributeError as err: try: lines = stream.read().split('\n') except AttributeError: raise err if not len(lines): raise ValueError('empty PDB file or stream') if header: ag, header = _parseMMCIFLines(ag, lines, model, chain, subset, altloc, header) else: ag = _parseMMCIFLines(ag, lines, model, chain, subset, altloc, header) if ag.numAtoms() > 0: LOGGER.report('{0} atoms and {1} coordinate set(s) were ' 'parsed in %.2fs.'.format(ag.numAtoms(), ag.numCoordsets() - n_csets)) else: ag = None LOGGER.warn('Atomic data could not be parsed, please ' 'check the input file.') if header: return ag, StarDict(*header, title=str(kwargs.get('title', 'Unknown'))) return ag
def fetch(self, url=None, localFile=False, **kwargs):
    """Get Dali record from url or file.

    :arg url: url of Dali results page or local dali results file.
        If None, then the url already associated with the DaliRecord object
        is used.
    :type url: str

    :arg localFile: whether provided url is a path for a local dali results
        file
    :type localFile: bool

    :arg timeout: amount of time until the query times out in seconds,
        default value is 120
    :type timeout: int

    :arg localfolder: folder in which to find the local file,
        default is the current folder
    :type localfolder: str
    """

    if localFile:
        dali_file = open(url, 'r')
        data = dali_file.read()
        dali_file.close()
    else:
        import requests

        if url is None:
            url = self._url

        sleep = 2
        timeout = kwargs.pop('timeout', 120)

        LOGGER.timeit('_dali')
        log_message = ''
        try_error = 3

        while True:
            LOGGER.write('Connecting to Dali for search results...')
            LOGGER.clear()
            try:
                html = requests.get(url).content
            except Exception:
                try_error -= 1
                if try_error >= 0:
                    LOGGER.sleep(2, '. Connection error happened. '
                                    'Trying to reconnect...')
                    continue
                else:
                    html = requests.get(url).content

            if PY3K:
                html = html.decode()

            if html.find('Status: Queued') > -1:
                log_message = '(Dali search is queued)...'
            elif html.find('Status: Running') > -1:
                log_message = '(Dali search is running)...'
            elif html.find('Your job') == -1 and html.find('.txt') > -1:
                break
            elif html.find('ERROR:') > -1:
                LOGGER.warn(': Dali search reported an ERROR!')
                return False

            sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)

            if LOGGER.timing('_dali') > timeout:
                LOGGER.warn(': Dali search has timed out. \n'
                            'The results can be obtained later using the '
                            'fetch() method.')
                return False

            LOGGER.sleep(int(sleep), 'to reconnect to Dali ' + log_message)
            LOGGER.clear()

        LOGGER.clear()
        LOGGER.report('Dali results were fetched in %.1fs.', '_dali')

        lines = html.strip().split('\n')
        file_name = re.search('=.+-90\\.txt', html).group()[1:]
        file_name = file_name[:-7]
        data = requests.get(url + file_name + self._subset + '.txt').content
        if PY3K:
            data = data.decode()

        localfolder = kwargs.pop('localfolder', '.')

        if file_name.lower().startswith('s001'):
            temp_name = self._pdbId + self._chain
        else:
            temp_name = file_name
        temp_name += self._subset + '_dali.txt'

        if localfolder != '.' and not os.path.exists(localfolder):
            os.mkdir(localfolder)
        with open(localfolder + os.sep + temp_name, 'w') as file_temp:
            file_temp.write(html + '\n' + url + file_name + self._subset +
                            '.txt' + '\n' + data)

    data_list = data.strip().split('# ')

    # No: Chain Z rmsd lali nres %id PDB Description -> data_list[3]
    # Structural equivalences -> data_list[4]
    # Translation-rotation matrices -> data_list[5]
    map_temp_dict = dict()
    lines = data_list[4].strip().split('\n')
    self._lines_4 = lines
    mapping_temp = np.genfromtxt(lines[1:],
                                 delimiter=(4, 1, 14, 6, 2, 4, 4, 5, 2, 4, 4,
                                            3, 5, 4, 3, 5, 6, 3, 5, 4, 3, 5,
                                            28),
                                 usecols=[0, 3, 5, 7, 9, 12, 15, 15, 18, 21],
                                 dtype='|i4')
    # [0,3,5,7,9,12,15,15,18,21] -> [index, residue_a, residue_b,
    # residue_i_a, residue_i_b, resid_a, resid_b, resid_i_a, resid_i_b]
    for map_i in mapping_temp:
        if not map_i[0] in map_temp_dict:
            map_temp_dict[map_i[0]] = [[map_i[1], map_i[2],
                                        map_i[3], map_i[4]]]
        else:
            map_temp_dict[map_i[0]].append([map_i[1], map_i[2],
                                            map_i[3], map_i[4]])
    self._max_index = max(mapping_temp[:, 2])
    self._mapping = map_temp_dict
    self._data = data_list[3]
    lines = data_list[3].strip().split('\n')
    daliInfo = np.genfromtxt(lines[1:],
                             delimiter=(4, 3, 6, 5, 5, 5, 6, 5, 57),
                             usecols=[0, 2, 3, 4, 5, 6, 7, 8],
                             dtype=[('id', '<i4'), ('pdb_chain', '|U6'),
                                    ('Z', '<f4'), ('rmsd', '<f4'),
                                    ('len_align', '<i4'), ('nres', '<i4'),
                                    ('identity', '<i4'), ('title', '|U70')])
    if daliInfo.ndim == 0:
        daliInfo = np.array([daliInfo])
    pdbListAll = []
    self._daliInfo = daliInfo
    dali_temp_dict = dict()
    for temp in self._daliInfo:
        temp_dict = dict()
        pdb_chain = temp[1].strip()[0:6]
        # U6 and U70 dtypes in np.genfromtxt above yield unicode strings,
        # so no extra decoding is needed here
        pdb_chain = str(pdb_chain)
        temp_dict['pdbId'] = pdbid = pdb_chain[0:4].lower()
        temp_dict['chainId'] = chid = pdb_chain[5:6]
        temp_dict['pdb_chain'] = pdb_chain = pdbid + chid
        temp_dict['Z'] = temp[2]
        temp_dict['rmsd'] = temp[3]
        temp_dict['len_align'] = temp[4]
        temp_dict['nres'] = temp[5]
        temp_dict['identity'] = temp[6]
        temp_dict['mapping'] = (np.array(map_temp_dict[temp[0]]) - 1).tolist()
        temp_dict['map_ref'] = [
            x for map_i in (np.array(map_temp_dict[temp[0]]) - 1).tolist()
            for x in range(map_i[0], map_i[1] + 1)]
        temp_dict['map_sel'] = [
            x for map_i in (np.array(map_temp_dict[temp[0]]) - 1).tolist()
            for x in range(map_i[2], map_i[3] + 1)]
        dali_temp_dict[pdb_chain] = temp_dict
        pdbListAll.append(pdb_chain)
    self._pdbListAll = tuple(pdbListAll)
    self._pdbList = self._pdbListAll
    self._alignPDB = dali_temp_dict
    LOGGER.info('Obtained ' + str(len(pdbListAll)) + ' PDB chains from Dali '
                'for ' + self._pdbId + self._chain + '.')
    return True
def buildHessian(self, coords, cutoff=15., gamma=1., **kwargs): """Build Hessian matrix for given coordinate set. :arg coords: a coordinate set or an object with ``getCoords`` method :type coords: :class:`numpy.ndarray` :arg cutoff: cutoff distance (Å) for pairwise interactions, default is 15.0 Å :type cutoff: float :arg gamma: spring constant, default is 1.0 :type gamma: float :arg membrane_hi: the maximum z coordinate of the membrane, default is 13.0 :type membrane_hi: float :arg membrane_lo: the minimum z coordinate of the membrane, default is -13.0 :type membrane_lo: float :arg R: radius of the membrane in the x-y plane, default is 80. :type R: float :arg r: radius of individual barrel-type membrane protein, default is 5. :type r: float :arg lat: lattice type which could be FCC (face-centered-cubic, default), SC (simple cubic), SH (simple hexagonal) :type lat: str """ try: coords = (coords._getCoords() if hasattr(coords, '_getCoords') else coords.getCoords()) except AttributeError: try: checkCoords(coords) except TypeError: raise TypeError('coords must be a Numpy array or an object ' 'with `getCoords` method') self._n_atoms = natoms = int(coords.shape[0]) if self._membrane is None: membrane_hi = float(kwargs.get('membrane_hi', 13.0)) membrane_lo = float(kwargs.get('membrane_lo', -13.0)) R = float(kwargs.get('R', 80)) r = float(kwargs.get('r', 5)) lat = str(kwargs.get('lat', 'FCC')) buildMembrane(self, coords, membrane_hi=membrane_hi, membrane_lo=membrane_lo, R=R, r=r, lat=lat) LOGGER.timeit('_exanm') coords = np.concatenate((coords, self._membrane.getCoords()), axis=0) self._combined_coords = coords total_natoms = int(coords.shape[0]) self._hessian = np.zeros((natoms*3, natoms*3), float) total_hessian = np.zeros((total_natoms*3, total_natoms*3), float) cutoff, g, gamma = checkENMParameters(cutoff, gamma) cutoff2 = cutoff * cutoff for i in range(total_natoms): res_i3 = i*3 res_i33 = res_i3+3 i_p1 = i+1 i2j_all = coords[i_p1:, :] - coords[i] for j, dist2 in enumerate((i2j_all ** 2).sum(1)): if dist2 > cutoff2: continue i2j = i2j_all[j] j += i_p1 g = gamma(dist2, i, j) res_j3 = j*3 res_j33 = res_j3+3 super_element = np.outer(i2j, i2j) * (- g / dist2) total_hessian[res_i3:res_i33, res_j3:res_j33] = super_element total_hessian[res_j3:res_j33, res_i3:res_i33] = super_element total_hessian[res_i3:res_i33, res_i3:res_i33] = total_hessian[res_i3:res_i33, res_i3:res_i33] - super_element total_hessian[res_j3:res_j33, res_j3:res_j33] = total_hessian[res_j3:res_j33, res_j3:res_j33] - super_element ss = total_hessian[:natoms*3, :natoms*3] so = total_hessian[:natoms*3, natoms*3:] os = total_hessian[natoms*3:, :natoms*3] oo = total_hessian[natoms*3:, natoms*3:] self._hessian = ss - np.dot(so, np.dot(linalg.inv(oo), os)) LOGGER.report('Hessian was built in %.2fs.', label='_exanm') self._dof = self._hessian.shape[0]
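# Example (illustrative): the last few lines above eliminate the membrane
# ("environment") degrees of freedom through the Schur complement
# H_eff = H_ss - H_so * H_oo^-1 * H_os. A toy NumPy version of that reduction:
import numpy as np

H = np.array([[ 4., -1., -1.,  0.],
              [-1.,  3.,  0., -1.],
              [-1.,  0.,  3., -1.],
              [ 0., -1., -1.,  4.]])
n = 2                              # first n DOFs belong to the protein ("system")
ss, so = H[:n, :n], H[:n, n:]
os_, oo = H[n:, :n], H[n:, n:]     # os_ avoids shadowing the os module
H_eff = ss - so.dot(np.linalg.inv(oo)).dot(os_)
print(H_eff)                       # effective Hessian felt by the system DOFs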
def buildHessian(self, coords, blocks, cutoff=15., gamma=1., **kwargs): """Build Hessian matrix for given coordinate set. :arg coords: a coordinate set or an object with ``getCoords`` method :type coords: :class:`numpy.ndarray` :arg blocks: a list or array of block identifiers :type blocks: list, :class:`numpy.ndarray` :arg cutoff: cutoff distance (Å) for pairwise interactions, default is 15.0 Å :type cutoff: float :arg gamma: spring constant, default is 1.0 :type gamma: float :arg scale: scaling factor for force constant along Z-direction, default is 1.0 :type scale: float :arg membrane_low: minimum z-coordinate at which membrane scaling is applied default is 1.0 :type membrane_low: float :arg membrane_high: maximum z-coordinate at which membrane scaling is applied. If membrane_high < membrane_low, scaling will be applied to the entire structure default is -1.0 :type membrane_high: float """ try: coords = (coords._getCoords() if hasattr(coords, '_getCoords') else coords.getCoords()) except AttributeError: try: checkCoords(coords) except TypeError: raise TypeError('coords must be a Numpy array or an object ' 'with `getCoords` method') LOGGER.timeit('_rtb') self._n_atoms = natoms = int(coords.shape[0]) if natoms != len(blocks): raise ValueError('len(blocks) must match number of atoms') from collections import defaultdict i = Increment() d = defaultdict(i) blocks = np.array([d[b] for b in blocks], np.int64) try: from collections import Counter except ImportError: counter = defaultdict(int) for b in blocks: counter[b] += 1 else: counter = Counter(blocks) nblocks = len(counter) maxsize = 1 nones = 0 while counter: _, size = counter.popitem() if size == 1: nones += 1 if size > maxsize: maxsize = size LOGGER.info( 'System has {0} blocks largest with {1} of {2} units.'.format( nblocks, maxsize, natoms)) nb6 = nblocks * 6 - nones * 3 coords = coords.T.copy() self._hessian = hessian = np.zeros((nb6, nb6), float) self._project = project = np.zeros((natoms * 3, nb6), float) from .rtbtools import buildhessian buildhessian( coords, blocks, hessian, project, natoms, nblocks, maxsize, float(cutoff), float(gamma), scale=float(kwargs.get('scale', 1.0)), memlo=float(kwargs.get('membrane_low', 1.0)), memhi=float(kwargs.get('membrane_high', -1.0)), ) self._dof = self._hessian.shape[0] LOGGER.report('Hessian was built in %.2fs.', label='_rtb')
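# Example usage sketch (illustrative; assumes ProDy's RTB class wraps this
# buildHessian and that parsePDB is available, as elsewhere in the library).
# Here consecutive residues are grouped into rigid blocks of roughly ten.
from prody import parsePDB, RTB

ca = parsePDB('1p38', subset='ca')      # PDB identifier is illustrative
blocks = ca.getResnums() // 10          # crude block assignment, one label per atom
rtb = RTB('p38 RTB')
rtb.buildHessian(ca, blocks, cutoff=15., gamma=1.)
rtb.calcModes(n_modes=20)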
def searchUniprotID(query, search_b=False, skip_a=False, **kwargs): """Returns Pfam search results in a dictionary. Matching Pfam accession as keys will map to evalue, alignment start and end residue positions. :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence file. Sequence queries must not contain gaps and must be at least 16 characters long :type query: str :arg search_b: search Pfam-B families when **True** :type search_b: bool :arg skip_a: do not search Pfam-A families when **True** :type skip_a: bool :arg ga: use gathering threshold when **True** :type ga: bool :arg evalue: user specified e-value cutoff, must be smaller than 10.0 :type evalue: float :arg timeout: timeout for blocking connection attempt in seconds, default is 60 :type timeout: int *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with chain identifier. UniProt ID of the specified chain, or the first protein chain will be used for searching the Pfam database.""" query = str(query) seq = ''.join(query.split()) import xml.etree.cElementTree as ET LOGGER.timeit('_pfam') timeout = int(kwargs.get('timeout', 60)) url = prefix + 'protein/' + seq + '?output=xml' LOGGER.debug('Retrieving Pfam search results: ' + url) xml = None while LOGGER.timing('_pfam') < timeout: try: xml = openURL(url, timeout=timeout).read() except Exception: pass else: if xml: break if not xml: raise IOError('Pfam search timed out or failed to parse results ' 'XML, check URL: ' + url) else: LOGGER.report('Pfam search completed in %.2fs.', '_pfam') if xml.find(b'There was a system error on your last request.') > 0: LOGGER.warn('No Pfam matches found for: ' + seq) return None try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) result = root[0].get('id') return result
def buildHessian(self, coords, cutoff=15., gamma=1., **kwargs): """Build Hessian matrix for given coordinate set. **kwargs** are passed to :method:`.buildMembrane`. :arg coords: a coordinate set or an object with ``getCoords`` method :type coords: :class:`numpy.ndarray` :arg cutoff: cutoff distance (Å) for pairwise interactions, default is 15.0 Å :type cutoff: float :arg gamma: spring constant, default is 1.0 :type gamma: float """ atoms = coords try: coords = (coords._getCoords() if hasattr(coords, '_getCoords') else coords.getCoords()) except AttributeError: try: checkCoords(coords) except TypeError: raise TypeError('coords must be a Numpy array or an object ' 'with `getCoords` method') self._n_atoms = natoms = int(coords.shape[0]) if self._membrane is None: coords = self.buildMembrane(atoms, **kwargs) else: coords = self._combined.getCoords() LOGGER.timeit('_exanm') total_natoms = int(coords.shape[0]) self._hessian = np.zeros((natoms*3, natoms*3), float) total_hessian = np.zeros((total_natoms*3, total_natoms*3), float) cutoff, g, gamma = checkENMParameters(cutoff, gamma) cutoff2 = cutoff * cutoff for i in range(total_natoms): res_i3 = i*3 res_i33 = res_i3+3 i_p1 = i+1 i2j_all = coords[i_p1:, :] - coords[i] for j, dist2 in enumerate((i2j_all ** 2).sum(1)): if dist2 > cutoff2: continue i2j = i2j_all[j] j += i_p1 g = gamma(dist2, i, j) res_j3 = j*3 res_j33 = res_j3+3 super_element = np.outer(i2j, i2j) * (- g / dist2) total_hessian[res_i3:res_i33, res_j3:res_j33] = super_element total_hessian[res_j3:res_j33, res_i3:res_i33] = super_element total_hessian[res_i3:res_i33, res_i3:res_i33] = total_hessian[res_i3:res_i33, res_i3:res_i33] - super_element total_hessian[res_j3:res_j33, res_j3:res_j33] = total_hessian[res_j3:res_j33, res_j3:res_j33] - super_element ss = total_hessian[:natoms*3, :natoms*3] so = total_hessian[:natoms*3, natoms*3:] os = total_hessian[natoms*3:,:natoms*3] oo = total_hessian[natoms*3:, natoms*3:] self._hessian = ss - np.dot(so, np.dot(inv(oo), os)) LOGGER.report('Hessian was built in %.2fs.', label='_exanm') self._dof = self._hessian.shape[0]
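# Example usage sketch (illustrative; assumes ProDy's exANM class exposing the
# buildHessian/buildMembrane methods shown here). The structure should already
# be oriented in the membrane frame, e.g. taken from OPM; '2nwl' and the
# membrane bounds are illustrative values.
from prody import parsePDB, exANM

ca = parsePDB('2nwl', subset='ca')
exanm = exANM('2nwl')
exanm.buildHessian(ca, cutoff=15., gamma=1.,
                   membrane_high=13.0, membrane_low=-13.0, R=80., r=3.1)
exanm.calcModes(n_modes=20)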
def mapSAVs2PDB(SAV_coords, custom_PDB=None): LOGGER.info('Mapping SAVs to PDB structures...') LOGGER.timeit('_map2PDB') # sort SAVs, so to group together those # with identical accession number sorting_map = np.argsort(SAV_coords['acc']) # define a structured array PDBmap_dtype = np.dtype([('orig. SAV coords', 'U25'), ('uniq. SAV coords', 'U25'), ('PDB SAV coords', 'U100'), ('PDB size', 'i')]) num_SAVs = len(SAV_coords) mapped_SAVs = np.zeros(num_SAVs, dtype=PDBmap_dtype) # map to PDB using Uniprot class cache = {'acc': None, 'obj': None} count = 0 for indx, SAV in [(i, SAV_coords[i]) for i in sorting_map]: count += 1 acc, pos, aa1, aa2, SAV_str = SAV LOGGER.info("[{}/{}] Mapping SAV '{}' to PDB...".format( count, num_SAVs, SAV_str)) # map Uniprot to PDB chains if acc == cache['acc']: # use mapping from previous iteration U2P_map = cache['obj'] else: # save previous mapping if isinstance(cache['obj'], UniprotMapping): cache['obj'].savePickle() cache['acc'] = acc # compute the new mapping try: U2P_map = UniprotMapping(acc, recover_pickle=True) if custom_PDB is not None: LOGGER.info('Aligning Uniprot sequence to custom PDB...') U2P_map.alignCustomPDB(custom_PDB, 'all') except Exception as e: U2P_map = str(e) cache['obj'] = U2P_map # map specific SAV try: if isinstance(U2P_map, str): raise RuntimeError(U2P_map) # check wt aa if not 0 < pos <= len(U2P_map.sequence): raise ValueError('Index out of range') wt_aa = U2P_map.sequence[pos - 1] if aa1 != wt_aa: raise ValueError(f'Incorrect wt aa: {aa1} instead of {wt_aa}') # map to PDB. Format: [('2DZF', 'A', 150, 'N', 335)] if custom_PDB is None: r = U2P_map.mapSingleResidue(pos, check_aa=True) else: r = U2P_map.mapSingleRes2CustomPDBs(pos, check_aa=True) if len(r) == 0: raise RuntimeError('Unable to map SAV to PDB') else: PDBID, chID, resid, aa, PDB_size = r[0] # NB: check for blank "chain" field if chID.strip() == '': chID = '?' res_map = f'{PDBID} {chID} {resid} {aa}' except Exception as e: res_map = str(e) PDB_size = 0 # store SAVs mapped on PDB chains and unique Uniprot coordinates if isinstance(U2P_map, str): uniq_coords = U2P_map else: uniq_coords = f'{U2P_map.uniq_acc} {pos} {aa1} {aa2}' mapped_SAVs[indx] = (SAV_str, uniq_coords, res_map, PDB_size) # save last pickle if isinstance(cache['obj'], UniprotMapping): cache['obj'].savePickle() LOGGER.report('SAVs have been mapped to PDB in %.1fs.', '_map2PDB') return mapped_SAVs
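# Example (illustrative): consuming the structured array returned above. The
# field layout follows PDBmap_dtype defined in the function; SAV_coords is
# assumed to be the structured array of SAV coordinates produced upstream.
mapped = mapSAVs2PDB(SAV_coords)
for SAV_str, uniq_coords, PDB_coords, PDB_size in mapped:
    if PDB_size == 0:
        print('{0:<25s} not mapped: {1}'.format(SAV_str, PDB_coords))
    else:
        print('{0:<25s} -> {1} (chain size {2})'.format(SAV_str, PDB_coords, PDB_size))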
def parsePDBStream(stream, **kwargs): """Returns an :class:`.AtomGroup` and/or dictionary containing header data parsed from a stream of PDB lines. :arg stream: Anything that implements the method ``readlines`` (e.g. :class:`file`, buffer, stdin)""" model = kwargs.get('model') header = kwargs.get('header', False) assert isinstance(header, bool), 'header must be a boolean' chain = kwargs.get('chain') subset = kwargs.get('subset') altloc = kwargs.get('altloc', 'A') if model is not None: if isinstance(model, int): if model < 0: raise ValueError('model must be greater than 0') else: raise TypeError('model must be an integer, {0} is invalid'.format( str(model))) title_suffix = '' if subset: try: subset = _PDBSubsets[subset.lower()] except AttributeError: raise TypeError('subset must be a string') except KeyError: raise ValueError('{0} is not a valid subset'.format(repr(subset))) title_suffix = '_' + subset if chain is not None: if not isinstance(chain, str): raise TypeError('chain must be a string') elif len(chain) == 0: raise ValueError('chain must not be an empty string') title_suffix = '_' + chain + title_suffix ag = None if 'ag' in kwargs: ag = kwargs['ag'] if not isinstance(ag, AtomGroup): raise TypeError('ag must be an AtomGroup instance') n_csets = ag.numCoordsets() elif model != 0: ag = AtomGroup(str(kwargs.get('title', 'Unknown')) + title_suffix) n_csets = 0 biomol = kwargs.get('biomol', False) auto_secondary = None secondary = kwargs.get('secondary') if not secondary: auto_secondary = SETTINGS.get('auto_secondary') secondary = auto_secondary split = 0 hd = None if model != 0: LOGGER.timeit() try: lines = stream.readlines() except AttributeError as err: try: lines = stream.read().split('\n') except AttributeError: raise err if not len(lines): raise ValueError('empty PDB file or stream') if header or biomol or secondary: hd, split = getHeaderDict(lines) _parsePDBLines(ag, lines, split, model, chain, subset, altloc) if ag.numAtoms() > 0: LOGGER.report('{0} atoms and {1} coordinate set(s) were ' 'parsed in %.2fs.'.format( ag.numAtoms(), ag.numCoordsets() - n_csets)) else: ag = None LOGGER.warn('Atomic data could not be parsed, please ' 'check the input file.') elif header: hd, split = getHeaderDict(stream) if ag is not None and isinstance(hd, dict): if secondary: if auto_secondary: try: ag = assignSecstr(hd, ag) except ValueError: pass else: ag = assignSecstr(hd, ag) if biomol: ag = buildBiomolecules(hd, ag) if isinstance(ag, list): LOGGER.info('Biomolecular transformations were applied, {0} ' 'biomolecule(s) are returned.'.format(len(ag))) else: LOGGER.info('Biomolecular transformations were applied to the ' 'coordinate data.') if model != 0: if header: return ag, hd else: return ag else: return hd
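# Example usage sketch (illustrative): parsePDBStream() only needs an object
# with a readlines() method, so plain files, gzip handles and in-memory
# buffers all work. File names below are placeholders.
import gzip
from io import StringIO

with open('1p38.pdb') as inp:
    calphas = parsePDBStream(inp, subset='ca')

with gzip.open('1p38.pdb.gz', 'rt') as inp:
    structure, header = parsePDBStream(inp, header=True)

stream_buf = StringIO(pdb_text)     # pdb_text: a PDB-format string obtained elsewhere
structure = parsePDBStream(stream_buf, chain='A')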
def queryPolyPhen2(filename, dump=True, prefix='pph2', fasta_file=None, fix_isoforms=False, ignore_errors=False, **kwargs): # original PolyPhen-2 curl command (see: # http://genetics.bwh.harvard.edu/pph2/dokuwiki/faq ): # # curl -F _ggi_project=PPHWeb2 -F _ggi_origin=query \ # -F _ggi_target_pipeline=1 -F MODELNAME=HumDiv \ # -F UCSCDB=hg19 -F SNPFUNC=m -F [email protected] \ # -F _ggi_batch_file=@example_batch.txt \ # -D - http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi assert type(dump) is bool assert type(prefix) is str LOGGER.info('Submitting query to PolyPhen-2...') num_lines = sum(1 for line in open(filename, 'rb') if line[0] != '#') input_file = open(filename, 'rb') # submit query address = 'http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi' files = { '_ggi_project': (None, 'PPHWeb2'), '_ggi_origin': (None, 'query'), '_ggi_target_pipeline': (None, '1'), '_ggi_batch_file': ('query.txt', input_file), 'MODELNAME': (None, kwargs.get('MODELNAME', 'HumDiv')), 'UCSCDB': (None, kwargs.get('UCSCDB', 'hg19')), 'SNPFUNC': (None, kwargs.get('SNPFUNC', 'm')) } if fasta_file is not None: # upload custom sequences custom_fasta = open(fasta_file, 'rb') files['uploaded_sequences_1'] = ('sequences.fa', custom_fasta) response = requests.post(address, files=files) # parse job ID from response page jobID = response.cookies['polyphenweb2'] # results and semaphore files results_dir = f'http://genetics.bwh.harvard.edu/ggi/pph2/{jobID}/1/' files = { 'started': results_dir + 'started.txt', 'completed': results_dir + 'completed.txt', 'short': results_dir + 'pph2-short.txt', 'full': results_dir + 'pph2-full.txt', 'log': results_dir + 'pph2-log.txt', 'snps': results_dir + 'pph2-snps.txt' } # keep checking if the job has started/completed and, # when done, fetch output files output = {} exts = ['started', 'completed', 'short', 'full', 'log', 'snps'] for k in exts: # delay = timeout + backoff_factor*[2^(total_retries - 1)] if k == 'started': LOGGER.timeit('_started') r = _requests_retry_session(retries=16).get(files[k]) LOGGER.report('Query to PolyPhen-2 started in %.1fs.', '_started') LOGGER.info('PolyPhen-2 is running...') elif k == 'completed': LOGGER.timeit('_queryPP2') r = _requests_retry_session(retries=200, timeout=log(num_lines) / 2).get( files[k]) LOGGER.report('Query to PolyPhen-2 completed in %.1fs.', '_queryPP2') else: r = _requests_retry_session(retries=12).get(files[k]) output[k] = r.text # print to file, if requested if dump: with open(prefix + '-' + k + '.txt', 'w', 1) as f: print(r.text, file=f) # check for conflicts between Uniprot sequences and isoforms used # by Polyhen-2 (which are sometimes outdated) Uniprot_accs = _check_log_errors(output['log']) if Uniprot_accs: if fix_isoforms: LOGGER.info('PolyPhen-2 may have picked the wrong isoforms.') LOGGER.info('Resubmitting query with correct isoforms --- ' 'it may take up to a few hours to complete...') # print file with freshly downloaded Uniprot sequences fasta_fname, new_accs = _print_fasta_file(Uniprot_accs) # replace accession numbers in list of SAVs tmp_fname = filename + '.tmp' _replace_strings_in_file(filename, tmp_fname, new_accs) # resubmit query by manually uploading fasta sequences output = queryPolyPhen2(tmp_fname, dump=dump, prefix=prefix, fasta_file=fasta_fname, fix_isoforms=False, **kwargs) os.remove(tmp_fname) # restore original accession numbers in output orig_accs = dict([[v, k] for k, v in new_accs.items()]) for k in exts: output[k] = _replace_strings_in_text(output[k], orig_accs) if dump: outfile = f'pph2-{k}.txt' 
_replace_strings_in_file(outfile, outfile, orig_accs) elif ignore_errors: LOGGER.warn('Please check PolyPhen-2 log file') else: LOGGER.error('Please check PolyPhen-2 log file') return output
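# Example usage sketch (illustrative): the batch file is assumed to list one
# SAV per line as "<UniProt accession> <position> <wt aa> <mutant aa>",
# following the PolyPhen-2 batch-query format referenced in the curl command
# above. Accessions and variants below are only examples.
with open('example_batch.txt', 'w') as f:
    f.write('P04637 72 P R\n')
    f.write('P04637 175 R H\n')

output = queryPolyPhen2('example_batch.txt', dump=True, prefix='pph2')
print(output['short'].splitlines()[0])    # header line of the short report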
def calcPairDeformationDist(model, coords, ind1, ind2, kbt=1.): """Returns the distribution of deformations in the distance between the selected pair of residues *ind1* and *ind2*, contributed by each mode of *model*, an :class:`.ANM` instance. The method is described in [EB08]_, equation (10) and figure (2). .. [EB08] Eyal E., Bahar I. Toward a Molecular Understanding of the Anisotropic Response of Proteins to External Forces: Insights from Elastic Network Models. *Biophys J* **2008** 94:3424-3435. :arg model: a 3-dimensional :class:`NMA` instance from an :class:`.ANM` calculation. :type model: :class:`.ANM` :arg coords: a coordinate set or an object with :meth:`getCoords` method. Recommended: ``coords = parsePDB('pdbfile').select('protein and name CA')``. :type coords: :class:`~numpy.ndarray`. :arg ind1: first residue number. :type ind1: int :arg ind2: second residue number. :type ind2: int """ try: resnum_list = coords.getResnums() resnam_list = coords.getResnames() coords = (coords._getCoords() if hasattr(coords, '_getCoords') else coords.getCoords()) except AttributeError: try: checkCoords(coords) except TypeError: raise TypeError('coords must be a Numpy array or an object ' 'with `getCoords` method') if not isinstance(model, NMA): raise TypeError('model must be a NMA instance') elif not model.is3d(): raise TypeError('model must be a 3-dimensional NMA instance') elif len(model) == 0: raise ValueError('model must have normal modes calculated') linalg = importLA() n_atoms = model.numAtoms() n_modes = model.numModes() LOGGER.timeit('_pairdef') r_ij = np.zeros((n_atoms, n_atoms, 3)) r_ij_norm = np.zeros((n_atoms, n_atoms, 3)) for i in range(n_atoms): for j in range(i + 1, n_atoms): r_ij[i][j] = coords[j, :] - coords[i, :] r_ij[j][i] = r_ij[i][j] r_ij_norm[i][j] = r_ij[i][j] / linalg.norm(r_ij[i][j]) r_ij_norm[j][i] = r_ij_norm[i][j] eigvecs = model.getEigvecs() eigvals = model.getEigvals() D_pair_k = [] mode_nr = [] ind1 = ind1 - resnum_list[0] ind2 = ind2 - resnum_list[0] for m in range(6, n_modes): U_ij_k = [(eigvecs[ind1*3][m] - eigvecs[ind2*3][m]), (eigvecs[ind1*3+1][m] - eigvecs[ind2*3+1][m]), (eigvecs[ind1*3+2][m] - eigvecs[ind2*3+2][m])] D_ij_k = abs(sqrt(kbt / eigvals[m]) * (np.vdot(r_ij_norm[ind1][ind2], U_ij_k))) D_pair_k.append(D_ij_k) mode_nr.append(m) LOGGER.report('Deformation was calculated in %.2fs.', label='_pairdef') return mode_nr, D_pair_k
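# Example usage sketch (illustrative): per-mode deformation of the distance
# between residues 10 and 50 of an ANM model, following the recommendation in
# the docstring above. PDB id and residue numbers are illustrative.
from prody import parsePDB, ANM

ca = parsePDB('1p38').select('protein and name CA')
anm = ANM('p38')
anm.buildHessian(ca)
anm.calcModes()
modes, d_pair = calcPairDeformationDist(anm, ca, 10, 50)
print(modes[:3], d_pair[:3])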
def psiBlastCycle(sequence=None, filename=None, **kwargs): """Returns a :class:`PDBBlastRecord` instance that contains results from a single cycle of EBI psiblast. :arg sequence: an object with an associated sequence string or a sequence string itself :type sequence: :class:`Atomic`, :class:`Sequence`, or str :arg filename: a *filename* to save the results in XML format :type filename: str The following search parameters can be adjusted by the user. We use the same default values as http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/ wherever applicable. :arg email: email address for reporting problems default is [email protected] :type email: str with an @ before a . :arg matrix: The comparison matrix to be used to score alignments when searching the database possible values are 'BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30' and 'PAM70' default is 'BLOSUM62' :type matrix: str :arg gapopen: Penalty taken away from the score when a gap is created in sequence alignments. Increasing the gap opening penalty will decrease the number of gaps in the final alignment. Possible values range from 8 to 16 inclusive, default is 11 :type gapopen: int :arg gapext: Penalty taken away from the score for each base or residue in the gap. Increasing the gap extension penalty favors short gaps in the final alignment, conversly decreasing the gap extension penalty favors long gaps in the final alignment. Possible values range from 0 to 3, default is 1 :type gapext: int :arg expthr: Expectation threshold that limits the number of scores and alignments reported. This is the maximum number of times the match is expected to occur by chance. Possible values are 1.0e-200, 1.0e-100, 1.0e-50, 1.0e-10, 1.0e-5, 1.0e-4, 1.0e-3, 1.0e-2, 0.1, 1.0, 10.0, 100, 1000 default is 10.0 :type expthr: float :arg psithr: Expectation value threshold for automatic selection of matched sequences for inclusion in the PSSM at each iteration. Possible values are 1.0e-6, 1.0e-5, 1.0e-4, 2.0e-4, 5.0e-4, 1.0e-3, 2.0e-3, 5.0e-3, 1.0e-2, 2.0e-2, 0.1, 0.3, 0.5, 1.0, 3.0, 10.0 default is 1.0e-3 :type psithr: float :arg scores: Maximum number of match score summaries reported in the result output. Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000 Default is 500 :type scores: int :arg alignments: Maximum number of match alignments reported in the result output. Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000 Default is 500 :type alignmets: int :arg dropoff: The amount a score can drop before extension of word hits is halted Possible values are 0, 2, 4, 6, 8, 10, 15, 20, 25, or 30 Default is 15 :type dropoff: int :arg finaldropoff: Dropoff value for final gapped alignment Possible values are 10, 12, 14, 16, 18, 20, 22, 24, 25, 26, 28, or 30 Default is 25 :type finaldropoff: int :arg filter: Filter regions of low sequence complexity. This can avoid issues with low complexity sequences where matches are found due to composition rather than meaningful sequence similarity. However, in some cases filtering also masks regions of interest and so should be used with caution. Possible values are T and F, default is F :type filter: str :arg seqrange: Specify a range or section of the input sequence to use in the search. Example: Specifying '34-89' in an input sequence of total length 100, will tell BLAST to only use residues 34 to 89, inclusive. :type seqrange: str of form START-END :arg database: a database name from those available. 
See http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/database default is pdb :type database: str :arg previousjobid: The job identifier for the previous PSI-BLAST iteration. default is None You can change this if you want to continue from a previous run :type previousjobid: str :arg selectedHits: Name of a file containing a list of identifiers of the hits from the previous iteration to use to construct the search PSSM for this iteration. default is None :type selectedHits: str :arg cpfile: Name of a Checkpoint file from the previous iteration. default is None :type cpfile: str :arg sleep: how long to wait to reconnect for status Sleep time is multiplied by 1.5 when results are not ready. default is 2 seconds :type sleep: float :arg timeout: when to give up waiting for the results default is 120 seconds :type timeout: float :arg cycle: cycle number :type cycle: int """ cycle = kwargs.get('cycle',0) if sequence == 'runexample': sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI' 'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN' 'DAYDIVKMKKSNISPNFNFMGQLLDFERTL') elif isinstance(sequence, Atomic): sequence = sequence.calpha.getSequence() elif isinstance(sequence, Sequence): sequence = str(sequence) elif isinstance(sequence, str): if len(sequence) in [4, 5, 6]: ag = parsePDB(sequence) sequence = ag.calpha.getSequence() sequence = ''.join(sequence.split()) elif sequence is None: if cycle == 0: cycle = 1 else: raise TypeError('sequence must be Atomic, Sequence, or str not {0}' .format(type(sequence))) if cycle == 0: query = [('sequence', sequence)] else: query = [] email = kwargs.get('email','*****@*****.**') if not isinstance(email, str): raise TypeError('email must be a string') elif email.find('@') == -1 or email.find('.') == -1 or len(email.split('@')) != 2: raise ValueError('email must be a valid email address with at least one . and exactly one @ sign') elif not email.find('@') < email.find(email.split('.')[-1]): raise ValueError('email must be a valid email address with a . after the @ sign') query.append(('email', email)) query.append(('title', 'ProDy psiBlastPDB request')) previousjobid = kwargs.get('previousjobid','') if previousjobid is not '': query.append(('previousjobid',previousjobid)) selectedHits = kwargs.get('selectedHits','') if selectedHits is not '': query.append(('selectedHits',selectedHits)) database = kwargs.get('database','pdb') checkPsiBlastParameter('database', database) query.append(('database',database)) matrix = kwargs.get('matrix', 'BLOSUM62') checkPsiBlastParameter('matrix', matrix) query.append(('matrix',matrix)) gapopen = kwargs.get('gapopen',11) checkPsiBlastParameter('gapopen', gapopen) query.append(('gapopen',gapopen)) gapext = kwargs.get('gapext',1) checkPsiBlastParameter('gapext', gapext) query.append(('gapext',gapext)) expthr = kwargs.get('expthr', 10.) 
checkPsiBlastParameter('expthr', expthr) query.append(('expthr',expthr)) psithr = kwargs.get('psithr',1.0e-3) checkPsiBlastParameter('psithr', psithr) query.append(('psithr',psithr)) scores = kwargs.get('scores',500) checkPsiBlastParameter('scores', scores) query.append(('scores',scores)) alignments = kwargs.get('alignments',500) checkPsiBlastParameter('alignments', alignments) query.append(('alignments',alignments)) query.append(('alignView',0)) dropoff = kwargs.get('dropoff',15) checkPsiBlastParameter('dropoff', dropoff) query.append(('dropoff',dropoff)) finaldropoff = kwargs.get('finaldropoff',25) checkPsiBlastParameter('finaldropoff', finaldropoff) query.append(('finaldropoff',finaldropoff)) filter = kwargs.get('filter','F') checkPsiBlastParameter('filter', filter) query.append(('filter',filter)) if previousjobid is '' and selectedHits is '': seqrange = kwargs.get('seqrange', None) if seqrange is None: seqrange = '0-' + str(len(sequence)) elif not isinstance(seqrange, str): raise TypeError('seqrange should be a string') elif len(seqrange.split('-')) != 2: raise ValueError('seqrange should take the form START-END') try: start = int(seqrange.split('-')[0]) end = int(seqrange.split('-')[1]) except: raise ValueError('seqrange should be START-END with START and END being integers') query.append(('seqrange',seqrange)) headers = { 'User-Agent' : 'ProDy' } try: import urllib.parse urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8') except ImportError: from urllib import urlencode sleep = float(kwargs.pop('sleep', 2)) timeout = float(kwargs.pop('timeout', 120)) data = urlencode(query) # submit the job base_url = 'http://www.ebi.ac.uk/Tools/services/rest/psiblast/' url = base_url + 'run/' LOGGER.timeit('_prody_psi-blast') if cycle == 0: LOGGER.info('PSI-Blast searching PDB database for "{0}..."' .format(sequence[:5])) else: LOGGER.info('PSI-Blast searching PDB database, cycle={0}' .format(cycle)) handle = openURL(url, data=data, headers=headers) job_id = handle.read() handle.close() # check the status url = base_url + 'status/' + job_id handle = openURL(url) status = handle.read() handle.close() # keep checking the status until it's no longer running while status == 'RUNNING': LOGGER.sleep(int(sleep), 'to reconnect to EBI for status.') LOGGER.write('Connecting to EBI for status...') handle = openURL(url) status = handle.read() LOGGER.clear() sleep = int(sleep * 1.5) if LOGGER.timing('_prody_psi-blast') > timeout: LOGGER.warn('PSI-Blast search time out.') return None LOGGER.info('The status is {0}'.format(status)) LOGGER.clear() LOGGER.report('PSI-Blast search completed in %.1fs.', '_prody_psi-blast') if cycle != 1: # get the results url = base_url + 'result/' + job_id + '/xml' handle = openURL(url) results = handle.read() handle.close() try: ext_xml = filename.lower().endswith('.xml') except AttributeError: pass else: if not ext_xml: filename += '.xml' f_out = open(filename, 'w') f_out.write(results) f_out.close() LOGGER.info('Results are saved as {0}.'.format(repr(filename))) return job_id, PsiBlastRecord(results, sequence) else: return job_id
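# Example usage sketch (illustrative): the first cycle submits a sequence and
# returns (job_id, record); later cycles reuse the previous job id and, per
# the code above, return only the new job id. Parameter values are only
# examples of the allowed choices documented in the docstring.
job_id, record = psiBlastCycle('runexample', filename='cycle1.xml',
                               expthr=1e-5, scores=100, alignments=100)
next_job_id = psiBlastCycle(previousjobid=job_id, cycle=1)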
def searchPfam(query, **kwargs): """Return Pfam search results in a dictionary. Matching Pfam accession as keys will map to evalue, alignment start and end residue positions. :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence file, sequence queries must not contain without gaps and must be at least 16 characters long :type query: str :arg timeout: timeout for blocking connection attempt in seconds, default is 60 :type timeout: int *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with chain identifier. UniProt ID of the specified chain, or the first protein chain will be used for searching the Pfam database.""" prefix = "{http://pfam.xfam.org/}" query = str(query) if isfile(query): from prody.sequence import MSAFile try: seq = next(MSAFile(query)) except: with openFile(query) as inp: seq = "".join(inp.read().split()) else: seq = seq[0][1] if not seq.isalpha(): raise ValueError("could not parse a sequence without gaps from " + query) else: seq = "".join(query.split()) import xml.etree.cElementTree as ET LOGGER.timeit("_pfam") timeout = int(kwargs.get("timeout", 60)) if len(seq) >= MINSEQLEN: if not seq.isalpha(): raise ValueError(repr(seq) + " is not a valid sequence") fseq = ">Seq\n" + seq parameters = {"hmmdb": "pfam", "seq": fseq} enc_params = urllib.urlencode(parameters) request = urllib2.Request("http://hmmer.janelia.org/search/hmmscan", enc_params) url = urllib2.urlopen(request).geturl() + "?output=xml" LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(seq[:MINSEQLEN])) xml = openURL(url, timeout=timeout).read() try: root = ET.XML(xml) except Exception as err: raise ValueError("failed to parse results XML, check URL: " + url) matches = {} for child in root[0]: if child.tag == "hits": accession = child.get("acc") pfam_id = accession.split(".")[0] matches[pfam_id] = {} matches[pfam_id]["accession"] = accession matches[pfam_id]["class"] = "Domain" matches[pfam_id]["id"] = child.get("name") matches[pfam_id]["locations"] = {} matches[pfam_id]["locations"]["ali_end"] = child[0].get("alisqto") matches[pfam_id]["locations"]["ali_start"] = child[0].get("alisqfrom") matches[pfam_id]["locations"]["bitscore"] = child[0].get("bitscore") matches[pfam_id]["locations"]["end"] = child[0].get("alisqto") matches[pfam_id]["locations"]["evalue"] = child.get("evalue") matches[pfam_id]["locations"]["evidence"] = "hmmer v3.0" matches[pfam_id]["locations"]["hmm_end"] = child[0].get("alihmmto") matches[pfam_id]["locations"]["hmm_start"] = child[0].get("alihmmfrom") matches[pfam_id]["locations"]["significant"] = child[0].get("significant") matches[pfam_id]["locations"]["start"] = child[0].get("alisqfrom") matches[pfam_id]["type"] = "Pfam-A" return matches else: if len(seq) <= 5: idcode = None from prody import parsePDBHeader try: polymers = parsePDBHeader(seq[:4], "polymers") except Exception as err: LOGGER.warn("failed to parse header for {0} ({1})".format(seq[:4], str(err))) else: chid = seq[4:].upper() for poly in polymers: if chid and poly.chid != chid: continue for dbref in poly.dbrefs: if dbref.database != "UniProt": continue idcode = dbref.idcode LOGGER.info( "UniProt ID code {0} for {1} chain " "{2} will be used.".format(idcode, seq[:4], poly.chid) ) break if idcode is not None: break if idcode is None: LOGGER.warn("A UniProt ID code for PDB {0} could not be " "parsed.".format(repr(seq))) url = "http://pfam.xfam.org/protein/" + seq + "?output=xml" else: url = "http://pfam.xfam.org/protein/" + idcode + "?output=xml" else: url = 
"http://pfam.xfam.org/protein/" + seq + "?output=xml" LOGGER.debug("Retrieving Pfam search results: " + url) xml = None while LOGGER.timing("_pfam") < timeout: try: xml = openURL(url, timeout=timeout).read() except Exception: pass else: if xml: break if not xml: raise IOError("Pfam search timed out or failed to parse results " "XML, check URL: " + url) else: LOGGER.report("Pfam search completed in %.2fs.", "_pfam") if xml.find(b"There was a system error on your last request.") > 0: LOGGER.warn("No Pfam matches found for: " + seq) return None try: root = ET.XML(xml) except Exception as err: raise ValueError("failed to parse results XML, check URL: " + url) if len(seq) >= MINSEQLEN: try: xml_matches = root[0][0][0][0] except IndexError: raise ValueError("failed to parse results XML, check URL: " + url) else: results = dictElement(root[0], prefix) try: xml_matches = results["matches"] except KeyError: raise ValueError("failed to parse results XML, check URL: " + url) matches = dict() for child in xml_matches: try: accession = child.attrib["accession"][:7] except KeyError: raise ValueError("failed to parse results XML, check URL: " + url) if not re.search("^P(F|B)[0-9]{5}$", accession): raise ValueError("{0} does not match pfam accession" " format".format(accession)) match = matches.setdefault(accession, dict(child.items())) locations = match.setdefault("locations", []) for loc in child: locations.append(dict(loc.items())) if len(seq) < MINSEQLEN: query = "Query " + repr(query) else: query = "Query sequence" if matches: LOGGER.info(query + " matched {0} Pfam families.".format(len(matches))) else: LOGGER.info(query + " did not match any Pfam families.") return matches
def buildMembrane(self, coords, **kwargs): """Build Hessian matrix for given coordinate set. :arg coords: a coordinate set or an object with ``getCoords`` method :type coords: :class:`numpy.ndarray` :arg membrane_high: the maximum z coordinate of the membrane. Default is **13.0** :type membrane_high: float :arg membrane_low: the minimum z coordinate of the membrane. Default is **-13.0** :type membrane_low: float :arg R: radius of all membrane in x-y direction. Default is **80** :type R: float :arg Ri: inner radius of the membrane in x-y direction if it needs to be hollow. Default is **0**, which is not hollow :type Ri: float :arg r: radius of each membrane node. Default is **3.1** :type r: float :arg lat: lattice type which could be **FCC** (face-centered-cubic, default), **SC** (simple cubic), **SH** (simple hexagonal) :type lat: str :arg exr: exclusive radius of each protein node. Default is **5.0** :type exr: float :arg hull: whether use convex hull to determine the protein's interior. Turn it off if protein is multimer. Default is **True** :type hull: bool :arg center: whether transform the structure to the origin (only x- and y-axis). Default is **True** :type center: bool """ atoms = coords try: coords = (coords._getCoords() if hasattr(coords, '_getCoords') else coords.getCoords()) except AttributeError: try: checkCoords(coords) except TypeError: raise TypeError('coords must be a Numpy array or an object ' 'with `getCoords` method') self._n_atoms = natoms = int(coords.shape[0]) LOGGER.timeit('_membrane') depth = kwargs.pop('depth', None) h = depth / 2 if depth is not None else None h = kwargs.pop('h', h) if h is not None: h = float(h) hu = h hl = -h else: hu = kwargs.pop('membrane_high', 13.0) hu = kwargs.pop('high', hu) hu = float(hu) hl = kwargs.pop('membrane_low', -13.0) hl = kwargs.pop('low', hl) hl = float(hl) R = float(kwargs.pop('R', 80.)) Ri = float(kwargs.pop('Ri', 0.)) r = float(kwargs.pop('r', 3.1)) lat = str(kwargs.pop('lat', 'FCC')) exr = float(kwargs.pop('exr', 5.)) use_hull = kwargs.pop('hull', True) centering = kwargs.pop('center', True) V = assign_lpvs(lat) if centering: c0 = coords.mean(axis=0) c0[-1] = 0. coords -= c0 # determine transmembrane part torf = np.logical_and(coords[:, -1] < hu, coords[:, -1] > hl) transmembrane = coords[torf, :] if not np.any(torf): raise ValueError('No region was identified as membrane. Please use a structure from opm/ppm.') if use_hull: from scipy.spatial import ConvexHull hull = ConvexHull(transmembrane) else: hull = transmembrane ## determine the bound for ijk imax = (R + V[0,2] * (hu - hl)/2.)/r jmax = (R + V[1,2] * (hu - hl)/2.)/r kmax = (R + V[2,2] * (hu - hl)/2.)/r imax = int(ceil(imax)) jmax = int(ceil(jmax)) kmax = int(ceil(kmax)) membrane = [] atm = 0 for i in range(-imax, imax): for j in range(-jmax, jmax): for k in range(-kmax, kmax): c = array([i, j, k]) xyz = 2.*r*dot(c, V) if xyz[2]>hl and xyz[2]<hu and \ xyz[0]>-R and xyz[0]<R and \ xyz[1]>-R and xyz[1]<R: dd = norm(xyz[:2]) if dd < R and dd > Ri: if checkClash(xyz, hull, radius=exr): membrane.append(xyz) atm = atm + 1 membrane = array(membrane) if len(membrane) == 0: self._membrane = None LOGGER.warn('no membrane is built. 
The protein should be transformed to the correct origin as in OPM') return coords else: self._membrane = AtomGroup(title="Membrane") self._membrane.setCoords(membrane) self._membrane.setResnums(range(atm)) self._membrane.setResnames(["NE1" for i in range(atm)]) self._membrane.setChids(["Q" for i in range(atm)]) self._membrane.setElements(["Q1" for i in range(atm)]) self._membrane.setNames(["Q1" for i in range(atm)]) LOGGER.report('Membrane was built in %.2fs.', label='_membrane') coords = self._combineMembraneProtein(atoms) return coords
def buildHessian(self, coords, cutoff=15., gamma=1., **kwargs): """Build Hessian matrix for given coordinate set. :arg coords: a coordinate set or an object with ``getCoords`` method :type coords: :class:`numpy.ndarray` :arg cutoff: cutoff distance (Å) for pairwise interactions, default is 15.0 Å, minimum is 4.0 Å :type cutoff: float :arg gamma: spring constant, default is 1.0 :type gamma: float, :class:`Gamma` :arg sparse: elect to use sparse matrices, default is **False**. If Scipy is not found, :class:`ImportError` is raised. :type sparse: bool :arg kdtree: elect to use KDTree for building Hessian matrix, default is **False** since KDTree method is slower :type kdtree: bool Instances of :class:`Gamma` classes and custom functions are accepted as *gamma* argument. When Scipy is available, user can select to use sparse matrices for efficient usage of memory at the cost of computation speed.""" try: coords = (coords._getCoords() if hasattr(coords, '_getCoords') else coords.getCoords()) except AttributeError: try: checkCoords(coords) except TypeError: raise TypeError('coords must be a Numpy array or an object ' 'with `getCoords` method') cutoff, g, gamma = checkENMParameters(cutoff, gamma) self._reset() self._cutoff = cutoff self._gamma = g n_atoms = coords.shape[0] dof = n_atoms * 3 LOGGER.timeit('_anm_hessian') if kwargs.get('sparse', False): try: from scipy import sparse as scipy_sparse except ImportError: raise ImportError('failed to import scipy.sparse, which is ' 'required for sparse matrix calculations') kirchhoff = scipy_sparse.lil_matrix((n_atoms, n_atoms)) hessian = scipy_sparse.lil_matrix((dof, dof)) else: kirchhoff = np.zeros((n_atoms, n_atoms), 'd') hessian = np.zeros((dof, dof), float) if kwargs.get('kdtree', False): LOGGER.info('Using KDTree for building the Hessian.') kdtree = KDTree(coords) kdtree.search(cutoff) for i, j in kdtree.getIndices(): i2j = coords[j] - coords[i] dist2 = np.dot(i2j, i2j) g = gamma(dist2, i, j) super_element = np.outer(i2j, i2j) * (- g / dist2) res_i3 = i*3 res_i33 = res_i3+3 res_j3 = j*3 res_j33 = res_j3+3 hessian[res_i3:res_i33, res_j3:res_j33] = super_element hessian[res_j3:res_j33, res_i3:res_i33] = super_element hessian[res_i3:res_i33, res_i3:res_i33] = \ hessian[res_i3:res_i33, res_i3:res_i33] - super_element hessian[res_j3:res_j33, res_j3:res_j33] = \ hessian[res_j3:res_j33, res_j3:res_j33] - super_element kirchhoff[i, j] = -g kirchhoff[j, i] = -g kirchhoff[i, i] = kirchhoff[i, i] - g kirchhoff[j, j] = kirchhoff[j, j] - g else: cutoff2 = cutoff * cutoff for i in range(n_atoms): res_i3 = i*3 res_i33 = res_i3+3 i_p1 = i+1 i2j_all = coords[i_p1:, :] - coords[i] for j, dist2 in enumerate((i2j_all ** 2).sum(1)): if dist2 > cutoff2: continue i2j = i2j_all[j] j += i_p1 g = gamma(dist2, i, j) res_j3 = j*3 res_j33 = res_j3+3 super_element = np.outer(i2j, i2j) * (- g / dist2) hessian[res_i3:res_i33, res_j3:res_j33] = super_element hessian[res_j3:res_j33, res_i3:res_i33] = super_element hessian[res_i3:res_i33, res_i3:res_i33] = \ hessian[res_i3:res_i33, res_i3:res_i33] - super_element hessian[res_j3:res_j33, res_j3:res_j33] = \ hessian[res_j3:res_j33, res_j3:res_j33] - super_element kirchhoff[i, j] = -g kirchhoff[j, i] = -g kirchhoff[i, i] = kirchhoff[i, i] - g kirchhoff[j, j] = kirchhoff[j, j] - g LOGGER.report('Hessian was built in %.2fs.', label='_anm_hessian') self._kirchhoff = kirchhoff self._hessian = hessian self._n_atoms = n_atoms self._dof = dof
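# Example usage sketch (illustrative): per the docstring above, gamma may be a
# custom callable taking (dist2, i, j), and sparse=True stores the Hessian and
# Kirchhoff matrices as SciPy LIL matrices (requires SciPy).
from prody import parsePDB, ANM

def backbone_stiffened_gamma(dist2, i, j):
    # toy force-constant rule: double the spring constant for sequence neighbours
    return 2.0 if abs(i - j) == 1 else 1.0

ca = parsePDB('1p38', subset='ca')
anm = ANM('p38')
anm.buildHessian(ca, cutoff=15., gamma=backbone_stiffened_gamma, sparse=True)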
def calcModes(self, n_modes=20, zeros=False, turbo=True): """Calculate normal modes. This method uses :func:`scipy.linalg.eigh` function to diagonalize the Hessian matrix. When Scipy is not found, :func:`numpy.linalg.eigh` is used. :arg n_modes: number of non-zero eigenvalues/vectors to calculate. If ``None`` or 'all' is given, all modes will be calculated. :type n_modes: int or None, default is 20 :arg zeros: If ``True``, modes with zero eigenvalues will be kept. :type zeros: bool, default is ``False`` :arg turbo: Use a memory intensive, but faster way to calculate modes. :type turbo: bool, default is ``True`` """ if self._hessian is None: raise ValueError('Hessian matrix is not built or set') if str(n_modes) == 'all': n_modes = None assert n_modes is None or isinstance(n_modes, int) and n_modes > 0, \ 'n_modes must be a positive integer' assert isinstance(zeros, bool), 'zeros must be a boolean' assert isinstance(turbo, bool), 'turbo must be a boolean' linalg = importLA() LOGGER.timeit('_anm_calc_modes') shift = 5 if linalg.__package__.startswith('scipy'): if n_modes is None: eigvals = None n_modes = self._dof else: if n_modes >= self._dof: eigvals = None n_modes = self._dof else: eigvals = (0, n_modes + shift) if eigvals: turbo = False if isinstance(self._hessian, np.ndarray): values, vectors = linalg.eigh(self._hessian, turbo=turbo, eigvals=eigvals) else: try: from scipy.sparse import linalg as scipy_sparse_la except ImportError: raise ImportError('failed to import scipy.sparse.linalg, ' 'which is required for sparse matrix ' 'decomposition') try: values, vectors = (scipy_sparse_la.eigsh(self._hessian, k=n_modes + 6, which='SA')) except: values, vectors = (scipy_sparse_la.eigen_symmetric( self._hessian, k=n_modes + 6, which='SA')) else: if n_modes is not None: LOGGER.info('Scipy is not found, all modes are calculated.') values, vectors = np.linalg.eigh(self._hessian) n_zeros = sum(values < ZERO) if n_zeros < 6: LOGGER.warning('Less than 6 zero eigenvalues are calculated.') shift = n_zeros - 1 elif n_zeros > 6: LOGGER.warning('More than 6 zero eigenvalues are calculated.') shift = n_zeros - 1 if zeros: shift = -1 self._eigvals = values[1 + shift:] self._vars = 1 / self._eigvals self._trace = self._vars.sum() if shift: self._array = vectors[:, 1 + shift:].copy() else: self._array = vectors self._n_modes = len(self._eigvals) LOGGER.report('{0} modes were calculated in %.2fs.'.format( self._n_modes), label='_anm_calc_modes')
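# Example usage sketch (illustrative): keep the zero modes to sanity-check
# that six trivial modes come out of a dense ANM Hessian (PDB id illustrative).
from prody import parsePDB, ANM

ca = parsePDB('1p38', subset='ca')
anm = ANM('p38')
anm.buildHessian(ca)
anm.calcModes(n_modes=None, zeros=True)
print('near-zero eigenvalues:', int((anm.getEigvals() < 1e-8).sum()))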
def searchPfam(query, **kwargs): """Return Pfam search results in a dictionary. Matching Pfam accession as keys will map to evalue, alignment start and end residue positions. :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence file, sequence queries must not contain without gaps and must be at least 16 characters long :type query: str :arg timeout: timeout for blocking connection attempt in seconds, default is 60 :type timeout: int *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with chain identifier. UniProt ID of the specified chain, or the first protein chain will be used for searching the Pfam database.""" prefix = '{http://pfam.xfam.org/}' query = str(query) if isfile(query): from prody.sequence import MSAFile try: seq = next(MSAFile(query)) except: with openFile(query) as inp: seq = ''.join(inp.read().split()) else: seq = seq[0][1] if not seq.isalpha(): raise ValueError('could not parse a sequence without gaps from ' + query) else: seq = ''.join(query.split()) import xml.etree.cElementTree as ET LOGGER.timeit('_pfam') timeout = int(kwargs.get('timeout', 60)) if len(seq) >= MINSEQLEN: if not seq.isalpha(): raise ValueError(repr(seq) + ' is not a valid sequence') fseq = '>Seq\n' + seq parameters = { 'hmmdb' : 'pfam', 'seq': fseq } enc_params = urllib.urlencode(parameters).encode('utf-8') request = urllib2.Request('http://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params) url = ( urllib2.urlopen(request).geturl() + '?output=xml') LOGGER.debug('Submitted Pfam search for sequence "{0}...".' .format(seq[:MINSEQLEN])) xml = openURL(url, timeout=timeout).read() try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) matches = {} for child in root[0]: if child.tag == 'hits': accession = child.get('acc') pfam_id = accession.split('.')[0] matches[pfam_id]={} matches[pfam_id]['accession']=accession matches[pfam_id]['class']='Domain' matches[pfam_id]['id']=child.get('name') matches[pfam_id]['locations']={} matches[pfam_id]['locations']['ali_end']=child[0].get('alisqto') matches[pfam_id]['locations']['ali_start']=child[0].get('alisqfrom') matches[pfam_id]['locations']['bitscore']=child[0].get('bitscore') matches[pfam_id]['locations']['end']=child[0].get('alisqto') matches[pfam_id]['locations']['evalue']=child.get('evalue') matches[pfam_id]['locations']['evidence']='hmmer v3.0' matches[pfam_id]['locations']['hmm_end']=child[0].get('alihmmto') matches[pfam_id]['locations']['hmm_start']=child[0].get('alihmmfrom') matches[pfam_id]['locations']['significant']=child[0].get('significant') matches[pfam_id]['locations']['start']=child[0].get('alisqfrom') matches[pfam_id]['type']='Pfam-A' return matches else: if len(seq) <= 5: idcode = None from prody import parsePDBHeader try: polymers = parsePDBHeader(seq[:4], 'polymers') except Exception as err: LOGGER.warn('failed to parse header for {0} ({1})' .format(seq[:4], str(err))) else: chid = seq[4:].upper() for poly in polymers: if chid and poly.chid != chid: continue for dbref in poly.dbrefs: if dbref.database != 'UniProt': continue idcode = dbref.idcode LOGGER.info('UniProt ID code {0} for {1} chain ' '{2} will be used.' 
.format(idcode, seq[:4], poly.chid)) break if idcode is not None: break if idcode is None: LOGGER.warn('A UniProt ID code for PDB {0} could not be ' 'parsed.'.format(repr(seq))) url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml' else: url = ('http://pfam.xfam.org/protein/' + idcode + '?output=xml') else: url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml' LOGGER.debug('Retrieving Pfam search results: ' + url) xml = None while LOGGER.timing('_pfam') < timeout: try: xml = openURL(url, timeout=timeout).read() except Exception: pass else: if xml not in ['PEND','RUN']: break if not xml: raise IOError('Pfam search timed out or failed to parse results ' 'XML, check URL: ' + url) else: LOGGER.report('Pfam search completed in %.2fs.', '_pfam') if xml.find(b'There was a system error on your last request.') > 0: LOGGER.warn('No Pfam matches found for: ' + seq) return None try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) if len(seq) >= MINSEQLEN: try: xml_matches = root[0][0][0][0] except IndexError: raise ValueError('failed to parse results XML, check URL: ' + url) else: results = dictElement(root[0], prefix) try: xml_matches = results['matches'] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) matches = dict() for child in xml_matches: try: accession = child.attrib['accession'][:7] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) if not re.search('^P(F|B)[0-9]{5}$', accession): raise ValueError('{0} does not match pfam accession' ' format'.format(accession)) match = matches.setdefault(accession, dict(child.items())) locations = match.setdefault('locations', []) for loc in child: locations.append(dict(loc.items())) if len(seq) < MINSEQLEN: query = 'Query ' + repr(query) else: query = 'Query sequence' if matches: LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches))) else: LOGGER.info(query + ' did not match any Pfam families.') return matches
def calcPerturbResponse(model, atoms=None, repeats=100): """Returns a matrix of profiles from scanning of the response of the structure to random perturbations at specific atom (or node) positions. The function implements the perturbation response scanning (PRS) method described in [CA09]_. Rows of the matrix are the average magnitude of the responses obtained by perturbing the atom/node position at that row index, i.e. ``prs_profile[i,j]`` will give the response of residue/node *j* to perturbations in residue/node *i*. PRS is performed using the covariance matrix from *model*, e.t. :class:`.ANM` instance. Each residue/node is perturbed *repeats* times with a random unit force vector. When *atoms* instance is given, PRS profile for residues will be added as an attribute which then can be retrieved as ``atoms.getData('prs_profile')``. *model* and *atoms* must have the same number of atoms. *atoms* must be an :class:`.AtomGroup` instance. .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning Reveals Ligand Entry-Exit Mechanisms of Ferric Binding Protein. *PLoS Comput Biol* **2009** 5(10):e1000544. The PRS matrix can be saved as follows:: prs_matrix = calcPerturbationResponse(p38_anm) writeArray('prs_matrix.txt', prs_matrix, format='%8.6f', delimiter='\t') """ if not isinstance(model, NMA): raise TypeError('model must be an NMA instance') elif not model.is3d(): raise TypeError('model must be a 3-dimensional NMA instance') elif len(model) == 0: raise ValueError('model must have normal modes calculated') if atoms is not None: if not isinstance(atoms, AtomGroup): raise TypeError('atoms must be an AtomGroup instance') elif atoms.numAtoms() != model.numAtoms(): raise ValueError('model and atoms must have the same number atoms') assert isinstance(repeats, int), 'repeats must be an integer' cov = calcCovariance(model) if cov is None: raise ValueError('model did not return a covariance matrix') n_atoms = model.numAtoms() response_matrix = np.zeros((n_atoms, n_atoms)) LOGGER.progress('Calculating perturbation response', n_atoms, '_prody_prs') i3 = -3 i3p3 = 0 for i in range(n_atoms): i3 += 3 i3p3 += 3 forces = np.random.rand(repeats * 3).reshape((repeats, 3)) forces /= ((forces**2).sum(1)**0.5).reshape((repeats, 1)) for force in forces: response_matrix[i] += ( np.dot(cov[:, i3:i3p3], force) ** 2).reshape((n_atoms, 3)).sum(1) LOGGER.update(i, '_prody_prs') response_matrix /= repeats LOGGER.clear() LOGGER.report('Perturbation response scanning completed in %.1fs.', '_prody_prs') if atoms is not None: atoms.setData('prs_profile', response_matrix) return response_matrix # save the original PRS matrix np.savetxt('orig_PRS_matrix', response_matrix, delimiter='\t', fmt='%8.6f') # calculate the normalized PRS matrix self_dp = np.diag(response_matrix) # using self displacement (diagonal of # the original matrix) as a # normalization factor self_dp = self_dp.reshape(n_atoms, 1) norm_PRS_mat = response_matrix / np.repeat(self_dp, n_atoms, axis=1) # suppress the diagonal (self displacement) to facilitate # visualizing the response profile norm_PRS_mat = norm_PRS_mat - np.diag(np.diag(norm_PRS_mat)) np.savetxt('norm_PRS_matrix', norm_PRS_mat, delimiter='\t', fmt='%8.6f') return response_matrix
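# Example (illustrative): row-normalising the PRS matrix by the self-response,
# mirroring the normalisation in the unreachable block after the return
# statement above (that block is never executed as written).
import numpy as np
from prody import parsePDB, ANM

ca = parsePDB('1p38', subset='ca')
anm = ANM('p38')
anm.buildHessian(ca)
anm.calcModes()
prs = calcPerturbResponse(anm, repeats=100)
norm_prs = prs / np.diag(prs).reshape(-1, 1)
np.fill_diagonal(norm_prs, 0.0)    # suppress self-response for visualisation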
def blastPDB(sequence, filename=None, **kwargs):
    """Return a :class:`PDBBlastRecord` instance that contains the results of
    blasting *sequence* against the Protein Data Bank using NCBI blastp.

    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``)
    search parameters can be adjusted by the user.  The *sleep* keyword
    argument (default is ``2`` seconds) determines how long to wait before
    reconnecting for results; the sleep time is multiplied by 1.5 when results
    are not ready.  *timeout* (default is ``120`` seconds) determines when to
    give up waiting for the results."""

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')
    else:
        try:
            sequence = ''.join(sequence.split())
            _ = sequence.isalpha()
        except AttributeError:
            raise TypeError('sequence must be a string')
        else:
            if not _:
                raise ValueError('not a valid protein sequence')

    query = [('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'),
             ('PROGRAM', 'blastp'), ]
    expect = float(kwargs.pop('expect', 1e-10))
    assert expect > 0, 'expect must be a positive number'
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    assert hitlist_size > 0, 'hitlist_size must be a positive integer'
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    if kwargs:
        LOGGER.warn('Keyword argument(s) {0} are not used.'
                    .format(', '.join([repr(key) for key in kwargs])))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    url = 'http://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'
                .format(sequence[:5]))
    handle = openURL(url, data=data, headers={'User-agent': 'ProDy'})

    html = handle.read()
    index = html.find(b'RID =')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find(b'\n', index)
        rid = html[index + len('RID ='):last].strip()

    index = html.find(b'RTOE =')
    if index == -1:
        rtoe = None  # This is not used
    else:
        last = html.find(b'\n', index)
        rtoe = int(html[index + len('RTOE ='):last].strip())

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect to NCBI for search results.')
        LOGGER.write('Connecting to NCBI for search results...')
        handle = openURL(url, data=data, headers={'User-agent': 'ProDy'})
        results = handle.read()
        index = results.find(b'Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b'\n', index)
        status = results[index + len('Status='):last].strip()
        # results is a bytes object, so compare against a bytes literal
        if status.upper() == b'READY':
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None

    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')

    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        out = open(filename, 'wb')  # results are bytes, write in binary mode
        out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

    return PDBBlastRecord(results, sequence)
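# Usage sketch (illustrative): blast a query sequence against the PDB and
# inspect the top hit.  getBest() and getHits() are PDBBlastRecord methods;
# the 'runexample' shortcut handled above provides a ready-made test sequence.
record = blastPDB('runexample', filename='blast_results.xml', hitlist_size=100)
if record is not None:                       # None is returned on timeout
    best = record.getBest()
    print(best['pdb_id'], best['percent_identity'])
    hits = record.getHits(percent_identity=90)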
def buildMembrane(self, coords, **kwargs):
    """Build a membrane lattice around the given coordinate set.

    :arg coords: a coordinate set or an object with ``getCoords`` method
    :type coords: :class:`numpy.ndarray`

    :arg membrane_hi: the maximum z coordinate of the membrane,
        default is 13.0
    :type membrane_hi: float

    :arg membrane_lo: the minimum z coordinate of the membrane,
        default is -13.0
    :type membrane_lo: float

    :arg R: radius of the membrane in the x-y plane, default is 80.0
    :type R: float

    :arg r: radius of an individual barrel-type membrane node,
        default is 5.0
    :type r: float

    :arg lat: lattice type, one of FCC (face-centered cubic, default),
        SC (simple cubic), or SH (simple hexagonal)
    :type lat: str
    """

    buildAg = type(coords) is AtomGroup

    try:
        coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                  coords.getCoords())
    except AttributeError:
        try:
            checkCoords(coords)
        except TypeError:
            raise TypeError('coords must be a Numpy array or an object '
                            'with `getCoords` method')

    self._n_atoms = natoms = int(coords.shape[0])

    pxlo = min(np.append(coords[:, 0], 10000))
    pxhi = max(np.append(coords[:, 0], -10000))
    pylo = min(np.append(coords[:, 1], 10000))
    pyhi = max(np.append(coords[:, 1], -10000))
    pzlo = min(np.append(coords[:, 2], 10000))
    pzhi = max(np.append(coords[:, 2], -10000))

    membrane_hi = float(kwargs.get('membrane_hi', 13.0))
    membrane_lo = float(kwargs.get('membrane_lo', -13.0))
    R = float(kwargs.get('R', 80))
    r = float(kwargs.get('r', 5))
    lat = str(kwargs.get('lat', 'FCC'))
    lpv = assign_lpvs(lat)

    imax = (R + lpv[0, 2] * (membrane_hi - membrane_lo)/2.)/r
    jmax = (R + lpv[1, 2] * (membrane_hi - membrane_lo)/2.)/r
    kmax = (R + lpv[2, 2] * (membrane_hi - membrane_lo)/2.)/r

    LOGGER.timeit('_membrane')
    membrane = zeros((1, 3))
    atm = 0
    for i in range(-int(imax), int(imax + 1)):
        for j in range(-int(jmax), int(jmax + 1)):
            for k in range(-int(kmax), int(kmax + 1)):
                X = zeros((1, 3))
                for p in range(3):
                    X[0, p] = 2.*r*(i*lpv[0, p] + j*lpv[1, p] + k*lpv[2, p])
                dd = 0
                for p in range(3):
                    dd += X[0, p] ** 2
                # keep lattice points inside the membrane slab and close to
                # the protein, provided they do not clash with protein atoms
                if dd < R**2 and membrane_lo < X[0, 2] < membrane_hi:
                    if (pxlo - R/2 < X[0, 0] < pxhi + R/2 and
                            pylo - R/2 < X[0, 1] < pyhi + R/2 and
                            pzlo < X[0, 2] < pzhi):
                        if checkClash(X, coords[:natoms, :], radius=5):
                            if atm == 0:
                                membrane = X
                            else:
                                membrane = np.append(membrane, X, axis=0)
                            atm = atm + 1

    self._membrane = AtomGroup(title="Membrane")
    self._membrane.setCoords(membrane)
    self._membrane.setResnums(range(atm))
    self._membrane.setResnames(["NE1" for i in range(atm)])
    self._membrane.setChids(["Q" for i in range(atm)])
    self._membrane.setElements(["Q1" for i in range(atm)])
    self._membrane.setNames(["Q1" for i in range(atm)])
    LOGGER.report('Membrane was built in %.2fs.', label='_membrane')
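# Usage sketch (illustrative): build the membrane lattice around a membrane
# protein.  This assumes the method belongs to ProDy's exANM class; '2nwl' is
# an arbitrary example structure and the keyword values simply repeat the
# defaults documented above.
from prody import parsePDB, exANM

protein = parsePDB('2nwl', subset='ca')
exanm = exANM('2nwl membrane model')
exanm.buildMembrane(protein, membrane_hi=13.0, membrane_lo=-13.0,
                    R=80., r=5., lat='FCC')
membrane = exanm._membrane   # AtomGroup titled "Membrane", set at the end of the method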
def calcBothWaysAdaptiveANM(a, b, n_steps, **kwargs):
    """Runs both-way adaptive ANM: adaptive ANM cycles are first run from
    structure *a* towards structure *b*, then from *b* towards *a*, and the
    two halves are joined into a single transition ensemble."""

    n_modes0 = n_modes = kwargs.pop('n_modes', 20)

    coordsA, coordsB, title, atoms, weights, maskA, maskB, rmsd = checkInput(
        a, b, **kwargs)
    coordsA = coordsA.copy()
    coordsB = coordsB.copy()

    LOGGER.timeit('_prody_calcAdaptiveANM')
    n = 0
    resetFmin = True
    defvecs = []
    rmsds = [rmsd]
    ensA = Ensemble('A')
    ensA.setCoords(coordsA)
    ensA.setWeights(weights)
    ensA.addCoordset(coordsA.copy())

    ensB = Ensemble('B')
    ensB.setCoords(coordsB.copy())
    ensB.setWeights(weights)
    ensB.addCoordset(coordsB.copy())

    # forward direction: drive structure A towards structure B
    while n < n_steps:
        LOGGER.info('\nStarting cycle {0} with {1}'.format(
            n + 1, getTitle(a, 'structure A')))
        n_modes = calcStep(coordsA, coordsB, n_modes, ensA, defvecs, rmsds,
                           mask=maskA, resetFmin=resetFmin, **kwargs)
        n += 1
        resetFmin = False
        if n_modes == 0:
            break

    # reverse direction: drive structure B towards structure A
    n = 0
    n_modes = n_modes0
    resetFmin = True
    while n < n_steps:
        LOGGER.info('\nStarting cycle {0} with {1}'.format(
            n + 1, getTitle(b, 'structure B')))
        n_modes = calcStep(coordsB, coordsA, n_modes, ensB, defvecs, rmsds,
                           mask=maskB, resetFmin=resetFmin, **kwargs)
        n += 1
        resetFmin = False
        if n_modes == 0:
            break

    ensemble = ensA + ensB[::-1]
    ensemble.setTitle(title + '_aANM')
    ensemble.setAtoms(atoms)
    ensemble.setCoords(ensB.getCoords())

    LOGGER.report('Both-way Adaptive ANM converged in %.2fs.',
                  '_prody_calcAdaptiveANM')

    return ensemble
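# Usage sketch (illustrative PDB ids): run both-way adaptive ANM between the
# open and closed conformations of adenylate kinase and save the resulting
# transition ensemble.  writeDCD is ProDy's DCD trajectory writer.
from prody import parsePDB, writeDCD

ca_open = parsePDB('4ake', subset='ca', chain='A')
ca_closed = parsePDB('1ake', subset='ca', chain='A')
ensemble = calcBothWaysAdaptiveANM(ca_open, ca_closed, n_steps=20, n_modes=20)
writeDCD('transition.dcd', ensemble)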