def calcSquareInnerProduct(modes1, modes2):
    """Returns the square inner product (SIP) of fluctuations [SK02]_.

    This function returns a single number.

    .. [SK02] Kundu S, Melton JS, Sorensen DC, Phillips GN: Dynamics of
       proteins in crystals: comparison of experiment with simple models.
       Biophys J. 2002, 83: 723-732.
    """
    if isinstance(modes1, (NMA, ModeSet)):
        w1 = calcSqFlucts(modes1)
    elif isListLike(modes1):
        w1 = modes1
    else:
        raise TypeError('modes1 should be a profile or an NMA or ModeSet object')

    if isinstance(modes2, (NMA, ModeSet)):
        w2 = calcSqFlucts(modes2)
    elif isListLike(modes2):
        w2 = modes2
    else:
        raise TypeError('modes2 should be a profile or an NMA or ModeSet object')

    return np.dot(w1, w2)**2 / (np.dot(w1, w1) * np.dot(w2, w2))
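# Usage sketch (illustrative only; `anm_a` and `anm_b` are hypothetical
# models assumed to be computed elsewhere for the same atoms). The SIP is
# (w1.w2)^2 / ((w1.w1)(w2.w2)), so profiles of identical shape give 1.0:
#
#     sip = calcSquareInnerProduct(anm_a[:10], anm_b[:10])
#     calcSquareInnerProduct([1., 2., 3.], [1., 2., 3.])  # -> 1.0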
def calcEnsembleFunctionOverlaps(ens, **kwargs):
    """Calculate function overlaps for an ensemble as the mean of the value
    from :func:`calcDeepFunctionOverlaps`.

    :arg ens: an ensemble with labels
    :type ens: :class:`Ensemble`
    """
    if not isinstance(ens, Ensemble) and not isListLike(ens):
        raise TypeError('ens should be an ensemble or list-like')

    if isinstance(ens, Ensemble):
        ids = [label[:5] for label in ens.getLabels()]
    else:
        ids = ens
        if not isinstance(ids[0], str):
            raise TypeError('ens should have labels')

    goa_ens = queryGOA(ids, **kwargs)
    for entry in goa_ens:
        if len(entry._molecular) == 0:
            LOGGER.warn('ensemble member {0} has no molecular functions '
                        'and was omitted'.format(entry._title))

    goa_ens = [entry for entry in goa_ens if len(entry._molecular) > 0]

    overlaps = calcDeepFunctionOverlaps(*goa_ens, **kwargs)
    return overlaps
def _extend(self, arr, axis=None, defval=0):
    mask = self.mask
    if self.is3d():
        mask = np.repeat(mask, 3)
    n_true = np.sum(mask)
    N = len(mask)

    if axis is None:
        axes = [i for i in range(arr.ndim)]
    elif not isListLike(axis):
        axes = [axis]
    else:
        axes = axis

    shape = np.array(arr.shape)
    shape[axes] = N

    whole_array = np.empty(shape, dtype=arr.dtype)
    whole_array.fill(defval)

    # index lists: masked positions in the full array receive the values
    # from the corresponding positions of the smaller input array
    I = [np.arange(s) for s in shape]
    J = [np.arange(s) for s in arr.shape]
    for ax in axes:
        I[ax] = mask
        J[ax] = np.arange(n_true)

    whole_array[np.ix_(*I)] = arr[np.ix_(*J)]
    return whole_array
def parseCCD(ids):
    """Retrieve the whole Chemical Component Dictionary (CCD) resource."""
    if isListLike(ids):
        n_ids = len(ids)
    else:
        ids = [ids]
        n_ids = 1

    ret = []
    for id in ids:
        id_url = 'http://ligand-expo.rcsb.org/reports/{0}/{1}/{1}.cif'.format(
            id[0], id)
        try:
            handle = openURL(id_url)
        except Exception as err:
            LOGGER.warn('download failed ({0}).'.format(str(err)))
            # keep results aligned with the input ids
            ret.append(None)
        else:
            data = handle.read()
            if len(data):
                if PY3K:
                    data = data.decode()

                parsingDict, prog = parseSTARLines(data.split('\n'), shlex=True)

                star_dict = StarDict(parsingDict, prog, id)
                ret.append(star_dict[id])
            else:
                ret.append(None)
                LOGGER.warn('Could not parse CCD data for {0}'.format(id))

    if n_ids == 1:
        return ret[0]

    return ret
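# Usage sketch (hypothetical ligand IDs; network access required):
#
#     ccd = parseCCD('ATP')            # single entry, or None on failure
#     ccds = parseCCD(['ATP', 'GTP'])  # list of entries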
def setIndices(self, value):
    if not isListLike(value):
        raise TypeError('value must be a list or numpy.ndarray instance')

    array = asarray(value)
    if len(array) != self._n_atoms:
        raise ValueError('length mismatch between this ensemble ({0}) '
                         'and indices ({1})'.format(self._n_atoms, len(array)))

    self._indices = array
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by blastclust.
    They are available at FTP site: ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it after
    compressing in your home directory in :file:`.prody/pdbclusters`.
    Compressed files will be less than 4 Mb in size.  Cluster data can be
    loaded using :func:`loadPDBClusters` function and be accessed using
    :func:`listPDBCluster`."""

    if sqid is not None:
        if isListLike(sqid):
            for s in sqid:
                if s not in PDB_CLUSTERS:
                    raise ValueError('sqid must be one or more of ' +
                                     PDB_CLUSTERS_SQID_STR)
            keys = list(sqid)
        else:
            if sqid not in PDB_CLUSTERS:
                raise ValueError('sqid must be one or more of ' +
                                 PDB_CLUSTERS_SQID_STR)
            keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)

    LOGGER.progress('Downloading sequence clusters', len(keys),
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.'.format(x))
            continue
        else:
            out = openFile(filename + '.gz', 'w', folder=PDB_CLUSTERS_PATH)
            out.write(inp.read())
            inp.close()
            out.close()
            count += 1
        LOGGER.update(i, label='_prody_fetchPDBClusters')
    LOGGER.finish()

    if len(keys) == count:
        LOGGER.info('All selected PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warn('PDB clusters could not be downloaded.')
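# Usage sketch (downloads into ~/.prody/pdbclusters; the 90 used below is an
# assumed identity level and must be one of the values in PDB_CLUSTERS):
#
#     fetchPDBClusters()     # all identity levels
#     fetchPDBClusters(90)   # only the 90% identity clusters
#     clusters = loadPDBClusters(90)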
def setApix(self, apix):
    if not isListLike(apix):
        try:
            apix = [apix, apix, apix]
        except:
            raise TypeError('apix must be a single value or list-like')

    if len(apix) != 3:
        raise ValueError('apix must be a single value or 3 values')

    self._apix = apix
    self.Lx = apix[0] * self.NS
    self.Ly = apix[1] * self.NR
    self.Lz = apix[2] * self.NC
def alignByEnsemble(atomics, ensemble):
    """Align a set of :class:`.Atomic` objects using transformations from
    *ensemble*, which may be a :class:`.PDBEnsemble` or a
    :class:`.PDBConformation` instance.

    Transformations will be applied based on indices so *atomics* and
    *ensemble* must have the same number of members.

    :arg atomics: a set of :class:`.Atomic` objects to be aligned
    :type atomics: tuple, list, :class:`~numpy.ndarray`

    :arg ensemble: a :class:`.PDBEnsemble` or a :class:`.PDBConformation`
        from which transformations can be extracted
    :type ensemble: :class:`.PDBEnsemble`, :class:`.PDBConformation`
    """
    if not isListLike(atomics):
        raise TypeError('atomics must be list-like')

    if not isinstance(ensemble, (PDBEnsemble, PDBConformation)):
        raise TypeError('ensemble must be a PDBEnsemble or PDBConformation')

    if isinstance(ensemble, PDBConformation):
        ensemble = [ensemble]

    if len(atomics) != len(ensemble):
        raise ValueError('atomics and ensemble must have the same length')

    output = []
    for i, conf in enumerate(ensemble):
        trans = conf.getTransformation()
        if trans is None:
            raise ValueError('transformations are not calculated, call '
                             '`superpose` or `iterpose`')

        ag = atomics[i]
        if not isinstance(ag, Atomic):
            LOGGER.warning('No atomic object found for conformation {0}.'
                           .format(i))
            output.append(None)
            continue

        output.append(trans.apply(ag))

    if len(output) == 1:
        return output[0]
    else:
        return output
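# Usage sketch (hypothetical objects: `ens` is a PDBEnsemble that has been
# superposed so transformations exist, and `structures` are the Atomic
# objects it was built from, in the same order):
#
#     aligned = alignByEnsemble(structures, ens)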
def sliceModelByMask(model, mask, norm=False):
    """Returns a part of the *model* indicated by *mask*.  Note that normal
    modes (eigenvectors) are not normalized unless *norm* is **True**.

    :arg model: NMA model instance to be sliced
    :type model: :class:`.NMA`

    :arg mask: an Integer array or a Boolean array where ``"True"`` indicates
        the parts being selected
    :type mask: list, :class:`~numpy.ndarray`

    :arg norm: whether to normalize eigenvectors, default **False**
    :type norm: bool

    :returns: :class:`.NMA`"""

    if not isListLike(mask):
        raise TypeError('mask must be either a list or a numpy.ndarray, '
                        'not {0}'.format(type(mask)))

    # a plain list has no dtype, so convert before inspecting it
    mask = np.asarray(mask)
    is_bool = mask.dtype is np.dtype('bool')

    if is_bool:
        if len(mask) != model.numAtoms():
            raise ValueError('number of atoms in model and mask must be equal')
        which = mask
    else:
        if mask.min() < 0 or mask.max() >= model.numAtoms():
            raise ValueError('index in mask exceeds range')
        which = np.zeros(model.numAtoms(), dtype=bool)
        which[mask] = True

    array = model._getArray()

    nma = type(model)('{0} sliced'.format(model.getTitle()))
    if model.is3d():
        which = np.repeat(which, 3)
    evecs = array[which, :]
    if norm:
        evecs /= np.array([((evecs[:, i])**2).sum()**0.5
                           for i in range(evecs.shape[1])])

    nma.setEigens(evecs, model.getEigvals())
    return nma
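# Usage sketch (hypothetical `gnm` with modes already calculated): keep only
# the first 10 nodes using a boolean mask:
#
#     mask = np.zeros(gnm.numAtoms(), dtype=bool)
#     mask[:10] = True
#     sliced = sliceModelByMask(gnm, mask, norm=True)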
def __init__(self, items, element=None):
    if element is not None:
        tag = element.tag
        attrib = element.attrib
    else:
        tag = 'cath'
        attrib = {}

    super(CATHCollection, self).__init__(tag=tag, attrib=attrib)

    if not isListLike(items):
        items = [items]

    parents = []
    for item in items:
        self.append(item)
        parents.append(item.parent)

    uniq_parents = set(parents)
    if len(uniq_parents) == 1:
        self._parent = parents[0]
def __init__(self, parsingDict, prog, title='unnamed', indices=None):
    self._title = title
    self._dict = parsingDict
    self._prog = prog
    self._indices = indices

    if indices is None:
        self.dataBlocks = [StarDataBlock(self, key)
                           for key in self._dict.keys()]
    else:
        self.dataBlocks = []
        for idx in indices:
            if isListLike(idx):
                self.dataBlocks.append(StarDataBlock(self, idx[0], idx[1]))
            else:
                self.dataBlocks.append(StarDataBlock(self, idx))

        self._dict = OrderedDict()
        for i, idx in enumerate(indices):
            self._dict[idx[0]] = self.dataBlocks[i]._dict

    self.numDataBlocks = len(self.dataBlocks)
def fetchPDBs(*pdb, **kwargs):
    """Wrapper function to fetch multiple files from the PDB.
    If no format is given, it tries PDB then mmCIF then EMD.

    :arg pdb: one PDB identifier or filename, or a list of them.
        If needed, PDB files are downloaded using :func:`.fetchPDB()` function.
    """
    n_pdb = len(pdb)
    if n_pdb == 0:
        raise ValueError('Please provide a PDB ID or filename')

    if n_pdb == 1:
        if isListLike(pdb[0]):
            pdb = pdb[0]
            n_pdb = len(pdb)

    # pop format once so the same value applies to every entry
    format = kwargs.pop('format', None)

    fnames = []
    for p in pdb:
        if format is not None:
            filename = fetchPDB(p, format=format, **kwargs)
        else:
            filename = fetchPDB(p, **kwargs)
            if filename is None:
                filename = fetchPDB(p, format='cif', **kwargs)
                if filename is None:
                    filename = fetchPDB(p, format='emd', **kwargs)
        fnames.append(filename)

    return fnames
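# Usage sketch (hypothetical IDs; extra keyword arguments are passed on to
# fetchPDB):
#
#     fnames = fetchPDBs('1ubi', '2k39')
#     fnames = fetchPDBs(['1ubi', '2k39'], format='cif')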
def buildPDBEnsemble(atomics, ref=None, title='Unknown', labels=None,
                     atommaps=None, unmapped=None, **kwargs):
    """Builds a :class:`.PDBEnsemble` from a given reference structure and a
    list of structures (:class:`.Atomic` instances).  Note that the reference
    should be included in the list as well.

    :arg atomics: a list of :class:`.Atomic` instances
    :type atomics: list

    :arg ref: reference structure or the index to the reference in *atomics*.
        If **None**, then the first item in *atomics* will be considered as
        the reference.  If it is a :class:`.PDBEnsemble` instance, then
        *atomics* will be appended to the existing ensemble.
        Default is **None**
    :type ref: int, :class:`.Chain`, :class:`.Selection`, or :class:`.AtomGroup`

    :arg title: the title of the ensemble
    :type title: str

    :arg labels: labels of the conformations
    :type labels: list

    :arg degeneracy: whether only the active coordinate set (**True**) or all
        the coordinate sets (**False**) of each structure should be added to
        the ensemble. Default is **True**
    :type degeneracy: bool

    :arg occupancy: minimal occupancy of columns (range from 0 to 1). Columns
        whose occupancy is below this value will be trimmed
    :type occupancy: float

    :arg atommaps: labels of *atomics* that were mapped and added into the
        ensemble. This is an output argument
    :type atommaps: list

    :arg unmapped: labels of *atomics* that cannot be included in the
        ensemble. This is an output argument
    :type unmapped: list

    :arg subset: a subset for selecting particular atoms from the input
        structures. Default is ``"all"``
    :type subset: str

    :arg superpose: if set to ``'iter'``, :func:`.PDBEnsemble.iterpose` will
        be used to superpose the structures, otherwise conformations will be
        superposed with respect to the reference specified by *ref* unless
        set to ``False``. Default is ``'iter'``
    :type superpose: str, bool
    """

    occupancy = kwargs.pop('occupancy', None)
    degeneracy = kwargs.pop('degeneracy', True)
    subset = str(kwargs.get('subset', 'all')).lower()
    superpose = kwargs.pop('superpose', 'iter')
    superpose = kwargs.pop('iterpose', superpose)
    debug = kwargs.pop('debug', {})

    if 'mapping_func' in kwargs:
        raise DeprecationWarning('mapping_func is deprecated. Please see '
                                 'release notes for more details: '
                                 'http://prody.csb.pitt.edu/manual/release/v1.11_series.html')
    start = time.time()

    if not isListLike(atomics):
        raise TypeError('atomics should be list-like')

    if len(atomics) == 1 and degeneracy is True:
        raise ValueError('atomics should have at least two items')

    if labels is not None:
        if len(labels) != len(atomics):
            raise TypeError('labels and atomics must have the same length')
    else:
        labels = []
        for atoms in atomics:
            if atoms is None:
                labels.append(None)
            else:
                labels.append(atoms.getTitle())

    if ref is None:
        target = atomics[0]
    elif isinstance(ref, Integral):
        target = atomics[ref]
    elif isinstance(ref, PDBEnsemble):
        target = ref._atoms
    else:
        target = ref

    # initialize a PDBEnsemble with reference atoms and coordinates
    isrefset = False
    if isinstance(ref, PDBEnsemble):
        ensemble = ref
    else:
        # select the subset of reference beforehand for the sake of efficiency
        if subset != 'all':
            target = target.select(subset)

        ensemble = PDBEnsemble(title)
        if isinstance(target, Atomic):
            ensemble.setAtoms(target)
            ensemble.setCoords(target.getCoords())
            isrefset = True
        else:
            ensemble._n_atoms = len(target)
            isrefset = False

    # build the ensemble
    if unmapped is None:
        unmapped = []
    if atommaps is None:
        atommaps = []

    LOGGER.progress('Building the ensemble...', len(atomics),
                    '_prody_buildPDBEnsemble')
    for i, atoms in enumerate(atomics):
        if atoms is None:
            unmapped.append(labels[i])
            continue

        LOGGER.update(i, 'Mapping %s to the reference...' % atoms.getTitle(),
                      label='_prody_buildPDBEnsemble')
        try:
            atoms.getHierView()
        except AttributeError:
            raise TypeError('atomics must be a list of instances having the '
                            'access to getHierView')

        if subset != 'all':
            atoms = atoms.select(subset)

        # find the mapping of chains of atoms to those of target
        debug[labels[i]] = {}
        atommaps_ = alignChains(atoms, target, debug=debug[labels[i]], **kwargs)

        if len(atommaps_) == 0:
            unmapped.append(labels[i])
            continue
        else:
            atommaps.extend(atommaps_)

        # add the atommaps to the ensemble
        for atommap in atommaps_:
            lbl = pystr(labels[i])
            if len(atommaps_) > 1:
                chids = np.unique(atommap.getChids())
                strchids = ''.join(chids)
                lbl += '_%s' % strchids
            ensemble.addCoordset(atommap, weights=atommap.getFlags('mapped'),
                                 label=lbl, degeneracy=degeneracy)

            if not isrefset:
                ensemble.setCoords(atommap.getCoords())
                isrefset = True

    LOGGER.finish()

    if occupancy is not None:
        ensemble = trimPDBEnsemble(ensemble, occupancy=occupancy)

    if superpose == 'iter':
        ensemble.iterpose()
    elif superpose is not False:
        ensemble.superpose()

    LOGGER.info('Ensemble ({0} conformations) was built in {1:.2f}s.'
                .format(ensemble.numConfs(), time.time() - start))

    if unmapped:
        LOGGER.warn('{0} structures cannot be mapped.'.format(len(unmapped)))

    return ensemble
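# Usage sketch (hypothetical PDB IDs; parsePDB and buildPDBEnsemble are both
# defined in this section):
#
#     structures = parsePDB(['1ubi', '2k39'], subset='ca')
#     ens = buildPDBEnsemble(structures, title='ubiquitin', superpose='iter')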
def trimModelByMask(model, mask):
    """Returns a part of the *model* indicated by *mask*.  This method removes
    columns and rows in the connectivity matrix indicated by *mask* and fixes
    the diagonal sums.  Normal modes need to be calculated again after
    the trim.

    :arg model: NMA model instance to be trimmed
    :type model: :class:`.NMA`

    :arg mask: an Integer array or a Boolean array where ``"True"`` indicates
        the parts being selected
    :type mask: list, :class:`~numpy.ndarray`

    :returns: :class:`.NMA`"""

    if not isListLike(mask):
        raise TypeError('mask must be either a list or a numpy.ndarray, '
                        'not {0}'.format(type(mask)))

    # a plain list has no dtype, so convert before inspecting it
    mask = np.asarray(mask)
    is_bool = mask.dtype is np.dtype('bool')

    if is_bool:
        if len(mask) != model.numAtoms():
            raise ValueError('number of atoms in model and mask must be equal')
        which = mask
    else:
        if mask.min() < 0 or mask.max() >= model.numAtoms():
            raise ValueError('index in mask exceeds range')
        which = np.zeros(model.numAtoms(), dtype=bool)
        which[mask] = True

    if model.is3d():
        which = np.repeat(which, 3)

    if isinstance(model, GNM):
        matrix = model._kirchhoff
    elif isinstance(model, ANM):
        matrix = model._hessian
    elif isinstance(model, PCA):
        matrix = model._cov
    else:
        raise TypeError('model does not have a valid type derived from NMA')

    if isinstance(model, PCA):
        ss = matrix[which, :][:, which]
        eda = PCA(model.getTitle() + ' reduced')
        eda.setCovariance(ss)
        return eda
    else:
        matrix = matrix[which, :][:, which]

        if isinstance(model, GNM):
            gnm = GNM(model.getTitle() + ' reduced')
            # fix the diagonal so rows and columns still sum to zero
            I = np.eye(len(matrix), dtype=bool)
            matrix[I] = -(matrix.sum(axis=0) - np.diag(matrix))
            gnm.setKirchhoff(matrix)
            return gnm
        elif isinstance(model, ANM):
            model_type = type(model)
            anm = model_type(model.getTitle() + ' reduced')

            # rebuild each diagonal superelement from the off-diagonal blocks
            n = len(matrix) // 3
            for i in range(n):
                S = np.zeros((3, 3))
                for j in range(n):
                    if i == j:
                        continue
                    S -= matrix[i*3:i*3+3, j*3:j*3+3]
                matrix[i*3:i*3+3, i*3:i*3+3] = S

            anm.setHessian(matrix)
            if hasattr(anm, 'getMembrane'):
                anm._membrane = model.getMembrane()
                anm._combined = model.getCombined()
            return anm
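# Usage sketch (hypothetical `gnm`): trim the Kirchhoff matrix down to the
# first 20 nodes, then recompute modes as the docstring requires:
#
#     mask = np.arange(gnm.numAtoms()) < 20
#     trimmed = trimModelByMask(gnm, mask)
#     trimmed.calcModes()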
def parsePDB(*pdb, **kwargs):
    """Returns an :class:`.AtomGroup` and/or dictionary containing header data
    parsed from a PDB file.

    This function extends :func:`.parsePDBStream`.

    See :ref:`parsepdb` for a detailed usage example.

    :arg pdb: one PDB identifier or filename, or a list of them.
        If needed, PDB files are downloaded using :func:`.fetchPDB()` function.

    You can also provide arguments that you would like passed on to fetchPDB().
    """
    n_pdb = len(pdb)
    if n_pdb == 1:
        if isListLike(pdb[0]):
            pdb = pdb[0]
            n_pdb = len(pdb)

    if n_pdb == 1:
        return _parsePDB(pdb[0], **kwargs)
    else:
        results = []
        lstkwargs = {}
        for key in kwargs:
            argval = kwargs.get(key)
            if np.isscalar(argval):
                argval = [argval] * n_pdb
            lstkwargs[key] = argval

        start = time.time()
        LOGGER.progress('Retrieving {0} PDB structures...'.format(n_pdb),
                        n_pdb, '_prody_parsePDB')
        for i, p in enumerate(pdb):
            kwargs = {}
            for key in lstkwargs:
                kwargs[key] = lstkwargs[key][i]
            c = kwargs.get('chain', '')
            LOGGER.update(i, 'Retrieving {0}...'.format(p + c),
                          label='_prody_parsePDB')
            result = _parsePDB(p, **kwargs)
            if not isinstance(result, tuple):
                if isinstance(result, dict):
                    result = (None, result)
                else:
                    result = (result, None)
            results.append(result)

        results = list(zip(*results))
        LOGGER.finish()

        for i in reversed(range(len(results))):
            if all(j is None for j in results[i]):
                results.pop(i)

        if len(results) == 1:
            results = results[0]
        results = list(results)

        model = kwargs.get('model')
        header = kwargs.get('header', False)
        if model != 0 and header:
            numPdbs = len(results[0])
        else:
            numPdbs = len(results)

        LOGGER.info('{0} PDBs were parsed in {1:.2f}s.'.format(
            numPdbs, time.time() - start))

        return results
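# Usage sketch (hypothetical IDs; files are fetched if not found locally):
#
#     ag = parsePDB('1ubi')                 # single AtomGroup
#     ags = parsePDB(['1ubi', '2k39'])      # list of AtomGroups
#     ag, header = parsePDB('1ubi', header=True)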
def parseBIRD(*ids, **kwargs):
    """Parse data from the Biologically Interesting Molecule Reference
    Dictionary (BIRD) resource, which is updated every week. This includes
    2 kinds of keys, which can be selected with the **keys** keyword argument.

    The chemical information is found in a single CIF file at
    https://files.rcsb.org/pub/pdb/data/bird/prd/prd-all.cif.gz.
    This data will be downloaded and extracted to :file:`.prody/bird-prd`.

    Biological function information is also found in a single CIF file at
    https://files.rcsb.org/pub/pdb/data/bird/family/family-all.cif.gz.
    This data will be downloaded and extracted to :file:`.prody/bird-family`.

    Individual compounds can be selected using **ids**. If needed, BIRD files
    are downloaded using :func:`.fetchBIRDviaFTP` function. You can also
    provide arguments that you would like passed on to fetchBIRDviaFTP.

    :arg ids: one BIRD identifier (starting with PRD or FAM) or a list of them.
        If **None** is provided then all of them are returned.
    :type ids: str, tuple, list, :class:`~numpy.ndarray`, **None**

    :arg key: key specifying which data to fetch out of ``'prd'`` or
        ``'family'``. Default is ``'prd'``
    :type key: str

    Returns :class:`.StarDataBlock` object or list of them.
    """
    key = kwargs.get('key', 'prd')
    if not isinstance(key, str):
        raise TypeError("key should be a string")

    if key[:3].lower() == 'prd':
        key = 'prd'
    elif key[:3].lower() == 'fam':
        key = 'family'
    else:
        raise ValueError("key should be 'prd' or 'fam'")

    n_ids = len(ids)
    if n_ids == 1:
        if isListLike(ids[0]):
            ids = ids[0]
            n_ids = len(ids)

    if n_ids == 1:
        ids = list(ids)

    BIRD_PATH = os.path.join(getPackagePath(), 'bird')
    filename = BIRD_PATH + '/{0}-all.cif.gz'.format(key)
    if not os.path.isfile(filename):
        fetchBIRDviaFTP(keys=key, **kwargs)

    data = parseSTAR(filename, shlex=True)
    ret = []
    for id in ids:
        try:
            ret.append(data.search(id)[0])
        except ValueError:
            try:
                ret.append(data[id])
            except ValueError:
                LOGGER.warn('id {0} not found in {1} data '
                            'so appending None'.format(id, key))
                ret.append(None)

    if n_ids == 1:
        return ret[0]

    return ret
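# Usage sketch (the IDs below are hypothetical examples of the PRD/FAM
# formats; data is fetched via FTP on first use):
#
#     prd = parseBIRD('PRD_000001')
#     fams = parseBIRD('FAM_000001', 'FAM_000002', key='family')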
def writePIR(filename, msa, **kwargs):
    """A function to write PIR format alignments for use with MODELLER.

    :arg filename: The name of the file to be written including .ali
    :type filename: str

    :arg msa: a multiple sequence alignment in :class:`MSA` format
    :type msa: :class:`MSA` instance

    :arg chain_sep: chain separation character or list of them,
        default is '/'
    :type chain_sep: str, list

    :arg types: a list of strings for field 1, PIR types (Sequence or
        StructureX), default is all Sequence
    :type types: list

    :arg labels: a list of strings for field 2, sequence labels,
        default is to take them from msa
    :type labels: list

    :arg first_resnums: contents for field 3, residue number for the first
        residue. This should be a list of strings each having length 5,
        default is all 'FIRST'
    :type first_resnums: list

    :arg first_chains: contents for field 4, chain ID for the first residue.
        This should be a list of strings each having length 1,
        default is all '@'
    :type first_chains: list

    :arg last_resnums: contents for field 5, residue number for the last
        residue. This should be a list of strings each having length 5,
        default is all 'LAST '
    :type last_resnums: list

    :arg last_chains: contents for field 6, chain ID for the last residue.
        This should be a list of strings each having length 1,
        default is all ' '
    :type last_chains: list

    :arg protein_names: list of strings for field 7, default is all ''
    :type protein_names: list

    :arg protein_sources: list of strings for field 8, default is all ''
    :type protein_sources: list

    :arg resolutions: list of strings for field 9, default is all ''
    :type resolutions: list

    :arg r_factors: list of strings for field 10, default is all ''
    :type r_factors: list
    """
    msafile = open(filename, 'w')

    chain_sep = kwargs.get('chain_sep', '/')
    if isinstance(chain_sep, basestring):
        chain_sep = [chain_sep] * msa.numSequences()
    elif isListLike(chain_sep) and isinstance(chain_sep[0], basestring):
        if len(chain_sep) != msa.numSequences():
            raise ValueError('There should be an entry in chain_sep list '
                             'for each sequence in msa')
    else:
        raise TypeError('chain_sep should be a string or list of strings')

    types = kwargs.get('types', 'Sequence')
    if isinstance(types, basestring):
        types = [types] * msa.numSequences()
    elif isListLike(types) and isinstance(types[0], basestring):
        if len(types) != msa.numSequences():
            raise ValueError('There should be an entry in types list '
                             'for each sequence in msa')
    else:
        raise TypeError('types should be a string or list of strings')

    labels = kwargs.get('labels', None)
    if labels is None:
        labels = []
        for sequence in msa:
            labels.append(sequence.getLabel())
    elif isListLike(labels) and isinstance(labels[0], basestring):
        if len(labels) != msa.numSequences():
            raise ValueError('There should be an entry in labels list '
                             'for each sequence in msa')
    else:
        raise TypeError('labels should be a string or list of strings')

    first_resnums = kwargs.get('first_resnums', 'FIRST')
    if isinstance(first_resnums, basestring) and len(first_resnums) == 5:
        first_resnums = [first_resnums] * msa.numSequences()
    elif isListLike(first_resnums) and isinstance(first_resnums[0], basestring):
        if len(first_resnums) != msa.numSequences():
            raise ValueError('There should be an entry in first_resnums list '
                             'for each sequence in msa')
    else:
        raise TypeError('first_resnums should be a string of length 5 '
                        'or list of them')

    first_chains = kwargs.get('first_chains', '@')
    if isinstance(first_chains, basestring) and len(first_chains) == 1:
        first_chains = [first_chains] * msa.numSequences()
    elif isListLike(first_chains) and isinstance(first_chains[0], basestring):
        if len(first_chains) != msa.numSequences():
            raise ValueError('There should be an entry in first_chains list '
                             'for each sequence in msa')
    else:
        raise TypeError('first_chains should be a string of length 1 '
                        'or list of them')

    last_resnums = kwargs.get('last_resnums', 'LAST ')
    if isinstance(last_resnums, basestring) and len(last_resnums) == 5:
        last_resnums = [last_resnums] * msa.numSequences()
    elif isListLike(last_resnums) and isinstance(last_resnums[0], basestring):
        if len(last_resnums) != msa.numSequences():
            raise ValueError('There should be an entry in last_resnums list '
                             'for each sequence in msa')
    else:
        raise TypeError('last_resnums should be a string of length 5 '
                        'or list of them')

    last_chains = kwargs.get('last_chains', ' ')
    if isinstance(last_chains, basestring) and len(last_chains) == 1:
        last_chains = [last_chains] * msa.numSequences()
    elif isListLike(last_chains) and isinstance(last_chains[0], basestring):
        if len(last_chains) != msa.numSequences():
            raise ValueError('There should be an entry in last_chains list '
                             'for each sequence in msa')
    else:
        raise TypeError('last_chains should be a string of length 1 '
                        'or list of them')

    protein_names = kwargs.get('protein_names', '')
    if isinstance(protein_names, basestring):
        protein_names = [protein_names] * msa.numSequences()
    elif isListLike(protein_names) and isinstance(protein_names[0], basestring):
        if len(protein_names) != msa.numSequences():
            raise ValueError('There should be an entry in protein_names list '
                             'for each sequence in msa')
    else:
        raise TypeError('protein_names should be a string or list of strings')

    protein_sources = kwargs.get('protein_sources', '')
    if isinstance(protein_sources, basestring):
        protein_sources = [protein_sources] * msa.numSequences()
    elif isListLike(protein_sources) and isinstance(protein_sources[0], basestring):
        if len(protein_sources) != msa.numSequences():
            raise ValueError('There should be an entry in protein_sources '
                             'list for each sequence in msa')
    else:
        raise TypeError('protein_sources should be a string or list of strings')

    resolutions = kwargs.get('resolutions', '')
    if isinstance(resolutions, basestring):
        resolutions = [resolutions] * msa.numSequences()
    elif isListLike(resolutions) and isinstance(resolutions[0], basestring):
        if len(resolutions) != msa.numSequences():
            raise ValueError('There should be an entry in resolutions list '
                             'for each sequence in msa')
    else:
        raise TypeError('resolutions should be a string or list of strings')

    r_factors = kwargs.get('r_factors', '')
    if isinstance(r_factors, basestring):
        r_factors = [r_factors] * msa.numSequences()
    elif isListLike(r_factors) and isinstance(r_factors[0], basestring):
        if len(r_factors) != msa.numSequences():
            raise ValueError('There should be an entry in r_factors list '
                             'for each sequence in msa')
    else:
        raise TypeError('r_factors should be a string or list of strings')

    for i, sequence in enumerate(msa):
        sequence = str(sequence).replace(chain_sep[i], '/')
        msafile.write('>P1;' + labels[i] + '\n')
        msafile.write(types[i] + ':' + labels[i] + ':')
        msafile.write(first_resnums[i] + ':' + first_chains[i] + ':')
        msafile.write(last_resnums[i] + ':' + last_chains[i] + ':')
        msafile.write(protein_names[i] + ':' + protein_sources[i] + ':')
        msafile.write(resolutions[i] + ':' + r_factors[i])
        msafile.write('\n')

        # write the sequence in 60-character lines; integer division is
        # required under Python 3, and the remainder line carries the
        # PIR terminator
        n_chunks = len(sequence) // 60
        for j in range(n_chunks):
            msafile.write(sequence[j*60:(j+1)*60] + '\n')
        msafile.write(sequence[n_chunks*60:] + '*\n\n')

    msafile.close()
    return
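# Usage sketch (hypothetical `msa` loaded elsewhere, e.g. with parseMSA;
# writes a two-entry PIR alignment for MODELLER):
#
#     writePIR('alignment.ali', msa, types=['StructureX', 'Sequence'])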
def scanPockets(self):
    """Generates ESSA z-scores for pockets and parses pocket features.
    It requires both Fpocket 3.0 and Pandas being installed in your system."""

    from re import findall

    fpocket = which('fpocket')

    if fpocket is None:
        LOGGER.warning('Fpocket (version >= 3.0) was not found, '
                       'please install it.')
        return None

    try:
        from pandas import Index, DataFrame
    except ImportError as ie:
        LOGGER.warning(ie.__str__() + ' was found, please install it.')
        return None

    # map (chain ID, residue number) pairs to residue indices
    rcr = {(i, j): k if self._rib else self._ri[k]
           for i, j, k in zip(self._ca.getChids(),
                              self._ca.getResnums(),
                              self._ca.getResindices())}

    writePDB('{}_pro'.format(self._title), self._heavy)

    direc = '{}_pro_out'.format(self._title)
    if not isdir(direc):
        system('fpocket -f {}_pro.pdb'.format(self._title))

    chdir(direc + '/pockets')
    l = [x for x in listdir('.') if x.endswith('.pdb')]
    l.sort(key=lambda x: int(x.partition('_')[0][6:]))

    # parse feature names and scores from the fpocket header of each pocket
    ps = []
    for x in l:
        with open(x, 'r') as f:
            tmp0 = f.read()
            tmp1 = [(x[1].strip(), float(x[2])) for x in findall(
                r'(\w+\s\w+\s*-\s*)(.+):\s*([\d.-]+)(\n)', tmp0)]
        fea, sco = list(zip(*tmp1))
        ps.append(sco)
    pdbs = parsePDB(l)
    if not isListLike(pdbs):
        pdbs = [pdbs]
    chdir('../..')

    # -----
    ps = array(ps)
    pcn = {int(pdb.getTitle().partition('_')[0][6:]):
           set(zip(pdb.getChids().tolist(), pdb.getResnums().tolist()))
           for pdb in pdbs}
    pi = {p: [rcr[x] for x in crn] for p, crn in pcn.items()}

    pzs_max = {k: max(self._zscore[v]) for k, v in pi.items()}
    pzs_med = {k: median(self._zscore[v]) for k, v in pi.items()}

    # -----
    indices = Index(range(1, ps.shape[0] + 1), name='Pocket #')
    columns = Index(fea, name='Feature')
    self._df = DataFrame(index=indices, columns=columns, data=ps)

    # -----
    columns_zs = Index(['ESSA_max', 'ESSA_med', 'LHD'], name='Z-score')
    zps = c_[list(pzs_max.values())]
    zps = hstack((zps, c_[list(pzs_med.values())]))
    zps = hstack((zps, zscore(self._df[['Local hydrophobic density Score']])))

    self._df_zs = DataFrame(index=indices, columns=columns_zs, data=zps)
def fetchBIRDviaFTP(**kwargs):
    """Retrieve the whole Biologically Interesting Molecule Reference
    Dictionary (BIRD) resource, which is updated every week. This includes
    2 kinds of keys, which can be selected with the **keys** keyword argument.

    The chemical information is found in a zipped (tar.gz) directory at
    https://files.rcsb.org/pub/pdb/data/bird/prd/prd-all.cif.gz, which
    contains individual CIF files within it.
    This data will be downloaded and extracted to :file:`.prody/bird-prd`.

    Biological function information is also found in a zipped (tar.gz)
    directory at
    https://files.rcsb.org/pub/pdb/data/bird/family/family-all.cif.gz,
    which contains individual CIF files within it.
    This data will be downloaded and extracted to :file:`.prody/bird-family`.

    :arg keys: keys specifying which data to fetch out of ``'prd'``,
        ``'family'`` or ``'both'``. Default is ``'both'``
    :type keys: str, tuple, list, :class:`~numpy.ndarray`

    The underlying data can be accessed using :func:`parseBIRD`."""

    BIRD_PATH = os.path.join(getPackagePath(), 'bird')

    keys = kwargs.get('keys', 'both')
    if isinstance(keys, str):
        if keys == 'both':
            keys = ['prd', 'family']
        elif keys[:3].lower() == 'prd':
            keys = ['prd']
        elif keys[:3].lower() == 'fam':
            keys = ['family']
        else:
            raise ValueError("keys should be 'both', 'prd' or 'fam'")
    elif isListLike(keys):
        keys = list(keys)
    else:
        raise TypeError("keys should be list-like or string")

    ftp_divided = 'pdb/data/bird/'
    ftp_pdbext = '.cif.gz'
    ftp_prefix = ''

    if not os.path.isdir(BIRD_PATH):
        os.mkdir(BIRD_PATH)

    LOGGER.progress('Downloading BIRD', len(keys), '_prody_fetchBIRD')

    ftp_name, ftp_host, ftp_path = WWPDB_FTP_SERVERS[wwPDBServer() or 'us']
    LOGGER.debug('Connecting wwPDB FTP server {0}.'.format(ftp_name))

    from ftplib import FTP
    try:
        ftp = FTP(ftp_host)
    except Exception as error:
        raise type(error)('FTP connection problem, potential reason: '
                          'no internet connectivity')
    else:
        count = 0
        success = 0
        failure = 0
        filenames = []
        ftp.login('')

        for i, x in enumerate(keys):
            data = []
            ftp_fn = ftp_prefix + '{0}-all'.format(x) + ftp_pdbext
            try:
                ftp.cwd(ftp_path)
                ftp.cwd(ftp_divided)
                ftp.cwd(x)
                ftp.retrbinary('RETR ' + ftp_fn, data.append)
            except Exception as error:
                if ftp_fn in ftp.nlst():
                    LOGGER.warn('{0} download failed ({1}). It is '
                                'possible that you do not have rights to '
                                'download .gz files in the current network.'
                                .format(x, str(error)))
                else:
                    LOGGER.info('{0} download failed. {1} does not exist '
                                'on {2}.'.format(ftp_fn, x, ftp_host))
                failure += 1
                filenames.append(None)
            else:
                if len(data):
                    filename = BIRD_PATH + '/{0}-all.cif.gz'.format(x)

                    with open(filename, 'w+b') as outfile:
                        for block in data:
                            outfile.write(block)

                    success += 1
                else:
                    failure += 1
            count += 1
            LOGGER.update(i, label='_prody_fetchBIRD')
        LOGGER.finish()

    LOGGER.debug('BIRD download via FTP completed ({0} downloaded, '
                 '{1} failed).'.format(success, failure))
def reduceModelByMask(model, mask):
    """Returns NMA model reduced based on *mask*.

    :arg model: dynamics model
    :type model: :class:`.ANM`, :class:`.GNM`, or :class:`.PCA`

    :arg mask: an Integer array or a Boolean array where ``"True"`` indicates
        the parts being selected
    :type mask: list, :class:`~numpy.ndarray`

    :returns: :class:`.NMA`"""

    if not isinstance(model, NMA):
        raise TypeError('model must be an NMA instance, not {0}'
                        .format(type(model)))
    if not isListLike(mask):
        raise TypeError('mask must be either a list or a numpy.ndarray, '
                        'not {0}'.format(type(mask)))

    # a plain list has no dtype, so convert before inspecting it
    mask = np.asarray(mask)
    is_bool = mask.dtype is np.dtype('bool')

    if is_bool:
        if len(mask) != model.numAtoms():
            raise ValueError('number of atoms in model and mask must be equal')
        system = mask
    else:
        if mask.min() < 0 or mask.max() >= model.numAtoms():
            raise ValueError('index in mask exceeds range')
        system = np.zeros(model.numAtoms(), dtype=bool)
        system[mask] = True

    if isinstance(model, GNM):
        matrix = model._kirchhoff
    elif isinstance(model, ANM):
        matrix = model._hessian
    elif isinstance(model, PCA):
        matrix = model._cov
    else:
        raise TypeError('model does not have a valid type derived from NMA')
    if matrix is None:
        raise ValueError('model matrix (Hessian/Kirchhoff/Covariance) is '
                         'not built')

    if model.is3d():
        system = np.repeat(system, 3)

    if isinstance(model, PCA):
        ss = matrix[system, :][:, system]
        eda = PCA(model.getTitle() + ' reduced')
        eda.setCovariance(ss)
        return eda
    else:
        matrix = _reduceModel(matrix, system)

        if isinstance(model, GNM):
            gnm = GNM(model.getTitle() + ' reduced')
            gnm.setKirchhoff(matrix)
            return gnm
        elif isinstance(model, ANM):
            anm = ANM(model.getTitle() + ' reduced')
            anm.setHessian(matrix)
            return anm
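# Usage sketch (hypothetical `anm` with its Hessian built): reduce the model
# to the first 30 residues and recompute modes:
#
#     mask = np.arange(anm.numAtoms()) < 30
#     reduced = reduceModelByMask(anm, mask)
#     reduced.calcModes()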
def __init__(self, starDict, key, indices=None):
    self._title = key
    self._prog = starDict._prog
    self._starDict = starDict

    if indices is None:
        try:
            self._dict = starDict._dict[key]
        except:
            self._dict = list(starDict._dict)[key]
        keys = list(self._dict.keys())
    else:
        keys = [idx[0] for idx in indices]
        self._dict = OrderedDict()
        self._dict['data'] = OrderedDict()
        self._dict['fields'] = OrderedDict()

        for idx in indices:
            if idx[0] == 'data':
                self._dict[idx[0]][idx[1]] = \
                    starDict._dict[self._title][idx[0]][idx[1]]

                if not 'fields' in keys:
                    for k, v in self._starDict._dict[self._title]['fields'].items():
                        if v == idx[1]:
                            self._dict['fields'][k] = v
            else:
                self._dict[idx[0]] = OrderedDict()
                self._dict[idx[0]]['fields'] = \
                    starDict._dict[self._title][idx[0]]['fields']
                self._dict[idx[0]]['data'] = OrderedDict()
                for id1 in idx[1]:
                    self._dict[idx[0]]['data'][id1] = \
                        starDict._dict[self._title][idx[0]]['data'][id1]

    if set(keys) == set(['data', 'fields']):
        self.loops = []
        self.numLoops = 0

        self.data = np.array(list(self._dict['data'].values()))
        self.fields = np.array(list(self._dict['fields'].values()))

        if not isListLike(self.data):
            self.data = [self.data]
        if not isListLike(self.fields):
            self.fields = [self.fields]

        self.numEntries = len(self.data)
        self.numFields = len(self.fields)

    elif 'data' in keys and 'fields' in keys:
        if indices is not None:
            self.loops = [StarLoop(self, key, idx) for (key, idx) in indices
                          if key not in ['data', 'fields']]
        else:
            self.loops = [StarLoop(self, key) for key in keys
                          if key not in ['data', 'fields']]

        self.data = np.array(list(self._dict['data'].values()))
        self.fields = np.array(list(self._dict['fields'].values()))

        if not isListLike(self.data):
            self.data = [self.data]
        if not isListLike(self.fields):
            self.fields = [self.fields]

        self.numEntries = len(self.data)
        self.numFields = len(self.fields)
        self.numLoops = len(self.loops)

    elif 'data' in keys:
        if indices is not None:
            self.loops = [StarLoop(self, key, idx) for (key, idx) in indices
                          if key != 'data']
        else:
            self.loops = [StarLoop(self, key) for key in keys
                          if key != 'data']

        self.data = np.array(list(self._dict['data'].values()))
        self.fields = np.array(list(self._dict['fields'].values()))

        if not isListLike(self.data):
            self.data = [self.data]
        if not isListLike(self.fields):
            self.fields = [self.fields]

        self.numLoops = len(self.loops)
        self.numEntries = len(self.data)
        self.numFields = 0

    elif 'fields' in keys:
        if indices is not None:
            self.loops = [StarLoop(self, key, idx) for (key, idx) in indices
                          if key != 'fields']
        else:
            self.loops = [StarLoop(self, key) for key in keys
                          if key != 'fields']

        self.data = np.array(list(self._dict['data'].values()))
        self.fields = np.array(list(self._dict['fields'].values()))

        if not isListLike(self.data):
            self.data = [self.data]
        if not isListLike(self.fields):
            self.fields = [self.fields]

        self.numLoops = len(self.loops)
        self.numEntries = len(self.data)
        self.numFields = 0

    else:
        if indices is not None:
            self.loops = [StarLoop(self, key, idx) for (key, idx) in indices]
        else:
            self.loops = [StarLoop(self, key) for key in keys]

        self.numLoops = len(self.loops)
        self.numEntries = 0
        self.numFields = 0
def queryGOA(*ids, **kwargs):
    """Query a GOA database by identifier.

    :arg ids: an identifier or a list-like of identifiers
    :type ids: str, tuple, list, :class:`~numpy.ndarray`

    :arg database: name of the database of interest,
        default is PDB. Others include UNIPROT and
        common names of many organisms.
    :type database: str
    """
    database = kwargs.pop('database', 'PDB')

    gaf_dict = kwargs.pop('gaf_dict', None)
    if gaf_dict is None:
        gaf_dict = parseGAF(database=database, **kwargs)
        LOGGER.info('GAF parsing completed.')

    n_ids = len(ids)
    if n_ids == 1:
        if isListLike(ids[0]):
            ids = ids[0]
            n_ids = len(ids)

    if n_ids == 1:
        ids = list(ids)

    results = []
    unmapped = []
    LOGGER.progress('Querying GOA for {0} ids...'.format(n_ids),
                    n_ids, '_prody_queryGOA')
    for i, id in enumerate(ids):
        LOGGER.update(i, 'Querying GOA for id {0} of {1}...'
                      .format(i + 1, n_ids), label='_prody_queryGOA')
        if not isinstance(id, str):
            raise TypeError('each ID should be a string')

        id = id.upper()

        if database == 'PDB':
            if not len(id) in [4, 5, 6]:
                raise ValueError('PDB IDs should be strings of length 4 to 6')

            if len(id) == 5 and str.isalpha(id[-1]):
                id = id[:4] + '_' + id[-1]

        if id in list(gaf_dict.keys()):
            results.append(gaf_dict[id])
        else:
            results.append([])
            unmapped.append(id)

    rets = []
    LOGGER.progress('Mapping GO terms back to GOA results for {0} ids...'
                    .format(n_ids), n_ids, '_prody_mapGO')
    for i, result in enumerate(results):
        LOGGER.update(i, 'Mapping GO terms back to GOA results id {0} of {1}...'
                      .format(i + 1, n_ids), label='_prody_mapGO')
        rets.append(GOADictList(result, title=ids[i], **kwargs))

    if n_ids == 1:
        rets = rets[0]

    return rets
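# Usage sketch (hypothetical IDs; a GAF file is parsed on first call, which
# can take a while):
#
#     annotations = queryGOA('1ubi', database='PDB')
#     many = queryGOA(['1ubi', '2k39'])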
def calcGoOverlap(*go_terms, **kwargs):
    """Calculate overlap between GO terms based on their distance in the
    graph. GO terms in different namespaces (molecular function, cellular
    component, and biological process) have undefined distances.

    :arg go_terms: a list of GO terms or GO IDs
    :type go_terms: list, tuple, :class:`~numpy.ndarray`

    :arg pairwise: whether to calculate a matrix of pairwise overlaps,
        default is False
    :type pairwise: bool

    :arg distance: whether to return distances rather than calculating
        overlaps, default is False
    :type distance: bool

    :arg go: GO graph. Default behaviour is to parse it with :func:`.parseOBO`.
    :type go: :class:`~goatools.obo_parser.GODag`
    """
    pairwise = kwargs.pop('pairwise', False)
    distance = kwargs.get('distance', False)
    operator = kwargs.get('operator', None)

    go = kwargs.get('go', None)
    if go is None:
        go = parseOBO(**kwargs)

    if not isListLike(go_terms):
        raise TypeError('please provide a list-like of go terms')

    if pairwise:
        distances = np.zeros((len(go_terms), len(go_terms)))
        for i, go_terms_i in enumerate(go_terms):
            for j, go_terms_j in enumerate(go_terms):
                distances[i, j] = calcGoOverlap(go_terms_i, go_terms_j,
                                                pairwise=False, **kwargs)
    else:
        go_terms1 = go_terms[0]

        flattened_term_list = []
        for entry in go_terms[1:]:
            if isListLike(entry):
                flattened_term_list.extend(entry)
            else:
                flattened_term_list.append(entry)

        if not isListLike(go_terms1):
            go_terms1 = [go_terms1]

        if not isListLike(flattened_term_list):
            flattened_term_list = [flattened_term_list]

        try:
            flattened_term_list = [go[term] for term in flattened_term_list]
            go_terms1 = [go[term] for term in go_terms1]
        except:
            try:
                flattened_term_list = [term.id for term in flattened_term_list]
                go_terms1 = [term.id for term in go_terms1]
            except:
                raise TypeError('go_terms should contain go terms or IDs')

        # normalize everything to GO ID strings before measuring distances;
        # reassigning the loop variable, as the original code did, had no effect
        flattened_term_list = [term if isinstance(term, str) else term.id
                               for term in flattened_term_list]
        go_terms1 = [term if isinstance(term, str) else term.id
                     for term in go_terms1]

        distances = np.zeros((len(go_terms1), len(flattened_term_list)))
        for i, go_id1 in enumerate(go_terms1):
            for j, go_id2 in enumerate(flattened_term_list):
                distances[i, j] = calcMinBranchLength(go_id1, go_id2, go)

    if operator is not None and isListLike(distances):
        distances = operator(distances)

    if operator is None:
        if distances.shape[-1] == 1:
            distances = distances.flatten()

        if distances.shape == (1,):
            distances = distances[0]

    if distance:
        return distances
    else:
        return 1. / distances
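# Usage sketch (the GO IDs below are hypothetical examples; requires
# goatools and an OBO file, which parseOBO fetches by default):
#
#     overlap = calcGoOverlap('GO:0005524', 'GO:0016887')
#     dists = calcGoOverlap(['GO:0005524', 'GO:0016887'],
#                           pairwise=True, distance=True)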