def matchModes(*modesets, **kwargs):
    """Returns the matches of modes among *modesets*. Note that the first
    modeset will be treated as the reference so that only the matching of
    each modeset to the first modeset is guaranteed to be optimal.

    :arg index: if `True` then indices of modes will be returned instead of
        :class:`Mode` instances. The flag is forwarded to :func:`pairModes`.
    :type index: bool

    :returns: *modesets* itself (a tuple) when a single modeset is given;
        otherwise a list holding the reference modeset followed by the
        second output of :func:`pairModes` for every other modeset
    :raises ValueError: if no modeset is given
    """

    # Validate the argument count before indexing ``modesets[0]``; the
    # previous ordering raised IndexError and never reached this ValueError.
    n_sets = len(modesets)
    if n_sets == 0:
        raise ValueError('at least one modeset should be given')
    if n_sets == 1:
        return modesets

    index = kwargs.pop('index', False)

    modeset0 = modesets[0]
    n_modes = len(modeset0)
    ret = [modeset0]

    LOGGER.progress('Matching {0} modes across {1} modesets...'
                    .format(n_modes, n_sets), n_sets, '_prody_matchModes')
    for i, modeset in enumerate(modesets):
        LOGGER.update(i, label='_prody_matchModes')
        if i > 0:
            # only the reordered second modeset is kept
            _, reordered_modeset = pairModes(modeset0, modeset, index=index)
            ret.append(reordered_modeset)
    LOGGER.finish()

    return ret
def _superpose(self, **kwargs):
    """Superpose conformations and update coordinates.

    Every conformation in ``self._confs`` is fitted onto ``self._coords``
    and overwritten in place.  When ``self._indices`` is set, the fit is
    computed from the indexed atoms only but the resulting transformation
    is applied to the whole conformation.  When ``self._weights`` is set,
    centers and the correlation matrix are weighted.  *kwargs* is accepted
    but not used here.
    """

    indices = self._indices
    weights = self._weights
    mobs = self._confs
    if indices is None:
        idx = False
        tar = self._coords
        movs = None
    else:
        idx = True
        if self._weights is not None:
            weights = weights[indices]
        tar = self._coords[indices]
        # full conformations that receive the transformation
        movs = self._confs

    linalg = importLA()
    svd = linalg.svd
    det = linalg.det

    if weights is None:
        tar_com = tar.mean(0)
        tar_org = (tar - tar_com)
        # scratch buffer reused for every centered mobile conformation
        mob_org = zeros(tar_org.shape, dtype=mobs.dtype)
        # transposed once so that dot(tar_org, mob_org) is the correlation
        # matrix -- assumes coordinates are (n_atoms, 3), TODO confirm
        tar_org = tar_org.T
    else:
        weights_sum = weights.sum()
        weights_dot = dot(weights.T, weights)
        tar_com = (tar * weights).sum(axis=0) / weights_sum
        tar_org = (tar - tar_com)
        mob_org = zeros(tar_org.shape, dtype=mobs.dtype)

    LOGGER.progress('Superposing ', len(mobs), '_prody_ensemble')
    for i, mob in enumerate(mobs):
        if idx:
            mob = mob[indices]
        if weights is None:
            mob_com = mob.mean(0)
            # subtract() writes the centered coordinates into mob_org
            matrix = dot(tar_org, subtract(mob, mob_com, mob_org))
        else:
            mob_com = (mob * weights).sum(axis=0) / weights_sum
            subtract(mob, mob_com, mob_org)
            matrix = dot((tar_org * weights).T,
                         (mob_org * weights)) / weights_dot

        U, s, Vh = svd(matrix)
        # last diagonal entry carries the sign of det(matrix) so that the
        # product below is a rotation rather than a reflection
        Id = array([[1, 0, 0], [0, 1, 0], [0, 0, sign(det(matrix))]])
        rotation = dot(Vh.T, dot(Id, U.T))

        if movs is None:
            # rotate the centered coordinates, then shift onto the target
            # center, writing the result back into the ensemble
            mobs[i] = dot(mob_org, rotation)
            add(mobs[i], tar_com, mobs[i])
        else:
            # apply rotation and translation to the complete conformation,
            # in place
            add(dot(movs[i], rotation),
                (tar_com - dot(mob_com, rotation)), movs[i])
        LOGGER.update(i + 1, label='_prody_ensemble')
    LOGGER.finish()
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by
    blastclust.  They are available at FTP site:
    ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it after
    compressing in your home directory in :file:`.prody/pdbclusters`.
    Compressed files will be less than 4 Mb in size.  Cluster data can
    be loaded using :func:`loadPDBClusters` function and be accessed
    using :func:`listPDBCluster`.

    :arg sqid: one sequence identity level, or a list-like of levels, to
        download; every level must be in ``PDB_CLUSTERS``.  All levels are
        downloaded when **None**
    :raises ValueError: if a requested level is not in ``PDB_CLUSTERS``
    """

    if sqid is not None:
        if isListLike(sqid):
            for s in sqid:
                if s not in PDB_CLUSTERS:
                    raise ValueError('sqid must be one or more of ' +
                                     PDB_CLUSTERS_SQID_STR)
            keys = list(sqid)
        else:
            if sqid not in PDB_CLUSTERS:
                raise ValueError('sqid must be one or more of ' +
                                 PDB_CLUSTERS_SQID_STR)
            keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)

    LOGGER.progress('Downloading sequence clusters', len(keys),
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.'.format(x))
        else:
            # close both handles even when opening or writing the output
            # fails, so a failed download does not leak them
            try:
                out = openFile(filename + '.gz', 'w',
                               folder=PDB_CLUSTERS_PATH)
                try:
                    out.write(inp.read())
                finally:
                    out.close()
            finally:
                inp.close()
            count += 1
        # advance the progress report for failed downloads as well
        LOGGER.update(i, label='_prody_fetchPDBClusters')
    LOGGER.finish()

    if len(keys) == count:
        LOGGER.info('All selected PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warn('PDB clusters could not be downloaded.')
    else:
        # partial success used to end without any summary
        LOGGER.warn('{0} of {1} selected PDB clusters could not be '
                    'downloaded.'.format(len(keys) - count, len(keys)))
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by
    blastclust.  They are available at FTP site:
    ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it after
    compressing in your home directory in :file:`.prody/pdbclusters`.
    Compressed files will be less than 4 Mb in size.  Cluster data can
    be loaded using :func:`loadPDBClusters` function and be accessed
    using :func:`listPDBCluster`.

    :arg sqid: a single sequence identity level from ``PDB_CLUSTERS`` to
        download; all levels are downloaded when **None**
    :raises ValueError: if *sqid* is not in ``PDB_CLUSTERS``
    """

    if sqid is not None:
        if sqid not in PDB_CLUSTERS:
            raise ValueError('sqid must be one of ' + PDB_CLUSTERS_SQID_STR)
        keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)

    LOGGER.progress('Downloading sequence clusters', len(keys),
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            # the placeholder was previously never filled in
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.'.format(x))
            continue
        else:
            # release both handles even if opening or writing fails
            try:
                out = openFile(filename + '.gz', 'w',
                               folder=PDB_CLUSTERS_PATH)
                try:
                    out.write(inp.read())
                finally:
                    out.close()
            finally:
                inp.close()
            count += 1
        LOGGER.update(i, label='_prody_fetchPDBClusters')
    LOGGER.finish()

    # compare against what was requested, not against every known level,
    # so a successful single-level download is reported as a success
    if len(keys) == count:
        LOGGER.info('All PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warn('PDB clusters could not be downloaded.')
def parseChainsList(filename):
    """Parse a set of PDBs and extract chains based on a list in a text file.

    Each non-blank line is split on whitespace: the first field has the form
    ``<pdb_id>_<subset>`` and the second field is the chain identifier.  A
    PDB is parsed once, with the subset given on the first line mentioning
    it; later lines with the same identifier reuse that structure.

    :arg filename: the name of the file to be read
    :type filename: str

    Returns: lists containing an :class:`.AtomGroup` for each PDB, the
    headers for those PDBs, and the requested :class:`.Chain` objects
    """

    with open(filename, 'r') as fi:
        lines = fi.readlines()

    # pdb_id -> parsed structure, so that a chain is always looked up in
    # its own PDB and not in whichever structure was parsed last
    parsed = {}
    ags = []
    headers = []
    chains = []
    num_lines = len(lines)

    LOGGER.progress('Starting', num_lines, '_prody_parseChainsList')
    for i, line in enumerate(lines):
        LOGGER.update(i, 'Parsing lines...', label='_prody_parseChainsList')
        fields = line.split()
        if not fields:
            # tolerate blank lines instead of failing with IndexError
            continue
        pdb_id = fields[0].split('_')[0]
        if pdb_id not in parsed:
            ag, header = parsePDB(pdb_id, compressed=False,
                                  subset=fields[0].split('_')[1],
                                  header=True)
            parsed[pdb_id] = ag
            ags.append(ag)
            headers.append(header)

        chains.append(parsed[pdb_id].getHierView()[fields[1]])

    LOGGER.finish()
    LOGGER.info('{0} PDBs have been parsed and {1} chains have been '
                'extracted.'.format(len(ags), len(chains)))

    return ags, headers, chains
def parseChainsList(filename):
    """Parse a set of PDBs and extract chains based on a list in a text file.

    Lines are whitespace-separated.  The first field is
    ``<pdb_id>_<subset>`` and the second field is the chain identifier to
    extract.  Blank lines are ignored.  Each PDB identifier is parsed only
    once (using the subset of the first line that names it).

    :arg filename: the name of the file to be read
    :type filename: str

    Returns: lists containing an :class:`.AtomGroup` for each PDB, the
    headers for those PDBs, and the requested :class:`.Chain` objects
    """

    with open(filename, 'r') as stream:
        lines = stream.readlines()

    ags, headers, chains = [], [], []
    # remembers the structure belonging to each identifier; previously a
    # repeated identifier silently used the most recently parsed structure
    ag_by_id = {}

    LOGGER.progress('Starting', len(lines), '_prody_parseChainsList')
    for i, line in enumerate(lines):
        LOGGER.update(i, 'Parsing lines...', label='_prody_parseChainsList')
        columns = line.split()
        if len(columns) == 0:
            continue

        id_parts = columns[0].split('_')
        pdb_id = id_parts[0]
        try:
            ag = ag_by_id[pdb_id]
        except KeyError:
            ag, header = parsePDB(pdb_id, compressed=False,
                                  subset=id_parts[1], header=True)
            ag_by_id[pdb_id] = ag
            ags.append(ag)
            headers.append(header)

        chains.append(ag.getHierView()[columns[1]])

    LOGGER.finish()
    # single literal: the old backslash continuation inside the string
    # leaked source indentation into the message
    LOGGER.info('{0} PDBs have been parsed and {1} chains have been '
                'extracted.'.format(len(ags), len(chains)))

    return ags, headers, chains
def calcMSF(coordsets):
    """Calculate mean square fluctuation(s) (MSF).

    :arg coordsets: a Numpy array of rank 3 (sets, atoms, coordinates), an
        object with a ``getCoordsets`` method, or a trajectory-like object
        with a ``numFrames`` method.  Trajectory frames are superposed
        (``frame.superpose()``) before being accumulated, and the
        trajectory is returned to its original position afterwards.

    :returns: per-atom MSF, i.e. coordinate variances summed over the last
        axis
    :raises TypeError: if *coordsets* is neither an array nor an object
        providing coordinate sets
    :raises ValueError: if fewer than two coordinate sets are given
    """

    try:
        ncsets = coordsets.numFrames()
    except AttributeError:
        try:
            coordsets = coordsets.getCoordsets()
        except AttributeError:
            pass
        try:
            ndim, shape = coordsets.ndim, coordsets.shape
        except AttributeError:
            raise TypeError('coordsets must be a Numpy array or a ProDy '
                            'object with `getCoordsets` method')
        # an empty stack is as unusable as a single set
        if ndim != 3 or shape[0] < 2:
            raise ValueError('coordsets must contain multiple sets')
        msf = var(coordsets, 0).sum(1)
    else:
        nfi = coordsets.nextIndex()
        natoms = coordsets.numSelected()
        total = zeros((natoms, 3))
        sqsum = zeros((natoms, 3))

        LOGGER.progress('Evaluating {0} frames from {1}:'
                        .format(ncsets, str(coordsets)), ncsets,
                        '_prody_calcMSF')
        # recount while iterating; this is the divisor used below
        ncsets = 0
        coordsets.reset()
        try:
            for frame in coordsets:
                frame.superpose()
                coords = frame._getCoords()
                total += coords
                sqsum += coords ** 2
                ncsets += 1
                LOGGER.update(ncsets, label='_prody_calcMSF')
            LOGGER.finish()
            # E[x^2] - E[x]^2 per coordinate, summed per atom
            msf = (sqsum / ncsets - (total / ncsets) ** 2).sum(1)
        finally:
            # leave the trajectory where the caller had it, even on error
            coordsets.goto(nfi)
    return msf
def calcMSF(coordsets):
    """Calculate mean square fluctuation(s) (MSF).

    *coordsets* may be a rank-3 Numpy array, an object exposing
    ``getCoordsets``, or an object exposing ``numFrames`` (handled frame by
    frame: each frame is superposed, accumulated, and the object is moved
    back to the frame index it had on entry).

    :returns: array with one value per atom, the sum of the coordinate
        variances of that atom
    :raises TypeError: if no coordinate array can be obtained
    :raises ValueError: if the array is not rank 3 or holds fewer than two
        coordinate sets
    """

    try:
        ncsets = coordsets.numFrames()
    except AttributeError:
        # not a trajectory: fall back to an in-memory coordinate array
        try:
            coordsets = coordsets.getCoordsets()
        except AttributeError:
            pass

        # only a missing attribute means "wrong type"; a bare except here
        # would also have hidden KeyboardInterrupt and the like
        try:
            ndim, shape = coordsets.ndim, coordsets.shape
        except AttributeError:
            raise TypeError('coordsets must be a Numpy array or a ProDy '
                            'object with `getCoordsets` method')

        if ndim != 3 or shape[0] < 2:
            raise ValueError('coordsets must contain multiple sets')
        return var(coordsets, 0).sum(1)

    nfi = coordsets.nextIndex()
    natoms = coordsets.numSelected()
    total = zeros((natoms, 3))
    sqsum = zeros((natoms, 3))

    LOGGER.progress('Evaluating {0} frames from {1}:'
                    .format(ncsets, str(coordsets)), ncsets,
                    '_prody_calcMSF')
    n_seen = 0
    coordsets.reset()
    try:
        for frame in coordsets:
            frame.superpose()
            coords = frame._getCoords()
            total += coords
            sqsum += coords ** 2
            n_seen += 1
            LOGGER.update(n_seen, label='_prody_calcMSF')
        LOGGER.finish()
        # mean of squares minus square of mean, summed over x, y, z
        msf = (sqsum / n_seen - (total / n_seen) ** 2).sum(1)
    finally:
        # restore the position of the trajectory whatever happened
        coordsets.goto(nfi)
    return msf
def parsePDB(*pdb, **kwargs):
    """Returns an :class:`.AtomGroup` and/or dictionary containing header data
    parsed from a PDB file.

    This function extends :func:`.parsePDBStream`.

    See :ref:`parsepdb` for a detailed usage example.

    :arg pdb: one PDB identifier or filename, or a list of them.
        If needed, PDB files are downloaded using :func:`.fetchPDB()`
        function.

    You can also provide arguments that you would like passed on to
    fetchPDB().

    When several PDBs are given, scalar (and **None**) keyword values are
    applied to every PDB, while any other value is indexed by the position
    of the PDB, e.g. ``chain=['A', 'B']``.  Results are regrouped by kind
    (structures, headers), groups consisting only of **None** are dropped,
    and a single remaining group is returned as a flat list.
    """

    n_pdb = len(pdb)
    if n_pdb == 1 and isListLike(pdb[0]):
        pdb = pdb[0]
        n_pdb = len(pdb)

    if n_pdb == 1:
        return _parsePDB(pdb[0], **kwargs)

    # Broadcast shared keyword values.  ``np.isscalar(None)`` is False, so
    # None is handled explicitly; it used to be indexed like a list and
    # raise TypeError.
    lstkwargs = {}
    for key, argval in kwargs.items():
        if argval is None or np.isscalar(argval):
            argval = [argval] * n_pdb
        lstkwargs[key] = argval

    start = time.time()
    LOGGER.progress('Retrieving {0} PDB structures...'
                    .format(n_pdb), n_pdb, '_prody_parsePDB')
    results = []
    # holds the keywords of the PDB handled last; falls back to the
    # caller's keywords when there is nothing to parse
    pdb_kwargs = kwargs
    for i, p in enumerate(pdb):
        pdb_kwargs = dict((key, values[i])
                          for key, values in lstkwargs.items())
        # a missing or None chain contributes nothing to the label
        c = pdb_kwargs.get('chain') or ''
        LOGGER.update(i, 'Retrieving {0}...'.format(p + c),
                      label='_prody_parsePDB')
        result = _parsePDB(p, **pdb_kwargs)
        if not isinstance(result, tuple):
            # normalize to (structure, header)
            if isinstance(result, dict):
                result = (None, result)
            else:
                result = (result, None)
        results.append(result)

    results = list(zip(*results))
    LOGGER.finish()

    for i in reversed(range(len(results))):
        if all(j is None for j in results[i]):
            results.pop(i)
    if len(results) == 1:
        results = results[0]
    results = list(results)

    model = pdb_kwargs.get('model')
    header = pdb_kwargs.get('header', False)
    if not results:
        numPdbs = 0
    elif model != 0 and header:
        # results is [structures, headers]
        numPdbs = len(results[0])
    else:
        numPdbs = len(results)

    LOGGER.info('{0} PDBs were parsed in {1:.2f}s.'
                .format(numPdbs, time.time() - start))

    return results
def matchModes(*modesets, **kwargs):
    """Returns the matches of modes among *modesets*. Note that the first
    modeset will be treated as the reference so that only the matching of
    each modeset to the first modeset is guaranteed to be optimal.

    :arg index: if **True** then indices of modes will be returned instead
        of :class:`Mode` instances
    :type index: bool

    :arg turbo: if **True** then the computation will be performed in
        parallel. The number of threads is set to be the same as the number
        of CPUs. Assigning a number to specify the number of threads to be
        used. Note that if writing a script, ``if __name__ == '__main__'``
        is necessary to protect your code when multi-tasking.
        See https://docs.python.org/2/library/multiprocessing.html for
        details. Default is **False**
    :type turbo: bool, int

    Remaining keyword arguments are forwarded to :func:`pairModes` in the
    serial code path only.

    :returns: a list starting with the reference modeset (or its indices
        when *index* is true) followed by one matched entry per remaining
        modeset
    :raises ValueError: if no modeset is given
    """

    # Must come before ``modesets[0]``: otherwise an empty call raises
    # IndexError and this check can never trigger.
    n_sets = len(modesets)
    if n_sets == 0:
        raise ValueError('at least one modeset should be given')

    index = kwargs.pop('index', False)
    turbo = kwargs.pop('turbo', False)

    n_worker = None
    if not isinstance(turbo, bool):
        n_worker = int(turbo)

    modeset0 = modesets[0]
    if index:
        ret = [modeset0.getIndices()]
    else:
        ret = [modeset0]

    n_modes = len(modeset0)
    if n_sets == 1:
        return ret

    if turbo:
        from multiprocessing import Pool, cpu_count

        if not n_worker:
            n_worker = cpu_count()

        LOGGER.info('Matching {0} modes across {1} modesets with {2} threads...'
                    .format(n_modes, n_sets, n_worker))

        # Integer ceiling division: ``ceil(a / b)`` floors first and
        # returns a float on Python 2, which cannot be used for slicing.
        n_sets_per_worker = -(-(n_sets - 1) // n_worker)
        args = []
        for i in range(n_worker):
            start = i * n_sets_per_worker + 1
            end = start + n_sets_per_worker
            subset = modesets[start:end]
            if subset:
                # workers with nothing to do are not scheduled
                args.append((modeset0, subset, index))

        pool = Pool(n_worker)
        try:
            nested_ret = pool.map(_pairModes_wrapper, args)
        finally:
            # shut the pool down even when a worker raised
            pool.close()
            pool.join()

        for entry in nested_ret:
            ret.extend(entry)
    else:
        LOGGER.progress('Matching {0} modes across {1} modesets...'
                        .format(n_modes, n_sets), n_sets, '_prody_matchModes')
        for i, modeset in enumerate(modesets):
            LOGGER.update(i, label='_prody_matchModes')
            if i > 0:
                _, reordered_modeset = pairModes(modeset0, modeset,
                                                 index=index, **kwargs)
                ret.append(reordered_modeset)
        LOGGER.finish()

    return ret
def parsePfamPDBs(query, data=None, **kwargs):
    """Returns a list of AtomGroups containing sections of chains that
    correspond to a particular PFAM domain family. These are defined by
    alignment start and end residue numbers.

    :arg query: UniProt ID or PDB ID
        If a PDB ID is provided the corresponding UniProt ID is used.
        If this returns multiple matches then start or end must also be
        provided.  This query is also used for label refinement of the Pfam
        domain MSA.  A string longer than four characters that starts with
        ``PF`` is used directly as the Pfam accession.
    :type query: str

    :arg data: If given the data list from the Pfam mapping table will
        be output through this argument.
    :type data: list

    :keyword start: Residue number for defining the start of the domain.
        The PFAM domain that starts closest to this will be selected.
        Default is **1**
    :type start: int

    :keyword end: Residue number for defining the end of the domain.
        The PFAM domain that ends closest to this will be selected.
        It is only consulted when *start* is not an integer.
    :type end: int

    :keyword header: if true, ``(atomgroups, headers)`` is returned instead
        of the list of atom groups alone.  Default is **False**
    :type header: bool

    Other keywords are passed to :func:`parsePDB`.  With ``model=0`` the
    output of :func:`parsePDB` is returned unchanged.
    """

    start = kwargs.pop('start', 1)
    end = kwargs.pop('end', None)

    if len(query) > 4 and query.startswith('PF'):
        pfam_acc = query
    else:
        pfam_matches = searchPfam(query)
        keys = list(pfam_matches.keys())

        if isinstance(start, Integral):
            start_diff = np.array(
                [int(pfam_matches[key]['locations'][0]['start']) - start
                 for key in keys])
            # first domain whose start is closest to the requested one
            pfam_acc = keys[int(np.argmin(abs(start_diff)))]

        elif isinstance(end, Integral):
            end_diff = np.array(
                [int(pfam_matches[key]['locations'][0]['end']) - end
                 for key in keys])
            pfam_acc = keys[int(np.argmin(abs(end_diff)))]

        else:
            raise ValueError('Please provide an integer for start or end '
                             'when using a UniProt ID or PDB ID.')

    from ftplib import FTP
    from .uniprot import queryUniprot

    data_stream = BytesIO()
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    ftp.login()
    ftp.cwd('pub/databases/Pfam/current_release')
    ftp.retrbinary('RETR pdbmap.gz', data_stream.write)
    ftp.quit()
    zip_data = data_stream.getvalue()
    data_stream.close()

    rawdata = gunzip(zip_data)
    if PY3K:
        rawdata = rawdata.decode()

    fields = ['PDB_ID', 'chain', 'nothing', 'PFAM_Name', 'PFAM_ACC',
              'UniprotAcc', 'UniprotResnumRange']

    # one dictionary per mapping-table row that mentions the accession
    data_dicts = []
    for line in rawdata.split('\n'):
        if line.find(pfam_acc) != -1:
            data_dicts.append({})
            for j, entry in enumerate(line.strip().split('\t')):
                data_dicts[-1][fields[j]] = entry.strip(';')

    pdb_ids = [data_dict['PDB_ID'] for data_dict in data_dicts]
    chains = [data_dict['chain'] for data_dict in data_dicts]

    header = kwargs.pop('header', False)
    model = kwargs.get('model', None)
    # headers are always requested because they are needed for the mapping
    results = parsePDB(*pdb_ids, chain=chains, header=True, **kwargs)

    # Header-only output cannot be unpacked into (structures, headers), so
    # this has to be returned before the unpacking below.
    if model == 0:
        LOGGER.info('only header is requested and returned')
        return results

    # NOTE(review): with a single matching PDB, parsePDB returns the output
    # of one parse rather than lists -- confirm that case is handled.
    ags, headers = results
    ags, headers = list(ags), list(headers)

    if header:
        results = (ags, headers)
    else:
        results = ags

    LOGGER.progress('Extracting Pfam domains...', len(ags))
    comma_splitter = re.compile(r'\s*,\s*').split
    no_info = []
    for i, ag in enumerate(ags):
        LOGGER.update(i)
        data_dict = data_dicts[i]
        pfamRange = data_dict['UniprotResnumRange'].split('-')
        uniprotAcc = data_dict['UniprotAcc']
        try:
            uniData = queryUniprot(uniprotAcc)
        except Exception:
            # the key used to be misspelled ('PBD_ID'), which turned this
            # warning into a KeyError
            LOGGER.warn('No Uniprot record found for {0}'
                        .format(data_dict['PDB_ID']))
            continue

        resrange = None
        found = False
        for key, value in uniData.items():
            if not key.startswith('dbReference'):
                continue
            try:
                pdbid = value['PDB']
            except (KeyError, TypeError):
                continue
            if pdbid != data_dict['PDB_ID']:
                continue
            pdbchains = value['chains']

            # example chain strings: "A=27-139, B=140-150" or "A/B=27-150"
            pdbchains = comma_splitter(pdbchains)
            for chain in pdbchains:
                chids, resrange = chain.split('=')
                chids = [chid.strip() for chid in chids.split('/')]
                if data_dict['chain'] in chids:
                    resrange = resrange.split('-')
                    found = True
                    break
            if found:
                break

        if found:
            # named differently from the ``header`` flag, which is still
            # needed after the loop
            pdb_header = headers[i]
            chain_accessions = [dbref.accession
                                for dbref in
                                pdb_header[data_dict['chain']].dbrefs]
            try:
                if len(chain_accessions) > 0:
                    right_part = np.where(np.array(chain_accessions) ==
                                          data_dict['UniprotAcc'])[0][0]
                else:
                    raise ValueError('There is no accession for a chain '
                                     'in the Header')
            except (ValueError, IndexError):
                LOGGER.warn('Could not map domains in {0}'
                            .format(data_dict['PDB_ID']
                                    + data_dict['chain']))
                no_info.append(i)
                continue

            right_dbref = pdb_header[data_dict['chain']].dbrefs[right_part]
            chainStart = ag.select('chain {0}'.format(data_dict['chain'])
                                   ).getResnums()[0]
            missing = chainStart - right_dbref.first[0]
            partStart = ag.getResindices()[np.where(
                ag.getResnums() == right_dbref.first[0] + missing)][0]
            pfStart, pfEnd = int(pfamRange[0]), int(pfamRange[1])
            uniStart, uniEnd = int(resrange[0]), int(resrange[1])

            # shift the Pfam range into residue indices of this structure
            resiStart = pfStart - uniStart + partStart - missing
            resiEnd = pfEnd - uniStart + partStart - missing
            ags[i] = ag.select('resindex {0} to {1}'.format(
                resiStart, resiEnd))
        else:
            no_info.append(i)
    LOGGER.finish()

    # drop entries that could not be mapped; popping in reverse keeps the
    # remaining indices valid
    for i in reversed(no_info):
        ags.pop(i)
        if header:
            headers.pop(i)

    if data is not None:
        if isinstance(data, list):
            data.extend(data_dicts)
        else:
            LOGGER.warn('data should be a list in order to get output')

    return results
def addPDBEnsemble(ensemble, PDBs, refpdb=None, labels=None,
                   mapping_func=mapOntoChain, occupancy=None,
                   unmapped=None, **kwargs):
    """Adds extra structures to a given PDB ensemble.

    :arg ensemble: the ensemble to which the PDBs are added
    :type ensemble: :class:`.PDBEnsemble`

    :arg PDBs: A list of PDB structures; **None** items are recorded as
        unmapped
    :type PDBs: iterable

    :arg refpdb: reference structure. If set to `None`, it will be set to
        the atoms of *ensemble* automatically
    :type refpdb: :class:`.Chain`, :class:`.Selection`, or
        :class:`.AtomGroup`

    :arg labels: labels of the conformations; structure titles are used
        when **None**
    :type labels: list

    :arg mapping_func: function called as
        ``mapping_func(pdb, chain, index=i, **kwargs)`` for every reference
        chain; the first mapping it returns is used
    :type mapping_func: callable

    :arg occupancy: minimal occupancy of columns (range from 0 to 1).
        Columns whose occupancy is below this value will be trimmed
    :type occupancy: float

    :arg unmapped: a list of PDB IDs that cannot be included in the
        ensemble. This is an output argument
    :type unmapped: list

    :keyword subset: atom subset selected from *refpdb* unless ``'all'``.
        Default is ``'calpha'``
    :keyword degeneracy: passed to :meth:`.PDBEnsemble.addCoordset`.
        Default is **True**
    :keyword superpose: whether to iterpose the ensemble at the end.
        Default is **True**

    Remaining keywords are handed to *mapping_func*.
    """

    degeneracy = kwargs.pop('degeneracy', True)
    subset = str(kwargs.get('subset', 'calpha')).lower()
    superpose = kwargs.pop('superpose', True)

    if labels is not None:
        if len(labels) != len(PDBs):
            raise TypeError('Labels and PDBs must have the same lengths.')
    else:
        labels = []
        for pdb in PDBs:
            if pdb is None:
                labels.append(None)
            else:
                labels.append(pdb.getTitle())

    # obtain refchains from the hierarchical view of the reference PDB
    if refpdb is None:
        refpdb = ensemble._atoms
    else:
        if subset != 'all':
            refpdb = refpdb.select(subset)

    refchains = list(refpdb.getHierView())

    start = time.time()

    # add the PDBs to the ensemble
    if unmapped is None:
        unmapped = []
    # the caller may pass a list that already has entries; only structures
    # rejected by this call are counted in the summary below
    n_unmapped_before = len(unmapped)

    LOGGER.progress('Appending the ensemble...', len(PDBs),
                    '_prody_addPDBEnsemble')
    for i, pdb in enumerate(PDBs):
        lbl = labels[i]
        if pdb is None:
            unmapped.append(labels[i])
            continue

        # check the type before calling any method on the object so that a
        # wrong item produces the intended TypeError
        if not isinstance(pdb, (Chain, Selection, AtomGroup)):
            raise TypeError('PDBs must be a list of Chain, Selection, or AtomGroup.')

        LOGGER.update(i, 'Mapping %s to the reference...'%pdb.getTitle(),
                      label='_prody_addPDBEnsemble')

        atommaps = []
        # find the mapping of the pdb to each reference chain
        for chain in refchains:
            mappings = mapping_func(pdb, chain, index=i, **kwargs)
            if len(mappings) > 0:
                atommaps.append(mappings[0][0])
            else:
                break

        if len(atommaps) != len(refchains):
            unmapped.append(lbl)
            continue

        # combine the mappings of pdb to reference chains; a separate index
        # name keeps the outer loop counter intact
        atommap = atommaps[0]
        for j in range(1, len(atommaps)):
            atommap += atommaps[j]

        # add the mappings to the ensemble
        ensemble.addCoordset(atommap, weights=atommap.getFlags('mapped'),
                             label=lbl, degeneracy=degeneracy)

    LOGGER.finish()

    if occupancy is not None:
        ensemble = trimPDBEnsemble(ensemble, occupancy=occupancy)
    if superpose:
        ensemble.iterpose()

    n_unmapped = len(unmapped) - n_unmapped_before
    LOGGER.info('{0} PDBs were added to the ensemble in {1:.2f}s.'
                .format(len(PDBs) - n_unmapped, time.time()-start))

    if n_unmapped:
        LOGGER.warn('{0} structures cannot be mapped.'.format(n_unmapped))
    return ensemble
def calcEnsembleENMs(ensemble, model='gnm', trim='reduce', n_modes=20,
                     **kwargs):
    """Calculate an elastic network model for every conformation of
    *ensemble* and collect the results in a :class:`ModeEnsemble`.

    :arg ensemble: an ensemble, or a single :class:`Conformation`, in which
        case a one-member slice of its parent ensemble is used

    :arg model: passed to :func:`calcENM`; ``GNM`` and ``ANM`` classes are
        reported as ``'GNM'``/``'ANM'``, anything else by its upper-cased
        string form.  Default is ``'gnm'``

    :arg trim: passed to :func:`calcENM`. Default is ``'reduce'``

    :arg n_modes: number of modes requested from :func:`calcENM`; **None**
        is reported as ``all``. Default is **20**

    :keyword match: if true, :meth:`ModeEnsemble.match` is called on the
        result. Default is **True**

    Remaining keywords are passed to :func:`calcENM`.  Mode sets having more
    modes than the smallest one are truncated to that size, with a warning.
    """

    match = kwargs.pop('match', True)

    # a single conformation is wrapped into a one-member ensemble
    if isinstance(ensemble, Conformation):
        conformation = ensemble
        parent = conformation.getEnsemble()
        index = conformation.getIndex()
        ensemble = parent[index:index + 1]

    if model is GNM:
        model_type = 'GNM'
    elif model is ANM:
        model_type = 'ANM'
    else:
        model_type = str(model).strip().upper()

    start = time.time()

    atoms = ensemble.getAtoms()
    select = None
    if ensemble.isSelected():
        # build on all atoms, evaluate on the current selection
        select, atoms = atoms, ensemble.getAtoms(selected=False)

    labels = ensemble.getLabels()

    n_confs = ensemble.numConfs()
    str_modes = str(n_modes) if n_modes is not None else 'all'

    ## one ENM per conformation
    enms = []
    LOGGER.progress('Calculating {0} {1} modes for {2} conformations...'
                    .format(str_modes, model_type, n_confs), n_confs,
                    '_prody_calcEnsembleENMs')
    for i in range(n_confs):
        LOGGER.update(i, label='_prody_calcEnsembleENMs')
        nodes = ensemble.getCoordsets(i, selected=False)[0, :, :]
        if atoms is not None:
            atoms.setCoords(nodes)
            nodes = atoms
        enms.append(calcENM(nodes, select, model=model, trim=trim,
                            n_modes=n_modes, title=labels[i], **kwargs)[0])
    LOGGER.finish()

    # the smallest mode count, bounded above by three times the atom count
    min_n_modes = min([ensemble.numAtoms() * 3] +
                      [enm.numModes() for enm in enms])

    for k, enm in enumerate(enms):
        n_enm_modes = enm.numModes()
        if n_enm_modes <= min_n_modes:
            continue
        enms[k] = enm[:min_n_modes]
        LOGGER.warn('last {0} modes for {1} has been discarded because at least one '
                    'conformation has only {2} modes'
                    .format(n_enm_modes - min_n_modes, enms[k].getTitle(),
                            min_n_modes))

    LOGGER.info('{0} {1} modes were calculated for each of the {2} conformations in {3:.2f}s.'
                .format(str_modes, model_type, n_confs, time.time() - start))

    modeens = ModeEnsemble(title=ensemble.getTitle())
    modeens.addModeSet(enms, weights=ensemble.getWeights(),
                       label=ensemble.getLabels())
    modeens.setAtoms(ensemble.getAtoms())

    if match:
        modeens.match()
    return modeens
def buildPDBEnsemble(PDBs, ref=None, title='Unknown', labels=None,
                     mapping_func=mapOntoChain, unmapped=None, **kwargs):
    """Builds a PDB ensemble from a given reference structure and a list of
    PDB structures. Note that the reference structure should be included in
    the list as well.

    :arg PDBs: A list of PDB structures
    :type PDBs: iterable

    :arg ref: Reference structure or the index to the reference in
        ``PDBs``. If **None**, then the first item in ``PDBs`` will be
        considered as the reference. Default is **None**
    :type ref: int, :class:`.Chain`, :class:`.Selection`, or
        :class:`.AtomGroup`

    :arg title: The title of the ensemble
    :type title: str

    :arg labels: labels of the conformations
    :type labels: list

    :arg mapping_func: function called as
        ``mapping_func(pdb, chain, index=i, **kwargs)`` for every reference
        chain; the first mapping it returns is used
    :type mapping_func: callable

    :arg occupancy: Minimal occupancy of columns (range from 0 to 1).
        Columns whose occupancy is below this value will be trimmed.
    :type occupancy: float

    :arg unmapped: A list of PDB IDs that cannot be included in the
        ensemble. This is an output argument.
    :type unmapped: list

    :arg subset: A subset for selecting particular atoms from the input
        structures. Default is calpha
    :type subset: str

    :arg superpose: if set to ``'iter'``, :func:`.PDBEnsemble.iterpose`
        will be used to superpose the structures, otherwise conformations
        will be superposed with respect to the reference specified by
        ``ref``. Default is ``'iter'``.  The legacy keyword ``iterpose``
        overrides it; ``iterpose=True`` selects iterative superposition.
    :type superpose: str

    :raises ValueError: if fewer than two structures are given or the
        reference is not among them
    :raises TypeError: if labels and structures differ in number, or an
        item does not provide ``getHierView``
    """

    occupancy = kwargs.pop('occupancy', None)
    degeneracy = kwargs.pop('degeneracy', True)
    subset = str(kwargs.get('subset', 'calpha')).lower()
    superpose = kwargs.pop('superpose', 'iter')
    # ``iterpose=True`` used to be copied into ``superpose`` verbatim and,
    # not being equal to 'iter', switched iterposing off
    iterpose = kwargs.pop('iterpose', None)
    if iterpose is True:
        superpose = 'iter'
    elif iterpose is not None:
        superpose = iterpose

    # an empty list is rejected here as well rather than by an IndexError
    if len(PDBs) < 2:
        raise ValueError('PDBs should have at least two items')

    if labels is not None:
        if len(labels) != len(PDBs):
            raise TypeError('Labels and PDBs must have the same lengths.')
    else:
        labels = []
        for pdb in PDBs:
            if pdb is None:
                labels.append(None)
            else:
                labels.append(pdb.getTitle())

    if ref is None:
        refpdb = PDBs[0]
        refidx = 0
    elif isinstance(ref, Integral):
        refpdb = PDBs[ref]
        refidx = ref
    else:
        refpdb = ref
        if refpdb not in PDBs:
            raise ValueError('refpdb should be also in the PDBs')
        refidx = PDBs.index(ref)

    # obtain refchains from the hierarchical view of the reference PDB
    if subset != 'all':
        refpdb = refpdb.select(subset)

    try:
        refchains = list(refpdb.getHierView())
    except AttributeError:
        raise TypeError('refpdb must have getHierView')

    start = time.time()

    atoms = refpdb

    # initialize a PDBEnsemble with reference atoms and coordinates
    ensemble = PDBEnsemble(title)
    ensemble.setAtoms(atoms)
    ensemble.setCoords(atoms.getCoords())

    # build the ensemble
    if unmapped is None:
        unmapped = []

    # Position of the reference inside the ensemble.  It differs from
    # ``refidx`` when structures listed before the reference are skipped.
    ens_refidx = refidx

    LOGGER.progress('Building the ensemble...', len(PDBs),
                    '_prody_buildPDBEnsemble')
    for i, pdb in enumerate(PDBs):
        if pdb is None:
            unmapped.append(labels[i])
            continue

        LOGGER.update(i, 'Mapping %s to the reference...' % pdb.getTitle(),
                      label='_prody_buildPDBEnsemble')
        try:
            pdb.getHierView()
        except AttributeError:
            raise TypeError('PDBs must be a list of instances having the access to getHierView')

        lbl = labels[i]

        atommaps = []
        # find the mapping of the pdb to each reference chain
        for chain in refchains:
            mappings = mapping_func(pdb, chain, index=i, **kwargs)
            if len(mappings) > 0:
                atommaps.append(mappings[0][0])
            else:
                break

        if len(atommaps) != len(refchains):
            unmapped.append(lbl)
            continue

        # combine the mappings of pdb to reference chains
        atommap = atommaps[0]
        for j in range(1, len(atommaps)):
            atommap += atommaps[j]

        if i == refidx:
            # index the reference conformation is about to receive
            ens_refidx = ensemble.numConfs()

        # add the mappings to the ensemble
        ensemble.addCoordset(atommap, weights=atommap.getFlags('mapped'),
                             label=lbl, degeneracy=degeneracy)

    LOGGER.finish()

    if occupancy is not None:
        ensemble = trimPDBEnsemble(ensemble, occupancy=occupancy)

    if superpose != 'iter':
        ensemble.superpose(ref=ens_refidx)
    else:
        ensemble.iterpose()

    LOGGER.info('Ensemble ({0} conformations) were built in {1:.2f}s.'
                .format(ensemble.numConfs(), time.time() - start))

    if unmapped:
        LOGGER.warn('{0} structures cannot be mapped.'.format(len(unmapped)))
    return ensemble
def writeDCD(filename, trajectory, start=None, stop=None, step=None,
             align=False):
    """Write 32-bit CHARMM format DCD file (also NAMD 2.1 and later).
    *trajectory* can be an :class:`Trajectory`, :class:`DCDFile`, or
    :class:`Ensemble` instance. *filename* is returned upon successful
    output of file.

    :arg filename: output file name; ``.dcd`` is appended when missing
    :arg trajectory: source of coordinate sets; :class:`Atomic` instances
        are accepted too
    :arg start: first coordinate set index, as in :class:`slice`
    :arg stop: end index (exclusive), as in :class:`slice`
    :arg step: stride, as in :class:`slice`
    :arg align: if true, each frame is superposed (``frame.superpose()``)
        before it is written

    :raises TypeError: if *trajectory* is of an unsupported type
    :raises ValueError: if no coordinate sets or no atoms are selected

    The position of a trajectory, or the active coordinate set index of an
    :class:`Atomic` instance, is restored before returning, also when
    writing fails.
    """

    if not filename.lower().endswith('.dcd'):
        filename += '.dcd'

    if not isinstance(trajectory, (TrajBase, Ensemble, Atomic)):
        raise TypeError('{0} is not a valid type for trajectory'
                        .format(type(trajectory)))

    irange = list(range(*slice(start, stop, step)
                        .indices(trajectory.numCoordsets())))
    n_csets = len(irange)
    if n_csets == 0:
        raise ValueError('trajectory does not have any coordinate sets, or '
                         'no coordinate sets are selected')

    if isinstance(trajectory, Atomic):
        isEnsemble = False
        isAtomic = True
        n_atoms = trajectory.numAtoms()
    else:
        isEnsemble = True
        isAtomic = False
        n_atoms = trajectory.numSelected()
    if n_atoms == 0:
        raise ValueError('no atoms are selected in the trajectory')

    if isinstance(trajectory, TrajBase):
        isTrajectory = True
        unitcell = trajectory.hasUnitcell()
        nfi = trajectory.nextIndex()
        trajectory.reset()
        if isinstance(trajectory, Trajectory):
            # these getters are indexed for Trajectory; take the first item
            timestep = trajectory.getTimestep()[0]
            first_ts = trajectory.getFirstTimestep()[0]
            framefreq = trajectory.getFrameFreq()[0]
        else:
            timestep = trajectory.getTimestep()
            first_ts = trajectory.getFirstTimestep()
            framefreq = trajectory.getFrameFreq()
    else:
        isTrajectory = False
        unitcell = False
        if isinstance(trajectory, Ensemble):
            frame = trajectory[0]
        else:
            frame = trajectory
            acsi = trajectory.getACSIndex()
        timestep = 1
        first_ts = 0
        framefreq = 1

    dcd = DCDFile(filename, mode='w')
    LOGGER.progress('Writing DCD', len(irange), '_prody_writeDCD')
    prev = -1
    uc = None
    # counted explicitly: deriving the count from the loop index reported
    # one frame too many after an early break
    n_written = 0
    time_ = time()
    try:
        for j, i in enumerate(irange):
            diff = i - prev
            prev = i
            if isTrajectory:
                if diff > 1:
                    trajectory.skip(diff - 1)
                frame = next(trajectory)
                if frame is None:
                    break
                if unitcell:
                    uc = frame._getUnitcell()
                    uc[3:] = np.sin((PISQUARE / 90) * (90 - uc[3:]))
                    # reorder the six values before handing them to the
                    # DCD writer
                    uc = uc[[0, 3, 1, 4, 5, 2]]
            elif isEnsemble:
                frame._index = i
            else:
                frame.setACSIndex(i)
            if align:
                frame.superpose()
            if j == 0:
                # timing metadata is passed with the first frame only
                dcd.write(frame._getCoords(), uc, timestep=timestep,
                          firsttimestep=first_ts, framefreq=framefreq)
            else:
                dcd.write(frame._getCoords(), uc)
            n_written += 1
            # progress is measured in written frames, matching the total
            # given to LOGGER.progress, not in source frame indices
            LOGGER.update(n_written, label='_prody_writeDCD')
    finally:
        if isAtomic:
            trajectory.setACSIndex(acsi)
        LOGGER.finish()
        dcd.close()
        if isTrajectory:
            trajectory.goto(nfi)

    time_ = time() - time_ or 0.01
    dcd_size = 1.0 * (56 + (n_atoms * 3 + 6) * 4) * n_written / (1024 * 1024)
    LOGGER.info('DCD file was written in {0:.2f} seconds.'.format(time_))
    LOGGER.info('{0:.2f} MB written at input rate {1:.2f} MB/s.'
                .format(dcd_size, dcd_size / time_))
    LOGGER.info('{0} coordinate sets written at output rate {1} frame/s.'
                .format(n_written, int(n_written / time_)))
    if n_written != n_csets:
        LOGGER.warn('Warning: {0} frames expected, {1} written.'
                    .format(n_csets, n_written))
    return filename
def buildCovariance(self, coordsets, **kwargs):
    r"""Build a covariance matrix for *coordsets* using mean coordinates
    as the reference.  *coordsets* argument may be one of the following:

    * :class:`.Atomic`
    * :class:`.Ensemble`
    * :class:`.TrajBase`
    * :class:`numpy.ndarray` with shape ``(n_csets, n_atoms, 3)``

    For ensemble and trajectory objects, ``update_coords=True`` argument
    can be used to set the mean coordinates as the coordinates of the
    object.

    When *coordsets* is a trajectory object, such as :class:`.DCDFile`,
    covariance will be built by superposing frames onto the reference
    coordinate set (see :meth:`.Frame.superpose`).  If frames are already
    aligned, use ``aligned=True`` argument to skip this step.

    .. note::
       If *coordsets* is a :class:`.PDBEnsemble` instance, coordinates are
       treated specially.  Let's say **C**\_ij is the element of the
       covariance matrix that corresponds to atoms *i* and *j*.  This
       super element is divided by number of coordinate sets (PDB models
       or structures) in which both of these atoms are observed together.

    :raises TypeError: when *coordsets* is not one of the accepted types
    :raises ValueError: when an array is not a valid coordinate array, when
        fewer than 3 coordinate sets or atoms are given, or when a
        trajectory yields no frames"""

    if not isinstance(coordsets, (Ensemble, Atomic, TrajBase, np.ndarray)):
        raise TypeError('coordsets must be an Ensemble, Atomic, TrajBase, '
                        'or Numpy array instance')
    LOGGER.timeit('_prody_pca')
    mean = None
    weights = None
    ensemble = None
    if isinstance(coordsets, np.ndarray):
        if (coordsets.ndim != 3 or coordsets.shape[2] != 3 or
                coordsets.dtype not in (np.float32, float)):
            raise ValueError('coordsets is not a valid coordinate array')
    elif isinstance(coordsets, Atomic):
        coordsets = coordsets._getCoordsets()
    elif isinstance(coordsets, Ensemble):
        ensemble = coordsets
        if isinstance(coordsets, PDBEnsemble):
            # boolean mask of the positions that have a positive weight
            weights = coordsets.getWeights() > 0
        coordsets = coordsets._getCoordsets()

    update_coords = bool(kwargs.get('update_coords', False))

    if isinstance(coordsets, TrajBase):
        # remember the read position so that it can be restored afterwards
        nfi = coordsets.nextIndex()
        coordsets.reset()
        n_atoms = coordsets.numSelected()
        dof = n_atoms * 3
        cov = np.zeros((dof, dof))
        n_confs = 0
        n_frames = len(coordsets)
        LOGGER.info('Covariance will be calculated using {0} frames.'
                    .format(n_frames))
        coordsum = np.zeros(dof)
        LOGGER.progress('Building covariance', n_frames, '_prody_pca')
        align = not kwargs.get('aligned', False)
        # single pass: accumulate sum(x) and sum(x x^T), then use
        # cov = E[x x^T] - E[x] E[x]^T
        for frame in coordsets:
            if align:
                frame.superpose()
            coords = frame._getCoords().flatten()
            coordsum += coords
            cov += np.outer(coords, coords)
            n_confs += 1
            LOGGER.update(n_confs, label='_prody_pca')
        LOGGER.finish()
        coordsets.goto(nfi)
        if n_confs == 0:
            # dividing by zero below would silently produce a NaN matrix
            raise ValueError('trajectory does not have any frames')
        cov /= n_confs
        coordsum /= n_confs
        mean = coordsum
        cov -= np.outer(coordsum, coordsum)
        self._cov = cov
        if update_coords:
            coordsets.setCoords(mean.reshape((n_atoms, 3)))
    else:
        n_confs = coordsets.shape[0]
        if n_confs < 3:
            raise ValueError('coordsets must have at least 3 coordinate '
                             'sets')
        n_atoms = coordsets.shape[1]
        if n_atoms < 3:
            raise ValueError('coordsets must have at least 3 atoms')
        dof = n_atoms * 3
        LOGGER.info('Covariance is calculated using {0} coordinate sets.'
                    .format(len(coordsets)))
        s = (n_confs, dof)
        if weights is None:
            if coordsets.dtype == float:
                self._cov = np.cov(coordsets.reshape(s).T, bias=1)
            else:
                # non-float64 input: accumulate outer products one
                # coordinate set at a time into a float64 matrix
                cov = np.zeros((dof, dof))
                flat = coordsets.reshape(s)
                mean = flat.mean(0)
                LOGGER.progress('Building covariance', n_confs,
                                '_prody_pca')
                for i, coords in enumerate(flat):
                    deviations = coords - mean
                    cov += np.outer(deviations, deviations)
                    LOGGER.update(i + 1, label='_prody_pca')
                LOGGER.finish()
                cov /= n_confs
                self._cov = cov
        else:
            # PDB ensemble case
            mean = np.zeros((n_atoms, 3))
            for i, coords in enumerate(coordsets):
                mean += coords * weights[i]
            mean /= weights.sum(0)
            d_xyz = ((coordsets - mean) * weights).reshape(s)
            divide_by = weights.astype(float).repeat(3, axis=2).reshape(s)
            # each element is normalized by the number of coordinate sets
            # in which both atoms are observed
            self._cov = np.dot(d_xyz.T, d_xyz) / np.dot(divide_by.T,
                                                        divide_by)
        if update_coords and ensemble is not None:
            if mean is None:
                mean = coordsets.mean(0)
            # mean is flat when it comes from the accumulation branch
            ensemble.setCoords(mean.reshape((n_atoms, 3)))

    self._trace = self._cov.trace()
    self._dof = dof
    self._n_atoms = n_atoms
    LOGGER.report('Covariance matrix calculated in %.2fs.', '_prody_pca')
def buildCovariance(self, coordsets, **kwargs):
    r"""Build a covariance matrix for *coordsets* using mean coordinates
    as the reference.  *coordsets* argument may be one of the following:

    * :class:`.Atomic`
    * :class:`.Ensemble`
    * :class:`.TrajBase`
    * :class:`numpy.ndarray` with shape ``(n_csets, n_atoms, 3)``

    For ensemble and trajectory objects, ``update_coords=True`` argument
    can be used to set the mean coordinates as the coordinates of the
    object.

    When *coordsets* is a trajectory object, such as :class:`.DCDFile`,
    covariance will be built by superposing frames onto the reference
    coordinate set (see :meth:`.Frame.superpose`).  If frames are already
    aligned, use ``aligned=True`` argument to skip this step.

    .. note::
       If *coordsets* is a :class:`.PDBEnsemble` instance, coordinates are
       treated specially.  Let's say **C**\_ij is the element of the
       covariance matrix that corresponds to atoms *i* and *j*.  This
       super element is divided by number of coordinate sets (PDB models
       or structures) in which both of these atoms are observed together.

    :raises TypeError: when *coordsets* is not one of the accepted types
    :raises ValueError: when an array is not a valid coordinate array, when
        fewer than 3 coordinate sets or atoms are given, or when a
        trajectory yields no frames"""

    accepted = (Ensemble, Atomic, TrajBase, np.ndarray)
    if not isinstance(coordsets, accepted):
        raise TypeError('coordsets must be an Ensemble, Atomic, TrajBase, '
                        'or Numpy array instance')
    LOGGER.timeit('_prody_pca')
    mean = None
    weights = None
    ensemble = None
    if isinstance(coordsets, np.ndarray):
        if (coordsets.ndim != 3 or coordsets.shape[2] != 3 or
                coordsets.dtype not in (np.float32, float)):
            raise ValueError('coordsets is not a valid coordinate array')
    elif isinstance(coordsets, Atomic):
        coordsets = coordsets._getCoordsets()
    elif isinstance(coordsets, Ensemble):
        ensemble = coordsets
        if isinstance(coordsets, PDBEnsemble):
            # boolean mask of the positions that have a positive weight
            weights = coordsets.getWeights() > 0
        coordsets = coordsets._getCoordsets()

    update_coords = bool(kwargs.get('update_coords', False))

    if isinstance(coordsets, TrajBase):
        # remember the read position so that it can be restored afterwards
        nfi = coordsets.nextIndex()
        coordsets.reset()
        n_atoms = coordsets.numSelected()
        dof = n_atoms * 3
        cov = np.zeros((dof, dof))
        n_confs = 0
        n_frames = len(coordsets)
        LOGGER.info(
            'Covariance will be calculated using {0} frames.'.format(
                n_frames))
        coordsum = np.zeros(dof)
        LOGGER.progress('Building covariance', n_frames, '_prody_pca')
        align = not kwargs.get('aligned', False)
        # single pass: accumulate sum(x) and sum(x x^T), then use
        # cov = E[x x^T] - E[x] E[x]^T
        for frame in coordsets:
            if align:
                frame.superpose()
            coords = frame._getCoords().flatten()
            coordsum += coords
            cov += np.outer(coords, coords)
            n_confs += 1
            LOGGER.update(n_confs, label='_prody_pca')
        LOGGER.finish()
        coordsets.goto(nfi)
        if n_confs == 0:
            # dividing by zero below would silently produce a NaN matrix
            raise ValueError('trajectory does not have any frames')
        cov /= n_confs
        coordsum /= n_confs
        mean = coordsum
        cov -= np.outer(coordsum, coordsum)
        self._cov = cov
        if update_coords:
            coordsets.setCoords(mean.reshape((n_atoms, 3)))
    else:
        n_confs = coordsets.shape[0]
        if n_confs < 3:
            raise ValueError('coordsets must have at least 3 coordinate '
                             'sets')
        n_atoms = coordsets.shape[1]
        if n_atoms < 3:
            raise ValueError('coordsets must have at least 3 atoms')
        dof = n_atoms * 3
        LOGGER.info(
            'Covariance is calculated using {0} coordinate sets.'.format(
                len(coordsets)))
        s = (n_confs, dof)
        if weights is None:
            if coordsets.dtype == float:
                self._cov = np.cov(coordsets.reshape(s).T, bias=1)
            else:
                # non-float64 input: accumulate outer products one
                # coordinate set at a time into a float64 matrix
                cov = np.zeros((dof, dof))
                flat = coordsets.reshape(s)
                mean = flat.mean(0)
                LOGGER.progress('Building covariance', n_confs,
                                '_prody_pca')
                for i, coords in enumerate(flat):
                    deviations = coords - mean
                    cov += np.outer(deviations, deviations)
                    LOGGER.update(i + 1, label='_prody_pca')
                LOGGER.finish()
                cov /= n_confs
                self._cov = cov
        else:
            # PDB ensemble case
            mean = np.zeros((n_atoms, 3))
            for i, coords in enumerate(coordsets):
                mean += coords * weights[i]
            mean /= weights.sum(0)
            d_xyz = ((coordsets - mean) * weights).reshape(s)
            divide_by = weights.astype(float).repeat(3, axis=2).reshape(s)
            # each element is normalized by the number of coordinate sets
            # in which both atoms are observed
            self._cov = np.dot(d_xyz.T, d_xyz) / np.dot(
                divide_by.T, divide_by)
        if update_coords and ensemble is not None:
            if mean is None:
                mean = coordsets.mean(0)
            # mean is flat when it comes from the accumulation branch
            ensemble.setCoords(mean.reshape((n_atoms, 3)))

    self._trace = self._cov.trace()
    self._dof = dof
    self._n_atoms = n_atoms
    LOGGER.report('Covariance matrix calculated in %.2fs.', '_prody_pca')
def fetchBIRDviaFTP(**kwargs):
    """Retrieve the whole Biologically Interesting Molecule Reference
    Dictionary (BIRD) resource, which is updated every week.  This includes
    2 kinds of keys, which can be selected with the **keys** keyword
    argument.

    The chemical information is found in
    https://files.rcsb.org/pub/pdb/data/bird/prd/prd-all.cif.gz and the
    biological function information is found in
    https://files.rcsb.org/pub/pdb/data/bird/family/family-all.cif.gz.

    Each requested archive is downloaded from the wwPDB FTP server and
    saved, still compressed, as :file:`{key}-all.cif.gz` in the
    :file:`bird` folder under the ProDy package path.  The folder is
    created when it does not exist.  Failed downloads are logged and
    counted, they do not raise.

    :arg keys: keys specifying which data to fetch out of ``'prd'``,
        ``'family'`` or ``'both'``
        default is ``'both'``
    :type keys: str, tuple, list, :class:`~numpy.ndarray`

    :raises ValueError: when *keys* is an unrecognized string
    :raises TypeError: when *keys* is neither a string nor list-like

    The underlying data can be accessed using :func:`parseBIRD`."""

    BIRD_PATH = os.path.join(getPackagePath(), 'bird')

    keys = kwargs.get('keys', 'both')
    if isinstance(keys, str):
        if keys == 'both':
            keys = ['prd', 'family']
        elif keys[:3].lower() == 'prd':
            keys = ['prd']
        elif keys[:3].lower() == 'fam':
            keys = ['family']
        else:
            raise ValueError("keys should be 'both', 'prd' or 'fam'")
    elif isListLike(keys):
        keys = list(keys)
    else:
        raise TypeError("keys should be list-like or string")

    ftp_divided = 'pdb/data/bird/'
    ftp_pdbext = '.cif.gz'
    ftp_prefix = ''

    if not os.path.isdir(BIRD_PATH):
        os.mkdir(BIRD_PATH)

    LOGGER.progress('Downloading BIRD', len(keys), '_prody_fetchBIRD')

    ftp_name, ftp_host, ftp_path = WWPDB_FTP_SERVERS[wwPDBServer() or 'us']
    LOGGER.debug('Connecting wwPDB FTP server {0}.'.format(ftp_name))

    from ftplib import FTP
    try:
        ftp = FTP(ftp_host)
    except Exception as error:
        raise type(error)('FTP connection problem, potential reason: '
                          'no internet connectivity')

    success = 0
    failure = 0
    try:
        ftp.login('')
        for i, x in enumerate(keys):
            data = []
            ftp_fn = ftp_prefix + '{0}-all'.format(x) + ftp_pdbext
            try:
                ftp.cwd(ftp_path)
                ftp.cwd(ftp_divided)
                ftp.cwd(x)
                ftp.retrbinary('RETR ' + ftp_fn, data.append)
            except Exception as error:
                # tell apart a file that is missing on the server from one
                # that exists but could not be transferred
                if ftp_fn in ftp.nlst():
                    LOGGER.warn('{0} download failed ({1}). It is '
                                'possible that you do not have rights to '
                                'download .gz files in the current network.'
                                .format(x, str(error)))
                else:
                    LOGGER.info('{0} download failed. {1} does not exist '
                                'on {2}.'.format(ftp_fn, x, ftp_host))
                failure += 1
            else:
                if data:
                    filename = os.path.join(BIRD_PATH,
                                            '{0}-all.cif.gz'.format(x))
                    with open(filename, 'w+b') as outfile:
                        for block in data:
                            outfile.write(block)
                    success += 1
                else:
                    # transfer reported no error but delivered no data
                    failure += 1
            LOGGER.update(i + 1, label='_prody_fetchBIRD')
        LOGGER.finish()
    finally:
        # always release the connection; quit() may itself fail when the
        # connection is already broken, in which case just close the socket
        try:
            ftp.quit()
        except Exception:
            ftp.close()

    LOGGER.debug('PDB download via FTP completed ({0} downloaded, '
                 '{1} failed).'.format(success, failure))
def parsePDB(*pdb, **kwargs):
    """Returns an :class:`.AtomGroup` and/or dictionary containing header
    data parsed from a PDB file.

    This function extends :func:`.parsePDBStream`.

    See :ref:`parsepdb` for a detailed usage example.

    :arg pdb: one PDB identifier or filename, or a list of them.
        If needed, PDB files are downloaded using :func:`.fetchPDB()`
        function.

    You can also provide arguments that you would like passed on to
    fetchPDB().

    When several structures are given, each keyword argument is either a
    single value (scalar or **None**) that is applied to all of them, or a
    sequence that provides one value per structure.  In that case a list of
    results is returned; when both atoms and headers are parsed it is a
    list of two tuples (atoms, headers).
    """

    n_pdb = len(pdb)
    if n_pdb == 1:
        if isListLike(pdb[0]):
            pdb = pdb[0]
            n_pdb = len(pdb)

    if n_pdb == 1:
        return _parsePDB(pdb[0], **kwargs)

    # expand every keyword argument into one value per structure;
    # np.isscalar(None) is False, so None has to be handled explicitly
    lstkwargs = {}
    for key in kwargs:
        argval = kwargs.get(key)
        if argval is None or np.isscalar(argval):
            argval = [argval] * n_pdb
        lstkwargs[key] = argval

    results = []
    # keyword arguments of the most recently parsed structure; used below
    # to find out which kind of output was requested
    last_kwargs = kwargs

    start = time.time()
    LOGGER.progress('Retrieving {0} PDB structures...'
                    .format(n_pdb), n_pdb, '_prody_parsePDB')
    for i, p in enumerate(pdb):
        pdb_kwargs = {}
        for key in lstkwargs:
            pdb_kwargs[key] = lstkwargs[key][i]
        last_kwargs = pdb_kwargs
        c = pdb_kwargs.get('chain') or ''
        LOGGER.update(i, 'Retrieving {0}...'.format(p + c),
                      label='_prody_parsePDB')
        result = _parsePDB(p, **pdb_kwargs)
        # normalize every result to an (atoms, header) pair
        if not isinstance(result, tuple):
            if isinstance(result, dict):
                result = (None, result)
            else:
                result = (result, None)
        results.append(result)

    # transpose into (all atoms, all headers)
    results = list(zip(*results))
    LOGGER.finish()

    # drop the output kind that was not produced for any structure
    for i in reversed(range(len(results))):
        if all(j is None for j in results[i]):
            results.pop(i)
    if len(results) == 1:
        results = results[0]
    results = list(results)

    model = last_kwargs.get('model')
    header = last_kwargs.get('header', False)
    if model != 0 and header and results:
        numPdbs = len(results[0])
    else:
        numPdbs = len(results)

    LOGGER.info('{0} PDBs were parsed in {1:.2f}s.'
                .format(numPdbs, time.time() - start))

    return results
def buildPDBEnsemble(PDBs, ref=None, title='Unknown', labels=None,
                     mapping_func=mapOntoChain, unmapped=None, **kwargs):
    """Builds a PDB ensemble from a given reference structure and a list of
    PDB structures.  Note that the reference structure should be included
    in the list as well.

    :arg PDBs: A list of PDB structures
    :type PDBs: iterable

    :arg ref: Reference structure or the index to the reference in
        ``PDBs``.  If **None**, then the first item in ``PDBs`` will be
        considered as the reference.  Default is **None**
    :type ref: int, :class:`.Chain`, :class:`.Selection`, or
        :class:`.AtomGroup`

    :arg title: The title of the ensemble
    :type title: str

    :arg labels: labels of the conformations
    :type labels: list

    :arg mapping_func: function called as
        ``mapping_func(pdb, chain, index=i, **kwargs)`` for every reference
        chain; the first mapping it returns is used

    :arg occupancy: Minimal occupancy of columns (range from 0 to 1).
        Columns whose occupancy is below this value will be trimmed.
    :type occupancy: float

    :arg unmapped: A list of PDB IDs that cannot be included in the
        ensemble.  This is an output argument.
    :type unmapped: list

    :arg subset: A subset for selecting particular atoms from the input
        structures.  Default is calpha
    :type subset: str

    :arg superpose: if true (default), the ensemble is superposed with
        ``iterpose()`` before it is returned
    :type superpose: bool
    """

    min_occupancy = kwargs.pop('occupancy', None)
    degeneracy = kwargs.pop('degeneracy', True)
    do_superpose = kwargs.pop('superpose', True)
    # subset is only read here and stays in kwargs, so that mapping_func
    # receives it as well
    subset = str(kwargs.get('subset', 'calpha')).lower()

    n_structures = len(PDBs)
    if n_structures == 1:
        raise ValueError('PDBs should have at least two items')

    if labels is None:
        labels = [None if pdb is None else pdb.getTitle() for pdb in PDBs]
    elif len(labels) != n_structures:
        raise TypeError('Labels and PDBs must have the same lengths.')

    # resolve the reference structure
    if ref is None:
        refpdb = PDBs[0]
    elif isinstance(ref, Integral):
        refpdb = PDBs[ref]
    elif ref in PDBs:
        refpdb = ref
    else:
        raise ValueError('refpdb should be also in the PDBs')

    # chains of the (sub)selected reference define what has to be mapped
    if subset != 'all':
        refpdb = refpdb.select(subset)

    try:
        ref_chains = list(refpdb.getHierView())
    except AttributeError:
        raise TypeError('refpdb must have getHierView')

    t_start = time.time()

    # join all reference chains into a single set of atoms
    ref_atoms = ref_chains[0]
    for chain in ref_chains[1:]:
        ref_atoms += chain

    # the ensemble starts out with the reference atoms and coordinates
    ensemble = PDBEnsemble(title)
    ensemble.setAtoms(ref_atoms)
    ensemble.setCoords(ref_atoms.getCoords())

    if unmapped is None:
        unmapped = []

    n_ref_chains = len(ref_chains)
    progress_label = '_prody_buildPDBEnsemble'
    LOGGER.progress('Building the ensemble...', n_structures,
                    progress_label)
    for i, pdb in enumerate(PDBs):
        lbl = labels[i]
        if pdb is None:
            unmapped.append(lbl)
            continue

        LOGGER.update(i, 'Mapping %s to the reference...' % pdb.getTitle(),
                      label=progress_label)
        try:
            pdb.getHierView()
        except AttributeError:
            raise TypeError('PDBs must be a list of instances having the access to getHierView')

        # map the structure onto every reference chain, giving up at the
        # first chain that cannot be mapped
        chain_maps = []
        for chain in ref_chains:
            mappings = mapping_func(pdb, chain, index=i, **kwargs)
            if len(mappings) == 0:
                break
            chain_maps.append(mappings[0][0])

        if len(chain_maps) != n_ref_chains:
            unmapped.append(lbl)
            continue

        # merge the per-chain mappings into one atom map
        atommap = chain_maps[0]
        for extra in chain_maps[1:]:
            atommap += extra

        ensemble.addCoordset(atommap, weights=atommap.getFlags('mapped'),
                             label=lbl, degeneracy=degeneracy)

    LOGGER.finish()

    if min_occupancy is not None:
        ensemble = trimPDBEnsemble(ensemble, occupancy=min_occupancy)

    if do_superpose:
        ensemble.iterpose()

    LOGGER.info('Ensemble ({0} conformations) were built in {1:.2f}s.'
                .format(ensemble.numConfs(), time.time() - t_start))

    if unmapped:
        LOGGER.warn('{0} structures cannot be mapped.'.format(len(unmapped)))
    return ensemble
def buildPDBEnsemble(atomics, ref=None, title='Unknown', labels=None,
                     atommaps=None, unmapped=None, **kwargs):
    """Builds a :class:`.PDBEnsemble` from a given reference structure and
    a list of structures (:class:`.Atomic` instances).  Note that the
    reference should be included in the list as well.

    :arg atomics: a list of :class:`.Atomic` instances
    :type atomics: list

    :arg ref: reference structure or the index to the reference in
        *atomics*.  If **None**, then the first item in *atomics* will be
        considered as the reference.  If it is a :class:`.PDBEnsemble`
        instance, then *atomics* will be appended to the existing ensemble.
        Default is **None**
    :type ref: int, :class:`.Chain`, :class:`.Selection`, or
        :class:`.AtomGroup`

    :arg title: the title of the ensemble
    :type title: str

    :arg labels: labels of the conformations
    :type labels: list

    :arg degeneracy: whether only the active coordinate set (**True**) or
        all the coordinate sets (**False**) of each structure should be
        added to the ensemble.  Default is **True**
    :type degeneracy: bool

    :arg occupancy: minimal occupancy of columns (range from 0 to 1).
        Columns whose occupancy is below this value will be trimmed
    :type occupancy: float

    :arg atommaps: labels of *atomics* that were mapped and added into the
        ensemble.  This is an output argument
    :type atommaps: list

    :arg unmapped: labels of *atomics* that cannot be included in the
        ensemble.  This is an output argument
    :type unmapped: list

    :arg subset: a subset for selecting particular atoms from the input
        structures.  Default is ``"all"``
    :type subset: str

    :arg superpose: if set to ``'iter'``, :func:`.PDBEnsemble.iterpose`
        will be used to superpose the structures, otherwise conformations
        will be superposed with respect to the reference specified by
        *ref* unless set to ``False``.  Default is ``'iter'``
    :type superpose: str, bool
    """

    min_occupancy = kwargs.pop('occupancy', None)
    degeneracy = kwargs.pop('degeneracy', True)
    # subset is only read here and stays in kwargs for alignChains
    subset = str(kwargs.get('subset', 'all')).lower()
    # 'iterpose', when given, takes precedence over 'superpose'
    how_superpose = kwargs.pop('superpose', 'iter')
    how_superpose = kwargs.pop('iterpose', how_superpose)
    debug = kwargs.pop('debug', {})

    if 'mapping_func' in kwargs:
        raise DeprecationWarning(
            'mapping_func is deprecated. Please see release notes for '
            'more details: http://prody.csb.pitt.edu/manual/release/v1.11_series.html'
        )

    t_start = time.time()

    if not isListLike(atomics):
        raise TypeError('atomics should be list-like')

    n_atomics = len(atomics)
    if n_atomics == 1 and degeneracy is True:
        raise ValueError('atomics should have at least two items')

    if labels is None:
        labels = [None if atoms is None else atoms.getTitle()
                  for atoms in atomics]
    elif len(labels) != n_atomics:
        raise TypeError('Labels and atomics must have the same lengths.')

    extend_existing = isinstance(ref, PDBEnsemble)

    # resolve the target that structures are mapped onto
    if ref is None:
        target = atomics[0]
    elif isinstance(ref, Integral):
        target = atomics[ref]
    elif extend_existing:
        target = ref._atoms
    else:
        target = ref

    # set up the ensemble; isrefset tells whether reference coordinates
    # are already in place
    # NOTE(review): isrefset stays False when ref is an existing ensemble,
    # so the first mapped structure resets its coordinates -- confirm that
    # this is intended
    isrefset = False
    if extend_existing:
        ensemble = ref
    else:
        # select the subset of reference beforehand for the sake of
        # efficiency
        if subset != 'all':
            target = target.select(subset)
        ensemble = PDBEnsemble(title)
        if isinstance(target, Atomic):
            ensemble.setAtoms(target)
            ensemble.setCoords(target.getCoords())
            isrefset = True
        else:
            ensemble._n_atoms = len(target)

    if unmapped is None:
        unmapped = []
    if atommaps is None:
        atommaps = []

    progress_label = '_prody_buildPDBEnsemble'
    LOGGER.progress('Building the ensemble...', n_atomics, progress_label)
    for i, atoms in enumerate(atomics):
        label = labels[i]
        if atoms is None:
            unmapped.append(label)
            continue

        LOGGER.update(i, 'Mapping %s to the reference...'
                      % atoms.getTitle(), label=progress_label)
        try:
            atoms.getHierView()
        except AttributeError:
            raise TypeError(
                'atomics must be a list of instances having the access to getHierView'
            )

        if subset != 'all':
            atoms = atoms.select(subset)

        # align the chains of this structure with those of the target;
        # debugging output is collected per label
        chain_debug = {}
        debug[label] = chain_debug
        mapped = alignChains(atoms, target, debug=chain_debug, **kwargs)

        n_mapped = len(mapped)
        if n_mapped == 0:
            unmapped.append(label)
            continue
        atommaps.extend(mapped)

        # every atom map becomes a conformation; when there are several,
        # the chain identifiers are appended to tell them apart
        base_label = pystr(label)
        for atommap in mapped:
            lbl = base_label
            if n_mapped > 1:
                lbl += '_%s' % ''.join(np.unique(atommap.getChids()))
            ensemble.addCoordset(atommap,
                                 weights=atommap.getFlags('mapped'),
                                 label=lbl, degeneracy=degeneracy)
            if not isrefset:
                ensemble.setCoords(atommap.getCoords())
                isrefset = True

    LOGGER.finish()

    if min_occupancy is not None:
        ensemble = trimPDBEnsemble(ensemble, occupancy=min_occupancy)

    if how_superpose == 'iter':
        ensemble.iterpose()
    elif how_superpose is not False:
        ensemble.superpose()

    LOGGER.info('Ensemble ({0} conformations) were built in {1:.2f}s.'.format(
        ensemble.numConfs(), time.time() - t_start))

    if unmapped:
        LOGGER.warn('{0} structures cannot be mapped.'.format(len(unmapped)))
    return ensemble
def writeDCD(filename, trajectory, start=None, stop=None, step=None,
             align=False):
    """Write 32-bit CHARMM format DCD file (also NAMD 2.1 and later).
    *trajectory* can be an :class:`Trajectory`, :class:`DCDFile`,
    :class:`Ensemble`, or :class:`Atomic` instance.  *filename* is returned
    upon successful output of file.

    :arg filename: output filename, ``.dcd`` is appended when it is missing
    :type filename: str

    :arg trajectory: source of coordinate sets

    :arg start: index of the first coordinate set to write
    :type start: int

    :arg stop: index at which writing stops (exclusive)
    :type stop: int

    :arg step: stride between written coordinate sets
    :type step: int

    :arg align: if **True**, ``superpose()`` is called on each frame before
        it is written
    :type align: bool

    :raises TypeError: when *trajectory* is not a supported type
    :raises ValueError: when no coordinate sets or no atoms are selected"""

    if not filename.lower().endswith('.dcd'):
        filename += '.dcd'

    if not isinstance(trajectory, (TrajBase, Ensemble, Atomic)):
        raise TypeError('{0} is not a valid type for trajectory'
                        .format(type(trajectory)))

    irange = list(range(*slice(start, stop, step)
                        .indices(trajectory.numCoordsets())))
    n_csets = len(irange)
    if n_csets == 0:
        raise ValueError('trajectory does not have any coordinate sets, or '
                         'no coordinate sets are selected')

    isAtomic = isinstance(trajectory, Atomic)
    if isAtomic:
        n_atoms = trajectory.numAtoms()
    else:
        n_atoms = trajectory.numSelected()
    if n_atoms == 0:
        raise ValueError('no atoms are selected in the trajectory')

    isTrajectory = isinstance(trajectory, TrajBase)
    if isTrajectory:
        unitcell = trajectory.hasUnitcell()
        # remember the read position so that it can be restored afterwards
        nfi = trajectory.nextIndex()
        trajectory.reset()
        if isinstance(trajectory, Trajectory):
            # header values are taken from the first entry of each sequence
            timestep = trajectory.getTimestep()[0]
            first_ts = trajectory.getFirstTimestep()[0]
            framefreq = trajectory.getFrameFreq()[0]
        else:
            timestep = trajectory.getTimestep()
            first_ts = trajectory.getFirstTimestep()
            framefreq = trajectory.getFrameFreq()
    else:
        unitcell = False
        if isinstance(trajectory, Ensemble):
            frame = trajectory[0]
        else:
            frame = trajectory
            # remember the active coordinate set so that it can be restored
            acsi = trajectory.getACSIndex()
        timestep = 1
        first_ts = 0
        framefreq = 1

    dcd = DCDFile(filename, mode='w')
    LOGGER.progress('Writing DCD', n_csets, '_prody_writeDCD')
    prev = -1
    uc = None
    n_written = 0
    time_ = time()
    try:
        for i in irange:
            if isTrajectory:
                # only trajectories are read sequentially and need to skip
                # unselected frames; ensembles and atomic objects are
                # addressed by index below
                diff = i - prev
                prev = i
                if diff > 1:
                    trajectory.skip(diff - 1)
                frame = next(trajectory)
                if frame is None:
                    # no frame was returned, stop writing early
                    break
                if unitcell:
                    # NOTE(review): presumably converts the cell angles to
                    # the form stored in DCD files and reorders the six
                    # values accordingly -- confirm against the DCD format
                    uc = frame._getUnitcell()
                    uc[3:] = np.sin((PISQUARE / 90) * (90 - uc[3:]))
                    uc = uc[[0, 3, 1, 4, 5, 2]]
            elif isAtomic:
                frame.setACSIndex(i)
            else:
                frame._index = i
            if align:
                frame.superpose()
            if n_written == 0:
                # header information is passed along with the first frame
                dcd.write(frame._getCoords(), uc, timestep=timestep,
                          firsttimestep=first_ts, framefreq=framefreq)
            else:
                dcd.write(frame._getCoords(), uc)
            n_written += 1
            # report the number of frames done, not the frame index, since
            # the progress was initialized with the number of frames
            LOGGER.update(n_written, label='_prody_writeDCD')
    finally:
        # release the file and restore caller-visible state even on errors
        LOGGER.finish()
        dcd.close()
        if isAtomic:
            trajectory.setACSIndex(acsi)
        if isTrajectory:
            trajectory.goto(nfi)

    time_ = time() - time_ or 0.01
    dcd_size = 1.0 * (56 + (n_atoms * 3 + 6) * 4) * n_written / (1024 * 1024)
    LOGGER.info('DCD file was written in {0:.2f} seconds.'.format(time_))
    LOGGER.info('{0:.2f} MB written at input rate {1:.2f} MB/s.'
                .format(dcd_size, dcd_size / time_))
    LOGGER.info('{0} coordinate sets written at output rate {1} frame/s.'
                .format(n_written, int(n_written / time_)))
    if n_written != n_csets:
        LOGGER.warn('Warning: {0} frames expected, {1} written.'
                    .format(n_csets, n_written))
    return filename
def addPDBEnsemble(ensemble, PDBs, refpdb=None, labels=None,
                   mapping_func=mapOntoChain, occupancy=None,
                   unmapped=None, **kwargs):
    """Adds extra structures to a given PDB ensemble.

    :arg ensemble: the ensemble to which the PDBs are added
    :type ensemble: :class:`.PDBEnsemble`

    :arg PDBs: A list of PDB structures
    :type PDBs: iterable

    :arg refpdb: reference structure.  If set to `None`, it will be set to
        the atoms of *ensemble* automatically
    :type refpdb: :class:`.Chain`, :class:`.Selection`, or
        :class:`.AtomGroup`

    :arg labels: labels of the conformations
    :type labels: list

    :arg mapping_func: function called as
        ``mapping_func(pdb, chain, index=i, **kwargs)`` for every reference
        chain; the first mapping it returns is used

    :arg occupancy: minimal occupancy of columns (range from 0 to 1).
        Columns whose occupancy is below this value will be trimmed
    :type occupancy: float

    :arg unmapped: a list of PDB IDs that cannot be included in the
        ensemble.  This is an output argument
    :type unmapped: list

    :arg subset: subset used for selecting atoms of *refpdb* when it is
        given.  Default is calpha
    :type subset: str

    :arg superpose: if true (default), the ensemble is superposed with
        ``iterpose()`` before it is returned
    :type superpose: bool

    Remaining keyword arguments are passed on to *mapping_func*.

    :raises TypeError: when *labels* and *PDBs* differ in length, or when
        an item of *PDBs* is not a Chain, Selection, or AtomGroup
    """

    degeneracy = kwargs.pop('degeneracy', True)
    # subset is only read here and stays in kwargs for mapping_func
    subset = str(kwargs.get('subset', 'calpha')).lower()
    superpose = kwargs.pop('superpose', True)

    if labels is not None:
        if len(labels) != len(PDBs):
            raise TypeError('Labels and PDBs must have the same lengths.')
    else:
        labels = []
        for pdb in PDBs:
            if pdb is None:
                labels.append(None)
            else:
                labels.append(pdb.getTitle())

    # obtain refchains from the hierarchical view of the reference PDB
    if refpdb is None:
        refpdb = ensemble._atoms
    elif subset != 'all':
        refpdb = refpdb.select(subset)
    refchains = list(refpdb.getHierView())

    start = time.time()

    # add the PDBs to the ensemble
    if unmapped is None:
        unmapped = []

    # count additions explicitly: *unmapped* may already hold entries from
    # an earlier call, so its length cannot be used to derive the count
    n_added = 0

    LOGGER.progress('Appending the ensemble...', len(PDBs),
                    '_prody_addPDBEnsemble')
    for i, pdb in enumerate(PDBs):
        lbl = labels[i]
        if pdb is None:
            unmapped.append(lbl)
            continue

        LOGGER.update(i, 'Mapping %s to the reference...' % pdb.getTitle(),
                      label='_prody_addPDBEnsemble')
        if not isinstance(pdb, (Chain, Selection, AtomGroup)):
            raise TypeError(
                'PDBs must be a list of Chain, Selection, or AtomGroup.')

        # find the mapping of the pdb to each reference chain, giving up
        # at the first chain that cannot be mapped
        atommaps = []
        for chain in refchains:
            mappings = mapping_func(pdb, chain, index=i, **kwargs)
            if len(mappings) > 0:
                atommaps.append(mappings[0][0])
            else:
                break

        if len(atommaps) != len(refchains):
            unmapped.append(lbl)
            continue

        # combine the mappings of pdb to reference chains
        atommap = atommaps[0]
        for j in range(1, len(atommaps)):
            atommap += atommaps[j]

        # add the mappings to the ensemble
        ensemble.addCoordset(atommap, weights=atommap.getFlags('mapped'),
                             label=lbl, degeneracy=degeneracy)
        n_added += 1

    LOGGER.finish()

    if occupancy is not None:
        ensemble = trimPDBEnsemble(ensemble, occupancy=occupancy)
    if superpose:
        ensemble.iterpose()

    LOGGER.info('{0} PDBs were added to the ensemble in {1:.2f}s.'.format(
        n_added, time.time() - start))

    if unmapped:
        LOGGER.warn('{0} structures cannot be mapped.'.format(len(unmapped)))
    return ensemble
def parsePfamPDBs(query, data=None, **kwargs):
    """Returns a list of AtomGroups containing sections of chains that
    correspond to a particular PFAM domain family.  These are defined by
    alignment start and end residue numbers.

    :arg query: UniProt ID or PDB ID
        If a PDB ID is provided the corresponding UniProt ID is used.
        If this returns multiple matches then start or end must also be
        provided.  This query is also used for label refinement of the Pfam
        domain MSA.  A query that is longer than 4 characters and starts
        with ``PF`` is used directly as the Pfam accession.
    :type query: str

    :arg data: If given the data list from the Pfam mapping table will be
        output through this argument.
    :type data: list

    :keyword start: Residue number for defining the start of the domain.
        The PFAM domain that starts closest to this will be selected.
        Default is **1**
    :type start: int

    :keyword end: Residue number for defining the end of the domain.
        The PFAM domain that ends closest to this will be selected.
    :type end: int

    :keyword header: if true, a tuple of (AtomGroups, headers) is returned
    :type header: bool

    :keyword model: when **0**, only headers are parsed and the output of
        :func:`parsePDB` is returned as is

    Structures for which the domain cannot be located are left out of the
    output.  Remaining keyword arguments are passed on to :func:`parsePDB`.

    :raises ValueError: when neither *start* nor *end* is an integer and
        *query* is not a Pfam accession
    """

    start = kwargs.pop('start', 1)
    end = kwargs.pop('end', None)

    if len(query) > 4 and query.startswith('PF'):
        pfam_acc = query
    else:
        pfam_matches = searchPfam(query)
        keys = list(pfam_matches.keys())

        # pick the domain whose first location starts (or ends) closest
        # to the requested residue number
        if isinstance(start, Integral):
            start_diff = np.array(
                [int(pfam_matches[key]['locations'][0]['start']) - start
                 for key in keys])
            pfam_acc = keys[np.where(
                abs(start_diff) == min(abs(start_diff)))[0][0]]
        elif isinstance(end, Integral):
            end_diff = np.array(
                [int(pfam_matches[key]['locations'][0]['end']) - end
                 for key in keys])
            pfam_acc = keys[np.where(
                abs(end_diff) == min(abs(end_diff)))[0][0]]
        else:
            raise ValueError('Please provide an integer for start or end '
                             'when using a UniProt ID or PDB ID.')

    from ftplib import FTP
    from .uniprot import queryUniprot

    # download the Pfam to PDB mapping table
    data_stream = BytesIO()
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    try:
        ftp.login()
        ftp.cwd('pub/databases/Pfam/current_release')
        ftp.retrbinary('RETR pdbmap.gz', data_stream.write)
        ftp.quit()
    except Exception:
        # do not leave the connection open when the transfer fails
        ftp.close()
        data_stream.close()
        raise
    zip_data = data_stream.getvalue()
    data_stream.close()
    rawdata = gunzip(zip_data)
    if PY3K:
        rawdata = rawdata.decode()

    fields = ['PDB_ID', 'chain', 'nothing', 'PFAM_Name', 'PFAM_ACC',
              'UniprotAcc', 'UniprotResnumRange']

    # keep the rows of the table that mention the selected accession
    data_dicts = []
    for line in rawdata.split('\n'):
        if pfam_acc in line:
            entries = [entry.strip(';')
                       for entry in line.strip().split('\t')]
            data_dicts.append(dict(zip(fields, entries)))

    pdb_ids = [data_dict['PDB_ID'] for data_dict in data_dicts]
    chains = [data_dict['chain'] for data_dict in data_dicts]

    return_header = kwargs.pop('header', False)
    model = kwargs.get('model', None)
    # headers are always parsed, they are needed for locating the domains
    results = parsePDB(*pdb_ids, chain=chains, header=True, **kwargs)

    if model == 0:
        # nothing to trim; return before assuming an (atoms, headers) pair
        LOGGER.info('only header is requested and returned')
        return results

    ags, headers = results
    ags, headers = list(ags), list(headers)

    if return_header:
        results = (ags, headers)
    else:
        results = ags

    LOGGER.progress('Extracting Pfam domains...', len(ags))
    comma_splitter = re.compile(r'\s*,\s*').split
    no_info = []
    for i, ag in enumerate(ags):
        LOGGER.update(i)
        data_dict = data_dicts[i]
        pfamRange = data_dict['UniprotResnumRange'].split('-')
        uniprotAcc = data_dict['UniprotAcc']
        try:
            uniData = queryUniprot(uniprotAcc)
        except Exception:
            LOGGER.warn('No Uniprot record found for {0}'.format(
                data_dict['PDB_ID']))
            # without the record the domain cannot be located, so this
            # structure is left out like the other unmappable ones
            no_info.append(i)
            continue

        # look for the UniProt residue range covered by this PDB chain
        resrange = None
        found = False
        for key, value in uniData.items():
            if not key.startswith('dbReference'):
                continue
            try:
                pdbid = value['PDB']
            except Exception:
                continue
            if pdbid != data_dict['PDB_ID']:
                continue
            # example chain strings: "A=27-139, B=140-150" or "A/B=27-150"
            for chain in comma_splitter(value['chains']):
                chids, resrange = chain.split('=')
                chids = [chid.strip() for chid in chids.split('/')]
                if data_dict['chain'] in chids:
                    resrange = resrange.split('-')
                    found = True
                    break
            if found:
                break

        if not found:
            no_info.append(i)
            continue

        chain_header = headers[i][data_dict['chain']]
        chain_accessions = [dbref.accession
                            for dbref in chain_header.dbrefs]
        try:
            if len(chain_accessions) > 0:
                right_part = np.where(
                    np.array(chain_accessions) ==
                    data_dict['UniprotAcc'])[0][0]
            else:
                raise ValueError(
                    'There is no accession for a chain in the Header')
        except Exception:
            LOGGER.warn('Could not map domains in {0}'.format(
                data_dict['PDB_ID'] + data_dict['chain']))
            no_info.append(i)
            continue

        right_dbref = chain_header.dbrefs[right_part]
        chainStart = ag.select('chain {0}'.format(
            data_dict['chain'])).getResnums()[0]
        # offset between the first residue present in the chain and the
        # first residue given by the database reference
        missing = chainStart - right_dbref.first[0]
        partStart = ag.getResindices()[np.where(
            ag.getResnums() == right_dbref.first[0] + missing)][0]
        pfStart, pfEnd = int(pfamRange[0]), int(pfamRange[1])
        uniStart = int(resrange[0])
        # translate the Pfam range from UniProt numbering to residue indices
        resiStart = pfStart - uniStart + partStart - missing
        resiEnd = pfEnd - uniStart + partStart - missing
        ags[i] = ag.select('resindex {0} to {1}'.format(
            resiStart, resiEnd))

    LOGGER.finish()

    # drop the entries that could not be processed, keeping both lists
    # aligned; go backwards so that the remaining indices stay valid
    for i in reversed(no_info):
        ags.pop(i)
        headers.pop(i)

    if data is not None:
        if isinstance(data, list):
            data.extend(data_dicts)
        else:
            LOGGER.warn('data should be a list in order to get output')

    return results