def calcANM(pdb, selstr='calpha', cutoff=15., gamma=1., n_modes=20,
            zeros=False):
    """Build an :class:`ANM` for a structure and calculate its modes.

    *pdb* is either a string, which is handed to :func:`parsePDB`, or an
    :class:`.Atomic` instance.  Atoms matching *selstr* (``'calpha'`` by
    default) are used to build the Hessian with *cutoff* and *gamma*;
    *n_modes* and *zeros* are forwarded to :meth:`ANM.calcModes`.

    Returns a tuple of the :class:`ANM` instance and the selected atoms.
    Raises :exc:`TypeError` when *pdb* is neither a string nor an
    :class:`.Atomic` instance."""

    if isinstance(pdb, str):
        atoms = parsePDB(pdb)
        name = atoms.getTitle()
    elif isinstance(pdb, Atomic):
        atoms = pdb
        # Anything that is not an AtomGroup borrows the title of its group.
        owner = atoms if isinstance(pdb, AtomGroup) else atoms.getAtomGroup()
        name = owner.getTitle()
    else:
        raise TypeError('pdb must be an atomic class, not {0}'
                        .format(type(pdb)))

    model = ANM(name)
    selection = atoms.select(selstr)
    model.buildHessian(selection, cutoff, gamma)
    model.calcModes(n_modes, zeros)
    return model, selection
def parsePDBs(self, **kwargs):
    """Load PDB into memory as :class:`.AtomGroup` instances using
    :func:`.parsePDB` and perform selection based on residue ranges given
    by CATH.

    Keyword arguments are passed on to :func:`.parsePDB`.  Two of them are
    also inspected here: *header* (default **False**), in which case the
    parser result is unpacked as ``(structures, headers)``, and *model*;
    when *model* is ``0`` the parser result is returned untouched and no
    selection is attempted.

    When the parser hands back a single structure instead of a list, it is
    wrapped in a one-element list (and so is its header) so that the
    selection can be stored; the wrapped form is what gets returned."""

    pdbs = self.getPDBs(True)
    selstrs = self.getSelStrs()
    header = kwargs.get('header', False)
    model = kwargs.get('model', None)

    LOGGER.timeit('_cath_parsePDB')
    LOGGER.info('Parsing {0} PDB files...'.format(len(pdbs)))
    ret = parsePDB(*pdbs, **kwargs)

    if model != 0:
        if header:
            prots, headers = ret
        else:
            prots, headers = ret, None

        # A lone structure cannot be indexed/assigned like a list of
        # structures, so normalise it first.
        if not isinstance(prots, list):
            prots = [prots]
            if header:
                headers = [headers]
                ret = (prots, headers)
            else:
                ret = prots

        LOGGER.info('Extracting domains...')
        # Elements are replaced in place so that *ret* sees the selections.
        for i, prot in enumerate(prots):
            prots[i] = prot.select(selstrs[i])

    LOGGER.report('CATH domains are parsed and extracted in %.2fs',
                  '_cath_parsePDB')
    return ret
def test2(pdb='2nwl-mem.pdb'):
    """Build an :class:`exANM` titled ``'2nwl'`` from the ``'ca'`` subset of
    *pdb*, calculate its modes with default settings and return it."""

    from prody import parsePDB

    calphas = parsePDB(pdb, subset='ca')

    model = exANM('2nwl')
    model.buildHessian(calphas)
    model.calcModes()
    return model
def test(pdb='2ci2'):
    """Parse the ``'ca'`` subset of *pdb*, build a :class:`bbENM` Hessian
    with a cutoff of 12 and return the model.  Modes are not calculated."""

    from prody import parsePDB

    # Keep the identifier and the parsed atoms under separate names.
    atoms = parsePDB(pdb, subset='ca')

    bbenm = bbENM()
    bbenm.buildHessian(atoms, cutoff=12.)
    return bbenm
def test(pdb='2ci2'):
    """Parse the ``'ca'`` subset of *pdb*, build a :class:`bbENM` titled
    ``'2ci2'`` with a cutoff of 7, request all modes (``n_modes=None``)
    and return the model."""

    from prody import parsePDB

    # Keep the identifier and the parsed atoms under separate names.
    atoms = parsePDB(pdb, subset='ca')

    bbenm = bbENM('2ci2')
    bbenm.buildHessian(atoms, cutoff=7.)
    bbenm.calcModes(n_modes=None)
    return bbenm
def test():
    """Return the overall net entropy transfer for chain A of ``1z83``.

    The ``'ca'`` subset of chain A is parsed, a :class:`GNM` Kirchhoff
    matrix is built with a cutoff of 7.0, all modes are requested
    (``n_modes=None``) and the result of
    :func:`calcOverallNetEntropyTransfer` with ``turbo=True`` is
    returned."""

    from prody import parsePDB, GNM
    from prody.dynamics.analysis import calcOverallNetEntropyTransfer

    atoms = parsePDB('1z83', subset='ca', chain='A')

    gnm = GNM()
    gnm.buildKirchhoff(atoms, cutoff=7.0)
    gnm.calcModes(n_modes=None)

    return calcOverallNetEntropyTransfer(gnm, turbo=True)
def test(pdb='2nwl-mem.pdb', blk='2nwl.blk'):
    """Build an :class:`RTB` Hessian for *pdb* using blocks read from *blk*.

    Every atom of the ``'ca'`` subset starts in block 0.  Each line of
    *blk* that starts with ``BLOCK`` is split into eight whitespace
    separated fields; the second is the block number, the fourth the
    chain and the fifth and eighth the residue number range that receives
    that block number.  Block numbers are also copied into the beta
    column and the structure is written to ``pdb2gb1_truncated.pdb``.

    Returns the :class:`RTB` instance; modes are not calculated."""

    from numpy import zeros
    from prody import parsePDB, writePDB

    atoms = parsePDB(pdb, subset='ca')
    atoms.setData('block', zeros(len(atoms), int))

    with open(blk) as handle:
        for record in handle:
            if not record.startswith('BLOCK'):
                continue
            # Exactly eight fields are expected; only four are used.
            _, block_id, _n1, chain, start, _n2, _c2, stop = record.split()
            hits = atoms.select('chain {} and resnum {} to {}'.format(
                chain, start, stop))
            if hits:
                hits.setData('block', int(block_id))

    atoms.setBetas(atoms.getData('block'))
    writePDB('pdb2gb1_truncated.pdb', atoms)

    rtb = RTB('2nwl')
    rtb.buildHessian(atoms, atoms.getData('block'))
    return rtb
def test(pdb='2nwl-mem.pdb', blk='2nwl.blk'):
    """Assign rigid blocks from *blk* to *pdb* and build an :class:`RTB`
    Hessian from them.

    The ``'ca'`` subset of *pdb* is parsed and all atoms are initialised
    to block 0.  For each ``BLOCK`` line in *blk* (eight whitespace
    separated fields) the atoms selected by chain (field 4) and residue
    number range (fields 5 and 8) are given the block number in field 2.
    The block numbers are stored as betas and the structure is written to
    ``pdb2gb1_truncated.pdb`` before the Hessian is built.

    Returns the :class:`RTB` instance without calculating modes."""

    from numpy import zeros
    from prody import parsePDB, writePDB

    structure = parsePDB(pdb, subset='ca')
    structure.setData('block', zeros(len(structure), int))

    selection_template = 'chain {} and resnum {} to {}'
    with open(blk) as blk_file:
        for entry in blk_file:
            if entry.startswith('BLOCK'):
                # The unpacking insists on eight fields, as before.
                _, number, _n1, chid, res_from, _n2, _c2, res_to = \
                    entry.split()
                members = structure.select(
                    selection_template.format(chid, res_from, res_to))
                if members:
                    members.setData('block', int(number))

    block_ids = structure.getData('block')
    structure.setBetas(block_ids)
    writePDB('pdb2gb1_truncated.pdb', structure)

    model = RTB('2nwl')
    model.buildHessian(structure, structure.getData('block'))
    return model
def calcGNM(pdb, selstr="calpha", cutoff=15.0, gamma=1.0, n_modes=20,
            zeros=False):
    """Return a :class:`GNM` instance and atoms used for the calculations.
    By default only alpha carbons are considered, but selection string helps
    selecting a subset of it.  *pdb* can be :class:`.Atomic` instance.

    *pdb* may also be a string, which is handed to :func:`parsePDB`.
    *cutoff* and *gamma* are used to build the Kirchhoff matrix; *n_modes*
    and *zeros* are forwarded to :meth:`GNM.calcModes`.

    Raises :exc:`TypeError` when *pdb* is neither a string nor an
    :class:`.Atomic` instance."""

    if isinstance(pdb, str):
        ag = parsePDB(pdb)
        title = ag.getTitle()
    elif isinstance(pdb, Atomic):
        ag = pdb
        if isinstance(pdb, AtomGroup):
            title = ag.getTitle()
        else:
            title = ag.getAtomGroup().getTitle()
    else:
        raise TypeError("pdb must be an atom container, not {0}"
                        .format(type(pdb)))

    gnm = GNM(title)
    sel = ag.select(selstr)
    gnm.buildKirchhoff(sel, cutoff, gamma)
    # *zeros* used to be accepted and then silently dropped; forward it the
    # same way calcANM does.
    gnm.calcModes(n_modes, zeros=zeros)
    return gnm, sel
def calcANM(pdb, selstr='calpha', cutoff=15., gamma=1., n_modes=20,
            zeros=False):
    """Return an :class:`ANM` instance together with the atoms it was
    built from.

    *pdb* is a string (passed to :func:`parsePDB`) or an :class:`.Atomic`
    instance; anything else raises :exc:`TypeError`.  The atoms chosen by
    *selstr* (``'calpha'`` by default) are used to build the Hessian with
    *cutoff* and *gamma*, after which modes are calculated with *n_modes*
    and *zeros*."""

    # Reject unsupported input up front.
    if not isinstance(pdb, (str, Atomic)):
        raise TypeError('pdb must be an atomic class, not {0}'
                        .format(type(pdb)))

    if isinstance(pdb, str):
        structure = parsePDB(pdb)
        label = structure.getTitle()
    else:
        structure = pdb
        if isinstance(structure, AtomGroup):
            label = structure.getTitle()
        else:
            label = structure.getAtomGroup().getTitle()

    anm = ANM(label)
    chosen = structure.select(selstr)
    anm.buildHessian(chosen, cutoff, gamma)
    anm.calcModes(n_modes, zeros)
    return anm, chosen
def parsePDBs(self, **kwargs):
    """Load PDB into memory as :class:`.AtomGroup` instances using
    :func:`.parsePDB`.

    Keyword arguments are passed on to :func:`.parsePDB`.  *header*
    (default **False**) and *model* are also inspected here: when *model*
    is ``0`` the parser result is returned untouched; otherwise a single
    structure (one that is not already in a list) is wrapped in a
    one-element list, together with its header when *header* is set, so
    that callers always receive lists.

    No per-domain selection is applied to the structures; they are
    returned as parsed."""

    pdbs = self.getPDBs()
    header = kwargs.get('header', False)
    model = kwargs.get('model', None)

    LOGGER.timeit('_uniprot_parsePDB')
    LOGGER.info('Parsing {0} PDB files...'.format(len(pdbs)))
    ret = parsePDB(*pdbs, **kwargs)

    if model != 0:
        if header:
            prots, headers = ret
        else:
            prots, headers = ret, None

        # Only a lone structure needs wrapping; a list of structures (and
        # its list of headers) is already in the shape callers expect.
        if not isinstance(prots, list):
            if header:
                ret = ([prots], [headers])
            else:
                ret = [prots]

        LOGGER.info('Extracting domains...')
        # NOTE(review): selection strings are not available for this
        # record type, so the structures are left whole - confirm whether
        # domain extraction should be implemented here.

    LOGGER.report('Uniprot domains are parsed and extracted in %.2fs',
                  '_uniprot_parsePDB')
    return ret
def test(pdb="2nwl-mem.pdb", blk="2nwl.blk"):
    """Build an :class:`RTB` Hessian (``scale=64``) from coordinates and
    block numbers.

    The ``'ca'`` subset of *pdb* is parsed and every atom starts in block
    0.  Each ``BLOCK`` line of *blk* is split into eight whitespace
    separated fields: field 2 is the block number, field 4 the chain and
    fields 5 and 8 the residue number range that is assigned to it.  The
    block numbers are copied to the beta column, the structure is written
    to ``pdb2gb1_truncated.pdb`` and the Hessian is built from the
    coordinate array and the betas.

    Returns the :class:`RTB` instance; modes are not calculated."""

    from numpy import zeros
    from prody import parsePDB, writePDB

    atoms = parsePDB(pdb, subset="ca")
    atoms.setData("block", zeros(len(atoms), int))

    with open(blk) as inp:
        for line in inp:
            if not line.startswith("BLOCK"):
                continue
            # Eight fields are required; only four of them are used.
            _, b, _n1, c1, r1, _n2, _c2, r2 = line.split()
            sel = atoms.select(
                "chain {} and resnum {} to {}".format(c1, r1, r2))
            if sel:
                sel.setData("block", int(b))

    atoms.setBetas(atoms.getData("block"))
    coords = atoms.getCoords()
    blocks = atoms.getBetas()

    writePDB("pdb2gb1_truncated.pdb", atoms)

    rtb = RTB("2nwl")
    rtb.buildHessian(coords, blocks, scale=64)
    return rtb
def parseScipionModes(run_path, title=None, pdb=None):
    """Returns :class:`.NMA` containing eigenvectors and eigenvalues parsed
    from a ContinuousFlex FlexProtNMA Run directory.

    :arg run_path: path to the Run directory
    :type run_path: str

    :arg title: title for :class:`.NMA` object, default is the last
        component of *run_path*
    :type title: str

    :arg pdb: optional structure passed to :func:`.parsePDB`; its number
        of atoms is compared with the mode length.  Without it the number
        of atoms is taken to be a third of the mode length.
    :type pdb: str

    Mode files are located through the ``_nmaModefile`` column of
    :file:`modes.xmd`.  Eigenvalues come from the ``_nmaEigenval`` column
    when it can be read, otherwise from :file:`logs/run.stdout`; if
    neither yields any, a warning is logged and eigenvalues are set to
    **None**.  A :class:`.GNM` is returned instead of an :class:`.NMA`
    when the mode length is not three times the number of atoms.
    """

    if run_path.endswith("/"):
        run_path = run_path[:-1]
    run_name = os.path.split(run_path)[-1]
    top_dirs = os.path.split(run_path)[0][:-4]  # exclude "Runs"

    star_data = parseSTAR(run_path + '/modes.xmd')
    star_loop = star_data[0][0]
    n_modes = star_loop.numRows()

    row1 = star_loop[0]
    mode1 = parseArray(top_dirs + row1['_nmaModefile']).reshape(-1)
    dof = mode1.shape[0]

    if pdb is not None:
        n_atoms = parsePDB(pdb).numAtoms()
    else:
        # assume standard NMA
        n_atoms = dof // 3

    vectors = np.zeros((dof, n_modes))
    vectors[:, 0] = mode1

    eigvals = np.zeros(n_modes)
    try:
        eigvals[0] = float(row1['_nmaEigenval'])
    except Exception:
        # Column missing or unreadable: fall back to the run log below.
        found_eigvals = False
    else:
        found_eigvals = True

    for i, row in enumerate(star_loop[1:]):
        vectors[:, i + 1] = parseArray(top_dirs
                                       + row['_nmaModefile']).reshape(-1)
        if found_eigvals:
            eigvals[i + 1] = float(row['_nmaEigenval'])

    if not found_eigvals:
        log_fname = run_path + '/logs/run.stdout'
        mode_index = None
        with open(log_fname, 'r') as log:
            for line in log:
                if 'Eigenvector number' in line:
                    mode_index = int(line.strip().split()[-1]) - 1
                if 'Corresponding eigenvalue' in line:
                    if mode_index is None:
                        # An eigenvalue with no preceding eigenvector
                        # number cannot be placed; skip it.
                        continue
                    eigvals[mode_index] = float(line.strip().split()[-1])
                    found_eigvals = True

    if title is None:
        title = run_name

    if not found_eigvals:
        LOGGER.warn('No eigenvalues found')
        eigvals = None

    if dof == n_atoms * 3:
        nma = NMA(title)
    else:
        nma = GNM(title)

    nma.setEigens(vectors, eigvals)
    return nma
def alignPDBEnsemble(ensemble, suffix='_aligned', outdir='.', gzip=False):
    """Align PDB files using transformations from *ensemble*, which may be
    a :class:`.PDBEnsemble` or a :class:`.PDBConformation` instance. Label of
    the conformation (see :meth:`~.PDBConformation.getLabel`) will be used to
    determine the PDB structure and model number.  First four characters of
    the label is expected to be the PDB identifier and ending numbers to be the
    model number.  For example, the :class:`.Transformation` from conformation
    with label *2k39_ca_selection_'resnum_<_71'_m116* will be applied to 116th
    model of structure **2k39**.  After applicable transformations are made,
    structure will be written into *outdir* as :file:`2k39_aligned.pdb`.
    If ``gzip=True``, output files will be compressed.  Return value is
    the output filename or list of filenames, in the order files are processed.
    Note that if multiple models from a file are aligned, that filename will
    appear in the list multiple times.

    **None** takes the place of a filename when the PDB file is not found
    or the model number is out of range.  :exc:`TypeError` is raised for an
    unsupported *ensemble* and :exc:`ValueError` when a conformation has
    no transformation."""

    if not isinstance(ensemble, (PDBEnsemble, PDBConformation)):
        raise TypeError('ensemble must be a PDBEnsemble or PDBConformation')
    if isinstance(ensemble, PDBConformation):
        ensemble = [ensemble]

    extension = '.pdb' + ('.gz' if gzip else '')

    output = []
    # Multi-model structures are kept here and written once, after all of
    # their models have been transformed.
    pdbdict = {}
    for conf in ensemble:
        trans = conf.getTransformation()
        if trans is None:
            raise ValueError('transformations are not calculated, call '
                             '`superpose` or `iterpose`')
        label = conf.getLabel()

        pdb = label[:4]
        # Consult the cache first: dict.get(pdb, fetchPDB(pdb)) would run
        # the fetch on every iteration, cached or not.
        if pdb in pdbdict:
            source = pdbdict[pdb]
        else:
            source = fetchPDB(pdb)
        if source is None:
            LOGGER.warning('PDB file for conformation {0} is not found.'
                           .format(label))
            output.append(None)
            continue
        LOGGER.info('Parsing PDB file {0} for conformation {1}.'
                    .format(pdb, label))

        acsi = None
        model = None
        marker = label.rfind('m')
        if marker > 3:
            candidate = label[marker + 1:]
            if candidate.isdigit():
                model = candidate
                acsi = int(candidate) - 1
                LOGGER.info('Applying transformation to model {0}.'
                            .format(model))

        if isinstance(source, str):
            ag = parsePDB(source)
        else:
            ag = source

        if acsi is not None:
            # Model numbers start at 1, so "m0" is as invalid as a number
            # past the last coordinate set.
            if acsi < 0 or acsi >= ag.numCoordsets():
                LOGGER.warn('Model number {0} for {1} is out of range.'
                            .format(model, pdb))
                output.append(None)
                continue
            ag.setACSIndex(acsi)

        trans.apply(ag)
        outfn = os.path.join(outdir, pdb + suffix + extension)
        if ag.numCoordsets() > 1:
            pdbdict[pdb] = ag
        else:
            writePDB(outfn, ag)
        output.append(os.path.normpath(outfn))

    for pdb, ag in pdbdict.items():  # PY3K: OK
        writePDB(os.path.join(outdir, pdb + suffix + extension), ag)

    if len(output) == 1:
        return output[0]
    return output
def parsePfamPDBs(query, data=None, **kwargs):
    """Returns a list of AtomGroups containing sections of chains that
    correspond to a particular PFAM domain family. These are defined by
    alignment start and end residue numbers.

    :arg query: UniProt ID or PDB ID
        If a PDB ID is provided the corresponding UniProt ID is used.
        If this returns multiple matches then start or end must also be
        provided.  This query is also used for label refinement of the
        Pfam domain MSA.  A query longer than four characters that starts
        with ``PF`` is used directly as the Pfam accession.
    :type query: str

    :arg data: If given the data list from the Pfam mapping table will
        be output through this argument.
    :type data: list

    :keyword start: Residue number for defining the start of the domain.
        The PFAM domain that starts closest to this will be selected.
        Default is **1**
    :type start: int

    :keyword end: Residue number for defining the end of the domain.
        The PFAM domain that ends closest to this will be selected.
    :type end: int

    :keyword header: when true, ``(atomgroups, headers)`` is returned
        instead of the list of atom groups alone.  Default is **False**
    :type header: bool

    :keyword model: passed on to :func:`.parsePDB`; when it is ``0`` the
        parser result is returned as is and no domains are extracted.

    Structures for which the domain cannot be located (no UniProt record,
    no matching PDB cross-reference or no matching accession in the PDB
    header) are left out of the result.  Remaining keyword arguments are
    passed to :func:`.parsePDB`.
    """

    start = kwargs.pop('start', 1)
    end = kwargs.pop('end', None)

    if len(query) > 4 and query.startswith('PF'):
        pfam_acc = query
    else:
        pfam_matches = searchPfam(query)
        if not pfam_matches:
            raise ValueError('No Pfam matches found for {0}'
                             .format(repr(query)))
        keys = list(pfam_matches.keys())

        if isinstance(start, Integral):
            bound, target = 'start', start
        elif isinstance(end, Integral):
            bound, target = 'end', end
        else:
            raise ValueError('Please provide an integer for start or end '
                             'when using a UniProt ID or PDB ID.')

        # Pick the family whose first location lies closest to the target;
        # argmin keeps the first one on ties.
        diffs = np.array([int(pfam_matches[key]['locations'][0][bound])
                          - target for key in keys])
        pfam_acc = keys[np.argmin(np.abs(diffs))]

    from ftplib import FTP
    from .uniprot import queryUniprot

    data_stream = BytesIO()
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    try:
        ftp.login()
        ftp.cwd('pub/databases/Pfam/current_release')
        ftp.retrbinary('RETR pdbmap.gz', data_stream.write)
        ftp.quit()
    finally:
        # Releases the socket when anything above failed; harmless after
        # a successful quit().
        ftp.close()
    zip_data = data_stream.getvalue()
    data_stream.close()

    rawdata = gunzip(zip_data)
    if PY3K:
        rawdata = rawdata.decode()

    fields = ['PDB_ID', 'chain', 'nothing', 'PFAM_Name', 'PFAM_ACC',
              'UniprotAcc', 'UniprotResnumRange']

    data_dicts = []
    for line in rawdata.split('\n'):
        if pfam_acc in line:
            entries = line.strip().split('\t')
            data_dicts.append(dict(
                (field, entry.strip(';'))
                for field, entry in zip(fields, entries)))

    pdb_ids = [data_dict['PDB_ID'] for data_dict in data_dicts]
    chains = [data_dict['chain'] for data_dict in data_dicts]

    header = kwargs.pop('header', False)
    model = kwargs.get('model', None)
    results = parsePDB(*pdb_ids, chain=chains, header=True, **kwargs)

    # Checked before unpacking: a headers-only result need not be an
    # (atomgroups, headers) pair.
    if model == 0:
        LOGGER.info('only header is requested and returned')
        return results

    ags, headers = results
    ags, headers = list(ags), list(headers)

    if header:
        results = (ags, headers)
    else:
        results = ags

    LOGGER.progress('Extracting Pfam domains...', len(ags))
    comma_splitter = re.compile(r'\s*,\s*').split
    no_info = []
    for i, ag in enumerate(ags):
        LOGGER.update(i)
        data_dict = data_dicts[i]
        pfamRange = data_dict['UniprotResnumRange'].split('-')
        uniprotAcc = data_dict['UniprotAcc']
        try:
            uniData = queryUniprot(uniprotAcc)
        except Exception:
            LOGGER.warn('No Uniprot record found for {0}'
                        .format(data_dict['PDB_ID']))
            no_info.append(i)
            continue

        resrange = None
        found = False
        for key, value in uniData.items():
            if not key.startswith('dbReference'):
                continue
            try:
                pdbid = value['PDB']
            except (KeyError, TypeError, IndexError):
                # not a PDB cross-reference
                continue
            if pdbid != data_dict['PDB_ID']:
                continue
            # example chain strings: "A=27-139, B=140-150" or "A/B=27-150"
            pdbchains = comma_splitter(value['chains'])
            for chain in pdbchains:
                chids, resrange = chain.split('=')
                chids = [chid.strip() for chid in chids.split('/')]
                if data_dict['chain'] in chids:
                    resrange = resrange.split('-')
                    found = True
                    break
            if found:
                break

        if not found:
            no_info.append(i)
            continue

        # Separate name: the *header* flag is still needed after the loop.
        pdb_header = headers[i]
        dbrefs = pdb_header[data_dict['chain']].dbrefs
        matching = [n for n, dbref in enumerate(dbrefs)
                    if dbref.accession == data_dict['UniprotAcc']]
        if not matching:
            LOGGER.warn('Could not map domains in {0}'
                        .format(data_dict['PDB_ID'] + data_dict['chain']))
            no_info.append(i)
            continue
        right_dbref = dbrefs[matching[0]]

        chainStart = ag.select('chain {0}'.format(data_dict['chain'])
                               ).getResnums()[0]
        missing = chainStart - right_dbref.first[0]
        partStart = ag.getResindices()[np.where(
            ag.getResnums() == right_dbref.first[0] + missing)][0]
        pfStart, pfEnd = int(pfamRange[0]), int(pfamRange[1])
        uniStart = int(resrange[0])

        resiStart = pfStart - uniStart + partStart - missing
        resiEnd = pfEnd - uniStart + partStart - missing
        ags[i] = ag.select('resindex {0} to {1}'.format(
            resiStart, resiEnd))

    LOGGER.finish()

    # Indices were collected in increasing order; pop from the back so the
    # remaining ones stay valid.
    for i in reversed(no_info):
        ags.pop(i)
        if header:
            headers.pop(i)

    if data is None:
        pass
    elif isinstance(data, list):
        data.extend(data_dicts)
    else:
        LOGGER.warn('data should be a list in order to get output')

    return results
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file. Sequence queries must not contain gaps and must be at least
        16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database.

    Queries of at least ``MINSEQLEN`` characters are submitted as
    sequences to the HMMER web service and parsed from its tab separated
    download; each match then carries a single ``'locations'`` dictionary.
    Shorter queries are looked up by identifier and each match carries a
    list of location dictionaries.  **None** is returned when the server
    reports a system error for an identifier query.  :exc:`ValueError` is
    raised for invalid sequences or unparsable results and :exc:`IOError`
    when no result is obtained before *timeout*."""

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except Exception:
            # not an alignment file; treat the content as a bare sequence
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from '
                             + query)
    else:
        seq = ''.join(query.split())

    # cElementTree no longer exists in recent Python versions.
    import xml.etree.ElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))

    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request(
            'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        results_url = urllib2.urlopen(request).geturl()

        res_params = {'format': 'tsv'}
        enc_res_params = urllib.urlencode(res_params)
        modified_res_url = (results_url.replace('results', 'download')
                            + '?' + enc_res_params)
        result_request = urllib2.Request(modified_res_url)

        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        tsv = urllib2.urlopen(result_request).read()
        # The response is bytes on Python 3; it has to be text before it
        # can be split on '\n' and '\t'.
        if not isinstance(tsv, str):
            tsv = tsv.decode()

        lines = tsv.splitlines()
        keys = lines[0].split('\t')

        matches = {}
        for line in lines[1:]:
            if not line.strip():
                continue
            child = dict(zip(keys, line.split('\t')))

            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id] = {
                'accession': accession,
                'class': 'Domain',
                'id': child['Family id'],
                'locations': {
                    'ali_end': child['Ali. End'],
                    'ali_start': child['Ali. Start'],
                    'bitscore': child['Bit Score'],
                    'end': child['Env. End'],
                    'cond_evalue': child['Cond. E-value'],
                    'ind_evalue': child['Ind. E-value'],
                    'evidence': 'hmmer v3.0',
                    'hmm_end': child['Model End'],
                    'hmm_start': child['Model Start'],
                    'start': child['Env. Start'],
                },
                'type': 'Pfam-A',
            }
        return matches

    # Identifier queries only from here on.
    accession = None
    if len(seq) <= 5:
        idcode = None
        from prody import parsePDBHeader
        try:
            polymers = parsePDBHeader(seq[:4], 'polymers')
        except Exception as err:
            LOGGER.warn('failed to parse header for {0} ({1})'
                        .format(seq[:4], str(err)))
        else:
            chid = seq[4:].upper()
            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != 'UniProt':
                        continue
                    idcode = dbref.idcode
                    accession = dbref.accession
                    LOGGER.info('UniProt ID code {0} for {1} chain '
                                '{2} will be used.'
                                .format(idcode, seq[:4], poly.chid))
                    break
                if idcode is not None:
                    break
        if idcode is None:
            LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                        'parsed.'.format(repr(seq)))
            url = prefix + 'protein/' + seq + '?output=xml'
        else:
            url = prefix + 'protein/' + idcode + '?output=xml'
    else:
        url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            # best effort: keep retrying until the timeout expires
            pass
        else:
            # The payload is matched against byte strings further down,
            # so the job status markers are compared as bytes too.
            if xml not in (b'PEND', b'RUN'):
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif xml.find(b'No valid UniProt accession or ID') > 0:
        # Retry with the accession from the PDB header when there is one;
        # otherwise, or if that fails, search with the sequence parsed from
        # the structure.
        use_structure = accession is None
        if not use_structure:
            try:
                url = prefix + 'protein/' + accession + '?output=xml'
                xml = openURL(url, timeout=timeout).read()
            except Exception:
                use_structure = True
        if use_structure:
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq, **kwargs)
            except Exception:
                raise ValueError('No valid UniProt accession or ID for: '
                                 + seq)

    try:
        root = ET.XML(xml)
    except Exception:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    key = '{' + prefix + '}'
    results = dictElement(root[0], key)
    try:
        xml_matches = results['matches']
    except KeyError:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:
        try:
            family = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: '
                             + url)

        if not re.search(r'^P(F|B)[0-9]{5}$', family):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(family))

        match = matches.setdefault(family, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    label = 'Query ' + repr(query)
    if matches:
        LOGGER.info(label + ' matched {0} Pfam families.'
                    .format(len(matches)))
    else:
        LOGGER.info(label + ' did not match any Pfam families.')
    return matches
def renumber_InputAlign(alnfile, pdbid, refid, selection="protein",
                        outfile="renumbered.pdb", pdbfile="", newAA=None,
                        first=1):
    ''' Renumber input pdb using an existing multiple alignment.
    - alnfile: alignment in .fasta format. Beware of weird characters in
               the sequence ids, eg "|"
    - pdbid: sequence id in the alignment file that corresponds to the
             input structure. Must be the same number of residues
    - refid: sequence id corresponding to the reference sequence by which
             to renumber the pdbid sequence. pdbid mustn't align to any
             gaps in refid.
    - selection: atom selection(s) in the structure file to renumber.
                 Will iterate over comma separated selections to renumber
                 each.
    - pdbfile: original structure file (required, despite the empty
               default)
    - outfile: output structure file
    - newAA: comma separated list of unrepresented amino acids XXXYCA:
             XXX = three letter abbreviation as in pdbfile
             Y = one letter code in the alignment
             CA = atom to use as CA if different from "CA", eg C1 in PVL
                  of 1JEN
    - first: passed on to renumber_aln

    Every problem with the input is reported with print() and ends the
    program with exit status 1.
    '''
    selections = selection.split(",")
    modified_selections = []

    if not os.path.exists(alnfile):
        print("ERROR, no such alignment: %s"%alnfile)
        raise SystemExit(1)
    aln = AlignIO.read(alnfile, "fasta", alphabet=IUPAC.protein)

    aln_ids = [x.id for x in aln]
    if pdbid not in aln_ids or refid not in aln_ids:
        if pdbid not in aln_ids:
            print("ERROR, no such sequence to renumber: %s"%pdbid)
        if refid not in aln_ids:
            print("ERROR, no such sequence to renumber by: %s"%refid)
        raise SystemExit(1)

    pdbSeqRec = seqbyname(aln, pdbid)
    if not pdbSeqRec:
        print("ERROR, bad pdbid name")
        raise SystemExit(1)
    refSeqRec = seqbyname(aln, refid)
    if not refSeqRec:
        print("ERROR, bad refid name")
        raise SystemExit(1)

    # Without a structure file there is nothing to renumber; say so
    # instead of failing later on an unbound name.
    if pdbfile == '':
        print("ERROR, no pdb file given")
        raise SystemExit(1)
    if not os.path.exists(pdbfile):
        print("ERROR, no such pdb file: %s"%pdbfile)
        raise SystemExit(1)
    structure = parsePDB(pdbfile)
    updateAA(structure, newAA)

    renumber_aln(aln, refid, pdbid, first)
    for polymer in selections:
        currentSel = structure.select(
            "not hetero and protein and name CA and %s"%polymer)
        if currentSel:
            renumber_struct(structure, pdbSeqRec, polymer)
            modified_selections.append(polymer)
        else:
            print('ERROR: Selection \"%s\" has zero CA atoms'%polymer)

    if writePDB(outfile, structure):
        print("Wrote renumbered %s selections from %s to %s"
              %(str(modified_selections), pdbfile, outfile))
def renumber_noInputAlign(pdbfile, refseqfile, selection="protein",
                          outfile="renumbered.pdb", newAA=None, first=1):
    ''' Renumber pdb file (pdbfile) according to reference sequence in
    refseqfile. Pdb sequence is extracted and aligned with reference
    sequence using needle from EMBOSS.
    - refseqfile: .fasta file containing the reference sequence by which
                  to renumber
    - selection: atom selection(s) in the structure file to renumber.
                 Either a string of comma separated selections or an
                 iterable of selection strings; each one is renumbered
                 in turn.
    - pdbfile: original structure file; its name must contain a
               "name.ext" part, which is used as the sequence id
    - outfile: output structure file
    - newAA: comma separated list of unrepresented amino acids XXXYCA:
             XXX = three letter abbreviation as in pdbfile
             Y = one letter code in the alignment
             CA = atom to use as CA if different from "CA", eg C1 in PVL
                  of 1JEN
    - first: passed on to renumber_aln

    Missing input files are reported with print() and end the program
    with exit status 1.  ValueError is raised when no sequence id can be
    derived from pdbfile.
    '''
    # A plain string used to be iterated character by character; split it
    # on commas as documented and accept ready-made sequences unchanged.
    if isinstance(selection, str):
        selections = selection.split(",")
    else:
        selections = list(selection)

    tmp = tempfile.gettempdir()
    tmp_refseqfile = "%s/refseq.fasta"%tmp
    id_match = re.search(r"\w+\.\w+", pdbfile)
    if id_match is None:
        raise ValueError("could not derive a sequence id from pdb file "
                         "name: %s"%pdbfile)
    pdbID = id_match.group(0)
    tmp_pdbseqfile = "%s/%s.fasta"%(tmp, pdbID)
    tmp_needle = "%s/needle.out"%tmp

    # Check both inputs before anything is written to the temp directory.
    if not os.path.exists(refseqfile):
        print ("ERROR, no such file: %s"%refseqfile)
        raise SystemExit(1)
    if not os.path.exists(pdbfile):
        print ("ERROR, no such file: %s"%pdbfile)
        raise SystemExit(1)

    refseqRec = SeqIO.read(refseqfile, "fasta", alphabet=IUPAC.protein)
    refseqRec.id = "refseq"
    SeqIO.write(refseqRec, tmp_refseqfile, "fasta")

    try:
        structure = parsePDB("%s"%pdbfile)
        updateAA(structure, newAA)

        modified_selections = []
        for polymer in selections:
            currentSel = structure.select(
                "protein and name CA and %s"%polymer)
            if not currentSel:
                print ('ERROR: Selection \"%s\" has zero CA atoms'%polymer)
                continue

            pdbseq_str = ''.join([oneletter[i]
                                  for i in currentSel.getResnames()])
            pdbseqRec = SeqRecord(Seq(pdbseq_str, IUPAC.protein), id=pdbID)
            SeqIO.write(pdbseqRec, tmp_pdbseqfile, "fasta")

            needle_cli = NeedleCommandline(asequence=tmp_pdbseqfile,
                                           bsequence=tmp_refseqfile,
                                           gapopen=10, gapextend=0.5,
                                           outfile=tmp_needle)
            needle_cli()
            aln = AlignIO.read(tmp_needle, "emboss",
                               alphabet=IUPAC.protein)

            gpdb.renumber_aln(aln, "refseq", pdbID, first)
            pdbRenSeq = gpdb.seqbyname(aln, pdbID)
            gpdb.renumber_struct(structure, pdbRenSeq, polymer)
            # seems to be the only way to store per-residue annotations
            pdbRenSeq.annotations["resnum"] = str(
                pdbRenSeq.letter_annotations["resnum"])
            modified_selections.append(polymer)

        if writePDB(outfile, structure):
            print ("Wrote renumbered %s selections from %s to %s"%
                   (str(modified_selections), pdbfile, outfile))
    finally:
        # Remove the temporary reference copy even when a step above fails.
        os.remove(tmp_refseqfile)
def alignPDBEnsemble(ensemble, suffix='_aligned', outdir='.', gzip=False):
    """Align PDB files using transformations from *ensemble*, which may be
    a :class:`.PDBEnsemble` or a :class:`.PDBConformation` instance. Label of
    the conformation (see :meth:`~.PDBConformation.getLabel`) will be used to
    determine the PDB structure and model number.  First four characters of
    the label is expected to be the PDB identifier and ending numbers to be the
    model number.  For example, the :class:`.Transformation` from conformation
    with label *2k39_ca_selection_'resnum_<_71'_m116* will be applied to 116th
    model of structure **2k39**.  After applicable transformations are made,
    structure will be written into *outdir* as :file:`2k39_aligned.pdb`.
    If *gzip* is **True**, output files will be compressed.  Return value is
    the output filename or list of filenames, in the order files are processed.
    Note that if multiple models from a file are aligned, that filename will
    appear in the list multiple times.

    **None** takes the place of a filename when the PDB file is not found
    or the model number is out of range.  :exc:`TypeError` is raised for an
    unsupported *ensemble* and :exc:`ValueError` when a conformation has
    no transformation."""

    if not isinstance(ensemble, (PDBEnsemble, PDBConformation)):
        raise TypeError('ensemble must be a PDBEnsemble or PDBConformation')
    if isinstance(ensemble, PDBConformation):
        ensemble = [ensemble]

    # Keep the boolean flag intact and derive the file ending separately.
    ending = '.pdb.gz' if gzip else '.pdb'

    output = []
    # Structures with several coordinate sets wait here until every
    # requested model has been transformed, then get written once.
    pdbdict = {}
    for conf in ensemble:
        trans = conf.getTransformation()
        if trans is None:
            raise ValueError('transformations are not calculated, call '
                             '`superpose` or `iterpose`')
        label = conf.getLabel()

        pdb = label[:4]
        # Only fetch when the structure is not cached; the default
        # argument of dict.get() would be evaluated every time.
        try:
            filename = pdbdict[pdb]
        except KeyError:
            filename = fetchPDB(pdb)
        if filename is None:
            LOGGER.warning('PDB file for conformation {0} is not found.'
                           .format(label))
            output.append(None)
            continue
        LOGGER.info('Parsing PDB file {0} for conformation {1}.'
                    .format(pdb, label))

        acsi = None
        model = None
        position = label.rfind('m')
        if position > 3:
            trailing = label[position + 1:]
            if trailing.isdigit():
                model = trailing
                acsi = int(trailing) - 1
                LOGGER.info('Applying transformation to model {0}.'
                            .format(model))

        ag = parsePDB(filename) if isinstance(filename, str) else filename

        if acsi is not None:
            # "m0" would give index -1; model numbers start at 1.
            if not 0 <= acsi < ag.numCoordsets():
                LOGGER.warn('Model number {0} for {1} is out of range.'
                            .format(model, pdb))
                output.append(None)
                continue
            ag.setACSIndex(acsi)

        trans.apply(ag)
        outfn = os.path.join(outdir, pdb + suffix + ending)
        if ag.numCoordsets() > 1:
            pdbdict[pdb] = ag
        else:
            writePDB(outfn, ag)
        output.append(os.path.normpath(outfn))

    for pdb, ag in pdbdict.items():  # PY3K: OK
        writePDB(os.path.join(outdir, pdb + suffix + ending), ag)

    if len(output) == 1:
        return output[0]
    return output
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file. Sequence queries must not contain gaps and must be at least
        16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database.

    For sequence queries each match carries a ``'locations'`` dictionary built
    from the HMMER tab-separated results.  For identifier queries
    ``'locations'`` is a list of dictionaries built from the XML results.
    **None** is returned when the server answers an identifier query with a
    system error page.

    :raises ValueError: when the query cannot be read as a sequence, or the
        results cannot be retrieved or parsed
    :raises IOError: when an identifier search yields no response before
        *timeout*"""

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except Exception:
            # MSAFile could not read it, use the whitespace-stripped content
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    # xml.etree.cElementTree does not exist in Python 3.9 and later
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET

    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))

    if len(seq) >= MINSEQLEN:
        # sequence query: submit to HMMER and parse the tab-separated download
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request(
            'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        results_url = urllib2.urlopen(request).geturl()

        res_params = {'format': 'tsv'}
        enc_res_params = urllib.urlencode(res_params)
        modified_res_url = results_url.replace(
            'results', 'download') + '?' + enc_res_params

        result_request = urllib2.Request(modified_res_url)
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(
            seq[:MINSEQLEN]))

        try:
            tsv = urllib2.urlopen(result_request).read()
        except Exception:
            raise ValueError('No matching Pfam domains were found.')

        if PY3K:
            tsv = tsv.decode()

        lines = tsv.split('\n')
        keys = lines[0].split('\t')
        matches = {}
        # first line is the column header, the last line is skipped as well
        for line in lines[1:-1]:
            child = dict(zip(keys, line.split('\t')))
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id] = {
                'accession': accession,
                'class': 'Domain',
                'id': child['Family id'],
                'locations': {
                    'ali_end': child['Ali. End'],
                    'ali_start': child['Ali. Start'],
                    'bitscore': child['Bit Score'],
                    'end': child['Env. End'],
                    'cond_evalue': child['Cond. E-value'],
                    'ind_evalue': child['Ind. E-value'],
                    'evidence': 'hmmer v3.0',
                    'hmm_end': child['Model End'],
                    'hmm_start': child['Model Start'],
                    'start': child['Env. Start'],
                },
                'type': 'Pfam-A',
            }
        return matches

    # identifier query: everything below handles queries shorter than
    # MINSEQLEN, since the sequence case has returned above
    accession = None
    if len(seq) <= 5:
        # PDB identifier, optionally followed by a chain identifier
        idcode = None
        from prody import parsePDBHeader
        try:
            polymers = parsePDBHeader(seq[:4], 'polymers')
        except Exception as err:
            LOGGER.warn('failed to parse header for {0} ({1})'.format(
                seq[:4], str(err)))
        else:
            chid = seq[4:].upper()
            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != 'UniProt':
                        continue
                    idcode = dbref.idcode
                    accession = dbref.accession
                    LOGGER.info('UniProt ID code {0} for {1} chain '
                                '{2} will be used.'.format(
                                    idcode, seq[:4], poly.chid))
                    break
                if idcode is not None:
                    break
        if idcode is None:
            LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                        'parsed.'.format(repr(seq)))
            url = prefix + 'protein/' + seq + '?output=xml'
        else:
            url = prefix + 'protein/' + idcode + '?output=xml'
    else:
        url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml not in ['PEND', 'RUN']:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif xml.find(b'No valid UniProt accession or ID') > 0:
        # retry with the UniProt accession when one was read from the header
        retried = False
        if accession is not None:
            try:
                url = prefix + 'protein/' + accession + '?output=xml'
                xml = openURL(url, timeout=timeout).read()
            except Exception:
                pass
            else:
                retried = True
        if not retried:
            # last resort: search with the sequence of the structure itself
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq)
            except Exception:
                raise ValueError('No valid UniProt accession or ID for: ' +
                                 seq)

    try:
        root = ET.XML(xml)
    except Exception:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    key = '{' + prefix + '}'
    results = dictElement(root[0], key)
    try:
        xml_matches = results['matches']
    except KeyError:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:
        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    query = 'Query ' + repr(query)
    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
def parsePfamPDBs(query, data=None, **kwargs):
    """Returns a list of AtomGroups containing sections of chains that
    correspond to a particular PFAM domain family. These are defined by
    alignment start and end residue numbers.

    :arg query: Pfam accession (more than four characters starting with
        ``'PF'``), UniProt ID or PDB ID.  For the latter two the Pfam family
        is picked from the :func:`searchPfam` matches using *start* or *end*.
    :type query: str

    :arg data: If a list is given, the rows of the Pfam mapping table that
        match the family are appended to it.
    :type data: list

    :keyword start: Residue number for defining the start of the domain.
        The PFAM domain that starts closest to this will be selected.
        Default is **1**
    :type start: int

    :keyword end: Residue number for defining the end of the domain.
        The PFAM domain that ends closest to this will be selected.
        It is only consulted when *start* is not an integer.
    :type end: int

    :keyword header: when true, a tuple of (atom groups, headers) is returned
    :type header: bool

    Remaining keyword arguments are passed on to :func:`.parsePDB`.  When
    *model* is ``0`` the :func:`.parsePDB` result is returned unchanged.
    Structures that cannot be mapped onto the domain are left out of the
    result.

    :raises ValueError: when neither *start* nor *end* is an integer for a
        UniProt ID or PDB ID query
    """

    start = kwargs.pop('start', 1)
    end = kwargs.pop('end', None)

    if len(query) > 4 and query.startswith('PF'):
        pfam_acc = query
    else:
        pfam_matches = searchPfam(query)
        keys = list(pfam_matches.keys())

        if isinstance(start, Integral):
            field, target = 'start', start
        elif isinstance(end, Integral):
            field, target = 'end', end
        else:
            raise ValueError('Please provide an integer for start or end '
                             'when using a UniProt ID or PDB ID.')

        offsets = np.array([
            int(pfam_matches[key]['locations'][0][field]) - target
            for key in keys
        ])
        # first family whose boundary is closest to the requested residue
        pfam_acc = keys[int(np.argmin(np.abs(offsets)))]

    from ftplib import FTP
    from .uniprot import queryUniprot

    data_stream = BytesIO()
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    ftp.login()
    ftp.cwd('pub/databases/Pfam/current_release')
    ftp.retrbinary('RETR pdbmap.gz', data_stream.write)
    ftp.quit()
    zip_data = data_stream.getvalue()
    data_stream.close()

    rawdata = gunzip(zip_data)
    if PY3K:
        rawdata = rawdata.decode()

    fields = [
        'PDB_ID', 'chain', 'nothing', 'PFAM_Name', 'PFAM_ACC', 'UniprotAcc',
        'UniprotResnumRange'
    ]

    data_dicts = []
    for line in rawdata.split('\n'):
        if pfam_acc in line:
            entries = [entry.strip(';') for entry in line.strip().split('\t')]
            data_dicts.append(dict(zip(fields, entries)))

    pdb_ids = [data_dict['PDB_ID'] for data_dict in data_dicts]
    chains = [data_dict['chain'] for data_dict in data_dicts]

    header = kwargs.pop('header', False)
    model = kwargs.get('model', None)
    # headers are always parsed because they are needed for the mapping
    results = parsePDB(*pdb_ids, chain=chains, header=True, **kwargs)

    ags, headers = results
    ags, headers = list(ags), list(headers)

    if model == 0:
        LOGGER.info('only header is requested and returned')
        return results

    # the lists are edited in place below, so results sees the changes
    if header:
        results = (ags, headers)
    else:
        results = ags

    LOGGER.progress('Extracting Pfam domains...', len(ags))
    comma_splitter = re.compile(r'\s*,\s*').split
    no_info = []  # indices, in increasing order, of structures to drop
    for i, ag in enumerate(ags):
        LOGGER.update(i)
        data_dict = data_dicts[i]
        pfamRange = data_dict['UniprotResnumRange'].split('-')
        uniprotAcc = data_dict['UniprotAcc']
        try:
            uniData = queryUniprot(uniprotAcc)
        except Exception:
            LOGGER.warn('No Uniprot record found for {0}'.format(
                data_dict['PDB_ID']))
            # without the record the domain cannot be located in this chain
            no_info.append(i)
            continue

        resrange = None
        found = False
        for key, value in uniData.items():
            if not key.startswith('dbReference'):
                continue
            try:
                pdbid = value['PDB']
            except Exception:
                continue
            if pdbid != data_dict['PDB_ID']:
                continue
            pdbchains = value['chains']

            # example chain strings: "A=27-139, B=140-150" or "A/B=27-150"
            pdbchains = comma_splitter(pdbchains)
            for chain in pdbchains:
                chids, resrange = chain.split('=')
                chids = [chid.strip() for chid in chids.split('/')]
                if data_dict['chain'] in chids:
                    resrange = resrange.split('-')
                    found = True
                    break
            if found:
                break

        if not found:
            no_info.append(i)
            continue

        pdb_header = headers[i]
        chain_dbrefs = pdb_header[data_dict['chain']].dbrefs
        chain_accessions = [dbref.accession for dbref in chain_dbrefs]
        try:
            right_part = chain_accessions.index(data_dict['UniprotAcc'])
        except ValueError:
            LOGGER.warn('Could not map domains in {0}'.format(
                data_dict['PDB_ID'] + data_dict['chain']))
            no_info.append(i)
            continue

        right_dbref = chain_dbrefs[right_part]
        chainStart = ag.select('chain {0}'.format(
            data_dict['chain'])).getResnums()[0]
        missing = chainStart - right_dbref.first[0]
        partStart = ag.getResindices()[np.where(
            ag.getResnums() == right_dbref.first[0] + missing)][0]
        pfStart, pfEnd = int(pfamRange[0]), int(pfamRange[1])
        uniStart = int(resrange[0])

        resiStart = pfStart - uniStart + partStart - missing
        resiEnd = pfEnd - uniStart + partStart - missing
        ags[i] = ag.select('resindex {0} to {1}'.format(resiStart, resiEnd))

    LOGGER.finish()

    # pop from the end so that the remaining indices stay valid
    for i in reversed(no_info):
        ags.pop(i)
        if header:
            headers.pop(i)

    if isinstance(data, list):
        data.extend(data_dicts)
    elif data is not None:
        LOGGER.warn('data should be a list in order to get output')

    return results
def scanPockets(self):
    """Generates ESSA z-scores for pockets and parses pocket features. It
    requires both Fpocket 3.0 and Pandas being installed in your system.

    The heavy atoms are written to :file:`{title}_pro.pdb` and ``fpocket`` is
    run on that file unless the :file:`{title}_pro_out` directory exists
    already.  Pocket features are stored in ``self._df`` and pocket z-scores
    in ``self._df_zs``.  Nothing is stored and **None** is returned when
    Fpocket or Pandas is missing, or when no pocket files are found."""

    from re import findall

    fpocket = which('fpocket')

    if fpocket is None:
        LOGGER.warning(
            'Fpocket (version >= 3.0) was not found, please install it.')
        return None

    try:
        from pandas import Index, DataFrame
    except ImportError as ie:
        LOGGER.warning(ie.__str__() + ' was found, please install it.')
        return None

    # (chain id, residue number) -> index used to look up self._zscore
    rcr = {(i, j): k if self._rib else self._ri[k]
           for i, j, k in zip(self._ca.getChids(),
                              self._ca.getResnums(),
                              self._ca.getResindices())}

    writePDB('{}_pro'.format(self._title), self._heavy)

    direc = '{}_pro_out'.format(self._title)
    if not isdir(direc):
        system('fpocket -f {}_pro.pdb'.format(self._title))

    chdir(direc + '/pockets')
    try:
        pocket_files = [name for name in listdir('.')
                        if name.endswith('.pdb')]
        # sort on the number that follows the first six characters
        pocket_files.sort(key=lambda name: int(name.partition('_')[0][6:]))
        if not pocket_files:
            LOGGER.warning('No pocket files were found in {0}.'.format(direc))
            return None

        ps = []
        for name in pocket_files:
            with open(name, 'r') as f:
                text = f.read()
            rows = [(hit[1].strip(), float(hit[2])) for hit in findall(
                r'(\w+\s\w+\s*-\s*)(.+):\s*([\d.-]+)(\n)', text)]
            # feature names of the last file are used for the column labels
            fea, sco = list(zip(*rows))
            ps.append(sco)
        pdbs = parsePDB(pocket_files)
    finally:
        # go back to where we started even when parsing fails
        chdir('../..')

    ps = array(ps)

    # pocket number -> set of (chain id, residue number) lining the pocket
    pcn = {
        int(pdb.getTitle().partition('_')[0][6:]):
        set(zip(pdb.getChids().tolist(), pdb.getResnums().tolist()))
        for pdb in pdbs
    }
    pi = {p: [rcr[x] for x in crn] for p, crn in pcn.items()}

    pzs_max = {k: max(self._zscore[v]) for k, v in pi.items()}
    pzs_med = {k: median(self._zscore[v]) for k, v in pi.items()}

    indices = Index(range(1, ps.shape[0] + 1), name='Pocket #')
    columns = Index(fea, name='Feature')
    self._df = DataFrame(index=indices, columns=columns, data=ps)

    columns_zs = Index(['ESSA_max', 'ESSA_med', 'LHD'], name='Z-score')
    # explicit pocket order keeps rows aligned with the index on any Python
    order = sorted(pi)
    zps = c_[[pzs_max[k] for k in order]]
    zps = hstack((zps, c_[[pzs_med[k] for k in order]]))
    zps = hstack(
        (zps, zscore(self._df[['Local hydrophobic density Score']])))

    self._df_zs = DataFrame(index=indices, columns=columns_zs, data=zps)
def writePerturbResponsePDB(prs_matrix, pdbIn=None, **kwargs):
    """Write the average response to perturbation of a particular residue
    (a row of a perturbation response matrix) or the average effect of
    perturbation of a particular residue (a column of a normalized
    perturbation response matrix) into the b-factor field of a PDB file for
    visualisation in a molecular graphics program.
    If no chain is given this will be done for that residue in all chains.

    If no residue number is given then the effectiveness and sensitivity
    profiles will be written out instead. These two profiles are also returned
    as arrays for further analysis if they aren't already provided.

    :arg prs_matrix: a perturbation response matrix
        or a :class:`.AtomGroup` object with a PRS matrix associated as data
    :type prs_matrix: array or :class:`.AtomGroup`

    :arg pdbIn: file name for the input PDB file where you would like the PRS
        data mapped
    :type pdbIn: str

    :arg pdbOut: a list of file names (enclosed in square brackets) for the
        output PDB file, default is to append the chain and residue info (name
        and number) onto the pdbIn stem.  The input for pdbOut can also be
        used as a stem if you enter a single string enclosed in quotes.
        If no residue number is supplied, chain is ignored and the default is
        to append '_effectiveness' and '_sensitivity' onto the stem.
    :type pdbOut: list

    :arg chain: chain identifier for the residue of interest, default is all
        chains.  If you want to analyse residues in a subset of chains,
        concatenate them together e.g. 'AC'
    :type chain: str

    :arg resnum: residue number for the residue of interest
    :type resnum: int

    :arg direction: the direction you want to use to read data out of the PRS
        matrix for plotting: the options are 'effect' or 'response'.
        Default is 'effect'.
        A row gives the effect on each residue of perturbing the specified
        residue.
        A column gives the response of the specified residue to perturbing
        each residue.
        If no residue number is provided then this option will be ignored
    :type direction: str

    :arg returnData: whether to return the structure, effectiveness and
        sensitivity for analysis, default is False
    :type returnData: bool

    :arg effectiveness: effectiveness profile
    :type effectiveness: array

    :arg sensitivity: sensitivity profile
    :type sensitivity: array

    :raises TypeError: when no PRS matrix can be obtained from *prs_matrix*
    :raises PRSMatrixParseError: when *pdbIn* cannot be read or a requested
        chain is not in the structure
    """

    if not isinstance(prs_matrix, np.ndarray):
        try:
            prs_matrix = prs_matrix.getData('prs_matrix')
        except Exception:
            raise TypeError(
                'Please provide a valid PRS matrix in numpy ndarray format.')

    try:
        with open(pdbIn, 'r') as fi:
            lines = fi.readlines()
    except Exception:
        raise PRSMatrixParseError(
            'Please provide a valid file name for the input PDB.')

    chain = kwargs.get('chain', None)

    structure = parsePDB(pdbIn, subset='ca')
    structure.setData('prs_matrix', prs_matrix)

    hv = structure.getHierView()
    chains = np.array([chainAg.getChids()[0] for chainAg in hv])
    if chain is None:
        chain = ''.join(chains)

    resnum = kwargs.get('resnum', None)
    pdbOut = kwargs.get('pdbOut', None)
    # the stem always has a value, also when pdbOut is a list of names
    out_stem = pdbIn.split('.')[0]
    if type(pdbOut) is str:
        out_stem = pdbOut.split('.')[0]
        pdbOut = None

    blank = '0.00'.rjust(6)  # b-factor written for HETATM records

    def _select(chid, number):
        """Return the C-alpha selection for a chain and residue number."""
        return structure.select(
            'chain {0} and resnum {1}'.format(chid, number))

    if resnum is None:
        effectiveness = kwargs.get('effectiveness', None)
        sensitivity = kwargs.get('sensitivity', None)
        if effectiveness is None or sensitivity is None:
            effectiveness, sensitivity = calcPerturbResponseProfiles(
                prs_matrix)

        structure.setData('effectiveness', effectiveness)
        structure.setData('sensitivity', sensitivity)
        max_effs = np.max(structure.getData('effectiveness'))
        max_sens = np.max(structure.getData('sensitivity'))

        file_effs_name = '{0}_effectiveness.pdb'.format(out_stem)
        file_sens_name = '{0}_sensitivity.pdb'.format(out_stem)

        with open(file_effs_name, 'w') as fileEffs, \
                open(file_sens_name, 'w') as fileSens:
            for line in lines:
                if line.startswith('ATOM'):
                    residue = _select(line[21], line[22:26])
                    effs = float(residue.getData('effectiveness'))
                    sens = float(residue.getData('sensitivity'))
                    fileEffs.write(line[:60] + '{:6.2f}'.format(
                        effs * 100 / max_effs) + line[66:])
                    fileSens.write(line[:60] + '{:6.2f}'.format(
                        sens * 100 / max_sens) + line[66:])
                elif line.startswith('HETATM'):
                    fileEffs.write(line[:60] + blank + line[66:])
                    fileSens.write(line[:60] + blank + line[66:])
                elif not line.startswith('ANISOU'):
                    # ANISOU records are left out of the output
                    fileEffs.write(line)
                    fileSens.write(line)

        LOGGER.info('The effectiveness and sensitivity profiles were written'
                    ' to {0} and {1}.'.format(file_effs_name, file_sens_name))

        returnData = kwargs.get('returnData', False)
        if returnData:
            return structure, effectiveness, sensitivity
        else:
            return

    direction = kwargs.get('direction', 'effect')
    for c in chain:
        if c not in chains:
            raise PRSMatrixParseError('Chain {0} was not found in {1}'.format(
                c, pdbIn))

    if pdbOut is None:
        pdbOut = []
        for c in chain:
            pdbOut.append('{0}_{1}_{2}{3}_{4}.pdb'.format(
                out_stem, c, str(_select(c, resnum).getResnames()),
                resnum, direction))

    matrix = structure.getData('prs_matrix')
    # one output file per chain, pdbOut[n] belongs to chain[n]
    for n, c in enumerate(chain):
        target = _select(c, resnum).getResindices()
        with open(pdbOut[n], 'w') as fo:
            for line in lines:
                if line.startswith('ATOM'):
                    other = _select(line[21], line[22:26]).getResindices()
                    if direction == 'effect':
                        value = matrix[target, other]
                    else:
                        value = matrix[other, target]
                    fo.write(line[:60] + '{:6.2f}'.format(
                        float(value) * 100) + line[66:])
                elif line.startswith('HETATM'):
                    fo.write(line[:60] + blank + line[66:])
                elif not line.startswith('ANISOU'):
                    # ANISOU records are left out of the output
                    fo.write(line)

    LOGGER.info('Perturbation responses for specific residues were written'
                ' to {0}.'.format(', '.join(pdbOut)))
def writePerturbResponsePDB(prs_matrix, pdbIn, **kwargs):
    """Write the average response to perturbation of a particular residue
    (a row of a perturbation response matrix) or the average effect of
    perturbation of a particular residue (a column of a normalized
    perturbation response matrix) into the b-factor field of a PDB file for
    visualisation in a molecular graphics program.
    If no chain is given this will be done for that residue in all chains.

    If no residue number is given then the effectiveness and sensitivity
    profiles will be written out instead. These two profiles are also returned
    as arrays for further analysis if they aren't already provided.

    :arg prs_matrix: a perturbation response matrix
    :type prs_matrix: ndarray

    :arg pdbIn: file name for the input PDB file where you would like the PRS
        data mapped
    :type pdbIn: str

    :arg pdbOut: a list of file names (enclosed in square brackets) for the
        output PDB file, default is to append the chain and residue info (name
        and number) onto the pdbIn stem.  A list is indexed by the position
        of the chain in *chain*.  The input for pdbOut can also be used as a
        stem if you enter a single string enclosed in quotes.
        If no residue number is supplied, chain is ignored and the default is
        to append '_effectiveness' and '_sensitivity' onto the stem.
    :type pdbOut: list

    :arg chain: chain identifier for the residue of interest, default is all
        chains.  If you want to analyse residues in a subset of chains,
        concatenate them together e.g. 'AC'
    :type chain: str

    :arg resnum: residue number for the residue of interest
    :type resnum: int

    :arg direction: the direction you want to use to read data out of the PRS
        matrix for plotting: the options are 'row' or 'column'.
        Default is 'row'.
        A row gives the effect on each residue of perturbing the specified
        residue.
        A column gives the response of the specified residue to perturbing
        each residue.
        If no residue number is provided then this option will be ignored
    :type direction: str

    :arg returnData: whether to return effectiveness and sensitivity for
        analysis, default is False
    :type returnData: bool

    :arg effectiveness: effectiveness profile, computed when not given
    :arg sensitivity: sensitivity profile, computed when not given

    Chains that have no residue numbered *resnum* are reported and skipped.

    :raises TypeError: when *prs_matrix* is not a numpy array
    :raises PRSMatrixParseError: when *pdbIn* cannot be read or a requested
        chain is not in the structure
    """

    if not isinstance(prs_matrix, np.ndarray):
        raise TypeError(
            'Please provide a valid PRS matrix in numpy ndarray format.')

    try:
        with open(pdbIn, 'r') as fi:
            lines = fi.readlines()
    except Exception:
        raise PRSMatrixParseError(
            'Please provide a valid file name for the input PDB.')

    chain = kwargs.get('chain', None)

    structure = parsePDB(pdbIn).calpha
    hv = structure.getHierView()
    chains = np.array([chainAg.getChids()[0] for chainAg in hv])
    if chain is None:
        chain = ''.join(chains)

    resnum = kwargs.get('resnum', None)
    pdbOut = kwargs.get('pdbOut', None)
    # the stem always has a value, also when pdbOut is a list of names
    out_stem = pdbIn.split('.')[0]
    if type(pdbOut) is str:
        out_stem = pdbOut.split('.')[0]
        pdbOut = None

    resnums = structure.getResnums()
    chids = structure.getChids()
    blank = '0.00'.rjust(6)  # b-factor written for HETATM records

    def _positions(chid, number):
        """Return positions in *structure* of a chain and residue number."""
        return np.where((resnums == number) & (chids == chid))[0]

    def _line_position(line):
        """Return the position in *structure* of the residue of a record."""
        return _positions(line[21], int(line[22:26]))[0]

    if resnum is None:
        effectiveness = kwargs.get('effectiveness')
        sensitivity = kwargs.get('sensitivity')
        if effectiveness is None or sensitivity is None:
            effectiveness, sensitivity = calcPerturbResponseProfiles(
                prs_matrix)
        max_effs = np.max(effectiveness)
        max_sens = np.max(sensitivity)

        file_effs_name = '{0}_effectiveness.pdb'.format(out_stem)
        file_sens_name = '{0}_sensitivity.pdb'.format(out_stem)

        with open(file_effs_name, 'w') as fileEffs, \
                open(file_sens_name, 'w') as fileSens:
            for line in lines:
                if line.startswith('ATOM'):
                    j = _line_position(line)
                    fileEffs.write(line[:60] + '{:6.2f}'.format(
                        effectiveness[j] * 100 / max_effs) + line[66:])
                    fileSens.write(line[:60] + '{:6.2f}'.format(
                        sensitivity[j] * 100 / max_sens) + line[66:])
                elif line.startswith('HETATM'):
                    fileEffs.write(line[:60] + blank + line[66:])
                    fileSens.write(line[:60] + blank + line[66:])
                elif not line.startswith('ANISOU'):
                    # ANISOU records are left out of the output
                    fileEffs.write(line)
                    fileSens.write(line)

        LOGGER.info('The effectiveness and sensitivity profiles were written'
                    ' to {0} and {1}.'.format(file_effs_name, file_sens_name))

        returnData = kwargs.get('returnData', False)
        if returnData:
            return effectiveness, sensitivity
        else:
            return

    direction = kwargs.get('direction', 'row')

    # (position in chain, chain id, matrix index) of every residue to write
    targets = []
    for n, c in enumerate(chain):
        if c not in chains:
            raise PRSMatrixParseError('Chain {0} was not found in {1}'.format(
                c, pdbIn))
        hits = _positions(c, resnum)
        if len(hits) == 0:
            LOGGER.info('A residue with number {0} was not found'
                        ' in chain {1}. Continuing to next chain.'
                        .format(resnum, c))
            continue
        targets.append((n, c, hits[0]))

    max_prs = np.max(prs_matrix)
    written = []
    for n, c, i in targets:
        if pdbOut is None:
            outname = '{0}_{1}_{2}{3}_{4}.pdb'.format(
                out_stem, c, structure.getResnames()[i], resnum, direction)
        else:
            outname = pdbOut[n]

        with open(outname, 'w') as fo:
            for line in lines:
                if line.startswith('ATOM'):
                    j = _line_position(line)
                    if direction == 'row':
                        value = prs_matrix[i][j]
                    else:
                        value = prs_matrix[j][i]
                    fo.write(line[:60] + '{:6.2f}'.format(
                        value * 100 / max_prs) + line[66:])
                elif line.startswith('HETATM'):
                    fo.write(line[:60] + blank + line[66:])
                elif not line.startswith('ANISOU'):
                    # ANISOU records are left out of the output
                    fo.write(line)
        written.append(outname)

    LOGGER.info('Perturbation responses for specific residues were written'
                ' to {0}.'.format(', '.join(written)))
    return
def fetchPfamPDBs(**kwargs):
    """Returns the structures listed for a PFAM domain family in the Pfam
    PDB mapping table, as parsed by :func:`.parsePDB`.  Each PDB identifier
    is parsed once, in the order of first appearance in the table.

    :arg pfam_acc: The accession number for a pfam domain family, if known.
        Alternatively you can select a family based on a query (see below).
    :type pfam_acc: str

    :arg query: UniProt ID or PDB ID, passed to :func:`searchPfam` to find
        the family when *pfam_acc* is not given.
    :type query: str

    You must provide one of these two arguments.
    Use of query requires start or end to also be provided.

    :arg start: Residue number for defining the start of the domain.
        The PFAM domain that starts closest to this will be selected.
    :type start: int

    :arg end: Residue number for defining the end of the domain.
        The PFAM domain that ends closest to this will be selected.
        It is only consulted when *start* is not an integer.
    :type end: int

    :arg return_data: Whether to also return the matching rows of the Pfam
        mapping table (a list of dictionaries), default is False
    :type return_data: bool

    Remaining keyword arguments are passed on to :func:`.parsePDB`.

    :raises ValueError: when neither *pfam_acc* nor *query* is given, or when
        *query* is used without an integer *start* or *end*
    """

    from ftplib import FTP
    from numbers import Integral

    pfam_acc = kwargs.pop('pfam_acc', None)
    query = kwargs.pop('query', None)
    start = kwargs.pop('start', None)
    end = kwargs.pop('end', None)
    return_data = kwargs.pop('return_data', False)

    if pfam_acc is None:
        if query is None:
            raise ValueError('Please provide a value for pfam_acc or query.')

        pfam_matches = searchPfam(query)
        # a list is needed because dictionary views cannot be indexed
        keys = list(pfam_matches)

        if isinstance(start, Integral):
            field, target = 'start', start
        elif isinstance(end, Integral):
            field, target = 'end', end
        else:
            raise ValueError(
                'Please provide an integer for start or end when using query.'
            )

        offsets = np.array([
            int(pfam_matches[key]['locations'][0][field]) - target
            for key in keys
        ])
        # first family whose boundary is closest to the requested residue
        pfam_acc = keys[int(np.argmin(np.abs(offsets)))]

    data = []
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    try:
        ftp.login('')
        ftp.cwd('pub/databases/Pfam/mappings')
        ftp.retrlines('RETR pdb_pfam_mapping.txt', data.append)
    finally:
        # release the connection whether or not the download succeeded
        ftp.close()

    # first line of the table holds the column names
    fields = data[0].strip().split('\t')

    data_dict = []
    for line in data[1:]:
        if pfam_acc in line:
            data_dict.append(dict(zip(fields, line.strip().split('\t'))))

    pdb_ids = []
    seen = set()
    for entry in data_dict:
        pdb_id = entry['PDB_ID']
        if pdb_id not in seen:
            seen.add(pdb_id)
            pdb_ids.append(pdb_id)

    result = parsePDB(*pdb_ids, **kwargs)

    if return_data:
        return data_dict, result
    else:
        return result