# Module-level dependencies assumed from the surrounding ProDy source
# (prody.database.pfam); MINSEQLEN is the minimum query length for a
# sequence search.
import re
from os.path import isfile

from prody import LOGGER, PY3K
from prody.proteins import parsePDB
from prody.utilities import openURL, openFile, dictElement

if PY3K:
    import urllib.parse as urllib
    import urllib.request as urllib2
else:
    import urllib
    import urllib2

MINSEQLEN = 16

# Pfam web service base URL; used both in request URLs and, wrapped in
# braces, as the XML namespace prefix.
prefix = 'https://pfam.xfam.org/'


# This revision queries the EBI HMMER web service (hmmscan) for sequence
# queries and parses tab-separated results.
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accessions
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file.  Sequence queries must not contain gaps and must be at least 16
        characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            # not an MSA file; read it as a plain sequence file
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from '
                             + query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET

    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request(
            'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        results_url = urllib2.urlopen(request).geturl()

        # request tab-separated results from the download endpoint
        res_params = {'format': 'tsv'}
        enc_res_params = urllib.urlencode(res_params)
        modified_res_url = results_url.replace(
            'results', 'download') + '?' + enc_res_params

        result_request = urllib2.Request(modified_res_url)

        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        try:
            tsv = urllib2.urlopen(result_request).read()
        except:
            raise ValueError('No matching Pfam domains were found.')

        matches = {}
        if PY3K:
            tsv = tsv.decode()

        # parse the TSV table: the first line holds the column keys
        lines = tsv.split('\n')
        keys = lines[0].split('\t')
        root = {}
        for i, line in enumerate(lines[1:-1]):
            root[i] = {}
            for j, key in enumerate(keys):
                root[i][key] = line.split('\t')[j]

        for child in root.values():
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id] = {}
            matches[pfam_id]['accession'] = accession
            matches[pfam_id]['class'] = 'Domain'
            matches[pfam_id]['id'] = child['Family id']
            matches[pfam_id]['locations'] = {}
            matches[pfam_id]['locations']['ali_end'] = child['Ali. End']
            matches[pfam_id]['locations']['ali_start'] = child['Ali. Start']
            matches[pfam_id]['locations']['bitscore'] = child['Bit Score']
            matches[pfam_id]['locations']['end'] = child['Env. End']
            matches[pfam_id]['locations']['cond_evalue'] = child['Cond. E-value']
            matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value']
            matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
            matches[pfam_id]['locations']['hmm_end'] = child['Model End']
            matches[pfam_id]['locations']['hmm_start'] = child['Model Start']
            matches[pfam_id]['locations']['start'] = child['Env. Start']
            matches[pfam_id]['type'] = 'Pfam-A'
        return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        accession = dbref.accession
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = prefix + 'protein/' + seq + '?output=xml'
            else:
                url = prefix + 'protein/' + idcode + '?output=xml'
        else:
            url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    # poll until the result is no longer pending/running or timeout expires
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml not in ['PEND', 'RUN']:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif xml.find(b'No valid UniProt accession or ID') > 0:
        # 'accession' was set while parsing the PDB header above; if it is
        # missing, the lookup fails and we fall back to a sequence search
        try:
            url = prefix + 'protein/' + accession + '?output=xml'
            xml = openURL(url, timeout=timeout).read()
        except:
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq)
            except:
                raise ValueError('No valid UniProt accession or ID for: '
                                 + seq)

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        key = '{' + prefix + '}'
        results = dictElement(root[0], key)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:
        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')

    return matches
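# A minimal usage sketch for the revision above (assumes a working network
# connection and a ProDy environment; 'P19491' is an arbitrary illustrative
# UniProt ID).  For ID queries the 'locations' value is a list of location
# dictionaries whose keys follow the Pfam XML attributes; for sequence
# queries it is a single dictionary built from the TSV columns.
#
#   matches = searchPfam('P19491')
#   for acc, match in matches.items():
#       for loc in match['locations']:
#           print(acc, match.get('id'), loc.get('start'), loc.get('end'))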
# Variant of searchPfam that submits sequence queries to the
# hmmer.janelia.org service and parses XML output; note that this
# redefinition shadows the revision above when the module is imported.
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accessions
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file.  Sequence queries must not contain gaps and must be at least 16
        characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = '{http://pfam.xfam.org/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from '
                             + query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET

    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters)
        request = urllib2.Request('http://hmmer.janelia.org/search/hmmscan',
                                  enc_params)
        url = urllib2.urlopen(request).geturl() + '?output=xml'
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception as err:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        matches = {}
        for child in root[0]:
            # each 'hits' element corresponds to one matching Pfam family
            if child.tag == 'hits':
                accession = child.get('acc')
                pfam_id = accession.split('.')[0]
                matches[pfam_id] = {}
                matches[pfam_id]['accession'] = accession
                matches[pfam_id]['class'] = 'Domain'
                matches[pfam_id]['id'] = child.get('name')
                matches[pfam_id]['locations'] = {}
                matches[pfam_id]['locations']['ali_end'] = child[0].get('alisqto')
                matches[pfam_id]['locations']['ali_start'] = child[0].get('alisqfrom')
                matches[pfam_id]['locations']['bitscore'] = child[0].get('bitscore')
                matches[pfam_id]['locations']['end'] = child[0].get('alisqto')
                matches[pfam_id]['locations']['evalue'] = child.get('evalue')
                matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
                matches[pfam_id]['locations']['hmm_end'] = child[0].get('alihmmto')
                matches[pfam_id]['locations']['hmm_start'] = child[0].get('alihmmfrom')
                matches[pfam_id]['locations']['significant'] = child[0].get('significant')
                matches[pfam_id]['locations']['start'] = child[0].get('alisqfrom')
                matches[pfam_id]['type'] = 'Pfam-A'
        return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'
            else:
                url = ('http://pfam.xfam.org/protein/' + idcode +
                       '?output=xml')
        else:
            url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    # poll until a result is returned or the timeout expires
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:
        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')

    return matches
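# For reference, dictElement (imported from prody.utilities) behaves roughly
# like the sketch below: it maps the namespace-stripped tags of an element's
# children to the children themselves.  This is a simplified approximation,
# not the actual ProDy implementation.
#
#   def dictElement(element, prefix=None):
#       length = len(prefix) if prefix else 0
#       mapping = {}
#       for child in element:
#           tag = child.tag
#           if length and tag.startswith(prefix):
#               tag = tag[length:]
#           mapping[tag] = child
#       return mapping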
# Variant of searchPfam targeting the pfam.sanger.ac.uk service, with
# keyword options for Pfam-B searches and e-value/gathering-threshold
# cutoffs on sequence queries.
def searchPfam(query, search_b=False, skip_a=False, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accessions
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file.  Sequence queries must not contain gaps and must be at least 16
        characters long
    :type query: str

    :arg search_b: search Pfam-B families when **True**
    :type search_b: bool

    :arg skip_a: do not search Pfam-A families when **True**
    :type skip_a: bool

    :arg ga: use gathering threshold when **True**
    :type ga: bool

    :arg evalue: user specified e-value cutoff, must be smaller than 10.0
    :type evalue: float

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = '{http://pfam.sanger.ac.uk/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from '
                             + query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET

    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        urlextension = ''
        if kwargs:
            ga = int(kwargs.get('ga', 1))
            if not (ga == 1 or ga == 0):
                raise ValueError('ga must be either 0 or 1')

            evalue = kwargs.get('evalue', None)
            if evalue:
                if not float(evalue) <= 10.0:
                    raise ValueError('evalue must be a valid float < 10.0')
                # an explicit e-value cutoff replaces the gathering threshold
                urlextension = urlextension + '&evalue=' + str(evalue)
            else:
                urlextension = urlextension + '&ga=' + str(ga)

        search_b = int(bool(search_b))
        skip_a = int(bool(skip_a))
        if skip_a == 1:
            # skipping Pfam-A implies searching Pfam-B
            search_b = 1

        urlextension = urlextension + '&searchBs=' + str(search_b)
        urlextension = urlextension + '&skipAs=' + str(skip_a)

        url = ('http://pfam.sanger.ac.uk/search/sequence?seq=' + str(seq) +
               urlextension + '&output=xml')
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception as err:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        # the submission response points at the URL where results will appear
        try:
            url = dictElement(root[0], prefix)['result_url']
        except (IndexError, KeyError):
            raise ValueError('failed to parse results XML, check URL: ' + url)

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = ('http://pfam.sanger.ac.uk/protein/' + seq +
                       '?output=xml')
            else:
                url = ('http://pfam.sanger.ac.uk/protein/' + idcode +
                       '?output=xml')
        else:
            url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    # poll until a result is returned or the timeout expires
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:
        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')

    return matches
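# A usage sketch for the sanger.ac.uk variant above (example values only;
# the ga/evalue keywords apply to sequence queries, and skip_a implies a
# Pfam-B search).
#
#   seq = 'MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSF'   # example sequence
#   matches = searchPfam(seq, evalue=1e-5)           # custom e-value cutoff
#   with_b  = searchPfam(seq, search_b=True)         # also search Pfam-B
#   only_b  = searchPfam(seq, skip_a=True)           # Pfam-B only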