def queryUniprot(id, loop_through=[]):
    """Query Uniprot with *id* and return a `dictionary` containing the results

    :arg loop_through: entries through which you want to loop dictElements
        until there aren't any elements left
    :type loop_through: list
    """
    if not isinstance(id, str):
        raise TypeError('id should be a string')

    try:
        record_file = openURL(
            'http://www.uniprot.org/uniprot/{0}.xml'.format(id))
    except:
        raise ValueError('No Uniprot record found with that id')

    data = record_file.read()
    record_file.close()
    data = ET.XML(data)

    data = dictElement(data.getchildren()[0],
                       '{http://uniprot.org/uniprot}',
                       number_multiples=True)

    if loop_through != []:
        data = dictElementLoop(data, loop_through,
                               '{http://uniprot.org/uniprot}')

    return data
def queryUniprot(*args, n_attempts=3, dt=1, **kwargs):
    """Redefine ProDy function to check for no internet connection"""
    attempt = 0
    while attempt < n_attempts:
        try:
            _ = openURL('http://www.uniprot.org/')
            break
        except:
            LOGGER.info(f'Attempt {attempt} to contact www.uniprot.org failed')
            attempt += 1
            time.sleep((attempt + 1) * dt)
    else:
        # all attempts failed: try once more and let the exception propagate
        _ = openURL('http://www.uniprot.org/')

    return pd.queryUniprot(*args, **kwargs)
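# A minimal usage sketch for the retry wrapper above, assuming network access
# and that prody is imported as pd; 'P04637' is an arbitrary example accession.
def _demo_queryUniprot_retry():
    # retries up to n_attempts times with a linearly growing delay before
    # delegating to pd.queryUniprot
    data = queryUniprot('P04637', n_attempts=5, dt=2)
    print(sorted(data)[:5])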
def parseCCD(ids):
    """Retrieve Chemical Component Dictionary (CCD) entries from the
    Ligand Expo resource for one or more chemical component *ids*."""
    if isListLike(ids):
        n_ids = len(ids)
    else:
        ids = [ids]
        n_ids = 1

    ret = []
    for id in ids:
        id_url = 'http://ligand-expo.rcsb.org/reports/{0}/{1}/{1}.cif'.format(
            id[0], id)
        try:
            handle = openURL(id_url)
        except Exception as err:
            LOGGER.warn('download failed ({0}).'.format(str(err)))
        else:
            data = handle.read()
            if len(data):
                if PY3K:
                    data = data.decode()

                parsingDict, prog = parseSTARLines(data.split('\n'),
                                                   shlex=True)

                star_dict = StarDict(parsingDict, prog, id)
                ret.append(star_dict[id])
            else:
                ret.append(None)
                LOGGER.warn('Could not parse CCD data for {0}'.format(id))

    if n_ids == 1:
        return ret[0]

    return ret
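# A hedged usage sketch for parseCCD; 'STI' and 'ATP' are arbitrary example
# chemical component identifiers, and the Ligand Expo endpoint must be
# reachable for the downloads to succeed.
def _demo_parseCCD():
    sti = parseCCD('STI')            # single id: one parsed data block
    both = parseCCD(['STI', 'ATP'])  # list of ids: list of blocks
    print(type(sti), len(both))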
def parseOBO(**kwargs):
    """Parse a GO OBO file containing the GO itself.
    See `OBO`_ for more information on the file format.

    .. _OBO: http://owlcollab.github.io/oboformat/doc/obo-syntax.html
    """
    try:
        from goatools import obo_parser
    except:
        raise ImportError('GOATools needs to be installed to use parseOBO')

    go_obo_url = kwargs.get('go_obo_url', None)
    if go_obo_url is None:
        go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'

    data_folder = kwargs.get('data_folder', None)
    if data_folder is None:
        data_folder = os.getcwd() + '/Data'

    # Check if we have the data directory already
    if not os.path.isfile(data_folder):
        # Emulate mkdir -p (no error if folder exists)
        try:
            os.mkdir(data_folder)
        except OSError as e:
            if e.errno != 17:  # 17 is EEXIST
                raise e
    else:
        raise Exception(
            'Data path (' + data_folder + ') exists as a file. '
            'Please rename, remove or change the desired location of the '
            'data path.')

    # Download the file if it is not there yet; either way go_obo points
    # at the local copy that is handed to GODag below
    go_obo = data_folder + '/go-basic.obo'
    if not os.path.isfile(go_obo):
        try:
            handle = openURL(go_obo_url)
        except Exception as err:
            LOGGER.warn('{0} download failed ({1}).'.format(
                go_obo_url, str(err)))
        else:
            data = handle.read()
            if len(data):
                with open(go_obo, 'w+b') as obofile:
                    obofile.write(data)

                LOGGER.debug('{0} downloaded ({1})'.format(
                    go_obo_url, sympath(go_obo)))
            else:
                LOGGER.warn('{0} download failed, reason unknown.'.format(
                    go_obo_url))

    return obo_parser.GODag(go_obo)
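# A minimal sketch of working with the GODag returned by parseOBO, assuming
# GOATools is installed; 'GO:0008150' (biological_process) is a standard root
# term used here purely for illustration.
def _demo_parseOBO():
    go = parseOBO()  # downloads go-basic.obo into ./Data on first use
    term = go.get('GO:0008150')  # GODag is dict-like, keyed by GO id
    if term is not None:
        print(term.name, term.namespace)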
def queryUniprot(id, expand=[], regex=True):
    """Query Uniprot with *id* and return a `dict` containing the raw results.
    Regular users should use :func:`searchUniprot` instead.

    :arg expand: entries through which you want to loop dictElements
        until there aren't any elements left
    :type expand: list
    """
    if not isinstance(id, str):
        raise TypeError('id should be a string')

    try:
        record_file = openURL(
            'http://www.uniprot.org/uniprot/{0}.xml'.format(id))
    except:
        raise ValueError('No Uniprot record found with that id')

    data = record_file.read()
    record_file.close()
    data = XML(data)

    data = dictElement(data.getchildren()[0],
                       '{http://uniprot.org/uniprot}',
                       number_multiples=True)

    for key in data:
        value = data[key]
        if not key.startswith('dbReference'):
            continue

        try:
            if value.get('type') != 'PDB':
                continue
        except AttributeError:
            continue

        pdbid = value.get('id')
        refdata = {'PDB': pdbid}
        for prop in value:
            prop_key = prop.get('type')
            prop_val = prop.get('value')
            refdata[prop_key] = prop_val
        data[key] = refdata

    if expand:
        keys = []
        if regex:
            for lt in expand:
                lt_re = re.compile(lt)
                for key in data:
                    if lt_re.match(key):
                        keys.append(key)
        else:
            keys = expand
        data = dictElementLoop(data, keys, '{http://uniprot.org/uniprot}')

    return data
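# A usage sketch for the dbReference handling above; 'P04637' is an arbitrary
# example accession, and the printed property names follow the PDB
# cross-reference entries flattened into plain dicts by queryUniprot.
def _demo_queryUniprot_pdbrefs():
    data = queryUniprot('P04637')
    for key, value in data.items():
        if key.startswith('dbReference') and isinstance(value, dict):
            print(value.get('PDB'), value.get('method'),
                  value.get('resolution'))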
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by blastclust.
    They are available at FTP site: ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it after
    compressing in your home directory in :file:`.prody/pdbclusters`.
    Compressed files will be less than 4 Mb in size.  Cluster data can
    be loaded using :func:`loadPDBClusters` function and be accessed
    using :func:`listPDBCluster`."""

    if sqid is not None:
        if isListLike(sqid):
            for s in sqid:
                if s not in PDB_CLUSTERS:
                    raise ValueError('sqid must be one or more of ' +
                                     PDB_CLUSTERS_SQID_STR)
            keys = list(sqid)
        else:
            if sqid not in PDB_CLUSTERS:
                raise ValueError('sqid must be one or more of ' +
                                 PDB_CLUSTERS_SQID_STR)
            keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)

    LOGGER.progress('Downloading sequence clusters', len(keys),
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.'.format(x))
            continue
        else:
            out = openFile(filename + '.gz', 'w', folder=PDB_CLUSTERS_PATH)
            out.write(inp.read())
            inp.close()
            out.close()
            count += 1
        LOGGER.update(i, label='_prody_fetchPDBClusters')
    LOGGER.finish()
    if len(keys) == count:
        LOGGER.info('All selected PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warn('PDB clusters could not be downloaded.')
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by blastclust.
    They are available at FTP site: ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it after
    compressing in your home directory in :file:`.prody/pdbclusters`.
    Compressed files will be less than 4 Mb in size.  Cluster data can
    be loaded using :func:`loadPDBClusters` function and be accessed
    using :func:`listPDBCluster`."""

    if sqid is not None:
        if sqid not in PDB_CLUSTERS:
            raise ValueError('sqid must be one of ' + PDB_CLUSTERS_SQID_STR)
        keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)

    LOGGER.progress('Downloading sequence clusters', len(PDB_CLUSTERS),
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.'.format(x))
            continue
        else:
            out = openFile(filename + '.gz', 'w', folder=PDB_CLUSTERS_PATH)
            out.write(inp.read())
            inp.close()
            out.close()
            count += 1
        LOGGER.update(i, '_prody_fetchPDBClusters')
    LOGGER.clear()
    if len(PDB_CLUSTERS) == count:
        LOGGER.info('All PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warn('PDB clusters could not be downloaded.')
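# A hedged sketch of the workflow described in the docstrings above: fetch one
# identity level, then load it with loadPDBClusters (referenced there but
# defined elsewhere); 90 is one of the standard identity levels.
def _demo_fetchPDBClusters():
    fetchPDBClusters(sqid=90)
    loadPDBClusters()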
def blastPDBUniProtKB(sequence, filename=None, **kwargs):
    """Returns a :class:`SwissProtBlastRecord` instance that contains results
    from blast searching the SwissProt database with *sequence* using NCBI
    blastp.

    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``10e-5``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is multiplied by 1.5 when results are not ready.
    *timeout* (default is 120 s) determines when to give up waiting for the
    results.  *num_sequences* (default is ``1``)
    """
    num_sequences = int(kwargs.pop('num_sequences', 1))
    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')
    else:
        if num_sequences == 1:
            try:
                sequence = ''.join(sequence.split())
                _ = sequence.isalpha()
            except AttributeError:
                raise TypeError('sequence must be a string')
            else:
                if not _:
                    raise ValueError('not a valid protein sequence')

    headers = {'User-agent': 'ProDy'}

    query = [('DATABASE', 'swissprot'), ('ENTREZ_QUERY', '(none)'),
             ('PROGRAM', 'blastp'),]

    expect = float(kwargs.pop('expect', 10e-5))
    if expect <= 0:
        raise ValueError('expect must be a positive number')
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    if hitlist_size <= 0:
        raise ValueError('hitlist_size must be a positive integer')

    psiblast = 'true'
    step_number = 3
    query.append(('RUN_PSIBLAST', psiblast))
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))
    query.append(('STEP_NUMBER', step_number))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    if kwargs:
        LOGGER.warn('Keyword argument(s) {0} are not used.'
                    .format(', '.join([repr(key) for key in kwargs])))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'
                .format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b'name="RID" type="hidden" value="')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find(b'>', index)
        rid = html[index + len('name="RID" type="hidden" value="'):
                   last - 1].strip()

    index = html.find(b'name="RTOE" type="hidden" value="')
    if index == -1:
        rtoe = None  # This is not used
    else:
        last = html.find(b'>', index)
        rtoe = html[index + len('name="RTOE" type="hidden" value="'):
                    last - 1].strip()

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect NCBI for search results.')
        LOGGER.write('Connecting NCBI for search results...')
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b'Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b'\n', index)
        status = results[index + len('Status='):last].strip()
        if status.upper() == b'READY':
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None

    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')

    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        out = open(filename, 'wb')
        out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

    return SwissProtBlastRecord(results, sequence)
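# A minimal sketch for blastPDBUniProtKB; 'runexample' triggers the built-in
# example sequence, so no input of your own is needed, but NCBI BLAST must be
# reachable and the search can take minutes.
def _demo_blastPDBUniProtKB():
    record = blastPDBUniProtKB('runexample', timeout=240)
    if record is not None:  # None means the search timed out
        print(type(record))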
def fetchPfamMSA(acc, alignment='full', compressed=False, **kwargs):
    """Return a path to the downloaded Pfam MSA file.

    :arg acc: Pfam ID or Accession Code
    :type acc: str

    :arg alignment: alignment type, one of ``'full'`` (default), ``'seed'``,
        ``'ncbi'``, ``'metagenomics'``, ``'rp15'``, ``'rp35'``, ``'rp55'``,
        or ``'rp75'`` where rp stands for representative proteomes

    :arg compressed: gzip the downloaded MSA file, default is **False**

    *Alignment Options*

    :arg format: a Pfam supported MSA file format, one of ``'selex'``
        (default), ``'stockholm'`` or ``'fasta'``

    :arg order: ordering of sequences, ``'tree'`` (default) or
        ``'alphabetical'``

    :arg inserts: letter case for inserts, ``'upper'`` (default) or ``'lower'``

    :arg gaps: gap character, one of ``'dashes'`` (default), ``'dots'``,
        ``'mixed'`` or **None** for unaligned

    *Other Options*

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60

    :arg outname: out filename, default is input ``'acc_alignment.format'``

    :arg folder: output folder, default is ``'.'``"""

    url = 'http://pfam.sanger.ac.uk/family/acc?id=' + acc
    handle = openURL(url)
    orig_acc = acc
    acc = handle.readline().strip()
    if PY3K:
        acc = acc.decode()
    url_flag = False

    if not re.search('(?<=PF)[0-9]{5}$', acc):
        raise ValueError('{0} is not a valid Pfam ID or Accession Code'
                         .format(repr(orig_acc)))

    if alignment not in DOWNLOAD_FORMATS:
        raise ValueError('alignment must be one of full, seed, ncbi or'
                         ' metagenomics')
    if alignment == 'ncbi' or alignment == 'metagenomics':
        url = ('http://pfam.sanger.ac.uk/family/' + acc + '/alignment/' +
               alignment + '/gzipped')
        url_flag = True
        extension = '.sth'
    else:
        if not kwargs:
            url = ('http://pfam.sanger.ac.uk/family/' + acc + '/alignment/' +
                   alignment + '/gzipped')
            url_flag = True
            extension = '.sth'
        else:
            align_format = kwargs.get('format', 'selex').lower()

            if align_format not in FORMAT_OPTIONS['format']:
                raise ValueError('alignment format must be of type selex'
                                 ' stockholm or fasta. MSF not supported')

            if align_format == SELEX:
                align_format, extension = 'pfam', '.slx'
            elif align_format == FASTA:
                extension = '.fasta'
            else:
                extension = '.sth'

            gaps = str(kwargs.get('gaps', 'dashes')).lower()
            if gaps not in FORMAT_OPTIONS['gaps']:
                raise ValueError('gaps must be of type mixed, dots, dashes, '
                                 'or None')

            inserts = kwargs.get('inserts', 'upper').lower()
            if inserts not in FORMAT_OPTIONS['inserts']:
                raise ValueError('inserts must be of type lower or upper')

            order = kwargs.get('order', 'tree').lower()
            if order not in FORMAT_OPTIONS['order']:
                raise ValueError('order must be of type tree or alphabetical')

            url = ('http://pfam.sanger.ac.uk/family/' + acc + '/alignment/' +
                   alignment + '/format?format=' + align_format +
                   '&alnType=' + alignment + '&order=' + order[0] +
                   '&case=' + inserts[0] + '&gaps=' + gaps + '&download=1')

    response = openURL(url, timeout=int(kwargs.get('timeout', 60)))
    outname = kwargs.get('outname', None)
    if not outname:
        outname = orig_acc
    folder = str(kwargs.get('folder', '.'))
    filepath = join(makePath(folder), outname + '_' + alignment + extension)
    if compressed:
        filepath = filepath + '.gz'
        if url_flag:
            f_out = open(filepath, 'wb')
        else:
            f_out = openFile(filepath, 'wb')
        f_out.write(response.read())
        f_out.close()
    else:
        if url_flag:
            gunzip(response.read(), filepath)
        else:
            with open(filepath, 'wb') as f_out:
                f_out.write(response.read())

    filepath = relpath(filepath)
    LOGGER.info('Pfam MSA for {0} is written as {1}.'
                .format(orig_acc, filepath))

    return filepath
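# A usage sketch for fetchPfamMSA, assuming the legacy pfam.sanger.ac.uk
# service still answers; 'PF00074' and the output name are arbitrary examples.
def _demo_fetchPfamMSA():
    path = fetchPfamMSA('PF00074', alignment='seed', format='fasta',
                        gaps='dashes', outname='pf00074_demo')
    print(path)  # e.g. pf00074_demo_seed.fasta under the current folder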
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file; sequence queries must not contain gaps and must be at least
        16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = '{http://pfam.xfam.org/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters)
        request = urllib2.Request('http://hmmer.janelia.org/search/hmmscan',
                                  enc_params)
        url = (urllib2.urlopen(request).geturl() + '?output=xml')
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception as err:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        matches = {}
        for child in root[0]:
            if child.tag == 'hits':
                accession = child.get('acc')
                pfam_id = accession.split('.')[0]
                matches[pfam_id] = {}
                matches[pfam_id]['accession'] = accession
                matches[pfam_id]['class'] = 'Domain'
                matches[pfam_id]['id'] = child.get('name')
                matches[pfam_id]['locations'] = {}
                matches[pfam_id]['locations']['ali_end'] = child[0].get('alisqto')
                matches[pfam_id]['locations']['ali_start'] = child[0].get('alisqfrom')
                matches[pfam_id]['locations']['bitscore'] = child[0].get('bitscore')
                matches[pfam_id]['locations']['end'] = child[0].get('alisqto')
                matches[pfam_id]['locations']['evalue'] = child.get('evalue')
                matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
                matches[pfam_id]['locations']['hmm_end'] = child[0].get('alihmmto')
                matches[pfam_id]['locations']['hmm_start'] = child[0].get('alihmmfrom')
                matches[pfam_id]['locations']['significant'] = child[0].get('significant')
                matches[pfam_id]['locations']['start'] = child[0].get('alisqfrom')
                matches[pfam_id]['type'] = 'Pfam-A'
        return matches
    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'
            else:
                url = ('http://pfam.xfam.org/protein/' + idcode +
                       '?output=xml')
        else:
            url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:
        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')

    return matches
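# A sketch of the short-query path above: a PDB id plus chain ('1mkpA', from
# the docstring example) resolves to a UniProt ID before Pfam is queried.
def _demo_searchPfam_pdb():
    matches = searchPfam('1mkpA')
    if matches:
        for acc, info in matches.items():
            print(acc, info.get('id'), len(info.get('locations', [])))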
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file; sequence queries must not contain gaps and must be at least
        16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    # NOTE: assumed Pfam base URL; it is used below both for protein lookups
    # and, wrapped in curly braces, as the XML namespace key
    prefix = 'http://pfam.xfam.org/'

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request(
            'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        results_url = urllib2.urlopen(request).geturl()

        # results are fetched as TSV from the download URL rather than as XML
        res_params = {'format': 'tsv'}
        enc_res_params = urllib.urlencode(res_params)
        modified_res_url = results_url.replace(
            'results', 'download') + '?' + enc_res_params

        result_request = urllib2.Request(modified_res_url)
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        try:
            tsv = urllib2.urlopen(result_request).read()
        except:
            raise ValueError('No matching Pfam domains were found.')

        matches = {}
        if PY3K:
            tsv = tsv.decode()

        lines = tsv.split('\n')
        keys = lines[0].split('\t')
        root = {}
        for i, line in enumerate(lines[1:-1]):
            root[i] = {}
            for j, key in enumerate(keys):
                root[i][key] = line.split('\t')[j]

        for child in root.values():
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id] = {}
            matches[pfam_id]['accession'] = accession
            matches[pfam_id]['class'] = 'Domain'
            matches[pfam_id]['id'] = child['Family id']
            matches[pfam_id]['locations'] = {}
            matches[pfam_id]['locations']['ali_end'] = child['Ali. End']
            matches[pfam_id]['locations']['ali_start'] = child['Ali. Start']
            matches[pfam_id]['locations']['bitscore'] = child['Bit Score']
            matches[pfam_id]['locations']['end'] = child['Env. End']
            matches[pfam_id]['locations']['cond_evalue'] = child['Cond. E-value']
            matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value']
            matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
            matches[pfam_id]['locations']['hmm_end'] = child['Model End']
            matches[pfam_id]['locations']['hmm_start'] = child['Model Start']
            matches[pfam_id]['locations']['start'] = child['Env. Start']
            matches[pfam_id]['type'] = 'Pfam-A'
        return matches
    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        accession = dbref.accession
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = prefix + 'protein/' + seq + '?output=xml'
            else:
                url = prefix + 'protein/' + idcode + '?output=xml'
        else:
            url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml not in [b'PEND', b'RUN']:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif xml.find(b'No valid UniProt accession or ID') > 0:
        try:
            url = prefix + 'protein/' + accession + '?output=xml'
            xml = openURL(url, timeout=timeout).read()
        except:
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq)
            except:
                raise ValueError('No valid UniProt accession or ID for: ' +
                                 seq)

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        key = '{' + prefix + '}'
        results = dictElement(root[0], key)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:
        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')

    return matches
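# The TSV branch above lays hits out differently from the old XML service;
# this sketch feeds it the module's example sequence (>= MINSEQLEN residues)
# and prints a few of the per-hit fields it populates.
def _demo_searchPfam_tsv():
    seq = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
           'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN')
    for pfam_id, info in searchPfam(seq).items():
        loc = info['locations']
        print(pfam_id, info['id'], loc['ali_start'], loc['ali_end'],
              loc['ind_evalue'])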
def searchUniprotID(query, search_b=False, skip_a=False, **kwargs):
    """Returns the UniProt ID parsed from the Pfam protein record matching
    *query*.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file; sequence queries must not contain gaps and must be at least
        16 characters long
    :type query: str

    :arg search_b: search Pfam-B families when **True**
    :type search_b: bool

    :arg skip_a: do not search Pfam-A families when **True**
    :type skip_a: bool

    :arg ga: use gathering threshold when **True**
    :type ga: bool

    :arg evalue: user specified e-value cutoff, must be smaller than 10.0
    :type evalue: float

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = '{http://pfam.xfam.org/}'
    query = str(query)
    seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))

    url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    result = root[0].get('id')
    return result
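# A minimal sketch: searchUniprotID returns only the id attribute of the first
# entry element of the Pfam protein XML; 'P04637' is an arbitrary accession.
def _demo_searchUniprotID():
    print(searchUniprotID('P04637'))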
def blastPDB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching of ProteinDataBank database *sequence* using NCBI blastp.

    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``10e-10``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is multiplied by 1.5 when results are not ready.
    *timeout* (default is 120 s) determines when to give up waiting for the
    results.
    """
    if sequence == "runexample":
        sequence = ("ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI"
                    "SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN"
                    "DAYDIVKMKKSNISPNFNFMGQLLDFERTL")
    else:
        try:
            sequence = "".join(sequence.split())
            _ = sequence.isalpha()
        except AttributeError:
            raise TypeError("sequence must be a string")
        else:
            if not _:
                raise ValueError("not a valid protein sequence")

    headers = {"User-agent": "ProDy"}

    query = [("DATABASE", "pdb"), ("ENTREZ_QUERY", "(none)"),
             ("PROGRAM", "blastp")]

    expect = float(kwargs.pop("expect", 10e-10))
    if expect <= 0:
        raise ValueError("expect must be a positive number")
    query.append(("EXPECT", expect))
    hitlist_size = int(kwargs.pop("hitlist_size", 250))
    if hitlist_size <= 0:
        raise ValueError("hitlist_size must be a positive integer")
    query.append(("HITLIST_SIZE", hitlist_size))
    query.append(("QUERY", sequence))
    query.append(("CMD", "Put"))

    sleep = float(kwargs.pop("sleep", 2))
    timeout = float(kwargs.pop("timeout", 120))

    if kwargs:
        LOGGER.warn("Keyword argument(s) {0} are not used."
                    .format(", ".join([repr(key) for key in kwargs])))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), "utf-8")
    except ImportError:
        from urllib import urlencode

    url = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"

    data = urlencode(query)
    LOGGER.timeit("_prody_blast")
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'
                .format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b"RID =")
    if index == -1:
        raise Exception("NCBI did not return expected response.")
    else:
        last = html.find(b"\n", index)
        rid = html[index + len("RID ="):last].strip()

    index = html.find(b"RTOE =")
    if index == -1:
        rtoe = None  # This is not used
    else:
        last = html.find(b"\n", index)
        rtoe = int(html[index + len("RTOE ="):last].strip())

    query = [("ALIGNMENTS", 500), ("DESCRIPTIONS", 500),
             ("FORMAT_TYPE", "XML"), ("RID", rid), ("CMD", "Get")]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), "to reconnect NCBI for search results.")
        LOGGER.write("Connecting NCBI for search results...")
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b"Status=")
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b"\n", index)
        status = results[index + len("Status="):last].strip()
        if status.upper() == b"READY":
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing("_prody_blast") > timeout:
            LOGGER.warn("Blast search time out.")
            return None

    LOGGER.clear()
    LOGGER.report("Blast search completed in %.1fs.", "_prody_blast")

    try:
        ext_xml = filename.lower().endswith(".xml")
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += ".xml"
        out = open(filename, "wb")
        out.write(results)
        out.close()
        LOGGER.info("Results are saved as {0}.".format(repr(filename)))

    return PDBBlastRecord(results, sequence)
def fetchPDBLigand(cci, filename=None):
    """Fetch PDB ligand data from PDB_ for chemical component *cci*.
    *cci* may be 3-letter chemical component identifier or a valid XML
    filename.  If *filename* is given, XML file will be saved with that name.

    If you query ligand data frequently, you may configure ProDy to save XML
    files in your computer.  Set ``ligand_xml_save`` option **True**, i.e.
    ``confProDy(ligand_xml_save=True)``.  Compressed XML files will be saved
    to ProDy package folder, e.g. :file:`/home/user/.prody/pdbligands`.  Each
    file is around 5Kb when compressed.

    This function is compatible with PDBx/PDBML v 4.0.

    Ligand data is returned in a dictionary.  Ligand coordinate atom data
    with *model* and *ideal* coordinate sets are also stored in this
    dictionary.  Note that this dictionary will contain data that is present
    in the XML file and all Ligand Expo XML files do not contain every
    possible data field.  So, it may be better if you use :meth:`dict.get`
    instead of indexing the dictionary, e.g. to retrieve formula weight (or
    relative molar mass) of the chemical component use
    ``data.get('formula_weight')`` instead of ``data['formula_weight']`` to
    avoid exceptions when this data field is not found in the XML file.  URL
    and/or path of the XML file are returned in the dictionary with keys
    ``url`` and ``path``, respectively.

    Following example downloads data for ligand STI (a.k.a. Gleevec and
    Imatinib) and calculates RMSD between model (X-ray structure 1IEP) and
    ideal (energy minimized) coordinate sets:

    .. ipython:: python

       from prody import *
       ligand_data = fetchPDBLigand('STI')
       ligand_data['model_coordinates_db_code']
       ligand_model = ligand_data['model']
       ligand_ideal = ligand_data['ideal']
       transformation = superpose(ligand_ideal.noh, ligand_model.noh)
       calcRMSD(ligand_ideal.noh, ligand_model.noh)"""

    if not isinstance(cci, str):
        raise TypeError('cci must be a string')
    if isfile(cci):
        inp = openFile(cci)
        xml = inp.read()
        inp.close()
        url = None
        path = cci
        cci = splitext(splitext(split(cci)[1])[0])[0].upper()
    elif len(cci) > 4 or not cci.isalnum():
        raise ValueError('cci must be 3-letters long and alphanumeric or '
                         'a valid filename')
    else:
        xml = None
        cci = cci.upper()
        if SETTINGS.get('ligand_xml_save'):
            folder = join(getPackagePath(), 'pdbligands')
            if not isdir(folder):
                makePath(folder)
            xmlgz = path = join(folder, cci + '.xml.gz')
            if isfile(xmlgz):
                with openFile(xmlgz) as inp:
                    xml = inp.read()
        else:
            path = None
        url = 'http://files.rcsb.org/ligands/download/{0}.xml'.format(
            cci.upper())

    if not xml:
        try:
            inp = openURL(url)
        except IOError:
            raise IOError(
                'XML file for ligand {0} is not found online'.format(cci))
        else:
            xml = inp.read()
            inp.close()
        if filename:
            out = openFile(filename, mode='w', folder=folder)
            out.write(xml)
            out.close()
        if SETTINGS.get('ligand_xml_save'):
            with openFile(xmlgz, 'w') as out:
                out.write(xml)

    import xml.etree.cElementTree as ET

    root = ET.XML(xml)
    if (root.get('{http://www.w3.org/2001/XMLSchema-instance}'
                 'schemaLocation') !=
            'http://pdbml.pdb.org/schema/pdbx-v40.xsd pdbx-v40.xsd'):
        LOGGER.warn('XML is not in PDBx/PDBML v 4.0 format, resulting '
                    'dictionary may not contain all data fields')
    ns = root.tag[:root.tag.rfind('}') + 1]
    len_ns = len(ns)
    dict_ = {'url': url, 'path': path}

    for child in list(root.find(ns + 'chem_compCategory')[0]):
        tag = child.tag[len_ns:]
        if tag.startswith('pdbx_'):
            tag = tag[5:]
        dict_[tag] = child.text
    # not every XML file provides formula_weight, so only convert when present
    if dict_.get('formula_weight') is not None:
        dict_['formula_weight'] = float(dict_['formula_weight'])

    identifiers_and_descriptors = []
    results = root.find(ns + 'pdbx_chem_comp_identifierCategory')
    if results:
        identifiers_and_descriptors.extend(results)
    results = root.find(ns + 'pdbx_chem_comp_descriptorCategory')
    if results:
        identifiers_and_descriptors.extend(results)
    for child in identifiers_and_descriptors:
        program = child.get('program').replace(' ', '_')
        type_ = child.get('type').replace(' ', '_')
        dict_[program + '_' + type_] = child[0].text
        dict_[program + '_version'] = child.get('program_version')

    dict_['audits'] = [
        (audit.get('action_type'), audit.get('date'))
        for audit in list(root.find(ns + 'pdbx_chem_comp_auditCategory'))
    ]

    atoms = list(root.find(ns + 'chem_comp_atomCategory'))
    n_atoms = len(atoms)
    ideal_coords = np.zeros((n_atoms, 3))
    model_coords = np.zeros((n_atoms, 3))

    atomnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['name'].dtype)
    elements = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['element'].dtype)
    resnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['resname'].dtype)
    charges = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['charge'].dtype)

    resnums = np.ones(n_atoms, dtype=ATOMIC_FIELDS['charge'].dtype)

    alternate_atomnames = np.zeros(n_atoms,
                                   dtype=ATOMIC_FIELDS['name'].dtype)
    leaving_atom_flags = np.zeros(n_atoms, bool)
    aromatic_flags = np.zeros(n_atoms, bool)
    stereo_configs = np.zeros(n_atoms, bool)
    ordinals = np.zeros(n_atoms, int)

    name2index = {}

    for i, atom in enumerate(atoms):
        data = dict([(child.tag[len_ns:], child.text)
                     for child in list(atom)])

        name = data.get('pdbx_component_atom_id', 'X')
        name2index[name] = i
        atomnames[i] = name
        elements[i] = data.get('type_symbol', 'X')
        resnames[i] = data.get('pdbx_component_comp_id', 'UNK')
        charges[i] = float(data.get('charge', 0))
        alternate_atomnames[i] = data.get('alt_atom_id', 'X')
        leaving_atom_flags[i] = data.get('pdbx_leaving_atom_flag') == 'Y'
        aromatic_flags[i] = data.get('pdbx_aromatic_flag') == 'Y'
        stereo_configs[i] = data.get('pdbx_stereo_config') == 'Y'
        ordinals[i] = int(data.get('pdbx_ordinal', 0))

        model_coords[i, 0] = float(data.get('model_Cartn_x', 0))
        model_coords[i, 1] = float(data.get('model_Cartn_y', 0))
        model_coords[i, 2] = float(data.get('model_Cartn_z', 0))
        ideal_coords[i, 0] = float(data.get('pdbx_model_Cartn_x_ideal', 0))
        ideal_coords[i, 1] = float(data.get('pdbx_model_Cartn_y_ideal', 0))
        ideal_coords[i, 2] = float(data.get('pdbx_model_Cartn_z_ideal', 0))

    pdbid = dict_.get('model_coordinates_db_code')
    if pdbid:
        model = AtomGroup(cci + ' model ({0})'.format(pdbid))
    else:
        model = AtomGroup(cci + ' model')
    model.setCoords(model_coords)
    model.setNames(atomnames)
    model.setResnames(resnames)
    model.setResnums(resnums)
    model.setElements(elements)
    model.setCharges(charges)
    model.setFlags('leaving_atom_flags', leaving_atom_flags)
    model.setFlags('aromatic_flags', aromatic_flags)
    model.setFlags('stereo_configs', stereo_configs)
    model.setData('ordinals', ordinals)
    model.setData('alternate_atomnames', alternate_atomnames)
    dict_['model'] = model
    ideal = model.copy()
    ideal.setTitle(cci + ' ideal')
    ideal.setCoords(ideal_coords)
    dict_['ideal'] = ideal

    bonds = []
    warned = set()
    for bond in list(root.find(ns + 'chem_comp_bondCategory') or bonds):
        name_1 = bond.get('atom_id_1')
        name_2 = bond.get('atom_id_2')
        try:
            bonds.append((name2index[name_1], name2index[name_2]))
        except KeyError:
            if name_1 not in warned and name_1 not in name2index:
                warned.add(name_1)
                LOGGER.warn('{0} specified {1} in bond category is not '
                            'a valid atom name.'.format(repr(name_1), cci))
            if name_2 not in warned and name_2 not in name2index:
                warned.add(name_2)
                LOGGER.warn('{0} specified {1} in bond category is not '
                            'a valid atom name.'.format(repr(name_2), cci))

    if bonds:
        bonds = np.array(bonds, int)
        model.setBonds(bonds)
        ideal.setBonds(bonds)

    return dict_
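# A short sketch of defensive access into the dictionary returned above, as
# its docstring recommends; bonds are only set when the XML provides them.
def _demo_fetchPDBLigand_fields():
    data = fetchPDBLigand('STI')
    print(data.get('formula_weight'))
    model = data['model']
    if model.getBonds() is not None:
        print(model.numBonds())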
def blastPDB(sequence, filename=None, **kwargs): """Returns a :class:`PDBBlastRecord` instance that contains results from blast searching *sequence* against the PDB using NCBI blastp. :arg sequence: an object with an associated sequence string or a sequence string itself :type sequence: :class:`Atomic`, :class:`Sequence`, or str :arg filename: a *filename* to save the results in XML format :type filename: str *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``) search parameters can be adjusted by the user. *sleep* keyword argument (default is ``2`` seconds) determines how long to wait to reconnect for results. Sleep time is multiplied by 1.5 when results are not ready. *timeout* (default is 120 s) determines when to give up waiting for the results. """ if sequence == 'runexample': sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI' 'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN' 'DAYDIVKMKKSNISPNFNFMGQLLDFERTL') elif isinstance(sequence, Atomic): sequence = sequence.calpha.getSequence() elif isinstance(sequence, Sequence): sequence = str(sequence) elif isinstance(sequence, str): if len(sequence) in [4, 5, 6]: ag = parsePDB(sequence) sequence = ag.calpha.getSequence() sequence = ''.join(sequence.split()) else: raise TypeError('sequence must be Atomic, Sequence, or str not {0}' .format(type(sequence))) headers = {'User-agent': 'ProDy'} query = [('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'), ('PROGRAM', 'blastp'),] expect = float(kwargs.pop('expect', 1e-10)) if expect <= 0: raise ValueError('expect must be a positive number') query.append(('EXPECT', expect)) hitlist_size = int(kwargs.pop('hitlist_size', 250)) if hitlist_size <= 0: raise ValueError('hitlist_size must be a positive integer') query.append(('HITLIST_SIZE', hitlist_size)) query.append(('QUERY', sequence)) query.append(('CMD', 'Put')) sleep = float(kwargs.pop('sleep', 2)) timeout = float(kwargs.pop('timeout', 120)) try: import urllib.parse urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8') except ImportError: from urllib import urlencode url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi' data = urlencode(query) LOGGER.timeit('_prody_blast') LOGGER.info('Blast searching NCBI PDB database for "{0}..."' .format(sequence[:5])) handle = openURL(url, data=data, headers=headers) html = handle.read() index = html.find(b'RID =') if index == -1: raise Exception('NCBI did not return expected response.') else: last = html.find(b'\n', index) rid = html[index + len('RID ='):last].strip() index = html.find(b'RTOE =') if index == -1: rtoe = None # This is not used else: last = html.find(b'\n', index) rtoe = int(html[index + len('RTOE ='):last].strip()) query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500), ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')] data = urlencode(query) while True: LOGGER.sleep(int(sleep), 'to reconnect NCBI for search results.') LOGGER.write('Connecting to NCBI for search results...') handle = openURL(url, data=data, headers=headers) results = handle.read() index = results.find(b'Status=') LOGGER.clear() if index < 0: break last = results.index(b'\n', index) status = results[index+len('Status='):last].strip() if status.upper() == b'READY': break sleep = int(sleep * 1.5) if LOGGER.timing('_prody_blast') > timeout: LOGGER.warn('Blast search time out.') return None LOGGER.clear() LOGGER.report('Blast search completed in %.1fs.', '_prody_blast') try: ext_xml = filename.lower().endswith('.xml') except AttributeError: pass else: if not ext_xml: filename += '.xml' out = open(filename, 'w') if PY3K: out.write(results.decode()) else: out.write(results) out.close() LOGGER.info('Results are saved as {0}.'.format(repr(filename))) return PDBBlastRecord(results, sequence)
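A usage sketch for the function above. The getBest and getHits accessors are assumed PDBBlastRecord methods (they are not defined in this file), so treat their names and signatures as assumptions:

record = blastPDB('runexample', filename='blast_results.xml', timeout=240)
if record is not None:  # None is returned when the search times out
    best = record.getBest()  # assumed accessor: hit with the highest percent identity
    print(best['pdb_id'], best['percent_identity'])
    # assumed signature: filter hits by identity/coverage thresholds
    hits = record.getHits(percent_identity=90, percent_overlap=70)
    print(len(hits))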
def searchPfam(query, **kwargs): """Return Pfam search results in a dictionary. Matching Pfam accession as keys will map to evalue, alignment start and end residue positions. :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence file; sequence queries must not contain gaps and must be at least 16 characters long :type query: str :arg timeout: timeout for blocking connection attempt in seconds, default is 60 :type timeout: int *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with chain identifier. UniProt ID of the specified chain, or the first protein chain will be used for searching the Pfam database.""" prefix = '{http://pfam.xfam.org/}' query = str(query) if isfile(query): from prody.sequence import MSAFile try: seq = next(MSAFile(query)) except: with openFile(query) as inp: seq = ''.join(inp.read().split()) else: seq = seq[0][1] if not seq.isalpha(): raise ValueError('could not parse a sequence without gaps from ' + query) else: seq = ''.join(query.split()) import xml.etree.cElementTree as ET LOGGER.timeit('_pfam') timeout = int(kwargs.get('timeout', 60)) if len(seq) >= MINSEQLEN: if not seq.isalpha(): raise ValueError(repr(seq) + ' is not a valid sequence') fseq = '>Seq\n' + seq parameters = { 'hmmdb' : 'pfam', 'seq': fseq } enc_params = urllib.parse.urlencode(parameters).encode('utf-8') request = urllib.request.Request('http://hmmer.janelia.org/search/hmmscan', enc_params) url = ( urllib.request.urlopen(request).geturl() + '?output=xml') LOGGER.debug('Submitted Pfam search for sequence "{0}...".' .format(seq[:MINSEQLEN])) xml = openURL(url, timeout=timeout).read() try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) matches = {} for child in root[0]: if child.tag == 'hits': accession = child.get('acc') pfam_id = accession.split('.')[0] matches[pfam_id] = {} matches[pfam_id]['accession'] = accession matches[pfam_id]['class'] = 'Domain' matches[pfam_id]['id'] = child.get('name') matches[pfam_id]['locations'] = {} matches[pfam_id]['locations']['ali_end'] = child[0].get('alisqto') matches[pfam_id]['locations']['ali_start'] = child[0].get('alisqfrom') matches[pfam_id]['locations']['bitscore'] = child[0].get('bitscore') matches[pfam_id]['locations']['end'] = child[0].get('alisqto') matches[pfam_id]['locations']['evalue'] = child.get('evalue') matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0' matches[pfam_id]['locations']['hmm_end'] = child[0].get('alihmmto') matches[pfam_id]['locations']['hmm_start'] = child[0].get('alihmmfrom') matches[pfam_id]['locations']['significant'] = child[0].get('significant') matches[pfam_id]['locations']['start'] = child[0].get('alisqfrom') matches[pfam_id]['type'] = 'Pfam-A' return matches else: if len(seq) <= 5: idcode = None from prody import parsePDBHeader try: polymers = parsePDBHeader(seq[:4], 'polymers') except Exception as err: LOGGER.warn('failed to parse header for {0} ({1})' .format(seq[:4], str(err))) else: chid = seq[4:].upper() for poly in polymers: if chid and poly.chid != chid: continue for dbref in poly.dbrefs: if dbref.database != 'UniProt': continue idcode = dbref.idcode LOGGER.info('UniProt ID code {0} for {1} chain ' '{2} will be used.'
.format(idcode, seq[:4], poly.chid)) break if idcode is not None: break if idcode is None: LOGGER.warn('A UniProt ID code for PDB {0} could not be ' 'parsed.'.format(repr(seq))) url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml' else: url = ('http://pfam.xfam.org/protein/' + idcode + '?output=xml') else: url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml' LOGGER.debug('Retrieving Pfam search results: ' + url) xml = None while LOGGER.timing('_pfam') < timeout: try: xml = openURL(url, timeout=timeout).read() except Exception: pass else: if xml: break if not xml: raise IOError('Pfam search timed out or failed to parse results ' 'XML, check URL: ' + url) else: LOGGER.report('Pfam search completed in %.2fs.', '_pfam') if xml.find(b'There was a system error on your last request.') > 0: LOGGER.warn('No Pfam matches found for: ' + seq) return None try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) if len(seq) >= MINSEQLEN: try: xml_matches = root[0][0][0][0] except IndexError: raise ValueError('failed to parse results XML, check URL: ' + url) else: results = dictElement(root[0], prefix) try: xml_matches = results['matches'] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) matches = dict() for child in xml_matches: try: accession = child.attrib['accession'][:7] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) if not re.search('^P(F|B)[0-9]{5}$', accession): raise ValueError('{0} does not match pfam accession' ' format'.format(accession)) match = matches.setdefault(accession, dict(child.items())) locations = match.setdefault('locations', []) for loc in child: locations.append(dict(loc.items())) if len(seq) < MINSEQLEN: query = 'Query ' + repr(query) else: query = 'Query sequence' if matches: LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches))) else: LOGGER.info(query + ' did not match any Pfam families.') return matches
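The two branches above can be exercised as follows; a sketch assuming network access, where a PDB identifier with chain routes through the header/UniProt branch and a sequence of at least MINSEQLEN residues goes to the hmmscan endpoint directly:

# PDB identifier with chain: UniProt ID is resolved from the PDB header first
matches = searchPfam('1mkpA')
# raw ungapped sequence of >= MINSEQLEN residues: submitted to hmmscan directly
matches = searchPfam('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI')
print(sorted(matches or {}))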
def fetchPDBviaHTTP(*pdb, **kwargs): """Retrieve PDB file(s) for specified *pdb* identifier(s) and return path(s). Downloaded files will be stored in local PDB folder, if one is set using :meth:`.pathPDBFolder`, and copied into *folder*, if specified by the user. If no destination folder is specified, files will be saved in the current working directory. If *compressed* is **False**, decompressed files will be copied into *folder*.""" if kwargs.get('check', True): identifiers = checkIdentifiers(*pdb) else: identifiers = list(pdb) output_folder = kwargs.pop('folder', None) compressed = bool(kwargs.pop('compressed', True)) extension = '.pdb' local_folder = pathPDBFolder() if local_folder: local_folder, is_divided = local_folder if is_divided: getPath = lambda pdb: join(makePath(join(local_folder, pdb[1:3])), 'pdb' + pdb + '.pdb.gz') else: getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz') if output_folder is None: second = lambda filename, pdb: filename else: if compressed: second = lambda filename, pdb: (copyFile(filename, join(output_folder, pdb + extension + '.gz'))) else: second = lambda filename, pdb: gunzip(filename, join(output_folder, pdb + extension)) else: if output_folder is None: output_folder = getcwd() if compressed: getPath = lambda pdb: join(output_folder, pdb + extension + '.gz') second = lambda filename, pdb: filename else: getPath = lambda pdb: join(output_folder, pdb + extension) second = lambda filename, pdb: gunzip(getPath(pdb), getPath(pdb)) getURL = WWPDB_HTTP_URL[wwPDBServer() or 'us'] success = 0 failure = 0 filenames = [] for pdb in identifiers: if pdb is None: filenames.append(None) continue try: handle = openURL(getURL(pdb)) except Exception as err: LOGGER.warn('{0} download failed ({1}).'.format(pdb, str(err))) failure += 1 filenames.append(None) else: data = handle.read() if len(data): filename = getPath(pdb) with open(filename, 'w+b') as pdbfile: pdbfile.write(data) filename = normpath(relpath(second(filename, pdb))) LOGGER.debug('{0} downloaded ({1})' .format(pdb, sympath(filename))) success += 1 filenames.append(filename) else: LOGGER.warn('{0} download failed, reason unknown.' .format(pdb)) failure += 1 filenames.append(None) LOGGER.debug('PDB download via HTTP completed ({0} downloaded, ' '{1} failed).'.format(success, failure)) if len(identifiers) == 1: return filenames[0] else: return filenames
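Usage sketch for the downloader above; note that a single-ID call returns one path while a multi-ID call returns a list with None entries for failures, mirroring the len(identifiers) == 1 check at the end ('1ubi' and '2k39' are arbitrary real PDB IDs):

path = fetchPDBviaHTTP('1ubi', compressed=False)        # single ID -> one path, or None
paths = fetchPDBviaHTTP('1ubi', '2k39', folder='pdbs')  # several IDs -> list of paths/None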
def searchPfam(query, **kwargs): """Returns Pfam search results in a dictionary. Matching Pfam accession as keys will map to evalue, alignment start and end residue positions. :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence file. Sequence queries must not contain without gaps and must be at least 16 characters long :type query: str :arg timeout: timeout for blocking connection attempt in seconds, default is 60 :type timeout: int *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with chain identifier. UniProt ID of the specified chain, or the first protein chain will be used for searching the Pfam database.""" if isfile(query): from prody.sequence import MSAFile try: seq = next(MSAFile(query)) except: with openFile(query) as inp: seq = ''.join(inp.read().split()) else: seq = seq[0][1] if not seq.isalpha(): raise ValueError('could not parse a sequence without gaps from ' + query) else: seq = ''.join(query.split()) import xml.etree.cElementTree as ET LOGGER.timeit('_pfam') timeout = int(kwargs.get('timeout', 60)) if len(seq) >= MINSEQLEN: if not seq.isalpha(): raise ValueError(repr(seq) + ' is not a valid sequence') fseq = '>Seq\n' + seq parameters = { 'hmmdb' : 'pfam', 'seq': fseq } enc_params = urllib.urlencode(parameters).encode('utf-8') request = urllib2.Request('https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params) results_url = urllib2.urlopen(request).geturl() #res_params = { 'output' : 'xml' } res_params = { 'format' : 'tsv' } enc_res_params = urllib.urlencode(res_params) #modified_res_url = results_url + '?' + enc_res_params modified_res_url = results_url.replace('results','download') + '?' + enc_res_params result_request = urllib2.Request(modified_res_url) # url = ( urllib2.urlopen(request).geturl() + '?output=xml') LOGGER.debug('Submitted Pfam search for sequence "{0}...".' 
.format(seq[:MINSEQLEN])) tsv = urllib2.urlopen(result_request).read() matches = {} lines = tsv.split('\n') keys = lines[0].split('\t') root = {} for i, line in enumerate(lines[1:-1]): root[i] = {} for j, key in enumerate(keys): root[i][key] = line.split('\t')[j] for child in root.values(): accession = child['Family Accession'] pfam_id = accession.split('.')[0] matches[pfam_id] = {} matches[pfam_id]['accession'] = accession matches[pfam_id]['class'] = 'Domain' matches[pfam_id]['id'] = child['Family id'] matches[pfam_id]['locations'] = {} matches[pfam_id]['locations']['ali_end'] = child['Ali. End'] matches[pfam_id]['locations']['ali_start'] = child['Ali. Start'] matches[pfam_id]['locations']['bitscore'] = child['Bit Score'] matches[pfam_id]['locations']['end'] = child['Env. End'] matches[pfam_id]['locations']['cond_evalue'] = child['Cond. E-value'] matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value'] matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0' matches[pfam_id]['locations']['hmm_end'] = child['Model End'] matches[pfam_id]['locations']['hmm_start'] = child['Model Start'] matches[pfam_id]['locations']['start'] = child['Env. Start'] matches[pfam_id]['type'] = 'Pfam-A' return matches else: if len(seq) <= 5: idcode = None from prody import parsePDBHeader try: polymers = parsePDBHeader(seq[:4], 'polymers') except Exception as err: LOGGER.warn('failed to parse header for {0} ({1})' .format(seq[:4], str(err))) else: chid = seq[4:].upper() for poly in polymers: if chid and poly.chid != chid: continue for dbref in poly.dbrefs: if dbref.database != 'UniProt': continue idcode = dbref.idcode accession = dbref.accession LOGGER.info('UniProt ID code {0} for {1} chain ' '{2} will be used.'
.format(idcode, seq[:4], poly.chid)) break if idcode is not None: break if idcode is None: LOGGER.warn('A UniProt ID code for PDB {0} could not be ' 'parsed.'.format(repr(seq))) url = prefix + 'protein/' + seq + '?output=xml' else: url = prefix + 'protein/' + idcode + '?output=xml' else: url = prefix + 'protein/' + seq + '?output=xml' LOGGER.debug('Retrieving Pfam search results: ' + url) xml = None while LOGGER.timing('_pfam') < timeout: try: xml = openURL(url, timeout=timeout).read() except Exception: pass else: if xml not in ['PEND','RUN']: break if not xml: raise IOError('Pfam search timed out or failed to parse results ' 'XML, check URL: ' + url) else: LOGGER.report('Pfam search completed in %.2fs.', '_pfam') if xml.find(b'There was a system error on your last request.') > 0: LOGGER.warn('No Pfam matches found for: ' + seq) return None elif xml.find(b'No valid UniProt accession or ID') > 0: try: url = prefix + 'protein/' + accession + '?output=xml' xml = openURL(url, timeout=timeout).read() except: try: ag = parsePDB(seq, subset='ca') ag_seq = ag.getSequence() return searchPfam(ag_seq) except: raise ValueError('No valid UniProt accession or ID for: ' + seq) try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) if len(seq) >= MINSEQLEN: try: xml_matches = root[0][0][0][0] except IndexError: raise ValueError('failed to parse results XML, check URL: ' + url) else: key = '{' + prefix + '}' results = dictElement(root[0], key) try: xml_matches = results['matches'] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) matches = dict() for child in xml_matches: try: accession = child.attrib['accession'][:7] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) if not re.search('^P(F|B)[0-9]{5}$', accession): raise ValueError('{0} does not match pfam accession' ' format'.format(accession)) match = matches.setdefault(accession, dict(child.items())) locations = match.setdefault('locations', []) for loc in child: locations.append(dict(loc.items())) if len(seq) < MINSEQLEN: query = 'Query ' + repr(query) else: query = 'Query sequence' if matches: LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches))) else: LOGGER.info(query + ' did not match any Pfam families.') return matches
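The header/row splitting above can also be written with the standard library's csv module. A behavior-equivalent Python 3 sketch (not this module's actual code) that avoids re-splitting each line once per column:

import csv
import io

def _parse_hmmer_tsv(tsv):
    # equivalent to the manual split('\t') loops above: one dict per data row,
    # keyed by the header names ('Family Accession', 'Ali. Start', ...)
    reader = csv.DictReader(io.StringIO(tsv), delimiter='\t')
    return {i: row for i, row in enumerate(reader)}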
def searchPfam(query, search_b=False, skip_a=False, **kwargs): """Return Pfam search results in a dictionary. Matching Pfam accession as keys will map to evalue, alignment start and end residue positions. :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence file; sequence queries must not contain gaps and must be at least 16 characters long :type query: str :arg search_b: search Pfam-B families when **True** :type search_b: bool :arg skip_a: do not search Pfam-A families when **True** :type skip_a: bool :arg ga: use gathering threshold when **True** :type ga: bool :arg evalue: user specified e-value cutoff, must be smaller than 10.0 :type evalue: float :arg timeout: timeout for blocking connection attempt in seconds, default is 60 :type timeout: int *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with chain identifier. UniProt ID of the specified chain, or the first protein chain will be used for searching the Pfam database.""" prefix = '{http://pfam.sanger.ac.uk/}' query = str(query) if isfile(query): from prody.sequence import MSAFile try: seq = next(MSAFile(query)) except: with openFile(query) as inp: seq = ''.join(inp.read().split()) else: seq = seq[0][1] if not seq.isalpha(): raise ValueError('could not parse a sequence without gaps from ' + query) else: seq = ''.join(query.split()) import xml.etree.cElementTree as ET LOGGER.timeit('_pfam') timeout = int(kwargs.get('timeout', 60)) if len(seq) >= MINSEQLEN: if not seq.isalpha(): raise ValueError(repr(seq) + ' is not a valid sequence') urlextension = '' if kwargs: ga = int(kwargs.get('ga', 1)) if not (ga == 1 or ga == 0): raise ValueError('ga must be either 0 or 1') evalue = kwargs.get('evalue', None) if evalue: if not float(evalue) <= 10.0: raise ValueError('evalue must be a valid float < 10.0') urlextension = urlextension + '&evalue=' + str(evalue) else: urlextension = urlextension + '&ga=' + str(ga) search_b = int(bool(search_b)) skip_a = int(bool(skip_a)) if skip_a == 1: search_b = 1 urlextension = urlextension + '&searchBs=' + str(search_b) urlextension = urlextension + '&skipAs=' + str(skip_a) url = ('http://pfam.sanger.ac.uk/search/sequence?seq=' + str(seq) + urlextension + '&output=xml') LOGGER.debug('Submitted Pfam search for sequence "{0}...".' .format(seq[:MINSEQLEN])) xml = openURL(url, timeout=timeout).read() try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) try: url = dictElement(root[0], prefix)['result_url'] except (IndexError, KeyError): raise ValueError('failed to parse results XML, check URL: ' + url) else: if len(seq) <= 5: idcode = None from prody import parsePDBHeader try: polymers = parsePDBHeader(seq[:4], 'polymers') except Exception as err: LOGGER.warn('failed to parse header for {0} ({1})' .format(seq[:4], str(err))) else: chid = seq[4:].upper() for poly in polymers: if chid and poly.chid != chid: continue for dbref in poly.dbrefs: if dbref.database != 'UniProt': continue idcode = dbref.idcode LOGGER.info('UniProt ID code {0} for {1} chain ' '{2} will be used.'
.format(idcode, seq[:4], poly.chid)) break if idcode is not None: break if idcode is None: LOGGER.warn('A UniProt ID code for PDB {0} could not be ' 'parsed.'.format(repr(seq))) url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml' else: url = ('http://pfam.sanger.ac.uk/protein/' + idcode + '?output=xml') else: url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml' LOGGER.debug('Retrieving Pfam search results: ' + url) xml = None while LOGGER.timing('_pfam') < timeout: try: xml = openURL(url, timeout=timeout).read() except Exception: pass else: if xml: break if not xml: raise IOError('Pfam search timed out or failed to parse results ' 'XML, check URL: ' + url) else: LOGGER.report('Pfam search completed in %.2fs.', '_pfam') if xml.find(b'There was a system error on your last request.') > 0: LOGGER.warn('No Pfam matches found for: ' + seq) return None try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) if len(seq) >= MINSEQLEN: try: xml_matches = root[0][0][0][0] except IndexError: raise ValueError('failed to parse results XML, check URL: ' + url) else: results = dictElement(root[0], prefix) try: xml_matches = results['matches'] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) matches = dict() for child in xml_matches: try: accession = child.attrib['accession'][:7] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) if not re.search('^P(F|B)[0-9]{5}$', accession): raise ValueError('{0} does not match pfam accession' ' format'.format(accession)) match = matches.setdefault(accession, dict(child.items())) locations = match.setdefault('locations', []) for loc in child: locations.append(dict(loc.items())) if len(seq) < MINSEQLEN: query = 'Query ' + repr(query) else: query = 'Query sequence' if matches: LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches))) else: LOGGER.info(query + ' did not match any Pfam families.') return matches
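Hedged examples of the keyword switches parsed above; as the parsing shows, an explicit evalue takes precedence over ga, and skip_a forces search_b on (the sequence is the 'runexample' sequence used elsewhere in this file):

seq = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
       'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
       'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')
matches = searchPfam(seq, evalue=1e-5)    # explicit e-value cutoff, below the 10.0 cap
matches = searchPfam(seq, ga=1)           # curated gathering threshold instead
matches = searchPfam(seq, search_b=True)  # also search Pfam-B families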
def psiBlastCycle(sequence=None, filename=None, **kwargs): """Returns a :class:`PDBBlastRecord` instance that contains results from a single cycle of EBI psiblast. :arg sequence: an object with an associated sequence string or a sequence string itself :type sequence: :class:`Atomic`, :class:`Sequence`, or str :arg filename: a *filename* to save the results in XML format :type filename: str The following search parameters can be adjusted by the user. We use the same default values as http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/ wherever applicable. :arg email: email address for reporting problems default is [email protected] :type email: str with an @ before a . :arg matrix: The comparison matrix to be used to score alignments when searching the database possible values are 'BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30' and 'PAM70' default is 'BLOSUM62' :type matrix: str :arg gapopen: Penalty taken away from the score when a gap is created in sequence alignments. Increasing the gap opening penalty will decrease the number of gaps in the final alignment. Possible values range from 8 to 16 inclusive, default is 11 :type gapopen: int :arg gapext: Penalty taken away from the score for each base or residue in the gap. Increasing the gap extension penalty favors short gaps in the final alignment, conversely decreasing the gap extension penalty favors long gaps in the final alignment. Possible values range from 0 to 3, default is 1 :type gapext: int :arg expthr: Expectation threshold that limits the number of scores and alignments reported. This is the maximum number of times the match is expected to occur by chance. Possible values are 1.0e-200, 1.0e-100, 1.0e-50, 1.0e-10, 1.0e-5, 1.0e-4, 1.0e-3, 1.0e-2, 0.1, 1.0, 10.0, 100, 1000 default is 10.0 :type expthr: float :arg psithr: Expectation value threshold for automatic selection of matched sequences for inclusion in the PSSM at each iteration. Possible values are 1.0e-6, 1.0e-5, 1.0e-4, 2.0e-4, 5.0e-4, 1.0e-3, 2.0e-3, 5.0e-3, 1.0e-2, 2.0e-2, 0.1, 0.3, 0.5, 1.0, 3.0, 10.0 default is 1.0e-3 :type psithr: float :arg scores: Maximum number of match score summaries reported in the result output. Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000 Default is 500 :type scores: int :arg alignments: Maximum number of match alignments reported in the result output. Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000 Default is 500 :type alignments: int :arg dropoff: The amount a score can drop before extension of word hits is halted Possible values are 0, 2, 4, 6, 8, 10, 15, 20, 25, or 30 Default is 15 :type dropoff: int :arg finaldropoff: Dropoff value for final gapped alignment Possible values are 10, 12, 14, 16, 18, 20, 22, 24, 25, 26, 28, or 30 Default is 25 :type finaldropoff: int :arg filter: Filter regions of low sequence complexity. This can avoid issues with low complexity sequences where matches are found due to composition rather than meaningful sequence similarity. However, in some cases filtering also masks regions of interest and so should be used with caution. Possible values are T and F, default is F :type filter: str :arg seqrange: Specify a range or section of the input sequence to use in the search. Example: Specifying '34-89' in an input sequence of total length 100, will tell BLAST to only use residues 34 to 89, inclusive. :type seqrange: str of form START-END :arg database: a database name from those available. See http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/database default is pdb :type database: str :arg previousjobid: The job identifier for the previous PSI-BLAST iteration. default is None You can change this if you want to continue from a previous run :type previousjobid: str :arg selectedHits: Name of a file containing a list of identifiers of the hits from the previous iteration to use to construct the search PSSM for this iteration. default is None :type selectedHits: str :arg cpfile: Name of a Checkpoint file from the previous iteration. default is None :type cpfile: str :arg sleep: how long to wait to reconnect for status Sleep time is multiplied by 1.5 when results are not ready. default is 2 seconds :type sleep: float :arg timeout: when to give up waiting for the results default is 120 seconds :type timeout: float :arg cycle: cycle number :type cycle: int """ cycle = kwargs.get('cycle', 0) if sequence == 'runexample': sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI' 'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN' 'DAYDIVKMKKSNISPNFNFMGQLLDFERTL') elif isinstance(sequence, Atomic): sequence = sequence.calpha.getSequence() elif isinstance(sequence, Sequence): sequence = str(sequence) elif isinstance(sequence, str): if len(sequence) in [4, 5, 6]: ag = parsePDB(sequence) sequence = ag.calpha.getSequence() sequence = ''.join(sequence.split()) elif sequence is None: if cycle == 0: cycle = 1 else: raise TypeError('sequence must be Atomic, Sequence, or str not {0}' .format(type(sequence))) if cycle == 0: query = [('sequence', sequence)] else: query = [] email = kwargs.get('email', '*****@*****.**') if not isinstance(email, str): raise TypeError('email must be a string') elif email.find('@') == -1 or email.find('.') == -1 or len(email.split('@')) != 2: raise ValueError('email must be a valid email address with at least one . and exactly one @ sign') elif not email.find('@') < email.find(email.split('.')[-1]): raise ValueError('email must be a valid email address with a . after the @ sign') query.append(('email', email)) query.append(('title', 'ProDy psiBlastPDB request')) previousjobid = kwargs.get('previousjobid', '') if previousjobid != '': query.append(('previousjobid', previousjobid)) selectedHits = kwargs.get('selectedHits', '') if selectedHits != '': query.append(('selectedHits', selectedHits)) database = kwargs.get('database', 'pdb') checkPsiBlastParameter('database', database) query.append(('database', database)) matrix = kwargs.get('matrix', 'BLOSUM62') checkPsiBlastParameter('matrix', matrix) query.append(('matrix', matrix)) gapopen = kwargs.get('gapopen', 11) checkPsiBlastParameter('gapopen', gapopen) query.append(('gapopen', gapopen)) gapext = kwargs.get('gapext', 1) checkPsiBlastParameter('gapext', gapext) query.append(('gapext', gapext)) expthr = kwargs.get('expthr', 10.) checkPsiBlastParameter('expthr', expthr) query.append(('expthr', expthr)) psithr = kwargs.get('psithr', 1.0e-3) checkPsiBlastParameter('psithr', psithr) query.append(('psithr', psithr)) scores = kwargs.get('scores', 500) checkPsiBlastParameter('scores', scores) query.append(('scores', scores)) alignments = kwargs.get('alignments', 500) checkPsiBlastParameter('alignments', alignments) query.append(('alignments', alignments)) query.append(('alignView', 0)) dropoff = kwargs.get('dropoff', 15) checkPsiBlastParameter('dropoff', dropoff) query.append(('dropoff', dropoff)) finaldropoff = kwargs.get('finaldropoff', 25) checkPsiBlastParameter('finaldropoff', finaldropoff) query.append(('finaldropoff', finaldropoff)) filter = kwargs.get('filter', 'F') checkPsiBlastParameter('filter', filter) query.append(('filter', filter)) if previousjobid == '' and selectedHits == '': seqrange = kwargs.get('seqrange', None) if seqrange is None: seqrange = '0-' + str(len(sequence)) elif not isinstance(seqrange, str): raise TypeError('seqrange should be a string') elif len(seqrange.split('-')) != 2: raise ValueError('seqrange should take the form START-END') try: start = int(seqrange.split('-')[0]) end = int(seqrange.split('-')[1]) except: raise ValueError('seqrange should be START-END with START and END being integers') query.append(('seqrange', seqrange)) headers = { 'User-Agent' : 'ProDy' } try: import urllib.parse urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8') except ImportError: from urllib import urlencode sleep = float(kwargs.pop('sleep', 2)) timeout = float(kwargs.pop('timeout', 120)) data = urlencode(query) # submit the job base_url = 'http://www.ebi.ac.uk/Tools/services/rest/psiblast/' url = base_url + 'run/' LOGGER.timeit('_prody_psi-blast') if cycle == 0: LOGGER.info('PSI-Blast searching PDB database for "{0}..."' .format(sequence[:5])) else: LOGGER.info('PSI-Blast searching PDB database, cycle={0}' .format(cycle)) handle = openURL(url, data=data, headers=headers) job_id = handle.read() if PY3K: job_id = job_id.decode() handle.close() # check the status url = base_url + 'status/' + job_id handle = openURL(url) status = handle.read() if PY3K: status = status.decode() handle.close() # keep checking the status until it's no longer running while status == 'RUNNING': LOGGER.sleep(int(sleep), 'to reconnect to EBI for status.') LOGGER.write('Connecting to EBI for status...') handle = openURL(url) status = handle.read() if PY3K: status = status.decode() LOGGER.clear() sleep = int(sleep * 1.5) if LOGGER.timing('_prody_psi-blast') > timeout: LOGGER.warn('PSI-Blast search time out.') return None LOGGER.info('The status is {0}'.format(status)) LOGGER.clear() LOGGER.report('PSI-Blast search completed in %.1fs.', '_prody_psi-blast') if cycle != 1: # get the results url = base_url + 'result/' + job_id + '/xml' handle = openURL(url) results = handle.read() handle.close() try: ext_xml = filename.lower().endswith('.xml') except AttributeError: pass else: if not ext_xml: filename += '.xml' f_out = open(filename, 'w') if PY3K: f_out.write(results.decode()) else: f_out.write(results) f_out.close() LOGGER.info('Results are saved as {0}.'.format(repr(filename))) return job_id, PsiBlastRecord(results, sequence) else: return job_id
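A sketch of chaining cycles with the function above; 'selected.txt' is a hypothetical filename. Per the code above, a normal call returns a (job_id, record) tuple, while calling with sequence=None sets cycle to 1 and returns the bare job_id:

# first cycle: submit a sequence and keep the job identifier
job_id, record = psiBlastCycle('runexample', filename='cycle1.xml')
# next cycle: seed the PSSM from the previous job's selected hits;
# sequence=None makes cycle 1, so only the new job_id comes back
job_id2 = psiBlastCycle(previousjobid=job_id, selectedHits='selected.txt')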
def fetchPfamMSA(acc, alignment="full", compressed=False, **kwargs): """Return a path to the downloaded Pfam MSA file. :arg acc: Pfam ID or Accession Code :type acc: str :arg alignment: alignment type, one of ``'full'`` (default), ``'seed'``, ``'ncbi'``, ``'metagenomics'``, ``'rp15'``, ``'rp35'``, ``'rp55'``, or ``'rp75'`` where rp stands for representative proteomes :arg compressed: gzip the downloaded MSA file, default is **False** *Alignment Options* :arg format: a Pfam supported MSA file format, one of ``'selex'``, (default), ``'stockholm'`` or ``'fasta'`` :arg order: ordering of sequences, ``'tree'`` (default) or ``'alphabetical'`` :arg inserts: letter case for inserts, ``'upper'`` (default) or ``'lower'`` :arg gaps: gap character, one of ``'dashes'`` (default), ``'dots'``, ``'mixed'`` or **None** for unaligned *Other Options* :arg timeout: timeout for blocking connection attempt in seconds, default is 60 :arg outname: out filename, default is input ``'acc_alignment.format'`` :arg folder: output folder, default is ``'.'``""" url = "http://pfam.sanger.ac.uk/family/acc?id=" + acc handle = openURL(url) orig_acc = acc acc = handle.readline().strip() if PY3K: acc = acc.decode() url_flag = False if not re.search("(?<=PF)[0-9]{5}$", acc): raise ValueError("{0} is not a valid Pfam ID or Accession Code".format(repr(orig_acc))) if alignment not in DOWNLOAD_FORMATS: raise ValueError("alignment must be one of full, seed, ncbi or" " metagenomics") if alignment == "ncbi" or alignment == "metagenomics": url = "http://pfam.sanger.ac.uk/family/" + acc + "/alignment/" + alignment + "/gzipped" url_flag = True extension = ".sth" else: if not kwargs: url = "http://pfam.sanger.ac.uk/family/" + acc + "/alignment/" + alignment + "/gzipped" url_flag = True extension = ".sth" else: align_format = kwargs.get("format", "selex").lower() if align_format not in FORMAT_OPTIONS["format"]: raise ValueError("alignment format must be of type selex" " stockholm or fasta. MSF not supported") if align_format == SELEX: align_format, extension = "pfam", ".slx" elif align_format == FASTA: extension = ".fasta" else: extension = ".sth" gaps = str(kwargs.get("gaps", "dashes")).lower() if gaps not in FORMAT_OPTIONS["gaps"]: raise ValueError("gaps must be of type mixed, dots, dashes, " "or None") inserts = kwargs.get("inserts", "upper").lower() if inserts not in FORMAT_OPTIONS["inserts"]: raise ValueError("inserts must be of type lower or upper") order = kwargs.get("order", "tree").lower() if order not in FORMAT_OPTIONS["order"]: raise ValueError("order must be of type tree or alphabetical") url = ( "http://pfam.sanger.ac.uk/family/" + acc + "/alignment/" + alignment + "/format?format=" + align_format + "&alnType=" + alignment + "&order=" + order[0] + "&case=" + inserts[0] + "&gaps=" + gaps + "&download=1" ) response = openURL(url, timeout=int(kwargs.get("timeout", 60))) outname = kwargs.get("outname", None) if not outname: outname = orig_acc folder = str(kwargs.get("folder", ".")) filepath = join(makePath(folder), outname + "_" + alignment + extension) if compressed: filepath = filepath + ".gz" if url_flag: f_out = open(filepath, "wb") else: f_out = openFile(filepath, "wb") f_out.write(response.read()) f_out.close() else: if url_flag: gunzip(response.read(), filepath) else: with open(filepath, "wb") as f_out: f_out.write(response.read()) filepath = relpath(filepath) LOGGER.info("Pfam MSA for {0} is written as {1}.".format(orig_acc, filepath)) return filepath
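Usage sketches for the fetcher above; 'PF00074' is an arbitrary real Pfam accession used for illustration, and the keyword values follow the FORMAT_OPTIONS parsing shown in the code:

filepath = fetchPfamMSA('PF00074')  # full alignment, Selex format, saved in '.'
filepath = fetchPfamMSA('PF00074', 'seed', compressed=True,
                        format='fasta', outname='rnaseA', folder='msa')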
def fetch(self, xml=None, sequence=None, **kwargs): """Get Blast record from url or file. :arg sequence: an object with an associated sequence string or a sequence string itself :type sequence: :class:`Atomic`, :class:`Sequence`, or str :arg xml: blast search results in XML format or an XML file that contains the results or a filename for saving the results or None :type xml: str :arg timeout: amount of time until the query times out in seconds default value is 120 :type timeout: int """ if self.isSuccess: LOGGER.warn( "The record already exists so no further search is performed") return True if sequence is None: sequence = self._sequence if xml is None: xml = self._xml import xml.etree.cElementTree as ET if xml is not None and len(xml) < 100: if os.path.isfile(xml): xml = ET.parse(xml) root = xml.getroot() else: raise ValueError('xml is not a filename and does not look like' ' a valid XML string') else: headers = {'User-agent': 'ProDy'} query = [ ('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'), ('PROGRAM', 'blastp'), ] expect = float(kwargs.pop('expect', 10e-10)) if expect <= 0: raise ValueError('expect must be a positive number') query.append(('EXPECT', expect)) hitlist_size = int(kwargs.pop('hitlist_size', 250)) if hitlist_size <= 0: raise ValueError('hitlist_size must be a positive integer') query.append(('HITLIST_SIZE', hitlist_size)) query.append(('QUERY', sequence)) query.append(('CMD', 'Put')) sleep = float(kwargs.pop('sleep', 2)) timeout = float(kwargs.pop('timeout', self._timeout)) self._timeout = timeout try: import urllib.parse urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8') except ImportError: from urllib import urlencode url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi' data = urlencode(query) LOGGER.timeit('_prody_blast') LOGGER.info( 'Blast searching NCBI PDB database for "{0}..."'.format( sequence[:5])) handle = openURL(url, data=data, headers=headers) html = handle.read() index = html.find(b'RID =') if index == -1: raise Exception('NCBI did not return expected response.') else: last = html.find(b'\n', index) rid = html[index + len('RID ='):last].strip() query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500), ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')] data = urlencode(query) while True: LOGGER.sleep(int(sleep), 'to reconnect to NCBI for search results.') LOGGER.write('Connecting to NCBI for search results...') handle = openURL(url, data=data, headers=headers) results = handle.read() index = results.find(b'Status=') LOGGER.clear() if index < 0: break last = results.index(b'\n', index) status = results[index + len('Status='):last].strip() if status.upper() == b'READY': break sleep = int(sleep * 1.5) if LOGGER.timing('_prody_blast') > timeout: LOGGER.warn('Blast search time out.') return False LOGGER.clear() LOGGER.report('Blast search completed in %.1fs.', '_prody_blast') filename = xml root = ET.XML(results) try: ext_xml = filename.lower().endswith('.xml') except AttributeError: pass else: if not ext_xml: filename += '.xml' out = open(filename, 'w') if PY3K: out.write(results.decode()) else: out.write(results) out.close() LOGGER.info('Results are saved as {0}.'.format(repr(filename))) root = dictElement(root, 'BlastOutput_') if root['db'] != 'pdb': raise ValueError('blast search database in xml must be "pdb"') if root['program'] != 'blastp': raise ValueError( 'blast search program in xml must be "blastp"') self._param = dictElement(root['param'][0], 'Parameters_') query_len = int(root['query-len']) if sequence and len(sequence) != query_len: raise
ValueError( 'query-len and the length of the sequence do not ' 'match, xml data may not be for given sequence') hits = [] for iteration in root['iterations']: for hit in dictElement(iteration, 'Iteration_')['hits']: hit = dictElement(hit, 'Hit_') data = dictElement(hit['hsps'][0], 'Hsp_') for key in [ 'align-len', 'gaps', 'hit-frame', 'hit-from', 'hit-to', 'identity', 'positive', 'query-frame', 'query-from', 'query-to' ]: data[key] = int(data[key]) data['query-len'] = query_len for key in ['evalue', 'bit-score', 'score']: data[key] = float(data[key]) p_identity = 100.0 * data['identity'] / ( data['query-to'] - data['query-from'] + 1) data['percent_identity'] = p_identity p_overlap = (100.0 * (data['align-len'] - data['gaps']) / query_len) data['percent_coverage'] = p_overlap for item in (hit['id'] + hit['def']).split('>gi'): head, title = item.split(None, 1) head = head.split('|') pdb_id = head[-2].lower() chain_id = head[-1][:1] pdbch = dict(data) pdbch['pdb_id'] = pdb_id pdbch['chain_id'] = chain_id pdbch['title'] = (head[-1][1:] + title).strip() hits.append((p_identity, p_overlap, pdbch)) hits.sort(key=lambda hit: hit[0], reverse=True) self._hits = hits return True
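Each entry appended above is a (percent_identity, percent_coverage, info) tuple, and the list is sorted by identity in descending order. A small standalone helper sketch for filtering such a list (hypothetical, not part of the class):

def filterHits(hits, min_identity=90.0, min_coverage=70.0):
    # 'hits' has the shape built in fetch() above:
    # (percent_identity, percent_coverage, info_dict)
    return [info for p_id, p_cov, info in hits
            if p_id >= min_identity and p_cov >= min_coverage]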
def blastPDBUniProtKB(sequence, filename=None, **kwargs): """Returns a :class:`PDBBlastRecord` instance that contains results from blast searching of ProteinDataBank database *sequence* using NCBI blastp. :arg sequence: single-letter code amino acid sequence of the protein without any gap characters, all white spaces will be removed :type sequence: str :arg filename: a *filename* to save the results in XML format :type filename: str *hitlist_size* (default is ``250``) and *expect* (default is ``1e-4``) search parameters can be adjusted by the user. *sleep* keyword argument (default is ``2`` seconds) determines how long to wait to reconnect for results. Sleep time is multiplied by 1.5 when results are not ready. *timeout* (default is 120 s) determines when to give up waiting for the results. *num_sequences* (default is ``1``) """ num_sequences = int(kwargs.pop('num_sequences', 1)) if sequence == 'runexample': sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI' 'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN' 'DAYDIVKMKKSNISPNFNFMGQLLDFERTL') else: if num_sequences == 1: try: sequence = ''.join(sequence.split()) _ = sequence.isalpha() except AttributeError: raise TypeError('sequence must be a string') else: if not _: raise ValueError('not a valid protein sequence') headers = {'User-agent': 'ProDy'} query = [('DATABASE', 'swissprot'), ('ENTREZ_QUERY', '(none)'), ('PROGRAM', 'blastp'),] expect = float(kwargs.pop('expect', 1e-4)) if expect <= 0: raise ValueError('expect must be a positive number') query.append(('EXPECT', expect)) hitlist_size = int(kwargs.pop('hitlist_size', 250)) if hitlist_size <= 0: raise ValueError('hitlist_size must be a positive integer') psiblast = 'true' step_number = 3 query.append(('RUN_PSIBLAST', psiblast)) query.append(('HITLIST_SIZE', hitlist_size)) query.append(('QUERY', sequence)) query.append(('CMD', 'Put')) query.append(('STEP_NUMBER', step_number)) sleep = float(kwargs.pop('sleep', 2)) timeout = float(kwargs.pop('timeout', 120)) if kwargs: LOGGER.warn('Keyword argument(s) {0} are not used.'
.format(', '.join([repr(key) for key in kwargs]))) try: import urllib.parse urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8') except ImportError: from urllib import urlencode url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi' data = urlencode(query) LOGGER.timeit('_prody_blast') LOGGER.info('Blast searching NCBI PDB database for "{0}..."' .format(sequence[:5])) handle = openURL(url, data=data, headers=headers) html = handle.read() index = html.find(b'name="RID" type="hidden" value="') if index == -1: raise Exception('NCBI did not return expected response.') else: last = html.find(b'>',index) rid = html[index + len('name="RID" type="hidden" value="'):last-1].strip() index = html.find(b'name="RTOE" type="hidden" value="') if index == -1: rtoe = None # This is not used else: last = html.find(b'>', index) rtoe = html[index + len('name="RTOE" type="hidden" value="'):last-1].strip() query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500), ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')] data = urlencode(query) while True: LOGGER.sleep(int(sleep), 'to reconnect NCBI for search results.') LOGGER.write('Connecting to NCBI for search results...') handle = openURL(url, data=data, headers=headers) results = handle.read() index = results.find(b'Status=') LOGGER.clear() if index < 0: break last = results.index(b'\n', index) status = results[index+len('Status='):last].strip() if status.upper() == b'READY': break sleep = int(sleep * 1.5) if LOGGER.timing('_prody_blast') > timeout: LOGGER.warn('Blast search time out.') return None LOGGER.clear() LOGGER.report('Blast search completed in %.1fs.', '_prody_blast') try: ext_xml = filename.lower().endswith('.xml') except AttributeError: pass else: if not ext_xml: filename += '.xml' out = open(filename, 'w') if PY3K: out.write(results.decode()) else: out.write(results) out.close() LOGGER.info('Results are saved as {0}.'.format(repr(filename))) return SwissProtBlastRecord(results, sequence)
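Finally, a usage sketch for the UniProtKB variant above; SwissProtBlastRecord is the record class it returns, defined elsewhere in this codebase:

record = blastPDBUniProtKB('runexample', filename='swissprot_hits.xml',
                           hitlist_size=100, expect=1e-5)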