def psiBlastCycle(sequence=None, filename=None, **kwargs): """Returns a :class:`PDBBlastRecord` instance that contains results from a single cycle of EBI psiblast. :arg sequence: an object with an associated sequence string or a sequence string itself :type sequence: :class:`Atomic`, :class:`Sequence`, or str :arg filename: a *filename* to save the results in XML format :type filename: str The following search parameters can be adjusted by the user. We use the same default values as http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/ wherever applicable. :arg email: email address for reporting problems default is [email protected] :type email: str with an @ before a . :arg matrix: The comparison matrix to be used to score alignments when searching the database possible values are 'BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30' and 'PAM70' default is 'BLOSUM62' :type matrix: str :arg gapopen: Penalty taken away from the score when a gap is created in sequence alignments. Increasing the gap opening penalty will decrease the number of gaps in the final alignment. Possible values range from 8 to 16 inclusive, default is 11 :type gapopen: int :arg gapext: Penalty taken away from the score for each base or residue in the gap. Increasing the gap extension penalty favors short gaps in the final alignment, conversly decreasing the gap extension penalty favors long gaps in the final alignment. Possible values range from 0 to 3, default is 1 :type gapext: int :arg expthr: Expectation threshold that limits the number of scores and alignments reported. This is the maximum number of times the match is expected to occur by chance. Possible values are 1.0e-200, 1.0e-100, 1.0e-50, 1.0e-10, 1.0e-5, 1.0e-4, 1.0e-3, 1.0e-2, 0.1, 1.0, 10.0, 100, 1000 default is 10.0 :type expthr: float :arg psithr: Expectation value threshold for automatic selection of matched sequences for inclusion in the PSSM at each iteration. Possible values are 1.0e-6, 1.0e-5, 1.0e-4, 2.0e-4, 5.0e-4, 1.0e-3, 2.0e-3, 5.0e-3, 1.0e-2, 2.0e-2, 0.1, 0.3, 0.5, 1.0, 3.0, 10.0 default is 1.0e-3 :type psithr: float :arg scores: Maximum number of match score summaries reported in the result output. Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000 Default is 500 :type scores: int :arg alignments: Maximum number of match alignments reported in the result output. Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000 Default is 500 :type alignmets: int :arg dropoff: The amount a score can drop before extension of word hits is halted Possible values are 0, 2, 4, 6, 8, 10, 15, 20, 25, or 30 Default is 15 :type dropoff: int :arg finaldropoff: Dropoff value for final gapped alignment Possible values are 10, 12, 14, 16, 18, 20, 22, 24, 25, 26, 28, or 30 Default is 25 :type finaldropoff: int :arg filter: Filter regions of low sequence complexity. This can avoid issues with low complexity sequences where matches are found due to composition rather than meaningful sequence similarity. However, in some cases filtering also masks regions of interest and so should be used with caution. Possible values are T and F, default is F :type filter: str :arg seqrange: Specify a range or section of the input sequence to use in the search. Example: Specifying '34-89' in an input sequence of total length 100, will tell BLAST to only use residues 34 to 89, inclusive. :type seqrange: str of form START-END :arg database: a database name from those available. 
See http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/database default is pdb :type database: str :arg previousjobid: The job identifier for the previous PSI-BLAST iteration. default is None You can change this if you want to continue from a previous run :type previousjobid: str :arg selectedHits: Name of a file containing a list of identifiers of the hits from the previous iteration to use to construct the search PSSM for this iteration. default is None :type selectedHits: str :arg cpfile: Name of a Checkpoint file from the previous iteration. default is None :type cpfile: str :arg sleep: how long to wait to reconnect for status Sleep time is multiplied by 1.5 when results are not ready. default is 2 seconds :type sleep: float :arg timeout: when to give up waiting for the results default is 120 seconds :type timeout: float :arg cycle: cycle number :type cycle: int """ cycle = kwargs.get('cycle', 0) if sequence == 'runexample': sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI' 'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN' 'DAYDIVKMKKSNISPNFNFMGQLLDFERTL') elif isinstance(sequence, Atomic): sequence = sequence.calpha.getSequence() elif isinstance(sequence, Sequence): sequence = str(sequence) elif isinstance(sequence, str): if len(sequence) in [4, 5, 6]: ag = parsePDB(sequence) sequence = ag.calpha.getSequence() sequence = ''.join(sequence.split()) elif sequence is None: if cycle == 0: cycle = 1 else: raise TypeError( 'sequence must be Atomic, Sequence, or str not {0}'.format( type(sequence))) if cycle == 0: query = [('sequence', sequence)] else: query = [] email = kwargs.get('email', '*****@*****.**') if not isinstance(email, str): raise TypeError('email must be a string') elif email.find('@') == -1 or email.find('.') == -1 or len( email.split('@')) != 2: raise ValueError( 'email must be a valid email address with at least one . and exactly one @ sign' ) elif not email.find('@') < email.find(email.split('.')[-1]): raise ValueError( 'email must be a valid email address with a . after the @ sign') query.append(('email', email)) query.append(('title', 'ProDy psiBlastPDB request')) previousjobid = kwargs.get('previousjobid', '') if previousjobid != '': query.append(('previousjobid', previousjobid)) selectedHits = kwargs.get('selectedHits', '') if selectedHits != '': query.append(('selectedHits', selectedHits)) database = kwargs.get('database', 'pdb') checkPsiBlastParameter('database', database) query.append(('database', database)) matrix = kwargs.get('matrix', 'BLOSUM62') checkPsiBlastParameter('matrix', matrix) query.append(('matrix', matrix)) gapopen = kwargs.get('gapopen', 11) checkPsiBlastParameter('gapopen', gapopen) query.append(('gapopen', gapopen)) gapext = kwargs.get('gapext', 1) checkPsiBlastParameter('gapext', gapext) query.append(('gapext', gapext)) expthr = kwargs.get('expthr', 10.) 
checkPsiBlastParameter('expthr', expthr) query.append(('expthr', expthr)) psithr = kwargs.get('psithr', 1.0e-3) checkPsiBlastParameter('psithr', psithr) query.append(('psithr', psithr)) scores = kwargs.get('scores', 500) checkPsiBlastParameter('scores', scores) query.append(('scores', scores)) alignments = kwargs.get('alignments', 500) checkPsiBlastParameter('alignments', alignments) query.append(('alignments', alignments)) query.append(('alignView', 0)) dropoff = kwargs.get('dropoff', 15) checkPsiBlastParameter('dropoff', dropoff) query.append(('dropoff', dropoff)) finaldropoff = kwargs.get('finaldropoff', 25) checkPsiBlastParameter('finaldropoff', finaldropoff) query.append(('finaldropoff', finaldropoff)) filter = kwargs.get('filter', 'no') checkPsiBlastParameter('filter', filter) query.append(('filter', filter)) if previousjobid == '' and selectedHits == '': seqrange = kwargs.get('seqrange', None) if seqrange is None: seqrange = '0-' + str(len(sequence)) elif not isinstance(seqrange, str): raise TypeError('seqrange should be a string') elif len(seqrange.split('-')) != 2: raise ValueError('seqrange should take the form START-END') try: start = int(seqrange.split('-')[0]) end = int(seqrange.split('-')[1]) except: raise ValueError( 'seqrange should be START-END with START and END being integers' ) query.append(('seqrange', seqrange)) headers = {'User-Agent': 'ProDy'} try: import urllib.parse urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8') except ImportError: from urllib import urlencode sleep = float(kwargs.pop('sleep', 2)) timeout = float(kwargs.pop('timeout', 120)) data = urlencode(query) # submit the job base_url = 'http://www.ebi.ac.uk/Tools/services/rest/psiblast/' url = base_url + 'run/' LOGGER.timeit('_prody_psi-blast') if cycle == 0: LOGGER.info('PSI-Blast searching PDB database for "{0}..."'.format( sequence[:5])) else: LOGGER.info( 'PSI-Blast searching PDB database, cycle={0}'.format(cycle)) handle = openURL(url, data=data, headers=headers) job_id = handle.read() if PY3K: job_id = job_id.decode() handle.close() # check the status url = base_url + 'status/' + job_id handle = openURL(url) status = handle.read() if PY3K: status = status.decode() handle.close() # keep checking the status until it's no longer running while status == 'RUNNING': LOGGER.sleep(int(sleep), 'to reconnect to EBI for status.') LOGGER.write('Connecting to EBI for status...') handle = openURL(url) status = handle.read() if PY3K: status = status.decode() LOGGER.clear() sleep = int(sleep * 1.5) if LOGGER.timing('_prody_psi-blast') > timeout: LOGGER.warn('PSI-Blast search time out.') return None LOGGER.info('The status is {0}'.format(status)) LOGGER.clear() LOGGER.report('PSI-Blast search completed in %.1fs.', '_prody_psi-blast') if cycle != 1: # get the results url = base_url + 'result/' + job_id + '/xml' handle = openURL(url) results = handle.read() handle.close() try: ext_xml = filename.lower().endswith('.xml') except AttributeError: pass else: if not ext_xml: filename += '.xml' f_out = open(filename, 'w') f_out.write(results) f_out.close() LOGGER.info('Results are saved as {0}.'.format(repr(filename))) return job_id, PsiBlastRecord(results, sequence) else: return job_id
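# Hedged usage sketch (not part of the original source): one EBI PSI-BLAST cycle
# run with the built-in 'runexample' sequence, then a follow-up cycle continued
# from the returned job id via the documented previousjobid/cycle keywords.
# As the return statements above show, the first call returns the job id
# together with a PsiBlastRecord. The email address and output filename below
# are placeholders.
def _psiblast_cycle_example():
    job_id, record = psiBlastCycle('runexample',
                                   filename='psiblast_cycle1.xml',
                                   email='[email protected]')
    next_job_id = psiBlastCycle(previousjobid=job_id, cycle=1,
                                email='[email protected]')
    return record, next_job_id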
def blastPDB(sequence, filename=None, **kwargs): """Return a :class:`PDBBlastRecord` instance that contains results from blast searching of ProteinDataBank database *sequence* using NCBI blastp. :arg sequence: single-letter code amino acid sequence of the protein without any gap characters, all white spaces will be removed :type sequence: str :arg filename: a *filename* to save the results in XML format :type filename: str *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``) search parameters can be adjusted by the user. *sleep* keyword argument (default is ``2`` seconds) determines how long to wait to reconnect for results. Sleep time is doubled when results are not ready. *timeout* (default is 30 seconds) determines when to give up waiting for the results. """ if sequence == 'runexample': sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI' 'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN' 'DAYDIVKMKKSNISPNFNFMGQLLDFERTL') elif isinstance(sequence, str): sequence = ''.join(sequence.split()) if not checkSequence(sequence): raise ValueError(repr(sequence) + ' is not a valid sequence') else: raise TypeError('sequence must be a string') query = [('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'), ('PROGRAM', 'blastp'),] expect = kwargs.pop('expect', 10e-10) assert isinstance(expect, (float, int)), 'expect must be a float' assert expect > 0, 'expect must be a positive number' query.append(('EXPECT', expect)) hitlist_size = kwargs.pop('hitlist_size', 250) assert isinstance(hitlist_size, int), 'hitlist_size must be an integer' assert hitlist_size > 0, 'expect must be a positive integer' query.append(('HITLIST_SIZE', hitlist_size)) query.append(('QUERY', sequence)) query.append(('CMD', 'Put')) sleep = float(kwargs.pop('sleep', 2)) timeout = float(kwargs.pop('timeout', 20)) if kwargs: LOGGER.warning("Keyword argument(s) '{0:s}' are not used." 
.format("', '".join(kwargs.keys()))) import urllib, urllib2 url = 'http://blast.ncbi.nlm.nih.gov/Blast.cgi' data = urllib.urlencode(query) LOGGER.timeit() LOGGER.info('Blast searching NCBI PDB database for "{0:s}..."' .format(sequence[:5])) request = urllib2.Request(url, data, {'User-agent': 'ProDy'}) handle = urllib2.urlopen(request) html = handle.read() index = html.find('RID =') if index == -1: raise Exception('NCBI did not return expected response.') else: last = html.find('\n', index) rid = html[index + len('RID ='):last].strip() index = html.find('RTOE =') if index == -1: rtoe = None # This is not used else: last = html.find('\n', index) rtoe = int(html[index + len('RTOE ='):last].strip()) query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500), ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')] data = urllib.urlencode(query) while True: LOGGER.sleep(int(sleep), ' to connect NCBI for search results.') LOGGER.write('Connecting NCBI for search results...') request = urllib2.Request(url, data, {'User-agent': 'ProDy'}) handle = urllib2.urlopen(request) results = handle.read() index = results.find('Status=') LOGGER.clear() if index < 0: break last = results.index('\n', index) status = results[index+len('Status='):last].strip() if status.upper() == 'READY': break sleep *= 2 if LOGGER.timing() > timeout: LOGGER.warning('Blast search time out.') return None LOGGER.clear() LOGGER.timing('Blast search completed in %.1fs.') if isinstance(filename, str): if not filename.lower().endswith('.xml'): filename += '.xml' out = open(filename, 'w') out.write(results) out.close() LOGGER.info('Results are saved as {0:s}.'.format(filename)) return PDBBlastRecord(results, sequence)
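# Minimal sketch (not part of the original code) of the submit-and-poll pattern
# blastPDB implements above: `check_status` stands in for any callable that
# re-queries NCBI and returns the current status string; the names and defaults
# here are assumptions for illustration only.
import time

def _poll_until_ready(check_status, sleep=2.0, timeout=120.0):
    """Return the final status string, or None if *timeout* is exceeded."""
    start = time.time()
    while True:
        status = check_status()
        if status is None or status.upper() == 'READY':
            return status
        if time.time() - start > timeout:
            return None  # give up, mirroring the time-out branch above
        time.sleep(sleep)
        sleep *= 2  # this blastPDB variant doubles the wait between polls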
def blastPDBUniProtKB(sequence, filename=None, **kwargs): """Returns a :class:`PDBBlastRecord` instance that contains results from blast searching of ProteinDataBank database *sequence* using NCBI blastp. :arg sequence: single-letter code amino acid sequence of the protein without any gap characters, all white spaces will be removed :type sequence: str :arg filename: a *filename* to save the results in XML format :type filename: str *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``) search parameters can be adjusted by the user. *sleep* keyword argument (default is ``2`` seconds) determines how long to wait to reconnect for results. Sleep time is doubled when results are not ready. *timeout* (default is 120s) determines when to give up waiting for the results. *num_sequences (default is ``1``) """ num_sequences = int(kwargs.pop('num_sequences', 1)) if sequence == 'runexample': sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI' 'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN' 'DAYDIVKMKKSNISPNFNFMGQLLDFERTL') else: if num_sequences == 1: try: sequence = ''.join(sequence.split()) _ = sequence.isalpha() except AttributeError: raise TypeError('sequence must be a string') else: if not _: raise ValueError('not a valid protein sequence') headers = {'User-agent': 'ProDy'} query = [('DATABASE', 'swissprot'), ('ENTREZ_QUERY', '(none)'), ('PROGRAM', 'blastp'),] expect = float(kwargs.pop('expect', 10e-5)) if expect <= 0: raise ValueError('expect must be a positive number') query.append(('EXPECT', expect)) hitlist_size = int(kwargs.pop('hitlist_size', 250)) if hitlist_size <= 0: raise ValueError('expect must be a positive integer') psiblast = 'true' step_number = 3 query.append(('RUN_PSIBLAST', psiblast)) query.append(('HITLIST_SIZE', hitlist_size)) query.append(('QUERY', sequence)) query.append(('CMD', 'Put')) query.append(('STEP_NUMBER', step_number)) sleep = float(kwargs.pop('sleep', 2)) timeout = float(kwargs.pop('timeout', 120)) if kwargs: LOGGER.warn('Keyword argument(s) {0} are not used.' 
.format(', '.join([repr(key) for key in kwargs]))) try: import urllib.parse urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8') except ImportError: from urllib import urlencode url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi' data = urlencode(query) LOGGER.timeit('_prody_blast') LOGGER.info('Blast searching NCBI PDB database for "{0}..."' .format(sequence[:5])) handle = openURL(url, data=data, headers=headers) html = handle.read() index = html.find(b'name="RID" type="hidden" value="') if index == -1: raise Exception('NCBI did not return expected response.') else: last = html.find(b'>',index) rid = html[index + len('name="RID" type="hidden" value="'):last-1].strip() index = html.find(b'name="RTOE" type="hidden" value="') if index == -1: rtoe = None # This is not used else: last = html.find(b'>', index) rtoe = html[index + len('name="RTOE" type="hidden" value="'):last-1].strip() query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500), ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')] data = urlencode(query) while True: LOGGER.sleep(int(sleep), 'to reconnect NCBI for search results.') LOGGER.write('Connecting NCBI for search results...') handle = openURL(url, data=data, headers=headers) results = handle.read() index = results.find(b'Status=') LOGGER.clear() if index < 0: break last = results.index(b'\n', index) status = results[index+len('Status='):last].strip() if status.upper() == 'READY': break sleep = int(sleep * 1.5) if LOGGER.timing('_prody_blast') > timeout: LOGGER.warn('Blast search time out.') return None LOGGER.clear() LOGGER.report('Blast search completed in %.1fs.', '_prody_blast') try: ext_xml = filename.lower().endswith('.xml') except AttributeError: pass else: if not ext_xml: filename += '.xml' out = open(filename, 'w') out.write(results) out.close() LOGGER.info('Results are saved as {0}.'.format(repr(filename))) return SwissProtBlastRecord(results, sequence)
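# Hedged usage sketch (not part of the original source): a SwissProt search via
# NCBI PSI-BLAST as wrapped by blastPDBUniProtKB above. The output filename is
# an assumption; the call returns a SwissProtBlastRecord, or None if the search
# times out.
def _swissprot_blast_example():
    record = blastPDBUniProtKB('runexample',
                               filename='swissprot_hits.xml',
                               hitlist_size=100, expect=1e-5, timeout=300)
    return record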
def getRecord(self, url, localFile=False): if localFile: dali_file = open(url, 'r') data = dali_file.read() dali_file.close() else: sleep = 2 timeout = 120 LOGGER.timeit('_dali') log_message = '' try_error = 3 while True: LOGGER.sleep(int(sleep), 'to reconnect Dali '+log_message) LOGGER.clear() LOGGER.write('Connecting Dali for search results...') LOGGER.clear() try: html = urllib2.urlopen(url).read() except: try_error -= 1 if try_error >= 0: LOGGER.sleep(2, '. Connection error happened. Trying to reconnect...') continue else: html = urllib2.urlopen(url).read() if html.find('Status: Queued') > -1: log_message = '(Dali searching is queued)...' elif html.find('Status: Running') > -1: log_message = '(Dali searching is running)...' elif html.find('Your job') == -1 and html.find('.txt') > -1: break elif html.find('ERROR:') > -1: LOGGER.warn(': Dali search reported an ERROR!') return None break sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5) if LOGGER.timing('_dali') > timeout: LOGGER.warn(': Dali search is time out. \nThe results can be obtained using getRecord() function later.') return None break LOGGER.clear() LOGGER.clear() LOGGER.report('Dali results completed in %.1fs.', '_dali') lines = html.strip().split('\n') file_name = re.search('=.+-90\.txt', html).group()[1:] file_name = file_name[:-7] # LOGGER.info(url+file_name+self._subset+'.txt') data = urllib2.urlopen(url+file_name+self._subset+'.txt').read() temp_name = file_name+self._subset+'_dali.txt' with open(temp_name, "w") as file_temp: file_temp.write(html + '\n' + url+file_name + '\n' + data) # with open(temp_name, "a+") as file_temp: file_temp.write(url+file_name + '\n' + data) data_list = data.strip().split('# ') # No: Chain Z rmsd lali nres %id PDB Description -> data_list[3] # Structural equivalences -> data_list[4] # Translation-rotation matrices -> data_list[5] map_temp_dict = dict() mapping = [] lines = data_list[4].strip().split('\n') self._lines_4 = lines mapping_temp = np.genfromtxt(lines[1:], delimiter = (4,1,14,6,2,4,4,5,2,4,4,3,5,4,3,5,6,3,5,4,3,5,28), usecols = [0,3,5,7,9,12,15,15,18,21], dtype='|i4') # [0,3,5,7,9,12,15,15,18,21] -> [index, residue_a, residue_b, residue_i_a, residue_i_b, resid_a, resid_b, resid_i_a, resid_i_b] for map_i in mapping_temp: if not map_i[0] in map_temp_dict: map_temp_dict[map_i[0]] = [[map_i[1], map_i[2], map_i[3], map_i[4]]] else: map_temp_dict[map_i[0]].append([map_i[1], map_i[2], map_i[3], map_i[4]]) self._max_index = max(mapping_temp[:,2]) self._mapping = map_temp_dict self._data = data_list[3] lines = data_list[3].strip().split('\n') daliInfo = np.genfromtxt(lines[1:], delimiter = (4,3,6,5,5,5,6,5,57), usecols = [0,2,3,4,5,6,7,8], dtype=[('id', '<i4'), ('pdb_chain', '|S6'), ('Z', '<f4'), ('rmsd', '<f4'), ('len_align', '<i4'), ('res_num', '<i4'), ('identity', '<i4'), ('title', '|S70')]) if daliInfo.ndim == 0: daliInfo = np.array([daliInfo]) pdbListAll = [] self._daliInfo = daliInfo dali_temp_dict = dict() for temp in self._daliInfo: temp_dict = dict() pdb_chain = temp[1].strip()[0:6] temp_dict['pdbId'] = pdb_chain[0:4] temp_dict['chainId'] = pdb_chain[5:6] temp_dict['pdb_chain'] = pdb_chain temp_dict['Z'] = temp[2] temp_dict['rmsd'] = temp[3] temp_dict['len_align'] = temp[4] temp_dict['res_num'] = temp[5] temp_dict['identity'] = temp[6] temp_dict['mapping'] = (np.array(map_temp_dict[temp[0]])-1).tolist() temp_dict['map_ref'] = [x for map_i in (np.array(map_temp_dict[temp[0]])-1).tolist() for x in range(map_i[0], map_i[1]+1)] temp_dict['map_sel'] = [x for map_i in 
                                   (np.array(map_temp_dict[temp[0]])-1).tolist()
                                   for x in range(map_i[2], map_i[3]+1)]
            dali_temp_dict[temp_dict['pdb_chain']] = temp_dict
            pdbListAll.append(pdb_chain)
        self._pdbListAll = tuple(pdbListAll)
        self._pdbList = self._pdbListAll
        self._alignPDB = dali_temp_dict
        LOGGER.info(str(len(pdbListAll)) + ' Dali results were retrieved.')
        return True
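# Illustrative sketch (standalone, not part of the original class) of the
# map_ref/map_sel expansion performed above: each Dali structural-equivalence
# block [ref_start, ref_end, sel_start, sel_end] (1-based, inclusive) becomes
# two lists of 0-based residue indices.
def _expand_dali_blocks(blocks):
    zero_based = [[value - 1 for value in block] for block in blocks]
    map_ref = [i for r0, r1, _, _ in zero_based for i in range(r0, r1 + 1)]
    map_sel = [i for _, _, s0, s1 in zero_based for i in range(s0, s1 + 1)]
    return map_ref, map_sel

# e.g. _expand_dali_blocks([[1, 3, 11, 13]]) returns ([0, 1, 2], [10, 11, 12])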
def fetch(self, url=None, localFile=False, **kwargs): """Get Dali record from url or file. :arg url: url of Dali results page or local dali results file If None then the url already associated with the DaliRecord object is used. :type url: str :arg localFile: whether provided url is a path for a local dali results file :type localFile: bool :arg timeout: amount of time until the query times out in seconds default value is 120 :type timeout: int :arg localfolder: folder in which to find the local file default is the current folder :type localfolder: str """ if localFile: dali_file = open(url, 'r') data = dali_file.read() dali_file.close() else: import requests if url == None: url = self._url sleep = 2 timeout = kwargs.pop('timeout', 120) LOGGER.timeit('_dali') log_message = '' try_error = 3 while True: LOGGER.write('Connecting to Dali for search results...') LOGGER.clear() try: # html = urllib2.urlopen(url).read() html = requests.get(url).content except: try_error -= 1 if try_error >= 0: LOGGER.sleep(2, '. Connection error happened. Trying to reconnect...') continue else: # html = urllib2.urlopen(url).read() html = requests.get(url).content if PY3K: html = html.decode() if html.find('Status: Queued') > -1: log_message = '(Dali search is queued)...' elif html.find('Status: Running') > -1: log_message = '(Dali search is running)...' elif html.find('Your job') == -1 and html.find('.txt') > -1: break elif html.find('ERROR:') > -1: LOGGER.warn(': Dali search reported an ERROR!') return False sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5) if LOGGER.timing('_dali') > timeout: LOGGER.warn(': Dali search has timed out. \nThe results can be obtained later using the fetch() method.') return False LOGGER.sleep(int(sleep), 'to reconnect to Dali '+log_message) LOGGER.clear() LOGGER.clear() LOGGER.report('Dali results were fetched in %.1fs.', '_dali') lines = html.strip().split('\n') file_name = re.search('=.+-90\\.txt', html).group()[1:] file_name = file_name[:-7] # LOGGER.info(url+file_name+self._subset+'.txt') # data = urllib2.urlopen(url+file_name+self._subset+'.txt').read() data = requests.get(url+file_name+self._subset+'.txt').content if PY3K: data = data.decode() localfolder = kwargs.pop('localfolder', '.') if file_name.lower().startswith('s001'): temp_name = self._pdbId + self._chain else: temp_name = file_name temp_name += self._subset + '_dali.txt' if localfolder != '.' 
and not os.path.exists(localfolder): os.mkdir(localfolder) with open(localfolder+os.sep+temp_name, "w") as file_temp: file_temp.write(html + '\n' + url+file_name+self._subset+'.txt' + '\n' + data) # with open(temp_name, "a+") as file_temp: file_temp.write(url+file_name + '\n' + data) data_list = data.strip().split('# ') # No: Chain Z rmsd lali nres %id PDB Description -> data_list[3] # Structural equivalences -> data_list[4] # Translation-rotation matrices -> data_list[5] map_temp_dict = dict() lines = data_list[4].strip().split('\n') self._lines_4 = lines mapping_temp = np.genfromtxt(lines[1:], delimiter = (4,1,14,6,2,4,4,5,2,4,4,3,5,4,3,5,6,3,5,4,3,5,28), usecols = [0,3,5,7,9,12,15,15,18,21], dtype='|i4') # [0,3,5,7,9,12,15,15,18,21] -> [index, residue_a, residue_b, residue_i_a, residue_i_b, resid_a, resid_b, resid_i_a, resid_i_b] for map_i in mapping_temp: if not map_i[0] in map_temp_dict: map_temp_dict[map_i[0]] = [[map_i[1], map_i[2], map_i[3], map_i[4]]] else: map_temp_dict[map_i[0]].append([map_i[1], map_i[2], map_i[3], map_i[4]]) self._max_index = max(mapping_temp[:,2]) self._mapping = map_temp_dict self._data = data_list[3] lines = data_list[3].strip().split('\n') # daliInfo = np.genfromtxt(lines[1:], delimiter = (4,3,6,5,5,5,6,5,57), usecols = [0,2,3,4,5,6,7,8], # dtype=[('id', '<i4'), ('pdb_chain', '|S6'), ('Z', '<f4'), ('rmsd', '<f4'), # ('len_align', '<i4'), ('nres', '<i4'), ('identity', '<i4'), ('title', '|S70')]) daliInfo = np.genfromtxt(lines[1:], delimiter = (4,3,6,5,5,5,6,5,57), usecols = [0,2,3,4,5,6,7,8], dtype=[('id', '<i4'), ('pdb_chain', '|U6'), ('Z', '<f4'), ('rmsd', '<f4'), ('len_align', '<i4'), ('nres', '<i4'), ('identity', '<i4'), ('title', '|U70')]) if daliInfo.ndim == 0: daliInfo = np.array([daliInfo]) pdbListAll = [] self._daliInfo = daliInfo dali_temp_dict = dict() for temp in self._daliInfo: temp_dict = dict() pdb_chain = temp[1].strip()[0:6] # U6 and U70 were used as the dtype for np.genfromtext -> unicode string were used in daliInfo # if PY3K: # pdb_chain = pdb_chain.decode() pdb_chain = str(pdb_chain) temp_dict['pdbId'] = pdbid = pdb_chain[0:4].lower() temp_dict['chainId'] = chid = pdb_chain[5:6] temp_dict['pdb_chain'] = pdb_chain = pdbid + chid temp_dict['Z'] = temp[2] temp_dict['rmsd'] = temp[3] temp_dict['len_align'] = temp[4] temp_dict['nres'] = temp[5] temp_dict['identity'] = temp[6] temp_dict['mapping'] = (np.array(map_temp_dict[temp[0]])-1).tolist() temp_dict['map_ref'] = [x for map_i in (np.array(map_temp_dict[temp[0]])-1).tolist() for x in range(map_i[0], map_i[1]+1)] temp_dict['map_sel'] = [x for map_i in (np.array(map_temp_dict[temp[0]])-1).tolist() for x in range(map_i[2], map_i[3]+1)] dali_temp_dict[pdb_chain] = temp_dict pdbListAll.append(pdb_chain) self._pdbListAll = tuple(pdbListAll) self._pdbList = self._pdbListAll self._alignPDB = dali_temp_dict LOGGER.info('Obtained ' + str(len(pdbListAll)) + ' PDB chains from Dali for '+self._pdbId+self._chain+'.') return True
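# Hedged usage sketch: re-fetching results for an existing DaliRecord. `rec` is
# assumed to be a DaliRecord instance whose search URL was stored earlier; the
# 'dali_results' folder name is an assumption. fetch() returns True on success
# and False if Dali reports an error or the wait exceeds *timeout*.
def _dali_fetch_example(rec):
    ok = rec.fetch(timeout=300, localfolder='dali_results')
    if ok:
        print('Dali hits:', len(rec._pdbListAll))  # attribute set by fetch()
    return ok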
def blastPDB(sequence, filename=None, **kwargs): """Returns a :class:`PDBBlastRecord` instance that contains results from blast searching of ProteinDataBank database *sequence* using NCBI blastp. :arg sequence: single-letter code amino acid sequence of the protein without any gap characters, all white spaces will be removed :type sequence: str :arg filename: a *filename* to save the results in XML format :type filename: str *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``) search parameters can be adjusted by the user. *sleep* keyword argument (default is ``2`` seconds) determines how long to wait to reconnect for results. Sleep time is doubled when results are not ready. *timeout* (default is 120s) determines when to give up waiting for the results. """ if sequence == "runexample": sequence = ( "ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI" "SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN" "DAYDIVKMKKSNISPNFNFMGQLLDFERTL" ) else: try: sequence = "".join(sequence.split()) _ = sequence.isalpha() except AttributeError: raise TypeError("sequence must be a string") else: if not _: raise ValueError("not a valid protein sequence") headers = {"User-agent": "ProDy"} query = [("DATABASE", "pdb"), ("ENTREZ_QUERY", "(none)"), ("PROGRAM", "blastp")] expect = float(kwargs.pop("expect", 10e-10)) if expect <= 0: raise ValueError("expect must be a positive number") query.append(("EXPECT", expect)) hitlist_size = int(kwargs.pop("hitlist_size", 250)) if hitlist_size <= 0: raise ValueError("expect must be a positive integer") query.append(("HITLIST_SIZE", hitlist_size)) query.append(("QUERY", sequence)) query.append(("CMD", "Put")) sleep = float(kwargs.pop("sleep", 2)) timeout = float(kwargs.pop("timeout", 120)) if kwargs: LOGGER.warn("Keyword argument(s) {0} are not used.".format(", ".join([repr(key) for key in kwargs]))) try: import urllib.parse urlencode = lambda data: bytes(urllib.parse.urlencode(data), "utf-8") except ImportError: from urllib import urlencode url = "https://blast.ncbi.nlm.nih.gov/Blast.cgi" data = urlencode(query) LOGGER.timeit("_prody_blast") LOGGER.info('Blast searching NCBI PDB database for "{0}..."'.format(sequence[:5])) handle = openURL(url, data=data, headers=headers) html = handle.read() index = html.find(b"RID =") if index == -1: raise Exception("NCBI did not return expected response.") else: last = html.find(b"\n", index) rid = html[index + len("RID =") : last].strip() index = html.find(b"RTOE =") if index == -1: rtoe = None # This is not used else: last = html.find(b"\n", index) rtoe = int(html[index + len("RTOE =") : last].strip()) query = [("ALIGNMENTS", 500), ("DESCRIPTIONS", 500), ("FORMAT_TYPE", "XML"), ("RID", rid), ("CMD", "Get")] data = urlencode(query) while True: LOGGER.sleep(int(sleep), "to reconnect NCBI for search results.") LOGGER.write("Connecting NCBI for search results...") handle = openURL(url, data=data, headers=headers) results = handle.read() index = results.find(b"Status=") LOGGER.clear() if index < 0: break last = results.index(b"\n", index) status = results[index + len("Status=") : last].strip() if status.upper() == "READY": break sleep = int(sleep * 1.5) if LOGGER.timing("_prody_blast") > timeout: LOGGER.warn("Blast search time out.") return None LOGGER.clear() LOGGER.report("Blast search completed in %.1fs.", "_prody_blast") try: ext_xml = filename.lower().endswith(".xml") except AttributeError: pass else: if not ext_xml: filename += ".xml" out = open(filename, "w") out.write(results) out.close() 
        LOGGER.info("Results are saved as {0}.".format(repr(filename)))

    return PDBBlastRecord(results, sequence)
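# Hedged usage sketch (not part of the original source): calling the blastPDB
# variant above, which accepts a plain amino-acid string (whitespace is
# stripped) and returns a PDBBlastRecord, or None if the search times out.
# The output filename is an assumption.
def _blast_string_example():
    record = blastPDB('runexample', filename='pdb_blast.xml',
                      hitlist_size=100, expect=1e-10, timeout=300)
    return record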
def fetch(self, xml=None, sequence=None, **kwargs): """Get Blast record from url or file. :arg sequence: an object with an associated sequence string or a sequence string itself :type sequence: :class:`Atomic`, :class:`Sequence`, or str :arg xml: blast search results in XML format or an XML file that contains the results or a filename for saving the results or None :type xml: str :arg timeout: amount of time until the query times out in seconds default value is 120 :type timeout: int """ if self.isSuccess: LOGGER.warn( "The record already exists so not further search is performed") return True if sequence == None: sequence = self._sequence if xml == None: xml = self._xml import xml.etree.cElementTree as ET if xml is not None and len(xml) < 100: if os.path.isfile(xml): xml = ET.parse(xml) root = xml.getroot() else: raise ValueError('xml is not a filename and does not look like' ' a valid XML string') else: headers = {'User-agent': 'ProDy'} query = [ ('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'), ('PROGRAM', 'blastp'), ] expect = float(kwargs.pop('expect', 10e-10)) if expect <= 0: raise ValueError('expect must be a positive number') query.append(('EXPECT', expect)) hitlist_size = int(kwargs.pop('hitlist_size', 250)) if hitlist_size <= 0: raise ValueError('expect must be a positive integer') query.append(('HITLIST_SIZE', hitlist_size)) query.append(('QUERY', sequence)) query.append(('CMD', 'Put')) sleep = float(kwargs.pop('sleep', 2)) timeout = float(kwargs.pop('timeout', self._timeout)) self._timeout = timeout try: import urllib.parse urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8') except ImportError: from urllib import urlencode url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi' data = urlencode(query) LOGGER.timeit('_prody_blast') LOGGER.info( 'Blast searching NCBI PDB database for "{0}..."'.format( sequence[:5])) handle = openURL(url, data=data, headers=headers) html = handle.read() index = html.find(b'RID =') if index == -1: raise Exception('NCBI did not return expected response.') else: last = html.find(b'\n', index) rid = html[index + len('RID ='):last].strip() query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500), ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')] data = urlencode(query) while True: LOGGER.sleep(int(sleep), 'to reconnect to NCBI for search results.') LOGGER.write('Connecting to NCBI for search results...') handle = openURL(url, data=data, headers=headers) results = handle.read() index = results.find(b'Status=') LOGGER.clear() if index < 0: break last = results.index(b'\n', index) status = results[index + len('Status='):last].strip() if status.upper() == b'READY': break sleep = int(sleep * 1.5) if LOGGER.timing('_prody_blast') > timeout: LOGGER.warn('Blast search time out.') return False LOGGER.clear() LOGGER.report('Blast search completed in %.1fs.', '_prody_blast') filename = xml root = ET.XML(results) try: ext_xml = filename.lower().endswith('.xml') except AttributeError: pass else: if not ext_xml: filename += '.xml' out = open(filename, 'w') if PY3K: out.write(results.decode()) else: out.write(results) out.close() LOGGER.info('Results are saved as {0}.'.format(repr(filename))) root = dictElement(root, 'BlastOutput_') if root['db'] != 'pdb': raise ValueError('blast search database in xml must be "pdb"') if root['program'] != 'blastp': raise ValueError( 'blast search program in xml must be "blastp"') self._param = dictElement(root['param'][0], 'Parameters_') query_len = int(root['query-len']) if sequence and len(sequence) != query_len: raise 
ValueError( 'query-len and the length of the sequence do not ' 'match, xml data may not be for given sequence') hits = [] for iteration in root['iterations']: for hit in dictElement(iteration, 'Iteration_')['hits']: hit = dictElement(hit, 'Hit_') data = dictElement(hit['hsps'][0], 'Hsp_') for key in [ 'align-len', 'gaps', 'hit-frame', 'hit-from', 'hit-to', 'identity', 'positive', 'query-frame', 'query-from', 'query-to' ]: data[key] = int(data[key]) data['query-len'] = query_len for key in ['evalue', 'bit-score', 'score']: data[key] = float(data[key]) p_identity = 100.0 * data['identity'] / ( data['query-to'] - data['query-from'] + 1) data['percent_identity'] = p_identity p_overlap = (100.0 * (data['align-len'] - data['gaps']) / query_len) data['percent_coverage'] = p_overlap for item in (hit['id'] + hit['def']).split('>gi'): head, title = item.split(None, 1) head = head.split('|') pdb_id = head[-2].lower() chain_id = head[-1][:1] pdbch = dict(data) pdbch['pdb_id'] = pdb_id pdbch['chain_id'] = chain_id pdbch['title'] = (head[-1][1:] + title).strip() hits.append((p_identity, p_overlap, pdbch)) hits.sort(key=lambda hit: hit[0], reverse=True) self._hits = hits return True
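# Illustrative sketch (not part of the original class): filtering the _hits
# list built by fetch() above, where each entry is a tuple of
# (percent_identity, percent_coverage, hit_dictionary) sorted by identity in
# descending order. The cutoff values below are arbitrary examples.
def _select_hits(record, min_identity=90.0, min_coverage=70.0):
    return [hit for identity, coverage, hit in record._hits
            if identity >= min_identity and coverage >= min_coverage]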
def blastPDB(sequence, filename=None, **kwargs): """Returns a :class:`PDBBlastRecord` instance that contains results from blast searching *sequence* against the PDB using NCBI blastp. :arg sequence: an object with an associated sequence string or a sequence string itself :type sequence: :class:`Atomic`, :class:`Sequence`, or str :arg filename: a *filename* to save the results in XML format :type filename: str *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``) search parameters can be adjusted by the user. *sleep* keyword argument (default is ``2`` seconds) determines how long to wait to reconnect for results. Sleep time is multiplied by 1.5 when results are not ready. *timeout* (default is 120 s) determines when to give up waiting for the results. """ if sequence == 'runexample': sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI' 'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN' 'DAYDIVKMKKSNISPNFNFMGQLLDFERTL') elif isinstance(sequence, Atomic): sequence = sequence.calpha.getSequence() elif isinstance(sequence, Sequence): sequence = str(sequence) elif isinstance(sequence, str): if len(sequence) in [4, 5, 6]: ag = parsePDB(sequence) sequence = ag.calpha.getSequence() sequence = ''.join(sequence.split()) else: raise TypeError( 'sequence must be Atomic, Sequence, or str not {0}'.format( type(sequence))) headers = {'User-agent': 'ProDy'} query = [ ('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'), ('PROGRAM', 'blastp'), ] expect = float(kwargs.pop('expect', 10e-10)) if expect <= 0: raise ValueError('expect must be a positive number') query.append(('EXPECT', expect)) hitlist_size = int(kwargs.pop('hitlist_size', 250)) if hitlist_size <= 0: raise ValueError('expect must be a positive integer') query.append(('HITLIST_SIZE', hitlist_size)) query.append(('QUERY', sequence)) query.append(('CMD', 'Put')) sleep = float(kwargs.pop('sleep', 2)) timeout = float(kwargs.pop('timeout', 120)) try: import urllib.parse urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8') except ImportError: from urllib import urlencode url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi' data = urlencode(query) LOGGER.timeit('_prody_blast') LOGGER.info('Blast searching NCBI PDB database for "{0}..."'.format( sequence[:5])) handle = openURL(url, data=data, headers=headers) html = handle.read() index = html.find(b'RID =') if index == -1: raise Exception('NCBI did not return expected response.') else: last = html.find(b'\n', index) rid = html[index + len('RID ='):last].strip() index = html.find(b'RTOE =') if index == -1: rtoe = None # This is not used else: last = html.find(b'\n', index) rtoe = int(html[index + len('RTOE ='):last].strip()) query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500), ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')] data = urlencode(query) while True: LOGGER.sleep(int(sleep), 'to reconnect NCBI for search results.') LOGGER.write('Connecting to NCBI for search results...') handle = openURL(url, data=data, headers=headers) results = handle.read() index = results.find(b'Status=') LOGGER.clear() if index < 0: break last = results.index(b'\n', index) status = results[index + len('Status='):last].strip() if status.upper() == 'READY': break sleep = int(sleep * 1.5) if LOGGER.timing('_prody_blast') > timeout: LOGGER.warn('Blast search time out.') return None LOGGER.clear() LOGGER.report('Blast search completed in %.1fs.', '_prody_blast') try: ext_xml = filename.lower().endswith('.xml') except AttributeError: pass else: if not ext_xml: filename += 
            '.xml'
        out = open(filename, 'w')
        if PY3K:
            out.write(results.decode())
        else:
            out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

    return PDBBlastRecord(results, sequence)
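# Hedged end-to-end sketch: search with the blastPDB defined above, then fetch
# the structure of the top hit with parsePDB (already used in this listing).
# getBest() is an assumed convenience method of PDBBlastRecord (not defined
# here); the 'pdb_id' key matches the hit dictionaries built in fetch() above.
def _blast_and_parse_example():
    record = blastPDB('runexample')
    if record is None:  # the search timed out
        return None
    best = record.getBest()          # assumed API
    return parsePDB(best['pdb_id'])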