예제 #1
0
파일: psiblast.py 프로젝트: fongchun/ProDy
def psiBlastCycle(sequence=None, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    a single cycle of EBI psiblast.

    :arg sequence: an object with an associated sequence string 
         or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    The following search parameters can be adjusted by the user.
    We use the same default values as 
    http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/
    wherever applicable.

    :arg email: email address for reporting problems
        default is [email protected]
    :type email: str with an @ before a .

    :arg matrix: The comparison matrix to be used to score alignments when searching the database
        possible values are 'BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30' and 'PAM70' 
        default is 'BLOSUM62'
    :type matrix: str

    :arg gapopen: Penalty taken away from the score when a gap is created in sequence alignments. 
        Increasing the gap opening penalty will decrease the number of gaps in the final alignment.
        Possible values range from 8 to 16 inclusive, default is 11
    :type gapopen: int

    :arg gapext: Penalty taken away from the score for each base or residue in the gap. 
        Increasing the gap extension penalty favors short gaps in the final alignment, 
        conversly decreasing the gap extension penalty favors long gaps in the final alignment. 
        Possible values range from 0 to 3, default is 1
    :type gapext: int

    :arg expthr: Expectation threshold that limits the number of scores and alignments reported. 
        This is the maximum number of times the match is expected to occur by chance.
        Possible values are 1.0e-200, 1.0e-100, 1.0e-50, 1.0e-10, 1.0e-5, 1.0e-4, 1.0e-3,
        1.0e-2, 0.1, 1.0, 10.0, 100, 1000
        default is 10.0
    :type expthr: float

    :arg psithr: Expectation value threshold for automatic selection of matched sequences for 
        inclusion in the PSSM at each iteration.
        Possible values are 1.0e-6, 1.0e-5, 1.0e-4, 2.0e-4, 5.0e-4, 1.0e-3, 2.0e-3, 5.0e-3,
        1.0e-2, 2.0e-2, 0.1, 0.3, 0.5, 1.0, 3.0, 10.0
        default is 1.0e-3
    :type psithr: float

    :arg scores: Maximum number of match score summaries reported in the result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type scores: int

    :arg alignments: Maximum number of match alignments reported in the result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type alignmets: int

    :arg dropoff: The amount a score can drop before extension of word hits is halted
        Possible values are 0, 2, 4, 6, 8, 10, 15, 20, 25, or 30
        Default is 15
    :type dropoff: int

    :arg finaldropoff: Dropoff value for final gapped alignment
        Possible values are 10, 12, 14, 16, 18, 20, 22, 24, 25, 26, 28, or 30
        Default is 25
    :type finaldropoff: int

    :arg filter: Filter regions of low sequence complexity. This can avoid issues with 
        low complexity sequences where matches are found due to composition rather than 
        meaningful sequence similarity. However, in some cases filtering also masks 
        regions of interest and so should be used with caution.
        Possible values are T and F, default is F
    :type filter: str

    :arg seqrange: Specify a range or section of the input sequence to use in the search.
        Example: Specifying '34-89' in an input sequence of total length 100, will tell BLAST 
        to only use residues 34 to 89, inclusive.
    :type seqrange: str of form START-END

    :arg database: a database name from those available. See
        http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/database
        default is pdb
    :type database: str

    :arg previousjobid: The job identifier for the previous PSI-BLAST iteration. 
        default is None
        You can change this if you want to continue from a previous run
    :type previousjobid: str

    :arg selectedHits: Name of a file containing a list of identifiers of the 
        hits from the previous iteration to use to construct the search PSSM 
        for this iteration.
        default is None
    :type selectedHits: str

    :arg cpfile: Name of a Checkpoint file from the previous iteration. 
        default is None
    :type cpfile: str

    :arg sleep: how long to wait to reconnect for status
         Sleep time is multiplied by 1.5 when results are not ready.
         default is 2 seconds
    :type sleep: float

    :arg timeout:  when to give up waiting for the results 
        default is 120 seconds
    :type timeout: float

    :arg cycle: cycle number
    :type cycle: int

    """
    cycle = kwargs.get('cycle',0)

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')

    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()

    elif isinstance(sequence, Sequence):
        sequence = str(sequence)

    elif isinstance(sequence, str):
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())

    elif sequence is None:
        if cycle == 0: 
            cycle = 1
    else:
        raise TypeError('sequence must be Atomic, Sequence, or str not {0}'
                        .format(type(sequence)))

    if cycle == 0:
        query = [('sequence', sequence)]
    else:
        query = []

    email = kwargs.get('email','*****@*****.**')
    if not isinstance(email, str):
        raise TypeError('email must be a string')
    elif email.find('@') == -1 or email.find('.') == -1 or len(email.split('@')) != 2:
        raise ValueError('email must be a valid email address with at least one . and exactly one @ sign')
    elif not email.find('@') < email.find(email.split('.')[-1]):
        raise ValueError('email must be a valid email address with a . after the @ sign')
    query.append(('email', email))
    query.append(('title', 'ProDy psiBlastPDB request'))

    previousjobid = kwargs.get('previousjobid','')
    if previousjobid is not '':
        query.append(('previousjobid',previousjobid))

    selectedHits = kwargs.get('selectedHits','')
    if selectedHits is not '':
        query.append(('selectedHits',selectedHits))

    database = kwargs.get('database','pdb')
    checkPsiBlastParameter('database', database)
    query.append(('database',database))

    matrix = kwargs.get('matrix', 'BLOSUM62')
    checkPsiBlastParameter('matrix', matrix)
    query.append(('matrix',matrix))

    gapopen = kwargs.get('gapopen',11)
    checkPsiBlastParameter('gapopen', gapopen)
    query.append(('gapopen',gapopen))

    gapext = kwargs.get('gapext',1)
    checkPsiBlastParameter('gapext', gapext)
    query.append(('gapext',gapext))

    expthr = kwargs.get('expthr', 10.)
    checkPsiBlastParameter('expthr', expthr)
    query.append(('expthr',expthr))
    
    psithr = kwargs.get('psithr',1.0e-3)
    checkPsiBlastParameter('psithr', psithr)
    query.append(('psithr',psithr))

    scores = kwargs.get('scores',500)
    checkPsiBlastParameter('scores', scores)
    query.append(('scores',scores))

    alignments = kwargs.get('alignments',500)
    checkPsiBlastParameter('alignments', alignments)
    query.append(('alignments',alignments))
    
    query.append(('alignView',0))
                    
    dropoff = kwargs.get('dropoff',15)
    checkPsiBlastParameter('dropoff', dropoff)
    query.append(('dropoff',dropoff))
        
    finaldropoff = kwargs.get('finaldropoff',25)
    checkPsiBlastParameter('finaldropoff', finaldropoff)
    query.append(('finaldropoff',finaldropoff))
        
    filter = kwargs.get('filter','F')
    checkPsiBlastParameter('filter', filter)
    query.append(('filter',filter))
    
    if previousjobid is '' and selectedHits is '':
        seqrange = kwargs.get('seqrange', None)
        if seqrange is None:
            seqrange = '0-' + str(len(sequence))
        elif not isinstance(seqrange, str):
            raise TypeError('seqrange should be a string')
        elif len(seqrange.split('-')) != 2:
            raise ValueError('seqrange should take the form START-END')
        try:
            start = int(seqrange.split('-')[0])
            end = int(seqrange.split('-')[1])
        except:
            raise ValueError('seqrange should be START-END with START and END being integers')
        query.append(('seqrange',seqrange))
        
    headers = { 'User-Agent' : 'ProDy' }
    
    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))
    
    data = urlencode(query)

    # submit the job
    base_url = 'http://www.ebi.ac.uk/Tools/services/rest/psiblast/'
    url = base_url + 'run/'
    LOGGER.timeit('_prody_psi-blast')
    if cycle == 0:
        LOGGER.info('PSI-Blast searching PDB database for "{0}..."'
                    .format(sequence[:5]))
    else:
        LOGGER.info('PSI-Blast searching PDB database, cycle={0}'
                    .format(cycle))

    handle = openURL(url, data=data, headers=headers)
    job_id = handle.read()
    handle.close()

    # check the status
    url = base_url + 'status/' + job_id
    handle = openURL(url)
    status = handle.read()
    handle.close()
                    
    # keep checking the status until it's no longer running
    while status == 'RUNNING':
        LOGGER.sleep(int(sleep), 'to reconnect to EBI for status.')
        LOGGER.write('Connecting to EBI for status...')
        handle = openURL(url)
        status = handle.read()
        LOGGER.clear()
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_psi-blast') > timeout:
            LOGGER.warn('PSI-Blast search time out.')
            return None

    LOGGER.info('The status is {0}'.format(status))
    LOGGER.clear()
    LOGGER.report('PSI-Blast search completed in %.1fs.', '_prody_psi-blast')
 
    if cycle != 1:
        # get the results
        url = base_url + 'result/' + job_id + '/xml'
        handle = openURL(url)
        results = handle.read()
        handle.close()
        
        try:
            ext_xml = filename.lower().endswith('.xml')
        except AttributeError:
            pass
        else:
            if not ext_xml:
                filename += '.xml'
            f_out = open(filename, 'w')
            f_out.write(results)
            f_out.close()
            LOGGER.info('Results are saved as {0}.'.format(repr(filename)))
        
        return job_id, PsiBlastRecord(results, sequence)
    else:
        return job_id
예제 #2
0
def psiBlastCycle(sequence=None, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    a single cycle of EBI psiblast.

    :arg sequence: an object with an associated sequence string 
         or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    The following search parameters can be adjusted by the user.
    We use the same default values as 
    http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/
    wherever applicable.

    :arg email: email address for reporting problems
        default is [email protected]
    :type email: str with an @ before a .

    :arg matrix: The comparison matrix to be used to score alignments when searching the database
        possible values are 'BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30' and 'PAM70' 
        default is 'BLOSUM62'
    :type matrix: str

    :arg gapopen: Penalty taken away from the score when a gap is created in sequence alignments. 
        Increasing the gap opening penalty will decrease the number of gaps in the final alignment.
        Possible values range from 8 to 16 inclusive, default is 11
    :type gapopen: int

    :arg gapext: Penalty taken away from the score for each base or residue in the gap. 
        Increasing the gap extension penalty favors short gaps in the final alignment, 
        conversly decreasing the gap extension penalty favors long gaps in the final alignment. 
        Possible values range from 0 to 3, default is 1
    :type gapext: int

    :arg expthr: Expectation threshold that limits the number of scores and alignments reported. 
        This is the maximum number of times the match is expected to occur by chance.
        Possible values are 1.0e-200, 1.0e-100, 1.0e-50, 1.0e-10, 1.0e-5, 1.0e-4, 1.0e-3,
        1.0e-2, 0.1, 1.0, 10.0, 100, 1000
        default is 10.0
    :type expthr: float

    :arg psithr: Expectation value threshold for automatic selection of matched sequences for 
        inclusion in the PSSM at each iteration.
        Possible values are 1.0e-6, 1.0e-5, 1.0e-4, 2.0e-4, 5.0e-4, 1.0e-3, 2.0e-3, 5.0e-3,
        1.0e-2, 2.0e-2, 0.1, 0.3, 0.5, 1.0, 3.0, 10.0
        default is 1.0e-3
    :type psithr: float

    :arg scores: Maximum number of match score summaries reported in the result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type scores: int

    :arg alignments: Maximum number of match alignments reported in the result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type alignmets: int

    :arg dropoff: The amount a score can drop before extension of word hits is halted
        Possible values are 0, 2, 4, 6, 8, 10, 15, 20, 25, or 30
        Default is 15
    :type dropoff: int

    :arg finaldropoff: Dropoff value for final gapped alignment
        Possible values are 10, 12, 14, 16, 18, 20, 22, 24, 25, 26, 28, or 30
        Default is 25
    :type finaldropoff: int

    :arg filter: Filter regions of low sequence complexity. This can avoid issues with 
        low complexity sequences where matches are found due to composition rather than 
        meaningful sequence similarity. However, in some cases filtering also masks 
        regions of interest and so should be used with caution.
        Possible values are T and F, default is F
    :type filter: str

    :arg seqrange: Specify a range or section of the input sequence to use in the search.
        Example: Specifying '34-89' in an input sequence of total length 100, will tell BLAST 
        to only use residues 34 to 89, inclusive.
    :type seqrange: str of form START-END

    :arg database: a database name from those available. See
        http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/database
        default is pdb
    :type database: str

    :arg previousjobid: The job identifier for the previous PSI-BLAST iteration. 
        default is None
        You can change this if you want to continue from a previous run
    :type previousjobid: str

    :arg selectedHits: Name of a file containing a list of identifiers of the 
        hits from the previous iteration to use to construct the search PSSM 
        for this iteration.
        default is None
    :type selectedHits: str

    :arg cpfile: Name of a Checkpoint file from the previous iteration. 
        default is None
    :type cpfile: str

    :arg sleep: how long to wait to reconnect for status
         Sleep time is multiplied by 1.5 when results are not ready.
         default is 2 seconds
    :type sleep: float

    :arg timeout:  when to give up waiting for the results 
        default is 120 seconds
    :type timeout: float

    :arg cycle: cycle number
    :type cycle: int

    """
    cycle = kwargs.get('cycle', 0)

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')

    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()

    elif isinstance(sequence, Sequence):
        sequence = str(sequence)

    elif isinstance(sequence, str):
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())

    elif sequence is None:
        if cycle == 0:
            cycle = 1
    else:
        raise TypeError(
            'sequence must be Atomic, Sequence, or str not {0}'.format(
                type(sequence)))

    if cycle == 0:
        query = [('sequence', sequence)]
    else:
        query = []

    email = kwargs.get('email', '*****@*****.**')
    if not isinstance(email, str):
        raise TypeError('email must be a string')
    elif email.find('@') == -1 or email.find('.') == -1 or len(
            email.split('@')) != 2:
        raise ValueError(
            'email must be a valid email address with at least one . and exactly one @ sign'
        )
    elif not email.find('@') < email.find(email.split('.')[-1]):
        raise ValueError(
            'email must be a valid email address with a . after the @ sign')
    query.append(('email', email))
    query.append(('title', 'ProDy psiBlastPDB request'))

    previousjobid = kwargs.get('previousjobid', '')
    if previousjobid != '':
        query.append(('previousjobid', previousjobid))

    selectedHits = kwargs.get('selectedHits', '')
    if selectedHits != '':
        query.append(('selectedHits', selectedHits))

    database = kwargs.get('database', 'pdb')
    checkPsiBlastParameter('database', database)
    query.append(('database', database))

    matrix = kwargs.get('matrix', 'BLOSUM62')
    checkPsiBlastParameter('matrix', matrix)
    query.append(('matrix', matrix))

    gapopen = kwargs.get('gapopen', 11)
    checkPsiBlastParameter('gapopen', gapopen)
    query.append(('gapopen', gapopen))

    gapext = kwargs.get('gapext', 1)
    checkPsiBlastParameter('gapext', gapext)
    query.append(('gapext', gapext))

    expthr = kwargs.get('expthr', 10.)
    checkPsiBlastParameter('expthr', expthr)
    query.append(('expthr', expthr))

    psithr = kwargs.get('psithr', 1.0e-3)
    checkPsiBlastParameter('psithr', psithr)
    query.append(('psithr', psithr))

    scores = kwargs.get('scores', 500)
    checkPsiBlastParameter('scores', scores)
    query.append(('scores', scores))

    alignments = kwargs.get('alignments', 500)
    checkPsiBlastParameter('alignments', alignments)
    query.append(('alignments', alignments))

    query.append(('alignView', 0))

    dropoff = kwargs.get('dropoff', 15)
    checkPsiBlastParameter('dropoff', dropoff)
    query.append(('dropoff', dropoff))

    finaldropoff = kwargs.get('finaldropoff', 25)
    checkPsiBlastParameter('finaldropoff', finaldropoff)
    query.append(('finaldropoff', finaldropoff))

    filter = kwargs.get('filter', 'no')
    checkPsiBlastParameter('filter', filter)
    query.append(('filter', filter))

    if previousjobid == '' and selectedHits == '':
        seqrange = kwargs.get('seqrange', None)
        if seqrange is None:
            seqrange = '0-' + str(len(sequence))
        elif not isinstance(seqrange, str):
            raise TypeError('seqrange should be a string')
        elif len(seqrange.split('-')) != 2:
            raise ValueError('seqrange should take the form START-END')
        try:
            start = int(seqrange.split('-')[0])
            end = int(seqrange.split('-')[1])
        except:
            raise ValueError(
                'seqrange should be START-END with START and END being integers'
            )
        query.append(('seqrange', seqrange))

    headers = {'User-Agent': 'ProDy'}

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    data = urlencode(query)

    # submit the job
    base_url = 'http://www.ebi.ac.uk/Tools/services/rest/psiblast/'
    url = base_url + 'run/'
    LOGGER.timeit('_prody_psi-blast')
    if cycle == 0:
        LOGGER.info('PSI-Blast searching PDB database for "{0}..."'.format(
            sequence[:5]))
    else:
        LOGGER.info(
            'PSI-Blast searching PDB database, cycle={0}'.format(cycle))

    handle = openURL(url, data=data, headers=headers)
    job_id = handle.read()
    if PY3K:
        job_id = job_id.decode()
    handle.close()

    # check the status
    url = base_url + 'status/' + job_id
    handle = openURL(url)
    status = handle.read()
    if PY3K:
        status = status.decode()
    handle.close()

    # keep checking the status until it's no longer running
    while status == 'RUNNING':
        LOGGER.sleep(int(sleep), 'to reconnect to EBI for status.')
        LOGGER.write('Connecting to EBI for status...')
        handle = openURL(url)
        status = handle.read()
        if PY3K:
            status = status.decode()
        LOGGER.clear()
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_psi-blast') > timeout:
            LOGGER.warn('PSI-Blast search time out.')
            return None

    LOGGER.info('The status is {0}'.format(status))
    LOGGER.clear()
    LOGGER.report('PSI-Blast search completed in %.1fs.', '_prody_psi-blast')

    if cycle != 1:
        # get the results
        url = base_url + 'result/' + job_id + '/xml'
        handle = openURL(url)
        results = handle.read()
        handle.close()

        try:
            ext_xml = filename.lower().endswith('.xml')
        except AttributeError:
            pass
        else:
            if not ext_xml:
                filename += '.xml'
            f_out = open(filename, 'w')
            f_out.write(results)
            f_out.close()
            LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

        return job_id, PsiBlastRecord(results, sequence)
    else:
        return job_id
예제 #3
0
def blastPDB(sequence, filename=None, **kwargs):
    """Return a :class:`PDBBlastRecord` instance that contains results from
    blast searching of ProteinDataBank database *sequence* using NCBI blastp.
        
    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str 
    :arg filename: a *filename* to save the results in XML format 
    :type filename: str
    
    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``) 
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for 
    results.  Sleep time is doubled when results are not ready.  *timeout* 
    (default is 30 seconds) determines when to give up waiting for the results.  
    """
    
    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')
    elif isinstance(sequence, str):
        sequence = ''.join(sequence.split())
        if not checkSequence(sequence):
            raise ValueError(repr(sequence) + ' is not a valid sequence')
    else:
        raise TypeError('sequence must be a string')

    query = [('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'),
             ('PROGRAM', 'blastp'),] 
    expect = kwargs.pop('expect', 10e-10)
    assert isinstance(expect, (float, int)), 'expect must be a float'
    assert expect > 0, 'expect must be a positive number'
    query.append(('EXPECT', expect))
    hitlist_size = kwargs.pop('hitlist_size', 250)
    assert isinstance(hitlist_size, int), 'hitlist_size must be an integer'
    assert hitlist_size > 0, 'expect must be a positive integer'
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))
    
    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 20))
    
    if kwargs:
        LOGGER.warning("Keyword argument(s) '{0:s}' are not used."
                       .format("', '".join(kwargs.keys())))

    import urllib, urllib2
    
    url = 'http://blast.ncbi.nlm.nih.gov/Blast.cgi'
    
    data = urllib.urlencode(query)
    LOGGER.timeit()
    LOGGER.info('Blast searching NCBI PDB database for "{0:s}..."'
                .format(sequence[:5]))
    request = urllib2.Request(url, data, {'User-agent': 'ProDy'})
    handle = urllib2.urlopen(request)
    
    html = handle.read()
    index = html.find('RID =')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find('\n', index)
        rid = html[index + len('RID ='):last].strip()

    index = html.find('RTOE =')
    if index == -1:
        rtoe = None # This is not used
    else:
        last = html.find('\n', index)
        rtoe = int(html[index + len('RTOE ='):last].strip())

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500), 
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urllib.urlencode(query)
    
    while True:
        LOGGER.sleep(int(sleep), ' to connect NCBI for search results.')
        LOGGER.write('Connecting NCBI for search results...')
        request = urllib2.Request(url, data, {'User-agent': 'ProDy'})
        handle = urllib2.urlopen(request)
        results = handle.read()
        index = results.find('Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index('\n', index)
        status = results[index+len('Status='):last].strip()
        if status.upper() == 'READY':
            break
        sleep *= 2
        if LOGGER.timing() > timeout:
            LOGGER.warning('Blast search time out.')
            return None
    LOGGER.clear()
    LOGGER.timing('Blast search completed in %.1fs.')
    if isinstance(filename, str):
        if not filename.lower().endswith('.xml'):
                filename += '.xml'        
        out = open(filename, 'w')
        out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0:s}.'.format(filename))
    return PDBBlastRecord(results, sequence)
예제 #4
0
def blastPDBUniProtKB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching of ProteinDataBank database *sequence* using NCBI blastp.

    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is doubled when results are not ready.  *timeout*
    (default is 120s) determines when to give up waiting for the results. 
    *num_sequences (default is ``1``)
    """

    num_sequences = int(kwargs.pop('num_sequences', 1))
    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')
    else:
        if num_sequences == 1:
            try:
                sequence = ''.join(sequence.split())
                _ = sequence.isalpha()
            except AttributeError:
                raise TypeError('sequence must be a string')
            else:
                if not _:
                    raise ValueError('not a valid protein sequence')
                    
    headers = {'User-agent': 'ProDy'}

    query = [('DATABASE', 'swissprot'), ('ENTREZ_QUERY', '(none)'),
             ('PROGRAM', 'blastp'),]
    expect = float(kwargs.pop('expect', 10e-5))
    if expect <= 0:
        raise ValueError('expect must be a positive number')
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    if hitlist_size <= 0:
        raise ValueError('expect must be a positive integer')
    psiblast = 'true'
    step_number = 3
    query.append(('RUN_PSIBLAST', psiblast))
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))
    query.append(('STEP_NUMBER', step_number))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    if kwargs:
        LOGGER.warn('Keyword argument(s) {0} are not used.'
                    .format(', '.join([repr(key) for key in kwargs])))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib.parse import urlencode

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'
                .format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b'name="RID" type="hidden" value="')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find(b'>',index)
        rid = html[index + len('name="RID" type="hidden" value="'):last-1].strip()

    index = html.find(b'name="RTOE" type="hidden" value="')
    if index == -1:
        rtoe = None # This is not used
    else:
        last = html.find(b'>', index)
        rtoe = html[index + len('name="RTOE" type="hidden" value="'):last-1].strip()

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect NCBI for search results.')
        LOGGER.write('Connecting NCBI for search results...')
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b'Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b'\n', index)
        status = results[index+len('Status='):last].strip()
        if status.upper() == 'READY':
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None
    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')
    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        out = open(filename, 'w')
        out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))
    return SwissProtBlastRecord(results, sequence)
예제 #5
0
def blastPDBUniProtKB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching of ProteinDataBank database *sequence* using NCBI blastp.

    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is doubled when results are not ready.  *timeout*
    (default is 120s) determines when to give up waiting for the results. 
    *num_sequences (default is ``1``)
    """

    num_sequences = int(kwargs.pop('num_sequences', 1))
    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')
    else:
        if num_sequences == 1:
            try:
                sequence = ''.join(sequence.split())
                _ = sequence.isalpha()
            except AttributeError:
                raise TypeError('sequence must be a string')
            else:
                if not _:
                    raise ValueError('not a valid protein sequence')
                    
    headers = {'User-agent': 'ProDy'}

    query = [('DATABASE', 'swissprot'), ('ENTREZ_QUERY', '(none)'),
             ('PROGRAM', 'blastp'),]
    expect = float(kwargs.pop('expect', 10e-5))
    if expect <= 0:
        raise ValueError('expect must be a positive number')
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    if hitlist_size <= 0:
        raise ValueError('expect must be a positive integer')
    psiblast = 'true'
    step_number = 3
    query.append(('RUN_PSIBLAST', psiblast))
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))
    query.append(('STEP_NUMBER', step_number))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    if kwargs:
        LOGGER.warn('Keyword argument(s) {0} are not used.'
                    .format(', '.join([repr(key) for key in kwargs])))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'
                .format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b'name="RID" type="hidden" value="')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find(b'>',index)
        rid = html[index + len('name="RID" type="hidden" value="'):last-1].strip()

    index = html.find(b'name="RTOE" type="hidden" value="')
    if index == -1:
        rtoe = None # This is not used
    else:
        last = html.find(b'>', index)
        rtoe = html[index + len('name="RTOE" type="hidden" value="'):last-1].strip()

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect NCBI for search results.')
        LOGGER.write('Connecting NCBI for search results...')
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b'Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b'\n', index)
        status = results[index+len('Status='):last].strip()
        if status.upper() == 'READY':
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None
    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')
    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        out = open(filename, 'w')
        out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))
    return SwissProtBlastRecord(results, sequence)
예제 #6
0
파일: dali.py 프로젝트: creageng/ProDy
 def getRecord(self, url, localFile=False):
     if localFile:
         dali_file = open(url, 'r')
         data = dali_file.read()
         dali_file.close()
     else:
         sleep = 2
         timeout = 120
         LOGGER.timeit('_dali')
         log_message = ''
         try_error = 3
         while True:
             LOGGER.sleep(int(sleep), 'to reconnect Dali '+log_message)
             LOGGER.clear()
             LOGGER.write('Connecting Dali for search results...')
             LOGGER.clear()
             try:
                 html = urllib2.urlopen(url).read()
             except:
                 try_error -= 1
                 if try_error >= 0:
                     LOGGER.sleep(2, '. Connection error happened. Trying to reconnect...')
                     continue
                 else:
                     html = urllib2.urlopen(url).read()
             if html.find('Status: Queued') > -1:
                 log_message = '(Dali searching is queued)...'
             elif html.find('Status: Running') > -1:
                 log_message = '(Dali searching is running)...'
             elif html.find('Your job') == -1 and html.find('.txt') > -1:
                 break
             elif html.find('ERROR:') > -1:
                 LOGGER.warn(': Dali search reported an ERROR!')
                 return None
                 break
             sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
             if LOGGER.timing('_dali') > timeout:
                 LOGGER.warn(': Dali search is time out. \nThe results can be obtained using getRecord() function later.')
                 return None
                 break
             LOGGER.clear()
         LOGGER.clear()
         LOGGER.report('Dali results completed in %.1fs.', '_dali')
         lines = html.strip().split('\n')
         file_name = re.search('=.+-90\.txt', html).group()[1:]
         file_name = file_name[:-7]
         # LOGGER.info(url+file_name+self._subset+'.txt')
         data = urllib2.urlopen(url+file_name+self._subset+'.txt').read()
         temp_name = file_name+self._subset+'_dali.txt'
         with open(temp_name, "w") as file_temp: file_temp.write(html + '\n' + url+file_name + '\n' + data)
         # with open(temp_name, "a+") as file_temp: file_temp.write(url+file_name + '\n' + data)
     data_list = data.strip().split('# ')
     # No:  Chain   Z    rmsd lali nres  %id PDB  Description -> data_list[3]
     # Structural equivalences -> data_list[4]
     # Translation-rotation matrices -> data_list[5]
     map_temp_dict = dict()
     mapping = []
     lines = data_list[4].strip().split('\n')
     self._lines_4 = lines
     mapping_temp = np.genfromtxt(lines[1:], delimiter = (4,1,14,6,2,4,4,5,2,4,4,3,5,4,3,5,6,3,5,4,3,5,28), usecols = [0,3,5,7,9,12,15,15,18,21], dtype='|i4')
     # [0,3,5,7,9,12,15,15,18,21] -> [index, residue_a, residue_b, residue_i_a, residue_i_b, resid_a, resid_b, resid_i_a, resid_i_b]
     for map_i in mapping_temp:
         if not map_i[0] in map_temp_dict:
             map_temp_dict[map_i[0]] = [[map_i[1], map_i[2], map_i[3], map_i[4]]]
         else:
             map_temp_dict[map_i[0]].append([map_i[1], map_i[2], map_i[3], map_i[4]])
     self._max_index = max(mapping_temp[:,2])
     self._mapping = map_temp_dict
     self._data = data_list[3]
     lines = data_list[3].strip().split('\n')
     daliInfo = np.genfromtxt(lines[1:], delimiter = (4,3,6,5,5,5,6,5,57), usecols = [0,2,3,4,5,6,7,8], dtype=[('id', '<i4'), ('pdb_chain', '|S6'), ('Z', '<f4'), ('rmsd', '<f4'), ('len_align', '<i4'), ('res_num', '<i4'), ('identity', '<i4'), ('title', '|S70')])
     if daliInfo.ndim == 0:
         daliInfo = np.array([daliInfo])
     pdbListAll = []
     self._daliInfo = daliInfo
     dali_temp_dict = dict()
     for temp in self._daliInfo:
         temp_dict = dict()
         pdb_chain = temp[1].strip()[0:6]
         temp_dict['pdbId'] = pdb_chain[0:4]
         temp_dict['chainId'] = pdb_chain[5:6]
         temp_dict['pdb_chain'] = pdb_chain
         temp_dict['Z'] = temp[2]
         temp_dict['rmsd'] = temp[3]
         temp_dict['len_align'] = temp[4]
         temp_dict['res_num'] = temp[5]
         temp_dict['identity'] = temp[6]
         temp_dict['mapping'] = (np.array(map_temp_dict[temp[0]])-1).tolist()
         temp_dict['map_ref'] = [x for map_i in (np.array(map_temp_dict[temp[0]])-1).tolist() for x in range(map_i[0], map_i[1]+1)]
         temp_dict['map_sel'] = [x for map_i in (np.array(map_temp_dict[temp[0]])-1).tolist() for x in range(map_i[2], map_i[3]+1)]
         dali_temp_dict[temp_dict['pdb_chain']] = temp_dict
         pdbListAll.append(pdb_chain)
     self._pdbListAll = tuple(pdbListAll)
     self._pdbList = self._pdbListAll
     self._alignPDB = dali_temp_dict
     LOGGER.info(str(len(pdbListAll)) + ' Dali results have been searched.')
     return True
예제 #7
0
    def fetch(self, url=None, localFile=False, **kwargs):
        """Get Dali record from url or file.

        :arg url: url of Dali results page or local dali results file
            If None then the url already associated with the DaliRecord object is used.
        :type url: str

        :arg localFile: whether provided url is a path for a local dali results file
        :type localFile: bool

        :arg timeout: amount of time until the query times out in seconds
            default value is 120
        :type timeout: int

        :arg localfolder: folder in which to find the local file
            default is the current folder
        :type localfolder: str
        """
        if localFile:
            dali_file = open(url, 'r')
            data = dali_file.read()
            dali_file.close()
        else:
            import requests
            
            if url == None:
                url = self._url
            
            sleep = 2
            timeout = kwargs.pop('timeout', 120)
            LOGGER.timeit('_dali')
            log_message = ''
            try_error = 3
            while True:
                LOGGER.write('Connecting to Dali for search results...')
                LOGGER.clear()
                try:
                    # html = urllib2.urlopen(url).read()
                    html = requests.get(url).content
                except:
                    try_error -= 1
                    if try_error >= 0:
                        LOGGER.sleep(2, '. Connection error happened. Trying to reconnect...')
                        continue
                    else:
                        # html = urllib2.urlopen(url).read()
                        html = requests.get(url).content
                if PY3K:
                    html = html.decode()
                if html.find('Status: Queued') > -1:
                    log_message = '(Dali search is queued)...'
                elif html.find('Status: Running') > -1:
                    log_message = '(Dali search is running)...'
                elif html.find('Your job') == -1 and html.find('.txt') > -1:
                    break
                elif html.find('ERROR:') > -1:
                    LOGGER.warn(': Dali search reported an ERROR!')
                    return False
                sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
                if LOGGER.timing('_dali') > timeout:
                    LOGGER.warn(': Dali search has timed out. \nThe results can be obtained later using the fetch() method.')
                    return False
                LOGGER.sleep(int(sleep), 'to reconnect to Dali '+log_message)
                LOGGER.clear()
            LOGGER.clear()
            LOGGER.report('Dali results were fetched in %.1fs.', '_dali')
            lines = html.strip().split('\n')
            file_name = re.search('=.+-90\\.txt', html).group()[1:]
            file_name = file_name[:-7]
            # LOGGER.info(url+file_name+self._subset+'.txt')
            # data = urllib2.urlopen(url+file_name+self._subset+'.txt').read()
            data = requests.get(url+file_name+self._subset+'.txt').content
            if PY3K:
                data = data.decode()
            localfolder = kwargs.pop('localfolder', '.')

            if file_name.lower().startswith('s001'):
                temp_name = self._pdbId + self._chain
            else:
                temp_name = file_name
            temp_name += self._subset + '_dali.txt'
            if localfolder != '.' and not os.path.exists(localfolder):
                os.mkdir(localfolder)
            with open(localfolder+os.sep+temp_name, "w") as file_temp: file_temp.write(html + '\n' + url+file_name+self._subset+'.txt' + '\n' + data)
            # with open(temp_name, "a+") as file_temp: file_temp.write(url+file_name + '\n' + data)
        data_list = data.strip().split('# ')
        # No:  Chain   Z    rmsd lali nres  %id PDB  Description -> data_list[3]
        # Structural equivalences -> data_list[4]
        # Translation-rotation matrices -> data_list[5]
        map_temp_dict = dict()
        lines = data_list[4].strip().split('\n')
        self._lines_4 = lines
        mapping_temp = np.genfromtxt(lines[1:], delimiter = (4,1,14,6,2,4,4,5,2,4,4,3,5,4,3,5,6,3,5,4,3,5,28), 
                                     usecols = [0,3,5,7,9,12,15,15,18,21], dtype='|i4')
        # [0,3,5,7,9,12,15,15,18,21] -> [index, residue_a, residue_b, residue_i_a, residue_i_b, resid_a, resid_b, resid_i_a, resid_i_b]
        for map_i in mapping_temp:
            if not map_i[0] in map_temp_dict:
                map_temp_dict[map_i[0]] = [[map_i[1], map_i[2], map_i[3], map_i[4]]]
            else:
                map_temp_dict[map_i[0]].append([map_i[1], map_i[2], map_i[3], map_i[4]])
        self._max_index = max(mapping_temp[:,2])
        self._mapping = map_temp_dict
        self._data = data_list[3]
        lines = data_list[3].strip().split('\n')
        # daliInfo = np.genfromtxt(lines[1:], delimiter = (4,3,6,5,5,5,6,5,57), usecols = [0,2,3,4,5,6,7,8], 
                                # dtype=[('id', '<i4'), ('pdb_chain', '|S6'), ('Z', '<f4'), ('rmsd', '<f4'), 
                                # ('len_align', '<i4'), ('nres', '<i4'), ('identity', '<i4'), ('title', '|S70')])
        daliInfo = np.genfromtxt(lines[1:], delimiter = (4,3,6,5,5,5,6,5,57), usecols = [0,2,3,4,5,6,7,8], 
                                dtype=[('id', '<i4'), ('pdb_chain', '|U6'), ('Z', '<f4'), ('rmsd', '<f4'), 
                                ('len_align', '<i4'), ('nres', '<i4'), ('identity', '<i4'), ('title', '|U70')])
        if daliInfo.ndim == 0:
            daliInfo = np.array([daliInfo])
        pdbListAll = []
        self._daliInfo = daliInfo
        dali_temp_dict = dict()
        for temp in self._daliInfo:
            temp_dict = dict()
            pdb_chain = temp[1].strip()[0:6]
            # U6 and U70 were used as the dtype for np.genfromtext -> unicode string were used in daliInfo 
            # if PY3K:
                # pdb_chain = pdb_chain.decode()
            pdb_chain = str(pdb_chain)
            temp_dict['pdbId'] = pdbid = pdb_chain[0:4].lower()
            temp_dict['chainId'] = chid = pdb_chain[5:6]
            temp_dict['pdb_chain'] = pdb_chain = pdbid + chid
            temp_dict['Z'] = temp[2]
            temp_dict['rmsd'] = temp[3]
            temp_dict['len_align'] = temp[4]
            temp_dict['nres'] = temp[5]
            temp_dict['identity'] = temp[6]
            temp_dict['mapping'] = (np.array(map_temp_dict[temp[0]])-1).tolist()
            temp_dict['map_ref'] = [x for map_i in (np.array(map_temp_dict[temp[0]])-1).tolist() for x in range(map_i[0], map_i[1]+1)]
            temp_dict['map_sel'] = [x for map_i in (np.array(map_temp_dict[temp[0]])-1).tolist() for x in range(map_i[2], map_i[3]+1)]
            dali_temp_dict[pdb_chain] = temp_dict
            pdbListAll.append(pdb_chain)
        self._pdbListAll = tuple(pdbListAll)
        self._pdbList = self._pdbListAll
        self._alignPDB = dali_temp_dict
        LOGGER.info('Obtained ' + str(len(pdbListAll)) + ' PDB chains from Dali for '+self._pdbId+self._chain+'.')
        return True
예제 #8
0
파일: blastpdb.py 프로젝트: prody/ProDy
def blastPDB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching of ProteinDataBank database *sequence* using NCBI blastp.

    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is doubled when results are not ready.  *timeout*
    (default is 120s) determines when to give up waiting for the results.
    """

    if sequence == "runexample":
        sequence = (
            "ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI"
            "SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN"
            "DAYDIVKMKKSNISPNFNFMGQLLDFERTL"
        )
    else:
        try:
            sequence = "".join(sequence.split())
            _ = sequence.isalpha()
        except AttributeError:
            raise TypeError("sequence must be a string")
        else:
            if not _:
                raise ValueError("not a valid protein sequence")
    headers = {"User-agent": "ProDy"}

    query = [("DATABASE", "pdb"), ("ENTREZ_QUERY", "(none)"), ("PROGRAM", "blastp")]
    expect = float(kwargs.pop("expect", 10e-10))
    if expect <= 0:
        raise ValueError("expect must be a positive number")
    query.append(("EXPECT", expect))
    hitlist_size = int(kwargs.pop("hitlist_size", 250))
    if hitlist_size <= 0:
        raise ValueError("expect must be a positive integer")
    query.append(("HITLIST_SIZE", hitlist_size))
    query.append(("QUERY", sequence))
    query.append(("CMD", "Put"))

    sleep = float(kwargs.pop("sleep", 2))
    timeout = float(kwargs.pop("timeout", 120))

    if kwargs:
        LOGGER.warn("Keyword argument(s) {0} are not used.".format(", ".join([repr(key) for key in kwargs])))

    try:
        import urllib.parse

        urlencode = lambda data: bytes(urllib.parse.urlencode(data), "utf-8")
    except ImportError:
        from urllib import urlencode

    url = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"

    data = urlencode(query)
    LOGGER.timeit("_prody_blast")
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'.format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b"RID =")
    if index == -1:
        raise Exception("NCBI did not return expected response.")
    else:
        last = html.find(b"\n", index)
        rid = html[index + len("RID =") : last].strip()

    index = html.find(b"RTOE =")
    if index == -1:
        rtoe = None  # This is not used
    else:
        last = html.find(b"\n", index)
        rtoe = int(html[index + len("RTOE =") : last].strip())

    query = [("ALIGNMENTS", 500), ("DESCRIPTIONS", 500), ("FORMAT_TYPE", "XML"), ("RID", rid), ("CMD", "Get")]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), "to reconnect NCBI for search results.")
        LOGGER.write("Connecting NCBI for search results...")
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b"Status=")
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b"\n", index)
        status = results[index + len("Status=") : last].strip()
        if status.upper() == "READY":
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing("_prody_blast") > timeout:
            LOGGER.warn("Blast search time out.")
            return None
    LOGGER.clear()
    LOGGER.report("Blast search completed in %.1fs.", "_prody_blast")
    try:
        ext_xml = filename.lower().endswith(".xml")
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += ".xml"
        out = open(filename, "w")
        out.write(results)
        out.close()
        LOGGER.info("Results are saved as {0}.".format(repr(filename)))
    return PDBBlastRecord(results, sequence)
예제 #9
0
파일: blastpdb.py 프로젝트: uibcdf/ProDy
    def fetch(self, xml=None, sequence=None, **kwargs):
        """Get Blast record from url or file.

        :arg sequence: an object with an associated sequence string 
            or a sequence string itself
        :type sequence: :class:`Atomic`, :class:`Sequence`, or str

        :arg xml: blast search results in XML format or an XML file that
            contains the results or a filename for saving the results or None
        :type xml: str

        :arg timeout: amount of time until the query times out in seconds
            default value is 120
        :type timeout: int
        """
        if self.isSuccess:
            LOGGER.warn(
                "The record already exists so not further search is performed")
            return True

        if sequence == None:
            sequence = self._sequence

        if xml == None:
            xml = self._xml

        import xml.etree.cElementTree as ET
        if xml is not None and len(xml) < 100:
            if os.path.isfile(xml):
                xml = ET.parse(xml)
                root = xml.getroot()
            else:
                raise ValueError('xml is not a filename and does not look like'
                                 ' a valid XML string')
        else:

            headers = {'User-agent': 'ProDy'}
            query = [
                ('DATABASE', 'pdb'),
                ('ENTREZ_QUERY', '(none)'),
                ('PROGRAM', 'blastp'),
            ]

            expect = float(kwargs.pop('expect', 10e-10))
            if expect <= 0:
                raise ValueError('expect must be a positive number')
            query.append(('EXPECT', expect))
            hitlist_size = int(kwargs.pop('hitlist_size', 250))
            if hitlist_size <= 0:
                raise ValueError('expect must be a positive integer')
            query.append(('HITLIST_SIZE', hitlist_size))
            query.append(('QUERY', sequence))
            query.append(('CMD', 'Put'))

            sleep = float(kwargs.pop('sleep', 2))
            timeout = float(kwargs.pop('timeout', self._timeout))
            self._timeout = timeout

            try:
                import urllib.parse
                urlencode = lambda data: bytes(urllib.parse.urlencode(data),
                                               'utf-8')
            except ImportError:
                from urllib import urlencode

            url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

            data = urlencode(query)
            LOGGER.timeit('_prody_blast')
            LOGGER.info(
                'Blast searching NCBI PDB database for "{0}..."'.format(
                    sequence[:5]))
            handle = openURL(url, data=data, headers=headers)

            html = handle.read()
            index = html.find(b'RID =')
            if index == -1:
                raise Exception('NCBI did not return expected response.')
            else:
                last = html.find(b'\n', index)
                rid = html[index + len('RID ='):last].strip()

            query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
                     ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
            data = urlencode(query)

            while True:
                LOGGER.sleep(int(sleep),
                             'to reconnect to NCBI for search results.')
                LOGGER.write('Connecting to NCBI for search results...')
                handle = openURL(url, data=data, headers=headers)
                results = handle.read()
                index = results.find(b'Status=')
                LOGGER.clear()
                if index < 0:
                    break
                last = results.index(b'\n', index)
                status = results[index + len('Status='):last].strip()
                if status.upper() == b'READY':
                    break
                sleep = int(sleep * 1.5)
                if LOGGER.timing('_prody_blast') > timeout:
                    LOGGER.warn('Blast search time out.')
                    return False

            LOGGER.clear()
            LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')

            filename = xml
            root = ET.XML(results)
            try:
                ext_xml = filename.lower().endswith('.xml')
            except AttributeError:
                pass
            else:
                if not ext_xml:
                    filename += '.xml'
                out = open(filename, 'w')
                if PY3K:
                    out.write(results.decode())
                else:
                    out.write(results)
                out.close()
                LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

            root = dictElement(root, 'BlastOutput_')
            if root['db'] != 'pdb':
                raise ValueError('blast search database in xml must be "pdb"')
            if root['program'] != 'blastp':
                raise ValueError(
                    'blast search program in xml must be "blastp"')
            self._param = dictElement(root['param'][0], 'Parameters_')

            query_len = int(root['query-len'])
            if sequence and len(sequence) != query_len:
                raise ValueError(
                    'query-len and the length of the sequence do not '
                    'match, xml data may not be for given sequence')
            hits = []
            for iteration in root['iterations']:
                for hit in dictElement(iteration, 'Iteration_')['hits']:
                    hit = dictElement(hit, 'Hit_')
                    data = dictElement(hit['hsps'][0], 'Hsp_')
                    for key in [
                            'align-len', 'gaps', 'hit-frame', 'hit-from',
                            'hit-to', 'identity', 'positive', 'query-frame',
                            'query-from', 'query-to'
                    ]:
                        data[key] = int(data[key])
                    data['query-len'] = query_len
                    for key in ['evalue', 'bit-score', 'score']:
                        data[key] = float(data[key])
                    p_identity = 100.0 * data['identity'] / (
                        data['query-to'] - data['query-from'] + 1)
                    data['percent_identity'] = p_identity
                    p_overlap = (100.0 * (data['align-len'] - data['gaps']) /
                                 query_len)
                    data['percent_coverage'] = p_overlap

                    for item in (hit['id'] + hit['def']).split('>gi'):
                        head, title = item.split(None, 1)
                        head = head.split('|')
                        pdb_id = head[-2].lower()
                        chain_id = head[-1][:1]
                        pdbch = dict(data)
                        pdbch['pdb_id'] = pdb_id
                        pdbch['chain_id'] = chain_id
                        pdbch['title'] = (head[-1][1:] + title).strip()
                        hits.append((p_identity, p_overlap, pdbch))
            hits.sort(key=lambda hit: hit[0], reverse=True)
            self._hits = hits

        return True
예제 #10
0
파일: blastpdb.py 프로젝트: fongchun/ProDy
def blastPDB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching *sequence* against the PDB using NCBI blastp.

    :arg sequence: an object with an associated sequence string 
         or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is multiplied by 1.5 when results are not ready.  
    *timeout* (default is 120 s) determines when to give up waiting for the results.
    """

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')

    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()

    elif isinstance(sequence, Sequence):
        sequence = str(sequence)

    elif isinstance(sequence, str):
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())

    else:
        raise TypeError('sequence must be Atomic, Sequence, or str not {0}'
                        .format(type(sequence)))

    headers = {'User-agent': 'ProDy'}
    query = [('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'),
             ('PROGRAM', 'blastp'),]

    expect = float(kwargs.pop('expect', 10e-10))
    if expect <= 0:
        raise ValueError('expect must be a positive number')
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    if hitlist_size <= 0:
        raise ValueError('expect must be a positive integer')
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'
                .format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b'RID =')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find(b'\n', index)
        rid = html[index + len('RID ='):last].strip()

    index = html.find(b'RTOE =')
    if index == -1:
        rtoe = None # This is not used
    else:
        last = html.find(b'\n', index)
        rtoe = int(html[index + len('RTOE ='):last].strip())

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect NCBI for search results.')
        LOGGER.write('Connecting to NCBI for search results...')
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b'Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b'\n', index)
        status = results[index+len('Status='):last].strip()
        if status.upper() == 'READY':
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None
    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')

    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        out = open(filename, 'w')
        out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

    return PDBBlastRecord(results, sequence)
예제 #11
0
def blastPDB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching *sequence* against the PDB using NCBI blastp.

    :arg sequence: an object with an associated sequence string 
         or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is multiplied by 1.5 when results are not ready.  
    *timeout* (default is 120 s) determines when to give up waiting for the results.
    """

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')

    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()

    elif isinstance(sequence, Sequence):
        sequence = str(sequence)

    elif isinstance(sequence, str):
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())

    else:
        raise TypeError(
            'sequence must be Atomic, Sequence, or str not {0}'.format(
                type(sequence)))

    headers = {'User-agent': 'ProDy'}
    query = [
        ('DATABASE', 'pdb'),
        ('ENTREZ_QUERY', '(none)'),
        ('PROGRAM', 'blastp'),
    ]

    expect = float(kwargs.pop('expect', 10e-10))
    if expect <= 0:
        raise ValueError('expect must be a positive number')
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    if hitlist_size <= 0:
        raise ValueError('expect must be a positive integer')
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'.format(
        sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b'RID =')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find(b'\n', index)
        rid = html[index + len('RID ='):last].strip()

    index = html.find(b'RTOE =')
    if index == -1:
        rtoe = None  # This is not used
    else:
        last = html.find(b'\n', index)
        rtoe = int(html[index + len('RTOE ='):last].strip())

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect NCBI for search results.')
        LOGGER.write('Connecting to NCBI for search results...')
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b'Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b'\n', index)
        status = results[index + len('Status='):last].strip()
        if status.upper() == 'READY':
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None

    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')

    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        out = open(filename, 'w')
        if PY3K:
            out.write(results.decode())
        else:
            out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

    return PDBBlastRecord(results, sequence)