Example #1
def loadAtoms(filename):
    """Return :class:`AtomGroup` instance from *filename*.  This function makes
    use of :func:`numpy.load` function.  See also :func:`saveAtoms`."""
    
    LOGGER.timeit()
    attr_dict = load(filename)
    files = set(attr_dict.files)
    if 'n_atoms' not in files:
        raise ValueError("'{0:s}' is not a valid atomic data file"
                         .format(filename))
    title = str(attr_dict['title'])
    ag = AtomGroup(title)  # create the group up front so it exists even when
                           # the file contains no coordinates
    if 'coordinates' in files:
        ag._coords = attr_dict['coordinates']
        ag._n_csets = int(attr_dict['n_csets'])
    ag._n_atoms = int(attr_dict['n_atoms'])
    ag._setTimeStamp()
    if 'bonds' in files and 'bmap' in files and 'numbonds' in files:
        ag._bonds = attr_dict['bonds']
        ag._bmap = attr_dict['bmap']
        ag._data['numbonds'] = attr_dict['numbonds']
    for key, data in attr_dict.items():  # .iteritems() was Python 2 only
        if key in SKIPLOAD:
            continue
        if key in ATOMIC_ATTRIBUTES:
            ag._data[key] = data
        else:
            ag.setData(key, data)
    if ag.numCoordsets() > 0:
        ag._acsi = 0
    if 'cslabels' in files:
        ag.setCSLabels(list(attr_dict['cslabels']))
    LOGGER.timing('Atom group was loaded in %.2fs.')
    return ag
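
A minimal usage sketch for loadAtoms, assuming a file previously written by the companion saveAtoms function (the 1mkp identifier and the .ag.npz suffix are illustrative):

from prody import parsePDB, saveAtoms, loadAtoms

ag = parsePDB('1mkp')              # parse a structure into an AtomGroup
saveAtoms(ag, '1mkp')              # writes 1mkp.ag.npz via numpy.savez
loaded = loadAtoms('1mkp.ag.npz')  # round-trips the atomic data
print(loaded.numAtoms(), loaded.numCoordsets())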
Example #2
def parsePQR(filename, **kwargs):
    """Return an :class:`~.AtomGroup` containing data parsed 
    from PDB lines. 
    
    :arg filename: a PQR filename
    :type filename: str"""
    
    title = kwargs.get('title', kwargs.get('name'))
    model = 1
    header = False
    chain = kwargs.get('chain')
    subset = kwargs.get('subset')
    altloc = kwargs.get('altloc', 'A')
    if not os.path.isfile(filename):
        raise IOError('No such file: {0:s}'.format(repr(filename)))
    if title is None:
        fn, ext = os.path.splitext(os.path.split(filename)[1])
        if ext == '.gz':
            fn, ext = os.path.splitext(fn)
        title = fn.lower()
    title_suffix = ''
    if subset is not None:
        if not isinstance(subset, str):
            raise TypeError('subset must be a string')
        elif subset.lower() not in _PDBSubsets:
            raise ValueError('{0:s} is not a valid subset'
                             .format(repr(subset)))
        title_suffix = '_' + _PDBSubsets[subset]
    if chain is not None:
        if not isinstance(chain, str):
            raise TypeError('chain must be a string')
        elif len(chain) == 0:
            raise ValueError('chain must not be an empty string')
        title_suffix = '_' + chain + title_suffix
    if 'ag' in kwargs:
        ag = kwargs['ag']
        if not isinstance(ag, AtomGroup):
            raise TypeError('ag must be an AtomGroup instance')
        n_csets = ag.numCoordsets()
    else:
        ag = AtomGroup(title + title_suffix)
        n_csets = 0
        
    pqr = openFile(filename)
    lines = pqr.readlines()
    pqr.close()
    LOGGER.timeit()
    ag = _parsePDBLines(ag, lines, split=0, model=1, chain=chain, 
                        subset=subset, altloc_torf=False, format='pqr')
    if ag.numAtoms() > 0:
        LOGGER.timing('{0:d} atoms and {1:d} coordinate sets were '
                      'parsed in %.2fs.'.format(ag.numAtoms(), 
                         ag.numCoordsets() - n_csets))
        return ag
    else:
        return None
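
A brief usage sketch; the file path is hypothetical, and parsePQR returns None when no atoms are parsed:

from prody import parsePQR

ag = parsePQR('protein.pqr', chain='A')  # hypothetical local PQR file
if ag is not None:
    print(ag.numAtoms(), 'atoms parsed with charges and radii')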
Example #3
 def superpose(self):
     """Superpose the ensemble onto the reference coordinates."""
     
     if self._coords is None:
         raise ValueError('coordinates are not set, use `setCoords`')
     if self._confs is None or len(self._confs) == 0: 
         raise ValueError('conformations are not set, use `addCoordset`')
     LOGGER.timeit()
     self._superpose(trans=True) # trans kwarg is used by PDBEnsemble
     LOGGER.timing('Superposition completed in %.2f seconds.')
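
A minimal sketch of calling superpose on an ensemble, assuming a multi-model NMR structure (2k39 is used illustratively):

from prody import parsePDB, Ensemble

ag = parsePDB('2k39', subset='ca')   # multi-model NMR structure
ens = Ensemble('2k39 CA ensemble')
ens.setCoords(ag.getCoords())        # reference coordinates
ens.addCoordset(ag.getCoordsets())   # all models as conformations
ens.superpose()                      # align every conformation to the reference
print(ens.getRMSDs().mean())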
Example #4
 def iterpose(self, rmsd=0.0001):
     """Iteratively superpose the ensemble until convergence.  Initially, 
     all conformations are aligned with the reference coordinates.  Then 
     mean coordinates are calculated, and are set as the new reference 
     coordinates.  This is repeated until reference coordinates do not 
     change.  This is determined by the value of RMSD between the new and 
     old reference coordinates.        
     
     :arg rmsd: change in reference coordinates to determine convergence,
         default is 0.0001 Å RMSD
     :type rmsd: float"""
     
     if self._coords is None:
         raise AttributeError('coordinates are not set, use `setCoords`')
     if self._confs is None or len(self._confs) == 0: 
         raise AttributeError('conformations are not set, use `addCoordset`'
                              )
     LOGGER.info('Starting iterative superposition:')
     LOGGER.timeit()
     rmsdif = 1
     step = 0
     weights = self._weights
     if weights is not None and weights.ndim == 3:
         weightsum = weights.sum(axis=0)
     length = len(self)
     while rmsdif > rmsd:
         self._superpose()
         if weights is None:
             newxyz = self._confs.sum(0) / length
         else:
             newxyz = (self._confs * weights).sum(0) / weightsum
         rmsdif = getRMSD(self._coords, newxyz)
         self._coords = newxyz
         step += 1
         LOGGER.info(('Step #{0:d}: RMSD difference = '
                            '{1:.4e}').format(step, rmsdif))
     LOGGER.timing('Iterative superposition completed in %.2fs.')
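
The same setup works for iterative superposition; a sketch assuming an ensemble built as in the previous example:

from prody import parsePDB, Ensemble

ag = parsePDB('2k39', subset='ca')
ens = Ensemble('2k39 CA ensemble')
ens.setCoords(ag.getCoords())
ens.addCoordset(ag.getCoordsets())
ens.iterpose(rmsd=0.0001)   # superpose, recompute the mean, repeat to convergence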
Example #5
def searchPfam(query, search_b=False, skip_a=False, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file; sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg search_b: search Pfam-B families when **True**
    :type search_b: bool

    :arg skip_a: do not search Pfam-A families when **True**
    :type skip_a: bool

    :arg ga: use gathering threshold when **True**
    :type ga: bool

    :arg evalue: user specified e-value cutoff, must be smaller than 10.0
    :type evalue: float

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = '{http://pfam.sanger.ac.uk/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        urlextension = ''
        if kwargs:
            ga = int(kwargs.get('ga', 1))
            if not (ga == 1 or ga == 0):
                raise ValueError('ga must be either 0 or 1')

            evalue = kwargs.get('evalue', None)
            if evalue:
                if not float(evalue) <= 10.0:
                    raise ValueError('evalue must be a valid float < 10.0')
                urlextension = urlextension + '&evalue=' + str(evalue)
            else:
                urlextension = urlextension + '&ga=' + str(ga)

        search_b = int(bool(search_b))
        skip_a = int(bool(skip_a))
        if skip_a == 1:
            search_b = 1

        urlextension = urlextension + '&searchBs=' + str(search_b)
        urlextension = urlextension + '&skipAs=' + str(skip_a)
        url = ('http://pfam.sanger.ac.uk/search/sequence?seq=' + str(seq) +
               urlextension + '&output=xml')
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception as err:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        try:
            url = dictElement(root[0], prefix)['result_url']
        except (IndexError, KeyError):
            raise ValueError('failed to parse results XML, check URL: ' + url)

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml'
            else:
                url = ('http://pfam.sanger.ac.uk/protein/' +
                       idcode + '?output=xml')

        else:
            url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break  # without the break, polling continues until timeout

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
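
A hedged usage sketch for the function above; note that the pfam.sanger.ac.uk host it queries has since been retired, so the call is illustrative only:

from prody import searchPfam

matches = searchPfam('P19491')   # a UniProt ID query
if matches:
    for accession, info in matches.items():
        print(accession, info.get('locations'))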
Example #6
def blastPDBUniProtKB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching of ProteinDataBank database *sequence* using NCBI blastp.

    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``10e-5``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is multiplied by 1.5 when results are not ready.
    *timeout* (default is 120 s) determines when to give up waiting for the
    results.  *num_sequences* (default is ``1``) gives the number of query
    sequences; the sequence string is only validated when it is ``1``.
    """

    num_sequences = int(kwargs.pop('num_sequences', 1))
    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')
    else:
        if num_sequences == 1:
            try:
                sequence = ''.join(sequence.split())
                _ = sequence.isalpha()
            except AttributeError:
                raise TypeError('sequence must be a string')
            else:
                if not _:
                    raise ValueError('not a valid protein sequence')
                    
    headers = {'User-agent': 'ProDy'}

    query = [('DATABASE', 'swissprot'), ('ENTREZ_QUERY', '(none)'),
             ('PROGRAM', 'blastp'),]
    expect = float(kwargs.pop('expect', 10e-5))
    if expect <= 0:
        raise ValueError('expect must be a positive number')
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    if hitlist_size <= 0:
        raise ValueError('hitlist_size must be a positive integer')
    psiblast = 'true'
    step_number = 3
    query.append(('RUN_PSIBLAST', psiblast))
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))
    query.append(('STEP_NUMBER', step_number))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    if kwargs:
        LOGGER.warn('Keyword argument(s) {0} are not used.'
                    .format(', '.join([repr(key) for key in kwargs])))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI SwissProt database for "{0}..."'
                .format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b'name="RID" type="hidden" value="')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find(b'>',index)
        rid = html[index + len('name="RID" type="hidden" value="'):last-1].strip()

    index = html.find(b'name="RTOE" type="hidden" value="')
    if index == -1:
        rtoe = None # This is not used
    else:
        last = html.find(b'>', index)
        rtoe = html[index + len('name="RTOE" type="hidden" value="'):last-1].strip()

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect NCBI for search results.')
        LOGGER.write('Connecting NCBI for search results...')
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b'Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b'\n', index)
        status = results[index+len('Status='):last].strip()
        if status.upper() == b'READY':  # *results* is bytes, so compare bytes
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None
    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')
    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        out = open(filename, 'wb')  # *results* is bytes
        out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))
    return SwissProtBlastRecord(results, sequence)
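
A usage sketch for the function defined above; passing 'runexample' triggers the built-in demo sequence, so no other input is needed (accessors of SwissProtBlastRecord are not shown in this snippet, so only the record type is inspected):

record = blastPDBUniProtKB('runexample')
if record is not None:
    print(type(record).__name__)   # SwissProtBlastRecord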
Example #7
    def fetch(self, url=None, localFile=False, **kwargs):
        """Get Dali record from url or file.

        :arg url: url of Dali results page or local dali results file.
            If None, the url already associated with the DaliRecord object is used.
        :type url: str

        :arg localFile: whether provided url is a path for a local dali results file
        :type localFile: bool

        :arg timeout: amount of time until the query times out in seconds
            default value is 120
        :type timeout: int

        :arg localfolder: folder in which to find the local file
            default is the current folder
        :type localfolder: str
        """
        if localFile:
            dali_file = open(url, 'r')
            data = dali_file.read()
            dali_file.close()
        else:
            import requests
            
            if url is None:
                url = self._url
            
            sleep = 2
            timeout = kwargs.pop('timeout', 120)
            LOGGER.timeit('_dali')
            log_message = ''
            try_error = 3
            while True:
                LOGGER.write('Connecting to Dali for search results...')
                LOGGER.clear()
                try:
                    # html = urllib2.urlopen(url).read()
                    html = requests.get(url).content
                except Exception:
                    try_error -= 1
                    if try_error >= 0:
                        LOGGER.sleep(2, '. Connection error happened. Trying to reconnect...')
                        continue
                    else:
                        # html = urllib2.urlopen(url).read()
                        html = requests.get(url).content
                if PY3K:
                    html = html.decode()
                if html.find('Status: Queued') > -1:
                    log_message = '(Dali search is queued)...'
                elif html.find('Status: Running') > -1:
                    log_message = '(Dali search is running)...'
                elif html.find('Your job') == -1 and html.find('.txt') > -1:
                    break
                elif html.find('ERROR:') > -1:
                    LOGGER.warn(': Dali search reported an ERROR!')
                    return False
                sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
                if LOGGER.timing('_dali') > timeout:
                    LOGGER.warn(': Dali search has timed out. \nThe results can be obtained later using the fetch() method.')
                    return False
                LOGGER.sleep(int(sleep), 'to reconnect to Dali '+log_message)
                LOGGER.clear()
            LOGGER.clear()
            LOGGER.report('Dali results were fetched in %.1fs.', '_dali')
            lines = html.strip().split('\n')
            file_name = re.search('=.+-90\\.txt', html).group()[1:]
            file_name = file_name[:-7]
            # LOGGER.info(url+file_name+self._subset+'.txt')
            # data = urllib2.urlopen(url+file_name+self._subset+'.txt').read()
            data = requests.get(url+file_name+self._subset+'.txt').content
            if PY3K:
                data = data.decode()
            localfolder = kwargs.pop('localfolder', '.')

            if file_name.lower().startswith('s001'):
                temp_name = self._pdbId + self._chain
            else:
                temp_name = file_name
            temp_name += self._subset + '_dali.txt'
            if localfolder != '.' and not os.path.exists(localfolder):
                os.mkdir(localfolder)
            with open(localfolder + os.sep + temp_name, "w") as file_temp:
                file_temp.write(html + '\n' + url + file_name + self._subset +
                                '.txt' + '\n' + data)
            # with open(temp_name, "a+") as file_temp: file_temp.write(url+file_name + '\n' + data)
        data_list = data.strip().split('# ')
        # No:  Chain   Z    rmsd lali nres  %id PDB  Description -> data_list[3]
        # Structural equivalences -> data_list[4]
        # Translation-rotation matrices -> data_list[5]
        map_temp_dict = dict()
        lines = data_list[4].strip().split('\n')
        self._lines_4 = lines
        mapping_temp = np.genfromtxt(lines[1:], delimiter = (4,1,14,6,2,4,4,5,2,4,4,3,5,4,3,5,6,3,5,4,3,5,28), 
                                     usecols = [0,3,5,7,9,12,15,15,18,21], dtype='|i4')
        # [0,3,5,7,9,12,15,15,18,21] -> [index, residue_a, residue_b, residue_i_a, residue_i_b, resid_a, resid_b, resid_i_a, resid_i_b]
        for map_i in mapping_temp:
            if not map_i[0] in map_temp_dict:
                map_temp_dict[map_i[0]] = [[map_i[1], map_i[2], map_i[3], map_i[4]]]
            else:
                map_temp_dict[map_i[0]].append([map_i[1], map_i[2], map_i[3], map_i[4]])
        self._max_index = max(mapping_temp[:,2])
        self._mapping = map_temp_dict
        self._data = data_list[3]
        lines = data_list[3].strip().split('\n')
        # daliInfo = np.genfromtxt(lines[1:], delimiter = (4,3,6,5,5,5,6,5,57), usecols = [0,2,3,4,5,6,7,8], 
                                # dtype=[('id', '<i4'), ('pdb_chain', '|S6'), ('Z', '<f4'), ('rmsd', '<f4'), 
                                # ('len_align', '<i4'), ('nres', '<i4'), ('identity', '<i4'), ('title', '|S70')])
        daliInfo = np.genfromtxt(lines[1:], delimiter = (4,3,6,5,5,5,6,5,57), usecols = [0,2,3,4,5,6,7,8], 
                                dtype=[('id', '<i4'), ('pdb_chain', '|U6'), ('Z', '<f4'), ('rmsd', '<f4'), 
                                ('len_align', '<i4'), ('nres', '<i4'), ('identity', '<i4'), ('title', '|U70')])
        if daliInfo.ndim == 0:
            daliInfo = np.array([daliInfo])
        pdbListAll = []
        self._daliInfo = daliInfo
        dali_temp_dict = dict()
        for temp in self._daliInfo:
            temp_dict = dict()
            pdb_chain = temp[1].strip()[0:6]
            # U6 and U70 were used as the dtype for np.genfromtext -> unicode string were used in daliInfo 
            # if PY3K:
                # pdb_chain = pdb_chain.decode()
            pdb_chain = str(pdb_chain)
            temp_dict['pdbId'] = pdbid = pdb_chain[0:4].lower()
            temp_dict['chainId'] = chid = pdb_chain[5:6]
            temp_dict['pdb_chain'] = pdb_chain = pdbid + chid
            temp_dict['Z'] = temp[2]
            temp_dict['rmsd'] = temp[3]
            temp_dict['len_align'] = temp[4]
            temp_dict['nres'] = temp[5]
            temp_dict['identity'] = temp[6]
            temp_dict['mapping'] = (np.array(map_temp_dict[temp[0]])-1).tolist()
            temp_dict['map_ref'] = [x for map_i in (np.array(map_temp_dict[temp[0]])-1).tolist() for x in range(map_i[0], map_i[1]+1)]
            temp_dict['map_sel'] = [x for map_i in (np.array(map_temp_dict[temp[0]])-1).tolist() for x in range(map_i[2], map_i[3]+1)]
            dali_temp_dict[pdb_chain] = temp_dict
            pdbListAll.append(pdb_chain)
        self._pdbListAll = tuple(pdbListAll)
        self._pdbList = self._pdbListAll
        self._alignPDB = dali_temp_dict
        LOGGER.info('Obtained ' + str(len(pdbListAll)) + ' PDB chains from Dali for '+self._pdbId+self._chain+'.')
        return True
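
A sketch of a typical fetch call, assuming a DaliRecord produced by ProDy's searchDali (that entry point is an assumption; only fetch and the attributes it sets appear in the code above):

from prody import searchDali   # assumed entry point returning a DaliRecord

rec = searchDali('1mkp', 'A')            # submit a Dali search for 1mkp chain A
if rec.fetch(timeout=300):               # re-poll with a longer timeout if needed
    print('chains obtained:', len(rec._pdbListAll))   # tuple set by fetch() above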
Example #8
def psiBlastCycle(sequence=None, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    a single cycle of EBI psiblast.

    :arg sequence: an object with an associated sequence string 
         or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    The following search parameters can be adjusted by the user.
    We use the same default values as 
    http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/
    wherever applicable.

    :arg email: email address for reporting problems
        default is [email protected]
    :type email: str with an @ before a .

    :arg matrix: The comparison matrix to be used to score alignments when searching the database
        possible values are 'BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30' and 'PAM70' 
        default is 'BLOSUM62'
    :type matrix: str

    :arg gapopen: Penalty taken away from the score when a gap is created in sequence alignments. 
        Increasing the gap opening penalty will decrease the number of gaps in the final alignment.
        Possible values range from 8 to 16 inclusive, default is 11
    :type gapopen: int

    :arg gapext: Penalty taken away from the score for each base or residue in the gap. 
        Increasing the gap extension penalty favors short gaps in the final alignment, 
        conversely, decreasing the gap extension penalty favors long gaps in the final alignment.
        Possible values range from 0 to 3, default is 1
    :type gapext: int

    :arg expthr: Expectation threshold that limits the number of scores and alignments reported. 
        This is the maximum number of times the match is expected to occur by chance.
        Possible values are 1.0e-200, 1.0e-100, 1.0e-50, 1.0e-10, 1.0e-5, 1.0e-4, 1.0e-3,
        1.0e-2, 0.1, 1.0, 10.0, 100, 1000
        default is 10.0
    :type expthr: float

    :arg psithr: Expectation value threshold for automatic selection of matched sequences for 
        inclusion in the PSSM at each iteration.
        Possible values are 1.0e-6, 1.0e-5, 1.0e-4, 2.0e-4, 5.0e-4, 1.0e-3, 2.0e-3, 5.0e-3,
        1.0e-2, 2.0e-2, 0.1, 0.3, 0.5, 1.0, 3.0, 10.0
        default is 1.0e-3
    :type psithr: float

    :arg scores: Maximum number of match score summaries reported in the result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type scores: int

    :arg alignments: Maximum number of match alignments reported in the result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type alignments: int

    :arg dropoff: The amount a score can drop before extension of word hits is halted
        Possible values are 0, 2, 4, 6, 8, 10, 15, 20, 25, or 30
        Default is 15
    :type dropoff: int

    :arg finaldropoff: Dropoff value for final gapped alignment
        Possible values are 10, 12, 14, 16, 18, 20, 22, 24, 25, 26, 28, or 30
        Default is 25
    :type finaldropoff: int

    :arg filter: Filter regions of low sequence complexity. This can avoid issues with 
        low complexity sequences where matches are found due to composition rather than 
        meaningful sequence similarity. However, in some cases filtering also masks 
        regions of interest and so should be used with caution.
        Possible values are T and F, default is F
    :type filter: str

    :arg seqrange: Specify a range or section of the input sequence to use in the search.
        Example: Specifying '34-89' in an input sequence of total length 100, will tell BLAST 
        to only use residues 34 to 89, inclusive.
    :type seqrange: str of form START-END

    :arg database: a database name from those available. See
        http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/database
        default is pdb
    :type database: str

    :arg previousjobid: The job identifier for the previous PSI-BLAST iteration. 
        default is None
        You can change this if you want to continue from a previous run
    :type previousjobid: str

    :arg selectedHits: Name of a file containing a list of identifiers of the 
        hits from the previous iteration to use to construct the search PSSM 
        for this iteration.
        default is None
    :type selectedHits: str

    :arg cpfile: Name of a Checkpoint file from the previous iteration. 
        default is None
    :type cpfile: str

    :arg sleep: how long to wait to reconnect for status
         Sleep time is multiplied by 1.5 when results are not ready.
         default is 2 seconds
    :type sleep: float

    :arg timeout:  when to give up waiting for the results 
        default is 120 seconds
    :type timeout: float

    :arg cycle: cycle number
    :type cycle: int

    """
    cycle = kwargs.get('cycle',0)

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')

    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()

    elif isinstance(sequence, Sequence):
        sequence = str(sequence)

    elif isinstance(sequence, str):
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())

    elif sequence is None:
        if cycle == 0: 
            cycle = 1
    else:
        raise TypeError('sequence must be Atomic, Sequence, or str not {0}'
                        .format(type(sequence)))

    if cycle == 0:
        query = [('sequence', sequence)]
    else:
        query = []

    email = kwargs.get('email','*****@*****.**')
    if not isinstance(email, str):
        raise TypeError('email must be a string')
    elif email.find('@') == -1 or email.find('.') == -1 or len(email.split('@')) != 2:
        raise ValueError('email must be a valid email address with at least one . and exactly one @ sign')
    elif not email.find('@') < email.find(email.split('.')[-1]):
        raise ValueError('email must be a valid email address with a . after the @ sign')
    query.append(('email', email))
    query.append(('title', 'ProDy psiBlastPDB request'))

    previousjobid = kwargs.get('previousjobid','')
    if previousjobid != '':
        query.append(('previousjobid',previousjobid))

    selectedHits = kwargs.get('selectedHits','')
    if selectedHits != '':
        query.append(('selectedHits',selectedHits))

    database = kwargs.get('database','pdb')
    checkPsiBlastParameter('database', database)
    query.append(('database',database))

    matrix = kwargs.get('matrix', 'BLOSUM62')
    checkPsiBlastParameter('matrix', matrix)
    query.append(('matrix',matrix))

    gapopen = kwargs.get('gapopen',11)
    checkPsiBlastParameter('gapopen', gapopen)
    query.append(('gapopen',gapopen))

    gapext = kwargs.get('gapext',1)
    checkPsiBlastParameter('gapext', gapext)
    query.append(('gapext',gapext))

    expthr = kwargs.get('expthr', 10.)
    checkPsiBlastParameter('expthr', expthr)
    query.append(('expthr',expthr))
    
    psithr = kwargs.get('psithr',1.0e-3)
    checkPsiBlastParameter('psithr', psithr)
    query.append(('psithr',psithr))

    scores = kwargs.get('scores',500)
    checkPsiBlastParameter('scores', scores)
    query.append(('scores',scores))

    alignments = kwargs.get('alignments',500)
    checkPsiBlastParameter('alignments', alignments)
    query.append(('alignments',alignments))
    
    query.append(('alignView',0))
                    
    dropoff = kwargs.get('dropoff',15)
    checkPsiBlastParameter('dropoff', dropoff)
    query.append(('dropoff',dropoff))
        
    finaldropoff = kwargs.get('finaldropoff',25)
    checkPsiBlastParameter('finaldropoff', finaldropoff)
    query.append(('finaldropoff',finaldropoff))
        
    filter = kwargs.get('filter','F')
    checkPsiBlastParameter('filter', filter)
    query.append(('filter',filter))
    
    if previousjobid == '' and selectedHits == '':
        seqrange = kwargs.get('seqrange', None)
        if seqrange is None:
            seqrange = '0-' + str(len(sequence))
        elif not isinstance(seqrange, str):
            raise TypeError('seqrange should be a string')
        elif len(seqrange.split('-')) != 2:
            raise ValueError('seqrange should take the form START-END')
        try:
            start = int(seqrange.split('-')[0])
            end = int(seqrange.split('-')[1])
        except:
            raise ValueError('seqrange should be START-END with START and END being integers')
        query.append(('seqrange',seqrange))
        
    headers = { 'User-Agent' : 'ProDy' }
    
    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))
    
    data = urlencode(query)

    # submit the job
    base_url = 'http://www.ebi.ac.uk/Tools/services/rest/psiblast/'
    url = base_url + 'run/'
    LOGGER.timeit('_prody_psi-blast')
    if cycle == 0:
        LOGGER.info('PSI-Blast searching PDB database for "{0}..."'
                    .format(sequence[:5]))
    else:
        LOGGER.info('PSI-Blast searching PDB database, cycle={0}'
                    .format(cycle))

    handle = openURL(url, data=data, headers=headers)
    job_id = handle.read()
    handle.close()

    # check the status
    url = base_url + 'status/' + job_id
    handle = openURL(url)
    status = handle.read()
    handle.close()
                    
    # keep checking the status until it's no longer running
    while status == 'RUNNING':
        LOGGER.sleep(int(sleep), 'to reconnect to EBI for status.')
        LOGGER.write('Connecting to EBI for status...')
        handle = openURL(url)
        status = handle.read()
        LOGGER.clear()
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_psi-blast') > timeout:
            LOGGER.warn('PSI-Blast search time out.')
            return None

    LOGGER.info('The status is {0}'.format(status))
    LOGGER.clear()
    LOGGER.report('PSI-Blast search completed in %.1fs.', '_prody_psi-blast')
 
    if cycle != 1:
        # get the results
        url = base_url + 'result/' + job_id + '/xml'
        handle = openURL(url)
        results = handle.read()
        handle.close()
        
        try:
            ext_xml = filename.lower().endswith('.xml')
        except AttributeError:
            pass
        else:
            if not ext_xml:
                filename += '.xml'
            f_out = open(filename, 'w')
            f_out.write(results)
            f_out.close()
            LOGGER.info('Results are saved as {0}.'.format(repr(filename)))
        
        return job_id, PsiBlastRecord(results, sequence)
    else:
        return job_id
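
A minimal sketch of a first cycle against the default pdb database using the built-in demo sequence; chaining a later cycle via previousjobid follows the docstring above:

job_id, record = psiBlastCycle('runexample')   # cycle 0 returns (job_id, record)
# a follow-up cycle could reuse the job id, per the docstring:
# next_id = psiBlastCycle(previousjobid=job_id, cycle=1)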
Example #9
    def run(self,
            cutoff=15.,
            n_modes=3,
            gamma=1.,
            n_confs=50,
            rmsd=1.0,
            n_gens=5,
            maxclust=None,
            threshold=None,
            solvent='imp',
            sim=True,
            force_field=None,
            temp=303.15,
            t_steps_i=1000,
            t_steps_g=7500,
            outlier=True,
            mzscore=3.5,
            **kwargs):
        '''
        Performs a ClustENM run.

        :arg cutoff: Cutoff distance (A) for pairwise interactions used in ANM
            computations, default is 15.0 A.
        :type cutoff: float

        :arg gamma: Spring constant of ANM, default is 1.0.
        :type gamma: float

        :arg n_modes: Number of non-zero eigenvalues/vectors to calculate.
        :type n_modes: int

        :arg n_confs: Number of new conformers to be generated based on any conformer
            from the previous generation, default is 50.
        :type n_confs: int
            
        :arg rmsd: Average RMSD of the new conformers with respect to the conformer
            from which they are generated, default is 1.0 A.
            A tuple of floats can be given, e.g. (1.0, 1.5, 1.5) for subsequent generations.
            Note: In the case of ClustENMv1, this value is the maximum rmsd, not the average.
        :type rmsd: float, tuple of floats

        :arg n_gens: Number of generations.
        :type n_gens: int

        :arg maxclust: Maximum number of clusters for each generation, default is None.
            A tuple of ints can be given, e.g. (10, 30, 50) for subsequent generations.
            Warning: Either maxclust or RMSD threshold should be given! For a large number
            of generations and/or structures, specifying maxclust is more efficient.
        :type maxclust: int or tuple of ints

        :arg threshold: RMSD threshold to apply when forming clusters, default is None.
            This parameter has been used in ClustENMv1, setting it to 75% of the maximum RMSD
            value used for sampling. A tuple of floats can be given, e.g. (1.5, 2.0, 2.5)
            for subsequent generations.
            Warning: This threshold should be chosen carefully in ClustENMv2 for efficiency.
        :type threshold: float or tuple of floats.

        :arg solvent: Solvent model to be used. If it is set to 'imp' (default),
            implicit solvent model will be used, whereas 'exp' stands for explicit solvent model.
            Warning: In the case of nucleotide chains, explicit solvent model is automatically set.
        :type solvent: str

        :arg padding: Padding distance to use for solvation. Default is 1.0 nm.
        :type padding: float

        :arg ionicStrength: Total concentration of ions (both positive and negative) to add.
            This does not include ions that are added to neutralize the system.
            Default concentration is 0.0 molar.
        :type ionicStrength: float

        :arg force_field: Implicit solvent force field is ('amber99sbildn.xml', 'amber99_obc.xml'). 
            Explicit solvent force field is ('amber14-all.xml', 'amber14/tip3pfb.xml').
            Experimental feature: Forcefields already implemented in OpenMM can be used. 
        :type force_field: a tuple of str
        
        :arg tolerance: Energy tolerance to which the system should be minimized, default is 10.0 kJ/mole.
        :type tolerance: float
        
        :arg maxIterations: Maximum number of iterations to perform during energy minimization.
            If this is 0 (default), minimization is continued until the results converge without
            regard to how many iterations it takes.
        :type maxIterations: int

        :arg sim: If it is True (default), a short MD simulation is performed after energy minimization.
            Note: There is also a heating-up phase until the desired temperature is reached.
        :type sim: bool

        :arg temp: Temperature at which the simulations are conducted, default is 303.15 K.
        :type temp: float

        :arg t_steps_i: Duration of MD simulation (number of time steps) for the starting structure
            following the heating-up phase, default is 1000. Each time step is 2.0 fs.
            Note: Default value reduces possible drift from the starting structure. 
        :type t_steps_i: int

        :arg t_steps_g: Duration of MD simulations (number of time steps) to run for each conformer
            following the heating-up phase, default is 7500. Each time step is 2.0 fs.
            A tuple of int's can be given, e.g. (3000, 5000, 7000) for subsequent generations.
        :type t_steps_g: int or tuple of int's

        :arg outlier: Exclusion of conformers detected as outliers in each generation.
            Default is True for implicit solvent. Outliers, if any, are detected by
            the modified z-scores of the conformers' potential energies over a generation.
            Note: It is automatically set to False when explicit solvent model is being used.
        :type outlier: bool

        :arg mzscore: Modified z-score threshold to label conformers as outliers. Default is 3.5.
        :type mzscore: float

        :arg v1: Original sampling method with complete enumeration of desired ANM modes is used.
            Default is False. Maximum number of modes should not exceed 5 for efficiency.
        :type v1: bool

        :arg platform: Architecture on which the OpenMM part runs, default is None.
            It can be chosen as 'CUDA', 'OpenCL' or 'CPU'.
            For efficiency, 'CUDA' or 'OpenCL' is recommended.
        :type platform: str

        :arg parallel: If it is True (default is False), conformer generation will be parallelized.
        :type parallel: bool
        '''

        if self._isBuilt():
            raise ValueError(
                'ClustENM ensemble has been built; please start a new instance'
            )

        # set up parameters
        self._cutoff = cutoff
        self._n_modes = n_modes
        self._gamma = gamma
        self._n_confs = n_confs
        if isinstance(rmsd, tuple):
            self._rmsd = (0.,) + rmsd
        else:
            self._rmsd = (0.,) + (rmsd,) * n_gens
        self._n_gens = n_gens
        self._platform = kwargs.pop('platform', None)
        self._parallel = kwargs.pop('parallel', False)
        self._targeted = kwargs.pop('targeted', False)
        self._tmdk = kwargs.pop('tmdk', 15.)

        if maxclust is None and threshold is None:
            raise ValueError('Either maxclust or threshold should be set!')

        if maxclust is None:
            self._maxclust = None
        else:
            if isinstance(maxclust, tuple):
                self._maxclust = (0, ) + maxclust
            else:
                self._maxclust = (0, ) + (maxclust, ) * n_gens

            if len(self._maxclust) != self._n_gens + 1:
                raise ValueError(
                    'size mismatch: %d generations were set; %d maxclusts were given'
                    % (self._n_gens + 1, len(self._maxclust)))

        if threshold is None:
            self._threshold = None
        else:
            if isinstance(threshold, tuple):
                self._threshold = (0, ) + threshold
            else:
                self._threshold = (0, ) + (threshold, ) * n_gens

            if len(self._threshold) != self._n_gens + 1:
                raise ValueError(
                    'size mismatch: %d generations were set; %d thresholds were given'
                    % (self._n_gens + 1, len(self._threshold)))

        self._sol = solvent if self._nuc is None else 'exp'
        self._padding = kwargs.pop('padding', 1.0)
        self._ionicStrength = kwargs.pop('ionicStrength', 0.0)
        if self._sol == 'imp':
            self._force_field = (
                'amber99sbildn.xml',
                'amber99_obc.xml') if force_field is None else force_field
        if self._sol == 'exp':
            self._force_field = (
                'amber14-all.xml',
                'amber14/tip3pfb.xml') if force_field is None else force_field
        self._tolerance = kwargs.pop('tolerance', 10.0)
        self._maxIterations = kwargs.pop('maxIterations', 0)
        self._sim = sim
        self._temp = temp

        if self._sim:
            if isinstance(t_steps_g, tuple):
                self._t_steps = (t_steps_i, ) + t_steps_g
            else:
                self._t_steps = (t_steps_i, ) + (t_steps_g, ) * n_gens

        self._outlier = False if self._sol == 'exp' else outlier
        self._mzscore = mzscore
        self._v1 = kwargs.pop('v1', False)

        self._cycle = 0

        # check for discontinuity in the structure
        gnm = GNM()
        gnm.buildKirchhoff(self._atoms[self._idx_cg], cutoff=self._cutoff)
        K = gnm.getKirchhoff()
        rank_diff = (len(K) - 1 -
                     np.linalg.matrix_rank(K, tol=ZERO, hermitian=True))
        if rank_diff != 0:
            raise ValueError(
                'atoms has disconnected parts; please check the structure')

        LOGGER.timeit('_clustenm_overall')

        LOGGER.info('Generation 0 ...')

        if self._sim:
            if self._t_steps[0] != 0:
                LOGGER.info(
                    'Minimization, heating-up & simulation in generation 0 ...'
                )
            else:
                LOGGER.info('Minimization & heating-up in generation 0 ...')
        else:
            LOGGER.info('Minimization in generation 0 ...')
        LOGGER.timeit('_clustenm_min')
        potential, conformer = self._min_sim(self._atoms.getCoords())
        if np.isnan(potential):
            raise ValueError(
                'Initial structure could not be minimized. Try again and/or check your structure.'
            )

        LOGGER.report(label='_clustenm_min')

        LOGGER.info('#' + '-' * 19 + '/*\\' + '-' * 19 + '#')

        self.setCoords(conformer)

        potentials = [potential]
        sizes = [1]
        new_shape = [1]
        for s in conformer.shape:
            new_shape.append(s)
        conf = conformer.reshape(new_shape)
        conformers = start_confs = conf
        keys = [(0, 0)]

        for i in range(1, self._n_gens + 1):
            self._cycle += 1
            LOGGER.info('Generation %d ...' % i)
            confs, weights = self._generate(start_confs)
            if self._sim:
                if self._t_steps[i] != 0:
                    LOGGER.info(
                        'Minimization, heating-up & simulation in generation %d ...'
                        % i)
                else:
                    LOGGER.info(
                        'Minimization & heating-up in generation %d ...' % i)
            else:
                LOGGER.info('Minimization in generation %d ...' % i)
            LOGGER.timeit('_clustenm_min_sim')

            pot_conf = [self._min_sim(conf) for conf in confs]

            LOGGER.report('Structures were sampled in %.2fs.',
                          label='_clustenm_min_sim')
            LOGGER.info('#' + '-' * 19 + '/*\\' + '-' * 19 + '#')

            pots, confs = list(zip(*pot_conf))
            idx = np.logical_not(np.isnan(pots))
            weights = np.array(weights)[idx]
            pots = np.array(pots)[idx]
            confs = np.array(confs)[idx]

            if self._outlier:
                idx = np.logical_not(self._outliers(pots))
            else:
                idx = np.full(pots.size, True, dtype=bool)

            sizes.extend(weights[idx])
            potentials.extend(pots[idx])
            start_confs = self._superpose_cg(confs[idx])

            for j in range(start_confs.shape[0]):
                keys.append((i, j))
            conformers = np.vstack((conformers, start_confs))

        LOGGER.timeit('_clustenm_ens')
        LOGGER.info('Creating an ensemble of conformers ...')

        self._build(conformers, keys, potentials, sizes)
        LOGGER.report('Ensemble was created in %.2fs.', label='_clustenm_ens')

        self._time = LOGGER.timing(label='_clustenm_overall')
        LOGGER.report('All completed in %.2fs.', label='_clustenm_overall')
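
A hedged sketch of driving a run, assuming ProDy's ClustENM class with OpenMM (and its preparation dependencies) installed; the import path is an assumption, the parameters mirror the signature above, and sim=False skips the MD phase for a quick test:

from prody import parsePDB
from prody.dynamics.clustenm import ClustENM   # assumed import path

atoms = parsePDB('1ake')
clustenm = ClustENM()
clustenm.setAtoms(atoms)     # prepares the structure for the run
clustenm.run(n_gens=2, n_confs=20, maxclust=10, sim=False)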
Example #10
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file. Sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request(
            'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        results_url = urllib2.urlopen(request).geturl()

        #res_params = { 'output' : 'xml' }
        res_params = {'format': 'tsv'}
        enc_res_params = urllib.urlencode(res_params)
        #modified_res_url = results_url + '?' + enc_res_params
        modified_res_url = results_url.replace(
            'results', 'download') + '?' + enc_res_params

        result_request = urllib2.Request(modified_res_url)
        # url = ( urllib2.urlopen(request).geturl() + '?output=xml')
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(
            seq[:MINSEQLEN]))

        try:
            #xml = urllib2.urlopen(result_request).read()
            tsv = urllib2.urlopen(result_request).read()
            # openURL(url, timeout=timeout).read()
        except:
            raise ValueError('No matching Pfam domains were found.')

        # try:
        #     root = ET.XML(xml)
        # except Exception as err:
        #     raise ValueError('failed to parse results XML, check URL: ' + modified_res_url)

        matches = {}
        #for child in root[0]:
        #if child.tag == 'hits':
        # accession = child.get('acc')
        # pfam_id = accession.split('.')[0]
        # matches[pfam_id]={}
        # matches[pfam_id]['accession']=accession
        # matches[pfam_id]['class']='Domain'
        # matches[pfam_id]['id']=child.get('name')
        # matches[pfam_id]['locations']={}
        # matches[pfam_id]['locations']['ali_end']=child[0].get('alisqto')
        # matches[pfam_id]['locations']['ali_start']=child[0].get('alisqfrom')
        # matches[pfam_id]['locations']['bitscore']=child[0].get('bitscore')
        # matches[pfam_id]['locations']['end']=child[0].get('alisqto')
        # matches[pfam_id]['locations']['evalue']=child.get('evalue')
        # matches[pfam_id]['locations']['evidence']='hmmer v3.0'
        # matches[pfam_id]['locations']['hmm_end']=child[0].get('alihmmto')
        # matches[pfam_id]['locations']['hmm_start']=child[0].get('alihmmfrom')
        # matches[pfam_id]['locations']['significant']=child[0].get('significant')
        # matches[pfam_id]['locations']['start']=child[0].get('alisqfrom')
        # matches[pfam_id]['type']='Pfam-A'
        # return matches

        if PY3K:
            tsv = tsv.decode()

        lines = tsv.split('\n')
        keys = lines[0].split('\t')
        root = {}
        for i, line in enumerate(lines[1:-1]):
            root[i] = {}
            for j, key in enumerate(keys):
                root[i][key] = line.split('\t')[j]

        for child in root.values():
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id] = {}
            matches[pfam_id]['accession'] = accession
            matches[pfam_id]['class'] = 'Domain'
            matches[pfam_id]['id'] = child['Family id']
            matches[pfam_id]['locations'] = {}
            matches[pfam_id]['locations']['ali_end'] = child['Ali. End']
            matches[pfam_id]['locations']['ali_start'] = child['Ali. Start']
            matches[pfam_id]['locations']['bitscore'] = child['Bit Score']
            matches[pfam_id]['locations']['end'] = child['Env. End']
            matches[pfam_id]['locations']['cond_evalue'] = child[
                'Cond. E-value']
            matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value']
            matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
            matches[pfam_id]['locations']['hmm_end'] = child['Model End']
            matches[pfam_id]['locations']['hmm_start'] = child['Model Start']
            #matches[pfam_id]['locations']['significant'] = child['significant']
            matches[pfam_id]['locations']['start'] = child['Env. Start']
            matches[pfam_id]['type'] = 'Pfam-A'
        return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'.format(
                    seq[:4], str(err)))
            else:
                chid = seq[4:].upper()

                # this loop must stay inside the else-branch: *polymers* is
                # undefined when the header could not be parsed
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        accession = dbref.accession
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'.format(
                                        idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = prefix + 'protein/' + seq + '?output=xml'
            else:
                url = prefix + 'protein/' + idcode + '?output=xml'

        else:
            url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml not in [b'PEND', b'RUN']:  # the response is bytes
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif xml.find(b'No valid UniProt accession or ID') > 0:
        try:
            url = prefix + 'protein/' + accession + '?output=xml'
            xml = openURL(url, timeout=timeout).read()
        except:
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq)
            except:
                raise ValueError('No valid UniProt accession or ID for: ' +
                                 seq)

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        key = '{' + prefix + '}'
        results = dictElement(root[0], key)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
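For reference, a minimal usage sketch for the function above. The query is illustrative, the call needs network access, and the 'start'/'end' keys are assumptions based on the XML attributes the loop above copies verbatim:

# usage sketch: map a PDB chain to Pfam families and list match locations
from prody import searchPfam

matches = searchPfam('1mkpA')              # illustrative PDB ID plus chain
if matches:
    for acc, match in matches.items():
        for loc in match.get('locations', []):
            print(acc, loc.get('start'), loc.get('end'))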
Example #11
def blastPDB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching *sequence* against the PDB using NCBI blastp.

    :arg sequence: an object with an associated sequence string 
         or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is multiplied by 1.5 when results are not ready.  
    *timeout* (default is 120 s) determines when to give up waiting for the results.
    """

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')

    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()

    elif isinstance(sequence, Sequence):
        sequence = str(sequence)

    elif isinstance(sequence, str):
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())

    else:
        raise TypeError('sequence must be Atomic, Sequence, or str not {0}'
                        .format(type(sequence)))

    headers = {'User-agent': 'ProDy'}
    query = [('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'),
             ('PROGRAM', 'blastp'),]

    expect = float(kwargs.pop('expect', 1e-10))
    if expect <= 0:
        raise ValueError('expect must be a positive number')
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    if hitlist_size <= 0:
        raise ValueError('hitlist_size must be a positive integer')
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'
                .format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b'RID =')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find(b'\n', index)
        rid = html[index + len('RID ='):last].strip()

    index = html.find(b'RTOE =')
    if index == -1:
        rtoe = None # This is not used
    else:
        last = html.find(b'\n', index)
        rtoe = int(html[index + len('RTOE ='):last].strip())

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect to NCBI for search results.')
        LOGGER.write('Connecting to NCBI for search results...')
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b'Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b'\n', index)
        status = results[index+len('Status='):last].strip()
        if status.upper() == b'READY':
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None
    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')

    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        out = open(filename, 'wb')  # results are bytes
        out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

    return PDBBlastRecord(results, sequence)
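A usage sketch for blastPDB. It needs network access, and getBest() and getHits() are the accessors that PDBBlastRecord is assumed to expose:

# usage sketch: run the bundled example query and inspect the hits
from prody import blastPDB

record = blastPDB('runexample', filename='blast_results.xml')
if record is not None:
    best = record.getBest()                      # assumed accessor
    print(best['pdb_id'], best['percent_identity'])
    hits = record.getHits(percent_identity=90)   # assumed accessor
    print(sorted(hits))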
Example #12
def parsePDBStream(stream, **kwargs):
    """Return an :class:`.AtomGroup` and/or dictionary containing header data 
    parsed from a stream of PDB lines. 
    
    :arg stream: Anything that implements the method ``readlines`` 
        (e.g. :class:`file`, buffer, stdin)"""
    
    model = kwargs.get('model')
    header = kwargs.get('header', False)
    assert isinstance(header, bool), 'header must be a boolean'
    chain = kwargs.get('chain')
    subset = kwargs.get('subset')
    altloc = kwargs.get('altloc', 'A')
    if model is not None:
        if isinstance(model, int):
            if model < 0:
                raise ValueError('model must be a non-negative integer')
        else:
            raise TypeError('model must be an integer, {0:s} is invalid'
                            .format(str(model)))
    title_suffix = ''
    if subset is not None: 
        if not isinstance(subset, str):
            raise TypeError('subset must be a string')
        elif subset.lower() not in _PDBSubsets:
            raise ValueError('{0:s} is not a valid subset'
                             .format(repr(subset)))
        title_suffix = '_' + _PDBSubsets[subset]
    if chain is not None:
        if not isinstance(chain, str):
            raise TypeError('chain must be a string')
        elif len(chain) == 0:
            raise ValueError('chain must not be an empty string')
        title_suffix = '_' + chain + title_suffix
    ag = None
    if 'ag' in kwargs:
        ag = kwargs['ag']
        if not isinstance(ag, AtomGroup):
            raise TypeError('ag must be an AtomGroup instance')
        n_csets = ag.numCoordsets()
    elif model != 0:
        ag = AtomGroup(str(kwargs.get('title', 'Unknown')) + title_suffix)
        n_csets = 0
    
    biomol = kwargs.get('biomol', False)
    auto_secondary = None
    secondary = kwargs.get('secondary')
    if not secondary:
        auto_secondary = SETTINGS.get('auto_secondary')
        secondary = auto_secondary
    split = 0
    hd = None
    if model != 0:
        LOGGER.timeit()
        lines = stream.readlines()
        if header or biomol or secondary:
            hd, split = getHeaderDict(lines)
        _parsePDBLines(ag, lines, split, model, chain, subset, altloc)
        if ag.numAtoms() > 0:
            LOGGER.timing('{0:d} atoms and {1:d} coordinate set(s) were '
                          'parsed in %.2fs.'.format(ag.numAtoms(), 
                           ag.numCoordsets() - n_csets))
        else:
            ag = None
            LOGGER.warn('Atomic data could not be parsed, please '
                        'check the input file.')
    elif header:
        hd, split = getHeaderDict(stream)

    if ag is not None and isinstance(hd, dict): 
        if secondary:
            if auto_secondary:
                try:
                    ag = assignSecstr(hd, ag)
                except ValueError:
                    pass
            else:
                ag = assignSecstr(hd, ag)
        if biomol:
            ag = buildBiomolecules(hd, ag)

            if isinstance(ag, list):
                LOGGER.info('Biomolecular transformations were applied, {0:d} '
                            'biomolecule(s) are returned.'.format(len(ag)))
            else:
                LOGGER.info('Biomolecular transformations were applied to the '
                            'coordinate data.')
    if model != 0:
        if header:
            return ag, hd
        else:
            return ag
    else:
        return hd
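Because the stream only needs a readlines() method, an in-memory buffer works just as well as a file object; a minimal sketch, assuming a hypothetical local PDB file and that the header dictionary carries a 'title' key:

# usage sketch: parse coordinates and header from an in-memory buffer
from io import StringIO
from prody import parsePDBStream

with open('1mkp.pdb') as inp:                # hypothetical local file
    pdb_text = inp.read()
ag, header = parsePDBStream(StringIO(pdb_text), header=True)
print(ag.numAtoms(), header.get('title'))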
Example #13
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file. Sequence queries must not contain without gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except Exception:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = { 'hmmdb' : 'pfam', 'seq': fseq }
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request('https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        results_url = urllib2.urlopen(request).geturl()

        # request the results as tab-separated values; the XML output of
        # this service is no longer used here
        res_params = { 'format' : 'tsv' }
        enc_res_params = urllib.urlencode(res_params)
        modified_res_url = results_url.replace('results', 'download') + '?' + enc_res_params

        result_request = urllib2.Request(modified_res_url)
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        tsv = urllib2.urlopen(result_request).read()
        
        matches = {}

        lines = tsv.split('\n')
        keys = lines[0].split('\t')
        root = {}
        for i, line in enumerate(lines[1:-1]):
            # split each data row once and pair the fields with header keys
            root[i] = dict(zip(keys, line.split('\t')))

        for child in root.values():
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id] = {}
            matches[pfam_id]['accession'] = accession
            matches[pfam_id]['class'] = 'Domain'
            matches[pfam_id]['id'] = child['Family id']
            matches[pfam_id]['locations'] = {}
            matches[pfam_id]['locations']['ali_end'] = child['Ali. End']
            matches[pfam_id]['locations']['ali_start'] = child['Ali. Start']
            matches[pfam_id]['locations']['bitscore'] = child['Bit Score']
            matches[pfam_id]['locations']['end'] = child['Env. End']
            matches[pfam_id]['locations']['cond_evalue'] = child['Cond. E-value']
            matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value']
            matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
            matches[pfam_id]['locations']['hmm_end'] = child['Model End']
            matches[pfam_id]['locations']['hmm_start'] = child['Model Start']
            #matches[pfam_id]['locations']['significant'] = child['significant']   
            matches[pfam_id]['locations']['start'] = child['Env. Start']
            matches[pfam_id]['type'] = 'Pfam-A'
        return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                # iterate over chains only when the header parsed cleanly,
                # otherwise ``polymers`` would be unbound here
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        accession = dbref.accession
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = prefix + 'protein/' + seq + '?output=xml'
            else:
                url = prefix + 'protein/' + idcode + '?output=xml'

        else:
            url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml not in [b'PEND', b'RUN']:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif xml.find(b'No valid UniProt accession or ID') > 0:
        try:
            url = prefix + 'protein/' + accession + '?output=xml'
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq)
            except Exception:
                raise ValueError('No valid UniProt accession or ID for: ' + seq)

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        key = '{' + prefix + '}'
        results = dictElement(root[0], key)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
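The poll-until-timeout idiom above (attempt, back off, give up after a deadline) recurs throughout these examples; a generic, self-contained sketch of the pattern, independent of ProDy's LOGGER:

import time

def poll(fetch, timeout=60, sleep=2, factor=1.5, cap=20):
    """Call *fetch* until it returns a truthy result or *timeout* seconds
    elapse, backing off geometrically between attempts."""
    start = time.time()
    result = None
    while time.time() - start < timeout:
        try:
            result = fetch()
        except Exception:
            result = None
        if result:
            break
        time.sleep(sleep)
        sleep = min(sleep * factor, cap)
    return result

# example: give up quickly on a callable that never succeeds
print(poll(lambda: None, timeout=1, sleep=0.2))    # prints None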
Example #14
def fetchPfamMSA(acc, alignment='full', compressed=False, **kwargs):
    """Returns a path to the downloaded Pfam MSA file.

    :arg acc: Pfam ID or Accession Code
    :type acc: str

    :arg alignment: alignment type, one of ``'full'`` (default), ``'seed'``,
         ``'ncbi'``, ``'metagenomics'``, ``'rp15'``, ``'rp35'``, ``'rp55'``,
         ``'rp75'`` or ``'uniprot'`` where rp stands for representative 
         proteomes

    :arg compressed: gzip the downloaded MSA file, default is **False**

    *Alignment Options*

    :arg format: a Pfam supported MSA file format, one of ``'selex'``
        (default), ``'stockholm'`` or ``'fasta'``

    :arg order: ordering of sequences, ``'tree'`` (default) or
        ``'alphabetical'``

    :arg inserts: letter case for inserts, ``'upper'`` (default) or ``'lower'``

    :arg gaps: gap character, one of ``'dashes'`` (default), ``'dots'``,
        ``'mixed'`` or **None** for unaligned

    *Other Options*

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60

    :arg outname: out filename, default is input ``'acc_alignment.format'``

    :arg folder: output folder, default is ``'.'``"""

    import requests

    # url = prefix + 'family/acc?id=' + acc
    # handle = openURL(url, timeout=int(kwargs.get('timeout', 60)))
    orig_acc = acc
    # acc = handle.readline().strip()
    # if PY3K:
    #     acc = acc.decode()
    url_flag = False

    if not re.search('(?<=PF)[0-9]{5}$', acc):
        raise ValueError('{0} is not a valid Pfam ID or Accession Code'.format(
            repr(orig_acc)))

    if alignment not in DOWNLOAD_FORMATS:
        raise ValueError('alignment must be one of full, seed, ncbi, '
                         'metagenomics, rp15, rp35, rp55, rp75 or uniprot')
    if alignment in ('ncbi', 'metagenomics', 'uniprot'):
        url = (prefix + 'family/' + acc + '/alignment/' + alignment +
               '/gzipped')
        url_flag = True
        extension = '.sth'
    else:
        if not kwargs:
            url = (prefix + 'family/' + acc + '/alignment/' + alignment +
                   '/gzipped')
            url_flag = True
            extension = '.sth'
        else:
            align_format = kwargs.get('format', 'selex').lower()

            if align_format not in FORMAT_OPTIONS['format']:
                raise ValueError('alignment format must be one of selex, '
                                 'stockholm or fasta; MSF is not supported')

            if align_format == SELEX:
                align_format, extension = 'pfam', '.slx'
            elif align_format == FASTA:
                extension = '.fasta'
            else:
                extension = '.sth'

            gaps = str(kwargs.get('gaps', 'dashes')).lower()
            if gaps not in FORMAT_OPTIONS['gaps']:
                raise ValueError('gaps must be of type mixed, dots, dashes, '
                                 'or None')

            inserts = kwargs.get('inserts', 'upper').lower()
            if (inserts not in FORMAT_OPTIONS['inserts']):
                raise ValueError('inserts must be of type lower or upper')

            order = kwargs.get('order', 'tree').lower()
            if order not in FORMAT_OPTIONS['order']:
                raise ValueError('order must be of type tree or alphabetical')

            url = (prefix + 'family/' + acc + '/alignment/' + alignment +
                   '/format?format=' + align_format + '&alnType=' + alignment +
                   '&order=' + order[0] + '&case=' + inserts[0] + '&gaps=' +
                   gaps + '&download=1')

    LOGGER.timeit('_pfam')
    timeout = kwargs.get('timeout', 60)
    response = None
    sleep = 2
    while LOGGER.timing('_pfam') < timeout:
        try:
            response = requests.get(url, verify=False).content
        except Exception:
            pass
        else:
            break

        sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
        LOGGER.sleep(int(sleep), '. Trying to reconnect...')

    # response = openURL(url, timeout=int(kwargs.get('timeout', 60)))
    outname = kwargs.get('outname', None)
    if not outname:
        outname = orig_acc
    folder = str(kwargs.get('folder', '.'))
    filepath = join(makePath(folder), outname + '_' + alignment + extension)
    if compressed:
        filepath = filepath + '.gz'
        if url_flag:
            f_out = open(filepath, 'wb')
        else:
            f_out = openFile(filepath, 'wb')
        # f_out.write(response.read())
        f_out.write(response)
        f_out.close()
    else:
        if url_flag:
            gunzip(response, filepath)
        else:
            with open(filepath, 'wb') as f_out:
                # f_out.write(response.read())
                f_out.write(response)

    filepath = relpath(filepath)
    LOGGER.info('Pfam MSA for {0} is written as {1}.'.format(
        orig_acc, filepath))

    return filepath
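A usage sketch for fetchPfamMSA. 'PF00069' is only an illustrative accession, the call needs network access, and parseMSA and numSequences are assumed ProDy accessors:

# usage sketch: fetch a seed alignment as FASTA and load it
from prody import fetchPfamMSA, parseMSA

filepath = fetchPfamMSA('PF00069', alignment='seed', format='fasta',
                        gaps='dashes', outname='pkinase')
msa = parseMSA(filepath)
print(msa.numSequences())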
Example #15
def blastPDB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching the Protein Data Bank for *sequence* using NCBI blastp.

    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is multiplied by 1.5 when results are not ready.  *timeout*
    (default is 120s) determines when to give up waiting for the results.
    """

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')
    else:
        try:
            sequence = ''.join(sequence.split())
            _ = sequence.isalpha()
        except AttributeError:
            raise TypeError('sequence must be a string')
        else:
            if not _:
                raise ValueError('not a valid protein sequence')
    headers = {'User-agent': 'ProDy'}

    query = [
        ('DATABASE', 'pdb'),
        ('ENTREZ_QUERY', '(none)'),
        ('PROGRAM', 'blastp'),
    ]
    expect = float(kwargs.pop('expect', 1e-10))
    if expect <= 0:
        raise ValueError('expect must be a positive number')
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    if hitlist_size <= 0:
        raise ValueError('hitlist_size must be a positive integer')
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    if kwargs:
        LOGGER.warn('Keyword argument(s) {0} are not used.'.format(', '.join(
            [repr(key) for key in kwargs])))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'.format(
        sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b'RID =')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find(b'\n', index)
        rid = html[index + len('RID ='):last].strip()

    index = html.find(b'RTOE =')
    if index == -1:
        rtoe = None  # This is not used
    else:
        last = html.find(b'\n', index)
        rtoe = int(html[index + len('RTOE ='):last].strip())

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect to NCBI for search results.')
        LOGGER.write('Connecting to NCBI for search results...')
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b'Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b'\n', index)
        status = results[index + len('Status='):last].strip()
        if status.upper() == b'READY':
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None
    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')
    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        out = open(filename, 'wb')  # results are bytes
        out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))
    return PDBBlastRecord(results, sequence)
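The RID/RTOE bookkeeping in both blastPDB variants is plain substring extraction from NCBI's QBlast response; a self-contained sketch of that step, exercised on a fabricated response snippet:

def extract_field(html, label):
    """Return the value following e.g. b'RID =' in NCBI's QBlast response,
    or None when the label is absent (a sketch of the parsing above)."""
    index = html.find(label)
    if index == -1:
        return None
    end = html.find(b'\n', index)
    return html[index + len(label):end].strip()

sample = b'    RID = ABC123XYZ\n    RTOE = 24\n'
print(extract_field(sample, b'RID ='))     # b'ABC123XYZ'
print(extract_field(sample, b'RTOE ='))    # b'24'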
Example #16
    def fetch(self, xml=None, sequence=None, **kwargs):
        """Get Blast record from url or file.

        :arg sequence: an object with an associated sequence string 
            or a sequence string itself
        :type sequence: :class:`Atomic`, :class:`Sequence`, or str

        :arg xml: blast search results in XML format or an XML file that
            contains the results or a filename for saving the results or None
        :type xml: str

        :arg timeout: amount of time until the query times out in seconds
            default value is 120
        :type timeout: int
        """
        if self.isSuccess:
            LOGGER.warn(
                "The record already exists so not further search is performed")
            return True

        if sequence is None:
            sequence = self._sequence

        if xml is None:
            xml = self._xml

        import xml.etree.cElementTree as ET
        if xml is not None and len(xml) < 100:
            if os.path.isfile(xml):
                xml = ET.parse(xml)
                root = xml.getroot()
            else:
                raise ValueError('xml is not a filename and does not look like'
                                 ' a valid XML string')
        else:

            headers = {'User-agent': 'ProDy'}
            query = [
                ('DATABASE', 'pdb'),
                ('ENTREZ_QUERY', '(none)'),
                ('PROGRAM', 'blastp'),
            ]

            expect = float(kwargs.pop('expect', 1e-10))
            if expect <= 0:
                raise ValueError('expect must be a positive number')
            query.append(('EXPECT', expect))
            hitlist_size = int(kwargs.pop('hitlist_size', 250))
            if hitlist_size <= 0:
                raise ValueError('hitlist_size must be a positive integer')
            query.append(('HITLIST_SIZE', hitlist_size))
            query.append(('QUERY', sequence))
            query.append(('CMD', 'Put'))

            sleep = float(kwargs.pop('sleep', 2))
            timeout = float(kwargs.pop('timeout', self._timeout))
            self._timeout = timeout

            try:
                import urllib.parse
                urlencode = lambda data: bytes(urllib.parse.urlencode(data),
                                               'utf-8')
            except ImportError:
                from urllib import urlencode

            url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

            data = urlencode(query)
            LOGGER.timeit('_prody_blast')
            LOGGER.info(
                'Blast searching NCBI PDB database for "{0}..."'.format(
                    sequence[:5]))
            handle = openURL(url, data=data, headers=headers)

            html = handle.read()
            index = html.find(b'RID =')
            if index == -1:
                raise Exception('NCBI did not return expected response.')
            else:
                last = html.find(b'\n', index)
                rid = html[index + len('RID ='):last].strip()

            query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
                     ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
            data = urlencode(query)

            while True:
                LOGGER.sleep(int(sleep),
                             'to reconnect to NCBI for search results.')
                LOGGER.write('Connecting to NCBI for search results...')
                handle = openURL(url, data=data, headers=headers)
                results = handle.read()
                index = results.find(b'Status=')
                LOGGER.clear()
                if index < 0:
                    break
                last = results.index(b'\n', index)
                status = results[index + len('Status='):last].strip()
                if status.upper() == b'READY':
                    break
                sleep = int(sleep * 1.5)
                if LOGGER.timing('_prody_blast') > timeout:
                    LOGGER.warn('Blast search time out.')
                    return False

            LOGGER.clear()
            LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')

            filename = xml
            root = ET.XML(results)
            try:
                ext_xml = filename.lower().endswith('.xml')
            except AttributeError:
                pass
            else:
                if not ext_xml:
                    filename += '.xml'
                out = open(filename, 'w')
                if PY3K:
                    out.write(results.decode())
                else:
                    out.write(results)
                out.close()
                LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

            root = dictElement(root, 'BlastOutput_')
            if root['db'] != 'pdb':
                raise ValueError('blast search database in xml must be "pdb"')
            if root['program'] != 'blastp':
                raise ValueError(
                    'blast search program in xml must be "blastp"')
            self._param = dictElement(root['param'][0], 'Parameters_')

            query_len = int(root['query-len'])
            if sequence and len(sequence) != query_len:
                raise ValueError(
                    'query-len and the length of the sequence do not '
                    'match, xml data may not be for given sequence')
            hits = []
            for iteration in root['iterations']:
                for hit in dictElement(iteration, 'Iteration_')['hits']:
                    hit = dictElement(hit, 'Hit_')
                    data = dictElement(hit['hsps'][0], 'Hsp_')
                    for key in [
                            'align-len', 'gaps', 'hit-frame', 'hit-from',
                            'hit-to', 'identity', 'positive', 'query-frame',
                            'query-from', 'query-to'
                    ]:
                        data[key] = int(data[key])
                    data['query-len'] = query_len
                    for key in ['evalue', 'bit-score', 'score']:
                        data[key] = float(data[key])
                    p_identity = 100.0 * data['identity'] / (
                        data['query-to'] - data['query-from'] + 1)
                    data['percent_identity'] = p_identity
                    p_overlap = (100.0 * (data['align-len'] - data['gaps']) /
                                 query_len)
                    data['percent_coverage'] = p_overlap

                    for item in (hit['id'] + hit['def']).split('>gi'):
                        head, title = item.split(None, 1)
                        head = head.split('|')
                        pdb_id = head[-2].lower()
                        chain_id = head[-1][:1]
                        pdbch = dict(data)
                        pdbch['pdb_id'] = pdb_id
                        pdbch['chain_id'] = chain_id
                        pdbch['title'] = (head[-1][1:] + title).strip()
                        hits.append((p_identity, p_overlap, pdbch))
            hits.sort(key=lambda hit: hit[0], reverse=True)
            self._hits = hits

        return True
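The percent_identity and percent_coverage values computed in fetch() reduce to two ratios over the Hsp fields; worked through on made-up numbers:

# made-up HSP numbers, only to illustrate the arithmetic in fetch()
identity, query_from, query_to = 180, 1, 200   # identities and query span
align_len, gaps, query_len = 205, 5, 220

percent_identity = 100.0 * identity / (query_to - query_from + 1)
percent_coverage = 100.0 * (align_len - gaps) / query_len

print(round(percent_identity, 1))   # 90.0
print(round(percent_coverage, 1))   # 90.9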
Example #17
def searchUniprotID(query, search_b=False, skip_a=False, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a 
        sequence file. Sequence queries must not contain gaps and 
        must be at least 16 characters long
    :type query: str

    :arg search_b: search Pfam-B families when **True**
    :type search_b: bool

    :arg skip_a: do not search Pfam-A families when **True**
    :type skip_a: bool

    :arg ga: use gathering threshold when **True**
    :type ga: bool

    :arg evalue: user specified e-value cutoff, must be smaller than 10.0
    :type evalue: float

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    query = str(query)
    seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    result = root[0].get('id')
    return result
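A usage sketch; '1mkp' is an illustrative PDB identifier, the call needs network access, and the import path is an assumption (the function lives in prody.database.pfam in some ProDy versions):

# usage sketch: map a PDB identifier to the ID that Pfam reports
from prody.database.pfam import searchUniprotID   # assumed location

print(searchUniprotID('1mkp'))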
Example #18
def blastPDB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching the Protein Data Bank for *sequence* using NCBI blastp.

    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is multiplied by 1.5 when results are not ready.  *timeout*
    (default is 120s) determines when to give up waiting for the results.
    """

    if sequence == "runexample":
        sequence = (
            "ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI"
            "SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN"
            "DAYDIVKMKKSNISPNFNFMGQLLDFERTL"
        )
    else:
        try:
            sequence = "".join(sequence.split())
            _ = sequence.isalpha()
        except AttributeError:
            raise TypeError("sequence must be a string")
        else:
            if not _:
                raise ValueError("not a valid protein sequence")
    headers = {"User-agent": "ProDy"}

    query = [("DATABASE", "pdb"), ("ENTREZ_QUERY", "(none)"), ("PROGRAM", "blastp")]
    expect = float(kwargs.pop("expect", 10e-10))
    if expect <= 0:
        raise ValueError("expect must be a positive number")
    query.append(("EXPECT", expect))
    hitlist_size = int(kwargs.pop("hitlist_size", 250))
    if hitlist_size <= 0:
        raise ValueError("expect must be a positive integer")
    query.append(("HITLIST_SIZE", hitlist_size))
    query.append(("QUERY", sequence))
    query.append(("CMD", "Put"))

    sleep = float(kwargs.pop("sleep", 2))
    timeout = float(kwargs.pop("timeout", 120))

    if kwargs:
        LOGGER.warn("Keyword argument(s) {0} are not used.".format(", ".join([repr(key) for key in kwargs])))

    try:
        import urllib.parse

        urlencode = lambda data: bytes(urllib.parse.urlencode(data), "utf-8")
    except ImportError:
        from urllib import urlencode

    url = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"

    data = urlencode(query)
    LOGGER.timeit("_prody_blast")
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'.format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b"RID =")
    if index == -1:
        raise Exception("NCBI did not return expected response.")
    else:
        last = html.find(b"\n", index)
        rid = html[index + len("RID =") : last].strip()

    index = html.find(b"RTOE =")
    if index == -1:
        rtoe = None  # This is not used
    else:
        last = html.find(b"\n", index)
        rtoe = int(html[index + len("RTOE =") : last].strip())

    query = [("ALIGNMENTS", 500), ("DESCRIPTIONS", 500), ("FORMAT_TYPE", "XML"), ("RID", rid), ("CMD", "Get")]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), "to reconnect NCBI for search results.")
        LOGGER.write("Connecting NCBI for search results...")
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b"Status=")
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b"\n", index)
        status = results[index + len("Status=") : last].strip()
        if status.upper() == "READY":
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing("_prody_blast") > timeout:
            LOGGER.warn("Blast search time out.")
            return None
    LOGGER.clear()
    LOGGER.report("Blast search completed in %.1fs.", "_prody_blast")
    try:
        ext_xml = filename.lower().endswith(".xml")
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += ".xml"
        out = open(filename, "w")
        out.write(results)
        out.close()
        LOGGER.info("Results are saved as {0}.".format(repr(filename)))
    return PDBBlastRecord(results, sequence)
Example #19
 def getRecord(self, url, localFile=False):
     if localFile:
         dali_file = open(url, 'r')
         data = dali_file.read()
         dali_file.close()
     else:
         sleep = 2
         timeout = 120
         LOGGER.timeit('_dali')
         log_message = ''
         try_error = 3
         while True:
             LOGGER.sleep(int(sleep), 'to reconnect to Dali '+log_message)
             LOGGER.clear()
             LOGGER.write('Connecting to Dali for search results...')
             LOGGER.clear()
             try:
                 html = urllib2.urlopen(url).read()
             except:
                 try_error -= 1
                 if try_error >= 0:
                     LOGGER.sleep(2, '. Connection error happened. Trying to reconnect...')
                     continue
                 else:
                     html = urllib2.urlopen(url).read()
             if html.find('Status: Queued') > -1:
                 log_message = '(Dali searching is queued)...'
             elif html.find('Status: Running') > -1:
                 log_message = '(Dali searching is running)...'
             elif html.find('Your job') == -1 and html.find('.txt') > -1:
                 break
             elif html.find('ERROR:') > -1:
                 LOGGER.warn(': Dali search reported an ERROR!')
                 return None
             sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
             if LOGGER.timing('_dali') > timeout:
                 LOGGER.warn(': Dali search timed out. \nThe results can be obtained later using the getRecord() function.')
                 return None
             LOGGER.clear()
         LOGGER.clear()
         LOGGER.report('Dali results completed in %.1fs.', '_dali')
          file_name = re.search(r'=.+-90\.txt', html).group()[1:]
          file_name = file_name[:-7]
         # LOGGER.info(url+file_name+self._subset+'.txt')
         data = urllib2.urlopen(url+file_name+self._subset+'.txt').read()
         temp_name = file_name+self._subset+'_dali.txt'
         with open(temp_name, "w") as file_temp: file_temp.write(html + '\n' + url+file_name + '\n' + data)
         # with open(temp_name, "a+") as file_temp: file_temp.write(url+file_name + '\n' + data)
     data_list = data.strip().split('# ')
     # No:  Chain   Z    rmsd lali nres  %id PDB  Description -> data_list[3]
     # Structural equivalences -> data_list[4]
     # Translation-rotation matrices -> data_list[5]
     map_temp_dict = dict()
     mapping = []
     lines = data_list[4].strip().split('\n')
     self._lines_4 = lines
      mapping_temp = np.genfromtxt(lines[1:],
                                   delimiter=(4,1,14,6,2,4,4,5,2,4,4,3,5,4,3,5,6,3,5,4,3,5,28),
                                   usecols=[0,3,5,7,9,12,15,15,18,21],
                                   dtype='|i4')
     # [0,3,5,7,9,12,15,15,18,21] -> [index, residue_a, residue_b, residue_i_a, residue_i_b, resid_a, resid_b, resid_i_a, resid_i_b]
     for map_i in mapping_temp:
         if not map_i[0] in map_temp_dict:
             map_temp_dict[map_i[0]] = [[map_i[1], map_i[2], map_i[3], map_i[4]]]
         else:
             map_temp_dict[map_i[0]].append([map_i[1], map_i[2], map_i[3], map_i[4]])
     self._max_index = max(mapping_temp[:,2])
     self._mapping = map_temp_dict
     self._data = data_list[3]
     lines = data_list[3].strip().split('\n')
      daliInfo = np.genfromtxt(lines[1:], delimiter=(4,3,6,5,5,5,6,5,57),
                               usecols=[0,2,3,4,5,6,7,8],
                               dtype=[('id', '<i4'), ('pdb_chain', '|S6'),
                                      ('Z', '<f4'), ('rmsd', '<f4'),
                                      ('len_align', '<i4'), ('res_num', '<i4'),
                                      ('identity', '<i4'), ('title', '|S70')])
     if daliInfo.ndim == 0:
         daliInfo = np.array([daliInfo])
     pdbListAll = []
     self._daliInfo = daliInfo
     dali_temp_dict = dict()
     for temp in self._daliInfo:
         temp_dict = dict()
         pdb_chain = temp[1].strip()[0:6]
         temp_dict['pdbId'] = pdb_chain[0:4]
         temp_dict['chainId'] = pdb_chain[5:6]
         temp_dict['pdb_chain'] = pdb_chain
         temp_dict['Z'] = temp[2]
         temp_dict['rmsd'] = temp[3]
         temp_dict['len_align'] = temp[4]
         temp_dict['res_num'] = temp[5]
         temp_dict['identity'] = temp[6]
         temp_dict['mapping'] = (np.array(map_temp_dict[temp[0]])-1).tolist()
         temp_dict['map_ref'] = [x for map_i in (np.array(map_temp_dict[temp[0]])-1).tolist() for x in range(map_i[0], map_i[1]+1)]
         temp_dict['map_sel'] = [x for map_i in (np.array(map_temp_dict[temp[0]])-1).tolist() for x in range(map_i[2], map_i[3]+1)]
         dali_temp_dict[temp_dict['pdb_chain']] = temp_dict
         pdbListAll.append(pdb_chain)
     self._pdbListAll = tuple(pdbListAll)
     self._pdbList = self._pdbListAll
     self._alignPDB = dali_temp_dict
      LOGGER.info(str(len(pdbListAll)) + ' Dali results were retrieved.')
     return True
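getRecord() leans on numpy.genfromtxt with a tuple delimiter to slice Dali's fixed-width report into named columns; a small self-contained sketch of the same technique on toy rows:

import numpy as np

# fixed-width parsing: a tuple delimiter gives the column widths and
# dtype names the fields (toy rows, not real Dali output)
rows = ['   1 abcd-A  12.3   1.5',
        '   2 efgh-B   8.7   2.9']
table = np.genfromtxt(rows, delimiter=(4, 7, 6, 6), autostrip=True,
                      dtype=[('no', '<i4'), ('chain', 'U7'),
                             ('Z', '<f4'), ('rmsd', '<f4')])
print(table['chain'], table['Z'])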
Example #20
def searchUniprotID(query, search_b=False, skip_a=False, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file, sequence queries must not contain without gaps and must be at
        least 16 characters long
    :type query: str

    :arg search_b: search Pfam-B families when **True**
    :type search_b: bool

    :arg skip_a: do not search Pfam-A families when **True**
    :type skip_a: bool

    :arg ga: use gathering threshold when **True**
    :type ga: bool

    :arg evalue: user specified e-value cutoff, must be smaller than 10.0
    :type evalue: float

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = '{http://pfam.xfam.org/}'
    query = str(query)
    seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    result = root[0].get('id')
    return result
Example #21
def blastPDB(sequence, filename=None, **kwargs):
    """Return a :class:`PDBBlastRecord` instance that contains results from
    blast searching the Protein Data Bank for *sequence* using NCBI blastp.
        
    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str 
    :arg filename: a *filename* to save the results in XML format 
    :type filename: str
    
    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``) 
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for 
    results.  Sleep time is doubled when results are not ready.  *timeout* 
    (default is 20 seconds) determines when to give up waiting for the results.
    """
    
    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')
    elif isinstance(sequence, str):
        sequence = ''.join(sequence.split())
        if not checkSequence(sequence):
            raise ValueError(repr(sequence) + ' is not a valid sequence')
    else:
        raise TypeError('sequence must be a string')

    query = [('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'),
             ('PROGRAM', 'blastp'),] 
    expect = kwargs.pop('expect', 1e-10)
    assert isinstance(expect, (float, int)), 'expect must be a float'
    assert expect > 0, 'expect must be a positive number'
    query.append(('EXPECT', expect))
    hitlist_size = kwargs.pop('hitlist_size', 250)
    assert isinstance(hitlist_size, int), 'hitlist_size must be an integer'
    assert hitlist_size > 0, 'hitlist_size must be a positive integer'
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))
    
    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 20))
    
    if kwargs:
        LOGGER.warning("Keyword argument(s) '{0:s}' are not used."
                       .format("', '".join(kwargs.keys())))

    import urllib, urllib2
    
    url = 'http://blast.ncbi.nlm.nih.gov/Blast.cgi'
    
    data = urllib.urlencode(query)
    LOGGER.timeit()
    LOGGER.info('Blast searching NCBI PDB database for "{0:s}..."'
                .format(sequence[:5]))
    request = urllib2.Request(url, data, {'User-agent': 'ProDy'})
    handle = urllib2.urlopen(request)
    
    html = handle.read()
    index = html.find('RID =')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find('\n', index)
        rid = html[index + len('RID ='):last].strip()

    index = html.find('RTOE =')
    if index == -1:
        rtoe = None # This is not used
    else:
        last = html.find('\n', index)
        rtoe = int(html[index + len('RTOE ='):last].strip())

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500), 
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urllib.urlencode(query)
    
    while True:
        LOGGER.sleep(int(sleep), ' to connect to NCBI for search results.')
        LOGGER.write('Connecting to NCBI for search results...')
        request = urllib2.Request(url, data, {'User-agent': 'ProDy'})
        handle = urllib2.urlopen(request)
        results = handle.read()
        index = results.find('Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index('\n', index)
        status = results[index+len('Status='):last].strip()
        if status.upper() == 'READY':
            break
        sleep *= 2
        if LOGGER.timing() > timeout:
            LOGGER.warning('Blast search time out.')
            return None
    LOGGER.clear()
    LOGGER.timing('Blast search completed in %.1fs.')
    if isinstance(filename, str):
        if not filename.lower().endswith('.xml'):
            filename += '.xml'
        out = open(filename, 'w')
        out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0:s}.'.format(filename))
    return PDBBlastRecord(results, sequence)
Example #22
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file, sequence queries must not contain without gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = '{http://pfam.xfam.org/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except Exception:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = { 'hmmdb' : 'pfam', 'seq': fseq }
        # use the Py3 urllib API consistently (the original mixed the Py2
        # urllib.urlencode with urllib.request)
        import urllib.parse, urllib.request
        enc_params = urllib.parse.urlencode(parameters).encode('utf-8')
        request = urllib.request.Request('http://hmmer.janelia.org/search/hmmscan', enc_params)

        url = (urllib.request.urlopen(request).geturl() + '?output=xml')
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()
        
        try:
            root = ET.XML(xml)
        except Exception as err:
            raise ValueError('failed to parse results XML, check URL: ' + url)
        matches = {}
        for child in root[0]:
            if child.tag == 'hits':
                accession = child.get('acc')
                pfam_id = accession.split('.')[0]
                matches[pfam_id] = {}
                matches[pfam_id]['accession'] = accession
                matches[pfam_id]['class'] = 'Domain'
                matches[pfam_id]['id'] = child.get('name')
                matches[pfam_id]['locations'] = {}
                matches[pfam_id]['locations']['ali_end'] = child[0].get('alisqto')
                matches[pfam_id]['locations']['ali_start'] = child[0].get('alisqfrom')
                matches[pfam_id]['locations']['bitscore'] = child[0].get('bitscore')
                matches[pfam_id]['locations']['end'] = child[0].get('alisqto')
                matches[pfam_id]['locations']['evalue'] = child.get('evalue')
                matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
                matches[pfam_id]['locations']['hmm_end'] = child[0].get('alihmmto')
                matches[pfam_id]['locations']['hmm_start'] = child[0].get('alihmmfrom')
                matches[pfam_id]['locations']['significant'] = child[0].get('significant')
                matches[pfam_id]['locations']['start'] = child[0].get('alisqfrom')
                matches[pfam_id]['type'] = 'Pfam-A'
        if matches:
            # return after collecting all hits rather than after the first one
            return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'
            else:
                url = ('http://pfam.xfam.org/protein/' +
                       idcode + '?output=xml')

        else:
            url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
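
A minimal usage sketch for the example above, assuming ProDy is installed and the Pfam web service is reachable; the '1mkpA' query and the printed fields are purely illustrative:

# hypothetical usage; '1mkpA' means PDB entry 1mkp, chain A
from prody import searchPfam

matches = searchPfam('1mkpA', timeout=60)
if matches:
    for accession, match in matches.items():
        # for UniProt/PDB queries each match carries a list of
        # 'locations' dicts with start/end residue positions
        for loc in match.get('locations', []):
            print(accession, loc.get('start'), loc.get('end'),
                  loc.get('evalue'))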
Example #23
0
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file, sequence queries must not contain without gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = "{http://pfam.xfam.org/}"
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile

        try:
            seq = next(MSAFile(query))
        except Exception:
            with openFile(query) as inp:
                seq = "".join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError("could not parse a sequence without gaps from " + query)
    else:
        seq = "".join(query.split())

    import xml.etree.ElementTree as ET  # cElementTree was removed in Python 3.9

    LOGGER.timeit("_pfam")
    timeout = int(kwargs.get("timeout", 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + " is not a valid sequence")

        fseq = ">Seq\n" + seq
        parameters = {"hmmdb": "pfam", "seq": fseq}
        # POST data must be URL-encoded bytes under urllib.request
        enc_params = urllib.parse.urlencode(parameters).encode("utf-8")
        request = urllib.request.Request(
            "http://hmmer.janelia.org/search/hmmscan", enc_params)

        url = urllib.request.urlopen(request).geturl() + "?output=xml"
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception:
            raise ValueError("failed to parse results XML, check URL: " + url)

        matches = {}
        for child in root[0]:
            if child.tag == "hits":
                accession = child.get("acc")
                pfam_id = accession.split(".")[0]
                matches[pfam_id] = {}
                matches[pfam_id]["accession"] = accession
                matches[pfam_id]["class"] = "Domain"
                matches[pfam_id]["id"] = child.get("name")
                matches[pfam_id]["locations"] = {}
                matches[pfam_id]["locations"]["ali_end"] = child[0].get("alisqto")
                matches[pfam_id]["locations"]["ali_start"] = child[0].get("alisqfrom")
                matches[pfam_id]["locations"]["bitscore"] = child[0].get("bitscore")
                matches[pfam_id]["locations"]["end"] = child[0].get("alisqto")
                matches[pfam_id]["locations"]["evalue"] = child.get("evalue")
                matches[pfam_id]["locations"]["evidence"] = "hmmer v3.0"
                matches[pfam_id]["locations"]["hmm_end"] = child[0].get("alihmmto")
                matches[pfam_id]["locations"]["hmm_start"] = child[0].get("alihmmfrom")
                matches[pfam_id]["locations"]["significant"] = child[0].get("significant")
                matches[pfam_id]["locations"]["start"] = child[0].get("alisqfrom")
                matches[pfam_id]["type"] = "Pfam-A"
        # return after collecting all hits, not only the first one
        return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader

            try:
                polymers = parsePDBHeader(seq[:4], "polymers")
            except Exception as err:
                LOGGER.warn("failed to parse header for {0} ({1})".format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != "UniProt":
                            continue
                        idcode = dbref.idcode
                        LOGGER.info(
                            "UniProt ID code {0} for {1} chain " "{2} will be used.".format(idcode, seq[:4], poly.chid)
                        )
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn("A UniProt ID code for PDB {0} could not be " "parsed.".format(repr(seq)))
                url = "http://pfam.xfam.org/protein/" + seq + "?output=xml"
            else:
                url = "http://pfam.xfam.org/protein/" + idcode + "?output=xml"

        else:
            url = "http://pfam.xfam.org/protein/" + seq + "?output=xml"

    LOGGER.debug("Retrieving Pfam search results: " + url)
    xml = None
    while LOGGER.timing("_pfam") < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError("Pfam search timed out or failed to parse results " "XML, check URL: " + url)
    else:
        LOGGER.report("Pfam search completed in %.2fs.", "_pfam")

    if xml.find(b"There was a system error on your last request.") > 0:
        LOGGER.warn("No Pfam matches found for: " + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError("failed to parse results XML, check URL: " + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError("failed to parse results XML, check URL: " + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results["matches"]
        except KeyError:
            raise ValueError("failed to parse results XML, check URL: " + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib["accession"][:7]
        except KeyError:
            raise ValueError("failed to parse results XML, check URL: " + url)

        if not re.search("^P(F|B)[0-9]{5}$", accession):
            raise ValueError("{0} does not match pfam accession" " format".format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault("locations", [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = "Query " + repr(query)
    else:
        query = "Query sequence"

    if matches:
        LOGGER.info(query + " matched {0} Pfam families.".format(len(matches)))
    else:
        LOGGER.info(query + " did not match any Pfam families.")
    return matches
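
All variants of this function poll the results URL until the '_pfam' timer exceeds the timeout. The same retry-until-timeout pattern, isolated as a standalone sketch with time.time() standing in for the ProDy LOGGER; the function and variable names here are illustrative, not part of the original example:

import time
from urllib.request import urlopen

def fetch_with_timeout(url, timeout=60):
    """Poll *url* until a non-empty response arrives or *timeout* passes."""
    start = time.time()
    data = None
    while time.time() - start < timeout:
        try:
            data = urlopen(url, timeout=timeout).read()
        except Exception:
            continue  # transient failure, retry until the deadline
        if data:
            break
    if not data:
        raise IOError('request timed out or returned no data: ' + url)
    return data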
Example #24
0
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file, sequence queries must not contain without gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = '{http://pfam.xfam.org/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except Exception:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.ElementTree as ET  # cElementTree was removed in Python 3.9
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        # POST data must be URL-encoded bytes under urllib.request
        enc_params = urllib.parse.urlencode(parameters).encode('utf-8')
        request = urllib.request.Request(
            'http://hmmer.janelia.org/search/hmmscan', enc_params)

        url = (urllib.request.urlopen(request).geturl() + '?output=xml')
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(
            seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception as err:
            raise ValueError('failed to parse results XML, check URL: ' + url)
        matches = {}
        for child in root[0]:
            if child.tag == 'hits':
                accession = child.get('acc')
                pfam_id = accession.split('.')[0]
                matches[pfam_id] = {}
                matches[pfam_id]['accession'] = accession
                matches[pfam_id]['class'] = 'Domain'
                matches[pfam_id]['id'] = child.get('name')
                matches[pfam_id]['locations'] = {}
                matches[pfam_id]['locations']['ali_end'] = child[0].get(
                    'alisqto')
                matches[pfam_id]['locations']['ali_start'] = child[0].get(
                    'alisqfrom')
                matches[pfam_id]['locations']['bitscore'] = child[0].get(
                    'bitscore')
                matches[pfam_id]['locations']['end'] = child[0].get('alisqto')
                matches[pfam_id]['locations']['evalue'] = child.get('evalue')
                matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
                matches[pfam_id]['locations']['hmm_end'] = child[0].get(
                    'alihmmto')
                matches[pfam_id]['locations']['hmm_start'] = child[0].get(
                    'alihmmfrom')
                matches[pfam_id]['locations']['significant'] = child[0].get(
                    'significant')
                matches[pfam_id]['locations']['start'] = child[0].get(
                    'alisqfrom')
                matches[pfam_id]['type'] = 'Pfam-A'
        # return after collecting all hits, not only the first one
        return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'.format(
                    seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'.format(
                                        idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'
            else:
                url = ('http://pfam.xfam.org/protein/' + idcode +
                       '?output=xml')

        else:
            url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
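
The prefix variable and the dictElement(root[0], prefix) call above exist because Pfam's XML places every tag in a namespace, so child.tag is, e.g., '{http://pfam.xfam.org/}matches' rather than 'matches'. A self-contained sketch of the same idea using only the standard library; the sample XML is fabricated for illustration:

import xml.etree.ElementTree as ET

PREFIX = '{http://pfam.xfam.org/}'
SAMPLE = (b'<pfam xmlns="http://pfam.xfam.org/">'
          b'<entry><matches><match accession="PF00069"/></matches>'
          b'</entry></pfam>')

root = ET.XML(SAMPLE)
entry = root[0]
# map namespace-stripped tag names to elements, mirroring what
# dictElement(root[0], prefix) returns in the examples above
children = {child.tag[len(PREFIX):]: child for child in entry}
for match in children['matches']:
    print(match.get('accession'))  # prints: PF00069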