Example #1
File: ensemble.py Project: sixpi/ProDy
    def _superpose(self, **kwargs):
        """Superpose conformations and update coordinates."""

        indices = self._indices
        weights = self._weights
        mobs = self._confs
        if indices is None:
            idx = False
            tar = self._coords
            movs = None
        else:
            idx = True
            if self._weights is not None:
                weights = weights[indices]
            tar = self._coords[indices]
            movs = self._confs

        linalg = importLA()
        svd = linalg.svd
        det = linalg.det

        if weights is None:
            tar_com = tar.mean(0)
            tar_org = (tar - tar_com)
            mob_org = zeros(tar_org.shape, dtype=mobs.dtype)
            tar_org = tar_org.T
        else:
            weights_sum = weights.sum()
            weights_dot = dot(weights.T, weights)
            tar_com = (tar * weights).sum(axis=0) / weights_sum
            tar_org = (tar - tar_com)
            mob_org = zeros(tar_org.shape, dtype=mobs.dtype)

        LOGGER.progress('Superposing ', len(mobs), '_prody_ensemble')
        for i, mob in enumerate(mobs):
            if idx:
                mob = mob[indices]
            if weights is None:
                mob_com = mob.mean(0)
                matrix = dot(tar_org, subtract(mob, mob_com, mob_org))
            else:
                mob_com = (mob * weights).sum(axis=0) / weights_sum
                subtract(mob, mob_com, mob_org)
                matrix = dot((tar_org * weights).T,
                             (mob_org * weights)) / weights_dot

            U, s, Vh = svd(matrix)
            Id = array([[1, 0, 0], [0, 1, 0], [0, 0, sign(det(matrix))]])
            rotation = dot(Vh.T, dot(Id, U.T))

            if movs is None:
                mobs[i] = dot(mob_org, rotation)
                add(mobs[i], tar_com, mobs[i])
            else:
                add(dot(movs[i], rotation),
                    (tar_com - dot(mob_com, rotation)), movs[i])
            LOGGER.update(i, '_prody_ensemble')
        LOGGER.clear()
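The method above follows the standard Kabsch superposition pattern: build the 3x3 correlation matrix between the centered target and mobile coordinates, take its SVD, and correct the handedness of the rotation with the sign of the determinant. A minimal standalone sketch of the unweighted case (plain NumPy; the function name superpose_kabsch is ours, not part of ProDy):

import numpy as np

def superpose_kabsch(mobile, target):
    """Superpose mobile onto target; both are (n_atoms, 3) arrays."""
    mob_com = mobile.mean(0)
    tar_com = target.mean(0)
    # 3x3 correlation matrix between the centered coordinate sets
    matrix = np.dot((target - tar_com).T, mobile - mob_com)
    U, s, Vh = np.linalg.svd(matrix)
    # flip the last axis if the result would otherwise be a reflection
    Id = np.array([[1, 0, 0], [0, 1, 0], [0, 0, np.sign(np.linalg.det(matrix))]])
    rotation = np.dot(Vh.T, np.dot(Id, U.T))
    return np.dot(mobile - mob_com, rotation) + tar_com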
Example #2
def searchDali(pdbId, chainId, daliURL=None, subset='fullPDB', **kwargs):
    """Search Dali server with input of PDB ID and chain ID.
    Dali server: http://ekhidna2.biocenter.helsinki.fi/dali/
    
    :arg subset: fullPDB, PDB25, PDB50, PDB90
    :type subset: str
    
    """

    LOGGER.timeit('_dali')
    # timeout = 120
    timeout = kwargs.pop('timeout', 120)

    if daliURL is None:
        daliURL = "http://ekhidna2.biocenter.helsinki.fi/cgi-bin/sans/dump.cgi"
    pdbId = pdbId.lower()
    pdb_chain = pdbId + chainId
    parameters = {
        'cd1': pdb_chain,
        'method': 'search',
        'title': 'Title_' + pdb_chain,
        'address': ''
    }
    enc_params = urllib.urlencode(parameters).encode('utf-8')
    request = urllib2.Request(daliURL, enc_params)
    try_error = 3
    while try_error >= 0:
        try:
            url = urllib2.urlopen(request).url
            break
        except:
            try_error -= 1
            if try_error >= 0:
                LOGGER.sleep(
                    2, '. Connection error happened. Trying to reconnect...')
                continue
            else:
                url = urllib2.urlopen(request).url
                break
    if url.split('.')[-1].lower() in ['html', 'php']:
        # print('test -1: '+url)
        url = url.replace(url.split('/')[-1], '')
    LOGGER.debug(
        'Submitted Dali search for PDB "{0}" and chain "{1}".'.format(
            pdbId, chainId))
    LOGGER.info(url)
    LOGGER.clear()
    obj = DaliRecord(url,
                     pdbId,
                     chainId,
                     subset=subset,
                     timeout=timeout,
                     **kwargs)
    #if obj.isSuccess:

    return obj
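The three-attempt reconnect loop above recurs throughout these examples. Roughly the same behavior as a generic helper (ours, not a ProDy API; the final unguarded call lets the exception propagate, as in the loop above):

import time

def call_with_retries(func, retries=3, wait=2):
    """Call func(); on any exception retry up to `retries` times, then
    make one last unguarded call so the error surfaces to the caller."""
    for _ in range(retries):
        try:
            return func()
        except Exception:
            time.sleep(wait)  # crude pause before reconnecting
    return func()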
Example #3
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of 
    the weekly clustering of protein chains in the PDB generated by blastclust. 
    They are available at FTP site: ftp://resources.rcsb.org/sequence/clusters/
    
    This function will download about 10 Mb of data and save it after 
    compressing in your home directory in :file:`.prody/pdbclusters`.
    Compressed files will be less than 4 Mb in size.  Cluster data can 
    be loaded using :func:`loadPDBClusters` function and be accessed 
    using :func:`listPDBCluster`."""
    
    if sqid is not None:
        if sqid not in PDB_CLUSTERS:
            raise ValueError('sqid must be one of ' + PDB_CLUSTERS_SQID_STR)
        keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)
    
    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)
    LOGGER.progress('Downloading sequence clusters', len(PDB_CLUSTERS),
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.'.format(x))
            continue
        else:
            out = openFile(filename+'.gz', 'w', folder=PDB_CLUSTERS_PATH) 
            out.write(inp.read())
            inp.close()
            out.close()
            count += 1
        LOGGER.update(i, '_prody_fetchPDBClusters')
    LOGGER.clear()
    if len(PDB_CLUSTERS) == count:
        LOGGER.info('All PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warn('PDB clusters could not be downloaded.')
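For reference, a sketch of the same fetch-and-compress step using only the standard library (an assumed equivalent of ProDy's openURL/openFile helpers, not how ProDy implements them):

import gzip
import os
from urllib.request import urlopen

def fetch_cluster_file(url, folder, filename):
    """Download url and store it gzip-compressed as filename.gz in folder."""
    data = urlopen(url).read()
    path = os.path.join(folder, filename + '.gz')
    with gzip.open(path, 'wb') as out:
        out.write(data)
    return path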
Example #4
File: emsurfer.py Project: uibcdf/ProDy
    def fetch(self, url=None, localFile=False, **kwargs):
        if localFile:
            emsurfer_file = open(url, 'r')
            # split into lines so the slicing below matches the web branch
            data = emsurfer_file.read().strip().split('\n')
            emsurfer_file.close()
        else:
            import requests
            
            if url is None:
                url = self._url

            html = requests.get(url).content

            if PY3K:
                html = html.decode()

            LOGGER.clear()
            LOGGER.report('Emsurfer results were fetched in %.1fs.', '_emsurfer')
            data = html.strip().split('\n')
        
        data_list = []
        for line in data[3:-2]:
            data_list.append(tuple(line.split('\t')))

        # Rank	EMDB_ID	EUC_D	RESOLUTION
        emsurferInfo = np.array(data_list, dtype=[('Rank', '<i4'), ('EMDB_ID', '<U70'),
                                                  ('EUC_D', '<f4'), ('RESOLUTION', '<f4')])
        emdListAll = []
        self._emsurferInfo = emsurferInfo
        emsurfer_temp_dict = dict()
        for temp in self._emsurferInfo:
            temp_dict = dict()
            temp_dict['Rank'] = temp[0]
            temp_dict['EMDB_ID'] = emdbId = temp[1]
            temp_dict['EUC_D'] = temp[2]
            temp_dict['RESOLUTION'] = temp[3]
            emsurfer_temp_dict[emdbId] = temp_dict
            emdListAll.append(emdbId)
        self._emdListAll = tuple(emdListAll)
        self._emdList = self._emdListAll
        self._alignEMD = emsurfer_temp_dict
        LOGGER.info('Obtained ' + str(len(emdListAll)) + ' EMD matches from Emsurfer for '+self._emdId+'.')
        return True
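The tab-split rows are loaded into a NumPy structured array so columns can be addressed by name; a toy illustration of the same pattern (the sample rows are invented):

import numpy as np

rows = [('1', 'EMD-1234', '0.57', '3.2'), ('2', 'EMD-5678', '0.91', '4.1')]
info = np.array(rows, dtype=[('Rank', '<i4'), ('EMDB_ID', '<U70'),
                             ('EUC_D', '<f4'), ('RESOLUTION', '<f4')])
print(info['EMDB_ID'])       # ['EMD-1234' 'EMD-5678']
print(info['EUC_D'].mean())  # approx. 0.74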
Example #5
def calcMSF(coordsets):
    """Calculate mean square fluctuation(s) (MSF)."""

    try:
        ncsets = coordsets.numFrames()
    except AttributeError:
        try:
            coordsets = coordsets.getCoordsets()
        except AttributeError:
            pass
        try:
            ndim, shape = coordsets.ndim, coordsets.shape
        except:
            raise TypeError('coordsets must be a Numpy array or a ProDy '
                            'object with `getCoordsets` method')
        if ndim != 3 or shape[0] == 1:
            raise ValueError('coordsets must contain multiple sets')
        msf = var(coordsets, 0).sum(1)
    else:
        nfi = coordsets.nextIndex()
        natoms = coordsets.numSelected()
        total = zeros((natoms, 3))
        sqsum = zeros((natoms, 3))

        LOGGER.progress('Evaluating {0} frames from {1}:'
                        .format(ncsets, str(coordsets)), ncsets,
                        '_prody_calcMSF')
        ncsets = 0
        coordsets.reset()
        for frame in coordsets:
            frame.superpose()
            coords = frame._getCoords()
            total += coords
            sqsum += coords ** 2
            ncsets += 1
            LOGGER.update(ncsets, '_prody_calcMSF')
        msf = (sqsum/ncsets - (total/ncsets)**2).sum(1)
        LOGGER.clear()
        coordsets.goto(nfi)
    return msf
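Both branches compute the same quantity: the per-atom coordinate variance summed over x, y and z. The streaming branch accumulates the sums of x and x**2 so that MSF = E[x**2] - E[x]**2 can be formed in a single pass; restated for an in-memory (n_frames, n_atoms, 3) array (our sketch):

import numpy as np

def msf_one_pass(coordsets):
    """MSF per atom from already superposed coordinates, shape (F, N, 3)."""
    n = coordsets.shape[0]
    total = coordsets.sum(0)         # sum of coordinates over frames
    sqsum = (coordsets ** 2).sum(0)  # sum of squared coordinates
    return (sqsum / n - (total / n) ** 2).sum(1)  # equals var(..., 0).sum(1)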
Example #6
def blastPDBUniProtKB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching of ProteinDataBank database *sequence* using NCBI blastp.

    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-4``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is multiplied by 1.5 when results are not ready.
    *timeout* (default is ``120`` s) determines when to give up waiting for
    the results.  *num_sequences* (default is ``1``) controls whether the
    input is validated as a single protein sequence.
    """

    num_sequences = int(kwargs.pop('num_sequences', 1))
    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')
    else:
        if num_sequences == 1:
            try:
                sequence = ''.join(sequence.split())
                _ = sequence.isalpha()
            except AttributeError:
                raise TypeError('sequence must be a string')
            else:
                if not _:
                    raise ValueError('not a valid protein sequence')
                    
    headers = {'User-agent': 'ProDy'}

    query = [('DATABASE', 'swissprot'), ('ENTREZ_QUERY', '(none)'),
             ('PROGRAM', 'blastp'),]
    expect = float(kwargs.pop('expect', 10e-5))
    if expect <= 0:
        raise ValueError('expect must be a positive number')
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    if hitlist_size <= 0:
        raise ValueError('hitlist_size must be a positive integer')
    psiblast = 'true'
    step_number = 3
    query.append(('RUN_PSIBLAST', psiblast))
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))
    query.append(('STEP_NUMBER', step_number))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    if kwargs:
        LOGGER.warn('Keyword argument(s) {0} are not used.'
                    .format(', '.join([repr(key) for key in kwargs])))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI SwissProt database for "{0}..."'
                .format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b'name="RID" type="hidden" value="')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find(b'>',index)
        rid = html[index + len('name="RID" type="hidden" value="'):last-1].strip()

    index = html.find(b'name="RTOE" type="hidden" value="')
    if index == -1:
        rtoe = None # This is not used
    else:
        last = html.find(b'>', index)
        rtoe = html[index + len('name="RTOE" type="hidden" value="'):last-1].strip()

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect to NCBI for search results.')
        LOGGER.write('Connecting NCBI for search results...')
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b'Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b'\n', index)
        status = results[index+len('Status='):last].strip()
        if status.upper() == b'READY':
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None
    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')
    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        out = open(filename, 'wb')
        out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))
    return SwissProtBlastRecord(results, sequence)
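The result-polling loop sleeps, checks a status marker, and grows the sleep by a factor of 1.5 until a timeout. The skeleton in isolation (ours; check is a hypothetical callable standing in for the status test above):

import time

def poll_until_ready(check, sleep=2.0, timeout=120.0):
    """Call check() until it returns True or `timeout` seconds elapse."""
    start = time.time()
    while time.time() - start < timeout:
        if check():
            return True
        time.sleep(sleep)
        sleep *= 1.5  # back off between polls
    return False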
Example #7
def searchDali(pdb, chain=None, subset='fullPDB', daliURL=None, **kwargs):
    """Search Dali server with input of PDB ID (or local PDB file) and chain ID.
    Dali server: http://ekhidna2.biocenter.helsinki.fi/dali/
    
    :arg pdb: PDB code or local PDB file for the protein to be searched

    :arg chain: chain identifier (only one chain can be assigned for PDB)
    :type chain: str

    :arg subset: fullPDB, PDB25, PDB50, PDB90
    :type subset: str
    """
    
    import requests
    
    LOGGER.timeit('_dali')
    # timeout = 120
    timeout = kwargs.pop('timeout', 120)
    
    if daliURL is None:
        daliURL = "http://ekhidna2.biocenter.helsinki.fi/cgi-bin/sans/dump.cgi"
    
    if isinstance(pdb, Atomic):
        atoms = pdb
        chain_set = set(atoms.getChids())
        if chain and chain not in chain_set:
            raise ValueError('input structure (%s) does not have chain %s'%(atoms.getTitle(), chain))
        
        if len(chain_set) > 1:
            if not chain:
                raise TypeError('the structure (%s) contains more than one chain, therefore a chain identifier '
                                'needs to be specified'%pdb.getTitle())
            atoms = atoms.select('chain '+chain)
        else:
            chain = chain_set.pop()
            
        stream = createStringIO()
        writePDBStream(stream, atoms)
        data = stream.getvalue()
        stream.close()
        files = {"file1" : data}

        pdbId = atoms.getTitle()
        pdb_chain = ''
        dali_title = 'Title_'+pdbId+chain
    elif isinstance(pdb, str):
        if os.path.isfile(pdb):
            atoms = parsePDB(pdb)
            chain_set = set(atoms.getChids())
            # pdbId = "s001"
            filename = os.path.basename(pdb)
            filename, ext = os.path.splitext(filename)
            if ext.lower() == '.gz':
                filename2, ext2 = os.path.splitext(filename)
                if ext2.lower() == '.pdb':
                    filename = filename2
            pdbId = filename
            if chain and chain not in chain_set:
                raise ValueError('input PDB file does not have chain ' + chain)
            
            if len(chain_set) > 1:
                if not chain:
                    raise TypeError('PDB file (%s) contains more than one chain, therefore a chain identifier '
                                    'needs to be specified'%pdb)
                atoms = atoms.select('chain '+chain)
                #local_temp_pdb = pdbId+chain+'.pdb'
                #local_temp_pdb = 's001'+chain+'.pdb'
                stream = createStringIO()
                writePDBStream(stream, atoms)
                data = stream.getvalue()
                stream.close()
            else:
                data = open(pdb, "rb")
                chain = chain_set.pop()
            files = {"file1" : data}
            # case: multiple chains; apply fetch multiple times?
            pdb_chain = ''
            dali_title = 'Title_' + pdbId + chain
        else:
            pdbId, ch = _getPDBid(pdb)
            if not chain:
                chain = ch
            if not chain:
                raise TypeError('a chain identifier is needed for the search')
            pdb_chain = pdbId + chain
            dali_title = 'Title_' + pdb_chain
            files = ''
    else:
        raise TypeError('pdb must be an Atomic instance, a PDB code, or a '
                        'path to a PDB file')
    parameters = { 'cd1' : pdb_chain, 'method': 'search', 'title': dali_title, 'address': '' }
    # enc_params = urllib.urlencode(parameters).encode('utf-8')
    # request = urllib2.Request(daliURL, enc_params)
    request = requests.post(daliURL, parameters, files=files)
    try_error = 3
    while try_error >= 0:
        try:
            # url = urllib2.urlopen(request).url
            url = request.url
            break
        except:
            try_error -= 1
            if try_error >= 0:
                LOGGER.sleep(2, '. Connection error happened. Trying to reconnect...')
                continue
            else:
                # url = urllib2.urlopen(request).url
                url = request.url
                break
    if url.split('.')[-1].lower() in ['html', 'php']:
        # print('test -1: '+url)
        url = url.replace(url.split('/')[-1], '')
    LOGGER.debug('Submitted Dali search for PDB "{0}{1}".'.format(pdbId, chain))
    LOGGER.info(url)
    LOGGER.clear()
    
    return DaliRecord(url, pdbId, chain, subset=subset, timeout=timeout, **kwargs)
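A hypothetical call for illustration (the PDB ID and keyword values are invented; the returned DaliRecord is populated by its fetch() method, shown in the next example):

dali_rec = searchDali('1p38', chain='A', subset='fullPDB', timeout=300)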
Example #8
    def fetch(self, url=None, localFile=False, **kwargs):
        """Get Dali record from url or file.

        :arg url: url of Dali results page or local dali results file
            If None then the url already associated with the DaliRecord object is used.
        :type url: str

        :arg localFile: whether provided url is a path for a local dali results file
        :type localFile: bool

        :arg timeout: amount of time until the query times out in seconds
            default value is 120
        :type timeout: int

        :arg localfolder: folder in which to find the local file
            default is the current folder
        :type localfolder: str
        """
        if localFile:
            dali_file = open(url, 'r')
            data = dali_file.read()
            dali_file.close()
        else:
            import requests
            
            if url is None:
                url = self._url
            
            sleep = 2
            timeout = kwargs.pop('timeout', 120)
            LOGGER.timeit('_dali')
            log_message = ''
            try_error = 3
            while True:
                LOGGER.write('Connecting to Dali for search results...')
                LOGGER.clear()
                try:
                    # html = urllib2.urlopen(url).read()
                    html = requests.get(url).content
                except:
                    try_error -= 1
                    if try_error >= 0:
                        LOGGER.sleep(2, '. Connection error happened. Trying to reconnect...')
                        continue
                    else:
                        # html = urllib2.urlopen(url).read()
                        html = requests.get(url).content
                if PY3K:
                    html = html.decode()
                if html.find('Status: Queued') > -1:
                    log_message = '(Dali search is queued)...'
                elif html.find('Status: Running') > -1:
                    log_message = '(Dali search is running)...'
                elif html.find('Your job') == -1 and html.find('.txt') > -1:
                    break
                elif html.find('ERROR:') > -1:
                    LOGGER.warn(': Dali search reported an ERROR!')
                    return False
                sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
                if LOGGER.timing('_dali') > timeout:
                    LOGGER.warn(': Dali search has timed out. \nThe results can be obtained later using the fetch() method.')
                    return False
                LOGGER.sleep(int(sleep), 'to reconnect to Dali '+log_message)
                LOGGER.clear()
            LOGGER.clear()
            LOGGER.report('Dali results were fetched in %.1fs.', '_dali')
            lines = html.strip().split('\n')
            file_name = re.search('=.+-90\\.txt', html).group()[1:]
            file_name = file_name[:-7]
            # LOGGER.info(url+file_name+self._subset+'.txt')
            # data = urllib2.urlopen(url+file_name+self._subset+'.txt').read()
            data = requests.get(url+file_name+self._subset+'.txt').content
            if PY3K:
                data = data.decode()
            localfolder = kwargs.pop('localfolder', '.')

            if file_name.lower().startswith('s001'):
                temp_name = self._pdbId + self._chain
            else:
                temp_name = file_name
            temp_name += self._subset + '_dali.txt'
            if localfolder != '.' and not os.path.exists(localfolder):
                os.mkdir(localfolder)
            with open(localfolder+os.sep+temp_name, "w") as file_temp:
                file_temp.write(html + '\n' + url+file_name+self._subset+'.txt' + '\n' + data)
            # with open(temp_name, "a+") as file_temp: file_temp.write(url+file_name + '\n' + data)
        data_list = data.strip().split('# ')
        # No:  Chain   Z    rmsd lali nres  %id PDB  Description -> data_list[3]
        # Structural equivalences -> data_list[4]
        # Translation-rotation matrices -> data_list[5]
        map_temp_dict = dict()
        lines = data_list[4].strip().split('\n')
        self._lines_4 = lines
        mapping_temp = np.genfromtxt(lines[1:], delimiter = (4,1,14,6,2,4,4,5,2,4,4,3,5,4,3,5,6,3,5,4,3,5,28), 
                                     usecols = [0,3,5,7,9,12,15,15,18,21], dtype='|i4')
        # [0,3,5,7,9,12,15,15,18,21] -> [index, residue_a, residue_b, residue_i_a, residue_i_b, resid_a, resid_b, resid_i_a, resid_i_b]
        for map_i in mapping_temp:
            if not map_i[0] in map_temp_dict:
                map_temp_dict[map_i[0]] = [[map_i[1], map_i[2], map_i[3], map_i[4]]]
            else:
                map_temp_dict[map_i[0]].append([map_i[1], map_i[2], map_i[3], map_i[4]])
        self._max_index = max(mapping_temp[:,2])
        self._mapping = map_temp_dict
        self._data = data_list[3]
        lines = data_list[3].strip().split('\n')
        # daliInfo = np.genfromtxt(lines[1:], delimiter = (4,3,6,5,5,5,6,5,57), usecols = [0,2,3,4,5,6,7,8], 
                                # dtype=[('id', '<i4'), ('pdb_chain', '|S6'), ('Z', '<f4'), ('rmsd', '<f4'), 
                                # ('len_align', '<i4'), ('nres', '<i4'), ('identity', '<i4'), ('title', '|S70')])
        daliInfo = np.genfromtxt(lines[1:], delimiter = (4,3,6,5,5,5,6,5,57), usecols = [0,2,3,4,5,6,7,8], 
                                dtype=[('id', '<i4'), ('pdb_chain', '|U6'), ('Z', '<f4'), ('rmsd', '<f4'), 
                                ('len_align', '<i4'), ('nres', '<i4'), ('identity', '<i4'), ('title', '|U70')])
        if daliInfo.ndim == 0:
            daliInfo = np.array([daliInfo])
        pdbListAll = []
        self._daliInfo = daliInfo
        dali_temp_dict = dict()
        for temp in self._daliInfo:
            temp_dict = dict()
            pdb_chain = temp[1].strip()[0:6]
            # U6 and U70 were used as the dtype for np.genfromtext -> unicode string were used in daliInfo 
            # if PY3K:
                # pdb_chain = pdb_chain.decode()
            pdb_chain = str(pdb_chain)
            temp_dict['pdbId'] = pdbid = pdb_chain[0:4].lower()
            temp_dict['chainId'] = chid = pdb_chain[5:6]
            temp_dict['pdb_chain'] = pdb_chain = pdbid + chid
            temp_dict['Z'] = temp[2]
            temp_dict['rmsd'] = temp[3]
            temp_dict['len_align'] = temp[4]
            temp_dict['nres'] = temp[5]
            temp_dict['identity'] = temp[6]
            temp_dict['mapping'] = (np.array(map_temp_dict[temp[0]])-1).tolist()
            temp_dict['map_ref'] = [x for map_i in (np.array(map_temp_dict[temp[0]])-1).tolist() for x in range(map_i[0], map_i[1]+1)]
            temp_dict['map_sel'] = [x for map_i in (np.array(map_temp_dict[temp[0]])-1).tolist() for x in range(map_i[2], map_i[3]+1)]
            dali_temp_dict[pdb_chain] = temp_dict
            pdbListAll.append(pdb_chain)
        self._pdbListAll = tuple(pdbListAll)
        self._pdbList = self._pdbListAll
        self._alignPDB = dali_temp_dict
        LOGGER.info('Obtained ' + str(len(pdbListAll)) + ' PDB chains from Dali for '+self._pdbId+self._chain+'.')
        return True
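The map_ref/map_sel comprehensions flatten inclusive residue ranges into explicit index lists; the same logic unrolled for clarity (ours, with an invented mapping):

# each row is [ref_start, ref_end, sel_start, sel_end], 0-based and inclusive
mapping = [[0, 2, 10, 12], [5, 6, 20, 21]]
map_ref, map_sel = [], []
for ref_start, ref_end, sel_start, sel_end in mapping:
    map_ref.extend(range(ref_start, ref_end + 1))
    map_sel.extend(range(sel_start, sel_end + 1))
print(map_ref)  # [0, 1, 2, 5, 6]
print(map_sel)  # [10, 11, 12, 20, 21]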
Example #9
def searchDali(pdb,
               chainId,
               isLocal=False,
               subset='fullPDB',
               daliURL=None,
               **kwargs):
    """Search Dali server with input of PDB ID (or local PDB file) and chain ID.
    Dali server: http://ekhidna2.biocenter.helsinki.fi/dali/
    
    :arg pdb: PDB code or local PDB file for searched protein
    :arg chainId: chain identifier (only one chain can be assigned for PDB)
    :arg isLocal: submit a local PDB file instead of a PDB code when **True**
    :arg subset: fullPDB, PDB25, PDB50, PDB90
    :type subset: str
    
    """

    import requests

    LOGGER.timeit('_dali')
    # timeout = 120
    timeout = kwargs.pop('timeout', 120)

    if daliURL is None:
        daliURL = "http://ekhidna2.biocenter.helsinki.fi/cgi-bin/sans/dump.cgi"
    if len(chainId) != 1:
        raise ValueError('input PDB chain identifier ' + chainId +
                         ' is invalid')
    if isLocal:
        if not os.path.isfile(pdb):
            raise ValueError('input PDB file ' + pdb + ' does not exist ')
        atom = parsePDB(pdb)
        chain_set = set(atom.getChids())
        # pdbId = "s001"
        pdbId = '.'.join(pdb.split(os.sep)[-1].split('.')[0:-1])
        if chainId not in chain_set:
            raise ValueError('input PDB file does not have chain ' + chainId)
        elif len(chain_set) > 1:
            atom = atom.select('chain ' + chainId)
            # local_temp_pdb = pdbId+chainId+'.pdb'
            local_temp_pdb = 's001' + chainId + '.pdb'
            writePDB(local_temp_pdb, atom)
        else:
            local_temp_pdb = pdb
        files = {"file1": open(local_temp_pdb, "rb")}
        # case: multiple chains; apply getRecord multiple times?
        pdb_chain = ''
        dali_title = 'Title_' + pdbId + chainId
    else:
        pdbId = pdb.lower()
        if len(pdbId) != 4:
            raise ValueError('input PDB code ' + pdb + ' is invalid')
        files = ''
        pdb_chain = pdbId + chainId
        dali_title = 'Title_' + pdb_chain
    parameters = {
        'cd1': pdb_chain,
        'method': 'search',
        'title': dali_title,
        'address': ''
    }
    # enc_params = urllib.urlencode(parameters).encode('utf-8')
    # request = urllib2.Request(daliURL, enc_params)
    request = requests.post(daliURL, parameters, files=files)
    try_error = 3
    while try_error >= 0:
        try:
            # url = urllib2.urlopen(request).url
            url = request.url
            break
        except:
            try_error -= 1
            if try_error >= 0:
                LOGGER.sleep(
                    2, '. Connection error happened. Trying to reconnect...')
                continue
            else:
                # url = urllib2.urlopen(request).url
                url = request.url
                break
    if url.split('.')[-1].lower() in ['html', 'php']:
        # print('test -1: '+url)
        url = url.replace(url.split('/')[-1], '')
    LOGGER.debug(
        'Submitted Dali search for PDB "{0}" and chain "{1}".'.format(
            pdbId, chainId))
    LOGGER.info(url)
    LOGGER.clear()
    obj = DaliRecord(url,
                     pdbId,
                     chainId,
                     subset=subset,
                     timeout=timeout,
                     **kwargs)
    #if obj.isSuccess:

    return obj
Example #10
    def runStep(self, **kwargs):
        """Run a single step of adaptive ANM. 
        Modes will be calculated for *structA* and the subset with 
        a cumulative overlap above a threshold defined by *Fmin* 
        is used for transitioning towards *structB*.
        
        By default this function uses values from initialisation but 
        they can be over-ridden if desired. For example, in bi-directional 
        adaptive ANM, we switch *structA* and *structB*, *alignSelA* and *alignSelB*,
        and *reduceSelA* and *reduceSelB*
        """

        structA = kwargs.pop('structA', self.structA)
        structB = kwargs.pop('structB', self.structB)

        alignSel = kwargs.pop('alignSel', self.alignSel)
        alignSelA = kwargs.pop('alignSelA', self.alignSelA)
        alignSelB = kwargs.pop('alignSelB', self.alignSelB)

        reduceSel = kwargs.pop('reduceSel', self.reduceSel)
        reduceSelA = kwargs.pop('reduceSelA', self.reduceSelA)
        reduceSelB = kwargs.pop('reduceSelB', self.reduceSelB)

        if reduceSelA is None:
            reduceSelA = reduceSel

        if reduceSelB is None:
            reduceSelB = reduceSel

        if alignSel is None:
            if alignSelA is None:
                alignSelA = reduceSelA

            if alignSelB is None:
                alignSelB = reduceSelB
        else:
            if alignSelA is None:
                alignSelA = alignSel

            if alignSelB is None:
                alignSelB = alignSel

        Fmin = kwargs.get('Fmin', self.Fmin)

        f = kwargs.get('f', self.f)

        outputDCD = kwargs.get('outputDCD', self.outputDCD)
        outputPDB = kwargs.get('outputPDB', self.outputPDB)
        filename = kwargs.get('filename', self.filename)

        LOGGER.info('\nStarting cycle {0} with initial structure {1}'.format(
            self.numSteps + 1, structA))

        if alignSelA is None:
            structA_sel = structA
        else:
            structA_sel = structA.select(alignSelA)

        if alignSelB is None:
            structB_sel = structB
        else:
            structB_sel = structB.select(alignSelB)

        mapping_func = kwargs.pop('mapping_func', self.mapping_func)
        seqid = kwargs.pop('seqid', self.seqid)
        coverage = kwargs.pop('overlap', self.coverage)
        coverage = kwargs.pop('coverage', coverage)
        pwalign = kwargs.pop('pwalign', self.pwalign)
        pwalign = kwargs.pop('mapping', pwalign)

        try:
            _, T = superpose(structA_sel, structB_sel)
            structA = applyTransformation(T, structA)
        except:
            structB_amap = sum(
                np.array(
                    mapping_func(structB_sel,
                                 structA_sel,
                                 overlap=coverage,
                                 seqid=seqid,
                                 pwalign=pwalign))[:, 0])
            _, T = superpose(structA_sel, structB_amap)
            structA = applyTransformation(T, structA)

        maxModes = kwargs.get('maxModes', self.maxModes)
        if not isinstance(maxModes, (int, float)):
            raise TypeError('maxModes should be an integer or float')
        if maxModes < 1:
            maxModes = int(maxModes * 3 * self.structA.numAtoms() - 6)
        if maxModes > 3 * self.structA.numAtoms() - 6:
            maxModes = 3 * self.structA.numAtoms() - 6

        if self.n_modes > maxModes:
            self.n_modes = maxModes

        trim = kwargs.pop('trim', self.trim)
        anmA, _ = calcENM(structA, n_modes=self.n_modes)

        if trim == 'slice':
            trim_anmA, _ = sliceModel(anmA, structA, reduceSelA)
        elif trim == 'reduce':
            trim_anmA, _ = reduceModel(anmA, structA, reduceSelA)
            trim_anmA.calcModes(n_modes=self.n_modes)
        else:
            trim_anmA = anmA

        coordsA = structA.getCoords()
        coordsA_sel = structA_sel.getCoords()
        coordsB_sel = structB_sel.getCoords()

        defvec = coordsB_sel - coordsA_sel
        d = defvec.flatten()
        self.dList.append(d)

        if Fmin is None:
            if self.numSteps == 0 or self.resetFmin:
                Fmin = 0.  # Select the first mode only
            else:
                Fmin = 1 - np.sqrt(
                    np.linalg.norm(self.dList[self.numSteps]) /
                    np.linalg.norm(self.dList[0]))

        if Fmin > self.Fmin_max:
            Fmin = self.Fmin_max

        LOGGER.info(
            'Fmin is {:4.3f}, corresponding to a cumulative overlap of {:4.3f}'
            .format(Fmin, np.sqrt(Fmin)))

        trim_d = sliceAtomicData(d, structA_sel, reduceSelA)
        overlaps = np.dot(trim_d, trim_anmA.getEigvecs())
        overlap_sorting_indices = list(
            reversed(list(np.argsort(abs(overlaps)))))
        overlaps = overlaps[overlap_sorting_indices]

        if trim == 'reduce':
            sliced_anmA, _ = sliceModel(anmA, structA, reduceSelA)
            modesetA = ModeSet(trim_anmA, overlap_sorting_indices)
            _, overlap_sorting_indices = matchModes(modesetA,
                                                    sliced_anmA,
                                                    index=True)

        modesetA = ModeSet(anmA, overlap_sorting_indices)

        normalised_overlaps = overlaps / np.linalg.norm(d)
        c_sq = np.cumsum(np.power(normalised_overlaps, 2), axis=0)

        modesCrossingFmin = np.where(c_sq <= Fmin)[0]
        numModes = len(modesCrossingFmin)
        if numModes == 0:
            numModes = 1
            modesCrossingFmin = [0]

        self.numModesList.append(numModes)

        if numModes == 1:
            LOGGER.info('Using 1 mode with overlap {0} (Mode {1})'.format(
                '{:4.3f}'.format(np.sqrt(c_sq[0])),
                modesetA.getIndices()[0] + 1))
        elif numModes < 11:
            LOGGER.info(
                'Using {0} modes with cumulative overlap {1} (Modes {2} and {3})'
                .format(
                    numModes, '{:4.3f}'.format(np.sqrt(c_sq[numModes - 1])),
                    ', '.join([
                        str(entry)
                        for entry in modesetA.getIndices()[:numModes - 1] + 1
                    ]), str(modesetA.getIndices()[numModes - 1] + 1)))
        else:
            LOGGER.info(
                'Using {0} modes with cumulative overlap {1} (Modes {2}, ... and {3}) with max mode number {4} and min mode number {5}'
                .format(
                    numModes, '{:4.3f}'.format(np.sqrt(c_sq[numModes - 1])),
                    ', '.join([
                        str(entry) for entry in modesetA.getIndices()[:10] + 1
                    ]), str(modesetA.getIndices()[numModes - 1] + 1),
                    np.max(modesetA.getIndices()[:numModes] + 1),
                    np.min(modesetA.getIndices()[:numModes] + 1)))

        if np.max(modesetA.getIndices()[:numModes]) > self.n_modes - 5:
            self.n_modes *= 10

        if self.n_modes > 3 * self.structA.numAtoms() - 6:
            self.n_modes = 3 * self.structA.numAtoms() - 6

        v = np.sum(np.multiply(overlaps[:numModes],
                               modesetA.getEigvecs()[:, :numModes]),
                   axis=1).reshape(coordsA.shape)

        trim_v = sliceAtomicData(v.reshape(-1), structA,
                                 reduceSelA).reshape(-1, 3)
        s_min = sum(np.multiply(trim_v.flatten(), trim_d)) / sum(
            np.power(trim_v.flatten(), 2))

        new_coordsA = coordsA + f * s_min * v

        if structA == self.structA:
            self.anmA = anmA
            self.anmListA.append(modesetA)
            self.structA.setCoords(new_coordsA)
            self.ensembleA.addCoordset(new_coordsA)
            self.whichModesA.append(modesetA[modesCrossingFmin])
        elif structA == self.structB:
            self.anmB = anmA
            self.anmListB.append(modesetA)
            self.structB.setCoords(new_coordsA)
            self.ensembleB.addCoordset(new_coordsA)
            self.whichModesB.append(modesetA[modesCrossingFmin])

        new_coordsA_reduceSel = structA.select(reduceSelA).getCoords()
        coordsB_reduceSel = structB.select(reduceSelB).getCoords()
        rmsd = calcRMSD(new_coordsA_reduceSel, coordsB_reduceSel)

        LOGGER.info('Current RMSD is {:4.3f}\n'.format(rmsd))

        self.numSteps += 1

        self.rmsds.append(rmsd)

        if outputPDB:
            writePDB(filename + '_A', self.ensembleA)
            LOGGER.clear()
            writePDB(filename + '_B', self.ensembleB)
            LOGGER.clear()

        if outputDCD:
            writeDCD(filename + '_A', self.ensembleA)
            LOGGER.clear()
            writeDCD(filename + '_B', self.ensembleB)
            LOGGER.clear()

        return
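The step keeps the smallest set of modes whose cumulative squared normalized overlap with the deformation vector stays below Fmin, always retaining at least one mode. The selection logic in isolation (our sketch):

import numpy as np

def select_modes(eigvecs, d, Fmin):
    """Pick mode indices by cumulative squared overlap with deformation d."""
    overlaps = np.dot(d, eigvecs)               # one overlap per mode
    order = np.argsort(np.abs(overlaps))[::-1]  # largest overlap first
    c_sq = np.cumsum((overlaps[order] / np.linalg.norm(d)) ** 2)
    chosen = np.where(c_sq <= Fmin)[0]
    return order[chosen] if len(chosen) else order[:1]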
Example #11
def calcPerturbResponse(model, atoms=None, repeats=100):
    """Return a matrix of profiles from scanning of the response of the
    structure to random perturbations at specific atom (or node) positions.
    The function implements the perturbation response scanning (PRS) method
    described in [CA09]_.  Rows of the matrix are the average magnitude of the
    responses obtained by perturbing the atom/node position at that row index,
    i.e. ``prs_profile[i,j]`` will give the response of residue/node *j* to
    perturbations in residue/node *i*.  PRS is performed using the covariance
    matrix from *model*, e.g. :class:`.ANM` instance.  Each residue/node is
    perturbed *repeats* times with a random unit force vector.  When *atoms*
    instance is given, PRS profile for residues will be added as an attribute
    which then can be retrieved as ``atoms.getData('prs_profile')``.  *model*
    and *atoms* must have the same number of atoms. *atoms* must be an
    :class:`.AtomGroup` instance.


    .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning
       Reveals Ligand Entry-Exit Mechanisms of Ferric Binding Protein.
       *PLoS Comput Biol* **2009** 5(10):e1000544.

    The PRS matrix can be saved as follows::

      prs_matrix = calcPerturbResponse(p38_anm)
      writeArray('prs_matrix.txt', prs_matrix, format='%8.6f', delimiter='\t')
    """

    if not isinstance(model, NMA):
        raise TypeError('model must be an NMA instance')
    elif not model.is3d():
        raise TypeError('model must be a 3-dimensional NMA instance')
    elif len(model) == 0:
        raise ValueError('model must have normal modes calculated')
    if atoms is not None:
        if not isinstance(atoms, AtomGroup):
            raise TypeError('atoms must be an AtomGroup instance')
        elif atoms.numAtoms() != model.numAtoms():
            raise ValueError('model and atoms must have the same number of '
                             'atoms')

    assert isinstance(repeats, int), 'repeats must be an integer'
    cov = calcCovariance(model)
    if cov is None:
        raise ValueError('model did not return a covariance matrix')

    n_atoms = model.numAtoms()
    response_matrix = np.zeros((n_atoms, n_atoms))
    LOGGER.progress('Calculating perturbation response', n_atoms, '_prody_prs')
    i3 = -3
    i3p3 = 0
    for i in range(n_atoms):
        i3 += 3
        i3p3 += 3
        forces = np.random.rand(repeats * 3).reshape((repeats, 3))
        forces /= ((forces**2).sum(1)**0.5).reshape((repeats, 1))
        for force in forces:
            response_matrix[i] += (
                np.dot(cov[:, i3:i3p3], force)
                ** 2).reshape((n_atoms, 3)).sum(1)
        LOGGER.update(i, '_prody_prs')

    response_matrix /= repeats
    LOGGER.clear()
    LOGGER.report('Perturbation response scanning completed in %.1fs.',
                  '_prody_prs')
    if atoms is not None:
        atoms.setData('prs_profile', response_matrix)
    return response_matrix

    # NOTE: everything below is unreachable because of the return above;
    # it is kept only as a sketch of saving and normalizing the PRS matrix.
    # save the original PRS matrix
    np.savetxt('orig_PRS_matrix', response_matrix, delimiter='\t', fmt='%8.6f')
    # calculate the normalized PRS matrix, using self displacement
    # (the diagonal of the original matrix) as the normalization factor
    self_dp = np.diag(response_matrix)
    self_dp = self_dp.reshape(n_atoms, 1)
    norm_PRS_mat = response_matrix / np.repeat(self_dp, n_atoms, axis=1)
    # suppress the diagonal (self displacement) to facilitate
    # visualizing the response profile
    norm_PRS_mat = norm_PRS_mat - np.diag(np.diag(norm_PRS_mat))
    np.savetxt('norm_PRS_matrix', norm_PRS_mat, delimiter='\t', fmt='%8.6f')
    return response_matrix
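Each perturbation draws `repeats` random force vectors and normalizes each row to unit length, exactly as in the loop above (a standalone restatement of that step):

import numpy as np

repeats = 100
forces = np.random.rand(repeats * 3).reshape((repeats, 3))
forces /= ((forces ** 2).sum(1) ** 0.5).reshape((repeats, 1))  # unit rows
assert np.allclose((forces ** 2).sum(1), 1.0)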
Example #12
File: dali.py Project: creageng/ProDy
 def getRecord(self, url, localFile=False):
     if localFile:
         dali_file = open(url, 'r')
         data = dali_file.read()
         dali_file.close()
     else:
         sleep = 2
         timeout = 120
         LOGGER.timeit('_dali')
         log_message = ''
         try_error = 3
         while True:
             LOGGER.sleep(int(sleep), 'to reconnect Dali '+log_message)
             LOGGER.clear()
             LOGGER.write('Connecting Dali for search results...')
             LOGGER.clear()
             try:
                 html = urllib2.urlopen(url).read()
             except:
                 try_error -= 1
                 if try_error >= 0:
                     LOGGER.sleep(2, '. Connection error happened. Trying to reconnect...')
                     continue
                 else:
                     html = urllib2.urlopen(url).read()
             if html.find('Status: Queued') > -1:
                 log_message = '(Dali searching is queued)...'
             elif html.find('Status: Running') > -1:
                 log_message = '(Dali searching is running)...'
             elif html.find('Your job') == -1 and html.find('.txt') > -1:
                 break
             elif html.find('ERROR:') > -1:
                 LOGGER.warn(': Dali search reported an ERROR!')
                 return None
             sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
             if LOGGER.timing('_dali') > timeout:
                 LOGGER.warn(': Dali search has timed out. \nThe results can be obtained later using the getRecord() method.')
                 return None
             LOGGER.clear()
         LOGGER.clear()
         LOGGER.report('Dali results completed in %.1fs.', '_dali')
         lines = html.strip().split('\n')
         file_name = re.search(r'=.+-90\.txt', html).group()[1:]
         file_name = file_name[:-7]
         # LOGGER.info(url+file_name+self._subset+'.txt')
         data = urllib2.urlopen(url+file_name+self._subset+'.txt').read()
         temp_name = file_name+self._subset+'_dali.txt'
         with open(temp_name, "w") as file_temp:
             file_temp.write(html + '\n' + url+file_name + '\n' + data)
         # with open(temp_name, "a+") as file_temp: file_temp.write(url+file_name + '\n' + data)
     data_list = data.strip().split('# ')
     # No:  Chain   Z    rmsd lali nres  %id PDB  Description -> data_list[3]
     # Structural equivalences -> data_list[4]
     # Translation-rotation matrices -> data_list[5]
     map_temp_dict = dict()
     mapping = []
     lines = data_list[4].strip().split('\n')
     self._lines_4 = lines
     mapping_temp = np.genfromtxt(lines[1:], delimiter = (4,1,14,6,2,4,4,5,2,4,4,3,5,4,3,5,6,3,5,4,3,5,28), usecols = [0,3,5,7,9,12,15,15,18,21], dtype='|i4')
     # [0,3,5,7,9,12,15,15,18,21] -> [index, residue_a, residue_b, residue_i_a, residue_i_b, resid_a, resid_b, resid_i_a, resid_i_b]
     for map_i in mapping_temp:
         if not map_i[0] in map_temp_dict:
             map_temp_dict[map_i[0]] = [[map_i[1], map_i[2], map_i[3], map_i[4]]]
         else:
             map_temp_dict[map_i[0]].append([map_i[1], map_i[2], map_i[3], map_i[4]])
     self._max_index = max(mapping_temp[:,2])
     self._mapping = map_temp_dict
     self._data = data_list[3]
     lines = data_list[3].strip().split('\n')
     daliInfo = np.genfromtxt(lines[1:], delimiter = (4,3,6,5,5,5,6,5,57), usecols = [0,2,3,4,5,6,7,8], dtype=[('id', '<i4'), ('pdb_chain', '|S6'), ('Z', '<f4'), ('rmsd', '<f4'), ('len_align', '<i4'), ('res_num', '<i4'), ('identity', '<i4'), ('title', '|S70')])
     if daliInfo.ndim == 0:
         daliInfo = np.array([daliInfo])
     pdbListAll = []
     self._daliInfo = daliInfo
     dali_temp_dict = dict()
     for temp in self._daliInfo:
         temp_dict = dict()
         pdb_chain = temp[1].strip()[0:6]
         temp_dict['pdbId'] = pdb_chain[0:4]
         temp_dict['chainId'] = pdb_chain[5:6]
         temp_dict['pdb_chain'] = pdb_chain
         temp_dict['Z'] = temp[2]
         temp_dict['rmsd'] = temp[3]
         temp_dict['len_align'] = temp[4]
         temp_dict['res_num'] = temp[5]
         temp_dict['identity'] = temp[6]
         temp_dict['mapping'] = (np.array(map_temp_dict[temp[0]])-1).tolist()
         temp_dict['map_ref'] = [x for map_i in (np.array(map_temp_dict[temp[0]])-1).tolist() for x in range(map_i[0], map_i[1]+1)]
         temp_dict['map_sel'] = [x for map_i in (np.array(map_temp_dict[temp[0]])-1).tolist() for x in range(map_i[2], map_i[3]+1)]
         dali_temp_dict[temp_dict['pdb_chain']] = temp_dict
         pdbListAll.append(pdb_chain)
     self._pdbListAll = tuple(pdbListAll)
     self._pdbList = self._pdbListAll
     self._alignPDB = dali_temp_dict
      LOGGER.info(str(len(pdbListAll)) + ' Dali results were retrieved.')
     return True
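np.genfromtxt parses Dali's fixed-width report because delimiter is given as a tuple of column widths; a toy demonstration with invented rows and simplified widths:

import numpy as np

lines = ['   1 1abcA   45.2  1.3  120  130',
         '   2 2xyzB   12.8  2.7   98  101']
rec = np.genfromtxt(lines, delimiter=(4, 1, 6, 6, 5, 5, 5),
                    usecols=[0, 2, 3, 4, 5, 6],
                    dtype=[('id', '<i4'), ('pdb_chain', '|U6'), ('Z', '<f4'),
                           ('rmsd', '<f4'), ('len_align', '<i4'),
                           ('nres', '<i4')])
print(rec['pdb_chain'], rec['Z'])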
Example #13
File: pca.py Project: njekin/ProDy
    def buildCovariance(self, coordsets, **kwargs):
        """Build a covariance matrix for *coordsets* using mean coordinates
        as the reference.  *coordsets* argument may be one of the following:

        * :class:`.Atomic`
        * :class:`.Ensemble`
        * :class:`.TrajBase`
        * :class:`numpy.ndarray` with shape ``(n_csets, n_atoms, 3)``

        For ensemble and trajectory objects, ``update_coords=True`` argument
        can be used to set the mean coordinates as the coordinates of the
        object.

        When *coordsets* is a trajectory object, such as :class:`.DCDFile`,
        covariance will be built by superposing frames onto the reference
        coordinate set (see :meth:`.Frame.superpose`).  If frames are already
        aligned, use ``aligned=True`` argument to skip this step.


        .. note::
           If *coordsets* is a :class:`.PDBEnsemble` instance, coordinates are
           treated specially.  Let's say **C**\_ij is the element of the
           covariance matrix that corresponds to atoms *i* and *j*.  This
           super element is divided by number of coordinate sets (PDB models or
           structures) in which both of these atoms are observed together."""

        if not isinstance(coordsets, (Ensemble, Atomic, TrajBase, np.ndarray)):
            raise TypeError('coordsets must be an Ensemble, Atomic, TrajBase, '
                            'or Numpy array instance')
        LOGGER.timeit('_prody_pca')
        mean = None
        weights = None
        ensemble = None
        if isinstance(coordsets, np.ndarray):
            if (coordsets.ndim != 3 or coordsets.shape[2] != 3 or
                    coordsets.dtype not in (np.float32, float)):
                raise ValueError('coordsets is not a valid coordinate array')
        elif isinstance(coordsets, Atomic):
            coordsets = coordsets._getCoordsets()
        elif isinstance(coordsets, Ensemble):
            ensemble = coordsets
            if isinstance(coordsets, PDBEnsemble):
                weights = coordsets.getWeights() > 0
            coordsets = coordsets._getCoordsets()

        update_coords = bool(kwargs.get('update_coords', False))

        if isinstance(coordsets, TrajBase):
            nfi = coordsets.nextIndex()
            coordsets.reset()
            n_atoms = coordsets.numSelected()
            dof = n_atoms * 3
            cov = np.zeros((dof, dof))
            #mean = coordsets._getCoords().flatten()
            n_confs = 0
            n_frames = len(coordsets)
            LOGGER.info('Covariance will be calculated using {0} frames.'
                        .format(n_frames))
            coordsum = np.zeros(dof)
            LOGGER.progress('Building covariance', n_frames, '_prody_pca')
            align = not kwargs.get('aligned', False)
            for frame in coordsets:
                if align:
                    frame.superpose()
                coords = frame._getCoords().flatten()
                coordsum += coords
                cov += np.outer(coords, coords)
                n_confs += 1
                LOGGER.update(n_confs, '_prody_pca')
            LOGGER.clear()
            cov /= n_confs
            coordsum /= n_confs
            mean = coordsum
            cov -= np.outer(coordsum, coordsum)
            coordsets.goto(nfi)
            self._cov = cov
            if update_coords:
                coordsets.setCoords(mean.reshape((n_atoms, 3)))
        else:
            n_confs = coordsets.shape[0]
            if n_confs < 3:
                raise ValueError('coordsets must have at least 3 coordinate '
                                 'sets')
            n_atoms = coordsets.shape[1]
            if n_atoms < 3:
                raise ValueError('coordsets must have at least 3 atoms')
            dof = n_atoms * 3
            LOGGER.info('Covariance is calculated using {0} coordinate sets.'
                        .format(len(coordsets)))
            s = (n_confs, dof)
            if weights is None:
                if coordsets.dtype == float:
                    self._cov = np.cov(coordsets.reshape((n_confs, dof)).T,
                                       bias=1)
                else:
                    cov = np.zeros((dof, dof))
                    coordsets = coordsets.reshape((n_confs, dof))
                    mean = coordsets.mean(0)
                    LOGGER.progress('Building covariance', n_confs,
                                    '_prody_pca')
                    for i, coords in enumerate(coordsets.reshape(s)):
                        deviations = coords - mean
                        cov += np.outer(deviations, deviations)
                        LOGGER.update(i, '_prody_pca')
                    LOGGER.clear()
                    cov /= n_confs
                    self._cov = cov
            else:
                # PDB ensemble case
                mean = np.zeros((n_atoms, 3))
                for i, coords in enumerate(coordsets):
                    mean += coords * weights[i]
                mean /= weights.sum(0)
                d_xyz = ((coordsets - mean) * weights).reshape(s)
                divide_by = weights.astype(float).repeat(3, axis=2).reshape(s)
                self._cov = np.dot(d_xyz.T, d_xyz) / np.dot(divide_by.T,
                                                            divide_by)
            if update_coords and ensemble is not None:
                if mean is None:
                    mean = coordsets.mean(0)
                ensemble.setCoords(mean)

        self._trace = self._cov.trace()
        self._dof = dof
        self._n_atoms = n_atoms
        LOGGER.report('Covariance matrix calculated in %.2fs.', '_prody_pca')
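
A minimal usage sketch for the method above (the PDB entry 2k39 and the
ensemble setup are illustrative assumptions, following the usual ProDy PCA
workflow)::

    from prody import parsePDB, Ensemble, PCA

    ubi = parsePDB('2k39', subset='calpha')    # NMR ensemble, CA atoms only
    ensemble = Ensemble('2k39 ensemble')
    ensemble.setCoords(ubi.getCoords())        # reference coordinates
    ensemble.addCoordset(ubi.getCoordsets())   # all models as coordinate sets
    ensemble.iterpose()                        # superpose onto mean coordinates
    pca = PCA('2k39')
    pca.buildCovariance(ensemble)              # the method shown above
    pca.calcModes()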
Example #17
0
def calcPerturbResponse(model, atoms=None, repeats=100, **kwargs):
    """Returns a matrix of profiles from scanning of the response of the
    structure to random perturbations at specific atom (or node) positions.
    The function implements the perturbation response scanning (PRS) method
    described in [CA09]_.  Rows of the matrix are the average magnitude of the
    responses obtained by perturbing the atom/node position at that row index,
    i.e. ``prs_profile[i,j]`` will give the response of residue/node *j* to
    perturbations in residue/node *i*.  PRS is performed using the covariance
    matrix from *model*, e.g. :class:`.ANM` instance.  Each residue/node is
    perturbed *repeats* times with a random unit force vector.  When *atoms*
    instance is given, PRS profile for residues will be added as an attribute
    which then can be retrieved as ``atoms.getData('prs_profile')``.  *model*
    and *atoms* must have the same number of atoms. *atoms* must be an
    :class:`.AtomGroup` instance.

    .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning
       Reveals Ligand Entry-Exit Mechanisms of Ferric Binding Protein.
       *PLoS Comput Biol* **2009** 5(10):e1000544.

    The PRS matrix can be calculated and saved as follows::

      prs_matrix = calcPerturbResponse(p38_anm, saveMatrix=True)

    The PRS matrix can also be saved later as follows::

      writeArray('prs_matrix.txt', prs_matrix, format='%8.6f', delimiter='\t')

    You can also control which operation is used for getting a single matrix
    from the repeated force application and whether to normalise the matrix
    at the end. If you do choose to normalise the matrix, you can still save
    the original matrix before normalisation as well.

    :arg operation: the operation used to reduce the repeats to a single
        response matrix: the mean, variance, max or min of the set of repeats.
        Another option is to select elements showing the biggest difference
        from the square sum of the covariance matrix ('dif'). The default is
        the mean. To obtain all response matrices, set ``operation=None``.
        You can also ask for 'all' operations or provide a list containing
        any set of them.
    :type operation: str or list

    :arg noForce: whether to use the covariance matrix directly rather
        than applying forces. This appears to be equivalent when scanning for
        response magnitudes and will be much quicker. Default is True.
    :type noForce: bool

    :arg normMatrix: whether to normalise the single response matrix by
        dividing each row by its diagonal element. Default is False;
        we recommend True.
    :type normMatrix: bool

    :arg saveMatrix: whether to save the last matrix generated to a text file.
        Default is False
    :type saveMatrix: bool

    :arg saveOrig: whether to save the original matrix despite normalisation.
        This is the same as saveMatrix when not normalizing. Default is False
    :type saveOrig: bool

    :arg baseSaveName: The central part of the file name for saved
        matrices, which you can set. This is surrounded by underscores. 
        The beginning says orig or norm and the end says which operation 
        was used. Default is 'response_matrix'.
    :type baseSaveName: str

    :arg acceptDirection: select reference direction for forces to be accepted.
        Can be 'in' (towards center of atoms), 'out' (away from center),
        or 'all'. Default is 'all'; using other directions requires *atoms*.
    :type acceptDirection: str
    """
    noForce = kwargs.get('noForce', True)
    operationList = []
    if not noForce:
        operation = kwargs.get('operation', 'mea')

        if operation is not None:
            if type(operation) is str:
                if operation == 'all' or operation == 'all operations':
                    operationList = ['var', 'mea', 'max', 'min', 'dif']
                else:
                    operationList = []
                    operationList.append(operation.lower()[:3])
            elif type(operation) is list:
                operationList = operation
                for i in range(len(operationList)):
                    operationList[i] = operationList[i].lower()[:3]

            operationList = np.array(operationList)

            valid_operations = ('var', 'mea', 'max', 'min', 'dif')
            if not any(op in operationList for op in valid_operations):
                raise ValueError('Operation should be mean, variance, max, min '
                                 'or difference (from covariance matrix) in '
                                 'quotes, or a list containing a set of these, '
                                 'or None.')

    if not isinstance(model, NMA):
        raise TypeError('model must be an NMA instance')
    elif not model.is3d() and not noForce:
        raise TypeError('model must be a 3-dimensional NMA instance '
                        'for using PRS with force')
    elif len(model) == 0:
        raise ValueError('model must have normal modes calculated')

    if atoms is not None:
        if isinstance(atoms, Selection):
            atoms = atoms.copy()
        if not isinstance(atoms, AtomGroup):
            raise TypeError('atoms must be an AtomGroup instance')
        elif atoms.numAtoms() != model.numAtoms():
            raise ValueError('model and atoms must have the same number of atoms')

    n_atoms = model.numAtoms()
    LOGGER.timeit('_prody_prs_all')
    LOGGER.info('Calculating covariance matrix')
    LOGGER.timeit('_prody_cov')

    assert isinstance(repeats, int), 'repeats must be an integer'
    cov = calcCovariance(model)
    if cov is None:
        raise ValueError('model did not return a covariance matrix')

    LOGGER.clear()
    LOGGER.report('Covariance matrix calculated in %.1fs.', '_prody_cov')

    LOGGER.progress('Calculating perturbation response', n_atoms,
                    '_prody_prs_mat')
    matrix_dict = {}

    if noForce or 'dif' in operationList:
        if not model.is3d():
            n_by_n_cov_squared = cov**2

        else:
            cov_squared = cov**2
            n_by_3n_cov_squared = np.zeros((n_atoms, 3 * n_atoms))
            n_by_n_cov_squared = np.zeros((n_atoms, n_atoms))
            i3 = -3
            i3p3 = 0
            for i in range(n_atoms):
                i3 += 3
                i3p3 += 3
                n_by_3n_cov_squared[i, :] = (cov_squared[i3:i3p3, :]).sum(0)

            j3 = -3
            j3p3 = 0
            for j in range(n_atoms):
                j3 += 3
                j3p3 += 3
                n_by_n_cov_squared[:,
                                   j] = (n_by_3n_cov_squared[:,
                                                             j3:j3p3]).sum(1)

    if noForce:
        matrix_dict['noForce'] = n_by_n_cov_squared
        LOGGER.clear()
        LOGGER.report('Perturbation response matrix calculated in %.1fs.',
                      '_prody_prs_mat')

    else:

        acceptDirection = kwargs.get('acceptDirection', 'all')
        if acceptDirection != 'all':
            if atoms is None:
                acceptDirection = 'all'
                LOGGER.info('A specific direction for accepting forces was' \
                            ' provided without an atoms object. This' \
                            ' direction will be ignored and all forces will' \
                            ' be accepted.')
            else:
                coords = atoms.getCoords()
                atoms_center = coords.mean(axis=0)

        mag = kwargs.get('mag', 1)
        response_matrix = np.zeros((repeats, n_atoms, n_atoms))
        i3 = -3
        i3p3 = 0
        for i in range(n_atoms):
            i3 += 3
            i3p3 += 3
            forces = np.random.randn(repeats * 3).reshape((repeats, 3))
            forces /= ((forces**2).sum(1)**0.5).reshape((repeats, 1))
            forces *= mag
            for n in range(repeats):
                force = forces[n]

                if acceptDirection == 'in' or acceptDirection == 'out':
                    res_coords = atoms.getCoords()[i]
                    vec_to_center = atoms_center - res_coords
                    vec_to_center /= (((atoms_center -
                                        res_coords)**2).sum()**0.5)
                    force_overlap = np.dot(force, vec_to_center)

                    if acceptDirection == 'in' and force_overlap < 0:
                        force *= -1

                    if acceptDirection == 'out' and force_overlap > 0:
                        force *= -1

                response_matrix[n, i, :] = (np.dot(cov[:, i3:i3p3],
                                                   force)**2).reshape(
                                                       (n_atoms, 3)).sum(1)
            LOGGER.update(i, '_prody_prs_mat')

        LOGGER.clear()
        LOGGER.report(
            'Perturbation response scanning matrix calculated in %.1fs.',
            '_prody_prs_mat')

        LOGGER.progress('Performing matrix combination operations', n_atoms, \
                        '_prody_prs_ops')

        if 'var' in operationList:
            matrix_dict['var'] = np.var(response_matrix, axis=0)

        if 'max' in operationList:
            matrix_dict['max'] = np.amax(response_matrix, axis=0)

        if 'mea' in operationList:
            matrix_dict['mea'] = np.mean(response_matrix, axis=0)

        if 'min' in operationList:
            matrix_dict['min'] = np.amin(response_matrix, axis=0)

        if 'dif' in operationList:
            matrix_dict['dif'] = np.max(np.abs(response_matrix -
                                               n_by_n_cov_squared), axis=0)

        LOGGER.report(
            'Perturbation response matrix operations completed in %.1fs.',
            '_prody_prs_ops')

        if operation is None:
            LOGGER.info('Operation is None so all {0} repeats are output.' \
                        ' This is not compatible with saving, normalizing' \
                        ' or mapping to atoms at present.'.format(repeats))
            return response_matrix

    if atoms is not None:
        atoms.setData('prs_profile', matrix_dict[list(matrix_dict.keys())[0]])
        if len(list(matrix_dict.keys())) > 1:
            LOGGER.info('Only one matrix can be added as data to atoms, so' \
                        ' the first one was chosen. The operation that generated' \
                        ' it was {0} (first 3 letters).'.format(list(matrix_dict.keys())[0]))

    saveOrig = kwargs.get('saveOrig', False)
    saveMatrix = kwargs.get('saveMatrix', False)
    normMatrix = kwargs.get('normMatrix', False)
    suppressDiag = kwargs.get('suppressDiag', False)
    baseSaveName = kwargs.get('baseSaveName', 'response_matrix')

    if saveOrig or (saveMatrix and not normMatrix):
        # save the original PRS matrix for each operation
        for m in list(matrix_dict.keys()):
            np.savetxt('orig_{0}_{1}.txt'.format(baseSaveName,m), \
                       matrix_dict[m], delimiter='\t', fmt='%8.6f')

    if normMatrix:
        norm_PRS_mat = {}
        # calculate the normalized PRS matrix for each operation, using the
        # self displacement (the diagonal of the original matrix) as the
        # normalization factor
        for m in list(matrix_dict.keys()):
            self_dp = np.diag(matrix_dict[m])
            self_dp = self_dp.reshape(n_atoms, 1)
            norm_PRS_mat[m] = matrix_dict[m] / np.repeat(
                self_dp, n_atoms, axis=1)

            if suppressDiag:
                # suppress the diagonal (self displacement) to facilitate
                # visualizing the response profile
                norm_PRS_mat[m] = norm_PRS_mat[m] - np.diag(
                    np.diag(norm_PRS_mat[m]))

            if saveMatrix:
                np.savetxt('norm_{0}_{1}.txt'.format(baseSaveName,m), \
                           norm_PRS_mat[m], delimiter='\t', fmt='%8.6f')

    LOGGER.report('Perturbation response scanning completed in %.1fs.',
                  '_prody_prs_all')

    matrix_list = []
    for m in list(matrix_dict.keys()):
        if normMatrix:
            matrix_list.append(norm_PRS_mat[m])
        else:
            matrix_list.append(matrix_dict[m])
    matrix_array = array(matrix_list)

    returnFormat = kwargs.get('returnFormat', 'array')
    returnFormat = returnFormat.lower()

    if len(matrix_array) == 1:
        LOGGER.info('Output has been returned as a single matrix (an array).')
        return matrix_array.reshape(n_atoms, n_atoms)

    if returnFormat == 'both':
        LOGGER.info('You have requested return in both formats.' \
                    ' Array comes first.')
        return matrix_array, matrix_dict
    elif 'dict' in returnFormat:
        LOGGER.info('Output has been returned as a dictionary of matrices.')
        return matrix_dict
    else:
        LOGGER.info('Output has been returned as an array of matrices,' \
                    ' which you can split into individual matrices.')
        return matrix_array
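
A hedged usage sketch for this variant (``p38_anm`` is an assumed,
pre-calculated :class:`.ANM` instance; the keywords follow the docstring
above)::

    # force-based PRS reduced by two operations, returned as a dict
    matrices = calcPerturbResponse(p38_anm, repeats=50, noForce=False,
                                   operation=['mean', 'variance'],
                                   returnFormat='dict')
    mean_matrix = matrices['mea']   # keys are the first 3 letters of each operation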
Example #18
0
def blastPDB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching *sequence* against the PDB using NCBI blastp.

    :arg sequence: an object with an associated sequence string 
         or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is multiplied by 1.5 when results are not ready.  
    *timeout* (default is 120 s) determines when to give up waiting for the results.
    """

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')

    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()

    elif isinstance(sequence, Sequence):
        sequence = str(sequence)

    elif isinstance(sequence, str):
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())

    else:
        raise TypeError('sequence must be Atomic, Sequence, or str not {0}'
                        .format(type(sequence)))

    headers = {'User-agent': 'ProDy'}
    query = [('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'),
             ('PROGRAM', 'blastp'),]

    expect = float(kwargs.pop('expect', 1e-10))
    if expect <= 0:
        raise ValueError('expect must be a positive number')
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    if hitlist_size <= 0:
        raise ValueError('hitlist_size must be a positive integer')
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'
                .format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b'RID =')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find(b'\n', index)
        rid = html[index + len('RID ='):last].strip()

    index = html.find(b'RTOE =')
    if index == -1:
        rtoe = None # This is not used
    else:
        last = html.find(b'\n', index)
        rtoe = int(html[index + len('RTOE ='):last].strip())

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect NCBI for search results.')
        LOGGER.write('Connecting to NCBI for search results...')
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b'Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b'\n', index)
        status = results[index+len('Status='):last].strip()
        if status.upper() == b'READY':
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None
    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')

    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        out = open(filename, 'wb')
        out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

    return PDBBlastRecord(results, sequence)
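
A short usage sketch (requires network access; ``getBest`` is the standard
:class:`PDBBlastRecord` accessor)::

    record = blastPDB('1p38')       # 4-6 character IDs are fetched with parsePDB
    if record is not None:          # None is returned on timeout
        best = record.getBest()
        print(best['pdb_id'], best['percent_identity'])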
Example #19
0
def calcPerturbResponse(model, **kwargs):
    """Returns a matrix of profiles from scanning the response of the
    structure to random perturbations at specific atom (or node) positions.
    The function implements the perturbation response scanning (PRS) method
    described in [CA09]_.  Rows of the matrix are the average magnitude of the
    responses obtained by perturbing the atom/node position at that row index,
    i.e. ``prs_profile[i,j]`` will give the response of residue/node *j* to
    perturbations in residue/node *i*.  PRS is performed using the covariance
    matrix from *model*, e.g. :class:`.ANM` instance.

    When an *atoms* instance is given, the PRS matrix will be added as data, 
    which can be retrieved with ``atoms.getData('prs_matrix')``.  

    *model* and *atoms* must have the same number of atoms. *atoms* must be an
    :class:`.AtomGroup` instance. 

    .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning
       Reveals Ligand Entry-Exit Mechanisms of Ferric Binding Protein.
       *PLoS Comput Biol* **2009** 5(10):e1000544.

    The PRS matrix can be calculated and saved as follows::

      prs_matrix = calcPerturbResponse(p38_anm, saveMatrix=True)
      
    The PRS matrix can also be saved later as follows::
    
      writeArray('prs_matrix.txt', prs_matrix, format='%8.6f', delimiter='\t')

    :arg saveMatrix: whether to save the last matrix generated to a text file.
        Default is False
    :type saveMatrix: bool

    :arg saveName: The file name for saved matrices
        Default is 'response_matrix.txt'.
    :type saveName: str
    """

    if not isinstance(model, (NMA, ModeSet, Mode)):
        raise TypeError('model must be an NMA, ModeSet, or Mode instance')

    if isinstance(model, NMA) and len(model) == 0:
        raise ValueError('model must have normal modes calculated')

    atoms = kwargs.get('atoms', None)
    if atoms is not None:
        if isinstance(atoms, Selection):
            atoms = atoms.copy()
        if not isinstance(atoms, AtomGroup):
            raise TypeError('atoms must be an AtomGroup instance')
        elif atoms.numAtoms() != model.numAtoms():
            raise ValueError('model and atoms must have the same number of atoms')

    n_atoms = model.numAtoms()
    LOGGER.timeit('_prody_prs_all')
    LOGGER.info('Calculating covariance matrix')
    LOGGER.timeit('_prody_cov')

    cov = calcCovariance(model)
    if cov is None:
        raise ValueError('model did not return a covariance matrix')

    LOGGER.clear()
    LOGGER.report('Covariance matrix calculated in %.1fs.', '_prody_cov')

    LOGGER.progress('Calculating perturbation response', n_atoms,
                    '_prody_prs_mat')

    if not model.is3d():
        prs_matrix = cov**2

    else:
        cov_squared = cov**2
        n_by_3n_cov_squared = np.zeros((n_atoms, 3 * n_atoms))
        prs_matrix = np.zeros((n_atoms, n_atoms))
        i3 = -3
        i3p3 = 0
        for i in range(n_atoms):
            i3 += 3
            i3p3 += 3
            n_by_3n_cov_squared[i, :] = (cov_squared[i3:i3p3, :]).sum(0)

        j3 = -3
        j3p3 = 0
        for j in range(n_atoms):
            j3 += 3
            j3p3 += 3
            prs_matrix[:, j] = (n_by_3n_cov_squared[:, j3:j3p3]).sum(1)

    LOGGER.clear()
    LOGGER.report('Perturbation response matrix calculated in %.1fs.',
                  '_prody_prs_mat')

    saveMatrix = kwargs.get('saveMatrix', False)
    suppressDiag = kwargs.get('suppressDiag', False)
    saveName = kwargs.get('saveName', 'response_matrix.txt')

    norm_prs_matrix = np.zeros((n_atoms, n_atoms))
    self_dp = np.diag(prs_matrix)
    self_dp = self_dp.reshape(n_atoms, 1)
    norm_prs_matrix = prs_matrix / np.repeat(self_dp, n_atoms, axis=1)

    if suppressDiag:
        # suppress the diagonal (self displacement) to facilitate
        # visualizing the response profile
        norm_prs_matrix = norm_prs_matrix - np.diag(np.diag(norm_prs_matrix))

    if saveMatrix:
        np.savetxt(saveName, norm_prs_matrix, delimiter='\t', fmt='%8.6f')

    LOGGER.report('Perturbation response scanning completed in %.1fs.',
                  '_prody_prs_all')

    if atoms is not None:
        atoms.setData('prs_matrix', norm_prs_matrix)
        return atoms, norm_prs_matrix
    else:
        return norm_prs_matrix
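
A sketch of the call described in the docstring (``p38_anm`` is an assumed
:class:`.ANM` instance)::

    prs_matrix = calcPerturbResponse(p38_anm, saveMatrix=True,
                                     saveName='p38_prs_matrix.txt',
                                     suppressDiag=True)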
Example #20
0
    def buildCovariance(self, coordsets, **kwargs):
        """Build a covariance matrix for *coordsets* using mean coordinates
        as the reference.  *coordsets* argument may be one of the following:

        * :class:`.Atomic`
        * :class:`.Ensemble`
        * :class:`.TrajBase`
        * :class:`numpy.ndarray` with shape ``(n_csets, n_atoms, 3)``

        For ensemble and trajectory objects, ``update_coords=True`` argument
        can be used to set the mean coordinates as the coordinates of the
        object.

        When *coordsets* is a trajectory object, such as :class:`.DCDFile`,
        covariance will be built by superposing frames onto the reference
        coordinate set (see :meth:`.Frame.superpose`).  If frames are already
        aligned, use ``aligned=True`` argument to skip this step.


        .. note::
           If *coordsets* is a :class:`.PDBEnsemble` instance, coordinates are
           treated specially.  Let's say **C**\_ij is the element of the
           covariance matrix that corresponds to atoms *i* and *j*.  This
           super element is divided by number of coordinate sets (PDB models or
           structures) in which both of these atoms are observed together."""

        if not isinstance(coordsets, (Ensemble, Atomic, TrajBase, np.ndarray)):
            raise TypeError('coordsets must be an Ensemble, Atomic, TrajBase, '
                            'or NumPy array instance')
        LOGGER.timeit('_prody_pca')
        mean = None
        weights = None
        ensemble = None
        if isinstance(coordsets, np.ndarray):
            if (coordsets.ndim != 3 or coordsets.shape[2] != 3
                    or coordsets.dtype not in (np.float32, float)):
                raise ValueError('coordsets is not a valid coordinate array')
        elif isinstance(coordsets, Atomic):
            coordsets = coordsets._getCoordsets()
        elif isinstance(coordsets, Ensemble):
            ensemble = coordsets
            if isinstance(coordsets, PDBEnsemble):
                weights = coordsets.getWeights() > 0
            coordsets = coordsets._getCoordsets()

        update_coords = bool(kwargs.get('update_coords', False))

        if isinstance(coordsets, TrajBase):
            nfi = coordsets.nextIndex()
            coordsets.reset()
            n_atoms = coordsets.numSelected()
            dof = n_atoms * 3
            cov = np.zeros((dof, dof))
            #mean = coordsets._getCoords().flatten()
            n_confs = 0
            n_frames = len(coordsets)
            LOGGER.info(
                'Covariance will be calculated using {0} frames.'.format(
                    n_frames))
            coordsum = np.zeros(dof)
            LOGGER.progress('Building covariance', n_frames, '_prody_pca')
            align = not kwargs.get('aligned', False)
            for frame in coordsets:
                if align:
                    frame.superpose()
                coords = frame._getCoords().flatten()
                coordsum += coords
                cov += np.outer(coords, coords)
                n_confs += 1
                LOGGER.update(n_confs, '_prody_pca')
            LOGGER.clear()
            cov /= n_confs
            coordsum /= n_confs
            mean = coordsum
            cov -= np.outer(coordsum, coordsum)
            coordsets.goto(nfi)
            self._cov = cov
            if update_coords:
                coordsets.setCoords(mean.reshape((n_atoms, 3)))
        else:
            n_confs = coordsets.shape[0]
            if n_confs < 3:
                raise ValueError('coordsets must have at least 3 coordinate '
                                 'sets')
            n_atoms = coordsets.shape[1]
            if n_atoms < 3:
                raise ValueError('coordsets must have at least 3 atoms')
            dof = n_atoms * 3
            LOGGER.info(
                'Covariance is calculated using {0} coordinate sets.'.format(
                    len(coordsets)))
            s = (n_confs, dof)
            if weights is None:
                if coordsets.dtype == float:
                    self._cov = np.cov(coordsets.reshape((n_confs, dof)).T,
                                       bias=1)
                else:
                    cov = np.zeros((dof, dof))
                    coordsets = coordsets.reshape((n_confs, dof))
                    mean = coordsets.mean(0)
                    LOGGER.progress('Building covariance', n_confs,
                                    '_prody_pca')
                    for i, coords in enumerate(coordsets.reshape(s)):
                        deviations = coords - mean
                        cov += np.outer(deviations, deviations)
                        LOGGER.update(i, '_prody_pca')
                    LOGGER.clear()
                    cov /= n_confs
                    self._cov = cov
            else:
                # PDB ensemble case
                mean = np.zeros((n_atoms, 3))
                for i, coords in enumerate(coordsets):
                    mean += coords * weights[i]
                mean /= weights.sum(0)
                d_xyz = ((coordsets - mean) * weights).reshape(s)
                divide_by = weights.astype(float).repeat(3, axis=2).reshape(s)
                self._cov = np.dot(d_xyz.T, d_xyz) / np.dot(
                    divide_by.T, divide_by)
            if update_coords and ensemble is not None:
                if mean is None:
                    mean = coordsets.mean(0)
                ensemble.setCoords(mean)

        self._trace = self._cov.trace()
        self._dof = dof
        self._n_atoms = n_atoms
        LOGGER.report('Covariance matrix calculated in %.2fs.', '_prody_pca')
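
The PDB-ensemble branch above can be checked in isolation; here is a
self-contained NumPy sketch with made-up shapes that reproduces the
pairwise-observation normalization described in the note::

    import numpy as np

    np.random.seed(0)
    n_confs, n_atoms = 5, 4
    coordsets = np.random.rand(n_confs, n_atoms, 3)
    weights = np.random.rand(n_confs, n_atoms, 1) > 0.1   # observed atoms

    mean = (coordsets * weights).sum(0) / weights.sum(0)
    s = (n_confs, n_atoms * 3)
    d_xyz = ((coordsets - mean) * weights).reshape(s)
    divide_by = weights.astype(float).repeat(3, axis=2).reshape(s)
    # each super element C_ij is divided by the number of coordinate sets
    # in which both atoms i and j are observed together
    cov = np.dot(d_xyz.T, d_xyz) / np.dot(divide_by.T, divide_by)
    assert cov.shape == (n_atoms * 3, n_atoms * 3)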
Example #21
0
def blastPDB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching *sequence* against the PDB using NCBI blastp.

    :arg sequence: an object with an associated sequence string 
         or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is multiplied by 1.5 when results are not ready.  
    *timeout* (default is 120 s) determines when to give up waiting for the results.
    """

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')

    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()

    elif isinstance(sequence, Sequence):
        sequence = str(sequence)

    elif isinstance(sequence, str):
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())

    else:
        raise TypeError(
            'sequence must be Atomic, Sequence, or str not {0}'.format(
                type(sequence)))

    headers = {'User-agent': 'ProDy'}
    query = [
        ('DATABASE', 'pdb'),
        ('ENTREZ_QUERY', '(none)'),
        ('PROGRAM', 'blastp'),
    ]

    expect = float(kwargs.pop('expect', 1e-10))
    if expect <= 0:
        raise ValueError('expect must be a positive number')
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    if hitlist_size <= 0:
        raise ValueError('hitlist_size must be a positive integer')
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'.format(
        sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b'RID =')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find(b'\n', index)
        rid = html[index + len('RID ='):last].strip()

    index = html.find(b'RTOE =')
    if index == -1:
        rtoe = None  # This is not used
    else:
        last = html.find(b'\n', index)
        rtoe = int(html[index + len('RTOE ='):last].strip())

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect NCBI for search results.')
        LOGGER.write('Connecting to NCBI for search results...')
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b'Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b'\n', index)
        status = results[index + len('Status='):last].strip()
        if status.upper() == b'READY':
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None

    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')

    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        out = open(filename, 'w')
        if PY3K:
            out.write(results.decode())
        else:
            out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

    return PDBBlastRecord(results, sequence)
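
For slow queries, the polling keywords from the docstring can be tuned; a
hedged sketch (``sequence`` is an assumed query string)::

    record = blastPDB(sequence, filename='blast_results.xml',
                      hitlist_size=100, expect=1e-5, sleep=5, timeout=300)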
Example #22
0
def calcPerturbResponse(model, atoms=None, **kwargs):
    """Returns a matrix of profiles from scanning the response of the
    structure to random perturbations at specific atom (or node) positions.
    The function implements the perturbation response scanning (PRS) method
    described in [CA09]_.  Rows of the matrix are the average magnitude of the
    responses obtained by perturbing the atom/node position at that row index,
    i.e. ``prs_profile[i,j]`` will give the response of residue/node *j* to
    perturbations in residue/node *i*.  PRS is performed using the covariance
    matrix from *model*, e.g. :class:`.ANM` instance.

    When an *atoms* instance is given, the PRS matrix will be added as data, 
    which can be retrieved with ``atoms.getData('prs_matrix')``.  

    *model* and *atoms* must have the same number of atoms. *atoms* must be an
    :class:`.AtomGroup` instance. 

    .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning
       Reveals Ligand Entry-Exit Mechanisms of Ferric Binding Protein.
       *PLoS Comput Biol* **2009** 5(10):e1000544.

    """

    if not isinstance(model, (NMA, ModeSet, Mode)):
        raise TypeError('model must be an NMA, ModeSet, or Mode instance')

    if isinstance(model, NMA) and len(model) == 0:
        raise ValueError('model must have normal modes calculated')

    atoms = kwargs.get('atoms', atoms)
    if atoms is not None:
        if isinstance(atoms, Selection):
            atoms = atoms.copy()
        if not isinstance(atoms, AtomGroup):
            raise TypeError('atoms must be an AtomGroup instance')
        elif atoms.numAtoms() != model.numAtoms():
            raise ValueError('model and atoms must have the same number of atoms')

    n_atoms = model.numAtoms()
    LOGGER.timeit('_prody_prs_all')
    LOGGER.info('Calculating covariance matrix')
    LOGGER.timeit('_prody_cov')

    cov = model.getCovariance()

    LOGGER.clear()
    LOGGER.report('Covariance matrix calculated in %.1fs.', '_prody_cov')

    LOGGER.info('Calculating perturbation response')
    LOGGER.timeit('_prody_prs_mat')
    if not model.is3d():
        prs_matrix = cov**2

    else:
        cov_squared = cov**2
        n_by_3n_cov_squared = np.zeros((n_atoms, 3 * n_atoms))
        prs_matrix = np.zeros((n_atoms, n_atoms))
        i3 = -3
        i3p3 = 0
        for i in range(n_atoms):
            i3 += 3
            i3p3 += 3
            n_by_3n_cov_squared[i, :] = (cov_squared[i3:i3p3, :]).sum(0)

        j3 = -3
        j3p3 = 0
        for j in range(n_atoms):
            j3 += 3
            j3p3 += 3
            prs_matrix[:, j] = (n_by_3n_cov_squared[:, j3:j3p3]).sum(1)

    LOGGER.clear()
    LOGGER.report('Perturbation response matrix calculated in %.1fs.',
                  '_prody_prs_mat')

    no_diag = kwargs.get('no_diag', False)
    #filename = kwargs.get('filename', None)

    norm_prs_matrix = np.zeros((n_atoms, n_atoms))
    self_dp = np.diag(prs_matrix)
    self_dp = self_dp.reshape(n_atoms, 1)
    norm_prs_matrix = prs_matrix / np.repeat(self_dp, n_atoms, axis=1)

    effectiveness = np.mean(norm_prs_matrix, axis=1)
    sensitivity = np.mean(norm_prs_matrix, axis=0)

    if no_diag:
        # suppress the diagonal (self displacement) to facilitate
        # visualizing the response profile
        norm_prs_matrix = norm_prs_matrix - np.diag(np.diag(norm_prs_matrix))

    #if filename:
    #    np.savetxt(filename, norm_prs_matrix, delimiter='\t', fmt='%8.6f')

    LOGGER.report('Perturbation response scanning completed in %.1fs.',
                  '_prody_prs_all')

    if atoms is not None:
        try:
            ag = atoms.getAtomGroup()
            defdata = np.zeros(ag.numAtoms(), dtype=float)
            ag.setData('effectiveness', defdata.copy())
            ag.setData('sensitivity', defdata.copy())
        except AttributeError:
            pass
        atoms.setData('effectiveness', effectiveness)
        atoms.setData('sensitivity', sensitivity)

        #atoms.setData('prs_matrix', norm_prs_matrix)

    return norm_prs_matrix, effectiveness, sensitivity
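
A sketch of consuming the three return values (``anm`` and ``calphas`` are
assumed to come from a prior ANM calculation)::

    prs_mat, effectiveness, sensitivity = calcPerturbResponse(anm, atoms=calphas)
    top = effectiveness.argsort()[::-1][:10]   # ten most effective perturbation sites
    print(calphas.getResnums()[top])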
Example #23
0
def writeDCD(filename, trajectory, start=None, stop=None, step=None, 
             align=False):
    """Write 32-bit CHARMM format DCD file (also NAMD 2.1 and later).
    *trajectory* can be a :class:`Trajectory`, :class:`DCDFile`, or
    :class:`Ensemble` instance. *filename* is returned upon successful
    output of file."""
    
    if not isinstance(trajectory, (TrajBase, Ensemble, Atomic)):
        raise TypeError('{0:s} is not a valid type for trajectory'
                        .format(type(trajectory)))
    
    irange = range(*slice(start, stop, 
                          step).indices(trajectory.numCoordsets()))
    n_csets = len(irange)
    if n_csets == 0:
        raise ValueError('trajectory does not have any coordinate sets, or '
                         'no coordinate sets are selected')
    
    if isinstance(trajectory, Atomic):
        isEnsemble = False
        isAtomic = True
        n_atoms = trajectory.numAtoms()
    else:
        isEnsemble = True
        isAtomic = False
        n_atoms = trajectory.numSelected()
    if n_atoms == 0:
        raise ValueError('no atoms are selected in the trajectory')
    if isinstance(trajectory, TrajBase):
        isTrajectory = True
        unitcell = trajectory.hasUnitcell()
        nfi = trajectory.nextIndex() 
        trajectory.reset()
        pack_i_48 = pack('i', 48)
        if isinstance(trajectory, Trajectory):
            timestep = trajectory.getTimestep()[0]
            first_ts = trajectory.getFirstTimestep()[0]
            framefreq = trajectory.getFrameFreq()[0]
            n_fixed = trajectory.numFixed()[0]
        else:
            timestep = trajectory.getTimestep()
            first_ts = trajectory.getFirstTimestep()
            framefreq = trajectory.getFrameFreq()
            n_fixed = trajectory.numFixed()
    else:
        isTrajectory = False
        unitcell = False
        if isinstance(trajectory, Ensemble):
            frame = trajectory[0]
        else:
            frame = trajectory
            acsi = trajectory.getACSIndex()
        timestep = 1
        first_ts = 0
        framefreq = 1
        n_fixed = 0
        
    dcd = DCDFile(filename, mode='w')
    LOGGER.progress('Writing DCD', len(irange), '_prody_writeDCD')
    prev = -1
    uc = None
    time_ = time()
    for j, i in enumerate(irange):
        diff = i - prev
        if diff > 1:
            trajectory.skip(diff-1)
        prev = i
        if isTrajectory:
            frame = trajectory.next()
            if frame is None:
                break
            if unitcell:
                uc = frame._getUnitcell()
                uc[3:] = np.sin((PISQUARE/90) * (90-uc[3:]))
                uc = uc[[0,3,1,4,5,2]]
        elif isEnsemble:
            frame._index = i
        else:
            frame.setACSIndex(i) 
        if align:
            frame.superpose()
        if j == 0:
            dcd.write(frame._getCoords(), uc, timestep=timestep, 
                      firsttimestep=first_ts, framefreq=framefreq)
        else:
            dcd.write(frame._getCoords(), uc)
        LOGGER.update(i, '_prody_writeDCD')
    if isAtomic:
        trajectory.setACSIndex(acsi)
    j += 1
    LOGGER.clear()
    dcd.close()
    time_ = time() - time_ or 0.01
    dcd_size = 1.0 * (56 + (n_atoms * 3 + 6) * 4 ) * n_csets / (1024*1024)
    LOGGER.info('DCD file was written in {0:.2f} seconds.'.format(time_))
    LOGGER.info('{0:.2f} MB written at input rate {1:.2f} MB/s.'
                .format(dcd_size, dcd_size/time_))
    LOGGER.info('{0:d} coordinate sets written at output rate {1:d} frame/s.'
                .format(n_csets, int(n_csets/time_)))
    if j != n_csets:
        LOGGER.warn('Warning: {0:d} frames expected, {1:d} written.'
                    .format(n_csets, j))
    if isTrajectory:
        trajectory.goto(nfi)
    return filename
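
A minimal usage sketch (file names are assumed)::

    traj = Trajectory('md.dcd')      # any TrajBase, Ensemble, or Atomic works
    writeDCD('md_subset.dcd', traj, start=0, stop=100, step=2)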
Example #24
0
File: analysis.py Project: sixpi/ProDy
def calcPerturbResponse(model, atoms=None, repeats=100):
    """Returns a matrix of profiles from scanning of the response of the
    structure to random perturbations at specific atom (or node) positions.
    The function implements the perturbation response scanning (PRS) method
    described in [CA09]_.  Rows of the matrix are the average magnitude of the
    responses obtained by perturbing the atom/node position at that row index,
    i.e. ``prs_profile[i,j]`` will give the response of residue/node *j* to
    perturbations in residue/node *i*.  PRS is performed using the covariance
    matrix from *model*, e.g. :class:`.ANM` instance.  Each residue/node is
    perturbed *repeats* times with a random unit force vector.  When *atoms*
    instance is given, PRS profile for residues will be added as an attribute
    which then can be retrieved as ``atoms.getData('prs_profile')``.  *model*
    and *atoms* must have the same number of atoms. *atoms* must be an
    :class:`.AtomGroup` instance.


    .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning
       Reveals Ligand Entry-Exit Mechanisms of Ferric Binding Protein.
       *PLoS Comput Biol* **2009** 5(10):e1000544.

    The PRS matrix can be saved as follows::

      prs_matrix = calcPerturbResponse(p38_anm)
      writeArray('prs_matrix.txt', prs_matrix, format='%8.6f', delimiter='\t')
    """

    if not isinstance(model, NMA):
        raise TypeError('model must be an NMA instance')
    elif not model.is3d():
        raise TypeError('model must be a 3-dimensional NMA instance')
    elif len(model) == 0:
        raise ValueError('model must have normal modes calculated')
    if atoms is not None:
        if not isinstance(atoms, AtomGroup):
            raise TypeError('atoms must be an AtomGroup instance')
        elif atoms.numAtoms() != model.numAtoms():
            raise ValueError('model and atoms must have the same number of atoms')

    assert isinstance(repeats, int), 'repeats must be an integer'
    cov = calcCovariance(model)
    if cov is None:
        raise ValueError('model did not return a covariance matrix')

    n_atoms = model.numAtoms()
    response_matrix = np.zeros((n_atoms, n_atoms))
    LOGGER.progress('Calculating perturbation response', n_atoms, '_prody_prs')
    i3 = -3
    i3p3 = 0
    for i in range(n_atoms):
        i3 += 3
        i3p3 += 3
        forces = np.random.randn(repeats * 3).reshape((repeats, 3))
        forces /= ((forces**2).sum(1)**0.5).reshape((repeats, 1))
        for force in forces:
            response_matrix[i] += (
                np.dot(cov[:, i3:i3p3], force)
                ** 2).reshape((n_atoms, 3)).sum(1)
        LOGGER.update(i, '_prody_prs')

    response_matrix /= repeats
    LOGGER.clear()
    LOGGER.report('Perturbation response scanning completed in %.1fs.',
                  '_prody_prs')
    if atoms is not None:
        atoms.setData('prs_profile', response_matrix)

    # save the original PRS matrix
    np.savetxt('orig_PRS_matrix', response_matrix, delimiter='\t', fmt='%8.6f')
    # calculate the normalized PRS matrix, using the self displacement
    # (the diagonal of the original matrix) as the normalization factor
    self_dp = np.diag(response_matrix)
    self_dp = self_dp.reshape(n_atoms, 1)
    norm_PRS_mat = response_matrix / np.repeat(self_dp, n_atoms, axis=1)
    # suppress the diagonal (self displacement) to facilitate
    # visualizing the response profile
    norm_PRS_mat = norm_PRS_mat - np.diag(np.diag(norm_PRS_mat))
    np.savetxt('norm_PRS_matrix', norm_PRS_mat, delimiter='\t', fmt='%8.6f')
    return response_matrix
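
The ``randn`` sampling above draws random unit force vectors uniformly over
the sphere; normalized uniform (``rand``) vectors would stay in the positive
octant. A quick, purely illustrative check::

    import numpy as np

    f = np.random.randn(1000, 3)
    f /= np.linalg.norm(f, axis=1, keepdims=True)
    print((f < 0).any(axis=1).mean())   # ~0.875, i.e. only 1/8 of directions
                                        # would be reachable with rand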
Example #25
0
def blastPDB(sequence, filename=None, **kwargs):
    """Return a :class:`PDBBlastRecord` instance that contains results from
    blast searching of ProteinDataBank database *sequence* using NCBI blastp.
        
    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str 
    :arg filename: a *filename* to save the results in XML format 
    :type filename: str
    
    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``) 
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for 
    results.  Sleep time is doubled when results are not ready.  *timeout*
    (default is 20 seconds) determines when to give up waiting for the results.
    """
    
    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')
    elif isinstance(sequence, str):
        sequence = ''.join(sequence.split())
        if not checkSequence(sequence):
            raise ValueError(repr(sequence) + ' is not a valid sequence')
    else:
        raise TypeError('sequence must be a string')

    query = [('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'),
             ('PROGRAM', 'blastp'),] 
    expect = kwargs.pop('expect', 1e-10)
    assert isinstance(expect, (float, int)), 'expect must be a float'
    assert expect > 0, 'expect must be a positive number'
    query.append(('EXPECT', expect))
    hitlist_size = kwargs.pop('hitlist_size', 250)
    assert isinstance(hitlist_size, int), 'hitlist_size must be an integer'
    assert hitlist_size > 0, 'hitlist_size must be a positive integer'
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))
    
    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 20))
    
    if kwargs:
        LOGGER.warning("Keyword argument(s) '{0:s}' are not used."
                       .format("', '".join(kwargs.keys())))

    import urllib, urllib2
    
    url = 'http://blast.ncbi.nlm.nih.gov/Blast.cgi'
    
    data = urllib.urlencode(query)
    LOGGER.timeit()
    LOGGER.info('Blast searching NCBI PDB database for "{0:s}..."'
                .format(sequence[:5]))
    request = urllib2.Request(url, data, {'User-agent': 'ProDy'})
    handle = urllib2.urlopen(request)
    
    html = handle.read()
    index = html.find('RID =')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find('\n', index)
        rid = html[index + len('RID ='):last].strip()

    index = html.find('RTOE =')
    if index == -1:
        rtoe = None # This is not used
    else:
        last = html.find('\n', index)
        rtoe = int(html[index + len('RTOE ='):last].strip())

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500), 
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urllib.urlencode(query)
    
    while True:
        LOGGER.sleep(int(sleep), ' to connect NCBI for search results.')
        LOGGER.write('Connecting NCBI for search results...')
        request = urllib2.Request(url, data, {'User-agent': 'ProDy'})
        handle = urllib2.urlopen(request)
        results = handle.read()
        index = results.find('Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index('\n', index)
        status = results[index+len('Status='):last].strip()
        if status.upper() == 'READY':
            break
        sleep *= 2
        if LOGGER.timing() > timeout:
            LOGGER.warning('Blast search time out.')
            return None
    LOGGER.clear()
    LOGGER.timing('Blast search completed in %.1fs.')
    if isinstance(filename, str):
        if not filename.lower().endswith('.xml'):
            filename += '.xml'
        out = open(filename, 'w')
        out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0:s}.'.format(filename))
    return PDBBlastRecord(results, sequence)
Example #26
0
File: emsurfer.py Project: uibcdf/ProDy
def searchEmsurfer(emd, **kwargs):
    """Search with the EM-Surfer server with input of EMD ID (or local EMD file).
    EM-Surfer server: http://kiharalab.org/em-surfer/
    
    :arg emd: EMD code or local EMD map file for the query protein
    """
    
    import requests
    from requests.models import Request
    
    LOGGER.timeit('_emsurfer')
    # timeout = 120
    timeout = kwargs.pop('timeout', 120)
    
    emsurferURL = "http://kiharalab.org/em-surfer/cgi-bin/listResults.cgi"
    
    volumeFilter = kwargs.get('volumeFilter', 'on')
    representation = kwargs.get('representation','recommend')
    minResolution = kwargs.get('minResolution', 0.5)
    maxResolution = kwargs.get('maxResolution', 30.)

    if isinstance(emd, EMDMAP):
        emdmap = emd
        stream = createStringIO()
        writeEMD(stream, emdmap)
        data = stream.getvalue()
        stream.close()
        files = {"file1" : data}

        emdId = ''  # an empty ID is sent when a map file is uploaded
        emsurfer_title = 'Title_' + emdId
    elif isinstance(emd, str):
        if os.path.isfile(emd):
            emdmap = parseEMD(emd)
            filename = os.path.basename(emd)
            filename, ext = os.path.splitext(filename)
            if ext.lower() == '.gz':
                filename2, ext2 = os.path.splitext(filename)
                if ext2.lower() == '.emd':
                    filename = filename2
            stream = createStringIO()
            writeEMD(stream, emdmap)
            data = stream.getvalue()   # serialize the parsed map for upload
            stream.close()
            files = {"file1": data}
            emdId = ''  # an empty ID is sent when a map file is uploaded
            emsurfer_title = 'Title_' + emdId
        else:
            emdId = emd
            emsurfer_title = 'Title_' + emdId
            files = ''

    method = 'post'
    url = emsurferURL
    params = {'emdbid': emdId, 'volumefilter': volumeFilter,
              'representation': representation,
              'minresolution': minResolution, 'maxresolution': maxResolution}

    # Build the request; unused fields stay None, but the *files* and
    # *timeout* values set above must not be overwritten
    data = None; headers = None; cookies = None
    auth = None; allow_redirects = True; proxies = None
    hooks = None; stream = None; verify = None; cert = None; json = None
    req = Request(
        method=method.upper(),
        url=url,
        headers=headers,
        files=files,
        data=data or {},
        json=json,
        params=params or {},
        auth=auth,
        cookies=cookies,
        hooks=hooks,
    )
    session = requests.sessions.Session()
    prep = session.prepare_request(req)
    resp = session.send(prep)
    url = resp.url

    LOGGER.debug('Submitted Emsurfer search for EMD "{0}".'.format(emdId))
    LOGGER.info(url)
    LOGGER.clear()
    obj = EmsurferRecord(url, emdId, timeout=timeout, **kwargs)
        
    return obj
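
A hedged usage sketch (the EMDB accession is illustrative)::

    result = searchEmsurfer('1960', volumeFilter='on',
                            minResolution=1., maxResolution=20.)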
Example #27
0
def blastPDBUniProtKB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching of ProteinDataBank database *sequence* using NCBI blastp.

    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-4``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is doubled when results are not ready.  *timeout*
    (default is 120s) determines when to give up waiting for the results. 
    *num_sequences (default is ``1``)
    """

    num_sequences = int(kwargs.pop('num_sequences', 1))
    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')
    else:
        if num_sequences == 1:
            try:
                sequence = ''.join(sequence.split())
                _ = sequence.isalpha()
            except AttributeError:
                raise TypeError('sequence must be a string')
            else:
                if not _:
                    raise ValueError('not a valid protein sequence')
                    
    headers = {'User-agent': 'ProDy'}

    query = [('DATABASE', 'swissprot'), ('ENTREZ_QUERY', '(none)'),
             ('PROGRAM', 'blastp'),]
    expect = float(kwargs.pop('expect', 1e-4))
    if expect <= 0:
        raise ValueError('expect must be a positive number')
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    if hitlist_size <= 0:
        raise ValueError('hitlist_size must be a positive integer')
    psiblast = 'true'
    step_number = 3
    query.append(('RUN_PSIBLAST', psiblast))
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))
    query.append(('STEP_NUMBER', step_number))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    if kwargs:
        LOGGER.warn('Keyword argument(s) {0} are not used.'
                    .format(', '.join([repr(key) for key in kwargs])))

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'
                .format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b'name="RID" type="hidden" value="')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    else:
        last = html.find(b'>',index)
        rid = html[index + len('name="RID" type="hidden" value="'):last-1].strip()

    index = html.find(b'name="RTOE" type="hidden" value="')
    if index == -1:
        rtoe = None # This is not used
    else:
        last = html.find(b'>', index)
        rtoe = html[index + len('name="RTOE" type="hidden" value="'):last-1].strip()

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect NCBI for search results.')
        LOGGER.write('Connecting NCBI for search results...')
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b'Status=')
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b'\n', index)
        status = results[index+len('Status='):last].strip()
        if status.upper() == b'READY':
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None
    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')
    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        out = open(filename, 'wb')
        out.write(results)
        out.close()
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))
    return SwissProtBlastRecord(results, sequence)
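
A usage sketch (``sequence`` is an assumed query string)::

    record = blastPDBUniProtKB(sequence, filename='swissprot_hits',
                               hitlist_size=100, expect=1e-3)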
Example #28
0
File: perturb.py Project: fongchun/ProDy
def calcPerturbResponse(model, atoms=None, **kwargs):
    """Returns a matrix of profiles from scanning the response of the
    structure to random perturbations at specific atom (or node) positions.
    The function implements the perturbation response scanning (PRS) method
    described in [CA09]_.  Rows of the matrix are the average magnitude of the
    responses obtained by perturbing the atom/node position at that row index,
    i.e. ``prs_matrix[i,j]`` will give the response of residue/node *j* to
    perturbations in residue/node *i*.  PRS is performed using the covariance
    matrix from *model*, e.g. :class:`.ANM` instance.

    When an *atoms* instance is given, the effectiveness and sensitivity
    profiles will be added as data, which can be retrieved with
    ``atoms.getData('effectiveness')`` and ``atoms.getData('sensitivity')``.

    *model* and *atoms* must have the same number of atoms. *atoms* must be an
    :class:`.AtomGroup` instance. 

    .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning
       Reveals Ligand Entry-Exit Mechanisms of Ferric Binding Protein.
       *PLoS Comput Biol* **2009** 5(10):e1000544.

    """

    if not isinstance(model, (NMA, ModeSet, Mode)):
        raise TypeError('model must be an NMA, ModeSet, or Mode instance')

    if isinstance(model, NMA) and len(model) == 0:
        raise ValueError('model must have normal modes calculated')

    if atoms is not None:
        if isinstance(atoms, Selection):
            atoms = atoms.copy()
        if not isinstance(atoms, AtomGroup):
            raise TypeError('atoms must be an AtomGroup instance')
        elif atoms.numAtoms() != model.numAtoms():
            raise ValueError('model and atoms must have the same number of atoms')

    n_atoms = model.numAtoms()
    LOGGER.timeit('_prody_prs_all')
    LOGGER.info('Calculating covariance matrix')
    LOGGER.timeit('_prody_cov')

    cov = model.getCovariance()

    LOGGER.clear()
    LOGGER.report('Covariance matrix calculated in %.1fs.', '_prody_cov')

    LOGGER.info('Calculating perturbation response')
    LOGGER.timeit('_prody_prs_mat')
    if not model.is3d():
        prs_matrix = cov**2

    else:
        cov_squared = cov**2
        n_by_3n_cov_squared = np.zeros((n_atoms, 3 * n_atoms))
        prs_matrix = np.zeros((n_atoms, n_atoms))
        i3 = -3
        i3p3 = 0
        for i in range(n_atoms):
            i3 += 3
            i3p3 += 3
            n_by_3n_cov_squared[i,:] = (cov_squared[i3:i3p3,:]).sum(0)

        j3 = -3
        j3p3 = 0
        for j in range(n_atoms):
            j3 += 3
            j3p3 += 3                
            prs_matrix[:,j] = (n_by_3n_cov_squared[:,j3:j3p3]).sum(1)

    LOGGER.clear()
    LOGGER.report('Perturbation response matrix calculated in %.1fs.',
                  '_prody_prs_mat')

    no_diag = kwargs.get('no_diag', True)
    #filename = kwargs.get('filename', None)

    self_dp = np.diag(prs_matrix).reshape(n_atoms, 1)
    norm_prs_matrix = prs_matrix / np.repeat(self_dp, n_atoms, axis=1)

    if no_diag:
        # suppress the diagonal (self displacement) to facilitate
        # visualizing the response profile
        norm_prs_matrix = norm_prs_matrix - np.diag(np.diag(norm_prs_matrix))

    W = 1 - np.eye(n_atoms)
    effectiveness = np.average(norm_prs_matrix, weights=W, axis=1)
    sensitivity = np.average(norm_prs_matrix, weights=W, axis=0)

    #if filename:
    #    np.savetxt(filename, norm_prs_matrix, delimiter='\t', fmt='%8.6f')

    LOGGER.report('Perturbation response scanning completed in %.1fs.',
                  '_prody_prs_all')

    if atoms is not None:
        try:
            ag = atoms.getAtomGroup()
            defdata = np.zeros(ag.numAtoms(), dtype=float)
            ag.setData('effectiveness', defdata.copy())
            ag.setData('sensitivity', defdata.copy())
        except AttributeError:
            pass
        atoms.setData('effectiveness', effectiveness)
        atoms.setData('sensitivity', sensitivity)

        #atoms.setData('prs_matrix', norm_prs_matrix)

    return norm_prs_matrix, effectiveness, sensitivity
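A hedged usage sketch for this version of calcPerturbResponse, assuming the standard ProDy classes; '1ubi' is an illustrative PDB identifier:

from prody import parsePDB, ANM

ag = parsePDB('1ubi')             # illustrative PDB id
calphas = ag.select('calpha')

anm = ANM('1ubi_anm')             # build and solve an ANM model
anm.buildHessian(calphas)
anm.calcModes()

# rows index the perturbed residue, columns the responding residue
prs, effectiveness, sensitivity = calcPerturbResponse(anm, atoms=calphas)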
Example #29
File: perturb.py Project: SHZ66/ProDy
def calcPerturbResponse(model, **kwargs):
    """This function implements the perturbation response scanning (PRS) method
    described in [CA09]_ and [IG14]_. It returns a PRS matrix, and effectiveness 
    and sensitivity profiles.
    
    Rows of the matrix are the average magnitude of the responses obtained by 
    perturbing the atom/node position at that row index, i.e. ``prs_matrix[i,j]`` 
    will give the response of residue/node *j* to perturbations in residue/node *i*. 
    
    PRS is performed using the covariance matrix from a *model*, e.g.
    an :class:`.ANM` instance. To use an external matrix, provide it to
    a :class:`.PCA` instance using the :meth:`.PCA.setCovariance` method.

    When an *atoms* instance is given, the effectiveness and sensitivity
    profiles will be added as data, which can be retrieved with
    ``atoms.getData('effectiveness')`` and ``atoms.getData('sensitivity')``.

    *model* and *atoms* must have the same number of atoms. *atoms* must be an
    :class:`.AtomGroup` instance. 

    .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning
       Reveals Ligand Entry-Exit Mechanisms of Ferric Binding Protein.
       *PLoS Comput Biol* **2009** 5(10):e1000544.

    .. [IG14] General IJ, Liu Y, Blackburn ME, Mao W, Gierasch LM, Bahar I.
        ATPase subdomain IA is a mediator of interdomain allostery in Hsp70
        molecular chaperones. *PLoS Comput. Biol.* **2014** 10:e1003624.

    If *turbo* is **True** (default), then PRS is approximated by the limit of 
    large numbers of forces and no perturbation forces are explicitly applied. 
    If set to **False**, then each residue/node is perturbed *repeats* times (default 100) 
    with a random unit force vector as in ProDy v1.8 and earlier.
    """

    if not isinstance(model, (NMA, ModeSet, Mode)):
        raise TypeError('model must be an NMA, ModeSet, or Mode instance')

    if isinstance(model, NMA) and len(model) == 0:
        raise ValueError('model must have normal modes calculated')

    atoms = kwargs.get('atoms', None)
    suppress_diag = kwargs.get('suppress_diag', False)
    no_diag = kwargs.get('no_diag', suppress_diag)

    if atoms is not None:
        if isinstance(atoms, Selection):
            atoms = atoms.copy()
        if not isinstance(atoms, AtomGroup):
            raise TypeError('atoms must be an AtomGroup instance')
        elif atoms.numAtoms() != model.numAtoms():
            raise ValueError('model and atoms must have the same number of atoms')

    n_atoms = model.numAtoms()
    # LOGGER.timeit('_prody_prs_all')
    # LOGGER.info('Calculating covariance matrix')
    # LOGGER.timeit('_prody_cov')

    cov = model.getCovariance()

    turbo = kwargs.get('turbo', True)
    if turbo:
        if not model.is3d():
            prs_matrix = cov**2

        else:
            cov_squared = cov**2
            n_by_3n_cov_squared = np.zeros((n_atoms, 3 * n_atoms))
            prs_matrix = np.zeros((n_atoms, n_atoms))
            i3 = -3
            i3p3 = 0
            for i in range(n_atoms):
                i3 += 3
                i3p3 += 3
                n_by_3n_cov_squared[i, :] = (cov_squared[i3:i3p3, :]).sum(0)

            j3 = -3
            j3p3 = 0
            for j in range(n_atoms):
                j3 += 3
                j3p3 += 3
                prs_matrix[:, j] = (n_by_3n_cov_squared[:, j3:j3p3]).sum(1)
    else:
        repeats = kwargs.pop('repeats', 100)
        LOGGER.info(
            'Calculating perturbation response with {0} repeats'.format(
                repeats))
        LOGGER.timeit('_prody_prs_mat')

        response_matrix = np.zeros((n_atoms, n_atoms))
        LOGGER.progress('Calculating perturbation response', n_atoms,
                        '_prody_prs')
        i3 = -3
        i3p3 = 0
        for i in range(n_atoms):
            i3 += 3
            i3p3 += 3
            forces = np.random.rand(repeats * 3).reshape((repeats, 3))
            forces /= ((forces**2).sum(1)**0.5).reshape((repeats, 1))
            for force in forces:
                response_matrix[i] += (np.dot(cov[:, i3:i3p3],
                                              force)**2).reshape(
                                                  (n_atoms, 3)).sum(1)
            LOGGER.update(i, '_prody_prs')

        response_matrix /= repeats
        prs_matrix = response_matrix

        LOGGER.clear()
        LOGGER.report('Perturbation response matrix calculated in %.1fs.',
                      '_prody_prs_mat')

    self_dp = np.diag(prs_matrix).reshape(n_atoms, 1)
    re_self_dp = np.repeat(self_dp, n_atoms, axis=1)
    norm_prs_matrix = div0(prs_matrix, re_self_dp)

    if no_diag:
        # suppress the diagonal (self displacement) to facilitate
        # visualizing the response profile
        norm_prs_matrix = norm_prs_matrix - np.diag(np.diag(norm_prs_matrix))

    W = 1 - np.eye(n_atoms)
    effectiveness = np.average(norm_prs_matrix, weights=W, axis=1)
    sensitivity = np.average(norm_prs_matrix, weights=W, axis=0)

    # LOGGER.report('Perturbation response scanning completed in %.1fs.',
    #               '_prody_prs_all')

    if atoms is not None:
        try:
            ag = atoms.getAtomGroup()
            defdata = np.zeros(ag.numAtoms(), dtype=float)
            ag.setData('effectiveness', defdata.copy())
            ag.setData('sensitivity', defdata.copy())
        except AttributeError:
            pass
        atoms.setData('effectiveness', effectiveness)
        atoms.setData('sensitivity', sensitivity)

        #atoms.setData('prs_matrix', norm_prs_matrix)

    return norm_prs_matrix, effectiveness, sensitivity
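This version normalizes the matrix with a ``div0`` helper that is not reproduced in this listing; presumably it is a zero-safe elementwise division. A minimal reimplementation under that assumption (the actual ProDy utility may differ):

import numpy as np

def div0(a, b):
    """Elementwise a / b that returns 0 wherever b is 0 (hedged sketch)."""
    with np.errstate(divide='ignore', invalid='ignore'):
        out = np.true_divide(a, b)
    out[~np.isfinite(out)] = 0.0
    return out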
Example #30
def writeDCD(filename,
             trajectory,
             start=None,
             stop=None,
             step=None,
             align=False):
    """Write 32-bit CHARMM format DCD file (also NAMD 2.1 and later).
    *trajectory can be an :class:`Trajectory`, :class:`DCDFile`, or
    :class:`Ensemble` instance. *filename* is returned upon successful
    output of file."""

    if not isinstance(trajectory, (TrajBase, Ensemble, Atomic)):
        raise TypeError('{0} is not a valid type for trajectory'.format(
            type(trajectory)))

    irange = list(
        range(*slice(start, stop, step).indices(trajectory.numCoordsets())))
    n_csets = len(irange)
    if n_csets == 0:
        raise ValueError('trajectory does not have any coordinate sets, or '
                         'no coordinate sets are selected')

    if isinstance(trajectory, Atomic):
        isEnsemble = False
        isAtomic = True
        n_atoms = trajectory.numAtoms()
    else:
        isEnsemble = True
        isAtomic = False
        n_atoms = trajectory.numSelected()
    if n_atoms == 0:
        raise ValueError('no atoms are selected in the trajectory')
    if isinstance(trajectory, TrajBase):
        isTrajectory = True
        unitcell = trajectory.hasUnitcell()
        nfi = trajectory.nextIndex()
        trajectory.reset()
        pack_i_48 = pack('i', 48)
        if isinstance(trajectory, Trajectory):
            timestep = trajectory.getTimestep()[0]
            first_ts = trajectory.getFirstTimestep()[0]
            framefreq = trajectory.getFrameFreq()[0]
            n_fixed = trajectory.numFixed()[0]
        else:
            timestep = trajectory.getTimestep()
            first_ts = trajectory.getFirstTimestep()
            framefreq = trajectory.getFrameFreq()
            n_fixed = trajectory.numFixed()
    else:
        isTrajectory = False
        unitcell = False
        if isinstance(trajectory, Ensemble):
            frame = trajectory[0]
        else:
            frame = trajectory
            acsi = trajectory.getACSIndex()
        timestep = 1
        first_ts = 0
        framefreq = 1
        n_fixed = 0

    dcd = DCDFile(filename, mode='w')
    LOGGER.progress('Writing DCD', len(irange), '_prody_writeDCD')
    prev = -1
    uc = None
    time_ = time()
    for j, i in enumerate(irange):
        diff = i - prev
        if diff > 1:
            trajectory.skip(diff - 1)
        prev = i
        if isTrajectory:
            frame = next(trajectory)
            if frame is None:
                break
            if unitcell:
                uc = frame._getUnitcell()
                uc[3:] = np.sin((PISQUARE / 90) * (90 - uc[3:]))
                uc = uc[[0, 3, 1, 4, 5, 2]]
        elif isEnsemble:
            frame._index = i
        else:
            frame.setACSIndex(i)
        if align:
            frame.superpose()
        if j == 0:
            dcd.write(frame._getCoords(),
                      uc,
                      timestep=timestep,
                      firsttimestep=first_ts,
                      framefreq=framefreq)
        else:
            dcd.write(frame._getCoords(), uc)
        LOGGER.update(i, '_prody_writeDCD')
    if isAtomic:
        trajectory.setACSIndex(acsi)
    j += 1
    LOGGER.clear()
    dcd.close()
    time_ = time() - time_ or 0.01
    dcd_size = 1.0 * (56 + (n_atoms * 3 + 6) * 4) * n_csets / (1024 * 1024)
    LOGGER.info('DCD file was written in {0:.2f} seconds.'.format(time_))
    LOGGER.info('{0:.2f} MB written at input rate {1:.2f} MB/s.'.format(
        dcd_size, dcd_size / time_))
    LOGGER.info(
        '{0} coordinate sets written at output rate {1} frame/s.'.format(
            n_csets, int(n_csets / time_)))
    if j != n_csets:
        LOGGER.warn('Warning: {0} frames expected, {1} written.'.format(
            n_csets, j))
    if isTrajectory:
        trajectory.goto(nfi)
    return filename
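A hedged usage sketch; '2k39' is an NMR entry with multiple models, so the parsed AtomGroup carries several coordinate sets:

from prody import parsePDB

ag = parsePDB('2k39')                      # multi-model NMR entry
writeDCD('all_models.dcd', ag)             # write every coordinate set
writeDCD('subset.dcd', ag, start=0, stop=10, step=2)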
Example #31
def psiBlastCycle(sequence=None, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    a single cycle of EBI psiblast.

    :arg sequence: an object with an associated sequence string 
         or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    The following search parameters can be adjusted by the user.
    We use the same default values as 
    http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/
    wherever applicable.

    :arg email: email address for reporting problems
        default is [email protected]
    :type email: str with an @ before a .

    :arg matrix: The comparison matrix to be used to score alignments when searching the database
        possible values are 'BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30' and 'PAM70' 
        default is 'BLOSUM62'
    :type matrix: str

    :arg gapopen: Penalty taken away from the score when a gap is created in sequence alignments. 
        Increasing the gap opening penalty will decrease the number of gaps in the final alignment.
        Possible values range from 8 to 16 inclusive, default is 11
    :type gapopen: int

    :arg gapext: Penalty taken away from the score for each base or residue in the gap. 
        Increasing the gap extension penalty favors short gaps in the final alignment, 
        conversely, decreasing the gap extension penalty favors long gaps in the final alignment.
        Possible values range from 0 to 3, default is 1
    :type gapext: int

    :arg expthr: Expectation threshold that limits the number of scores and alignments reported. 
        This is the maximum number of times the match is expected to occur by chance.
        Possible values are 1.0e-200, 1.0e-100, 1.0e-50, 1.0e-10, 1.0e-5, 1.0e-4, 1.0e-3,
        1.0e-2, 0.1, 1.0, 10.0, 100, 1000
        default is 10.0
    :type expthr: float

    :arg psithr: Expectation value threshold for automatic selection of matched sequences for 
        inclusion in the PSSM at each iteration.
        Possible values are 1.0e-6, 1.0e-5, 1.0e-4, 2.0e-4, 5.0e-4, 1.0e-3, 2.0e-3, 5.0e-3,
        1.0e-2, 2.0e-2, 0.1, 0.3, 0.5, 1.0, 3.0, 10.0
        default is 1.0e-3
    :type psithr: float

    :arg scores: Maximum number of match score summaries reported in the result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type scores: int

    :arg alignments: Maximum number of match alignments reported in the result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type alignments: int

    :arg dropoff: The amount a score can drop before extension of word hits is halted
        Possible values are 0, 2, 4, 6, 8, 10, 15, 20, 25, or 30
        Default is 15
    :type dropoff: int

    :arg finaldropoff: Dropoff value for final gapped alignment
        Possible values are 10, 12, 14, 16, 18, 20, 22, 24, 25, 26, 28, or 30
        Default is 25
    :type finaldropoff: int

    :arg filter: Filter regions of low sequence complexity. This can avoid issues with 
        low complexity sequences where matches are found due to composition rather than 
        meaningful sequence similarity. However, in some cases filtering also masks 
        regions of interest and so should be used with caution.
        Possible values are T and F, default is F
    :type filter: str

    :arg seqrange: Specify a range or section of the input sequence to use in the search.
        Example: Specifying '34-89' in an input sequence of total length 100, will tell BLAST 
        to only use residues 34 to 89, inclusive.
    :type seqrange: str of form START-END

    :arg database: a database name from those available. See
        http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/database
        default is pdb
    :type database: str

    :arg previousjobid: The job identifier for the previous PSI-BLAST iteration. 
        default is None
        You can change this if you want to continue from a previous run
    :type previousjobid: str

    :arg selectedHits: Name of a file containing a list of identifiers of the 
        hits from the previous iteration to use to construct the search PSSM 
        for this iteration.
        default is None
    :type selectedHits: str

    :arg cpfile: Name of a Checkpoint file from the previous iteration. 
        default is None
    :type cpfile: str

    :arg sleep: how long to wait to reconnect for status
         Sleep time is multiplied by 1.5 when results are not ready.
         default is 2 seconds
    :type sleep: float

    :arg timeout:  when to give up waiting for the results 
        default is 120 seconds
    :type timeout: float

    :arg cycle: cycle number
    :type cycle: int

    """
    cycle = kwargs.get('cycle',0)

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')

    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()

    elif isinstance(sequence, Sequence):
        sequence = str(sequence)

    elif isinstance(sequence, str):
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())

    elif sequence is None:
        if cycle == 0: 
            cycle = 1
    else:
        raise TypeError('sequence must be Atomic, Sequence, or str not {0}'
                        .format(type(sequence)))

    if cycle == 0:
        query = [('sequence', sequence)]
    else:
        query = []

    email = kwargs.get('email','*****@*****.**')
    if not isinstance(email, str):
        raise TypeError('email must be a string')
    elif email.find('@') == -1 or email.find('.') == -1 or len(email.split('@')) != 2:
        raise ValueError('email must be a valid email address with at least one . and exactly one @ sign')
    elif not email.find('@') < email.find(email.split('.')[-1]):
        raise ValueError('email must be a valid email address with a . after the @ sign')
    query.append(('email', email))
    query.append(('title', 'ProDy psiBlastPDB request'))

    previousjobid = kwargs.get('previousjobid','')
    if previousjobid != '':
        query.append(('previousjobid',previousjobid))

    selectedHits = kwargs.get('selectedHits','')
    if selectedHits != '':
        query.append(('selectedHits',selectedHits))

    database = kwargs.get('database','pdb')
    checkPsiBlastParameter('database', database)
    query.append(('database',database))

    matrix = kwargs.get('matrix', 'BLOSUM62')
    checkPsiBlastParameter('matrix', matrix)
    query.append(('matrix',matrix))

    gapopen = kwargs.get('gapopen',11)
    checkPsiBlastParameter('gapopen', gapopen)
    query.append(('gapopen',gapopen))

    gapext = kwargs.get('gapext',1)
    checkPsiBlastParameter('gapext', gapext)
    query.append(('gapext',gapext))

    expthr = kwargs.get('expthr', 10.)
    checkPsiBlastParameter('expthr', expthr)
    query.append(('expthr',expthr))
    
    psithr = kwargs.get('psithr',1.0e-3)
    checkPsiBlastParameter('psithr', psithr)
    query.append(('psithr',psithr))

    scores = kwargs.get('scores',500)
    checkPsiBlastParameter('scores', scores)
    query.append(('scores',scores))

    alignments = kwargs.get('alignments',500)
    checkPsiBlastParameter('alignments', alignments)
    query.append(('alignments',alignments))
    
    query.append(('alignView',0))
                    
    dropoff = kwargs.get('dropoff',15)
    checkPsiBlastParameter('dropoff', dropoff)
    query.append(('dropoff',dropoff))
        
    finaldropoff = kwargs.get('finaldropoff',25)
    checkPsiBlastParameter('finaldropoff', finaldropoff)
    query.append(('finaldropoff',finaldropoff))
        
    filter = kwargs.get('filter','F')
    checkPsiBlastParameter('filter', filter)
    query.append(('filter',filter))
    
    if previousjobid == '' and selectedHits == '':
        seqrange = kwargs.get('seqrange', None)
        if seqrange is None:
            seqrange = '0-' + str(len(sequence))
        elif not isinstance(seqrange, str):
            raise TypeError('seqrange should be a string')
        elif len(seqrange.split('-')) != 2:
            raise ValueError('seqrange should take the form START-END')
        try:
            start = int(seqrange.split('-')[0])
            end = int(seqrange.split('-')[1])
        except ValueError:
            raise ValueError('seqrange should be START-END with START and END being integers')
        query.append(('seqrange',seqrange))
        
    headers = {'User-Agent': 'ProDy'}
    
    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))
    
    data = urlencode(query)

    # submit the job
    base_url = 'http://www.ebi.ac.uk/Tools/services/rest/psiblast/'
    url = base_url + 'run/'
    LOGGER.timeit('_prody_psi-blast')
    if cycle == 0:
        LOGGER.info('PSI-Blast searching PDB database for "{0}..."'
                    .format(sequence[:5]))
    else:
        LOGGER.info('PSI-Blast searching PDB database, cycle={0}'
                    .format(cycle))

    handle = openURL(url, data=data, headers=headers)
    job_id = handle.read()
    if PY3K:
        job_id = job_id.decode()
    handle.close()

    # check the status
    url = base_url + 'status/' + job_id
    handle = openURL(url)
    status = handle.read()
    if PY3K:
        status = status.decode()
    handle.close()
                    
    # keep checking the status until it's no longer running
    while status == 'RUNNING':
        LOGGER.sleep(int(sleep), 'to reconnect to EBI for status.')
        LOGGER.write('Connecting to EBI for status...')
        handle = openURL(url)
        status = handle.read()
        if PY3K:
            status = status.decode()
        LOGGER.clear()
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_psi-blast') > timeout:
            LOGGER.warn('PSI-Blast search time out.')
            return None

    LOGGER.info('The status is {0}'.format(status))
    LOGGER.clear()
    LOGGER.report('PSI-Blast search completed in %.1fs.', '_prody_psi-blast')
 
    if cycle != 1:
        # get the results
        url = base_url + 'result/' + job_id + '/xml'
        handle = openURL(url)
        results = handle.read()
        handle.close()
        
        try:
            ext_xml = filename.lower().endswith('.xml')
        except AttributeError:
            pass
        else:
            if not ext_xml:
                filename += '.xml'
            f_out = open(filename, 'w')
            if PY3K:  # results is bytes under Python 3
                f_out.write(results.decode())
            else:
                f_out.write(results)
            f_out.close()
            LOGGER.info('Results are saved as {0}.'.format(repr(filename)))
        
        return job_id, PsiBlastRecord(results, sequence)
    else:
        return job_id
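A hedged sketch of chaining cycles with this function; 'runexample' is the built-in demo sequence shown above:

# cycle 0: submit a sequence and parse the results
job_id, record = psiBlastCycle('runexample', filename='cycle0.xml')

# a later cycle: continue from the previous job id; with only
# previousjobid given, the function runs as cycle 1 and returns
# just the new job id without fetching results
next_job_id = psiBlastCycle(previousjobid=job_id)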
Example #32
File: blastpdb.py Project: uibcdf/ProDy
    def fetch(self, xml=None, sequence=None, **kwargs):
        """Get Blast record from url or file.

        :arg sequence: an object with an associated sequence string 
            or a sequence string itself
        :type sequence: :class:`Atomic`, :class:`Sequence`, or str

        :arg xml: blast search results in XML format or an XML file that
            contains the results or a filename for saving the results or None
        :type xml: str

        :arg timeout: amount of time until the query times out in seconds
            default value is 120
        :type timeout: int
        """
        if self.isSuccess:
            LOGGER.warn(
                "The record already exists so no further search is performed")
            return True

        if sequence is None:
            sequence = self._sequence

        if xml is None:
            xml = self._xml

        import xml.etree.ElementTree as ET
        if xml is not None and len(xml) < 100:
            if os.path.isfile(xml):
                xml = ET.parse(xml)
                root = xml.getroot()
            else:
                raise ValueError('xml is not a filename and does not look like'
                                 ' a valid XML string')
        else:

            headers = {'User-agent': 'ProDy'}
            query = [
                ('DATABASE', 'pdb'),
                ('ENTREZ_QUERY', '(none)'),
                ('PROGRAM', 'blastp'),
            ]

            expect = float(kwargs.pop('expect', 10e-10))
            if expect <= 0:
                raise ValueError('expect must be a positive number')
            query.append(('EXPECT', expect))
            hitlist_size = int(kwargs.pop('hitlist_size', 250))
            if hitlist_size <= 0:
                raise ValueError('hitlist_size must be a positive integer')
            query.append(('HITLIST_SIZE', hitlist_size))
            query.append(('QUERY', sequence))
            query.append(('CMD', 'Put'))

            sleep = float(kwargs.pop('sleep', 2))
            timeout = float(kwargs.pop('timeout', self._timeout))
            self._timeout = timeout

            try:
                import urllib.parse
                urlencode = lambda data: bytes(urllib.parse.urlencode(data),
                                               'utf-8')
            except ImportError:
                from urllib import urlencode

            url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

            data = urlencode(query)
            LOGGER.timeit('_prody_blast')
            LOGGER.info(
                'Blast searching NCBI PDB database for "{0}..."'.format(
                    sequence[:5]))
            handle = openURL(url, data=data, headers=headers)

            html = handle.read()
            index = html.find(b'RID =')
            if index == -1:
                raise Exception('NCBI did not return expected response.')
            else:
                last = html.find(b'\n', index)
                rid = html[index + len('RID ='):last].strip()

            query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
                     ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
            data = urlencode(query)

            while True:
                LOGGER.sleep(int(sleep),
                             'to reconnect to NCBI for search results.')
                LOGGER.write('Connecting to NCBI for search results...')
                handle = openURL(url, data=data, headers=headers)
                results = handle.read()
                index = results.find(b'Status=')
                LOGGER.clear()
                if index < 0:
                    break
                last = results.index(b'\n', index)
                status = results[index + len('Status='):last].strip()
                if status.upper() == b'READY':
                    break
                sleep = int(sleep * 1.5)
                if LOGGER.timing('_prody_blast') > timeout:
                    LOGGER.warn('Blast search time out.')
                    return False

            LOGGER.clear()
            LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')

            filename = xml
            root = ET.XML(results)
            try:
                ext_xml = filename.lower().endswith('.xml')
            except AttributeError:
                pass
            else:
                if not ext_xml:
                    filename += '.xml'
                out = open(filename, 'w')
                if PY3K:
                    out.write(results.decode())
                else:
                    out.write(results)
                out.close()
                LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

            root = dictElement(root, 'BlastOutput_')
            if root['db'] != 'pdb':
                raise ValueError('blast search database in xml must be "pdb"')
            if root['program'] != 'blastp':
                raise ValueError(
                    'blast search program in xml must be "blastp"')
            self._param = dictElement(root['param'][0], 'Parameters_')

            query_len = int(root['query-len'])
            if sequence and len(sequence) != query_len:
                raise ValueError(
                    'query-len and the length of the sequence do not '
                    'match, xml data may not be for given sequence')
            hits = []
            for iteration in root['iterations']:
                for hit in dictElement(iteration, 'Iteration_')['hits']:
                    hit = dictElement(hit, 'Hit_')
                    data = dictElement(hit['hsps'][0], 'Hsp_')
                    for key in [
                            'align-len', 'gaps', 'hit-frame', 'hit-from',
                            'hit-to', 'identity', 'positive', 'query-frame',
                            'query-from', 'query-to'
                    ]:
                        data[key] = int(data[key])
                    data['query-len'] = query_len
                    for key in ['evalue', 'bit-score', 'score']:
                        data[key] = float(data[key])
                    p_identity = 100.0 * data['identity'] / (
                        data['query-to'] - data['query-from'] + 1)
                    data['percent_identity'] = p_identity
                    p_overlap = (100.0 * (data['align-len'] - data['gaps']) /
                                 query_len)
                    data['percent_coverage'] = p_overlap

                    for item in (hit['id'] + hit['def']).split('>gi'):
                        head, title = item.split(None, 1)
                        head = head.split('|')
                        pdb_id = head[-2].lower()
                        chain_id = head[-1][:1]
                        pdbch = dict(data)
                        pdbch['pdb_id'] = pdb_id
                        pdbch['chain_id'] = chain_id
                        pdbch['title'] = (head[-1][1:] + title).strip()
                        hits.append((p_identity, p_overlap, pdbch))
            hits.sort(key=lambda hit: hit[0], reverse=True)
            self._hits = hits

        return True
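``dictElement`` comes from ProDy's utilities and is not reproduced here. A rough sketch under the assumption that childless elements map to their text and the rest map to the element itself (which stays indexable, as in root['param'][0] above):

def dictElement(element, prefix=None):
    """Hedged sketch: map an ElementTree element's children to a dict
    keyed by tag, stripping a leading *prefix* from each tag."""
    result = {}
    for child in element:
        tag = child.tag
        if prefix and tag.startswith(prefix):
            tag = tag[len(prefix):]
        result[tag] = child.text if len(child) == 0 else child
    return result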
Example #33
def psiBlastCycle(sequence=None, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    a single cycle of EBI psiblast.

    :arg sequence: an object with an associated sequence string 
         or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    The following search parameters can be adjusted by the user.
    We use the same default values as 
    http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/
    wherever applicable.

    :arg email: email address for reporting problems
        default is [email protected]
    :type email: str with an @ before a .

    :arg matrix: The comparison matrix to be used to score alignments when searching the database
        possible values are 'BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30' and 'PAM70' 
        default is 'BLOSUM62'
    :type matrix: str

    :arg gapopen: Penalty taken away from the score when a gap is created in sequence alignments. 
        Increasing the gap opening penalty will decrease the number of gaps in the final alignment.
        Possible values range from 8 to 16 inclusive, default is 11
    :type gapopen: int

    :arg gapext: Penalty taken away from the score for each base or residue in the gap. 
        Increasing the gap extension penalty favors short gaps in the final alignment, 
        conversely, decreasing the gap extension penalty favors long gaps in the final alignment. 
        Possible values range from 0 to 3, default is 1
    :type gapext: int

    :arg expthr: Expectation threshold that limits the number of scores and alignments reported. 
        This is the maximum number of times the match is expected to occur by chance.
        Possible values are 1.0e-200, 1.0e-100, 1.0e-50, 1.0e-10, 1.0e-5, 1.0e-4, 1.0e-3,
        1.0e-2, 0.1, 1.0, 10.0, 100, 1000
        default is 10.0
    :type expthr: float

    :arg psithr: Expectation value threshold for automatic selection of matched sequences for 
        inclusion in the PSSM at each iteration.
        Possible values are 1.0e-6, 1.0e-5, 1.0e-4, 2.0e-4, 5.0e-4, 1.0e-3, 2.0e-3, 5.0e-3,
        1.0e-2, 2.0e-2, 0.1, 0.3, 0.5, 1.0, 3.0, 10.0
        default is 1.0e-3
    :type psithr: float

    :arg scores: Maximum number of match score summaries reported in the result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type scores: int

    :arg alignments: Maximum number of match alignments reported in the result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type alignments: int

    :arg dropoff: The amount a score can drop before extension of word hits is halted
        Possible values are 0, 2, 4, 6, 8, 10, 15, 20, 25, or 30
        Default is 15
    :type dropoff: int

    :arg finaldropoff: Dropoff value for final gapped alignment
        Possible values are 10, 12, 14, 16, 18, 20, 22, 24, 25, 26, 28, or 30
        Default is 25
    :type finaldropoff: int

    :arg filter: Filter regions of low sequence complexity. This can avoid issues with 
        low complexity sequences where matches are found due to composition rather than 
        meaningful sequence similarity. However, in some cases filtering also masks 
        regions of interest and so should be used with caution.
        Possible values are T and F, default is F
    :type filter: str

    :arg seqrange: Specify a range or section of the input sequence to use in the search.
        Example: Specifying '34-89' in an input sequence of total length 100, will tell BLAST 
        to only use residues 34 to 89, inclusive.
    :type seqrange: str of form START-END

    :arg database: a database name from those available. See
        http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/database
        default is pdb
    :type database: str

    :arg previousjobid: The job identifier for the previous PSI-BLAST iteration. 
        default is None
        You can change this if you want to continue from a previous run
    :type previousjobid: str

    :arg selectedHits: Name of a file containing a list of identifiers of the 
        hits from the previous iteration to use to construct the search PSSM 
        for this iteration.
        default is None
    :type selectedHits: str

    :arg cpfile: Name of a Checkpoint file from the previous iteration. 
        default is None
    :type cpfile: str

    :arg sleep: how long to wait to reconnect for status
         Sleep time is multiplied by 1.5 when results are not ready.
         default is 2 seconds
    :type sleep: float

    :arg timeout:  when to give up waiting for the results 
        default is 120 seconds
    :type timeout: float

    :arg cycle: cycle number
    :type cycle: int

    """
    cycle = kwargs.get('cycle', 0)

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')

    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()

    elif isinstance(sequence, Sequence):
        sequence = str(sequence)

    elif isinstance(sequence, str):
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())

    elif sequence is None:
        if cycle == 0:
            cycle = 1
    else:
        raise TypeError(
            'sequence must be Atomic, Sequence, or str not {0}'.format(
                type(sequence)))

    if cycle == 0:
        query = [('sequence', sequence)]
    else:
        query = []

    email = kwargs.get('email', '*****@*****.**')
    if not isinstance(email, str):
        raise TypeError('email must be a string')
    elif email.find('@') == -1 or email.find('.') == -1 or len(
            email.split('@')) != 2:
        raise ValueError(
            'email must be a valid email address with at least one . and exactly one @ sign'
        )
    elif not email.find('@') < email.find(email.split('.')[-1]):
        raise ValueError(
            'email must be a valid email address with a . after the @ sign')
    query.append(('email', email))
    query.append(('title', 'ProDy psiBlastPDB request'))

    previousjobid = kwargs.get('previousjobid', '')
    if previousjobid != '':
        query.append(('previousjobid', previousjobid))

    selectedHits = kwargs.get('selectedHits', '')
    if selectedHits != '':
        query.append(('selectedHits', selectedHits))

    database = kwargs.get('database', 'pdb')
    checkPsiBlastParameter('database', database)
    query.append(('database', database))

    matrix = kwargs.get('matrix', 'BLOSUM62')
    checkPsiBlastParameter('matrix', matrix)
    query.append(('matrix', matrix))

    gapopen = kwargs.get('gapopen', 11)
    checkPsiBlastParameter('gapopen', gapopen)
    query.append(('gapopen', gapopen))

    gapext = kwargs.get('gapext', 1)
    checkPsiBlastParameter('gapext', gapext)
    query.append(('gapext', gapext))

    expthr = kwargs.get('expthr', 10.)
    checkPsiBlastParameter('expthr', expthr)
    query.append(('expthr', expthr))

    psithr = kwargs.get('psithr', 1.0e-3)
    checkPsiBlastParameter('psithr', psithr)
    query.append(('psithr', psithr))

    scores = kwargs.get('scores', 500)
    checkPsiBlastParameter('scores', scores)
    query.append(('scores', scores))

    alignments = kwargs.get('alignments', 500)
    checkPsiBlastParameter('alignments', alignments)
    query.append(('alignments', alignments))

    query.append(('alignView', 0))

    dropoff = kwargs.get('dropoff', 15)
    checkPsiBlastParameter('dropoff', dropoff)
    query.append(('dropoff', dropoff))

    finaldropoff = kwargs.get('finaldropoff', 25)
    checkPsiBlastParameter('finaldropoff', finaldropoff)
    query.append(('finaldropoff', finaldropoff))

    filter = kwargs.get('filter', 'no')
    checkPsiBlastParameter('filter', filter)
    query.append(('filter', filter))

    if previousjobid == '' and selectedHits == '':
        seqrange = kwargs.get('seqrange', None)
        if seqrange is None:
            seqrange = '0-' + str(len(sequence))
        elif not isinstance(seqrange, str):
            raise TypeError('seqrange should be a string')
        elif len(seqrange.split('-')) != 2:
            raise ValueError('seqrange should take the form START-END')
        try:
            start = int(seqrange.split('-')[0])
            end = int(seqrange.split('-')[1])
        except ValueError:
            raise ValueError(
                'seqrange should be START-END with START and END being integers'
            )
        query.append(('seqrange', seqrange))

    headers = {'User-Agent': 'ProDy'}

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    data = urlencode(query)

    # submit the job
    base_url = 'http://www.ebi.ac.uk/Tools/services/rest/psiblast/'
    url = base_url + 'run/'
    LOGGER.timeit('_prody_psi-blast')
    if cycle == 0:
        LOGGER.info('PSI-Blast searching PDB database for "{0}..."'.format(
            sequence[:5]))
    else:
        LOGGER.info(
            'PSI-Blast searching PDB database, cycle={0}'.format(cycle))

    handle = openURL(url, data=data, headers=headers)
    job_id = handle.read()
    if PY3K:
        job_id = job_id.decode()
    handle.close()

    # check the status
    url = base_url + 'status/' + job_id
    handle = openURL(url)
    status = handle.read()
    if PY3K:
        status = status.decode()
    handle.close()

    # keep checking the status until it's no longer running
    while status == 'RUNNING':
        LOGGER.sleep(int(sleep), 'to reconnect to EBI for status.')
        LOGGER.write('Connecting to EBI for status...')
        handle = openURL(url)
        status = handle.read()
        if PY3K:
            status = status.decode()
        LOGGER.clear()
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_psi-blast') > timeout:
            LOGGER.warn('PSI-Blast search time out.')
            return None

    LOGGER.info('The status is {0}'.format(status))
    LOGGER.clear()
    LOGGER.report('PSI-Blast search completed in %.1fs.', '_prody_psi-blast')

    if cycle != 1:
        # get the results
        url = base_url + 'result/' + job_id + '/xml'
        handle = openURL(url)
        results = handle.read()
        handle.close()

        try:
            ext_xml = filename.lower().endswith('.xml')
        except AttributeError:
            pass
        else:
            if not ext_xml:
                filename += '.xml'
            f_out = open(filename, 'w')
            f_out.write(results)
            f_out.close()
            LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

        return job_id, PsiBlastRecord(results, sequence)
    else:
        return job_id
Example #34
File: blastpdb.py Project: prody/ProDy
def blastPDB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching of ProteinDataBank database *sequence* using NCBI blastp.

    :arg sequence: single-letter code amino acid sequence of the protein
        without any gap characters, all white spaces will be removed
    :type sequence: str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is multiplied by 1.5 when results are not ready.
    *timeout* (default is 120s) determines when to give up waiting for the
    results.
    """

    if sequence == "runexample":
        sequence = (
            "ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI"
            "SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN"
            "DAYDIVKMKKSNISPNFNFMGQLLDFERTL"
        )
    else:
        try:
            sequence = "".join(sequence.split())
            _ = sequence.isalpha()
        except AttributeError:
            raise TypeError("sequence must be a string")
        else:
            if not _:
                raise ValueError("not a valid protein sequence")
    headers = {"User-agent": "ProDy"}

    query = [("DATABASE", "pdb"), ("ENTREZ_QUERY", "(none)"), ("PROGRAM", "blastp")]
    expect = float(kwargs.pop("expect", 10e-10))
    if expect <= 0:
        raise ValueError("expect must be a positive number")
    query.append(("EXPECT", expect))
    hitlist_size = int(kwargs.pop("hitlist_size", 250))
    if hitlist_size <= 0:
        raise ValueError("expect must be a positive integer")
    query.append(("HITLIST_SIZE", hitlist_size))
    query.append(("QUERY", sequence))
    query.append(("CMD", "Put"))

    sleep = float(kwargs.pop("sleep", 2))
    timeout = float(kwargs.pop("timeout", 120))

    if kwargs:
        LOGGER.warn("Keyword argument(s) {0} are not used.".format(", ".join([repr(key) for key in kwargs])))

    try:
        import urllib.parse

        urlencode = lambda data: bytes(urllib.parse.urlencode(data), "utf-8")
    except ImportError:
        from urllib import urlencode

    url = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"

    data = urlencode(query)
    LOGGER.timeit("_prody_blast")
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'.format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    html = handle.read()
    index = html.find(b"RID =")
    if index == -1:
        raise Exception("NCBI did not return expected response.")
    else:
        last = html.find(b"\n", index)
        rid = html[index + len("RID =") : last].strip()

    index = html.find(b"RTOE =")
    if index == -1:
        rtoe = None  # This is not used
    else:
        last = html.find(b"\n", index)
        rtoe = int(html[index + len("RTOE =") : last].strip())

    query = [("ALIGNMENTS", 500), ("DESCRIPTIONS", 500), ("FORMAT_TYPE", "XML"), ("RID", rid), ("CMD", "Get")]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), "to reconnect NCBI for search results.")
        LOGGER.write("Connecting NCBI for search results...")
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b"Status=")
        LOGGER.clear()
        if index < 0:
            break
        last = results.index(b"\n", index)
        status = results[index + len("Status=") : last].strip()
        if status.upper() == "READY":
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing("_prody_blast") > timeout:
            LOGGER.warn("Blast search time out.")
            return None
    LOGGER.clear()
    LOGGER.report("Blast search completed in %.1fs.", "_prody_blast")
    try:
        ext_xml = filename.lower().endswith(".xml")
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += ".xml"
        out = open(filename, "w")
        out.write(results)
        out.close()
        LOGGER.info("Results are saved as {0}.".format(repr(filename)))
    return PDBBlastRecord(results, sequence)
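A hedged usage sketch; ``getBest`` and ``getHits`` are assumed accessors on :class:`PDBBlastRecord`:

record = blastPDB("runexample", filename="blast_report.xml")
if record is not None:
    best = record.getBest()                       # assumed: top-identity hit
    hits = record.getHits(percent_identity=90)    # assumed: identity filter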