示例#1
0
文件: compare.py 项目: xy21hb/ProDy
def matchModes(*modesets, **kwargs):
    """Returns the matches of modes among *modesets*. Note that the first 
    modeset will be treated as the reference so that only the matching 
    of each modeset to the first modeset is garanteed to be optimal.
    
    :arg index: if `True` then indices of modes will be returned instead of 
                :class:`Mode` instances.
    :type index: bool
    """

    index = kwargs.pop('index', False)
    modeset0 = modesets[0]
    ret = [modeset0]

    n_modes = len(modeset0)
    n_sets = len(modesets)
    if n_sets == 1:
        return modesets
    elif n_sets == 0:
        raise ValueError('at least one modeset should be given')

    LOGGER.progress(
        'Matching {0} modes across {1} modesets...'.format(n_modes, n_sets),
        n_sets, '_prody_matchModes')
    for i, modeset in enumerate(modesets):
        LOGGER.update(i, label='_prody_matchModes')
        if i > 0:
            _, reordered_modeset = pairModes(modeset0, modeset, index=index)
            ret.append(reordered_modeset)
    LOGGER.finish()

    return ret
示例#2
0
    def _superpose(self, **kwargs):
        """Superpose conformations and update coordinates."""

        indices = self._indices
        weights = self._weights
        mobs = self._confs
        if indices is None:
            idx = False
            tar = self._coords
            movs = None
        else:
            idx = True
            if self._weights is not None:
                weights = weights[indices]
            tar = self._coords[indices]
            movs = self._confs

        linalg = importLA()
        svd = linalg.svd
        det = linalg.det

        if weights is None:
            tar_com = tar.mean(0)
            tar_org = (tar - tar_com)
            mob_org = zeros(tar_org.shape, dtype=mobs.dtype)
            tar_org = tar_org.T
        else:
            weights_sum = weights.sum()
            weights_dot = dot(weights.T, weights)
            tar_com = (tar * weights).sum(axis=0) / weights_sum
            tar_org = (tar - tar_com)
            mob_org = zeros(tar_org.shape, dtype=mobs.dtype)

        LOGGER.progress('Superposing ', len(mobs), '_prody_ensemble')
        for i, mob in enumerate(mobs):
            if idx:
                mob = mob[indices]
            if weights is None:
                mob_com = mob.mean(0)
                matrix = dot(tar_org, subtract(mob, mob_com, mob_org))
            else:
                mob_com = (mob * weights).sum(axis=0) / weights_sum
                subtract(mob, mob_com, mob_org)
                matrix = dot((tar_org * weights).T,
                             (mob_org * weights)) / weights_dot

            U, s, Vh = svd(matrix)
            Id = array([[1, 0, 0], [0, 1, 0], [0, 0, sign(det(matrix))]])
            rotation = dot(Vh.T, dot(Id, U.T))

            if movs is None:
                mobs[i] = dot(mob_org, rotation)
                add(mobs[i], tar_com, mobs[i])
            else:
                add(dot(movs[i], rotation),
                    (tar_com - dot(mob_com, rotation)), movs[i])
            LOGGER.update(i + 1, label='_prody_ensemble')
        LOGGER.finish()
示例#3
0
    def _superpose(self, **kwargs):
        """Superpose conformations and update coordinates."""

        indices = self._indices
        weights = self._weights
        mobs = self._confs
        if indices is None:
            idx = False
            tar = self._coords
            movs = None
        else:
            idx = True
            if self._weights is not None:
                weights = weights[indices]
            tar = self._coords[indices]
            movs = self._confs

        linalg = importLA()
        svd = linalg.svd
        det = linalg.det

        if weights is None:
            tar_com = tar.mean(0)
            tar_org = (tar - tar_com)
            mob_org = zeros(tar_org.shape, dtype=mobs.dtype)
            tar_org = tar_org.T
        else:
            weights_sum = weights.sum()
            weights_dot = dot(weights.T, weights)
            tar_com = (tar * weights).sum(axis=0) / weights_sum
            tar_org = (tar - tar_com)
            mob_org = zeros(tar_org.shape, dtype=mobs.dtype)

        LOGGER.progress('Superposing ', len(mobs), '_prody_ensemble')
        for i, mob in enumerate(mobs):
            if idx:
                mob = mob[indices]
            if weights is None:
                mob_com = mob.mean(0)
                matrix = dot(tar_org, subtract(mob, mob_com, mob_org))
            else:
                mob_com = (mob * weights).sum(axis=0) / weights_sum
                subtract(mob, mob_com, mob_org)
                matrix = dot((tar_org * weights).T,
                             (mob_org * weights)) / weights_dot

            U, s, Vh = svd(matrix)
            Id = array([[1, 0, 0], [0, 1, 0], [0, 0, sign(det(matrix))]])
            rotation = dot(Vh.T, dot(Id, U.T))

            if movs is None:
                mobs[i] = dot(mob_org, rotation)
                add(mobs[i], tar_com, mobs[i])
            else:
                add(dot(movs[i], rotation),
                    (tar_com - dot(mob_com, rotation)), movs[i])
            LOGGER.update(i + 1, label='_prody_ensemble')
        LOGGER.finish()
示例#4
0
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by blastclust.
    They are available at FTP site: ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it after
    compressing in your home directory in :file:`.prody/pdbclusters`.
    Compressed files will be less than 4 Mb in size.  Cluster data can
    be loaded using :func:`loadPDBClusters` function and be accessed
    using :func:`listPDBCluster`."""

    if sqid is not None:
        if isListLike(sqid):
            for s in sqid:
                if s not in PDB_CLUSTERS:
                    raise ValueError('sqid must be one or more of ' +
                                     PDB_CLUSTERS_SQID_STR)
            keys = list(sqid)
        else:
            if sqid not in PDB_CLUSTERS:
                raise ValueError('sqid must be one or more of ' +
                                 PDB_CLUSTERS_SQID_STR)
            keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)
    LOGGER.progress('Downloading sequence clusters', len(keys),
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.'.format(x))
            continue
        else:
            out = openFile(filename + '.gz', 'w', folder=PDB_CLUSTERS_PATH)
            out.write(inp.read())
            inp.close()
            out.close()
            count += 1
        LOGGER.update(i, label='_prody_fetchPDBClusters')
    LOGGER.finish()
    if len(keys) == count:
        LOGGER.info('All selected PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warn('PDB clusters could not be downloaded.')
示例#5
0
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by blastclust.
    They are available at FTP site: ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it after
    compressing in your home directory in :file:`.prody/pdbclusters`.
    Compressed files will be less than 4 Mb in size.  Cluster data can
    be loaded using :func:`loadPDBClusters` function and be accessed
    using :func:`listPDBCluster`."""

    if sqid is not None:
        if sqid not in PDB_CLUSTERS:
            raise ValueError('sqid must be one of ' + PDB_CLUSTERS_SQID_STR)
        keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)
    LOGGER.progress('Downloading sequence clusters', len(keys),
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.')
            continue
        else:
            out = openFile(filename+'.gz', 'w', folder=PDB_CLUSTERS_PATH)
            out.write(inp.read())
            inp.close()
            out.close()
            count += 1
        LOGGER.update(i, label='_prody_fetchPDBClusters')
    LOGGER.finish()
    if len(PDB_CLUSTERS) == count:
        LOGGER.info('All PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warn('PDB clusters could not be downloaded.')
示例#6
0
def parseChainsList(filename):
    """
    Parse a set of PDBs and extract chains based on a list in a text file.

    :arg filename: the name of the file to be read
    :type filename: str

    Returns: lists containing an :class:'.AtomGroup' for each PDB, 
    the headers for those PDBs, and the requested :class:`.Chain` objects
    """

    fi = open(filename, 'r')
    lines = fi.readlines()
    fi.close()

    pdb_ids = []
    ags = []
    headers = []
    chains = []
    num_lines = len(lines)
    LOGGER.progress('Starting', num_lines, '_prody_parseChainsList')
    for i, line in enumerate(lines):
        LOGGER.update(i, 'Parsing lines...', label='_prody_parseChainsList')
        pdb_id = line.split()[0].split('_')[0]
        if not pdb_id in pdb_ids:
            pdb_ids.append(pdb_id)

            ag, header = parsePDB(pdb_id, compressed=False, \
                                  subset=line.split()[0].split('_')[1], header=True)

            ags.append(ag)
            headers.append(header)

        chains.append(ag.getHierView()[line.strip().split()[1]])

    LOGGER.finish()
    LOGGER.info(
        '{0} PDBs have been parsed and {1} chains have been extracted. \
                '.format(len(ags), len(chains)))

    return ags, headers, chains
示例#7
0
文件: pdbfile.py 项目: prody/ProDy
def parseChainsList(filename):
    """
    Parse a set of PDBs and extract chains based on a list in a text file.

    :arg filename: the name of the file to be read
    :type filename: str

    Returns: lists containing an :class:'.AtomGroup' for each PDB, 
    the headers for those PDBs, and the requested :class:`.Chain` objects
    """
    
    fi = open(filename,'r')
    lines = fi.readlines()
    fi.close()

    pdb_ids = []
    ags = []
    headers = []
    chains = []
    num_lines = len(lines)
    LOGGER.progress('Starting', num_lines, '_prody_parseChainsList')
    for i, line in enumerate(lines):
        LOGGER.update(i, 'Parsing lines...', label='_prody_parseChainsList')
        pdb_id = line.split()[0].split('_')[0]
        if not pdb_id in pdb_ids:
            pdb_ids.append(pdb_id)

            ag, header = parsePDB(pdb_id, compressed=False, \
                                  subset=line.split()[0].split('_')[1], header=True)

            ags.append(ag)
            headers.append(header)

        chains.append(ag.getHierView()[line.strip().split()[1]])

    LOGGER.finish()
    LOGGER.info('{0} PDBs have been parsed and {1} chains have been extracted. \
                '.format(len(ags),len(chains)))

    return ags, headers, chains
示例#8
0
文件: measure.py 项目: SHZ66/ProDy
def calcMSF(coordsets):
    """Calculate mean square fluctuation(s) (MSF)."""

    try:
        ncsets = coordsets.numFrames()
    except AttributeError:
        try:
            coordsets = coordsets.getCoordsets()
        except AttributeError:
            pass
        try:
            ndim, shape = coordsets.ndim, coordsets.shape
        except:
            raise TypeError('coordsets must be a Numpy array or a ProDy '
                            'object with `getCoordsets` method')
        if ndim != 3 or shape[0] == 1:
            raise ValueError('coordsets must contain multiple sets')
        msf = var(coordsets, 0).sum(1)
    else:
        nfi = coordsets.nextIndex()
        natoms = coordsets.numSelected()
        total = zeros((natoms, 3))
        sqsum = zeros((natoms, 3))

        LOGGER.progress(
            'Evaluating {0} frames from {1}:'.format(ncsets, str(coordsets)),
            ncsets, '_prody_calcMSF')
        ncsets = 0
        coordsets.reset()
        for frame in coordsets:
            frame.superpose()
            coords = frame._getCoords()
            total += coords
            sqsum += coords**2
            ncsets += 1
            LOGGER.update(ncsets, label='_prody_calcMSF')
        LOGGER.finish()
        msf = (sqsum / ncsets - (total / ncsets)**2).sum(1)
        coordsets.goto(nfi)
    return msf
示例#9
0
文件: measure.py 项目: prody/ProDy
def calcMSF(coordsets):
    """Calculate mean square fluctuation(s) (MSF)."""

    try:
        ncsets = coordsets.numFrames()
    except AttributeError:
        try:
            coordsets = coordsets.getCoordsets()
        except AttributeError:
            pass
        try:
            ndim, shape = coordsets.ndim, coordsets.shape
        except:
            raise TypeError('coordsets must be a Numpy array or a ProDy '
                            'object with `getCoordsets` method')
        if ndim != 3 or shape[0] == 1:
            raise ValueError('coordsets must contain multiple sets')
        msf = var(coordsets, 0).sum(1)
    else:
        nfi = coordsets.nextIndex()
        natoms = coordsets.numSelected()
        total = zeros((natoms, 3))
        sqsum = zeros((natoms, 3))

        LOGGER.progress('Evaluating {0} frames from {1}:'
                        .format(ncsets, str(coordsets)), ncsets,
                        '_prody_calcMSF')
        ncsets = 0
        coordsets.reset()
        for frame in coordsets:
            frame.superpose()
            coords = frame._getCoords()
            total += coords
            sqsum += coords ** 2
            ncsets += 1
            LOGGER.update(ncsets, label='_prody_calcMSF')
        LOGGER.finish()
        msf = (sqsum/ncsets - (total/ncsets)**2).sum(1)
        coordsets.goto(nfi)
    return msf
示例#10
0
def parsePDB(*pdb, **kwargs):
    """Returns an :class:`.AtomGroup` and/or dictionary containing header data
    parsed from a PDB file.

    This function extends :func:`.parsePDBStream`.

    See :ref:`parsepdb` for a detailed usage example.

    :arg pdb: one PDB identifier or filename, or a list of them.
        If needed, PDB files are downloaded using :func:`.fetchPDB()` function.
    
    You can also provide arguments that you would like passed on to fetchPDB().
    """

    n_pdb = len(pdb)
    if n_pdb == 1:
        if isListLike(pdb[0]):
            pdb = pdb[0]
            n_pdb = len(pdb)

    if n_pdb == 1:
        return _parsePDB(pdb[0], **kwargs)
    else:
        results = []
        lstkwargs = {}
        for key in kwargs:
            argval = kwargs.get(key)
            if np.isscalar(argval):
                argval = [argval] * n_pdb
            lstkwargs[key] = argval

        start = time.time()
        LOGGER.progress('Retrieving {0} PDB structures...'.format(n_pdb),
                        n_pdb, '_prody_parsePDB')
        for i, p in enumerate(pdb):
            kwargs = {}
            for key in lstkwargs:
                kwargs[key] = lstkwargs[key][i]
            c = kwargs.get('chain', '')
            LOGGER.update(i,
                          'Retrieving {0}...'.format(p + c),
                          label='_prody_parsePDB')
            result = _parsePDB(p, **kwargs)
            if not isinstance(result, tuple):
                if isinstance(result, dict):
                    result = (None, result)
                else:
                    result = (result, None)
            results.append(result)

        results = list(zip(*results))
        LOGGER.finish()

        for i in reversed(range(len(results))):
            if all(j is None for j in results[i]):
                results.pop(i)
        if len(results) == 1:
            results = results[0]
        results = list(results)

        model = kwargs.get('model')
        header = kwargs.get('header', False)
        if model != 0 and header:
            numPdbs = len(results[0])
        else:
            numPdbs = len(results)

        LOGGER.info('{0} PDBs were parsed in {1:.2f}s.'.format(
            numPdbs,
            time.time() - start))

        return results
示例#11
0
def matchModes(*modesets, **kwargs):
    """Returns the matches of modes among *modesets*. Note that the first 
    modeset will be treated as the reference so that only the matching 
    of each modeset to the first modeset is garanteed to be optimal.
    
    :arg index: if **True** then indices of modes will be returned instead of 
                :class:`Mode` instances
    :type index: bool

    :arg turbo: if **True** then the computation will be performed in parallel. 
                The number of threads is set to be the same as the number of 
                CPUs. Assigning a number to specify the number of threads to be 
                used. Note that if writing a script, ``if __name__ == '__main__'`` 
                is necessary to protect your code when multi-tasking. 
                See https://docs.python.org/2/library/multiprocessing.html for details.
                Default is **False**
    :type turbo: bool, int
    """

    index = kwargs.pop('index', False)
    turbo = kwargs.pop('turbo', False)

    n_worker = None
    if not isinstance(turbo, bool):
        n_worker = int(turbo)

    modeset0 = modesets[0]
    if index:
        ret = [modeset0.getIndices()]
    else:
        ret = [modeset0]

    n_modes = len(modeset0)
    n_sets = len(modesets)
    if n_sets == 1:
        return ret
    elif n_sets == 0:
        raise ValueError('at least one modeset should be given')

    if turbo:
        from multiprocessing import Pool, cpu_count
        from math import ceil
        
        if not n_worker:
            n_worker = cpu_count()

        LOGGER.info('Matching {0} modes across {1} modesets with {2} threads...'
                        .format(n_modes, n_sets, n_worker))

        pool = Pool(n_worker)
        n_sets_per_worker = ceil((n_sets - 1) / n_worker)
        args = []
        for i in range(n_worker):
            start = i*n_sets_per_worker + 1
            end = (i+1)*n_sets_per_worker + 1
            subset = modesets[start:end]
            args.append((modeset0, subset, index))
        nested_ret = pool.map(_pairModes_wrapper, args)
        for entry in nested_ret:
            ret.extend(entry)

        pool.close()
        pool.join()
    else:
        LOGGER.progress('Matching {0} modes across {1} modesets...'
                        .format(n_modes, n_sets), n_sets, '_prody_matchModes')
        for i, modeset in enumerate(modesets):
            LOGGER.update(i, label='_prody_matchModes')
            if i > 0:
                _, reordered_modeset = pairModes(modeset0, modeset, index=index, **kwargs)
                ret.append(reordered_modeset)
        LOGGER.finish()
    
    return ret
示例#12
0
文件: pfam.py 项目: fongchun/ProDy
def parsePfamPDBs(query, data=[], **kwargs):
    """Returns a list of AtomGroups containing sections of chains that 
    correspond to a particular PFAM domain family. These are defined by 
    alignment start and end residue numbers.

    :arg query: UniProt ID or PDB ID
        If a PDB ID is provided the corresponding UniProt ID is used.
        If this returns multiple matches then start or end must also be provided.
        This query is also used for label refinement of the Pfam domain MSA.
    :type query: str

    :arg data: If given the data list from the Pfam mapping table will 
        be output through this argument.
    :type data: list

    :keyword start: Residue number for defining the start of the domain.
        The PFAM domain that starts closest to this will be selected. 
        Default is **1**
    :type start: int

    :keyword end: Residue number for defining the end of the domain.
        The PFAM domain that ends closest to this will be selected. 
    :type end: int
    """
    
    start = kwargs.pop('start', 1)
    end = kwargs.pop('end', None)

    if len(query) > 4 and query.startswith('PF'):
        pfam_acc = query
    else:
        pfam_matches = searchPfam(query)
        keys = list(pfam_matches.keys())

        if isinstance(start, Integral):
            start_diff = []
            for i, key in enumerate(pfam_matches):
                start_diff.append(int(pfam_matches[key]['locations'][0]['start']) - start)
            start_diff = np.array(start_diff)
            pfam_acc = keys[np.where(abs(start_diff) == min(abs(start_diff)))[0][0]]

        elif isinstance(end, Integral):
            end_diff = []
            for i, key in enumerate(pfam_matches):
                end_diff.append(int(pfam_matches[key]['locations'][0]['end']) - end)
            end_diff = np.array(end_diff)
            pfam_acc = keys[np.where(abs(end_diff) == min(abs(end_diff)))[0][0]]

        else:
            raise ValueError('Please provide an integer for start or end '
                             'when using a UniProt ID or PDB ID.')

    from ftplib import FTP
    from .uniprot import queryUniprot

    data_stream = BytesIO()
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    ftp.login()
    ftp.cwd('pub/databases/Pfam/current_release')
    ftp.retrbinary('RETR pdbmap.gz', data_stream.write)
    ftp.quit()
    zip_data = data_stream.getvalue()
    data_stream.close()

    rawdata = gunzip(zip_data)
    if PY3K:
        rawdata = rawdata.decode()

    fields = ['PDB_ID', 'chain', 'nothing', 'PFAM_Name', 'PFAM_ACC', 
              'UniprotAcc', 'UniprotResnumRange']
    
    data_dicts = []
    for line in rawdata.split('\n'):
        if line.find(pfam_acc) != -1:
            data_dicts.append({})
            for j, entry in enumerate(line.strip().split('\t')):
                data_dicts[-1][fields[j]] = entry.strip(';')

    pdb_ids = [data_dict['PDB_ID'] for data_dict in data_dicts]
    chains = [data_dict['chain'] for data_dict in data_dicts]

    header = kwargs.pop('header', False)
    model = kwargs.get('model', None)
    results = parsePDB(*pdb_ids, chain=chains, header=True, **kwargs)

    ags, headers = results
    ags, headers = list(ags), list(headers)

    if model == 0:
        LOGGER.info('only header is requested and returned')
        return results

    if header:
        results = (ags, headers)
    else:
#        ags = results
#        ags = list(ags)
        results = ags

    LOGGER.progress('Extracting Pfam domains...', len(ags))
    comma_splitter = re.compile(r'\s*,\s*').split
    no_info = []
    for i, ag in enumerate(ags):
        LOGGER.update(i)
        data_dict = data_dicts[i]
        pfamRange = data_dict['UniprotResnumRange'].split('-')
        uniprotAcc = data_dict['UniprotAcc']
        try:
            uniData = queryUniprot(uniprotAcc)
        except:
            LOGGER.warn('No Uniprot record found for {0}'.format(data_dict['PBD_ID']))
            continue

        resrange = None
        found = False
        for key, value in uniData.items():
            if not key.startswith('dbReference'):
                continue
            try:
                pdbid = value['PDB']
            except:
                continue
            if pdbid != data_dict['PDB_ID']:
                continue
            pdbchains = value['chains']

            # example chain strings: "A=27-139, B=140-150" or "A/B=27-150"
            pdbchains = comma_splitter(pdbchains)
            for chain in pdbchains:
                chids, resrange = chain.split('=')
                chids = [chid.strip() for chid in chids.split('/')]
                if data_dict['chain'] in chids:
                    resrange = resrange.split('-')
                    found = True
                    break
            if found:
                break

        if found:
            header = headers[i]
            chain_accessions = [dbref.accession 
                                for dbref in header[data_dict['chain']].dbrefs]
            try:
                if len(chain_accessions) > 0:
                    right_part = np.where(np.array(chain_accessions) == 
                                        data_dict['UniprotAcc'])[0][0]
                else:
                    raise ValueError('There is no accession for a chain in the Header')
            except:
                LOGGER.warn('Could not map domains in {0}'
                            .format(data_dict['PDB_ID'] 
                            + data_dict['chain']))
                no_info.append(i)
                continue

            right_dbref = header[data_dict['chain']].dbrefs[right_part]
            chainStart = ag.select('chain {0}'.format(data_dict['chain'])
                                  ).getResnums()[0]
            missing = chainStart - right_dbref.first[0]
            partStart = ag.getResindices()[np.where(ag.getResnums() == 
                                           right_dbref.first[0] + missing)][0]
            pfStart, pfEnd = int(pfamRange[0]), int(pfamRange[1])
            uniStart, uniEnd = int(resrange[0]), int(resrange[1])

            resiStart = pfStart - uniStart + partStart - missing
            resiEnd = pfEnd - uniStart + partStart - missing
            ags[i] = ag.select('resindex {0} to {1}'.format(
                            resiStart, resiEnd)) 
        else:
            no_info.append(i)
    LOGGER.finish()

    for i in reversed(no_info):
        ags.pop(i)
        if header:
            headers.pop(i)

    if isinstance(data, list):
        data.extend(data_dicts)
    else:
        LOGGER.warn('data should be a list in order to get output')
    
    return results
示例#13
0
文件: functions.py 项目: prody/ProDy
def addPDBEnsemble(ensemble, PDBs, refpdb=None, labels=None, 
                   mapping_func=mapOntoChain, occupancy=None, unmapped=None, **kwargs):  
    """Adds extra structures to a given PDB ensemble. 

    :arg ensemble: the ensemble to which the PDBs are added
    :type ensemble: :class:`.PDBEnsemble`

    :arg refpdb: reference structure. If set to `None`, it will be set to `ensemble.getAtoms()` automatically
    :type refpdb: :class:`.Chain`, :class:`.Selection`, or :class:`.AtomGroup`

    :arg PDBs: A list of PDB structures
    :type PDBs: iterable

    :arg title: the title of the ensemble
    :type title: str

    :arg labels: labels of the conformations
    :type labels: list

    :arg seqid: minimal sequence identity (percent)
    :type seqid: int

    :arg coverage: minimal sequence overlap (percent)
    :type coverage: int

    :arg occupancy: minimal occupancy of columns (range from 0 to 1). Columns whose occupancy 
                    is below this value will be trimmed
    :type occupancy: float

    :arg unmapped: a list of PDB IDs that cannot be included in the ensemble. This is an 
                   output argument
    :type unmapped: list
    """

    degeneracy = kwargs.pop('degeneracy', True)
    subset = str(kwargs.get('subset', 'calpha')).lower()
    superpose = kwargs.pop('superpose', True)

    if labels is not None:
        if len(labels) != len(PDBs):
            raise TypeError('Labels and PDBs must have the same lengths.')
    else:
        labels = []
        
        for pdb in PDBs:
            if pdb is None:
                labels.append(None)
            else:
                labels.append(pdb.getTitle())

    # obtain refchains from the hierarhical view of the reference PDB
    if refpdb is None:
        refpdb = ensemble._atoms
    else:
        if subset != 'all':
            refpdb = refpdb.select(subset)

    refchains = list(refpdb.getHierView())

    start = time.time()

    # obtain the atommap of all the chains combined.
    atoms = refchains[0]
    for i in range(1, len(refchains)):
        atoms += refchains[i]
    
    # add the PDBs to the ensemble
    if unmapped is None: unmapped = []

    LOGGER.progress('Appending the ensemble...', len(PDBs), '_prody_addPDBEnsemble')
    for i, pdb in enumerate(PDBs):
        lbl = labels[i]
        if pdb is None:
            unmapped.append(labels[i])
            continue

        LOGGER.update(i, 'Mapping %s to the reference...'%pdb.getTitle(), 
                      label='_prody_addPDBEnsemble')
        if not isinstance(pdb, (Chain, Selection, AtomGroup)):
            raise TypeError('PDBs must be a list of Chain, Selection, or AtomGroup.')

        atommaps = []
        # find the mapping of the pdb to each reference chain
        for chain in refchains:
            mappings = mapping_func(pdb, chain,
                                    index=i,
                                    **kwargs)
            if len(mappings) > 0:
                atommaps.append(mappings[0][0])
            else:
                break

        if len(atommaps) != len(refchains):
            unmapped.append(lbl)
            continue
        
        # combine the mappings of pdb to reference chains
        atommap = atommaps[0]
        for i in range(1, len(atommaps)):
            atommap += atommaps[i]
        
        # add the mappings to the ensemble
        ensemble.addCoordset(atommap, weights=atommap.getFlags('mapped'), 
                             label=lbl, degeneracy=degeneracy)
    LOGGER.finish()

    if occupancy is not None:
        ensemble = trimPDBEnsemble(ensemble, occupancy=occupancy)
    if superpose:
        ensemble.iterpose()

    LOGGER.info('{0} PDBs were added to the ensemble in {1:.2f}s.'
                     .format(len(PDBs) - len(unmapped), time.time()-start))

    if unmapped:
        LOGGER.warn('{0} structures cannot be mapped.'.format(len(unmapped)))

    return ensemble
示例#14
0
文件: signature.py 项目: brezal/ProDy
def calcEnsembleENMs(ensemble,
                     model='gnm',
                     trim='reduce',
                     n_modes=20,
                     **kwargs):
    """Description"""

    match = kwargs.pop('match', True)
    if isinstance(ensemble, Conformation):
        conformation = ensemble
        ensemble = conformation.getEnsemble()
        index = conformation.getIndex()
        ensemble = ensemble[index:index + 1]
    if model is GNM:
        model_type = 'GNM'
    elif model is ANM:
        model_type = 'ANM'
    else:
        model_type = str(model).strip().upper()

    start = time.time()

    atoms = ensemble.getAtoms()
    select = None
    if ensemble.isSelected():
        select = atoms
        atoms = ensemble.getAtoms(selected=False)

    labels = ensemble.getLabels()

    ### ENMs ###
    ## ENM for every conf
    enms = []
    n_confs = ensemble.numConfs()

    str_modes = 'all' if n_modes is None else str(n_modes)
    LOGGER.progress(
        'Calculating {0} {1} modes for {2} conformations...'.format(
            str_modes, model_type, n_confs), n_confs,
        '_prody_calcEnsembleENMs')

    for i in range(n_confs):
        LOGGER.update(i, label='_prody_calcEnsembleENMs')
        coords = ensemble.getCoordsets(i, selected=False)
        nodes = coords[0, :, :]
        if atoms is not None:
            atoms.setCoords(nodes)
            nodes = atoms
        enm, _ = calcENM(nodes,
                         select,
                         model=model,
                         trim=trim,
                         n_modes=n_modes,
                         title=labels[i],
                         **kwargs)
        enms.append(enm)

        #lbl = labels[i] if labels[i] != '' else '%d-th conformation'%(i+1)
    LOGGER.finish()

    min_n_modes = ensemble.numAtoms() * 3
    for enm in enms:
        n_modes = enm.numModes()
        if n_modes < min_n_modes:
            min_n_modes = n_modes

    for i in range(len(enms)):
        n_modes = enms[i].numModes()
        if n_modes > min_n_modes:
            enms[i] = enms[i][:min_n_modes]
            LOGGER.warn(
                'last {0} modes for {1} has been discarded because at least one '
                'conformation has only {2} modes'.format(
                    n_modes - min_n_modes, enms[i].getTitle(), min_n_modes))

    LOGGER.info(
        '{0} {1} modes were calculated for each of the {2} conformations in {3:.2f}s.'
        .format(str_modes, model_type, n_confs,
                time.time() - start))

    modeens = ModeEnsemble(title=ensemble.getTitle())
    modeens.addModeSet(enms,
                       weights=ensemble.getWeights(),
                       label=ensemble.getLabels())
    modeens.setAtoms(ensemble.getAtoms())

    if match:
        modeens.match()
    return modeens
示例#15
0
def buildPDBEnsemble(PDBs,
                     ref=None,
                     title='Unknown',
                     labels=None,
                     mapping_func=mapOntoChain,
                     unmapped=None,
                     **kwargs):
    """Builds a PDB ensemble from a given reference structure and a list of PDB structures. 
    Note that the reference structure should be included in the list as well.

    :arg PDBs: A list of PDB structures
    :type PDBs: iterable

    :arg ref: Reference structure or the index to the reference in ``PDBs``. If **None**,
        then the first item in ``PDBs`` will be considered as the reference. 
        Default is **None**
    :type ref: int, :class:`.Chain`, :class:`.Selection`, or :class:`.AtomGroup`

    :arg title: The title of the ensemble
    :type title: str

    :arg labels: labels of the conformations
    :type labels: list

    :arg occupancy: Minimal occupancy of columns (range from 0 to 1). Columns whose occupancy
        is below this value will be trimmed.
    :type occupancy: float

    :arg unmapped: A list of PDB IDs that cannot be included in the ensemble. This is an 
        output argument. 
    :type unmapped: list

    :arg subset: A subset for selecting particular atoms from the input structures.
        Default is calpha
    :type subset: str

    :arg superpose: if set to ``'iter'``, :func:`.PDBEnsemble.iterpose` will be used to 
        superpose the structures, otherwise conformations will be superposed with respect 
        to the reference specified by ``ref``. Default is ``'iter'``
    :type superpose: str
    """

    occupancy = kwargs.pop('occupancy', None)
    degeneracy = kwargs.pop('degeneracy', True)
    subset = str(kwargs.get('subset', 'calpha')).lower()
    superpose = kwargs.pop('superpose', 'iter')
    superpose = kwargs.pop('iterpose', superpose)

    if len(PDBs) == 1:
        raise ValueError('PDBs should have at least two items')

    if labels is not None:
        if len(labels) != len(PDBs):
            raise TypeError('Labels and PDBs must have the same lengths.')
    else:
        labels = []

        for pdb in PDBs:
            if pdb is None:
                labels.append(None)
            else:
                labels.append(pdb.getTitle())

    if ref is None:
        refpdb = PDBs[0]
        refidx = 0
    elif isinstance(ref, Integral):
        refpdb = PDBs[ref]
        refidx = ref
    else:
        refpdb = ref
        if refpdb not in PDBs:
            raise ValueError('refpdb should be also in the PDBs')
        refidx = PDBs.index(ref)

    # obtain refchains from the hierarchical view of the reference PDB
    if subset != 'all':
        refpdb = refpdb.select(subset)

    try:
        refchains = list(refpdb.getHierView())
    except AttributeError:
        raise TypeError('refpdb must have getHierView')

    start = time.time()
    # obtain the atommap of all the chains combined.
    atoms = refpdb

    # initialize a PDBEnsemble with reference atoms and coordinates
    ensemble = PDBEnsemble(title)
    ensemble.setAtoms(atoms)
    ensemble.setCoords(atoms.getCoords())

    # build the ensemble
    if unmapped is None: unmapped = []

    LOGGER.progress('Building the ensemble...', len(PDBs),
                    '_prody_buildPDBEnsemble')
    for i, pdb in enumerate(PDBs):
        if pdb is None:
            unmapped.append(labels[i])
            continue

        LOGGER.update(i,
                      'Mapping %s to the reference...' % pdb.getTitle(),
                      label='_prody_buildPDBEnsemble')
        try:
            pdb.getHierView()
        except AttributeError:
            raise TypeError(
                'PDBs must be a list of instances having the access to getHierView'
            )

        if labels is None:
            lbl = pdb.getTitle()
        else:
            lbl = labels[i]

        atommaps = []
        # find the mapping of the pdb to each reference chain
        for chain in refchains:
            mappings = mapping_func(pdb, chain, index=i, **kwargs)
            if len(mappings) > 0:
                atommaps.append(mappings[0][0])
            else:
                break

        if len(atommaps) != len(refchains):
            unmapped.append(lbl)
            continue

        # combine the mappings of pdb to reference chains
        atommap = atommaps[0]
        for j in range(1, len(atommaps)):
            atommap += atommaps[j]

        # add the mappings to the ensemble
        ensemble.addCoordset(atommap,
                             weights=atommap.getFlags('mapped'),
                             label=lbl,
                             degeneracy=degeneracy)

    LOGGER.finish()

    if occupancy is not None:
        ensemble = trimPDBEnsemble(ensemble, occupancy=occupancy)

    if superpose != 'iter':
        ensemble.superpose(ref=refidx)
    else:
        ensemble.iterpose()

    LOGGER.info('Ensemble ({0} conformations) were built in {1:.2f}s.'.format(
        ensemble.numConfs(),
        time.time() - start))

    if unmapped:
        LOGGER.warn('{0} structures cannot be mapped.'.format(len(unmapped)))
    return ensemble
示例#16
0
def writeDCD(filename,
             trajectory,
             start=None,
             stop=None,
             step=None,
             align=False):
    """Write 32-bit CHARMM format DCD file (also NAMD 2.1 and later).
    *trajectory* can be an :class:`Trajectory`, :class:`DCDFile`, or
    :class:`Ensemble` instance. *filename* is returned upon successful
    output of file."""
    if not filename.lower().endswith('.dcd'):
        filename += '.dcd'

    if not isinstance(trajectory, (TrajBase, Ensemble, Atomic)):
        raise TypeError('{0} is not a valid type for trajectory'.format(
            type(trajectory)))

    irange = list(
        range(*slice(start, stop, step).indices(trajectory.numCoordsets())))
    n_csets = len(irange)
    if n_csets == 0:
        raise ValueError('trajectory does not have any coordinate sets, or '
                         'no coordinate sets are selected')

    if isinstance(trajectory, Atomic):
        isEnsemble = False
        isAtomic = True
        n_atoms = trajectory.numAtoms()
    else:
        isEnsemble = True
        isAtomic = False
        n_atoms = trajectory.numSelected()
    if n_atoms == 0:
        raise ValueError('no atoms are selected in the trajectory')
    if isinstance(trajectory, TrajBase):
        isTrajectory = True
        unitcell = trajectory.hasUnitcell()
        nfi = trajectory.nextIndex()
        trajectory.reset()
        pack_i_48 = pack('i', 48)
        if isinstance(trajectory, Trajectory):
            timestep = trajectory.getTimestep()[0]
            first_ts = trajectory.getFirstTimestep()[0]
            framefreq = trajectory.getFrameFreq()[0]
            n_fixed = trajectory.numFixed()[0]
        else:
            timestep = trajectory.getTimestep()
            first_ts = trajectory.getFirstTimestep()
            framefreq = trajectory.getFrameFreq()
            n_fixed = trajectory.numFixed()
    else:
        isTrajectory = False
        unitcell = False
        if isinstance(trajectory, Ensemble):
            frame = trajectory[0]
        else:
            frame = trajectory
            acsi = trajectory.getACSIndex()
        timestep = 1
        first_ts = 0
        framefreq = 1
        n_fixed = 0

    dcd = DCDFile(filename, mode='w')
    LOGGER.progress('Writing DCD', len(irange), '_prody_writeDCD')
    prev = -1
    uc = None
    time_ = time()
    for j, i in enumerate(irange):
        diff = i - prev
        prev = i
        if isTrajectory:
            if diff > 1:
                trajectory.skip(diff - 1)
            frame = next(trajectory)
            if frame is None:
                break
            if unitcell:
                uc = frame._getUnitcell()
                uc[3:] = np.sin((PISQUARE / 90) * (90 - uc[3:]))
                uc = uc[[0, 3, 1, 4, 5, 2]]
        elif isEnsemble:
            frame._index = i
        else:
            frame.setACSIndex(i)
        if align:
            frame.superpose()
        if j == 0:
            dcd.write(frame._getCoords(),
                      uc,
                      timestep=timestep,
                      firsttimestep=first_ts,
                      framefreq=framefreq)
        else:
            dcd.write(frame._getCoords(), uc)
        LOGGER.update(i, label='_prody_writeDCD')
    if isAtomic:
        trajectory.setACSIndex(acsi)
    j += 1
    LOGGER.finish()
    dcd.close()
    time_ = time() - time_ or 0.01
    dcd_size = 1.0 * (56 + (n_atoms * 3 + 6) * 4) * n_csets / (1024 * 1024)
    LOGGER.info('DCD file was written in {0:.2f} seconds.'.format(time_))
    LOGGER.info('{0:.2f} MB written at input rate {1:.2f} MB/s.'.format(
        dcd_size, dcd_size / time_))
    LOGGER.info(
        '{0} coordinate sets written at output rate {1} frame/s.'.format(
            n_csets, int(n_csets / time_)))
    if j != n_csets:
        LOGGER.warn('Warning: {0} frames expected, {1} written.'.format(
            n_csets, j))
    if isTrajectory:
        trajectory.goto(nfi)
    return filename
示例#17
0
文件: compare.py 项目: fongchun/ProDy
def matchModes(*modesets, **kwargs):
    """Returns the matches of modes among *modesets*. Note that the first 
    modeset will be treated as the reference so that only the matching 
    of each modeset to the first modeset is garanteed to be optimal.
    
    :arg index: if **True** then indices of modes will be returned instead of 
                :class:`Mode` instances
    :type index: bool

    :arg turbo: if **True** then the computation will be performed in parallel. 
                The number of threads is set to be the same as the number of 
                CPUs. Assigning a number to specify the number of threads to be 
                used. Note that if writing a script, ``if __name__ == '__main__'`` 
                is necessary to protect your code when multi-tasking. 
                See https://docs.python.org/2/library/multiprocessing.html for details.
                Default is **False**
    :type turbo: bool, int
    """

    index = kwargs.pop('index', False)
    turbo = kwargs.pop('turbo', False)

    n_worker = None
    if not isinstance(turbo, bool):
        n_worker = int(turbo)

    modeset0 = modesets[0]
    if index:
        ret = [modeset0.getIndices()]
    else:
        ret = [modeset0]

    n_modes = len(modeset0)
    n_sets = len(modesets)
    if n_sets == 1:
        return ret
    elif n_sets == 0:
        raise ValueError('at least one modeset should be given')

    if turbo:
        from multiprocessing import Pool, cpu_count
        from math import ceil
        
        if not n_worker:
            n_worker = cpu_count()

        LOGGER.info('Matching {0} modes across {1} modesets with {2} threads...'
                        .format(n_modes, n_sets, n_worker))

        pool = Pool(n_worker)
        n_sets_per_worker = ceil((n_sets - 1) / n_worker)
        args = []
        for i in range(n_worker):
            start = i*n_sets_per_worker + 1
            end = (i+1)*n_sets_per_worker + 1
            subset = modesets[start:end]
            args.append((modeset0, subset, index))
        nested_ret = pool.map(_pairModes_wrapper, args)
        for entry in nested_ret:
            ret.extend(entry)

        pool.close()
        pool.join()
    else:
        LOGGER.progress('Matching {0} modes across {1} modesets...'
                        .format(n_modes, n_sets), n_sets, '_prody_matchModes')
        for i, modeset in enumerate(modesets):
            LOGGER.update(i, label='_prody_matchModes')
            if i > 0:
                _, reordered_modeset = pairModes(modeset0, modeset, index=index, **kwargs)
                ret.append(reordered_modeset)
        LOGGER.finish()
    
    return ret
示例#18
0
文件: pca.py 项目: fongchun/ProDy
    def buildCovariance(self, coordsets, **kwargs):
        """Build a covariance matrix for *coordsets* using mean coordinates
        as the reference.  *coordsets* argument may be one of the following:

        * :class:`.Atomic`
        * :class:`.Ensemble`
        * :class:`.TrajBase`
        * :class:`numpy.ndarray` with shape ``(n_csets, n_atoms, 3)``

        For ensemble and trajectory objects, ``update_coords=True`` argument
        can be used to set the mean coordinates as the coordinates of the
        object.

        When *coordsets* is a trajectory object, such as :class:`.DCDFile`,
        covariance will be built by superposing frames onto the reference
        coordinate set (see :meth:`.Frame.superpose`).  If frames are already
        aligned, use ``aligned=True`` argument to skip this step.


        .. note::
           If *coordsets* is a :class:`.PDBEnsemble` instance, coordinates are
           treated specially.  Let's say **C**\_ij is the element of the
           covariance matrix that corresponds to atoms *i* and *j*.  This
           super element is divided by number of coordinate sets (PDB models or
           structures) in which both of these atoms are observed together."""

        if not isinstance(coordsets, (Ensemble, Atomic, TrajBase, np.ndarray)):
            raise TypeError('coordsets must be an Ensemble, Atomic, Numpy '
                            'array instance')
        LOGGER.timeit('_prody_pca')
        mean = None
        weights = None
        ensemble = None
        if isinstance(coordsets, np.ndarray):
            if (coordsets.ndim != 3 or coordsets.shape[2] != 3 or
                    coordsets.dtype not in (np.float32, float)):
                raise ValueError('coordsets is not a valid coordinate array')
        elif isinstance(coordsets, Atomic):
            coordsets = coordsets._getCoordsets()
        elif isinstance(coordsets, Ensemble):
            ensemble = coordsets
            if isinstance(coordsets, PDBEnsemble):
                weights = coordsets.getWeights() > 0
            coordsets = coordsets._getCoordsets()

        update_coords = bool(kwargs.get('update_coords', False))

        if isinstance(coordsets, TrajBase):
            nfi = coordsets.nextIndex()
            coordsets.reset()
            n_atoms = coordsets.numSelected()
            dof = n_atoms * 3
            cov = np.zeros((dof, dof))
            #mean = coordsets._getCoords().flatten()
            n_confs = 0
            n_frames = len(coordsets)
            LOGGER.info('Covariance will be calculated using {0} frames.'
                        .format(n_frames))
            coordsum = np.zeros(dof)
            LOGGER.progress('Building covariance', n_frames, '_prody_pca')
            align = not kwargs.get('aligned', False)
            for frame in coordsets:
                if align:
                    frame.superpose()
                coords = frame._getCoords().flatten()
                coordsum += coords
                cov += np.outer(coords, coords)
                n_confs += 1
                LOGGER.update(n_confs, label='_prody_pca')
            LOGGER.finish()
            cov /= n_confs
            coordsum /= n_confs
            mean = coordsum
            cov -= np.outer(coordsum, coordsum)
            coordsets.goto(nfi)
            self._cov = cov
            if update_coords:
                coordsets.setCoords(mean.reshape((n_atoms, 3)))
        else:
            n_confs = coordsets.shape[0]
            if n_confs < 3:
                raise ValueError('coordsets must have more than 3 coordinate '
                                 'sets')
            n_atoms = coordsets.shape[1]
            if n_atoms < 3:
                raise ValueError('coordsets must have more than 3 atoms')
            dof = n_atoms * 3
            LOGGER.info('Covariance is calculated using {0} coordinate sets.'
                        .format(len(coordsets)))
            s = (n_confs, dof)
            if weights is None:
                if coordsets.dtype == float:
                    self._cov = np.cov(coordsets.reshape((n_confs, dof)).T,
                                       bias=1)
                else:
                    cov = np.zeros((dof, dof))
                    coordsets = coordsets.reshape((n_confs, dof))
                    mean = coordsets.mean(0)
                    LOGGER.progress('Building covariance', n_confs,
                                    '_prody_pca')
                    for i, coords in enumerate(coordsets.reshape(s)):
                        deviations = coords - mean
                        cov += np.outer(deviations, deviations)
                        LOGGER.update(n_confs, label='_prody_pca')
                    LOGGER.finish()
                    cov /= n_confs
                    self._cov = cov
            else:
                # PDB ensemble case
                mean = np.zeros((n_atoms, 3))
                for i, coords in enumerate(coordsets):
                    mean += coords * weights[i]
                mean /= weights.sum(0)
                d_xyz = ((coordsets - mean) * weights).reshape(s)
                divide_by = weights.astype(float).repeat(3, axis=2).reshape(s)
                self._cov = np.dot(d_xyz.T, d_xyz) / np.dot(divide_by.T,
                                                            divide_by)
            if update_coords and ensemble is not None:
                if mean is None:
                    mean = coordsets.mean(0)
                ensemble.setCoords(mean)

        self._trace = self._cov.trace()
        self._dof = dof
        self._n_atoms = n_atoms
        LOGGER.report('Covariance matrix calculated in %2fs.', '_prody_pca')
示例#19
0
    def buildCovariance(self, coordsets, **kwargs):
        """Build a covariance matrix for *coordsets* using mean coordinates
        as the reference.  *coordsets* argument may be one of the following:

        * :class:`.Atomic`
        * :class:`.Ensemble`
        * :class:`.TrajBase`
        * :class:`numpy.ndarray` with shape ``(n_csets, n_atoms, 3)``

        For ensemble and trajectory objects, ``update_coords=True`` argument
        can be used to set the mean coordinates as the coordinates of the
        object.

        When *coordsets* is a trajectory object, such as :class:`.DCDFile`,
        covariance will be built by superposing frames onto the reference
        coordinate set (see :meth:`.Frame.superpose`).  If frames are already
        aligned, use ``aligned=True`` argument to skip this step.


        .. note::
           If *coordsets* is a :class:`.PDBEnsemble` instance, coordinates are
           treated specially.  Let's say **C**\_ij is the element of the
           covariance matrix that corresponds to atoms *i* and *j*.  This
           super element is divided by number of coordinate sets (PDB models or
           structures) in which both of these atoms are observed together."""

        if not isinstance(coordsets, (Ensemble, Atomic, TrajBase, np.ndarray)):
            raise TypeError('coordsets must be an Ensemble, Atomic, Numpy '
                            'array instance')
        LOGGER.timeit('_prody_pca')
        mean = None
        weights = None
        ensemble = None
        if isinstance(coordsets, np.ndarray):
            if (coordsets.ndim != 3 or coordsets.shape[2] != 3
                    or coordsets.dtype not in (np.float32, float)):
                raise ValueError('coordsets is not a valid coordinate array')
        elif isinstance(coordsets, Atomic):
            coordsets = coordsets._getCoordsets()
        elif isinstance(coordsets, Ensemble):
            ensemble = coordsets
            if isinstance(coordsets, PDBEnsemble):
                weights = coordsets.getWeights() > 0
            coordsets = coordsets._getCoordsets()

        update_coords = bool(kwargs.get('update_coords', False))

        if isinstance(coordsets, TrajBase):
            nfi = coordsets.nextIndex()
            coordsets.reset()
            n_atoms = coordsets.numSelected()
            dof = n_atoms * 3
            cov = np.zeros((dof, dof))
            #mean = coordsets._getCoords().flatten()
            n_confs = 0
            n_frames = len(coordsets)
            LOGGER.info(
                'Covariance will be calculated using {0} frames.'.format(
                    n_frames))
            coordsum = np.zeros(dof)
            LOGGER.progress('Building covariance', n_frames, '_prody_pca')
            align = not kwargs.get('aligned', False)
            for frame in coordsets:
                if align:
                    frame.superpose()
                coords = frame._getCoords().flatten()
                coordsum += coords
                cov += np.outer(coords, coords)
                n_confs += 1
                LOGGER.update(n_confs, label='_prody_pca')
            LOGGER.finish()
            cov /= n_confs
            coordsum /= n_confs
            mean = coordsum
            cov -= np.outer(coordsum, coordsum)
            coordsets.goto(nfi)
            self._cov = cov
            if update_coords:
                coordsets.setCoords(mean.reshape((n_atoms, 3)))
        else:
            n_confs = coordsets.shape[0]
            if n_confs < 3:
                raise ValueError('coordsets must have more than 3 coordinate '
                                 'sets')
            n_atoms = coordsets.shape[1]
            if n_atoms < 3:
                raise ValueError('coordsets must have more than 3 atoms')
            dof = n_atoms * 3
            LOGGER.info(
                'Covariance is calculated using {0} coordinate sets.'.format(
                    len(coordsets)))
            s = (n_confs, dof)
            if weights is None:
                if coordsets.dtype == float:
                    self._cov = np.cov(coordsets.reshape((n_confs, dof)).T,
                                       bias=1)
                else:
                    cov = np.zeros((dof, dof))
                    coordsets = coordsets.reshape((n_confs, dof))
                    mean = coordsets.mean(0)
                    LOGGER.progress('Building covariance', n_confs,
                                    '_prody_pca')
                    for i, coords in enumerate(coordsets.reshape(s)):
                        deviations = coords - mean
                        cov += np.outer(deviations, deviations)
                        LOGGER.update(n_confs, label='_prody_pca')
                    LOGGER.finish()
                    cov /= n_confs
                    self._cov = cov
            else:
                # PDB ensemble case
                mean = np.zeros((n_atoms, 3))
                for i, coords in enumerate(coordsets):
                    mean += coords * weights[i]
                mean /= weights.sum(0)
                d_xyz = ((coordsets - mean) * weights).reshape(s)
                divide_by = weights.astype(float).repeat(3, axis=2).reshape(s)
                self._cov = np.dot(d_xyz.T, d_xyz) / np.dot(
                    divide_by.T, divide_by)
            if update_coords and ensemble is not None:
                if mean is None:
                    mean = coordsets.mean(0)
                ensemble.setCoords(mean)

        self._trace = self._cov.trace()
        self._dof = dof
        self._n_atoms = n_atoms
        LOGGER.report('Covariance matrix calculated in %2fs.', '_prody_pca')
示例#20
0
文件: bird.py 项目: bwingert/ProDy
def fetchBIRDviaFTP(**kwargs):
    """Retrieve the whole Biologically Interesting Molecule Reference 
    Dictionary (BIRD) resource, which is updated every week. This includes 
    2 kinds of keys, which can be selected with the **keys** keyword argument.

    The chemical information is found in a zipped (tar.gz) directory at 
    https://files.rcsb.org/pub/pdb/data/bird/prd/prd-all.cif.gz, which 
    contains individual CIF files within it. This data will be downloaded 
    and extracted to :file:`.prody/bird-prd`.

    Biological function information is also found in a zipped (tar.gz) directory at 
    https://files.rcsb.org/pub/pdb/data/bird/family/family-all.cif.gz, which 
    contains individual CIF files within it. This data will be downloaded 
    and extracted to :file:`.prody/bird-family`.

    :arg keys: keys specifying which data to fetch out of ``'prd'``, ``'family'`` or ``'both'``
               default is ``'both'``
    :type keys: str, tuple, list, :class:`~numpy.ndarray`

    The underlying data can be accessed using :func:`parseBIRD`."""

    BIRD_PATH = os.path.join(getPackagePath(), 'bird')

    keys = kwargs.get('keys', 'both')
    if isinstance(keys, str):
        if keys == 'both':
            keys = ['prd', 'family']
        elif keys[:3].lower() == 'prd':
            keys = ['prd']
        elif keys[:3].lower() == 'fam':
            keys = ['family']
        else:
            raise ValueError("keys should be 'both', 'prd' or 'fam'")

    elif isListLike(keys):
        keys = list(keys)
    else:
        raise TypeError("keys should be list-like or string")

    ftp_divided = 'pdb/data/bird/'
    ftp_pdbext = '.cif.gz'
    ftp_prefix = ''

    if not os.path.isdir(BIRD_PATH):
        os.mkdir(BIRD_PATH)

    LOGGER.progress('Downloading BIRD', len(keys),
                    '_prody_fetchBIRD')

    ftp_name, ftp_host, ftp_path = WWPDB_FTP_SERVERS[wwPDBServer() or 'us']
    LOGGER.debug('Connecting wwPDB FTP server {0}.'.format(ftp_name))

    from ftplib import FTP
    try:
        ftp = FTP(ftp_host)
    except Exception as error:
        raise type(error)('FTP connection problem, potential reason: '
                          'no internet connectivity')
    else:
        count = 0
        success = 0
        failure = 0
        filenames = []
        ftp.login('')
        for i, x in enumerate(keys):
            data = []
            ftp_fn = ftp_prefix + '{0}-all'.format(x) + ftp_pdbext
            try:
                ftp.cwd(ftp_path)
                ftp.cwd(ftp_divided)
                ftp.cwd(x)
                ftp.retrbinary('RETR ' + ftp_fn, data.append)
            except Exception as error:
                if ftp_fn in ftp.nlst():
                    LOGGER.warn('{0} download failed ({1}). It is '
                                'possible that you do not have rights to '
                                'download .gz files in the current network.'
                                .format(x, str(error)))
                else:
                    LOGGER.info('{0} download failed. {1} does not exist '
                                'on {2}.'.format(ftp_fn, x, ftp_host))
                failure += 1
                filenames.append(None)
            else:
                if len(data):
                    filename = BIRD_PATH + '/{0}-all.cif.gz'.format(x)

                    with open(filename, 'w+b') as outfile:
                        write = outfile.write
                        [write(block) for block in data]

                    success += 1
                else:
                    failure += 1
            count += 1
            LOGGER.update(i, label='_prody_fetchBIRD')
        LOGGER.finish()

    LOGGER.debug('PDB download via FTP completed ({0} downloaded, '
                 '{1} failed).'.format(success, failure))
示例#21
0
文件: pdbfile.py 项目: prody/ProDy
def parsePDB(*pdb, **kwargs):
    """Returns an :class:`.AtomGroup` and/or dictionary containing header data
    parsed from a PDB file.

    This function extends :func:`.parsePDBStream`.

    See :ref:`parsepdb` for a detailed usage example.

    :arg pdb: one PDB identifier or filename, or a list of them.
        If needed, PDB files are downloaded using :func:`.fetchPDB()` function.
    
    You can also provide arguments that you would like passed on to fetchPDB().
    """

    n_pdb = len(pdb)
    if n_pdb == 1:
        if isListLike(pdb[0]):
            pdb = pdb[0]
            n_pdb = len(pdb)
            
    if n_pdb == 1:
        return _parsePDB(pdb[0], **kwargs)
    else:
        results = []
        lstkwargs = {}
        for key in kwargs:
            argval = kwargs.get(key)
            if np.isscalar(argval):
                argval = [argval]*n_pdb
            lstkwargs[key] = argval

        start = time.time()
        LOGGER.progress('Retrieving {0} PDB structures...'
                    .format(n_pdb), n_pdb, '_prody_parsePDB')
        for i, p in enumerate(pdb):
            kwargs = {}
            for key in lstkwargs:
                kwargs[key] = lstkwargs[key][i]
            c = kwargs.get('chain','')
            LOGGER.update(i, 'Retrieving {0}...'.format(p+c), 
                          label='_prody_parsePDB')
            result = _parsePDB(p, **kwargs)
            if not isinstance(result, tuple):
                if isinstance(result, dict):
                    result = (None, result)
                else:
                    result = (result, None)
            results.append(result)

        results = list(zip(*results))
        LOGGER.finish()
       
        for i in reversed(range(len(results))):
            if all(j is None for j in results[i]):
                results.pop(i)
        if len(results) == 1:
            results = results[0]
        results = list(results)

        model = kwargs.get('model')
        header = kwargs.get('header', False)
        if model != 0 and header:
            numPdbs = len(results[0])
        else:
            numPdbs = len(results)

        LOGGER.info('{0} PDBs were parsed in {1:.2f}s.'
                     .format(numPdbs, time.time()-start))

        return results
示例#22
0
文件: functions.py 项目: prody/ProDy
def buildPDBEnsemble(PDBs, ref=None, title='Unknown', labels=None, 
                     mapping_func=mapOntoChain, unmapped=None, **kwargs):
    """Builds a PDB ensemble from a given reference structure and a list of PDB structures. 
    Note that the reference structure should be included in the list as well.

    :arg PDBs: A list of PDB structures
    :type PDBs: iterable

    :arg ref: Reference structure or the index to the reference in ``PDBs``. If **None**,
        then the first item in ``PDBs`` will be considered as the reference. 
        Default is **None**
    :type ref: int, :class:`.Chain`, :class:`.Selection`, or :class:`.AtomGroup`

    :arg title: The title of the ensemble
    :type title: str

    :arg labels: labels of the conformations
    :type labels: list

    :arg occupancy: Minimal occupancy of columns (range from 0 to 1). Columns whose occupancy
        is below this value will be trimmed.
    :type occupancy: float

    :arg unmapped: A list of PDB IDs that cannot be included in the ensemble. This is an 
        output argument. 
    :type unmapped: list

    :arg subset: A subset for selecting particular atoms from the input structures.
        Default is calpha
    :type subset: str
    """

    occupancy = kwargs.pop('occupancy', None)
    degeneracy = kwargs.pop('degeneracy', True)
    subset = str(kwargs.get('subset', 'calpha')).lower()
    superpose = kwargs.pop('superpose', True)

    if len(PDBs) == 1:
        raise ValueError('PDBs should have at least two items')

    if labels is not None:
        if len(labels) != len(PDBs):
            raise TypeError('Labels and PDBs must have the same lengths.')
    else:
        labels = []
        
        for pdb in PDBs:
            if pdb is None:
                labels.append(None)
            else:
                labels.append(pdb.getTitle())

    if ref is None:
        refpdb = PDBs[0]
    elif isinstance(ref, Integral):
        refpdb = PDBs[ref]
    else:
        refpdb = ref
        if refpdb not in PDBs:
            raise ValueError('refpdb should be also in the PDBs')

    # obtain refchains from the hierarchical view of the reference PDB
    if subset != 'all':
        refpdb = refpdb.select(subset)
        
    try:
        refchains = list(refpdb.getHierView())
    except AttributeError:
        raise TypeError('refpdb must have getHierView')

    start = time.time()
    # obtain the atommap of all the chains combined.
    atoms = refchains[0]
    for i in range(1, len(refchains)):
        atoms += refchains[i]
    
    # initialize a PDBEnsemble with reference atoms and coordinates
    ensemble = PDBEnsemble(title)
    ensemble.setAtoms(atoms)
    ensemble.setCoords(atoms.getCoords())
    
    # build the ensemble
    if unmapped is None: unmapped = []

    LOGGER.progress('Building the ensemble...', len(PDBs), '_prody_buildPDBEnsemble')
    for i, pdb in enumerate(PDBs):
        if pdb is None:
            unmapped.append(labels[i])
            continue

        LOGGER.update(i, 'Mapping %s to the reference...'%pdb.getTitle(), 
                      label='_prody_buildPDBEnsemble')
        try:
            pdb.getHierView()
        except AttributeError:
            raise TypeError('PDBs must be a list of instances having the access to getHierView')
            
        if labels is None:
            lbl = pdb.getTitle()
        else:
            lbl = labels[i]

        atommaps = []
        # find the mapping of the pdb to each reference chain
        for chain in refchains:
            mappings = mapping_func(pdb, chain,
                                    index=i,
                                    **kwargs)
            if len(mappings) > 0:
                atommaps.append(mappings[0][0])
            else:
                break

        if len(atommaps) != len(refchains):
            unmapped.append(lbl)
            continue
        
        # combine the mappings of pdb to reference chains
        atommap = atommaps[0]
        for j in range(1, len(atommaps)):
            atommap += atommaps[j]
        
        # add the mappings to the ensemble
        ensemble.addCoordset(atommap, weights=atommap.getFlags('mapped'), 
                             label = lbl, degeneracy=degeneracy)

    LOGGER.finish()

    if occupancy is not None:
        ensemble = trimPDBEnsemble(ensemble, occupancy=occupancy)
    if superpose:
        ensemble.iterpose()
    
    LOGGER.info('Ensemble ({0} conformations) were built in {1:.2f}s.'
                     .format(ensemble.numConfs(), time.time()-start))

    if unmapped:
        LOGGER.warn('{0} structures cannot be mapped.'.format(len(unmapped)))
    return ensemble
示例#23
0
def buildPDBEnsemble(atomics,
                     ref=None,
                     title='Unknown',
                     labels=None,
                     atommaps=None,
                     unmapped=None,
                     **kwargs):
    """Builds a :class:`.PDBEnsemble` from a given reference structure and a list of structures 
    (:class:`.Atomic` instances). Note that the reference should be included in the list as well.

    :arg atomics: a list of :class:`.Atomic` instances
    :type atomics: list

    :arg ref: reference structure or the index to the reference in *atomics*. If **None**,
        then the first item in *atomics* will be considered as the reference. If it is a 
        :class:`.PDBEnsemble` instance, then *atomics* will be appended to the existing ensemble.
        Default is **None**
    :type ref: int, :class:`.Chain`, :class:`.Selection`, or :class:`.AtomGroup`

    :arg title: the title of the ensemble
    :type title: str

    :arg labels: labels of the conformations
    :type labels: list

    :arg degeneracy: whether only the active coordinate set (**True**) or all the coordinate sets 
        (**False**) of each structure should be added to the ensemble. Default is **True**
    :type degeneracy: bool

    :arg occupancy: minimal occupancy of columns (range from 0 to 1). Columns whose occupancy
        is below this value will be trimmed
    :type occupancy: float

    :arg atommaps: labels of *atomics* that were mapped and added into the ensemble. This is an 
        output argument
    :type atommaps: list

    :arg unmapped: labels of *atomics* that cannot be included in the ensemble. This is an 
        output argument
    :type unmapped: list

    :arg subset: a subset for selecting particular atoms from the input structures.
        Default is ``"all"``
    :type subset: str

    :arg superpose: if set to ``'iter'``, :func:`.PDBEnsemble.iterpose` will be used to 
        superpose the structures, otherwise conformations will be superposed with respect 
        to the reference specified by *ref* unless set to ``False``. Default is ``'iter'``
    :type superpose: str, bool
    """

    occupancy = kwargs.pop('occupancy', None)
    degeneracy = kwargs.pop('degeneracy', True)
    subset = str(kwargs.get('subset', 'all')).lower()
    superpose = kwargs.pop('superpose', 'iter')
    superpose = kwargs.pop('iterpose', superpose)
    debug = kwargs.pop('debug', {})

    if 'mapping_func' in kwargs:
        raise DeprecationWarning(
            'mapping_func is deprecated. Please see release notes for '
            'more details: http://prody.csb.pitt.edu/manual/release/v1.11_series.html'
        )
    start = time.time()

    if not isListLike(atomics):
        raise TypeError('atomics should be list-like')

    if len(atomics) == 1 and degeneracy is True:
        raise ValueError('atomics should have at least two items')

    if labels is not None:
        if len(labels) != len(atomics):
            raise TypeError('Labels and atomics must have the same lengths.')
    else:
        labels = []

        for atoms in atomics:
            if atoms is None:
                labels.append(None)
            else:
                labels.append(atoms.getTitle())

    if ref is None:
        target = atomics[0]
    elif isinstance(ref, Integral):
        target = atomics[ref]
    elif isinstance(ref, PDBEnsemble):
        target = ref._atoms
    else:
        target = ref

    # initialize a PDBEnsemble with reference atoms and coordinates
    isrefset = False
    if isinstance(ref, PDBEnsemble):
        ensemble = ref
    else:
        # select the subset of reference beforehand for the sake of efficiency
        if subset != 'all':
            target = target.select(subset)
        ensemble = PDBEnsemble(title)
        if isinstance(target, Atomic):
            ensemble.setAtoms(target)
            ensemble.setCoords(target.getCoords())
            isrefset = True
        else:
            ensemble._n_atoms = len(target)
            isrefset = False

    # build the ensemble
    if unmapped is None: unmapped = []
    if atommaps is None: atommaps = []

    LOGGER.progress('Building the ensemble...', len(atomics),
                    '_prody_buildPDBEnsemble')
    for i, atoms in enumerate(atomics):
        if atoms is None:
            unmapped.append(labels[i])
            continue

        LOGGER.update(i,
                      'Mapping %s to the reference...' % atoms.getTitle(),
                      label='_prody_buildPDBEnsemble')
        try:
            atoms.getHierView()
        except AttributeError:
            raise TypeError(
                'atomics must be a list of instances having the access to getHierView'
            )

        if subset != 'all':
            atoms = atoms.select(subset)

        # find the mapping of chains of atoms to those of target
        debug[labels[i]] = {}
        atommaps_ = alignChains(atoms,
                                target,
                                debug=debug[labels[i]],
                                **kwargs)

        if len(atommaps_) == 0:
            unmapped.append(labels[i])
            continue
        else:
            atommaps.extend(atommaps_)

        # add the atommaps to the ensemble
        for atommap in atommaps_:
            lbl = pystr(labels[i])
            if len(atommaps_) > 1:
                chids = np.unique(atommap.getChids())
                strchids = ''.join(chids)
                lbl += '_%s' % strchids
            ensemble.addCoordset(atommap,
                                 weights=atommap.getFlags('mapped'),
                                 label=lbl,
                                 degeneracy=degeneracy)

            if not isrefset:
                ensemble.setCoords(atommap.getCoords())
                isrefset = True

    LOGGER.finish()

    if occupancy is not None:
        ensemble = trimPDBEnsemble(ensemble, occupancy=occupancy)

    if superpose == 'iter':
        ensemble.iterpose()
    elif superpose is not False:
        ensemble.superpose()

    LOGGER.info('Ensemble ({0} conformations) were built in {1:.2f}s.'.format(
        ensemble.numConfs(),
        time.time() - start))

    if unmapped:
        LOGGER.warn('{0} structures cannot be mapped.'.format(len(unmapped)))
    return ensemble
示例#24
0
文件: dcdfile.py 项目: prody/ProDy
def writeDCD(filename, trajectory, start=None, stop=None, step=None,
             align=False):
    """Write 32-bit CHARMM format DCD file (also NAMD 2.1 and later).
    *trajectory can be an :class:`Trajectory`, :class:`DCDFile`, or
    :class:`Ensemble` instance. *filename* is returned upon successful
    output of file."""
    if not filename.lower().endswith('.dcd'):
        filename += '.dcd'

    if not isinstance(trajectory, (TrajBase, Ensemble, Atomic)):
        raise TypeError('{0} is not a valid type for trajectory'
                        .format(type(trajectory)))

    irange = list(range(*slice(start, stop,step)
                    .indices(trajectory.numCoordsets())))
    n_csets = len(irange)
    if n_csets == 0:
        raise ValueError('trajectory does not have any coordinate sets, or '
                         'no coordinate sets are selected')

    if isinstance(trajectory, Atomic):
        isEnsemble = False
        isAtomic = True
        n_atoms = trajectory.numAtoms()
    else:
        isEnsemble = True
        isAtomic = False
        n_atoms = trajectory.numSelected()
    if n_atoms == 0:
        raise ValueError('no atoms are selected in the trajectory')
    if isinstance(trajectory, TrajBase):
        isTrajectory = True
        unitcell = trajectory.hasUnitcell()
        nfi = trajectory.nextIndex()
        trajectory.reset()
        pack_i_48 = pack('i', 48)
        if isinstance(trajectory, Trajectory):
            timestep = trajectory.getTimestep()[0]
            first_ts = trajectory.getFirstTimestep()[0]
            framefreq = trajectory.getFrameFreq()[0]
            n_fixed = trajectory.numFixed()[0]
        else:
            timestep = trajectory.getTimestep()
            first_ts = trajectory.getFirstTimestep()
            framefreq = trajectory.getFrameFreq()
            n_fixed = trajectory.numFixed()
    else:
        isTrajectory = False
        unitcell = False
        if isinstance(trajectory, Ensemble):
            frame = trajectory[0]
        else:
            frame = trajectory
            acsi = trajectory.getACSIndex()
        timestep = 1
        first_ts = 0
        framefreq = 1
        n_fixed = 0

    dcd = DCDFile(filename, mode='w')
    LOGGER.progress('Writing DCD', len(irange), '_prody_writeDCD')
    prev = -1
    uc = None
    time_ = time()
    for j, i in enumerate(irange):
        diff = i - prev
        if diff > 1:
            trajectory.skip(diff-1)
        prev = i
        if isTrajectory:
            frame = next(trajectory)
            if frame is None:
                break
            if unitcell:
                uc = frame._getUnitcell()
                uc[3:] = np.sin((PISQUARE/90) * (90-uc[3:]))
                uc = uc[[0,3,1,4,5,2]]
        elif isEnsemble:
            frame._index = i
        else:
            frame.setACSIndex(i)
        if align:
            frame.superpose()
        if j == 0:
            dcd.write(frame._getCoords(), uc, timestep=timestep,
                      firsttimestep=first_ts, framefreq=framefreq)
        else:
            dcd.write(frame._getCoords(), uc)
        LOGGER.update(i, label='_prody_writeDCD')
    if isAtomic:
        trajectory.setACSIndex(acsi)
    j += 1
    LOGGER.finish()
    dcd.close()
    time_ = time() - time_ or 0.01
    dcd_size = 1.0 * (56 + (n_atoms * 3 + 6) * 4 ) * n_csets / (1024*1024)
    LOGGER.info('DCD file was written in {0:.2f} seconds.'.format(time_))
    LOGGER.info('{0:.2f} MB written at input rate {1:.2f} MB/s.'
                .format(dcd_size, dcd_size/time_))
    LOGGER.info('{0} coordinate sets written at output rate {1} frame/s.'
                .format(n_csets, int(n_csets/time_)))
    if j != n_csets:
        LOGGER.warn('Warning: {0} frames expected, {1} written.'
                    .format(n_csets, j))
    if isTrajectory:
        trajectory.goto(nfi)
    return filename
示例#25
0
def addPDBEnsemble(ensemble,
                   PDBs,
                   refpdb=None,
                   labels=None,
                   mapping_func=mapOntoChain,
                   occupancy=None,
                   unmapped=None,
                   **kwargs):
    """Adds extra structures to a given PDB ensemble. 

    :arg ensemble: the ensemble to which the PDBs are added
    :type ensemble: :class:`.PDBEnsemble`

    :arg refpdb: reference structure. If set to `None`, it will be set to `ensemble.getAtoms()` automatically
    :type refpdb: :class:`.Chain`, :class:`.Selection`, or :class:`.AtomGroup`

    :arg PDBs: A list of PDB structures
    :type PDBs: iterable

    :arg title: the title of the ensemble
    :type title: str

    :arg labels: labels of the conformations
    :type labels: list

    :arg seqid: minimal sequence identity (percent)
    :type seqid: int

    :arg coverage: minimal sequence overlap (percent)
    :type coverage: int

    :arg occupancy: minimal occupancy of columns (range from 0 to 1). Columns whose occupancy 
                    is below this value will be trimmed
    :type occupancy: float

    :arg unmapped: a list of PDB IDs that cannot be included in the ensemble. This is an 
                   output argument
    :type unmapped: list
    """

    degeneracy = kwargs.pop('degeneracy', True)
    subset = str(kwargs.get('subset', 'calpha')).lower()
    superpose = kwargs.pop('superpose', True)

    if labels is not None:
        if len(labels) != len(PDBs):
            raise TypeError('Labels and PDBs must have the same lengths.')
    else:
        labels = []

        for pdb in PDBs:
            if pdb is None:
                labels.append(None)
            else:
                labels.append(pdb.getTitle())

    # obtain refchains from the hierarhical view of the reference PDB
    if refpdb is None:
        refpdb = ensemble._atoms
    else:
        if subset != 'all':
            refpdb = refpdb.select(subset)

    refchains = list(refpdb.getHierView())

    start = time.time()

    # obtain the atommap of all the chains combined.
    atoms = refchains[0]
    for i in range(1, len(refchains)):
        atoms += refchains[i]

    # add the PDBs to the ensemble
    if unmapped is None: unmapped = []

    LOGGER.progress('Appending the ensemble...', len(PDBs),
                    '_prody_addPDBEnsemble')
    for i, pdb in enumerate(PDBs):
        lbl = labels[i]
        if pdb is None:
            unmapped.append(labels[i])
            continue

        LOGGER.update(i,
                      'Mapping %s to the reference...' % pdb.getTitle(),
                      label='_prody_addPDBEnsemble')
        if not isinstance(pdb, (Chain, Selection, AtomGroup)):
            raise TypeError(
                'PDBs must be a list of Chain, Selection, or AtomGroup.')

        atommaps = []
        # find the mapping of the pdb to each reference chain
        for chain in refchains:
            mappings = mapping_func(pdb, chain, index=i, **kwargs)
            if len(mappings) > 0:
                atommaps.append(mappings[0][0])
            else:
                break

        if len(atommaps) != len(refchains):
            unmapped.append(lbl)
            continue

        # combine the mappings of pdb to reference chains
        atommap = atommaps[0]
        for i in range(1, len(atommaps)):
            atommap += atommaps[i]

        # add the mappings to the ensemble
        ensemble.addCoordset(atommap,
                             weights=atommap.getFlags('mapped'),
                             label=lbl,
                             degeneracy=degeneracy)
    LOGGER.finish()

    if occupancy is not None:
        ensemble = trimPDBEnsemble(ensemble, occupancy=occupancy)
    if superpose:
        ensemble.iterpose()

    LOGGER.info('{0} PDBs were added to the ensemble in {1:.2f}s.'.format(
        len(PDBs) - len(unmapped),
        time.time() - start))

    if unmapped:
        LOGGER.warn('{0} structures cannot be mapped.'.format(len(unmapped)))

    return ensemble
示例#26
0
文件: pfam.py 项目: uibcdf/ProDy
def parsePfamPDBs(query, data=[], **kwargs):
    """Returns a list of AtomGroups containing sections of chains that 
    correspond to a particular PFAM domain family. These are defined by 
    alignment start and end residue numbers.

    :arg query: UniProt ID or PDB ID
        If a PDB ID is provided the corresponding UniProt ID is used.
        If this returns multiple matches then start or end must also be provided.
        This query is also used for label refinement of the Pfam domain MSA.
    :type query: str

    :arg data: If given the data list from the Pfam mapping table will 
        be output through this argument.
    :type data: list

    :keyword start: Residue number for defining the start of the domain.
        The PFAM domain that starts closest to this will be selected. 
        Default is **1**
    :type start: int

    :keyword end: Residue number for defining the end of the domain.
        The PFAM domain that ends closest to this will be selected. 
    :type end: int
    """

    start = kwargs.pop('start', 1)
    end = kwargs.pop('end', None)

    if len(query) > 4 and query.startswith('PF'):
        pfam_acc = query
    else:
        pfam_matches = searchPfam(query)
        keys = list(pfam_matches.keys())

        if isinstance(start, Integral):
            start_diff = []
            for i, key in enumerate(pfam_matches):
                start_diff.append(
                    int(pfam_matches[key]['locations'][0]['start']) - start)
            start_diff = np.array(start_diff)
            pfam_acc = keys[np.where(
                abs(start_diff) == min(abs(start_diff)))[0][0]]

        elif isinstance(end, Integral):
            end_diff = []
            for i, key in enumerate(pfam_matches):
                end_diff.append(
                    int(pfam_matches[key]['locations'][0]['end']) - end)
            end_diff = np.array(end_diff)
            pfam_acc = keys[np.where(
                abs(end_diff) == min(abs(end_diff)))[0][0]]

        else:
            raise ValueError('Please provide an integer for start or end '
                             'when using a UniProt ID or PDB ID.')

    from ftplib import FTP
    from .uniprot import queryUniprot

    data_stream = BytesIO()
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    ftp.login()
    ftp.cwd('pub/databases/Pfam/current_release')
    ftp.retrbinary('RETR pdbmap.gz', data_stream.write)
    ftp.quit()
    zip_data = data_stream.getvalue()
    data_stream.close()

    rawdata = gunzip(zip_data)
    if PY3K:
        rawdata = rawdata.decode()

    fields = [
        'PDB_ID', 'chain', 'nothing', 'PFAM_Name', 'PFAM_ACC', 'UniprotAcc',
        'UniprotResnumRange'
    ]

    data_dicts = []
    for line in rawdata.split('\n'):
        if line.find(pfam_acc) != -1:
            data_dicts.append({})
            for j, entry in enumerate(line.strip().split('\t')):
                data_dicts[-1][fields[j]] = entry.strip(';')

    pdb_ids = [data_dict['PDB_ID'] for data_dict in data_dicts]
    chains = [data_dict['chain'] for data_dict in data_dicts]

    header = kwargs.pop('header', False)
    model = kwargs.get('model', None)
    results = parsePDB(*pdb_ids, chain=chains, header=True, **kwargs)

    ags, headers = results
    ags, headers = list(ags), list(headers)

    if model == 0:
        LOGGER.info('only header is requested and returned')
        return results

    if header:
        results = (ags, headers)
    else:
        #        ags = results
        #        ags = list(ags)
        results = ags

    LOGGER.progress('Extracting Pfam domains...', len(ags))
    comma_splitter = re.compile(r'\s*,\s*').split
    no_info = []
    for i, ag in enumerate(ags):
        LOGGER.update(i)
        data_dict = data_dicts[i]
        pfamRange = data_dict['UniprotResnumRange'].split('-')
        uniprotAcc = data_dict['UniprotAcc']
        try:
            uniData = queryUniprot(uniprotAcc)
        except:
            LOGGER.warn('No Uniprot record found for {0}'.format(
                data_dict['PBD_ID']))
            continue

        resrange = None
        found = False
        for key, value in uniData.items():
            if not key.startswith('dbReference'):
                continue
            try:
                pdbid = value['PDB']
            except:
                continue
            if pdbid != data_dict['PDB_ID']:
                continue
            pdbchains = value['chains']

            # example chain strings: "A=27-139, B=140-150" or "A/B=27-150"
            pdbchains = comma_splitter(pdbchains)
            for chain in pdbchains:
                chids, resrange = chain.split('=')
                chids = [chid.strip() for chid in chids.split('/')]
                if data_dict['chain'] in chids:
                    resrange = resrange.split('-')
                    found = True
                    break
            if found:
                break

        if found:
            header = headers[i]
            chain_accessions = [
                dbref.accession for dbref in header[data_dict['chain']].dbrefs
            ]
            try:
                if len(chain_accessions) > 0:
                    right_part = np.where(
                        np.array(chain_accessions) ==
                        data_dict['UniprotAcc'])[0][0]
                else:
                    raise ValueError(
                        'There is no accession for a chain in the Header')
            except:
                LOGGER.warn(
                    'Could not map domains in {0}'.format(data_dict['PDB_ID'] +
                                                          data_dict['chain']))
                no_info.append(i)
                continue

            right_dbref = header[data_dict['chain']].dbrefs[right_part]
            chainStart = ag.select('chain {0}'.format(
                data_dict['chain'])).getResnums()[0]
            missing = chainStart - right_dbref.first[0]
            partStart = ag.getResindices()[np.where(
                ag.getResnums() == right_dbref.first[0] + missing)][0]
            pfStart, pfEnd = int(pfamRange[0]), int(pfamRange[1])
            uniStart, uniEnd = int(resrange[0]), int(resrange[1])

            resiStart = pfStart - uniStart + partStart - missing
            resiEnd = pfEnd - uniStart + partStart - missing
            ags[i] = ag.select('resindex {0} to {1}'.format(
                resiStart, resiEnd))
        else:
            no_info.append(i)
    LOGGER.finish()

    for i in reversed(no_info):
        ags.pop(i)
        if header:
            headers.pop(i)

    if isinstance(data, list):
        data.extend(data_dicts)
    else:
        LOGGER.warn('data should be a list in order to get output')

    return results