Example #1
def calcSquareInnerProduct(modes1, modes2):
    """Returns the square inner product (SIP) of fluctuations [SK02]_.  
    This function returns a single number.

    .. [SK02] Kundu S, Melton JS, Sorensen DC, Phillips GN: Dynamics of 
        proteins in crystals: comparison of experiment with simple models. 
        Biophys J. 2002, 83: 723-732.
        
    """
    if isinstance(modes1, (NMA, ModeSet)):
        w1 = calcSqFlucts(modes1)
    elif isListLike(modes1):
        w1 = modes1
    else:
        raise TypeError(
            'modes1 should be a profile or an NMA or ModeSet object')

    if isinstance(modes2, (NMA, ModeSet)):
        w2 = calcSqFlucts(modes2)
    elif isListLike(modes2):
        w2 = modes2
    else:
        raise TypeError(
            'modes2 should be a profile or an NMA or ModeSet object')

    return np.dot(w1, w2)**2 / (np.dot(w1, w1) * np.dot(w2, w2))
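A minimal standalone sketch of the same formula (plain NumPy; the two profiles are made-up values, and any equal-length list-likes would do, since such inputs bypass calcSqFlucts):

import numpy as np

# two hypothetical squared-fluctuation profiles of equal length
w1 = np.array([0.5, 1.2, 0.9, 0.3])
w2 = np.array([0.6, 1.0, 1.1, 0.2])

# SIP = (w1 . w2)^2 / (|w1|^2 * |w2|^2); equals 1.0 for identical shapes
sip = np.dot(w1, w2)**2 / (np.dot(w1, w1) * np.dot(w2, w2))
print(round(sip, 3))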
Example #2
def calcEnsembleFunctionOverlaps(ens, **kwargs):
    """Calculate function overlaps for an ensemble as the 
    mean of the value from :func:`calcDeepFunctionOverlaps`.

    :arg ens: an ensemble with labels
    :type ens: :class:`Ensemble`
    """
    if not isinstance(ens, Ensemble) and not isListLike(ens):
        raise TypeError('ens should be an ensemble or list-like')

    if isinstance(ens, Ensemble):
        ids = [label[:5] for label in ens.getLabels()]
    else:
        ids = ens

    if not isinstance(ids[0], str):
        raise TypeError('ens should have labels')

    goa_ens = queryGOA(ids, **kwargs)
    for entry in goa_ens:
        if len(entry._molecular) == 0:
            LOGGER.warn(
                'ensemble member {0} has no molecular functions and was omitted'
                .format(entry._title))

    goa_ens = [entry for entry in goa_ens if len(entry._molecular) > 0]

    overlaps = calcDeepFunctionOverlaps(*goa_ens, **kwargs)

    return overlaps
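A hypothetical call (assumes ProDy with its GOA module and network access; the import path and labels are illustrative). As the type checks above show, a plain list of PDB-id-style labels works in place of an Ensemble:

from prody import calcEnsembleFunctionOverlaps  # assumed import path

overlaps = calcEnsembleFunctionOverlaps(['1akeA', '4akeA'])  # illustrative PDB id + chain labels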
Example #3
    def _extend(self, arr, axis=None, defval=0):
        mask = self.mask  #.copy()
        if self.is3d():
            mask = np.repeat(mask, 3)

        n_true = np.sum(mask)
        N = len(mask)

        if axis is None:
            axes = [i for i in range(arr.ndim)]
        elif not isListLike(axis):
            axes = [axis]
        else:
            axes = axis

        shape = np.array(arr.shape)
        shape[axes] = N

        whole_array = np.empty(shape, dtype=arr.dtype)
        whole_array.fill(defval)

        I = [np.arange(s) for s in shape]
        J = [np.arange(s) for s in arr.shape]

        for ax in axes:
            I[ax] = mask
            J[ax] = np.arange(n_true)

        whole_array[np.ix_(*I)] = arr[np.ix_(*J)]

        return whole_array
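The np.ix_ assignment above scatters a compact array into a larger, default-filled one along the masked axes. The same idea in isolation (toy shapes, plain NumPy):

import numpy as np

mask = np.array([True, False, True, False])      # keep 2 of 4 positions
arr = np.array([[1., 2.], [3., 4.]])             # compact (2, 2) data

shape = np.array(arr.shape)
shape[0] = len(mask)                             # extend axis 0 back to full size
whole = np.zeros(tuple(shape), dtype=arr.dtype)  # defval = 0

I = [np.arange(s) for s in shape]
J = [np.arange(s) for s in arr.shape]
I[0] = mask                                      # boolean index on the extended axis
J[0] = np.arange(mask.sum())

whole[np.ix_(*I)] = arr[np.ix_(*J)]
print(whole)  # rows 0 and 2 hold arr; rows 1 and 3 stay at the default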
Example #4
def parseCCD(ids):
    """Retrieve the whole Chemical Component Dictionary (CCD) resource.
    """
    if isListLike(ids):
        n_ids = len(ids)
    else:
        ids = [ids]
        n_ids = 1

    ret = []
    for id in ids:
        id_url = 'http://ligand-expo.rcsb.org/reports/{0}/{1}/{1}.cif'.format(id[0],
                                                                              id)
        try:
            handle = openURL(id_url)
        except Exception as err:
            LOGGER.warn('{0} download failed ({1}).'.format(id, str(err)))
        else:
            data = handle.read()
            if len(data):
                if PY3K:
                    data = data.decode()

                parsingDict, prog = parseSTARLines(data.split('\n'), shlex=True)
                        
                star_dict = StarDict(parsingDict, prog, id)
                ret.append(star_dict[id])
            else:
                ret.append(None)
                LOGGER.warn('Could not parse CCD data for {0}'.format(id))

    if n_ids == 1:
        return ret[0]

    return ret
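A hypothetical usage sketch (assumes ProDy is installed, ligand-expo.rcsb.org is reachable, and this import path; the ids are illustrative):

from prody import parseCCD  # assumed import path

atp = parseCCD('ATP')               # one id -> a single entry (or None on failure)
several = parseCCD(['ATP', 'GTP'])  # list-like -> a list of entries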
Example #5
    def setIndices(self, value):
        if not isListLike(value):
            raise TypeError('value must be a list or numpy.ndarray instance')

        array = asarray(value)

        if len(array) != self._n_atoms:
            raise ValueError('length mismatch between this ensemble '
                             '(%d) and indices (%d)'%(self._n_atoms, len(array)))
        self._indices = value
Example #6
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by blastclust.
    They are available at FTP site: ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it, after
    compression, in :file:`.prody/pdbclusters` in your home directory.
    Compressed files will be less than 4 Mb in size.  Cluster data can
    be loaded using the :func:`loadPDBClusters` function and accessed
    using :func:`listPDBCluster`."""

    if sqid is not None:
        if isListLike(sqid):
            for s in sqid:
                if s not in PDB_CLUSTERS:
                    raise ValueError('sqid must be one or more of ' +
                                     PDB_CLUSTERS_SQID_STR)
            keys = list(sqid)
        else:
            if sqid not in PDB_CLUSTERS:
                raise ValueError('sqid must be one or more of ' +
                                 PDB_CLUSTERS_SQID_STR)
            keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)
    LOGGER.progress('Downloading sequence clusters', len(keys),
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.'.format(x))
            continue
        else:
            out = openFile(filename + '.gz', 'w', folder=PDB_CLUSTERS_PATH)
            out.write(inp.read())
            inp.close()
            out.close()
            count += 1
        LOGGER.update(i, label='_prody_fetchPDBClusters')
    LOGGER.finish()
    if len(keys) == count:
        LOGGER.info('All selected PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warn('PDB clusters could not be downloaded.')
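A hypothetical usage sketch (assumes ProDy and network access; loadPDBClusters is named in the docstring above, but its call signature here is an assumption):

from prody import fetchPDBClusters, loadPDBClusters  # assumed import paths

fetchPDBClusters(90)            # a single sequence-identity level
fetchPDBClusters([30, 90])      # or several levels at once
clusters = loadPDBClusters(90)  # assumed signature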
Example #7
    def setApix(self, apix):
        if not isListLike(apix):
            try:
                apix = [apix, apix, apix]
            except:
                raise TypeError('apix must be a single value or list-like')

        if len(apix) != 3:
            raise ValueError('apix must be a single value or 3 values')

        self._apix = apix
        self.Lx = apix[0] * self.NS
        self.Ly = apix[1] * self.NR
        self.Lz = apix[2] * self.NC
Example #8
def alignByEnsemble(atomics, ensemble):
    """Align a set of :class:`.Atomic` objects using transformations from *ensemble*, 
    which may be a :class:`.PDBEnsemble` or a :class:`.PDBConformation` instance. 
    
    Transformations will be applied based on indices so *atomics* and *ensemble* must 
    have the same number of members.

    :arg atomics: a set of :class:`.Atomic` objects to be aligned
    :type atomics: tuple, list, :class:`~numpy.ndarray`

    :arg ensemble: a :class:`.PDBEnsemble` or a :class:`.PDBConformation` from which 
                   transformations can be extracted
    :type ensemble: :class:`.PDBEnsemble`, :class:`.PDBConformation`
    """

    if not isListLike(atomics):
        raise TypeError('atomics must be list-like')

    if not isinstance(ensemble, (PDBEnsemble, PDBConformation)):
        raise TypeError('ensemble must be a PDBEnsemble or PDBConformation')
    if isinstance(ensemble, PDBConformation):
        ensemble = [ensemble]

    if len(atomics) != len(ensemble):
        raise ValueError('atomics and ensemble must have the same length')

    output = []
    for i, conf in enumerate(ensemble):
        trans = conf.getTransformation()
        if trans is None:
            raise ValueError('transformations are not calculated, call '
                             '`superpose` or `iterpose`')

        ag = atomics[i]
        if not isinstance(ag, Atomic):
            LOGGER.warning(
                'No atomic object found for conformation {0}.'.format(i))
            output.append(None)
            continue

        output.append(trans.apply(ag))

    if len(output) == 1:
        return output[0]
    else:
        return output
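A hypothetical workflow (ids are illustrative; it assumes buildPDBEnsemble, shown in a later example on this page, leaves superposition transformations on the ensemble and maps one chain per structure):

from prody import parsePDB, buildPDBEnsemble, alignByEnsemble  # assumed imports

structures = parsePDB(['1ake', '4ake'])     # two homologous structures
ens = buildPDBEnsemble(structures)          # iterpose() runs by default, storing transformations
aligned = alignByEnsemble(structures, ens)  # apply the stored transformations to the Atomic objects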
Example #9
def sliceModelByMask(model, mask, norm=False):
    """Returns a part of the *model* indicated by *mask*.  Note that
    normal modes (eigenvectors) are not normalized unless *norm* is **True**.

    :arg model: NMA model instance to be sliced
    :type model: :class:`.NMA`

    :arg mask: an integer array or a boolean array where **True** indicates
        the parts being selected
    :type mask: list, :class:`~numpy.ndarray`

    :arg norm: whether to normalize eigenvectors, default **False**
    :type norm: bool

    :returns: :class:`.NMA`"""

    if not isListLike(mask):
        raise TypeError(
            'mask must be either a list or a numpy.ndarray, not {0}'.format(
                type(mask)))

    mask = np.asarray(mask)  # so plain lists also have dtype, min and max
    is_bool = mask.dtype is np.dtype('bool')

    if is_bool:
        if len(mask) != model.numAtoms():
            raise ValueError('number of atoms in model and mask must be equal')
        which = mask
    else:
        if mask.min() < 0 or mask.max() >= model.numAtoms():
            raise ValueError('index in mask exceeds range')
        which = np.zeros(model.numAtoms(), dtype=bool)
        which[mask] = True

    array = model._getArray()

    nma = type(model)('{0} sliced'.format(model.getTitle()))
    if model.is3d():
        which = np.repeat(which, 3)

    evecs = array[which, :]
    if norm:
        evecs /= np.array([((evecs[:, i])**2).sum()**0.5
                           for i in range(evecs.shape[1])])

    nma.setEigens(evecs, model.getEigvals())
    return nma
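The key indexing step, shown standalone: a per-atom mask is repeated three times for 3D models so it lines up with the x/y/z rows of the mode array (toy sizes, plain NumPy):

import numpy as np

n_atoms, n_modes = 4, 2
evecs = np.arange(3 * n_atoms * n_modes, dtype=float).reshape(3 * n_atoms, n_modes)

mask = np.array([True, False, True, False])  # keep atoms 0 and 2
which = np.repeat(mask, 3)                   # 12 rows -> one x/y/z triple per atom

sliced = evecs[which, :]                     # shape (6, 2)
# the norm=True branch: divide each column (mode) by its Euclidean norm
sliced /= np.array([(sliced[:, i]**2).sum()**0.5 for i in range(sliced.shape[1])])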
Example #10
    def __init__(self, items, element=None):
        if element is not None:
            tag = element.tag
            attrib = element.attrib
        else:
            tag = 'cath'
            attrib = {}

        super(CATHCollection, self).__init__(tag=tag, attrib=attrib)

        if not isListLike(items):
            items = [items]

        parents = []
        for item in items:
            self.append(item)
            parents.append(item.parent)

        uniq_parents = set(parents)
        if len(uniq_parents) == 1:
            self._parent = parents[0]
Example #11
    def __init__(self, parsingDict, prog, title='unnamed', indices=None):
        self._title = title
        self._dict = parsingDict
        self._prog = prog
        self._indices = indices

        if indices is None:
            self.dataBlocks = [
                StarDataBlock(self, key) for key in self._dict.keys()
            ]
        else:
            self.dataBlocks = []
            for idx in indices:
                if isListLike(idx):
                    self.dataBlocks.append(StarDataBlock(self, idx[0], idx[1]))
                else:
                    self.dataBlocks.append(StarDataBlock(self, idx))

            self._dict = OrderedDict()
            for i, idx in enumerate(indices):
                self._dict[idx[0]] = self.dataBlocks[i]._dict

        self.numDataBlocks = len(self.dataBlocks)
Example #12
def fetchPDBs(*pdb, **kwargs):
    """"Wrapper function to fetch multiple files from the PDB. 
    If no format is given, it tries PDB then mmCIF then EMD.
    
    :arg pdb: one PDB identifier or filename, or a list of them.
        If needed, PDB files are downloaded using :func:`.fetchPDB()` function.
    """

    n_pdb = len(pdb)
    if n_pdb == 0:
        raise ValueError('Please provide a PDB ID or filename')

    if n_pdb == 1:
        if isListLike(pdb[0]):
            pdb = pdb[0]
            n_pdb = len(pdb)

    fnames = []
    for p in pdb:
        format = kwargs.pop('format', None)

        if format is not None:
            filename = fetchPDB(p, format=format, **kwargs)

        else:
            filename = fetchPDB(p, **kwargs)

            if filename is None:
                filename = fetchPDB(p, format='cif', **kwargs)

            if filename is None:
                filename = fetchPDB(p, format='emd', **kwargs)

        fnames.append(filename)

    return fnames
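A hypothetical call (ids are illustrative; keyword arguments are forwarded to fetchPDB):

from prody import fetchPDBs  # assumed import path

fnames = fetchPDBs('1ake', '7pe1')    # per entry: tries PDB, then mmCIF, then EMD
fnames = fetchPDBs(['1ake', '7pe1'])  # a single list-like argument works too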
Example #13
def buildPDBEnsemble(atomics,
                     ref=None,
                     title='Unknown',
                     labels=None,
                     atommaps=None,
                     unmapped=None,
                     **kwargs):
    """Builds a :class:`.PDBEnsemble` from a given reference structure and a list of structures 
    (:class:`.Atomic` instances). Note that the reference should be included in the list as well.

    :arg atomics: a list of :class:`.Atomic` instances
    :type atomics: list

    :arg ref: reference structure or the index to the reference in *atomics*. If **None**,
        then the first item in *atomics* will be considered as the reference. If it is a 
        :class:`.PDBEnsemble` instance, then *atomics* will be appended to the existing ensemble.
        Default is **None**
    :type ref: int, :class:`.Chain`, :class:`.Selection`, or :class:`.AtomGroup`

    :arg title: the title of the ensemble
    :type title: str

    :arg labels: labels of the conformations
    :type labels: list

    :arg degeneracy: whether only the active coordinate set (**True**) or all the coordinate sets 
        (**False**) of each structure should be added to the ensemble. Default is **True**
    :type degeneracy: bool

    :arg occupancy: minimal occupancy of columns (range from 0 to 1). Columns whose occupancy
        is below this value will be trimmed
    :type occupancy: float

    :arg atommaps: labels of *atomics* that were mapped and added into the ensemble. This is an 
        output argument
    :type atommaps: list

    :arg unmapped: labels of *atomics* that cannot be included in the ensemble. This is an 
        output argument
    :type unmapped: list

    :arg subset: a subset for selecting particular atoms from the input structures.
        Default is ``"all"``
    :type subset: str

    :arg superpose: if set to ``'iter'``, :func:`.PDBEnsemble.iterpose` will be used to 
        superpose the structures, otherwise conformations will be superposed with respect 
        to the reference specified by *ref* unless set to ``False``. Default is ``'iter'``
    :type superpose: str, bool
    """

    occupancy = kwargs.pop('occupancy', None)
    degeneracy = kwargs.pop('degeneracy', True)
    subset = str(kwargs.get('subset', 'all')).lower()
    superpose = kwargs.pop('superpose', 'iter')
    superpose = kwargs.pop('iterpose', superpose)
    debug = kwargs.pop('debug', {})

    if 'mapping_func' in kwargs:
        raise DeprecationWarning(
            'mapping_func is deprecated. Please see release notes for '
            'more details: http://prody.csb.pitt.edu/manual/release/v1.11_series.html'
        )
    start = time.time()

    if not isListLike(atomics):
        raise TypeError('atomics should be list-like')

    if len(atomics) == 1 and degeneracy is True:
        raise ValueError('atomics should have at least two items')

    if labels is not None:
        if len(labels) != len(atomics):
            raise TypeError('Labels and atomics must have the same lengths.')
    else:
        labels = []

        for atoms in atomics:
            if atoms is None:
                labels.append(None)
            else:
                labels.append(atoms.getTitle())

    if ref is None:
        target = atomics[0]
    elif isinstance(ref, Integral):
        target = atomics[ref]
    elif isinstance(ref, PDBEnsemble):
        target = ref._atoms
    else:
        target = ref

    # initialize a PDBEnsemble with reference atoms and coordinates
    isrefset = False
    if isinstance(ref, PDBEnsemble):
        ensemble = ref
    else:
        # select the subset of reference beforehand for the sake of efficiency
        if subset != 'all':
            target = target.select(subset)
        ensemble = PDBEnsemble(title)
        if isinstance(target, Atomic):
            ensemble.setAtoms(target)
            ensemble.setCoords(target.getCoords())
            isrefset = True
        else:
            ensemble._n_atoms = len(target)
            isrefset = False

    # build the ensemble
    if unmapped is None: unmapped = []
    if atommaps is None: atommaps = []

    LOGGER.progress('Building the ensemble...', len(atomics),
                    '_prody_buildPDBEnsemble')
    for i, atoms in enumerate(atomics):
        if atoms is None:
            unmapped.append(labels[i])
            continue

        LOGGER.update(i,
                      'Mapping %s to the reference...' % atoms.getTitle(),
                      label='_prody_buildPDBEnsemble')
        try:
            atoms.getHierView()
        except AttributeError:
            raise TypeError(
                'atomics must be a list of instances having the access to getHierView'
            )

        if subset != 'all':
            atoms = atoms.select(subset)

        # find the mapping of chains of atoms to those of target
        debug[labels[i]] = {}
        atommaps_ = alignChains(atoms,
                                target,
                                debug=debug[labels[i]],
                                **kwargs)

        if len(atommaps_) == 0:
            unmapped.append(labels[i])
            continue
        else:
            atommaps.extend(atommaps_)

        # add the atommaps to the ensemble
        for atommap in atommaps_:
            lbl = pystr(labels[i])
            if len(atommaps_) > 1:
                chids = np.unique(atommap.getChids())
                strchids = ''.join(chids)
                lbl += '_%s' % strchids
            ensemble.addCoordset(atommap,
                                 weights=atommap.getFlags('mapped'),
                                 label=lbl,
                                 degeneracy=degeneracy)

            if not isrefset:
                ensemble.setCoords(atommap.getCoords())
                isrefset = True

    LOGGER.finish()

    if occupancy is not None:
        ensemble = trimPDBEnsemble(ensemble, occupancy=occupancy)

    if superpose == 'iter':
        ensemble.iterpose()
    elif superpose is not False:
        ensemble.superpose()

    LOGGER.info('Ensemble ({0} conformations) were built in {1:.2f}s.'.format(
        ensemble.numConfs(),
        time.time() - start))

    if unmapped:
        LOGGER.warn('{0} structures cannot be mapped.'.format(len(unmapped)))
    return ensemble
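A hypothetical usage sketch (ids and keyword values are illustrative):

from prody import parsePDB, buildPDBEnsemble  # assumed imports

structures = parsePDB(['6fpk', '6flp'])  # any list of Atomic instances
ens = buildPDBEnsemble(structures, title='toy ensemble', occupancy=0.9)
print(ens.numConfs())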
Example #14
def trimModelByMask(model, mask):
    """Returns a part of the *model* indicated by *mask*. This method removes 
    columns and rows in the connectivity matrix indicated by *mask* and fix the diagonal sums.
    Normal modes need to be calculated again after the trim.

    :arg model: NMA model instance to be trimmed
    :type model: :class:`.NMA`

    :arg mask: an integer array or a boolean array where **True** indicates
        the parts being selected
    :type mask: list, :class:`~numpy.ndarray`

    :returns: :class:`.NMA`"""

    if not isListLike(mask):
        raise TypeError(
            'mask must be either a list or a numpy.ndarray, not {0}'.format(
                type(mask)))

    mask = np.asarray(mask)  # so plain lists also have dtype, min and max
    is_bool = mask.dtype is np.dtype('bool')

    if is_bool:
        if len(mask) != model.numAtoms():
            raise ValueError('number of atoms in model and mask must be equal')
        which = mask
    else:
        if mask.min() < 0 or mask.max() >= model.numAtoms():
            raise ValueError('index in mask exceeds range')
        which = np.zeros(model.numAtoms(), dtype=bool)
        which[mask] = True

    if model.is3d():
        which = np.repeat(which, 3)

    if isinstance(model, GNM):
        matrix = model._kirchhoff
    elif isinstance(model, ANM):
        matrix = model._hessian
    elif isinstance(model, PCA):
        matrix = model._cov

    if isinstance(model, PCA):
        ss = matrix[which, :][:, which]
        eda = PCA(model.getTitle() + ' reduced')
        eda.setCovariance(ss)
        return eda
    else:
        matrix = matrix[which, :][:, which]

        if isinstance(model, GNM):
            gnm = GNM(model.getTitle() + ' reduced')
            I = np.eye(len(matrix), dtype=bool)
            matrix[I] = -(matrix.sum(axis=0) - np.diag(matrix))
            gnm.setKirchhoff(matrix)
            return gnm
        elif isinstance(model, ANM):
            model_type = type(model)
            anm = model_type(model.getTitle() + ' reduced')

            n = len(matrix) // 3
            for i in range(n):
                S = np.zeros((3, 3))
                for j in range(n):
                    if i == j:
                        continue
                    S -= matrix[i * 3:i * 3 + 3, j * 3:j * 3 + 3]
                matrix[i * 3:i * 3 + 3, i * 3:i * 3 + 3] = S
            anm.setHessian(matrix)
            if hasattr(anm, 'getMembrane'):
                anm._membrane = model.getMembrane()
                anm._combined = model.getCombined()
            return anm
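The GNM branch above rebuilds the diagonal so each row of the trimmed Kirchhoff matrix sums to zero. The same fix in isolation (toy symmetric matrix, plain NumPy):

import numpy as np

K = np.array([[ 2., -1., -1.],
              [-1.,  2., -1.],
              [-1., -1.,  2.]])
K_trim = K[:2, :2].copy()  # drop the last node; the diagonal is now stale

I = np.eye(len(K_trim), dtype=bool)
K_trim[I] = -(K_trim.sum(axis=0) - np.diag(K_trim))  # diagonal = -sum(off-diagonal)
print(K_trim)  # rows sum to zero again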
Example #15
def parsePDB(*pdb, **kwargs):
    """Returns an :class:`.AtomGroup` and/or dictionary containing header data
    parsed from a PDB file.

    This function extends :func:`.parsePDBStream`.

    See :ref:`parsepdb` for a detailed usage example.

    :arg pdb: one PDB identifier or filename, or a list of them.
        If needed, PDB files are downloaded using :func:`.fetchPDB()` function.
    
    You can also provide arguments that you would like passed on to fetchPDB().
    """

    n_pdb = len(pdb)
    if n_pdb == 1:
        if isListLike(pdb[0]):
            pdb = pdb[0]
            n_pdb = len(pdb)

    if n_pdb == 1:
        return _parsePDB(pdb[0], **kwargs)
    else:
        results = []
        lstkwargs = {}
        for key in kwargs:
            argval = kwargs.get(key)
            if np.isscalar(argval):
                argval = [argval] * n_pdb
            lstkwargs[key] = argval

        start = time.time()
        LOGGER.progress('Retrieving {0} PDB structures...'.format(n_pdb),
                        n_pdb, '_prody_parsePDB')
        for i, p in enumerate(pdb):
            kwargs = {}
            for key in lstkwargs:
                kwargs[key] = lstkwargs[key][i]
            c = kwargs.get('chain', '')
            LOGGER.update(i,
                          'Retrieving {0}...'.format(p + c),
                          label='_prody_parsePDB')
            result = _parsePDB(p, **kwargs)
            if not isinstance(result, tuple):
                if isinstance(result, dict):
                    result = (None, result)
                else:
                    result = (result, None)
            results.append(result)

        results = list(zip(*results))
        LOGGER.finish()

        for i in reversed(range(len(results))):
            if all(j is None for j in results[i]):
                results.pop(i)
        if len(results) == 1:
            results = results[0]
        results = list(results)

        model = kwargs.get('model')
        header = kwargs.get('header', False)
        if model != 0 and header:
            numPdbs = len(results[0])
        else:
            numPdbs = len(results)

        LOGGER.info('{0} PDBs were parsed in {1:.2f}s.'.format(
            numPdbs,
            time.time() - start))

        return results
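A hypothetical usage sketch (ids are illustrative; header=True and scalar-kwarg broadcasting follow the behavior shown above):

from prody import parsePDB  # assumed import

ag = parsePDB('1ake')                       # a single id returns one AtomGroup
ags = parsePDB('1ake', '4ake', chain='A')   # scalar kwargs are broadcast to every entry
ag, header = parsePDB('1ake', header=True)  # header=True also returns the header dict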
Example #16
def parseBIRD(*ids, **kwargs):
    """Parse data from the Biologically Interesting Molecule Reference 
    Dictionary (BIRD) resource, which is updated every week. This includes 
    two kinds of data, which can be selected with the **key** keyword argument.

    The chemical information is found in a single CIF file at 
    https://files.rcsb.org/pub/pdb/data/bird/prd/prd-all.cif.gz. 
    This data will be downloaded and extracted to :file:`.prody/bird-prd`.

    Biological function information is also found in a single CIF file at 
    https://files.rcsb.org/pub/pdb/data/bird/family/family-all.cif.gz. 
    This data will be downloaded and extracted to :file:`.prody/bird-family`.

    Individual compounds can be selected using **ids**. 
    If needed, BIRD files are downloaded using :func:`.fetchBIRDviaFTP` function.
    
    You can also provide arguments that you would like passed on to fetchBIRDviaFTP.

    :arg ids: one BIRD identifier (starting with PRD or FAM) or a list of them.
        If **None** is provided then all of them are returned.
    :type ids: str, tuple, list, :class:`~numpy.ndarray`, **None**

    :arg key: key specifying which data to fetch out of ``'prd'`` or ``'family'``
               default is ``'prd'``
    :type key: str

    Returns :class:`.StarDataBlock` object or list of them.
    """
    key = kwargs.get('key', 'prd')
    if not isinstance(key, str):
        raise TypeError("key should be a string")

    if key[:3].lower() == 'prd':
        key = 'prd'
    elif key[:3].lower() == 'fam':
        key = 'family'
    else:
        raise ValueError("key should be 'prd' or 'fam'")

    n_ids = len(ids)
    if n_ids == 1:
        if isListLike(ids[0]):
            ids = ids[0]
            n_ids = len(ids)

    if n_ids == 1:
        ids = list(ids)

    BIRD_PATH = os.path.join(getPackagePath(), 'bird')
    filename = BIRD_PATH + '/{0}-all.cif.gz'.format(key)
    if not os.path.isfile(filename):
        fetchBIRDviaFTP(keys=key, **kwargs)

    data = parseSTAR(filename, shlex=True)
    ret = []
    for id in ids:
        try:
            ret.append(data.search(id)[0])
        except ValueError:
            try:
                ret.append(data[id])
            except ValueError:
                LOGGER.warn('id {0} not found in {1} data '
                            'so appending None'.format(id, key))
                ret.append(None)

    if n_ids == 1:
        return ret[0]

    return ret
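A hypothetical usage sketch (ids are illustrative; assumes ProDy and network access for the first fetch):

from prody import parseBIRD  # assumed import path

prd = parseBIRD('PRD_000001')                # one id -> a StarDataBlock (or None)
fam = parseBIRD('FAM_000001', key='family')  # family data instead of prd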
Example #17
def writePIR(filename, msa, **kwargs):
    """A function to write PIR format alignments for use with MODELLER.

    :arg filename: The name of the file to be written including .ali
    :type filename: str

    :arg msa: a multiple sequence alignment in :class:`MSA` format
    :type msa: :class:`MSA` instance

    :arg chain_sep: chain separation character or list of them
        default is '/'
    :type chain_sep: str, list

    :arg types: a list of strings for field 1, PIR types (Sequence or StructureX)
        default is all Sequence
    :type types: list

    :arg labels: a list of strings for field 2, sequence labels
        default is to take them from msa
    :type labels: list

    :arg first_resnums: contents for field 3, residue number for the first residue.
        This should be a list of strings each having length 5, 
        default is all 'FIRST'
    :type first_resnums: list

    :arg first_chains: contents for field 4, chain ID for the first residue
        This should be a list of strings each having length 1, 
        default is all '@'
    :type first_chains: list

    :arg last_resnums: contents for field 5, residue number for the last residue.
        This should be a list of strings each having length 5, 
        default is all 'LAST '
    :type last_resnums: list

    :arg last_chains: contents for field 6, chain ID for the last residue
        This should be a list of strings each having length 1, 
        default is all ' '
    :type last_chains: list

    :arg protein_names: list of strings for field 7
        default is all ''
    :type protein_names: list

    :arg protein_sources: list of strings for field 8
        default is all ''
    :type protein_sources: list

    :arg resolutions: list of strings for field 9
        default is all ''
    :type resolutions: list

    :arg r_factors: list of strings for field 10
        default is all ''
    :type r_factors: list
    """
    msafile = open(filename, 'w')

    chain_sep = kwargs.get('chain_sep', '/')
    if isinstance(chain_sep, basestring): 
        chain_sep = [chain_sep] * msa.numSequences()
    elif isListLike(chain_sep) and isinstance(chain_sep[0], basestring):
        if len(chain_sep) != msa.numSequences():
            raise ValueError('There should be an entry in chain_sep list for each sequence in msa')
    else:
        raise TypeError('chain_sep should be a string or list of strings')

    types = kwargs.get('types', 'Sequence')
    if isinstance(types, basestring): 
        types = [types] * msa.numSequences()
    elif isListLike(types) and isinstance(types[0], basestring):
        if len(types) != msa.numSequences():
            raise ValueError('There should be an entry in types list for each sequence in msa')
    else:
        raise TypeError('types should be a string or list of strings')

    labels = kwargs.get('labels', None)
    if labels is None: 
        labels = []
        for sequence in msa:
            labels.append(sequence.getLabel())
    elif isListLike(labels) and isinstance(labels[0], basestring):
        if len(labels) != msa.numSequences():
            raise ValueError('There should be an entry in labels list for each sequence in msa')
    else:
        raise TypeError('labels should be a string or list of strings')

    first_resnums = kwargs.get('first_resnums', 'FIRST')
    if isinstance(first_resnums, basestring) and len(first_resnums) == 5: 
        first_resnums = [first_resnums] * msa.numSequences()
    elif isListLike(first_resnums) and isinstance(first_resnums[0], basestring):
        if len(first_resnums) != msa.numSequences():
            raise ValueError('There should be an entry in first_resnums list for each sequence in msa')
    else:
        raise TypeError('first_resnums should be a string of length 5 or list of them')

    first_chains = kwargs.get('first_chains', '@')
    if isinstance(first_chains, basestring) and len(first_chains) == 1: 
        first_chains = [first_chains] * msa.numSequences()
    elif isListLike(first_chains) and isinstance(first_chains[0], basestring):
        if len(first_chains) != msa.numSequences():
            raise ValueError('There should be an entry in first_chains list for each sequence in msa')
    else:
        raise TypeError('first_chains should be a string of length 1 or list of them')

    last_resnums = kwargs.get('last_resnums', 'LAST ')
    if isinstance(last_resnums, basestring) and len(last_resnums) == 5: 
        last_resnums = [last_resnums] * msa.numSequences()
    elif isListLike(last_resnums) and isinstance(last_resnums[0], basestring):
        if len(last_resnums) != msa.numSequences():
            raise ValueError('There should be an entry in last_resnums list for each sequence in msa')
    else:
        raise TypeError('last_resnums should be a string of length 5 or list of them')

    last_chains = kwargs.get('last_chains', ' ')
    if isinstance(last_chains, basestring) and len(last_chains) == 1: 
        last_chains = [last_chains] * msa.numSequences()
    elif isListLike(last_chains) and isinstance(last_chains[0], basestring):
        if len(last_chains) != msa.numSequences():
            raise ValueError('There should be an entry in last_chains list for each sequence in msa')
    else:
        raise TypeError('last_chains should be a string of length 1 or list of them')

    protein_names = kwargs.get('protein_names', '')
    if isinstance(protein_names, basestring): 
        protein_names = [protein_names] * msa.numSequences()
    elif isListLike(protein_names) and isinstance(protein_names[0], basestring):
        if len(protein_names) != msa.numSequences():
            raise ValueError('There should be an entry in protein_names list for each sequence in msa')
    else:
        raise TypeError('protein_names should be a string or list of strings')

    protein_sources = kwargs.get('protein_sources', '')
    if isinstance(protein_sources, basestring): 
        protein_sources = [protein_sources] * msa.numSequences()
    elif isListLike(protein_sources) and isinstance(protein_sources[0], basestring):
        if len(protein_sources) != msa.numSequences():
            raise ValueError('There should be an entry in protein_sources list for each sequence in msa')
    else:
        raise TypeError('protein_sources should be a string or list of strings')

    resolutions = kwargs.get('resolutions', '')
    if isinstance(resolutions, basestring): 
        resolutions = [resolutions] * msa.numSequences()
    elif isListLike(resolutions) and isinstance(resolutions[0], basestring):
        if len(resolutions) != msa.numSequences():
            raise ValueError('There should be an entry in resolutions list for each sequence in msa')
    else:
        raise TypeError('resolutions should be a string or list of strings')

    r_factors = kwargs.get('r_factors', '')
    if isinstance(r_factors, basestring): 
        r_factors = [r_factors] * msa.numSequences()
    elif isListLike(r_factors) and isinstance(r_factors[0], basestring):
        if len(r_factors) != msa.numSequences():
            raise ValueError('There should be an entry in r_factors list for each sequence in msa')
    else:
        raise TypeError('r_factors should be a string or list of strings')

    for i, sequence in enumerate(msa):
        sequence = str(sequence).replace(chain_sep[i],'/')
        msafile.write('>P1;' + labels[i] + '\n')
        msafile.write(types[i] + ':' + labels[i] + ':')
        msafile.write(first_resnums[i] + ':' + first_chains[i] + ':')
        msafile.write(last_resnums[i] + ':' + last_chains[i] + ':')
        msafile.write(protein_names[i] + ':' + protein_sources[i] + ':')
        msafile.write(resolutions[i] + ':' + r_factors[i])
        msafile.write('\n')

        n_full = len(sequence) // 60  # integer division for Python 3
        for j in range(n_full):
            msafile.write(sequence[j*60:(j+1)*60] + '\n')
        msafile.write(sequence[n_full*60:] + '*\n\n')

    msafile.close()
    return
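A hypothetical call (assumes an MSA parsed with ProDy; the filename and options are illustrative, with one types entry per sequence):

from prody import parseMSA, writePIR  # assumed imports

msa = parseMSA('alignment.fasta')  # hypothetical alignment with two sequences
writePIR('alignment.ali', msa, types=['StructureX', 'Sequence'])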
Example #18
    def scanPockets(self):
        """Generates ESSA z-scores for pockets and parses pocket features. It
        requires both Fpocket 3.0+ and Pandas to be installed on your system."""
        
        from re import findall

        fpocket = which('fpocket')

        if fpocket is None:
            LOGGER.warning('Fpocket (version >= 3.0) was not found, please install it.')
            return None

        try:
            from pandas import Index, DataFrame
        except ImportError as ie:
            LOGGER.warning(str(ie) + ', please install it.')
            return None

        rcr = {(i, j): k if self._rib else self._ri[k]
               for i, j, k in zip(self._ca.getChids(),
                                  self._ca.getResnums(),
                                  self._ca.getResindices())}

        writePDB('{}_pro'.format(self._title), self._heavy)

        direc = '{}_pro_out'.format(self._title)
        if not isdir(direc):
            system('fpocket -f {}_pro.pdb'.format(self._title))

        chdir(direc + '/pockets')
        l = [x for x in listdir('.') if x.endswith('.pdb')]
        l.sort(key=lambda x:int(x.partition('_')[0][6:]))

        ps = []
        for x in l:
            with open(x, 'r') as f:
                tmp0 = f.read()
                tmp1 = [(x[1].strip(), float(x[2])) for x in findall(r'(\w+\s\w+\s*-\s*)(.+):\s*([\d.-]+)(\n)', tmp0)]
            fea, sco = list(zip(*tmp1))
            ps.append(sco)
        pdbs = parsePDB(l)
        if not isListLike(pdbs):
            pdbs = [pdbs]
        chdir('../..')

        # ----- # ----- #

        ps = array(ps)

        pcn = {int(pdb.getTitle().partition('_')[0][6:]):
               set(zip(pdb.getChids().tolist(),
                       pdb.getResnums().tolist())) for pdb in pdbs}
        pi = {p: [rcr[x] for x in crn] for p, crn in pcn.items()}

        pzs_max = {k: max(self._zscore[v]) for k, v in pi.items()}
        pzs_med = {k: median(self._zscore[v]) for k, v in pi.items()}

        # ----- # ----- #

        indices = Index(range(1, ps.shape[0] + 1), name='Pocket #')

        columns = Index(fea, name='Feature')

        self._df = DataFrame(index=indices, columns=columns, data=ps)

        # ----- # ----- #

        columns_zs = Index(['ESSA_max',
                            'ESSA_med',
                            'LHD'],
                           name='Z-score')

        zps = c_[list(pzs_max.values())]
        zps = hstack((zps, c_[list(pzs_med.values())]))
        zps = hstack((zps, zscore(self._df[['Local hydrophobic density Score']])))


        self._df_zs = DataFrame(index=indices, columns=columns_zs, data=zps)
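A hypothetical workflow (the setup methods and their order are assumptions about the ESSA API; Fpocket >= 3.0 and pandas must be on the system):

from prody import parsePDB, ESSA  # assumed imports

essa = ESSA()
essa.setSystem(parsePDB('1ake'))  # assumed setup method
essa.scanResidues()               # assumed to compute the per-residue z-scores used above
essa.scanPockets()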
Example #19
def fetchBIRDviaFTP(**kwargs):
    """Retrieve the whole Biologically Interesting Molecule Reference 
    Dictionary (BIRD) resource, which is updated every week. This includes 
    two kinds of data, which can be selected with the **keys** keyword argument.

    The chemical information is found in a gzipped CIF file at 
    https://files.rcsb.org/pub/pdb/data/bird/prd/prd-all.cif.gz. This data 
    will be downloaded and extracted to :file:`.prody/bird-prd`.

    Biological function information is also found in a gzipped CIF file at 
    https://files.rcsb.org/pub/pdb/data/bird/family/family-all.cif.gz. This 
    data will be downloaded and extracted to :file:`.prody/bird-family`.

    :arg keys: keys specifying which data to fetch out of ``'prd'``, ``'family'`` or ``'both'``
               default is ``'both'``
    :type keys: str, tuple, list, :class:`~numpy.ndarray`

    The underlying data can be accessed using :func:`parseBIRD`."""

    BIRD_PATH = os.path.join(getPackagePath(), 'bird')

    keys = kwargs.get('keys', 'both')
    if isinstance(keys, str):
        if keys == 'both':
            keys = ['prd', 'family']
        elif keys[:3].lower() == 'prd':
            keys = ['prd']
        elif keys[:3].lower() == 'fam':
            keys = ['family']
        else:
            raise ValueError("keys should be 'both', 'prd' or 'fam'")

    elif isListLike(keys):
        keys = list(keys)
    else:
        raise TypeError("keys should be list-like or string")

    ftp_divided = 'pdb/data/bird/'
    ftp_pdbext = '.cif.gz'
    ftp_prefix = ''

    if not os.path.isdir(BIRD_PATH):
        os.mkdir(BIRD_PATH)

    LOGGER.progress('Downloading BIRD', len(keys),
                    '_prody_fetchBIRD')

    ftp_name, ftp_host, ftp_path = WWPDB_FTP_SERVERS[wwPDBServer() or 'us']
    LOGGER.debug('Connecting wwPDB FTP server {0}.'.format(ftp_name))

    from ftplib import FTP
    try:
        ftp = FTP(ftp_host)
    except Exception as error:
        raise type(error)('FTP connection problem, potential reason: '
                          'no internet connectivity')
    else:
        count = 0
        success = 0
        failure = 0
        filenames = []
        ftp.login('')
        for i, x in enumerate(keys):
            data = []
            ftp_fn = ftp_prefix + '{0}-all'.format(x) + ftp_pdbext
            try:
                ftp.cwd(ftp_path)
                ftp.cwd(ftp_divided)
                ftp.cwd(x)
                ftp.retrbinary('RETR ' + ftp_fn, data.append)
            except Exception as error:
                if ftp_fn in ftp.nlst():
                    LOGGER.warn('{0} download failed ({1}). It is '
                                'possible that you do not have rights to '
                                'download .gz files in the current network.'
                                .format(x, str(error)))
                else:
                    LOGGER.info('{0} download failed. {1} does not exist '
                                'on {2}.'.format(ftp_fn, x, ftp_host))
                failure += 1
                filenames.append(None)
            else:
                if len(data):
                    filename = BIRD_PATH + '/{0}-all.cif.gz'.format(x)

                    with open(filename, 'w+b') as outfile:
                        for block in data:
                            outfile.write(block)

                    success += 1
                else:
                    failure += 1
            count += 1
            LOGGER.update(i, label='_prody_fetchBIRD')
        LOGGER.finish()

    LOGGER.debug('BIRD download via FTP completed ({0} downloaded, '
                 '{1} failed).'.format(success, failure))
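A hypothetical call (assumes ProDy and FTP access to the wwPDB server):

from prody import fetchBIRDviaFTP  # assumed import path

fetchBIRDviaFTP(keys='prd')  # only the chemical data; 'family' or 'both' also work per the docstring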
Example #20
def reduceModelByMask(model, mask):
    """Returns NMA model reduced based on *mask*. 

    :arg model: dynamics model
    :type model: :class:`.ANM`, :class:`.GNM`, or :class:`.PCA`

    :arg mask: an integer array or a boolean array where **True** indicates
        the parts being selected
    :type mask: list, :class:`~numpy.ndarray`

    :returns: :class:`.NMA`"""

    if not isinstance(model, NMA):
        raise TypeError('model must be an NMA instance, not {0}'.format(
            type(model)))

    if not isListLike(mask):
        raise TypeError(
            'mask must be either a list or a numpy.ndarray, not {0}'.format(
                type(mask)))

    mask = np.asarray(mask)  # so plain lists also have dtype, min and max
    is_bool = mask.dtype is np.dtype('bool')

    if is_bool:
        if len(mask) != model.numAtoms():
            raise ValueError('number of atoms in model and mask must be equal')
        system = mask
    else:
        if mask.min() < 0 or mask.max() >= model.numAtoms():
            raise ValueError('index in mask exceeds range')
        system = np.zeros(model.numAtoms(), dtype=bool)
        system[mask] = True

    if isinstance(model, GNM):
        matrix = model._kirchhoff
    elif isinstance(model, ANM):
        matrix = model._hessian
    elif isinstance(model, PCA):
        matrix = model._cov
    else:
        raise TypeError('model does not have a valid type derived from NMA')
    if matrix is None:
        raise ValueError('model matrix (Hessian/Kirchhoff/Covariance) is not '
                         'built')

    if model.is3d():
        system = np.repeat(system, 3)

    if isinstance(model, PCA):
        ss = matrix[system, :][:, system]
        eda = PCA(model.getTitle() + ' reduced')
        eda.setCovariance(ss)
        return eda
    else:
        matrix = _reduceModel(matrix, system)

        if isinstance(model, GNM):
            gnm = GNM(model.getTitle() + ' reduced')
            gnm.setKirchhoff(matrix)
            return gnm
        elif isinstance(model, ANM):
            anm = ANM(model.getTitle() + ' reduced')
            anm.setHessian(matrix)
            return anm
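A hypothetical usage sketch (coordinates are random toy data; import paths are assumptions):

import numpy as np
from prody import GNM, reduceModelByMask  # assumed imports

gnm = GNM('toy')
gnm.buildKirchhoff(np.random.rand(10, 3) * 10)  # hypothetical coordinates
mask = np.arange(10) < 5                        # boolean mask keeping the first five nodes
reduced = reduceModelByMask(gnm, mask)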
Example #21
    def __init__(self, starDict, key, indices=None):
        self._title = key
        self._prog = starDict._prog
        self._starDict = starDict

        if indices is None:
            try:
                self._dict = starDict._dict[key]
            except:
                self._dict = list(starDict._dict)[key]

            keys = list(self._dict.keys())
        else:
            keys = [idx[0] for idx in indices]
            self._dict = OrderedDict()
            self._dict['data'] = OrderedDict()
            self._dict['fields'] = OrderedDict()
            for idx in indices:
                if idx[0] == 'data':
                    self._dict[idx[0]][idx[1]] = starDict._dict[self._title][
                        idx[0]][idx[1]]
                    if 'fields' not in keys:
                        for k, v in self._starDict._dict[
                                self._title]['fields'].items():
                            if v == idx[1]:
                                self._dict['fields'][k] = v
                else:
                    self._dict[idx[0]] = OrderedDict()
                    self._dict[idx[0]]['fields'] = starDict._dict[self._title][
                        idx[0]]['fields']
                    self._dict[idx[0]]['data'] = OrderedDict()
                    for id1 in idx[1]:
                        self._dict[idx[0]]['data'][id1] = starDict._dict[
                            self._title][idx[0]]['data'][id1]

        if set(keys) == set(['data', 'fields']):
            self.loops = []
            self.numLoops = 0

            self.data = np.array(list(self._dict['data'].values()))
            self.fields = np.array(list(self._dict['fields'].values()))

            if not isListLike(self.data):
                self.data = [self.data]

            if not isListLike(self.fields):
                self.fields = [self.fields]

            self.numEntries = len(self.data)
            self.numFields = len(self.fields)

        elif 'data' in keys and 'fields' in keys:
            if indices is not None:
                self.loops = [
                    StarLoop(self, key, idx) for (key, idx) in indices
                    if key not in ['data', 'fields']
                ]
            else:
                self.loops = [
                    StarLoop(self, key) for key in keys
                    if key not in ['data', 'fields']
                ]

            self.data = np.array(list(self._dict['data'].values()))
            self.fields = np.array(list(self._dict['fields'].values()))

            if not isListLike(self.data):
                self.data = [self.data]

            if not isListLike(self.fields):
                self.fields = [self.fields]

            self.numEntries = len(self.data)
            self.numFields = len(self.fields)
            self.numLoops = len(self.loops)

        elif 'data' in keys:
            if indices is not None:
                self.loops = [
                    StarLoop(self, key, idx) for (key, idx) in indices
                    if key != 'data'
                ]
            else:
                self.loops = [
                    StarLoop(self, key) for key in keys if key != 'data'
                ]

            self.data = np.array(list(self._dict['data'].values()))
            # 'fields' may be absent in this branch, so default to empty
            self.fields = np.array(list(self._dict.get('fields', {}).values()))

            if not isListLike(self.data):
                self.data = [self.data]

            if not isListLike(self.fields):
                self.fields = [self.fields]

            self.numLoops = len(self.loops)
            self.numEntries = len(self.data)
            self.numFields = 0

        elif 'fields' in keys:
            if indices is not None:
                self.loops = [
                    StarLoop(self, key, idx) for (key, idx) in indices
                    if key != 'fields'
                ]
            else:
                self.loops = [
                    StarLoop(self, key) for key in keys if key != 'fields'
                ]

            # 'data' may be absent in this branch, so default to empty
            self.data = np.array(list(self._dict.get('data', {}).values()))
            self.fields = np.array(list(self._dict['fields'].values()))

            if not isListLike(self.data):
                self.data = [self.data]

            if not isListLike(self.fields):
                self.fields = [self.fields]

            self.numLoops = len(self.loops)
            self.numEntries = len(self.data)
            self.numFields = 0

        else:
            if indices is not None:
                self.loops = [
                    StarLoop(self, key, idx) for (key, idx) in indices
                ]
            else:
                self.loops = [StarLoop(self, key) for key in keys]

            self.numLoops = len(self.loops)
            self.numEntries = 0
            self.numFields = 0
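A minimal usage sketch, assuming this is the StarDataBlock constructor from ProDy's STAR-file machinery, that star_dict is a parsed StarDict, and that 'entry' is one of its data-block keys (all three names are assumptions, not taken from the code above):

block = StarDataBlock(star_dict, 'entry')   # select one data block by key
print(block.numLoops, block.numEntries, block.numFields)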
Example #26
0
File: goa.py Project: prody/ProDy
def queryGOA(*ids, **kwargs):
    """Query a GOA database by identifier.

    :arg ids: an identifier or a list-like of identifiers 
    :type ids: str, tuple, list, :class:`~numpy.ndarray`

    :arg database: name of the database of interest
        default is PDB. Others include UNIPROT and 
        common names of many organisms.
    :type database: str
    """
    database = kwargs.pop('database', 'PDB')

    gaf_dict = kwargs.pop('gaf_dict', None)
    if gaf_dict is None:
        gaf_dict = parseGAF(database=database, **kwargs)
        LOGGER.info('GAF parsing completed.')

    n_ids = len(ids)
    if n_ids == 1:
        if isListLike(ids[0]):
            ids = ids[0]
            n_ids = len(ids)

    if n_ids == 1:
        ids = list(ids)

    results = []
    unmapped = []
    LOGGER.progress('Querying GOA for {0} ids...'
                    .format(n_ids), n_ids, '_prody_queryGOA')
    for i, id in enumerate(ids):
        LOGGER.update(i, 'Querying GOA for id {0} of {1}...'
                      .format(i+1, n_ids), label='_prody_queryGOA')
        if not isinstance(id, str):
            raise TypeError('each ID should be a string')

        id = id.upper()

        if database == 'PDB':
            if len(id) not in [4, 5, 6]:
                raise ValueError('PDB IDs should be strings of length 4 to 6')

            if len(id) == 5 and id[-1].isalpha():
                id = id[:4] + '_' + id[-1]

        if id in gaf_dict:
            results.append(gaf_dict[id])
        else:
            results.append([])
            unmapped.append(id)

    rets = []
    LOGGER.progress('Mapping GO terms back to GOA results for {0} ids...'
                    .format(n_ids), n_ids, '_prody_mapGO')
    for i, result in enumerate(results):
        LOGGER.update(i, 'Mapping GO terms back to GOA results id {0} of {1}...'
                      .format(i+1, n_ids), label='_prody_mapGO')
        rets.append(GOADictList(result, title=ids[i], **kwargs))

    if n_ids == 1:
        rets = rets[0]

    return rets
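A hedged usage sketch; the PDB IDs are illustrative, and without a precomputed gaf_dict the call invokes parseGAF, which may download the GOA annotation file:

result = queryGOA('2k39', database='PDB')           # single ID -> one GOADictList
results = queryGOA('2k39', '1ubi', database='PDB')  # several IDs -> a list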
Example #27
0
def calcGoOverlap(*go_terms, **kwargs):
    """Calculate overlap between GO terms based on their distance
    in the graph. GO terms in different namespaces (molecular function,
    cellular component, and biological process) have undefined distances.

    :arg go_terms: a list of GO terms or GO IDs
    :type go_terms: list, tuple, `~numpy.ndarray`

    :arg pairwise: whether to calculate a matrix of pairwise overlaps
        default is False
    :type pairwise: bool

    :arg distance: whether to return distances rather than calculating overlaps
        default is False
    :type distance: bool

    :arg go: GO graph. Default behaviour is to parse it with :func:`.parseOBO`.
    :type go: `~goatools.obo_parser.GODag`
    """
    pairwise = kwargs.pop('pairwise', False)
    distance = kwargs.get('distance', False)
    operator = kwargs.get('operator', None)

    go = kwargs.get('go', None)
    if go is None:
        go = parseOBO(**kwargs)

    if not isListLike(go_terms):
        raise TypeError('please provide a list-like of go terms')

    if pairwise:
        distances = np.zeros((len(go_terms), len(go_terms)))
        for i, go_terms_i in enumerate(go_terms):
            for j, go_terms_j in enumerate(go_terms):
                distances[i, j] = calcGoOverlap(go_terms_i,
                                                go_terms_j,
                                                pairwise=False,
                                                **kwargs)

    else:
        go_terms1 = go_terms[0]

        flattened_term_list = []
        for entry in go_terms[1:]:
            if isListLike(entry):
                flattened_term_list.extend(entry)
            else:
                flattened_term_list.append(entry)

        if not isListLike(go_terms1):
            go_terms1 = [go_terms1]

        if not isListLike(flattened_term_list):
            flattened_term_list = [flattened_term_list]

        try:
            # entries may be GO IDs that can be looked up in the GO DAG
            flattened_term_list = [go[term] for term in flattened_term_list]
            go_terms1 = [go[term] for term in go_terms1]
        except (KeyError, TypeError):
            try:
                # otherwise they should be GO term objects carrying an id
                flattened_term_list = [term.id for term in flattened_term_list]
                go_terms1 = [term.id for term in go_terms1]
            except AttributeError:
                raise TypeError('go_terms should contain go terms or IDs')

        # rebinding a loop variable does not modify the list, so build new
        # lists that replace any remaining term objects with their string IDs
        flattened_term_list = [term if isinstance(term, str) else term.id
                               for term in flattened_term_list]
        go_terms1 = [term if isinstance(term, str) else term.id
                     for term in go_terms1]

        distances = np.zeros((len(go_terms1), len(flattened_term_list)))
        for i, go_id1 in enumerate(go_terms1):
            for j, go_id2 in enumerate(flattened_term_list):
                distances[i, j] = calcMinBranchLength(go_id1, go_id2, go)

        if operator is not None and isListLike(distances):
            distances = operator(distances)

    if operator is None:
        if distances.shape[-1] == 1:
            distances = distances.flatten()

        if distances.shape == (1, ):
            distances = distances[0]

    if distance:
        return distances
    else:
        return 1. / distances
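A minimal usage sketch with real but arbitrarily chosen GO IDs (zinc ion binding and metal ion binding); without a go graph in kwargs the call invokes parseOBO, and identical terms have distance 0, so the default 1/distance overlap would divide by zero:

overlap = calcGoOverlap('GO:0008270', 'GO:0046872')              # 1/distance
dist = calcGoOverlap('GO:0008270', 'GO:0046872', distance=True)  # raw branch length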