Example #1
File: spectrus.py, Project: fongchun/ProDy
def calcMBSfromSim(simMatrix, nEvals=20, remove_outliers=True,
                   remove_offset=True, **kwargs):

    LOGGER.timeit('_MBS')
    n = simMatrix.shape[0]
    mbs = np.zeros(n) 
    for i in range(n):
        try:
            # cut "non-covalent" bonds around atom 'i'
            modSim = MBSPointMutation(simMatrix, i)
            # compute laplacian's spectrum of eigvals
            laplacian = sparse.csgraph.laplacian(modSim, normed=True)
            evals = sparse.linalg.eigsh(laplacian, k=min(nEvals, n-1), 
                                        which='SM', return_eigenvectors=False)
            # sort eigvals in ascending order
            evals = np.sort(evals)
            # compute MBS at site i
            mbs[i] = np.sum(1./evals[1:])
        except Exception as err:
            LOGGER.warn('Unable to compute MBS at position '
                        '{0}. {1}'.format(i, err))
            mbs[i] = np.nan
    if any(~np.isnan(mbs)):
        # remove outliers
        if remove_outliers is True:
            mbs = _removeOutliers(mbs, **kwargs)
        # remove offset
        if remove_offset is True:
            offset = min(mbs[~np.isnan(mbs)])
            mbs = mbs - offset 
    LOGGER.report('MBS computed in %.1fs.', '_MBS')

    return mbs
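Every snippet on this page follows the same timing pattern: LOGGER.timeit(label) starts a named stopwatch, and LOGGER.report(message, label) logs the message with the '%.2fs' (or '%.1fs') placeholder replaced by the elapsed time. A minimal sketch of the pattern, assuming only ProDy's top-level LOGGER (the function name and workload here are invented for illustration):

from prody import LOGGER

def timedComputation(n=1000000):
    LOGGER.timeit('_demo')                # start a stopwatch named '_demo'
    total = sum(i * i for i in range(n))  # stand-in for the real work
    LOGGER.report('Computation finished in %.2fs.', '_demo')
    return total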
Example #2
File: Uniprot.py, Project: yaz62/rhapsody
 def _calcCustomAlignments(self, title, chains_to_align):
     seqUniprot = self.sequence
     PDBrecord = [d for d in self.customPDBmappings if d['PDB'] == title][0]
     alignments = PDBrecord.setdefault('alignments', {})
     maps = PDBrecord.setdefault('maps', {})
     for c in chains_to_align:
         # check for precomputed alignments and maps
         if c in alignments:
             continue
         # otherwise, align and map to PDB resids
         PDBresids = PDBrecord['chain_res'][c]
         seqChain = PDBrecord['chain_seq'][c]
         LOGGER.timeit('_align')
         try:
             a, m = self._quickAlign(seqUniprot, seqChain, PDBresids)
             msg = f"Chain {c} was quick-aligned"
         except:
             LOGGER.info(f"Aligning chain {c} of custom PDB {title}...")
             a, m = self._align(seqUniprot,
                                seqChain,
                                PDBresids,
                                print_info=True)
             msg = f"Chain {c} was aligned"
         LOGGER.report(msg + ' in %.1fs.', '_align')
         # store alignments and maps into PDBmappings
         alignments[c] = a
         maps[c] = m
     return
Example #3
File: analysis.py, Project: prody/ProDy
def calcMeff(msa, seqid=0.8, refine=False, weight=False, **kwargs):
    """Returns the Meff for *msa*, which may be an :class:`.MSA`
    instance or a 2D Numpy character array.

    Since similar sequences in an *msa* decrease its diversity, *Meff*
    assigns a weight to each sequence in the *msa*.

    For example, if a sequence has 5 similar sequences in the MSA (itself
    included), its weight is defined as 1/5 = 0.2. Meff is the sum of all
    sequence weights. In other words, Meff can be understood as the
    effective number of independent sequences.

    Sequences sharing sequence identity of *seqid* or more with another
    sequence are regarded as similar sequences to calculate Meff.

    Sequences are not refined by default. When *refine* is set **True**, the
    MSA will be refined by the first sequence.

    The weights for the sequences are returned when *weight* is **True**."""

    msa = getMSA(msa)
    from .msatools import msameff

    LOGGER.timeit("_meff")
    refine = 1 if refine else 0
    weight = 0 if weight else 1  # msameff flag: 0 means the weight array is also returned
    if not weight:
        w = zeros((msa.shape[0]), float)
        meff = msameff(msa, theta=1.0 - seqid, meff_only=weight, refine=refine, w=w)
    else:
        meff = msameff(msa, theta=1.0 - seqid, meff_only=weight, refine=refine)
    LOGGER.report("Meff was calculated in %.2fs.", "_meff")
    return meff
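To make the weighting scheme in the docstring concrete, here is a small NumPy sketch of the same idea (the toy sequences are invented; in ProDy the actual computation is done by the compiled msameff routine):

import numpy as np

# Toy MSA: 4 sequences of length 5 as a 2D character array.
msa = np.array([list(s) for s in ['MKVLA', 'MKVLA', 'MKILA', 'MAAAA']])
seqid = 0.8

# Fraction of identical columns for every pair of sequences.
ident = (msa[:, None, :] == msa[None, :, :]).mean(axis=2)

# Weight of a sequence = 1 / (number of sequences at >= seqid identity
# with it, itself included); Meff is the sum of the weights.
counts = (ident >= seqid).sum(axis=1)
weights = 1.0 / counts
print(weights, weights.sum())  # [1/3, 1/3, 1/3, 1.0] and Meff = 2.0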
Example #4
    def parsePDBs(self, **kwargs):
        """Load PDB into memory as :class:`.AtomGroup` instances using :func:`.parsePDB` and 
        perform selection based on residue ranges given by CATH."""
        
        pdbs = self.getPDBs(True)
        selstrs = self.getSelStrs()
        header = kwargs.get('header', False)
        model = kwargs.get('model', None)

        LOGGER.timeit('_cath_parsePDB')
        LOGGER.info('Parsing {0} PDB files...'.format(len(pdbs)))
        ret = parsePDB(*pdbs, **kwargs)

        if model != 0:
            if header:
                prots, _ = ret
            else:
                prots = ret

            LOGGER.info('Extracting domains...')
            for i in range(len(prots)):
                sel = prots[i].select(selstrs[i])
                prots[i] = sel
        LOGGER.report('CATH domains are parsed and extracted in %.2fs', '_cath_parsePDB')

        return ret
Example #5
def calcMeff(msa, seqid=.8, refine=False, weight=False, **kwargs):
    """Returns the Meff for *msa*, which may be an :class:`.MSA`
    instance or a 2D Numpy character array.

    Since similar sequences in an *msa* decrease its diversity, *Meff*
    assigns a weight to each sequence in the *msa*.

    For example, if a sequence has 5 similar sequences in the MSA (itself
    included), its weight is defined as 1/5 = 0.2. Meff is the sum of all
    sequence weights. In other words, Meff can be understood as the
    effective number of independent sequences.

    Sequences sharing sequence identity of *seqid* or more with another
    sequence are regarded as similar sequences to calculate Meff.

    Sequences are not refined by default. When *refine* is set **True**, the
    MSA will be refined by the first sequence.

    The weights for the sequences are returned when *weight* is **True**."""

    msa = getMSA(msa)
    from .msatools import msameff
    LOGGER.timeit('_meff')
    refine = 1 if refine else 0
    weight = 0 if weight else 1  # msameff flag: 0 means the weight array is also returned
    if not weight:
        w = zeros((msa.shape[0]), float)
        meff = msameff(msa, theta=1.-seqid, meff_only=weight,
                       refine=refine, w=w)
    else:
        meff = msameff(msa, theta=1.-seqid, meff_only=weight, refine=refine)
    LOGGER.report('Meff was calculated in %.2fs.', '_meff')
    return meff
Example #6
File: emdfile.py, Project: Python3pkg/ProDy
def parseEMDStream(stream, **kwargs):
    """ Returns an :class:`.AtomGroup` containing EMD data parsed from a stream of EMD file.

    :arg stream: Anything that implements the method ``readlines``
        (e.g. :class:`file`, buffer, stdin)"""

    cutoff = float(kwargs.get('cutoff', 1.20))
    n_nodes = int(kwargs.get('n_nodes', 1000))
    num_iter = int(kwargs.get('num_iter', 20))

    ag = None
    title_suffix = ''
    if 'ag' in kwargs:
        ag = kwargs['ag']
        if not isinstance(ag, AtomGroup):
            raise TypeError('ag must be an AtomGroup instance')
        n_csets = ag.numCoordsets()
    else: 
        ag = AtomGroup(str(kwargs.get('title', 'Unknown')) + title_suffix)
        n_csets = 0

    biomol = kwargs.get('biomol', False)
    hd = None
    LOGGER.warn('Building coordinates from electron density map. This may take a while.')
    LOGGER.timeit()
    _parseEMDLines(ag, stream, cutoff=cutoff, n_nodes=n_nodes, num_iter=num_iter, format='EMD')
    LOGGER.report('{0} atoms and {1} coordinate sets were '
                      'parsed in %.2fs.'.format(ag.numAtoms(),
                         ag.numCoordsets() - n_csets))
    return ag
Example #7
 def run(self,
         tmax=200,
         li=0.2,
         lf=0.01,
         ei=0.3,
         ef=0.05,
         Ti=0.1,
         Tf=2,
         c=0,
         calcC=False):
     LOGGER.info(
         'Building coordinates from electron density map. This may take a while.'
     )
     LOGGER.timeit('_prody_make_nodes')
     tmax = int(tmax * self.N)
     li = li * self.N
     if calcC:
         Ti = Ti * self.N
         Tf = Tf * self.N
     for t in range(1, tmax + 1):
         # calc the parameters
         tt = float(t) / tmax
         l = li * np.power(lf / li, tt)
         ep = ei * np.power(ef / ei, tt)
         if calcC:
             T = Ti * np.power(Tf / Ti, tt)
         else:
             T = -1
         self.runOnce(t, l, ep, T, c)
     LOGGER.report('{0} pseudoatoms were fitted in %.2fs.'.format(self.N),
                   '_prody_make_nodes')
     return
Example #8
File: clustenm.py, Project: SHZ66/ProDy
    def _generate(self, confs):

        LOGGER.info('Sampling conformers in generation %d ...' % self._cycle)
        LOGGER.timeit('_clustenm_gen')

        sample_method = self._sample_v1 if self._v1 else self._sample

        if self._parallel:
            with Pool(cpu_count()) as p:
                tmp = p.map(sample_method, [conf for conf in confs])
        else:
            tmp = [sample_method(conf) for conf in confs]

        tmp = [r for r in tmp if r is not None]

        confs_ex = np.concatenate(tmp)

        confs_cg = confs_ex[:, self._idx_cg]

        LOGGER.info('Clustering in generation %d ...' % self._cycle)
        label_cg = self._hc(confs_cg)
        centers, wei = self._centers(confs_cg, label_cg)
        LOGGER.report('Centroids were generated in %.2fs.',
                      label='_clustenm_gen')

        return confs_ex[centers], wei
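The parallel/serial toggle above is a standard multiprocessing idiom: map the same worker over the inputs either with a process pool or with a plain list comprehension. A self-contained sketch (the worker is a stand-in for the sampling method):

from multiprocessing import Pool, cpu_count

def sample(conf):
    # stand-in for the per-conformer sampling done in _generate
    return conf * 2

if __name__ == '__main__':
    confs = list(range(8))
    parallel = True
    if parallel:
        with Pool(cpu_count()) as p:
            tmp = p.map(sample, confs)
    else:
        tmp = [sample(conf) for conf in confs]
    print(tmp)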
Example #9
File: analysis.py, Project: ielushuai/ProDy
def buildPCMatrix(msa, turbo=False, **kwargs):
    """Returns PC matrix calculated for *msa*, which may be an :class:`.MSA`
    instance or a 2D Numpy character array.

    Implementation is case insensitive and handles ambiguous amino acids
    as follows:

      * **B** (Asx) count is allocated to *D* (Asp) and *N* (Asn)
      * **Z** (Glx) count is allocated to *E* (Glu) and *Q* (Gln)
      * **J** (Xle) count is allocated to *I* (Ile) and *L* (Leu)
      * **X** (Xaa) count is allocated to the twenty standard amino acids
      * Joint probability of observing a pair of ambiguous amino acids is
        allocated to all potential combinations, e.g. probability of **XX**
        is allocated to 400 combinations of standard amino acids, similarly
        probability of **XB** is allocated to 40 combinations of *D* and *N*
        with the standard amino acids.

    Selenocysteine (**U**, Sec) and pyrrolysine (**O**, Pyl) are considered
    as distinct amino acids.  When *ambiguity* is set **False**, all alphabet
    characters are considered as distinct types.  All non-alphabet characters
    are considered as gaps.
    """

    msa = getMSA(msa)
    from .msatools import msapsicov
    LOGGER.timeit('_psicov')
    length = msa.shape[1]
    pc = zeros((length, length), float)
    pc = msapsicov(msa, pc, turbo=bool(turbo))
    LOGGER.report('PC matrix was calculated in %.2fs.', '_psicov')
    return pc
Example #10
def loadAtoms(filename):
    """Return :class:`AtomGroup` instance from *filename*.  This function makes
    use of :func:`numpy.load` function.  See also :func:`saveAtoms`."""
    
    LOGGER.timeit()
    attr_dict = load(filename)
    files = set(attr_dict.files)
    if 'n_atoms' not in files:
        raise ValueError("'{0:s}' is not a valid atomic data file"
                         .format(filename))
    title = str(attr_dict['title'])
    if 'coordinates' in files:
        coords = attr_dict['coordinates']
        ag = AtomGroup(title)
        ag._n_csets = int(attr_dict['n_csets'])
        ag._coords = coords
    ag._n_atoms = int(attr_dict['n_atoms'])
    ag._setTimeStamp()
    if 'bonds' in files and 'bmap' in files and 'numbonds' in files:
        ag._bonds = attr_dict['bonds']
        ag._bmap = attr_dict['bmap']
        ag._data['numbonds'] = attr_dict['numbonds']
    for key, data in attr_dict.items():
        if key in SKIPLOAD:
            continue
        if key in ATOMIC_ATTRIBUTES:
            ag._data[key] = data
        else:
            ag.setData(key, data)
    if ag.numCoordsets() > 0:
        ag._acsi = 0
    if 'cslabels' in files:
        ag.setCSLabels(list(attr_dict['cslabels']))
    LOGGER.timing('Atom group was loaded in %.2fs.')
    return ag
Example #11
File: analysis.py, Project: npabon/ProDy
def buildSCAMatrix(msa, turbo=True, **kwargs):
    """Return SCA matrix calculated for *msa*, which may be an :class:`.MSA`
    instance or a 2D Numpy character array.

    Implementation is case insensitive and handles ambiguous amino acids
    as follows:

      * **B** (Asx) count is allocated to *D* (Asp) and *N* (Asn)
      * **Z** (Glx) count is allocated to *E* (Glu) and *Q* (Gln)
      * **J** (Xle) count is allocated to *I* (Ile) and *L* (Leu)
      * **X** (Xaa) count is allocated to the twenty standard amino acids
      * Joint probability of observing a pair of ambiguous amino acids is
        allocated to all potential combinations, e.g. probability of **XX**
        is allocated to 400 combinations of standard amino acids, similarly
        probability of **XB** is allocated to 40 combinations of *D* and *N*
        with the standard amino acids.

    Selenocysteine (**U**, Sec) and pyrrolysine (**O**, Pyl) are considered
    as distinct amino acids.  When *ambiguity* is set **False**, all alphabet
    characters are considered as distinct types.  All non-alphabet characters
    are considered as gaps."""

    msa = getMSA(msa)
    from .msatools import msasca
    LOGGER.timeit('_sca')
    length = msa.shape[1]
    sca = zeros((length, length), float)
    sca = msasca(msa, sca, turbo=bool(turbo))
    LOGGER.report('SCA matrix was calculated in %.2fs.', '_sca')
    return sca
Example #12
File: pdbfile.py, Project: fongchun/ProDy
def parsePQR(filename, **kwargs):
    """Returns an :class:`.AtomGroup` containing data parsed from PDB lines.

    :arg filename: a PQR filename
    :type filename: str"""

    title = kwargs.get('title', kwargs.get('name'))
    model = 1
    header = False
    chain = kwargs.get('chain')
    subset = kwargs.get('subset')
    altloc = kwargs.get('altloc', 'A')
    max_n_atoms = kwargs.get('max_n_atoms', 1e5)
    if not os.path.isfile(filename):
        raise IOError('No such file: {0}'.format(repr(filename)))
    if title is None:
        fn, ext = os.path.splitext(os.path.split(filename)[1])
        if ext == '.gz':
            fn, ext = os.path.splitext(fn)
        title = fn.lower()
    title_suffix = ''
    if subset:
        try:
            subset = _PDBSubsets[subset.lower()]
        except AttributeError:
            raise TypeError('subset must be a string')
        except KeyError:
            raise ValueError('{0} is not a valid subset'
                             .format(repr(subset)))
        title_suffix = '_' + subset
    if chain is not None:
        if not isinstance(chain, str):
            raise TypeError('chain must be a string')
        elif len(chain) == 0:
            raise ValueError('chain must not be an empty string')
        title_suffix = '_' + chain + title_suffix
    if 'ag' in kwargs:
        ag = kwargs['ag']
        if not isinstance(ag, AtomGroup):
            raise TypeError('ag must be an AtomGroup instance')
        n_csets = ag.numCoordsets()
    else:
        ag = AtomGroup(title + title_suffix)
        n_csets = 0

    pqr = openFile(filename, 'rt')
    lines = pqr.readlines()
    pqr.close()
    LOGGER.timeit()
    ag = _parsePDBLines(ag, lines, split=0, model=1, chain=chain,
                        subset=subset, altloc_torf=False, format='pqr', 
                        max_n_atoms=max_n_atoms)
    if ag.numAtoms() > 0:
        LOGGER.report('{0} atoms and {1} coordinate sets were '
                      'parsed in %.2fs.'.format(ag.numAtoms(),
                      ag.numCoordsets() - n_csets))
        return ag
    else:
        return None
Example #13
def parsePQR(filename, **kwargs):
    """Returns an :class:`.AtomGroup` containing data parsed from PDB lines.

    :arg filename: a PQR filename
    :type filename: str"""

    title = kwargs.get('title', kwargs.get('name'))
    chain = kwargs.get('chain')
    subset = kwargs.get('subset')
    if not os.path.isfile(filename):
        raise IOError('No such file: {0}'.format(repr(filename)))
    if title is None:
        fn, ext = os.path.splitext(os.path.split(filename)[1])
        if ext == '.gz':
            fn, ext = os.path.splitext(fn)
        title = fn.lower()
    title_suffix = ''
    if subset:
        try:
            subset = _PDBSubsets[subset.lower()]
        except AttributeError:
            raise TypeError('subset must be a string')
        except KeyError:
            raise ValueError('{0} is not a valid subset'.format(repr(subset)))
        title_suffix = '_' + subset
    if chain is not None:
        if not isinstance(chain, str):
            raise TypeError('chain must be a string')
        elif len(chain) == 0:
            raise ValueError('chain must not be an empty string')
        title_suffix = '_' + chain + title_suffix
    if 'ag' in kwargs:
        ag = kwargs['ag']
        if not isinstance(ag, AtomGroup):
            raise TypeError('ag must be an AtomGroup instance')
        n_csets = ag.numCoordsets()
    else:
        ag = AtomGroup(title + title_suffix)
        n_csets = 0

    pqr = openFile(filename, 'rt')
    lines = pqr.readlines()
    pqr.close()
    LOGGER.timeit()
    ag = _parsePDBLines(ag,
                        lines,
                        split=0,
                        model=1,
                        chain=chain,
                        subset=subset,
                        altloc_torf=False,
                        format='pqr')
    if ag.numAtoms() > 0:
        LOGGER.report('{0} atoms and {1} coordinate sets were '
                      'parsed in %.2fs.'.format(ag.numAtoms(),
                                                ag.numCoordsets() - n_csets))
        return ag
    else:
        return None
Example #14
def refineEnsemble(ens, lower=.5, upper=10.):
    """Refine a PDB ensemble based on RMSD criterions."""

    LOGGER.timeit('_prody_refineEnsemble')
    from numpy import argsort

    ### obtain reference index
    rmsd = ens.getRMSDs()
    ref_i = np.argmin(rmsd)

    ### calculate pairwise RMSDs ###
    RMSDs = ens.getRMSDs(pairwise=True)

    def getRefinedIndices(A):
        deg = A.sum(axis=0)
        sorted_indices = list(argsort(deg))
        sorted_indices.remove(ref_i)
        sorted_indices.insert(0, ref_i)

        n_confs = ens.numConfs()
        isdel_temp = np.zeros(n_confs)
        for a in range(n_confs):
            i = sorted_indices[a]
            for b in range(n_confs):
                if a >= b:
                    continue
                j = sorted_indices[b]
                if isdel_temp[i] or isdel_temp[j]:
                    continue
                else:
                    if A[i, j]:
                        isdel_temp[j] = 1
        temp_list = isdel_temp.tolist()
        ind_list = []
        for i in range(n_confs):
            if not temp_list[i]:
                ind_list.append(i)
        return ind_list

    L = list(range(len(ens)))
    U = list(range(len(ens)))
    if lower is not None:
        A = RMSDs < lower
        L = getRefinedIndices(A)

    if upper is not None:
        B = RMSDs > upper
        U = getRefinedIndices(B)

    # keep conformations common to L and U (the intersection of the two)
    I = list(set(L) - (set(L) - set(U)))
    reens = ens[I]

    LOGGER.report('Ensemble was refined in %.2fs.', '_prody_refineEnsemble')
    LOGGER.info('%d conformations were removed from ensemble.' %
                (len(ens) - len(I)))

    return reens
Example #15
    def runManyStepsAlternating(self, n_steps, **kwargs):
        LOGGER.timeit('_prody_runManySteps')
        n_start = self.numSteps
        while self.numSteps < n_start + n_steps:
            n_modes = self.n_modes

            self.runStep(structA=self.structA,
                         structB=self.structB,
                         reduceSelA=self.reduceSelA,
                         reduceSelB=self.reduceSelB,
                         alignSelA=self.alignSelA,
                         alignSelB=self.alignSelB,
                         n_modes=n_modes,
                         **kwargs)
            LOGGER.debug(
                'Total time so far is %.2f minutes' %
                ((time.time() - LOGGER._times['_prody_runManySteps']) / 60))

            self.runStep(structA=self.structB,
                         structB=self.structA,
                         reduceSelA=self.reduceSelB,
                         reduceSelB=self.reduceSelA,
                         alignSelA=self.alignSelB,
                         alignSelB=self.alignSelA,
                         n_modes=n_modes,
                         **kwargs)
            LOGGER.debug(
                'Total time so far is %.2f minutes' %
                ((time.time() - LOGGER._times['_prody_runManySteps']) / 60))

            converged = self.checkConvergence()
            if converged:
                self.structA.setCoords(
                    self.coordsA
                )  # That way the original object is back to normal
                self.structB.setCoords(
                    self.coordsB
                )  # That way the original object is back to normal
                LOGGER.debug(
                    'Process completed in %.2f hours' %
                    ((time.time() - LOGGER._times['_prody_runManySteps']) /
                     3600))
                break

        ensemble = Ensemble('combined trajectory')
        ensemble.setAtoms(self.structA)
        for coordset in self.ensembleA.getCoordsets():
            ensemble.addCoordset(coordset)
        for coordset in reversed(self.ensembleB.getCoordsets()):
            ensemble.addCoordset(coordset)

        if self.outputPDB:
            writePDB(self.filename, ensemble)

        if self.outputDCD:
            writeDCD(self.filename, ensemble)

        return
Example #16
File: cath.py, Project: nffaruk/ProDy
    def update(self, source=None):
        """Update data and files from CATH."""

        self._source = source = self._source or source
        self.reset()
        if source is None:
            return

        LOGGER.timeit('_cath_update')

        type_ = 0
        tree = None
        if isinstance(source, str):
            if isfile(source):
                type_ = 1
            elif isURL(source):
                type_ = 0
            else:
                type_ = 2
        elif hasattr(source, 'read'):
            type_ = 1
        else:
            raise TypeError(
                'source must be either an url, file name, file handle, '
                'or text in xml format')

        if type_ == 0:
            LOGGER.info('Fetching data from CATH...')
            self._fetch()

            LOGGER.info('Parsing CATH files...')
            self._parse()
        elif type_ == 1:
            LOGGER.info('Reading data from the local xml file...')
            tree = ET.parse(source)
        elif type_ == 2:
            LOGGER.info('Parsing input string...')
            tree = ET.fromstring(source)

        # post-processing
        if type_ > 0:
            root = tree.getroot()
            nodes = root.iter()

            # remove prefix from node tags
            for node in nodes:
                node.tag = node.tag.lstrip('id.')

            # convert the length attribute from str to int
            length_nodes = root.findall('.//*[@length]')
            for node in length_nodes:
                node.attrib['length'] = int(node.attrib['length'])

            copy2(root, self.root)
            self._update_map()

        LOGGER.report('CATH local database built in %.2fs.', '_cath_update')
Example #17
File: emdfile.py, Project: uibcdf/ProDy
def parseEMDStream(stream, **kwargs):
    """ Returns an :class:`.AtomGroup` containing EMD data parsed from a stream of EMD file.

    :arg stream: Any object with the method ``readlines``
        (e.g. :class:`file`, buffer, stdin)"""

    cutoff = kwargs.get('cutoff', None)
    if cutoff is not None:
        cutoff = float(cutoff)

    n_nodes = kwargs.get('n_nodes', 0)
    num_iter = int(kwargs.get('num_iter', 20))
    map = kwargs.get('map', False)
    make_nodes = kwargs.get('make_nodes', False)

    if n_nodes > 0:
        make_nodes = True
        n_nodes = int(n_nodes)

    if map is False and make_nodes is False:
        LOGGER.warn(
            'At least one of map and make_nodes should be True. '
            'Setting map to False was an intentional change from the default '
            'behaviour so make_nodes has been set to True with n_nodes=1000.')
        make_nodes = True
        n_nodes = 1000

    title_suffix = kwargs.get('title_suffix', '')
    atomgroup = AtomGroup(str(kwargs.get('title', 'Unknown')) + title_suffix)
    atomgroup._n_atoms = n_nodes

    if make_nodes:
        LOGGER.info(
            'Building coordinates from electron density map. This may take a while.'
        )
        LOGGER.timeit()

        if map:
            emd, atomgroup = _parseEMDLines(atomgroup, stream, cutoff=cutoff, n_nodes=n_nodes, \
                                            num_iter=num_iter, map=map, make_nodes=make_nodes)
        else:
            atomgroup = _parseEMDLines(atomgroup, stream, cutoff=cutoff, n_nodes=n_nodes, \
                                       num_iter=num_iter, map=map, make_nodes=make_nodes)

        LOGGER.report('{0} pseudoatoms were fitted in %.2fs.'.format(
            atomgroup.numAtoms(), atomgroup.numCoordsets()))
    else:
        emd = _parseEMDLines(atomgroup, stream, cutoff=cutoff, n_nodes=n_nodes, \
                             num_iter=num_iter, map=map, make_nodes=make_nodes)

    if make_nodes:
        if map:
            return emd, atomgroup
        else:
            return atomgroup
    else:
        return emd
Example #18
File: cath.py, Project: fongchun/ProDy
    def update(self, source=None):
        """Update data and files from CATH."""

        self._source = source = self._source or source
        self.reset()
        if source is None:
            return

        LOGGER.timeit('_cath_update')
        
        type_ = 0
        tree = None
        if isinstance(source, str):
            if isfile(source):
                type_ = 1
            elif isURL(source):
                type_ = 0
            else:
                type_ = 2
        elif hasattr(source, 'read'):
            type_ = 1
        else:
            raise TypeError('source must be either an url, file name, file handle, '
                            'or text in xml format')

        if type_ == 0:
            LOGGER.info('Fetching data from CATH...')
            self._fetch()

            LOGGER.info('Parsing CATH files...')
            self._parse()
        elif type_ == 1:
            LOGGER.info('Reading data from the local xml file...')
            tree = ET.parse(source)
        elif type_ == 2:
            LOGGER.info('Parsing input string...')
            tree = ET.fromstring(source)

        # post-processing
        if type_ > 0:
            root = tree.getroot()
            nodes = root.iter()

            # remove prefix from node tags
            for node in nodes:
                node.tag = node.tag.lstrip('id.')

            # convert the length attribute from str to int
            length_nodes = root.findall('.//*[@length]')
            for node in length_nodes:
                node.attrib['length'] = int(node.attrib['length'])
            
            copy2(root, self.root)
            self._update_map()

        LOGGER.report('CATH local database built in %.2fs.', '_cath_update')
Example #19
def searchDali(pdbId, chainId, daliURL=None, subset='fullPDB', **kwargs):
    """Search Dali server with input of PDB ID and chain ID.
    Dali server: http://ekhidna2.biocenter.helsinki.fi/dali/
    
    :arg subset: fullPDB, PDB25, PDB50, PDB90
    :type subset: str
    
    """

    LOGGER.timeit('_dali')
    # timeout = 120
    timeout = kwargs.pop('timeout', 120)

    if daliURL is None:
        daliURL = "http://ekhidna2.biocenter.helsinki.fi/cgi-bin/sans/dump.cgi"
    pdbId = pdbId.lower()
    pdb_chain = pdbId + chainId
    parameters = {
        'cd1': pdb_chain,
        'method': 'search',
        'title': 'Title_' + pdb_chain,
        'address': ''
    }
    enc_params = urllib.urlencode(parameters).encode('utf-8')
    request = urllib2.Request(daliURL, enc_params)
    try_error = 3
    while try_error >= 0:
        try:
            url = urllib2.urlopen(request).url
            break
        except:
            try_error -= 1
            if try_error >= 0:
                LOGGER.sleep(
                    2, '. Connection error happened. Trying to reconnect...')
                continue
            else:
                url = urllib2.urlopen(request).url
                break
    if url.split('.')[-1].lower() in ['html', 'php']:
        # print('test -1: '+url)
        url = url.replace(url.split('/')[-1], '')
    LOGGER.debug(
        'Submitted Dali search for PDB and chain "{0} and {1}".'.format(
            pdbId, chainId))
    LOGGER.info(url)
    LOGGER.clear()
    obj = DaliRecord(url,
                     pdbId,
                     chainId,
                     subset=subset,
                     timeout=timeout,
                     **kwargs)
    #if obj.isSuccess:

    return obj
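The retry loop in searchDali is a common pattern for flaky network calls: retry a fixed number of times, sleeping between attempts, and let the final failure propagate. A generic sketch using Python 3's urllib (the function name and parameters are invented):

import time
from urllib import request
from urllib.error import URLError

def openWithRetries(url, tries=3, delay=2):
    for attempt in range(tries):
        try:
            return request.urlopen(url)
        except URLError:
            if attempt == tries - 1:
                raise            # give up after the last attempt
            time.sleep(delay)    # wait before reconnecting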
Example #20
 def superpose(self):
     """Superpose the ensemble onto the reference coordinates."""
     
     if self._coords is None:
         raise ValueError('coordinates are not set, use `setCoords`')
     if self._confs is None or len(self._confs) == 0: 
         raise ValueError('conformations are not set, use `addCoordset`')
     LOGGER.timeit()
     self._superpose(trans=True) # trans kwarg is used by PDBEnsemble
     LOGGER.timing('Superposition completed in %.2f seconds.')
Example #21
    def superpose(self):
        """Superpose the ensemble onto the reference coordinates."""

        if self._coords is None:
            raise ValueError('coordinates are not set, use `setCoords`')
        if self._confs is None or len(self._confs) == 0:
            raise ValueError('conformations are not set, use `addCoordset`')
        LOGGER.timeit('_prody_ensemble')
        self._superpose(trans=True)  # trans kwarg is used by PDBEnsemble
        LOGGER.report('Superposition completed in %.2f seconds.',
                      '_prody_ensemble')
Example #22
File: functions.py, Project: fongchun/ProDy
def loadAtoms(filename):
    """Returns :class:`.AtomGroup` instance loaded from *filename* using
    :func:`numpy.load` function.  See also :func:`saveAtoms`."""

    LOGGER.timeit('_prody_loadatoms')
    attr_dict = load(filename)
    files = set(attr_dict.files)

    if 'n_atoms' not in files:
        raise ValueError('{0} is not a valid atomic data file'
                         .format(repr(filename)))
    title = str(attr_dict['title'])

    if 'coordinates' in files:
        coords = attr_dict['coordinates']
        ag = AtomGroup(title)
        ag._n_csets = int(attr_dict['n_csets'])
        ag._coords = coords
    ag._n_atoms = int(attr_dict['n_atoms'])
    ag._setTimeStamp()
    if 'flagsts' in files:
        ag._flagsts = int(attr_dict['flagsts'])

    if 'bonds' in files and 'bmap' in files and 'numbonds' in files:
        ag._bonds = attr_dict['bonds']
        ag._bmap = attr_dict['bmap']
        ag._data['numbonds'] = attr_dict['numbonds']

    skip_flags = set()

    for label, data in attr_dict.items():
        if label in SKIPLOAD:
            continue
        if data.ndim == 1 and data.dtype == bool:
            if label in skip_flags:
                continue
            else:
                ag._setFlags(label, data)
                skip_flags.update(flags.ALIASES.get(label, [label]))
        else:
            ag.setData(label, data)

    for label in ['segindex', 'chindex', 'resindex']:
        if label in attr_dict:
            ag._data[label] = attr_dict[label]

    if ag.numCoordsets() > 0:
        ag._acsi = 0

    if 'cslabels' in files:
        ag.setCSLabels(list(attr_dict['cslabels']))

    LOGGER.report('Atom group was loaded in %.2fs.', '_prody_loadatoms')
    return ag
Example #23
File: mechstiff.py, Project: kaynakb/ProDy
def calcMechStiff(modes, coords, kbt=1.):
    """Calculate stiffness matrix calculated using :class:`.ANM` instance. 
    Method described in [EB08]_ and [KMR17]_. 

    .. [KMR17] Mikulska-Ruminska K., Kulik A.J., Benadiba C., Bahar I., Dietler G., Nowak W.
        Nanomechanics of multidomain neuronal cell adhesion protein contactin revealed by 
        single molecule AFM and SMD. *Sci Rep* **2017** 7:8852. 

    :arg coords: a coordinate set or an object with ``getCoords`` method
    :type coords: :class:`numpy.ndarray`.

    :arg n_modes: number of non-zero eigenvalues/vectors to calculate.
        If **None** is given, all modes will be calculated (3x number of atoms).
    :type n_modes: int or **None**, default is 20.
    
    Authors: Mustafa Tekpinar & Karolina Mikulska-Ruminska & Cihan Kaya
    """

    try:
        coords = (coords._getCoords()
                  if hasattr(coords, '_getCoords') else coords.getCoords())
    except AttributeError:
        try:
            checkCoords(coords)
        except TypeError:
            raise TypeError('coords must be a Numpy array or an object '
                            'with `getCoords` method')
    try:
        is3d = modes.is3d()
        eigvecs = modes.getArray().T.flatten()
        eigvals = modes.getEigvals()
    except:
        raise TypeError('modes must be either an NMA or ModeSet object')

    if not is3d:
        raise TypeError('modes must be 3-dimensional')

    n_atoms = modes.numAtoms()
    n_modes = modes.numModes()

    LOGGER.timeit('_sm')

    sm = np.zeros((n_atoms, n_atoms), np.double)
    from .smtools import calcSM
    LOGGER.info('Calculating stiffness matrix.')

    calcSM(coords, sm, eigvecs, eigvals, n_atoms, n_modes, float(kbt))

    LOGGER.report('Stiffness matrix calculated in %.2lfs.', label='_sm')

    LOGGER.info('The range of effective force constant is: {0} to {1}.'.format(
        *calcStiffnessRange(sm)))

    return sm
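A hypothetical usage sketch for calcMechStiff, assuming the standard ProDy workflow as exposed in recent versions (the PDB ID is arbitrary; per the docstring, all modes should be computed, so n_modes=None is passed to calcModes):

from prody import parsePDB, ANM, calcMechStiff

ca = parsePDB('1ubi', subset='ca')   # C-alpha coordinates
anm = ANM('1ubi')
anm.buildHessian(ca)
anm.calcModes(n_modes=None)          # None computes all modes
sm = calcMechStiff(anm, ca)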
Example #25
File: functions.py, Project: prody/ProDy
def loadAtoms(filename):
    """Returns :class:`.AtomGroup` instance loaded from *filename* using
    :func:`numpy.load` function.  See also :func:`saveAtoms`."""

    LOGGER.timeit("_prody_loadatoms")
    attr_dict = load(filename)
    files = set(attr_dict.files)

    if not "n_atoms" in files:
        raise ValueError("{0} is not a valid atomic data file".format(repr(filename)))
    title = str(attr_dict["title"])

    if "coordinates" in files:
        coords = attr_dict["coordinates"]
        ag = AtomGroup(title)
        ag._n_csets = int(attr_dict["n_csets"])
        ag._coords = coords
    ag._n_atoms = int(attr_dict["n_atoms"])
    ag._setTimeStamp()
    if "flagsts" in files:
        ag._flagsts = int(attr_dict["flagsts"])

    if "bonds" in files and "bmap" in files and "numbonds" in files:
        ag._bonds = attr_dict["bonds"]
        ag._bmap = attr_dict["bmap"]
        ag._data["numbonds"] = attr_dict["numbonds"]

    skip_flags = set()

    for label, data in attr_dict.items():
        if label in SKIPLOAD:
            continue
        if data.ndim == 1 and data.dtype == bool:
            if label in skip_flags:
                continue
            else:
                ag._setFlags(label, data)
                skip_flags.update(flags.ALIASES.get(label, [label]))
        else:
            ag.setData(label, data)

    for label in ["segindex", "chindex", "resindex"]:
        if label in attr_dict:
            ag._data[label] = attr_dict[label]

    if ag.numCoordsets() > 0:
        ag._acsi = 0

    if "cslabels" in files:
        ag.setCSLabels(list(attr_dict["cslabels"]))

    LOGGER.report("Atom group was loaded in %.2fs.", "_prody_loadatoms")
    return ag
Example #26
File: analysis.py, Project: npabon/ProDy
def buildSeqidMatrix(msa, turbo=True):
    """Return sequence identity matrix for *msa*."""

    msa = getMSA(msa)

    LOGGER.timeit('_seqid')
    from .seqtools import msaeye

    seqid = msaeye(msa, turbo=bool(turbo))

    LOGGER.report('Sequence identity matrix was calculated in %.2fs.',
                  '_seqid')
    return seqid
Example #27
File: analysis.py, Project: prody/ProDy
def buildSeqidMatrix(msa, turbo=True):
    """Returns sequence identity matrix for *msa*."""

    msa = getMSA(msa)

    LOGGER.timeit("_seqid")
    from .seqtools import msaeye

    dim = msa.shape[0]
    seqid = msaeye(msa, ones((dim, dim), float), turbo=bool(turbo))

    LOGGER.report("Sequence identity matrix was calculated in %.2fs.", "_seqid")
    return seqid
Example #29
File: analysis.py, Project: ielushuai/ProDy
def buildDirectInfoMatrix(msa,
                          seqid=.8,
                          pseudo_weight=.5,
                          refine=False,
                          **kwargs):
    """Returns direct information matrix calculated for *msa*, which may be an
    :class:`.MSA` instance or a 2D Numpy character array.

    Sequences sharing sequence identity of *seqid* or more with another
    sequence are regarded as similar sequences for calculating their weights
    using :func:`.calcMeff`.

    *pseudo_weight* is the weight used for pseudocount probabilities.

    Sequences are not refined by default. When *refine* is set **True**,
    the MSA will be refined by the first sequence and the shape of direct
    information matrix will be smaller.
    """

    msa = getMSA(msa)
    from .msatools import msadipretest, msadirectinfo1, msadirectinfo2
    from numpy import matrix

    LOGGER.timeit('_di')
    if msa.shape[0] < 250:
        LOGGER.warning(
            'DI performs best with a larger number of sequences; '
            'at least 250 sequences are recommended.')
    refine = 1 if refine else 0
    # msadipretest gets parameters from the msa to set the matrix size
    length, q = msadipretest(msa, refine=refine)
    c = matrix.dot(matrix(zeros((length * q, 1), float)),
                   matrix(zeros((1, length * q), float)))
    prob = zeros((length, q + 1), float)
    # msadirectinfo1 returns c to be inverted and prob to be used
    meff, n, length, c, prob = msadirectinfo1(msa,
                                              c,
                                              prob,
                                              theta=1. - seqid,
                                              pseudocount_weight=pseudo_weight,
                                              refine=refine,
                                              q=q + 1)

    c = c.I

    di = zeros((length, length), float)
    # get final DI
    di = msadirectinfo2(n, length, c, prob, di, q + 1)
    del prob, c
    LOGGER.report('DI matrix was calculated in %.2fs.', '_di')
    return di
Example #30
File: mechstiff.py, Project: fongchun/ProDy
def calcMechStiff(modes, coords, kbt=1.):
    """Calculate stiffness matrix calculated using :class:`.ANM` instance. 
    Method described in [EB08]_. 

    :arg coords: a coordinate set or an object with ``getCoords`` method
    :type coords: :class:`numpy.ndarray`.

    :arg n_modes: number of non-zero eigenvalues/vectors to calculate.
        If **None** is given, all modes will be calculated (3x number of atoms).
    :type n_modes: int or **None**, default is 20.
    
    Authors: Mustafa Tekpinar & Karolina Mikulska-Ruminska & Cihan Kaya
    """

    try:
        coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                    coords.getCoords())
    except AttributeError:
        try:
            checkCoords(coords)
        except TypeError:
            raise TypeError('coords must be a Numpy array or an object '
                            'with `getCoords` method')
    try:
        is3d = modes.is3d()
        eigvecs = modes.getArray().T.flatten()
        eigvals = modes.getEigvals()
    except:
        raise TypeError('modes must be either an NMA or ModeSet object')

    if not is3d:
        raise TypeError('modes must be 3-dimensional')

    n_atoms = modes.numAtoms()
    n_modes = modes.numModes()
    
    LOGGER.timeit('_sm')

    sm = np.zeros((n_atoms, n_atoms), np.double)
    from .smtools import calcSM
    LOGGER.info('Calculating stiffness matrix.')

    calcSM(coords, sm, eigvecs, eigvals,
            n_atoms, n_modes, float(kbt))

    LOGGER.report('Stiffness matrix calculated in %.2lfs.', label='_sm')
    
    LOGGER.info('The range of effective force constant is: {0} to {1}.'
                                .format(*calcStiffnessRange(sm)))

    return sm
Example #31
File: exanm.py, Project: minghao2016/ProDy
    def buildHessian(self, coords, cutoff=15., gamma=1., **kwargs):
        """Build Hessian matrix for given coordinate set. 
        **kwargs** are passed to :method:`.buildMembrane`.

        :arg coords: a coordinate set or an object with ``getCoords`` method
        :type coords: :class:`numpy.ndarray`

        :arg cutoff: cutoff distance (Å) for pairwise interactions,
            default is 15.0 Å
        :type cutoff: float

        :arg gamma: spring constant, default is 1.0
        :type gamma: float
        """

        atoms = coords
        turbo = kwargs.pop('turbo', True)

        try:
            coords = (coords._getCoords()
                      if hasattr(coords, '_getCoords') else coords.getCoords())
        except AttributeError:
            try:
                checkCoords(coords)
            except TypeError:
                raise TypeError('coords must be a Numpy array or an object '
                                'with `getCoords` method')

        n_atoms = int(coords.shape[0])

        if self._membrane is None:
            coords = self.buildMembrane(atoms, **kwargs)
        else:
            coords = self._combined.getCoords()

        system = zeros(coords.shape[0], dtype=bool)
        system[:n_atoms] = True

        LOGGER.timeit('_exanm')

        if turbo:
            self._hessian = buildReducedHessian(coords, system, cutoff, gamma,
                                                **kwargs)
        else:
            super(exANM, self).buildHessian(coords, cutoff, gamma, **kwargs)
            system = np.repeat(system, 3)
            self._hessian = _reduceModel(self._hessian, system)

        LOGGER.report('Hessian was built in %.2fs.', label='_exanm')
        self._dof = self._hessian.shape[0]
        self._n_atoms = n_atoms
Example #32
def parseEMDStream(stream, **kwargs):
    """ Returns an :class:`.AtomGroup` containing EMD data parsed from a stream of EMD file.

    :arg stream: Anything that implements the method ``readlines``
        (e.g. :class:`file`, buffer, stdin)"""

    cutoff = kwargs.get('cutoff', None)
    if cutoff is not None:
        cutoff = float(cutoff)

    n_nodes = int(kwargs.get('n_nodes', 1000))
    num_iter = int(kwargs.get('num_iter', 20))
    return_map = kwargs.get('return_map',False)
    make_nodes = kwargs.get('make_nodes',False)

    if return_map is False and make_nodes is False:
        LOGGER.warn('At least one of return_map and make_nodes should be True. '
                    'return_map has been set to True.')
        # update the local flag that the parsing logic below actually checks
        return_map = True

    title_suffix = kwargs.get('title_suffix','')
    atomgroup = AtomGroup(str(kwargs.get('title', 'Unknown')) + title_suffix)

    if make_nodes:
        LOGGER.info('Building coordinates from electron density map. This may take a while.')
        LOGGER.timeit()

        if return_map:
            emd, atomgroup = _parseEMDLines(atomgroup, stream, cutoff=cutoff, n_nodes=n_nodes, \
                                            num_iter=num_iter, return_map=return_map, \
                                            make_nodes=make_nodes)
        else:
            atomgroup = _parseEMDLines(atomgroup, stream, cutoff=cutoff, n_nodes=n_nodes, \
                                       num_iter=num_iter, return_map=return_map, \
                                       make_nodes=make_nodes)
        LOGGER.report('{0} atoms and {1} coordinate sets were '
                      'parsed in %.2fs.'.format(atomgroup.numAtoms(), atomgroup.numCoordsets()))
    else: 
        emd = _parseEMDLines(atomgroup, stream, cutoff=cutoff, n_nodes=n_nodes, \
                             num_iter=num_iter, return_map=return_map, \
                             make_nodes=make_nodes)

    if make_nodes:
        if return_map:
            return emd, atomgroup
        else:
            return atomgroup
    else:
        return emd
Example #33
def recoverEVmutFeatures(SAVs):
    LOGGER.timeit('_EVmut')
    LOGGER.info('Recovering EVmutation data...')

    def find_matching_files(file_list, acc, pos):
        match_files = []
        for fname in [f for f in file_list if f.startswith(acc)]:
            basename = splitext(fname)[0]
            res_range = basename.split("_")[-1]
            res_i = int(res_range.split("-")[0])
            res_f = int(res_range.split("-")[1])
            if res_i <= int(pos) <= res_f:
                match_files.append(fname)
        return match_files

    feat_dtype = np.dtype([(f, 'f') for f in EVMUT_FEATS])
    features = np.zeros(len(SAVs), dtype=feat_dtype)
    features[:] = np.nan

    # recover EVmutation data
    EVmut_dir = pathEVmutationFolder()
    if EVmut_dir is None:
        raise RuntimeError('EVmutation folder not set')
    file_list = [basename(f) for f in glob(join(EVmut_dir, '*.csv'))]
    if not file_list:
        raise RuntimeError('EVmutation folder does not contain any .csv files')
    for i, SAV in enumerate(SAVs):
        acc, pos, wt_aa, mut_aa, SAV_txt = SAV
        #       LOGGER.info('Recovering EVmutation data for {}.'.format(SAV_txt))
        # find files containing given SAV coordinates
        match_files = find_matching_files(file_list, acc, pos)
        # recover data and average them if multiple values are found
        mutant = f'{wt_aa}{pos}{mut_aa}'
        data = []
        for fname in match_files:
            with open(join(EVmut_dir, fname), 'r') as f:
                for line in f:
                    if line.startswith(mutant):
                        l = line.strip().split(';')[4:8]
                        data.append(l)
                        break
        data = np.array(data, dtype=float)
        if len(data) == 0:
            #           LOGGER.warn(f"EVmutation data not found for '{SAV_txt}'")
            continue
        else:
            features[i] = tuple(np.mean(data, axis=0))

    LOGGER.report('EVmutation scores recovered in %.1fs.', '_EVmut')
    return features
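The feature table above relies on a NumPy structured array, which gives a named field per EVmutation score and allows NaN-filling for missing records. A minimal sketch of that pattern (the field names are invented for illustration):

import numpy as np

feat_dtype = np.dtype([(f, 'f') for f in ('score_a', 'score_b', 'score_c')])
features = np.zeros(2, dtype=feat_dtype)
features[:] = np.nan                  # mark every record as missing
features[0] = (1.0, 2.0, 3.0)         # fill one record with recovered values
print(features['score_a'])            # access a whole column by field name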
Example #34
 def _launchDSSP(self, ag):
     LOGGER.info('Running DSSP...')
     LOGGER.timeit('_DSSP')
     try:
         pdb_file = writePDB('_temp.pdb', ag, secondary=False)
         dssp_file = execDSSP(pdb_file, outputname='_temp')
         ag = parseDSSP(dssp_file, ag)
     except:
         raise
     finally:
         os.remove('_temp.pdb')
         os.remove('_temp.dssp')
     LOGGER.report('DSSP finished in %.1fs.', '_DSSP')
     return ag
Example #35
File: PDB.py, Project: shulp2211/rhapsody
 def _launchDSSP(self, ag):
     LOGGER.info('Running DSSP...')
     LOGGER.timeit('_DSSP')
     pdb_file = writePDB('_temp.pdb', ag, secondary=False)
     try:
         dssp_file = execDSSP(pdb_file, outputname='_temp')
     except EnvironmentError:
         raise EnvironmentError("dssp executable not found: please install "
                                "with 'sudo apt install dssp'")
     ag = parseDSSP(dssp_file, ag)
     os.remove('_temp.pdb')
     os.remove('_temp.dssp')
     LOGGER.report('DSSP finished in %.1fs.', '_DSSP')
     return ag
Example #36
File: gnm.py, Project: Python3pkg/ProDy
    def getNormDistFluct(self, coords):
        """Normalized distance fluctuation
        """

        model = self.getModel()
        LOGGER.info('Number of chains: {0}, chains: {1}.'
                     .format(len(list(set(coords.getChids()))), \
                                 list(set(coords.getChids()))))

        try:
            #coords = coords.select('protein and name CA')
            coords = (coords._getCoords()
                      if hasattr(coords, '_getCoords') else coords.getCoords())
        except AttributeError:
            try:
                checkCoords(coords)
            except TypeError:
                raise TypeError('coords must be a Numpy array or an object '
                                'with `getCoords` method')

        if not isinstance(model, NMA):
            LOGGER.info('Calculating new model')
            model = GNM('prot analysis')
            model.buildKirchhoff(coords)
            model.calcModes()

        linalg = importLA()
        n_atoms = model.numAtoms()
        n_modes = model.numModes()
        LOGGER.timeit('_ndf')

        from .analysis import calcCrossCorr
        from numpy import linalg as LA
        # <dRi, dRi>, <dRj, dRj> = 1
        crossC = 2 - 2 * calcCrossCorr(model)
        r_ij = np.zeros((n_atoms, n_atoms, 3))

        for i in range(n_atoms):
            for j in range(i + 1, n_atoms):
                r_ij[i][j] = coords[j, :] - coords[i, :]
                r_ij[j][i] = r_ij[i][j]
        # compute all pairwise distance norms once, after filling r_ij
        r_ij_n = LA.norm(r_ij, axis=2)

        #with np.errstate(divide='ignore'):
        r_ij_n[np.diag_indices_from(r_ij_n)] = 1e-5  # div by 0
        crossC = abs(crossC)
        normdistfluct = np.divide(np.sqrt(crossC), r_ij_n)
        LOGGER.report('NDF calculated in %.2lfs.', label='_ndf')
        normdistfluct[np.diag_indices_from(normdistfluct)] = 0  # div by 0
        return normdistfluct
Example #37
File: gnm.py, Project: sixpi/ProDy
    def getNormDistFluct(self, coords):
        """Normalized distance fluctuation
        """
            
        model = self.getModel()
        LOGGER.info('Number of chains: {0}, chains: {1}.'
                     .format(len(list(set(coords.getChids()))), \
                                 list(set(coords.getChids()))))

        try:
            #coords = coords.select('protein and name CA')
            coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                coords.getCoords())
        except AttributeError:
            try:
                checkCoords(coords)
            except TypeError:
                raise TypeError('coords must be a Numpy array or an object '
                                                'with `getCoords` method')
        
        if not isinstance(model, NMA):
            LOGGER.info('Calculating new model')
            model = GNM('prot analysis')
            model.buildKirchhoff(coords)
            model.calcModes() 
            
        linalg = importLA()
        n_atoms = model.numAtoms()
        n_modes = model.numModes()
        LOGGER.timeit('_ndf')
    
        from .analysis import calcCrossCorr
        from numpy import linalg as LA
        # <dRi, dRi>, <dRj, dRj> = 1
        crossC = 2 - 2 * calcCrossCorr(model)
        r_ij = np.zeros((n_atoms, n_atoms, 3))

        for i in range(n_atoms):
            for j in range(i + 1, n_atoms):
                r_ij[i][j] = coords[j, :] - coords[i, :]
                r_ij[j][i] = r_ij[i][j]
        # compute all pairwise distance norms once, after filling r_ij
        r_ij_n = LA.norm(r_ij, axis=2)

        #with np.errstate(divide='ignore'):
        r_ij_n[np.diag_indices_from(r_ij_n)] = 1e-5  # div by 0
        crossC = abs(crossC)
        normdistfluct = np.divide(np.sqrt(crossC), r_ij_n)
        LOGGER.report('NDF calculated in %.2lfs.', label='_ndf')
        normdistfluct[np.diag_indices_from(normdistfluct)] = 0  # div by 0
        return normdistfluct
Example #38
    def buildMechStiff(self, coords, n_modes=None, kbt=1.):

        """Calculate stiffness matrix calculated using :class:`.ANM` instance. 
        Method described in [EB08]_. 
    
        .. [EB08] Eyal E., Bahar I. Toward a Molecular Understanding of 
            the Anisotropic Response of Proteins to External Forces:
            Insights from Elastic Network Models. *Biophys J* **2008** 94:3424-34355. 
    
        :arg coords: a coordinate set or an object with ``getCoords`` method
        :type coords: :class:`numpy.ndarray`.
        :arg n_modes: number of non-zero eigenvalues/vectors to calculate.
            If ``None`` is given, all modes will be calculated (3x number of atoms).
        :type n_modes: int or ``None``, default is 20.
        
        Authors: Mustafa Tekpinar & Karolina Mikulska-Ruminska & Cihan Kaya
        """

        try:
            coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                      coords.getCoords())
        except AttributeError:
            try:
                checkCoords(coords)
            except TypeError:
                raise TypeError('coords must be a Numpy array or an object '
                                'with `getCoords` method')
        n_atoms = natoms = self._n_atoms
        n_modes = 3 * n_atoms

        self.calcModes(n_modes=None, zeros=True)
        
        LOGGER.timeit('_sm')
        eigvecs = (np.transpose(self._array)).flatten()
        eigvals = np.transpose(self._eigvals)
        natoms = n_atoms

        sm = np.zeros((n_atoms, n_atoms), np.double)
        from .smtools import calcSM
        LOGGER.info('Calculating stiffness matrix.')

        calcSM(coords, sm, eigvecs, eigvals,
                natoms, n_modes, float(kbt))

        LOGGER.report('Stiffness matrix calculated in %.2lfs.', label='_sm')

        self._stiffness = sm
        
        LOGGER.info('The range of effective force constant is: {0} to {1}.'
                                   .format(np.min(sm[np.nonzero(sm)]), np.amax(sm)))
Example #39
File: rtb.py, Project: nffaruk/ProDy
    def calcProjection(self, coords, blocks, **kwargs):
        natoms = self._n_atoms

        if natoms != len(blocks):
            raise ValueError('len(blocks) must match number of atoms')

        LOGGER.timeit('_rtb')
        from collections import defaultdict
        i = Increment()
        d = defaultdict(i)
        blocks = np.array([d[b] for b in blocks], dtype='int32')

        try:
            from collections import Counter
        except ImportError:
            counter = defaultdict(int)
            for b in blocks:
                counter[b] += 1
        else:
            counter = Counter(blocks)

        nblocks = len(counter)
        maxsize = 1
        nones = 0
        while counter:
            _, size = counter.popitem()
            if size == 1:
                nones += 1
            if size > maxsize:
                maxsize = size
        LOGGER.info(
            'System has {0} blocks, the largest with {1} of {2} units.'.format(
                nblocks, maxsize, natoms))
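        # each block contributes 6 rigid-body DOFs; single-unit blocks only 3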
        nb6 = nblocks * 6 - nones * 3

        coords = coords.T.astype(float, order='C')

        hessian = self._hessian
        self._project = project = np.zeros((natoms * 3, nb6), float)

        from .rtbtools import calc_projection

        calc_projection(coords, blocks, project, natoms, nblocks, nb6, maxsize)

        self._hessian = project.T.dot(hessian).dot(project)
        self._dof = self._hessian.shape[0]
        LOGGER.report(
            'Block Hessian and projection matrix were calculated in %.2fs.',
            label='_rtb')
Example #40
File: emdfile.py Project: prody/ProDy
def parseEMDStream(stream, **kwargs):
    """ Returns an :class:`.AtomGroup` containing EMD data parsed from a stream of EMD file.

    :arg stream: Any object with the method ``readlines``
        (e.g. :class:`file`, buffer, stdin)"""

    cutoff = kwargs.get('cutoff', None)
    if cutoff is not None:
        cutoff = float(cutoff)

    n_nodes = int(kwargs.get('n_nodes', 1000))
    num_iter = int(kwargs.get('num_iter', 20))
    map = kwargs.get('map', True)
    make_nodes = kwargs.get('make_nodes', False)

    if map is False and make_nodes is False:
        LOGGER.warn('At least one of map and make_nodes should be True. '
                    'Since map was explicitly set to False, make_nodes '
                    'has been set to True instead.')
        make_nodes = True

    title_suffix = kwargs.get('title_suffix','')
    atomgroup = AtomGroup(str(kwargs.get('title', 'Unknown')) + title_suffix)
    atomgroup._n_atoms = n_nodes

    if make_nodes:
        LOGGER.info('Building coordinates from electron density map. This may take a while.')
        LOGGER.timeit()

        if map:
            emd, atomgroup = _parseEMDLines(atomgroup, stream, cutoff=cutoff,
                                            n_nodes=n_nodes, num_iter=num_iter,
                                            map=map, make_nodes=make_nodes)
        else:
            atomgroup = _parseEMDLines(atomgroup, stream, cutoff=cutoff,
                                       n_nodes=n_nodes, num_iter=num_iter,
                                       map=map, make_nodes=make_nodes)

        LOGGER.report('{0} atoms and {1} coordinate sets were '
                      'parsed in %.2fs.'.format(atomgroup.numAtoms(), atomgroup.numCoordsets()))
    else:
        emd = _parseEMDLines(atomgroup, stream, cutoff=cutoff, n_nodes=n_nodes,
                             num_iter=num_iter, map=map, make_nodes=make_nodes)

    if make_nodes:
        if map:
            return emd, atomgroup
        else:
            return atomgroup
    else:
        return emd
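A sketch of how this parser might be driven directly, assuming parseEMDStream is importable; the file name 'emd_1960.map' and the parameter values are illustrative:

# hypothetical local density file, opened as a stream
with open('emd_1960.map', 'rb') as stream:
    emd, nodes = parseEMDStream(stream, cutoff=1.2, n_nodes=800,
                                num_iter=20, map=True, make_nodes=True)
print(nodes.numAtoms())  # bead model built from the density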
Example #41
File: clustenm.py Project: SHZ66/ProDy
    def setAtoms(self, atoms, pH=7.0):

        '''Sets atoms.

        :arg atoms: *atoms* parsed by parsePDB

        :arg pH: pH based on which to select protonation states for adding
            missing hydrogens, default is 7.0
        :type pH: float
        '''

        atoms = atoms.select('not hetatm')

        self._nuc = atoms.select('nucleotide')

        if self._nuc is not None:

            idx_p = []
            for c in set(self._nuc.getChids()):  # unique chain identifiers
                tmp = self._nuc[c].iterAtoms()
                for a in tmp:
                    if a.getName() in ['P', 'OP1', 'OP2', 'OP3']:
                        idx_p.append(a.getIndex())

            if idx_p:
                nsel = 'not index ' + ' '.join([str(i) for i in idx_p])
                atoms = atoms.select(nsel)

        if self._isBuilt():
            super(ClustENM, self).setAtoms(atoms)
        else:
            LOGGER.info('Fixing the structure ...')
            LOGGER.timeit('_clustenm_fix')
            self._ph = pH
            self._fix(atoms)
            LOGGER.report('The structure was fixed in %.2fs.',
                          label='_clustenm_fix')

            if self._nuc is None:
                self._idx_cg = self._atoms.ca.getIndices()
                self._n_cg = self._atoms.ca.numAtoms()
            else:
                self._idx_cg = self._atoms.select("name CA C2 C4' P").getIndices()
                self._n_cg = self._atoms.select("name CA C2 C4' P").numAtoms()

            self._n_atoms = self._atoms.numAtoms()
            self._indices = None
Example #42
File: ensemble.py Project: kaynakb/ProDy
    def superpose(self, **kwargs):
        """Superpose the ensemble onto the reference coordinates.
        
        :arg ref: index of the reference coordinate set. If **None**, the
            average coordinates will be used as the reference. Default is
            **None**
        :type ref: int
        """

        ref = kwargs.pop('ref', None)
        if self._coords is None:
            raise ValueError('coordinates are not set, use `setCoords`')
        if self._confs is None or len(self._confs) == 0:
            raise ValueError('conformations are not set, use `addCoordset`')
        LOGGER.timeit('_prody_ensemble')
        self._superpose(ref=ref)  # trans kwarg is used by PDBEnsemble
        LOGGER.report('Superposition completed in %.2f seconds.',
                      '_prody_ensemble')
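A short usage sketch, assuming `ens` is a populated ProDy Ensemble with coordinates and conformations already set:

ens.superpose()          # superpose onto the average coordinates
ens.superpose(ref=0)     # or onto the first conformation
rmsds = ens.getRMSDs()   # RMSDs to the reference after superposition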
Example #43
File: gnm.py Project: cgseitz/ProDy
    def calcModes(self, n_modes=20, zeros=False, turbo=True):
        """Calculate normal modes.  This method uses :func:`scipy.linalg.eigh`
        function to diagonalize the Kirchhoff matrix. When Scipy is not found,
        :func:`numpy.linalg.eigh` is used.

        :arg n_modes: number of non-zero eigenvalues/vectors to calculate.
              If **None** or ``'all'`` is given, all modes will be calculated.
        :type n_modes: int or None, default is 20

        :arg zeros: If **True**, modes with zero eigenvalues will be kept.
        :type zeros: bool, default is **False**

        :arg turbo: Use a memory intensive, but faster way to calculate modes.
        :type turbo: bool, default is **True**

        """

        if self._kirchhoff is None:
            raise ValueError('Kirchhoff matrix is not built or set')
        if str(n_modes).lower() == 'all':
            n_modes = None
        assert n_modes is None or isinstance(n_modes, int) and n_modes > 0, \
            'n_modes must be a positive integer'
        assert isinstance(zeros, bool), 'zeros must be a boolean'
        assert isinstance(turbo, bool), 'turbo must be a boolean'
        self._clear()
        LOGGER.timeit('_gnm_calc_modes')
        values, vectors, vars = solveEig(self._kirchhoff,
                                         n_modes=n_modes,
                                         zeros=zeros,
                                         turbo=turbo,
                                         expct_n_zeros=1)

        self._eigvals = values
        self._array = vectors
        self._vars = vars
        self._trace = self._vars.sum()
        self._n_modes = len(self._eigvals)
        if self._n_modes > 1:
            LOGGER.report('{0} modes were calculated in %.2fs.'.format(
                self._n_modes), label='_gnm_calc_modes')
        else:
            LOGGER.report('{0} mode was calculated in %.2fs.'.format(
                self._n_modes), label='_gnm_calc_modes')
Example #44
File: ensemble.py Project: kaynakb/ProDy
    def iterpose(self, rmsd=0.0001, quiet=False):
        """Iteratively superpose the ensemble until convergence.  Initially,
        all conformations are aligned with the reference coordinates.  Then
        mean coordinates are calculated, and are set as the new reference
        coordinates.  This is repeated until reference coordinates do not
        change.  This is determined by the value of RMSD between the new and
        old reference coordinates.  Note that at the end of the iterative
        procedure the reference coordinate set will be the average of the
        conformations in the ensemble.

        :arg rmsd: change in reference coordinates to determine convergence,
            default is 0.0001 Å RMSD
        :type rmsd: float"""

        if self._coords is None:
            raise AttributeError('coordinates are not set, use `setCoords`')
        if self._confs is None or len(self._confs) == 0:
            raise AttributeError('conformations are not set, use '
                                 '`addCoordset`')
        LOGGER.info('Starting iterative superposition:')
        LOGGER.timeit('_prody_ensemble')
        rmsdif = 1
        step = 0
        weights = self._weights
        length = len(self)
        if weights is not None:
            if weights.ndim == 3:
                weightsum = weights.sum(axis=0)
                weightsum[weightsum == 0.] = 1.  # add pseudocount to avoid nan
            else:
                weightsum = length

        while rmsdif > rmsd:
            self._superpose(quiet=quiet)
            if weights is None:
                newxyz = self._confs.sum(0) / length
            else:
                newxyz = (self._confs * weights).sum(0) / weightsum
            rmsdif = getRMSD(self._coords, newxyz)
            self._coords = newxyz
            step += 1
            LOGGER.info('Step #{0}: RMSD difference = {1:.4e}'.format(
                step, rmsdif))
        LOGGER.report('Iterative superposition completed in %.2fs.',
                      '_prody_ensemble')
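The update step inside the loop is just a (weighted) mean over conformations; a toy numpy sketch of the same computation, with illustrative shapes:

import numpy as np

confs = np.random.rand(10, 50, 3)   # 10 conformations, 50 atoms
weights = np.ones((10, 50, 1))      # per-atom weights, ndim == 3
weightsum = weights.sum(axis=0)
weightsum[weightsum == 0.] = 1.     # same pseudocount as above
newxyz = (confs * weights).sum(0) / weightsum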
Example #45
File: msafile.py Project: prody/ProDy
def parseMSAHeader(filename, **kwargs):

    try:
        fileok = isfile(filename)
    except TypeError:
        raise TypeError('filename must be a string')
    else:
        if not fileok:
            raise IOError('[Errno 2] No such file or directory: ' +
                          repr(filename))

    LOGGER.timeit('_parsemsa')

    title, ext = splitext(filename)
    title = split(title)[1]
    if ext.lower() == '.gz':
        title, ext = splitext(title)
    # resolve the format up front so it is defined on every branch
    format = MSAEXTMAP.get(ext)

    if format == STOCKHOLM:
        from collections import defaultdict
        lines = defaultdict(list)

        stream = open(filename, 'r')

        for loc, line in enumerate(stream):
            if not line.strip():
                continue  # skip blank lines to avoid an IndexError below
            startswith = line.split()[0]
            if line[0] != '#':
                break
            lines[startswith].append((loc, line))

        stream.close()
    else:
        raise TypeError('Only STOCKHOLM type MSAs are supported at present.')

    return lines
Example #46
File: analysis.py Project: prody/ProDy
def buildMutinfoMatrix(msa, ambiguity=True, turbo=True, **kwargs):
    """Returns mutual information matrix calculated for *msa*, which may be an
    :class:`.MSA` instance or a 2D Numpy character array.  Implementation
    is case insensitive and handles ambiguous amino acids as follows:

      * **B** (Asx) count is allocated to *D* (Asp) and *N* (Asn)
      * **Z** (Glx) count is allocated to *E* (Glu) and *Q* (Gln)
      * **J** (Xle) count is allocated to *I* (Ile) and *L* (Leu)
      * **X** (Xaa) count is allocated to the twenty standard amino acids
      * Joint probability of observing a pair of ambiguous amino acids is
        allocated to all potential combinations, e.g. probability of **XX**
        is allocated to 400 combinations of standard amino acids, similarly
        probability of **XB** is allocated to 40 combinations of *D* and *N*
        with the standard amino acids.

    Selenocysteine (**U**, Sec) and pyrrolysine (**O**, Pyl) are considered
    distinct amino acids.  When *ambiguity* is set **False**, all alphabet
    characters are considered distinct types.  All non-alphabet characters
    are considered gaps.

    The mutual information matrix can be normalized or corrected using
    :func:`applyMINormalization` and :func:`applyMICorrection` methods,
    respectively.  Normalization by joint entropy can be performed using this
    function with the *norm* option set **True**."""

    msa = getMSA(msa)

    from .msatools import msamutinfo
    from numpy import empty  # imported at module level in the original source

    LOGGER.timeit("_mutinfo")
    length = msa.shape[1]
    mutinfo = empty((length, length), float)
    mutinfo = msamutinfo(
        msa,
        mutinfo,
        ambiguity=bool(ambiguity),
        turbo=bool(turbo),
        norm=bool(kwargs.get("norm", False)),
        debug=bool(kwargs.get("debug", False)),
    )
    LOGGER.report("Mutual information matrix was calculated in %.2fs.", "_mutinfo")

    return mutinfo
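A usage sketch for the function above, assuming a Stockholm alignment on disk; the file name is illustrative, and applyMICorrection is the correction routine the docstring refers to:

from prody import parseMSA, refineMSA, buildMutinfoMatrix, applyMICorrection

msa = parseMSA('PF00074_full.sth')            # hypothetical Pfam alignment
msa = refineMSA(msa, rowocc=0.8, seqid=0.98)  # drop gappy/redundant rows
mi = buildMutinfoMatrix(msa)
mi_corrected = applyMICorrection(mi)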
Example #47
File: analysis.py Project: prody/ProDy
def buildDirectInfoMatrix(msa, seqid=0.8, pseudo_weight=0.5, refine=False, **kwargs):
    """Returns direct information matrix calculated for *msa*, which may be an
    :class:`.MSA` instance or a 2D Numpy character array.

    Sequences sharing sequence identity of *seqid* or more with another
    sequence are regarded as similar sequences for calculating their weights
    using :func:`.calcMeff`.

    *pseudo_weight* is the weight for the pseudocount probability.

    Sequences are not refined by default. When *refine* is set **True**,
    the MSA will be refined by the first sequence and the shape of direct
    information matrix will be smaller.
    """

    msa = getMSA(msa)
    from .msatools import msadipretest, msadirectinfo1, msadirectinfo2
    from numpy import matrix, zeros  # module-level imports in the original source

    LOGGER.timeit("_di")
    if msa.shape[0] < 250:
        LOGGER.warning(
            "DI performs the best with higher number of sequences, and "
            "minimal number of sequences is recommended as 250."
        )
    refine = 1 if refine else 0
    # msadipretest gets parameters from msa to set the matrix sizes
    length, q = msadipretest(msa, refine=refine)
    c = matrix(zeros((length * q, length * q), float))
    prob = zeros((length, q + 1), float)
    # msadirectinfo1 returns c to be inverted and prob to be used
    meff, n, length, c, prob = msadirectinfo1(
        msa, c, prob, theta=1.0 - seqid, pseudocount_weight=pseudo_weight, refine=refine, q=q + 1
    )

    c = c.I

    di = zeros((length, length), float)
    # get final DI
    di = msadirectinfo2(n, length, c, prob, di, q + 1)
    del prob, c
    LOGGER.report("DI matrix was calculated in %.2fs.", "_di")
    return di
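A matching sketch for DI, again with an illustrative alignment file; calcMeff can be used first to gauge the effective sequence count that the warning above refers to:

from prody import parseMSA, calcMeff, buildDirectInfoMatrix

msa = parseMSA('PF00074_full.sth')  # hypothetical alignment file
print(calcMeff(msa, seqid=0.8))     # effective number of sequences
di = buildDirectInfoMatrix(msa, seqid=0.8, pseudo_weight=0.5)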
Example #48
File: cath.py Project: fongchun/ProDy
    def save(self, filename='cath.xml'):
        """Write local CATH database to an XML file. *filename* can either be a 
        file name or a handle."""

        LOGGER.timeit('_cath_write')

        if not isinstance(filename, str):
            try:
                fn = filename.name
            except AttributeError:
                fn = repr(filename)
            f = filename
        else:
            fn = filename

        LOGGER.info('Writing data to {0}...'.format(fn))

        if not len(self.root):
            raise ValueError('local database has not been built, '
                             'please call update() first')

        tree = self.copy()
        root = tree.getroot()

        # convert int to str
        length_nodes = root.findall('.//*[@length]')
        for node in length_nodes:
            node.attrib['length'] = str(node.attrib['length'])
        
        # add prefix to node tags
        nodes = root.iter()
        for node in nodes:
            node.tag = 'id.' + node.tag

        # add indentation to nodes
        indentElement(root)
        
        if isinstance(filename, str):
            f = open(filename, 'wb')
        tree.write(f, encoding='utf-8')
        f.close()

        LOGGER.report('CATH local database saved in %.2fs.', '_cath_write')
Example #49
File: rtb.py Project: timlezon/ProDy
    def buildHessian(self, coords, blocks, cutoff=15., gamma=1., **kwargs):
        """Build Hessian matrix for given coordinate set.

        :arg coords: a coordinate set or an object with ``getCoords`` method
        :type coords: :class:`numpy.ndarray`

        :arg blocks: block identifiers, one per atom, so that
            ``blocks.shape == (natoms,)``
        :type blocks: :class:`numpy.ndarray`

        :arg cutoff: cutoff distance (Å) for pairwise interactions,
            default is 15.0 Å
        :type cutoff: float

        :arg gamma: spring constant, default is 1.0
        :type gamma: float"""

        try:
            coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                      coords.getCoords())
        except AttributeError:
            try:
                checkCoords(coords)
            except TypeError:
                raise TypeError('coords must be a Numpy array or an object '
                                'with `getCoords` method')

        LOGGER.timeit('_rtb')
        natoms = coords.shape[0]
        if (natoms,) != blocks.shape:
            raise ValueError('blocks.shape must be (natoms,)')

        nblocks = len(set(blocks))
        nb6 = nblocks * 6

        coords = coords.T.copy()

        self._hessian = hessian = np.zeros((nb6, nb6), float)
        self._project = project = np.zeros((natoms * 3, nb6), float)

        from .rtbtools import buildhessian
        buildhessian(coords, blocks, hessian, project,
                     natoms, nblocks, float(cutoff), float(gamma))

        LOGGER.report('Hessian was built in %.2fs.', label='_rtb')
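A usage sketch for the RTB Hessian, assuming the RTB class is importable from prody; grouping atoms by residue number is one simple way to define the rigid blocks, and '1ubi' is an illustrative structure:

from prody import parsePDB, RTB

protein = parsePDB('1ubi').select('protein')
rtb = RTB('1ubi rtb')
blocks = protein.getResnums()   # one rigid block per residue
rtb.buildHessian(protein, blocks, cutoff=15., gamma=1.)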
Example #50
File: gnm.py Project: prody/ProDy
    def calcModes(self, n_modes=20, zeros=False, turbo=True, hinges=True):
        """Calculate normal modes.  This method uses :func:`scipy.linalg.eigh`
        function to diagonalize the Kirchhoff matrix. When Scipy is not found,
        :func:`numpy.linalg.eigh` is used.

        :arg n_modes: number of non-zero eigenvalues/vectors to calculate.
              If **None** or ``'all'`` is given, all modes will be calculated.
        :type n_modes: int or None, default is 20

        :arg zeros: If **True**, modes with zero eigenvalues will be kept.
        :type zeros: bool, default is **False**

        :arg turbo: Use a memory intensive, but faster way to calculate modes.
        :type turbo: bool, default is **True**

        :arg hinges: Identify hinge sites after modes are computed.
        :type hinges: bool, default is **True**
        """

        if self._kirchhoff is None:
            raise ValueError('Kirchhoff matrix is not built or set')
        if str(n_modes).lower() == 'all':
            n_modes = None
        assert n_modes is None or isinstance(n_modes, int) and n_modes > 0, \
            'n_modes must be a positive integer'
        assert isinstance(zeros, bool), 'zeros must be a boolean'
        assert isinstance(turbo, bool), 'turbo must be a boolean'
        self._clear()
        LOGGER.timeit('_gnm_calc_modes')
        values, vectors, vars = solveEig(self._kirchhoff, n_modes=n_modes, zeros=zeros, 
                                         turbo=turbo, is3d=False)

        self._eigvals = values
        self._array = vectors
        self._vars = vars
        self._trace = self._vars.sum()
        self._n_modes = len(self._eigvals)
        if hinges:
            self.calcHinges()
        LOGGER.report('{0} modes were calculated in %.2fs.'
                     .format(self._n_modes), label='_gnm_calc_modes')
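The usual workflow around this method, sketched with an illustrative PDB identifier:

from prody import parsePDB, GNM

ca = parsePDB('1p38').select('protein and name CA')
gnm = GNM('p38 gnm')
gnm.buildKirchhoff(ca)     # default 10 A cutoff for GNM
gnm.calcModes(n_modes=20)  # hinge sites identified as above
print(gnm.getEigvals()[:5])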
Example #51
File: ensemble.py Project: sixpi/ProDy
    def iterpose(self, rmsd=0.0001):
        """Iteratively superpose the ensemble until convergence.  Initially,
        all conformations are aligned with the reference coordinates.  Then
        mean coordinates are calculated, and are set as the new reference
        coordinates.  This is repeated until reference coordinates do not
        change.  This is determined by the value of RMSD between the new and
        old reference coordinates.  Note that at the end of the iterative
        procedure the reference coordinate set will be the average of the
        conformations in the ensemble.

        :arg rmsd: change in reference coordinates to determine convergence,
            default is 0.0001 Å RMSD
        :type rmsd: float"""

        if self._coords is None:
            raise AttributeError('coordinates are not set, use `setCoords`')
        if self._confs is None or len(self._confs) == 0:
            raise AttributeError('conformations are not set, use '
                                 '`addCoordset`')
        LOGGER.info('Starting iterative superposition:')
        LOGGER.timeit('_prody_ensemble')
        rmsdif = 1
        step = 0
        weights = self._weights
        if weights is not None and weights.ndim == 3:
            weightsum = weights.sum(axis=0)
            weightsum[weightsum == 0.] = 1.  # pseudocount to avoid nan
        length = len(self)
        while rmsdif > rmsd:
            self._superpose()
            if weights is None:
                newxyz = self._confs.sum(0) / length
            else:
                newxyz = (self._confs * weights).sum(0) / weightsum
            rmsdif = getRMSD(self._coords, newxyz)
            self._coords = newxyz
            step += 1
            LOGGER.info('Step #{0}: RMSD difference = {1:.4e}'
                        .format(step, rmsdif))
        LOGGER.report('Iterative superposition completed in %.2fs.',
                      '_prody_ensemble')
Example #52
File: analysis.py Project: npabon/ProDy
def buildOMESMatrix(msa, ambiguity=True, turbo=True, **kwargs):
    """Return OMES (Observed Minus Expected Squared) covariance matrix
    calculated for *msa*, which may be an :class:`.MSA` instance or a 2D
    NumPy character array. OMES is defined as::

                        (N_OBS - N_EX)^2              (f_i,j - f_i * f_j)^2
      OMES_(i,j) = sum(------------------) = N * sum(-----------------------)
                             N_EX                           f_i * f_j

    Implementation is case insensitive and handles ambiguous amino acids
    as follows:

      * **B** (Asx) count is allocated to *D* (Asp) and *N* (Asn)
      * **Z** (Glx) count is allocated to *E* (Glu) and *Q* (Gln)
      * **J** (Xle) count is allocated to *I* (Ile) and *L* (Leu)
      * **X** (Xaa) count is allocated to the twenty standard amino acids
      * Joint probability of observing a pair of ambiguous amino acids is
        allocated to all potential combinations, e.g. probability of **XX**
        is allocated to 400 combinations of standard amino acids, similarly
        probability of **XB** is allocated to 40 combinations of *D* and *N*
        with the standard amino acids.

    Selenocysteine (**U**, Sec) and pyrrolysine (**O**, Pyl) are considered
    distinct amino acids.  When *ambiguity* is set **False**, all alphabet
    characters are considered distinct types.  All non-alphabet characters
    are considered gaps."""

    msa = getMSA(msa)

    from .msatools import msaomes
    from numpy import empty  # imported at module level in the original source
    LOGGER.timeit('_omes')
    length = msa.shape[1]
    omes = empty((length, length), float)
    omes = msaomes(msa, omes, ambiguity=bool(ambiguity), turbo=bool(turbo),
                   debug=bool(kwargs.get('debug', False)))
    LOGGER.report('OMES matrix was calculated in %.2fs.',
                  '_omes')

    return omes
Example #53
File: exanm.py Project: sixpi/ProDy
    def buildMembrane(self, coords, **kwargs):
        """Build Hessian matrix for given coordinate set.

        :arg coords: a coordinate set or an object with ``getCoords`` method
        :type coords: :class:`numpy.ndarray`

        :arg membrane_hi: the maximum z coordinate of the pdb default is 13.0
        :type membrane_hi: float

        :arg membrane_lo: the minimum z coordinate of the pdb default is -13.0
        :type membrane_lo: float

        :arg R: radius of all membrane in x-y direction default is 80. 
        :type R: float

        :arg r: radius of individual barrel-type membrane protein default is 2.5.
        :type 
        
        :arg lat: lattice type which could be FCC(face-centered-cubic)(default), 
        SC(simple cubic), SH(simple hexagonal)
        :type lat: str
        """
        if type(coords) is AtomGroup:
            buildAg = True
        else:
            buildAg = False
        
        try:
            coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                      coords.getCoords())
        except AttributeError:
            try:
                checkCoords(coords)
            except TypeError:
                raise TypeError('coords must be a Numpy array or an object '
                                'with `getCoords` method')

        self._n_atoms = natoms = int(coords.shape[0])

        pxlo = min(np.append(coords[:,0],10000))
        pxhi = max(np.append(coords[:,0],-10000))
        pylo = min(np.append(coords[:,1],10000))
        pyhi = max(np.append(coords[:,1],-10000))
        pzlo = min(np.append(coords[:,2],10000))
        pzhi = max(np.append(coords[:,2],-10000))

        membrane_hi = float(kwargs.get('membrane_hi', 13.0))
        membrane_lo = float(kwargs.get('membrane_lo', -13.0))
        R = float(kwargs.get('R', 80))
        r = float(kwargs.get('r', 5))
        lat = str(kwargs.get('lat', 'FCC'))
        lpv = assign_lpvs(lat)

        imax = (R + lpv[0,2] * (membrane_hi - membrane_lo)/2.)/r
        jmax = (R + lpv[1,2] * (membrane_hi - membrane_lo)/2.)/r
        kmax = (R + lpv[2,2] * (membrane_hi - membrane_lo)/2.)/r    

        LOGGER.timeit('_membrane')
        membrane = zeros((1, 3))
        atm = 0
        for i in range(-int(imax),int(imax+1)):
            for j in range(-int(jmax),int(jmax+1)):
                for k in range(-int(kmax),int(kmax+1)):
                    X = zeros((1,3))
                    for p in range(3):
                        X[0,p]=2.*r*(i*lpv[0,p]+j*lpv[1,p]+k*lpv[2,p])
                    dd=0
                    for p in range(3):
                        dd += X[0,p] ** 2
                    if dd<R**2 and X[0,2]>membrane_lo and X[0,2]<membrane_hi:
                        if X[0,0]>pxlo-R/2 and X[0,0]<pxhi+R/2 and X[0,1]>pylo-R/2 and X[0,1]<pyhi+R/2 and X[0,2]>pzlo and X[0,2]<pzhi:
                            if checkClash(X, coords[:natoms,:], radius=5):
                                if atm == 0:
                                    membrane = X
                                else:
                                    membrane = np.append(membrane, X, axis=0)
                                atm = atm + 1 

        self._membrane = AtomGroup(title="Membrane")
        self._membrane.setCoords(membrane)
        self._membrane.setResnums(range(atm))
        self._membrane.setResnames(["NE1" for i in range(atm)])
        self._membrane.setChids(["Q" for i in range(atm)])
        self._membrane.setElements(["Q1" for i in range(atm)])
        self._membrane.setNames(["Q1" for i in range(atm)])
        LOGGER.report('Membrane was built in %.2fs.', label='_membrane')
Example #54
File: exanm.py Project: sixpi/ProDy
    def buildHessian(self, coords, cutoff=15., gamma=1., **kwargs):
        """Build Hessian matrix for given coordinate set.

        :arg coords: a coordinate set or an object with ``getCoords`` method
        :type coords: :class:`numpy.ndarray`

        :arg cutoff: cutoff distance (Å) for pairwise interactions,
            default is 15.0 Å
        :type cutoff: float

        :arg gamma: spring constant, default is 1.0
        :type gamma: float

        :arg membrane_hi: the maximum z coordinate of the membrane, default is 13.0
        :type membrane_hi: float

        :arg membrane_lo: the minimum z coordinate of the membrane, default is -13.0
        :type membrane_lo: float

        :arg R: radius of the whole membrane in the x-y plane, default is 80
        :type R: float

        :arg r: radius of an individual barrel-type membrane protein, default is 5
        :type r: float

        :arg lat: lattice type, one of FCC (face-centered cubic, default),
            SC (simple cubic), or SH (simple hexagonal)
        :type lat: str
        """

        try:
            coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                      coords.getCoords())
        except AttributeError:
            try:
                checkCoords(coords)
            except TypeError:
                raise TypeError('coords must be a Numpy array or an object '
                                'with `getCoords` method')

        self._n_atoms = natoms = int(coords.shape[0])

        if self._membrane is None:
            membrane_hi = float(kwargs.get('membrane_hi', 13.0))
            membrane_lo = float(kwargs.get('membrane_lo', -13.0))
            R = float(kwargs.get('R', 80))
            r = float(kwargs.get('r', 5))
            lat = str(kwargs.get('lat', 'FCC'))
            self.buildMembrane(coords, membrane_hi=membrane_hi,
                               membrane_lo=membrane_lo, R=R, r=r, lat=lat)

        LOGGER.timeit('_exanm')
        coords = np.concatenate((coords,self._membrane.getCoords()),axis=0)
        self._combined_coords = coords
        total_natoms = int(coords.shape[0])
        self._hessian = np.zeros((natoms*3, natoms*3), float)
        total_hessian = np.zeros((total_natoms*3, total_natoms*3), float)
        cutoff, g, gamma = checkENMParameters(cutoff, gamma)
        cutoff2 = cutoff * cutoff
        for i in range(total_natoms):
            res_i3 = i*3
            res_i33 = res_i3+3
            i_p1 = i+1
            i2j_all = coords[i_p1:, :] - coords[i]
            for j, dist2 in enumerate((i2j_all ** 2).sum(1)):
                if dist2 > cutoff2:
                    continue
                i2j = i2j_all[j]
                j += i_p1
                g = gamma(dist2, i, j)
                res_j3 = j*3
                res_j33 = res_j3+3
                super_element = np.outer(i2j, i2j) * (- g / dist2)
                total_hessian[res_i3:res_i33, res_j3:res_j33] = super_element
                total_hessian[res_j3:res_j33, res_i3:res_i33] = super_element
                total_hessian[res_i3:res_i33, res_i3:res_i33] = total_hessian[res_i3:res_i33, res_i3:res_i33] - super_element
                total_hessian[res_j3:res_j33, res_j3:res_j33] = total_hessian[res_j3:res_j33, res_j3:res_j33] - super_element

        ss = total_hessian[:natoms*3, :natoms*3]
        so = total_hessian[:natoms*3, natoms*3:]
        os = total_hessian[natoms*3:, :natoms*3]
        oo = total_hessian[natoms*3:, natoms*3:]
        self._hessian = ss - np.dot(so, np.dot(linalg.inv(oo), os))
        LOGGER.report('Hessian was built in %.2fs.', label='_exanm')
        self._dof = self._hessian.shape[0]
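The final reduction above removes the membrane degrees of freedom from the combined Hessian through a Schur complement, H_eff = H_ss - H_so H_oo^-1 H_os; a self-contained toy version of the same linear algebra:

import numpy as np

H = np.random.rand(12, 12)
H = H + H.T                      # symmetric stand-in for a Hessian
n = 6                            # system dof; the remainder is environment
ss, so = H[:n, :n], H[:n, n:]
os_, oo = H[n:, :n], H[n:, n:]
H_eff = ss - so @ np.linalg.inv(oo) @ os_  # same block reduction as above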
Example #55
File: analysis.py Project: sixpi/ProDy
def calcPairDeformationDist(model, coords, ind1, ind2, kbt=1.):
    """Returns the distribution of deformations in the distance between the
    selected pair of residues *ind1* and *ind2*, contributed by each mode of
    *model*.  Method described in [EB08]_, equation (10) and figure (2).

    .. [EB08] Eyal E., Bahar I. Toward a Molecular Understanding of
        the Anisotropic Response of Proteins to External Forces:
        Insights from Elastic Network Models. *Biophys J* **2008** 94:3424-3435.

    :arg model: a 3-dimensional NMA instance from :class:`.ANM` calculations
    :type model: :class:`.ANM`
    :arg coords: a coordinate set or an object with ``getCoords`` method.
        Recommended: ``coords = parsePDB('pdbfile').select('protein and name CA')``
    :type coords: :class:`numpy.ndarray`
    :arg ind1: first residue number
    :type ind1: int
    :arg ind2: second residue number
    :type ind2: int
    """

    try:
        resnum_list = coords.getResnums()
        resnam_list = coords.getResnames()
        coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                coords.getCoords())
    except AttributeError:
        try:
            checkCoords(coords)
        except TypeError:
            raise TypeError('coords must be a Numpy array or an object '
                            'with `getCoords` method')
    
    if not isinstance(model, NMA):
        raise TypeError('model must be a NMA instance')
    elif not model.is3d():
        raise TypeError('model must be a 3-dimensional NMA instance')
    elif len(model) == 0:
        raise ValueError('model must have normal modes calculated')
    elif model.getStiffness() is None:
        raise ValueError('model must have stiffness matrix calculated')
    
    linalg = importLA()
    n_atoms = model.numAtoms()
    n_modes = model.numModes()
    LOGGER.timeit('_pairdef')

    r_ij = np.zeros((n_atoms,n_atoms,3))
    r_ij_norm = np.zeros((n_atoms,n_atoms,3))

    for i in range(n_atoms):
        for j in range(i+1,n_atoms):
            r_ij[i][j] = coords[j,:] - coords[i,:]
            r_ij[j][i] = r_ij[i][j]
            r_ij_norm[i][j] = r_ij[i][j]/linalg.norm(r_ij[i][j])
            r_ij_norm[j][i] = r_ij_norm[i][j]

    eigvecs = model.getEigvecs()
    eigvals = model.getEigvals()
    
    D_pair_k = []
    mode_nr = []
    ind1 = ind1 - resnum_list[0]
    ind2 = ind2 - resnum_list[0]

    for m in range(6, n_modes):
        U_ij_k = [(eigvecs[ind1*3][m] - eigvecs[ind2*3][m]), (eigvecs[ind1*3+1][m] \
            - eigvecs[ind2*3+1][m]), (eigvecs[ind1*3+2][m] - eigvecs[ind2*3+2][m])] 
        D_ij_k = abs(np.sqrt(kbt/eigvals[m])*(np.vdot(r_ij_norm[ind1][ind2], U_ij_k)))  
        D_pair_k.append(D_ij_k)
        mode_nr.append(m)

    LOGGER.report('Deformation was calculated in %.2lfs.', label='_pairdef')
    
    return mode_nr, D_pair_k
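A usage sketch, assuming the standard ProDy ANM workflow; the PDB identifier and residue numbers are illustrative:

from prody import parsePDB, ANM, calcPairDeformationDist

ca = parsePDB('1ubi').select('protein and name CA')
anm = ANM('1ubi')
anm.buildHessian(ca)
anm.buildMechStiff(ca)  # prerequisite: computes modes and the stiffness matrix
modes, d_pair = calcPairDeformationDist(anm, ca, 3, 27)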
Example #56
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file; sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = "{http://pfam.xfam.org/}"
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile

        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = "".join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError("could not parse a sequence without gaps from " + query)
    else:
        seq = "".join(query.split())

    import xml.etree.cElementTree as ET

    LOGGER.timeit("_pfam")
    timeout = int(kwargs.get("timeout", 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + " is not a valid sequence")

        fseq = ">Seq\n" + seq
        parameters = {"hmmdb": "pfam", "seq": fseq}
        enc_params = urllib.urlencode(parameters)
        request = urllib2.Request("http://hmmer.janelia.org/search/hmmscan", enc_params)

        url = urllib2.urlopen(request).geturl() + "?output=xml"
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception as err:
            raise ValueError("failed to parse results XML, check URL: " + url)

        matches = {}
        for child in root[0]:
            if child.tag == "hits":
                accession = child.get("acc")
                pfam_id = accession.split(".")[0]
                matches[pfam_id] = {}
                matches[pfam_id]["accession"] = accession
                matches[pfam_id]["class"] = "Domain"
                matches[pfam_id]["id"] = child.get("name")
                matches[pfam_id]["locations"] = {}
                matches[pfam_id]["locations"]["ali_end"] = child[0].get("alisqto")
                matches[pfam_id]["locations"]["ali_start"] = child[0].get("alisqfrom")
                matches[pfam_id]["locations"]["bitscore"] = child[0].get("bitscore")
                matches[pfam_id]["locations"]["end"] = child[0].get("alisqto")
                matches[pfam_id]["locations"]["evalue"] = child.get("evalue")
                matches[pfam_id]["locations"]["evidence"] = "hmmer v3.0"
                matches[pfam_id]["locations"]["hmm_end"] = child[0].get("alihmmto")
                matches[pfam_id]["locations"]["hmm_start"] = child[0].get("alihmmfrom")
                matches[pfam_id]["locations"]["significant"] = child[0].get("significant")
                matches[pfam_id]["locations"]["start"] = child[0].get("alisqfrom")
                matches[pfam_id]["type"] = "Pfam-A"
        return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader

            try:
                polymers = parsePDBHeader(seq[:4], "polymers")
            except Exception as err:
                LOGGER.warn("failed to parse header for {0} ({1})".format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != "UniProt":
                            continue
                        idcode = dbref.idcode
                        LOGGER.info(
                            "UniProt ID code {0} for {1} chain " "{2} will be used.".format(idcode, seq[:4], poly.chid)
                        )
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn("A UniProt ID code for PDB {0} could not be " "parsed.".format(repr(seq)))
                url = "http://pfam.xfam.org/protein/" + seq + "?output=xml"
            else:
                url = "http://pfam.xfam.org/protein/" + idcode + "?output=xml"

        else:
            url = "http://pfam.xfam.org/protein/" + seq + "?output=xml"

    LOGGER.debug("Retrieving Pfam search results: " + url)
    xml = None
    while LOGGER.timing("_pfam") < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError("Pfam search timed out or failed to parse results " "XML, check URL: " + url)
    else:
        LOGGER.report("Pfam search completed in %.2fs.", "_pfam")

    if xml.find(b"There was a system error on your last request.") > 0:
        LOGGER.warn("No Pfam matches found for: " + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError("failed to parse results XML, check URL: " + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError("failed to parse results XML, check URL: " + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results["matches"]
        except KeyError:
            raise ValueError("failed to parse results XML, check URL: " + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib["accession"][:7]
        except KeyError:
            raise ValueError("failed to parse results XML, check URL: " + url)

        if not re.search("^P(F|B)[0-9]{5}$", accession):
            raise ValueError("{0} does not match pfam accession" " format".format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault("locations", [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = "Query " + repr(query)
    else:
        query = "Query sequence"

    if matches:
        LOGGER.info(query + " matched {0} Pfam families.".format(len(matches)))
    else:
        LOGGER.info(query + " did not match any Pfam families.")
    return matches
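A short usage sketch; 'P02185' (myoglobin) is an illustrative UniProt accession, and the keys inside each match follow whatever the Pfam XML provides:

matches = searchPfam('P02185')
for acc, match in matches.items():
    print(acc, match.get('id'), len(match.get('locations', [])))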
Example #57
File: msa.py Project: njekin/ProDy
def refineMSA(msa, label=None, rowocc=None, seqid=None, colocc=None, **kwargs):
    """Refine *msa* by removing sequences (rows) and residues (columns) that
    contain gaps.

    :arg msa: multiple sequence alignment
    :type msa: :class:`.MSA`

    :arg label: remove columns that are gaps in the sequence matching label,
        ``msa.getIndex(label)`` must return a sequence index, a PDB identifier
        is also acceptable
    :type label: str

    :arg rowocc: row occupancy, sequences with less occupancy will be
        removed after *label* refinement is applied
    :type rowocc: float

    :arg seqid: keep unique sequences at specified sequence identity level,
        unique sequences are identified using :func:`.uniqueSequences`
    :type seqid: float

    :arg colocc: column occupancy, residue positions with less occupancy
        will be removed after other refinements are applied
    :type colocc: float

    :arg keep: keep columns corresponding to residues not resolved in the PDB
        structure, default is **False**, applies when *label* is a PDB
        identifier
    :type keep: bool

    For Pfam MSA data, *label* is UniProt entry name for the protein.  You may
    also use PDB structure and chain identifiers, e.g. ``'1p38'`` or
    ``'1p38A'``, for *label* argument and UniProt entry names will be parsed
    using :func:`.parsePDBHeader` function (see also :class:`.Polymer` and
    :class:`.DBRef`).

    Refinements are applied in the order of the arguments.  If both *label*
    and *seqid* are specified, the sequence matching *label* will be kept in
    the refined :class:`.MSA` although it may be similar to some other
    sequence.

    # if msa is a char array, it will be refined but label won't work
    try:
        ndim, dtype_ = msa.ndim, msa.dtype
    except AttributeError:
        try:
            arr = msa._getArray()
        except AttributeError:
            raise TypeError('msa must be a character array or an MSA instance')
        ndim, dtype_ = arr.ndim, arr.dtype
    else:
        arr, msa = msa, None

    if dtype('|S1') != dtype_:
        raise ValueError('msa must be a character array or an MSA instance')
    if ndim != 2:
        raise ValueError('msa must be a 2D array or an MSA instance')

    title = []
    cols = None
    index = None
    if label is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            upper, lower = label.upper(), label.lower()
        except AttributeError:
            raise TypeError('label must be a string')

        if msa is None:
            raise TypeError('msa must be an MSA instance, '
                            'label cannot be used')

        index = msa.getIndex(label)
        if index is None:
            index = msa.getIndex(upper)
        if index is None:
            index = msa.getIndex(lower)

        chain = None
        if index is None and (len(label) == 4 or len(label) == 5):
            from prody import parsePDB
            try:
                structure, header = parsePDB(label[:4], header=True)
            except Exception as err:
                raise IOError('failed to parse header for {0} ({1})'
                              .format(label[:4], str(err)))

            chid = label[4:].upper()
            for poly in header['polymers']:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if index is None:
                        index = msa.getIndex(dbref.idcode)
                        if index is not None:
                            LOGGER.info('{0} idcode {1} for {2}{3} '
                                        'is found in {4}.'.format(
                                        dbref.database, dbref.idcode,
                                        label[:4], poly.chid, str(msa)))
                            break
                    if index is None:
                        index = msa.getIndex(dbref.accession)
                        if index is not None:
                            LOGGER.info('{0} accession {1} for {2}{3} '
                                        'is found in {4}.'.format(
                                        dbref.database, dbref.accession,
                                        label[:4], poly.chid, str(msa)))
                            break
            if index is not None:
                chain = structure[poly.chid]

        if index is None:
            raise ValueError('label is not in msa, or msa is not indexed')
        try:
            len(index)
        except TypeError:
            pass
        else:
            raise ValueError('label {0} maps onto multiple sequences, '
                             'so cannot be used for refinement'.format(label))

        title.append('label=' + label)
        cols = char.isalpha(arr[index]).nonzero()[0]
        arr = arr.take(cols, 1)
        LOGGER.report('Label refinement reduced number of columns from {0} to '
                      '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

        if chain is not None and not kwargs.get('keep', False):
            before = arr.shape[1]
            LOGGER.timeit('_refine')
            from prody.proteins.compare import importBioPairwise2
            from prody.proteins.compare import MATCH_SCORE, MISMATCH_SCORE
            from prody.proteins.compare import GAP_PENALTY, GAP_EXT_PENALTY
            pw2 = importBioPairwise2()
            chseq = chain.getSequence()
            algn = pw2.align.localms(arr[index].tostring().upper(), chseq,
                                     MATCH_SCORE, MISMATCH_SCORE,
                                     GAP_PENALTY, GAP_EXT_PENALTY,
                                     one_alignment_only=1)
            torf = []
            for s, c in zip(*algn[0][:2]):
                if s == '-':
                    continue
                elif c != '-':
                    torf.append(True)
                else:
                    torf.append(False)
            torf = array(torf)
            tsum = torf.sum()
            assert tsum <= before, 'problem in mapping sequence to structure'
            if tsum < before:
                arr = arr.take(torf.nonzero()[0], 1)
                LOGGER.report('Structure refinement reduced number of '
                              'columns from {0} to {1} in %.2fs.'
                              .format(before, arr.shape[1]), '_refine')
            else:
                LOGGER.debug('All residues in the sequence are contained in '
                             'PDB structure {0}.'.format(label))

    from .analysis import calcMSAOccupancy, uniqueSequences

    rows = None
    if rowocc is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        try:
            rowocc = float(rowocc)
        except Exception as err:
            raise TypeError('rowocc must be a float ({0})'.format(str(err)))
        assert 0. <= rowocc <= 1., 'rowocc must be between 0 and 1'

        rows = calcMSAOccupancy(arr, 'row') >= rowocc
        if index is not None:
            index = rows[:index].sum()
        rows = (rows).nonzero()[0]
        arr = arr[rows]
        title.append('rowocc>=' + str(rowocc))
        LOGGER.report('Row occupancy refinement reduced number of rows from '
                      '{0} to {1} in %.2fs.'.format(before, arr.shape[0]),
                      '_refine')

    if seqid is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        unique = uniqueSequences(arr, seqid)
        if index is not None:
            unique[index] = True
        unique = unique.nonzero()[0]
        arr = arr[unique]
        title.append('seqid>=' + str(seqid))
        if rows is not None:
            rows = rows[unique]
        else:
            rows = unique
        LOGGER.report('Sequence identity refinement reduced number of rows '
                      'from {0} to {1} in %.2fs.'.format(before, arr.shape[0]),
                      '_refine')

    if colocc is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            colocc = float(colocc)
        except Exception as err:
            raise TypeError('colocc must be a float ({0})'.format(str(err)))
        assert 0. <= colocc <= 1., 'colocc must be between 0 and 1'

        cols = (calcMSAOccupancy(arr, 'col') >= colocc).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('colocc>=' + str(colocc))
        LOGGER.report('Column occupancy refinement reduced number of columns '
                      'from {0} to {1} in %.2fs.'.format(before, arr.shape[1]),
                      '_refine')

    if not title:
        raise ValueError('label, rowocc, colocc all cannot be None')

    # depending on slicing of rows, arr may not have its own memory
    if arr.base is not None:
        arr = arr.copy()

    if msa is None:
        return arr
    else:
        if rows is None:
            from copy import copy
            labels = copy(msa._labels)
            mapping = copy(msa._mapping)
        else:
            labels = msa._labels
            labels = [labels[i] for i in rows]
            mapping = None
        return MSA(arr, title=msa.getTitle() + ' refined ({0})'
                   .format(', '.join(title)), labels=labels, mapping=mapping)
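A usage sketch chaining the refinements described above; the alignment file and the UniProt entry name are illustrative:

from prody import parseMSA, refineMSA

msa = parseMSA('PF00074_full.sth')  # hypothetical Pfam alignment
refined = refineMSA(msa, label='RNAS1_HUMAN', rowocc=0.8, seqid=0.98)
print(refined)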
Example #58
File: psiblast.py Project: fongchun/ProDy
def psiBlastCycle(sequence=None, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    a single cycle of EBI psiblast.

    :arg sequence: an object with an associated sequence string 
         or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    The following search parameters can be adjusted by the user.
    We use the same default values as 
    http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/
    wherever applicable.

    :arg email: email address for reporting problems
        default is [email protected]
    :type email: str with an @ before a .

    :arg matrix: The comparison matrix to be used to score alignments when searching the database
        possible values are 'BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30' and 'PAM70' 
        default is 'BLOSUM62'
    :type matrix: str

    :arg gapopen: Penalty taken away from the score when a gap is created in sequence alignments. 
        Increasing the gap opening penalty will decrease the number of gaps in the final alignment.
        Possible values range from 8 to 16 inclusive, default is 11
    :type gapopen: int

    :arg gapext: Penalty taken away from the score for each base or residue in the gap.
        Increasing the gap extension penalty favors short gaps in the final alignment;
        conversely, decreasing the gap extension penalty favors long gaps in the final alignment.
        Possible values range from 0 to 3, default is 1
    :type gapext: int

    :arg expthr: Expectation threshold that limits the number of scores and alignments reported. 
        This is the maximum number of times the match is expected to occur by chance.
        Possible values are 1.0e-200, 1.0e-100, 1.0e-50, 1.0e-10, 1.0e-5, 1.0e-4, 1.0e-3,
        1.0e-2, 0.1, 1.0, 10.0, 100, 1000
        default is 10.0
    :type expthr: float

    :arg psithr: Expectation value threshold for automatic selection of matched sequences for 
        inclusion in the PSSM at each iteration.
        Possible values are 1.0e-6, 1.0e-5, 1.0e-4, 2.0e-4, 5.0e-4, 1.0e-3, 2.0e-3, 5.0e-3,
        1.0e-2, 2.0e-2, 0.1, 0.3, 0.5, 1.0, 3.0, 10.0
        default is 1.0e-3
    :type psithr: float

    :arg scores: Maximum number of match score summaries reported in the result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type scores: int

    :arg alignments: Maximum number of match alignments reported in the result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type alignments: int

    :arg dropoff: The amount a score can drop before extension of word hits is halted
        Possible values are 0, 2, 4, 6, 8, 10, 15, 20, 25, or 30
        Default is 15
    :type dropoff: int

    :arg finaldropoff: Dropoff value for final gapped alignment
        Possible values are 10, 12, 14, 16, 18, 20, 22, 24, 25, 26, 28, or 30
        Default is 25
    :type finaldropoff: int

    :arg filter: Filter regions of low sequence complexity. This can avoid issues with 
        low complexity sequences where matches are found due to composition rather than 
        meaningful sequence similarity. However, in some cases filtering also masks 
        regions of interest and so should be used with caution.
        Possible values are T and F, default is F
    :type filter: str

    :arg seqrange: Specify a range or section of the input sequence to use in the search.
        Example: Specifying '34-89' in an input sequence of total length 100, will tell BLAST 
        to only use residues 34 to 89, inclusive.
    :type seqrange: str of form START-END

    :arg database: a database name from those available. See
        http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/database
        default is pdb
    :type database: str

    :arg previousjobid: The job identifier for the previous PSI-BLAST iteration. 
        default is None
        You can change this if you want to continue from a previous run
    :type previousjobid: str

    :arg selectedHits: Name of a file containing a list of identifiers of the 
        hits from the previous iteration to use to construct the search PSSM 
        for this iteration.
        default is None
    :type selectedHits: str

    :arg cpfile: Name of a Checkpoint file from the previous iteration. 
        default is None
    :type cpfile: str

    :arg sleep: how long to wait to reconnect for status
         Sleep time is multiplied by 1.5 when results are not ready.
         default is 2 seconds
    :type sleep: float

    :arg timeout:  when to give up waiting for the results 
        default is 120 seconds
    :type timeout: float

    :arg cycle: cycle number
    :type cycle: int

    """
    cycle = kwargs.get('cycle',0)

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')

    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()

    elif isinstance(sequence, Sequence):
        sequence = str(sequence)

    elif isinstance(sequence, str):
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())

    elif sequence is None:
        if cycle == 0: 
            cycle = 1
    else:
        raise TypeError('sequence must be Atomic, Sequence, or str not {0}'
                        .format(type(sequence)))

    if cycle == 0:
        query = [('sequence', sequence)]
    else:
        query = []

    email = kwargs.get('email','*****@*****.**')
    if not isinstance(email, str):
        raise TypeError('email must be a string')
    elif email.find('@') == -1 or email.find('.') == -1 or len(email.split('@')) != 2:
        raise ValueError('email must be a valid email address with at least one . and exactly one @ sign')
    elif not email.find('@') < email.find(email.split('.')[-1]):
        raise ValueError('email must be a valid email address with a . after the @ sign')
    query.append(('email', email))
    query.append(('title', 'ProDy psiBlastPDB request'))

    previousjobid = kwargs.get('previousjobid', '')
    if previousjobid != '':
        query.append(('previousjobid', previousjobid))

    selectedHits = kwargs.get('selectedHits', '')
    if selectedHits != '':
        query.append(('selectedHits', selectedHits))

    database = kwargs.get('database','pdb')
    checkPsiBlastParameter('database', database)
    query.append(('database',database))

    matrix = kwargs.get('matrix', 'BLOSUM62')
    checkPsiBlastParameter('matrix', matrix)
    query.append(('matrix',matrix))

    gapopen = kwargs.get('gapopen',11)
    checkPsiBlastParameter('gapopen', gapopen)
    query.append(('gapopen',gapopen))

    gapext = kwargs.get('gapext',1)
    checkPsiBlastParameter('gapext', gapext)
    query.append(('gapext',gapext))

    expthr = kwargs.get('expthr', 10.)
    checkPsiBlastParameter('expthr', expthr)
    query.append(('expthr',expthr))
    
    psithr = kwargs.get('psithr',1.0e-3)
    checkPsiBlastParameter('psithr', psithr)
    query.append(('psithr',psithr))

    scores = kwargs.get('scores',500)
    checkPsiBlastParameter('scores', scores)
    query.append(('scores',scores))

    alignments = kwargs.get('alignments',500)
    checkPsiBlastParameter('alignments', alignments)
    query.append(('alignments',alignments))
    
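    # alignView 0 requests the default pairwise alignment view (see the EBI
    # parameter documentation)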
    query.append(('alignView',0))
                    
    dropoff = kwargs.get('dropoff',15)
    checkPsiBlastParameter('dropoff', dropoff)
    query.append(('dropoff',dropoff))
        
    finaldropoff = kwargs.get('finaldropoff',25)
    checkPsiBlastParameter('finaldropoff', finaldropoff)
    query.append(('finaldropoff',finaldropoff))
        
    filter = kwargs.get('filter','F')
    checkPsiBlastParameter('filter', filter)
    query.append(('filter',filter))
    
    # seqrange applies only when starting a fresh search from a sequence
    if previousjobid == '' and selectedHits == '':
        seqrange = kwargs.get('seqrange', None)
        if seqrange is None:
            seqrange = '0-' + str(len(sequence))
        elif not isinstance(seqrange, str):
            raise TypeError('seqrange should be a string')
        elif len(seqrange.split('-')) != 2:
            raise ValueError('seqrange should take the form START-END')
        try:
            start = int(seqrange.split('-')[0])
            end = int(seqrange.split('-')[1])
        except ValueError:
            raise ValueError('seqrange should be START-END with START and END being integers')
        query.append(('seqrange',seqrange))
        
    headers = { 'User-Agent' : 'ProDy' }
    
    try:
        # Python 3: urlencode lives in urllib.parse and POST data must be bytes
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        # Python 2 fallback
        from urllib import urlencode

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))
    
    data = urlencode(query)

    # submit the job
    base_url = 'http://www.ebi.ac.uk/Tools/services/rest/psiblast/'
    url = base_url + 'run/'
    LOGGER.timeit('_prody_psi-blast')
    if cycle == 0:
        LOGGER.info('PSI-Blast searching {0} database for "{1}..."'
                    .format(database, sequence[:5]))
    else:
        LOGGER.info('PSI-Blast searching {0} database, cycle={1}'
                    .format(database, cycle))

    handle = openURL(url, data=data, headers=headers)
    job_id = handle.read()
    handle.close()
    if isinstance(job_id, bytes):
        # on Python 3 the handle yields bytes; decode before building URLs with it
        job_id = job_id.decode()

    # check the status
    url = base_url + 'status/' + job_id
    handle = openURL(url)
    status = handle.read()
    handle.close()
    if isinstance(status, bytes):
        status = status.decode()
                    
    # keep checking the status until it's no longer running
    while status == 'RUNNING':
        LOGGER.sleep(int(sleep), 'to reconnect to EBI for status.')
        LOGGER.write('Connecting to EBI for status...')
        handle = openURL(url)
        status = handle.read()
        handle.close()
        if isinstance(status, bytes):
            status = status.decode()
        LOGGER.clear()
        # back off, as documented: multiply the sleep time by 1.5 on each retry
        sleep = sleep * 1.5
        if LOGGER.timing('_prody_psi-blast') > timeout:
            LOGGER.warn('PSI-Blast search timed out.')
            return None

    LOGGER.info('The status is {0}'.format(status))
    LOGGER.clear()
    LOGGER.report('PSI-Blast search completed in %.1fs.', '_prody_psi-blast')
 
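    # cycle 1 only seeds the iterative search, so just the job id is returned;
    # on every other cycle the XML results are fetched and parsed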
    if cycle != 1:
        # get the results
        url = base_url + 'result/' + job_id + '/xml'
        handle = openURL(url)
        results = handle.read()
        handle.close()
        if isinstance(results, bytes):
            results = results.decode()

        # save the raw XML when a 'filename' keyword argument was supplied
        filename = kwargs.get('filename', None)
        try:
            ext_xml = filename.lower().endswith('.xml')
        except AttributeError:
            pass
        else:
            if not ext_xml:
                filename += '.xml'
            f_out = open(filename, 'w')
            f_out.write(results)
            f_out.close()
            LOGGER.info('Results are saved as {0}.'.format(repr(filename)))
        
        return job_id, PsiBlastRecord(results, sequence)
    else:
        return job_id
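
A minimal usage sketch (not part of the original source): assuming the function
above is exposed as psiBlastPDB, as the 'ProDy psiBlastPDB request' title in the
code suggests, an iterative search could be driven as below. The import path,
parameter values, and variable names are illustrative assumptions, not
confirmed defaults.

from prody import psiBlastPDB  # hypothetical import path

# cycle 0: seed a new search with a raw sequence; per the code above,
# any cycle other than 1 returns (job_id, PsiBlastRecord)
job_id, record = psiBlastPDB('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNL',
                             database='pdb', expthr=1.0e-3,
                             sleep=2, timeout=120)

# cycle 1: continue from the first job; only the job id is returned,
# because results are not fetched on the seeding cycle
job_id_2 = psiBlastPDB(None, previousjobid=job_id, cycle=1)

# cycle 2 and later: fetch and parse the refined hits
job_id_3, record_3 = psiBlastPDB(None, previousjobid=job_id_2, cycle=2)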