Exemplo n.º 1
0
    def testAmbiguous(self):
        """Check the entropy of columns made only of ambiguous codes.

        Columns holding b, j or z (either case) are expected to score
        ``-log(1/2)`` and columns holding x are expected to score
        ``-log(1/20)``.
        """
        sequence = list("bjzxBJZX")
        alignment = array([sequence, sequence], dtype="|S1")

        # Expected number of equiprobable states for each of the 8 columns.
        states_per_column = array([2, 2, 2, 20] * 2)
        expected = -log(1.0 / states_per_column)
        observed = calcShannonEntropy(alignment)
        assert_array_almost_equal(expected, observed)
Exemplo n.º 2
0
    def testTwenty(self):
        """A one-column alignment listing twenty distinct letters once each
        is expected to have entropy ``-log(1/20)``."""
        letters = "ACDEFGHIKLMNPQRSTVWY"
        # One sequence per letter, each a single column wide.
        single_column = array([[letter] for letter in letters], dtype="|S1")

        expected = -log(1.0 / 20)
        observed = calcShannonEntropy(single_column)
        assert_array_almost_equal(expected, observed)
Exemplo n.º 3
0
def evol_conserv(msa, **kwargs):
    """Compute Shannon entropy for an MSA file and write it to disk.

    The entropy array is written to ``<prefix>.txt``.  When the *figent*
    keyword is truthy and Matplotlib can be imported, a plot is also saved
    as ``<prefix>.<figformat>``.

    :arg msa: MSA filename, passed to :func:`parseMSA`

    Keyword arguments read here: *prefix* (output prefix; defaults to the
    input name without extension, and without a trailing ``.gz``, plus
    ``_conserv``), *numformat* (default ``'%12g'``), *figent*, *figwidth*
    (default 8), *figheight* (default 6), *figargs* (extra positional
    arguments for :func:`showShannonEntropy`), *figformat* (default
    ``'pdf'``) and *figdpi* (default 300).  All keyword arguments are also
    forwarded to :func:`calcShannonEntropy`.
    """

    import prody
    from prody import parseMSA, calcShannonEntropy, showShannonEntropy
    # LOGGER is imported here so the ImportError branch below cannot itself
    # fail with a NameError.
    from prody import writeArray, LOGGER
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            # Strip the real extension hidden behind the .gz suffix.
            prefix, ext = splitext(prefix)
        prefix += '_conserv'
    msa = parseMSA(msa)
    entropy = calcShannonEntropy(msa, **kwargs)

    writeArray(prefix + '.txt',
               entropy, format=kwargs.get('numformat', '%12g'))

    if kwargs.get('figent'):
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            LOGGER.warn('Matplotlib could not be imported, '
                        'figures are not saved.')
        else:
            prody.SETTINGS['auto_show'] = False
            width = kwargs.get('figwidth', 8)
            height = kwargs.get('figheight', 6)
            figargs = kwargs.get('figargs', ())
            figformat = kwargs.get('figformat', 'pdf')
            figure = plt.figure(figsize=(width, height))
            try:
                showShannonEntropy(entropy, msa=msa, *figargs)
                figure.savefig(prefix + '.' + figformat, format=figformat,
                               dpi=kwargs.get('figdpi', 300))
            finally:
                # pyplot keeps figures alive until they are closed.
                plt.close(figure)
Exemplo n.º 4
0
    def testTwenty(self):
        """Twenty different letters in a single column: the entropy is
        expected to equal ``-log(1/20)``."""
        alphabet = 'ACDEFGHIKLMNPQRSTVWY'
        rows = [[residue] for residue in alphabet]
        alignment = array(rows, dtype='|S1')

        target = -log(1. / 20)
        computed = calcShannonEntropy(alignment)
        assert_array_almost_equal(target, computed)
Exemplo n.º 5
0
    def testSmallProbability(self):
        """Entropy of one column with a single 'A' among 999999 'C' entries.

        The expected value is ``-sum(p * log(p))`` for the two frequencies
        1/1000000 and 999999/1000000.
        """
        column = zeros((1000000,1), '|S1')
        # Fill the bulk first, then the single rare entry.
        column[1:] = 'C'
        column[0] = 'A'

        probabilities = array([1., 999999.]) / 1000000
        expected = - (probabilities * log(probabilities)).sum()
        observed = calcShannonEntropy(column)
        assert_array_almost_equal(expected, observed)
Exemplo n.º 6
0
    def testSmallProbability(self):
        """A column where one letter occurs once in a million rows should
        give the entropy computed directly from the two frequencies."""
        n_rows = 1000000
        alignment = zeros((n_rows, 1), "|S1")
        alignment[0] = "A"
        alignment[1:] = "C"

        # Frequencies of "A" and "C" respectively.
        freqs = array([1.0, 999999.0]) / 1000000
        weighted_logs = freqs * log(freqs)
        expected = -weighted_logs.sum()

        observed = calcShannonEntropy(alignment)
        assert_array_almost_equal(expected, observed)
Exemplo n.º 7
0
    def testSmallProbability(self):
        """Compare the computed entropy of a 1000000 x 1 alignment (one 'A',
        the rest 'C') against ``-sum(p * log(p))`` for those frequencies."""
        shape = (1000000, 1)
        one_column = zeros(shape, '|S1')
        one_column[0] = 'A'
        one_column[1:] = 'C'

        p = array([1., 999999.]) / 1000000
        reference = -(p * log(p)).sum()
        entropy = calcShannonEntropy(one_column)
        assert_array_almost_equal(reference, entropy)
Exemplo n.º 8
0
    def testAmbiguous(self):
        """Two identical sequences of ambiguous codes (b, j, z, x in both
        cases): the expected entropies are ``-log(1/2)`` for b/j/z columns
        and ``-log(1/20)`` for x columns."""
        codes = 'bjzxBJZX'
        alignment = array([list(codes) for _ in range(2)], dtype='|S1')

        # Lower-case block followed by the upper-case block.
        multiplicities = array([2, 2, 2, 20] * 2)
        reference = -log(1. / multiplicities)
        entropy = calcShannonEntropy(alignment)
        assert_array_almost_equal(reference, entropy)
Exemplo n.º 9
0
    def testGapDividend(self):
        """Entropy with ``omitgaps=True`` when one sequence is all gaps.

        The four columns are expected to score as 1, 2, 3 and 6
        equiprobable residues respectively.
        """
        sequences = ["AAAA", "AAAC", "AACD", "ACCE", "ACDF", "ACDG", "----"]
        alignment = array([list(seq) for seq in sequences], dtype="|S1")

        distinct_counts = array([1, 2, 3, 6])
        expected = -log(1.0 / distinct_counts)
        observed = calcShannonEntropy(alignment, omitgaps=True)
        assert_array_almost_equal(expected, observed)
Exemplo n.º 10
0
    def testSixSequences(self):
        """Entropy of a six-sequence, sixteen-column mixed-case alignment.

        Every group of four columns is expected to score as 1, 2, 3 and 6
        equiprobable residues.
        """
        sequences = (
            'AAAAaaaaAAAAaaaa',
            'AAACaaacAAACaaac',
            'AACDaacdAACDaacd',
            'ACCEacceacceACCE',
            'ACDFacdfacdfACDF',
            'ACDGacdgacdgACDG',
        )
        alignment = array([list(seq) for seq in sequences], dtype='|S1')

        pattern = [1, 2, 3, 6] * 4
        expected = -log(1. / array(pattern))
        observed = calcShannonEntropy(alignment)
        assert_array_almost_equal(expected, observed)
Exemplo n.º 11
0
    def testGapDividend(self):
        """Six residue sequences plus one all-gap sequence, evaluated with
        ``omitgaps=True``; expected entropies are ``-log(1/k)`` for
        k = 1, 2, 3, 6."""
        residue_rows = ['AAAA', 'AAAC', 'AACD', 'ACCE', 'ACDF', 'ACDG']
        gap_row = '----'
        all_rows = residue_rows + [gap_row]
        alignment = array([list(row) for row in all_rows], dtype='|S1')

        reference = -log(1. / array([1, 2, 3, 6]))
        entropy = calcShannonEntropy(alignment, omitgaps=True)
        assert_array_almost_equal(reference, entropy)
Exemplo n.º 12
0
    def testSixSequences(self):
        """Six sequences of sixteen mixed-case columns; the expected entropy
        pattern ``-log(1/[1, 2, 3, 6])`` repeats four times."""
        raw_sequences = [
            'AAAAaaaaAAAAaaaa',
            'AAACaaacAAACaaac',
            'AACDaacdAACDaacd',
            'ACCEacceacceACCE',
            'ACDFacdfacdfACDF',
            'ACDGacdgacdgACDG',
        ]
        as_characters = [list(sequence) for sequence in raw_sequences]
        alignment = array(as_characters, dtype='|S1')

        reference = -log(1. / array([1, 2, 3, 6] * 4))
        entropy = calcShannonEntropy(alignment)
        assert_array_almost_equal(reference, entropy)
Exemplo n.º 13
0
    def testGapDividend(self):
        """Run the entropy calculation with ``omitgaps=True`` on an
        alignment whose last sequence consists only of gaps.

        The expected result is ``-log(1/k)`` with k = 1, 2, 3, 6 for the
        four columns.
        """
        lines = 'AAAA AAAC AACD ACCE ACDF ACDG ----'.split()
        alignment = array([list(line) for line in lines], dtype='|S1')

        states = array([1, 2, 3, 6])
        wanted = -log(1. / states)
        got = calcShannonEntropy(alignment, omitgaps=True)
        assert_array_almost_equal(wanted, got)
Exemplo n.º 14
0
    def testSixSequences(self):
        """Check entropies for six sequences spanning sixteen columns.

        Expected values are ``-log(1/k)`` with k cycling through
        1, 2, 3, 6 four times.
        """
        text = (
            "AAAAaaaaAAAAaaaa",
            "AAACaaacAAACaaac",
            "AACDaacdAACDaacd",
            "ACCEacceacceACCE",
            "ACDFacdfacdfACDF",
            "ACDGacdgacdgACDG",
        )
        alignment = array([list(line) for line in text], dtype="|S1")

        cycle = [1, 2, 3, 6]
        wanted = -log(1.0 / array(cycle * 4))
        got = calcShannonEntropy(alignment)
        assert_array_almost_equal(wanted, got)
Exemplo n.º 15
0
def evol_conserv(msa, **kwargs):
    """Calculate Shannon entropy of an MSA file and save the results.

    Writes the entropy array to ``<prefix>.txt`` and, if the *figent*
    keyword is truthy and Matplotlib is importable, saves a figure to
    ``<prefix>.<figformat>``.

    :arg msa: MSA filename, passed to :func:`parseMSA`

    Recognised keywords: *prefix* (when missing, derived from the input
    name by dropping its extension, a trailing ``.gz`` included, and
    appending ``_conserv``), *numformat* (``'%12g'``), *figent*,
    *figwidth* (8), *figheight* (6), *figargs* (positional arguments for
    :func:`showShannonEntropy`), *figformat* (``'pdf'``), *figdpi* (300).
    The complete keyword set is forwarded to :func:`calcShannonEntropy`.
    """

    import prody
    from prody import parseMSA, calcShannonEntropy, showShannonEntropy
    # Import LOGGER locally: the warning in the ImportError branch would
    # otherwise depend on a module-level name this function never imports.
    from prody import writeArray, LOGGER
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, extension = splitext(msa)
        if extension.lower() == '.gz':
            # e.g. "x.fasta.gz" -> "x"
            prefix, extension = splitext(prefix)
        prefix += '_conserv'
    msa = parseMSA(msa)
    entropy = calcShannonEntropy(msa, **kwargs)

    writeArray(prefix + '.txt',
               entropy,
               format=kwargs.get('numformat', '%12g'))

    if kwargs.get('figent'):
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            LOGGER.warn('Matplotlib could not be imported, '
                        'figures are not saved.')
        else:
            prody.SETTINGS['auto_show'] = False
            width = kwargs.get('figwidth', 8)
            height = kwargs.get('figheight', 6)
            figargs = kwargs.get('figargs', ())
            figformat = kwargs.get('figformat', 'pdf')
            figure = plt.figure(figsize=(width, height))
            try:
                showShannonEntropy(entropy, msa=msa, *figargs)
                figure.savefig(prefix + '.' + figformat,
                               format=figformat,
                               dpi=kwargs.get('figdpi', 300))
            finally:
                # Release the figure; pyplot retains it until closed.
                plt.close(figure)
Exemplo n.º 16
0
 def calcEvolProperties(self,
                        resid='all',
                        refresh=False,
                        folder=None,
                        max_cols=None,
                        max_seqs=25000,
                        **kwargs):
     ''' Computes Evol properties, i.e. Shannon entropy and Mutual
     Information, from Pfam Multiple Sequence Alignments, for a given
     residue or for all Pfam domains found.

     :arg resid: ``'all'`` to process every entry of ``self.Pfam``, or a
         residue number; in the latter case only Pfam domains with a
         location segment containing it are processed
     :arg refresh: if **True**, recompute entries that already have a
         ``'mapping'``; also passed to ``self._searchPfam``
     :arg folder: resolved from SETTINGS when **None**, but otherwise
         unused by the current implementation
     :arg max_cols: accepted but unused by the current implementation
     :arg max_seqs: the row-occupancy refinement loop stops once the MSA
         has at most this many sequences
     :arg kwargs: forwarded to both ``parseMSA`` and the final
         ``refineMSA`` call

     :returns: dict mapping each selected Pfam ID to its entry in
         ``self.Pfam``; entries carry the keys ``'mapping'``,
         ``'ref_MSA'``, ``'entropy'``, ``'MutInfo'`` and ``'DirInfo'``
         (the last one is left as NaN). If fetching, parsing, slicing or
         mapping fails, ``'mapping'`` holds the error message instead.

     :raises RuntimeError: if no Pfam domain contains *resid*
     '''
     assert type(refresh) is bool
     # recover Pfam mapping (if not found already)
     self._searchPfam(refresh=refresh)
     if resid == 'all':
         PF_list = list(self.Pfam)
     else:
         # get list of Pfam domains containing resid
         PF_list = [
             k for k in self.Pfam if any(
                 int(segment['start']) <= resid <= int(segment['end'])
                 for segment in self.Pfam[k]['locations'])
         ]
         if not PF_list:
             raise RuntimeError(
                 'No Pfam domain for resid {}.'.format(resid))
         if len(PF_list) > 1:
             LOGGER.warn('Residue {} is found in multiple ({}) Pfam domains.'
                         .format(resid, len(PF_list)))
     if folder is None:
         folder = SETTINGS.get('rhapsody_local_folder', './')
     # iterate over Pfam families
     for PF in PF_list:
         d = self.Pfam[PF]
         # skip if properties are pre-computed
         if not refresh and d.get('mapping') is not None:
             continue
         d['mapping'] = None
         d['ref_MSA'] = None
         d['entropy'] = np.nan
         d['MutInfo'] = np.nan
         d['DirInfo'] = np.nan
         try:
             LOGGER.info('Processing {}...'.format(PF))
             # fetch & parse MSA without saving downloaded MSA
             f = fetchPfamMSA(PF)
             try:
                 msa = parseMSA(f, **kwargs)
             finally:
                 # remove the downloaded file even when parsing fails
                 os.remove(f)
             # slice MSA to match all segments of the Uniprot sequence
             sliced_msa, indexes = self._sliceMSA(msa)
             # get mapping between Uniprot sequence and Pfam domain
             d['mapping'] = self._mapUniprot2Pfam(PF, sliced_msa, indexes)
         except Exception as e:
             # record the failure and move on to the next family
             LOGGER.warn('{}: {}'.format(PF, e))
             d['mapping'] = str(e)
             continue
         try:
             # refine MSA ('seqid' param. is set as in PolyPhen-2)
             # raise the row-occupancy threshold in steps of 0.02 until
             # the MSA is small enough or the threshold reaches 1
             rowocc = 0.6
             while True:
                 sliced_msa = refineMSA(sliced_msa, rowocc=rowocc)
                 rowocc += 0.02
                 if sliced_msa.numSequences() <= max_seqs or rowocc >= 1:
                     break
             ref_msa = refineMSA(sliced_msa, seqid=0.94, **kwargs)
             d['ref_MSA'] = ref_msa
             # compute evolutionary properties
             d['entropy'] = calcShannonEntropy(ref_msa)
             d['MutInfo'] = buildMutinfoMatrix(ref_msa)
         except Exception as e:
             # best effort: properties not computed keep their defaults
             LOGGER.warn('{}: {}'.format(PF, e))
     return {k: self.Pfam[k] for k in PF_list}
Exemplo n.º 17
0
def evol_coevol(msa, **kwargs):
    """Compute mutual information for an MSA file and write the matrices.

    The raw mutual information matrix is always written to
    ``<prefix>.txt``.  One additional matrix is written per requested
    normalization (``<prefix>_norm_<name>.txt``) and correction
    (``<prefix>_corr_<name>.txt``).  Heatmap files and figures are written
    alongside each matrix when requested.

    :arg msa: MSA filename, passed to :func:`parseMSA`

    Keyword arguments read here: *prefix* (defaults to the input name
    without extension, and without a trailing ``.gz``, plus ``_mutinfo``),
    *numformat* (``'%12g'``), *heatmap* (**False**), *normalization* and
    *correction* (iterables of names, or **None**), *figcoevol*, *cmin*,
    *cmax*, *figwidth* (8), *figheight* (6), *xlabel*, *title*,
    *figformat* (``'pdf'``) and *figdpi* (300).  All keyword arguments are
    also forwarded to :func:`buildMutinfoMatrix` and
    :func:`calcShannonEntropy`.
    """

    from numpy import arange

    import prody
    from prody import parseMSA, buildMutinfoMatrix, showMutinfoMatrix
    from prody import applyMutinfoCorr, calcShannonEntropy
    from prody import writeArray, LOGGER, applyMutinfoNorm, writeHeatmap
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            prefix, ext = splitext(prefix)
        prefix += '_mutinfo'

    msa = parseMSA(msa)
    mutinfo = buildMutinfoMatrix(msa, **kwargs)
    numformat = kwargs.get('numformat', '%12g')
    heatmap = kwargs.get('heatmap', False)
    if heatmap:
        hmargs = {
                  'xlabel': 'Residue', 'ylabel': 'Residue',
                  'xorigin': 1, 'xstep': 1,
                  'residue': arange(msa.numResidues())}

    # (what, which) pairs; (None, None) stands for the raw matrix.
    todo = [(None, None)]
    norm = kwargs.get('normalization', [])
    corr = kwargs.get('correction', [])
    if norm is not None:
        if 'joint' in norm:
            todo.append(('norm', 'joint'))
        for which in norm:
            # 'joint' was queued above; skipping it here avoids computing
            # and writing the joint-normalized matrix a second time.
            if which == 'joint':
                continue
            todo.append(('norm', which))
    if corr is not None:
        for which in corr:
            todo.append(('corr', which))
    # Entropy is computed lazily, only if a normalization needs it.
    entropy = None

    for what, which in todo:
        if what is None:
            matrix = mutinfo
            suffix = ''
            tuffix = ' Mutual Information'
        elif which == 'joint':
            LOGGER.info('Applying {0} normalization.'.format(repr(which)))
            matrix = buildMutinfoMatrix(msa, norm=True, **kwargs)
            suffix = '_norm_joint'
            tuffix = ' MI - Normalization: ' + which
        elif what == 'norm':
            LOGGER.info('Applying {0} normalization.'.format(repr(which)))
            if entropy is None:
                entropy = calcShannonEntropy(msa, **kwargs)
            matrix = applyMutinfoNorm(mutinfo, entropy, norm=which)
            suffix = '_norm_' + which
            tuffix = ' MI - Normalization: ' + which
        else:
            LOGGER.info('Applying {0} correction.'.format(repr(which)))
            matrix = applyMutinfoCorr(mutinfo, which)
            suffix = '_corr_' + which
            tuffix = ' MI - Correction: ' + which

        writeArray(prefix + suffix + '.txt', matrix, format=numformat)

        if heatmap:
            writeHeatmap(prefix + suffix + '.hm', matrix,
                         title=msa.getTitle() + tuffix, **hmargs)

        if kwargs.get('figcoevol'):
            try:
                import matplotlib.pyplot as plt
            except ImportError:
                LOGGER.warn('Matplotlib could not be imported, '
                            'figures are not saved.')
            else:
                cmin = kwargs.get('cmin', matrix.min())
                cmax = kwargs.get('cmax', matrix.max())
                prody.SETTINGS['auto_show'] = False
                width = kwargs.get('figwidth', 8)
                height = kwargs.get('figheight', 6)
                xlabel = kwargs.get('xlabel')
                title = kwargs.get('title')
                figformat = kwargs.get('figformat', 'pdf')
                figure = plt.figure(figsize=(width, height))
                try:
                    showMutinfoMatrix(matrix, msa=msa, clim=(cmin, cmax),
                                      xlabel=xlabel, title=title)
                    figure.savefig(prefix + suffix + '.' + figformat,
                                   format=figformat,
                                   dpi=kwargs.get('figdpi', 300))
                finally:
                    # One figure is created per matrix; close each so
                    # they do not accumulate across loop iterations.
                    plt.close(figure)