Python parseMSA 예제들, prody.parseMSA Python 예제들

예제 #1

0

파일 보기

파일: evol_conserv.py 프로젝트: barettog1/ProDy

def evol_conserv(msa, **kwargs):
    """Compute Shannon entropy for the MSA file *msa* and write it to disk.

    The entropy array is written to ``<prefix>.txt``.  When the ``figent``
    keyword is true and Matplotlib can be imported, a figure is also saved
    to ``<prefix>.<figformat>``.

    Keyword arguments read here (all of *kwargs* is additionally forwarded
    to ``calcShannonEntropy``):

    * ``prefix`` -- output filename prefix; by default the MSA filename
      without its extension (and without a trailing ``.gz``) plus
      ``'_conserv'``
    * ``numformat`` -- number format for the text output, default ``'%12g'``
    * ``figent`` -- save a figure when true
    * ``figwidth``, ``figheight`` -- figure size, defaults 8 and 6
    * ``figargs`` -- extra positional arguments for ``showShannonEntropy``
    * ``figformat`` -- figure format, default ``'pdf'``
    * ``figdpi`` -- figure resolution, default 300
    """

    import prody
    from prody import parseMSA, calcShannonEntropy, showShannonEntropy
    # LOGGER is needed for the warning issued when Matplotlib is missing.
    from prody import writeArray, LOGGER
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            # drop the format extension hidden behind ``.gz`` as well
            prefix, _ = splitext(prefix)
        prefix += '_conserv'
    msa = parseMSA(msa)
    entropy = calcShannonEntropy(msa, **kwargs)

    writeArray(prefix + '.txt',
               entropy, format=kwargs.get('numformat', '%12g'))

    if kwargs.get('figent'):
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            LOGGER.warn('Matplotlib could not be imported, '
                        'figures are not saved.')
        else:
            prody.SETTINGS['auto_show'] = False
            width = kwargs.get('figwidth', 8)
            height = kwargs.get('figheight', 6)
            figargs = kwargs.get('figargs', ())
            figure = plt.figure(figsize=(width, height))
            showShannonEntropy(entropy, *figargs, msa=msa)
            figformat = kwargs.get('figformat', 'pdf')
            figure.savefig(prefix + '.' + figformat, format=figformat,
                           dpi=kwargs.get('figdpi', 300))

예제 #2

0

파일 보기

파일: evol_occupancy.py 프로젝트: npabon/ProDy

def evol_occupancy(msa, **kwargs):
    """Compute occupancy for the MSA file *msa* and write it to disk.

    One text file per requested axis is written to
    ``<prefix>_<axis>.txt``.  When the ``figocc`` keyword is true and
    Matplotlib can be imported, one figure per axis is saved to
    ``<prefix>_<axis>.<figformat>``.

    Keyword arguments read here:

    * ``prefix`` -- output filename prefix; by default the MSA filename
      without its extension (and without a trailing ``.gz``) plus
      ``'_occupancy'``
    * ``occaxis`` -- axis passed to ``calcMSAOccupancy`` as ``occ``
      (default ``'row'``); ``'both'`` computes ``'row'`` and ``'col'``
    * ``numformat`` -- number format for the text output, default ``'%12g'``
    * ``figocc`` -- save figures when true
    * ``figwidth``, ``figheight`` -- figure size, defaults 8 and 6
    * ``label``, ``xlabel``, ``title`` -- forwarded to ``showMSAOccupancy``
    * ``figformat`` -- figure format, default ``'pdf'``
    * ``figdpi`` -- figure resolution, default 300
    """

    import prody
    from prody import parseMSA, calcMSAOccupancy, showMSAOccupancy, writeArray
    # LOGGER is needed for the warning issued when Matplotlib is missing.
    from prody import LOGGER
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            # drop the format extension hidden behind ``.gz`` as well
            prefix, _ = splitext(prefix)
        prefix += '_occupancy'

    msa = parseMSA(msa)

    numformat = kwargs.get('numformat', '%12g')
    occaxis = kwargs.get('occaxis', 'row')
    axes = ['row', 'col'] if occaxis == 'both' else [occaxis]
    # Keep suffixes in a list in every case: indexing a plain string such
    # as '_row' by position would yield only its first character ('_').
    suffixes = ['_' + axis for axis in axes]
    occupancy = [calcMSAOccupancy(msa, occ=axis) for axis in axes]

    for suffix, occ in zip(suffixes, occupancy):
        writeArray(prefix + suffix + '.txt', occ, format=numformat)

    if not kwargs.get('figocc'):
        return

    try:
        import matplotlib.pyplot as plt
    except ImportError:
        LOGGER.warn('Matplotlib could not be imported, '
                    'figures are not saved.')
        return

    prody.SETTINGS['auto_show'] = False
    width = kwargs.get('figwidth', 8)
    height = kwargs.get('figheight', 6)
    xlabel = kwargs.get('xlabel')
    title = kwargs.get('title')
    label = kwargs.get('label')
    figformat = kwargs.get('figformat', 'pdf')
    figdpi = kwargs.get('figdpi', 300)

    for suffix, occ in zip(suffixes, occupancy):
        figure = plt.figure(figsize=(width, height))
        showMSAOccupancy(msa=msa,
                         occ=occ,
                         label=label,
                         xlabel=xlabel,
                         title=title)
        figure.savefig(prefix + suffix + '.' + figformat,
                       format=figformat,
                       dpi=figdpi)

예제 #3

0

파일 보기

파일: evol_occupancy.py 프로젝트: barettog1/ProDy

def evol_occupancy(msa, **kwargs):
    """Compute occupancy for the MSA file *msa* and write it to disk.

    One text file per requested axis is written to
    ``<prefix>_<axis>.txt``.  When the ``figocc`` keyword is true and
    Matplotlib can be imported, one figure per axis is saved to
    ``<prefix>_<axis>.<figformat>``.

    Keyword arguments read here:

    * ``prefix`` -- output filename prefix; by default the MSA filename
      without its extension (and without a trailing ``.gz``) plus
      ``'_occupancy'``
    * ``occaxis`` -- axis passed to ``calcMSAOccupancy`` as ``occ``
      (default ``'row'``); ``'both'`` computes ``'row'`` and ``'col'``
    * ``numformat`` -- number format for the text output, default ``'%12g'``
    * ``figocc`` -- save figures when true
    * ``figwidth``, ``figheight`` -- figure size, defaults 8 and 6
    * ``label``, ``xlabel``, ``title`` -- forwarded to ``showMSAOccupancy``
    * ``figformat`` -- figure format, default ``'pdf'``
    * ``figdpi`` -- figure resolution, default 300
    """

    import prody
    from prody import parseMSA, calcMSAOccupancy, showMSAOccupancy, writeArray
    # LOGGER is needed for the warning issued when Matplotlib is missing.
    from prody import LOGGER
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            # drop the format extension hidden behind ``.gz`` as well
            prefix, _ = splitext(prefix)
        prefix += '_occupancy'

    msa = parseMSA(msa)

    numformat = kwargs.get('numformat', '%12g')
    occaxis = kwargs.get('occaxis', 'row')
    if occaxis == 'both':
        axes = ['row', 'col']
    else:
        axes = [occaxis]
    # Suffixes are always a list, one entry per axis.  Indexing the bare
    # string '_row' by position would return just '_' and mangle filenames.
    suffixes = ['_' + axis for axis in axes]
    occupancy = [calcMSAOccupancy(msa, occ=axis) for axis in axes]

    for suffix, occ in zip(suffixes, occupancy):
        writeArray(prefix + suffix + '.txt', occ, format=numformat)

    if not kwargs.get('figocc'):
        return

    try:
        import matplotlib.pyplot as plt
    except ImportError:
        LOGGER.warn('Matplotlib could not be imported, '
                    'figures are not saved.')
        return

    prody.SETTINGS['auto_show'] = False
    width = kwargs.get('figwidth', 8)
    height = kwargs.get('figheight', 6)
    xlabel = kwargs.get('xlabel')
    title = kwargs.get('title')
    label = kwargs.get('label')
    figformat = kwargs.get('figformat', 'pdf')
    figdpi = kwargs.get('figdpi', 300)

    for suffix, occ in zip(suffixes, occupancy):
        figure = plt.figure(figsize=(width, height))
        showMSAOccupancy(msa=msa, occ=occ, label=label,
                         xlabel=xlabel, title=title)
        figure.savefig(prefix + suffix + '.' + figformat, format=figformat,
                       dpi=figdpi)

예제 #4

0

파일 보기

파일: evol_refine.py 프로젝트: nffaruk/ProDy

def evol_refine(msa, **kwargs):
    """Refine the MSA file *msa* and write the result to a new file.

    The alignment is parsed with ``parseMSA``, refined with ``refineMSA``
    and written with ``writeMSA``; *kwargs* is forwarded to both
    ``refineMSA`` and ``writeMSA``.

    ``outname`` keyword sets the output filename.  When it is not given the
    name is derived from *msa* by inserting ``'_refined'`` before the
    extension, e.g. ``x.fasta`` -> ``x_refined.fasta`` and
    ``x.fasta.gz`` -> ``x_refined.fasta.gz``.
    """

    from prody import parseMSA, refineMSA, writeMSA, LOGGER
    from os.path import splitext

    outname = kwargs.get('outname')
    if outname is None:
        outname, ext = splitext(msa)
        if ext.lower() == '.gz':
            # Split the already-stripped name (not *msa* again, which would
            # be a no-op) so '_refined' lands before the format extension.
            outname, inner_ext = splitext(outname)
            ext = inner_ext + ext
        outname += '_refined' + ext

    writeMSA(outname, refineMSA(parseMSA(msa), **kwargs), **kwargs)
    LOGGER.info('Refined MSA is written in file: ' + outname)

예제 #5

0

파일 보기

파일: evol_refine.py 프로젝트: barettog1/ProDy

def evol_refine(msa, **kwargs):
    """Refine the MSA file *msa* and write the result to a new file.

    The alignment is parsed with ``parseMSA``, refined with ``refineMSA``
    and written with ``writeMSA``; *kwargs* is forwarded to both
    ``refineMSA`` and ``writeMSA``.

    ``outname`` keyword sets the output filename.  When it is not given the
    name is derived from *msa* by inserting ``'_refined'`` before the
    extension, e.g. ``x.fasta`` -> ``x_refined.fasta`` and
    ``x.fasta.gz`` -> ``x_refined.fasta.gz``.
    """

    import prody
    from prody import parseMSA, refineMSA, writeMSA, LOGGER
    from os.path import splitext

    outname = kwargs.get('outname')
    if outname is None:
        outname, ext = splitext(msa)
        if ext.lower() == '.gz':
            # Split the already-stripped name (not *msa* again, which would
            # be a no-op) so '_refined' lands before the format extension.
            outname, inner_ext = splitext(outname)
            ext = inner_ext + ext
        outname += '_refined' + ext

    writeMSA(outname, refineMSA(parseMSA(msa), **kwargs), **kwargs)
    LOGGER.info('Refined MSA is written in file: ' + outname)

예제 #6

0

파일 보기

파일: evol_refine.py 프로젝트: njekin/ProDy

def evol_refine(msa, **kwargs):
    """Refine the MSA file *msa* and write the result to a new file.

    The alignment is parsed with ``parseMSA``, refined with ``refineMSA``
    and written with ``writeMSA``; *kwargs* is forwarded to both
    ``refineMSA`` and ``writeMSA``.

    ``outname`` keyword sets the output filename.  When it is not given the
    name is derived from *msa* by inserting ``"_refined"`` before the
    extension, e.g. ``x.fasta`` -> ``x_refined.fasta`` and
    ``x.fasta.gz`` -> ``x_refined.fasta.gz``.
    """

    import prody
    from prody import parseMSA, refineMSA, writeMSA, LOGGER
    from os.path import splitext

    outname = kwargs.get("outname")
    if outname is None:
        outname, ext = splitext(msa)
        if ext.lower() == ".gz":
            # Split the already-stripped name (not *msa* again, which would
            # be a no-op) so "_refined" lands before the format extension.
            outname, inner_ext = splitext(outname)
            ext = inner_ext + ext
        outname += "_refined" + ext

    writeMSA(outname, refineMSA(parseMSA(msa), **kwargs), **kwargs)
    LOGGER.info("Refined MSA is written in file: " + outname)

예제 #7

0

파일 보기

파일: evol_merge.py 프로젝트: npabon/ProDy

def evol_merge(*msa, **kwargs):
    """Merge two or more MSA files and write the merged alignment.

    Each filename in *msa* is parsed with ``parseMSA``; the results are
    combined with ``mergeMSA`` and written with ``writeMSA``, to which
    *kwargs* is forwarded.

    ``outname`` keyword sets the output filename.  By default it is the
    title of the first MSA file plus ``'_merged'`` and the extension that
    ``MSAEXTMAP`` maps to that file's format.

    Raises :exc:`ValueError` when fewer than two filenames are given and
    :exc:`IOError` naming the file that could not be parsed.
    """

    import prody
    from prody import parseMSA, mergeMSA, LOGGER, writeMSA, MSAFile
    from prody.sequence.msafile import MSAEXTMAP
    from os.path import splitext
    if len(msa) < 2:
        raise ValueError('multiple msa filenames must be specified')

    # Parse one file at a time so that the failing filename is known when
    # reporting the error; a comprehension variable is not visible outside
    # the comprehension on Python 3.
    msaobj = []
    for fn in msa:
        try:
            msaobj.append(parseMSA(fn))
        except Exception:
            raise IOError('failed to parse {0}'.format(fn))

    msafile = MSAFile(msa[0])

    outname = kwargs.get('outname') or (msafile.getTitle() + '_merged' +
                                        MSAEXTMAP[msafile.format])
    writeMSA(outname, mergeMSA(*msaobj), **kwargs)
    LOGGER.info('Merged MSA is saved as: {0}'.format(outname))

예제 #8

0

파일 보기

def evol_conserv(msa, **kwargs):
    """Compute Shannon entropy for the MSA file *msa* and write it to disk.

    The entropy array is written to ``<prefix>.txt``.  When the ``figent``
    keyword is true and Matplotlib can be imported, a figure is also saved
    to ``<prefix>.<figformat>``.

    Keyword arguments read here (all of *kwargs* is additionally forwarded
    to ``calcShannonEntropy``):

    * ``prefix`` -- output filename prefix; by default the MSA filename
      without its extension (and without a trailing ``.gz``) plus
      ``'_conserv'``
    * ``numformat`` -- number format for the text output, default ``'%12g'``
    * ``figent`` -- save a figure when true
    * ``figwidth``, ``figheight`` -- figure size, defaults 8 and 6
    * ``figargs`` -- extra positional arguments for ``showShannonEntropy``
    * ``figformat`` -- figure format, default ``'pdf'``
    * ``figdpi`` -- figure resolution, default 300
    """

    import prody
    from prody import parseMSA, calcShannonEntropy, showShannonEntropy
    # LOGGER is needed for the warning issued when Matplotlib is missing.
    from prody import writeArray, LOGGER
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            # drop the format extension hidden behind ``.gz`` as well
            prefix, _ = splitext(prefix)
        prefix += '_conserv'
    msa = parseMSA(msa)
    entropy = calcShannonEntropy(msa, **kwargs)

    writeArray(prefix + '.txt',
               entropy,
               format=kwargs.get('numformat', '%12g'))

    if kwargs.get('figent'):
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            LOGGER.warn('Matplotlib could not be imported, '
                        'figures are not saved.')
        else:
            prody.SETTINGS['auto_show'] = False
            width = kwargs.get('figwidth', 8)
            height = kwargs.get('figheight', 6)
            figargs = kwargs.get('figargs', ())
            figure = plt.figure(figsize=(width, height))
            showShannonEntropy(entropy, *figargs, msa=msa)
            figformat = kwargs.get('figformat', 'pdf')
            figure.savefig(prefix + '.' + figformat,
                           format=figformat,
                           dpi=kwargs.get('figdpi', 300))

예제 #9

0

파일 보기

파일: evol_coevol.py 프로젝트: agiorgetti1971/ProDy-1

def evol_coevol(msa, **kwargs):
    """Build mutual information matrices for the MSA file *msa*.

    The raw matrix from ``buildMutinfoMatrix`` is always written to
    ``<prefix>.txt``.  For every requested normalization and correction an
    additional matrix is written to ``<prefix>_norm_<name>.txt`` or
    ``<prefix>_corr_<name>.txt``.  Optionally a heatmap file (``.hm``) and
    a figure are written next to each text file.

    Keyword arguments read here (all of *kwargs* is additionally forwarded
    to ``buildMutinfoMatrix`` and ``calcShannonEntropy``):

    * ``prefix`` -- output filename prefix; by default the MSA filename
      without its extension (and without a trailing ``.gz``) plus
      ``'_mutinfo'``
    * ``numformat`` -- number format for the text output, default ``'%12g'``
    * ``heatmap`` -- also write heatmap files when true
    * ``normalization`` -- iterable of normalization names; ``'joint'`` is
      computed by ``buildMutinfoMatrix(..., norm=True)``, the others by
      ``applyMutinfoNorm``
    * ``correction`` -- iterable of names passed to ``applyMutinfoCorr``
    * ``figcoevol`` -- save figures when true
    * ``cmin``, ``cmax`` -- color limits, default to the matrix min/max
    * ``figwidth``, ``figheight`` -- figure size, defaults 8 and 6
    * ``xlabel``, ``title`` -- forwarded to ``showMutinfoMatrix``
    * ``figformat`` -- figure format, default ``'pdf'``
    * ``figdpi`` -- figure resolution, default 300
    """

    from numpy import arange

    import prody
    from prody import parseMSA, buildMutinfoMatrix, showMutinfoMatrix
    from prody import applyMutinfoCorr, calcShannonEntropy
    from prody import writeArray, LOGGER, applyMutinfoNorm, writeHeatmap
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            # drop the format extension hidden behind ``.gz`` as well
            prefix, _ = splitext(prefix)
        prefix += '_mutinfo'

    msa = parseMSA(msa)
    mutinfo = buildMutinfoMatrix(msa, **kwargs)
    numformat = kwargs.get('numformat', '%12g')
    heatmap = kwargs.get('heatmap', False)
    if heatmap:
        hmargs = {
                  'xlabel': 'Residue', 'ylabel': 'Residue',
                  'xorigin': 1, 'xstep': 1,
                  'residue': arange(msa.numResidues())}

    # Work list of (kind, name) pairs; (None, None) is the raw matrix.
    todo = [(None, None)]
    norm = kwargs.get('normalization', [])
    corr = kwargs.get('correction', [])
    if norm is not None:
        if 'joint' in norm:
            todo.append(('norm', 'joint'))
        for which in norm:
            # 'joint' was queued above; skip it here so that it is not
            # computed and written a second time.
            if which == 'joint':
                continue
            todo.append(('norm', which))
    if corr is not None:
        for which in corr:
            todo.append(('corr', which))
    # Shannon entropy is computed lazily, only if a normalization needs it.
    entropy = None

    for what, which in todo:
        if what is None:
            matrix = mutinfo
            suffix = ''
            tuffix = ' Mutual Information'
        elif which == 'joint':
            LOGGER.info('Applying {0} normalization.'.format(repr(which)))
            matrix = buildMutinfoMatrix(msa, norm=True, **kwargs)
            suffix = '_norm_joint'
            tuffix = ' MI - Normalization: ' + which
        elif what == 'norm':
            LOGGER.info('Applying {0} normalization.'.format(repr(which)))
            if entropy is None:
                entropy = calcShannonEntropy(msa, **kwargs)
            matrix = applyMutinfoNorm(mutinfo, entropy, norm=which)
            suffix = '_norm_' + which
            tuffix = ' MI - Normalization: ' + which
        else:
            LOGGER.info('Applying {0} correction.'.format(repr(which)))
            matrix = applyMutinfoCorr(mutinfo, which)
            suffix = '_corr_' + which
            tuffix = ' MI - Correction: ' + which

        writeArray(prefix + suffix + '.txt', matrix, format=numformat)

        if heatmap:
            writeHeatmap(prefix + suffix + '.hm', matrix,
                         title=msa.getTitle() + tuffix, **hmargs)

        if kwargs.get('figcoevol'):
            try:
                import matplotlib.pyplot as plt
            except ImportError:
                LOGGER.warn('Matplotlib could not be imported, '
                            'figures are not saved.')
            else:
                cmin = kwargs.get('cmin', matrix.min())
                cmax = kwargs.get('cmax', matrix.max())
                prody.SETTINGS['auto_show'] = False
                width = kwargs.get('figwidth', 8)
                height = kwargs.get('figheight', 6)
                xlabel = kwargs.get('xlabel')
                title = kwargs.get('title')
                figure = plt.figure(figsize=(width, height))
                showMutinfoMatrix(matrix, msa=msa, clim=(cmin, cmax),
                                  xlabel=xlabel, title=title)

                figformat = kwargs.get('figformat', 'pdf')
                figure.savefig(prefix + suffix + '.' + figformat,
                               format=figformat,
                               dpi=kwargs.get('figdpi', 300))

예제 #10

0

파일 보기

파일: evol_rankorder.py 프로젝트: njekin/ProDy

def evol_rankorder(mutinfo, **kwargs):
    """Rank residue pairs of a mutual information matrix and write a table.

    *mutinfo* is the path of a text file containing a square matrix, which
    is loaded with ``np.loadtxt``.  Pairs from the strict lower triangle are
    sorted by decreasing value and written as tab-separated rows to
    ``<outname>_rankorder<ext>``.

    Residue numbers come from the first PDB chain whose CA selection has as
    many atoms as the matrix has rows, otherwise from the start/end numbers
    of a sequence in the MSA, otherwise serial numbers starting at 1.

    Keyword arguments read here:

    * ``delimiter`` -- passed to ``np.loadtxt``
    * ``msa``, ``label`` -- MSA filename and sequence label used for
      residue numbering
    * ``pdb`` -- PDB input passed to ``parsePDB``
    * ``outname`` -- output filename; ``'.txt'`` is used when it has no
      extension.  By default the name is derived from *mutinfo*.
    * ``zscore`` -- when true, columns are normalized to zero mean and
      unit standard deviation before ranking
    * ``numpairs`` -- maximum number of pairs to report
    * ``seqsep`` -- a pair is reported when ``row > column + seqsep``
    * ``usedist``, ``dist`` -- when ``usedist`` is true and a matching PDB
      was found, pairs are filtered by ``distance > dist`` instead

    NOTE(review): ``numpairs`` and ``seqsep`` (and ``dist`` with
    ``usedist``) have no defaults here and are compared numerically, so
    callers are presumably expected to always supply them -- confirm.
    ``np`` and ``calcAllDist`` are not defined in this function and are
    assumed to be provided at module level.

    Raises :exc:`ValueError` when the matrix is not square.
    """
    from prody import parseMSA, LOGGER, parsePDB, calcMSAOccupancy
    from prody.utilities import openFile
    from os.path import splitext

    delimiter = kwargs.get('delimiter')
    mi = np.loadtxt(str(mutinfo), delimiter=delimiter)

    ndim, shape = mi.ndim, mi.shape
    if ndim != 2 or shape[0] != shape[1]:
        raise ValueError('mutinfo must contain a square matrix')

    msa, label = kwargs.get('msa'), kwargs.get('label')

    pdb, pdbflag = kwargs.get('pdb'), False

    resnum = None

    if pdb is not None:
        try:
            pdb = parsePDB(pdb)
        except Exception:
            LOGGER.info('Could not parse PDB, ignoring PDB input')
        else:
            chains = list(pdb.iterChains())
            for chain in chains:
                sel = chain.select('protein and name CA')
                # A selection that matches nothing is treated as a size
                # mismatch instead of failing on ``None``.
                if sel is not None and sel.numAtoms() == shape[0]:
                    resnum = sel.getResnums()
                    coordset = sel.getCoordsets()
                    distance = calcAllDist(coordset)
                    pdbflag = True
                    label = pdb.getTitle()
                    LOGGER.info('Residue numbers will be based on pdb: '
                                '{0}'.format(pdb.getTitle()))
                    break
                else:
                    LOGGER.info('Number of residues in PDB does not match '
                                'mutinfo matrix, ignoring PDB input')

    if not pdbflag:
        if msa is not None:
            msa = parseMSA(msa)
            if msa.numResidues() != shape[0]:
                LOGGER.info('Input MSA and mutinfo do not have similar no '
                            'of residues, ignoring MSA')
            else:
                index = msa.getIndex(label)
                if index is None:
                    if label is not None:
                        LOGGER.info('Could not find given label in MSA, '
                                    'using complete sequence from MSA')
                    # fall back to the sequence with the highest occupancy
                    occ = calcMSAOccupancy(msa._msa, 'row')
                    index = np.where(occ == occ.max())[0][0]
                    label, seq, start, end = msa[index]
                else:
                    label, seq, start, end = msa[index]
                if (start and end is not None) and (start < end):
                    resnum = np.arange(start, end + 1)
                    if len(resnum) != shape[0]:
                        LOGGER.info('Label: {0}/{1}-{2} and mutinfo do '
                                    'not have similar no of residues, using '
                                    'serial indexing'.format(label, start,
                                                             end))
                        label = 'Serial Index'
                        resnum = np.arange(1, shape[0] + 1)
                    else:
                        LOGGER.info('Residue numbers will be based on label: '
                                    '{0}'.format(label))
                else:
                    LOGGER.info('Could not identify residue indexes from MSA'
                                    ' using serial indexing')
                    label = 'Serial Index'
                    resnum = np.arange(1, shape[0] + 1)
        else:
            LOGGER.info('MSA or PDB not given or does not match mutinfo, '
                        'using serial indexing')
            resnum = np.arange(1, shape[0] + 1)

    if resnum is None:
        # The MSA was ignored because its size did not match the matrix;
        # use serial numbering rather than failing on ``resnum[0]`` below.
        resnum = np.arange(1, shape[0] + 1)

    LOGGER.info('Residue numbers start and end with {0}-{1}'.
                format(str(resnum[0]), str(resnum[-1])))

    outname = kwargs.get('outname')
    if outname is None:
        outname, ext = splitext(str(mutinfo))
        if ext.lower() == '.gz':
            # Split the already-stripped name (splitting *mutinfo* again is
            # a no-op) so '_rankorder' lands before the inner extension.
            outname, inner_ext = splitext(outname)
            ext = inner_ext + ext
    else:
        outname, ext = splitext(str(outname))
        # ``splitext`` returns '' (never None) when there is no extension
        if not ext:
            ext = '.txt'

    outname += '_rankorder' + ext
    zscore = kwargs.get('zscore')
    if zscore:
        LOGGER.info('zscore normalization applied such that each column '
                    'has 0 mean and standard deviation 1')
        header = 'Serial\tRow\tColumn\tZscore'
        mi = (mi - mi.mean(0)) / mi.std(0)
    else:
        header = 'Serial\tRow\tColumn\tMI'

    # Rank the strict lower triangle by decreasing value.
    mi_ind_start, mi_ind_end = np.tril_indices(shape[0], k=-1)
    mi_matrix = mi[mi_ind_start, mi_ind_end]
    sorted_index = mi_matrix.argsort(axis=None)[::-1]
    row = mi_ind_start[sorted_index]
    column = mi_ind_end[sorted_index]
    count = 1
    i = 0

    if label is None:
        label = 'Serial Index'

    numpairs = kwargs.get('numpairs')
    size = len(row)
    seqsep = kwargs.get('seqsep')
    usedist = kwargs.get('usedist')

    f = openFile(outname, 'wb')
    try:
        if not usedist or not pdbflag:
            if usedist:
                LOGGER.info('use-struct-sep set to true, but PDB not given or '
                            'incorrect residue number. Using sequence separation')
            elif pdbflag:
                LOGGER.info('use-dist not set, using sequence separation'
                            ' to report coevolving pairs')
            f.write(('Label: ' + label + '\t' + 'Residue Numbers: ' +
                     str(resnum[0]) + '-' + str(resnum[-1]) +
                     '\tSequence Separation:' + str(seqsep) + '\n'))
            if pdbflag:
                f.write((header + '\tDistance\n'))
                while count <= numpairs and i < size:
                    if row[i] > (column[i] + seqsep):
                        f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'.
                                format(count, resnum[row[i]],
                                       resnum[column[i]],
                                       mi[row[i], column[i]],
                                       distance[row[i], column[i]]))
                        count += 1
                    i += 1
            else:
                f.write((header + '\n'))
                while count <= numpairs and i < size:
                    if row[i] > (column[i] + seqsep):
                        f.write('{0}\t{1}\t{2}\t{3:.3f}\n'.
                                format(count, resnum[row[i]],
                                       resnum[column[i]],
                                       mi[row[i], column[i]]))
                        count += 1
                    i += 1
        else:
            structsep = kwargs.get('dist')
            # tab added before 'Distance Cutoff:' so the field is separated
            # from the residue range like the other header fields
            f.write(('Label: ' + label + '\t' + 'Residue Numbers: ' +
                     str(resnum[0]) + '-' + str(resnum[-1]) +
                     '\tDistance Cutoff:' + str(structsep) + '\n'))
            f.write((header + '\tDistance\n'))
            while count <= numpairs and i < size:
                if distance[row[i], column[i]] > structsep:
                    f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'.
                            format(count, resnum[row[i]], resnum[column[i]],
                                   mi[row[i], column[i]],
                                   distance[row[i], column[i]]))
                    count += 1
                i += 1
    finally:
        # close the output even if writing fails part-way
        f.close()

예제 #11

0

파일 보기

파일: test_analysis.py 프로젝트: njekin/ProDy

__author__ = 'Ahmet Bakan'
__copyright__ = 'Copyright (C) 2010-2012 Ahmet Bakan'

from prody.tests import TestCase

from numpy import array, log, zeros, char, ones
from numpy.testing import assert_array_equal, assert_array_almost_equal

from prody.tests.test_datafiles import *

from prody import LOGGER, calcShannonEntropy, buildMutinfoMatrix, parseMSA
from prody import calcMSAOccupancy, buildSeqidMatrix, uniqueSequences

# Set the ProDy logger verbosity to None (presumably to silence log output
# while the tests run -- confirm against the LOGGER documentation).
LOGGER.verbosity = None

# Reference alignment parsed once and shared by this module.
FASTA = parseMSA(pathDatafile('msa_Cys_knot.fasta'))
# Element-wise mask: True where the alignment character is alphabetic.
FASTA_ALPHA = char.isalpha(FASTA._msa)
# Upper-cased copy of the alignment characters for case-insensitive compares.
FASTA_UPPER = char.upper(FASTA._msa)

FASTA_NUMBER, FASTA_LENGTH = FASTA_ALPHA.shape
# Square matrix with one row/column per sequence; the diagonal is set to 1.
FASTA_EYE = zeros((FASTA_NUMBER, FASTA_NUMBER))
for i in range(FASTA_NUMBER):
    FASTA_EYE[i, i] = 1
    for j in range(i + 1, FASTA_NUMBER):
        # score: columns where both sequences hold the same character;
        # ncols: columns where at least one of them holds a letter.
        # NOTE(review): this excerpt ends before score/ncols are used --
        # the assignment into FASTA_EYE[i, j] is not visible here.
        score = 0.0
        ncols = 0
        for k in range(FASTA_LENGTH):
            if FASTA_ALPHA[i, k] or FASTA_ALPHA[j, k]:
                if FASTA_UPPER[i, k] == FASTA_UPPER[j, k]:
                    score += 1
                ncols += 1

예제 #12

0

파일 보기

파일: test_msa.py 프로젝트: npabon/ProDy

__author__ = 'Ahmet Bakan'
__copyright__ = 'Copyright (C) 2010-2012 Ahmet Bakan'

from prody.tests import TestCase

from numpy import array, log, zeros, char
from numpy.testing import assert_array_equal, assert_array_almost_equal

from prody.tests.test_datafiles import *

from prody import LOGGER, refineMSA, parseMSA, calcMSAOccupancy, mergeMSA
from prody import uniqueSequences

# Set the ProDy logger verbosity to None (presumably to silence log output
# while the tests run -- confirm against the LOGGER documentation).
LOGGER.verbosity = None

# Reference alignment parsed once and shared by the tests below.
FASTA = parseMSA(pathDatafile('msa_Cys_knot.fasta'))
# Element-wise mask: True where the alignment character is alphabetic.
FASTA_ALPHA = char.isalpha(FASTA._msa)
# Number of sequences as a float (multiplied by 1. to force float arithmetic).
NUMSEQ = FASTA.numSequences() * 1.


class TestRefinement(TestCase):
    def testLabel(self):
        """Refining by label must keep exactly the columns in which the
        labelled sequence has an alphabetic character."""
        target = 'FSHB_BOVIN'
        row = FASTA.getIndex(target)
        refined_msa = refineMSA(FASTA, label=target)

        # columns where the target sequence is alphabetic
        kept_columns = FASTA_ALPHA[row].nonzero()[0]
        expected = FASTA._getArray().take(kept_columns, 1)

        assert_array_equal(refined_msa._getArray(), expected)

    def testRowocc(self):

예제 #13

0

파일 보기

파일: test_msafile.py 프로젝트: Python3pkg/ProDy

from os.path import join
try:
    from io import StringIO
except ImportError:
    from io import StringIO

from numpy import array, log, zeros, char
from numpy.testing import assert_array_equal, dec

from prody.tests.datafiles import *
from prody.tests import TEMPDIR
from prody import MSA, MSAFile, parseMSA, LOGGER, writeMSA

# Set the ProDy logger verbosity to None (presumably to silence log output
# while the tests run -- confirm against the LOGGER documentation).
LOGGER.verbosity = None

# The same alignment parsed from three file formats ...
FASTA = parseMSA(pathDatafile('msa_Cys_knot.fasta'))
SELEX = parseMSA(pathDatafile('msa_Cys_knot.slx'))
STOCK = parseMSA(pathDatafile('msa_Cys_knot.sth'))
# ... and read sequence by sequence through MSAFile.  SELEX_LIST must read
# the '.slx' file; reading '.sth' here would make it a copy of STOCK_LIST
# and leave the SELEX reader untested.
FASTA_LIST = list(MSAFile(pathDatafile('msa_Cys_knot.fasta')))
SELEX_LIST = list(MSAFile(pathDatafile('msa_Cys_knot.slx')))
STOCK_LIST = list(MSAFile(pathDatafile('msa_Cys_knot.sth')))


class TestMSAFile(TestCase):
    def testMSAFile(self):
        """Sequences read through MSAFile must match the FASTA reference."""
        for other_format in (SELEX_LIST, STOCK_LIST):
            self.assertListEqual(FASTA_LIST, other_format)

    def testWriteFasta(self):

예제 #14

0

파일 보기

def evol_rankorder(mutinfo, **kwargs):
    """Rank residue pairs of a mutual information matrix and write a table.

    *mutinfo* is the path of a text file containing a square matrix, which
    is loaded with ``np.loadtxt``.  Pairs from the strict lower triangle are
    sorted by decreasing value and written as tab-separated rows to
    ``<outname>_rankorder<ext>``.

    Residue numbers come from the first PDB chain whose CA selection has as
    many atoms as the matrix has rows, otherwise from the start/end numbers
    of a sequence in the MSA, otherwise serial numbers starting at 1.

    Keyword arguments read here:

    * ``delimiter`` -- passed to ``np.loadtxt``
    * ``msa``, ``label`` -- MSA filename and sequence label used for
      residue numbering
    * ``pdb`` -- PDB input passed to ``parsePDB``
    * ``outname`` -- output filename; ``'.txt'`` is used when it has no
      extension.  By default the name is derived from *mutinfo*.
    * ``zscore`` -- when true, columns are normalized to zero mean and
      unit standard deviation before ranking
    * ``numpairs`` -- maximum number of pairs to report
    * ``seqsep`` -- a pair is reported when ``row > column + seqsep``
    * ``usedist``, ``dist`` -- when ``usedist`` is true and a matching PDB
      was found, pairs are filtered by ``distance > dist`` instead

    NOTE(review): ``numpairs`` and ``seqsep`` (and ``dist`` with
    ``usedist``) have no defaults here and are compared numerically, so
    callers are presumably expected to always supply them -- confirm.
    ``np`` and ``calcAllDist`` are not defined in this function and are
    assumed to be provided at module level.

    Raises :exc:`ValueError` when the matrix is not square.
    """
    from prody import parseMSA, LOGGER, parsePDB, calcMSAOccupancy
    from prody.utilities import openFile
    from os.path import splitext

    delimiter = kwargs.get('delimiter')
    mi = np.loadtxt(str(mutinfo), delimiter=delimiter)

    ndim, shape = mi.ndim, mi.shape
    if ndim != 2 or shape[0] != shape[1]:
        raise ValueError('mutinfo must contain a square matrix')

    msa, label = kwargs.get('msa'), kwargs.get('label')

    pdb, pdbflag = kwargs.get('pdb'), False

    resnum = None

    if pdb is not None:
        try:
            pdb = parsePDB(pdb)
        except Exception:
            LOGGER.info('Could not parse PDB, ignoring PDB input')
        else:
            chains = list(pdb.iterChains())
            for chain in chains:
                sel = chain.select('protein and name CA')
                # A selection that matches nothing is treated as a size
                # mismatch instead of failing on ``None``.
                if sel is not None and sel.numAtoms() == shape[0]:
                    resnum = sel.getResnums()
                    coordset = sel.getCoordsets()
                    distance = calcAllDist(coordset)
                    pdbflag = True
                    label = pdb.getTitle()
                    LOGGER.info('Residue numbers will be based on pdb: '
                                '{0}'.format(pdb.getTitle()))
                    break
                else:
                    LOGGER.info('Number of residues in PDB does not match '
                                'mutinfo matrix, ignoring PDB input')

    if not pdbflag:
        if msa is not None:
            msa = parseMSA(msa)
            if msa.numResidues() != shape[0]:
                LOGGER.info('Input MSA and mutinfo do not have similar no '
                            'of residues, ignoring MSA')
            else:
                index = msa.getIndex(label)
                if index is None:
                    if label is not None:
                        LOGGER.info('Could not find given label in MSA, '
                                    'using complete sequence from MSA')
                    # fall back to the sequence with the highest occupancy
                    occ = calcMSAOccupancy(msa._msa, 'row')
                    index = np.where(occ == occ.max())[0][0]
                    label, seq, start, end = msa[index]
                else:
                    label, seq, start, end = msa[index]
                if (start and end is not None) and (start < end):
                    resnum = np.arange(start, end + 1)
                    if len(resnum) != shape[0]:
                        LOGGER.info('Label: {0}/{1}-{2} and mutinfo do '
                                    'not have similar no of residues, using '
                                    'serial indexing'.format(
                                        label, start, end))
                        label = 'Serial Index'
                        resnum = np.arange(1, shape[0] + 1)
                    else:
                        LOGGER.info('Residue numbers will be based on label: '
                                    '{0}'.format(label))
                else:
                    LOGGER.info('Could not identify residue indexes from MSA'
                                ' using serial indexing')
                    label = 'Serial Index'
                    resnum = np.arange(1, shape[0] + 1)
        else:
            LOGGER.info('MSA or PDB not given or does not match mutinfo, '
                        'using serial indexing')
            resnum = np.arange(1, shape[0] + 1)

    if resnum is None:
        # The MSA was ignored because its size did not match the matrix;
        # use serial numbering rather than failing on ``resnum[0]`` below.
        resnum = np.arange(1, shape[0] + 1)

    LOGGER.info('Residue numbers start and end with {0}-{1}'.format(
        str(resnum[0]), str(resnum[-1])))

    outname = kwargs.get('outname')
    if outname is None:
        outname, ext = splitext(str(mutinfo))
        if ext.lower() == '.gz':
            # Split the already-stripped name (splitting *mutinfo* again is
            # a no-op) so '_rankorder' lands before the inner extension.
            outname, inner_ext = splitext(outname)
            ext = inner_ext + ext
    else:
        outname, ext = splitext(str(outname))
        # ``splitext`` returns '' (never None) when there is no extension
        if not ext:
            ext = '.txt'

    outname += '_rankorder' + ext
    zscore = kwargs.get('zscore')
    if zscore:
        LOGGER.info('zscore normalization applied such that each column '
                    'has 0 mean and standard deviation 1')
        header = 'Serial\tRow\tColumn\tZscore'
        mi = (mi - mi.mean(0)) / mi.std(0)
    else:
        header = 'Serial\tRow\tColumn\tMI'

    # Rank the strict lower triangle by decreasing value.
    mi_ind_start, mi_ind_end = np.tril_indices(shape[0], k=-1)
    mi_matrix = mi[mi_ind_start, mi_ind_end]
    sorted_index = mi_matrix.argsort(axis=None)[::-1]
    row = mi_ind_start[sorted_index]
    column = mi_ind_end[sorted_index]
    count = 1
    i = 0

    if label is None:
        label = 'Serial Index'

    numpairs = kwargs.get('numpairs')
    size = len(row)
    seqsep = kwargs.get('seqsep')
    usedist = kwargs.get('usedist')

    f = openFile(outname, 'wb')
    try:
        if not usedist or not pdbflag:
            if usedist:
                LOGGER.info('use-struct-sep set to true, but PDB not given or '
                            'incorrect residue number. Using sequence separation')
            elif pdbflag:
                LOGGER.info('use-dist not set, using sequence separation'
                            ' to report coevolving pairs')
            f.write(('Label: ' + label + '\t' + 'Residue Numbers: ' +
                     str(resnum[0]) + '-' + str(resnum[-1]) +
                     '\tSequence Separation:' + str(seqsep) + '\n'))
            if pdbflag:
                f.write((header + '\tDistance\n'))
                while count <= numpairs and i < size:
                    if row[i] > (column[i] + seqsep):
                        f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'.format(
                            count, resnum[row[i]], resnum[column[i]],
                            mi[row[i], column[i]],
                            distance[row[i], column[i]]))
                        count += 1
                    i += 1
            else:
                f.write((header + '\n'))
                while count <= numpairs and i < size:
                    if row[i] > (column[i] + seqsep):
                        f.write('{0}\t{1}\t{2}\t{3:.3f}\n'.format(
                            count, resnum[row[i]], resnum[column[i]],
                            mi[row[i], column[i]]))
                        count += 1
                    i += 1
        else:
            structsep = kwargs.get('dist')
            # tab added before 'Distance Cutoff:' so the field is separated
            # from the residue range like the other header fields
            f.write(('Label: ' + label + '\t' + 'Residue Numbers: ' +
                     str(resnum[0]) + '-' + str(resnum[-1]) +
                     '\tDistance Cutoff:' + str(structsep) + '\n'))
            f.write((header + '\tDistance\n'))
            while count <= numpairs and i < size:
                if distance[row[i], column[i]] > structsep:
                    f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'.format(
                        count, resnum[row[i]], resnum[column[i]],
                        mi[row[i], column[i]], distance[row[i], column[i]]))
                    count += 1
                i += 1
    finally:
        # close the output even if writing fails part-way
        f.close()

예제 #15

0

파일 보기

import prody.sequence as sequence
import prody
import matplotlib.pyplot as plt

# Open the kinase alignment as an MSAFile.
alignment = prody.MSAFile("pkinase.fasta")

# Alignment positions of interest, picked by hand for now.
positions = [72, 83, 117, 119, 194, 251, 354, 355, 357, 429, 432]

# TODO: use alignSequenceToMSA instead to derive the positions automatically.
# TODO: set up a web service to get the correspondence between an MSA
# position and a particular PDB structure.

# Restrict the alignment to the selected positions before writing it out.
alignment.setSlice(positions)

prody.writeMSA("test.fasta", alignment)
# NOTE(review): the file written above is "test.fasta" but the file parsed
# here is "pocket_type1.fasta" -- confirm this is intended.
pa = prody.parseMSA("pocket_type1.fasta")
labs = pa.getLabels()
seqidmatrix = prody.buildSeqidMatrix(pa)
# NOTE(review): scamatrix is not used anywhere in the visible script.
scamatrix = prody.buildSCAMatrix(pa)
# Build a tree from the sequence identity matrix and draw it with Matplotlib.
tree = prody.calcTree(names=labs, distance_matrix=seqidmatrix)
plt.figure()
show = prody.showTree(tree, format='plt')

예제 #16

0

파일 보기

파일: test_msafile.py 프로젝트: fongchun/ProDy

from prody.tests import TestCase
import os
from os.path import join

from numpy import array, log, zeros, char
from numpy.testing import assert_array_equal, dec

from prody.tests.datafiles import *
from prody.tests import TEMPDIR
from prody import MSA, MSAFile, parseMSA, LOGGER, writeMSA
from prody.utilities import createStringIO

# Set the ProDy logger verbosity to None (presumably to silence log output
# while the tests run -- confirm against the LOGGER documentation).
LOGGER.verbosity = None

# The same alignment parsed from three file formats ...
FASTA = parseMSA(pathDatafile('msa_Cys_knot.fasta'))
SELEX = parseMSA(pathDatafile('msa_Cys_knot.slx'))
STOCK = parseMSA(pathDatafile('msa_Cys_knot.sth'))
# ... and read sequence by sequence through MSAFile.  SELEX_LIST must read
# the '.slx' file; reading '.sth' here would make it a copy of STOCK_LIST
# and leave the SELEX reader untested.
FASTA_LIST = list(MSAFile(pathDatafile('msa_Cys_knot.fasta')))
SELEX_LIST = list(MSAFile(pathDatafile('msa_Cys_knot.slx')))
STOCK_LIST = list(MSAFile(pathDatafile('msa_Cys_knot.sth')))

class TestMSAFile(TestCase):

    def testMSAFile(self):
        # The records read from the SELEX and Stockholm files must match the
        # records read from the FASTA file, in that order of comparison.
        for other_format in (SELEX_LIST, STOCK_LIST):
            self.assertListEqual(FASTA_LIST, other_format)

    def testWriteFasta(self):

예제 #17

0

파일 보기

파일: test_msafile.py 프로젝트: fongchun/ProDy

 def testSelex(self):
     # Round-trip check: write the SELEX alignment to a temporary file, parse
     # it back and compare the records with the original alignment.
     #
     # The write happens outside the try block: if it fails, there is no
     # file name to clean up.
     filename = writeMSA(join(TEMPDIR, 'test.slx'), SELEX)
     try:
         selex = parseMSA(pathDatafile(filename))
         self.assertListEqual(list(SELEX), list(selex))
     finally:
         # Remove the temporary file even when parsing or the assertion
         # fails, so a failing run does not leave it behind in TEMPDIR.
         if os.path.isfile(filename):
             os.remove(filename)

예제 #18

0

파일 보기

파일: test_msafile.py 프로젝트: Python3pkg/ProDy

 def testSelex(self):
     # Round-trip check: write the SELEX alignment to a temporary file, parse
     # it back and compare the records with the original alignment.
     #
     # The write happens outside the try block: if it fails, there is no
     # file name to clean up.
     filename = writeMSA(join(TEMPDIR, 'test.slx'), SELEX)
     try:
         selex = parseMSA(pathDatafile(filename))
         self.assertListEqual(list(SELEX), list(selex))
     finally:
         # Remove the temporary file even when parsing or the assertion
         # fails, so a failing run does not leave it behind in TEMPDIR.
         if os.path.isfile(filename):
             os.remove(filename)

예제 #19

0

파일 보기

파일: Uniprot.py 프로젝트: yaz62/rhapsody

 def calcEvolProperties(self,
                        resid='all',
                        refresh=False,
                        folder=None,
                        max_cols=None,
                        max_seqs=25000,
                        **kwargs):
     ''' Computes Evol properties, i.e. Shannon entropy, Mutual
     Information and Direct Information, from Pfam Multiple
     Sequence Alignments, for a given residue.

     For every selected Pfam family the entry ``self.Pfam[PF]`` is updated
     in place with the keys ``'mapping'``, ``'ref_MSA'``, ``'entropy'``,
     ``'MutInfo'`` and ``'DirInfo'``. Direct Information is currently not
     computed (the call is commented out), so ``'DirInfo'`` stays NaN.

     :arg resid: ``'all'`` to process every Pfam family in ``self.Pfam``,
         or a residue number; in the latter case only the families having
         a location segment whose start/end range contains it are used
     :arg refresh: passed on to ``self._searchPfam``; when ``False``,
         families whose ``'mapping'`` entry is already set are skipped
     :type refresh: bool
     :arg folder: local folder; defaults to the ``rhapsody_local_folder``
         setting. NOTE(review): only referenced by commented-out code at
         the moment
     :arg max_cols: NOTE(review): only referenced by commented-out code
         at the moment
     :arg max_seqs: the row-occupancy threshold is raised in steps of
         0.02, starting from 0.6, until the MSA has at most this many
         sequences or the threshold reaches 1
     :arg kwargs: forwarded to both ``parseMSA`` and the final
         ``refineMSA`` call

     :returns: dictionary mapping each selected Pfam family to its entry
         in ``self.Pfam``
     :raises RuntimeError: if *resid* is not found in any Pfam domain

     Errors raised while processing a single family are logged as
     warnings and do not interrupt the loop; a failure in the fetch /
     slice / mapping stage is also stored, as a string, under
     ``'mapping'``.
     '''
     assert type(refresh) is bool
     # recover Pfam mapping (if not found already)
     self._searchPfam(refresh=refresh)
     if resid == 'all':
         PF_list = self.Pfam.keys()
     else:
         # get list of Pfam domains containing resid
         PF_list = [
             k for k in self.Pfam if any([
                 resid >= int(segment['start'])
                 and resid <= int(segment['end'])
                 for segment in self.Pfam[k]['locations']
             ])
         ]
         if len(PF_list) == 0:
             raise RuntimeError(
                 'No Pfam domain for resid {}.'.format(resid))
         if len(PF_list) > 1:
             LOGGER.warn('Residue {} is found in multiple ({}) Pfam '
                         'domains.'.format(resid, len(PF_list)))
     if folder is None:
         folder = SETTINGS.get('rhapsody_local_folder', './')
     # iterate over Pfam families
     for PF in PF_list:
         d = self.Pfam[PF]
         # skip if properties are pre-computed
         if not refresh and d.get('mapping') is not None:
             continue
         # reset all outputs, so that a failure leaves well-defined values
         d['mapping'] = None
         d['ref_MSA'] = None
         d['entropy'] = np.nan
         d['MutInfo'] = np.nan
         d['DirInfo'] = np.nan
         try:
             LOGGER.info('Processing {}...'.format(PF))
             # fetch & parse MSA
             #               fname = PF + '_full.sth'
             #               fullname = os.path.join(folder, fname)
             #               if not os.path.isfile(fullname):
             #                   f = fetchPfamMSA(PF)
             #                   os.rename(f, fullname)
             #               msa = parseMSA(fullname, **kwargs)
             # fetch & parse MSA without saving downloaded MSA
             f = fetchPfamMSA(PF)
             try:
                 msa = parseMSA(f, **kwargs)
             finally:
                 # the download is not meant to be kept: remove it even
                 # when parsing fails, so that no file is left behind
                 os.remove(f)
             # slice MSA to match all segments of the Uniprot sequence
             sliced_msa, indexes = self._sliceMSA(msa)
             #               if max_cols is not None and sliced_msa.numResidues() > max_cols:
             #                   raise Exception('Unable to compute DI: MSA has ' +\
             #                                   'too many columns (max: {}).'.format(max_cols))
             # get mapping between Uniprot sequence and Pfam domain
             d['mapping'] = self._mapUniprot2Pfam(PF, sliced_msa, indexes)
         except Exception as e:
             # best effort: record the error message and move on to the
             # next family
             LOGGER.warn('{}: {}'.format(PF, e))
             d['mapping'] = str(e)
             continue
         try:
             # refine MSA ('seqid' param. is set as in PolyPhen-2)
             rowocc = 0.6
             while True:
                 sliced_msa = refineMSA(sliced_msa, rowocc=rowocc)
                 rowocc += 0.02
                 if sliced_msa.numSequences() <= max_seqs or rowocc >= 1:
                     break
             ref_msa = refineMSA(sliced_msa, seqid=0.94, **kwargs)
             d['ref_MSA'] = ref_msa
             # compute evolutionary properties
             d['entropy'] = calcShannonEntropy(ref_msa)
             d['MutInfo'] = buildMutinfoMatrix(ref_msa)
             # d['DirInfo'] = buildDirectInfoMatrix(ref_msa)
         except Exception as e:
             # best effort: values assigned before the failure are kept
             LOGGER.warn('{}: {}'.format(PF, e))
     return {k: self.Pfam[k] for k in PF_list}

예제 #20

0

파일 보기

파일: test_analysis.py 프로젝트: npabon/ProDy

__copyright__ = "Copyright (C) 2010-2012 Ahmet Bakan"

from prody.tests import TestCase

from numpy import array, log, zeros, char, ones, fromfile
from numpy.testing import assert_array_equal, assert_array_almost_equal

from prody.tests.test_datafiles import *

from prody import LOGGER, calcShannonEntropy, buildMutinfoMatrix, parseMSA
from prody import calcMSAOccupancy, buildSeqidMatrix, uniqueSequences
from prody import buildOMESMatrix, buildSCAMatrix

# Set the ProDy logger verbosity to None for this module.
LOGGER.verbosity = None

# Reference alignment, parsed once at import time.
FASTA = parseMSA(pathDatafile("msa_Cys_knot.fasta"))
# Arrays derived element-wise from the alignment's private `_msa` array:
# a boolean mask of alphabetic characters and an upper-cased copy.
FASTA_ALPHA = char.isalpha(FASTA._msa)
FASTA_UPPER = char.upper(FASTA._msa)

# The mask is two-dimensional.
# NOTE(review): presumably (number of sequences, alignment length) -- confirm.
FASTA_NUMBER, FASTA_LENGTH = FASTA_ALPHA.shape
# Square FASTA_NUMBER x FASTA_NUMBER matrix, initialised to zeros.
FASTA_EYE = zeros((FASTA_NUMBER, FASTA_NUMBER))
for i in range(FASTA_NUMBER):
    FASTA_EYE[i, i] = 1
    for j in range(i + 1, FASTA_NUMBER):
        score = 0.0
        ncols = 0
        for k in range(FASTA_LENGTH):
            if FASTA_ALPHA[i, k] or FASTA_ALPHA[j, k]:
                if FASTA_UPPER[i, k] == FASTA_UPPER[j, k]:
                    score += 1
                ncols += 1