def testNoAmbiguity(self):
    """Check the MI matrix of the alignment 'OX'/'XO' with ambiguity off.

    The expected off-diagonal value is ``log(2)`` and the diagonal is
    zero.  The expectation is asserted for the default (turbo) call and
    for the call with ``turbo=False``.
    """
    msa = array([list('OX'), list('XO')], dtype='|S1')
    expect = array([[0., log(2.)], [log(2.), 0.]])

    # The keyword must be spelled ``ambiguity``; a misspelled keyword is
    # presumably absorbed by ``**kwargs`` and would leave ambiguity
    # handling switched on (confirm against the buildMutinfoMatrix
    # signature).
    result = buildMutinfoMatrix(msa, ambiguity=False)
    assert_array_almost_equal(expect, result, err_msg='turbo failed')

    result = buildMutinfoMatrix(msa, ambiguity=False, turbo=False)
    assert_array_almost_equal(expect, result, err_msg='w/out turbo failed')
def testAmbiguity7(self):
    """Compare the MI matrix of the alignment "bx"/"xb" with a fixed value.

    The same expected matrix is asserted for the call made with
    ``debug=False`` (reported as "turbo failed") and for the call made
    with ``turbo=False``.
    """
    alignment = array([list("bx"), list("xb")], dtype="|S1")
    off_diagonal = 72 * 0.0125 * log(0.0125 / 0.0250 / 0.275) + 4 * 0.0250 * log(
        0.0250 / 0.275 / 0.275
    )
    expected = array([[0.0, off_diagonal], [off_diagonal, 0.0]])

    checks = (
        ({"debug": False}, "turbo failed"),
        ({"turbo": False}, "w/out turbo failed"),
    )
    for options, message in checks:
        computed = buildMutinfoMatrix(alignment, **options)
        assert_array_almost_equal(expected, computed, err_msg=message)
def testAmbiguity3(self):
    """Expect an all-zero 2x2 MI matrix for the single sequence "XX".

    Asserted once for the ``debug=False`` call and once for the
    ``turbo=False`` call.
    """
    alignment = array([list("XX")], dtype="|S1")
    expected = zeros((2, 2))

    checks = (
        ({"debug": False}, "turbo failed"),
        ({"turbo": False}, "w/out turbo failed"),
    )
    for options, message in checks:
        computed = buildMutinfoMatrix(alignment, **options)
        assert_array_almost_equal(expected, computed, err_msg=message)
def testNoAmbiguity(self):
    """Check the MI matrix of the alignment "OX"/"XO" with ambiguity off.

    Off-diagonal entries are expected to equal ``log(2)``; the diagonal
    is zero.  Both the default (turbo) call and the ``turbo=False`` call
    are checked.
    """
    msa = array([list("OX"), list("XO")], dtype="|S1")
    expect = array([[0.0, log(2.0)], [log(2.0), 0.0]])

    # Use the correctly spelled ``ambiguity`` keyword; a misspelling is
    # presumably swallowed by ``**kwargs`` and would not disable
    # ambiguity handling (confirm against the buildMutinfoMatrix
    # signature).
    result = buildMutinfoMatrix(msa, ambiguity=False)
    assert_array_almost_equal(expect, result, err_msg="turbo failed")

    result = buildMutinfoMatrix(msa, ambiguity=False, turbo=False)
    assert_array_almost_equal(expect, result, err_msg="w/out turbo failed")
def testAmbiguity3(self):
    """Expect a 2x2 matrix of zeros for the one-sequence alignment 'XX'.

    The check runs for the ``debug=False`` call and again for the
    ``turbo=False`` call.
    """
    single_row = array([list('XX')], dtype='|S1')
    all_zero = zeros((2, 2))

    for call_kwargs, failure_note in (
        (dict(debug=False), 'turbo failed'),
        (dict(turbo=False), 'w/out turbo failed'),
    ):
        outcome = buildMutinfoMatrix(single_row, **call_kwargs)
        assert_array_almost_equal(all_zero, outcome, err_msg=failure_note)
def testAmbiguity7(self):
    """Check the MI matrix of the alignment 'bx'/'xb' against a fixed value.

    The expected symmetric matrix is asserted for the ``debug=False``
    call and for the ``turbo=False`` call.
    """
    two_rows = array([list('bx'), list('xb')], dtype='|S1')
    mi_value = (72 * 0.0125 * log(0.0125 / 0.0250 / 0.275) +
                4 * 0.0250 * log(0.0250 / 0.275 / 0.275))
    wanted = array([[0., mi_value], [mi_value, 0.]])

    for call_kwargs, failure_note in (
        (dict(debug=False), 'turbo failed'),
        (dict(turbo=False), 'w/out turbo failed'),
    ):
        outcome = buildMutinfoMatrix(two_rows, **call_kwargs)
        assert_array_almost_equal(wanted, outcome, err_msg=failure_note)
def testAmbiguity4(self):
    """Check the MI matrix of the alignment "Bb"/"jJ"/"Zz".

    The expected off-diagonal value is ``log((1/12) / (1/6) / (1/6))``.
    It is asserted for the ``debug=False`` call and for the
    ``turbo=False`` call.
    """
    alignment = array([list(row) for row in ("Bb", "jJ", "Zz")], dtype="|S1")
    off_diagonal = log((1.0 / 12) / (1.0 / 6) / (1.0 / 6))
    expected = array([[0.0, off_diagonal], [off_diagonal, 0.0]])

    checks = (
        ({"debug": False}, "turbo failed"),
        ({"turbo": False}, "w/out turbo failed"),
    )
    for options, message in checks:
        computed = buildMutinfoMatrix(alignment, **options)
        assert_array_almost_equal(expected, computed, err_msg=message)
def testAmbiguity2(self):
    """Check the MI matrix of the alignment 'AB'/'BZ' against a fixed value.

    The expected symmetric matrix is asserted for the ``debug=False``
    call and for the ``turbo=False`` call.
    """
    two_rows = array([list('AB'), list('BZ')], dtype='|S1')
    mi_value = (2 * .25 * log(.25 / .5 / .25) +
                4 * .125 * log(.125 / .25 / .25))
    wanted = array([[0., mi_value], [mi_value, 0.]])

    for call_kwargs, failure_note in (
        (dict(debug=False), 'turbo failed'),
        (dict(turbo=False), 'w/out turbo failed'),
    ):
        outcome = buildMutinfoMatrix(two_rows, **call_kwargs)
        assert_array_almost_equal(wanted, outcome, err_msg=failure_note)
def testTwentyReversed(self):
    """Pair each of twenty letters with its mirror position in the string.

    Row ``i`` of the alignment holds ``seq[i]`` and ``seq[-i - 1]``; the
    expected off-diagonal MI is ``log(20)``.  Checked for the default
    call and for the ``turbo=False`` call.
    """
    letters = "ACDEFGHIKLMNPQRSTVWY"
    alignment = array(
        [[forward, backward] for forward, backward in zip(letters, letters[::-1])],
        dtype="|S1",
    )
    off_diagonal = log(20.0)
    expected = array([[0.0, off_diagonal], [off_diagonal, 0.0]])

    checks = (
        ({}, "turbo failed"),
        ({"turbo": False}, "w/out turbo failed"),
    )
    for options, message in checks:
        computed = buildMutinfoMatrix(alignment, **options)
        assert_array_almost_equal(expected, computed, err_msg=message)
def testTwentyReversed(self):
    """Check MI for twenty rows pairing a letter with its mirrored letter.

    Each row is ``[seq[i], seq[-i - 1]]``.  The off-diagonal entries are
    expected to be ``log(20)`` for both the default call and the
    ``turbo=False`` call.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    rows = []
    for head, tail in zip(alphabet, reversed(alphabet)):
        rows.append([head, tail])
    mirrored = array(rows, dtype='|S1')

    mi_value = log(20.)
    wanted = array([[0., mi_value], [mi_value, 0.]])

    for call_kwargs, failure_note in (
        (dict(), 'turbo failed'),
        (dict(turbo=False), 'w/out turbo failed'),
    ):
        outcome = buildMutinfoMatrix(mirrored, **call_kwargs)
        assert_array_almost_equal(wanted, outcome, err_msg=failure_note)
def testTwenty(self):
    """Check MI for twenty rows whose two columns hold the same letter.

    The off-diagonal entries are expected to be ``log(20)``.  Checked
    for the ``debug=False`` call and for the ``turbo=False`` call.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    rows = []
    for letter in alphabet:
        rows.append([letter, letter])
    doubled = array(rows, dtype='|S1')

    mi_value = log(20.)
    wanted = array([[0., mi_value], [mi_value, 0.]])

    for call_kwargs, failure_note in (
        (dict(debug=False), 'turbo failed'),
        (dict(turbo=False), 'w/out turbo failed'),
    ):
        outcome = buildMutinfoMatrix(doubled, **call_kwargs)
        assert_array_almost_equal(wanted, outcome, err_msg=failure_note)
def testTwenty(self):
    """Use twenty rows, each repeating one letter in both columns.

    The expected off-diagonal MI is ``log(20)``; it is asserted for the
    ``debug=False`` call and for the ``turbo=False`` call.
    """
    letters = "ACDEFGHIKLMNPQRSTVWY"
    alignment = array([[letter] * 2 for letter in letters], dtype="|S1")
    off_diagonal = log(20.0)
    expected = array([[0.0, off_diagonal], [off_diagonal, 0.0]])

    checks = (
        ({"debug": False}, "turbo failed"),
        ({"turbo": False}, "w/out turbo failed"),
    )
    for options, message in checks:
        computed = buildMutinfoMatrix(alignment, **options)
        assert_array_almost_equal(expected, computed, err_msg=message)
def testInf(self):
    """Check MI for a 500x10 alignment of '.' with one differing row.

    Row 95 carries 's' in column 8 and 'i' in column 9; every other cell
    is '.'.  Only entries (8, 9) and (9, 8) of the expected 10x10 matrix
    are non-zero.  Checked for the ``debug=False`` call and for the
    ``turbo=False`` call.
    """
    mostly_dots = zeros((500, 10), '|S1')
    mostly_dots.fill('.')
    mostly_dots[95, 8] = 's'
    mostly_dots[95, 9] = 'i'

    mi_value = 0.002 * log(500.) + 0.998 * log(1. / 0.998)
    wanted = zeros((10, 10))
    wanted[9, 8] = mi_value
    wanted[8, 9] = mi_value

    for call_kwargs, failure_note in (
        (dict(debug=False), 'turbo failed'),
        (dict(turbo=False), 'w/out turbo failed'),
    ):
        outcome = buildMutinfoMatrix(mostly_dots, **call_kwargs)
        assert_array_almost_equal(wanted, outcome, err_msg=failure_note)
def testAmbiguity4(self):
    """Check the MI matrix of the three-row alignment 'Bb'/'jJ'/'Zz'.

    The expected off-diagonal value is ``log((1/12) / (1/6) / (1/6))``,
    asserted for the ``debug=False`` call and for the ``turbo=False``
    call.
    """
    three_rows = array([list('Bb'), list('jJ'), list('Zz')], dtype='|S1')
    mi_value = log((1. / 12) / (1. / 6) / (1. / 6))
    wanted = array([[0., mi_value], [mi_value, 0.]])

    for call_kwargs, failure_note in (
        (dict(debug=False), 'turbo failed'),
        (dict(turbo=False), 'w/out turbo failed'),
    ):
        outcome = buildMutinfoMatrix(three_rows, **call_kwargs)
        assert_array_almost_equal(wanted, outcome, err_msg=failure_note)
def testAmbiguity6(self):
    """Expect a zero 2x2 MI matrix for each of 'bb', 'jj' and 'zz'.

    Each string is used on its own as a single-row alignment; a failure
    is reported with the offending string followed by ' failed'.
    """
    all_zero = zeros((2, 2))
    for pair in ('bb', 'jj', 'zz'):
        single_row = array([list(pair)], dtype='|S1')
        outcome = buildMutinfoMatrix(single_row, debug=False)
        assert_array_almost_equal(all_zero, outcome,
                                  err_msg=pair + ' failed')
def testNorm(self):
    """Check that ``norm=True`` yields 1 off the diagonal for mirrored pairs.

    Row ``i`` of the alignment holds ``seq[i]`` and ``seq[-i - 1]`` for a
    twenty-letter string; the expected matrix is ``[[0, 1], [1, 0]]``.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    mirrored = array([[head, tail]
                      for head, tail in zip(alphabet, alphabet[::-1])],
                     dtype='|S1')
    wanted = array([[0., 1.], [1., 0.]])

    outcome = buildMutinfoMatrix(mirrored, norm=True)
    assert_array_almost_equal(wanted, outcome, err_msg='norm failed')
def testSixSequences(self):
    """Check the 4x4 MI matrix of 'ACCA', 'ACDA', 'ACEC', 'ACGC'.

    Only the entries coupling the third and fourth columns are expected
    to be non-zero (``log(2)``).  Checked for the default call and for
    the ``turbo=False`` call.
    """
    sequences = ('ACCA', 'ACDA', 'ACEC', 'ACGC')
    alignment = array([list(row) for row in sequences], dtype='|S1')

    wanted = array([
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., log(2.)],
        [0., 0., log(2.), 0.],
    ])

    for call_kwargs, failure_note in (
        (dict(), 'turbo failed'),
        (dict(turbo=False), 'w/out turbo failed'),
    ):
        outcome = buildMutinfoMatrix(alignment, **call_kwargs)
        assert_array_almost_equal(wanted, outcome, err_msg=failure_note)
def testNorm2(self):
    """Check ``norm=True`` when the second column alternates 'U' and 'O'.

    The first column walks through a twenty-letter string; the second
    column is 'U' on even row indices and 'O' on odd ones.  The expected
    off-diagonal value is the ratio written out below.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    rows = []
    for index, letter in enumerate(alphabet):
        partner = 'O' if index % 2 else 'U'
        rows.append([letter, partner])
    alignment = array(rows, dtype='|S1')

    numerator = log(1. / 20. / (1. / 20. * 1. / 2.))
    denominator = -log(1. / 20.)
    ratio = numerator / denominator
    wanted = array([[0., ratio], [ratio, 0.]])

    outcome = buildMutinfoMatrix(alignment, norm=True)
    assert_array_almost_equal(wanted, outcome, err_msg='norm failed')
def testAmbiguity5(self):
    """Expect a zero 2x2 MI matrix for twelve single-row alignments.

    Each two-letter string below is used on its own as a one-sequence
    alignment; a failure is reported with the string followed by
    ' failed'.
    """
    all_zero = array([[0., 0.], [0., 0.]])
    pairs = (
        'bx', 'Xb', 'jX', 'Xj', 'xz', 'ZX',
        'bj', 'jb', 'bz', 'zb', 'jz', 'zj',
    )
    for pair in pairs:
        single_row = array([list(pair)], dtype='|S1')
        outcome = buildMutinfoMatrix(single_row, debug=False)
        assert_array_almost_equal(all_zero, outcome,
                                  err_msg=pair + ' failed')
def calcEvolProperties(self, resid='all', refresh=False, folder=None,
                       max_cols=None, max_seqs=25000, **kwargs):
    '''
    Computes Evol properties, i.e. Shannon entropy and Mutual
    Information, from Pfam Multiple Sequence Alignments, for a given
    residue. Direct Information is not computed: the 'DirInfo' entry is
    left as NaN.

    Arguments:
      resid    -- 'all' to process every family in self.Pfam, otherwise a
                  residue number; only families with a location segment
                  whose start/end bracket the residue are processed.
      refresh  -- must be a bool. Forwarded to self._searchPfam; when
                  False, families whose 'mapping' entry is already set
                  are skipped.
      folder   -- accepted for backward compatibility; not used here.
      max_cols -- accepted for backward compatibility; not used here.
      max_seqs -- the row-occupancy refinement is repeated with an
                  increasing threshold until the MSA has at most this
                  many sequences (or the threshold reaches 1).
      kwargs   -- forwarded to parseMSA and to the final refineMSA call.

    Returns a dict mapping each selected Pfam family to its entry in
    self.Pfam. Each processed entry gets the keys 'mapping', 'ref_MSA',
    'entropy', 'MutInfo' and 'DirInfo'; if fetching, parsing, slicing or
    mapping fails, 'mapping' holds the error message as a string.

    Raises RuntimeError when no Pfam family is selected.
    '''
    assert type(refresh) is bool
    # recover Pfam mapping (if not found already)
    self._searchPfam(refresh=refresh)
    if resid == 'all':
        PF_list = self.Pfam.keys()
    else:
        # get list of Pfam domains containing resid
        PF_list = [
            k for k in self.Pfam
            if any(int(segment['start']) <= resid <= int(segment['end'])
                   for segment in self.Pfam[k]['locations'])
        ]
    if len(PF_list) == 0:
        raise RuntimeError(
            'No Pfam domain for resid {}.'.format(resid))
    if len(PF_list) > 1:
        LOGGER.warn('Residue {} is found in multiple '.format(resid) +
                    '({}) Pfam domains.'.format(len(PF_list)))
    # iterate over Pfam families
    for PF in PF_list:
        d = self.Pfam[PF]
        # skip if properties are pre-computed
        if not refresh and d.get('mapping') is not None:
            continue
        d['mapping'] = None
        d['ref_MSA'] = None
        d['entropy'] = np.nan
        d['MutInfo'] = np.nan
        d['DirInfo'] = np.nan
        try:
            LOGGER.info('Processing {}...'.format(PF))
            # fetch & parse MSA without saving downloaded MSA
            f = fetchPfamMSA(PF)
            try:
                msa = parseMSA(f, **kwargs)
            finally:
                # remove the downloaded file even when parsing fails, so
                # that failed runs do not leave files behind
                os.remove(f)
            # slice MSA to match all segments of the Uniprot sequence
            sliced_msa, indexes = self._sliceMSA(msa)
            # get mapping between Uniprot sequence and Pfam domain
            d['mapping'] = self._mapUniprot2Pfam(PF, sliced_msa, indexes)
        except Exception as e:
            LOGGER.warn('{}: {}'.format(PF, e))
            # a non-None 'mapping' marks the family as processed, so it
            # is not retried unless refresh=True
            d['mapping'] = str(e)
            continue
        try:
            # refine MSA ('seqid' param. is set as in PolyPhen-2)
            rowocc = 0.6
            while True:
                sliced_msa = refineMSA(sliced_msa, rowocc=rowocc)
                rowocc += 0.02
                if sliced_msa.numSequences() <= max_seqs or rowocc >= 1:
                    break
            ref_msa = refineMSA(sliced_msa, seqid=0.94, **kwargs)
            d['ref_MSA'] = ref_msa
            # compute evolutionary properties
            d['entropy'] = calcShannonEntropy(ref_msa)
            d['MutInfo'] = buildMutinfoMatrix(ref_msa)
        except Exception as e:
            # best effort: entries that were not computed keep their
            # None/NaN placeholders
            LOGGER.warn('{}: {}'.format(PF, e))
    return {k: self.Pfam[k] for k in PF_list}
def evol_coevol(msa, **kwargs):
    """Compute mutual information for an MSA file and write the results.

    *msa* is a filename handed to ``parseMSA``.  The plain mutual
    information matrix is always written; one additional matrix is
    written for every requested normalization and correction.

    Keyword arguments read here:
      prefix        -- output filename prefix; when missing, it is the MSA
                       filename without its extension (and without a
                       trailing '.gz') plus '_mutinfo'.
      numformat     -- number format for the text output, default '%12g'.
      heatmap       -- when true, also write a '.hm' heatmap file.
      normalization -- iterable of normalization names; 'joint' is
                       computed by ``buildMutinfoMatrix(norm=True)``, the
                       others by ``applyMutinfoNorm``.
      correction    -- iterable of correction names for
                       ``applyMutinfoCorr``.
      figcoevol     -- when true, save a figure for each matrix using
                       cmin, cmax, figwidth, figheight, xlabel, title,
                       figformat (default 'pdf') and figdpi (default 300).

    All keyword arguments are also forwarded to ``buildMutinfoMatrix``
    and ``calcShannonEntropy``.
    """
    from numpy import arange
    import prody
    from prody import parseMSA, buildMutinfoMatrix, showMutinfoMatrix
    from prody import applyMutinfoCorr, calcShannonEntropy
    from prody import writeArray, LOGGER, applyMutinfoNorm, writeHeatmap
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            prefix, ext = splitext(prefix)
        prefix += '_mutinfo'

    msa = parseMSA(msa)
    mutinfo = buildMutinfoMatrix(msa, **kwargs)
    numformat = kwargs.get('numformat', '%12g')
    heatmap = kwargs.get('heatmap', False)
    if heatmap:
        hmargs = {
            'xlabel': 'Residue',
            'ylabel': 'Residue',
            'xorigin': 1,
            'xstep': 1,
            'residue': arange(msa.numResidues()),
        }

    # (None, None) stands for the plain mutual information matrix.
    todo = [(None, None)]
    norm = kwargs.get('normalization', [])
    corr = kwargs.get('correction', [])
    if norm is not None:
        if 'joint' in norm:
            todo.append(('norm', 'joint'))
        for which in norm:
            # 'joint' was queued above; skip it so that it is not
            # computed and written a second time.
            if which == 'joint':
                continue
            todo.append(('norm', which))
    if corr is not None:
        for which in corr:
            todo.append(('corr', which))

    # Shannon entropy is computed lazily, once, for the first
    # non-joint normalization.
    entropy = None

    for what, which in todo:
        if what is None:
            matrix = mutinfo
            suffix = ''
            tuffix = ' Mutual Information'
        elif which == 'joint':
            LOGGER.info('Applying {0} normalization.'.format(repr(which)))
            matrix = buildMutinfoMatrix(msa, norm=True, **kwargs)
            suffix = '_norm_joint'
            tuffix = ' MI - Normalization: ' + which
        elif what == 'norm':
            LOGGER.info('Applying {0} normalization.'.format(repr(which)))
            if entropy is None:
                entropy = calcShannonEntropy(msa, **kwargs)
            matrix = applyMutinfoNorm(mutinfo, entropy, norm=which)
            suffix = '_norm_' + which
            tuffix = ' MI - Normalization: ' + which
        else:
            LOGGER.info('Applying {0} correction.'.format(repr(which)))
            matrix = applyMutinfoCorr(mutinfo, which)
            suffix = '_corr_' + which
            tuffix = ' MI - Correction: ' + which

        writeArray(prefix + suffix + '.txt', matrix, format=numformat)

        if heatmap:
            writeHeatmap(prefix + suffix + '.hm', matrix,
                         title=msa.getTitle() + tuffix, **hmargs)

        if kwargs.get('figcoevol'):
            try:
                import matplotlib.pyplot as plt
            except ImportError:
                LOGGER.warn('Matplotlib could not be imported, '
                            'figures are not saved.')
            else:
                cmin = kwargs.get('cmin', matrix.min())
                cmax = kwargs.get('cmax', matrix.max())
                prody.SETTINGS['auto_show'] = False
                width = kwargs.get('figwidth', 8)
                height = kwargs.get('figheight', 6)
                xlabel = kwargs.get('xlabel')
                title = kwargs.get('title')
                figure = plt.figure(figsize=(width, height))
                showMutinfoMatrix(matrix, msa=msa, clim=(cmin, cmax),
                                  xlabel=xlabel, title=title)
                figformat = kwargs.get('figformat', 'pdf')
                figure.savefig(prefix + suffix + '.' + figformat,
                               format=figformat,
                               dpi=kwargs.get('figdpi', 300))
                # One figure is created per matrix; close it once saved
                # so figures do not pile up across iterations.
                plt.close(figure)