示例#1
0
文件: test_msa.py 项目: njekin/ProDy
    def testSeqidLabel(self):
        """Check `refineMSA` when both *label* and *seqid* are given.

        The expected rows are the ones flagged by `uniqueSequences` on the
        label-refined alignment, with the flag at the labeled sequence's
        index forced to ``True``.
        """
        target = "FSHB_BOVIN"
        cutoff = 0.50

        by_label = refineMSA(FASTA, label=target)
        keep = uniqueSequences(by_label, cutoff)
        # The labeled sequence is always expected in the output.
        keep[FASTA.getIndex(target)] = True

        result = refineMSA(FASTA, label=target, seqid=cutoff)
        actual = result._getArray()
        wanted = by_label._getArray()[keep]
        assert_array_equal(actual, wanted)
示例#2
0
文件: test_msa.py 项目: npabon/ProDy
    def testSeqid(self):
        """Check `refineMSA` with only *seqid* against `uniqueSequences`.

        The refined array must equal the rows of ``FASTA`` selected by the
        `uniqueSequences` result computed at the same cutoff.
        """
        cutoff = 0.50
        keep = uniqueSequences(FASTA, cutoff)
        actual = refineMSA(FASTA, seqid=cutoff)._getArray()
        wanted = FASTA._getArray()[keep]
        assert_array_equal(actual, wanted)
示例#3
0
文件: test_msa.py 项目: njekin/ProDy
    def testSeqid(self):
        """Verify that `refineMSA(seqid=...)` agrees with `uniqueSequences`.

        Rows of ``FASTA`` selected by the `uniqueSequences` result are the
        expected content of the refined alignment.
        """
        threshold = 0.50
        selected = uniqueSequences(FASTA, threshold)
        result = refineMSA(FASTA, seqid=threshold)
        assert_array_equal(result._getArray(), FASTA._getArray()[selected])
示例#4
0
文件: test_msa.py 项目: npabon/ProDy
    def testLabel(self):
        """Check `refineMSA` with only *label*.

        The expected array keeps the columns at which the labeled row of
        ``FASTA_ALPHA`` is nonzero.
        """
        name = 'FSHB_BOVIN'
        row = FASTA.getIndex(name)
        actual = refineMSA(FASTA, label=name)._getArray()

        columns = FASTA_ALPHA[row].nonzero()[0]
        wanted = FASTA._getArray().take(columns, 1)
        assert_array_equal(actual, wanted)
示例#5
0
文件: test_msa.py 项目: njekin/ProDy
    def testLabel(self):
        """Verify label-based refinement of ``FASTA``.

        Columns are expected to survive exactly where the row of
        ``FASTA_ALPHA`` belonging to the labeled sequence is nonzero.
        """
        target = "FSHB_BOVIN"
        target_row = FASTA.getIndex(target)
        result = refineMSA(FASTA, label=target)._getArray()

        occupied = FASTA_ALPHA[target_row].nonzero()[0]
        assert_array_equal(result, FASTA._getArray().take(occupied, 1))
示例#6
0
文件: test_msa.py 项目: njekin/ProDy
    def testRowCol(self):
        """Check `refineMSA` with *rowocc* and *colocc* combined.

        The expectation filters rows first (row sum of ``FASTA_ALPHA``
        divided by 112 must reach *rowocc*) and then filters columns by the
        fraction of alphabetic characters among the surviving rows.
        """
        rowocc, colocc = 0.9, 1.0
        actual = refineMSA(FASTA, rowocc=rowocc, colocc=colocc)._getArray()

        row_mask = FASTA_ALPHA.sum(1) / 112.0 >= rowocc
        kept_rows = FASTA._getArray()[row_mask]

        alpha_counts = char.isalpha(kept_rows).sum(0, dtype=float)
        col_mask = alpha_counts / kept_rows.shape[0] >= colocc

        wanted = kept_rows.take(col_mask.nonzero()[0], 1)
        assert_array_equal(actual, wanted)
示例#7
0
文件: test_msa.py 项目: npabon/ProDy
    def testRowCol(self):
        """Verify combined row- and column-occupancy refinement.

        Rows are selected where the row sum of ``FASTA_ALPHA`` over 112
        reaches *rowocc*; columns are then selected where the share of
        alphabetic characters in the remaining rows reaches *colocc*.
        """
        min_row = 0.9
        min_col = 1.0
        result = refineMSA(FASTA, rowocc=min_row, colocc=min_col)._getArray()

        surviving = FASTA._getArray()[FASTA_ALPHA.sum(1) / 112. >= min_row]
        n_rows = surviving.shape[0]
        col_fraction = char.isalpha(surviving).sum(0, dtype=float) / n_rows
        col_index = (col_fraction >= min_col).nonzero()[0]

        assert_array_equal(result, surviving.take(col_index, 1))
示例#8
0
def evol_refine(msa, **kwargs):
    """Refine the MSA stored in file *msa* and write the result to a file.

    The alignment is parsed with `parseMSA`, refined with `refineMSA` and
    written with `writeMSA`.  All keyword arguments are forwarded unchanged
    to both `refineMSA` and `writeMSA`.

    :arg msa: path of the MSA file to refine
    :arg outname: optional keyword; output filename.  When it is missing or
        **None**, the name is derived from *msa* by inserting ``_refined``
        in front of the extension, e.g. ``name.fasta`` becomes
        ``name_refined.fasta`` and ``name.fasta.gz`` becomes
        ``name_refined.fasta.gz``.
    """

    from prody import parseMSA, refineMSA, writeMSA, LOGGER
    from os.path import splitext

    outname = kwargs.get('outname')
    if outname is None:
        outname, ext = splitext(msa)
        if ext.lower() == '.gz':
            # Compressed input: also peel the inner extension off the
            # already-stripped name so the suffix lands before it.
            outname, inner_ext = splitext(outname)
            ext = inner_ext + ext
        outname += '_refined' + ext

    writeMSA(outname, refineMSA(parseMSA(msa), **kwargs), **kwargs)
    LOGGER.info('Refined MSA is written in file: ' + outname)
示例#9
0
def evol_refine(msa, **kwargs):
    """Refine the MSA stored in file *msa* and write the result to a file.

    The alignment is parsed with `parseMSA`, refined with `refineMSA` and
    written with `writeMSA`.  All keyword arguments are forwarded unchanged
    to both `refineMSA` and `writeMSA`.

    :arg msa: path of the MSA file to refine
    :arg outname: optional keyword; output filename.  When it is missing or
        **None**, the name is derived from *msa* by inserting ``_refined``
        in front of the extension, e.g. ``name.fasta`` becomes
        ``name_refined.fasta`` and ``name.fasta.gz`` becomes
        ``name_refined.fasta.gz``.
    """

    from prody import parseMSA, refineMSA, writeMSA, LOGGER
    from os.path import splitext

    outname = kwargs.get('outname')
    if outname is None:
        outname, ext = splitext(msa)
        if ext.lower() == '.gz':
            # Compressed input: also peel the inner extension off the
            # already-stripped name so the suffix lands before it.
            outname, inner_ext = splitext(outname)
            ext = inner_ext + ext
        outname += '_refined' + ext

    writeMSA(outname, refineMSA(parseMSA(msa), **kwargs), **kwargs)
    LOGGER.info('Refined MSA is written in file: ' + outname)
示例#10
0
def evol_refine(msa, **kwargs):
    """Refine the MSA stored in file *msa* and write the result to a file.

    The alignment is parsed with `parseMSA`, refined with `refineMSA` and
    written with `writeMSA`.  All keyword arguments are forwarded unchanged
    to both `refineMSA` and `writeMSA`.

    :arg msa: path of the MSA file to refine
    :arg outname: optional keyword; output filename.  When it is missing or
        **None**, the name is derived from *msa* by inserting ``_refined``
        in front of the extension, e.g. ``name.fasta`` becomes
        ``name_refined.fasta`` and ``name.fasta.gz`` becomes
        ``name_refined.fasta.gz``.
    """

    from prody import parseMSA, refineMSA, writeMSA, LOGGER
    from os.path import splitext

    outname = kwargs.get("outname")
    if outname is None:
        outname, ext = splitext(msa)
        if ext.lower() == ".gz":
            # Compressed input: also peel the inner extension off the
            # already-stripped name so the suffix lands before it.
            outname, inner_ext = splitext(outname)
            ext = inner_ext + ext
        outname += "_refined" + ext

    writeMSA(outname, refineMSA(parseMSA(msa), **kwargs), **kwargs)
    LOGGER.info("Refined MSA is written in file: " + outname)
示例#11
0
文件: test_msa.py 项目: njekin/ProDy
    def testAll(self):
        """Check `refineMSA` with label, seqid, rowocc and colocc together.

        The expected array is built in four steps: keep the columns where
        the labeled row of ``FASTA_ALPHA`` is nonzero, keep the rows flagged
        by `uniqueSequences`, keep the rows whose row occupancy reaches
        *rowocc*, and finally keep the columns whose occupancy reaches
        *colocc*.
        """
        target = "FSHB_BOVIN"
        min_row, min_col, cutoff = 0.9, 0.9, 0.98
        result = refineMSA(FASTA, label=target, seqid=cutoff, rowocc=min_row, colocc=min_col)

        # Step 1: columns occupied in the labeled sequence.
        label_cols = FASTA_ALPHA[FASTA.getIndex(target)].nonzero()[0]
        wanted = FASTA._getArray().take(label_cols, 1)

        # Step 2: sequence identity filter.
        wanted = wanted[uniqueSequences(wanted, cutoff)]

        # Step 3: row occupancy filter.
        wanted = wanted[calcMSAOccupancy(wanted, "row") >= min_row]

        # Step 4: column occupancy filter.
        dense_cols = (calcMSAOccupancy(wanted) >= min_col).nonzero()[0]
        wanted = wanted.take(dense_cols, 1)

        assert_array_equal(result._getArray(), wanted)
示例#12
0
    def testAll(self):
        """Verify `refineMSA` when every refinement option is supplied.

        The reference result applies, in order: the label column selection
        (nonzero entries of the labeled ``FASTA_ALPHA`` row), the
        `uniqueSequences` row filter, the row-occupancy filter and the
        column-occupancy filter.
        """
        options = dict(label='FSHB_BOVIN', seqid=0.98, rowocc=0.9, colocc=0.9)
        refined = refineMSA(FASTA, **options)

        labeled_row = FASTA.getIndex(options['label'])
        columns = FASTA_ALPHA[labeled_row].nonzero()[0]
        reference = FASTA._getArray().take(columns, 1)

        unique_rows = uniqueSequences(reference, options['seqid'])
        reference = reference[unique_rows]

        dense_rows = calcMSAOccupancy(reference, 'row') >= options['rowocc']
        reference = reference[dense_rows]

        dense_cols = calcMSAOccupancy(reference) >= options['colocc']
        reference = reference.take(dense_cols.nonzero()[0], 1)

        assert_array_equal(refined._getArray(), reference)
示例#13
0
文件: Uniprot.py 项目: yaz62/rhapsody
 def calcEvolProperties(self,
                        resid='all',
                        refresh=False,
                        folder=None,
                        max_cols=None,
                        max_seqs=25000,
                        **kwargs):
     """Compute Shannon entropy and Mutual Information from Pfam MSAs.

     For every selected Pfam family the MSA is fetched, parsed, sliced
     with ``self._sliceMSA`` and mapped with ``self._mapUniprot2Pfam``;
     it is then refined and the results are stored in the family's entry
     of ``self.Pfam`` under the keys ``'mapping'``, ``'ref_MSA'``,
     ``'entropy'``, ``'MutInfo'`` and ``'DirInfo'``.

     Failures are logged as warnings rather than raised.  When fetching,
     parsing, slicing or mapping fails, ``'mapping'`` holds the error
     message and the other keys keep their placeholders (``None`` /
     NaN).  Direct Information is not computed, so ``'DirInfo'`` is
     always NaN.

     :arg resid: ``'all'`` to process every family in ``self.Pfam``, or
         a residue number to process only the families having a location
         segment that contains it
     :arg refresh: must be a bool; it is passed to ``self._searchPfam``,
         and when **True** families that already have a ``'mapping'``
         value are recomputed instead of skipped
     :arg folder: defaults to the ``'rhapsody_local_folder'`` setting;
         not used by the active code because downloaded MSAs are deleted
         right after parsing
     :arg max_cols: accepted but not enforced by the active code
     :arg max_seqs: the row-occupancy threshold is raised from 0.6 in
         steps of 0.02 until the MSA has at most this many sequences or
         the threshold reaches 1
     :arg kwargs: forwarded to both `parseMSA` and the final `refineMSA`
         call
     :returns: dict mapping each selected Pfam key to its entry in
         ``self.Pfam``
     :raises RuntimeError: if *resid* is not ``'all'`` and no Pfam
         family contains it
     """
     assert type(refresh) is bool
     # recover Pfam mapping (if not found already)
     self._searchPfam(refresh=refresh)
     if resid == 'all':
         PF_list = self.Pfam.keys()
     else:
         # get list of Pfam domains containing resid
         PF_list = [
             k for k in self.Pfam if any(
                 int(segment['start']) <= resid <= int(segment['end'])
                 for segment in self.Pfam[k]['locations'])
         ]
         if len(PF_list) == 0:
             raise RuntimeError(
                 'No Pfam domain for resid {}.'.format(resid))
         if len(PF_list) > 1:
             LOGGER.warn('Residue {} is found in multiple '.format(resid) + \
                         '({}) Pfam domains.'.format(len(PF_list)))
     if folder is None:
         folder = SETTINGS.get('rhapsody_local_folder', './')
     # iterate over Pfam families
     for PF in PF_list:
         d = self.Pfam[PF]
         # skip if properties are pre-computed
         if not refresh and d.get('mapping') is not None:
             continue
         d['mapping'] = None
         d['ref_MSA'] = None
         d['entropy'] = np.nan
         d['MutInfo'] = np.nan
         d['DirInfo'] = np.nan
         try:
             LOGGER.info('Processing {}...'.format(PF))
             # fetch & parse MSA without saving downloaded MSA
             f = fetchPfamMSA(PF)
             try:
                 msa = parseMSA(f, **kwargs)
             finally:
                 # delete the download even when parsing fails
                 os.remove(f)
             # slice MSA to match all segments of the Uniprot sequence
             sliced_msa, indexes = self._sliceMSA(msa)
             # get mapping between Uniprot sequence and Pfam domain
             d['mapping'] = self._mapUniprot2Pfam(PF, sliced_msa, indexes)
         except Exception as e:
             LOGGER.warn('{}: {}'.format(PF, e))
             d['mapping'] = str(e)
             continue
         try:
             # refine MSA ('seqid' param. is set as in PolyPhen-2)
             rowocc = 0.6
             while True:
                 sliced_msa = refineMSA(sliced_msa, rowocc=rowocc)
                 rowocc += 0.02
                 if sliced_msa.numSequences() <= max_seqs or rowocc >= 1:
                     break
             ref_msa = refineMSA(sliced_msa, seqid=0.94, **kwargs)
             d['ref_MSA'] = ref_msa
             # compute evolutionary properties
             d['entropy'] = calcShannonEntropy(ref_msa)
             d['MutInfo'] = buildMutinfoMatrix(ref_msa)
         except Exception as e:
             # best effort: keep whatever was stored before the failure
             LOGGER.warn('{}: {}'.format(PF, e))
     return {k: self.Pfam[k] for k in PF_list}
示例#14
0
文件: test_msa.py 项目: njekin/ProDy
    def testRowocc(self):
        """Check `refineMSA` with only *rowocc*.

        Expected rows are those whose row sum of ``FASTA_ALPHA`` divided by
        112 is at least 0.9.
        """
        actual = refineMSA(FASTA, rowocc=0.9)._getArray()

        row_mask = FASTA_ALPHA.sum(1) / 112.0 >= 0.9
        wanted = FASTA._getArray()[row_mask, :]
        assert_array_equal(actual, wanted)
示例#15
0
文件: test_msa.py 项目: njekin/ProDy
    def testColocc(self):
        """Check `refineMSA` with only *colocc*.

        Expected columns are those whose column sum of ``FASTA_ALPHA``
        divided by ``NUMSEQ`` is at least 0.9.
        """
        actual = refineMSA(FASTA, colocc=0.9)._getArray()

        col_mask = FASTA_ALPHA.sum(0) / NUMSEQ >= 0.9
        wanted = FASTA._getArray()[:, col_mask]
        assert_array_equal(actual, wanted)
示例#16
0
文件: test_msa.py 项目: npabon/ProDy
    def testColocc(self):
        """Verify column-occupancy refinement of ``FASTA``.

        A column is expected to be kept when its sum in ``FASTA_ALPHA``
        divided by ``NUMSEQ`` reaches 0.9.
        """
        result = refineMSA(FASTA, colocc=0.9)
        refined_array = result._getArray()

        keep_columns = FASTA_ALPHA.sum(0) / NUMSEQ >= 0.9
        assert_array_equal(refined_array, FASTA._getArray()[:, keep_columns])
示例#17
0
文件: test_msa.py 项目: npabon/ProDy
    def testRowocc(self):
        """Verify row-occupancy refinement of ``FASTA``.

        A row is expected to be kept when its sum in ``FASTA_ALPHA`` divided
        by 112 reaches 0.9.
        """
        result = refineMSA(FASTA, rowocc=0.9)
        refined_array = result._getArray()

        keep_rows = FASTA_ALPHA.sum(1) / 112. >= 0.9
        assert_array_equal(refined_array, FASTA._getArray()[keep_rows, :])