Exemplo n.º 1
0
    def test_pseudo_count(self):
        # use example from
        # http://biologie.univ-mrs.fr/upload/p202/01.4.PSSM_theory.pdf
        alpha = unambiguous_dna
        dna_align = MultipleSeqAlignment([
            SeqRecord(Seq("AACCACGTTTAA", alpha), id="ID001"),
            SeqRecord(Seq("CACCACGTGGGT", alpha), id="ID002"),
            SeqRecord(Seq("CACCACGTTCGC", alpha), id="ID003"),
            SeqRecord(Seq("GCGCACGTGGGG", alpha), id="ID004"),
            SeqRecord(Seq("TCGCACGTTGTG", alpha), id="ID005"),
            SeqRecord(Seq("TGGCACGTGTTT", alpha), id="ID006"),
            SeqRecord(Seq("TGACACGTGGGA", alpha), id="ID007"),
            SeqRecord(Seq("TTACACGTGCGC", alpha), id="ID008")
        ])

        summary = SummaryInfo(dna_align)
        expected = FreqTable({
            "A": 0.325,
            "G": 0.175,
            "T": 0.325,
            "C": 0.175
        }, FREQ, unambiguous_dna)
        ic = summary.information_content(e_freq_table=expected,
                                         log_base=math.exp(1),
                                         pseudo_count=1)
        self.assertAlmostEqualList(summary.ic_vector, [
            0.110, 0.090, 0.360, 1.290, 0.800, 1.290, 1.290, 0.80, 0.610,
            0.390, 0.470, 0.040
        ],
                                   places=2)
        self.assertAlmostEqual(ic, 7.546, places=3)
    def test_nucleotides(self):
        filename = "GFF/multi.fna"
        format = "fasta"
        alignment = AlignIO.read(filename, format, alphabet=unambiguous_dna)
        summary = SummaryInfo(alignment)

        c = summary.dumb_consensus(ambiguous="N")
        self.assertEqual(str(c), 'NNNNNNNN')
        self.assertNotEqual(c.alphabet, unambiguous_dna)
        self.assertTrue(isinstance(c.alphabet, DNAAlphabet))

        c = summary.gap_consensus(ambiguous="N")
        self.assertEqual(str(c), 'NNNNNNNN')
        self.assertNotEqual(c.alphabet, unambiguous_dna)
        self.assertTrue(isinstance(c.alphabet, DNAAlphabet))

        expected = FreqTable({
            "A": 0.25,
            "G": 0.25,
            "T": 0.25,
            "C": 0.25
        }, FREQ, unambiguous_dna)

        m = summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                              axis_seq=c)
        self.assertEqual(
            str(m), """    A   C   G   T
N  2.0 0.0 1.0 0.0
N  1.0 1.0 1.0 0.0
N  1.0 0.0 2.0 0.0
N  0.0 1.0 1.0 1.0
N  1.0 2.0 0.0 0.0
N  0.0 2.0 1.0 0.0
N  1.0 2.0 0.0 0.0
N  0.0 2.0 1.0 0.0
""")

        # Have a generic alphabet, without a declared gap char, so must tell
        # provide the frequencies and chars to ignore explicitly.
        ic = summary.information_content(e_freq_table=expected,
                                         chars_to_ignore=['-'])
        self.assertAlmostEqual(ic, 7.32029999423075, places=6)