示例#1
0
    def test_levenshtein(self):
        """Check Distance.levenshtein on fixed pairs and on randomly edited k-mers.

        The fixed pairs pin exact distances. The randomized part applies a
        counted number of deletions, insertions and substitutions to a random
        k-mer and asserts that the reported distance never exceeds that count.
        """
        fixed_cases = (
            (0, '', ''),
            (3, '', 'abc'),
            (2, 'abcde', 'acd'),
            (4, 'abcabcabc', 'bcabc'),
            (5, 'abcabcabc', 'bcac'),
            (5, 'bcac', 'abcabcabc'),
        )
        for expected, left, right in fixed_cases:
            self.assertEqual(expected, Distance.levenshtein(left, right))

        for _ in range(100):
            original = Sequence.randomDnaKmer(100)
            symbols = list(original)
            pieces = []
            n_edits = 0
            pos = 0
            while pos < len(symbols):
                draw = random()
                if draw >= 30 / 100:
                    # no edit: copy the symbol through unchanged
                    pieces.append(symbols[pos])
                    pos += 1
                    continue

                # every remaining branch is exactly one edit operation
                n_edits += 1
                if draw < 10 / 100:
                    # deletion: consume the symbol without emitting anything
                    pos += 1
                elif draw < 20 / 100:
                    # insertion: emit '-' and stay on the same input position
                    pieces.append('-')
                else:
                    # substitution: emit '*' in place of the symbol
                    pieces.append('*')
                    pos += 1
            mutated = ''.join(pieces)
            # the applied edits are one valid edit script, so the true distance
            # can only be smaller than or equal to their count
            self.assertLessEqual(Distance.levenshtein(mutated, original), n_edits)
示例#2
0
 def test_hash_certainty(self):
     """Hashing the same k-mer repeatedly with one hasher must give one value.

     Covers M in (8, 16, 32, 64) and K from 8 to 256 in steps of 8, where
     K is also the length requested from Sequence.randomDnaKmer.
     """
     m_values = (8, 16, 32, 64)
     k_values = range(8, 257, 8)
     for M in m_values:
         for K in k_values:
             hasher = DnaKmerHasher(K, M)
             kmer = Sequence.randomDnaKmer(K)
             reference = hasher.hash(kmer)
             # re-hash the identical input 100 times; every result must match
             repeats = 0
             while repeats < 100:
                 self.assertEqual(reference, hasher.hash(kmer))
                 repeats += 1
示例#3
0
    def test_randomDnaKmer(self):
        """Check the length and letter usage of Sequence.randomDnaKmer.

        For every requested length from 0 to 100 the result must have exactly
        that length. Over all generated sequences the set of letters must be
        exactly A, C, G, T, and a chi-square test on the letter counts must
        give a p-value of at least 0.05.
        """
        letter_counts = Counter()
        for length in range(101):
            kmer = Sequence.randomDnaKmer(length)
            self.assertEqual(length, len(kmer))
            letter_counts.update(kmer)

        observed_letters = tuple(sorted(letter_counts))
        self.assertEqual(observed_letters, tuple('ACGT'))

        # chisquare returns (statistic, p-value); index 1 is the p-value
        p_value = chisquare(list(letter_counts.values()))[1]
        self.assertGreaterEqual(p_value, 0.05)
示例#4
0
    def test_to_string(self):
        """A hasher rebuilt from another hasher's str() must hash identically.

        For each (M, K) combination, 100 fresh hashers are created; each one
        hashes a random k-mer, is reconstructed from its str() output, and the
        reconstructed hasher must produce the same hash for the same k-mer.
        """
        m_values = (1, 2, 4, 8, 11, 16, 17, 32, 64)
        for M in m_values:
            for K in range(4, 257, 4):
                for _ in range(100):
                    original = DnaKmerHasher(K, M)
                    kmer = Sequence.randomDnaKmer(K)
                    expected = original.hash(kmer)

                    # round-trip through the string form
                    restored = DnaKmerHasher(original.str())
                    self.assertEqual(expected, restored.hash(kmer))
示例#5
0
    def test_hash_uniform1(self):
        """Hash one fixed k-mer with many fresh hashers and check the spread.

        For each (M, K) combination a single random k-mer is hashed by 10000
        newly constructed DnaKmerHasher instances. The hashes are scaled by
        2**M - 1 and must reach 0 and 1 to two decimal places; a chi-square
        test on a 50-bin histogram must give a p-value above 0.05.
        """
        for M in (8, 16, 32, 64):
            max_hash = 2**M - 1
            for K in range(8, 257, 8):
                kmer = Sequence.randomDnaKmer(K)
                # a new hasher per sample: only the hasher varies, not the input
                scaled = [DnaKmerHasher(K, M).hash(kmer) / max_hash for _ in range(10000)]

                self.assertAlmostEqual(min(scaled), 0, 2)
                self.assertAlmostEqual(max(scaled), 1, 2)

                # NOTE(review): chisquare is given density values here, not raw
                # bin counts; this probably makes the check much weaker than a
                # test on counts would be -- confirm this is intended.
                density, _ = np.histogram(scaled, bins=50, density=True)
                p_value = chisquare(density)[1]
                self.assertGreater(p_value, 0.05)
示例#6
0
    def test_hamming(self):
        """Check Distance.hamming on fixed inputs and on partially masked k-mers.

        Two empty strings have distance 0, a single differing character gives
        1, and inputs of different length must raise RuntimeError. For each i
        from 0 to 99 the first i characters of a random 100-mer are replaced
        by '*' and the distance to the original must equal i.
        """
        self.assertEqual(Distance.hamming('', ''), 0)
        self.assertEqual(Distance.hamming("123456789", "12345678*"), 1)

        with self.assertRaises(RuntimeError):
            Distance.hamming('12', '123')

        for n_changed in range(100):
            original = Sequence.randomDnaKmer(100)
            symbols = list(original)
            # overwrite the leading n_changed positions in one step
            symbols[:n_changed] = ['*'] * n_changed

            masked = ''.join(symbols)
            self.assertEqual(Distance.hamming(original, masked), n_changed)
示例#7
0
    def test_hash_uniform0(self):
        """Hash many random k-mers with one hasher and check the spread.

        For each (M, K) combination a single DnaKmerHasher hashes 10000 random
        k-mers. The hashes are scaled by 2**M - 1 and must reach 0 and 1 to two
        decimal places; a chi-square test on a 50-bin histogram must give a
        p-value above 0.05.
        """
        for M in (8, 16, 32, 64):
            max_hash = 2**M - 1

            for K in range(8, 257, 8):
                hasher = DnaKmerHasher(K, M)
                # one hasher, fresh input per sample: only the k-mer varies
                scaled = [hasher.hash(Sequence.randomDnaKmer(K)) / max_hash for _ in range(10000)]

                # per the original author: for random hash values to reach the
                # boundaries (0 and 1), K should be bigger than M/2
                self.assertAlmostEqual(0, min(scaled), 2)
                self.assertAlmostEqual(1, max(scaled), 2)

                # NOTE(review): chisquare is given density values here, not raw
                # bin counts; this probably makes the check much weaker than a
                # test on counts would be -- confirm this is intended.
                density, _ = np.histogram(scaled, bins=50, density=True)
                p_value = chisquare(density)[1]
                self.assertGreater(p_value, 0.05)
示例#8
0
 def test_reverseComplementRna(self):
     """Check Sequence.reverseComplementRna against a fixed case and its parts.

     Besides one literal mixed-case example, random RNA k-mers of length 0 to
     100 must give the same result whether the sequence is reversed and then
     complemented, or complemented and then reversed.
     """
     self.assertEqual('auauauaaaucuuguuuCUAGa', Sequence.reverseComplementRna('uCUAGaaacaagauuuauauau'))
     for length in range(101):
         kmer = Sequence.randomRnaKmer(length)

         complement_of_reverse = Sequence.complementRna(Sequence.reverse(kmer))
         self.assertEqual(complement_of_reverse, Sequence.reverseComplementRna(kmer))

         reverse_of_complement = Sequence.reverse(Sequence.complementRna(kmer))
         self.assertEqual(reverse_of_complement, Sequence.reverseComplementRna(kmer))
示例#9
0
 def test_reverseComplementDna(self):
     """Check Sequence.reverseComplementDna against a fixed case and its parts.

     Besides one literal mixed-case example, random DNA k-mers of length 0 to
     100 must give the same result whether the sequence is reversed and then
     complemented, or complemented and then reversed.
     """
     self.assertEqual('atatataaatcttgtttCTAGa', Sequence.reverseComplementDna('tCTAGaaacaagatttatatat'))
     for length in range(101):
         kmer = Sequence.randomDnaKmer(length)

         complement_of_reverse = Sequence.complementDna(Sequence.reverse(kmer))
         self.assertEqual(complement_of_reverse, Sequence.reverseComplementDna(kmer))

         reverse_of_complement = Sequence.reverse(Sequence.complementDna(kmer))
         self.assertEqual(reverse_of_complement, Sequence.reverseComplementDna(kmer))
示例#10
0
 def test_reverse(self):
     """Sequence.reverse must agree with [::-1] slicing for lengths 0 to 100."""
     for length in range(101):
         kmer = Sequence.randomDnaKmer(length)
         expected = kmer[::-1]
         actual = Sequence.reverse(kmer)
         self.assertEqual(actual, expected)