def test_levenshtein(self):
    """Check Distance.levenshtein on fixed pairs and on randomly edited k-mers.

    The fixed pairs pin exact distances. The randomized part applies a
    counted number of deletions, insertions and substitutions to a random
    DNA string and asserts that the reported distance never exceeds that
    count.
    """
    fixed_cases = (
        (0, '', ''),
        (3, '', 'abc'),
        (2, 'abcde', 'acd'),
        (4, 'abcabcabc', 'bcabc'),
        (5, 'abcabcabc', 'bcac'),
        (5, 'bcac', 'abcabcabc'),
    )
    for expected, left, right in fixed_cases:
        self.assertEqual(expected, Distance.levenshtein(left, right))

    for _ in range(100):
        original = Sequence.randomDnaKmer(100)
        symbols = list(original)
        mutated = []
        edit_count = 0
        pos = 0
        while pos < len(symbols):
            draw = random()
            if draw < 10 / 100:
                # Deletion: skip the current symbol without emitting it.
                edit_count += 1
                pos += 1
            elif draw < 20 / 100:
                # Insertion: emit an extra symbol and stay on the current one.
                mutated.append('-')
                edit_count += 1
            elif draw < 30 / 100:
                # Substitution: emit a placeholder instead of the symbol.
                mutated.append('*')
                edit_count += 1
                pos += 1
            else:
                # No edit: copy the symbol through unchanged.
                mutated.append(symbols[pos])
                pos += 1
        mutated_text = ''.join(mutated)
        # The applied edits are one valid edit script, so the true distance
        # can be at most edit_count.
        self.assertLessEqual(
            Distance.levenshtein(mutated_text, original), edit_count)
def test_hash_certainty(self):
    """A given hasher must return the same hash every time for one k-mer.

    For every (K, M) configuration a single hasher and a single random
    k-mer of length K are created; the hash is then recomputed 100 times
    and compared with the first result.
    """
    for M in (8, 16, 32, 64):
        for K in range(8, 257, 8):
            hasher = DnaKmerHasher(K, M)
            kmer = Sequence.randomDnaKmer(K)
            reference = hasher.hash(kmer)
            remaining = 100
            while remaining > 0:
                self.assertEqual(reference, hasher.hash(kmer))
                remaining -= 1
def test_randomDnaKmer(self):
    """Random DNA k-mers have the requested length and a balanced alphabet.

    Lengths 0 through 100 are generated; each result must have exactly the
    requested length. Over all generated characters, the set of symbols
    must be exactly A, C, G, T, and a chi-square test on the symbol counts
    must yield a p-value of at least 0.05.
    """
    base_counts = Counter()
    for length in range(0, 101):
        kmer = Sequence.randomDnaKmer(length)
        self.assertEqual(length, len(kmer))
        base_counts.update(kmer)

    observed_alphabet = tuple(sorted(base_counts.keys()))
    self.assertEqual(observed_alphabet, tuple('ACGT'))

    # chisquare() with no expected frequencies tests against equal counts.
    # NOTE(review): this is a statistical check; presumably it can fail by
    # chance on a small fraction of runs - confirm whether that is accepted.
    p_value = chisquare(list(base_counts.values()))[1]
    self.assertGreaterEqual(p_value, 0.05)
def test_to_string(self):
    """A hasher rebuilt from its str() form must hash identically.

    For each (K, M) configuration, 100 fresh hashers are created. Each one
    hashes a random k-mer of length K, is serialized with str(), and a new
    hasher constructed from that string must give the same hash for the
    same k-mer.
    """
    for M in (1, 2, 4, 8, 11, 16, 17, 32, 64):
        for K in range(4, 257, 4):
            for _ in range(100):
                source_hasher = DnaKmerHasher(K, M)
                kmer = Sequence.randomDnaKmer(K)
                expected = source_hasher.hash(kmer)
                # Round-trip through the string representation.
                restored_hasher = DnaKmerHasher(source_hasher.str())
                self.assertEqual(expected, restored_hasher.hash(kmer))
def test_hash_uniform1(self):
    """Hashes of one fixed k-mer under many fresh hashers look uniform.

    For each (K, M) configuration a single random k-mer is hashed by
    10000 newly constructed hashers. Hashes are scaled by 2**M - 1, and the
    scaled values must come within two decimal places of 0 and of 1 and
    pass a chi-square check on a 50-bin histogram.
    """
    for M in (8, 16, 32, 64):
        upper_bound = 2**M - 1
        for K in range(8, 257, 8):
            kmer = Sequence.randomDnaKmer(K)
            # A new hasher per sample: the k-mer is fixed, the hasher varies.
            scaled = [
                DnaKmerHasher(K, M).hash(kmer) / upper_bound
                for _ in range(10000)
            ]
            self.assertAlmostEqual(min(scaled), 0, 2)
            self.assertAlmostEqual(max(scaled), 1, 2)
            # NOTE(review): chisquare is fed density-normalized bin heights
            # (density=True) rather than raw counts - confirm this is the
            # intended statistic.
            bin_heights, _ = np.histogram(scaled, bins=50, density=True)
            p_value = chisquare(bin_heights)[1]
            self.assertGreater(p_value, 0.05)
def test_hamming(self):
    """Check Distance.hamming on fixed inputs and on masked random k-mers.

    Covers two empty strings, a single mismatch, the RuntimeError raised
    for inputs of different length, and random 100-character DNA strings
    whose first i characters are replaced by '*' (expected distance i).
    """
    self.assertEqual(Distance.hamming('', ''), 0)
    self.assertEqual(Distance.hamming("123456789", "12345678*"), 1)

    # Inputs of unequal length are rejected.
    with self.assertRaises(RuntimeError):
        Distance.hamming('12', '123')

    for mismatches in range(100):
        reference = Sequence.randomDnaKmer(100)
        # '*' is not a DNA letter, so every masked position is a mismatch.
        masked = '*' * mismatches + reference[mismatches:]
        self.assertEqual(Distance.hamming(reference, masked), mismatches)
def test_hash_uniform0(self):
    """Hashes of many random k-mers under one fixed hasher look uniform.

    For each (K, M) configuration a single hasher hashes 10000 random
    k-mers of length K. Hashes are scaled by 2**M - 1, and the scaled
    values must come within two decimal places of 0 and of 1 and pass a
    chi-square check on a 50-bin histogram.
    """
    for M in (8, 16, 32, 64):
        upper_bound = 2**M - 1
        for K in range(8, 257, 8):
            hasher = DnaKmerHasher(K, M)
            # One hasher per configuration: the hasher is fixed, k-mers vary.
            scaled = [
                hasher.hash(Sequence.randomDnaKmer(K)) / upper_bound
                for _ in range(10000)
            ]
            # Original author's note: for the random hash values to reach
            # the boundary (0 and 1), k should be bigger than M/2.
            self.assertAlmostEqual(0, min(scaled), 2)
            self.assertAlmostEqual(1, max(scaled), 2)
            # NOTE(review): chisquare is fed density-normalized bin heights
            # (density=True) rather than raw counts - confirm this is the
            # intended statistic.
            bin_heights, _ = np.histogram(scaled, bins=50, density=True)
            p_value = chisquare(bin_heights)[1]
            self.assertGreater(p_value, 0.05)
def test_reverseComplementRna(self):
    """Check Sequence.reverseComplementRna against a fixed case and its parts.

    The fixed example mixes upper- and lower-case letters and expects the
    case to carry through. For random RNA k-mers of length 0 through 100,
    the result must equal both complement-of-reverse and
    reverse-of-complement.
    """
    self.assertEqual(
        'auauauaaaucuuguuuCUAGa',
        Sequence.reverseComplementRna('uCUAGaaacaagauuuauauau'))

    for length in range(0, 101):
        rna = Sequence.randomRnaKmer(length)

        reversed_then_complemented = Sequence.complementRna(
            Sequence.reverse(rna))
        self.assertEqual(
            reversed_then_complemented, Sequence.reverseComplementRna(rna))

        complemented_then_reversed = Sequence.reverse(
            Sequence.complementRna(rna))
        self.assertEqual(
            complemented_then_reversed, Sequence.reverseComplementRna(rna))
def test_reverseComplementDna(self):
    """Check Sequence.reverseComplementDna against a fixed case and its parts.

    The fixed example mixes upper- and lower-case letters and expects the
    case to carry through. For random DNA k-mers of length 0 through 100,
    the result must equal both complement-of-reverse and
    reverse-of-complement.
    """
    self.assertEqual(
        'atatataaatcttgtttCTAGa',
        Sequence.reverseComplementDna('tCTAGaaacaagatttatatat'))

    for length in range(0, 101):
        dna = Sequence.randomDnaKmer(length)

        reversed_then_complemented = Sequence.complementDna(
            Sequence.reverse(dna))
        self.assertEqual(
            reversed_then_complemented, Sequence.reverseComplementDna(dna))

        complemented_then_reversed = Sequence.reverse(
            Sequence.complementDna(dna))
        self.assertEqual(
            complemented_then_reversed, Sequence.reverseComplementDna(dna))
def test_reverse(self):
    """Sequence.reverse must agree with slice reversal for lengths 0..100."""
    for length in range(0, 101):
        kmer = Sequence.randomDnaKmer(length)
        actual = Sequence.reverse(kmer)
        # A [::-1] slice is the reference implementation of reversal.
        self.assertEqual(actual, kmer[::-1])