def test_distance(self):
    """DinucUsage.distance should give the Euclidean distance between usages."""
    # The expected value 5 is consistent with a 3-4-5 triangle: counting
    # non-overlapping pairs, the two inputs should differ by 4 in 'AA'
    # and by 3 in 'GG'.
    mostly_gg = DinucUsage('AA' + 'GG' * 10, Overlapping=False)
    balanced = DinucUsage('AA' * 5 + 'GG' * 7, Overlapping=False)

    # An object compared against itself is at distance zero.
    self.assertEqual(mostly_gg.distance(mostly_gg), 0)

    # The distance is checked in both directions.
    self.assertEqual(mostly_gg.distance(balanced), 5)
    self.assertEqual(balanced.distance(mostly_gg), 5)
def test_init_from_seq(self):
    """DinucUsage built from a plain string should hold the expected counts."""
    homopolymer = 'AAAAA'
    mixed = 'ACTACG'
    triplet_repeat = 'ACG' * 5
    triplet_repeat_ext = triplet_repeat + 'GAA'

    # Each case is (input sequence, constructor keyword args, expected
    # result after filter_dict). The order matches the original checks.
    cases = [
        # Default construction (no Overlapping argument given).
        (homopolymer, {}, {'AA': 4}),
        # NOTE: will map DNA seq to RNA.
        (mixed, {}, {'AC': 2, 'CU': 1, 'UA': 1, 'CG': 1}),
        # Check that it works for non-overlapping.
        (homopolymer, {'Overlapping': False}, {'AA': 2}),
        (mixed, {'Overlapping': False}, {'AC': 1, 'UA': 1, 'CG': 1}),
        # Check that it works for the 3-1 case.
        (homopolymer, {'Overlapping': '3-1'}, {'AA': 1}),
        (mixed, {'Overlapping': '3-1'}, {'UA': 1}),
        (triplet_repeat, {'Overlapping': '3-1'}, {'GA': 4}),
        (triplet_repeat_ext, {'Overlapping': '3-1'}, {'GA': 4, 'GG': 1}),
    ]

    for sequence, init_kwargs, expected in cases:
        usage = DinucUsage(sequence, **init_kwargs)
        self.assertEqual(filter_dict(usage), expected)
def __init__(self, seq, overlapping=True, normalize=True):
    """Record a sequence with its dinucleotide usage and base frequencies.

    Parameters:
        seq: the sequence as a string. An upper-cased copy is kept in
            ``self.seq``; the string as given is what is handed to
            ``DinucUsage`` and what ``self.len`` measures.
        overlapping: passed through to ``DinucUsage`` as ``Overlapping``.
        normalize: when true, ``normalize()`` is called on the
            dinucleotide usage object right after it is built.

    Attributes set:
        seq, overlapping, normalize, len: see above.
        du: the ``DinucUsage`` object for ``seq``.
        bases: the tuple ``("A", "T", "G", "C")``.
        base_freqs: ``defaultdict(int)`` mapping each of A, T, G, C and U
            to its fraction of the sequence. T and U always share one
            value, the larger of the two measured fractions.
    """
    self.seq = seq.upper()
    self.overlapping = overlapping
    self.normalize = normalize

    # Sequence length
    self.len = len(seq)

    # Calculate dinuc. frequencies
    self.du = DinucUsage(seq, Overlapping=overlapping)
    if normalize:
        self.du.normalize()

    # Calculate frequencies for individual bases
    self.bases = ("A", "T", "G", "C")
    self.base_freqs = defaultdict(int)
    seq_length = float(self.len)
    # "U" is measured here even though it is not part of self.bases.
    # Without it the T/U equalisation below would compare T against the
    # defaultdict's implicit 0 and RNA input would get 0 for both.
    for curr_base in self.bases + ("U",):
        if seq_length:
            freq = self.seq.count(curr_base) / seq_length
        else:
            # Empty sequence: report zero instead of dividing by zero.
            freq = 0.0
        self.base_freqs[curr_base] = freq

    # Equalize T/U -- pick the greater of the two frequencies
    # and then set them as equal
    shared_freq = max(self.base_freqs["T"], self.base_freqs["U"])
    self.base_freqs["T"] = shared_freq
    self.base_freqs["U"] = shared_freq