Exemplo n.º 1
0
 def __init__(self, seq, overlapping=True, normalize=True):
     """
     Compute dinucleotide usage and single-base frequencies for seq.

     seq         -- nucleotide sequence string; kept upper-cased in
                    self.seq
     overlapping -- forwarded to DinucUsage as its Overlapping argument
     normalize   -- when true, self.du.normalize() is called right
                    after the dinucleotide usage is built
     """
     self.seq = seq.upper()
     self.overlapping = overlapping
     self.normalize = normalize
     # Sequence length
     self.len = len(seq)
     # Calculate dinuc. frequencies
     self.du = DinucUsage(seq, Overlapping=overlapping)
     if normalize:
         self.du.normalize()
     # Calculate frequencies for individual bases
     A_base = "A"
     T_base = "T"
     U_base = "U"
     G_base = "G"
     C_base = "C"
     self.bases = (A_base, T_base, G_base, C_base)
     self.base_freqs = defaultdict(int)
     # U has to be counted alongside self.bases: if it is skipped its
     # frequency stays at the defaultdict's 0, the max() below always
     # picks T, and an RNA sequence ends up with T == U == 0.
     for curr_base in self.bases + (U_base,):
         if self.len:
             curr_freq = self.seq.count(curr_base) / float(self.len)
         else:
             # Zero-length sequence: avoid dividing by zero
             curr_freq = 0.0
         self.base_freqs[curr_base] = curr_freq
     # Equalize T/U -- pick the greater of the two frequencies
     # and then set them as equal
     shared_freq = max(self.base_freqs[T_base], self.base_freqs[U_base])
     self.base_freqs[T_base] = shared_freq
     self.base_freqs[U_base] = shared_freq
Exemplo n.º 2
0
 def test_init_from_seq(self):
     """DinucUsage built from a plain string should hold the expected counts."""
     poly_a = 'AAAAA'
     mixed = 'ACTACG'
     acg_repeat = 'ACG' * 5
     acg_repeat_gaa = acg_repeat + 'GAA'
     # Each case is (sequence, constructor keyword arguments, expected
     # counts after filter_dict).
     # NOTE: the expected keys contain U rather than T -- the DNA input
     # is mapped to RNA.
     cases = [
         # default constructor arguments
         (poly_a, {}, {'AA': 4}),
         (mixed, {}, {'AC': 2, 'CU': 1, 'UA': 1, 'CG': 1}),
         # non-overlapping counting
         (poly_a, {'Overlapping': False}, {'AA': 2}),
         (mixed, {'Overlapping': False}, {'AC': 1, 'UA': 1, 'CG': 1}),
         # the 3-1 case
         (poly_a, {'Overlapping': '3-1'}, {'AA': 1}),
         (mixed, {'Overlapping': '3-1'}, {'UA': 1}),
         (acg_repeat, {'Overlapping': '3-1'}, {'GA': 4}),
         (acg_repeat_gaa, {'Overlapping': '3-1'}, {'GA': 4, 'GG': 1}),
     ]
     for sequence, ctor_kwargs, expected_counts in cases:
         usage = DinucUsage(sequence, **ctor_kwargs)
         self.assertEqual(filter_dict(usage), expected_counts)
Exemplo n.º 3
0
 def test_distance(self):
     """DinucUsage.distance should return the Euclidean distance."""
     # Both usages are built with non-overlapping counting.
     # NOTE(review): the expected value 5 fits a 3-4-5 triangle if the
     # counts are AA=1/GG=10 versus AA=5/GG=7 -- confirm against DinucUsage.
     usage_a = DinucUsage('AA' + 'GG' * 10, Overlapping=False)
     usage_b = DinucUsage('AA' * 5 + 'GG' * 7, Overlapping=False)
     # (left operand, right operand, expected distance); the last two rows
     # check that the distance is the same in both directions.
     expectations = (
         (usage_a, usage_a, 0),
         (usage_a, usage_b, 5),
         (usage_b, usage_a, 5),
     )
     for left, right, expected in expectations:
         self.assertEqual(left.distance(right), expected)
Exemplo n.º 4
0
class DinucFreqs:
    """
    Dinucleotide frequencies. Wrapper for pycogent class.

    Attributes set by __init__:
      seq        -- the input sequence, upper-cased
      len        -- length of the input sequence
      du         -- DinucUsage built from the sequence (normalized when
                    normalize is true)
      bases      -- the tuple ("A", "T", "G", "C")
      base_freqs -- defaultdict mapping A/T/G/C/U to their frequency in
                    seq, with T and U forced to the same value
    """
    def __init__(self, seq, overlapping=True, normalize=True):
        """
        Build dinucleotide usage and single-base frequencies for seq.

        seq         -- nucleotide sequence string
        overlapping -- forwarded to DinucUsage as its Overlapping argument
        normalize   -- when true, self.du.normalize() is called right
                       after the dinucleotide usage is built
        """
        self.seq = seq.upper()
        self.overlapping = overlapping
        self.normalize = normalize
        # Sequence length
        self.len = len(seq)
        # Calculate dinuc. frequencies
        self.du = DinucUsage(seq, Overlapping=overlapping)
        if normalize:
            self.du.normalize()
        # Calculate frequencies for individual bases
        self.bases = ("A", "T", "G", "C")
        self.base_freqs = self._single_base_freqs(self.seq, self.len)

    @staticmethod
    def _single_base_freqs(seq, seq_len):
        """
        Return a defaultdict of the A/T/G/C/U frequencies in seq.

        Each frequency is seq.count(base) / seq_len; every frequency is
        0.0 when seq_len is 0. T and U are then both set to the greater
        of the two so either alphabet can be used for lookups.
        """
        freqs = defaultdict(int)
        # U must be counted here too: if it is left out its frequency is
        # always 0, the max() below always picks T, and an RNA sequence
        # ends up with T == U == 0.
        for base in ("A", "T", "G", "C", "U"):
            if seq_len:
                freqs[base] = seq.count(base) / float(seq_len)
            else:
                # Zero-length sequence: avoid dividing by zero
                freqs[base] = 0.0
        # Equalize T/U -- pick the greater of the two frequencies
        # and then set them as equal
        shared_freq = max(freqs["T"], freqs["U"])
        freqs["T"] = shared_freq
        freqs["U"] = shared_freq
        return freqs

    def get_dinuc_freqs_from(self, base, all_bases="ATGC"):
        """
        Get a list of dinucleotide frequencies from the given base to
        each other base, in the order given by all_bases.

        base -> A
        base -> T
        ...

        Each entry is self.du[base + other_base].
        """
        return [self.du["%s%s" % (base, possible_base)]
                for possible_base in all_bases]

    def prob_score(self, subseq):
        """
        Score probability of subseq in sequence.

        The score is the frequency of the first base multiplied, for
        every (previous, next) pair produced by
        utils.iter_by_pair(subseq, 1), by the dinucleotide frequency of
        that pair divided by the summed frequencies of all dinucleotides
        starting with the previous base. The product is accumulated in
        log space. Returns 0 for an empty subseq.

        NOTE(review): presumably iter_by_pair yields consecutive
        overlapping base pairs -- confirm against utils.
        """
        if not subseq:
            return 0
        # Score first base
        total_logscore = np.log(self.base_freqs[subseq[0]])
        for prev_base, next_base in utils.iter_by_pair(subseq, 1):
            # Score current dinucleotide
            curr_dinuc = "%s%s" % (prev_base, next_base)
            # Divide by sum of all other transitions from the previous base
            curr_dinuc_freq = self.du[curr_dinuc]
            denom_dinuc_freqs = np.sum(self.get_dinuc_freqs_from(prev_base))
            total_logscore += (np.log(curr_dinuc_freq) -
                               np.log(denom_dinuc_freqs))
        # Back from log space to a plain probability
        return np.exp(total_logscore)

    def get_expected_num(self, subseq):
        """
        Calculate the number of expected occurrences of subseq
        based on dinucleotide frequencies.

        Returns prob_score(subseq) times the number of positions at
        which subseq could start within the sequence.
        """
        # Calculate score of sequence (its probability)
        subseq_score = self.prob_score(subseq)
        # Calculate the number of possible positions for the subseq
        # to occur within the larger sequence. A subseq longer than the
        # sequence fits nowhere, so clamp at 0 rather than letting the
        # expected count go negative.
        num_positions = max(self.len - len(subseq) + 1, 0)
        # Expected number is the score times the number of possible
        # positions
        return num_positions * subseq_score

    def __str__(self):
        return "DinucFreqs(len=%d, seq=%s)" % (self.len, self.seq)

    def __repr__(self):
        return self.__str__()