def greedy_motif_search(dna, k, t, pseudo=0):
    """
    Greedy motif search seeded from every k-mer of the first DNA string.

    For each k-mer of ``dna[0]`` the motifs for ``dna[1] .. dna[t - 1]`` are
    picked one string at a time as the profile-most-probable k-mer under the
    profile of the motifs chosen so far. The candidate set with the lowest
    score is returned; on ties the earliest candidate is kept.

    :param dna: DNA strings
    :param k: kmer size
    :param t: number of kmers in the DNA (strings ``dna[0] .. dna[t - 1]``
        are used)
    :param pseudo: pseudocount added to every cell of the count matrix
        before the profile is derived; a falsy value disables smoothing
    :return: best motif (list of k-mers), or ``[]`` when ``dna`` is empty or
        ``dna[0]`` is shorter than ``k``
    """
    if not dna:
        return []

    first = dna[0]
    best_motifs = []
    best_score = 0
    for start in range(len(first) - k + 1):
        motifs = [first[start:start + k]]
        for j in range(1, t):
            count = get_count(motifs)
            if pseudo:
                # Pseudocounts are applied to the raw counts (Laplace's rule),
                # not to the already-normalised profile, so that unseen
                # nucleotides get a small non-zero probability without
                # flattening the whole distribution.
                count = add_scalar_to_matrix(pseudo, count)
            profile = get_profile(count)
            motifs.append(profile_most_probable_pattern(dna[j], k, profile))

        current_score = score(motifs, consensus(get_count(motifs)))
        # `not best_motifs` makes the first candidate win unconditionally,
        # so the initial best_score value is never compared against.
        if not best_motifs or current_score < best_score:
            best_motifs = motifs
            best_score = current_score

    return best_motifs
Пример #2
0
    def test_greedy_motif_search_heavy(self):
        # Greedy search without pseudocounts on ten 250-nucleotide strings
        # with k = 15 must return exactly the motifs listed below.
        from motifs.count_matrix import get_count

        sequences = [
            'GCGCCCCGCCCGGACAGCCATGCGCTAACCCTGGCTTCGATGGCGCCGGCTCAGTTAGGGCCGGAAGTCCCCAATGTGGCAGACCTTTCGCCCCTGGCGGACGAATGACCCCAGTGGCCGGGACTTCAGGCCCTATCGGAGGGCTCCGGCGCGGTGGTCGGATTTGTCTGTGGAGGTTACACCCCAATCGCAAGGATGCATTATGACCAGCGAGCTGAGCCTGGTCGCCACTGGAAAGGGGAGCAACATC',
            'CCGATCGGCATCACTATCGGTCCTGCGGCCGCCCATAGCGCTATATCCGGCTGGTGAAATCAATTGACAACCTTCGACTTTGAGGTGGCCTACGGCGAGGACAAGCCAGGCAAGCCAGCTGCCTCAACGCGCGCCAGTACGGGTCCATCGACCCGCGGCCCACGGGTCAAACGACCCTAGTGTTCGCTACGACGTGGTCGTACCTTCGGCAGCAGATCAGCAATAGCACCCCGACTCGAGGAGGATCCCG',
            'ACCGTCGATGTGCCCGGTCGCGCCGCGTCCACCTCGGTCATCGACCCCACGATGAGGACGCCATCGGCCGCGACCAAGCCCCGTGAAACTCTGACGGCGTGCTGGCCGGGCTGCGGCACCTGATCACCTTAGGGCACTTGGGCCACCACAACGGGCCGCCGGTCTCGACAGTGGCCACCACCACACAGGTGACTTCCGGCGGGACGTAAGTCCCTAACGCGTCGTTCCGCACGCGGTTAGCTTTGCTGCC',
            'GGGTCAGGTATATTTATCGCACACTTGGGCACATGACACACAAGCGCCAGAATCCCGGACCGAACCGAGCACCGTGGGTGGGCAGCCTCCATACAGCGATGACCTGATCGATCATCGGCCAGGGCGCCGGGCTTCCAACCGTGGCCGTCTCAGTACCCAGCCTCATTGACCCTTCGACGCATCCACTGCGCGTAAGTCGGCTCAACCCTTTCAAACCGCTGGATTACCGACCGCAGAAAGGGGGCAGGAC',
            'GTAGGTCAAACCGGGTGTACATACCCGCTCAATCGCCCAGCACTTCGGGCAGATCACCGGGTTTCCCCGGTATCACCAATACTGCCACCAAACACAGCAGGCGGGAAGGGGCGAAAGTCCCTTATCCGACAATAAAACTTCGCTTGTTCGACGCCCGGTTCACCCGATATGCACGGCGCCCAGCCATTCGTGACCGACGTCCCCAGCCCCAAGGCCGAACGACCCTAGGAGCCACGAGCAATTCACAGCG',
            'CCGCTGGCGACGCTGTTCGCCGGCAGCGTGCGTGACGACTTCGAGCTGCCCGACTACACCTGGTGACCACCGCCGACGGGCACCTCTCCGCCAGGTAGGCACGGTTTGTCGCCGGCAATGTGACCTTTGGGCGCGGTCTTGAGGACCTTCGGCCCCACCCACGAGGCCGCCGCCGGCCGATCGTATGACGTGCAATGTACGCCATAGGGTGCGTGTTACGGCGATTACCTGAAGGCGGCGGTGGTCCGGA',
            'GGCCAACTGCACCGCGCTCTTGATGACATCGGTGGTCACCATGGTGTCCGGCATGATCAACCTCCGCTGTTCGATATCACCCCGATCTTTCTGAACGGCGGTTGGCAGACAACAGGGTCAATGGTCCCCAAGTGGATCACCGACGGGCGCGGACAAATGGCCCGCGCTTCGGGGACTTCTGTCCCTAGCCCTGGCCACGATGGGCTGGTCGGATCAAAGGCATCCGTTTCCATCGATTAGGAGGCATCAA',
            'GTACATGTCCAGAGCGAGCCTCAGCTTCTGCGCAGCGACGGAAACTGCCACACTCAAAGCCTACTGGGCGCACGTGTGGCAACGAGTCGATCCACACGAAATGCCGCCGTTGGGCCGCGGACTAGCCGAATTTTCCGGGTGGTGACACAGCCCACATTTGGCATGGGACTTTCGGCCCTGTCCGCGTCCGTGTCGGCCAGACAAGCTTTGGGCATTGGCCACAATCGGGCCACAATCGAAAGCCGAGCAG',
            'GGCAGCTGTCGGCAACTGTAAGCCATTTCTGGGACTTTGCTGTGAAAAGCTGGGCGATGGTTGTGGACCTGGACGAGCCACCCGTGCGATAGGTGAGATTCATTCTCGCCCTGACGGGTTGCGTCTGTCATCGGTCGATAAGGACTAACGGCCCTCAGGTGGGGACCAACGCCCCTGGGAGATAGCGGTCCCCGCCAGTAACGTACCGCTGAACCGACGGGATGTATCCGCCCCAGCGAAGGAGACGGCG',
            'TCAGCACCATGACCGCCTGGCCACCAATCGCCCGTAACAAGCGGGACGTCCGCGACGACGCGTGCGCTAGCGCCGTGGCGGTGACAACGACCAGATATGGTCCGAGCACGCGGGCGAACCTCGTGTTCTGGCCTCGGCCAGTTGTGTAGAGCTCATCGCTGTCATCGAGCGATATCCGACCACTGATCCAAGTCGGGGGCTCTGGGGACCGAAGTCCCCGGGCTCGGAGCTATCGGACCTCACGATCACC',
        ]
        kmer_size = 15
        wanted = [
            'GTTAGGGCCGGAAGT', 'CCGATCGGCATCACT', 'ACCGTCGATGTGCCC',
            'GGGTCAGGTATATTT', 'GTGACCGACGTCCCC', 'CTGTTCGCCGGCAGC',
            'CTGTTCGATATCACC', 'GTACATGTCCAGAGC', 'GCGATAGGTGAGATT',
            'CTCATCGCTGTCATC'
        ]

        # NOTE(review): the two values below are assigned but never asserted,
        # and expected_consensus is 16 characters long although k is 15.
        # Confirm the intended reference values before turning them into
        # assertions.
        expected_consensus = 'gttAAAtAgaGatGtG'.upper()
        expected_score = 58

        found = greedy_motif_search(sequences, kmer_size, len(sequences))
        self.assertEqual(wanted, found)

        # The consensus of the result is only printed, not checked.
        print(consensus(get_count(found)))
def gibbs_sampler(dna, k, t, n):
    """
    Run one Gibbs-sampling pass of motif search.

    Starting from the motifs returned by ``random_motifs``, each of the ``n``
    iterations picks one string at random, rebuilds the profile (with
    pseudocounts) from the motifs of all *other* strings, and replaces the
    picked string's motif with a k-mer drawn from that string by
    ``profile_generated_string``. The lowest-scoring motif set seen is
    remembered.

    :param dna: DNA strings
    :param k: kmer size
    :param t: number of DNA strings (indices ``0 .. t - 1`` are sampled)
    :param n: number of sampling iterations
    :return: the lowest-scoring list of motifs encountered, including the
        initial random selection
    """
    # NOTE(review): assumes profile_generated_string(text, profile, k)
    # draws a k-mer from `text`; confirm against its definition.
    motifs = random_motifs(list(dna), k, t)

    # Keep a snapshot: `motifs` is mutated in place below, so storing the
    # same list object would silently overwrite the best result.
    best_motifs = list(motifs)
    best_score = score(best_motifs, consensus(get_count(best_motifs)))

    for _ in range(n):
        skip_index = randint(0, t - 1)

        # The profile comes from the currently selected motifs of every
        # string except the one being resampled.
        other_motifs = motifs[:skip_index] + motifs[skip_index + 1:]
        profile = profile_with_pseudocounts(get_count(other_motifs))

        # Resample from the full DNA string; sampling from the old motif
        # (which has only one k-mer) could never change the selection.
        motifs[skip_index] = profile_generated_string(dna[skip_index],
                                                      profile, k)

        current_score = score(motifs, consensus(get_count(motifs)))
        if current_score < best_score:
            best_motifs = list(motifs)
            best_score = current_score

    return best_motifs
def repeated_gibbs_sampler(dna, k, t, n, restarts=100):
    """
    Run ``gibbs_sampler`` several times and keep the best-scoring result.

    :param dna: DNA strings
    :param k: kmer size
    :param t: number of DNA strings
    :param n: number of sampling iterations per run
    :param restarts: number of independent ``gibbs_sampler`` runs
        (defaults to 100, the previously hard-coded value)
    :return: the lowest-scoring list of motifs over all runs, or ``[]`` when
        ``restarts`` is not positive
    """
    best_score = float('inf')
    best_motifs = []
    for _ in range(restarts):
        motifs = gibbs_sampler(dna, k, t, n)
        current_score = score(motifs, consensus(get_count(motifs)))
        if current_score < best_score:
            best_score = current_score
            best_motifs = motifs
    return best_motifs
Пример #5
0
    def test_count_with_pseudocounts(self):
        # Adding 1 to every cell of the count matrix built from self.motifs
        # must give the table below.
        smoothed = add_scalar_to_matrix(1, get_count(self.motifs))

        want = {
            'A': [3, 4, 4, 4, 7, 5, 3, 3, 2, 4],
            'C': [3, 4, 5, 4, 3, 4, 3, 2, 4, 4],
            'T': [3, 3, 1, 5, 2, 1, 3, 3, 2, 5],
            'G': [5, 3, 4, 1, 2, 4, 5, 6, 6, 1],
        }
        self.assertEqual(want, smoothed)
Пример #6
0
    def test_count_with_pseudocounts(self):
        # Adding 1 to every cell of the count matrix built from self.motifs
        # must give the table below.
        smoothed = add_scalar_to_matrix(1, get_count(self.motifs))

        want = {
            'T': [2, 2, 1, 2, 5, 3],
            'A': [2, 3, 2, 1, 1, 3],
            'C': [3, 2, 5, 3, 1, 1],
            'G': [2, 2, 1, 3, 2, 2],
        }
        self.assertEqual(want, smoothed)
Пример #7
0
    def test_consensus(self):
        # The consensus string derived from the count matrix of the five
        # motifs below must be 'CACCTA'.
        sample = ['AACGTA', 'CCCGTT', 'CACCTT', 'GGATTA', 'TTCCGG']

        self.assertEqual('CACCTA', consensus(get_count(sample)))
def profile_to_motifs(profile, dna):
    """
    Pick the best-scoring motif set for a fixed profile.

    The motif for every string after the first is its profile-most-probable
    k-mer; the motif for ``dna[0]`` is tried at every start position and the
    lowest-scoring combination is returned (earliest position on ties).

    :param profile: profile matrix (mapping whose values are per-position
        rows; the row length is taken as the kmer size)
    :param dna: DNA strings
    :return: best motifs, or ``[]`` when ``dna[0]`` is shorter than the kmer
        size
    """

    k = len(list(profile.values())[0])
    starts = range(len(dna[0]) - k + 1)
    if not starts:
        # Nothing to try; also avoids calling the helper needlessly.
        return []

    # The profile never changes, so the motifs of dna[1:] are the same for
    # every candidate start in dna[0]; compute them once.
    # NOTE(review): relies on profile_most_probable_pattern being
    # deterministic for a given (text, k, profile) — confirm.
    tail = [profile_most_probable_pattern(text, k, profile)
            for text in dna[1:]]

    best_motifs = []
    best_score = 0
    for i in starts:
        motifs = [dna[0][i:i + k]] + tail
        current_score = score(motifs, consensus(get_count(motifs)))
        # The first candidate is accepted unconditionally.
        if not best_motifs or current_score < best_score:
            best_motifs = motifs
            best_score = current_score

    return best_motifs
    def test_score(self):
        # Scoring the five motifs below against their own consensus string
        # must give 14.
        sample = ['AACGTA', 'CCCGTT', 'CACCTT', 'GGATTA', 'TTCCGG']

        consensus_string = consensus(get_count(sample))

        self.assertEqual(14, score(sample, consensus_string))
Пример #10
0
    def test_count(self):
        # get_count must return, for each nucleotide, its number of
        # occurrences at every position across the five motifs below.
        sample = ['AACGTA', 'CCCGTT', 'CACCTT', 'GGATTA', 'TTCCGG']
        want = {
            'A': [1, 2, 1, 0, 0, 2],
            'C': [2, 1, 4, 2, 0, 0],
            'G': [1, 1, 0, 2, 1, 1],
            'T': [1, 1, 0, 1, 4, 2],
        }

        self.assertEqual(want, get_count(sample))
    def test_profile(self):
        # get_profile must turn the count matrix of the five motifs below
        # into the per-position fractions listed in `expected`.
        motifs = [
            'AACGTA',
            'CCCGTT',
            'CACCTT',
            'GGATTA',
            'TTCCGG',
        ]
        expected = {
            'A': [0.2, 0.4, 0.2, 0.0, 0.0, 0.4],
            'C': [0.4, 0.2, 0.8, 0.4, 0.0, 0.0],
            'G': [0.2, 0.2, 0.0, 0.4, 0.2, 0.2],
            'T': [0.2, 0.2, 0.0, 0.2, 0.8, 0.4]
        }

        profile = get_profile(get_count(motifs))

        # The structure is compared exactly, the probabilities approximately:
        # they are computed floats, so exact equality would make the test
        # depend on how the division is carried out.
        self.assertEqual(set(expected), set(profile))
        for nucleotide, expected_row in expected.items():
            row = profile[nucleotide]
            self.assertEqual(len(expected_row), len(row))
            for position, (want, got) in enumerate(zip(expected_row, row)):
                self.assertAlmostEqual(
                    want, got,
                    msg='%s at position %d' % (nucleotide, position))
Пример #12
0
    def test_greedy_motif_search_with_pseudocounts_long(self):
        # Greedy search with pseudocounts on ten 250-nucleotide strings with
        # k = 15 must return motifs whose score is 35. Only the score is
        # checked; the motifs themselves are printed.
        sequences = [
            'GCGCCCCGCCCGGACAGCCATGCGCTAACCCTGGCTTCGATGGCGCCGGCTCAGTTAGGGCCGGAAGTCCCCAATGTGGCAGACCTTTCGCCCCTGGCGGACGAATGACCCCAGTGGCCGGGACTTCAGGCCCTATCGGAGGGCTCCGGCGCGGTGGTCGGATTTGTCTGTGGAGGTTACACCCCAATCGCAAGGATGCATTATGACCAGCGAGCTGAGCCTGGTCGCCACTGGAAAGGGGAGCAACATC',
            'CCGATCGGCATCACTATCGGTCCTGCGGCCGCCCATAGCGCTATATCCGGCTGGTGAAATCAATTGACAACCTTCGACTTTGAGGTGGCCTACGGCGAGGACAAGCCAGGCAAGCCAGCTGCCTCAACGCGCGCCAGTACGGGTCCATCGACCCGCGGCCCACGGGTCAAACGACCCTAGTGTTCGCTACGACGTGGTCGTACCTTCGGCAGCAGATCAGCAATAGCACCCCGACTCGAGGAGGATCCCG',
            'ACCGTCGATGTGCCCGGTCGCGCCGCGTCCACCTCGGTCATCGACCCCACGATGAGGACGCCATCGGCCGCGACCAAGCCCCGTGAAACTCTGACGGCGTGCTGGCCGGGCTGCGGCACCTGATCACCTTAGGGCACTTGGGCCACCACAACGGGCCGCCGGTCTCGACAGTGGCCACCACCACACAGGTGACTTCCGGCGGGACGTAAGTCCCTAACGCGTCGTTCCGCACGCGGTTAGCTTTGCTGCC',
            'GGGTCAGGTATATTTATCGCACACTTGGGCACATGACACACAAGCGCCAGAATCCCGGACCGAACCGAGCACCGTGGGTGGGCAGCCTCCATACAGCGATGACCTGATCGATCATCGGCCAGGGCGCCGGGCTTCCAACCGTGGCCGTCTCAGTACCCAGCCTCATTGACCCTTCGACGCATCCACTGCGCGTAAGTCGGCTCAACCCTTTCAAACCGCTGGATTACCGACCGCAGAAAGGGGGCAGGAC',
            'GTAGGTCAAACCGGGTGTACATACCCGCTCAATCGCCCAGCACTTCGGGCAGATCACCGGGTTTCCCCGGTATCACCAATACTGCCACCAAACACAGCAGGCGGGAAGGGGCGAAAGTCCCTTATCCGACAATAAAACTTCGCTTGTTCGACGCCCGGTTCACCCGATATGCACGGCGCCCAGCCATTCGTGACCGACGTCCCCAGCCCCAAGGCCGAACGACCCTAGGAGCCACGAGCAATTCACAGCG',
            'CCGCTGGCGACGCTGTTCGCCGGCAGCGTGCGTGACGACTTCGAGCTGCCCGACTACACCTGGTGACCACCGCCGACGGGCACCTCTCCGCCAGGTAGGCACGGTTTGTCGCCGGCAATGTGACCTTTGGGCGCGGTCTTGAGGACCTTCGGCCCCACCCACGAGGCCGCCGCCGGCCGATCGTATGACGTGCAATGTACGCCATAGGGTGCGTGTTACGGCGATTACCTGAAGGCGGCGGTGGTCCGGA',
            'GGCCAACTGCACCGCGCTCTTGATGACATCGGTGGTCACCATGGTGTCCGGCATGATCAACCTCCGCTGTTCGATATCACCCCGATCTTTCTGAACGGCGGTTGGCAGACAACAGGGTCAATGGTCCCCAAGTGGATCACCGACGGGCGCGGACAAATGGCCCGCGCTTCGGGGACTTCTGTCCCTAGCCCTGGCCACGATGGGCTGGTCGGATCAAAGGCATCCGTTTCCATCGATTAGGAGGCATCAA',
            'GTACATGTCCAGAGCGAGCCTCAGCTTCTGCGCAGCGACGGAAACTGCCACACTCAAAGCCTACTGGGCGCACGTGTGGCAACGAGTCGATCCACACGAAATGCCGCCGTTGGGCCGCGGACTAGCCGAATTTTCCGGGTGGTGACACAGCCCACATTTGGCATGGGACTTTCGGCCCTGTCCGCGTCCGTGTCGGCCAGACAAGCTTTGGGCATTGGCCACAATCGGGCCACAATCGAAAGCCGAGCAG',
            'GGCAGCTGTCGGCAACTGTAAGCCATTTCTGGGACTTTGCTGTGAAAAGCTGGGCGATGGTTGTGGACCTGGACGAGCCACCCGTGCGATAGGTGAGATTCATTCTCGCCCTGACGGGTTGCGTCTGTCATCGGTCGATAAGGACTAACGGCCCTCAGGTGGGGACCAACGCCCCTGGGAGATAGCGGTCCCCGCCAGTAACGTACCGCTGAACCGACGGGATGTATCCGCCCCAGCGAAGGAGACGGCG',
            'TCAGCACCATGACCGCCTGGCCACCAATCGCCCGTAACAAGCGGGACGTCCGCGACGACGCGTGCGCTAGCGCCGTGGCGGTGACAACGACCAGATATGGTCCGAGCACGCGGGCGAACCTCGTGTTCTGGCCTCGGCCAGTTGTGTAGAGCTCATCGCTGTCATCGAGCGATATCCGACCACTGATCCAAGTCGGGGGCTCTGGGGACCGAAGTCCCCGGGCTCGGAGCTATCGGACCTCACGATCACC',
        ]
        kmer_size = 15

        found = greedy_motif_search_with_pseudocounts(
            sequences, kmer_size, len(sequences))
        print(found)

        found_score = score(found, consensus(get_count(found)))
        self.assertEqual(35, found_score)
Пример #13
0
    def test_greedy_motif_search_0(self):
        # Greedy search without pseudocounts on the small five-string sample
        # with k = 3 must return the motifs below, whose score is 2.
        sequences = [
            'GGCGTTCAGGCA',
            'AAGAATCAGTCA',
            'CAAGGAGTTCGC',
            'CACGTCAATCAC',
            'CAATAATATTCG',
        ]
        wanted = ['CAG', 'CAG', 'CAA', 'CAA', 'CAA']

        found = greedy_motif_search(sequences, 3, len(sequences))
        self.assertEqual(wanted, found)

        # The consensus is taken from the expected motifs; once the
        # assertion above has passed they are equal to the result.
        wanted_consensus = consensus(get_count(wanted))
        self.assertEqual(2, score(found, wanted_consensus))
Пример #14
0
    def test_greedy_motif_search_with_pseudocounts_0(self):
        # Greedy search with pseudocounts on the small five-string sample
        # with k = 3 must return the motifs below, whose score is 2.
        sequences = [
            'GGCGTTCAGGCA',
            'AAGAATCAGTCA',
            'CAAGGAGTTCGC',
            'CACGTCAATCAC',
            'CAATAATATTCG',
        ]
        wanted = ["TTC", "ATC", "TTC", "ATC", "TTC"]

        found = greedy_motif_search_with_pseudocounts(
            sequences, 3, len(sequences))
        self.assertEqual(wanted, found)

        found_score = score(found, consensus(get_count(found)))
        self.assertEqual(2, found_score)