Exemplo n.º 1
0
def randomized_motif_search(dna, k, t):
    """Run one randomized motif search starting from random k-mers.

    One k-mer is cut at a random offset from each of the first ``t`` strings
    of ``dna``. The motif set is then repeatedly replaced by the result of
    ``motifs_from_profile`` applied to a profile built from the current
    motifs, for as long as each replacement scores strictly lower.

    Args:
        dna: Indexable collection of strings. Random offsets are bounded
            using the length of ``dna[0]`` only, so every string is
            presumably expected to have that length -- TODO confirm.
        k: Length of each motif.
        t: Number of strings (from the start of ``dna``) to draw motifs from.

    Returns:
        A two-item list ``[best_score, best_motifs]``.
    """
    last_start = len(dna[0]) - k
    starts = [randint(0, last_start) for _ in range(t)]
    # Read from the ``dna`` argument itself; the previous code sliced a
    # different name (``dna_list``) that is not a parameter of this function.
    motifs = [dna[i][start:start + k] for i, start in enumerate(starts)]

    best_score = [score(motifs), motifs]

    while True:
        profile = profile_with_pseudocounts(motifs)
        motifs = motifs_from_profile(profile, dna, k)
        current_score = score(motifs)
        if current_score < best_score[0]:
            best_score = [current_score, motifs]
        else:
            # First non-improving round ends the search.
            return best_score
Exemplo n.º 2
0
def randomizedMotifSearch(dna, k, t):
    """Run one randomized motif search and return ``(score, motifs)``.

    Starts from one randomly positioned k-mer per string (for the first
    ``t`` strings of ``dna``), then keeps replacing the motif set with
    ``selectMotifs(formProfile(current), dna, k)`` while the replacement
    scores strictly lower than the current best.

    Args:
        dna: Indexable collection of strings; offsets are bounded by the
            length of ``dna[0]``.
        k: Motif length.
        t: Number of strings to draw an initial motif from.

    Returns:
        Tuple ``(score(best), best)`` for the best motif set found.
    """
    last_start = len(dna[0]) - k

    def random_kmer(row):
        # Draw the offset first, then slice, one string at a time.
        start = random.randint(0, last_start)
        return dna[row][start:start + k]

    best = [random_kmer(row) for row in xrange(t)]

    # The profile is always built from the best set so far, because the
    # search stops at the first candidate that fails to improve on it.
    while True:
        candidate = selectMotifs(formProfile(best), dna, k)
        if not score(candidate) < score(best):
            return score(best), best
        best = candidate
Exemplo n.º 3
0
def gibbsSampler(dna, k, t, N):
    """Run ``N`` single-row update steps and return the best motif set seen.

    Starts from one randomly positioned k-mer per string. On each step a row
    ``r`` is chosen uniformly at random, a profile is obtained from
    ``formProfileGibbs(motifs, r)``, and the motif for row ``r`` is replaced
    by ``detectMostProbableKMer(dna[r], k, profile)``.

    Args:
        dna: Indexable collection of strings; offsets are bounded by the
            length of ``dna[0]``.
        k: Motif length.
        t: Number of strings / motifs.
        N: Number of update steps.

    Returns:
        The motif list with the lowest ``score`` encountered (the initial
        random set if no step scored strictly lower).
    """
    max_start = len(dna[0]) - k
    current = []
    for row in xrange(t):
        start = random.randint(0, max_start)
        current.append(dna[row][start:start + k])

    best = current
    for _ in xrange(N):
        row = random.randint(0, t - 1)
        profile = formProfileGibbs(current, row)
        replacement = detectMostProbableKMer(dna[row], k, profile)
        # Work on a copy: ``best`` may still reference the previous list.
        updated = list(current)
        updated[row] = replacement
        current = updated
        if score(current) < score(best):
            best = current
    return best
Exemplo n.º 4
0
def gibbs_sampler(dna, k, t, N):
    """Run ``N`` single-row update steps and return the best result seen.

    Starts from one randomly positioned k-mer per string. On each step a row
    ``r`` is chosen uniformly at random, a pseudocount profile is built from
    all motifs except row ``r``, and row ``r`` is replaced by
    ``profile_most_probable_kmer(dna[r], k, profile)``.

    Args:
        dna: Indexable collection of strings. Random offsets are bounded
            using the length of ``dna[0]`` only, so every string is
            presumably expected to have that length -- TODO confirm.
        k: Motif length.
        t: Number of strings / motifs.
        N: Number of update steps.

    Returns:
        A two-item list ``[best_score, best_motifs]``.
    """
    last_start = len(dna[0]) - k
    starts = [randint(0, last_start) for _ in range(t)]
    # Slice the ``dna`` argument; the previous code read the initial motifs
    # from a different name (``dna_list``) while the loop below used ``dna``.
    motifs = [dna[i][start:start + k] for i, start in enumerate(starts)]
    best_score = [score(motifs), motifs]

    for _ in range(N):
        r = randint(0, t - 1)
        # Profile is built from every motif except the one being replaced.
        others = motifs[:r] + motifs[r + 1:]
        profile = profile_with_pseudocounts(others)
        # NOTE(review): this takes the helper named "most probable" k-mer;
        # textbook Gibbs sampling draws the k-mer at random, weighted by the
        # profile -- confirm this is intended.
        replacement = profile_most_probable_kmer(dna[r], k, profile)
        # Build a new list so the one stored in ``best_score`` is not mutated.
        motifs = motifs[:r] + [replacement] + motifs[r + 1:]
        current_score = score(motifs)
        if current_score < best_score[0]:
            best_score = [current_score, motifs]

    return best_score
Exemplo n.º 5
0
# k = 15
# t = 20
# N = 2000
#
# best_motifs = [k * t, None]
# for repeat in xrange(20):
#     current_motifs = gibbsSampler(dna, k, t, N)
#     if score(current_motifs) < best_motifs[0]:
#         best_motifs = [score(current_motifs), current_motifs]
#
# output = best_motifs[1]
# example = ['ACGTCCACCGGCGTC', 'AAGCGCACCGGGGTG', 'ACCCTTACCGGGGTG', 'AAGTTCCTCGGGGTG', 'AAGTTTTATGGGGTG', 'AAGTTTACCGGGTGC', 'AAGTTTCGAGGGGTG', 'CTGTTTACCGGGGTA', 'AAGTTGCTCGGGGTG', 'AAACATACCGGGGTG', 'AAGTTTAGGAGGGTG', 'AAGGAAACCGGGGTG', 'AAGTTTACACAGGTG', 'TAGTTTACCGGGGAT', 'CCTTTTACCGGGGTG', 'AAGTGAGCCGGGGTG', 'AAGTCGTCCGGGGTG', 'AAGTTTACCGGACAG', 'AAGTTTACCAATGTG', 'AAGTTTACCGTCATG']
#
# for item in output:
#     if item not in example:
#         print 'ATATA'

dna = ['AGATAAGCGAAAGTCGCCCGTTTGGGAATAGAATTCGGGATAAAAAAGGCCGTAGCTCACTAGAGCCGTTATGCGGACAGGTGATGTGAGAGTCCTGGCACATTCCGCGACTATCGATTTCTGGGGACATCAAGTCGCTCAACGAGACTCGGGAAATCGTTCGCGGTCGCGCGTCGGTTACCGCATATGGTTGGCCTATGTCATGTCGGTTAAGTTCTGTACTTACGAACTCCGTTGAAACCGCTGTGCGTGTAAAACCAGCACTTGGAGCGCCACTTTCGAAGGGAGGTACGAGCAAGTCAAAACTAGCTGTCGGCAGATAAGCGAAAGTC', 'GATGGTGGAGTGCTCGCCCGTTTGGGAATAGAATTCGGGATAAAAAAGGCCGTAGCTCACTAGAGCCGTTATGCGGACAGGTGATGTGAGAGTCCTGGCACATTCCGCGACTATCGATTTCTGGGGACATCAAGTCGCTCAACGAGACTCGGGAAATCGTTCGCGGTCGCGCGTCGGTTACCGCATATGGTTGGCCTATGTCATGTCGGTTAAGTTCTGTACTTACGAACTCCGTTGAAACCGCTGTGCGTGTAAAACCAGCACTTGGAGCGCCACTTTCGAAGGGAGGTACGAGCAAGTCAAAACTAGCTGTCGGCAGATAAGCGAAAGTC', 'CGCGGGGGCTCTGAGTGTCTAAATGCCGAATACCGGTACAAGACACCAAGATTGCGGAAACTGACCTACTTGAACCCTGCCAGGTGCTAGCCAAGATAAGGCTAGGGTCTTAAGTCCAGAAATAGCCTTGCGCCAGCCAAACACGGAGCTGCTTCAGGTTGTGTGGTCGCTGTCTCGCAGTACGAGTGGCACGACAAGCCCCAGCCTCCCAGATTCGCAAGAGCTGATCCATGATGTAGGAAGACCTAGCTCTCAGATAGGGAGGCTTTCTCGTGGCGTGGGATCATCCGTTGTGTTCCCGTAACCAAGAGATACGGGAGCAATCGATTGGT', 'TTGAGTTGTGTGACAGGCGAACGGATATTAGAGTGCTCATGAATTCAATCGACCGGACGGCCGTTAAAATGTGGATAGAAATTTGTCTTTCCATGGATCCGCCAATGTACATCGGCACTAAGGTTAAATCTTCCCGTCACGATGGAGAATACAATCGGACTGCTTTACGCCCAGAACTCGGTAATCCCCGAGGATAGTCCCATCTAGATCGCTACTATGATACGGTCGCACTGACTCGTCAGCACCATGGGATTTAATAAACACAACTGTAGCAGAAGTGAAACCCTGCTAAAGTGCAGTCCTCTACGTTGAATAATTTTTTAAAGCAGCCC', 'CGTGACCTGGGTGACAGTCGACCCAGTGAGTAGGACTCAAAAGTGCCGCTCTTACATATATGCCCCTCAGGTCAGGATCTCAGCCCTAGGCTAGATGATTTCTCTGCTCGCGGTATGGGTCTAAAACTACACCATCTAGGGAACCGGCTGATAGGCGTAAAGTGGGTTTAACTGTTTCCGTGAGCTATCTTAAATAGTCAGTAGACTAGGTTTGTGTATGCCCCGTAACTGATCGTCAGAAGGGTGCCTAAAGTAACGTTTACAGGTCGTACGGGAGTGCTACCACGCTCTTCCGTGTTCGTTAGCCTGGTTCGGACTACCCAGAACCCACA', 
'ATCGAGGATACGGGGCAGCTCTGCCGCAAAACAAAGTGATAGGGACGAAATGGCAGTCCCAACTAATAACATAAACACTTTCGTCATACTACGACTCTGCAAGACAGATTATTAGAATGTCCACGCGACTAATCGGCCCCTTCACGAAAATACCAGAAGCCGTAAAGGTCCACGCGACACCCAAACAGGGGAAGGAGGGTCTGTTTGAACGGAAGAACGAGCTTTTATAAAGGAGCCCCAAACATGAATTGTCATGTAGGCAGGTGTGATTTTTAGCATCCCTTACAAGAGTATAGCCTTAATCAGCTGTAAGCATAACGCCTGGGACTGCA', 'CAACAGTTAGATCCGGATAACTCAGTTCGACCACTGGAATACGCATTGCATCAAATGCCCAAGGAGAGCACGTCTGTTCTGCACGGAGTTATTGGTCATCTATTGCTGCCAAAGTTGCGATGGCGGCTAAGACTGGGTGCATACTGGTGACCGGAGAGCGTCGGACTGGTTATCTATTGCGGACGGGGGTGCGCAATCCTACACGTTTAATAGAGTTCAGGTCACGAGATACGGGAGTAAACCAACTTGGCTATTTTATGAATGTTTCAGACTGAGTATTTCGTCTGTCTTTGGTCTAAAGACTGGGGAGCTGAACGCCACAGAACGCCTAG', 'GTGTGCTATTGAACGAGCGTTTCTGGGAGCGCACTATAGGGCACAGTCCCGTCATATTACCGTGGATTCTTTTCCGTTTTCGTTACCTTATGATATGGTGAGGGCACCCAGAATTGTGCGTGAGAATATTTGCCCGGGGCACATACGCCGGCCGTTCATCCCTCAGATGGCTGGCCGTGTGATACGGGACGCCTCATACCACTAGGGCACTAGCGTTGTCCGGCCACTGCCCACTTTGAGAAGGAAACTTCAGCTGTCAAAAAATGAGTTGCCTCGCTTCACCAGACGCTATATCTACTTTGTATGATACGGTCATCGGTCATGCGCACTTG', 'TTACTCACTTCCGCCTCCATTCTAGTGAGTTCTAAGGACCTGGAGGGGATTCGGTCGTCGTACAGAAAACTAACTACTTTCACTACATTTGTGCAAAGCCACGCTCCCCAATCATTTTGTCCATTTCCCGATACGGGAGTGGCTTACACGCTCTGTGAGAGATTATGGCTAGGGTGAAGGTGATCGTAGCCTGTGCAGACACTCCCACTATCGGAATGTAACCCCGCAAGTCTCTTCTTCCGCGCCACGAGATGTCTAGCCGCCTCATGCGCGTGGATCCTGTACTGCGTCTCCCCCTACTCTGGGTCCGGCCAGGTTCCCTTTTAAGTCCC', 'TTCCCCTGGGGGGTAGGCGGTTGTGGTACGATATGGGGAATGGCTTGCCCAATCCACTTACGCGGAACGTTACCAGAAACCCACTAACCAAATCGTACTGTTCTATTAGGCGATAGTAGTTCTTCCGTCAAGGTTACATAGTCTTAGCCTCTAAATGTATACGCGCGGAGGTTGCTGTTCACCTAGCCCATCAATTGTTACGTGGTCCCATATGTATTATGCGGTTGCCGCGGCCTGCGTAGGCTTTTAGGCGTCATTGAAGCACCGAGAGATAGACGAGTGCTCATACCACTAACCGTTATACTCGGGCTCACCAAGGGGCGGATATACTT', 
'AGGGTGAGCTAACTTACCGCCGGGGGACAGTGACCTCATTTAGGTCCACAAGTGTTATACTCGCCCACTGAGCACGCAACTCTATTCCTGTCCTGGCGCTCTCTCTCGCACCCGATGATTAGCCCTAAGACTGCACCGCGCCGCTAACTCTAAATTACACCCCTCCTCGTGGAATTAGAGGTGCGGATTTAATGGATCACAGGCTCTCGACGTGACCCCCTGGAACTGTTACGCGAGTTAGATACACTAGTGCTCTGTGCAGATAAGTTACCCCAAAGGCTTGCTCGGAGTTAGAATGACGCACTGGTGTCCCGATATAGGCGTTCGGGAAA', 'CTATCCGTTTAACTGGCGCACAGACTATCAATCAGCCACCAAATGTGACGGTCTGACTTTTCTCAGGCCGGTAAACCCGCGCTTTGTTCTGAGGCATAAGCTATGCTACGGTGCTTGATACGTTGGTGGGACGCACATCCGCGTGACGTGGGAGTGCTCCGTACGGATGACGCCCCTCCCGATTAAGCGCTGTAATTGAAGGGTTTGTGGTTCGCGCCCAACAACGGCGCGATATTAGGGGACCCGAAGCTTGATCTCCCACGCACGAAGGTGCAAAGAGCAGAAAAGACGCCCCAATAACGCATTAACCAGTAGTGTTGGATGCCCTCACG', 'CTCTGCACGGCGGATTTTGTGATTTTCCAGGCGCGTGGCTGTTTCTAGCCCCGGGAGTGCTCAGAAAATACTATATGATTGCGCAGCAACCGTTATATAAAGAGTCTAACAAGGGCCATTAACGGTGCAGTGATCTATAGGGGGTGGCGCCACCAAAAGAATCAAGAAGCTTTGTAGGTGTTCAGCCGGTGGTCTAGAATGCCCAGACCTTCTCACCATCGGTTCGATCTTCCACAAGCATTACTGGCGAGGGAGGATGTAGGAGTGCACGACGACTACCTAACTTCGTAGGTCTAGAAAAGATTCCCCTATTAGCGGTAGCAAGTGCAACG', 'GGGCATAAATATAATTTAGAGTGACGAGCCGGAGGGTTCTCTTTCACGTGCTTCCGAAAGTCGATGGGTTTGAGTCAATCACCCGCCTGTTAGTATTTAATGAAGTACCTCGGAGAGCTTGTCTGAGCTAAAACGTTGTTGAGCTATAACCTAGGGTGGAAAGGCACTAATACACGTCCCAGTTTGAAGGTTCTTGACTCCTCGTTGGTCGAGCCCATAAGGCCAACATCGCATTCGCGCGACTGAAGTAAAGTCGCTCAGTGATGAGGCTCGCCAAACTCCACGGGAGTGCTCTGACACAGTCACGTCACTAATCCCTTCCTGAGTGCCAT', 'TTACTCTTGTTAGACCCGAGGGAGTAGGTGCATAACCCACGTTACGAAGGGCTCGGACGACATGATGAAATACTGTACTGTGCGTTCATTTCCCCCATTCTATGTAGATACGCACGTGCTCAACCATCCAGTTGGTGCCTCCAGAGATCCGTGTGAACGCACCAGTGTATAGCGCCGCAGAGCGTTCGTTTGCGCTGCTCTTCCTTCGTCTATTCGAAGCTCCAAGAGGAAGGGTCCGTTCGCCCCCTTCTATGTGTCTCGCGGCGTGTTGAGTATTGAAATTCACGGCTCTCTACAGGCGCGTTTCGGACACAGCTATCTTGCCTAGCCTT', 
'AAAACCGCGAAGTACTCCGGAAAAGCGATTCGCAAAGGCCCATTTCAGGACCCAGCGTCCAAAGGCGTTGACCCCAACGGACAGGACAGCTTTAGAGTGGTTCCCCGCCGTTAAGGTGCGTTCAGCGGCGTTAGTATTTAAAGAACGAGTTTCCATTTAATAAGTCATAGAGTGTCACTTGTGGTTCACTTTGGTTAGGTCAACCCATGCATCATTCAGTGGGATCTTTCTTCTAATTCATGCCTATGATATACTGAACTGGTCACTGAACTTCGCGACTCGGCCGACTTCCCAAGTATTTTTTTCATACGGGAGTGCGGTACCGAGCGGGT', 'AGACGTATTCGCGCAAGTGGCTTCCCGCGAAAATCGGCTCGTGGTACCCTCAGAGGACCTACCTCCGACTGTTACGGACATCCTGTCACCAGGGCAATTCTACAACGCTGTTGAAAGGTCTGTTATATGATTGTCCCGTTACGCCTAACCTCGAACCGGCCGGAAAGCGGATCTACGACCCGCTGTCTAGTATGTGGTGTGTAGACATACGCCGGGAAGGCATAAATGGAGAGTTACCCAATTTGGCACCTGTAGTAAGGTACGGGGATAATGATACTTGCCCTCACCGAATGGAATGTAAGCCGGGATCACGGAGTGCTCCGAGTGGCCCA', 'CCAAAACGATACTCGTATGACAACCCCCGTGTACCACTCGCCTATGGAAGGACTAGTTCTGCAGACAAGATATCGGATACGAATGTGCTCGTTACTACTAACGTTAAATTACTAAGCGACGATAGTGCGTTGCAAAGTGAGCGTTTGAATGTCACCATACTAAATAGACCTCAGTACCTGCATCGATGCCTACGCACTATTATGTCCAGCATTCGGTTTTAGGACAAATCATAATAACCTAGTCGGCAGAACAAGCCATGAACGAATGCTTGTGTTGACCGCTGTCGTGGCGATTCGCGTAGAATGTGTGCCCGACTTTCGGCGTTTCAGTG', 'GTATATCCAATATACGGGCGACATTCATCTACTAGCGTACATAACGTTTGTGCGAGCACAATGAAGCGCCATAGTCTAGATCTCGAATTTAATGAGCCCAACTCGCGTCTTACATGTTCTCTGGATCGAGCCACAACCCGATACGGAGATGCTCATTCGTCTCTAAGCTACTTCCGTCGACCAGGCCGGAGAGTTAGGTAAAAGCATACAATTGAACAACTAGGCAGAGTGGTCCGCGTCCACATGATATACAGCTGCGTGTACTGCTCCTAGTATATATCCGTGTTCCTGCCTGGTTTCACCAGATGCCTGGAAGAGCTCCGTCAGCGCGT', 'TCGAGTTGTTAAGTCTTCAAAGAGCTCGCGCAATTAATTTCCATTATGGCGTAGAGTCGACAGCTACCGCTCGAAAAATCGTGGGGGTTCGCTTGAAGGAAAGGTCAGGTTCTCACGACCTGTAAGCGAGTCTGCACGTTGTGGCGAGCGGTACACGGTATTTTTAACAAAGAATACAGGAGGTGTCCCACCAGTGTAGGGACCACTGGTGCAGATCAAAATAAGGATTAAGTTGCAGCGTACCGATACAACAGTGCTCAGATTCTCAATATTAGGTCGGCTAGCCGGACAGTCAATAGCACCGAGCACTCATCATATAGTTCGCACATAAG']
# Search parameters passed to gibbsSampler: motif length, number of
# strings, and update steps per run.
k = 15
t = 20
N = 2000

# Track [lowest score so far, motifs that achieved it] across restarts.
# k * t seeds the comparison; presumably no real score reaches it, so the
# first run always replaces the None placeholder -- confirm against score().
best_motifs = [k * t, None]
for repeat in range(100):
    current_motifs = gibbsSampler(dna, k, t, N)
    # Score each run once and reuse the value for both the test and the store.
    current_score = score(current_motifs)
    if current_score < best_motifs[0]:
        best_motifs = [current_score, current_motifs]

# Single-argument parenthesized form prints the same on Python 2 and also
# parses on Python 3.
print(best_motifs[1])