def main(): print("<div style=\"border:1px solid black;\">", end="\n\n") print("`{bm-disable-all}`", end="\n\n") try: dnas = [] while True: try: dna = input().strip().upper() if len(dna) > 0: dnas.append(dna) except EOFError: break hasher = md5() hasher.update(str(dnas).encode('utf-8')) logo_filename = 'motif_logo_' + hasher.hexdigest() + '.svg' if path.isdir('/output'): logo_path = '/output/' + logo_filename else: logo_path = '/tmp/' + logo_filename print(f'Generating logo for the following motif matrix...\n\n') print(f'{"<br>".join(dnas)}\n\n') counts = motif_matrix_count(dnas) profile = motif_matrix_profile(counts) logo = create_logo(profile) plt.savefig(logo_path) print(f'Result...\n\n') print(f'![Motif Logo]({logo_filename})\n\n') finally: print("</div>", end="\n\n") print("`{bm-enable-all}`", end="\n\n")
def gibbs_sampler_motif_search_with_psuedocounts(k: int, dnas: List[str], cycles: int) -> List[str]:
    motif_matrix = []
    for dna in dnas:
        start = randrange(len(dna) - k + 1)
        kmer = dna[start:start + k]
        motif_matrix.append(kmer)
    best_motif_matrix = motif_matrix[:]  # create a copy, otherwise you'll be modifying both motif_matrix and best_motif_matrix
    for j in range(0, cycles):
        i = randrange(len(dnas))  # pick a dna
        del motif_matrix[i]       # remove the kmer for that dna from the motif matrix
        counts = motif_matrix_count(motif_matrix)
        apply_psuedocounts_to_count_matrix(counts)
        profile = motif_matrix_profile(counts)
        new_motif_kmer_probs = determine_probabilities_of_all_kmers_in_dna(profile, dnas[i], k)
        new_motif_kmer_idx = gibbs_rand(new_motif_kmer_probs)
        new_motif_kmer = dnas[i][new_motif_kmer_idx:new_motif_kmer_idx + k]
        motif_matrix.insert(i, new_motif_kmer)
        if score_motif(motif_matrix) < score_motif(best_motif_matrix):
            best_motif_matrix = motif_matrix[:]  # create a copy, otherwise you'll be modifying both motif_matrix and best_motif_matrix
    return best_motif_matrix
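# A quick, hypothetical way to exercise the sampler above -- the DNA strings, k=3, and the 50-cycle count
# are arbitrary illustration values, and the function's helpers (motif_matrix_count, score_motif, etc.) are
# assumed to be in scope. This is a sketch, not part of the original code.
from random import seed

seed(0)  # fix the RNG so the randomly chosen starting motifs are repeatable
dnas = [
    'GGCGTTCAGGCA',
    'AAGAATCAGTCA',
    'CAAGGAGTTCGC',
    'CACGTCAATCAC',
    'CAATAATATTCG'
]
found = gibbs_sampler_motif_search_with_psuedocounts(3, dnas, 50)
print(found)  # one 3-mer per input string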
def main():
    print("<div style=\"border:1px solid black;\">", end="\n\n")
    print("`{bm-disable-all}`", end="\n\n")
    try:
        dnas = []
        while True:
            try:
                dna = input().strip().upper()
                if len(dna) > 0:
                    dnas.append(dna)
            except EOFError:
                break
        kmer = dnas[-1]           # last input line is the k-mer to test
        motif_matrix = dnas[:-1]  # every line before it is a motif matrix member
        counts = motif_matrix_count(motif_matrix)
        profile = motif_matrix_profile(counts)
        prob = determine_probability_of_match_using_profile_matrix(profile, kmer)
        print(f'Motif matrix...\n\n')
        print(f'{"<br>".join(motif_matrix)}\n\n')
        print(f'Probability that {kmer} matches the motif is {prob}...\n\n')
    finally:
        print("</div>", end="\n\n")
        print("`{bm-enable-all}`", end="\n\n")
def greedy_motif_search(k: int, dnas: List[str]):
    best_motif_matrix = [dna[0:k] for dna in dnas]
    for motif, _ in slide_window(dnas[0], k):
        motif_matrix = [motif]
        counts = motif_matrix_count(motif_matrix)
        profile = motif_matrix_profile(counts)
        for dna in dnas[1:]:
            next_motif, _ = find_most_probable_kmer_using_profile_matrix(profile, dna)
            # push in closest kmer as a motif member and recompute profile for the next iteration
            motif_matrix.append(next_motif)
            counts = motif_matrix_count(motif_matrix)
            profile = motif_matrix_profile(counts)
        if score_motif(motif_matrix) < score_motif(best_motif_matrix):
            best_motif_matrix = motif_matrix
    return best_motif_matrix
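# Hypothetical usage of greedy_motif_search above. The sequences and k=3 are made up for illustration, and
# the helpers it relies on (slide_window, motif_matrix_count, motif_matrix_profile,
# find_most_probable_kmer_using_profile_matrix, score_motif) are assumed to be in scope.
dnas = [
    'GGCGTTCAGGCA',
    'AAGAATCAGTCA',
    'CAAGGAGTTCGC',
    'CACGTCAATCAC',
    'CAATAATATTCG'
]
best = greedy_motif_search(3, dnas)
print(best)  # the lowest-scoring motif matrix found, one 3-mer per input string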
def score_motify_entropy(motif_matrix: List[str]) -> float:
    rows = len(motif_matrix)
    cols = len(motif_matrix[0])
    # count up each column
    counts = motif_matrix_count(motif_matrix)
    profile = motif_matrix_profile(counts)
    # prob dist to entropy
    entropy_per_col = []
    for c in range(cols):
        entropy = calculate_entropy([profile['A'][c], profile['C'][c], profile['G'][c], profile['T'][c]])
        entropy_per_col.append(entropy)
    # sum up entropies to get entropy of motif
    return sum(entropy_per_col)
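# For intuition about the per-column entropy being summed above, here is a standalone sketch of the standard
# Shannon entropy calculation (in bits). It mirrors what calculate_entropy is assumed to do, but doesn't
# depend on any of the helpers in this document.
from math import log2

def column_entropy(probabilities):
    # -sum(p * log2(p)), treating 0 * log2(0) as 0
    return -sum(p * log2(p) for p in probabilities if p > 0.0)

print(column_entropy([1.0, 0.0, 0.0, 0.0]))     # 0.0 bits -- column fully conserved
print(column_entropy([0.5, 0.25, 0.25, 0.0]))   # 1.5 bits -- column partially conserved
print(column_entropy([0.25, 0.25, 0.25, 0.25])) # 2.0 bits -- column completely unconserved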
def score_motif_relative_entropy(motif_matrix: List[str], source_strs: List[str]) -> float:
    # calculate frequency of each nucleotide across all source strings
    nuc_counter = Counter()
    nuc_total = 0
    for source_str in source_strs:
        for nuc in source_str:
            nuc_counter[nuc] += 1
        nuc_total += len(source_str)
    nuc_freqs = dict([(k, v / nuc_total) for k, v in nuc_counter.items()])
    rows = len(motif_matrix)
    cols = len(motif_matrix[0])
    # count up each column
    counts = motif_matrix_count(motif_matrix)
    profile = motif_matrix_profile(counts)
    relative_entropy_per_col = []
    for c in range(cols):
        # get entropy of column in motif
        entropy = calculate_entropy([
            profile['A'][c],
            profile['C'][c],
            profile['G'][c],
            profile['T'][c]
        ])
        # get cross entropy of column in motif (mixes in global nucleotide frequencies)
        cross_entropy = calculate_cross_entropy([
            profile['A'][c],
            profile['C'][c],
            profile['G'][c],
            profile['T'][c]
        ], [nuc_freqs['A'], nuc_freqs['C'], nuc_freqs['G'], nuc_freqs['T']])
        relative_entropy = entropy - cross_entropy
        # Right now relative_entropy is calculated by subtracting cross_entropy from (a negated) entropy. But,
        # according to the Pevzner book, the calculation of relative_entropy can be simplified to just...
        #   def calculate_relative_entropy(probabilities_for_nuc: List[float], total_frequencies_for_nucs: List[float]) -> float:
        #       ret = 0.0
        #       for prob, total_freq in zip(probabilities_for_nuc, total_frequencies_for_nucs):
        #           ret += prob * (log(prob / total_freq, 2.0) if prob > 0.0 else 0.0)
        #       return ret
        relative_entropy_per_col.append(relative_entropy)
    # sum up relative entropies to get relative entropy of the motif
    ret = sum(relative_entropy_per_col)
    # All of the other score_motif algorithms try to MINIMIZE score. In the case of relative entropy (this
    # algorithm), the greater the score, the better the match. As such, negate this score so the existing
    # algorithms can still try to minimize.
    return -ret
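# The simplification mentioned in the comment above, written out as a runnable helper. This is a sketch of
# that idea (relative entropy / KL divergence, sum of p * log2(p / q) against background frequencies q), not
# code taken from the book; the example call uses a made-up column distribution and a uniform background.
from math import log2
from typing import List

def calculate_relative_entropy(probabilities_for_nuc: List[float], total_frequencies_for_nucs: List[float]) -> float:
    # sum of p * log2(p / q), treating terms with p == 0 as 0
    ret = 0.0
    for prob, total_freq in zip(probabilities_for_nuc, total_frequencies_for_nucs):
        if prob > 0.0:
            ret += prob * log2(prob / total_freq)
    return ret

print(calculate_relative_entropy([0.7, 0.1, 0.1, 0.1], [0.25, 0.25, 0.25, 0.25]))  # higher = further from background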
def randomized_motif_search_with_psuedocounts(k: int, dnas: List[str]) -> List[str]:
    motif_matrix = []
    for dna in dnas:
        start = randrange(len(dna) - k + 1)
        kmer = dna[start:start + k]
        motif_matrix.append(kmer)
    best_motif_matrix = motif_matrix
    while True:
        counts = motif_matrix_count(motif_matrix)
        apply_psuedocounts_to_count_matrix(counts)
        profile = motif_matrix_profile(counts)
        motif_matrix = [find_most_probable_kmer_using_profile_matrix(profile, dna)[0] for dna in dnas]
        if score_motif(motif_matrix) < score_motif(best_motif_matrix):
            best_motif_matrix = motif_matrix
        else:
            return best_motif_matrix
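# Hypothetical usage of randomized_motif_search_with_psuedocounts above. Because each run starts from random
# k-mers and can settle into a local optimum, a common pattern is to keep the best result over many runs --
# the 20 runs and k=3 here are arbitrary, and score_motif is assumed to be in scope.
from random import seed

seed(0)  # repeatable starting points
dnas = [
    'GGCGTTCAGGCA',
    'AAGAATCAGTCA',
    'CAAGGAGTTCGC',
    'CACGTCAATCAC',
    'CAATAATATTCG'
]
best = min((randomized_motif_search_with_psuedocounts(3, dnas) for _ in range(20)), key=score_motif)
print(best)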
dnas = [
    'GGCGTTCAGGCA',
    'AAGAATCAGTCA',
    'CAAGGAGTTCGC',
    'CACGTCAATCAC',
    'CAATAATATTCG'
]
k = 3
motifs = []
for dna in dnas:
    start = randrange(len(dna) - k + 1)  # random starting index for the initial k-mer pick
    motif = dna[start:start + k]
    motifs.append(motif)
best_motifs = motifs
while True:
    counts_matrix = motif_matrix_count(motifs)
    for elem, counts in counts_matrix.items():  # add in pseudocounts
        counts_matrix[elem] = [c + 1 for c in counts]
    profile_matrix = motif_matrix_profile(counts_matrix)
    motifs = [find_most_probable_kmer_using_profile_matrix(profile_matrix, dna)[0] for dna in dnas]
    if score_motif(motifs) < score_motif(best_motifs):
        best_motifs = motifs
    else:
        break
for m in best_motifs:
    print(f'{m}')
# CGGGTCAAACGACCCTAGTG
# CGGGACGTAAGTCCCTAACG
# CCGGGCTTCCAACCGTGGCC
# CGTGACCGACGTCCCCAGCC
# GAGGACCTTCGGCCCCACCC
# GGGGACTTCTGTCCCTAGCC
# TGGGACTTTCGGCCCTGTCC
# GGGGACCAACGCCCCTGGGA
# GGGGACCGAAGTCCCCGGGC
# 11
# consensus_kmer = 'CGGGACCTACGTCCCTAGCC'  # this is the consensus string for the matrix it finds

best_motif_matrix_counts = motif_matrix_count(best_motif_matrix)
for elem, counts in best_motif_matrix_counts.items():  # add in pseudocounts
    best_motif_matrix_counts[elem] = [c + 1 for c in counts]
best_motif_matrix_profile = motif_matrix_profile(best_motif_matrix_counts)

with open('/home/user/Downloads/GCF_000195955.2_ASM19595v2_genomic.fna', mode='r', encoding='utf-8') as f:
    data = f.read()
    lines = data.split('\n')
    lines = [l.strip() for l in lines]                           # get rid of whitespace
    lines = [l if not l.startswith('>') else '' for l in lines]  # remove comments
    dna = ''.join(lines)                                         # concat into single dna str
    for kmer, _ in slide_window(dna, k):
        prob = determine_probability_of_match_using_profile_matrix(best_motif_matrix_profile, kmer)
        if prob >= 0.01:  # 1% or greater
            print(f'{kmer} {prob}')

# Nothing is found...
#
# The strings in DosR.txt aren't matching up to the genome at the link I posted (even though the name of the organism