def k_mer_generator(k):
    """Return a set of all 4**k k-mers.

    Every index in ``range(4 ** k)`` is decoded with
    ``ntp.number_to_pattern(index, k)`` and the results are collected
    into a set.

    Args:
        k: Length of the k-mers to generate.

    Returns:
        set: The decoded pattern for each index from 0 through 4**k - 1.
    """
    # The upper bound is 4**k, not 4**k - 1: range() already excludes its
    # stop value, so subtracting one would drop the final k-mer.
    return {ntp.number_to_pattern(index, k) for index in range(4 ** k)}
def freq_seq_mismatch_sort(genome, k, d):
    """Return the patterns that occur most often across all window neighborhoods.

    For every length-``k`` window of ``genome`` the neighborhood
    ``neighbors.recursive_neighbors(window, d)`` is collected. The
    neighborhoods are flattened with ``apc.iter_flatten``, each pattern is
    encoded with ``ptn.pattern_to_number``, and the encoded values that
    appear most often are decoded back with ``ntp.number_to_pattern``.

    Args:
        genome: Sequence to scan; sliced as ``genome[start:start + k]``.
        k: Window (pattern) length.
        d: Passed through to ``neighbors.recursive_neighbors``
            (presumably the maximum number of mismatches -- confirm
            against that helper).

    Returns:
        set: The decoded patterns sharing the highest occurrence count, or
        an empty set when ``genome`` yields no neighborhood patterns
        (for example when it is shorter than ``k``).
    """
    from collections import Counter

    # "+ 1" so the final window, genome[len(genome) - k:], is included.
    neighborhoods = [
        neighbors.recursive_neighbors(genome[start:start + k], d)
        for start in range(len(genome) - k + 1)
    ]

    # Count on the encoded form and decode at the end, so the returned
    # patterns still come from ntp.number_to_pattern as callers expect.
    index_counts = Counter(
        ptn.pattern_to_number(pattern)
        for pattern in apc.iter_flatten(neighborhoods)
    )
    if not index_counts:
        return set()

    max_count = max(index_counts.values())
    return {
        ntp.number_to_pattern(index, k)
        for index, occurrences in index_counts.items()
        if occurrences == max_count
    }
def all_kmer_generator(KMER_LENGTH):
    """Generate a set of all possible k-mers of length ``KMER_LENGTH``.

    Every index in ``range(4 ** KMER_LENGTH)`` is decoded with
    ``ntp.number_to_pattern(index, KMER_LENGTH)``.

    Args:
        KMER_LENGTH: Length of the k-mers to generate.

    Returns:
        set: The decoded pattern for each index from 0 through
        4**KMER_LENGTH - 1.
    """
    # range() excludes its stop value, so the bound must be
    # 4 ** KMER_LENGTH itself; "- 1" would skip the last k-mer.
    return {
        ntp.number_to_pattern(index, KMER_LENGTH)
        for index in range(4 ** KMER_LENGTH)
    }
def freq_words_w_mismatches(text, k, d):
    """Return the candidate k-mers with the highest approximate count in ``text``.

    Candidates are the patterns found in
    ``neighbors.recursive_neighbors(window, d)`` for every length-``k``
    window of ``text``. Each distinct candidate is scored with
    ``apc.approx_pattern_count(text, candidate, d)`` and the candidates
    sharing the maximum score are returned.

    Args:
        text: Sequence to scan; sliced as ``text[start:start + k]``.
        k: Window (pattern) length.
        d: Passed through to ``neighbors.recursive_neighbors`` and
            ``apc.approx_pattern_count`` (presumably the maximum number of
            mismatches -- confirm against those helpers).

    Returns:
        set: Patterns, decoded via ``ntp.number_to_pattern``, that reach
        the maximum approximate count. Empty when ``text`` yields no
        candidates (for example when it is shorter than ``k``).
    """
    # "+ 1" so the final window, text[len(text) - k:], is included.
    neighborhood = [
        neighbors.recursive_neighbors(text[start:start + k], d)
        for start in range(len(text) - k + 1)
    ]

    # De-duplicate candidates on their encoded form. Holding only the
    # candidates avoids allocating two 4**k-long arrays and makes sure the
    # highest index (4**k - 1) is considered like every other one.
    candidate_indices = {
        ptn.pattern_to_number(pattern)
        for pattern in apc.iter_flatten(neighborhood)
    }
    if not candidate_indices:
        # Without this guard the maximum would be taken over nothing.
        return set()

    counts = {}
    for index in candidate_indices:
        candidate = ntp.number_to_pattern(index, k)
        counts[candidate] = apc.approx_pattern_count(text, candidate, d)

    max_count = max(counts.values())
    return {
        candidate
        for candidate, occurrences in counts.items()
        if occurrences == max_count
    }
def median_string(dna_list, k):
    """Return the k-mers with the minimum distance to ``dna_list``.

    Every index in ``range(4 ** k)`` is decoded with
    ``ntp.number_to_pattern`` and scored with
    ``distance_pattern_string(k_mer, dna_list)``. All k-mers tied for the
    smallest score are returned.

    Args:
        dna_list: Collection of sequences handed to
            ``distance_pattern_string``; only its length is used directly
            here.
        k: Length of the candidate k-mers.

    Returns:
        list: The minimising k-mers in ascending index order. K-mers whose
        distance exceeds the starting bound ``k * (len(dna_list) - 1)``
        are never returned, so the list is empty if no k-mer scores at or
        below that bound.
    """
    best_patterns = []
    # Starting bound. NOTE(review): presumably safe because a k-mer taken
    # from one of the strings is at distance 0 from it and at most k from
    # each remaining string -- this assumes distance_pattern_string sums
    # per-string minimum Hamming distances; confirm against that helper.
    best_distance = k * (len(dna_list) - 1)

    # range() excludes its stop value, so the bound is 4**k itself;
    # "4**k - 1" would leave the last k-mer unevaluated.
    for index in range(4 ** k):
        k_mer = ntp.number_to_pattern(index, k)
        current_distance = distance_pattern_string(k_mer, dna_list)
        if current_distance < best_distance:
            # Strictly better: discard earlier ties and start a new list.
            best_distance = current_distance
            best_patterns = [k_mer]
        elif current_distance == best_distance:
            best_patterns.append(k_mer)
    return best_patterns