def frequent_words_with_mismatches_v2(text, k, hamming_dist):
    """(Failed) attempt to improve the efficiency of v1.

    For every candidate k-mer, count how many length-k windows of ``text``
    are within ``hamming_dist`` of it (as judged by
    ``patterncount.hamming_distance``), then report the k-mers with the
    highest count.

    Args:
        text: The sequence to scan; sliced into overlapping windows of
            length ``k``.
        k: Length of the words to look for.
        hamming_dist: Largest distance at which a window still counts as an
            occurrence of a candidate k-mer.

    Returns:
        A tuple ``(frequent_patterns, max_count)``: the set of k-mers whose
        count equals the maximum, and that maximum. If no window matches any
        candidate (for instance when ``text`` is shorter than ``k``),
        ``max_count`` is 0 and every index of the frequency array is decoded
        into the result set.

    NOTE(review): the frequency array has ``4 ** k`` slots, so this presumably
    relies on ``pattern_to_number`` mapping each k-mer to an index in
    ``range(4 ** k)`` -- confirm against that helper.
    """
    # One counter per possible k-mer code.
    frequency_array = [0] * (4 ** k)

    # Every (overlapping) window of length k in the text; empty when the text
    # is shorter than k.
    patterns_in_text = [text[start:start + k]
                        for start in range(len(text) - k + 1)]

    # Candidates are iterated once per window, so all_possible_kmers(k) must
    # return something re-iterable (the original comment calls it a list).
    possible_kmers = all_possible_kmers(k)

    for pattern in patterns_in_text:
        for kmer in possible_kmers:
            if patterncount.hamming_distance(pattern, kmer) <= hamming_dist:
                # Encode once instead of once for the read and once for the
                # write.
                code = pattern_to_number(kmer)
                frequency_array[code] += 1

    max_count = max(frequency_array)
    # Collect the winners in a single pass rather than rebuilding the set
    # with union() for each hit.
    frequent_patterns = set(
        number_to_pattern(code, k)
        for code, count in enumerate(frequency_array)
        if count == max_count
    )
    return (frequent_patterns, max_count)
def frequent_words_with_mismatches_v1(text, k, hamming_dist):
    """Find the most frequent word(s) of length k within a text, allowing up
    to hamming_dist mismatches.

    N.B. that the most frequent word need not actually occur in the text --
    for example, input ("ATATA", 3, 1) returns 10 distinct patterns that are
    each counted twice, despite the fact that there are only two distinct
    3-mers that actually occur in the text, and one of those patterns that
    actually does occur in the text ('TAT') is not one of the most frequent
    3-mers. (Example assumes patterncount.hamming_distance is the usual
    position-wise mismatch count.)

    Runs very inefficiently; the original author recommends v3 instead.

    Args:
        text: The sequence to scan; sliced into overlapping windows of
            length ``k``.
        k: Length of the words to look for.
        hamming_dist: Largest distance at which a window still counts as an
            occurrence of a candidate k-mer.

    Returns:
        A tuple ``(frequent_patterns, max_count)``: the set of k-mers whose
        count equals the maximum, and that maximum. If ``text`` has no window
        of length ``k``, ``max_count`` is 0 and every index of the frequency
        array is decoded into the result set.

    Side effects:
        Prints the list of windows found in ``text`` to stdout (kept from the
        original implementation).

    NOTE(review): the frequency array has ``4 ** k`` slots, so this presumably
    relies on ``pattern_to_number`` mapping each k-mer to an index in
    ``range(4 ** k)`` -- confirm against that helper.
    """
    # Enumerate all possible k-mers.
    possible_kmers = set(all_possible_kmers(k))

    # Enumerate the k-mers that actually occur in the text (with repeats, one
    # entry per window position).
    patterns_in_text = [text[start:start + k]
                        for start in range(len(text) - k + 1)]
    # Function-call form prints the same thing on Python 2 and Python 3; the
    # bare print statement is a syntax error on Python 3.
    print(patterns_in_text)

    # Pare the candidates down to those within hamming_dist of at least one
    # window; any() stops at the first window that is close enough.
    relevant_kmers = set(
        kmer for kmer in possible_kmers
        if any(patterncount.hamming_distance(pattern, kmer) <= hamming_dist
               for pattern in patterns_in_text)
    )
    # Always keep the patterns that actually occur in the text.
    relevant_kmers.update(patterns_in_text)

    # One counter per possible k-mer code.
    frequency_array = [0] * (4 ** k)

    # Each window contributes one count to every relevant k-mer near it.
    for pattern in patterns_in_text:
        neighbours = set(
            kmer for kmer in relevant_kmers
            if patterncount.hamming_distance(kmer, pattern) <= hamming_dist
        )
        # The window itself is counted even if the distance test excluded it
        # (e.g. a negative hamming_dist).
        neighbours.add(pattern)
        for code in [pattern_to_number(kmer) for kmer in neighbours]:
            frequency_array[code] += 1

    # Pick out the most frequent patterns in a single pass.
    max_count = max(frequency_array)
    frequent_patterns = set(
        number_to_pattern(code, k)
        for code, count in enumerate(frequency_array)
        if count == max_count
    )
    return (frequent_patterns, max_count)