def count_approx_pattern(text, pattern, d):
    """Count the windows of ``text`` that match ``pattern`` approximately.

    Every window of ``len(pattern)`` consecutive characters of ``text`` is
    compared to ``pattern`` with ``hamming_dist`` (defined elsewhere); a
    window counts when that distance is at most ``d``.

    Returns the number of such windows (0 when ``text`` is shorter than
    ``pattern``).
    """
    width = len(pattern)
    last_start = len(text) - width
    return sum(
        1
        for begin in range(last_start + 1)
        if hamming_dist(text[begin:begin + width], pattern) <= d
    )
def distance(pattern, text):
    """Return the smallest ``hamming_dist`` between ``pattern`` and any window.

    The windows are all substrings of ``text`` having the same length as
    ``pattern``. ``hamming_dist`` is defined elsewhere.

    When ``text`` is shorter than ``pattern`` there are no windows and
    positive infinity is returned.
    """
    width = len(pattern)
    window_dists = [
        hamming_dist(text[begin:begin + width], pattern)
        for begin in range(len(text) - width + 1)
    ]
    if not window_dists:
        # No window fits: keep the "infinitely far" sentinel.
        return float("Inf")
    return min(window_dists)
def neighbors(pattern, d):
    """Return the d-neighborhood of ``pattern`` over the alphabet A/C/G/T.

    The neighborhood is built recursively: the neighbors of the suffix
    ``pattern[1:]`` are computed first, then each one is extended with
    either any nucleotide (when the suffix neighbor is still strictly closer
    than ``d`` to the original suffix, so one more mismatch is affordable)
    or only the original first character.

    Args:
        pattern: the string to mutate.
        d: maximum number of mismatches allowed.

    Returns:
        A collection of strings: a one-element set when ``d == 0`` or
        ``pattern`` is empty, the list ``['A', 'C', 'G', 'T']`` for a
        one-character pattern, and a set otherwise.
    """
    # With no mismatches allowed the only neighbor is the pattern itself.
    # It is wrapped in a set so that iterating the result yields the whole
    # pattern rather than its individual characters. The empty-pattern guard
    # keeps the recursion below from running forever on "".
    if d == 0 or not pattern:
        return {pattern}
    if len(pattern) == 1:
        return ['A', 'C', 'G', 'T']

    neighborhood = set()
    tail = pattern[1:]
    for suffix in neighbors(tail, d):
        if hamming_dist(tail, suffix) < d:
            # Budget left over: the first position may be any nucleotide.
            for nuc in ['A', 'C', 'G', 'T']:
                neighborhood.add(nuc + suffix)
        else:
            # Budget exhausted by the suffix: first position must match.
            neighborhood.add(pattern[0] + suffix)
    return neighborhood
def main(k, d, data):
    """Return the candidate k-mers with the highest approximate-match tally.

    ``get_all_kmers(k)`` (defined elsewhere) supplies the candidates and
    their running tallies; it is used here as a mapping from k-mer to count.
    For every window of ``k`` characters in ``data`` a candidate's tally is
    incremented once if ``hamming_dist(candidate, window) <= d`` and once
    more if ``hamming_dist(rev_complement(candidate), window) <= d``.

    Returns a list of every candidate whose tally equals the maximum, in the
    mapping's iteration order.
    """
    tallies = get_all_kmers(k)

    for offset in range(len(data) - k + 1):
        window = data[offset:offset + k]
        for kmer in tallies:
            if hamming_dist(kmer, window) <= d:
                tallies[kmer] += 1
            if hamming_dist(rev_complement(kmer), window) <= d:
                tallies[kmer] += 1

    best = max(tallies.values())
    return [kmer for kmer in tallies if tallies[kmer] == best]
def hamming_dist_approx(pattern, string, d):
    """Return the start indices of all approximate occurrences of ``pattern``.

    A window of ``len(pattern)`` characters of ``string`` is an approximate
    occurrence when ``hamming_dist(pattern, window)`` (helper defined
    elsewhere) is at most ``d``.

    Args:
        pattern: the string to look for.
        string: the text to scan.
        d: maximum distance for a window to count as an occurrence.

    Returns:
        A list of 0-based start positions in increasing order; empty when
        ``string`` is shorter than ``pattern``.
    """
    pat_len = len(pattern)
    # The "+ 1" makes the scan include the final window, the one that ends
    # exactly at the end of ``string``; without it a match at the very end
    # (or a pattern as long as the whole string) is never reported.
    return [
        start
        for start in range(len(string) - pat_len + 1)
        if hamming_dist(pattern, string[start:start + pat_len]) <= d
    ]
def gen_d_neighb(d, fileName, data):
    """Return the d-neighborhood of ``data`` and write it to ``fileName``.

    Candidates come from ``get_all_kmers(len(data))`` (defined elsewhere);
    a candidate is kept when ``hamming_dist(data, candidate) <= d``.

    Args:
        d: maximum distance from ``data`` for a candidate to be kept.
        fileName: path of the output file; it is created or truncated and
            receives one kept candidate per line.
        data: the reference string.

    Returns:
        The list of kept candidates, in the order the candidates were
        supplied.
    """
    answers = [
        candidate
        for candidate in get_all_kmers(len(data))
        if hamming_dist(data, candidate) <= d
    ]
    # The context manager guarantees the handle is flushed and closed even
    # if a write fails, instead of leaving that to garbage collection.
    with open(fileName, 'w+') as out:
        out.writelines(str(item) + "\n" for item in answers)
    return answers
def CalculateScore(Motifs):
    """Score ``Motifs`` against the consensus derived from their profile.

    ``FormProfile(Motifs)`` (defined elsewhere) is indexed here as
    ``profile[nucleotide][column]``. For each column the consensus takes the
    first nucleotide, in A, C, G, T order, holding the largest value that is
    strictly greater than 0. The score is the sum of
    ``hamming_dist(consensus, motif)`` over all motifs.
    """
    width = len(Motifs[0])
    profile = FormProfile(Motifs)

    picks = []
    for col in range(width):
        top = 0
        for base in ('A', 'C', 'G', 'T'):
            value = profile[base][col]
            if value > top:
                top, pick = value, base
        # NOTE: ``pick`` is deliberately not reset per column. If no value
        # in a column exceeds 0, the previous column's pick is reused (and
        # the very first column would raise a NameError).
        picks.append(pick)
    consensus = ''.join(picks)

    return sum(hamming_dist(consensus, motif) for motif in Motifs)
def hamming_dist_approx(pattern, string, d):
    """Return the start indices of all approximate occurrences of ``pattern``.

    A window of ``len(pattern)`` characters of ``string`` is an approximate
    occurrence when ``hamming_dist(pattern, window)`` (helper defined
    elsewhere) is at most ``d``.

    Args:
        pattern: the string to look for.
        string: the text to scan.
        d: maximum distance for a window to count as an occurrence.

    Returns:
        A list of 0-based start positions in increasing order; empty when
        ``string`` is shorter than ``pattern``.
    """
    pat_len = len(pattern)
    # The "+ 1" makes the scan include the final window, the one that ends
    # exactly at the end of ``string``; without it a match at the very end
    # (or a pattern as long as the whole string) is never reported.
    return [
        start
        for start in range(len(string) - pat_len + 1)
        if hamming_dist(pattern, string[start:start + pat_len]) <= d
    ]