def distanceBetweenPatternAndStrings(pattern, DNA):
    """
    Computes the total distance between a pattern and a set of sequences.

    Input: A string pattern and a collection of strings DNA.
    Output: The sum, over every sequence in DNA, of the smallest distance
    (as reported by f1.hammingDistance) between pattern and any substring of
    that sequence with the same length as pattern. A sequence shorter than
    pattern has no such substring and contributes float("inf").
    """
    patternLength = len(pattern)
    total = 0

    for strand in DNA:
        # score the pattern against every window of the strand
        windowScores = [
            f1.hammingDistance(pattern, strand[start:start + patternLength])
            for start in range(len(strand) - patternLength + 1)
        ]
        # with no windows at all, the best score stays at infinity
        closest = min(windowScores) if windowScores else float("inf")
        total += closest

    return total
def motifEnumeration(DNA, k, d):
    """
    Find all k-mer motifs with at most d mismatches given a collection of
    strings.

    Input: A collection of strings DNA, and integers k and d. Both k and d
    are passed through int(), so numeric strings are accepted as well.
    Output: A set of all (k, d)-motifs in DNA, i.e. every candidate produced
    by f1.neighbors(kmer, d) for some k-mer of DNA that is within distance d
    (per f1.hammingDistance) of at least one k-mer in every sequence. The set
    is empty when DNA is empty or when any sequence is shorter than k.
    """
    k = int(k)
    d = int(d)

    # Distinct k-mers of each sequence. A set is enough because only the
    # existence of a close k-mer matters, and it avoids rescanning repeats.
    kmersBySequence = [
        {sequence[start:start + k] for start in range(len(sequence) - k + 1)}
        for sequence in DNA
    ]

    # Pool the neighborhoods of every distinct k-mer directly into a set so
    # duplicate candidates are never stored.
    # NOTE(review): assumes f1.neighbors is a pure function of (kmer, d).
    candidateMotifs = set()
    for kmer in set().union(*kmersBySequence):
        candidateMotifs.update(f1.neighbors(kmer, d))

    # Exhaustive search: keep a candidate only if every sequence has a close
    # k-mer. all()/any() stop at the first failing sequence and the first
    # matching k-mer respectively.
    return {
        candidate
        for candidate in candidateMotifs
        if all(
            any(f1.hammingDistance(candidate, kmer) <= d for kmer in kmers)
            for kmers in kmersBySequence
        )
    }