def medianString(input, k):
    """Print the k-mer with the smallest total distance to a set of DNA strings.

    Every k-mer over the alphabet A/C/G/T is scored by summing, over all DNA
    strings, the smallest ``hammingDistance`` between the k-mer and any
    length-``k`` window of that string.  The best total is printed on one
    line and the k-mer achieving it on the next.  Ties go to the k-mer that
    is enumerated first.

    A DNA string shorter than ``k`` has no windows and contributes an
    infinite distance, so in that case no k-mer is selected and the printed
    values are ``inf`` and an empty line.

    Args:
        input: Whitespace-separated DNA strings.
        k: Length of the k-mers to enumerate.

    Returns:
        None.  The result is written to standard output.
    """
    dnas = input.split()
    # Version-independent replacement for sys.maxint (absent on Python 3).
    unreachable = float('inf')

    # Enumerate all 4**k k-mers by reading each index as a base-4 number,
    # least significant digit first (so the first character varies fastest).
    kmers = []
    for index in range(4 ** k):
        remainder = index
        letters = []
        for _ in range(k):
            # divmod keeps the arithmetic integral; a plain "/" yields floats
            # on Python 3 and would corrupt the digit extraction.
            remainder, digit = divmod(remainder, 4)
            letters.append('ACGT'[digit])
        kmers.append(''.join(letters))

    distance = unreachable
    median = ''
    for kmer in kmers:
        distanceSum = 0
        for dna in dnas:
            closest = unreachable
            for start in range(len(dna) - k + 1):
                # Score each window once instead of calling the helper twice.
                mismatch = hammingDistance(kmer, dna[start:start + k])
                if mismatch < closest:
                    closest = mismatch
            distanceSum += closest
        # Strict comparison: the earliest k-mer wins among equal totals.
        if distanceSum < distance:
            distance = distanceSum
            median = kmer

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(distance)
    print(median)
    return
def frequentWordsWithMismatches(text, k, d):
    """Print the k-mers that occur most often in ``text`` with up to ``d`` mismatches.

    Every k-mer over A/C/G/T is counted once for each length-``k`` window of
    ``text`` whose ``hammingDistance`` to the k-mer is at most ``d``.  All
    k-mers reaching the highest count are printed on one line, separated by
    single spaces, in enumeration order.  When ``text`` has no window of
    length ``k`` every count is zero and every k-mer is printed.

    Args:
        text: The string to scan.
        k: Length of the k-mers.
        d: Maximum number of mismatches for a window to count as an occurrence.

    Returns:
        None.  The result is written to standard output.
    """
    # Enumerate all 4**k k-mers by reading each index as a base-4 number,
    # least significant digit first (so the first character varies fastest).
    kmers = []
    for index in range(4 ** k):
        remainder = index
        letters = []
        for _ in range(k):
            # divmod keeps the arithmetic integral; a plain "/" yields floats
            # on Python 3 and would corrupt the digit extraction.
            remainder, digit = divmod(remainder, 4)
            letters.append('ACGT'[digit])
        kmers.append(''.join(letters))

    # Slice the windows once rather than once per k-mer.
    windows = [text[start:start + k] for start in range(len(text) - k + 1)]

    counts = [
        sum(1 for window in windows if hammingDistance(kmer, window) <= d)
        for kmer in kmers
    ]
    highest = max(counts) if counts else 0

    winners = [kmer for kmer, hits in zip(kmers, counts) if hits == highest]
    # One joined line replaces the Python 2 "print key," idiom and works on
    # both Python 2 and Python 3.
    print(' '.join(winners))
    return
def approxPatternMatching(pattern, text, k):
    """Print the start positions where ``pattern`` approximately occurs in ``text``.

    Spaces are removed from both arguments first.  A position ``i`` is
    reported when the ``hammingDistance`` between ``pattern`` and the window
    of ``text`` starting at ``i`` is at most ``k``.  Positions are printed on
    one line separated by single spaces; nothing is printed when there is no
    match.

    Args:
        pattern: The pattern to look for (spaces are ignored).
        text: The string to scan (spaces are ignored).
        k: Maximum number of mismatches allowed.

    Returns:
        None.  The result is written to standard output.
    """
    pattern = pattern.replace(' ', '')
    text = text.replace(' ', '')
    width = len(pattern)

    positions = [
        str(start)
        for start in range(len(text) - width + 1)
        if hammingDistance(pattern, text[start:start + width]) <= k
    ]
    if positions:
        # One joined line replaces the Python 2 "print i," idiom and works on
        # both Python 2 and Python 3.
        print(' '.join(positions))
    return
def motifEnumeration(input, k, d):
    """Print every k-mer that appears in all DNA strings with at most ``d`` mismatches.

    A k-mer over A/C/G/T is reported when each DNA string contains at least
    one length-``k`` window whose ``hammingDistance`` to the k-mer is at most
    ``d``.  Matching k-mers are printed one per line in enumeration order.
    With no DNA strings at all, every k-mer qualifies.

    Args:
        input: Whitespace-separated DNA strings.
        k: Length of the k-mers to enumerate.
        d: Maximum number of mismatches allowed per occurrence.

    Returns:
        None.  The result is written to standard output.
    """
    dnas = input.split()

    def appears_in(kmer, dna):
        # True as soon as one window of dna is within d mismatches of kmer.
        return any(
            hammingDistance(kmer, dna[start:start + k]) <= d
            for start in range(len(dna) - k + 1)
        )

    # Enumerate all 4**k k-mers by reading each index as a base-4 number,
    # least significant digit first (so the first character varies fastest).
    for index in range(4 ** k):
        remainder = index
        letters = []
        for _ in range(k):
            # divmod keeps the arithmetic integral; a plain "/" yields floats
            # on Python 3 and would corrupt the digit extraction.
            remainder, digit = divmod(remainder, 4)
            letters.append('ACGT'[digit])
        kmer = ''.join(letters)

        # Each k-mer is generated exactly once, so no de-duplication is needed.
        if all(appears_in(kmer, dna) for dna in dnas):
            print(kmer)
    return
def medianString(k, dna):
    """Return the k-mer with the smallest total distance to the given DNA strings.

    For each candidate produced by ``allKmers(k)`` and each entry of ``dna``,
    the smallest ``hammingDistance`` between the candidate and any
    length-``k`` window of the entry is taken (an entry with no such window
    contributes ``k``).  The candidate with the smallest sum over all entries
    is returned; the first such candidate wins ties.

    NOTE(review): an earlier ``medianString(input, k)`` exists in this file;
    if both are module-level definitions in the same module, this later one
    replaces it.

    Args:
        k: k-mer length; converted with ``int()``.
        dna: Iterable of DNA strings (presumably a list; each entry is sliced
            into length-``k`` windows).

    Returns:
        The best k-mer, or ``""`` when ``allKmers(k)`` yields no candidates.
    """
    print("starting medianString")
    k = int(k)

    # dict.fromkeys de-duplicates the candidates, as the original dict did.
    candidates = list(dict.fromkeys(allKmers(k)))
    totals = dict.fromkeys(candidates, 0)

    for entry in dna:
        # k is the largest possible distance between two strings of length k.
        closest = dict.fromkeys(candidates, k)
        # Visit every full window, including the last one; never a truncated one.
        for start in range(len(entry) - k + 1):
            window = entry[start:start + k]
            for kmer in candidates:
                mismatch = int(hammingDistance(window, kmer))
                if mismatch < closest[kmer]:
                    closest[kmer] = mismatch
        for kmer in candidates:
            totals[kmer] += closest[kmer]

    if not candidates:
        return ""
    # min() needs no magic upper bound and returns the first minimal candidate.
    return min(candidates, key=totals.__getitem__)
def approximateMatches(Pattern, Text, d):
    """Count the approximate occurrences of ``Pattern`` in ``Text``.

    Input: Strings Pattern and Text as well as an integer d.
    Output: The number of windows of Text, each as long as Pattern, whose
    hammingDistance to Pattern is at most d.
    """
    width = len(Pattern)
    windows = (
        Text[start:start + width]
        for start in range(len(Text) - width + 1)
    )
    return sum(1 for window in windows if hammingDistance(Pattern, window) <= d)
def Neighbors(Pattern, d):
    """Return the strings within Hamming distance ``d`` of ``Pattern``.

    Input: A string Pattern and an integer d.
    Output: The collection of strings Neighbors(Pattern, d), built over the
    alphabet A/C/G/T.

    The neighbourhood is built recursively from that of the suffix
    ``Pattern[1:]``.  A suffix neighbour that uses fewer than ``d``
    mismatches leaves room for one more, so any nucleotide may be prepended.
    One that already uses all ``d`` mismatches must keep the first symbol of
    ``Pattern``.

    Returns:
        A set for the base cases (``d == 0``, an empty pattern, or a
        single-character pattern) and a list otherwise, as before.
    """
    if d == 0:
        return {Pattern}
    if not Pattern:
        # The empty string is its only neighbour; recursing on its (empty)
        # suffix would never terminate.
        return {Pattern}
    if len(Pattern) == 1:
        return {"A", "C", "G", "T"}

    suffix = Pattern[1:]
    Neighborhood = list()
    for text in Neighbors(suffix, d):
        # Strictly fewer than d: with "<=" the test is always true for suffix
        # neighbours, and strings at distance d + 1 would leak into the result.
        if hammingDistance(suffix, text) < d:
            for x in ("A", "C", "G", "T"):
                Neighborhood.append(x + text)
        else:
            Neighborhood.append(Pattern[0] + text)
    return Neighborhood
def MotifEnumeration(Dna, k, d):
    """Return the k-mers that appear in every DNA string with at most ``d`` mismatches.

    Candidates are the ``Neighbors(pattern, d)`` of every length-``k`` window
    of every DNA string.  A candidate is kept when each DNA string contains
    at least one length-``k`` window whose ``hammingDistance`` to the
    candidate is at most ``d``.

    Args:
        Dna: DNA strings separated by whitespace (typically one per line).
            Blank lines and a trailing newline are ignored.
        k: Length of the motifs.
        d: Maximum number of mismatches allowed per occurrence.

    Returns:
        A set of the matching k-mers.
    """
    # Split once; split() without arguments also drops the empty entries that
    # split("\n") produces for blank lines, which could never match anything.
    strings = Dna.split()

    def occurs_in(candidate, string):
        # True as soon as one window of string is within d mismatches.
        return any(
            hammingDistance(candidate, string[j:j + k]) <= d
            for j in range(len(string) - k + 1)
        )

    patterns = set()
    checked = set()  # candidates already decided, to avoid re-verifying them
    for word in strings:
        for i in range(len(word) - k + 1):
            for neighbor in Neighbors(word[i:i + k], d):
                if neighbor in checked:
                    continue
                checked.add(neighbor)
                if all(occurs_in(neighbor, string) for string in strings):
                    patterns.add(neighbor)
    return patterns