def main():
    """Correct a misspelling with the distance function chosen by the user.

    The parsed arguments supply the vocabulary file (``args['vocabulary']``),
    from which a dictionary is built, and the name of the distance
    implementation (``args['distance']``).  Each implementation is imported
    only when it is selected.

    Raises:
        Exception: if ``args['distance']`` is not a recognised name.
    """
    args = get_args()

    # Build the dictionary from the vocabulary text file.
    vocabulary = create_dictionary(args['vocabulary'])

    metric = args['distance']

    if metric == 'levenshtein':
        # Levenshtein distance
        from levenshtein import levenshtein_distance
        # NOTE(review): this branch passes the literal 'lexique.json' rather
        # than the vocabulary built above -- confirm this is intended.
        levenshtein_distance(get_misspelling(), 'lexique.json')
        return

    if metric == 'levenshtein2':
        # Levenshtein distance using another library
        from levenshtein_v2 import levenshtein_distance2
        levenshtein_distance2(get_misspelling(), vocabulary)
        return

    if metric == 'hamming':
        # Hamming distance
        from hamming import hamming_distance
        hamming_distance(get_misspelling(), vocabulary)
        return

    if metric == 'jarowinkler':
        # Jaro-Winkler
        from jarowinkler import jarowinkler_distance
        jarowinkler_distance(get_misspelling(), vocabulary)
        return

    raise Exception("Unknown distance function : {}".format(metric))
def test_repulsion(self):
    """repulsion() leaves both inputs unchanged and does not reduce distance.

    The result must be at least as far (Hamming distance) from the first
    sequence as the second sequence was.
    """
    first = [1, 2, 3, 4, 5, 6, 7, 8]
    second = [1, 5, 2, 8, 7, 4, 3, 6]
    # Independent copies, so in-place mutation by repulsion() is detected.
    first_snapshot = list(first)
    second_snapshot = list(second)

    distance_before = hamming_distance(first, second)
    outcome = repulsion(first, second)

    self.assertEqual(first, first_snapshot)
    self.assertEqual(second, second_snapshot)

    distance_after = hamming_distance(first, outcome)
    self.assertGreaterEqual(distance_after, distance_before)
def motifEnumerate(dna, k, d):
    """Enumerate the (k, d)-motifs shared by every sequence in ``dna``.

    Candidates are the d-neighbours of every k-mer of the first sequence.  A
    candidate is kept when each remaining sequence contains at least one
    k-mer within Hamming distance ``d`` of it.

    Args:
        dna: collection (indexable, sliceable) of nucleotide strings.
        k: motif length.
        d: maximum number of mismatches allowed per occurrence.

    Returns:
        A list of the distinct motifs, in no particular order.  Empty when
        ``dna`` is empty or the first sequence is shorter than ``k``.  With a
        single sequence there is nothing left to check, so every candidate is
        returned.
    """
    if not dna:
        return []

    first = dna[0]
    # Unique k-mers of the first sequence.
    seeds = {first[i:i + k] for i in range(len(first) - k + 1)}

    # Unique d-neighbours of all those k-mers.
    candidates = set()
    for seed in seeds:
        candidates.update(neighbors(seed, d))

    rest = dna[1:]
    motifs = []
    for candidate in candidates:
        # Window positions are computed per sequence so that sequences whose
        # length differs from the first one are scanned completely (and never
        # past their end).  ``all`` stops at the first sequence that lacks an
        # approximate occurrence; ``any`` stops at the first occurrence found.
        if all(
            any(
                hamming_distance(candidate, seq[i:i + k]) <= d
                for i in range(len(seq) - k + 1)
            )
            for seq in rest
        ):
            motifs.append(candidate)
    return motifs
def test_hamming_dist_timing(self):
    """Time three Hamming implementations and check that they agree.

    Every ordered pair drawn from ten generated strings is measured with
    ``naive_hamming_distance``, with ``hamming_distance``, and with
    ``binary_hamming_dist_calc`` on inputs converted up front by
    ``string_to_hamming_binary``.  The elapsed process times are printed
    and the three result lists must be equal.
    """
    samples = [string_generator() for _ in range(10)]
    print(samples)

    # Naive, string-based implementation.
    naive_began = time.process_time_ns()
    naive_distances = [
        naive_hamming_distance(left, right)
        for left in samples
        for right in samples
    ]
    naive_finished = time.process_time_ns()
    print(f'no of mismatches = {naive_distances}')
    print("string hamming: {:,}".format(naive_finished - naive_began))

    # hamming_distance called directly on the strings.
    direct_began = time.process_time_ns()
    direct_distances = [
        hamming_distance(left, right)
        for left in samples
        for right in samples
    ]
    direct_finished = time.process_time_ns()
    print("binary hamming: {:,}".format(direct_finished - direct_began))

    # Conversion happens outside the timed region for the last variant.
    converted = [string_to_hamming_binary(sample) for sample in samples]
    converted_began = time.process_time_ns()
    converted_distances = [
        binary_hamming_dist_calc(left, right)
        for left in converted
        for right in converted
    ]
    converted_finished = time.process_time_ns()
    print("prepro hamming: {:,}".format(converted_finished - converted_began))

    self.assertEqual(naive_distances, direct_distances)
    self.assertEqual(converted_distances, direct_distances)
def test_hamming_dist_all_different(self):
    """Both implementations report 3 when every position of 'CAT'/'GGG' differs."""
    left, right = 'CAT', 'GGG'
    wanted = 3
    # Naive implementation first, then hamming_distance, as separate checks.
    for measure in (naive_hamming_distance, hamming_distance):
        self.assertEqual(wanted, measure(left, right))
def test_hamming_dist_same(self):
    """Both implementations report 0 for two identical strings."""
    left, right = 'CAT', 'CAT'
    wanted = 0
    # Naive implementation first, then hamming_distance, as separate checks.
    for measure in (naive_hamming_distance, hamming_distance):
        self.assertEqual(wanted, measure(left, right))
def test_haystack_generator(self):
    """A string mutated with an argument of 3 is measured at distance 3.

    A 6-character string is produced with ``string_generator`` and passed to
    ``string_mutator`` together with the expected distance; both distance
    implementations must then report that distance.
    """
    length = 6
    wanted = 3
    source = string_generator(length)
    mutated = string_mutator(source, wanted)
    # Naive implementation first, then hamming_distance, as separate checks.
    for measure in (naive_hamming_distance, hamming_distance):
        self.assertEqual(wanted, measure(mutated, source))
def neighbors(pattern, d):
    """Return the d-neighbourhood of ``pattern`` over the alphabet ACGT.

    The neighbourhood is built recursively from the neighbourhood of the
    pattern's suffix: a suffix neighbour that is still strictly closer than
    ``d`` to the suffix may be prefixed with any nucleotide, otherwise it
    must keep the pattern's own first symbol.

    Args:
        pattern: nucleotide string.
        d: maximum Hamming distance.

    Returns:
        A list of strings.  For ``d == 0`` (or an empty pattern) this is the
        one-element list ``[pattern]``.
    """
    # Always hand back a list: returning the bare string made callers that
    # iterate or ``extend`` with the result see individual characters.  The
    # empty pattern is its own only neighbour; without this guard it would
    # recurse without end.
    if d == 0 or not pattern:
        return [pattern]
    if len(pattern) == 1:
        return ['A', 'C', 'G', 'T']

    tail = suffix(pattern)
    head = first_symbol(pattern)

    neighborhood = []  # all d-neighbours of the full pattern
    for candidate in neighbors(tail, d):
        if hamming_distance(tail, candidate) < d:
            # One mismatch is still available for the first position.
            neighborhood.extend(nuc + candidate for nuc in ('A', 'C', 'G', 'T'))
        else:
            # Mismatch budget is spent; the first symbol has to match.
            neighborhood.append(head + candidate)
    return neighborhood
def aprxPattern(text, pattern, d=0):
    """Count the approximate occurrences of ``pattern`` in ``text``.

    A window of ``text`` with the same length as ``pattern`` counts as an
    occurrence when its Hamming distance to ``pattern`` is at most ``d``.

    Args:
        text: nucleotide sequence to scan.
        pattern: pattern to be matched.
        d: maximum Hamming distance allowed; defaults to 0 (exact match).

    Returns:
        The number of matching start positions (not the positions
        themselves).  0 when ``text`` is shorter than ``pattern``.
    """
    k = len(pattern)
    return sum(
        1
        for start in range(len(text) - k + 1)
        if hamming_distance(pattern, text[start:start + k]) <= d
    )
def imagediff(method, file_name1, file_name2):
    """Return a normalised difference between two files.

    Args:
        method: ``"file size"``, ``"levenshtein"`` or ``"hamming"``.
        file_name1: path of the first file.
        file_name2: path of the second file.

    Returns:
        * ``"file size"``: absolute size difference divided by the larger size.
        * ``"levenshtein"``: edit distance of the contents divided by the
          longer content length.
        * ``"hamming"``: Hamming distance of the contents divided by the
          shorter content length.
        In each case 1 is returned when the divisor is zero.

    Exits the process with status -1 (after a message on stderr) when a file
    cannot be accessed or read, or when ``method`` is not recognised.
    """
    def fail(*parts):
        # Space-separated message on stderr, then abort.
        sys.stderr.write(" ".join("%s" % (part,) for part in parts) + "\n")
        sys.exit(-1)

    if method == "file size":
        try:
            size1 = os.path.getsize(file_name1)
        except OSError:
            fail("ERROR: Unable to access ", file_name1)
        try:
            size2 = os.path.getsize(file_name2)
        except OSError:
            # Abort here as well; continuing would only fail later on the
            # undefined size.
            fail("ERROR: Unable to access ", file_name2)
        try:
            return float(abs(size1 - size2)) / max(size1, size2)
        except ZeroDivisionError:
            # Both files empty: same convention as the content-based methods.
            return 1

    contents = []
    for name in (file_name1, file_name2):
        try:
            # Binary mode: the inputs are image files, so no newline
            # translation or text decoding must be applied.  ``with`` closes
            # the handle only if it was actually opened.
            with open(name, "rb") as handle:
                contents.append(handle.read())
        except IOError:
            fail("ERROR: Unable to open ", name)
    string1, string2 = contents

    if method == "levenshtein":
        try:
            return float(levenshtein_distance(string1, string2)) / max(
                len(string1), len(string2))
        except ZeroDivisionError:
            return 1
    elif method == "hamming":
        try:
            return float(hamming_distance(string1, string2)) / min(
                len(string1), len(string2))
        except ZeroDivisionError:
            return 1
    else:
        fail("ERROR: Invalid method.")
def d(pattern, dna):
    """Return the total distance between ``pattern`` and a collection ``dna``.

    For every region in ``dna`` the smallest Hamming distance between
    ``pattern`` and any equally long substring of the region is taken; the
    result is the sum of these per-region minima.

    Args:
        pattern: the k-mer to compare.
        dna: iterable of sequences (regions).

    Returns:
        The summed minimum distances.  A region shorter than ``pattern`` has
        no substring to compare and contributes ``len(pattern) + 1``.
    """
    k = len(pattern)
    total_hd = 0
    for region in dna:
        window_distances = [
            hamming.hamming_distance(pattern, region[i:i + k])
            for i in range(len(region) - k + 1)
        ]
        # k + 1 is the sentinel used when the region offers no window at all.
        total_hd += min(window_distances) if window_distances else k + 1
    return total_hd
def h_dis(pattern, seq):
    """Find the k-mers of ``seq`` that are closest to ``pattern``.

    Args:
        pattern: the k-mer to compare against.
        seq: sequence whose substrings of ``len(pattern)`` are examined.

    Returns:
        A list of ``(kmer, distance)`` tuples, one per distinct k-mer whose
        Hamming distance to ``pattern`` equals the smallest distance found.
        Empty when ``seq`` is shorter than ``pattern``.
    """
    k = len(pattern)
    kmer_distance = dict()  # distinct k-mer -> Hamming distance to pattern
    for start in range(len(seq) - k + 1):
        kmer = seq[start:start + k]
        if kmer not in kmer_distance:
            kmer_distance[kmer] = hamming_distance(pattern, kmer)

    # No window fits: nothing to report (indexing the first entry would fail).
    if not kmer_distance:
        return []

    smallest_hd = min(kmer_distance.values())
    return [
        (kmer, dist)
        for kmer, dist in kmer_distance.items()
        if dist == smallest_hd
    ]
def binary_inc_proccessing_time():
    """Benchmark ``hamming_distance`` on random string pairs and print the best time.

    The setup snippet imports ``hamming_distance``, ``choice`` and
    ``string_generator`` and builds two lists of ten generated strings.  The
    timed snippet picks one string from each list and calls
    ``hamming_distance`` on the pair.  It is run 10000 times per repetition,
    for three repetitions, and the smallest total is printed.
    """
    # Source executed by timeit before each repetition.
    setup_source = '''
from hamming import hamming_distance
from random import choice
from __main__ import string_generator
list_of_strings1 = [string_generator() for i in range(10)]
list_of_strings2 = [string_generator() for i in range(10)]'''

    # Source whose execution is timed.
    timed_source = '''
s1 = choice(list_of_strings1)
s2 = choice(list_of_strings2)
hamming_distance(s1, s2)
'''

    run_totals = timeit.repeat(stmt=timed_source,
                               setup=setup_source,
                               number=10000,
                               repeat=3)

    # Report the minimum execution time over the repetitions.
    fastest = min(run_totals)
    print('Binary hamming string search time (including preprocessing): {}'.
          format(fastest))
def print_possible_key_sizes(encoded_content):
    """Print candidate key sizes ordered by normalised Hamming score.

    For every key size from ``MIN_KEY_LENGTH`` up to (not including)
    ``MAX_KEY_LENGTH`` the content is cut into consecutive pairs of
    key-size blocks.  The Hamming distances of all pairs are summed and
    divided by ``pairs * key_size``.  Key sizes are then printed as
    ``size: score`` lines, lowest score first.

    Args:
        encoded_content: sliceable content; must hold at least
            ``MAX_KEY_LENGTH * 4`` items (checked with ``assert``).
    """
    assert len(encoded_content) >= MAX_KEY_LENGTH * 4

    scores = {}
    for key_size in range(MIN_KEY_LENGTH, MAX_KEY_LENGTH):
        # Average over as many block pairs as the content provides.
        pair_count = int(len(encoded_content) / (2 * key_size))
        total = 0
        for pair in range(pair_count):
            left = pair * 2 * key_size
            middle = left + key_size
            right = middle + key_size
            total += hamming.hamming_distance(encoded_content[left:middle],
                                              encoded_content[middle:right])
        scores[key_size] = total / float(pair_count * key_size)

    # Stable sort on the score keeps insertion order among equal scores.
    for key_size in sorted(scores, key=scores.get):
        print("%d: %.02f" % (key_size, scores[key_size]))
cur_ctext = line print("Key = %s" % hex(cur_key)) print("Score = %f" % largest) print("Ciphertext = %s" % cur_ctext) print("Plaintext = %s" % cur_ptext) #Challenge 5 print("\n\nChallenge 5: Repeating Key XOR Encryption") ptext = "Burning 'em, if you ain't quick and nimble\nI go crazy when I hear a cymbal" key = "ICE" print("Encrypting %s with key %s" % (ptext, key)) print(xor_encrypt(ptext, key)) #Challenge 6 print("\n\nChallenge 6: Break Repeating Key XOR Encrpytion") distance = hamming_distance("this is a test", "wokka wokka!!!") if distance != 37: print("Distance = %d" % distance) raise ValueError ciphertext = base64.b64decode(open("./6.txt", "r").read()) smallest_distance = 1000 cur_keysize = 0 for keysize in range(2, 41): block1 = ciphertext[0:keysize] block2 = ciphertext[keysize:keysize * 2] block3 = ciphertext[keysize * 2:keysize * 3] block4 = ciphertext[keysize * 3:keysize * 4]