def test_compute_alignment_matrix(self):
    """
    Verify project4.compute_alignment_matrix against hand-computed tables.

    Two sequences ('a' and 'cc') are each aligned with 'cab' using a
    scoring matrix built with diag=10, off-diag=5, dash=-1, once with
    global_flag=True and once with global_flag=False.
    """
    # Each entry: (first sequence, expected global table, expected local table)
    cases = [
        ('a',
         [[0, -1, -2, -3], [-1, 5, 9, 8]],
         [[0, 0, 0, 0], [0, 5, 10, 9]]),
        ('cc',
         [[0, -1, -2, -3], [-1, 10, 9, 8], [-2, 9, 15, 14]],
         [[0, 0, 0, 0], [0, 10, 9, 8], [0, 10, 15, 14]]),
    ]
    for seq_x, expected_global, expected_local in cases:
        # A fresh alphabet set and scoring matrix are built for every case.
        scoring = project4.build_scoring_matrix(set('abc'), 10, 5, -1)
        actual_global = project4.compute_alignment_matrix(
            seq_x, 'cab', scoring, global_flag=True)
        actual_local = project4.compute_alignment_matrix(
            seq_x, 'cab', scoring, global_flag=False)
        self.assertEqual(actual_global, expected_global)
        self.assertEqual(actual_local, expected_local)
def test_build_scoring_matrix(self):
    """
    Verify project4.build_scoring_matrix for an alphabet given with and
    without the dash character.

    In both cases the resulting keys must be '-', 'a', 'b', 'c', and row
    'a' must hold 10 for a match, 4 for a mismatch and -1 against a dash.
    """
    alphabets = (set('abc'), set('abc-'))
    for letters in alphabets:
        scores = project4.build_scoring_matrix(letters, 10, 4, -1)
        self.assertEqual(sorted(scores.keys()), ['-', 'a', 'b', 'c'])
        self.assertEqual(scores['a']['a'], 10)
        self.assertEqual(scores['a']['b'], 4)
        self.assertEqual(scores['a']['-'], -1)
def edit_dist(xs, ys):
    """
    Return len(xs) + len(ys) - score, where score is the global alignment
    score of xs and ys under a scoring matrix built with diag=2,
    off-diag=1 and dash=0.
    """
    # NOTE(review): ascii_lowercase is not defined in this block; presumably
    # it is string.ascii_lowercase imported at module level -- confirm.
    letters = ascii_lowercase
    score_table = build_scoring_matrix(letters, 2, 1, 0)
    # The trailing True requests a global (not local) alignment matrix.
    dp_table = compute_alignment_matrix(xs, ys, score_table, True)
    best_score, _, _ = compute_global_alignment(xs, ys, score_table, dp_table)
    total_length = len(xs) + len(ys)
    return total_length - best_score
def find_scoring_matrix(x, y, med, dim):
    """
    Find the scoring parameters that satisfy the definition of minimum
    edit distance:

        med = |x| + |y| - score(x, y)

    Inputs:
        x, y: english strings
        med: minimum edit distance between x, y
        dim: range of values to test for diag_score, off_score, dash_score
             (dash scores are tried as 0, -1, ..., -(dim - 1), i.e. <= 0)

    Returns an array with one row (diag_score, off_score, dash_score) per
    parameter combination whose global alignment score equals
    |x| + |y| - med.
    """
    alphabet = set('abcdefghijklmnopqrstuvwxyz')

    # Example: med('kitten', 'sitting') = 3, so the target score is 10.
    target_score = len(x) + len(y) - med

    solutions = np.zeros((dim, dim, dim))
    sweep = ((diag, off, dash)
             for diag in range(dim)
             for off in range(dim)
             for dash in range(dim))
    for diag, off, dash in sweep:
        scoring = seq.build_scoring_matrix(alphabet, diag, off, -dash)
        # No global flag is passed here, so seq's default applies.
        table = seq.compute_alignment_matrix(x, y, scoring)
        alignment = seq.compute_global_alignment(x, y, scoring, table)
        solutions[diag, off, dash] = alignment[0]

    matches = solutions == target_score
    parameters = np.transpose(np.nonzero(matches))
    # The third axis indexed the magnitude of the dash penalty; flip the sign
    # so the returned column holds the actual (non-positive) dash score.
    parameters[:, 2] *= -1
    return parameters
def edit_dist(xs, ys):
    '''
    Helper function for Question 8.

    Returns len(xs) + len(ys) minus the global alignment score of xs and
    ys, scored with diag=2, off-diag=1, dash=0.
    '''
    letters = 'abcdefghijklmnopqrstuvwxyz'
    score_table = project4.build_scoring_matrix(letters, 2, 1, 0)
    dp_table = project4.compute_alignment_matrix(xs, ys, score_table, True)
    # Only the score is needed; the two aligned strings are discarded.
    best_score, _aligned_xs, _aligned_ys = project4.compute_global_alignment(
        xs, ys, score_table, dp_table)
    combined_length = len(xs) + len(ys)
    return combined_length - best_score
def check_spelling(checked_word, dist, word_list):
    """
    Return the set of words in word_list whose edit distance from
    checked_word is at most dist.

    Edit distance is computed as |x| + |y| - score(x, y), where score is
    the global alignment score with diag_score = 2, off_diag_score = 1
    and dash_score = 0.
    """
    letters = set("abcdefghijklmnopqrstuvwxyz")
    scores = project4.build_scoring_matrix(letters, 2, 1, 0)
    checked_len = len(checked_word)

    def distance_to(candidate):
        """Edit distance between checked_word and candidate."""
        table = project4.compute_alignment_matrix(
            checked_word, candidate, scores, True)
        best, _, _ = project4.compute_global_alignment(
            checked_word, candidate, scores, table)
        return checked_len + len(candidate) - best

    return set(word for word in word_list if distance_to(word) <= dist)
def check_spelling(checked_word, dist, word_list):
    """
    Function for Question 8.

    Return the words of word_list whose edit distance from checked_word is
    at most dist. Edit distance is len(x) + len(y) - score(x, y), where
    score is the global alignment score with diag=2, off-diag=1, dash=0.

    Parameters:
        checked_word: the word being checked
        dist: maximum edit distance allowed
        word_list: iterable of candidate words

    Returns:
        A list of matching words, in word_list order. If checked_word is
        itself in word_list, checked_word (a string) is returned instead.

    Side effect: prints the number of candidates that reached the
    alignment step.
    """
    # NOTE(review): this early exit returns a bare string rather than a
    # list and skips other words within dist; kept unchanged for existing
    # callers -- confirm whether [checked_word] was intended.
    if checked_word in word_list:
        return checked_word

    alphabet = set('abcdefghijklmnopqrstuvwxyz')
    score_matrix = project4.build_scoring_matrix(alphabet, 2, 1, 0)

    checked_len = len(checked_word)
    checked_word_chars = set(checked_word)

    words = []
    num_checks = 0
    for each_word in word_list:
        # Cheap filter 1: each unit of length difference costs one edit.
        if abs(len(each_word) - checked_len) > dist:
            continue
        # Cheap filter 2: every distinct letter of each_word that is absent
        # from checked_word costs at least one edit. The bound must be dist
        # itself; the previous hard-coded 2 rejected valid words when
        # dist >= 3.
        num_diffs = len(set(each_word) - checked_word_chars)
        if num_diffs > dist:
            continue

        align_matrix = project4.compute_alignment_matrix(
            checked_word, each_word, score_matrix, True)
        result = project4.compute_global_alignment(
            checked_word, each_word, score_matrix, align_matrix)
        if checked_len + len(each_word) - result[0] <= dist:
            words.append(each_word)
        num_checks += 1

    # Diagnostic: how many words needed a full alignment.
    print(num_checks)
    return words
def check_spelling(checked_word, dist, word_list):
    """
    Return the set of words from word_list that are within edit distance
    dist of checked_word.

    Edit distance is len(x) + len(y) - score(x, y), where score is the
    global alignment score with diag=2, off-diag=1, dash=0.

    Parameters:
        checked_word: the word being checked
        dist: maximum edit distance allowed
        word_list: iterable of candidate words

    Returns:
        A set of the words whose edit distance is <= dist.
    """
    alphabet = set('abcdefghijklmnopqrstuvwxyz')
    # The scoring matrix does not depend on the candidate word, so build it
    # once instead of once per word.
    smtrx = seq.build_scoring_matrix(alphabet, 2, 1, 0)
    checked_len = len(checked_word)

    candidates = set()
    for word in word_list:
        # No global flag is passed here, so seq's default applies.
        amtrx = seq.compute_alignment_matrix(checked_word, word, smtrx)
        score = seq.compute_global_alignment(
            checked_word, word, smtrx, amtrx)[0]
        if checked_len + len(word) - score <= dist:
            candidates.add(word)
    return candidates
def question7():
    """
    Question 7.

    Aligns 'abcde' with 'xycdefg' globally (diag=2, off-diag=1, dash=0) and
    prints both strings, the alignment result, and
    len(x) + len(y) - score.
    """
    first, second = 'abcde', 'xycdefg'

    letters = set('abcdefghijklmnopqrstuvwxyz')
    scores = project4.build_scoring_matrix(letters, 2, 1, 0)
    table = project4.compute_alignment_matrix(first, second, scores, True)
    outcome = project4.compute_global_alignment(first, second, scores, table)

    # Single-argument parenthesised prints produce the same output as the
    # bare print statement.
    print(first)
    print(second)
    print(outcome)
    print(len(first) + len(second) - outcome[0])
def calculate_edit_distance(xseq, yseq):
    '''
    Return the edit distance of xseq and yseq, computed as
    len(xseq) + len(yseq) minus their global alignment score under
    diag=2, off-diag=1, dash=0.

    http://en.wikipedia.org/wiki/Edit_distance
    '''
    # The alphabet passed to build_scoring_matrix includes the dash itself.
    letters = set('abcdefghijklmnopqrstuvwxyz-')
    scores = project4.build_scoring_matrix(letters, 2, 1, 0)
    table = project4.compute_alignment_matrix(xseq, yseq, scores, True)
    alignment = project4.compute_global_alignment(xseq, yseq, scores, table)
    return len(xseq) + len(yseq) - alignment[0]
def check_spelling(checked_word, dist, word_list):
    """
    Function for Question 8.

    Return the words of word_list whose edit distance from checked_word is
    at most dist. Edit distance is len(x) + len(y) - score(x, y), where
    score is the global alignment score with diag=2, off-diag=1, dash=0.

    Parameters:
        checked_word: the word being checked
        dist: maximum edit distance allowed
        word_list: iterable of candidate words

    Returns:
        A list of matching words, in word_list order. If checked_word is
        itself in word_list, checked_word (a string) is returned instead.

    Side effect: prints the number of candidates that reached the
    alignment step.
    """
    # NOTE(review): this early exit returns a bare string rather than a
    # list and skips other words within dist; kept unchanged for existing
    # callers -- confirm whether [checked_word] was intended.
    if checked_word in word_list:
        return checked_word

    alphabet = set('abcdefghijklmnopqrstuvwxyz')
    score_matrix = project4.build_scoring_matrix(alphabet, 2, 1, 0)

    target_len = len(checked_word)
    checked_word_chars = set(checked_word)

    words = []
    num_checks = 0
    for each_word in word_list:
        # Words whose length differs by more than dist cannot qualify.
        length_ok = abs(len(each_word) - target_len) <= dist
        # Each distinct letter missing from checked_word needs at least one
        # edit, so the cut-off is dist; the former fixed limit of 2 dropped
        # valid words whenever dist >= 3.
        letters_ok = len(set(each_word) - checked_word_chars) <= dist
        if not (length_ok and letters_ok):
            continue

        align_matrix = project4.compute_alignment_matrix(
            checked_word, each_word, score_matrix, True)
        result = project4.compute_global_alignment(
            checked_word, each_word, score_matrix, align_matrix)
        if target_len + len(each_word) - result[0] <= dist:
            words.append(each_word)
        num_checks += 1

    # Diagnostic: how many words needed a full alignment.
    print(num_checks)
    return words
def check_spelling(check_word, dist, word_list):
    """
    Check the spelling of check_word against a dictionary.

    :param check_word: word to check
    :param dist: maximum edit distance
    :param word_list: list of words (dictionary)
    :return: list of the words from word_list whose edit distance from
             check_word is at most 'dist', in word_list order
    """
    letters = list(string.ascii_lowercase)
    scores = student.build_scoring_matrix(letters, 2, 1, 0)
    target_len = len(check_word)

    matches = []
    for candidate in word_list:
        # The candidate is passed as the first sequence, check_word second.
        table = student.compute_alignment_matrix(
            candidate, check_word, scores, True)
        alignment = student.compute_global_alignment(
            candidate, check_word, scores, table)
        if len(candidate) + target_len - alignment[0] > dist:
            continue
        matches.append(candidate)
    return matches
def check_spelling(checked_word, dist, word_list):
    """
    Return the set of all words in word_list that are within edit distance
    dist of the string checked_word.

    Edit distance is len(x) + len(y) minus the global alignment score under
    diag=2, off-diag=1, dash=0.
    """
    scores = project4.build_scoring_matrix(
        'abcdefghijklmnopqrstuvwxyz', 2, 1, 0)
    base_length = len(checked_word)

    def distance_to(candidate):
        """Edit distance between checked_word and candidate."""
        table = project4.compute_alignment_matrix(
            checked_word, candidate, scores, True)
        alignment = project4.compute_global_alignment(
            checked_word, candidate, scores, table)
        return base_length + len(candidate) - alignment[0]

    return {word for word in word_list if distance_to(word) <= dist}
def question7():
    """
    Question 7.

    Runs a global alignment of 'abcde' against 'xycdefg' with diag=2,
    off-diag=1, dash=0, then prints the two inputs, the alignment result
    and the value len(x) + len(y) - score.
    """
    alphabet = set('abcdefghijklmnopqrstuvwxyz')
    score_matrix = project4.build_scoring_matrix(alphabet, 2, 1, 0)

    word_a = 'abcde'
    word_b = 'xycdefg'
    dp_table = project4.compute_alignment_matrix(
        word_a, word_b, score_matrix, True)
    alignment = project4.compute_global_alignment(
        word_a, word_b, score_matrix, dp_table)

    # Parenthesised single-value prints emit the same text as the print
    # statement form.
    print(word_a)
    print(word_b)
    print(alignment)
    print(len(word_a) + len(word_b) - alignment[0])
def check_spelling(checked_word, dist, word_list):
    """
    Iterate through word_list and return all words that are within edit
    distance dist of the string checked_word.

    Parameters
    ----------
    checked_word: str
        the word to be checked
    dist: int
        the maximum edit distance
    word_list: list
        a list of words

    Returns
    -------
    result: list
        the words, in word_list order, whose edit distance from
        checked_word is at most dist.
    """
    score_mat = build_scoring_matrix("abcdefghijklmnopqrstuvwxyz", 2, 1, 0)
    base_len = len(checked_word)

    def distance_to(candidate):
        """Edit distance: combined length minus global alignment score."""
        align_mat = compute_alignment_matrix(
            checked_word, candidate, score_mat, True)
        best = compute_global_alignment(
            checked_word, candidate, score_mat, align_mat)[0]
        return base_len + len(candidate) - best

    return [word for word in word_list if distance_to(word) <= dist]
def edit_distance(seq_x, seq_y):
    """
    Return len(seq_x) + len(seq_y) minus the global alignment score of the
    two sequences, scored with diag=2, off-diag=1, dash=0 over the
    lowercase ASCII letters.
    """
    scores = project4.build_scoring_matrix(string.ascii_lowercase, 2, 1, 0)
    table = project4.compute_alignment_matrix(seq_x, seq_y, scores, True)
    alignment = project4.compute_global_alignment(seq_x, seq_y, scores, table)
    combined_length = len(seq_x) + len(seq_y)
    return combined_length - alignment[0]