def edit_distance(seq_x, seq_y, scoring_matrix):
    """
    Return the edit distance between seq_x and seq_y.

    The value is derived from a global alignment of the two sequences:

        len(seq_x) + len(seq_y) - (score of the global alignment)

    seq_x, seq_y   -- the two sequences to compare
    scoring_matrix -- scoring matrix handed through to the pj4 alignment helpers
    """
    # The trailing True is the flag passed for global alignment in this file
    # (local-alignment callers pass False).
    dp_table = pj4.compute_alignment_matrix(seq_x, seq_y, scoring_matrix, True)

    # Only the score is used; the aligned strings are unpacked so that the
    # three-element shape of the result is still enforced.
    best_score, _aligned_x, _aligned_y = pj4.compute_global_alignment(
        seq_x, seq_y, scoring_matrix, dp_table)

    return len(seq_x) + len(seq_y) - best_score
def ED_xcross_test(diag_score, off_diag_score, dash_score):
    """
    Cross-check edit_distance on three small, hand-countable examples.

    A scoring matrix over the alphabet {a, b, c, d} is built from the given
    scores and edit_distance is evaluated on three fixed string pairs.
    Counting one unit per insertion, deletion or substitution, the pairs are
    4, 3 and 2 edits apart, so a score choice that reproduces the standard
    edit distance should yield (4, 3, 2).

    Returns the tuple (test1_ED, test2_ED, test3_ED).
    """
    alphabet = set("abcd")
    scoring_matrix = pj4.build_scoring_matrix(
        alphabet, diag_score, off_diag_score, dash_score)

    cases = (
        ("ab", "acccc"),    # test I:   1 substitution + 3 insertions -> 4
        ("acccd", "ad"),    # test II:  3 deletions                   -> 3
        ("abcd", "addd"),   # test III: 2 substitutions               -> 2
    )
    # Evaluated in order, one call per case, exactly as listed above.
    return tuple(edit_distance(first, second, scoring_matrix)
                 for first, second in cases)
def check_spelling(checked_word, dist, word_list):
    """
    Return the set of words from word_list that are within edit distance
    dist of checked_word.

    checked_word -- the (lower-case a-z) string to look up
    dist         -- maximum edit distance allowed (inclusive)
    word_list    -- iterable of candidate words; duplicates are ignored

    Side effect: prints the number of candidates that passed the length
    filter and were actually aligned (diagnostic output kept from the
    original implementation).

    Note: an earlier substring-based pre-filter was removed. It demanded that
    every prefix and suffix of checked_word occur inside the candidate, which
    wrongly discarded valid matches such as a substitution of the first
    letter ("humble" -> "fumble").
    """
    # match = 2, mismatch = 1, gap = 0: with the |x| + |y| - score formula
    # (assumed to be what sol4_7.edit_distance implements) these scores give
    # the standard edit distance.
    diag_score = 2
    off_diag_score = 1
    dash_score = 0
    scoring_matrix = pj4.build_scoring_matrix(
        set('abcdefghijklmnopqrstuvwxyz'),
        diag_score, off_diag_score, dash_score)

    target_len = len(checked_word)
    candidate_words = set()
    count = 0
    for word in set(word_list):
        # Cheap, sound rejection: a single edit changes the length by at most
        # one, so a length gap larger than dist can never be bridged.
        if abs(len(word) - target_len) > dist:
            continue
        count += 1
        if sol4_7.edit_distance(checked_word, word, scoring_matrix) <= dist:
            candidate_words.add(word)

    # Single-argument form prints identically on Python 2 and Python 3.
    print(count)
    return candidate_words
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Build an un-normalized null distribution of local alignment scores.

    For each of num_trials trials, seq_y is randomly shuffled, the alignment
    matrix of seq_x against the shuffled sequence is computed (with the
    flag set to False, as used for local alignment in this file), and the
    largest entry of that matrix is recorded as the trial's score.

    Returns a dictionary mapping each observed score to the number of trials
    that produced it. With num_trials <= 0 the dictionary is empty.
    """
    scoring_distribution = {}
    for _ in range(num_trials):
        shuffled = list(seq_y)
        random.shuffle(shuffled)
        rand_y = ''.join(shuffled)

        local_matrix = pj4.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)

        # Only the score is needed here, so the matrix maximum is taken
        # directly instead of running pj4.compute_local_alignment.
        score = max(max(row) for row in local_matrix)

        scoring_distribution[score] = scoring_distribution.get(score, 0) + 1
    return scoring_distribution
"""
Algorithm thinking application 4-1

Date: 2015/07/30
Author: You-Hao
"""

import alg_application4_provided as app4
import AT_project_4 as pj4

# Load the two eyeless protein sequences and the PAM50 scoring matrix from the
# locations defined in the provided helper module.
protein_human = app4.read_protein(app4.HUMAN_EYELESS_URL)
protein_fruitfly = app4.read_protein(app4.FRUITFLY_EYELESS_URL)
scoring_matrix = app4.read_scoring_matrix(app4.PAM50_URL)

# Alignment matrix with the flag set to False, then the local alignment
# recovered from it.
alignment_matrix_4_1 = pj4.compute_alignment_matrix(
    protein_human, protein_fruitfly, scoring_matrix, False)
score_4_1, align_human_4_1, align_fruitfly_4_1 = pj4.compute_local_alignment(
    protein_human, protein_fruitfly, scoring_matrix, alignment_matrix_4_1)

# Report the score followed by the two aligned sequences. The single-argument
# parenthesised form prints the same text on Python 2 and Python 3.
print(score_4_1)
print(align_human_4_1)
print(align_fruitfly_4_1)
# Strip the '-' characters from both sequences, keeping every other
# character in its original order.
seq_human_nodash = ''
seq_fruitfly_nodash = ''
for char in seq_human:
    if char == '-':
        continue
    seq_human_nodash += char
for char in seq_fruitfly:
    if char == '-':
        continue
    seq_fruitfly_nodash += char

print(len(seq_human_nodash))
print(len(seq_fruitfly_nodash))

# --- human vs. PAX: global alignment (flag True) ---
alignment_matrix = pj4.compute_alignment_matrix(
    seq_human_nodash, seq_PAX, scoring_matrix, True)
score_human, align_human, align_PAX_1 = pj4.compute_global_alignment(
    seq_human_nodash, seq_PAX, scoring_matrix, alignment_matrix)

print(score_human)
print(align_human)
print(align_PAX_1)

# Count the positions at which the two aligned strings carry the same
# character (a '-' facing a '-' would also count as agreement).
match_human = 0
for ind, residue in enumerate(align_human):
    if residue == align_PAX_1[ind]:
        match_human += 1

# Agreement as a percentage of the aligned length.
print(float(match_human) / len(align_human) * 100.)

# --- fruit fly vs. PAX: global alignment (flag True) ---
alignment_matrix = pj4.compute_alignment_matrix(
    seq_fruitfly_nodash, seq_PAX, scoring_matrix, True)