def check_spelling(checked_word, dist, word_list):
    '''
    Collect the words in word_list that lie within edit distance dist of
    the string checked_word.

    The edit distance of a pair is taken to be
    len(checked_word) + len(word) - score, where score is the global
    alignment score under a matrix that awards 2 on the diagonal, 1 off
    the diagonal and 0 for a dash.

    Input:
        checked_word - the string every candidate is compared against
        dist - largest edit distance that still counts as a match
        word_list - iterable of candidate words

    Output:
        a list of the matching words, in the order word_list yields them
    '''
    # Scoring matrix over all lower case letters: diagonal 2, off-diagonal 1, dash 0
    letters = set(string.ascii_lowercase)
    matrix = student.build_scoring_matrix(letters, 2, 1, 0)

    def distance_to(candidate):
        ''' Edit distance between checked_word and candidate. '''
        table = student.compute_alignment_matrix(checked_word, candidate,
                                                 matrix, True)
        score, _, _ = student.compute_global_alignment(checked_word, candidate,
                                                       matrix, table)
        return len(checked_word) + len(candidate) - score

    return [candidate for candidate in word_list
            if distance_to(candidate) <= dist]
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    '''
    Build an un-normalized distribution of local alignment scores between
    seq_x and random permutations of seq_y.

    Input:
        seq_x, seq_y - two sequences that share a common alphabet
        scoring_matrix - scoring matrix handed to the alignment functions
        num_trials - number of random permutations of seq_y to score

    Output:
        scoring_distribution - a dict whose keys are scores and whose
        values are the number of trials that produced that score
    '''
    scoring_distribution = dict()

    # A counted loop runs exactly num_trials times without re-summing the
    # counts on every pass
    for _ in range(num_trials):
        # Shuffle a fresh copy of seq_y so the input is never modified
        list_y = list(seq_y)
        random.shuffle(list_y)
        rand_y = ''.join(list_y)

        # Score the local alignment of seq_x against the permutation
        alignment_matrix = student.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        score, _, _ = student.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, alignment_matrix)

        # Count the score, starting from 0 the first time it is seen
        scoring_distribution[score] = scoring_distribution.get(score, 0) + 1

    return scoring_distribution
def question_one():
    """
    Load the PAM50 scoring matrix plus the Human and Fruitfly Eyeless
    Protein sequences, then locally align the two proteins.

    Returns the result of prj4.compute_local_alignment unchanged.
    """
    pam50 = read_scoring_matrix(PAM50_URL)
    human = read_protein(HUMAN_EYELESS_URL)
    fly = read_protein(FRUITFLY_EYELESS_URL)

    # The alignment matrix is built with the flag set to False before the
    # local alignment is traced from it
    dp_table = prj4.compute_alignment_matrix(human, fly, pam50, False)
    return prj4.compute_local_alignment(human, fly, pam50, dp_table)
def answer_Q1():
    '''
    Answers Q1: locally align the human and fruit fly eyeless protein
    sequences under the PAM50 scoring matrix.

    Output:
        whatever student.compute_local_alignment returns for the pair
    '''
    # Fetch both protein sequences, then the scoring matrix
    human = provided.read_protein(HUMAN_EYELESS_URL)
    fly = provided.read_protein(FRUITFLY_EYELESS_URL)
    pam50 = provided.read_scoring_matrix(PAM50_URL)

    # Alignment matrix first (flag False), local alignment traced from it
    dp_table = student.compute_alignment_matrix(human, fly, pam50, False)
    return student.compute_local_alignment(human, fly, pam50, dp_table)
def question_two():
    """
    Compare the human and fruitfly local alignment sequences against the
    consensus PAX domain.

    The dashes are stripped from each local alignment, the stripped
    sequence is globally aligned with the consensus sequence, and the
    share of positions where both aligned strings agree is measured.

    Returns a two element list [human, fly] of fractions (matches divided
    by the alignment length; the values are not multiplied by 100).
    """
    scoring_matrix = read_scoring_matrix(PAM50_URL)
    local_results = question_one()
    pax_seq = read_protein(CONSENSUS_PAX_URL)

    #remove dashes from human and fruit fly sequences
    new_human = local_results[1].replace("-", "")
    print("Old human seq: " + local_results[1])
    print("New human seq: " + new_human)
    new_fly = local_results[2].replace("-", "")
    print("Old fly seq: " + local_results[2])
    print("New fly seq: " + new_fly)

    #globally align each stripped sequence with pax and measure agreement
    print("Computing alignment matrices and global alignments...")
    percentages = []
    for stripped_seq in (new_human, new_fly):
        align_matrix = prj4.compute_alignment_matrix(stripped_seq, pax_seq,
                                                     scoring_matrix, True)
        comparison = prj4.compute_global_alignment(stripped_seq, pax_seq,
                                                   scoring_matrix, align_matrix)
        #items 1 and 2 of the result are the two aligned strings
        matches = sum(1 for seq_char, pax_char
                      in zip(comparison[1], comparison[2])
                      if seq_char == pax_char)
        percentages.append(matches / float(len(comparison[2])))

    #return the two fractions in a list
    return percentages
def answer_Q7():
    '''
    Return the (diag_score, off_diag_score, dash_score) triple (2, 1, 0).

    Before returning, the triple is used to build a scoring matrix over
    A, C, T and G and to globally align 'AA' with 'TAAT'. The edit
    distance derived from that alignment is computed but not returned.
    '''
    diag_score, off_diag_score, dash_score = 2, 1, 0
    first, second = 'AA', 'TAAT'

    matrix = student.build_scoring_matrix(set('ACTG'), diag_score,
                                          off_diag_score, dash_score)
    table = student.compute_alignment_matrix(first, second, matrix, True)
    score, _, _ = student.compute_global_alignment(first, second,
                                                   matrix, table)

    # Kept from the original exercise; the value is only computed here
    edit_distance = len(first) + len(second) - score

    return (diag_score, off_diag_score, dash_score)
def question_two():
    """
    Compare the human and fruitfly local alignment sequences against the
    consensus PAX domain.

    The dashes are stripped from each local alignment, the stripped
    sequence is globally aligned with the consensus sequence, and the
    share of positions where both aligned strings agree is measured.

    Returns a two element list [human, fly] of fractions (matches divided
    by the alignment length; the values are not multiplied by 100).
    """
    scoring_matrix = read_scoring_matrix(PAM50_URL)
    local_results = question_one()
    pax_seq = read_protein(CONSENSUS_PAX_URL)

    #remove dashes from human and fruit fly sequences
    #print is called with one parenthesised argument so the same line
    #works as a Python 2 statement and as a Python 3 function call
    new_human = local_results[1].replace("-", "")
    print("Old human seq: " + local_results[1])
    print("New human seq: " + new_human)
    new_fly = local_results[2].replace("-", "")
    print("Old fly seq: " + local_results[2])
    print("New fly seq: " + new_fly)

    #globally align each stripped sequence with pax and measure agreement
    print("Computing alignment matrices and global alignments...")
    percentages = []
    for stripped_seq in (new_human, new_fly):
        align_matrix = prj4.compute_alignment_matrix(stripped_seq, pax_seq,
                                                     scoring_matrix, True)
        comparison = prj4.compute_global_alignment(stripped_seq, pax_seq,
                                                   scoring_matrix, align_matrix)
        #items 1 and 2 of the result are the two aligned strings
        matches = sum(1 for seq_char, pax_char
                      in zip(comparison[1], comparison[2])
                      if seq_char == pax_char)
        percentages.append(matches / float(len(comparison[2])))

    #return the two fractions in a list
    return percentages
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Generates a distribution of local alignment scores stochastically.

    For each of num_trials trials, seq_y is randomly permuted and locally
    aligned with seq_x; the score of that alignment is tallied.

    Returns a dict mapping each observed score to the number of trials
    that produced it (unnormalized).
    """
    distribution = {}

    #range (not xrange) keeps the loop runnable on Python 2 and Python 3
    for _ in range(num_trials):
        #shuffle a copy so seq_y itself is left untouched
        shuffled = list(seq_y)
        random.shuffle(shuffled)
        rand_y = "".join(shuffled)

        align_matrix = prj4.compute_alignment_matrix(seq_x, rand_y,
                                                     scoring_matrix, False)
        #item 0 of the local alignment result is the score
        score = prj4.compute_local_alignment(seq_x, rand_y, scoring_matrix,
                                             align_matrix)[0]
        distribution[score] = distribution.get(score, 0) + 1

    #return unnormalized distribution of scores
    return distribution
def percent_match(local_alignment):
    '''
    Computes the percent similarity between a local alignment and the
    consensus PAX sequence.

    Input:
        local_alignment - an aligned string that may contain '-' characters

    Output:
        the percentage, rounded to 2 decimal places, of positions in the
        global alignment where the two aligned strings hold the same
        character
    '''
    # Drop the dashes before aligning again
    ungapped = local_alignment.replace('-', '')

    # Load the PAM50 scoring matrix, then the consensus sequence
    pam50 = provided.read_scoring_matrix(PAM50_URL)
    consensus = provided.read_protein(CONSENSUS_PAX_URL)

    # Globally align the dash-free sequence with the consensus sequence
    table = student.compute_alignment_matrix(ungapped, consensus,
                                             pam50, True)
    _, aligned_local, aligned_consensus = student.compute_global_alignment(
        ungapped, consensus, pam50, table)

    # Count the positions at which both aligned strings agree
    agreeing = sum(1 for position, residue in enumerate(aligned_local)
                   if residue == aligned_consensus[position])

    return round(agreeing / float(len(aligned_local)) * 100, 2)
""" Algorithmic Thinking - Module 4 Project Mark Hess Dynamic Programming and Sequence Alignment Computing Alginments of sequences Test File """ import Project_4 TEST1 = True TEST2 = True TEST3 = False TEST4 = False if TEST1: print (Project_4.build_scoring_matrix(set(['A', 'C', 'T', 'G']), 6, 2, -4)) #expected {'A': {'A': 6, 'C': 2, '-': -4, 'T': 2, 'G': 2}, 'C': {'A': 2, 'C': 6, '-': -4, 'T': 2, 'G': 2}, #'-': {'A': -4, 'C': -4, '-': -4, 'T': -4, 'G': -4}, 'T': {'A': 2, 'C': 2, '-': -4, 'T': 6, 'G': 2}, #'G': {'A': 2, 'C': 2, '-': -4, 'T': 2, 'G': 6}} if TEST2: print (Project_4.compute_alignment_matrix('', '', {'A': {'A': 6, 'C': 2, '-': -4, 'T': 2, 'G': 2}, 'C': {'A': 2, 'C': 6, '-': -4, 'T': 2, 'G': 2}, '-': {'A': -4, 'C': -4, '-': -4, 'T': -4, 'G': -4}, 'T': {'A': 2, 'C': 2, '-': -4, 'T': 6, 'G': 2}, 'G': {'A': 2, 'C': 2, '-': -4, 'T': 2, 'G': 6}}, True)) #expected [[0]] but received [] print (Project_4.compute_alignment_matrix('A', 'A', {'A': {'A': 6, 'C': 2, '-': -4, 'T': 2, 'G': 2}, 'C': {'A': 2, 'C': 6, '-': -4, 'T': 2, 'G': 2}, '-': {'A': -4, 'C': -4, '-': -4, 'T': -4, 'G': -4}, 'T': {'A': 2, 'C': 2, '-': -4, 'T': 6, 'G': 2}, 'G': {'A': 2, 'C': 2, '-': -4, 'T': 2, 'G': 6}}, True)) #expected [[0, -4], [-4, 6]] print (Project_4.compute_alignment_matrix('ATG', 'ACG', {'A': {'A': 6, 'C': 2, '-': -4, 'T': 2, 'G': 2}, 'C': {'A': 2, 'C': 6, '-': -4, 'T': 2, 'G': 2}, '-': {'A': -4, 'C': -4, '-': -4, 'T': -4, 'G': -4},
Algorithmic Thinking (Part 2) Application 4: Applications to Genomics and Beyond """ import Project_4 import alg_application4_provided as provided import math import matplotlib.pyplot as plt """ Question 1 """ seq_human = provided.read_protein(provided.HUMAN_EYELESS_URL) seq_fly = provided.read_protein(provided.FRUITFLY_EYELESS_URL) scoring_matrix = provided.read_scoring_matrix(provided.PAM50_URL) local_alignment_mx = Project_4.compute_alignment_matrix( seq_human, seq_fly, scoring_matrix, False) result = Project_4.compute_local_alignment(seq_human, seq_fly, scoring_matrix, local_alignment_mx) print 'Score:' + str(result[0]) print 'Human: ' + result[1] print 'Fly: ' + result[2] """ Question 2 """ ali_human = result[1] ali_fly = result[2] seq_con = provided.read_protein(provided.CONSENSUS_PAX_URL) ali_human = ali_human.replace('-', '') ali_fly = ali_fly.replace('-', '')
Algorithmic Thinking - Module 4 Project Mark Hess Dynamic Programming and Sequence Alignment Computing Alginments of sequences Test File """ import Project_4 TEST1 = False TEST2 = False TEST3 = True TEST4 = True if TEST1: print Project_4.build_scoring_matrix(set(['A', 'C', 'T', 'G']), 6, 2, -4) #expected {'A': {'A': 6, 'C': 2, '-': -4, 'T': 2, 'G': 2}, 'C': {'A': 2, 'C': 6, '-': -4, 'T': 2, 'G': 2}, #'-': {'A': -4, 'C': -4, '-': -4, 'T': -4, 'G': -4}, 'T': {'A': 2, 'C': 2, '-': -4, 'T': 6, 'G': 2}, #'G': {'A': 2, 'C': 2, '-': -4, 'T': 2, 'G': 6}} if TEST2: print Project_4.compute_alignment_matrix('', '', {'A': {'A': 6, 'C': 2, '-': -4, 'T': 2, 'G': 2}, 'C': {'A': 2, 'C': 6, '-': -4, 'T': 2, 'G': 2}, '-': {'A': -4, 'C': -4, '-': -4, 'T': -4, 'G': -4}, 'T': {'A': 2, 'C': 2, '-': -4, 'T': 6, 'G': 2}, 'G': {'A': 2, 'C': 2, '-': -4, 'T': 2, 'G': 6}}, True) #expected [[0]] but received [] print Project_4.compute_alignment_matrix('A', 'A', {'A': {'A': 6, 'C': 2, '-': -4, 'T': 2, 'G': 2}, 'C': {'A': 2, 'C': 6, '-': -4, 'T': 2, 'G': 2}, '-': {'A': -4, 'C': -4, '-': -4, 'T': -4, 'G': -4}, 'T': {'A': 2, 'C': 2, '-': -4, 'T': 6, 'G': 2}, 'G': {'A': 2, 'C': 2, '-': -4, 'T': 2, 'G': 6}}, True) #expected [[0, -4], [-4, 6]] print Project_4.compute_alignment_matrix('ATG', 'ACG', {'A': {'A': 6, 'C': 2, '-': -4, 'T': 2, 'G': 2}, 'C': {'A': 2, 'C': 6, '-': -4, 'T': 2, 'G': 2}, '-': {'A': -4, 'C': -4, '-': -4, 'T': -4, 'G': -4},
""" import Project_4 import alg_application4_provided as provided import math import matplotlib.pyplot as plt """ Question 1 """ seq_human = provided.read_protein(provided.HUMAN_EYELESS_URL) seq_fly = provided.read_protein(provided.FRUITFLY_EYELESS_URL) scoring_matrix = provided.read_scoring_matrix(provided.PAM50_URL) local_alignment_mx = Project_4.compute_alignment_matrix(seq_human, seq_fly, scoring_matrix, False) result = Project_4.compute_local_alignment(seq_human, seq_fly, scoring_matrix, local_alignment_mx) print 'Score:' + str(result[0]) print 'Human: ' + result[1] print 'Fly: ' + result[2] """ Question 2 """ ali_human = result[1] ali_fly = result[2] seq_con = provided.read_protein(provided.CONSENSUS_PAX_URL)