def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """Estimate a null distribution of local-alignment scores by shuffling.

    Repeats the following ``num_trials`` times:

    1. Build a random permutation ``rand_y`` of ``seq_y`` with
       ``random.shuffle()``.
    2. Compute the local alignment matrix of ``seq_x`` against ``rand_y``
       and take its largest entry as the score of the trial.
    3. Increment the count stored under that score.

    Args:
        seq_x: Sequence that is kept fixed in every trial.
        seq_y: Sequence whose elements are shuffled in every trial.
        scoring_matrix: Scoring matrix forwarded unchanged to
            ``compute_alignment_matrix``.
        num_trials: Number of shuffles to perform.

    Returns:
        A plain ``dict`` mapping each observed maximum score to the number
        of trials that produced it (un-normalized).

    Side effects:
        Pickles the returned dictionary to ``scoring_distribution.p`` in the
        current working directory, overwriting any existing file.
    """
    scoring_distribution = {}
    for _ in range(num_trials):
        # Shuffle a fresh copy each trial so seq_y itself is never mutated.
        list_y = list(seq_y)
        random.shuffle(list_y)
        rand_y = "".join(list_y)

        alignment_matrix = compute_alignment_matrix(
            seq_x=seq_x,
            seq_y=rand_y,
            scoring_matrix=scoring_matrix,
            global_flag=False)
        # The best local alignment score is the largest cell of the matrix,
        # so the alignment itself does not need to be traced back.
        score = max(max(row) for row in alignment_matrix)
        scoring_distribution[score] = scoring_distribution.get(score, 0) + 1

    # The context manager guarantees the file is closed even when
    # pickle.dump raises.
    with open('scoring_distribution.p', 'wb') as scoring_file:
        pickle.dump(scoring_distribution, scoring_file)
    return scoring_distribution
def consensus_alignment(scoring_matrix, align_sequence):
    """Compare a locally-aligned sequence against the consensus PAX domain.

    The dashes are stripped from ``align_sequence``, the result is globally
    aligned against the sequence read from ``ConsensusPAXDomain.txt``, and
    the two aligned strings are compared with ``difflib.SequenceMatcher``.

    Args:
        scoring_matrix: Scoring matrix forwarded to the alignment helpers.
        align_sequence: One side of a local alignment; may contain ``-``.

    Returns:
        A tuple ``(global_align_sequence, ratio)`` holding the globally
        aligned form of the input sequence and the value of
        ``SequenceMatcher.ratio()`` for the two aligned strings.
    """
    consensus = read_protein('ConsensusPAXDomain.txt')
    ungapped = align_sequence.replace("-", "")

    global_matrix = compute_alignment_matrix(
        seq_x=ungapped,
        seq_y=consensus,
        scoring_matrix=scoring_matrix,
        global_flag=True)
    _score, aligned_input, aligned_consensus = compute_global_alignment(
        seq_x=ungapped,
        seq_y=consensus,
        scoring_matrix=scoring_matrix,
        alignment_matrix=global_matrix)

    similarity = difflib.SequenceMatcher(
        None, aligned_input, aligned_consensus).ratio()
    return aligned_input, similarity
def random_alignment(scoring_matrix, length_human=422, length_fruitfly=857):
    """Measure how similar two random sequences are to the PAX domain.

    Two random sequences are drawn from a fixed alphabet, locally aligned
    against each other, and each side of that local alignment is passed to
    ``consensus_alignment``. Serves as a sanity check for the homework
    solution.

    Args:
        scoring_matrix: Scoring matrix forwarded to the alignment helpers.
        length_human: Length of the first random sequence. Defaults to 422,
            the value previously hard-coded here.
        length_fruitfly: Length of the second random sequence. Defaults to
            857, the value previously hard-coded here.

    Returns:
        A tuple ``(ratio_human_random, ratio_fruitfly_random)`` with the
        similarity ratio reported by ``consensus_alignment`` for each side.
    """
    alphabet = "ACBEDGFIHKMLNQPSRTWVYXZ"

    # str.join avoids quadratic string concatenation; random.choice is still
    # called in the same order, so seeded runs produce the same sequences.
    random_human = "".join(
        random.choice(alphabet) for _ in range(length_human))
    random_fruitfly = "".join(
        random.choice(alphabet) for _ in range(length_fruitfly))

    alignment_matrix = compute_alignment_matrix(
        seq_x=random_human,
        seq_y=random_fruitfly,
        scoring_matrix=scoring_matrix,
        global_flag=False)
    _, align_human, align_fruitfly = compute_local_alignment(
        seq_x=random_human,
        seq_y=random_fruitfly,
        scoring_matrix=scoring_matrix,
        alignment_matrix=alignment_matrix)

    # Only the ratios are reported; the aligned strings are discarded.
    _, ratio_human_random = consensus_alignment(scoring_matrix, align_human)
    _, ratio_fruitfly_random = consensus_alignment(
        scoring_matrix, align_fruitfly)
    return ratio_human_random, ratio_fruitfly_random
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """Estimate a null distribution of local-alignment scores by shuffling.

    Repeats the following ``num_trials`` times:

    1. Build a random permutation ``rand_y`` of ``seq_y`` with
       ``random.shuffle()``.
    2. Compute the local alignment matrix of ``seq_x`` against ``rand_y``
       and take its largest entry as the score of the trial.
    3. Increment the count stored under that score.

    Args:
        seq_x: Sequence that is kept fixed in every trial.
        seq_y: Sequence whose elements are shuffled in every trial.
        scoring_matrix: Scoring matrix forwarded unchanged to
            ``compute_alignment_matrix``.
        num_trials: Number of shuffles to perform.

    Returns:
        A plain ``dict`` mapping each observed maximum score to the number
        of trials that produced it (un-normalized).

    Side effects:
        Pickles the returned dictionary to ``scoring_distribution.p`` in the
        current working directory, overwriting any existing file.
    """
    scoring_distribution = {}
    for _ in range(num_trials):
        # Shuffle a fresh copy each trial so seq_y itself is never mutated.
        list_y = list(seq_y)
        random.shuffle(list_y)
        rand_y = "".join(list_y)

        alignment_matrix = compute_alignment_matrix(
            seq_x=seq_x,
            seq_y=rand_y,
            scoring_matrix=scoring_matrix,
            global_flag=False)
        # The best local alignment score is the largest cell of the matrix,
        # so the alignment itself does not need to be traced back.
        score = max(max(row) for row in alignment_matrix)
        scoring_distribution[score] = scoring_distribution.get(score, 0) + 1

    # The context manager guarantees the file is closed even when
    # pickle.dump raises.
    with open('scoring_distribution.p', 'wb') as scoring_file:
        pickle.dump(scoring_distribution, scoring_file)
    return scoring_distribution
def random_alignment(scoring_matrix, length_human=422, length_fruitfly=857):
    """Measure how similar two random sequences are to the PAX domain.

    Two random sequences are drawn from a fixed alphabet, locally aligned
    against each other, and each side of that local alignment is passed to
    ``consensus_alignment``. Serves as a sanity check for the homework
    solution.

    Args:
        scoring_matrix: Scoring matrix forwarded to the alignment helpers.
        length_human: Length of the first random sequence. Defaults to 422,
            the value previously hard-coded here.
        length_fruitfly: Length of the second random sequence. Defaults to
            857, the value previously hard-coded here.

    Returns:
        A tuple ``(ratio_human_random, ratio_fruitfly_random)`` with the
        similarity ratio reported by ``consensus_alignment`` for each side.
    """
    alphabet = "ACBEDGFIHKMLNQPSRTWVYXZ"

    # str.join avoids quadratic string concatenation; random.choice is still
    # called in the same order, so seeded runs produce the same sequences.
    random_human = "".join(
        random.choice(alphabet) for _ in range(length_human))
    random_fruitfly = "".join(
        random.choice(alphabet) for _ in range(length_fruitfly))

    alignment_matrix = compute_alignment_matrix(
        seq_x=random_human,
        seq_y=random_fruitfly,
        scoring_matrix=scoring_matrix,
        global_flag=False)
    _, align_human, align_fruitfly = compute_local_alignment(
        seq_x=random_human,
        seq_y=random_fruitfly,
        scoring_matrix=scoring_matrix,
        alignment_matrix=alignment_matrix)

    # Only the ratios are reported; the aligned strings are discarded.
    _, ratio_human_random = consensus_alignment(scoring_matrix, align_human)
    _, ratio_fruitfly_random = consensus_alignment(
        scoring_matrix, align_fruitfly)
    return ratio_human_random, ratio_fruitfly_random
def get_edit_distance(seq_x, seq_y, scoring_matrix):
    """Derive an edit distance from the global alignment score.

    Globally aligns ``seq_x`` with ``seq_y`` under ``scoring_matrix`` and
    returns ``len(seq_x) + len(seq_y) - score``.

    NOTE(review): whether this quantity equals the true edit distance
    depends on the scoring matrix supplied by the caller; confirm there.

    Args:
        seq_x: First sequence.
        seq_y: Second sequence.
        scoring_matrix: Scoring matrix forwarded to the alignment helpers.

    Returns:
        The combined length of both sequences minus the global score.
    """
    global_matrix = compute_alignment_matrix(
        seq_x, seq_y, scoring_matrix, True)
    global_result = compute_global_alignment(
        seq_x, seq_y, scoring_matrix, global_matrix)
    # Only the score is needed; the aligned strings are ignored.
    best_score, _aligned_x, _aligned_y = global_result
    combined_length = len(seq_x) + len(seq_y)
    return combined_length - best_score
def protein_alignment(scoring_matrix):
    """Locally align the human and fruit fly eyeless proteins.

    Both sequences are loaded with ``read_protein`` from
    ``HumanEyelessProtein.txt`` and ``FruitflyEyelessprotein.txt``.

    Args:
        scoring_matrix: Scoring matrix forwarded to the alignment helpers.

    Returns:
        A tuple ``(score, align_human, align_fruitfly)`` as produced by
        ``compute_local_alignment``.
    """
    human_seq = read_protein('HumanEyelessProtein.txt')
    fruitfly_seq = read_protein('FruitflyEyelessprotein.txt')

    # Both helpers take the same sequence and scoring arguments.
    shared_args = {
        'seq_x': human_seq,
        'seq_y': fruitfly_seq,
        'scoring_matrix': scoring_matrix,
    }
    local_matrix = compute_alignment_matrix(global_flag=False, **shared_args)
    local_result = compute_local_alignment(
        alignment_matrix=local_matrix, **shared_args)

    best_score, human_side, fruitfly_side = local_result
    return best_score, human_side, fruitfly_side
def protein_alignment(scoring_matrix):
    """Locally align the human and fruit fly eyeless proteins.

    Both sequences are loaded with ``read_protein`` from
    ``HumanEyelessProtein.txt`` and ``FruitflyEyelessprotein.txt``.

    Args:
        scoring_matrix: Scoring matrix forwarded to the alignment helpers.

    Returns:
        A tuple ``(score, align_human, align_fruitfly)`` as produced by
        ``compute_local_alignment``.
    """
    human_seq = read_protein('HumanEyelessProtein.txt')
    fruitfly_seq = read_protein('FruitflyEyelessprotein.txt')

    # Both helpers take the same sequence and scoring arguments.
    shared_args = {
        'seq_x': human_seq,
        'seq_y': fruitfly_seq,
        'scoring_matrix': scoring_matrix,
    }
    local_matrix = compute_alignment_matrix(global_flag=False, **shared_args)
    local_result = compute_local_alignment(
        alignment_matrix=local_matrix, **shared_args)

    best_score, human_side, fruitfly_side = local_result
    return best_score, human_side, fruitfly_side
def consensus_alignment(scoring_matrix, align_sequence):
    """Compare a locally-aligned sequence against the consensus PAX domain.

    The dashes are stripped from ``align_sequence``, the result is globally
    aligned against the sequence read from ``ConsensusPAXDomain.txt``, and
    the two aligned strings are compared with ``difflib.SequenceMatcher``.

    Args:
        scoring_matrix: Scoring matrix forwarded to the alignment helpers.
        align_sequence: One side of a local alignment; may contain ``-``.

    Returns:
        A tuple ``(global_align_sequence, ratio)`` holding the globally
        aligned form of the input sequence and the value of
        ``SequenceMatcher.ratio()`` for the two aligned strings.
    """
    consensus = read_protein('ConsensusPAXDomain.txt')
    ungapped = align_sequence.replace("-", "")

    global_matrix = compute_alignment_matrix(
        seq_x=ungapped,
        seq_y=consensus,
        scoring_matrix=scoring_matrix,
        global_flag=True)
    _score, aligned_input, aligned_consensus = compute_global_alignment(
        seq_x=ungapped,
        seq_y=consensus,
        scoring_matrix=scoring_matrix,
        alignment_matrix=global_matrix)

    similarity = difflib.SequenceMatcher(
        None, aligned_input, aligned_consensus).ratio()
    return aligned_input, similarity