def check_spelling(checked_word, dist, word_list): """ Function that iterates through word_list and returns a set of words that are within an edit distance dist of the string checked_word. """ # Initialize variables for this function _words_found = set() _len_checked = len(checked_word) _counter = 0 # Generate the scoring matrix needed to compare the word _str_alphabet = "abcdefghijklmnopqrstuvwxyz" _alphabet = set(_str_alphabet) scoring_matrix = sa.build_scoring_matrix(_alphabet, 2, 1, 0) # Iterate through the word list to find the words within distance for _word in word_list: _counter += 1 _alignment_matrix = sa.compute_alignment_matrix(checked_word, _word, scoring_matrix) if ( _len_checked + len(_word) - _alignment_matrix[_len_checked][len(_word)] ) <= dist: _words_found.add(_word) if _counter % 1000 == 0: print "..........", _counter, "words processed." return _words_found
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials): """ Function that takes as input two sequences seq_x and seq_y, a scoring matrix, and a number of trials num_trials. This functions returns a dictionary scoring_distribution that represents an un-normalized distribution generated by performing a local alignment process num_trials times. """ # Initialize variables used in this function _scoring_distribution = {} _trials_to_go = num_trials _rand_y = seq_y while _trials_to_go > 0: # Create a random permutation of the sequence seq_y _lst_rand_y = list(_rand_y) random.shuffle(_lst_rand_y) _rand_y = "" for _char in _lst_rand_y: _rand_y += _char # Compute the maximum score for the local alignment of seq_x and rand_y _alignment_matrix = sa.compute_alignment_matrix(seq_x, _rand_y, scoring_matrix, False) _num_rows = len(_alignment_matrix) _num_cols = len(_alignment_matrix[0]) _max_score = -1 for _index1 in range(_num_rows): for _index2 in range(_num_cols): if ( _alignment_matrix[_index1][_index2] > _max_score ): _max_score = _alignment_matrix[_index1][_index2] # Count the scores generated in the dictionary scoring_distribution _scoring_distribution[_max_score] = _scoring_distribution.get(_max_score, 0) + 1 if ( num_trials - _trials_to_go + 1 ) % 1 == 0: print ".......... Trial", ( num_trials - _trials_to_go + 1 ), "completed." _trials_to_go -= 1 return _scoring_distribution
print # Read the FruitflyEyelessProtein and print the result fruitfly_protein = rfs.read_protein(FRUITFLY_EYELESS_URL) print "Fruitfly protein:" print fruitfly_protein print "Length fruitfly protein =", len(fruitfly_protein) print # Read the PAM50 scoring matrix and print the result scoring_matrix = rfs.read_scoring_matrix(PAM50_URL) #print_scoring_matrix(scoring_matrix) #print # Compute local alignment matrix for the two proteins alignment_matrix = sa.compute_alignment_matrix(human_protein, fruitfly_protein, scoring_matrix, False) #print_alignment_matrix(alignment_matrix, human_protein, fruitfly_protein) #print # Compute and print local alignment local_alignment = sa.compute_local_alignment(human_protein, fruitfly_protein, scoring_matrix, alignment_matrix) print print "==================> Local alignment:" print ".......... Local alignment score: ", local_alignment[0] print ".......... Human protein sequence: ", local_alignment[1] if local_alignment[1] == HUMAN_RESULT: print ">>>>>>>>>>>>>>>>>>>> THIS IS THE RIGHT SEQUENCE" print ".......... Fruitfly protein sequence:", local_alignment[2] if local_alignment[2] == FRUITFLY_RESULT: print ">>>>>>>>>>>>>>>>>>>> THIS IS THE RIGHT SEQUENCE" print ".......... Length aligned sequence: ", len(local_alignment[1])
print "Consensus PAX domain protein:" print consensus_pax print "Length consensus PAX domain protein =", len(consensus_pax) print # Read the PAM50 scoring matrix and print the result scoring_matrix = rfs.read_scoring_matrix(PAM50_URL) # Computations for the human protein sequence # . remove all '-' from the sequence lst_of_strings = HUMAN_LOCAL.split("-") human_local = "" for substring in lst_of_strings: human_local += substring # . compute the alignment between this sequence and the ConsensusPAXDomain sequence alignment_matrix = sa.compute_alignment_matrix(human_local, consensus_pax, scoring_matrix) global_alignment = sa.compute_global_alignment(human_local, consensus_pax, scoring_matrix, alignment_matrix) # . determine percentage of agreement agreements = 0 for index in range(len(global_alignment[1])): if global_alignment[1][index] == global_alignment[2][index]: agreements += 1 print ">>>>>>>>>>>>>>> Percentage of agreement human local =", ( float(agreements) / len(global_alignment[1]) ) * 100, "%" # Computations for the fruitfly protein sequence # . remove all '-' from the sequence lst_of_strings = FRUITFLY_LOCAL.split("-") fruitfly_local = "" for substring in lst_of_strings: