Пример #1
0
def check_spelling(checked_word, dist, word_list):
    '''
    Collect the words of word_list that lie within edit distance dist of
    checked_word.

    The edit distance to a candidate word is computed from its global
    alignment score against checked_word as
    len(checked_word) + len(word) - score, with a scoring matrix built over
    the lowercase ASCII letters (diagonal score 2, off-diagonal score 1,
    dash score 0).

    Returns a list of the qualifying words, in the order they occur in
    word_list.
    '''
    diag_score, off_diag_score, dash_score = 2, 1, 0
    letters = set(string.ascii_lowercase)
    scores = student.build_scoring_matrix(letters, diag_score, off_diag_score, dash_score)

    def _edit_distance(candidate):
        # Global alignment of checked_word against one candidate word.
        table = student.compute_alignment_matrix(checked_word, candidate, scores, True)
        score, _, _ = student.compute_global_alignment(checked_word, candidate, scores, table)
        return len(checked_word) + len(candidate) - score

    return [candidate for candidate in word_list if _edit_distance(candidate) <= dist]
Пример #2
0
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    '''
    Build an un-normalized distribution of local alignment scores between
    seq_x and random permutations of seq_y.

    Input:
        seq_x, seq_y - two sequences that share a common alphabet
        scoring_matrix - scoring matrix handed unchanged to the student
            alignment routines
        num_trials - number of random permutations of seq_y to score
    Output:
        scoring_distribution - a dict of scores, in which the key is the
        score and the value is the number of trials that produced it
    '''
    scoring_distribution = dict()
    # Track the trial count explicitly; re-summing the dict's values on every
    # pass made the loop bookkeeping grow with the number of distinct scores.
    trials_done = 0
    while trials_done < num_trials:
        # Shuffle a fresh copy so that seq_y itself is never modified.
        list_y = list(seq_y)
        random.shuffle(list_y)
        rand_y = ''.join(list_y)
        # Local alignment: the last argument (global flag) is False.
        alignment_matrix = student.compute_alignment_matrix(seq_x, rand_y, scoring_matrix, False)
        score, _, _ = student.compute_local_alignment(seq_x, rand_y, scoring_matrix, alignment_matrix)
        scoring_distribution[score] = scoring_distribution.get(score, 0) + 1
        trials_done += 1
    return scoring_distribution
Пример #3
0
def question_one():
    """
    Locally align the Human Eyeless Protein against the Fruitfly Eyeless
    Protein using the PAM50 scoring matrix.

    Returns the result of prj4.compute_local_alignment for the two sequences.
    """
    pam50 = read_scoring_matrix(PAM50_URL)
    human = read_protein(HUMAN_EYELESS_URL)
    fly = read_protein(FRUITFLY_EYELESS_URL)
    # The final False selects the local (not global) alignment matrix.
    local_table = prj4.compute_alignment_matrix(human, fly, pam50, False)
    return prj4.compute_local_alignment(human, fly, pam50, local_table)
Пример #4
0
def answer_Q1():
    '''
    Answers Q1: compute the local alignment of the human and fruit fly
    eyeless protein sequences under the PAM50 scoring matrix.

    Returns the result of student.compute_local_alignment.
    '''
    # Inputs are read in the order human, fly, PAM50.
    human = provided.read_protein(HUMAN_EYELESS_URL)
    fly = provided.read_protein(FRUITFLY_EYELESS_URL)
    pam50 = provided.read_scoring_matrix(PAM50_URL)
    # False requests the local-alignment variant of the matrix.
    table = student.compute_alignment_matrix(human, fly, pam50, False)
    result = student.compute_local_alignment(human, fly, pam50, table)
    return result
Пример #5
0
def question_two():
    """
    Compare the human and fruit fly local alignments from question_one()
    against the consensus PAX sequence.

    Each local alignment has its dashes removed, is globally aligned with the
    consensus sequence, and is scored by the fraction of positions at which
    the two globally aligned strings agree.

    Returns a two-element list: [human fraction, fly fraction].
    """
    pam50 = read_scoring_matrix(PAM50_URL)
    local_results = question_one()
    consensus = read_protein(CONSENSUS_PAX_URL)
    gap = "-"

    def without_gaps(aligned):
        # Keep every symbol except the dash character.
        return "".join(symbol for symbol in aligned if symbol != gap)

    def matched_fraction(alignment):
        # alignment[1] and alignment[2] are the two aligned strings.
        hits = sum(1 for pos in range(len(alignment[2]))
                   if alignment[1][pos] == alignment[2][pos])
        return hits / float(len(alignment[2]))

    human_plain = without_gaps(local_results[1])
    print("Old human seq: " + local_results[1])
    print("New human seq: " + human_plain)
    fly_plain = without_gaps(local_results[2])
    print("Old fly seq: " + local_results[2])
    print("New fly seq: " + fly_plain)

    print("Computing alignment matrices and global alignments...")
    human_table = prj4.compute_alignment_matrix(human_plain, consensus, pam50, True)
    human_vs_pax = prj4.compute_global_alignment(human_plain, consensus, pam50, human_table)
    fly_table = prj4.compute_alignment_matrix(fly_plain, consensus, pam50, True)
    fly_vs_pax = prj4.compute_global_alignment(fly_plain, consensus, pam50, fly_table)

    return [matched_fraction(human_vs_pax), matched_fraction(fly_vs_pax)]
Пример #6
0
def answer_Q7():
    """
    Return the (diag_score, off_diag_score, dash_score) triple (2, 1, 0).

    Before returning, the triple is exercised on the pair 'AA' / 'TAAT': a
    scoring matrix over {A, C, T, G} is built, the global alignment is
    computed, and len(seq_x) + len(seq_y) - score is evaluated. That value
    is computed but is not part of the return value.
    """
    scores = (2, 1, 0)
    diag, off_diag, dash = scores
    first, second = 'AA', 'TAAT'

    matrix = student.build_scoring_matrix(set('ACTG'), diag, off_diag, dash)
    table = student.compute_alignment_matrix(first, second, matrix, True)
    score, _, _ = student.compute_global_alignment(first, second, matrix, table)

    # Evaluated as in the original code; the result is not returned.
    edit_distance = len(first) + len(second) - score

    return scores
Пример #7
0
def _drop_dashes(aligned, dash="-"):
    """Return aligned with every occurrence of the dash character removed."""
    return "".join(char for char in aligned if char != dash)


def _match_fraction(alignment):
    """
    Return the fraction of positions at which the two aligned strings
    alignment[1] and alignment[2] hold the same character.
    """
    matches = 0
    for index in range(len(alignment[2])):
        if alignment[1][index] == alignment[2][index]:
            matches += 1
    return matches / float(len(alignment[2]))


def question_two():
    """
    Compare the human and fruit fly local alignments from question_one()
    against the consensus PAX sequence.

    Each local alignment has its dashes removed, is globally aligned with the
    consensus sequence, and is scored by the fraction of aligned positions
    that agree.

    Returns a two-element list: [human fraction, fly fraction].
    """
    scoring_matrix = read_scoring_matrix(PAM50_URL)
    local_results = question_one()
    pax_seq = read_protein(CONSENSUS_PAX_URL)

    # Remove dashes from the human and fruit fly sequences.
    # print(...) with a single argument behaves identically on Python 2 and 3.
    new_human = _drop_dashes(local_results[1])
    print("Old human seq: " + local_results[1])
    print("New human seq: " + new_human)
    new_fly = _drop_dashes(local_results[2])
    print("Old fly seq: " + local_results[2])
    print("New fly seq: " + new_fly)

    # Globally align each dash-free sequence with the consensus sequence.
    print("Computing alignment matrices and global alignments...")
    align_matrix = prj4.compute_alignment_matrix(new_human, pax_seq, scoring_matrix, True)
    result_human_comp = prj4.compute_global_alignment(new_human, pax_seq, scoring_matrix, align_matrix)
    align_matrix = prj4.compute_alignment_matrix(new_fly, pax_seq, scoring_matrix, True)
    result_fly_comp = prj4.compute_global_alignment(new_fly, pax_seq, scoring_matrix, align_matrix)

    # Return the two match fractions in a list, human first.
    return [_match_fraction(result_human_comp), _match_fraction(result_fly_comp)]
Пример #8
0
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Generate an un-normalized distribution of local alignment scores.

    For each of num_trials trials, seq_y is randomly permuted, locally
    aligned against seq_x with scoring_matrix, and the resulting score
    (element 0 of the alignment result) is tallied.

    Returns a dict mapping each observed score to the number of trials that
    produced it.
    """
    distribution = {}

    # range() is available on both Python 2 and Python 3 (xrange is not).
    for _ in range(num_trials):
        # Shuffle a copy so the caller's seq_y is left untouched.
        rand_y = list(seq_y)
        random.shuffle(rand_y)
        rand_y = "".join(rand_y)
        align_matrix = prj4.compute_alignment_matrix(seq_x, rand_y, scoring_matrix, False)
        result = prj4.compute_local_alignment(seq_x, rand_y, scoring_matrix, align_matrix)
        distribution[result[0]] = distribution.get(result[0], 0) + 1

    # Counts are raw; normalization is left to the caller.
    return distribution
Пример #9
0
def percent_match(local_alignment):
    '''
    Compute how closely a local alignment matches the consensus PAX sequence.

    The dashes are stripped from local_alignment, the result is globally
    aligned with the consensus sequence under the PAM50 scoring matrix, and
    the share of positions where the two aligned strings agree is returned
    as a percentage rounded to two decimals.
    '''
    ungapped = local_alignment.replace('-', '')
    pam50 = provided.read_scoring_matrix(PAM50_URL)
    consensus = provided.read_protein(CONSENSUS_PAX_URL)

    # True selects the global-alignment variant of the matrix.
    table = student.compute_alignment_matrix(ungapped, consensus, pam50, True)
    _, aligned_local, aligned_consensus = student.compute_global_alignment(
        ungapped, consensus, pam50, table)

    total = len(aligned_local)
    agreeing = sum(1 for pos in range(total)
                   if aligned_local[pos] == aligned_consensus[pos])
    return round(agreeing / float(total) * 100, 2)
Пример #10
0
"""
Algorithmic Thinking - Module 4 Project
Mark Hess
Dynamic Programming and Sequence Alignment
Computing Alignments of sequences
Test File
"""
import Project_4

# Flags that gate the individual test sections of this script.
TEST1 = True
TEST2 = True
TEST3 = False
TEST4 = False

# TEST1: print the scoring matrix built for the alphabet {A, C, T, G} with
# the arguments 6, 2 and -4, for comparison with the expected dict below.
if TEST1:
	print (Project_4.build_scoring_matrix(set(['A', 'C', 'T', 'G']), 6, 2, -4))
	#expected {'A': {'A': 6, 'C': 2, '-': -4, 'T': 2, 'G': 2}, 'C': {'A': 2, 'C': 6, '-': -4, 'T': 2, 'G': 2},
	#'-': {'A': -4, 'C': -4, '-': -4, 'T': -4, 'G': -4}, 'T': {'A': 2, 'C': 2, '-': -4, 'T': 6, 'G': 2},
	#'G': {'A': 2, 'C': 2, '-': -4, 'T': 2, 'G': 6}}

if TEST2:
	print (Project_4.compute_alignment_matrix('', '', {'A': {'A': 6, 'C': 2, '-': -4, 'T': 2, 'G': 2},
		'C': {'A': 2, 'C': 6, '-': -4, 'T': 2, 'G': 2}, '-': {'A': -4, 'C': -4, '-': -4, 'T': -4, 'G': -4},
		'T': {'A': 2, 'C': 2, '-': -4, 'T': 6, 'G': 2}, 'G': {'A': 2, 'C': 2, '-': -4, 'T': 2, 'G': 6}}, True))
	#expected [[0]] but received []
	print (Project_4.compute_alignment_matrix('A', 'A', {'A': {'A': 6, 'C': 2, '-': -4, 'T': 2, 'G': 2},
		'C': {'A': 2, 'C': 6, '-': -4, 'T': 2, 'G': 2}, '-': {'A': -4, 'C': -4, '-': -4, 'T': -4, 'G': -4},
		'T': {'A': 2, 'C': 2, '-': -4, 'T': 6, 'G': 2}, 'G': {'A': 2, 'C': 2, '-': -4, 'T': 2, 'G': 6}}, True))
	#expected [[0, -4], [-4, 6]]
	print (Project_4.compute_alignment_matrix('ATG', 'ACG', {'A': {'A': 6, 'C': 2, '-': -4, 'T': 2, 'G': 2},
		'C': {'A': 2, 'C': 6, '-': -4, 'T': 2, 'G': 2}, '-': {'A': -4, 'C': -4, '-': -4, 'T': -4, 'G': -4},
Algorithmic Thinking (Part 2)
Application 4: Applications to Genomics and Beyond
"""

import Project_4
import alg_application4_provided as provided
import math
import matplotlib.pyplot as plt
"""
Question 1
"""
# Load the two eyeless protein sequences and the PAM50 scoring matrix.
seq_human = provided.read_protein(provided.HUMAN_EYELESS_URL)
seq_fly = provided.read_protein(provided.FRUITFLY_EYELESS_URL)
scoring_matrix = provided.read_scoring_matrix(provided.PAM50_URL)

# Local alignment: the final False selects the local alignment matrix.
local_alignment_mx = Project_4.compute_alignment_matrix(
    seq_human, seq_fly, scoring_matrix, False)
result = Project_4.compute_local_alignment(seq_human, seq_fly, scoring_matrix,
                                           local_alignment_mx)

# result is indexed as (score, human alignment, fly alignment).
# print(...) with one argument is valid on both Python 2 and Python 3.
print('Score:' + str(result[0]))
print('Human: ' + result[1])
print('Fly: ' + result[2])
"""
Question 2
"""
# Take the two aligned strings out of the Question 1 result and load the
# consensus PAX sequence.
ali_human, ali_fly = result[1], result[2]
seq_con = provided.read_protein(provided.CONSENSUS_PAX_URL)

# Strip the dash characters from both aligned strings.
ali_human = ali_human.replace('-', '')
ali_fly = ali_fly.replace('-', '')
Пример #12
0
Algorithmic Thinking - Module 4 Project
Mark Hess

Dynamic Programming and Sequence Alignment
Computing Alignments of sequences
Test File
"""
import Project_4

# Flags that gate the individual test sections of this script.
TEST1 = False
TEST2 = False
TEST3 = True
TEST4 = True

# TEST1: print the scoring matrix built for the alphabet {A, C, T, G} with
# the arguments 6, 2 and -4, for comparison with the expected dict below.
if TEST1:
	# Parenthesised so the statement is valid on both Python 2 and Python 3.
	print(Project_4.build_scoring_matrix(set(['A', 'C', 'T', 'G']), 6, 2, -4))
	#expected {'A': {'A': 6, 'C': 2, '-': -4, 'T': 2, 'G': 2}, 'C': {'A': 2, 'C': 6, '-': -4, 'T': 2, 'G': 2},
	#'-': {'A': -4, 'C': -4, '-': -4, 'T': -4, 'G': -4}, 'T': {'A': 2, 'C': 2, '-': -4, 'T': 6, 'G': 2},
	#'G': {'A': 2, 'C': 2, '-': -4, 'T': 2, 'G': 6}}

if TEST2:
	print Project_4.compute_alignment_matrix('', '', {'A': {'A': 6, 'C': 2, '-': -4, 'T': 2, 'G': 2}, 
		'C': {'A': 2, 'C': 6, '-': -4, 'T': 2, 'G': 2}, '-': {'A': -4, 'C': -4, '-': -4, 'T': -4, 'G': -4}, 
		'T': {'A': 2, 'C': 2, '-': -4, 'T': 6, 'G': 2}, 'G': {'A': 2, 'C': 2, '-': -4, 'T': 2, 'G': 6}}, True)
	#expected [[0]] but received []
	print Project_4.compute_alignment_matrix('A', 'A', {'A': {'A': 6, 'C': 2, '-': -4, 'T': 2, 'G': 2}, 
		'C': {'A': 2, 'C': 6, '-': -4, 'T': 2, 'G': 2}, '-': {'A': -4, 'C': -4, '-': -4, 'T': -4, 'G': -4}, 
		'T': {'A': 2, 'C': 2, '-': -4, 'T': 6, 'G': 2}, 'G': {'A': 2, 'C': 2, '-': -4, 'T': 2, 'G': 6}}, True)
	#expected [[0, -4], [-4, 6]]
	print Project_4.compute_alignment_matrix('ATG', 'ACG', {'A': {'A': 6, 'C': 2, '-': -4, 'T': 2, 'G': 2}, 
		'C': {'A': 2, 'C': 6, '-': -4, 'T': 2, 'G': 2}, '-': {'A': -4, 'C': -4, '-': -4, 'T': -4, 'G': -4}, 
"""


import Project_4
import alg_application4_provided as provided
import math
import matplotlib.pyplot as plt

"""
Question 1
"""
# Load the two eyeless protein sequences and the PAM50 scoring matrix.
seq_human = provided.read_protein(provided.HUMAN_EYELESS_URL)
seq_fly = provided.read_protein(provided.FRUITFLY_EYELESS_URL)
scoring_matrix = provided.read_scoring_matrix(provided.PAM50_URL)

# Local alignment: the final False selects the local alignment matrix.
local_alignment_mx = Project_4.compute_alignment_matrix(seq_human, seq_fly, scoring_matrix, False)
result = Project_4.compute_local_alignment(seq_human, seq_fly, scoring_matrix, local_alignment_mx)


# result is indexed as (score, human alignment, fly alignment).
# print(...) with one argument is valid on both Python 2 and Python 3.
print('Score:' + str(result[0]))
print('Human: ' + result[1])
print('Fly: ' + result[2])


"""
Question 2
"""
# Elements 1 and 2 of the Question 1 result are the human and fly aligned
# strings (they are printed as 'Human: ' and 'Fly: ' above).
ali_human = result[1]
ali_fly = result[2]
# Consensus PAX sequence, read from the URL supplied by the provided module.
seq_con = provided.read_protein(provided.CONSENSUS_PAX_URL)