def question2():
    scoring_matrix =read_scoring_matrix(PAM50_URL)
    human = read_protein(HUMAN_EYELESS_URL)
    fly = read_protein(FRUITFLY_EYELESS_URL)
    # for question 3
#    acids = 'ACBEDGFIHKMLNQPSRTWVYXZ'
#    hlen = len(human)
#    flen = len(fly)
#    human_random, fly_random = '', ''
#    for dummy_i in xrange(hlen):
#        human_random = human_random + human[random.randint(1,23)]
#    for dummy_i in xrange(flen):
#        fly_random = fly_random + fly[random.randint(1,23)]
#    human = human_random
#    fly = fly_random
    consensusPAX = read_protein(CONSENSUS_PAX_URL)
    alignment_matrix = student.compute_alignment_matrix(human, fly, scoring_matrix, False)
    local_result = student.compute_local_alignment(human, fly, scoring_matrix, alignment_matrix)
    local_human = ''.join(local_result[1].split('-'))
    local_fly = ''.join(local_result[2].split('-'))
    human_P = student.compute_alignment_matrix(local_human,consensusPAX,scoring_matrix, True)
    human_result = student.compute_global_alignment(local_human,consensusPAX, scoring_matrix, human_P)
    fly_P = student.compute_alignment_matrix(local_fly,consensusPAX, scoring_matrix, True)
    fly_result = student.compute_global_alignment(local_fly,consensusPAX, scoring_matrix, fly_P)
    total = len(consensusPAX)
    human_count, fly_count =0, 0
    for dummy_i in xrange(total):
        if human_result[1][dummy_i] == human_result[2][dummy_i]:
            human_count += 1
        if fly_result[1][dummy_i] ==fly_result[2][dummy_i]:
            fly_count += 1
    print human_count * 1.0 / total
    print fly_count * 1.0 / total
def check_spelling(checked_word, dist, word_list):
    """Return the words of ``word_list`` that lie within ``dist`` of ``checked_word``.

    Each candidate is globally aligned against ``checked_word`` with a scoring
    matrix built from the lowercase alphabet and the values 2, 1, 0. The
    distance used for the comparison is
    ``len(checked_word) + len(word) - alignment score``.

    Returns a list, in the order the words appear in ``word_list``.
    """
    letters = set('abcdefghijklmnopqrstuvwxyz')
    scores = student.build_scoring_matrix(letters, 2, 1, 0)

    def distance_to(candidate):
        # Distance derived from the global alignment score of the two words.
        table = student.compute_alignment_matrix(
            checked_word, candidate, scores, True)
        outcome = student.compute_global_alignment(
            checked_word, candidate, scores, table)
        return len(checked_word) + len(candidate) - outcome[0]

    return [candidate for candidate in word_list
            if distance_to(candidate) <= dist]
def question2():
    """ determine global alignment of consensusPAX 
    with local human and frfly sequences
    """
    
    # load sequences and scoring matrix
    score_matrix = read_scoring_matrix(PAM50_URL)
    human_seq = "HSGVNQLGGVFVNGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATPEVVSKIAQYKRECPSIFAWEIRDRLLSEGVCTNDNIPSVSSINRVLRNLASEKQQ"
    frfly_seq = "HSGVNQLGGVFVGGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATAEVVSKISQYKRECPSIFAWEIRDRLLQENVCTNDNIPSVSSINRVLRNLAAQKEQQ"
    consensus_pax = read_protein(CONSENSUS_PAX_URL)
    
    # compute human and fruitfly global alignment matrix with consensus pax
    human_align_matrix = student.compute_alignment_matrix(human_seq, consensus_pax, score_matrix, True)
    frfly_align_matrix = student.compute_alignment_matrix(frfly_seq, consensus_pax, score_matrix, True)
    
    # compute human and fruitfly global alignment sequences
    score_human, human_align, consensus_align = student.compute_global_alignment(human_seq, consensus_pax, 
                                                                                 score_matrix, human_align_matrix)
    score_fly, frfly_align, consensus_align_2 = student.compute_global_alignment(frfly_seq, consensus_pax,
                                                                                 score_matrix, frfly_align_matrix)
    
    # compute percentages match for human and fruitfly
    human_count = 0.0
    for index in range(len(human_align)):
        if human_align[index] == consensus_align[index]:
            human_count += 1
            
    frfly_count = 0.0
    for index in range(len(frfly_align)):
        if frfly_align[index] == consensus_align_2[index]:
            frfly_count += 1
            
    print "% Human: " + str(human_count / len(human_align) * 100)
    print "Hmn: " + human_align
    print "PAX: " + consensus_align
    
    print ""
    
    print "% FrFly: " + str(frfly_count / len(frfly_align) * 100)
    print "Fly: " + frfly_align
    print "PAX: " + consensus_align_2
Exemplo n.º 4
0
def q3() :
    len_gen = len(seq_x)
    seq_x = []
    seq_y = []
    for _ in range(len_gen) :
        seq_x.append(random.choice("ACBEDGFIHKMLNQPSRTWVYXZ"))
        seq_y.append(random.choice("ACBEDGFIHKMLNQPSRTWVYXZ"))

        alignment_matrix = student.compute_alignment_matrix(seq_x, seq_y, scoring_matrix, False)
        score, human_aligen, fruit_aligen = student.compute_local_alignment(seq_x, seq_y, scoring_matrix, alignment_matrix)

    print score
    print human_aligen.replace('-', '')
    print fruit_aligen

    human_aligen = human_aligen.replace('-', '')
    fruit_aligen = fruit_aligen.replace('-', '')
    pax = read_protein(CONSENSUS_PAX_URL)

    alignment_matrix = student.compute_alignment_matrix(human_aligen, pax, scoring_matrix, False)
    score, h1, h2 = student.compute_global_alignment(human_aligen, pax, scoring_matrix, alignment_matrix)

    print len(h1), len(h2)
    same = 0
    for i in range(len(h1)) :
        if h1[i] == h2[i] :
            same += 1
    print same * 1.0 / len(h1)

    alignment_matrix = student.compute_alignment_matrix(fruit_aligen, pax, scoring_matrix, False)
    score, f1, f2 = student.compute_global_alignment(fruit_aligen, pax, scoring_matrix, alignment_matrix)

    print len(f1), len(f2)
    same = 0
    for i in range(len(f1)) :
        if f1[i] == f2[i] :
            same += 1
    print same * 1.0 / len(f1)
def run_q2(origin_seq_x):
    """Return the fraction of columns in which ``origin_seq_x`` matches the consensus.

    Dashes are removed from ``origin_seq_x``; the result is globally aligned
    against the hard-coded consensus sequence below using the PAM50 scoring
    matrix. The four lengths (both inputs, both aligned rows) are printed, and
    the number of matching columns divided by the alignment length is
    returned.
    """
    consensus = 'GHGGVNQLGGVFVNGRPLPDVVRQRIVELAHQGVRPCDISRQLRVSHGCVSKILGRYYETGSIKPGVIGGSKPKVATPKVVEKIAEYKRQNPTMFAWEIRDRLLAERVCDNDTVPSVSSINRIIR'
    query = origin_seq_x.replace('-', '')
    pam50 = read_scoring_matrix(PAM50_URL)
    table = student.compute_alignment_matrix(query, consensus, pam50, True)
    _, aligned_query, aligned_consensus = student.compute_global_alignment(
        query, consensus, pam50, table)
    assert len(aligned_query) == len(aligned_consensus)
    columns = len(aligned_consensus)
    print (len(query), len(consensus), len(aligned_query), len(aligned_consensus))
    agreeing = sum(1 for left, right in zip(aligned_query, aligned_consensus)
                   if left == right)
    return agreeing * 1.0 / columns
def run_q2(origin_seq_x):
    """Globally align ``origin_seq_x`` (dashes removed) with the consensus sequence.

    Uses the PAM50 scoring matrix. Prints the lengths of the two inputs and
    of the two aligned rows, then returns the share of alignment columns in
    which both rows carry the same character.
    """
    stripped_x = origin_seq_x.replace('-', '')
    consensus_y = 'GHGGVNQLGGVFVNGRPLPDVVRQRIVELAHQGVRPCDISRQLRVSHGCVSKILGRYYETGSIKPGVIGGSKPKVATPKVVEKIAEYKRQNPTMFAWEIRDRLLAERVCDNDTVPSVSSINRIIR'
    pam50 = read_scoring_matrix(PAM50_URL)
    dp_table = student.compute_alignment_matrix(
        stripped_x, consensus_y, pam50, True)
    _, row_x, row_y = student.compute_global_alignment(
        stripped_x, consensus_y, pam50, dp_table)
    assert len(row_x) == len(row_y)
    total_columns = len(row_y)
    print(len(stripped_x), len(consensus_y), len(row_x), len(row_y))
    same_columns = 0
    for char_x, char_y in zip(row_x, row_y):
        if char_x == char_y:
            same_columns += 1
    return same_columns * 1.0 / total_columns
def question7(seq_x, seq_y):
    """ determine scoring matrix of edit distance algorithm """
    
    diag_score = 2
    off_diag_score = 1
    dash_score = 0
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    score_matrix = student.build_scoring_matrix(alphabet, diag_score, off_diag_score, dash_score)
    
    align_matrix = student.compute_alignment_matrix(seq_x, seq_y, score_matrix, True)
    score, align_x, align_y = student.compute_global_alignment(seq_x, seq_y, score_matrix, align_matrix)
    
    edit_distance = len(seq_x) + len(seq_y) - score
    
    print "Edit distance: " + str(edit_distance)
    print align_x
    print align_y
Exemplo n.º 8
0
def check_spelling(checked_word, dist, word_list):
    """
    Collect the words of word_list that are close to checked_word.

    input: word, target distance, and word list
    output: the set of words for which
        len(checked_word) + len(word) - global alignment score
    is at most the target distance

    Uses the module-level scoring_matrix.
    """
    checked_len = len(checked_word)
    close_words = set()
    for candidate in word_list:
        table = student.compute_alignment_matrix(
            checked_word, candidate, scoring_matrix, True)
        best_score = student.compute_global_alignment(
            checked_word, candidate, scoring_matrix, table)[0]
        distance = checked_len + len(candidate) - best_score
        if distance <= dist:
            close_words.add(candidate)
    return close_words
Exemplo n.º 9
0
def check_spelling(checked_word, dist, word_list):
    """Return the list of words whose edit distance to ``checked_word`` is <= ``dist``.

    The edit distance of two words is taken to be
    ``len(checked_word) + len(word) - global alignment score`` under a scoring
    matrix built with diag_score=2, off_diag_score=1, dash_score=0 over the
    lowercase alphabet. Words are returned in ``word_list`` order.
    """
    scoring_matrix = student.build_scoring_matrix(
        set('abcdefghijklmnopqrstuvwxyz'), 2, 1, 0)
    checked_len = len(checked_word)

    def edit_distance(candidate):
        # Edit distance recovered from the global alignment score.
        table = student.compute_alignment_matrix(
            checked_word, candidate, scoring_matrix, True)
        result = student.compute_global_alignment(
            checked_word, candidate, scoring_matrix, table)
        return checked_len + len(candidate) - result[0]

    matches = []
    for candidate in word_list:
        if edit_distance(candidate) <= dist:
            matches.append(candidate)
    return matches
def check_spelling(checked_word, dist, word_list):
    """Helper: list every word of ``word_list`` within ``dist`` edits of ``checked_word``.

    A word qualifies when ``len(checked_word) + len(word) - score <= dist``,
    where ``score`` is the global alignment score under the matrix built with
    diag_score=2, off_diag_score=1, dash_score=0.
    """
    scoring = student.build_scoring_matrix(
        "abcdefghijklmnopqrstuvwxyz", 2, 1, 0)

    def is_close(candidate):
        # True when the candidate's edit distance does not exceed ``dist``.
        table = student.compute_alignment_matrix(
            checked_word, candidate, scoring, True)
        total, _, _ = student.compute_global_alignment(
            checked_word, candidate, scoring, table)
        return len(checked_word) + len(candidate) - total <= dist

    return [candidate for candidate in word_list if is_close(candidate)]
Exemplo n.º 11
0
def check_spelling(checked_word, dist, word_list):
    """
    Find the words of word_list that are close to checked_word.

    input:
        the word to check, the maximum distance, and the list of words to
        search
    output:
        the set of all words for which
        len(checked_word) + len(word) - global alignment score <= dist
    """
    letters = set('abcdefghijklmnopqrstuvwxyz')
    # positional values: diag_score=2, off_diag_score=1, dash_score=0
    scoring = student.build_scoring_matrix(letters, 2, 1, 0)

    def within_dist(candidate):
        # Compare the distance derived from the alignment score with dist.
        table = student.compute_alignment_matrix(
            checked_word, candidate, scoring, True)
        aligned = student.compute_global_alignment(
            checked_word, candidate, scoring, table)
        return len(checked_word) + len(candidate) - aligned[0] <= dist

    return set(candidate for candidate in word_list if within_dist(candidate))
# Question 1: locally align the sequences ``hep`` and ``fep``. The alignment
# matrix is computed inline (last argument False) and handed straight to
# compute_local_alignment.
# NOTE(review): hep, fep and scoring_matrix are defined outside this excerpt;
# presumably the human / fruit-fly eyeless proteins and the PAM50 matrix --
# confirm.
hep_fep_local_alignment = alg_project4_solution.compute_local_alignment(hep, fep, scoring_matrix,
                                                                        alg_project4_solution.compute_alignment_matrix(
                                                                            hep, fep, scoring_matrix, False))
# Element 0 of the result is kept as the alignment score.
human_eyeless_fruitfly_local_alignment_score = hep_fep_local_alignment[0]
# Question 1 answer: print the whole local-alignment result.
print "local alignment for human and fruitfly eyeless genome: " + str(hep_fep_local_alignment)

# Question 2: globally align each locally aligned sequence (dashes removed)
# against the sequence read from CONSENSUS_PAX_URL.
cpd = alg_alignment.read_protein(CONSENSUS_PAX_URL)

# Elements 1 and 2 of the local-alignment result are used as the two aligned
# sequences (``hep`` side and ``fep`` side respectively).
hep_local_alignment = hep_fep_local_alignment[1]
fep_local_alignment = hep_fep_local_alignment[2]

hep_local_alignment_no_dashes = hep_local_alignment.replace('-', '')

# Global alignment of the dash-free ``hep`` sequence with cpd; the alignment
# matrix is built inline with its last argument set to True.
hep_no_dashes_cpd_global_alignment = alg_project4_solution.compute_global_alignment(hep_local_alignment_no_dashes, cpd, scoring_matrix, alg_project4_solution.compute_alignment_matrix(
                                                   hep_local_alignment_no_dashes, cpd, scoring_matrix, True))
fep_local_alignment_no_dashes = fep_local_alignment.replace('-', '')

# Same computation for the dash-free ``fep`` sequence.
fep_no_dashes_cpd_global_alignment = alg_project4_solution.compute_global_alignment(fep_local_alignment_no_dashes, cpd, scoring_matrix,
                                                                                    alg_project4_solution.compute_alignment_matrix(fep_local_alignment_no_dashes, cpd, scoring_matrix, True))

print hep_no_dashes_cpd_global_alignment
print fep_no_dashes_cpd_global_alignment

# Compute the percentage of elements in these two sequences that agree:
# hndga is element 1 and hndgacpd element 2 of the ``hep``-vs-cpd global
# alignment result (the two aligned rows to be compared).
hndga = hep_no_dashes_cpd_global_alignment[1]
hndgacpd = hep_no_dashes_cpd_global_alignment[2]


# Counter for the matching positions; the counting loop is not part of this
# excerpt.
human_consensus_match_count = 0
for idx, elem in enumerate(hndga):
                                                   False))
# NOTE(review): hep_fep_local_alignment and scoring_matrix are defined outside
# this excerpt. Element 0 of the local-alignment result is kept as the score.
human_eyeless_fruitfly_local_alignment_score = hep_fep_local_alignment[0]
# Question 1 answer: print the whole local-alignment result.
print "local alignment for human and fruitfly eyeless genome: " + str(
    hep_fep_local_alignment)

# Question 2: globally align each locally aligned sequence (dashes removed)
# against the sequence read from CONSENSUS_PAX_URL.
cpd = alg_alignment.read_protein(CONSENSUS_PAX_URL)

# Elements 1 and 2 of the local-alignment result are used as the two aligned
# sequences.
hep_local_alignment = hep_fep_local_alignment[1]
fep_local_alignment = hep_fep_local_alignment[2]

hep_local_alignment_no_dashes = hep_local_alignment.replace('-', '')

# Global alignment of the dash-free ``hep`` sequence with cpd; the alignment
# matrix is built inline with its last argument set to True.
hep_no_dashes_cpd_global_alignment = alg_project4_solution.compute_global_alignment(
    hep_local_alignment_no_dashes, cpd, scoring_matrix,
    alg_project4_solution.compute_alignment_matrix(
        hep_local_alignment_no_dashes, cpd, scoring_matrix, True))
fep_local_alignment_no_dashes = fep_local_alignment.replace('-', '')

# Same computation for the dash-free ``fep`` sequence.
fep_no_dashes_cpd_global_alignment = alg_project4_solution.compute_global_alignment(
    fep_local_alignment_no_dashes, cpd, scoring_matrix,
    alg_project4_solution.compute_alignment_matrix(
        fep_local_alignment_no_dashes, cpd, scoring_matrix, True))

print hep_no_dashes_cpd_global_alignment
print fep_no_dashes_cpd_global_alignment

# Compute the percentage of elements in these two sequences that agree:
# hndga is element 1 and hndgacpd element 2 of the ``hep``-vs-cpd global
# alignment result (the two aligned rows to be compared).
hndga = hep_no_dashes_cpd_global_alignment[1]
hndgacpd = hep_no_dashes_cpd_global_alignment[2]
student.compute_local_alignment(HUMAN_EYELESS, FRUITFLY_EYELESS,\
                                SCORING_MATRIX, ALIGNMENT_MATRIX)

# Question 2 ##################################################################
# For each locally aligned sequence: drop the dashes, globally align it with
# the consensus PAX sequence, and print the fraction of matching columns.
# NOTE(review): HUMAN_EYELESS, FRUITFLY_EYELESS, SCORING_MATRIX and
# ALIGNMENT_MATRIX are defined outside this excerpt.
PAX = read_protein(CONSENSUS_PAX_URL)

loc_score, loc_human, loc_fly = student.compute_local_alignment(HUMAN_EYELESS,\
                                                               FRUITFLY_EYELESS,\
                                                               SCORING_MATRIX,\
                                                               ALIGNMENT_MATRIX)

for align in (loc_human, loc_fly):
    # The loop variable is rebound to its dash-free version.
    align = align.replace('-', '')
    alignment_matrix = student.compute_alignment_matrix(
        align, PAX, SCORING_MATRIX, True)
    score, alignment, cons = student.compute_global_alignment(
        align, PAX, SCORING_MATRIX, alignment_matrix)
    # Summing the booleans counts the positions where both aligned rows agree;
    # dividing by float(len(alignment)) yields a fraction between 0 and 1.
    print sum([alignment[i] == cons[i]
               for i in range(len(alignment))]) / float(len(alignment))

# Question 4 ##################################################################


def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """Run ``num_trials`` trials that align ``seq_x`` with a shuffled ``seq_y``.

    In each trial the characters of ``seq_y`` are shuffled, joined back into
    a string, and an alignment matrix of ``seq_x`` against that string is
    computed with the last argument set to False.

    NOTE(review): the visible body stops after the alignment-matrix call;
    ``scoring_distribution`` is created but never filled or returned here.
    The listing is presumably truncated -- confirm against the full source.
    """
    scoring_distribution = {}
    list_y = list(seq_y)
    for trial in range(num_trials):
        # temp_y is another name for list_y (not a copy), so every shuffle
        # reorders the same list in place.
        temp_y = list_y
        random.shuffle(temp_y)
        rand_y = ''.join(temp_y)
        alignment_matrix = student.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
Exemplo n.º 15
0
 def score(x, y):
     """Return element 0 of the global alignment result of ``x`` and ``y``.

     Uses ``scoring_matrix`` from the enclosing scope; the alignment matrix
     is built with the last argument set to True.
     """
     table = student.compute_alignment_matrix(x, y, scoring_matrix, True)
     outcome = student.compute_global_alignment(x, y, scoring_matrix, table)
     return outcome[0]
Exemplo n.º 16
0
def find_local_align():
    score_matrix = read_scoring_matrix(PAM50_URL)
    seq_human = read_protein(HUMAN_EYELESS_URL)
    seq_fly = read_protein(FRUITFLY_EYELESS_URL)
    local_alignment_matrix = student.compute_alignment_matrix(
        seq_human, seq_fly, score_matrix, False)
    score, seq_loc_human, seq_loc_fly = student.compute_local_alignment(
        seq_human, seq_fly, score_matrix, local_alignment_matrix)
    length = len(seq_loc_fly)
    agree = 0
    for idx in range(length):
        if seq_loc_fly[idx] == seq_loc_human[idx]:
            agree += 1
    print 'Question 1:\n'
    print 'score:', score, '\nhuman:', seq_loc_human, '\nfly:  ', seq_loc_fly
    print 'Agree percentage: %.2f' % (100 * float(agree) / length)
    """
    Question 1:
    local alignment score: 875 
    human: HSGVNQLGGVFVNGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATPEVVSKIAQYKRECPSIFAWEIRDRLLSEGVCTNDNIPSVSSINRVLRNLASEK-QQ 
    fly:   HSGVNQLGGVFVGGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATAEVVSKISQYKRECPSIFAWEIRDRLLQENVCTNDNIPSVSSINRVLRNLAAQKEQQ
    Agree percentage: 93.98%
    """

    ### Question 2 ###
    print '\nQuestion 2:\n'
    seq_loc_human = seq_loc_human.replace('-', '')
    seq_loc_fly = seq_loc_fly.replace('-', '')
    seq_pax = read_protein(CONSENSUS_PAX_URL)  #Q2
    # seq_pax = 'ACBEDGFIHKMLNQPSRTWVYXZ' #Q3
    for idx in range(2):
        if idx == 0:
            seq = seq_loc_human
            type = 'human'
        else:
            seq = seq_loc_fly
            type = 'fly'
        global_alignment_matrix = student.compute_alignment_matrix(
            seq, seq_pax, score_matrix, True)
        score, x_glbl, pax_glbl = student.compute_global_alignment(
            seq, seq_pax, score_matrix, global_alignment_matrix)
        length = len(x_glbl)
        agree = 0
        for idx in range(length):
            if x_glbl[idx] == pax_glbl[idx]:
                agree += 1

        print 'score:', score, '\n' + type, x_glbl, '\nPAX:  ', pax_glbl
        print type + ' agree percentage: %.2f' % (100 * float(agree) / length)
        """
        Question 2:

        human score: 613 
        human: -HSGVNQLGGVFVNGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATPEVVSKIAQYKRECPSIFAWEIRDRLLSEGVCTNDNIPSVSSINRVLRNLASEKQQ 
        PAX:   GHGGVNQLGGVFVNGRPLPDVVRQRIVELAHQGVRPCDISRQLRVSHGCVSKILGRYYETGSIKPGVIGGSKPKVATPKVVEKIAEYKRQNPTMFAWEIRDRLLAERVCDNDTVPSVSSINRIIR--------
        human agree percentage: 72.93
        
        flyscore: 586 
        fly:  -HSGVNQLGGVFVGGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATAEVVSKISQYKRECPSIFAWEIRDRLLQENVCTNDNIPSVSSINRVLRNLAAQKEQQ 
        PAX:  GHGGVNQLGGVFVNGRPLPDVVRQRIVELAHQGVRPCDISRQLRVSHGCVSKILGRYYETGSIKPGVIGGSKPKVATPKVVEKIAEYKRQNPTMFAWEIRDRLLAERVCDNDTVPSVSSINRIIR---------
        fly agree percentage: 70.15
        """
        """
Exemplo n.º 17
0
# Q2
# Globally align each locally aligned sequence from Q1 against the consensus
# PAX domain, using the PAM50 scoring matrix.
# NOTE(review): result_Q1 and PAM50 are defined outside this excerpt.
TempHumanSeq = result_Q1[1]
# Remove every dash, wherever it occurs. The previous slicing dropped exactly
# one character at position len - 3 and left the fruit-fly sequence
# untouched, which is only correct for one particular alignment.
HumanSeq = TempHumanSeq.replace('-', '')
FruitflySeq = result_Q1[2].replace('-', '')

ConsensusPAXDomain = read_protein(CONSENSUS_PAX_URL)

alignment_matrix_Q2_Human = student.compute_alignment_matrix(
    HumanSeq, ConsensusPAXDomain, PAM50, True)
alignment_matrix_Q2_Fruitfly = student.compute_alignment_matrix(
    FruitflySeq, ConsensusPAXDomain, PAM50, True)

result_Q2_Human = student.compute_global_alignment(
    HumanSeq, ConsensusPAXDomain, PAM50, alignment_matrix_Q2_Human)
result_Q2_Fruitfly = student.compute_global_alignment(
    FruitflySeq, ConsensusPAXDomain, PAM50, alignment_matrix_Q2_Fruitfly)


def calculate_score(seq1, seq2):
    """Return the percentage of positions at which ``seq1`` and ``seq2`` agree.

    Args:
        seq1: first sequence (anything supporting ``len`` and iteration).
        seq2: second sequence, expected to have the same length as ``seq1``.

    Returns:
        A float between 0 and 100. Two empty sequences yield 0.0. If the
        lengths differ, "Wrong!" is printed and None is returned.
    """
    if len(seq1) != len(seq2):
        # Parenthesised single-argument form prints the same text whether
        # ``print`` is a statement or a function.
        print("Wrong!")
        return None
    if len(seq1) == 0:
        # Nothing to compare; avoids dividing by zero.
        return 0.0
    num_equal = sum(1 for left, right in zip(seq1, seq2) if left == right)
    return float(num_equal) / float(len(seq1)) * 100
Exemplo n.º 18
0
 def score(x, y):
     """Return element 0 of the global alignment result of ``x`` and ``y``.

     Relies on ``scoring_matrix`` from the enclosing scope.
     """
     global_table = student.compute_alignment_matrix(
         x, y, scoring_matrix, True)
     alignment = student.compute_global_alignment(
         x, y, scoring_matrix, global_table)
     best_score = alignment[0]
     return best_score