def q2():
    """Print how closely each local alignment matches the consensus PAX domain.

    The human and fruit fly eyeless proteins are locally aligned against each
    other, the dashes are stripped from both local alignment strings, and each
    stripped string is then globally aligned against the consensus PAX
    sequence.  For each of the two global alignments the percentage of
    positions on which the aligned strings agree is printed.
    """

    def q2_helper(str1, str2):
        """Return the percentage of positions where str1 and str2 agree.

        Returns 0 when the lengths differ, and also when both strings are
        empty (which would otherwise divide by zero).
        """
        if len(str1) != len(str2) or not str1:
            return 0
        matches = sum(1 for left, right in zip(str1, str2) if left == right)
        return (matches * 1.0 / len(str1)) * 100

    human = data.read_protein(data.HUMAN_EYELESS_URL)
    fly = data.read_protein(data.FRUITFLY_EYELESS_URL)
    scores = data.read_scoring_matrix(data.PAM50_URL)
    c_pax = data.read_protein(data.CONSENSUS_PAX_URL)

    # get local alignment of human and fly (the score itself is not needed)
    a_matrix = soln.compute_alignment_matrix(human, fly, scores, False)
    _, l_h, l_ff = soln.compute_local_alignment(human, fly, scores, a_matrix)

    # removing the dashes
    l_h = l_h.replace("-", "")
    l_ff = l_ff.replace("-", "")

    # get global alignment matrix for each local string and pax
    pax_a_h_matrix = soln.compute_alignment_matrix(l_h, c_pax, scores, True)
    pax_a_ff_matrix = soln.compute_alignment_matrix(l_ff, c_pax, scores, True)

    # compute global alignment; items 1 and 2 of each result are the two
    # strings that get compared position by position
    h_ga = soln.compute_global_alignment(l_h, c_pax, scores, pax_a_h_matrix)
    ff_ga = soln.compute_global_alignment(l_ff, c_pax, scores, pax_a_ff_matrix)

    # The Python 2 print statement writes no separating space after an item
    # that ends in a tab, so plain concatenation reproduces the same output.
    print("human:\t\t" + str(q2_helper(h_ga[1], h_ga[2])))
    print("fruit fly:\t" + str(q2_helper(ff_ga[1], ff_ga[2])))
def q4(n_trials=10000):
    """Plot the normalized null distribution of local alignment scores.

    Loads the human and fruit fly eyeless proteins and the PAM50 scoring
    matrix, asks ``generate_null_distribution`` for a score distribution,
    prints both the raw and the normalized distribution, and shows a bar
    chart of the normalized one.

    Args:
        n_trials: number of trials passed to ``generate_null_distribution``
            and shown in the plot title.  Defaults to 10000, the value that
            used to be hard-coded here.
    """
    human = data.read_protein(data.HUMAN_EYELESS_URL)
    fly = data.read_protein(data.FRUITFLY_EYELESS_URL)
    scores = data.read_scoring_matrix(data.PAM50_URL)

    # null_dist is used as a mapping: score -> number of trials with that score
    null_dist = generate_null_distribution(human, fly, scores, n_trials)
    print(null_dist)

    # normalize the distribution so that the bar heights are fractions
    total = sum(null_dist.values())
    norm_dist = {score: count * 1.0 / total
                 for score, count in null_dist.items()}
    print(norm_dist)

    # Raw counts recorded from earlier runs.  The normalized values that were
    # also recorded here are simply these counts divided by the trial total.
    #
    # 1000 trials:
    # dist = {37: 1, 38: 1, 40: 2, 41: 12, 42: 17, 43: 29, 44: 35, 45: 51, 46: 65, 47: 73, 48: 68, 49: 67, 50: 74, 51: 73,
    #         52: 65, 53: 41, 54: 50, 55: 37, 56: 36, 57: 32, 58: 29, 59: 37, 60: 16, 61: 18, 62: 17, 63: 11, 64: 2,
    #         65: 9, 66: 7, 67: 5, 68: 3, 69: 1, 70: 2, 71: 1, 72: 2, 73: 3, 75: 1, 76: 1, 77: 2, 78: 2, 82: 1, 85: 1}
    # dist2 = {38: 3, 40: 7, 41: 13, 42: 23, 43: 27, 44: 29, 45: 45, 46: 74, 47: 69, 48: 72, 49: 71, 50: 65, 51: 52,
    #          52: 64, 53: 57, 54: 49, 55: 43, 56: 28, 57: 40, 58: 31, 59: 17, 60: 18, 61: 14, 62: 12, 63: 9, 64: 9,
    #          65: 4, 66: 10, 67: 8, 68: 5, 69: 6, 70: 2, 71: 3, 72: 4, 73: 1, 75: 3, 77: 4, 79: 2, 80: 1, 81: 1, 82: 2,
    #          84: 1, 85: 1, 93: 1}
    #
    # 10K trials:
    # {36: 1, 37: 3, 38: 6, 39: 27, 40: 66, 41: 112, 42: 213, 43: 317, 44: 430, 45: 553, 46: 609, 47: 689, 48: 695,
    #  49: 702, 50: 666, 51: 610, 52: 570, 53: 534, 54: 463, 55: 396, 56: 346, 57: 298, 58: 272, 59: 216, 60: 196,
    #  61: 176, 62: 137, 63: 138, 64: 89, 65: 75, 66: 62, 67: 48, 68: 44, 69: 51, 70: 31, 71: 33, 72: 20, 73: 18,
    #  74: 17, 75: 11, 76: 10, 77: 14, 78: 9, 79: 5, 80: 6, 81: 2, 82: 3, 83: 3, 84: 3, 87: 2, 91: 1, 94: 1, 97: 1}

    # Explicit lists keep the call independent of whether keys()/values()
    # return lists or views.
    plt.bar(list(norm_dist.keys()), list(norm_dist.values()), color='r')
    plt.ylabel('fraction of total trials corresponding to each score')
    plt.xlabel('score')
    plt.title('generate_null_distribution, {} trials'.format(n_trials))
    plt.show()
def q1():
    """Print the local and then the global alignment of the two eyeless proteins.

    The human and fruit fly sequences and the PAM50 scoring matrix are loaded
    through ``data``; alignment matrices and alignments come from ``soln``.
    Nothing is returned.
    """
    human_seq = data.read_protein(data.HUMAN_EYELESS_URL)
    fly_seq = data.read_protein(data.FRUITFLY_EYELESS_URL)
    pam50 = data.read_scoring_matrix(data.PAM50_URL)

    # Local alignment (alignment matrix built with the flag set to False).
    local_matrix = soln.compute_alignment_matrix(human_seq, fly_seq, pam50, False)
    local_result = soln.compute_local_alignment(human_seq, fly_seq, pam50, local_matrix)
    print(local_result)

    # Global alignment (alignment matrix built with the flag set to True).
    global_matrix = soln.compute_alignment_matrix(human_seq, fly_seq, pam50, True)
    global_result = soln.compute_global_alignment(human_seq, fly_seq, pam50, global_matrix)
    print(global_result)

    # Recorded answers, kept for reference only; nothing below is used.
    # NOTE(review): presumably the output of an earlier run - confirm.

    # local answer
    recorded_local = (
        875,
        'HSGVNQLGGVFVNGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATPEVVSKIAQYKRECPSIFAWEIRDRLLSEGVCTNDNIPSVSSINRVLRNLASEK-QQ',
        'HSGVNQLGGVFVGGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATAEVVSKISQYKRECPSIFAWEIRDRLLQENVCTNDNIPSVSSINRVLRNLAAQKEQQ',
    )

    # global answer
    recorded_global = (
        4,
        'MQN--------------------------------------S--------------HSGVNQLGGVFVNGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATPEVVSKIAQYKRECPSIFAWEIRDRLLSEGVCTNDNIPSVSSINRVLRNLASEK-QQ--------------------------------------------M------------GA----DG-----MYDKLRMLN-------G--Q----T---G-S---WGTR---P----G------------W----YPG----T--------------SV---------------P---------G-Q---P--T-------Q-DGCQQ-QE-G-G-GENTNSISSN-GEDSDEAQMRLQLKRKLQRNRTSFTQEQIEALEKEFERTHYPDVFARERLAAKIDLPEARIQVWFSNRRAKWRREEKLRNQRR--Q-----A-----S---N-T--P------SH-I------P----I---SS-S-FSTSVYQP-----I--PQ-PT-TP-V-SSFTSGSMLGR-T-D-----T--AL-T----NT-Y--S-------AL-P---P-M---P-SF-TM-AN--N--LPM-Q------P-P------V-----PS----Q---T-SS-YSC-M-L---PTSPS----V--N-GR--------------------S-YD--T-YT--PPHM------Q-------------T--H-M--NS-Q-P-MGTS--GTT-STGL----ISPGV-S---V----P--VQ-V-P----G-S---EPDMSQ------YWPRLQ',
        'MRNLPCLGTAGGSGLGGIAGKPSPTMEAVEASTASHPHSTSSYFATTYYHLTDDECHSGVNQLGGVFVGGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATAEVVSKISQYKRECPSIFAWEIRDRLLQENVCTNDNIPSVSSINRVLRNLAAQKEQQSTGSGSSSTSAGNSISAKVSVSIGGNVSNVASGSRGTLSSSTDLMQTATPLNSSESGGASNSGEGSEQEAIYEKLRLLNTQHAAGPGPLEPARAAPLVGQSPNHLGTRSSHPQLVHGNHQALQQHQQQSWPPRHYSGSWYPTSLSEIPISSAPNIASVTAYASGPSLAHSLSPPNDIESLASIGHQRNCPVATEDIHLKKELDG-HQSDETGSGEGENSNGGASNIG-NTEDDQARLILKRKLQRNRTSFTNDQIDSLEKEFERTHYPDVFARERLAGKIGLPEARIQVWFSNRRAKWRREEKLRNQRRTPNSTGASATSSSTSATASLTDSPNSLSACSSLLSGSAGGPSVSTINGLSSPSTLSTNVNAPTLGAGIDSSESPTPIPHIRPSCTSDNDNGRQSEDCRRVCSPCPLGVGGHQNTHHIQSNGHAQGHALVPAISPRLNFNSGSFGAMYSNMHHTALSMSDSYGAVTPIPSFNHSAVGPLAPPSPIPQQGDLTPSSLYPCHMTLRPPPMAPAHHHIVPGDGGRPAGVGLGSGQSANLGASCSGSGYEVLSAYALPPPPMASSSAADSSFSAASSASANVTPHHTIAQESCPSPCSSASHFGVAHSSGFSSDPISPAVSSYAHMSYNYASSANTMTPSSASGTSAHVAPGKQQFFASCFYSPWV-',
    )
def q5():
    """Print mean, standard deviation and z-score for the local alignment score.

    The mean and (population) standard deviation are taken over a recorded
    null distribution that maps each score to the number of trials which
    produced it.  The local alignment score of the human and fruit fly eyeless
    proteins is then computed and expressed as a z-score against those
    statistics.  Nothing is returned.
    """
    # Recorded null distribution: score -> number of trials with that score.
    dist2 = {38: 3, 40: 7, 41: 13, 42: 23, 43: 27, 44: 29, 45: 45, 46: 74, 47: 69, 48: 72, 49: 71, 50: 65, 51: 52,
             52: 64, 53: 57, 54: 49, 55: 43, 56: 28, 57: 40, 58: 31, 59: 17, 60: 18, 61: 14, 62: 12, 63: 9, 64: 9,
             65: 4, 66: 10, 67: 8, 68: 5, 69: 6, 70: 2, 71: 3, 72: 4, 73: 1, 75: 3, 77: 4, 79: 2, 80: 1, 81: 1, 82: 2,
             84: 1, 85: 1, 93: 1}

    # calculate mean/standard deviation
    # Each score has to be weighted by how many trials produced it; taking the
    # statistics of the distinct keys alone would count a score seen once as
    # heavily as one seen 74 times.  Float arrays avoid integer arithmetic.
    ordered_scores = sorted(dist2)
    score_values = numpy.array(ordered_scores, dtype=float)
    trial_counts = numpy.array([dist2[score] for score in ordered_scores], dtype=float)
    avg = numpy.average(score_values, weights=trial_counts)
    variance = numpy.average((score_values - avg) ** 2, weights=trial_counts)
    std_d = numpy.sqrt(variance)

    # The doubled spaces below reproduce what the original two-item print
    # statements wrote (a literal ending in a space plus the separator).
    print("mean: %s" % (avg,))
    print("standard deviation:  %s" % (std_d,))

    # get local scores
    human = data.read_protein(data.HUMAN_EYELESS_URL)
    fly = data.read_protein(data.FRUITFLY_EYELESS_URL)
    scores = data.read_scoring_matrix(data.PAM50_URL)

    # get local alignment of human and fly; only the score (item 0) is needed
    a_matrix = soln.compute_alignment_matrix(human, fly, scores, False)
    l_score = soln.compute_local_alignment(human, fly, scores, a_matrix)[0]

    print("local score:  %s" % (l_score,))
    print("z score:  %s" % ((l_score - avg) / std_d,))