Пример #1
0
def q2():
    def q2_helper(str1, str2):
        if len(str1) != len(str2):
            return 0
        else:
            ctr = 0
            for char in range(len(str1)):
                if str1[char] == str2[char]:
                    ctr += 1

        return (ctr * 1.0 / len(str1)) * 100

    human = data.read_protein(data.HUMAN_EYELESS_URL)
    fly = data.read_protein(data.FRUITFLY_EYELESS_URL)
    scores = data.read_scoring_matrix(data.PAM50_URL)
    c_pax = data.read_protein(data.CONSENSUS_PAX_URL)

    # get local alignment of human and fly
    a_matrix = soln.compute_alignment_matrix(human, fly, scores, False)
    l_score, l_h, l_ff = soln.compute_local_alignment(human, fly, scores, a_matrix)

    # removing the dashes
    l_h = l_h.replace("-", "")
    l_ff = l_ff.replace("-", "")

    # get global alignment matrix for each local string and pax
    pax_a_h_matrix = soln.compute_alignment_matrix(l_h, c_pax, scores, True)
    pax_a_ff_matrix = soln.compute_alignment_matrix(l_ff, c_pax, scores, True)

    # compute global alignment
    h_ga = soln.compute_global_alignment(l_h, c_pax, scores, pax_a_h_matrix)
    ff_ga = soln.compute_global_alignment(l_ff, c_pax, scores, pax_a_ff_matrix)

    print "human:\t\t", q2_helper(h_ga[1], h_ga[2])
    print "fruit fly:\t", q2_helper(ff_ga[1], ff_ga[2])
Пример #2
0
def q4():
    norm_dist = {}

    human = data.read_protein(data.HUMAN_EYELESS_URL)
    fly = data.read_protein(data.FRUITFLY_EYELESS_URL)
    scores = data.read_scoring_matrix(data.PAM50_URL)

    n_trials = 10000
    null_dist = generate_null_distribution(human, fly, scores, n_trials)
    print null_dist

    # normalize the distribution
    total = sum(null_dist.values())
    for score in null_dist:
        norm_dist[score] = null_dist[score] * 1.0 / total

    print norm_dist

    # dist = {37: 1, 38: 1, 40: 2, 41: 12, 42: 17, 43: 29, 44: 35, 45: 51, 46: 65, 47: 73, 48: 68, 49: 67, 50: 74, 51: 73,
    # 52: 65, 53: 41, 54: 50, 55: 37, 56: 36, 57: 32, 58: 29, 59: 37, 60: 16, 61: 18, 62: 17, 63: 11, 64: 2,
    # 65: 9, 66: 7, 67: 5, 68: 3, 69: 1, 70: 2, 71: 1, 72: 2, 73: 3, 75: 1, 76: 1, 77: 2, 78: 2, 82: 1, 85: 1}

    # dist2 = {38: 3, 40: 7, 41: 13, 42: 23, 43: 27, 44: 29, 45: 45, 46: 74, 47: 69, 48: 72, 49: 71, 50: 65, 51: 52,
    # 52: 64, 53: 57, 54: 49, 55: 43, 56: 28, 57: 40, 58: 31, 59: 17, 60: 18, 61: 14, 62: 12, 63: 9, 64: 9,
    #          65: 4, 66: 10, 67: 8, 68: 5, 69: 6, 70: 2, 71: 3, 72: 4, 73: 1, 75: 3, 77: 4, 79: 2, 80: 1, 81: 1, 82: 2,
    #          84: 1, 85: 1, 93: 1}

    # 10K trials
    # {36: 1, 37: 3, 38: 6, 39: 27, 40: 66, 41: 112, 42: 213, 43: 317, 44: 430, 45: 553, 46: 609, 47: 689, 48: 695, 49: 702, 50: 666, 51: 610, 52: 570, 53: 534, 54: 463, 55: 396, 56: 346, 57: 298, 58: 272, 59: 216, 60: 196, 61: 176, 62: 137, 63: 138, 64: 89, 65: 75, 66: 62, 67: 48, 68: 44, 69: 51, 70: 31, 71: 33, 72: 20, 73: 18, 74: 17, 75: 11, 76: 10, 77: 14, 78: 9, 79: 5, 80: 6, 81: 2, 82: 3, 83: 3, 84: 3, 87: 2, 91: 1, 94: 1, 97: 1}
    # {36: 0.0001, 37: 0.0003, 38: 0.0006, 39: 0.0027, 40: 0.0066, 41: 0.0112, 42: 0.0213, 43: 0.0317, 44: 0.043, 45: 0.0553, 46: 0.0609, 47: 0.0689, 48: 0.0695, 49: 0.0702, 50: 0.0666, 51: 0.061, 52: 0.057, 53: 0.0534, 54: 0.0463, 55: 0.0396, 56: 0.0346, 57: 0.0298, 58: 0.0272, 59: 0.0216, 60: 0.0196, 61: 0.0176, 62: 0.0137, 63: 0.0138, 64: 0.0089, 65: 0.0075, 66: 0.0062, 67: 0.0048, 68: 0.0044, 69: 0.0051, 70: 0.0031, 71: 0.0033, 72: 0.002, 73: 0.0018, 74: 0.0017, 75: 0.0011, 76: 0.001, 77: 0.0014, 78: 0.0009, 79: 0.0005, 80: 0.0006, 81: 0.0002, 82: 0.0003, 83: 0.0003, 84: 0.0003, 87: 0.0002, 91: 0.0001, 94: 0.0001, 97: 0.0001}

    # norm_dist = {37: 0.001, 38: 0.001, 40: 0.002, 41: 0.012, 42: 0.017, 43: 0.029, 44: 0.035, 45: 0.051, 46: 0.065,
    #              47: 0.073, 48: 0.068, 49: 0.067, 50: 0.074, 51: 0.073, 52: 0.065, 53: 0.041, 54: 0.05, 55: 0.037,
    #              56: 0.036, 57: 0.032, 58: 0.029, 59: 0.037, 60: 0.016, 61: 0.018, 62: 0.017, 63: 0.011, 64: 0.002,
    #              65: 0.009, 66: 0.007, 67: 0.005, 68: 0.003, 69: 0.001, 70: 0.002, 71: 0.001, 72: 0.002, 73: 0.003,
    #              75: 0.001, 76: 0.001, 77: 0.002, 78: 0.002, 82: 0.001, 85: 0.001}

    plt.bar(norm_dist.keys(), norm_dist.values(), color='r')
    plt.ylabel('fraction of total trials corresponding to each score')
    plt.xlabel('score')
    plt.title('generate_null_distribution, {} trials'.format(n_trials))
    plt.show()
Пример #3
0
def q1():
    human = data.read_protein(data.HUMAN_EYELESS_URL)
    fly = data.read_protein(data.FRUITFLY_EYELESS_URL)
    scores = data.read_scoring_matrix(data.PAM50_URL)

    a_matrix = soln.compute_alignment_matrix(human, fly, scores, False)
    print soln.compute_local_alignment(human, fly, scores, a_matrix)

    a_matrix = soln.compute_alignment_matrix(human, fly, scores, True)
    print soln.compute_global_alignment(human, fly, scores, a_matrix)

    # local answer
    b = (875,
         'HSGVNQLGGVFVNGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATPEVVSKIAQYKRECPSIFAWEIRDRLLSEGVCTNDNIPSVSSINRVLRNLASEK-QQ',
         'HSGVNQLGGVFVGGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATAEVVSKISQYKRECPSIFAWEIRDRLLQENVCTNDNIPSVSSINRVLRNLAAQKEQQ')

    # global answer
    a = (4,
         'MQN--------------------------------------S--------------HSGVNQLGGVFVNGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATPEVVSKIAQYKRECPSIFAWEIRDRLLSEGVCTNDNIPSVSSINRVLRNLASEK-QQ--------------------------------------------M------------GA----DG-----MYDKLRMLN-------G--Q----T---G-S---WGTR---P----G------------W----YPG----T--------------SV---------------P---------G-Q---P--T-------Q-DGCQQ-QE-G-G-GENTNSISSN-GEDSDEAQMRLQLKRKLQRNRTSFTQEQIEALEKEFERTHYPDVFARERLAAKIDLPEARIQVWFSNRRAKWRREEKLRNQRR--Q-----A-----S---N-T--P------SH-I------P----I---SS-S-FSTSVYQP-----I--PQ-PT-TP-V-SSFTSGSMLGR-T-D-----T--AL-T----NT-Y--S-------AL-P---P-M---P-SF-TM-AN--N--LPM-Q------P-P------V-----PS----Q---T-SS-YSC-M-L---PTSPS----V--N-GR--------------------S-YD--T-YT--PPHM------Q-------------T--H-M--NS-Q-P-MGTS--GTT-STGL----ISPGV-S---V----P--VQ-V-P----G-S---EPDMSQ------YWPRLQ',
         'MRNLPCLGTAGGSGLGGIAGKPSPTMEAVEASTASHPHSTSSYFATTYYHLTDDECHSGVNQLGGVFVGGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATAEVVSKISQYKRECPSIFAWEIRDRLLQENVCTNDNIPSVSSINRVLRNLAAQKEQQSTGSGSSSTSAGNSISAKVSVSIGGNVSNVASGSRGTLSSSTDLMQTATPLNSSESGGASNSGEGSEQEAIYEKLRLLNTQHAAGPGPLEPARAAPLVGQSPNHLGTRSSHPQLVHGNHQALQQHQQQSWPPRHYSGSWYPTSLSEIPISSAPNIASVTAYASGPSLAHSLSPPNDIESLASIGHQRNCPVATEDIHLKKELDG-HQSDETGSGEGENSNGGASNIG-NTEDDQARLILKRKLQRNRTSFTNDQIDSLEKEFERTHYPDVFARERLAGKIGLPEARIQVWFSNRRAKWRREEKLRNQRRTPNSTGASATSSSTSATASLTDSPNSLSACSSLLSGSAGGPSVSTINGLSSPSTLSTNVNAPTLGAGIDSSESPTPIPHIRPSCTSDNDNGRQSEDCRRVCSPCPLGVGGHQNTHHIQSNGHAQGHALVPAISPRLNFNSGSFGAMYSNMHHTALSMSDSYGAVTPIPSFNHSAVGPLAPPSPIPQQGDLTPSSLYPCHMTLRPPPMAPAHHHIVPGDGGRPAGVGLGSGQSANLGASCSGSGYEVLSAYALPPPPMASSSAADSSFSAASSASANVTPHHTIAQESCPSPCSSASHFGVAHSSGFSSDPISPAVSSYAHMSYNYASSANTMTPSSASGTSAHVAPGKQQFFASCFYSPWV-')
Пример #4
0
def q5():
    # calculate mean/standard deviation
    dist2 = {38: 3, 40: 7, 41: 13, 42: 23, 43: 27, 44: 29, 45: 45, 46: 74, 47: 69, 48: 72, 49: 71, 50: 65, 51: 52,
             52: 64, 53: 57, 54: 49, 55: 43, 56: 28, 57: 40, 58: 31, 59: 17, 60: 18, 61: 14, 62: 12, 63: 9, 64: 9,
             65: 4, 66: 10, 67: 8, 68: 5, 69: 6, 70: 2, 71: 3, 72: 4, 73: 1, 75: 3, 77: 4, 79: 2, 80: 1, 81: 1, 82: 2,
             84: 1, 85: 1, 93: 1}

    avg = numpy.mean(dist2.keys())
    std_d = numpy.std(dist2.keys())

    print "mean:", avg
    print "standard deviation: ", std_d

    # get local scores
    human = data.read_protein(data.HUMAN_EYELESS_URL)
    fly = data.read_protein(data.FRUITFLY_EYELESS_URL)
    scores = data.read_scoring_matrix(data.PAM50_URL)

    # get local alignment of human and fly
    a_matrix = soln.compute_alignment_matrix(human, fly, scores, False)
    l_score, l_h, l_ff = soln.compute_local_alignment(human, fly, scores, a_matrix)

    print "local score: ", l_score
    print "z score: ", (l_score - avg) / std_d