예제 #1
0
     chicken_all_vals)  = plot_fore_control(outfile, protein, 
                                            chicken_fore, 
                                            chicken_control, 
                                            chicken_all,
                                            chicken_host_freqs, 
                                            human_host_freqs)
     human_control_pval = utils.my_wil_rank_sum_gtr(human_fore_vals,
                                                    human_control_vals,
                                                    human_all_vals)
     chicken_control_pval = utils.my_wil_rank_sum_gtr(chicken_fore_vals,
                                                    chicken_control_vals,
                                                    chicken_all_vals)
    # For the proteins listed here the foreground values are passed to
    # utils_stats.wilcox_gtr as the first sample and the summary is tagged
    # 'fore gtr'; for every other protein the control values go first and
    # the summary is tagged 'ctrl gtr'.
    fore_is_first = protein in ('polymerase PB1', 'neuraminidase',
                                'polymerase PB2', 'nonstructural protein 2',
                                'polymerase PA', 'matrix protein 2')
    if fore_is_first:
        pval_tag = 'fore gtr'
        human_first, human_second = human_fore_vals, human_control_vals
        chicken_first, chicken_second = chicken_fore_vals, chicken_control_vals
    else:
        pval_tag = 'ctrl gtr'
        human_first, human_second = human_control_vals, human_fore_vals
        chicken_first, chicken_second = chicken_control_vals, chicken_fore_vals

    # Each summary reads "<tag> <len(first)> <len(second)> <test result>".
    # The human test runs before the chicken test, as in the original code.
    human_pval = ' '.join([
        pval_tag,
        str(len(human_first)),
        str(len(human_second)),
        str(utils_stats.wilcox_gtr(human_first, human_second)),
    ])
    chicken_result = utils_stats.wilcox_gtr(chicken_first, chicken_second)
    if fore_is_first:
        # Only the foreground-first branch binds `p` and echoes it.
        # NOTE(review): the 'yo' line looks like leftover debug output; it is
        # kept so the program's output stays unchanged.
        p = chicken_result
        print('pval %s yo' % (p,))
    chicken_pval = ' '.join([
        pval_tag,
        str(len(chicken_first)),
        str(len(chicken_second)),
        str(chicken_result),
    ])

    # Single-argument print(...) produces the same text on Python 2 and 3.
    print('human pval %s %s' % (protein, human_pval))
    print('chicken pval %s %s' % (protein, chicken_pval))

    # Record this protein's (foreground, control) pair for each host.
    masters_human.append((human_fore, human_control))
    masters_chicken.append((chicken_fore, chicken_control))
# for p1,p2 in itertools.combinations(masters_human, 2):
#     print len(utils_graph.intersectLists([p1[0],p2[1]])), len(utils_graph.intersectLists([p2[0],p1[1]]))
예제 #2
0
import utils_stats
# [count, sequence] pairs for the LIG_SH2_STAT5 motif.
# NOTE(review): the counts are presumably occurrences in viral sequences --
# confirm against whatever produced this list.
elm_ls = [[1, 'YIIK'], [1, 'YIVK'], [1, 'YLDK'], [1, 'YTIR'], [2, 'YLMA'], [3, 'YLLV'], [5, 'YIEG'], [10, 'YVNT'], [15, 'YTID'], [28, 'YVSM'], [129, 'YLLA'], [143, 'YLLT'], [260, 'YTLD'], [266, 'YINT'], [271, 'YVRT'], [273, 'YCVL'], [274, 'YLEK'], [275, 'YFTA'], [275, 'YIMK'], [277, 'YVDG']]
# A listed sequence belongs to the "virus" group only if its count exceeds this.
cut = 200

# Sequence -> count lookup; the first occurrence wins, matching the
# first-match-then-break scan this replaces.
listed_counts = {}
for listed_count, listed_seq in elm_ls:
    listed_counts.setdefault(listed_seq, listed_count)

virus = []
nonvirus = []
found_seqs = set()
# Each row holds four tab-separated fields: motif, sequence, count, fraction.
with open('results/elmdict_Gallus_gallus.txt') as f:
    for line in f:
        elm, seq, _row_count, frac_st = line.strip().split('\t')
        if elm != 'LIG_SH2_STAT5':
            continue
        if seq in listed_counts:
            found_seqs.add(seq)
        # Listed sequences above the cutoff go to `virus`; listed sequences at
        # or below it, and sequences not listed at all, go to `nonvirus`.
        if seq in listed_counts and listed_counts[seq] > cut:
            virus.append(float(frac_st))
        else:
            nonvirus.append(float(frac_st))

# Above-cutoff sequences that never appeared in the file count as 0.0.
# NOTE(review): listed sequences at or below the cutoff that are absent from
# the file contribute nothing to `nonvirus` -- confirm this is intended.
for count, seq in elm_ls:
    if seq not in found_seqs and count > cut:
        virus.append(0.0)

# Single-argument print(...) is valid on both Python 2 and Python 3.
print(utils_stats.wilcox_gtr(virus, nonvirus))
                                             float(0.1))

# Split the host's per-sequence values into two groups:
#   virus_like -- the (motif, sequence) pair also occurs in flu2dict[virus]
#   non_virus  -- everything else
# species2dict and flu2dict are defined outside this section and are indexed
# as [organism][motif][sequence].
# NOTE(review): the stored values are presumably frequencies -- confirm.
virus_like = []
non_virus = []
host = 'H_sapiens'
virus = 'HIV'
for elm in species2dict[host]:
    host_seqs = species2dict[host][elm]
    if elm not in flu2dict[virus]:
        # Motif absent from the virus: every host sequence is non-viral.
        for seq in host_seqs:
            non_virus.append(host_seqs[seq])
        continue
    viral_seqs = flu2dict[virus][elm]
    for seq in host_seqs:
        if seq in viral_seqs:
            virus_like.append(host_seqs[seq])
        else:
            non_virus.append(host_seqs[seq])

# print(...) with a single argument gives the same output on Python 2 and 3.
print(utils_stats.wilcox_gtr(virus_like, non_virus))

# Dump each group, one "blank<TAB>value" row per entry.
with open('virus', 'w') as f:
    f.writelines('blank\t' + str(item) + '\n' for item in virus_like)
with open('nonvirus', 'w') as f:
    f.writelines('blank\t' + str(item) + '\n' for item in non_virus)

print('%d %d' % (len(virus_like), len(non_virus)))

    
예제 #4
0
            # Host sequences that were matched by one of this protein's
            # sequences for the current motif.
            found_seqs = {}
            for seq in elm_counts[protein][elm]:
                # A sequence whose count exceeds 90% of protein_count feeds
                # virus_freqs; every other sequence feeds non_virus_freqs.
                share_of_protein = float(elm_counts[protein][elm][seq])/protein_count
                if share_of_protein > 0.9:
                    target_freqs = virus_freqs
                else:
                    target_freqs = non_virus_freqs
                # Use the host value when the host has this sequence, else 0.
                if seq in host_freqs[elm]:
                    target_freqs.append(host_freqs[elm][seq])
                    found_seqs[seq] = True
                else:
                    target_freqs.append(float(0))
            # Host sequences that were never matched above are non-viral.
            for seq in host_freqs[elm]:
                if seq not in found_seqs:
                    non_virus_freqs.append(host_freqs[elm][seq])
            # A row is emitted only when both groups hold more than two
            # values; otherwise nothing is written for this protein/motif.
            if len(virus_freqs) > 2 and len(non_virus_freqs) > 2:
                gtr_result = utils_stats.wilcox_gtr(virus_freqs, non_virus_freqs)
                less_result = utils_stats.wilcox_less(virus_freqs, non_virus_freqs)
                lines += '\t'.join([protein, elm, str(gtr_result), str(less_result)]) + '\n'
# Write all accumulated tab-separated rows to the output file in one call.
with open(ofile, 'w') as out_handle:
    out_handle.write(lines)
        




예제 #5
0
                                virus_freqs.append(host_freqs[elm][seq])
                                found_seqs[seq] = True
                            else:
                                virus_freqs.append(float(0))
                        else:
                            if seq in host_freqs[elm]:
                                non_virus_freqs.append(host_freqs[elm][seq])
                                found_seqs[seq] = True
                            else:
                                non_virus_freqs.append(float(0))
                    # Host sequences not recorded in found_seqs count as
                    # non-viral.
                    non_virus_freqs.extend(
                        host_freqs[elm][seq]
                        for seq in host_freqs[elm]
                        if seq not in found_seqs
                    )
                    # A row is emitted only when both groups hold more than
                    # two values; otherwise this protein/motif is skipped.
                    if len(virus_freqs) > 2 and len(non_virus_freqs) > 2:
                        gtr_result = utils_stats.wilcox_gtr(virus_freqs, non_virus_freqs)
                        less_result = utils_stats.wilcox_less(virus_freqs, non_virus_freqs)
                        lines += "\t".join(
                            [protein, elm, str(gtr_result), str(less_result)]
                        ) + "\n"
#             #else:
#             #    line = protein + '\t'+ elm + '\t'+ 'NO_DATA(' + str(len(virus_freqs)) + ',' + str(len(non_virus_freqs)) + ')'
#             #lines += line + '\n'
# Write all accumulated tab-separated rows to the output file in one call.
with open(ofile, "w") as out_handle:
    out_handle.write(lines)