예제 #1
0
    return True

# Positional command-line arguments: the results directory and suffix that
# are handed to utils.count_host_elmSeqs, and the path of the ELM list file.
results_dir, suffix, elmfile = sys.argv[1], sys.argv[2], sys.argv[3]

# Collect the ELM names from the first column of the list file.  Every line
# must hold exactly two tab-separated fields; the second one is ignored.
elms = {}
with open(elmfile) as elm_handle:
    for record in elm_handle:
        name, _ignored = record.strip().split('\t')
        elms[name] = True

# Handle each ELM on its own: count it per test genome, turn the counts into
# per-host distributions, compute utils.jensen_shannon_dists for every pair
# of hosts, and print the ELM when check_phylogeny accepts the pair table.
for elm in elms:
    counts = utils.count_host_elmSeqs(global_settings.TEST_GENOMES,
                                      False, {},
                                      results_dir, {elm: True}, suffix)

    # Union of every key found under any host (dict used as a set).
    all_elmSeqs = {}
    for host in counts:
        seqs_for_host = counts[host]
        for elmSeq in seqs_for_host:
            all_elmSeqs[elmSeq] = True

    host_vecs = utils.mk_count_vecs(counts, all_elmSeqs)
    host_dists = utils.mk_count_dists(host_vecs)

    # Symmetric table: the same value is stored under [a][b] and [b][a].
    js = defaultdict(dict)
    for first, second in itertools.combinations(host_dists, 2):
        distance = utils.jensen_shannon_dists(host_dists[first],
                                              host_dists[second])
        js[first][second] = js[second][first] = distance

    if check_phylogeny(js):
        print(elm)
    #         (protein, st, stp,
    #          elm, seq, junk) = line.strip().split('\t')
    #         elmSeq = elm + ':' + seq
    #         all_elmSeqs[elmSeq] = True
    #         flu_counts[flu][elmSeq] += 1

# Load the per-host counts from the roundup_all result files, keyed by
# "<elm>:<seq>", and register every key in all_elmSeqs.
for host in hosts:
    tally = defaultdict(utils.init_zero)
    host_counts[host] = tally
    with open('results/roundup_all/elmdict_' + host + '.init') as handle:
        for row in handle:
            # Exactly four tab-separated fields per line; fq is not used.
            elm, seq, count, fq = row.strip().split('\t')
            elmSeq = elm + ':' + seq
            all_elmSeqs[elmSeq] = True
            tally[elmSeq] += int(count)

# Build the count vectors over the shared key set and pass them through
# utils.mk_count_dists, flu groups first, hosts second.
flu_vecs = utils.mk_count_vecs(flu_counts, all_elmSeqs)
flu_dists = utils.mk_count_dists(flu_vecs)
host_vecs = utils.mk_count_vecs(host_counts, all_elmSeqs)
host_dists = utils.mk_count_dists(host_vecs)

def print_it(name, vec):
    """Print *name* and count_0s(vec) / len(vec), separated by a space.

    count_0s is defined elsewhere (presumably it counts the zero entries
    of *vec* -- confirm).  An empty *vec* raises ZeroDivisionError.
    """
    ratio = float(count_0s(vec)) / float(len(vec))
    print('%s %s' % (name, ratio))

# Call print_it for both flu groups and both host genomes, in this order.
# The vector is looked up inside the loop so that earlier rows are still
# printed if a later key is missing.
for label, vectors, key in (('chicken_flu', flu_vecs, 'chicken'),
                            ('human flu', flu_vecs, 'human'),
                            ('H sapiens', host_vecs, 'H_sapiens'),
                            ('Gallus gallus', host_vecs, 'Gallus_gallus')):
    print_it(label, vectors[key])

# Print the three largest entries of the chicken flu distribution, smallest
# of the three first.  A sorted copy is used instead of an in-place sort so
# that flu_dists['chicken'] keeps its element order for any positional use.
# Fewer than three entries raises IndexError.
chicken_sorted = sorted(flu_dists['chicken'])
print('%s %s %s' % (chicken_sorted[-3],
                    chicken_sorted[-2],
                    chicken_sorted[-1]))
flu_counts['chicken'] = utils.count_flu(new_chicken_counts, all_elmSeqs)

# Re-count the hosts from the Jun24 run files, crediting each host
# "<elm>:<seq>" count to every flu key it maps to in
# host_flu_elmSeq_mapping.
for host in hosts:
    per_host = defaultdict(utils.init_zero)
    host_counts[host] = per_host
    with open('working/runs/Jun24/elmdict_' + host + '.init') as handle:
        for row in handle:
            # Exactly four tab-separated fields per line; fq is not used.
            elm, seq, count, fq = row.strip().split('\t')
            elmSeq = elm + ':' + seq
            if elmSeq not in host_flu_elmSeq_mapping:
                # Unmapped keys are skipped: they are neither counted
                # under their own key nor added to all_elmSeqs.
                continue
            for a_flu_elmSeq in host_flu_elmSeq_mapping[elmSeq]:
                per_host[a_flu_elmSeq] += int(count)

# Rebuild vectors and distributions, this time over the keys in `mapped`.
flu_vecs = utils.mk_count_vecs(flu_counts, mapped)
flu_dists = utils.mk_count_dists(flu_vecs)
host_vecs = utils.mk_count_vecs(host_counts, mapped)
host_dists = utils.mk_count_dists(host_vecs)

# Store and print utils.jensen_shannon_dists for every (host, flu) pair,
# one "host flu value" line per pair.
js_distances = defaultdict(dict)
for host in hosts:
    host_dist = host_dists[host]
    for flu in flus:
        js_dis = utils.jensen_shannon_dists(host_dist, flu_dists[flu])
        js_distances[host][flu] = js_dis
        print('%s %s %s' % (host, flu, js_dis))

def print_it(name, vec):
    """Print *name* and count_0s(vec) / len(vec), separated by a space.

    count_0s is defined elsewhere (presumably it counts the zero entries
    of *vec* -- confirm).  An empty *vec* raises ZeroDivisionError.
    """
    ratio = float(count_0s(vec)) / float(len(vec))
    print('%s %s' % (name, ratio))
예제 #4
0
    seen_elms = defaultdict(dict)
    for host in ls_of_hosts:
        host_elmCounts[host] = defaultdict(utils.init_zero)
        with open(results_dir + 'elmdict_' + host + '.redo') as f:
            for line in f:
                (elm, seq, count, fq) = line.strip().split('\t')
                if elm in use_elms:
                    host_elmCounts[host][elm] += int(count)
                    seen_elms[elm][host] = True
    #use_elms = {}
    #for elm in seen_elms:
#        if len(seen_elms[elm]) == len(ls_of_hosts):
    #    use_elms[elm] = True
   # print len(use_elms)
    return (host_elmCounts, use_elms)

# Read the ELMs to use: the first column of each line of use_elms_file.
# Every line must hold exactly two tab-separated fields; the second one is
# ignored.
use_elms = {}
with open(use_elms_file) as handle:
    for row in handle:
        name, _rest = row.strip().split('\t')
        use_elms[name] = True

hosts = global_settings.GENOMES

# use_freqs == 'T' selects get_host_freqs; any other value selects
# get_host_counts.  Both are called with the same arguments.
load_host_data = get_host_freqs if use_freqs == 'T' else get_host_counts
host_elmCounts, elms = load_host_data(hosts, use_elms)

# Vectors over the returned ELM set -> distributions -> plot to out_file.
host_vecs = utils.mk_count_vecs(host_elmCounts, elms)
host_dists = utils.mk_count_dists(host_vecs)
utils_plot.phylogeny_js(out_file, host_dists)
예제 #5
0
    seen_seqs_ls.append(use_seqs)

# TODO: remove seqs seen fewer than 10 times (unfinished; see the commented-out loop below)
#for flu in flu_counts:
#    for elmSeq in 

# Union of the sequence collections gathered so far in seen_seqs_ls.
use_seqs_pre = utils_graph.unionLists(seen_seqs_ls)

# Host counts for the two genomes, requested with {use_elm: True} from the
# Jun29 working directory.
chicken_host, human_host = 'Gallus_gallus', 'H_sapiens'
counts = utils.count_host_elmSeqs((chicken_host, human_host),
                                  False, {},
                                  'working/Jun29/', {use_elm: True},
                                  '.init')

# Extend the union with whatever was counted for either host, then build
# host and flu vectors/distributions over that common sequence set.
use_seqs = utils_graph.unionLists([use_seqs_pre,
                                   counts[chicken_host],
                                   counts[human_host]])
host_vecs = utils.mk_count_vecs(counts, use_seqs)
host_dists = utils.mk_count_dists(host_vecs)
flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs)
flu_dists = utils.mk_count_dists(flu_vecs)

# for flu in flu_dists:
#     print sum(flu_dists[flu])
# sys.exit(0)

# Print one tab-separated row per sequence.  Columns, in order:
#   sequence, Gallus_gallus host value, H_sapiens host value,
#   human flu value, <empty column>, chicken flu value.
# The loop variables are named after the data they actually receive.
# NOTE(review): host columns are chicken-first while flu columns are
# human-first -- confirm that downstream readers expect this order.
# NOTE(review): the empty column (a doubled tab) is kept so the output stays
# byte-identical for existing consumers; it looks unintentional.
for seq, host_chicken, host_human, flu_human, flu_chicken in zip(
        use_seqs,
        host_dists['Gallus_gallus'],
        host_dists['H_sapiens'],
        flu_dists['human'],
        flu_dists['chicken']):
    print('\t'.join([seq,
                     str(host_chicken),
                     str(host_human),
                     str(flu_human),
                     '',
                     str(flu_chicken)]))
for host in host_dists:
    for flu in flu_dists:
예제 #6
0
""" What is the Jensen-Shannon distance
    between two flu groups?
"""
import utils, sys, utils_graph

# The two input files, one per flu group, are given on the command line.
flu_group_1_file = sys.argv[1]
flu_group_2_file = sys.argv[2]

flu_counts = {}
seen_elmSeqs = {}
seen_seqs_ls = []
# The loop variable is `path`, not `file`, so the Python 2 built-in `file`
# type is not shadowed at module level.
for name, path in (('g1', flu_group_1_file),
                   ('g2', flu_group_2_file)):
    # count_flu_sampled is expected to fill flu_counts and seen_elmSeqs
    # under `name`; seen_elmSeqs[name] is read right after the call.
    utils.count_flu_sampled(name, path,
                            flu_counts,
                            seen_elmSeqs, {}, False)
    seen_seqs_ls.append(seen_elmSeqs[name])

# Compare the two groups over the union of everything seen in either one.
use_elmSeqs = utils_graph.unionLists(seen_seqs_ls)
flu_vecs = utils.mk_count_vecs(flu_counts, use_elmSeqs)
flu_dists = utils.mk_count_dists(flu_vecs)
js_dis = utils.jensen_shannon_dists(flu_dists['g1'],
                                    flu_dists['g2'])
print(js_dis)