Пример #1
0
def phylogeny_js(out_file, dists):
    """Make a dendrogram using Jensen-Shannon divergence.

    The pairwise distance between every two entries of ``dists`` is
    computed with ``utils.jensen_shannon_dists``.  The resulting square
    matrix and the labels are written to temporary files, and an R script
    clusters them with average-linkage ``hclust`` and plots the tree to a
    PNG.

    Args:
        out_file: path of the PNG that R writes.  It is pasted verbatim
            into a single-quoted R string, so it must not contain ``'``.
        dists: mapping of host label -> distribution accepted by
            ``utils.jensen_shannon_dists``.  Labels are joined with tabs
            and therefore must be strings.

    Returns:
        None.  The exit status of R is not checked.

    Raises:
        OSError: if the ``R`` executable cannot be started.
    """
    # Local imports: nothing else in this function's module is assumed to
    # provide these names.
    import subprocess
    import tempfile

    # Freeze the label order once so matrix rows, matrix columns and the
    # label row are guaranteed to line up.
    hosts = list(dists)

    js_distances = defaultdict(dict)
    for host1, host2 in itertools.combinations(hosts, 2):
        js_dis = utils.jensen_shannon_dists(dists[host1], dists[host2])
        js_distances[host1][host2] = js_dis
        js_distances[host2][host1] = js_dis

    tmp_paths = []
    try:
        # mkstemp gives unique names, so concurrent or repeated runs can
        # no longer overwrite each other's scratch files.
        for prefix in ('tmp_data', 'labels', 'tmp_r'):
            handle, path = tempfile.mkstemp(prefix=prefix)
            os.close(handle)
            tmp_paths.append(path)
        tmp_input, tmp_labels, tmp_r = tmp_paths

        # Symmetric distance matrix, one tab-separated row per host, with
        # a zero diagonal.
        with open(tmp_input, 'w') as f:
            for host1 in hosts:
                cells = ['0' if host1 == host2
                         else str(js_distances[host1][host2])
                         for host2 in hosts]
                f.write('\t'.join(cells) + '\n')

        with open(tmp_labels, 'w') as f:
            f.write('\t'.join(hosts) + '\n')

        with open(tmp_r, 'w') as f:
            # funcs.R is resolved by R relative to the current directory.
            f.write("source('funcs.R')\n")
            f.write("library('MASS')\n")
            f.write("d<-read.delim('"
                    + tmp_input
                    + "',header=FALSE,sep='\\t')\n")
            f.write('dist.r<-as.dist(d)\n')
            f.write("labels.d<-read.delim('"
                    + tmp_labels
                    + "',header=FALSE,sep='\\t')\n")
            f.write('labels<-as.matrix(labels.d)\n')
            f.write("h<-hclust(dist.r,method='average')\n")
            f.write("png('" + out_file + "')\n")
            f.write("plot(h,hang=-1,labels=labels[1,],"
                    "main='Host Phylogeny')\n")
            f.write('dev.off()\n')

        # Feed the script on stdin, as `R < script --no-save` did, but
        # without building a shell command line.
        with open(tmp_r) as script:
            subprocess.call(['R', '--no-save'], stdin=script)
    finally:
        # Best-effort cleanup that also runs when an earlier step raised.
        for path in tmp_paths:
            try:
                os.remove(path)
            except OSError:
                pass
Пример #2
0
    return True

results_dir = sys.argv[1]
suffix = sys.argv[2]
elmfile = sys.argv[3]

elms = {}
with open(elmfile) as f:
    for line in f:
        elm, exp = line.strip().split('\t')
        elms[elm] = True

for elm in elms:
    counts = utils.count_host_elmSeqs(global_settings.TEST_GENOMES,
                                      False, {},
                                      results_dir, {elm:True}, suffix)
    all_elmSeqs = {}
    for host in counts:
        for elmSeq in counts[host]:
            all_elmSeqs[elmSeq] = True
    host_vecs = utils.mk_count_vecs(counts, all_elmSeqs)
    host_dists = utils.mk_count_dists(host_vecs)
    js = defaultdict(dict)
    for host1, host2 in itertools.combinations(host_dists, 2):
        dis = utils.jensen_shannon_dists(host_dists[host1],
                                         host_dists[host2])
        js[host1][host2] = dis
        js[host2][host1] = dis
    if check_phylogeny(js):
        print elm
        # Each line must split into exactly four tab-separated fields; only
        # the first three (elm, seq, count) are used below.
        for line in f:
            (elm, seq, count, fq) = line.strip().split('\t')
            # Lookup key has the form "<elm>:<seq>".
            elmSeq = elm + ':' + seq
            # Add this line's count to every flu elmSeq the host elmSeq maps
            # to.  Host elmSeqs without a mapping are ignored, because the
            # fallback below is commented out.
            if elmSeq in host_flu_elmSeq_mapping:
                 for a_flu_elmSeq in host_flu_elmSeq_mapping[elmSeq]:
                     host_counts[host][a_flu_elmSeq] += int(count)
            # else:
            #     host_counts[host][elmSeq] += int(count)
            #     all_elmSeqs[elmSeq] = True

flu_vecs = utils.mk_count_vecs(flu_counts, mapped)
flu_dists = utils.mk_count_dists(flu_vecs)
host_vecs = utils.mk_count_vecs(host_counts, mapped)
host_dists = utils.mk_count_dists(host_vecs)

js_distances = defaultdict(dict)
for host in hosts:
    for flu in flus:
        js_dis = utils.jensen_shannon_dists(host_dists[host],
                                            flu_dists[flu])
        js_distances[host][flu] = js_dis
        print host, flu, js_dis

def print_it(name, vec):
    print name, float(count_0s(vec))/float(len(vec))

# One print_it line per flu and host count vector, in this fixed order.
for label, vecs, key in (
        ('chicken_flu', flu_vecs, 'chicken'),
        ('human flu', flu_vecs, 'human'),
        ('H sapiens', host_vecs, 'H_sapiens'),
        ('Gallus gallus', host_vecs, 'Gallus_gallus')):
    print_it(label, vecs[key])
Пример #4
0
# Host counts for chicken and human, restricted to the single ELM named by
# use_elm.
counts = utils.count_host_elmSeqs(
    ('Gallus_gallus', 'H_sapiens'),
    False,
    {},
    'working/Jun29/',
    {use_elm: True},
    '.init',
)

# Sequences to compare: the pre-selected ones plus everything counted for
# either host.
use_seqs = utils_graph.unionLists([
    use_seqs_pre,
    counts['Gallus_gallus'],
    counts['H_sapiens'],
])

# Count vectors and distributions over use_seqs, hosts first, then flu.
host_vecs = utils.mk_count_vecs(counts, use_seqs)
host_dists = utils.mk_count_dists(host_vecs)
flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs)
flu_dists = utils.mk_count_dists(flu_vecs)

# for flu in flu_dists:
#     print sum(flu_dists[flu])
# sys.exit(0)

for seq, hhuman, hchicken, hf, cf in zip(use_seqs, host_dists['Gallus_gallus'],
                                     host_dists['H_sapiens'],
                                     flu_dists['human'],
                                     flu_dists['chicken']):
    print seq + '\t' + str(hhuman) + '\t' + str(hchicken) + '\t' + str(hf) + '\t' + '\t' + str(cf)
for host in host_dists:
    for flu in flu_dists:
        print host, flu, utils.jensen_shannon_dists(host_dists[host],
                                                    flu_dists[flu])
# for flu in flu_counts:
#     for elmSeq in flu_counts[flu]:
#         elm, seq = elmSeq.split(':')
#         if elm == use_elm:
#             print flu, seq, flu_counts[flu][elmSeq]
Пример #5
0
    # Previous input location, kept for reference:
    # flu_elm_file = os.path.join('results',
    #                             flu + '.H5N1.elms')
    # Pick the ELM file from the flu name: names containing "human" read the
    # H3N2 2008 file, every other name reads the H5N1 2006 file.
    if "human" in flu:
        flu_elm_file = os.path.join("working/Jul1_year", flu + ".H3N2.2008.elms")
    else:
        flu_elm_file = os.path.join("working/Jul1_year/", flu + ".H5N1.2006.elms")
    # NOTE(review): presumably fills flu_counts and seen_seqs[flu] in place
    # (seen_seqs[flu] is read right below) -- confirm in utils.
    utils.count_flu_sampled(flu, flu_elm_file, flu_counts, seen_seqs, {}, False)
    # Group each seen "<elm>:<seq>" key under its ELM.  The key must contain
    # exactly one ":".  elm2seqs[elm] is used as a set (values are always True).
    for elmseq in seen_seqs[flu]:
        elm, seq = elmseq.split(":")
        elm2seqs[elm][elmseq] = True

counts = utils.count_host_elmSeqs(("Gallus_gallus", "H_sapiens"), False, {}, "working/Jun29/", working_elms, ".init")

for elm in working_elms:
    use_seqs = elm2seqs[elm]
    host_vecs = utils.mk_count_vecs(counts, use_seqs)
    host_dists = utils.mk_count_dists(host_vecs)
    flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs)
    flu_dists = utils.mk_count_dists(flu_vecs)

    flu = flu_dists["human"]
    human_score_H = utils.jensen_shannon_dists(host_dists["H_sapiens"], flu)
    chicken_score_H = utils.jensen_shannon_dists(host_dists["Gallus_gallus"], flu)

    flu = flu_dists["chicken"]
    human_score_C = utils.jensen_shannon_dists(host_dists["H_sapiens"], flu)
    chicken_score_C = utils.jensen_shannon_dists(host_dists["Gallus_gallus"], flu)

    if human_score_C > chicken_score_C and human_score_H < chicken_score_H:
        print elm
Пример #6
0
""" What is the Jensen-shannon distance
    between 2 flu groups?
"""
import utils, sys, utils_graph

flu_group_1_file = sys.argv[1]
flu_group_2_file = sys.argv[2]

flu_counts = {}
seen_elmSeqs = {}
seen_seqs_ls = []
for name, file in (('g1', flu_group_1_file),
                   ('g2', flu_group_2_file)):
    utils.count_flu_sampled(name, file,
                            flu_counts,
                            seen_elmSeqs, {}, False)
    seen_seqs_ls.append(seen_elmSeqs[name])
use_elmSeqs = utils_graph.unionLists(seen_seqs_ls)
flu_vecs = utils.mk_count_vecs(flu_counts, use_elmSeqs)
flu_dists = utils.mk_count_dists(flu_vecs)
js_dis = utils.jensen_shannon_dists(flu_dists['g1'],
                                    flu_dists['g2'])
print js_dis