def phylogeny_js(out_file, dists):
    """Make a dendrogram using Jensen-Shannon divergence.

    Computes the divergence between every pair of entries in ``dists`` with
    ``utils.jensen_shannon_dists``, writes the resulting symmetric matrix and
    the host labels to temporary files, and feeds R a script that clusters
    them with ``hclust(method='average')`` and plots the tree to ``out_file``
    as a PNG.

    Args:
        out_file: Path handed to R's ``png()``. It is pasted into the R script
            between single quotes, so it must not contain one.
        dists: Mapping of host name (str) to whatever
            ``utils.jensen_shannon_dists`` accepts as a distribution.

    Raises:
        OSError: If the ``R`` executable cannot be started.

    The R script also runs ``source('funcs.R')``, which R resolves against the
    current working directory. R's exit status is not checked.
    """
    # Imported locally so the function does not depend on the file's
    # top-level import block.
    import subprocess
    import tempfile

    # Freeze the host order once: matrix rows, matrix columns and the label
    # line must all use the same ordering.
    hosts = list(dists)

    js_distances = defaultdict(dict)
    for host1, host2 in itertools.combinations(hosts, 2):
        js_dis = utils.jensen_shannon_dists(dists[host1], dists[host2])
        js_distances[host1][host2] = js_dis
        js_distances[host2][host1] = js_dis

    matrix_text = ''.join(
        '\t'.join('0' if host1 == host2 else str(js_distances[host1][host2])
                  for host2 in hosts) + '\n'
        for host1 in hosts)
    labels_text = '\t'.join(hosts) + '\n'

    tmp_paths = []

    def write_tmp(prefix, text):
        # mkstemp creates a uniquely named file, so concurrent or repeated
        # runs cannot overwrite each other's scratch data.
        fd, path = tempfile.mkstemp(prefix=prefix)
        tmp_paths.append(path)
        with os.fdopen(fd, 'w') as handle:
            handle.write(text)
        return path

    try:
        tmp_input = write_tmp('tmp_data', matrix_text)
        tmp_labels = write_tmp('labels', labels_text)
        r_script = ''.join([
            "source('funcs.R')\n",
            "library('MASS')\n",
            "d<-read.delim('" + tmp_input + "',header=FALSE,sep='\\t')\n",
            'dist.r<-as.dist(d)\n',
            "labels.d<-read.delim('" + tmp_labels + "',header=FALSE,sep='\\t')\n",
            'labels<-as.matrix(labels.d)\n',
            "h<-hclust(dist.r,method='average')\n",
            "png('" + out_file + "')\n",
            "plot(h,hang=-1,labels=labels[1,],main='Host Phylogeny')\n",
            'dev.off()\n',
        ])
        tmp_r = write_tmp('tmp_r', r_script)
        # Equivalent of `R < script --no-save`, without going through a shell.
        with open(tmp_r) as script:
            subprocess.call(['R', '--no-save'], stdin=script)
    finally:
        # Always clean up, even when writing the files or launching R fails.
        for path in tmp_paths:
            os.remove(path)
return True results_dir = sys.argv[1] suffix = sys.argv[2] elmfile = sys.argv[3] elms = {} with open(elmfile) as f: for line in f: elm, exp = line.strip().split('\t') elms[elm] = True for elm in elms: counts = utils.count_host_elmSeqs(global_settings.TEST_GENOMES, False, {}, results_dir, {elm:True}, suffix) all_elmSeqs = {} for host in counts: for elmSeq in counts[host]: all_elmSeqs[elmSeq] = True host_vecs = utils.mk_count_vecs(counts, all_elmSeqs) host_dists = utils.mk_count_dists(host_vecs) js = defaultdict(dict) for host1, host2 in itertools.combinations(host_dists, 2): dis = utils.jensen_shannon_dists(host_dists[host1], host_dists[host2]) js[host1][host2] = dis js[host2][host1] = dis if check_phylogeny(js): print elm
for line in f: (elm, seq, count, fq) = line.strip().split('\t') elmSeq = elm + ':' + seq if elmSeq in host_flu_elmSeq_mapping: for a_flu_elmSeq in host_flu_elmSeq_mapping[elmSeq]: host_counts[host][a_flu_elmSeq] += int(count) # else: # host_counts[host][elmSeq] += int(count) # all_elmSeqs[elmSeq] = True flu_vecs = utils.mk_count_vecs(flu_counts, mapped) flu_dists = utils.mk_count_dists(flu_vecs) host_vecs = utils.mk_count_vecs(host_counts, mapped) host_dists = utils.mk_count_dists(host_vecs) js_distances = defaultdict(dict) for host in hosts: for flu in flus: js_dis = utils.jensen_shannon_dists(host_dists[host], flu_dists[flu]) js_distances[host][flu] = js_dis print host, flu, js_dis def print_it(name, vec): print name, float(count_0s(vec))/float(len(vec)) print_it('chicken_flu', flu_vecs['chicken']) print_it('human flu', flu_vecs['human']) print_it('H sapiens', host_vecs['H_sapiens']) print_it('Gallus gallus', host_vecs['Gallus_gallus'])
# Host counts for the single ELM `use_elm` (bound above this excerpt).
counts = utils.count_host_elmSeqs(('Gallus_gallus', 'H_sapiens'),
                                  False, {}, 'working/Jun29/',
                                  {use_elm: True}, '.init')

# Shared key set: the previously collected sequences plus everything seen in
# either host.
use_seqs = utils_graph.unionLists([use_seqs_pre,
                                   counts['Gallus_gallus'],
                                   counts['H_sapiens']])

host_vecs = utils.mk_count_vecs(counts, use_seqs)
host_dists = utils.mk_count_dists(host_vecs)
flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs)
flu_dists = utils.mk_count_dists(flu_vecs)

# for flu in flu_dists:
#     print sum(flu_dists[flu])
# sys.exit(0)

# One tab-separated row per sequence. Column order is: sequence, chicken host
# (Gallus_gallus), human host (H_sapiens), human flu, an empty column, chicken
# flu.
# NOTE(review): the empty column comes from a doubled tab and looks
# accidental; it is kept so the output format does not change.
rows = zip(use_seqs,
           host_dists['Gallus_gallus'],
           host_dists['H_sapiens'],
           flu_dists['human'],
           flu_dists['chicken'])
for seq, chicken_host, human_host, human_flu, chicken_flu in rows:
    print('\t'.join([seq, str(chicken_host), str(human_host),
                     str(human_flu), '', str(chicken_flu)]))

# Jensen-Shannon divergence for every (host, flu) pair.
for host in host_dists:
    for flu in flu_dists:
        divergence = utils.jensen_shannon_dists(host_dists[host],
                                                flu_dists[flu])
        print('%s %s %s' % (host, flu, divergence))

# for flu in flu_counts:
#     for elmSeq in flu_counts[flu]:
#         elm, seq = elmSeq.split(':')
#         if elm == use_elm:
#             print flu, seq, flu_counts[flu][elmSeq]
# flu_elm_file = os.path.join('results', # flu + '.H5N1.elms') if "human" in flu: flu_elm_file = os.path.join("working/Jul1_year", flu + ".H3N2.2008.elms") else: flu_elm_file = os.path.join("working/Jul1_year/", flu + ".H5N1.2006.elms") utils.count_flu_sampled(flu, flu_elm_file, flu_counts, seen_seqs, {}, False) for elmseq in seen_seqs[flu]: elm, seq = elmseq.split(":") elm2seqs[elm][elmseq] = True counts = utils.count_host_elmSeqs(("Gallus_gallus", "H_sapiens"), False, {}, "working/Jun29/", working_elms, ".init") for elm in working_elms: use_seqs = elm2seqs[elm] host_vecs = utils.mk_count_vecs(counts, use_seqs) host_dists = utils.mk_count_dists(host_vecs) flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs) flu_dists = utils.mk_count_dists(flu_vecs) flu = flu_dists["human"] human_score_H = utils.jensen_shannon_dists(host_dists["H_sapiens"], flu) chicken_score_H = utils.jensen_shannon_dists(host_dists["Gallus_gallus"], flu) flu = flu_dists["chicken"] human_score_C = utils.jensen_shannon_dists(host_dists["H_sapiens"], flu) chicken_score_C = utils.jensen_shannon_dists(host_dists["Gallus_gallus"], flu) if human_score_C > chicken_score_C and human_score_H < chicken_score_H: print elm
"""
What is the Jensen-Shannon distance between 2 flu groups?

Usage: <script> <flu_group_1_file> <flu_group_2_file>
"""
import sys

import utils
import utils_graph

flu_group_1_file = sys.argv[1]
flu_group_2_file = sys.argv[2]

flu_counts = {}
seen_elmSeqs = {}
seen_seqs_ls = []

# Count each group under a fixed label and remember what was seen for it.
groups = (('g1', flu_group_1_file),
          ('g2', flu_group_2_file))
for label, group_path in groups:
    utils.count_flu_sampled(label, group_path, flu_counts,
                            seen_elmSeqs, {}, False)
    seen_seqs_ls.append(seen_elmSeqs[label])

# Both groups are vectorised over the union of what either one contains, so
# the two distributions share the same keys.
use_elmSeqs = utils_graph.unionLists(seen_seqs_ls)
flu_vecs = utils.mk_count_vecs(flu_counts, use_elmSeqs)
flu_dists = utils.mk_count_dists(flu_vecs)

js_dis = utils.jensen_shannon_dists(flu_dists['g1'], flu_dists['g2'])
print(js_dis)