return True results_dir = sys.argv[1] suffix = sys.argv[2] elmfile = sys.argv[3] elms = {} with open(elmfile) as f: for line in f: elm, exp = line.strip().split('\t') elms[elm] = True for elm in elms: counts = utils.count_host_elmSeqs(global_settings.TEST_GENOMES, False, {}, results_dir, {elm:True}, suffix) all_elmSeqs = {} for host in counts: for elmSeq in counts[host]: all_elmSeqs[elmSeq] = True host_vecs = utils.mk_count_vecs(counts, all_elmSeqs) host_dists = utils.mk_count_dists(host_vecs) js = defaultdict(dict) for host1, host2 in itertools.combinations(host_dists, 2): dis = utils.jensen_shannon_dists(host_dists[host1], host_dists[host2]) js[host1][host2] = dis js[host2][host1] = dis if check_phylogeny(js): print elm
# Commented-out remnant of an earlier flu-counting step, kept as found.
# (protein, st, stp,
#  elm, seq, junk) = line.strip().split('\t')
# elmSeq = elm + ':' + seq
# all_elmSeqs[elmSeq] = True
# flu_counts[flu][elmSeq] += 1

# NOTE(review): hosts, host_counts, flu_counts, all_elmSeqs and count_0s are
# defined before this excerpt.

# Accumulate per-host counts keyed by 'ELM:sequence'.  Every input line must
# hold four tab-separated fields: elm, seq, count, fq (fq is not used here).
for host in hosts:
    host_counts[host] = defaultdict(utils.init_zero)
    with open('results/roundup_all/elmdict_' + host + '.init') as f:
        for line in f:
            (elm, seq, count, fq) = line.strip().split('\t')
            elmSeq = elm + ':' + seq
            all_elmSeqs[elmSeq] = True
            host_counts[host][elmSeq] += int(count)

# Flu and host vectors are both built over the same key set (all_elmSeqs).
flu_vecs = utils.mk_count_vecs(flu_counts, all_elmSeqs)
flu_dists = utils.mk_count_dists(flu_vecs)
host_vecs = utils.mk_count_vecs(host_counts, all_elmSeqs)
host_dists = utils.mk_count_dists(host_vecs)


def print_it(name, vec):
    """Print `name`, a space, then count_0s(vec) divided by len(vec).

    count_0s is defined outside this excerpt (presumably it counts the zero
    entries of `vec` -- confirm).  Raises ZeroDivisionError when `vec` is
    empty.
    """
    # '%s %s' reproduces the output of the Python 2 statement `print a, b`
    # and also runs on Python 3.
    print('%s %s' % (name, float(count_0s(vec)) / float(len(vec))))


print_it('chicken_flu', flu_vecs['chicken'])
print_it('human flu', flu_vecs['human'])
print_it('H sapiens', host_vecs['H_sapiens'])
print_it('Gallus gallus', host_vecs['Gallus_gallus'])

# In-place ascending sort, then show the three largest values (smallest of the
# three first).  Raises IndexError when fewer than three values exist.
flu_dists['chicken'].sort()
print('%s %s %s' % (flu_dists['chicken'][-3],
                    flu_dists['chicken'][-2],
                    flu_dists['chicken'][-1]))
# NOTE(review): flu_counts, new_chicken_counts, all_elmSeqs, hosts, flus,
# host_counts, host_flu_elmSeq_mapping, mapped and count_0s are defined before
# this excerpt.
flu_counts['chicken'] = utils.count_flu(new_chicken_counts, all_elmSeqs)

# Re-key host counts onto flu 'ELM:sequence' keys: a host key contributes its
# count to every flu key it maps to; host keys without a mapping are dropped.
# Every input line must hold four tab-separated fields: elm, seq, count, fq.
for host in hosts:
    host_counts[host] = defaultdict(utils.init_zero)
    with open('working/runs/Jun24/elmdict_' + host + '.init') as f:
        for line in f:
            (elm, seq, count, fq) = line.strip().split('\t')
            elmSeq = elm + ':' + seq
            if elmSeq in host_flu_elmSeq_mapping:
                for a_flu_elmSeq in host_flu_elmSeq_mapping[elmSeq]:
                    host_counts[host][a_flu_elmSeq] += int(count)
            # else:
            #     host_counts[host][elmSeq] += int(count)
            #     all_elmSeqs[elmSeq] = True

# Flu and host vectors are both built over the same key set (mapped).
flu_vecs = utils.mk_count_vecs(flu_counts, mapped)
flu_dists = utils.mk_count_dists(flu_vecs)
host_vecs = utils.mk_count_vecs(host_counts, mapped)
host_dists = utils.mk_count_dists(host_vecs)

# js_distances[host][flu] holds the value returned by
# utils.jensen_shannon_dists for that pair; each value is also printed.
js_distances = defaultdict(dict)
for host in hosts:
    for flu in flus:
        js_dis = utils.jensen_shannon_dists(host_dists[host], flu_dists[flu])
        js_distances[host][flu] = js_dis
        # '%s %s %s' reproduces the Python 2 statement `print a, b, c` and
        # also runs on Python 3.
        print('%s %s %s' % (host, flu, js_dis))


def print_it(name, vec):
    """Print `name`, a space, then count_0s(vec) divided by len(vec).

    count_0s is defined outside this excerpt (presumably it counts the zero
    entries of `vec` -- confirm).  Raises ZeroDivisionError when `vec` is
    empty.
    """
    print('%s %s' % (name, float(count_0s(vec)) / float(len(vec))))
seen_elms = defaultdict(dict) for host in ls_of_hosts: host_elmCounts[host] = defaultdict(utils.init_zero) with open(results_dir + 'elmdict_' + host + '.redo') as f: for line in f: (elm, seq, count, fq) = line.strip().split('\t') if elm in use_elms: host_elmCounts[host][elm] += int(count) seen_elms[elm][host] = True #use_elms = {} #for elm in seen_elms: # if len(seen_elms[elm]) == len(ls_of_hosts): # use_elms[elm] = True # print len(use_elms) return (host_elmCounts, use_elms) use_elms = {} with open(use_elms_file) as f: for line in f: (elm, stuff) = line.strip().split('\t') use_elms[elm] = True hosts = global_settings.GENOMES if use_freqs == 'T': host_elmCounts, elms = get_host_freqs(hosts, use_elms) else: host_elmCounts, elms = get_host_counts(hosts, use_elms) host_vecs = utils.mk_count_vecs(host_elmCounts, elms) host_dists = utils.mk_count_dists(host_vecs) utils_plot.phylogeny_js(out_file, host_dists)
seen_seqs_ls.append(use_seqs) # remove seqs seen less than 10x #for flu in flu_counts: # for elmSeq in use_seqs_pre = utils_graph.unionLists(seen_seqs_ls) counts = utils.count_host_elmSeqs(('Gallus_gallus','H_sapiens'), False, {}, 'working/Jun29/', {use_elm:True}, '.init') use_seqs = utils_graph.unionLists([use_seqs_pre, counts['Gallus_gallus'], counts['H_sapiens']]) host_vecs = utils.mk_count_vecs(counts, use_seqs) host_dists = utils.mk_count_dists(host_vecs) flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs) flu_dists = utils.mk_count_dists(flu_vecs) # for flu in flu_dists: # print sum(flu_dists[flu]) # sys.exit(0) for seq, hhuman, hchicken, hf, cf in zip(use_seqs, host_dists['Gallus_gallus'], host_dists['H_sapiens'], flu_dists['human'], flu_dists['chicken']): print seq + '\t' + str(hhuman) + '\t' + str(hchicken) + '\t' + str(hf) + '\t' + '\t' + str(cf) for host in host_dists: for flu in flu_dists:
"""
What is the Jensen-Shannon distance between 2 flu groups?

Command line: <flu_group_1_file> <flu_group_2_file>
Prints the single value returned by utils.jensen_shannon_dists.
"""
import utils
import sys
import utils_graph

flu_group_1_file = sys.argv[1]
flu_group_2_file = sys.argv[2]

flu_counts = {}
seen_elmSeqs = {}
seen_seqs_ls = []

# The two groups are labelled 'g1' and 'g2'.  After each call the code reads
# seen_elmSeqs[name], and later flu_dists['g1'] / flu_dists['g2'], so
# count_flu_sampled is expected to fill both dicts under `name`.
# (`path` replaces the original loop variable `file`, which shadowed the
# Python 2 builtin.)
for name, path in (('g1', flu_group_1_file), ('g2', flu_group_2_file)):
    utils.count_flu_sampled(name, path, flu_counts, seen_elmSeqs, {}, False)
    seen_seqs_ls.append(seen_elmSeqs[name])

# Both groups are vectorised over the union of the sequences seen in either.
use_elmSeqs = utils_graph.unionLists(seen_seqs_ls)
flu_vecs = utils.mk_count_vecs(flu_counts, use_elmSeqs)
flu_dists = utils.mk_count_dists(flu_vecs)

js_dis = utils.jensen_shannon_dists(flu_dists['g1'], flu_dists['g2'])
# Parenthesised single argument: same output on Python 2 and 3.
print(js_dis)