""" What is the Jensen-shannon distance between 2 flu groups? """ import utils, sys, utils_graph flu_group_1_file = sys.argv[1] flu_group_2_file = sys.argv[2] flu_counts = {} seen_elmSeqs = {} seen_seqs_ls = [] for name, file in (('g1', flu_group_1_file), ('g2', flu_group_2_file)): utils.count_flu_sampled(name, file, flu_counts, seen_elmSeqs, {}, False) seen_seqs_ls.append(seen_elmSeqs[name]) use_elmSeqs = utils_graph.unionLists(seen_seqs_ls) flu_vecs = utils.mk_count_vecs(flu_counts, use_elmSeqs) flu_dists = utils.mk_count_dists(flu_vecs) js_dis = utils.jensen_shannon_dists(flu_dists['g1'], flu_dists['g2']) print js_dis
# NOTE(review): this is a whitespace-collapsed script fragment whose final
# statement (the `utils.count_host_elmSeqs(` call) is cut off, so the code is
# left untouched and only described here.
#
# What the visible statements do:
#   * `use_elm` is read from sys.argv[1].
#   * For each flu in ('human', 'chicken') an '.elms' path is built with
#     os.path.join: 'working/Jul1_year' + '<flu>.H3N2.2008.elms' when the name
#     contains 'human', otherwise 'working/Jul1_year/' + '<flu>.H5N1.2006.elms'.
#   * That path is handed to utils.count_flu_sampled together with
#     `flu_counts` and `seen_seqs` (plus `{}` and `False`).
#   * Each entry of seen_seqs[flu] is split on ':' into (elm, seq); entries
#     whose elm equals `use_elm` and whose flu_counts[flu][elmSeq] is > 0 are
#     recorded as keys of `use_seqs`, which is appended to `seen_seqs_ls`.
#   * `seen_seqs_ls` is then combined via utils_graph.unionLists into
#     `use_seqs_pre`.
#
# NOTE(review): the original indentation is lost, so the loop nesting
# described above is inferred -- confirm against the original file.
# NOTE(review): `os`, `sys`, `utils` and `utils_graph` are used but no import
# is visible in this fragment -- verify they are imported earlier.
# NOTE(review): `elmSeq.split(':')` is unpacked into exactly two names, so a
# key containing more than one ':' would raise ValueError.
use_elm = sys.argv[1] flu_counts = {} seen_seqs = {} seen_seqs_ls = [] flus = ('human','chicken') for flu in flus: if 'human' in flu: flu_elm_file = os.path.join('working/Jul1_year', flu + '.H3N2.2008.elms') else: flu_elm_file = os.path.join('working/Jul1_year/', flu + '.H5N1.2006.elms') # flu_elm_file = os.path.join('working/Jun30/', # flu + '.H5N1.simpleELMs') utils.count_flu_sampled(flu, flu_elm_file, flu_counts, seen_seqs, {}, False) use_seqs = {} for elmSeq in seen_seqs[flu]: elm, seq = elmSeq.split(':') if elm == use_elm: if flu_counts[flu][elmSeq] > 0: use_seqs[elmSeq] = True seen_seqs_ls.append(use_seqs) # remove seqs seen less than 10x #for flu in flu_counts: # for elmSeq in use_seqs_pre = utils_graph.unionLists(seen_seqs_ls) counts = utils.count_host_elmSeqs(('Gallus_gallus','H_sapiens'),
# Without clustering an empty mapping is used; with clustering the mapping is
# loaded from the cluster distance file inside elm_count_dir.
if not do_clustering:
    mapping = {}
else:
    f = os.path.join(elm_count_dir, cluster_distance_file)
    # Both numeric arguments are the same float value, 2.5.
    mapping = utils.get_clusters(f, 2.5, 2.5)

hosts = global_settings.TEST_GENOMES

# Count the flu side.  Only the 'human' flu file is processed here.
flus = ('human',)
flu_counts = {}
seen_seqs = {}
seen_seqs_ls = []
for flu in flus:
    flu_elm_file = os.path.join('results/', flu + '.H5N1.elms')
    utils.count_flu_sampled(flu, flu_elm_file,
                            flu_counts, seen_seqs,
                            mapping, do_clustering)
    # count_flu_sampled must have filled seen_seqs[flu] for this lookup.
    seen_seqs_ls.append(seen_seqs[flu])

# unionLists is only called when there is more than one per-flu entry;
# a single entry is used directly.
all_elmSeqs = (utils_graph.unionLists(seen_seqs_ls)
               if len(seen_seqs_ls) > 1
               else seen_seqs_ls[0])

# Count the host side and collect what was found in the two reference hosts.
host_counts = utils.count_host_elmSeqs(hosts, do_clustering,
                                       mapping, elm_count_dir)
host_found_seqs = utils_graph.unionLists([host_counts['H_sapiens'],
                                          host_counts['Gallus_gallus']])

# Flu and host vectors are built over the same combined key set.
use_seqs = utils_graph.unionLists([all_elmSeqs, host_found_seqs])
flu_vecs = utils.mk_count_vecs(flu_counts, use_seqs)
host_vecs = utils.mk_count_vecs(host_counts, use_seqs)