def _write_elm_entropy(elmdict_file, entropy_file):
    """Compute per-ELM entropy for one elmdict file and write it to disk.

    The input is loaded with ``utils.get_seq2count_dict`` (second argument 0;
    presumably a frequency cutoff -- confirm against ``utils``) and passed to
    ``utils.get_species_entropy``.  One line per ELM is written to
    ``entropy_file``: the ELM name, a tab, and ``str()`` of its entropy value.
    """
    seq_counts = utils.get_seq2count_dict(elmdict_file, float(0))
    entropy = utils.get_species_entropy(seq_counts)
    with open(entropy_file, 'w') as out:
        for elm in entropy:
            out.write(elm + '\t' + str(entropy[elm]) + '\n')


def no_args():
    """Write ELM entropy files for every genome and every flu name.

    For each entry of ``GENOMES`` the input is
    ``results/elmdict_<genome>.txt`` and the output is
    ``<RESULTSDIR>/<genome>.elm_entropy``.  For each entry of ``FLU_NAMES``
    the input is ``results/flu_elmdict_<flu>`` and the output is
    ``<RESULTSDIR>/flu_<flu>.elm_entropy``.

    Returns nothing; the only effect is the files written.
    """
    # Inputs are read from the literal 'results/' directory while outputs go
    # to RESULTSDIR; this asymmetry is kept exactly as in the original code.
    for genome in GENOMES:
        _write_elm_entropy(
            'results/elmdict_' + genome + '.txt',
            os.path.join(RESULTSDIR, genome + '.elm_entropy'))
    for flu in FLU_NAMES:
        _write_elm_entropy(
            'results/flu_elmdict_' + flu,
            os.path.join(RESULTSDIR, 'flu_' + flu + '.elm_entropy'))
def main(args):
    """Plot host bar charts for the ELMs named on the command line.

    Reads ``sys.argv`` directly: ``sys.argv[1]`` is the elmdict file suffix
    and ``sys.argv[2:]`` are the ELM names to plot.  The ``args`` parameter
    is accepted but not read.

    For every genome in ``global_settings.GENOMES`` the table
    ``results/elmdict_<genome><suffix>`` is loaded, then one PNG per ELM is
    written to ``plots/for_aydin/<elm>.hosts<suffix>.png``.
    """
    suffix, requested_elms = sys.argv[1], sys.argv[2:]
    out_dir = 'plots/for_aydin/'

    # One sequence-count table per host genome, keyed by genome name.
    counts_by_species = {}
    for genome in global_settings.GENOMES:
        elmdict_path = 'results/elmdict_' + genome + suffix
        counts_by_species[genome] = utils.get_seq2count_dict(elmdict_path,
                                                             float(0))

    for elm in requested_elms:
        png_path = os.path.join(out_dir, elm + '.hosts' + suffix + '.png')
        utils_plot.elm_host_barplot(counts_by_species, elm, png_path)
def main(args):
    """Plot host bar charts for ELMs shared by flu and human tables.

    ``args[1:-2]`` is read as consecutive ``file species`` pairs.  The cutoff
    and the plot directory are taken from ``sys.argv[-2]`` and
    ``sys.argv[-1]`` respectively (not from ``args``).

    An ELM is plotted to ``<plot_dir>/<elm>.hosts.png`` when
    ``utils.check_ones`` accepts it, it occurs in at least one of the
    'swine', 'human' or 'chicken' tables, and it occurs in the 'H_sapiens'
    table.  Those four species names must therefore be among the loaded
    pairs, otherwise the lookup raises ``KeyError``.
    """
    pairs = [(args[idx], args[idx + 1])
             for idx in range(1, len(args) - 2, 2)]
    cutoff = float(sys.argv[-2])
    plot_dir = sys.argv[-1]

    species2elms = {}
    for path, species in pairs:
        species2elms[species] = utils.get_seq2count_dict(path, cutoff)

    # Union of every ELM name seen in any loaded table.
    all_elms = {}
    for elm_table in species2elms.values():
        for elm in elm_table:
            all_elms[elm] = True

    flu_hosts = ('swine', 'human', 'chicken')
    for elm in all_elms:
        if not utils.check_ones(species2elms, elm):
            continue
        if not any(elm in species2elms[host] for host in flu_hosts):
            continue
        if elm not in species2elms['H_sapiens']:
            continue
        utils_plot.elm_host_barplot(
            species2elms, elm, os.path.join(plot_dir, elm + '.hosts.png'))
def main(args):
    """Plot ELMs whose virus-restricted host profiles differ between hosts.

    ``args[1:-2]`` is read as consecutive ``file species`` pairs.  The cutoff
    and plot directory come from ``sys.argv[-2]`` and ``sys.argv[-1]``.
    NOTE(review): ``args`` and ``sys.argv`` are mixed here; this only lines
    up when the caller passes ``sys.argv`` -- confirm.

    Files whose path contains 'flu' are loaded as virus tables with
    ``utils.get_seq2count_dict``.  Every other file is a host table and is
    loaded with ``utils.get_seq2count_dict_for_seqs``, which also receives
    the virus tables.  Virus tables are then merged into the same
    species -> table mapping.

    For each ELM found in any virus table and accepted by
    ``utils.check_ones``, a bar plot ``<plot_dir>/<elm>.virus_hosts.png`` is
    written when the 'Sus_scrofa' vs 'H_sapiens' distance is > -1 or the
    'Sus_scrofa' vs 'Gallus_gallus' distance is > 0.  Those three species
    must be among the inputs, otherwise the lookup raises ``KeyError``.
    """
    file_species_pairs = [(args[idx], args[idx + 1])
                          for idx in range(1, len(args) - 2, 2)]
    cutoff = float(sys.argv[-2])
    plot_dir = sys.argv[-1]

    # First grab the virus ELMs, and remember which file belongs to each host.
    virus2elms = {}
    host2file = {}
    for path, species in file_species_pairs:
        if 'flu' in path:
            virus2elms[species] = utils.get_seq2count_dict(path, cutoff)
        else:
            host2file[species] = path

    elms = {}
    for virus in virus2elms:
        for elm in virus2elms[virus]:
            elms[elm] = True

    # Load each host from its OWN file.  The previous code reused the loop
    # variable left over from the pair loop above, so every host was read
    # from whichever file happened to be last on the command line.
    species2elms = {}
    for species, path in host2file.items():
        species2elms[species] = utils.get_seq2count_dict_for_seqs(
            path, cutoff, virus2elms)
    species2elms.update(virus2elms)

    for elm in elms:
        if not utils.check_ones(species2elms, elm):
            continue
        swine = species2elms['Sus_scrofa'][elm]
        # NOTE(review): thresholds -1 and 0 are kept unchanged; confirm that
        # the -1 on the human comparison is intentional.
        differs_from_human = utils_distance.distance_elms(
            swine, species2elms['H_sapiens'][elm]) > float(-1)
        if differs_from_human or utils_distance.distance_elms(
                swine, species2elms['Gallus_gallus'][elm]) > float(0):
            utils_plot.elm_host_barplot(
                species2elms, elm,
                os.path.join(plot_dir, elm + '.virus_hosts.png'))
#'HCV':('all',)} virus2conservedELMs = {} all_elms = {} for virus in viruses: virus2conservedELMs[virus] = getConservedELMs(virus, subtypes) for elm in virus2conservedELMs[virus]: all_elms[elm] = True # load ELM seq fractions host2elmFreqs = {} virus2elmFreqs = {} use_seqs = {} for host in hosts: host2elmFreqs[host] = utils.get_seq2count_dict(os.path.join(local_settings.RESULTSDIR, 'elmdict_' + host + suffix), float(0)) for elm in host2elmFreqs[host]: if elm not in use_seqs: use_seqs[elm] = {} for seq in host2elmFreqs[host][elm]: if seq not in use_seqs[elm]: use_seqs[elm][seq] = 0 use_seqs[elm][seq] += 1 for elm in use_seqs: rm_ls = [] for seq in use_seqs[elm]: if use_seqs[elm][seq] != len(hosts.keys()): rm_ls.append(seq) for seq in rm_ls:
""" I need to find ELM sequences that differ across species hosts and viruses. """ import sys, utils, utils_distance, itertools, utils_plot from global_settings import * virus2dict = {} virus2dict['chicken'] = utils.get_seq2count_dict('results/flu_elmdict_swine', float(0.05)) virus2dict['swine'] = utils.get_seq2count_dict('results/flu_elmdict_chicken', float(0.05)) virus2dict['human'] = utils.get_seq2count_dict('results/flu_elmdict_human', float(0.05)) genomes = ('H_sapiens', 'Gallus_gallus', 'Sus_scrofa') species2dict = {} for g in genomes: species2dict[g] = utils.get_seq2count_dict('results/elmdict_' + g + '.txt', float(0.01)) use_elms = {} for elm in virus2dict['human']: if elm in virus2dict['chicken'] and elm in virus2dict['swine']: distance_is_0 = False for v1, v2 in itertools.combinations(virus2dict.keys(), 2): distance = utils_distance.distance_elms(virus2dict[v1][elm], virus2dict[v2][elm]) if distance == float(0): distance_is_0 = True if not distance_is_0:
import utils

# Report ELMs whose top human sequence differs from the top sequence in
# swine, chicken and equine.  Output: one tab-separated line per ELM with the
# columns ELM, human, swine, chicken, equine.

# Load one ELM -> sequence -> value table per host.
# NOTE(review): the second argument 0 is presumably a frequency cutoff --
# confirm against utils.get_seq2count_dict.
d = {}
for g in ('human', 'chicken', 'swine', 'equine'):
    d[g] = utils.get_seq2count_dict('results/' + g + '.elms.90.freq.redo',
                                    float(0))

# For every host and ELM keep the sequence with the largest value; ties are
# broken by the larger sequence string, as sorting (value, seq) pairs did.
top_seqs = {}
for g in d:
    # Always create the per-host entry so a host with no ELMs cannot cause a
    # KeyError in the comparison loop below.
    top_seqs[g] = {}
    for elm in d[g]:
        seq2value = d[g][elm]
        if not seq2value:
            # No sequences recorded for this ELM: there is no top sequence.
            continue
        top_seqs[g][elm] = max((seq2value[seq], seq) for seq in seq2value)[1]

other_hosts = ('swine', 'chicken', 'equine')
for elm in top_seqs['human']:
    # The ELM must be present in every host being compared; previously only
    # swine was checked, so a gap in chicken or equine raised KeyError.
    if not all(elm in top_seqs[host] for host in other_hosts):
        continue
    human_seq = top_seqs['human'][elm]
    if all(human_seq != top_seqs[host][elm] for host in other_hosts):
        print('\t'.join((elm,
                         human_seq,
                         top_seqs['swine'][elm],
                         top_seqs['chicken'][elm],
                         top_seqs['equine'][elm])))
'equine':('H3N8',), 'chicken':('H5N1', 'H9N2'), 'duck':('H5N1', 'H9N2'), 'HIV':('all',), 'HCV':('all',)} virus2conservedELMs = {} for virus in viruses: virus2conservedELMs[virus] = getConservedELMs(virus, subtypes) # load ELM seq fractions host2elmFreqs = {} virus2elmFreqs = {} for host in hosts: host2elmFreqs[host] = utils.get_seq2count_dict(os.path.join(local_settings.RESULTSDIR, 'elmdict_' + host + '.txt'), float(0)) #tmp_input = 'tmp_input' + str(random.randint(0,100)) tmp_input = 'plots/for_aydin/cos_host_host.tab' with open(tmp_input, 'w') as f: f.write('Host_Host\tELM\tDistance\n') for elm in virus2conservedELMs[virus]: counter = 0 for host1,host2 in (('H_sapiens', 'Macaca_mulatta'), ('H_sapiens', 'M_musculus'), ('H_sapiens', 'R_norvegicus'), ('H_sapiens', 'Sus_scrofa'), ('H_sapiens', 'Equus_caballus'), ('H_sapiens', 'Canis_familiaris'), ('H_sapiens', 'Bos_taurus'), ('H_sapiens', 'Gallus_gallus'),
def one_arg(afile):
    """Print the per-ELM entropy computed from a single elmdict file.

    ``afile`` is loaded with ``utils.get_seq2count_dict`` (second argument 0)
    and passed to ``utils.get_species_entropy``.  One line per ELM is printed
    to standard output: the ELM name, a tab, and ``str()`` of its entropy.
    """
    seq_counts = utils.get_seq2count_dict(afile, float(0))
    elm_entropy = utils.get_species_entropy(seq_counts)
    for elm in elm_entropy:
        # Single-argument print(...) behaves the same as the print statement.
        print('\t'.join((elm, str(elm_entropy[elm]))))
# float(.05)) # mouse = elm_hists.get_seq2count_dict('results/elmdict_M_musculus.txt', # float(.05)) # monkey = elm_hists.get_seq2count_dict('results/elmdict_Macaca_mulatta.txt', # float(.05)) # print utils_distance.distance_species(human, # monkey) # print utils_distance.distance_species(human, # mouse) # print utils_distance.distance_species(monkey, # mouse) species2dict = {} virus2dict = {} virus2dict['swineFlu'] = utils.get_seq2count_dict('results/flu_elmdict_swine', float(.4)) virus2dict['chickenFlu'] = utils.get_seq2count_dict('results/flu_elmdict_chicken', float(.4)) virus2dict['humanFlu'] = utils.get_seq2count_dict('results/flu_elmdict_human', float(.4)) for g in ('H_sapiens', 'Gallus_gallus', 'Sus_scrofa'): species2dict[g] = utils.get_seq2count_dict_for_seqs('results/elmdict_' + g + '.txt', float(0), virus2dict) for v in virus2dict: species2dict[v] = virus2dict[v] d = utils_distance.distance_matrix(species2dict) elm_d = utils_distance.elm_distance_matrix(species2dict) #for elm in elm_d:
import utils, utils_plot, utils_distance

# Compare ELM tables obtained from two sources, labelled 'web' and 'regex',
# and draw one bar plot per ELM returned by utils_distance.get_elements.
# Both tables are loaded with the same second argument (0.01).
table_files = (
    ('web', 'results/human.website.elm.elmdict'),
    ('regex', 'results/hprd_new.regex.elms.elmdict'),
)

source2elms = {}
for label, elmdict_path in table_files:
    source2elms[label] = utils.get_seq2count_dict(elmdict_path, float(.01))

elm_names = utils_distance.get_elements(source2elms['web'],
                                        source2elms['regex'])
for elm_name in elm_names:
    png_path = 'plots/hprd/' + elm_name + '.png'
    utils_plot.elm_host_barplot(source2elms, elm_name, png_path)
""" Each human ELM/seq pair has a normalized fraction (normalized to that ELM). I'll split the ELM/seq pairs into virus and non-virus, and use the one-sided wilcoxon test to see if virus ELM/seq pairs have lower fractions. """ import utils, os, utils_stats from global_settings import * from local_settings import * species2dict = {} flu2dict = {} for g in ['H_sapiens']:#GENOMES: species2dict[g] = utils.get_seq2count_dict('results/elmdict_' + g + '.redo', float(0)) for flu in ['HIV']:#FLU_NAMES: flu2dict[flu] = utils.get_seq2count_dict('results/hiv_freq', float(0.1)) virus_like = [] non_virus = [] host = 'H_sapiens' virus = 'HIV' for elm in species2dict[host]: if elm in flu2dict[virus]: for seq in species2dict[host][elm]: if seq in flu2dict[virus][elm]: #if flu2dict[virus][elm][seq] > float(.05):