def get_cons_elms(dir, hosts, years, strains, per, d, out_file, suffix):
    """Write the protein/ELM pairs conserved in every usable host/strain/year file.

    For each host, year and strain two paths under ``dir`` are built,
    ``<host>.<strain>.<year>.elms`` and
    ``<host>.<strain>.<year><suffix>.<per>``, and handed to ``count_cons``
    together with ``use_files`` and ``protein_counts_pass``, which it is
    expected to fill in.  NOTE(review): the original summary says only
    combinations with at least 50 sequences count; that threshold is not
    visible here, so confirm it in ``count_cons``.

    Every file recorded in ``use_files`` is then loaded with
    ``utils_motif.protein2annotation(file, d)``.  For each protein, an ELM is
    written to ``out_file`` as ``protein<TAB>elm`` when it is annotated on
    that protein in every file recorded for the protein.

    Args:
        dir: Directory holding the per-combination files.  The name shadows
            the builtin but is kept so keyword callers keep working.
        hosts, years, strains: Iterables whose combinations are scanned.
        per: Conservation level used as the final file extension; it is
            concatenated to a string, so it must be a ``str``.
        d: Tool-selection dict passed to ``protein2annotation``.
        out_file: Path of the tab-separated result file (always created).
        suffix: Text inserted between the file stem and ``'.' + per``.
    """
    elm_only = {'ELM': True}
    use_files = {}
    protein_counts_pass = defaultdict(dict)

    for host in hosts:
        for year in years:
            for strain in strains:
                stem = '.'.join((host, strain, str(year)))
                elm_path = os.path.join(dir, stem + '.elms')
                cons_path = os.path.join(dir, stem + suffix + '.' + per)
                try:
                    count_cons(use_files, protein_counts_pass,
                               elm_path, elm_only, cons_path)
                except Exception:
                    # Best-effort scan: a combination that cannot be
                    # processed is skipped.  Unlike a bare ``except`` this
                    # still lets Ctrl-C and SystemExit through.
                    continue

    # Replace each placeholder value with the parsed annotations.
    for path in list(use_files):
        use_files[path] = utils_motif.protein2annotation(path, d)

    with open(out_file, 'w') as afile:
        for protein in protein_counts_pass:
            passing_files = protein_counts_pass[protein]
            elm_counts_local = defaultdict(init_zero)
            for path in passing_files:
                for elm in use_files[path][protein]:
                    elm_counts_local[elm] += 1
            # Conserved means: seen in every file recorded for this protein.
            needed = len(passing_files)
            for elm in elm_counts_local:
                if elm_counts_local[elm] == needed:
                    afile.write(protein + '\t' + elm + '\n')
def predict_die(useELMs, domain_tools, netFile):
    """Collect, per ELM, the network neighbours that share that ELM.

    ``proteins`` is filled by ``getProteinsForELMs`` and then
    ``expandProteinsForELMs`` (both defined elsewhere) and is read here as
    ``{protein: {elm: ...}}``.  For every such protein ``g1`` that appears in
    the network, every ELM listed for it, and every annotated human protein
    ``g2`` that is a neighbour of ``g1``, carries the same ELM and is not
    ``g1`` itself, the pair is recorded.

    Args:
        useELMs: Passed through to the two helper calls.
        domain_tools: Passed through to ``expandProteinsForELMs``.
        netFile: Network file read with ``utils_graph.getEdges``.

    Returns:
        ``[h1, h2]`` where ``h1[elm]`` maps each matching ``g1`` to ``True``
        and ``h2[elm]`` maps each matching ``g2`` to ``True``.  An ELM with
        no match appears in neither dict.
    """
    h1 = {}
    h2 = {}
    net = utils_graph.getEdges(netFile)
    proteins = {}
    tool_d = {'ELM': True}
    protein2elm = utils_motif.protein2annotation(
        '/home/perry/Projects/Human_Virus/Data/human.annotations', tool_d)
    getProteinsForELMs(useELMs, proteins)
    expandProteinsForELMs(useELMs, proteins, domain_tools)

    for g1 in proteins:
        if g1 not in net:
            continue
        neighbours = net[g1]
        for elm in proteins[g1]:
            for g2 in protein2elm:
                # ``in`` replaces dict.has_key, which no longer exists on
                # Python 3; the test order is the same as before.
                if g2 in neighbours and elm in protein2elm[g2] and g1 != g2:
                    h1.setdefault(elm, {})[g1] = True
                    h2.setdefault(elm, {})[g2] = True
    return [h1, h2]
def main():
    """Report motifs whose share of a virus protein's sequences meets a cutoff.

    Command line: ``<virus annotation file> <annotation tool> <% MSA cutoff>``.
    The annotation file is loaded for the given tool and summarised by
    ``getCounts`` (defined elsewhere) into two mappings used here as
    ``virus2annotation[vp][motif]`` and ``virus2proteinCount[vp]``.  For each
    pair the percentage ``100 * annotation / proteinCount`` is computed.  When
    it is at least the cutoff, a row ``vp, 0, 0, motif, 'seq', tool`` goes to
    stdout and ``vp, motif, percent`` goes to stderr, both tab-separated.
    """
    req_args = ['virus annotation file', 'annotation tool', '% MSA cutoff']
    examples = ['../../Data/ProfileScan/hiv.prosite', 'ProfileScan', '90']
    utils_scripting.checkStart(sys.argv, req_args, examples,
                               len(req_args), True)

    annotation_file = sys.argv[1]
    tool = sys.argv[2]
    conserved_cutoff = float(sys.argv[3])

    annotations = utils_motif.protein2annotation(annotation_file,
                                                 {tool: True})
    virus2annotation, virus2proteinCount = getCounts(annotations)

    for vp in virus2annotation:
        for motif in virus2annotation[vp]:
            percent = (100.0 * float(virus2annotation[vp][motif])
                       / float(virus2proteinCount[vp]))
            if percent >= conserved_cutoff:
                # Single-argument call form: same output on Python 2,
                # and valid syntax on Python 3.
                print(vp + '\t0\t0\t' + motif + '\tseq\t' + tool)
                sys.stderr.write(vp + '\t' + motif + '\t'
                                 + str(percent) + '\n')
# Exclusions that are currently switched off:
# del freq_elms['LIG_PDZ_3']
# del freq_elms['MOD_CK1_1']
# del freq_elms['MOD_CK2_1']
# del freq_elms['MOD_GSK3_1']

# elm2freq[elm][s] holds the value stored for `elm` in aa_freqs[s][0],
# or 0.0 when that ELM is missing there.
elm2freq = {}
for elm in freq_elms:
    elm2freq[elm] = {}
    for s in aa_freqs:
        known = aa_freqs[s][0]
        elm2freq[elm][s] = known[elm] if elm in known else float(0)

# The cutoff given on the command line selects which result files to read.
cut = sys.argv[2]
d = {'ELM':True}

# One list per host; each entry is the annotation set of one subtype.
# The individual sets are also bound to their own names below.
swine = [
    utils_motif.protein2annotation('results/swine.H1N1.elms.' + cut, d),
    utils_motif.protein2annotation('results/swine.H3N2.elms.' + cut, d),
]
swine_H1N1_elms, swine_H3N2_elms = swine

human = [
    utils_motif.protein2annotation('results/human.H1N1.elms.' + cut, d),
    utils_motif.protein2annotation('results/human.H3N2.elms.' + cut, d),
    utils_motif.protein2annotation('results/human.H5N1.elms.' + cut, d),
]
human_H1N1_elms, human_H3N2_elms, human_H5N1_elms = human

chicken = [
    utils_motif.protein2annotation('results/chicken.H5N1.elms.' + cut, d),
    utils_motif.protein2annotation('results/chicken.H9N2.elms.' + cut, d),
]
chicken_H5N1_elms, chicken_H9N2_elms = chicken

duck = [
    utils_motif.protein2annotation('results/duck.H5N1.elms.' + cut, d),
    utils_motif.protein2annotation('results/duck.H9N2.elms.' + cut, d),
]
duck_H5N1_elms, duck_H9N2_elms = duck
for seq in seq_counts: if float(seq_counts[seq]) / protein_count > float(0.9): cons.append(seq) else: nonCons.append(seq) return [cons, nonCons] host_file = sys.argv[1] elm_file = sys.argv[2] cons_file = sys.argv[3] ofile = sys.argv[4] cmp_file = sys.argv[5] cmp_host_file = sys.argv[6] elms = utils_motif.protein2annotation(elm_file, {"ELM": True}) cmp_elms = utils_motif.protein2annotation(cmp_file, {"ELM": True}) cons_elms = utils_motif.protein2annotation(cons_file, {"ELM": True}) host_freqs = get_freq(host_file) cmp_freqs = get_freq(cmp_host_file) (elm_counts, protein_counts) = getProtein2elm2seq(elms, cons_elms) (cmp_counts, cmp_protein_counts) = getProtein2elm2seq(cmp_elms, cons_elms) lines = "" pos = 0 neg = 0 z = 0 for protein in elm_counts: if protein in cmp_counts: protein_count = float(protein_counts[protein]) cmp_protein_count = float(cmp_protein_counts[protein])
for elm in e: freq_elms[elm] = True elm2freq = {} for elm in freq_elms: elm2freq[elm] = {} for s in aa_freqs: if elm in aa_freqs[s][0]: elm2freq[elm][s] = aa_freqs[s][0][elm] else: elm2freq[elm][s] = float(0.0000000000000000000001) cut = sys.argv[1] d = {'ELM':True} swine_H1N1_elms = utils_motif.protein2annotation('results/swine.H1N1.elms.' + cut, d) swine_H3N2_elms = utils_motif.protein2annotation('results/swine.H3N2.elms.' + cut, d) swine = [swine_H1N1_elms, swine_H3N2_elms] human_H1N1_elms = utils_motif.protein2annotation('results/human.H1N1.elms.' + cut, d) human_H3N2_elms = utils_motif.protein2annotation('results/human.H3N2.elms.' + cut, d) human_H5N1_elms = utils_motif.protein2annotation('results/human.H5N1.elms.' + cut, d) human = [human_H1N1_elms, human_H3N2_elms, human_H5N1_elms] horse_H3N8_elms = utils_motif.protein2annotation('results/equine.H3N8.elms.' + cut, d) horse = [horse_H3N8_elms] chicken_H5N1_elms = utils_motif.protein2annotation('results/chicken.H5N1.elms.' + cut, d) chicken_H9N2_elms = utils_motif.protein2annotation('results/chicken.H9N2.elms.' + cut, d) chicken = [chicken_H5N1_elms, chicken_H9N2_elms]
""" I need to add HIV and HCV to the project, but I must first convert the ELMs hits to the frequencies used for this project. """ import utils_motif, sys def printELMusage(elm, seq2count): total = 0 for seq in seq2count: total += seq2count[seq] for seq in seq2count: print('%s\t%s\t%d\t%.10f' % (elm, seq, seq2count[seq], float(seq2count[seq]/float(total)))) protein2elm = utils_motif.protein2annotation(sys.argv[1], {'ELM':True}) elm2seq2count = {} for p in protein2elm: for elm in protein2elm[p]: if elm not in elm2seq2count: elm2seq2count[elm] = {} for [st, stp, seq] in protein2elm[p][elm]: if seq not in elm2seq2count[elm]: elm2seq2count[elm][seq] = 0 elm2seq2count[elm][seq] += 1 for elm in elm2seq2count: printELMusage(elm, elm2seq2count[elm])
""" Find ELMs that are not conserve by change. Enter subtype & cutoff for # trial an ELM for a protein can be found by chance. """ import sys, utils_motif, utils_graph import utils from collections import defaultdict elm_file = sys.argv[1] cutoff = int(sys.argv[2]) real = utils_motif.protein2annotation('results/' + elm_file, {'ELM':True}) protein2elms = {} for x in xrange(10): protein2annotation = utils_motif.protein2annotation('random_seq/' + str(x) + '/' + elm_file, {'ELM':True}) for protein in protein2annotation: for elm in protein2annotation[protein]: if not protein in protein2elms: protein2elms[protein] = {} if not elm in protein2elms[protein]: protein2elms[protein][elm] = 0 protein2elms[protein][elm] += 1 for protein in real: if protein in protein2elms: for elm in real[protein]: if elm in protein2elms[protein]:
if not protein in protein2elm2seq: protein2elm2seq[protein] = {} if not elm in protein2elm2seq[protein]: protein2elm2seq[protein][elm] = {} for [st, stp, seq] in elms[protein_id][elm]: if not seq in protein2elm2seq[protein][elm]: protein2elm2seq[protein][elm][seq] = 0 protein2elm2seq[protein][elm][seq] += 1 return (protein2elm2seq, proteinCounts) host_file = sys.argv[1] elm_file = sys.argv[2] cons_file = sys.argv[3] ofile = sys.argv[4] elms = utils_motif.protein2annotation(elm_file, {'ELM':True}) cons_elms = utils_motif.protein2annotation(cons_file, {'ELM':True}) host_freqs = get_freq(host_file) (elm_counts, protein_counts) = getProtein2elm2seq(elms, cons_elms) lines = '' for protein in elm_counts: protein_count = float(protein_counts[protein]) for elm in elm_counts[protein]: if elm in host_freqs: virus_freqs = [] non_virus_freqs = [] found_seqs = {} for seq in elm_counts[protein][elm]: if float(elm_counts[protein][elm][seq])/protein_count > float(.9):
import utils_motif, utils_graph

# Inputs are read from fixed file names in the working directory.
use_elms = utils_graph.getNodes('use_elms')
human = utils_motif.protein2annotation('human.H1N1.elms', {'ELM':True})
human_conserved = utils_motif.protein2annotation('human.H1N1.elms.90', {'ELM':True})
swine = utils_motif.protein2annotation('swine.H1N1.elms', {'ELM':True})
swine_conserved = utils_motif.protein2annotation('swine.H1N1.elms.90', {'ELM':True})


def get_entropy(afile):
    """Read a two-column, tab-separated file into ``{elm: entropy}``.

    Each line must hold exactly an ELM name and a number; any other shape
    raises ``ValueError`` from the unpacking or from ``float``.  When an ELM
    occurs more than once, the last value wins.
    """
    entropy = {}
    with open(afile) as f:
        for line in f:
            elm, entropy_st = line.strip().split('\t')
            # The value is a plain float; the empty dict formerly stored
            # here first was overwritten at once and has been dropped.
            entropy[elm] = float(entropy_st)
    return entropy


# NOTE(review): this definition runs past the reviewed excerpt; the visible
# part is left exactly as found.
def get_best_seq(seqs):
    ls = []
    for seq in seqs:
        ls.append([seqs[seq],seq])
    ls.sort()
    #if len(ls) > 1:
    #    print ls[0], ls[1]
import utils_motif, sys conserved_file = sys.argv[1] elm_file = sys.argv[2] conserved = utils_motif.protein2annotation(conserved_file, {'ELM':True}) elms_pre = utils_motif.protein2annotation(elm_file, {'ELM':True}) elms = {} for protein in elms_pre: vp = protein.split('.')[-1] if not vp in elms: elms[vp] = {} for elm in elms_pre[protein]: if not elm in elms[vp]: elms[vp][elm] = [] for tri in elms_pre[protein][elm]: elms[vp][elm].append(tri) elm2seq2count = {} for vp in conserved: for elm in conserved[vp]: for [st, stp, seq] in elms[vp][elm]: if not elm in elm2seq2count: elm2seq2count[elm] = {} if not seq in elm2seq2count[elm]: elm2seq2count[elm][seq] = 0 elm2seq2count[elm][seq] += 1 for elm in elm2seq2count: total = 0 for seq in elm2seq2count[elm]:
import utils_motif, sys

# Command line: <flu> <species> <strain>
flu = sys.argv[1]
species = sys.argv[2]
strain = sys.argv[3]

# The human sets use the 'human.' prefix; the comparison sets use the `flu`
# argument as prefix, although they are bound to the names swine*.
human = utils_motif.protein2annotation('human.' + strain + '.elms', {'ELM':True})
human_conserved = utils_motif.protein2annotation('human.' + strain + '.elms.90', {'ELM':True})
swine = utils_motif.protein2annotation(flu + '.' + strain + '.elms', {'ELM':True})
swine_conserved = utils_motif.protein2annotation(flu + '.' + strain + '.elms.90', {'ELM':True})


def get_freq(afile):
    """Load a four-column, tab-separated file into ``{elm: {seq: frequency}}``.

    The columns are read as ELM, sequence, a third field that is not used
    here, and the frequency, which is converted to ``float``.
    """
    freq = {}
    with open(afile) as handle:
        for row in handle:
            # Disabled code here once parsed a two-column 'elm:seq<TAB>freq'
            # layout instead.
            elm, seq, _unused, freq_text = row.strip().split('\t')
            freq.setdefault(elm, {})[seq] = float(freq_text)
    return freq


# NOTE(review): this definition runs past the reviewed excerpt; the visible
# part is left exactly as found.
def get_best_seq(seqs):
    ls = []
    for seq in seqs:
        ls.append([seqs[seq],seq])