def seqDistance(virus_d, host_d):
    """Add up the virus_d values of the entries shared with host_d.

    The shared entries are whatever utils_graph.intersectLists yields for
    [virus_d, host_d] (presumably the keys present in both -- confirm
    against utils_graph).  host_d only influences which entries are
    shared; its values are never read.

    Returns the total as a float (0.0 when nothing is shared).
    """
    shared = utils_graph.intersectLists([virus_d, host_d])
    # Accumulate explicitly from a float zero so the result is always a
    # float and the additions happen in iteration order.
    total = float(0)
    for shared_seq in shared:
        total += virus_d[shared_seq]
    return total
def getConservedELMs(virus, subtypes):
    """Intersect the ELM annotations of every subtype of a virus.

    For each subtype listed in subtypes[virus], the annotation file
    <RESULTSDIR>/<virus>.<subtype>.elms.70.controled is loaded with
    utils_motif.annotation2protein (passing {'ELM': True}), and the loaded
    results are handed to utils_graph.intersectLists.

    virus    -- key into subtypes; also the file name prefix.
    subtypes -- {} of virus to an iterable of subtype names.

    Returns whatever utils_graph.intersectLists returns for the list of
    per-subtype annotations.
    """
    per_subtype = []
    for subtype in subtypes[virus]:
        file_name = virus + '.' + subtype + '.elms.70.controled'
        annotation_path = os.path.join(local_settings.RESULTSDIR, file_name)
        # A fresh option dict per call, as in the one-liner this replaces.
        annotations = utils_motif.annotation2protein(annotation_path,
                                                     {'ELM': True})
        per_subtype.append(annotations)
    return utils_graph.intersectLists(per_subtype)
def stEvalInfo(d1, d2, bg):
    """Summarise the overlap between d1 and d2 as a tab-separated string.

    The overlap is utils_graph.intersectLists([d1, d2]); its p-value is
    utils_stats.prob3(|bg|, |d1|, |d2|, |overlap|).

    Returns '<|d1|>\\t<|overlap|>\\t<pval>'.
    """
    overlap = utils_graph.intersectLists([d1, d2])
    bg_size, d1_size, d2_size, overlap_size = [
        len(mapping.keys()) for mapping in (bg, d1, d2, overlap)
    ]
    pval = utils_stats.prob3(bg_size, d1_size, d2_size, overlap_size)
    fields = (str(d1_size), str(overlap_size), str(pval))
    return '\t'.join(fields)
def getEnrichedClusters(gmeans_clusters, target_sets, background_genes, pval_cut):
    """Find the target sets that significantly overlap each cluster.

    Every cluster gene and every target-set gene must be a key of
    background_genes.  If one is not, a message is printed and the process
    exits with status 1 (clusters are checked before target sets).

    Parameters:
        gmeans_clusters  -- {} of cluster_id to {} of genes.
        target_sets      -- {} of cat to {} of genes.
        background_genes -- {} keyed by gene; its size is the background
                            size passed to utils_stats.prob3.
        pval_cut         -- only overlaps whose pval is strictly below
                            this value are reported.

    Returns:
        {} of cluster_id to {} of cat to
        [match_genes {}, cluster_per, target_per, pval], where cluster_per
        and target_per are the truncated integer percentages of the
        cluster and of the target set covered by match_genes.  Clusters
        with no overlap under the cutoff are absent from the result.
    """
    def percent(part, whole):
        # An empty cluster/target set has nothing to cover: report 0
        # instead of dividing by zero.
        if not whole:
            return 0
        return int(100.0 * part / whole)

    # Check that everything is in the background before computing any
    # statistics; the p-values are meaningless otherwise.
    checks = (
        (gmeans_clusters, 'cluster gene not in background genes'),
        (target_sets, 'target set gene not in background genes'),
    )
    for gene_sets, message in checks:
        for genes in gene_sets.values():
            for gene in genes:
                if gene not in background_genes:
                    print(message)
                    # Non-zero status so callers and shell pipelines can
                    # tell the run was aborted.
                    sys.exit(1)

    ret_clusters = {}
    background_len = len(background_genes)
    for a_set, target_genes in target_sets.items():
        target_len = len(target_genes)
        for cluster, cluster_genes in gmeans_clusters.items():
            cluster_len = len(cluster_genes)
            match_genes = utils_graph.intersectLists([cluster_genes, target_genes])
            match_len = len(match_genes)
            pval = utils_stats.prob3(background_len, cluster_len,
                                     target_len, match_len)
            if pval < pval_cut:
                ret_clusters.setdefault(cluster, {})[a_set] = [
                    match_genes,
                    percent(match_len, cluster_len),
                    percent(match_len, target_len),
                    pval,
                ]
    return ret_clusters
# print elm # with open('mammal_bird.notTest', 'w') as f: # for elm in control_elms: # if not elm in test_elms: # if elm in elm2fracs: # if check_gtr(elm, elm2fracs): # f.write(elm + '\tGTR\n') # elif check_less(elm, elm2fracs): # f.write(elm + '\tLESS\n') # else: # f.write(elm + '\tSAME\n') test_elms = {} with open('mammal_bird.' + cut + '.test', 'w') as f: for elm in utils_graph.intersectLists([use_elms,freq_elms]): if elm in mammal_elms and elm in bird_elms: control_elms[elm] = True elif elm in elm2fracs: test_elms[elm] = True if check_gtr(elm, elm2fracs): f.write(elm + '\tGTR\n') elif check_less(elm, elm2fracs): f.write(elm + '\tLESS\n') else: f.write(elm + '\tSAME\n') else: test_elms[elm] = True if check_gtr(elm, elm2freq): f.write(elm + '\tGTR\n') elif check_less(elm, elm2freq):
non_virus_elms = 0 non_virus_elms_all = 0 control_elms = {} with open('mammal_bird.different.' + str(cut) + '.notest', 'w') as f: for elm in elm2freq: if not elm in test_elms: control_elms[elm] = True non_virus_elms_all += 1 if not elm in ignore_elms: count,same = test_it(elm, elm2freq, f) non_virus_elms += count non_virus_elms_same += same diff_diff = virus_elm_count-virus_elms_same diff_same = virus_elms_same + len(test_elms.keys())-virus_elm_count-len(utils_graph.intersectLists([test_elms,ignore_elms])) diff_bg_diff = non_virus_elms-non_virus_elms_same diff_bg_same = non_virus_elms_same + non_virus_elms_all-non_virus_elms-len(utils_graph.intersectLists([control_elms,ignore_elms])) with open(str(cut) + '.different.results', 'w') as f: p = utils_stats.fisher_positive_pval([diff_diff,diff_same], [diff_bg_diff,diff_bg_same]) f.write('pvalue\t' + str(p) + '\n') f.write('virus\t' + str(diff_diff) + '\t' + str(diff_same) + '\n') f.write('nvirus\t' + str(diff_bg_diff) + '\t' + str(diff_bg_same) + '\n') test_elms_2 = {} for c in common_all_elms: if not c in test_elms: test_elms_2[c] = True
#!/usr/bin/env python
"""For each HCV protein, calculate the likelihood of the GO BP similarity
between predictions and gold standard.  Do this for H1H2 & H1.

Usage: <script> hhe_file hhp_file background_file out_file

One line per (viral protein, prediction type) is printed to stdout as
'<vp>\\t<pred_type>\\t<pval>' with the p-value rounded to 3 decimals.
"""
import sys
import utils_stats
import utils_graph
import utils_humanVirus
import random
import os

hhe_file = sys.argv[1]
hhp_file = sys.argv[2]
background_file = sys.argv[3]
# NOTE(review): out_file is required on the command line but nothing is
# written to it -- results go to stdout.  Confirm whether that is intended.
out_file = sys.argv[4]

# Precomputed GO similarity table.  Building it takes a long time; it was
# produced with:
#   utils_stats.gene_set_go_sim(background_file, 'results/HPRD.ls.entrez.gosim')
gosim_file = 'results/HPRD.ls.entrez.gosim'

hhe_vp2hp = utils_humanVirus.loadHHETargetPairs(hhe_file)
pred2vp2hp = utils_humanVirus.loadPredictions_predType2vp2hp(hhp_file)
all_hps = utils_graph.getNodes(background_file)

for pred_type in ('h1', 'h1h2'):
    for vp in pred2vp2hp[pred_type].keys():
        # Only viral proteins that also have gold-standard (HHE) targets
        # can be scored.
        if vp not in hhe_vp2hp:
            continue
        # Restrict the gold standard to proteins in the background network.
        hhe = list(utils_graph.intersectLists([hhe_vp2hp[vp], all_hps]).keys())
        preds = list(pred2vp2hp[pred_type][vp].keys())
        go_pval = utils_stats.gene_set_go_sim_pval(preds, hhe, gosim_file)
        print('%s\t%s\t%.3f' % (vp, pred_type, go_pval))
elmSeq = elm + ':' + seq if elm == 'TRG_ENDOCYTIC_2': all_elmSeqs[elmSeq] = True flu_counts[flu][elmSeq] += 1 print flu_counts host_all_Seqs = {} for host in hosts: host_counts[host] = defaultdict(utils.init_zero) with open('working/Jun29/elmdict_' + host + '.init') as f: for line in f: (elm, seq, count, fq) = line.strip().split('\t') elmSeq = elm + ':' + seq if elm == 'TRG_ENDOCYTIC_2': #all_elmSeqs[elmSeq] = True host_all_Seqs[elmSeq] = True host_counts[host][elmSeq] += int(count) print len(utils_graph.intersectLists([host_all_Seqs, all_elmSeqs])) flu_vecs = mk_count_vecs(flu_counts, all_elmSeqs) flu_dists = mk_count_dists(flu_vecs) host_vecs = mk_count_vecs(host_counts, all_elmSeqs) host_dists = mk_count_dists(host_vecs) js_distances = defaultdict(dict) for host in hosts: for flu in flus: js_dis = utils.jensen_shannon_dists(host_dists[host], flu_dists[flu]) js_distances[host][flu] = js_dis print host, flu, js_dis
for line in f: (elm, seq, count, fq) = line.strip().split('\t') elmSeq = elm + ':' + seq if elm in mapping: if elmSeq in mapping[elm]: key = mapping[elm][elmSeq] host_counts[host][key] += int(count) found_seqs[-1][key] = True # else: # host_counts[host][elmSeq] += int(count) # found_seqs[-1][elmSeq] = True # else: # host_counts[host][elmSeq] += int(count) # found_seqs[-1][elmSeq] = True use_seqs = utils_graph.intersectLists(found_seqs) host_vecs = mk_count_vecs(host_counts, use_seqs) host_dists = mk_count_dists(host_vecs) tmp_input = 'tmp_data' tmp_r = 'tmp_r' + str(random.randint(0,100)) tmp_labels = 'labels' + str(random.randint(0,100)) out_file = 'working/try_clusters.png' js_distances = defaultdict(dict) for host1, host2 in itertools.combinations(hosts, 2): js_dis = utils.jensen_shannon_dists(host_dists[host1], host_dists[host2]) js_distances[host1][host2] = js_dis js_distances[host2][host1] = js_dis
"../../Runs/Conservation70_Cutoff.2_Window10", "../../Data/human.hprd.prosite", "some out file", ] utils_scripting.checkStart(sys.argv, req_args, examples, len(req_args), True) hhe_vp2hp = utils_humanVirus.loadHHETargetPairs(sys.argv[1]) pred2vp2hp = utils_humanVirus.loadPredictions_predType2vp2hp(sys.argv[2]) all_hps = utils_graph.getNodes(sys.argv[3]) with open(sys.argv[4], "w") as fout: fout.write("Prediction Type\tVP\tHHE\tHHP\tMatch\tPrecsion\tRecall\tRandomPrecision\tPval\n") for predtype in pred2vp2hp.keys(): for vp in pred2vp2hp[predtype].keys(): if hhe_vp2hp.has_key(vp): hhe = utils_graph.intersectLists([hhe_vp2hp[vp], all_hps]) hhe_len = len(hhe.keys()) preds = pred2vp2hp[predtype][vp] preds_len = len(preds.keys()) match = utils_graph.intersectLists([hhe, preds]) match_len = len(match.keys()) precision = int(round(float(100) * float(match_len) / float(preds_len))) if hhe_len > 0: recall = int(round(float(100) * float(match_len) / float(hhe_len))) else: recall = "NA" random_precision = int(round(float(100) * float(hhe_len) / float(len(all_hps.keys())))) if match_len != 0: pval = utils_stats.prob3(len(all_hps.keys()), preds_len, hhe_len, match_len) else: