def run_all(gbfile, groupfile, transfile, tagFiles):
    """Run the Fisher and binomial tests and print a label before each.

    Reads annotations from ``gbfile`` and SNPs from ``tagFiles`` through the
    ``analyze`` module, maps the SNPs, builds the per-gene site counts
    (``genes_sites_dict``) and per-gene SNP counts
    (``analyze.get_gene_na_ns``), loads the functional groups from
    ``groupfile`` / ``transfile`` (``load_func_assoc``) and finally calls
    ``run_tests`` once with ``fisher_test`` and once with ``binomial_test``.

    Returns nothing; each test label and a trailing blank line are printed.
    """
    # Open question left by the original author:
    # does this filter out replicates that appear in every tag?
    annodb, al, dna = analyze.read_genbank_annots(gbfile)
    gene_snps = analyze.map_snps(analyze.read_tag_files(tagFiles), al, dna)

    # Count nonsyn vs. syn. sites for each gene
    sites_per_gene = genes_sites_dict(annodb)
    # Count nonsyn vs. syn. snps for each gene
    snps_per_gene = analyze.get_gene_na_ns(gene_snps)

    gene_names = annodb.keys()
    groups = load_func_assoc(groupfile, transfile)

    labelled_tests = (
        ("Fisher test", fisher_test),
        ("Binomial Test", binomial_test),
    )
    for label, test in labelled_tests:
        # Single-argument parenthesised print emits the same text under the
        # Python 2 print statement this file relies on.
        print(label)
        run_tests(sites_per_gene, snps_per_gene, gene_names, groups,
                  test_func=test)
        print("")
def hypergeom_cmd(gbfile, groupfile, transfile, tagFiles, N=30):
    """Print pathways that overlap the top-N genes, ordered by scaled p-value.

    NOTE(review): a second ``hypergeom_cmd`` with the same signature is
    defined later in this file; being later, it is the one that stays bound
    to the name.

    The first ``N`` entries of ``phenoseq_top_genes(gbfile, tagFiles)`` give
    the top-gene list (second element of each pair).  For every entry of
    ``load_func_assoc(groupfile, transfile)`` whose gene list (element ``[1]``
    of the value) shares at least one gene with that list, ``p_value`` is
    called with the pathway size, the overlap count and the top-list size.
    NaN results trigger a warning and are skipped; the rest are multiplied by
    the number of pathways (presumably a Bonferroni-style correction --
    confirm) and printed in ascending order as
    ``p,name,gene_count,space-separated genes``.
    """
    ranked = phenoseq_top_genes(gbfile, tagFiles)
    pathways = load_func_assoc(groupfile, transfile)
    top_names = [gene for (_, gene) in ranked[:N]]
    n_pathways = len(pathways)

    scored = []
    for pathway, entry in pathways.items():
        members = entry[1]
        overlap = sum(1 for gene in members if gene in top_names)
        if not overlap:
            # Pathways with no top gene are never scored.
            continue
        raw_p = p_value(len(members), overlap, len(top_names))
        if isnan(raw_p):
            warnings.warn('ignoring invalid NaN pvalues...')
            continue
        scored.append((n_pathways * raw_p, pathway, len(members), members))

    for scaled_p, pathway, size, members in sorted(scored):
        fields = (scaled_p, pathway, size, " ".join(members))
        print(",".join(str(field) for field in fields))
def hypergeom_cmd(gbfile, groupfile, transfile, tagFiles, N=30):
    """Report pathways sharing genes with the top-N list, one CSV line each.

    NOTE(review): an earlier ``hypergeom_cmd`` with the same signature
    precedes this one in the file; this later definition replaces it.

    Steps:
      1. Take the gene (second element) of each of the first ``N`` pairs
         returned by ``phenoseq_top_genes(gbfile, tagFiles)``.
      2. For each ``name -> value`` item of
         ``load_func_assoc(groupfile, transfile)``, use ``value[1]`` as the
         pathway's gene list and count how many of those genes are in the
         top list.
      3. When the count is non-zero, call ``p_value(pathway size, count,
         top-list size)``.  A NaN result is reported through
         ``warnings.warn`` and dropped.
      4. Scale each remaining p-value by the number of pathways (looks like
         a multiple-testing correction -- confirm), sort ascending and print
         ``p,name,gene_count,genes`` with the genes space-separated.
    """
    top_genes = phenoseq_top_genes(gbfile, tagFiles)
    pathway_dict = load_func_assoc(groupfile, transfile)
    shortlist = [gene for (_, gene) in top_genes[:N]]
    shortlist_size = len(shortlist)
    scale = len(pathway_dict)

    rows = []
    for name, record in pathway_dict.items():
        pathway_genes = record[1]
        hits = sum(1 for gene in pathway_genes if gene in shortlist)
        if hits:
            pval = p_value(len(pathway_genes), hits, shortlist_size)
            if not isnan(pval):
                rows.append(
                    (scale * pval, name, len(pathway_genes), pathway_genes))
            else:
                warnings.warn('ignoring invalid NaN pvalues...')

    for corrected, name, gene_count, pathway_genes in sorted(rows):
        columns = [corrected, name, gene_count, " ".join(pathway_genes)]
        print(",".join(str(column) for column in columns))