def check_enrichment(study_fn, pop_fn, assoc_fn, print_summary=False,
                     save_summary=True, savepath=None, obo_dag=None):
    """Run a GO enrichment study and optionally print and/or save the results.

    The three file names are validated with ``check_bad_args``, the gene sets
    are loaded with ``read_geneset`` and the associations with
    ``read_associations``; a ``GOEnrichmentStudy`` is then run on the study
    set.

    NOTE: tuning options (``--alpha``, ``--pval``, ``--compare``, ``--ratio``,
    ``--fdr``, ``--indent``) are still read from the process command line via
    ``optparse`` (``parse_args()`` with no arguments parses ``sys.argv[1:]``),
    so unrelated command-line arguments of the calling program can make
    optparse abort.

    Args:
        study_fn: Path of the study gene file (passed to ``read_geneset``).
        pop_fn: Path of the population gene file (passed to ``read_geneset``).
        assoc_fn: Path of the association file (passed to
            ``read_associations``).
        print_summary: If true, call ``print_summary`` on the study object.
        save_summary: If true, write the results with ``wr_tsv``.
        savepath: Output path for the TSV. When ``None`` the file is written
            next to ``study_fn`` with ``enrichment_`` prepended to its base
            name.
        obo_dag: Pre-built GO DAG. When ``None`` one is loaded from
            ``"go-basic.obo"`` (relative to the current working directory).

    Raises:
        SystemExit: If ``check_bad_args`` reports a problem (help is printed
            first).
        AssertionError: If ``--ratio`` is outside [1, 2] or ``--alpha`` is
            outside (0, 1).
    """
    p = optparse.OptionParser(__doc__)

    p.add_option('--alpha', default=0.05, type="float",
                 help="Test-wise alpha for multiple testing "
                      "[default: %default]")
    p.add_option('--pval', default=None, type="float",
                 help="Family-wise alpha (whole experiment), only print out "
                      "Bonferroni p-value is less than this value. "
                      "[default: %default]")
    p.add_option('--compare', dest='compare', default=False,
                 action='store_true',
                 help="the population file as a comparison group. if this "
                      "flag is specified, the population is used as the study "
                      "plus the `population/comparison`")
    p.add_option('--ratio', dest='ratio', type='float', default=None,
                 help="only show values where the difference between study "
                      "and population ratios is greater than this. useful for "
                      "excluding GO categories with small differences, but "
                      "containing large numbers of genes. should be a value "
                      "between 1 and 2. ")
    p.add_option('--fdr', dest='fdr', default=False,
                 action='store_true',
                 help="Calculate the false discovery rate (alt. to the "
                      "Bonferroni but slower)")
    p.add_option('--indent', dest='indent', default=False,
                 action='store_true', help="indent GO terms")

    # Only the options come from the command line; the positional arguments
    # are always the ones handed to this function.
    (opts, args) = p.parse_args()
    args = [study_fn, pop_fn, assoc_fn]

    bad = check_bad_args(args)
    if bad:
        print(bad)
        sys.exit(p.print_help())

    min_ratio = opts.ratio
    if min_ratio is not None:
        assert 1 <= min_ratio <= 2

    assert 0 < opts.alpha < 1, "Test-wise alpha must fall between (0, 1)"

    study_fn, pop_fn, assoc_fn = args
    study, pop = read_geneset(study_fn, pop_fn, compare=opts.compare)
    assoc = read_associations(assoc_fn)

    methods = ["bonferroni", "sidak", "holm"]
    if opts.fdr:
        methods.append("fdr")

    if obo_dag is None:
        obo_file = "go-basic.obo"
        obo_dag = GODag(obo_file=obo_file)

    g = GOEnrichmentStudy(pop, assoc, obo_dag, alpha=opts.alpha,
                          methods=methods)
    results = g.run_study(study)

    if print_summary:
        g.print_summary(results, min_ratio=min_ratio, indent=opts.indent,
                        pval=opts.pval)

    if save_summary:
        if savepath is None:
            # Prefix only the final path component. A plain str.replace() on
            # the base name would also rewrite any directory component that
            # happens to contain the same text.
            directory, sep, basename = study_fn.rpartition("/")
            savepath = directory + sep + "enrichment_" + basename
        g.wr_tsv(savepath, results)
def term_enrichment(pop_genes, gene_sets, obo_path, assoc_path, folder,
                    condition, regenerate=False, test_sig=True, **kwargs):
    """Compute (or load cached) GO enrichment tables for several gene sets.

    For every entry of ``gene_sets`` the gene list is written to
    ``<folder>/go_enrich/<condition>_<name>_list.txt`` and the enrichment
    table is read from ``<folder>/go_enrich/<condition>_<name>_enrich.txt``.
    If that table is missing or unreadable, or ``regenerate`` is true, the
    enrichment study is run and the table is (re)written first. Each table is
    filtered to rows whose ``p_bonferroni`` is below ``alpha``.

    The filtered tables are handed to ``all_subsets``; the second value it
    returns is used as a mapping of name -> DataFrame with a ``depth`` column.

    Args:
        pop_genes: Iterable of population gene identifiers.
        gene_sets: Mapping of set name -> iterable of gene identifiers.
        obo_path: Path of the OBO file given to ``GODag``.
        assoc_path: Path of the association file given to
            ``read_associations``.
        folder: Base output folder; results go to its ``go_enrich``
            sub-folder, which this function does not create itself.
        condition: Label embedded in the output file names.
        regenerate: If true, ignore cached enrichment tables and recompute.
        test_sig: If true, compare each subset's depths with the pooled depths
            via ``d_ks_test`` and print the result (subsets with fewer than
            three terms are reported as skipped).
        **kwargs: Forwarded to ``GOEnrichmentStudy``. ``alpha`` defaults to
            0.05 and ``methods`` to ``["bonferroni", "sidak", "holm"]``.

    Returns:
        The per-subset DataFrames from ``all_subsets`` concatenated with the
        subset names as the outer index level.
    """
    kwargs.setdefault('alpha', 0.05)
    kwargs.setdefault('methods', ["bonferroni", "sidak", "holm"])

    def build_study():
        """Load associations and the GO DAG and create the study object."""
        assoc = read_associations(assoc_path)
        go_dag = GODag(obo_file=obo_path)
        return GOEnrichmentStudy(set(pop_genes), assoc, go_dag, **kwargs)

    # Build eagerly when regenerating; otherwise defer until a cache miss
    # actually requires it. The study used to exist only when regenerate was
    # true, so a missing cache file raised a NameError.
    study = build_study() if regenerate else None

    go_enrich = OrderedDict()
    for gc, genes in gene_sets.items():
        # Write the gene list to a file
        out_path = '{}/go_enrich/{}_{}_list.txt'.format(folder, condition, gc)
        write_gene_list(genes, out_path)

        # Built from the template rather than out_path.replace('list', ...),
        # which would also rewrite 'list' inside folder/condition/set names.
        enrich_path = '{}/go_enrich/{}_{}_enrich.txt'.format(
            folder, condition, gc)

        enrich = None
        if not regenerate:
            try:
                enrich = pd.read_csv(enrich_path, sep='\t', index_col=0)
            except (FileNotFoundError, ValueError):
                # No usable cached table: fall through and compute it.
                enrich = None

        if enrich is None:
            if study is None:
                study = build_study()
            results = study.run_study(frozenset(genes))
            study.wr_tsv(enrich_path, results)
            # Re-read what was just written so cached and fresh results have
            # exactly the same shape.
            enrich = pd.read_csv(enrich_path, sep='\t', index_col=0)

        go_enrich[gc] = enrich[enrich.p_bonferroni < kwargs['alpha']]

    # Only the second return value of all_subsets is needed here.
    _, go_terms = all_subsets(go_enrich)
    all_terms = pd.concat(list(go_terms.values()))
    all_depths = all_terms['depth']
    all_median = np.median(all_depths)

    if test_sig:
        for gene_class, terms in go_terms.items():
            d = terms['depth'].values
            if len(d) < 3:
                print(gene_class, ' Skipped')
                continue

            # Pick the alternative from the direction of the median shift.
            t_med = np.median(d)
            if t_med > all_median:
                alternative = 'less'
            elif t_med < all_median:
                alternative = 'greater'
            else:
                alternative = 'two.sided'

            ks_p = d_ks_test(d, all_depths, alternative=alternative)
            print(gene_class, t_med, all_median, ks_p, sep='\t')

    return pd.concat(list(go_terms.values()), keys=list(go_terms.keys()))