示例#1
0
def check_enrichment(study_fn,
                     pop_fn,
                     assoc_fn,
                     print_summary=False,
                     save_summary=True,
                     savepath=None,
                     obo_dag=None):
    """Run a GO enrichment study and optionally print and/or save the result.

    Tuning options (``--alpha``, ``--pval``, ``--compare``, ``--ratio``,
    ``--fdr``, ``--indent``) are parsed from the process command line via
    ``optparse``; any positional command-line arguments are discarded and
    replaced by the three function arguments below.

    Args:
        study_fn: Study gene set, handed to ``read_geneset``. Also used to
            derive the default output path, where it is treated as a
            "/"-separated path string.
        pop_fn: Population gene set, handed to ``read_geneset``.
        assoc_fn: Gene-to-GO associations, handed to ``read_associations``.
        print_summary: If true, call ``print_summary`` on the study object.
        save_summary: If true, write the results with ``wr_tsv``.
        savepath: Output path for ``wr_tsv``. When ``None``, the final path
            component of ``study_fn`` is prefixed with ``"enrichment_"``.
        obo_dag: Pre-built GO DAG. When ``None``, one is loaded from
            ``"go-basic.obo"`` (relative to the current working directory).

    Returns:
        None.

    Raises:
        SystemExit: If ``check_bad_args`` reports a problem with the inputs.
        AssertionError: If ``--ratio`` is outside [1, 2] or ``--alpha`` is
            not strictly between 0 and 1.
    """
    p = optparse.OptionParser(__doc__)

    p.add_option('--alpha',
                 default=0.05,
                 type="float",
                 help="Test-wise alpha for multiple testing "
                 "[default: %default]")
    p.add_option('--pval',
                 default=None,
                 type="float",
                 help="Family-wise alpha (whole experiment), only print out "
                 "Bonferroni p-value is less than this value. "
                 "[default: %default]")
    p.add_option('--compare',
                 dest='compare',
                 default=False,
                 action='store_true',
                 help="the population file as a comparison group. if this "
                 "flag is specified, the population is used as the study "
                 "plus the `population/comparison`")
    p.add_option('--ratio',
                 dest='ratio',
                 type='float',
                 default=None,
                 help="only show values where the difference between study "
                 "and population ratios is greater than this. useful for "
                 "excluding GO categories with small differences, but "
                 "containing large numbers of genes. should be a value "
                 "between 1 and 2. ")
    p.add_option('--fdr',
                 dest='fdr',
                 default=False,
                 action='store_true',
                 help="Calculate the false discovery rate (alt. to the "
                 "Bonferroni but slower)")
    p.add_option('--indent',
                 dest='indent',
                 default=False,
                 action='store_true',
                 help="indent GO terms")

    # Only the option values are taken from the command line; the positional
    # arguments are overridden by the explicit function arguments.
    (opts, args) = p.parse_args()
    args = [study_fn, pop_fn, assoc_fn]
    bad = check_bad_args(args)
    if bad:
        print(bad)
        # NOTE: print_help() returns None, so this exits with status 0.
        sys.exit(p.print_help())

    min_ratio = opts.ratio
    if min_ratio is not None:
        assert 1 <= min_ratio <= 2

    assert 0 < opts.alpha < 1, "Test-wise alpha must fall between (0, 1)"

    study_fn, pop_fn, assoc_fn = args
    study, pop = read_geneset(study_fn, pop_fn, compare=opts.compare)
    assoc = read_associations(assoc_fn)
    methods = ["bonferroni", "sidak", "holm"]
    if opts.fdr:
        methods.append("fdr")
    if obo_dag is None:
        obo_file = "go-basic.obo"
        obo_dag = GODag(obo_file=obo_file)
    g = GOEnrichmentStudy(pop,
                          assoc,
                          obo_dag,
                          alpha=opts.alpha,
                          methods=methods)

    results = g.run_study(study)

    if print_summary:
        g.print_summary(results,
                        min_ratio=min_ratio,
                        indent=opts.indent,
                        pval=opts.pval)

    if save_summary:
        if savepath is None:
            # Prefix only the last path component. A plain str.replace() on
            # the basename would also rewrite any directory component that
            # happens to contain the same text (e.g. "genes/genes").
            directory, sep, filename = study_fn.rpartition("/")
            savepath = directory + sep + "enrichment_" + filename
        g.wr_tsv(savepath, results)
示例#2
0
def term_enrichment(pop_genes,
                    gene_sets,
                    obo_path,
                    assoc_path,
                    folder,
                    condition,
                    regenerate=False,
                    test_sig=True,
                    **kwargs):
    """Compute (or load cached) GO enrichment tables for several gene sets.

    For every entry of ``gene_sets`` the gene list is written to
    ``{folder}/go_enrich/{condition}_{name}_list.txt`` and an enrichment
    table is read from the matching ``enrich`` file. If that file is missing
    or cannot be parsed, or if ``regenerate`` is true, the enrichment study
    is run and the table is (re)written first. Each table is filtered to rows
    whose ``p_bonferroni`` is below ``alpha`` and the filtered tables are
    passed to ``all_subsets``.

    Args:
        pop_genes: Iterable of population genes; converted to a ``set``.
        gene_sets: Mapping of gene-set name -> iterable of genes.
        obo_path: Path given to ``GODag(obo_file=...)``.
        assoc_path: Path given to ``read_associations``.
        folder: Base output folder; files go under ``<folder>/go_enrich``.
        condition: Label used as the file-name prefix.
        regenerate: If true, ignore cached enrichment files and always rerun.
        test_sig: If true, print a per-group depth comparison (median depth
            and the value returned by ``d_ks_test``) against all terms.
        **kwargs: Forwarded to ``GOEnrichmentStudy``. ``alpha`` defaults to
            0.05 and ``methods`` to ``["bonferroni", "sidak", "holm"]``.

    Returns:
        The tables returned by ``all_subsets`` concatenated with
        ``pd.concat``, keyed by their group names.
    """
    kwargs.setdefault('alpha', 0.05)
    kwargs.setdefault('methods', ["bonferroni", "sidak", "holm"])

    def _build_study():
        # Load the associations and the GO DAG, then set up the study.
        assoc = read_associations(assoc_path)
        go_dag = GODag(obo_file=obo_path)
        return GOEnrichmentStudy(set(pop_genes), assoc, go_dag, **kwargs)

    # Built eagerly when regenerating; otherwise created lazily on the first
    # cache miss. Previously the study object only existed when
    # ``regenerate`` was true, so a missing cache file raised
    # UnboundLocalError instead of computing the enrichment.
    study = _build_study() if regenerate else None

    go_enrich = OrderedDict()

    for gc, genes in gene_sets.items():
        # Write the gene list to a file
        out_path = '{}/go_enrich/{}_{}_list.txt'.format(folder, condition, gc)

        write_gene_list(genes, out_path)
        # NOTE(review): replaces every 'list' in the path, not just the
        # file-name suffix - confirm folder/condition never contain 'list'.
        enrich_path = out_path.replace('list', 'enrich')

        enrich = None
        if not regenerate:
            try:
                enrich = pd.read_csv(enrich_path, sep='\t', index_col=0)
            except (FileNotFoundError, ValueError):
                # Missing or unparsable cache: fall through and recompute.
                enrich = None

        if enrich is None:
            if study is None:
                study = _build_study()
            r = study.run_study(frozenset(genes))
            study.wr_tsv(enrich_path, r)
            enrich = pd.read_csv(enrich_path, sep='\t', index_col=0)

        enrich = enrich[(enrich.p_bonferroni < kwargs['alpha'])]
        go_enrich[gc] = enrich

    # Only the term tables are used below; the sizes are discarded.
    _, go_terms = all_subsets(go_enrich)
    all_terms = pd.concat(go_terms.values())
    all_depths = all_terms['depth']
    all_median = np.median(all_depths)

    if test_sig:
        for gene_class, terms in go_terms.items():
            d = terms['depth'].values
            if len(d) < 3:
                print(gene_class, ' Skipped')
                continue

            # Pick the test direction from how the group's median depth
            # compares with the median over all terms.
            t_med = np.median(d)
            if t_med > all_median:
                alternative = 'less'
            elif t_med < all_median:
                alternative = 'greater'
            else:
                alternative = 'two.sided'
            ks_p = d_ks_test(d, all_depths, alternative=alternative)
            print(gene_class, t_med, all_median, ks_p, sep='\t')

    return pd.concat(go_terms.values(), keys=go_terms.keys())