Exemplo n.º 1
0
    assoc = read_associations(assoc_fn)

    methods = args.method.split(",")
  
    if args.fdr:
        methods.append("fdr")

    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop, assoc, obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          methods=methods)
    results = g.run_study(study)
    if args.outfile is None:
        g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval)
    else:
        # Users can print to both tab-separated file and xlsx file in one run.
        outfiles = args.outfile.split(",") 
        prt_if = None # Print all values
        if args.pval is not None:
            # Only print out when uncorrected p-value < this value.
            prt_if = lambda nt: nt.p_uncorrected < args.pval
        for outfile in outfiles:
            if outfile.endswith(".xlsx"):
                g.wr_xlsx(outfile, results, prt_if=prt_if)
            else:
                g.wr_tsv(outfile, results, prt_if=prt_if)
            
# Copyright (C) 2010-2016, H Tang et al., All rights reserved.
Exemplo n.º 2
0
                 "background population. Please check.\n".format(overlap))

    assoc = read_associations(assoc_fn)

    methods = args.method.split(",")

    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop, assoc, obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          pvalcalc=args.pvalcalc,
                          methods=methods)
    results = g.run_study(study)
    if args.outfile is None:
        g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval)
    else:
        # Users can print to both tab-separated file and xlsx file in one run.
        outfiles = args.outfile.split(",")
        prt_if = None # Print all values
        if args.pval is not None:
            # Only print out when uncorrected p-value < this value.
            prt_if = lambda nt: nt.p_uncorrected < args.pval
        for outfile in outfiles:
            if outfile.endswith(".xlsx"):
                g.wr_xlsx(outfile, results, prt_if=prt_if)
            else:
                g.wr_tsv(outfile, results, prt_if=prt_if)

# Copyright (C) 2010-2016, H Tang et al., All rights reserved.
Exemplo n.º 3
0
    assoc = read_associations(assoc_fn)

    methods = args.method.split(",")

    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop, assoc, obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          pvalcalc=args.pvalcalc,
                          methods=methods)
    results = g.run_study(study)
    if args.outfile is None:
        g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval)
    else:
        # Users can print to both tab-separated file and xlsx file in one run.
        outfiles = args.outfile.split(",")
        if args.pval is not None:
            # Only print results when uncorrected p-value <= this value.
            num_orig = len(results)
            results = [r for r in results if r.p_uncorrected <= args.pval]
            sys.stdout.write("{N:7,} of {M:,} results have uncorrected P-values <= {PVAL}=pval\n".format(
                N=len(results), M=num_orig, PVAL=args.pval))
        for outfile in outfiles:
            if outfile.endswith(".xlsx"):
                g.wr_xlsx(outfile, results, indent=args.indent)
            else:
                g.wr_tsv(outfile, results, indent=args.indent)

# Copyright (C) 2010-2018, H Tang et al. All rights reserved.
Exemplo n.º 4
0
def goe(
    genelist,
    go_file,
    goa_file,
    bg=None,
    nmin=5,
    conversion=None,
    evidence_set={
        'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'HTP', 'HDA', 'HMP', 'HGI', 'IBA',
        'IBD', 'IKR', 'IRD', 'ISS', 'ISO', 'ISA', 'ISM'
    }):
    r"""Finds GO enrichment with goatools (0.7.11 tested).

    **WARNING**\ : This method is inexact for multi-maps in gene name conversion. However, it has a negligible effect in top GO component removal in single-cell co-expression.

    Parameters
    ------------
    genelist:   list of str
        Genes to search for enrichment. Must be a non-empty list.
    go_file:    str
        File path for GO DAG (downloadable at http://geneontology.org/docs/download-ontology/).
    goa_file:   str
        File path for GO associations. See parameter **conversion**.
    bg:         list of str
        Background genes. If None, all genes in the GO association file are used. When **conversion** is given, bg must contain every gene in genelist.
    nmin:       int
        Minimum number of principal genes required in GO. Values below 1 are treated as 1.
    conversion: tuple
        Conversion of `gene ID system <https://docs.mygene.info/en/latest/doc/data.html>`_ from gene list to the GO annotation, as a 3-tuple (name_from, name_to, species):

        * name_from:    Gene naming system of genelist. For gene names, use 'symbol,alias'.
        * name_to:      Gene naming system of goa_file. Examples:

            * Human: use 'uniprot.Swiss-Prot' (for GO annotations downloaded from http://geneontology.org/gene-associations/goa_human.gaf.gz).
            * Mouse: use 'MGI' (for GO annotations downloaded from http://current.geneontology.org/annotations/mgi.gaf.gz).

        * species:      Species for gene name conversion. Examples: 'human', 'mouse'.

    evidence_set:   set of str
        `GO evidences <http://geneontology.org/docs/guide-go-evidence-codes/>`_ to include. Defaults to non-expression based results to avoid circular reasoning bias. The default set is only read, never modified.

    Returns
    ----------
    goe:        pandas.DataFrame
        GO enrichment, sorted by uncorrected P-value.
    gotop:      str
        Top enriched GO ID
    genes:      list of str or None
        Intersection list of genes in gotop and also bg. None if bg is None.

    Raises
    ----------
    ValueError
        If the enrichment output is empty, or if no GO term has odds ratio > 1 with at least nmin study genes.

    """
    from tempfile import NamedTemporaryFile
    from os import linesep, remove
    from goatools.go_enrichment import GOEnrichmentStudy
    from goatools.obo_parser import GODag
    from goatools.associations import read_gaf
    from collections import defaultdict
    import itertools
    from biothings_client import get_client
    import pandas as pd
    import logging
    assert isinstance(genelist, list) and len(genelist) > 0
    if nmin < 1:
        nmin = 1

    # Keep the caller's background (original naming system) for the final gene lookup.
    bg0 = bg
    # Convert gene names
    if conversion is not None:
        assert len(conversion) == 3
        name_from, name_to, species = conversion
        mg = get_client('gene')
        queries = set(genelist)
        if bg is not None:
            bgset = set(bg)
            # The gene list must be a subset of the background.
            assert len(queries - bgset) == 0
            queries |= bgset
        hits = mg.querymany(list(queries),
                            scopes=name_from,
                            fields=name_to,
                            species=species)
        # Only keep hits carrying the query, a score and the top-level target field.
        required = {'query', '_score', name_to.split('.')[0]}
        hits = [x for x in hits if required.issubset(set(x))]
        # Ascending score order: for duplicated queries the best-scoring hit
        # is inserted last and therefore wins in the dict below.
        hits = sorted(hits, key=lambda x: x['_score'])
        convert = {x['query']: x for x in hits}
        # Walk down the dotted field path (e.g. 'uniprot.Swiss-Prot').
        for field in name_to.split('.'):
            convert = {k: v[field] for k, v in convert.items() if field in v}
        # Multi-valued targets: take the first entry (source of the inexactness
        # mentioned in the docstring warning).
        convert = {
            k: v if isinstance(v, str) else v[0]
            for k, v in convert.items()
        }
        genelist2 = list({convert[x] for x in genelist if x in convert})
        if bg is not None:
            bg = list({convert[x] for x in bg if x in convert})
        # Reverse maps from converted ID to original names:
        # converti covers study genes only, convertia covers every converted gene.
        study_set = set(genelist)
        converti = defaultdict(list)
        convertia = defaultdict(list)
        for name, target in convert.items():
            convertia[target].append(name)
            if name in study_set:
                converti[target].append(name)
        converti = dict(converti)
        convertia = dict(convertia)
    else:
        genelist2 = genelist

    # Load GO DAG and association files
    logging.debug('Reading GO DAG file %s', go_file)
    godag = GODag(go_file)
    logging.debug('Reading GO association file %s', goa_file)
    goa = read_gaf(goa_file, evidence_set=evidence_set)
    if bg is None:
        bg = list(goa.keys())

    # Compute enrichment
    study = GOEnrichmentStudy(bg, goa, godag)
    results = study.run_study(genelist2)
    # Format output: round-trip through a TSV file written by goatools.
    # The temporary file is closed before goatools reopens it by name, because
    # reopening a still-open NamedTemporaryFile is not portable (fails on Windows).
    tmp = NamedTemporaryFile(delete=False)
    tmp.close()
    try:
        study.wr_tsv(tmp.name, results)
        with open(tmp.name, 'rb') as f:
            raw = f.read()
    finally:
        remove(tmp.name)
    rows = [x.split('\t') for x in raw.decode().split(linesep)]
    # Drop the trailing fragment produced by the final line separator.
    if len(rows[-1]) < 2:
        rows = rows[:-1]
    if len(rows) == 0 or len(rows[0]) == 0:
        raise ValueError('No enrichment found. Check your input ID type.')
    # The header line starts with '# '.
    rows[0][0] = rows[0][0].strip('# ')
    ans = pd.DataFrame(rows[1:], columns=rows[0])
    ans.drop(['NS', 'enrichment', 'study_count', 'p_sidak', 'p_holm'],
             axis=1,
             inplace=True)
    for xj in ['p_uncorrected', 'p_bonferroni']:
        ans[xj] = pd.to_numeric(ans[xj], errors='raise')
    ans['depth'] = pd.to_numeric(ans['depth'],
                                 errors='raise',
                                 downcast='unsigned')
    # Odds ratio column and sort column
    ans['odds_ratio'] = toratio(ans['ratio_in_study']) / toratio(
        ans['ratio_in_pop'])
    # Explicit copy so the column assignments below act on an independent
    # frame and do not raise SettingWithCopyWarning.
    ans = ans[[
        'name', 'depth', 'p_uncorrected', 'p_bonferroni', 'odds_ratio',
        'ratio_in_study', 'ratio_in_pop', 'GO', 'study_items'
    ]].copy()
    ans['study_items'] = ans['study_items'].apply(lambda x: x.replace(' ', ''))
    # Convert back study_items
    if conversion is not None:
        ans['study_items'] = ans['study_items'].apply(lambda x: ','.join(
            itertools.chain.from_iterable(converti[y] for y in x.split(',')))
                                                      if len(x) > 0 else x)
    ans.sort_values('p_uncorrected', inplace=True)

    # Get top enriched GO by P-value
    gotop = ans[
        (ans['odds_ratio'] > 1)
        & ans['ratio_in_study'].apply(lambda x: int(x.split('/')[0]) >= nmin)]
    if len(gotop) == 0:
        raise ValueError('No GO enrichment found for given criteria.')
    gotop = str(gotop.iloc[0]['GO'])
    if bg0 is not None:
        # Children GOs
        gos = set([gotop] + list(godag.query_term(gotop).get_all_children()))
        # Look for genes annotated with the top GO or any of its children
        genes = [x for x in goa if any(y in gos for y in goa[x])]
        if conversion is not None:
            # Map annotation IDs back to the original gene names.
            genes = itertools.chain.from_iterable(
                convertia[x] for x in genes if x in convertia)
        geneset = set(genes)
        # Report in the order of the caller's background list.
        genes = [x for x in bg0 if x in geneset]
    else:
        genes = None
    return (ans, gotop, genes)
Exemplo n.º 5
0
def enrich(gene2go: str,
           study: str,
           obo: str,
           population: str = None,
           geneid2symbol: str = None,
           correct='fdr_bh',
           alpha=0.05,
           top=20,
           goea_out=None,
           dag_out=None,
           dpi=300,
           show_gene_limit=6,
           only_plot_sig=False):
    r"""
    Go enrichment based on goatools
    :param gene2go: a file with two columns: gene_id \t go_term_id
    :param study: a file with at least one column, first column contains gene id, second columns is regulation direction.
        Blank lines are ignored.
    :param obo: go-basic file download from GeneOntology
    :param population: a file with each row contains one gene; default to use all genes in gene2go file as population
    :param geneid2symbol: file with two columns: gene_id \t gene_symbol, used for DAG plot
    :param correct: pvalue adjustment method:
        Method used for testing and adjustment of pvalues. Can be either the
        full name or initial letters. Available methods are:
        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` : step down method using Sidak adjustments
        - `holm` : step-down method using Bonferroni adjustments
        - `simes-hochberg` : step-up method  (independent)
        - `hommel` : closed method based on Simes tests (non-negative)
        - `fdr_bh` : Benjamini/Hochberg  (non-negative)
        - `fdr_by` : Benjamini/Yekutieli (negative)
        - `fdr_tsbh` : two stage fdr correction (non-negative)
        - `fdr_tsbky` : two stage fdr correction (non-negative)
        The value '3' (or 3) is accepted as an alias for `fdr_bh`.
    :param alpha: fdr cutoff, default 0.05
    :param top: n top go terms to plot, taken in the order of the result table
    :param goea_out: output enrichment result file; defaults to `<study>.goea.xls`
    :param dag_out: dag figure file; defaults to `<study>.goea.dag.svg`.
        One figure per namespace is written, with the namespace (BP/MF/CC)
        inserted before the file extension.
    :param dpi: resolution of image, no effect for svg
    :param show_gene_limit: the max number of gene in a node to show
    :param only_plot_sig: only plot dag for significantly enriched terms
    :return: None
    """
    from os.path import splitext

    if str(correct) == '3':
        correct = 'fdr_bh'
    if geneid2symbol:
        with open(geneid2symbol) as handle:
            geneid2symbol = dict(x.strip().split()[:2] for x in handle
                                 if x.strip())
    else:
        geneid2symbol = dict()
    obo = GODag(obo, optional_attrs=['relationship', 'is_a'])
    gene2go = read_associations(gene2go)
    # Read the study file once; blank lines are skipped so that a trailing
    # newline does not crash the first-column extraction.
    with open(study) as handle:
        study_lines = [x.strip() for x in handle]
    study_lines = [x for x in study_lines if x]
    study_genes = [x.split()[0] for x in study_lines]
    try:
        # gene id -> regulation direction (needs two columns on every line)
        reg_dict = dict(x.split()[:2] for x in study_lines)
    except ValueError:
        # dict() rejects rows with fewer than two fields: the file has no
        # regulation column, so map each whole line to an empty direction.
        reg_dict = {x: '' for x in study_lines}
    if not population:
        population = gene2go.keys()
    else:
        with open(population) as handle:
            population = [x.strip().split()[0] for x in handle if x.strip()]

    goea_obj = GOEnrichmentStudy(population,
                                 gene2go,
                                 obo,
                                 propagate_counts=False,
                                 alpha=alpha,
                                 methods=('fdr_bh', ))
    # Keep only terms that have at least one study gene.
    keep_if = lambda r: r.ratio_in_study[0] != 0
    goea_results_all = goea_obj.run_study(study_genes, keep_if=keep_if)
    goea_out = goea_out or study + '.goea.xls'
    goea_obj.wr_tsv(goea_out, goea_results_all)

    def func(y):
        """Annotate a comma-separated gene list as 'gene|direction|symbol' joined by ';'."""
        results = []
        genes = [x.strip() for x in y.split(',')]
        for gene in genes:
            tmp = [gene]
            if gene in reg_dict:
                tmp.append(reg_dict[gene])
            if gene in geneid2symbol:
                tmp.append(geneid2symbol[gene])
            results.append('|'.join(tmp))
        return ';'.join(results)

    table = pd.read_table(goea_out, header=0, index_col=0)
    # Re-correct the p-values with the requested method and update the table.
    fdr = multipletests(table['p_uncorrected'], method=correct)[1]
    table['p_fdr_bh'] = fdr
    # Update goea_results_all as well so the later plotting uses the
    # re-corrected values.
    for r, p_adjusted in zip(goea_results_all, fdr):
        r.p_fdr_bh = p_adjusted
    table.columns = [
        x if x != 'p_fdr_bh' else 'p_corrected' for x in table.columns
    ]
    # 'e' marks terms passing the cutoff, 'p' the others.
    table['enrichment'] = [
        'e' if x <= alpha else 'p' for x in table['p_corrected']
    ]
    table['study_items'] = table.loc[:, 'study_items'].map(func)
    table.to_csv(goea_out, header=True, index=True, sep='\t')

    # -------------------plot dag------------------------
    dag_out = dag_out or study + '.goea.dag.svg'
    # Split on the real extension instead of assuming it is 3 characters long.
    dag_root, dag_ext = splitext(dag_out)
    for each in ['BP', 'MF', 'CC']:
        if only_plot_sig:
            goea_results_sig = table[table['enrichment'] == 'e']
        else:
            goea_results_sig = table.copy()
        goea_results_sig = goea_results_sig[goea_results_sig['NS'] == each]
        if not goea_results_sig.shape[0]:
            print(f"No significant term to plot for {each} ")
            # Skip only this namespace; the remaining ones must still be plotted.
            continue
        if goea_results_sig.shape[0] >= top:
            goea_results_sig = goea_results_sig.iloc[:top]
        goid_subset = list(goea_results_sig.index)
        plot_gos(
            dag_root + '.' + each + dag_ext,
            # Source GO ids. A node missing from the analysis results would be
            # coloured pale green, but that cannot happen here.
            goid_subset,
            obo,
            # use pvals for coloring:"p_{M}".format(M=goea[0].method_flds[0].fieldname)
            goea_results=goea_results_all,
            # We can further configure the plot...
            id2symbol=geneid2symbol,  # Print study gene Symbols, not GeneIDs
            study_items=show_gene_limit,  # Max number of gene Symbols on GO terms
            items_p_line=3,  # Print 3 genes per line
            dpi=0 if dag_out.endswith('svg') else dpi,
        )
Exemplo n.º 6
0
    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop,
                          assoc,
                          obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          pvalcalc=args.pvalcalc,
                          methods=methods)
    results = g.run_study(study)
    if args.outfile is None:
        g.print_summary(results,
                        min_ratio=min_ratio,
                        indent=args.indent,
                        pval=args.pval)
    else:
        # Users can print to both tab-separated file and xlsx file in one run.
        outfiles = args.outfile.split(",")
        prt_if = None  # Print all values
        if args.pval is not None:
            # Only print out when uncorrected p-value < this value.
            prt_if = lambda nt: nt.p_uncorrected < args.pval
        for outfile in outfiles:
            if outfile.endswith(".xlsx"):
                g.wr_xlsx(outfile, results, prt_if=prt_if, indent=args.indent)
            else:
                g.wr_tsv(outfile, results, prt_if=prt_if, indent=args.indent)

# Copyright (C) 2010-2017, H Tang et al. All rights reserved.
Exemplo n.º 7
0
                 "background population. Please check.\n".format(overlap))

    assoc = read_associations(assoc_fn)

    methods = args.method.split(",")

    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop, assoc, obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          pvalcalc=args.pvalcalc,
                          methods=methods)
    results = g.run_study(study)
    if args.outfile is None:
        g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval)
    else:
        # Users can print to both tab-separated file and xlsx file in one run.
        outfiles = args.outfile.split(",")
        prt_if = None # Print all values
        if args.pval is not None:
            # Only print out when uncorrected p-value < this value.
            prt_if = lambda nt: nt.p_uncorrected < args.pval
        for outfile in outfiles:
            if outfile.endswith(".xlsx"):
                g.wr_xlsx(outfile, results, prt_if=prt_if, indent=args.indent)
            else:
                g.wr_tsv(outfile, results, prt_if=prt_if, indent=args.indent)

# Copyright (C) 2010-2016, H Tang et al. All rights reserved.