예제 #1
0
def _get_id2gos(file_id2gos, godag, name2go):
    """Return id-to-GOs annotations, creating the annotation file if needed.

    If ``file_id2gos`` already exists, its 'CC' associations are read with
    ``IdToGosReader`` and returned. Otherwise a fixed number of synthetic
    genes (consecutive integers starting at 0) is assigned to each of a
    hard-coded list of named GO terms; the GO-to-genes mapping is converted
    with ``get_b2aset``, written to ``file_id2gos``, and returned.

    Args:
        file_id2gos: Path of the id-to-GOs annotation file.
        godag: GO DAG handed to ``IdToGosReader`` when the file exists.
        name2go: Mapping from a term name ('A', 'B', ...) to its GO ID.
    """
    if os.path.exists(file_id2gos):
        reader = IdToGosReader(file_id2gos, godag=godag)
        return reader.get_id2gos('CC')

    # (term name, number of synthetic genes annotated directly to the term)
    name_qty_pairs = (
        ('A', 10),
        ('B', 10),
        ('C', 10),
        ('D', 10),
        ('E', 10),
        ('F', 10),
        ('G', 10),
        ('H', 10),
        ('I', 30),
        ('L', 30),
        ('M', 20),
        ('N', 30),
    )
    goid2qty = {}
    for term_name, num_genes in name_qty_pairs:
        goid2qty[name2go[term_name]] = num_genes

    # Give every GO ID its own run of consecutive gene numbers
    goid2genes = cx.defaultdict(set)
    next_gene = 0
    for goid, num_genes in goid2qty.items():
        goid2genes[goid].update(range(next_gene, next_gene + num_genes))
        next_gene += num_genes

    gene2goids = get_b2aset(goid2genes)
    IdToGosReader.wr_id2gos(file_id2gos, gene2goids)
    return gene2goids
예제 #2
0
def _get_goeaobj(methods=None):
    """Build a GOEnrichmentStudyNS object from the repo's small test data.

    Loads the GO DAG from ``go-basic.obo`` under ``REPO``, reads the
    namespace-to-association mapping from ``tests/data/small_association``,
    and reads the population IDs (one per line) from
    ``tests/data/small_population``.

    Args:
        methods: Passed through unchanged as the ``methods`` argument of
            ``GOEnrichmentStudyNS``.

    Returns:
        The constructed ``GOEnrichmentStudyNS`` object.
    """
    # Read GODag
    obo_fin = os.path.join(REPO, "go-basic.obo")
    obo_dag = get_godag(obo_fin, loading_bar=None)
    # Read association
    fin_assc = "{REPO}/tests/data/small_association".format(REPO=REPO)
    objanno = IdToGosReader(fin_assc, godag=obo_dag)
    ns2assc = objanno.get_ns2assc()
    # Read population IDs; the context manager closes the file handle,
    # which a bare open() inside the comprehension would leave open
    popul_fin = "{REPO}/tests/data/small_population".format(REPO=REPO)
    with open(popul_fin) as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudyNS(popul_ids, ns2assc, obo_dag, methods=methods)
    return goeaobj
예제 #3
0
def _get_id2gos(file_id2gos, godag, name2go, name2num):
    """Return id-to-GOs annotations, creating the annotation file if needed.

    If ``file_id2gos`` already exists, its 'CC' associations are read with
    ``IdToGosReader`` and returned. Otherwise synthetic genes (consecutive
    integers starting at 0) are assigned to GO terms according to
    ``name2num``; the GO-to-genes mapping is converted with ``get_b2aset``,
    written to ``file_id2gos``, and returned.

    Args:
        file_id2gos: Path of the id-to-GOs annotation file.
        godag: GO DAG handed to ``IdToGosReader`` when the file exists.
        name2go: Mapping from a term name to its GO ID.
        name2num: Mapping from a term name to the number of synthetic genes
            to annotate to that term.
    """
    if not os.path.exists(file_id2gos):
        goid2genes = cx.defaultdict(set)
        gene_id = 0
        for term_name, num_genes in name2num.items():
            goid = name2go[term_name]
            # Gene numbers keep increasing across terms so no gene is shared
            for _ in range(num_genes):
                goid2genes[goid].add(gene_id)
                gene_id += 1
        gene2goids = get_b2aset(goid2genes)
        IdToGosReader.wr_id2gos(file_id2gos, gene2goids)
        return gene2goids

    reader = IdToGosReader(file_id2gos, godag=godag)
    return reader.get_id2gos('CC')
예제 #4
0
def get_objanno(fin_anno, anno_type=None, **kws):
    """Return an annotation reader for a GAF, GPAD, gene2go, or id2gos file.

    The annotation type is resolved with ``get_anno_desc``; the matching
    reader class is then constructed with only those keyword arguments that
    the class lists in its ``exp_kws`` attribute.

    Args:
        fin_anno: Annotation file name.
        anno_type: Optional annotation type hint given to ``get_anno_desc``.
        **kws: Candidate keyword arguments for the reader, e.g.
            taxids hdr_only prt allow_missing_symbol.

    Raises:
        RuntimeError: If the resolved type is not one of
            'gene2go', 'gaf', 'gpad', or 'id2gos'.
    """

    def _expected_kws(reader_cls):
        """Keep only the keyword arguments that ``reader_cls`` expects."""
        return {
            key: kws[key]
            for key in reader_cls.exp_kws.intersection(kws.keys())
        }

    anno_type = get_anno_desc(fin_anno, anno_type)
    if anno_type == 'gene2go':
        # kws: taxid taxids
        return Gene2GoReader(fin_anno, **_expected_kws(Gene2GoReader))
    if anno_type == 'gaf':
        return GafReader(fin_anno, **_expected_kws(GafReader))
    if anno_type == 'gpad':
        return GpadReader(fin_anno, **_expected_kws(GpadReader))
    if anno_type == 'id2gos':
        return IdToGosReader(fin_anno, **_expected_kws(IdToGosReader))
    # A None or unrecognized type falls through to here
    message = 'UNEXPECTED ANNOTATION FILE FORMAT: {F} {D}'.format(
        F=fin_anno, D=anno_type)
    raise RuntimeError(message)
예제 #5
0
def intialize_term_counts():
    """Compute a count for every GO term and save the result as JSON.

    Loads the GO DAG from ``go-basic.obo`` in ``DATA_DIR``, reads the 'all'
    associations from ``UNIPROT_ASSOCIATIONS_FILE_PATH``, and writes a
    ``{GO ID: count}`` dict (counts from ``TermCounts.get_count``) to
    ``JSON_INDEXED_FILE_PATH``.
    """
    go_dag = GODag(os.path.join(DATA_DIR, "go-basic.obo"))
    reader = IdToGosReader(UNIPROT_ASSOCIATIONS_FILE_PATH, godag=go_dag)
    term_counts = TermCounts(go_dag, reader.get_id2gos('all'))

    go_freq_dict = {
        term.id: term_counts.get_count(term.id)
        for term in go_dag.values()
    }

    # write frequency dict to JSON file
    with open(JSON_INDEXED_FILE_PATH, 'w') as json_file:
        json.dump(go_freq_dict, json_file)
예제 #6
0
def _precompute_term_frequencies():
    """Compute a count for every GO term and alternate ID, and save as JSON.

    Loads the GO DAG from ``GO_DAG_FILE_PATH`` with its ``prt`` output sent
    to the null device, reads the 'all' associations from
    ``UNIPROT_ASSOCIATIONS_FILE_PATH``, and writes a ``{GO ID: count}`` dict
    to ``FREQUENCY_COUNTS_FILE_PATH``. Each alternate ID of a term is stored
    with the same count as the term's primary ID.
    """
    print("Start precomputations of term frequencies...")
    # The null-device handle is only needed while the DAG loads; the context
    # manager closes it instead of leaking an open file object.
    with open(os.devnull, 'w') as devnull:
        go_dag = GODag(GO_DAG_FILE_PATH, prt=devnull)

    associations = IdToGosReader(UNIPROT_ASSOCIATIONS_FILE_PATH,
                                 godag=go_dag).get_id2gos('all')
    term_counts = TermCounts(go_dag, associations)

    go_freq_dict = {}
    for term in go_dag.values():
        # Look the count up once and reuse it for the alternate IDs
        count = term_counts.get_count(term.id)
        go_freq_dict[term.id] = count
        for alt_id in term.alt_ids:
            go_freq_dict[alt_id] = count
    # write frequency dict to JSON file
    with open(FREQUENCY_COUNTS_FILE_PATH, 'w') as json_file:
        json.dump(go_freq_dict, json_file)
def test_tcntobj_relationships(do_plt=False):
    """Check that TermCounts honors relationships such as part_of.

    Builds two TermCounts objects from the yangRWC fig2a ontology and
    annotations, one without relationships and one with 'part_of', and
    asserts the expected gene counts for GO terms B and F in both cases.

    Args:
        do_plt: If true, also plot both sub-DAGs, labelled with their term
            counts, to PNG files under ``REPO``.
    """
    # Input files
    fin_obo = os.path.join(REPO, "tests/data/yangRWC/fig2a.obo")
    fin_anno = os.path.join(REPO, "tests/data/yangRWC/fig2a.anno")
    # Output files (written only when plotting is requested)
    fout_png_r0 = os.path.join(REPO, 'yang_fig2a_r0.png')
    fout_png_r1 = os.path.join(REPO, 'yang_fig2a_r1.png')
    relationships = {'part_of'}

    # Ontologies, loaded with the optional relationship attribute
    go2obj = GODag(fin_obo, optional_attrs=['relationship'])

    # 'CC' annotations
    reader = IdToGosReader(fin_anno, godag=go2obj)
    assoc = reader.get_id2gos('CC')

    # Term counts without (r0) and with (r1) relationships
    tcntobj_r0 = TermCounts(go2obj, assoc)
    # relationship: G (GO:0000007) is part_of F (GO:0000006)
    tcntobj_r1 = TermCounts(go2obj, assoc, relationships)

    # Adding relationships does not change the total count of genes:
    assert tcntobj_r0.gocnts['GO:0005575'] == tcntobj_r1.gocnts['GO:0005575']
    # Counts without relationships:
    assert tcntobj_r0.gocnts['GO:0000002'] == 40  # GO Term B
    assert tcntobj_r0.gocnts['GO:0000006'] == 10  # GO Term F
    # Counts with relationships: F counts G's 30 genes, so does B
    assert tcntobj_r1.gocnts['GO:0000002'] == 70  # GO Term B
    assert tcntobj_r1.gocnts['GO:0000006'] == 40  # GO Term F

    if not do_plt:
        return

    # Optionally visualize the difference between term counts w and wo/relationships
    plot_jobs = (
        (tcntobj_r0, fout_png_r0),
        (tcntobj_r1, fout_png_r1),
    )
    for tcntobj, fout_png in plot_jobs:
        gosubdag = tcntobj.gosubdag
        go2txt = {}
        for ntgo in gosubdag.go2nt.values():
            go2txt[ntgo.GO] = 'tcnt={}'.format(ntgo.tcnt)
        GoSubDagPlot(gosubdag, go2txt=go2txt).plt_dag(fout_png)
예제 #8
0
                               uniprot_notnull['Encoding'])
])

# Run gene ontology enrichment analysis
# Get http://geneontology.org/ontology/go-basic.obo

from goatools.base import download_go_basic_obo
obo_fname = download_go_basic_obo()

# Remove spaces from the GO ID column, then write the table (minus the listed
# columns) as a headerless, tab-separated file that IdToGosReader reads below
uniprot_df['Gene ontology IDs'] = uniprot_df['Gene ontology IDs'].str.replace(
    ' ', '')
uniprot_df.drop(['Encoding', 'Organism', 'Protein families', 'n'],
                axis=1).to_csv("GOA.txt", sep='\t', header=False, index=False)

from goatools.anno.idtogos_reader import IdToGosReader
objanno = IdToGosReader("GOA.txt")
# NOTE(review): despite its name, ns2assoc holds the result of get_id2gos(),
# not get_ns2assc() -- confirm this is what GOEnrichmentStudy should receive
ns2assoc = objanno.get_id2gos()

from goatools.obo_parser import GODag
# NOTE(review): the path is hard-coded instead of reusing obo_fname from the
# download step above -- confirm both refer to the same file
obodag = GODag("go-basic.obo")

from goatools.go_enrichment import GOEnrichmentStudy
goeaobj = GOEnrichmentStudy(
    uniprot_df.Entry,
    ns2assoc,  # geneid/GO associations
    obodag,  # Ontologies
    propagate_counts=False,
    alpha=0.001,  # significance cut-off
    methods=['fdr_bh'])  # multipletest correction method

gos = []
예제 #9
0
# Send everything written to stderr to the first Snakemake log file
sys.stderr = open(snakemake.log[0], "w")

import pandas as pd
import matplotlib.pyplot as plt
from goatools.obo_parser import GODag
from goatools.anno.idtogos_reader import IdToGosReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
from goatools.godag_plot import plot_results#, plot_goid2goobj, plot_gos


# read in directed acyclic graph of GO terms / IDs
obodag = GODag(snakemake.input.obo)


# read in mapping gene ids from input to GO terms / IDs
objanno = IdToGosReader(snakemake.input.ens_gene_to_go, godag = obodag)


# get one id2gos mapping per namespace (namespace -> id2gos)
ns2assoc = objanno.get_ns2assc()

# report the number of annotated genes in each namespace
for nspc, id2gos in ns2assoc.items():
    print("{NS} {N:,} annotated genes".format(NS=nspc, N=len(id2gos)))

# read gene diffexp table
all_genes = pd.read_table(snakemake.input.diffexp)

# select genes significantly differentially expressed according to BH FDR of sleuth:
# keep the rows whose 'qval' is below the configured gene-level FDR threshold
fdr_level_gene = float(snakemake.params.gene_fdr)
sig_genes = all_genes[all_genes['qval']<fdr_level_gene]