Exemplo n.º 1
0
def test_find_enrichment():
    """Recreate the enrichment run from run.sh and check the result counts.

    Builds a namedtuple that stands in for the command-line argument
    namespace, runs the study against the population/association files under
    REPO, and checks the results against the expected count per
    multiple-test correction method.
    """
    alpha = 0.05
    methods = ['bonferroni', 'sidak', 'holm', 'fdr_bh']
    data_files = ['data/study', 'data/population', 'data/association']

    # Stand-in for the argument namespace consumed by the helpers below.
    field_names = ("filenames obo "
                   "pval alpha pvalcalc method no_propagate_counts "
                   "compare ratio "
                   "outfile indent min_overlap ")
    ntobj = cx.namedtuple("args_namespc", field_names)

    # The ontology file is downloaded only if needed.
    fin_obo = os.path.join(REPO, 'go-basic.obo')
    download_go_basic_obo(fin_obo, prt=sys.stdout, loading_bar=None)

    args = ntobj(
        filenames=[os.path.join(REPO, name) for name in data_files],
        obo=fin_obo,
        pval=0.05,
        alpha=alpha,
        pvalcalc='fisher',
        method=",".join(methods),
        no_propagate_counts=False,
        compare=False,
        ratio=None,
        outfile=None,
        indent=True,
        min_overlap=0.7,
    )

    # Run the enrichment study.
    study, pop, assoc = rd_files(args.filenames, args.compare)
    results = get_objgoea(pop, assoc, args).run_study(study)

    # Compare against the expected count for each correction method.
    expected_cnts = {'fdr_bh': 17, 'sidak': 5, 'holm': 5, 'bonferroni': 5}
    _chk_results(results, expected_cnts, methods, alpha)
    print("TEST PASSED")
Exemplo n.º 2
0
def dnld_ontology(filename):
    """Test downloading of ontologies.

    Removes any existing copy of ``filename``, calls the downloader twice
    (the second call runs with the file from the first call presumably
    already on disk), and asserts that the file exists afterwards.

    Args:
        filename: Path of the ontology file to (re-)download.
    """
    # Delete through the os module instead of shelling out to "rm -f": the
    # shell string was built from an unquoted path, so it broke on names
    # containing spaces or shell metacharacters and was not portable.
    if os.path.isfile(filename):
        os.remove(filename)
    download_go_basic_obo(filename, loading_bar=None)
    download_go_basic_obo(filename, loading_bar=None)
    assert os.path.isfile(filename), "FILE({F}) EXPECTED TO EXIST".format(F=filename)
Exemplo n.º 3
0
def dnld_ontology(filename):
    """Test downloading of ontologies.

    Starts from a clean slate by deleting ``filename`` if present, then calls
    the downloader twice in a row and checks that the file is on disk.

    Args:
        filename: Path of the ontology file to (re-)download.
    """
    # os.remove replaces the previous 'os.system("rm -f ...")' call, which
    # interpolated the unquoted path into a shell command (fragile with
    # spaces/metacharacters and unavailable on non-POSIX platforms).
    if os.path.isfile(filename):
        os.remove(filename)
    download_go_basic_obo(filename, loading_bar=None)
    download_go_basic_obo(filename, loading_bar=None)
    assert os.path.isfile(filename), "FILE({F}) EXPECTED TO EXIST".format(F=filename)
Exemplo n.º 4
0
def test_i147_all_taxids():
    """Check that Gene2GoReader associations agree across taxid selections.

    The same gene2go file is read three ways (all taxids, taxid 10090 only,
    taxids 10090 + 9606). The associations requested for taxid 10090 must be
    identical regardless of which reader produced them.
    """
    # Download the ontology and the NCBI gene2go associations, if necessary.
    download_go_basic_obo()
    fin_gene2go = download_ncbi_associations()

    # Load the ontology.
    godag = GODag("go-basic.obo")

    # One reader per taxid selection over the same annotation file.
    objanno_all = Gene2GoReader(fin_gene2go, godag=godag, taxids=True)
    objanno_mmu = Gene2GoReader(fin_gene2go, godag=godag, taxids=[10090])
    objanno_mmuhsa = Gene2GoReader(fin_gene2go, godag=godag, taxids=[10090, 9606])

    # Namespace-to-association dicts for each reader/taxid combination.
    ns2assoc_all_mmu = _run_get_ns2assc(10090, objanno_all)
    ns2assoc_mmu_mmu = _run_get_ns2assc(10090, objanno_mmu)
    ns2assoc_mmuhsa_all = _run_get_ns2assc(True, objanno_mmuhsa)
    ns2assoc_mmuhsa_mmu = _run_get_ns2assc(10090, objanno_mmuhsa)

    # The taxid-10090 associations must not depend on the reader used.
    for nspc in ('BP', 'MF', 'CC'):
        assert ns2assoc_mmu_mmu[nspc] == ns2assoc_all_mmu[nspc]
        assert ns2assoc_mmu_mmu[nspc] == ns2assoc_mmuhsa_mmu[nspc]
    _chk_mmuhsa_all(objanno_mmuhsa, objanno_all, ns2assoc_mmuhsa_all)
Exemplo n.º 5
0
def test_i147_all_taxids():
    """Load gene2go for all taxids and print the gene count per namespace."""
    # Download go-basic.obo and NCBI's gene2go, if necessary.
    download_go_basic_obo()
    fin_gene2go = download_ncbi_associations()

    # Load the ontology, then read the annotations for every taxid.
    godag = GODag("go-basic.obo")
    objanno = Gene2GoReader(fin_gene2go, godag=godag, taxids=True)

    # ns2assoc maps a namespace (BP, MF, CC) to an association dict whose
    # keys are NCBI GeneIDs and whose values are sets of GO IDs.
    ns2assoc = objanno.get_ns2assc()

    report = "{NS} {N:,} annotated mouse genes"
    for namespace, geneid2gos in ns2assoc.items():
        print(report.format(NS=namespace, N=len(geneid2gos)))
Exemplo n.º 6
0
    def __GO_enrich__(self):
        """Create the GO enrichment study object and store it in ``self.goeaobj``.

        Downloads go-basic.obo when it is not in the working directory, reads
        the NCBI gene2go annotations for taxid 9606, and builds a
        GOEnrichmentStudyNS over the keys of GeneID2nt_hum.
        """
        go_file = "go-basic.obo"
        obo_missing = not os.path.exists(go_file)
        if obo_missing:
            download_go_basic_obo()

        # Load the gene ontology DAG.
        obodag = GODag(go_file)

        # Read NCBI's gene2go annotations for taxid 9606.
        fin_gene2go = download_ncbi_associations()
        annotations = Gene2GoReader(fin_gene2go, taxids=[9606])

        # ns2assoc maps a namespace (BP, MF, CC) to a dict of
        # NCBI GeneID -> set of GO IDs associated with that gene.
        ns2assoc = annotations.get_ns2assc()

        population = GeneID2nt_hum.keys()  # background gene IDs
        self.goeaobj = GOEnrichmentStudyNS(
            population,
            ns2assoc,
            obodag,
            propagate_counts=False,
            alpha=0.05,  # significance cut-off
            methods=['fdr_bh'],  # multiple-test correction method
        )
Exemplo n.º 7
0
def test_cli():
    """Check WrHierCli keyword parsing and, when GO IDs are given, file output.

    Each entry of ``args_exp`` pairs a command-line argument list with the
    keyword dict that WrHierCli is expected to store in ``kws``. For argument
    lists that yield GO IDs, the hierarchy report is also written to a file
    under REPO and its existence is asserted.
    """
    # pylint: disable=bad-whitespace
    args_exp = [
        # args                   exp_set expected_dict
        # --------               ------- ---------------------
        ([], {
            'dag': 'go-basic.obo',
            'dash_len': 6
        }),
        (['--dag=go-basic.obo'], {
            'dag': 'go-basic.obo',
            'dash_len': 6
        }),
        (['-o rpt.txt'], {
            'dag': 'go-basic.obo',
            'dash_len': 6,
            'o': 'rpt.txt'
        }),
        (['--max_indent=7'], {
            'dag': 'go-basic.obo',
            'dash_len': 6,
            'max_indent': 7
        }),
        (['CC', '--concise'], {
            'dag': 'go-basic.obo',
            'dash_len': 6,
            'GO': ['CC'],
            'concise': True
        }),
        (['--no_indent'], {
            'dag': 'go-basic.obo',
            'dash_len': 6,
            'no_indent': True
        }),
        (['--concise', '--no_indent'], {
            'dag': 'go-basic.obo',
            'dash_len': 6,
            'concise': True,
            'no_indent': True
        }),
    ]
    download_go_basic_obo('go-basic.obo', loading_bar=None)
    for idx, (args, exp_dict) in enumerate(args_exp):
        print("ARGS={ARGS}".format(ARGS=args))
        print("EXP={EXP}".format(EXP=exp_dict))
        obj = WrHierCli(args)
        print("DCT: {DCT}".format(DCT=obj.kws))
        print("WWWWWWWWWWWWWWWWWWW WrHierCli", obj.kws)
        assert obj.kws == exp_dict, "DCT: ACT({}) != EXP({})".format(
            obj.kws, exp_dict)
        print("")
        # Test writing to a file
        if obj.goids:
            fout_txt = os.path.join(REPO, 'wrhier{N}.txt'.format(N=idx))
            # Remove stale output with os.remove instead of a shell "rm -f"
            # command, which depended on an unquoted path and a POSIX shell.
            if os.path.isfile(fout_txt):
                os.remove(fout_txt)
            obj.wrtxt_hier(fout_txt)
            assert os.path.exists(fout_txt), 'FILE NOT FOUND({F})'.format(
                F=fout_txt)
Exemplo n.º 8
0
def test_deprecatedloc_godagtimed():
    """Exercise prt_hms and GoDagTimed (deprecated-location test)."""
    start = timeit.default_timer()
    prt_hms(start, 'prt_hms TESTED')

    # Make sure the ontology file is available, then load it.
    obo_path = os.path.join(REPO, "go-basic.obo")
    download_go_basic_obo(obo_path, loading_bar=None)
    GoDagTimed(obo_path)
Exemplo n.º 9
0
def prepare_data():
    """Download go-basic.obo and build the associations file.

    The GO-term summary file under ``data/`` is passed to ``format_file``,
    which writes its result to ``data/associations.txt``.
    """
    download_go_basic_obo()
    format_file(
        trytrip_file='data/GenesByGoTerm_Summary.txt',
        res_file='data/associations.txt',
    )
Exemplo n.º 10
0
 def wr_subobo(self):
     """Write one subset obo file per entry of NAME2GOIDS, for use in tests."""
     # Every subset is cut from the same full ontology file.
     fin_obo = os.path.join(REPO, "go-basic.obo")
     for subset_name, subset_goids in NAME2GOIDS.items():
         fout_obo = self.get_obo_name(subset_name)
         download_go_basic_obo(fin_obo, prt=sys.stdout, loading_bar=None)
         # Load the optional 'relationship' attribute along with the DAG.
         writer = WrSubObo(fin_obo, optional_attrs=['relationship'])
         writer.wrobo(fout_obo, subset_goids)
 def wr_subobo(self):
     """Write a subset obo for each named GO-ID group in NAME2GOIDS."""
     source_obo = os.path.join(REPO, "go-basic.obo")
     for group_name, group_goids in NAME2GOIDS.items():
         target_obo = self.get_obo_name(group_name)
         # Download the full ontology if needed before reading from it.
         download_go_basic_obo(source_obo, prt=sys.stdout, loading_bar=None)
         # The optional 'relationship' attribute is loaded as well.
         subobo = WrSubObo(source_obo, optional_attrs=['relationship'])
         subobo.wrobo(target_obo, group_goids)
Exemplo n.º 12
0
def test_gosubdag_relationships(prt=sys.stdout):
    """Compare GoSubDag contents built with no, one, and all relationships.

    The same source GO IDs are expanded four times:
      1. with a DAG loaded without relationships,
      2. with a relationship-aware DAG but relationships=False,
      3. with only the 'part_of' relationship,
      4. with all relationships.
    Cases 1 and 2 must match; each later case must contain the previous one.
    """
    goids = {
        "GO:0032501",
        "GO:0044707",  # alt_id: GO:0032501  # BP  1011 L01 D01 B multicellular organismal process
        "GO:0050874",
        "GO:0007608",  # sensory perception of smell
        "GO:0050911",  # detection of chemical stimulus involved in sensory perception of smell
    }

    # Load the GO DAG twice: plain, and with the optional 'relationship' attribute.
    fin_obo = os.path.join(REPO, "go-basic.obo")
    download_go_basic_obo(fin_obo, prt, loading_bar=None)
    go2obj_plain = GODag(fin_obo)
    go2obj_relat = GODag(fin_obo, optional_attrs=['relationship'])

    print("\nCreate GoSubDag with GO DAG containing no relationships.")
    tic = timeit.default_timer()
    subdag = GoSubDag(goids, go2obj_plain, relationships=False, prt=prt)
    goids_plain = set(subdag.go2obj)
    tic = _rpt_hms(tic, len(subdag.go2obj))

    print("\nCreate GoSubDag while IGNORING relationships")
    subdag = GoSubDag(goids, go2obj_relat, relationships=False, prt=prt)
    goids_false = set(subdag.go2obj)
    tic = _rpt_hms(tic, len(subdag.go2obj))
    assert goids_plain == goids_false

    print("\nCreate GoSubDag while loading only the 'part_of' relationship")
    subdag = GoSubDag(goids, go2obj_relat, relationships=['part_of'], prt=prt)
    goids_part_of = set(subdag.go2obj)
    tic = _rpt_hms(tic, len(subdag.go2obj))
    # Adding 'part_of' must keep every plain GO ID and add at least one more.
    assert goids_plain.intersection(goids_part_of) == goids_plain
    assert len(goids_part_of) > len(goids_plain)

    print("\nCreate GoSubDag while loading all relationships")
    subdag = GoSubDag(goids, go2obj_relat, relationships=True, prt=prt)
    goids_true = set(subdag.go2obj)
    tic = _rpt_hms(tic, len(subdag.go2obj))
    # All relationships must cover everything found through 'part_of' alone.
    assert goids_part_of.intersection(goids_true) == goids_part_of
    assert len(goids_true) >= len(goids_part_of)
Exemplo n.º 13
0
 def __init__(self):
     """Load the GO DAG with and without relationships and build both GoSubDags."""
     download_go_basic_obo(self.obo, sys.stdout, loading_bar=None)
     self.godag_r0 = GODag(self.obo)
     self.godag_r1 = GODag(self.obo, optional_attrs={'relationship'})
     # Unique GO IDs of all terms in the plain DAG.
     self.goids = list({term.id for term in self.godag_r0.values()})

     # Both elapsed-time reports below are measured from this one start time.
     tic = timeit.default_timer()

     # GoSubDag (plain)
     self.gosubdag_r0 = GoSubDag(self.goids, self.godag_r0, prt=None)
     msg_r0 = "GoSubDag r0 {N:4} GOs {S:3} srcs".format(
         N=len(self.gosubdag_r0.go2obj), S=len(self.gosubdag_r0.go_sources))
     prt_hms(tic, msg_r0)

     # GoSubDag with relationships
     self.gosubdag_r1 = GoSubDag(self.goids, self.godag_r1, prt=None, relationships=True)
     msg_r1 = "GoSubDag r1 {N:4} GOs {S:3} srcs".format(
         N=len(self.gosubdag_r1.go2obj), S=len(self.gosubdag_r1.go_sources))
     prt_hms(tic, msg_r1)
Exemplo n.º 14
0
def test_dnlds():
    """Test downloads of ontologies and NCBI associations.

    For each file: download it, delete it, download it again, and assert that
    the file exists in the current working directory.
    """
    cwd = os.getcwd()

    # Ontology download.
    file_obo = os.path.join(cwd, "go-basic.obo")
    download_go_basic_obo(file_obo, loading_bar=None)
    # Delete via os.remove rather than a shell "rm -f" string, which relied
    # on an unquoted path (breaks on spaces) and on a POSIX shell.
    if os.path.isfile(file_obo):
        os.remove(file_obo)
    download_go_basic_obo(file_obo, loading_bar=None)
    assert os.path.isfile(file_obo)

    # Association download from NCBI.
    file_assc = os.path.join(cwd, "gene2go")
    download_ncbi_associations(file_assc, loading_bar=None)
    if os.path.isfile(file_assc):
        os.remove(file_assc)
    download_ncbi_associations(file_assc, loading_bar=None)
    assert os.path.isfile(file_assc)
Exemplo n.º 15
0
def run_go_enrichment(strain,
                      genes_of_interest,
                      significant=True,
                      cutoff=0.05,
                      use_parent_terms=True):
    """Run a GO enrichment study for ``genes_of_interest`` in ``strain``.

    Args:
        strain: Strain name; selects ``data/<strain>_all_genes.csv`` as the
            background gene list. For 'PA14', study and background genes are
            passed through ``map_pa14_genes`` first.
        genes_of_interest: Study gene identifiers.
        significant: When exactly ``True``, keep only results whose
            ``p_fdr_bh`` is below ``cutoff``.
        cutoff: Alpha for the study and threshold for the filter above.
        use_parent_terms: Forwarded as ``propagate_counts``.

    Returns:
        A two-item list: ``[enrichment_results, goea_results]``.
    """
    # NOTE: pickle.load must only be used on trusted files.
    association_path = os.path.join('data', 'go_association.pickle')
    with open(association_path, 'rb') as handle:
        go_association = pickle.load(handle)

    background_path = os.path.join('data', strain + '_all_genes.csv')
    background_genes = get_genes(background_path)

    # Download the ontology if needed, then load it.
    download_go_basic_obo()
    obo_dag = GODag('go-basic.obo')

    if strain == 'PA14':
        genes_of_interest = map_pa14_genes(genes_of_interest)
        background_genes = map_pa14_genes(background_genes)

    study = GOEnrichmentStudyNS(
        background_genes,
        go_association,
        obo_dag,
        propagate_counts=use_parent_terms,
        alpha=cutoff,
        methods=['fdr_bh'],
    )
    goea_results = study.run_study(genes_of_interest)

    if significant is True:
        goea_results = [
            record for record in goea_results if record.p_fdr_bh < cutoff
        ]

    return [get_enrichment_results(goea_results), goea_results]
Exemplo n.º 16
0
def get_GO_dag():
    """Return the GODag loaded from GO_PATH, downloading the file on failure.

    If the first load attempt raises, the obo file is downloaded to GO_PATH
    and the load is retried once; a failure of the retry propagates.
    """
    obo_path = GO_PATH.as_posix()
    try:
        return GODag(obo_file=obo_path)
    except Exception:
        # Any load error is handled the same way: fetch the file, retry once.
        download_go_basic_obo(obo=obo_path)
        return GODag(obo_file=obo_path)
Exemplo n.º 17
0
def download_and_move_go_basic_obo(prt):
    """Make sure ``geneinfo_cache/go-basic.obo`` exists and return its path.

    Args:
        prt: Stream forwarded to ``download_go_basic_obo``.

    Returns:
        The string ``'geneinfo_cache/go-basic.obo'``.
    """
    cache_dir = 'geneinfo_cache'
    cached_obo = 'geneinfo_cache/go-basic.obo'

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    if os.path.exists(cached_obo):
        return cached_obo

    # The download lands in the working directory; move it into the cache.
    download_go_basic_obo(prt=prt)
    shutil.move('go-basic.obo', cached_obo)
    return cached_obo
Exemplo n.º 18
0
def test_gosearch(log=sys.stdout):
    """Test the GoSearch class with per-taxid annotations and with none.

    Args:
        log: Stream passed to GoSearch for its log output.
    """
    taxids = [9606, 10090]
    # Download ontologies and annotations, if necessary
    fin_go_obo = os.path.join(REPO, "go-basic.obo")
    download_go_basic_obo(fin_go_obo, loading_bar=None)
    # Because get_assoc_ncbi_taxids returns id2gos, we opt to use the
    # (optional) multi-level dictionary that separates associations by taxid.
    # taxid2asscs contains both GO2GeneIDs and GeneID2GOs.
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids(taxids, taxid2asscs=taxid2asscs, loading_bar=None)

    # Initialize GO-search helper object with obo and annotations(go2items)
    for taxid in taxids:
        obj = GoSearch(fin_go_obo, go2items=taxid2asscs[taxid]['GO2GeneIDs'], log=log)
        assert len(obj.obo_dag) > 40000
    # Bind the annotation-free instance so the assertion below checks it,
    # instead of re-checking the last object created in the loop above.
    obj = GoSearch(fin_go_obo, dict(), log=log)
    assert len(obj.obo_dag) > 40000
Exemplo n.º 19
0
def test_go_print(prt=sys.stdout):
    """Test that all GO Terms can be printed, even if level/depth are not assigned.

    Args:
        prt: Writable stream that receives the reader summary and one line
            per GO record.
    """
    # Pass the stream by keyword: positionally it would be bound to the
    # downloader's first parameter rather than to ``prt``.
    obo_file = download_go_basic_obo(prt=prt)
    reader = goatools.obo_parser.OBOReader(obo_file)
    prt.write("\n{OBJ}\n\n".format(OBJ=reader))
    go_terms = list(reader)
    prt.write("First GO Record: {REC}\n".format(REC=go_terms[0]))
    for idx, go_rec in enumerate(go_terms):
        prt.write("{I:>7,} {RECORD}\n".format(I=idx, RECORD=go_rec))
Exemplo n.º 20
0
def test_go_print(prt=sys.stdout):
    """Print every GO record read by OBOReader to ``prt``.

    Checks that all GO terms can be formatted, even when level/depth have not
    been assigned.
    """
    obo_file = download_go_basic_obo(prt=prt)
    reader = goatools.obo_parser.OBOReader(obo_file)
    prt.write("\n{OBJ}\n\n".format(OBJ=reader))

    records = list(reader)
    prt.write("First GO Record: {REC}\n".format(REC=records[0]))

    row = "{I:>7,} {RECORD}\n"
    for num, record in enumerate(records):
        prt.write(row.format(I=num, RECORD=record))
Exemplo n.º 21
0
def test_tcntobj_relationships(prt=sys.stdout):
    """Test loading of relationships, like part_of, into TermCounts.

    Builds one TermCounts per namespace in NSS, both without relationships
    and with RELS, and hands both sets to ``_chk_pass_fail``.
    """
    fin_obo = os.path.join(REPO, "go-basic.obo")
    fin_anno = os.path.join(REPO, 'goa_human.gpad')

    # Fetch the ontology and the annotation file if they are not present.
    download_go_basic_obo(fin_obo, prt, loading_bar=None)
    dnld_annotation(fin_anno)

    # Ontology without (r0) and with (r1) the optional 'relationship' attribute.
    go2obj_r0 = GODag(fin_obo)
    go2obj_r1 = GODag(fin_obo, optional_attrs=['relationship'])

    # Annotations are read against the plain DAG.
    annoobj = GpadReader(fin_anno, godag=go2obj_r0)

    ns2tcntobj_r0 = {
        nspc: TermCounts(go2obj_r0, annoobj.get_id2gos(nspc))
        for nspc in NSS
    }
    ns2tcntobj_r1 = {
        nspc: TermCounts(go2obj_r1, annoobj.get_id2gos(nspc), RELS)
        for nspc in NSS
    }
    _chk_pass_fail(ns2tcntobj_r0, ns2tcntobj_r1)
Exemplo n.º 22
0
    def load_ontologies_and_associations(self):
        """Download (if needed) and load the GO DAG and human gene2go associations.

        Returns:
            A tuple ``(obodag, geneid2gos_human)``: the loaded GODag and the
            result of ``read_ncbi_gene2go`` for taxid 9606.
        """
        # Single-argument print(...) calls behave the same on Python 2 and 3;
        # the former print statements were a SyntaxError on Python 3.
        print("---LOADING ONTOLOGIES AND ASSOCIATIONS---")
        # Check if files exist and download if not
        obo_fname = download_go_basic_obo()
        download_ncbi_associations()

        # Load ontologies and associations
        obodag = GODag(obo_fname)
        geneid2gos_human = read_ncbi_gene2go("gene2go", taxids=[9606])
        print("{N:,} annotated human genes".format(N=len(geneid2gos_human)))

        return obodag, geneid2gos_human
Exemplo n.º 23
0
def test_go_print(prt=sys.stdout):
    """Write reader metadata and every GO record to ``prt``.

    Checks that all GO terms can be printed, even if level/depth are not
    assigned.
    """
    obo_file = download_go_basic_obo(prt=prt)
    reader = goatools.obo_parser.OBOReader(obo_file)
    # The reader is consumed first; the version fields are read afterwards.
    records = list(reader)

    header = (
        "Python Version: {VER}\n\n".format(VER=sys.version),
        "\nOBOReader: {OBJ}\n\n".format(OBJ=reader),
        "format-version: {VER}\n".format(VER=reader.format_version),
        "data-version: {VER}\n\n".format(VER=reader.data_version),
        "Found {N} GO Records:\n".format(N=len(records)),
    )
    for text in header:
        prt.write(text)

    for num, record in enumerate(records):
        prt.write("{I:>7,} {RECORD}\n".format(I=num, RECORD=record))
Exemplo n.º 24
0
 def __init__(self):
     """Build plain and relationship-aware GO DAGs and their GoSubDags."""
     download_go_basic_obo(self.obo, sys.stdout, loading_bar=None)
     self.godag_r0 = GODag(self.obo)
     self.godag_r1 = GODag(self.obo, optional_attrs={'relationship'})
     # De-duplicated GO IDs of every term in the plain DAG.
     self.goids = list({goterm.id for goterm in self.godag_r0.values()})

     # Start time shared by both timing reports.
     tic = timeit.default_timer()

     # GoSubDag (plain)
     self.gosubdag_r0 = GoSubDag(self.goids, self.godag_r0, prt=None)
     num_gos_r0 = len(self.gosubdag_r0.go2obj)
     num_srcs_r0 = len(self.gosubdag_r0.go_sources)
     prt_hms(tic, "GoSubDag r0 {N:4} GOs {S:3} srcs".format(N=num_gos_r0, S=num_srcs_r0))

     # GoSubDag with relationships
     self.gosubdag_r1 = GoSubDag(self.goids, self.godag_r1, prt=None, relationships=True)
     num_gos_r1 = len(self.gosubdag_r1.go2obj)
     num_srcs_r1 = len(self.gosubdag_r1.go_sources)
     prt_hms(tic, "GoSubDag r1 {N:4} GOs {S:3} srcs".format(N=num_gos_r1, S=num_srcs_r1))
Exemplo n.º 25
0
def get_genes_cell_cycle(taxid=9606, log=sys.stdout):
    """Return the gene IDs associated with 'cell cycle' GO terms for a taxid.

    GO terms matching 'cell cycle' are collected, those matching
    'cell cycle.independent' are removed, and the children of the remaining
    terms are added. The search output is written to
    ``cell_cycle_gos_<taxid>.log`` in the working directory.

    Args:
        taxid: NCBI taxonomy ID whose associations are searched.
        log: Stream for a two-line summary, or None for no summary.

    Returns:
        The items returned by ``GoSearch.get_items`` for the final GO set.
    """
    # Download ontologies and annotations, if necessary.
    fin_go_obo = os.path.join(os.getcwd(), "go-basic.obo")
    download_go_basic_obo(fin_go_obo, loading_bar=None)
    # get_assoc_ncbi_taxids fills this multi-level dict keyed by taxid;
    # it contains both GO2GeneIDs and GeneID2GOs.
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids([taxid], taxid2asscs=taxid2asscs, loading_bar=None)

    # GO-search helper initialised with the obo and the GO -> gene mapping.
    go2geneids = taxid2asscs[taxid]['GO2GeneIDs']
    srch = GoSearch(fin_go_obo, go2items=go2geneids)

    pat_cell_cycle = re.compile(r'cell cycle', flags=re.IGNORECASE)
    # 'cell cycle-independent' terms (e.g. GO:0005764, lysosome) also match
    # the first pattern and have to be excluded again.
    pat_independent = re.compile(r'cell cycle.independent', flags=re.IGNORECASE)

    fout_allgos = "cell_cycle_gos_{TAXID}.log".format(TAXID=taxid)
    with open(fout_allgos, "w") as prt:
        matched = srch.get_matching_gos(pat_cell_cycle, prt=prt)
        excluded = srch.get_matching_gos(pat_independent, gos=matched, prt=prt)
        # Keep the true cell-cycle terms and add their children.
        gos_all = srch.add_children_gos(matched.difference(excluded))
        if log is not None:
            log.write('    taxid {TAXID:>5}\n'.format(TAXID=taxid))
            log.write('    FOUND {N:>5} GOs:   {F}\n'.format(N=len(gos_all),
                                                             F=fout_allgos))
    # Entrez GeneIDs for the cell cycle GOs.
    return srch.get_items(gos_all)
Exemplo n.º 26
0
    def __init__(
        self,
        work_dir: str = '.',
        clean_work_dir: bool = False,
        organism: str = 'human',
        study_parameters: Union[Dict[str, Union[int, float, str, List, Dict]], None] = None
    ) -> None:
        """Create a GOEngine for running GO enrichment analyses with GOATOOLS.

        Args:
            work_dir (str, optional): Path to an existing directory that the
                ontology and gene2go files are downloaded to. Defaults to the
                current working directory.
            clean_work_dir (bool, optional): Stored on the instance as
                ``_clean_work_dir``. Defaults to False.
            organism (str, optional): Either 'human' or 'mouse'. Defaults to
                'human'.
            study_parameters (dict, optional): Keyword arguments forwarded to
                GOEnrichmentStudyNS. When None, defaults to
                {'propagate_counts': False, 'alpha': 0.05, 'methods': ['fdr_bh']}.

        Raises:
            ValueError: If ``work_dir`` does not exist or ``organism`` is not
                'human' or 'mouse'.
        """
        # A None sentinel replaces the former dict default so that a single
        # dict object is not shared between every call of the constructor.
        if study_parameters is None:
            study_parameters = {
                'propagate_counts': False,
                'alpha': 0.05,
                'methods': ['fdr_bh']
            }
        print("Creating a GO Engine ...")
        if not os.path.exists(work_dir):
            raise ValueError(
                f"The provided work path: {work_dir} does not exist!!!")
        self.work_dir = work_dir
        if organism not in ('human', 'mouse'):
            raise ValueError(
                f"The provided organism: {organism} is not support, current engine mainly work with human and moues only"
            )
        print("\t --> Downloading data ...")
        obo_fname = download_go_basic_obo(
            os.path.join(work_dir, 'go-basic.obo'))
        gene2go_fname = download_ncbi_associations(
            os.path.join(work_dir, 'gene2go'))
        # Parse the GO terms and build the enrichment-study object.
        print(
            "\t --> parsing the data and intializing the base GOEA object...")
        obo_dag = GODag(obo_fname)
        # Only the taxid differs between the two supported organisms.
        taxid = 9606 if organism == 'human' else 10090
        # NOTE(review): the background gene set is gene2iden_human for both
        # organisms, exactly as before; a mouse-specific gene set is probably
        # intended when organism == 'mouse' -- confirm.
        self._goea_obj = GOEnrichmentStudyNS(
            gene2iden_human.keys(),
            Gene2GoReader(gene2go_fname, taxids=[taxid]).get_ns2assc(),
            obo_dag, **study_parameters)
        self._clean_work_dir = clean_work_dir
        self._gene_ids = None
Exemplo n.º 27
0
def _get_pvals(pvalfnc_names, prt=sys.stdout):
    """Return uncorrected p-values for each named p-value calculation.

    Args:
        pvalfnc_names: Iterable of names passed as ``pvalcalc``.
        prt: Stream used for progress output.

    Returns:
        Dict mapping each name to the result of ``_get_pval_uncorr``.
    """
    taxid = 10090  # Mouse study
    obo_dag = GODag(download_go_basic_obo(prt=prt))
    geneids_pop = GeneID2nt_mus.keys()
    assoc_geneid2gos = get_assoc_ncbi_taxids([taxid])
    geneids_study = _get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx", prt)

    fisher2pvals = {}
    for pvalcalc_name in pvalfnc_names:
        study = GOEnrichmentStudy(
            geneids_pop,
            assoc_geneid2gos,
            obo_dag,
            propagate_counts=False,
            alpha=0.05,
            methods=None,
            pvalcalc=pvalcalc_name,
        )
        fisher2pvals[pvalcalc_name] = study._get_pval_uncorr(geneids_study, prt)
    return fisher2pvals
Exemplo n.º 28
0
def test_go_print(prt=sys.stdout):
    """Write reader metadata and every GO record to ``prt``.

    Checks that all GO terms can be printed, even if level/depth are not
    assigned. The ontology is downloaded into the current working directory.
    """
    prt_pypath(prt)
    local_obo = os.path.join(os.getcwd(), "go-basic.obo")
    obo_file = download_go_basic_obo(local_obo, prt=prt, loading_bar=None)
    reader = goatools.obo_parser.OBOReader(obo_file)
    # Consume the reader before reading its version fields.
    records = list(reader)

    prt.write("Python Version: {VER}\n\n".format(VER=sys.version))
    prt.write("\nOBOReader: {OBJ}\n\n".format(OBJ=reader))
    prt.write("format-version: {VER}\n".format(VER=reader.format_version))
    prt.write("data-version: {VER}\n\n".format(VER=reader.data_version))
    prt.write("Found {N} GO Records:\n".format(N=len(records)))

    row = "{I:>7,} {RECORD}\n"
    for num, record in enumerate(records):
        prt.write(row.format(I=num, RECORD=record))
Exemplo n.º 29
0
def get_genes_cell_cycle(taxid=9606, log=sys.stdout):
    """Return the gene IDs associated with 'cell cycle' GO terms for a taxid.

    Terms matching 'cell cycle' are gathered, terms matching
    'cell cycle.independent' are dropped, and children of the remaining terms
    are added. Search output goes to ``cell_cycle_gos_<taxid>.log``.

    Args:
        taxid: NCBI taxonomy ID whose associations are searched.
        log: Stream for a two-line summary, or None to skip it.

    Returns:
        The items returned by ``GoSearch.get_items`` for the final GO set.
    """
    # Download ontologies and annotations, if necessary.
    fin_go_obo = os.path.join(os.getcwd(), "go-basic.obo")
    download_go_basic_obo(fin_go_obo, loading_bar=None)
    # Multi-level dict filled per taxid; contains both GO2IDs and ID2GOs.
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids([taxid], taxid2asscs=taxid2asscs, loading_bar=None)

    # GO-search helper initialised with the obo and the GO -> ID mapping.
    go2ids = taxid2asscs[taxid]['GO2IDs']
    srch = GoSearch(fin_go_obo, go2items=go2ids)

    re_cell_cycle = re.compile(r'cell cycle', flags=re.IGNORECASE)
    # Terms such as GO:0005764 (lysosome) match only through the phrase
    # 'cell cycle-independent' and are filtered out with this pattern.
    re_independent = re.compile(r'cell cycle.independent', flags=re.IGNORECASE)

    fout_allgos = "cell_cycle_gos_{TAXID}.log".format(TAXID=taxid)
    with open(fout_allgos, "w") as prt:
        gos_matched = srch.get_matching_gos(re_cell_cycle, prt=prt)
        gos_rejected = srch.get_matching_gos(re_independent, gos=gos_matched, prt=prt)
        # Remaining cell-cycle terms plus their children.
        gos_all = srch.add_children_gos(gos_matched.difference(gos_rejected))
        if log is not None:
            log.write('    taxid {TAXID:>5}\n'.format(TAXID=taxid))
            log.write('    FOUND {N:>5} GOs:   {F}\n'.format(
                N=len(gos_all), F=fout_allgos))
    # Entrez GeneIDs for the cell cycle GOs.
    return srch.get_items(gos_all)
Exemplo n.º 30
0
def prep_goea(taxid=9606,
              prop_counts=True,
              alpha=0.05,
              method='fdr_bh',
              ref_list=None):
    """Download the GO data and build a GOEnrichmentStudyNS object.

    Args:
        taxid: NCBI taxonomy ID used to select gene2go annotations
            (9606 is Homo sapiens).
        prop_counts: Forwarded as ``propagate_counts``.
        alpha: Significance cut-off forwarded to the study.
        method: Single multiple-test correction method name.
        ref_list: Background gene IDs; when None, the 'GeneID' column of
            ``../data/df_human_geneinfo.csv`` is used.

    Returns:
        Tuple ``(goeaobj, symbol2id)`` where ``symbol2id`` maps upper-cased
        gene symbols to GeneIDs from the same CSV file.
    """
    # goatools is imported locally, as in the rest of this function's design.
    from goatools.base import download_go_basic_obo
    from goatools.base import download_ncbi_associations
    from goatools.obo_parser import GODag
    from goatools.anno.genetogo_reader import Gene2GoReader
    from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS

    # Download the ontology and the associations, then load the ontology.
    download_go_basic_obo()
    fin_gene2go = download_ncbi_associations()
    obodag = GODag("go-basic.obo")

    # Load the annotations for the requested taxid.
    ns2assoc = Gene2GoReader(fin_gene2go, taxids=[taxid]).get_ns2assc()
    summary = "{NS} {N:,} annotated human genes"
    for namespace, geneid2gos in ns2assoc.items():
        print(summary.format(NS=namespace, N=len(geneid2gos)))

    df_genehumans = pd.read_csv('../data/df_human_geneinfo.csv', index_col=0)

    # Without an explicit reference list, fall back to every gene in the CSV.
    if ref_list is None:
        ref_list = df_genehumans['GeneID'].to_list()

    goeaobj = GOEnrichmentStudyNS(
        ref_list,
        ns2assoc,
        obodag,
        propagate_counts=prop_counts,
        alpha=alpha,
        methods=[method],
    )

    # Symbol -> ID translation dictionary (symbols upper-cased).
    symbols = df_genehumans['Symbol'].str.upper()
    symbol2id = dict(zip(symbols, df_genehumans['GeneID']))

    return goeaobj, symbol2id
Exemplo n.º 31
0
def _get_pvals(pvalfnc_names, prt=sys.stdout):
    """Compute uncorrected p-values once per p-value calculation name.

    Args:
        pvalfnc_names: Iterable of names passed as ``pvalcalc``.
        prt: Stream used for progress output.

    Returns:
        Dict mapping each name to the result of ``_get_pval_uncorr``.
    """
    taxid = 10090  # Mouse study
    obo_dag = GODag(download_go_basic_obo(prt=prt))
    geneids_pop = GeneID2nt_mus.keys()
    assoc_geneid2gos = get_assoc_ncbi_taxids([taxid])
    geneids_study = _get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx", prt)

    # Settings shared by every study; only pvalcalc changes per iteration.
    common_kws = {'propagate_counts': False, 'alpha': 0.05, 'methods': None}

    fisher2pvals = {}
    for name in pvalfnc_names:
        goeaobj = GOEnrichmentStudy(
            geneids_pop, assoc_geneid2gos, obo_dag, pvalcalc=name, **common_kws)
        fisher2pvals[name] = goeaobj._get_pval_uncorr(geneids_study, prt)
    return fisher2pvals
Exemplo n.º 32
0
def test_i154_semsim_lin():
    """Load the GO DAG with obsolete terms and check 'consider'/'replaced_by'.

    The DAG is loaded twice: once with both optional attributes given as a
    set, and once with 'consider' given as a plain string.
    """
    fin_dag = download_go_basic_obo()
    tic = timeit.default_timer()

    prt = sys.stdout
    load_obsolete = True

    # Both optional attributes, passed as a set.
    godag = GODag(fin_dag, {'consider', 'replaced_by'}, load_obsolete, prt)
    prt_hms(tic, 'Loaded GO DAG')
    assert godag['GO:0000067'].consider
    assert godag['GO:0003734'].replaced_by == 'GO:0030532'

    # A single optional attribute, passed as a string.
    godag = GODag(fin_dag, 'consider', load_obsolete, prt)
    prt_hms(tic, 'Loaded GO DAG')
    assert godag['GO:0000067'].consider
def dl_files(go_directory):
    """Download the ontology and association files into ``go_directory``.

    The process working directory is changed to ``go_directory`` before the
    downloads are started, and it is not changed back afterwards.

    Args:
        go_directory: directory to switch into before downloading.

    Returns:
        Tuple ``(obo_fname, fin_gene2go)`` with the values returned by
        ``download_go_basic_obo`` and ``download_ncbi_associations``.
    """
    os.chdir(go_directory)

    # Get http://geneontology.org/ontology/go-basic.obo
    obo_fname = download_go_basic_obo()

    # Echo the second line of the obo file (the original comment calls this
    # the "go file version"); nothing is printed if the file is shorter.
    with open(obo_fname) as obo_stream:
        second_line = next(islice(obo_stream, 1, 2), None)
        if second_line is not None:
            print(second_line)

    # gene2go annotation file
    fin_gene2go = download_ncbi_associations()
    return obo_fname, fin_gene2go
Exemplo n.º 34
0
def read_go_basic():
    """Download go-basic.obo via ``download_go_basic_obo`` and load it.

    Returns:
        The ``GODag`` built from the downloaded file.

    Example of the fields on one term of the DAG, as recorded by the
    original author:

        ['name', 'level', 'is_obsolete', 'namespace', 'id', 'depth',
         'parents', 'children', '_parents', 'alt_ids']
        name:        secondary active monocarboxylate transmembrane
                     transporter activity
        level:       5
        is_obsolete: False
        namespace:   molecular_function
        id:          GO:0015355
        depth:       9
        parents:     2 items (more info)
        children:    0 items
        alt_ids:     0 items, GOTerm('GO:0042879'):
    """
    return GODag(download_go_basic_obo())
Exemplo n.º 35
0
def test_obo():
    """Test downloading of Ontology file.

    Downloads the ontology, deletes the local copy, downloads it again, and
    asserts that the file exists afterwards.
    """
    fdnld = download_go_basic_obo()
    # Delete with os.remove rather than shelling out to `rm -f`: it works on
    # every platform and is not confused by spaces or shell metacharacters
    # in the file name.
    if os.path.isfile(fdnld):
        os.remove(fdnld)
    fdnld = download_go_basic_obo()
    assert os.path.isfile(fdnld), "FILE({F}) EXPECTED TO EXIST".format(F=fdnld)
Exemplo n.º 36
0
# Get ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
# Data will be stored in this variable
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import goatools
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.associations import read_ncbi_gene2go
from goatools.test_data.genes_NCBI_10090_ProteinCoding import GeneID2nt as GeneID2nt_mus
from goatools.go_enrichment import GOEnrichmentStudy

# Fetch the ontology and the NCBI gene2go association file, then load them
# from the paths the download helpers hand back instead of repeating the
# file names as literals.
obo_fname = download_go_basic_obo()
gene2go = download_ncbi_associations()
obodag = GODag(obo_fname)
# Restrict the associations to taxid 10090 (mouse)
geneid2gos_mouse = read_ncbi_gene2go(gene2go, taxids=[10090])

geneid2symbol = {}

print("{N:,} annotated mouse genes".format(N=len(geneid2gos_mouse)))
# The result of .keys() has no .head() method; slice a list of the keys to
# preview the first five gene IDs.
print(list(GeneID2nt_mus.keys())[:5])

goeaobj = GOEnrichmentStudy(
    GeneID2nt_mus.keys(),  # List of mouse protein-coding genes
    geneid2gos_mouse,  # geneid/GO associations
    obodag,  # Ontologies
    propagate_counts=False,
    alpha=0.05,  # default significance cut-off
Exemplo n.º 37
0
def get_go_ids(go_ids, species='Homo sapiens'):
    '''
    Fetch all gene symbols associated with a list of gene ontology term IDs.

    The given terms are expanded with their child terms, the gene IDs
    annotated to any of those terms are collected from the NCBI gene2go
    associations, and each gene ID is translated to a symbol through the
    per-taxon ``GENEID2NT`` table shipped in ``goatools.test_data``.
    Gene IDs missing from that table are skipped.

    Parameters
    ----------
    go_ids : str or list of str
        One GO term ID, or a list of them. A single string is wrapped in a
        list.
    species : str, optional
        Key into ``TAXA`` selecting the taxonomy ID.

    Returns
    -------
    list of str
        Symbols of the matching genes, in no particular order.

    Raises
    ------
    AssertionError
        If ``species`` is not a key of ``TAXA``.
    '''
    assert species in TAXA, 'unknown species: {S}'.format(S=species)

    if isinstance(go_ids, str):
        go_ids = [go_ids]

    # The download helpers return the path they were given; keep those
    # values and reuse them below rather than repeating the literals.
    obo_fname = download_go_basic_obo('db/go/go-basic.obo')
    gene2go = download_ncbi_associations('db/go/gene2go')

    taxid = TAXA[species]

    # GeneID -> namedtuple table for this taxon, imported by name at runtime
    module_name = 'goatools.test_data.genes_NCBI_{TAXID}_All'.format(
        TAXID=taxid)
    geneid2nt = importlib.import_module(module_name).GENEID2NT

    # Group the annotated gene IDs by GO term
    gene2go_reader = Gene2GoReader(gene2go, taxids=[taxid])
    go2items = defaultdict(list)
    for assc in gene2go_reader.taxid2asscs[taxid]:
        go2items[assc.GO_ID].append(assc.DB_ID)

    srchhelp = GoSearch(obo_fname, go2items=go2items)

    # Add children GOs
    gos_all = srchhelp.add_children_gos(go_ids)

    # Collect Entrez GeneIDs for the requested GOs and for the expanded set
    gene_ids = set()
    for go_items in (go_ids, gos_all):
        gene_ids.update(srchhelp.get_items(go_items))

    # Translate gene IDs to symbols, dropping IDs absent from the table
    found = (geneid2nt.get(geneid) for geneid in gene_ids)
    return [nt.Symbol for nt in found if nt is not None]
Exemplo n.º 38
0
 def _init_dnld_dag(self):
     """If dag does not exist, download it.

     Checks whether the path stored in ``self.obo`` exists and, when it does
     not, calls ``download_go_basic_obo`` with that path as the destination.
     """
     if not os.path.exists(self.obo):
         # NOTE(review): loading_bar=None presumably turns off the progress
         # display during the download -- confirm against goatools.base.
         download_go_basic_obo(self.obo, loading_bar=None)