def test_find_enrichment():
    """Recreate the run in run.sh and compare results with expected counts."""
    # Inputs (paths are relative to the repository root)
    data_files = ['data/study', 'data/population', 'data/association']
    methods = ['bonferroni', 'sidak', 'holm', 'fdr_bh']
    alpha = 0.05
    fin_obo = os.path.join(REPO, 'go-basic.obo')
    download_go_basic_obo(fin_obo, prt=sys.stdout, loading_bar=None)

    # Stand-in for the namespace of parsed command-line arguments
    args_cls = cx.namedtuple("args_namespc", ("filenames obo "
                                              "pval alpha pvalcalc method no_propagate_counts "
                                              "compare ratio "
                                              "outfile indent min_overlap "))
    args = args_cls(
        filenames=[os.path.join(REPO, name) for name in data_files],
        obo=fin_obo,
        pval=0.05,
        alpha=alpha,
        pvalcalc='fisher',
        method=",".join(methods),
        no_propagate_counts=False,
        compare=False,
        ratio=None,
        outfile=None,
        indent=True,
        min_overlap=0.7)

    # Run the study
    study, pop, assoc = rd_files(args.filenames, args.compare)
    results = get_objgoea(pop, assoc, args).run_study(study)

    # Compare against the expected count for each correction method
    expected_cnts = {'fdr_bh': 17, 'sidak': 5, 'holm': 5, 'bonferroni': 5}
    _chk_results(results, expected_cnts, methods, alpha)
    print("TEST PASSED")
def dnld_ontology(filename):
    """Test downloading of ontologies.

    Removes any existing copy of ``filename``, calls the downloader twice,
    and asserts that the file exists afterwards.
    """
    # Delete through the os module rather than a shell "rm -f" string: the
    # path is never interpreted by a shell and the call is portable.
    if os.path.isfile(filename):
        os.remove(filename)
    download_go_basic_obo(filename, loading_bar=None)
    # Second call is made after the first one has (presumably) created the file
    download_go_basic_obo(filename, loading_bar=None)
    assert os.path.isfile(filename), "FILE({F}) EXPECTED TO EXIST".format(F=filename)
def test_i147_all_taxids():
    """Compare Gene2GoReader associations loaded for all, one, and two taxids."""
    mmu = 10090
    hsa = 9606

    # Fetch go-basic.obo and NCBI's gene2go, then load the ontology
    download_go_basic_obo()
    fin_gene2go = download_ncbi_associations()
    godag = GODag("go-basic.obo")

    # One reader per taxid selection: taxids=True, [mmu], [mmu, hsa]
    anno_all = Gene2GoReader(fin_gene2go, godag=godag, taxids=True)
    anno_mmu = Gene2GoReader(fin_gene2go, godag=godag, taxids=[mmu])
    anno_mmuhsa = Gene2GoReader(fin_gene2go, godag=godag, taxids=[mmu, hsa])

    # Namespace-to-association dicts requested from each reader
    from_all_mmu = _run_get_ns2assc(mmu, anno_all)
    from_mmu_mmu = _run_get_ns2assc(mmu, anno_mmu)
    from_mmuhsa_all = _run_get_ns2assc(True, anno_mmuhsa)
    from_mmuhsa_mmu = _run_get_ns2assc(mmu, anno_mmuhsa)

    # The mouse associations must not depend on which reader supplied them
    for nspc in ('BP', 'MF', 'CC'):
        reference = from_mmu_mmu[nspc]
        assert reference == from_all_mmu[nspc]
        assert reference == from_mmuhsa_mmu[nspc]
    _chk_mmuhsa_all(anno_mmuhsa, anno_all, from_mmuhsa_all)
def test_i147_all_taxids():
    """Work with all taxids using Gene2GoReader."""
    # Get go-basic.obo and NCBI's gene2go
    download_go_basic_obo()
    fin_gene2go = download_ncbi_associations()

    # Load the ontology, then the annotations with taxids=True
    godag = GODag("go-basic.obo")
    reader = Gene2GoReader(fin_gene2go, godag=godag, taxids=True)

    # ns2assoc maps a namespace (BP, MF, CC) to {NCBI GeneID: set of GO IDs}
    ns2assoc = reader.get_ns2assc()
    for nspc in ns2assoc:
        num_genes = len(ns2assoc[nspc])
        print("{NS} {N:,} annotated mouse genes".format(NS=nspc, N=num_genes))
def __GO_enrich__(self):
    """Create the GO enrichment study object and store it in self.goeaobj."""
    go_file = "go-basic.obo"
    if not os.path.exists(go_file):
        download_go_basic_obo()
    # Gene ontologies
    obodag = GODag(go_file)

    # NCBI gene2go annotations restricted to taxid 9606
    fin_gene2go = download_ncbi_associations()
    reader = Gene2GoReader(fin_gene2go, taxids=[9606])
    # namespace (BP, MF, CC) -> {NCBI GeneID: set of GO IDs}
    ns2assoc = reader.get_ns2assc()

    population = GeneID2nt_hum.keys()  # human protein-coding genes
    self.goeaobj = GOEnrichmentStudyNS(
        population,
        ns2assoc,
        obodag,
        propagate_counts=False,
        alpha=0.05,          # significance cut-off
        methods=['fdr_bh'])  # multiple-test correction method
def test_cli():
    """Check the keyword dict WrHierCli builds from command-line arguments.

    For each argument list the parsed ``obj.kws`` must equal the expected
    dict. When the arguments include GO IDs, the hierarchy report is also
    written to a file under REPO and the file is checked for existence.
    """
    # pylint: disable=bad-whitespace
    args_exp = [
        # args                        expected_dict
        # --------                    ---------------------
        ([],                          {'dag': 'go-basic.obo', 'dash_len': 6}),
        (['--dag=go-basic.obo'],      {'dag': 'go-basic.obo', 'dash_len': 6}),
        (['-o rpt.txt'],              {'dag': 'go-basic.obo', 'dash_len': 6, 'o': 'rpt.txt'}),
        (['--max_indent=7'],          {'dag': 'go-basic.obo', 'dash_len': 6, 'max_indent': 7}),
        (['CC', '--concise'],         {'dag': 'go-basic.obo', 'dash_len': 6, 'GO': ['CC'], 'concise': True}),
        (['--no_indent'],             {'dag': 'go-basic.obo', 'dash_len': 6, 'no_indent': True}),
        (['--concise', '--no_indent'],
         {'dag': 'go-basic.obo', 'dash_len': 6, 'concise': True, 'no_indent': True}),
    ]
    download_go_basic_obo('go-basic.obo', loading_bar=None)
    for idx, (args, exp_dict) in enumerate(args_exp):
        print("ARGS={ARGS}".format(ARGS=args))
        print("EXP={EXP}".format(EXP=exp_dict))
        obj = WrHierCli(args)
        print("DCT: {DCT}".format(DCT=obj.kws))
        print("WWWWWWWWWWWWWWWWWWW WrHierCli", obj.kws)
        assert obj.kws == exp_dict, "DCT: ACT({}) != EXP({})".format(
            obj.kws, exp_dict)
        print("")
        # Test writing to a file
        if obj.goids:
            fout_txt = os.path.join(REPO, 'wrhier{N}.txt'.format(N=idx))
            # Start from a clean slate so the existence check below is
            # meaningful. Uses os.remove instead of a shell "rm -f" string,
            # so the path is never parsed by a shell.
            if os.path.isfile(fout_txt):
                os.remove(fout_txt)
            obj.wrtxt_hier(fout_txt)
            assert os.path.exists(fout_txt), 'FILE NOT FOUND({F})'.format(
                F=fout_txt)
def test_deprecatedloc_godagtimed():
    """Smoke-test prt_hms and GoDagTimed (named for their deprecated location)."""
    start = timeit.default_timer()
    prt_hms(start, 'prt_hms TESTED')
    obo_path = os.path.join(REPO, "go-basic.obo")
    download_go_basic_obo(obo_path, loading_bar=None)
    # Constructing the object is the whole check; the result is not inspected
    GoDagTimed(obo_path)
def prepare_data():
    """Download the GO ontology and format the GO-term summary into an associations file."""
    # Fetch the obo file; the returned filename is not used here
    download_go_basic_obo()
    # Convert the GO-term search result file into the associations file
    format_file(trytrip_file='data/GenesByGoTerm_Summary.txt',
                res_file='data/associations.txt')
def wr_subobo(self):
    """Write a subset obo to be used for testing.

    One output obo is written per (name, goids) entry of NAME2GOIDS; the
    output filename comes from ``self.get_obo_name(name)``.
    """
    if not NAME2GOIDS:
        # Nothing to write, so no download is triggered either
        return
    # The source obo path and its download do not depend on the loop
    # variable, so they are done once instead of once per entry.
    fin_obo = os.path.join(REPO, "go-basic.obo")
    download_go_basic_obo(fin_obo, prt=sys.stdout, loading_bar=None)
    for name, goids in NAME2GOIDS.items():
        fout_obo = self.get_obo_name(name)
        # A fresh writer per output file, loading the optional 'relationship' attribute
        obj = WrSubObo(fin_obo, optional_attrs=['relationship'])
        obj.wrobo(fout_obo, goids)
def test_gosubdag_relationships(prt=sys.stdout):
    """Compare GO IDs in GoSubDags built with no, 'part_of' only, and all relationships."""
    goids = {
        "GO:0032501",
        "GO:0044707",  # alt_id: GO:0032501, multicellular organismal process
        "GO:0050874",
        "GO:0007608",  # sensory perception of smell
        "GO:0050911",  # detection of chemical stimulus involved in sensory perception of smell
    }

    # Two GO DAGs: one plain, one loaded with the optional 'relationship' attribute
    fin_obo = os.path.join(REPO, "go-basic.obo")
    download_go_basic_obo(fin_obo, prt, loading_bar=None)
    dag_plain = GODag(fin_obo)
    dag_relat = GODag(fin_obo, optional_attrs=['relationship'])

    print("\nCreate GoSubDag with GO DAG containing no relationships.")
    tic = timeit.default_timer()
    subdag = GoSubDag(goids, dag_plain, relationships=False, prt=prt)
    ids_plain = set(subdag.go2obj)
    tic = _rpt_hms(tic, len(subdag.go2obj))

    print("\nCreate GoSubDag while IGNORING relationships")
    subdag = GoSubDag(goids, dag_relat, relationships=False, prt=prt)
    ids_ignored = set(subdag.go2obj)
    tic = _rpt_hms(tic, len(subdag.go2obj))
    # Ignoring relationships must give the same GO IDs as the plain DAG
    assert ids_plain == ids_ignored

    print("\nCreate GoSubDag while loading only the 'part_of' relationship")
    subdag = GoSubDag(goids, dag_relat, relationships=['part_of'], prt=prt)
    ids_part_of = set(subdag.go2obj)
    tic = _rpt_hms(tic, len(subdag.go2obj))
    # 'part_of' must strictly extend the plain set
    assert ids_plain.intersection(ids_part_of) == ids_plain
    assert len(ids_part_of) > len(ids_plain)

    print("\nCreate GoSubDag while loading all relationships")
    subdag = GoSubDag(goids, dag_relat, relationships=True, prt=prt)
    ids_all = set(subdag.go2obj)
    tic = _rpt_hms(tic, len(subdag.go2obj))
    # All relationships must contain at least the 'part_of' set
    assert ids_part_of.intersection(ids_all) == ids_part_of
    assert len(ids_all) >= len(ids_part_of)
def __init__(self):
    """Load the GO DAG with and without relationships and build a GoSubDag over all GO IDs for each."""
    obo = self.obo
    download_go_basic_obo(obo, sys.stdout, loading_bar=None)
    self.godag_r0 = GODag(obo)
    self.godag_r1 = GODag(obo, optional_attrs=set(['relationship']))
    # Unique ids of every term object in the plain DAG
    unique_ids = set(term.id for term in self.godag_r0.values())
    self.goids = list(unique_ids)

    # GoSubDag without relationships
    tic = timeit.default_timer()
    self.gosubdag_r0 = GoSubDag(self.goids, self.godag_r0, prt=None)
    num_gos = len(self.gosubdag_r0.go2obj)
    num_srcs = len(self.gosubdag_r0.go_sources)
    prt_hms(tic, "GoSubDag r0 {N:4} GOs {S:3} srcs".format(N=num_gos, S=num_srcs))

    # GoSubDag with relationships; timed from the same starting point
    self.gosubdag_r1 = GoSubDag(self.goids, self.godag_r1, prt=None, relationships=True)
    num_gos = len(self.gosubdag_r1.go2obj)
    num_srcs = len(self.gosubdag_r1.go_sources)
    prt_hms(tic, "GoSubDag r1 {N:4} GOs {S:3} srcs".format(N=num_gos, S=num_srcs))
def test_dnlds():
    """Test downloads of ontologies and NCBI associations.

    Each file is requested, deleted, and requested again, and must exist
    after the second request.
    """
    cwd = os.getcwd()

    # Ontologies
    file_obo = os.path.join(cwd, "go-basic.obo")
    download_go_basic_obo(file_obo, loading_bar=None)
    # os.remove instead of a shell "rm -f" string: no shell parsing, portable
    if os.path.isfile(file_obo):
        os.remove(file_obo)
    download_go_basic_obo(file_obo, loading_bar=None)
    assert os.path.isfile(file_obo)

    # Associations from NCBI
    file_assc = os.path.join(cwd, "gene2go")
    download_ncbi_associations(file_assc, loading_bar=None)
    if os.path.isfile(file_assc):
        os.remove(file_assc)
    download_ncbi_associations(file_assc, loading_bar=None)
    assert os.path.isfile(file_assc)
def run_go_enrichment(strain,
                      genes_of_interest,
                      significant=True,
                      cutoff=0.05,
                      use_parent_terms=True):
    """Run a GO enrichment study of genes_of_interest against the strain's gene set.

    Args:
        strain: Strain name; selects ``data/<strain>_all_genes.csv`` as the
            background and, for 'PA14', maps gene ids via map_pa14_genes.
        genes_of_interest: Study genes passed to ``run_study``.
        significant: When exactly ``True``, keep only results whose
            ``p_fdr_bh`` is below ``cutoff``.
        cutoff: Used both as the study's ``alpha`` and as the filter
            threshold.
        use_parent_terms: Passed as ``propagate_counts`` to the study.

    Returns:
        A two-item list ``[enrichment_results, goea_results]``.
    """
    # Load GO term association dictionary.
    # NOTE(review): pickle.load must only be used on trusted files.
    with open(os.path.join('data', 'go_association.pickle'), 'rb') as handle:
        go_association = pickle.load(handle)

    background_genes = get_genes(
        os.path.join('data', strain + '_all_genes.csv'))

    # Load the DAG from the file the download reports, rather than from a
    # separately hard-coded copy of the name.
    obo_go_fname = download_go_basic_obo()
    obo_dag = GODag(obo_go_fname)

    if strain == 'PA14':
        genes_of_interest = map_pa14_genes(genes_of_interest)
        background_genes = map_pa14_genes(background_genes)

    goea_obj = GOEnrichmentStudyNS(background_genes,
                                   go_association,
                                   obo_dag,
                                   propagate_counts=use_parent_terms,
                                   alpha=cutoff,
                                   methods=['fdr_bh'])
    goea_results = goea_obj.run_study(genes_of_interest)

    # Identity check kept on purpose: only the literal True enables filtering
    if significant is True:
        goea_results = [
            result for result in goea_results if result.p_fdr_bh < cutoff
        ]

    enrichment_results = get_enrichment_results(goea_results)
    return [enrichment_results, goea_results]
def get_GO_dag():
    """Return the GODag loaded from GO_PATH, downloading the obo if loading fails."""
    obo_path = GO_PATH.as_posix()
    try:
        return GODag(obo_file=obo_path)
    except Exception:
        # First load failed for any reason: request the download, then retry below
        download_go_basic_obo(obo=obo_path)
    return GODag(obo_file=obo_path)
def download_and_move_go_basic_obo(prt):
    """Make sure go-basic.obo is in the geneinfo_cache directory.

    Args:
        prt: Stream passed to ``download_go_basic_obo`` as ``prt``.

    Returns:
        The cached path, ``'geneinfo_cache/go-basic.obo'``.
    """
    cache_dir = 'geneinfo_cache'
    cached_obo = 'geneinfo_cache/go-basic.obo'
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    if not os.path.exists(cached_obo):
        # Move the file the download reports, not a re-typed copy of its name
        obo_fname = download_go_basic_obo(prt=prt)
        shutil.move(obo_fname, cached_obo)
    return cached_obo
def test_gosearch(log=sys.stdout):
    """Test GoSearch class with and without annotations.

    Args:
        log: Stream passed to GoSearch as ``log``.
    """
    taxids = [9606, 10090]
    # Download ontologies and annotations, if necessary
    fin_go_obo = os.path.join(REPO, "go-basic.obo")
    download_go_basic_obo(fin_go_obo, loading_bar=None)
    # Because get_assoc_ncbi_taxids returns id2gos, we will opt to
    # use the (optional) multi-level dictionary separate associations by taxid
    # taxid2asscs contains both GO2GeneIDs and GeneID2GOs.
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids(taxids, taxid2asscs=taxid2asscs, loading_bar=None)
    # Initialize GO-search helper object with obo and annotations(go2items)
    for taxid in taxids:
        obj = GoSearch(fin_go_obo, go2items=taxid2asscs[taxid]['GO2GeneIDs'], log=log)
        assert len(obj.obo_dag) > 40000
    # No annotations: bind the new object so the assert checks it, not the
    # last object left over from the loop above.
    obj = GoSearch(fin_go_obo, dict(), log=log)
    assert len(obj.obo_dag) > 40000
def test_go_print(prt=sys.stdout):
    """Test that all GO Terms can be printed, even if level/depth are not assigned.

    Args:
        prt: Writable stream that receives the reader summary and one line
            per GO record.
    """
    # prt must be passed by keyword: the first positional parameter of
    # download_go_basic_obo is the obo filename, not the output stream.
    obo_file = download_go_basic_obo(prt=prt)
    reader = goatools.obo_parser.OBOReader(obo_file)
    prt.write("\n{OBJ}\n\n".format(OBJ=reader))
    go_terms = list(reader)
    prt.write("First GO Record: {REC}\n".format(REC=go_terms[0]))
    for idx, go_rec in enumerate(go_terms):
        prt.write("{I:>7,} {RECORD}\n".format(I=idx, RECORD=go_rec))
def test_go_print(prt=sys.stdout):
    """Write every GO record read by OBOReader, even if level/depth are not assigned."""
    reader = goatools.obo_parser.OBOReader(download_go_basic_obo(prt=prt))
    prt.write("\n{OBJ}\n\n".format(OBJ=reader))
    # Materialize the records so the first one can be shown before the full listing
    records = list(reader)
    first = records[0]
    prt.write("First GO Record: {REC}\n".format(REC=first))
    for num, rec in enumerate(records):
        line = "{I:>7,} {RECORD}\n".format(I=num, RECORD=rec)
        prt.write(line)
def test_tcntobj_relationships(prt=sys.stdout):
    """Build TermCounts with and without relationships (e.g. part_of) and check them."""
    # Input files: ontology and human GPAD annotations
    fin_obo = os.path.join(REPO, "go-basic.obo")
    fin_anno = os.path.join(REPO, 'goa_human.gpad')
    download_go_basic_obo(fin_obo, prt, loading_bar=None)
    dnld_annotation(fin_anno)

    # Ontologies: r0 is plain, r1 carries the optional 'relationship' attribute
    dag_r0 = GODag(fin_obo)
    dag_r1 = GODag(fin_obo, optional_attrs=['relationship'])
    # Annotations are read against the plain DAG
    annoobj = GpadReader(fin_anno, godag=dag_r0)

    # One TermCounts per namespace, first without then with relationships
    ns2tcntobj_r0 = {}
    for nspc in NSS:
        ns2tcntobj_r0[nspc] = TermCounts(dag_r0, annoobj.get_id2gos(nspc))
    ns2tcntobj_r1 = {}
    for nspc in NSS:
        ns2tcntobj_r1[nspc] = TermCounts(dag_r1, annoobj.get_id2gos(nspc), RELS)
    _chk_pass_fail(ns2tcntobj_r0, ns2tcntobj_r1)
def load_ontologies_and_associations(self):
    """Download (if needed) and load the GO DAG and the human gene2go associations.

    Returns:
        A tuple ``(obodag, geneid2gos_human)``: the loaded GODag and the
        result of ``read_ncbi_gene2go`` for taxid 9606.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the bare print statement is a SyntaxError on Python 3.
    print("---LOADING ONTOLOGIES AND ASSOCIATIONS---")
    # Check if files exist and download if not
    obo_fname = download_go_basic_obo()
    gene2go = download_ncbi_associations()
    # Load ontologies and associations from the files the downloads report
    obodag = GODag(obo_fname)
    geneid2gos_human = read_ncbi_gene2go(gene2go, taxids=[9606])
    print("{N:,} annotated human genes".format(N=len(geneid2gos_human)))
    return obodag, geneid2gos_human
def test_go_print(prt=sys.stdout):
    """Write the reader's version info and every GO record, even if level/depth are not assigned."""
    reader = goatools.obo_parser.OBOReader(download_go_basic_obo(prt=prt))
    # Read all records before reporting the reader's version attributes
    records = list(reader)
    header = (
        "Python Version: {VER}\n\n".format(VER=sys.version),
        "\nOBOReader: {OBJ}\n\n".format(OBJ=reader),
        "format-version: {VER}\n".format(VER=reader.format_version),
        "data-version: {VER}\n\n".format(VER=reader.data_version),
        "Found {N} GO Records:\n".format(N=len(records)),
    )
    for text in header:
        prt.write(text)
    for num, rec in enumerate(records):
        prt.write("{I:>7,} {RECORD}\n".format(I=num, RECORD=rec))
def __init__(self):
    """Set up GO DAGs (plain and with relationships) and a GoSubDag of all GO IDs for each."""
    download_go_basic_obo(self.obo, sys.stdout, loading_bar=None)
    # r0: no optional attributes; r1: with the 'relationship' attribute
    self.godag_r0 = GODag(self.obo)
    self.godag_r1 = GODag(self.obo, optional_attrs=set(['relationship']))
    self.goids = list(set(goobj.id for goobj in self.godag_r0.values()))

    tic = timeit.default_timer()

    # GoSubDag (plain)
    subdag = GoSubDag(self.goids, self.godag_r0, prt=None)
    self.gosubdag_r0 = subdag
    msg = "GoSubDag r0 {N:4} GOs {S:3} srcs".format(
        N=len(subdag.go2obj), S=len(subdag.go_sources))
    prt_hms(tic, msg)

    # GoSubDag with relationships
    subdag = GoSubDag(self.goids, self.godag_r1, prt=None, relationships=True)
    self.gosubdag_r1 = subdag
    msg = "GoSubDag r1 {N:4} GOs {S:3} srcs".format(
        N=len(subdag.go2obj), S=len(subdag.go_sources))
    prt_hms(tic, msg)
def get_genes_cell_cycle(taxid=9606, log=sys.stdout):
    """Return the items (Entrez GeneIDs) annotated to 'cell cycle' GO terms for a taxid."""
    # Ontology file, downloaded into the current directory if necessary
    fin_go_obo = os.path.join(os.getcwd(), "go-basic.obo")
    download_go_basic_obo(fin_go_obo, loading_bar=None)

    # Multi-level dict filled by get_assoc_ncbi_taxids, keyed first by taxid;
    # holds both GO2GeneIDs and GeneID2GOs
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids([taxid], taxid2asscs=taxid2asscs, loading_bar=None)
    srch = GoSearch(fin_go_obo, go2items=taxid2asscs[taxid]['GO2GeneIDs'])

    # 'cell cycle' matches are wanted; 'cell cycle-independent' matches
    # (e.g. GO:0005764, lysosome) are false positives and get removed
    pat_cycle = re.compile(r'cell cycle', flags=re.IGNORECASE)
    pat_independent = re.compile(r'cell cycle.independent', flags=re.IGNORECASE)

    # Both searches report into this file
    fout_allgos = "cell_cycle_gos_{TAXID}.log".format(TAXID=taxid)
    with open(fout_allgos, "w") as prt:
        gos_cc_all = srch.get_matching_gos(pat_cycle, prt=prt)
        gos_no_cc = srch.get_matching_gos(pat_independent, gos=gos_cc_all, prt=prt)
        # Keep the true matches and add their children GOs
        gos_all = srch.add_children_gos(gos_cc_all.difference(gos_no_cc))
        if log is not None:
            log.write(' taxid {TAXID:>5}\n'.format(TAXID=taxid))
            log.write(' FOUND {N:>5} GOs: {F}\n'.format(N=len(gos_all), F=fout_allgos))
    # Entrez GeneIDs for the cell cycle GOs
    return srch.get_items(gos_all)
def __init__(
    self,
    work_dir: str = '.',
    clean_work_dir: bool = False,
    organism: str = 'human',
    study_parameters: Union[Dict[str, Union[int, float, str, List, Dict]], None] = None
) -> None:
    """Create a GOEngine for GO enrichment analysis using GOATOOLS.

    Args:
        work_dir (str, optional): Path to an existing directory that
            go-basic.obo and gene2go are downloaded to. Defaults to the
            current working directory.
        clean_work_dir (bool, optional): Stored on the instance as
            ``_clean_work_dir``; intended to control removal of data
            written to the work directory. Defaults to False.
        organism (str, optional): Either 'human' or 'mouse'. Defaults to
            'human'.
        study_parameters (dict, optional): Keyword arguments forwarded to
            GOEnrichmentStudyNS. When None, defaults to
            {'propagate_counts': False, 'alpha': 0.05, 'methods': ['fdr_bh']}.

    Raises:
        ValueError: If ``work_dir`` does not exist or ``organism`` is not
            'human' or 'mouse'.
    """
    print("Creating a GO Engine ...")
    if not os.path.exists(work_dir):
        raise ValueError(
            f"The provided work path: {work_dir} does not exist!!!")
    self.work_dir = work_dir
    if organism not in ('human', 'mouse'):
        raise ValueError(
            f"The provided organism: {organism} is not support, current engine mainly work with human and moues only"
        )
    # Build the defaults per call instead of sharing one dict object
    # between all calls through a mutable default argument.
    if study_parameters is None:
        study_parameters = {
            'propagate_counts': False,
            'alpha': 0.05,
            'methods': ['fdr_bh']
        }
    print("\t --> Downloading data ...")
    obo_fname = download_go_basic_obo(
        os.path.join(work_dir, 'go-basic.obo'))
    gene2go_fname = download_ncbi_associations(
        os.path.join(work_dir, 'gene2go'))
    ## parse the GO term
    print(
        "\t --> parsing the data and intializing the base GOEA object...")
    obo_dag = GODag(obo_fname)
    # NCBI taxonomy ids: 9606 for human, 10090 for mouse
    taxid = 9606 if organism == 'human' else 10090
    # NOTE(review): the population is gene2iden_human for both organisms,
    # as before; confirm whether mouse should use a mouse gene set.
    self._goea_obj = GOEnrichmentStudyNS(
        gene2iden_human.keys(),
        Gene2GoReader(gene2go_fname, taxids=[taxid]).get_ns2assc(),
        obo_dag, **study_parameters)
    self._clean_work_dir = clean_work_dir
    self._gene_ids = None
def _get_pvals(pvalfnc_names, prt=sys.stdout):
    """Return {pvalcalc name: uncorrected p-values} computed on the mouse study."""
    mouse_taxid = 10090
    obo_dag = GODag(download_go_basic_obo(prt=prt))
    population = GeneID2nt_mus.keys()
    assoc = get_assoc_ncbi_taxids([mouse_taxid])
    study = _get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx", prt)

    # Settings shared by every study; only pvalcalc varies
    study_kws = dict(propagate_counts=False, alpha=0.05, methods=None)
    fisher2pvals = {}
    for name in pvalfnc_names:
        goea = GOEnrichmentStudy(population, assoc, obo_dag, pvalcalc=name, **study_kws)
        fisher2pvals[name] = goea._get_pval_uncorr(study, prt)
    return fisher2pvals
def test_go_print(prt=sys.stdout):
    """Write version details and every GO record, even if level/depth are not assigned."""
    prt_pypath(prt)
    wanted_obo = os.path.join(os.getcwd(), "go-basic.obo")
    obo_file = download_go_basic_obo(wanted_obo, prt=prt, loading_bar=None)
    reader = goatools.obo_parser.OBOReader(obo_file)
    # Consume the reader first; its version attributes are reported afterwards
    records = list(reader)

    write = prt.write
    write("Python Version: {VER}\n\n".format(VER=sys.version))
    write("\nOBOReader: {OBJ}\n\n".format(OBJ=reader))
    write("format-version: {VER}\n".format(VER=reader.format_version))
    write("data-version: {VER}\n\n".format(VER=reader.data_version))
    write("Found {N} GO Records:\n".format(N=len(records)))
    for num, rec in enumerate(records):
        write("{I:>7,} {RECORD}\n".format(I=num, RECORD=rec))
def get_genes_cell_cycle(taxid=9606, log=sys.stdout):
    """Return the items (Entrez GeneIDs) annotated to 'cell cycle' GO terms for a taxid."""
    # Ontology file, downloaded into the current directory if necessary
    obo_path = os.path.join(os.getcwd(), "go-basic.obo")
    download_go_basic_obo(obo_path, loading_bar=None)

    # Multi-level dict filled by get_assoc_ncbi_taxids, keyed first by taxid;
    # holds both GO2IDs and ID2GOs
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids([taxid], taxid2asscs=taxid2asscs, loading_bar=None)
    go2items = taxid2asscs[taxid]['GO2IDs']
    srch = GoSearch(obo_path, go2items=go2items)

    # Wanted matches, and the 'cell cycle-independent' false positives
    # (e.g. GO:0005764, lysosome) that are subtracted from them
    wanted = re.compile(r'cell cycle', flags=re.IGNORECASE)
    unwanted = re.compile(r'cell cycle.independent', flags=re.IGNORECASE)

    # Both searches report into this file
    fout_allgos = "cell_cycle_gos_{TAXID}.log".format(TAXID=taxid)
    with open(fout_allgos, "w") as prt:
        matched = srch.get_matching_gos(wanted, prt=prt)
        rejected = srch.get_matching_gos(unwanted, gos=matched, prt=prt)
        kept = matched.difference(rejected)
        # Add children GOs of the kept cell cycle GOs
        gos_all = srch.add_children_gos(kept)
        if log is not None:
            log.write(' taxid {TAXID:>5}\n'.format(TAXID=taxid))
            log.write(' FOUND {N:>5} GOs: {F}\n'.format(
                N=len(gos_all), F=fout_allgos))
    # Entrez GeneIDs for the cell cycle GOs
    geneids = srch.get_items(gos_all)
    return geneids
def prep_goea(taxid=9606,
              prop_counts=True,
              alpha=0.05,
              method='fdr_bh',
              ref_list=None):
    """Download and load everything needed for a GO enrichment analysis.

    Args:
        taxid: NCBI taxonomy id used to select annotations from gene2go
            (9606 is Homo sapiens).
        prop_counts: Passed as ``propagate_counts`` to the study.
        alpha: Significance cut-off passed to the study.
        method: Single multiple-test correction method name.
        ref_list: Reference (population) gene ids. When None, the 'GeneID'
            column of ../data/df_human_geneinfo.csv is used.

    Returns:
        A tuple ``(goeaobj, symbol2id)``: the GOEnrichmentStudyNS object and
        a dict mapping upper-cased gene symbols to GeneIDs.
    """
    from goatools.base import download_go_basic_obo
    from goatools.base import download_ncbi_associations
    from goatools.obo_parser import GODag
    from goatools.anno.genetogo_reader import Gene2GoReader
    from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS

    # Download ontology and associations
    obo_fname = download_go_basic_obo()
    fin_gene2go = download_ncbi_associations()
    # Load the ontology from the file the download reports, not from a
    # separately hard-coded copy of its name
    obodag = GODag(obo_fname)

    # Load gene-to-GO annotations for the requested taxid
    objanno = Gene2GoReader(fin_gene2go, taxids=[taxid])
    ns2assoc = objanno.get_ns2assc()
    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=nspc,
                                                        N=len(id2gos)))

    df_genehumans = pd.read_csv('../data/df_human_geneinfo.csv', index_col=0)
    # If no reference list is given, default to every GeneID in the gene-info table
    if ref_list is None:
        ref_list = df_genehumans['GeneID'].to_list()
    goeaobj = GOEnrichmentStudyNS(ref_list,
                                  ns2assoc,
                                  obodag,
                                  propagate_counts=prop_counts,
                                  alpha=alpha,
                                  methods=[method])
    # Symbol -> GeneID translation, keyed by upper-cased symbol
    symbol2id = dict(
        zip(df_genehumans['Symbol'].str.upper(), df_genehumans['GeneID']))
    return goeaobj, symbol2id
def _get_pvals(pvalfnc_names, prt=sys.stdout):
    """Map each p-value calculation name to its uncorrected p-values for the mouse study."""
    taxid = 10090  # Mouse study
    obo_file = download_go_basic_obo(prt=prt)
    obo_dag = GODag(obo_file)
    geneids_pop = GeneID2nt_mus.keys()
    assoc_geneid2gos = get_assoc_ncbi_taxids([taxid])
    geneids_study = _get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx", prt)

    results = {}
    for pvalcalc in pvalfnc_names:
        # A fresh study per calculation method, with no multiple-test correction
        study = GOEnrichmentStudy(
            geneids_pop,
            assoc_geneid2gos,
            obo_dag,
            propagate_counts=False,
            alpha=0.05,
            methods=None,
            pvalcalc=pvalcalc)
        pvals = study._get_pval_uncorr(geneids_study, prt)
        results[pvalcalc] = pvals
    return results
def test_i154_semsim_lin():
    """Check 'consider' and 'replaced_by' on GO terms loaded with load_obsolete set."""
    fin_dag = download_go_basic_obo()
    tic = timeit.default_timer()
    prt = sys.stdout
    load_obsolete = True

    # optional_attrs given as a set of two attribute names
    godag = GODag(fin_dag, {'consider', 'replaced_by'}, load_obsolete, prt)
    prt_hms(tic, 'Loaded GO DAG')
    assert godag['GO:0000067'].consider
    assert godag['GO:0003734'].replaced_by == 'GO:0030532'

    # optional_attrs given as a single attribute name string
    godag = GODag(fin_dag, 'consider', load_obsolete, prt)
    prt_hms(tic, 'Loaded GO DAG')
    assert godag['GO:0000067'].consider
def dl_files(go_directory):
    """Download the ontology and association files into go_directory.

    Changes the process working directory to ``go_directory``, prints the
    second line of the obo file (expected to hold its version), and returns
    ``(obo_fname, fin_gene2go)``.
    """
    # Downloads land in the current directory, so switch to the target first
    os.chdir(go_directory)
    obo_fname = download_go_basic_obo()
    # Pick out the second line only; None if the file is shorter than that
    with open(obo_fname) as fin:
        second_line = next(islice(fin, 1, 2), None)
    if second_line is not None:
        print(second_line)
    # gene2go annotation file
    fin_gene2go = download_ncbi_associations()
    return obo_fname, fin_gene2go
def read_go_basic():
    """Obtain go-basic.obo through download_go_basic_obo and return it as a GODag."""
    # Example of the fields on one term in the DAG:
    #   name: secondary active monocarboxylate transmembrane transporter activity
    #   level: 5
    #   is_obsolete: False
    #   namespace: molecular_function
    #   id: GO:0015355
    #   depth: 9
    #   parents: 2 items
    #   children: 0 items
    #   alt_ids: 0 items
    return GODag(download_go_basic_obo())
def test_obo():
    """Test downloading of Ontology file.

    The file is requested, deleted, and requested again; it must exist
    after the second request.
    """
    fdnld = download_go_basic_obo()
    # os.remove instead of a shell "rm -f" string: no shell parsing, portable
    if os.path.isfile(fdnld):
        os.remove(fdnld)
    fdnld = download_go_basic_obo()
    assert os.path.isfile(fdnld)
# Get ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz # Data will be stored in this variable import os import sys import pandas as pd import numpy as np import matplotlib.pyplot as plt import goatools from goatools.base import download_go_basic_obo from goatools.base import download_ncbi_associations from goatools.obo_parser import GODag from goatools.associations import read_ncbi_gene2go from goatools.test_data.genes_NCBI_10090_ProteinCoding import GeneID2nt as GeneID2nt_mus from goatools.go_enrichment import GOEnrichmentStudy obo_fname = download_go_basic_obo() gene2go = download_ncbi_associations() obodag = GODag("go-basic.obo") geneid2gos_mouse = read_ncbi_gene2go("gene2go", taxids=[10090]) geneid2symbol = {} print("{N:,} annotated mouse genes".format(N=len(geneid2gos_mouse))) print(GeneID2nt_mus.keys().head()) goeaobj = GOEnrichmentStudy( GeneID2nt_mus.keys(), # List of mouse protein-coding genes geneid2gos_mouse, # geneid/GO associations obodag, # Ontologies propagate_counts=False, alpha=0.05, # default significance cut-off
def get_go_ids(go_ids, species='H**o sapiens'):
    '''
    Fetch all gene symbols associated with a list of gene ontology term IDs.

    The symbols cover the given GO IDs and their children GO terms, as
    reported by GoSearch.add_children_gos.

    Parameters
    ----------
    go_ids : str or list of str
        One GO ID or a list of GO IDs.
    species : str, optional
        Key of TAXA selecting the taxonomy id.

    Returns
    -------
    list of str
        Symbols of the genes found in the species' GENEID2NT table.
    '''
    # NOTE(review): the default species looks like a masked 'Homo sapiens';
    # confirm that it matches a key of TAXA.
    assert species in TAXA, 'unknown species: {S}'.format(S=species)
    if isinstance(go_ids, str):
        go_ids = [go_ids]

    # Use the paths the download helpers report instead of re-typing them
    obo_fname = download_go_basic_obo('db/go/go-basic.obo')
    gene2go = download_ncbi_associations('db/go/gene2go')

    # GeneID -> namedtuple table shipped with goatools for this taxid
    taxid = TAXA[species]
    fin_symbols = 'genes_NCBI_{TAXID}_All.py'.format(TAXID=taxid)
    module_name = ''.join(['goatools.test_data.', fin_symbols[:-3]])
    module = importlib.import_module(module_name)
    GeneID2nt = module.GENEID2NT

    # GO ID -> list of gene ids, built from the annotations of this taxid
    annotations = Gene2GoReader(
        gene2go,
        taxids=[taxid],
    )
    go2items = defaultdict(list)
    for assc in annotations.taxid2asscs[taxid]:
        go2items[assc.GO_ID].append(assc.DB_ID)
    srchhelp = GoSearch(obo_fname, go2items=go2items)

    # go.log is created/truncated here although nothing is written to it;
    # kept so the function's file side effect is unchanged.
    with open('go.log', 'w'):
        # Add children GOs
        gos_all = srchhelp.add_children_gos(go_ids)
        # Collect gene ids for the requested GOs and for the expanded set
        gene_ids = set()
        for go_items in (go_ids, gos_all):
            gene_ids.update(srchhelp.get_items(go_items))

    # Translate gene ids to symbols, skipping ids absent from the table
    records = (GeneID2nt.get(geneid) for geneid in gene_ids)
    return [nt.Symbol for nt in records if nt is not None]
def _init_dnld_dag(self):
    """Download the obo file to self.obo unless that path already exists."""
    if os.path.exists(self.obo):
        return
    download_go_basic_obo(self.obo, loading_bar=None)