def test_dnlds():
    """Test downloads of ontologies and NCBI associations.

    Each file is downloaded, removed, and downloaded a second time, so the
    final ``isfile`` assertion cannot be satisfied by a file left over from
    the first call.
    """
    cwd = os.getcwd()

    # Test downloads of ontologies.
    file_obo = os.path.join(cwd, "go-basic.obo")
    download_go_basic_obo(file_obo, loading_bar=None)
    # Remove through the os module instead of shelling out to `rm -f`:
    # it works on every platform, is safe for paths containing spaces, and
    # raises if the file cannot be deleted instead of failing silently.
    if os.path.isfile(file_obo):
        os.remove(file_obo)
    download_go_basic_obo(file_obo, loading_bar=None)
    assert os.path.isfile(file_obo)

    # Test downloading of associations from NCBI.
    file_assc = os.path.join(cwd, "gene2go")
    download_ncbi_associations(file_assc, loading_bar=None)
    if os.path.isfile(file_assc):
        os.remove(file_assc)
    download_ncbi_associations(file_assc, loading_bar=None)
    assert os.path.isfile(file_assc)
def test_i147_all_taxids():
    """Load gene2go annotations for all taxids and print per-namespace counts."""
    # Step 1: make sure the input files are present.
    #   Ontologies:   http://geneontology.org/ontology/go-basic.obo
    #   Associations: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
    download_go_basic_obo()
    gene2go_path = download_ncbi_associations()

    # Step 2: load the ontology DAG, then read NCBI's gene2go for every
    # species (taxids=True).
    ontology = GODag("go-basic.obo")
    reader = Gene2GoReader(gene2go_path, godag=ontology, taxids=True)

    # namespace -> association, where namespace is one of
    #   BP (biological_process), MF (molecular_function),
    #   CC (cellular_component)
    # and each association maps an NCBI GeneID to its set of GO IDs.
    namespace_to_assoc = reader.get_ns2assc()
    for namespace in namespace_to_assoc:
        gene_count = len(namespace_to_assoc[namespace])
        print("{NS} {N:,} annotated mouse genes".format(NS=namespace, N=gene_count))
def __GO_enrich__(self):
    """Create the human GO enrichment study and store it on ``self.goeaobj``."""
    # Fetch the ontology file only when it is not already on disk.
    obo_path = "go-basic.obo"
    if not os.path.exists(obo_path):
        download_go_basic_obo()
    ontology_dag = GODag(obo_path)

    # Read NCBI's gene2go, restricted to taxid 9606 (human).
    gene2go_path = download_ncbi_associations()
    human_annotations = Gene2GoReader(gene2go_path, taxids=[9606])

    # namespace (BP / MF / CC) -> {NCBI GeneID: set of GO IDs}
    namespace_to_assoc = human_annotations.get_ns2assc()

    study_settings = {
        "propagate_counts": False,
        "alpha": 0.05,           # default significance cut-off
        "methods": ['fdr_bh'],   # default multiple-test correction method
    }
    self.goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(),    # human protein-coding genes
        namespace_to_assoc,      # geneID/GO associations
        ontology_dag,            # ontologies
        **study_settings)
def test_i147_all_taxids():
    """Check that mouse associations agree across readers of different scope.

    Three readers are built from the same gene2go file (all taxids, mouse
    only, mouse + human) and the mouse associations extracted from each are
    compared namespace by namespace.
    """
    # Inputs:
    #   http://geneontology.org/ontology/go-basic.obo
    #   ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
    download_go_basic_obo()
    gene2go_path = download_ncbi_associations()
    ontology = GODag("go-basic.obo")

    # One reader per taxid selection.
    reader_all = Gene2GoReader(gene2go_path, godag=ontology, taxids=True)
    reader_mouse = Gene2GoReader(gene2go_path, godag=ontology, taxids=[10090])
    reader_mouse_human = Gene2GoReader(
        gene2go_path, godag=ontology, taxids=[10090, 9606])

    # Associations requested from each reader.
    mouse_from_all = _run_get_ns2assc(10090, reader_all)
    mouse_from_mouse = _run_get_ns2assc(10090, reader_mouse)
    all_from_mouse_human = _run_get_ns2assc(True, reader_mouse_human)
    mouse_from_mouse_human = _run_get_ns2assc(10090, reader_mouse_human)

    # The mouse-only reader is the reference for the other two.
    for namespace in ('BP', 'MF', 'CC'):
        reference = mouse_from_mouse[namespace]
        assert reference == mouse_from_all[namespace]
        assert reference == mouse_from_mouse_human[namespace]
    _chk_mmuhsa_all(reader_mouse_human, reader_all, all_from_mouse_human)
def load_ontologies_and_associations(self):
    """Download (if needed) and load the GO DAG and human gene2go associations.

    Returns:
        tuple: ``(obodag, geneid2gos_human)`` -- the ``GODag`` built from the
        downloaded OBO file, and the result of ``read_ncbi_gene2go`` for
        taxid 9606 (human).
    """
    # print() with a single argument behaves the same on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print("---LOADING ONTOLOGIES AND ASSOCIATIONS---")

    # Check if files exist and download if not
    obo_fname = download_go_basic_obo()
    gene2go = download_ncbi_associations()

    # Load ontologies and associations from the paths the downloaders
    # returned, rather than re-typing the default file name.
    obodag = GODag(obo_fname)
    geneid2gos_human = read_ncbi_gene2go(gene2go, taxids=[9606])
    print("{N:,} annotated human genes".format(N=len(geneid2gos_human)))
    return obodag, geneid2gos_human
def __init__(
        self,
        work_dir: str = '.',
        clean_work_dir: bool = False,
        organism: str = 'human',
        study_parameters: Union[Dict[str, Union[int, float, str, List, Dict]], None] = None
) -> None:
    """Create a GOEngine for running GO enrichment analysis with GOATOOLS.

    Args:
        work_dir (str, optional): Path to an existing directory where the
            ontology and association files are downloaded. Defaults to the
            current working directory.
        clean_work_dir (bool, optional): Stored on the instance as
            ``_clean_work_dir``; defaults to False.
        organism (str, optional): Either 'human' or 'mouse'. Defaults to
            'human'.
        study_parameters (dict, optional): Keyword arguments forwarded to
            ``GOEnrichmentStudyNS``. When omitted, the engine uses
            ``{'propagate_counts': False, 'alpha': 0.05,
            'methods': ['fdr_bh']}``.

    Raises:
        ValueError: If ``work_dir`` does not exist, or ``organism`` is
            neither 'human' nor 'mouse'.
    """
    print("Creating a GO Engine ...")
    if not os.path.exists(work_dir):
        raise ValueError(
            f"The provided work path: {work_dir} does not exist!!!")
    self.work_dir = work_dir
    if organism not in ('human', 'mouse'):
        raise ValueError(
            f"The provided organism: {organism} is not support, current engine mainly work with human and moues only"
        )
    # Build the defaults per call: a dict literal in the signature would be
    # one object shared (and mutable) across every instance.
    if study_parameters is None:
        study_parameters = {
            'propagate_counts': False,
            'alpha': 0.05,
            'methods': ['fdr_bh']
        }
    print(f"\t --> Downloading data ...")
    obo_fname = download_go_basic_obo(
        os.path.join(work_dir, 'go-basic.obo'))
    gene2go_fname = download_ncbi_associations(
        os.path.join(work_dir, 'gene2go'))
    ## parse the GO term
    print(
        f"\t --> parsing the data and intializing the base GOEA object...")
    obo_dag = GODag(obo_fname)
    # NCBI taxonomy ids: 9606 for human, 10090 for mouse.
    taxid = 9606 if organism == 'human' else 10090
    # NOTE(review): the population is `gene2iden_human` for both organisms,
    # exactly as before. For 'mouse' this looks like a copy/paste slip, but
    # no mouse gene table is visible here -- confirm and substitute one.
    self._goea_obj = GOEnrichmentStudyNS(
        gene2iden_human.keys(),
        Gene2GoReader(gene2go_fname, taxids=[taxid]).get_ns2assc(),
        obo_dag, **study_parameters)
    self._clean_work_dir = clean_work_dir
    self._gene_ids = None
    return
def prep_goea(taxid=9606, prop_counts=True, alpha=0.05, method='fdr_bh', ref_list=None):
    """Prepare a GO enrichment study object and a symbol-to-GeneID lookup.

    The ontology and gene2go files are downloaded, associations for `taxid`
    are loaded, and a ``GOEnrichmentStudyNS`` is built over `ref_list`.
    When `ref_list` is None, the ``GeneID`` column of
    ``../data/df_human_geneinfo.csv`` is used as the reference population.

    Returns a ``(goeaobj, symbol2id)`` tuple, where ``symbol2id`` maps
    upper-cased gene symbols from that CSV to their GeneIDs.
    """
    from goatools.anno.genetogo_reader import Gene2GoReader
    from goatools.base import download_go_basic_obo
    from goatools.base import download_ncbi_associations
    from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
    from goatools.obo_parser import GODag

    # Fetch the ontology and association files.
    download_go_basic_obo()
    gene2go_path = download_ncbi_associations()

    # Load the ontology DAG, then the associations for the requested taxid
    # (9606 is the NCBI taxonomy ID for Homo sapiens).
    ontology = GODag("go-basic.obo")
    namespace_to_assoc = Gene2GoReader(gene2go_path, taxids=[taxid]).get_ns2assc()
    for namespace, gene_to_gos in namespace_to_assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=namespace,
                                                        N=len(gene_to_gos)))

    gene_info = pd.read_csv('../data/df_human_geneinfo.csv', index_col=0)

    # Without an explicit reference list, fall back to every GeneID in the
    # gene-info table.
    population = gene_info['GeneID'].to_list() if ref_list is None else ref_list

    study = GOEnrichmentStudyNS(population,
                                namespace_to_assoc,
                                ontology,
                                propagate_counts=prop_counts,
                                alpha=alpha,
                                methods=[method])

    # Symbol -> GeneID translation, keyed on upper-cased symbols.
    symbol_to_id = dict(
        zip(gene_info['Symbol'].str.upper(), gene_info['GeneID']))
    return study, symbol_to_id
def fetch_go_hierarchy():
    """Load the GO DAG and human gene-GO associations and dump their links.

    The OBO file and the NCBI association file are downloaded when they are
    not already present under ``constants.GO_DIR``. For every GO id that has
    human gene associations and exists in the DAG, three kinds of
    tab-separated rows are written to a timestamped file in
    ``constants.OUTPUT_GLOBAL_DIR``:

    * ``genes``   -- (gene id, GO id)
    * ``is a``    -- (child GO id, GO id)
    * ``<rtype>`` -- (related GO id, GO id) for each reverse relationship

    Returns:
        tuple: ``(genes, isa, relship)`` -- lists of the tuples described
        above; ``relship`` entries also carry the relationship type first.
    """
    obo_file_location = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(obo_file_location):
        wget.download(constants.GO_OBO_URL, obo_file_location)
    go = obo_parser.GODag(obo_file_location, optional_attrs=['relationship'])

    # print() with a single argument works on both Python 2 and Python 3.
    print("Downloading gene-GO associations")
    association_file_location = os.path.join(
        constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(association_file_location):
        association_file_location = download_ncbi_associations(
            association_file_location)

    print("Loading gene-GO associations")
    go2geneids_human = read_ncbi_gene2go(
        association_file_location, taxids=[9606], go2geneids=True)

    print("Writing out GO child-parent links")
    if not os.path.exists(constants.OUTPUT_GLOBAL_DIR):
        os.makedirs(constants.OUTPUT_GLOBAL_DIR)

    out_fname = "go_output_{}_{}.txt".format(constants.CANCER_TYPE, time.time())
    genes = []
    isa = []
    relship = []
    with open(os.path.join(constants.OUTPUT_GLOBAL_DIR, out_fname), 'w') as o:
        for goid in go2geneids_human:
            # `in` replaces dict.has_key(), which no longer exists on Python 3.
            if goid not in go:
                print("GO obo file does not contain {}".format(goid))
                continue
            entry = go[goid]

            # NOTE(review): the lookup uses entry.id rather than goid, as in
            # the original; confirm the two always match for ids in this map.
            for gene in go2geneids_human[entry.id]:
                genes.append((str(gene), entry.id))
                o.write("{}\t{}\t{}\n".format("genes", *genes[-1]))

            for c in entry.children:
                isa.append((c.id, entry.id))
                o.write("{}\t{}\t{}\n".format("is a", *isa[-1]))

            for rtype, rs in entry.relationship_rev.items():
                for r in rs:
                    relship.append((rtype, r.id, entry.id))
                    o.write("{}\t{}\t{}\n".format(rtype, *relship[-1][1:]))
    return (genes, isa, relship)
def get_ensembl_ids(go_process_id, biomart_fpath):
    """Return Ensembl ids of the human genes annotated with a GO term.

    Args:
        go_process_id: GO id used as the key into the GO -> Entrez GeneID map
            built from NCBI's gene2go.
        biomart_fpath: Path handed to ``map_entrez_to_ensembl`` to build the
            Entrez -> Ensembl lookup.

    Returns:
        list: One Ensembl id per Entrez GeneID associated with
        `go_process_id`.

    Raises:
        KeyError: If `go_process_id` has no human associations, or an Entrez
            id is missing from the Entrez -> Ensembl lookup.
    """
    entrez_to_ensembl = map_entrez_to_ensembl(biomart_fpath)
    gene2go = download_ncbi_associations()

    # taxids=[9606] means select only human.
    # TODO: ask Marinka if we should use EXP code for evidence
    #       (i.e. pass evidence_set='EXP').
    go_to_entrez_ids_human = read_ncbi_gene2go(gene2go, taxids=[9606],
                                               go2geneids=True)

    # Look up the id the caller passed in; the previous code ignored the
    # parameter and read a module-level GO_PROCESS_ID instead.
    entrez_ids = go_to_entrez_ids_human[go_process_id]
    ensembl_ids = [entrez_to_ensembl[str(ent_id)] for ent_id in entrez_ids]

    print("{N} GO terms associated with human NCBI Entrez GeneIDs".format(
        N=len(go_to_entrez_ids_human)))
    return ensembl_ids
def dl_files(go_directory):
    """Download the ontology and gene2go association files into a directory.

    The process working directory is changed to `go_directory` before the
    downloads start (and is not changed back). The second line of the OBO
    file is printed; the original author describes it as the file version.

    Returns the ``(obo path, gene2go path)`` pair reported by the downloaders.
    """
    # Downloads use default file names, so they land in the new cwd.
    os.chdir(go_directory)

    # Ontology: http://geneontology.org/ontology/go-basic.obo
    ontology_path = download_go_basic_obo()

    # islice(handle, 1, 2) yields at most one item: the file's second line.
    with open(ontology_path) as handle:
        for second_line in islice(handle, 1, 2):
            print(second_line)

    # gene2go annotation file
    associations_path = download_ncbi_associations()
    return ontology_path, associations_path
# Data will be stored in this variable
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import goatools
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.associations import read_ncbi_gene2go
from goatools.test_data.genes_NCBI_10090_ProteinCoding import GeneID2nt as GeneID2nt_mus
from goatools.go_enrichment import GOEnrichmentStudy

# Download the ontology and association files if needed, then load them
# from the paths the downloaders report.
obo_fname = download_go_basic_obo()
gene2go = download_ncbi_associations()
obodag = GODag(obo_fname)

# taxid 10090 selects the mouse annotations.
geneid2gos_mouse = read_ncbi_gene2go(gene2go, taxids=[10090])
geneid2symbol = {}
print("{N:,} annotated mouse genes".format(N=len(geneid2gos_mouse)))

# Preview the first few GeneIDs. A dict view has no pandas-style .head(),
# so the former `.keys().head()` raised AttributeError.
print(list(GeneID2nt_mus)[:5])

goeaobj = GOEnrichmentStudy(
    GeneID2nt_mus.keys(),  # List of mouse protein-coding genes
    geneid2gos_mouse,      # geneid/GO associations
    obodag,                # Ontologies
    propagate_counts=False,
    alpha=0.05,            # default significance cut-off
    methods=['fdr_bh'])    # default multiple-test correction method
def test_NCBI_assc():
    """Test downloading of associations from NCBI.

    The file is downloaded, removed, and downloaded again, so the final
    assertion cannot be satisfied by a file left over from the first call.
    """
    fdnld = download_ncbi_associations()
    # Delete through the os module rather than `rm -f` in a shell: portable,
    # safe for paths with spaces, and a failed delete raises instead of
    # being silently ignored.
    if os.path.isfile(fdnld):
        os.remove(fdnld)
    fdnld = download_ncbi_associations()
    assert os.path.isfile(fdnld)
def plot_go_enrichment(coef_df, auc_vals, pheno_dict, args, mode='abs'):
    """Run a GO enrichment study per row of `coef_df` and save a heatmap.

    For every row whose index label is not a ``RandomType``, a foreground
    gene set is chosen from that row's coefficients according to `mode`, a
    GO enrichment study is run against the background of all usable genes,
    and the log10 FDR p-values of significantly enriched terms are collected.
    The collected values are drawn as a heatmap and written to an SVG file.

    Args:
        coef_df: DataFrame whose column labels are matched against the gene
            symbols in ``GENEID2NT`` and whose rows are iterated as
            ``(mtype, coefs)``. Presumably one row per mutation
            subgrouping -- TODO confirm against the caller.
        auc_vals: Not used in this function body.
        pheno_dict: Not used in this function body.
        args: Object providing ``go_dir``, ``expr_source``, ``cohort``,
            ``gene`` and ``classif`` attributes.
        mode: One of 'abs', 'high', 'low' or 'bayes'; selects how foreground
            genes are picked (see the branch comments below).

    Returns:
        None. Returns early, after printing a message, when no enriched GO
        term was found for any row.

    Raises:
        ValueError: If `mode` is not one of the recognised values (raised
            when the first non-``RandomType`` row is processed).
    """
    # Fetch and load the ontology into args.go_dir.
    obo_fl = os.path.join(args.go_dir, "go-basic.obo")
    download_go_basic_obo(obo_fl)
    obodag = GODag(obo_fl)

    # Fetch and load the gene2go associations for taxid 9606 (human).
    assoc_fl = os.path.join(args.go_dir, "gene2go")
    download_ncbi_associations(assoc_fl)
    objanno = Gene2GoReader(assoc_fl, taxids=[9606])
    ns2assoc = objanno.get_ns2assc()

    # Symbol -> NCBI id lookup; only genes present in both the coefficient
    # columns and this lookup are used, and they form the background set.
    ncbi_map = {info.Symbol: ncbi_id for ncbi_id, info in GENEID2NT.items()}
    use_genes = set(coef_df.columns) & set(ncbi_map)
    bgrd_ids = [ncbi_map[gn] for gn in use_genes]

    goeaobj = GOEnrichmentStudyNS(bgrd_ids, ns2assoc, obodag,
                                  propagate_counts=False, alpha=0.05,
                                  methods=['fdr_bh'])

    plot_dict = dict()
    use_gos = set()  # never read again within this function
    coef_mat = coef_df.loc[:, [gene in use_genes for gene in coef_df.columns]]

    # Columns sharing a level-0 label are collapsed. In 'bayes' mode the
    # per-label mean and std are kept separately and coef_mat itself is left
    # ungrouped; in every other mode coef_mat is replaced by the means.
    if mode == 'bayes':
        coef_means = coef_mat.groupby(level=0, axis=1).mean()
        coef_stds = coef_mat.groupby(level=0, axis=1).std()
    else:
        coef_mat = coef_mat.groupby(level=0, axis=1).mean()

    for mtype, coefs in coef_mat.iterrows():
        if not isinstance(mtype, RandomType):
            # Pick the foreground genes; use_clr is later passed as the
            # `start` of the colour palette.
            if mode == 'abs':
                # genes whose |coef| exceeds the row's 95th percentile
                fgrd_ctf = coefs.abs().quantile(0.95)
                fgrd_genes = coefs.index[coefs.abs() > fgrd_ctf]
                use_clr = 3.17
            elif mode == 'high':
                # genes whose coef exceeds the row's 95th percentile
                fgrd_ctf = coefs.quantile(0.95)
                fgrd_genes = coefs.index[coefs > fgrd_ctf]
                use_clr = 2.03
            elif mode == 'low':
                # genes whose coef is below the row's 5th percentile
                fgrd_ctf = coefs.quantile(0.05)
                fgrd_genes = coefs.index[coefs < fgrd_ctf]
                use_clr = 1.03
            elif mode == 'bayes':
                # genes whose |mean| is larger than their std
                gene_scrs = coef_means.loc[mtype].abs() - coef_stds.loc[mtype]
                fgrd_genes = gene_scrs.index[gene_scrs > 0]
                use_clr = 3.17
            else:
                raise ValueError(
                    "Unrecognized `mode` argument <{}>!".format(mode))

            fgrd_ids = [ncbi_map[gn] for gn in fgrd_genes]
            goea_out = goeaobj.run_study(fgrd_ids, prt=None)

            # Keep only enriched ('e') terms with FDR p-value < 0.05,
            # stored as log10(p) keyed by the term name.
            plot_dict[mtype] = {
                rs.name: np.log10(rs.p_fdr_bh)
                for rs in goea_out
                if rs.enrichment == 'e' and rs.p_fdr_bh < 0.05
            }

    # Rows are GO term names, columns are the processed mtypes.
    plot_df = pd.DataFrame(plot_dict, columns=plot_dict.keys())
    if plot_df.shape[0] == 0:
        print("Could not find any enriched GO terms across {} "
              "subgroupings!".format(plot_df.shape[1]))
        return None

    # Figure size grows with the number of terms (width) and mtypes (height).
    fig, ax = plt.subplots(figsize=(4.7 + plot_df.shape[0] / 2.3,
                                    2 + plot_df.shape[1] / 5.3))

    # With more than two terms, reorder the rows by the leaf order of a
    # centroid-linkage dendrogram over cityblock distances (missing values
    # filled with 0.0). Either way the frame is transposed so that mtypes
    # become rows and GO terms become columns.
    if plot_df.shape[0] > 2:
        plot_df = plot_df.iloc[dendrogram(linkage(distance.pdist(
            plot_df.fillna(0.0), metric='cityblock'),
            method='centroid'), no_plot=True)['leaves']].transpose()
    else:
        plot_df = plot_df.transpose()

    xlabs = [rs_nm for rs_nm in plot_df.columns]
    ylabs = [
        get_fancy_label(tuple(mtype.subtype_iter())[0][1])
        for mtype in plot_df.index
    ]

    # The colour scale is fixed to log10(p) values between -5 and 0.
    pval_cmap = sns.cubehelix_palette(start=use_clr, rot=0, dark=0, light=1,
                                      reverse=True, as_cmap=True)
    sns.heatmap(plot_df, cmap=pval_cmap, vmin=-5, vmax=0,
                linewidths=0.23, linecolor='0.73',
                xticklabels=xlabs, yticklabels=ylabs)

    ax.set_xticklabels(xlabs, size=15, ha='right', rotation=31)
    ax.set_yticklabels(ylabs, size=9, ha='right', rotation=0)
    # Axis limits are widened slightly beyond the heatmap on each side.
    ax.set_xlim((plot_df.shape[1] / -83, plot_df.shape[1] * 1.009))
    ax.set_ylim((plot_df.shape[0] * 1.009, plot_df.shape[0] / -83))

    # `plot_dir` is not defined in this function; presumably a module-level
    # name -- verify.
    plt.savefig(os.path.join(
        plot_dir, '__'.join([args.expr_source, args.cohort]),
        "{}_go-{}-enrichment_{}.svg".format(args.gene, mode, args.classif)),
        bbox_inches='tight', format='svg')
    plt.close()
def get_go_ids(go_ids, species='H**o sapiens'):
    '''
    Collect the gene symbols annotated with the given GO terms or with the
    child terms added by ``GoSearch.add_children_gos``.

    Parameters
    ----------
    go_ids : str or list of str
        One GO term ID, or several.
    species : str, optional
        Key into ``TAXA``, which supplies the taxid.

    Returns
    -------
    list of str
        Symbols of the matching genes that are present in the species'
        ``GENEID2NT`` table.
    '''
    # NOTE(review): the default species string looks like a mangled
    # 'Homo sapiens' -- confirm it matches a key of TAXA.
    assert species in TAXA
    if isinstance(go_ids, str):
        go_ids = [go_ids]

    # Make sure the ontology and association files exist under db/go.
    download_go_basic_obo('db/go/go-basic.obo')
    download_ncbi_associations('db/go/gene2go')

    # Import the GeneID -> namedtuple table for this species.
    taxid = TAXA[species]
    fin_symbols = 'genes_NCBI_{TAXID}_All.py'.format(TAXID=taxid)
    symbol_module = importlib.import_module(
        'goatools.test_data.' + fin_symbols[:-3])
    GeneID2nt = symbol_module.GENEID2NT

    # Group the annotated gene ids by GO id.
    reader = Gene2GoReader('db/go/gene2go', taxids=[taxid])
    go2items = defaultdict(list)
    for annotation in reader.taxid2asscs[taxid]:
        go2items[annotation.GO_ID].append(annotation.DB_ID)

    searcher = GoSearch('db/go/go-basic.obo', go2items=go2items)
    # The log file is created (or truncated) here; nothing in this function
    # writes to the handle.
    with open('go.log', 'w') as log:
        # Extend the requested terms with their children.
        gos_all = searcher.add_children_gos(go_ids)
        # Gene ids for the requested terms, then for the extended set.
        gene_ids = set()
        gene_ids.update(searcher.get_items(go_ids))
        gene_ids.update(searcher.get_items(gos_all))

    # Translate gene ids to symbols, dropping ids missing from the table.
    records = (GeneID2nt.get(geneid, None) for geneid in gene_ids)
    return [record.Symbol for record in records if record is not None]
def pullGOenrichment(inputFile, project):
    """Run a human GO enrichment study on the genes listed in a CSV file.

    Args:
        inputFile: Path to a CSV file whose first column is an NCBI GeneID
            and whose second column is the gene symbol. Rows with fewer than
            two fields, or with an empty GeneID, are skipped.
        project: String inserted into the output file names.

    Side effects:
        Prints annotation and result counts, and writes the significant
        results (``p_fdr_bh < 0.05``) to ``Data/go_enrichment<project>.csv``
        (via ``wr_xlsx``) and ``Data/go_enrichment<project>.txt``.
    """
    import collections as cx

    GeneID2nt_hum = genes_NCBI_9606_ProteinCoding.GENEID2NT
    obo_fname = download_go_basic_obo()
    fin_gene2go = download_ncbi_associations()
    # Load the DAG from the path the downloader returned, not a re-typed name.
    obodag = GODag(obo_fname)

    # Read NCBI's gene2go. Store annotations in a list of namedtuples
    objanno = Gene2GoReader(fin_gene2go, taxids=[9606])

    # Get namespace2association where:
    #    namespace is:
    #        BP: biological_process
    #        MF: molecular_function
    #        CC: cellular_component
    #    association is a dict:
    #        key: NCBI GeneID
    #        value: A set of GO IDs associated with that gene
    ns2assoc = objanno.get_ns2assc()
    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))
    print(len(GeneID2nt_hum))

    goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(),  # List of human protein-coding genes
        ns2assoc,              # geneid/GO associations
        obodag,                # Ontologies
        propagate_counts=False,
        alpha=0.05,            # default significance cut-off
        methods=['fdr_bh'])    # default multiple-test correction method

    geneid2symbol = {}
    # The context manager closes the file; no explicit close() is needed.
    with open(inputFile, 'r') as infile:
        for row in csv.reader(infile):
            # csv.reader yields [] for blank lines; skip anything too short
            # to hold both a GeneID and a symbol instead of raising IndexError.
            if len(row) < 2:
                continue
            geneid, symbol = row[0], row[1]
            if geneid:
                geneid2symbol[int(geneid)] = symbol

    geneids_study = geneid2symbol.keys()
    goea_results_all = goeaobj.run_study(geneids_study)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

    ctr = cx.Counter([r.NS for r in goea_results_sig])
    print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
        TOTAL=len(goea_results_sig),
        BP=ctr['BP'],   # biological_process
        MF=ctr['MF'],   # molecular_function
        CC=ctr['CC']))  # cellular_component

    # NOTE(review): wr_xlsx is given a ".csv" file name; the name is kept
    # because other code may rely on it -- confirm the intended format.
    goeaobj.wr_xlsx("Data/go_enrichment" + project + ".csv", goea_results_sig)
    goeaobj.wr_txt("Data/go_enrichment" + project + ".txt", goea_results_sig)
big_ax.set_ylabel('Density (a.u.)') fig.tight_layout() fig.savefig(os.path.join(outdir, "kde_pc_values.png"), dpi=200) ## Run GO analysis using GOAtools # TODO: if this works well, move to a module from goatools import base import wget obo_fn = os.path.join(LOCAL_DATA_DIR, 'gene_ontology', 'current', 'go-basic.obo') genetogo_fn = os.path.join(LOCAL_DATA_DIR, 'gene_ontology', 'current', 'gene2go') genetoens_fn = os.path.join(LOCAL_DATA_DIR, 'gene_ontology', 'current', 'gene2ensembl.gz') genetoens_url = "ftp://ftp.ncbi.nih.gov/gene/DATA/gene2ensembl.gz" obo_fn = base.download_go_basic_obo(obo_fn) genetogo_fn = base.download_ncbi_associations(genetogo_fn) if not os.path.isfile(genetoens_fn): logger.info("Downloading RefGene-Ensembl converter from %s, saving to %s.", genetoens_url, genetoens_fn) wget.download(genetoens_url, out=genetoens_fn) def ens_to_entrez(ens, genetoens_fn): gene2ens = pd.read_csv(genetoens_fn, header=0, sep='\t') gene2ens = gene2ens.loc[gene2ens['#tax_id'] == 9606] conv_df = gene2ens.loc[gene2ens.Ensembl_gene_identifier.isin(ens), ['GeneID', 'Ensembl_gene_identifier']] # reduce to unique (Entrez ID, Ensembl ID) pairs conv = collections.defaultdict(list) for _, row in conv_df.iterrows(): conv[row['Ensembl_gene_identifier']].append(conv['GeneID']) res = [] for e in ens:
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.test_data.genes_NCBI_10090_ProteinCoding import GENEID2NT as GeneID2nt_mus
from goatools.test_data.genes_NCBI_9606_ProteinCoding import GENEID2NT as GeneID2nt_hum
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS

# Preferred location for the data files: a `data` directory beside this module.
PATH = os.path.dirname(__file__)
go_basic_path = os.path.join(PATH, 'data', 'go-basic.obo')
gene2go_path = os.path.join(PATH, 'data', 'gene2go')
try:
    # Get http://geneontology.org/ontology/go-basic.obo
    go_basic_path = download_go_basic_obo(go_basic_path)
    # Get ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
    gene2go_path = download_ncbi_associations(gene2go_path)
except Exception:
    # If the directory is not writeable for whatever reason, just save to
    # /tmp. `Exception` rather than a bare `except:` so that
    # KeyboardInterrupt and SystemExit still stop the import instead of
    # silently starting a second download.
    go_basic_path = os.path.join('/tmp', 'go-basic.obo')
    gene2go_path = os.path.join('/tmp', 'gene2go')
    go_basic_path = download_go_basic_obo(go_basic_path)
    gene2go_path = download_ncbi_associations(gene2go_path)

obodag = GODag(go_basic_path)

# Read NCBI's gene2go. Store annotations in a list of namedtuples
#objanno = Gene2GoReader(gene2go_path, taxids=[10090])
#ns2assoc = objanno.get_ns2assc()
#symbols_to_ids = {val.Symbol : key for key, val in GeneID2nt_mus.items()}
#ids_to_symbols = {val : key for key, val in symbols_to_ids.items()}
def test_NCBI_assc():
    """Test downloading of associations from NCBI.

    The associations file is fetched, deleted, and fetched a second time;
    the assertion then checks that the second call produced the file.
    """
    fdnld = download_ncbi_associations()
    # os.remove replaces the former `rm -f` shell command: no shell is
    # spawned, unusual characters in the path are harmless, and it works on
    # platforms without `rm`.
    if os.path.isfile(fdnld):
        os.remove(fdnld)
    fdnld = download_ncbi_associations()
    assert os.path.isfile(fdnld)