def run(study, pop, assoc, alpha=0.05, p_value=0.05, compare=False, ratio=None,
        obo='go-basic.obo', no_propagate_counts=False,
        method='bonferroni,sidak,holm', pvalcalc='fisher'):
    '''
    This is the wrapper of the Goatools function.

    :param study: a list of study genes, or (together with ``pop``) a file path
    :param pop: a list of population genes, or (together with ``study``) a file path
    :param assoc: the association from the gene to the go term; a plain dict is
        written to disk and re-read with ``read_associations``, a defaultdict
        is used as is, anything else is handed to ``read_associations``
    :param alpha: forwarded to ``GOEnrichmentStudy`` and to the returned ``GO``
    :param p_value: forwarded to the returned ``GO`` object
    :param compare: forwarded to ``GO._read_geneset`` and to ``GO``
    :param ratio: forwarded to the returned ``GO`` object
    :param obo: path of the ontology file; the default value is redirected to
        ``obo/go.obo`` next to this module and downloaded when missing
    :param no_propagate_counts: when true, ``propagate_counts=False`` is used
    :param method: comma separated names of multiple-test corrections
    :param pvalcalc: forwarded to ``GOEnrichmentStudy``
    :return: a ``GO`` object wrapping the result table and the inputs
    '''
    if isinstance(study, str) and isinstance(pop, str):
        # load the study and pop from the file
        study, pop = GO._read_geneset(study, pop, compare=compare)
    else:
        # convert to the set
        study = frozenset(study)
        pop = set(pop)

    methods = method.split(",")

    if obo == 'go-basic.obo':
        obo = os.path.dirname(os.path.realpath(__file__)) + "/obo/go.obo"
    if not os.path.exists(obo):
        print("obo file not found, start to download")
        # The target directory may not exist yet on a fresh checkout.
        obo_dir = os.path.dirname(obo)
        if obo_dir:
            os.makedirs(obo_dir, exist_ok=True)
        wget.download('http://purl.obolibrary.org/obo/go/go-basic.obo', obo)
    obo_dag = GODag(obo)
    propagate_counts = not no_propagate_counts

    # defaultdict is a dict subclass, so it has to be tested first to keep
    # the pass-through behaviour for already-loaded associations.
    if isinstance(assoc, defaultdict):
        pass
    elif isinstance(assoc, dict):
        lines = [
            "{}\t{}\n".format(gene, ";".join([str(term) for term in terms if term]))
            for gene, terms in assoc.items()
            if terms
        ]
        path = os.path.dirname(os.path.realpath(__file__)) + "/assoc"
        with open(path, 'w') as fp:
            fp.write("".join(lines))
        assoc = read_associations(path)
    else:
        # if from a file
        assoc = read_associations(assoc)

    g = GOEnrichmentStudy(pop, assoc, obo_dag, propagate_counts=propagate_counts,
                          alpha=alpha, pvalcalc=pvalcalc, methods=methods)
    results = g.run_study(study)
    # g.print_summary(results, min_ratio=ratio, indent=False, pval=p_value)

    # NOTE(review): the header names the bonferroni/sidak/holm columns even
    # when another ``method`` string is passed - confirm against callers.
    header = ('GO\tNS\tenrichment\tname\tratio_in_study\tratio_in_pop\t'
              'p_uncorrected\tdepth\tstudy_count\tp_bonferroni\tp_sidak\t'
              'p_holm\thit\n')
    table_text = header + "".join(str(record) + "\n" for record in results)
    tb = pd.read_table(StringIO(table_text))
    return GO(tb, study, pop, assoc, alpha, p_value, compare, ratio, obo,
              no_propagate_counts, method, pvalcalc, obo_dag)
def __init__(self, go_obo_path='data/go.obo'):
    """Load the ontology and ORF annotations and build the enrichment object.

    Args:
        go_obo_path: path of the OBO file handed to ``GODag``.

    Sets ``obodag``, ``orfs_with_go``, ``assoc``, ``methods``, ``devnull``
    and ``goeaobj`` on the instance.
    """
    import os  # local import: only needed for the portable null-device path

    canonical_orfs = paper_orfs
    self.obodag = GODag(go_obo_path)

    # read genes containing GO Ontology annotations
    orfs_with_go = read_sgd_orfs()

    # only use canonical orfs dataset
    self.orfs_with_go = orfs_with_go.join(canonical_orfs[[]], how='inner')

    # create mapping of gene names to set of GO annotations
    assoc = defaultdict(set)
    for _, gene in self.orfs_with_go.iterrows():
        assoc[gene['name']] = set(gene.ontology.split(','))

    self.assoc = assoc
    self.methods = ['fdr_bh', 'bonferroni']
    # os.devnull instead of a hard-coded '/dev/null' so this also works on
    # platforms without that path. The handle is kept open on the instance
    # because it is passed as the study's ``log`` target.
    self.devnull = open(os.devnull, 'w')

    # create GO enrichment object to run GO
    self.goeaobj = GOEnrichmentStudy(
        assoc.keys(),  # List of protein-coding genes
        assoc,  # geneid/GO associations
        self.obodag,  # Ontologies
        propagate_counts=False,
        alpha=0.05,  # default significance cut-off
        methods=self.methods,
        log=self.devnull)
def test_i122():
    """Test to re-produce issue#122: Passes currently."""
    runner = _Run(9606, 'gene2go', 'go-basic.obo')
    study_ids, population_ids = runner.get_genes_study_n_bg()

    # Result is the same whether fisher_scipy_stats of fisher
    goeaobj = GOEnrichmentStudy(population_ids, runner.gene2go, runner.godag,
                                methods=['bonferroni', 'fdr_bh'],
                                pvalcalc='fisher_scipy_stats')

    # Run GOEA Gene Ontology Enrichment Analysis
    records = goeaobj.run_study_nts(study_ids)
    print('NS GO p stu_ratio pop_ratio p-uncorr bonferro fdr_bh stu ')

    # NOTE(review): nesting was reconstructed from a whitespace-collapsed
    # source; the significance checks are assumed to apply only to records
    # whose study count is zero - confirm.
    for rec in records:
        if rec.study_count != 0:
            continue
        significant = False
        if rec.p_bonferroni < 0.05:
            assert rec.enrichment == 'p'
            significant = True
        if rec.p_fdr_bh < 0.05:
            assert rec.enrichment == 'p'
            significant = True
        if significant:
            print(runner.str_nt(rec))
def perform_gene_enrichment_analysis(self, metagene_matrix, method='fdr'):
    """Run a GO enrichment study per component and cache the merged result.

    Args:
        metagene_matrix: matrix whose second dimension (``shape[1]``) is the
            number of components.
        method: name of the multiple-test correction passed to
            ``GOEnrichmentStudy`` as ``methods=[method]``.

    Writes ``<cache_dir><prefix>_gea_all.tsv`` (tab separated). Returns None.
    """
    # Load the Gene Ontology
    n_comps = metagene_matrix.shape[1]
    self.download_and_cache_resources()  # Download ontology and annotations, if necessary
    gene_ontology = obo_parser.GODag('../DownloadedResources/go-basic.obo')

    # Build associations from the human annotations in the gaf file.
    # Every GO_ID seen for a symbol is accumulated; the previous version kept
    # only the last gaf entry per symbol and so dropped all other annotations.
    associations = {}
    with gzip.open('../DownloadedResources/goa_human.gaf.gz', 'rt') as gaf:
        for entry in GOA.gafiterator(gaf):
            symbol = entry['DB_Object_Symbol']
            associations.setdefault(symbol, set()).add(str(entry['GO_ID']))

    # Our population is the set of genes we are analysing
    population = self.gene_symbols()
    print("We have %d genes in our population" % len(population))

    gea = GOEnrichmentStudy(population, associations, gene_ontology,
                            propagate_counts=True, alpha=0.05, methods=[method])

    gea_results_by_component = {}
    rankings = self.ranked_genes_by_component(metagene_matrix)
    for ci in range(n_comps):
        study_genes = rankings[ci]
        print('\nComp. %d: %s...' % (ci, str(study_genes[:10])))
        gea_results_by_component[ci] = gea.run_study(study_genes)

    # Get results into a dataframe per component. Easiest way is to use routine to
    # write a .tsv file, then read back and filter
    gea_results_df_by_component = []
    for ci in range(n_comps):
        ge_df = self._perform_gene_enrichment_analysis_one_component(
            ci, gea_results_by_component, gea)
        if ge_df is not None:
            gea_results_df_by_component.append(ge_df)

    # Merge the per-component dataframes into a single one.
    # DataFrame.append was removed in pandas 2.0; pd.concat raises on an
    # empty list, so fall back to an empty frame in that case.
    if gea_results_df_by_component:
        gea_all_sig_results_df = pd.concat(gea_results_df_by_component)
    else:
        gea_all_sig_results_df = pd.DataFrame()

    gea_all_sig_results_df.to_csv(self.cache_dir + '%s_gea_all.tsv' % self.prefix, sep='\t')
class GoEnrich():
    """GO enrichment helper bound to the yeast GO-slim reference data."""

    # Gene set used by ``measure_enrichment`` when the caller supplies none.
    _DEFAULT_GENE_SET = ('YML106W', 'YKL135C', 'YDR516C', 'YLR420W', 'YNL111C',
                         'YHR007C', 'YLR014C', 'YKL216W', 'YNL078W', 'YJR005W',
                         'YJL130C')

    def __init__(self):
        """Load ontology, background genes and associations; build the study."""
        obodag = GODag("../Data/evaluation_reference/goslim_yeast.obo")
        with open('../Data/evaluation_reference/gene_list.txt') as handle:
            background = [line.strip() for line in handle]
        geneid2gos_yeast = read_associations('../Data/evaluation_reference/geneid2gos_yeast.txt')
        # The keyword was previously misspelled ``propogate_counts``, so the
        # intended ``False`` was not passed under the documented name.
        self.goeaobj = GOEnrichmentStudy(
            background,
            geneid2gos_yeast,
            obodag,
            propagate_counts=False,
            alpha=0.05,
            methods=['fdr_bh'])

    def measure_enrichment(self, gene_set=None, run_name='base', cluster_id=1):
        """Run the study on ``gene_set`` and write all results to a text file.

        Args:
            gene_set: iterable of gene ids; when None the built-in default
                gene set is used. (The argument used to be ignored.)
            run_name: first part of the output file name.
            cluster_id: second part of the output file name.

        Writes ``../Results/<run_name>_<cluster_id>.txt``. Returns None.
        """
        gene_ids = list(self._DEFAULT_GENE_SET) if gene_set is None else gene_set
        goea_results_all = self.goeaobj.run_study(gene_ids)
        # we can get significant only
        # goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]
        self.goeaobj.wr_txt("../Results/" + run_name + "_" + str(cluster_id) + ".txt",
                            goea_results_all)
def __init__(self, dir, params):
    """Set up logging, then load associations, network, weights and GO study.

    Args:
        dir: experiment directory, forwarded to the parent constructor.
        params: experiment parameters; ``model_path`` is read from it here,
            the other keys are read through ``self.params``.
    """
    super().__init__(dir, params)

    log_file = os.path.join(self.dir, 'experiment.log')
    set_logger(log_file, level=logging.INFO, console=True)

    logging.info("Loading disease associations...")
    self.diseases_dict = load_diseases(
        self.params["associations_path"],
        self.params["disease_subset"],
        exclude_splits=['none'],
    )

    logging.info("Loading network...")
    self.network = Network(self.params["ppi_network"])
    degree_by_node = dict(self.network.nx.degree())
    self.degrees = np.array(list(degree_by_node.values()))

    logging.info("Loading weights...")
    weights_file = os.path.join(params["model_path"], "models", "models.tar")
    # NOTE(review): pickle.load executes arbitrary code from the file;
    # only use with trusted model files.
    with open(weights_file, "rb") as f:
        split_to_model = pickle.load(f)

    per_split_weights = [
        model['ci_weight'][0, 0].numpy() for model in split_to_model.values()
    ]
    self.ci_weights = np.mean(per_split_weights, axis=0)
    self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

    logging.info("Loading enrichment study...")
    geneid2go = read_ncbi_gene2go("data/go/gene2go.txt", taxids=[9606])
    obodag = GODag("data/go/go-basic.obo")
    self.go_study = GOEnrichmentStudy(
        self.network.get_names(),
        geneid2go,
        obodag,
        propagate_counts=True,
        alpha=0.05,
        methods=['fdr_bh'],
    )
def check_group_enrichment(tested_gene_file_name, total_gene_file_name):
    """Run a GO enrichment of the tested genes against the total gene list.

    Downloads the ontology and association files into ``constants.GO_DIR``
    when they are missing, keeps the terms with ``p_fdr_bh <= 0.05`` and
    passes one summary row to ``print_to_excel``.
    """
    total_gene_list = load_gene_list(total_gene_file_name)
    tested_gene = load_gene_list(tested_gene_file_name)

    obo_path = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(obo_path):
        download(constants.GO_OBO_URL, constants.GO_DIR)
    obo_dag = GODag(obo_path)

    assoc_path = os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(assoc_path):
        download(constants.GO_ASSOCIATION_GENE2GEO_URL, constants.GO_DIR)
        # The download is gzip-compressed; unpack it next to the archive.
        archive_path = os.path.join(
            constants.GO_DIR,
            os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL))
        with gzip.open(archive_path, 'rb') as f_in, open(assoc_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    assoc = read_ncbi_gene2go(assoc_path, no_top=True)

    population_ids = [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)]
    g = GOEnrichmentStudy(population_ids, assoc, obo_dag,
                          methods=["bonferroni", "fdr_bh"])
    study_ids = [int(cur) for cur in ensembl2entrez_convertor(tested_gene)]
    g_res = g.run_study(study_ids)

    significant = [
        (cur.NS, cur.GO, cur.goterm.name, cur.p_uncorrected, cur.p_fdr_bh)
        for cur in g_res
        if cur.p_fdr_bh <= 0.05
    ]
    if significant:
        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = zip(*significant)
    else:
        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = [], [], [], [], []

    summary_row = (
        "\r\n".join(e2g_convertor(tested_gene)),
        "\r\n".join(go_ns),
        "\r\n".join(go_terms),
        "\r\n".join(go_names),
        "\r\n".join(map(str, uncorrectd_pvals)),
        "\r\n".join(map(str, FDRs)),
    )
    print_to_excel([summary_row], tested_gene_file_name, total_gene_file_name)
def test_goea():
    """Test GOEA with method, fdr."""
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Context managers close the id files; they used to be left open.
    with open("../data/population") as handle:
        popul_ids = [line.rstrip() for line in handle]
    with open("../data/study") as handle:
        study_ids = [line.rstrip() for line in handle]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=['fdr'])
    goea_results = goeaobj.run_study(study_ids)
    goeaobj.print_summary(goea_results)
def get_goea_results(method="fdr_bh"):
    """Get GOEA results.

    Args:
        method: name of the multiple-test correction, used both when building
            the study object and when running it.

    Returns:
        Whatever ``GOEnrichmentStudy.run_study`` returns for the small study.
    """
    root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    obo_dag = GODag(os.path.join(root_dir, "goslim_generic.obo"))
    assoc = read_associations(os.path.join(root_dir, "slim_association"), no_top=True)
    # Context managers close the id files; they used to be left open.
    with open(os.path.join(root_dir, "small_population")) as handle:
        popul_ids = [line.rstrip() for line in handle]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=[method])
    with open(os.path.join(root_dir, "small_study")) as handle:
        study_ids = [line.rstrip() for line in handle]
    return goeaobj.run_study(study_ids, methods=[method])
def test_i96():
    """Test to re-produce issue#96: Passes currently."""
    # Trying to duplicate: ValueError("All values in table must be nonnegative.

    # Genes: study set and population
    study = _get_geneids()
    population = GeneID2nt.keys()

    # Databases: associations for taxid 9606 and the ontology DAG
    associations = get_assoc_ncbi_taxids([9606], loading_bar=None)
    dag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)

    # Run GOEA Gene Ontology Enrichment Analysis; the test only checks
    # that the run completes without raising.
    study_obj = GOEnrichmentStudy(population, associations, dag, methods=['fdr_bh'])
    study_obj.run_study(study)
def _get_pvals(pvalfnc_names, prt=sys.stdout):
    """Return uncorrected p-values keyed by p-value function name.

    Args:
        pvalfnc_names: iterable of names passed as ``pvalcalc``.
        prt: stream forwarded to the download, the study-id reader and the
            p-value calculation.
    """
    mouse_taxid = 10090  # Mouse study
    dag = GODag(download_go_basic_obo(prt=prt))
    population = GeneID2nt_mus.keys()
    gene2gos = get_assoc_ncbi_taxids([mouse_taxid])
    study = _get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx", prt)

    def _pvals_for(name):
        """Build a study using ``name`` as pvalcalc and compute its p-values."""
        study_obj = GOEnrichmentStudy(
            population,
            gene2gos,
            dag,
            propagate_counts=False,
            alpha=0.05,
            methods=None,
            pvalcalc=name,
        )
        return study_obj._get_pval_uncorr(study, prt)

    return {name: _pvals_for(name) for name in pvalfnc_names}
def run_bonferroni(log):
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest. Print results 3 ways.

    Args:
        log: not used by this function; kept for interface compatibility.

    Returns:
        Tuple ``(results_nt, goea)``: the ``run_study`` result and the
        ``GOEnrichmentStudy`` object.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Context managers close the id files; they used to be left open.
    with open("../data/population") as handle:
        popul_ids = [line.rstrip() for line in handle]
    with open("../data/study") as handle:
        study_ids = [line.rstrip() for line in handle]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, obo_dag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
def run_bonferroni():
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest. Print results 3 ways.

    Returns:
        Tuple ``(results_nt, goea)``: the ``run_study`` result and the
        ``GOEnrichmentStudy`` object.
    """

    def _read_ids(path):
        """Return the right-stripped lines of ``path``, closing the file."""
        with open(path) as handle:
            return [line.rstrip() for line in handle]

    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    assoc = read_associations(os.path.join(REPO, "data/association"), no_top=True)
    popul_ids = _read_ids(os.path.join(REPO, "data/population"))
    study_ids = _read_ids(os.path.join(REPO, "data/study"))
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, godag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
def _get_results(godag, propagate_counts, relationships, prt=sys.stdout):
    """Run a GOEA. Return results"""
    population = set(GeneID2nt_mus.keys())
    # 10090: Mouse study
    gene2gos = get_assoc_ncbi_taxids([10090], loading_bar=None)
    study = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")
    study_obj = GOEnrichmentStudy(
        population,
        gene2gos,
        godag,
        propagate_counts=propagate_counts,
        relationships=relationships,
        alpha=0.05,
        methods=['fdr_bh'],
    )
    results = study_obj.run_study(study, prt=prt)
    return results
def _get_results(godag, propagate_counts, relationships, prt=sys.stdout):
    """Run a GOEA. Return results"""
    mouse_taxid = 10090  # Mouse study
    pop_ids = set(GeneID2nt_mus.keys())
    id2gos = get_assoc_ncbi_taxids([mouse_taxid], loading_bar=None)
    study_ids = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")
    # Keyword options of the enrichment study, gathered in one place.
    options = {
        'propagate_counts': propagate_counts,
        'relationships': relationships,
        'alpha': 0.05,
        'methods': ['fdr_bh'],
    }
    return GOEnrichmentStudy(pop_ids, id2gos, godag, **options).run_study(study_ids, prt=prt)
def run_bonferroni():
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest. Print results 3 ways.

    Returns:
        Tuple ``(results_nt, goea)``: the ``run_study`` result and the
        ``GOEnrichmentStudy`` object.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    fin_assc = os.path.join(REPO, "data/association")
    assoc = read_associations(fin_assc, 'id2gos', no_top=True)
    # Context managers close the id files; they used to be left open.
    with open(os.path.join(REPO, "data/population")) as fin_pop:
        popul_ids = [line.rstrip() for line in fin_pop]
    with open(os.path.join(REPO, "data/study")) as fin_study:
        study_ids = [line.rstrip() for line in fin_study]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, godag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
def go_enrichment_study(self):
    """Return the enrichment study, building and caching it on first use.

    The study is stored in ``self._go_enrichment_study``; later calls return
    the cached object without re-reading the annotation file.
    """
    if self._go_enrichment_study is None:
        # Build associations from the human annotations in the gaf file.
        # Every GO_ID seen for a symbol is accumulated; the previous version
        # kept only the last gaf entry per symbol and so dropped all other
        # annotations of that gene.
        associations = {}
        with gzip.open('../DownloadedResources/goa_human.gaf.gz', 'rt') as gaf:
            for entry in GOA.gafiterator(gaf):
                symbol = entry['DB_Object_Symbol']
                associations.setdefault(symbol, set()).add(str(entry['GO_ID']))

        # Our population is the set of genes we are analysing
        population = self.gene_symbols()
        print("We have %d genes in our population" % len(population))

        self._go_enrichment_study = GOEnrichmentStudy(
            population,
            associations,
            self._gene_ontology,
            propagate_counts=True,
            alpha=0.01,
            methods=[self.method])
    return self._go_enrichment_study
def get_goeaobj(methods=None):
    """Return a GOEnrichmentStudy built from the ``../data`` test files.

    Args:
        methods: forwarded unchanged as the ``methods`` keyword.
    """
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Context manager closes the population file; it used to be left open.
    with open("../data/population") as handle:
        popul_ids = [line.rstrip() for line in handle]
    return GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
def get_goeaobj(methods=None):
    """Return a GOEnrichmentStudy built from the GO-slim files under ROOT.

    Args:
        methods: forwarded unchanged as the ``methods`` keyword.
    """
    obo_dag = GODag(ROOT + "goslim_generic.obo")
    assoc = read_associations(ROOT + "slim_association", no_top=True)
    # Context manager closes the population file; it used to be left open.
    with open(ROOT + "small_population") as fin_pop:
        popul_ids = [line.rstrip() for line in fin_pop]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
def _get_pvals(pvalfnc_names, prt=sys.stdout):
    """Map each p-value function name to the uncorrected p-values it yields.

    Args:
        pvalfnc_names: iterable of names passed as ``pvalcalc``.
        prt: stream forwarded to the helpers that accept one.
    """
    obo_dag = GODag(download_go_basic_obo(prt=prt))
    pop_ids = GeneID2nt_mus.keys()
    id2gos = get_assoc_ncbi_taxids([10090])  # 10090: Mouse study
    study_ids = _get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx", prt)

    # Options shared by every study; only ``pvalcalc`` varies per iteration.
    shared = {'propagate_counts': False, 'alpha': 0.05, 'methods': None}

    name2pvals = {}
    for pvalcalc in pvalfnc_names:
        study_obj = GOEnrichmentStudy(pop_ids, id2gos, obo_dag,
                                      pvalcalc=pvalcalc, **shared)
        name2pvals[pvalcalc] = study_obj._get_pval_uncorr(study_ids, prt)
    return name2pvals
def __init__(self, dir, params):
    """
    Constructor

    Args:
        dir (string) directory of the experiment to be run
        params: experiment parameters, forwarded to the parent constructor
            and read back through ``self.params``
    """
    super().__init__(dir, params)

    # Set the logger
    log_file = os.path.join(self.dir, 'experiment.log')
    set_logger(log_file, level=logging.INFO, console=True)

    # Log title
    for banner_line in ("Disease Protein Prediction",
                        "Sabri Eyuboglu -- SNAP Group",
                        "======================================"):
        logging.info(banner_line)

    logging.info("Loading Disease Associations...")
    self.diseases_dict = load_diseases(
        self.params["associations_path"],
        self.params["disease_subset"],
        exclude_splits=['none'],
    )

    logging.info("Loading Network...")
    self.network = Network(self.params["ppi_network"])

    logging.info("Loading enrichment study...")
    obodag = GODag(self.params["go_path"])
    geneid2go = read_ncbi_gene2go(self.params["gene_to_go_path"], taxids=[9606])
    self.enrichment_study = GOEnrichmentStudy(
        self.network.get_names(),
        geneid2go,
        obodag,
        log=None,
        **self.params["enrichment_params"]
    )

    logging.info("Loading predictions...")
    method_to_preds = {}
    for name, preds in self.params["method_to_preds"].items():
        csv_path = os.path.join(preds, "predictions.csv")
        method_to_preds[name] = pd.read_csv(csv_path, index_col=0)
    self.method_to_preds = method_to_preds

    # Reuse previously stored outputs when present, else start empty.
    outputs_path = os.path.join(self.dir, "outputs.pkl")
    if not os.path.exists(outputs_path):
        self.outputs = {}
    else:
        logging.info("Loading outputs...")
        with open(outputs_path, 'rb') as f:
            self.outputs = pickle.load(f)
def _get_pvals(pvalfnc_names, prt=sys.stdout):
    """Return ``{pvalcalc name: uncorrected p-values}`` for the mouse study.

    Args:
        pvalfnc_names: iterable of names passed as ``pvalcalc``.
        prt: stream forwarded to ``get_godag`` and ``get_pval_uncorr``.
    """
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), prt, loading_bar=None)
    population = GeneID2nt_mus.keys()
    # 10090: Mouse study
    gene2gos = get_assoc_ncbi_taxids([10090], loading_bar=None)
    study = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")

    def _uncorrected(pvalcalc):
        """Run one study with the given p-value function."""
        goea = GOEnrichmentStudy(population, gene2gos, godag,
                                 propagate_counts=False, alpha=0.05,
                                 methods=None, pvalcalc=pvalcalc)
        return goea.get_pval_uncorr(study, prt)

    return {pvalcalc: _uncorrected(pvalcalc) for pvalcalc in pvalfnc_names}
def get_goeaobj(methods=None):
    """Return a GOEnrichmentStudy built from the small test data under REPO.

    Args:
        methods: forwarded unchanged as the ``methods`` keyword.
    """
    obo_fin = os.path.join(REPO, "go-basic.obo")
    obo_dag = get_godag(obo_fin, loading_bar=None)
    assoc = read_associations("{REPO}/tests/data/small_association".format(REPO=REPO),
                              no_top=True)
    popul_fin = "{REPO}/tests/data/small_population".format(REPO=REPO)
    # Context manager closes the population file; it used to be left open.
    with open(popul_fin) as handle:
        popul_ids = [line.rstrip() for line in handle]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
def get_goeaobj(self, pop_genes, assoc_geneid2gos):
    """Return a GOEnrichmentStudy specific for user-provided pop_genes and associations."""
    go_dag = self.go_dag
    # Study options come from the instance; logging is switched off.
    options = {
        'propagate_counts': self.propagate_counts,
        'alpha': self.alpha,
        'methods': [self.method],
        'log': None,
    }
    return GOEnrichmentStudy(pop_genes, assoc_geneid2gos, go_dag, **options)
def init_goea(**kws):
    """Initialize GODag and GOEnrichmentStudy.

    Args:
        **kws: only ``methods`` is read; it defaults to ``['not_bonferroni']``.

    Returns:
        Tuple ``(GOEnrichmentStudy, study_ids)``.
    """
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    fin_assc = ROOT + "association"
    assoc = read_associations(fin_assc, 'id2gos', no_top=True)
    # Context managers close the id files; they used to be left open.
    with open(ROOT + "population") as handle:
        popul_ids = [line.rstrip() for line in handle]
    methods = kws.get('methods', ['not_bonferroni'])
    with open(ROOT + "study") as handle:
        study_ids = [line.rstrip() for line in handle]
    return GOEnrichmentStudy(popul_ids, assoc, godag, methods=methods), study_ids
def init_goea(**kws):
    """Initialize GODag and GOEnrichmentStudy.

    Args:
        **kws: only ``methods`` is read; it defaults to ``['not_bonferroni']``.

    Returns:
        Tuple ``(GOEnrichmentStudy, study_ids)``.
    """

    def _read_ids(path):
        """Return the right-stripped lines of ``path``, closing the file."""
        with open(path) as handle:
            return [line.rstrip() for line in handle]

    obo_dag = GODag(ROOT + "go-basic.obo")
    assoc = read_associations(ROOT + "association", no_top=True)
    popul_ids = _read_ids(ROOT + "population")
    methods = kws.get('methods', ['not_bonferroni'])
    study_ids = _read_ids(ROOT + "study")
    return GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods), study_ids
def _init_objgoea(self, pop, assoc):
    """Run gene ontology enrichment analysis (GOEA)."""
    cli_args = self.args
    # Counts are propagated unless the no_propagate_counts option is set.
    return GOEnrichmentStudy(
        pop,
        assoc,
        self.godag,
        propagate_counts=not cli_args.no_propagate_counts,
        relationships=False,
        alpha=cli_args.alpha,
        pvalcalc=cli_args.pvalcalc,
        methods=self.methods,
    )
def test_i96():
    """Test to re-produce issue#96: Passes currently."""
    # Trying to duplicate: ValueError("All values in table must be nonnegative.

    # Genes: study set and population
    print('CWD', os.getcwd())
    study = _get_geneids()
    population = GENEID2NT.keys()

    # Databases: download and read the gene2go file, then load the ontology
    print(os.getcwd())
    gene2go_path = os.path.join(REPO, 'gene2go')
    dnld_ncbi_gene_file(gene2go_path, loading_bar=None)
    associations = read_ncbi_gene2go(gene2go_path, [9606])
    dag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)

    # Run GOEA Gene Ontology Enrichment Analysis; the test only checks
    # that the run completes without raising.
    study_obj = GOEnrichmentStudy(population, associations, dag, methods=['fdr_bh'])
    study_obj.run_study(study)
def create_enrichment_study(self):
    """Build and return a GOEnrichmentStudy from the instance's settings.

    Uses ``self.background`` as population, ``self.geneid2gos_human`` as
    associations, ``self.obodag`` as ontology and ``self.alpha`` as cutoff.
    """
    population = self.background  # List of human protein-coding genes (Entrez IDs)
    associations = self.geneid2gos_human  # Gene ID / GO associations
    ontology = self.obodag  # Ontologies
    return GOEnrichmentStudy(
        population,
        associations,
        ontology,
        propagate_counts=False,
        alpha=self.alpha,  # Significance cutoff
        methods=['fdr_bh'],  # Multiple hypothesis correction
    )
def __init__(self):
    """Load ontology, background genes and associations; build the study.

    Sets ``self.goeaobj`` to the configured ``GOEnrichmentStudy``.
    """
    obodag = GODag("../Data/evaluation_reference/goslim_yeast.obo")
    # Context manager closes the gene list; it used to be left open.
    with open('../Data/evaluation_reference/gene_list.txt') as handle:
        background = [line.strip() for line in handle]
    geneid2gos_yeast = read_associations('../Data/evaluation_reference/geneid2gos_yeast.txt')
    # The keyword was previously misspelled ``propogate_counts``, so the
    # intended ``False`` was not passed under the documented name.
    self.goeaobj = GOEnrichmentStudy(
        background,
        geneid2gos_yeast,
        obodag,
        propagate_counts=False,
        alpha=0.05,
        methods=['fdr_bh'])
def get_objgoea(pop, assoc, args):
    """Run gene ontology enrichment analysis (GOEA)."""
    godag = GODag(obo_file=args.obo)
    # ``args.method`` is a comma separated list of correction names.
    method_names = args.method.split(",")
    return GOEnrichmentStudy(
        pop,
        assoc,
        godag,
        propagate_counts=not args.no_propagate_counts,
        alpha=args.alpha,
        pvalcalc=args.pvalcalc,
        methods=method_names,
    )
def get_goeaobj(method, geneids_pop, taxid):
    """Load: ontologies, associations, and population geneids."""
    dag = get_godag()
    gene2gos = get_assoc_ncbi_taxids([taxid])
    # obo_dag is also found in goeaobj.obo_dag
    return GOEnrichmentStudy(
        geneids_pop,
        gene2gos,
        dag,
        propagate_counts=False,
        alpha=0.05,
        methods=[method],
    )
def test_i122():
    """Test to re-produce issue#122: Passes currently."""
    obj = _Run(9606, 'gene2go', 'go-basic.obo')
    study_ids, population_ids = obj.get_genes_study_n_bg()

    # Result is the same whether fisher_scipy_stats of fisher
    study_options = {'methods': ['bonferroni', 'fdr_bh'],
                     'pvalcalc': 'fisher_scipy_stats'}
    goeaobj = GOEnrichmentStudy(population_ids, obj.gene2go, obj.godag, **study_options)

    # Run GOEA Gene Ontology Enrichment Analysis
    results_goeas = goeaobj.run_study_nts(study_ids)
    print('NS GO p stu_ratio pop_ratio p-uncorr bonferro fdr_bh stu ')

    # NOTE(review): nesting was reconstructed from a whitespace-collapsed
    # source; the significance checks are assumed to apply only to records
    # whose study count is zero - confirm.
    for ntd in results_goeas:
        if ntd.study_count == 0:
            corrected = (ntd.p_bonferroni, ntd.p_fdr_bh)
            if [pval for pval in corrected if pval < 0.05]:
                assert ntd.enrichment == 'p'
                print(obj.str_nt(ntd))
def get_goeaobj(method, geneids_pop, taxid):
    """Load: ontologies, associations, and population geneids."""
    dag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    gene2gos = get_assoc_ncbi_taxids([taxid], loading_bar=None)
    study_options = {'propagate_counts': False, 'alpha': 0.05, 'methods': [method]}
    # godag is also found in goeaobj.godag
    return GOEnrichmentStudy(geneids_pop, gene2gos, dag, **study_options)
def return_enrichment_study_obj(gaf_taxfiltered):
    '''
    Generate go enrichment study object with a background dataset.

    The keys of ``gaf_taxfiltered`` serve as population and the mapping
    itself as the geneid/GO associations.
    '''
    # NOTE(review): the second positional argument was reconstructed from a
    # whitespace-collapsed source where it sat behind an inline comment.
    ontology = GODag(config_utils.datadir + "/GOData/go-basic.obo")
    population = gaf_taxfiltered.keys()
    return GOEnrichmentStudy(
        population,
        gaf_taxfiltered,  # geneid/GO associations possible with tree used for DB
        ontology,  # Ontologies
        propagate_counts=False,
        alpha=0.15,  # default significance cut-off
        methods=['fdr_bh'],  # default multipletest correction method
    )
def get_goeaobj(method, geneids_pop, taxid):
    """Load: ontologies, associations, and population geneids.

    Args:
        method: name of the multiple-test correction.
        geneids_pop: population gene ids.
        taxid: taxonomy id whose associations are loaded.

    Returns:
        The configured ``GOEnrichmentStudy``.
    """
    fin_obo = "go-basic.obo"
    if not os.path.isfile(fin_obo):
        # The URL used to start with the literal text "wget ", which is a
        # shell command prefix and not part of a valid URL.
        wget.download("http://geneontology.org/ontology/go-basic.obo", out=fin_obo)
    obo_dag = GODag(fin_obo)
    assoc_geneid2gos = get_assoc_ncbi_taxids([taxid])
    goeaobj = GOEnrichmentStudy(
        geneids_pop,
        assoc_geneid2gos,
        obo_dag,
        propagate_counts=False,
        alpha=0.05,
        methods=[method])
    return goeaobj
# Sanity-check study vs. population (skipped in compare mode), then run the
# enrichment study and print its summary.
if not args.compare:  # sanity check
    if not study:
        # Guard the overlap ratio below against division by zero.
        sys.exit("\nERROR: The study file contains no elements. Please check.\n")
    if len(pop) < len(study):
        sys.exit("\nERROR: The study file contains more elements than the population file. "
                 "Please check that the study file is a subset of the population file.\n")
    # check the fraction of genomic ids that overlap between study
    # and population
    overlap = float(len(study & pop)) / len(study)
    if 0.7 < overlap < 0.95:
        sys.stderr.write("\nWARNING: only {} fraction of genes/proteins in study are found in "
                         "the population background.\n\n".format(overlap))
    if overlap <= 0.7:
        # sys.exit instead of the site-provided exit(), which is not
        # guaranteed to exist outside interactive sessions.
        sys.exit("\nERROR: only {} of genes/proteins in the study are found in the "
                 "background population. Please check.\n".format(overlap))

assoc = read_associations(assoc_fn)

methods = ["bonferroni", "sidak", "holm"]
if args.fdr:
    methods.append("fdr")

obo_dag = GODag(obo_file=args.obo)
propagate_counts = not args.no_propagate_counts
g = GOEnrichmentStudy(pop, assoc, obo_dag,
                      propagate_counts=propagate_counts,
                      alpha=args.alpha,
                      methods=methods)
results = g.run_study(study)
g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval)
def _ns2o(pop, ns2assoc, godag, propagate_counts, alpha, methods, **kws):
    """Return one GOEnrichmentStudy per key of ``ns2assoc``.

    Keys are visited in sorted order; each study receives its key as the
    ``name`` keyword, plus any extra ``kws``.
    """
    ns2obj = {}
    for namespace, assoc in sorted(ns2assoc.items()):
        ns2obj[namespace] = GOEnrichmentStudy(
            pop, assoc, godag, propagate_counts, alpha, methods,
            name=namespace, **kws)
    return ns2obj
# Script: download the GO ontology and NCBI associations, then build a
# GOEnrichmentStudy for the mouse protein-coding genes.
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import goatools
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.associations import read_ncbi_gene2go
from goatools.test_data.genes_NCBI_10090_ProteinCoding import GeneID2nt as GeneID2nt_mus
from goatools.go_enrichment import GOEnrichmentStudy

# Download the ontology and association files (return values kept by name).
obo_fname = download_go_basic_obo()
gene2go = download_ncbi_associations()

obodag = GODag("go-basic.obo")
geneid2gos_mouse = read_ncbi_gene2go("gene2go", taxids=[10090])

# Data will be stored in this variable
geneid2symbol = {}

print("{N:,} annotated mouse genes".format(N=len(geneid2gos_mouse)))
# Preview the first few population gene ids. The result of .keys() has no
# .head() method, so the previous call raised AttributeError.
# NOTE(review): assumes GeneID2nt_mus is a plain dict - confirm.
print(list(GeneID2nt_mus.keys())[:5])

goeaobj = GOEnrichmentStudy(
    GeneID2nt_mus.keys(),  # List of mouse protein-coding genes
    geneid2gos_mouse,  # geneid/GO associations
    obodag,  # Ontologies
    propagate_counts=False,
    alpha=0.05,  # default significance cut-off
    methods=['fdr_bh'])  # default multipletest correction method
if 0.7 < overlap < 0.95: sys.stderr.write("\nWARNING: only {} fraction of genes/proteins in study are found in " "the population background.\n\n".format(overlap)) if overlap <= 0.7: exit("\nERROR: only {} of genes/proteins in the study are found in the " "background population. Please check.\n".format(overlap)) assoc = read_associations(assoc_fn) methods = args.method.split(",") obo_dag = GODag(obo_file=args.obo) propagate_counts = not args.no_propagate_counts g = GOEnrichmentStudy(pop, assoc, obo_dag, propagate_counts=propagate_counts, alpha=args.alpha, pvalcalc=args.pvalcalc, methods=methods) results = g.run_study(study) if args.outfile is None: g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval) else: # Users can print to both tab-separated file and xlsx file in one run. outfiles = args.outfile.split(",") prt_if = None # Print all values if args.pval is not None: # Only print out when uncorrected p-value < this value. prt_if = lambda nt: nt.p_uncorrected < args.pval for outfile in outfiles: if outfile.endswith(".xlsx"): g.wr_xlsx(outfile, results, prt_if=prt_if)
elif snakemake.wildcards.state_type == 'Enhancer': min_dist = 5000 max_dist = 50000 else: sys.exit(-1) with open(snakemake.input.clusters) as f: for line in f: cols = line.strip().split() cluster = chr(int(cols[3]) + 65) if int(cols[-1]) <= max_dist and int(cols[-1]) >= min_dist: genes[cluster].add(cols[7]) background.add(cols[7]) obodag = GODag("go-basic.obo") id2go = read_associations("sym2go.txt") goeaobj = GOEnrichmentStudy(background, id2go, obodag, propagate_counts=False, alpha=0.05, methods=['fdr_bh']) outfile = open(snakemake.output.txt, 'w') for cluster, geneids in sorted(genes.items()): outfile.write("Cluster {}\n".format(cluster)) goea_results_all = goeaobj.run_study(geneids) for fdr, name, enrichment in sorted([(r.p_fdr_bh, r.name, r.enrichment) for r in goea_results_all if r.p_fdr_bh < 0.2]): outfile.write("\t{}\t{}\t{}\n".format(name, fdr, enrichment)) outfile.write("\n") #GOEnrichmentStudy.print_summary(goea_results_sig)