def run(study, pop, assoc, alpha=0.05, p_value=0.05, compare=False, ratio=None,
        obo='go-basic.obo', no_propagate_counts=False,
        method='bonferroni,sidak,holm', pvalcalc='fisher'):
    '''
    Run a GOATOOLS enrichment study and wrap the result table in a ``GO`` object.

    :param study: study genes; a file name (used only when ``pop`` is a file
        name too) or an iterable of gene ids
    :param pop: population genes; a file name or an iterable of gene ids
    :param assoc: the association from the gene to the go term. A plain dict
        is written to a file next to this module and read back with
        ``read_associations``; a ``defaultdict`` is used as is; anything else
        is handed to ``read_associations`` directly
    :param alpha: significance cut-off forwarded to ``GOEnrichmentStudy``
    :param p_value: stored on the returned ``GO`` object
    :param compare: forwarded to ``GO._read_geneset`` when reading from files
    :param ratio: stored on the returned ``GO`` object
    :param obo: path of the ontology file; the default value selects
        ``obo/go.obo`` next to this module
    :param no_propagate_counts: when true, ``propagate_counts=False`` is used
    :param method: comma-separated multiple-test correction method names
    :param pvalcalc: p-value calculation forwarded to ``GOEnrichmentStudy``
    :return: a ``GO`` object built from the result table and the inputs
    '''
    # isinstance also accepts str subclasses, which the exact type() test did not.
    if isinstance(study, str) and isinstance(pop, str):
        # load the study and pop from the file
        study, pop = GO._read_geneset(study, pop, compare=compare)
    else:
        # convert to the set
        study = frozenset(study)
        pop = set(pop)

    methods = method.split(",")

    if obo == 'go-basic.obo':
        obo = os.path.dirname(os.path.realpath(__file__)) + "/obo/go.obo"
    if not os.path.exists(obo):
        print("obo file not found, start to download")
        wget.download('http://purl.obolibrary.org/obo/go/go-basic.obo', obo)

    obo_dag = GODag(obo)
    propagate_counts = not no_propagate_counts

    # defaultdict is tested first because it is itself a dict subclass and
    # must keep bypassing the write/read round trip.
    if isinstance(assoc, defaultdict):
        pass
    elif isinstance(assoc, dict):
        # One "gene<TAB>go1;go2" line per gene with a non-empty term list,
        # assembled with join instead of repeated string concatenation.
        buf = "".join(
            "{}\t{}\n".format(k, ";".join([str(x) for x in v if x]))
            for k, v in assoc.items() if v
        )
        # The file lives next to this module, so that directory must be writable.
        path = os.path.dirname(os.path.realpath(__file__)) + "/assoc"
        with open(path, 'w') as fp:
            fp.write(buf)
        assoc = read_associations(path)
    else:
        # if from a file
        assoc = read_associations(assoc)

    g = GOEnrichmentStudy(pop, assoc, obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=alpha,
                          pvalcalc=pvalcalc,
                          methods=methods)
    results = g.run_study(study)

    # NOTE(review): the header always names p_bonferroni/p_sidak/p_holm, whatever
    # `method` contains -- confirm the columns line up for other method lists.
    header = ('GO\tNS\tenrichment\tname\tratio_in_study\tratio_in_pop\t'
              'p_uncorrected\tdepth\tstudy_count\tp_bonferroni\tp_sidak\t'
              'p_holm\thit\n')
    r = header + "".join(str(x) + "\n" for x in results)
    tb = pd.read_table(StringIO(r))
    return GO(tb, study, pop, assoc, alpha, p_value, compare, ratio, obo,
              no_propagate_counts, method, pvalcalc, obo_dag)
def get_goeaobj(methods=None):
    """Build a GOEnrichmentStudy from the GO-slim files found under ROOT.

    Args:
        methods: Multiple-test correction methods forwarded unchanged to
            GOEnrichmentStudy.

    Returns:
        The GOEnrichmentStudy built from the slim ontology, the slim
        association file and the small population file.
    """
    obo_dag = GODag(ROOT + "goslim_generic.obo")
    assoc = read_associations(ROOT + "slim_association", no_top=True)
    # Read inside a context manager so the population file is closed promptly.
    with open(ROOT + "small_population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
def get_goeaobj(methods=None):
    """Build a GOEnrichmentStudy from go-basic.obo and the ../data files.

    Args:
        methods: Multiple-test correction methods forwarded unchanged to
            GOEnrichmentStudy.

    Returns:
        The GOEnrichmentStudy built from the ontology, the association file
        and the population file.
    """
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Read inside a context manager so the population file is closed promptly.
    with open("../data/population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
def rd_files(self):
    """Load the association, study and population named in self.args.filenames.

    The study and population sizes are printed before returning.

    Returns:
        The tuple (study, pop, assoc).
    """
    fin_study, fin_pop, fin_assoc = self.args.filenames
    # The association is read first, then the two gene sets.
    associations = read_associations(fin_assoc)
    study_items, pop_items = self._read_geneset(fin_study, fin_pop)
    summary = "Study: {0} vs. Population {1}\n".format(
        len(study_items), len(pop_items))
    print(summary)
    return study_items, pop_items, associations
def test_fdr_bh(fout_log=None):
    """Run a GOEA with Benjamini-Hochberg correction and report it three ways.

    Significant results are printed to the log stream and written to
    goea_fdr_bh.tsv and goea_fdr_bh.xlsx.

    Args:
        fout_log: Name of a log file to create; when None, the log stream is
            sys.stdout.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    log = sys.stdout if fout_log is None else open(fout_log, 'w')
    try:
        obo_dag = GODag("go-basic.obo")
        assoc = read_associations("../data/association", no_top=True)
        # Context managers close the id files as soon as they are read.
        with open("../data/population") as ifstrm:
            popul_ids = [line.rstrip() for line in ifstrm]
        with open("../data/study") as ifstrm:
            study_ids = [line.rstrip() for line in ifstrm]
        # 2. Run enrichment analysis
        goea = GOEA(obo_dag, assoc, log)
        goea.set_population(popul_ids)
        goea.set_params(alpha=0.05, method='fdr_bh')
        results_nt = goea.find_enrichment(study_ids)

        # ---------------------------------------------------------------------
        # Print results 3 ways: to screen, to tsv(tab-separated file), to xlsx(Excel spreadsheet)
        fout_tsv = "goea_fdr_bh.tsv"
        fout_xls = "goea_fdr_bh.xlsx"

        field_names = ['NS', 'study_cnt', 'fdr_bh', 'level', 'depth', 'GO', 'name', 'fdr_bh_sig'] # collect these
        print_names = ['NS', 'study_cnt', 'fdr_bh', 'level', 'depth', 'GO', 'name'] # print these in tsv and xlsx

        # Optional user customizable sort:
        #   Sort by: 1st) BP, MF, CC; 2nd) corrected pval, with smallest first.
        sort_by = lambda nt: [nt.NS, nt.fdr_bh]

        # 1. Print results to screen using format in prtfmt. For example:
        #
        #   BP 22 3.073e-03 L06 D07 GO:0006468 protein phosphorylation
        #   BP  9 1.023e-02 L07 D08 GO:0006511 ubiquitin-dependent protein catabolic process
        #   BP  2 1.023e-02 L05 D09 GO:0019877 diaminopimelate biosynthetic process
        #   BP  2 1.223e-02 L04 D08 GO:0006301 postreplication repair
        #   BP  2 1.223e-02 L05 D09 GO:0030418 nicotianamine biosynthetic process
        #   BP  2 1.492e-02 L04 D06 GO:0006909 phagocytosis
        #   BP  2 1.492e-02 L03 D03 GO:0051322 anaphase
        #   ...
        # Print format field names are the same names as in the "field_names" variable.
        prtfmt = "{NS} {study_cnt:2} {fdr_bh:5.3e} L{level:02} D{depth:02} {GO} {name}\n"
        keep_if = lambda nt: nt.fdr_bh_sig # T/F: Keep the GOEA GO Term result only if the result is significant.
        goea.prt_txt(log, results_nt, field_names, prtfmt, sort_by=sort_by, keep_if=keep_if)

        # 2. Write results to tsv file
        # Sort by: 1st) BP, MF, CC; 2nd) By GO depth, deepest GO first.
        sort_by = lambda nt: [nt.NS, -1*nt.depth]
        fld2fmt = {'fdr_bh':'{:8.2e}'} # Optional user defined formatting for specific fields
        goea.wr_tsv(fout_tsv, results_nt, field_names,
                    keep_if=keep_if, sort_by=sort_by, fld2fmt=fld2fmt, print_names=print_names)

        # 3. Write results to xlsx file
        # Use these headers instead of the print_names for the xlsx header
        hdrs = ['NS', 'Cnt', 'fdr_bh', 'L', 'D', 'Term', 'Ontology Term Name']
        # TBD Check that header and size of fields printed match
        goea.wr_xlsx(fout_xls, results_nt, field_names,
                     # optional key-word args (ie, kwargs, kws)
                     keep_if=keep_if, sort_by=sort_by, hdrs=hdrs, fld2fmt=fld2fmt,
                     print_names=print_names)
    finally:
        # Close only a log file opened here, even when the analysis raised.
        if fout_log is not None:
            log.close()
    if fout_log is not None:
        sys.stdout.write(" WROTE: {}\n".format(fout_log))
def init_goea(**kws):
    """Initialize GODag and GOEnrichmentStudy from the files under ROOT.

    Args:
        **kws: Only 'methods' is read; it defaults to ['not_bonferroni'].

    Returns:
        The tuple (GOEnrichmentStudy object, list of study ids).
    """
    obo_dag = GODag(ROOT + "go-basic.obo")
    assoc = read_associations(ROOT + "association", no_top=True)
    # Context managers close the id files as soon as they are read.
    with open(ROOT + "population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    methods = kws.get('methods', ['not_bonferroni'])
    with open(ROOT + "study") as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    return GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods), study_ids
def get_goeaobj(methods=None):
    """Build a GOEnrichmentStudy from go-basic.obo and the small test data in REPO.

    Args:
        methods: Multiple-test correction methods forwarded unchanged to
            GOEnrichmentStudy.

    Returns:
        The GOEnrichmentStudy built from the ontology, the small association
        file and the small population file.
    """
    obo_fin = os.path.join(REPO, "go-basic.obo")
    obo_dag = get_godag(obo_fin, loading_bar=None)
    assoc = read_associations("{REPO}/tests/data/small_association".format(REPO=REPO), no_top=True)
    popul_fin = "{REPO}/tests/data/small_population".format(REPO=REPO)
    # Read inside a context manager so the population file is closed promptly.
    with open(popul_fin) as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
def init_goea(**kws):
    """Initialize GODag and GOEnrichmentStudy.

    The ontology is loaded from go-basic.obo in the current working
    directory; the association, population and study files come from ROOT.

    Args:
        **kws: Only 'methods' is read; it defaults to ['not_bonferroni'].

    Returns:
        The tuple (GOEnrichmentStudy object, list of study ids).
    """
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    fin_assc = ROOT + "association"
    assoc = read_associations(fin_assc, 'id2gos', no_top=True)
    # Context managers close the id files as soon as they are read.
    with open(ROOT + "population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    methods = kws.get('methods', ['not_bonferroni'])
    with open(ROOT + "study") as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    return GOEnrichmentStudy(popul_ids, assoc, godag, methods=methods), study_ids
def rd_files(filenames, compare, prt=sys.stdout):
    """Load the association, study and population from three file names.

    Args:
        filenames: Sequence unpacked as (study file, population file,
            association file).
        compare: Forwarded to read_geneset.
        prt: Stream that receives the size summary; a falsy value silences it.

    Returns:
        The tuple (study, pop, assoc).
    """
    fin_study, fin_pop, fin_assoc = filenames
    # The association is read first, then the two gene sets.
    associations = read_associations(fin_assoc)
    study_items, pop_items = read_geneset(fin_study, fin_pop, compare=compare)
    if not prt:
        return study_items, pop_items, associations
    summary = "Study: {0} vs. Population {1}\n".format(
        len(study_items), len(pop_items))
    prt.write(summary)
    return study_items, pop_items, associations
def test_goea():
    """Run a GOEA with the 'fdr' method on the ../data files and print a summary."""
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Context managers close the id files as soon as they are read.
    with open("../data/population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    with open("../data/study") as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=['fdr'])
    goea_results = goeaobj.run_study(study_ids)
    goeaobj.print_summary(goea_results)
def get_goeaobj(methods=None):
    """Build a GOEnrichmentStudy from go-basic.obo and the small test data in REPO.

    The association file is read in 'id2gos' format.

    Args:
        methods: Multiple-test correction methods forwarded unchanged to
            GOEnrichmentStudy.

    Returns:
        The GOEnrichmentStudy built from the ontology, the small association
        file and the small population file.
    """
    obo_fin = os.path.join(REPO, "go-basic.obo")
    obo_dag = get_godag(obo_fin, loading_bar=None)
    fin_assc = "{REPO}/tests/data/small_association".format(REPO=REPO)
    assoc = read_associations(fin_assc, 'id2gos', no_top=True)
    popul_fin = "{REPO}/tests/data/small_population".format(REPO=REPO)
    # Read inside a context manager so the population file is closed promptly.
    with open(popul_fin) as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
def get_goea_results(method="fdr_bh"):
    """Run a GOEA on the GO-slim files in the 'data' directory beside this file.

    Args:
        method: Name of the single multiple-test correction method to use.

    Returns:
        The results returned by GOEnrichmentStudy.run_study for the small
        study file.
    """
    root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    obo_fin = os.path.join(root_dir, "goslim_generic.obo")
    obo_dag = GODag(obo_fin)
    assoc = read_associations(os.path.join(root_dir, "slim_association"), no_top=True)
    # Context managers close the id files as soon as they are read.
    with open(os.path.join(root_dir, "small_population")) as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=[method])
    with open(os.path.join(root_dir, "small_study")) as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    goea_results = goeaobj.run_study(study_ids, methods=[method])
    return goea_results
def __init__(self):
    """Load the yeast GO-slim reference data and build self.goeaobj.

    The ontology, background gene list and gene-to-GO associations are read
    from ../Data/evaluation_reference; the enrichment study uses the
    'fdr_bh' method with alpha 0.05.
    """
    obodag = GODag("../Data/evaluation_reference/goslim_yeast.obo")
    # Read inside a context manager so the gene list file is closed promptly.
    with open('../Data/evaluation_reference/gene_list.txt') as gene_file:
        background = [line.strip() for line in gene_file]
    geneid2gos_yeast = read_associations('../Data/evaluation_reference/geneid2gos_yeast.txt')
    self.goeaobj = GOEnrichmentStudy(
        background,
        geneid2gos_yeast,
        obodag,
        # The keyword was misspelled "propogate_counts", so the intended
        # setting never reached the propagate_counts parameter.
        propagate_counts=False,
        alpha=0.05,
        methods=['fdr_bh'])
def init_goea(log):
    """Read Ontologies and Annotations once.

    Args:
        log: Passed as the third argument to GOEA.

    Returns:
        A GOEA object whose population has been set from ../data/population.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Read inside a context manager so the population file is closed promptly.
    with open("../data/population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    # 2. Run enrichment analysis
    goeaobj = GOEA(obo_dag, assoc, log)
    goeaobj.set_population(popul_ids)
    return goeaobj
def run_bonferroni():
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest.

    The ontology is loaded from go-basic.obo in the current working
    directory; the association, population and study files come from
    REPO/data.

    Returns:
        The tuple (results from run_study, GOEnrichmentStudy object).
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    assoc = read_associations(os.path.join(REPO, "data/association"), no_top=True)
    # Context managers close the id files as soon as they are read.
    with open(os.path.join(REPO, "data/population")) as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    with open(os.path.join(REPO, "data/study")) as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, godag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
def run_bonferroni(log):
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest.

    Args:
        log: Not referenced in the body; kept so existing callers still work.

    Returns:
        The tuple (results from run_study, GOEnrichmentStudy object).
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Context managers close the id files as soon as they are read.
    with open("../data/population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    with open("../data/study") as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, obo_dag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
def run_bonferroni():
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest.

    The ontology is loaded from go-basic.obo in the current working
    directory; the association (read in 'id2gos' format), population and
    study files come from REPO/data.

    Returns:
        The tuple (results from run_study, GOEnrichmentStudy object).
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    fin_assc = os.path.join(REPO, "data/association")
    assoc = read_associations(fin_assc, 'id2gos', no_top=True)
    # Context managers close the id files as soon as they are read.
    with open(os.path.join(REPO, "data/population")) as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    with open(os.path.join(REPO, "data/study")) as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, godag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
if opts.term not in go_dag: sys.stderr.write(("term %s not found!\n" % opts.term)) sys.exit(1) direct_anc, all_anc = mapslim(opts.term, go_dag, goslim_dag) # output either all or only direct slims, depending on user command if only_direct: slim_terms_str = ";".join(direct_anc) else: slim_terms_str = ";".join(all_anc) print(slim_terms_str) # in case a association file is given as input if opts.ass_file_name: assert os.path.exists(opts.ass_file_name), ("file %s not found!" % opts.ass_file_name) assocs = read_associations(opts.ass_file_name, 'id2gos') for protein_product, go_terms in assocs.items(): all_direct_anc = set() all_covered_anc = set() all_all_anc = set() for go_term in go_terms: if go_term not in go_dag: continue direct_anc, all_anc = mapslim(go_term, go_dag, goslim_dag) all_all_anc |= all_anc # collect all covered ancestors, so the direct ancestors # can be calculated afterwards all_covered_anc |= (all_anc - direct_anc) all_direct_anc = all_all_anc - all_covered_anc # output either all or only direct, depending on user command if only_direct:
if not args.compare: # sanity check if len(pop) < len(study): exit("\nERROR: The study file contains more elements than the population file. " "Please check that the study file is a subset of the population file.\n") # check the fraction of genomic ids that overlap between study # and population overlap = float(len(study & pop)) / len(study) if 0.7 < overlap < 0.95: sys.stderr.write("\nWARNING: only {} fraction of genes/proteins in study are found in " "the population background.\n\n".format(overlap)) if overlap <= 0.7: exit("\nERROR: only {} of genes/proteins in the study are found in the " "background population. Please check.\n".format(overlap)) assoc = read_associations(assoc_fn) methods = args.method.split(",") obo_dag = GODag(obo_file=args.obo) propagate_counts = not args.no_propagate_counts g = GOEnrichmentStudy(pop, assoc, obo_dag, propagate_counts=propagate_counts, alpha=args.alpha, pvalcalc=args.pvalcalc, methods=methods) results = g.run_study(study) if args.outfile is None: g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval) else: # Users can print to both tab-separated file and xlsx file in one run.
in_memb = in_memb or term in plasma_membrane in_nucl = in_nucl or term in nucleus if in_cyto: print("cytoplasmic_part\t" + prot) # if in_memb: # print("plasma_membrane_part\t" + prot) if in_nucl: print("nuclear_part\t" + prot) if __name__ == "__main__": if len(sys.argv) != 3: sys.exit("USAGE: <script> ASSOCS OBO") assoc_file = sys.argv[1] obo_file = sys.argv[2] obo_dag = GODag(obo_file=obo_file, optional_attrs=["relationship"]) cytoplasm = set(["GO:0005737"]) plasma_membrane = set(["GO:0005886"]) nucleus = set(["GO:0005634"]) for term in obo_dag: term_rec = obo_dag[term] parents = get_really_all_parents(term_rec) if "GO:0044444" in parents: cytoplasm.add(term) if "GO:0044459" in parents: plasma_membrane.add(term) if "GO:0044428" in parents: nucleus.add(term) assoc = read_associations(assoc_file) print_locations(assoc, cytoplasm, plasma_membrane, nucleus)
# res_df_name = 'res_clustering.csv', # method='ward' , # metric='euclidean' # ) #============================================================================== data_path = os.path.abspath("data") res_path = os.path.abspath("results") in_go = data_path + '/go-basic.obo' in_assoc = data_path + '/associations.txt' gene_found = [ strip(n.split('\t')[0]) for n in open(data_path + '/in_df.txt').read().split('\n') ] obodag = GODag(in_go) geneid2gos = read_associations(in_assoc) goeaobj = GOEnrichmentStudy( gene_found, # List of mouse protein-coding genes geneid2gos, # geneid/GO associations obodag, # Ontologies propagate_counts=False, alpha=0.05, # default significance cut-off methods=['fdr_bh']) clustering_df = pd.DataFrame.from_csv('res_clustering.csv') for index, col in enumerate(clustering_df.columns): in_data = clustering_df[col].value_counts() in_data = in_data[in_data >= 2] cut_distance = col.split('_')[-1] print index, col, cut_distance for cluster in in_data.index.values:
if opts.term not in go_dag: sys.stderr.write(("term %s not found!\n" % opts.term)) sys.exit(1) direct_anc, all_anc = mapslim(opts.term, go_dag, goslim_dag) # output either all or only direct slims, depending on user command if only_direct: slim_terms_str = ";".join(direct_anc) else: slim_terms_str = ";".join(all_anc) print(slim_terms_str) # in case a association file is given as input if opts.ass_file_name: assert os.path.exists(opts.ass_file_name), ("file %s not found!" % opts.ass_file_name) assocs = read_associations(opts.ass_file_name) for protein_product, go_terms in assocs.items(): all_direct_anc = set() all_covered_anc = set() all_all_anc = set() for go_term in go_terms: if go_term not in go_dag: continue direct_anc, all_anc = mapslim(go_term, go_dag, goslim_dag) all_all_anc |= all_anc # collect all covered ancestors, so the direct ancestors # can be calculated afterwards all_covered_anc |= (all_anc - direct_anc) all_direct_anc = all_all_anc - all_covered_anc # output either all or only direct, depending on user command if only_direct:
def enrich(gene2go: str, study: str, obo: str, population: str = None,
           geneid2symbol: str = None, correct='fdr_bh', alpha=0.05, top=20,
           goea_out=None, dag_out=None, dpi=300, show_gene_limit=6,
           only_plot_sig=False):
    """
    Go enrichment based on goatools

    The enrichment table is written to ``goea_out`` and one DAG figure is
    drawn per namespace (BP, MF, CC) that has terms to plot.

    :param gene2go: a file with two columns: gene_id, go_term_id
    :param study: a file with at least one column, first column contains gene id,
        second column is regulation direction
    :param obo: go-basic file download from GeneOntology
    :param population: a file with each row contains one gene; default to use
        all genes in gene2go file as population
    :param geneid2symbol: file with two columns: gene_id, gene_symbol,
        used for DAG plot
    :param correct: pvalue adjustment method passed to ``multipletests``;
        the value 3 is treated as 'fdr_bh'. Available methods are:
        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` : step down method using Sidak adjustments
        - `holm` : step-down method using Bonferroni adjustments
        - `simes-hochberg` : step-up method (independent)
        - `hommel` : closed method based on Simes tests (non-negative)
        - `fdr_bh` : Benjamini/Hochberg (non-negative)
        - `fdr_by` : Benjamini/Yekutieli (negative)
        - `fdr_tsbh` : two stage fdr correction (non-negative)
        - `fdr_tsbky` : two stage fdr correction (non-negative)
    :param alpha: cutoff on the corrected pvalue, default 0.05
    :param top: maximum number of go terms plotted per namespace (the first
        rows of the result table are used)
    :param goea_out: output enrichment result file, default study + '.goea.xls'
    :param dag_out: dag figure file, default study + '.goea.dag.svg'
    :param dpi: resolution of image, no effect for svg
    :param show_gene_limit: the max number of gene in a node to show
    :param only_plot_sig: only plot dag for significantly enriched terms
    :return: None
    """
    import os  # local import: only needed to split the figure file name

    if str(correct) == '3':
        correct = 'fdr_bh'

    if geneid2symbol:
        with open(geneid2symbol) as handle:
            geneid2symbol = dict(
                x.strip().split()[:2] for x in handle if x.strip())
    else:
        geneid2symbol = dict()

    obo = GODag(obo, optional_attrs=['relationship', 'is_a'])
    gene2go = read_associations(gene2go)

    # Read the study file once and close it, instead of re-opening it up to
    # three times without closing.
    with open(study) as handle:
        study_lines = [x.strip() for x in handle]
    study_genes = [x.split()[0] for x in study_lines]
    try:
        reg_dict = dict(x.split()[:2] for x in study_lines)
    except ValueError:
        # dict() raises ValueError when a line lacks the second column;
        # fall back to an empty regulation direction for every gene.
        reg_dict = {x: '' for x in study_lines}

    if not population:
        population = gene2go.keys()
    else:
        with open(population) as handle:
            population = [x.strip().split()[0] for x in handle if x.strip()]

    goea_obj = GOEnrichmentStudy(population, gene2go, obo,
                                 propagate_counts=False,
                                 alpha=alpha,
                                 methods=('fdr_bh', ))
    keep_if = lambda r: r.ratio_in_study[0] != 0
    goea_results_all = goea_obj.run_study(study_genes, keep_if=keep_if)
    goea_out = goea_out or study + '.goea.xls'
    goea_obj.wr_tsv(goea_out, goea_results_all)

    def func(y):
        """Rewrite 'g1, g2' as 'g1|direction|symbol;g2|...' where known."""
        results = []
        genes = [x.strip() for x in y.split(',')]
        for gene in genes:
            tmp = [gene]
            if gene in reg_dict:
                tmp.append(reg_dict[gene])
            if gene in geneid2symbol:
                tmp.append(geneid2symbol[gene])
            results.append('|'.join(tmp))
        return ';'.join(results)

    table = pd.read_table(goea_out, header=0, index_col=0)
    # Re-correct the p-values with the requested method and update the table.
    corrected = multipletests(table['p_uncorrected'], method=correct)[1]
    table['p_fdr_bh'] = corrected
    # Update goea_results_all as well so the later plotting uses the same
    # values; a separate loop name keeps the corrected array from being shadowed.
    for result, pval in zip(goea_results_all, corrected):
        result.p_fdr_bh = pval
    table.columns = [
        x if x != 'p_fdr_bh' else 'p_corrected' for x in table.columns
    ]
    table['enrichment'] = [
        'e' if x <= alpha else 'p' for x in table['p_corrected']
    ]
    table['study_items'] = table.loc[:, 'study_items'].map(func)
    table.to_csv(goea_out, header=True, index=True, sep='\t')

    # -------------------plot dag------------------------
    dag_out = dag_out or study + '.goea.dag.svg'
    # splitext handles extensions of any length; the fixed [-4:] slice only
    # worked for three-letter extensions.
    dag_stem, dag_ext = os.path.splitext(dag_out)
    for each in ['BP', 'MF', 'CC']:
        if only_plot_sig:
            goea_results_sig = table[table['enrichment'] == 'e']
        else:
            goea_results_sig = table.copy()
        goea_results_sig = goea_results_sig[goea_results_sig['NS'] == each]
        if not goea_results_sig.shape[0]:
            print(f"No significant term to plot for {each} ")
            # Move on to the next namespace; returning here would skip the
            # remaining namespaces as well.
            continue
        if goea_results_sig.shape[0] >= top:
            goea_results_sig = goea_results_sig.iloc[:top]
        goid_subset = list(goea_results_sig.index)
        plot_gos(
            dag_stem + '.' + each + dag_ext,
            # Source GO ids; a node missing from the analysis results would be
            # drawn pale green, but that case does not occur here.
            goid_subset,
            obo,
            goea_results=goea_results_all,  # use pvals for coloring
            # We can further configure the plot...
            id2symbol=geneid2symbol,  # Print study gene Symbols, not GeneIDs
            study_items=show_gene_limit,  # Max number of gene Symbols on GO terms
            items_p_line=3,  # Print 3 genes per line
            dpi=0 if dag_out.endswith('svg') else dpi,
        )
max_dist = 1000 elif snakemake.wildcards.state_type == 'Enhancer': min_dist = 5000 max_dist = 50000 else: sys.exit(-1) with open(snakemake.input.clusters) as f: for line in f: cols = line.strip().split() cluster = chr(int(cols[3]) + 65) if int(cols[-1]) <= max_dist and int(cols[-1]) >= min_dist: genes[cluster].add(cols[7]) background.add(cols[7]) obodag = GODag("go-basic.obo") id2go = read_associations("sym2go.txt") goeaobj = GOEnrichmentStudy(background, id2go, obodag, propagate_counts=False, alpha=0.05, methods=['fdr_bh']) outfile = open(snakemake.output.txt, 'w') for cluster, geneids in sorted(genes.items()): outfile.write("Cluster {}\n".format(cluster)) goea_results_all = goeaobj.run_study(geneids) for fdr, name, enrichment in sorted([(r.p_fdr_bh, r.name, r.enrichment) for r in goea_results_all if r.p_fdr_bh < 0.2]): outfile.write("\t{}\t{}\t{}\n".format(name, fdr, enrichment)) outfile.write("\n")
max_dist = 1000 elif snakemake.wildcards.state_type == 'Enhancer': min_dist = 5000 max_dist = 50000 else: sys.exit(-1) with open(snakemake.input.clusters) as f: for line in f: cols = line.strip().split() cluster = chr(int(cols[3]) + 65) if int(cols[-1]) <= max_dist and int(cols[-1]) >= min_dist: genes[cluster].add(cols[7]) background.add(cols[7]) obodag = GODag("go-basic.obo") id2go = read_associations("sym2go.txt") goeaobj = GOEnrichmentStudy(background, id2go, obodag, propagate_counts=False, alpha=0.05, methods=['fdr_bh']) outfile = open(snakemake.output.txt, 'w') for cluster, geneids in sorted(genes.items()): outfile.write("Cluster {}\n".format(cluster)) goea_results_all = goeaobj.run_study(geneids) for fdr, name, enrichment in sorted([(r.p_fdr_bh, r.name, r.enrichment) for r in goea_results_all if r.p_fdr_bh < 0.2]): outfile.write("\t{}\t{}\t{}\n".format(name, fdr, enrichment)) outfile.write("\n") #GOEnrichmentStudy.print_summary(goea_results_sig)
if not args.compare: # sanity check if len(pop) < len(study): exit("\nERROR: The study file contains more elements than the population file. " "Please check that the study file is a subset of the population file.\n") # check the fraction of genomic ids that overlap between study # and population overlap = float(len(study & pop)) / len(study) if 0.7 < overlap < 0.95: sys.stderr.write("\nWARNING: only {} fraction of genes/proteins in study are found in " "the population background.\n\n".format(overlap)) if overlap <= 0.7: exit("\nERROR: only {} of genes/proteins in the study are found in the " "background population. Please check.\n".format(overlap)) assoc = read_associations(assoc_fn) methods = args.method.split(",") if args.fdr: methods.append("fdr") obo_dag = GODag(obo_file=args.obo) propagate_counts = not args.no_propagate_counts g = GOEnrichmentStudy(pop, assoc, obo_dag, propagate_counts=propagate_counts, alpha=args.alpha, methods=methods) results = g.run_study(study) if args.outfile is None: g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval)
def test_fdr_bh(fout_log=None):
    """Run a GOEA with Benjamini-Hochberg correction and report it three ways.

    Significant results are printed to the log stream and written to
    goea_fdr_bh.tsv and goea_fdr_bh.xlsx.

    Args:
        fout_log: Name of a log file to create; when None, the log stream is
            sys.stdout.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    log = sys.stdout if fout_log is None else open(fout_log, 'w')
    try:
        obo_dag = GODag("go-basic.obo")
        assoc = read_associations("../data/association", no_top=True)
        # Context managers close the id files as soon as they are read.
        with open("../data/population") as ifstrm:
            popul_ids = [line.rstrip() for line in ifstrm]
        with open("../data/study") as ifstrm:
            study_ids = [line.rstrip() for line in ifstrm]
        # 2. Run enrichment analysis
        goea = GOEA(obo_dag, assoc, log)
        goea.set_population(popul_ids)
        goea.set_params(alpha=0.05, method='fdr_bh')
        results_nt = goea.find_enrichment(study_ids)

        # ---------------------------------------------------------------------
        # Print results 3 ways: to screen, to tsv(tab-separated file), to xlsx(Excel spreadsheet)
        fout_tsv = "goea_fdr_bh.tsv"
        fout_xls = "goea_fdr_bh.xlsx"

        field_names = [
            'NS', 'study_cnt', 'fdr_bh', 'level', 'depth', 'GO', 'name',
            'fdr_bh_sig'
        ]  # collect these
        print_names = [
            'NS', 'study_cnt', 'fdr_bh', 'level', 'depth', 'GO', 'name'
        ]  # print these in tsv and xlsx

        # Optional user customizable sort:
        #   Sort by: 1st) BP, MF, CC; 2nd) corrected pval, with smallest first.
        sort_by = lambda nt: [nt.NS, nt.fdr_bh]

        # 1. Print results to screen using format in prtfmt. For example:
        #
        #   BP 22 3.073e-03 L06 D07 GO:0006468 protein phosphorylation
        #   BP  9 1.023e-02 L07 D08 GO:0006511 ubiquitin-dependent protein catabolic process
        #   BP  2 1.023e-02 L05 D09 GO:0019877 diaminopimelate biosynthetic process
        #   BP  2 1.223e-02 L04 D08 GO:0006301 postreplication repair
        #   BP  2 1.223e-02 L05 D09 GO:0030418 nicotianamine biosynthetic process
        #   BP  2 1.492e-02 L04 D06 GO:0006909 phagocytosis
        #   BP  2 1.492e-02 L03 D03 GO:0051322 anaphase
        #   ...
        # Print format field names are the same names as in the "field_names" variable.
        prtfmt = "{NS} {study_cnt:2} {fdr_bh:5.3e} L{level:02} D{depth:02} {GO} {name}\n"
        keep_if = lambda nt: nt.fdr_bh_sig  # T/F: Keep the GOEA GO Term result only if the result is significant.
        goea.prt_txt(log,
                     results_nt,
                     field_names,
                     prtfmt,
                     sort_by=sort_by,
                     keep_if=keep_if)

        # 2. Write results to tsv file
        # Sort by: 1st) BP, MF, CC; 2nd) By GO depth, deepest GO first.
        sort_by = lambda nt: [nt.NS, -1 * nt.depth]
        fld2fmt = {
            'fdr_bh': '{:8.2e}'
        }  # Optional user defined formatting for specific fields
        goea.wr_tsv(fout_tsv,
                    results_nt,
                    field_names,
                    keep_if=keep_if,
                    sort_by=sort_by,
                    fld2fmt=fld2fmt,
                    print_names=print_names)

        # 3. Write results to xlsx file
        # Use these headers instead of the print_names for the xlsx header
        hdrs = ['NS', 'Cnt', 'fdr_bh', 'L', 'D', 'Term', 'Ontology Term Name']
        # TBD Check that header and size of fields printed match
        goea.wr_xlsx(
            fout_xls,
            results_nt,
            field_names,
            # optional key-word args (ie, kwargs, kws)
            keep_if=keep_if,
            sort_by=sort_by,
            hdrs=hdrs,
            fld2fmt=fld2fmt,
            print_names=print_names)
    finally:
        # Close only a log file opened here, even when the analysis raised.
        if fout_log is not None:
            log.close()
    if fout_log is not None:
        sys.stdout.write(" WROTE: {}\n".format(fout_log))
Pre = Precision(TP,FP) Sen = Sensitivity(TP,FN) F = F1(Pre,Sen) # return (Pre , Sen , F1) #return (Pre) #return (Sen) return (F) mean=[] num=[] #df = pd.read_csv('/sf/smpdata1/pronozinau/OrthoDB/odb10v0_gene_xrefs_onlyGO.tab', sep='\t', header=None) #df.columns = ['ort', 'GO', '3'] #zipbO = zip(df['ort'].to_list(), df['GO'].to_list()) #my_dict = defaultdict(list) #for k, v in zipbO: # my_dict[k].append(v) my_dict = read_associations('/sf/smpdata1/pronozinau/Blast_test/GO_slim/GO_slim.csv', 'id2gos') #def find_csv_filenames( path_to_dir, suffix=".csv" ): # filenames = listdir(path_to_dir) # return [ filename for filename in filenames if filename.endswith( suffix ) ] #ba = find_csv_filenames("/sf/smpdata1/pronozinau/clust/group3", "csv") ba = pd.read_csv('/storage/pronozinau/OrthoDB/mono_sp.csv', sep=',') for w in ba['0']: try: clustal = pd.read_csv('/storage/pronozinau/OrthoDB/clustalw/group_1/' + w + '.csv', sep='\t', header=None) blast = pd.read_csv('/storage/pronozinau/ALL_base_OtrhoDB/metout/group1_bla/' + w + '.csv', sep='\t', header=None) first = pd.read_csv('first_prot.csv', sep='\t', header=None) blast.columns = ['id_prot', 'id_orth', 'persent', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'] clustal.columns = ['id_orth', 'id_prot', 'persent', '4'] first.columns = ['ort', 'id', 'gr']