def __init__(self, pop, assoc, obo_dag, propagate_counts=True, alpha=.05, methods=None, **kws):
    """Store the population and GOEA settings, then index the population by GO term.

    Args:
        pop: Iterable of population item IDs; it is de-duplicated into a set.
        assoc: Association data handed to ``update_association`` and ``get_terms``.
        obo_dag: GO DAG object handed to ``update_association`` and ``get_terms``.
        propagate_counts: If true, ``update_association`` is called on ``assoc``
            before the population terms are collected.
        alpha: Default significance level stored on the object.
        methods: Multiple-test correction method names; defaults to
            bonferroni, sidak and holm.
        **kws: Optional ``name`` (label used in the load message), ``log``
            (stream, defaults to ``sys.stdout``) and ``relationships``
            (forwarded to ``update_association``). All keywords are also
            forwarded to ``FisherFactory``.
    """
    self.name = kws.get('name', 'GOEA')
    print('\nLoad {OBJNAME} Gene Ontology Analysis ...'.format(
        OBJNAME=self.name))
    self.log = kws['log'] if 'log' in kws else sys.stdout
    # Dispatch table: multitest source name -> method that runs the correction
    self._run_multitest = {
        'local': self._run_multitest_local,
        'statsmodels': self._run_multitest_statsmodels,
    }
    self.pop = set(pop)
    # Count the de-duplicated population so the stored size always agrees with
    # self.pop (duplicates in `pop` would otherwise inflate it, and a one-shot
    # iterable has no len() at all).
    self.pop_n = len(self.pop)
    self.assoc = assoc
    self.obo_dag = obo_dag
    self.alpha = alpha
    if methods is None:
        methods = ["bonferroni", "sidak", "holm"]
    self.methods = Methods(methods)
    self.pval_obj = FisherFactory(**kws).pval_obj
    if propagate_counts:
        update_association(assoc, obo_dag, kws.get('relationships', None))
    ## BROAD broad_goids = get_goids_to_remove(kws.get('remove_goids'))
    ## BROAD if broad_goids:
    ## BROAD     assoc = self._remove_assc_goids(assoc, broad_goids)
    # Use the stored set: `pop` may already be exhausted if it was an iterator.
    self.go2popitems = get_terms("population", self.pop, assoc, obo_dag, self.log)
def __init__(self, pop, assoc, obo_dag, propagate_counts=True, alpha=.05, methods=None, **kws):
    """Store the population and GOEA settings, then index the population by GO term.

    Args:
        pop: Population item IDs; stored as given, its length kept in ``pop_n``.
        assoc: Association data handed to ``get_terms``.
        obo_dag: GO DAG object; its ``update_association`` is used when propagating.
        propagate_counts: If true, ``obo_dag.update_association(assoc)`` is called
            before the population terms are collected.
        alpha: Default significance level stored on the object.
        methods: Multiple-test correction method names; defaults to
            bonferroni, sidak and holm.
        **kws: Optional ``log`` stream (defaults to ``sys.stdout``); all keywords
            are also forwarded to ``FisherFactory``.
    """
    self.log = kws.get('log', sys.stdout)
    # Dispatch table: multitest source name -> method that runs the correction
    self._run_multitest = {
        'local': self._run_multitest_local,
        'statsmodels': self._run_multitest_statsmodels,
    }
    self.pop = pop
    self.pop_n = len(pop)
    self.assoc = assoc
    self.obo_dag = obo_dag
    self.alpha = alpha
    default_methods = ["bonferroni", "sidak", "holm"]
    self.methods = Methods(default_methods if methods is None else methods)
    self.pval_obj = FisherFactory(**kws).pval_obj
    if propagate_counts:
        sys.stderr.write("Propagating term counts to parents ..\n")
        obo_dag.update_association(assoc)
    self.go2popitems = get_terms("population", pop, assoc, obo_dag, self.log)
def run_study(self, study, **kws):
    """Run a GO enrichment analysis on the study IDs and return the records.

    Keyword arguments ``methods``, ``alpha`` and ``log`` override the values
    stored on the object for this call only. ``keep_if`` is an optional
    predicate; only records for which it returns true are returned, e.g.:

    >>> keep_if = lambda nt: nt.p_fdr_bh < 0.05
    >>> goea_results = goeaobj.run_study(geneids_study, keep_if=keep_if)

    Returns a list of result records sorted by NS, then enrichment, then
    uncorrected p-value; an empty list when no p-values were calculated.
    """
    # Per-call overrides fall back to the settings stored on the object
    if 'methods' in kws:
        multitest_methods = Methods(kws['methods'])
    else:
        multitest_methods = self.methods
    alpha = kws.get('alpha', self.alpha)
    log = kws.get('log', self.log)

    # Uncorrected p-values come first; nothing to correct means nothing to return
    records = self.get_pval_uncorr(study, log)
    if not records:
        return []

    if log is not None:
        summary = "\n ".join(self.get_results_msg(records, study))
        log.write(" {MSG}\n".format(MSG=summary))

    # Multiple-test corrections update the records in place
    self._run_multitest_corr(records, multitest_methods, alpha, study, log)

    # Attach the GO term to every record (needed for name and level)
    for record in records:
        record.set_goterm(self.obo_dag)

    if 'keep_if' in kws:
        wanted = kws['keep_if']
        records = [record for record in records if wanted(record)]

    # Default sort order: namespace, then enrichment, then p-value
    records.sort(key=lambda rec: (rec.NS, rec.enrichment, rec.p_uncorrected))
    return records
def __init__(self, pop, assoc, obo_dag, propagate_counts=True, alpha=.05, methods=None, **kws):
    """Initialise the enrichment study from a population and its associations.

    Args:
        pop: Population item IDs; stored unchanged together with their count.
        assoc: Association data passed on to ``get_terms``.
        obo_dag: GO DAG object; also provides ``update_association``.
        propagate_counts: When true, ``obo_dag.update_association(assoc)`` runs
            before population terms are gathered.
        alpha: Significance level kept as the object's default.
        methods: Names of multiple-test corrections; bonferroni, sidak and
            holm are used when omitted.
        **kws: May contain ``log`` (output stream, ``sys.stdout`` by default).
            Every keyword is additionally passed to ``FisherFactory``.
    """
    if methods is None:
        methods = ["bonferroni", "sidak", "holm"]

    self.log = kws.get('log', sys.stdout)
    # Maps a multitest source name to the bound method implementing it
    self._run_multitest = dict(
        local=self._run_multitest_local,
        statsmodels=self._run_multitest_statsmodels)
    self.pop, self.pop_n = pop, len(pop)
    self.assoc = assoc
    self.obo_dag = obo_dag
    self.alpha = alpha
    self.methods = Methods(methods)
    self.pval_obj = FisherFactory(**kws).pval_obj
    if propagate_counts:
        sys.stderr.write("Propagating term counts to parents ..\n")
        obo_dag.update_association(assoc)
    self.go2popitems = get_terms("population", pop, assoc, obo_dag, self.log)
def run_study(self, study, **kws):
    """Run a GO enrichment analysis on the study IDs and return the records.

    Keyword arguments: ``name`` labels the study in the start message,
    ``methods`` and ``alpha`` override the object's defaults for this call,
    and ``keep_if`` is an optional predicate selecting the records to return:

    >>> keep_if = lambda nt: nt.p_fdr_bh < 0.05
    >>> goea_results = goeaobj.run_study(geneids_study, keep_if=keep_if)

    Returns a list of result records sorted by enrichment, then NS, then
    uncorrected p-value; an empty list for an empty study or when no
    p-values were calculated.
    """
    label = kws.get('name', 'current')
    log = self._get_log_or_prt(kws)
    if log:
        banner = '\nRun {OBJNAME} Gene Ontology Analysis: {STU} study set of {N} IDs ...'
        log.write(banner.format(OBJNAME=self.name, N=len(study), STU=label))
    if not study:
        return []

    # Per-call overrides fall back to the settings stored on the object
    if 'methods' in kws:
        multitest_methods = Methods(kws['methods'])
    else:
        multitest_methods = self.methods
    alpha = kws.get('alpha', self.alpha)

    # Uncorrected p-values come first; nothing to correct means nothing to return
    records = self.get_pval_uncorr(study, log)
    if not records:
        return []

    if log is not None:
        summary = "\n ".join(self.get_results_msg(records, study))
        log.write(" {MSG}\n".format(MSG=summary))

    # Multiple-test corrections update the records in place
    self._run_multitest_corr(records, multitest_methods, alpha, study, log)

    # Attach the GO term to every record (needed for name and level)
    for record in records:
        record.set_goterm(self.obo_dag)

    if 'keep_if' in kws:
        wanted = kws['keep_if']
        records = [record for record in records if wanted(record)]

    # Default sort order: enrichment, then namespace, then p-value
    records.sort(key=lambda rec: (rec.enrichment, rec.NS, rec.p_uncorrected))
    return records
def _init_args(self):
    """Build the enrichment command-line parser, parse ``sys.argv`` and return the result.

    Prints the help text and exits when no command-line arguments were given.
    Evidence-code help is handled by ``self._prt_evidence_codes`` before
    parsing, and the parsed arguments are passed to
    ``self._check_input_files`` before being returned.
    """
    # NOTE(review): __doc__ is passed positionally, which is argparse's `prog`
    # slot rather than `description` -- confirm this is intended.
    parser = argparse.ArgumentParser(
        __doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = parser.add_argument

    # Positional input files
    add('filenames', type=str, nargs=3,
        help='data/study data/population data/association')

    # Annotation input
    add('--annofmt', default=None, type=str,
        choices=['gene2go', 'gaf', 'gpad', 'id2gos'],
        help=('Annotation file format. '
              'Not needed if type can be determined using filename'))
    add('--taxid', default=9606, type=int,
        help="When using NCBI's gene2go annotation file, specify desired taxid")

    # Statistics thresholds and output selection
    add('--alpha', default=0.05, type=float,
        help='Test-wise alpha for multiple testing')
    add('--pval', default=.05, type=float,
        help='Only print results with uncorrected p-value < PVAL.')
    add('--pval_field', type=str,
        help='Only print results when PVAL_FIELD < PVAL.')
    add('--outfile', default=None, type=str,
        help='Write enrichment results into xlsx or tsv file')
    add('--ns', default='BP,MF,CC', type=str,
        help='Limit GOEA to specified branch categories. '
             'BP=Biological Process; '
             'MF=Molecular Function; '
             'CC=Cellular Component')
    add('--id2sym', default=None, type=str,
        help='ASCII file containing one geneid and its symbol per line')

    # Grouped (sections) output
    add('--sections', default=None, type=str,
        help=('Use sections file for printing grouped GOEA results. '
              'Example SECTIONS values:\n'
              'goatools.test_data.sections.gjoneska_pfenning \n'
              'goatools/test_data/sections/gjoneska_pfenning.py \n'
              'data/gjoneska_pfenning/sections_in.txt\n'))
    add('--outfile_detail', type=str,
        help=('Write enrichment results into a text file \n'
              'containing the following information: \n'
              '1) GOEA GO terms, grouped into sections \n\n'
              '2) List of genes and ASCII art showing section membership \n'
              '3) Detailed list of each gene and GO terms w/their P-values \n'))

    # Study/population comparison and filtering
    add('--compare', dest='compare', default=False, action='store_true',
        help="the population file as a comparison group. if this "
             "flag is specified, the population is used as the study "
             "plus the `population/comparison`")
    add('--ratio', dest='ratio', type=float, default=None,
        help="only show values where the difference between study "
             "and population ratios is greater than this. useful for "
             "excluding GO categories with small differences, but "
             "containing large numbers of genes. should be a value "
             "between 1 and 2. ")
    add('--indent', dest='indent', default=False, action='store_true',
        help="indent GO terms")

    # Ontology files and calculation options
    add('--obo', default="go-basic.obo", type=str,
        help="Specifies location and name of the obo file")
    add('--no_propagate_counts', default=False, action='store_true',
        help="Do not propagate counts to parent terms")
    add('--method', default="bonferroni,sidak,holm,fdr_bh", type=str,
        help=Methods().getmsg_valid_methods())
    add('--pvalcalc', default="fisher", type=str,
        help=str(FisherFactory()))
    add('--min_overlap', default=0.7, type=float,
        help="Check that a minimum amount of study genes are in the population")
    add('--goslim', default='goslim_generic.obo', type=str,
        help="The GO slim file is used when grouping GO terms.")

    # Evidence-code selection and help
    add('--ev_inc', type=str,
        help="Include specified evidence codes and groups separated by commas")
    add('--ev_exc', type=str,
        help="Exclude specified evidence codes and groups separated by commas")
    add('--ev_help', dest='ev_help', action='store_false',
        help="Print all Evidence codes, with descriptions")
    add('--ev_help_short', dest='ev_help_short', action='store_false',
        help="Print all Evidence codes")

    # No arguments at all: show the help text and exit with a failing status
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(True)

    self._prt_evidence_codes(set(sys.argv[1:]))
    parsed = parser.parse_args()  # Namespace object from argparse
    self._check_input_files(parsed, parser)
    return parsed
"containing large numbers of genes. should be a value " "between 1 and 2. ") p.add_argument('--fdr', dest='fdr', default=False, action='store_true', help="Calculate the false discovery rate (alt. to the " "Bonferroni but slower)") p.add_argument('--indent', dest='indent', default=False, action='store_true', help="indent GO terms") p.add_argument('--obo', default="go-basic.obo", type=str, help="Specifies location and name of the obo file") p.add_argument('--no_propagate_counts', default=False, action='store_true', help="Do not propagate counts to parent terms") p.add_argument('--outfile', default=None, type=str, help="Write enrichment results into xlsx or tsv file") p.add_argument('--method', default="bonferroni,sidak,holm", type=str, help=Methods().getmsg_valid_methods()) args = p.parse_args() check_input_files(args, p) min_ratio = args.ratio if min_ratio is not None: assert 1 <= min_ratio <= 2 study_fn, pop_fn, assoc_fn = args.filenames study, pop = read_geneset(study_fn, pop_fn, compare=args.compare) print("Study: {0} vs. Population {1}".format(len(study), len(pop)), file=sys.stderr) if not args.compare: # sanity check if len(pop) < len(study): exit("\nERROR: The study file contains more elements than the population file. "
class GOEnrichmentStudy(object):
    """Run GO enrichment analyses: p-value calculation plus multiple-test corrections."""

    # Default Excel table column widths for GOEA results
    default_fld2col_widths = {
        'NS': 3,
        'GO': 12,
        'alt': 2,
        'level': 3,
        'depth': 3,
        'enrichment': 1,
        'name': 60,
        'ratio_in_study': 8,
        'ratio_in_pop': 12,
        'study_items': 15,
    }

    def __init__(self, pop, assoc, obo_dag, propagate_counts=True, alpha=.05, methods=None, **kws):
        """Store the population and settings, then index the population by GO term.

        ``kws`` may contain ``log`` (output stream, ``sys.stdout`` by default);
        every keyword is also forwarded to ``FisherFactory``.
        """
        if methods is None:
            methods = ["bonferroni", "sidak", "holm"]
        self.log = kws.get('log', sys.stdout)
        # Maps a multitest source name to the bound method implementing it
        self._run_multitest = {
            'local': self._run_multitest_local,
            'statsmodels': self._run_multitest_statsmodels,
        }
        self.pop = pop
        self.pop_n = len(pop)
        self.assoc = assoc
        self.obo_dag = obo_dag
        self.alpha = alpha
        self.methods = Methods(methods)
        self.pval_obj = FisherFactory(**kws).pval_obj
        if propagate_counts:
            sys.stderr.write("Propagating term counts to parents ..\n")
            obo_dag.update_association(assoc)
        self.go2popitems = get_terms("population", pop, assoc, obo_dag, self.log)

    def run_study(self, study, **kws):
        """Run a GO enrichment analysis on the study IDs and return the records.

        ``methods``, ``alpha`` and ``log`` in ``kws`` override the object's
        settings for this call. ``keep_if`` is an optional predicate selecting
        the records to return, e.g.:

        >>> keep_if = lambda nt: nt.p_fdr_bh < 0.05
        >>> goea_results = goeaobj.run_study(geneids_study, keep_if=keep_if)
        """
        if 'methods' in kws:
            multitest_methods = Methods(kws['methods'])
        else:
            multitest_methods = self.methods
        alpha = kws.get('alpha', self.alpha)
        log = kws.get('log', self.log)

        # Uncorrected p-values come first; nothing to correct means nothing to return
        records = self.get_pval_uncorr(study, log)
        if not records:
            return []

        if log is not None:
            summary = "\n ".join(self.get_results_msg(records, study))
            log.write(" {MSG}\n".format(MSG=summary))

        # Multiple-test corrections update the records in place
        self._run_multitest_corr(records, multitest_methods, alpha, study, log)

        # Attach the GO term to every record (needed for name and level)
        for record in records:
            record.set_goterm(self.obo_dag)

        if 'keep_if' in kws:
            wanted = kws['keep_if']
            records = [record for record in records if wanted(record)]

        # Default sort order: namespace, then enrichment, then p-value
        records.sort(key=lambda rec: (rec.NS, rec.enrichment, rec.p_uncorrected))
        return records

    def run_study_nts(self, study, **kws):
        """Run ``run_study`` and convert its records with ``get_goea_nts_all``."""
        return get_goea_nts_all(self.run_study(study, **kws))

    def get_results_msg(self, results, study):
        """Return summary lines for GOEA results (population line, then study line).

        Join the returned list with "\\n" to obtain a printable message.
        An empty ``results`` yields an empty list.
        """
        if not results:
            return []
        template = "{M:6,} GO terms are associated with {N:6,} of {NT:6,}"
        study_ids, study_go_cnt = self.get_item_cnt(results, "study_items")
        pop_ids, pop_go_cnt = self.get_item_cnt(results, "pop_items")
        study_part = template.format(
            N=len(study_ids), M=study_go_cnt, NT=len(set(study)))
        pop_part = template.format(
            N=len(pop_ids), M=pop_go_cnt, NT=self.pop_n)
        return [
            "{POP} population items".format(POP=pop_part),
            "{STU} study items".format(STU=study_part),
        ]

    def get_pval_uncorr(self, study, log=sys.stdout):
        """Calculate an uncorrected p-value for every GO term seen in study or population."""
        go2studyitems = get_terms("study", study, self.assoc, self.obo_dag, log)
        study_n = len(study)
        pop_n = self.pop_n
        goids = set(go2studyitems).union(set(self.go2popitems))
        if log is not None:
            log.write("Calculating {N:,} uncorrected p-values using {PFNC}\n".format(
                N=len(goids), PFNC=self.pval_obj.name))
        calc_pvalue = self.pval_obj.calc_pvalue
        records = []
        for goid in goids:
            in_study = go2studyitems.get(goid, set())
            in_pop = self.go2popitems.get(goid, set())
            num_study = len(in_study)
            num_pop = len(in_pop)
            records.append(GOEnrichmentRecord(
                GO=goid,
                p_uncorrected=calc_pvalue(num_study, study_n, num_pop, pop_n),
                study_items=in_study,
                pop_items=in_pop,
                ratio_in_study=(num_study, study_n),
                ratio_in_pop=(num_pop, pop_n)))
        return records

    def _run_multitest_corr(self, results, usr_methods, alpha, study, log):
        """Apply each requested multiple-test correction to the uncorrected p-values."""
        assert 0 < alpha < 1, "Test-wise alpha must fall between (0, 1)"
        ntobj = cx.namedtuple("ntobj", "results pvals alpha nt_method study")
        pvals = [rec.p_uncorrected for rec in results]
        for nt_method in usr_methods:
            ntmt = ntobj(results=results, pvals=pvals, alpha=alpha,
                         nt_method=nt_method, study=study)
            # The method's source selects the implementation to run
            run_correction = self._run_multitest[nt_method.source]
            run_correction(ntmt)
            if log is not None:
                self._log_multitest_corr(log, results, ntmt, alpha)

    def _log_multitest_corr(self, log, results, ntmt, alpha):
        """Write how many GO terms fall below alpha for one multitest correction."""
        method = ntmt.nt_method
        fieldname = self.methods.get_fieldname(method.source, method.method)
        attr_mult = "p_{M}".format(M=fieldname)
        num_sig = sum(1 for rec in results if getattr(rec, attr_mult) < alpha)
        log.write("{N:8,} GO terms found significant (< {A}=alpha) after ".format(
            N=num_sig, A=alpha))
        log.write("multitest correction: ")
        log.write("{MSRC} {METHOD}\n".format(MSRC=method.source, METHOD=method.method))

    def _run_multitest_statsmodels(self, ntmt):
        """Run a multitest correction obtained through ``self.methods`` (statsmodels)."""
        # The function is fetched on demand so statsmodels is only loaded if used
        multipletests = self.methods.get_statsmodels_multipletests()
        outcome = multipletests(ntmt.pvals, ntmt.alpha, ntmt.nt_method.method)
        # outcome: reject_lst, pvals_corrected, alphacSidak, alphacBonf
        self._update_pvalcorr(ntmt, outcome[1])

    def _run_multitest_local(self, ntmt):
        """Run a locally implemented correction: bonferroni, sidak, holm or fdr."""
        method = ntmt.nt_method.method
        corrected_pvals = None
        if method in ("bonferroni", "sidak", "holm"):
            correctors = {
                "bonferroni": Bonferroni,
                "sidak": Sidak,
                "holm": HolmBonferroni,
            }
            corrected_pvals = correctors[method](ntmt.pvals, ntmt.alpha).corrected_pvals
        elif method == "fdr":
            # get the empirical p-value distributions for FDR
            term_pop = getattr(self, 'term_pop', None)
            if term_pop is None:
                term_pop = count_terms(self.pop, self.assoc, self.obo_dag)
            p_val_distribution = calc_qval(len(ntmt.study), self.pop_n,
                                           self.pop, self.assoc,
                                           term_pop, self.obo_dag)
            corrected_pvals = FDR(p_val_distribution,
                                  ntmt.results, ntmt.alpha).corrected_pvals
        self._update_pvalcorr(ntmt, corrected_pvals)

    @staticmethod
    def _update_pvalcorr(ntmt, corrected_pvals):
        """Store each corrected p-value on its record; do nothing when given None."""
        if corrected_pvals is not None:
            for rec, pval in zip(ntmt.results, corrected_pvals):
                rec.set_corrected_pval(ntmt.nt_method, pval)

    # Methods for writing results into tables: text, tab-separated, Excel spreadsheets
    def wr_txt(self, fout_txt, goea_results, prtfmt=None, **kws):
        """Write GOEA results to a text file; nothing is written for empty results."""
        if not goea_results:
            sys.stdout.write(" 0 GOEA results. NOT WRITING {FOUT}\n".format(FOUT=fout_txt))
            return
        with open(fout_txt, 'w') as prt:
            # An optional title line precedes the results
            if 'title' in kws:
                prt.write("{TITLE}\n".format(TITLE=kws['title']))
            data_nts = self.prt_txt(prt, goea_results, prtfmt, **kws)
            log = sys.stdout if self.log is None else self.log
            log.write(" {N:>5} GOEA results for {CUR:5} study items. WROTE: {F}\n".format(
                N=len(data_nts),
                CUR=len(get_study_items(goea_results)),
                F=fout_txt))

    def prt_txt(self, prt, goea_results, prtfmt=None, **kws):
        """Print GOEA results in text format and return the printed namedtuples."""
        if prtfmt is None:
            prtfmt = ("{GO} {NS} {p_uncorrected:5.2e} {ratio_in_study:>6} {ratio_in_pop:>9} "
                      "{depth:02} {name:40} {study_items}\n")
        fmt = self.adjust_prtfmt(prtfmt)
        flds = RPT.get_fmtflds(fmt)
        data_nts = get_goea_nts_prt(goea_results, flds, **kws)
        RPT.prt_txt(prt, data_nts, fmt, flds, **kws)
        return data_nts

    def wr_xlsx(self, fout_xlsx, goea_results, **kws):
        """Write GOEA results into an xlsx file."""
        # kws: prt_if indent itemid2name(study_items)
        flds = kws.get('prt_flds', self.get_prtflds_default(goea_results))
        rows = get_goea_nts_prt(goea_results, flds, **kws)
        # Fall back to the class-level column widths (8 for unlisted fields)
        if 'fld2col_widths' not in kws:
            widths = self.default_fld2col_widths
            kws['fld2col_widths'] = {fld: widths.get(fld, 8) for fld in flds}
        RPT.wr_xlsx(fout_xlsx, rows, **kws)

    def wr_tsv(self, fout_tsv, goea_results, **kws):
        """Write GOEA results as tab-separated data to a file."""
        flds = kws.get('prt_flds', self.get_prtflds_default(goea_results))
        rows = get_goea_nts_prt(goea_results, flds, **kws)
        RPT.wr_tsv(fout_tsv, rows, **kws)

    def prt_tsv(self, prt, goea_results, **kws):
        """Write GOEA results as tab-separated data to an open stream."""
        flds = kws.get('prt_flds', self.get_prtflds_default(goea_results))
        rows = get_goea_nts_prt(goea_results, flds, **kws)
        RPT.prt_tsv(prt, rows, flds, **kws)

    @staticmethod
    def adjust_prtfmt(prtfmt):
        """Replace hyphenated method field names in a format string with legal ones."""
        return (prtfmt
                .replace("{p_holm-sidak", "{p_holm_sidak")
                .replace("{p_simes-hochberg", "{p_simes_hochberg"))

    @staticmethod
    def get_ns2nts(results, fldnames=None, **kws):
        """Group the namedtuples of GOEA results by their ``NS`` field."""
        grouped = cx.defaultdict(list)
        for ntgoea in get_goea_nts_all(results, fldnames, **kws):
            grouped[ntgoea.NS].append(ntgoea)
        return grouped

    @staticmethod
    def get_item_cnt(results, attrname="study_items"):
        """Return all items (e.g., geneids) found under ``attrname`` and the
        number of records that contributed a non-empty set."""
        all_items = set()
        num_gos = 0
        for rec in results:
            if not hasattr(rec, attrname):
                continue
            cur = getattr(rec, attrname)
            # Only count a GO term if there are items in its set.
            if len(cur) == 0:
                continue
            all_items |= cur
            num_gos += 1
        return all_items, num_gos

    @staticmethod
    def get_prtflds_default(results):
        """Return the default print fields taken from the first result.

        Researchers can control which fields they want to print in the GOEA
        results or they can use the default fields. Empty results give [].
        """
        return results[0].get_prtflds_default() if results else []

    @staticmethod
    def print_summary(results, min_ratio=None, indent=False, pval=0.05):
        """Print a header followed by the results that pass the pval and ratio checks."""
        import goatools

        # Header contains provenance and parameters
        today = datetime.date.today()
        print("# Generated by GOATOOLS v{0} ({1})".format(goatools.__version__, today))
        print("# min_ratio={0} pval={1}".format(min_ratio, pval))

        # field names for output
        if results:
            print("\t".join(GOEnrichmentStudy.get_prtflds_default(results)))

        for rec in results:
            # calculate some additional statistics
            # (over_under, is_ratio_different)
            rec.update_remaining_fldsdefprt(min_ratio=min_ratio)
            above_cutoff = pval is not None and rec.p_uncorrected >= pval
            if not above_cutoff and rec.is_ratio_different:
                print(rec.__str__(indent=indent))

    def wr_py_goea_results(self, fout_py, goea_results, **kws):
        """Save GOEA results into a Python file containing a list of namedtuples.

        ``kws`` may contain ``var_name``, ``docstring`` and ``sortby``.
        Nothing is written when ``goea_results`` is empty.
        """
        var_name = kws.get("var_name", "goea_results")
        docstring = kws.get("docstring", "")
        sortby = kws.get("sortby", None)
        if not goea_results:
            return
        from goatools.nt_utils import wr_py_nts
        first = goea_results[0]
        # If list has GOEnrichmentRecords or verbose namedtuples, exclude some
        # fields: the information is redundant or verbose in a Python file.
        if hasattr(first, "_fldsdefprt") or hasattr(first, 'goterm'):
            nts_goea = get_goea_nts_prt(goea_results, **kws)
        else:
            nts_goea = goea_results
        version_line = "# {VER}\n\n".format(VER=self.obo_dag.version)
        docstring = "\n".join([docstring, version_line])
        assert hasattr(nts_goea[0], '_fields')
        if sortby is None:
            sortby = lambda nt: [nt.namespace, nt.enrichment, nt.p_uncorrected,
                                 nt.depth, nt.GO]
        wr_py_nts(fout_py, sorted(nts_goea, key=sortby), docstring, var_name)
class GOEnrichmentStudy(object):
    """Runs Fisher's exact test, as well as multiple corrections."""

    # Default Excel table column widths for GOEA results
    default_fld2col_widths = {
        'NS'        : 3,
        'GO'        : 12,
        'level'     : 3,
        'enrichment': 1,
        'name'      : 60,
        'ratio_in_study': 8,
        'ratio_in_pop'  : 12,
        'study_items'   : 15,
    }

    def __init__(self, pop, assoc, obo_dag, propagate_counts=True, alpha=.05, methods=None, **kws):
        """Store the population and settings, then index the population by GO term.

        Args:
            pop: Population item IDs; stored as given, its length kept in ``pop_n``.
            assoc: Association data handed to ``get_terms``.
            obo_dag: GO DAG object; its ``update_association`` is used when propagating.
            propagate_counts: If true, ``obo_dag.update_association(assoc)`` is
                called before the population terms are collected.
            alpha: Default significance level stored on the object.
            methods: Multiple-test correction method names; defaults to
                bonferroni, sidak and holm.
            **kws: Optional ``log`` stream (defaults to ``sys.stdout``); all
                keywords are also forwarded to ``FisherFactory``.
        """
        self.log = kws['log'] if 'log' in kws else sys.stdout
        # Dispatch table: multitest source name -> method that runs the correction
        self._run_multitest = {
            'local': self._run_multitest_local,
            'statsmodels': self._run_multitest_statsmodels}
        self.pop = pop
        self.pop_n = len(pop)
        self.assoc = assoc
        self.obo_dag = obo_dag
        self.alpha = alpha
        if methods is None:
            methods = ["bonferroni", "sidak", "holm"]
        self.methods = Methods(methods)
        self.pval_obj = FisherFactory(**kws).pval_obj
        if propagate_counts:
            sys.stderr.write("Propagating term counts to parents ..\n")
            obo_dag.update_association(assoc)
        self.go2popitems = get_terms("population", pop, assoc, obo_dag, self.log)

    def run_study(self, study, **kws):
        """Run Gene Ontology Enrichment Study (GOEA) on study ids.

        ``methods``, ``alpha`` and ``log`` in ``kws`` override the object's
        settings for this call. ``keep_if`` can be used to keep only
        significant GO terms. Example:

        >>> keep_if = lambda nt: nt.p_fdr_bh < 0.05 # if results are significant
        >>> goea_results = goeaobj.run_study(geneids_study, keep_if=keep_if)

        Returns a list of GOEnrichmentRecord objects sorted by NS, then
        uncorrected p-value; an empty list when no p-values were calculated.
        """
        # Key-word arguments:
        methods = Methods(kws['methods']) if 'methods' in kws else self.methods
        alpha = kws['alpha'] if 'alpha' in kws else self.alpha
        log = kws['log'] if 'log' in kws else self.log
        # NOTE(review): `log` only controls the summary message below; the two
        # helpers called next always write their progress to sys.stdout.
        # Calculate uncorrected pvalues
        results = self._get_pval_uncorr(study)
        if not results:
            return []

        # Do multipletest corrections on uncorrected pvalues and update results
        self._run_multitest_corr(results, methods, alpha, study)

        for rec in results:
            # get go term for name and level
            rec.set_goterm(self.obo_dag)

        if 'keep_if' in kws:
            keep_if = kws['keep_if']
            results = [r for r in results if keep_if(r)]

        # Default sort order: First, sort by BP, MF, CC. Second, sort by pval
        results.sort(key=lambda r: [r.NS, r.p_uncorrected])

        if log is not None:
            log.write(" {MSG}\n".format(MSG="\n ".join(self.get_results_msg(results))))

        return results # list of GOEnrichmentRecord objects

    def run_study_nts(self, study, **kws):
        """Run GOEA on study ids. Return results as a list of namedtuples."""
        goea_results = self.run_study(study, **kws)
        return get_goea_nts_all(goea_results)

    def get_results_msg(self, results):
        """Return summary lines for GOEA results (study line, then population line).

        To convert the returned list to a string: "\\n".join(msg).
        An empty ``results`` yields an empty list.
        """
        msg = []
        if results:
            stu_items, num_gos_stu = self.get_item_cnt(results, "study_items")
            pop_items, num_gos_pop = self.get_item_cnt(results, "pop_items")
            # NOTE(review): the study total is read from the first record's
            # `study_count`; confirm that attribute holds the study size and
            # not the per-term count.
            msg.append("{M:,} GO terms are associated with {N:,} of {NT:,} study items".format(
                N=len(stu_items), NT=results[0].study_count, M=num_gos_stu))
            msg.append("{M:,} GO terms are associated with {N:,} of {NT:,} population items".format(
                N=len(pop_items), NT=self.pop_n, M=num_gos_pop))
        return msg

    def _get_pval_uncorr(self, study, log=sys.stdout):
        """Calculate the uncorrected pvalues for study items.

        Returns one GOEnrichmentRecord per GO term found in the study or the
        population. Progress is written to ``log``.
        """
        log.write("Calculating uncorrected p-values using {PFNC}\n".format(PFNC=self.pval_obj.name))
        results = []
        go2studyitems = get_terms("study", study, self.assoc, self.obo_dag, log)
        pop_n, study_n = self.pop_n, len(study)
        # Iterating a dict yields its keys, so no .keys() call is needed
        allterms = set(go2studyitems).union(set(self.go2popitems))
        calc_pvalue = self.pval_obj.calc_pvalue

        for term in allterms:
            study_items = go2studyitems.get(term, set())
            study_count = len(study_items)
            pop_items = self.go2popitems.get(term, set())
            pop_count = len(pop_items)

            one_record = GOEnrichmentRecord(
                GO=term,
                p_uncorrected=calc_pvalue(study_count, study_n, pop_count, pop_n),
                study_items=study_items,
                pop_items=pop_items,
                ratio_in_study=(study_count, study_n),
                ratio_in_pop=(pop_count, pop_n))

            results.append(one_record)

        return results

    def _run_multitest_corr(self, results, usr_methods, alpha, study):
        """Do multiple-test corrections on uncorrected pvalues.

        Each method in ``usr_methods`` is dispatched by its ``source`` to the
        matching implementation, which updates ``results`` in place.
        """
        assert 0 < alpha < 1, "Test-wise alpha must fall between (0, 1)"
        pvals = [r.p_uncorrected for r in results]
        NtMt = cx.namedtuple("NtMt", "results pvals alpha nt_method study")

        for nt_method in usr_methods:
            ntmt = NtMt(results, pvals, alpha, nt_method, study)
            sys.stdout.write("Running multitest correction: {MSRC} {METHOD}\n".format(
                MSRC=ntmt.nt_method.source, METHOD=ntmt.nt_method.method))
            self._run_multitest[nt_method.source](ntmt)

    def _run_multitest_statsmodels(self, ntmt):
        """Use multitest methods that have been implemented in statsmodels."""
        # Only load statsmodels if it is used
        multipletests = self.methods.get_statsmodels_multipletests()
        results = multipletests(ntmt.pvals, ntmt.alpha, ntmt.nt_method.method)
        # results: reject_lst, pvals_corrected, alphacSidak, alphacBonf
        # Only the corrected p-values are needed here.
        self._update_pvalcorr(ntmt, results[1])

    def _run_multitest_local(self, ntmt):
        """Use multitest methods that have been implemented locally.

        Handles bonferroni, sidak, holm and fdr; any other method name leaves
        the records untouched.
        """
        corrected_pvals = None
        method = ntmt.nt_method.method
        if method == "bonferroni":
            corrected_pvals = Bonferroni(ntmt.pvals, ntmt.alpha).corrected_pvals
        elif method == "sidak":
            corrected_pvals = Sidak(ntmt.pvals, ntmt.alpha).corrected_pvals
        elif method == "holm":
            corrected_pvals = HolmBonferroni(ntmt.pvals, ntmt.alpha).corrected_pvals
        elif method == "fdr":
            # get the empirical p-value distributions for FDR
            term_pop = getattr(self, 'term_pop', None)
            if term_pop is None:
                term_pop = count_terms(self.pop, self.assoc, self.obo_dag)
            p_val_distribution = calc_qval(len(ntmt.study),
                                           self.pop_n,
                                           self.pop, self.assoc,
                                           term_pop, self.obo_dag)
            corrected_pvals = FDR(p_val_distribution,
                                  ntmt.results, ntmt.alpha).corrected_pvals

        self._update_pvalcorr(ntmt, corrected_pvals)

    @staticmethod
    def _update_pvalcorr(ntmt, corrected_pvals):
        """Add data members to store multiple test corrections.

        Does nothing when ``corrected_pvals`` is None.
        """
        if corrected_pvals is None:
            return
        for rec, val in zip(ntmt.results, corrected_pvals):
            rec.set_corrected_pval(ntmt.nt_method, val)

    # Methods for writing results into tables: text, tab-separated, Excel spreadsheets
    def wr_txt(self, fout_txt, goea_results, prtfmt=None, **kws):
        """Print GOEA results to text file and report the number of items written.

        The report goes to ``self.log``, or to ``sys.stdout`` when the object
        was created with ``log=None``.
        """
        with open(fout_txt, 'w') as prt:
            data_nts = self.prt_txt(prt, goea_results, prtfmt, **kws)
            # self.log may legitimately be None (run_study checks for it), so
            # fall back to stdout instead of raising AttributeError.
            log = self.log if self.log is not None else sys.stdout
            log.write(" {N:>5} items WROTE: {F}\n".format(
                N=len(data_nts), F=fout_txt))

    def prt_txt(self, prt, goea_results, prtfmt=None, **kws):
        """Print GOEA results in text format and return the printed namedtuples."""
        if prtfmt is None:
            prtfmt = "{GO} {NS} {p_uncorrected:5.2e} {study_count:>5} {name}\n"
        prtfmt = self.adjust_prtfmt(prtfmt)
        prt_flds = RPT.get_fmtflds(prtfmt)
        data_nts = get_goea_nts_prt(goea_results, prt_flds, **kws)
        RPT.prt_txt(prt, data_nts, prtfmt, prt_flds, **kws)
        return data_nts

    def wr_xlsx(self, fout_xlsx, goea_results, **kws):
        """Write a xlsx file."""
        # kws: prt_if indent
        prt_flds = kws.get('prt_flds', self.get_prtflds_default(goea_results))
        xlsx_data = get_goea_nts_prt(goea_results, prt_flds, **kws)
        # Fall back to the class-level column widths (8 for unlisted fields)
        if 'fld2col_widths' not in kws:
            kws['fld2col_widths'] = {f:self.default_fld2col_widths.get(f, 8) for f in prt_flds}
        RPT.wr_xlsx(fout_xlsx, xlsx_data, **kws)

    def wr_tsv(self, fout_tsv, goea_results, **kws):
        """Write tab-separated table data to file"""
        prt_flds = kws.get('prt_flds', self.get_prtflds_default(goea_results))
        tsv_data = get_goea_nts_prt(goea_results, prt_flds, **kws)
        RPT.wr_tsv(fout_tsv, tsv_data, **kws)

    def prt_tsv(self, prt, goea_results, **kws):
        """Write tab-separated table data"""
        prt_flds = kws.get('prt_flds', self.get_prtflds_default(goea_results))
        tsv_data = get_goea_nts_prt(goea_results, prt_flds, **kws)
        RPT.prt_tsv(prt, tsv_data, prt_flds, **kws)

    @staticmethod
    def adjust_prtfmt(prtfmt):
        """Adjust format_strings for legal values (hyphens become underscores)."""
        prtfmt = prtfmt.replace("{p_holm-sidak", "{p_holm_sidak")
        prtfmt = prtfmt.replace("{p_simes-hochberg", "{p_simes_hochberg")
        return prtfmt

    @staticmethod
    def get_NS2nts(results, fldnames=None, **kws):
        """Get namedtuples of GOEA results, split into BP, MF, CC."""
        NS2nts = cx.defaultdict(list)
        nts = get_goea_nts_all(results, fldnames, **kws)
        for nt in nts:
            NS2nts[nt.NS].append(nt)
        return NS2nts

    @staticmethod
    def get_item_cnt(results, attrname="study_items"):
        """Get all study or population items (e.g., geneids).

        Returns the union of the item sets and the number of records whose
        set was non-empty.
        """
        items = set()
        go_cnt = 0
        for rec in results:
            if hasattr(rec, attrname):
                items_cur = getattr(rec, attrname)
                # Only count GO term if there are items in the set.
                if len(items_cur) != 0:
                    items |= items_cur
                    go_cnt += 1
        return items, go_cnt

    @staticmethod
    def get_prtflds_default(results):
        """Get default fields names. Used in printing GOEA results.

           Researchers can control which fields they want to print in the GOEA results
           or they can use the default fields.
        """
        if results:
            return results[0].get_prtflds_default()
        return []

    @staticmethod
    def print_summary(results, min_ratio=None, indent=False, pval=0.05):
        """Print a header followed by the results that pass the pval and ratio checks."""
        from .version import __version__ as version

        # Header contains provenance and parameters
        print("# Generated by GOATOOLS v{0} ({1})".format(version, datetime.date.today()))
        print("# min_ratio={0} pval={1}".format(min_ratio, pval))

        # field names for output
        if results:
            print("\t".join(GOEnrichmentStudy.get_prtflds_default(results)))

        for rec in results:
            # calculate some additional statistics
            # (over_under, is_ratio_different)
            rec.update_remaining_fldsdefprt(min_ratio=min_ratio)

            if pval is not None and rec.p_uncorrected >= pval:
                continue

            if rec.is_ratio_different:
                print(rec.__str__(indent=indent))

    def wr_py_goea_results(self, fout_py, goea_results, **kws):
        """Save GOEA results into Python package containing list of namedtuples.

        ``kws`` may contain ``var_name``, ``docstring`` and ``sortby``.
        Nothing is written when ``goea_results`` is empty.
        """
        var_name = kws.get("var_name", "goea_results")
        docstring = kws.get("docstring", "")
        sortby = kws.get("sortby", None)
        if goea_results:
            from goatools.nt_utils import wr_py_nts
            nts_goea = goea_results
            # If list has GOEnrichmentRecords or verbose namedtuples, exclude some fields.
            if hasattr(goea_results[0], "_fldsdefprt") or hasattr(goea_results[0], 'goterm'):
                # Exclude some attributes from the namedtuple when saving results
                # to a Python file because the information is redundant or verbose.
                nts_goea = get_goea_nts_prt(goea_results)
            docstring = "\n".join([docstring, "# {OBO_VER}\n\n".format(OBO_VER=self.obo_dag.version)])
            assert hasattr(nts_goea[0], '_fields')
            if sortby is None:
                sortby = lambda nt: getattr(nt, 'p_uncorrected')
            nts_goea = sorted(nts_goea, key=sortby)
            wr_py_nts(fout_py, nts_goea, docstring, var_name)
class GOEnrichmentStudy(object):
    """Runs Fisher's exact test, as well as multiple corrections.

    The population, the associations and the GO DAG are fixed when the object
    is created; ``run_study`` may then be called with different study sets.
    """

    objprtres = GoeaPrintFunctions()

    def __init__(self, pop, assoc, obo_dag, propagate_counts=True, alpha=.05, methods=None, **kws):
        """Store the population and pre-compute the GO-to-population-items map.

        Args:
            pop: collection of population item ids.
            assoc: associations handed to ``get_terms`` and, when counts are
                propagated, to ``obo_dag.update_association``.
            obo_dag: GO DAG object.
            propagate_counts: when true, ``obo_dag.update_association(assoc)``
                is called before the population terms are collected.
            alpha: default test-wise alpha used by ``run_study``.
            methods: names of multiple-test correction methods; defaults to
                bonferroni, sidak and holm.
            **kws: ``name`` (label used in messages), ``log`` (output stream,
                may be None to suppress logging). All keywords are also
                forwarded to ``FisherFactory``.
        """
        self.name = kws.get('name', 'GOEA')
        print('\nLoad {OBJNAME} Gene Ontology Analysis ...'.format(OBJNAME=self.name))
        self.log = kws.get('log', sys.stdout)
        # Dispatch table: correction-method source -> implementation.
        self._run_multitest = {
            'local': self._run_multitest_local,
            'statsmodels': self._run_multitest_statsmodels}
        self.pop = set(pop)
        # Count unique ids: every other count used for the p-values
        # (study_n, pop_count, study_count) is computed from sets, so a
        # population given with duplicate ids must not inflate pop_n.
        self.pop_n = len(self.pop)
        self.assoc = assoc
        self.obo_dag = obo_dag
        self.alpha = alpha
        if methods is None:
            methods = ["bonferroni", "sidak", "holm"]
        self.methods = Methods(methods)
        self.pval_obj = FisherFactory(**kws).pval_obj

        if propagate_counts:
            sys.stderr.write("Propagating term counts to parents ..\n")
            obo_dag.update_association(assoc)
        self.go2popitems = get_terms("population", pop, assoc, obo_dag, self.log)

    # def get_objresults(self, name, study, **kws):
    #     """Run GOEA, return results in an object"""
    #     results = self.run_study(study, **kws)
    #     study_in_pop = self.pop.intersection(study)
    #     return GoeaResults(study_in_pop, results, self, name)

    def run_study(self, study, **kws):
        """Run Gene Ontology Enrichment Study (GOEA) on study ids.

        Args:
            study: collection of study item ids.
            **kws: optional overrides ``methods``, ``alpha`` and ``log``;
                ``name`` labels the study in the progress message;
                ``keep_if`` is a predicate used to filter the results.

        Returns:
            A sorted list of GOEnrichmentRecord objects. The list is empty
            when the study is empty or no uncorrected p-values were computed.
        """
        study_name = kws.get('name', 'current')
        print('\nRun {OBJNAME} Gene Ontology Analysis: {STU} study set of {N} IDs ...'.format(
            OBJNAME=self.name, N=len(study), STU=study_name))
        if not study:
            return []
        # Key-word arguments:
        methods = Methods(kws['methods']) if 'methods' in kws else self.methods
        alpha = kws.get('alpha', self.alpha)
        log = kws.get('log', self.log)
        # Calculate uncorrected pvalues
        results = self.get_pval_uncorr(study, log)
        if not results:
            return []

        if log is not None:
            log.write(" {MSG}\n".format(MSG="\n ".join(self.get_results_msg(results, study))))

        # Do multipletest corrections on uncorrected pvalues and update results
        self._run_multitest_corr(results, methods, alpha, study, log)

        for rec in results:
            # get go term for name and level
            rec.set_goterm(self.obo_dag)

        # 'keep_if' can be used to keep only significant GO terms. Example:
        # >>> keep_if = lambda nt: nt.p_fdr_bh < 0.05 # if results are significant
        # >>> goea_results = goeaobj.run_study(geneids_study, keep_if=keep_if)
        if 'keep_if' in kws:
            keep_if = kws['keep_if']
            results = [r for r in results if keep_if(r)]

        # Default sort order: enrichment flag, then namespace, then p-value
        results.sort(key=lambda r: [r.enrichment, r.NS, r.p_uncorrected])
        return results  # list of GOEnrichmentRecord objects

    def run_study_nts(self, study, **kws):
        """Run GOEA on study ids. Return results as a list of namedtuples."""
        goea_results = self.run_study(study, **kws)
        return MgrNtGOEAs(goea_results).get_goea_nts_all()

    def get_results_msg(self, results, study):
        """Return summary for GOEA results as a list of message lines."""
        # To convert msg list to string: "\n".join(msg)
        msg = []
        if results:
            fmt = "{M:6,} GO terms are associated with {N:6,} of {NT:6,}"
            stu_items, num_gos_stu = self.get_item_cnt(results, "study_items")
            pop_items, num_gos_pop = self.get_item_cnt(results, "pop_items")
            stu_txt = fmt.format(N=len(stu_items), M=num_gos_stu, NT=len(set(study)))
            pop_txt = fmt.format(N=len(pop_items), M=num_gos_pop, NT=self.pop_n)
            msg.append("{POP} population items".format(POP=pop_txt))
            msg.append("{STU} study items".format(STU=stu_txt))
        return msg

    def get_pval_uncorr(self, study, log=sys.stdout):
        """Calculate the uncorrected pvalues for study items.

        Only study items that are also in the population are used. Returns a
        list with one GOEnrichmentRecord per GO term seen in the study or the
        population, or an empty list when no study item is in the population.
        """
        results = []
        study_in_pop = self.pop.intersection(study)
        # " 99%    378 of    382 study items found in population"
        go2studyitems = get_terms("study", study_in_pop, self.assoc, self.obo_dag, log)
        pop_n, study_n = self.pop_n, len(study_in_pop)
        allterms = set(go2studyitems).union(set(self.go2popitems))
        if log is not None:
            # Some study genes may not have been found in the population.
            # Report from orig
            study_n_orig = len(study)
            perc = 100.0*study_n/study_n_orig if study_n_orig != 0 else 0.0
            log.write("{R:3.0f}% {N:>6,} of {M:>6,} study items found in population({P})\n".format(
                N=study_n, M=study_n_orig, P=pop_n, R=perc))
            if study_n:
                log.write("Calculating {N:,} uncorrected p-values using {PFNC}\n".format(
                    N=len(allterms), PFNC=self.pval_obj.name))
        # If no study genes were found in the population, return empty GOEA results
        if not study_n:
            return []
        calc_pvalue = self.pval_obj.calc_pvalue

        for goid in allterms:
            study_items = go2studyitems.get(goid, set())
            study_count = len(study_items)
            pop_items = self.go2popitems.get(goid, set())
            pop_count = len(pop_items)

            one_record = GOEnrichmentRecord(
                goid,
                p_uncorrected=calc_pvalue(study_count, study_n, pop_count, pop_n),
                study_items=study_items,
                pop_items=pop_items,
                ratio_in_study=(study_count, study_n),
                ratio_in_pop=(pop_count, pop_n))

            results.append(one_record)

        return results

    def _run_multitest_corr(self, results, usrmethod_flds, alpha, study, log):
        """Do multiple-test corrections on uncorrected pvalues.

        Each method in ``usrmethod_flds`` is dispatched on its ``source``
        through ``self._run_multitest``; a summary is logged per method
        unless ``log`` is None.
        """
        assert 0 < alpha < 1, "Test-wise alpha must fall between (0, 1)"
        pvals = [r.p_uncorrected for r in results]
        ntobj = cx.namedtuple("ntobj", "results pvals alpha nt_method study")
        for nt_method in usrmethod_flds:
            ntmt = ntobj(results, pvals, alpha, nt_method, study)
            self._run_multitest[nt_method.source](ntmt)
            if log is not None:
                self._log_multitest_corr(log, results, ntmt, alpha)

    def _log_multitest_corr(self, log, results, ntmt, alpha):
        """Print information regarding multitest correction results."""
        ntm = ntmt.nt_method
        attr_mult = "p_{M}".format(M=self.methods.get_fieldname(ntm.source, ntm.method))
        # Records whose corrected p-value is below alpha
        eps = [r for r in results if getattr(r, attr_mult) < alpha]
        sig_cnt = len(eps)
        ctr = cx.Counter([r.enrichment for r in eps])
        log.write(' METHOD {M}:\n'.format(M=ntm.method))
        log.write("{N:8,} GO terms found significant (< {A}=alpha) ".format(
            N=sig_cnt, A=alpha))
        log.write('({E:3} enriched + {P:3} purified): '.format(E=ctr['e'], P=ctr['p']))
        log.write("{MSRC} {METHOD}\n".format(MSRC=ntm.source, METHOD=ntm.method))
        log.write("{N:8,} study items associated with significant GO IDs (enriched)\n".format(
            N=len(self.get_study_items(r for r in eps if r.enrichment == 'e'))))
        log.write("{N:8,} study items associated with significant GO IDs (purified)\n".format(
            N=len(self.get_study_items(r for r in eps if r.enrichment == 'p'))))

    @staticmethod
    def get_study_items(results):
        """Return the set of study items associated with the given results."""
        study_items = set()
        for obj in results:
            study_items.update(obj.study_items)
        return study_items

    def _run_multitest_statsmodels(self, ntmt):
        """Use multitest mthods that have been implemented in statsmodels."""
        # Only load statsmodels if it is used
        multipletests = self.methods.get_statsmodels_multipletests()
        results = multipletests(ntmt.pvals, ntmt.alpha, ntmt.nt_method.method)
        pvals_corrected = results[1]  # reject_lst, pvals_corrected, alphacSidak, alphacBonf
        self._update_pvalcorr(ntmt, pvals_corrected)

    def _run_multitest_local(self, ntmt):
        """Use multitest mthods that have been implemented locally.

        A method name that is not handled here leaves the records unchanged.
        """
        corrected_pvals = None
        method = ntmt.nt_method.method
        if method == "bonferroni":
            corrected_pvals = Bonferroni(ntmt.pvals, ntmt.alpha).corrected_pvals
        elif method == "sidak":
            corrected_pvals = Sidak(ntmt.pvals, ntmt.alpha).corrected_pvals
        elif method == "holm":
            corrected_pvals = HolmBonferroni(ntmt.pvals, ntmt.alpha).corrected_pvals
        elif method == "fdr":
            # get the empirical p-value distributions for FDR
            term_pop = getattr(self, 'term_pop', None)
            if term_pop is None:
                term_pop = count_terms(self.pop, self.assoc, self.obo_dag)
            p_val_distribution = calc_qval(len(ntmt.study),
                                           self.pop_n,
                                           self.pop, self.assoc,
                                           term_pop, self.obo_dag)
            corrected_pvals = FDR(p_val_distribution,
                                  ntmt.results, ntmt.alpha).corrected_pvals

        self._update_pvalcorr(ntmt, corrected_pvals)

    @staticmethod
    def _update_pvalcorr(ntmt, corrected_pvals):
        """Add data members to store multiple test corrections."""
        if corrected_pvals is None:
            return
        for rec, val in zip(ntmt.results, corrected_pvals):
            rec.set_corrected_pval(ntmt.nt_method, val)

    # Methods for writing results into tables: text, tab-separated, Excel spreadsheets
    def wr_txt(self, fout_txt, goea_results, prtfmt=None, **kws):
        """Print GOEA results to text file. Nothing is written for empty results."""
        if not goea_results:
            sys.stdout.write(" 0 GOEA results. NOT WRITING {FOUT}\n".format(FOUT=fout_txt))
            return
        with open(fout_txt, 'w') as prt:
            if 'title' in kws:
                prt.write("{TITLE}\n".format(TITLE=kws['title']))
            data_nts = self.prt_txt(prt, goea_results, prtfmt, **kws)
            # Fall back to stdout when logging was disabled on the object
            log = self.log if self.log is not None else sys.stdout
            log.write(" {N:>5} GOEA results for {CUR:5} study items. WROTE: {F}\n".format(
                N=len(data_nts),
                CUR=len(MgrNtGOEAs(goea_results).get_study_items()),
                F=fout_txt))

    @staticmethod
    def prt_txt(prt, goea_results, prtfmt=None, **kws):
        """Print GOEA results in text format. Return the printed namedtuples."""
        objprt = PrtFmt()
        if prtfmt is None:
            flds = ['GO', 'NS', 'p_uncorrected',
                    'ratio_in_study', 'ratio_in_pop', 'depth', 'name', 'study_items']
            prtfmt = objprt.get_prtfmt_str(flds)
        prtfmt = objprt.adjust_prtfmt(prtfmt)
        prt_flds = RPT.get_fmtflds(prtfmt)
        data_nts = MgrNtGOEAs(goea_results).get_goea_nts_prt(prt_flds, **kws)
        RPT.prt_txt(prt, data_nts, prtfmt, prt_flds, **kws)
        return data_nts

    def wr_xlsx(self, fout_xlsx, goea_results, **kws):
        """Write a xlsx file."""
        # kws: prt_if indent itemid2name(study_items)
        objprt = PrtFmt()
        prt_flds = kws.get('prt_flds', self.objprtres.get_prtflds_default(goea_results))
        xlsx_data = MgrNtGOEAs(goea_results).get_goea_nts_prt(prt_flds, **kws)
        if 'fld2col_widths' not in kws:
            kws['fld2col_widths'] = {f: objprt.default_fld2col_widths.get(f, 8) for f in prt_flds}
        RPT.wr_xlsx(fout_xlsx, xlsx_data, **kws)

    def wr_tsv(self, fout_tsv, goea_results, **kws):
        """Write tab-separated table data to file"""
        prt_flds = kws.get('prt_flds', self.objprtres.get_prtflds_default(goea_results))
        tsv_data = MgrNtGOEAs(goea_results).get_goea_nts_prt(prt_flds, **kws)
        RPT.wr_tsv(fout_tsv, tsv_data, **kws)

    def prt_tsv(self, prt, goea_results, **kws):
        """Write tab-separated table data"""
        prt_flds = kws.get('prt_flds', self.objprtres.get_prtflds_default(goea_results))
        tsv_data = MgrNtGOEAs(goea_results).get_goea_nts_prt(prt_flds, **kws)
        RPT.prt_tsv(prt, tsv_data, **kws)

    @staticmethod
    def get_ns2nts(results, fldnames=None, **kws):
        """Get namedtuples of GOEA results, split into BP, MF, CC."""
        ns2nts = cx.defaultdict(list)
        nts = MgrNtGOEAs(results).get_goea_nts_all(fldnames, **kws)
        for ntgoea in nts:
            ns2nts[ntgoea.NS].append(ntgoea)
        return ns2nts

    @staticmethod
    def get_item_cnt(results, attrname="study_items"):
        """Get all study or population items (e.g., geneids).

        Returns a tuple ``(items, go_cnt)``: the union of the ``attrname``
        sets over all records, and the number of records whose set is
        non-empty.
        """
        items = set()
        go_cnt = 0
        for rec in results:
            if hasattr(rec, attrname):
                items_cur = getattr(rec, attrname)
                # Only count GO term if there are items in the set.
                if items_cur:
                    items |= items_cur
                    go_cnt += 1
        return items, go_cnt

    def wr_py_goea_results(self, fout_py, goea_results, **kws):
        """Save GOEA results into Python package containing list of namedtuples."""
        var_name = kws.get("var_name", "goea_results")
        docstring = kws.get("docstring", "")
        sortby = kws.get("sortby", None)
        if goea_results:
            from goatools.nt_utils import wr_py_nts
            nts_goea = goea_results
            # If list has GOEnrichmentRecords or verbose namedtuples, exclude some fields.
            if hasattr(goea_results[0], "_fldsdefprt") or hasattr(goea_results[0], 'goterm'):
                # Exclude some attributes from the namedtuple when saving results
                # to a Python file because the information is redundant or verbose.
                nts_goea = MgrNtGOEAs(goea_results).get_goea_nts_prt(**kws)
            docstring = "\n".join([docstring, "# {VER}\n\n".format(VER=self.obo_dag.version)])
            assert hasattr(nts_goea[0], '_fields')
            if sortby is None:
                sortby = MgrNtGOEAs.dflt_sortby_objgoea
            nts_goea = sorted(nts_goea, key=sortby)
            wr_py_nts(fout_py, nts_goea, docstring, var_name)
def _init_args(self):
    """Build the enrichment argument parser, parse the command line, and return the result.

    When the program is started without any command-line argument, the help
    text is printed and the process exits. The parsed Namespace is passed to
    ``self._check_input_files`` before it is returned.
    """
    parser = argparse.ArgumentParser(
        __doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Positional input files
    parser.add_argument(
        'filenames',
        type=str,
        nargs=3,
        help='data/study data/population data/association')

    # Significance thresholds
    parser.add_argument(
        '--alpha',
        default=0.05,
        type=float,
        help='Test-wise alpha for multiple testing')
    parser.add_argument(
        '--pval',
        default=.05,
        type=float,
        help='Only print results with uncorrected p-value < PVAL.')
    parser.add_argument(
        '--pval_field',
        type=str,
        help='Only print results when PVAL_FIELD < PVAL.')

    # Output destinations and grouping
    parser.add_argument(
        '--outfile',
        default=None,
        type=str,
        help='Write enrichment results into xlsx or tsv file')
    parser.add_argument(
        '--sections',
        default=None,
        type=str,
        help=('Use sections file for printing grouped GOEA results. '
              'Example SECTIONS values:\n'
              'goatools.test_data.sections.gjoneska_pfenning \n'
              'goatools/test_data/sections/gjoneska_pfenning.py \n'
              'data/gjoneska_pfenning/sections_in.txt\n'))
    parser.add_argument(
        '--outfile_detail',
        type=str,
        help=('Write enrichment results into a text file \n'
              'containing the following information: \n'
              '1) GOEA GO terms, grouped into sections \n\n'
              '2) List of genes and ASCII art showing section membership \n'
              '3) Detailed list of each gene and GO terms w/their P-values \n'))

    # Analysis behaviour
    parser.add_argument(
        '--compare',
        dest='compare',
        default=False,
        action='store_true',
        help="the population file as a comparison group. if this "
             "flag is specified, the population is used as the study "
             "plus the `population/comparison`")
    parser.add_argument(
        '--ratio',
        dest='ratio',
        type=float,
        default=None,
        help="only show values where the difference between study "
             "and population ratios is greater than this. useful for "
             "excluding GO categories with small differences, but "
             "containing large numbers of genes. should be a value "
             "between 1 and 2. ")
    parser.add_argument(
        '--indent',
        dest='indent',
        default=False,
        action='store_true',
        help="indent GO terms")
    parser.add_argument(
        '--obo',
        default="go-basic.obo",
        type=str,
        help="Specifies location and name of the obo file")
    parser.add_argument(
        '--no_propagate_counts',
        default=False,
        action='store_true',
        help="Do not propagate counts to parent terms")
    parser.add_argument(
        '--method',
        default="bonferroni,sidak,holm,fdr_bh",
        type=str,
        help=Methods().getmsg_valid_methods())
    parser.add_argument(
        '--pvalcalc',
        default="fisher",
        type=str,
        help=str(FisherFactory()))
    parser.add_argument(
        '--min_overlap',
        default=0.7,
        type=float,
        help="Check that a minimum amount of study genes are in the population")
    parser.add_argument(
        '--goslim',
        default='goslim_generic.obo',
        type=str,
        help="The GO slim file is used when grouping GO terms.")

    # No arguments at all: show the help text and exit
    if len(sys.argv) == 1:
        sys.exit(not parser.print_help())

    parsed = parser.parse_args()  # Namespace object from argparse
    self._check_input_files(parsed, parser)
    return parsed
def get_arg_parser():
    """Build the enrichment argument parser, parse the command line, and return the result.

    Despite the name, the value returned is the parsed argparse Namespace,
    not the parser. When the program is started without any command-line
    argument, the help text is printed and the process exits. The Namespace
    is passed to ``_check_input_files`` before it is returned.
    """
    parser = argparse.ArgumentParser(
        __doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Positional input files
    parser.add_argument(
        'filenames',
        type=str,
        nargs=3,
        help='data/study data/population data/association')

    # Significance thresholds
    parser.add_argument(
        '--alpha',
        default=0.05,
        type=float,
        help="Test-wise alpha for multiple testing ")
    parser.add_argument(
        '--pval',
        default=.05,
        type=float,
        help="Only print out when uncorrected p-value < this value.")

    # Analysis behaviour and output
    parser.add_argument(
        '--compare',
        dest='compare',
        default=False,
        action='store_true',
        help="the population file as a comparison group. if this "
             "flag is specified, the population is used as the study "
             "plus the `population/comparison`")
    parser.add_argument(
        '--ratio',
        dest='ratio',
        type=float,
        default=None,
        help="only show values where the difference between study "
             "and population ratios is greater than this. useful for "
             "excluding GO categories with small differences, but "
             "containing large numbers of genes. should be a value "
             "between 1 and 2. ")
    parser.add_argument(
        '--indent',
        dest='indent',
        default=False,
        action='store_true',
        help="indent GO terms")
    parser.add_argument(
        '--obo',
        default="go-basic.obo",
        type=str,
        help="Specifies location and name of the obo file")
    parser.add_argument(
        '--no_propagate_counts',
        default=False,
        action='store_true',
        help="Do not propagate counts to parent terms")
    parser.add_argument(
        '--outfile',
        default=None,
        type=str,
        help="Write enrichment results into xlsx or tsv file")
    parser.add_argument(
        '--method',
        default="bonferroni,sidak,holm,fdr_bh",
        type=str,
        help=Methods().getmsg_valid_methods())
    parser.add_argument(
        '--pvalcalc',
        default="fisher",
        type=str,
        help=str(FisherFactory()))
    parser.add_argument(
        '--min_overlap',
        default=0.7,
        type=float,
        help="Check that a minimum amount of study genes are in the population")

    # No arguments at all: show the help text and exit
    if len(sys.argv) == 1:
        sys.exit(not parser.print_help())

    parsed = parser.parse_args()  # Namespace object from argparse
    _check_input_files(parsed, parser)
    return parsed
def test_init_methods():
    """Test initializing methods.

    Checks the default state of a new Methods object, the effect of adding
    one statsmodels method, and the result of re-initialising with every
    statsmodels method given with the 'sm_' prefix.
    """
    methods_obj = Methods()

    def method_info(source, method, fieldname):
        """Build the expected NtMethodInfo record."""
        return methods_obj.NtMethodInfo(source=source, method=method, fieldname=fieldname)

    # Freshly created object: known field names and a single default method
    assert methods_obj._srcmethod2fieldname == get_exp_fieldnames()
    assert methods_obj.getmsg_valid_methods() == get_expstr_fieldnames()
    assert methods_obj.methods == [method_info('local', 'bonferroni', 'bonferroni')]

    # Adding a statsmodels method appends to the existing list
    methods_obj._add_method_src('statsmodels', 'fdr_bh')
    expected_after_add = [
        method_info('local', 'bonferroni', 'bonferroni'),
        method_info('statsmodels', 'fdr_bh', 'fdr_bh'),
    ]
    assert methods_obj.methods == expected_after_add

    # Re-initialise with all statsmodels methods, each prefixed with 'sm_'
    sm_methods = ['sm_{}'.format(m) for m in methods_obj.all_methods[1][1]]  # statsmodels
    methods_obj._init_methods(sm_methods)
    statsmodels_names = (
        'bonferroni',
        'sidak',
        'holm-sidak',
        'holm',
        'simes-hochberg',
        'hommel',
        'fdr_bh',
        'fdr_by',
        'fdr_tsbh',
        'fdr_tsbky',
        'fdr_gbs',
    )
    expected_statsmodels = [
        method_info('statsmodels', name, 'sm_' + name) for name in statsmodels_names
    ]
    assert methods_obj.methods == expected_statsmodels
help=('Write enrichment results into a text file \n' 'containing the following information: \n' '1) GOEA GO terms, grouped into sections \n\n' '2) List of genes and ASCII art showing section membership \n' '3) Detailed list of each gene and GO terms w/their P-values \n'), abbrev='D') @plac.opt('ratio', type=float, help="only show values where the difference between study " "and population ratios is greater than this. useful for " "excluding GO categories with small differences, but " "containing large numbers of genes. should be a value " "between 1 and 2. ") @plac.opt('relationships', abbrev='R', help=('Propagate counts up user-specified relationships ( comma separated ), which include: ' '{RELS}').format(RELS=' '.join(RELATIONSHIP_SET))) @plac.opt('method', type=str, help=Methods().getmsg_valid_methods()) @plac.opt('pvalcalc', type=str, help=str(FisherFactory()), abbrev='calc') @plac.opt('min_overlap', type=float, help="Check that a minimum amount of study genes are in the population", abbrev='M') @plac.opt('goslim', type=str, help="The GO slim file is used when grouping GO terms.") def run(name='human', taxid=9606, download=False, alpha=0.05, pval=.05, field='p_uncorrected', outfile='result.tsv', ns='BP,MF,CC', id2sym=None, detail='', sections=None, compare=False, ratio=None, prtstd=False, indent=False, noprop=False, relationship=False, relationships='', plot=False, enrich=False, method="bonferroni,sidak,holm,fdr_bh", pvalcalc="fisher", min_overlap=0.7, goslim='goslim_generic.obo', inc='', exc='', *study): # Construct arguments to pass down to GO. go_params = dict(alpha=alpha, pval=pval, pval_field=field, outfile=outfile,