Exemplo n.º 1
1
 def run(study, pop, assoc, alpha=0.05, p_value=0.05, compare=False, ratio=None, obo='go-basic.obo', no_propagate_counts=False,
         method='bonferroni,sidak,holm', pvalcalc='fisher'):
     '''
     Run a Goatools GO enrichment study and wrap the outcome in a ``GO`` object.

     :param study: study gene identifiers, or (when both ``study`` and ``pop``
         are strings) the name of a study file read via ``GO._read_geneset``
     :param pop: population gene identifiers, or the name of a population file
     :param assoc: the association from the gene to the go term. A
         ``defaultdict`` is used as-is, any other ``dict`` is written to a file
         and re-read with ``read_associations``, and anything else is handed
         straight to ``read_associations``
     :param alpha: forwarded to ``GOEnrichmentStudy``
     :param p_value: passed through to the ``GO`` constructor
     :param compare: forwarded to ``GO._read_geneset`` and to the ``GO`` constructor
     :param ratio: passed through to the ``GO`` constructor
     :param obo: path of the obo file; the default value is mapped to
         ``obo/go.obo`` next to this module, and the file is downloaded when
         it does not exist
     :param no_propagate_counts: inverted to obtain ``propagate_counts``
     :param method: comma-separated multiple-test correction names
     :param pvalcalc: forwarded to ``GOEnrichmentStudy``
     :return: a ``GO`` instance built from the result table and the inputs
     '''
     if isinstance(study, str) and isinstance(pop, str):
         # load the study and pop from the file
         study, pop = GO._read_geneset(study, pop, compare=compare)
     else:
         # convert to the set
         study = frozenset(study)
         pop = set(pop)
     methods = method.split(",")
     if obo == 'go-basic.obo':
         obo = os.path.dirname(os.path.realpath(__file__)) + "/obo/go.obo"
     if not os.path.exists(obo):
         print("obo file not found, start to download")
         wget.download('http://purl.obolibrary.org/obo/go/go-basic.obo', obo)
     obo_dag = GODag(obo)
     propagate_counts = not no_propagate_counts
     # defaultdict is a dict subclass, so it has to be tested first to keep
     # bypassing the file round-trip below.
     if isinstance(assoc, defaultdict):
         pass
     elif isinstance(assoc, dict):
         # One "<gene>\t<go>;<go>..." line per gene; empty entries are skipped.
         rows = ["{}\t{}\n".format(k, ";".join([str(x) for x in v if x]))
                 for k, v in assoc.items() if v]
         path = os.path.dirname(os.path.realpath(__file__)) + "/assoc"
         with open(path, 'w') as fp:
             fp.write("".join(rows))
         assoc = read_associations(path)
     else:
         # if from a file
         assoc = read_associations(assoc)
     g = GOEnrichmentStudy(pop, assoc, obo_dag,
                           propagate_counts=propagate_counts,
                           alpha=alpha,
                           pvalcalc=pvalcalc,
                           methods=methods)
     results = g.run_study(study)
     # g.print_summary(results, min_ratio=ratio, indent=False, pval=p_value)
     # NOTE(review): the header hard-codes the bonferroni/sidak/holm columns;
     # presumably it only lines up with the rows for the default ``method`` - confirm.
     header = 'GO\tNS\tenrichment\tname\tratio_in_study\tratio_in_pop\tp_uncorrected\tdepth\tstudy_count\tp_bonferroni\tp_sidak\tp_holm\thit\n'
     r = header + "".join(str(x) + "\n" for x in results)
     tb = pd.read_table(StringIO(r))
     return GO(tb, study, pop, assoc, alpha, p_value, compare, ratio, obo, no_propagate_counts, method, pvalcalc, obo_dag)
Exemplo n.º 2
0
def get_goeaobj(methods=None):
    """Build a GOEnrichmentStudy from the GO-slim test files under ROOT.

    :param methods: forwarded unchanged as ``methods`` to GOEnrichmentStudy.
    :return: the GOEnrichmentStudy object.
    """
    obo_dag = GODag(ROOT + "goslim_generic.obo")
    assoc = read_associations(ROOT + "slim_association", no_top=True)
    # Read the population inside a context manager so the file is closed.
    with open(ROOT + "small_population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
Exemplo n.º 3
0
def get_goeaobj(methods=None):
    """Create the GOEA object used by the test from the files under ROOT.

    :param methods: passed through as ``methods`` to GOEnrichmentStudy.
    :return: the GOEnrichmentStudy object.
    """
    obo_dag = GODag(ROOT + "goslim_generic.obo")
    assoc = read_associations(ROOT + "slim_association", no_top=True)
    # One population ID per line; the with-block closes the file handle.
    with open(ROOT + "small_population") as pop_file:
        popul_ids = [line.rstrip() for line in pop_file]
    return GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
Exemplo n.º 4
0
def get_goeaobj(methods=None):
    """Build a GOEnrichmentStudy from go-basic.obo and the files in ../data.

    All paths are relative to the current working directory.

    :param methods: forwarded unchanged as ``methods`` to GOEnrichmentStudy.
    :return: the GOEnrichmentStudy object.
    """
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Read the population inside a context manager so the file is closed.
    with open("../data/population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
Exemplo n.º 5
0
 def rd_files(self):
     """Load the inputs listed in ``self.args.filenames``.

     The three filenames are taken in the order study, population,
     association. A one-line size summary is printed.

     :return: tuple ``(study, pop, assoc)``.
     """
     fin_study, fin_pop, fin_assoc = self.args.filenames
     # The association file is read first, then the two gene sets.
     associations = read_associations(fin_assoc)
     study_ids, pop_ids = self._read_geneset(fin_study, fin_pop)
     summary = "Study: {0} vs. Population {1}\n".format(len(study_ids), len(pop_ids))
     print(summary)
     return study_ids, pop_ids, associations
Exemplo n.º 6
0
def get_goeaobj(methods=None):
    """Create a GOEA object from go-basic.obo and the ../data input files.

    Paths are resolved against the current working directory.

    :param methods: passed through as ``methods`` to GOEnrichmentStudy.
    :return: the GOEnrichmentStudy object.
    """
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # One population ID per line; the with-block closes the file handle.
    with open("../data/population") as pop_file:
        popul_ids = [line.rstrip() for line in pop_file]
    return GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
Exemplo n.º 7
0
def test_fdr_bh(fout_log=None):
    """Do Gene Ontology Enrichment Analysis w/Benjamini-Hochberg multipletest. Print results.

    The significant results are printed as text to the log stream, then
    written to ``goea_fdr_bh.tsv`` and ``goea_fdr_bh.xlsx``.

    :param fout_log: name of a log file to create; when None the text report
        goes to ``sys.stdout``.
    """
    # The log is either stdout (never closed here) or a file owned by this function.
    log = sys.stdout if fout_log is None else open(fout_log, 'w')
    try:
        # ---------------------------------------------------------------------
        # Run Gene Ontology Analysis (GOEA)
        #
        # 1. Initialize
        obo_dag = GODag("go-basic.obo")
        assoc = read_associations("../data/association", no_top=True)
        with open("../data/population") as ifstrm:
            popul_ids = [line.rstrip() for line in ifstrm]
        with open("../data/study") as ifstrm:
            study_ids = [line.rstrip() for line in ifstrm]
        # 2. Run enrichment analysis
        goea = GOEA(obo_dag, assoc, log)
        goea.set_population(popul_ids)
        goea.set_params(alpha=0.05, method='fdr_bh')
        results_nt = goea.find_enrichment(study_ids)

        # ---------------------------------------------------------------------
        # Print results 3 ways: to screen, to tsv(tab-separated file), to xlsx(Excel spreadsheet)
        fout_tsv = "goea_fdr_bh.tsv"
        fout_xls = "goea_fdr_bh.xlsx"

        field_names = ['NS', 'study_cnt', 'fdr_bh', 'level', 'depth', 'GO', 'name', 'fdr_bh_sig'] # collect these
        print_names = ['NS', 'study_cnt', 'fdr_bh', 'level', 'depth', 'GO', 'name'] # print these in tsv and xlsx

        # Optional user customizable sort:
        #     Sort by: 1st) BP, MF, CC; 2nd) corrected pval, with smallest first.
        def sort_by_pval(nt):
            return [nt.NS, nt.fdr_bh]

        # T/F: Keep the GOEA GO Term result only if the result is significant.
        def keep_if(nt):
            return nt.fdr_bh_sig

        # 1. Print results to screen using format in prtfmt. For example:
        #
        #      BP 22 3.073e-03 L06 D07 GO:0006468 protein phosphorylation
        #      BP  9 1.023e-02 L07 D08 GO:0006511 ubiquitin-dependent protein catabolic process
        #      BP  2 1.023e-02 L05 D09 GO:0019877 diaminopimelate biosynthetic process
        #      BP  2 1.223e-02 L04 D08 GO:0006301 postreplication repair
        #      BP  2 1.223e-02 L05 D09 GO:0030418 nicotianamine biosynthetic process
        #      BP  2 1.492e-02 L04 D06 GO:0006909 phagocytosis
        #      BP  2 1.492e-02 L03 D03 GO:0051322 anaphase
        #      ...
        # Print format field names are the same names as in the "field_names" variable.
        prtfmt = "{NS} {study_cnt:2} {fdr_bh:5.3e} L{level:02} D{depth:02} {GO} {name}\n"
        goea.prt_txt(log, results_nt, field_names, prtfmt, sort_by=sort_by_pval, keep_if=keep_if)

        # 2. Write results to tsv file
        # Sort by: 1st) BP, MF, CC; 2nd) By GO depth, deepest GO first.
        def sort_by_depth(nt):
            return [nt.NS, -1*nt.depth]

        fld2fmt = {'fdr_bh':'{:8.2e}'} # Optional user defined formatting for specific fields
        goea.wr_tsv(fout_tsv, results_nt, field_names,
            keep_if=keep_if, sort_by=sort_by_depth, fld2fmt=fld2fmt, print_names=print_names)

        # 3. Write results to xlsx file
        # Use these headers instead of the print_names for the xlsx header
        hdrs = ['NS', 'Cnt', 'fdr_bh', 'L', 'D', 'Term', 'Ontology Term Name']
        # TBD Check that header and size of fields printed match
        goea.wr_xlsx(fout_xls, results_nt, field_names,
            # optional key-word args (ie, kwargs, kws)
            keep_if=keep_if, sort_by=sort_by_depth, hdrs=hdrs, fld2fmt=fld2fmt, print_names=print_names)
    finally:
        # Close the log file even when the analysis or a writer raises.
        if fout_log is not None:
            log.close()
    if fout_log is not None:
        sys.stdout.write("  WROTE: {}\n".format(fout_log))
Exemplo n.º 8
0
def init_goea(**kws):
    """Initialize GODag and GOEnrichmentStudy.

    :param kws: optional ``methods`` entry forwarded to GOEnrichmentStudy;
        ``['not_bonferroni']`` is used when it is absent.
    :return: tuple ``(GOEnrichmentStudy object, study IDs)``.
    """
    obo_dag = GODag(ROOT + "go-basic.obo")
    assoc = read_associations(ROOT + "association", no_top=True)
    # Context managers close the input files instead of leaking the handles.
    with open(ROOT + "population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    methods = kws.get('methods', ['not_bonferroni'])
    with open(ROOT + "study") as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    return GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods), study_ids
Exemplo n.º 9
0
def get_goeaobj(methods=None):
    """Build a GOEnrichmentStudy from the small test data under REPO.

    :param methods: forwarded unchanged as ``methods`` to GOEnrichmentStudy.
    :return: the GOEnrichmentStudy object.
    """
    obo_fin = os.path.join(REPO, "go-basic.obo")
    obo_dag = get_godag(obo_fin, loading_bar=None)
    assoc = read_associations("{REPO}/tests/data/small_association".format(REPO=REPO), no_top=True)
    popul_fin = "{REPO}/tests/data/small_population".format(REPO=REPO)
    # Read the population inside a context manager so the file is closed.
    with open(popul_fin) as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
Exemplo n.º 10
0
def init_goea(**kws):
    """Initialize GODag and GOEnrichmentStudy.

    The ontology is loaded from ``go-basic.obo`` in the current working
    directory; the other inputs come from files under ROOT.

    :param kws: optional ``methods`` entry forwarded to GOEnrichmentStudy;
        ``['not_bonferroni']`` is used when it is absent.
    :return: tuple ``(GOEnrichmentStudy object, study IDs)``.
    """
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    fin_assc = ROOT + "association"
    assoc = read_associations(fin_assc, 'id2gos', no_top=True)
    # Context managers close the input files instead of leaking the handles.
    with open(ROOT + "population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    methods = kws.get('methods', ['not_bonferroni'])
    with open(ROOT + "study") as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    return GOEnrichmentStudy(popul_ids, assoc, godag, methods=methods), study_ids
Exemplo n.º 11
0
def rd_files(filenames, compare, prt=sys.stdout):
    """Load the study, population and association inputs.

    :param filenames: three filenames in the order study, population, association.
    :param compare: forwarded to ``read_geneset``.
    :param prt: stream that receives a one-line size summary; a falsy value
        suppresses the summary.
    :return: tuple ``(study, pop, assoc)``.
    """
    fin_study, fin_pop, fin_assoc = filenames
    # The association file is read first, then the two gene sets.
    associations = read_associations(fin_assoc)
    study_ids, pop_ids = read_geneset(fin_study, fin_pop, compare=compare)
    if not prt:
        return study_ids, pop_ids, associations
    summary = "Study: {0} vs. Population {1}\n".format(len(study_ids), len(pop_ids))
    prt.write(summary)
    return study_ids, pop_ids, associations
Exemplo n.º 12
0
def init_goea(**kws):
    """Initialize GODag and GOEnrichmentStudy.

    :param kws: optional ``methods`` entry forwarded to GOEnrichmentStudy;
        ``['not_bonferroni']`` is used when it is absent.
    :return: tuple ``(GOEnrichmentStudy object, study IDs)``.
    """
    obo_dag = GODag(ROOT + "go-basic.obo")
    assoc = read_associations(ROOT + "association", no_top=True)
    # The with-blocks close the population and study files after reading.
    with open(ROOT + "population") as pop_file:
        popul_ids = [line.rstrip() for line in pop_file]
    methods = kws.get('methods', ['not_bonferroni'])
    with open(ROOT + "study") as study_file:
        study_ids = [line.rstrip() for line in study_file]
    return GOEnrichmentStudy(popul_ids, assoc, obo_dag,
                             methods=methods), study_ids
Exemplo n.º 13
0
def test_goea():
    """Test GOEA with method, fdr.

    Runs the study IDs in ``../data/study`` against the population in
    ``../data/population`` and prints the summary of the results.
    """
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Context managers close the input files instead of leaking the handles.
    with open("../data/population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    with open("../data/study") as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=['fdr'])
    goea_results = goeaobj.run_study(study_ids)
    goeaobj.print_summary(goea_results)
Exemplo n.º 14
0
def get_goeaobj(methods=None):
    """Create a GOEA object from the small test data under REPO.

    :param methods: passed through as ``methods`` to GOEnrichmentStudy.
    :return: the GOEnrichmentStudy object.
    """
    obo_fin = os.path.join(REPO, "go-basic.obo")
    obo_dag = get_godag(obo_fin, loading_bar=None)
    fin_assc = "{REPO}/tests/data/small_association".format(REPO=REPO)
    assoc = read_associations(fin_assc, 'id2gos', no_top=True)
    popul_fin = "{REPO}/tests/data/small_population".format(REPO=REPO)
    # One population ID per line; the with-block closes the file handle.
    with open(popul_fin) as pop_file:
        popul_ids = [line.rstrip() for line in pop_file]
    return GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
Exemplo n.º 15
0
def get_goea_results(method="fdr_bh"):
    """Get GOEA results.

    All inputs are read from the ``data`` directory next to this file.

    :param method: multiple-test correction name, used both when building
        the GOEnrichmentStudy and when running the study.
    :return: the value returned by ``run_study``.
    """
    root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    obo_fin = os.path.join(root_dir, "goslim_generic.obo")
    obo_dag = GODag(obo_fin)
    assoc = read_associations(os.path.join(root_dir, "slim_association"), no_top=True)
    # Context managers close the input files instead of leaking the handles.
    with open(os.path.join(root_dir, "small_population")) as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=[method])
    with open(os.path.join(root_dir, "small_study")) as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    goea_results = goeaobj.run_study(study_ids, methods=[method])
    return goea_results
Exemplo n.º 16
0
def get_goea_results(method="fdr_bh"):
    """Get GOEA results.

    Inputs come from the ``data`` directory that sits beside this file.

    :param method: multiple-test correction name, passed in a one-item list
        to both the GOEnrichmentStudy constructor and ``run_study``.
    :return: the value returned by ``run_study``.
    """
    root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    obo_fin = os.path.join(root_dir, "goslim_generic.obo")
    obo_dag = GODag(obo_fin)
    assoc = read_associations(os.path.join(root_dir, "slim_association"), no_top=True)
    # The with-blocks close the population and study files after reading.
    with open(os.path.join(root_dir, "small_population")) as pop_file:
        popul_ids = [line.rstrip() for line in pop_file]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=[method])
    with open(os.path.join(root_dir, "small_study")) as study_file:
        study_ids = [line.rstrip() for line in study_file]
    return goeaobj.run_study(study_ids, methods=[method])
Exemplo n.º 17
0
    def __init__(self):
        """Create the GOEnrichmentStudy stored in ``self.goeaobj``.

        The ontology, background gene list and gene-to-GO associations are
        read from ``../Data/evaluation_reference``, relative to the current
        working directory.
        """
        obodag = GODag("../Data/evaluation_reference/goslim_yeast.obo")
        # Read the background genes inside a context manager so the file is closed.
        with open('../Data/evaluation_reference/gene_list.txt') as gene_file:
            background = [line.strip() for line in gene_file]
        geneid2gos_yeast = read_associations('../Data/evaluation_reference/geneid2gos_yeast.txt')

        # The keyword is spelled ``propagate_counts``. The former misspelling
        # ``propogate_counts`` presumably fell into the constructor's extra
        # keyword arguments and left propagation at its default - confirm
        # against the installed goatools version.
        self.goeaobj = GOEnrichmentStudy(
            background,
            geneid2gos_yeast,
            obodag,
            propagate_counts=False,
            alpha=0.05,
            methods=['fdr_bh'])
Exemplo n.º 18
0
def init_goea(log):
    """Read Ontologies and Annotations once.

    :param log: passed as the third argument to ``GOEA``.
    :return: the GOEA object, with its population already set.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Read the population inside a context manager so the file is closed.
    with open("../data/population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    # 2. Run enrichment analysis
    goeaobj = GOEA(obo_dag, assoc, log)
    goeaobj.set_population(popul_ids)
    return goeaobj
Exemplo n.º 19
0
def run_bonferroni():
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest.

    :return: tuple ``(results of run_study, GOEnrichmentStudy object)``.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    assoc = read_associations(os.path.join(REPO, "data/association"), no_top=True)
    # Context managers close the input files instead of leaking the handles.
    with open(os.path.join(REPO, "data/population")) as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    with open(os.path.join(REPO, "data/study")) as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, godag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
Exemplo n.º 20
0
def run_bonferroni(log):
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest.

    :param log: not used by this function; kept so existing callers still work.
    :return: tuple ``(results of run_study, GOEnrichmentStudy object)``.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Context managers close the input files instead of leaking the handles.
    with open("../data/population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    with open("../data/study") as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, obo_dag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
Exemplo n.º 21
0
def run_bonferroni():
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest.

    :return: tuple ``(results of run_study, GOEnrichmentStudy object)``.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    fin_assc = os.path.join(REPO, "data/association")
    assoc = read_associations(fin_assc, 'id2gos', no_top=True)
    # The with-blocks close the population and study files after reading.
    with open(os.path.join(REPO, "data/population")) as pop_file:
        popul_ids = [line.rstrip() for line in pop_file]
    with open(os.path.join(REPO, "data/study")) as study_file:
        study_ids = [line.rstrip() for line in study_file]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, godag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
Exemplo n.º 22
0
def run_bonferroni(log):
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest.

    :param log: not used by this function; kept so existing callers still work.
    :return: tuple ``(results of run_study, GOEnrichmentStudy object)``.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # The with-blocks close the population and study files after reading.
    with open("../data/population") as pop_file:
        popul_ids = [line.rstrip() for line in pop_file]
    with open("../data/study") as study_file:
        study_ids = [line.rstrip() for line in study_file]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids,
                             assoc,
                             obo_dag,
                             alpha=0.05,
                             methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
Exemplo n.º 23
0
        if opts.term not in go_dag:
            sys.stderr.write(("term %s not found!\n" % opts.term))
            sys.exit(1)
        direct_anc, all_anc = mapslim(opts.term, go_dag, goslim_dag)
        # output either all or only direct slims, depending on user command
        if only_direct:
            slim_terms_str = ";".join(direct_anc)
        else:
            slim_terms_str = ";".join(all_anc)
        print(slim_terms_str)

    # in case a association file is given as input
    if opts.ass_file_name:
        assert os.path.exists(opts.ass_file_name), ("file %s not found!"
                                                    % opts.ass_file_name)
        assocs = read_associations(opts.ass_file_name, 'id2gos')
        for protein_product, go_terms in assocs.items():
            all_direct_anc = set()
            all_covered_anc = set()
            all_all_anc = set()
            for go_term in go_terms:
                if go_term not in go_dag:
                    continue
                direct_anc, all_anc = mapslim(go_term, go_dag, goslim_dag)
                all_all_anc |= all_anc
                # collect all covered ancestors, so the direct ancestors
                # can be calculated afterwards
                all_covered_anc |= (all_anc - direct_anc)
            all_direct_anc = all_all_anc - all_covered_anc
            # output either all or only direct, depending on user command
            if only_direct:
Exemplo n.º 24
0
        if opts.term not in go_dag:
            sys.stderr.write(("term %s not found!\n" % opts.term))
            sys.exit(1)
        direct_anc, all_anc = mapslim(opts.term, go_dag, goslim_dag)
        # output either all or only direct slims, depending on user command
        if only_direct:
            slim_terms_str = ";".join(direct_anc)
        else:
            slim_terms_str = ";".join(all_anc)
        print(slim_terms_str)

    # in case a association file is given as input
    if opts.ass_file_name:
        assert os.path.exists(opts.ass_file_name), ("file %s not found!" %
                                                    opts.ass_file_name)
        assocs = read_associations(opts.ass_file_name, 'id2gos')
        for protein_product, go_terms in assocs.items():
            all_direct_anc = set()
            all_covered_anc = set()
            all_all_anc = set()
            for go_term in go_terms:
                if go_term not in go_dag:
                    continue
                direct_anc, all_anc = mapslim(go_term, go_dag, goslim_dag)
                all_all_anc |= all_anc
                # collect all covered ancestors, so the direct ancestors
                # can be calculated afterwards
                all_covered_anc |= (all_anc - direct_anc)
            all_direct_anc = all_all_anc - all_covered_anc
            # output either all or only direct, depending on user command
            if only_direct:
Exemplo n.º 25
0
    if not args.compare:  # sanity check
        if len(pop) < len(study):
            exit("\nERROR: The study file contains more elements than the population file. "
                 "Please check that the study file is a subset of the population file.\n")
        # check the fraction of genomic ids that overlap between study
        # and population
        overlap = float(len(study & pop)) / len(study)
        if 0.7 < overlap < 0.95:
            sys.stderr.write("\nWARNING: only {} fraction of genes/proteins in study are found in "
                             "the population  background.\n\n".format(overlap))
        if overlap <= 0.7:
            exit("\nERROR: only {} of genes/proteins in the study are found in the "
                 "background population. Please check.\n".format(overlap))

    assoc = read_associations(assoc_fn)

    methods = args.method.split(",")

    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop, assoc, obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          pvalcalc=args.pvalcalc,
                          methods=methods)
    results = g.run_study(study)
    if args.outfile is None:
        g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval)
    else:
        # Users can print to both tab-separated file and xlsx file in one run.
Exemplo n.º 26
0
            in_memb = in_memb or term in plasma_membrane
            in_nucl = in_nucl or term in nucleus
        if in_cyto:
            print("cytoplasmic_part\t" + prot)
        # if in_memb:
        #     print("plasma_membrane_part\t" + prot)
        if in_nucl:
            print("nuclear_part\t" + prot)


if __name__ == "__main__":
    # Expect exactly two arguments: the association file and the obo file.
    if len(sys.argv) != 3:
        sys.exit("USAGE: <script> ASSOCS OBO")
    assoc_file, obo_file = sys.argv[1], sys.argv[2]
    obo_dag = GODag(obo_file=obo_file, optional_attrs=["relationship"])
    # Each location set starts with one seed term.
    cytoplasm = {"GO:0005737"}
    plasma_membrane = {"GO:0005886"}
    nucleus = {"GO:0005634"}
    # A term joins a location set when the paired GO ID is among its parents.
    parent2locset = (
        ("GO:0044444", cytoplasm),
        ("GO:0044459", plasma_membrane),
        ("GO:0044428", nucleus),
    )
    for term in obo_dag:
        term_rec = obo_dag[term]
        parents = get_really_all_parents(term_rec)
        for parent_id, locset in parent2locset:
            if parent_id in parents:
                locset.add(term)
    assoc = read_associations(assoc_file)
    print_locations(assoc, cytoplasm, plasma_membrane, nucleus)
Exemplo n.º 27
0
    #                        res_df_name = 'res_clustering.csv',
    #                        method='ward' ,
    #                        metric='euclidean'
    #                           )
    #==============================================================================

    data_path = os.path.abspath("data")
    res_path = os.path.abspath("results")
    in_go = data_path + '/go-basic.obo'
    in_assoc = data_path + '/associations.txt'
    gene_found = [
        strip(n.split('\t')[0])
        for n in open(data_path + '/in_df.txt').read().split('\n')
    ]
    obodag = GODag(in_go)
    geneid2gos = read_associations(in_assoc)
    goeaobj = GOEnrichmentStudy(
        gene_found,  # List of mouse protein-coding genes
        geneid2gos,  # geneid/GO associations
        obodag,  # Ontologies
        propagate_counts=False,
        alpha=0.05,  # default significance cut-off
        methods=['fdr_bh'])

    clustering_df = pd.DataFrame.from_csv('res_clustering.csv')
    for index, col in enumerate(clustering_df.columns):
        in_data = clustering_df[col].value_counts()
        in_data = in_data[in_data >= 2]
        cut_distance = col.split('_')[-1]
        print index, col, cut_distance
        for cluster in in_data.index.values:
Exemplo n.º 28
0
 def run(study,
         pop,
         assoc,
         alpha=0.05,
         p_value=0.05,
         compare=False,
         ratio=None,
         obo='go-basic.obo',
         no_propagate_counts=False,
         method='bonferroni,sidak,holm',
         pvalcalc='fisher'):
     '''
     This is the wrapper of the Goatools function: it runs a GO enrichment
     study and returns the outcome wrapped in a ``GO`` object.

     :param study: a list of study gene, or (when both ``study`` and ``pop``
         are strings) the name of a study file read via ``GO._read_geneset``
     :param pop: a list of population gene, or the name of a population file
     :param assoc: the association from the gene to the go term. A
         ``defaultdict`` is used as-is, any other ``dict`` is written to a file
         and re-read with ``read_associations``, and anything else is handed
         straight to ``read_associations``
     :param alpha: forwarded to ``GOEnrichmentStudy``
     :param p_value: passed through to the ``GO`` constructor
     :param compare: forwarded to ``GO._read_geneset`` and to the ``GO`` constructor
     :param ratio: passed through to the ``GO`` constructor
     :param obo: path of the obo file; the default value is mapped to
         ``obo/go.obo`` next to this module, and the file is downloaded when
         it does not exist
     :param no_propagate_counts: inverted to obtain ``propagate_counts``
     :param method: comma-separated multiple-test correction names
     :param pvalcalc: forwarded to ``GOEnrichmentStudy``
     :return: a ``GO`` instance built from the result table and the inputs
     '''
     if isinstance(study, str) and isinstance(pop, str):
         # load the study and pop from the file
         study, pop = GO._read_geneset(study, pop, compare=compare)
     else:
         # convert to the set
         study = frozenset(study)
         pop = set(pop)
     methods = method.split(",")
     if obo == 'go-basic.obo':
         obo = os.path.dirname(os.path.realpath(__file__)) + "/obo/go.obo"
     if not os.path.exists(obo):
         print("obo file not found, start to download")
         wget.download('http://purl.obolibrary.org/obo/go/go-basic.obo',
                       obo)
     obo_dag = GODag(obo)
     propagate_counts = not no_propagate_counts
     # defaultdict is a dict subclass, so it has to be tested first to keep
     # bypassing the file round-trip below.
     if isinstance(assoc, defaultdict):
         pass
     elif isinstance(assoc, dict):
         # One "<gene>\t<go>;<go>..." line per gene; empty entries are skipped.
         rows = [
             "{}\t{}\n".format(k, ";".join([str(x) for x in v if x]))
             for k, v in assoc.items() if v
         ]
         path = os.path.dirname(os.path.realpath(__file__)) + "/assoc"
         with open(path, 'w') as fp:
             fp.write("".join(rows))
         assoc = read_associations(path)
     else:
         # if from a file
         assoc = read_associations(assoc)
     g = GOEnrichmentStudy(pop,
                           assoc,
                           obo_dag,
                           propagate_counts=propagate_counts,
                           alpha=alpha,
                           pvalcalc=pvalcalc,
                           methods=methods)
     results = g.run_study(study)
     # g.print_summary(results, min_ratio=ratio, indent=False, pval=p_value)
     # NOTE(review): the header hard-codes the bonferroni/sidak/holm columns;
     # presumably it only lines up with the rows for the default ``method`` - confirm.
     header = 'GO\tNS\tenrichment\tname\tratio_in_study\tratio_in_pop\tp_uncorrected\tdepth\tstudy_count\tp_bonferroni\tp_sidak\tp_holm\thit\n'
     r = header + "".join(str(x) + "\n" for x in results)
     tb = pd.read_table(StringIO(r))
     return GO(tb, study, pop, assoc, alpha, p_value, compare, ratio, obo,
               no_propagate_counts, method, pvalcalc, obo_dag)
Exemplo n.º 29
0
        if opts.term not in go_dag:
            sys.stderr.write(("term %s not found!\n" % opts.term))
            sys.exit(1)
        direct_anc, all_anc = mapslim(opts.term, go_dag, goslim_dag)
        # output either all or only direct slims, depending on user command
        if only_direct:
            slim_terms_str = ";".join(direct_anc)
        else:
            slim_terms_str = ";".join(all_anc)
        print(slim_terms_str)

    # in case a association file is given as input
    if opts.ass_file_name:
        assert os.path.exists(opts.ass_file_name), ("file %s not found!"
                                                    % opts.ass_file_name)
        assocs = read_associations(opts.ass_file_name)
        for protein_product, go_terms in assocs.items():
            all_direct_anc = set()
            all_covered_anc = set()
            all_all_anc = set()
            for go_term in go_terms:
                if go_term not in go_dag:
                    continue
                direct_anc, all_anc = mapslim(go_term, go_dag, goslim_dag)
                all_all_anc |= all_anc
                # collect all covered ancestors, so the direct ancestors
                # can be calculated afterwards
                all_covered_anc |= (all_anc - direct_anc)
            all_direct_anc = all_all_anc - all_covered_anc
            # output either all or only direct, depending on user command
            if only_direct:
Exemplo n.º 30
0
def enrich(gene2go: str,
           study: str,
           obo: str,
           population: str = None,
           geneid2symbol: str = None,
           correct='fdr_bh',
           alpha=0.05,
           top=20,
           goea_out=None,
           dag_out=None,
           dpi=300,
           show_gene_limit=6,
           only_plot_sig=False):
    """
    Go enrichment based on goatools.

    The enrichment itself is always run with the ``fdr_bh`` method; the
    uncorrected p-values are then re-adjusted with ``multipletests`` using
    ``correct`` and written back to ``goea_out`` as column ``p_corrected``.
    One DAG figure is drawn per namespace (BP, MF, CC); a namespace without
    any term to plot is skipped and the remaining namespaces are still drawn.

    :param gene2go: a file with two columns: gene_id \\t go_term_id
    :param study: a file with at least one column, first column contains gene id, second columns is regulation direction
    :param obo: go-basic file download from GeneOntology
    :param population: a file with each row contains one gene; default to use all genes in gene2go file as population
    :param geneid2symbol: file with two columns: gene_id \\t gene_symbol, used for DAG plot
    :param correct: pvalue adjustment method (the value ``3`` is treated as ``fdr_bh``):
        Method used for testing and adjustment of pvalues. Can be either the
        full name or initial letters. Available methods are:
        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` : step down method using Sidak adjustments
        - `holm` : step-down method using Bonferroni adjustments
        - `simes-hochberg` : step-up method  (independent)
        - `hommel` : closed method based on Simes tests (non-negative)
        - `fdr_bh` : Benjamini/Hochberg  (non-negative)
        - `fdr_by` : Benjamini/Yekutieli (negative)
        - `fdr_tsbh` : two stage fdr correction (non-negative)
        - `fdr_tsbky` : two stage fdr correction (non-negative)
    :param alpha: fdr cutoff, default 0.05
    :param top: n top go terms to plot (first n rows of the result table per namespace)
    :param goea_out: output enrichment result file, default ``study + '.goea.xls'``
    :param dag_out: dag figure file, default ``study + '.goea.dag.svg'``;
        the namespace is inserted before the file extension
    :param dpi: resolution of image, no effect for svg
    :param show_gene_limit: the max number of gene in a node to show
    :param only_plot_sig: only plot dag for significantly enriched terms
    :return: None
    """
    import os

    if str(correct) == '3':
        correct = 'fdr_bh'
    if geneid2symbol:
        with open(geneid2symbol) as handle:
            geneid2symbol = dict(
                line.split()[:2] for line in handle if line.strip())
    else:
        geneid2symbol = dict()
    obo = GODag(obo, optional_attrs=['relationship', 'is_a'])
    gene2go = read_associations(gene2go)

    # Read the study file once; blank lines carry no gene and are ignored.
    with open(study) as handle:
        study_lines = [line.strip() for line in handle]
    study_lines = [line for line in study_lines if line]
    study_rows = [line.split() for line in study_lines]
    study_genes = [row[0] for row in study_rows]
    try:
        reg_dict = dict(row[:2] for row in study_rows)
    except ValueError:
        # At least one row has a single column: no regulation direction.
        reg_dict = {line: '' for line in study_lines}

    if not population:
        population = gene2go.keys()
    else:
        with open(population) as handle:
            population = [
                line.split()[0] for line in handle if line.strip()
            ]

    goea_obj = GOEnrichmentStudy(population,
                                 gene2go,
                                 obo,
                                 propagate_counts=False,
                                 alpha=alpha,
                                 methods=('fdr_bh', ))
    # Keep only terms hit by at least one study gene.
    keep_if = lambda r: r.ratio_in_study[0] != 0
    goea_results_all = goea_obj.run_study(study_genes, keep_if=keep_if)
    goea_out = goea_out or study + '.goea.xls'
    goea_obj.wr_tsv(goea_out, goea_results_all)

    def func(y):
        """Annotate a comma separated gene list as 'gene|direction|symbol' joined by ';'."""
        results = []
        genes = [x.strip() for x in y.split(',')]
        for gene in genes:
            tmp = [gene]
            if gene in reg_dict:
                tmp.append(reg_dict[gene])
            if gene in geneid2symbol:
                tmp.append(geneid2symbol[gene])
            results.append('|'.join(tmp))
        return ';'.join(results)

    table = pd.read_table(goea_out, header=0, index_col=0)
    # Re-correct the p-values with the requested method and update the table.
    corrected = multipletests(table['p_uncorrected'], method=correct)[1]
    table['p_fdr_bh'] = corrected
    # Update goea_results_all as well so the later plotting uses the new values.
    # NOTE(review): assumes the rows written by wr_tsv are in the same order as
    # goea_results_all -- TODO confirm.
    for result, p_adjusted in zip(goea_results_all, corrected):
        result.p_fdr_bh = p_adjusted
    table.columns = [
        x if x != 'p_fdr_bh' else 'p_corrected' for x in table.columns
    ]
    table['enrichment'] = [
        'e' if x <= alpha else 'p' for x in table['p_corrected']
    ]
    table['study_items'] = table.loc[:, 'study_items'].map(func)
    table.to_csv(goea_out, header=True, index=True, sep='\t')

    # -------------------plot dag------------------------
    dag_out = dag_out or study + '.goea.dag.svg'
    # Split on the real extension so names like 'x.jpeg' are handled too.
    dag_root, dag_ext = os.path.splitext(dag_out)
    for each in ['BP', 'MF', 'CC']:
        if only_plot_sig:
            goea_results_sig = table[table['enrichment'] == 'e']
        else:
            goea_results_sig = table
        goea_results_sig = goea_results_sig[goea_results_sig['NS'] == each]
        if not goea_results_sig.shape[0]:
            print(f"No significant term to plot for {each} ")
            # Skip only this namespace; the others may still have terms.
            continue
        goid_subset = list(goea_results_sig.index[:top])
        plot_gos(
            dag_root + '.' + each + dag_ext,
            # Source GO ids; a node absent from the analysis results would be
            # drawn pale green, but that cannot happen here.
            goid_subset,
            obo,
            # use pvals for coloring:"p_{M}".format(M=goea[0].method_flds[0].fieldname)
            goea_results=goea_results_all,
            # We can further configure the plot...
            id2symbol=geneid2symbol,  # Print study gene Symbols, not GeneIDs
            study_items=show_gene_limit,  # Max number of gene Symbols on GO terms
            items_p_line=3,  # Print 3 genes per line
            dpi=0 if dag_out.endswith('svg') else dpi,
        )
    max_dist = 1000
elif snakemake.wildcards.state_type == 'Enhancer':
    min_dist = 5000
    max_dist = 50000
else:
    sys.exit(-1)
# Collect, per cluster, the entries of column 8 whose last column lies within
# [min_dist, max_dist]; the same entries also form the enrichment background.
# NOTE(review): column 8 presumably holds gene symbols (associations are read
# from "sym2go.txt") -- confirm against the clusters file format.
with open(snakemake.input.clusters) as f:
    for line in f:
        cols = line.strip().split()
        # Numeric cluster index -> letter (0 -> 'A', 1 -> 'B', ...).
        cluster = chr(int(cols[3]) + 65)
        if min_dist <= int(cols[-1]) <= max_dist:
            genes[cluster].add(cols[7])
            background.add(cols[7])

obodag = GODag("go-basic.obo")
id2go = read_associations("sym2go.txt")
goeaobj = GOEnrichmentStudy(background,
                            id2go,
                            obodag,
                            propagate_counts=False,
                            alpha=0.05,
                            methods=['fdr_bh'])
# Write one section per cluster listing the terms with p_fdr_bh < 0.2, sorted
# by (p_fdr_bh, name, enrichment). The context manager guarantees the report
# is flushed and closed even if an enrichment run raises.
with open(snakemake.output.txt, 'w') as outfile:
    for cluster, geneids in sorted(genes.items()):
        outfile.write("Cluster {}\n".format(cluster))
        goea_results_all = goeaobj.run_study(geneids)
        for fdr, name, enrichment in sorted([(r.p_fdr_bh, r.name, r.enrichment)
                                             for r in goea_results_all
                                             if r.p_fdr_bh < 0.2]):
            outfile.write("\t{}\t{}\t{}\n".format(name, fdr, enrichment))
        outfile.write("\n")
Exemplo n.º 32
0
        if opts.term not in go_dag:
            sys.stderr.write(("term %s not found!\n" % opts.term))
            sys.exit(1)
        direct_anc, all_anc = mapslim(opts.term, go_dag, goslim_dag)
        # output either all or only direct slims, depending on user command
        if only_direct:
            slim_terms_str = ";".join(direct_anc)
        else:
            slim_terms_str = ";".join(all_anc)
        print(slim_terms_str)

    # in case a association file is given as input
    if opts.ass_file_name:
        assert os.path.exists(opts.ass_file_name), ("file %s not found!" %
                                                    opts.ass_file_name)
        assocs = read_associations(opts.ass_file_name)
        for protein_product, go_terms in assocs.items():
            all_direct_anc = set()
            all_covered_anc = set()
            all_all_anc = set()
            for go_term in go_terms:
                if go_term not in go_dag:
                    continue
                direct_anc, all_anc = mapslim(go_term, go_dag, goslim_dag)
                all_all_anc |= all_anc
                # collect all covered ancestors, so the direct ancestors
                # can be calculated afterwards
                all_covered_anc |= (all_anc - direct_anc)
            all_direct_anc = all_all_anc - all_covered_anc
            # output either all or only direct, depending on user command
            if only_direct:
    max_dist = 1000
elif snakemake.wildcards.state_type == 'Enhancer':
    min_dist = 5000
    max_dist = 50000
else:
    sys.exit(-1)
# Collect, per cluster, the entries of column 8 whose last column lies within
# [min_dist, max_dist]; the same entries also form the enrichment background.
# NOTE(review): column 8 presumably holds gene symbols (associations are read
# from "sym2go.txt") -- confirm against the clusters file format.
with open(snakemake.input.clusters) as f:
    for line in f:
        cols = line.strip().split()
        # Numeric cluster index -> letter (0 -> 'A', 1 -> 'B', ...).
        cluster = chr(int(cols[3]) + 65)
        if min_dist <= int(cols[-1]) <= max_dist:
            genes[cluster].add(cols[7])
            background.add(cols[7])

obodag = GODag("go-basic.obo")
id2go = read_associations("sym2go.txt")
goeaobj = GOEnrichmentStudy(background, id2go, obodag, propagate_counts=False, alpha=0.05, methods=['fdr_bh'])
# Write one section per cluster listing the terms with p_fdr_bh < 0.2, sorted
# by (p_fdr_bh, name, enrichment). The context manager guarantees the report
# is flushed and closed even if an enrichment run raises.
with open(snakemake.output.txt, 'w') as outfile:
    for cluster, geneids in sorted(genes.items()):
        outfile.write("Cluster {}\n".format(cluster))
        goea_results_all = goeaobj.run_study(geneids)
        significant = sorted([(r.p_fdr_bh, r.name, r.enrichment) for r in goea_results_all if r.p_fdr_bh < 0.2])
        for fdr, name, enrichment in significant:
            outfile.write("\t{}\t{}\t{}\n".format(name, fdr, enrichment))
        outfile.write("\n")
        #GOEnrichmentStudy.print_summary(goea_results_sig)





Exemplo n.º 34
0
    if not args.compare:  # sanity check
        if len(pop) < len(study):
            exit("\nERROR: The study file contains more elements than the population file. "
                 "Please check that the study file is a subset of the population file.\n")
        # check the fraction of genomic ids that overlap between study
        # and population
        overlap = float(len(study & pop)) / len(study)
        if 0.7 < overlap < 0.95:
            sys.stderr.write("\nWARNING: only {} fraction of genes/proteins in study are found in "
                             "the population  background.\n\n".format(overlap))
        if overlap <= 0.7:
            exit("\nERROR: only {} of genes/proteins in the study are found in the "
                 "background population. Please check.\n".format(overlap))

    assoc = read_associations(assoc_fn)

    methods = args.method.split(",")
  
    if args.fdr:
        methods.append("fdr")

    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop, assoc, obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          methods=methods)
    results = g.run_study(study)
    if args.outfile is None:
        g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval)
Exemplo n.º 35
0
def test_fdr_bh(fout_log=None):
    """Do Gene Ontology Enrichment Analysis w/Benjamini-Hochberg multipletest. Print results.

    The results are reported three ways: as text to the log, to a tsv file
    ("goea_fdr_bh.tsv") and to an xlsx file ("goea_fdr_bh.xlsx").

    :param fout_log: path of a log file to write to; when None the log goes
        to sys.stdout. A log file opened here is always closed, even if the
        analysis raises.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    log = sys.stdout if fout_log is None else open(fout_log, 'w')
    try:
        obo_dag = GODag("go-basic.obo")
        assoc = read_associations("../data/association", no_top=True)
        with open("../data/population") as handle:
            popul_ids = [line.rstrip() for line in handle]
        with open("../data/study") as handle:
            study_ids = [line.rstrip() for line in handle]
        # 2. Run enrichment analysis
        goea = GOEA(obo_dag, assoc, log)
        goea.set_population(popul_ids)
        goea.set_params(alpha=0.05, method='fdr_bh')
        results_nt = goea.find_enrichment(study_ids)

        # -----------------------------------------------------------------
        # Print results 3 ways: to screen, to tsv(tab-separated file), to xlsx(Excel spreadsheet)
        fout_tsv = "goea_fdr_bh.tsv"
        fout_xls = "goea_fdr_bh.xlsx"

        field_names = [
            'NS', 'study_cnt', 'fdr_bh', 'level', 'depth', 'GO', 'name',
            'fdr_bh_sig'
        ]  # collect these
        print_names = [
            'NS', 'study_cnt', 'fdr_bh', 'level', 'depth', 'GO', 'name'
        ]  # print these in tsv and xlsx
        # Optional user customizable sort:
        #     Sort by: 1st) BP, MF, CC; 2nd) corrected pval, with smallest first.
        sort_by = lambda nt: [nt.NS, nt.fdr_bh]
        # 1. Print results to screen using format in prtfmt. For example:
        #
        #      BP 22 3.073e-03 L06 D07 GO:0006468 protein phosphorylation
        #      BP  9 1.023e-02 L07 D08 GO:0006511 ubiquitin-dependent protein catabolic process
        #      BP  2 1.023e-02 L05 D09 GO:0019877 diaminopimelate biosynthetic process
        #      BP  2 1.223e-02 L04 D08 GO:0006301 postreplication repair
        #      BP  2 1.223e-02 L05 D09 GO:0030418 nicotianamine biosynthetic process
        #      BP  2 1.492e-02 L04 D06 GO:0006909 phagocytosis
        #      BP  2 1.492e-02 L03 D03 GO:0051322 anaphase
        #      ...
        # Print format field names are the same names as in the "field_names" variable.
        prtfmt = "{NS} {study_cnt:2} {fdr_bh:5.3e} L{level:02} D{depth:02} {GO} {name}\n"
        # T/F: Keep the GOEA GO Term result only if the result is significant.
        keep_if = lambda nt: nt.fdr_bh_sig
        goea.prt_txt(log,
                     results_nt,
                     field_names,
                     prtfmt,
                     sort_by=sort_by,
                     keep_if=keep_if)

        # 2. Write results to tsv file
        # Sort by: 1st) BP, MF, CC; 2nd) By GO depth, deepest GO first.
        sort_by = lambda nt: [nt.NS, -1 * nt.depth]
        fld2fmt = {
            'fdr_bh': '{:8.2e}'
        }  # Optional user defined formatting for specific fields
        goea.wr_tsv(fout_tsv,
                    results_nt,
                    field_names,
                    keep_if=keep_if,
                    sort_by=sort_by,
                    fld2fmt=fld2fmt,
                    print_names=print_names)

        # 3. Write results to xlsx file
        # Use these headers instead of the print_names for the xlsx header
        hdrs = ['NS', 'Cnt', 'fdr_bh', 'L', 'D', 'Term', 'Ontology Term Name']
        # TBD Check that header and size of fields printed match
        goea.wr_xlsx(
            fout_xls,
            results_nt,
            field_names,
            # optional key-word args (ie, kwargs, kws)
            keep_if=keep_if,
            sort_by=sort_by,
            hdrs=hdrs,
            fld2fmt=fld2fmt,
            print_names=print_names)
    finally:
        # Never close sys.stdout; only close a log file opened by this function.
        if fout_log is not None:
            log.close()
    if fout_log is not None:
        sys.stdout.write("  WROTE: {}\n".format(fout_log))
Exemplo n.º 36
0
    Pre = Precision(TP,FP)
    Sen = Sensitivity(TP,FN)
    F = F1(Pre,Sen)
    #    return (Pre , Sen , F1)
    #return (Pre)
    #return (Sen)
    return (F)
# Module-level lists, initially empty.
# NOTE(review): presumably accumulators filled by the per-file loop below -- confirm.
mean=[]
num=[]
# Disabled alternative: build the id -> GO-terms mapping by hand from the
# OrthoDB xrefs table instead of using read_associations.
#df = pd.read_csv('/sf/smpdata1/pronozinau/OrthoDB/odb10v0_gene_xrefs_onlyGO.tab', sep='\t', header=None)
#df.columns = ['ort', 'GO', '3']
#zipbO = zip(df['ort'].to_list(), df['GO'].to_list())
#my_dict = defaultdict(list)
#for k, v in zipbO:
#     my_dict[k].append(v)
# Load the associations from the GO-slim file in 'id2gos' format.
my_dict = read_associations('/sf/smpdata1/pronozinau/Blast_test/GO_slim/GO_slim.csv', 'id2gos')

# Disabled alternative: take the work list from the .csv files found in a directory.
#def find_csv_filenames( path_to_dir, suffix=".csv" ):
#    filenames = listdir(path_to_dir)
#    return [ filename for filename in filenames if filename.endswith( suffix ) ]
#ba = find_csv_filenames("/sf/smpdata1/pronozinau/clust/group3", "csv")
# Work list: column '0' of this table is iterated below to build the
# per-entry clustalw / blast file names.
ba = pd.read_csv('/storage/pronozinau/OrthoDB/mono_sp.csv', sep=',')
for w in ba['0']:
   try:    
    clustal = pd.read_csv('/storage/pronozinau/OrthoDB/clustalw/group_1/' + w + '.csv', sep='\t', header=None)
    blast = pd.read_csv('/storage/pronozinau/ALL_base_OtrhoDB/metout/group1_bla/' + w + '.csv', sep='\t', header=None)
    first = pd.read_csv('first_prot.csv', sep='\t', header=None)

    blast.columns = ['id_prot', 'id_orth', 'persent', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
    clustal.columns = ['id_orth', 'id_prot', 'persent', '4']
    first.columns = ['ort', 'id',  'gr']