예제 #1
1
 def run(study, pop, assoc, alpha=0.05, p_value=0.05, compare=False, ratio=None, obo='go-basic.obo', no_propagate_counts=False,
         method='bonferroni,sidak,holm', pvalcalc='fisher'):
     '''
     Run a goatools GO enrichment study and wrap the result table in a ``GO`` object.

     :param study: path of the study gene file, or an iterable of study genes
     :param pop: path of the population gene file, or an iterable of population genes
     :param assoc: gene -> GO term association, given as a ``defaultdict`` (used as is),
         any other ``dict`` (written to an association file and re-read), or the
         path of an association file
     :param alpha: significance cut-off forwarded to ``GOEnrichmentStudy``
     :param p_value: stored on the returned ``GO`` object
     :param compare: forwarded to ``GO._read_geneset`` and stored on the returned ``GO`` object
     :param ratio: stored on the returned ``GO`` object
     :param obo: path of the ontology file; the default value selects
         ``<module dir>/obo/go.obo``, which is downloaded when missing
     :param no_propagate_counts: when true, ``propagate_counts=False`` is used
     :param method: comma-separated multiple-test correction methods
     :param pvalcalc: p-value calculation name forwarded to ``GOEnrichmentStudy``
     :return: ``GO`` instance built from the result table and the inputs
     '''
     if isinstance(study, str) and isinstance(pop, str):
         # both arguments are file names: load the gene sets from disk
         study, pop = GO._read_geneset(study, pop, compare=compare)
     else:
         # in-memory gene collections: convert to sets
         study = frozenset(study)
         pop = set(pop)
     methods = method.split(",")
     module_dir = os.path.dirname(os.path.realpath(__file__))
     if obo == 'go-basic.obo':
         obo = module_dir + "/obo/go.obo"
     if not os.path.exists(obo):
         print("obo file not found, start to download")
         # make sure the target directory exists, otherwise the download
         # cannot be written to its destination
         obo_dir = os.path.dirname(obo)
         if obo_dir:
             os.makedirs(obo_dir, exist_ok=True)
         wget.download('http://purl.obolibrary.org/obo/go/go-basic.obo', obo)
     obo_dag = GODag(obo)
     propagate_counts = not no_propagate_counts
     # defaultdict is a dict subclass, so it has to be tested first
     if isinstance(assoc, defaultdict):
         pass
     elif isinstance(assoc, dict):
         # serialise the mapping as "<gene>\t<go1>;<go2>..." lines and let
         # goatools parse it back; genes without terms are skipped
         lines = []
         for gene, terms in assoc.items():
             if not terms:
                 continue
             joined = ";".join([str(x) for x in terms if x])
             lines.append("{}\t{}\n".format(gene, joined))
         path = module_dir + "/assoc"
         with open(path, 'w') as fp:
             fp.write("".join(lines))
         assoc = read_associations(path)
     else:
         # if from a file
         assoc = read_associations(assoc)
     g = GOEnrichmentStudy(pop, assoc, obo_dag,
                           propagate_counts=propagate_counts,
                           alpha=alpha,
                           pvalcalc=pvalcalc,
                           methods=methods)
     results = g.run_study(study)
     # NOTE(review): the header is hard-coded for the default correction
     # methods (bonferroni, sidak, holm); confirm the column layout when a
     # different `method` is requested.
     header = 'GO\tNS\tenrichment\tname\tratio_in_study\tratio_in_pop\tp_uncorrected\tdepth\tstudy_count\tp_bonferroni\tp_sidak\tp_holm\thit\n'
     body = "".join(str(x) + "\n" for x in results)
     tb = pd.read_table(StringIO(header + body))
     return GO(tb, study, pop, assoc, alpha, p_value, compare, ratio, obo, no_propagate_counts, method, pvalcalc, obo_dag)
예제 #2
0
    def __init__(self, go_obo_path='data/go.obo'):
        """
        Load the GO DAG and prepare a GO enrichment study over the canonical ORFs.

        :param go_obo_path: path of the GO ``.obo`` file handed to ``GODag``
        """
        import os  # function-scope import: only needed for the null-device path

        canonical_orfs = paper_orfs

        self.obodag = GODag(go_obo_path)

        # read genes containing GO Ontology annotations
        orfs_with_go = read_sgd_orfs()

        # only use canonical orfs dataset: inner join against
        # `canonical_orfs[[]]` (no columns selected), so nothing is added
        self.orfs_with_go = orfs_with_go.join(canonical_orfs[[]], how='inner')

        # create mapping of gene names to set of GO annotations
        assoc = defaultdict(set)
        for _, gene in self.orfs_with_go.iterrows():
            assoc[gene['name']] = set(gene.ontology.split(','))
        self.assoc = assoc
        self.methods = ['fdr_bh', 'bonferroni']

        # Sink for the study's log output.  os.devnull is the platform's null
        # device, so this also works where '/dev/null' does not exist.  The
        # handle is kept on the instance because the study holds on to it.
        self.devnull = open(os.devnull, 'w')

        # create GO enrichment object to run GO
        self.goeaobj = GOEnrichmentStudy(
            assoc.keys(),  # population: every gene that has an association
            assoc,  # geneid/GO associations
            self.obodag,  # Ontologies
            propagate_counts=False,
            alpha=0.05,  # significance cut-off
            methods=self.methods,
            log=self.devnull)
예제 #3
0
def test_i122():
    """Reproduce issue #122 (currently passes).

    For every result with a study count of zero that is significant under
    either correction, the enrichment flag must be 'p'.
    """
    runner = _Run(9606, 'gene2go', 'go-basic.obo')
    study_ids, population_ids = runner.get_genes_study_n_bg()

    # Result is the same whether 'fisher_scipy_stats' or 'fisher' is used
    goea = GOEnrichmentStudy(
        population_ids,
        runner.gene2go,
        runner.godag,
        methods=['bonferroni', 'fdr_bh'],
        pvalcalc='fisher_scipy_stats',
    )
    # Run GOEA Gene Ontology Enrichment Analysis
    result_nts = goea.run_study_nts(study_ids)
    print(
        'NS GO         p stu_ratio pop_ratio    p-uncorr bonferro fdr_bh   stu  '
    )
    for ntd in result_nts:
        if ntd.study_count != 0:
            continue
        significant = False
        # Check the corrected p-values in the same order: bonferroni, fdr_bh
        for field in ('p_bonferroni', 'p_fdr_bh'):
            if getattr(ntd, field) < 0.05:
                assert ntd.enrichment == 'p'
                significant = True
        if significant:
            print(runner.str_nt(ntd))
예제 #4
0
    def perform_gene_enrichment_analysis(self, metagene_matrix, method='fdr'):
        """
        Run a GO enrichment analysis per component and write the merged result
        to ``<cache_dir><prefix>_gea_all.tsv``.

        :param metagene_matrix: matrix whose second dimension is the number of
            components (only ``shape[1]`` is read here; it is also handed to
            ``self.ranked_genes_by_component``)
        :param method: multiple-test correction method for ``GOEnrichmentStudy``
        """
        n_comps = metagene_matrix.shape[1]

        # Download ontology and annotations, if necessary
        self.download_and_cache_resources()
        gene_ontology = obo_parser.GODag('../DownloadedResources/go-basic.obo')

        # Build the gene symbol -> {GO IDs} associations straight from the GAF
        # rows.  A symbol can occur on many rows; every row contributes its
        # GO_ID so that no annotation is dropped.
        associations = {}
        with gzip.open('../DownloadedResources/goa_human.gaf.gz', 'rt') as gaf:
            for entry in GOA.gafiterator(gaf):
                symbol = entry['DB_Object_Symbol']
                associations.setdefault(symbol, set()).add(str(entry['GO_ID']))

        # Our population is the set of genes we are analysing
        population = self.gene_symbols()
        print("We have %d genes in our population" % len(population))

        gea = GOEnrichmentStudy(population,
                                associations,
                                gene_ontology,
                                propagate_counts=True,
                                alpha=0.05,
                                methods=[method])
        gea_results_by_component = {}
        rankings = self.ranked_genes_by_component(metagene_matrix)
        for ci in range(n_comps):
            study_genes = rankings[ci]
            print('\nComp. %d: %s...' % (ci, str(study_genes[:10])))
            gea_results_by_component[ci] = gea.run_study(study_genes)

        # Get results into a dataframe per component; components for which the
        # helper returns None are left out.
        gea_results_df_by_component = []
        for ci in range(n_comps):
            ge_df = self._perform_gene_enrichment_analysis_one_component(
                ci, gea_results_by_component, gea)
            if ge_df is not None:
                gea_results_df_by_component.append(ge_df)

        # Merge the per-component dataframes into a single one.
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # replacement but rejects an empty list, hence the guard.
        if gea_results_df_by_component:
            gea_all_sig_results_df = pd.concat(gea_results_df_by_component)
        else:
            gea_all_sig_results_df = pd.DataFrame()

        gea_all_sig_results_df.to_csv(self.cache_dir +
                                      '%s_gea_all.tsv' % self.prefix,
                                      sep='\t')
예제 #5
0
class GoEnrich():
    """GO-slim enrichment helper built on the yeast evaluation reference files."""

    # Gene set analysed when measure_enrichment() is called without `gene_set`
    _DEFAULT_GENE_SET = ('YML106W', 'YKL135C', 'YDR516C', 'YLR420W', 'YNL111C',
                         'YHR007C', 'YLR014C', 'YKL216W', 'YNL078W', 'YJR005W',
                         'YJL130C')

    def __init__(self):
        """Load ontology, background genes and associations; build the study."""
        obodag = GODag("../Data/evaluation_reference/goslim_yeast.obo")
        # `with` closes the gene-list file once it has been read
        with open('../Data/evaluation_reference/gene_list.txt') as fin:
            background = [line.strip() for line in fin]
        geneid2gos_yeast = read_associations('../Data/evaluation_reference/geneid2gos_yeast.txt')

        self.goeaobj = GOEnrichmentStudy(
            background,
            geneid2gos_yeast,
            obodag,
            # spelled correctly so the option is actually applied; a
            # misspelled keyword is swallowed by the constructor's **kws
            propagate_counts=False,
            alpha=0.05,
            methods=['fdr_bh'])

    def measure_enrichment(self, gene_set=None, run_name='base', cluster_id=1):
        """
        Run the enrichment study for one gene set and write the results.

        :param gene_set: iterable of gene IDs to analyse; when omitted, the
            built-in default gene set is used
        :param run_name: first part of the output file name
        :param cluster_id: second part of the output file name
        :return: None; results go to ``../Results/<run_name>_<cluster_id>.txt``
        """
        gene_ids = list(self._DEFAULT_GENE_SET if gene_set is None else gene_set)

        goea_results_all = self.goeaobj.run_study(gene_ids)

        # All results are written, not only the significant ones
        out_path = "../Results/" + run_name + "_" + str(cluster_id) + ".txt"
        self.goeaobj.wr_txt(out_path, goea_results_all)
예제 #6
0
    def __init__(self, dir, params):
        """
        Set up logging, disease associations, the PPI network, the averaged
        model weights and the GO enrichment study.

        Args:
            dir: experiment directory, forwarded to the parent constructor
            params: experiment parameters, forwarded to the parent constructor
        """
        super().__init__(dir, params)

        log_file = os.path.join(self.dir, 'experiment.log')
        set_logger(log_file, level=logging.INFO, console=True)

        logging.info("Loading disease associations...")
        self.diseases_dict = load_diseases(
            self.params["associations_path"],
            self.params["disease_subset"],
            exclude_splits=['none'],
        )

        logging.info("Loading network...")
        self.network = Network(self.params["ppi_network"])
        degree_by_node = dict(self.network.nx.degree())
        self.degrees = np.array(list(degree_by_node.values()))

        logging.info("Loading weights...")
        weights_file = os.path.join(params["model_path"], "models", "models.tar")
        with open(weights_file, "rb") as f:
            split_to_model = pickle.load(f)

        # Average the 'ci_weight' entry over all loaded models, then scale by
        # the square root of the node degrees
        per_model_weights = [model['ci_weight'][0, 0].numpy()
                             for model in split_to_model.values()]
        self.ci_weights = np.mean(per_model_weights, axis=0)
        self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

        logging.info("Loading enrichment study...")
        geneid2go = read_ncbi_gene2go("data/go/gene2go.txt", taxids=[9606])
        obodag = GODag("data/go/go-basic.obo")
        population = self.network.get_names()
        self.go_study = GOEnrichmentStudy(population, geneid2go, obodag,
                                          propagate_counts=True,
                                          alpha=0.05,
                                          methods=['fdr_bh'])
예제 #7
0
def check_group_enrichment(tested_gene_file_name, total_gene_file_name):
    """Run a GO enrichment of the tested genes against the total gene list.

    Downloads the ontology and the association file when they are missing,
    keeps the terms with ``p_fdr_bh <= 0.05`` and hands one summary row to
    ``print_to_excel``.
    """
    total_gene_list = load_gene_list(total_gene_file_name)
    tested_gene = load_gene_list(tested_gene_file_name)

    obo_path = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(obo_path):
        download(constants.GO_OBO_URL, constants.GO_DIR)
    obo_dag = GODag(obo_path)

    assoc_path = os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(assoc_path):
        download(constants.GO_ASSOCIATION_GENE2GEO_URL, constants.GO_DIR)
        # The download is gzip-compressed: unpack it next to the archive
        archive_path = os.path.join(
            constants.GO_DIR,
            os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL))
        with gzip.open(archive_path, 'rb') as f_in:
            with open(assoc_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    assoc = read_ncbi_gene2go(assoc_path, no_top=True)

    population_ids = [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)]
    g = GOEnrichmentStudy(population_ids, assoc, obo_dag,
                          methods=["bonferroni", "fdr_bh"])
    study_ids = [int(cur) for cur in ensembl2entrez_convertor(tested_gene)]
    g_res = g.run_study(study_ids)

    GO_results = [
        (rec.NS, rec.GO, rec.goterm.name, rec.p_uncorrected, rec.p_fdr_bh)
        for rec in g_res
        if rec.p_fdr_bh <= 0.05
    ]
    if GO_results:
        # Transpose the records into one sequence per column
        go_ns, go_terms, go_names, uncorrected_pvals, fdrs = zip(*GO_results)
    else:
        go_ns, go_terms, go_names, uncorrected_pvals, fdrs = [], [], [], [], []

    join_lines = "\r\n".join
    summary_row = (
        join_lines(e2g_convertor(tested_gene)),
        join_lines(go_ns),
        join_lines(go_terms),
        join_lines(go_names),
        join_lines(map(str, uncorrected_pvals)),
        join_lines(map(str, fdrs)),
    )
    print_to_excel([summary_row], tested_gene_file_name, total_gene_file_name)
예제 #8
0
def test_goea():
    """Test GOEA with method, fdr: run the study and print its summary."""
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Read the ID lists inside `with` blocks so the file handles are closed
    with open("../data/population") as fin:
        popul_ids = [line.rstrip() for line in fin]
    with open("../data/study") as fin:
        study_ids = [line.rstrip() for line in fin]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=['fdr'])
    goea_results = goeaobj.run_study(study_ids)
    goeaobj.print_summary(goea_results)
예제 #9
0
def get_goea_results(method="fdr_bh"):
    """Run a GOEA on the small GO-slim test data and return the results.

    Args:
        method: multiple-test correction method, passed both to the
            GOEnrichmentStudy constructor and to run_study().

    Returns:
        The value returned by ``GOEnrichmentStudy.run_study`` for the study IDs.
    """
    root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    obo_fin = os.path.join(root_dir, "goslim_generic.obo")
    obo_dag = GODag(obo_fin)
    assoc = read_associations(os.path.join(root_dir, "slim_association"), no_top=True)
    # `with` closes each ID file as soon as it has been read
    with open(os.path.join(root_dir, "small_population")) as fin:
        popul_ids = [line.rstrip() for line in fin]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=[method])
    with open(os.path.join(root_dir, "small_study")) as fin:
        study_ids = [line.rstrip() for line in fin]
    goea_results = goeaobj.run_study(study_ids, methods=[method])
    return goea_results
예제 #10
0
def get_goea_results(method="fdr_bh"):
    """Get GOEA results for the small GO-slim data set.

    ``method`` is the multiple-test correction name; it is given to the
    GOEnrichmentStudy constructor and again to ``run_study``, whose return
    value is returned unchanged.
    """
    def _read_ids(path):
        # Return the right-stripped lines of `path`, closing the file afterwards
        with open(path) as fin:
            return [line.rstrip() for line in fin]

    root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    obo_dag = GODag(os.path.join(root_dir, "goslim_generic.obo"))
    assoc = read_associations(os.path.join(root_dir, "slim_association"), no_top=True)
    popul_ids = _read_ids(os.path.join(root_dir, "small_population"))
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=[method])
    study_ids = _read_ids(os.path.join(root_dir, "small_study"))
    return goeaobj.run_study(study_ids, methods=[method])
예제 #11
0
def test_i96():
    """Reproduce issue #96 (currently passes)."""
    # Attempting to duplicate: ValueError("All values in table must be nonnegative.
    # Study and population gene IDs
    genes_study = _get_geneids()
    genes_pop = GeneID2nt.keys()
    # Associations and ontology
    assc = get_assoc_ncbi_taxids([9606], loading_bar=None)
    dag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    study_obj = GOEnrichmentStudy(genes_pop, assc, dag, methods=['fdr_bh'])
    # Run the enrichment analysis; nothing is asserted on the results
    study_obj.run_study(genes_study)
예제 #12
0
def _get_pvals(pvalfnc_names, prt=sys.stdout):
    """Map each p-value function name to the result of ``_get_pval_uncorr``.

    One GOEnrichmentStudy is built per name in ``pvalfnc_names``; all of them
    share the same population, associations and ontology.
    """
    godag = GODag(download_go_basic_obo(prt=prt))
    pop_ids = GeneID2nt_mus.keys()
    gene2gos = get_assoc_ncbi_taxids([10090])  # 10090: mouse study
    study_ids = _get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx", prt)

    def _pvals_for(pvalcalc):
        # Build a study for one p-value function and compute its p-values
        study = GOEnrichmentStudy(
            pop_ids, gene2gos, godag,
            propagate_counts=False, alpha=0.05, methods=None, pvalcalc=pvalcalc,
        )
        return study._get_pval_uncorr(study_ids, prt)

    return {name: _pvals_for(name) for name in pvalfnc_names}
예제 #13
0
def run_bonferroni(log):
    """Run a Gene Ontology Enrichment Analysis with Bonferroni correction.

    Args:
        log: accepted for the caller's convenience; not used in this function.

    Returns:
        (results, goea): the value returned by ``run_study`` and the
        GOEnrichmentStudy object that produced it.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # `with` closes the ID files once they have been read
    with open("../data/population") as fin:
        popul_ids = [line.rstrip() for line in fin]
    with open("../data/study") as fin:
        study_ids = [line.rstrip() for line in fin]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, obo_dag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
예제 #14
0
def run_bonferroni():
    """Run a Gene Ontology Enrichment Analysis with Bonferroni correction.

    Returns:
        (results, goea): the value returned by ``run_study`` and the
        GOEnrichmentStudy object that produced it.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    assoc = read_associations(os.path.join(REPO, "data/association"), no_top=True)
    # `with` closes the ID files once they have been read
    with open(os.path.join(REPO, "data/population")) as fin:
        popul_ids = [line.rstrip() for line in fin]
    with open(os.path.join(REPO, "data/study")) as fin:
        study_ids = [line.rstrip() for line in fin]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, godag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
def _get_results(godag, propagate_counts, relationships, prt=sys.stdout):
    """Run a GOEA on the mouse study genes and return the results.

    ``propagate_counts`` and ``relationships`` are forwarded to the
    GOEnrichmentStudy constructor; ``prt`` is forwarded to ``run_study``.
    """
    population = set(GeneID2nt_mus.keys())
    gene2gos = get_assoc_ncbi_taxids([10090], loading_bar=None)  # 10090: mouse study
    study = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")
    goea_kws = {
        'propagate_counts': propagate_counts,
        'relationships': relationships,
        'alpha': 0.05,
        'methods': ['fdr_bh'],
    }
    goea = GOEnrichmentStudy(population, gene2gos, godag, **goea_kws)
    return goea.run_study(study, prt=prt)
def _get_results(godag, propagate_counts, relationships, prt=sys.stdout):
    """Build a GOEnrichmentStudy for the mouse data and return its run_study() output."""
    mouse_taxid = 10090
    pop_ids = set(GeneID2nt_mus.keys())
    id2gos = get_assoc_ncbi_taxids([mouse_taxid], loading_bar=None)
    study_ids = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")
    # Positional inputs: population, associations, ontology
    study_inputs = (pop_ids, id2gos, godag)
    study_obj = GOEnrichmentStudy(*study_inputs,
                                  propagate_counts=propagate_counts,
                                  relationships=relationships,
                                  alpha=0.05,
                                  methods=['fdr_bh'])
    results = study_obj.run_study(study_ids, prt=prt)
    return results
def run_bonferroni():
    """Run a Gene Ontology Enrichment Analysis with Bonferroni correction.

    Returns:
        (results, goea): the value returned by ``run_study`` and the
        GOEnrichmentStudy object that produced it.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    fin_assc = os.path.join(REPO, "data/association")
    assoc = read_associations(fin_assc, 'id2gos', no_top=True)
    # `with` closes the ID files once they have been read
    with open(os.path.join(REPO, "data/population")) as fin:
        popul_ids = [line.rstrip() for line in fin]
    with open(os.path.join(REPO, "data/study")) as fin:
        study_ids = [line.rstrip() for line in fin]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, godag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
예제 #18
0
    def go_enrichment_study(self):
        """Return the GOEnrichmentStudy, building and caching it on first use.

        The study is stored in ``self._go_enrichment_study``; later calls
        return the cached object without touching the annotation file.
        """
        if self._go_enrichment_study is not None:
            return self._go_enrichment_study

        # Build the gene symbol -> {GO IDs} associations straight from the GAF
        # rows.  A symbol can occur on many rows; every row contributes its
        # GO_ID so that no annotation is dropped.
        associations = {}
        with gzip.open('../DownloadedResources/goa_human.gaf.gz',
                       'rt') as gaf:
            for entry in GOA.gafiterator(gaf):
                symbol = entry['DB_Object_Symbol']
                associations.setdefault(symbol, set()).add(str(entry['GO_ID']))

        # Our population is the set of genes we are analysing
        population = self.gene_symbols()
        print("We have %d genes in our population" % len(population))

        self._go_enrichment_study = \
            GOEnrichmentStudy(population, associations, self._gene_ontology,
                              propagate_counts=True,
                              alpha=0.01,
                              methods=[self.method])
        return self._go_enrichment_study
예제 #19
0
def get_goeaobj(methods=None):
    """Build a GOEnrichmentStudy from the test data.

    Args:
        methods: multiple-test correction methods passed to GOEnrichmentStudy.

    Returns:
        The configured GOEnrichmentStudy object.
    """
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # `with` closes the population file once it has been read
    with open("../data/population") as fin:
        popul_ids = [line.rstrip() for line in fin]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
예제 #20
0
def get_goeaobj(methods=None):
    """Build a GOEnrichmentStudy from the GO-slim files under ROOT.

    Args:
        methods: multiple-test correction methods passed to GOEnrichmentStudy.

    Returns:
        The configured GOEnrichmentStudy object.
    """
    obo_dag = GODag(ROOT + "goslim_generic.obo")
    assoc = read_associations(ROOT + "slim_association", no_top=True)
    # `with` closes the population file once it has been read
    with open(ROOT + "small_population") as fin:
        popul_ids = [line.rstrip() for line in fin]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
예제 #21
0
def _get_pvals(pvalfnc_names, prt=sys.stdout):
    """Return {p-value function name: result of ``_get_pval_uncorr``}.

    A separate GOEnrichmentStudy is created for every name in
    ``pvalfnc_names``; only the ``pvalcalc`` argument differs between them.
    """
    mouse_taxid = 10090  # Mouse study
    godag = GODag(download_go_basic_obo(prt=prt))
    pop_ids = GeneID2nt_mus.keys()
    id2gos = get_assoc_ncbi_taxids([mouse_taxid])
    study_ids = _get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx", prt)
    # Settings shared by every study built below
    shared_kws = {'propagate_counts': False, 'alpha': 0.05, 'methods': None}
    name2pvals = {}
    for pvalcalc in pvalfnc_names:
        study = GOEnrichmentStudy(pop_ids, id2gos, godag,
                                  pvalcalc=pvalcalc, **shared_kws)
        name2pvals[pvalcalc] = study._get_pval_uncorr(study_ids, prt)
    return name2pvals
예제 #22
0
    def __init__(self, dir, params):
        """
        Constructor.

        Args:
            dir (string): directory of the experiment to be run
            params: experiment parameters, forwarded to the parent constructor
        """
        super().__init__(dir, params)

        # Set the logger
        log_file = os.path.join(self.dir, 'experiment.log')
        set_logger(log_file, level=logging.INFO, console=True)

        # Log title
        for title_line in ("Disease Protein Prediction",
                           "Sabri Eyuboglu  -- SNAP Group",
                           "======================================"):
            logging.info(title_line)

        logging.info("Loading Disease Associations...")
        self.diseases_dict = load_diseases(
            self.params["associations_path"],
            self.params["disease_subset"],
            exclude_splits=['none'],
        )

        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"])

        logging.info("Loading enrichment study...")
        obodag = GODag(self.params["go_path"])
        geneid2go = read_ncbi_gene2go(self.params["gene_to_go_path"], taxids=[9606])
        population = self.network.get_names()
        self.enrichment_study = GOEnrichmentStudy(
            population, geneid2go, obodag,
            log=None,
            **self.params["enrichment_params"])

        logging.info("Loading predictions...")
        # One predictions table per method, read from <preds>/predictions.csv
        loaded_preds = {}
        for name, preds in self.params["method_to_preds"].items():
            csv_path = os.path.join(preds, "predictions.csv")
            loaded_preds[name] = pd.read_csv(csv_path, index_col=0)
        self.method_to_preds = loaded_preds

        # Reuse previously pickled outputs when present, otherwise start empty
        outputs_path = os.path.join(self.dir, "outputs.pkl")
        if not os.path.exists(outputs_path):
            self.outputs = {}
        else:
            logging.info("Loading outputs...")
            with open(outputs_path, 'rb') as f:
                self.outputs = pickle.load(f)
예제 #23
0
def _get_pvals(pvalfnc_names, prt=sys.stdout):
    """Return {p-value function name: result of ``get_pval_uncorr``}.

    The ontology, population and associations are loaded once and shared by
    the GOEnrichmentStudy built for each name in ``pvalfnc_names``.
    """
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), prt, loading_bar=None)
    pop_ids = GeneID2nt_mus.keys()
    id2gos = get_assoc_ncbi_taxids([10090], loading_bar=None)  # 10090: mouse study
    study_ids = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")

    def _uncorrected(pvalcalc):
        # Build the study for one p-value function and compute its p-values
        study = GOEnrichmentStudy(pop_ids,
                                  id2gos,
                                  godag,
                                  propagate_counts=False,
                                  alpha=0.05,
                                  methods=None,
                                  pvalcalc=pvalcalc)
        return study.get_pval_uncorr(study_ids, prt)

    return {pvalcalc: _uncorrected(pvalcalc) for pvalcalc in pvalfnc_names}
예제 #24
0
def get_goeaobj(methods=None):
    """Build a GOEnrichmentStudy from the small test data under REPO.

    Args:
        methods: multiple-test correction methods passed to GOEnrichmentStudy.

    Returns:
        The configured GOEnrichmentStudy object.
    """
    obo_fin = os.path.join(REPO, "go-basic.obo")
    obo_dag = get_godag(obo_fin, loading_bar=None)
    assoc = read_associations("{REPO}/tests/data/small_association".format(REPO=REPO), no_top=True)
    popul_fin = "{REPO}/tests/data/small_population".format(REPO=REPO)
    # `with` closes the population file once it has been read
    with open(popul_fin) as fin:
        popul_ids = [line.rstrip() for line in fin]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
예제 #25
0
 def get_goeaobj(self, pop_genes, assoc_geneid2gos):
     """Build a GOEnrichmentStudy for the given population genes and associations.

     The ontology, count propagation, alpha and correction method come from
     this object's attributes; logging is switched off with ``log=None``.
     """
     study_inputs = (pop_genes, assoc_geneid2gos, self.go_dag)
     study_options = {
         'propagate_counts': self.propagate_counts,
         'alpha': self.alpha,
         'methods': [self.method],
         'log': None,
     }
     return GOEnrichmentStudy(*study_inputs, **study_options)
예제 #26
0
def init_goea(**kws):
    """Initialize GODag and GOEnrichmentStudy.

    Keyword Args:
        methods: correction methods for the study; ``['not_bonferroni']``
            is used when the key is absent.

    Returns:
        (GOEnrichmentStudy, study_ids) tuple.
    """
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    fin_assc = ROOT + "association"
    assoc = read_associations(fin_assc, 'id2gos', no_top=True)
    # `with` closes the ID files once they have been read
    with open(ROOT + "population") as fin:
        popul_ids = [line.rstrip() for line in fin]
    methods = kws.get('methods', ['not_bonferroni'])
    with open(ROOT + "study") as fin:
        study_ids = [line.rstrip() for line in fin]
    return GOEnrichmentStudy(popul_ids, assoc, godag, methods=methods), study_ids
예제 #27
0
def init_goea(**kws):
    """Initialize GODag and GOEnrichmentStudy.

    Keyword Args:
        methods: correction methods for the study; ``['not_bonferroni']``
            is used when the key is absent.

    Returns:
        (GOEnrichmentStudy, study_ids) tuple.
    """
    obo_dag = GODag(ROOT + "go-basic.obo")
    assoc = read_associations(ROOT + "association", no_top=True)
    # `with` closes the ID files once they have been read
    with open(ROOT + "population") as fin:
        popul_ids = [line.rstrip() for line in fin]
    methods = kws.get('methods', ['not_bonferroni'])
    with open(ROOT + "study") as fin:
        study_ids = [line.rstrip() for line in fin]
    return GOEnrichmentStudy(popul_ids, assoc, obo_dag,
                             methods=methods), study_ids
예제 #28
0
 def _init_objgoea(self, pop, assoc):
     """Create the GOEnrichmentStudy for ``pop`` and ``assoc`` from the stored arguments.

     Counts are propagated unless ``self.args.no_propagate_counts`` is set;
     ``relationships`` is always False.
     """
     goea_kws = {
         'propagate_counts': not self.args.no_propagate_counts,
         'relationships': False,
         'alpha': self.args.alpha,
         'pvalcalc': self.args.pvalcalc,
         'methods': self.methods,
     }
     return GOEnrichmentStudy(pop, assoc, self.godag, **goea_kws)
예제 #29
0
def run_bonferroni(log):
    """Run a Gene Ontology Enrichment Analysis with Bonferroni correction.

    Args:
        log: accepted for the caller's convenience; not used in this function.

    Returns:
        (results, goea): the value returned by ``run_study`` and the
        GOEnrichmentStudy object that produced it.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # `with` closes the ID files once they have been read
    with open("../data/population") as fin:
        popul_ids = [line.rstrip() for line in fin]
    with open("../data/study") as fin:
        study_ids = [line.rstrip() for line in fin]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids,
                             assoc,
                             obo_dag,
                             alpha=0.05,
                             methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
예제 #30
0
def test_i96():
    """Reproduce issue #96 (currently passes)."""
    # Attempting to duplicate: ValueError("All values in table must be nonnegative.
    print('CWD', os.getcwd())
    # Study and population gene IDs
    genes_study = _get_geneids()
    genes_pop = GENEID2NT.keys()

    # Download (if needed) and read the human gene2go associations
    print(os.getcwd())
    gene2go_path = os.path.join(REPO, 'gene2go')
    dnld_ncbi_gene_file(gene2go_path, loading_bar=None)
    id2gos = read_ncbi_gene2go(gene2go_path, [9606])

    dag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    study_obj = GOEnrichmentStudy(genes_pop, id2gos, dag, methods=['fdr_bh'])
    # Run the enrichment analysis; nothing is asserted on the results
    study_obj.run_study(genes_study)
예제 #31
0
    def create_enrichment_study(self):
        """Build and return a GOEnrichmentStudy from this object's attributes.

        Uses ``self.background`` as the population, ``self.geneid2gos_human``
        as the gene/GO associations, ``self.obodag`` as the ontology and
        ``self.alpha`` as the significance cut-off, with 'fdr_bh' correction
        and no count propagation.
        """
        return GOEnrichmentStudy(
            self.background,
            self.geneid2gos_human,
            self.obodag,
            propagate_counts=False,
            alpha=self.alpha,
            methods=['fdr_bh'],
        )
예제 #32
0
    def __init__(self):
        """Load ontology, background genes and associations; build the study."""
        obodag = GODag("../Data/evaluation_reference/goslim_yeast.obo")
        # `with` closes the gene-list file once it has been read
        with open('../Data/evaluation_reference/gene_list.txt') as fin:
            background = [line.strip() for line in fin]
        geneid2gos_yeast = read_associations('../Data/evaluation_reference/geneid2gos_yeast.txt')

        self.goeaobj = GOEnrichmentStudy(
            background,
            geneid2gos_yeast,
            obodag,
            # spelled correctly so the option is actually applied; a
            # misspelled keyword is swallowed by the constructor's **kws
            propagate_counts=False,
            alpha=0.05,
            methods=['fdr_bh'])
예제 #33
0
def get_objgoea(pop, assoc, args):
    """Create the GOEnrichmentStudy described by the parsed arguments.

    ``args`` supplies the ontology file (``obo``), the comma-separated
    correction methods (``method``), ``no_propagate_counts``, ``alpha`` and
    ``pvalcalc``.
    """
    godag = GODag(obo_file=args.obo)
    correction_methods = args.method.split(",")
    goea_kws = {
        'propagate_counts': not args.no_propagate_counts,
        'alpha': args.alpha,
        'pvalcalc': args.pvalcalc,
        'methods': correction_methods,
    }
    return GOEnrichmentStudy(pop, assoc, godag, **goea_kws)
예제 #34
0
def get_goeaobj(method, geneids_pop, taxid):
    """Load the ontology and associations, then return a GOEnrichmentStudy.

    ``method`` is the single correction method, ``geneids_pop`` the
    population and ``taxid`` selects the associations to load.
    """
    godag = get_godag()
    id2gos = get_assoc_ncbi_taxids([taxid])
    study_options = {'propagate_counts': False, 'alpha': 0.05, 'methods': [method]}
    # The DAG handed in here is also kept on the returned study object
    return GOEnrichmentStudy(geneids_pop, id2gos, godag, **study_options)
예제 #35
0
def test_i122():
    """Reproduce issue #122 (currently passes).

    Results with a study count of zero that are significant under either
    correction must carry the enrichment flag 'p'; those are printed.
    """
    runner = _Run(9606, 'gene2go', 'go-basic.obo')
    study_ids, population_ids = runner.get_genes_study_n_bg()

    # Result is the same whether 'fisher_scipy_stats' or 'fisher' is used
    study_kws = {'methods': ['bonferroni', 'fdr_bh'], 'pvalcalc': 'fisher_scipy_stats'}
    goea = GOEnrichmentStudy(population_ids, runner.gene2go, runner.godag, **study_kws)
    # Run GOEA Gene Ontology Enrichment Analysis
    result_nts = goea.run_study_nts(study_ids)
    print('NS GO         p stu_ratio pop_ratio    p-uncorr bonferro fdr_bh   stu  ')
    for ntd in result_nts:
        if ntd.study_count != 0:
            continue
        flagged = ntd.p_bonferroni < 0.05
        if flagged:
            assert ntd.enrichment == 'p'
        if ntd.p_fdr_bh < 0.05:
            assert ntd.enrichment == 'p'
            flagged = True
        if flagged:
            print(runner.str_nt(ntd))
예제 #36
0
def test_i96():
    """Regression check for issue #96 (currently passes).

    Attempts to reproduce:
    ValueError("All values in table must be nonnegative.
    """
    print('CWD', os.getcwd())

    # Study and population gene ids.
    study_ids = _get_geneids()
    population_ids = GENEID2NT.keys()

    print(os.getcwd())

    # Associations: download the gene2go file into REPO, then read taxid 9606.
    gene2go_path = os.path.join(REPO, 'gene2go')
    dnld_ncbi_gene_file(gene2go_path, loading_bar=None)
    gene2go = read_ncbi_gene2go(gene2go_path, [9606])

    # Ontology DAG loaded from the OBO file in REPO.
    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)

    goeaobj = GOEnrichmentStudy(population_ids, gene2go, godag, methods=['fdr_bh'])
    # The test passes as long as running the study does not raise.
    goeaobj.run_study(study_ids)
예제 #37
0
def get_goeaobj(method, geneids_pop, taxid):
    """Build a GOEnrichmentStudy using go-basic.obo from the working directory.

    Args:
        method: name of the single multiple-test correction method to use.
        geneids_pop: population (background) gene ids.
        taxid: taxid whose gene-to-GO associations are loaded.

    Returns:
        A GOEnrichmentStudy created with propagate_counts=False and alpha=0.05.
    """
    obo_path = os.path.join(os.getcwd(), "go-basic.obo")
    dag = get_godag(obo_path, loading_bar=None)
    geneid2gos = get_assoc_ncbi_taxids([taxid], loading_bar=None)
    # Only the study object is returned; per the original note the DAG is
    # also reachable from it (goeaobj.godag).
    return GOEnrichmentStudy(
        geneids_pop,
        geneid2gos,
        dag,
        propagate_counts=False,
        alpha=0.05,
        methods=[method],
    )
예제 #38
0
def return_enrichment_study_obj(gaf_taxfiltered):
    """Create a GO enrichment study whose background is the given associations.

    Args:
        gaf_taxfiltered: mapping of gene id to GO associations; its keys are
            used as the population and the mapping itself as the associations.

    Returns:
        A GOEnrichmentStudy using alpha=0.15 and the 'fdr_bh' correction,
        with count propagation turned off.
    """
    # The ontology file lives under the configured data directory.
    obo_path = config_utils.datadir + "/GOData/go-basic.obo"
    ontology = GODag(obo_path)

    population = gaf_taxfiltered.keys()
    return GOEnrichmentStudy(
        population,
        gaf_taxfiltered,
        ontology,
        propagate_counts=False,
        alpha=0.15,
        methods=['fdr_bh'],
    )
예제 #39
0
def get_goeaobj(method, geneids_pop, taxid):
    """Build a GOEnrichmentStudy, downloading go-basic.obo first if needed.

    Args:
        method: name of the single multiple-test correction method to use.
        geneids_pop: population (background) gene ids.
        taxid: taxid whose gene-to-GO associations are loaded.

    Returns:
        A GOEnrichmentStudy created with propagate_counts=False and alpha=0.05.
    """
    fin_obo = "go-basic.obo"
    if not os.path.isfile(fin_obo):
        # wget.download() expects a bare URL; the original passed the shell
        # command word "wget " as part of it. Writing to `out=fin_obo` makes
        # sure the file lands exactly where it is read below.
        wget.download("http://geneontology.org/ontology/go-basic.obo", out=fin_obo)
    obo_dag = GODag(fin_obo)
    assoc_geneid2gos = get_assoc_ncbi_taxids([taxid])
    goeaobj = GOEnrichmentStudy(geneids_pop,
                                assoc_geneid2gos,
                                obo_dag,
                                propagate_counts=False,
                                alpha=0.05,
                                methods=[method])
    return goeaobj
예제 #40
0
    if not args.compare:  # sanity check
        if len(pop) < len(study):
            exit("\nERROR: The study file contains more elements than the population file. "
                 "Please check that the study file is a subset of the population file.\n")
        # check the fraction of genomic ids that overlap between study
        # and population
        overlap = float(len(study & pop)) / len(study)
        if 0.7 < overlap < 0.95:
            sys.stderr.write("\nWARNING: only {} fraction of genes/proteins in study are found in "
                             "the population  background.\n\n".format(overlap))
        if overlap <= 0.7:
            exit("\nERROR: only {} of genes/proteins in the study are found in the "
                 "background population. Please check.\n".format(overlap))

    assoc = read_associations(assoc_fn)

    methods = ["bonferroni", "sidak", "holm"]
    if args.fdr:
        methods.append("fdr")

    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop, assoc, obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          methods=methods)
    results = g.run_study(study)
    g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval)


예제 #41
0
 def _ns2o(pop, ns2assoc, godag, propagate_counts, alpha, methods, **kws):
     """Return one GOEnrichmentStudy per key of ns2assoc.

     Each study is built from that key's associations and is given the key
     as its ``name``; keys are processed in sorted order. Any extra keyword
     arguments are forwarded unchanged to GOEnrichmentStudy.
     """
     ns2goea = {}
     for ns, assoc in sorted(ns2assoc.items()):
         ns2goea[ns] = GOEnrichmentStudy(
             pop, assoc, godag, propagate_counts, alpha, methods, name=ns, **kws)
     return ns2goea
예제 #42
0
# Data will be stored in this variable
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import goatools
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.associations import read_ncbi_gene2go
from goatools.test_data.genes_NCBI_10090_ProteinCoding import GeneID2nt as GeneID2nt_mus
from goatools.go_enrichment import GOEnrichmentStudy

# Download the ontology and the NCBI gene2go association file, then load them
# from the file names the download helpers return.
obo_fname = download_go_basic_obo()
gene2go = download_ncbi_associations()
obodag = GODag(obo_fname)
geneid2gos_mouse = read_ncbi_gene2go(gene2go, taxids=[10090])

geneid2symbol = {}

print("{N:,} annotated mouse genes".format(N=len(geneid2gos_mouse)))
# A dict keys view has no .head(); show the first few gene ids instead.
print(list(GeneID2nt_mus)[:5])

goeaobj = GOEnrichmentStudy(
    GeneID2nt_mus.keys(),  # population: mouse protein-coding gene ids
    geneid2gos_mouse,  # geneid/GO associations
    obodag,  # ontologies
    propagate_counts=False,
    alpha=0.05,  # significance cut-off
    methods=['fdr_bh'])  # multiple-test correction method
예제 #43
0
        if 0.7 < overlap < 0.95:
            sys.stderr.write("\nWARNING: only {} fraction of genes/proteins in study are found in "
                             "the population  background.\n\n".format(overlap))
        if overlap <= 0.7:
            exit("\nERROR: only {} of genes/proteins in the study are found in the "
                 "background population. Please check.\n".format(overlap))

    assoc = read_associations(assoc_fn)

    methods = args.method.split(",")

    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop, assoc, obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          pvalcalc=args.pvalcalc,
                          methods=methods)
    results = g.run_study(study)
    if args.outfile is None:
        g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval)
    else:
        # Users can print to both tab-separated file and xlsx file in one run.
        outfiles = args.outfile.split(",")
        prt_if = None # Print all values
        if args.pval is not None:
            # Only print out when uncorrected p-value < this value.
            prt_if = lambda nt: nt.p_uncorrected < args.pval
        for outfile in outfiles:
            if outfile.endswith(".xlsx"):
                g.wr_xlsx(outfile, results, prt_if=prt_if)
elif snakemake.wildcards.state_type == 'Enhancer':
    min_dist = 5000
    max_dist = 50000
else:
    sys.exit(-1)
# Group gene ids by cluster, keeping only rows whose last column (presumably a
# distance -- confirm against the input format) lies within
# [min_dist, max_dist]. Every retained gene also joins the background set.
with open(snakemake.input.clusters) as f:
    for line in f:
        cols = line.strip().split()
        if not cols:
            # Tolerate blank lines instead of failing with IndexError.
            continue
        cluster = chr(int(cols[3]) + 65)  # cluster index -> letter (0 -> 'A')
        distance = int(cols[-1])
        if min_dist <= distance <= max_dist:
            genes[cluster].add(cols[7])
            background.add(cols[7])

obodag = GODag("go-basic.obo")
id2go = read_associations("sym2go.txt")
goeaobj = GOEnrichmentStudy(background, id2go, obodag, propagate_counts=False, alpha=0.05, methods=['fdr_bh'])

# Write one section per cluster. The context manager guarantees the report is
# flushed and closed even if an enrichment run raises.
with open(snakemake.output.txt, 'w') as outfile:
    for cluster, geneids in sorted(genes.items()):
        outfile.write("Cluster {}\n".format(cluster))
        goea_results_all = goeaobj.run_study(geneids)
        # Keep results with FDR below 0.2, sorted by (fdr, name, enrichment).
        hits = sorted(
            (r.p_fdr_bh, r.name, r.enrichment)
            for r in goea_results_all
            if r.p_fdr_bh < 0.2
        )
        for fdr, name, enrichment in hits:
            outfile.write("\t{}\t{}\t{}\n".format(name, fdr, enrichment))
        outfile.write("\n")