Python download_ncbi_associations 예제들, goatools.base.download_ncbi_associations Python 예제들

예제 #1

0

파일 보기

파일: test_dnlds.py 프로젝트: toluadeyelu/goatools

def test_dnlds():
    """Test downloads of ontologies and NCBI associations.

    Each file is downloaded, deleted, and downloaded a second time; the test
    passes when the file exists on disk after the second download.
    """
    cwd = os.getcwd()

    # Test downloads of ontologies.
    file_obo = os.path.join(cwd, "go-basic.obo")
    download_go_basic_obo(file_obo, loading_bar=None)
    # Delete with os.remove instead of shelling out to `rm -f`: the shell
    # command is POSIX-only and mis-parses paths that contain spaces or shell
    # metacharacters (the path is derived from the current directory).
    if os.path.isfile(file_obo):
        os.remove(file_obo)
    download_go_basic_obo(file_obo, loading_bar=None)
    assert os.path.isfile(file_obo)

    # Test downloading of associations from NCBI.
    file_assc = os.path.join(cwd, "gene2go")
    download_ncbi_associations(file_assc, loading_bar=None)
    if os.path.isfile(file_assc):
        os.remove(file_assc)
    download_ncbi_associations(file_assc, loading_bar=None)
    assert os.path.isfile(file_assc)

예제 #2

0

파일 보기

def test_i147_all_taxids():
    """Load gene2go annotations for all taxids and print per-namespace counts."""
    # Ontology source: http://geneontology.org/ontology/go-basic.obo
    download_go_basic_obo()
    # Association source: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
    gene2go_path = download_ncbi_associations()

    go_dag = GODag("go-basic.obo")

    # taxids=True asks the reader for the annotations of every species.
    reader = Gene2GoReader(gene2go_path, godag=go_dag, taxids=True)

    # Mapping of namespace (BP, MF, CC) to an association dict, where each
    # association dict maps an NCBI GeneID to the set of GO IDs for that gene.
    assoc_by_namespace = reader.get_ns2assc()

    # NOTE(review): the message says "mouse" although all taxids were loaded;
    # the text is kept unchanged here -- confirm the intended wording.
    count_template = "{NS} {N:,} annotated mouse genes"
    for namespace, gene2gos in assoc_by_namespace.items():
        print(count_template.format(NS=namespace, N=len(gene2gos)))

예제 #3

0

파일 보기

파일: layers.py 프로젝트: NYXFLOWER/PoSEPath

    def __GO_enrich__(self):
        """Create the GO enrichment study and store it as ``self.goeaobj``."""
        obo_path = "go-basic.obo"
        # Skip the ontology download when the file is already present.
        if not os.path.exists(obo_path):
            download_go_basic_obo()
        ontology = GODag(obo_path)

        # Read NCBI's gene2go, restricted to taxid 9606.
        gene2go_path = download_ncbi_associations()
        annotations = Gene2GoReader(gene2go_path, taxids=[9606])

        # Mapping of namespace (BP, MF, CC) to an association dict, where each
        # association dict maps an NCBI GeneID to the set of GO IDs for it.
        assoc_by_namespace = annotations.get_ns2assc()

        study_options = dict(
            propagate_counts=False,
            alpha=0.05,  # default significance cut-off
            methods=['fdr_bh'],  # default multiple-test correction method
        )
        self.goeaobj = GOEnrichmentStudyNS(
            GeneID2nt_hum.keys(),  # human protein-coding genes (population)
            assoc_by_namespace,  # geneID/GO associations
            ontology,  # ontologies
            **study_options)

예제 #4

0

파일 보기

def test_i147_all_taxids():
    """Check that associations agree across readers built with different taxid filters."""
    # Ontology source: http://geneontology.org/ontology/go-basic.obo
    download_go_basic_obo()
    # Association source: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
    gene2go_path = download_ncbi_associations()

    go_dag = GODag("go-basic.obo")

    # Three readers over the same gene2go file: all species, taxid 10090 only,
    # and taxids 10090 + 9606 together.
    reader_all = Gene2GoReader(gene2go_path, godag=go_dag, taxids=True)
    reader_mmu = Gene2GoReader(gene2go_path, godag=go_dag, taxids=[10090])
    reader_mmuhsa = Gene2GoReader(gene2go_path, godag=go_dag, taxids=[10090, 9606])

    # Associations requested per reader (first argument is the taxid selector).
    mmu_from_all = _run_get_ns2assc(10090, reader_all)
    mmu_from_mmu = _run_get_ns2assc(10090, reader_mmu)
    all_from_mmuhsa = _run_get_ns2assc(True, reader_mmuhsa)
    mmu_from_mmuhsa = _run_get_ns2assc(10090, reader_mmuhsa)

    # The taxid-10090 associations must be identical whichever reader they
    # were extracted from.
    for namespace in ('BP', 'MF', 'CC'):
        reference = mmu_from_mmu[namespace]
        assert reference == mmu_from_all[namespace]
        assert reference == mmu_from_mmuhsa[namespace]
    _chk_mmuhsa_all(reader_mmuhsa, reader_all, all_from_mmuhsa)

예제 #5

0

파일 보기

파일: enrichment.py 프로젝트: iamjli/GO_enrichment

    def load_ontologies_and_associations(self):
        """Load the GO DAG and the human gene-to-GO associations.

        Returns:
            tuple: ``(obodag, geneid2gos_human)`` -- the ``GODag`` built from
            the downloaded OBO file, and the result of ``read_ncbi_gene2go``
            restricted to taxid 9606.
        """
        # print() with a single argument behaves the same on Python 2 and 3;
        # the former print statement is a SyntaxError on Python 3.
        print("---LOADING ONTOLOGIES AND ASSOCIATIONS---")
        # Check if files exist and download if not
        obo_fname = download_go_basic_obo()
        gene2go = download_ncbi_associations()

        # Load ontologies and associations. Read the associations from the
        # path returned by the download call rather than a hard-coded
        # "gene2go" literal, so the two can never point at different files.
        obodag = GODag(obo_fname)
        geneid2gos_human = read_ncbi_gene2go(gene2go, taxids=[9606])
        print("{N:,} annotated human genes".format(N=len(geneid2gos_human)))

        return obodag, geneid2gos_human

예제 #6

0

파일 보기

    def __init__(
        self,
        work_dir: str = '.',
        clean_work_dir: bool = False,
        organism: str = 'human',
        study_parameters: Dict[str, Union[int, float, str, List, Dict]] = {
            'propagate_counts': False,
            'alpha': 0.05,
            'methods': ['fdr_bh']
        }
    ) -> GOEngine:
        """Create a GOEngine for running GO enrichment analysis with GOATOOLS.

        Args:
            work_dir (str, optional): Existing directory into which
                'go-basic.obo' and 'gene2go' are downloaded. Defaults to the
                current working directory.
            clean_work_dir (bool, optional): Stored as ``_clean_work_dir``;
                presumably controls removal of downloaded data at teardown
                (not used in this method). Defaults to False.
            organism (str, optional): Either 'human' or 'mouse'. Defaults to
                'human'.
            study_parameters (Dict[str, Union[int, float, str, List, Dict]], optional):
                Keyword arguments forwarded to ``GOEnrichmentStudyNS``.
                Defaults to {'propagate_counts': False, 'alpha': 0.05,
                'methods': ['fdr_bh']}. The default dict is shared between
                calls; it is only unpacked here, never mutated.

        Raises:
            ValueError: If ``work_dir`` does not exist or ``organism`` is not
                one of the two supported values.
        """
        print("Creating a GO Engine ...")
        if not os.path.exists(work_dir):
            raise ValueError(
                f"The provided work path: {work_dir} does not exist!!!")
        self.work_dir = work_dir
        if organism not in ('human', 'mouse'):
            raise ValueError(
                f"The provided organism: {organism} is not support, current engine mainly work with human and moues only"
            )

        print("\t --> Downloading data ...")
        ontology_path = download_go_basic_obo(
            os.path.join(work_dir, 'go-basic.obo'))
        associations_path = download_ncbi_associations(
            os.path.join(work_dir, 'gene2go'))

        # Parse the GO terms and build the enrichment object.
        print(
            "\t --> parsing the data and intializing the base GOEA object...")
        ontology = GODag(ontology_path)
        # The two organisms differ only in the taxid used to filter gene2go.
        taxid = 9606 if organism == 'human' else 10090
        # NOTE(review): the population is gene2iden_human for BOTH organisms,
        # exactly as before -- confirm whether the mouse case should use a
        # mouse gene table instead.
        self._goea_obj = GOEnrichmentStudyNS(
            gene2iden_human.keys(),
            Gene2GoReader(associations_path, taxids=[taxid]).get_ns2assc(),
            ontology, **study_parameters)
        self._clean_work_dir = clean_work_dir
        self._gene_ids = None

예제 #7

0

파일 보기

def prep_goea(taxid=9606,
              prop_counts=True,
              alpha=0.05,
              method='fdr_bh',
              ref_list=None):
    """Download the GO data and build a GO enrichment study object.

    Args:
        taxid: NCBI taxonomy ID used to filter gene2go (9606 is Homo sapiens).
        prop_counts: Forwarded as ``propagate_counts``.
        alpha: Forwarded as ``alpha`` (significance cut-off).
        method: Single correction method name, forwarded as ``methods=[method]``.
        ref_list: Background gene IDs. When ``None``, the ``GeneID`` column of
            '../data/df_human_geneinfo.csv' is used.

    Returns:
        tuple: ``(goeaobj, symbol2id)`` where ``symbol2id`` maps the
        upper-cased ``Symbol`` column to the ``GeneID`` column of that CSV.
    """
    from goatools.anno.genetogo_reader import Gene2GoReader
    from goatools.base import download_go_basic_obo
    from goatools.base import download_ncbi_associations
    from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
    from goatools.obo_parser import GODag

    # Fetch the ontology and the associations, then load the ontology.
    download_go_basic_obo()
    gene2go_path = download_ncbi_associations()
    go_dag = GODag("go-basic.obo")

    # Namespace -> {GeneID: GO IDs} for the requested taxid.
    assoc_by_namespace = Gene2GoReader(gene2go_path,
                                       taxids=[taxid]).get_ns2assc()
    # NOTE(review): the message always says "human", whatever taxid is given.
    count_template = "{NS} {N:,} annotated human genes"
    for namespace, gene2gos in assoc_by_namespace.items():
        print(count_template.format(NS=namespace, N=len(gene2gos)))

    gene_info = pd.read_csv('../data/df_human_geneinfo.csv', index_col=0)

    # Without an explicit reference list, fall back to every gene in the CSV.
    if ref_list is None:
        ref_list = gene_info['GeneID'].to_list()

    study = GOEnrichmentStudyNS(ref_list,
                                assoc_by_namespace,
                                go_dag,
                                propagate_counts=prop_counts,
                                alpha=alpha,
                                methods=[method])

    # Symbol -> GeneID lookup, used by callers to translate gene symbols.
    upper_symbols = gene_info['Symbol'].str.upper()
    symbol2id = dict(zip(upper_symbols, gene_info['GeneID']))

    return study, symbol2id

예제 #8

0

파일 보기

파일: get_flat_go.py 프로젝트: dpellow/MLPROJ

def fetch_go_hierarchy():
    """Write gene/GO and GO/GO links for human annotations and return them.

    The GO OBO file and the gene2go association file are fetched into
    ``constants.GO_DIR`` when missing. One tab-separated line per link is
    written to a timestamped file in ``constants.OUTPUT_GLOBAL_DIR``.

    Returns:
        tuple: ``(genes, isa, relship)`` where
            ``genes`` is a list of ``(gene id as str, GO id)``,
            ``isa`` is a list of ``(child GO id, GO id)``, and
            ``relship`` is a list of ``(relationship type, related GO id, GO id)``.
    """
    obo_file_location = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(obo_file_location):
        wget.download(constants.GO_OBO_URL, obo_file_location)

    # 'relationship' is loaded as well so that relationship_rev is available.
    go = obo_parser.GODag(obo_file_location, optional_attrs=['relationship'])

    # print() with a single argument behaves the same on Python 2 and 3.
    print("Downloading gene-GO associations")
    association_file_location = os.path.join(
        constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(association_file_location):
        association_file_location = download_ncbi_associations(
            association_file_location)

    print("Loading gene-GO associations")
    go2geneids_human = read_ncbi_gene2go(
        association_file_location, taxids=[9606], go2geneids=True)

    print("Writing out GO child-parent links")
    if not os.path.exists(constants.OUTPUT_GLOBAL_DIR):
        os.makedirs(constants.OUTPUT_GLOBAL_DIR)

    out_fname = "go_output_{}_{}.txt".format(constants.CANCER_TYPE, time.time())
    genes = []
    isa = []
    relship = []
    with open(os.path.join(constants.OUTPUT_GLOBAL_DIR, out_fname), 'w') as o:
        # Iterate over a snapshot of the keys: the lookup by entry.id below
        # could insert a key if the mapping is a defaultdict, which must not
        # happen while iterating the live dict on Python 3.
        for goid in list(go2geneids_human.keys()):
            # `in` replaces dict.has_key(), which no longer exists in Python 3.
            if goid not in go:
                print("GO obo file does not contain {}".format(goid))
                continue
            entry = go[goid]
            for gene in go2geneids_human[entry.id]:
                gene_link = (str(gene), entry.id)
                genes.append(gene_link)
                o.write("{}\t{}\t{}\n".format("genes", *gene_link))
            for child in entry.children:
                isa_link = (child.id, entry.id)
                isa.append(isa_link)
                o.write("{}\t{}\t{}\n".format("is a", *isa_link))
            for rtype, related_terms in entry.relationship_rev.items():
                for related in related_terms:
                    relship.append((rtype, related.id, entry.id))
                    o.write("{}\t{}\t{}\n".format(rtype, related.id, entry.id))

    return (genes, isa, relship)

예제 #9

0

파일 보기

파일: genes_to_GO_process.py 프로젝트: nmchaves/CS341_Code

def get_ensembl_ids(go_process_id, biomart_fpath):
    """Return the Ensembl IDs of the human genes annotated to a GO term.

    Args:
        go_process_id: GO ID looked up in the human GO-to-Entrez mapping.
        biomart_fpath: Passed to ``map_entrez_to_ensembl`` to build the
            Entrez-to-Ensembl lookup.

    Returns:
        list: One ``entrez_to_ensembl`` value per Entrez gene ID annotated to
        ``go_process_id``. The lookup is not guarded, so an Entrez ID missing
        from the mapping propagates whatever error that mapping raises.
    """
    entrez_to_ensembl = map_entrez_to_ensembl(biomart_fpath)

    gene2go = download_ncbi_associations()
    # taxids=[9606] means select only human.
    # TODO: ask Marinka if we should use EXP code for evidence
    #       (i.e. additionally pass evidence_set='EXP' to the call below).
    go_to_entrez_ids_human = read_ncbi_gene2go(gene2go, taxids=[9606], go2geneids=True)

    # Look up the caller-supplied GO ID. Previously the go_process_id argument
    # was ignored and a module-level GO_PROCESS_ID was used instead.
    entrez_ids = go_to_entrez_ids_human[go_process_id]
    # Entrez IDs are converted to str before the lookup, as before.
    ensembl_ids = [entrez_to_ensembl[str(ent_id)] for ent_id in entrez_ids]

    print("{N} GO terms associated with human NCBI Entrez GeneIDs".format(N=len(go_to_entrez_ids_human)))
    return ensembl_ids

예제 #10

0

파일 보기

파일: go_term_enrichment.py 프로젝트: Switham1/PromoterArchitecture

def dl_files(go_directory):
    """Download the ontology and association files into ``go_directory``.

    The process working directory is changed to ``go_directory`` and is not
    restored afterwards.

    Returns:
        tuple: ``(obo_fname, fin_gene2go)`` -- the values returned by
        ``download_go_basic_obo`` and ``download_ncbi_associations``.
    """
    # The download helpers are called without a path, so move into the
    # target directory first.
    os.chdir(go_directory)

    # Source: http://geneontology.org/ontology/go-basic.obo
    ontology_path = download_go_basic_obo()

    # Print the second line of the OBO file (treated as the version line).
    with open(ontology_path) as obo_handle:
        for version_line in islice(obo_handle, 1, 2):
            print(version_line)

    # gene2go annotation file
    associations_path = download_ncbi_associations()

    return ontology_path, associations_path

예제 #11

0

파일 보기

파일: goatools.py 프로젝트: dylansucich/schatz

# Data will be stored in this variable
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import goatools
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.associations import read_ncbi_gene2go
from goatools.test_data.genes_NCBI_10090_ProteinCoding import GeneID2nt as GeneID2nt_mus
from goatools.go_enrichment import GOEnrichmentStudy

# Fetch the ontology and the NCBI associations, then load both from the paths
# the download helpers returned (instead of repeating the file names).
obo_fname = download_go_basic_obo()
gene2go = download_ncbi_associations()
obodag = GODag(obo_fname)
geneid2gos_mouse = read_ncbi_gene2go(gene2go, taxids=[10090])

geneid2symbol = {}

print("{N:,} annotated mouse genes".format(N=len(geneid2gos_mouse)))
# dict.keys() has no .head() method (that is a pandas API), so the previous
# call raised AttributeError; show the first five gene IDs instead.
print(list(GeneID2nt_mus)[:5])

goeaobj = GOEnrichmentStudy(
    GeneID2nt_mus.keys(),  # List of mouse protein-coding genes
    geneid2gos_mouse,  # geneid/GO associations
    obodag,  # Ontologies
    propagate_counts=False,
    alpha=0.05,  # default significance cut-off
    methods=['fdr_bh'])  # default multiple-test correction method

예제 #12

0

파일 보기

파일: test_dnlds.py 프로젝트: brandoninvergo/goatools

def test_NCBI_assc():
    """Test downloading of associations from NCBI.

    The file is downloaded, removed, and downloaded again; the test passes
    when the file exists after the second download.
    """
    fdnld = download_ncbi_associations()
    # Remove with os.remove instead of shelling out to `rm -f`: the shell
    # command is POSIX-only and mis-parses paths that contain spaces or shell
    # metacharacters.
    if os.path.isfile(fdnld):
        os.remove(fdnld)
    fdnld = download_ncbi_associations()
    assert os.path.isfile(fdnld)

예제 #13

0

파일 보기

def plot_go_enrichment(coef_df, auc_vals, pheno_dict, args, mode='abs'):
    """Run a GO enrichment per row of ``coef_df`` and save a heatmap as SVG.

    For every row whose index label is not a ``RandomType``, a foreground
    gene set is selected from the row's coefficients according to ``mode``,
    a GO enrichment study is run against the background of all usable genes,
    and the log10 FDR-corrected p-values of the significantly enriched terms
    are collected. The collected values are drawn as a heatmap.

    Args:
        coef_df: DataFrame whose column labels are matched against the gene
            symbols in ``GENEID2NT``; one row per subgrouping.
        auc_vals: Not referenced in this function.
        pheno_dict: Not referenced in this function.
        args: Object providing ``go_dir``, ``expr_source``, ``cohort``,
            ``gene`` and ``classif``.
        mode: Foreground selection rule: 'abs', 'high', 'low' or 'bayes'.

    Returns:
        None. Returns early (without plotting) when no enriched GO term was
        found for any subgrouping.

    Raises:
        ValueError: If ``mode`` is not recognised. The check happens inside
            the row loop, so it only fires when at least one non-RandomType
            row is processed.
    """
    # Ontology and association files live in (and are downloaded to) go_dir.
    obo_fl = os.path.join(args.go_dir, "go-basic.obo")
    download_go_basic_obo(obo_fl)
    obodag = GODag(obo_fl)

    assoc_fl = os.path.join(args.go_dir, "gene2go")
    download_ncbi_associations(assoc_fl)
    objanno = Gene2GoReader(assoc_fl, taxids=[9606])
    ns2assoc = objanno.get_ns2assc()

    # Gene symbol -> NCBI gene ID; the background is every coefficient column
    # whose label has such an ID.
    ncbi_map = {info.Symbol: ncbi_id for ncbi_id, info in GENEID2NT.items()}
    use_genes = set(coef_df.columns) & set(ncbi_map)
    bgrd_ids = [ncbi_map[gn] for gn in use_genes]

    goeaobj = GOEnrichmentStudyNS(bgrd_ids,
                                  ns2assoc,
                                  obodag,
                                  propagate_counts=False,
                                  alpha=0.05,
                                  methods=['fdr_bh'])

    plot_dict = dict()
    use_gos = set()
    # Keep only the columns that could be mapped to an NCBI gene ID.
    coef_mat = coef_df.loc[:, [gene in use_genes for gene in coef_df.columns]]

    # Collapse columns that share the same level-0 label. In 'bayes' mode the
    # mean and standard deviation are kept separately and coef_mat itself is
    # left ungrouped (the loop below then only uses its row labels).
    if mode == 'bayes':
        coef_means = coef_mat.groupby(level=0, axis=1).mean()
        coef_stds = coef_mat.groupby(level=0, axis=1).std()
    else:
        coef_mat = coef_mat.groupby(level=0, axis=1).mean()

    for mtype, coefs in coef_mat.iterrows():
        if not isinstance(mtype, RandomType):
            # Each mode picks the foreground genes and the palette start
            # value (use_clr) that is used for the heatmap further down.
            if mode == 'abs':
                # Genes whose absolute coefficient is above the 95th
                # percentile of absolute coefficients.
                fgrd_ctf = coefs.abs().quantile(0.95)
                fgrd_genes = coefs.index[coefs.abs() > fgrd_ctf]
                use_clr = 3.17

            elif mode == 'high':
                # Genes above the 95th percentile.
                fgrd_ctf = coefs.quantile(0.95)
                fgrd_genes = coefs.index[coefs > fgrd_ctf]
                use_clr = 2.03
            elif mode == 'low':
                # Genes below the 5th percentile.
                fgrd_ctf = coefs.quantile(0.05)
                fgrd_genes = coefs.index[coefs < fgrd_ctf]
                use_clr = 1.03

            elif mode == 'bayes':
                # Genes whose absolute mean exceeds their standard deviation.
                gene_scrs = coef_means.loc[mtype].abs() - coef_stds.loc[mtype]
                fgrd_genes = gene_scrs.index[gene_scrs > 0]
                use_clr = 3.17

            else:
                raise ValueError(
                    "Unrecognized `mode` argument <{}>!".format(mode))

            fgrd_ids = [ncbi_map[gn] for gn in fgrd_genes]
            goea_out = goeaobj.run_study(fgrd_ids, prt=None)

            # Keep enriched ('e') terms with FDR-corrected p-value < 0.05,
            # stored as log10(p) keyed by the GO term name.
            plot_dict[mtype] = {
                rs.name: np.log10(rs.p_fdr_bh)
                for rs in goea_out
                if rs.enrichment == 'e' and rs.p_fdr_bh < 0.05
            }

    # Rows are GO term names, columns are subgroupings.
    plot_df = pd.DataFrame(plot_dict, columns=plot_dict.keys())
    if plot_df.shape[0] == 0:
        print("Could not find any enriched GO terms across {} "
              "subgroupings!".format(plot_df.shape[1]))
        return None

    # Figure size grows with the number of GO terms and subgroupings.
    fig, ax = plt.subplots(figsize=(4.7 + plot_df.shape[0] / 2.3,
                                    2 + plot_df.shape[1] / 5.3))

    # With more than two GO terms, order them by the leaf order of a
    # hierarchical clustering (cityblock distance, centroid linkage, missing
    # values treated as 0). Either way the frame is transposed so that GO
    # terms become the columns.
    if plot_df.shape[0] > 2:
        plot_df = plot_df.iloc[dendrogram(linkage(distance.pdist(
            plot_df.fillna(0.0), metric='cityblock'),
                                                  method='centroid'),
                                          no_plot=True)['leaves']].transpose()
    else:
        plot_df = plot_df.transpose()

    xlabs = [rs_nm for rs_nm in plot_df.columns]
    ylabs = [
        get_fancy_label(tuple(mtype.subtype_iter())[0][1])
        for mtype in plot_df.index
    ]

    # use_clr was assigned inside the row loop above; reaching this point
    # implies at least one row went through a mode branch.
    pval_cmap = sns.cubehelix_palette(start=use_clr,
                                      rot=0,
                                      dark=0,
                                      light=1,
                                      reverse=True,
                                      as_cmap=True)

    # NOTE(review): no ax= is passed here; presumably the heatmap lands on
    # the axes created above because they are the current axes -- confirm.
    sns.heatmap(plot_df,
                cmap=pval_cmap,
                vmin=-5,
                vmax=0,
                linewidths=0.23,
                linecolor='0.73',
                xticklabels=xlabs,
                yticklabels=ylabs)

    ax.set_xticklabels(xlabs, size=15, ha='right', rotation=31)
    ax.set_yticklabels(ylabs, size=9, ha='right', rotation=0)
    ax.set_xlim((plot_df.shape[1] / -83, plot_df.shape[1] * 1.009))
    ax.set_ylim((plot_df.shape[0] * 1.009, plot_df.shape[0] / -83))

    # plot_dir is not defined in this function; presumably a module-level
    # output directory -- verify against the rest of the file.
    plt.savefig(os.path.join(
        plot_dir, '__'.join([args.expr_source, args.cohort]),
        "{}_go-{}-enrichment_{}.svg".format(args.gene, mode, args.classif)),
                bbox_inches='tight',
                format='svg')

    plt.close()

예제 #14

0

파일 보기

파일: go.py 프로젝트: zpeng1989/pyproteome

def get_go_ids(go_ids, species='Homo sapiens'):
    '''
    Fetch all gene symbols associated with a list of gene ontology term IDs.

    Genes annotated to child terms of the given GO IDs are included.

    Parameters
    ----------
    go_ids : str or list of str
        A single GO ID or a list of GO IDs.
    species : str, optional
        Key into ``TAXA`` selecting the taxonomy ID. The default is
        'Homo sapiens'; the previous default literal was garbled
        ('H**o sapiens') and so could never pass the ``TAXA`` check below.

    Returns
    -------
    list of str
    '''
    assert species in TAXA

    if isinstance(go_ids, str):
        go_ids = [go_ids]

    # Single source of truth for the two data file locations.
    obo_path = 'db/go/go-basic.obo'
    gene2go_path = 'db/go/gene2go'
    download_go_basic_obo(obo_path)
    download_ncbi_associations(gene2go_path)

    taxid = TAXA[species]

    # goatools ships one gene table module per taxid, e.g.
    # goatools.test_data.genes_NCBI_<taxid>_All
    module_name = 'goatools.test_data.genes_NCBI_{TAXID}_All'.format(
        TAXID=taxid)
    GeneID2nt = importlib.import_module(module_name).GENEID2NT

    annotations = Gene2GoReader(gene2go_path, taxids=[taxid])

    # GO ID -> list of annotated gene IDs for the selected taxid.
    go2items = defaultdict(list)
    for assoc in annotations.taxid2asscs[taxid]:
        go2items[assoc.GO_ID].append(assoc.DB_ID)

    srchhelp = GoSearch(obo_path, go2items=go2items)

    # NOTE(review): 'go.log' is created/truncated but nothing is written to
    # it. The open() is kept so the on-disk side effect is unchanged --
    # confirm whether it can be removed.
    with open('go.log', 'w'):
        # Add children GOs
        gos_all = srchhelp.add_children_gos(go_ids)

        # Collect the gene IDs of the requested GOs and of their children.
        gene_ids = set()
        for go_items in (go_ids, gos_all):
            gene_ids.update(srchhelp.get_items(go_items))

    # Translate gene IDs to symbols, skipping IDs missing from the gene table.
    gene_records = (GeneID2nt.get(geneid) for geneid in gene_ids)
    return [nt.Symbol for nt in gene_records if nt is not None]

예제 #15

0

파일 보기

파일: GO.py 프로젝트: gmaline/BIODB_Project

def pullGOenrichment(inputFile, project):
    """Run a human GO enrichment for the genes in ``inputFile`` and write the results.

    Args:
        inputFile: Path to a CSV file whose first column is a gene ID and
            whose second column is a symbol. Rows with an empty first column
            are skipped.
        project: String appended to the "Data/go_enrichment" output prefix.
    """
    from collections import Counter

    GeneID2nt_hum = genes_NCBI_9606_ProteinCoding.GENEID2NT

    download_go_basic_obo()
    gene2go_path = download_ncbi_associations()
    go_dag = GODag("go-basic.obo")

    # Read NCBI's gene2go, restricted to taxid 9606. The result maps each
    # namespace (BP, MF, CC) to a dict of NCBI GeneID -> set of GO IDs.
    assoc_by_namespace = Gene2GoReader(gene2go_path,
                                       taxids=[9606]).get_ns2assc()

    count_template = "{NS} {N:,} annotated human genes"
    for namespace, gene2gos in assoc_by_namespace.items():
        print(count_template.format(NS=namespace, N=len(gene2gos)))

    print(len(GeneID2nt_hum))

    study = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(),  # human protein-coding genes (population)
        assoc_by_namespace,  # geneid/GO associations
        go_dag,  # ontologies
        propagate_counts=False,
        alpha=0.05,  # default significance cut-off
        methods=['fdr_bh'])  # default multiple-test correction method

    # Study set: gene ID -> symbol, read from the input CSV. The `with`
    # block closes the file, so no explicit close() is needed afterwards.
    geneid2symbol = {}
    with open(inputFile, 'r') as infile:
        for row in csv.reader(infile):
            geneid, symbol = row[0], row[1]
            if geneid:
                geneid2symbol[int(geneid)] = symbol

    results_all = study.run_study(geneid2symbol.keys())
    results_sig = [res for res in results_all if res.p_fdr_bh < 0.05]

    # Number of significant results per namespace.
    per_namespace = Counter(res.NS for res in results_sig)
    print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
        TOTAL=len(results_sig),
        BP=per_namespace['BP'],  # biological_process
        MF=per_namespace['MF'],  # molecular_function
        CC=per_namespace['CC']))  # cellular_component

    # NOTE(review): wr_xlsx is given a ".csv" file name; the name is kept
    # unchanged here -- confirm the intended format/extension.
    out_prefix = "Data/go_enrichment" + project
    study.wr_xlsx(out_prefix + ".csv", results_sig)
    study.wr_txt(out_prefix + ".txt", results_sig)

예제 #16

0

파일 보기

파일: biplot.py 프로젝트: gaberosser/qmul-bioinf

    big_ax.set_ylabel('Density (a.u.)')
    fig.tight_layout()
    fig.savefig(os.path.join(outdir, "kde_pc_values.png"), dpi=200)

    ## Run GO analysis using GOAtools
    # TODO: if this works well, move to a module
    from goatools import base
    import wget

    obo_fn = os.path.join(LOCAL_DATA_DIR, 'gene_ontology', 'current', 'go-basic.obo')
    genetogo_fn = os.path.join(LOCAL_DATA_DIR, 'gene_ontology', 'current', 'gene2go')
    genetoens_fn = os.path.join(LOCAL_DATA_DIR, 'gene_ontology', 'current', 'gene2ensembl.gz')
    genetoens_url = "ftp://ftp.ncbi.nih.gov/gene/DATA/gene2ensembl.gz"

    obo_fn = base.download_go_basic_obo(obo_fn)
    genetogo_fn = base.download_ncbi_associations(genetogo_fn)
    if not os.path.isfile(genetoens_fn):
        logger.info("Downloading RefGene-Ensembl converter from %s, saving to %s.", genetoens_url, genetoens_fn)
        wget.download(genetoens_url, out=genetoens_fn)

    def ens_to_entrez(ens, genetoens_fn):
        gene2ens = pd.read_csv(genetoens_fn, header=0, sep='\t')
        gene2ens = gene2ens.loc[gene2ens['#tax_id'] == 9606]
        conv_df = gene2ens.loc[gene2ens.Ensembl_gene_identifier.isin(ens), ['GeneID', 'Ensembl_gene_identifier']]
        # reduce to unique (Entrez ID, Ensembl ID) pairs
        conv = collections.defaultdict(list)
        for _, row in conv_df.iterrows():
            conv[row['Ensembl_gene_identifier']].append(conv['GeneID'])

        res = []
        for e in ens:

예제 #17

0

파일 보기

from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.test_data.genes_NCBI_10090_ProteinCoding import GENEID2NT as GeneID2nt_mus
from goatools.test_data.genes_NCBI_9606_ProteinCoding import GENEID2NT as GeneID2nt_hum
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS

# Preferred location of the GO data files: a "data" directory next to this
# module.
PATH = os.path.dirname(__file__)
go_basic_path = os.path.join(PATH, 'data', 'go-basic.obo')
gene2go_path = os.path.join(PATH, 'data', 'gene2go')

try:
    # Source: http://geneontology.org/ontology/go-basic.obo
    go_basic_path = download_go_basic_obo(go_basic_path)
    # Source: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
    gene2go_path = download_ncbi_associations(gene2go_path)
except:
    # Any failure above (e.g. the directory is not writeable) falls back to
    # /tmp. The catch-all is intentionally left as broad as it was, so that
    # every failure mode of the download helpers still triggers the fallback.
    go_basic_path = download_go_basic_obo(
        os.path.join('/tmp', 'go-basic.obo'))
    gene2go_path = download_ncbi_associations(
        os.path.join('/tmp', 'gene2go'))

obodag = GODag(go_basic_path)
# Loading of the mouse associations is currently disabled:
#objanno = Gene2GoReader(gene2go_path, taxids=[10090])
#ns2assoc = objanno.get_ns2assc()
#symbols_to_ids = {val.Symbol : key for key, val in GeneID2nt_mus.items()}
#ids_to_symbols = {val : key for key, val in symbols_to_ids.items()}

예제 #18

0

파일 보기

def test_NCBI_assc():
    """Test downloading of associations from NCBI.

    The file is downloaded, removed, and downloaded again; the test passes
    when the file exists after the second download.
    """
    fdnld = download_ncbi_associations()
    # Remove with os.remove instead of shelling out to `rm -f`: the shell
    # command is POSIX-only and mis-parses paths that contain spaces or shell
    # metacharacters.
    if os.path.isfile(fdnld):
        os.remove(fdnld)
    fdnld = download_ncbi_associations()
    assert os.path.isfile(fdnld)