def test_gaf_read(log=sys.stdout):
    """Read the human, mouse, and fly GAFs and sanity-check the association counts.

    Each file name yielded by ``dnld_gafs`` is joined onto this test file's
    directory and read twice with ``read_gaf``: once as ID->GOs (also passing
    the shared per-taxid dictionary) and once as GO->IDs. Key prefixes are
    verified with ``_chk_key`` and the counts are written to ``log``.

    Args:
        log: Writable stream receiving the report (default: ``sys.stdout``).

    Raises:
        AssertionError: If any taxid has 11000 or fewer annotated IDs, or
            6000 or fewer GO IDs.
    """
    # GAF names for human(9606), mouse(10090), and fly(7227)
    gaf_names = ['goa_human', 'mgi', 'fb']
    # (optional) multi-level dictionary handed to read_gaf; keyed by taxid,
    # then by 'ID2GOs' / 'GO2IDs' (the two keys read back below)
    by_taxid = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    here = os.path.dirname(os.path.abspath(__file__))
    ids_pat = " {N:>6,} IDs found in {F}\n"
    gos_pat = " {N:>6,} GOs found in {F}\n"

    for gaf_name in dnld_gafs(gaf_names):
        gaf_path = os.path.join(here, gaf_name)
        log.write("\n")
        gene2gos = read_gaf(gaf_path, taxid2asscs=by_taxid)
        if "gene_association.mgi" in gaf_path:
            _chk_key(gene2gos, "MGI:")
        log.write(ids_pat.format(N=len(gene2gos), F=gaf_path))
        go2genes = read_gaf(gaf_path, go2geneids=True)
        _chk_key(go2genes, "GO:")
        log.write(gos_pat.format(N=len(go2genes), F=gaf_path))

    # Report findings stored in the optional taxid dictionary
    log.write("\n")
    summary_pat = "{N:>6,} GOs and {M:>6,} annotated gene ids for tax_id: {TAXID:>6}\n"
    for taxid, asscs in by_taxid.items():
        gene_cnt = len(asscs.get('ID2GOs'))
        go_cnt = len(asscs.get('GO2IDs'))
        log.write(summary_pat.format(TAXID=taxid, N=go_cnt, M=gene_cnt))
        # Basic check that data was downloaded and returned for this taxid.
        assert gene_cnt > 11000
        assert go_cnt > 6000
def test_gaf_read(log=sys.stdout):
    """Read the human GAF under four ND/NOT settings and log the GO count for each.

    Args:
        log: Writable stream receiving one line per example
            (default: ``sys.stdout``).
    """
    # On 2017/04/10, there were 3 GO IDs with ND Evidence Codes:
    #
    #     $ cut -f5,7 goa_human.gaf | grep ND | sort | uniq -c
    #         739 GO:0003674 ND
    #         484 GO:0005575 ND
    #         639 GO:0008150 ND

    # Example species_ids: goa_human mgi fb
    gaf_path = dnld_gaf('goa_human', loading_bar=None)

    # (extra read_gaf keyword arguments, message pattern), run in this order
    examples = (
        # Example 1: Read GAF with all default values
        ({},
         "Read {N} GOs with all default values\n\n"),
        # Example 2: No NOT Qualifiers and no ND Evidence Codes, stated explicitly
        ({'keep_ND': False, 'keep_NOT': False},
         "Read {N} GOs; keepif is default in goatools.associations.read_gaf\n\n"),
        # Example 3: Allow GOs with ND Evidence Codes
        ({'keep_ND': True},
         "Read {N} GOs; Allow ND Evidence codes\n\n"),
        # Example 4: Allow all GOs, even those with NOT Qualifiers or ND Evidence Codes
        ({'keep_ND': True, 'keep_NOT': True},
         "Read {N} GOs; Allow ND Evidence codes and NOT Qualifiers\n\n"),
    )
    for extra_kws, pattern in examples:
        go2ids = read_gaf(gaf_path, go2geneids=True, **extra_kws)
        log.write(pattern.format(N=len(go2ids)))
def _test_gaf_read(msg, species_ids, keepif, log=sys.stdout):
    """Read each requested GAF with ``keepif`` and sanity-check the association counts.

    For every file, ID->GOs is read twice (default namespace, logged as "BP",
    and ``namespace='all'``) and the 'all' read must return more IDs.
    GO->IDs is read once. Per-taxid totals gathered through ``taxid2asscs``
    are then reported under ``msg``.

    Args:
        msg: Heading written before the per-taxid summary.
        species_ids: GAF names forwarded to ``dnld_gafs``.
        keepif: Value forwarded to ``read_gaf`` as ``keepif``.
        log: Writable stream receiving the report (default: ``sys.stdout``).

    Raises:
        AssertionError: If the 'all' read is not larger than the default read,
            or a taxid has 11000 or fewer IDs or 6000 or fewer GO IDs.
    """
    # (optional) multi-level dictionary handed to read_gaf; keyed by taxid
    by_taxid = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    here = os.path.dirname(os.path.abspath(__file__))
    bp_pat = " {N:>6,} IDs found in BP {F}\n"
    all_pat = " {N:>6,} IDs found in ALL {F}\n"
    gos_pat = " {N:>6,} GOs found in {F}\n"

    for gaf_name in dnld_gafs(species_ids, loading_bar=None):
        gaf_path = os.path.join(here, gaf_name)
        log.write("\n")
        ids_bp = read_gaf(gaf_path, taxid2asscs=by_taxid, keepif=keepif)
        ids_all = read_gaf(gaf_path, taxid2asscs=by_taxid, keepif=keepif, namespace='all')
        assert len(ids_all) > len(ids_bp)
        if "mgi.gaf" in gaf_path:
            _chk_key(ids_bp, "MGI:")
        log.write(bp_pat.format(N=len(ids_bp), F=gaf_path))
        log.write(all_pat.format(N=len(ids_all), F=gaf_path))
        go2genes = read_gaf(gaf_path, go2geneids=True, keepif=keepif)
        _chk_key(go2genes, "GO:")
        log.write(gos_pat.format(N=len(go2genes), F=gaf_path))

    # Report findings stored in the optional taxid dictionary
    log.write("\n{MSG}\n".format(MSG=msg))
    summary_pat = " {N:>6,} GOs and {M:>6,} annotated gene ids for tax_id: {TAXID:>6}\n"
    for taxid, asscs in by_taxid.items():
        gene_cnt = len(asscs.get('ID2GOs'))
        go_cnt = len(asscs.get('GO2IDs'))
        log.write(summary_pat.format(TAXID=taxid, N=go_cnt, M=gene_cnt))
        # Basic check that data was downloaded and returned for this taxid.
        assert gene_cnt > 11000
        assert go_cnt > 6000
def test_missingsym():
    """Check how ``read_gaf`` treats records lacking the required DB_Symbol.

    By default such gene products are expected to be skipped; with
    ``allow_missing_symbol=True`` they are expected to be kept.
    """
    # Reduced copy of the original GAF file (gene_association.mgi)
    gaf_path = os.path.join(REPO, "tests/data/gaf_missingsym.mgi")
    ids_without_symbol = ('MGI:3643263', 'P84751')

    # Default read: the two gene products without a DB_Symbol are ignored
    strict = read_gaf(gaf_path)
    assert len(strict) == 16, len(strict)
    for gene_id in ids_without_symbol:
        assert gene_id not in strict

    # Lenient read: annotations are saved even if DB_Symbol is missing
    lenient = read_gaf(gaf_path, allow_missing_symbol=True)
    assert len(lenient) == 18
    for gene_id in ids_without_symbol:
        assert gene_id in lenient
def _get_assc(godag):
    """Return the TAIR associations restricted to the GO IDs present in ``godag``.

    Args:
        godag: Mapping whose keys are the GO IDs of the test subset of the DAG.

    Returns:
        dict: Each key returned by ``read_gaf`` mapped to the intersection of
        its GO IDs with ``godag``'s keys (possibly an empty set).
    """
    fin_assc = "http://geneontology.org/gene-associations/gene_association.tair.gz"
    dag_goids = set(godag.keys())
    return {
        gene: goids.intersection(dag_goids)
        for gene, goids in read_gaf(fin_assc).items()
    }
def test_gaf_read(log=sys.stdout):
    """Read the human, mouse, and fly GAFs and sanity-check the association counts.

    Args:
        log: Writable stream receiving the report (default: ``sys.stdout``).

    Raises:
        AssertionError: If any taxid has 11000 or fewer annotated IDs, or
            6000 or fewer GO IDs.
    """
    # GAF names for human(9606), mouse(10090), and fly(7227)
    gaf_names = ['goa_human', 'mgi', 'fb']
    # (optional) multi-level dictionary handed to read_gaf; keyed by taxid
    by_taxid = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    ids_pat = " {N:>6,} IDs found in {F}\n"
    gos_pat = " {N:>6,} GOs found in {F}\n"

    for gaf_path in dnld_gafs(gaf_names):
        gene2gos = read_gaf(gaf_path, taxid2asscs=by_taxid)
        log.write(ids_pat.format(N=len(gene2gos), F=gaf_path))
        go2genes = read_gaf(gaf_path, go2geneids=True)
        log.write(gos_pat.format(N=len(go2genes), F=gaf_path))

    # Report findings stored in the optional taxid dictionary
    summary_pat = "{N:>6,} GOs and {M:>6,} annotated gene ids for tax_id: {TAXID:>6}\n"
    for taxid, asscs in by_taxid.items():
        gene_cnt = len(asscs['ID2GOs'])
        go_cnt = len(asscs['GO2IDs'])
        log.write(summary_pat.format(TAXID=taxid, N=go_cnt, M=gene_cnt))
        # Basic check that data was downloaded and returned for this taxid.
        assert gene_cnt > 11000
        assert go_cnt > 6000
def get_associations(self, ontology=None):
    """Get associations of gene IDs to GO terms.

    Ontologies:
        P = biological process,
        F = molecular function,
        C = cellular component

    # Arguments
        ontology: str (optional), one of {"P", "F", "C"}

    # Returns
        dict: maps gene IDs to the GO terms they are annotated with

    # Raises
        GeneOntologyError: if `ontology` is not valid

    Side effects: sets `self.all_associations` (re-read from
    `self.associations_path` on every call) and `self.associations`.
    """
    valid_ontologies = ("P", "F", "C")
    if ontology is not None and ontology not in valid_ontologies:
        raise GeneOntologyError(f"Not a valid ontology: {ontology}")

    # read_gaf keys are looked up as DB_Object_Symbol values and re-keyed by
    # DB_Object_ID; keys with no matching record are dropped.
    gaf_associations = read_gaf(self.associations_path)
    symbol2id = {rec['DB_Object_Symbol']: rec['DB_Object_ID'] for rec in self}
    self.all_associations = {
        symbol2id[symbol]: gaf_associations[symbol]
        for symbol in gaf_associations
        if symbol in symbol2id
    }

    # Work on a deep copy so the filtering below never touches
    # self.all_associations.
    working_copy = copy.deepcopy(self.all_associations)

    # Restrict to the genes that appear in this object's records
    # (presumably those with an accepted evidence code; see
    # remove_unwanted_genes).
    wanted_genes = {rec["DB_Object_ID"] for rec in self}
    associations = self.remove_unwanted_genes(wanted_genes, working_copy)

    # Only consider GO terms from a particular ontology
    if ontology is not None:
        accepted_terms = self.ontology2term()[ontology]
        for go_terms in associations.values():
            # Iterate over a copy because go_terms is modified in the loop.
            for go_id in go_terms.copy():
                # Only terms known to the DAG are filtered by ontology;
                # GO IDs absent from self.go_dag are left in place.
                if go_id in self.go_dag and go_id not in accepted_terms:
                    go_terms.remove(go_id)

    self.associations = associations
    return associations
def __init__(self, obo, gaf, prt):
    """Load the GO DAG and GAF annotations, then build term counts and a GoSubDag.

    Args:
        obo: OBO file name, resolved relative to ``REPO/../goatools/``.
        gaf: Value passed to ``dnld_gaf``; the path it returns is read.
        prt: Stream stored on the instance and forwarded to ``GoSubDag``.
    """
    self.prt = prt
    self.cwd = os.getcwd()

    # Gene Ontologies
    obo_path = os.path.join(REPO, "../goatools/", obo)
    godag = get_godag(obo_path)
    self.go2obj_all = godag

    # Annotations
    gaf_path = dnld_gaf(gaf)
    print("GAF: {GAF}\n".format(GAF=gaf_path))
    gene2gos = read_gaf(gaf_path)
    self.gene2gos = gene2gos
    termcounts = TermCounts(godag, gene2gos)
    self.tcntobj = termcounts

    # GoSubDag
    gosubdag = GoSubDag(None, godag, tcntobj=termcounts, prt=prt)
    self.gosubdag_all = gosubdag
    self.prtfmt = gosubdag.prt_attr['fmta']
def test_gaf_read(log=sys.stdout):
    """Read the human, mouse, and fly GAFs and sanity-check the association counts.

    Args:
        log: Writable stream receiving the report (default: ``sys.stdout``).

    Raises:
        AssertionError: If any taxid has 11000 or fewer annotated IDs, or
            6000 or fewer GO IDs.
    """
    # (optional) multi-level dictionary handed to read_gaf; keyed by taxid
    taxid_table = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))

    # Associations for human(9606), mouse(10090), and fly(7227)
    for gaf_file in dnld_gafs(['goa_human', 'mgi', 'fb']):
        n_ids = len(read_gaf(gaf_file, taxid2asscs=taxid_table))
        log.write(" {N:>6,} IDs found in {F}\n".format(N=n_ids, F=gaf_file))
        n_gos = len(read_gaf(gaf_file, go2geneids=True))
        log.write(" {N:>6,} GOs found in {F}\n".format(N=n_gos, F=gaf_file))

    # Report findings stored in the optional taxid dictionary
    line_pat = "{N:>6,} GOs and {M:>6,} annotated gene ids for tax_id: {TAXID:>6}\n"
    for taxid, asscs in taxid_table.items():
        n_genes = len(asscs['ID2GOs'])
        n_goids = len(asscs['GO2IDs'])
        log.write(line_pat.format(TAXID=taxid, N=n_goids, M=n_genes))
        # Basic check that data was downloaded and returned for this taxid.
        assert n_genes > 11000
        assert n_goids > 6000
def _test_gaf_read(msg, species_ids, keepif, log=sys.stdout):
    """Read each requested GAF with ``keepif`` and sanity-check the association counts.

    Args:
        msg: Heading written before the per-taxid summary.
        species_ids: GAF names forwarded to ``dnld_gafs``.
        keepif: Value forwarded to ``read_gaf`` as ``keepif``.
        log: Writable stream receiving the report (default: ``sys.stdout``).

    Raises:
        AssertionError: If any taxid has 11000 or fewer annotated IDs, or
            6000 or fewer GO IDs.
    """
    # (optional) multi-level dictionary handed to read_gaf; keyed by taxid
    by_taxid = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    here = os.path.dirname(os.path.abspath(__file__))
    ids_pat = " {N:>6,} IDs found in {F}\n"
    gos_pat = " {N:>6,} GOs found in {F}\n"

    for gaf_name in dnld_gafs(species_ids, loading_bar=None):
        gaf_path = os.path.join(here, gaf_name)
        log.write("\n")
        gene2gos = read_gaf(gaf_path, taxid2asscs=by_taxid, keepif=keepif)
        if "mgi.gaf" in gaf_path:
            _chk_key(gene2gos, "MGI:")
        log.write(ids_pat.format(N=len(gene2gos), F=gaf_path))
        go2genes = read_gaf(gaf_path, go2geneids=True, keepif=keepif)
        _chk_key(go2genes, "GO:")
        log.write(gos_pat.format(N=len(go2genes), F=gaf_path))

    # Report findings stored in the optional taxid dictionary
    log.write("\n{MSG}\n".format(MSG=msg))
    summary_pat = " {N:>6,} GOs and {M:>6,} annotated gene ids for tax_id: {TAXID:>6}\n"
    for taxid, asscs in by_taxid.items():
        gene_cnt = len(asscs.get('ID2GOs'))
        go_cnt = len(asscs.get('GO2IDs'))
        log.write(summary_pat.format(TAXID=taxid, N=go_cnt, M=gene_cnt))
        # Basic check that data was downloaded and returned for this taxid.
        assert gene_cnt > 11000
        assert go_cnt > 6000
def test_semantic_similarity():
    """Print semantic, information-content, Resnik, and Lin measures for two GO terms.

    Uses ``go-basic.obo`` for the DAG and the arabidopsis (TAIR) associations
    for the term counts. Nothing is asserted; the values are only printed.
    """
    dag = obo_parser.GODag("go-basic.obo")
    # Get all the annotations from arabidopsis.
    tair_assc = read_gaf("http://geneontology.org/gene-associations/gene_association.tair.gz")

    term_a = 'GO:0048364'  # BP level-03 depth-04 root development
    term_b = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    pair = {'GO1': term_a, 'GO2': term_b}

    # Semantic similarity between the two terms
    # (the original notes quote 0.25 for this pair).
    semantic = semantic_similarity(term_a, term_b, dag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        VAL=semantic, **pair))
    print(dag[term_a])
    print(dag[term_b])

    # Information content of the single term GO:0048364
    # (quoted as 7.75481392334); it needs the counts of each GO term first.
    counts = TermCounts(dag, tair_assc)
    info = get_info_content(term_a, counts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=term_a, INFO=info))

    # Resnik's similarity measure is defined as the information content of the
    # most informative common ancestor, i.e. the most specific common
    # parent-term in the GO (quoted as 4.0540784252 for this pair).
    resnik = resnik_sim(term_a, term_b, dag, counts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=resnik, **pair))

    # Lin similarity score (quoted as -0.607721957763 for this pair)
    lin = lin_sim(term_a, term_b, dag, counts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=lin, **pair))
query_result_list = [] for genes_chunk in np.array_split(genes, max(genes.shape[0] // 1000, 1)): query_res = mg.querymany(genes_chunk, scopes='entrezgene', fields='entrezgene,symbol', species='human', entrezonly=True, as_dataframe=True, df_index=False, verbose=False) if 'notfound' in query_res.columns: query_res = query_res[query_res.notfound != True] # ignore PEP8 warnings. query_result_list.append(query_res) df_res = pd.concat(query_result_list) res = dict(zip(df_res.entrezgene, df_res.symbol)) return res with HidePrints(): _go_dag = obo_parser.GODag(go_obo_path) _gaf = read_gaf(gaf_path, prt=None) _termcounts = TermCounts(_go_dag, _gaf) _gene2go = read_ncbi_gene2go(gene2go_path) _gene2symbol = _init_gene2symbol_dict() _symbol2gene = {symbol: gene for gene, symbol in _gene2symbol.items()} def get_genes(): return list(_gene2go.keys()) def get_symbols(): return list(_gene2symbol.values()) def get_gene2go():
qtl_old_info[qtl][i].append(entry[i]) else: qtl_old_info[qtl] = [[] for c in c2g[d]] for i in range(len(c2g[d])): mhq_dat[d + '_' + c2g[d][i]] = mhq_dat['QTL'].apply( lambda q: ';'.join([str(c) for c in qtl_old_info[q][i]])) mhq_dat.to_csv('../../Analysis/Multi_hit_QTLs.csv', index=False) #GO term analysis, modified from https://github.com/tanghaibao/goatools/blob/master/notebooks/goea_nbt3102.ipynb # Get http://geneontology.org/ontology/go-basic.obo obo_fname = download_go_basic_obo() obodag = GODag("go-basic.obo") geneid2gos_yeast = read_gaf( '../accessory_files/gene_association.sgd' ) #http://downloads.yeastgenome.org/curation/literature/gene_association.sgd.gz genename_2_id = dict() with open('../accessory_files/gene_association.sgd', 'r') as infile: for line in infile: if line[0] != '!': s = line.split('\t') genename_2_id[s[2]] = s[1] id_2_genename = {genename_2_id[i]: i for i in genename_2_id} ids = [i for i in geneid2gos_yeast.keys()] all_measured_genes = set(tp.loc[tp['num.measured'] >= 50]['Gene.Use'].apply( lambda s: s.split(' ')[1])) background_set = [ genename_2_id.setdefault(i, 'NA') for i in all_measured_genes
def goe(
        genelist,
        go_file,
        goa_file,
        bg=None,
        nmin=5,
        conversion=None,
        evidence_set={
            'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'HTP', 'HDA', 'HMP', 'HGI',
            'IBA', 'IBD', 'IKR', 'IRD', 'ISS', 'ISO', 'ISA', 'ISM'
        }):
    """Finds GO enrichment with goatools (0.7.11 tested).

    **WARNING**: This method is inexact for multi-maps in gene name
    conversion (when a converted field holds several values, only the first
    is used). However, it has a negligible effect in top GO component
    removal in single-cell co-expression.

    Parameters
    ------------
    genelist: list of str
        Genes to search for enrichment. Must be a non-empty ``list``.
    go_file: str
        File path for GO DAG (downloadable at
        http://geneontology.org/docs/download-ontology/).
    goa_file: str
        File path for GO associations. See parameter **conversion**.
    bg: list of str
        Background genes. If None, every key of the loaded associations is
        used as background.
    nmin: int
        Minimum number of principal genes required in GO. Values below 1 are
        raised to 1.
    conversion: tuple
        Conversion of `gene ID system
        <https://docs.mygene.info/en/latest/doc/data.html>`_ from gene list
        to the GO annotation, as ``(name_from, name_to, species)``.

        * name_from: Gene naming system of genelist. For gene names, use
          'symbol,alias'.
        * name_to: Gene naming system of goa_file. Examples:

          * Human: use 'uniprot.Swiss-Prot' (for GO annotations downloaded
            from http://geneontology.org/gene-associations/goa_human.gaf.gz).
          * Mouse: use 'MGI' (for GO annotations downloaded from
            http://current.geneontology.org/annotations/mgi.gaf.gz).

        * species: Species for gene name conversion. Examples: 'human',
          'mouse'.
    evidence_set: set of str
        `GO evidences <http://geneontology.org/docs/guide-go-evidence-codes/>`_
        to include. Defaults to non-expression based results to avoid
        circular reasoning bias.

    Returns
    ----------
    goe: pandas.DataFrame
        GO enrichment, sorted by ``p_uncorrected``.
    gotop: str
        Top enriched GO ID.
    genes: list of str or None
        Intersection list of genes in gotop and also bg. None if bg is None.

    Raises
    ----------
    AssertionError
        If genelist is not a non-empty list, if conversion does not have
        exactly three entries, or if conversion is used and genelist contains
        genes that are not in bg.
    ValueError
        If the enrichment output is empty, or if no GO passes the
        ``odds_ratio > 1`` and ``nmin`` criteria.
    """
    # NOTE(review): the default evidence_set is a mutable set shared across
    # calls. This block only forwards it to read_gaf; confirm read_gaf never
    # mutates it.
    from tempfile import NamedTemporaryFile
    from os import linesep
    from goatools.go_enrichment import GOEnrichmentStudy
    from goatools.obo_parser import GODag
    from goatools.associations import read_gaf
    from collections import defaultdict
    import itertools
    from biothings_client import get_client
    import pandas as pd
    import logging
    assert type(genelist) is list and len(genelist) > 0
    if nmin < 1:
        nmin = 1

    # Keep the caller's background: bg itself may be replaced below by
    # converted IDs or by the association keys.
    bg0 = bg
    # Convert gene names
    if conversion is not None:
        assert len(conversion) == 3
        name_from, name_to, species = conversion
        mg = get_client('gene')
        # Query genelist and background together; genelist must be a subset of bg.
        ans = set(genelist)
        if bg is not None:
            t1 = set(bg)
            assert len(ans - t1) == 0
            ans |= t1
        ans = list(ans)
        ans = mg.querymany(ans,
                           scopes=name_from,
                           fields=name_to,
                           species=species)
        # Keep only hits carrying the query, a score, and the top-level target field.
        t1 = set(['query', '_score', name_to.split('.')[0]])
        ans = list(filter(lambda x: len(t1 - set(x)) == 0, ans))
        # Ascending score order, so that in the dict built next the
        # highest-scoring hit for a query overwrites the lower-scoring ones.
        ans = sorted(ans, key=lambda x: x['_score'])
        convert = {x['query']: x for x in ans}
        # Walk the dotted field path (e.g. 'uniprot.Swiss-Prot') one level at
        # a time, dropping queries whose hit lacks the key at that level.
        for xi in name_to.split('.'):
            convert = filter(lambda x: xi in x[1], convert.items())
            convert = {x[0]: x[1][xi] for x in convert}
        # A non-str value is indexed with [0]: only its first element is kept.
        convert = {
            x[0]: x[1] if type(x[1]) is str else x[1][0]
            for x in convert.items()
        }
        # Converted, de-duplicated study list; unconvertible genes are dropped.
        genelist2 = list(
            set([convert[x] for x in filter(lambda x: x in convert, genelist)]))
        if bg is not None:
            bg = list(
                set([convert[x] for x in filter(lambda x: x in convert, bg)]))
        # converti: converted ID -> original names, restricted to genelist.
        t1 = set(genelist)
        converti = list(filter(lambda x: x[0] in t1, convert.items()))
        t1 = defaultdict(list)
        for xi in converti:
            t1[xi[1]].append(xi[0])
        converti = dict(t1)
        # convertia: converted ID -> original names, over all converted genes.
        t1 = defaultdict(list)
        for xi in convert.items():
            t1[xi[1]].append(xi[0])
        convertia = dict(t1)
    else:
        genelist2 = genelist

    # Load GO DAG and association files
    logging.debug('Reading GO DAG file ' + go_file)
    godag = GODag(go_file)
    logging.debug('Reading GO association file ' + goa_file)
    goa = read_gaf(goa_file, evidence_set=evidence_set)
    if bg is None:
        bg = list(goa.keys())

    # Compute enrichment
    # (the local name `goe` shadows this function inside its own body)
    goe = GOEnrichmentStudy(bg, goa, godag)
    ans = goe.run_study(genelist2)
    # Format output: wr_tsv is given the temporary file's name, and the
    # contents are read back through the already-open handle `f`.
    # NOTE(review): reopening a NamedTemporaryFile by name while it is still
    # open is platform-dependent; confirm the supported platforms.
    with NamedTemporaryFile() as f:
        goe.wr_tsv(f.name, ans)
        ans = f.read()
    ans = ans.decode()
    ans = [x.split('\t') for x in ans.split(linesep)]
    # Drop the last row when it has fewer than two fields (e.g. the empty
    # string left after a trailing line separator).
    if len(ans[-1]) < 2:
        ans = ans[:-1]
    if len(ans) == 0 or len(ans[0]) == 0:
        raise ValueError('No enrichment found. Check your input ID type.')
    # First row is the header; strip '#' and spaces from its first field.
    ans[0][0] = ans[0][0].strip('# ')
    ans = pd.DataFrame(ans[1:], columns=ans[0])
    ans.drop(['NS', 'enrichment', 'study_count', 'p_sidak', 'p_holm'],
             axis=1,
             inplace=True)
    # All values were parsed from text, so numeric columns are converted explicitly.
    for xj in ['p_uncorrected', 'p_bonferroni']:
        ans[xj] = pd.to_numeric(ans[xj], errors='raise')
    ans['depth'] = pd.to_numeric(ans['depth'],
                                 errors='raise',
                                 downcast='unsigned')
    # Odds ratio column and sort column
    # NOTE(review): `toratio` is not defined in this block; presumably a
    # module-level helper turning 'a/b' ratio strings into numbers. Confirm.
    ans['odds_ratio'] = toratio(ans['ratio_in_study']) / toratio(
        ans['ratio_in_pop'])
    ans = ans[[
        'name', 'depth', 'p_uncorrected', 'p_bonferroni', 'odds_ratio',
        'ratio_in_study', 'ratio_in_pop', 'GO', 'study_items'
    ]]
    ans['study_items'] = ans['study_items'].apply(lambda x: x.replace(' ', ''))
    # Convert back study_items: each converted ID expands to every genelist
    # name that mapped to it; empty strings are left as they are.
    if conversion is not None:
        ans['study_items'] = ans['study_items'].apply(lambda x: ','.join(
            list(
                itertools.chain.from_iterable(
                    [converti[y] for y in x.split(',')]))) if len(x) > 0 else x)
    ans.sort_values('p_uncorrected', inplace=True)

    # Get top enriched GO by P-value: first row (smallest p_uncorrected) with
    # odds_ratio > 1 and at least nmin study genes (numerator of ratio_in_study).
    gotop = ans[
        (ans['odds_ratio'] > 1)
        & ans['ratio_in_study'].apply(lambda x: int(x.split('/')[0]) >= nmin)]
    if len(gotop) == 0:
        raise ValueError('No GO enrichment found for given criteria.')
    gotop = str(gotop.iloc[0]['GO'])
    if bg0 is not None:
        # Children GOs: the top GO plus everything get_all_children() returns.
        gos = set([gotop] + list(godag.query_term(gotop).get_all_children()))
        # Look for genes: association keys having at least one GO in `gos`.
        genes = list(
            filter(lambda x: len(list(filter(lambda y: y in gos, goa[x]))) > 0,
                   goa))
        if conversion is not None:
            # Map association IDs back to every original name; unknown IDs are dropped.
            genes = [
                convertia[x] for x in filter(lambda x: x in convertia, genes)
            ]
            genes = list(set(list(itertools.chain.from_iterable(genes))))
        # Restrict to the caller's background, keeping bg0's order.
        genes = set(genes)
        genes = list(filter(lambda x: x in genes, bg0))
    else:
        genes = None
    return (ans, gotop, genes)
# Convert ORF names to SGDIDs for GO analysis multi_hit_sgdids = list(gene_info[gene_info['ORF'].isin(orf_names)]['SGDID']) obodag = GODag("../accessory_files/go-basic.obo" ) # http://geneontology.org/ontology/go-basic.obo goid_to_gene_list = defaultdict(list) genename_2_id = dict() with open('../accessory_files/gene_association.sgd', 'r') as infile: for line in infile: if line[0] != '!': s = line.split('\t') goid_to_gene_list[s[4]].append(s[1]) genename_2_id[s[2]] = s[1] id_2_genename = {genename_2_id[i]: i for i in genename_2_id} # Only looking at "biological process" GO terms geneid2gos_yeast = read_gaf('../accessory_files/gene_association.sgd', namespace='BP') ids = [i for i in geneid2gos_yeast.keys()] background_set = [genename_2_id[i] for i in genename_2_id] goeaobj = GOEnrichmentStudy( background_set, # List of all genes in analysis geneid2gos_yeast, # geneid/GO associations obodag, # Ontologies propagate_counts=False, alpha=0.05, # default significance cut-off methods=['fdr_bh']) # defult multipletest correction method goea_results_all = goeaobj.run_study(multi_hit_sgdids, keep_if=lambda x: x.p_uncorrected < 0.05) go_results = sorted(goea_results_all, key=lambda r: r.p_fdr_bh) cols = [