def test_missingsym():
    """Check a GAF whose records lack the (required) DB_Symbol text."""
    # The input is a reduced copy of the original GAF file (mgi.gaf)
    reader = GafReader("tests/data/gaf_missingsym.mgi", hdr_only=False)
    # Gene products missing the required DB_Symbol are expected to be ignored,
    # so the association check must come back falsy.
    chk_result = reader.chk_associations('gaf_missingsym.err')
    assert not chk_result
def read_gaf(fin_gaf, prt=sys.stdout, **kws):
    """Read a Gene Association File (GAF) and return its data.

    Args:
        fin_gaf: GAF file name handed to GafReader.
        prt: Stream handed to GafReader (default: sys.stdout).
        **kws: Keyword arguments forwarded to GafReader and to
            GafReader.read_gaf. The 'hdr_only' entry is also read here.

    Returns:
        Whatever ``GafReader.read_gaf(**kws)`` returns.
    """
    # No 'hdr_only' entry means None is passed: read all data from the GAF
    hdr_only = kws.get('hdr_only', None)
    # hdr_only already travels positionally; leaving it inside the forwarded
    # keywords would supply the same constructor argument a second time.
    reader_kws = {key: val for key, val in kws.items() if key != 'hdr_only'}
    gafobj = GafReader(fin_gaf, hdr_only, prt, **reader_kws)
    return gafobj.read_gaf(**kws)
def test_gaf_illegal(prt=sys.stdout):
    """Find and report illegal GAF lines that were seen in the field."""
    gaf_path = os.path.join(REPO, 'data/gaf/goa_human_illegal.gaf')
    reader = GafReader(gaf_path, hdr_only=False, prt=prt)
    # The association check must come back falsy for this file
    chk_result = reader.chk_associations('goa_human_illegal.err')
    assert not chk_result
def test_gaf_illegal(prt=sys.stdout):
    """Find and report illegal GAF lines that were seen in the field."""
    reader = GafReader(
        os.path.join(REPO, 'data/gaf/goa_human_illegal.gaf'),
        hdr_only=False,
        prt=prt)
    # Read the associations, then check how many IDs survived
    id2gos = reader.read_gaf()
    num_expected = 15
    assert len(id2gos) == num_expected, \
        "IDS FOUND: EXP({E}) ACT({A})".format(E=15, A=len(id2gos))
    # Exactly one ignored line and one illegal-taxon line are expected
    num_ignored = len(reader.datobj.ignored)
    assert num_ignored == 1
    num_bad_taxon = len(reader.datobj.illegal_lines['ILLEGAL TAXON'])
    assert num_bad_taxon == 1
def test_semantic_similarity(usr_assc=None):
    """Print GO-term count statistics for a set of association (GAF) files.

    For each association file, the annotations are loaded with GafReader,
    the GO-term counts are computed with TermCounts, and prt_info is called
    for three thresholds (None, half the top count, a tenth of the top count).

    Args:
        usr_assc: Optional single association file name. When given, only
            that file is processed instead of the files in ASSOCIATIONS.
    """
    # NOTE(review): statement nesting was reconstructed from a
    # whitespace-collapsed source -- confirm against the original layout.
    # These two association files are excluded from the run
    not_these = {'goa_uniprot_all.gaf', 'goa_uniprot_all_noiea.gaf'}
    associations = sorted(ASSOCIATIONS.difference(not_these))
    go2obj = get_go2obj()
    # Association files: http://current.geneontology.org/annotations/
    if usr_assc is not None:
        associations = [usr_assc]
    # Names of association files that yielded no ID-to-GO associations
    not_found = set()
    # NOTE(review): nothing in this function appends to errs, so the
    # error-report branch at the end never runs -- confirm intent.
    errs = []
    for assc_name in associations:  # Limit test numbers for speed
        tic = timeit.default_timer()
        # Download the association file if it is not already on disk
        fin_gaf = os.path.join(REPO, assc_name)
        if not os.path.exists(fin_gaf):
            dnld_annotation(fin_gaf)
        annoobj = GafReader(fin_gaf)
        # Request the associations with the 'all' argument
        assc_gene2gos = annoobj.get_id2gos('all')
        if not assc_gene2gos:
            not_found.add(assc_name)
            continue
        # Initialize the counts of each GO term.
        tcntobj = TermCounts(go2obj, assc_gene2gos)
        go_cnt = tcntobj.gocnts.most_common()
        if go_cnt:
            print("{ASSC}".format(ASSC=assc_name))
            print(tcntobj.aspect_counts)
            # Count of the first entry returned by most_common()
            gocnt_max = go_cnt[0][1]
            prt_info(tcntobj, go_cnt, None)
            prt_info(tcntobj, go_cnt, gocnt_max / 2.0)
            prt_info(tcntobj, go_cnt, gocnt_max / 10.0)
            # TIC is a module-level value; tic was taken at the top of this iteration
            print("{HMS} {hms} {ASSC}\n".format(ASSC=assc_name, HMS=_hms(TIC), hms=_hms(tic)))
    print('{HMS} {N} Associations'.format(HMS=_hms(TIC), N=len(associations)))
    if not_found:
        _prt_not_found(not_found)
    if errs:
        fout_err = 'namespace_errors.txt'
        with open(fout_err, 'w') as prt:
            for err in errs:
                prt.write(err)
            print(' {N} ERRORS WROTE: {TXT}'.format(N=len(errs), TXT=fout_err))
def test_i148b_semsim_lin(do_plt=False):
    """Test for issue 148: Lin similarity if a term has no annotations.

    Args:
        do_plt: When true, call ``_do_plt(tcntobj, godag)`` before the
            similarity values are computed.
    """
    fin_gaf = os.path.join(REPO, 'tests/data/yangRWC/fig2a_nonleaf0.gaf')
    godag = GODag(os.path.join(REPO, "tests/data/yangRWC/fig2a.obo"))
    annoobj = GafReader(fin_gaf, godag=godag)
    associations = annoobj.get_id2gos('CC')
    tcntobj = TermCounts(godag, associations)
    if do_plt:
        _do_plt(tcntobj, godag)
    goids = list(godag.keys())

    # Resnik value for every GO ID pair yielded by combo_w_rplc(goids, 2);
    # frozenset keys make the lookup independent of the pair's order.
    p2r = {
        frozenset([a, b]): resnik_sim(a, b, godag, tcntobj)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values('Resnik', goids, p2r)

    # Lin value for the same pairs, then check the Lin results
    p2l = {
        frozenset([a, b]): lin_sim(a, b, godag, tcntobj)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values('Lin', goids, p2l)
    _chk_lin(p2l)
    # The original repeated both computations after an unconditional
    # ``return``; that unreachable duplicate has been removed.
def read_gaf(fin_gaf, prt=sys.stdout, **kws):
    """Read a Gene Association File (GAF) and return simple associations.

    Keywords read from ``kws``: taxid2asscs, go2geneids, evidence_set,
    keep_ND, keep_NOT and hdr_only.

    Returns a defaultdict(set) mapping gene ID -> GO IDs, or GO ID -> gene
    IDs when ``go2geneids`` is true. When ``taxid2asscs`` is given it is
    filled in as well, keyed by the first taxon of each annotation.
    """
    # Options selecting which annotations are kept
    taxid2asscs = kws.get('taxid2asscs', None)
    want_gene2gos = not kws.get('go2geneids', False)
    evidence_codes = kws.get('evidence_set', None)
    keep_nd = get_nd(kws.get('keep_ND', False))
    keep_not = get_not(kws.get('keep_NOT', False))
    # Option selecting how much of the GAF is read (all data by default)
    hdr_only = kws.get('hdr_only', None)

    from goatools.anno.gaf_reader import GafReader
    id2gos = defaultdict(set)
    reader = GafReader(fin_gaf, hdr_only, prt)

    for rec in reader.associations:
        # Skip annotations rejected by the ND / NOT evaluators
        if not (keep_nd(rec) and keep_not(rec)):
            continue
        # Skip annotations outside the requested evidence codes, if any
        if evidence_codes is not None and rec.Evidence_Code not in evidence_codes:
            continue
        taxid = rec.Taxon[0]
        geneid = rec.DB_ID
        go_id = rec.GO_ID
        # Simple associations, in the direction the caller asked for
        key, member = (geneid, go_id) if want_gene2gos else (go_id, geneid)
        id2gos[key].add(member)
        # Optional detailed associations: both directions, split by taxid
        if taxid2asscs is not None:
            taxid2asscs[taxid]['ID2GOs'][geneid].add(go_id)
            taxid2asscs[taxid]['GO2IDs'][go_id].add(geneid)
    return id2gos
def test_semantic_similarity(usr_assc=None):
    """Check that each annotation's namespace agrees with its GO term.

    Annotations whose GO ID is absent from the GO DAG are collected and
    reported; namespace mismatches are written to 'namespace_errors.txt'.
    """
    excluded = {'goa_uniprot_all.gaf', 'goa_uniprot_all_noiea.gaf'}
    assc_names = sorted(ASSOCIATIONS.difference(excluded))
    go2obj = get_go2obj()
    # Association files: http://current.geneontology.org/annotations/
    if usr_assc is not None:
        assc_names = [usr_assc]
    not_found = set()
    gaf2errs = cx.defaultdict(list)
    for assc_name in assc_names:
        tic = timeit.default_timer()
        # Fetch the association file when it is not on disk yet
        fin_gaf = os.path.join(REPO, assc_name)
        if not os.path.exists(fin_gaf):
            dnld_annotation(fin_gaf)
        for nta in GafReader(fin_gaf).associations:
            goid = nta.GO_ID
            if goid not in go2obj:
                not_found.add(goid)
                continue
            # Record annotations whose namespace differs from the GO term's
            if NS2NAMESPACE.get(nta.NS) != go2obj[goid].namespace:
                gaf2errs[assc_name].append(nta)
    print('{HMS} {N} Associations'.format(HMS=_hms(TIC), N=len(assc_names)))
    if not_found:
        _prt_not_found(not_found)
    if gaf2errs:
        _wr_errs('namespace_errors.txt', gaf2errs, go2obj)
def get_objanno(fin_anno, anno_type=None, **kws):
    """Read annotations in GAF, GPAD, Entrez gene2go, or text format.

    The annotation type is resolved with get_anno_desc; the matching reader
    is built with only those keywords listed in its ``exp_kws``.
    Raises RuntimeError when the resolved type is not one of the four
    handled here.
    """
    # kws get_objanno: taxids hdr_only prt allow_missing_symbol
    anno_type = get_anno_desc(fin_anno, anno_type)
    if anno_type == 'gene2go':
        reader_cls = Gene2GoReader  # kws: taxid taxids
    elif anno_type == 'gaf':
        reader_cls = GafReader
    elif anno_type == 'gpad':
        reader_cls = GpadReader
    elif anno_type == 'id2gos':
        reader_cls = IdToGosReader
    else:
        raise RuntimeError('UNEXPECTED ANNOTATION FILE FORMAT: {F} {D}'.format(
            F=fin_anno, D=anno_type))
    # Pass along only the keywords this reader declares
    accepted = {
        key: kws[key] for key in reader_cls.exp_kws.intersection(kws.keys())
    }
    return reader_cls(fin_anno, **accepted)
def read_gaf(fin_gaf, prt=sys.stdout, hdr_only=False, namespace='BP',
             allow_missing_symbol=False, **kws):
    """Read a Gene Association File (GAF) and return its ID-to-GO data.

    Builds a GafReader from the explicit arguments (plus ``godag`` taken
    from ``kws``) and returns ``get_id2gos(namespace, **kws)``.
    """
    reader = GafReader(
        fin_gaf,
        hdr_only=hdr_only,
        prt=prt,
        allow_missing_symbol=allow_missing_symbol,
        godag=kws.get('godag'))
    return reader.get_id2gos(namespace, **kws)
def test_i195():
    """Investigate GAF reading error on saccharomyces"""
    sources = (
        ('sgd.gaf',
         'http://current.geneontology.org/annotations/sgd.gaf.gz'),
        ('gene_association.sgd.gaf',
         'http://downloads.yeastgenome.org/curation/literature/gene_association.sgd.gaf.gz'),
    )
    # Download both GAF files, in order
    local_gafs = []
    for fname, url in sources:
        fin_gaf = join(REPO, fname)
        _dnld_gaf(fin_gaf, url)
        local_gafs.append(fin_gaf)
    fin_gaf1, fin_gaf2 = local_gafs
    # Only the second file is actually read; the read of the first one
    # was disabled in the original.
    print('READING: {GAF}'.format(GAF=basename(fin_gaf1)))
    print('READING: {GAF}'.format(GAF=basename(fin_gaf2)))
    GafReader(fin_gaf2)
def my_read_gaf(fin_gaf, prt=sys.stdout, before_date=None, **kws):
    """Read Gene Association File (GAF).

    # Arguments
        fin_gaf: GAF file name handed to GafReader
        prt: stream handed to GafReader (default: sys.stdout)
        before_date: int, skip annotations whose date (YYYYMMDD) is later
            than this value; a falsy value disables the date filter
        kws: taxid2asscs, go2geneids, evidence_set, keep_ND, keep_NOT and
            hdr_only are read here; all but hdr_only are also forwarded to
            GafReader

    # Returns
        dict: maps gene IDs to the GO terms they are annotated with
            (or GO IDs to gene IDs when go2geneids is true)
    """
    # keyword arguments for choosing which GO IDs to keep
    taxid2asscs = kws.get('taxid2asscs', None)
    b_geneid2gos = not kws.get('go2geneids', False)
    evs = kws.get('evidence_set', None)
    eval_nd = get_nd(kws.get('keep_ND', False))
    eval_not = get_not(kws.get('keep_NOT', False))
    # keyword arguments what is read from GAF.
    hdr_only = kws.get('hdr_only', None)  # Read all data from GAF by default
    # Simple associations
    id2gos = defaultdict(set)
    # hdr_only already travels positionally; leaving it inside the forwarded
    # keywords would supply the same constructor argument a second time.
    reader_kws = {key: val for key, val in kws.items() if key != 'hdr_only'}
    gafobj = GafReader(fin_gaf, hdr_only, prt, **reader_kws)
    for ntgaf in gafobj.associations:
        # Skip annotations rejected by the ND / NOT evaluators
        if not (eval_nd(ntgaf) and eval_not(ntgaf)):
            continue
        # Optionally keep only a subset of GOs based on their evidence
        if evs is not None and ntgaf.Evidence_Code not in evs:
            continue
        # Date filter. The original returned ``(ntgaf, idx)`` right here,
        # which made this check unreachable and broke the documented
        # dict return value.
        if before_date and int(ntgaf.Date) > before_date:
            continue
        taxid = ntgaf.Taxon[0]
        geneid = ntgaf.DB_ID
        go_id = ntgaf.GO_ID
        if b_geneid2gos:
            id2gos[geneid].add(go_id)
        else:
            id2gos[go_id].add(geneid)
        # Optional detailed associations split by taxid, in both directions
        if taxid2asscs is not None:
            taxid2asscs[taxid]['ID2GOs'][geneid].add(go_id)
            taxid2asscs[taxid]['GO2IDs'][go_id].add(geneid)
    return id2gos  # return simple associations
def test_anno_read():
    """Compare GafReader.get_id2gos_nss with read_gaf on goa_human.gaf."""
    fin_anno = os.path.join(REPO, 'goa_human.gaf')
    dnld_annofile(fin_anno, 'gaf')
    size_msg = 'new({N}) != old({O})'

    print('\nTEST STORING ONLY ONE SPECIES')
    obj = GafReader(fin_anno)
    obj.prt_summary_anno2ev()
    new = obj.get_id2gos_nss()
    old = read_gaf(obj)
    _prt_differences(new, old, obj)
    print('{N} NEW'.format(N=len(new)))
    print('{N} OLD'.format(N=len(old)))
    assert len(new) == len(old), size_msg.format(N=len(new), O=len(old))

    print('\nTEST KWS: keep_ND and keep_NOT')
    # All four on/off combinations, keep_ND varying slowest
    kws_lst = [
        {'keep_ND': keep_nd, 'keep_NOT': keep_not}
        for keep_nd in (False, True)
        for keep_not in (False, True)
    ]
    for kws in kws_lst:
        print('\nTEST KWS:', kws)
        new = obj.get_id2gos_nss(**kws)
        old = read_gaf(obj, **kws)
        _prt_differences(new, old, obj)
        assert len(new) == len(old), size_msg.format(N=len(new), O=len(old))

    print('\nTEST GETTING REVERSE ASSOCIATIONS: GO2GENES')
    new = obj.get_id2gos_nss(go2geneids=True)
    old = read_gaf(obj, go2geneids=True)
    _prt_differences(new, old, obj)
    assert len(new) == len(old), size_msg.format(N=len(new), O=len(old))

    print('\nTEST RETURNING ASSOCIATIONS FOR SELECTED EVIDENCE CODES')
    evcodes = set(['ISO', 'IKR'])
    print("\nTEST 9606 ev_include={CODES}".format(CODES=' '.join(evcodes)))
    new = obj.get_id2gos_nss(ev_include=evcodes)
    old = read_gaf(obj, ev_include=evcodes)
    _prt_differences(new, old, obj)
    # For evidence-code selection the contents must match, not just the sizes
    assert new == old
def main(planteome_dir, output_dir, file_prefix):
    """Collect DB_Name / DB_Synonym keywords from the .assoc files of a directory.

    Args:
        planteome_dir: Directory scanned for files ending in '.assoc'.
        output_dir: Directory the two output files are written to.
        file_prefix: Prefix of the output file names.

    Writes '<prefix>_keywords.json' (file path -> list of keywords) and
    '<prefix>_problem_files.txt' (one unparsable file path per line).
    """
    # Get names of GAF files
    print('\nGetting names of .assoc files...')
    gaf_files = [
        join(planteome_dir, fname)
        for fname in listdir(planteome_dir)
        if fname.endswith('.assoc')
    ]

    # Read in GAF files
    print('\nReading in files...')
    gaf_dicts = {}
    problem_files = []
    for gaf_path in tqdm(gaf_files):
        try:
            gaf = GafReader(gaf_path)
        except Exception:
            # Best effort: record the file and move on. Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit through.
            print('Exception raised in GAF parsing, skipping file')
            problem_files.append(gaf_path)
        else:
            gaf_dicts[gaf_path] = gaf

    # Get keywords
    print('\nGetting keywords...')
    names = defaultdict(set)
    for gaf_path, gaf in tqdm(gaf_dicts.items()):
        for assoc in gaf.associations:
            # NOTE(review): set.update() iterates its argument -- assumes
            # DB_Name and DB_Synonym are collections of strings; a plain
            # string would be split into characters. TODO confirm.
            names[gaf_path].update(assoc.DB_Name)
            names[gaf_path].update(assoc.DB_Synonym)
    # Sets are not JSON-serializable; convert to lists
    keywords = {gaf_path: list(found) for gaf_path, found in names.items()}

    # Write out files
    print('\nWriting output files...')
    with open(f'{output_dir}/{file_prefix}_keywords.json', 'w') as fout:
        json.dump(keywords, fout)
    with open(f'{output_dir}/{file_prefix}_problem_files.txt', 'w') as fout:
        fout.write('\n'.join(problem_files))

    print('\nDone!')
network = 'thr' # 'complete', 'backbone', 'threshold' threshold = 0.5 layer = 'DM' threshold_str = str(threshold).replace('.', 'p') # GO Information dict_annotation_file = { 'HS': 'goa_human.gaf', 'MM': 'mgi.gaf', 'DM': 'fb.gaf' } annotation = '../data/GeneOntology/' + dict_annotation_file[layer] ontology = '../data/GeneOntology/go-basic.obo' # godag = obo_parser.GODag(ontology) gaf = GafReader(name='GAF ' + layer, filename=annotation, godag=godag) # Dict of Associations ns2assoc = gaf.get_ns2assc() # Load Population of Genes (for background comparison) rFPKMFile = '../02-core_genes/results/FPKM/{layer:s}/{layer:s}-FPKM-{celltype:s}.csv.gz'.format( celltype=celltype, layer=layer) dfP = pd.read_csv(rFPKMFile, usecols=['id_gene', 'gene']) # Load PCA rPCAFile = 'results/pca/{celltype:s}/pca-{celltype:s}-{network:s}-{threshold:s}-{layer:s}-dim.csv.gz'.format( celltype=celltype, network=network, threshold=threshold_str, layer=layer) df_pca = pd.read_csv(rPCAFile, index_col=0, encoding='utf-8')
def get_gaf_hdr(fin_gaf):
    """Open a Gene Association File (GAF) header-only and return its ``hdr``."""
    reader = GafReader(fin_gaf, hdr_only=True)
    return reader.hdr
def test_anno_read():
    """Check that get_id2gos_nss and read_gaf agree on goa_human.gaf."""
    fin_anno = os.path.join(REPO, 'goa_human.gaf')
    dnld_annofile(fin_anno, 'gaf')

    print('\nTEST STORING ONLY ONE SPECIES')
    obj = GafReader(fin_anno)
    obj.prt_summary_anno2ev()
    id2gos_new = obj.get_id2gos_nss()
    id2gos_old = read_gaf(obj)
    _prt_differences(id2gos_new, id2gos_old, obj)
    print('{N} NEW'.format(N=len(id2gos_new)))
    print('{N} OLD'.format(N=len(id2gos_old)))
    assert len(id2gos_new) == len(id2gos_old), 'new({N}) != old({O})'.format(
        N=len(id2gos_new), O=len(id2gos_old))

    print('\nTEST KWS: keep_ND and keep_NOT')
    # Every keep_ND / keep_NOT combination, keep_ND varying slowest
    for keep_nd in (False, True):
        for keep_not in (False, True):
            kws = {'keep_ND': keep_nd, 'keep_NOT': keep_not}
            print('\nTEST KWS:', kws)
            id2gos_new = obj.get_id2gos_nss(**kws)
            id2gos_old = read_gaf(obj, **kws)
            _prt_differences(id2gos_new, id2gos_old, obj)
            assert len(id2gos_new) == len(id2gos_old), 'new({N}) != old({O})'.format(
                N=len(id2gos_new), O=len(id2gos_old))

    print('\nTEST GETTING REVERSE ASSOCIATIONS: GO2GENES')
    go2ids_new = obj.get_id2gos_nss(go2geneids=True)
    go2ids_old = read_gaf(obj, go2geneids=True)
    _prt_differences(go2ids_new, go2ids_old, obj)
    assert len(go2ids_new) == len(go2ids_old), 'new({N}) != old({O})'.format(
        N=len(go2ids_new), O=len(go2ids_old))

    print('\nTEST RETURNING ASSOCIATIONS FOR SELECTED EVIDENCE CODES')
    evcodes = set(['ISO', 'IKR'])
    print("\nTEST 9606 ev_include={CODES}".format(CODES=' '.join(evcodes)))
    ev_new = obj.get_id2gos_nss(ev_include=evcodes)
    ev_old = read_gaf(obj, ev_include=evcodes)
    _prt_differences(ev_new, ev_old, obj)
    # Here the full contents are compared, not only the sizes
    assert ev_new == ev_old
def get_gaf_hdr(fin_gaf):
    """Open a Gene Association File (GAF) header-only and return its ``hdr``."""
    # Imported inside the function, as in the original
    from goatools.anno.gaf_reader import GafReader
    hdr_reader = GafReader(fin_gaf, hdr_only=True)
    return hdr_reader.hdr
pop_uniprot = set(dfQ['UniProtKB/Swiss-Prot ID'].dropna().tolist()) pop = pop_flybase.union(pop_uniprot) elif layer == 'MM': pop_mgi = set(dfQ['MGI ID'].dropna().tolist()) pop_uniprot = set(dfQ['UniProtKB/Swiss-Prot ID'].dropna().tolist()) pop = pop_mgi.union(pop_uniprot) elif layer == 'HS': pop_uniprot = set(dfQ['UniProtKB/Swiss-Prot ID'].dropna().tolist()) pop = pop_uniprot # Load GO print("Load GO files") godag = obo_parser.GODag(ontology) # GAF files for both MM and DM gaf_species = GafReader(name='GAF ' + layer + ' Specie', filename=annotation, godag=godag, namespaces=set(['BP'])) gaf_reactome = GafReader(name='GAF ' + layer + ' Reactome', filename=annotation_reactome, godag=godag, namespaces=set(['BP'])) # Dict of Associations ns2assoc_species = gaf_species.get_ns2assc() n_assoc_species = sum([len(v) for k, v in ns2assoc_species['BP'].items()]) print('Specie associations: {n:d}'.format(n=n_assoc_species)) # We also need to add the multi-species annotations ns2assoc_reactome = gaf_reactome.get_ns2assc() n_assoc_reactome = sum([len(v) for k, v in ns2assoc_reactome['BP'].items()]) print('Reactome associations: {n:d}'.format(n=n_assoc_reactome)) # combine associations ns2assoc_combined = merge(ns2assoc_species, ns2assoc_reactome) n_assoc_combined = sum([len(v) for k, v in ns2assoc_combined['BP'].items()]) print('Combined associations: {n:d}'.format(n=n_assoc_combined))