def __init__(self):
    """Load the goa_human.gaf associations and keep both mapping directions.

    The file ``goa_human.gaf`` under ``REPO`` is loaded with ``dnld_assc``;
    the result and its ``get_b2aset`` inversion are stored on the instance,
    together with the smallest and largest set size found in the inversion.
    """
    self.prt = sys.stdout
    gaf_path = os.path.join(REPO, "goa_human.gaf")

    gene2gos = dnld_assc(gaf_path, go2obj=None, prt=self.prt)
    self.gene2gos_orig = gene2gos

    go2genes = get_b2aset(gene2gos)
    self.go2genes_orig = go2genes

    # Number of genes attached to each GO ID
    set_sizes = list(map(len, go2genes.values()))
    self.min_genes = min(set_sizes)
    self.max_genes = max(set_sizes)

    # Sanity check: inverting the inverse must give back the loaded data
    round_trip = get_b2aset(go2genes)
    assert gene2gos == round_trip
def __init__(self, go2obj, annots, relationships=None, **kws):
    """Initialise the per-GO gene counts from a set of annotations.

    Args:
        go2obj: the full GODag, kept on the instance as ``self.go2obj``.
        annots: annotations; they are passed through ``clean_anno`` first.
        relationships: optional relationships, resolved through
            ``RelationshipCombos(go2obj).get_set``.
        **kws: only ``prt`` is read. It is forwarded to ``clean_anno`` and,
            when truthy, used to print a description of this object.
    """
    stream = kws.get('prt')

    # Keep a handle on the full GODag
    self.go2obj = go2obj
    cleaned = clean_anno(annots, go2obj, stream)
    self.annots, alt_goids = cleaned[:2]

    # Genes annotated to all associated GO, including inherited up ancestors'
    rel_set = RelationshipCombos(go2obj).get_set(relationships)
    self.go2genes = self._init_go2genes(rel_set, go2obj)
    self.gene2gos = get_b2aset(self.go2genes)

    # Annotation main GO IDs (prefer main id to alt_id)
    self.goids = set(self.go2genes.keys())
    self.gocnts = Counter(
        {goid: len(genes) for goid, genes in self.go2genes.items()})

    # Total count for each branch (BP, MF, CC), read at that branch's
    # NAMESPACE2GO entry; 0 when the entry has no count.
    namespaces = (
        'biological_process',
        'molecular_function',
        'cellular_component',
    )
    self.aspect_counts = {
        nspc: self.gocnts.get(NAMESPACE2GO[nspc], 0) for nspc in namespaces}

    # NOTE(review): presumably registers counts under alternate GO IDs;
    # it must run before the GoSubDag below is built -- confirm in helper.
    self._init_add_goid_alt(alt_goids)

    counted_goids = set(self.gocnts.keys())
    self.gosubdag = GoSubDag(
        counted_goids,
        go2obj,
        tcntobj=self,
        relationships=rel_set,
        prt=None)

    if stream:
        self.prt_objdesc(stream)
def _run_get_id2gos(annoobjs):
    """Exercise get_id2gos on each annotation object in *annoobjs*.

    For every object this loads the IDs for all evidence codes, then checks
    that including ``INC_GOOD`` gives the same result as excluding ``IEA``,
    and that the GO IDs returned start with ``GO:``.
    """
    for num, anno in enumerate(annoobjs):
        header = '\n{I}) get_id2gos {DESC} {NSs} {N:,} annotations'.format(
            I=num,
            DESC=anno.get_desc(),
            NSs=anno.namespaces,
            N=len(anno.associations))
        print(header)

        # If all namespaces are loaded, returns BP, else returns loaded NS
        print('Load all evidence codes')
        all_id2gos = anno.get_id2gos()
        assert all_id2gos, 'NO ANNOTATIONS FOUND'

        print('Load all evidence codes, except IEA')
        included = anno.get_id2gos(ev_include=INC_GOOD)
        excluded = anno.get_id2gos(ev_exclude={'IEA'})
        assert excluded, 'NO NON-IEA ANNOTATIONS FOUND'
        assert included == excluded, \
            'INC ALL({A}) != EXC IEA({I}): {DIF}'.format(
                A=len(included), I=len(excluded), DIF='')

        id_total = len(all_id2gos)
        summary = '>>>>> {I} >>>>> get_id2gos {N:6,} go2ids[{B:6,}] {ANNO}'.format(
            I=num,
            N=id_total,
            ANNO=anno.get_desc(),
            B=len(get_b2aset(all_id2gos)))
        print(summary)

        # Spot-check one GO ID taken from one of the returned GO sets
        some_gos = next(iter(all_id2gos.values()))
        some_go = next(iter(some_gos))
        assert some_go[:3] == "GO:"

        # Known ID count for the bundled association file read without a GODag
        if anno.filename.endswith('data/association') and anno.godag is None:
            assert id_total == 34284
def _get_id2gos(file_id2gos, godag, name2go):
    """Return annotations, reading *file_id2gos* when it already exists.

    When the file exists it is read with ``IdToGosReader`` and the result of
    ``get_id2gos('CC')`` is returned. Otherwise a fixed quantity of
    consecutive integer gene numbers is assigned to the GO ID of each name
    (A-I and L-N), the mapping is inverted with ``get_b2aset``, written to
    *file_id2gos* and returned.
    """
    if os.path.exists(file_id2gos):
        reader = IdToGosReader(file_id2gos, godag=godag)
        return reader.get_id2gos('CC')

    # (name, number of genes to create for that name's GO ID)
    name_qty = (
        ('A', 10), ('B', 10), ('C', 10), ('D', 10),
        ('E', 10), ('F', 10), ('G', 10), ('H', 10),
        ('I', 30), ('L', 30), ('M', 20), ('N', 30),
    )
    goid2qty = {name2go[name]: qty for name, qty in name_qty}

    go2genes = cx.defaultdict(set)
    next_gene = 0
    for goid, qty in goid2qty.items():
        # Each GO ID gets its own run of consecutive gene numbers
        for gene in range(next_gene, next_gene + qty):
            go2genes[goid].add(gene)
        next_gene += qty

    gene2gos = get_b2aset(go2genes)
    IdToGosReader.wr_id2gos(file_id2gos, gene2gos)
    return gene2gos
def _adj_for_assc(self):
    """Limit output to the GO IDs found in the associations.

    Does nothing when ``self.gene2gos`` is empty. Otherwise fills in two
    keyword defaults, leaving any value the caller already supplied:
    ``item_marks`` marks every associated GO ID with '>', and
    ``include_only`` is the ``go2obj`` of a GoSubDag built from those IDs.
    """
    if not self.gene2gos:
        return

    annotated = set(get_b2aset(self.gene2gos).keys())

    if 'item_marks' not in self.kws:
        self.kws['item_marks'] = dict.fromkeys(annotated, '>')

    if 'include_only' not in self.kws:
        full = self.gosubdag
        subdag = GoSubDag(annotated, full.go2obj, full.relationships)
        self.kws['include_only'] = subdag.go2obj
def _get_id2gos(file_id2gos, godag, name2go, name2num):
    """Return annotations, reading *file_id2gos* when it already exists.

    When the file exists it is read with ``IdToGosReader`` and the result of
    ``get_id2gos('CC')`` is returned. Otherwise, for every name in
    *name2num*, that many consecutive integer gene numbers are assigned to
    the name's GO ID (looked up in *name2go*); the mapping is inverted with
    ``get_b2aset``, written to *file_id2gos* and returned.
    """
    if os.path.exists(file_id2gos):
        reader = IdToGosReader(file_id2gos, godag=godag)
        return reader.get_id2gos('CC')

    go2genes = cx.defaultdict(set)
    next_gene = 0
    for name, qty in name2num.items():
        goid = name2go[name]
        # An empty run (qty of 0) must not create an entry for this GO ID
        genes = range(next_gene, next_gene + qty)
        for gene in genes:
            go2genes[goid].add(gene)
        next_gene += len(genes)

    gene2gos = get_b2aset(go2genes)
    IdToGosReader.wr_id2gos(file_id2gos, gene2gos)
    return gene2gos
def describe_assc(org, fin_assc, go2obj, obj, prt):
    """Report statistics for a single association.

    Loads the association file with ``dnld_assc``, inverts it with
    ``get_b2aset`` and hands two lists of set sizes to ``obj.prt_data``:
    GO IDs per gene and genes per GO ID. Example of the reported rows:

        Assc.       | # Assc| range      | 25th | median | 75th | mean | stddev
        ------------|-------|------------|------|--------|------|------|-------
        hsa GO/gene | 19394 | 1 to 212   |    5 |      9 |   17 |   13 |     14
        hsa gene/GO | 17277 | 1 to 8,897 |    1 |      3 |    8 |   15 |    120

        mus GO/gene | 19870 | 1 to 261   |    5 |     10 |   18 |   14 |     15
        mus gene/GO | 17491 | 1 to 7,009 |    1 |      3 |    8 |   16 |    129

        dme GO/gene | 12551 | 1 to 137   |    2 |      4 |    8 |    6 |      7
        dme gene/GO |  7878 | 1 to 1,675 |    1 |      3 |    7 |   10 |     41
    """
    # Associations in both directions
    assc_gene2gos = dnld_assc(fin_assc, go2obj, prt=None)
    assc_go2genes = get_b2aset(assc_gene2gos)
    assert assc_gene2gos
    assert assc_go2genes

    gos_per_gene = list(map(len, assc_gene2gos.values()))
    genes_per_go = list(map(len, assc_go2genes.values()))

    label_gene = "{ORG} GO/gene".format(ORG=org)
    obj.prt_data(label_gene, gos_per_gene, prt)
    label_go = "{ORG} gene/GO".format(ORG=org)
    obj.prt_data(label_go, genes_per_go, prt)
def get_annotations_reversed(self):
    """Return go2geneset for all GO IDs explicitly annotated to a gene.

    Returns:
        The inverse of ``self.annots`` as built by ``get_b2aset``.
    """
    # Return the inverted mapping itself. The earlier form,
    # ``set.union(*get_b2aset(self.annots))``, star-unpacked the mapping,
    # which hands its keys (the GO IDs) rather than its gene sets to
    # ``set.union`` and raises TypeError; an empty mapping raised as well.
    return get_b2aset(self.annots)
def get_go2chrs(sec2gos, sec2chr):
    """Dict: given a GO return a set of letters representing its section membership(s)."""
    # Invert section->GOs so each GO ID lists the sections it belongs to
    go2sections = get_b2aset(sec2gos)
    return {
        goid: {sec2chr[section] for section in sections}
        for goid, sections in go2sections.items()
    }