def test_evcode_picker():
    """Check which evidence codes are picked for include/exclude codes and groups."""
    evcodes = EvidenceCodes()

    # No include/exclude arguments
    # pylint: disable=superfluous-parens
    act = evcodes.get_evcodes()
    print('ALL POSITIVE CODES: {C}'.format(C=' '.join(sorted(act))))
    assert 'ND' not in act and len(act) > 15, act

    # Include one group
    act = evcodes.get_evcodes({'Experimental'})
    assert act == {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP'}, act

    # Include one group, exclude one code from it
    act = evcodes.get_evcodes({'Experimental'}, {'IEP'})
    assert act == {'EXP', 'IDA', 'IPI', 'IMP', 'IGI'}, act

    # Include two groups, exclude one code from each
    act = evcodes.get_evcodes({'Experimental', 'Similarity'}, {'IEP', 'IMR'})
    exp = {
        'EXP', 'IDA', 'IPI', 'IMP', 'IGI',
        'ISS', 'ISO', 'ISA', 'ISM', 'IGC', 'IBA', 'IBD', 'IKR', 'IRD',
    }
    assert act == exp, act

    # Exclude only: expect every known code except the excluded one and ND
    act = evcodes.get_evcodes(None, {'IEA'})
    exp = set(evcodes.code2nt) - {'IEA', 'ND'}
    assert act == exp, act.symmetric_difference(exp)

    # Exercise the printing helpers
    evcodes.prt_details()
    evcodes.prt_summary_code()
    print("**TEST PASSED")
def __init__(self, filename=None, hdr_only=False, prt=sys.stdout):
    """Store the filename and, when one is given, read its GAF associations.

    filename -- GAF file to read; None leaves the association list empty
    hdr_only -- forwarded to read_gaf
    prt -- stream forwarded to read_gaf
    """
    self.filename = filename
    self.evobj = EvidenceCodes()
    # Header starts out unset; it must exist before read_gaf is called
    self.hdr = None
    if filename is None:
        self.associations = []
    else:
        self.associations = self.read_gaf(filename, hdr_only, prt)
def __init__(self, filename=None, hdr_only=False, prt=sys.stdout, **kws):
    """Store the filename and options and, when a file is given, read its associations.

    filename -- GAF file to read; None leaves the association list empty
    hdr_only -- forwarded to read_gaf
    prt -- stream forwarded to read_gaf
    kws -- only keys listed in exp_kwdct are kept (e.g. allow_missing_symbol)
    """
    # Keep only the keyword arguments this reader expects
    expected = self.exp_kwdct
    self.kws = dict((key, val) for key, val in kws.items() if key in expected)
    self.filename = filename
    self.evobj = EvidenceCodes()
    # Header starts out unset; it must exist before read_gaf is called
    self.hdr = None
    if filename is None:
        self.associations = []
    else:
        self.associations = self.read_gaf(filename, hdr_only, prt)
def _prt_evidence_codes(args):
    """Print evidence-code help and exit(0) if args holds an evidence help flag.

    Returns None without printing when neither --ev_help nor --ev_help_short is in args.
    """
    help_flags = {'--ev_help', '--ev_help_short'}
    # Guard: nothing to do unless a help flag was given
    if help_flags.isdisjoint(args):
        return
    intro = (
        '\nEVIDENCE CODE HELP: --ev_exc --ev_inc',
        'Use any of these group names, ',
        'like Experimental or Similarity or Experimental,Similarity,',
        'or evidence codes, like IEA or ISS,ISO,ISA in --ev_exc or --ev_inc:',
    )
    for text in intro:
        print(text)
    evcodes = EvidenceCodes()
    if '--ev_help' in args:
        print('')
        evcodes.prt_details()
    if '--ev_help_short' in args:
        print('')
        evcodes.prt_summary_code()
    sys.exit(0)
def __init__(self, name, filename=None, **kws):
    """Store reader settings, then load the annotations through _init_associations.

    name -- annotation format name (one of valid_formats)
    filename -- annotation file handed to _init_associations
    kws -- 'godag' and 'namespaces' are read here; every keyword
           (e.g. allow_missing_symbol) is forwarded to _init_associations
    """
    godag = kws.get('godag')
    namespaces = kws.get('namespaces')
    self.name = name
    self.filename = filename
    self.godag = godag
    self.namespaces = namespaces
    self.evobj = EvidenceCodes()
    # Constructor signatures of the concrete readers, for reference:
    #   Gene2GoReader(filename=None, taxids=None):
    #   GafReader(filename=None, hdr_only=False, prt=sys.stdout, allow_missing_symbol=False):
    #   GpadReader(filename=None, hdr_only=False):
    self.hdr = self.datobj = None
    # pylint: disable=no-member
    self.associations = self._init_associations(filename, **kws)
    # Namespaces, when given, must be a set
    assert self.namespaces is None or isinstance(self.namespaces, set)
def __init__(self, filename=None, **kws):
    """Store the filename, then load the annotations through _init_associations.

    filename -- annotation file handed to _init_associations
    kws -- forwarded unchanged to _init_associations (e.g. allow_missing_symbol)
    """
    self.filename = filename
    self.evobj = EvidenceCodes()
    # Constructor signatures of the concrete readers, for reference:
    #   Gene2GoReader(filename=None, taxids=None):
    #   GafReader(filename=None, hdr_only=False, prt=sys.stdout, allow_missing_symbol=False):
    #   GpadReader(filename=None, hdr_only=False):
    self.hdr = self.datobj = None
    annotations = self._init_associations(filename, **kws)
    self.associations = annotations
def __init__(self, name, filename=None, **kws):
    """Record the reader's settings and load annotations via _init_associations.

    name -- name of the annotation type
    filename -- annotation file handed to _init_associations
    kws -- 'godag' and 'namespaces' are read here; every keyword
           (e.g. allow_missing_symbol) is forwarded to _init_associations
    """
    self.name, self.filename = name, filename
    self.godag, self.namespaces = kws.get('godag'), kws.get('namespaces')
    self.evobj = EvidenceCodes()
    # Constructor signatures of the concrete readers, for reference:
    #   Gene2GoReader(filename=None, taxids=None):
    #   GafReader(filename=None, hdr_only=False, prt=sys.stdout, allow_missing_symbol=False):
    #   GpadReader(filename=None, hdr_only=False):
    self.hdr = None
    self.datobj = None
    # pylint: disable=no-member
    loaded = self._init_associations(filename, **kws)
    self.associations = loaded
    # Namespaces, when given, must be a set
    assert self.namespaces is None or isinstance(self.namespaces, set)
def _prt_evidence_codes(args):
    """Show evidence-code help, then exit(0), when args contains an evidence help flag.

    Does nothing (returns None) if neither --ev_help nor --ev_help_short is present.
    """
    if {'--ev_help', '--ev_help_short'}.isdisjoint(args):
        return
    print('\nEVIDENCE CODE HELP: --ev_exc --ev_inc')
    print('Use any of these group names, ')
    print('like Experimental or Similarity or Experimental,Similarity,')
    print('or evidence codes, like IEA or ISS,ISO,ISA in --ev_exc or --ev_inc:')
    evobj = EvidenceCodes()
    # Each flag selects one printer; details come before the short summary
    flag2printer = (
        ('--ev_help', evobj.prt_details),
        ('--ev_help_short', evobj.prt_summary_code),
    )
    for flag, printer in flag2printer:
        if flag in args:
            print('')
            printer()
    sys.exit(0)
def test_evcode_picker():
    """Verify the evidence codes returned for include/exclude codes and groups."""
    picker = EvidenceCodes()
    experimental = {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP'}

    # Defaults
    # pylint: disable=superfluous-parens
    actual = picker.get_evcodes()
    print('ALL POSITIVE CODES: {C}'.format(C=' '.join(sorted(actual))))
    assert 'ND' not in actual and len(actual) > 15, actual

    # One included group
    actual = picker.get_evcodes({'Experimental'})
    assert actual == experimental, actual

    # One included group minus one excluded code
    actual = picker.get_evcodes({'Experimental'}, {'IEP'})
    assert actual == experimental - {'IEP'}, actual

    # Two included groups, two excluded codes
    actual = picker.get_evcodes({'Experimental', 'Similarity'}, {'IEP', 'IMR'})
    expected = {
        'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'ISS', 'ISO',
        'ISA', 'ISM', 'IGC', 'IBA', 'IBD', 'IKR', 'IRD',
    }
    assert actual == expected, actual

    # Exclusion only: every known code except the excluded one and ND
    actual = picker.get_evcodes(None, {'IEA'})
    expected = set(code for code in picker.code2nt if code not in {'IEA', 'ND'})
    assert actual == expected, actual.symmetric_difference(expected)

    # Exercise the printing helpers
    picker.prt_details()
    picker.prt_summary_code()
    print("**TEST PASSED")
class GafReader(object):
    """Reads a Gene Annotation File (GAF). Returns a Python object."""

    # Keyword arguments accepted by __init__ and forwarded to GafData
    exp_kwdct = set(['allow_missing_symbol'])

    def __init__(self, filename=None, hdr_only=False, prt=sys.stdout, **kws):
        """Store options and, when a filename is given, read its associations.

        filename -- GAF file to read; None leaves the association list empty
        hdr_only -- stop after the header has been read
        prt -- stream for the read summary (None silences it)
        kws -- only keys in exp_kwdct are kept (allow_missing_symbol)
        """
        self.kws = {k: v for k, v in kws.items() if k in self.exp_kwdct}
        self.filename = filename
        self.evobj = EvidenceCodes()
        # Initialize associations and header information
        self.hdr = None
        self.associations = self.read_gaf(filename, hdr_only, prt) if filename is not None else []

    def read_gaf(self, fin_gaf, hdr_only, prt):
        """Read GAF file. Store annotation data in a list of namedtuples.

        Lines starting with '!' before the first data line form the header,
        which is stored in self.hdr. Data lines that GafData cannot convert
        are collected and reported by _prt_read_summary.

        Returns the namedtuples sorted by evidence code, or an empty list
        when hdr_only is set. Any exception while reading is reported on
        stderr and ends the process with exit status 1.
        """
        nts = []
        ver = None
        hdrobj = GafHdr()
        datobj = None
        lnum = line = -1
        ignored = []
        try:
            with open(fin_gaf) as ifstrm:
                for lnum, line in enumerate(ifstrm, 1):
                    # Read header
                    if datobj is None:
                        if line[0] == '!':
                            if ver is None and line[1:13] == 'gaf-version:':
                                ver = line[13:].strip()
                            hdrobj.chkaddhdr(line)
                        else:
                            # First data line: the header is complete
                            self.hdr = hdrobj.get_hdr()
                            if hdr_only:
                                return nts
                            datobj = GafData(ver, **self.kws)
                    # Read data
                    if datobj is not None and line[0] != '!':
                        ntgaf = datobj.get_ntgaf(line)
                        if ntgaf is not None:
                            nts.append(ntgaf)
                        else:
                            ignored.append((lnum, line))
            if datobj is None:
                # No data line was seen, so the loop never finalised the
                # header; store it here so header-only files keep it.
                self.hdr = hdrobj.get_hdr()
                if hdr_only:
                    return nts
        except Exception as inst:
            import traceback
            traceback.print_exc()
            sys.stderr.write("\n **FATAL in read_gaf: {MSG}\n\n".format(MSG=str(inst)))
            sys.stderr.write("**FATAL: {FIN}[{LNUM}]:\n{L}".format(FIN=fin_gaf, L=line, LNUM=lnum))
            if datobj is not None:
                datobj.prt_line_detail(prt, line)
            sys.exit(1)
        # GAF file has been read
        self._prt_read_summary(prt, fin_gaf, nts, datobj, ignored)
        return self.evobj.sort_nts(nts, 'Evidence_Code')

    def _prt_read_summary(self, prt, fin_gaf, nts, datobj, ignored):
        """Print a summary about the GAF file that was read."""
        # Ignored lines are always logged to a file, even when prt is None
        fout_log = self._prt_ignored_lines(ignored, datobj, fin_gaf) if ignored else None
        if prt is not None:
            prt.write(" READ {N:9,} associations: {FIN}\n".format(
                N=len(nts), FIN=fin_gaf))
            if ignored:
                prt.write(" IGNORED {N:9,} associations: {FIN}\n".format(
                    N=len(ignored), FIN=fout_log))

    def _prt_ignored_lines(self, ignored, datobj, fin_gaf):
        """Print ignored lines to a log file. Return the log file's name."""
        fout_log = "{}.log".format(fin_gaf)
        with open(fout_log, 'w') as prt:
            for lnum, line in ignored:
                self.prt_ignore_line(prt, fin_gaf, line, lnum)
                datobj.prt_line_detail(prt, line)
                prt.write("\n")
        return fout_log

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary.

        Annotations whose Qualifier contains 'NOT' are counted separately
        under the key "NOT <evidence code>".
        """
        ctr = cx.Counter()
        for ntgaf in self.associations:
            evidence_code = ntgaf.Evidence_Code
            if 'NOT' in ntgaf.Qualifier:
                ctr["NOT {EV}".format(EV=evidence_code)] += 1
            else:
                ctr[evidence_code] += 1
        self.evobj.prt_ev_cnts(ctr, prt)

    @staticmethod
    def prt_ignore_line(prt, fin_gaf, line, lnum):
        """Print a message saying that we are ignoring an association line."""
        prt.write(
            "**WARNING: BADLY FORMATTED LINE. IGNORED {FIN}[{LNUM}]:\n{L}\n".format(
                FIN=os.path.basename(fin_gaf), L=line, LNUM=lnum))
class AnnoReaderBase(object):
    """Reads a Gene Association File. Returns a Python object."""
    # pylint: disable=broad-except,line-too-long,too-many-instance-attributes

    # Default start time used by hms(); taken once, when the class is defined
    tic = timeit.default_timer()

    # Expected values for a Qualifier
    exp_qualifiers = set([
        # Seen in both GAF and gene2go
        'not', 'contributes_to', 'colocalizes_with',
    ])

    valid_formats = {'gpad', 'gaf', 'gene2go', 'id2gos'}
    exp_nss = set(['BP', 'MF', 'CC'])

    def __init__(self, name, filename=None, **kws):
        """Store reader settings, then load annotations via _init_associations.

        name -- annotation format name (one of valid_formats)
        filename -- annotation file handed to _init_associations
        kws -- 'godag' and 'namespaces' are read here; every keyword
               (e.g. allow_missing_symbol) is forwarded to _init_associations
        """
        self.name = name  # name is one of valid_formats
        self.filename = filename
        self.godag = kws.get('godag')
        self.namespaces = kws.get('namespaces')
        self.evobj = EvidenceCodes()
        # Read anotation file, store namedtuples:
        #     Gene2GoReader(filename=None, taxids=None):
        #     GafReader(filename=None, hdr_only=False, prt=sys.stdout, allow_missing_symbol=False):
        #     GpadReader(filename=None, hdr_only=False):
        self.hdr = None
        self.datobj = None
        # pylint: disable=no-member
        self.associations = self._init_associations(filename, **kws)
        # assert self.associations, 'NO ANNOTATIONS FOUND: {ANNO}'.format(ANNO=filename)
        assert self.namespaces is None or isinstance(self.namespaces, set)

    def get_desc(self):
        """Get description"""
        return '{NAME} {NSs} {GODAG}'.format(
            NAME=self.name,
            NSs='' if self.namespaces is None else ','.join(self.namespaces),
            GODAG='' if self.godag is None else 'godag')

    # pylint: disable=unused-argument
    def get_associations(self, taxid=None):
        """Get associations"""
        # taxid is for NCBI's gene2gos
        return self.associations

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary."""
        self.evobj.prt_summary_anno2ev(self.associations, prt)

    def get_name(self):
        """Return type of annotation"""
        return self.name

    # pylint: disable=no-self-use
    def get_taxid(self):
        """Return taxid, if one was provided, otherwise return -1"""
        return -1

    # Arg, taxid, is used by NCBI's annotations, but not by gpad, gaf, etc.
    def get_ns2assc(self, taxid=None, **kws):
        """Return given associations into 3 (BP, MF, CC) dicts, id2gos"""
        return {
            ns: self._get_id2gos(nts, **kws)
            for ns, nts in self.get_ns2ntsanno().items()
        }

    # pylint: disable=unused-argument
    # Arg, taxid, is used by NCBI's annotations, but not by gpad, gaf, etc.
    def get_ns2ntsanno(self, taxid=None):
        """Split list of annotations into 3 lists: BP, MF, CC"""
        return self._get_ns2ntsanno(self.associations)

    # Used by gpad, gaf, etc., but not used by NCBI's annotation reader
    def _get_ns2ntsanno(self, annotations):
        """Split list of annotations into 3 lists: BP, MF, CC"""
        if self.name in {'gpad', 'id2gos'}:
            assert self.godag is not None, "{T}: LOAD godag TO USE {C}::ns2ntsanno".format(
                C=self.__class__.__name__, T=self.name)
        ns2nts = cx.defaultdict(list)
        for nta in annotations:
            ns2nts[nta.NS].append(nta)
        # Keep only the expected namespaces that actually occur
        return {ns: ns2nts[ns] for ns in self.exp_nss.intersection(ns2nts)}

    def get_id2gos_nss(self, **kws):
        """Return all associations in a dict, id2gos, regardless of namespace"""
        return self._get_id2gos(self.associations, **kws)

    def get_id2gos(self, namespace=None, prt=sys.stdout, **kws):
        """Return associations from specified namespace in a dict, id2gos

        namespace -- requested namespace; resolved by _get_1ns_assn when the
                     annotations carry an NS field, otherwise ignored
        prt -- stream for status messages (a falsy value silences them)
        kws -- forwarded to _get_id2gos
        """
        # pylint: disable=superfluous-parens
        if self.has_ns():  # Anno namedtuple has NS field
            nspc, assoc = self._get_1ns_assn(namespace)
            id2gos = self._get_id2gos(assoc, **kws)
            if prt:
                prt.write('{N} IDs in loaded association branch, {NS}\n'.format(
                    N=len(id2gos), NS=nspc))
            return id2gos
        if prt and namespace is not None:
            # Written to prt like the other messages; the text already ends in a newline
            prt.write(
                '**ERROR {CLS}(..., godag=None).get_id2gos: GODAG is None. IGNORING namespace({NS})\n'
                .format(NS=namespace, CLS=type(self).__name__))
        id2gos = self._get_id2gos(self.associations, **kws)
        if prt:
            prt.write('{N} IDs in all associations\n'.format(N=len(id2gos)))
        return id2gos

    def _get_1ns_assn(self, namespace_usr):
        """Get one namespace, given a user-provided namespace or a default"""
        # If all namespaces were loaded
        if self.namespaces is None:
            # Return user-specified namespace, if provided. Otherwise BP
            nspc = 'BP' if namespace_usr is None else namespace_usr
            # Return one namespace
            if nspc in set(NAMESPACE2NS.values()):
                return nspc, [nt for nt in self.associations if nt.NS == nspc]
            # Return all namespaces
            return nspc, self.associations
        # If one namespace was loaded, use that regardless of what user specfies
        if len(self.namespaces) == 1:
            nspc = next(iter(self.namespaces))
            if namespace_usr is not None and nspc != namespace_usr:
                print('**WARNING: IGNORING {ns}; ONLY {NS} WAS LOADED'.format(
                    ns=namespace_usr, NS=nspc))
            return nspc, self.associations
        if namespace_usr is None:
            print('**ERROR get_id2gos: GODAG NOT LOADED. USING: {NSs}'.format(
                NSs=' '.join(sorted(self.namespaces))))
        return namespace_usr, self.associations

    def has_ns(self):
        """Return True if namespace field, NS exists on annotation namedtuples"""
        assert self.associations, 'NO ASSOCIATIONS IN file({}): {}'.format(
            self.filename, self.associations)
        return hasattr(next(iter(self.associations)), 'NS')

    def _get_id2gos(self, ntannos_usr, propagate_counts=False, relationships=None, prt=sys.stdout, **kws):
        """Return given ntannos_usr in a dict, id2gos

        Returns ID -> GO IDs when options.b_geneid2gos is set, otherwise
        the inverted GO ID -> IDs mapping.
        """
        options = AnnoOptions(self.evobj, **kws)
        # Default reduction is to remove. For all options, see goatools/anno/opts.py:
        #   * Evidence_Code == ND -> No biological data No biological Data available
        #   * Qualifiers contain NOT
        ntannos_m = self.reduce_annotations(ntannos_usr, options)
        dbid2goids = self.get_dbid2goids(ntannos_m, propagate_counts, relationships, prt)
        if options.b_geneid2gos:
            return dbid2goids
        # if not a2bs:
        #     raise RuntimeError('**ERROR: NO ASSOCATIONS FOUND: {FILE}'.format(FILE=self.filename))
        return self._get_goid2dbids(dbid2goids)

    @staticmethod
    def _get_goid2dbids(dbid2goids):
        """Return dict of GO ID keys and a set of gene products as values"""
        goid2dbids = cx.defaultdict(set)
        for dbid, goids in dbid2goids.items():
            for goid in goids:
                goid2dbids[goid].add(dbid)
        return dict(goid2dbids)

    def _get_namespaces(self, nts):
        """Get the set of namespaces seen in the namedtuples."""
        return set(nt.NS for nt in nts) if self.has_ns() else set()

    # Qualifier (column 4)
    # Flags that modify the interpretation of an annotation one (or more) of NOT, contributes_to, colocalizes_with
    # This field is not mandatory;
    #   * cardinality 0, 1, >1;
    #   * for cardinality >1 use a pipe to separate entries (e.g. NOT|contributes_to)
    def prt_qualifiers(self, prt=sys.stdout):
        """Print Qualifiers: 1,462 colocalizes_with; 1,454 contributes_to; 1,157 not"""
        # 13 not colocalizes_with (TBD: CHK - Seen in gene2go, but not gafs)
        # 4 not contributes_to (TBD: CHK - Seen in gene2go, but not gafs)
        self._prt_qualifiers(self.associations, prt)

    @staticmethod
    def _prt_qualifiers(associations, prt=sys.stdout):
        """Print Qualifiers found in the annotations.

        QUALIFIERS:
            1,462 colocalizes_with
            1,454 contributes_to
            1,157 not
               13 not colocalizes_with (TBD: CHK - Seen in gene2go, but not gafs)
                4 not contributes_to (TBD: CHK - Seen in gene2go, but not gafs)
        """
        prt.write('QUALIFIERS:\n')
        for fld, cnt in cx.Counter(q for nt in associations for q in nt.Qualifier).most_common():
            prt.write(' {N:6,} {FLD}\n'.format(N=cnt, FLD=fld))

    def reduce_annotations(self, annotations, options):
        """Reduce annotations to ones used to identify enrichment (normally exclude ND and NOT)."""
        getfnc_qual_ev = options.getfnc_qual_ev()
        return [
            nt for nt in annotations
            if getfnc_qual_ev(nt.Qualifier, nt.Evidence_Code)
        ]

    @staticmethod
    def update_association(assc_goidsets, go2ancestors, prt=sys.stdout):
        """Update the GO sets in assc_gene2gos to include all GO ancestors

        Each set in assc_goidsets is modified in place. GO IDs that are not
        keys of go2ancestors are left as they are.
        """
        goids_avail = set(go2ancestors)
        # assc_gos is assc_gene2gos.values()
        for assc_goids_cur in assc_goidsets:
            parents = set()
            for goid in assc_goids_cur.intersection(goids_avail):
                parents.update(go2ancestors[goid])
            assc_goids_cur.update(parents)

    def _get_go2ancestors(self, goids_assoc_usr, relationships, prt=sys.stdout):
        """Return go2ancestors (set of parent GO IDs) for all GO ID keys in go2obj."""
        assert self.godag is not None
        _godag = self.godag
        # Get GO IDs in annotations that are in GO DAG
        goids_avail = set(_godag)
        self._rpt_goids_notfound(goids_assoc_usr, goids_avail)
        goids_assoc_cur = goids_assoc_usr.intersection(goids_avail)
        # Get GO Term for each current GO ID in the annotations
        _go2obj_assc = {go: _godag[go] for go in goids_assoc_cur}
        go2ancestors = get_go2parents_go2obj(_go2obj_assc, relationships, prt)
        if prt:
            prt.write('{N} GO IDs -> {M} go2ancestors\n'.format(
                N=len(goids_avail), M=len(go2ancestors)))
        return go2ancestors

    @staticmethod
    def _rpt_goids_notfound(goids_assoc_all, goids_avail):
        """Report the number of GO IDs in the association, but not in the GODAG"""
        goids_missing = goids_assoc_all.difference(goids_avail)
        if goids_missing:
            print("{N} GO IDs NOT FOUND IN ASSOCIATION: {GOs}".format(
                N=len(goids_missing), GOs=" ".join(goids_missing)))

    def get_dbid2goids(self, ntannos, propagate_counts=False, relationships=None, prt=sys.stdout):
        """Return gene2go data for user-specified taxids.

        With propagate_counts, each GO ID is extended by the entries found
        for it in go2ancestors; that path requires a loaded godag.
        """
        if propagate_counts:
            return self._get_dbid2goids_p1(ntannos, relationships, prt)
        return self._get_dbid2goids_p0(ntannos)

    @staticmethod
    def _get_dbid2goids_p0(associations):
        """Return gene2goids with annotations as-is (propagate_counts == False)"""
        id2gos = cx.defaultdict(set)
        for ntd in associations:
            id2gos[ntd.DB_ID].add(ntd.GO_ID)
        return dict(id2gos)

    def _get_dbid2goids_p1(self, ntannos, relationships=None, prt=sys.stdout):
        """Return gene2goids with propagate_counts == True"""
        id2gos = cx.defaultdict(set)
        goids_annos = set(nt.GO_ID for nt in ntannos)
        go2ancestors = self._get_go2ancestors(goids_annos, relationships, prt)
        # https://github.com/geneontology/go-annotation/issues/3523
        exclude = {'GO:2000325', 'GO:2000327'}
        for ntd in ntannos:
            goid = ntd.GO_ID
            # https://github.com/geneontology/go-annotation/issues/3523
            if goid not in exclude:
                goids = id2gos[ntd.DB_ID]
                goids.add(goid)
                # go2ancestors only covers GO IDs found in the GODAG; an
                # annotated GO ID missing from it was already reported by
                # _rpt_goids_notfound and is kept without ancestors.
                goids.update(go2ancestors.get(goid, ()))
            else:
                print('**WARNING: OBSOLETE GO ID({GO})'.format(GO=goid))
        return dict(id2gos)

    @staticmethod
    def get_goid2dbids(associations):
        """Return a dict of GO ID keys and the set of DB_IDs annotated to each."""
        go2ids = cx.defaultdict(set)
        for ntd in associations:
            go2ids[ntd.GO_ID].add(ntd.DB_ID)
        return dict(go2ids)

    def hms(self, msg, tic=None, prt=sys.stdout):
        """Print elapsed time and message. Return the current timer value."""
        if tic is None:
            tic = self.tic
        now = timeit.default_timer()
        hms = str(datetime.timedelta(seconds=(now - tic)))
        prt.write('{HMS}: {MSG}\n'.format(HMS=hms, MSG=msg))
        return now

    def chk_associations(self, fout_err=None):
        """Check that associations are in expected format."""
        # pylint: disable=unnecessary-pass
        pass

    def nts_ev_nd(self):
        """Get annotations where Evidence_code == 'ND' (No biological data)"""
        return [nt for nt in self.associations if nt.Evidence_Code == 'ND']

    def nts_qual_not(self):
        """Get annotations having Qualifiers containing NOT"""
        return [nt for nt in self.associations if self._has_not_qual(nt)]

    def chk_qualifiers(self):
        """Check format of qualifier"""
        if self.name == 'id2gos':
            return
        for ntd in self.associations:
            # print(ntd)
            qual = ntd.Qualifier
            assert isinstance(qual, set), '{NAME}: QUALIFIER MUST BE A LIST: {NT}'.format(
                NAME=self.name, NT=ntd)
            assert qual != set(['']), ntd
            assert qual != set(['-']), ntd
            assert 'always' not in qual, 'SPEC SAID IT WOULD BE THERE'

    def chk_godag(self):
        """Check that a GODag was loaded"""
        if not self.godag:
            raise RuntimeError(
                '{CLS} MUST INCLUDE GODag: {CLS}(file.anno, godag=godag)'.format(
                    CLS=self.__class__.__name__))

    @staticmethod
    def _has_not_qual(ntd):
        """Return True if the qualifiers contain a 'NOT'"""
        # Substring test, so a combined qualifier such as 'not colocalizes_with' also matches
        for qual in ntd.Qualifier:
            if 'not' in qual:
                return True
            if 'NOT' in qual:
                return True
        return False

    def prt_counts(self, prt=sys.stdout):
        """Print the number of annotations stored."""
        num_annos = len(self.associations)
        # 792,891 annotations for 3 taxids stored: 10090 7227 9606
        prt.write('{A:8,} annotations\n'.format(A=num_annos))
class GafReader(object):
    """Reads a Gene Annotation File (GAF). Returns a Python object."""

    # Keyword arguments accepted by __init__ and forwarded to GafData
    exp_kwdct = set(['allow_missing_symbol'])

    def __init__(self, filename=None, hdr_only=False, prt=sys.stdout, **kws):
        """Store options and, when a filename is given, read its associations.

        filename -- GAF file to read; None leaves the association list empty
        hdr_only -- stop after the header has been read
        prt -- stream for the read summary (a falsy value silences it)
        kws -- only keys in exp_kwdct are kept (allow_missing_symbol)
        """
        self.kws = {k: v for k, v in kws.items() if k in self.exp_kwdct}
        self.filename = filename
        self.evobj = EvidenceCodes()
        # Initialize associations and header information
        self.hdr = None
        self.datobj = None
        self.associations = self._init_assn(filename, hdr_only, prt) if filename is not None else []

    def read_gaf(self, **kws):
        """Read Gene Association File (GAF). Return data.

        Keyword arguments:
          taxid2asscs -- optional nested mapping, filled per taxid with ID2GOs and GO2IDs
          go2geneids -- if True, return GO ID -> gene IDs instead of gene ID -> GO IDs
          evidence_set -- if given, keep only annotations with these evidence codes
          keep_ND -- keep annotations whose evidence code is ND
          keep_NOT -- keep annotations whose Qualifier contains NOT
        """
        # Simple associations
        id2gos = cx.defaultdict(set)
        # keyword arguments for choosing which GO IDs to keep
        # Optional detailed associations split by taxid and having both ID2GOs & GO2IDs
        taxid2asscs = kws.get('taxid2asscs', None)
        b_geneid2gos = not kws.get('go2geneids', False)
        evs = kws.get('evidence_set', None)
        eval_nd = self._get_nd(kws.get('keep_ND', False))
        eval_not = self._get_not(kws.get('keep_NOT', False))
        # Optionally specify a subset of GOs based on their evidence.
        # By default, return id2gos. User can cause go2geneids to be returned by:
        #   >>> read_ncbi_gene2go(..., go2geneids=True
        for ntgaf in self.associations:
            if eval_nd(ntgaf) and eval_not(ntgaf):
                if evs is None or ntgaf.Evidence_Code in evs:
                    geneid = ntgaf.DB_ID
                    go_id = ntgaf.GO_ID
                    if b_geneid2gos:
                        id2gos[geneid].add(go_id)
                    else:
                        id2gos[go_id].add(geneid)
                    if taxid2asscs is not None:
                        if ntgaf.Taxon:
                            # Only the first taxon of the annotation is used
                            taxid = ntgaf.Taxon[0]
                            taxid2asscs[taxid]['ID2GOs'][geneid].add(go_id)
                            taxid2asscs[taxid]['GO2IDs'][go_id].add(geneid)
        return id2gos  # return simple associations

    @staticmethod
    def _get_nd(keep_nd):
        """Return a predicate that keeps every annotation, or only those without evidence code ND."""
        if keep_nd:
            return lambda nt: True
        return lambda nt: nt.Evidence_Code != 'ND'

    @staticmethod
    def _get_not(keep_not):
        """Return a predicate that keeps every annotation, or only those without a NOT qualifier."""
        if keep_not:
            return lambda nt: True
        return lambda nt: 'NOT' not in nt.Qualifier

    def _init_assn(self, fin_gaf, hdr_only, prt):
        """Read GAF file. Store annotation data in a list of namedtuples."""
        nts = self._read_gaf_nts(fin_gaf, hdr_only)
        # GAF file has been read
        if prt:
            prt.write(" READ {N:9,} associations: {FIN}\n".format(
                N=len(nts), FIN=fin_gaf))
        # If there are illegal GAF lines ...
        if self.datobj:
            if self.datobj.ignored or self.datobj.illegal_lines:
                self.datobj.prt_error_summary(fin_gaf)
        return self.evobj.sort_nts(nts, 'Evidence_Code')

    def _read_gaf_nts(self, fin_gaf, hdr_only):
        """Read GAF file. Store annotation data in a list of namedtuples.

        Lines starting with '!' before the first data line form the header,
        which is stored in self.hdr. Data lines that GafData cannot convert
        are appended to datobj.ignored. Any exception while reading is
        reported on stderr and ends the process with exit status 1.
        """
        nts = []
        ver = None
        hdrobj = GafHdr()
        datobj = None
        lnum = line = -1
        try:
            with open(fin_gaf) as ifstrm:
                for lnum, line in enumerate(ifstrm, 1):
                    # Read header
                    if datobj is None:
                        if line[0] == '!':
                            if ver is None and line[1:13] == 'gaf-version:':
                                ver = line[13:].strip()
                            hdrobj.chkaddhdr(line)
                        else:
                            # First data line: the header is complete
                            self.hdr = hdrobj.get_hdr()
                            if hdr_only:
                                return nts
                            datobj = GafData(ver, **self.kws)
                    # Read data
                    if datobj is not None and line[0] != '!':
                        # print(lnum, line)
                        ntgaf = datobj.get_ntgaf(line, lnum)
                        if ntgaf is not None:
                            nts.append(ntgaf)
                        else:
                            datobj.ignored.append((lnum, line))
            if datobj is None:
                # No data line was seen, so the loop never finalised the
                # header; store it here so header-only files keep it.
                self.hdr = hdrobj.get_hdr()
        except Exception as inst:
            import traceback
            traceback.print_exc()
            sys.stderr.write("\n **FATAL: {MSG}\n\n".format(MSG=str(inst)))
            sys.stderr.write("**FATAL: {FIN}[{LNUM}]:\n{L}".format(FIN=fin_gaf, L=line, LNUM=lnum))
            if datobj is not None:
                datobj.prt_line_detail(sys.stdout, line)
            sys.exit(1)
        self.datobj = datobj
        return nts

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary.

        Annotations whose Qualifier contains 'NOT' are counted separately
        under the key "NOT <evidence code>".
        """
        ctr = cx.Counter()
        for ntgaf in self.associations:
            evidence_code = ntgaf.Evidence_Code
            if 'NOT' in ntgaf.Qualifier:
                ctr["NOT {EV}".format(EV=evidence_code)] += 1
            else:
                ctr[evidence_code] += 1
        self.evobj.prt_ev_cnts(ctr, prt)
def __init__(self, filename=None, log=sys.stdout):
    """Store the filename and log stream and, when a file is given, read its associations.

    filename -- GAF file passed to read_gaf; None leaves the association list empty
    log -- stream stored on the instance as self.log
    """
    self.filename = filename
    self.log = log
    self.evobj = EvidenceCodes()
    if filename is None:
        self.associations = []
    else:
        self.associations = self.read_gaf(filename)
class AnnoReaderBase(object):
    """Reads a Gene Association File. Returns a Python object."""
    # pylint: disable=broad-except,line-too-long,too-many-instance-attributes

    # Default start time used by hms(); taken once, when the class is defined
    tic = timeit.default_timer()

    # Expected values for a Qualifier
    exp_qualifiers = set([
        # Seen in both GAF and gene2go
        'not', 'contributes_to', 'colocalizes_with',
    ])

    # pylint: disable=too-many-instance-attributes
    def __init__(self, name, filename=None, **kws):
        """Store reader settings, then load annotations via _init_associations.

        name -- name of the annotation type
        filename -- annotation file handed to _init_associations
        kws -- 'godag' and 'namespaces' are read here; every keyword
               (e.g. allow_missing_symbol) is forwarded to _init_associations
        """
        self.name = name
        self.filename = filename
        self.godag = kws.get('godag')
        self.namespaces = kws.get('namespaces')
        self.evobj = EvidenceCodes()
        # Read anotation file, store namedtuples:
        #     Gene2GoReader(filename=None, taxids=None):
        #     GafReader(filename=None, hdr_only=False, prt=sys.stdout, allow_missing_symbol=False):
        #     GpadReader(filename=None, hdr_only=False):
        self.hdr = None
        self.datobj = None
        # pylint: disable=no-member
        self.associations = self._init_associations(filename, **kws)
        # assert self.associations, 'NO ANNOTATIONS FOUND: {ANNO}'.format(ANNO=filename)
        assert self.namespaces is None or isinstance(self.namespaces, set)

    def get_desc(self):
        """Get description"""
        return '{NAME} {NSs} {GODAG}'.format(
            NAME=self.name,
            NSs='' if self.namespaces is None else ','.join(self.namespaces),
            GODAG='' if self.godag is None else 'godag')

    # pylint: disable=unused-argument
    def get_associations(self, taxid=None):
        """Get associations"""
        # taxid is for NCBI's gene2gos
        return self.associations

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary."""
        self.evobj.prt_summary_anno2ev(self.associations, prt)

    def get_name(self):
        """Return type of annotation"""
        return self.name

    # pylint: disable=no-self-use
    def get_taxid(self):
        """Return taxid, if one was provided, otherwise return -1"""
        return -1

    def get_ns2assc(self, **kws):
        """Return given associations into 3 (BP, MF, CC) dicts, id2gos"""
        return {
            ns: self._get_id2gos(nts, **kws)
            for ns, nts in self.get_ns2ntsanno(kws.get('taxid')).items()
        }

    # pylint: disable=unused-argument
    def get_ns2ntsanno(self, taxid=None):
        """Split list of annotations into 3 lists: BP, MF, CC"""
        return self._get_ns2ntsanno(self.associations)

    def _get_ns2ntsanno(self, annotations):
        """Split list of annotations into 3 lists: BP, MF, CC"""
        if self.name in {'gpad', 'id2gos'}:
            assert self.godag is not None, "{T}: LOAD godag TO USE {C}::ns2ntsanno".format(
                C=self.__class__.__name__, T=self.name)
        ns2nts = cx.defaultdict(list)
        for nta in annotations:
            ns2nts[nta.NS].append(nta)
        # Keep only the expected namespaces that actually occur
        return {ns: ns2nts[ns] for ns in {'BP', 'MF', 'CC'}.intersection(ns2nts)}

    def get_id2gos_nss(self, **kws):
        """Return all associations in a dict, id2gos, regardless of namespace"""
        return self._get_id2gos(self.associations, **kws)

    def get_id2gos(self, namespace='BP', **kws):
        """Return associations from specified namespace in a dict, id2gos

        When the annotations carry no NS field, the namespace cannot be
        applied: an error is printed and all associations are used.
        """
        # pylint: disable=superfluous-parens
        if self.has_ns():
            assoc = [nt for nt in self.associations if nt.NS == namespace]
            id2gos = self._get_id2gos(assoc, **kws)
            print('{N} IDs in association branch, {NS}'.format(N=len(id2gos), NS=namespace))
            return id2gos
        print('**ERROR: GODAG NOT LOADED. IGNORING namespace({NS})'.format(NS=namespace))
        id2gos = self._get_id2gos(self.associations, **kws)
        # The namespace was ignored above, so report the count over all associations
        print('{N} IDs in all associations'.format(N=len(id2gos)))
        return id2gos

    def has_ns(self):
        """Return True if namespace field, NS exists on annotation namedtuples"""
        # Fail with a readable message instead of a bare StopIteration
        assert self.associations, 'NO ASSOCIATIONS IN file({}): {}'.format(
            self.filename, self.associations)
        return hasattr(next(iter(self.associations)), 'NS')

    def _get_id2gos(self, associations, **kws):
        """Return given associations in a dict, id2gos

        Returns ID -> GO IDs when options.b_geneid2gos is set, otherwise
        GO ID -> IDs.
        """
        options = AnnoOptions(self.evobj, **kws)
        # Default reduction is to remove. For all options, see goatools/anno/opts.py:
        #   * Evidence_Code == ND -> No biological data No biological Data available
        #   * Qualifiers contain NOT
        assc = self.reduce_annotations(associations, options)
        return self.get_dbid2goids(assc) if options.b_geneid2gos else self.get_goid2dbids(assc)

    def _get_namespaces(self, nts):
        """Get the set of namespaces seen in the namedtuples."""
        return set(nt.NS for nt in nts) if self.has_ns() else set()

    # Qualifier (column 4)
    # Flags that modify the interpretation of an annotation one (or more) of NOT, contributes_to, colocalizes_with
    # This field is not mandatory;
    #   * cardinality 0, 1, >1;
    #   * for cardinality >1 use a pipe to separate entries (e.g. NOT|contributes_to)
    def prt_qualifiers(self, prt=sys.stdout):
        """Print Qualifiers: 1,462 colocalizes_with; 1,454 contributes_to; 1,157 not"""
        # 13 not colocalizes_with (TBD: CHK - Seen in gene2go, but not gafs)
        # 4 not contributes_to (TBD: CHK - Seen in gene2go, but not gafs)
        self._prt_qualifiers(self.associations, prt)

    @staticmethod
    def _prt_qualifiers(associations, prt=sys.stdout):
        """Print Qualifiers found in the annotations.

        QUALIFIERS:
            1,462 colocalizes_with
            1,454 contributes_to
            1,157 not
               13 not colocalizes_with (TBD: CHK - Seen in gene2go, but not gafs)
                4 not contributes_to (TBD: CHK - Seen in gene2go, but not gafs)
        """
        prt.write('QUALIFIERS:\n')
        for fld, cnt in cx.Counter(q for nt in associations for q in nt.Qualifier).most_common():
            prt.write(' {N:6,} {FLD}\n'.format(N=cnt, FLD=fld))

    def reduce_annotations(self, annotations, options):
        """Reduce annotations to ones used to identify enrichment (normally exclude ND and NOT)."""
        getfnc_qual_ev = options.getfnc_qual_ev()
        return [
            nt for nt in annotations
            if getfnc_qual_ev(nt.Qualifier, nt.Evidence_Code)
        ]

    @staticmethod
    def get_dbid2goids(associations):
        """Return a dict of DB_ID keys and the set of GO IDs annotated to each."""
        id2gos = cx.defaultdict(set)
        for ntd in associations:
            id2gos[ntd.DB_ID].add(ntd.GO_ID)
        return dict(id2gos)

    @staticmethod
    def get_goid2dbids(associations):
        """Return a dict of GO ID keys and the set of DB_IDs annotated to each."""
        go2ids = cx.defaultdict(set)
        for ntd in associations:
            go2ids[ntd.GO_ID].add(ntd.DB_ID)
        return dict(go2ids)

    def hms(self, msg, tic=None, prt=sys.stdout):
        """Print elapsed time and message. Return the current timer value."""
        if tic is None:
            tic = self.tic
        now = timeit.default_timer()
        hms = str(datetime.timedelta(seconds=(now - tic)))
        prt.write('{HMS}: {MSG}\n'.format(HMS=hms, MSG=msg))
        return now

    def chk_associations(self, fout_err=None):
        """Check that associations are in expected format."""
        pass

    def nts_ev_nd(self):
        """Get annotations where Evidence_code == 'ND' (No biological data)"""
        return [nt for nt in self.associations if nt.Evidence_Code == 'ND']

    def nts_qual_not(self):
        """Get annotations having Qualifiers containing NOT"""
        return [nt for nt in self.associations if self._has_not_qual(nt)]

    def chk_qualifiers(self):
        """Check format of qualifier"""
        if self.name == 'id2gos':
            return
        for ntd in self.associations:
            # print(ntd)
            qual = ntd.Qualifier
            assert isinstance(qual, set), '{NAME}: QUALIFIER MUST BE A LIST: {NT}'.format(
                NAME=self.name, NT=ntd)
            assert qual != set(['']), ntd
            assert qual != set(['-']), ntd
            assert 'always' not in qual, 'SPEC SAID IT WOULD BE THERE'

    @staticmethod
    def _has_not_qual(ntd):
        """Return True if the qualifiers contain a 'NOT'"""
        # Substring test, so a combined qualifier such as 'not colocalizes_with' also matches
        for qual in ntd.Qualifier:
            if 'not' in qual:
                return True
            if 'NOT' in qual:
                return True
        return False
class AnnoReaderBase(object):
    """Reads a Gene Association File. Returns a Python object.

    The annotations are kept as namedtuples in ``self.associations``; the
    methods here filter them and convert them into id-to-GO or GO-to-id dicts.
    ``_init_associations`` is not defined in this class: it is called from
    ``__init__`` and therefore must be supplied by a subclass.
    """
    # pylint: disable=broad-except,line-too-long,too-many-instance-attributes

    # Timer value taken when the class is created; default start time for hms()
    tic = timeit.default_timer()

    # Expected values for a Qualifier
    exp_qualifiers = set([
        # Seen in both GAF and gene2go
        'not',
        'contributes_to',
        'colocalizes_with',
    ])

    # pylint: disable=too-many-instance-attributes
    def __init__(self, name, filename=None, **kws):
        """Store the reader settings and load the annotations.

        Args:
            name: Annotation type. The values 'gpad' and 'id2gos' get special
                handling in _get_ns2ntsanno and chk_qualifiers.
            filename: Annotation file, handed to _init_associations.
            **kws: 'godag' and 'namespaces' are read here; every keyword
                (e.g. allow_missing_symbol) is forwarded to _init_associations.

        Raises:
            AssertionError: If 'namespaces' is given but is not a set.
        """
        # kws: allow_missing_symbol
        self.name = name
        self.filename = filename
        self.godag = kws.get('godag')
        self.namespaces = kws.get('namespaces')
        self.evobj = EvidenceCodes()
        # Read anotation file, store namedtuples:
        #     Gene2GoReader(filename=None, taxids=None):
        #     GafReader(filename=None, hdr_only=False, prt=sys.stdout, allow_missing_symbol=False):
        #     GpadReader(filename=None, hdr_only=False):
        self.hdr = None
        self.datobj = None
        # pylint: disable=no-member
        self.associations = self._init_associations(filename, **kws)
        # assert self.associations, 'NO ANNOTATIONS FOUND: {ANNO}'.format(ANNO=filename)
        assert self.namespaces is None or isinstance(self.namespaces, set)

    def get_desc(self):
        """Get description: the name, the comma-joined namespaces, and 'godag' if one is set."""
        return '{NAME} {NSs} {GODAG}'.format(
            NAME=self.name,
            NSs='' if self.namespaces is None else ','.join(self.namespaces),
            GODAG='' if self.godag is None else 'godag')

    # pylint: disable=unused-argument
    def get_associations(self, taxid=None):
        """Get associations (taxid is accepted but not used here)."""
        # taxid is for NCBI's gene2gos
        return self.associations

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary."""
        self.evobj.prt_summary_anno2ev(self.associations, prt)

    def get_name(self):
        """Return type of annotation"""
        return self.name

    # pylint: disable=no-self-use
    def get_taxid(self):
        """Return taxid, if one was provided, otherwise return -1"""
        return -1

    def get_ns2assc(self, **kws):
        """Return given associations into 3 (BP, MF, CC) dicts, id2gos"""
        ns2nts = self.get_ns2ntsanno(kws.get('taxid'))
        return {ns: self._get_id2gos(nts, **kws) for ns, nts in ns2nts.items()}

    # pylint: disable=unused-argument
    def get_ns2ntsanno(self, taxid=None):
        """Split list of annotations into 3 lists: BP, MF, CC"""
        return self._get_ns2ntsanno(self.associations)

    def _get_ns2ntsanno(self, annotations):
        """Split list of annotations into 3 lists: BP, MF, CC

        Only namespaces that actually occur in the annotations become keys.

        Raises:
            AssertionError: If name is 'gpad' or 'id2gos' and no godag is set.
        """
        if self.name in {'gpad', 'id2gos'}:
            assert self.godag is not None, "{T}: LOAD godag TO USE {C}::ns2ntsanno".format(
                C=self.__class__.__name__, T=self.name)
        ns2nts = cx.defaultdict(list)
        for nta in annotations:
            ns2nts[nta.NS].append(nta)
        return {ns: ns2nts[ns] for ns in {'BP', 'MF', 'CC'}.intersection(ns2nts)}

    def get_id2gos_nss(self, **kws):
        """Return all associations in a dict, id2gos, regardless of namespace"""
        return self._get_id2gos(self.associations, **kws)

    def get_id2gos(self, namespace='BP', **kws):
        """Return associations from specified namespace in a dict, id2gos

        When the annotations carry no NS field, an error line is printed and
        all associations are used, ignoring the requested namespace.
        """
        # pylint: disable=superfluous-parens
        if self.has_ns():
            assoc = [nt for nt in self.associations if nt.NS == namespace]
        else:
            print('**ERROR: GODAG NOT LOADED. IGNORING namespace({NS})'.format(NS=namespace))
            assoc = self.associations
        id2gos = self._get_id2gos(assoc, **kws)
        print('{N} IDs in association branch, {NS}'.format(N=len(id2gos), NS=namespace))
        return id2gos

    def has_ns(self):
        """Return True if namespace field, NS exists on annotation namedtuples

        Only the first annotation is inspected. Returns False when there are
        no annotations at all, rather than leaking a StopIteration.
        """
        first = next(iter(self.associations), None)
        return first is not None and hasattr(first, 'NS')

    def _get_id2gos(self, associations, **kws):
        """Return given associations in a dict, id2gos

        The direction of the returned dict (id-to-GOs or GO-to-ids) is chosen
        by the b_geneid2gos option.
        """
        options = AnnoOptions(self.evobj, **kws)
        # Default reduction is to remove. For all options, see goatools/anno/opts.py:
        #   * Evidence_Code == ND -> No biological data No biological Data available
        #   * Qualifiers contain NOT
        assc = self.reduce_annotations(associations, options)
        if options.b_geneid2gos:
            return self.get_dbid2goids(assc)
        return self.get_goid2dbids(assc)

    def _get_namespaces(self, nts):
        """Get the set of namespaces seen in the namedtuples."""
        return set(nt.NS for nt in nts) if self.has_ns() else set()

    # Qualifier (column 4)
    # Flags that modify the interpretation of an annotation one (or more) of NOT, contributes_to, colocalizes_with
    # This field is not mandatory;
    #     * cardinality 0, 1, >1;
    #     * for cardinality >1 use a pipe to separate entries (e.g. NOT|contributes_to)
    def prt_qualifiers(self, prt=sys.stdout):
        """Print Qualifiers: 1,462 colocalizes_with; 1,454 contributes_to; 1,157 not"""
        # 13 not colocalizes_with (TBD: CHK - Seen in gene2go, but not gafs)
        #  4 not contributes_to   (TBD: CHK - Seen in gene2go, but not gafs)
        self._prt_qualifiers(self.associations, prt)

    @staticmethod
    def _prt_qualifiers(associations, prt=sys.stdout):
        """Print Qualifiers found in the annotations, most frequent first.

        QUALIFIERS:
            1,462 colocalizes_with
            1,454 contributes_to
            1,157 not
               13 not colocalizes_with (TBD: CHK - Seen in gene2go, but not gafs)
                4 not contributes_to   (TBD: CHK - Seen in gene2go, but not gafs)
        """
        prt.write('QUALIFIERS:\n')
        counts = cx.Counter(q for nt in associations for q in nt.Qualifier)
        for fld, cnt in counts.most_common():
            prt.write(' {N:6,} {FLD}\n'.format(N=cnt, FLD=fld))

    def reduce_annotations(self, annotations, options):
        """Reduce annotations to ones used to identify enrichment (normally exclude ND and NOT)."""
        # The options object supplies the keep/discard predicate
        getfnc_qual_ev = options.getfnc_qual_ev()
        return [nt for nt in annotations if getfnc_qual_ev(nt.Qualifier, nt.Evidence_Code)]

    @staticmethod
    def get_dbid2goids(associations):
        """Return a dict mapping each DB_ID to the set of GO_IDs annotated to it."""
        id2gos = cx.defaultdict(set)
        for ntd in associations:
            id2gos[ntd.DB_ID].add(ntd.GO_ID)
        return dict(id2gos)

    @staticmethod
    def get_goid2dbids(associations):
        """Return a dict mapping each GO_ID to the set of DB_IDs annotated with it."""
        go2ids = cx.defaultdict(set)
        for ntd in associations:
            go2ids[ntd.GO_ID].add(ntd.DB_ID)
        return dict(go2ids)

    def hms(self, msg, tic=None, prt=sys.stdout):
        """Print elapsed time and message; return the current timer value.

        Elapsed time is measured from tic, or from the class-level tic if
        none is given.
        """
        if tic is None:
            tic = self.tic
        now = timeit.default_timer()
        hms = str(datetime.timedelta(seconds=(now - tic)))
        prt.write('{HMS}: {MSG}\n'.format(HMS=hms, MSG=msg))
        return now

    def chk_associations(self, fout_err=None):
        """Check that associations are in expected format (no checks in this class)."""

    def nts_ev_nd(self):
        """Get annotations where Evidence_code == 'ND' (No biological data)"""
        return [nt for nt in self.associations if nt.Evidence_Code == 'ND']

    def nts_qual_not(self):
        """Get annotations having Qualifiers containing NOT"""
        return [nt for nt in self.associations if self._has_not_qual(nt)]

    def chk_qualifiers(self):
        """Check format of qualifier (skipped when name is 'id2gos').

        Raises:
            AssertionError: If a Qualifier is not a set, is a set holding only
                '' or '-', or contains 'always'.
        """
        if self.name == 'id2gos':
            return
        for ntd in self.associations:
            # print(ntd)
            qual = ntd.Qualifier
            # The check is for a set, so the message says SET
            assert isinstance(qual, set), '{NAME}: QUALIFIER MUST BE A SET: {NT}'.format(
                NAME=self.name, NT=ntd)
            assert qual != set(['']), ntd
            assert qual != set(['-']), ntd
            assert 'always' not in qual, 'SPEC SAID IT WOULD BE THERE'

    @staticmethod
    def _has_not_qual(ntd):
        """Return True if the qualifiers contain a 'NOT'"""
        # Substring test, so combined qualifiers such as 'not colocalizes_with' match too
        return any('not' in qual or 'NOT' in qual for qual in ntd.Qualifier)
def __init__(self, filename=None, log=sys.stdout):
    """Remember the file name and log stream, then load the GAF if one was named.

    With no filename, the associations are an empty list and nothing is read.
    """
    self.filename = filename
    self.log = log
    self.evobj = EvidenceCodes()
    # read_gaf is only attempted when a file was actually given
    if filename is None:
        self.associations = []
    else:
        self.associations = self.read_gaf(filename)
def test_ev():
    """Check that every evidence code found for 'gene2go' is known to EvidenceCodes."""
    seen_codes = _get_evidencecodes('gene2go')
    known = EvidenceCodes()
    # Codes seen in the data but absent from the code2nt lookup
    missing = seen_codes.difference(known.code2nt)
    assert not missing, 'MISSING({EV})'.format(EV=missing)
class GafReader(object):
    """Reads a Gene Annotation File (GAF). Returns a Python object.

    Each data line becomes a namedtuple whose fields are named after the GAF
    columns listed below; the namedtuples are kept in ``self.associations``.
    """

    gafhdr = [
        #                   Col Req?     Cardinality    Example
        #                   --- -------- -------------- -----------------
        'DB',             #  0 required 1              UniProtKB
        'DB_ID',          #  1 required 1              P12345
        'DB_Symbol',      #  2 required 1              PHO3
        'Qualifier',      #  3 optional 0 or greater   NOT
        'GO_ID',          #  4 required 1              GO:0003993
        'DB_Reference',   #  5 required 1 or greater   PMID:2676709
        'Evidence_Code',  #  6 required 1              IMP
        'With_From',      #  7 optional 0 or greater   GO:0000346
        'Aspect',         #  8 required 1              F
        'DB_Name',        #  9 optional 0 or 1         Toll-like receptor 4
        'DB_Synonym',     # 10 optional 0 or greater   hToll|Tollbooth
        'DB_Type',        # 11 required 1              protein
        'Taxon',          # 12 required 1 or 2         taxon:9606
        'Date',           # 13 required 1              20090118
        'Assigned_By',    # 14 required 1              SGD
    ]

    gafhdr2 = [
        #                          Col Required Cardinality  Example
        #                          --- -------- ------------ -------------------
        'Annotation_Extension',  # 15 optional 0 or greater part_of(CL:0000576)
        'Gene_Product_Form_ID',  # 16 optional 0 or 1       UniProtKB:P12345-2
    ]

    gaf_columns = {
        "2.1": gafhdr + gafhdr2,  # !gaf-version: 2.1
        "2.0": gafhdr + gafhdr2,  # !gaf-version: 2.0
        "1.0": gafhdr,            # !gaf-version: 1.0
    }

    # Expected numbers of columns for various versions
    gaf_numcol = {"2.1": 17, "2.0": 17, "1.0": 15}

    # Expected values for a Qualifier
    exp_qualifiers = set(
        ['NOT', 'contributes_to', 'Contributes_to', 'colocalizes_with'])

    def __init__(self, filename=None, hdr_only=False, prt=sys.stdout):
        """Read the GAF file, if one is given.

        Args:
            filename: GAF file to read; with None nothing is read and the
                associations are an empty list.
            hdr_only: Passed to read_gaf; stop after the header lines.
            prt: Stream for the read summary, or None for no summary.
        """
        self.filename = filename
        self.evobj = EvidenceCodes()
        # Initialize associations and header information
        self.hdr = None
        self.associations = self.read_gaf(filename, hdr_only, prt) if filename is not None else []

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary.

        Annotations carrying the NOT qualifier are counted separately, under
        the key "NOT <evidence code>".
        """
        ctr = cx.Counter()
        for ntgaf in self.associations:
            evidence_code = ntgaf.Evidence_Code
            if 'NOT' in ntgaf.Qualifier:
                ctr["NOT {EV}".format(EV=evidence_code)] += 1
            else:
                ctr[evidence_code] += 1
        self.evobj.prt_ev_cnts(ctr, prt)

    def _get_ntgaf(self, ntgafobj, flds, ver):
        """Convert fields from string to preferred format for GAF ver 2.1 and 2.0.

        Args:
            ntgafobj: namedtuple class whose fields match gaf_columns[ver].
            flds: The tab-split column values of one GAF line.
            ver: GAF version string, e.g. '2.1'.

        Returns:
            A ntgafobj instance. Multi-valued columns are sets, except Taxon,
            which is a list of ints.

        Raises:
            AssertionError: If a cardinality, required-value or qualifier
                check fails.
        """
        # Cardinality
        is_set = False
        is_list = True
        qualifiers = self._rd_fld_vals("Qualifier", flds[3], is_set)
        db_reference = self._rd_fld_vals("DB_Reference", flds[5], is_set, 1)
        with_from = self._rd_fld_vals("With_From", flds[7], is_set)
        db_name = self._rd_fld_vals("DB_Name", flds[9], is_set, 0, 1)
        db_synonym = self._rd_fld_vals("DB_Synonym", flds[10], is_set)
        taxons = self._rd_fld_vals("Taxon", flds[12], is_list, 1, 2)
        self._chk_qty_eq_1(flds, [0, 1, 2, 4, 6, 8, 11, 13, 14])
        # Additional Formatting
        taxons = self._do_taxons(taxons)
        self._chk_qualifier(qualifiers)
        # Create list of values
        gafvals = [
            flds[0],       # 0  DB
            flds[1],       # 1  DB_ID
            flds[2],       # 2  DB_Symbol
            qualifiers,    # 3  Qualifier
            flds[4],       # 4  GO_ID
            db_reference,  # 5  DB_Reference
            flds[6],       # 6  Evidence_Code
            with_from,     # 7  With_From
            flds[8],       # 8  Aspect
            db_name,       # 9  DB_Name
            db_synonym,    # 10 DB_Synonym
            flds[11],      # 11 DB_Type
            taxons,        # 12 Taxon
            flds[13],      # 13 Date (was flds[12], which is the Taxon column)
            flds[14],      # 14 Assigned_By (was flds[13], which is the Date column)
        ]
        # Version 2.x has these additional fields not found in v1.0
        if ver[0] == '2':
            gafvals += [
                self._rd_fld_vals("Annotation_Extension", flds[15], is_set),
                self._rd_fld_vals("Gene_Product_Form_ID", flds[16], is_set),
            ]
        return ntgafobj._make(gafvals)

    def _rd_fld_vals(self, name, val, set_list_ft=True, qty_min=0, qty_max=None):
        """Further split a GAF value within a single field.

        Args:
            name: Column name, used only in error messages.
            val: Raw column text; entries are separated by '|'.
            set_list_ft: True returns a list, False returns a set.
            qty_min: Minimum number of entries allowed.
            qty_max: Maximum number of entries allowed, or None for no limit.

        Raises:
            AssertionError: If the number of entries is outside the limits.
        """
        if not val and qty_min == 0:
            return [] if set_list_ft else set()
        vals = val.split('|')  # Use a pipe to separate entries
        num_vals = len(vals)
        assert num_vals >= qty_min, \
            "FIELD({F}): MIN QUANTITY({Q}) WASN'T MET: {V} in {GAF}".format(
                F=name, Q=qty_min, V=vals, GAF=self.filename)
        if qty_max is not None:
            assert num_vals <= qty_max, \
                "FIELD({F}): MAX QUANTITY({Q}) EXCEEDED: {V} in {GAF}".format(
                    F=name, Q=qty_max, V=vals, GAF=self.filename)
        return vals if set_list_ft else set(vals)

    def read_gaf(self, fin_gaf, hdr_only, prt):
        """Read GAF file. HTTP address okay. GZIPPED/BZIPPED file okay.

        Lines starting with '!' before the first data line form the header;
        the 'gaf-version:' header line selects the column layout.

        Args:
            fin_gaf: File to open with nopen.
            hdr_only: If True, return an empty list once the header has been read.
            prt: Stream for the read summary, or None for no summary.

        Returns:
            The annotations as ordered by evobj.sort_nts on 'Evidence_Code'.
        """
        ga_lst = []
        ver = None
        ntgafobj = None
        exp_numcol = None
        hdrobj = GafHdr()
        ifstrm = nopen(fin_gaf)
        for line in ifstrm:
            # Read header
            if ntgafobj is None:
                # startswith() also copes with an empty line, unlike line[0]
                if line.startswith('!'):
                    if line[1:13] == 'gaf-version:':
                        ver = line[13:].strip()
                    hdrobj.chkaddhdr(line)
                else:
                    # First data line: the header is complete
                    self.hdr = hdrobj.get_hdr()
                    if hdr_only:
                        return ga_lst
                    ntgafobj = cx.namedtuple("ntgafobj", " ".join(self.gaf_columns[ver]))
                    exp_numcol = self.gaf_numcol[ver]
            # Read data
            if ntgafobj is not None:
                flds = self._split_line(line, exp_numcol)
                ntgaf = self._get_ntgaf(ntgafobj, flds, ver)
                ga_lst.append(ntgaf)
        # GAF file has been read
        if prt is not None:
            readmsg = " READ {N:,} associations: {FIN}\n"
            prt.write(readmsg.format(N=len(ga_lst), FIN=fin_gaf))
        return self.evobj.sort_nts(ga_lst, 'Evidence_Code')

    @staticmethod
    def _split_line(line, exp_numcol):
        """Split line into field values.

        Raises:
            AssertionError: If the number of tab-separated fields is not exp_numcol.
        """
        line = line.rstrip('\r\n')
        flds = line.split('\t')
        assert len(flds) == exp_numcol, "UNEXPECTED NUMBER OF COLUMNS"
        return flds

    def _chk_qualifier(self, qualifiers):
        """Check that qualifiers are expected values."""
        # http://geneontology.org/page/go-annotation-conventions#qual
        for qual in qualifiers:
            assert qual in self.exp_qualifiers, "UNEXPECTED QUALIFIER({Q}) IN {GAF}".format(
                Q=qual, GAF=self.filename)

    @staticmethod
    def _chk_qty_eq_1(flds, col_lst):
        """Check that these fields have only one value: required 1."""
        # Only presence is verified: an empty column fails the check
        for col in col_lst:
            assert flds[col], "UNEXPECTED REQUIRED VALUE({V}) AT INDEX({R})".format(
                V=flds[col], R=col)

    @staticmethod
    def _do_taxons(taxons):
        """Convert 'taxon:NNNN' strings to ints; one or two taxons are required."""
        taxons = [int(v[6:]) for v in taxons]  # strip "taxon:"
        assert len(taxons) in (1, 2)
        return taxons
class GafReader(object):
    """Reads a Gene Annotation File (GAF). Returns a Python object.

    Each data line becomes a namedtuple whose fields are named after the GAF
    columns listed below; the namedtuples are kept in ``self.associations``.
    """

    gafhdr = [
        #                   Col Req?     Cardinality    Example
        #                   --- -------- -------------- -----------------
        "DB",             #  0 required 1              UniProtKB
        "DB_ID",          #  1 required 1              P12345
        "DB_Symbol",      #  2 required 1              PHO3
        "Qualifier",      #  3 optional 0 or greater   NOT
        "GO_ID",          #  4 required 1              GO:0003993
        "DB_Reference",   #  5 required 1 or greater   PMID:2676709
        "Evidence_Code",  #  6 required 1              IMP
        "With_From",      #  7 optional 0 or greater   GO:0000346
        "Aspect",         #  8 required 1              F
        "DB_Name",        #  9 optional 0 or 1         Toll-like receptor 4
        "DB_Synonym",     # 10 optional 0 or greater   hToll|Tollbooth
        "DB_Type",        # 11 required 1              protein
        "Taxon",          # 12 required 1 or 2         taxon:9606
        "Date",           # 13 required 1              20090118
        "Assigned_By",    # 14 required 1              SGD
    ]

    gafhdr2 = [
        #                          Col Required Cardinality  Example
        #                          --- -------- ------------ -------------------
        "Annotation_Extension",  # 15 optional 0 or greater part_of(CL:0000576)
        "Gene_Product_Form_ID",  # 16 optional 0 or 1       UniProtKB:P12345-2
    ]

    gaf_columns = {
        "2.1": gafhdr + gafhdr2,  # !gaf-version: 2.1
        "2.0": gafhdr + gafhdr2,  # !gaf-version: 2.0
        "1.0": gafhdr,            # !gaf-version: 1.0
    }

    # Expected numbers of columns for various versions
    gaf_numcol = {"2.1": 17, "2.0": 17, "1.0": 15}

    # Expected values for a Qualifier
    exp_qualifiers = set(["NOT", "contributes_to", "colocalizes_with"])

    def __init__(self, filename=None, log=sys.stdout):
        """Read the GAF file, if one is given.

        Args:
            filename: GAF file to read; with None nothing is read and the
                associations are an empty list.
            log: Stream that receives the read summary.
        """
        self.filename = filename
        self.log = log
        self.evobj = EvidenceCodes()
        self.associations = self.read_gaf(filename) if filename is not None else []

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary.

        Annotations carrying the NOT qualifier are counted separately, under
        the key "NOT <evidence code>".
        """
        ctr = cx.Counter()
        for ntgaf in self.associations:
            evidence_code = ntgaf.Evidence_Code
            if "NOT" in ntgaf.Qualifier:
                ctr["NOT {EV}".format(EV=evidence_code)] += 1
            else:
                ctr[evidence_code] += 1
        self.evobj.prt_ev_cnts(ctr, prt)

    def _get_ntgaf(self, ntgafobj, flds, ver):
        """Convert fields from string to preferred format for GAF ver 2.1 and 2.0.

        Args:
            ntgafobj: namedtuple class whose fields match gaf_columns[ver].
            flds: The tab-split column values of one GAF line.
            ver: GAF version string, e.g. "2.1".

        Returns:
            A ntgafobj instance. Multi-valued columns are sets, except Taxon,
            which is a list of ints.

        Raises:
            AssertionError: If a cardinality, required-value or qualifier
                check fails.
        """
        # Cardinality
        is_set = False
        is_list = True
        qualifiers = self._rd_fld_vals("Qualifier", flds[3], is_set)
        db_reference = self._rd_fld_vals("DB_Reference", flds[5], is_set, 1)
        with_from = self._rd_fld_vals("With_From", flds[7], is_set)
        db_name = self._rd_fld_vals("DB_Name", flds[9], is_set, 0, 1)
        db_synonym = self._rd_fld_vals("DB_Synonym", flds[10], is_set)
        taxons = self._rd_fld_vals("Taxon", flds[12], is_list, 1, 2)
        self._chk_qty_eq_1(flds, [0, 1, 2, 4, 6, 8, 11, 13, 14])
        # Additional Formatting
        taxons = self._do_taxons(taxons)
        self._chk_qualifier(qualifiers)
        # Create list of values
        gafvals = [
            flds[0],       # 0  DB
            flds[1],       # 1  DB_ID
            flds[2],       # 2  DB_Symbol
            qualifiers,    # 3  Qualifier
            flds[4],       # 4  GO_ID
            db_reference,  # 5  DB_Reference
            flds[6],       # 6  Evidence_Code
            with_from,     # 7  With_From
            flds[8],       # 8  Aspect
            db_name,       # 9  DB_Name
            db_synonym,    # 10 DB_Synonym
            flds[11],      # 11 DB_Type
            taxons,        # 12 Taxon
            flds[13],      # 13 Date (was flds[12], which is the Taxon column)
            flds[14],      # 14 Assigned_By (was flds[13], which is the Date column)
        ]
        # Version 2.x has these additional fields not found in v1.0
        if ver[0] == "2":
            gafvals += [
                self._rd_fld_vals("Annotation_Extension", flds[15], is_set),
                self._rd_fld_vals("Gene_Product_Form_ID", flds[16], is_set),
            ]
        return ntgafobj._make(gafvals)

    @staticmethod
    def _rd_fld_vals(name, val, set_list_ft=True, qty_min=0, qty_max=None):
        """Further split a GAF value within a single field.

        Args:
            name: Column name, used only in error messages.
            val: Raw column text; entries are separated by "|".
            set_list_ft: True returns a list, False returns a set.
            qty_min: Minimum number of entries allowed.
            qty_max: Maximum number of entries allowed, or None for no limit.

        Raises:
            AssertionError: If the number of entries is outside the limits.
        """
        if not val and qty_min == 0:
            return [] if set_list_ft else set()
        vals = val.split("|")  # Use a pipe to separate entries
        num_vals = len(vals)
        assert num_vals >= qty_min, "FLD({F}): MIN QUANTITY({Q}) NOT MET: {V}".format(
            F=name, Q=qty_min, V=vals)
        if qty_max is not None:
            assert num_vals <= qty_max, "FLD({F}): MAX QUANTITY({Q}) EXCEEDED: {V}".format(
                F=name, Q=qty_max, V=vals)
        return vals if set_list_ft else set(vals)

    def read_gaf(self, fin_gaf):
        """Read GAF file. HTTP address okay. GZIPPED/BZIPPED file okay.

        The "!gaf-version:" line selects the column layout. Data lines seen
        before that line are skipped, as are all other lines starting with "!".

        Returns:
            The annotations as ordered by evobj.sort_nts on "Evidence_Code".
        """
        ga_lst = []
        ifstrm = nopen(fin_gaf)
        ver = None
        ntgafobj = None
        exp_numcol = None
        for line in ifstrm:
            if ntgafobj is not None and not line.startswith("!"):
                # Data line, and the column layout is already known
                flds = self._split_line(line, exp_numcol)
                ntgaf = self._get_ntgaf(ntgafobj, flds, ver)
                ga_lst.append(ntgaf)
            elif ntgafobj is None and line.startswith("!gaf-version:"):
                ver = line[13:].strip()
                ntgafobj = cx.namedtuple("ntgafobj", " ".join(self.gaf_columns[ver]))
                exp_numcol = self.gaf_numcol[ver]
        self.log.write(" READ {N:,} associations: {FIN}\n".format(N=len(ga_lst), FIN=fin_gaf))
        ga_lst = self.evobj.sort_nts(ga_lst, "Evidence_Code")
        return ga_lst

    @staticmethod
    def _split_line(line, exp_numcol):
        """Split line into field values.

        Raises:
            AssertionError: If the number of tab-separated fields is not exp_numcol.
        """
        line = line.rstrip("\r\n")
        flds = line.split("\t")
        assert len(flds) == exp_numcol, "UNEXPECTED NUMBER OF COLUMNS"
        return flds

    def _chk_qualifier(self, qualifiers):
        """Check that qualifiers are expected values."""
        # http://geneontology.org/page/go-annotation-conventions#qual
        for qual in qualifiers:
            assert qual in self.exp_qualifiers, "UNEXPECTED QUALIFIER({Q})".format(Q=qual)

    @staticmethod
    def _chk_qty_eq_1(flds, col_lst):
        """Check that these fields have only one value: required 1."""
        # Only presence is verified: an empty column fails the check
        for col in col_lst:
            assert flds[col], "UNEXPECTED REQUIRED VALUE({V}) AT INDEX({R})".format(
                V=flds[col], R=col)

    @staticmethod
    def _do_taxons(taxons):
        """Convert "taxon:NNNN" strings to ints; one or two taxons are required."""
        taxons = [int(v[6:]) for v in taxons]  # strip "taxon:"
        assert len(taxons) in (1, 2)
        return taxons