class GafReader(object):
    """Read a Gene Annotation File (GAF) into a list of association records.

    Attributes set by __init__:
        kws: keyword options, restricted to the names listed in exp_kwdct.
        filename: GAF file name given by the caller (may be None).
        evobj: EvidenceCodes instance used to sort records and print counts.
        hdr: value returned by GafHdr.get_hdr(), or None if no file was read.
        associations: records returned by read_gaf(), or [] if filename is None.
    """

    # Names of the keyword arguments that are passed on to GafData
    exp_kwdct = set(['allow_missing_symbol'])

    def __init__(self, filename=None, hdr_only=False, prt=sys.stdout, **kws):
        """Store the options and, if a filename is given, read the GAF file.

        Args:
            filename: path of the GAF file, or None to create an empty reader.
            hdr_only: passed to read_gaf; if true only the header is read.
            prt: stream for the read summary, or None to suppress it.
            **kws: optional settings; names not in exp_kwdct are dropped.
        """
        # kws: allow_missing_symbol
        self.kws = {k: v for k, v in kws.items() if k in self.exp_kwdct}
        self.filename = filename
        self.evobj = EvidenceCodes()
        # Initialize associations and header information
        self.hdr = None
        self.associations = self.read_gaf(filename, hdr_only, prt) if filename is not None else []

    def read_gaf(self, fin_gaf, hdr_only, prt):
        """Read a GAF file and return its associations sorted by evidence code.

        Args:
            fin_gaf: path of the GAF file; opened with the built-in open().
            hdr_only: if true, stop at the first non-header line and return an
                empty list; only self.hdr is filled in.
            prt: stream for the read summary and for line details on failure,
                or None to suppress the summary.

        Returns:
            The records produced by GafData.get_ntgaf, ordered by
            self.evobj.sort_nts(nts, 'Evidence_Code').

        Any exception raised while reading is reported on stderr together with
        the current line, after which the process exits with status 1.
        """
        nts = []
        ver = None
        hdrobj = GafHdr()
        datobj = None
        lnum = line = -1  # placeholders reported if the failure precedes the first line
        ignored = []
        try:
            with open(fin_gaf) as ifstrm:
                for lnum, line in enumerate(ifstrm, 1):
                    # Read header
                    if datobj is None:
                        if line[0] == '!':
                            if ver is None and line[1:13] == 'gaf-version:':
                                ver = line[13:].strip()
                            hdrobj.chkaddhdr(line)
                        else:
                            # First non-header line: the header is complete
                            self.hdr = hdrobj.get_hdr()
                            if hdr_only:
                                return nts
                            datobj = GafData(ver, **self.kws)
                    # Read data ('!' lines after the header are skipped)
                    if datobj is not None and line[0] != '!':
                        ntgaf = datobj.get_ntgaf(line)
                        if ntgaf is not None:
                            nts.append(ntgaf)
                        else:
                            ignored.append((lnum, line))
            # A file holding only header lines never reaches the branch above
            # that stores the header, so store it here.
            if datobj is None:
                self.hdr = hdrobj.get_hdr()
        except Exception as inst:
            import traceback
            traceback.print_exc()
            sys.stderr.write(
                "\n **FATAL in read_gaf: {MSG}\n\n".format(MSG=str(inst)))
            sys.stderr.write("**FATAL: {FIN}[{LNUM}]:\n{L}".format(FIN=fin_gaf, L=line, LNUM=lnum))
            if datobj is not None:
                datobj.prt_line_detail(prt, line)
            sys.exit(1)
        # GAF file has been read
        self._prt_read_summary(prt, fin_gaf, nts, datobj, ignored)
        return self.evobj.sort_nts(nts, 'Evidence_Code')

    def _prt_read_summary(self, prt, fin_gaf, nts, datobj, ignored):
        """Print a summary about the GAF file that was read.

        The ignored lines, if any, are written to a log file even when prt is
        None; prt only controls the two summary lines.
        """
        fout_log = self._prt_ignored_lines(ignored, datobj, fin_gaf) if ignored else None
        if prt is not None:
            prt.write(" READ {N:9,} associations: {FIN}\n".format(
                N=len(nts), FIN=fin_gaf))
            if ignored:
                prt.write(" IGNORED {N:9,} associations: {FIN}\n".format(
                    N=len(ignored), FIN=fout_log))

    def _prt_ignored_lines(self, ignored, datobj, fin_gaf):
        """Write the ignored lines to '<fin_gaf>.log' and return that file name."""
        fout_log = "{}.log".format(fin_gaf)
        with open(fout_log, 'w') as prt:
            for lnum, line in ignored:
                self.prt_ignore_line(prt, fin_gaf, line, lnum)
                datobj.prt_line_detail(prt, line)
                prt.write("\n")
        return fout_log

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary.

        Counts each association under its evidence code, or under
        'NOT <code>' when 'NOT' is among its qualifiers, and hands the counts
        to self.evobj.prt_ev_cnts.
        """
        ctr = cx.Counter()
        for ntgaf in self.associations:
            evidence_code = ntgaf.Evidence_Code
            # The two cases are exhaustive, so no third branch is needed
            if 'NOT' not in ntgaf.Qualifier:
                ctr[evidence_code] += 1
            else:
                ctr["NOT {EV}".format(EV=evidence_code)] += 1
        self.evobj.prt_ev_cnts(ctr, prt)

    @staticmethod
    def prt_ignore_line(prt, fin_gaf, line, lnum):
        """Print a message saying that we are ignoring an association line.

        Only the base name of fin_gaf appears in the message.
        """
        prt.write(
            "**WARNING: BADLY FORMATTED LINE. IGNORED {FIN}[{LNUM}]:\n{L}\n".
            format(FIN=os.path.basename(fin_gaf), L=line, LNUM=lnum))
class GafReader(object):
    """Read a Gene Annotation File (GAF) and build ID/GO association maps.

    Attributes set by __init__:
        kws: keyword options, restricted to the names listed in exp_kwdct.
        filename: GAF file name given by the caller (may be None).
        evobj: EvidenceCodes instance used to sort records and print counts.
        hdr: value returned by GafHdr.get_hdr(), or None if no file was read.
        datobj: the GafData object used to parse the data lines, or None.
        associations: records read from the file, or [] if filename is None.
    """

    # Names of the keyword arguments that are passed on to GafData
    exp_kwdct = set(['allow_missing_symbol'])

    def __init__(self, filename=None, hdr_only=False, prt=sys.stdout, **kws):
        """Store the options and, if a filename is given, read the GAF file.

        Args:
            filename: path of the GAF file, or None to create an empty reader.
            hdr_only: if true only the header is read.
            prt: stream for the read summary; a false value suppresses it.
            **kws: optional settings; names not in exp_kwdct are dropped.
        """
        # kws: allow_missing_symbol
        self.kws = {k: v for k, v in kws.items() if k in self.exp_kwdct}
        self.filename = filename
        self.evobj = EvidenceCodes()
        # Initialize associations and header information
        self.hdr = None
        self.datobj = None
        self.associations = self._init_assn(
            filename, hdr_only, prt) if filename is not None else []

    def read_gaf(self, **kws):
        """Return the stored associations as a dict of sets.

        Keyword Args:
            go2geneids: if true, map GO ID -> set of DB IDs; otherwise (the
                default) map DB ID -> set of GO IDs.
            evidence_set: if given, keep only records whose Evidence_Code is
                in this container.
            keep_ND: if true, keep records whose Evidence_Code is 'ND'.
            keep_NOT: if true, keep records having 'NOT' in their Qualifier.
            taxid2asscs: optional mapping that is filled in place, for records
                with a non-empty Taxon, as
                taxid2asscs[taxid]['ID2GOs'][DB_ID] and
                taxid2asscs[taxid]['GO2IDs'][GO_ID]. Missing keys are not
                created here, so the mapping must create them on access
                (e.g. nested defaultdicts ending in set).

        Returns:
            A collections.defaultdict(set) holding the selected associations.
        """
        # Simple associations
        id2gos = cx.defaultdict(set)
        # keyword arguments for choosing which GO IDs to keep
        # Optional detailed associations split by taxid and having both ID2GOs & GO2IDs
        taxid2asscs = kws.get('taxid2asscs', None)
        b_geneid2gos = not kws.get('go2geneids', False)
        evs = kws.get('evidence_set', None)
        eval_nd = self._get_nd(kws.get('keep_ND', False))
        eval_not = self._get_not(kws.get('keep_NOT', False))
        # Optionally specify a subset of GOs based on their evidence.
        # By default, return id2gos. The caller can have go2geneids returned
        # instead by passing go2geneids=True to this method.
        for ntgaf in self.associations:
            if not (eval_nd(ntgaf) and eval_not(ntgaf)):
                continue
            if evs is not None and ntgaf.Evidence_Code not in evs:
                continue
            geneid = ntgaf.DB_ID
            go_id = ntgaf.GO_ID
            if b_geneid2gos:
                id2gos[geneid].add(go_id)
            else:
                id2gos[go_id].add(geneid)
            if taxid2asscs is not None and ntgaf.Taxon:
                # Only the first taxon of the record is used as the key
                taxid = ntgaf.Taxon[0]
                taxid2asscs[taxid]['ID2GOs'][geneid].add(go_id)
                taxid2asscs[taxid]['GO2IDs'][go_id].add(geneid)
        return id2gos  # return simple associations

    @staticmethod
    def _get_nd(keep_nd):
        """Return a predicate accepting all records, or only non-'ND' records."""
        if keep_nd:
            return lambda nt: True
        return lambda nt: nt.Evidence_Code != 'ND'

    @staticmethod
    def _get_not(keep_not):
        """Return a predicate accepting all records, or only those without 'NOT'."""
        if keep_not:
            return lambda nt: True
        return lambda nt: 'NOT' not in nt.Qualifier

    def _init_assn(self, fin_gaf, hdr_only, prt):
        """Read the GAF file, report on it and return the sorted records.

        Writes a one-line read count to prt (if prt is true) and asks the
        GafData object for an error summary when it recorded ignored or
        illegal lines.
        """
        nts = self._read_gaf_nts(fin_gaf, hdr_only)
        # GAF file has been read
        if prt:
            prt.write(" READ {N:9,} associations: {FIN}\n".format(
                N=len(nts), FIN=fin_gaf))
        # If there are illegal GAF lines ...
        if self.datobj:
            if self.datobj.ignored or self.datobj.illegal_lines:
                self.datobj.prt_error_summary(fin_gaf)
        return self.evobj.sort_nts(nts, 'Evidence_Code')

    def _read_gaf_nts(self, fin_gaf, hdr_only):
        """Read GAF file. Store annotation data in a list of namedtuples.

        Args:
            fin_gaf: path of the GAF file; opened with the built-in open().
            hdr_only: if true, stop at the first non-header line and return an
                empty list; only self.hdr is filled in.

        Returns:
            The records produced by GafData.get_ntgaf, in file order.

        Any exception raised while reading is reported on stderr together with
        the current line, after which the process exits with status 1.
        """
        nts = []
        ver = None
        hdrobj = GafHdr()
        datobj = None
        lnum = line = -1  # placeholders reported if the failure precedes the first line
        try:
            with open(fin_gaf) as ifstrm:
                for lnum, line in enumerate(ifstrm, 1):
                    # Read header
                    if datobj is None:
                        if line[0] == '!':
                            if ver is None and line[1:13] == 'gaf-version:':
                                ver = line[13:].strip()
                            hdrobj.chkaddhdr(line)
                        else:
                            # First non-header line: the header is complete
                            self.hdr = hdrobj.get_hdr()
                            if hdr_only:
                                return nts
                            datobj = GafData(ver, **self.kws)
                    # Read data ('!' lines after the header are skipped)
                    if datobj is not None and line[0] != '!':
                        ntgaf = datobj.get_ntgaf(line, lnum)
                        if ntgaf is not None:
                            nts.append(ntgaf)
                        else:
                            datobj.ignored.append((lnum, line))
            # A file holding only header lines never reaches the branch above
            # that stores the header, so store it here.
            if datobj is None:
                self.hdr = hdrobj.get_hdr()
        except Exception as inst:
            import traceback
            traceback.print_exc()
            sys.stderr.write("\n **FATAL: {MSG}\n\n".format(MSG=str(inst)))
            sys.stderr.write("**FATAL: {FIN}[{LNUM}]:\n{L}".format(FIN=fin_gaf, L=line, LNUM=lnum))
            if datobj is not None:
                datobj.prt_line_detail(sys.stdout, line)
            sys.exit(1)
        self.datobj = datobj
        return nts

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary.

        Counts each association under its evidence code, or under
        'NOT <code>' when 'NOT' is among its qualifiers, and hands the counts
        to self.evobj.prt_ev_cnts.
        """
        ctr = cx.Counter()
        for ntgaf in self.associations:
            evidence_code = ntgaf.Evidence_Code
            # The two cases are exhaustive, so no third branch is needed
            if 'NOT' not in ntgaf.Qualifier:
                ctr[evidence_code] += 1
            else:
                ctr["NOT {EV}".format(EV=evidence_code)] += 1
        self.evobj.prt_ev_cnts(ctr, prt)
class GafReader(object):
    """Read a Gene Annotation File (GAF) into a list of namedtuples.

    Attributes set by __init__:
        filename: GAF file name given by the caller (may be None).
        evobj: EvidenceCodes instance used to sort records and print counts.
        hdr: value returned by GafHdr.get_hdr(), or None if no file was read.
        associations: records returned by read_gaf(), or [] if filename is None.
    """

    gafhdr = [
        # Field name      Col  Req?      Cardinality     Example
        # --------------  ---  --------  --------------  -----------------
        'DB',             #  0 required  1               UniProtKB
        'DB_ID',          #  1 required  1               P12345
        'DB_Symbol',      #  2 required  1               PHO3
        'Qualifier',      #  3 optional  0 or greater    NOT
        'GO_ID',          #  4 required  1               GO:0003993
        'DB_Reference',   #  5 required  1 or greater    PMID:2676709
        'Evidence_Code',  #  6 required  1               IMP
        'With_From',      #  7 optional  0 or greater    GO:0000346
        'Aspect',         #  8 required  1               F
        'DB_Name',        #  9 optional  0 or 1          Toll-like receptor 4
        'DB_Synonym',     # 10 optional  0 or greater    hToll|Tollbooth
        'DB_Type',        # 11 required  1               protein
        'Taxon',          # 12 required  1 or 2          taxon:9606
        'Date',           # 13 required  1               20090118
        'Assigned_By',    # 14 required  1               SGD
    ]

    gafhdr2 = [
        # Field name             Col  Required  Cardinality   Example
        # ---------------------  ---  --------  ------------  -------------------
        'Annotation_Extension',  # 15 optional  0 or greater  part_of(CL:0000576)
        'Gene_Product_Form_ID',  # 16 optional  0 or 1        UniProtKB:P12345-2
    ]

    # Field names of a record, keyed by the value of the gaf-version header
    gaf_columns = {
        "2.1": gafhdr + gafhdr2,  # !gaf-version: 2.1
        "2.0": gafhdr + gafhdr2,  # !gaf-version: 2.0
        "1.0": gafhdr,            # !gaf-version: 1.0
    }

    # Expected numbers of columns for various versions
    gaf_numcol = {"2.1": 17, "2.0": 17, "1.0": 15}

    # Expected values for a Qualifier
    exp_qualifiers = set(
        ['NOT', 'contributes_to', 'Contributes_to', 'colocalizes_with'])

    def __init__(self, filename=None, hdr_only=False, prt=sys.stdout):
        """Store the file name and, if one is given, read the GAF file.

        Args:
            filename: GAF file to read, or None to create an empty reader.
            hdr_only: passed to read_gaf; if true only the header is read.
            prt: stream for the read summary, or None to suppress it.
        """
        self.filename = filename
        self.evobj = EvidenceCodes()
        # Initialize associations and header information
        self.hdr = None
        self.associations = self.read_gaf(filename, hdr_only, prt) if filename is not None else []

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary.

        Counts each association under its evidence code, or under
        'NOT <code>' when 'NOT' is among its qualifiers, and hands the counts
        to self.evobj.prt_ev_cnts.
        """
        ctr = cx.Counter()
        for ntgaf in self.associations:
            evidence_code = ntgaf.Evidence_Code
            # The two cases are exhaustive, so no third branch is needed
            if 'NOT' not in ntgaf.Qualifier:
                ctr[evidence_code] += 1
            else:
                ctr["NOT {EV}".format(EV=evidence_code)] += 1
        self.evobj.prt_ev_cnts(ctr, prt)

    def _get_ntgaf(self, ntgafobj, flds, ver):
        """Convert the string fields of one GAF line into a record.

        Args:
            ntgafobj: namedtuple class whose fields match gaf_columns[ver].
            flds: list of column strings for one line (see _split_line).
            ver: GAF version string; a leading '2' adds columns 15 and 16.

        Returns:
            An ntgafobj instance. Multi-valued columns become sets, except
            Taxon, which becomes a list of ints.

        Raises:
            AssertionError: if a cardinality or qualifier check fails.
        """
        # Cardinality. These flags are the set_list_ft argument of
        # _rd_fld_vals: False yields a set, True yields a list.
        is_set = False
        is_list = True
        qualifiers = self._rd_fld_vals("Qualifier", flds[3], is_set)
        db_reference = self._rd_fld_vals("DB_Reference", flds[5], is_set, 1)
        with_from = self._rd_fld_vals("With_From", flds[7], is_set)
        db_name = self._rd_fld_vals("DB_Name", flds[9], is_set, 0, 1)
        db_synonym = self._rd_fld_vals("DB_Synonym", flds[10], is_set)
        taxons = self._rd_fld_vals("Taxon", flds[12], is_list, 1, 2)
        self._chk_qty_eq_1(flds, [0, 1, 2, 4, 6, 8, 11, 13, 14])
        # Additional Formatting
        taxons = self._do_taxons(taxons)
        self._chk_qualifier(qualifiers)
        # Create list of values
        gafvals = [
            flds[0],       # 0 DB
            flds[1],       # 1 DB_ID
            flds[2],       # 2 DB_Symbol
            qualifiers,    # 3 Qualifier
            flds[4],       # 4 GO_ID
            db_reference,  # 5 DB_Reference
            flds[6],       # 6 Evidence_Code
            with_from,     # 7 With_From
            flds[8],       # 8 Aspect
            db_name,       # 9 DB_Name
            db_synonym,    # 10 DB_Synonym
            flds[11],      # 11 DB_Type
            taxons,        # 12 Taxon
            flds[13],      # 13 Date (was wrongly read from column 12)
            flds[14],      # 14 Assigned_By (was wrongly read from column 13)
        ]
        # Version 2.x has these additional fields not found in v1.0
        if ver[0] == '2':
            gafvals += [
                self._rd_fld_vals("Annotation_Extension", flds[15], is_set),
                self._rd_fld_vals("Gene_Product_Form_ID", flds[16], is_set)
            ]
        return ntgafobj._make(gafvals)

    def _rd_fld_vals(self, name, val, set_list_ft=True, qty_min=0, qty_max=None):
        """Further split a GAF value within a single field.

        Args:
            name: field name, used only in assertion messages.
            val: raw column string; entries are separated by '|'.
            set_list_ft: True to return a list, False to return a set.
            qty_min: minimum number of entries.
            qty_max: maximum number of entries, or None for no limit.

        Returns:
            The entries as a list or a set; empty if val is empty and
            qty_min is 0.

        Raises:
            AssertionError: if the number of entries is out of range.
        """
        if not val and qty_min == 0:
            return [] if set_list_ft else set()
        vals = val.split('|')  # Use a pipe to separate entries
        num_vals = len(vals)
        assert num_vals >= qty_min, \
            "FIELD({F}): MIN QUANTITY({Q}) WASN'T MET: {V} in {GAF}".format(
                F=name, Q=qty_min, V=vals, GAF=self.filename)
        if qty_max is not None:
            assert num_vals <= qty_max, \
                "FIELD({F}): MAX QUANTITY({Q}) EXCEEDED: {V} in {GAF}".format(
                    F=name, Q=qty_max, V=vals, GAF=self.filename)
        return vals if set_list_ft else set(vals)

    def read_gaf(self, fin_gaf, hdr_only, prt):
        """Read a GAF file and return its records sorted by evidence code.

        The file is opened with nopen, which is defined outside this class;
        the original author notes that HTTP addresses and gzipped/bzipped
        files are accepted.

        Args:
            fin_gaf: GAF file to read.
            hdr_only: if true, stop at the first non-header line and return an
                empty list; only self.hdr is filled in.
            prt: stream for the one-line read summary, or None to suppress it.

        Returns:
            The records ordered by self.evobj.sort_nts(ga_lst, 'Evidence_Code').

        Raises:
            KeyError: if the gaf-version header is missing or not a key of
                gaf_columns.
            AssertionError: if a data line fails a format check.
        """
        ga_lst = []
        ver = None
        ntgafobj = None
        exp_numcol = None
        hdrobj = GafHdr()
        ifstrm = nopen(fin_gaf)
        for line in ifstrm:
            # Read header
            if ntgafobj is None:
                if line[0] == '!':
                    if line[1:13] == 'gaf-version:':
                        ver = line[13:].strip()
                    hdrobj.chkaddhdr(line)
                else:
                    # First non-header line: the header is complete
                    self.hdr = hdrobj.get_hdr()
                    if hdr_only:
                        return ga_lst
                    ntgafobj = cx.namedtuple("ntgafobj", " ".join(self.gaf_columns[ver]))
                    exp_numcol = self.gaf_numcol[ver]
            # Read data. A '!' comment line that follows the header is
            # skipped rather than parsed as a (malformed) data line.
            if ntgafobj is not None and line[0] != '!':
                flds = self._split_line(line, exp_numcol)
                ntgaf = self._get_ntgaf(ntgafobj, flds, ver)
                ga_lst.append(ntgaf)
        # A file holding only header lines never reaches the branch above
        # that stores the header, so store it here.
        if ntgafobj is None:
            self.hdr = hdrobj.get_hdr()
        # GAF file has been read
        if prt is not None:
            readmsg = " READ {N:,} associations: {FIN}\n"
            prt.write(readmsg.format(N=len(ga_lst), FIN=fin_gaf))
        return self.evobj.sort_nts(ga_lst, 'Evidence_Code')

    @staticmethod
    def _split_line(line, exp_numcol):
        """Split a tab-separated line into exactly exp_numcol field values.

        Trailing carriage-return/newline characters are removed first.

        Raises:
            AssertionError: if the number of fields is not exp_numcol.
        """
        line = line.rstrip('\r\n')
        flds = line.split('\t')
        assert len(flds) == exp_numcol, "UNEXPECTED NUMBER OF COLUMNS"
        return flds

    def _chk_qualifier(self, qualifiers):
        """Check that qualifiers are expected values.

        Raises:
            AssertionError: if a qualifier is not in exp_qualifiers.
        """
        # http://geneontology.org/page/go-annotation-conventions#qual
        for qual in qualifiers:
            assert qual in self.exp_qualifiers, "UNEXPECTED QUALIFIER({Q}) IN {GAF}".format(
                Q=qual, GAF=self.filename)

    @staticmethod
    def _chk_qty_eq_1(flds, col_lst):
        """Check that each required single-valued column is non-empty.

        Raises:
            AssertionError: if flds[col] is empty for a col in col_lst.
        """
        for col in col_lst:
            assert flds[
                col], "UNEXPECTED REQUIRED VALUE({V}) AT INDEX({R})".format(
                    V=flds[col], R=col)

    @staticmethod
    def _do_taxons(taxons):
        """Convert one or two 'taxon:<number>' strings to a list of ints.

        Raises:
            AssertionError: if the list does not hold 1 or 2 entries.
            ValueError: if the text after the 6-character prefix is not an int.
        """
        taxons = [int(v[6:]) for v in taxons]  # strip "taxon:"
        num_taxons = len(taxons)
        assert num_taxons == 1 or num_taxons == 2
        return taxons
class GafReader(object):
    """Read a Gene Annotation File (GAF) into a list of namedtuples.

    Attributes set by __init__:
        filename: GAF file name given by the caller (may be None).
        log: stream that receives the one-line read summary.
        evobj: EvidenceCodes instance used to sort records and print counts.
        associations: records returned by read_gaf(), or [] if filename is None.
    """

    gafhdr = [
        # Field name      Col  Req?      Cardinality     Example
        # --------------  ---  --------  --------------  -----------------
        "DB",             #  0 required  1               UniProtKB
        "DB_ID",          #  1 required  1               P12345
        "DB_Symbol",      #  2 required  1               PHO3
        "Qualifier",      #  3 optional  0 or greater    NOT
        "GO_ID",          #  4 required  1               GO:0003993
        "DB_Reference",   #  5 required  1 or greater    PMID:2676709
        "Evidence_Code",  #  6 required  1               IMP
        "With_From",      #  7 optional  0 or greater    GO:0000346
        "Aspect",         #  8 required  1               F
        "DB_Name",        #  9 optional  0 or 1          Toll-like receptor 4
        "DB_Synonym",     # 10 optional  0 or greater    hToll|Tollbooth
        "DB_Type",        # 11 required  1               protein
        "Taxon",          # 12 required  1 or 2          taxon:9606
        "Date",           # 13 required  1               20090118
        "Assigned_By",    # 14 required  1               SGD
    ]

    gafhdr2 = [
        # Field name             Col  Required  Cardinality   Example
        # ---------------------  ---  --------  ------------  -------------------
        "Annotation_Extension",  # 15 optional  0 or greater  part_of(CL:0000576)
        "Gene_Product_Form_ID",  # 16 optional  0 or 1        UniProtKB:P12345-2
    ]

    # Field names of a record, keyed by the value of the gaf-version header
    gaf_columns = {
        "2.1": gafhdr + gafhdr2,  # !gaf-version: 2.1
        "2.0": gafhdr + gafhdr2,  # !gaf-version: 2.0
        "1.0": gafhdr,            # !gaf-version: 1.0
    }

    # Expected numbers of columns for various versions
    gaf_numcol = {"2.1": 17, "2.0": 17, "1.0": 15}

    # Expected values for a Qualifier
    exp_qualifiers = set(["NOT", "contributes_to", "colocalizes_with"])

    def __init__(self, filename=None, log=sys.stdout):
        """Store the settings and, if a filename is given, read the GAF file.

        Args:
            filename: GAF file to read, or None to create an empty reader.
            log: stream for the read summary, or None to suppress it.
        """
        self.filename = filename
        self.log = log
        self.evobj = EvidenceCodes()
        self.associations = self.read_gaf(filename) if filename is not None else []

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary.

        Counts each association under its evidence code, or under
        'NOT <code>' when 'NOT' is among its qualifiers, and hands the counts
        to self.evobj.prt_ev_cnts.
        """
        ctr = cx.Counter()
        for ntgaf in self.associations:
            evidence_code = ntgaf.Evidence_Code
            # The two cases are exhaustive, so no third branch is needed
            if "NOT" not in ntgaf.Qualifier:
                ctr[evidence_code] += 1
            else:
                ctr["NOT {EV}".format(EV=evidence_code)] += 1
        self.evobj.prt_ev_cnts(ctr, prt)

    def _get_ntgaf(self, ntgafobj, flds, ver):
        """Convert the string fields of one GAF line into a record.

        Args:
            ntgafobj: namedtuple class whose fields match gaf_columns[ver].
            flds: list of column strings for one line (see _split_line).
            ver: GAF version string; a leading '2' adds columns 15 and 16.

        Returns:
            An ntgafobj instance. Multi-valued columns become sets, except
            Taxon, which becomes a list of ints.

        Raises:
            AssertionError: if a cardinality or qualifier check fails.
        """
        # Cardinality. These flags are the set_list_ft argument of
        # _rd_fld_vals: False yields a set, True yields a list.
        is_set = False
        is_list = True
        qualifiers = self._rd_fld_vals("Qualifier", flds[3], is_set)
        db_reference = self._rd_fld_vals("DB_Reference", flds[5], is_set, 1)
        with_from = self._rd_fld_vals("With_From", flds[7], is_set)
        db_name = self._rd_fld_vals("DB_Name", flds[9], is_set, 0, 1)
        db_synonym = self._rd_fld_vals("DB_Synonym", flds[10], is_set)
        taxons = self._rd_fld_vals("Taxon", flds[12], is_list, 1, 2)
        self._chk_qty_eq_1(flds, [0, 1, 2, 4, 6, 8, 11, 13, 14])
        # Additional Formatting
        taxons = self._do_taxons(taxons)
        self._chk_qualifier(qualifiers)
        # Create list of values
        gafvals = [
            flds[0],       # 0 DB
            flds[1],       # 1 DB_ID
            flds[2],       # 2 DB_Symbol
            qualifiers,    # 3 Qualifier
            flds[4],       # 4 GO_ID
            db_reference,  # 5 DB_Reference
            flds[6],       # 6 Evidence_Code
            with_from,     # 7 With_From
            flds[8],       # 8 Aspect
            db_name,       # 9 DB_Name
            db_synonym,    # 10 DB_Synonym
            flds[11],      # 11 DB_Type
            taxons,        # 12 Taxon
            flds[13],      # 13 Date (was wrongly read from column 12)
            flds[14],      # 14 Assigned_By (was wrongly read from column 13)
        ]
        # Version 2.x has these additional fields not found in v1.0
        if ver[0] == "2":
            gafvals += [
                self._rd_fld_vals("Annotation_Extension", flds[15], is_set),
                self._rd_fld_vals("Gene_Product_Form_ID", flds[16], is_set),
            ]
        return ntgafobj._make(gafvals)

    @staticmethod
    def _rd_fld_vals(name, val, set_list_ft=True, qty_min=0, qty_max=None):
        """Further split a GAF value within a single field.

        Args:
            name: field name, used only in assertion messages.
            val: raw column string; entries are separated by '|'.
            set_list_ft: True to return a list, False to return a set.
            qty_min: minimum number of entries.
            qty_max: maximum number of entries, or None for no limit.

        Returns:
            The entries as a list or a set; empty if val is empty and
            qty_min is 0.

        Raises:
            AssertionError: if the number of entries is out of range.
        """
        if not val and qty_min == 0:
            return [] if set_list_ft else set()
        vals = val.split("|")  # Use a pipe to separate entries
        num_vals = len(vals)
        assert num_vals >= qty_min, "FLD({F}): MIN QUANTITY({Q}) NOT MET: {V}".format(F=name, Q=qty_min, V=vals)
        if qty_max is not None:
            assert num_vals <= qty_max, "FLD({F}): MAX QUANTITY({Q}) EXCEEDED: {V}".format(F=name, Q=qty_max, V=vals)
        return vals if set_list_ft else set(vals)

    def read_gaf(self, fin_gaf):
        """Read a GAF file and return its records sorted by evidence code.

        The file is opened with nopen, which is defined outside this class;
        the original author notes that HTTP addresses and gzipped/bzipped
        files are accepted.

        Lines are ignored until a line starting with '!gaf-version:' is seen;
        after that, every line not starting with '!' is parsed as a record.

        Args:
            fin_gaf: GAF file to read.

        Returns:
            The records ordered by self.evobj.sort_nts(ga_lst, 'Evidence_Code').

        Raises:
            KeyError: if the gaf-version value is not a key of gaf_columns.
            AssertionError: if a data line fails a format check.
        """
        ga_lst = []
        ifstrm = nopen(fin_gaf)
        ver = None
        ntgafobj = None
        exp_numcol = None
        for line in ifstrm:
            if ntgafobj is not None and not line.startswith("!"):
                flds = self._split_line(line, exp_numcol)
                ntgaf = self._get_ntgaf(ntgafobj, flds, ver)
                ga_lst.append(ntgaf)
            elif ntgafobj is None and line.startswith("!gaf-version:"):
                ver = line[13:].strip()
                ntgafobj = cx.namedtuple("ntgafobj", " ".join(self.gaf_columns[ver]))
                exp_numcol = self.gaf_numcol[ver]
        # A caller may pass log=None to silence the summary
        if self.log is not None:
            self.log.write(" READ {N:,} associations: {FIN}\n".format(N=len(ga_lst), FIN=fin_gaf))
        ga_lst = self.evobj.sort_nts(ga_lst, "Evidence_Code")
        return ga_lst

    @staticmethod
    def _split_line(line, exp_numcol):
        """Split a tab-separated line into exactly exp_numcol field values.

        Trailing carriage-return/newline characters are removed first.

        Raises:
            AssertionError: if the number of fields is not exp_numcol.
        """
        line = line.rstrip("\r\n")
        flds = line.split("\t")
        assert len(flds) == exp_numcol, "UNEXPECTED NUMBER OF COLUMNS"
        return flds

    def _chk_qualifier(self, qualifiers):
        """Check that qualifiers are expected values.

        Raises:
            AssertionError: if a qualifier is not in exp_qualifiers.
        """
        # http://geneontology.org/page/go-annotation-conventions#qual
        for qual in qualifiers:
            assert qual in self.exp_qualifiers, "UNEXPECTED QUALIFIER({Q})".format(Q=qual)

    @staticmethod
    def _chk_qty_eq_1(flds, col_lst):
        """Check that each required single-valued column is non-empty.

        Raises:
            AssertionError: if flds[col] is empty for a col in col_lst.
        """
        for col in col_lst:
            assert flds[col], "UNEXPECTED REQUIRED VALUE({V}) AT INDEX({R})".format(V=flds[col], R=col)

    @staticmethod
    def _do_taxons(taxons):
        """Convert one or two 'taxon:<number>' strings to a list of ints.

        Raises:
            AssertionError: if the list does not hold 1 or 2 entries.
            ValueError: if the text after the 6-character prefix is not an int.
        """
        taxons = [int(v[6:]) for v in taxons]  # strip "taxon:"
        num_taxons = len(taxons)
        assert num_taxons == 1 or num_taxons == 2
        return taxons