class Parser(AbstractLoader):
    """
    A parser for UniProtKB text files.

    Implements the `AbstractParser._parse` method.
    """

    def _setup(self, stream: io.TextIOWrapper) -> int:
        lines = super(Parser, self)._setup(stream)
        self.db_key = None
        self.record = None
        self._id = ''
        self._length = None
        self._name_cat = None
        self._name_state = None
        self._skip_sequence = False
        # Set up a dispatcher pattern for parsing lines given the line
        # type, which is defined by the first two letters on the line:
        self._dispatcher = {
            "ID": self._parseID,
            "AC": self._parseAC,
            "DE": self._parseDE,
            "GN": self._parseGN,
            "OX": self._parseOX,
            "RX": self._parseRX,
            "DR": self._parseDR,
            "KW": self._parseKW,
            "SQ": self._parseSQ,
            "//": self._parseEND,
            # "DT": self._parseDT,
            "OS": skip, "OG": skip, "OC": skip, "OH": skip, "RN": skip,
            "RP": skip, "RC": skip, "RG": skip, "RA": skip, "RT": skip,
            "RL": skip, "CC": skip, "PE": skip, "FT": skip, "DT": skip,
        }
        # UniProt sometimes has species not (yet) in the NCBI Taxonomy;
        # To avoid issues, map these IDs to the "unknown" species ID;
        # However, to do this, all valid species IDs need to be known:
        self._species_ids = frozenset(
            i[0] for i in self.session.query(Species.id)
        )
        return lines

    def _cleanup(self, stream: io.TextIOWrapper) -> int:
        return super(Parser, self)._cleanup(stream)

    def _parse(self, line: str) -> int:
        if line and not self._skip_sequence:
            return self._dispatcher[line[0:2]](line)
        elif self._skip_sequence and line.startswith('//'):
            return self._parseEND(line)
        else:
            return 0

    ID_RE = re.compile(
        r'ID\s+'
        r'(?P<id>\w+)\s+'
        r'(?P<status>Reviewed|Unreviewed);\s+'
        r'(?P<length>\d+)\s+'
        r'AA\.'
    )

    def _parseID(self, line: str):
        mo = Parser.ID_RE.match(line)
        self._id = mo.group('id')
        self._length = int(mo.group('length'))
        return 0

    AC_RE = re.compile(r'\s+(?P<accession>[A-Z][0-9][A-Z0-9]{3}[0-9]);')

    def _parseAC(self, line: str):
        accessions = Parser.AC_RE.findall(line)

        # only once, even if there are multiple AC lines:
        if self.record is None:
            # ensure a species ID has to be set later:
            #noinspection PyTypeChecker
            self.record = ProteinRecord(-1, length=self._length)
            self.db_key = DBRef(Namespace.uniprot, accessions[0])

            if self._id:
                self.record.addSymbol(self._id)

            self.record.addDBRef(self.db_key)

        for acc in accessions:
            self.record.addString("accession", acc)

        return 0

    # No place to store record versions; Would this be useful?
    # DT_RE = re.compile(
    #     r'DT\s+\d{2}\-[A-Z]{3}\-\d{4}, entry version (?P<version>\d+)\s*\.'
    # )
    #
    # def _parseDT(self, line: str):
    #     mo = Parser.DT_RE.match(line)
    #
    #     if mo:
    #         self.record.version = mo.group('version')
    #
    #     return 0

    DE_RE = re.compile(
        r'DE\s+'
        r'(?:(?P<category>(?:Rec|Alt|Sub)Name|Flags|Contains|Includes):)?'
        r'(?:\s*(?P<subcategory>[^=]+)(?:=(?P<name>.+))?)?'
    )
    def _parseDE(self, line: str):
        mo = Parser.DE_RE.match(line)
        cat = mo.group('category')
        subcat = mo.group('subcategory')
        name = mo.group('name')

        if cat in ('Flags', 'Contains', 'Includes'):
            return 0
        elif cat:
            self._name_cat = cat

        assert subcat is not None and name is not None, line
        assert name[-1] == ';', name
        name = name[:-1]

        # remove backslashes from names ending with a backslash in TrEMBL
        while name.endswith('\\'):
            name = name[:-1]

        # swap the rather peculiar short and full name assignments:
        # treat the former as symbol and the latter as name
        if subcat == "Short" and len(name) > 16 and ' ' in name:
            subcat = "Full"

        if subcat == "Full" and len(name) < 6 and name.find(' ') == -1:
            subcat = "Short"

        if subcat == "Full":
            end = name.find(' ')

            if end == -1:
                end = len(name)

            if name[0].isupper() and name[1:end].islower():
                name = "{}{}".format(name[0].lower(), name[1:])

        if subcat == "Short" and name.startswith(
                "uncharacterized protein") or \
                name.startswith("putative ") or \
                name.startswith("probable ") or \
                name.startswith("similar to "):
            return 0

        # reorder names of the form "<noun>, <modifier>" to
        # "<modifier> <noun>" (e.g., "kinase, putative" -> "putative kinase")
        comma = name.rfind(', ')

        while comma != -1:
            name = "{} {}".format(name[comma + 2:], name[:comma])
            comma = name.rfind(', ')

        if subcat == "Short" and name.startswith(
                "uncharacterized protein") or \
                name.startswith("putative ") or \
                name.startswith("probable ") or \
                name.startswith("similar to "):
            return 0

        if self._name_cat == 'RecName':
            if subcat == 'Full' and not self.record.name:
                self.record.name = name
            elif subcat == 'Short' and not self.record.symbol:
                self.record.symbol = name
            elif subcat == 'EC' and not self.record.symbol:
                self.record.symbol = name

        if subcat == 'Full':
            self.record.addName(name)
        elif subcat == 'Short':
            self.record.addSymbol(name)
        elif subcat == 'EC':
            self.record.addKeyword("EC{}".format(name))
        elif subcat in ('Allergen', 'Biotech', 'CD_antigen', 'INN'):
            pass
        else:
            raise RuntimeError(
                'unknown DE subcategory field "{}"'.format(subcat)
            )

        return 0

    GN_RE = re.compile(r'\s+(?P<key>\w+)\s*=\s*(?P<value>[^;]+);')

    def _parseGN(self, line: str):
        if line == 'and':
            return 0

        for key, value in Parser.GN_RE.findall(line):
            if key == 'Name':
                if len(value) < 16 or ' ' not in value:
                    self.record.addSymbol(value)
                else:
                    self.record.addName(value)
            elif key == 'Synonyms':
                for s in value.split(','):
                    s = s.strip()

                    if len(s) < 16 or ' ' not in s:
                        self.record.addSymbol(s)
                    else:
                        self.record.addName(s)
            elif key in ('OrderedLocusNames', 'ORFNames'):
                for s in value.split(','):
                    self.record.addKeyword(s.strip())
            else:
                raise RuntimeError(
                    'unknown GN category field "{}"'.format(key)
                )

        return 0

    OX_RE = re.compile(r'OX\s+NCBI_TaxID\s*=\s*(?P<species>\d+);')

    def _parseOX(self, line: str):
        matched = Parser.OX_RE.match(line)

        if matched:
            species = int(matched.group('species'))

            # UniProt declares TaxIDs that sometimes don't (yet) exist...
            if species not in self._species_ids:
                logging.debug('unknown species ID=%d for %s (%s)',
                              species, self.db_key.accession, self._id)
                species = SpeciesIds.unidentified
            else:
                logging.debug('known species ID=%d for %s (%s)',
                              species, self.db_key.accession, self._id)

            self.record.species_id = species

        return 0

    RX_RE = re.compile(r'RX\s+.*?PubMed\s*=\s*(?P<pmid>\d+);?')

    def _parseRX(self, line: str):
        matched = Parser.RX_RE.match(line)

        if matched:
            self.record.pmids.add(int(matched.group('pmid')))

        return 0

    DR_RE = re.compile(
        r'DR\s+(?P<namespace>[\w/\-]+)\s*;\s+(?P<accessions>.*)'
    )

    def _parseDR(self, line: str):
        mo = Parser.DR_RE.match(line)
        namespace = None

        try:
            namespace = mo.group('namespace')

            # raise KeyError if unknown NSs are added:
            if TRANSLATE[namespace]:
                assert mo.group('accessions')[-1] == '.', \
                    mo.group('accessions')

                for db_ref in TRANSLATE[namespace]([
                    i.strip() for i in
                    mo.group('accessions')[:-1].split(';')
                ]):
                    self.record.addDBRef(db_ref)
        except KeyError:
            logging.info("unknown Namespace '%s'", namespace)
        except AttributeError:
            pass

        return 0

    KW_RE = re.compile(r'\s+(?P<keyword>[^;]+)(?:;|\.$)')

    def _parseKW(self, line: str):
        for kwd in Parser.KW_RE.findall(line):
            if kwd != 'Complete proteome':
                self.record.addKeyword(kwd)

        return 0

    SQ_RE = re.compile(
        r'SQ\s+SEQUENCE\s+'
        r'(?P<length>\d+)\s+AA;\s+'
        r'(?P<mass>\d+)\s+MW;\s+(?P<crc64>\w+)\s+CRC64;'
    )

    def _parseSQ(self, line: str):
        self.record.mass = int(Parser.SQ_RE.match(line).group('mass'))
        self._skip_sequence = True
        return 0

    #noinspection PyUnusedLocal
    def _parseEND(self, line: str):
        #noinspection PyTypeChecker
        self._loadRecord(self.db_key, self.record)
        self.db_key = None
        self.record = None
        self._id = ''
        self._length = None
        self._name_cat = None
        self._skip_sequence = False
        return 1
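

# The block below is an illustrative smoke test, not part of the loader: it
# only demonstrates what the class-level line regexes above are expected to
# match. The sample lines follow the UniProtKB flat-file layout (two-letter
# line codes); the accession, taxon, mass, and CRC64 values are format-valid
# examples chosen for this sketch, not data taken from any particular entry.
if __name__ == '__main__':
    assert Parser.ID_RE.match(
        "ID   TEST_HUMAN              Reviewed;         256 AA."
    )
    assert Parser.AC_RE.findall(
        "AC   Q6GZX4; A2BC19;"
    ) == ['Q6GZX4', 'A2BC19']
    assert Parser.OX_RE.match(
        "OX   NCBI_TaxID=9606;"
    ).group('species') == '9606'
    assert Parser.SQ_RE.match(
        "SQ   SEQUENCE   256 AA;  12345 MW;  0123456789ABCDEF CRC64;"
    ).group('mass') == '12345'
    print("line regex smoke test passed")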