def __init__(self, message, current):
    """Initializes an Entry instance.

    :param message: value forwarded to the parent initializer as ``message``
    :param current: Current instance whose proteins, references,
        information and comment are copied into the new entry
    :raises ArgumentError: if current is not a Current instance
    """
    if not isinstance(current, Current):
        raise ArgumentError('Entry: expected type Current')

    # The parent initializer receives the message plus the protein and
    # reference data taken from the parsing state.
    parent_kwargs = {
        'message': message,
        'proteins': current.proteins,
        'references': current.references,
    }
    super().__init__(**parent_kwargs)

    self.information = current.information
    self.comment = current.comment
def __init__(self, organism, current):
    """Initializes a Protein instance.

    :param organism: value stored as the organism attribute
    :param current: Current instance supplying the identifiers (its
        proteins attribute), references, information and comment
    :raises ArgumentError: if current is not a Current instance
    """
    if not isinstance(current, Current):
        raise ArgumentError('Protein: expected type Current')

    # Record the counter value seen by this instance, then advance the
    # counter stored on the instance's class.
    self._index = self._counter
    owner = self.__class__
    owner._counter += 1

    self.organism = organism
    self.identifiers = current.proteins
    self.references = current.references
    self.information = current.information
    self.comment = current.comment
def _parse_protein(self, text):
    """Parses a PROTEIN (PR) entry from the BRENDA flat file.

    The information and comment parts are extracted first, then the protein
    number, the accessions and the references are removed from the text.
    Whatever text remains is handed to Protein as its first argument, and
    the resulting object is stored in the current EC number's proteins
    mapping under the protein number.

    :param text: text that represents a PROTEIN (PR) entry
    :raises ArgumentError: if the text contains no protein field
    """
    self._reset_parser()
    text = self.extract_information(text)
    text = self.extract_comment(text, enforce_structural_check=False)

    mobj = self._tags.protein.search(text)
    if not mobj:
        raise ArgumentError(
            'Protein reference missing: \'{}\' @ #{}'.format(text, self._current.line_number))
    protein_id = int(mobj.group(1))
    # Drop the matched protein field from the text.
    text = text[:mobj.start()] + text[mobj.end():]

    text, accessions = self._extract_accessions(text)
    # sorted() accepts any iterable, so the set needs no intermediate list.
    self._current.proteins = sorted(set(accessions))
    text, self._current.references = self._extract_numbers(text, self._tags.reference)

    self._current.ec_number.proteins[protein_id] = Protein(text.strip(), self._current)
def _extract_numbers(self, text, pattern): """Extracts and returns numbers from a text according to a given pattern, as well as the text resulting from removing the numbers. The pattern may be one defining proteins or references (the instance's _tags.protein or _tags.reference attributes). :param text: text that may contain numbers according to a pattern :param pattern: a re pattern :return: text resulting from number extraction """ if not isinstance(pattern, typing.Pattern): raise ArgumentError('Expected re.Pattern: {}'.format(pattern)) numbers = None mobj = pattern.search(text) if mobj: numbers = [int(num.group(0)) for num in self._tags.numbers.finditer(mobj.group(1))] text = text[:mobj.start()] + text[mobj.end():] return text.strip(), numbers
def _get_numbers_in_comment(self, comment, pattern): """Returns a list of numbers present in the given comment according to the specified pattern. :param comment: text that may contain numbers :param pattern: pattern describing the numbers :return: the list of numbers in comment """ if not isinstance(pattern, typing.Pattern): raise ArgumentError('Expected re.Pattern: {}'.format(pattern)) if comment is None or not comment.strip(): return None numbers = list() matches = [match.group() for match in pattern.finditer(comment)] for number in matches: _, values = self._extract_numbers(number, pattern) numbers.extend([v for v in values if v not in numbers]) return numbers
def extract_proteins(self, text):
    """Extracts and stores protein references and returns the text resulting
    from removing the proteins field.

    The proteins field format is defined by the instance's _tags.protein
    attribute. It is assumed that the proteins field occurs at the beginning
    of the text (since this method is used to extract protein fields from
    reaction-type entries). The protein references found are stored in the
    instance's _current.proteins attribute.

    :param text: text that may contain protein references (at the beginning)
    :return: stripped text resulting from the extraction of protein
        references
    :raises ArgumentError: if the text does not start with a proteins field
    """
    if text.startswith('#'):
        text = self._clean_extra_hash_characters(text)

    pobj = self._tags.protein.search(text)
    # The field must be present and must be the very first thing in the text.
    if not pobj or pobj.start() != 0:
        raise ArgumentError('Protein reference missing: \'{}\' @ #{}'
                            .format(text, self._current.line_number))

    _, self._current.proteins = self._extract_numbers(pobj.group(), self._tags.protein)
    if self._current.proteins is not None:
        # Only cut the field off once its numbers were actually extracted.
        text = text[pobj.end():]
    return text.strip()
def parse(self):
    """Parses multiple Enzyme sections.

    Lines are read from the instance's file handle. Blank lines and lines
    starting with '*' are ignored. A line consisting only of a section name
    (a key of the instance's _sections) starts a new section, a line
    starting with that section's short identifier starts a new entry, any
    other line continues the current entry, and '///' ends the description
    of one EC number. Completed entries are run through the parser selected
    for their section, and the parsed contents of non-redundant sections are
    stored in the current EC number's entries.

    :return: a plain dict built from the instance's enzymes attribute
    :raises ArgumentError: if a section has no usable short identifier
    """
    section_name = ''  # long section identifier, e.g. 'PROTEIN'
    section_contents = []  # contents of the section identified by section_name
    short_entry = ''  # two- or three- letter section identifier, e.g. 'PR' for 'PROTEIN'
    entry = []  # contents of an entry identified by short_entry
    parser = self._parse_generic_entry

    for line in self._file_handle:
        if self._current.line_number % 1000 == 0:
            self._progress.update(self._current.line_number)
        self._current.line_number += 1

        line = line.rstrip()
        if not line or line.startswith('*'):
            continue

        content = line.split(None, 1)
        if content[0] == 'ID' and has_ec_number(content[1]):
            self._parse_id(content[1])
        elif content[0] in self._sections:  # handle new section
            if content[1:]:  # not a new section, actually
                entry.append(line.lstrip())
                continue
            # Finish handling previous section
            if entry:
                section_contents.append(parser(' '.join(entry)))
            if section_contents and not self.is_section_redundant(section_name):
                if self._current.ec_number is None:  # skip to next EC due to invalid ID
                    self._skip = True
                    continue
                self._current.ec_number.entries[section_name] = section_contents
            # Prepare to process current section
            section_contents = []
            entry = []
            section_name = content[0]
            parser = self._determine_parser_from_section_name(section_name)
            short_entry = self._sections.get(section_name, False)
            if not short_entry:
                raise ArgumentError('Unrecognised entry: \'{}\' @ #{}'
                                    .format(line, self._current.line_number))
        elif content[0] == short_entry:  # handle previous and current entries
            if entry:
                section_contents.append(parser(' '.join(entry)))
            entry = content[1:]
        elif content[0] == '///':  # handle end of EC number description
            if self._skip:
                self._skip = False
            else:
                # end one enzyme entry
                if entry:
                    section_contents.append(parser(' '.join(entry)))
                if section_contents and not self.is_section_redundant(section_name):
                    self._current.ec_number.entries[section_name] = section_contents
                self._current.ec_number = None
            # Start the next EC number from a clean state. Without this the
            # last entry of the finished record would be parsed a second
            # time at the next record's first section header, and stale
            # section contents could be stored under the next EC number.
            section_name = ''
            section_contents = []
            short_entry = ''
            entry = []
            parser = self._parse_generic_entry
        else:
            entry.append(line.lstrip())

    # convert to normal dictionary again
    res = dict(self.enzymes)
    self._progress.close()
    return res