Пример #1
0
 def __init__(self, message, current):
     """Builds an Entry from a message and the parser's current state.

     :param message: forwarded to the parent initializer as ``message``
     :param current: Current instance providing proteins, references,
         information and comment
     :raises ArgumentError: if current is not a Current instance
     """
     if not isinstance(current, Current):
         raise ArgumentError('Entry: expected type Current')

     # Fields handled by the parent initializer.
     parent_fields = dict(
         message=message,
         proteins=current.proteins,
         references=current.references,
     )
     super().__init__(**parent_fields)

     # Fields stored directly on this instance.
     self.information = current.information
     self.comment = current.comment
Пример #2
0
 def __init__(self, organism, current):
     """Builds a Protein from an organism and the parser's current state.

     :param organism: value stored as the ``organism`` attribute
     :param current: Current instance providing proteins, references,
         information and comment
     :raises ArgumentError: if current is not a Current instance
     """
     if not isinstance(current, Current):
         raise ArgumentError('Protein: expected type Current')

     # Record the counter value for this instance, then advance the
     # counter stored on the instance's class.
     owner = self.__class__
     self._index = self._counter
     owner._counter += 1

     self.organism = organism

     # (attribute on self, attribute on current) pairs, copied in order.
     copied = (
         ('identifiers', 'proteins'),
         ('references', 'references'),
         ('information', 'information'),
         ('comment', 'comment'),
     )
     for target, source in copied:
         setattr(self, target, getattr(current, source))
Пример #3
0
    def _parse_protein(self, text):
        """Parses a PROTEIN (PR) entry from the BRENDA flat file.

        The information and comment fields are extracted first; the protein
        number, accessions and reference numbers are then removed from the
        text, and whatever remains (stripped) is passed to Protein as its
        first argument. The new Protein is stored in the current EC number's
        ``proteins`` mapping under the protein number.

        :param text: text that represents a PROTEIN (PR) entry
        :raises ArgumentError: if the text holds no protein number
        """
        self._reset_parser()
        remainder = self.extract_comment(
            self.extract_information(text), enforce_structural_check=False)

        hit = self._tags.protein.search(remainder)
        if hit is None:
            raise ArgumentError(
                "Protein reference missing: '{}' @ #{}".format(
                    remainder, self._current.line_number))

        protein_id = int(hit.group(1))
        # Cut the protein number field out of the text.
        remainder = remainder[:hit.start()] + remainder[hit.end():]

        remainder, accessions = self._extract_accessions(remainder)
        # Unique accessions, in sorted order.
        self._current.proteins = sorted(set(accessions))

        remainder, reference_numbers = self._extract_numbers(
            remainder, self._tags.reference)
        self._current.references = reference_numbers

        organism = remainder.strip()
        self._current.ec_number.proteins[protein_id] = Protein(organism, self._current)
Пример #4
0
    def _extract_numbers(self, text, pattern):
        """Pulls the numbers matched by a pattern out of a text.

        Only the first occurrence of the pattern is considered. The numbers
        are read from the pattern's first group using the instance's
        _tags.numbers pattern, and the whole occurrence is removed from the
        text.

        The pattern may be one defining proteins or references (the
        instance's _tags.protein or _tags.reference attributes).

        :param text: text that may contain numbers according to a pattern
        :param pattern: a compiled re pattern
        :return: tuple of (stripped text without the matched field, list of
            ints), or (stripped text, None) if the pattern does not occur
        :raises ArgumentError: if pattern is not a compiled pattern
        """
        if not isinstance(pattern, typing.Pattern):
            raise ArgumentError('Expected re.Pattern: {}'.format(pattern))

        found = pattern.search(text)
        if found is None:
            return text.strip(), None

        digit_runs = self._tags.numbers.finditer(found.group(1))
        numbers = [int(run.group(0)) for run in digit_runs]
        remainder = text[:found.start()] + text[found.end():]
        return remainder.strip(), numbers
Пример #5
0
    def _get_numbers_in_comment(self, comment, pattern):
        """Returns the distinct numbers present in the given comment
        according to the specified pattern.

        Every occurrence of the pattern in the comment is handed to
        _extract_numbers; the resulting numbers are collected in order of
        first appearance, without duplicates.

        :param comment: text that may contain numbers
        :param pattern: compiled re pattern describing the numbers
        :return: the list of unique numbers in comment (empty if the pattern
            never occurs), or None if comment is None or blank
        :raises ArgumentError: if pattern is not a compiled pattern
        """
        if not isinstance(pattern, typing.Pattern):
            raise ArgumentError('Expected re.Pattern: {}'.format(pattern))

        if comment is None or not comment.strip():
            return None

        numbers = []
        seen = set()
        for match in pattern.finditer(comment):
            _, values = self._extract_numbers(match.group(), pattern)
            # _extract_numbers yields None instead of a list when it finds
            # nothing, so fall back to an empty sequence.
            for value in values or ():
                # Track seen values as we go so that repeats inside a single
                # occurrence are dropped as well.
                if value not in seen:
                    seen.add(value)
                    numbers.append(value)

        return numbers
Пример #6
0
    def extract_proteins(self, text):
        """Extracts and stores protein references and returns the text resulting
        from removing the proteins field.

        The proteins field format is defined by the instance's _tags.protein
        attribute. It is assumed that the proteins field occurs at the beginning
        of the text (since this method is used to extract protein fields from
        reaction-type entries). If a proteins field is present in the text, the
        protein references are stored in the instance's _current.proteins
        attribute.

        Text that does not start with '#' is returned stripped and
        _current.proteins is left untouched.

        :param text: text that may contain protein references (at the beginning)
        :return: text resulting from the extraction of protein references
        :raises ArgumentError: if the text starts with '#' but no proteins
            field is found at its very beginning
        """
        if not text.startswith('#'):
            return text.strip()

        text = self._clean_extra_hash_characters(text)
        pobj = self._tags.protein.search(text)
        if not pobj or pobj.start() != 0:
            # Same '@ #<line number>' layout as the message in _parse_protein
            # (the stray '%' that used to precede the number is gone).
            raise ArgumentError('Protein reference missing: \'{}\' @ #{}'
                                .format(text, self._current.line_number))

        _, self._current.proteins = self._extract_numbers(pobj.group(), self._tags.protein)
        # Only drop the field from the text if numbers were actually extracted.
        if self._current.proteins is not None:
            text = text[pobj.end():]
        return text.strip()
Пример #7
0
    def parse(self):
        """Parses multiple Enzyme sections.

        The file handle is read line by line. Empty lines and lines starting
        with '*' are skipped. An 'ID' line carrying an EC number is handed to
        _parse_id; a line consisting solely of a known section name opens a
        new section; a line starting with the section's short identifier
        opens a new entry; '///' closes the current enzyme; any other line is
        a continuation of the current entry.

        All per-section state is reset when an enzyme is closed, so that
        nothing collected for one enzyme is carried over into the next.

        :return: dict built from the instance's ``enzymes`` mapping
        :raises ArgumentError: if a section has no short identifier
        """
        section_name = ''  # long section identifier, e.g. 'PROTEIN'
        section_contents = list()  # contents of the section identified by section_name
        short_entry = ''  # two- or three- letter section identifier, e.g. 'PR' for 'PROTEIN'
        entry = list()  # contents of an entry identified by short_entry
        parser = self._parse_generic_entry

        for line in self._file_handle:
            if self._current.line_number % 1000 == 0:
                self._progress.update(self._current.line_number)
            self._current.line_number += 1

            line = line.rstrip()
            if not line or line.startswith('*'):
                continue

            content = line.split(None, 1)
            # A bare 'ID' line has no second field; check before indexing it.
            if content[0] == 'ID' and len(content) > 1 and has_ec_number(content[1]):
                self._parse_id(content[1])
            elif content[0] in self._sections:  # handle new section
                if content[1:]:  # not a new section, actually
                    entry.append(line.lstrip())
                    continue
                # Finish handling previous section
                if entry:
                    section_contents.append(parser(' '.join(entry)))
                if section_contents and not self.is_section_redundant(section_name):
                    if self._current.ec_number is None:  # skip to next EC due to invalid ID
                        self._skip = True
                        continue
                    self._current.ec_number.entries[section_name] = section_contents

                # Prepare to process current section
                section_contents = list()
                entry = list()
                section_name = content[0]
                parser = self._determine_parser_from_section_name(section_name)
                short_entry = self._sections.get(section_name, False)
                if not short_entry:
                    raise ArgumentError('Unrecognised entry: \'{}\' @ #{}'
                                        .format(line, self._current.line_number))
            elif content[0] == short_entry:  # handle previous and current entries
                if entry:
                    section_contents.append(parser(' '.join(entry)))
                entry = content[1:]
            elif content[0] == '///':  # handle end of EC number description
                if self._skip:
                    self._skip = False
                else:
                    # end one enzyme entry
                    if entry:
                        section_contents.append(parser(' '.join(entry)))
                    if section_contents and not self.is_section_redundant(section_name):
                        self._current.ec_number.entries[section_name] = section_contents
                    self._current.ec_number = None

                # Start the next enzyme from a clean slate. Without this the
                # last entry would be parsed a second time (and appended to
                # the list already stored above) when the next enzyme's first
                # section begins.
                section_name = ''
                section_contents = list()
                short_entry = ''
                entry = list()
                parser = self._parse_generic_entry
            else:
                entry.append(line.lstrip())
        # convert to normal dictionary again
        res = dict(self.enzymes)
        self._progress.close()
        return res