def to_where_condition(self) -> WhereCondition:
        """
        Builds the WHERE-part of a SQL-query, together with its values, for fetching
        the peptides matching this mass/modification-combination list.

        If the list contains combinations, the conditions of all combinations are
        concatenated with "OR". If the list is empty, a plain partition/mass range
        condition for the precursor and its tolerances is returned instead.

        Returns
        -------
        WhereCondition
        """
        if not len(self):
            # No combinations: fall back to the precursor range itself.
            unmodified_range = PrecursorRange(
                self.__precursor, self.__lower_precursor_tolerance_ppm,
                self.__upper_precursor_tolerance_ppm)
            condition_parts = [
                "partition BETWEEN %s AND %s", "AND", "mass BETWEEN %s AND %s"
            ]
            condition_values = [
                Peptide.get_partition(unmodified_range.lower_limit),
                Peptide.get_partition(unmodified_range.upper_limit),
                unmodified_range.lower_limit, unmodified_range.upper_limit
            ]
            return WhereCondition(condition_parts, condition_values)

        # Start with an empty condition and attach each combination with OR.
        combined_condition = WhereCondition([], [])
        for combination in self:
            combination_condition = combination.to_where_condition()
            combined_condition.concatenate(combination_condition, "OR")
        return combined_condition
Пример #2
0
 def show(sequence: str):
     """
     Looks up a single peptide by its sequence and returns it as JSON.

     The optional query parameter `is_reviewed` is parsed as an integer and
     converted to a boolean. If given, the peptide is only returned when its
     metadata matches: `is_swiss_prot` for a truthy value, `is_trembl` for a falsy one.

     Parameters
     ----------
     sequence : str
         Peptide sequence, converted to upper case before the lookup.

     Returns
     -------
     Response
         The peptide as JSON, a 422 response if `is_reviewed` is not an integer,
         or a 404 response if no (matching) peptide was found.
     """
     is_reviewed = request.args.get("is_reviewed", None)
     if is_reviewed is not None:
         try:
             is_reviewed = bool(int(is_reviewed))
         except ValueError:
             # A non-numeric value is a client error, not a server error.
             return jsonify(
                 {"errors": {
                     "is_reviewed": ["must be an integer"]
                 }}), 422
     sequence = sequence.upper()
     database_connection = get_database_connection()
     with database_connection.cursor() as database_cursor:
         # Temporary peptide, only used to derive partition and mass for the lookup.
         peptide = Peptide(sequence, 0, None)
         peptide = Peptide.select(
             database_cursor,
             WhereCondition([
                 "partition = %s", "AND", "mass = %s", "AND",
                 "sequence = %s"
             ], [peptide.partition, peptide.mass, peptide.sequence]),
             include_metadata=True)
         if peptide is None:
             return jsonify({"errors": {"sequence": ["not found"]}}), 404
         # Return peptide if is_reviewed is not requested (None),
         # or is_reviewed is requested and True and metadata is_swiss_prot is also True
         # or is_reviewed is requested and False and metadata is_trembl is True
         if is_reviewed is None \
             or is_reviewed and peptide.metadata.is_swiss_prot \
             or not is_reviewed and peptide.metadata.is_trembl:
             return Response(peptide.to_json(),
                             content_type="application/json")
         return jsonify({}), 404
Пример #3
0
    def test_with_modifications(self):  # lower hit
        """
        Inserts all peptides of PEPTIDES_FOR_MODIFIED_SEARCH, queries them with the
        where condition built from the modifications CSV file and checks that only
        the peptides listed under 'matching' are returned.
        """
        # Fill the database with every peptide group of the fixture
        with self.database_connection:
            with self.database_connection.cursor() as database_cursor:
                for sequences in PEPTIDES_FOR_MODIFIED_SEARCH.values():
                    new_peptides = [
                        Peptide(sequence, 0) for sequence in sequences
                    ]
                    Peptide.bulk_insert(database_cursor, new_peptides)

        modification_collection = ModificationCollection.read_from_csv_file(
            pathlib.Path("./test_files/modifications.csv"))

        precursor_in_dalton = thomson_to_dalton(MASS_TO_CHARGE_RATIO, CHARGE)
        precursor = mass_to_int(precursor_in_dalton)

        with self.database_connection:
            with self.database_connection.cursor() as database_cursor:
                combinations = ModificationCombinationList(
                    modification_collection, precursor, PRECURSOR_TOLERANCE,
                    PRECURSOR_TOLERANCE, VARIABLE_MODIFICATION_MAXIMUM)
                peptides = Peptide.select(database_cursor,
                                          combinations.to_where_condition(),
                                          fetchall=True)

                # Exactly the matching peptides must be found, nothing else
                expected_sequences = PEPTIDES_FOR_MODIFIED_SEARCH['matching']
                self.assertEqual(len(peptides), len(expected_sequences))
                for found_peptide in peptides:
                    self.assertIn(found_peptide.sequence, expected_sequences)
Пример #4
0
    def peptides(accession: str):
        """
        Returns all peptides associated with the given protein accession as a
        streamed JSON document of the form {"peptides": [...]}, sorted by mass.

        Parameters
        ----------
        accession : str
            Protein accession used to look up the associated peptides.

        Returns
        -------
        Response
            Flask response with content type application/json
        """
        # Peptides are selected by their (partition, mass, sequence) key
        # through the protein/peptide association table.
        association_condition = (
            "(peps.partition, peps.mass, peps.sequence) IN ("
            "SELECT partition, peptide_mass, peptide_sequence "
            f"FROM {ProteinPeptideAssociation.TABLE_NAME} as ppa "
            "WHERE ppa.protein_accession = %s)"
        )

        database_connection = get_database_connection()
        with database_connection.cursor() as database_cursor:
            found_peptides = Peptide.select(
                database_cursor,
                WhereCondition([association_condition], [accession]),
                fetchall=True,
                include_metadata=True
            )
            found_peptides.sort(key=lambda found_peptide: found_peptide.mass)

            def json_stream() -> Iterator[bytes]:
                yield b"{\"peptides\": ["
                needs_separator = False
                for found_peptide in found_peptides:
                    # Comma goes in front of every element except the first one
                    if needs_separator:
                        yield b","
                    needs_separator = True
                    yield from found_peptide.to_json()
                yield b"]}"

            return Response(json_stream(), content_type="application/json")
Пример #5
0
    def proteins(sequence: str):
        """
        Returns the proteins associated with the given peptide sequence as a streamed
        JSON document, split into "reviewed_proteins" and "unreviewed_proteins".

        Parameters
        ----------
        sequence : str
            Peptide sequence, converted to upper case before the lookup.

        Returns
        -------
        Response
            Flask response with content type application/json
        """
        peptide = Peptide(sequence.upper(), 0)
        # Proteins are found through the protein/peptide association table.
        association_condition = (
            "accession = ANY(SELECT protein_accession "
            f"FROM {ProteinPeptideAssociation.TABLE_NAME} as ppa "
            "WHERE ppa.partition = %s AND ppa.peptide_mass = %s "
            "AND ppa.peptide_sequence = %s)"
        )
        database_connection = get_database_connection()
        with database_connection.cursor() as database_cursor:
            proteins = Protein.select(
                database_cursor,
                WhereCondition(
                    [association_condition],
                    [peptide.partition, peptide.mass, peptide.sequence]),
                True)

            # Split the proteins by review status in a single pass
            reviewed_proteins, unreviewed_proteins = [], []
            for protein in proteins:
                target = reviewed_proteins if protein.is_reviewed else unreviewed_proteins
                target.append(protein)

            def stream_protein_list(protein_list: list) -> Iterator[bytes]:
                # Streams the JSON of each protein, comma separated
                needs_separator = False
                for protein in protein_list:
                    if needs_separator:
                        yield b","
                    needs_separator = True
                    yield from protein.to_json()

            def json_stream() -> Iterator[bytes]:
                yield b"{\"reviewed_proteins\": ["
                yield from stream_protein_list(reviewed_proteins)
                yield b"],\"unreviewed_proteins\": ["
                yield from stream_protein_list(unreviewed_proteins)
                yield b"]}"

            return Response(json_stream(), content_type="application/json")
    def test_mass_calculation(self):
        """
        Checks the peptide mass against a manually calculated mass: the sum of
        (amino acid count * mono mass) over all counted amino acids plus one water.
        """
        fictional_peptide = Peptide(FICTIONAL_SEQUENCE, 0)

        # If the amino acid count test passes, the counts can be used to calculate the mass manually.
        # The original note states that no external tool supports all of our known amino acids.
        mass = sum(
            getattr(fictional_peptide, f"{one_letter_code.lower()}_count")
            * AminoAcid.get_by_one_letter_code(one_letter_code).mono_mass
            # Every letter that has a count column; there is none for 'X'
            for one_letter_code in "ABCDEFGHIJKLMNOPQRSTUVWYZ"
        ) + H2O.mono_mass

        self.assertEqual(mass, fictional_peptide.mass)
    def test_amino_acid_counts(self):
        """
        Checks each amino acid count of the fictional peptide against its expected
        value and that the counts sum up to the sequence length.
        """
        fictional_peptide = Peptide(FICTIONAL_SEQUENCE, 0)

        # Expected count per amino acid (lower case one letter code)
        expected_counts = {
            "a": 2, "b": 2, "c": 4, "d": 5, "e": 2,
            "f": 1, "g": 2, "h": 2, "i": 6, "j": 3,
            "k": 5, "l": 7, "m": 1, "n": 1, "o": 4,
            "p": 3, "q": 4, "r": 3, "s": 6, "t": 7,
            "u": 1, "v": 5, "w": 4, "y": 1, "z": 1,
        }
        for letter, expected_count in expected_counts.items():
            actual_count = getattr(fictional_peptide, f"{letter}_count")
            self.assertEqual(actual_count, expected_count)

        # All counts together must cover the whole sequence
        amino_acid_sum = sum(
            getattr(fictional_peptide, f"{letter}_count")
            for letter in expected_counts)

        self.assertEqual(len(fictional_peptide.sequence), amino_acid_sum)
Пример #8
0
    def test_without_modifications(self):
        """
        Inserts all peptides of PEPTIDES_FOR_UNMODIFIED_SEARCH, builds the where
        condition for an empty modification collection, checks that it contains
        exactly one mass tolerance condition and that only the peptides listed
        under 'matching' are returned.
        """
        # Fill the database with every peptide group of the fixture
        with self.database_connection:
            with self.database_connection.cursor() as database_cursor:
                for sequences in PEPTIDES_FOR_UNMODIFIED_SEARCH.values():
                    new_peptides = [
                        Peptide(sequence, 0) for sequence in sequences
                    ]
                    Peptide.bulk_insert(database_cursor, new_peptides)

        modification_collection = ModificationCollection([])
        precursor_in_dalton = thomson_to_dalton(MASS_TO_CHARGE_RATIO, CHARGE)
        precursor = mass_to_int(precursor_in_dalton)

        with self.database_connection:
            with self.database_connection.cursor() as database_cursor:
                combinations = ModificationCombinationList(
                    modification_collection, precursor, PRECURSOR_TOLERANCE,
                    PRECURSOR_TOLERANCE, VARIABLE_MODIFICATION_MAXIMUM)
                where_condition = combinations.to_where_condition()

                # Render the condition with its values to inspect the final SQL
                rendered_condition = database_cursor.mogrify(
                    where_condition.get_condition_str(),
                    where_condition.values)
                select_conditions_string = rendered_condition.decode('utf-8')
                matches = re.findall(self.__class__.MASS_TOLERANCE_REGEX,
                                     select_conditions_string)
                # Without modifications there is only one between-condition.
                self.assertEqual(len(matches), 1)

                peptides = Peptide.select(database_cursor,
                                          where_condition,
                                          fetchall=True)

                # Exactly the matching peptides must be found, nothing else
                expected_sequences = PEPTIDES_FOR_UNMODIFIED_SEARCH['matching']
                self.assertEqual(len(peptides), len(expected_sequences))
                for found_peptide in peptides:
                    self.assertIn(found_peptide.sequence, expected_sequences)
Пример #9
0
 def test_validation(self):
     """
     Checks that every peptide built from the plain sequences in
     PEPTIDE_SEQUENCES passes the mass validation.
     """
     test_class = self.__class__
     peptide_mass_validator = PeptideMassValidator(
         test_class.MODIFICATION_COLLECTION,
         test_class.NUMBER_OF_VARIABLE_MODIFICATIONS,
         test_class.PRECURSOR_RANGE)
     for plain_sequence, annotated_sequence in test_class.PEPTIDE_SEQUENCES:
         missed_cleavages = Trypsin.count_missed_cleavages(plain_sequence)
         peptide = Peptide(plain_sequence, missed_cleavages)
         # NOTE(review): the second argument presumably makes the validator annotate
         # the peptide with modification markers - confirm against PeptideMassValidator.
         is_valid = peptide_mass_validator.validate(peptide, True)
         failure_message = f"expected: {annotated_sequence}; is: {peptide.sequence_with_modification_markers}"
         self.assertTrue(is_valid, failure_message)
Пример #10
0
    def run(self):
        """
        Starts the process and the enqueuing of updatable peptides.

        Selects all peptides whose metadata is not up to date through a named
        (server-side) cursor and enqueues them in batches of PEPTIDE_BATCH.
        On a database error the query is retried from the start with a fresh
        connection until it succeeds or the termination event is set.
        """
        self.activate_signal_handling()

        self.__general_log.send("Start enqueuing updatable peptides.")
        database_connection = None
        try:
            # retry loop
            while not self.termination_event.is_set():
                try:
                    if database_connection is None \
                            or database_connection.closed != 0:
                        database_connection = psycopg2.connect(
                            self.__database_url)
                    with database_connection.cursor(
                            name='updatable_peptide_collector'
                    ) as database_cursor:
                        database_cursor.itersize = 1000
                        database_cursor.execute(
                            f"SELECT sequence, number_of_missed_cleavages FROM {Peptide.TABLE_NAME} WHERE is_metadata_up_to_date = false;"
                        )
                        peptides = []
                        for peptide_row in database_cursor:
                            peptides.append(
                                Peptide(peptide_row[0], peptide_row[1]))
                            if len(peptides) == self.__class__.PEPTIDE_BATCH:
                                self.__enque_peptides(peptides)
                                peptides = []
                            if self.termination_event.is_set():
                                break  # break cursor loop
                        # Enqueue last batch of peptides
                        if len(peptides):
                            self.__enque_peptides(peptides)
                            peptides = []
                        break  # break retry loop
                except psycopg2.Error as error:
                    # Catch database errors and disconnects
                    self.__general_log.send(
                        f"error occured, see:\n{error}\ntry again")
                    # Drop the connection: a connection that is still open may be left in
                    # an aborted transaction, which would make every retry fail again.
                    if database_connection is not None \
                            and database_connection.closed == 0:
                        try:
                            database_connection.close()
                        except psycopg2.Error:
                            # Best effort, the connection is discarded anyway.
                            pass
                    database_connection = None
        finally:
            # Do not leak the connection, whether the loop finished or raised.
            if database_connection is not None \
                    and database_connection.closed == 0:
                database_connection.close()
        self.__general_log.send("All updatable peptides enqueued.")
        self.__general_log.close()
Пример #11
0
        def generate_text_stream() -> Iterator[str]:
            """
            Yields the sequences of all requested peptides which exist in the database,
            separated by newlines.

            The peptides are read from the enclosing `partitions` mapping
            (partition -> peptides) and looked up per partition in chunks of
            ApiPeptidesController.PEPTIDE_LOOKUP_CHUNKS peptides.

            Yields
            ------
            str
                A peptide sequence or the newline between two sequences.
            """
            chunk_size = ApiPeptidesController.PEPTIDE_LOOKUP_CHUNKS
            database_connection = macpepdb_pool.getconn()
            try:
                with database_connection.cursor() as database_cursor:
                    database_cursor.itersize = chunk_size
                    # Tracked across all chunks and partitions, so sequences of
                    # different queries are separated by a newline as well.
                    is_first_sequence = True
                    # Request each partition in a separate query
                    for partition, part_peptides in partitions.items():
                        for chunk_start in range(0, len(part_peptides),
                                                 chunk_size):
                            chunk = part_peptides[chunk_start:chunk_start +
                                                  chunk_size]
                            where_condition = WhereCondition(
                                [
                                    "partition = %s",
                                    "AND",
                                    "(mass, sequence) IN (" +
                                    ",".join(["(%s, %s)"] * len(chunk)) + ")",
                                ],
                                [partition] + list(
                                    itertools.chain.from_iterable(
                                        (chunk_peptide.mass,
                                         chunk_peptide.sequence)
                                        for chunk_peptide in chunk)),
                            )
                            for peptide in Peptide.select(database_cursor,
                                                          where_condition,
                                                          stream=True):
                                if not is_first_sequence:
                                    yield "\n"
                                is_first_sequence = False
                                yield peptide.sequence
            finally:
                # Always hand the connection back to the pool
                macpepdb_pool.putconn(database_connection)
Пример #12
0
    def test_with_modifications(self):  # lower hit
        """
        Inserts all peptides of PEPTIDES_FOR_MODIFIED_SEARCH, queries them with the
        where clause built from the modifications CSV file and checks that only
        the peptides listed under 'matching' are returned.
        """
        # Add peptides to database
        session = self.session_factory()
        try:
            for sequences in PEPTIDES_FOR_MODIFIED_SEARCH.values():
                for sequence in sequences:
                    session.add(Peptide(sequence, 0))
            session.commit()
        finally:
            session.close()

        csv_file_path = pathlib.Path("./test_files/modifications.csv")
        modification_collection = ModificationCollection.read_from_csv_file(
            csv_file_path)

        precursor = mass_to_int(thomson_to_dalton(MASS_TO_CHARGE_RATIO,
                                                  CHARGE))

        # The where clause only needs to be built once
        builder = ModifiedPeptideWhereClauseBuilder(
            modification_collection, precursor, PRECURSOR_TOLERANCE,
            PRECURSOR_TOLERANCE, VARIABLE_MODIFICATION_MAXIMUM)
        where_clause = builder.build(Peptide)

        # Create fresh session
        session = self.session_factory()
        try:
            peptides = session.query(Peptide).filter(where_clause).all()

            # Check if only matching peptides were found
            self.assertEqual(len(peptides),
                             len(PEPTIDES_FOR_MODIFIED_SEARCH['matching']))
            for peptide in peptides:
                self.assertIn(peptide.sequence,
                              PEPTIDES_FOR_MODIFIED_SEARCH['matching'])
        finally:
            # Close the query session even if an assertion fails
            session.close()
Пример #13
0
    def test_without_modifications(self):
        """
        Inserts all peptides of PEPTIDES_FOR_UNMODIFIED_SEARCH, builds the where
        clause for an empty modification collection, checks that it contains
        exactly one weight tolerance condition and that only the peptides listed
        under 'matching' are returned.
        """
        # Add peptides to database
        session = self.session_factory()
        try:
            for sequences in PEPTIDES_FOR_UNMODIFIED_SEARCH.values():
                for sequence in sequences:
                    session.add(Peptide(sequence, 0))
            session.commit()
        finally:
            session.close()

        modification_collection = ModificationCollection([])
        precursor = mass_to_int(thomson_to_dalton(MASS_TO_CHARGE_RATIO,
                                                  CHARGE))

        builder = ModifiedPeptideWhereClauseBuilder(
            modification_collection, precursor, PRECURSOR_TOLERANCE,
            PRECURSOR_TOLERANCE, VARIABLE_MODIFICATION_MAXIMUM)
        where_clause = builder.build(Peptide)

        # Compile with literal binds to inspect the final SQL
        where_clause_string = str(
            where_clause.compile(compile_kwargs={"literal_binds": True}))
        matches = re.findall(self.__class__.WEIGHT_TOLERANCE_REGEX,
                             where_clause_string)
        # Without modifications there is only one between-condition.
        self.assertEqual(len(matches), 1)

        # Create fresh session
        session = self.session_factory()
        try:
            peptides = session.query(Peptide).filter(where_clause).all()

            # Check if only matching peptides were found
            self.assertEqual(len(peptides),
                             len(PEPTIDES_FOR_UNMODIFIED_SEARCH['matching']))
            for peptide in peptides:
                self.assertIn(peptide.sequence,
                              PEPTIDES_FOR_UNMODIFIED_SEARCH['matching'])
        finally:
            # Close the query session even if an assertion fails
            session.close()
Пример #14
0
    def sequence_lookup():
        """
        Check if the incoming peptide sequences exists in MaCPepDB

        Expects a JSON body with the key "sequences" containing a list of peptide
        sequences. The sequences found in the database are streamed back as plain
        text, one sequence per line.

        Returns
        -------
        Response
            Flask response; 422 with an "errors" payload if the request is invalid.
        """
        data = request.json
        errors = defaultdict(list)

        if not isinstance(data, dict) or "sequences" not in data:
            errors["sequences"].append("cannot be empty")
        elif not isinstance(data["sequences"], list):
            errors["sequences"].append("must be a list")

        # Report validation errors before touching data["sequences"]
        if len(errors) > 0:
            return jsonify({"errors": errors}), 422

        # Sort peptides by partition
        partitions = defaultdict(list)
        for sequence in data["sequences"]:
            peptide = Peptide(sequence, 0)
            partitions[peptide.partition].append(peptide)

        def generate_text_stream() -> Iterator[str]:
            chunk_size = ApiPeptidesController.PEPTIDE_LOOKUP_CHUNKS
            database_connection = macpepdb_pool.getconn()
            try:
                with database_connection.cursor() as database_cursor:
                    database_cursor.itersize = chunk_size
                    # Tracked across all chunks and partitions, so sequences of
                    # different queries are separated by a newline as well.
                    is_first_sequence = True
                    # Request each partition in a separate query
                    for partition, part_peptides in partitions.items():
                        # Look the peptides up in chunks of PEPTIDE_LOOKUP_CHUNKS
                        for chunk_start in range(0, len(part_peptides),
                                                 chunk_size):
                            chunk = part_peptides[chunk_start:chunk_start +
                                                  chunk_size]
                            where_condition = WhereCondition(
                                [
                                    "partition = %s",
                                    "AND",
                                    "(mass, sequence) IN (" +
                                    ",".join(["(%s, %s)"] * len(chunk)) + ")",
                                ],
                                [partition] + list(
                                    itertools.chain.from_iterable(
                                        (chunk_peptide.mass,
                                         chunk_peptide.sequence)
                                        for chunk_peptide in chunk)),
                            )
                            for found_peptide in Peptide.select(
                                    database_cursor,
                                    where_condition,
                                    stream=True):
                                if not is_first_sequence:
                                    yield "\n"
                                is_first_sequence = False
                                yield found_peptide.sequence
            finally:
                # Always hand the connection back to the pool
                macpepdb_pool.putconn(database_connection)

        return Response(generate_text_stream(), content_type="text/plain")
    def __calculate_columns_and_precursor_range(
            self, modification_counters: list, precursor: int,
            lower_precursor_tolerance_ppm: int,
            upper_precursor_tolerance_ppm: int):
        """
        Derives the column conditions and the precursor range from the given
        modification counters, precursor and tolerances.

        Appends conditions for the amino acid counts, the applied variable terminus
        modifications, the mass range and the partition(s) to the column conditions
        and stores the precursor range, which is the precursor reduced by the mass
        delta of the counted modifications.

        Parameters
        ----------
        modification_counters : list
            List of modification counters for this PTM/mass combination.
        precursor : int
            Precursor
        lower_precursor_tolerance_ppm : int
            Lower precursor tolerance
        upper_precursor_tolerance_ppm : int
            Upper precursor tolerance
        """
        # One letter code -> [count, is_static], e.g.: "c": [4, True]
        occurrences_by_code = {}
        # Terminus modifications as [modification, is_applied], e.g.: [Modification, False]
        n_terminus_state = None
        c_terminus_state = None
        # Mass delta accumulated over the counted modifications
        total_delta = 0

        for counter in modification_counters:
            modification = counter.modification
            count = counter.count
            if not modification.is_terminus_modification and (
                    modification.is_static or
                (modification.is_variable and count > 0)):
                # Static non-terminus modification, or variable one which is actually used
                code = modification.amino_acid.one_letter_code
                if code not in occurrences_by_code:
                    occurrences_by_code[code] = [
                        count, modification.is_static
                    ]
                else:
                    occurrences_by_code[code][0] += count
                total_delta += count * modification.delta
            elif modification.is_terminus_modification and modification.is_variable:
                is_applied = count > 0
                if modification.is_position_n_terminus:
                    n_terminus_state = [modification, is_applied]
                if modification.is_position_c_terminus:
                    c_terminus_state = [modification, is_applied]
                total_delta += count * modification.delta
            elif modification.is_static and modification.is_terminus_modification:
                total_delta += count * modification.delta

        for code, (count, is_static) in occurrences_by_code.items():
            # Static modifications require exactly this count of the amino acid,
            # variable modifications at least this count. Column name is e.g. a_count.
            self.__column_conditions.append(
                ColumnCondition(f"{code.lower()}_count",
                                "= %s" if is_static else ">= %s", (count, )))

        # A variable terminus modification only adds a condition when it is applied
        # (its absence is uninteresting because the modification is variable).
        for column_name, terminus_state in (("n_terminus", n_terminus_state),
                                            ("c_terminus", c_terminus_state)):
            if not terminus_state:
                continue
            modification, is_applied = terminus_state
            if modification.is_variable and is_applied:
                self.__column_conditions.append(
                    ColumnCondition(
                        column_name, "= %s",
                        (modification.amino_acid.
                         get_one_letter_code_ascii_dec(), )))

        # Mass condition for the precursor without the modification deltas
        self.__precursor_range = PrecursorRange(precursor - total_delta,
                                                lower_precursor_tolerance_ppm,
                                                upper_precursor_tolerance_ppm)
        lower_limit = self.__precursor_range.lower_limit
        upper_limit = self.__precursor_range.upper_limit
        self.__column_conditions.append(
            ColumnCondition("mass", "BETWEEN %s AND %s",
                            (lower_limit, upper_limit)))

        # Partition condition: equality for one partition, a range for several
        first_partition = Peptide.get_partition(lower_limit)
        last_partition = Peptide.get_partition(upper_limit)
        if first_partition != last_partition:
            partition_condition = ColumnCondition(
                "partition", "BETWEEN %s AND %s",
                (first_partition, last_partition))
        else:
            partition_condition = ColumnCondition("partition", "= %s",
                                                  (first_partition, ))
        self.__column_conditions.append(partition_condition)
Пример #16
0
    def test_with_real_data(self):
        """
        Cross-checks the SQL based peptide search against a manual validation.

        The test database is filled from the test protein file first. Then every
        stored peptide is validated in Python by a PeptideMassValidator and the
        accepted sequences are compared with the sequences returned by a query
        which uses the where condition of a ModificationCombinationList built
        from the same precursor, tolerance and modification settings.
        """
        VARIABLE_MODIFICATION_LIMIT = 2
        # Mass of MFPVTJEDTEGNVJTVSPPCYGFJQJR
        PRECURSOR = 3025492916648
        PRECURSOR_TOLERANCE = 20
        # With the given mass, tolerance and modifications 3 peptides should be found

        test_files_path = pathlib.Path('./test_files')
        modifications_file_path = pathlib.Path(
            './test_files/modifications.csv')
        protein_data_test_file_path = test_files_path.joinpath('proteins.txt')
        work_dir = pathlib.Path(f"./tmp/{self.id()}")

        # Fill the database
        self.prepare_workdir(work_dir, test_files_path,
                             protein_data_test_file_path)
        maintenance = DatabaseMaintenance(os.getenv("TEST_MACPEPDB_URL"),
                                          work_dir, 4, 5, 'Trypsin', 2, 5, 40)
        maintenance.start()

        modification_collection = ModificationCollection.read_from_csv_file(
            modifications_file_path)
        precursor_range = PrecursorRange(PRECURSOR, PRECURSOR_TOLERANCE,
                                         PRECURSOR_TOLERANCE)
        peptide_mass_validator = PeptideMassValidator(
            modification_collection, VARIABLE_MODIFICATION_LIMIT,
            precursor_range)

        # Manual validation: run through all peptides (in batches of 1000 peptides)
        # and collect those which match the precursor and modification requirements
        validated_matching_peptide_sequences = set()
        with self.database_connection:
            with self.database_connection.cursor() as database_cursor:
                database_cursor.execute(
                    f"SELECT sequence, number_of_missed_cleavages FROM {Peptide.TABLE_NAME};"
                )
                rows = database_cursor.fetchmany(1000)
                while len(rows):
                    for sequence, number_of_missed_cleavages in rows:
                        candidate = Peptide(sequence,
                                            number_of_missed_cleavages)
                        if peptide_mass_validator.validate(candidate):
                            validated_matching_peptide_sequences.add(
                                candidate.sequence)
                    rows = database_cursor.fetchmany(1000)

        # Database search: let the database select the peptides with the generated where condition
        with self.database_connection:
            with self.database_connection.cursor() as database_cursor:
                modification_combination_list = ModificationCombinationList(
                    modification_collection, PRECURSOR, PRECURSOR_TOLERANCE,
                    PRECURSOR_TOLERANCE, VARIABLE_MODIFICATION_LIMIT)
                peptides = Peptide.select(
                    database_cursor,
                    modification_combination_list.to_where_condition(),
                    fetchall=True)

                queried_matching_peptide_sequences = {
                    queried_peptide.sequence
                    for queried_peptide in peptides
                }

                # Both sets must have the same size ...
                self.assertEqual(len(validated_matching_peptide_sequences),
                                 len(queried_matching_peptide_sequences))

                # ... and each sequence of one set must be contained in the other set
                for sequence in queried_matching_peptide_sequences:
                    self.assertIn(sequence,
                                  validated_matching_peptide_sequences)

                for sequence in validated_matching_peptide_sequences:
                    self.assertIn(sequence, queried_matching_peptide_sequences)
Пример #17
0
    def stream(peptide_conversion: Callable[[int, Peptide],
                                            Iterator[ByteString]],
               delimiter: ByteString, pre_peptide_content: ByteString,
               post_spectra_content: Callable[[Any, WhereCondition],
                                              ByteString],
               where_condition: WhereCondition, order_by_instruction: str,
               offset: int, limit: int, include_metadata: bool,
               metadata_condition: MetadataCondition) -> Iterable[ByteString]:
        """
        Queries peptides and yields the pieces of a streamed response body.

        Parameters
        ----------
        peptide_conversion : Callable[[int, Peptide], Iterator[ByteString]]
            Function which receives the peptide index and the peptide and yields the peptide as byte strings for the response.
        delimiter : ByteString
            Delimiter which is yielded between two written peptides
        pre_peptide_content : ByteString
            Content which is yielded once, before any peptide is queried
        post_spectra_content : Callable[[Any, WhereCondition], ByteString]
            Function which returns the content after the peptides, e.g. to add the count.
            Arguments are a database cursor and the where condition
        where_condition : WhereCondition
            WhereCondition for the SQL query
        order_by_instruction : str
            OrderBy instruction for the SQL query
        offset : int
            Search offset. Peptides with an index lower than `offset - 1` are not written.
        limit : int
            Return limit. The peptide loop stops as soon as the number of written peptides equals the limit.
        include_metadata : bool
            Passed to the peptide query as `include_metadata`
        metadata_condition : MetadataCondition
            Conditions for metadata. If it has conditions, only peptides whose metadata validates are written.

        Yields
        ------
        ByteString
            Chunks of the response body
        """
        # The generator body runs after the request handler has returned, so a connection bound to the
        # request can not be used here. Take a separate one from the pool; the `finally` block hands it
        # back when the generator finishes, fails or is closed.
        database_connection = macpepdb_pool.getconn()
        # Metadata only needs to be validated if there are conditions for it
        do_metadata_checks = metadata_condition.has_conditions()
        try:
            yield pre_peptide_content
            with database_connection.cursor(
                    name="peptide_search") as database_cursor:
                queried_peptides = Peptide.select(
                    database_cursor,
                    where_condition,
                    order_by=order_by_instruction,
                    include_metadata=include_metadata,
                    stream=True)
                # Offset and limit are applied manually, so the written peptides have to be counted
                written_peptides = 0
                for peptide_idx, peptide in enumerate(queried_peptides):
                    is_selected = peptide_idx >= offset - 1
                    if is_selected and do_metadata_checks:
                        is_selected = metadata_condition.validate(
                            peptide.metadata)
                    if is_selected:
                        if written_peptides > 0:
                            yield delimiter
                        yield from peptide_conversion(peptide_idx, peptide)
                        written_peptides += 1
                    # Leave the peptide loop when the limit is hit
                    if written_peptides == limit:
                        break
            with database_connection.cursor() as database_cursor:
                yield post_spectra_content(database_cursor, where_condition)
        except BaseException as e:
            app.logger.error(
                f"steam throws err => {e}\n\ntraceback => {traceback.format_exc()}"
            )
            raise e
        finally:
            macpepdb_pool.putconn(database_connection)
 def test_termini(self):
     """Checks that both termini of the fictional peptide are reported as "A"."""
     peptide_under_test = Peptide(FICTIONAL_SEQUENCE, 0)
     expected_terminus = "A"
     self.assertEqual(expected_terminus, peptide_under_test.n_terminus)
     self.assertEqual(expected_terminus, peptide_under_test.c_terminus)
Пример #19
0
    def verify_database_integrity(self, proteins_from_file: List[Protein],
                                  enzym: DigestEnzyme):
        """
        Verifies the database by:
        1. Checking that all proteins from the file exist and that their attributes match
        2. Digesting the given proteins and checking that:
            2.1 The peptides are found in the database (by primary key)
            2.2 The values which are generated on the fly and not sent from the database, e.g. amino acid counts, match the ones in the database.
        3. Checking that every protein/peptide pair has an association and that the association count matches the actual protein/peptide relationships
        4. Checking that all peptides have a related metadata record
        5. Checking that the database status is ready, not in maintenance mode and has an update timestamp greater than zero

        Parameters
        ----------
        proteins_from_file : List[Protein]
            Proteins read from the protein file
        enzym : DigestEnzyme
            Enzyme for digesting. Should match the one which was used for the database creation.
        """
        peptides_from_file_proteins = set()
        for file_protein in proteins_from_file:
            peptides_from_file_proteins.update(enzym.digest(file_protein))

        # Columns which are compared directly against the database record.
        # The names are kept in one list, so the order of the selected columns and
        # the order of the compared peptide values can not drift apart.
        amino_acid_count_columns = [
            f"{one_letter_code}_count"
            for one_letter_code in "abcdefghijklmnopqrstuvwyz"
        ]
        generated_columns = amino_acid_count_columns + [
            "n_terminus", "c_terminus"
        ]
        generated_columns_query = (
            f"SELECT {', '.join(generated_columns)} "
            f"FROM {Peptide.TABLE_NAME} "
            "WHERE partition = %s AND mass = %s AND sequence = %s")

        with self.database_connection.cursor() as database_cursor:
            # Check if the protein count in the database equals the number of proteins from the file
            database_cursor.execute(
                f"SELECT count(*) FROM {Protein.TABLE_NAME};")
            self.assertEqual(len(proteins_from_file),
                             database_cursor.fetchone()[0])

            # Check if all proteins are stored with the correct attributes
            for file_protein in proteins_from_file:
                db_protein = Protein.select(
                    database_cursor,
                    WhereCondition(["accession = %s"],
                                   [file_protein.accession]),
                )
                self.assertIsNotNone(db_protein)
                self.assertEqual(db_protein.accession, file_protein.accession)
                self.assertEqual(db_protein.secondary_accessions,
                                 file_protein.secondary_accessions)
                self.assertEqual(db_protein.entry_name,
                                 file_protein.entry_name)
                self.assertEqual(db_protein.name, file_protein.name)
                self.assertEqual(db_protein.sequence, file_protein.sequence)
                self.assertEqual(db_protein.taxonomy_id,
                                 file_protein.taxonomy_id)
                self.assertEqual(db_protein.proteome_id,
                                 file_protein.proteome_id)
                self.assertEqual(db_protein.is_reviewed,
                                 file_protein.is_reviewed)

            # Compare the peptide count of the set with the count in the database.
            # Because peptides are not removed from the database it is possible to have more peptides
            # in the database after protein updates than in the file.
            database_cursor.execute(
                f"SELECT count(*) FROM {Peptide.TABLE_NAME};")
            self.assertLessEqual(len(peptides_from_file_proteins),
                                 database_cursor.fetchone()[0])

            for file_peptide in peptides_from_file_proteins:
                db_peptide = Peptide.select(
                    database_cursor,
                    WhereCondition([
                        "partition = %s", "AND", "mass = %s", "AND",
                        "sequence = %s"
                    ], [
                        file_peptide.partition, file_peptide.mass,
                        file_peptide.sequence
                    ]))
                self.assertIsNotNone(db_peptide)
                self.assertEqual(db_peptide.sequence, file_peptide.sequence)
                self.assertEqual(db_peptide.mass, file_peptide.mass)
                self.assertEqual(db_peptide.partition, file_peptide.partition)
                self.assertEqual(db_peptide.number_of_missed_cleavages,
                                 file_peptide.number_of_missed_cleavages)

                # Because the amino acid counts are counted on the fly to save I/O and bandwidth, check the values stored in the database
                database_cursor.execute(
                    generated_columns_query,
                    (file_peptide.partition, file_peptide.mass,
                     file_peptide.sequence))
                db_peptide_record = database_cursor.fetchone()
                self.assertIsNotNone(db_peptide_record)
                # Same order as `generated_columns`: amino acid counts first, then both termini
                file_peptide_values = [
                    getattr(file_peptide, count_column)
                    for count_column in amino_acid_count_columns
                ]
                file_peptide_values.append(
                    file_peptide.get_n_terminus_ascii_dec())
                file_peptide_values.append(
                    file_peptide.get_c_terminus_ascii_dec())
                for column, file_peptide_value, db_peptide_value in zip(
                        generated_columns, file_peptide_values,
                        db_peptide_record):
                    self.assertEqual(
                        file_peptide_value, db_peptide_value,
                        f"column '{column}' of peptide '{file_peptide.sequence}' does not match"
                    )

            # Check protein/peptide-associations from both directions
            desired_number_of_associations = 0
            for file_protein in proteins_from_file:
                for file_peptide in enzym.digest(file_protein):
                    # Increase association counter
                    desired_number_of_associations += 1
                    database_cursor.execute((
                        "SELECT true "
                        f"FROM {ProteinPeptideAssociation.TABLE_NAME} "
                        "WHERE protein_accession = %s AND partition = %s AND peptide_mass = %s AND peptide_sequence = %s;"
                    ), (
                        file_protein.accession,
                        file_peptide.partition,
                        file_peptide.mass,
                        file_peptide.sequence,
                    ))
                    # fetchone() returns None if no association matched. Assert on the row itself
                    # before indexing it, so a missing association is reported as a test failure
                    # instead of a TypeError.
                    association_row = database_cursor.fetchone()
                    self.assertIsNotNone(
                        association_row,
                        f"association between protein '{file_protein.accession}' and peptide '{file_peptide.sequence}' is missing"
                    )
                    self.assertTrue(association_row[0])

            # Check association counter. Must be equal even after updates.
            database_cursor.execute(
                f"SELECT count(*) FROM {ProteinPeptideAssociation.TABLE_NAME};"
            )
            self.assertEqual(desired_number_of_associations,
                             database_cursor.fetchone()[0])

            # Check if there are as many metadata records as peptides
            database_cursor.execute(
                f"SELECT count(*) FROM {PeptideMetadata.TABLE_NAME};")
            metadata_count = database_cursor.fetchone()[0]
            database_cursor.execute(
                f"SELECT count(*) FROM {Peptide.TABLE_NAME};")
            peptide_count = database_cursor.fetchone()[0]
            self.assertEqual(metadata_count, peptide_count)
            # Check if the current peptides have updated metadata
            for file_peptide in peptides_from_file_proteins:
                file_peptide.fetch_metadata_from_proteins(database_cursor)
                db_metadata = PeptideMetadata.select(database_cursor,
                                                     file_peptide)
                self.assertIsNotNone(
                    db_metadata,
                    f"metadata for peptide '{file_peptide.sequence}' is missing"
                )
                if db_metadata:
                    self.assertEqual(db_metadata.is_swiss_prot,
                                     file_peptide.metadata.is_swiss_prot)
                    self.assertEqual(db_metadata.is_trembl,
                                     file_peptide.metadata.is_trembl)
                    # The order of the ids is not relevant, so compare them sorted
                    self.assertEqual(
                        sorted(db_metadata.taxonomy_ids),
                        sorted(file_peptide.metadata.taxonomy_ids))
                    self.assertEqual(
                        sorted(db_metadata.unique_taxonomy_ids),
                        sorted(file_peptide.metadata.unique_taxonomy_ids))
                    self.assertEqual(
                        sorted(db_metadata.proteome_ids),
                        sorted(file_peptide.metadata.proteome_ids))

            # Check if maintenance mode is false and the update timestamp is greater than zero
            database_status = MaintenanceInformation.select(
                database_cursor, MaintenanceInformation.DATABASE_STATUS_KEY)
            self.assertIsNotNone(database_status)
            self.assertGreater(database_status.values['last_update'], 0)
            self.assertEqual(database_status.values['status'],
                             DatabaseStatus.READY.value)
            self.assertFalse(database_status.values['maintenance_mode'])
Пример #20
0
    def sequence_mass(sequence):
        """
        Returns a JSON response with the key 'mass', containing the mass of the given sequence converted by mass_to_float.
        """
        mass = Peptide(sequence, 0).mass
        return jsonify({'mass': mass_to_float(mass)})
Пример #21
0
    def digest():
        """
        Digest a given peptide/sequence, search the resulting peptides in the database and return matching and not matching peptides in separate arrays.

        The request body is read as JSON. Besides the digestion parameters it must contain `sequence`.
        If `do_database_search` is the boolean `true`, the digested peptides are looked up in the database.

        Returns
        -------
        A streamed JSON response with the keys "database" (peptides found in the database),
        "digestion" (remaining digested peptides) and "count" (size of both arrays together),
        or a JSON response with the key "errors" and status 422 if the parameters are invalid.
        """
        data = request.get_json()
        errors = ApiDigestionController.check_digestion_parameters(data)

        if "sequence" not in data:
            errors["sequence"].append("cannot be empty")

        # Nothing to digest with invalid parameters
        if len(errors) > 0:
            return jsonify({"errors": errors}), 422

        EnzymeClass = get_digestion_enzyme_by_name("trypsin")
        enzyme = EnzymeClass(data["maximum_number_of_missed_cleavages"],
                             data["minimum_peptide_length"],
                             data["maximum_peptide_length"])
        # The protein only serves as container for the sequence, so the other attributes are placeholders
        digestion_peptides = enzyme.digest(
            Protein("TMP", [], "TMP", "TMP", data["sequence"], [], [], False,
                    0))

        database_peptides = []
        # Only the boolean `true` enables the search.
        # Without digested peptides there is nothing to search for, and the query has to be skipped,
        # because an empty tuple would result in `IN ()`, which is not valid SQL.
        if data.get("do_database_search") is True and digestion_peptides:
            peptide_keys = tuple(
                (peptide.partition, peptide.mass, peptide.sequence)
                for peptide in digestion_peptides)
            database_connection = get_database_connection()
            with database_connection.cursor() as database_cursor:
                database_peptides = Peptide.select(
                    database_cursor,
                    WhereCondition(["(partition, mass, sequence) IN %s"],
                                   (peptide_keys, )),
                    fetchall=True)
            database_peptides.sort(key=lambda peptide: peptide.mass)
            # Keep only the peptides which were not found in the database
            digestion_peptides = [
                peptide for peptide in digestion_peptides
                if peptide not in database_peptides
            ]

        digestion_peptides.sort(key=lambda peptide: peptide.mass)

        def json_stream() -> Iterator[bytes]:
            yield b"{\"database\": ["
            for peptide_idx, peptide in enumerate(database_peptides):
                if peptide_idx > 0:
                    yield b","
                yield from peptide.to_json()
            yield b"],\"digestion\": ["
            for peptide_idx, peptide in enumerate(digestion_peptides):
                if peptide_idx > 0:
                    yield b","
                yield from peptide.to_json()
            yield f"],\"count\": {len(database_peptides) +  len(digestion_peptides)}}}".encode(
                "utf-8")

        return Response(json_stream(), content_type="application/json")
Пример #22
0
    def test_validation(self):
        """
        Checks PeptideMassValidator.validate() with static and variable modifications.

        For each scenario the expected mass is calculated from the peptide weight plus the
        deltas of the applied modifications and used as precursor with a tolerance of 0.
        The peptide must validate with a sufficient maximum number of variable modifications
        and must not validate when more variable modifications are required than allowed.
        """
        static_carbamidomethylation_of_c = Modification(
            'unimod:4', 'carbamidomethylation of cysteine',
            AminoAcid.get_by_one_letter_code('C'), mass_to_int(57.021464),
            True, ModificationPosition.ANYWHERE)
        variable_oxidation_of_m = Modification(
            'unimod:35', 'oxidation of methionine',
            AminoAcid.get_by_one_letter_code('M'), mass_to_int(15.994915),
            False, ModificationPosition.ANYWHERE)
        static_custom_modification_of_n_terminal_d = Modification(
            'custom:1', 'custom of aspartic acid',
            AminoAcid.get_by_one_letter_code('D'), mass_to_int(10.01541), True,
            ModificationPosition.N_TERMINUS)
        variable_custom_modification_of_n_terminal_d = Modification(
            'custom:2', 'custom of aspartic acid',
            AminoAcid.get_by_one_letter_code('D'), mass_to_int(10.01541),
            False, ModificationPosition.N_TERMINUS)
        static_custom_modification_of_c_terminal_r = Modification(
            'custom:3', 'custom of arginine',
            AminoAcid.get_by_one_letter_code('R'), mass_to_int(6.153215), True,
            ModificationPosition.C_TERMINUS)
        variable_custom_modification_of_c_terminal_r = Modification(
            'custom:4', 'custom of arginine',
            AminoAcid.get_by_one_letter_code('R'), mass_to_int(6.153215),
            False, ModificationPosition.C_TERMINUS)

        peptide = Peptide(LEPTIN_PEPTIDE_SEQUENCE, 2)

        # Static carbamidomethylation of C
        expected_peptide_mass = peptide.weight + peptide.c_count * static_carbamidomethylation_of_c.delta
        modification_collection = ModificationCollection(
            [static_carbamidomethylation_of_c])
        precursor_range = PrecursorRange(expected_peptide_mass, 0, 0)
        validator = PeptideMassValidator(modification_collection, 0,
                                         precursor_range)
        self.assertTrue(validator.validate(peptide))

        # This should also match with an allowed variable modification (where actually none is applied)
        # Static carbamidomethylation of C
        # Variable oxidation of M (not considered in expected_peptide_mass)
        modification_collection = ModificationCollection(
            [static_carbamidomethylation_of_c, variable_oxidation_of_m])
        validator = PeptideMassValidator(modification_collection, 3,
                                         precursor_range)
        self.assertTrue(validator.validate(peptide))

        # Static carbamidomethylation of C
        # 1 variable oxidation of M
        expected_peptide_mass = peptide.weight \
            + peptide.c_count * static_carbamidomethylation_of_c.delta \
            + 1 * variable_oxidation_of_m.delta
        modification_collection = ModificationCollection(
            [static_carbamidomethylation_of_c, variable_oxidation_of_m])
        precursor_range = PrecursorRange(expected_peptide_mass, 0, 0)
        validator = PeptideMassValidator(modification_collection, 3,
                                         precursor_range)
        self.assertTrue(validator.validate(peptide))

        # This should not match if no variable modifications are allowed
        # Static carbamidomethylation of C
        # Variable oxidation of M (considered in expected_peptide_mass but no variable modification allowed in validation)
        validator.set_maximum_number_of_variable_modifications(0)
        self.assertFalse(validator.validate(peptide))

        # Let's replace two Js with Ms and test 3 applied variable oxidations of M
        # Static carbamidomethylation of C
        # 3 variable oxidations of M
        peptide = Peptide(LEPTIN_PEPTIDE_SEQUENCE.replace('J', 'M', 2), 2)
        expected_peptide_mass = peptide.weight \
            + peptide.c_count * static_carbamidomethylation_of_c.delta \
            + 3 * variable_oxidation_of_m.delta
        modification_collection = ModificationCollection(
            [static_carbamidomethylation_of_c, variable_oxidation_of_m])
        precursor_range = PrecursorRange(expected_peptide_mass, 0, 0)
        validator = PeptideMassValidator(modification_collection, 3,
                                         precursor_range)
        self.assertTrue(validator.validate(peptide))

        # This should fail with only 2 allowed variable modifications
        validator.set_maximum_number_of_variable_modifications(2)
        self.assertFalse(validator.validate(peptide))

        # Test variable n-terminal modification
        # Variable n-terminal modification of D
        # Static carbamidomethylation of C
        # 2 variable oxidations of M
        expected_peptide_mass = peptide.weight \
            + variable_custom_modification_of_n_terminal_d.delta \
            + peptide.c_count * static_carbamidomethylation_of_c.delta \
            + 2 * variable_oxidation_of_m.delta
        modification_collection = ModificationCollection([
            static_carbamidomethylation_of_c, variable_oxidation_of_m,
            variable_custom_modification_of_n_terminal_d
        ])
        precursor_range = PrecursorRange(expected_peptide_mass, 0, 0)
        validator = PeptideMassValidator(modification_collection, 3,
                                         precursor_range)
        self.assertTrue(validator.validate(peptide))

        # This should fail with only 2 allowed variable modifications
        validator.set_maximum_number_of_variable_modifications(2)
        self.assertFalse(validator.validate(peptide))

        # Test static n-terminal modification
        # Static n-terminal modification of D
        # Static carbamidomethylation of C
        # 2 variable oxidations of M
        expected_peptide_mass = peptide.weight \
            + static_custom_modification_of_n_terminal_d.delta \
            + peptide.c_count * static_carbamidomethylation_of_c.delta \
            + 2 * variable_oxidation_of_m.delta
        modification_collection = ModificationCollection([
            static_carbamidomethylation_of_c, variable_oxidation_of_m,
            static_custom_modification_of_n_terminal_d
        ])
        precursor_range = PrecursorRange(expected_peptide_mass, 0, 0)
        validator = PeptideMassValidator(modification_collection, 3,
                                         precursor_range)
        self.assertTrue(validator.validate(peptide))

        # Test variable c-terminal modification
        # Variable c-terminal modification of R
        # Static carbamidomethylation of C
        # 2 variable oxidations of M
        expected_peptide_mass = peptide.weight \
            + variable_custom_modification_of_c_terminal_r.delta \
            + peptide.c_count * static_carbamidomethylation_of_c.delta \
            + 2 * variable_oxidation_of_m.delta
        modification_collection = ModificationCollection([
            static_carbamidomethylation_of_c, variable_oxidation_of_m,
            variable_custom_modification_of_c_terminal_r
        ])
        precursor_range = PrecursorRange(expected_peptide_mass, 0, 0)
        validator = PeptideMassValidator(modification_collection, 3,
                                         precursor_range)
        self.assertTrue(validator.validate(peptide))

        # This should fail with only 2 allowed variable modifications
        validator.set_maximum_number_of_variable_modifications(2)
        self.assertFalse(validator.validate(peptide))

        # Test static c-terminal modification
        # Static c-terminal modification of R
        # Static carbamidomethylation of C
        # 2 variable oxidations of M
        expected_peptide_mass = peptide.weight \
            + static_custom_modification_of_c_terminal_r.delta \
            + peptide.c_count * static_carbamidomethylation_of_c.delta \
            + 2 * variable_oxidation_of_m.delta
        modification_collection = ModificationCollection([
            static_carbamidomethylation_of_c, variable_oxidation_of_m,
            static_custom_modification_of_c_terminal_r
        ])
        precursor_range = PrecursorRange(expected_peptide_mass, 0, 0)
        validator = PeptideMassValidator(modification_collection, 3,
                                         precursor_range)
        self.assertTrue(validator.validate(peptide))