def to_where_condition(self) -> WhereCondition:
    """
    Builds the WHERE-part of a SQL-query, together with its values, which
    can be used to fetch the peptides for the mass/modification-combinations
    of this list.

    If the list is empty, the condition only limits partition and mass to
    the tolerance range around the precursor.

    Returns
    -------
    WhereCondition
    """
    if not len(self):
        # No combinations: search the plain precursor range only.
        plain_range = PrecursorRange(
            self.__precursor,
            self.__lower_precursor_tolerance_ppm,
            self.__upper_precursor_tolerance_ppm)
        condition_parts = [
            "partition BETWEEN %s AND %s",
            "AND",
            "mass BETWEEN %s AND %s"
        ]
        condition_values = [
            Peptide.get_partition(plain_range.lower_limit),
            Peptide.get_partition(plain_range.upper_limit),
            plain_range.lower_limit,
            plain_range.upper_limit
        ]
        return WhereCondition(condition_parts, condition_values)

    # Every combination contributes one alternative, joined with OR.
    merged_condition = WhereCondition([], [])
    for combination in self:
        merged_condition.concatenate(combination.to_where_condition(), "OR")
    return merged_condition
def show(sequence: str):
    """
    Looks up a single peptide by its sequence and returns it as JSON.

    The optional query parameter `is_reviewed` (an integer, 0 is false, any
    other integer is true) restricts the result: if true, the peptide is
    only returned when its metadata is flagged as Swiss-Prot, if false only
    when it is flagged as TrEMBL.

    Parameters
    ----------
    sequence : str
        Peptide sequence, it is converted to upper case before the lookup.

    Returns
    -------
    Response
        The peptide as JSON, 404 if the peptide does not exist or does not
        match the `is_reviewed` filter, 422 if `is_reviewed` is no integer.
    """
    is_reviewed = request.args.get("is_reviewed", None)
    if is_reviewed is not None:
        try:
            is_reviewed = bool(int(is_reviewed))
        except ValueError:
            # Query parameters are user input, answer with a validation
            # error instead of an unhandled exception.
            return jsonify({
                "errors": {
                    "is_reviewed": ["must be an integer"]
                }
            }), 422

    # Build a peptide from the sequence to get the partition and mass for the lookup
    lookup_peptide = Peptide(sequence.upper(), 0, None)

    database_connection = get_database_connection()
    with database_connection.cursor() as database_cursor:
        peptide = Peptide.select(
            database_cursor,
            WhereCondition(
                ["partition = %s", "AND", "mass = %s", "AND", "sequence = %s"],
                [lookup_peptide.partition, lookup_peptide.mass, lookup_peptide.sequence]
            ),
            include_metadata=True
        )

    if peptide is None:
        return jsonify({"errors": {"sequence": ["not found"]}}), 404

    # Return peptide if is_reviewed is not requested (None),
    # or is_reviewed is requested and True and metadata is_swiss_prot is also True
    # or is_reviewed is requested and False and metadata is_trembl is True
    if is_reviewed is None \
        or is_reviewed and peptide.metadata.is_swiss_prot \
        or not is_reviewed and peptide.metadata.is_trembl:
        return Response(peptide.to_json(), content_type="application/json")
    return jsonify({}), 404
def test_with_modifications(self):
    """
    Inserts the peptides for the modified search and checks that a search
    with the modifications from the CSV file returns only the peptides
    listed as 'matching'.
    """
    # lower hit
    # Fill the database with all peptide groups
    with self.database_connection:
        with self.database_connection.cursor() as database_cursor:
            for sequences in PEPTIDES_FOR_MODIFIED_SEARCH.values():
                new_peptides = [Peptide(sequence, 0) for sequence in sequences]
                Peptide.bulk_insert(database_cursor, new_peptides)

    modification_collection = ModificationCollection.read_from_csv_file(
        pathlib.Path("./test_files/modifications.csv"))
    dalton_mass = thomson_to_dalton(MASS_TO_CHARGE_RATIO, CHARGE)
    precursor = mass_to_int(dalton_mass)

    with self.database_connection:
        with self.database_connection.cursor() as database_cursor:
            combination_list = ModificationCombinationList(
                modification_collection,
                precursor,
                PRECURSOR_TOLERANCE,
                PRECURSOR_TOLERANCE,
                VARIABLE_MODIFICATION_MAXIMUM)
            peptides = Peptide.select(
                database_cursor,
                combination_list.to_where_condition(),
                fetchall=True)

            # Only the matching peptides may be found
            expected_sequences = PEPTIDES_FOR_MODIFIED_SEARCH['matching']
            self.assertEqual(len(peptides), len(expected_sequences))
            for found_peptide in peptides:
                self.assertIn(found_peptide.sequence, expected_sequences)
def peptides(accession: str):
    """
    Returns all peptides which are associated with the given protein,
    sorted by mass, as a streamed JSON document of the form
    `{"peptides": [...]}`.

    Parameters
    ----------
    accession : str
        Accession of the protein

    Returns
    -------
    Response
        Flask response
    """
    database_connection = get_database_connection()
    with database_connection.cursor() as database_cursor:
        # Select the peptides through the protein/peptide association table
        association_condition = WhereCondition(
            [
                "(peps.partition, peps.mass, peps.sequence) IN ("
                "SELECT partition, peptide_mass, peptide_sequence "
                f"FROM {ProteinPeptideAssociation.TABLE_NAME} as ppa "
                "WHERE ppa.protein_accession = %s)"
            ],
            [accession]
        )
        protein_peptides = Peptide.select(
            database_cursor,
            association_condition,
            fetchall=True,
            include_metadata=True
        )
        protein_peptides.sort(key=lambda pep: pep.mass)

    def json_stream() -> Iterator[bytes]:
        yield b'{"peptides": ['
        for position, pep in enumerate(protein_peptides):
            # Comma before every element except the first one
            if position > 0:
                yield b","
            yield from pep.to_json()
        yield b"]}"

    return Response(json_stream(), content_type="application/json")
def proteins(sequence: str):
    """
    Returns the proteins which contain the given peptide, split into
    reviewed and unreviewed proteins, as a streamed JSON document.

    Parameters
    ----------
    sequence : str
        Peptide sequence, it is converted to upper case before the lookup.

    Returns
    -------
    Response
        Flask response
    """
    peptide = Peptide(sequence.upper(), 0)

    reviewed_proteins = []
    unreviewed_proteins = []

    database_connection = get_database_connection()
    with database_connection.cursor() as database_cursor:
        association_condition = WhereCondition(
            [
                "accession = ANY("
                f"SELECT protein_accession FROM {ProteinPeptideAssociation.TABLE_NAME} as ppa "
                "WHERE ppa.partition = %s AND ppa.peptide_mass = %s AND ppa.peptide_sequence = %s)"
            ],
            [peptide.partition, peptide.mass, peptide.sequence]
        )
        found_proteins = Protein.select(
            database_cursor,
            association_condition,
            True
        )

        # Sort each protein into its review group
        for protein in found_proteins:
            target = reviewed_proteins if protein.is_reviewed else unreviewed_proteins
            target.append(protein)

    def json_elements(elements) -> Iterator[bytes]:
        # Yields the comma separated JSON of the given proteins
        for position, element in enumerate(elements):
            if position > 0:
                yield b","
            yield from element.to_json()

    def json_stream() -> Iterator[bytes]:
        yield b'{"reviewed_proteins": ['
        yield from json_elements(reviewed_proteins)
        yield b'],"unreviewed_proteins": ['
        yield from json_elements(unreviewed_proteins)
        yield b"]}"

    return Response(json_stream(), content_type="application/json")
def test_mass_calculation(self):
    """
    Checks the peptide mass against a mass which is calculated manually
    from the amino acid counts.
    """
    fictional_peptide = Peptide(FICTIONAL_SEQUENCE, 0)
    # If the amino acid count test passes, we can use the counts to calculate the mass manually.
    # There is no external tool which supports all of our known amino acids,
    # so this is the way to double check the weight.
    one_letter_codes = "ABCDEFGHIJKLMNOPQRSTUVWYZ"
    amino_acid_mass = sum(
        getattr(fictional_peptide, f"{code.lower()}_count")
        * AminoAcid.get_by_one_letter_code(code).mono_mass
        for code in one_letter_codes
    )
    # One water is added on top of the amino acid masses
    mass = amino_acid_mass + H2O.mono_mass
    self.assertEqual(mass, fictional_peptide.mass)
def test_amino_acid_counts(self):
    """
    Checks the count of each amino acid of the fictional peptide and
    that the counts add up to the sequence length.
    """
    fictional_peptide = Peptide(FICTIONAL_SEQUENCE, 0)
    # Expected occurrences per one letter code
    expected_counts = {
        "a": 2, "b": 2, "c": 4, "d": 5, "e": 2,
        "f": 1, "g": 2, "h": 2, "i": 6, "j": 3,
        "k": 5, "l": 7, "m": 1, "n": 1, "o": 4,
        "p": 3, "q": 4, "r": 3, "s": 6, "t": 7,
        "u": 1, "v": 5, "w": 4, "y": 1, "z": 1,
    }
    for code, expected_count in expected_counts.items():
        self.assertEqual(
            getattr(fictional_peptide, f"{code}_count"),
            expected_count)

    amino_acid_sum = sum(
        getattr(fictional_peptide, f"{code}_count")
        for code in expected_counts
    )
    self.assertEqual(len(fictional_peptide.sequence), amino_acid_sum)
def test_without_modifications(self):
    """
    Inserts the peptides for the unmodified search and checks that a search
    without modifications results in exactly one mass condition and returns
    only the peptides listed as 'matching'.
    """
    # Fill the database with all peptide groups
    with self.database_connection:
        with self.database_connection.cursor() as database_cursor:
            for sequences in PEPTIDES_FOR_UNMODIFIED_SEARCH.values():
                new_peptides = [Peptide(sequence, 0) for sequence in sequences]
                Peptide.bulk_insert(database_cursor, new_peptides)

    modification_collection = ModificationCollection([])
    dalton_mass = thomson_to_dalton(MASS_TO_CHARGE_RATIO, CHARGE)
    precursor = mass_to_int(dalton_mass)

    with self.database_connection:
        with self.database_connection.cursor() as database_cursor:
            combination_list = ModificationCombinationList(
                modification_collection,
                precursor,
                PRECURSOR_TOLERANCE,
                PRECURSOR_TOLERANCE,
                VARIABLE_MODIFICATION_MAXIMUM)
            where_condition = combination_list.to_where_condition()

            # Render the condition with its values to inspect the SQL
            rendered_condition = database_cursor.mogrify(
                where_condition.get_condition_str(),
                where_condition.values)
            select_conditions_string = rendered_condition.decode('utf-8')
            matches = re.findall(
                self.__class__.MASS_TOLERANCE_REGEX,
                select_conditions_string)
            # Without modifications there is only one between-condition.
            self.assertEqual(len(matches), 1)

            peptides = Peptide.select(
                database_cursor,
                where_condition,
                fetchall=True)

            # Only the matching peptides may be found
            expected_sequences = PEPTIDES_FOR_UNMODIFIED_SEARCH['matching']
            self.assertEqual(len(peptides), len(expected_sequences))
            for found_peptide in peptides:
                self.assertIn(found_peptide.sequence, expected_sequences)
def test_validation(self):
    """
    Checks if the mass validation accepts every peptide of the test sequences.
    """
    test_class = self.__class__
    validator = PeptideMassValidator(
        test_class.MODIFICATION_COLLECTION,
        test_class.NUMBER_OF_VARIABLE_MODIFICATIONS,
        test_class.PRECURSOR_RANGE)

    for plain_sequence, annotated_sequence in test_class.PEPTIDE_SEQUENCES:
        missed_cleavages = Trypsin.count_missed_cleavages(plain_sequence)
        peptide = Peptide(plain_sequence, missed_cleavages)
        is_valid = validator.validate(peptide, True)
        self.assertTrue(
            is_valid,
            f"expected: {annotated_sequence}; is: {peptide.sequence_with_modification_markers}"
        )
def run(self):
    """
    Starts the process and the enqueuing of updatable peptides.

    All peptides whose metadata is not up to date are read with a
    server side cursor and enqueued in batches of `PEPTIDE_BATCH`.
    Database errors are logged and the query is started again until it
    succeeds or the termination event is set. The database connection is
    closed before the method returns.
    """
    self.activate_signal_handling()
    self.__general_log.send("Start enqueuing updatable peptides.")
    database_connection = None
    try:
        # retry loop
        while not self.termination_event.is_set():
            try:
                # (Re)connect if there is no connection or the connection was closed
                if database_connection is None or database_connection.closed != 0:
                    database_connection = psycopg2.connect(self.__database_url)
                with database_connection.cursor(
                        name='updatable_peptide_collector') as database_cursor:
                    # Named (server side) cursor, fetch the rows in blocks of 1000
                    database_cursor.itersize = 1000
                    database_cursor.execute(
                        f"SELECT sequence, number_of_missed_cleavages FROM {Peptide.TABLE_NAME} WHERE is_metadata_up_to_date = false;"
                    )
                    peptides = []
                    for peptide_row in database_cursor:
                        peptides.append(Peptide(peptide_row[0], peptide_row[1]))
                        if len(peptides) == self.__class__.PEPTIDE_BATCH:
                            self.__enque_peptides(peptides)
                            peptides = []
                        if self.termination_event.is_set():
                            break  # break cursor loop
                    # Enqueue last batch of peptides
                    if len(peptides):
                        self.__enque_peptides(peptides)
                break  # break retry loop
            except psycopg2.Error as error:
                # Catch database errors and disconnects
                self.__general_log.send(
                    f"error occured, see:\n{error}\ntry again")
    finally:
        # Do not leave the connection open when the work is done or failed
        if database_connection is not None:
            database_connection.close()
    self.__general_log.send("All updatable peptides enqueued.")
    self.__general_log.close()
def generate_text_stream() -> Iterator[str]:
    """
    Looks up the peptides of `partitions` (taken from the enclosing scope)
    in the database and yields the sequences of the found peptides,
    separated by newlines.

    The lookup uses its own connection from the pool, which is returned
    when the generator is finished or closed.

    Yields
    ------
    Iterator[str]
        Sequences and newline separators
    """
    database_connection = macpepdb_pool.getconn()
    try:
        with database_connection.cursor() as database_cursor:
            chunk_size = ApiPeptidesController.PEPTIDE_LOOKUP_CHUNKS
            database_cursor.itersize = chunk_size
            # Tracked over all chunks and partitions, otherwise the last sequence
            # of a chunk and the first sequence of the next chunk are glued together.
            is_first_sequence = True
            # Request each partition in a separate query
            for partition, part_peptides in partitions.items():
                # Query the peptides of the partition in chunks
                for chunk_start in range(0, len(part_peptides), chunk_size):
                    chunk = part_peptides[chunk_start:chunk_start + chunk_size]
                    where_condition = WhereCondition(
                        [
                            "partition = %s",
                            "AND",
                            "(mass, sequence) IN (" + ",".join(["(%s, %s)"] * len(chunk)) + ")"
                        ],
                        [partition] + list(
                            itertools.chain.from_iterable(
                                (chunk_peptide.mass, chunk_peptide.sequence)
                                for chunk_peptide in chunk
                            )
                        )
                    )
                    for peptide in Peptide.select(database_cursor, where_condition, stream=True):
                        if not is_first_sequence:
                            yield "\n"
                        is_first_sequence = False
                        yield peptide.sequence
    finally:
        macpepdb_pool.putconn(database_connection)
def test_with_modifications(self):
    """
    Inserts the peptides for the modified search and checks that a query
    with the where clause for the modifications from the CSV file returns
    only the peptides listed as 'matching'.
    """
    # lower hit
    # Add peptides to database
    session = self.session_factory()
    try:
        for sequences in PEPTIDES_FOR_MODIFIED_SEARCH.values():
            for sequence in sequences:
                session.add(Peptide(sequence, 0))
        session.commit()
    finally:
        session.close()

    csv_file_path = pathlib.Path("./test_files/modifications.csv")
    modification_collection = ModificationCollection.read_from_csv_file(
        csv_file_path)

    precursor = mass_to_int(thomson_to_dalton(MASS_TO_CHARGE_RATIO, CHARGE))

    # The where clause is built once, it does not depend on a session
    builder = ModifiedPeptideWhereClauseBuilder(
        modification_collection,
        precursor,
        PRECURSOR_TOLERANCE,
        PRECURSOR_TOLERANCE,
        VARIABLE_MODIFICATION_MAXIMUM)
    where_clause = builder.build(Peptide)

    # Create fresh session
    session = self.session_factory()
    try:
        peptides = session.query(Peptide).filter(where_clause).all()

        # Check if only matching peptides were found
        self.assertEqual(len(peptides),
                         len(PEPTIDES_FOR_MODIFIED_SEARCH['matching']))
        for peptide in peptides:
            self.assertIn(peptide.sequence,
                          PEPTIDES_FOR_MODIFIED_SEARCH['matching'])
    finally:
        # Close the session even if an assertion fails
        session.close()
def test_without_modifications(self):
    """
    Inserts the peptides for the unmodified search and checks that the
    where clause without modifications contains exactly one mass condition
    and that the query returns only the peptides listed as 'matching'.
    """
    # Add peptides to database
    session = self.session_factory()
    try:
        for sequences in PEPTIDES_FOR_UNMODIFIED_SEARCH.values():
            for sequence in sequences:
                session.add(Peptide(sequence, 0))
        session.commit()
    finally:
        session.close()

    modification_collection = ModificationCollection([])

    precursor = mass_to_int(thomson_to_dalton(MASS_TO_CHARGE_RATIO, CHARGE))

    # Create fresh session
    session = self.session_factory()
    try:
        builder = ModifiedPeptideWhereClauseBuilder(
            modification_collection,
            precursor,
            PRECURSOR_TOLERANCE,
            PRECURSOR_TOLERANCE,
            VARIABLE_MODIFICATION_MAXIMUM)
        where_clause = builder.build(Peptide)

        # Compile the clause with its values to inspect the SQL
        where_clause_string = str(
            where_clause.compile(compile_kwargs={"literal_binds": True}))
        matches = re.findall(self.__class__.WEIGHT_TOLERANCE_REGEX,
                             where_clause_string)
        # Without modifications there is only one between-condition.
        self.assertEqual(len(matches), 1)

        peptides = session.query(Peptide).filter(where_clause).all()

        # Check if only matching peptides were found
        self.assertEqual(len(peptides),
                         len(PEPTIDES_FOR_UNMODIFIED_SEARCH['matching']))
        for peptide in peptides:
            self.assertIn(peptide.sequence,
                          PEPTIDES_FOR_UNMODIFIED_SEARCH['matching'])
    finally:
        # Close the session even if an assertion fails
        session.close()
def sequence_lookup():
    """
    Checks if the incoming peptide sequences exist in MaCPepDB.

    The request body must be a JSON object with the key `sequences`
    containing a list of peptide sequences. The response is a plain text
    stream with the sequences which were found, one per line.

    Returns
    -------
    Response
        Flask response, or a JSON error description with status 422 if the
        request body is invalid.
    """
    data = request.json
    errors = defaultdict(list)

    if not isinstance(data, dict) or "sequences" not in data:
        errors["sequences"].append("cannot be empty")
    elif not isinstance(data["sequences"], list):
        errors["sequences"].append("must be a list")

    # Stop before the sequences are touched, otherwise invalid requests
    # fail with an exception instead of the validation errors.
    if len(errors) > 0:
        return jsonify({"errors": errors}), 422

    peptides = [Peptide(sequence, 0) for sequence in data["sequences"]]

    # Sort peptides by partition
    partitions = defaultdict(list)
    for peptide in peptides:
        partitions[peptide.partition].append(peptide)

    def generate_text_stream() -> Iterator[str]:
        # The request context is gone when the stream is consumed, so the
        # generator uses its own connection from the pool.
        database_connection = macpepdb_pool.getconn()
        try:
            with database_connection.cursor() as database_cursor:
                chunk_size = ApiPeptidesController.PEPTIDE_LOOKUP_CHUNKS
                database_cursor.itersize = chunk_size
                # Tracked over all chunks and partitions, otherwise the last sequence
                # of a chunk and the first sequence of the next chunk are glued together.
                is_first_sequence = True
                # Request each partition in a separate query
                for partition, part_peptides in partitions.items():
                    # Query the peptides of the partition in chunks
                    for chunk_start in range(0, len(part_peptides), chunk_size):
                        chunk = part_peptides[chunk_start:chunk_start + chunk_size]
                        where_condition = WhereCondition(
                            [
                                "partition = %s",
                                "AND",
                                "(mass, sequence) IN (" + ",".join(["(%s, %s)"] * len(chunk)) + ")"
                            ],
                            [partition] + list(
                                itertools.chain.from_iterable(
                                    (chunk_peptide.mass, chunk_peptide.sequence)
                                    for chunk_peptide in chunk
                                )
                            )
                        )
                        for found_peptide in Peptide.select(database_cursor, where_condition, stream=True):
                            if not is_first_sequence:
                                yield "\n"
                            is_first_sequence = False
                            yield found_peptide.sequence
        finally:
            macpepdb_pool.putconn(database_connection)

    return Response(generate_text_stream(), content_type="text/plain")
def __calculate_columns_and_precursor_range(
        self, modification_counters: list, precursor: int,
        lower_precursor_tolerance_ppm: int,
        upper_precursor_tolerance_ppm: int):
    """
    Creates the column conditions and the precursor range from the given
    modification counters, precursor and tolerances.

    The created conditions are appended to `self.__column_conditions`: one
    per amino acid with a counted modification, one per applied variable
    terminus modification, one for the mass and one for the partition.
    The precursor range, built from the precursor minus the summed mass
    delta of the counted modifications, is stored in `self.__precursor_range`.

    Parameters
    ----------
    modification_counters : list
        List of modification counters for this PTM/mass combination.
    precursor : int
        Precursor
    lower_precursor_tolerance_ppm : int
        Lower precursor tolerance (ppm)
    upper_precursor_tolerance_ppm : int
        Upper precursor tolerance (ppm)
    """
    # Dict with amino acid one letter code as key and [count, is_static] as value, e.g.: "c": [4, True]
    amino_acid_occurences = {}
    # Used terminus modification of the form [modification, is_applied], e.g.: [Modification, False]
    n_terminus_modification = None
    c_terminus_modification = None
    # Sum of the mass delta by the applied modifications
    delta_sum = 0
    for counter in modification_counters:
        # Counter of a static non-terminus modification or of a variable non-terminus
        # modification with a count higher than zero: add it to the occurrences.
        if not counter.modification.is_terminus_modification and (
                counter.modification.is_static or
            (counter.modification.is_variable and counter.count > 0)):
            if counter.modification.amino_acid.one_letter_code in amino_acid_occurences:
                # Amino acid already known, only the count is increased. The is_static
                # flag of the first counter for this amino acid is kept.
                amino_acid_occurences[counter.modification.amino_acid.
                                      one_letter_code][0] += counter.count
            else:
                amino_acid_occurences[
                    counter.modification.amino_acid.one_letter_code] = [
                        counter.count, counter.modification.is_static
                    ]
            delta_sum += counter.count * counter.modification.delta
        # Variable terminus modification: remember it together with the
        # information whether it is applied (count higher than zero).
        elif counter.modification.is_terminus_modification and counter.modification.is_variable:
            if counter.modification.is_position_n_terminus:
                n_terminus_modification = [
                    counter.modification, counter.count > 0
                ]
            if counter.modification.is_position_c_terminus:
                c_terminus_modification = [
                    counter.modification, counter.count > 0
                ]
            delta_sum += counter.count * counter.modification.delta
        # Static terminus modification: only contributes to the mass delta,
        # no column condition is created for it.
        elif counter.modification.is_static and counter.modification.is_terminus_modification:
            delta_sum += counter.count * counter.modification.delta

    for one_letter_code, count_and_type in amino_acid_occurences.items():
        # Build the column name, e.g. a_count
        column_name = "{}_count".format(one_letter_code.lower())
        # Add condition. If the modification is static, the amino acid count must be equal,
        # if the modification is variable there may be more amino acids than modifications.
        sql_operator = "= %s" if count_and_type[1] else ">= %s"
        self.__column_conditions.append(
            ColumnCondition(column_name, sql_operator,
                            (count_and_type[0], )))

    # For n- and c-terminus modification check if there is one:
    # * If the modification is variable and applied check for presence (absence is uninteresting because it is a variable modification)
    if n_terminus_modification and (n_terminus_modification[0].is_variable
                                    and n_terminus_modification[1]):
        self.__column_conditions.append(
            ColumnCondition("n_terminus", "= %s",
                            (n_terminus_modification[0].amino_acid.
                             get_one_letter_code_ascii_dec(), )))
    if c_terminus_modification and (c_terminus_modification[0].is_variable
                                    and c_terminus_modification[1]):
        self.__column_conditions.append(
            ColumnCondition("c_terminus", "= %s",
                            (c_terminus_modification[0].amino_acid.
                             get_one_letter_code_ascii_dec(), )))

    # Add the mass between condition. The mass delta of the modifications is
    # subtracted from the precursor before the tolerance range is built.
    self.__precursor_range = PrecursorRange(precursor - delta_sum,
                                            lower_precursor_tolerance_ppm,
                                            upper_precursor_tolerance_ppm)
    self.__column_conditions.append(
        ColumnCondition("mass", "BETWEEN %s AND %s",
                        (self.__precursor_range.lower_limit,
                         self.__precursor_range.upper_limit)))

    # Add the partition condition for the partitions of both mass limits
    first_partition = Peptide.get_partition(
        self.__precursor_range.lower_limit)
    last_partition = Peptide.get_partition(
        self.__precursor_range.upper_limit)
    conditions_operator = ""
    conditions_values = ()
    if first_partition == last_partition:
        # Both limits are in the same partition, an equals condition is enough
        conditions_operator = "= %s"
        conditions_values = (first_partition, )
    else:
        conditions_operator = "BETWEEN %s AND %s"
        conditions_values = (first_partition, last_partition)
    self.__column_conditions.append(
        ColumnCondition("partition", conditions_operator,
                        conditions_values))
def test_with_real_data(self):
    """
    Builds a database from the protein test file and checks that the
    peptides selected by the modification combination list are the same
    peptides which pass the peptide mass validator.
    """
    VARIABLE_MODIFICATION_LIMIT = 2
    # Mass of MFPVTJEDTEGNVJTVSPPCYGFJQJR
    PRECURSOR = 3025492916648
    PRECURSOR_TOLERANCE = 20

    # With the given mass, tolerance and modifications 3 peptides should be found

    modifications_file_path = pathlib.Path('./test_files/modifications.csv')
    work_dir = pathlib.Path(f"./tmp/{self.id()}")
    test_files_path = pathlib.Path('./test_files')
    protein_data_test_file_path = test_files_path / 'proteins.txt'

    self.prepare_workdir(
        work_dir,
        test_files_path,
        protein_data_test_file_path)

    database_url = os.getenv("TEST_MACPEPDB_URL")
    maintenance = DatabaseMaintenance(
        database_url, work_dir, 4, 5, 'Trypsin', 2, 5, 40)
    maintenance.start()

    modification_collection = ModificationCollection.read_from_csv_file(
        modifications_file_path)
    validation_range = PrecursorRange(
        PRECURSOR, PRECURSOR_TOLERANCE, PRECURSOR_TOLERANCE)
    peptide_mass_validator = PeptideMassValidator(
        modification_collection,
        VARIABLE_MODIFICATION_LIMIT,
        validation_range)

    validated_matching_peptide_sequences = set()

    with self.database_connection:
        with self.database_connection.cursor() as database_cursor:
            # Run through all peptides (in batches of 1000 peptides) and check
            # which match the precursor and modification requirements
            database_cursor.execute(
                f"SELECT sequence, number_of_missed_cleavages FROM {Peptide.TABLE_NAME};"
            )
            rows = database_cursor.fetchmany(1000)
            while len(rows):
                for sequence, missed_cleavages in rows:
                    candidate = Peptide(sequence, missed_cleavages)
                    if peptide_mass_validator.validate(candidate):
                        validated_matching_peptide_sequences.add(
                            candidate.sequence)
                rows = database_cursor.fetchmany(1000)

    with self.database_connection:
        with self.database_connection.cursor() as database_cursor:
            combination_list = ModificationCombinationList(
                modification_collection,
                PRECURSOR,
                PRECURSOR_TOLERANCE,
                PRECURSOR_TOLERANCE,
                VARIABLE_MODIFICATION_LIMIT)
            peptides = Peptide.select(
                database_cursor,
                combination_list.to_where_condition(),
                fetchall=True)

            queried_matching_peptide_sequences = {
                queried_peptide.sequence for queried_peptide in peptides
            }

            # Both sets must have the same size
            self.assertEqual(
                len(validated_matching_peptide_sequences),
                len(queried_matching_peptide_sequences))

            # Cross check if every sequence of one set is in the other set
            for sequence in queried_matching_peptide_sequences:
                self.assertIn(sequence, validated_matching_peptide_sequences)

            for sequence in validated_matching_peptide_sequences:
                self.assertIn(sequence, queried_matching_peptide_sequences)
def stream(peptide_conversion: Callable[[int, Peptide], Iterator[ByteString]], delimiter: ByteString, pre_peptide_content: ByteString,
           post_spectra_content: Callable[[Any, WhereCondition], ByteString], where_condition: WhereCondition, order_by_instruction: str,
           offset: int, limit: int, include_metadata: bool, metadata_condition: MetadataCondition) -> Iterable[ByteString]:
    """
    Queries peptides and yields the content for a streamed response.

    Parameters
    ----------
    peptide_conversion : Callable[[int, Peptide], Iterator[ByteString]]
        Function which gets the peptide index and the peptide and yields the peptide as bytes for the response.
    delimiter : ByteString
        Delimiter which is yielded between two peptides
    pre_peptide_content : ByteString
        Content which is yielded before the first peptide
    post_spectra_content : Callable[[Any, WhereCondition], ByteString]
        Function which returns the content yielded after the peptides, e.g. to add the count.
        It is called with a database cursor and the where condition.
    where_condition : WhereCondition
        WhereCondition for the SQL query
    order_by_instruction : str
        OrderBy instruction for the SQL query
    offset : int
        Search offset
    limit : int
        Return limit
    include_metadata : bool
        Passed to the peptide selection to include the metadata
    metadata_condition : MetadataCondition
        Conditions for the metadata. If it has conditions, each peptide's metadata is validated against it.

    Yields
    ------
    Iterator[Iterable[ByteString]]
        Response body
    """
    # The generator runs after the response was returned and the app context was torn down,
    # so the connection of the request handling can not be used.
    # A separate connection is taken from the pool and returned when the generator ends.
    pool_connection = macpepdb_pool.getconn()

    # Metadata is only validated if there are conditions for it
    validate_metadata = metadata_condition.has_conditions()

    try:
        yield pre_peptide_content
        with pool_connection.cursor(name="peptide_search") as search_cursor:
            selected_peptides = Peptide.select(
                search_cursor,
                where_condition,
                order_by=order_by_instruction,
                include_metadata=include_metadata,
                stream=True
            )
            # Number of written peptides, needed for the manual limit/offset handling
            written_peptides = 0
            for peptide_idx, peptide in enumerate(selected_peptides):
                # NOTE(review): with `offset - 1` an offset of 0 and of 1 both skip nothing,
                # confirm this is the intended offset semantic.
                is_behind_offset = peptide_idx >= offset - 1
                if is_behind_offset and (not validate_metadata or metadata_condition.validate(peptide.metadata)):
                    if written_peptides > 0:
                        yield delimiter
                    yield from peptide_conversion(peptide_idx, peptide)
                    written_peptides += 1
                # Stop reading from the cursor when the limit is hit
                if written_peptides == limit:
                    break
        with pool_connection.cursor() as post_cursor:
            yield post_spectra_content(post_cursor, where_condition)
    except BaseException as e:
        app.logger.error(
            f"steam throws err => {e}\n\ntraceback => {traceback.format_exc()}"
        )
        raise e
    finally:
        macpepdb_pool.putconn(pool_connection)
def test_termini(self):
    """
    Checks that both termini of the fictional peptide are "A".
    """
    peptide = Peptide(FICTIONAL_SEQUENCE, 0)
    n_terminus = peptide.n_terminus
    self.assertEqual("A", n_terminus)
    c_terminus = peptide.c_terminus
    self.assertEqual("A", c_terminus)
def verify_database_integrity(self, proteins_from_file: List[Protein], enzym: DigestEnzyme):
    """
    Verifies the database by:
    1. Checking if all proteins from the file exist and their attributes match
    2. Digesting the given proteins and checking if:
        2.1 The peptides are found in the database (by primary key)
        2.2 The values which are generated on the fly and not sent from the database,
            e.g. amino acid counts, match the ones in the database.
    3. Checking if all proteins and their peptides have an association and if the
       association count matches the actual protein/peptide relationships
    4. Checking if all peptides have a related metadata record
    5. Checking the database status (ready, no maintenance mode, update timestamp set)

    Parameters
    ----------
    proteins_from_file : List[Protein]
        Proteins read from the protein file
    enzym : DigestEnzyme
        Enzyme for digesting. Should match the one which is used for the database creation.
    """
    peptides_from_file_proteins = set()
    for file_protein in proteins_from_file:
        for new_peptide in enzym.digest(file_protein):
            peptides_from_file_proteins.add(new_peptide)

    # Columns which are not part of a selected peptide. The count columns have the
    # same name as the peptide attributes, the termini are stored as ASCII codes.
    count_columns = [f"{one_letter_code}_count" for one_letter_code in "abcdefghijklmnopqrstuvwyz"]
    checked_columns = count_columns + ["n_terminus", "c_terminus"]
    checked_columns_query = (
        f"SELECT {', '.join(checked_columns)} "
        f"FROM {Peptide.TABLE_NAME} "
        "WHERE partition = %s AND mass = %s AND sequence = %s"
    )

    with self.database_connection.cursor() as database_cursor:
        # Check if the protein count in the database equals the file
        database_cursor.execute(
            f"SELECT count(*) FROM {Protein.TABLE_NAME};")
        self.assertEqual(len(proteins_from_file),
                         database_cursor.fetchone()[0])

        # Check if all proteins are correct
        for file_protein in proteins_from_file:
            db_protein = Protein.select(
                database_cursor,
                WhereCondition(["accession = %s"], [file_protein.accession]),
            )
            self.assertIsNotNone(db_protein)
            self.assertEqual(db_protein.accession, file_protein.accession)
            self.assertEqual(db_protein.secondary_accessions, file_protein.secondary_accessions)
            self.assertEqual(db_protein.entry_name, file_protein.entry_name)
            self.assertEqual(db_protein.name, file_protein.name)
            self.assertEqual(db_protein.sequence, file_protein.sequence)
            self.assertEqual(db_protein.taxonomy_id, file_protein.taxonomy_id)
            self.assertEqual(db_protein.proteome_id, file_protein.proteome_id)
            self.assertEqual(db_protein.is_reviewed, file_protein.is_reviewed)

        # Check the peptide count.
        # Because peptides are not removed from the database it is possible to have more peptides
        # in the database after protein updates than in the file.
        database_cursor.execute(
            f"SELECT count(*) FROM {Peptide.TABLE_NAME};")
        self.assertLessEqual(len(peptides_from_file_proteins),
                             database_cursor.fetchone()[0])

        for file_peptide in peptides_from_file_proteins:
            db_peptide = Peptide.select(
                database_cursor,
                WhereCondition(
                    ["partition = %s", "AND", "mass = %s", "AND", "sequence = %s"],
                    [file_peptide.partition, file_peptide.mass, file_peptide.sequence]
                )
            )
            self.assertIsNotNone(db_peptide)
            self.assertEqual(db_peptide.sequence, file_peptide.sequence)
            self.assertEqual(db_peptide.mass, file_peptide.mass)
            self.assertEqual(db_peptide.partition, file_peptide.partition)
            self.assertEqual(db_peptide.number_of_missed_cleavages,
                             file_peptide.number_of_missed_cleavages)

            # Because the amino acid counts are counted on the fly to save I/O and bandwidth,
            # check the values which are stored in the database
            database_cursor.execute(
                checked_columns_query,
                (file_peptide.partition, file_peptide.mass, file_peptide.sequence))
            db_peptide_record = database_cursor.fetchone()
            self.assertIsNotNone(db_peptide_record)
            # Expected values in the same order as the selected columns
            file_peptide_values = [
                getattr(file_peptide, count_column) for count_column in count_columns
            ] + [
                file_peptide.get_n_terminus_ascii_dec(),
                file_peptide.get_c_terminus_ascii_dec()
            ]
            for value_idx, file_peptide_value in enumerate(file_peptide_values):
                self.assertEqual(
                    file_peptide_value,
                    db_peptide_record[value_idx],
                    f"column '{checked_columns[value_idx]}' of peptide '{file_peptide.sequence}' differs")

        # Check protein/peptide-associations from both directions
        desired_number_of_associations = 0
        for file_protein in proteins_from_file:
            for file_peptide in enzym.digest(file_protein):
                # Increase association counter
                desired_number_of_associations += 1
                database_cursor.execute(
                    (
                        "SELECT true "
                        f"FROM {ProteinPeptideAssociation.TABLE_NAME} "
                        "WHERE protein_accession = %s AND partition = %s AND peptide_mass = %s AND peptide_sequence = %s;"
                    ),
                    (
                        file_protein.accession,
                        file_peptide.partition,
                        file_peptide.mass,
                        file_peptide.sequence,
                    )
                )
                # fetchone() returns None if there is no association. Check the row before
                # it is indexed, so a missing association fails the assertion.
                association_row = database_cursor.fetchone()
                self.assertIsNotNone(
                    association_row,
                    f"association of protein '{file_protein.accession}' and peptide '{file_peptide.sequence}' is missing")
                self.assertTrue(association_row[0])

        # Check association counter. Must be equal even after updates.
        database_cursor.execute(
            f"SELECT count(*) FROM {ProteinPeptideAssociation.TABLE_NAME};")
        self.assertEqual(desired_number_of_associations,
                         database_cursor.fetchone()[0])

        # Check if the number of metadata records equals the number of peptides
        database_cursor.execute(
            f"SELECT count(*) FROM {PeptideMetadata.TABLE_NAME};")
        metadata_count = database_cursor.fetchone()[0]
        database_cursor.execute(
            f"SELECT count(*) FROM {Peptide.TABLE_NAME};")
        peptide_count = database_cursor.fetchone()[0]
        self.assertEqual(metadata_count, peptide_count)

        # Check if the current peptides have updated metadata
        for file_peptide in peptides_from_file_proteins:
            file_peptide.fetch_metadata_from_proteins(database_cursor)
            db_metadata = PeptideMetadata.select(database_cursor, file_peptide)
            self.assertIsNotNone(
                db_metadata,
                f"metadata for peptide '{file_peptide.sequence}' is missing"
            )
            if db_metadata:
                self.assertEqual(db_metadata.is_swiss_prot,
                                 file_peptide.metadata.is_swiss_prot)
                self.assertEqual(db_metadata.is_trembl,
                                 file_peptide.metadata.is_trembl)
                self.assertEqual(
                    sorted(db_metadata.taxonomy_ids),
                    sorted(file_peptide.metadata.taxonomy_ids))
                self.assertEqual(
                    sorted(db_metadata.unique_taxonomy_ids),
                    sorted(file_peptide.metadata.unique_taxonomy_ids))
                self.assertEqual(
                    sorted(db_metadata.proteome_ids),
                    sorted(file_peptide.metadata.proteome_ids))

        # Check if maintenance mode is false and the update timestamp is greater than zero
        database_status = MaintenanceInformation.select(
            database_cursor, MaintenanceInformation.DATABASE_STATUS_KEY)
        self.assertIsNotNone(database_status)
        self.assertGreater(database_status.values['last_update'], 0)
        self.assertEqual(database_status.values['status'],
                         DatabaseStatus.READY.value)
        self.assertFalse(database_status.values['maintenance_mode'])
def sequence_mass(sequence):
    """
    Return the mass of the given amino acid sequence as a JSON response.

    A temporary ``Peptide`` is built from ``sequence`` (its second
    constructor argument is passed as 0), its ``mass`` is converted with
    ``mass_to_float`` and the result is wrapped as ``{"mass": <value>}``.

    Parameters
    ----------
    sequence
        Amino acid sequence, handed to ``Peptide`` unchanged.

    Returns
    -------
    The value produced by ``jsonify`` for ``{'mass': ...}``.
    """
    mass = mass_to_float(Peptide(sequence, 0).mass)
    return jsonify({'mass': mass})
def digest():
    """
    Digest a given peptide/sequence, search the resulting peptides in the
    database and return matching and not matching peptides in separate arrays.

    The JSON request body is validated with
    ``ApiDigestionController.check_digestion_parameters`` and must contain a
    ``sequence`` key. The sequence is digested with the enzyme registered
    under the name "trypsin". If ``do_database_search`` is the boolean
    ``True``, the digested peptides are looked up in the database and the
    ones found there are reported under ``database``; all remaining peptides
    are reported under ``digestion``. Both arrays are sorted by peptide mass.

    Returns
    -------
    A streamed ``application/json`` response of the form
    ``{"database": [...], "digestion": [...], "count": <int>}``, or
    ``({"errors": ...}, 422)`` if the request parameters are invalid.
    """
    data = request.get_json()
    errors = ApiDigestionController.check_digestion_parameters(data)

    if "sequence" not in data:
        errors["sequence"].append("cannot be empty")

    # Nothing below makes sense with invalid parameters, so bail out early.
    if len(errors) > 0:
        return jsonify({"errors": errors}), 422

    EnzymeClass = get_digestion_enzyme_by_name("trypsin")
    enzyme = EnzymeClass(data["maximum_number_of_missed_cleavages"],
                         data["minimum_peptide_length"],
                         data["maximum_peptide_length"])
    # Materialize once: the result is iterated several times and sorted below.
    digestion_peptides = list(
        enzyme.digest(
            Protein("TMP", [], "TMP", "TMP", data["sequence"], [], [], False,
                    0)))
    database_peptides = []

    # Only the literal boolean True enables the database search.
    do_database_search = "do_database_search" in data \
        and isinstance(data["do_database_search"], bool) \
        and data["do_database_search"]

    # An empty tuple would be rendered as `IN ()`, which is not valid SQL,
    # so the lookup is skipped when the digestion produced no peptides.
    if do_database_search and digestion_peptides:
        lookup_keys = tuple(
            (peptide.partition, peptide.mass, peptide.sequence)
            for peptide in digestion_peptides)
        database_connection = get_database_connection()
        with database_connection.cursor() as database_cursor:
            database_peptides = Peptide.select(
                database_cursor,
                WhereCondition(["(partition, mass, sequence) IN %s"],
                               (lookup_keys, )),
                fetchall=True)
        database_peptides.sort(key=lambda peptide: peptide.mass)
        # Keep only the peptides which are not already known to the database.
        digestion_peptides = [
            peptide for peptide in digestion_peptides
            if peptide not in database_peptides
        ]

    digestion_peptides.sort(key=lambda peptide: peptide.mass)

    def json_array_items(peptides) -> Iterator[bytes]:
        # Yields the comma separated JSON representations of the peptides.
        for peptide_idx, peptide in enumerate(peptides):
            if peptide_idx > 0:
                yield b","
            yield from peptide.to_json()

    def json_stream() -> Iterator[bytes]:
        yield b"{\"database\": ["
        yield from json_array_items(database_peptides)
        yield b"],\"digestion\": ["
        yield from json_array_items(digestion_peptides)
        yield f"],\"count\": {len(database_peptides) + len(digestion_peptides)}}}".encode(
            "utf-8")

    return Response(json_stream(), content_type="application/json")
def test_validation(self):
    """
    Checks PeptideMassValidator.validate() for combinations of static and
    variable modifications.

    Each scenario computes an expected mass from ``peptide.weight`` plus the
    deltas of the modifications assumed to be applied, builds a
    ``PrecursorRange`` around it with both tolerances set to 0 and asserts
    whether the validator accepts the peptide. Scenarios marked "should
    fail" lower the allowed number of variable modifications on the existing
    validator and expect a rejection.
    """
    # NOTE(review): other code in this file reads `peptide.mass`; confirm that
    # `Peptide.weight` (used throughout this test) exists.
    # Modifications used below. The fifth argument is True for the static_*
    # variants and False for the variable_* ones (presumably the is-static flag).
    static_carbamidomethylation_of_c = Modification(
        'unimod:4', 'carbamidomethylation of cysteine',
        AminoAcid.get_by_one_letter_code('C'), mass_to_int(57.021464), True,
        ModificationPosition.ANYWHERE)
    variable_oxidation_of_m = Modification(
        'unimod:35', 'oxidation of methionine',
        AminoAcid.get_by_one_letter_code('M'), mass_to_int(15.994915), False,
        ModificationPosition.ANYWHERE)
    static_custom_modification_of_n_terminal_d = Modification(
        'custom:1', 'custom of aspartic acid',
        AminoAcid.get_by_one_letter_code('D'), mass_to_int(10.01541), True,
        ModificationPosition.N_TERMINUS)
    variable_custom_modification_of_n_terminal_d = Modification(
        'custom:2', 'custom of aspartic acid',
        AminoAcid.get_by_one_letter_code('D'), mass_to_int(10.01541), False,
        ModificationPosition.N_TERMINUS)
    static_custom_modification_of_c_terminal_r = Modification(
        'custom:3', 'custom of arginine',
        AminoAcid.get_by_one_letter_code('R'), mass_to_int(6.153215), True,
        ModificationPosition.C_TERMINUS)
    variable_custom_modification_of_c_terminal_r = Modification(
        'custom:4', 'custom of arginine',
        AminoAcid.get_by_one_letter_code('R'), mass_to_int(6.153215), False,
        ModificationPosition.C_TERMINUS)

    # NOTE(review): the terminal scenarios below presumably rely on
    # LEPTIN_PEPTIDE_SEQUENCE starting with D and ending with R - confirm.
    peptide = Peptide(LEPTIN_PEPTIDE_SEQUENCE, 2)

    # Static carbamidomethylation of C
    # No variable modifications allowed (second validator argument is 0).
    expected_peptide_mass = peptide.weight + peptide.c_count * static_carbamidomethylation_of_c.delta
    modification_collection = ModificationCollection(
        [static_carbamidomethylation_of_c])
    precursor_range = PrecursorRange(expected_peptide_mass, 0, 0)
    validator = PeptideMassValidator(modification_collection, 0,
                                     precursor_range)
    self.assertTrue(validator.validate(peptide))

    # This should also match with allowed variable modifications (where actually none is applied)
    # Static carbamidomethylation of C
    # Variable oxidation of M (not considered in the expected mass)
    # The precursor range of the previous scenario is reused on purpose.
    modification_collection = ModificationCollection(
        [static_carbamidomethylation_of_c, variable_oxidation_of_m])
    validator = PeptideMassValidator(modification_collection, 3,
                                     precursor_range)
    self.assertTrue(validator.validate(peptide))

    # Static carbamidomethylation of C
    # 1 variable oxidation of M
    expected_peptide_mass = peptide.weight \
        + peptide.c_count * static_carbamidomethylation_of_c.delta \
        + 1 * variable_oxidation_of_m.delta
    modification_collection = ModificationCollection(
        [static_carbamidomethylation_of_c, variable_oxidation_of_m])
    precursor_range = PrecursorRange(expected_peptide_mass, 0, 0)
    validator = PeptideMassValidator(modification_collection, 3,
                                     precursor_range)
    self.assertTrue(validator.validate(peptide))

    # This should not match if no variable modifications are allowed
    # Static carbamidomethylation of C
    # Variable oxidation of M (considered in the expected mass but no variable modification allowed in validation)
    validator.set_maximum_number_of_variable_modifications(0)
    self.assertFalse(validator.validate(peptide))

    # Let's replace two Js with Ms and test 3 applied variable oxidations of M
    # (presumably the original sequence already contains one M - confirm)
    # Static carbamidomethylation of C
    # 3 variable oxidations of M
    peptide = Peptide(LEPTIN_PEPTIDE_SEQUENCE.replace('J', 'M', 2), 2)
    expected_peptide_mass = peptide.weight \
        + peptide.c_count * static_carbamidomethylation_of_c.delta \
        + 3 * variable_oxidation_of_m.delta
    modification_collection = ModificationCollection(
        [static_carbamidomethylation_of_c, variable_oxidation_of_m])
    precursor_range = PrecursorRange(expected_peptide_mass, 0, 0)
    validator = PeptideMassValidator(modification_collection, 3,
                                     precursor_range)
    self.assertTrue(validator.validate(peptide))

    # This should fail with only 2 allowed variable modifications
    validator.set_maximum_number_of_variable_modifications(2)
    self.assertFalse(validator.validate(peptide))

    # Test variable n-terminal modification
    # Variable n-terminal modification of D
    # Static carbamidomethylation of C
    # 2 variable oxidations of M
    expected_peptide_mass = peptide.weight \
        + variable_custom_modification_of_n_terminal_d.delta \
        + peptide.c_count * static_carbamidomethylation_of_c.delta \
        + 2 * variable_oxidation_of_m.delta
    modification_collection = ModificationCollection([
        static_carbamidomethylation_of_c, variable_oxidation_of_m,
        variable_custom_modification_of_n_terminal_d
    ])
    precursor_range = PrecursorRange(expected_peptide_mass, 0, 0)
    validator = PeptideMassValidator(modification_collection, 3,
                                     precursor_range)
    self.assertTrue(validator.validate(peptide))

    # This should fail with only 2 allowed variable modifications
    # (the terminal modification plus two oxidations add up to three)
    validator.set_maximum_number_of_variable_modifications(2)
    self.assertFalse(validator.validate(peptide))

    # Test static n-terminal modification
    # Static n-terminal modification of D
    # Static carbamidomethylation of C
    # 2 variable oxidations of M
    expected_peptide_mass = peptide.weight \
        + static_custom_modification_of_n_terminal_d.delta \
        + peptide.c_count * static_carbamidomethylation_of_c.delta \
        + 2 * variable_oxidation_of_m.delta
    modification_collection = ModificationCollection([
        static_carbamidomethylation_of_c, variable_oxidation_of_m,
        static_custom_modification_of_n_terminal_d
    ])
    precursor_range = PrecursorRange(expected_peptide_mass, 0, 0)
    validator = PeptideMassValidator(modification_collection, 3,
                                     precursor_range)
    self.assertTrue(validator.validate(peptide))

    # Test variable c-terminal modification
    # Variable c-terminal modification of R
    # Static carbamidomethylation of C
    # 2 variable oxidations of M
    expected_peptide_mass = peptide.weight \
        + variable_custom_modification_of_c_terminal_r.delta \
        + peptide.c_count * static_carbamidomethylation_of_c.delta \
        + 2 * variable_oxidation_of_m.delta
    modification_collection = ModificationCollection([
        static_carbamidomethylation_of_c, variable_oxidation_of_m,
        variable_custom_modification_of_c_terminal_r
    ])
    precursor_range = PrecursorRange(expected_peptide_mass, 0, 0)
    validator = PeptideMassValidator(modification_collection, 3,
                                     precursor_range)
    self.assertTrue(validator.validate(peptide))

    # This should fail with only 2 allowed variable modifications
    # (the terminal modification plus two oxidations add up to three)
    validator.set_maximum_number_of_variable_modifications(2)
    self.assertFalse(validator.validate(peptide))

    # Test static c-terminal modification
    # Static c-terminal modification of R
    # Static carbamidomethylation of C
    # 2 variable oxidations of M
    expected_peptide_mass = peptide.weight \
        + static_custom_modification_of_c_terminal_r.delta \
        + peptide.c_count * static_carbamidomethylation_of_c.delta \
        + 2 * variable_oxidation_of_m.delta
    modification_collection = ModificationCollection([
        static_carbamidomethylation_of_c, variable_oxidation_of_m,
        static_custom_modification_of_c_terminal_r
    ])
    precursor_range = PrecursorRange(expected_peptide_mass, 0, 0)
    validator = PeptideMassValidator(modification_collection, 3,
                                     precursor_range)
    self.assertTrue(validator.validate(peptide))