def test_get_aa_sequence_mass(self):
    """Check the mass computed for a sequence against a known total.

    The sequence contains each of the twenty residues in ``aa_masses``
    exactly once, and ``expected`` is the sum of all masses in that table.
    """
    aa_sequence = 'ARNDCEQGHILKMFPSTWYV'
    aa_masses = {
        'G': 57.02146,
        'A': 71.03711,
        'S': 87.03203,
        'P': 97.05276,
        'V': 99.06841,
        'T': 101.04768,
        'C': 103.00919,
        'L': 113.08406,
        'I': 113.08406,
        'N': 114.04293,
        'D': 115.02694,
        'Q': 128.05858,
        'K': 128.09496,
        'E': 129.04259,
        'M': 131.04049,
        'H': 137.05891,
        'F': 147.06841,
        'R': 156.10111,
        'Y': 163.06333,
        'W': 186.07931,
    }
    actual = mass.get_aa_sequence_mass(aa_sequence, aa_masses=aa_masses)
    expected = 2376.11432
    # A float sum depends on the order of the additions, so compare to the
    # five decimal places the table is quoted at rather than bit-for-bit.
    # assertAlmostEqual also replaces the deprecated assertEquals alias.
    self.assertAlmostEqual(actual, expected, places=5)
def process_protein_batch(self, batch, taxon, logger=None):
    """Process a batch of proteins with the given digest.

    Creates ``Protein`` rows for sequences that are not stored yet, digests
    every batch protein that has no ``ProteinDigest`` for ``self.digest``
    and records one taxon-protein row per batch entry.

    Args:
        batch: Sequence of ``(metadata, sequence)`` pairs. It is iterated
            more than once, so it must not be a one-shot iterator.
        taxon: Object whose ``id`` is stored on each taxon-protein row.
        logger: Logger to report progress to; ``self.logger`` when omitted.

    Returns:
        None. Results are written through ``self.session`` and counted in
        ``self.stats``.
    """
    if not batch:
        return
    if not logger:
        logger = self.logger

    # Get existing proteins by searching for sequences.
    batch_sequences = [sequence for metadata, sequence in batch]
    existing_proteins = {
        protein.sequence: protein
        for protein in self.session.query(Protein).filter(
            Protein.sequence.in_(batch_sequences))
    }

    # Of the existing proteins, find the ones already digested with this
    # digest; every other existing protein still has to be digested.
    digested_sequences = set()
    if existing_proteins:
        existing_ids = [
            protein.id for protein in existing_proteins.values()]
        digested_query = (
            self.session.query(Protein)
            .filter(Protein.id.in_(existing_ids))
            .join(ProteinDigest)
            .filter(ProteinDigest.digest == self.digest))
        digested_sequences = {
            protein.sequence for protein in digested_query}
    undigested_proteins = {
        sequence: protein
        for sequence, protein in existing_proteins.items()
        if sequence not in digested_sequences
    }

    # Create proteins which do not exist in the db and add to undigested
    # collection.
    num_new_proteins = 0
    for metadata, sequence in batch:
        if sequence in existing_proteins:
            continue
        try:
            mass = get_aa_sequence_mass(sequence)
            protein = Protein(sequence=sequence, mass=mass)
        except Exception:
            # Deliberate best effort: one bad sequence must not abort the
            # whole batch, so log it with its traceback and move on.
            logger.exception("Error processing protein, skipping")
            continue
        self.session.add(protein)
        num_new_proteins += 1
        undigested_proteins[sequence] = protein
        existing_proteins[sequence] = protein
    logger.info("creating %s new proteins...", num_new_proteins)
    self.session.commit()
    self.stats['Protein'] += num_new_proteins

    # Digest undigested proteins.
    if undigested_proteins:
        # Report the number actually digested; it also includes existing
        # proteins that had no digest yet, not only the new ones.
        logger.info("digesting %s proteins", len(undigested_proteins))
        undigested_batch = {}
        peptide_counter = 0
        for protein in undigested_proteins.values():
            protein_digest = ProteinDigest(protein=protein,
                                           digest=self.digest)
            peptide_sequences = cleave(
                protein.sequence,
                self.digest.protease.cleavage_rule,
                self.digest.max_missed_cleavages,
                min_acids=self.digest.min_acids,
                max_acids=self.digest.max_acids,
            )
            peptide_counter += len(peptide_sequences)
            undigested_batch[protein] = {
                'peptide_sequences': peptide_sequences,
                'protein_digest': protein_digest,
            }
            if peptide_counter > 1e4:
                self.process_peptide_batch(undigested_batch, logger)
                # Start a fresh batch; otherwise the proteins flushed here
                # would be handed to process_peptide_batch again later.
                undigested_batch = {}
                peptide_counter = 0
        if undigested_batch:
            self.process_peptide_batch(undigested_batch, logger)

    # Create taxon protein instances in bulk.
    taxon_protein_dicts = []
    for metadata, sequence in batch:
        try:
            protein = existing_proteins[sequence]
        except KeyError:
            # The protein could not be created above, so there is nothing
            # to link to the taxon.
            logger.exception("Error processing protein, sequence does not"
                             " exist in db, skipping")
            continue
        taxon_protein_dicts.append({
            'protein_id': protein.id,
            'taxon_id': taxon.id,
            'metadata': metadata,
        })
    logger.info("Creating %s new taxon proteins...",
                len(taxon_protein_dicts))
    # Skip the statement entirely when there is nothing to insert.
    if taxon_protein_dicts:
        self.session.execute(db.tables['TaxonProtein'].insert(),
                             taxon_protein_dicts)
        self.session.commit()
    self.stats['TaxonProtein'] += len(taxon_protein_dicts)
def process_peptide_batch(self, batch, logger=None):
    """Store the digests, peptides and peptide counts for digested proteins.

    Args:
        batch: Mapping whose values are dicts holding
            ``'peptide_sequences'`` (the peptides cleaved from one protein)
            and ``'protein_digest'``. A ``'peptide_histogram'`` entry is
            added to each of those dicts as a side effect.
        logger: Logger to report progress to; ``self.logger`` when omitted.

    Raises:
        KeyError: If a peptide sequence is still missing from the lookup
            after the creation step.
    """
    if not logger:
        logger = self.logger
    chunk_size = 500      # sequences per existing-peptide lookup
    flush_size = 10000    # association rows per bulk insert

    # Assemble combined peptide sequences and protein digests.
    combined_peptide_sequences = set()
    combined_protein_digests = []
    for data in batch.values():
        combined_peptide_sequences.update(data['peptide_sequences'])
        combined_protein_digests.append(data['protein_digest'])

    # Add protein digests to db.
    logger.info("Creating %s new protein digests...",
                len(combined_protein_digests))
    self.session.add_all(combined_protein_digests)
    self.session.commit()
    self.stats['ProteinDigest'] += len(combined_protein_digests)

    # Get existing peptides, one chunk of sequences at a time.
    # update_existing_peptides_ is expected to fill the dict passed as its
    # second argument in place (its return value is not used here).
    existing_peptides = {}
    wanted_sequences = list(combined_peptide_sequences)
    for start in range(0, len(wanted_sequences), chunk_size):
        self.update_existing_peptides_(
            wanted_sequences[start:start + chunk_size], existing_peptides)

    # Create non-existent peptides in bulk.
    peptide_dicts = [
        {'sequence': sequence, 'mass': get_aa_sequence_mass(sequence)}
        for sequence in combined_peptide_sequences
        if sequence not in existing_peptides
    ]
    num_new_peptides = len(peptide_dicts)
    logger.info("Creating %s new peptides...", num_new_peptides)
    # Skip the statement entirely when there is nothing to insert.
    if peptide_dicts:
        self.session.execute(db.tables['Peptide'].insert(), peptide_dicts)
        self.session.commit()
    self.stats['Peptide'] += num_new_peptides

    # Get newly created peptide objects and add to existing peptides.
    created_sequences = [
        peptide_dict['sequence'] for peptide_dict in peptide_dicts]
    for start in range(0, len(created_sequences), chunk_size):
        self.update_existing_peptides_(
            created_sequences[start:start + chunk_size], existing_peptides)

    # Create histogram of peptide sequence occurrences for each protein.
    num_peptide_instances = 0
    for data in batch.values():
        peptides_histogram = defaultdict(int)
        for sequence in data['peptide_sequences']:
            peptides_histogram[sequence] += 1
        data['peptide_histogram'] = peptides_histogram
        # Update number of peptide instances.
        num_peptide_instances += len(peptides_histogram)

    # Create protein digest peptide instances in bulk.
    logger.info("Creating %s new protein digest peptides...",
                num_peptide_instances)
    pdp_table = db.tables['ProteinDigestPeptide']
    pdp_batch = []
    for data in batch.values():
        for sequence, count in data['peptide_histogram'].items():
            peptide = existing_peptides[sequence]
            pdp_batch.append({
                'peptide_id': peptide.id,
                'protein_digest_id': data['protein_digest'].id,
                'count': count,
            })
            if len(pdp_batch) >= flush_size:
                self.session.execute(pdp_table.insert(), pdp_batch)
                self.session.commit()
                # Drop the rows just written; keeping them would insert
                # the same rows again on the next flush.
                pdp_batch = []
    if pdp_batch:
        self.session.execute(pdp_table.insert(), pdp_batch)
        self.session.commit()
    self.stats['ProteinDigestPeptide'] += num_peptide_instances
def process_peptide_batch(self, metagenome_sequence_digests_dict,
                          logger=None):
    """Store the peptides and peptide counts for digested metagenome
    sequences.

    Args:
        metagenome_sequence_digests_dict: Mapping whose values are dicts
            holding ``'peptide_sequences'``, ``'metagenome_sequence'`` and
            ``'digest'``. A ``'peptide_histogram'`` entry is added to each
            of those dicts as a side effect.
        logger: Logger to report progress to; ``self.logger`` when omitted.

    Raises:
        KeyError: If a peptide sequence has no record in the result of
            ``peptide_insert`` (for example because building its
            ``Peptide`` failed and it was skipped).
    """
    if not logger:
        logger = self.logger

    # Assemble combined peptide sequences. Each metagenome sequence can
    # have many peptides.
    combined_peptide_sequences = set()
    for data in metagenome_sequence_digests_dict.values():
        combined_peptide_sequences.update(data['peptide_sequences'])

    # Send every distinct peptide to the peptide_insert stored procedure
    # as two parallel lists.
    # NOTE(review): no lookup of existing peptides happens here, so
    # peptide_insert presumably copes with sequences that are already
    # stored - confirm against its definition.
    start_time = time.time()
    peptide_sequences = list(combined_peptide_sequences)
    peptide_masses = [
        get_aa_sequence_mass(sequence) for sequence in peptide_sequences]
    num_new_peptides = len(peptide_sequences)
    logger.info("Creating %s new peptides...", num_new_peptides)
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from peptide_insert(%s, %s);",
                (peptide_sequences, peptide_masses))

    # Index the returned records (id, sequence) by sequence.
    existing_peptides = {}
    for record in cur:
        try:
            peptide = Peptide(
                id=record[0],
                sequence=record[1],
            )
            existing_peptides[peptide.sequence] = peptide
        except Exception:
            # Deliberate best effort: log the failure and keep going.
            logger.exception("Error processing peptide, skipping")
            continue
    total_time = time.time() - start_time
    self.total_peptide_time = self.total_peptide_time + total_time
    logger.info("peptide time elapsed: %s", total_time)
    self.stats['Peptide'] += num_new_peptides

    # Create histogram of peptide sequence occurrences for each
    # metagenome sequence.
    num_peptide_instances = 0
    for data in metagenome_sequence_digests_dict.values():
        peptides_histogram = defaultdict(int)
        for sequence in data['peptide_sequences']:
            peptides_histogram[sequence] += 1
        data['peptide_histogram'] = peptides_histogram
        # Update number of peptide instances.
        num_peptide_instances += len(peptides_histogram)

    # Create the digest peptide instances in bulk, again as parallel lists.
    pdp_peptide_ids = []
    pdp_metagenome_sequence_ids = []
    pdp_digest_ids = []
    pdp_peptide_count = []
    for data in metagenome_sequence_digests_dict.values():
        for sequence, count in data['peptide_histogram'].items():
            peptide = existing_peptides[sequence]
            pdp_peptide_ids.append(peptide.id)
            pdp_metagenome_sequence_ids.append(
                data['metagenome_sequence'].id)
            pdp_digest_ids.append(data['digest'].id)
            pdp_peptide_count.append(count)
    cur.execute(
        "select metagenome_sequence_digest_peptide_insert(%s, %s, %s, %s);",
        (pdp_peptide_ids, pdp_metagenome_sequence_ids, pdp_digest_ids,
         pdp_peptide_count))
    db.psycopg2_connection.commit()
    self.stats['ProteinDigestPeptide'] += num_peptide_instances
def process_peptide_batch(self, batch, logger=None):
    """Store the digests, peptides and peptide counts for digested proteins.

    All writes go through stored procedures called on a psycopg2 cursor.

    Args:
        batch: Mapping whose values are dicts holding
            ``'peptide_sequences'`` and ``'protein_digest'``. Its keys are
            matched against the second column returned by
            ``protein_digest_insert``.
        logger: Logger to report progress to; ``self.logger`` when omitted.

    Raises:
        KeyError: If a peptide sequence has no record in the result of
            ``peptide_insert``.
    """
    if not logger:
        logger = self.logger

    # Assemble combined peptide sequences and protein digests.
    combined_peptide_sequences = set()
    protein_ids = []
    digest_ids = []
    for data in batch.values():
        combined_peptide_sequences.update(data['peptide_sequences'])
        pending_digest = data['protein_digest']
        protein_ids.append(pending_digest.protein.id)
        digest_ids.append(pending_digest.digest.id)
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from protein_digest_insert(%s, %s);",
                (protein_ids, digest_ids))

    # Iterate through the protein_digest records returned from the insert
    # and build a protein_digest object for each.
    num_protein_digests = 0
    protein_digests_dict = {}
    for record in cur:
        try:
            # NOTE(review): record[1] is also the batch key below, so the
            # raw column values are passed as ``protein``/``digest`` here -
            # confirm that ProteinDigest accepts them.
            protein_digest = ProteinDigest(id=record[0],
                                           protein=record[1],
                                           digest=record[2])
            num_protein_digests += 1
            # Index directly so a record with no batch entry is reported
            # as a KeyError naming the key, not a TypeError on None.
            batch_record = batch[record[1]]
            protein_digests_dict[record[1]] = {
                'peptide_sequences': batch_record['peptide_sequences'],
                'protein_digest': protein_digest,
            }
        except Exception:
            # Deliberate best effort: log the failure and keep going.
            logger.exception("Error processing protein digest, skipping")
            continue
    db.psycopg2_connection.commit()
    self.stats['ProteinDigest'] += num_protein_digests

    # Send every distinct peptide to the peptide_insert stored procedure
    # as two parallel lists.
    # NOTE(review): no lookup of existing peptides happens here, so
    # peptide_insert presumably copes with sequences that are already
    # stored - confirm against its definition.
    start_time = time.time()
    peptide_sequences = list(combined_peptide_sequences)
    peptide_masses = [
        get_aa_sequence_mass(sequence) for sequence in peptide_sequences]
    num_new_peptides = len(peptide_sequences)
    logger.info("Creating %s new peptides...", num_new_peptides)
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from peptide_insert(%s, %s);",
                (peptide_sequences, peptide_masses))
    existing_peptides = {}
    for record in cur:
        try:
            peptide = Peptide(
                id=record[0],
                sequence=record[1],
            )
            existing_peptides[peptide.sequence] = peptide
        except Exception:
            logger.exception("Error processing peptide, skipping")
            continue
    self.stats['Peptide'] += num_new_peptides

    # Create histogram of peptide sequence occurrences for each protein.
    num_peptide_instances = 0
    for data in protein_digests_dict.values():
        peptides_histogram = defaultdict(int)
        for sequence in data['peptide_sequences']:
            peptides_histogram[sequence] += 1
        data['peptide_histogram'] = peptides_histogram
        # Update number of peptide instances.
        num_peptide_instances += len(peptides_histogram)
    total_time = time.time() - start_time
    logger.info("peptide time elapsed: %s", total_time)

    # Create protein digest peptide instances in bulk.
    logger.info("Creating %s new protein digest peptides...",
                num_peptide_instances)
    start_time = time.time()
    pdp_peptide_ids = []
    pdp_protein_digest_ids = []
    pdp_peptide_count = []
    for data in protein_digests_dict.values():
        for sequence, count in data['peptide_histogram'].items():
            peptide = existing_peptides[sequence]
            pdp_peptide_ids.append(peptide.id)
            pdp_protein_digest_ids.append(data['protein_digest'].id)
            pdp_peptide_count.append(count)
    total_time = time.time() - start_time
    logger.info("protein digest loop time elapsed: %s", total_time)
    cur = db.get_psycopg2_cursor()
    cur.execute(
        "select protein_digest_peptide_insert(%s, %s, %s);",
        (pdp_peptide_ids, pdp_protein_digest_ids, pdp_peptide_count))
    db.psycopg2_connection.commit()
    # Measured from the same start, so this includes the loop time above.
    total_time = time.time() - start_time
    logger.info("protein digest time elapsed: %s", total_time)
    self.stats['ProteinDigestPeptide'] += num_peptide_instances
def process_protein_batch(self, batch, taxon, logger=None):
    """Process a batch of proteins with the given digest.

    Looks up the stored proteins for the batch, passes the batch sequences
    to the ``protein_insert`` stored procedure, digests the proteins that
    still need it and records a taxon-protein row per batch entry. All
    database access goes through a psycopg2 cursor.

    Args:
        batch: Sequence of ``(metadata, sequence)`` pairs. It is iterated
            more than once, so it must not be a one-shot iterator.
        taxon: Object whose ``id`` is stored on each taxon-protein row.
        logger: Logger to report progress to; ``self.logger`` when omitted.

    Returns:
        None. Results are written to the database and counted in
        ``self.stats``.
    """
    if not batch:
        return
    if not logger:
        logger = self.logger

    # Get existing proteins by searching for sequences. The tuple is
    # expanded by psycopg2 into the value list of the IN clause.
    existing_proteins = {}
    existing_protein_ids = []
    sequences = [sequence for metadata, sequence in batch]
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from protein where protein.sequence in %s",
                (tuple(sequences), ))
    for record in cur.fetchall():
        protein = Protein(id=record[0], sequence=record[1], mass=record[2])
        existing_proteins[protein.sequence] = protein
        existing_protein_ids.append(record[0])
    db.psycopg2_connection.commit()

    # Of the existing proteins, find the ones already digested with this
    # digest; every other existing protein still has to be digested.
    digested_sequences = set()
    if existing_proteins:
        cur = db.get_psycopg2_cursor()
        cur.execute(
            "select * from protein join protein_digest on protein.id = protein_digest.protein_id where protein.id in %s and protein_digest.digest_id = %s",
            (
                tuple(existing_protein_ids),
                self.digest.id,
            ))
        # Only the sequence column is needed to tell the rows apart.
        digested_sequences = {record[1] for record in cur.fetchall()}
        db.psycopg2_connection.commit()
    undigested_proteins = {
        sequence: protein
        for sequence, protein in existing_proteins.items()
        if sequence not in digested_sequences
    }

    # Collect the distinct batch sequences and their masses as parallel
    # lists for the protein_insert stored procedure.
    # NOTE(review): sequences found above are not filtered out here and
    # every returned record is queued for digestion, so protein_insert
    # presumably returns only rows it created - confirm against its
    # definition. num_new_proteins likewise counts batch entries, not
    # inserted rows.
    start_time = time.time()
    num_new_proteins = 0
    protein_sequences = []
    protein_masses = []
    seen_sequences = set()  # O(1) duplicate check instead of a list scan
    for metadata, sequence in batch:
        try:
            mass = get_aa_sequence_mass(sequence)
        except Exception:
            # Deliberate best effort: one bad sequence must not abort the
            # whole batch, so log it with its traceback and move on.
            logger.exception("Error processing protein, skipping")
            continue
        num_new_proteins += 1
        if sequence not in seen_sequences:
            seen_sequences.add(sequence)
            protein_sequences.append(sequence)
            protein_masses.append(mass)
    logger.info("creating %s new proteins...", num_new_proteins)
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from protein_insert(%s, %s);",
                (protein_sequences, protein_masses))

    # Iterate through the protein records returned from the insert and
    # build a protein object for each.
    for record in cur:
        try:
            protein = Protein(id=record[0],
                              sequence=record[1],
                              mass=record[2])
        except Exception:
            logger.exception("Error processing protein, skipping")
            continue
        undigested_proteins[record[1]] = protein
        existing_proteins[record[1]] = protein
    db.psycopg2_connection.commit()
    total_time = time.time() - start_time
    logger.info("time elapsed: %s", total_time)
    self.stats['Protein'] += num_new_proteins

    # Digest undigested proteins.
    if undigested_proteins:
        # Report the number actually digested rather than the number of
        # batch entries counted above.
        logger.info("digesting %s proteins", len(undigested_proteins))
        undigested_batch = {}
        for protein in undigested_proteins.values():
            protein_digest = ProteinDigest(protein=protein,
                                           digest=self.digest)
            # Do the digestion of a single protein sequence.
            peptide_sequences = cleave(
                protein.sequence,
                self.digest.protease.cleavage_rule,
                self.logger,
                self.digest.max_missed_cleavages,
                min_acids=self.digest.min_acids,
                max_acids=self.digest.max_acids,
            )
            undigested_batch[protein.id] = {
                'peptide_sequences': peptide_sequences,
                'protein_digest': protein_digest,
            }
        self.process_peptide_batch(undigested_batch, logger)

    # Create taxon protein instances in bulk, as parallel lists for the
    # taxon_protein_insert stored procedure.
    taxon_protein_ids = []
    taxon_ids = []
    metadatas = []
    for metadata, sequence in batch:
        # Entries carrying this placeholder text have no sequence to link.
        if sequence == "No sequence found":
            continue
        try:
            protein = existing_proteins[sequence]
        except KeyError:
            logger.exception(
                "Error processing protein, sequence does not"
                " exist in db, skipping")
            continue
        taxon_protein_ids.append(protein.id)
        taxon_ids.append(taxon.id)
        metadatas.append(metadata)
    num_taxon_proteins = len(taxon_protein_ids)
    logger.info("Creating %s new taxon proteins...", num_taxon_proteins)
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from taxon_protein_insert(%s, %s, %s);",
                (taxon_protein_ids, taxon_ids, metadatas))
    db.psycopg2_connection.commit()
    self.stats['TaxonProtein'] += num_taxon_proteins
def process_peptide_batch(self, batch, logger=None):
    """Store the digests, peptides and peptide counts for digested proteins.

    Args:
        batch: Mapping whose values are dicts holding
            ``'peptide_sequences'`` (the peptides cleaved from one protein)
            and ``'protein_digest'``. A ``'peptide_histogram'`` entry is
            added to each of those dicts as a side effect.
        logger: Logger to report progress to; ``self.logger`` when omitted.

    Raises:
        KeyError: If a peptide sequence is still missing from the lookup
            after the creation step.
    """
    if not logger:
        logger = self.logger
    lookup_chunk = 500     # sequences per existing-peptide lookup
    insert_chunk = 10000   # association rows per bulk insert

    # Assemble combined peptide sequences and protein digests.
    combined_peptide_sequences = set()
    combined_protein_digests = []
    for data in batch.values():
        combined_peptide_sequences.update(data['peptide_sequences'])
        combined_protein_digests.append(data['protein_digest'])

    # Add protein digests to db.
    logger.info("Creating %s new protein digests...",
                len(combined_protein_digests))
    self.session.add_all(combined_protein_digests)
    self.session.commit()
    self.stats['ProteinDigest'] += len(combined_protein_digests)

    # Get existing peptides, one chunk of sequences at a time.
    # update_existing_peptides_ is expected to fill the dict passed as its
    # second argument in place (its return value is not used here).
    existing_peptides = {}
    all_sequences = list(combined_peptide_sequences)
    for offset in range(0, len(all_sequences), lookup_chunk):
        self.update_existing_peptides_(
            all_sequences[offset:offset + lookup_chunk], existing_peptides)

    # Create non-existent peptides in bulk.
    peptide_dicts = [
        {'sequence': sequence, 'mass': get_aa_sequence_mass(sequence)}
        for sequence in combined_peptide_sequences
        if sequence not in existing_peptides
    ]
    num_new_peptides = len(peptide_dicts)
    logger.info("Creating %s new peptides...", num_new_peptides)
    # Skip the statement entirely when there is nothing to insert.
    if peptide_dicts:
        self.session.execute(db.tables['Peptide'].insert(), peptide_dicts)
        self.session.commit()
    self.stats['Peptide'] += num_new_peptides

    # Get newly created peptide objects and add to existing peptides.
    new_sequences = [
        peptide_dict['sequence'] for peptide_dict in peptide_dicts]
    for offset in range(0, len(new_sequences), lookup_chunk):
        self.update_existing_peptides_(
            new_sequences[offset:offset + lookup_chunk], existing_peptides)

    # Create histogram of peptide sequence occurrences for each protein.
    num_peptide_instances = 0
    for data in batch.values():
        peptides_histogram = defaultdict(int)
        for sequence in data['peptide_sequences']:
            peptides_histogram[sequence] += 1
        data['peptide_histogram'] = peptides_histogram
        # Update number of peptide instances.
        num_peptide_instances += len(peptides_histogram)

    # Create protein digest peptide instances in bulk.
    logger.info("Creating %s new protein digest peptides...",
                num_peptide_instances)
    pdp_table = db.tables['ProteinDigestPeptide']
    pdp_batch = []
    for data in batch.values():
        for sequence, count in data['peptide_histogram'].items():
            peptide = existing_peptides[sequence]
            pdp_batch.append({
                'peptide_id': peptide.id,
                'protein_digest_id': data['protein_digest'].id,
                'count': count,
            })
            if len(pdp_batch) >= insert_chunk:
                self.session.execute(pdp_table.insert(), pdp_batch)
                self.session.commit()
                # Drop the rows just written; keeping them would insert
                # the same rows again on the next flush.
                pdp_batch = []
    if pdp_batch:
        self.session.execute(pdp_table.insert(), pdp_batch)
        self.session.commit()
    self.stats['ProteinDigestPeptide'] += num_peptide_instances
def process_protein_batch(self, batch, taxon, logger=None):
    """Process a batch of proteins with the given digest.

    Creates ``Protein`` rows for sequences that are not stored yet, digests
    every batch protein that has no ``ProteinDigest`` for ``self.digest``
    and records one taxon-protein row per batch entry.

    Args:
        batch: Sequence of ``(metadata, sequence)`` pairs. It is iterated
            more than once, so it must not be a one-shot iterator.
        taxon: Object whose ``id`` is stored on each taxon-protein row.
        logger: Logger to report progress to; ``self.logger`` when omitted.

    Returns:
        None. Results are written through ``self.session`` and counted in
        ``self.stats``.
    """
    if not batch:
        return
    if not logger:
        logger = self.logger

    # Get existing proteins by searching for sequences.
    wanted_sequences = [sequence for metadata, sequence in batch]
    existing_proteins = {}
    for protein in self.session.query(Protein).filter(
            Protein.sequence.in_(wanted_sequences)):
        existing_proteins[protein.sequence] = protein

    # Of the existing proteins, find the ones already digested with this
    # digest; every other existing protein still has to be digested.
    already_digested = set()
    if existing_proteins:
        known_ids = [protein.id for protein in existing_proteins.values()]
        digested_query = (
            self.session.query(Protein)
            .filter(Protein.id.in_(known_ids))
            .join(ProteinDigest)
            .filter(ProteinDigest.digest == self.digest))
        already_digested = {protein.sequence for protein in digested_query}
    undigested_proteins = {
        sequence: protein
        for sequence, protein in existing_proteins.items()
        if sequence not in already_digested
    }

    # Create proteins which do not exist in the db and add to undigested
    # collection.
    num_new_proteins = 0
    for metadata, sequence in batch:
        if sequence in existing_proteins:
            continue
        try:
            mass = get_aa_sequence_mass(sequence)
            protein = Protein(sequence=sequence, mass=mass)
        except Exception:
            # Deliberate best effort: one bad sequence must not abort the
            # whole batch, so log it with its traceback and move on.
            logger.exception("Error processing protein, skipping")
            continue
        self.session.add(protein)
        num_new_proteins += 1
        undigested_proteins[sequence] = protein
        existing_proteins[sequence] = protein
    logger.info("creating %s new proteins...", num_new_proteins)
    self.session.commit()
    self.stats['Protein'] += num_new_proteins

    # Digest undigested proteins.
    if undigested_proteins:
        # Report the number actually digested; it also includes existing
        # proteins that had no digest yet, not only the new ones.
        logger.info("digesting %s proteins", len(undigested_proteins))
        undigested_batch = {}
        peptide_counter = 0
        for protein in undigested_proteins.values():
            protein_digest = ProteinDigest(protein=protein,
                                           digest=self.digest)
            peptide_sequences = cleave(
                protein.sequence,
                self.digest.protease.cleavage_rule,
                self.digest.max_missed_cleavages,
                min_acids=self.digest.min_acids,
                max_acids=self.digest.max_acids,
            )
            peptide_counter += len(peptide_sequences)
            undigested_batch[protein] = {
                'peptide_sequences': peptide_sequences,
                'protein_digest': protein_digest,
            }
            if peptide_counter > 1e4:
                self.process_peptide_batch(undigested_batch, logger)
                # Start a fresh batch; otherwise the proteins flushed here
                # would be handed to process_peptide_batch again later.
                undigested_batch = {}
                peptide_counter = 0
        if undigested_batch:
            self.process_peptide_batch(undigested_batch, logger)

    # Create taxon protein instances in bulk.
    taxon_protein_dicts = []
    for metadata, sequence in batch:
        try:
            protein = existing_proteins[sequence]
        except KeyError:
            # The protein could not be created above, so there is nothing
            # to link to the taxon.
            logger.exception("Error processing protein, sequence does not"
                             " exist in db, skipping")
            continue
        taxon_protein_dicts.append({
            'protein_id': protein.id,
            'taxon_id': taxon.id,
            'metadata': metadata,
        })
    logger.info("Creating %s new taxon proteins...",
                len(taxon_protein_dicts))
    # Skip the statement entirely when there is nothing to insert.
    if taxon_protein_dicts:
        self.session.execute(
            db.tables['TaxonProtein'].insert(), taxon_protein_dicts)
        self.session.commit()
    self.stats['TaxonProtein'] += len(taxon_protein_dicts)