def serialize_substituent_rule(substituent):
    """Convert a substituent rule into a plain dictionary.

    Both composition attributes are rendered through ``formula``; the
    remaining fields are copied from *substituent* unchanged.
    """
    payload = {}
    payload["name"] = substituent.name
    payload["composition"] = formula(substituent.composition)
    payload["can_nh_derivatize"] = substituent.can_nh_derivatize
    payload["is_nh_derivatizable"] = substituent.is_nh_derivatizable
    payload["attachment_composition"] = formula(
        substituent.attachment_composition)
    return payload
def serialize_substituent_rule(substituent):
    """Convert a substituent rule into a plain dictionary.

    Both composition attributes are rendered through ``formula``; the
    remaining fields are copied from *substituent* unchanged.
    """
    payload = {}
    payload["name"] = substituent.name
    payload["composition"] = formula(substituent.composition)
    payload["can_nh_derivatize"] = substituent.can_nh_derivatize
    payload["is_nh_derivatizable"] = substituent.is_nh_derivatizable
    payload["attachment_composition"] = formula(
        substituent.attachment_composition)
    return payload
def _serialize_compound_mass_shift(self, mass_shift):
    """Describe a compound mass shift as a plain dictionary.

    ``counts`` maps the name of each key of ``mass_shift.counts`` to its
    value, and ``definitions`` maps the same names to the formula string of
    that key's composition.
    """
    count_by_name = {}
    formula_by_name = {}
    for component, amount in mass_shift.counts.items():
        count_by_name[component.name] = amount
        formula_by_name[component.name] = formula(component.composition)
    return {
        "name": mass_shift.name,
        "composition": formula(mass_shift.composition),
        "counts": count_by_name,
        "definitions": formula_by_name,
    }
def _serialize_compound_mass_shift(self, mass_shift):
    """Describe a compound mass shift as a plain dictionary.

    ``counts`` maps the name of each key of ``mass_shift.counts`` to its
    value, and ``definitions`` maps the same names to the formula string of
    that key's composition.
    """
    count_by_name = {}
    formula_by_name = {}
    for component, amount in mass_shift.counts.items():
        count_by_name[component.name] = amount
        formula_by_name[component.name] = formula(component.composition)
    return {
        "name": mass_shift.name,
        "composition": formula(mass_shift.composition),
        "counts": count_by_name,
        "definitions": formula_by_name,
    }
def display_peptide_modification(name):
    """Echo the name, mass, formula and targets of the modification *name*.

    One line is written per attribute, followed by one line per entry of
    the modification rule's ``targets``.
    """
    modification = Modification(name)
    click.echo("name: %s" % modification.name)
    click.echo("mass: %f" % modification.mass)
    composition_text = formula(modification.composition)
    click.echo("formula: %s" % composition_text)
    for site_rule in modification.rule.targets:
        serialized = site_rule.serialize()
        click.echo("target: %s" % serialized)
def pack_peptide(self, peptide_ident, start, end, score, score_type, parent_protein):
    """Build a ``Peptide`` record from a peptide identification.

    The record's sequence length is ``end - start``. After construction the
    N-, O- and GAG-sequon sites are located against *parent_protein* and
    stored sorted on the record; ``count_glycosylation_sites`` is the number
    of N sites plus the number of O sites (GAG sites are not counted).
    """
    sequence = peptide_ident.peptide_sequence
    fields = dict(
        calculated_mass=sequence.mass,
        base_peptide_sequence=peptide_ident.base_sequence,
        modified_peptide_sequence=str(sequence),
        formula=formula(sequence.total_composition()),
        # Filled in below, once the sites have been located.
        count_glycosylation_sites=None,
        count_missed_cleavages=peptide_ident.missed_cleavages,
        count_variable_modifications=peptide_ident.modification_counter,
        start_position=start,
        end_position=end,
        peptide_score=score,
        peptide_score_type=score_type,
        sequence_length=end - start,
        protein_id=parent_protein.id,
        hypothesis_id=self.hypothesis_id,
    )
    match = Peptide(**fields)

    n_sites = n_glycan_sequon_sites(match, parent_protein)
    o_sites = o_glycan_sequon_sites(match, parent_protein)
    gag_sites = gag_sequon_sites(match, parent_protein)

    match.count_glycosylation_sites = len(n_sites) + len(o_sites)
    match.n_glycosylation_sites = sorted(n_sites)
    match.o_glycosylation_sites = sorted(o_sites)
    match.gagylation_sites = sorted(gag_sites)
    return match
def add_modifications(self, constant_modifications=None, variable_modifications=None,
                      max_variable_modifications=4):
    """Return a new collection holding the modified forms of ``self.peptides``.

    Each peptide is expanded with ``peptide_permutations``. A form is kept
    only when its added variable modifications plus the peptide's existing
    ``count_variable_modifications`` do not exceed
    *max_variable_modifications*.

    Parameters
    ----------
    constant_modifications, variable_modifications:
        Passed to ``peptide_permutations``; ``None`` is treated as an empty
        list.
    max_variable_modifications:
        Upper bound (inclusive) on the total variable modification count.
    """
    constant_modifications = (
        [] if constant_modifications is None else constant_modifications)
    variable_modifications = (
        [] if variable_modifications is None else variable_modifications)

    collection = MemoryPeptideCollection()
    for peptide in self.peptides:
        permutations = peptide_permutations(
            str(peptide), constant_modifications, variable_modifications)
        for modified_peptide, added_count in permutations:
            n_variable = added_count + peptide.count_variable_modifications
            if n_variable <= max_variable_modifications:
                collection.add(Peptide(
                    base_peptide_sequence=peptide.base_peptide_sequence,
                    modified_peptide_sequence=str(modified_peptide),
                    count_missed_cleavages=peptide.count_missed_cleavages,
                    count_variable_modifications=n_variable,
                    sequence_length=peptide.sequence_length,
                    start_position=peptide.start_position,
                    end_position=peptide.end_position,
                    calculated_mass=modified_peptide.mass,
                    formula=formula(modified_peptide.total_composition())))
    return collection
def add_modifications(self, constant_modifications=None, variable_modifications=None,
                      max_variable_modifications=4):
    """Return a new collection holding the modified forms of ``self.peptides``.

    Each peptide is expanded with ``peptide_permutations``. A form is kept
    only when its added variable modifications plus the peptide's existing
    ``count_variable_modifications`` do not exceed
    *max_variable_modifications*.

    Parameters
    ----------
    constant_modifications, variable_modifications:
        Passed to ``peptide_permutations``; ``None`` is treated as an empty
        list.
    max_variable_modifications:
        Upper bound (inclusive) on the total variable modification count.
    """
    constant_modifications = (
        [] if constant_modifications is None else constant_modifications)
    variable_modifications = (
        [] if variable_modifications is None else variable_modifications)

    collection = MemoryPeptideCollection()
    for peptide in self.peptides:
        permutations = peptide_permutations(
            str(peptide), constant_modifications, variable_modifications)
        for modified_peptide, added_count in permutations:
            n_variable = added_count + peptide.count_variable_modifications
            if n_variable <= max_variable_modifications:
                collection.add(Peptide(
                    base_peptide_sequence=peptide.base_peptide_sequence,
                    modified_peptide_sequence=str(modified_peptide),
                    count_missed_cleavages=peptide.count_missed_cleavages,
                    count_variable_modifications=n_variable,
                    sequence_length=peptide.sequence_length,
                    start_position=peptide.start_position,
                    end_position=peptide.end_position,
                    calculated_mass=modified_peptide.mass,
                    formula=formula(modified_peptide.total_composition())))
    return collection
def run(self):
    """Store every glycan composition produced by ``self.transformer``.

    One ``DBGlycanComposition`` row is added and flushed per composition.
    Composition-to-class link rows are accumulated and inserted through
    ``GlycanCompositionToClass`` whenever the pending list reaches a
    multiple of 100 entries, with any remainder inserted after the loop.
    The session is committed at the end and the total is logged.
    """
    self.make_pipeline()
    class_lookup = self.structure_class_loader
    self.log("Loading Glycan Compositions from Stream for %r" % self.hypothesis)

    pending_links = []
    n_created = 0
    for composition, structure_classes in self.transformer:
        mass = composition.mass()
        serialized = composition.serialize()
        formula_string = formula(composition.total_composition())
        record = DBGlycanComposition(
            calculated_mass=mass,
            formula=formula_string,
            composition=serialized,
            hypothesis_id=self.hypothesis_id)
        self.session.add(record)
        # Flushed before record.id is read below (presumably so that the
        # database has assigned the id -- confirm against the model).
        self.session.flush()
        n_created += 1
        for class_key in structure_classes:
            class_record = class_lookup[class_key]
            pending_links.append(
                {"glycan_id": record.id, "class_id": class_record.id})
            if not len(pending_links) % 100:
                self.session.execute(
                    GlycanCompositionToClass.insert(), pending_links)
                pending_links = []

    if pending_links:
        self.session.execute(GlycanCompositionToClass.insert(), pending_links)
    self.session.commit()
    self.log("Generated %d glycan compositions" % n_created)
def pack_peptide(self, peptide_ident, start, end, score, score_type, parent_protein):
    """Build a ``Peptide`` record from a peptide identification.

    The record's sequence length is ``end - start``. After construction the
    N-, O- and GAG-sequon sites are located against *parent_protein* and
    stored sorted on the record; ``count_glycosylation_sites`` is the number
    of N sites plus the number of O sites (GAG sites are not counted).
    """
    sequence = peptide_ident.peptide_sequence
    fields = dict(
        calculated_mass=sequence.mass,
        base_peptide_sequence=peptide_ident.base_sequence,
        modified_peptide_sequence=str(sequence),
        formula=formula(sequence.total_composition()),
        # Filled in below, once the sites have been located.
        count_glycosylation_sites=None,
        count_missed_cleavages=peptide_ident.missed_cleavages,
        count_variable_modifications=peptide_ident.modification_counter,
        start_position=start,
        end_position=end,
        peptide_score=score,
        peptide_score_type=score_type,
        sequence_length=end - start,
        protein_id=parent_protein.id,
        hypothesis_id=self.hypothesis_id,
    )
    match = Peptide(**fields)

    n_sites = n_glycan_sequon_sites(match, parent_protein)
    o_sites = o_glycan_sequon_sites(match, parent_protein)
    gag_sites = gag_sequon_sites(match, parent_protein)

    match.count_glycosylation_sites = len(n_sites) + len(o_sites)
    match.n_glycosylation_sites = sorted(n_sites)
    match.o_glycosylation_sites = sorted(o_sites)
    match.gagylation_sites = sorted(gag_sites)
    return match
def run(self):
    """Store every glycan composition produced by ``self.transformer``.

    One ``DBGlycanComposition`` row is added and flushed per composition.
    Composition-to-class link rows are accumulated and inserted through
    ``GlycanCompositionToClass`` whenever the pending list reaches a
    multiple of 100 entries, with any remainder inserted after the loop.
    Progress is logged every 1000 compositions; the session is committed at
    the end and the total is logged.
    """
    self.make_pipeline()
    class_lookup = self.structure_class_loader
    pending_links = []
    self.log("Generating Glycan Compositions from Symbolic Rules for %r" % self.hypothesis)

    n_created = 0
    for composition, structure_classes in self.transformer:
        mass = composition.mass()
        serialized = composition.serialize()
        formula_string = formula(composition.total_composition())
        record = DBGlycanComposition(
            calculated_mass=mass,
            formula=formula_string,
            composition=serialized,
            hypothesis_id=self.hypothesis_id)
        n_created += 1
        self.session.add(record)
        # Flushed before record.id is read below (presumably so that the
        # database has assigned the id -- confirm against the model).
        self.session.flush()
        for class_key in structure_classes:
            class_record = class_lookup[class_key]
            pending_links.append(
                {"glycan_id": record.id, "class_id": class_record.id})
            if not len(pending_links) % 100:
                self.session.execute(
                    GlycanCompositionToClass.insert(), pending_links)
                pending_links = []
        if not n_created % 1000:
            self.log("%d glycan compositions created" % (n_created,))

    if pending_links:
        self.session.execute(GlycanCompositionToClass.insert(), pending_links)
    self.session.commit()
    self.log("Generated %d glycan compositions" % n_created)
def split_protein(self, protein_obj, sites=None):
    """Yield ``Peptide`` records for pieces of peptides cut at *sites*.

    Every non-empty combination of *sites* is tried. For each combination
    the peptides of *protein_obj* matching ``self._make_split_expression``
    are cut at those sites, and each piece at least ``self.min_length`` long
    whose ``(start, end)`` span has not been produced before is expanded
    with ``self._permuted_peptides``.

    Each yielded record has a zero score of type ``'null_score'``, its
    sorted N-, O- and GAG-site lists, and a ``count_glycosylation_sites``
    equal to the number of N sites only.
    """
    if sites is None:
        sites = []
    emitted_spans = set()
    for n_splits in range(1, len(sites) + 1):
        for split_sites in itertools.combinations(sites, n_splits):
            query = protein_obj.peptides.filter(
                *self._make_split_expression(split_sites))
            for peptide in query.all():
                offset = peptide.start_position
                # Cut points relative to the peptide, bracketed by its ends.
                boundaries = [0]
                boundaries.extend(s - offset for s in split_sites)
                boundaries.append(peptide.sequence_length)
                for begin, end in zip(boundaries, boundaries[1:]):
                    if end - begin < self.min_length:
                        continue
                    span = (begin + offset, end + offset)
                    if span in emitted_spans:
                        continue
                    emitted_spans.add(span)
                    start_position, end_position = span
                    fragment = peptide.base_peptide_sequence[begin:end]
                    for modified_peptide, n_variable_modifications in \
                            self._permuted_peptides(fragment):
                        inst = Peptide(
                            base_peptide_sequence=str(fragment),
                            modified_peptide_sequence=str(modified_peptide),
                            count_missed_cleavages=peptide.count_missed_cleavages,
                            count_variable_modifications=n_variable_modifications,
                            sequence_length=len(modified_peptide),
                            start_position=start_position,
                            end_position=end_position,
                            calculated_mass=modified_peptide.mass,
                            formula=formula(modified_peptide.total_composition()),
                            protein_id=protein_obj.id)
                        inst.hypothesis_id = protein_obj.hypothesis_id
                        inst.peptide_score = 0
                        inst.peptide_score_type = 'null_score'

                        n_sites = parent_sequence_aware_n_glycan_sequon_sites(
                            inst, protein_obj)
                        o_sites = o_glycan_sequon_sites(inst, protein_obj)
                        gag_sites = gag_sequon_sites(inst, protein_obj)

                        inst.count_glycosylation_sites = len(n_sites)
                        inst.n_glycosylation_sites = sorted(n_sites)
                        inst.o_glycosylation_sites = sorted(o_sites)
                        inst.gagylation_sites = sorted(gag_sites)
                        yield inst
def modify_string(self, peptide):
    """Yield a ``Peptide`` for each form produced by ``self.peptide_permuter``.

    The records are not placed within a protein: the missed cleavage count
    and both positions are set to ``-1``.
    """
    for variant, variable_count in self.peptide_permuter(peptide):
        yield Peptide(
            base_peptide_sequence=str(peptide),
            modified_peptide_sequence=str(variant),
            count_missed_cleavages=-1,
            count_variable_modifications=variable_count,
            sequence_length=len(variant),
            start_position=-1,
            end_position=-1,
            calculated_mass=variant.mass,
            formula=formula(variant.total_composition()))
def modify_string(self, peptide):
    """Yield a ``Peptide`` for each form produced by ``self.peptide_permuter``.

    The records are not placed within a protein: the missed cleavage count
    and both positions are set to ``-1``.
    """
    for variant, variable_count in self.peptide_permuter(peptide):
        yield Peptide(
            base_peptide_sequence=str(peptide),
            modified_peptide_sequence=str(variant),
            count_missed_cleavages=-1,
            count_variable_modifications=variable_count,
            sequence_length=len(variant),
            start_position=-1,
            end_position=-1,
            calculated_mass=variant.mass,
            formula=formula(variant.total_composition()))
def _migrate_single_glycopeptide(self, glycopeptide):
    """Migrate one glycopeptide and return its entry in the migrator's id map.

    A ``Glycopeptide`` record is built from *glycopeptide* and the
    components of its ``id``, handed to
    ``self._glycopeptide_hypothesis_migrator``, and committed. The returned
    value is ``glycopeptide_id_map[glycopeptide.id]``.
    """
    key = glycopeptide.id
    record = Glycopeptide(
        id=key,
        peptide_id=key.peptide_id,
        glycan_combination_id=key.glycan_combination_id,
        protein_id=key.protein_id,
        hypothesis_id=key.hypothesis_id,
        glycopeptide_sequence=glycopeptide.get_sequence(),
        calculated_mass=glycopeptide.total_mass,
        formula=formula(glycopeptide.total_composition()))
    migrator = self._glycopeptide_hypothesis_migrator
    migrator.migrate_glycopeptide(record)
    migrator.commit()
    return migrator.glycopeptide_id_map[key]
def modifications():
    """Return the modification table as a ``jsonify`` response.

    ``definitions`` lists ``(title, formula, mass)`` for every rule.
    ``specificities`` holds the spec strings of every rule that is not in
    the substitution, glycosylation or other-glycosylation category.
    """
    table = ModificationTable()
    definitions = [
        (rule.title, formula(rule.composition), rule.mass)
        for rule in table.rules()
    ]

    excluded_categories = (
        ModificationCategory.substitution,
        ModificationCategory.glycosylation,
        ModificationCategory.other_glycosylation,
    )
    spec_strings = set()
    for rule in table.rules():
        categories = rule.categories
        if any(category in categories for category in excluded_categories):
            continue
        spec_strings.update(rule.as_spec_strings())

    return jsonify(definitions=definitions, specificities=tuple(spec_strings))
def combinate(self, n=1):
    """Yield ``(GlycanCombination, Counter)`` for each multiset of size *n*.

    Multisets are drawn from ``self.glycan_compositions`` with replacement.
    The combination's mass is the sum of its members' masses, its formula
    comes from the sum of their elemental compositions, and its composition
    string is the ``str`` of the merged compositions. The ``Counter``
    records how many times each member occurs.
    """
    for members in itertools.combinations_with_replacement(
            self.glycan_compositions, n):
        member_counts = Counter(members)
        composition_text = str(merge_compositions_frozen(members))
        total_mass = sum(member.mass for member in members)

        total_elements = Composition()
        for member in members:
            total_elements += member.elemental_composition

        combination = GlycanCombination(
            count=n,
            calculated_mass=total_mass,
            composition=composition_text,
            formula=formula(total_elements))
        yield combination, member_counts
def convert_to_peptide_dict(glycopeptide, id_tracker):
    """Describe *glycopeptide* as a dictionary of id, sequence, modifications.

    Positions are numbered from 1 and only the first modification at each
    position is reported. A glycosylation is written as an
    "unknown modification" carrying the glycan composition's mass, string
    form and formula as user parameters; any other modification is written
    with its own mass and name. *id_tracker* is not used here.
    """
    data = {
        "id": glycopeptide.id,
        "peptide_sequence": parser.strip_modifications(glycopeptide),
        "modifications": []
    }
    entries = data['modifications']
    # TODO: handle N-terminal and C-terminal modifications
    for location, (_position, mods) in enumerate(glycopeptide, 1):
        if not mods:
            continue
        mod = mods[0]
        if mod.rule.is_a("glycosylation"):
            entry = {
                "monoisotopic_mass_delta": glycopeptide.glycan_composition.mass(),
                "location": location,
                "name": "unknown modification",
                "params": [
                    components.UserParam(
                        name='GlycosylationType', value=str(mod)),
                    components.UserParam(
                        name='GlycanComposition',
                        value=str(glycopeptide.glycan_composition)),
                    components.UserParam(
                        name='Formula',
                        value=formula(
                            glycopeptide.glycan_composition.total_composition())),
                ]
            }
        else:
            entry = {
                "monoisotopic_mass_delta": mod.mass,
                "location": location,
                "name": mod.name,
            }
        entries.append(entry)
    return data
def fetch_glycopeptides(self, glycopeptide_ids):
    """Return a ``Glycopeptide`` record for each distinct match target.

    Targets are gathered from every spectrum match of every entry in
    ``self._identified_glycopeptide_set`` and keyed by ``target.id``, so a
    target reached through several matches is emitted once.
    *glycopeptide_ids* is accepted but not consulted by this implementation.
    """
    targets_by_id = {}
    for identified in self._identified_glycopeptide_set:
        for solution_set in identified.spectrum_matches:
            for match in solution_set:
                target = match.target
                targets_by_id[target.id] = target

    def to_record(target):
        key = target.id
        return Glycopeptide(
            id=key,
            peptide_id=key.peptide_id,
            glycan_combination_id=key.glycan_combination_id,
            protein_id=key.protein_id,
            hypothesis_id=key.hypothesis_id,
            glycopeptide_sequence=target.get_sequence(),
            calculated_mass=target.total_mass,
            formula=formula(target.total_composition()))

    return [to_record(target) for target in targets_by_id.values()]
def fetch_glycopeptides(self, glycopeptide_ids):
    """Return a ``Glycopeptide`` record for each distinct match target.

    Targets are gathered from every spectrum match of every entry in
    ``self._identified_glycopeptide_set`` and keyed by ``target.id``, so a
    target reached through several matches is emitted once.
    *glycopeptide_ids* is accepted but not consulted by this implementation.
    """
    targets_by_id = {}
    for identified in self._identified_glycopeptide_set:
        for solution_set in identified.spectrum_matches:
            for match in solution_set:
                target = match.target
                targets_by_id[target.id] = target

    def to_record(target):
        key = target.id
        return Glycopeptide(
            id=key,
            peptide_id=key.peptide_id,
            glycan_combination_id=key.glycan_combination_id,
            protein_id=key.protein_id,
            hypothesis_id=key.hypothesis_id,
            glycopeptide_sequence=target.get_sequence(),
            calculated_mass=target.total_mass,
            formula=formula(target.total_composition()))

    return [to_record(target) for target in targets_by_id.values()]
def convert_to_peptide_dict(glycopeptide, id_tracker):
    """Describe *glycopeptide* as a dictionary of id, sequence, modifications.

    Positions are numbered from 1 and only the first modification at each
    position is reported. A glycosylation is written as an
    "unknown modification" carrying the glycan composition's mass, string
    form and formula as user parameters; any other modification is written
    with its own mass and name. *id_tracker* is not used here.
    """
    data = {
        "id": glycopeptide.id,
        "peptide_sequence": parser.strip_modifications(glycopeptide),
        "modifications": []
    }
    entries = data['modifications']
    # TODO: handle N-terminal and C-terminal modifications
    for location, (_position, mods) in enumerate(glycopeptide, 1):
        if not mods:
            continue
        mod = mods[0]
        if mod.rule.is_a("glycosylation"):
            entry = {
                "monoisotopic_mass_delta": glycopeptide.glycan_composition.mass(),
                "location": location,
                "name": "unknown modification",
                "params": [
                    components.UserParam(
                        name='GlycosylationType', value=str(mod)),
                    components.UserParam(
                        name='GlycanComposition',
                        value=str(glycopeptide.glycan_composition)),
                    components.UserParam(
                        name='Formula',
                        value=formula(
                            glycopeptide.glycan_composition.total_composition())),
                ]
            }
        else:
            entry = {
                "monoisotopic_mass_delta": mod.mass,
                "location": location,
                "name": mod.name,
            }
        entries.append(entry)
    return data
def handle_peptide(self, peptide):
    """Yield a ``Glycopeptide`` for every way of glycosylating *peptide*.

    N-linked, O-linked and GAG-linker sites are processed in that order,
    each class using only its own site list. The enumeration applied to
    each class lives in ``_glycosylate_unoccupied_sites``.
    """
    water = Composition("H2O")
    peptide_composition = Composition(str(peptide.formula))
    obj = peptide.convert()

    # Handle N-linked glycosylation sites
    for glycopeptide in self._glycosylate_unoccupied_sites(
            peptide, obj, peptide_composition, water,
            peptide.n_glycosylation_sites, GlycanTypes.n_glycan,
            _n_glycosylation.name):
        yield glycopeptide

    # Handle O-linked glycosylation sites
    for glycopeptide in self._glycosylate_unoccupied_sites(
            peptide, obj, peptide_composition, water,
            peptide.o_glycosylation_sites, GlycanTypes.o_glycan,
            _o_glycosylation.name):
        yield glycopeptide

    # Handle GAG glycosylation sites
    for glycopeptide in self._glycosylate_unoccupied_sites(
            peptide, obj, peptide_composition, water,
            peptide.gagylation_sites, GlycanTypes.gag_linker,
            _gag_linker_glycosylation.name):
        yield glycopeptide

def _glycosylate_unoccupied_sites(self, peptide, parsed_sequence, peptide_composition,
                                  water, candidate_sites, glycan_type, modification_name):
    """Yield glycopeptides for one class of glycosylation site.

    Parameters
    ----------
    peptide:
        Database peptide being glycosylated; converted afresh for every
        yielded glycopeptide so that each gets its own sequence object.
    parsed_sequence:
        Result of ``peptide.convert()``, used to detect occupied sites.
    peptide_composition, water:
        Compositions of the peptide and of water, used for the formula.
    candidate_sites:
        Site positions of this class recorded on the peptide.
    glycan_type:
        ``GlycanTypes`` member used to select glycan combinations.
    modification_name:
        Name of the modification added at each chosen site.
    """
    # A site that already carries a modification cannot be glycosylated.
    unoccupied_sites = set(
        site for site in candidate_sites if not parsed_sequence[site][1])

    for n_glycans in range(1, len(unoccupied_sites) + 1):
        partition = self.glycan_combination_partitions[
            n_glycans, {glycan_type: n_glycans}]
        for gc in partition:
            # One water is removed per glycan in the combination.
            total_mass = peptide.calculated_mass + gc.calculated_mass - (
                gc.count * water.mass)
            formula_string = formula(
                peptide_composition + Composition(str(gc.formula)) - (
                    water * gc.count))

            for site_set in limiting_combinations(unoccupied_sites, n_glycans):
                sequence = peptide.convert()
                for site in site_set:
                    sequence.add_modification(site, modification_name)
                sequence.glycan = gc.convert()

                yield Glycopeptide(
                    calculated_mass=total_mass,
                    formula=formula_string,
                    glycopeptide_sequence=str(sequence),
                    peptide_id=peptide.id,
                    protein_id=peptide.protein_id,
                    hypothesis_id=peptide.hypothesis_id,
                    glycan_combination_id=gc.id)
def split_protein(self, protein_obj, sites=None):
    """Yield ``Peptide`` records for pieces of peptides cut at *sites*.

    The peptides of *protein_obj* are loaded once and indexed in an
    interval tree. For each site, the peptides overlapping it determine a
    region; every non-empty combination of the sites inside that region is
    used (once) to cut the peptides containing all of those sites. Each
    piece at least ``self.min_length`` long whose ``(start, end)`` span has
    not been produced before is expanded with ``self._permuted_peptides``.

    Nothing is yielded when *sites* is ``None`` or empty.

    Each yielded record has a zero score of type ``'null_score'``, its
    sorted N-, O- and GAG-site lists, and a ``count_glycosylation_sites``
    equal to the number of N sites only.
    """
    if sites is None:
        sites = []
    if not sites:
        return
    # (start, end) spans already emitted, across all site combinations.
    seen = set()
    # Site combinations already processed, so overlapping regions do not
    # repeat work.
    sites_seen = set()
    peptides = protein_obj.peptides.all()
    peptide_intervals = IntervalTreeNode.build(map(PeptideInterval, peptides))
    for site in sites:
        # NOTE(review): the tree is probed at ``site - 1`` here but at the
        # unshifted site value further down -- confirm the coordinate
        # convention expected by ``contains_point``.
        overlap_region = peptide_intervals.contains_point(site - 1)
        spanned_intervals = IntervalTreeNode.build(overlap_region)
        # No spanned peptides. May be caused by regions of protein which digest to peptides
        # of unacceptable size.
        if spanned_intervals is None:
            continue
        lo = spanned_intervals.start
        hi = spanned_intervals.end
        # Get the set of all sites spanned by any peptide which spans the current query site
        spanned_sites = [s for s in sites if lo <= s <= hi]
        for i in range(1, len(spanned_sites) + 1):
            for split_sites in itertools.combinations(spanned_sites, i):
                site_key = frozenset(split_sites)
                if site_key in sites_seen:
                    continue
                sites_seen.add(site_key)
                # Start from the intervals containing the first site, then
                # keep only those that also contain every remaining site.
                spanning_peptides_query = spanned_intervals.contains_point(split_sites[0])
                for site_j in split_sites[1:]:
                    spanning_peptides_query = [
                        sp for sp in spanning_peptides_query if site_j in sp
                    ]
                # Each surviving interval is iterated to collect the
                # peptides it holds.
                spanning_peptides = []
                for sp in spanning_peptides_query:
                    spanning_peptides.extend(sp)
                for peptide in spanning_peptides:
                    # Cut points relative to the peptide, bracketed by its ends.
                    adjusted_sites = [0] + [s - peptide.start_position for s in split_sites] + [
                        peptide.sequence_length]
                    for j in range(len(adjusted_sites) - 1):
                        begin, end = adjusted_sites[j], adjusted_sites[j + 1]
                        if end - begin < self.min_length:
                            continue
                        # Back to protein coordinates for the dedup key.
                        start_position = begin + peptide.start_position
                        end_position = end + peptide.start_position
                        if (start_position, end_position) in seen:
                            continue
                        else:
                            seen.add((start_position, end_position))
                        for modified_peptide, n_variable_modifications in self._permuted_peptides(
                                peptide.base_peptide_sequence[begin:end]):
                            inst = Peptide(
                                base_peptide_sequence=str(peptide.base_peptide_sequence[begin:end]),
                                modified_peptide_sequence=str(modified_peptide),
                                count_missed_cleavages=peptide.count_missed_cleavages,
                                count_variable_modifications=n_variable_modifications,
                                sequence_length=len(modified_peptide),
                                start_position=start_position,
                                end_position=end_position,
                                calculated_mass=modified_peptide.mass,
                                formula=formula(modified_peptide.total_composition()),
                                protein_id=protein_obj.id)
                            inst.hypothesis_id = protein_obj.hypothesis_id
                            inst.peptide_score = 0
                            inst.peptide_score_type = 'null_score'
                            n_glycosites = n_glycan_sequon_sites(
                                inst, protein_obj)
                            o_glycosites = o_glycan_sequon_sites(inst, protein_obj)
                            gag_glycosites = gag_sequon_sites(inst, protein_obj)
                            # Only the N-glycan sites contribute to the count.
                            inst.count_glycosylation_sites = len(n_glycosites)
                            inst.n_glycosylation_sites = sorted(n_glycosites)
                            inst.o_glycosylation_sites = sorted(o_glycosites)
                            inst.gagylation_sites = sorted(gag_glycosites)
                            yield inst
def handle_peptide(self, peptide):
    """Yield a ``Glycopeptide`` for every way of glycosylating *peptide*.

    N-linked, O-linked and GAG-linker sites are processed in that order,
    each class using only its own site list. The enumeration applied to
    each class lives in ``_glycosylate_unoccupied_sites``.
    """
    water = Composition("H2O")
    peptide_composition = Composition(str(peptide.formula))
    obj = peptide.convert()

    # Handle N-linked glycosylation sites
    for glycopeptide in self._glycosylate_unoccupied_sites(
            peptide, obj, peptide_composition, water,
            peptide.n_glycosylation_sites, GlycanTypes.n_glycan,
            _n_glycosylation.name):
        yield glycopeptide

    # Handle O-linked glycosylation sites
    for glycopeptide in self._glycosylate_unoccupied_sites(
            peptide, obj, peptide_composition, water,
            peptide.o_glycosylation_sites, GlycanTypes.o_glycan,
            _o_glycosylation.name):
        yield glycopeptide

    # Handle GAG glycosylation sites
    for glycopeptide in self._glycosylate_unoccupied_sites(
            peptide, obj, peptide_composition, water,
            peptide.gagylation_sites, GlycanTypes.gag_linker,
            _gag_linker_glycosylation.name):
        yield glycopeptide

def _glycosylate_unoccupied_sites(self, peptide, parsed_sequence, peptide_composition,
                                  water, candidate_sites, glycan_type, modification_name):
    """Yield glycopeptides for one class of glycosylation site.

    Parameters
    ----------
    peptide:
        Database peptide being glycosylated; converted afresh for every
        yielded glycopeptide so that each gets its own sequence object.
    parsed_sequence:
        Result of ``peptide.convert()``, used to detect occupied sites.
    peptide_composition, water:
        Compositions of the peptide and of water, used for the formula.
    candidate_sites:
        Site positions of this class recorded on the peptide.
    glycan_type:
        ``GlycanTypes`` member used to select glycan combinations.
    modification_name:
        Name of the modification added at each chosen site.
    """
    # A site that already carries a modification cannot be glycosylated.
    unoccupied_sites = set(
        site for site in candidate_sites if not parsed_sequence[site][1])

    for n_glycans in range(1, len(unoccupied_sites) + 1):
        partition = self.glycan_combination_partitions[
            n_glycans, {glycan_type: n_glycans}]
        for gc in partition:
            # One water is removed per glycan in the combination.
            total_mass = peptide.calculated_mass + gc.calculated_mass - (
                gc.count * water.mass)
            formula_string = formula(
                peptide_composition + Composition(str(gc.formula)) - (
                    water * gc.count))

            for site_set in limiting_combinations(unoccupied_sites, n_glycans):
                sequence = peptide.convert()
                for site in site_set:
                    sequence.add_modification(site, modification_name)
                sequence.glycan = gc.convert()

                yield Glycopeptide(
                    calculated_mass=total_mass,
                    formula=formula_string,
                    glycopeptide_sequence=str(sequence),
                    peptide_id=peptide.id,
                    protein_id=peptide.protein_id,
                    hypothesis_id=peptide.hypothesis_id,
                    glycan_combination_id=gc.id)
def convert_to_peptide_dict(self, glycopeptide, id_tracker):
    """Describe *glycopeptide* as a dictionary of id, sequence, modifications.

    Positions are numbered from 1 and only the first modification at each
    position is reported. Glycosylation modifications are written with a
    list of CV parameters; every other modification is written with its own
    mass and name. *id_tracker* is not used here.

    NOTE(review): the branch nesting below was reconstructed from a
    whitespace-collapsed copy of this method; confirm the pairing of the
    ``else`` clauses against the canonical source.
    """
    data = {
        "id": glycopeptide.id,
        "peptide_sequence": parser.strip_modifications(glycopeptide),
        "modifications": []
    }
    i = 0
    # TODO: handle N-terminal and C-terminal modifications
    # Total number of glycosylation events on the sequence, used to tell a
    # single glycan composition from an aggregate spread over several sites.
    glycosylation_event_count = len(glycopeptide.convert().glycosylation_manager)
    glycosylation_events_handled = 0
    for _pos, mods in glycopeptide:
        i += 1
        if not mods:
            continue
        else:
            mod = mods[0]
        if mod.rule.is_a("glycosylation"):
            glycosylation_events_handled += 1
            is_aggregate_stub = False
            mod_params = [
                glycosylation_type_to_term(
                    str(mod.rule.glycosylation_type))
            ]
            if mod.rule.is_core:
                # A core rule is described through the glycopeptide's
                # overall glycan composition.
                mod_params.extend(
                    self.gnome_resolver.glycan_composition_to_terms(glycopeptide.glycan_composition.clone()))
                mass = glycopeptide.glycan_composition.mass()
                if glycosylation_event_count == 1:
                    mod_params.append({
                        "name": "glycan composition",
                        "cvRef": "PSI-MS",
                        "accession": "MS:XXXX14"
                    })
                else:
                    mod_params.append({
                        "name": "glycan aggregate",
                        "cvRef": "PSI-MS",
                        "accession": "MS:XXXX15"
                    })
                    # Only the first site of an aggregate carries the mass
                    # and formula; later sites are zero-mass stubs.
                    if glycosylation_events_handled > 1:
                        mass = 0
                        is_aggregate_stub = True
                if not is_aggregate_stub:
                    mod_params.append({
                        "accession": 'MS:1000864',
                        "cvRef": "PSI-MS",
                        "name": "chemical formula",
                        "value": formula(glycopeptide.glycan_composition.total_composition()),
                    })
            else:
                # A non-core rule is described by its own composition and mass.
                mod_params.append({
                    "accession": 'MS:1000864',
                    "cvRef": "PSI-MS",
                    "name": "chemical formula",
                    "value": formula(mod.rule.composition),
                })
                if mod.rule.is_composition:
                    mod_params.extend(self.gnome_resolver.glycan_composition_to_terms(mod.rule.glycan.clone()))
                    mod_params.append({
                        "name": "glycan composition",
                        "cvRef": "PSI-MS",
                        "accession": "MS:XXXX14"
                    })
                else:
                    mod_params.append({
                        "name": "glycan structure",
                        "cvRef": "PSI-MS",
                        "accession": "MS:XXXXXXX"
                    })
                mass = mod.mass
            mod_dict = {
                "monoisotopic_mass_delta": mass,
                "location": i,
                # "name": "unknown modification",
                "name": "glycosylation modification",
                "params": [components.CVParam(**x) for x in mod_params]
            }
            data['modifications'].append(mod_dict)
        else:
            mod_dict = {
                "monoisotopic_mass_delta": mod.mass,
                "location": i,
                "name": mod.name,
            }
            data['modifications'].append(mod_dict)
    return data
def split_protein(self, protein_obj, sites=None):
    """Yield ``Peptide`` records for pieces of peptides cut at *sites*.

    The peptides of *protein_obj* are loaded once and indexed in an
    interval tree. For each site, the peptides overlapping it determine a
    region; every non-empty combination of the sites inside that region is
    used (once) to cut the peptides containing all of those sites. Each
    piece at least ``self.min_length`` long whose ``(start, end)`` span has
    not been produced before is expanded with ``self._permuted_peptides``.

    Nothing is yielded when *sites* is ``None`` or empty.

    Each yielded record has a zero score of type ``'null_score'``, its
    sorted N-, O- and GAG-site lists, and a ``count_glycosylation_sites``
    equal to the number of N sites only.
    """
    if sites is None:
        sites = []
    if not sites:
        return
    # (start, end) spans already emitted, across all site combinations.
    seen = set()
    # Site combinations already processed, so overlapping regions do not
    # repeat work.
    sites_seen = set()
    peptides = protein_obj.peptides.all()
    peptide_intervals = IntervalTreeNode.build(map(PeptideInterval, peptides))
    for site in sites:
        # NOTE(review): the tree is probed at ``site - 1`` here but at the
        # unshifted site value further down -- confirm the coordinate
        # convention expected by ``contains_point``.
        overlap_region = peptide_intervals.contains_point(site - 1)
        spanned_intervals = IntervalTreeNode.build(overlap_region)
        # No spanned peptides. May be caused by regions of protein which digest to peptides
        # of unacceptable size.
        if spanned_intervals is None:
            continue
        lo = spanned_intervals.start
        hi = spanned_intervals.end
        # Get the set of all sites spanned by any peptide which spans the current query site
        spanned_sites = [s for s in sites if lo <= s <= hi]
        for i in range(1, len(spanned_sites) + 1):
            for split_sites in itertools.combinations(spanned_sites, i):
                site_key = frozenset(split_sites)
                if site_key in sites_seen:
                    continue
                sites_seen.add(site_key)
                # Start from the intervals containing the first site, then
                # keep only those that also contain every remaining site.
                spanning_peptides_query = spanned_intervals.contains_point(split_sites[0])
                for site_j in split_sites[1:]:
                    spanning_peptides_query = [
                        sp for sp in spanning_peptides_query if site_j in sp
                    ]
                # Each surviving interval is iterated to collect the
                # peptides it holds.
                spanning_peptides = []
                for sp in spanning_peptides_query:
                    spanning_peptides.extend(sp)
                for peptide in spanning_peptides:
                    # Cut points relative to the peptide, bracketed by its ends.
                    adjusted_sites = [0] + [s - peptide.start_position for s in split_sites] + [
                        peptide.sequence_length]
                    for j in range(len(adjusted_sites) - 1):
                        begin, end = adjusted_sites[j], adjusted_sites[j + 1]
                        if end - begin < self.min_length:
                            continue
                        # Back to protein coordinates for the dedup key.
                        start_position = begin + peptide.start_position
                        end_position = end + peptide.start_position
                        if (start_position, end_position) in seen:
                            continue
                        else:
                            seen.add((start_position, end_position))
                        for modified_peptide, n_variable_modifications in self._permuted_peptides(
                                peptide.base_peptide_sequence[begin:end]):
                            inst = Peptide(
                                base_peptide_sequence=str(peptide.base_peptide_sequence[begin:end]),
                                modified_peptide_sequence=str(modified_peptide),
                                count_missed_cleavages=peptide.count_missed_cleavages,
                                count_variable_modifications=n_variable_modifications,
                                sequence_length=len(modified_peptide),
                                start_position=start_position,
                                end_position=end_position,
                                calculated_mass=modified_peptide.mass,
                                formula=formula(modified_peptide.total_composition()),
                                protein_id=protein_obj.id)
                            inst.hypothesis_id = protein_obj.hypothesis_id
                            inst.peptide_score = 0
                            inst.peptide_score_type = 'null_score'
                            n_glycosites = n_glycan_sequon_sites(
                                inst, protein_obj)
                            o_glycosites = o_glycan_sequon_sites(inst, protein_obj)
                            gag_glycosites = gag_sequon_sites(inst, protein_obj)
                            # Only the N-glycan sites contribute to the count.
                            inst.count_glycosylation_sites = len(n_glycosites)
                            inst.n_glycosylation_sites = sorted(n_glycosites)
                            inst.o_glycosylation_sites = sorted(o_glycosites)
                            inst.gagylation_sites = sorted(gag_glycosites)
                            yield inst
def _serialize_mass_shift(self, mass_shift):
    """Return the name and formula string of *mass_shift* as a dictionary."""
    shift_name = mass_shift.name
    composition_text = formula(mass_shift.composition)
    return dict(name=shift_name, composition=composition_text)
def _serialize_mass_shift(self, mass_shift):
    """Return the name and formula string of *mass_shift* as a dictionary."""
    shift_name = mass_shift.name
    composition_text = formula(mass_shift.composition)
    return dict(name=shift_name, composition=composition_text)