def generate_random_glycopeptides(target_mass, ppm_error=10e-6, count=20, constant_modifications=None,
                                  variable_modifications=None, glycans=None, min_length=0,
                                  cleavage_start=None, cleavage_end=None, max_missed_cleavages=1,
                                  max_glycosylations=2):
    '''
    Given a target mass value and a tolerance threshold around it, create a set
    of random glycopeptides that satisfy the mass requirements.

    Candidates are grown by repeatedly appending a randomly chosen building
    block whose mass still fits under the remaining mass budget. A candidate is
    thrown away and restarted from an empty sequence when it accumulates too
    many missed cleavages, when no building block fits any more, or when the
    remaining budget drops below the mass of ``components[0]``.

    Parameters
    ----------
    target_mass : number
        Mass the generated glycopeptides should match.
    ppm_error : float
        Largest accepted relative error, ``|target_mass - mass| / target_mass``.
    count : int
        Number of distinct sequences to collect. The search gives up after
        ``count * 10000`` iterations, so fewer may be returned.
    constant_modifications, variable_modifications : list or None
        Modifications handed to ``generate_component_set``. They are deep
        copied, and entries whose ``name`` is ``"HexNAc"`` are dropped.
        ``None`` means no modifications.
    glycans : iterable or None
        Glycans used to build glycosylated sequons through
        ``as_modification().serialize()``. Defaults to ``mammalian_glycans``.
    min_length : int
        Padded sequences shorter than this are never accepted. A glycosylated
        sequon is only offered once the candidate is longer than
        ``min_length / 3``.
    cleavage_start, cleavage_end : sequence or None
        Passed to ``Protease``; ``None`` or an empty value becomes ``[""]``.
    max_missed_cleavages : int
        A candidate whose ``missed_cleavages`` exceeds this is discarded.
    max_glycosylations : int
        A sequon is only offered while ``has_glycan(candidate)`` is below this.

    Returns
    -------
    set of str
        String forms of the accepted padded sequences.
    '''
    def without_hexnac(modifications):
        # Work on a private copy so the caller's modification objects are
        # never touched, and leave glycosylation to the sequon heap.
        if modifications is None:
            return []
        return [mod for mod in copy.deepcopy(modifications) if mod.name != "HexNAc"]

    if glycans is None:
        glycans = mammalian_glycans
    constant_modifications = without_hexnac(constant_modifications)
    variable_modifications = without_hexnac(variable_modifications)

    if cleavage_start is None or len(cleavage_start) == 0:
        cleavage_start = [""]
    if cleavage_end is None or len(cleavage_end) == 0:
        cleavage_end = [""]
    cleavage_pattern = Protease(cleavage_start, cleavage_end)

    # Plain (non-glycosylated) building blocks.
    components = MassHeap([
        GrowingSequence(part, cleavage_pattern)
        for part in generate_component_set(constant_modifications, variable_modifications)
    ])
    # Every N-linked sequon paired with every glycan; the glycan is written
    # as a modification directly after the sequon's first residue.
    sequons = MassHeap([
        GrowingSequence(
            "{0}({1}){2}".format(sequon[0], glycan.as_modification().serialize(), sequon[1:]),
            cleavage_pattern)
        for sequon in generate_n_linked_sequons()
        for glycan in glycans
    ])

    loc_fabs = fabs
    water = Composition("H2O").mass

    def start_over():
        # A fresh empty candidate together with its initial mass budget: the
        # target (plus one water) minus the lightest padding of the candidate.
        fresh = GrowingSequence("", cleavage_pattern)
        budget = (water + target_mass) - min(p.mass for p in fresh.pad())
        return fresh, budget

    solutions = set()
    max_iter = count * 10000
    iter_count = 0
    candidate, mass_to_meet = start_over()

    while len(solutions) < count and iter_count < max_iter:
        iter_count += 1

        # The random draw comes last so it is only consumed when the cheaper
        # structural conditions already allow glycosylation (~30% chance).
        can_glycosylate = (len(candidate) > min_length / 3) and \
            (has_glycan(candidate) < max_glycosylations) and \
            (random.random() > .7)

        options = list(components.get_lower_than(mass_to_meet))
        if can_glycosylate:
            options += list(sequons.get_lower_than(mass_to_meet))

        if not options:
            # Nothing fits the remaining budget. random.choice would raise
            # IndexError on an empty list, so restart instead of crashing.
            candidate, mass_to_meet = start_over()
            continue

        next_part = random.choice(options)
        candidate.extend(next_part)
        mass_to_meet -= (next_part.mass - water)

        # Reset, too many missed cleavages?
        if candidate.missed_cleavages > max_missed_cleavages:
            candidate, mass_to_meet = start_over()

        # Only consider glycosylated sequences.
        if has_glycan(candidate) >= 1:
            for padded_sequence in candidate.pad():
                # Only consider longer sequences.
                if len(padded_sequence) < min_length:
                    continue
                error = loc_fabs(
                    (target_mass - padded_sequence.mass) / float(target_mass))
                # Accept?
                if error <= ppm_error:
                    solutions.add(str(padded_sequence))

        # Reset, too big?
        if mass_to_meet < components[0].mass:
            candidate, mass_to_meet = start_over()

    return solutions
def generate_random(self, target_mass, count, max_missed_cleavages=3, max_glycosylations=2, min_length=5): loc_fabs = fabs water = Composition("H2O").mass components = self.components sequons = self.sequons cleavage_pattern = self.cleavage_pattern ppm_error = self.ppm_error def reset_target_mass(): return (water + target_mass) - min(p.mass for p in candidate.pad()) solutions = set() max_iter = count * 10000 iter_count = 0 candidate = GrowingSequence("", self.cleavage_pattern) mass_to_meet = reset_target_mass() while(len(solutions) < count and iter_count < max_iter): try: can_glycosylate = (len(candidate) > min_length / 3) and \ (has_glycan(candidate) < max_glycosylations) and \ (random.random() > .7) options = list(components.get_lower_than(mass_to_meet)) if(can_glycosylate): glycosylated_options = list( sequons.get_lower_than(mass_to_meet)) options += glycosylated_options if len(options) == 0: candidate = GrowingSequence("", cleavage_pattern) mass_to_meet = reset_target_mass() next_part = random.choice(options) candidate.extend(next_part) mass_to_meet -= (next_part.mass - water) # print(str(candidate), candidate.missed_cleavages, len(candidate)) # Reset, too many missed cleavages? if candidate.missed_cleavages > max_missed_cleavages: #print("Too many missed cleavages: %s, Reset!" % candidate.missed_cleavages) candidate = GrowingSequence("", cleavage_pattern) mass_to_meet = reset_target_mass() for padded_sequence in candidate.pad(): # Only consider glycosylated sequences if has_glycan(candidate) < 1: break # Only consider longer sequences if(len(padded_sequence) < min_length): continue error = loc_fabs( (target_mass - padded_sequence.mass) / float(target_mass)) #logger.debug("%s, %s, %s" % # (padded_sequence, padded_sequence.mass, error)) # Accept? if error <= ppm_error: if str(padded_sequence) in self.ignore_sequences: continue #logger.debug("Accepting %s %s" % #(padded_sequence, padded_sequence.mass)) solutions.add(str(padded_sequence)) # Reset, too big? 
if mass_to_meet < components[0].mass: candidate = GrowingSequence("", cleavage_pattern) mass_to_meet = reset_target_mass() iter_count += 1 except IndexError, e: pname = multiprocessing.current_process().name logger.error("[%s] RandomGlycopeptideBuilder: Exception occurred", pname, exc_info=e)