def __init__(self, nodes, constant_modifications=None, variable_modifications=None):
    '''
    Build the graph's node heap and the heap of candidate building blocks.

    Parameters
    ----------
    nodes : iterable
        Objects exposing a ``mass`` attribute; each is wrapped in a ``Node``
        bound to this graph.
    constant_modifications : list, optional
        Forwarded to ``generate_component_set``. Defaults to an empty list.
    variable_modifications : list, optional
        Forwarded to ``generate_component_set`` after the ``HexNAc``
        modification rule has been appended. The caller's list is copied,
        never modified.
    '''
    self.nodes = MassHeap(Node(s.mass, graph=self) for s in nodes)
    self.edges = defaultdict(list)
    self.constant_modifications = constant_modifications or []
    # Copy before appending: `x or []` hands back the caller's own list when
    # one is supplied, and an in-place `+=` would make it grow by one HexNAc
    # rule on every construction.
    self.variable_modifications = list(variable_modifications or [])
    self.variable_modifications.append(modification.Modification("HexNAc").rule)
    self.parts = MassHeap(generate_component_set(
        self.constant_modifications,
        self.variable_modifications))
    # Cache of building-block heaps keyed by combination length.
    self.long_parts = {1: self.parts}
    self.node_map = {}
def generate_random_glycopeptides(target_mass, ppm_error=10e-6, count=20, constant_modifications=None,
                                  variable_modifications=None, glycans=None, min_length=0,
                                  cleavage_start=None, cleavage_end=None, max_missed_cleavages=1,
                                  max_glycosylations=2):
    '''
    Given a target mass value and a tolerance threshold around it, create a set of random glycopeptides
    that satisfy the mass requirements.

    Candidates are grown by repeatedly appending a randomly chosen building
    block whose mass is below the remaining mass budget. A candidate is
    discarded and restarted when it exceeds `max_missed_cleavages`, when the
    remaining budget drops below ``components[0].mass``, or when no building
    block fits at all.

    Parameters
    ----------
    target_mass : float
        Mass the generated sequences must match.
    ppm_error : float
        Largest accepted value of ``|target_mass - mass| / target_mass``.
    count : int
        Number of distinct solutions to collect. The search also stops after
        ``count * 10000`` iterations, so fewer may be returned.
    constant_modifications, variable_modifications : list, optional
        Modification rules forwarded to ``generate_component_set``. They are
        deep-copied and entries whose ``name`` is ``"HexNAc"`` are removed.
    glycans : iterable, optional
        Objects providing ``as_modification().serialize()``. Defaults to
        ``mammalian_glycans``.
    min_length : int
        Padded sequences shorter than this are not accepted; glycosylation is
        only attempted once ``len(candidate) > min_length / 3``.
    cleavage_start, cleavage_end : sequence, optional
        Forwarded to ``Protease``; ``None`` or an empty value becomes ``[""]``.
    max_missed_cleavages : int
        Candidates with more missed cleavages than this are restarted.
    max_glycosylations : int
        Glycosylated building blocks are only offered while
        ``has_glycan(candidate)`` is below this value.

    Returns
    -------
    set of str
        String forms of the accepted padded sequences.
    '''
    if glycans is None:
        glycans = mammalian_glycans

    # Work on private copies so the caller's rule objects are never touched.
    if constant_modifications is None:
        constant_modifications = []
    else:
        constant_modifications = copy.deepcopy(constant_modifications)
    if variable_modifications is None:
        variable_modifications = []
    else:
        variable_modifications = copy.deepcopy(variable_modifications)

    if cleavage_start is None or len(cleavage_start) == 0:
        cleavage_start = [""]
    if cleavage_end is None or len(cleavage_end) == 0:
        cleavage_end = [""]
    cleavage_pattern = Protease(cleavage_start, cleavage_end)

    # HexNAc is excluded from the plain building blocks; glycans are only
    # introduced through the sequon building blocks assembled below.
    variable_modifications = [
        mod for mod in variable_modifications if mod.name != "HexNAc"]
    constant_modifications = [
        mod for mod in constant_modifications if mod.name != "HexNAc"]

    components = MassHeap(
        GrowingSequence(block, cleavage_pattern)
        for block in generate_component_set(constant_modifications, variable_modifications))

    # Every sequon paired with every glycan: the serialized glycan is written
    # in parentheses directly after the sequon's first character.
    sequons = MassHeap(
        map(lambda x: GrowingSequence(x, cleavage_pattern),
            itertools.chain.from_iterable(
                map(lambda x: ("{0}({1}){2}".format(x[0], g.as_modification().serialize(), x[1:])
                               for g in glycans),
                    generate_n_linked_sequons()))))

    water = Composition("H2O").mass

    def fresh_candidate():
        # Empty sequence plus the full mass budget: target mass and one water,
        # less the lightest padding of the empty sequence.
        sequence = GrowingSequence("", cleavage_pattern)
        budget = (water + target_mass) - min(p.mass for p in sequence.pad())
        return sequence, budget

    solutions = set()
    max_iter = count * 10000
    iter_count = 0
    candidate, mass_to_meet = fresh_candidate()

    while len(solutions) < count and iter_count < max_iter:
        # Counted up front so every path through the body, including the
        # early `continue`, is bounded by max_iter.
        iter_count += 1

        can_glycosylate = (len(candidate) > min_length / 3) and \
            (has_glycan(candidate) < max_glycosylations) and \
            (random.random() > .7)

        options = list(components.get_lower_than(mass_to_meet))
        if can_glycosylate:
            options += list(sequons.get_lower_than(mass_to_meet))

        if not options:
            # Nothing fits the remaining budget (e.g. the target mass is
            # smaller than every building block). random.choice would raise
            # IndexError on an empty list, so restart instead.
            candidate, mass_to_meet = fresh_candidate()
            continue

        next_part = random.choice(options)
        candidate.extend(next_part)
        mass_to_meet -= (next_part.mass - water)

        # Reset, too many missed cleavages?
        if candidate.missed_cleavages > max_missed_cleavages:
            candidate, mass_to_meet = fresh_candidate()

        # Only consider glycosylated sequences. The check does not depend on
        # the padding, so it is evaluated once rather than per padded form.
        if has_glycan(candidate) >= 1:
            for padded_sequence in candidate.pad():
                # Only consider longer sequences
                if len(padded_sequence) < min_length:
                    continue
                error = fabs(
                    (target_mass - padded_sequence.mass) / float(target_mass))
                # Accept?
                if error <= ppm_error:
                    solutions.add(str(padded_sequence))

        # Reset, too big?
        if mass_to_meet < components[0].mass:
            candidate, mass_to_meet = fresh_candidate()

    return solutions
class Graph(object):
    '''
    Graph whose nodes wrap observed masses and whose edges record building
    blocks that explain the mass gap between two nodes.
    '''

    def __init__(self, nodes, constant_modifications=None, variable_modifications=None):
        '''
        Build the node heap and the heap of candidate building blocks.

        Parameters
        ----------
        nodes : iterable
            Objects exposing a ``mass`` attribute; each is wrapped in a
            ``Node`` bound to this graph.
        constant_modifications : list, optional
            Forwarded to ``generate_component_set``. Defaults to an empty list.
        variable_modifications : list, optional
            Forwarded to ``generate_component_set`` after the ``HexNAc``
            modification rule has been appended. The caller's list is copied,
            never modified.
        '''
        self.nodes = MassHeap(Node(s.mass, graph=self) for s in nodes)
        self.edges = defaultdict(list)
        self.constant_modifications = constant_modifications or []
        # Copy before appending: `x or []` hands back the caller's own list
        # when one is supplied, and an in-place `+=` would make it grow by one
        # HexNAc rule on every construction.
        self.variable_modifications = list(variable_modifications or [])
        self.variable_modifications.append(modification.Modification("HexNAc").rule)
        self.parts = MassHeap(generate_component_set(
            self.constant_modifications,
            self.variable_modifications))
        # Cache of building-block heaps keyed by combination length.
        self.long_parts = {1: self.parts}
        self.node_map = {}

    def __iter__(self):
        '''Iterate over the graph's nodes.'''
        return iter(self.nodes)

    def process(self, node, parts=None):
        '''
        Record an edge from `node` to every heavier node whose mass gap is
        matched by one of `parts` (default: ``self.parts``) within 2e-5
        as measured by ``ppm_error``.
        '''
        if parts is None:
            parts = self.parts
        for extent in self.nodes.get_higher_than(node.mass):
            gap_mass = extent.mass - node.mass
            for part in parts.get_lower_than(gap_mass + precursor_mass_shift + 1):
                if fabs(ppm_error(gap_mass + y_mass_shift, part.mass + y_mass_shift)) <= 2e-5:
                    # Edges are keyed by the unordered node pair.
                    self.edges[frozenset((node, extent))].append(
                        Edge(node, extent, node.mass, link_sequence=part))

    def process_all(self, length=1):
        '''
        Run `process` on every node using building blocks of the given
        combination `length`, then rebuild the node map.
        '''
        parts = self.long_parts.get(length, None)
        if parts is None:
            # Combinations of this length have not been generated yet.
            self.build_unordered_sequences(length)
            parts = self.long_parts[length]
        for node in self:
            self.process(node, parts)
        self.build_node_map()

    def build_node_map(self):
        '''
        Rebuild ``self.node_map``: each node maps to the edges of every pair
        in which that node is the minimum under ``masser``.
        '''
        self.node_map = defaultdict(list)
        for node in self:
            for pair in [pair for pair in self.edges if node in pair]:
                if node == min(pair, key=masser):
                    self.node_map[node].extend(self.edges[pair])

    def roots(self):
        '''
        Return the nodes in ``self.node_map`` that have outgoing edges and
        are not the ``to_terminus`` of any recorded edge.
        '''
        roots = set(self.node_map)
        for node, edges in self.node_map.items():
            for edge in edges:
                roots.discard(edge.to_terminus)
            # Entries with no edges are not starting points.
            if len(edges) == 0:
                roots.discard(node)
        return list(roots)

    def build_unordered_sequences(self, n=2):
        '''Generate and cache the length-`n` combinations of ``self.parts``.'''
        self.long_parts[n] = MassHeap(list(unordered_combinations(self.parts, n)))

    def get_sequence(self, node):
        '''
        Yield ``(node, SequenceCollection, terminal_node)`` for each path
        starting at `node`; the collection holds the ``link_sequence`` of
        every edge on the path.
        '''
        for path in self.traverse(node):
            yield (node,
                   SequenceCollection(map(lambda x: x.link_sequence, path)),
                   path[-1].to_terminus)

    def traverse(self, node):
        '''
        Yield every path, as a list of edges, from `node` to a node with no
        outgoing edges. A node with no outgoing edges yields one empty path.
        '''
        if len(self.node_map[node]) == 0:
            yield []
        else:
            for edge in self.node_map[node]:
                for path in self.traverse(edge.to_terminus):
                    yield [edge] + path

    def all_paths(self):
        '''Yield ``(total_edge_length, path)`` for every path from every root.'''
        for root in self.roots():
            for path in self.traverse(root):
                yield (sum(map(len, path)), path)

    def all_sequences(self):
        '''Yield ``(len(collection), sequence_tuple)`` for every root sequence.'''
        for root in self.roots():
            for seq in self.get_sequence(root):
                yield (len(seq[1]), seq)

    def identify_node(self, node, parts=None):
        '''
        Append to ``node.composition`` / ``node.kind`` every part whose mass
        plus the y shift (kind ``'y'``) or, failing that, the b shift (kind
        ``'b'``) matches ``node.mass`` within 2e-5, then return the two lists
        zipped together.
        '''
        if parts is None:
            parts = self.parts
        for part in parts:
            if fabs(ppm_error(node.mass, part.mass + y_mass_shift)) <= 2e-5:
                node.composition.append(part)
                node.kind.append('y')
            elif fabs(ppm_error(node.mass, part.mass + b_mass_shift)) <= 2e-5:
                node.composition.append(part)
                node.kind.append('b')
        return zip(node.composition, node.kind)
def sequence_spectra(ms_spectrum, drop_mass=0, constant_modifications=None, variable_modifications=None,
                     max_running_gaps=1, max_total_gaps=2):
    '''
    Grow candidate sequences round by round, extending each with a building
    block whenever the extended mass matches a tandem peak.

    Parameters
    ----------
    ms_spectrum : object
        Must provide ``neutral_mass`` and ``tandem_data``; the tandem entries
        must provide ``neutral_mass``.
    drop_mass : float
        Subtracted from the precursor mass; used only for the log message.
    constant_modifications : list, optional
        Forwarded to ``generate_component_set``.
    variable_modifications : list, optional
        Forwarded to ``generate_component_set`` after the ``HexNAc``
        modification rule has been appended. The caller's list is copied,
        never modified.
    max_running_gaps : int
        A sequence is gap-extended only if ``current_gaps + 1`` stays within
        this limit.
    max_total_gaps : int
        A sequence is gap-extended only if ``total_gaps + 1`` stays within
        this limit; also sizes the retained history (``4 * max_total_gaps``
        rounds).

    Returns
    -------
    ResultsGroup
        Built from the sequences with at least one match in the retained
        rounds, together with the building blocks. Returned once a round
        produces no candidates.
    '''
    tolerance = 2e-5

    constant_modifications = constant_modifications or []
    # Copy before appending: `x or []` hands back the caller's own list when
    # one is supplied, and `+=` would mutate it in place.
    variable_modifications = list(variable_modifications or [])
    variable_modifications.append(modification.Modification("HexNAc").rule)

    precursor_mass = ms_spectrum.neutral_mass - drop_mass
    logger.info("Precursor Mass: %f", precursor_mass)

    previous_sequences = SqliteDiskQueue()
    # Materialized on purpose: `parts` is iterated many times below, and on
    # Python 3 a bare map() would be exhausted after the first pass.
    parts = [SimpleFragment(component) for component in
             generate_component_set(constant_modifications, variable_modifications)]
    tandem = MassHeap(ms_spectrum.tandem_data)

    # Get starting component
    match = False
    for part in parts:
        for msms in tandem:
            if fabs(ppm_error(msms.neutral_mass, part.mass + y_mass_shift)) < tolerance:
                previous_sequences.append(SequenceRecord(part, kind='y'))
                match = True
            if fabs(ppm_error(msms.neutral_mass, part.mass + b_mass_shift)) < tolerance:
                previous_sequences.append(SequenceRecord(part, kind='b'))
                match = True
    if not match:
        # No peak matched a single building block: seed with every block.
        # NOTE(review): the positional 1, 1 presumably set the gap counters;
        # confirm against SequenceRecord.
        for part in parts:
            previous_sequences.append(SequenceRecord(part, 1, 1))

    next_sequences = SqliteDiskQueue()
    solutions = deque(maxlen=4 * max_total_gaps)
    min_part = min(parts, key=lambda x: x.mass).mass
    max_part = max(parts, key=lambda x: x.mass).mass

    while len(previous_sequences) > 0:
        for seq in previous_sequences:
            match = []
            # Mass window that any one-block extension of `seq` can fall in.
            lower = seq.mass + min_part
            upper = seq.mass + max_part
            lower_threshold = lower - (y_mass_shift + lower * tolerance)
            upper_threshold = upper + (y_mass_shift + upper * tolerance)
            for msms in reversed(list(tandem.get_higher_than(lower_threshold))):
                if msms.neutral_mass > upper_threshold:
                    break
                for part in parts:
                    mass_query = part.mass + y_mass_shift + seq.mass
                    if seq.kind != 'b' and (fabs(ppm_error(msms.neutral_mass, mass_query)) < tolerance):
                        ext = seq.extend(part, False)
                        ext.matches.append(msms)
                        ext.kind = 'y'
                        next_sequences.append(ext)
                        match.append(msms)
                    # Swap the y shift for the b shift and test again.
                    mass_query += -y_mass_shift + b_mass_shift
                    if seq.kind != 'y' and (fabs(ppm_error(msms.neutral_mass, mass_query)) < tolerance):
                        ext = seq.extend(part, False)
                        # NOTE(review): unlike the y branch, the matched peak
                        # is not appended to ext.matches here; confirm whether
                        # that is intentional.
                        ext.kind = 'b'
                        next_sequences.append(ext)
                        match.append(msms)
            if len(match) == 0:
                # Nothing matched: extend with every block as a gap, provided
                # both gap limits still allow one more.
                if seq.current_gaps + 1 <= max_running_gaps and seq.total_gaps + 1 <= max_total_gaps:
                    for part in parts:
                        next_sequences.append(seq.extend(part, True))

        logger.info("Round over, %d candidates", len(next_sequences))
        if len(next_sequences) == 0:
            return ResultsGroup([seq for batch in solutions for seq in batch], parts)
        previous_sequences = next_sequences
        # Stored lazily; the generator is only consumed when results are built.
        solutions.append(seq for seq in next_sequences if len(seq.matches) > 0)
        next_sequences = SqliteDiskQueue()