class Graph(object):
    """Graph of candidate fragment masses connected by sequence components.

    Nodes are observed masses; an edge between two nodes exists when the
    mass gap between them matches (within 20 ppm) the mass of some
    component generated from the configured modifications.
    """

    def __init__(self, nodes, constant_modifications=None, variable_modifications=None):
        """Build the node heap and the component ("parts") heap.

        :param nodes: iterable of objects with a ``mass`` attribute.
        :param constant_modifications: optional list of constant modification rules.
        :param variable_modifications: optional list of variable modification
            rules; a HexNAc modification rule is always added to the copy
            kept on the instance.
        """
        self.nodes = MassHeap(Node(s.mass, graph=self) for s in nodes)
        self.edges = defaultdict(list)
        self.constant_modifications = list(constant_modifications or [])
        # Copy before appending: the original code did ``+=`` on the
        # caller-supplied list, mutating the caller's argument in place.
        self.variable_modifications = list(variable_modifications or [])
        self.variable_modifications.append(modification.Modification("HexNAc").rule)
        self.parts = MassHeap(generate_component_set(
            self.constant_modifications, self.variable_modifications))
        # Cache of component heaps keyed by combination length; length 1 is
        # just ``self.parts``.
        self.long_parts = {1: self.parts}
        self.node_map = {}

    def __iter__(self):
        """Iterate over the graph's nodes (heap order of ``MassHeap``)."""
        return iter(self.nodes)

    def process(self, node, parts=None):
        """Create edges from ``node`` to every heavier node whose mass gap
        matches a component mass within 20 ppm.

        :param node: the lighter terminus to extend from.
        :param parts: component heap to match against; defaults to ``self.parts``.
        """
        if parts is None:
            parts = self.parts
        for extent in self.nodes.get_higher_than(node.mass):
            gap_mass = extent.mass - node.mass
            # Only components light enough to possibly bridge the gap are
            # considered (the +1 widens the cutoff slightly).
            for part in parts.get_lower_than(gap_mass + precursor_mass_shift + 1):
                if fabs(ppm_error(gap_mass + y_mass_shift, part.mass + y_mass_shift)) <= 2e-5:
                    self.edges[frozenset((node, extent))].append(
                        Edge(node, extent, node.mass, link_sequence=part))

    def process_all(self, length=1):
        """Run :meth:`process` for every node using components of the given
        combination ``length``, then rebuild the node map."""
        parts = self.long_parts.get(length, None)
        if parts is None:
            self.build_unordered_sequences(length)
            parts = self.long_parts[length]
        for node in self:
            self.process(node, parts)
        self.build_node_map()

    def build_node_map(self):
        """Map each node to its outgoing edges.

        Edges are stored under frozenset pairs; the lighter node of each
        pair (by ``masser``) owns the edge, giving a directed view.
        """
        self.node_map = defaultdict(list)
        for node in self:
            for pair in [pair for pair in self.edges if node in pair]:
                if node == min(pair, key=masser):
                    self.node_map[node].extend(self.edges[pair])

    def roots(self):
        """Return nodes that have outgoing edges but are not the target of
        any edge — the starting points for path traversal."""
        roots = set(self.node_map)
        for node, edges in self.node_map.items():
            for edge in edges:
                roots.discard(edge.to_terminus)
            # A node with no outgoing edges cannot start a path.
            if len(edges) == 0:
                roots.discard(node)
        return list(roots)

    def build_unordered_sequences(self, n=2):
        """Populate ``self.long_parts[n]`` with unordered combinations of
        ``n`` components."""
        self.long_parts[n] = MassHeap(list(unordered_combinations(self.parts, n)))

    def get_sequence(self, node):
        """Yield ``(start_node, sequence, end_node)`` triples for every path
        starting at ``node``."""
        for path in self.traverse(node):
            # Guard: traverse() yields [] for a node with no outgoing
            # edges; the original code crashed on path[-1] in that case.
            if not path:
                continue
            yield (node,
                   SequenceCollection(map(lambda x: x.link_sequence, path)),
                   path[-1].to_terminus)

    def traverse(self, node):
        """Recursively yield every edge path (list of edges) from ``node``
        to a node with no outgoing edges."""
        if len(self.node_map[node]) == 0:
            yield []
        else:
            for edge in self.node_map[node]:
                for path in self.traverse(edge.to_terminus):
                    yield [edge] + path

    def all_paths(self):
        """Yield ``(total_length, path)`` for every path from every root."""
        for root in self.roots():
            for path in self.traverse(root):
                yield (sum(map(len, path)), path)

    def all_sequences(self):
        """Yield ``(sequence_length, sequence_triple)`` for every sequence
        reachable from every root."""
        for root in self.roots():
            for seq in self.get_sequence(root):
                yield (len(seq[1]), seq)

    def identify_node(self, node, parts=None):
        """Label ``node`` with every component whose mass matches it as a
        y- or b-ion within 20 ppm.

        Appends to ``node.composition`` and ``node.kind`` (assumes both are
        list attributes — TODO confirm against Node's definition) and
        returns them zipped together.
        """
        if parts is None:
            parts = self.parts
        for part in parts:
            if fabs(ppm_error(node.mass, part.mass + y_mass_shift)) <= 2e-5:
                node.composition.append(part)
                node.kind.append('y')
            elif fabs(ppm_error(node.mass, part.mass + b_mass_shift)) <= 2e-5:
                node.composition.append(part)
                node.kind.append('b')
        return zip(node.composition, node.kind)
def sequence_spectra(ms_spectrum, drop_mass=0, constant_modifications=None,
                     variable_modifications=None, max_running_gaps=1,
                     max_total_gaps=2):
    """Breadth-first de-novo sequencing over a tandem mass spectrum.

    Grows candidate sequences one component at a time, keeping only
    extensions whose mass matches an observed tandem peak (as a y- or
    b-ion, within 20 ppm), while allowing a bounded number of unmatched
    "gap" extensions.

    :param ms_spectrum: spectrum object with ``neutral_mass`` and
        ``tandem_data`` attributes.
    :param drop_mass: mass subtracted from the precursor neutral mass.
    :param constant_modifications: optional constant modification rules.
    :param variable_modifications: optional variable modification rules;
        a HexNAc rule is always added to the local copy.
    :param max_running_gaps: maximum consecutive unmatched extensions.
    :param max_total_gaps: maximum unmatched extensions overall.
    :return: a ``ResultsGroup`` over recent matched rounds.
    """
    constant_modifications = list(constant_modifications or [])
    # Copy before appending — ``+=`` on the caller's list mutated it in place.
    variable_modifications = list(variable_modifications or [])
    variable_modifications.append(modification.Modification("HexNAc").rule)

    precursor_mass = ms_spectrum.neutral_mass - drop_mass
    logger.info("Precursor Mass: %f", precursor_mass)

    previous_sequences = SqliteDiskQueue()
    # Materialize: a bare map() iterator is exhausted after one pass on
    # Python 3, which would empty every later ``for part in parts`` loop
    # and make min()/max() below raise ValueError.
    parts = list(map(SimpleFragment,
                     generate_component_set(constant_modifications,
                                            variable_modifications)))
    tandem = MassHeap(ms_spectrum.tandem_data)

    # Seed the search: every single component that matches an observed
    # peak as a y- or b-ion becomes a starting sequence.
    match = False
    for part in parts:
        for msms in tandem:
            if (fabs(ppm_error(msms.neutral_mass, part.mass + y_mass_shift)) < 2e-5):
                previous_sequences.append(SequenceRecord(part, kind='y'))
                match = True
            if (fabs(ppm_error(msms.neutral_mass, part.mass + b_mass_shift)) < 2e-5):
                previous_sequences.append(SequenceRecord(part, kind='b'))
                match = True
    if not match:
        # No direct hit anywhere: seed with every component, each counted
        # as one running and one total gap.
        for part in parts:
            previous_sequences.append(SequenceRecord(part, 1, 1))

    next_sequences = SqliteDiskQueue()
    # Only the most recent rounds of matched sequences are retained.
    solutions = deque(maxlen=4 * max_total_gaps)
    min_part = min(parts, key=lambda x: x.mass).mass
    max_part = max(parts, key=lambda x: x.mass).mass

    while len(previous_sequences) > 0:
        for seq in previous_sequences:
            match = []
            # Window of peak masses reachable by adding one component,
            # widened by the y-ion shift and a 20 ppm tolerance.
            lower = seq.mass + min_part
            upper = seq.mass + max_part
            lower_threshold = lower - (y_mass_shift + lower * 2e-5)
            upper_threshold = upper + (y_mass_shift + upper * 2e-5)
            for msms in reversed(list(tandem.get_higher_than(lower_threshold))):
                if msms.neutral_mass > upper_threshold:
                    break
                for part in parts:
                    mass_query = part.mass + y_mass_shift + seq.mass
                    if seq.kind != 'b' and (fabs(ppm_error(msms.neutral_mass,
                                                           mass_query)) < 2e-5):
                        ext = seq.extend(part, False)
                        ext.matches.append(msms)
                        ext.kind = 'y'
                        next_sequences.append(ext)
                        match.append(msms)
                    # Re-target the same query mass as a b-ion.
                    mass_query += -y_mass_shift + b_mass_shift
                    if seq.kind != 'y' and (fabs(ppm_error(msms.neutral_mass,
                                                           mass_query)) < 2e-5):
                        ext = seq.extend(part, False)
                        # NOTE(review): unlike the y-ion branch, this does
                        # not record msms in ext.matches — confirm whether
                        # that asymmetry is intentional.
                        ext.kind = 'b'
                        next_sequences.append(ext)
                        match.append(msms)
            if len(match) == 0:
                # Unmatched: extend with every part as a gap, if budget allows.
                if (seq.current_gaps + 1 <= max_running_gaps and
                        seq.total_gaps + 1 <= max_total_gaps):
                    for part in parts:
                        next_sequences.append(seq.extend(part, True))

        logger.info("Round over, %d candidates", len(next_sequences))
        if len(next_sequences) == 0:
            # Search exhausted: flatten the retained matched rounds.
            return ResultsGroup(
                [seq for matched_round in solutions for seq in matched_round],
                parts)
        previous_sequences = next_sequences
        # Generator binds next_sequences eagerly, so rebinding the name on
        # the next line does not affect it.
        solutions.append(seq for seq in next_sequences if len(seq.matches) > 0)
        next_sequences = SqliteDiskQueue()