def __init__(self, database_path, analysis_id, stream=None, threshold=5):
    """Initialize the report creator with empty chromatogram collections.

    ``database_path``, ``analysis_id`` and ``stream`` are forwarded unchanged
    to the parent initializer; ``threshold`` is stored as ``self.threshold``.
    """
    parent = super(GlycanChromatogramReportCreator, self)
    parent.__init__(database_path, analysis_id, stream)
    # Hand the directory containing this module to the template loader.
    template_directory = os.path.dirname(__file__)
    self.set_template_loader(template_directory)
    self.threshold = threshold
    # Both collections start out empty.
    self.glycan_chromatograms = ChromatogramFilter([])
    self.unidentified_chromatograms = ChromatogramFilter([])
def __init__(self, database_path, analysis_id, stream=None, threshold=5):
    """Create the report builder.

    The first three arguments go straight to the parent class initializer.
    ``threshold`` is kept on the instance, and the two chromatogram
    collections are created empty.
    """
    super(GlycanChromatogramReportCreator, self).__init__(
        database_path, analysis_id, stream)
    module_directory = os.path.dirname(__file__)
    self.set_template_loader(module_directory)
    self.threshold = threshold
    empty_identified = ChromatogramFilter([])
    self.glycan_chromatograms = empty_identified
    empty_unidentified = ChromatogramFilter([])
    self.unidentified_chromatograms = empty_unidentified
def evaluate(self, chromatograms, delta_rt=0.25, min_points=3, smooth_overlap_rt=True,
             *args, **kwargs):
    """Score every chromatogram and keep those the scoring model accepts.

    Parameters
    ----------
    chromatograms : iterable
        Passed to ``ChromatogramFilter.process`` before evaluation.
    delta_rt, min_points
        Forwarded to ``ChromatogramFilter.process``.
    smooth_overlap_rt : bool
        When true, the filtered set is wrapped in
        ``ChromatogramOverlapSmoother`` before iteration.
    *args, **kwargs
        Accepted for signature compatibility; not used by this method.

    Returns
    -------
    ChromatogramFilter
        The accepted solutions.
    """
    filtered = ChromatogramFilter.process(
        chromatograms, delta_rt=delta_rt, min_points=min_points)
    if smooth_overlap_rt:
        filtered = ChromatogramOverlapSmoother(filtered)

    solutions = []
    n = len(filtered)
    for i, case in enumerate(filtered, 1):
        start = time.time()
        if self.in_debug_mode():
            self.debug("... Evaluating %r" % (case, ))
        if i % 1000 == 0:
            self.log("... %0.2f%% chromatograms evaluated (%d/%d)" % (i * 100. / n, i, n))
        try:
            sol = self.evaluate_chromatogram(case)
            if self.scoring_model.accept(sol):
                solutions.append(sol)
            elif sol.glycan_composition:
                self.debug("... Rejecting %s with score %s %s" % (
                    sol, sol.score, sol.score_components()))
            end = time.time()
            # Report on anything that took more than 30 seconds to evaluate
            if end - start > 30.0:
                self.log("... %r took a long time to evaluate (%0.2fs)" % (case, end - start))
        except (IndexError, ValueError) as err:
            # Evaluation stays best-effort, but the skipped case is no longer
            # dropped without a trace.
            self.debug("... Skipping %r: %r" % (case, err))
            continue
    return ChromatogramFilter(solutions)
def __init__(self, chromatograms, error_tolerance=1e-5, scan_id_to_rt=lambda x: x):
    """Wrap each chromatogram for tandem annotation and index them by time.

    Every input chromatogram is passed through ``TandemAnnotatedChromatogram``
    and collected in a ``ChromatogramFilter``; ``build_rt_interval_tree`` is
    then called on that collection. ``scan_id_to_rt`` defaults to the
    identity function. ``orphans`` starts as an empty list.
    """
    annotated = map(TandemAnnotatedChromatogram, chromatograms)
    self.chromatograms = ChromatogramFilter(annotated)
    self.rt_tree = build_rt_interval_tree(self.chromatograms)
    self.error_tolerance = error_tolerance
    self.scan_id_to_rt = scan_id_to_rt
    self.orphans = []
def search_all(self, chromatograms, mass_error_tolerance=1e-5):
    """Run ``self.search`` on every chromatogram and pool the results.

    Progress is logged every 1000 chromatograms. The pooled results are
    returned wrapped in a ``ChromatogramFilter``.
    """
    chromatograms = ChromatogramFilter(chromatograms)
    self.log("Matching Chromatograms")
    total = len(chromatograms)
    hits = []
    for count, chromatogram in enumerate(chromatograms, 1):
        if count % 1000 == 0:
            self.log("... %0.2f%% chromatograms searched (%d/%d)" % (
                count * 100. / total, count, total))
        hits.extend(self.search(chromatogram, mass_error_tolerance))
    return ChromatogramFilter(hits)
def run(self):
    """Load peaks, aggregate them into chromatograms and return the result.

    When ``self.truncate`` is set, the chromatograms first go through
    ``self.truncate_chromatograms``. Either way the final collection is
    wrapped in a ``ChromatogramFilter`` and stored on ``self.chromatograms``.
    """
    self.log("... Begin Extracting Chromatograms")
    self.load_peaks()
    self.log("...... Aggregating Chromatograms")
    self.aggregate_chromatograms()
    self.summary_chromatograms()
    if self.truncate:
        current = self.truncate_chromatograms(self.chromatograms)
    else:
        current = self.chromatograms
    # Ensure chromatograms are wrapped and sorted.
    self.chromatograms = ChromatogramFilter(current)
    return self.chromatograms
def find_related_profiles(self, chromatograms, mass_shifts, mass_error_tolerance=1e-5):
    """Record, per chromatogram, which others it explains via a mass shift.

    A ``ChromatogramGraph`` is built over ``chromatograms`` and split into
    connected components. Within each multi-member component, for every
    member ``a`` and every mass shift, members whose mass matches
    ``a.weighted_neutral_mass - mass_shift.mass`` are collected. For each
    distinct partner the best-fitting shift is kept and ``(a.key, shift)``
    is added to the partner's ``used_as_mass_shift`` list.
    """
    self.log("Building Connected Components")
    graph = ChromatogramGraph(chromatograms)
    graph.find_shared_peaks()
    components = graph.connected_components()
    total = len(components)
    self.log("Validating %d Components" % (total, ))

    for index, nodes in enumerate(components):
        if index > 0 and index % 1000 == 0:
            self.log("... %d Components Validated (%0.2f%%)" % (
                index, index / float(total) * 100.))
        if len(nodes) == 1:
            continue
        members = ChromatogramFilter([node.chromatogram for node in nodes])
        for a in members:
            def mass_gap(pair):
                # Residual mass error of explaining `a` as partner + shift.
                shift, partner = pair
                return abs(a.weighted_neutral_mass - (
                    partner.weighted_neutral_mass + shift.mass))

            candidates = [
                (shift, partner)
                for shift in mass_shifts
                for partner in members.find_all_by_mass(
                    a.weighted_neutral_mass - shift.mass, mass_error_tolerance)
                if partner != a
            ]
            if not candidates:
                continue

            # Sort so entries for the same partner sit next to each other,
            # then cut the sorted list into runs sharing one partner object.
            candidates.sort(
                key=lambda pair: (pair[1].start_time, pair[1].weighted_neutral_mass))
            runs = [[candidates[0]]]
            for pair in candidates[1:]:
                if pair[1] is runs[-1][0][1]:
                    runs[-1].append(pair)
                else:
                    runs.append([pair])

            best_pairs = [min(run, key=mass_gap) for run in runs]
            for shift, partner in best_pairs:
                used = set(partner.used_as_mass_shift)
                used.add((a.key, shift))
                partner.used_as_mass_shift = list(used)
def aggregate_chromatograms(self):
    """Group annotated peaks into chromatograms and store the filtered set.

    Peaks are aggregated by a ``ChromatogramForest``; the resulting
    chromatograms are counted, logged, and passed through
    ``ChromatogramFilter.process`` before being stored on
    ``self.chromatograms``.
    """
    forest = ChromatogramForest(
        [], self.grouping_tolerance, self.scan_id_to_rt)
    forest.aggregate_peaks(
        self.annotated_peaks, self.minimum_mass, self.minimum_intensity)
    extracted = list(forest)
    n_extracted = len(extracted)
    self.log("%d Chromatograms Extracted." % (n_extracted,))
    self.chromatograms = ChromatogramFilter.process(
        extracted, delta_rt=self.delta_rt, min_points=self.min_points)
def acceptance_filter(self, solutions, threshold=None):
    """Keep solutions scoring at least ``threshold`` that are not adducts.

    ``threshold`` falls back to ``self.acceptance_threshold`` when ``None``.
    Solutions with a truthy ``used_as_adduct`` are dropped.
    """
    cutoff = self.acceptance_threshold if threshold is None else threshold
    accepted = []
    for candidate in solutions:
        if candidate.score >= cutoff and not candidate.used_as_adduct:
            accepted.append(candidate)
    return ChromatogramFilter(accepted)
def aggregate_chromatograms(self):
    """Build chromatograms from the annotated peaks and filter them.

    The chromatograms produced by the ``ChromatogramForest`` are listed,
    their count is logged, and the output of ``ChromatogramFilter.process``
    is stored on ``self.chromatograms``.
    """
    peak_forest = ChromatogramForest(
        [], self.grouping_tolerance, self.scan_id_to_rt)
    peak_forest.aggregate_peaks(
        self.annotated_peaks, self.minimum_mass, self.minimum_intensity)
    built = list(peak_forest)
    message = "... %d Chromatograms Extracted." % (len(built),)
    self.log(message)
    self.chromatograms = ChromatogramFilter.process(
        built, delta_rt=self.delta_rt, min_points=self.min_points)
def evaluate(self, chromatograms, delta_rt=0.25, min_points=3, smooth_overlap_rt=True,
             *args, **kwargs):
    """Evaluate via the parent class, then merge solutions sharing a key.

    Solutions are grouped by ``key``. In each group the highest-scoring
    member is the reference: a clone of it absorbs the other members in
    descending score order, and the merged chromatogram is re-wrapped in the
    reference's class with the reference's score, scorer and score set.
    """
    evaluated = super(LogitSumChromatogramEvaluator, self).evaluate(
        chromatograms, delta_rt=delta_rt, min_points=min_points,
        smooth_overlap_rt=smooth_overlap_rt, *args, **kwargs)

    by_key = defaultdict(list)
    for case in evaluated:
        by_key[case.key].append(case)

    collapsed = []
    for members in by_key.values():
        ranked = sorted(members, key=lambda x: x.score, reverse=True)
        reference = ranked[0]
        combined = reference.clone()
        for other in ranked[1:]:
            combined = combined.merge(other)
        collapsed.append(reference.__class__(
            combined, reference.score, scorer=reference.scorer,
            score_set=reference.score_set))
    return ChromatogramFilter(collapsed)
def evaluate(self, chromatograms, delta_rt=0.25, min_points=3, smooth_overlap_rt=True,
             *args, **kwargs):
    """Evaluate via the parent class, then collapse duplicate keys.

    Solutions sharing a ``key`` are merged into a clone of the group's
    highest-scoring member. When ``self.update_score_on_merge`` is set and a
    group had more than one member, the merged chromatogram is re-evaluated
    and the new score is adopted only if it beats the reference score.
    Progress is logged every 1000 groups.
    """
    evaluated = super(LogitSumChromatogramEvaluator, self).evaluate(
        chromatograms, delta_rt=delta_rt, min_points=min_points,
        smooth_overlap_rt=smooth_overlap_rt, *args, **kwargs)
    self.log("Collapsing Duplicates")

    by_key = defaultdict(list)
    for case in evaluated:
        by_key[case.key].append(case)

    collapsed = []
    n = len(by_key)
    for index, members in enumerate(by_key.values()):
        if index % 1000 == 0 and index > 0:
            # float() keeps the ratio fractional under Python 2 division.
            self.log("... %d groups collapsed (%0.02f%%)" % (
                index, float(index) / n * 100.0))
        ranked = sorted(members, key=lambda x: x.score, reverse=True)
        reference = ranked[0]
        combined = reference.clone()
        for other in ranked[1:]:
            combined = combined.merge(other, skip_duplicate_nodes=True)
        merged = reference.__class__(
            combined, reference.score, scorer=reference.scorer,
            score_set=reference.score_set)
        if self.update_score_on_merge and len(ranked) > 1:
            rescored = self.evaluate_chromatogram(merged)
            if rescored.score > reference.score:
                merged.score_set = rescored.score_set
                merged.score = rescored.score
        collapsed.append(merged)
    return ChromatogramFilter(collapsed)
def __init__(self, chromatograms, error_tolerance=1e-5, scan_id_to_rt=lambda x: x):
    """Prepare the mapper's chromatogram collection and time index.

    Each chromatogram is wrapped with ``TandemAnnotatedChromatogram``; the
    wrapped collection is stored in a ``ChromatogramFilter`` and handed to
    ``build_rt_interval_tree``. The remaining arguments are stored as-is and
    ``orphans`` begins empty.
    """
    wrapped = map(TandemAnnotatedChromatogram, chromatograms)
    collection = ChromatogramFilter(wrapped)
    self.chromatograms = collection
    self.rt_tree = build_rt_interval_tree(self.chromatograms)
    self.scan_id_to_rt = scan_id_to_rt
    self.error_tolerance = error_tolerance
    self.orphans = []
def join_mass_shifted(self, chromatograms, adducts, mass_error_tolerance=1e-5):
    """Merge each chromatogram with overlapping adduct-shifted partners.

    For every chromatogram and adduct, chromatograms whose mass matches
    ``weighted_neutral_mass + adduct.mass`` are looked up. Each match that
    overlaps the accumulated chromatogram (per ``span_overlap``) is marked as
    used and merged in. A ``DuplicateNodeError`` raised during merging is
    annotated with context and re-raised.

    ``chromatograms`` must provide ``find_all_by_mass``.
    """
    joined = []
    total = len(chromatograms)
    self.log("Begin Forward Search")
    for position, chroma in enumerate(chromatograms, 1):
        if position % 1000 == 0:
            self.log("... %0.2f%% chromatograms searched (%d/%d)" % (
                position * 100. / total, position, total))
        accumulated = chroma
        for adduct in adducts:
            target_mass = chroma.weighted_neutral_mass + adduct.mass
            for match in chromatograms.find_all_by_mass(target_mass, mass_error_tolerance):
                if not match or not span_overlap(accumulated, match):
                    continue
                try:
                    match.used_as_adduct.append((accumulated.key, adduct))
                    accumulated = accumulated.merge(match, node_type=adduct)
                    accumulated.created_at = "join_mass_shifted"
                    accumulated.adducts.append(adduct)
                except DuplicateNodeError as e:
                    # Attach the merge context for whoever handles the error.
                    e.original = chroma
                    e.to_add = match
                    e.accumulated = accumulated
                    e.adduct = adduct
                    raise e
        joined.append(accumulated)
    return ChromatogramFilter(joined)
def run(self):
    """Load peaks, aggregate chromatograms and return them.

    When ``self.truncate`` is set, the chromatograms are passed through
    ``self.truncate_chromatograms`` and re-wrapped in a
    ``ChromatogramFilter``; otherwise ``self.chromatograms`` is returned as
    left by the earlier steps.
    """
    self.log("... Begin Extracting Chromatograms")
    self.load_peaks()
    self.aggregate_chromatograms()
    self.summary_chromatograms()
    if not self.truncate:
        return self.chromatograms
    truncated = self.truncate_chromatograms(self.chromatograms)
    self.chromatograms = ChromatogramFilter(truncated)
    return self.chromatograms
def finalize_matches(self, solutions):
    """Drop low-scoring and unassigned, modified-only solutions.

    A solution is removed when its score is at or below
    ``self.ignore_below``, or when it has no composition and ``Unmodified``
    is not among its adducts. The survivors are returned in a
    ``ChromatogramFilter``.
    """
    def keep(sol):
        if sol.score <= self.ignore_below:
            return False
        return not ((sol.composition is None) and (Unmodified not in sol.adducts))

    return ChromatogramFilter([sol for sol in solutions if keep(sol)])
def __init__(self, connection, analysis_name, sample_run, chromatogram_set,
             glycan_db, chromatogram_extractor):
    """Set up the migration with a wrapped, indexed chromatogram set.

    ``connection``, ``analysis_name``, ``sample_run`` and
    ``chromatogram_extractor`` go to ``AnalysisMigrationBase.__init__``.
    ``chromatogram_set`` is wrapped in a ``ChromatogramFilter`` and then
    indexed by ``self._index_chromatogram_set()``.
    """
    AnalysisMigrationBase.__init__(
        self, connection, analysis_name, sample_run, chromatogram_extractor)
    self._glycan_hypothesis_migrator = None
    self.glycan_db = glycan_db
    wrapped_set = ChromatogramFilter(chromatogram_set)
    self.chromatogram_set = wrapped_set
    # Indexing reads the attributes assigned above, so it runs last.
    self._index_chromatogram_set()
def process(self, chromatograms, adducts=None, mass_error_tolerance=1e-5, delta_rt=0):
    """Match chromatograms, resolve adducts and link related profiles.

    All chromatograms are searched and common identities joined. When
    ``adducts`` is non-empty, the forward and reverse adduct searches run as
    well. Identities are then joined again and ``find_related_profiles`` is
    called on the result, which is returned.
    """
    adducts = [] if adducts is None else adducts
    chromatograms = ChromatogramFilter(chromatograms)
    matches = self.join_common_identities(
        self.search_all(chromatograms, mass_error_tolerance), delta_rt)
    if adducts:
        self.log("Handling Adducts")
        matches = self.join_mass_shifted(matches, adducts, mass_error_tolerance)
        matches = self.reverse_adduct_search(matches, adducts, mass_error_tolerance)
    # NOTE(review): the source was whitespace-collapsed; this second join is
    # placed outside the `if adducts` block — confirm against the original.
    matches = self.join_common_identities(matches, delta_rt)
    self.find_related_profiles(matches, adducts, mass_error_tolerance)
    return matches
def load_glycan_composition_chromatograms(self):
    """Load and convert this analysis's glycan composition chromatograms.

    Rows of ``GlycanCompositionChromatogram`` whose ``analysis_id`` matches
    ``self.analysis_id`` are fetched with ``yield_per(100)``. Each row is
    converted with ``convert``; one node-type cache and one scan-id cache are
    shared across all rows. The converted items are returned in a
    ``ChromatogramFilter``.
    """
    from glycan_profiling.chromatogram_tree import ChromatogramFilter

    shared_caches = {"node_type_cache": {}, "scan_id_cache": {}}
    belongs_to_analysis = (
        GlycanCompositionChromatogram.analysis_id == self.analysis_id)
    records = self.query(GlycanCompositionChromatogram).filter(
        belongs_to_analysis).yield_per(100)

    converted = []
    for record in records:
        converted.append(record.convert(
            chromatogram_scoring_model=self.chromatogram_scoring_model,
            **shared_caches))
    return ChromatogramFilter(converted)
def evaluate(self, chromatograms, delta_rt=0.25, min_points=3, smooth_overlap_rt=True,
             *args, **kwargs):
    """Evaluate via the parent class, then apply network-smoothed scores.

    After ``smooth_network`` runs, solutions are visited in descending score
    order. Solutions without a glycan composition pass through untouched.
    For each composition only the first (highest-scoring) solution is kept,
    and its score is replaced with the network node's score, subject to the
    acceptance-threshold rule noted inline. The smoothing parameters and grid
    search are stored on the instance and logged.
    """
    solutions = super(LaplacianRegularizedChromatogramEvaluator, self).evaluate(
        chromatograms, delta_rt=delta_rt, min_points=min_points,
        smooth_overlap_rt=smooth_overlap_rt, *args, **kwargs)

    self.log("... Applying Network Smoothing Regularization")
    updated_network, search, params = smooth_network(
        self.network, solutions, lmbda=self.smoothing_factor,
        lambda_max=self.grid_smoothing_max,
        model_state=self.regularization_model)

    ranked = sorted(solutions, key=lambda x: x.score, reverse=True)
    # TODO - Use aggregation across multiple observations for the same glycan composition
    # instead of discarding all but the top scoring feature?
    best_by_composition = {}
    unannotated = []
    for sol in ranked:
        composition = sol.glycan_composition
        if composition is None:
            unannotated.append(sol)
            continue
        if composition in best_by_composition:
            continue
        best_by_composition[composition] = sol
        node = updated_network[composition]
        # Do not permit network smoothing to boost scores below acceptance_threshold:
        # at or below the threshold the network score is taken only if lower.
        if sol.score > self.acceptance_threshold or node.score < sol.score:
            sol.score = node.score

    self.network_parameters = params
    self.grid_search = search
    display_table(
        search.model.neighborhood_names,
        np.array(params.tau).reshape((-1, 1)),
        print_fn=lambda x: self.log("...... %s" % (x, )))
    self.log("...... smoothing factor: %0.3f; threshold: %0.3f" % (
        params.lmbda, params.threshold))
    return ChromatogramFilter(list(best_by_composition.values()) + unannotated)
def _make_summary_graphics(self):
    """Fill ``self.figure_axes`` with the two summary charts.

    The identified and unidentified chromatograms are combined and drawn at
    ``self.score_threshold``. If a ``ValueError`` is raised along the way,
    both charts are replaced by blank axes carrying an explanatory message.
    """
    def placeholder(message):
        # A blank, axis-less figure showing only the message.
        ax = figax()
        ax.text(0.5, 0.5, message, ha='center')
        ax.set_axis_off()
        return ArtistBase(ax)

    try:
        combined = ChromatogramFilter(
            self.glycan_chromatograms + self.unidentified_chromatograms)
        builder = GlycanChromatographySummaryGraphBuilder(combined)
        chrom, bar = builder.draw(self.score_threshold)
        self.figure_axes['chromatograms_chart'] = chrom
        self.figure_axes['abundance_bar_chart'] = bar
    except ValueError:
        self.figure_axes["chromatograms_chart"] = placeholder(
            "No Chromatograms Extracted")
        self.figure_axes['abundance_bar_chart'] = placeholder(
            "No Entities Matched")
def _load_chromatograms(self):
    """Extract chromatograms and swap explained ones for identified structures.

    Every extracted chromatogram starts unmarked. For each identified
    glycopeptide that has a chromatogram, and each of its mass shifts,
    extracted chromatograms matching the shifted mass (within 1e-5) that
    overlap the glycopeptide's chromatogram in time are marked. The unmarked
    chromatograms plus the identified glycopeptides become
    ``self.chromatograms``; the glycopeptides are also stored on
    ``self.identified_structures``.
    """
    extractor = ChromatogramExtractor(
        self.scan_loader, minimum_mass=1000.0, grouping_tolerance=1.5e-5)
    extracted = extractor.run()
    for chrom in extracted:
        chrom.mark = False

    idgps = self.analysis_loader.load_identified_glycopeptides()
    for idgp in idgps:
        if idgp.chromatogram is None:
            continue
        for mshift in idgp.mass_shifts:
            shifted_mass = idgp.weighted_neutral_mass + mshift.mass
            for chrom in extracted.find_all_by_mass(shifted_mass, 1e-5):
                if idgp.chromatogram.overlaps_in_time(chrom):
                    chrom.mark = True

    unexplained = [chrom for chrom in extracted if not chrom.mark]
    combined = ChromatogramFilter(unexplained + list(idgps))
    self.identified_structures = idgps
    self.chromatograms = combined
def process_chromatograms(self, processor, peak_loader, database):
    """Extract, match and evaluate chromatograms against the glycan database.

    If MSn are available and required, then MSn scans will be extracted and
    mapped onto chromatograms, and each MSn scan is searched with the
    pseudo-fragments of the glycans matching the chromatograms they map to.

    Parameters
    ----------
    processor : ChromatogramProcessor
        The container responsible for carrying out the matching and
        evaluating of chromatograms
    peak_loader : RandomAccessScanIterator
        An object which can be used to iterate over MS scans
    database : SearchableMassCollection
        The database of glycan compositions to search against
    """
    if self.require_msms_signature > 0:
        self.log("Extracting MS/MS")
        msms_scans = self.load_msms(peak_loader)
        if len(msms_scans) == 0:
            self.log("No MS/MS scans present. Ignoring requirement.")
            processor.run()
        else:
            matches = processor.match_compositions()
            annotated_matches = self.annotate_matches_with_msms(
                matches, peak_loader, msms_scans, database)
            # filter out those matches which do not have sufficient signature ion signal
            # from MS2 to include. As the MS1 scoring procedure will not preserve the
            # MS2 mapping, we must keep a mapping from Chromatogram Key to mapped tandem
            # matches to re-align later
            kept_annotated_matches = []
            key_to_tandem = defaultdict(list)
            for match in annotated_matches:
                accepted = False
                best_score = 0
                key_to_tandem[match.key].extend(match.tandem_solutions)
                for gsm in match.tandem_solutions:
                    if gsm.score > best_score:
                        best_score = gsm.score
                    if gsm.score > self.require_msms_signature:
                        accepted = True
                        break
                if accepted:
                    kept_annotated_matches.append(match)
                else:
                    self.debug(
                        "%s was discarded with insufficient MS/MS evidence %f" % (
                            match, best_score))
            kept_annotated_matches = ChromatogramFilter(kept_annotated_matches)
            processor.evaluate_chromatograms(kept_annotated_matches)
            for solution in processor.solutions:
                # A defaultdict never raises KeyError on lookup, so the old
                # try/except could not fire; .get() also avoids inserting
                # empty entries for unknown keys.
                gsms = key_to_tandem.get(solution.key, ())
                solution.tandem_solutions = [
                    gsm for gsm in gsms
                    if solution.spans_time_point(gsm.scan_time)
                ]
            # Only solutions that still carry MS/MS evidence are retained.
            processor.solutions = ChromatogramFilter([
                solution for solution in processor.solutions
                if len(solution.tandem_solutions) > 0
            ])
            processor.accepted_solutions = ChromatogramFilter([
                solution for solution in processor.accepted_solutions
                if len(solution.tandem_solutions) > 0
            ])
    else:
        processor.run()
def reverse_adduct_search(self, chromatograms, adducts, mass_error_tolerance=1e-5):
    """Try to explain unassigned chromatograms as adducted known entities.

    Chromatograms that already have a composition are kept as-is. For every
    other chromatogram and every adduct, ``self.match`` is queried at
    ``weighted_neutral_mass - adduct.mass``. A hit is merged into a new
    (or previously created) chromatogram for that match, unless an
    already-assigned chromatogram of the same composition overlaps it per
    ``span_overlap``, in which case the candidate is excluded.

    Returns
    -------
    ChromatogramFilter
        The assigned chromatograms, the newly built ones, and the candidates
        that were neither matched nor excluded.
    """
    exclude_compositions = defaultdict(list)
    candidate_chromatograms = []
    for chroma in chromatograms:
        if chroma.composition is not None:
            exclude_compositions[chroma.composition].append(chroma)
        else:
            candidate_chromatograms.append(chroma)

    new_members = {}
    unmatched = []

    def attach(name, match, chroma, adduct):
        # Merge the unmodified part of `chroma` into the chromatogram being
        # built for `name`; returns that unmodified part.
        if name in new_members:
            chroma_to_update = new_members[name]
        else:
            chroma_to_update = self.chromatogram_type(match)
            chroma_to_update.created_at = "reverse_adduction_search"
        chroma, _ = chroma.bisect_adduct(Unmodified)
        chroma_to_update = chroma_to_update.merge(chroma, adduct)
        chroma_to_update.created_at = "reverse_adduction_search"
        new_members[name] = chroma_to_update
        return chroma

    # Progress is measured against the candidates actually being searched.
    n = len(candidate_chromatograms)
    self.log("Begin Reverse Search")
    for i, chroma in enumerate(candidate_chromatograms, 1):
        if i % 1000 == 0:
            self.log("... %0.2f%% chromatograms searched (%d/%d)" % (i * 100. / n, i, n))
        candidate_mass = chroma.weighted_neutral_mass
        matched = False
        exclude = False
        for adduct in adducts:
            matches = self.match(candidate_mass - adduct.mass, mass_error_tolerance)
            if matches is None:
                continue
            for match in matches:
                name = match
                # This chromatogram matches another form of an existing composition
                # assignment. If it were assigned during `join_mass_shifted`, then
                # it overlapped with that entity and should not be merged. Otherwise
                # construct a new match
                if any(span_overlap(hit, chroma)
                       for hit in exclude_compositions.get(name, ())):
                    exclude = True
                    continue
                # As before, later iterations see the bisected chromatogram.
                chroma = attach(name, match, chroma, adduct)
                matched = True
        if not matched and not exclude:
            unmatched.append(chroma)

    out = []
    out.extend(s for g in exclude_compositions.values() for s in g)
    out.extend(new_members.values())
    out.extend(unmatched)
    return ChromatogramFilter(out)
class ChromatogramMSMSMapper(TaskBase):
    # Maps tandem (MS/MS) solutions onto chromatograms by scan time and mass.
    # NOTE(review): the source of this class was whitespace-collapsed; block
    # nesting below is reconstructed — confirm against the original file.

    def __init__(self, chromatograms, error_tolerance=1e-5, scan_id_to_rt=lambda x: x):
        # Wrap every chromatogram so it can hold tandem solutions, then index
        # the wrapped collection with build_rt_interval_tree.
        self.chromatograms = ChromatogramFilter(map(
            TandemAnnotatedChromatogram, chromatograms))
        self.rt_tree = build_rt_interval_tree(self.chromatograms)
        self.scan_id_to_rt = scan_id_to_rt
        # Solutions for which no chromatogram was found (see find_chromatogram_for).
        self.orphans = []
        self.error_tolerance = error_tolerance

    def find_chromatogram_spanning(self, time):
        # Collect the first element of every interval the tree reports as
        # containing `time`.
        return ChromatogramFilter([interv[0] for interv in self.rt_tree.contains_point(time)])

    def find_chromatogram_for(self, solution):
        # Attach `solution` to a chromatogram that spans its precursor scan
        # time and matches its precursor mass; otherwise record it as an orphan.
        try:
            precursor_scan_time = self.scan_id_to_rt(
                solution.precursor_information.precursor_scan_id)
        except Exception:
            # Fall back to the solution's own scan id when the precursor scan
            # cannot be resolved.
            precursor_scan_time = self.scan_id_to_rt(solution.scan_id)
        overlapping_chroma = self.find_chromatogram_spanning(precursor_scan_time)
        chroma = overlapping_chroma.find_mass(
            solution.precursor_information.neutral_mass, self.error_tolerance)
        if chroma is None:
            self.orphans.append(ScanTimeBundle(solution, precursor_scan_time))
        else:
            chroma.tandem_solutions.append(solution)

    def assign_solutions_to_chromatograms(self, solutions):
        # Run find_chromatogram_for on every solution, logging every 1000.
        n = len(solutions)
        for i, solution in enumerate(solutions):
            if i % 1000 == 0:
                self.log("... %d/%d Solutions Handled (%0.2f%%)" % (i, n, (i * 100.0 / n)))
            self.find_chromatogram_for(solution)

    def distribute_orphans(self, threshold_fn=lambda x: x.q_value < 0.05):
        # Give each orphan to the mass-matching chromatogram whose start or end
        # time is closest to the orphan's scan time. Orphans with no
        # mass-matching chromatogram are kept only if they pass threshold_fn.
        lost = []
        n = len(self.orphans)
        n_chromatograms = len(self.chromatograms)
        for j, orphan in enumerate(self.orphans):
            mass = orphan.solution.precursor_ion_mass
            time = orphan.scan_time
            if j % 100 == 0:
                self.log("... %r %d/%d Orphans Handled (%0.2f%%)" % (orphan, j, n, (j * 100.0 / n)))
            candidates = self.chromatograms.find_all_by_mass(mass, self.error_tolerance)
            if len(candidates) > 0:
                best_index = 0
                best_distance = float('inf')
                for i, candidate in enumerate(candidates):
                    # Distance to the nearer of the candidate's two time bounds.
                    dist = min(abs(candidate.start_time - time), abs(candidate.end_time - time))
                    if dist < best_distance:
                        best_index = i
                        best_distance = dist
                new_owner = candidates[best_index]
                new_owner.add_displaced_solution(orphan.solution)
            else:
                if threshold_fn(orphan.solution):
                    # The message is suppressed when there are no chromatograms at all.
                    if n_chromatograms > 0:
                        self.log("No chromatogram found for %r, q-value %0.4f (mass: %0.4f, time: %0.4f)" % (
                            orphan, orphan.solution.q_value, mass, time))
                    lost.append(orphan.solution)
        self.orphans = TandemSolutionsWithoutChromatogram.aggregate(lost)

    def assign_entities(self, threshold_fn=lambda x: x.q_value < 0.05, entity_chromatogram_type=None):
        # For every chromatogram with representative solutions, assign the
        # highest-scoring one as its entity and store the sorted list.
        if entity_chromatogram_type is None:
            entity_chromatogram_type = GlycopeptideChromatogram
        for chromatogram in self:
            solutions = chromatogram.most_representative_solutions(threshold_fn)
            if solutions:
                solutions = sorted(solutions, key=lambda x: x.score, reverse=True)
                chromatogram.assign_entity(solutions[0], entity_chromatogram_type=entity_chromatogram_type)
                chromatogram.representative_solutions = solutions

    def merge_common_entities(self, annotated_chromatograms, delta_rt=0.25, require_unmodified=True,
                              threshold_fn=lambda x: x.q_value < 0.05):
        # Merge chromatograms that share an entity (or composition) and either
        # overlap in time or satisfy the delta_rt test below. With
        # require_unmodified, chromatograms lacking the Unmodified mass shift
        # are re-assigned to an unmodified solution where one exists, and the
        # merge is repeated. Returns a plain list.
        aggregated = defaultdict(list)
        finished = []
        self.log("Aggregating Common Entities: %d chromatograms" % (len(annotated_chromatograms,)))
        for chroma in annotated_chromatograms:
            if chroma.composition is not None:
                if chroma.entity is not None:
                    # Convert to string to avoid redundant sequences from getting
                    # binned differently due to random ordering of ids.
                    aggregated[str(chroma.entity)].append(chroma)
                else:
                    aggregated[str(chroma.composition)].append(chroma)
            else:
                # Chromatograms without a composition are passed through unmerged.
                finished.append(chroma)
        for entity, group in aggregated.items():
            out = []
            group = sorted(group, key=lambda x: x.start_time)
            chroma = group[0]
            for obs in group[1:]:
                if chroma.chromatogram.overlaps_in_time(obs) or (
                        chroma.end_time - obs.start_time) < delta_rt:
                    chroma = chroma.merge(obs)
                else:
                    out.append(chroma)
                    chroma = obs
            out.append(chroma)
            finished.extend(out)
        self.log("After merging: %d chromatograms" % (len(finished),))
        if require_unmodified:
            out = []
            for chromatogram in finished:
                # the structure's best match has not been identified in an unmodified state
                if Unmodified not in chromatogram.mass_shifts:
                    solutions = chromatogram.most_representative_solutions(
                        threshold_fn, reject_shifted=True)
                    # if there is a reasonable solution in an unmodified state
                    if solutions:
                        # select the best solution
                        solutions = sorted(solutions, key=lambda x: x.score, reverse=True)
                        # remove the invalidated mass shifts
                        current_shifts = chromatogram.chromatogram.mass_shifts
                        partitions = []
                        for shift in current_shifts:
                            partition, _ = chromatogram.chromatogram.bisect_mass_shift(shift)
                            partitions.append(partition.deduct_node_type(shift))
                        accumulated_chromatogram = partitions[0]
                        for partition in partitions[1:]:
                            accumulated_chromatogram = accumulated_chromatogram.merge(partition)
                        chromatogram.chromatogram = accumulated_chromatogram
                        # update the tandem annotations
                        chromatogram.assign_entity(
                            solutions[0],
                            entity_chromatogram_type=chromatogram.chromatogram.__class__)
                        chromatogram.representative_solutions = solutions
                        out.append(chromatogram)
                    else:
                        # NOTE(review): this logs through the module-level
                        # `log_handle` while the rest of the class uses self.log.
                        log_handle.log("... Could not find an alternative option for %r" % (chromatogram,))
                        out.append(chromatogram)
                else:
                    out.append(chromatogram)
            finished = []
            aggregated = defaultdict(list)
            # NOTE(review): unlike the first pass, this regrouping keys on the
            # entity/composition objects themselves rather than their str().
            for chroma in out:
                if chroma.composition is not None:
                    if chroma.entity is not None:
                        aggregated[chroma.entity].append(chroma)
                    else:
                        aggregated[chroma.composition].append(chroma)
                else:
                    finished.append(chroma)
            for entity, group in aggregated.items():
                out = []
                group = sorted(group, key=lambda x: x.start_time)
                chroma = group[0]
                for obs in group[1:]:
                    if chroma.chromatogram.overlaps_in_time(obs) or (
                            chroma.end_time - obs.start_time) < delta_rt:
                        chroma = chroma.merge(obs)
                    else:
                        out.append(chroma)
                        chroma = obs
                out.append(chroma)
                finished.extend(out)
        return finished

    def __len__(self):
        return len(self.chromatograms)

    def __iter__(self):
        return iter(self.chromatograms)

    def __getitem__(self, i):
        # An int or slice is delegated to the collection; any other value is
        # treated as an iterable of indices.
        if isinstance(i, (int, slice)):
            return self.chromatograms[i]
        else:
            return [self.chromatograms[j] for j in i]
def prune_mass_shifts(self, solutions):
    """Wrap ``solutions`` and pass them to ``prune_bad_mass_shift_branches``."""
    wrapped = ChromatogramFilter(solutions)
    return prune_bad_mass_shift_branches(wrapped)
class ChromatogramMSMSMapper(TaskBase):
    # Maps tandem (MS/MS) solutions onto chromatograms by scan time and mass.
    # Extra log lines are emitted when the module-level `debug_mode` is truthy.
    # NOTE(review): the source of this class was whitespace-collapsed; block
    # nesting below is reconstructed — confirm against the original file.

    def __init__(self, chromatograms, error_tolerance=1e-5, scan_id_to_rt=lambda x: x):
        # Wrap every chromatogram so it can hold tandem solutions, then index
        # the wrapped collection with build_rt_interval_tree.
        self.chromatograms = ChromatogramFilter(
            map(TandemAnnotatedChromatogram, chromatograms))
        self.rt_tree = build_rt_interval_tree(self.chromatograms)
        self.scan_id_to_rt = scan_id_to_rt
        # Solutions for which no chromatogram was found (see find_chromatogram_for).
        self.orphans = []
        self.error_tolerance = error_tolerance

    def find_chromatogram_spanning(self, time):
        # Collect the first element of every interval the tree reports as
        # containing `time`.
        return ChromatogramFilter(
            [interv[0] for interv in self.rt_tree.contains_point(time)])

    def find_chromatogram_for(self, solution):
        # Attach `solution` to a chromatogram that spans its precursor scan
        # time and matches its precursor mass; otherwise record it as an orphan.
        try:
            precursor_scan_time = self.scan_id_to_rt(
                solution.precursor_information.precursor_scan_id)
        except Exception:
            # Fall back to the solution's own scan id when the precursor scan
            # cannot be resolved.
            precursor_scan_time = self.scan_id_to_rt(solution.scan_id)
        overlapping_chroma = self.find_chromatogram_spanning(
            precursor_scan_time)
        chroma = overlapping_chroma.find_mass(
            solution.precursor_information.neutral_mass, self.error_tolerance)
        if chroma is None:
            if debug_mode:
                self.log("... %s is an orphan" % (solution, ))
            self.orphans.append(ScanTimeBundle(solution, precursor_scan_time))
        else:
            if debug_mode:
                self.log("... Assigning %s to %s" % (solution, chroma))
            chroma.tandem_solutions.append(solution)

    def assign_solutions_to_chromatograms(self, solutions):
        # Run find_chromatogram_for on every solution, logging every 1000.
        n = len(solutions)
        for i, solution in enumerate(solutions):
            if i % 1000 == 0:
                self.log("... %d/%d Solutions Handled (%0.2f%%)" %
                         (i, n, (i * 100.0 / n)))
            self.find_chromatogram_for(solution)

    def distribute_orphans(self, threshold_fn=lambda x: x.q_value < 0.05):
        # Give each orphan to the mass-matching chromatogram whose start or end
        # time is closest to the orphan's scan time. Orphans with no
        # mass-matching chromatogram are kept only if they pass threshold_fn.
        lost = []
        n = len(self.orphans)
        n_chromatograms = len(self.chromatograms)
        for j, orphan in enumerate(self.orphans):
            mass = orphan.solution.precursor_ion_mass
            time = orphan.scan_time
            if j % 100 == 0:
                self.log("... %r %d/%d Orphans Handled (%0.2f%%)" %
                         (orphan, j, n, (j * 100.0 / n)))
            candidates = self.chromatograms.find_all_by_mass(
                mass, self.error_tolerance)
            if len(candidates) > 0:
                best_index = 0
                best_distance = float('inf')
                for i, candidate in enumerate(candidates):
                    # Distance to the nearer of the candidate's two time bounds.
                    dist = min(abs(candidate.start_time - time),
                               abs(candidate.end_time - time))
                    if dist < best_distance:
                        best_index = i
                        best_distance = dist
                new_owner = candidates[best_index]
                if debug_mode:
                    self.log(
                        "... Assigning %r to %r with %d existing solutions with distance %0.3f"
                        % (orphan, new_owner, len(
                            new_owner.tandem_solutions), best_distance))
                new_owner.add_displaced_solution(orphan.solution)
            else:
                if threshold_fn(orphan.solution):
                    # The message is suppressed when there are no chromatograms at all.
                    if n_chromatograms > 0:
                        self.log(
                            "No chromatogram found for %r, q-value %0.4f (mass: %0.4f, time: %0.4f)"
                            % (orphan, orphan.solution.q_value, mass, time))
                    lost.append(orphan.solution)
        self.orphans = TandemSolutionsWithoutChromatogram.aggregate(lost)

    def assign_entities(self, threshold_fn=lambda x: x.q_value < 0.05,
                        entity_chromatogram_type=None):
        # For every chromatogram with representative solutions, assign the
        # highest-scoring one as its entity and store the sorted list.
        if entity_chromatogram_type is None:
            entity_chromatogram_type = GlycopeptideChromatogram
        for chromatogram in self:
            solutions = chromatogram.most_representative_solutions(
                threshold_fn)
            if solutions:
                solutions = sorted(solutions, key=lambda x: x.score, reverse=True)
                if debug_mode:
                    self.log("... Assigning %s to %s out of %r\n" %
                             (solutions[0], chromatogram, solutions))
                chromatogram.assign_entity(
                    solutions[0],
                    entity_chromatogram_type=entity_chromatogram_type)
                chromatogram.representative_solutions = solutions

    def merge_common_entities(self, annotated_chromatograms, delta_rt=0.25,
                              require_unmodified=True,
                              threshold_fn=lambda x: x.q_value < 0.05):
        # Merge chromatograms that share an entity (or composition) and either
        # overlap in time or satisfy the delta_rt test below. With
        # require_unmodified, chromatograms lacking the Unmodified mass shift
        # are re-assigned to an unmodified solution where one exists, and the
        # merge is repeated. Returns a plain list.
        aggregated = defaultdict(list)
        finished = []
        self.log("Aggregating Common Entities: %d chromatograms" %
                 (len(annotated_chromatograms, )))
        for chroma in annotated_chromatograms:
            if chroma.composition is not None:
                if chroma.entity is not None:
                    # Convert to string to avoid redundant sequences from getting
                    # binned differently due to random ordering of ids.
                    aggregated[str(chroma.entity)].append(chroma)
                else:
                    aggregated[str(chroma.composition)].append(chroma)
            else:
                # Chromatograms without a composition are passed through unmerged.
                finished.append(chroma)
        for entity, group in aggregated.items():
            out = []
            group = sorted(group, key=lambda x: x.start_time)
            chroma = group[0]
            for obs in group[1:]:
                if chroma.chromatogram.overlaps_in_time(obs) or (
                        chroma.end_time - obs.start_time) < delta_rt:
                    chroma = chroma.merge(obs)
                else:
                    out.append(chroma)
                    chroma = obs
            out.append(chroma)
            finished.extend(out)
        self.log("After merging: %d chromatograms" % (len(finished), ))
        if require_unmodified:
            out = []
            for chromatogram in finished:
                # the structure's best match has not been identified in an unmodified state
                if Unmodified not in chromatogram.mass_shifts:
                    solutions = chromatogram.most_representative_solutions(
                        threshold_fn, reject_shifted=True)
                    # if there is a reasonable solution in an unmodified state
                    if solutions:
                        # select the best solution
                        solutions = sorted(solutions, key=lambda x: x.score, reverse=True)
                        # remove the invalidated mass shifts
                        current_shifts = chromatogram.chromatogram.mass_shifts
                        partitions = []
                        for shift in current_shifts:
                            partition, _ = chromatogram.chromatogram.bisect_mass_shift(
                                shift)
                            partitions.append(
                                partition.deduct_node_type(shift))
                        accumulated_chromatogram = partitions[0]
                        for partition in partitions[1:]:
                            accumulated_chromatogram = accumulated_chromatogram.merge(
                                partition)
                        chromatogram.chromatogram = accumulated_chromatogram
                        # update the tandem annotations
                        chromatogram.assign_entity(
                            solutions[0],
                            entity_chromatogram_type=chromatogram.chromatogram.
                            __class__)
                        chromatogram.representative_solutions = solutions
                        out.append(chromatogram)
                    else:
                        # NOTE(review): this logs through the module-level
                        # `log_handle` while the rest of the class uses self.log.
                        log_handle.log(
                            "... Could not find an alternative option for %r" %
                            (chromatogram, ))
                        out.append(chromatogram)
                else:
                    out.append(chromatogram)
            finished = []
            aggregated = defaultdict(list)
            # NOTE(review): unlike the first pass, this regrouping keys on the
            # entity/composition objects themselves rather than their str().
            for chroma in out:
                if chroma.composition is not None:
                    if chroma.entity is not None:
                        aggregated[chroma.entity].append(chroma)
                    else:
                        aggregated[chroma.composition].append(chroma)
                else:
                    finished.append(chroma)
            for entity, group in aggregated.items():
                out = []
                group = sorted(group, key=lambda x: x.start_time)
                chroma = group[0]
                for obs in group[1:]:
                    if chroma.chromatogram.overlaps_in_time(obs) or (
                            chroma.end_time - obs.start_time) < delta_rt:
                        chroma = chroma.merge(obs)
                    else:
                        out.append(chroma)
                        chroma = obs
                out.append(chroma)
                finished.extend(out)
        return finished

    def __len__(self):
        return len(self.chromatograms)

    def __iter__(self):
        return iter(self.chromatograms)

    def __getitem__(self, i):
        # An int or slice is delegated to the collection; any other value is
        # treated as an iterable of indices.
        if isinstance(i, (int, slice)):
            return self.chromatograms[i]
        else:
            return [self.chromatograms[j] for j in i]
def prune_adducts(self, solutions):
    """Wrap ``solutions`` and prune them with a score margin of 2.5.

    The pruning itself is done by ``prune_bad_adduct_branches``.
    """
    wrapped = ChromatogramFilter(solutions)
    return prune_bad_adduct_branches(wrapped, score_margin=2.5)
class GlycanChromatogramReportCreator(ReportCreatorBase):
    """Render the glycan chromatogram overview report for a single analysis.

    The chromatograms are loaded in :meth:`make_template_stream` and handed
    to the ``overview.templ`` template together with two summary plots.
    """

    def __init__(self, database_path, analysis_id, stream=None, threshold=5):
        """Configure the report creator.

        Parameters
        ----------
        database_path, analysis_id, stream
            Forwarded unchanged to ``ReportCreatorBase.__init__``.
        threshold : number
            Only chromatograms whose ``score`` is strictly greater than this
            value are passed to the summary plot builder.
        """
        super(GlycanChromatogramReportCreator, self).__init__(
            database_path, analysis_id, stream)
        self.set_template_loader(os.path.dirname(__file__))
        self.threshold = threshold
        # Empty placeholders; both are replaced in make_template_stream.
        self.glycan_chromatograms = ChromatogramFilter([])
        self.unidentified_chromatograms = ChromatogramFilter([])

    def glycan_link(self, key):
        """Return ``chromatogram_link`` of the chromatogram found for ``key``.

        Identified glycan chromatograms are searched before unidentified
        ones. Returns ``None`` when neither collection has a match.
        """
        collections = (self.glycan_chromatograms,
                       self.unidentified_chromatograms)
        for collection in collections:
            match = collection.find_key(key)
            if match is not None:
                return chromatogram_link(match)
        return None

    def prepare_environment(self):
        """Register the template filters this report uses on ``self.env``."""
        super(GlycanChromatogramReportCreator, self).prepare_environment()
        self.env.filters["logit"] = logit
        self.env.filters['chromatogram_figures'] = chromatogram_figures
        self.env.filters['glycan_link'] = self.glycan_link

    def make_template_stream(self):
        """Load the analysis results and return the rendered template stream.

        When no glycan composition chromatograms were loaded, a minimal
        placeholder page is streamed instead of the full overview.
        """
        template_obj = self.env.get_template("overview.templ")

        ads = serialize.AnalysisDeserializer(
            self.database_connection._original_connection,
            analysis_id=self.analysis_id)

        gcs = ads.load_glycan_composition_chromatograms()
        self.glycan_chromatograms = gcs

        unidentified_query = ads.query(serialize.UnidentifiedChromatogram).filter(
            serialize.UnidentifiedChromatogram.analysis_id == self.analysis_id)
        und = ChromatogramFilter(unidentified_query.all())
        self.unidentified_chromatograms = und

        if len(gcs) == 0:
            self.log("No glycan compositions were identified. Skipping report building")
            templ = Template(
                " <html> <style> body { font-family: sans-serif; } </style>"
                " <body> <h3>No glycan compositions were identified</h3>"
                " </body> </html> ")
            return templ.stream()

        # Materialize the selection: on Python 3 ``filter`` yields a one-shot
        # iterator, which would be exhausted after a single pass if the
        # builder walks its input more than once. A list is what Python 2's
        # ``filter`` produced here anyway.
        passing = [
            chroma for chroma in gcs + und if chroma.score > self.threshold
        ]
        summary_plot = summaries.GlycanChromatographySummaryGraphBuilder(passing)
        # NOTE(review): min_score is fixed at 5 and does not follow
        # self.threshold -- confirm whether that is intentional.
        lcms_plot, composition_abundance_plot = summary_plot.draw(min_score=5)

        try:
            lcms_plot.ax.legend_.set_visible(False)
        except AttributeError:
            # The legend may not have been created
            pass
        lcms_plot.ax.set_title(
            "Glycan Composition\nLC-MS Aggregated EICs", fontsize=24)

        # Double the size of the LC-MS figure.
        fig = lcms_plot.ax.figure
        fig.set_figwidth(fig.get_figwidth() * 2.)
        fig.set_figheight(fig.get_figheight() * 2.)

        abundance_ax = composition_abundance_plot.ax
        abundance_ax.set_title(
            "Glycan Composition\nTotal Abundances", fontsize=24)
        # Re-apply the existing x label, only changing its font size.
        abundance_ax.set_xlabel(abundance_ax.get_xlabel(), fontsize=14)

        def resolve_key(key):
            # Identified chromatograms take precedence over unidentified.
            match = gcs.find_key(key)
            if match is None:
                match = und.find_key(key)
            return match

        template_stream = template_obj.stream(
            analysis=ads.analysis,
            lcms_plot=svguri_plot(
                lcms_plot.ax, bbox_inches='tight', patchless=True,
                svg_width="100%"),
            composition_abundance_plot=svguri_plot(
                composition_abundance_plot.ax, bbox_inches='tight',
                patchless=True, svg_width="100%"),
            glycan_chromatograms=gcs,
            unidentified_chromatograms=und,
            resolve_key=resolve_key,
        )
        return template_stream
def prune_adducts(self, solutions):
    """Run ``prune_bad_adduct_branches`` over ``solutions``.

    ``solutions`` is wrapped in a ``ChromatogramFilter`` before being
    passed to the helper, whose result is returned unchanged.
    """
    wrapped = ChromatogramFilter(solutions)
    pruned = prune_bad_adduct_branches(wrapped)
    return pruned
last = disjoint_set[0] for case in disjoint_set[1:]: if last.overlaps_in_time(case) or ((case.start_time - last.end_time) < delta_rt): merged = last._merge_missing_only(case) merged.used_as_adduct = list(last.used_as_adduct) for ua in case.used_as_adduct: if ua not in merged.used_as_adduct: merged.used_as_adduct.append(ua) last = merged last.created_at = "join_common_identities" else: accumulated.append(last) last = case accumulated.append(last) out.extend(accumulated) return ChromatogramFilter(out) def find_related_profiles(self, chromatograms, adducts, mass_error_tolerance=1e-5): graph = ChromatogramGraph(chromatograms) graph.find_shared_peaks() components = graph.connected_components() for component in components: component = [node.chromatogram for node in component] if len(component) == 1: continue problem_pairs = set() for a, b in permutations(component, 2): best_err = float('inf') best_match = None mass_shift = a.weighted_neutral_mass - b.weighted_neutral_mass
def find_chromatogram_spanning(self, time):
    """Return the chromatograms whose ``rt_tree`` interval contains ``time``.

    Every interval reported by ``self.rt_tree.contains_point(time)``
    contributes its first element; the collected elements are returned
    wrapped in a ``ChromatogramFilter``.
    """
    members = []
    for interval in self.rt_tree.contains_point(time):
        members.append(interval[0])
    return ChromatogramFilter(members)
def make_template_stream(self):
    """Load the analysis results and return the rendered template stream.

    Glycan composition chromatograms and unidentified chromatograms for
    ``self.analysis_id`` are loaded and stored on ``self``. When no glycan
    composition chromatograms were loaded, a minimal placeholder page is
    streamed instead of the ``overview.templ`` report.
    """
    template_obj = self.env.get_template("overview.templ")

    ads = serialize.AnalysisDeserializer(
        self.database_connection._original_connection,
        analysis_id=self.analysis_id)

    gcs = ads.load_glycan_composition_chromatograms()
    self.glycan_chromatograms = gcs

    unidentified_query = ads.query(serialize.UnidentifiedChromatogram).filter(
        serialize.UnidentifiedChromatogram.analysis_id == self.analysis_id)
    und = ChromatogramFilter(unidentified_query.all())
    self.unidentified_chromatograms = und

    if len(gcs) == 0:
        self.log("No glycan compositions were identified. Skipping report building")
        templ = Template(
            " <html> <style> body { font-family: sans-serif; } </style>"
            " <body> <h3>No glycan compositions were identified</h3>"
            " </body> </html> ")
        return templ.stream()

    # Materialize the selection: on Python 3 ``filter`` yields a one-shot
    # iterator, which would be exhausted after a single pass if the builder
    # walks its input more than once. A list is what Python 2's ``filter``
    # produced here anyway.
    passing = [
        chroma for chroma in gcs + und if chroma.score > self.threshold
    ]
    summary_plot = summaries.GlycanChromatographySummaryGraphBuilder(passing)
    # NOTE(review): min_score is fixed at 5 and does not follow
    # self.threshold -- confirm whether that is intentional.
    lcms_plot, composition_abundance_plot = summary_plot.draw(min_score=5)

    try:
        lcms_plot.ax.legend_.set_visible(False)
    except AttributeError:
        # The legend may not have been created
        pass
    lcms_plot.ax.set_title(
        "Glycan Composition\nLC-MS Aggregated EICs", fontsize=24)

    # Double the size of the LC-MS figure.
    fig = lcms_plot.ax.figure
    fig.set_figwidth(fig.get_figwidth() * 2.)
    fig.set_figheight(fig.get_figheight() * 2.)

    abundance_ax = composition_abundance_plot.ax
    abundance_ax.set_title(
        "Glycan Composition\nTotal Abundances", fontsize=24)
    # Re-apply the existing x label, only changing its font size.
    abundance_ax.set_xlabel(abundance_ax.get_xlabel(), fontsize=14)

    def resolve_key(key):
        # Identified chromatograms take precedence over unidentified ones.
        match = gcs.find_key(key)
        if match is None:
            match = und.find_key(key)
        return match

    template_stream = template_obj.stream(
        analysis=ads.analysis,
        lcms_plot=svguri_plot(
            lcms_plot.ax, bbox_inches='tight', patchless=True,
            svg_width="100%"),
        composition_abundance_plot=svguri_plot(
            composition_abundance_plot.ax, bbox_inches='tight',
            patchless=True, svg_width="100%"),
        glycan_chromatograms=gcs,
        unidentified_chromatograms=und,
        resolve_key=resolve_key,
    )
    return template_stream
class ChromatogramMSMSMapper(TaskBase):
    """Attach tandem solutions to chromatograms and merge shared entities.

    Each input chromatogram is wrapped in ``TandemAnnotatedChromatogram``.
    Solutions are matched to chromatograms by precursor scan time and
    neutral mass; solutions without a match are kept in ``self.orphans``.
    """

    def __init__(self, chromatograms, error_tolerance=1e-5,
                 scan_id_to_rt=lambda x: x):
        """Wrap ``chromatograms`` and index them by retention time.

        Parameters
        ----------
        chromatograms : iterable
            Each element is wrapped in ``TandemAnnotatedChromatogram``.
        error_tolerance : float
            Passed to ``find_mass`` when matching solutions, and multiplied
            by the orphan's mass to build the search window in
            :meth:`distribute_orphans`.
        scan_id_to_rt : callable
            Converts a precursor scan id into a time. Identity by default.
        """
        # A concrete list rather than ``map``: identical on Python 2, and on
        # Python 3 it avoids handing over a one-shot iterator.
        wrapped = [TandemAnnotatedChromatogram(c) for c in chromatograms]
        self.chromatograms = ChromatogramFilter(wrapped)
        self.rt_tree = build_rt_interval_tree(self.chromatograms)
        self.scan_id_to_rt = scan_id_to_rt
        self.orphans = []
        self.error_tolerance = error_tolerance

    def find_chromatogram_spanning(self, time):
        """Return the chromatograms whose ``rt_tree`` interval contains ``time``."""
        return ChromatogramFilter(
            [interv[0] for interv in self.rt_tree.contains_point(time)])

    def find_chromatogram_for(self, solution):
        """Attach ``solution`` to a matching chromatogram or record an orphan.

        The chromatograms spanning the solution's precursor scan time are
        searched by precursor neutral mass. On a hit the solution is
        appended to that chromatogram's ``tandem_solutions``; otherwise a
        ``ScanTimeBundle`` is appended to ``self.orphans``.
        """
        precursor_scan_time = self.scan_id_to_rt(
            solution.precursor_information.precursor_scan_id)
        overlapping_chroma = self.find_chromatogram_spanning(
            precursor_scan_time)
        chroma = overlapping_chroma.find_mass(
            solution.precursor_information.neutral_mass, self.error_tolerance)
        if chroma is None:
            self.orphans.append(ScanTimeBundle(solution, precursor_scan_time))
        else:
            chroma.tandem_solutions.append(solution)

    def assign_solutions_to_chromatograms(self, solutions):
        """Call :meth:`find_chromatogram_for` on every solution."""
        for solution in solutions:
            self.find_chromatogram_for(solution)

    def distribute_orphans(self, threshold_fn=lambda x: x.q_value < 0.05):
        """Re-home orphaned solutions onto the chromatogram nearest in time.

        For each orphan, chromatograms within ``error_tolerance * mass`` of
        its precursor mass are candidates; the one whose start or end time
        is closest to the orphan's scan time receives the solution via
        ``add_displaced_solution``. An orphan with no candidates is logged
        and retained only if ``threshold_fn`` accepts its solution.
        ``self.orphans`` is replaced by the aggregate of retained solutions.
        """
        lost = []
        for orphan in self.orphans:
            mass = orphan.solution.precursor_ion_mass
            window = self.error_tolerance * mass
            candidates = self.chromatograms.mass_between(
                mass - window, mass + window)
            time = orphan.scan_time
            if len(candidates) > 0:
                best_index = 0
                best_distance = float('inf')
                for i, candidate in enumerate(candidates):
                    # Distance from the scan to the nearer chromatogram edge.
                    dist = min(abs(candidate.start_time - time),
                               abs(candidate.end_time - time))
                    if dist < best_distance:
                        best_index = i
                        best_distance = dist
                new_owner = candidates[best_index]
                new_owner.add_displaced_solution(orphan.solution)
            elif threshold_fn(orphan.solution):
                self.log(
                    "No chromatogram found for %r, q-value %0.4f (mass: %0.4f, time: %0.4f)"
                    % (orphan, orphan.solution.q_value, mass, time))
                lost.append(orphan.solution)
        self.orphans = TandemSolutionsWithoutChromatogram.aggregate(lost)

    def assign_entities(self, threshold_fn=lambda x: x.q_value < 0.05,
                        entity_chromatogram_type=None):
        """Assign each chromatogram its highest-scoring representative solution.

        ``entity_chromatogram_type`` defaults to ``GlycopeptideChromatogram``.
        Chromatograms with no representative solutions are left untouched.
        """
        if entity_chromatogram_type is None:
            entity_chromatogram_type = GlycopeptideChromatogram
        for chromatogram in self:
            solutions = chromatogram.most_representative_solutions(
                threshold_fn)
            if solutions:
                solutions = sorted(
                    solutions, key=lambda x: x.score, reverse=True)
                chromatogram.assign_entity(
                    solutions[0],
                    entity_chromatogram_type=entity_chromatogram_type)

    def merge_common_entities(self, annotated_chromatograms):
        """Merge time-overlapping chromatograms that share an entity.

        Chromatograms are grouped by ``entity`` (or by ``composition`` when
        the entity is ``None``); those without a composition are passed
        through unchanged. Returns the list of resulting chromatograms.
        """
        aggregated = defaultdict(list)
        finished = []
        self.log("Aggregating Common Entities: %d chromatograms" % (
            len(annotated_chromatograms), ))
        for chroma in annotated_chromatograms:
            if chroma.composition is None:
                finished.append(chroma)
            elif chroma.entity is not None:
                aggregated[chroma.entity].append(chroma)
                self.log("... %s (%s)" % (chroma.entity, chroma.adducts))
            else:
                aggregated[chroma.composition].append(chroma)

        for group in aggregated.values():
            # The sweep below only compares each chromatogram with the
            # running merge, so the group must be in time order; otherwise
            # which overlaps get merged depends on input order.
            group = sorted(group, key=lambda x: x.start_time)
            merged = []
            current = group[0]
            for obs in group[1:]:
                if current.chromatogram.overlaps_in_time(obs):
                    current = current.merge(obs)
                else:
                    merged.append(current)
                    current = obs
            merged.append(current)
            finished.extend(merged)
        self.log("After merging: %d chromatograms" % (len(finished), ))
        return finished

    def __len__(self):
        return len(self.chromatograms)

    def __iter__(self):
        return iter(self.chromatograms)

    def __getitem__(self, i):
        """Index by int or slice, or by an iterable of indices (returns a list)."""
        if isinstance(i, (int, slice)):
            return self.chromatograms[i]
        else:
            return [self.chromatograms[j] for j in i]