Example #1
 def __init__(self, database_path, analysis_id, stream=None, threshold=5):
     super(GlycanChromatogramReportCreator, self).__init__(
         database_path, analysis_id, stream)
     self.set_template_loader(os.path.dirname(__file__))
     self.threshold = threshold
     self.glycan_chromatograms = ChromatogramFilter([])
     self.unidentified_chromatograms = ChromatogramFilter([])
Example #2
 def __init__(self, database_path, analysis_id, stream=None, threshold=5):
     super(GlycanChromatogramReportCreator,
           self).__init__(database_path, analysis_id, stream)
     self.set_template_loader(os.path.dirname(__file__))
     self.threshold = threshold
     self.glycan_chromatograms = ChromatogramFilter([])
     self.unidentified_chromatograms = ChromatogramFilter([])
Example #3
 def evaluate(self, chromatograms, delta_rt=0.25, min_points=3, smooth_overlap_rt=True,
              *args, **kwargs):
     filtered = ChromatogramFilter.process(
         chromatograms, delta_rt=delta_rt, min_points=min_points)
     if smooth_overlap_rt:
         filtered = ChromatogramOverlapSmoother(filtered)
     solutions = []
     i = 0
     n = len(filtered)
     for case in filtered:
         start = time.time()
         i += 1
         if self.in_debug_mode():
             self.debug("... Evaluating %r" % (case, ))
         if i % 1000 == 0:
             self.log("... %0.2f%% chromatograms evaluated (%d/%d)" % (i * 100. / n, i, n))
         try:
             sol = self.evaluate_chromatogram(case)
             if self.scoring_model.accept(sol):
                 solutions.append(sol)
             else:
                 if sol.glycan_composition:
                     self.debug("... Rejecting %s with score %s %s" % (
                         sol, sol.score, sol.score_components()))
             end = time.time()
             # Report on anything that took more than 30 seconds to evaluate
             if end - start > 30.0:
                 self.log("... %r took a long time to evaluated (%0.2fs)" % (case, end - start))
         except (IndexError, ValueError):
             continue
     solutions = ChromatogramFilter(solutions)
     return solutions
Example #4
 def __init__(self,
              chromatograms,
              error_tolerance=1e-5,
              scan_id_to_rt=lambda x: x):
     self.chromatograms = ChromatogramFilter(
         map(TandemAnnotatedChromatogram, chromatograms))
     self.rt_tree = build_rt_interval_tree(self.chromatograms)
     self.scan_id_to_rt = scan_id_to_rt
     self.orphans = []
     self.error_tolerance = error_tolerance
Example #5
 def search_all(self, chromatograms, mass_error_tolerance=1e-5):
     matches = []
     chromatograms = ChromatogramFilter(chromatograms)
     self.log("Matching Chromatograms")
     i = 0
     n = len(chromatograms)
     for chro in chromatograms:
         i += 1
         if i % 1000 == 0:
             self.log("... %0.2f%% chromatograms searched (%d/%d)" % (i * 100. / n, i, n))
         matches.extend(self.search(chro, mass_error_tolerance))
     matches = ChromatogramFilter(matches)
     return matches
Example #6
 def run(self):
     self.log("... Begin Extracting Chromatograms")
     self.load_peaks()
     self.log("...... Aggregating Chromatograms")
     self.aggregate_chromatograms()
     self.summary_chromatograms()
     # Ensure chromatograms are wrapped and sorted.
     if self.truncate:
         self.chromatograms = ChromatogramFilter(
             self.truncate_chromatograms(self.chromatograms))
     else:
         self.chromatograms = ChromatogramFilter(self.chromatograms)
     return self.chromatograms
Example #7
    def find_related_profiles(self, chromatograms, mass_shifts, mass_error_tolerance=1e-5):
        self.log("Building Connected Components")
        graph = ChromatogramGraph(chromatograms)
        graph.find_shared_peaks()
        components = graph.connected_components()

        n_components = len(components)
        self.log("Validating %d Components" % (n_components, ))
        for i_components, component in enumerate(components):
            if i_components % 1000 == 0 and i_components > 0:
                self.log("... %d Components Validated (%0.2f%%)" % (
                    i_components,
                    i_components / float(n_components) * 100.))
            if len(component) == 1:
                continue
            component = ChromatogramFilter([node.chromatogram for node in component])

            for a in component:
                pairs = []
                for mass_shift in mass_shifts:
                    bs = component.find_all_by_mass(
                        a.weighted_neutral_mass - mass_shift.mass, mass_error_tolerance)
                    for b in bs:
                        if b != a:
                            pairs.append((mass_shift, b))
                if not pairs:
                    continue
                grouped_pairs = []
                # Sort so pairs that hit the same chromatogram are adjacent, then group them
                pairs.sort(key=lambda x: (x[1].start_time, x[1].weighted_neutral_mass))
                last = [pairs[0]]
                for current in pairs[1:]:
                    if current[1] is last[0][1]:
                        last.append(current)
                    else:
                        grouped_pairs.append(last)
                        last = [current]
                grouped_pairs.append(last)
                unique_pairs = []

                def minimizer(args):
                    mass_shift, b = args
                    return abs(a.weighted_neutral_mass - (b.weighted_neutral_mass + mass_shift.mass))

                # From each group, keep the pairing whose mass shift best explains the observed mass difference
                for pair_group in grouped_pairs:
                    unique_pairs.append(min(pair_group, key=minimizer))

                for mass_shift, b in unique_pairs:
                    used_set = set(b.used_as_mass_shift)
                    used_set.add((a.key, mass_shift))
                    b.used_as_mass_shift = list(used_set)
Example #8
 def aggregate_chromatograms(self):
     forest = ChromatogramForest([], self.grouping_tolerance, self.scan_id_to_rt)
     forest.aggregate_peaks(self.annotated_peaks, self.minimum_mass, self.minimum_intensity)
     chroma = list(forest)
     self.log("%d Chromatograms Extracted." % (len(chroma),))
     self.chromatograms = ChromatogramFilter.process(
         chroma, min_points=self.min_points, delta_rt=self.delta_rt)
Example #9
 def acceptance_filter(self, solutions, threshold=None):
     if threshold is None:
         threshold = self.acceptance_threshold
     return ChromatogramFilter([
         sol for sol in solutions
         if sol.score >= threshold and not sol.used_as_adduct
     ])
Example #10
 def aggregate_chromatograms(self):
     forest = ChromatogramForest([], self.grouping_tolerance, self.scan_id_to_rt)
     forest.aggregate_peaks(self.annotated_peaks, self.minimum_mass, self.minimum_intensity)
     chroma = list(forest)
     self.log("... %d Chromatograms Extracted." % (len(chroma),))
     self.chromatograms = ChromatogramFilter.process(
         chroma, min_points=self.min_points, delta_rt=self.delta_rt)
Example #11
    def evaluate(self,
                 chromatograms,
                 delta_rt=0.25,
                 min_points=3,
                 smooth_overlap_rt=True,
                 *args,
                 **kwargs):
        solutions = super(LogitSumChromatogramEvaluator,
                          self).evaluate(chromatograms,
                                         delta_rt=delta_rt,
                                         min_points=min_points,
                                         smooth_overlap_rt=smooth_overlap_rt,
                                         *args,
                                         **kwargs)

        accumulator = defaultdict(list)
        for case in solutions:
            accumulator[case.key].append(case)
        solutions = []
        for group, members in accumulator.items():
            members = sorted(members, key=lambda x: x.score, reverse=True)
            reference = members[0]
            base = reference.clone()
            for other in members[1:]:
                base = base.merge(other)
            merged = reference.__class__(base,
                                         reference.score,
                                         scorer=reference.scorer,
                                         score_set=reference.score_set)
            solutions.append(merged)
        return ChromatogramFilter(solutions)
Example #12
 def evaluate(self, chromatograms, delta_rt=0.25, min_points=3, smooth_overlap_rt=True,
              *args, **kwargs):
     solutions = super(LogitSumChromatogramEvaluator, self).evaluate(
         chromatograms, delta_rt=delta_rt, min_points=min_points,
         smooth_overlap_rt=smooth_overlap_rt, *args, **kwargs)
     self.log("Collapsing Duplicates")
     accumulator = defaultdict(list)
     for case in solutions:
         accumulator[case.key].append(case)
     solutions = []
     n = len(accumulator)
     i = 0.0
     for group, members in accumulator.items():
         if i % 1000 == 0 and i > 0:
             self.log("... %d groups collapsed (%0.02f%%)" % (i, i / n * 100.0))
         members = sorted(members, key=lambda x: x.score, reverse=True)
         reference = members[0]
         base = reference.clone()
         for other in members[1:]:
             base = base.merge(other, skip_duplicate_nodes=True)
         merged = reference.__class__(
             base, reference.score, scorer=reference.scorer,
             score_set=reference.score_set)
         if self.update_score_on_merge and len(members) > 1:
             aggregated = self.evaluate_chromatogram(merged)
             if aggregated.score > reference.score:
                 merged.score_set = aggregated.score_set
                 merged.score = aggregated.score
         solutions.append(merged)
         i += 1.0
     return ChromatogramFilter(solutions)
Example #13
 def __init__(self, chromatograms, error_tolerance=1e-5, scan_id_to_rt=lambda x: x):
     self.chromatograms = ChromatogramFilter(map(
         TandemAnnotatedChromatogram, chromatograms))
     self.rt_tree = build_rt_interval_tree(self.chromatograms)
     self.scan_id_to_rt = scan_id_to_rt
     self.orphans = []
     self.error_tolerance = error_tolerance
Example #14
 def join_mass_shifted(self, chromatograms, adducts, mass_error_tolerance=1e-5):
     out = []
     i = 0
     n = len(chromatograms)
     self.log("Begin Forward Search")
     for chroma in chromatograms:
         i += 1
         if i % 1000 == 0:
             self.log("... %0.2f%% chromatograms searched (%d/%d)" % (i * 100. / n, i, n))
         add = chroma
         # Fold in any co-eluting chromatogram found at each adduct-shifted mass
         for adduct in adducts:
             query_mass = chroma.weighted_neutral_mass + adduct.mass
             matches = chromatograms.find_all_by_mass(query_mass, mass_error_tolerance)
             for match in matches:
                 if match and span_overlap(add, match):
                     try:
                         match.used_as_adduct.append((add.key, adduct))
                         add = add.merge(match, node_type=adduct)
                         add.created_at = "join_mass_shifted"
                         add.adducts.append(adduct)
                     except DuplicateNodeError as e:
                         e.original = chroma
                         e.to_add = match
                         e.accumulated = add
                         e.adduct = adduct
                         raise e
         out.append(add)
     return ChromatogramFilter(out)
Example #15
 def run(self):
     self.log("... Begin Extracting Chromatograms")
     self.load_peaks()
     self.aggregate_chromatograms()
     self.summary_chromatograms()
     if self.truncate:
         self.chromatograms = ChromatogramFilter(
             self.truncate_chromatograms(self.chromatograms))
     return self.chromatograms
Example #16
 def finalize_matches(self, solutions):
     out = []
     for sol in solutions:
         if sol.score <= self.ignore_below:
             continue
         elif (sol.composition is None) and (Unmodified not in sol.adducts):
             continue
         out.append(sol)
     solutions = ChromatogramFilter(out)
     return solutions
Example #17
    def __init__(self, connection, analysis_name, sample_run,
                 chromatogram_set, glycan_db,
                 chromatogram_extractor):
        AnalysisMigrationBase.__init__(
            self, connection, analysis_name, sample_run,
            chromatogram_extractor)
        self._glycan_hypothesis_migrator = None

        self.glycan_db = glycan_db
        self.chromatogram_set = ChromatogramFilter(chromatogram_set)
        self._index_chromatogram_set()
Example #18
 def process(self, chromatograms, adducts=None, mass_error_tolerance=1e-5, delta_rt=0):
     if adducts is None:
         adducts = []
     matches = []
     chromatograms = ChromatogramFilter(chromatograms)
     matches = self.search_all(chromatograms, mass_error_tolerance)
     matches = self.join_common_identities(matches, delta_rt)
     if adducts:
         self.log("Handling Adducts")
         matches = self.join_mass_shifted(matches, adducts, mass_error_tolerance)
         matches = self.reverse_adduct_search(matches, adducts, mass_error_tolerance)
     matches = self.join_common_identities(matches, delta_rt)
     self.find_related_profiles(matches, adducts, mass_error_tolerance)
     return matches
Example #19
 def load_glycan_composition_chromatograms(self):
     from glycan_profiling.chromatogram_tree import ChromatogramFilter
     node_type_cache = dict()
     scan_id_cache = dict()
     q = self.query(GlycanCompositionChromatogram).filter(
         GlycanCompositionChromatogram.analysis_id ==
         self.analysis_id).yield_per(100)
     chroma = ChromatogramFilter([
         c.convert(
             chromatogram_scoring_model=self.chromatogram_scoring_model,
             node_type_cache=node_type_cache,
             scan_id_cache=scan_id_cache) for c in q
     ])
     return chroma
Example #20
 def evaluate(self,
              chromatograms,
              delta_rt=0.25,
              min_points=3,
              smooth_overlap_rt=True,
              *args,
              **kwargs):
     solutions = super(LaplacianRegularizedChromatogramEvaluator,
                       self).evaluate(chromatograms,
                                      delta_rt=delta_rt,
                                      min_points=min_points,
                                      smooth_overlap_rt=smooth_overlap_rt,
                                      *args,
                                      **kwargs)
     self.log("... Applying Network Smoothing Regularization")
     updated_network, search, params = smooth_network(
         self.network,
         solutions,
         lmbda=self.smoothing_factor,
         lambda_max=self.grid_smoothing_max,
         model_state=self.regularization_model)
     solutions = sorted(solutions, key=lambda x: x.score, reverse=True)
     # TODO - Use aggregation across multiple observations for the same glycan composition
     # instead of discarding all but the top scoring feature?
     seen = dict()
     unannotated = []
     for sol in solutions:
         if sol.glycan_composition is None:
             unannotated.append(sol)
             continue
         if sol.glycan_composition in seen:
             continue
         seen[sol.glycan_composition] = sol
         node = updated_network[sol.glycan_composition]
         if sol.score > self.acceptance_threshold:
             sol.score = node.score
         else:
             # Do not permit network smoothing to boost scores below acceptance_threshold
             if node.score < sol.score:
                 sol.score = node.score
     self.network_parameters = params
     self.grid_search = search
     display_table(search.model.neighborhood_names,
                   np.array(params.tau).reshape((-1, 1)),
                   print_fn=lambda x: self.log("...... %s" % (x, )))
     self.log("...... smoothing factor: %0.3f; threshold: %0.3f" %
              (params.lmbda, params.threshold))
     return ChromatogramFilter(list(seen.values()) + unannotated)
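
The two comments above capture the post-smoothing policy: only the top-scoring feature per glycan composition is kept, and a solution that was already below the acceptance threshold may be penalized but never boosted by network smoothing. A minimal, self-contained sketch of that score-update rule (the function name and numbers are illustrative, not part of the library):

def apply_smoothed_score(solution_score, network_score, acceptance_threshold):
    # Solutions above the acceptance threshold adopt the network-smoothed score
    # outright; weaker solutions may only be lowered by it, never raised.
    if solution_score > acceptance_threshold:
        return network_score
    return min(solution_score, network_score)


# A confident solution takes the smoothed score even when it is lower.
assert apply_smoothed_score(12.0, 10.5, acceptance_threshold=4.0) == 10.5
# A weak solution cannot be boosted above its own score.
assert apply_smoothed_score(2.0, 6.0, acceptance_threshold=4.0) == 2.0
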
Example #21
 def _make_summary_graphics(self):
     try:
         builder = GlycanChromatographySummaryGraphBuilder(
             ChromatogramFilter(self.glycan_chromatograms +
                                self.unidentified_chromatograms))
         chrom, bar = builder.draw(self.score_threshold)
         self.figure_axes['chromatograms_chart'] = chrom
         self.figure_axes['abundance_bar_chart'] = bar
     except ValueError:
         ax = figax()
         ax.text(0.5, 0.5, "No Chromatograms Extracted", ha='center')
         ax.set_axis_off()
         self.figure_axes["chromatograms_chart"] = ArtistBase(ax)
         ax = figax()
         ax.text(0.5, 0.5, "No Entities Matched", ha='center')
         ax.set_axis_off()
         self.figure_axes['abundance_bar_chart'] = ArtistBase(ax)
Example #22
 def _load_chromatograms(self):
     extractor = ChromatogramExtractor(self.scan_loader,
                                       minimum_mass=1000.0,
                                       grouping_tolerance=1.5e-5)
     chromatograms = extractor.run()
     for chrom in chromatograms:
         chrom.mark = False
     idgps = self.analysis_loader.load_identified_glycopeptides()
     for idgp in idgps:
         if idgp.chromatogram is None:
             continue
         for mshift in idgp.mass_shifts:
             chroma = chromatograms.find_all_by_mass(
                 idgp.weighted_neutral_mass + mshift.mass, 1e-5)
             for chrom in chroma:
                 if idgp.chromatogram.overlaps_in_time(chrom):
                     chrom.mark = True
     chromatograms = ChromatogramFilter(
         [chrom for chrom in chromatograms if not chrom.mark] + list(idgps))
     self.identified_structures = idgps
     self.chromatograms = chromatograms
Example #23
    def process_chromatograms(self, processor, peak_loader, database):
        """Extract, match and evaluate chromatograms against the glycan database.

        If MSn scans are available and required, they will be extracted
        and mapped onto chromatograms, and each MSn scan will be searched
        with the pseudo-fragments of the glycans matching the chromatograms
        it maps to.

        Parameters
        ----------
        processor : ChromatogramProcessor
            The container responsible for carrying out the matching
            and evaluation of chromatograms
        peak_loader : RandomAccessScanIterator
            An object which can be used to iterate over MS scans
        database : SearchableMassCollection
            The database of glycan compositions to search against
        """
        if self.require_msms_signature > 0:
            self.log("Extracting MS/MS")
            msms_scans = self.load_msms(peak_loader)
            if len(msms_scans) == 0:
                self.log("No MS/MS scans present. Ignoring requirement.")
                processor.run()
            else:
                matches = processor.match_compositions()
                annotated_matches = self.annotate_matches_with_msms(
                    matches, peak_loader, msms_scans, database)
                # Filter out matches which do not have sufficient signature ion
                # signal from MS2 to be included. Because the MS1 scoring procedure
                # will not preserve the MS2 mapping, we must keep a mapping from
                # chromatogram key to mapped tandem matches to re-align them later
                kept_annotated_matches = []
                key_to_tandem = defaultdict(list)
                for match in annotated_matches:
                    accepted = False
                    best_score = 0
                    key_to_tandem[match.key].extend(match.tandem_solutions)
                    for gsm in match.tandem_solutions:
                        if gsm.score > best_score:
                            best_score = gsm.score
                        if gsm.score > self.require_msms_signature:
                            accepted = True
                            break
                    if accepted:
                        kept_annotated_matches.append(match)
                    else:
                        self.debug(
                            "%s was discarded with insufficient MS/MS evidence %f" % (
                                match, best_score))
                kept_annotated_matches = ChromatogramFilter(kept_annotated_matches)
                processor.evaluate_chromatograms(kept_annotated_matches)
                for solution in processor.solutions:
                    mapped = []
                    try:
                        gsms = key_to_tandem[solution.key]
                        for gsm in gsms:
                            if solution.spans_time_point(gsm.scan_time):
                                mapped.append(gsm)
                        solution.tandem_solutions = mapped
                    except KeyError:
                        solution.tandem_solutions = []
                        continue
                processor.solutions = ChromatogramFilter([
                    solution for solution in processor.solutions
                    if len(solution.tandem_solutions) > 0
                ])
                processor.accepted_solutions = ChromatogramFilter([
                    solution for solution in processor.accepted_solutions
                    if len(solution.tandem_solutions) > 0
                ])
        else:
            processor.run()
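
The docstring and the filtering loop above reduce to a single acceptance rule: a composition match survives only if at least one of its tandem (MSn) solutions scores above the required signature threshold. A minimal sketch of that rule, using (name, scores) tuples in place of the library's annotated chromatogram and spectrum-match objects (all names and values below are illustrative):

def filter_by_msms_signature(matches, require_msms_signature):
    # Keep a match only if its best tandem score clears the signature threshold.
    kept = []
    for name, tandem_scores in matches:
        best_score = max(tandem_scores, default=0)
        if best_score > require_msms_signature:
            kept.append((name, tandem_scores))
    return kept


example_matches = [("match-a", [12.0, 3.5]), ("match-b", [1.2])]
print(filter_by_msms_signature(example_matches, require_msms_signature=4.0))
# -> [('match-a', [12.0, 3.5])]
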
Example #24
    def reverse_adduct_search(self, chromatograms, adducts, mass_error_tolerance=1e-5):
        exclude_compositions = defaultdict(list)
        candidate_chromatograms = []

        new_members = {}
        unmatched = []

        for chroma in chromatograms:
            if chroma.composition is not None:
                exclude_compositions[chroma.composition].append(chroma)
            else:
                candidate_chromatograms.append(chroma)
        n = len(chromatograms)
        i = 0
        self.log("Begin Reverse Search")
        for chroma in candidate_chromatograms:
            i += 1
            if i % 1000 == 0:
                self.log("... %0.2f%% chromatograms searched (%d/%d)" % (i * 100. / n, i, n))
            candidate_mass = chroma.weighted_neutral_mass
            matched = False
            exclude = False
            for adduct in adducts:
                matches = self.match(candidate_mass - adduct.mass, mass_error_tolerance)
                if matches is None:
                    continue
                for match in matches:
                    name = match
                    if name in exclude_compositions:
                        # This chromatogram matches another form of an existing composition
                        # assignment. If it were assigned during `join_mass_shifted`, then
                        # it overlapped with that entity and should not be merged. Otherwise
                        # construct a new match
                        for hit in exclude_compositions[name]:
                            if span_overlap(hit, chroma):
                                exclude = True
                                break
                        else:
                            if name in new_members:
                                chroma_to_update = new_members[name]
                            else:
                                chroma_to_update = self.chromatogram_type(match)
                                chroma_to_update.created_at = "reverse_adduction_search"
                            chroma, _ = chroma.bisect_adduct(Unmodified)
                            chroma_to_update = chroma_to_update.merge(chroma, adduct)
                            chroma_to_update.created_at = "reverse_adduction_search"
                            new_members[name] = chroma_to_update
                            matched = True
                    else:
                        if name in new_members:
                            chroma_to_update = new_members[name]
                        else:
                            chroma_to_update = self.chromatogram_type(match)
                            chroma_to_update.created_at = "reverse_adduction_search"
                        chroma, _ = chroma.bisect_adduct(Unmodified)
                        chroma_to_update = chroma_to_update.merge(chroma, adduct)
                        chroma_to_update.created_at = "reverse_adduction_search"
                        new_members[name] = chroma_to_update
                        matched = True
            if not matched and not exclude:
                unmatched.append(chroma)
        out = []
        out.extend(s for g in exclude_compositions.values() for s in g)
        out.extend(new_members.values())
        out.extend(unmatched)
        return ChromatogramFilter(out)
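
The comment in the loop above encodes the exclusion rule for the reverse search: a candidate chromatogram may only be folded into an existing composition if it does not co-elute with a chromatogram already assigned that composition. A minimal sketch of that overlap test, using (start, end) retention-time tuples in place of the chromatogram objects that `span_overlap` actually receives:

def spans_overlap(a, b):
    # (start, end) retention-time intervals overlap if neither ends before the other begins.
    return a[0] <= b[1] and b[0] <= a[1]


def should_exclude(candidate_span, assigned_spans):
    # Exclude the candidate if it co-elutes with any existing assignment of the composition.
    return any(spans_overlap(candidate_span, span) for span in assigned_spans)


print(should_exclude((10.0, 12.0), [(11.5, 13.0)]))  # True: overlaps, do not merge
print(should_exclude((10.0, 12.0), [(14.0, 15.0)]))  # False: may be merged as a new adduct form
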
Example #25
class ChromatogramMSMSMapper(TaskBase):
    def __init__(self, chromatograms, error_tolerance=1e-5, scan_id_to_rt=lambda x: x):
        self.chromatograms = ChromatogramFilter(map(
            TandemAnnotatedChromatogram, chromatograms))
        self.rt_tree = build_rt_interval_tree(self.chromatograms)
        self.scan_id_to_rt = scan_id_to_rt
        self.orphans = []
        self.error_tolerance = error_tolerance

    def find_chromatogram_spanning(self, time):
        return ChromatogramFilter([interv[0] for interv in self.rt_tree.contains_point(time)])

    def find_chromatogram_for(self, solution):
        try:
            precursor_scan_time = self.scan_id_to_rt(
                solution.precursor_information.precursor_scan_id)
        except Exception:
            precursor_scan_time = self.scan_id_to_rt(solution.scan_id)
        overlapping_chroma = self.find_chromatogram_spanning(precursor_scan_time)
        chroma = overlapping_chroma.find_mass(
            solution.precursor_information.neutral_mass, self.error_tolerance)
        if chroma is None:
            self.orphans.append(ScanTimeBundle(solution, precursor_scan_time))
        else:
            chroma.tandem_solutions.append(solution)

    def assign_solutions_to_chromatograms(self, solutions):
        n = len(solutions)
        for i, solution in enumerate(solutions):
            if i % 1000 == 0:
                self.log("... %d/%d Solutions Handled (%0.2f%%)" % (i, n, (i * 100.0 / n)))
            self.find_chromatogram_for(solution)

    def distribute_orphans(self, threshold_fn=lambda x: x.q_value < 0.05):
        lost = []
        n = len(self.orphans)
        n_chromatograms = len(self.chromatograms)
        for j, orphan in enumerate(self.orphans):
            mass = orphan.solution.precursor_ion_mass
            time = orphan.scan_time
            if j % 100 == 0:
                self.log("... %r %d/%d Orphans Handled (%0.2f%%)" % (orphan, j, n, (j * 100.0 / n)))
            candidates = self.chromatograms.find_all_by_mass(mass, self.error_tolerance)
            if len(candidates) > 0:
                best_index = 0
                best_distance = float('inf')
                for i, candidate in enumerate(candidates):
                    dist = min(abs(candidate.start_time - time), abs(candidate.end_time - time))
                    if dist < best_distance:
                        best_index = i
                        best_distance = dist
                new_owner = candidates[best_index]
                new_owner.add_displaced_solution(orphan.solution)
            else:
                if threshold_fn(orphan.solution):
                    if n_chromatograms > 0:
                        self.log("No chromatogram found for %r, q-value %0.4f (mass: %0.4f, time: %0.4f)" % (
                            orphan, orphan.solution.q_value, mass, time))
                    lost.append(orphan.solution)
        self.orphans = TandemSolutionsWithoutChromatogram.aggregate(lost)

    def assign_entities(self, threshold_fn=lambda x: x.q_value < 0.05, entity_chromatogram_type=None):
        if entity_chromatogram_type is None:
            entity_chromatogram_type = GlycopeptideChromatogram
        for chromatogram in self:
            solutions = chromatogram.most_representative_solutions(threshold_fn)
            if solutions:
                solutions = sorted(solutions, key=lambda x: x.score, reverse=True)
                chromatogram.assign_entity(solutions[0], entity_chromatogram_type=entity_chromatogram_type)
                chromatogram.representative_solutions = solutions

    def merge_common_entities(self, annotated_chromatograms, delta_rt=0.25, require_unmodified=True,
                              threshold_fn=lambda x: x.q_value < 0.05):
        aggregated = defaultdict(list)
        finished = []
        self.log("Aggregating Common Entities: %d chromatograms" % (len(annotated_chromatograms,)))
        for chroma in annotated_chromatograms:
            if chroma.composition is not None:
                if chroma.entity is not None:
                    # Convert to string to prevent redundant sequences from getting
                    # binned differently due to random ordering of ids.
                    aggregated[str(chroma.entity)].append(chroma)
                else:
                    aggregated[str(chroma.composition)].append(chroma)
            else:
                finished.append(chroma)
        for entity, group in aggregated.items():
            out = []
            group = sorted(group, key=lambda x: x.start_time)
            chroma = group[0]
            for obs in group[1:]:
                if chroma.chromatogram.overlaps_in_time(obs) or (
                        chroma.end_time - obs.start_time) < delta_rt:
                    chroma = chroma.merge(obs)
                else:
                    out.append(chroma)
                    chroma = obs
            out.append(chroma)
            finished.extend(out)
        self.log("After merging: %d chromatograms" % (len(finished),))
        if require_unmodified:
            out = []
            for chromatogram in finished:
                # the structure's best match has not been identified in an unmodified state
                if Unmodified not in chromatogram.mass_shifts:
                    solutions = chromatogram.most_representative_solutions(
                        threshold_fn, reject_shifted=True)
                    # if there is a reasonable solution in an unmodified state
                    if solutions:
                        # select the best solution
                        solutions = sorted(solutions, key=lambda x: x.score, reverse=True)

                        # remove the invalidated mass shifts
                        current_shifts = chromatogram.chromatogram.mass_shifts
                        partitions = []
                        for shift in current_shifts:
                            partition, _ = chromatogram.chromatogram.bisect_mass_shift(shift)
                            partitions.append(partition.deduct_node_type(shift))
                        accumulated_chromatogram = partitions[0]
                        for partition in partitions[1:]:
                            accumulated_chromatogram = accumulated_chromatogram.merge(partition)
                        chromatogram.chromatogram = accumulated_chromatogram

                        # update the tandem annotations
                        chromatogram.assign_entity(
                            solutions[0],
                            entity_chromatogram_type=chromatogram.chromatogram.__class__)
                        chromatogram.representative_solutions = solutions
                        out.append(chromatogram)
                    else:
                        log_handle.log("... Could not find an alternative option for %r" % (chromatogram,))
                        out.append(chromatogram)
                else:
                    out.append(chromatogram)
            finished = []
            aggregated = defaultdict(list)
            for chroma in out:
                if chroma.composition is not None:
                    if chroma.entity is not None:
                        aggregated[chroma.entity].append(chroma)
                    else:
                        aggregated[chroma.composition].append(chroma)
                else:
                    finished.append(chroma)
            for entity, group in aggregated.items():
                out = []
                group = sorted(group, key=lambda x: x.start_time)
                chroma = group[0]
                for obs in group[1:]:
                    if chroma.chromatogram.overlaps_in_time(obs) or (
                            chroma.end_time - obs.start_time) < delta_rt:
                        chroma = chroma.merge(obs)
                    else:
                        out.append(chroma)
                        chroma = obs
                out.append(chroma)
                finished.extend(out)
        return finished

    def __len__(self):
        return len(self.chromatograms)

    def __iter__(self):
        return iter(self.chromatograms)

    def __getitem__(self, i):
        if isinstance(i, (int, slice)):
            return self.chromatograms[i]
        else:
            return [self.chromatograms[j] for j in i]
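
The comment in `merge_common_entities` above points at the core pattern: bin annotated chromatograms by the string form of their assigned entity so that equivalent sequences always land in the same bin, sort each bin by start time, and merge neighbours that overlap or whose gap is within `delta_rt`. A stripped-down sketch of that grouping-and-merging pattern, using (entity, start, end) tuples in place of chromatogram objects:

from collections import defaultdict


def merge_by_entity(records, delta_rt=0.25):
    # Bin by the string form of the entity key, then merge time-adjacent spans within each bin.
    bins = defaultdict(list)
    for entity, start, end in records:
        bins[str(entity)].append((start, end))
    merged = []
    for entity, spans in bins.items():
        spans.sort()
        current = spans[0]
        for start, end in spans[1:]:
            if start <= current[1] + delta_rt:
                current = (current[0], max(current[1], end))
            else:
                merged.append((entity, current[0], current[1]))
                current = (start, end)
        merged.append((entity, current[0], current[1]))
    return merged


print(merge_by_entity([("PEPTIDE", 10.0, 12.0), ("PEPTIDE", 12.1, 13.0), ("PEPTIDE", 20.0, 21.0)]))
# -> [('PEPTIDE', 10.0, 13.0), ('PEPTIDE', 20.0, 21.0)]
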
Example #26
 def prune_mass_shifts(self, solutions):
     return prune_bad_mass_shift_branches(ChromatogramFilter(solutions))
Example #27
class ChromatogramMSMSMapper(TaskBase):
    def __init__(self,
                 chromatograms,
                 error_tolerance=1e-5,
                 scan_id_to_rt=lambda x: x):
        self.chromatograms = ChromatogramFilter(
            map(TandemAnnotatedChromatogram, chromatograms))
        self.rt_tree = build_rt_interval_tree(self.chromatograms)
        self.scan_id_to_rt = scan_id_to_rt
        self.orphans = []
        self.error_tolerance = error_tolerance

    def find_chromatogram_spanning(self, time):
        return ChromatogramFilter(
            [interv[0] for interv in self.rt_tree.contains_point(time)])

    def find_chromatogram_for(self, solution):
        try:
            precursor_scan_time = self.scan_id_to_rt(
                solution.precursor_information.precursor_scan_id)
        except Exception:
            precursor_scan_time = self.scan_id_to_rt(solution.scan_id)
        overlapping_chroma = self.find_chromatogram_spanning(
            precursor_scan_time)
        chroma = overlapping_chroma.find_mass(
            solution.precursor_information.neutral_mass, self.error_tolerance)
        if chroma is None:
            if debug_mode:
                self.log("... %s is an orphan" % (solution, ))
            self.orphans.append(ScanTimeBundle(solution, precursor_scan_time))
        else:
            if debug_mode:
                self.log("... Assigning %s to %s" % (solution, chroma))
            chroma.tandem_solutions.append(solution)

    def assign_solutions_to_chromatograms(self, solutions):
        n = len(solutions)
        for i, solution in enumerate(solutions):
            if i % 1000 == 0:
                self.log("... %d/%d Solutions Handled (%0.2f%%)" %
                         (i, n, (i * 100.0 / n)))
            self.find_chromatogram_for(solution)

    def distribute_orphans(self, threshold_fn=lambda x: x.q_value < 0.05):
        lost = []
        n = len(self.orphans)
        n_chromatograms = len(self.chromatograms)
        for j, orphan in enumerate(self.orphans):
            mass = orphan.solution.precursor_ion_mass
            time = orphan.scan_time
            if j % 100 == 0:
                self.log("... %r %d/%d Orphans Handled (%0.2f%%)" %
                         (orphan, j, n, (j * 100.0 / n)))
            candidates = self.chromatograms.find_all_by_mass(
                mass, self.error_tolerance)
            if len(candidates) > 0:
                best_index = 0
                best_distance = float('inf')
                for i, candidate in enumerate(candidates):
                    dist = min(abs(candidate.start_time - time),
                               abs(candidate.end_time - time))
                    if dist < best_distance:
                        best_index = i
                        best_distance = dist
                new_owner = candidates[best_index]
                if debug_mode:
                    self.log(
                        "... Assigning %r to %r with %d existing solutions with distance %0.3f"
                        % (orphan, new_owner, len(
                            new_owner.tandem_solutions), best_distance))
                new_owner.add_displaced_solution(orphan.solution)
            else:
                if threshold_fn(orphan.solution):
                    if n_chromatograms > 0:
                        self.log(
                            "No chromatogram found for %r, q-value %0.4f (mass: %0.4f, time: %0.4f)"
                            % (orphan, orphan.solution.q_value, mass, time))
                    lost.append(orphan.solution)
        self.orphans = TandemSolutionsWithoutChromatogram.aggregate(lost)

    def assign_entities(self,
                        threshold_fn=lambda x: x.q_value < 0.05,
                        entity_chromatogram_type=None):
        if entity_chromatogram_type is None:
            entity_chromatogram_type = GlycopeptideChromatogram
        for chromatogram in self:
            solutions = chromatogram.most_representative_solutions(
                threshold_fn)
            if solutions:
                solutions = sorted(solutions,
                                   key=lambda x: x.score,
                                   reverse=True)
                if debug_mode:
                    self.log("... Assigning %s to %s out of %r\n" %
                             (solutions[0], chromatogram, solutions))
                chromatogram.assign_entity(
                    solutions[0],
                    entity_chromatogram_type=entity_chromatogram_type)
                chromatogram.representative_solutions = solutions

    def merge_common_entities(self,
                              annotated_chromatograms,
                              delta_rt=0.25,
                              require_unmodified=True,
                              threshold_fn=lambda x: x.q_value < 0.05):
        aggregated = defaultdict(list)
        finished = []
        self.log("Aggregating Common Entities: %d chromatograms" %
                 (len(annotated_chromatograms, )))
        for chroma in annotated_chromatograms:
            if chroma.composition is not None:
                if chroma.entity is not None:
                    # Convert to string to prevent redundant sequences from getting
                    # binned differently due to random ordering of ids.
                    aggregated[str(chroma.entity)].append(chroma)
                else:
                    aggregated[str(chroma.composition)].append(chroma)
            else:
                finished.append(chroma)
        for entity, group in aggregated.items():
            out = []
            group = sorted(group, key=lambda x: x.start_time)
            chroma = group[0]
            for obs in group[1:]:
                if chroma.chromatogram.overlaps_in_time(obs) or (
                        chroma.end_time - obs.start_time) < delta_rt:
                    chroma = chroma.merge(obs)
                else:
                    out.append(chroma)
                    chroma = obs
            out.append(chroma)
            finished.extend(out)
        self.log("After merging: %d chromatograms" % (len(finished), ))
        if require_unmodified:
            out = []
            for chromatogram in finished:
                # the structure's best match has not been identified in an unmodified state
                if Unmodified not in chromatogram.mass_shifts:
                    solutions = chromatogram.most_representative_solutions(
                        threshold_fn, reject_shifted=True)
                    # if there is a reasonable solution in an unmodified state
                    if solutions:
                        # select the best solution
                        solutions = sorted(solutions,
                                           key=lambda x: x.score,
                                           reverse=True)

                        # remove the invalidated mass shifts
                        current_shifts = chromatogram.chromatogram.mass_shifts
                        partitions = []
                        for shift in current_shifts:
                            partition, _ = chromatogram.chromatogram.bisect_mass_shift(
                                shift)
                            partitions.append(
                                partition.deduct_node_type(shift))
                        accumulated_chromatogram = partitions[0]
                        for partition in partitions[1:]:
                            accumulated_chromatogram = accumulated_chromatogram.merge(
                                partition)
                        chromatogram.chromatogram = accumulated_chromatogram

                        # update the tandem annotations
                        chromatogram.assign_entity(
                            solutions[0],
                            entity_chromatogram_type=chromatogram.chromatogram.
                            __class__)
                        chromatogram.representative_solutions = solutions
                        out.append(chromatogram)
                    else:
                        log_handle.log(
                            "... Could not find an alternative option for %r" %
                            (chromatogram, ))
                        out.append(chromatogram)
                else:
                    out.append(chromatogram)
            # Re-group and re-merge by entity, since entity assignments may have changed above
            finished = []
            aggregated = defaultdict(list)
            for chroma in out:
                if chroma.composition is not None:
                    if chroma.entity is not None:
                        aggregated[chroma.entity].append(chroma)
                    else:
                        aggregated[chroma.composition].append(chroma)
                else:
                    finished.append(chroma)
            for entity, group in aggregated.items():
                out = []
                group = sorted(group, key=lambda x: x.start_time)
                chroma = group[0]
                for obs in group[1:]:
                    if chroma.chromatogram.overlaps_in_time(obs) or (
                            chroma.end_time - obs.start_time) < delta_rt:
                        chroma = chroma.merge(obs)
                    else:
                        out.append(chroma)
                        chroma = obs
                out.append(chroma)
                finished.extend(out)
        return finished

    def __len__(self):
        return len(self.chromatograms)

    def __iter__(self):
        return iter(self.chromatograms)

    def __getitem__(self, i):
        if isinstance(i, (int, slice)):
            return self.chromatograms[i]
        else:
            return [self.chromatograms[j] for j in i]
Example #28
 def prune_adducts(self, solutions):
     return prune_bad_adduct_branches(ChromatogramFilter(solutions),
                                      score_margin=2.5)
Example #29
class GlycanChromatogramReportCreator(ReportCreatorBase):
    def __init__(self, database_path, analysis_id, stream=None, threshold=5):
        super(GlycanChromatogramReportCreator, self).__init__(
            database_path, analysis_id, stream)
        self.set_template_loader(os.path.dirname(__file__))
        self.threshold = threshold
        self.glycan_chromatograms = ChromatogramFilter([])
        self.unidentified_chromatograms = ChromatogramFilter([])

    def glycan_link(self, key):
        match = self.glycan_chromatograms.find_key(key)
        if match is not None:
            return chromatogram_link(match)
        match = self.unidentified_chromatograms.find_key(key)
        if match is not None:
            return chromatogram_link(match)
        return None

    def prepare_environment(self):
        super(GlycanChromatogramReportCreator, self).prepare_environment()
        self.env.filters["logit"] = logit
        self.env.filters['chromatogram_figures'] = chromatogram_figures
        self.env.filters['glycan_link'] = self.glycan_link

    def make_template_stream(self):
        template_obj = self.env.get_template("overview.templ")

        ads = serialize.AnalysisDeserializer(
            self.database_connection._original_connection,
            analysis_id=self.analysis_id)

        self.glycan_chromatograms = gcs = ads.load_glycan_composition_chromatograms()
        # und = ads.load_unidentified_chromatograms()
        self.unidentified_chromatograms = und = ChromatogramFilter(
            ads.query(serialize.UnidentifiedChromatogram).filter(
                serialize.UnidentifiedChromatogram.analysis_id == self.analysis_id).all())

        if len(gcs) == 0:
            self.log("No glycan compositions were identified. Skipping report building")
            templ = Template('''
                <html>
                <style>
                body {
                    font-family: sans-serif;
                }
                </style>
                <body>
                    <h3>No glycan compositions were identified</h3>
                </body>
                </html>
                ''')
            return templ.stream()

        summary_plot = summaries.GlycanChromatographySummaryGraphBuilder(
            filter(lambda x: x.score > self.threshold, gcs + und))
        lcms_plot, composition_abundance_plot = summary_plot.draw(min_score=5)

        try:
            lcms_plot.ax.legend_.set_visible(False)
        except AttributeError:
            # The legend may not have been created
            pass
        lcms_plot.ax.set_title("Glycan Composition\nLC-MS Aggregated EICs", fontsize=24)

        fig = lcms_plot.ax.figure
        fig.set_figwidth(fig.get_figwidth() * 2.)
        fig.set_figheight(fig.get_figheight() * 2.)

        composition_abundance_plot.ax.set_title("Glycan Composition\nTotal Abundances", fontsize=24)
        composition_abundance_plot.ax.set_xlabel(
            composition_abundance_plot.ax.get_xlabel(), fontsize=14)

        def resolve_key(key):
            match = gcs.find_key(key)
            if match is None:
                match = und.find_key(key)
            return match

        template_stream = (template_obj.stream(
            analysis=ads.analysis, lcms_plot=svguri_plot(
                lcms_plot.ax, bbox_inches='tight', patchless=True,
                svg_width="100%"),
            composition_abundance_plot=svguri_plot(
                composition_abundance_plot.ax, bbox_inches='tight', patchless=True,
                svg_width="100%"),
            glycan_chromatograms=gcs,
            unidentified_chromatograms=und,
            resolve_key=resolve_key
        ))
        return template_stream
Example #30
 def prune_adducts(self, solutions):
     return prune_bad_adduct_branches(ChromatogramFilter(solutions))
Example #31
            last = disjoint_set[0]
            for case in disjoint_set[1:]:
                if last.overlaps_in_time(case) or ((case.start_time - last.end_time) < delta_rt):
                    merged = last._merge_missing_only(case)
                    merged.used_as_adduct = list(last.used_as_adduct)
                    for ua in case.used_as_adduct:
                        if ua not in merged.used_as_adduct:
                            merged.used_as_adduct.append(ua)
                    last = merged
                    last.created_at = "join_common_identities"
                else:
                    accumulated.append(last)
                    last = case
            accumulated.append(last)
            out.extend(accumulated)
        return ChromatogramFilter(out)

    def find_related_profiles(self, chromatograms, adducts, mass_error_tolerance=1e-5):
        graph = ChromatogramGraph(chromatograms)
        graph.find_shared_peaks()
        components = graph.connected_components()

        for component in components:
            component = [node.chromatogram for node in component]
            if len(component) == 1:
                continue
            problem_pairs = set()
            for a, b in permutations(component, 2):
                best_err = float('inf')
                best_match = None
                mass_shift = a.weighted_neutral_mass - b.weighted_neutral_mass
Example #32
 def find_chromatogram_spanning(self, time):
     return ChromatogramFilter(
         [interv[0] for interv in self.rt_tree.contains_point(time)])
Example #33
    def make_template_stream(self):
        template_obj = self.env.get_template("overview.templ")

        ads = serialize.AnalysisDeserializer(
            self.database_connection._original_connection,
            analysis_id=self.analysis_id)

        self.glycan_chromatograms = gcs = ads.load_glycan_composition_chromatograms()
        # und = ads.load_unidentified_chromatograms()
        self.unidentified_chromatograms = und = ChromatogramFilter(
            ads.query(serialize.UnidentifiedChromatogram).filter(
                serialize.UnidentifiedChromatogram.analysis_id ==
                self.analysis_id).all())

        if len(gcs) == 0:
            self.log(
                "No glycan compositions were identified. Skipping report building"
            )
            templ = Template('''
                <html>
                <style>
                body {
                    font-family: sans-serif;
                }
                </style>
                <body>
                    <h3>No glycan compositions were identified</h3>
                </body>
                </html>
                ''')
            return templ.stream()

        summary_plot = summaries.GlycanChromatographySummaryGraphBuilder(
            filter(lambda x: x.score > self.threshold, gcs + und))
        lcms_plot, composition_abundance_plot = summary_plot.draw(min_score=5)

        try:
            lcms_plot.ax.legend_.set_visible(False)
        except AttributeError:
            # The legend may not have been created
            pass
        lcms_plot.ax.set_title("Glycan Composition\nLC-MS Aggregated EICs",
                               fontsize=24)

        fig = lcms_plot.ax.figure
        fig.set_figwidth(fig.get_figwidth() * 2.)
        fig.set_figheight(fig.get_figheight() * 2.)

        composition_abundance_plot.ax.set_title(
            "Glycan Composition\nTotal Abundances", fontsize=24)
        composition_abundance_plot.ax.set_xlabel(
            composition_abundance_plot.ax.get_xlabel(), fontsize=14)

        def resolve_key(key):
            match = gcs.find_key(key)
            if match is None:
                match = und.find_key(key)
            return match

        template_stream = (template_obj.stream(
            analysis=ads.analysis,
            lcms_plot=svguri_plot(lcms_plot.ax,
                                  bbox_inches='tight',
                                  patchless=True,
                                  svg_width="100%"),
            composition_abundance_plot=svguri_plot(
                composition_abundance_plot.ax,
                bbox_inches='tight',
                patchless=True,
                svg_width="100%"),
            glycan_chromatograms=gcs,
            unidentified_chromatograms=und,
            resolve_key=resolve_key))
        return template_stream
class ChromatogramMSMSMapper(TaskBase):
    def __init__(self,
                 chromatograms,
                 error_tolerance=1e-5,
                 scan_id_to_rt=lambda x: x):
        self.chromatograms = ChromatogramFilter(
            map(TandemAnnotatedChromatogram, chromatograms))
        self.rt_tree = build_rt_interval_tree(self.chromatograms)
        self.scan_id_to_rt = scan_id_to_rt
        self.orphans = []
        self.error_tolerance = error_tolerance

    def find_chromatogram_spanning(self, time):
        return ChromatogramFilter(
            [interv[0] for interv in self.rt_tree.contains_point(time)])

    def find_chromatogram_for(self, solution):
        precursor_scan_time = self.scan_id_to_rt(
            solution.precursor_information.precursor_scan_id)
        overlapping_chroma = self.find_chromatogram_spanning(
            precursor_scan_time)
        chroma = overlapping_chroma.find_mass(
            solution.precursor_information.neutral_mass, self.error_tolerance)
        if chroma is None:
            self.orphans.append(ScanTimeBundle(solution, precursor_scan_time))
        else:
            chroma.tandem_solutions.append(solution)

    def assign_solutions_to_chromatograms(self, solutions):
        for solution in solutions:
            self.find_chromatogram_for(solution)

    def distribute_orphans(self, threshold_fn=lambda x: x.q_value < 0.05):
        lost = []
        for orphan in self.orphans:
            mass = orphan.solution.precursor_ion_mass
            window = self.error_tolerance * mass
            candidates = self.chromatograms.mass_between(
                mass - window, mass + window)
            time = orphan.scan_time
            if len(candidates) > 0:
                best_index = 0
                best_distance = float('inf')
                for i, candidate in enumerate(candidates):
                    dist = min(abs(candidate.start_time - time),
                               abs(candidate.end_time - time))
                    if dist < best_distance:
                        best_index = i
                        best_distance = dist
                new_owner = candidates[best_index]
                new_owner.add_displaced_solution(orphan.solution)
            else:
                if threshold_fn(orphan.solution):
                    self.log(
                        "No chromatogram found for %r, q-value %0.4f (mass: %0.4f, time: %0.4f)"
                        % (orphan, orphan.solution.q_value, mass, time))
                    lost.append(orphan.solution)
        self.orphans = TandemSolutionsWithoutChromatogram.aggregate(lost)

    def assign_entities(self,
                        threshold_fn=lambda x: x.q_value < 0.05,
                        entity_chromatogram_type=None):
        if entity_chromatogram_type is None:
            entity_chromatogram_type = GlycopeptideChromatogram
        for chromatogram in self:
            solutions = chromatogram.most_representative_solutions(
                threshold_fn)
            if solutions:
                solutions = sorted(solutions,
                                   key=lambda x: x.score,
                                   reverse=True)
                chromatogram.assign_entity(
                    solutions[0],
                    entity_chromatogram_type=entity_chromatogram_type)

    def merge_common_entities(self, annotated_chromatograms):
        aggregated = defaultdict(list)
        finished = []
        self.log("Aggregating Common Entities: %d chromatograms" %
                 (len(annotated_chromatograms, )))
        for chroma in annotated_chromatograms:
            if chroma.composition is not None:
                if chroma.entity is not None:
                    aggregated[chroma.entity].append(chroma)
                    self.log("... %s (%s)" % (chroma.entity, chroma.adducts))
                else:
                    aggregated[chroma.composition].append(chroma)
            else:
                finished.append(chroma)
        for entity, group in aggregated.items():
            out = []
            chroma = group[0]
            for obs in group[1:]:
                if chroma.chromatogram.overlaps_in_time(obs):
                    chroma = chroma.merge(obs)
                else:
                    out.append(chroma)
                    chroma = obs
            out.append(chroma)
            finished.extend(out)
        self.log("After merging: %d chromatograms" % (len(finished), ))
        return finished

    def __len__(self):
        return len(self.chromatograms)

    def __iter__(self):
        return iter(self.chromatograms)

    def __getitem__(self, i):
        if isinstance(i, (int, slice)):
            return self.chromatograms[i]
        else:
            return [self.chromatograms[j] for j in i]