def interval_tree(self):
    """Return the interval tree over ``self.clusters``, building it on demand.

    The tree is cached on ``self._interval_tree``; later calls return the
    cached object without rebuilding.

    Returns
    -------
    object
        Whatever ``IntervalTreeNode.build(self.clusters)`` produced.

    Raises
    ------
    ValueError
        If ``IntervalTreeNode.build`` returns ``None`` for the current
        clusters. The exception carries the message and ``self`` as its
        two arguments.
    """
    cached = self._interval_tree
    if cached is not None:
        return cached

    built = IntervalTreeNode.build(self.clusters)
    # Store the result before validating it, so the attribute always mirrors
    # the outcome of the most recent build attempt.
    self._interval_tree = built
    if built is None:
        raise ValueError(
            "Could not build intervals for peak retrieval with %d clusters" % len(
                self.clusters), self)
    return built
def test_build(self):
    """Check the root and child bounds of a tree built from repeated intervals."""
    # Repeat the fixture intervals many times so that the redundant entries
    # force the interval tree to branch out.
    repeated = self.make_intervals() * 30
    root = IntervalTreeNode.build(repeated)

    # Root bounds.
    assert root.start == 3
    assert root.end == 36

    # Left subtree.
    left_child = root.left
    assert left_child is not None
    assert left_child.start == 3
    assert left_child.end == 12

    # Right subtree.
    right_child = root.right
    assert right_child is not None
    assert right_child.end == 36
    assert right_child.start == 22
def find_solution_for(self, feature):
    """Return the solution mapped to the fit selected for ``feature``.

    The interval tree is queried at ``feature.mz``. When nothing contains
    that point the lookup is delegated to ``_find_fuzzy_solution_for``.
    Otherwise the disjoint best fits of every containing cluster are pooled
    and intersected with the links of the feature's node:

    * exactly one shared fit -> that fit is used;
    * none or several shared fits -> the pooled fit whose
      ``monoisotopic_feature.mz`` is closest to ``feature.mz`` is used
      (a warning is emitted first when several were shared).

    Parameters
    ----------
    feature : object
        Key into ``self.nodes``; must expose an ``mz`` attribute.

    Returns
    -------
    object
        The ``self._solution_map`` entry for the selected fit, or the
        result of ``self._find_fuzzy_solution_for(feature)``.

    Raises
    ------
    ValueError
        If ``self.interval_tree`` is ``None`` and no tree can be built
        from ``self.clusters`` either.
    """
    node = self.nodes[feature]

    index_tree = self.interval_tree
    if index_tree is None:
        # Fall back to building a tree directly from the clusters.
        index_tree = IntervalTreeNode.build(self.clusters)
        if index_tree is None:
            raise ValueError(
                "Could not build intervals for peak retrieval with %d clusters" % len(
                    self.clusters))

    containing = index_tree.contains_point(feature.mz)
    if len(containing) == 0:
        return self._find_fuzzy_solution_for(feature)

    # Collect every cluster's fits first, then flatten them in cluster order.
    fits_per_cluster = [cluster.disjoint_best_fits() for cluster in containing]
    candidates = [fit for fits in fits_per_cluster for fit in fits]

    shared = tuple(set(candidates) & set(node.links))
    if len(shared) == 1:
        return self._solution_map[shared[0]]

    if shared:
        warnings.warn("Too many solutions exist for %r" % feature)

    # If there were no fits for this peak, then it may be that this peak
    # was not included in a fit. Try to find the nearest solution.
    nearest_index = 0
    smallest_error = float('inf')
    for index, candidate in enumerate(candidates):
        error = abs(candidate.monoisotopic_feature.mz - feature.mz)
        if error < smallest_error:
            nearest_index, smallest_error = index, error
    return self._solution_map[candidates[nearest_index]]
def split_protein(self, protein_obj, sites=None):
    """Yield ``Peptide`` records made by cutting a protein's peptides at ``sites``.

    For each site, the peptides whose interval contains ``site - 1`` are
    gathered, and every non-empty combination of the sites falling inside
    their joint span is considered once. Each peptide spanning all sites of
    a combination is cut at those sites; every resulting segment of at least
    ``self.min_length`` residues that has not already been produced (by
    absolute start/end position) is expanded through
    ``self._permuted_peptides`` and yielded as a new ``Peptide``.

    Parameters
    ----------
    protein_obj : object
        Supplies ``peptides.all()``, ``id`` and ``hypothesis_id``, and is
        passed to the sequon-site helpers.
    sites : sequence of int, optional
        Split positions. Iterated more than once, so it must be a
        re-iterable container. ``None`` or empty yields nothing.
        NOTE(review): the ``site - 1`` query suggests a one-position offset
        between site and peptide coordinates -- confirm against the caller.

    Yields
    ------
    Peptide
        One instance per modified variant of each newly seen segment, with
        ``peptide_score`` set to ``0`` and ``peptide_score_type`` set to
        ``'null_score'``.
    """
    if not sites:
        return

    seen = set()
    sites_seen = set()
    peptides = protein_obj.peptides.all()
    # Materialise the wrapped peptides so the tree builder receives a real
    # sequence rather than a one-shot iterator.
    peptide_intervals = IntervalTreeNode.build(
        [PeptideInterval(peptide) for peptide in peptides])
    # The builder can return None when there is nothing to index (see the
    # same check on ``spanned_intervals`` below); no peptides means there is
    # nothing to split.
    if peptide_intervals is None:
        return

    for site in sites:
        overlap_region = peptide_intervals.contains_point(site - 1)
        spanned_intervals = IntervalTreeNode.build(overlap_region)
        # No spanned peptides. May be caused by regions of protein which digest to peptides
        # of unacceptable size.
        if spanned_intervals is None:
            continue
        lo = spanned_intervals.start
        hi = spanned_intervals.end
        # Get the set of all sites spanned by any peptide which spans the current query site
        spanned_sites = [s for s in sites if lo <= s <= hi]
        for i in range(1, len(spanned_sites) + 1):
            for split_sites in itertools.combinations(spanned_sites, i):
                # Each distinct set of sites is processed only once, no
                # matter how many query sites lead to it.
                site_key = frozenset(split_sites)
                if site_key in sites_seen:
                    continue
                sites_seen.add(site_key)

                # Keep only the intervals that contain every site of the
                # combination.
                spanning_peptides_query = spanned_intervals.contains_point(split_sites[0])
                for site_j in split_sites[1:]:
                    spanning_peptides_query = [
                        sp for sp in spanning_peptides_query if site_j in sp
                    ]
                spanning_peptides = []
                for sp in spanning_peptides_query:
                    spanning_peptides.extend(sp)

                for peptide in spanning_peptides:
                    # Cut points relative to the peptide, bracketed by its
                    # two ends.
                    adjusted_sites = [0] + [s - peptide.start_position for s in split_sites] + [
                        peptide.sequence_length]
                    for begin, end in zip(adjusted_sites, adjusted_sites[1:]):
                        if end - begin < self.min_length:
                            continue
                        start_position = begin + peptide.start_position
                        end_position = end + peptide.start_position
                        if (start_position, end_position) in seen:
                            continue
                        seen.add((start_position, end_position))

                        # Slice once per segment instead of once per
                        # modified variant.
                        segment = peptide.base_peptide_sequence[begin:end]
                        base_sequence = str(segment)
                        for modified_peptide, n_variable_modifications in self._permuted_peptides(
                                segment):
                            inst = Peptide(
                                base_peptide_sequence=base_sequence,
                                modified_peptide_sequence=str(modified_peptide),
                                count_missed_cleavages=peptide.count_missed_cleavages,
                                count_variable_modifications=n_variable_modifications,
                                sequence_length=len(modified_peptide),
                                start_position=start_position,
                                end_position=end_position,
                                calculated_mass=modified_peptide.mass,
                                formula=formula(modified_peptide.total_composition()),
                                protein_id=protein_obj.id)
                            inst.hypothesis_id = protein_obj.hypothesis_id
                            inst.peptide_score = 0
                            inst.peptide_score_type = 'null_score'
                            n_glycosites = n_glycan_sequon_sites(
                                inst, protein_obj)
                            o_glycosites = o_glycan_sequon_sites(inst, protein_obj)
                            gag_glycosites = gag_sequon_sites(inst, protein_obj)
                            inst.count_glycosylation_sites = len(n_glycosites)
                            inst.n_glycosylation_sites = sorted(n_glycosites)
                            inst.o_glycosylation_sites = sorted(o_glycosites)
                            inst.gagylation_sites = sorted(gag_glycosites)
                            yield inst
def __init__(self, features):
    """Index ``features`` in an interval tree stored on ``self.rt_tree``.

    Parameters
    ----------
    features : iterable
        Items to index; each one is wrapped in ``RTFeatureNode`` before
        being handed to ``IntervalTreeNode.build``.
    """
    wrapped_features = map(RTFeatureNode, features)
    self.rt_tree = IntervalTreeNode.build(wrapped_features)
def neutral_mass_point_organizer_callback(contained_intervals):
    """Build an interval tree of zero-width intervals keyed on neutral mass.

    Parameters
    ----------
    contained_intervals : iterable
        Nodes exposing a ``neutral_mass`` attribute.

    Returns
    -------
    object
        The result of ``IntervalTreeNode.build`` over one ``Interval`` per
        node, each starting and ending at that node's neutral mass and
        holding the node as its only member.
    """
    point_intervals = []
    for node in contained_intervals:
        mass = node.neutral_mass
        point_intervals.append(Interval(mass, mass, [node]))
    return IntervalTreeNode.build(point_intervals)
def mz_point_organizer_callback(contained_intervals):
    """Build an interval tree of zero-width intervals keyed on m/z.

    Parameters
    ----------
    contained_intervals : iterable
        Nodes exposing an ``mz`` attribute.

    Returns
    -------
    object
        The result of ``IntervalTreeNode.build`` over one ``Interval`` per
        node, each starting and ending at that node's ``mz`` and holding
        the node as its only member.
    """
    point_intervals = []
    for node in contained_intervals:
        position = node.mz
        point_intervals.append(Interval(position, position, [node]))
    return IntervalTreeNode.build(point_intervals)