def test_empty_queries():
    """Every query against a freshly constructed tree must report emptiness."""
    tree = IntervalTree()
    nothing = set()

    # Size and emptiness flag.
    assert len(tree) == 0
    assert tree.is_empty()

    # Point lookup and slice lookup both come back empty.
    assert tree[3] == nothing
    assert tree[4:6] == nothing

    # An empty tree reports 0 for both of its bounds.
    assert tree.begin() == 0
    assert tree.end() == 0
    assert tree[tree.begin():tree.end()] == nothing

    # Whole-tree views, including a copy, hold no intervals.
    assert tree.items() == nothing
    assert set(tree) == nothing
    assert set(tree.copy()) == nothing
    assert tree.find_nested() == {}

    tree.verify()
def test_empty_queries():
    """Every query against a freshly constructed tree must report emptiness."""
    tree = IntervalTree()
    nothing = set()

    # Size and emptiness flag.
    assert len(tree) == 0
    assert tree.is_empty()

    # Point lookup and slice lookup both come back empty.
    assert tree[3] == nothing
    assert tree[4:6] == nothing

    # An empty tree reports 0 for both of its bounds.
    assert tree.begin() == 0
    assert tree.end() == 0
    assert tree[tree.begin():tree.end()] == nothing

    # Whole-tree views, including a copy, hold no intervals.
    assert tree.items() == nothing
    assert set(tree) == nothing
    assert set(tree.copy()) == nothing
    assert tree.find_nested() == {}

    # The spanning range of an empty tree is a null, zero-length interval.
    assert tree.range().is_null()
    assert tree.range().length() == 0

    tree.verify()
def test_copy():
    """Contrast copies that share Interval objects with a pickled deep copy."""
    source = IntervalTree([Interval(0, 1, "x"), Interval(1, 2, ["x"])])
    source.verify()

    via_ctor = IntervalTree(source)  # Shares Interval objects
    via_ctor.verify()

    # Shallow copy (same as above, as Intervals are singletons)
    via_copy = source.copy()
    via_copy.verify()

    via_pickle = pickle.loads(pickle.dumps(source))  # Deep copy
    via_pickle.verify()

    # Mutate the list payload of the interval found at point 1, in place.
    hit = list(source[1])[0]
    hit.data[0] = "y"

    # The source and both sharing copies observe the mutation ...
    mutated = [Interval(0, 1, 'x'), Interval(1, 2, ['y'])]
    assert sorted(source) == mutated
    assert sorted(via_ctor) == mutated
    assert sorted(via_copy) == mutated

    # ... while the pickled round-trip still holds the old payload.
    untouched = [Interval(0, 1, 'x'), Interval(1, 2, ['x'])]
    assert sorted(via_pickle) == untouched
def test_copy():
    """Check which kinds of tree copy see an in-place change to interval data."""
    base = IntervalTree([Interval(0, 1, "x"), Interval(1, 2, ["x"])])
    base.verify()

    # Shares Interval objects
    rebuilt = IntervalTree(base)
    rebuilt.verify()

    # Shallow copy (same as above, as Intervals are singletons)
    shallow = base.copy()
    shallow.verify()

    # Deep copy
    deep = pickle.loads(pickle.dumps(base))
    deep.verify()

    # Rewrite the first element of the list carried by the interval at 1.
    list(base[1])[0].data[0] = "y"

    after_edit = [Interval(0, 1, 'x'), Interval(1, 2, ['y'])]
    before_edit = [Interval(0, 1, 'x'), Interval(1, 2, ['x'])]

    for shared in (base, rebuilt, shallow):
        assert sorted(shared) == after_edit
    assert sorted(deep) == before_edit
def seq_from_exons_introns(exons, introns, join=True):
    """
    Merges exons and introns and returns sequence

    Note that exons and introns are in different formats:

    - Exons are tuples of (exn_num, chrom, start, stop, strand, gene)
      (should be nonoverlapping). All exons must share one chromosome and
      one strand; this is asserted.
    - Introns are tuples of (intron_seq, intron_gcoords) that are
      nonoverlapping by construction. Each gcoord is parsed as
      ``"chrom:start-stop:strand"``.

    We ignore the intron sequence IN CASE the coords overlap: overlapping
    intervals are merged and every sequence is fetched again from
    ``GENOME_FA``.

    Returns the joined string when ``join`` is true, otherwise the list of
    per-interval sequences.
    """
    itree = IntervalTree()
    chroms = set()
    strands = set()
    for exn_num, chrom, start, stop, strand, _gene in exons:
        chroms.add(chrom)
        strands.add(strand)
        itree[start:stop] = f"exon_{exn_num}"
    assert len(chroms) == 1
    chrom = chroms.pop()
    assert len(strands) == 1
    strand = strands.pop()

    for i, gcoord in enumerate(introns[1]):
        # NOTE(review): this rebinds chrom/strand from the intron coordinate,
        # so the fetch below uses the last intron's values when introns are
        # present -- presumably identical to the exon values; confirm.
        chrom, startstop, strand = gcoord.split(":")
        start, stop = map(int, startstop.split("-"))
        itree[start:stop] = f"ri_{i}"

    # Merge overlapping features, joining their labels, and report when the
    # merge actually collapsed something.
    itree_orig = itree.copy()
    itree.merge_overlaps(lambda x, y: ";".join([x, y]))
    if len(itree) != len(itree_orig):
        logging.warning("Contains overlaps: %s", itree_orig)

    # Iterating an IntervalTree gives no ordering guarantee, so sort
    # explicitly to walk the intervals in ascending coordinate order.
    # NOTE(review): pieces are concatenated in ascending order for both
    # strands -- confirm this is the intended order for "-" strand genes.
    seqs = []
    for interval in sorted(itree):
        # Actual sequences are rev comped properly
        seq = GENOME_FA.get_seq(chrom, interval.begin, interval.end, strand == "-")
        assert seq.seq
        seqs.append(seq.seq)
    return ''.join(seqs) if join else seqs
def test_empty_queries():
    """Every query against a freshly constructed tree must report emptiness."""
    tree = IntervalTree()
    nothing = set()

    # Size and emptiness flag.
    assert len(tree) == 0
    assert tree.is_empty()

    # Point lookup and slice lookup both come back empty.
    assert tree[3] == nothing
    assert tree[4:6] == nothing

    # An empty tree reports 0 for both of its bounds.
    assert tree.begin() == 0
    assert tree.end() == 0

    # Range queries over the tree's own bounds, in all three spellings.
    assert tree[tree.begin():tree.end()] == nothing
    assert tree.overlap(tree.begin(), tree.end()) == nothing
    assert tree.envelop(tree.begin(), tree.end()) == nothing

    # Whole-tree views, including a copy, hold no intervals.
    assert tree.items() == nothing
    assert set(tree) == nothing
    assert set(tree.copy()) == nothing
    assert tree.find_nested() == {}

    # The spanning range of an empty tree is a null, zero-length interval.
    assert tree.range().is_null()
    assert tree.range().length() == 0

    tree.verify()
def get_intron_tree(pysamAlignment, chrID):
    '''
    Collect the introns of one reference sequence into an IntervalTree with
    format: (intron start, intron end, num of support), then tally summary
    statistics over the candidate junctions of every merged intron region.

    Parameter:
        pysamAlignment: alignment object; must provide ``fetch`` and
            ``find_introns`` (e.g. an opened pysam.AlignmentFile).
        chrID: <string> reference name handed to ``fetch``.

    NOTE(review): the tallies computed below are neither returned nor
    reported in this function, which returns None -- confirm whether a
    return value was intended.
    '''
    # The original body referred to an undefined name here instead of the
    # ``pysamAlignment`` parameter.
    f_fetch = pysamAlignment.fetch(chrID)  # reads iterator
    f_introns = pysamAlignment.find_introns(f_fetch)  # dictionary

    # convert dictionary to IntervalTree
    intron_tree = IntervalTree()
    for (begin, end), data in f_introns.items():
        intron_tree.addi(begin, end, data)

    # build non overlapped ranges on a copy so intron_tree stays intact
    intron_tree_non_overlaps = intron_tree.copy()
    intron_tree_non_overlaps.merge_overlaps()

    count = 0
    single_support = 0
    total_candidate = 0
    total_junc_with_in_read = 0
    for interval in intron_tree_non_overlaps:
        # NOTE(review): the meaning of the trailing 10 is defined by
        # find_candidate, which is not visible here.
        candidates = find_candidate(interval.begin, interval.end, intron_tree, 10)

        # some statistics
        total_candidate += len(candidates)
        total_junc_with_in_read += sum(x.data for x in candidates)
        single_support += sum(x.data for x in candidates if x.data < 3)
        count += 1
class FeatureSet(object):
    """
    An ordered collection of :class:`SeqFeature` objects.

    Features are kept in an interval tree keyed by their start/end, and are
    yielded in sorted interval order.

    :param type feature_class: type of the features stored in the collection;
        defaults to :class:`SeqFeature` and must inherit from it.
    :raises RuntimeError: if ``feature_class`` does not inherit from
        :class:`SeqFeature`.
    """

    def __init__(self, feature_class=None):
        if feature_class is None:
            feature_class = SeqFeature
        elif not issubclass(feature_class, SeqFeature):
            raise RuntimeError(
                "FeatureSet expects a feature class that inherits from SeqFeature"
            )

        self._features = IntervalTree()
        self._feature_class = feature_class

    def __or__(self, other):
        # ``a | b`` is the union; this previously delegated to difference().
        return self.union(other)

    def __sub__(self, other):
        # ``a - b`` is the difference.
        return self.difference(other)

    def __len__(self):
        return len(self._features)

    def __iter__(self):
        # The tree itself is unordered; sort so iteration is positional.
        for f in sorted(self._features):
            yield f.data

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__, list(self))

    def _wrap_feature(self, feature):
        """
        Wrap ``feature`` in an :class:`Interval` spanning its coordinates.

        :raises ValueError: if ``feature`` is of an unsupported type.
        """
        if isinstance(feature, SeqFeature):
            return Interval(feature.location.start, feature.location.end, feature)
        elif isinstance(feature, (self._feature_class, Feature)):
            return Interval(feature.start, feature.end, feature)
        else:
            raise ValueError(
                "feature must be one of Bio.SeqFeature, co.Feature, %s" %
                self._feature_class)

    def copy(self):
        """
        :returns: a copy of this collection
        :rtype: :class:`FeatureSet`
        """
        fs = FeatureSet(feature_class=self._feature_class)
        fs._features = self._features.copy()
        return fs

    def add(self, *args, **kwargs):
        """
        Creates a feature object from the given ``args`` and ``kwargs`` and
        adds it to the collection.

        :rtype: :class:`SeqFeature`
        """
        feature = self._feature_class(*args, **kwargs)
        self._features.add(self._wrap_feature(feature))
        return feature

    def remove(self, feature):
        """
        Removes the given feature from the collection
        """
        self._features.remove(self._wrap_feature(feature))

    def find(self, between_start=None, between_end=None, type=None, id=None,
             strand=None, **qualifiers):
        """
        Iterate over all features matching the search parameters.

        - ``between_start`` and ``between_end`` can be used to restrict the
          search range. A missing start defaults to ``0`` and a missing end
          to ``sys.maxsize``; an explicit ``0`` is honoured as a real bound.
        - ``type``, ``id``, and ``strand`` each restrict the search to
          features that match on these attributes
        - ``qualifiers`` is an arbitrary group of keyword arguments that will
          be matched to the qualifier keys of each feature. Each key must be
          present and have the same value as in the search parameters.
        """
        # Compare against None rather than truthiness so a bound of 0 is not
        # mistaken for "not given".
        if between_start is not None or between_end is not None:
            start = 0 if between_start is None else between_start
            end = sys.maxsize if between_end is None else between_end
            it = self.overlap(start, end)
        else:
            it = iter(self)

        attrs = [(k, v)
                 for k, v in (('type', type), ('id', id), ('strand', strand))
                 if v is not None]

        for feature in it:
            if any(getattr(feature, key) != value for key, value in attrs):
                continue
            if any(feature.qualifiers.get(key) != value
                   for key, value in qualifiers.items()):
                continue
            yield feature

    def overlap(self, start, end):
        """
        Returns an iterator over all features in the collection that overlap
        the given range.

        :param int start: overlap region start
        :param int end: overlap region end
        :raises RuntimeError: if ``start`` is larger than ``end``.
        """
        if start > end:
            raise RuntimeError("start cannot be larger than end.")

        # The query is widened by one so that ``end`` itself is included.
        for f in sorted(self._features.search(start, end + 1)):
            yield f.data

    def difference(self, other):
        """
        :returns: a new collection with the features of this collection that
            are not in ``other``
        :rtype: :class:`FeatureSet`
        """
        fs = self.copy()
        fs._features = self._features - other._features
        return fs

    def union(self, other):
        """
        :returns: a new collection with the features of both collections
        :rtype: :class:`FeatureSet`
        """
        fs = self.copy()
        fs._features = self._features | other._features
        return fs
def test_symmetric_difference():
    """Exercise symmetric_difference and its in-place variant on trees."""
    a = IntervalTree.from_tuples(data.ivs1.data)
    b = IntervalTree.from_tuples(data.ivs2.data)
    e = IntervalTree()

    # symdiffs with e
    assert a.symmetric_difference(e) == a
    ae = a.copy()
    ae.symmetric_difference_update(e)
    assert ae == a

    assert b.symmetric_difference(e) == b
    be = b.copy()
    be.symmetric_difference_update(e)
    assert be == b

    assert e.symmetric_difference(e) == e
    ee = e.copy()
    ee.symmetric_difference_update(e)
    assert ee == e

    # symdiff with self
    assert a.symmetric_difference(a) == e
    aa = a.copy()
    aa.symmetric_difference_update(a)
    assert aa == e

    assert b.symmetric_difference(b) == e
    bb = b.copy()
    bb.symmetric_difference_update(b)
    assert bb == e

    # commutativity on disjoint trees: nothing cancels out
    ab = a.symmetric_difference(b)
    ba = b.symmetric_difference(a)
    ab.verify()
    ba.verify()
    assert ab == ba
    assert len(ab) == len(a) + len(b)  # no overlaps, so sum

    ab = a.copy()
    ab.symmetric_difference_update(b)
    ba = b.copy()
    ba.symmetric_difference_update(a)
    ab.verify()
    ba.verify()
    assert ab == ba
    assert len(ab) == len(a) + len(b)  # no overlaps, so sum

    # commutativity on non-overlapping sets
    ab = a.union(b)
    ba = b.union(a)
    aba = ab.symmetric_difference(a)
    abb = ab.symmetric_difference(b)
    bab = ba.symmetric_difference(b)
    baa = ba.symmetric_difference(a)
    aba.verify()
    abb.verify()
    bab.verify()
    baa.verify()
    assert aba == b
    assert abb == a
    assert bab == a
    assert baa == b

    ab = a.union(b)
    ba = b.union(a)
    aba = ab.copy()
    aba.symmetric_difference_update(a)
    abb = ab.copy()
    abb.symmetric_difference_update(b)
    bab = ba.copy()
    bab.symmetric_difference_update(b)
    baa = ba.copy()
    baa.symmetric_difference_update(a)
    aba.verify()
    abb.verify()
    bab.verify()
    baa.verify()
    assert aba == b
    assert abb == a
    assert bab == a
    assert baa == b

    # commutativity with overlapping sets
    c = IntervalTree.from_tuples(data.ivs3.data)
    bc = b.symmetric_difference(c)
    cb = c.symmetric_difference(b)
    bc.verify()
    cb.verify()
    assert bc == cb
    assert len(bc) > 0
    assert len(bc) < len(b) + len(c)
    # (13, 23) is in both inputs, so it drops out of the result
    assert b.containsi(13, 23)
    assert c.containsi(13, 23)
    assert not bc.containsi(13, 23)
    # intervals present in only one input survive
    assert c.containsi(819, 828)
    assert not b.containsi(819, 828)
    assert b.containsi(0, 1)
    assert not c.containsi(0, 1)
    assert bc.containsi(819, 828)
    assert bc.containsi(0, 1)

    bc = b.copy()
    bc.symmetric_difference_update(c)
    cb = c.copy()
    cb.symmetric_difference_update(b)
    bc.verify()
    cb.verify()
    assert bc == cb
    assert len(bc) > 0
    assert len(bc) < len(b) + len(c)
    assert b.containsi(13, 23)
    assert c.containsi(13, 23)
    assert not bc.containsi(13, 23)
    assert c.containsi(819, 828)
    assert not b.containsi(819, 828)
    assert b.containsi(0, 1)
    assert not c.containsi(0, 1)
    assert bc.containsi(819, 828)
    assert bc.containsi(0, 1)
def test_intersection():
    """Exercise intersection and its in-place variant on trees."""
    a = IntervalTree.from_tuples(data.ivs1.data)
    b = IntervalTree.from_tuples(data.ivs2.data)
    e = IntervalTree()

    # intersections with e
    assert a.intersection(e) == e
    ae = a.copy()
    ae.intersection_update(e)
    assert ae == e

    assert b.intersection(e) == e
    be = b.copy()
    be.intersection_update(e)
    assert be == e

    assert e.intersection(e) == e
    ee = e.copy()
    ee.intersection_update(e)
    assert ee == e

    # intersections with self
    assert a.intersection(a) == a
    aa = a.copy()
    aa.intersection_update(a)
    assert aa == a

    assert b.intersection(b) == b
    bb = b.copy()
    # Use the in-place variant so the assertion below actually checks it.
    bb.intersection_update(b)
    assert bb == b

    # commutativity resulting in empty
    ab = a.intersection(b)
    ba = b.intersection(a)
    ab.verify()
    ba.verify()
    assert ab == ba
    assert len(ab) == 0  # no overlaps, so empty tree

    ab = a.copy()
    ab.intersection_update(b)
    ba = b.copy()
    ba.intersection_update(a)
    ab.verify()
    ba.verify()
    assert ab == ba
    assert len(ab) == 0  # no overlaps, so empty tree

    # commutativity on non-overlapping sets
    ab = a.union(b)
    ba = b.union(a)
    aba = ab.intersection(a)  # these should yield no change
    abb = ab.intersection(b)
    bab = ba.intersection(b)
    baa = ba.intersection(a)
    aba.verify()
    abb.verify()
    bab.verify()
    baa.verify()
    assert aba == a
    assert abb == b
    assert bab == b
    assert baa == a

    ab = a.union(b)
    ba = b.union(a)
    aba = ab.copy()
    aba.intersection_update(a)  # these should yield no change
    abb = ab.copy()
    abb.intersection_update(b)
    bab = ba.copy()
    bab.intersection_update(b)
    baa = ba.copy()
    baa.intersection_update(a)
    aba.verify()
    abb.verify()
    bab.verify()
    baa.verify()
    assert aba == a
    assert abb == b
    assert bab == b
    assert baa == a

    # commutativity with overlapping sets
    c = IntervalTree.from_tuples(data.ivs3.data)
    bc = b.intersection(c)
    cb = c.intersection(b)
    bc.verify()
    cb.verify()
    assert bc == cb
    assert len(bc) < len(b)
    assert len(bc) < len(c)
    assert len(bc) > 0
    # (13, 23) is in both inputs, so it is kept
    assert b.containsi(13, 23)
    assert c.containsi(13, 23)
    assert bc.containsi(13, 23)
    # intervals missing from one input are missing from the result
    assert not b.containsi(819, 828)
    assert not c.containsi(0, 1)
    assert not bc.containsi(819, 828)
    assert not bc.containsi(0, 1)

    bc = b.copy()
    bc.intersection_update(c)
    cb = c.copy()
    cb.intersection_update(b)
    bc.verify()
    cb.verify()
    assert bc == cb
    assert len(bc) < len(b)
    assert len(bc) < len(c)
    assert len(bc) > 0
    assert b.containsi(13, 23)
    assert c.containsi(13, 23)
    assert bc.containsi(13, 23)
    assert not b.containsi(819, 828)
    assert not c.containsi(0, 1)
    assert not bc.containsi(819, 828)
    assert not bc.containsi(0, 1)
def filterOverlaps(self, overlapPercCutoff=70):
    """Filtering out amplicons that substantially overlap.

    The amplicon with the highest PPC will be kept.
    The MFEprimerRes attribute must be set.
    in-place edit of MFEprimerRes object (table object filtered of overlaps)

    Parameters
    ----------
    overlapPercCutoff : float
        percent of overlap to consider 'substantially' overlapping

    Raises
    ------
    AttributeError
        if the MFEprimerRes attribute is None
    TypeError
        if a row has BindingStart greater than BindingStop
    """
    if self.MFEprimerRes is None:
        msg = 'genome object does not have MFEprimerRes attribute.' + \
              ' Run MFEprimer() first'
        # Call form of raise: valid on both Python 2 and Python 3.
        raise AttributeError(msg)

    # making interval tree
    tree = IntervalTree()

    # loading intervals; each one remembers the row's *position* because the
    # final selection uses positional .iloc (iterrows yields index labels,
    # which only coincide with positions for a default index)
    for pos, (_, row) in enumerate(self.MFEprimerRes.iterrows()):
        # sanity check for + strand
        if row['BindingStart'] > row['BindingStop']:
            raise TypeError('MFEprimer binding start-stop is not + strand')
        tree.addi(row['BindingStart'], row['BindingStop'],
                  [pos, row['PPC'], row['Size']])

    # finding all that substantially overlap; keeping one with > PPC
    # (tree stays intact for querying, removals happen on tree2)
    tree2 = tree.copy()
    for iv1 in tree.iter():
        # skipping if already removed from tree2
        if iv1 not in tree2:
            continue

        overlaps = tree.search(iv1.begin, iv1.end)

        # skipping those that poorly overlap
        lowOverlap = set()
        for iv2 in overlaps:
            if iv1.overlaps(iv2):
                percOverlaps = self._calcPercOverlap(iv1, iv2)
                if percOverlaps[0] < overlapPercCutoff:
                    lowOverlap.add(iv2)
        overlaps = overlaps - lowOverlap  # just substantially overlapping

        # skipping those that have been already removed
        prevRm = set(x for x in overlaps if x not in tree2)
        overlaps = overlaps - prevRm

        # removing all substantially overlapping intervals with lower PPC
        if len(overlaps) > 1:
            overlaps = sorted(overlaps, key=lambda x: x.data[1], reverse=True)
            for o in overlaps[1:]:
                if o in tree2:
                    tree2.remove(o)

    # selecting the surviving rows, in their original table order
    iv_idx = sorted(x.data[0] for x in tree2.iter())
    self.MFEprimerRes = self.MFEprimerRes.iloc[iv_idx]
class Chromosome:
    """
    Copy-number events of one chromosome, kept per allele.

    Events are stored in two interval trees (paternal and maternal); the
    ``calc_*`` methods collapse them into non-overlapping segments and the
    ``get_*_df`` methods turn a pair of such trees into a DataFrame.
    """

    def __init__(self, chr_name, chr_length):
        self.chr_name = chr_name
        self.chr_length = chr_length
        self.paternal_tree = IntervalTree()
        self.maternal_tree = IntervalTree()

    def add_seg(self, type, allele, cluster_num, cn_change, start, end):
        """Record an event on ``[start, end)`` of the given allele's tree.

        Any ``allele`` other than ``'paternal'`` goes to the maternal tree.
        """
        event = Event(type, allele, cluster_num, cn_change)
        if allele == 'paternal':
            self.paternal_tree[start:end] = event
        else:
            self.maternal_tree[start:end] = event

    def add_seg_interval(self, type, cluster_num, cn_change, interval):
        """Record an event using the allele and bounds of ``interval``."""
        self.add_seg(type, interval.data.allele, cluster_num, cn_change,
                     interval.begin, interval.end)

    def _collapse(self, tree):
        """Split ``tree`` at every boundary and sum the stacked events."""
        tree.split_overlaps()
        tree.merge_overlaps(data_reducer=self.sum_levels)
        return tree

    def _lineage_tree(self, allele_tree, start, end, lineage_clusters):
        """Collapsed events of one allele within [start, end) for a lineage."""
        # Work on a copy: slicing cuts intervals at the region borders.
        clipped = allele_tree.copy()
        clipped.slice(start)
        clipped.slice(end)

        selected = IntervalTree()
        for iv in clipped.envelop(start, end):
            if iv.data.cluster_num in lineage_clusters:
                selected.add(iv)
        return self._collapse(selected)

    def _weighted_tree(self, allele_tree, phylogeny):
        """Collapsed events of one allele, each scaled by its cluster CCF."""
        weighted = IntervalTree()
        for iv in allele_tree:
            weighted_cn = iv.data.cn_change * phylogeny.ccfs[iv.data.cluster_num]
            weighted[iv.begin: iv.end] = Event(iv.data.type, iv.data.allele,
                                               iv.data.cluster_num, weighted_cn)
        return self._collapse(weighted)

    def _paired_df(self, pat_tree, mat_tree, reducer, first_key, second_key):
        """DataFrame of segments after reducing both alleles with ``reducer``.

        ``reducer`` must return a dict holding ``first_key`` and
        ``second_key``; those two names also label the value columns.
        """
        both_alleles = IntervalTree(list(pat_tree) + list(mat_tree))
        both_alleles.split_overlaps()
        both_alleles.merge_overlaps(data_reducer=reducer)

        seg_df = [[self.chr_name, segment.begin, segment.end,
                   segment.data[first_key], segment.data[second_key]]
                  for segment in both_alleles]
        return pd.DataFrame(seg_df, columns=['Chromosome', 'Start.bp', 'End.bp',
                                             first_key, second_key])

    def calc_current_cnv_lineage(self, start, end, cluster_num, phylogeny):
        """Return (paternal, maternal) trees for ``cluster_num``'s lineage.

        Only events inside ``[start, end)`` whose cluster belongs to the
        lineage reported by ``phylogeny.get_lineage`` are kept.
        """
        lineage_clusters, _ = phylogeny.get_lineage(cluster_num)
        pat_tree = self._lineage_tree(self.paternal_tree, start, end, lineage_clusters)
        mat_tree = self._lineage_tree(self.maternal_tree, start, end, lineage_clusters)
        return pat_tree, mat_tree

    def calc_full_cnv(self, phylogeny):
        """Return (paternal, maternal) trees of CCF-weighted copy number."""
        pat_tree = self._weighted_tree(self.paternal_tree, phylogeny)
        mat_tree = self._weighted_tree(self.maternal_tree, phylogeny)
        # could deliver a Chromosome (or child class) instead of just a tree
        return pat_tree, mat_tree

    def get_cnv_df(self, pat_tree, mat_tree):
        """Return a DataFrame of major/minor copy number per segment."""
        return self._paired_df(pat_tree, mat_tree, self.specify_levels,
                               'major', 'minor')

    def get_phased_df(self, pat_tree, mat_tree):
        """Return a DataFrame of paternal/maternal copy number per segment."""
        return self._paired_df(pat_tree, mat_tree, self.specify_phasing,
                               'paternal', 'maternal')

    @staticmethod
    def sum_levels(old, new):
        """Reducer: add two events' cn_change; the cluster is set to None."""
        return Event(old.type, old.allele, None, old.cn_change + new.cn_change)

    @staticmethod
    def specify_levels(old, new):
        """Reducer: larger cn_change is 'major', smaller is 'minor'."""
        return {'major': max(old.cn_change, new.cn_change),
                'minor': min(old.cn_change, new.cn_change)}

    @staticmethod
    def specify_phasing(old, new):
        """Reducer: assign each cn_change to the allele it belongs to."""
        return {'paternal': old.cn_change if old.allele == 'paternal' else new.cn_change,
                'maternal': old.cn_change if old.allele == 'maternal' else new.cn_change}
class TaskSet(object):
    """
    A set of tasks backed by a priority queue, plus an interval tree of
    their release/deadline windows used for conflict detection.
    """

    def __init__(self):
        self._tasksQueue = TaskUnitPriorityQueue()  # keep r1 < r2 < r3 order.
        self._intervalTree = IntervalTree()

    @property
    def tasks(self):
        return self._tasksQueue.items()

    def add(self, task):
        # Guard clause: a task ID may only be queued once.
        if self._tasksQueue.contains(task.taskID):
            raise DuplicateTaskException
        self._addTaskToTree(task)
        self._tasksQueue.push(task)

    def _addTaskToTree(self, task):
        """
        Registers the task's [release, deadline) window, tagged with its ID.
        """
        self._intervalTree.addi(begin=task.release,
                                end=task.deadline,
                                data=task.taskID)

    def remove(self, task):
        self._intervalTree.discardi(task.release, task.deadline, task.taskID)
        self._tasksQueue.remove(task.taskID)

    def _findLatestInterval(self, intervals):
        """
        Returns the interval with the greatest begin; the first one wins ties.
        """
        newest = intervals[0]
        for candidate in intervals:
            newest = candidate if candidate.begin > newest.begin else newest
        return newest

    def _orIntervals(self, intervalListA, intervalListB):
        """
        Returns the de-duplicated union of both lists as a list.
        """
        uniqueA = set(intervalListA)
        uniqueB = set(intervalListB)
        return list(uniqueA | uniqueB)

    def _conflictPath(self, interval, intervalTree):
        """
        @param interval The interval to find conflicts with.
        @param intervalTree The intervalTree that contains all intervals;
               it is modified while searching.

        Collects the chain of intervals that overlap (conflict), following
        the latest overlapping interval at each step.
        For example: if A and B conflict and B and C conflict and A is the
        interval we're looking for conflicts with, the returned intervals
        will be A, B, C.
        Another example: if D and E conflict and F and G conflict, and we're
        looking for all conflicts with D, only D and E will be returned as
        F and G overlap neither D nor E.
        """
        overlapping = list(intervalTree.search(interval))

        # A lone hit that is the interval itself means nothing conflicts.
        if len(overlapping) == 1 and overlapping[0] == interval:
            return []

        # Continue the walk from the latest of the overlapping intervals.
        newest = self._findLatestInterval(overlapping)

        # Everything overlapping has been seen; drop it from the tree ...
        intervalTree.remove_overlap(interval)
        # ... except the latest one, which seeds the next search.
        intervalTree.add(newest)

        downstream = self._conflictPath(newest, intervalTree)
        return self._orIntervals(overlapping, downstream)

    def _intervalConflictAlreadyDetected(self, interval, conflicts):
        """
        Tells whether interval already appears in any recorded conflict.
        """
        return any(member == interval
                   for conflict in conflicts
                   for member in conflict)

    def findConflicts(self):
        """
        Partitions the task windows into conflicting groups and
        non-conflicting intervals, returned as two ConflictSets.
        """
        begin = self._intervalTree.begin()
        end = self._intervalTree.end()

        conflicts = []
        conflictObjs = []
        nonConflictsObjs = []

        for interval in sorted(self._intervalTree[begin:end]):
            # Already part of an earlier group: nothing new to learn.
            if self._intervalConflictAlreadyDetected(interval, conflicts):
                continue

            # Search on a copy because the path search mutates its tree.
            conflictIntervals = self._conflictPath(interval,
                                                   self._intervalTree.copy())
            if len(conflictIntervals) > 0:  # there was a conflict
                conflicts.append(conflictIntervals)
                conflictObjs.append(Conflict(conflictIntervals))
            else:
                nonConflictsObjs.append(Conflict(interval))

        return ConflictSet(conflictObjs), ConflictSet(nonConflictsObjs)

    def __iter__(self):
        return self._tasksQueue