def test_merge_equals_with_dupes():
    """merge_equals() replaces same-range intervals by one interval that
    matches neither of the original data labels."""
    tree = IntervalTree.from_tuples(data.ivs1.data)
    orig = IntervalTree.from_tuples(data.ivs1.data)
    assert orig == tree

    # --- one duplicated range ---
    assert tree.containsi(4, 7, '[4,7)')
    tree.addi(4, 7, 'foo')
    assert len(tree) == len(orig) + 1
    assert orig != tree

    tree.merge_equals()
    tree.verify()
    assert tree != orig
    assert tree.containsi(4, 7)
    for stale in ('foo', '[4,7)'):
        assert not tree.containsi(4, 7, stale)

    # --- two duplicated ranges ---
    tree = IntervalTree.from_tuples(data.ivs1.data)
    tree.addi(4, 7, 'foo')
    assert tree.containsi(10, 12, '[10,12)')
    tree.addi(10, 12, 'bar')
    assert len(tree) == len(orig) + 2
    assert tree != orig

    tree.merge_equals()
    tree.verify()
    assert tree != orig
    merged_ranges = (
        (4, 7, ('foo', '[4,7)')),
        (10, 12, ('bar', '[10,12)')),
    )
    for begin, end, stale_labels in merged_ranges:
        assert tree.containsi(begin, end)
        for stale in stale_labels:
            assert not tree.containsi(begin, end, stale)
def find_5prime_for_inversion(start, end, gtj: list):
    """
    We have two canonical transcripts, and we want to find out which one is
    the 5' and which one is the 3'.

    The first gene stays first when it is on the forward strand and contains
    ``start``, or when it is on the reverse strand and does not contain
    ``start``; in the two other cases the genes are swapped.

    :param start: beginning of the inversion
    :param end: end of inversion (not used by the decision)
    :param gtj: list of ENSEMBL genes; only the first two entries are used
    :return: tuple with 5' as the first, and 3' as the second
    """
    first, second = gtj[0], gtj[1]
    # Only the span of the first gene is needed to decide the orientation.
    first_span = IntervalTree.from_tuples(
        [(first.exons.begin(), first.exons.end())])
    start_in_first = bool(first_span.at(start))
    if (first.strand > 0) == start_in_first:
        return (first, second)  # leave as it is
    return (second, first)  # the first gene is the 3', so swap
def test_merge_overlaps_reducer_with_initializer():
    """merge_overlaps() with a list initializer collects merged data into lists."""
    def collect(acc, item):
        return acc + [item]

    # empty tree stays empty
    empty = IntervalTree()
    empty.merge_overlaps(data_reducer=collect, data_initializer=[])
    empty.verify()
    assert not empty

    # a single interval gets its data wrapped in a list
    single = IntervalTree.from_tuples([(1, 2, 'hello')])
    single.merge_overlaps(data_reducer=collect, data_initializer=[])
    single.verify()
    assert len(single) == 1
    assert sorted(single) == [Interval(1, 2, ['hello'])]

    # many intervals separated by one gap collapse into two
    many = IntervalTree.from_tuples(data.ivs1.data)
    many.merge_overlaps(data_reducer=collect, data_initializer=[])
    many.verify()
    assert len(many) == 2
    merged_labels = [
        '[4,7)',
        '[5,9)',
        '[6,10)',
        '[8,10)',
        '[8,15)',
        '[10,12)',
        '[12,14)',
        '[14,15)',
    ]
    assert sorted(many) == [
        Interval(1, 2, ['[1,2)']),
        Interval(4, 15, merged_labels),
    ]
def find_5prime_for_inversion(start, end, gtj: list):
    """
    We have two canonical transcripts, and we want to find out which one is
    the 5' and which one is the 3'. For inversions there are actually two
    valid fusions and we do not know which one is the correct one, so both
    orderings are reported.

    The first tuple keeps the first gene first when it is on the forward
    strand and contains ``start``, or when it is on the reverse strand and
    does not contain ``start``; otherwise the genes are swapped. The second
    tuple is the first one reversed.

    :param start: beginning of the inversion
    :param end: end of inversion (not used by the decision)
    :param gtj: list of ENSEMBL genes; only the first two entries are used
    :return: list with two tuples [(5',3'), (5',3')]
    """
    first, second = gtj[0], gtj[1]
    # Only the span of the first gene is needed to decide the orientation.
    first_span = IntervalTree.from_tuples(
        [(first.exons.begin(), first.exons.end())])
    start_in_first = bool(first_span.at(start))
    if (first.strand > 0) == start_in_first:
        gene_tuple = (first, second)  # leave as it is
    else:
        gene_tuple = (second, first)  # the first gene is the 3', so swap
    return [gene_tuple, (gene_tuple[1], gene_tuple[0])]
def test_merge_equals_reducer_wo_initializer():
    """merge_equals() with a reducer but no initializer joins the data of
    equal-range intervals and leaves everything else alone."""
    def join(acc, item):
        return "%s, %s" % (acc, item)

    # empty tree
    empty = IntervalTree()
    empty.merge_equals(data_reducer=join)
    empty.verify()
    assert not empty

    # one interval: nothing changes
    single = IntervalTree.from_tuples([(1, 2, 'hello')])
    single.merge_equals(data_reducer=join)
    single.verify()
    assert len(single) == 1
    assert sorted(single) == [Interval(1, 2, 'hello')]

    # many intervals: nothing changes
    tree = IntervalTree.from_tuples(data.ivs1.data)
    orig = IntervalTree.from_tuples(data.ivs1.data)
    tree.merge_equals(data_reducer=join)
    tree.verify()
    assert len(tree) == len(orig)
    assert tree == orig

    # many intervals plus one duplicate range: the two labels are joined
    tree = IntervalTree.from_tuples(data.ivs1.data)
    orig = IntervalTree.from_tuples(data.ivs1.data)
    tree.addi(4, 7, 'foo')
    tree.merge_equals(data_reducer=join)
    tree.verify()
    assert len(tree) == len(orig)
    assert tree != orig
    for stale in ('foo', '[4,7)'):
        assert not tree.containsi(4, 7, stale)
    assert tree.containsi(4, 7, '[4,7), foo')
def test_merge_neighbors_with_gap_nonstrict():
    """merge_neighbors(strict=False) result for distance 1 and distance 2."""
    def join(acc, item):
        return "%s, %s" % (acc, item)

    def merged_at(distance):
        tree = IntervalTree.from_tuples(data.ivs1.data)
        tree.merge_neighbors(data_reducer=join, distance=distance, strict=False)
        tree.verify()
        return tree

    # distance=1 (the default distance, passed explicitly)
    near = merged_at(1)
    assert len(near) == 2
    assert sorted(near) == [
        Interval(1, 2, '[1,2)'),
        Interval(
            4, 15,
            '[4,7), [5,9), [6,10), [8,10), [8,15), [10,12), [12,14), [14,15)'),
    ]

    # distance=2
    far = merged_at(2)
    assert len(far) == 1
    assert sorted(far) == [
        Interval(
            1, 15,
            '[1,2), [4,7), [5,9), [6,10), [8,10), [8,15), [10,12), [12,14), [14,15)'
        )
    ]
def __init__(self, covariate, population):
    """
    Interpolates a covariate by population weighting.

    Stores sorted copies of both frames, the unique location ids, interval
    trees over the age groups and years of the covariate, and lookup
    dictionaries keyed by (location_id, sex_id, year_id, age_group_id).

    :param covariate: (pd.DataFrame)
    :param population: (pd.DataFrame)
    """
    key_cols = ['location_id', 'sex_id', 'year_id', 'age_group_id']
    # age_lower is not unique to age_group_id, so sort on both.
    ordering = key_cols + ['age_lower']
    self.covariate = covariate.sort_values(by=ordering)
    self.population = population.sort_values(by=ordering)
    self.location_ids = self.covariate.location_id.unique()

    age_rows = self.covariate[['age_lower', 'age_upper', 'age_group_id']].values
    self.age_intervals = IntervalTree.from_tuples(age_rows)

    # One [year, year + 1) interval per distinct year, tagged with the year.
    years = self.covariate.year_id.unique()
    self.time_intervals = IntervalTree.from_tuples(
        [(year, year + 1, year) for year in years])

    cov_keys = map(tuple, self.covariate[key_cols].values.tolist())
    self.dict_cov = dict(zip(cov_keys, self.covariate['mean_value'].values))
    pop_keys = map(tuple, self.population[key_cols].values.tolist())
    self.dict_pop = dict(zip(pop_keys, self.population['population'].values))
def test_merge_overlaps_reducer_with_initializer():
    """Check merge_overlaps() when data is accumulated into a list."""
    def append_to(acc, item):
        return acc + [item]

    def merge(tree):
        tree.merge_overlaps(data_reducer=append_to, data_initializer=[])
        tree.verify()
        return tree

    # Empty tree
    assert not merge(IntervalTree())

    # One Interval in tree
    one = merge(IntervalTree.from_tuples([(1, 2, 'hello')]))
    assert len(one) == 1
    assert sorted(one) == [Interval(1, 2, ['hello'])]

    # Many Intervals in tree, with gap
    many = merge(IntervalTree.from_tuples(data.ivs1.data))
    assert len(many) == 2
    assert sorted(many) == [
        Interval(1, 2, ['[1,2)']),
        Interval(4, 15, [
            '[4,7)',
            '[5,9)',
            '[6,10)',
            '[8,10)',
            '[8,15)',
            '[10,12)',
            '[12,14)',
            '[14,15)',
        ]),
    ]
def test_merge_overlaps_reducer_wo_initializer():
    """merge_overlaps() with a string-joining reducer and no initializer."""
    def join(acc, item):
        return "%s, %s" % (acc, item)

    # empty tree
    empty = IntervalTree()
    empty.merge_overlaps(data_reducer=join)
    empty.verify()
    assert not empty

    # one interval: data is left as it was
    single = IntervalTree.from_tuples([(1, 2, 'hello')])
    single.merge_overlaps(data_reducer=join)
    single.verify()
    assert len(single) == 1
    assert sorted(single) == [Interval(1, 2, 'hello')]

    # many intervals with a gap: two merged intervals remain
    many = IntervalTree.from_tuples(data.ivs1.data)
    many.merge_overlaps(data_reducer=join)
    many.verify()
    assert len(many) == 2
    joined = '[4,7), [5,9), [6,10), [8,10), [8,15), [10,12), [12,14), [14,15)'
    assert sorted(many) == [
        Interval(1, 2, '[1,2)'),
        Interval(4, 15, joined),
    ]
def test_merge_overlaps_reducer_wo_initializer():
    """Check merge_overlaps() when data labels are joined into one string."""
    def concat(acc, item):
        return "%s, %s" % (acc, item)

    def merge(tree):
        tree.merge_overlaps(data_reducer=concat)
        tree.verify()
        return tree

    # Empty tree
    assert not merge(IntervalTree())

    # One Interval in tree
    one = merge(IntervalTree.from_tuples([(1, 2, 'hello')]))
    assert len(one) == 1
    assert sorted(one) == [Interval(1, 2, 'hello')]

    # Many Intervals in tree, with gap
    many = merge(IntervalTree.from_tuples(data.ivs1.data))
    assert len(many) == 2
    assert sorted(many) == [
        Interval(1, 2, '[1,2)'),
        Interval(
            4, 15,
            '[4,7), [5,9), [6,10), [8,10), [8,15), [10,12), [12,14), [14,15)'),
    ]
def test_merge_equals_wo_dupes():
    """merge_equals() leaves the ivs1 fixture tree equal to an untouched copy."""
    reference = IntervalTree.from_tuples(data.ivs1.data)
    tree = IntervalTree.from_tuples(data.ivs1.data)
    assert reference == tree

    tree.merge_equals()
    tree.verify()

    assert reference == tree
def test_intersection():
    """intersection() against the empty tree, itself, disjoint and overlapping trees."""
    a = IntervalTree.from_tuples(data.ivs1.data)
    b = IntervalTree.from_tuples(data.ivs2.data)
    empty = IntervalTree()

    # intersections with the empty tree
    for tree in (a, b, empty):
        assert tree.intersection(empty) == empty

    # intersections with self
    for tree in (a, b):
        assert tree.intersection(tree) == tree

    # commutativity resulting in empty
    a_and_b = a.intersection(b)
    b_and_a = b.intersection(a)
    a_and_b.verify()
    b_and_a.verify()
    assert a_and_b == b_and_a
    assert len(a_and_b) == 0  # no overlaps, so empty tree

    # commutativity on non-overlapping sets: intersecting a union with one
    # of its operands should give that operand back
    a_or_b = a.union(b)
    b_or_a = b.union(a)
    cases = [(a_or_b, a), (a_or_b, b), (b_or_a, b), (b_or_a, a)]
    results = [(union.intersection(operand), operand)
               for union, operand in cases]
    for result, _ in results:
        result.verify()
    for result, operand in results:
        assert result == operand

    # commutativity with overlapping sets
    c = IntervalTree.from_tuples(data.ivs3.data)
    b_and_c = b.intersection(c)
    c_and_b = c.intersection(b)
    b_and_c.verify()
    c_and_b.verify()
    assert b_and_c == c_and_b
    assert len(b_and_c) < len(b)
    assert len(b_and_c) < len(c)
    assert len(b_and_c) > 0

    for tree in (b, c, b_and_c):
        assert tree.containsi(13, 23)
    assert not b.containsi(819, 828)
    assert not c.containsi(0, 1)
    assert not b_and_c.containsi(819, 820)
    assert not b_and_c.containsi(0, 1)
def test_emptying_partial():
    """remove_overlap() over one side of point 7 empties that side of the tree."""
    # upper part: from 7 to the end of the tree
    upper = IntervalTree.from_tuples(data.ivs1.data)
    assert upper[7:]
    upper.remove_overlap(7, upper.end())
    assert not upper[7:]

    # lower part: from the beginning of the tree to 7
    lower = IntervalTree.from_tuples(data.ivs1.data)
    assert lower[:7]
    lower.remove_overlap(lower.begin(), 7)
    assert not lower[:7]
def test_tree_bounds():
    """begin()/end() equal the smallest begin and largest end of the intervals."""
    def check_bounds(tree):
        # Seed with an arbitrary member, then widen over all intervals.
        lowest, highest, _ = set(tree).pop()
        lowest = min([lowest] + [iv.begin for iv in tree])
        highest = max([highest] + [iv.end for iv in tree])
        assert tree.begin() == lowest
        assert tree.end() == highest

    for fixture in (data.ivs1.data, data.ivs2.data):
        check_bounds(IntervalTree.from_tuples(fixture))
def test_partial_iter_range():
    """iterOverlap() with no, upper-only and lower-only bounds yields sorted matches."""
    def check(tree, limit):
        # no bounds: every interval, in sorted order
        assert list(tree.iterOverlap()) == sorted(tree)

        # upper bound only
        starting_before = sorted(iv for iv in tree if iv.begin < limit)
        assert list(tree.iterOverlap(None, limit)) == starting_before

        # lower bound only
        ending_after = sorted(iv for iv in tree if iv.end > limit)
        assert list(tree.iterOverlap(limit)) == ending_after

    check(IntervalTree.from_tuples(data.ivs1.data), 7)
    check(IntervalTree.from_tuples(data.ivs2.data), -3)
def test_partial_get_query():
    """Slice queries with open ends return the expected sets of intervals."""
    def check(tree, limit):
        assert tree[:] == set(tree)
        assert tree[:limit] == {iv for iv in tree if iv.begin < limit}
        assert tree[limit:] == {iv for iv in tree if iv.end > limit}

    check(IntervalTree.from_tuples(data.ivs1.data), 7)
    check(IntervalTree.from_tuples(data.ivs2.data), -3)
def test_end_order_iter():
    """iterOverlap(endOrder=True) yields matches ordered by Interval.endCmp."""
    def check(tree, limit):
        by_end = cmp_to_key(lambda a, b: a.endCmp(b))

        # no bounds
        everything = sorted(tree, key=by_end)
        assert list(tree.iterOverlap(endOrder=True)) == everything

        # upper bound only
        starting_before = sorted(
            [iv for iv in tree if iv.begin < limit], key=by_end)
        assert list(tree.iterOverlap(None, limit, endOrder=True)) == starting_before

        # lower bound only
        ending_after = sorted(
            [iv for iv in tree if iv.end > limit], key=by_end)
        assert list(tree.iterOverlap(limit, endOrder=True)) == ending_after

    check(IntervalTree.from_tuples(data.ivs1.data), 7)
    check(IntervalTree.from_tuples(data.ivs2.data), -3)
def test_merge_overlaps_gapless():
    """merge_overlaps() on ivs2: unchanged by default, one interval with strict=False."""
    # default strict=True: the tree keeps exactly the fixture intervals
    strict = IntervalTree.from_tuples(data.ivs2.data)
    strict.merge_overlaps()
    strict.verify()
    as_tuples = [(iv.begin, iv.end, iv.data) for iv in sorted(strict)]
    assert as_tuples == data.ivs2.data

    # strict=False: everything merges into the tree's full range
    loose = IntervalTree.from_tuples(data.ivs2.data)
    full_range = loose.range()
    loose.merge_overlaps(strict=False)
    loose.verify()
    assert len(loose) == 1
    assert loose.pop() == full_range
def from_gtf(
        cls,
        gtf_path,  # type: pathlib.Path
        chromosomes=None,  # type: List[str]
        record_filter=None  # type: Callable[[Any], bool]
):
    # type: (...) -> TranscriptReference
    """Builds a Reference instance from the given GTF file.

    For every chromosome, 'transcript' records go into a per-chromosome
    lookup tree and 'exon' records go into one lookup tree per
    transcript_id.

    :param gtf_path: path of the tabix-indexed GTF file.
    :param chromosomes: chromosomes to read; defaults to all contigs of
        the file.
    :param record_filter: optional predicate; only records for which it
        returns a true value are used.
    :return: instance built from the transcript trees and exon trees.
    """

    def by_transcript(rec):
        return rec.transcript_id

    # Open gtf file; the handle is closed again once all trees are built.
    gtf = pysam.TabixFile(native_str(gtf_path), parser=pysam.asGTF())
    try:
        if chromosomes is None:
            chromosomes = gtf.contigs

        # Build the trees.
        transcript_trees = {}
        exon_trees = {}

        for chrom in chromosomes:
            # Collect exons and transcripts.
            transcripts = []
            exons = []

            records = gtf.fetch(reference=chrom)
            if record_filter is not None:
                records = (rec for rec in records if record_filter(rec))

            for record in records:
                if record.feature == 'transcript':
                    transcripts.append(cls._record_to_transcript(record))
                elif record.feature == 'exon':
                    exons.append(cls._record_to_exon(record))

            # Build transcript lookup tree.
            transcript_trees[chrom] = IntervalTree.from_tuples(
                (tr.start, tr.end, tr) for tr in transcripts)

            # Build exon lookup trees; groupby needs its input sorted by
            # the grouping key.
            exons = sorted(exons, key=by_transcript)
            for tr_id, grp in itertools.groupby(exons, key=by_transcript):
                exon_trees[tr_id] = IntervalTree.from_tuples(
                    (exon.start, exon.end, exon) for exon in grp)
    finally:
        gtf.close()

    return cls(transcript_trees, exon_trees)
def test_merge_neighbors_reducer_with_initializer():
    """merge_neighbors() with a list initializer, with and without gaps."""
    def collect(acc, item):
        return acc + [item]

    # empty tree
    empty = IntervalTree()
    empty.merge_neighbors(data_reducer=collect, data_initializer=[])
    empty.verify()
    assert not empty

    # one interval: its data ends up wrapped in a list
    single = IntervalTree.from_tuples([(1, 2, 'hello')])
    single.merge_neighbors(data_reducer=collect, data_initializer=[])
    single.verify()
    assert len(single) == 1
    assert sorted(single) == [Interval(1, 2, ['hello'])]

    # touching intervals, no gap: all merge into one
    touching = (
        (1, 2, '[1,2)'),
        (2, 3, '[2,3)'),
        (3, 4, '[3,4)'),
    )
    chain = IntervalTree.from_tuples(touching)
    chain.merge_neighbors(data_reducer=collect, data_initializer=[])
    chain.verify()
    assert len(chain) == 1
    for begin, end, labels in chain.items():
        assert begin == 1
        assert end == 4
        assert labels == ['[1,2)', '[2,3)', '[3,4)']

    # intervals with gaps, merged with distance=2
    gapped = (
        (1, 2, '[1,2)'),
        (4, 6, '[4,6)'),
        (5, 8, '[5,8)'),
        (13, 15, '[13,15)'),
    )
    spread = IntervalTree.from_tuples(gapped)
    spread.merge_neighbors(data_reducer=collect, data_initializer=[], distance=2)
    spread.verify()
    assert len(spread) == 3
    assert sorted(spread) == [
        Interval(1, 6, ['[1,2)', '[4,6)']),
        Interval(5, 8, ['[5,8)']),
        Interval(13, 15, ['[13,15)']),
    ]
def merge_turns(turns):
    """Merge overlapping turns by same speaker within each file.

    Turns are returned grouped by sorted file id, then by sorted speaker id.
    A speaker's turns are replaced by freshly built, onset-sorted ``Turn``
    objects only when merging actually reduced their number; a warning is
    issued in that case.
    """
    # Bucket the turns by (file, speaker) and remember each file's speakers.
    turns_by_owner = defaultdict(list)
    speakers_by_file = defaultdict(set)
    for turn in turns:
        turns_by_owner[(turn.file_id, turn.speaker_id)].append(turn)
        speakers_by_file[turn.file_id].add(turn.speaker_id)

    # Merge separately within each file and for each speaker.
    merged = []
    file_ids = {file_id for file_id, _ in iterkeys(turns_by_owner)}
    for file_id in sorted(file_ids):
        for speaker_id in sorted(speakers_by_file[file_id]):
            speaker_turns = turns_by_owner[(file_id, speaker_id)]
            spans = IntervalTree.from_tuples(
                [(turn.onset, turn.offset) for turn in speaker_turns])
            n_before = len(spans)
            spans.merge_overlaps()
            if len(spans) < n_before:
                rebuilt = [
                    Turn(span.begin, span.end, speaker_id=speaker_id,
                         file_id=file_id)
                    for span in spans
                ]
                speaker_turns = sorted(
                    rebuilt, key=lambda x: (x.onset, x.offset))
                warn('Merging overlapping speaker turns. '
                     'FILE: %s, SPEAKER: %s' % (file_id, speaker_id))
            merged.extend(speaker_turns)
    return merged
def make_annot(args):
    """
    Create binary annotations from the 0.1, 0.01 and 0.001 p-value strata of
    the summary statistics.

    For each chromosome 1..22 and each threshold, every SNP with a p-value
    below the threshold is widened by ``args.window`` on both sides, the
    windows are merged, and each marker of the annotation file is flagged 1
    when its BP falls inside a merged window. One ``PVAL<label>`` column per
    threshold is added and the result is written gzip-compressed and
    tab-separated.

    :param args: object with ``sumstats_file``, ``output_file`` and
        ``annot`` (both formatted with the chromosome number), ``force``
        and ``window``.
    """
    check_input_file(args.sumstats_file)
    for chri in range(1, 23):
        check_output_file(args.output_file.format(chri), args.force)

    print('Reading summary statistics file {}...'.format(args.sumstats_file))
    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True.
    sumstats = pd.read_csv(args.sumstats_file, sep=r'\s+', usecols=['PVAL', 'CHR', 'BP'])
    print('Done, read {} SNPs.'.format(sumstats.shape[0]))

    for chri in range(1, 23):
        print('Processing chromosome {}...'.format(chri))
        df = pd.read_csv(args.annot.format(chri), sep=r'\s+')
        df = df[['CHR', 'BP', 'SNP', 'CM']].copy()
        # The per-chromosome subset does not depend on the threshold.
        sumstats_chr = sumstats[sumstats.CHR == chri]
        for pthresh, label in [(0.1, '.1'), (0.01, '.01'), (0.001, '.001')]:
            significant = sumstats_chr[sumstats_chr.PVAL < pthresh]
            print('{} markers, {} of them are on chr {}, {} of them have p-value below {}'.format(sumstats.shape[0], sumstats_chr.shape[0], chri, significant.shape[0], pthresh))
            itree = IntervalTree.from_tuples(zip(significant.BP - args.window, significant.BP + args.window))
            itree.merge_overlaps()
            # No SNP may pass the threshold on this chromosome; report an
            # average length of 0 instead of dividing by zero.
            n_intervals = len(itree)
            average_length = sum([i.length() for i in itree]) / n_intervals if n_intervals else 0
            print('Found {} intervals, average length {}'.format(n_intervals, average_length))
            annot_binary = [int(bool(itree[p])) for p in df.BP]
            df['PVAL{}'.format(label)] = annot_binary
            # Same guard for an annotation file without markers.
            percent = 100 * sum(annot_binary) / len(annot_binary) if annot_binary else 0
            print('{} markers out of {} ({}%) belongs to the annotation'.format(sum(annot_binary), len(annot_binary), percent))
        df.to_csv(args.output_file.format(chri), index=False, sep='\t', compression='gzip')
        print('Results saved to {}'.format(args.output_file.format(chri)))
def test_span():
    """span() is 0 for an empty tree and end() - begin() otherwise."""
    assert IntervalTree().span() == 0

    tree = IntervalTree.from_tuples(data.ivs1.data)
    assert tree.span() == tree.end() - tree.begin()
    assert tree.span() == 14
def test_merge_overlaps_with_gap():
    """merge_overlaps() on ivs1 leaves [1,2) alone and merges the rest into [4,15)."""
    tree = IntervalTree.from_tuples(data.ivs1.data)
    tree.merge_overlaps()
    tree.verify()

    expected = IntervalTree([
        Interval(1, 2, '[1,2)'),
        Interval(4, 15),
    ])
    assert len(tree) == 2
    assert tree == expected
def test_delete():
    """remove() raises for missing intervals, discard() ignores them, and
    removals are reflected in point queries."""
    tree = IntervalTree.from_tuples(data.ivs1.data)
    absent_intervals = (
        Interval(1, 3, "Doesn't exist"),
        Interval(500, 1000, "Doesn't exist"),
    )

    # remove() must complain about intervals that are not in the tree
    for absent in absent_intervals:
        try:
            tree.remove(absent)
        except ValueError:
            pass
        else:
            raise AssertionError("Expected ValueError")

    # discard() must leave the structure untouched for the same intervals
    structure_before = tree.print_structure(True)
    for absent in absent_intervals:
        tree.discard(absent)
    assert structure_before == tree.print_structure(True)

    # removing existing intervals one at a time
    assert match.set_data(tree[14]) == {'[8,15)', '[14,15)'}
    tree.remove(Interval(14, 15, '[14,15)'))
    assert match.set_data(tree[14]) == {'[8,15)'}
    tree.verify()

    tree.discard(Interval(8, 15, '[8,15)'))
    assert match.set_data(tree[14]) == set()
    tree.verify()

    # removing everything that overlaps a point
    assert tree[5]
    tree.remove_overlap(5)
    tree.verify()
    assert not tree[5]
def getTrackData(self, chrom_len, gene2info):
    """Convert per-gene interval data and build per-strand lookup structures.

    Every ``entry['interval'][subtype]`` in ``gene2info`` is replaced in
    place by ``self.tree2json(<old value>, False)``. Entries whose
    ``annot['strand']`` is '+' or '-' are collected into one IntervalTree
    per strand carrying the entry's ``r_id``.

    :param chrom_len: passed through to getRangesFromTree/getBlocksFromTree.
    :param gene2info: dict mapping a gene to a list of entry dicts with the
        keys 'r_id', 'annot' and 'interval'. It is modified in place.
    :return: (interval2genes, interval2blocks, rainbow2gene), where
        rainbow2gene maps each r_id to its gene.
    """
    gene2intervals = {}
    rainbow2gene = {}
    for gene, entries in gene2info.items():
        # Same list object as in gene2info, so writes below land there too.
        gene2intervals[gene] = entries
        for index, curr in enumerate(entries):
            rainbow2gene[curr['r_id']] = gene
            gene2intervals[gene][index] = curr
            converted = gene2intervals[gene][index]['interval']
            for subtype in curr['interval']:
                converted[subtype] = self.tree2json(curr['interval'][subtype], False)

    strands = ('+', '-')
    strand2info = {'+': [], '-': []}
    for entries in gene2info.values():
        for curr in entries:
            annot = curr['annot']
            for strand in strands:
                if annot['strand'] == strand:
                    strand2info[strand].append(
                        [annot['start'], annot['end'], curr['r_id']])

    trees = {
        strand: IntervalTree.from_tuples(strand2info[strand])
        for strand in strands
    }
    interval2genes = self.getRangesFromTree(chrom_len, trees)
    interval2blocks = self.getBlocksFromTree(chrom_len, trees)
    return interval2genes, interval2blocks, rainbow2gene
def build_capture_trees(bed_file):
    """Read a BED file and return a dict of interval trees keyed by chromosome.

    One tree is built for each of the chromosomes '1'..'22', 'X' and 'Y'
    (24 in total). Each interval carries the gene name from the fourth
    column as its data.
    """
    column_types = {'chrom': str, 'start': int, 'stop': int, 'gene': str}
    bed_df = pd.read_csv(
        bed_file,
        usecols=[0, 1, 2, 3],
        sep='\t',
        names=['chrom', 'start', 'stop', 'gene'],
        dtype=column_types,
    )

    # Intervals exclude the end point, so increment all stops
    bed_df['stop'] += 1

    chroms = [str(number) for number in range(1, 23)] + ['X', 'Y']
    trees = {}
    for chrom in chroms:
        # rows of this chromosome, reduced to (start, stop, gene)
        on_chrom = bed_df[bed_df.chrom == chrom][['start', 'stop', 'gene']]
        trees[chrom] = IntervalTree.from_tuples(
            [tuple(row) for row in on_chrom.values])
    return trees
def _subset_to_windows(
        insertions,  # type: List[Insertion]
        gene_windows  # type: Dict[str, Tuple[str, int, int]]
):
    # type: (...) -> List[Insertion]
    """Subsets insertions for given gene windows.

    Keeps an insertion when its ``metadata['gene_id']`` is a key of
    ``gene_windows`` and its position overlaps any window on its seqname.
    """

    # One lookup tree per chromosome, built from the (start, end) part of
    # every window; groupby needs the windows sorted by chromosome.
    trees = {}
    windows_by_chrom = itertools.groupby(
        sorted(gene_windows.values()), operator.itemgetter(0))
    for chrom, chrom_windows in windows_by_chrom:
        trees[chrom] = IntervalTree.from_tuples(
            window[1:] for window in chrom_windows)

    def _in_windows(ins, trees):
        try:
            return trees[ins.seqname].overlaps(ins.position)
        except KeyError:
            # No window on this seqname.
            return False

    selected = []
    for ins in insertions:
        if ins.metadata['gene_id'] not in gene_windows:
            continue
        if _in_windows(ins, trees):
            selected.append(ins)
    return selected
def merge_turns(turns):
    """Merge overlapping turns by same speaker within each file.

    Turns are grouped by (file_id, speaker_id) regardless of their order in
    ``turns``; the result is ordered by that key. A speaker's turns are
    replaced by freshly built, onset-sorted ``Turn`` objects only when
    merging actually reduced their number; a warning is issued in that case.

    :param turns: iterable of turns with ``file_id``, ``speaker_id``,
        ``onset`` and ``offset`` attributes.
    :return: list of turns.
    """
    def _owner(turn):
        return (turn.file_id, turn.speaker_id)

    # Merge separately within each file and for each speaker.
    new_turns = []
    # groupby only groups *consecutive* items, so sort by the grouping key
    # first; otherwise a speaker whose turns are interleaved with another
    # speaker's would be split into several groups and merged only partly.
    for (file_id, speaker_id), speaker_turns in groupby(
            sorted(turns, key=_owner), _owner):
        speaker_turns = list(speaker_turns)
        speaker_it = IntervalTree.from_tuples(
            [(turn.onset, turn.offset) for turn in speaker_turns])
        n_turns_pre = len(speaker_it)
        speaker_it.merge_overlaps()
        n_turns_post = len(speaker_it)
        if n_turns_post < n_turns_pre:
            speaker_turns = sorted(
                (Turn(intrvl.begin, intrvl.end, speaker_id=speaker_id,
                      file_id=file_id)
                 for intrvl in speaker_it),
                key=lambda x: (x.onset, x.offset))
            warn('Merging overlapping speaker turns. '
                 'FILE: %s, SPEAKER: %s' % (file_id, speaker_id))
        new_turns.extend(speaker_turns)
    return new_turns
def getGene2Info(self, chrom, data):
    """Index per-strand records by element and build one tree per strand.

    Each record gets a running id ``r_id`` (starting at 1, counted across
    all strands).

    :param chrom: value stored under ``annot['chrom']`` of every entry.
    :param data: dict mapping a strand to records of the form
        ``(start, end, (element, *values))``.
    :return: tuple ``(gene2info, rainbow2gene, rainbow_tree)``:
        gene2info maps an element to the list of its entries, rainbow2gene
        maps an r_id to its element, and rainbow_tree maps a strand to an
        IntervalTree of ``[start, end, r_id]``.
    """
    gene2info = {}
    rainbow2gene = {}
    rainbow_tree = {}
    r_id = 0
    for strand in data:
        tree_tuple = []
        for datum in data[strand]:
            r_id += 1
            start, end, (element, *values) = datum
            tree_tuple.append([start, end, r_id])
            rainbow2gene[r_id] = element
            curr = {
                'r_id': r_id,
                'annot': {
                    'chrom': chrom,
                    'strand': strand,
                    'start': start,
                    'end': end,
                    'values': values,
                },
                'interval': {'-': [[start, end]]},
            }
            # Group entries per element without a catch-all except clause.
            gene2info.setdefault(element, []).append(curr)
        rainbow_tree[strand] = IntervalTree.from_tuples(tree_tuple)
    return gene2info, rainbow2gene, rainbow_tree
def test_difference_operator():
    """``minuend - subtrahend`` contains exactly the intervals popped from the copy."""
    minuend = IntervalTree.from_tuples(data.ivs1.data)
    assert isinstance(minuend, IntervalTree)

    # Pop two intervals off a copy; those two are the expected difference.
    subtrahend = minuend.copy()
    expected = IntervalTree([subtrahend.pop()])
    expected.add(subtrahend.pop())

    for tree in (minuend, subtrahend, expected):
        tree.verify()

    assert len(expected) == len(minuend) - len(subtrahend)
    for iv in expected:
        assert iv not in subtrahend
        assert iv in minuend

    actual = minuend - subtrahend
    actual.verify()
    for iv in actual:
        assert iv not in subtrahend
        assert iv in minuend
        assert iv in expected

    assert actual == expected
def getGene2Info(self, genes):
    """Group gene records by gene_id and merge their part intervals.

    :param genes: iterable of dicts with an 'annot' dict (containing
        'gene_id') and a 'parts' dict mapping a subtype to a list of
        records whose first two items are used as an interval.
    :return: dict mapping gene_id to a list of info dicts with the keys
        'r_id' (position of the record in ``genes``), 'annot' (the
        annotation without 'gene_id') and 'interval' (subtype -> merged
        IntervalTree).
    """
    gene2info = {}
    for rainbow_id, gene in enumerate(genes):
        annot = gene['annot']
        gene_id = annot['gene_id']
        curr_info = {
            'r_id': rainbow_id,
            'annot': {key: annot[key] for key in annot if key != 'gene_id'},
            'interval': {},
        }
        for subtype in gene['parts']:
            curr_tree = IntervalTree.from_tuples(
                [curr[:2] for curr in gene['parts'][subtype]])
            curr_tree.merge_overlaps()
            curr_info['interval'][subtype] = curr_tree
            # NOTE(review): start/end are overwritten for every subtype, so
            # they end up describing the last subtype only - confirm that
            # this is intended.
            curr_info['annot']['start'] = curr_tree.begin()
            curr_info['annot']['end'] = curr_tree.end()
        # Group entries per gene without a catch-all except clause.
        gene2info.setdefault(gene_id, []).append(curr_info)
    return gene2info
def extract_intervals_for_genes_from_gff(genes: Set[str],
                                         gff_stream: TextIO,
                                         padding: int = 0) -> IntervalTree:
    """Collect padded intervals of the requested genes from a GFF stream.

    Only rows whose type column is "gene" (case-insensitive) are used. The
    gene name is taken from the ``gene`` attribute, falling back to
    ``Name``; rows with neither are skipped with a warning. Reading stops
    at a ``##FASTA`` directive.

    :param genes: names of the genes to keep.
    :param gff_stream: open text stream of GFF rows.
    :param padding: number of positions added on both sides of each gene.
    :return: IntervalTree whose intervals carry ``(name, strand)`` as data.
    """
    intervals = []
    for row in map(str.rstrip, gff_stream):
        if row.startswith("##FASTA"):
            # Everything after this directive is sequence data, not
            # tab-separated feature rows.
            break
        if row.startswith("#") or not row:
            continue
        fields = row.split("\t")
        if fields[2].lower() != "gene":
            continue

        attributes = attributes_dict_from_str(fields[8])
        name = attributes.get("gene", attributes.get("Name", None))
        if name is None:
            # Use .get so that a row without an ID does not turn the
            # warning into a KeyError.
            logger.warning(
                f"No gene/Name attribute for ID {attributes.get('ID', '<missing>')}")
            continue
        if name not in genes:
            continue

        start = (int(fields[3]) - 1) - padding  # GFF start is 1-based inclusive
        end = int(fields[4]) + padding  # GFF end is 1-based inclusive
        strand = fields[6]
        intervals.append((start, end, (name, strand)))
    return IntervalTree.from_tuples(intervals)
def make_age_intervals(df: Optional[pd.DataFrame] = None,
                       gbd_round_id: Optional[int] = None) -> IntervalTree:
    """
    Makes an interval tree out of age lower and upper for age group IDs.
    The interval tree can be made from an existing data frame with those
    columns or it can be made from getting the full set of age groups from
    the IHME databases.

    Parameters
    ----------
    df
        Data frame from which to construct the interval tree. Must have the
        columns ['age_group_id', 'age_lower', 'age_upper']. If passed,
        ignores gbd_round_id.
    gbd_round_id
        The gbd round ID from which to pull the age group metadata which is
        used to construct the interval tree. Ignored if df is specified
        instead.

    Returns
    -------
    Interval tree with one (age_lower, age_upper) interval per row, carrying
    the age_group_id as data.

    Raises
    ------
    IhmeIDError
        If neither argument is given, or if df lacks a required column.
    """
    required = ['age_group_id', 'age_lower', 'age_upper']
    if df is None and gbd_round_id is None:
        raise IhmeIDError(
            "Need to pass either a data frame with columns "
            "['age_group_id', 'age_lower', 'age_upper'] or a valid "
            "gbd_round_id to get the full set of age groups.")
    if df is None:
        df = get_age_group_metadata(gbd_round_id=gbd_round_id)
    else:
        for col in required:
            if col not in df.columns:
                raise IhmeIDError(
                    f"The data frame columns {df.columns} do not contain "
                    f"the required column {col}.")
    age_intervals = IntervalTree.from_tuples(
        df[['age_lower', 'age_upper', 'age_group_id']].values)
    return age_intervals
def test_pickle():
    """A tree survives a pickle round trip as an equal, valid tree."""
    original = IntervalTree.from_tuples(data.ivs1.data)
    restored = pickle.loads(pickle.dumps(original))
    assert original == restored
    restored.verify()
def optimality_core():
    """Pretty-print the summary matrix of an OptimalityTestMatrix built from
    the issue-4 result data."""
    # Previously used for ad-hoc inspection, kept for reference:
    # tree = test_build_tree()
    # write_result(tree)
    # print(len(tree))
    trees = {
        'issue4result': IntervalTree.from_tuples(data.issue4_result.data),
    }
    pprint(OptimalityTestMatrix(trees).summary_matrix)
def test_mismatched_tree_and_membership_set():
    """verify() fails when all_intervals no longer matches the tree contents."""
    tree = IntervalTree.from_tuples(data.ivs1.data)
    stale_members = set(tree.all_intervals)
    assert tree.all_intervals == stale_members

    tree.removei(1, 2, '[1,2)')
    assert tree.all_intervals != stale_members

    # Intentionally put the outdated membership set back.
    tree.all_intervals = stale_members
    with pytest.raises(AssertionError):
        tree.verify()
def test_emptying_iteration():
    """Removing every interval one by one keeps the tree valid and ends empty."""
    tree = IntervalTree.from_tuples(data.ivs1.data)

    # Iterate over a sorted snapshot so the tree can be modified safely.
    snapshot = sorted(iter(tree))
    for interval in snapshot:
        tree.remove(interval)
        tree.verify()

    assert len(tree) == 0
    assert tree.is_empty()
    assert not tree
def test_overlaps():
    """overlaps() for points and ranges, including reversed ranges."""
    tree = IntervalTree.from_tuples(data.ivs1.data)

    overlapping = [(1,), (1.5,), (0, 3)]
    non_overlapping = [(-3.2,), (0, 1), (2, 4), (4, 2), (3, 0)]

    for query in overlapping:
        assert tree.overlaps(*query)
    for query in non_overlapping:
        assert not tree.overlaps(*query)
def test_emptying_clear():
    """clear() empties the tree and can be called again on an empty tree."""
    tree = IntervalTree.from_tuples(data.ivs1.data)
    assert tree

    tree.clear()
    assert len(tree) == 0
    assert tree.is_empty()
    assert not tree

    # A second clear() on the now-empty tree must not crash.
    tree.clear()
def tree(): t = IntervalTree.from_tuples(data) # Node<10.58, depth=3, balance=1> # Interval(8.65, 13.65) root = Node() root.x_center = 10.58 root.s_center = set([Interval(*data[0])]) root.depth = 3 root.balance = 1 # <: Node<5.66, depth=1, balance=0> # Interval(3.57, 9.47) # Interval(5.38, 10.38) # Interval(5.66, 9.66) n = root.left_node = Node() n.x_center = 5.66 n.s_center = set(Interval(*tup) for tup in data[1:4]) n.depth = 1 n.balance = 0 # >: Node<16.49, depth=2, balance=-1> # Interval(16.49, 20.83) n = root.right_node = Node() n.x_center = 16.49 n.s_center = set([Interval(*data[4])]) n.depth = 2 n.balance = -1 # <: Node<11.42, depth=1, balance=0> # Interval(11.42, 16.42) n.left_node = Node() n = n.left_node n.x_center = 11.42 n.s_center = set([Interval(*data[5])]) n.depth = 1 n.balance = 0 structure = root.print_structure(tostring=True) # root.print_structure() assert structure == """\ Node<10.58, depth=3, balance=1> Interval(8.65, 13.65) <: Node<5.66, depth=1, balance=0> Interval(3.57, 9.47) Interval(5.38, 10.38) Interval(5.66, 9.66) >: Node<16.49, depth=2, balance=-1> Interval(16.49, 20.83) <: Node<11.42, depth=1, balance=0> Interval(11.42, 16.42) """ t.top_node = root t.verify() return t
def test_point_queries():
    """Point lookups via indexing and at() return the same interval labels."""
    tree = IntervalTree.from_tuples(data.ivs1.data)

    labels_at_point = [
        (4, {'[4,7)'}),
        (9, {'[6,10)', '[8,10)', '[8,15)'}),
        (15, set()),
        (5, {'[4,7)', '[5,9)'}),
    ]
    for point, labels in labels_at_point:
        assert match.set_data(tree[point]) == labels
        assert match.set_data(tree.at(point)) == labels

    # a range query through slicing
    assert match.set_data(tree[4:5]) == {'[4,7)'}
def __init__(self, path, slop=200):
    """Load a tab-separated BED file into one interval tree per chromosome.

    Each row contributes the interval
    ``[max(1, start - slop + 1), end + slop + 1)`` to the tree of the
    chromosome named in its first column. Rows of one chromosome do not
    have to be contiguous in the file. Blank rows are skipped.

    :param path: path of the BED file.
    :param slop: number of positions added on both sides of each interval.
    """
    from intervaltree import IntervalTree
    from csv import reader

    self.trees = {}
    self.slop = slop

    # Collect intervals per chromosome first, so that a chromosome appearing
    # in several separate runs keeps all of its intervals instead of having
    # its earlier tree overwritten.
    intervals_by_chrom = {}
    with open(path, "rt") as bed:
        for row in reader(bed, delimiter="\t"):
            if not row:
                continue  # blank line
            intervals_by_chrom.setdefault(row[0], []).append((
                max(1, int(row[1]) - slop + 1),
                int(row[2]) + slop + 1
            ))

    for chrm, ivls in intervals_by_chrom.items():
        self.trees[chrm] = IntervalTree.from_tuples(ivls)
def test_remove_overlap():
    """remove_overlap(point) removes every interval containing that point."""
    tree = IntervalTree.from_tuples(data.ivs1.data)
    for point in (1, 8):
        assert tree[point]
        tree.remove_overlap(point)
        assert not tree[point]
        tree.verify()
def test_split_overlap():
    """After split_overlaps(), intervals that overlap share identical bounds."""
    tree = IntervalTree.from_tuples(data.ivs1.data)
    tree.split_overlaps()
    tree.verify()

    # Take the intervals out one at a time; whatever still overlaps the
    # removed one must cover exactly the same range.
    while tree:
        picked = set(tree).pop()
        tree.remove(picked)
        for other in tree.overlap(picked):
            assert other.begin == picked.begin
            assert other.end == picked.end
def tree(): t = IntervalTree.from_tuples(data) # Node<961, depth=2, balance=0> # Interval(961, 986, 1) root = Node() root.x_center = 961 root.s_center = set([Interval(*data[7])]) root.depth = 2 root.balance = 0 # <: Node<871, depth=1, balance=0> # Interval(860, 917, 1) # Interval(860, 917, 2) # Interval(860, 917, 3) # Interval(860, 917, 4) # Interval(871, 917, 1) # Interval(871, 917, 2) # Interval(871, 917, 3) n = root.left_node = Node() n.x_center = 871 n.s_center = set(Interval(*tup) for tup in data[:7]) n.depth = 1 n.balance = 0 # >: Node<1047, depth=1, balance=0> # Interval(1047, 1064, 1) # Interval(1047, 1064, 2) n = root.right_node = Node() n.x_center = 1047 n.s_center = set(Interval(*tup) for tup in data[8:]) n.depth = 1 n.balance = 0 structure = root.print_structure(tostring=True) # root.print_structure() assert structure == """\ Node<961, depth=2, balance=0> Interval(961, 986, 1) <: Node<871, depth=1, balance=0> Interval(860, 917, 1) Interval(860, 917, 2) Interval(860, 917, 3) Interval(860, 917, 4) Interval(871, 917, 1) Interval(871, 917, 2) Interval(871, 917, 3) >: Node<1047, depth=1, balance=0> Interval(1047, 1064, 1) Interval(1047, 1064, 2) """ t.top_node = root t.verify() return t
def test_envelop_vs_overlap_queries():
    """envelop() and overlap() results side by side for the same ranges."""
    tree = IntervalTree.from_tuples(data.ivs1.data)

    # (begin, end, labels from envelop, labels from overlap)
    cases = [
        (4, 5, set(), {'[4,7)'}),
        (4, 6, set(), {'[4,7)', '[5,9)'}),
        (6, 10,
         {'[6,10)', '[8,10)'},
         {'[4,7)', '[5,9)', '[6,10)', '[8,10)', '[8,15)'}),
        (6, 11,
         {'[6,10)', '[8,10)'},
         {'[4,7)', '[5,9)', '[6,10)', '[8,10)', '[8,15)', '[10,12)'}),
    ]
    for begin, end, enveloped, overlapped in cases:
        assert match.set_data(tree.envelop(begin, end)) == enveloped
        assert match.set_data(tree.overlap(begin, end)) == overlapped
def removeGenusZeroLinks(LinkData):
    """
    Removes all links that are topologically equivalent to other links
    and do not contribute to genus.

    A link is dropped when it lies inside another link and both the stretch
    between the two start points and the stretch between the two end points
    are overlapped by exactly one link.

    Uses IntervalTree to find the links crossing a given link (the original
    author states O(N log N) vs. O(N^2) for N links).
    Requires the IntervalTree package.

    *Args*:
        LinkData:
            Nx2 links, rows (p1,p2)
    *Returns*:
        removal_linkData:
            The list of links retained
    """
    # Two independent trees: one to query, one to delete from.
    org_tree = IntervalTree.from_tuples(list(map(tuple, LinkData)))
    removal_tree = IntervalTree.from_tuples(list(map(tuple, LinkData)))

    for link in org_tree:
        for outer in org_tree[link[0]:link[1]]:
            # Only a different link that contains `link` is of interest.
            is_nested = (link[0] >= outer[0] and link[1] <= outer[1]
                         and link != outer)
            if not is_nested:
                continue
            # Links overlapping the two stretches between the end points.
            left_set = org_tree[outer[0]:link[0]]
            right_set = org_tree[link[1]:outer[1]]
            if len(right_set) == 1 and len(left_set) == 1:
                # Each side is overlapped by one link only (the outer one).
                removal_tree.remove(link)

    removal_linkData = [[kept[0], kept[1]] for kept in removal_tree]
    return removal_linkData


################################################################################
def test_merge_equals_reducer_with_initializer():
    """merge_equals() with a list initializer wraps every data value in a list."""
    def collect(acc, item):
        return acc + [item]

    # empty tree
    empty = IntervalTree()
    empty.merge_equals(data_reducer=collect, data_initializer=[])
    empty.verify()
    assert not empty

    # one interval: same range, data wrapped in a list
    single = IntervalTree.from_tuples([(1, 2, 'hello')])
    single.merge_equals(data_reducer=collect, data_initializer=[])
    single.verify()
    assert len(single) == 1
    assert sorted(single) == [Interval(1, 2, ['hello'])]

    # many intervals, no equal ranges: same ranges, each data value wrapped
    tree = IntervalTree.from_tuples(data.ivs1.data)
    orig = IntervalTree.from_tuples(data.ivs1.data)
    tree.merge_equals(data_reducer=collect, data_initializer=[])
    tree.verify()
    assert len(tree) == len(orig)
    assert tree != orig
    wrapped = [Interval(begin, end, [label])
               for begin, end, label in sorted(orig)]
    assert sorted(tree) == wrapped

    # many intervals plus one duplicate range: both labels share one list
    tree = IntervalTree.from_tuples(data.ivs1.data)
    orig = IntervalTree.from_tuples(data.ivs1.data)
    tree.addi(4, 7, 'foo')
    tree.merge_equals(data_reducer=collect, data_initializer=[])
    tree.verify()
    assert len(tree) == len(orig)
    assert tree != orig
    for stale in ('foo', '[4,7)'):
        assert not tree.containsi(4, 7, stale)
    assert tree.containsi(4, 7, ['[4,7)', 'foo'])