def test_random_intervals(self):
    """Cross-check IntervalTree queries against a brute-force scan.

    For many randomly generated interval lists, build a tree, query it
    with random regions, and verify that the tree reports as many hits as
    a linear scan does and that every hit found by the scan is present in
    the tree's answer.
    """
    num_trials = 1000
    coordinate_span = 10000
    queries_per_trial = 50
    interval_count_limit = 50

    for _trial in xrange(num_trials):
        # Each interval draws its length first and its start second, so the
        # interval always fits inside [0, coordinate_span).
        intervals = []
        for _idx in xrange(random.randrange(1, interval_count_limit)):
            length = random.randrange(coordinate_span / 10)
            start = random.randrange(coordinate_span - length)
            intervals.append(interval_tree.Interval(start, start + length))

        itree = interval_tree.IntervalTree(intervals)

        query_starts = random.sample(range(coordinate_span),
                                     queries_per_trial)
        for query_start in query_starts:
            query_end = query_start + random.randrange(
                coordinate_span - query_start)

            # Reference answer: half-open overlap test against every
            # interval in the list.
            expected = [candidate for candidate in intervals
                        if candidate.start < query_end
                        and query_start < candidate.stop]

            found = []
            itree.find_overlapping(query_start, query_end, found)

            self.assertEqual(len(expected), len(found))
            for candidate in expected:
                self.assertIn(candidate, found)
def test_boundaries(self):
    """Check that the IntervalTree treats intervals as half-open.

    Intervals are expected to behave as the usual [start, end) ranges: a
    query that begins exactly where an interval stops must not report it,
    while a query that begins on an interval's start must.
    """
    endpoints = [(5, 10), (5, 11), (10, 15), (20, 25)]
    itree = interval_tree.IntervalTree(
        [interval_tree.Interval(lo, hi) for lo, hi in endpoints])

    # Each case: (query start, query end, expected hits as (start, stop)
    # pairs in the order the tree is expected to report them).
    cases = [
        (25, 26, []),
        (20, 21, [(20, 25)]),
        (5, 10, [(5, 10), (5, 11)]),
        (10, 11, [(5, 11), (10, 15)]),
    ]

    for query_start, query_end, expected in cases:
        hits = []
        itree.find_overlapping(query_start, query_end, hits)
        self.assertEqual(len(hits), len(expected))
        for hit, (expected_start, expected_stop) in zip(hits, expected):
            self.assertEqual(hit.start, expected_start)
            self.assertEqual(hit.stop, expected_stop)
def generate_gff_records(interval_list, readers, ref_id, region_size_func,
                         untruncator):
    """Generator for Gff records for a ref_id.

    The reference is cut into consecutive regions of ``region_size`` bases
    (the final short region is merged into the one before it) and one GFF
    record summarising coverage is yielded per region. Interval lookups and
    coverage projections are done once per batch of regions rather than once
    per region.

    :param interval_list: a sequence of interval_tree.Intervals of alignments
                          to this reference
    :param readers: iterable of readers for the files containing the
                    alignments (e.g. CmpH5Reader or SamfileAdapter). Each is
                    queried with ``referenceInfo(ref_id)``; the first one
                    that does not raise KeyError supplies the reference
                    length and full name.
    :param ref_id: ID for this reference
    :param region_size_func: function from reference length to region size
    :param untruncator: dict that maps from truncated name to full name.
                        Currently unused: records are emitted with the
                        truncated name (see ticket 28667). Kept so callers
                        do not need to change.

    :yields: GffIO.Gff3Records
    :raises KeyError: if none of the readers knows ``ref_id``
    :raises ValueError: if the computed region size is 0, or if a region
                        straddles a batch boundary
    """
    # Look the reference up in each reader until one recognises it.
    for reader in readers:
        try:
            ref_info = reader.referenceInfo(ref_id)
            ref_length = ref_info.Length
            ref_full_name = ref_info.FullName
        except KeyError:
            continue
        break
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError; report the real problem instead.
        raise KeyError(
            'ref_id {r} not found in any reader'.format(r=str(ref_id)))

    # The identifier is the first whitespace-delimited token of the name.
    short_name = ref_full_name.split()[0]

    # Get the appropriate region size for this reference
    region_size = region_size_func(ref_length)
    if region_size == 0:
        # bug 25079 - /by0 err
        raise ValueError(
            'region_size == 0 for ref_id {r}'.format(r=str(ref_id)))

    log.debug("Chosen region size for reference {i} is {r}".format(
        i=ref_id, r=region_size))
    log.debug("reference {i} has full name {n} and length {L}".format(
        i=ref_id, n=ref_full_name, L=ref_length))

    itree = interval_tree.IntervalTree(interval_list)

    # To improve performance, we batch the interval lookups and projections
    # into ranges. Force true division so that math.ceil really rounds up
    # (integer division would floor first), and never allow zero regions per
    # batch, which would leave every batch empty.
    regions_per_batch = max(
        1, int(math.ceil(float(Constants.BATCH_SIZE) / region_size)))

    batch_start, batch_end = 0, 0
    batch_coverage_arr = None

    for region_start in xrange(0, ref_length, region_size):
        region_end = region_start + region_size

        # pbpy summarizeCoverage would merge the last region into the
        # penultimate region, so we do that here
        if region_end >= ref_length and region_start > 0:
            continue
        if region_end + region_size >= ref_length:
            region_end = ref_length

        # Check if we need to step to the next batch
        if region_end > batch_end:
            if region_start < batch_end:
                raise ValueError("A region overlaps a batch, which should not "
                                 "happen.")
            batch_start = region_start
            batch_end = region_size * regions_per_batch + batch_end
            # Stretch the batch to the end of the reference when what is
            # left is no more than one region, so the merged final region
            # fits inside a single batch.
            if ref_length - region_size <= batch_end:
                batch_end = ref_length

            log.debug("Processing batch ({s}, {e})".format(s=batch_start,
                                                           e=batch_end))
            overlapping_intervals = []
            itree.find_overlapping(batch_start, batch_end,
                                   overlapping_intervals)
            batch_coverage_arr = project_into_region(overlapping_intervals,
                                                     batch_start, batch_end)

        # Slice this region's coverage out of the batch-level array.
        region_start_in_batch = region_start - batch_start
        region_end_in_batch = region_end - batch_start
        region_coverage_arr = batch_coverage_arr[
            region_start_in_batch:region_end_in_batch]

        gff_attributes = get_attributes_from_coverage(region_coverage_arr)

        # Note the region_start + 1. GFF is 1-based and uses closed intervals
        # XXX using truncated name (identifier field), see ticket 28667;
        # the full-name alternative would be
        # untruncator.get(ref_full_name, ref_full_name).
        gff_record = GffIO.Gff3Record(
            short_name,
            region_start + 1, region_end,
            "region", score='0.00', strand='+', attributes=gff_attributes)

        yield gff_record