def write_variant_to_tempfile(self, variant): output_path = test_utils.test_tmpfile('test.vcf') header = variants_pb2.VcfHeader( contigs=[reference_pb2.ContigInfo(name='20')], sample_names=[call.call_set_name for call in variant.calls], formats=[ variants_pb2.VcfFormatInfo(id='DP', number='1', type='Integer', description='Read depth'), variants_pb2.VcfFormatInfo( id='AD', number='R', type='Integer', description='Read depth for each allele') ]) writer = vcf.VcfWriter(output_path, header=header) with writer: writer.write(variant) return output_path
def __init__(self, chromosomes): """Initializes an InMemoryFastaReader using data from chromosomes. Args: chromosomes: list[tuple]. The chromosomes we are caching in memory as a list of tuples. Each tuple must be exactly three elements in length, containing (chromosome name [str], start [int], bases [str]). Raises: ValueError: If any of the chromosomes tuples are invalid. """ super(InMemoryFastaReader, self).__init__() ref_seqs = [] contigs = [] for i, (contig_name, start, bases) in enumerate(chromosomes): if start < 0: raise ValueError( 'start={} must be >= for chromosome={}'.format( start, contig_name)) if not bases: raise ValueError( 'Bases must contain at least one base, but got "{}"'. format(bases)) end = start + len(bases) ref_seqs.append( reference_pb2.ReferenceSequence(region=ranges.make_range( contig_name, start, end), bases=bases)) contigs.append( reference_pb2.ContigInfo(name=contig_name, n_bases=end, pos_in_fasta=i)) self._reader = in_memory_fasta_reader.InMemoryFastaReader.create( contigs, ref_seqs) self.header = RefFastaHeader(contigs=self._reader.contigs)
def __init__(self, chromosomes): """Initializes an InMemoryRefReader using data from chromosomes. Args: chromosomes: List[tuple]. The chromosomes we are caching in memory as a list of tuples. Each tuple must be exactly three string elements in length, containing (chromosome name, start, bases). Raises: ValueError: If any of the InMemoryChromosome are invalid. """ super(InMemoryRefReader, self).__init__() self._chroms = {} contigs = [] for i, (contig_name, start, bases) in enumerate(chromosomes): if start < 0: raise ValueError( 'start={} must be >= for chromosome={}'.format( start, contig_name)) if contig_name in self._chroms: raise ValueError( 'Duplicate chromosome={} detect'.format(contig_name)) if not bases: raise ValueError( 'Bases must contain at least one base, but got "{}"'. format(bases)) end = start + len(bases) self._chroms[contig_name] = _InMemoryChromosome(start, end, bases) contigs.append( reference_pb2.ContigInfo(name=contig_name, n_bases=end, pos_in_fasta=i)) self.header = RefFastaHeader(contigs=contigs)
"""Tests for vcf_reader CLIF python wrappers.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function from absl.testing import absltest from nucleus.io.python import vcf_reader from nucleus.protos import reference_pb2 from nucleus.protos import variants_pb2 from nucleus.testing import test_utils from nucleus.util import ranges expected_sites_contigs = [ reference_pb2.ContigInfo(name='chr1', pos_in_fasta=0, n_bases=248956422), reference_pb2.ContigInfo(name='chr2', pos_in_fasta=1, n_bases=242193529), reference_pb2.ContigInfo(name='chr3', pos_in_fasta=2, n_bases=198295559), reference_pb2.ContigInfo(name='chr4', pos_in_fasta=3, n_bases=190214555), reference_pb2.ContigInfo(name='chr5', pos_in_fasta=4, n_bases=181538259), reference_pb2.ContigInfo(name='chr6', pos_in_fasta=5, n_bases=170805979), reference_pb2.ContigInfo(name='chr7', pos_in_fasta=6, n_bases=159345973), reference_pb2.ContigInfo(name='chr8', pos_in_fasta=7, n_bases=145138636), reference_pb2.ContigInfo(name='chr9', pos_in_fasta=8, n_bases=138394717), reference_pb2.ContigInfo(name='chr10', pos_in_fasta=9, n_bases=133797422), reference_pb2.ContigInfo(name='chr11', pos_in_fasta=10, n_bases=135086622), reference_pb2.ContigInfo(name='chr12', pos_in_fasta=11, n_bases=133275309), reference_pb2.ContigInfo(name='chr13', pos_in_fasta=12, n_bases=114364328), reference_pb2.ContigInfo(name='chr14', pos_in_fasta=13, n_bases=107043718), reference_pb2.ContigInfo(name='chr15', pos_in_fasta=14, n_bases=101991189), reference_pb2.ContigInfo(name='chr16', pos_in_fasta=15, n_bases=90338345),
class RangesTests(parameterized.TestCase): def test_ranges_overlaps(self): def check_overlaps(chr1, start1, end1, chr2, start2, end2, expected): i1 = ranges.make_range(chr1, start1, end1) i2 = ranges.make_range(chr2, start2, end2) self.assertEqual(ranges.ranges_overlap(i1, i2), expected) self.assertEqual(ranges.ranges_overlap(i2, i1), expected) check_overlaps('chr1', 0, 3, 'chr1', 4, 10, False) check_overlaps('chr1', 0, 3, 'chr1', 3, 10, False) check_overlaps('chr1', 0, 3, 'chr1', 2, 10, True) check_overlaps('chr1', 0, 3, 'chr1', 1, 10, True) check_overlaps('chr1', 0, 3, 'chr1', 0, 10, True) check_overlaps('chr1', 0, 3, 'chr1', 0, 1, True) check_overlaps('chr1', 0, 3, 'chr1', 0, 2, True) check_overlaps('chr1', 0, 3, 'chr1', 0, 3, True) check_overlaps('chr1', 0, 3, 'chr1', 1, 2, True) check_overlaps('chr1', 0, 3, 'chr1', 1, 3, True) check_overlaps('chr1', 0, 3, 'chr1', 2, 3, True) check_overlaps('chr1', 0, 3, 'chr1', 3, 3, False) check_overlaps('chr1', 1, 3, 'chr1', 0, 4, True) check_overlaps('chr1', 1, 3, 'chr1', 1, 4, True) def test_detector_no_ranges(self): range_set = ranges.RangeSet() # don't have any ranges by default self.assertEqual(bool(range_set), False) # make sure we can call overlaps without any ranges self.assertFalse(range_set.overlaps('chr1', 10)) def test_from_regions_not_empty(self): literals = ['chr1', 'chr2:10-20'] self.assertItemsEqual([ ranges.make_range('chr1', 0, 10), ranges.make_range('chr2', 9, 20) ], ranges.RangeSet.from_regions( literals, ranges.contigs_dict(_TEST_CONTIGS))) def test_from_regions_empty_literals(self): range_set = ranges.RangeSet.from_regions([]) # The set is empty. self.assertItemsEqual([], range_set) self.assertFalse(range_set) def test_unrecognized_contig_triggers_exception(self): with self.assertRaises(ValueError): _ = ranges.RangeSet([ranges.make_range('bogus_chromosome', 1, 10)], _TEST_CONTIGS) @parameterized.parameters( # Overlapping intervals get merged. (['1:1-5', '1:3-8'], ['1:1-8']), (['1:1-5', '1:3-8', '1:6-9'], ['1:1-9']), # Adjacent intervals are merged. (['1:1-5', '1:5-8'], ['1:1-8']), (['1:1-5', '1:5-8', '1:8-10'], ['1:1-10']), # Sanity check that non-overlapping aren't merged. (['1:1-5', '1:6-8'], ['1:1-5', '1:6-8']), ) def test_overlapping_and_adjacent_ranges_are_merged( self, regions, expected): self.assertCountEqual(ranges.RangeSet.from_regions(expected), ranges.RangeSet.from_regions(regions)) def test_detector_ranges(self): test_ranges = [ ranges.make_range('chr1', 0, 5), ranges.make_range('chr1', 8, 10), ranges.make_range('chr1', 12, 13), ranges.make_range('chr2', 2, 5), ] range_set = ranges.RangeSet(test_ranges) self.assertEqual(bool(range_set), True) self.assertEqual(len(range_set), 4) self.assertEqual(range_set.overlaps('chr1', 0), True) self.assertEqual(range_set.overlaps('chr1', 1), True) self.assertEqual(range_set.overlaps('chr1', 2), True) self.assertEqual(range_set.overlaps('chr1', 3), True) self.assertEqual(range_set.overlaps('chr1', 4), True) self.assertEqual(range_set.overlaps('chr1', 5), False) self.assertEqual(range_set.overlaps('chr1', 6), False) self.assertEqual(range_set.overlaps('chr1', 7), False) self.assertEqual(range_set.overlaps('chr1', 8), True) self.assertEqual(range_set.overlaps('chr1', 9), True) self.assertEqual(range_set.overlaps('chr1', 10), False) self.assertEqual(range_set.overlaps('chr1', 11), False) self.assertEqual(range_set.overlaps('chr1', 12), True) self.assertEqual(range_set.overlaps('chr1', 13), False) self.assertEqual(range_set.overlaps('chr1', 100), False) self.assertEqual(range_set.overlaps('chr1', 1000), False) self.assertEqual(range_set.overlaps('chr2', 0), False) self.assertEqual(range_set.overlaps('chr2', 1), False) self.assertEqual(range_set.overlaps('chr2', 2), True) self.assertEqual(range_set.overlaps('chr2', 3), True) self.assertEqual(range_set.overlaps('chr2', 4), True) self.assertEqual(range_set.overlaps('chr2', 5), False) self.assertEqual(range_set.overlaps('chr2', 6), False) self.assertEqual(range_set.overlaps('chr3', 3), False) def test_overlaps_variant_with_ranges(self): variant = variants_pb2.Variant(reference_name='chr2', start=10, end=11) range_set = ranges.RangeSet([ranges.make_range('chr1', 0, 5)]) with mock.patch.object(range_set, 'overlaps') as mock_overlaps: mock_overlaps.return_value = True self.assertEqual(range_set.variant_overlaps(variant), True) mock_overlaps.assert_called_once_with('chr2', 10) def test_overlaps_variant_empty_range(self): variant = variants_pb2.Variant(reference_name='chr2', start=10, end=11) empty_set = ranges.RangeSet() self.assertEqual( empty_set.variant_overlaps(variant, empty_set_return_value='foo'), 'foo') def test_envelops(self): start_ix = 5 end_ix = 10 start_ix2 = end_ix + 1 end_ix2 = end_ix + 5 range_set = ranges.RangeSet([ ranges.make_range('chr1', start_ix, end_ix), ranges.make_range('chr1', start_ix2, end_ix2) ]) # No start position before the first start range is enveloped. for i in range(start_ix): self.assertFalse(range_set.envelops('chr1', i, start_ix + 1)) # All regions within a single record are enveloped. for six in range(start_ix, end_ix): for eix in range(six, end_ix + 1): self.assertTrue(range_set.envelops('chr1', six, eix), 'chr1 {} {} not enveloped'.format(six, eix)) # Bridging across two ranges is not enveloped. for six in range(start_ix, end_ix): for eix in range(start_ix2, end_ix2 + 1): self.assertFalse(range_set.envelops('chr1', six, eix)) # Other chromosome is not spanned. self.assertFalse(range_set.envelops('chr2', start_ix, start_ix + 1)) @parameterized.parameters( (ranges.make_range('1', 10, 50), '1', 9, False), (ranges.make_range('1', 10, 50), '1', 10, True), (ranges.make_range('1', 10, 50), '2', 10, False), (ranges.make_range('1', 10, 50), '1', 30, True), (ranges.make_range('1', 10, 50), '2', 30, False), (ranges.make_range('1', 10, 50), '1', 49, True), (ranges.make_range('1', 10, 50), '1', 50, False), (ranges.make_range('1', 10, 50), '1', 51, False), ) def test_position_overlaps(self, interval, chrom, pos, expected): self.assertEqual(ranges.position_overlaps(chrom, pos, interval), expected) def test_make_position(self): self.assertEqual( ranges.make_position('chr1', 10), position_pb2.Position(reference_name='chr1', position=10, reverse_strand=False)) self.assertEqual( ranges.make_position('chr2', 100, reverse_strand=True), position_pb2.Position(reference_name='chr2', position=100, reverse_strand=True)) def test_make_range(self): interval = ranges.make_range('chr1', 1, 10) self.assertEqual(interval.reference_name, 'chr1') self.assertEqual(interval.start, 1) self.assertEqual(interval.end, 10) def test_to_literal(self): self.assertEqual(ranges.to_literal(ranges.make_range('chr1', 0, 20)), 'chr1:1-20') @parameterized.parameters(['chr1', '1', 'MT', 'chrM', 'chrX', 'X', 'Y']) def test_parse_literal_chromosomes(self, chrom): self.assertEqual(ranges.parse_literal(chrom + ':1-20'), ranges.make_range(chrom, 0, 20)) @parameterized.parameters( ('chr1:{}-{}'.format(start_str, end_str), start_val, end_val) for start_str, start_val in [('12', 11), ('1,234', 1233)] for end_str, end_val in [('56789', 56789), ('56,789', 56789)]) def test_parse_literal_numerics(self, literal, start_val, end_val): self.assertEqual(ranges.parse_literal(literal), ranges.make_range('chr1', start_val, end_val)) def test_parse_literal_one_bp(self): self.assertEqual(ranges.parse_literal('1:10'), ranges.make_range('1', 9, 10)) self.assertEqual(ranges.parse_literal('1:100'), ranges.make_range('1', 99, 100)) self.assertEqual(ranges.parse_literal('1:1,000'), ranges.make_range('1', 999, 1000)) @parameterized.parameters(['x', 'chr1', 'chr1:', 'chr1:10-', 'chr1:-1-10']) def test_parse_literal_bad(self, bad_literal): with self.assertRaisesRegexp(ValueError, bad_literal): ranges.parse_literal(bad_literal) @parameterized.parameters('test.bed', 'test.bed.gz') def test_from_bed(self, bed_filename): source = test_utils.genomics_core_testdata(bed_filename) self.assertCountEqual([ ranges.make_range('chr1', 1, 10), ranges.make_range('chr2', 20, 30), ranges.make_range('chr2', 40, 60), ranges.make_range('chr3', 80, 90), ], ranges.RangeSet.from_bed(source)) @parameterized.parameters( dict(regions=[], expected=[]), dict(regions=['chr1:10-20'], expected=[ranges.make_range('chr1', 9, 20)]), dict(regions=['test.bed'], expected=_TEST_BED_REGIONS), dict(regions=['test.bed', 'test.bed'], expected=_TEST_BED_REGIONS + _TEST_BED_REGIONS), dict(regions=['chr1:10-20', 'test.bed'], expected=[ranges.make_range('chr1', 9, 20)] + _TEST_BED_REGIONS), dict(regions=['test.bed', 'chr1:10-20'], expected=_TEST_BED_REGIONS + [ranges.make_range('chr1', 9, 20)]), dict(regions=['chr1:9-19', 'test.bed', 'chr1:10-20'], expected=([ranges.make_range('chr1', 8, 19)] + _TEST_BED_REGIONS + [ranges.make_range('chr1', 9, 20)])), ) def test_from_regions(self, regions, expected): # For convenience we allow 'test.bed' in our regions but the actual file # path is in our testdata directory. for i in range(len(regions)): if regions[i] == 'test.bed': regions[i] = test_utils.genomics_core_testdata('test.bed') self.assertEqual(list(ranges.from_regions(regions)), expected) @parameterized.parameters( # Intersection with 1, 2, 3 identical RangeSets produces the original set. ([['1:1-10']], ['1:1-10']), ([['1:1-10'], ['1:1-10']], ['1:1-10']), ([['1:1-10'], ['1:1-10'], ['1:1-10']], ['1:1-10']), # Test some simple overlap configurations. ([['1:1-10'], ['1:11-15']], []), ([['1:1-10'], ['1:10-15']], ['1:10']), ([['1:1-10'], ['1:9-15']], ['1:9-10']), ([['1:5-10'], ['1:1-15']], ['1:5-10']), ([['1:5-10'], ['1:1-4']], []), ([['1:5-10'], ['1:1-5']], ['1:5']), # Check cutting a single interval into multiple pieces. ([['1:5-15'], ['1:6-8', '1:10-12']], ['1:6-8', '1:10-12']), ([['1:5-15'], ['1:3-8', '1:10-12']], ['1:5-8', '1:10-12']), ([['1:5-15'], ['1:3-8', '1:10-20']], ['1:5-8', '1:10-15']), # We have multiple overlapping intervals; make sure we merge intervals. ([['1:5-15'], ['1:3-8', '1:6-10']], ['1:5-10']), ([['1:5-15'], ['1:3-8', '1:6-10', '1:13']], ['1:5-10', '1:13']), # Check that multiple intervals work. ([['1:5-15', '1:20-25'], ['1:3-8', '1:16-23']], ['1:5-8', '1:20-23']), ([['1:5-15', '1:20-25'], ['1:3-8', '1:50-60']], ['1:5-8']), ([['1:5-15', '1:20-25'], ['1:3-4', '1:16-23']], ['1:20-23']), # Check that multiple sets can be intersected. ([['1:10-20'], ['1:5-15']], ['1:10-15']), ([['1:10-20'], ['1:5-15'], ['1:13-30']], ['1:13-15']), ([['1:10-20'], ['1:5-15'], ['1:25-30']], []), # Check that different chromosomes are kept separate. ([['1:10-20'], ['2:10-20']], []), ([['1:10-20', '2:11-14'], ['1:11-14']], ['1:11-14']), ([['1:10-20', '2:11-14'], ['2:10-20']], ['2:11-14']), ) def test_intersection(self, regions, expected): regions_list = [ranges.RangeSet.from_regions(r) for r in regions] copies = [ranges.RangeSet(rs) for rs in regions_list] # Check that the intersection is as expected. self.assertCountEqual(ranges.RangeSet.from_regions(expected), regions_list[0].intersection(*regions_list[1:])) # Check that no one was modified. for pre, post in zip(copies, regions_list): self.assertCountEqual(pre, post) @parameterized.parameters( dict(lhs=['1:1-100'], rhs=['1:10-20'], expected=['1:1-9', '1:21-100']), dict(lhs=['1:1-100'], rhs=[], expected=['1:1-100']), dict(lhs=['1:1-100', '2:1-10'], rhs=['2:1-100'], expected=['1:1-100']), dict(lhs=['1:1-100'], rhs=['1:10-20', '1:15-30'], expected=['1:1-9', '1:31-100']), dict(lhs=['1:1-100'], rhs=['1:10-20', '1:30-40'], expected=['1:1-9', '1:21-29', '1:41-100']), # Excluding regions not in lhs has no impact. dict(lhs=['1:1-100'], rhs=['2:1-100'], expected=['1:1-100']), # Check that excluding the whole region results in an empty RangeSet. dict(lhs=['1:1-100'], rhs=['1:1-100'], expected=[]), # An empty tree remains empty. dict(lhs=[], rhs=['1:1-100'], expected=[]), ) def test_exclude_regions(self, lhs, rhs, expected): lhs = ranges.RangeSet.from_regions(lhs) rhs = ranges.RangeSet.from_regions(rhs) # Mutating operation returns None. self.assertIsNone(lhs.exclude_regions(rhs)) self.assertCountEqual(ranges.RangeSet.from_regions(expected), lhs) @parameterized.parameters(('chr1', ranges.make_range('chr1', 0, 10)), ('chr2', ranges.make_range('chr2', 0, 5))) def test_parse_literal_with_contig_map(self, contig_name, expected): contig_map = { 'chr1': reference_pb2.ContigInfo(name='chr1', n_bases=10), 'chr2': reference_pb2.ContigInfo(name='chr2', n_bases=5), } self.assertEqual( ranges.parse_literal(contig_name, contig_map=contig_map), expected) @parameterized.parameters(['x', 'chr1:', 'chr1:10-', 'chr1:-1-10']) def test_parse_literal_with_contig_map_and_bad_input_raises_exception( self, bad_literal): with self.assertRaises(ValueError): ranges.parse_literal(bad_literal, contig_map={ 'chr1': reference_pb2.ContigInfo(name='chr1', n_bases=10) }) def test_from_contigs(self): contigs = [ reference_pb2.ContigInfo(name='chr1', n_bases=10), reference_pb2.ContigInfo(name='chr2', n_bases=5), ] self.assertCountEqual([ ranges.make_range('chr1', 0, 10), ranges.make_range('chr2', 0, 5), ], ranges.RangeSet.from_contigs(contigs)) @parameterized.parameters( # Chop our contigs into 50 bp pieces. (50, [('chr1', 0, 50), ('chr1', 50, 76), ('chr2', 0, 50), ('chr2', 50, 100), ('chr2', 100, 121), ('chrM', 0, 50), ('chrM', 50, 100)]), # Chop our contigs in 120 bp pieces, leaving a 1 bp fragment in chr2. (120, [('chr1', 0, 76), ('chr2', 0, 120), ('chr2', 120, 121), ('chrM', 0, 100)]), # A 500 max size spans each of our contigs fully. (500, [('chr1', 0, 76), ('chr2', 0, 121), ('chrM', 0, 100)]), ) def test_partitions(self, interval_size, expected): rangeset = ranges.RangeSet([ ranges.make_range('chrM', 0, 100), ranges.make_range('chr1', 0, 76), ranges.make_range('chr2', 0, 121), ]) self.assertEqual([ranges.make_range(*args) for args in expected], list(rangeset.partition(interval_size))) def test_partitions_bad_interval_size_raises(self): # list() is necessary to force the generator to execute. with self.assertRaisesRegexp(ValueError, 'max_size'): list( ranges.RangeSet([ranges.make_range('chrM', 0, 100)]).partition(-10)) with self.assertRaisesRegexp(ValueError, 'max_size'): list( ranges.RangeSet([ranges.make_range('chrM', 0, 100)]).partition(0)) @parameterized.parameters( (10, [('1', 0, 10), ('1', 20, 30), ('1', 30, 40), ('1', 45, 50)]), (7, [('1', 0, 7), ('1', 7, 10), ('1', 20, 27), ('1', 27, 34), ('1', 34, 40), ('1', 45, 50)]), (50, [('1', 0, 10), ('1', 20, 40), ('1', 45, 50)]), ) def test_partition_of_multiple_intervals(self, interval_size, expected): rangeset = ranges.RangeSet([ ranges.make_range('1', 0, 10), ranges.make_range('1', 20, 40), ranges.make_range('1', 45, 50), ]) self.assertCountEqual([ranges.make_range(*args) for args in expected], rangeset.partition(interval_size)) def test_bed_parser(self): test_bed_path = test_utils.test_tmpfile( 'test_bed_parser.bed', '\n'.join([ 'chr20\t61724611\t61725646', 'chr20\t61304163\t61305182', 'chr20\t61286467\t61286789' ])) self.assertEqual(list(ranges.bed_parser(test_bed_path)), [ ranges.make_range('chr20', 61724611, 61725646), ranges.make_range('chr20', 61304163, 61305182), ranges.make_range('chr20', 61286467, 61286789), ]) def test_bedpe_parser(self): # pylint: disable=line-too-long data = '\n'.join([ 'chr20\t25763416\t25765517\tchr20\t25825181\t25826882\tP2_PM_20_1549\t63266\t+\tTYPE:DELETION', 'chr20\t25972820\t25972991\tchr20\t26045347\t26045538\tP2_PM_20_696\t72548\t+\tTYPE:DELETION', 'chr20\t23719873\t23721974\tchr20\t23794822\t23796523\tP2_PM_20_1548\t76450\t+\tTYPE:DELETION', ]) test_bedpe_path = test_utils.test_tmpfile('test_bedpe_parser.bedpe', data) self.assertEqual(list(ranges.bedpe_parser(test_bedpe_path)), [ ranges.make_range('chr20', 25763416, 25826882), ranges.make_range('chr20', 25972820, 26045538), ranges.make_range('chr20', 23719873, 23796523), ]) def test_bedpe_parser_skips_cross_chr_events(self): # pylint: disable=line-too-long data = '\n'.join([ 'chr20\t25763416\t25765517\tchr21\t25825181\t25826882\tP2_PM_20_1549\t63266\t+\tTYPE:DELETION', 'chr20\t25972820\t25972991\tchr20\t26045347\t26045538\tP2_PM_20_696\t72548\t+\tTYPE:DELETION', 'chr20\t23719873\t23721974\tchr20\t23794822\t23796523\tP2_PM_20_1548\t76450\t+\tTYPE:DELETION', ]) test_bedpe_path = test_utils.test_tmpfile('test_bedpe_parser2.bedpe', data) self.assertEqual(list(ranges.bedpe_parser(test_bedpe_path)), [ ranges.make_range('chr20', 25972820, 26045538), ranges.make_range('chr20', 23719873, 23796523), ]) def test_contigs_n_bases(self): c1 = reference_pb2.ContigInfo(name='c', n_bases=100, pos_in_fasta=0) c2 = reference_pb2.ContigInfo(name='a', n_bases=50, pos_in_fasta=1) c3 = reference_pb2.ContigInfo(name='b', n_bases=25, pos_in_fasta=2) self.assertEqual(100, ranges.contigs_n_bases([c1])) self.assertEqual(50, ranges.contigs_n_bases([c2])) self.assertEqual(25, ranges.contigs_n_bases([c3])) self.assertEqual(150, ranges.contigs_n_bases([c1, c2])) self.assertEqual(125, ranges.contigs_n_bases([c1, c3])) self.assertEqual(175, ranges.contigs_n_bases([c1, c2, c3])) def test_rangeset_iteration_order(self): contigs = [ reference_pb2.ContigInfo(name='c', n_bases=100, pos_in_fasta=0), reference_pb2.ContigInfo(name='b', n_bases=121, pos_in_fasta=2), reference_pb2.ContigInfo(name='a', n_bases=76, pos_in_fasta=1), ] unsorted = ranges.parse_literals( ['a:10', 'c:20', 'b:30', 'b:10-15', 'a:5']) # Iteration order over a RangeSet instantiated with a contigs list is # determined by pos_in_fasta, start, end. range_set_with_contigs = ranges.RangeSet(unsorted, contigs) self.assertEqual( ranges.parse_literals(['c:20', 'a:5', 'a:10', 'b:10-15', 'b:30']), [range_ for range_ in range_set_with_contigs]) # For a RangeSet instantiated *without* a contig map, the iteration order # is determined by reference_name, start, end. range_set_no_contigs = ranges.RangeSet(unsorted) self.assertEqual( ranges.parse_literals(['a:5', 'a:10', 'b:10-15', 'b:30', 'c:20']), [range_ for range_ in range_set_no_contigs]) def test_sort_ranges(self): contigs = [ reference_pb2.ContigInfo(name='c', n_bases=100, pos_in_fasta=0), reference_pb2.ContigInfo(name='a', n_bases=76, pos_in_fasta=1), reference_pb2.ContigInfo(name='b', n_bases=121, pos_in_fasta=2), ] unsorted = ranges.parse_literals( ['a:10', 'c:20', 'b:30', 'b:10-15', 'b:10', 'a:5']) # Without contigs we sort the contigs by name lexicographically. self.assertEqual( ranges.parse_literals( ['a:5', 'a:10', 'b:10', 'b:10-15', 'b:30', 'c:20']), ranges.sorted_ranges(unsorted)) # With contigs we sort by the position of the contigs themselves. self.assertEqual( ranges.parse_literals( ['c:20', 'a:5', 'a:10', 'b:10', 'b:10-15', 'b:30']), ranges.sorted_ranges(unsorted, contigs)) @parameterized.parameters( (ranges.make_range('1', 0, 10), ranges.make_range('2', 0, 10), 0), (ranges.make_range('1', 0, 10), ranges.make_range('1', 10, 20), 0), (ranges.make_range('1', 0, 10), ranges.make_range('1', 100, 200), 0), (ranges.make_range('1', 10, 10), ranges.make_range('1', 0, 20), 0), (ranges.make_range('1', 0, 100), ranges.make_range('1', 50, 99), 49), # Check that the overlap handles a few key edge cases. (ranges.make_range('1', 0, 10), ranges.make_range('1', 0, 1), 1), (ranges.make_range('1', 0, 10), ranges.make_range('1', 0, 2), 2), (ranges.make_range('1', 1, 10), ranges.make_range('1', 0, 1), 0), ) def test_overlap_len(self, region_1, region_2, expected_overlap): """Test ReadAssigner.overlap_len().""" self.assertEqual(expected_overlap, ranges.overlap_len(region_1, region_2)) self.assertEqual(expected_overlap, ranges.overlap_len(region_2, region_1)) @parameterized.parameters( # No search_regions produces None. dict(query_range=ranges.make_range('1', 20, 30), search_ranges=[], expected=None), # Read overlaps with none of the ranges returns None. dict(query_range=ranges.make_range('1', 20, 30), search_ranges=[ ranges.make_range('1', 0, 10), ranges.make_range('1', 5, 10) ], expected=None), # Read has longer overlap with the first range. dict(query_range=ranges.make_range('1', 4, 10), search_ranges=[ ranges.make_range('1', 0, 10), ranges.make_range('1', 5, 10) ], expected=0), # Read has longer overlap with the second range. dict(query_range=ranges.make_range('1', 9, 20), search_ranges=[ ranges.make_range('1', 0, 10), ranges.make_range('1', 5, 15) ], expected=1), # Read has the maximum overlap with the third range. dict(query_range=ranges.make_range('1', 9, 20), search_ranges=[ ranges.make_range('1', 0, 10), ranges.make_range('1', 0, 15), ranges.make_range('1', 5, 20) ], expected=2), # Read has the maximum overlap with the middle range. dict(query_range=ranges.make_range('1', 5, 13), search_ranges=[ ranges.make_range('1', 0, 10), ranges.make_range('1', 0, 15), ranges.make_range('1', 10, 20) ], expected=1), # Read has a different reference_name with other ranges. dict(query_range=ranges.make_range('2', 0, 10), search_ranges=[ ranges.make_range('1', 0, 10), ranges.make_range('2', 5, 15), ranges.make_range('3', 0, 10) ], expected=1), # Read has equal overlap in two ranges. dict(query_range=ranges.make_range('1', 5, 15), search_ranges=[ ranges.make_range('1', 0, 10), ranges.make_range('1', 10, 20), ranges.make_range('1', 12, 20) ], expected=0), ) def test_find_max_overlapping(self, query_range, search_ranges, expected): actual = ranges.find_max_overlapping(query_range, search_ranges) self.assertEqual(expected, actual) def test_find_max_overlapping_allows_unordered_search_ranges(self): query_range = ranges.make_range('1', 4, 12) search_ranges = [ ranges.make_range('1', 0, 10), ranges.make_range('1', 10, 20), ranges.make_range('1', 12, 20) ] max_overlapping_range = search_ranges[0] for permutated_ranges in itertools.permutations(search_ranges): self.assertEqual( permutated_ranges.index(max_overlapping_range), ranges.find_max_overlapping(query_range, permutated_ranges)) def test_find_max_overlapping_returns_least_index(self): query_range = ranges.make_range('1', 0, 10) search_ranges = [ ranges.make_range('1', 0, 5), ranges.make_range('1', 5, 10) ] for to_search in [search_ranges, list(reversed(search_ranges))]: self.assertEqual( 0, ranges.find_max_overlapping(query_range, to_search)) @parameterized.parameters( dict( regions=[ ranges.make_range('1', 1, 10), ], expected_span=ranges.make_range('1', 1, 10), ), dict( regions=[ ranges.make_range('1', 1, 10), ranges.make_range('1', 10, 100), ], expected_span=ranges.make_range('1', 1, 100), ), dict( regions=[ ranges.make_range('1', 1, 10), ranges.make_range('1', 10, 100), ranges.make_range('1', 2, 20), ], expected_span=ranges.make_range('1', 1, 100), ), # potential edge cases: # same start, different ends. dict( regions=[ ranges.make_range('1', 1, 10), ranges.make_range('1', 1, 100), ], expected_span=ranges.make_range('1', 1, 100), ), # same end, different starts. dict( regions=[ ranges.make_range('1', 1, 10), ranges.make_range('1', 2, 10), ], expected_span=ranges.make_range('1', 1, 10), ), ) def test_span_computes_span_correctly(self, regions, expected_span): for permutation in itertools.permutations(regions, len(regions)): self.assertEqual(expected_span, ranges.span(permutation)) @parameterized.parameters( dict(regions=[], regexp='empty'), dict(regions=[ ranges.make_range('1', 0, 2), ranges.make_range('2', 0, 2), ], regexp='regions must be all on the same contig'), dict(regions=[ ranges.make_range('1', 0, 2), ranges.make_range('1', 0, 3), ranges.make_range('2', 0, 2), ], regexp='regions must be all on the same contig'), ) def test_span_raises_on_bad_input(self, regions, regexp): with self.assertRaisesRegexp(ValueError, regexp): ranges.span(regions) @parameterized.parameters( dict( region=ranges.make_range('1', 10, 20), n_bp=n_bp, contig_map=None, expected=ranges.make_range('1', 10 - n_bp, 20 + n_bp), ) for n_bp in range(10)) def test_expand_is_correct(self, region, n_bp, contig_map, expected): self.assertEqual(expected, ranges.expand(region, n_bp, contig_map)) @parameterized.parameters( # Check that we don't create Ranges with negative starts. dict( region=ranges.make_range('1', 10, 20), n_bp=20, contig_map=None, expected=ranges.make_range('1', 0, 40), ), # Check that we respect n_bp if contig_map is provided. dict( region=ranges.make_range('1', 10, 20), n_bp=40, contig_map={ '1': reference_pb2.ContigInfo(name='1', n_bases=50), }, expected=ranges.make_range('1', 0, 50), ), ) def test_expand_handles_boundaries(self, region, n_bp, contig_map, expected): self.assertEqual(expected, ranges.expand(region, n_bp, contig_map)) def test_expand_raises_on_negative_n_bp(self): with self.assertRaisesRegexp(ValueError, 'n_bp must be >= 0 but got -10'): ranges.expand(ranges.make_range('1', 10, 20), -10) def test_expand_raises_with_missing_contig_in_map(self): # Empty contig_map should raise. with self.assertRaises(KeyError): ranges.expand(ranges.make_range('1', 10, 20), 1, contig_map={}) # Missing '1' from the contig map should raise. with self.assertRaises(KeyError): ranges.expand(ranges.make_range('1', 10, 20), 1, contig_map={ '2': reference_pb2.ContigInfo(name='2', n_bases=50), }) @parameterized.parameters( dict( region=ranges.make_range(chrom, start, start + length), expected_length=length, ) for length in range(10) for start in [10, 20, 1000] for chrom in ['1', '20']) def test_length_is_correct(self, region, expected_length): self.assertEqual(expected_length, ranges.length(region))
from nucleus.protos import position_pb2 from nucleus.protos import reference_pb2 from nucleus.protos import variants_pb2 from nucleus.testing import test_utils from nucleus.util import ranges _TEST_BED_REGIONS = [ ranges.make_range('chr1', 1, 10), ranges.make_range('chr2', 20, 30), ranges.make_range('chr2', 40, 60), ranges.make_range('chr3', 80, 90), ] _TEST_CONTIGS = [ reference_pb2.ContigInfo(name='chr1', n_bases=10, pos_in_fasta=0), reference_pb2.ContigInfo(name='chr2', n_bases=100, pos_in_fasta=1), reference_pb2.ContigInfo(name='chr3', n_bases=500, pos_in_fasta=2), ] class RangesTests(parameterized.TestCase): def test_ranges_overlaps(self): def check_overlaps(chr1, start1, end1, chr2, start2, end2, expected): i1 = ranges.make_range(chr1, start1, end1) i2 = ranges.make_range(chr2, start2, end2) self.assertEqual(ranges.ranges_overlap(i1, i2), expected) self.assertEqual(ranges.ranges_overlap(i2, i1), expected) check_overlaps('chr1', 0, 3, 'chr1', 4, 10, False) check_overlaps('chr1', 0, 3, 'chr1', 3, 10, False)
def test_sam_contigs(self): reader = sam_reader.SamReader.from_file(reads_path=self.bam, ref_path='', options=self.options) with reader: self.assertEqual([ reference_pb2.ContigInfo( name='chrM', pos_in_fasta=0, n_bases=16571), reference_pb2.ContigInfo( name='chr1', pos_in_fasta=1, n_bases=249250621), reference_pb2.ContigInfo( name='chr2', pos_in_fasta=2, n_bases=243199373), reference_pb2.ContigInfo( name='chr3', pos_in_fasta=3, n_bases=198022430), reference_pb2.ContigInfo( name='chr4', pos_in_fasta=4, n_bases=191154276), reference_pb2.ContigInfo( name='chr5', pos_in_fasta=5, n_bases=180915260), reference_pb2.ContigInfo( name='chr6', pos_in_fasta=6, n_bases=171115067), reference_pb2.ContigInfo( name='chr7', pos_in_fasta=7, n_bases=159138663), reference_pb2.ContigInfo( name='chr8', pos_in_fasta=8, n_bases=146364022), reference_pb2.ContigInfo( name='chr9', pos_in_fasta=9, n_bases=141213431), reference_pb2.ContigInfo( name='chr10', pos_in_fasta=10, n_bases=135534747), reference_pb2.ContigInfo( name='chr11', pos_in_fasta=11, n_bases=135006516), reference_pb2.ContigInfo( name='chr12', pos_in_fasta=12, n_bases=133851895), reference_pb2.ContigInfo( name='chr13', pos_in_fasta=13, n_bases=115169878), reference_pb2.ContigInfo( name='chr14', pos_in_fasta=14, n_bases=107349540), reference_pb2.ContigInfo( name='chr15', pos_in_fasta=15, n_bases=102531392), reference_pb2.ContigInfo( name='chr16', pos_in_fasta=16, n_bases=90354753), reference_pb2.ContigInfo( name='chr17', pos_in_fasta=17, n_bases=81195210), reference_pb2.ContigInfo( name='chr18', pos_in_fasta=18, n_bases=78077248), reference_pb2.ContigInfo( name='chr19', pos_in_fasta=19, n_bases=59128983), reference_pb2.ContigInfo( name='chr20', pos_in_fasta=20, n_bases=63025520), reference_pb2.ContigInfo( name='chr21', pos_in_fasta=21, n_bases=48129895), reference_pb2.ContigInfo( name='chr22', pos_in_fasta=22, n_bases=51304566), reference_pb2.ContigInfo( name='chrX', pos_in_fasta=23, n_bases=155270560), reference_pb2.ContigInfo( name='chrY', pos_in_fasta=24, n_bases=59373566), ], list(reader.header.contigs))
def test_writing_canned_variants(self): """Tests writing all the variants that are 'canned' in our tfrecord file.""" # This file is in the TF record format tfrecord_file = test_utils.genomics_core_testdata( 'test_samples.vcf.golden.tfrecord') writer_options = variants_pb2.VcfWriterOptions() header = variants_pb2.VcfHeader( contigs=[ reference_pb2.ContigInfo(name='chr1', n_bases=248956422), reference_pb2.ContigInfo(name='chr2', n_bases=242193529), reference_pb2.ContigInfo(name='chr3', n_bases=198295559), reference_pb2.ContigInfo(name='chrX', n_bases=156040895) ], sample_names=['NA12878_18_99'], filters=[ variants_pb2.VcfFilterInfo( id='PASS', description='All filters passed'), variants_pb2.VcfFilterInfo(id='LowQual', description=''), variants_pb2.VcfFilterInfo(id='VQSRTrancheINDEL95.00to96.00'), variants_pb2.VcfFilterInfo(id='VQSRTrancheINDEL96.00to97.00'), variants_pb2.VcfFilterInfo(id='VQSRTrancheINDEL97.00to99.00'), variants_pb2.VcfFilterInfo(id='VQSRTrancheINDEL99.00to99.50'), variants_pb2.VcfFilterInfo(id='VQSRTrancheINDEL99.50to99.90'), variants_pb2.VcfFilterInfo(id='VQSRTrancheINDEL99.90to99.95'), variants_pb2.VcfFilterInfo(id='VQSRTrancheINDEL99.95to100.00+'), variants_pb2.VcfFilterInfo(id='VQSRTrancheINDEL99.95to100.00'), variants_pb2.VcfFilterInfo(id='VQSRTrancheSNP99.50to99.60'), variants_pb2.VcfFilterInfo(id='VQSRTrancheSNP99.60to99.80'), variants_pb2.VcfFilterInfo(id='VQSRTrancheSNP99.80to99.90'), variants_pb2.VcfFilterInfo(id='VQSRTrancheSNP99.90to99.95'), variants_pb2.VcfFilterInfo(id='VQSRTrancheSNP99.95to100.00+'), variants_pb2.VcfFilterInfo(id='VQSRTrancheSNP99.95to100.00'), ], infos=[ variants_pb2.VcfInfo( id='END', number='1', type='Integer', description='Stop position of the interval') ], formats=[ variants_pb2.VcfFormatInfo( id='GT', number='1', type='String', description='Genotype'), variants_pb2.VcfFormatInfo( id='GQ', number='1', type='Integer', description='Genotype Quality'), variants_pb2.VcfFormatInfo( id='DP', number='1', type='Integer', description='Read depth of all passing filters reads.'), variants_pb2.VcfFormatInfo( id='MIN_DP', number='1', type='Integer', description='Minimum DP observed within the GVCF block.'), variants_pb2.VcfFormatInfo( id='AD', number='R', type='Integer', description= 'Read depth of all passing filters reads for each allele.'), variants_pb2.VcfFormatInfo( id='VAF', number='A', type='Float', description='Variant allele fractions.'), variants_pb2.VcfFormatInfo( id='PL', number='G', type='Integer', description='Genotype likelihoods, Phred encoded'), ], ) variant_records = list( io_utils.read_tfrecords(tfrecord_file, proto=variants_pb2.Variant)) out_fname = test_utils.test_tmpfile('output.vcf') with vcf_writer.VcfWriter.to_file(out_fname, header, writer_options) as writer: for record in variant_records[:5]: writer.write(record) # Check: are the variants written as expected? # pylint: disable=line-too-long expected_vcf_content = [ '##fileformat=VCFv4.2\n', '##FILTER=<ID=PASS,Description="All filters passed">\n', '##FILTER=<ID=LowQual,Description="">\n', '##FILTER=<ID=VQSRTrancheINDEL95.00to96.00,Description="">\n', '##FILTER=<ID=VQSRTrancheINDEL96.00to97.00,Description="">\n', '##FILTER=<ID=VQSRTrancheINDEL97.00to99.00,Description="">\n', '##FILTER=<ID=VQSRTrancheINDEL99.00to99.50,Description="">\n', '##FILTER=<ID=VQSRTrancheINDEL99.50to99.90,Description="">\n', '##FILTER=<ID=VQSRTrancheINDEL99.90to99.95,Description="">\n', '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00+,Description="">\n', '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00,Description="">\n', '##FILTER=<ID=VQSRTrancheSNP99.50to99.60,Description="">\n', '##FILTER=<ID=VQSRTrancheSNP99.60to99.80,Description="">\n', '##FILTER=<ID=VQSRTrancheSNP99.80to99.90,Description="">\n', '##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="">\n', '##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="">\n', '##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="">\n', '##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of ' 'the interval">\n', '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n', '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">\n', '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth of all ' 'passing filters reads.">\n', '##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP ' 'observed within the GVCF block.">\n', '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth of all ' 'passing filters reads for each allele.">\n', '##FORMAT=<ID=VAF,Number=A,Type=Float,Description=\"Variant allele ' 'fractions.">\n', '##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype ' 'likelihoods, Phred encoded">\n', '##contig=<ID=chr1,length=248956422>\n', '##contig=<ID=chr2,length=242193529>\n', '##contig=<ID=chr3,length=198295559>\n', '##contig=<ID=chrX,length=156040895>\n', '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878_18_99\n', 'chr1\t13613\t.\tT\tA\t39.88\tVQSRTrancheSNP99.90to99.95\t.\tGT:GQ:DP:AD:PL\t0/1:16:4:1,3:68,0,16\n', 'chr1\t13813\t.\tT\tG\t90.28\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:9:3:0,3:118,9,0\n', 'chr1\t13838\trs28428499\tC\tT\t62.74\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:6:2:0,2:90,6,0\n', 'chr1\t14397\trs756427959\tCTGT\tC\t37.73\tPASS\t.\tGT:GQ:DP:AD:PL\t0/1:75:5:3,2:75,0,152\n', 'chr1\t14522\t.\tG\tA\t49.77\tVQSRTrancheSNP99.60to99.80\t.\tGT:GQ:DP:AD:PL\t0/1:78:10:6,4:78,0,118\n' ] # pylint: enable=line-too-long with gfile.GFile(out_fname, 'r') as f: self.assertEqual(f.readlines(), expected_vcf_content)
def write_test_protos(self, filename): protos = [reference_pb2.ContigInfo(name=str(i)) for i in range(10)] path = test_utils.test_tmpfile(filename) io.write_tfrecords(protos, path) return protos, path
def setUp(self): self.proto1 = reference_pb2.ContigInfo( name='p1', n_bases=10, pos_in_fasta=0) self.proto2 = reference_pb2.ContigInfo( name='p2', n_bases=20, pos_in_fasta=1) self.protos = [self.proto1, self.proto2]