def test_wrap(self, fasta_filename):
  """Exercises the FAI-backed GenomeReference wrapper on a test FASTA."""
  expected_names = ['chrM', 'chr1', 'chr2']
  expected_lengths = [100, 76, 121]
  fasta_path = test_utils.genomics_core_testdata(fasta_filename)
  fai_path = test_utils.genomics_core_testdata(fasta_filename + '.fai')

  with reference_fai.GenomeReferenceFai.from_file(fasta_path, fai_path) as ref:
    # Top-level metadata of the reference.
    self.assertEqual(ref.n_contigs, 3)
    self.assertIn(fasta_path, ref.fasta_path)
    self.assertIn('GenomeReference backed by htslib FAI index', str(ref))
    self.assertEqual(ref.contig_names, expected_names)
    self.assertEqual(ref.n_bp, sum(expected_lengths))

    # Base lookup and interval validation.
    self.assertEqual(
        ref.bases(ranges.make_range('chrM', 1, 10)), 'ATCACAGGT')
    in_bounds = ranges.make_range('chrM', 1, 10)
    self.assertTrue(ref.is_valid_interval(in_bounds))
    out_of_bounds = ranges.make_range('chrM', 1, 100000)
    self.assertFalse(ref.is_valid_interval(out_of_bounds))

    # Per-contig accessors.
    self.assertEqual(len(ref.contigs), 3)
    self.assertEqual([c.name for c in ref.contigs], expected_names)
    self.assertEqual([c.n_bases for c in ref.contigs], expected_lengths)
    for contig in ref.contigs:
      self.assertEqual(ref.contig(contig.name), contig)
      self.assertTrue(ref.has_contig(contig.name))
      self.assertFalse(ref.has_contig(contig.name + '.unknown'))
def test_detector_ranges(self):
  """Checks point-overlap queries against a small two-contig RangeSet."""
  range_set = ranges.RangeSet([
      ranges.make_range('chr1', 0, 5),
      ranges.make_range('chr1', 8, 10),
      ranges.make_range('chr1', 12, 13),
      ranges.make_range('chr2', 2, 5),
  ])
  self.assertEqual(bool(range_set), True)
  self.assertEqual(len(range_set), 4)

  # (contig, position, expected overlaps() result), in query order.
  expectations = [
      ('chr1', 0, True),
      ('chr1', 1, True),
      ('chr1', 2, True),
      ('chr1', 3, True),
      ('chr1', 4, True),
      ('chr1', 5, False),
      ('chr1', 6, False),
      ('chr1', 7, False),
      ('chr1', 8, True),
      ('chr1', 9, True),
      ('chr1', 10, False),
      ('chr1', 11, False),
      ('chr1', 12, True),
      ('chr1', 13, False),
      ('chr1', 100, False),
      ('chr1', 1000, False),
      ('chr2', 0, False),
      ('chr2', 1, False),
      ('chr2', 2, True),
      ('chr2', 3, True),
      ('chr2', 4, True),
      ('chr2', 5, False),
      ('chr2', 6, False),
      ('chr3', 3, False),
  ]
  for contig, position, expected in expectations:
    self.assertEqual(range_set.overlaps(contig, position), expected)
def test_envelops(self):
  """Checks RangeSet.envelops() against two ranges on chr1."""
  first_start, first_end = 5, 10
  second_start = first_end + 1
  second_end = first_end + 5
  range_set = ranges.RangeSet([
      ranges.make_range('chr1', first_start, first_end),
      ranges.make_range('chr1', second_start, second_end),
  ])

  # A query that starts before the first range begins is never enveloped.
  for pos in range(first_start):
    self.assertFalse(range_set.envelops('chr1', pos, first_start + 1))

  # Every interval lying within the first range is enveloped.
  for lo in range(first_start, first_end):
    for hi in range(lo, first_end + 1):
      message = 'chr1 {} {} not enveloped'.format(lo, hi)
      self.assertTrue(range_set.envelops('chr1', lo, hi), message)

  # An interval starting in the first range and ending in the second is not
  # enveloped.
  for lo in range(first_start, first_end):
    for hi in range(second_start, second_end + 1):
      self.assertFalse(range_set.envelops('chr1', lo, hi))

  # Nothing on a different chromosome is enveloped.
  self.assertFalse(range_set.envelops('chr2', first_start, first_start + 1))
def test_read_range(self, update_cached_read_end_first):
  """Tests reads have their ranges calculated correctly."""
  start = 10000001
  # (cigar, expected distance from start to the end of the read's range).
  cases = [
      ('2M1I3M', 5),
      ('2M16D3M', 5 + 16),
  ]
  for cigar, expected_span in cases:
    read = test_utils.make_read(
        'AAACAG',
        chrom='chrX',
        start=start,
        cigar=cigar,
        quals=range(10, 16),
        name='read1')
    if update_cached_read_end_first:
      # Explicitly update cached_end.
      read.cached_end = utils.read_end(read, use_cached_read_end=False)
    self.assertEqual(
        ranges.make_range('chrX', start, start + expected_span),
        utils.read_range(read))
def test_from_regions_not_empty(self):
  """Checks RangeSet.from_regions() for a bare contig and a sub-range literal."""
  literals = ['chr1', 'chr2:10-20']
  expected = [
      ranges.make_range('chr1', 0, 10),
      ranges.make_range('chr2', 9, 20),
  ]
  # assertCountEqual is the Python 3 name of the order-insensitive comparison;
  # assertItemsEqual exists only in Python 2's unittest.
  self.assertCountEqual(
      expected,
      ranges.RangeSet.from_regions(
          literals, ranges.contigs_dict(_TEST_CONTIGS)))
def test_parse_literal_one_bp(self):
  """Checks parse_literal() on literals that name a single position."""
  # (literal, expected start, expected end) on contig '1'.
  cases = [
      ('1:10', 9, 10),
      ('1:100', 99, 100),
      ('1:1,000', 999, 1000),
  ]
  for literal, start, end in cases:
    self.assertEqual(
        ranges.parse_literal(literal), ranges.make_range('1', start, end))
def test_from_bed(self, bed_filename):
  """Checks the ranges RangeSet.from_bed() produces for a test BED file."""
  bed_path = test_utils.genomics_core_testdata(bed_filename)
  expected_intervals = [
      ('chr1', 1, 10),
      ('chr2', 20, 30),
      ('chr2', 40, 60),
      ('chr3', 80, 90),
  ]
  expected = [ranges.make_range(*interval) for interval in expected_intervals]
  self.assertCountEqual(expected, ranges.RangeSet.from_bed(bed_path))
def test_partitions(self, interval_size, expected):
  """Checks partition() output, in order, for three whole-contig ranges.

  Args:
    interval_size: value passed to RangeSet.partition().
    expected: iterable of argument tuples for ranges.make_range() describing
      the expected partitions in order.
  """
  contig_lengths = [('chrM', 100), ('chr1', 76), ('chr2', 121)]
  rangeset = ranges.RangeSet(
      [ranges.make_range(name, 0, length) for name, length in contig_lengths])
  expected_ranges = [ranges.make_range(*args) for args in expected]
  actual_ranges = list(rangeset.partition(interval_size))
  self.assertEqual(expected_ranges, actual_ranges)
def test_partition_of_multiple_intervals(self, interval_size, expected):
  """Checks partition() output, ignoring order, for three ranges on contig '1'.

  Args:
    interval_size: value passed to RangeSet.partition().
    expected: iterable of argument tuples for ranges.make_range() describing
      the expected partitions.
  """
  intervals = [(0, 10), (20, 40), (45, 50)]
  rangeset = ranges.RangeSet(
      [ranges.make_range('1', start, end) for start, end in intervals])
  expected_ranges = [ranges.make_range(*args) for args in expected]
  self.assertCountEqual(expected_ranges, rangeset.partition(interval_size))
def test_query_edge_cases(self):
  """Checks queries at the boundaries of a four-base in-memory contig."""
  reader = fasta.InMemoryFastaReader([('1', 0, 'ACGT')])
  # (start, end, expected bases): the first base, the last base, and the
  # entire sequence.
  cases = [
      (0, 1, 'A'),
      (3, 4, 'T'),
      (0, 4, 'ACGT'),
  ]
  for start, end, expected_bases in cases:
    region = ranges.make_range('1', start, end)
    self.assertEqual(reader.query(region), expected_bases)
def test_from_contigs(self):
  """Checks that from_contigs() yields one 0-to-n_bases range per contig."""
  contig_sizes = [('chr1', 10), ('chr2', 5)]
  contigs = [
      reference_pb2.ContigInfo(name=name, n_bases=n_bases)
      for name, n_bases in contig_sizes
  ]
  expected = [
      ranges.make_range(name, 0, n_bases) for name, n_bases in contig_sizes
  ]
  self.assertCountEqual(expected, ranges.RangeSet.from_contigs(contigs))
def test_find_max_overlapping_returns_least_index(self):
  """Checks that index 0 is returned for either ordering of two candidates."""
  query_range = ranges.make_range('1', 0, 10)
  forward = [
      ranges.make_range('1', 0, 5),
      ranges.make_range('1', 5, 10),
  ]
  backward = forward[::-1]
  for candidates in (forward, backward):
    best_index = ranges.find_max_overlapping(query_range, candidates)
    self.assertEqual(0, best_index)
def test_partitions_bad_interval_size_raises(self):
  """Checks that partition() raises ValueError for sizes of -10 and 0."""
  for bad_size in (-10, 0):
    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias, which
    # was removed from unittest in Python 3.12.
    with self.assertRaisesRegex(ValueError, 'max_size'):
      # list() is necessary to force the generator to execute.
      list(
          ranges.RangeSet([ranges.make_range('chrM', 0, 100)
                          ]).partition(bad_size))
def test_from_regions_not_empty(self):
  """Checks RangeSet.from_regions() using an explicit contig map."""
  literals = ['chr1', 'chr2:10-20']
  contig_map = {
      'chr1': reference_pb2.ContigInfo(name='chr1', n_bases=10),
      'chr2': reference_pb2.ContigInfo(name='chr2', n_bases=100),
  }
  expected = [
      ranges.make_range('chr1', 0, 10),
      ranges.make_range('chr2', 9, 20),
  ]
  # assertCountEqual is the Python 3 name of the order-insensitive comparison;
  # assertItemsEqual exists only in Python 2's unittest.
  self.assertCountEqual(
      expected, ranges.RangeSet.from_regions(literals, contig_map))
def test_bed_parser(self):
  """Checks bed_parser() on a three-record BED file written to a temp path."""
  bed_lines = [
      'chr20\t61724611\t61725646',
      'chr20\t61304163\t61305182',
      'chr20\t61286467\t61286789',
  ]
  bed_path = test_utils.test_tmpfile(
      'test_bed_parser.bed', '\n'.join(bed_lines))
  parsed = list(ranges.bed_parser(bed_path))
  # Records are expected back in file order.
  expected = [
      ranges.make_range('chr20', 61724611, 61725646),
      ranges.make_range('chr20', 61304163, 61305182),
      ranges.make_range('chr20', 61286467, 61286789),
  ]
  self.assertEqual(parsed, expected)
def test_bedpe_parser_skips_cross_chr_events(self):
  """Checks parse_lines(..., 'bedpe') output when one record spans two contigs."""
  # pylint: disable=line-too-long
  bedpe_lines = [
      'chr20\t25763416\t25765517\tchr21\t25825181\t25826882\tP2_PM_20_1549\t63266\t+\tTYPE:DELETION',
      'chr20\t25972820\t25972991\tchr20\t26045347\t26045538\tP2_PM_20_696\t72548\t+\tTYPE:DELETION',
      'chr20\t23719873\t23721974\tchr20\t23794822\t23796523\tP2_PM_20_1548\t76450\t+\tTYPE:DELETION',
  ]
  parsed = list(ranges.parse_lines(bedpe_lines, 'bedpe'))
  # The first record pairs chr20 with chr21 and has no expected range. Each
  # remaining range runs from the first interval's start to the second
  # interval's end.
  expected = [
      ranges.make_range('chr20', 25972820, 26045538),
      ranges.make_range('chr20', 23719873, 23796523),
  ]
  self.assertEqual(parsed, expected)
def test_bed_parser(self):
  """Checks parse_lines(..., 'bed') on three in-memory BED records."""
  bed_lines = [
      'chr20\t61724611\t61725646',
      'chr20\t61304163\t61305182',
      'chr20\t61286467\t61286789',
  ]
  parsed = list(ranges.parse_lines(bed_lines, 'bed'))
  # Records are expected back in input order.
  expected = [
      ranges.make_range('chr20', 61724611, 61725646),
      ranges.make_range('chr20', 61304163, 61305182),
      ranges.make_range('chr20', 61286467, 61286789),
  ]
  self.assertEqual(parsed, expected)
def test_dispatching_reader(self):
  """Checks FastaReader.query() on an indexed and an unindexed test FASTA."""
  indexed_path = test_utils.genomics_core_testdata('test.fasta')
  with fasta.FastaReader(indexed_path) as reader:
    # The reader is an instance of IndexedFastaReader which supports query().
    bases = reader.query(ranges.make_range('chrM', 1, 6))
    self.assertEqual(bases, 'ATCAC')

  unindexed_path = test_utils.genomics_core_testdata('unindexed.fasta')
  with fasta.FastaReader(unindexed_path) as reader:
    # The reader is an instance of UnindexedFastaReader which doesn't support
    # query().
    with self.assertRaises(NotImplementedError):
      reader.query(ranges.make_range('chrM', 1, 5))
def setUp(self):
  """Creates a GFF writer on a temp file and loads the expected fixtures.

  Sets self.writer, self.expected_gff_content (the lines of the
  'test_features.gff' test data file), self.header and self.record.
  """
  out_fname = test_utils.test_tmpfile('output.gff')
  self.writer = gff_writer.GffWriter.to_file(
      out_fname, gff_pb2.GffHeader(), gff_pb2.GffWriterOptions())
  # Use a context manager so the fixture's file handle is closed promptly
  # rather than left to the garbage collector.
  expected_path = test_utils.genomics_core_testdata('test_features.gff')
  with open(expected_path) as expected_file:
    self.expected_gff_content = expected_file.readlines()
  self.header = gff_pb2.GffHeader(
      sequence_regions=[ranges.make_range('ctg123', 0, 1497228)])
  self.record = gff_pb2.GffRecord(
      range=ranges.make_range('ctg123', 1000, 1100))
def test_expand_raises_with_missing_contig_in_map(self):
  """Checks that expand() raises KeyError when contig '1' is not in the map."""
  contig_maps = [
      # Empty contig_map should raise.
      {},
      # Missing '1' from the contig map should raise.
      {
          '2': reference_pb2.ContigInfo(name='2', n_bases=50),
      },
  ]
  for contig_map in contig_maps:
    with self.assertRaises(KeyError):
      ranges.expand(ranges.make_range('1', 10, 20), 1, contig_map=contig_map)
def test_bedpe_parser_skips_cross_chr_events(self):
  """Checks bedpe_parser() output when one record in the file spans two contigs."""
  # pylint: disable=line-too-long
  bedpe_lines = [
      'chr20\t25763416\t25765517\tchr21\t25825181\t25826882\tP2_PM_20_1549\t63266\t+\tTYPE:DELETION',
      'chr20\t25972820\t25972991\tchr20\t26045347\t26045538\tP2_PM_20_696\t72548\t+\tTYPE:DELETION',
      'chr20\t23719873\t23721974\tchr20\t23794822\t23796523\tP2_PM_20_1548\t76450\t+\tTYPE:DELETION',
  ]
  bedpe_path = test_utils.test_tmpfile(
      'test_bedpe_parser2.bedpe', '\n'.join(bedpe_lines))
  parsed = list(ranges.bedpe_parser(bedpe_path))
  # The first record pairs chr20 with chr21 and has no expected range. Each
  # remaining range runs from the first interval's start to the second
  # interval's end.
  expected = [
      ranges.make_range('chr20', 25972820, 26045538),
      ranges.make_range('chr20', 23719873, 23796523),
  ]
  self.assertEqual(parsed, expected)
def test_find_max_overlapping_allows_unordered_search_ranges(self):
  """Checks that the same range wins under every ordering of the candidates."""
  query_range = ranges.make_range('1', 4, 12)
  expected_winner = ranges.make_range('1', 0, 10)
  search_ranges = [
      expected_winner,
      ranges.make_range('1', 10, 20),
      ranges.make_range('1', 12, 20),
  ]
  for ordering in itertools.permutations(search_ranges):
    # The returned index must point at the winner wherever it sits.
    expected_index = ordering.index(expected_winner)
    self.assertEqual(
        expected_index, ranges.find_max_overlapping(query_range, ordering))
def test_variant_position_and_range(self):
  """Checks variant_position/variant_range/variant_range_tuple on two variants."""
  single_base = test_utils.make_variant(
      chrom='1', alleles=['A', 'C'], start=10)
  multi_base = test_utils.make_variant(
      chrom='1', alleles=['AGCT', 'C'], start=10)
  one_bp = ranges.make_range('1', 10, 11)
  four_bp = ranges.make_range('1', 10, 14)
  single_base_tuple = ('1', 10, 11)
  multi_base_tuple = ('1', 10, 14)

  # Both variants are expected to report the same one-base position.
  self.assertEqual(one_bp, variant_utils.variant_position(single_base))
  self.assertEqual(one_bp, variant_utils.variant_position(multi_base))
  # The expected range differs between the two variants.
  self.assertEqual(one_bp, variant_utils.variant_range(single_base))
  self.assertEqual(four_bp, variant_utils.variant_range(multi_base))
  self.assertEqual(
      single_base_tuple, variant_utils.variant_range_tuple(single_base))
  self.assertEqual(
      multi_base_tuple, variant_utils.variant_range_tuple(multi_base))
def test_good_query(self):
  """Checks the in-memory reader against the file reader over many regions."""
  for contig in self.fasta_reader.header.contigs:
    n_bases = contig.n_bases
    for start in range(n_bases):
      for end in range(start, n_bases):
        region = ranges.make_range(contig.name, start, end)
        from_memory = self.in_mem.query(region)
        from_file = self.fasta_reader.query(region)
        self.assertEqual(from_memory, from_file)
def __init__(self, chromosomes):
  """Initializes an InMemoryFastaReader using data from chromosomes.

  Args:
    chromosomes: list[tuple]. The chromosomes we are caching in memory as a
      list of tuples. Each tuple must be exactly three elements in length,
      containing (chromosome name [str], start [int], bases [str]).

  Raises:
    ValueError: If any of the chromosomes tuples are invalid, i.e. start is
      negative or bases is empty.
  """
  super(InMemoryFastaReader, self).__init__()

  ref_seqs = []
  contigs = []
  for i, (contig_name, start, bases) in enumerate(chromosomes):
    if start < 0:
      # The message previously omitted the bound ("must be >= for ...").
      raise ValueError('start={} must be >= 0 for chromosome={}'.format(
          start, contig_name))
    if not bases:
      raise ValueError(
          'Bases must contain at least one base, but got "{}"'.format(bases))

    end = start + len(bases)
    ref_seqs.append(reference_pb2.ReferenceSequence(
        region=ranges.make_range(contig_name, start, end), bases=bases))
    # n_bases is set to `end`, not len(bases), so a contig whose cached bases
    # begin at a non-zero start is still recorded with length `end`.
    contigs.append(
        reference_pb2.ContigInfo(
            name=contig_name, n_bases=end, pos_in_fasta=i))

  self._reader = in_memory_fasta_reader.InMemoryFastaReader.create(
      contigs, ref_seqs)
  self.header = RefFastaHeader(contigs=self._reader.contigs)
def test_overlaps_variant_with_ranges(self):
  """Checks that variant_overlaps() calls overlaps() with the variant's start."""
  variant = variants_pb2.Variant(reference_name='chr2', start=10, end=11)
  range_set = ranges.RangeSet([ranges.make_range('chr1', 0, 5)])
  # Stub out overlaps() so only the delegation is under test: the stub always
  # answers True and records the arguments it was called with.
  with mock.patch.object(
      range_set, 'overlaps', return_value=True) as mock_overlaps:
    result = range_set.variant_overlaps(variant)
    self.assertEqual(result, True)
    mock_overlaps.assert_called_once_with('chr2', 10)
def setUp(self):
  """Loads the GffRecord protos from the test TFRecord and builds a header.

  Sets self.records and self.header.
  """
  tfrecord_path = test_utils.genomics_core_testdata(
      'test_features.gff.tfrecord')
  record_iter = io_utils.read_tfrecords(
      tfrecord_path, proto=gff_pb2.GffRecord)
  self.records = list(record_iter)
  sequence_region = ranges.make_range('ctg123', 0, 1497228)
  self.header = gff_pb2.GffHeader(sequence_regions=[sequence_region])
def make_ngs_error_examples(ref_path, vcf_path, bam_path):
  """Yields tf.Examples for training an ML model.

  Each tf.Example contains relevant features about the NGS read.

  Args:
    ref_path: str. A path to an indexed fasta file.
    vcf_path: str. A path to an indexed VCF file.
    bam_path: str. A path to a SAM/BAM file.

  Yields:
    A tuple (example, ngs_read_length, has_error), where example is a
    tf.Example, ngs_read_length is the length of the read generated by the
    sequencer, and has_error is a boolean specifying whether the example
    contains a read error.
  """
  # All our readers are context managers. Each reader is constructed inside
  # the `with` block of the previous one, so a reader that is already open is
  # still closed if constructing a later reader raises.

  # Create a ref_reader backed by ref.
  ref_reader = fasta.IndexedFastaReader(ref_path)
  with ref_reader:
    # Create a vcf_reader backed by vcf.
    vcf_reader = vcf.VcfReader(vcf_path)
    with vcf_reader:
      # Create a sam_reader backed by bam. Provide an empty ReadRequirements
      # proto to the reader so it enables standard filtering based on the
      # default values of ReadRequirements. Only iterate() is used below.
      read_requirements = reads_pb2.ReadRequirements()
      sam_reader = sam.SamReader(
          bam_path, read_requirements=read_requirements)
      with sam_reader:
        # Loop over the reads in our BAM file:
        for read in sam_reader.iterate():
          # Get the Range proto describing the chrom/start/stop spanned by
          # our read.
          assert len(read.alignment.cigar) > 0
          first_cigar = read.alignment.cigar[0]
          # If the first cigar is a CLIP_SOFT, the start of sequence is the
          # cigar operation length before the alignment position.
          start = read.alignment.position.position
          if first_cigar.operation == cigar_pb2.CigarUnit.CLIP_SOFT:
            start -= first_cigar.operation_length
          read_range = ranges.make_range(
              read.alignment.position.reference_name, start,
              start + len(read.aligned_sequence))

          # Get all of the variants that overlap our read range.
          variants = list(vcf_reader.query(read_range))

          # Get the reference bases spanned by our read.
          ref_bases = ref_reader.query(read_range)

          # Check that we can use our read for generating an example.
          if is_usable_training_example(read, variants, ref_bases):
            # Convert read and ref_bases to a tf.Example with make_example.
            yield make_example(read, ref_bases), len(read.aligned_sequence), (
                read.aligned_sequence != ref_bases)
def check_overlaps(chr1, start1, end1, chr2, start2, end2, expected):
  """Asserts that a read and a region overlap (or not) as expected.

  The read is built on chr1 at start1 with (end1 - start1) matched bases; the
  region is chr2:start2-end2. Both utils.read_overlaps_region() and
  ranges.ranges_overlap() must return `expected`.
  """
  read_length = end1 - start1
  cigar = '{}M'.format(read_length)
  read = test_utils.make_read(
      'A' * read_length, chrom=chr1, start=start1, cigar=cigar)
  region = ranges.make_range(chr2, start2, end2)

  overlaps_region = utils.read_overlaps_region(read, region)
  self.assertEqual(overlaps_region, expected)

  # This check ensures we get the same result calling ranges.ranges_overlap.
  overlaps_range = ranges.ranges_overlap(region, utils.read_range(read))
  self.assertEqual(overlaps_range, expected)
def setUpClass(cls):
  """Opens the test FASTA and mirrors every contig into an in-memory reader.

  Sets cls.fasta_reader and cls.in_mem.
  """
  cls.fasta_reader = fasta.RefFastaReader(
      test_utils.genomics_core_testdata('test.fasta'))

  # One (name, start, bases) tuple per contig, holding the contig's full
  # sequence as returned by the file-backed reader.
  chromosomes = []
  for contig in cls.fasta_reader.header.contigs:
    whole_contig = ranges.make_range(contig.name, 0, contig.n_bases)
    chromosomes.append(
        (contig.name, 0, cls.fasta_reader.query(whole_contig)))
  cls.in_mem = fasta.InMemoryRefReader(chromosomes)