def test_adding_reads(self):
    """Check that add_read() updates both `reads` and `read_span`."""
    assembled = _test_assembled_region('chr1:3-15')

    # Nothing has been added yet: no reads, and therefore no read span.
    self.assertEqual(assembled.reads, [])
    self.assertIsNone(assembled.read_span)

    # Adding read2 gives the region its first read and a real read span.
    second_read = self.get_reads_by_name(['read2'])[0]
    seen_so_far = [second_read]
    assembled.add_read(second_read)
    self.assertEqual(assembled.reads, seen_so_far)
    self.assertEqual(assembled.read_span, ranges.parse_literal('chr1:7-9'))

    # Adding read1 extends the span to the left.
    first_read = self.get_reads_by_name(['read1'])[0]
    seen_so_far.append(first_read)
    assembled.add_read(first_read)
    self.assertEqual(assembled.reads, seen_so_far)
    self.assertEqual(assembled.read_span, ranges.parse_literal('chr1:2-9'))

    # Finally add the remaining reads one at a time.
    remaining = self.get_reads_by_name(['read3', 'read4', 'read5'])
    seen_so_far.extend(remaining)
    for extra_read in remaining:
        assembled.add_read(extra_read)
    self.assertEqual(assembled.reads, seen_so_far)
    self.assertEqual(assembled.read_span, ranges.parse_literal('chr1:2-31'))
def test_realigner_example_variant(self, region_literal, variant_literal):
    """All overlapping reads should include 10bp deletion at chr20:10046178."""
    region = ranges.parse_literal(region_literal)
    variant = ranges.parse_literal(variant_literal)
    _, realigned_reads = self.reads_realigner.realign_reads(
        _get_reads(region), region)

    for read in realigned_reads:
        self.assertTrue(read.HasField('alignment'))
        self.assertEqual(variant.reference_name,
                         read.alignment.position.reference_name)

        # Walk the CIGAR, tracking the reference position reached so far and
        # whether a deletion covering exactly the variant span was seen.
        saw_deletion = False
        ref_cursor = read.alignment.position.position
        for cigar_unit in read.alignment.cigar:
            self.assertIn(cigar_unit.operation, utils.CIGAR_OPS)
            if cigar_unit.operation in utils.CIGAR_ALIGN_OPS:
                ref_cursor += cigar_unit.operation_length
            elif cigar_unit.operation in utils.CIGAR_DELETE_OPS:
                starts_at_variant = ref_cursor == variant.start
                if (starts_at_variant and
                        cigar_unit.operation_length == variant.end - ref_cursor):
                    saw_deletion = True
                ref_cursor += cigar_unit.operation_length

        # Only reads whose alignment spans the whole variant must carry it.
        begins_before = read.alignment.position.position <= variant.start
        if begins_before and ref_cursor >= variant.end:
            self.assertTrue(saw_deletion)
def test_parse_literal_one_bp(self):
    """A 'contig:pos' literal parses to the range [pos - 1, pos)."""
    # (literal, expected start, expected end)
    cases = (
        ('1:10', 9, 10),
        ('1:100', 99, 100),
        ('1:1,000', 999, 1000),
    )
    for literal, start, end in cases:
        self.assertEqual(
            ranges.parse_literal(literal), ranges.make_range('1', start, end))
def test_query_raises_with_bad_range(self):
    """query() raises ValueError for an unknown contig or an invalid interval."""
    # (region literal, regex the ValueError message must match)
    bad_queries = (
        ('XXX:1-10', 'Unknown reference_name'),
        ('chr20:10-5', 'unknown reference interval'),
    )
    with sam_reader.SamReader.from_file(self.bam,
                                        self.indexed_options) as reader:
        for literal, message_regex in bad_queries:
            with self.assertRaisesRegexp(ValueError, message_regex):
                reader.query(ranges.parse_literal(literal))
def test_parse_literal_with_contig_map_and_bad_input_raises_exception(
        self, bad_literal):
    """parse_literal() raises ValueError for `bad_literal` given a contig map."""
    # The map knows a single contig, chr1, with n_bases=10.
    single_contig_map = {
        'chr1': core_pb2.ContigInfo(name='chr1', n_bases=10),
    }
    with self.assertRaises(ValueError):
        ranges.parse_literal(bad_literal, contig_map=single_contig_map)
def test_sam_query(self):
    """Querying test.bam yields the expected number of items per interval."""
    # (region literal, expected iterable length)
    literal_counts = (
        ('chr20:10,000,000-10,000,100', 106),
        ('chr20:10,000,000-10,000,000', 45),
    )
    bam_path = test_utils.genomics_core_testdata('test.bam')
    reader = genomics_io.make_sam_reader(bam_path)
    expected = [(ranges.parse_literal(literal), count)
                for literal, count in literal_counts]

    with reader:
        for interval, n_expected in expected:
            with reader.query(interval) as hits:
                self.assertEqual(test_utils.iterable_len(hits), n_expected)
def test_query_raises_with_bad_range(self):
    """query() on the samples reader raises ValueError for bad regions."""
    # (region literal, regex the ValueError message must match)
    bad_queries = (
        ('XXX:1-10', 'Unknown reference_name'),
        ('chr1:0-5', 'Malformed region'),
        ('chr1:6-5', 'Malformed region'),
        ('chr1:10-5', 'Malformed region'),
    )
    for literal, message_regex in bad_queries:
        with self.assertRaisesRegexp(ValueError, message_regex):
            self.samples_reader.query(ranges.parse_literal(literal))
def test_bam_query(self):
    """query() returns a wrapped C++ iterable of the expected length."""
    # (region literal, expected iterable length)
    literal_counts = (
        ('chr20:10,000,000-10,000,100', 106),
        ('chr20:10,000,000-10,000,000', 45),
    )
    reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
    expected = [(ranges.parse_literal(literal), count)
                for literal, count in literal_counts]

    with reader:
        for interval, n_expected in expected:
            with reader.query(interval) as hits:
                self.assertIsInstance(hits, clif_postproc.WrappedCppIterable)
                self.assertEqual(test_utils.iterable_len(hits), n_expected)
def test_realigner_example_region(self, region_literal,
                                  expected_window_literal,
                                  expected_haplotypes, comment):
    """Realigning a region yields the expected first window and haplotypes.

    `comment` is passed as the failure message of the window assertions.
    """
    region = ranges.parse_literal(region_literal)
    reads = _get_reads(region)
    windows_haplotypes, realigned_reads = (
        self.reads_realigner.realign_reads(reads, region))

    # Realignment must not drop or add reads.
    self.assertEqual(len(reads), len(realigned_reads))

    # Only the first reported window is checked.
    self.assertEqual(
        ranges.parse_literal(expected_window_literal),
        windows_haplotypes[0].span,
        comment)
    self.assertEqual(
        expected_haplotypes,
        set(windows_haplotypes[0].haplotypes),
        comment)
def test_parse_literal_with_contig_map(self, contig_name, expected):
    """parse_literal() resolves `contig_name` to `expected` via a contig map."""
    # Contig name -> n_bases for the two contigs the map knows about.
    contig_sizes = (('chr1', 10), ('chr2', 5))
    contig_map = {
        name: core_pb2.ContigInfo(name=name, n_bases=n_bases)
        for name, n_bases in contig_sizes
    }
    parsed = ranges.parse_literal(contig_name, contig_map=contig_map)
    self.assertEqual(parsed, expected)
def test_ops_on_closed_reader_raise(self):
    """iterate() and query() raise ValueError once the reader was closed."""
    # Entering and leaving the context closes the reader.
    with self.samples_reader:
        pass

    # (regex the ValueError message must match, operation to attempt)
    closed_reader_ops = (
        ('Cannot Iterate a closed',
         lambda: self.samples_reader.iterate()),
        ('Cannot Query a closed',
         lambda: self.samples_reader.query(
             ranges.parse_literal('chr1:10,000,000-10,000,100'))),
    )
    for message_regex, attempt in closed_reader_ops:
        with self.assertRaisesRegexp(ValueError, message_regex):
            attempt()
def test_ops_on_closed_reader_raise(self):
    """iterate() and query() raise ValueError once the SamReader was closed."""
    reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
    # Entering and leaving the context closes the reader.
    with reader:
        pass

    # (regex the ValueError message must match, operation to attempt)
    closed_reader_ops = (
        ('Cannot Iterate a closed',
         lambda: reader.iterate()),
        ('Cannot Query a closed',
         lambda: reader.query(
             ranges.parse_literal('chr20:10,000,000-10,000,100'))),
    )
    for message_regex, attempt in closed_reader_ops:
        with self.assertRaisesRegexp(ValueError, message_regex):
            attempt()
def test_straightforward_region(self):
    """The reference is the only candidate haplotype for a simple region.

    Loads the reference bases and the reads overlapping
    chr20:10,000,000-10,000,100, builds a de Bruijn graph with the options
    from `self.single_k_dbg_options(30)`, and checks that a graph is produced
    whose candidate haplotypes are exactly `[ref_seq]`.
    """
    ref_reader = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
    bam_reader = genomics_io.make_sam_reader(test_utils.CHR20_BAM)
    region = ranges.parse_literal('chr20:10,000,000-10,000,100')
    ref_seq = ref_reader.bases(region)

    # Materialize the reads inside the reader's context so the BAM reader is
    # closed once they are loaded instead of being left open.
    with bam_reader:
        all_reads = list(bam_reader.query(region))

    dbg30 = debruijn_graph.build(ref_seq, all_reads,
                                 self.single_k_dbg_options(30))
    self.assertIsNotNone(dbg30)
    self.assertEqual([ref_seq], dbg30.candidate_haplotypes())
def test_downsampling(self, method, maybe_range, fraction, expected_n_reads):
    """A downsampling reader yields `expected_n_reads` reads.

    Args:
      method: 'iterate' to read everything, or 'query' to read `maybe_range`.
      maybe_range: Region literal to query; only used when method is 'query'.
      fraction: Value passed to the reader as `downsample_fraction`.
      expected_n_reads: Number of reads the reader is expected to produce.
    """
    reader = genomics_io.make_sam_reader(
        test_utils.genomics_core_testdata('test.bam'),
        downsample_fraction=fraction,
        # Fixed seed so the downsampled read count is reproducible.
        random_seed=12345)
    with reader:
        if method == 'iterate':
            reads_iter = reader.iterate()
        elif method == 'query':
            reads_iter = reader.query(ranges.parse_literal(maybe_range))
        else:
            # TestCase.fail() accepts a single message argument; passing the
            # method as a second positional argument raised TypeError instead
            # of reporting a test failure.
            self.fail('Unexpected method: {}'.format(method))
        self.assertEqual(test_utils.iterable_len(reads_iter), expected_n_reads)
def test_context_manager(self):
    """Test that we can use context manager to do two queries in sequence."""
    region = ranges.parse_literal('chr20:10,000,000-10,000,100')
    reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
    with reader:
        # Run the same query twice, back to back, on the one open reader.
        for _ in range(2):
            with reader.query(region) as query_iterable:
                self.assertIsNotNone(query_iterable)
                self.assertIsInstance(query_iterable,
                                      clif_postproc.WrappedCppIterable)
def test_complex_region(self):
    """A region with a heterozygous deletion yields two candidate haplotypes.

    The graph is built with `self.dbg_options()` and is expected to settle on
    a kmer size of 44, with the reference among its two candidates.
    """
    # There is a heterozygous 9 bp deletion of tandem TGA repeat.
    # "chr20:10,095,379-10,095,500"
    ref_reader = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
    bam_reader = genomics_io.make_sam_reader(test_utils.CHR20_BAM)
    region = ranges.parse_literal('chr20:10,095,379-10,095,500')
    ref_seq = ref_reader.bases(region)

    # Materialize the reads inside the reader's context so the BAM reader is
    # closed once they are loaded instead of being left open.
    with bam_reader:
        reads = list(bam_reader.query(region))

    dbg = debruijn_graph.build(ref_seq, reads, self.dbg_options())
    self.assertIsNotNone(dbg)
    self.assertEqual(44, dbg.kmer_size)
    self.assertEqual(2, len(dbg.candidate_haplotypes()))
    self.assertIn(ref_seq, dbg.candidate_haplotypes())
def setUp(self):
    """Build a TRAINING-mode RegionProcessor plus shared per-test defaults."""
    self.region = ranges.parse_literal('chr20:10,000,000-10,000,100')
    FLAGS.reads = ''

    # Configure the options on a local first, then publish them on self.
    opts = make_examples.default_options(add_flags=False)
    opts.reference_filename = test_utils.CHR20_FASTA
    opts.reads_filename = test_utils.CHR20_BAM
    opts.truth_variants_filename = test_utils.TRUTH_VARIANTS_VCF
    opts.mode = deepvariant_pb2.DeepVariantOptions.TRAINING
    self.options = opts

    self.processor = make_examples.RegionProcessor(opts)
    self.mock_init = self.add_mock('_initialize')
    self.default_shape = [5, 5, 7]
    self.default_format = 'raw'
def test_realigner_doesnt_create_invalid_intervals(self):
    """Tests that read sets don't result in a crash in reference_fai.cc."""
    # 250 base qualities cycling through 30..34. `range(...) * 50` only works
    # on Python 2, where range() returns a list; wrapping it in list() keeps
    # the same values and also works on Python 3.
    quals = list(range(30, 35)) * 50
    region = ranges.parse_literal('chr20:63,025,320-63,025,520')

    read = test_utils.make_read(
        'ACCGT' * 50,
        start=63025520 - 250,
        cigar='250M',
        quals=quals,
        name='read1')
    reads = [read] * 20
    # Passing means the call returns without raising; nothing is asserted.
    self.reads_realigner.realign_reads(reads, region)

    # These reads are aligned off the edge of the contig.
    read = test_utils.make_read(
        'TTATA' * 50,
        start=63025520 - 200,
        cigar='200M50S',
        quals=quals,
        name='read1')
    reads = [read] * 20
    self.reads_realigner.realign_reads(reads, region)
def test_catches_bad_flags(self):
    """main() logs an error and exits when confident_regions is empty."""
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')

    # Set all of the requested flag values. The last entry is the bad flag.
    flag_values = (
        ('ref', test_utils.CHR20_FASTA),
        ('reads', test_utils.CHR20_BAM),
        ('candidates', test_utils.test_tmpfile('vsc.tfrecord')),
        ('examples', test_utils.test_tmpfile('examples.tfrecord')),
        ('regions', [ranges.to_literal(region)]),
        ('partition_size', 1000),
        ('mode', 'training'),
        ('truth_variants', test_utils.TRUTH_VARIANTS_VCF),
        ('confident_regions', ''),
    )
    for flag_name, flag_value in flag_values:
        setattr(FLAGS, flag_name, flag_value)

    # Patch out logging.error and sys.exit so main() returns and both calls
    # can be inspected afterwards.
    with mock.patch.object(logging, 'error') as mock_logging:
        with mock.patch.object(sys, 'exit') as mock_exit:
            make_examples.main(['make_examples.py'])

    mock_logging.assert_called_once_with(
        'confident_regions is required when in training mode.')
    mock_exit.assert_called_once_with(errno.ENOENT)
def test_make_examples_end2end(self, mode, num_shards):
    """Run make_examples on a chr20 region and validate all of its outputs.

    Args:
      mode: Either 'calling' or 'training'.
      num_shards: Shard count used for the output file names; at least one
        task is always run.
    """
    self.assertIn(mode, {'calling', 'training'})
    is_calling = mode == 'calling'
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')

    def sharded_tmpfile(basename):
        # Temporary output path for `basename`, sharded `num_shards` ways.
        return test_utils.test_tmpfile(_sharded(basename, num_shards))

    FLAGS.ref = test_utils.CHR20_FASTA
    FLAGS.reads = test_utils.CHR20_BAM
    FLAGS.candidates = sharded_tmpfile('vsc.tfrecord')
    FLAGS.examples = sharded_tmpfile('examples.tfrecord')
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = mode
    if is_calling:
        FLAGS.gvcf = sharded_tmpfile('gvcf.tfrecord')
    else:
        FLAGS.truth_variants = test_utils.TRUTH_VARIANTS_VCF
        FLAGS.confident_regions = test_utils.CONFIDENT_REGIONS_BED

    # Run one task per shard. The options built for the last task are the
    # ones used by the verification steps below.
    n_tasks = max(num_shards, 1)
    for task_id in range(n_tasks):
        FLAGS.task = task_id
        options = make_examples.default_options(add_flags=True)
        make_examples.make_examples_runner(options)

    # Test that our candidates are reasonable, calling specific helper
    # functions to check lots of properties of the output.
    raw_candidates = io_utils.read_tfrecords(
        FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall)
    candidates = _sort_candidates(raw_candidates)
    self.verify_deepvariant_calls(candidates, options)
    candidate_variants = [call.variant for call in candidates]
    self.verify_variants(candidate_variants, region, options, is_gvcf=False)

    # Verify that the variants in the examples are all good.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=mode == 'training')
    example_variants = [tf_utils.example_variant(ex) for ex in examples]
    self.verify_variants(example_variants, region, options, is_gvcf=False)

    # Verify the integrity of the examples and then check that they match our
    # golden labeled examples. Note we expect the order for both training and
    # calling modes to produce deterministic order because we fix the random
    # seed.
    golden_source = (test_utils.GOLDEN_CALLING_EXAMPLES
                     if is_calling else test_utils.GOLDEN_TRAINING_EXAMPLES)
    golden_file = _sharded(golden_source, num_shards)
    self.assertDeepVariantExamplesEqual(
        examples, list(io_utils.read_tfrecords(golden_file)))

    # The remaining checks only apply to calling mode.
    if not is_calling:
        return

    nist_reader = genomics_io.make_vcf_reader(test_utils.TRUTH_VARIANTS_VCF)
    nist_variants = list(nist_reader.query(region))
    self.verify_nist_concordance(example_variants, nist_variants)

    # Check the quality of our generated gvcf file.
    gvcfs = _sort_variants(
        io_utils.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
    self.verify_variants(gvcfs, region, options, is_gvcf=True)
    self.verify_contiguity(gvcfs, region)
def test_query_on_unindexed_reader_raises(self):
    """query() raises ValueError on a reader opened with `self.options`."""
    unindexed = sam_reader.SamReader.from_file(self.bam, self.options)
    with unindexed as reader:
        # Callable form of the assertion; the lambda keeps the literal parsing
        # inside the guarded call.
        self.assertRaisesRegexp(
            ValueError,
            'Cannot query without an index',
            lambda: reader.query(
                ranges.parse_literal('chr20:10,000,000-10,000,100')))
def test_parse_literal_bad(self, bad_literal):
    """parse_literal() raises ValueError for the malformed `bad_literal`."""
    # Callable form: assertRaises invokes parse_literal(bad_literal) itself.
    self.assertRaises(ValueError, ranges.parse_literal, bad_literal)
def test_parse_literal_numerics(self, literal, start_val, end_val):
    """`literal` parses to the chr1 range [start_val, end_val)."""
    parsed = ranges.parse_literal(literal)
    wanted = ranges.make_range('chr1', start_val, end_val)
    self.assertEqual(parsed, wanted)
def test_parse_literal_chromosomes(self, chrom):
    """'<chrom>:1-20' parses to the range [0, 20) on `chrom`."""
    literal = chrom + ':1-20'
    parsed = ranges.parse_literal(literal)
    wanted = ranges.make_range(chrom, 0, 20)
    self.assertEqual(parsed, wanted)
def setUp(self):
    """Parse QUERY_WINDOW and expose it to the tests as `self.query_window`."""
    parsed_window = ranges.parse_literal(QUERY_WINDOW)
    self.query_window = parsed_window
def test_construction(self):
    """A new assembled region exposes its span, its haplotypes and no reads."""
    region_literal = 'chr1:1-5'
    assembled = _test_assembled_region(region_literal, haplotypes=['A', 'C'])

    self.assertEqual(assembled.region, ranges.parse_literal(region_literal))
    self.assertEqual(assembled.haplotypes, ['A', 'C'])
    self.assertEqual(assembled.reads, [])
def _test_assembled_region(region_str, haplotypes=None):
    """Build an AssemblyRegion for the region literal `region_str`.

    Args:
      region_str: Region literal parsed into the candidate's span.
      haplotypes: Optional haplotypes; a falsy value becomes an empty list.

    Returns:
      A realigner.AssemblyRegion wrapping the CandidateHaplotypes proto.
    """
    span = ranges.parse_literal(region_str)
    candidate = realigner_pb2.CandidateHaplotypes(
        span=span, haplotypes=haplotypes if haplotypes else [])
    return realigner.AssemblyRegion(candidate)
def test_realigner_diagnostics(self, enabled, emit_reads):
    """Diagnostic outputs are written only when diagnostics are enabled.

    Args:
      enabled: Value assigned to `config.diagnostics.enabled`.
      emit_reads: Value assigned to `config.diagnostics.emit_realigned_reads`.
    """
    dx_dir = test_utils.test_tmpfile('dx')
    region_str = 'chr20:10046179-10046188'
    region = ranges.parse_literal(region_str)
    # Window expected in the metrics row and used as the subdirectory name.
    assembled_region_str = 'chr20:10046109-10046257'
    reads = _get_reads(region)

    # Rebuild the realigner with the diagnostics settings under test.
    self.config = realigner.realigner_config(FLAGS)
    self.config.diagnostics.enabled = enabled
    self.config.diagnostics.output_root = dx_dir
    self.config.diagnostics.emit_realigned_reads = emit_reads
    self.reads_realigner = realigner.Realigner(self.config, self.ref_reader)
    _, realigned_reads = self.reads_realigner.realign_reads(reads, region)
    # Force close all resources.
    self.reads_realigner.diagnostic_logger.close()

    if not enabled:
        # Make sure our diagnostic output isn't emitted.
        self.assertFalse(tf.gfile.Exists(dx_dir))
    else:
        # Our root directory exists.
        self.assertTrue(tf.gfile.IsDirectory(dx_dir))

        # We expect a realigner_metrics.csv in our rootdir with 1 entry in it.
        metrics_file = os.path.join(
            dx_dir, self.reads_realigner.diagnostic_logger.metrics_filename)
        self.assertTrue(tf.gfile.Exists(metrics_file))
        with tf.gfile.FastGFile(metrics_file) as fin:
            rows = list(csv.DictReader(fin))
            self.assertEqual(len(rows), 1)
            self.assertEqual(
                set(rows[0].keys()), {'window', 'k', 'n_haplotypes', 'time'})
            self.assertEqual(rows[0]['window'], assembled_region_str)
            self.assertEqual(int(rows[0]['k']), 25)
            # Was assertTrue(value, 2), which used 2 as the failure message
            # and passed for any non-zero count; compare the value instead.
            self.assertEqual(int(rows[0]['n_haplotypes']), 2)
            # Check that our runtime is reasonable (greater than 0, less than
            # 10 s).
            self.assertTrue(0.0 < float(rows[0]['time']) < 10.0)

        # As does the subdirectory for this region.
        region_subdir = os.path.join(dx_dir, assembled_region_str)
        self.assertTrue(tf.gfile.IsDirectory(region_subdir))

        # We always have a graph.dot
        self.assertTrue(
            tf.gfile.Exists(
                os.path.join(
                    region_subdir,
                    self.reads_realigner.diagnostic_logger.graph_filename)))

        # The reads file is looked up under the query region's literal, not
        # under the assembled window.
        reads_file = os.path.join(
            dx_dir, region_str,
            self.reads_realigner.diagnostic_logger.realigned_reads_filename)
        if emit_reads:
            self.assertTrue(tf.gfile.Exists(reads_file))
            reads_from_dx = io_utils.read_tfrecords(reads_file, reads_pb2.Read)
            self.assertCountEqual(reads_from_dx, realigned_reads)
        else:
            self.assertFalse(tf.gfile.Exists(reads_file))
def test_vcf_query(self):
    """Querying chr3:100,000-500,000 on the samples reader yields 4 items."""
    region = ranges.parse_literal('chr3:100,000-500,000')
    n_found = test_utils.iterable_len(self.samples_reader.query(region))
    self.assertEqual(n_found, 4)
def test_query_on_unindexed_reader_raises(self):
    """query() raises ValueError on a reader opened with unindexed options."""
    unindexed = vcf_reader.VcfReader.from_file(self.samples_vcf,
                                               self.unindexed_options)
    with unindexed as reader:
        # Callable form of the assertion; the lambda keeps the literal parsing
        # inside the guarded call.
        self.assertRaisesRegexp(
            ValueError,
            'Cannot query without an index',
            lambda: reader.query(
                ranges.parse_literal('chr1:10,000,000-10,000,100')))