def _initialize(self):
  """Initialize the resources needed for this work in the current env.

  Lazily builds the readers, realigner, pileup-image creator, and (in
  training mode) the labeler/caller used by this object. Must be called at
  most once; a second call raises.

  Raises:
    ValueError: if this object has already been initialized.
  """
  if self.initialized:
    raise ValueError('Cannot initialize this object twice')

  # Reference and alignment readers. The in-memory SAM reader starts empty;
  # presumably it is filled per-region later — TODO confirm against callers.
  self.ref_reader = genomics_io.make_ref_reader(
      self.options.reference_filename)
  self.sam_reader = self._make_sam_reader()
  self.in_memory_sam_reader = utils.InMemorySamReader([])

  # The realigner is optional and needs the reference reader created above.
  if self.options.realigner_enabled:
    self.realigner = realigner.Realigner(
        self.options.realigner_options, self.ref_reader)

  # Pileup images are generated from the in-memory reads, not the file
  #-backed sam_reader.
  self.pic = pileup_image.PileupImageCreator(
      ref_reader=self.ref_reader,
      sam_reader=self.in_memory_sam_reader,
      options=self.options.pic_options)

  # Labeling resources are only needed when producing training examples.
  if in_training_mode(self.options):
    self.labeler = variant_labeler.VariantLabeler(
        genomics_io.make_vcf_reader(self.options.truth_variants_filename),
        read_confident_regions(self.options))

  self.variant_caller = variant_caller.VariantCaller(
      self.options.variant_caller_options)
  # Seeded RNG so runs are reproducible for a fixed random_seed option.
  self.random = np.random.RandomState(self.options.random_seed)
  self.initialized = True
def test_realigner_end2end(self):
  """Realigns a 10kb slice of chr20 and sanity-checks the results."""
  fasta_reader = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
  aligner = realigner.Realigner(
      realigner.realigner_config(FLAGS), fasta_reader)
  total_windows = 0
  chunks = ranges.RangeSet.from_regions(['chr20:10,000,000-10,009,999'])
  for chunk in chunks.partition(1000):
    with sam.SamReader(
        testdata.CHR20_BAM,
        read_requirements=reads_pb2.ReadRequirements()) as reader:
      input_reads = list(reader.query(chunk))
    windows, output_reads = aligner.realign_reads(input_reads, chunk)

    # We should always get back all of the reads we sent in. Instead of just
    # checking the lengths are the same, make sure all the read names are the
    # same.
    self.assertCountEqual([read.fragment_name for read in input_reads],
                          [read.fragment_name for read in output_reads])

    # Check each window to make sure it's reasonable. We always expect the
    # reference sequence to be one of our haplotypes.
    for window in windows:
      self.assertIn(fasta_reader.query(window.span), set(window.haplotypes))
    total_windows += len(windows)

  # At least one window should have been assembled across the whole region.
  self.assertGreater(total_windows, 0)
def test_realigner_end2end(self):
  """Realigns a 10kb slice of chr20 via the genomics_io readers."""
  reference = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
  aligner = realigner.Realigner(
      realigner.realigner_config(FLAGS), reference)
  chunks = ranges.RangeSet.from_regions(['chr20:10,000,000-10,009,999'])
  for chunk in chunks.partition(1000):
    with genomics_io.make_sam_reader(
        test_utils.CHR20_BAM, core_pb2.ReadRequirements()) as reader:
      input_reads = list(reader.query(chunk))
    windows, output_reads = aligner.realign_reads(input_reads, chunk)

    # We should always get back all of the reads we sent in. Instead of just
    # checking the lengths are the same, make sure all the read names are the
    # same.
    self.assertCountEqual([read.fragment_name for read in input_reads],
                          [read.fragment_name for read in output_reads])

    # Make sure we assembled at least one window in the region.
    self.assertNotEqual(0, len(windows))

    # Check each window to make sure it's reasonable. We always expect the
    # reference sequence to be one of our haplotypes.
    for window in windows:
      self.assertIn(reference.bases(window.span), set(window.haplotypes))
def setUp(self): self.ref_reader = fasta.IndexedFastaReader(testdata.CHR20_FASTA) # redacted FLAGS.ws_use_window_selector_model = True self.config = realigner.realigner_config(FLAGS) self.reads_realigner = realigner.Realigner(self.config, self.ref_reader)
def _initialize(self):
  """Initialize the resources needed for this work in the current env.

  Lazily builds the readers, realigner, pileup-image creator, and (in
  training mode) the labeler/caller used by this object. Must be called at
  most once; a second call raises.

  Raises:
    ValueError: if this object has already been initialized.
  """
  if self.initialized:
    raise ValueError('Cannot initialize this object twice')

  # Reference and alignment readers. The in-memory SAM reader starts empty;
  # presumably it is filled per-region later — TODO confirm against callers.
  self.ref_reader = fasta.IndexedFastaReader(
      self.options.reference_filename)
  self.sam_reader = self._make_sam_reader()
  self.in_memory_sam_reader = sam.InMemorySamReader([])

  # The realigner is optional and needs the reference reader created above.
  if self.options.realigner_enabled:
    self.realigner = realigner.Realigner(
        self.options.realigner_options, self.ref_reader)

  # Pileup images are generated from the in-memory reads, not the file
  # -backed sam_reader.
  self.pic = pileup_image.PileupImageCreator(
      ref_reader=self.ref_reader,
      sam_reader=self.in_memory_sam_reader,
      options=self.options.pic_options)

  # Labeling resources are only needed when producing training examples.
  if in_training_mode(self.options):
    self.labeler = self._make_labeler_from_options()

  self.variant_caller = variant_caller.VariantCaller(
      self.options.variant_caller_options)
  # Seeded RNG so runs are reproducible for a fixed random_seed option.
  self.random = np.random.RandomState(self.options.random_seed)
  self.initialized = True
def test_realigner_diagnostics(self, enabled, emit_reads):
  """Checks diagnostic outputs are emitted iff diagnostics are enabled.

  Bug fix: the n_haplotypes check used `assertTrue(int(...), 2)`, which
  passes 2 as the *msg* argument and only asserts truthiness — it would
  accept any non-zero haplotype count. Replaced with assertEqual.
  """
  # Make sure that by default we aren't emitting any diagnostic outputs.
  dx_dir = test_utils.test_tmpfile('dx_enabled{}_emitreads_{}'.format(
      enabled, emit_reads))
  region_str = 'chr20:10046178-10046188'
  region = ranges.parse_literal(region_str)
  assembled_region_str = 'chr20:10046096-10046267'
  reads, header = _get_reads_and_header(region)
  self.config = realigner.realigner_config(FLAGS)
  self.config.diagnostics.enabled = enabled
  self.config.diagnostics.output_root = dx_dir
  self.config.diagnostics.emit_realigned_reads = emit_reads
  self.reads_realigner = realigner.Realigner(self.config, self.ref_reader,
                                             header)
  _, _ = self.reads_realigner.realign_reads(reads, region)
  self.reads_realigner.diagnostic_logger.close()  # Force close all resources.
  if not enabled:
    # Make sure our diagnostic output isn't emitted.
    self.assertFalse(tf.io.gfile.exists(dx_dir))
  else:
    # Our root directory exists.
    self.assertTrue(tf.io.gfile.isdir(dx_dir))

    # We expect a realigner_metrics.csv in our rootdir with 1 entry in it.
    metrics_file = os.path.join(
        dx_dir, self.reads_realigner.diagnostic_logger.metrics_filename)
    self.assertTrue(tf.io.gfile.exists(metrics_file))
    with tf.io.gfile.GFile(metrics_file) as fin:
      rows = list(csv.DictReader(fin))
      self.assertLen(rows, 1)
      self.assertEqual(
          set(rows[0].keys()), {'window', 'k', 'n_haplotypes', 'time'})
      self.assertEqual(rows[0]['window'], assembled_region_str)
      self.assertEqual(int(rows[0]['k']), 25)
      # Was assertTrue(int(...), 2): 2 was the msg arg, so only truthiness
      # was checked. We actually expect exactly two haplotypes.
      self.assertEqual(int(rows[0]['n_haplotypes']), 2)
      # Check that our runtime is reasonable (greater than 0, less than 10 s).
      self.assertTrue(0.0 < float(rows[0]['time']) < 10.0)

    # As does the subdirectory for this region.
    region_subdir = os.path.join(dx_dir, assembled_region_str)
    self.assertTrue(tf.io.gfile.isdir(region_subdir))

    # We always have a graph.dot
    self.assertTrue(
        tf.io.gfile.exists(
            os.path.join(
                region_subdir,
                self.reads_realigner.diagnostic_logger.graph_filename)))

    reads_file = os.path.join(
        dx_dir, region_str,
        self.reads_realigner.diagnostic_logger.realigned_reads_filename)
    # if emit_reads=False then file should not exist and vice versa.
    self.assertEqual(emit_reads, tf.io.gfile.exists(reads_file))
def setUp(self):
  """Builds a fresh reference reader, config, and Realigner per test."""
  self.ref_reader = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
  self.config = realigner.realigner_config(FLAGS)
  self.reads_realigner = realigner.Realigner(self.config, self.ref_reader)
def test_realigner_diagnostics(self, enabled, emit_reads):
  """Checks diagnostic outputs are emitted iff diagnostics are enabled.

  Bug fix: the n_haplotypes check used `assertTrue(int(...), 2)`, which
  passes 2 as the *msg* argument and only asserts truthiness — it would
  accept any non-zero haplotype count. Replaced with assertEqual.
  """
  # Make sure that by default we aren't emitting any diagnostic outputs.
  dx_dir = test_utils.test_tmpfile('dx')
  region_str = 'chr20:10046179-10046188'
  region = ranges.parse_literal(region_str)
  assembled_region_str = 'chr20:10046109-10046257'
  reads = _get_reads(region)
  self.config = realigner.realigner_config(FLAGS)
  self.config.diagnostics.enabled = enabled
  self.config.diagnostics.output_root = dx_dir
  self.config.diagnostics.emit_realigned_reads = emit_reads
  self.reads_realigner = realigner.Realigner(self.config, self.ref_reader)
  _, realigned_reads = self.reads_realigner.realign_reads(reads, region)
  self.reads_realigner.diagnostic_logger.close()  # Force close all resources.
  if not enabled:
    # Make sure our diagnostic output isn't emitted.
    self.assertFalse(tf.gfile.Exists(dx_dir))
  else:
    # Our root directory exists.
    self.assertTrue(tf.gfile.IsDirectory(dx_dir))

    # We expect a realigner_metrics.csv in our rootdir with 1 entry in it.
    metrics_file = os.path.join(
        dx_dir, self.reads_realigner.diagnostic_logger.metrics_filename)
    self.assertTrue(tf.gfile.Exists(metrics_file))
    with tf.gfile.FastGFile(metrics_file) as fin:
      rows = list(csv.DictReader(fin))
      self.assertEqual(len(rows), 1)
      self.assertEqual(
          set(rows[0].keys()), {'window', 'k', 'n_haplotypes', 'time'})
      self.assertEqual(rows[0]['window'], assembled_region_str)
      self.assertEqual(int(rows[0]['k']), 25)
      # Was assertTrue(int(...), 2): 2 was the msg arg, so only truthiness
      # was checked. We actually expect exactly two haplotypes.
      self.assertEqual(int(rows[0]['n_haplotypes']), 2)
      # Check that our runtime is reasonable (greater than 0, less than 10 s).
      self.assertTrue(0.0 < float(rows[0]['time']) < 10.0)

    # As does the subdirectory for this region.
    region_subdir = os.path.join(dx_dir, assembled_region_str)
    self.assertTrue(tf.gfile.IsDirectory(region_subdir))

    # We always have a graph.dot
    self.assertTrue(
        tf.gfile.Exists(
            os.path.join(
                region_subdir,
                self.reads_realigner.diagnostic_logger.graph_filename)))

    reads_file = os.path.join(
        dx_dir, region_str,
        self.reads_realigner.diagnostic_logger.realigned_reads_filename)
    if emit_reads:
      # The realigned reads on disk should round-trip to what realign_reads
      # returned.
      self.assertTrue(tf.gfile.Exists(reads_file))
      reads_from_dx = io_utils.read_tfrecords(reads_file, reads_pb2.Read)
      self.assertCountEqual(reads_from_dx, realigned_reads)
    else:
      self.assertFalse(tf.gfile.Exists(reads_file))
def setUp(self):
  """Builds a fresh reference reader, config, and Realigner per test."""
  self.ref_reader = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
  self.config = realigner.realigner_config(FLAGS)
  self.reads_realigner = realigner.Realigner(self.config, self.ref_reader)