Example No. 1
    def _initialize(self):
        """Initialize the resources needed for this work in the current env."""
        if self.initialized:
            raise ValueError('Cannot initialize this object twice')

        self.ref_reader = genomics_io.make_ref_reader(
            self.options.reference_filename)
        self.sam_reader = self._make_sam_reader()
        self.in_memory_sam_reader = utils.InMemorySamReader([])

        if self.options.realigner_enabled:
            self.realigner = realigner.Realigner(
                self.options.realigner_options, self.ref_reader)
        self.pic = pileup_image.PileupImageCreator(
            ref_reader=self.ref_reader,
            sam_reader=self.in_memory_sam_reader,
            options=self.options.pic_options)

        if in_training_mode(self.options):
            self.labeler = variant_labeler.VariantLabeler(
                genomics_io.make_vcf_reader(
                    self.options.truth_variants_filename),
                read_confident_regions(self.options))

        self.variant_caller = variant_caller.VariantCaller(
            self.options.variant_caller_options)
        self.random = np.random.RandomState(self.options.random_seed)
        self.initialized = True
Example No. 2
  def test_realigner_end2end(self):
    ref_reader = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
    config = realigner.realigner_config(FLAGS)
    reads_realigner = realigner.Realigner(config, ref_reader)
    region_str = 'chr20:10,000,000-10,009,999'
    windows_count = 0

    regions = ranges.RangeSet.from_regions([region_str])
    for region in regions.partition(1000):
      with sam.SamReader(
          testdata.CHR20_BAM,
          read_requirements=reads_pb2.ReadRequirements()) as sam_reader:
        in_reads = list(sam_reader.query(region))
      windows, out_reads = reads_realigner.realign_reads(in_reads, region)

      # We should always get back all of the reads we sent in. Instead of just
      # checking the lengths are the same, make sure all the read names are the
      # same.
      self.assertCountEqual([r.fragment_name for r in in_reads],
                            [r.fragment_name for r in out_reads])

      # Check each window to make sure it's reasonable.
      for window in windows:
        # We always expect the reference sequence to be one of our haplotypes.
        ref_seq = ref_reader.query(window.span)
        self.assertIn(ref_seq, set(window.haplotypes))
      windows_count += len(windows)

    self.assertGreater(windows_count, 0)
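A note on the chunking used above: this test walks a 10 kb span in 1,000 bp pieces via ranges.RangeSet.partition. The fragment below is only an illustrative sketch, not part of the test suite; it assumes the same ranges module used here and the standard Range proto fields, and simply prints the sub-regions the loop iterates over.

# Illustrative sketch (assumed standalone usage): show what
# RangeSet.partition(1000) yields for the 10 kb literal used above.
regions = ranges.RangeSet.from_regions(['chr20:10,000,000-10,009,999'])
for region in regions.partition(1000):
  # Each `region` is a Range with reference_name, start, and end fields,
  # covering at most 1,000 reference bases.
  print(region.reference_name, region.start, region.end)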
Example No. 3
    def test_realigner_end2end(self):
        ref_reader = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
        config = realigner.realigner_config(FLAGS)
        reads_realigner = realigner.Realigner(config, ref_reader)
        region_str = 'chr20:10,000,000-10,009,999'

        regions = ranges.RangeSet.from_regions([region_str])
        for region in regions.partition(1000):
            with genomics_io.make_sam_reader(
                    test_utils.CHR20_BAM,
                    core_pb2.ReadRequirements()) as sam_reader:
                in_reads = list(sam_reader.query(region))
            windows, out_reads = reads_realigner.realign_reads(
                in_reads, region)

            # We should always get back all of the reads we sent in. Instead of just
            # checking the lengths are the same, make sure all the read names are the
            # same.
            self.assertCountEqual([r.fragment_name for r in in_reads],
                                  [r.fragment_name for r in out_reads])

            # Make sure we assembled at least one window in the region.
            self.assertNotEqual(0, len(windows))

            # Check each window to make sure it's reasonable.
            for window in windows:
                # We always expect the reference sequence to be one of our haplotypes.
                ref_seq = ref_reader.bases(window.span)
                self.assertIn(ref_seq, set(window.haplotypes))
Example No. 4
  def setUp(self):
    self.ref_reader = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
    # redacted
    FLAGS.ws_use_window_selector_model = True
    self.config = realigner.realigner_config(FLAGS)
    self.reads_realigner = realigner.Realigner(self.config, self.ref_reader)
Example No. 5
    def _initialize(self):
        """Initialize the resources needed for this work in the current env."""
        if self.initialized:
            raise ValueError('Cannot initialize this object twice')

        self.ref_reader = fasta.IndexedFastaReader(
            self.options.reference_filename)
        self.sam_reader = self._make_sam_reader()
        self.in_memory_sam_reader = sam.InMemorySamReader([])

        if self.options.realigner_enabled:
            self.realigner = realigner.Realigner(
                self.options.realigner_options, self.ref_reader)
        self.pic = pileup_image.PileupImageCreator(
            ref_reader=self.ref_reader,
            sam_reader=self.in_memory_sam_reader,
            options=self.options.pic_options)

        if in_training_mode(self.options):
            self.labeler = self._make_labeler_from_options()

        self.variant_caller = variant_caller.VariantCaller(
            self.options.variant_caller_options)
        self.random = np.random.RandomState(self.options.random_seed)
        self.initialized = True
Example No. 6
    def test_realigner_diagnostics(self, enabled, emit_reads):
        # Make sure that by default we aren't emitting any diagnostic outputs.
        dx_dir = test_utils.test_tmpfile('dx_enabled{}_emitreads_{}'.format(
            enabled, emit_reads))
        region_str = 'chr20:10046178-10046188'
        region = ranges.parse_literal(region_str)
        assembled_region_str = 'chr20:10046096-10046267'
        reads, header = _get_reads_and_header(region)
        self.config = realigner.realigner_config(FLAGS)
        self.config.diagnostics.enabled = enabled
        self.config.diagnostics.output_root = dx_dir
        self.config.diagnostics.emit_realigned_reads = emit_reads
        self.reads_realigner = realigner.Realigner(self.config,
                                                   self.ref_reader, header)
        _, _ = self.reads_realigner.realign_reads(reads, region)
        # Force close all resources.
        self.reads_realigner.diagnostic_logger.close()

        if not enabled:
            # Make sure our diagnostic output isn't emitted.
            self.assertFalse(tf.io.gfile.exists(dx_dir))
        else:
            # Our root directory exists.
            self.assertTrue(tf.io.gfile.isdir(dx_dir))

            # We expect a realigner_metrics.csv in our rootdir with 1 entry in it.
            metrics_file = os.path.join(
                dx_dir,
                self.reads_realigner.diagnostic_logger.metrics_filename)
            self.assertTrue(tf.io.gfile.exists(metrics_file))
            with tf.io.gfile.GFile(metrics_file) as fin:
                rows = list(csv.DictReader(fin))
                self.assertLen(rows, 1)
                self.assertEqual(set(rows[0].keys()),
                                 {'window', 'k', 'n_haplotypes', 'time'})
                self.assertEqual(rows[0]['window'], assembled_region_str)
                self.assertEqual(int(rows[0]['k']), 25)
                self.assertEqual(int(rows[0]['n_haplotypes']), 2)
                # Check that our runtime is reasonable (greater than 0, less than 10 s).
                self.assertTrue(0.0 < float(rows[0]['time']) < 10.0)

            # As does the subdirectory for this region.
            region_subdir = os.path.join(dx_dir, assembled_region_str)
            self.assertTrue(tf.io.gfile.isdir(region_subdir))

            # We always have a graph.dot
            self.assertTrue(
                tf.io.gfile.exists(
                    os.path.join(
                        region_subdir, self.reads_realigner.diagnostic_logger.
                        graph_filename)))

            reads_file = os.path.join(
                dx_dir, region_str, self.reads_realigner.diagnostic_logger.
                realigned_reads_filename)

            # If emit_reads is False, the file should not exist, and vice versa.
            self.assertEqual(emit_reads, tf.io.gfile.exists(reads_file))
Example No. 7
  def setUp(self):
    self.ref_reader = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
    self.config = realigner.realigner_config(FLAGS)
    self.reads_realigner = realigner.Realigner(self.config, self.ref_reader)
Example No. 8
    def test_realigner_diagnostics(self, enabled, emit_reads):
        # Make sure that by default we aren't emitting any diagnostic outputs.
        dx_dir = test_utils.test_tmpfile('dx')
        region_str = 'chr20:10046179-10046188'
        region = ranges.parse_literal(region_str)
        assembled_region_str = 'chr20:10046109-10046257'
        reads = _get_reads(region)
        self.config = realigner.realigner_config(FLAGS)
        self.config.diagnostics.enabled = enabled
        self.config.diagnostics.output_root = dx_dir
        self.config.diagnostics.emit_realigned_reads = emit_reads
        self.reads_realigner = realigner.Realigner(self.config,
                                                   self.ref_reader)
        _, realigned_reads = self.reads_realigner.realign_reads(reads, region)
        # Force close all resources.
        self.reads_realigner.diagnostic_logger.close()

        if not enabled:
            # Make sure our diagnostic output isn't emitted.
            self.assertFalse(tf.gfile.Exists(dx_dir))
        else:
            # Our root directory exists.
            self.assertTrue(tf.gfile.IsDirectory(dx_dir))

            # We expect a realigner_metrics.csv in our rootdir with 1 entry in it.
            metrics_file = os.path.join(
                dx_dir,
                self.reads_realigner.diagnostic_logger.metrics_filename)
            self.assertTrue(tf.gfile.Exists(metrics_file))
            with tf.gfile.FastGFile(metrics_file) as fin:
                rows = list(csv.DictReader(fin))
                self.assertEqual(len(rows), 1)
                self.assertEqual(set(rows[0].keys()),
                                 {'window', 'k', 'n_haplotypes', 'time'})
                self.assertEqual(rows[0]['window'], assembled_region_str)
                self.assertEqual(int(rows[0]['k']), 25)
                self.assertEqual(int(rows[0]['n_haplotypes']), 2)
                # Check that our runtime is reasonable (greater than 0, less than 10 s).
                self.assertTrue(0.0 < float(rows[0]['time']) < 10.0)

            # As does the subdirectory for this region.
            region_subdir = os.path.join(dx_dir, assembled_region_str)
            self.assertTrue(tf.gfile.IsDirectory(region_subdir))

            # We always have a graph.dot
            self.assertTrue(
                tf.gfile.Exists(
                    os.path.join(
                        region_subdir, self.reads_realigner.diagnostic_logger.
                        graph_filename)))

            reads_file = os.path.join(
                dx_dir, region_str, self.reads_realigner.diagnostic_logger.
                realigned_reads_filename)
            if emit_reads:
                self.assertTrue(tf.gfile.Exists(reads_file))
                reads_from_dx = io_utils.read_tfrecords(
                    reads_file, reads_pb2.Read)
                self.assertCountEqual(reads_from_dx, realigned_reads)
            else:
                self.assertFalse(tf.gfile.Exists(reads_file))
Example No. 9
  def setUp(self):
    self.ref_reader = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
    self.config = realigner.realigner_config(FLAGS)
    self.reads_realigner = realigner.Realigner(self.config, self.ref_reader)
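Taken together, the snippets above follow one pattern: open a reference reader, build a realigner config from flags, construct realigner.Realigner, and hand a list of reads plus a region to realign_reads, which returns the assembly windows and the realigned reads. The function below condenses that pattern into a single sketch. It is assembled only from the calls shown in Examples 2, 6, and 8; the import paths and the helper name realign_region are assumptions for illustration, not part of the library's documented API.

# Minimal usage sketch assembled from the calls shown in the examples above.
# Import paths are assumptions; adjust them to match your checkout.
from deepvariant.realigner import realigner
from third_party.nucleus.io import fasta
from third_party.nucleus.io import sam
from third_party.nucleus.protos import reads_pb2
from third_party.nucleus.util import ranges


def realign_region(fasta_path, bam_path, region_literal, flags):
  """Realigns reads overlapping `region_literal`; returns (windows, reads)."""
  ref_reader = fasta.IndexedFastaReader(fasta_path)
  config = realigner.realigner_config(flags)
  reads_realigner = realigner.Realigner(config, ref_reader)

  region = ranges.parse_literal(region_literal)
  with sam.SamReader(
      bam_path, read_requirements=reads_pb2.ReadRequirements()) as sam_reader:
    in_reads = list(sam_reader.query(region))

  # As in the tests above, realign_reads returns the assembled windows and
  # the full set of (possibly realigned) input reads.
  return reads_realigner.realign_reads(in_reads, region)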