Exemplo n.º 1
0
 def log_graph_metrics(self, region, graph, candidate_haplotypes,
                       graph_building_time):
   """Writes graph diagnostics for region; does nothing when disabled.

   When a graph is supplied, its graphviz rendering is saved to the
   region-specific file named by self.graph_filename. A CSV line is then
   written with the region literal, the graph's kmer size ('NA' when there
   is no graph), the number of candidate haplotypes and the build time.

   Args:
     region: The region the graph was built for.
     graph: The constructed graph, or a falsy value if none was built.
     candidate_haplotypes: Sized collection of candidate haplotypes; only
       its length is recorded.
     graph_building_time: Graph construction time, passed through to the
       CSV line unchanged.
   """
   if not self.enabled:
     return

   if graph:
     graphviz_path = self._file_for_region(region, self.graph_filename)
     with tf.gfile.FastGFile(graphviz_path, 'w') as out:
       out.write(graph.graphviz())

   # The CSV line is emitted whether or not a graph was built.
   region_literal = ranges.to_literal(region)
   kmer_size = graph.kmer_size if graph else 'NA'
   haplotype_count = len(candidate_haplotypes)
   self._write_csv_line(region_literal, kmer_size, haplotype_count,
                        graph_building_time)
    def test_catches_bad_flags(self):
        """Checks that main() rejects training mode without confident_regions.

        Both logging.error and sys.exit are patched, so the test asserts on
        the reported message and the exit code rather than terminating.
        """
        expected_error = 'confident_regions is required when in training mode.'
        target_region = ranges.parse_literal('chr20:10,000,000-10,010,000')

        # Configure a training run using the test data flags.
        FLAGS.ref = test_utils.CHR20_FASTA
        FLAGS.reads = test_utils.CHR20_BAM
        FLAGS.candidates = test_utils.test_tmpfile('vsc.tfrecord')
        FLAGS.examples = test_utils.test_tmpfile('examples.tfrecord')
        FLAGS.regions = [ranges.to_literal(target_region)]
        FLAGS.partition_size = 1000
        FLAGS.mode = 'training'
        FLAGS.truth_variants = test_utils.TRUTH_VARIANTS_VCF
        # The deliberate misconfiguration: no confident regions supplied.
        FLAGS.confident_regions = ''

        with mock.patch.object(logging, 'error') as error_log:
            with mock.patch.object(sys, 'exit') as exit_call:
                make_examples.main(['make_examples.py'])

        error_log.assert_called_once_with(expected_error)
        exit_call.assert_called_once_with(errno.ENOENT)
Exemplo n.º 3
0
    def process(self, region):
        """Finds candidates and creates corresponding examples in a region.

        Args:
          region: A nucleus.genomics.v1.Range proto. Specifies the region on
            the genome we should process.

        Returns:
          Three values. First is a list of the found candidates, which are
          deepvariant.DeepVariantCall objects. The second value is a list of
          filled in tf.Example protos. For example, these will include the
          candidate variant, the pileup image, and, if in training mode, the
          truth variants and labels needed for training. The third value is a
          list of nucleus.genomics.v1.Variant protos containing gVCF
          information for all reference sites, if gvcf generation is enabled,
          otherwise returns [].
        """
        region_timer = timer.TimerStart()

        # Initialization is deferred until the first region is processed.
        if not self.initialized:
            self._initialize()

        self.in_memory_sam_reader.replace_reads(self.region_reads(region))
        candidates, gvcfs = self.candidates_in_region(region)
        examples = []
        for candidate in candidates:
            for example in self.create_pileup_examples(candidate):
                # Outside training mode every example is kept; in training
                # mode an example is kept only when label_variant() returns a
                # truthy value for it.
                if (not in_training_mode(self.options)
                        or self.label_variant(example, candidate.variant)):
                    examples.append(example)
        # Report both counts: one candidate can yield several examples, and
        # training-mode labeling can drop examples, so the two may differ.
        logging.info(
            'Found %s candidates (%s examples) in %s [%0.2fs elapsed]',
            len(candidates), len(examples), ranges.to_literal(region),
            region_timer.Stop())
        # Useful for debugging what examples are emitted...
        # for example in examples:
        #   logging.info('  example: %s', tf_utils.example_key(example))
        return candidates, examples, gvcfs
Exemplo n.º 4
0
def make_example(variant, alt_alleles, encoded_image, shape, image_format):
    """Builds the tf.Example that DeepVariant uses for one candidate image.

    Args:
      variant: third_party.nucleus.protos.Variant protobuf containing
        information about a candidate variant call.
      alt_alleles: A set of strings. The alternate alleles treated as "alt"
        when the image was constructed; each must appear in
        variant.alternate_bases.
      encoded_image: a Tensor of type tf.string. The encoded image of the
        reference and read data supporting variant, encoded consistently with
        image_format.
      shape: a list of (width, height, channel).
      image_format: string. The scheme used to encode the image.

    Returns:
      A tf.Example proto containing the standard DeepVariant features:
      'locus', 'variant/encoded', 'alt_allele_indices/encoded',
      'image/encoded', 'image/format' and 'image/shape'.

    Raises:
      ValueError: if an entry of alt_alleles is not in
        variant.alternate_bases.
    """
    example = example_pb2.Example()
    feature_map = example.features.feature

    locus = ranges.to_literal(
        ranges.make_range(variant.reference_name, variant.start, variant.end))
    feature_map['locus'].bytes_list.value.append(locus)
    feature_map['variant/encoded'].bytes_list.value.append(
        variant.SerializeToString())

    # Store the ascending positions of the chosen alts within the variant's
    # alternate_bases list.
    known_alts = list(variant.alternate_bases)
    alt_indices = [known_alts.index(alt) for alt in alt_alleles]
    alt_indices.sort()
    encoded_indices = deepvariant_pb2.CallVariantsOutput.AltAlleleIndices(
        indices=alt_indices).SerializeToString()
    feature_map['alt_allele_indices/encoded'].bytes_list.value.append(
        encoded_indices)

    feature_map['image/encoded'].bytes_list.value.append(encoded_image)
    feature_map['image/format'].bytes_list.value.append(image_format)
    feature_map['image/shape'].int64_list.value.extend(shape)
    return example
Exemplo n.º 5
0
 def test_to_literal(self):
   """The range made from ('chr1', 0, 20) must render as 'chr1:1-20'."""
   chr1_range = ranges.make_range('chr1', 0, 20)
   self.assertEqual(ranges.to_literal(chr1_range), 'chr1:1-20')
Exemplo n.º 6
0
 def _file_for_region(self, region, basename):
     """Builds the path for basename inside the subdirectory named after region.

     Must only be called while diagnostics are enabled; this is asserted.
     """
     assert self.enabled, 'only callable when diagnostics are on'
     region_dir = ranges.to_literal(region)
     relative_path = os.path.join(region_dir, basename)
     return self._root_join(relative_path)
    def test_make_examples_end2end(self, mode, num_shards):
        """Runs make_examples end to end and validates what it writes.

        Args:
          mode: Either 'calling' or 'training'.
          num_shards: Shard count for the output files. One task is run per
            shard, and at least one task is always run.
        """
        self.assertIn(mode, {'calling', 'training'})
        is_calling = mode == 'calling'

        def tmp_output(basename):
            # Temporary output path with the sharding spec applied.
            return test_utils.test_tmpfile(_sharded(basename, num_shards))

        region = ranges.parse_literal('chr20:10,000,000-10,010,000')
        FLAGS.ref = test_utils.CHR20_FASTA
        FLAGS.reads = test_utils.CHR20_BAM
        FLAGS.candidates = tmp_output('vsc.tfrecord')
        FLAGS.examples = tmp_output('examples.tfrecord')
        FLAGS.regions = [ranges.to_literal(region)]
        FLAGS.partition_size = 1000
        FLAGS.mode = mode

        # Calling mode additionally emits gVCF records; training mode needs
        # the truth set and confident regions instead.
        if is_calling:
            FLAGS.gvcf = tmp_output('gvcf.tfrecord')
        else:
            FLAGS.truth_variants = test_utils.TRUTH_VARIANTS_VCF
            FLAGS.confident_regions = test_utils.CONFIDENT_REGIONS_BED

        # Run every task; `options` from the final task is reused by the
        # verification helpers below.
        task_count = max(num_shards, 1)
        for task_id in range(task_count):
            FLAGS.task = task_id
            options = make_examples.default_options(add_flags=True)
            make_examples.make_examples_runner(options)

        # The candidates must be well formed, and so must their variants.
        candidates = _sort_candidates(
            io_utils.read_tfrecords(FLAGS.candidates,
                                    proto=deepvariant_pb2.DeepVariantCall))
        self.verify_deepvariant_calls(candidates, options)
        candidate_variants = [call.variant for call in candidates]
        self.verify_variants(candidate_variants, region, options,
                             is_gvcf=False)

        # The variants embedded in the examples must be good as well.
        examples = self.verify_examples(FLAGS.examples,
                                        region,
                                        options,
                                        verify_labels=mode == 'training')
        example_variants = [tf_utils.example_variant(ex) for ex in examples]
        self.verify_variants(example_variants, region, options, is_gvcf=False)

        # Compare against the golden examples. Both modes are expected to
        # produce a deterministic order because the random seed is fixed.
        golden_base = (test_utils.GOLDEN_CALLING_EXAMPLES if is_calling
                       else test_utils.GOLDEN_TRAINING_EXAMPLES)
        golden_file = _sharded(golden_base, num_shards)
        golden_examples = list(io_utils.read_tfrecords(golden_file))
        self.assertDeepVariantExamplesEqual(examples, golden_examples)

        if not is_calling:
            return

        # Calling mode only: concordance with the truth variants in region.
        nist_reader = genomics_io.make_vcf_reader(
            test_utils.TRUTH_VARIANTS_VCF)
        nist_variants = list(nist_reader.query(region))
        self.verify_nist_concordance(example_variants, nist_variants)

        # Calling mode only: check the quality of the generated gvcf file.
        gvcfs = _sort_variants(
            io_utils.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
        self.verify_variants(gvcfs, region, options, is_gvcf=True)
        self.verify_contiguity(gvcfs, region)