예제 #1
0
def create_region_filter(region_flag_string, verbose=False):
  """Create a function that acts as a regions filter.

  Args:
    region_flag_string: string from --regions, holding whitespace-separated
      region arguments. An already-split iterable of region arguments is
      accepted as well.
    verbose: bool. Whether to log the regions after parsing.

  Returns:
    A function that given a variant will return True or False whether the
        variant falls inside the regions.
  """
  if isinstance(region_flag_string, str):
    region_args = region_flag_string.split()
  else:
    # Previously any non-string value left `region_args` unbound and the next
    # statement failed with UnboundLocalError. Treat such a value as a sequence
    # of region arguments that has already been split.
    region_args = list(region_flag_string)
  regions = ranges.RangeSet.from_regions(region_args)
  if verbose:
    logging.info('Regions to filter to: %s',
                 ', '.join(ranges.to_literal(r) for r in regions))

  def passes_region_filter(variant):
    """True if position_overlaps holds for the variant and any region."""
    return any(
        ranges.position_overlaps(variant.reference_name, variant.start, r)
        for r in regions)

  return passes_region_filter
예제 #2
0
  def test_make_examples_training_end2end_with_alt_aligned_pileup(
      self, alt_align, expected_shape):
    """Runs training-mode make_examples with an alt-aligned pileup setting.

    Args:
      alt_align: value assigned to FLAGS.alt_aligned_pileup. Golden data is
        only selected for 'rows' and 'diff_channels'; anything else raises
        ValueError after the run.
      expected_shape: expected 'image/shape' of the first decoded example.
    """
    target_region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.regions = [ranges.to_literal(target_region)]
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
    FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
    FLAGS.partition_size = 1000
    FLAGS.mode = 'training'
    FLAGS.gvcf_gq_binsize = 5
    # The pileup setting is the only input that varies between parameterizations.
    FLAGS.alt_aligned_pileup = alt_align
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED
    run_options = make_examples.default_options(add_flags=True)
    # Execute make_examples using the flags configured above.
    make_examples_core.make_examples_runner(run_options)

    # Pick the golden file that corresponds to the pileup setting.
    if alt_align not in ('rows', 'diff_channels'):
      raise ValueError(
          "Golden data doesn't exist for this alt_align option: {}".format(
              alt_align))
    if alt_align == 'rows':
      golden_path = _sharded(testdata.ALT_ALIGNED_ROWS_EXAMPLES)
    else:
      golden_path = _sharded(testdata.ALT_ALIGNED_DIFF_CHANNELS_EXAMPLES)

    # Validate the produced examples, then compare them with the golden ones.
    produced = self.verify_examples(
        FLAGS.examples, target_region, run_options, verify_labels=True)
    golden_examples = list(tfrecord.read_tfrecords(golden_path))
    self.assertDeepVariantExamplesEqual(produced, golden_examples)

    # The first example's decoded shape must match the parameterized value.
    first_shape = decode_example(produced[0])['image/shape']
    self.assertEqual(first_shape, expected_shape)
예제 #3
0
  def test_make_examples_runtime_by_region(self):
    """Checks the per-region runtime profile written for a single task.

    Runs make_examples in calling mode as task 2 of 4 with
    FLAGS.runtime_by_region set, then verifies the header, the number of data
    rows, and that the count columns of the first row are positive.
    """
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.mode = 'calling'
    num_shards = 4
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    # Use same number of shards for profiling files as examples.
    output_prefix = test_utils.test_tmpfile('runtime_profile')
    FLAGS.runtime_by_region = output_prefix + '@{}'.format(num_shards)
    FLAGS.task = 2
    # Run make_examples with those FLAGS.
    options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(options)
    # Sharded output ending in @4 becomes -00002-of-00004 for task 2. The
    # suffix is derived from the task and shard count set above so it cannot
    # drift out of sync with them.
    expected_output_path = '{}-{:05d}-of-{:05d}'.format(
        output_prefix, FLAGS.task, num_shards)
    expected_columns = [
        'region', 'get reads', 'find candidates', 'make pileup images',
        'write outputs', 'num reads', 'num candidates', 'num examples'
    ]

    with gfile.Open(expected_output_path, 'r') as fin:
      header = fin.readline()
      column_names = header.strip().split('\t')
      self.assertEqual(expected_columns, column_names)
      non_header_lines = fin.readlines()
      self.assertLen(non_header_lines, 3)
      one_row = non_header_lines[0].strip().split('\t')
      self.assertEqual(len(one_row), len(column_names))
      # Look the counts up by column name rather than by positional index.
      row_by_column = dict(zip(column_names, one_row))
      for count_column in ('num reads', 'num candidates', 'num examples'):
        self.assertGreater(
            int(row_by_column[count_column]),
            0,
            msg='{} > 0'.format(count_column))
예제 #4
0
def make_example(variant, alt_alleles, encoded_image, shape, image_format):
    """Builds a tf.Example carrying the standard DeepVariant features.

    Args:
      variant: third_party.nucleus.protos.Variant protobuf describing a
        candidate variant call.
      alt_alleles: A set of strings. The alternate alleles treated as "alt"
        when the image was constructed; each must occur in
        variant.alternate_bases.
      encoded_image: a Tensor of type tf.string. The image encoding the
        reference and read data supporting the variant, encoded consistently
        with image_format.
      shape: a list of (width, height, channel).
      image_format: string. The scheme used to encode the image.

    Returns:
      A tf.Example proto containing the standard DeepVariant features.
    """
    example = example_pb2.Example()
    feats = example.features

    # Record the locus as a range literal for the variant's span.
    locus_range = ranges.make_range(variant.reference_name, variant.start,
                                    variant.end)
    feats.feature['locus'].bytes_list.value.append(
        ranges.to_literal(locus_range))
    example_set_variant(example, variant)

    # Translate the chosen alt alleles into sorted positions within the
    # variant's alternate_bases.
    known_alts = list(variant.alternate_bases)
    alt_indices = [known_alts.index(allele) for allele in alt_alleles]
    alt_indices.sort()
    indices_proto = deepvariant_pb2.CallVariantsOutput.AltAlleleIndices(
        indices=alt_indices)
    feats.feature['alt_allele_indices/encoded'].bytes_list.value.append(
        indices_proto.SerializeToString())

    # Image payload plus the metadata describing it.
    feats.feature['image/encoded'].bytes_list.value.append(encoded_image)
    feats.feature['image/format'].bytes_list.value.append(image_format)
    feats.feature['image/shape'].int64_list.value.extend(shape)
    return example
예제 #5
0
  def test_make_examples_with_allele_frequency(self, mode):
    """Runs calling-mode make_examples with the allele frequency flag on.

    Args:
      mode: 'one vcf' or 'two vcfs'; selects how FLAGS.population_vcfs is
        populated. Any other value raises ValueError.
    """
    FLAGS.mode = 'calling'
    FLAGS.ref = testdata.GRCH38_FASTA
    FLAGS.reads = testdata.GRCH38_CHR20_AND_21_BAM
    shard_count = 1
    examples_pattern = _sharded('examples.tfrecord', shard_count)
    FLAGS.examples = test_utils.test_tmpfile(examples_pattern)
    query_region = ranges.parse_literal('chr20:61001-62000')
    FLAGS.use_allele_frequency = True
    FLAGS.regions = [ranges.to_literal(query_region)]
    if mode not in ('one vcf', 'two vcfs'):
      raise ValueError('Invalid mode for parameterized test.')
    if mode == 'one vcf':
      FLAGS.population_vcfs = testdata.AF_VCF_CHR20_AND_21
    else:
      vcf_paths = [testdata.AF_VCF_CHR20, testdata.AF_VCF_CHR21]
      FLAGS.population_vcfs = ' '.join(vcf_paths)
    run_options = make_examples.default_options(add_flags=True)
    # Execute make_examples using the flags configured above.
    make_examples_core.make_examples_runner(run_options)

    # Validate the variants carried by the produced examples.
    produced = self.verify_examples(
        FLAGS.examples, query_region, run_options, verify_labels=False)

    # The pileup image is expected to carry one channel beyond the default.
    expected_shape = [100, 221, dv_constants.PILEUP_NUM_CHANNELS + 1]
    self.assertEqual(expected_shape,
                     decode_example(produced[0])['image/shape'])

    # The added channel must be non-empty at these loci. Each flag flips to
    # True once the corresponding locus shows up among the examples.
    seen_loci = {
        'chr20:61539_A': False,
        'chr20:61634_G': False,
        'chr20:61644_G': False
    }

    for example in produced:
      locus_id = vis.locus_id_from_variant(vis.variant_from_example(example))
      if locus_id not in seen_loci:
        continue
      channels = vis.channels_from_example(example)
      channel_total = np.sum(channels[dv_constants.PILEUP_NUM_CHANNELS])
      failure_msg = (
          'There should be something in the %s-th channel for variant %s' %
          (dv_constants.PILEUP_NUM_CHANNELS + 1, locus_id))
      self.assertGreater(channel_total, 0, msg=failure_msg)
      seen_loci[locus_id] = True
    self.assertTrue(
        all(seen_loci.values()),
        msg='Check that all 3 sample loci appeared in the examples.')

    # Both modes are compared against the same golden file.
    golden_path = _sharded(testdata.GOLDEN_ALLELE_FREQUENCY_EXAMPLES)
    golden_examples = list(tfrecord.read_tfrecords(golden_path))
    self.assertDeepVariantExamplesEqual(golden_examples, produced)
예제 #6
0
 def log_graph_metrics(self, region, graph, candidate_haplotypes,
                       graph_building_time):
   """Writes graph construction diagnostics for region when enabled.

   When a graph is given, its graphviz() output is written to the region's
   graph file. A CSV line with the region literal, the graph's kmer_size (or
   'NA' without a graph), the haplotype count and the build time is always
   written. Does nothing when self.enabled is false.
   """
   if not self.enabled:
     return
   if graph:
     graph_path = self._file_for_region(region, self.graph_filename)
     with tf.gfile.FastGFile(graph_path, 'w') as out:
       out.write(graph.graphviz())
   region_literal = ranges.to_literal(region)
   kmer_size = graph.kmer_size if graph else 'NA'
   self._write_csv_line(region_literal, kmer_size, len(candidate_haplotypes),
                        graph_building_time)
예제 #7
0
  def test_make_examples_end2end_failed_on_cram(self):
    """Expects a ValueError when CRAM reads are used with use_ref_for_cram off."""
    target_region = ranges.parse_literal('chr20:10,000,000-10,010,000')

    FLAGS.use_ref_for_cram = False
    FLAGS.write_run_info = True
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_CRAM
    candidates_pattern = _sharded('failed.vsc.tfrecord')
    FLAGS.candidates = test_utils.test_tmpfile(candidates_pattern)
    examples_pattern = _sharded('failed.examples.tfrecord')
    FLAGS.examples = test_utils.test_tmpfile(examples_pattern)
    FLAGS.regions = [ranges.to_literal(target_region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = 'calling'
    FLAGS.gvcf_gq_binsize = 5
    run_options = make_examples.default_options(add_flags=True)

    # The runner itself must raise; building the options above must not.
    expected_error = 'Failed to parse BAM/CRAM file.'
    with six.assertRaisesRegex(self, ValueError, expected_error):
      make_examples_core.make_examples_runner(run_options)
예제 #8
0
    def process(self, region):
        """Finds candidates and creates corresponding examples in a region.

        Args:
          region: A nucleus.genomics.v1.Range proto. Specifies the region on
            the genome we should process.

        Returns:
          Three values. First is a list of the found candidates, which are
          deepvariant.DeepVariantCall objects. The second value is a list of
          filled in tf.Example protos. For example, these will include the
          candidate variant, the pileup image, and, if in training mode, the
          truth variants and labels needed for training. The third value is a
          list of nucleus.genomics.v1.Variant protos containing gVCF
          information for all reference sites, if gvcf generation is enabled,
          otherwise returns [].
        """
        region_timer = timer.TimerStart()

        # Set up lazily on first use.
        if not self.initialized:
            self._initialize()

        self.in_memory_sam_reader.replace_reads(self.region_reads(region))
        candidates, gvcfs = self.candidates_in_region(region)

        if in_training_mode(self.options):
            examples = [
                self.add_label_to_example(example, label)
                for candidate, label in self.label_candidates(
                    candidates, region)
                for example in self.create_pileup_examples(candidate)
            ]
        else:
            examples = [
                example for candidate in candidates
                for example in self.create_pileup_examples(candidate)
            ]

        # The number of examples need not equal the number of candidates
        # (create_pileup_examples yields a sequence per candidate), so report
        # both instead of labelling the example count as "candidates".
        logging.info(
            'Found %s candidates (%s examples) in %s [%d bp] '
            '[%0.2fs elapsed]', len(candidates), len(examples),
            ranges.to_literal(region), ranges.length(region),
            region_timer.Stop())
        return candidates, examples, gvcfs
예제 #9
0
  def test_catches_bad_flags(self):
    """main() must log an error and exit when confident_regions is empty."""
    # Configure a training run that is valid apart from confident_regions.
    target_region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile('vsc.tfrecord')
    FLAGS.examples = test_utils.test_tmpfile('examples.tfrecord')
    FLAGS.regions = [ranges.to_literal(target_region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = 'training'
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    # The deliberately invalid flag value.
    FLAGS.confident_regions = ''

    # Intercept both the error log and the process exit.
    with mock.patch.object(logging, 'error') as error_log:
      with mock.patch.object(sys, 'exit') as exit_call:
        make_examples.main(['make_examples.py'])
    expected_message = 'confident_regions is required when in training mode.'
    error_log.assert_called_once_with(expected_message)
    exit_call.assert_called_once_with(errno.ENOENT)
예제 #10
0
  def test_make_examples_with_variant_selection(
      self, select_types, expected_count, keep_legacy_behavior=False):
    """Checks how many candidates are written for a variant-type selection.

    Args:
      select_types: value for FLAGS.select_variant_types; the flag is left
        untouched when this is None.
      expected_count: number of candidate records expected in the output.
      keep_legacy_behavior: value for
        FLAGS.keep_legacy_allele_counter_behavior.
    """
    if select_types is not None:
      FLAGS.select_variant_types = select_types
    target_region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.regions = [ranges.to_literal(target_region)]
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
    FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
    FLAGS.partition_size = 1000
    FLAGS.mode = 'calling'
    FLAGS.keep_legacy_allele_counter_behavior = keep_legacy_behavior
    run_options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(run_options)

    # Count the candidate records that were written.
    written = list(tfrecord.read_tfrecords(FLAGS.candidates))
    self.assertLen(written, expected_count)
예제 #11
0
  def test_make_examples_end2end_confirm_downsample_fraction_used(self):
    """A downsample fraction of 0.01 must yield fewer examples than none."""
    target_region = ranges.parse_literal('chr20:10,000,000-10,004,000')
    FLAGS.regions = [ranges.to_literal(target_region)]
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
    FLAGS.mode = 'calling'

    def _run_and_collect(downsample_fraction=None):
      """Runs make_examples and returns the verified examples."""
      # Leave the flag untouched unless a fraction was requested.
      if downsample_fraction is not None:
        FLAGS.downsample_fraction = downsample_fraction
      run_options = make_examples.default_options(add_flags=True)
      make_examples_core.make_examples_runner(run_options)
      return self.verify_examples(
          FLAGS.examples, target_region, run_options, verify_labels=False)

    baseline = _run_and_collect()
    downsampled = _run_and_collect(0.01)
    self.assertLess(len(downsampled), len(baseline))
예제 #12
0
  def test_catches_bad_flags(self):
    """An empty confident_regions in training mode is reported and exits."""
    # Every flag below is valid except confident_regions at the end.
    query_region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile('vsc.tfrecord')
    FLAGS.examples = test_utils.test_tmpfile('examples.tfrecord')
    FLAGS.regions = [ranges.to_literal(query_region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = 'training'
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    # Empty on purpose: this is the value main() should reject.
    FLAGS.confident_regions = ''

    # Patch logging.error and sys.exit so the calls can be inspected.
    argv = ['make_examples.py']
    with mock.patch.object(logging, 'error') as patched_error:
      with mock.patch.object(sys, 'exit') as patched_exit:
        make_examples.main(argv)
    patched_error.assert_called_once_with(
        'confident_regions is required when in training mode.')
    patched_exit.assert_called_once_with(errno.ENOENT)
예제 #13
0
  def test_make_examples_end2end_failed_on_mismatched_multi_bam(self):
    """Expects a ValueError when CHR20_BAM is combined with NOCHR20_BAM."""
    target_region = ranges.parse_literal('chr20:10,000,000-10,010,000')

    FLAGS.write_run_info = True
    FLAGS.ref = testdata.CHR20_FASTA
    bam_paths = [testdata.CHR20_BAM, testdata.NOCHR20_BAM]
    FLAGS.reads = ','.join(bam_paths)
    candidates_pattern = _sharded('mismatched_multi_bam.vsc.tfrecord')
    FLAGS.candidates = test_utils.test_tmpfile(candidates_pattern)
    examples_pattern = _sharded('mismatched_multi_bam.examples.tfrecord')
    FLAGS.examples = test_utils.test_tmpfile(examples_pattern)
    FLAGS.regions = [ranges.to_literal(target_region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = 'calling'
    FLAGS.gvcf_gq_binsize = 5
    run_options = make_examples.default_options(add_flags=True)

    # Pattern the raised error message is matched against.
    expected_error = ('NOT_FOUND: Unknown reference_name '
                      'reference_name: "chr20" start: 9999999 end: 10000999')
    with six.assertRaisesRegex(self, ValueError, expected_error):
      make_examples_core.make_examples_runner(run_options)
예제 #14
0
  def process(self, region):
    """Finds candidates and creates corresponding examples in a region.

    Args:
      region: A nucleus.genomics.v1.Range proto. Specifies the region on the
        genome we should process.

    Returns:
      Three values. First is a list of the found candidates, which are
      deepvariant.DeepVariantCall objects. The second value is a list of filled
      in tf.Example protos. For example, these will include the candidate
      variant, the pileup image, and, if in training mode, the truth variants
      and labels needed for training. The third value is a list of
      nucleus.genomics.v1.Variant protos containing gVCF information for all
      reference sites, if gvcf generation is enabled, otherwise returns [].
    """
    region_timer = timer.TimerStart()

    # Set up lazily on first use.
    if not self.initialized:
      self._initialize()

    self.in_memory_sam_reader.replace_reads(self.region_reads(region))
    candidates, gvcfs = self.candidates_in_region(region)

    if in_training_mode(self.options):
      examples = [
          self.add_label_to_example(example, label)
          for candidate, label in self.label_candidates(candidates)
          for example in self.create_pileup_examples(candidate)
      ]
    else:
      examples = [
          example for candidate in candidates
          for example in self.create_pileup_examples(candidate)
      ]

    # The number of examples need not equal the number of candidates
    # (create_pileup_examples yields a sequence per candidate), so report both
    # instead of labelling the example count as "candidates".
    logging.info('Found %s candidates (%s examples) in %s [%0.2fs elapsed]',
                 len(candidates), len(examples), ranges.to_literal(region),
                 region_timer.Stop())
    return candidates, examples, gvcfs
예제 #15
0
 def test_make_examples_training_end2end_with_customized_classes_labeler(self):
   """Runs training-mode make_examples with the customized classes labeler."""
   # Labeler configuration.
   FLAGS.labeler_algorithm = 'customized_classes_labeler'
   FLAGS.customized_classes_labeler_classes_list = 'ref,class1,class2'
   FLAGS.customized_classes_labeler_info_field_name = 'type'
   target_region = ranges.parse_literal('chr20:10,000,000-10,004,000')
   FLAGS.regions = [ranges.to_literal(target_region)]
   FLAGS.ref = testdata.CHR20_FASTA
   FLAGS.reads = testdata.CHR20_BAM
   FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
   FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
   FLAGS.partition_size = 1000
   FLAGS.mode = 'training'
   FLAGS.gvcf_gq_binsize = 5
   FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF_WITH_TYPES
   FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED
   run_options = make_examples.default_options(add_flags=True)
   make_examples_core.make_examples_runner(run_options)

   golden_path = _sharded(
       testdata.CUSTOMIZED_CLASSES_GOLDEN_TRAINING_EXAMPLES)
   # Validate the produced examples, then compare them with the golden ones.
   produced = self.verify_examples(
       FLAGS.examples, target_region, run_options, verify_labels=True)
   golden_examples = list(tfrecord.read_tfrecords(golden_path))
   self.assertDeepVariantExamplesEqual(produced, golden_examples)
예제 #16
0
  def test_make_examples_end2end(self,
                                 mode,
                                 num_shards,
                                 test_condition=TestConditions.USE_BAM,
                                 labeler_algorithm=None,
                                 use_fast_pass_aligner=True):
    """Runs make_examples end to end and compares outputs with golden files.

    Args:
      mode: 'calling' or 'training' (asserted below); assigned to FLAGS.mode.
      num_shards: shard count passed to _sharded for the output and golden
        file patterns. One task is run per shard, or a single task when this
        is 0.
      test_condition: TestConditions member selecting which reads input is
        assigned to FLAGS.reads.
      labeler_algorithm: assigned to FLAGS.labeler_algorithm unless None.
      use_fast_pass_aligner: assigned to FLAGS.use_fast_pass_aligner.
    """
    self.assertIn(mode, {'calling', 'training'})
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.write_run_info = True
    FLAGS.ref = testdata.CHR20_FASTA
    # NOTE(review): a test_condition matching none of the branches below leaves
    # FLAGS.reads at whatever value it already had.
    if test_condition == TestConditions.USE_BAM:
      FLAGS.reads = testdata.CHR20_BAM
    elif test_condition == TestConditions.USE_CRAM:
      FLAGS.reads = testdata.CHR20_CRAM
    elif test_condition == TestConditions.USE_MULTI_BAMS:
      FLAGS.reads = ','.join(
          [testdata.CHR20_BAM_FIRST_HALF, testdata.CHR20_BAM_SECOND_HALF])

    FLAGS.candidates = test_utils.test_tmpfile(
        _sharded('vsc.tfrecord', num_shards))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = mode
    FLAGS.gvcf_gq_binsize = 5
    FLAGS.use_fast_pass_aligner = use_fast_pass_aligner
    if labeler_algorithm is not None:
      FLAGS.labeler_algorithm = labeler_algorithm

    # Calling mode additionally writes gVCF records; training mode needs the
    # truth variants and confident regions instead.
    if mode == 'calling':
      FLAGS.gvcf = test_utils.test_tmpfile(
          _sharded('gvcf.tfrecord', num_shards))
    else:
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
      FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

    # Run every task; max(..., 1) makes sure one task runs when num_shards is 0.
    for task_id in range(max(num_shards, 1)):
      FLAGS.task = task_id
      options = make_examples.default_options(add_flags=True)
      # We need to overwrite bam_fname for USE_CRAM test since Golden Set
      # generated from BAM file. BAM filename is stored in candidates. If we
      # don't overwrite default_options variants won't match and test fail.
      options.bam_fname = 'NA12878_S1.chr20.10_10p1mb.bam'
      make_examples_core.make_examples_runner(options)

      # Check that our run_info proto contains the basic fields we'd expect:
      # (a) our options are written to the run_info.options field.
      run_info = make_examples_core.read_make_examples_run_info(
          options.run_info_filename)
      self.assertEqual(run_info.options, options)
      # (b) run_info.resource_metrics is present and contains our hostname.
      self.assertTrue(run_info.HasField('resource_metrics'))
      self.assertEqual(run_info.resource_metrics.host_name, platform.node())

    # From here on, `options` and `run_info` hold the values produced by the
    # last task of the loop above.

    # Test that our candidates are reasonable, calling specific helper functions
    # to check lots of properties of the output.
    candidates = sorted(
        tfrecord.read_tfrecords(
            FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
        key=lambda c: variant_utils.variant_range_tuple(c.variant))
    self.verify_deepvariant_calls(candidates, options)
    self.verify_variants([call.variant for call in candidates],
                         region,
                         options,
                         is_gvcf=False)

    # Verify that the variants in the examples are all good.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=mode == 'training')
    example_variants = [tf_utils.example_variant(ex) for ex in examples]
    self.verify_variants(example_variants, region, options, is_gvcf=False)

    # Verify the integrity of the examples and then check that they match our
    # golden labeled examples. Note we expect the order for both training and
    # calling modes to produce deterministic order because we fix the random
    # seed.
    if mode == 'calling':
      golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
    else:
      golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
    self.assertDeepVariantExamplesEqual(
        examples, list(tfrecord.read_tfrecords(golden_file)))

    if mode == 'calling':
      # Compare the example variants with the truth variants read from
      # TRUTH_VARIANTS_VCF for the same region.
      nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
      nist_variants = list(nist_reader.query(region))
      self.verify_nist_concordance(example_variants, nist_variants)

      # Check the quality of our generated gvcf file.
      gvcfs = variant_utils.sorted_variants(
          tfrecord.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
      self.verify_variants(gvcfs, region, options, is_gvcf=True)
      self.verify_contiguity(gvcfs, region)
      gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                  num_shards)
      expected_gvcfs = list(
          tfrecord.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
      # Despite the name, assertCountEqual checks that all elements match.
      self.assertCountEqual(gvcfs, expected_gvcfs)

    # Labeling metrics are only compared for the training run with
    # num_shards == 0.
    if (mode == 'training' and num_shards == 0 and
        labeler_algorithm != 'positional_labeler'):
      # The positional labeler doesn't track metrics, so don't try to read them
      # in when that's the mode.
      self.assertEqual(
          make_examples_core.read_make_examples_run_info(
              testdata.GOLDEN_MAKE_EXAMPLES_RUN_INFO).labeling_metrics,
          run_info.labeling_metrics)
예제 #17
0
  def test_make_examples_end2end(self, mode, num_shards,
                                 labeler_algorithm=None):
    """Runs make_examples end to end and compares outputs with golden files.

    Args:
      mode: 'calling' or 'training' (asserted below); assigned to FLAGS.mode.
      num_shards: shard count passed to _sharded for the output and golden
        file patterns. One task is run per shard, or a single task when this
        is 0.
      labeler_algorithm: assigned to FLAGS.labeler_algorithm unless None.
    """
    # Lift unittest's cap on the length of assertion failure diffs.
    self.maxDiff = None
    self.assertIn(mode, {'calling', 'training'})
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile(
        _sharded('vsc.tfrecord', num_shards))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = mode
    FLAGS.gvcf_gq_binsize = 5
    if labeler_algorithm is not None:
      FLAGS.labeler_algorithm = labeler_algorithm

    # Calling mode additionally writes gVCF records; training mode needs the
    # truth variants and confident regions instead.
    if mode == 'calling':
      FLAGS.gvcf = test_utils.test_tmpfile(
          _sharded('gvcf.tfrecord', num_shards))
    else:
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
      FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

    # Run every task; max(..., 1) makes sure one task runs when num_shards is 0.
    for task_id in range(max(num_shards, 1)):
      FLAGS.task = task_id
      options = make_examples.default_options(add_flags=True)
      make_examples.make_examples_runner(options)

    # From here on, `options` holds the value built for the last task above.

    # Test that our candidates are reasonable, calling specific helper functions
    # to check lots of properties of the output.
    candidates = sorted(
        io_utils.read_tfrecords(
            FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
        key=lambda c: variant_utils.variant_range_tuple(c.variant))
    self.verify_deepvariant_calls(candidates, options)
    self.verify_variants(
        [call.variant for call in candidates], region, options, is_gvcf=False)

    # Verify that the variants in the examples are all good.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=mode == 'training')
    example_variants = [tf_utils.example_variant(ex) for ex in examples]
    self.verify_variants(example_variants, region, options, is_gvcf=False)

    # Verify the integrity of the examples and then check that they match our
    # golden labeled examples. Note we expect the order for both training and
    # calling modes to produce deterministic order because we fix the random
    # seed.
    if mode == 'calling':
      golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
    else:
      golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
    self.assertDeepVariantExamplesEqual(
        examples, list(io_utils.read_tfrecords(golden_file)))

    if mode == 'calling':
      # Compare the example variants with the truth variants read from
      # TRUTH_VARIANTS_VCF for the same region.
      nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
      nist_variants = list(nist_reader.query(region))
      self.verify_nist_concordance(example_variants, nist_variants)

      # Check the quality of our generated gvcf file.
      gvcfs = variant_utils.sorted_variants(
          io_utils.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
      self.verify_variants(gvcfs, region, options, is_gvcf=True)
      self.verify_contiguity(gvcfs, region)
      gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                  num_shards)
      expected_gvcfs = list(
          io_utils.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
      # NOTE(review): assertItemsEqual exists only in Python 2's unittest;
      # Python 3 renamed it to assertCountEqual. Confirm which interpreter this
      # test targets before porting.
      self.assertItemsEqual(gvcfs, expected_gvcfs)
예제 #18
0
 def test_to_literal(self):
   """The range ('chr1', 0, 20) is rendered as the literal 'chr1:1-20'."""
   chr1_range = ranges.make_range('chr1', 0, 20)
   rendered = ranges.to_literal(chr1_range)
   self.assertEqual(rendered, 'chr1:1-20')
예제 #19
0
 def __str__(self):
   """Summarizes the region, read span, haplotype count and read count."""
   template = ('AssemblyRegion(region={}, span={}) with {} haplotypes and {} '
               'reads')
   region_literal = ranges.to_literal(self.region)
   span_literal = ranges.to_literal(self.read_span)
   return template.format(region_literal, span_literal, len(self.haplotypes),
                          len(self.reads))
예제 #20
0
 def _file_for_region(self, region, basename):
   """Returns the path to a file in a region-specific subdirectory."""
   assert self.enabled, 'only callable when diagnostics are on'
   return self._root_join(os.path.join(ranges.to_literal(region), basename))