Exemplo n.º 1
0
  def test_make_examples_training_end2end_with_alt_aligned_pileup(
      self, alt_align, expected_shape):
    """Checks training examples produced with an alt-aligned pileup setting.

    Args:
      alt_align: Value stored in FLAGS.alt_aligned_pileup. Golden data exists
        only for 'rows' and 'diff_channels'.
      expected_shape: Expected 'image/shape' of the first decoded example.

    Raises:
      ValueError: If `alt_align` has no associated golden file.
    """
    query_region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.regions = [ranges.to_literal(query_region)]
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
    FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
    FLAGS.partition_size = 1000
    FLAGS.mode = 'training'
    FLAGS.gvcf_gq_binsize = 5
    # The pileup layout is the only input that varies between test cases.
    FLAGS.alt_aligned_pileup = alt_align
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

    # Build the options from the flags above and generate the examples.
    run_options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(run_options)

    # Pick the golden file that corresponds to the requested layout.
    if alt_align not in ('rows', 'diff_channels'):
      raise ValueError(
          "Golden data doesn't exist for this alt_align option: {}".format(
              alt_align))
    golden_name = (
        testdata.ALT_ALIGNED_ROWS_EXAMPLES
        if alt_align == 'rows' else
        testdata.ALT_ALIGNED_DIFF_CHANNELS_EXAMPLES)
    golden_path = _sharded(golden_name)

    # Validate the generated examples (including labels), then compare them
    # with the golden records.
    actual_examples = self.verify_examples(
        FLAGS.examples, query_region, run_options, verify_labels=True)
    golden_examples = list(tfrecord.read_tfrecords(golden_path))
    self.assertDeepVariantExamplesEqual(actual_examples, golden_examples)

    # The expected image shape is supplied per alt_align case by the caller.
    first_shape = decode_example(actual_examples[0])['image/shape']
    self.assertEqual(first_shape, expected_shape)
Exemplo n.º 2
0
  def test_make_examples_runtime_by_region(self):
    """Checks the runtime-by-region profile written by a calling-mode run.

    Runs make_examples as task 2 of 4 shards with FLAGS.runtime_by_region set,
    then verifies the header, the number of data rows and the count columns of
    the first data row in that task's tab-separated output file.
    """
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.mode = 'calling'
    num_shards = 4
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    # Use same number of shards for profiling files as examples.
    output_prefix = test_utils.test_tmpfile('runtime_profile')
    FLAGS.runtime_by_region = '{}@{}'.format(output_prefix, num_shards)
    FLAGS.task = 2
    # Run make_examples with those FLAGS.
    options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(options)

    # Sharded output ending in @4 becomes -00002-of-00004 for task 2. The
    # suffix is derived from the task and shard count so that it cannot drift
    # out of sync with the values configured above.
    expected_output_path = '{}-{:05d}-of-{:05d}'.format(
        output_prefix, FLAGS.task, num_shards)
    expected_columns = [
        'region', 'get reads', 'find candidates', 'make pileup images',
        'write outputs', 'num reads', 'num candidates', 'num examples'
    ]

    with gfile.Open(expected_output_path, 'r') as fin:
      column_names = fin.readline().strip().split('\t')
      non_header_lines = fin.readlines()

    self.assertEqual(expected_columns, column_names)
    self.assertLen(non_header_lines, 3)
    one_row = non_header_lines[0].strip().split('\t')
    self.assertLen(one_row, len(column_names))

    # Look the count columns up by name instead of by positional index.
    row_by_column = dict(zip(column_names, one_row))
    for count_column in ('num reads', 'num candidates', 'num examples'):
      self.assertGreater(
          int(row_by_column[count_column]),
          0,
          msg='{} > 0'.format(count_column))
Exemplo n.º 3
0
  def test_make_examples_end2end_vcf_candidate_importer(self, mode):
    """Compares vcf_candidate_importer output with the golden examples.

    Args:
      mode: Value stored in FLAGS.mode. 'calling' uses the proposed-variants
        VCF with a fixed region and realignment disabled; any other value
        takes the training branch and sets FLAGS.truth_variants.
    """
    FLAGS.variant_caller = 'vcf_candidate_importer'
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    candidates_name = 'vcf_candidate_importer.{}.tfrecord'.format(mode)
    FLAGS.candidates = test_utils.test_tmpfile(_sharded(candidates_name))
    examples_name = 'vcf_candidate_importer.examples.{}.tfrecord'.format(mode)
    FLAGS.examples = test_utils.test_tmpfile(_sharded(examples_name))
    FLAGS.mode = mode

    if mode != 'calling':
      golden_path = _sharded(
          testdata.GOLDEN_VCF_CANDIDATE_IMPORTER_TRAINING_EXAMPLES)
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    else:
      golden_path = _sharded(
          testdata.GOLDEN_VCF_CANDIDATE_IMPORTER_CALLING_EXAMPLES)
      FLAGS.proposed_variants = testdata.VCF_CANDIDATE_IMPORTER_VARIANTS
      # These two flags mirror how the calling-mode test data was generated.
      FLAGS.regions = 'chr20:59,777,000-60,000,000'
      FLAGS.realign_reads = False

    run_options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(run_options)

    # Labels are only verified for the training run.
    check_labels = mode == 'training'
    actual_examples = self.verify_examples(
        FLAGS.examples, None, run_options, verify_labels=check_labels)
    golden_examples = list(tfrecord.read_tfrecords(golden_path))
    self.assertDeepVariantExamplesEqual(actual_examples, golden_examples)

    first_shape = decode_example(actual_examples[0])['image/shape']
    self.assertEqual(first_shape,
                     [100, 221, dv_constants.PILEUP_NUM_CHANNELS])
Exemplo n.º 4
0
    def _get_examples(use_confident_regions=False):
      """Builds training examples constrained by a single-interval BED file.

      The same BED file is supplied through either FLAGS.confident_regions or
      FLAGS.regions, and the other flag is set to the empty string.

      Args:
        use_confident_regions: If True, the BED path goes to
          FLAGS.confident_regions; otherwise it goes to FLAGS.regions.

      Returns:
        The value returned by self.verify_examples for the generated examples.
      """
      interval_bed = test_utils.test_tmpfile('vcf_candidate_importer.bed')
      with gfile.Open(interval_bed, 'w') as bed_out:
        bed_out.write('chr20\t10000000\t10001000\n')

      # Exactly one of the two flags carries the BED path.
      FLAGS.confident_regions = interval_bed if use_confident_regions else ''
      FLAGS.regions = '' if use_confident_regions else interval_bed

      FLAGS.examples = test_utils.test_tmpfile(
          _sharded('vcf_candidate_importer.tfrecord'))
      FLAGS.mode = 'training'
      FLAGS.reads = testdata.CHR20_BAM
      FLAGS.ref = testdata.CHR20_FASTA
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
      FLAGS.variant_caller = 'vcf_candidate_importer'

      run_options = make_examples.default_options(add_flags=True)
      make_examples_core.make_examples_runner(run_options)
      # Check the generated examples (labels are not verified) and return them.
      return self.verify_examples(
          FLAGS.examples, None, run_options, verify_labels=False)
Exemplo n.º 5
0
 def _get_examples(downsample_fraction=None):
   """Runs make_examples and returns the verified examples.

   `self` and `region` are taken from the enclosing scope.

   Args:
     downsample_fraction: If not None, stored in FLAGS.downsample_fraction
       before the run; otherwise the flag is left as it is.

   Returns:
     The value returned by self.verify_examples for the generated examples.
   """
   if downsample_fraction is not None:
     FLAGS.downsample_fraction = downsample_fraction
   run_options = make_examples.default_options(add_flags=True)
   make_examples_core.make_examples_runner(run_options)
   return self.verify_examples(
       FLAGS.examples, region, run_options, verify_labels=False)
Exemplo n.º 6
0
  def test_make_examples_with_allele_frequency(self, mode):
    """Checks calling-mode examples built with use_allele_frequency enabled.

    Args:
      mode: Parameterization label, 'one vcf' or 'two vcfs', selecting how
        FLAGS.population_vcfs is populated. This is not FLAGS.mode, which is
        always 'calling' in this test.

    Raises:
      ValueError: If `mode` is neither of the two supported labels.
    """
    FLAGS.mode = 'calling'
    FLAGS.ref = testdata.GRCH38_FASTA
    FLAGS.reads = testdata.GRCH38_CHR20_AND_21_BAM
    shard_count = 1
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', shard_count))
    target_region = ranges.parse_literal('chr20:61001-62000')
    FLAGS.use_allele_frequency = True
    FLAGS.regions = [ranges.to_literal(target_region)]
    if mode not in ('one vcf', 'two vcfs'):
      raise ValueError('Invalid mode for parameterized test.')
    if mode == 'one vcf':
      FLAGS.population_vcfs = testdata.AF_VCF_CHR20_AND_21
    else:
      FLAGS.population_vcfs = ' '.join(
          [testdata.AF_VCF_CHR20, testdata.AF_VCF_CHR21])

    # Generate the examples from the flags configured above.
    run_options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(run_options)

    actual_examples = self.verify_examples(
        FLAGS.examples, target_region, run_options, verify_labels=False)

    # The image is expected to have one channel more than the default count.
    self.assertEqual([100, 221, dv_constants.PILEUP_NUM_CHANNELS + 1],
                     decode_example(actual_examples[0])['image/shape'])

    # Each of these loci must show up in the examples with a non-empty extra
    # channel; the flag records whether the locus has been observed.
    seen_loci = {
        'chr20:61539_A': False,
        'chr20:61634_G': False,
        'chr20:61644_G': False
    }
    extra_channel_index = dv_constants.PILEUP_NUM_CHANNELS

    for example in actual_examples:
      locus = vis.locus_id_from_variant(vis.variant_from_example(example))
      if locus not in seen_loci:
        continue
      example_channels = vis.channels_from_example(example)
      failure_message = (
          'There should be something in the %s-th channel for variant %s' %
          (dv_constants.PILEUP_NUM_CHANNELS + 1, locus))
      self.assertGreater(
          np.sum(example_channels[extra_channel_index]),
          0,
          msg=failure_message)
      seen_loci[locus] = True

    self.assertTrue(
        all(seen_loci.values()),
        msg='Check that all 3 sample loci appeared in the examples.')

    # Both parameterizations are compared against the same golden file.
    golden_path = _sharded(testdata.GOLDEN_ALLELE_FREQUENCY_EXAMPLES)
    golden_examples = list(tfrecord.read_tfrecords(golden_path))
    self.assertDeepVariantExamplesEqual(golden_examples, actual_examples)
Exemplo n.º 7
0
  def test_make_examples_end2end_failed_on_cram(self):
    """Expects a ValueError for CRAM input with use_ref_for_cram set to False."""
    cram_region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    expected_error = 'Failed to parse BAM/CRAM file.'

    FLAGS.use_ref_for_cram = False
    FLAGS.write_run_info = True
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_CRAM
    FLAGS.candidates = test_utils.test_tmpfile(_sharded('failed.vsc.tfrecord'))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('failed.examples.tfrecord'))
    FLAGS.regions = [ranges.to_literal(cram_region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = 'calling'
    FLAGS.gvcf_gq_binsize = 5

    run_options = make_examples.default_options(add_flags=True)
    # Only the runner call is expected to raise.
    with six.assertRaisesRegex(self, ValueError, expected_error):
      make_examples_core.make_examples_runner(run_options)
Exemplo n.º 8
0
def main(argv=()):
    """Validates the command line, builds the options and runs make_examples.

    Args:
        argv: Command-line arguments. More than one element means positional
            arguments were supplied, which is reported through
            errors.log_and_raise with errors.CommandLineError.
    """
    with errors.clean_commandline_error_exit():
        has_positional_args = len(argv) > 1
        if has_positional_args:
            message = (
                'Command line parsing failure: make_examples does not accept '
                'positional arguments but some are present on the command line: '
                '"{}".'
            ).format(str(argv))
            errors.log_and_raise(message, errors.CommandLineError)
        del argv  # Unused.

        proto_utils.uses_fast_cpp_protos_or_die()

        # Configure logging verbosity from the flags.
        logging_level.set_from_flag()
        hts_log_level = hts_verbose.htsLogLevel[FLAGS.hts_logging_level]
        hts_verbose.set(hts_log_level)

        # Set up options; may do I/O.
        run_options = default_options(add_flags=True, flags_obj=FLAGS)
        check_options_are_valid(run_options)

        # Hand the validated options to the runner.
        make_examples_core.make_examples_runner(run_options)
Exemplo n.º 9
0
  def test_make_examples_with_variant_selection(self,
                                                select_types,
                                                expected_count,
                                                keep_legacy_behavior=False):
    """Checks how many candidates are written under variant-type selection.

    Args:
      select_types: Value for FLAGS.select_variant_types; the flag is left
        untouched when this is None.
      expected_count: Expected number of records in the candidates output.
      keep_legacy_behavior: Value stored in
        FLAGS.keep_legacy_allele_counter_behavior.
    """
    if select_types is not None:
      FLAGS.select_variant_types = select_types
    query_region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.regions = [ranges.to_literal(query_region)]
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
    FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
    FLAGS.partition_size = 1000
    FLAGS.mode = 'calling'
    FLAGS.keep_legacy_allele_counter_behavior = keep_legacy_behavior

    run_options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(run_options)

    # Only the number of candidate records is checked.
    candidate_records = list(tfrecord.read_tfrecords(FLAGS.candidates))
    self.assertLen(candidate_records, expected_count)
Exemplo n.º 10
0
  def test_make_examples_end2end_failed_on_mismatched_multi_bam(self):
    """Expects a ValueError when two comma-joined BAM inputs do not match."""
    query_region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    # Pattern the raised error message is expected to match.
    expected_error = (
        'NOT_FOUND: Unknown reference_name '
        'reference_name: "chr20" start: 9999999 end: 10000999')

    FLAGS.write_run_info = True
    FLAGS.ref = testdata.CHR20_FASTA
    bam_inputs = [testdata.CHR20_BAM, testdata.NOCHR20_BAM]
    FLAGS.reads = ','.join(bam_inputs)
    FLAGS.candidates = test_utils.test_tmpfile(
        _sharded('mismatched_multi_bam.vsc.tfrecord'))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('mismatched_multi_bam.examples.tfrecord'))
    FLAGS.regions = [ranges.to_literal(query_region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = 'calling'
    FLAGS.gvcf_gq_binsize = 5

    run_options = make_examples.default_options(add_flags=True)
    # Only the runner call is expected to raise.
    with six.assertRaisesRegex(self, ValueError, expected_error):
      make_examples_core.make_examples_runner(run_options)
Exemplo n.º 11
0
 def test_make_examples_training_end2end_with_customized_classes_labeler(self):
   """Compares customized_classes_labeler training output with golden data."""
   # Labeler configuration.
   FLAGS.labeler_algorithm = 'customized_classes_labeler'
   FLAGS.customized_classes_labeler_classes_list = 'ref,class1,class2'
   FLAGS.customized_classes_labeler_info_field_name = 'type'

   # Inputs, outputs and run settings.
   query_region = ranges.parse_literal('chr20:10,000,000-10,004,000')
   FLAGS.regions = [ranges.to_literal(query_region)]
   FLAGS.ref = testdata.CHR20_FASTA
   FLAGS.reads = testdata.CHR20_BAM
   FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
   FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
   FLAGS.partition_size = 1000
   FLAGS.mode = 'training'
   FLAGS.gvcf_gq_binsize = 5
   FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF_WITH_TYPES
   FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

   run_options = make_examples.default_options(add_flags=True)
   make_examples_core.make_examples_runner(run_options)
   golden_path = _sharded(testdata.CUSTOMIZED_CLASSES_GOLDEN_TRAINING_EXAMPLES)

   # Validate the generated examples (including labels), then compare them
   # with the golden records.
   actual_examples = self.verify_examples(
       FLAGS.examples, query_region, run_options, verify_labels=True)
   golden_examples = list(tfrecord.read_tfrecords(golden_path))
   self.assertDeepVariantExamplesEqual(actual_examples, golden_examples)
Exemplo n.º 12
0
  def test_make_examples_end2end(self,
                                 mode,
                                 num_shards,
                                 test_condition=TestConditions.USE_BAM,
                                 labeler_algorithm=None,
                                 use_fast_pass_aligner=True):
    """Runs make_examples end to end and checks its outputs against goldens.

    Args:
      mode: Either 'calling' or 'training'; stored in FLAGS.mode.
      num_shards: Shard count used for the sharded output paths. make_examples
        is run once per task id in range(max(num_shards, 1)).
      test_condition: TestConditions value selecting the reads input: a BAM, a
        CRAM, or two BAM files joined with a comma.
      labeler_algorithm: If not None, stored in FLAGS.labeler_algorithm.
      use_fast_pass_aligner: Stored in FLAGS.use_fast_pass_aligner.
    """
    self.assertIn(mode, {'calling', 'training'})
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.write_run_info = True
    FLAGS.ref = testdata.CHR20_FASTA
    # NOTE: a test_condition other than the three handled below leaves
    # FLAGS.reads unchanged.
    if test_condition == TestConditions.USE_BAM:
      FLAGS.reads = testdata.CHR20_BAM
    elif test_condition == TestConditions.USE_CRAM:
      FLAGS.reads = testdata.CHR20_CRAM
    elif test_condition == TestConditions.USE_MULTI_BAMS:
      FLAGS.reads = ','.join(
          [testdata.CHR20_BAM_FIRST_HALF, testdata.CHR20_BAM_SECOND_HALF])

    FLAGS.candidates = test_utils.test_tmpfile(
        _sharded('vsc.tfrecord', num_shards))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = mode
    FLAGS.gvcf_gq_binsize = 5
    FLAGS.use_fast_pass_aligner = use_fast_pass_aligner
    if labeler_algorithm is not None:
      FLAGS.labeler_algorithm = labeler_algorithm

    # Calling mode additionally writes a gVCF output; training mode needs the
    # truth variants and confident regions instead.
    if mode == 'calling':
      FLAGS.gvcf = test_utils.test_tmpfile(
          _sharded('gvcf.tfrecord', num_shards))
    else:
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
      FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

    # Run one task per shard; max(..., 1) guarantees at least one run, so
    # `options` and `run_info` are always bound after the loop (to the values
    # of the last task).
    for task_id in range(max(num_shards, 1)):
      FLAGS.task = task_id
      options = make_examples.default_options(add_flags=True)
      # We need to overwrite bam_fname for USE_CRAM test since Golden Set
      # generated from BAM file. BAM filename is stored in candidates. If we
      # don't overwrite default_options variants won't match and test fail.
      options.bam_fname = 'NA12878_S1.chr20.10_10p1mb.bam'
      make_examples_core.make_examples_runner(options)

      # Check that our run_info proto contains the basic fields we'd expect:
      # (a) our options are written to the run_info.options field.
      run_info = make_examples_core.read_make_examples_run_info(
          options.run_info_filename)
      self.assertEqual(run_info.options, options)
      # (b) run_info.resource_metrics is present and contains our hostname.
      self.assertTrue(run_info.HasField('resource_metrics'))
      self.assertEqual(run_info.resource_metrics.host_name, platform.node())

    # Test that our candidates are reasonable, calling specific helper functions
    # to check lots of properties of the output.
    candidates = sorted(
        tfrecord.read_tfrecords(
            FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
        key=lambda c: variant_utils.variant_range_tuple(c.variant))
    self.verify_deepvariant_calls(candidates, options)
    self.verify_variants([call.variant for call in candidates],
                         region,
                         options,
                         is_gvcf=False)

    # Verify that the variants in the examples are all good.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=mode == 'training')
    example_variants = [tf_utils.example_variant(ex) for ex in examples]
    self.verify_variants(example_variants, region, options, is_gvcf=False)

    # Verify the integrity of the examples and then check that they match our
    # golden labeled examples. Note we expect the order for both training and
    # calling modes to produce deterministic order because we fix the random
    # seed.
    if mode == 'calling':
      golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
    else:
      golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
    self.assertDeepVariantExamplesEqual(
        examples, list(tfrecord.read_tfrecords(golden_file)))

    if mode == 'calling':
      # Compare the example variants with the truth variants in the region.
      nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
      nist_variants = list(nist_reader.query(region))
      self.verify_nist_concordance(example_variants, nist_variants)

      # Check the quality of our generated gvcf file.
      gvcfs = variant_utils.sorted_variants(
          tfrecord.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
      self.verify_variants(gvcfs, region, options, is_gvcf=True)
      self.verify_contiguity(gvcfs, region)
      gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                  num_shards)
      expected_gvcfs = list(
          tfrecord.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
      # Despite the name, assertCountEqual checks that all elements match.
      self.assertCountEqual(gvcfs, expected_gvcfs)

    # Labeling metrics are only compared for unsharded training runs; here
    # `run_info` is the one read for the last (only) task in the loop above.
    if (mode == 'training' and num_shards == 0 and
        labeler_algorithm != 'positional_labeler'):
      # The positional labeler doesn't track metrics, so don't try to read them
      # in when that's the mode.
      self.assertEqual(
          make_examples_core.read_make_examples_run_info(
              testdata.GOLDEN_MAKE_EXAMPLES_RUN_INFO).labeling_metrics,
          run_info.labeling_metrics)