Пример #1
0
    def test_flags_strictly_needs_sam_aux_fields(
            self, flags_strictly_needs_sam_aux_fields):
        """Checks that the given flag cannot be on while aux parsing is off.

        Args:
          flags_strictly_needs_sam_aux_fields: Name of the flag under test; its
            value is set to True while --parse_sam_aux_fields is set to False.
        """
        dependent_flag = flags_strictly_needs_sam_aux_fields
        FLAGS.mode = 'calling'
        FLAGS.ref = testdata.CHR20_FASTA
        FLAGS.reads = testdata.CHR20_BAM
        FLAGS.examples = 'examples.tfrecord'
        # Switch the dependent flag on but leave aux-field parsing disabled.
        FLAGS[dependent_flag].value = True
        FLAGS.parse_sam_aux_fields = False

        expected_error = (
            'If --{} is set then --parse_sam_aux_fields must be set too.'
        ).format(dependent_flag)
        with six.assertRaisesRegex(self, Exception, expected_error):
            make_examples.default_options(add_flags=True)
Пример #2
0
  def test_make_examples_runtime_by_region(self):
    """Runs one shard of make_examples and checks its runtime-by-region TSV.

    Task 2 of a 4-shard run is executed over chr20:10,000,000-10,010,000. The
    profile written for that task must start with the expected header, hold
    three data rows, and report positive counts in the first row.
    """
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.mode = 'calling'
    num_shards = 4
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    # Use same number of shards for profiling files as examples.
    output_prefix = test_utils.test_tmpfile('runtime_profile')
    FLAGS.runtime_by_region = output_prefix + '@{}'.format(num_shards)
    FLAGS.task = 2
    # Run make_examples with those FLAGS.
    options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(options)
    # Sharded output ending in @4 becomes -00002-of-00004 for task 2. Build the
    # name from the task and shard count so it tracks the values set above.
    expected_output_path = '{}-{:05d}-of-{:05d}'.format(
        output_prefix, FLAGS.task, num_shards)
    expected_columns = [
        'region', 'get reads', 'find candidates', 'make pileup images',
        'write outputs', 'num reads', 'num candidates', 'num examples'
    ]

    with gfile.Open(expected_output_path, 'r') as fin:
      header = fin.readline()
      column_names = header.strip().split('\t')
      self.assertEqual(expected_columns, column_names)
      non_header_lines = fin.readlines()
      self.assertLen(non_header_lines, 3)
      one_row = non_header_lines[0].strip().split('\t')
      self.assertLen(one_row, len(column_names))
      # Look the count columns up by name rather than by hard-coded position.
      for count_column in ('num reads', 'num candidates', 'num examples'):
        self.assertGreater(
            int(one_row[expected_columns.index(count_column)]),
            0,
            msg='{} > 0'.format(count_column))
Пример #3
0
  def test_make_examples_training_end2end_with_alt_aligned_pileup(
      self, alt_align, expected_shape):
    """Runs training-mode make_examples with an alt-aligned pileup setting.

    Args:
      alt_align: Value for FLAGS.alt_aligned_pileup; only 'rows' and
        'diff_channels' have golden data.
      expected_shape: Expected 'image/shape' of the first generated example.

    Raises:
      ValueError: If there is no golden file for `alt_align`.
    """
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
    FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
    FLAGS.partition_size = 1000
    FLAGS.mode = 'training'
    FLAGS.gvcf_gq_binsize = 5
    # The pileup layout is the only input that varies between parameterizations.
    FLAGS.alt_aligned_pileup = alt_align
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED
    run_options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(run_options)

    # Pick the golden data that matches the requested layout.
    if alt_align == 'rows':
      golden_path = testdata.ALT_ALIGNED_ROWS_EXAMPLES
    elif alt_align == 'diff_channels':
      golden_path = testdata.ALT_ALIGNED_DIFF_CHANNELS_EXAMPLES
    else:
      raise ValueError("Golden data doesn't exist for this alt_align option: "
                       '{}'.format(alt_align))
    golden_file = _sharded(golden_path)

    # The generated examples must be valid and identical to the golden ones.
    generated = self.verify_examples(
        FLAGS.examples, region, run_options, verify_labels=True)
    golden = list(tfrecord.read_tfrecords(golden_file))
    self.assertDeepVariantExamplesEqual(generated, golden)
    # The image shape depends on the layout, so it comes from the parameters.
    first_shape = decode_example(generated[0])['image/shape']
    self.assertEqual(first_shape, expected_shape)
Пример #4
0
    def _get_examples(use_confident_regions=False):
      """Runs make_examples with a one-interval BED file limiting the run.

      The same BED file is passed either through --confident_regions or
      through --regions; the other flag is cleared.

      Args:
        use_confident_regions: If True the BED path goes to
          FLAGS.confident_regions, otherwise to FLAGS.regions.

      Returns:
        The examples returned by self.verify_examples for this run.
      """
      bed_path = test_utils.test_tmpfile('vcf_candidate_importer.bed')
      bed_line = '\t'.join(['chr20', '10000000', '10001000']) + '\n'
      with gfile.Open(bed_path, 'w') as fout:
        fout.write(bed_line)
      # Exactly one of the two flags carries the BED path; the other is blank.
      FLAGS.confident_regions = bed_path if use_confident_regions else ''
      FLAGS.regions = '' if use_confident_regions else bed_path

      FLAGS.examples = test_utils.test_tmpfile(
          _sharded('vcf_candidate_importer.tfrecord'))
      FLAGS.mode = 'training'
      FLAGS.reads = testdata.CHR20_BAM
      FLAGS.ref = testdata.CHR20_FASTA
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
      FLAGS.variant_caller = 'vcf_candidate_importer'

      run_options = make_examples.default_options(add_flags=True)
      make_examples_core.make_examples_runner(run_options)
      # Verify that the variants in the examples are all good.
      return self.verify_examples(
          FLAGS.examples, None, run_options, verify_labels=False)
Пример #5
0
  def test_make_examples_end2end_vcf_candidate_importer(self, mode):
    """Runs make_examples end to end with the vcf_candidate_importer caller.

    Args:
      mode: Value for FLAGS.mode. 'calling' uses proposed variants and the
        calling golden file; any other value uses truth variants and the
        training golden file.
    """
    FLAGS.variant_caller = 'vcf_candidate_importer'
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    candidates_name = 'vcf_candidate_importer.{}.tfrecord'.format(mode)
    FLAGS.candidates = test_utils.test_tmpfile(_sharded(candidates_name))
    examples_name = 'vcf_candidate_importer.examples.{}.tfrecord'.format(mode)
    FLAGS.examples = test_utils.test_tmpfile(_sharded(examples_name))
    FLAGS.mode = mode

    if mode == 'calling':
      golden_file = _sharded(
          testdata.GOLDEN_VCF_CANDIDATE_IMPORTER_CALLING_EXAMPLES)
      FLAGS.proposed_variants = testdata.VCF_CANDIDATE_IMPORTER_VARIANTS
      # These flags mirror the settings used when the testdata was created.
      FLAGS.regions = 'chr20:59,777,000-60,000,000'
      FLAGS.realign_reads = False
    else:
      golden_file = _sharded(
          testdata.GOLDEN_VCF_CANDIDATE_IMPORTER_TRAINING_EXAMPLES)
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF

    run_options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(run_options)
    # Labels are only verified when running in training mode.
    check_labels = mode == 'training'
    generated = self.verify_examples(
        FLAGS.examples, None, run_options, verify_labels=check_labels)
    golden = list(tfrecord.read_tfrecords(golden_file))
    self.assertDeepVariantExamplesEqual(generated, golden)
    expected_shape = [100, 221, dv_constants.PILEUP_NUM_CHANNELS]
    self.assertEqual(
        decode_example(generated[0])['image/shape'], expected_shape)
 def test_gvcf_output_enabled_is_false_without_gvcf_flag(self):
     """gvcf_output_enabled() is False when --gvcf is left empty."""
     FLAGS.mode = 'training'
     # Every path-like flag is blank; in particular no gVCF path is given.
     for blank_flag in ('gvcf', 'reads', 'ref', 'examples'):
         setattr(FLAGS, blank_flag, '')
     opts = make_examples.default_options(add_flags=True)
     gvcf_enabled = make_examples.gvcf_output_enabled(opts)
     self.assertFalse(gvcf_enabled)
Пример #7
0
 def _get_examples(downsample_fraction=None):
   """Runs make_examples and returns the verified examples.

   Args:
     downsample_fraction: Optional value for FLAGS.downsample_fraction; the
       flag is left untouched when this is None.

   Returns:
     The result of self.verify_examples on FLAGS.examples for the enclosing
     scope's `region`, with label verification disabled.
   """
   override_fraction = downsample_fraction is not None
   if override_fraction:
     FLAGS.downsample_fraction = downsample_fraction
   run_options = make_examples.default_options(add_flags=True)
   make_examples_core.make_examples_runner(run_options)
   return self.verify_examples(
       FLAGS.examples, region, run_options, verify_labels=False)
Пример #8
0
 def test_gvcf_output_enabled_is_true_with_gvcf_flag(self):
     """gvcf_output_enabled() is True once --gvcf names an output path."""
     FLAGS.mode = 'training'
     FLAGS.gvcf = '/tmp/foo.vcf'
     # The remaining path flags are blank for this check.
     for blank_flag in ('reads', 'ref', 'examples'):
         setattr(FLAGS, blank_flag, '')
     opts = make_examples.default_options(add_flags=True)
     gvcf_enabled = make_examples_core.gvcf_output_enabled(opts)
     self.assertTrue(gvcf_enabled)
Пример #9
0
 def test_gvcf_output_enabled_is_true_with_gvcf_flag(self):
   """gvcf_output_enabled() is True once --gvcf names an output path."""
   flag_values = (
       ('mode', 'training'),
       ('gvcf', '/tmp/foo.vcf'),
       ('reads', ''),
       ('ref', ''),
       ('examples', ''),
   )
   # Applied in the listed order, one flag at a time.
   for flag_name, flag_value in flag_values:
     setattr(FLAGS, flag_name, flag_value)
   opts = make_examples.default_options(add_flags=True)
   self.assertTrue(make_examples.gvcf_output_enabled(opts))
Пример #10
0
  def test_make_examples_with_allele_frequency(self, mode):
    """Checks that --use_allele_frequency adds one extra pileup channel.

    Args:
      mode: Either 'one vcf' (FLAGS.population_vcfs is a single path) or
        'two vcfs' (two paths joined with a space).

    Raises:
      ValueError: If `mode` is not one of the two supported values.
    """
    FLAGS.mode = 'calling'
    FLAGS.ref = testdata.GRCH38_FASTA
    FLAGS.reads = testdata.GRCH38_CHR20_AND_21_BAM
    num_shards = 1
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    region = ranges.parse_literal('chr20:61001-62000')
    FLAGS.use_allele_frequency = True
    FLAGS.regions = [ranges.to_literal(region)]
    if mode == 'one vcf':
      FLAGS.population_vcfs = testdata.AF_VCF_CHR20_AND_21
    elif mode == 'two vcfs':
      FLAGS.population_vcfs = ' '.join(
          [testdata.AF_VCF_CHR20, testdata.AF_VCF_CHR21])
    else:
      raise ValueError('Invalid mode for parameterized test.')
    options = make_examples.default_options(add_flags=True)
    # Run make_examples with the flags above.
    make_examples_core.make_examples_runner(options)

    # Verify that the variants in the examples are all good.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=False)

    # Pileup images should have one extra channel.
    self.assertEqual([100, 221, dv_constants.PILEUP_NUM_CHANNELS + 1],
                     decode_example(examples[0])['image/shape'])

    # Test there is something in the added channel.
    # Values record whether each locus has been seen in the observed examples.
    population_matched_loci = {
        'chr20:61539_A': False,
        'chr20:61634_G': False,
        'chr20:61644_G': False
    }

    for example in examples:
      locus_id = vis.locus_id_from_variant(vis.variant_from_example(example))
      # Membership is tested on the dict itself; no need to build a keys view.
      if locus_id in population_matched_loci:
        channels = vis.channels_from_example(example)
        self.assertGreater(
            np.sum(channels[dv_constants.PILEUP_NUM_CHANNELS]),
            0,
            msg='There should be '
            'something in the %s-th channel for variant '
            '%s' % (dv_constants.PILEUP_NUM_CHANNELS + 1, locus_id))
        population_matched_loci[locus_id] = True
    self.assertTrue(
        all(population_matched_loci.values()),
        msg='Check that all '
        '3 sample loci appeared in the examples.')

    # Check against the golden file (same for both modes).
    golden_file = _sharded(testdata.GOLDEN_ALLELE_FREQUENCY_EXAMPLES)
    examples_from_golden = list(tfrecord.read_tfrecords(golden_file))
    self.assertDeepVariantExamplesEqual(examples_from_golden, examples)
Пример #11
0
 def test_min_base_quality(self):
   """--min_base_quality is copied into the pileup read requirements."""
   requested_quality = 5
   FLAGS.min_base_quality = requested_quality
   FLAGS.ref = testdata.CHR20_FASTA
   FLAGS.reads = testdata.CHR20_BAM
   FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
   FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED
   FLAGS.mode = 'training'
   FLAGS.examples = ''
   opts = make_examples.default_options(add_flags=True)
   read_requirements = opts.pic_options.read_requirements
   self.assertEqual(read_requirements.min_base_quality, requested_quality)
Пример #12
0
    def test_flag_optionally_needs_sam_aux_fields_with_different_parse_sam_aux_fields(
            self, flag_optionally_needs_sam_aux_fields, parse_sam_aux_fields,
            expected_message):
        """Checks what is logged about --parse_sam_aux_fields for a flag.

        Args:
          flag_optionally_needs_sam_aux_fields: Name of the flag under test;
            its value is set to True.
          parse_sam_aux_fields: Value assigned to FLAGS.parse_sam_aux_fields.
          expected_message: Format string (taking the flag name) used as the
            regex the first matching log line must satisfy.
        """
        flag_name = flag_optionally_needs_sam_aux_fields
        FLAGS.mode = 'calling'
        FLAGS.ref = testdata.CHR20_FASTA
        FLAGS.reads = testdata.CHR20_BAM
        FLAGS.examples = 'examples.tfrecord'
        FLAGS[flag_name].value = True
        FLAGS.parse_sam_aux_fields = parse_sam_aux_fields

        with self.assertLogs() as logs:
            make_examples.default_options(add_flags=True)
        # Keep only the log lines that talk about the aux-fields flag.
        relevant_lines = [
            line for line in logs.output if '--parse_sam_aux_fields' in line
        ]
        if not relevant_lines:
            # NOTE(review): this branch cannot fail, so a missing log line is
            # accepted whatever `expected_message` is - confirm that is
            # intended.
            self.assertEmpty(relevant_lines)
            return
        self.assertRegex(relevant_lines[0],
                         expected_message.format(flag_name))
Пример #13
0
    def test_incorrect_empty_regions(self):
        """A region on a wrongly named contig makes region processing fail."""
        FLAGS.mode = 'calling'
        FLAGS.ref = testdata.CHR20_FASTA
        FLAGS.reads = testdata.CHR20_BAM
        # '20' is deliberately not the contig name used elsewhere ('chr20').
        FLAGS.regions = '20:10,000,000-11,000,000'
        FLAGS.examples = 'examples.tfrecord'

        opts = make_examples.default_options(add_flags=True)
        expected_error = 'The regions to call is empty.'
        with six.assertRaisesRegex(self, ValueError, expected_error):
            make_examples_core.processing_regions_from_options(opts)
Пример #14
0
  def test_default_options_with_training_random_emit_ref_sites(self):
    """--training_random_emit_ref_sites reaches the variant caller options."""
    emit_fraction = 0.3
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED
    FLAGS.mode = 'training'
    FLAGS.examples = ''

    FLAGS.training_random_emit_ref_sites = emit_fraction
    opts = make_examples.default_options(add_flags=True)
    caller_options = opts.variant_caller_options
    self.assertAlmostEqual(
        caller_options.fraction_reference_sites_to_emit, emit_fraction)
Пример #15
0
 def test_add_supporting_other_alt_color(self):
   """--add_supporting_other_alt_color sets the two read alpha options."""
   FLAGS.mode = 'training'
   # All path-like flags are blank for this options-only check.
   for blank_flag in ('gvcf', 'reads', 'ref', 'examples'):
     setattr(FLAGS, blank_flag, '')
   FLAGS.add_supporting_other_alt_color = True
   opts = make_examples.default_options(add_flags=True)
   pic_options = opts.pic_options
   self.assertAlmostEqual(pic_options.other_allele_supporting_read_alpha, 0.3)
   self.assertAlmostEqual(pic_options.allele_unsupporting_read_alpha, 0.6)
Пример #16
0
  def test_default_options_with_training_random_emit_ref_sites(self):
    """--training_random_emit_ref_sites reaches the variant caller options."""
    emit_fraction = 0.3
    FLAGS.ref = test_utils.CHR20_FASTA
    FLAGS.reads = test_utils.CHR20_BAM
    FLAGS.truth_variants = test_utils.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = test_utils.CONFIDENT_REGIONS_BED
    FLAGS.mode = 'training'
    FLAGS.examples = ''

    FLAGS.training_random_emit_ref_sites = emit_fraction
    opts = make_examples.default_options(add_flags=True)
    caller_options = opts.variant_caller_options
    self.assertAlmostEqual(
        caller_options.fraction_reference_sites_to_emit, emit_fraction)
Пример #17
0
  def test_default_options_without_training_random_emit_ref_sites(self):
    """Without the emit-ref-sites flag the emitted fraction is exactly 0."""
    FLAGS.ref = test_utils.CHR20_FASTA
    FLAGS.reads = test_utils.CHR20_BAM
    FLAGS.truth_variants = test_utils.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = test_utils.CONFIDENT_REGIONS_BED
    FLAGS.mode = 'training'
    FLAGS.examples = ''

    opts = make_examples.default_options(add_flags=True)
    # In proto3, there is no way to check presence of scalar field:
    # redacted
    # As an approximation, we directly check that the value should be exactly 0.
    emitted_fraction = (
        opts.variant_caller_options.fraction_reference_sites_to_emit)
    self.assertEqual(emitted_fraction, 0.0)
Пример #18
0
  def setUp(self):
    """Builds training-mode options and a RegionProcessor with a mocked init."""
    self.region = ranges.parse_literal('chr20:10,000,000-10,000,100')

    FLAGS.reads = ''
    # Start from flag-free defaults and fill in the test inputs by hand.
    opts = make_examples.default_options(add_flags=False)
    opts.reference_filename = testdata.CHR20_FASTA
    opts.reads_filename = testdata.CHR20_BAM
    opts.truth_variants_filename = testdata.TRUTH_VARIANTS_VCF
    opts.mode = deepvariant_pb2.DeepVariantOptions.TRAINING
    self.options = opts

    self.processor = make_examples.RegionProcessor(opts)
    self.mock_init = self.add_mock('_initialize')
    self.default_shape = [5, 5, 7]
    self.default_format = 'raw'
    def setUp(self):
        """Builds training-mode options and a RegionProcessor with a mocked init."""
        self.region = ranges.parse_literal('chr20:10,000,000-10,000,100')

        FLAGS.reads = ''
        # Start from flag-free defaults and fill in the test inputs by hand.
        opts = make_examples.default_options(add_flags=False)
        opts.reference_filename = test_utils.CHR20_FASTA
        opts.reads_filename = test_utils.CHR20_BAM
        opts.truth_variants_filename = test_utils.TRUTH_VARIANTS_VCF
        opts.mode = deepvariant_pb2.DeepVariantOptions.TRAINING
        self.options = opts

        self.processor = make_examples.RegionProcessor(opts)
        self.mock_init = self.add_mock('_initialize')
        self.default_shape = [5, 5, 7]
        self.default_format = 'raw'
Пример #20
0
  def test_default_options_without_training_random_emit_ref_sites(self):
    """Without the emit-ref-sites flag the emitted fraction is exactly 0."""
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED
    FLAGS.mode = 'training'
    FLAGS.examples = ''

    opts = make_examples.default_options(add_flags=True)
    # In proto3, there is no way to check presence of scalar field:
    # redacted
    # As an approximation, we directly check that the value should be exactly 0.
    emitted_fraction = (
        opts.variant_caller_options.fraction_reference_sites_to_emit)
    self.assertEqual(emitted_fraction, 0.0)
Пример #21
0
  def test_regions_and_exclude_regions_flags(self):
    """--exclude_regions is cut out of --regions when computing the ranges."""
    FLAGS.mode = 'calling'
    FLAGS.ref = test_utils.CHR20_FASTA
    FLAGS.reads = test_utils.CHR20_BAM
    FLAGS.regions = 'chr20:10,000,000-11,000,000'
    FLAGS.examples = 'examples.tfrecord'
    FLAGS.exclude_regions = 'chr20:10,010,000-10,100,000'

    opts = make_examples.default_options(add_flags=True)
    processing_regions = make_examples.processing_regions_from_options(opts)
    actual_ranges = list(ranges.RangeSet(processing_regions))
    # What is left of the requested region on each side of the exclusion.
    expected_ranges = _from_literals_list(
        ['chr20:10,000,000-10,009,999', 'chr20:10,100,001-11,000,000'])
    self.assertCountEqual(actual_ranges, expected_ranges)
Пример #22
0
  def test_regions_and_exclude_regions_flags(self):
    """--exclude_regions is cut out of --regions when computing the ranges."""
    FLAGS.mode = 'calling'
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.regions = 'chr20:10,000,000-11,000,000'
    FLAGS.examples = 'examples.tfrecord'
    FLAGS.exclude_regions = 'chr20:10,010,000-10,100,000'

    opts = make_examples.default_options(add_flags=True)
    processing_regions = make_examples.processing_regions_from_options(opts)
    actual_ranges = list(ranges.RangeSet(processing_regions))
    # What is left of the requested region on each side of the exclusion.
    expected_ranges = _from_literals_list(
        ['chr20:10,000,000-10,009,999', 'chr20:10,100,001-11,000,000'])
    self.assertCountEqual(actual_ranges, expected_ranges)
Пример #23
0
  def test_sharded_outputs1(self, settings):
    """Checks the output filenames that default_options derives from flags.

    Args:
      settings: Dict mapping a flag name to a (flag value, expected option
        value) pair. Output flags missing from it are expected to yield ''.
    """
    # Apply every requested flag value; the expectation is used further down.
    for flag_name, (value_to_set, _) in settings.items():
      setattr(FLAGS, flag_name, value_to_set)

    FLAGS.mode = 'training'
    FLAGS.reads = ''
    FLAGS.ref = ''
    opts = make_examples.default_options(add_flags=True)

    # Compare each output option with its expectation.
    actual_by_flag = [
        ('examples', opts.examples_filename),
        ('candidates', opts.candidates_filename),
        ('gvcf', opts.gvcf_filename),
    ]
    for flag_name, actual in actual_by_flag:
      if flag_name in settings:
        expected = settings[flag_name][1]
      else:
        expected = ''
      self.assertEqual(expected, actual)
    def test_sharded_outputs1(self, settings):
        """Checks the output filenames that default_options derives from flags.

        Args:
          settings: Dict mapping a flag name to a (flag value, expected option
            value) pair. Output flags missing from it are expected to yield ''.
        """
        # Apply every requested flag value; the expectation is used later.
        for flag_name, (value_to_set, _) in settings.items():
            setattr(FLAGS, flag_name, value_to_set)

        FLAGS.mode = 'training'
        FLAGS.reads = ''
        FLAGS.ref = ''
        opts = make_examples.default_options(add_flags=True)

        # Compare each output option with its expectation.
        actual_by_flag = [
            ('examples', opts.examples_filename),
            ('candidates', opts.candidates_filename),
            ('gvcf', opts.gvcf_filename),
        ]
        for flag_name, actual in actual_by_flag:
            if flag_name in settings:
                expected = settings[flag_name][1]
            else:
                expected = ''
            self.assertEqual(expected, actual)
Пример #25
0
  def test_make_examples_end2end_failed_on_cram(self):
    """make_examples raises on CRAM input when --use_ref_for_cram is off."""
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')

    FLAGS.use_ref_for_cram = False
    FLAGS.write_run_info = True
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_CRAM
    candidates_path = test_utils.test_tmpfile(_sharded('failed.vsc.tfrecord'))
    FLAGS.candidates = candidates_path
    examples_path = test_utils.test_tmpfile(
        _sharded('failed.examples.tfrecord'))
    FLAGS.examples = examples_path
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = 'calling'
    FLAGS.gvcf_gq_binsize = 5
    run_options = make_examples.default_options(add_flags=True)
    expected_error = 'Failed to parse BAM/CRAM file.'
    with six.assertRaisesRegex(self, ValueError, expected_error):
      make_examples_core.make_examples_runner(run_options)
Пример #26
0
  def test_make_examples_with_variant_selection(self,
                                                select_types,
                                                expected_count,
                                                keep_legacy_behavior=False):
    """Checks how many candidates are written for a variant-type selection.

    Args:
      select_types: Value for FLAGS.select_variant_types, or None to leave
        the flag untouched.
      expected_count: Expected number of candidate records written.
      keep_legacy_behavior: Value for
        FLAGS.keep_legacy_allele_counter_behavior.
    """
    if select_types is not None:
      FLAGS.select_variant_types = select_types
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
    FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
    FLAGS.partition_size = 1000
    FLAGS.mode = 'calling'
    FLAGS.keep_legacy_allele_counter_behavior = keep_legacy_behavior
    run_options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(run_options)

    # Count the candidate records that the run wrote.
    written_candidates = list(tfrecord.read_tfrecords(FLAGS.candidates))
    self.assertLen(written_candidates, expected_count)
Пример #27
0
  def test_make_examples_end2end_failed_on_mismatched_multi_bam(self):
    """make_examples raises when one of two input BAMs lacks the contig."""
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')

    FLAGS.write_run_info = True
    FLAGS.ref = testdata.CHR20_FASTA
    # Two comma-separated inputs; the second is the NOCHR20 test BAM.
    bam_paths = [testdata.CHR20_BAM, testdata.NOCHR20_BAM]
    FLAGS.reads = ','.join(bam_paths)
    FLAGS.candidates = test_utils.test_tmpfile(
        _sharded('mismatched_multi_bam.vsc.tfrecord'))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('mismatched_multi_bam.examples.tfrecord'))
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = 'calling'
    FLAGS.gvcf_gq_binsize = 5
    run_options = make_examples.default_options(add_flags=True)
    # This shows an example of what the error message looks like:
    # redacted
    expected_error = ('NOT_FOUND: Unknown reference_name '
                      'reference_name: "chr20" start: 9999999 end: 10000999')
    with six.assertRaisesRegex(self, ValueError, expected_error):
      make_examples_core.make_examples_runner(run_options)
Пример #28
0
  def test_confident_regions(self):
    """read_confident_regions returns exactly the expected intervals."""
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED
    FLAGS.mode = 'training'
    FLAGS.examples = ''

    opts = make_examples.default_options(add_flags=True)
    actual_regions = make_examples.read_confident_regions(opts)

    # Our expected intervals, inlined from CONFIDENT_REGIONS_BED.
    expected_literals = [
        'chr20:10000847-10002407', 'chr20:10002521-10004171',
        'chr20:10004274-10004964', 'chr20:10004995-10006386',
        'chr20:10006410-10007800', 'chr20:10007825-10008018',
        'chr20:10008044-10008079', 'chr20:10008101-10008707',
        'chr20:10008809-10008897', 'chr20:10009003-10009791',
        'chr20:10009934-10010531'
    ]
    expected_regions = _from_literals_list(expected_literals)
    # Order is irrelevant; the two collections must hold the same intervals.
    self.assertCountEqual(expected_regions, list(actual_regions))
Пример #29
0
    def test_confident_regions(self):
        """read_confident_regions returns exactly the expected intervals."""
        FLAGS.ref = testdata.CHR20_FASTA
        FLAGS.reads = testdata.CHR20_BAM
        FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
        FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED
        FLAGS.mode = 'training'
        FLAGS.examples = ''

        opts = make_examples.default_options(add_flags=True)
        actual_regions = make_examples_core.read_confident_regions(opts)

        # Our expected intervals, inlined from CONFIDENT_REGIONS_BED.
        expected_literals = [
            'chr20:10000847-10002407', 'chr20:10002521-10004171',
            'chr20:10004274-10004964', 'chr20:10004995-10006386',
            'chr20:10006410-10007800', 'chr20:10007825-10008018',
            'chr20:10008044-10008079', 'chr20:10008101-10008707',
            'chr20:10008809-10008897', 'chr20:10009003-10009791',
            'chr20:10009934-10010531'
        ]
        expected_regions = _from_literals_list(expected_literals)
        # Order is irrelevant; both collections must hold the same intervals.
        six.assertCountEqual(self, expected_regions, list(actual_regions))
Пример #30
0
 def test_make_examples_training_end2end_with_customized_classes_labeler(self):
   """Runs training-mode make_examples with the customized classes labeler."""
   # Labeler configuration.
   FLAGS.labeler_algorithm = 'customized_classes_labeler'
   FLAGS.customized_classes_labeler_classes_list = 'ref,class1,class2'
   FLAGS.customized_classes_labeler_info_field_name = 'type'
   region = ranges.parse_literal('chr20:10,000,000-10,004,000')
   FLAGS.regions = [ranges.to_literal(region)]
   FLAGS.ref = testdata.CHR20_FASTA
   FLAGS.reads = testdata.CHR20_BAM
   FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
   FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
   FLAGS.partition_size = 1000
   FLAGS.mode = 'training'
   FLAGS.gvcf_gq_binsize = 5
   FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF_WITH_TYPES
   FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED
   run_options = make_examples.default_options(add_flags=True)
   make_examples_core.make_examples_runner(run_options)
   golden_file = _sharded(testdata.CUSTOMIZED_CLASSES_GOLDEN_TRAINING_EXAMPLES)
   # The generated examples must be valid and identical to the golden ones.
   generated = self.verify_examples(
       FLAGS.examples, region, run_options, verify_labels=True)
   golden = list(tfrecord.read_tfrecords(golden_file))
   self.assertDeepVariantExamplesEqual(generated, golden)
Пример #31
0
    def setUp(self):
        """Builds training-mode options and a RegionProcessor for each test.

        Flag values are saved first, the processor's 'initialize' is mocked
        via self.add_mock, and every sample's in-memory SAM reader is replaced
        with a mock.
        """
        super(RegionProcessorTest, self).setUp()
        # Snapshot the current flag values before this fixture changes any.
        self._saved_flags = flagsaver.save_flag_values()
        self.region = ranges.parse_literal('chr20:10,000,000-10,000,100')

        FLAGS.reads = ''
        self.options = make_examples.default_options(add_flags=False)
        self.options.reference_filename = testdata.CHR20_FASTA
        main_sample = self.options.sample_options[0]
        # Only add the test BAM when the first sample has no reads files yet.
        if not main_sample.reads_filenames:
            main_sample.reads_filenames.append(testdata.CHR20_BAM)
        main_sample.variant_caller_options.sample_name = 'sample_id'
        main_sample.name = 'sample_id'
        self.options.truth_variants_filename = testdata.TRUTH_VARIANTS_VCF
        self.options.mode = deepvariant_pb2.MakeExamplesOptions.TRAINING
        self.processor = make_examples_core.RegionProcessor(self.options)
        self.ref_reader = fasta.IndexedFastaReader(
            self.options.reference_filename)
        self.mock_init = self.add_mock('initialize')
        # Replace each sample's in-memory SAM reader with a mock.
        for sample in self.processor.samples:
            sample.in_memory_sam_reader = mock.Mock()
        self.default_shape = [5, 5, 7]
        self.default_format = 'raw'
    def test_make_examples_end2end(self, mode, num_shards):
        """Runs make_examples end to end and compares against golden examples.

        Args:
          mode: Value for FLAGS.mode; must be 'calling' or 'training'.
          num_shards: Shard count passed to _sharded for every output path.
            One task is run per shard, and at least one task is always run.
        """
        self.assertIn(mode, {'calling', 'training'})
        region = ranges.parse_literal('chr20:10,000,000-10,010,000')
        FLAGS.ref = test_utils.CHR20_FASTA
        FLAGS.reads = test_utils.CHR20_BAM
        FLAGS.candidates = test_utils.test_tmpfile(
            _sharded('vsc.tfrecord', num_shards))
        FLAGS.examples = test_utils.test_tmpfile(
            _sharded('examples.tfrecord', num_shards))
        FLAGS.regions = [ranges.to_literal(region)]
        FLAGS.partition_size = 1000
        FLAGS.mode = mode

        # Calling mode also writes gVCF records; training mode instead needs
        # truth variants and confident regions.
        if mode == 'calling':
            FLAGS.gvcf = test_utils.test_tmpfile(
                _sharded('gvcf.tfrecord', num_shards))
        else:
            FLAGS.truth_variants = test_utils.TRUTH_VARIANTS_VCF
            FLAGS.confident_regions = test_utils.CONFIDENT_REGIONS_BED

        # Run every task in turn. `options` from the last iteration is reused
        # by the verification steps below.
        for task_id in range(max(num_shards, 1)):
            FLAGS.task = task_id
            options = make_examples.default_options(add_flags=True)
            make_examples.make_examples_runner(options)

        # Test that our candidates are reasonable, calling specific helper functions
        # to check lots of properties of the output.
        candidates = _sort_candidates(
            io_utils.read_tfrecords(FLAGS.candidates,
                                    proto=deepvariant_pb2.DeepVariantCall))
        self.verify_deepvariant_calls(candidates, options)
        self.verify_variants([call.variant for call in candidates],
                             region,
                             options,
                             is_gvcf=False)

        # Verify that the variants in the examples are all good.
        examples = self.verify_examples(FLAGS.examples,
                                        region,
                                        options,
                                        verify_labels=mode == 'training')
        example_variants = [tf_utils.example_variant(ex) for ex in examples]
        self.verify_variants(example_variants, region, options, is_gvcf=False)

        # Verify the integrity of the examples and then check that they match our
        # golden labeled examples. Note we expect the order for both training and
        # calling modes to produce deterministic order because we fix the random
        # seed.
        if mode == 'calling':
            golden_file = _sharded(test_utils.GOLDEN_CALLING_EXAMPLES,
                                   num_shards)
        else:
            golden_file = _sharded(test_utils.GOLDEN_TRAINING_EXAMPLES,
                                   num_shards)
        self.assertDeepVariantExamplesEqual(
            examples, list(io_utils.read_tfrecords(golden_file)))

        if mode == 'calling':
            # Compare the example variants with the truth variants that
            # overlap the region.
            nist_reader = genomics_io.make_vcf_reader(
                test_utils.TRUTH_VARIANTS_VCF)
            nist_variants = list(nist_reader.query(region))
            self.verify_nist_concordance(example_variants, nist_variants)

            # Check the quality of our generated gvcf file.
            gvcfs = _sort_variants(
                io_utils.read_tfrecords(FLAGS.gvcf,
                                        proto=variants_pb2.Variant))
            self.verify_variants(gvcfs, region, options, is_gvcf=True)
            self.verify_contiguity(gvcfs, region)
Пример #33
0
 def test_invalid_sequencing_type(self):
   """default_options raises ValueError for the sequencing type 'wGs'."""
   FLAGS.mode = 'training'
   FLAGS.sequencing_type = 'wGs'
   # Callable form of assertRaises: invokes default_options(add_flags=True).
   self.assertRaises(
       ValueError, make_examples.default_options, add_flags=True)
Пример #34
0
  def test_make_examples_end2end(self,
                                 mode,
                                 num_shards,
                                 test_condition=TestConditions.USE_BAM,
                                 labeler_algorithm=None,
                                 use_fast_pass_aligner=True):
    """Runs make_examples over a small chr20 region and checks its outputs.

    One task is run for every index in range(max(num_shards, 1)). Afterwards
    the candidates, the examples and (in calling mode) the gVCF records are
    verified and compared with the golden test data.

    Args:
      mode: Either 'calling' or 'training'.
      num_shards: Shard count used to build the output paths; at least one
        task is always run.
      test_condition: TestConditions value selecting the reads input (a BAM, a
        CRAM, or two BAM files joined with a comma).
      labeler_algorithm: If not None, assigned to FLAGS.labeler_algorithm.
      use_fast_pass_aligner: Assigned to FLAGS.use_fast_pass_aligner.
    """
    self.assertIn(mode, {'calling', 'training'})
    is_calling = mode == 'calling'
    is_training = mode == 'training'
    test_region = ranges.parse_literal('chr20:10,000,000-10,010,000')

    # Inputs.
    FLAGS.write_run_info = True
    FLAGS.ref = testdata.CHR20_FASTA
    if test_condition == TestConditions.USE_BAM:
      FLAGS.reads = testdata.CHR20_BAM
    elif test_condition == TestConditions.USE_CRAM:
      FLAGS.reads = testdata.CHR20_CRAM
    elif test_condition == TestConditions.USE_MULTI_BAMS:
      bam_halves = [
          testdata.CHR20_BAM_FIRST_HALF, testdata.CHR20_BAM_SECOND_HALF
      ]
      FLAGS.reads = ','.join(bam_halves)

    # Outputs and run configuration.
    candidates_path = test_utils.test_tmpfile(
        _sharded('vsc.tfrecord', num_shards))
    FLAGS.candidates = candidates_path
    examples_path = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    FLAGS.examples = examples_path
    FLAGS.regions = [ranges.to_literal(test_region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = mode
    FLAGS.gvcf_gq_binsize = 5
    FLAGS.use_fast_pass_aligner = use_fast_pass_aligner
    if labeler_algorithm is not None:
      FLAGS.labeler_algorithm = labeler_algorithm

    if is_calling:
      gvcf_path = test_utils.test_tmpfile(
          _sharded('gvcf.tfrecord', num_shards))
      FLAGS.gvcf = gvcf_path
    else:
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
      FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

    task_count = max(num_shards, 1)
    for task_index in range(task_count):
      FLAGS.task = task_index
      options = make_examples.default_options(add_flags=True)
      # The golden set was generated from the BAM file and the BAM filename is
      # stored in the candidates, so bam_fname is overwritten here; otherwise
      # the USE_CRAM variants would not match and the test would fail.
      options.bam_fname = 'NA12878_S1.chr20.10_10p1mb.bam'
      make_examples_core.make_examples_runner(options)

      # The run_info proto must carry the basic fields we expect:
      # (a) the options we ran with, stored in run_info.options;
      run_info = make_examples_core.read_make_examples_run_info(
          options.run_info_filename)
      self.assertEqual(run_info.options, options)
      # (b) a resource_metrics field that records our hostname.
      self.assertTrue(run_info.HasField('resource_metrics'))
      self.assertEqual(run_info.resource_metrics.host_name, platform.node())

    # Check that the candidates are reasonable; the helper functions inspect
    # many properties of the output.
    sorted_calls = list(
        tfrecord.read_tfrecords(
            FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall))
    sorted_calls.sort(
        key=lambda call: variant_utils.variant_range_tuple(call.variant))
    self.verify_deepvariant_calls(sorted_calls, options)
    candidate_variants = [call.variant for call in sorted_calls]
    self.verify_variants(
        candidate_variants, test_region, options, is_gvcf=False)

    # Check that the variants carried by the examples are all good.
    verified_examples = self.verify_examples(
        FLAGS.examples, test_region, options, verify_labels=is_training)
    example_variants = [
        tf_utils.example_variant(example) for example in verified_examples
    ]
    self.verify_variants(
        example_variants, test_region, options, is_gvcf=False)

    # Compare the examples with the golden labeled examples. The order is
    # expected to be deterministic in both training and calling modes because
    # the random seed is fixed.
    golden_source = (
        testdata.GOLDEN_CALLING_EXAMPLES
        if is_calling else testdata.GOLDEN_TRAINING_EXAMPLES)
    golden_examples = list(
        tfrecord.read_tfrecords(_sharded(golden_source, num_shards)))
    self.assertDeepVariantExamplesEqual(verified_examples, golden_examples)

    if is_calling:
      truth_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
      truth_variants = list(truth_reader.query(test_region))
      self.verify_nist_concordance(example_variants, truth_variants)

      # Check the quality of the generated gvcf file.
      gvcf_records = variant_utils.sorted_variants(
          tfrecord.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
      self.verify_variants(gvcf_records, test_region, options, is_gvcf=True)
      self.verify_contiguity(gvcf_records, test_region)
      golden_gvcf_path = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                  num_shards)
      golden_gvcf_records = list(
          tfrecord.read_tfrecords(
              golden_gvcf_path, proto=variants_pb2.Variant))
      # Despite the name, assertCountEqual checks that all elements match.
      self.assertCountEqual(gvcf_records, golden_gvcf_records)

    # The positional labeler doesn't track metrics, so labeling metrics are
    # only compared when another labeler is in use.
    compare_labeling_metrics = (
        is_training and num_shards == 0 and
        labeler_algorithm != 'positional_labeler')
    if compare_labeling_metrics:
      golden_run_info = make_examples_core.read_make_examples_run_info(
          testdata.GOLDEN_MAKE_EXAMPLES_RUN_INFO)
      self.assertEqual(golden_run_info.labeling_metrics,
                       run_info.labeling_metrics)
Пример #35
0
  def test_make_examples_end2end(self, mode, num_shards,
                                 labeler_algorithm=None):
    """Runs make_examples over a small chr20 region and checks its outputs.

    One task is run for every index in range(max(num_shards, 1)). Afterwards
    the candidates, the examples and (in calling mode) the gVCF records are
    verified and compared with the golden test data.

    Args:
      mode: Either 'calling' or 'training'.
      num_shards: Shard count used to build the output paths; at least one
        task is always run.
      labeler_algorithm: If not None, assigned to FLAGS.labeler_algorithm.
    """
    # Do not truncate the diff printed when an equality assertion fails.
    self.maxDiff = None
    self.assertIn(mode, {'calling', 'training'})
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile(
        _sharded('vsc.tfrecord', num_shards))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = mode
    FLAGS.gvcf_gq_binsize = 5
    if labeler_algorithm is not None:
      FLAGS.labeler_algorithm = labeler_algorithm

    if mode == 'calling':
      FLAGS.gvcf = test_utils.test_tmpfile(
          _sharded('gvcf.tfrecord', num_shards))
    else:
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
      FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

    # Run every task; `options` from the last task is reused by the
    # verification helpers below.
    for task_id in range(max(num_shards, 1)):
      FLAGS.task = task_id
      options = make_examples.default_options(add_flags=True)
      make_examples.make_examples_runner(options)

    # Test that our candidates are reasonable, calling specific helper functions
    # to check lots of properties of the output.
    candidates = sorted(
        io_utils.read_tfrecords(
            FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
        key=lambda c: variant_utils.variant_range_tuple(c.variant))
    self.verify_deepvariant_calls(candidates, options)
    self.verify_variants(
        [call.variant for call in candidates], region, options, is_gvcf=False)

    # Verify that the variants in the examples are all good.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=mode == 'training')
    example_variants = [tf_utils.example_variant(ex) for ex in examples]
    self.verify_variants(example_variants, region, options, is_gvcf=False)

    # Verify the integrity of the examples and then check that they match our
    # golden labeled examples. Note we expect the order for both training and
    # calling modes to produce deterministic order because we fix the random
    # seed.
    if mode == 'calling':
      golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
    else:
      golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
    self.assertDeepVariantExamplesEqual(
        examples, list(io_utils.read_tfrecords(golden_file)))

    if mode == 'calling':
      nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
      nist_variants = list(nist_reader.query(region))
      self.verify_nist_concordance(example_variants, nist_variants)

      # Check the quality of our generated gvcf file.
      gvcfs = variant_utils.sorted_variants(
          io_utils.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
      self.verify_variants(gvcfs, region, options, is_gvcf=True)
      self.verify_contiguity(gvcfs, region)
      gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                  num_shards)
      expected_gvcfs = list(
          io_utils.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
      # assertItemsEqual exists only on Python 2's unittest.TestCase; Python 3
      # names the same order-insensitive comparison assertCountEqual. Despite
      # the name, it checks that all elements match.
      self.assertCountEqual(gvcfs, expected_gvcfs)