def test_flags_strictly_needs_sam_aux_fields(
    self, flags_strictly_needs_sam_aux_fields):
  """Checks that a flag requiring aux fields fails without them.

  Args:
    flags_strictly_needs_sam_aux_fields: name of the flag that is switched on
      while --parse_sam_aux_fields is switched off.
  """
  flag_name = flags_strictly_needs_sam_aux_fields
  FLAGS.mode = 'calling'
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.examples = 'examples.tfrecord'
  FLAGS[flag_name].value = True
  FLAGS.parse_sam_aux_fields = False

  # The message is used as the regex the raised exception must match.
  expected_error = (
      'If --{} is set then --parse_sam_aux_fields must be set too.'.format(
          flag_name))
  with six.assertRaisesRegex(self, Exception, expected_error):
    make_examples.default_options(add_flags=True)
def test_make_examples_runtime_by_region(self):
  """Runs one task of make_examples and checks its runtime-by-region file.

  The test sets --runtime_by_region to a sharded path, runs task 2 of 4, and
  then verifies the header, the number of rows and that the count columns of
  the first row are positive.
  """
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.mode = 'calling'
  num_shards = 4
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  # Use same number of shards for profiling files as examples.
  output_prefix = test_utils.test_tmpfile('runtime_profile')
  FLAGS.runtime_by_region = output_prefix + '@{}'.format(num_shards)
  FLAGS.task = 2

  # Run make_examples with those FLAGS.
  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)

  # Sharded output ending in @4 becomes -00002-of-00004 for task 2. The path
  # is derived from the task and shard count instead of being hard-coded so
  # it stays correct if either value above is changed.
  expected_output_path = '{}-{:05d}-of-{:05d}'.format(
      output_prefix, FLAGS.task, num_shards)
  expected_columns = [
      'region', 'get reads', 'find candidates', 'make pileup images',
      'write outputs', 'num reads', 'num candidates', 'num examples'
  ]

  with gfile.Open(expected_output_path, 'r') as fin:
    header = fin.readline()
    column_names = header.strip().split('\t')
    self.assertEqual(expected_columns, column_names)
    non_header_lines = fin.readlines()
    self.assertLen(non_header_lines, 3)
    one_row = non_header_lines[0].strip().split('\t')
    self.assertLen(one_row, len(column_names))
    # Columns 5-7 are the three count columns listed in expected_columns.
    self.assertGreater(int(one_row[5]), 0, msg='num reads > 0')
    self.assertGreater(int(one_row[6]), 0, msg='num candidates > 0')
    self.assertGreater(int(one_row[7]), 0, msg='num examples > 0')
def test_make_examples_training_end2end_with_alt_aligned_pileup(
    self, alt_align, expected_shape):
  """Runs training-mode make_examples with --alt_aligned_pileup set.

  Args:
    alt_align: value assigned to --alt_aligned_pileup; golden data is only
      available for 'rows' and 'diff_channels'.
    expected_shape: expected 'image/shape' of the first generated example.

  Raises:
    ValueError: if there is no golden file for `alt_align`.
  """
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
  FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
  FLAGS.partition_size = 1000
  FLAGS.mode = 'training'
  FLAGS.gvcf_gq_binsize = 5
  # The only input that varies between parameterizations.
  FLAGS.alt_aligned_pileup = alt_align
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
  FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

  # Run make_examples with the flags above.
  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)

  # Pick the golden file that corresponds to the pileup layout under test.
  if alt_align == 'rows':
    golden_name = testdata.ALT_ALIGNED_ROWS_EXAMPLES
  elif alt_align == 'diff_channels':
    golden_name = testdata.ALT_ALIGNED_DIFF_CHANNELS_EXAMPLES
  else:
    raise ValueError("Golden data doesn't exist for this alt_align option: "
                     '{}'.format(alt_align))
  golden_file = _sharded(golden_name)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=True)
  golden_examples = list(tfrecord.read_tfrecords(golden_file))
  self.assertDeepVariantExamplesEqual(examples, golden_examples)

  # The image shape depends on the alt_align mode, hence the parameter.
  first_example = decode_example(examples[0])
  self.assertEqual(first_example['image/shape'], expected_shape)
def _get_examples(use_confident_regions=False):
  """Generates examples with the BED file passed through one of two flags.

  Uses `self` from the enclosing scope. The BED file is passed either via
  --confident_regions or via --regions; both should constrain the set of
  candidates generated and therefore produce the same examples.

  Args:
    use_confident_regions: if True the BED path goes to --confident_regions
      and --regions is cleared; otherwise the reverse.

  Returns:
    The examples returned by self.verify_examples.
  """
  bed_path = test_utils.test_tmpfile('vcf_candidate_importer.bed')
  bed_line = '\t'.join(['chr20', '10000000', '10001000']) + '\n'
  with gfile.Open(bed_path, 'w') as fout:
    fout.write(bed_line)

  # Exactly one of the two flags receives the BED path.
  if use_confident_regions:
    confident_regions_value, regions_value = bed_path, ''
  else:
    confident_regions_value, regions_value = '', bed_path
  FLAGS.confident_regions = confident_regions_value
  FLAGS.regions = regions_value

  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('vcf_candidate_importer.tfrecord'))
  FLAGS.mode = 'training'
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
  FLAGS.variant_caller = 'vcf_candidate_importer'

  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)

  # Verify that the variants in the examples are all good.
  return self.verify_examples(
      FLAGS.examples, None, options, verify_labels=False)
def test_make_examples_end2end_vcf_candidate_importer(self, mode):
  """Runs make_examples end to end with the vcf_candidate_importer caller.

  Args:
    mode: value for --mode; 'calling' uses proposed variants and a fixed
      region, any other value uses the truth variants.
  """
  FLAGS.variant_caller = 'vcf_candidate_importer'
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  candidates_name = 'vcf_candidate_importer.{}.tfrecord'.format(mode)
  FLAGS.candidates = test_utils.test_tmpfile(_sharded(candidates_name))
  examples_name = 'vcf_candidate_importer.examples.{}.tfrecord'.format(mode)
  FLAGS.examples = test_utils.test_tmpfile(_sharded(examples_name))
  FLAGS.mode = mode

  if mode == 'calling':
    golden_file = _sharded(
        testdata.GOLDEN_VCF_CANDIDATE_IMPORTER_CALLING_EXAMPLES)
    FLAGS.proposed_variants = testdata.VCF_CANDIDATE_IMPORTER_VARIANTS
    # Adding the following flags to match how the testdata was created.
    FLAGS.regions = 'chr20:59,777,000-60,000,000'
    FLAGS.realign_reads = False
  else:
    golden_file = _sharded(
        testdata.GOLDEN_VCF_CANDIDATE_IMPORTER_TRAINING_EXAMPLES)
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF

  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)

  # Verify that the variants in the examples are all good; labels are only
  # checked in training mode.
  check_labels = mode == 'training'
  examples = self.verify_examples(
      FLAGS.examples, None, options, verify_labels=check_labels)
  golden_examples = list(tfrecord.read_tfrecords(golden_file))
  self.assertDeepVariantExamplesEqual(examples, golden_examples)

  expected_shape = [100, 221, dv_constants.PILEUP_NUM_CHANNELS]
  self.assertEqual(decode_example(examples[0])['image/shape'], expected_shape)
def test_gvcf_output_enabled_is_false_without_gvcf_flag(self):
  """gvcf_output_enabled is False when --gvcf is the empty string."""
  FLAGS.mode = 'training'
  FLAGS.gvcf = ''
  FLAGS.reads = ''
  FLAGS.ref = ''
  FLAGS.examples = ''

  options = make_examples.default_options(add_flags=True)
  gvcf_enabled = make_examples.gvcf_output_enabled(options)

  self.assertFalse(gvcf_enabled)
def _get_examples(downsample_fraction=None):
  """Runs make_examples and returns the verified examples.

  Uses `self` and `region` from the enclosing scope.

  Args:
    downsample_fraction: if not None, assigned to --downsample_fraction
      before the options are built.

  Returns:
    The examples returned by self.verify_examples.
  """
  if downsample_fraction is not None:
    FLAGS.downsample_fraction = downsample_fraction

  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)

  return self.verify_examples(
      FLAGS.examples, region, options, verify_labels=False)
def test_gvcf_output_enabled_is_true_with_gvcf_flag(self):
  """gvcf_output_enabled is True when --gvcf is a non-empty path."""
  FLAGS.mode = 'training'
  FLAGS.gvcf = '/tmp/foo.vcf'
  FLAGS.reads = ''
  FLAGS.ref = ''
  FLAGS.examples = ''

  options = make_examples.default_options(add_flags=True)
  gvcf_enabled = make_examples_core.gvcf_output_enabled(options)

  self.assertTrue(gvcf_enabled)
def test_gvcf_output_enabled_is_true_with_gvcf_flag(self):
  """gvcf_output_enabled is True when --gvcf is a non-empty path."""
  FLAGS.mode = 'training'
  FLAGS.gvcf = '/tmp/foo.vcf'
  FLAGS.reads = ''
  FLAGS.ref = ''
  FLAGS.examples = ''

  options = make_examples.default_options(add_flags=True)
  gvcf_enabled = make_examples.gvcf_output_enabled(options)

  self.assertTrue(gvcf_enabled)
def test_make_examples_with_allele_frequency(self, mode):
  """Runs calling-mode make_examples with the allele frequency channel on.

  Args:
    mode: 'one vcf' to pass a single population VCF, 'two vcfs' to pass two
      space-separated VCFs.

  Raises:
    ValueError: if `mode` is neither of the two supported values.
  """
  FLAGS.mode = 'calling'
  FLAGS.ref = testdata.GRCH38_FASTA
  FLAGS.reads = testdata.GRCH38_CHR20_AND_21_BAM
  num_shards = 1
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  region = ranges.parse_literal('chr20:61001-62000')
  FLAGS.use_allele_frequency = True
  FLAGS.regions = [ranges.to_literal(region)]
  if mode == 'one vcf':
    FLAGS.population_vcfs = testdata.AF_VCF_CHR20_AND_21
  elif mode == 'two vcfs':
    FLAGS.population_vcfs = ' '.join(
        [testdata.AF_VCF_CHR20, testdata.AF_VCF_CHR21])
  else:
    raise ValueError('Invalid mode for parameterized test.')

  # Run make_examples with the flags above.
  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=False)

  # Pileup images should have one extra channel.
  extra_channel_index = dv_constants.PILEUP_NUM_CHANNELS
  self.assertEqual([100, 221, dv_constants.PILEUP_NUM_CHANNELS + 1],
                   decode_example(examples[0])['image/shape'])

  # Test there is something in the added channel.
  # Values record whether each locus has been seen in the observed examples.
  population_matched_loci = {
      'chr20:61539_A': False,
      'chr20:61634_G': False,
      'chr20:61644_G': False
  }
  for example in examples:
    locus_id = vis.locus_id_from_variant(vis.variant_from_example(example))
    if locus_id not in population_matched_loci:
      continue
    channels = vis.channels_from_example(example)
    self.assertGreater(
        np.sum(channels[extra_channel_index]),
        0,
        msg='There should be '
        'something in the %s-th channel for variant '
        '%s' % (dv_constants.PILEUP_NUM_CHANNELS + 1, locus_id))
    population_matched_loci[locus_id] = True

  self.assertTrue(
      all(population_matched_loci.values()),
      msg='Check that all '
      '3 sample loci appeared in the examples.')

  # Check against the golden file (same for both modes).
  golden_file = _sharded(testdata.GOLDEN_ALLELE_FREQUENCY_EXAMPLES)
  examples_from_golden = list(tfrecord.read_tfrecords(golden_file))
  self.assertDeepVariantExamplesEqual(examples_from_golden, examples)
def test_min_base_quality(self):
  """--min_base_quality is copied into the pileup read requirements."""
  FLAGS.min_base_quality = 5
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
  FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED
  FLAGS.mode = 'training'
  FLAGS.examples = ''

  options = make_examples.default_options(add_flags=True)
  read_requirements = options.pic_options.read_requirements

  self.assertEqual(read_requirements.min_base_quality, 5)
def test_flag_optionally_needs_sam_aux_fields_with_different_parse_sam_aux_fields(
    self, flag_optionally_needs_sam_aux_fields, parse_sam_aux_fields,
    expected_message):
  """Checks the log message about --parse_sam_aux_fields, if one is emitted.

  Args:
    flag_optionally_needs_sam_aux_fields: name of the flag to switch on.
    parse_sam_aux_fields: value assigned to --parse_sam_aux_fields.
    expected_message: format string (with one `{}` for the flag name) used as
      a regex against the first log line mentioning --parse_sam_aux_fields.
  """
  flag_name = flag_optionally_needs_sam_aux_fields
  FLAGS.mode = 'calling'
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.examples = 'examples.tfrecord'
  FLAGS[flag_name].value = True
  FLAGS.parse_sam_aux_fields = parse_sam_aux_fields

  with self.assertLogs() as logs:
    make_examples.default_options(add_flags=True)

  # Only log lines that mention the aux-fields flag are relevant here.
  aux_fields_log_messages = [
      line for line in logs.output if '--parse_sam_aux_fields' in line
  ]
  if not aux_fields_log_messages:
    # NOTE(review): this branch does not check `expected_message`, so a
    # missing-but-expected log line would pass silently - confirm intent.
    self.assertEmpty(aux_fields_log_messages)
    return
  self.assertRegex(aux_fields_log_messages[0],
                   expected_message.format(flag_name))
def test_incorrect_empty_regions(self):
  """A --regions value with the wrong contig name raises ValueError."""
  FLAGS.mode = 'calling'
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  # Deliberately incorrect contig name.
  FLAGS.regions = '20:10,000,000-11,000,000'
  FLAGS.examples = 'examples.tfrecord'

  options = make_examples.default_options(add_flags=True)

  expected_error = 'The regions to call is empty.'
  with six.assertRaisesRegex(self, ValueError, expected_error):
    make_examples_core.processing_regions_from_options(options)
def test_default_options_with_training_random_emit_ref_sites(self):
  """--training_random_emit_ref_sites reaches the variant caller options."""
  emit_fraction = 0.3
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
  FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED
  FLAGS.mode = 'training'
  FLAGS.examples = ''
  FLAGS.training_random_emit_ref_sites = emit_fraction

  options = make_examples.default_options(add_flags=True)
  caller_options = options.variant_caller_options

  self.assertAlmostEqual(caller_options.fraction_reference_sites_to_emit,
                         emit_fraction)
def test_add_supporting_other_alt_color(self):
  """--add_supporting_other_alt_color sets the two read alpha options."""
  FLAGS.mode = 'training'
  FLAGS.gvcf = ''
  FLAGS.reads = ''
  FLAGS.ref = ''
  FLAGS.examples = ''
  FLAGS.add_supporting_other_alt_color = True

  options = make_examples.default_options(add_flags=True)
  pic_options = options.pic_options

  self.assertAlmostEqual(pic_options.other_allele_supporting_read_alpha, 0.3)
  self.assertAlmostEqual(pic_options.allele_unsupporting_read_alpha, 0.6)
def test_default_options_with_training_random_emit_ref_sites(self):
  """--training_random_emit_ref_sites reaches the variant caller options."""
  emit_fraction = 0.3
  FLAGS.ref = test_utils.CHR20_FASTA
  FLAGS.reads = test_utils.CHR20_BAM
  FLAGS.truth_variants = test_utils.TRUTH_VARIANTS_VCF
  FLAGS.confident_regions = test_utils.CONFIDENT_REGIONS_BED
  FLAGS.mode = 'training'
  FLAGS.examples = ''
  FLAGS.training_random_emit_ref_sites = emit_fraction

  options = make_examples.default_options(add_flags=True)
  caller_options = options.variant_caller_options

  self.assertAlmostEqual(caller_options.fraction_reference_sites_to_emit,
                         emit_fraction)
def test_default_options_without_training_random_emit_ref_sites(self):
  """Without the flag, fraction_reference_sites_to_emit stays at 0.0."""
  FLAGS.ref = test_utils.CHR20_FASTA
  FLAGS.reads = test_utils.CHR20_BAM
  FLAGS.truth_variants = test_utils.TRUTH_VARIANTS_VCF
  FLAGS.confident_regions = test_utils.CONFIDENT_REGIONS_BED
  FLAGS.mode = 'training'
  FLAGS.examples = ''

  options = make_examples.default_options(add_flags=True)
  caller_options = options.variant_caller_options

  # In proto3, there is no way to check presence of a scalar field, so as an
  # approximation we directly check that the value is exactly 0.
  self.assertEqual(caller_options.fraction_reference_sites_to_emit, 0.0)
def setUp(self):
  """Builds training-mode options and a RegionProcessor for each test."""
  self.region = ranges.parse_literal('chr20:10,000,000-10,000,100')
  FLAGS.reads = ''

  # `opts` aliases self.options; both names refer to the same proto.
  opts = self.options = make_examples.default_options(add_flags=False)
  opts.reference_filename = testdata.CHR20_FASTA
  opts.reads_filename = testdata.CHR20_BAM
  opts.truth_variants_filename = testdata.TRUTH_VARIANTS_VCF
  opts.mode = deepvariant_pb2.DeepVariantOptions.TRAINING

  self.processor = make_examples.RegionProcessor(opts)
  self.mock_init = self.add_mock('_initialize')
  self.default_shape = [5, 5, 7]
  self.default_format = 'raw'
def setUp(self):
  """Builds training-mode options and a RegionProcessor for each test."""
  self.region = ranges.parse_literal('chr20:10,000,000-10,000,100')
  FLAGS.reads = ''

  # `opts` aliases self.options; both names refer to the same proto.
  opts = self.options = make_examples.default_options(add_flags=False)
  opts.reference_filename = test_utils.CHR20_FASTA
  opts.reads_filename = test_utils.CHR20_BAM
  opts.truth_variants_filename = test_utils.TRUTH_VARIANTS_VCF
  opts.mode = deepvariant_pb2.DeepVariantOptions.TRAINING

  self.processor = make_examples.RegionProcessor(opts)
  self.mock_init = self.add_mock('_initialize')
  self.default_shape = [5, 5, 7]
  self.default_format = 'raw'
def test_default_options_without_training_random_emit_ref_sites(self):
  """Without the flag, fraction_reference_sites_to_emit stays at 0.0."""
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
  FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED
  FLAGS.mode = 'training'
  FLAGS.examples = ''

  options = make_examples.default_options(add_flags=True)
  caller_options = options.variant_caller_options

  # In proto3, there is no way to check presence of a scalar field, so as an
  # approximation we directly check that the value is exactly 0.
  self.assertEqual(caller_options.fraction_reference_sites_to_emit, 0.0)
def test_regions_and_exclude_regions_flags(self):
  """--exclude_regions is subtracted from --regions when both are set."""
  FLAGS.mode = 'calling'
  FLAGS.ref = test_utils.CHR20_FASTA
  FLAGS.reads = test_utils.CHR20_BAM
  FLAGS.regions = 'chr20:10,000,000-11,000,000'
  FLAGS.examples = 'examples.tfrecord'
  FLAGS.exclude_regions = 'chr20:10,010,000-10,100,000'

  options = make_examples.default_options(add_flags=True)
  processing_regions = make_examples.processing_regions_from_options(options)
  actual_ranges = list(ranges.RangeSet(processing_regions))

  # What remains of --regions on either side of the excluded interval.
  expected_ranges = _from_literals_list(
      ['chr20:10,000,000-10,009,999', 'chr20:10,100,001-11,000,000'])
  self.assertCountEqual(actual_ranges, expected_ranges)
def test_regions_and_exclude_regions_flags(self):
  """--exclude_regions is subtracted from --regions when both are set."""
  FLAGS.mode = 'calling'
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.regions = 'chr20:10,000,000-11,000,000'
  FLAGS.examples = 'examples.tfrecord'
  FLAGS.exclude_regions = 'chr20:10,010,000-10,100,000'

  options = make_examples.default_options(add_flags=True)
  processing_regions = make_examples.processing_regions_from_options(options)
  actual_ranges = list(ranges.RangeSet(processing_regions))

  # What remains of --regions on either side of the excluded interval.
  expected_ranges = _from_literals_list(
      ['chr20:10,000,000-10,009,999', 'chr20:10,100,001-11,000,000'])
  self.assertCountEqual(actual_ranges, expected_ranges)
def test_sharded_outputs1(self, settings):
  """Checks how output flags map onto the options' output filenames.

  Args:
    settings: dict mapping a flag name to a (flag value, expected option
      value) pair. Outputs not present in `settings` are expected to be ''.
  """
  # Set all of the requested flag values.
  for flag_name, (flag_value, _) in settings.items():
    setattr(FLAGS, flag_name, flag_value)

  FLAGS.mode = 'training'
  FLAGS.reads = ''
  FLAGS.ref = ''
  options = make_examples.default_options(add_flags=True)

  # Check all of the flags.
  option_values = (
      ('examples', options.examples_filename),
      ('candidates', options.candidates_filename),
      ('gvcf', options.gvcf_filename),
  )
  for name, option_val in option_values:
    if name in settings:
      expected = settings[name][1]
    else:
      expected = ''
    self.assertEqual(expected, option_val)
def test_sharded_outputs1(self, settings):
  """Checks how output flags map onto the options' output filenames.

  Args:
    settings: dict mapping a flag name to a (flag value, expected option
      value) pair. Outputs not present in `settings` are expected to be ''.
  """
  # Set all of the requested flag values.
  for flag_name, (flag_value, _) in settings.items():
    setattr(FLAGS, flag_name, flag_value)

  FLAGS.mode = 'training'
  FLAGS.reads = ''
  FLAGS.ref = ''
  options = make_examples.default_options(add_flags=True)

  # Check all of the flags.
  option_values = (
      ('examples', options.examples_filename),
      ('candidates', options.candidates_filename),
      ('gvcf', options.gvcf_filename),
  )
  for name, option_val in option_values:
    if name in settings:
      expected = settings[name][1]
    else:
      expected = ''
    self.assertEqual(expected, option_val)
def test_make_examples_end2end_failed_on_cram(self):
  """Running on a CRAM file with --use_ref_for_cram off raises ValueError."""
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')

  FLAGS.use_ref_for_cram = False
  FLAGS.write_run_info = True
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_CRAM
  FLAGS.candidates = test_utils.test_tmpfile(_sharded('failed.vsc.tfrecord'))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('failed.examples.tfrecord'))
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = 'calling'
  FLAGS.gvcf_gq_binsize = 5

  options = make_examples.default_options(add_flags=True)

  expected_error = 'Failed to parse BAM/CRAM file.'
  with six.assertRaisesRegex(self, ValueError, expected_error):
    make_examples_core.make_examples_runner(options)
def test_make_examples_with_variant_selection(self,
                                              select_types,
                                              expected_count,
                                              keep_legacy_behavior=False):
  """Counts the candidates written when --select_variant_types is applied.

  Args:
    select_types: value for --select_variant_types, or None to leave the
      flag untouched.
    expected_count: expected number of candidate records written.
    keep_legacy_behavior: value for --keep_legacy_allele_counter_behavior.
  """
  if select_types is not None:
    FLAGS.select_variant_types = select_types

  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
  FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
  FLAGS.partition_size = 1000
  FLAGS.mode = 'calling'
  FLAGS.keep_legacy_allele_counter_behavior = keep_legacy_behavior

  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)

  written_candidates = list(tfrecord.read_tfrecords(FLAGS.candidates))
  self.assertLen(written_candidates, expected_count)
def test_make_examples_end2end_failed_on_mismatched_multi_bam(self):
  """Passing CHR20_BAM together with NOCHR20_BAM raises ValueError."""
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')

  FLAGS.write_run_info = True
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = ','.join([testdata.CHR20_BAM, testdata.NOCHR20_BAM])
  FLAGS.candidates = test_utils.test_tmpfile(
      _sharded('mismatched_multi_bam.vsc.tfrecord'))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('mismatched_multi_bam.examples.tfrecord'))
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = 'calling'
  FLAGS.gvcf_gq_binsize = 5

  options = make_examples.default_options(add_flags=True)

  # The regex below shows an example of what the error message looks like.
  expected_error = ('NOT_FOUND: Unknown reference_name '
                    'reference_name: "chr20" start: 9999999 end: 10000999')
  with six.assertRaisesRegex(self, ValueError, expected_error):
    make_examples_core.make_examples_runner(options)
def test_confident_regions(self):
  """read_confident_regions returns exactly the intervals in the BED file."""
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
  FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED
  FLAGS.mode = 'training'
  FLAGS.examples = ''

  options = make_examples.default_options(add_flags=True)
  confident_regions = make_examples.read_confident_regions(options)

  # Our expected intervals, inlined from CONFIDENT_REGIONS_BED.
  expected_literals = [
      'chr20:10000847-10002407',
      'chr20:10002521-10004171',
      'chr20:10004274-10004964',
      'chr20:10004995-10006386',
      'chr20:10006410-10007800',
      'chr20:10007825-10008018',
      'chr20:10008044-10008079',
      'chr20:10008101-10008707',
      'chr20:10008809-10008897',
      'chr20:10009003-10009791',
      'chr20:10009934-10010531',
  ]
  expected = _from_literals_list(expected_literals)

  # Our confident regions should be exactly those found in the BED file.
  self.assertCountEqual(expected, list(confident_regions))
def test_confident_regions(self):
  """read_confident_regions returns exactly the intervals in the BED file."""
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
  FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED
  FLAGS.mode = 'training'
  FLAGS.examples = ''

  options = make_examples.default_options(add_flags=True)
  confident_regions = make_examples_core.read_confident_regions(options)

  # Our expected intervals, inlined from CONFIDENT_REGIONS_BED.
  expected_literals = [
      'chr20:10000847-10002407',
      'chr20:10002521-10004171',
      'chr20:10004274-10004964',
      'chr20:10004995-10006386',
      'chr20:10006410-10007800',
      'chr20:10007825-10008018',
      'chr20:10008044-10008079',
      'chr20:10008101-10008707',
      'chr20:10008809-10008897',
      'chr20:10009003-10009791',
      'chr20:10009934-10010531',
  ]
  expected = _from_literals_list(expected_literals)

  # Our confident regions should be exactly those found in the BED file.
  six.assertCountEqual(self, expected, list(confident_regions))
def test_make_examples_training_end2end_with_customized_classes_labeler(self):
  """Runs training-mode make_examples with the customized classes labeler."""
  FLAGS.labeler_algorithm = 'customized_classes_labeler'
  FLAGS.customized_classes_labeler_classes_list = 'ref,class1,class2'
  FLAGS.customized_classes_labeler_info_field_name = 'type'

  region = ranges.parse_literal('chr20:10,000,000-10,004,000')
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
  FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
  FLAGS.partition_size = 1000
  FLAGS.mode = 'training'
  FLAGS.gvcf_gq_binsize = 5
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF_WITH_TYPES
  FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)
  golden_file = _sharded(testdata.CUSTOMIZED_CLASSES_GOLDEN_TRAINING_EXAMPLES)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=True)
  golden_examples = list(tfrecord.read_tfrecords(golden_file))
  self.assertDeepVariantExamplesEqual(examples, golden_examples)
def setUp(self):
  """Saves flag values and builds a RegionProcessor with mocked readers."""
  super(RegionProcessorTest, self).setUp()
  self._saved_flags = flagsaver.save_flag_values()
  self.region = ranges.parse_literal('chr20:10,000,000-10,000,100')
  FLAGS.reads = ''

  # `opts` aliases self.options; both names refer to the same proto.
  opts = self.options = make_examples.default_options(add_flags=False)
  opts.reference_filename = testdata.CHR20_FASTA

  # Make sure the first sample has a reads file and a fixed sample name.
  main_sample = opts.sample_options[0]
  if not main_sample.reads_filenames:
    main_sample.reads_filenames.append(testdata.CHR20_BAM)
  main_sample.variant_caller_options.sample_name = 'sample_id'
  main_sample.name = 'sample_id'

  opts.truth_variants_filename = testdata.TRUTH_VARIANTS_VCF
  opts.mode = deepvariant_pb2.MakeExamplesOptions.TRAINING

  self.processor = make_examples_core.RegionProcessor(opts)
  self.ref_reader = fasta.IndexedFastaReader(opts.reference_filename)
  self.mock_init = self.add_mock('initialize')
  # Replace every sample's in-memory SAM reader with a mock.
  for sample in self.processor.samples:
    sample.in_memory_sam_reader = mock.Mock()
  self.default_shape = [5, 5, 7]
  self.default_format = 'raw'
def test_make_examples_end2end(self, mode, num_shards):
  """Runs make_examples over every shard and checks all of its outputs.

  Args:
    mode: 'calling' or 'training'.
    num_shards: number of output shards; one task is run per shard, with at
      least one task run.
  """
  self.assertIn(mode, {'calling', 'training'})
  is_calling = mode == 'calling'
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')

  FLAGS.ref = test_utils.CHR20_FASTA
  FLAGS.reads = test_utils.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile(
      _sharded('vsc.tfrecord', num_shards))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = mode
  if is_calling:
    FLAGS.gvcf = test_utils.test_tmpfile(
        _sharded('gvcf.tfrecord', num_shards))
  else:
    FLAGS.truth_variants = test_utils.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = test_utils.CONFIDENT_REGIONS_BED

  # Run one task per shard; `options` from the last task is reused below.
  num_tasks = max(num_shards, 1)
  for task_id in range(num_tasks):
    FLAGS.task = task_id
    options = make_examples.default_options(add_flags=True)
    make_examples.make_examples_runner(options)

  # Test that our candidates are reasonable, calling specific helper functions
  # to check lots of properties of the output.
  candidates = _sort_candidates(
      io_utils.read_tfrecords(
          FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall))
  self.verify_deepvariant_calls(candidates, options)
  candidate_variants = [call.variant for call in candidates]
  self.verify_variants(candidate_variants, region, options, is_gvcf=False)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=mode == 'training')
  example_variants = [tf_utils.example_variant(ex) for ex in examples]
  self.verify_variants(example_variants, region, options, is_gvcf=False)

  # Verify the integrity of the examples and then check that they match our
  # golden labeled examples. Note we expect the order for both training and
  # calling modes to produce deterministic order because we fix the random
  # seed.
  if is_calling:
    golden_file = _sharded(test_utils.GOLDEN_CALLING_EXAMPLES, num_shards)
  else:
    golden_file = _sharded(test_utils.GOLDEN_TRAINING_EXAMPLES, num_shards)
  golden_examples = list(io_utils.read_tfrecords(golden_file))
  self.assertDeepVariantExamplesEqual(examples, golden_examples)

  if is_calling:
    nist_reader = genomics_io.make_vcf_reader(test_utils.TRUTH_VARIANTS_VCF)
    nist_variants = list(nist_reader.query(region))
    self.verify_nist_concordance(example_variants, nist_variants)

    # Check the quality of our generated gvcf file.
    gvcfs = _sort_variants(
        io_utils.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
    self.verify_variants(gvcfs, region, options, is_gvcf=True)
    self.verify_contiguity(gvcfs, region)
def test_invalid_sequencing_type(self):
  """default_options raises ValueError for the sequencing type 'wGs'."""
  FLAGS.mode = 'training'
  FLAGS.sequencing_type = 'wGs'

  with self.assertRaises(ValueError):
    make_examples.default_options(add_flags=True)
def test_make_examples_end2end(self,
                               mode,
                               num_shards,
                               test_condition=TestConditions.USE_BAM,
                               labeler_algorithm=None,
                               use_fast_pass_aligner=True):
  """Runs make_examples over every shard and checks all of its outputs.

  Args:
    mode: 'calling' or 'training'.
    num_shards: number of output shards; one task is run per shard, with at
      least one task run.
    test_condition: TestConditions value selecting the reads input (a BAM, a
      CRAM, or two BAMs joined with a comma).
    labeler_algorithm: value for --labeler_algorithm, or None to leave the
      flag untouched.
    use_fast_pass_aligner: value for --use_fast_pass_aligner.
  """
  self.assertIn(mode, {'calling', 'training'})
  is_calling = mode == 'calling'
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')

  FLAGS.write_run_info = True
  FLAGS.ref = testdata.CHR20_FASTA
  if test_condition == TestConditions.USE_BAM:
    FLAGS.reads = testdata.CHR20_BAM
  elif test_condition == TestConditions.USE_CRAM:
    FLAGS.reads = testdata.CHR20_CRAM
  elif test_condition == TestConditions.USE_MULTI_BAMS:
    FLAGS.reads = ','.join(
        [testdata.CHR20_BAM_FIRST_HALF, testdata.CHR20_BAM_SECOND_HALF])

  FLAGS.candidates = test_utils.test_tmpfile(
      _sharded('vsc.tfrecord', num_shards))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = mode
  FLAGS.gvcf_gq_binsize = 5
  FLAGS.use_fast_pass_aligner = use_fast_pass_aligner
  if labeler_algorithm is not None:
    FLAGS.labeler_algorithm = labeler_algorithm

  if is_calling:
    FLAGS.gvcf = test_utils.test_tmpfile(
        _sharded('gvcf.tfrecord', num_shards))
  else:
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

  num_tasks = max(num_shards, 1)
  for task_id in range(num_tasks):
    FLAGS.task = task_id
    options = make_examples.default_options(add_flags=True)
    # We need to overwrite bam_fname for the USE_CRAM test since the golden
    # set was generated from a BAM file. The BAM filename is stored in the
    # candidates; without this override the variants won't match and the test
    # fails.
    options.bam_fname = 'NA12878_S1.chr20.10_10p1mb.bam'
    make_examples_core.make_examples_runner(options)

    # Check that our run_info proto contains the basic fields we'd expect:
    # (a) our options are written to the run_info.options field.
    run_info = make_examples_core.read_make_examples_run_info(
        options.run_info_filename)
    self.assertEqual(run_info.options, options)
    # (b) run_info.resource_metrics is present and contains our hostname.
    self.assertTrue(run_info.HasField('resource_metrics'))
    self.assertEqual(run_info.resource_metrics.host_name, platform.node())

  # Test that our candidates are reasonable, calling specific helper functions
  # to check lots of properties of the output.
  candidates = sorted(
      tfrecord.read_tfrecords(
          FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
      key=lambda c: variant_utils.variant_range_tuple(c.variant))
  self.verify_deepvariant_calls(candidates, options)
  candidate_variants = [call.variant for call in candidates]
  self.verify_variants(candidate_variants, region, options, is_gvcf=False)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=mode == 'training')
  example_variants = [tf_utils.example_variant(ex) for ex in examples]
  self.verify_variants(example_variants, region, options, is_gvcf=False)

  # Verify the integrity of the examples and then check that they match our
  # golden labeled examples. Note we expect the order for both training and
  # calling modes to produce deterministic order because we fix the random
  # seed.
  if is_calling:
    golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
  else:
    golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
  golden_examples = list(tfrecord.read_tfrecords(golden_file))
  self.assertDeepVariantExamplesEqual(examples, golden_examples)

  if is_calling:
    nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
    nist_variants = list(nist_reader.query(region))
    self.verify_nist_concordance(example_variants, nist_variants)

    # Check the quality of our generated gvcf file.
    gvcfs = variant_utils.sorted_variants(
        tfrecord.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
    self.verify_variants(gvcfs, region, options, is_gvcf=True)
    self.verify_contiguity(gvcfs, region)
    gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                num_shards)
    expected_gvcfs = list(
        tfrecord.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
    # Despite the name, assertCountEqual checks that all elements match.
    self.assertCountEqual(gvcfs, expected_gvcfs)

  if (mode == 'training' and num_shards == 0 and
      labeler_algorithm != 'positional_labeler'):
    # The positional labeler doesn't track metrics, so don't try to read them
    # in when that's the mode.
    golden_run_info = make_examples_core.read_make_examples_run_info(
        testdata.GOLDEN_MAKE_EXAMPLES_RUN_INFO)
    self.assertEqual(golden_run_info.labeling_metrics,
                     run_info.labeling_metrics)
def test_make_examples_end2end(self, mode, num_shards, labeler_algorithm=None):
  """Runs make_examples over every shard and checks all of its outputs.

  Args:
    mode: 'calling' or 'training'.
    num_shards: number of output shards; one task is run per shard, with at
      least one task run.
    labeler_algorithm: value for --labeler_algorithm, or None to leave the
      flag untouched.
  """
  # Show full diffs when an equality assertion fails.
  self.maxDiff = None
  self.assertIn(mode, {'calling', 'training'})
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile(
      _sharded('vsc.tfrecord', num_shards))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = mode
  FLAGS.gvcf_gq_binsize = 5
  if labeler_algorithm is not None:
    FLAGS.labeler_algorithm = labeler_algorithm

  if mode == 'calling':
    FLAGS.gvcf = test_utils.test_tmpfile(
        _sharded('gvcf.tfrecord', num_shards))
  else:
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

  # Run one task per shard; `options` from the last task is reused below.
  for task_id in range(max(num_shards, 1)):
    FLAGS.task = task_id
    options = make_examples.default_options(add_flags=True)
    make_examples.make_examples_runner(options)

  # Test that our candidates are reasonable, calling specific helper functions
  # to check lots of properties of the output.
  candidates = sorted(
      io_utils.read_tfrecords(
          FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
      key=lambda c: variant_utils.variant_range_tuple(c.variant))
  self.verify_deepvariant_calls(candidates, options)
  self.verify_variants(
      [call.variant for call in candidates], region, options, is_gvcf=False)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=mode == 'training')
  example_variants = [tf_utils.example_variant(ex) for ex in examples]
  self.verify_variants(example_variants, region, options, is_gvcf=False)

  # Verify the integrity of the examples and then check that they match our
  # golden labeled examples. Note we expect the order for both training and
  # calling modes to produce deterministic order because we fix the random
  # seed.
  if mode == 'calling':
    golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
  else:
    golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
  self.assertDeepVariantExamplesEqual(
      examples, list(io_utils.read_tfrecords(golden_file)))

  if mode == 'calling':
    nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
    nist_variants = list(nist_reader.query(region))
    self.verify_nist_concordance(example_variants, nist_variants)

    # Check the quality of our generated gvcf file.
    gvcfs = variant_utils.sorted_variants(
        io_utils.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
    self.verify_variants(gvcfs, region, options, is_gvcf=True)
    self.verify_contiguity(gvcfs, region)
    gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                num_shards)
    expected_gvcfs = list(
        io_utils.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
    # assertItemsEqual only exists on Python 2's unittest.TestCase; the six
    # wrapper dispatches to the right method on both Python 2 and Python 3.
    # Despite the name, it checks that all elements match.
    six.assertCountEqual(self, gvcfs, expected_gvcfs)