def log_graph_metrics(self, region, graph, candidate_haplotypes,
                      graph_building_time):
    """Records graph construction diagnostics for region, when enabled.

    Does nothing unless self.enabled is truthy. Otherwise, if a graph was
    built, its graphviz rendering is written to the region-specific graph
    file, and one CSV row is always emitted for the region.

    Args:
      region: the region the graph was built for; rendered with
        ranges.to_literal for the CSV row.
      graph: the constructed graph, or a falsy value if none was built.
      candidate_haplotypes: sized collection of haplotypes; only its length is
        logged.
      graph_building_time: value logged as-is in the last CSV column.
    """
    if not self.enabled:
        return

    if graph:
        graph_path = self._file_for_region(region, self.graph_filename)
        with tf.gfile.FastGFile(graph_path, 'w') as graph_file:
            graph_file.write(graph.graphviz())

    # Without a graph there is no kmer size to report, so log a placeholder.
    region_literal = ranges.to_literal(region)
    kmer_size = graph.kmer_size if graph else 'NA'
    haplotype_count = len(candidate_haplotypes)
    self._write_csv_line(region_literal, kmer_size, haplotype_count,
                         graph_building_time)
def test_catches_bad_flags(self):
    """Checks main() logs an error and exits when confident_regions is empty.

    All flags are set for a training-mode run, but confident_regions is left
    empty. logging.error and sys.exit are patched so the test can inspect how
    main() reports the problem.
    """
    # Set all of the requested flag values.
    target_region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.ref = test_utils.CHR20_FASTA
    FLAGS.reads = test_utils.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile('vsc.tfrecord')
    FLAGS.examples = test_utils.test_tmpfile('examples.tfrecord')
    FLAGS.regions = [ranges.to_literal(target_region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = 'training'
    FLAGS.truth_variants = test_utils.TRUTH_VARIANTS_VCF
    # This is the bad flag.
    FLAGS.confident_regions = ''

    with mock.patch.object(logging, 'error') as patched_error:
        with mock.patch.object(sys, 'exit') as patched_exit:
            make_examples.main(['make_examples.py'])

    patched_error.assert_called_once_with(
        'confident_regions is required when in training mode.')
    patched_exit.assert_called_once_with(errno.ENOENT)
def process(self, region):
    """Finds candidates and creates corresponding examples in a region.

    Args:
      region: A nucleus.genomics.v1.Range proto. Specifies the region on the
        genome we should process.

    Returns:
      Three values. First is a list of the found candidates, which are
      deepvariant.DeepVariantCall objects. The second value is a list of filled
      in tf.Example protos. For example, these will include the candidate
      variant, the pileup image, and, if in training mode, the truth variants
      and labels needed for training. The third value is a list of
      nucleus.genomics.v1.Variant protos containing gVCF information for all
      reference sites, if gvcf generation is enabled, otherwise returns [].
    """
    region_timer = timer.TimerStart()

    # Run the one-time setup on first use.
    if not self.initialized:
        self._initialize()

    self.in_memory_sam_reader.replace_reads(self.region_reads(region))
    candidates, gvcfs = self.candidates_in_region(region)

    examples = []
    for candidate in candidates:
        for example in self.create_pileup_examples(candidate):
            # In training mode keep only the examples label_variant accepts;
            # otherwise every example is kept (and label_variant is not called).
            if (not in_training_mode(self.options) or
                    self.label_variant(example, candidate.variant)):
                examples.append(example)

    # Report candidates and examples separately: one candidate may produce
    # zero or several examples, so the two counts are not interchangeable.
    logging.info('Found %s candidates (%s examples) in %s [%0.2fs elapsed]',
                 len(candidates), len(examples), ranges.to_literal(region),
                 region_timer.Stop())

    # Useful for debugging what examples are emitted...
    # for example in examples:
    #   logging.info('  example: %s', tf_utils.example_key(example))

    return candidates, examples, gvcfs
def make_example(variant, alt_alleles, encoded_image, shape, image_format):
    """Builds a tf.Example carrying the standard DeepVariant features.

    Args:
      variant: third_party.nucleus.protos.Variant protobuf containing
        information about a candidate variant call.
      alt_alleles: A set of strings. Indicates the alternate alleles used as
        "alt" when constructing the image. Each must appear in
        variant.alternate_bases.
      encoded_image: a Tensor of type tf.string. Should contain an image
        encoding the reference and read data supporting variant. The encoding
        should be consistent with the image_format argument.
      shape: a list of (width, height, channel).
      image_format: string. The scheme used to encode our image.

    Returns:
      A tf.Example proto containing the standard DeepVariant features.
    """
    example = example_pb2.Example()
    feature_map = example.features.feature

    locus_range = ranges.make_range(
        variant.reference_name, variant.start, variant.end)
    feature_map['locus'].bytes_list.value.append(
        ranges.to_literal(locus_range))
    feature_map['variant/encoded'].bytes_list.value.append(
        variant.SerializeToString())

    # Store the selected alts as sorted positions within the variant's own
    # alternate_bases list.
    variant_alts = list(variant.alternate_bases)
    selected_indices = sorted(
        [variant_alts.index(allele) for allele in alt_alleles])
    indices_proto = deepvariant_pb2.CallVariantsOutput.AltAlleleIndices(
        indices=selected_indices)
    feature_map['alt_allele_indices/encoded'].bytes_list.value.append(
        indices_proto.SerializeToString())

    feature_map['image/encoded'].bytes_list.value.append(encoded_image)
    feature_map['image/format'].bytes_list.value.append(image_format)
    feature_map['image/shape'].int64_list.value.extend(shape)
    return example
def test_to_literal(self):
    """Checks that the range ('chr1', 0, 20) renders as 'chr1:1-20'."""
    interval = ranges.make_range('chr1', 0, 20)
    literal = ranges.to_literal(interval)
    self.assertEqual(literal, 'chr1:1-20')
def _file_for_region(self, region, basename):
    """Returns the path of basename inside region's own subdirectory.

    The subdirectory is named after the region's literal form and is resolved
    through self._root_join. Must only be called while diagnostics are enabled.
    """
    assert self.enabled, 'only callable when diagnostics are on'
    region_dirname = ranges.to_literal(region)
    relative_path = os.path.join(region_dirname, basename)
    return self._root_join(relative_path)
def test_make_examples_end2end(self, mode, num_shards):
    """Runs make_examples for every shard and validates all of its outputs.

    Args:
      mode: either 'calling' or 'training'. Selects which flags are set and
        which golden examples file the output is compared against.
      num_shards: number of output shards; max(num_shards, 1) tasks are run.
    """
    self.assertIn(mode, {'calling', 'training'})
    is_calling = mode == 'calling'

    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.ref = test_utils.CHR20_FASTA
    FLAGS.reads = test_utils.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile(
        _sharded('vsc.tfrecord', num_shards))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = mode

    if is_calling:
        FLAGS.gvcf = test_utils.test_tmpfile(
            _sharded('gvcf.tfrecord', num_shards))
    else:
        FLAGS.truth_variants = test_utils.TRUTH_VARIANTS_VCF
        FLAGS.confident_regions = test_utils.CONFIDENT_REGIONS_BED

    # Run one task per shard. The options of the last task are reused by the
    # verification helpers below.
    for shard_index in range(max(num_shards, 1)):
        FLAGS.task = shard_index
        options = make_examples.default_options(add_flags=True)
        make_examples.make_examples_runner(options)

    # Test that our candidates are reasonable, calling specific helper
    # functions to check lots of properties of the output.
    candidate_records = io_utils.read_tfrecords(
        FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall)
    candidates = _sort_candidates(candidate_records)
    self.verify_deepvariant_calls(candidates, options)
    candidate_variants = [call.variant for call in candidates]
    self.verify_variants(candidate_variants, region, options, is_gvcf=False)

    # Verify that the variants in the examples are all good.
    expect_labels = mode == 'training'
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=expect_labels)
    example_variants = [tf_utils.example_variant(ex) for ex in examples]
    self.verify_variants(example_variants, region, options, is_gvcf=False)

    # Verify the integrity of the examples and then check that they match our
    # golden labeled examples. Note we expect the order for both training and
    # calling modes to produce deterministic order because we fix the random
    # seed.
    golden_source = (
        test_utils.GOLDEN_CALLING_EXAMPLES
        if is_calling else test_utils.GOLDEN_TRAINING_EXAMPLES)
    golden_file = _sharded(golden_source, num_shards)
    golden_examples = list(io_utils.read_tfrecords(golden_file))
    self.assertDeepVariantExamplesEqual(examples, golden_examples)

    # The remaining checks apply to calling mode only.
    if not is_calling:
        return

    truth_reader = genomics_io.make_vcf_reader(test_utils.TRUTH_VARIANTS_VCF)
    truth_variants = list(truth_reader.query(region))
    self.verify_nist_concordance(example_variants, truth_variants)

    # Check the quality of our generated gvcf file.
    gvcf_records = io_utils.read_tfrecords(
        FLAGS.gvcf, proto=variants_pb2.Variant)
    gvcfs = _sort_variants(gvcf_records)
    self.verify_variants(gvcfs, region, options, is_gvcf=True)
    self.verify_contiguity(gvcfs, region)