def process(self, region):
    """Finds candidates and creates corresponding examples in a region.

    Args:
      region: A nucleus.genomics.v1.Range proto. Specifies the region on the
        genome we should process.

    Returns:
      Three values. First is a list of the found candidates, which are
      deepvariant.DeepVariantCall objects. The second value is a list of filled
      in tf.Example protos. For example, these will include the candidate
      variant, the pileup image, and, if in training mode, the truth variants
      and labels needed for training. The third value is a list of
      nucleus.genomics.v1.Variant protos containing gVCF information for all
      reference sites, if gvcf generation is enabled, otherwise returns [].
    """
    region_timer = timer.TimerStart()

    # Perform one-time setup lazily, the first time a region is processed.
    if not self.initialized:
        self._initialize()

    # Load this region's reads into the in-memory reader before calling.
    self.in_memory_sam_reader.replace_reads(self.region_reads(region))
    candidates, gvcfs = self.candidates_in_region(region)

    if in_training_mode(self.options):
        # In training mode every pileup example is annotated with the label
        # paired with its candidate by label_candidates().
        examples = [
            self.add_label_to_example(example, label)
            for candidate, label in self.label_candidates(candidates, region)
            for example in self.create_pileup_examples(candidate)
        ]
    else:
        examples = [
            example
            for candidate in candidates
            for example in self.create_pileup_examples(candidate)
        ]

    # Report the number of candidates (not examples): the message says
    # "candidates", and a candidate may produce a different number of examples.
    logging.info('Found %s candidates in %s [%d bp] [%0.2fs elapsed]',
                 len(candidates), ranges.to_literal(region),
                 ranges.length(region), region_timer.Stop())
    return candidates, examples, gvcfs
def make_examples_runner(options):
    """Runs examples creation stage of deepvariant.

    Processes every region derived from `options` with a RegionProcessor,
    writes the resulting candidates, gVCF records and examples through an
    OutputsWriter, logs progress along the way and, when
    `options.run_info_filename` is set, writes a MakeExamplesRunInfo proto.

    Args:
      options: The options object passed to processing_regions_from_options,
        RegionProcessor and OutputsWriter; its *_filename fields and task_id
        are also read here for logging and output.
    """
    resource_monitor = resources.ResourceMonitor().start()
    logging.info('Preparing inputs')
    regions = processing_regions_from_options(options)

    # A single processor creates candidates and examples for all regions.
    region_processor = RegionProcessor(options)

    logging.info('Writing examples to %s', options.examples_filename)
    if options.candidates_filename:
        logging.info('Writing candidates to %s', options.candidates_filename)
    if options.gvcf_filename:
        logging.info('Writing gvcf records to %s', options.gvcf_filename)

    total_candidates = 0
    total_examples = 0
    last_reported = 0
    with OutputsWriter(options) as writer:
        running_timer = timer.TimerStart()
        for region_number, region in enumerate(regions, start=1):
            candidates, examples, gvcfs = region_processor.process(region)
            total_candidates += len(candidates)
            total_examples += len(examples)

            writer.write_candidates(*candidates)

            # Only touch the gvcf output when there are records to write. With
            # gvcf generation turned off `gvcfs` is always empty, so this check
            # also keeps us from writing to an output that was never set up.
            if gvcfs:
                writer.write_gvcfs(*gvcfs)
            writer.write_examples(*examples)

            # Emit a progress line for the first region and each time the
            # running candidate total reaches a new multiple of
            # FLAGS.logging_every_n_candidates.
            progress_bucket = int(
                total_candidates / FLAGS.logging_every_n_candidates)
            if progress_bucket <= last_reported and region_number != 1:
                continue

            last_reported = progress_bucket
            logging.info(
                'Task %s: %s candidates (%s examples) [%0.2fs elapsed]',
                options.task_id, total_candidates, total_examples,
                running_timer.Stop())
            # Restart so the next report covers only the time since this one.
            running_timer = timer.TimerStart()

    # Build and write the MakeExamplesRunInfo proto, if requested.
    if options.run_info_filename:
        run_info = deepvariant_pb2.MakeExamplesRunInfo(
            options=options, resource_metrics=resource_monitor.metrics())
        if in_training_mode(options):
            labeling_metrics = region_processor.labeler.metrics
            if labeling_metrics is None:
                logging.warning(
                    'Labeling metrics requested but the selected labeling '
                    'algorithm %s does not collect metrics; skipping.',
                    options.labeler_algorithm)
            else:
                run_info.labeling_metrics.CopyFrom(labeling_metrics)
        logging.info('Writing MakeExamplesRunInfo to %s',
                     options.run_info_filename)
        write_make_examples_run_info(run_info, path=options.run_info_filename)

    logging.info('Found %s candidate variants', total_candidates)
    logging.info('Created %s examples', total_examples)