示例#1
0
    def process(self, region):
        """Finds candidates and creates corresponding examples in a region.

        Args:
          region: A nucleus.genomics.v1.Range proto. Specifies the region on
            the genome we should process.

        Returns:
          Three values. First is a list of the found candidates, which are
          deepvariant.DeepVariantCall objects. The second value is a list of
          filled in tf.Example protos. For example, these will include the
          candidate variant, the pileup image, and, if in training mode, the
          truth variants and labels needed for training. The third value is a
          list of nucleus.genomics.v1.Variant protos containing gVCF
          information for all reference sites, if gvcf generation is enabled,
          otherwise returns [].
        """
        region_timer = timer.TimerStart()

        # Run the one-time setup lazily, the first time a region is processed.
        if not self.initialized:
            self._initialize()

        # Swap the reads held by the in-memory reader for this region's reads.
        self.in_memory_sam_reader.replace_reads(self.region_reads(region))
        candidates, gvcfs = self.candidates_in_region(region)

        if in_training_mode(self.options):
            # Training: every pileup example of a labeled candidate gets that
            # candidate's label attached.
            examples = [
                self.add_label_to_example(example, label)
                for candidate, label in self.label_candidates(
                    candidates, region)
                for example in self.create_pileup_examples(candidate)
            ]
        else:
            # Calling: one or more unlabeled pileup examples per candidate.
            examples = [
                example for candidate in candidates
                for example in self.create_pileup_examples(candidate)
            ]

        # The message reports *candidates*, so count candidates here; the
        # number of examples can differ because a single candidate may yield
        # several examples.
        logging.info('Found %s candidates in %s [%d bp] [%0.2fs elapsed]',
                     len(candidates), ranges.to_literal(region),
                     ranges.length(region), region_timer.Stop())
        return candidates, examples, gvcfs
示例#2
0
def make_examples_runner(options):
    """Runs the examples creation stage of deepvariant.

    Processes every region derived from `options`, writes the resulting
    candidates, gvcf records and examples through an OutputsWriter, logs
    periodic progress, and optionally writes a MakeExamplesRunInfo proto.

    Args:
      options: The options for this run. Read here for the output filenames
        (examples, candidates, gvcf, run info), the task id and the labeler
        algorithm name; also handed to the region processor and the writer.
    """
    resource_monitor = resources.ResourceMonitor().start()
    logging.info('Preparing inputs')
    regions = processing_regions_from_options(options)

    # A single processor produces the candidates and examples of each region.
    region_processor = RegionProcessor(options)

    logging.info('Writing examples to %s', options.examples_filename)
    if options.candidates_filename:
        logging.info('Writing candidates to %s', options.candidates_filename)
    if options.gvcf_filename:
        logging.info('Writing gvcf records to %s', options.gvcf_filename)

    total_candidates = 0
    total_examples = 0
    reported_bucket = 0
    with OutputsWriter(options) as writer:
        running_timer = timer.TimerStart()
        for region_number, region in enumerate(regions, start=1):
            candidates, examples, gvcfs = region_processor.process(region)
            total_candidates += len(candidates)
            total_examples += len(examples)

            writer.write_candidates(*candidates)
            # Only touch the gvcf output when there is something to write.
            # With gvcf generation turned off gvcfs is always empty, so this
            # check also keeps us away from an output that is not in use.
            if gvcfs:
                writer.write_gvcfs(*gvcfs)
            writer.write_examples(*examples)

            # Progress is reported for the first region and then each time
            # the candidate total crosses another multiple of N.
            current_bucket = int(total_candidates /
                                 FLAGS.logging_every_n_candidates)
            if region_number != 1 and current_bucket <= reported_bucket:
                continue

            reported_bucket = current_bucket
            logging.info(
                'Task %s: %s candidates (%s examples) [%0.2fs elapsed]',
                options.task_id, total_candidates, total_examples,
                running_timer.Stop())
            # Restart so the next report shows time since this one.
            running_timer = timer.TimerStart()

    # Build the MakeExamplesRunInfo proto and write it out, if requested.
    if options.run_info_filename:
        run_info = deepvariant_pb2.MakeExamplesRunInfo(
            options=options, resource_metrics=resource_monitor.metrics())
        if in_training_mode(options):
            labeling_metrics = region_processor.labeler.metrics
            if labeling_metrics is None:
                logging.warning(
                    'Labeling metrics requested but the selected labeling '
                    'algorithm %s does not collect metrics; skipping.',
                    options.labeler_algorithm)
            else:
                run_info.labeling_metrics.CopyFrom(labeling_metrics)
        logging.info('Writing MakeExamplesRunInfo to %s',
                     options.run_info_filename)
        write_make_examples_run_info(run_info, path=options.run_info_filename)

    logging.info('Found %s candidate variants', total_candidates)
    logging.info('Created %s examples', total_examples)