Exemplo n.º 1
0
    def test_realigner_end2end(self):
        """Runs the realigner over the chr20 test region, piece by piece.

        For every piece yielded by ``partition(1000)`` the test checks that all
        input reads come back, that at least one window was produced, and that
        the reference bases of each window's span are among its haplotypes.
        """
        fasta_reader = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
        realigner_under_test = realigner.Realigner(
            realigner.realigner_config(FLAGS), fasta_reader)

        whole_region = ranges.RangeSet.from_regions(
            ['chr20:10,000,000-10,009,999'])
        for piece in whole_region.partition(1000):
            # A reader is opened for each piece and closed as soon as its reads
            # have been materialized into a list.
            with genomics_io.make_sam_reader(
                    test_utils.CHR20_BAM,
                    core_pb2.ReadRequirements()) as bam_reader:
                reads_before = list(bam_reader.query(piece))
            assembled_windows, reads_after = (
                realigner_under_test.realign_reads(reads_before, piece))

            # Comparing fragment names (not just counts) verifies that every
            # read we sent in was returned.
            names_before = [read.fragment_name for read in reads_before]
            names_after = [read.fragment_name for read in reads_after]
            self.assertCountEqual(names_before, names_after)

            # At least one window must have been assembled in this piece.
            self.assertNotEqual(0, len(assembled_windows))

            # The reference sequence is always expected among the haplotypes.
            for assembled in assembled_windows:
                expected_ref = fasta_reader.bases(assembled.span)
                self.assertIn(expected_ref, set(assembled.haplotypes))
def main(argv=()):
    """Converts CallVariantsOutput records from FLAGS.infile into a VCF.

    The records are first processed into a temporary file, which is then
    handed to the VCF writer as its sorted input.

    Args:
        argv: Command-line arguments. If more than one element is present,
            errors.log_and_raise is invoked with errors.CommandLineError.
    """
    with errors.clean_commandline_error_exit():
        if len(argv) > 1:
            message = (
                'Command line parsing failure: postprocess_variants does not accept '
                'positional arguments but some are present on the command line: '
                '"{}".').format(str(argv))
            errors.log_and_raise(message, errors.CommandLineError)
        del argv  # Unused.
        proto_utils.uses_fast_cpp_protos_or_die()

        logging_level.set_from_flag()

        # Only the contigs are needed from the reference, so the reader is
        # closed immediately after they are read.
        with genomics_io.make_ref_reader(FLAGS.ref) as ref_reader:
            ref_contigs = ref_reader.contigs
        shard_paths = io_utils.maybe_generate_sharded_filenames(FLAGS.infile)
        with tempfile.NamedTemporaryFile() as sorted_file:
            sorted_path = sorted_file.name
            postprocess_variants_lib.process_single_sites_tfrecords(
                ref_contigs, shard_paths, sorted_path)
            # The sample name is taken from a single CallVariantsOutput record.
            # This assumes every record in the infile holds exactly one
            # VariantCall in its Variant proto, and that call_set_name is the
            # same across all records.
            first_records = io_utils.read_tfrecords(
                shard_paths[0],
                proto=deepvariant_pb2.CallVariantsOutput,
                max_records=1)
            name_of_sample = _extract_single_sample_name(next(first_records))
            write_call_variants_output_to_vcf(
                contigs=ref_contigs,
                input_sorted_tfrecord_path=sorted_path,
                output_vcf_path=FLAGS.outfile,
                qual_filter=FLAGS.qual_filter,
                multi_allelic_qual_filter=FLAGS.multi_allelic_qual_filter,
                sample_name=name_of_sample)
Exemplo n.º 3
0
    def _initialize(self):
        """Creates the readers and helper objects this instance works with.

        Raises:
            ValueError: if this object has already been initialized.
        """
        if self.initialized:
            raise ValueError('Cannot initialize this object twice')

        opts = self.options

        # Readers: reference, reads, and an initially empty in-memory reader.
        self.ref_reader = genomics_io.make_ref_reader(opts.reference_filename)
        self.sam_reader = self._make_sam_reader()
        self.in_memory_sam_reader = utils.InMemorySamReader([])

        # The realigner is only created when enabled in the options.
        if opts.realigner_enabled:
            self.realigner = realigner.Realigner(opts.realigner_options,
                                                 self.ref_reader)

        # Note that the pileup image creator is wired to the in-memory reader,
        # not to self.sam_reader.
        self.pic = pileup_image.PileupImageCreator(
            ref_reader=self.ref_reader,
            sam_reader=self.in_memory_sam_reader,
            options=opts.pic_options)

        # A labeler is only set up when running in training mode.
        if in_training_mode(opts):
            truth_reader = genomics_io.make_vcf_reader(
                opts.truth_variants_filename)
            confident_regions = read_confident_regions(opts)
            self.labeler = variant_labeler.VariantLabeler(
                truth_reader, confident_regions)

        self.variant_caller = variant_caller.VariantCaller(
            opts.variant_caller_options)
        self.random = np.random.RandomState(opts.random_seed)
        self.initialized = True
    def test_call_from_allele_counter(self):
        """Calls candidates from an AlleleCounter filled with chr20 test reads.

        Verifies that the region has reads, that the caller yields a non-empty
        collection of candidates, and that each one is a DeepVariantCall.
        """
        fasta_reader = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
        bam_reader = genomics_io.make_sam_reader(test_utils.CHR20_BAM)
        width = 1000
        start = 10000000
        target = ranges.make_range('chr20', start, start + width)

        counter_options = deepvariant_pb2.AlleleCounterOptions(
            partition_size=width)
        counter = _allelecounter.AlleleCounter(
            fasta_reader, target, counter_options)

        caller_options = deepvariant_pb2.VariantCallerOptions(
            min_count_snps=2,
            min_count_indels=2,
            min_fraction_snps=0.12,
            min_fraction_indels=0.12,
            sample_name='sample_name',
            p_error=0.001,
            max_gq=50,
            gq_resolution=1,
            ploidy=2)
        caller_under_test = variant_calling.VariantCaller(caller_options)

        # Feed every read overlapping the target region to the counter.
        target_reads = list(bam_reader.query(target))
        self.assertNotEmpty(target_reads)
        for one_read in target_reads:
            counter.add(one_read)

        # Ask the caller for the candidates over the whole region.
        called = caller_under_test.calls_from_allele_counter(counter)

        # Some candidates are expected, each of them a DeepVariantCall.
        self.assertNotEmpty(called)
        for call in called:
            self.assertIsInstance(call, deepvariant_pb2.DeepVariantCall)
    def test_straightforward_region(self):
        """The graph over a simple region has the reference as only haplotype.

        The graph is built with the options from ``single_k_dbg_options(30)``.
        """
        fasta = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
        bam = genomics_io.make_sam_reader(test_utils.CHR20_BAM)
        target = ranges.parse_literal('chr20:10,000,000-10,000,100')
        reference_bases = fasta.bases(target)
        reads_in_target = list(bam.query(target))

        graph = debruijn_graph.build(
            reference_bases, reads_in_target, self.single_k_dbg_options(30))

        self.assertIsNotNone(graph)
        self.assertEqual([reference_bases], graph.candidate_haplotypes())
 def test_complex_region(self):
     """Checks graph construction over a region with a known deletion.

     The region chr20:10,095,379-10,095,500 contains a heterozygous 9 bp
     deletion of a tandem TGA repeat.
     """
     fasta = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
     bam = genomics_io.make_sam_reader(test_utils.CHR20_BAM)
     locus = ranges.parse_literal('chr20:10,095,379-10,095,500')
     reference_bases = fasta.bases(locus)
     locus_reads = list(bam.query(locus))

     graph = debruijn_graph.build(
         reference_bases, locus_reads, self.dbg_options())

     self.assertIsNotNone(graph)
     self.assertEqual(44, graph.kmer_size)
     # Exactly two haplotypes are expected, one of them the reference.
     self.assertEqual(2, len(graph.candidate_haplotypes()))
     self.assertIn(reference_bases, graph.candidate_haplotypes())
Exemplo n.º 7
0
def processing_regions_from_options(options):
    """Computes the regions to process from our options.

    Reads the contigs of the reference and reads inputs (and of the truth
    variants when in training mode), reduces them to the common contigs,
    validates how well those cover the reference, and then asks
    regions_to_process for the regions belonging to this task.

    Args:
        options: deepvariant.DeepVariantOptions proto containing information
            about our input data sources.

    Returns:
        The regions to process, exactly as returned by regions_to_process.
    """
    # Each reader is needed only for its contigs, so it is closed as soon as
    # they have been read instead of being left open.
    with genomics_io.make_ref_reader(
            options.reference_filename) as ref_reader:
        ref_contigs = ref_reader.contigs
    with genomics_io.make_sam_reader(options.reads_filename) as sam_reader:
        sam_contigs = sam_reader.contigs

    # The truth VCF contigs only take part when in training mode.
    vcf_contigs = None
    if in_training_mode(options):
        with genomics_io.make_vcf_reader(
                options.truth_variants_filename) as vcf_reader:
            vcf_contigs = vcf_reader.contigs

    # Compute the common contigs among our inputs, and check that the contigs
    # are sufficiently consistent among each other.
    contigs = common_contigs(only_true(ref_contigs, sam_contigs, vcf_contigs),
                             exclude_contig_names=options.exclude_contigs)
    validate_reference_contig_coverage(ref_contigs, contigs,
                                       options.min_shared_contigs_basepairs)
    logging.info('Common contigs are %s', [c.name for c in contigs])

    regions = regions_to_process(
        contigs,
        partition_size=options.allele_counter_options.partition_size,
        calling_regions=ranges.RangeSet.from_regions(
            options.calling_regions, ranges.contigs_dict(ref_contigs)),
        task_id=options.task_id,
        num_shards=options.num_shards)

    return regions
 def test_remote_fasta(self):
   """Bases read from REMOTE_FASTA over the query window match expectations."""
   remote_reader = genomics_io.make_ref_reader(REMOTE_FASTA)
   observed_bases = remote_reader.bases(self.query_window)
   self.assertEqual(EXPECTED_REFERENCE_SEQUENCE, observed_bases)
Exemplo n.º 9
0
 def setUp(self):
     """Creates the reference reader, realigner config and realigner fixtures."""
     fasta_reader = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
     realign_config = realigner.realigner_config(FLAGS)

     self.ref_reader = fasta_reader
     self.config = realign_config
     self.reads_realigner = realigner.Realigner(realign_config, fasta_reader)
Exemplo n.º 10
0
def main(argv=()):
    """Writes a VCF, and optionally a gVCF, from CallVariantsOutput records.

    The records from FLAGS.infile are processed into a temporary file, turned
    into variants, passed through haplotypes.maybe_resolve_conflicting_variants
    and written to FLAGS.outfile. When FLAGS.nonvariant_site_tfrecord_path is
    set, the VCF just written is read back, merged with the non-variant
    records and written to FLAGS.gvcf_outfile.

    Args:
        argv: Command-line arguments. If more than one element is present,
            errors.log_and_raise is invoked with errors.CommandLineError.
    """
    with errors.clean_commandline_error_exit():
        if len(argv) > 1:
            message = (
                'Command line parsing failure: postprocess_variants does not accept '
                'positional arguments but some are present on the command line: '
                '"{}".').format(str(argv))
            errors.log_and_raise(message, errors.CommandLineError)
        del argv  # Unused.

        # The two gVCF flags must be given together or not at all.
        has_nonvariant_input = bool(FLAGS.nonvariant_site_tfrecord_path)
        has_gvcf_output = bool(FLAGS.gvcf_outfile)
        if has_nonvariant_input != has_gvcf_output:
            errors.log_and_raise(
                'gVCF creation requires both nonvariant_site_tfrecord_path and '
                'gvcf_outfile flags to be set.', errors.CommandLineError)

        proto_utils.uses_fast_cpp_protos_or_die()

        logging_level.set_from_flag()

        # Only the contigs are needed from the reference, so the reader is
        # closed immediately after they are read.
        with genomics_io.make_ref_reader(FLAGS.ref) as ref_reader:
            ref_contigs = ref_reader.contigs
        shard_paths = io_utils.maybe_generate_sharded_filenames(FLAGS.infile)
        with tempfile.NamedTemporaryFile() as sorted_file:
            sorted_path = sorted_file.name
            postprocess_variants_lib.process_single_sites_tfrecords(
                ref_contigs, shard_paths, sorted_path)
            # The sample name is taken from a single CallVariantsOutput record.
            # This assumes every record in the infile holds exactly one
            # VariantCall in its Variant proto, and that call_set_name is the
            # same across all records.
            first_records = io_utils.read_tfrecords(
                shard_paths[0],
                proto=deepvariant_pb2.CallVariantsOutput,
                max_records=1)
            name_of_sample = _extract_single_sample_name(next(first_records))
            unresolved_variants = _transform_call_variants_output_to_variants(
                input_sorted_tfrecord_path=sorted_path,
                qual_filter=FLAGS.qual_filter,
                multi_allelic_qual_filter=FLAGS.multi_allelic_qual_filter,
                sample_name=name_of_sample)
            resolved_variants = haplotypes.maybe_resolve_conflicting_variants(
                unresolved_variants)
            write_variants_to_vcf(
                contigs=ref_contigs,
                variant_generator=resolved_variants,
                output_vcf_path=FLAGS.outfile,
                sample_name=name_of_sample)

        # Also write out the gVCF file if it was provided.
        if has_nonvariant_input:
            nonvariant_records = io_utils.read_shard_sorted_tfrecords(
                FLAGS.nonvariant_site_tfrecord_path,
                key=_get_contig_based_variant_sort_keyfn(ref_contigs),
                proto=variants_pb2.Variant)
            # The VCF written above is read back as the variant side of the
            # merge.
            with genomics_io.make_vcf_reader(
                    FLAGS.outfile, use_index=False,
                    include_likelihoods=True) as written_vcf:
                is_less_than = _get_contig_based_lessthan(written_vcf.contigs)
                variant_records = (_transform_to_gvcf_record(called_variant)
                                   for called_variant in written_vcf.iterate())
                merged_records = merge_variants_and_nonvariants(
                    variant_records, nonvariant_records, is_less_than)
                write_variants_to_vcf(
                    contigs=ref_contigs,
                    variant_generator=merged_records,
                    output_vcf_path=FLAGS.gvcf_outfile,
                    sample_name=name_of_sample,
                    filters=FILTERS)
Exemplo n.º 11
0
 def test_make_ref_reader(self, fasta_filename):
   """Reads a short chrM range from the given test FASTA and checks its bases.

   Args:
     fasta_filename: Name of a FASTA file, resolved through
       test_utils.genomics_core_testdata.
   """
   path_to_fasta = test_utils.genomics_core_testdata(fasta_filename)
   with genomics_io.make_ref_reader(path_to_fasta) as fasta:
     observed_bases = fasta.bases(ranges.make_range('chrM', 1, 6))
     self.assertEqual(observed_bases, 'ATCAC')