Python make_sam_reader 예제들, deepvariant.core.genomics_io.make_sam_reader Python 예제들

예제 #1

0

파일 보기

파일: realigner_test.py 프로젝트: zmandyhe/deepvariant

    def test_realigner_end2end(self):
        """Runs the realigner over each piece of a chr20 region and checks it.

        For every piece yielded by ``partition(1000)`` the reads handed to the
        realigner must come back with the same fragment names, at least one
        window must be produced, and every window's haplotypes must contain
        the reference bases over that window's span.
        """
        fasta_reader = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
        realigner_under_test = realigner.Realigner(
            realigner.realigner_config(FLAGS), fasta_reader)
        whole_region = ranges.RangeSet.from_regions(
            ['chr20:10,000,000-10,009,999'])

        for piece in whole_region.partition(1000):
            # Read the input reads for this piece with a fresh reader that is
            # closed again before realignment starts.
            with genomics_io.make_sam_reader(
                    test_utils.CHR20_BAM,
                    core_pb2.ReadRequirements()) as bam:
                reads_in = list(bam.query(piece))
            assembled_windows, reads_out = realigner_under_test.realign_reads(
                reads_in, piece)

            # Every read we sent in must come back. Comparing the names is
            # stricter than only comparing how many reads there are.
            names_in = [read.fragment_name for read in reads_in]
            names_out = [read.fragment_name for read in reads_out]
            self.assertCountEqual(names_in, names_out)

            # At least one window must have been assembled in this piece.
            self.assertNotEqual(0, len(assembled_windows))

            # The reference sequence is always expected among the haplotypes.
            for assembled in assembled_windows:
                reference_bases = fasta_reader.bases(assembled.span)
                self.assertIn(reference_bases, set(assembled.haplotypes))

예제 #2

0

파일 보기

파일: make_examples.py 프로젝트: WesCoomber/compilersProjecto

 def _make_sam_reader(self):
     """Returns a SAM reader configured from ``self.options`` and ``FLAGS``.

     The reads filename, read requirements, downsample fraction and random
     seed are taken from ``self.options``; the HTS block size is taken from
     the global ``FLAGS``.
     """
     opts = self.options
     return genomics_io.make_sam_reader(
         opts.reads_filename,
         opts.read_requirements,
         hts_block_size=FLAGS.hts_block_size,
         downsample_fraction=opts.downsample_fraction,
         random_seed=opts.random_seed)

예제 #3

0

파일 보기

파일: variant_calling_wrap_test.py 프로젝트: WesCoomber/compilersProjecto

    def test_call_from_allele_counter(self):
        """Checks that a VariantCaller yields DeepVariantCalls from real reads.

        Reads from a 1000 bp chr20 range are added to an AlleleCounter, the
        caller is asked for its calls, and the test requires a non-empty
        result made up only of DeepVariantCall protos.
        """
        ref = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
        size = 1000
        region = ranges.make_range('chr20', 10000000, 10000000 + size)
        allele_counter = _allelecounter.AlleleCounter(
            ref, region,
            deepvariant_pb2.AlleleCounterOptions(partition_size=size))
        caller = variant_calling.VariantCaller(
            deepvariant_pb2.VariantCallerOptions(min_count_snps=2,
                                                 min_count_indels=2,
                                                 min_fraction_snps=0.12,
                                                 min_fraction_indels=0.12,
                                                 sample_name='sample_name',
                                                 p_error=0.001,
                                                 max_gq=50,
                                                 gq_resolution=1,
                                                 ploidy=2))

        # Grab all of the reads in our region. The reader is used as a context
        # manager so it is closed once the reads are materialized instead of
        # being left open for the rest of the test.
        with genomics_io.make_sam_reader(test_utils.CHR20_BAM) as sam_reader:
            reads = list(sam_reader.query(region))
        self.assertNotEmpty(reads)
        for read in reads:
            allele_counter.add(read)

        # Get the candidates records for this whole region.
        candidates = caller.calls_from_allele_counter(allele_counter)

        # We should have at least some candidates.
        self.assertNotEmpty(candidates)

        # Each candidate should be a DeepVariantCall.
        for candidate in candidates:
            self.assertIsInstance(candidate, deepvariant_pb2.DeepVariantCall)

예제 #4

0

파일 보기

 def test_sam_query(self):
   """Checks the number of reads ``query`` yields for two chr20 intervals."""
   bam_path = test_utils.genomics_core_testdata('test.bam')
   reader = genomics_io.make_sam_reader(bam_path)
   # Pairs of (parsed interval to query, number of reads expected back).
   cases = [
       (ranges.parse_literal(literal), count)
       for literal, count in (('chr20:10,000,000-10,000,100', 106),
                              ('chr20:10,000,000-10,000,000', 45))
   ]
   with reader:
     for query_range, want in cases:
       with reader.query(query_range) as found:
         self.assertEqual(test_utils.iterable_len(found), want)

예제 #5

0

파일 보기

파일: debruijn_graph_wrap_test.py 프로젝트: WesCoomber/compilersProjecto

    def test_straightforward_region(self):
        """Builds a de Bruijn graph whose only haplotype is the reference.

        The graph is built from the reads over chr20:10,000,000-10,000,100
        using ``self.single_k_dbg_options(30)``; it must exist and its
        candidate haplotypes must be exactly the reference sequence.
        """
        ref_reader = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
        region = ranges.parse_literal('chr20:10,000,000-10,000,100')
        ref_seq = ref_reader.bases(region)

        # Materialize the reads inside a ``with`` block so the BAM reader is
        # closed as soon as we are done with it instead of being leaked.
        with genomics_io.make_sam_reader(test_utils.CHR20_BAM) as bam_reader:
            all_reads = list(bam_reader.query(region))

        dbg30 = debruijn_graph.build(ref_seq, all_reads,
                                     self.single_k_dbg_options(30))
        self.assertIsNotNone(dbg30)
        self.assertEqual([ref_seq], dbg30.candidate_haplotypes())

예제 #6

0

파일 보기

 def test_bam_iterate_partially(self):
   """Verify that iteration provides results incrementally, not all at once."""
   reader = genomics_io.make_sam_reader(
       test_utils.genomics_core_testdata('test.bam'), use_index=False)
   with reader:
     iterable = reader.iterate()
     # We expect 106 records in total: ten full slices of 10 followed by a
     # final, partial slice of 6.
     # `range` is used rather than the Python 2-only `xrange`, which raises
     # NameError on Python 3; for ten iterations the two are interchangeable.
     for _ in range(10):
       results = list(itertools.islice(iterable, 10))
       self.assertEqual(len(results), 10)
     results = list(itertools.islice(iterable, 10))
     self.assertEqual(len(results), 6)

예제 #7

0

파일 보기

 def _parse_read_with_aux_tags(self, tag_string):
   """Writes a one-read SAM text file ending in ``tag_string`` and reads it.

   Args:
     tag_string: Text appended to the end of the stock alignment line.

   Returns:
     A list of everything the reader's ``iterate()`` yields for that file.
   """
   # Minimal header line to create a valid SAM file.
   sam_header = '@HD	VN:1.3	SO:coordinate\n@SQ	SN:chr1	LN:248956422\n'
   # A single stock read; the caller's AUX fields are appended to it.
   sam_record = 'read_name	0	chr1	1	0	3M	*	0	0	CCC	AAA	' + tag_string
   sam_path = test_utils.test_tmpfile('aux_tags.bam')
   with tf.gfile.FastGFile(sam_path, 'w') as sam_file:
     sam_file.write(sam_header)
     sam_file.write(sam_record + '\n')
   with genomics_io.make_sam_reader(
       sam_path, use_index=False, parse_aux_fields=True) as aux_reader:
     parsed_reads = list(aux_reader.iterate())
   return parsed_reads

예제 #8

0

파일 보기

 def test_downsampling(self, method, maybe_range, fraction, expected_n_reads):
   """Checks how many reads are seen when test.bam is downsampled.

   Args:
     method: 'iterate' to read via `reader.iterate()` or 'query' to read via
       `reader.query()`; any other value fails the test.
     maybe_range: Region literal passed to `ranges.parse_literal`; only used
       when method is 'query'.
     fraction: Value passed to the reader as `downsample_fraction`.
     expected_n_reads: Number of reads the chosen method is expected to yield.
   """
   reader = genomics_io.make_sam_reader(
       test_utils.genomics_core_testdata('test.bam'),
       downsample_fraction=fraction,
       random_seed=12345)
   with reader:
     if method == 'iterate':
       reads_iter = reader.iterate()
     elif method == 'query':
       reads_iter = reader.query(ranges.parse_literal(maybe_range))
     else:
       # TestCase.fail() accepts a single message argument, so the offending
       # value is formatted into the message; passing it as a second
       # positional argument would raise TypeError instead of failing the test.
       self.fail('Unexpected method: {}'.format(method))
     self.assertEqual(test_utils.iterable_len(reads_iter), expected_n_reads)

예제 #9

0

파일 보기

파일: debruijn_graph_wrap_test.py 프로젝트: WesCoomber/compilersProjecto

 def test_complex_region(self):
     """Builds a de Bruijn graph over a region holding a 9 bp deletion.

     The graph must exist, report a kmer size of 44 and offer exactly two
     candidate haplotypes, one of which is the reference sequence.
     """
     # There is a heterozygous 9 bp deletion of tandem TGA repeat.
     # "chr20:10,095,379-10,095,500"
     ref_reader = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
     region = ranges.parse_literal('chr20:10,095,379-10,095,500')
     ref_seq = ref_reader.bases(region)
     # Read inside a ``with`` block so the BAM reader is closed once the reads
     # are materialized instead of staying open for the rest of the test.
     with genomics_io.make_sam_reader(test_utils.CHR20_BAM) as bam_reader:
         reads = list(bam_reader.query(region))
     dbg = debruijn_graph.build(ref_seq, reads, self.dbg_options())
     self.assertIsNotNone(dbg)
     self.assertEqual(44, dbg.kmer_size)
     self.assertEqual(2, len(dbg.candidate_haplotypes()))
     self.assertIn(ref_seq, dbg.candidate_haplotypes())

예제 #10

0

파일 보기

파일: make_examples.py 프로젝트: WesCoomber/compilersProjecto

def processing_regions_from_options(options):
    """Computes the calling regions from our options.

    This function reads the contigs of our input files, intersects them, and
    combines the result with the region specifications in `options` to
    determine the regions we should generate examples over.

    Args:
      options: deepvariant.DeepVariantOptions proto containing information
        about our input data sources.

    Returns:
      The regions we should process, exactly as produced by
      `regions_to_process`. Confident regions are neither computed nor
      returned by this function.
    """
    ref_contigs = genomics_io.make_ref_reader(
        options.reference_filename).contigs
    # Only the header contigs are needed from the reads file, so close the
    # reader as soon as they have been read rather than leaking it.
    with genomics_io.make_sam_reader(options.reads_filename) as sam_reader:
        sam_contigs = sam_reader.contigs

    # Add in vcf_contigs if in training mode.
    vcf_contigs = None
    if in_training_mode(options):
        vcf_contigs = genomics_io.make_vcf_reader(
            options.truth_variants_filename).contigs

    # Compute the common contigs among our inputs, and check that the contigs are
    # sufficiently consistent among each other.
    contigs = common_contigs(only_true(ref_contigs, sam_contigs, vcf_contigs),
                             exclude_contig_names=options.exclude_contigs)
    validate_reference_contig_coverage(ref_contigs, contigs,
                                       options.min_shared_contigs_basepairs)
    logging.info('Common contigs are %s', [c.name for c in contigs])

    regions = regions_to_process(
        contigs,
        partition_size=options.allele_counter_options.partition_size,
        calling_regions=ranges.RangeSet.from_regions(
            options.calling_regions, ranges.contigs_dict(ref_contigs)),
        task_id=options.task_id,
        num_shards=options.num_shards)

    return regions

예제 #11

0

파일 보기

파일: make_examples.py 프로젝트: WesCoomber/compilersProjecto

def extract_sample_name_from_reads(reads_path):
    """Derives the sample name from the reads file at `reads_path`.

    Args:
      reads_path: Path to the SAM/BAM file containing a single sample.

    Returns:
      The one sample name reported by the reader's `samples`.

    Raises:
      ValueError: If the reader does not report exactly one sample, or if
        that single sample name is empty.
    """
    with genomics_io.make_sam_reader(reads_path) as reader:
        sample_names = reader.samples
    n_samples = len(sample_names)
    if n_samples != 1:
        raise ValueError(
            'Expected a single sample, found {}'.format(sample_names))
    only_name = next(iter(sample_names))
    if only_name:
        return only_name
    raise ValueError('Sample name is empty.')

예제 #12

0

파일 보기

파일: realigner_test.py 프로젝트: zmandyhe/deepvariant

def _get_reads(region):
    """Returns a list of the CHR20_BAM test reads that `query(region)` yields."""
    with genomics_io.make_sam_reader(test_utils.CHR20_BAM) as bam:
        reads_in_region = list(bam.query(region))
    return reads_in_region

예제 #13

0

파일 보기

파일: genomics_io_plugins_test.py 프로젝트: WesCoomber/compilersProjecto

 def test_tfbam_plugin_loads(self):
     """Checks that make_sam_reader returns a reader for the tfbam path."""
     # NOTE(review): the path literal looks like it was mangled by e-mail
     # obfuscation when this snippet was scraped -- confirm the real path.
     tfbam_reader = genomics_io.make_sam_reader(
         '*****@*****.**', use_index=True)
     self.assertIsNotNone(tfbam_reader)

예제 #14

0

파일 보기

 def test_bam_iterate(self):
   """Checks that iterating test.bam with use_index=False yields 106 items."""
   bam_path = test_utils.genomics_core_testdata('test.bam')
   reader = genomics_io.make_sam_reader(bam_path, use_index=False)
   with reader:
     n_records = test_utils.iterable_len(reader.iterate())
     self.assertEqual(n_records, 106)

예제 #15

0

파일 보기

 def test_tfbam_plugin_does_not_load(self):
     """Expects make_sam_reader to raise ImportError about tfbam_lib."""
     # NOTE(review): the path literal looks like it was mangled by e-mail
     # obfuscation when this snippet was scraped -- confirm the real path.
     expected_error = 'tfbam_lib module not found, cannot read .tfbam files.'
     with self.assertRaisesRegexp(ImportError, expected_error):
         genomics_io.make_sam_reader('*****@*****.**', use_index=True)

예제 #16

0

파일 보기

파일: genomics_io_gcs_test.py 프로젝트: WesCoomber/compilersProjecto

 def test_remote_bam(self):
   """Checks the number of reads returned for a window of the remote BAM."""
   # Use the reader as a context manager so the remote file handle is closed
   # once the reads are materialized instead of being leaked by the test.
   with genomics_io.make_sam_reader(REMOTE_BAM) as reader:
     reads = list(reader.query(self.query_window))
   self.assertEqual(EXPECTED_READS_IN_WINDOW, len(reads))

예제 #17

0

파일 보기

파일: make_examples.py 프로젝트: zmandyhe/deepvariant

def default_options(add_flags=True, flags_obj=None):
    """Creates a DeepVariantOptions proto populated with reasonable defaults.

    Some flag values are read regardless of `add_flags`: the partition size,
    the sample name / reads path used to pick the sample name, the `vsc_*`
    thresholds and the gVCF GQ bin size all feed the default sub-options.

    Args:
      add_flags: bool. defaults to True. If True, we will push the value of
        certain FLAGS (mode, input/output paths, regions, sharding, realigner
        and pileup-image settings) into our options. If False, those option
        fields are left uninitialized.
      flags_obj: object.  If not None, use as the source of flags,
        else use global FLAGS. NOTE(review): the check below is a truthiness
        test, so any falsy `flags_obj` is also replaced by FLAGS.

    Returns:
      deepvariant_pb2.DeepVariantOptions protobuf.

    Raises:
      ValueError: If we observe invalid flag values; in this function that is
        raised when `add_flags` is True and the mode flag is neither
        'training' nor 'calling'.
      KeyError: If `add_flags` is True and the multi_allelic_mode flag is set
        to a value other than the two keys handled below.
    """
    # Fall back to the global flags when no (truthy) flags object was given.
    if not flags_obj:
        flags_obj = FLAGS

    # The same read requirements are shared by the pileup image options, the
    # allele counter options and the top-level options below.
    read_reqs = core_pb2.ReadRequirements(
        min_base_quality=10,
        min_mapping_quality=10,
        min_base_quality_mode=core_pb2.ReadRequirements.ENFORCED_BY_CLIENT)

    pic_options = pileup_image.default_options(read_requirements=read_reqs)

    allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
        partition_size=flags_obj.partition_size, read_requirements=read_reqs)

    # Sample name precedence: an explicit flag wins, then the name extracted
    # from the reads file, then the module-level unknown-sample placeholder.
    if flags_obj.sample_name:
        sample_name = flags_obj.sample_name
    elif flags_obj.reads:
        with genomics_io.make_sam_reader(flags_obj.reads) as sam_reader:
            sample_name = extract_sample_name_from_sam_reader(sam_reader)
    else:
        sample_name = _UNKNOWN_SAMPLE

    variant_caller_options = deepvariant_pb2.VariantCallerOptions(
        min_count_snps=flags_obj.vsc_min_count_snps,
        min_count_indels=flags_obj.vsc_min_count_indels,
        min_fraction_snps=flags_obj.vsc_min_fraction_snps,
        min_fraction_indels=flags_obj.vsc_min_fraction_indels,
        # Not specified by default: fraction_reference_sites_to_emit,
        # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
        random_seed=1400605801,
        sample_name=sample_name,
        p_error=0.001,
        max_gq=50,
        gq_resolution=flags_obj.gvcf_gq_binsize,
        ploidy=2)

    options = deepvariant_pb2.DeepVariantOptions(
        exclude_contigs=exclude_contigs.EXCLUDED_HUMAN_CONTIGS,
        # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
        random_seed=609314161,
        # Not specified by default: calling_regions = 3;
        read_requirements=read_reqs,
        allele_counter_options=allele_counter_options,
        variant_caller_options=variant_caller_options,
        pic_options=pic_options,
        n_cores=1,
        task_id=0,
        num_shards=0,
        min_shared_contigs_basepairs=0.9,
    )

    # Everything below overlays flag values on top of the defaults built above.
    if add_flags:
        # Translate the string mode flag into the proto enum; anything other
        # than the two known modes is rejected.
        if flags_obj.mode == 'training':
            options.mode = deepvariant_pb2.DeepVariantOptions.TRAINING
        elif flags_obj.mode == 'calling':
            options.mode = deepvariant_pb2.DeepVariantOptions.CALLING
        else:
            raise ValueError('Unexpected mode', flags_obj.mode)

        # Input paths are only copied when the corresponding flag is truthy.
        if flags_obj.ref:
            options.reference_filename = flags_obj.ref
        if flags_obj.reads:
            options.reads_filename = flags_obj.reads
        if flags_obj.confident_regions:
            options.confident_regions_filename = flags_obj.confident_regions
        if flags_obj.truth_variants:
            options.truth_variants_filename = flags_obj.truth_variants

        # Leave downsample_fraction unset when the flag holds the
        # NO_DOWNSAMPLING sentinel.
        if flags_obj.downsample_fraction != NO_DOWNSAMPLING:
            options.downsample_fraction = flags_obj.downsample_fraction

        if flags_obj.multi_allelic_mode:
            # Direct dict indexing: an unrecognized flag value raises KeyError.
            multi_allelic_enum = {
                'include_het_alt_images':
                deepvariant_pb2.PileupImageOptions.ADD_HET_ALT_IMAGES,
                'exclude_het_alt_images':
                deepvariant_pb2.PileupImageOptions.NO_HET_ALT_IMAGES,
            }[flags_obj.multi_allelic_mode]
            options.pic_options.multi_allelic_mode = multi_allelic_enum

        # Pileup image dimensions are only overridden by truthy flag values.
        if flags_obj.pileup_image_height:
            options.pic_options.height = flags_obj.pileup_image_height
        if flags_obj.pileup_image_width:
            options.pic_options.width = flags_obj.pileup_image_width

        # Unset output flags are passed to resolve_filespecs as empty strings.
        num_shards, examples, candidates, gvcf = io_utils.resolve_filespecs(
            flags_obj.task, flags_obj.examples or '', flags_obj.candidates
            or '', flags_obj.gvcf or '')
        options.examples_filename = examples
        options.candidates_filename = candidates
        options.gvcf_filename = gvcf

        options.calling_regions.extend(parse_regions_flag(flags_obj.regions))
        options.exclude_calling_regions.extend(
            parse_regions_flag(flags_obj.exclude_regions))

        options.task_id = flags_obj.task
        # resolve_filespecs may hand back None for num_shards; store 0 then.
        options.num_shards = 0 if num_shards is None else num_shards

        # Realigner options are only filled in when realignment is enabled.
        options.realigner_enabled = flags_obj.realign_reads
        if options.realigner_enabled:
            options.realigner_options.CopyFrom(
                realigner.realigner_config(flags_obj))

        options.max_reads_per_partition = flags_obj.max_reads_per_partition

        # In training mode, optionally set the fraction of reference sites to
        # emit, unless the flag holds the NO_RANDOM_REF sentinel.
        if (options.mode == deepvariant_pb2.DeepVariantOptions.TRAINING
                and flags_obj.training_random_emit_ref_sites != NO_RANDOM_REF):
            options.variant_caller_options.fraction_reference_sites_to_emit = (
                flags_obj.training_random_emit_ref_sites)

    return options