def test_parse_literal_one_bp(self):
   """Checks that a 'contig:position' literal parses to a single-base range."""
   # Each case pairs a literal with the position it names; the expected range
   # is built with start = position - 1 and end = position. The last literal
   # writes the position with a thousands separator.
   cases = [
       ('1:10', 10),
       ('1:100', 100),
       ('1:1,000', 1000),
   ]
   for literal, position in cases:
      self.assertEqual(
          ranges.parse_literal(literal),
          ranges.make_range('1', position - 1, position))
示例#2
0
  def call_aligner(self, assembled_region):
    """Aligns the reads of an assembled region against its haplotypes.

    Args:
      assembled_region: object providing `reads`, `region`, `read_span` and
        `haplotypes`.

    Returns:
      An empty list when the region has no reads, the region's unmodified reads
      when no reference suffix can be built, and otherwise whatever
      `aligner.Aligner.align_reads` returns.
    """
    if not assembled_region.reads:
      return []

    target = assembled_region.region
    span = assembled_region.read_span
    contig = target.reference_name

    # The alignment window covers both the region and the span of its reads,
    # padded by _REF_ALIGN_MARGIN on each side and clamped to
    # [0, number of bases in the contig].
    window_start = max(0, min(span.start, target.start) - _REF_ALIGN_MARGIN)
    window_end = min(
        self.ref_reader.contig(contig).n_bases,
        max(span.end, target.end) + _REF_ALIGN_MARGIN)

    prefix = self.ref_reader.bases(
        ranges.make_range(contig, window_start, target.start))
    region_bases = self.ref_reader.bases(target)

    # Without any reference bases after the region there is no suffix to
    # build, so hand back the original alignments untouched.
    if window_end <= target.end:
      return assembled_region.reads
    suffix = self.ref_reader.bases(
        ranges.make_range(contig, target.end, window_end))

    window = ranges.make_range(contig, window_start, window_end)
    reads_aligner = aligner.Aligner(
        self.config.aln_config, window, prefix + region_bases + suffix)
    # Every haplotype is flanked with the same reference prefix and suffix.
    flanked_haplotypes = [
        prefix + haplotype + suffix
        for haplotype in assembled_region.haplotypes
    ]
    return reads_aligner.align_reads(
        flanked_haplotypes, assembled_region.reads)
示例#3
0
    def test_wrap(self, fasta_filename):
        """Exercises the FAI-backed reference reader on a three-contig FASTA.

        Args:
          fasta_filename: name of the FASTA test-data file; its index is
            looked up under the same name with '.fai' appended.
        """
        expected_names = ['chrM', 'chr1', 'chr2']
        expected_lengths = [100, 76, 121]
        fasta_path = test_utils.genomics_core_testdata(fasta_filename)
        index_path = test_utils.genomics_core_testdata(
            fasta_filename + '.fai')
        with reference_fai.GenomeReferenceFai.from_file(
                fasta_path, index_path) as reader:
            # Reader-level metadata.
            self.assertEqual(reader.n_contigs, 3)
            self.assertIn(fasta_path, reader.fasta_path)
            self.assertIn('GenomeReference backed by htslib FAI index',
                          str(reader))
            self.assertEqual(reader.contig_names, expected_names)
            self.assertEqual(reader.n_bp, sum(expected_lengths))

            # Base retrieval and interval validation.
            self.assertEqual(
                reader.bases(ranges.make_range('chrM', 1, 10)), 'ATCACAGGT')
            in_bounds = ranges.make_range('chrM', 1, 10)
            self.assertTrue(reader.is_valid_interval(in_bounds))
            out_of_bounds = ranges.make_range('chrM', 1, 100000)
            self.assertFalse(reader.is_valid_interval(out_of_bounds))

            # Per-contig metadata and lookups.
            self.assertEqual(len(reader.contigs), 3)
            self.assertEqual(
                [info.name for info in reader.contigs], expected_names)
            self.assertEqual(
                [info.n_bases for info in reader.contigs], expected_lengths)
            for info in reader.contigs:
                self.assertEqual(reader.contig(info.name), info)
                self.assertTrue(reader.has_contig(info.name))
                self.assertFalse(reader.has_contig(info.name + '.unknown'))
  def test_detector_ranges(self):
    """Checks RangeSet truthiness, size and per-position overlap queries."""
    range_set = ranges.RangeSet([
        ranges.make_range('chr1', 0, 5),
        ranges.make_range('chr1', 8, 10),
        ranges.make_range('chr1', 12, 13),
        ranges.make_range('chr2', 2, 5),
    ])
    self.assertEqual(bool(range_set), True)
    self.assertEqual(len(range_set), 4)

    # (contig, position, expected overlaps() result), in query order. The
    # expectations treat the end of each range as exclusive.
    expectations = [
        ('chr1', 0, True),
        ('chr1', 1, True),
        ('chr1', 2, True),
        ('chr1', 3, True),
        ('chr1', 4, True),
        ('chr1', 5, False),
        ('chr1', 6, False),
        ('chr1', 7, False),
        ('chr1', 8, True),
        ('chr1', 9, True),
        ('chr1', 10, False),
        ('chr1', 11, False),
        ('chr1', 12, True),
        ('chr1', 13, False),
        ('chr1', 100, False),
        ('chr1', 1000, False),
        ('chr2', 0, False),
        ('chr2', 1, False),
        ('chr2', 2, True),
        ('chr2', 3, True),
        ('chr2', 4, True),
        ('chr2', 5, False),
        ('chr2', 6, False),
        ('chr3', 3, False),
    ]
    for contig, position, expected in expectations:
      self.assertEqual(range_set.overlaps(contig, position), expected)
 def test_partitions(self, interval_size, expected):
   """Checks RangeSet.partition() over three whole-contig ranges.

   Args:
     interval_size: value passed to partition().
     expected: iterable of argument tuples for ranges.make_range describing
       the partitions that should come back, in any order.
   """
   contig_spans = [('chrM', 0, 100), ('chr1', 0, 76), ('chr2', 0, 121)]
   rangeset = ranges.RangeSet(
       [ranges.make_range(*span) for span in contig_spans])
   expected_ranges = [ranges.make_range(*args) for args in expected]
   partitions = rangeset.partition(interval_size)
   self.assertCountEqual(expected_ranges, partitions)
 def test_partition_of_multiple_intervals(self, interval_size, expected):
   """Checks RangeSet.partition() over several disjoint ranges on one contig.

   Args:
     interval_size: value passed to partition().
     expected: iterable of argument tuples for ranges.make_range describing
       the partitions that should come back, in any order.
   """
   spans_on_contig_1 = [(0, 10), (20, 40), (45, 50)]
   rangeset = ranges.RangeSet(
       [ranges.make_range('1', start, end) for start, end in spans_on_contig_1])
   expected_ranges = [ranges.make_range(*args) for args in expected]
   partitions = rangeset.partition(interval_size)
   self.assertCountEqual(expected_ranges, partitions)
 def test_from_bed(self):
   """Checks that RangeSet.from_bed yields the ranges listed in test.bed."""
   bed_path = test_utils.genomics_core_testdata('test.bed')
   expected_spans = [
       ('chr1', 1, 10),
       ('chr2', 20, 30),
       ('chr2', 40, 60),
       ('chr3', 80, 90),
   ]
   expected_ranges = [ranges.make_range(*span) for span in expected_spans]
   # Only the set of ranges is compared, not their order.
   self.assertCountEqual(expected_ranges, ranges.RangeSet.from_bed(bed_path))
  def test_find_max_overlapping_returns_least_index(self):
    """Checks that index 0 is returned whichever candidate comes first."""
    query = ranges.make_range('1', 0, 10)
    first_half = ranges.make_range('1', 0, 5)
    second_half = ranges.make_range('1', 5, 10)

    # Both candidates lie inside the query and have the same length, so the
    # expected answer is index 0 for either ordering.
    orderings = ([first_half, second_half], [second_half, first_half])
    for candidates in orderings:
      self.assertEqual(0, ranges.find_max_overlapping(query, candidates))
 def test_from_contigs(self):
   """Checks that from_contigs yields one range from 0 to n_bases per contig."""
   contig_sizes = [('chr1', 10), ('chr2', 5)]
   contigs = [
       core_pb2.ContigInfo(name=name, n_bases=size)
       for name, size in contig_sizes
   ]
   expected_ranges = [
       ranges.make_range(name, 0, size) for name, size in contig_sizes
   ]
   self.assertCountEqual(expected_ranges,
                         ranges.RangeSet.from_contigs(contigs))
示例#10
0
 def test_from_regions_not_empty(self):
   """Checks from_regions on a bare contig name and a 'contig:start-end' literal.

   'chr1' is expected to become the range from 0 to the n_bases recorded for
   it in contig_map, and 'chr2:10-20' the range with start 9 and end 20.
   """
   literals = ['chr1', 'chr2:10-20']
   contig_map = {
       'chr1': core_pb2.ContigInfo(name='chr1', n_bases=10),
       'chr2': core_pb2.ContigInfo(name='chr2', n_bases=100),
   }
   # assertCountEqual is the Python 3 spelling of the order-insensitive
   # comparison; unittest's assertItemsEqual only exists on Python 2.
   self.assertCountEqual(
       [ranges.make_range('chr1', 0, 10),
        ranges.make_range('chr2', 9, 20)],
       ranges.RangeSet.from_regions(literals, contig_map))
 def test_variant_position_and_range(self):
     """Checks variant_position and variant_range on 1-base and 4-base refs."""
     # Both variants start at 10; they differ only in the length of the first
     # (reference) allele.
     one_base_ref = test_utils.make_variant(
         chrom='1', alleles=['A', 'C'], start=10)
     four_base_ref = test_utils.make_variant(
         chrom='1', alleles=['AGCT', 'C'], start=10)
     start_only = ranges.make_range('1', 10, 11)
     whole_ref = ranges.make_range('1', 10, 14)

     # The position is the same for both variants.
     self.assertEqual(start_only, variantutils.variant_position(one_base_ref))
     self.assertEqual(start_only, variantutils.variant_position(four_base_ref))
     # The range grows with the reference allele.
     self.assertEqual(start_only, variantutils.variant_range(one_base_ref))
     self.assertEqual(whole_ref, variantutils.variant_range(four_base_ref))
示例#12
0
 def test_bed_parser(self):
   """Checks that parse_lines in 'bed' mode yields one range per line, in order."""
   bed_lines = [
       'chr20\t61724611\t61725646',
       'chr20\t61304163\t61305182',
       'chr20\t61286467\t61286789',
   ]
   parsed = list(ranges.parse_lines(bed_lines, 'bed'))
   # Each expected range reuses the contig, start and end columns verbatim.
   expected = [
       ranges.make_range('chr20', 61724611, 61725646),
       ranges.make_range('chr20', 61304163, 61305182),
       ranges.make_range('chr20', 61286467, 61286789),
   ]
   self.assertEqual(parsed, expected)
示例#13
0
 def test_bedpe_parser_skips_cross_chr_events(self):
   """Checks that 'bedpe' parsing drops records whose two ends differ in contig.

   The first record pairs chr20 with chr21 and is expected to be skipped. The
   other two are expected to yield a range from the first interval's start to
   the second interval's end.
   """
   # pylint: disable=line-too-long
   bedpe_lines = [
       'chr20\t25763416\t25765517\tchr21\t25825181\t25826882\tP2_PM_20_1549\t63266\t+\tTYPE:DELETION',
       'chr20\t25972820\t25972991\tchr20\t26045347\t26045538\tP2_PM_20_696\t72548\t+\tTYPE:DELETION',
       'chr20\t23719873\t23721974\tchr20\t23794822\t23796523\tP2_PM_20_1548\t76450\t+\tTYPE:DELETION',
   ]
   parsed = list(ranges.parse_lines(bedpe_lines, 'bedpe'))
   expected = [
       ranges.make_range('chr20', 25972820, 26045538),
       ranges.make_range('chr20', 23719873, 23796523),
   ]
   self.assertEqual(parsed, expected)
示例#14
0
  def test_find_max_overlapping_allows_unordered_search_ranges(self):
    """Checks that the best-overlapping range is found in every ordering."""
    query = ranges.make_range('1', 4, 12)
    candidates = [
        ranges.make_range('1', 0, 10),
        ranges.make_range('1', 10, 20),
        ranges.make_range('1', 12, 20)
    ]
    # The first candidate is the one expected to win against the query.
    best = candidates[0]

    for ordering in itertools.permutations(candidates):
      # Wherever the winner sits in this ordering is the index we expect.
      expected_index = ordering.index(best)
      self.assertEqual(expected_index,
                       ranges.find_max_overlapping(query, ordering))
示例#15
0
 def read_span(self):
     """Returns the range covered by all reads, computing it on first use.

     Returns:
       The cached span when one has already been computed. When there are no
       reads nothing is computed and the stored value is returned as is.
       Otherwise a new range is built from the reference name of the first
       read's range, the smallest start and the largest end, stored on the
       instance and returned.
     """
     if self._read_span is not None or not self.reads:
         return self._read_span

     read_ranges = [utils.read_range(read) for read in self.reads]
     contig = read_ranges[0].reference_name
     leftmost = min(span.start for span in read_ranges)
     rightmost = max(span.end for span in read_ranges)
     self._read_span = ranges.make_range(contig, leftmost, rightmost)
     return self._read_span
示例#16
0
 def test_overlaps_variant_with_ranges(self):
   """Checks that variant_overlaps delegates to overlaps(contig, start)."""
   range_set = ranges.RangeSet([ranges.make_range('chr1', 0, 5)])
   variant = variants_pb2.Variant(reference_name='chr2', start=10, end=11)
   # overlaps() is replaced by a mock that always answers True, so the result
   # below can only come from the delegated call.
   with mock.patch.object(
       range_set, 'overlaps', return_value=True) as overlaps_mock:
     self.assertEqual(range_set.variant_overlaps(variant), True)
     overlaps_mock.assert_called_once_with('chr2', 10)
示例#17
0
  def test_no_bad_soft_clipping(self):
    """Checks that a single mismatch does not lead to soft-clipped bases.

    The test skips itself on entry; see the skip message for the tracking bug.
    """
    self.skipTest('Enable when b/63143285 global alignment is fixed')
    shared = 'CTA'
    read_seq = shared + 'GA'
    ref_seq = 'N' + shared + 'CA' + 'N'
    targets = [ref_seq, 'A' + ref_seq]

    aligner_under_test = self.make_test_aligner(
        ref_seq, ranges.make_range('ref', 0, len(ref_seq)))

    read_len = len(read_seq)
    read = test_utils.make_read(
        read_seq,
        chrom='ref',
        start=0,
        cigar=[(read_len, 'M')],
        quals=[35] * read_len,
        name='read')
    realigned = aligner_under_test.align_reads(targets, [read])[0]

    # The cigar we would like to get for this read is 5M:
    #   read_seq: -CTAGA-
    #   ref_seq : NCTACAN
    # The current algorithm instead aligns the read locally against the
    # haplotypes. Because of the G <=> C mismatch the local aligner skips the
    # trailing bases rather than paying the mismatch penalty, which yields
    # 3M2S (GA clipped off) instead of the better 5M.
    self.assertEqual([_cigar.to_cigar_unit(read_len, 'M')],
                     list(realigned.alignment.cigar))
    def test_call_from_allele_counter(self):
        """Runs the variant caller on real reads and checks it emits candidates."""
        ref = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
        sam_reader = genomics_io.make_sam_reader(test_utils.CHR20_BAM)
        window = 1000
        first_base = 10000000
        region = ranges.make_range('chr20', first_base, first_base + window)

        # The counter partition is as large as the queried region.
        counter_options = deepvariant_pb2.AlleleCounterOptions(
            partition_size=window)
        allele_counter = _allelecounter.AlleleCounter(
            ref, region, counter_options)

        caller_options = deepvariant_pb2.VariantCallerOptions(
            min_count_snps=2,
            min_count_indels=2,
            min_fraction_snps=0.12,
            min_fraction_indels=0.12,
            sample_name='sample_name',
            p_error=0.001,
            max_gq=50,
            gq_resolution=1,
            ploidy=2)
        caller = variant_calling.VariantCaller(caller_options)

        # Feed every read overlapping the region into the allele counter.
        region_reads = list(sam_reader.query(region))
        self.assertNotEmpty(region_reads)
        for region_read in region_reads:
            allele_counter.add(region_read)

        # Ask the caller for the candidates of the whole region.
        candidates = caller.calls_from_allele_counter(allele_counter)

        # There must be at least one candidate ...
        self.assertNotEmpty(candidates)

        # ... and every one of them must be a DeepVariantCall.
        for call in candidates:
            self.assertIsInstance(call, deepvariant_pb2.DeepVariantCall)
示例#19
0
    def test_get_reference_bases_good_region(self):
        """Checks that the reference reader is queried once for a valid region.

        With the variant start set to 10, both reader calls are expected to
        receive the range from 8 to 13 on the variant's contig.
        """
        self.dv_call.variant.start = 10
        expected_region = ranges.make_range(
            self.variant.reference_name, 8, 13)

        bases = self.pic.get_reference_bases(self.variant)

        self.assertEqual('ACGT', bases)
        reader = self.mock_ref_reader
        reader.is_valid_interval.assert_called_once_with(expected_region)
        reader.bases.assert_called_once_with(expected_region)
示例#20
0
 def test_read_range(self):
     """Tests reads have their ranges calculated correctly.

     Two reads with the same bases and start are checked: with cigar 2M1I3M
     the expected range is 5 bases long, and with cigar 2M16D3M it is 5 + 16
     bases long.
     """
     start = 10000001
     read = test_utils.make_read('AAACAG',
                                 chrom='chrX',
                                 start=start,
                                 cigar='2M1I3M',
                                 quals=range(10, 16),
                                 name='read1')
     # assertEqual replaces the deprecated assertEquals alias, which newer
     # Python versions no longer provide.
     self.assertEqual(ranges.make_range('chrX', start, start + 5),
                      utils.read_range(read))
     read = test_utils.make_read('AAACAG',
                                 chrom='chrX',
                                 start=start,
                                 cigar='2M16D3M',
                                 quals=range(10, 16),
                                 name='read1')
     self.assertEqual(ranges.make_range('chrX', start, start + 5 + 16),
                      utils.read_range(read))
示例#21
0
 def make_test_aligner(self, ref_seq=None, region=None):
     """Builds an aligner.Aligner with fixed options for use in tests.

     Args:
       ref_seq: reference sequence; a falsy value selects 'AAAAAAA'.
       region: range the reference sequence corresponds to; a falsy value
         selects contig 'ref' starting at 10 and as long as ref_seq.

     Returns:
       The aligner.Aligner built from the options, region and sequence.
     """
     options = realigner_pb2.RealignerOptions.AlignerOptions(
         match=1, mismatch=1, gap_open=2, gap_extend=1, k=3, error_rate=.02)
     if not ref_seq:
         ref_seq = 'AAAAAAA'
     if not region:
         region = ranges.make_range('ref', 10, 10 + len(ref_seq))
     return aligner.Aligner(options, region, ref_seq)
示例#22
0
def variant_range(variant):
    """Builds a Range spanning the full extent of a variant.

    Args:
      variant: third_party.nucleus.protos.Variant.

    Returns:
      A new Range whose reference_name, start, and end are taken from variant.
    """
    contig = variant.reference_name
    first, last = variant.start, variant.end
    return ranges.make_range(contig, first, last)
示例#23
0
def read_range(read):
  """Builds the Range proto covered by the alignment of a read.

  Args:
    read: the read whose range is wanted.

  Returns:
    A Range on the read's reference that starts at the alignment position and
    extends by the length cigar.alignment_length reports for the read's cigar.
  """
  alignment = read.alignment
  position = alignment.position
  begin = position.position
  length = cigar.alignment_length(alignment.cigar)
  return ranges.make_range(position.reference_name, begin, begin + length)
示例#24
0
def read_range(read):
    """Builds the Range proto covered by the alignment of a read.

    Args:
      read: the read whose range is wanted.

    Returns:
      A third_party.nucleus.protos.Range on the read's reference that starts
      at the alignment position and extends by the length that
      cigar.alignment_length reports for the read's cigar.
    """
    alignment = read.alignment
    position = alignment.position
    begin = position.position
    length = cigar.alignment_length(alignment.cigar)
    return ranges.make_range(position.reference_name, begin, begin + length)
示例#25
0
def variant_position(variant):
    """Builds the one-base Range located at the start of a variant.

    Args:
      variant: third_party.nucleus.protos.Variant.

    Returns:
      A new Range on the variant's reference_name that begins at variant.start
      and ends at variant.start + 1, i.e. the single basepair at which the
      variant starts.
    """
    first = variant.start
    return ranges.make_range(variant.reference_name, first, first + 1)
示例#26
0
    def get_reads(self, variant):
        """Fetches the reads used to build the pileup image around variant.

        Args:
          variant: A third_party.nucleus.protos.Variant proto describing the
            variant we are creating the pileup image of.

        Returns:
          A list of third_party.nucleus.protos.Read protos: everything the SAM
          reader returns for the variant's extent widened on both sides by the
          read_overlap_buffer_bp option.
        """
        padding = self._options.read_overlap_buffer_bp
        padded_region = ranges.make_range(
            variant.reference_name,
            variant.start - padding,
            variant.end + padding)
        matches = self._sam_reader.query(padded_region)
        return list(matches)
示例#27
0
  def test_align_reads_simple(self, read_seq, expected_align_pos,
                              expected_cigar, comment):
    """Checks Aligner.align_reads() on simple reads, plain and hard-clipped.

    The alignment targets are
      - the original reference sequence, and
      - a sequence with 'AA' insertion at position 14 and
        'T' deletion at position 19.

    Args:
      read_seq: str, read sequence.
      expected_align_pos: int, expected aligned position.
      expected_cigar: [(int, str)], expected cigar information.
      comment: str, test comment; passed as the message of every assertion.
    """
    ref_seq = 'AAAAAAAAAAAAATGCATGGGGGATTTTTTTTTTT'
    alt_seq = 'AAAAAAAAAAAAATAAGCAGGGGGATTTTTTTTTTT'
    region = ranges.make_range('ref', 10, 10 + len(ref_seq))
    align_reads = self.make_test_aligner(ref_seq, region)
    # NOTE: the first part of the original explanation here was redacted. What
    # remains says that, for local alignment, the targets ensure that there
    # are enough exact matches between the reference and target for
    # end-to-end alignment.
    targets = [ref_seq, alt_seq]

    def realign(cigar):
      """Builds the test read with the given cigar and returns it realigned."""
      read = test_utils.make_read(
          read_seq,
          chrom='ref',
          start=0,
          cigar=cigar,
          quals=[64] * len(read_seq),
          name='read')
      return align_reads.align_reads(targets, [read])[0]

    # A read whose original cigar is a plain match: check position and cigar.
    plain = realign([(len(read_seq), 'M')])
    self.assertEqual(expected_align_pos, plain.alignment.position.position,
                     comment)
    self.assertEqual(
        _cigar.to_cigar_units(expected_cigar), list(plain.alignment.cigar),
        comment)

    # The same read with hard clips on both ends: the clips are expected to
    # wrap the cigar unchanged.
    clipped = realign([(2, 'H'), (len(read_seq), 'M'), (1, 'H')])
    clipped_cigar = [(2, 'H')] + expected_cigar + [(1, 'H')]
    self.assertEqual(
        _cigar.to_cigar_units(clipped_cigar), list(clipped.alignment.cigar),
        comment)
示例#28
0
    def get_reference_bases(self, variant):
        """Fetches the reference bases shown in the pileup image of variant.

        The window starts half_width bases before variant.start and is as wide
        as the width option.

        Args:
          variant: A third_party.nucleus.protos.Variant proto describing the
            variant we are creating the pileup image of.

        Returns:
          A string of reference bases, or None when the reference reader
          reports the window as an invalid interval.
        """
        window_start = variant.start - self.half_width
        window_end = window_start + self._options.width
        window = ranges.make_range(
            variant.reference_name, window_start, window_end)
        if not self._ref_reader.is_valid_interval(window):
            return None
        return self._ref_reader.bases(window)
示例#29
0
 def test_sanity_check_readalignment(self, ref_name, ref_start, ref_end,
                                     read_chrom, read_start, read_len,
                                     read_cigar, exception_msg):
     """Test Aligner.sanity_check_readalignment().

     Args:
       ref_name: contig name of the aligner's reference region.
       ref_start: start of the reference region.
       ref_end: end of the reference region; the reference sequence is
         ref_end - ref_start 'A' bases.
       read_chrom: contig the test read is placed on.
       read_start: start position of the test read.
       read_len: number of 'A' bases in the test read.
       read_cigar: cigar given to the test read.
       exception_msg: regex the ValueError message must match, or a falsy
         value when the check is expected to pass without raising.
     """
     region = ranges.make_range(ref_name, ref_start, ref_end)
     ref_seq = 'A' * (ref_end - ref_start)
     align_reads = self.make_test_aligner(ref_seq, region)
     read = test_utils.make_read('A' * read_len,
                                 chrom=read_chrom,
                                 start=read_start,
                                 cigar=read_cigar,
                                 quals=[64] * read_len,
                                 name='read')
     if exception_msg:
         # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
         # which newer Python versions no longer provide.
         with self.assertRaisesRegex(ValueError, exception_msg):
             align_reads.sanity_check_readalignment(read)
     else:
         align_reads.sanity_check_readalignment(read)
示例#30
0
def make_example(variant, alt_alleles, encoded_image, shape, image_format):
    """Builds a new tf.Example suitable for use with DeepVariant.

    Args:
      variant: third_party.nucleus.protos.Variant protobuf containing
        information about a candidate variant call.
      alt_alleles: A set of strings. Indicates the alternate alleles used as
        "alt" when constructing the image. Each one must be present in
        variant.alternate_bases.
      encoded_image: a Tensor of type tf.string. Should contain an image
        encoding the reference and read data supporting variant. The encoding
        should be consistent with the image_format argument.
      shape: a list of (width, height, channel).
      image_format: string. The scheme used to encode our image.

    Returns:
      A tf.Example proto containing the standard DeepVariant features:
      'locus', 'variant/encoded', 'alt_allele_indices/encoded',
      'image/encoded', 'image/format' and 'image/shape'.
    """
    example = example_pb2.Example()
    feature_map = example.features.feature

    # NOTE(review): the string features below go into bytes_list fields;
    # confirm the values are bytes where the runtime requires it.
    feature_map['locus'].bytes_list.value.append(
        ranges.to_literal(
            ranges.make_range(variant.reference_name, variant.start,
                              variant.end)))
    feature_map['variant/encoded'].bytes_list.value.append(
        variant.SerializeToString())

    # Translate each alt allele into its index within the variant's
    # alternate_bases, stored in ascending order.
    alternates = list(variant.alternate_bases)
    alt_indices = sorted(alternates.index(alt) for alt in alt_alleles)
    feature_map['alt_allele_indices/encoded'].bytes_list.value.append(
        deepvariant_pb2.CallVariantsOutput.AltAlleleIndices(
            indices=alt_indices).SerializeToString())

    feature_map['image/encoded'].bytes_list.value.append(encoded_image)
    feature_map['image/format'].bytes_list.value.append(image_format)
    feature_map['image/shape'].int64_list.value.extend(shape)
    return example