예제 #1
0
    def test_candidate_pos_low_qual(self):
        """Check WindowSelector.process_read() on reads with mixed qualities.

        Every read starts at position 10 against an all-'A' reference and
        carries base qualities of 10 and 30 next to 64. The reads are fed to
        the same selector, in order, and the positions yielded for each one
        are compared with the expected list.
        """
        selector = WindowSelector(self.test_ws_config())
        reference = 'A' * 100

        # One row per read: (bases, cigar, quals, name, expected positions).
        scenarios = [
            ('AAGA', '4M', [64, 64, 10, 30], 'read_1', []),
            ('AAGTA', '2M2I1M', [64, 64, 10, 30, 64], 'read_2', [11, 13]),
            ('TGATAC', '2S3M1S', [64, 10, 64, 64, 64, 64], 'read_3',
             [8, 11, 13]),
            ('AAGA', '2M1X1M', [64, 64, 30, 10], 'read_4', [12]),
        ]
        # All reads are built up front, before any of them is processed.
        prepared = [(test_utils.make_read(bases,
                                          start=10,
                                          cigar=cigar,
                                          quals=quals,
                                          name=name), expected)
                    for bases, cigar, quals, name, expected in scenarios]

        for read, expected in prepared:
            self.assertEqual(list(selector.process_read(reference, read)),
                             expected)
예제 #2
0
  def test_align_reads_simple(self, read_seq, expected_align_pos,
                              expected_cigar, comment):
    """Realign one read against two targets; check position and cigar.

    The candidate targets are the reference itself and a variant of it that
    carries an 'AA' insertion and a 'T' deletion. The read is realigned
    twice: first with a plain all-match cigar, then with the same bases
    flanked by hard clips, which are expected to reappear unchanged around
    `expected_cigar`.

    Args:
      read_seq: str, read sequence.
      expected_align_pos: int, expected aligned position.
      expected_cigar: [(int, str)], expected cigar information.
      comment: str, message attached to every assertion.
    """
    reference = 'AAAAAAAAAAAAATGCATGGGGGATTTTTTTTTTT'
    aligner_under_test = self.make_test_aligner(
        reference, ranges.make_range('ref', 10, 10 + len(reference)))
    # redacted
    # implemented. For local alignment, it ensures that there are enough exact
    # matches between the reference and target for end-to-end alignment.
    haplotypes = [reference, 'AAAAAAAAAAAAATAAGCAGGGGGATTTTTTTTTTT']
    read_len = len(read_seq)

    def realign(cigar):
      # Builds the read with the given cigar and returns its realigned form.
      original = test_utils.make_read(
          read_seq,
          chrom='ref',
          start=0,
          cigar=cigar,
          quals=[64] * read_len,
          name='read')
      return aligner_under_test.align_reads(haplotypes, [original])[0]

    # Plain read: both the aligned position and the cigar are checked.
    realigned = realign([(read_len, 'M')])
    self.assertEqual(expected_align_pos,
                     realigned.alignment.position.position, comment)
    self.assertEqual(
        _cigar.to_cigar_units(expected_cigar),
        list(realigned.alignment.cigar), comment)

    # Hard-clipped read: only the cigar is checked.
    realigned = realign([(2, 'H'), (read_len, 'M'), (1, 'H')])
    clipped_cigar = [(2, 'H')] + expected_cigar + [(1, 'H')]
    self.assertEqual(
        _cigar.to_cigar_units(clipped_cigar),
        list(realigned.alignment.cigar), comment)
예제 #3
0
    def test_read_support_is_respected(self, read_name, read_number,
                                       alt_allele, read_base, supports_alt):
        """Check the supports-alt channel of an encoded single-base read.

        supports_alt is encoded as the 5th channel out of the 7 channels. It
        is also used here as an index into the two-element list of possible
        values for that channel.
        """
        variant = variants_pb2.Variant(reference_name='chr1',
                                       start=10,
                                       end=11,
                                       reference_bases='A',
                                       alternate_bases=[alt_allele])
        support = {
            'C': _supporting_reads('read1/1', 'read3/2'),
            'G': _supporting_reads('read2/1', 'read2/2'),
        }
        call = deepvariant_pb2.DeepVariantCall(variant=variant,
                                               allele_support=support)

        read = test_utils.make_read(read_base,
                                    start=call.variant.start,
                                    cigar='1M',
                                    quals=[50],
                                    name=read_name)
        read.read_number = read_number

        # The encoded window starts one base before the variant, so the read's
        # single base is looked up at column 1 below.
        encoded = _make_encoder().encode_read(call, 'TAT', read,
                                              call.variant.start - 1,
                                              alt_allele)

        base_channel = {'C': 30, 'G': 180}[read_base]
        supports_alt_channel = [152, 254][supports_alt]
        wanted = [
            base_channel, 254, 211, 70, supports_alt_channel, 254, 1
        ]
        self.assertEqual(list(encoded[0, 1]), wanted)
예제 #4
0
 def test_encode_read_insertion(self):
     """Encode a read with a one-base insertion and compare all channels.

     Alignment being encoded:
       ref:  AA-CAG
       read: AAACAG
     """
     read_start = 2
     read = test_utils.make_read('AAACAG',
                                 start=read_start,
                                 cigar='2M1I3M',
                                 quals=range(10, 16),
                                 name='read1')
     call = _make_dv_call()
     alt = call.variant.alternate_bases[0]

     # One tuple per channel, one entry per reference column.
     base = (250, 0, 30, 250, 180)
     base_quality = (63, 76, 82, 88, 95)
     mapping_quality = (211, 211, 211, 211, 211)
     strand = (70, 70, 70, 70, 70)  # forward or reverse
     supports_alt = (254, 254, 254, 254, 254)
     matches_ref = (50, 254, 50, 50, 50)
     cigar_op_length = (2, 1, 3, 3, 3)
     expected_row = np.dstack([
         base, base_quality, mapping_quality, strand, supports_alt,
         matches_ref, cigar_op_length
     ]).astype(np.uint8)

     encoded = _make_encoder().encode_read(call, 'AACAG', read, read_start,
                                           alt)
     self.assertImageRowEquals(encoded, expected_row)
예제 #5
0
 def test_encode_read_deletion(self):
     """Encode a read with a two-base deletion and compare all channels.

     Alignment being encoded:
       ref:  AACAG
       read: AA--G
     """
     read_start = 2
     read = test_utils.make_read('AAG',
                                 start=read_start,
                                 cigar='2M2D1M',
                                 quals=range(10, 13),
                                 name='read1')
     call = _make_dv_call()
     alt = call.variant.alternate_bases[0]

     # One tuple per channel, one entry per reference column. The two deleted
     # columns are expected to be 0 in every channel.
     # The second A is 0 because it's the anchor of the deletion.
     base = (250, 0, 0, 0, 180)
     base_quality = (63, 69, 0, 0, 76)
     mapping_quality = (211, 211, 0, 0, 211)
     strand = (70, 70, 0, 0, 70)  # forward or reverse
     supports_alt = (254, 254, 0, 0, 254)
     matches_ref = (50, 254, 0, 0, 50)
     cigar_op_length = (2, 2, 0, 0, 1)
     expected_row = np.dstack([
         base, base_quality, mapping_quality, strand, supports_alt,
         matches_ref, cigar_op_length
     ]).astype(np.uint8)

     encoded = _make_encoder().encode_read(call, 'AACAG', read, read_start,
                                           alt)
     self.assertImageRowEquals(encoded, expected_row)
예제 #6
0
  def test_realign_read(self, read_seq, target_seq, expected_align_start,
                        expected_cigar, comment):
    """Realign one read against a single target via realign_read().

    A falsy `expected_align_start` (None, but also 0) selects the branch that
    expects the read to end up without target, target offset and alignment.
    Otherwise the target, the absolute start (target offset plus the
    alignment's target_begin) and the cigar are checked.

    Args:
      read_seq: str, read sequence.
      target_seq: str, the only target, also used as the aligner's reference.
      expected_align_start: int or None, expected start of the alignment.
      expected_cigar: expected value of read.alignment.cigar.
      comment: str, message attached to every assertion.
    """
    all_match = [(len(read_seq), 'M')]
    wrapped = aligner.Read(
        test_utils.make_read(
            read_seq,
            chrom='ref',
            start=0,
            cigar=all_match,
            quals=[64] * len(read_seq),
            name='read'))
    aligner_under_test = self.make_test_aligner(ref_seq=target_seq)
    aligner_under_test.set_targets([target_seq])

    aligner_under_test.realign_read(wrapped)

    if not expected_align_start:
      # No alignment expected: nothing should have been attached to the read.
      self.assertIsNone(wrapped.target, comment)
      self.assertIsNone(wrapped.target_offset, comment)
      self.assertIsNone(wrapped.alignment, comment)
      return

    self.assertEqual(aligner_under_test.targets[0], wrapped.target, comment)
    actual_start = wrapped.target_offset + wrapped.alignment.target_begin
    self.assertEqual(expected_align_start, actual_start, comment)
    self.assertEqual(expected_cigar, wrapped.alignment.cigar, comment)
예제 #7
0
    def test_encode_read_matches(self):
        """Encode a gap-free 5M read and compare every channel of the row.

        The read 'ACCGT' is encoded against reference bases 'ACAGT', so the
        two differ only in the third column.
        """
        read_start = 10
        call = _make_dv_call()
        alt = call.variant.alternate_bases[0]
        read = test_utils.make_read('ACCGT',
                                    start=read_start,
                                    cigar='5M',
                                    quals=range(10, 15),
                                    name='read1')

        # One tuple per channel, one entry per reference column.
        base = (250, 30, 30, 180, 100)
        base_quality = (63, 69, 76, 82, 88)
        mapping_quality = (211, 211, 211, 211, 211)
        strand = (70, 70, 70, 70, 70)  # forward or reverse
        supports_alt = (254, 254, 254, 254, 254)
        matches_ref = (50, 50, 254, 50, 50)
        cigar_op_length = (5, 5, 5, 5, 5)
        expected_row = np.dstack([
            base, base_quality, mapping_quality, strand, supports_alt,
            matches_ref, cigar_op_length
        ]).astype(np.uint8)

        encoded = _make_encoder().encode_read(call, 'ACAGT', read,
                                              read_start, alt)
        self.assertImageRowEquals(encoded, expected_row)
예제 #8
0
  def test_no_bad_soft_clipping(self):
    """Expect a full-length 5M realignment rather than a soft-clipped one.

    The test is skipped unconditionally by the first statement; everything
    after skipTest() only runs once that call is removed.
    """
    self.skipTest('Enable when b/63143285 global alignment is fixed')
    shared = 'CTA'
    bases = shared + 'GA'
    reference = 'N' + shared + 'CA' + 'N'
    haplotypes = [reference, 'A' + reference]

    aligner_under_test = self.make_test_aligner(
        reference, ranges.make_range('ref', 0, len(reference)))

    original = test_utils.make_read(
        bases,
        chrom='ref',
        start=0,
        cigar=[(len(bases), 'M')],
        quals=[35] * len(bases),
        name='read')
    realigned = aligner_under_test.align_reads(haplotypes, [original])[0]

    # The read differs from the reference by a single base, so a plain 5M
    # alignment is what we'd expect for this read:
    # read_seq: -CTAGA-
    # ref_seq : NCTACAN
    # But the current algorithm produces a local alignment of the read against
    # the haplotypes, and the G <=> C mismatch causes the local aligner to
    # simply skip those bases instead of incurring the mismatch penalty for it,
    # resulting in a 3M2S read (GA clipped off) instead of the better 5M result.
    full_match = [_cigar.to_cigar_unit(len(bases), 'M')]
    self.assertEqual(full_match, list(realigned.alignment.cigar))
    def test_filtering_by_qual(self):
        """Test that we filter out edges containing low-quality basecalls.

        The read's fourth base has quality 1 while all others have 30; the
        graph built with k=2 is compared against the expected DOT text.
        """
        reference = 'GATTACA'
        bases = 'GATGTACA'
        per_base_quals = [30, 30, 30, 1, 30, 30, 30, 30]
        read = test_utils.make_read(bases,
                                    chrom='chr20',
                                    start=1,
                                    cigar=[(len(bases), 'M')],
                                    quals=per_base_quals,
                                    name='read')
        expected_dot = """\
        digraph G {
        0[label=GA];
        1[label=AT];
        2[label=TT];
        3[label=TA];
        4[label=AC];
        5[label=CA];
        0->1 [label=3 color=red];
        1->2 [label=1 color=red];
        2->3 [label=1 color=red];
        3->4 [label=3 color=red];
        4->5 [label=3 color=red];
        }
        """

        # Use two reads so read path doesn't get pruned.
        graph = debruijn_graph.build(reference, [read, read],
                                     self.single_k_dbg_options(2))
        self.assertGraphEqual(expected_dot, graph)
    def test_pruning_2(self):
        """Test that pruning removes edges not between source and sink.

        The read extends the reference-like core with 'CC' on both ends; none
        of the k-mers touching those flanks appear in the expected graph.
        """
        reference = 'GATTACA'
        bases = 'CCGATGACACC'
        read = test_utils.make_read(bases,
                                    chrom='chr20',
                                    start=1,
                                    cigar=[(len(bases), 'M')],
                                    quals=[30] * len(bases),
                                    name='read')
        expected_dot = """\
        digraph G {
        0[label=GAT];
        1[label=ATT];
        2[label=TTA];
        3[label=TAC];
        4[label=ACA];
        5[label=ATG];
        6[label=TGA];
        7[label=GAC];
        0->1 [label=1 color=red];
        1->2 [label=1 color=red];
        2->3 [label=1 color=red];
        3->4 [label=1 color=red];
        0->5 [label=2];
        5->6 [label=2];
        6->7 [label=2];
        7->4 [label=2];
        }
        """

        # Use two reads so read path doesn't get pruned.
        graph = debruijn_graph.build(reference, [read, read],
                                     self.single_k_dbg_options(3))
        self.assertGraphEqual(expected_dot, graph)
 def test_pruning_1(self):
     """Test that pruning removes a path traced by only one read.

     Only a single copy of the read is passed to the graph builder, and the
     expected graph contains the reference path alone.
     """
     reference = 'GATTACA'
     bases = 'GATGACA'
     lone_read = test_utils.make_read(bases,
                                      chrom='chr20',
                                      start=1,
                                      cigar=[(len(bases), 'M')],
                                      quals=[30] * len(bases),
                                      name='read')
     expected_dot = """\
     digraph G {
     0[label=GAT];
     1[label=ATT];
     2[label=TTA];
     3[label=TAC];
     4[label=ACA];
     0->1 [label=1 color=red];
     1->2 [label=1 color=red];
     2->3 [label=1 color=red];
     3->4 [label=1 color=red];
     }
     """
     graph = debruijn_graph.build(reference, [lone_read],
                                  self.single_k_dbg_options(3))
     self.assertGraphEqual(expected_dot, graph)
예제 #12
0
 def setUp(self):
     """Create the shared fixtures: five reads and three assembled regions.

     Sets:
       self.reads: dict mapping each read's fragment_name to the read.
       self.regions: dict mapping 'r1'..'r3' to assembled regions.
       self.assembled_regions: the regions as a list, ordered by their key.
     """
     # (bases, start, cigar, name) for every read in the fixture.
     read_specs = [
         ('ACG', 1, '3M', 'read1'),
         ('ACG', 6, '3M', 'read2'),
         ('ACG', 9, '3M', 'read3'),
         ('ACG', 28, '3M', 'read4'),
         ('A' * 10, 3, '10M', 'read5'),
     ]
     self.reads = {}
     for bases, start, cigar, name in read_specs:
         built = test_utils.make_read(bases,
                                      start=start,
                                      cigar=cigar,
                                      name=name)
         self.reads[built.fragment_name] = built

     self.regions = {
         'r1': _test_assembled_region('chr1:1-5'),
         'r2': _test_assembled_region('chr1:10-15'),
         'r3': _test_assembled_region('chr1:20-30'),
     }
     ordered_keys = sorted(self.regions)
     self.assembled_regions = [self.regions[key] for key in ordered_keys]
예제 #13
0
    def test_realigner_doesnt_create_invalid_intervals(self):
        """Tests that read sets don't result in a crash in reference_fai.cc.

        Two batches of 20 identical 250-base reads are realigned against the
        same region. There are no assertions: the test passes as long as
        neither realign_reads() call raises.
        """
        # range() cannot be multiplied by an int on Python 3 (TypeError), so
        # materialise it as a list before repeating it to 250 qualities.
        quals = list(range(30, 35)) * 50

        read = test_utils.make_read('ACCGT' * 50,
                                    start=63025520 - 250,
                                    cigar='250M',
                                    quals=quals,
                                    name='read1')
        reads = [read] * 20
        region = ranges.parse_literal('chr20:63,025,320-63,025,520')
        self.reads_realigner.realign_reads(reads, region)

        # These reads are aligned off the edge of the contig.
        read = test_utils.make_read('TTATA' * 50,
                                    start=63025520 - 200,
                                    cigar='200M50S',
                                    quals=quals,
                                    name='read1')
        reads = [read] * 20
        self.reads_realigner.realign_reads(reads, region)
예제 #14
0
 def test_sw_start_offsets(self):
   """Check aligner._sw_start_offsets() with k=3 on mixed-case sequences."""
   kmer_size = 3

   query = aligner.Read(
       test_utils.make_read(
           'AaGAt', start=0, cigar=[(5, 'M')], quals=[64] * 5, name='read_1'))
   query.set_read_kmers(kmer_size)

   haplotype = aligner.Target('TgATCAGATAAG')
   haplotype.build_target_index(kmer_size)

   offsets = aligner._sw_start_offsets(haplotype.kmer_index, query.kmers)
   self.assertEqual([-1, 4, 9], offsets)
예제 #15
0
    def test_process_read(self):
        """Check WindowSelector.process_read() on high-quality reads.

        Five reads, all starting at position 10 with every base quality set
        to 64, cover match, insertion, deletion, soft-clip and explicit
        mismatch ('X') cigars. They are fed to the same selector in order and
        the yielded positions are compared with the expected list.
        """
        selector = WindowSelector(self.test_ws_config())
        reference = 'A' * 100

        # One row per read: (bases, cigar, name, expected positions).
        scenarios = [
            ('AAGA', '4M', 'read_1', [12]),
            ('AAGTA', '2M2I1M', 'read_2', [10, 11, 12, 13]),
            ('AAA', '2M2D1M', 'read_3', [12, 13]),
            ('TGATAC', '2S3M1S', 'read_4', [8, 9, 11, 13]),
            ('AAGA', '2M1X1M', 'read_5', [12]),
        ]
        # All reads are built up front, before any of them is processed.
        prepared = [(test_utils.make_read(bases,
                                          start=10,
                                          cigar=cigar,
                                          quals=[64] * len(bases),
                                          name=name), expected)
                    for bases, cigar, name, expected in scenarios]

        for read, expected in prepared:
            self.assertEqual(list(selector.process_read(reference, read)),
                             expected)
 def test_k_exceeds_read_length(self):
     """This is a regression test for b/64564513.

     The graph is requested with k=8 for a 7-base read; the only check is
     that build() returns something other than None.
     """
     # If k > read length, no edges will go into the graph from this read.
     # This crashed prior to the bugfix.
     reference = 'GATTACATG'
     bases = 'GATGACA'
     short_read = test_utils.make_read(bases,
                                       chrom='chr20',
                                       start=1,
                                       cigar=[(len(bases), 'M')],
                                       quals=[30] * len(bases),
                                       name='read')
     options = self.single_k_dbg_options(8)
     graph = debruijn_graph.build(reference, [short_read, short_read],
                                  options)
     self.assertIsNotNone(graph)
예제 #17
0
 def test_sanity_check_readalignment(self, ref_name, ref_start, ref_end,
                                     read_chrom, read_start, read_len,
                                     read_cigar, exception_msg):
     """Test Aligner.sanity_check_readalignment().

     The aligner is built over an all-'A' reference spanning
     [ref_start, ref_end) on `ref_name`, and the read is `read_len` 'A's.
     When `exception_msg` is truthy the check must raise a ValueError whose
     message matches it; otherwise the check is simply called and must not
     raise.
     """
     region = ranges.make_range(ref_name, ref_start, ref_end)
     reference = 'A' * (ref_end - ref_start)
     aligner_under_test = self.make_test_aligner(reference, region)
     candidate = test_utils.make_read('A' * read_len,
                                      chrom=read_chrom,
                                      start=read_start,
                                      cigar=read_cigar,
                                      quals=[64] * read_len,
                                      name='read')

     if not exception_msg:
         # Valid alignment: the check is expected to pass silently.
         aligner_under_test.sanity_check_readalignment(candidate)
         return

     with self.assertRaisesRegexp(ValueError, exception_msg):
         aligner_under_test.sanity_check_readalignment(candidate)
예제 #18
0
    def test_encode_read_spans2(self, bases_start, bases_end):
        """Encode a slice of a longer read that may only partly cover the row.

        The read is bases[bases_start:bases_end] of an 11-base sequence whose
        first base sits at position 7, while the encoded row covers the five
        reference positions starting at 10. Columns covered by the read are
        expected to hold the fully encoded values; all other columns are
        expected to stay zero.

        Args:
          bases_start: int, first index of the slice taken from the sequence.
          bases_end: int, end index (exclusive) of that slice.
        """
        all_bases = 'AAAACCGTCCC'
        all_quals = [9, 9, 9, 10, 11, 12, 13, 14, 8, 8, 8]
        first_base_pos = 7
        ref_start = 10
        ref_size = 5

        window = slice(bases_start, bases_end)
        read_bases = all_bases[window]
        read_quals = all_quals[window]
        read_start = first_base_pos + bases_start

        # Row as it would look if the read covered every reference column.
        op_len = bases_end - bases_start
        base = (250, 30, 30, 180, 100)
        base_quality = (63, 69, 76, 82, 88)
        mapping_quality = (211, 211, 211, 211, 211)
        strand = (70, 70, 70, 70, 70)  # forward or reverse
        supports_alt = (254, 254, 254, 254, 254)
        matches_ref = (50, 50, 254, 50, 50)
        cigar_op_length = [op_len] * 5
        fully_covered = np.dstack([
            base, base_quality, mapping_quality, strand, supports_alt,
            matches_ref, cigar_op_length
        ]).astype(np.uint8)

        # Copy over only the columns where read and reference window overlap.
        expected = np.zeros((1, ref_size, pileup_image.DEFAULT_NUM_CHANNEL),
                            dtype=np.uint8)
        overlap_begin = max(read_start, ref_start)
        overlap_end = min(read_start + len(read_bases), ref_start + ref_size)
        for pos in range(overlap_begin, overlap_end):
            column = pos - ref_start
            expected[0, column] = fully_covered[0, column]

        read = test_utils.make_read(read_bases,
                                    start=read_start,
                                    cigar=str(len(read_bases)) + 'M',
                                    quals=read_quals,
                                    name='read1')
        call = _make_dv_call()
        alt = call.variant.alternate_bases[0]
        encoded = _make_encoder().encode_read(call, 'ACAGT', read, ref_start,
                                              alt)
        self.assertImageRowEquals(encoded, expected)
예제 #19
0
 def test_align_read_with_whole_clippd_seq(self):
   """Test Aligner.align_reads() when the whole read sequence is clipped.

   The second target is the reference with an extra sequence spliced into
   its middle; the read is that extra sequence flanked by 'CCC' on both
   sides. The read returned by align_reads() must equal the input read.
   """
   reference = ('TTTGTTTGTTTGTGTTTGTGTTTTTGTTTGTTTGTGTTTGTGTTTGTTTGTGGTTTGTGT'
                'GTTTGTGTTTGTGTTGGTTTG')
   midpoint = len(reference) // 2
   aligner_under_test = self.make_test_aligner(reference)

   inserted = 'AAAAAGTGGGGGGGAAGTGGGGAAAAA'
   with_insertion = reference[:midpoint] + inserted + reference[midpoint:]
   haplotypes = [reference, with_insertion]

   bases = 'CCC' + inserted + 'CCC'
   original = test_utils.make_read(
       bases,
       chrom='ref',
       start=10,
       cigar=[(len(bases), 'M')],
       quals=[64] * len(bases),
       name='read')

   realigned = aligner_under_test.align_reads(haplotypes, [original])
   self.assertEqual(original, realigned[0],
                    'Read should have its original alignment.')
    def test_basics(self):
        """Basic example.

        Builds a k=3 graph from a reference and two copies of a read, then
        checks the options' min_k, the candidate haplotypes and the DOT text
        of the graph.
        """
        reference = 'GATTACA'
        bases = 'GATGACA'
        read = test_utils.make_read(bases,
                                    chrom='chr20',
                                    start=1,
                                    cigar=[(len(bases), 'M')],
                                    quals=[30] * len(bases),
                                    name='read')
        expected_dot = """\
          digraph G {
          0[label=GAT];
          1[label=ATT];
          2[label=TTA];
          3[label=TAC];
          4[label=ACA];
          5[label=ATG];
          6[label=TGA];
          7[label=GAC];
          0->1 [label=1 color=red];
          1->2 [label=1 color=red];
          2->3 [label=1 color=red];
          3->4 [label=1 color=red];
          0->5 [label=2];
          5->6 [label=2];
          6->7 [label=2];
          7->4 [label=2];
          }
          """

        self.assertEqual(self.single_k_dbg_options(3).min_k, 3)

        # Use two reads so read path doesn't get pruned.
        graph = debruijn_graph.build(reference, [read, read],
                                     self.single_k_dbg_options(3))

        haplotypes = graph.candidate_haplotypes()
        self.assertItemsEqual([reference, bases], haplotypes)
        self.assertGraphEqual(expected_dot, graph)
예제 #21
0
    def test_ignores_reads_with_low_quality_bases(self):
        """Check that the quality at the variant position gates encoding.

        The three-base read starts at position 1, so its middle base lies on
        the variant at position 2. Only that base's quality is varied:
        encode_read() must return None while it is below
        DEFAULT_MIN_BASE_QUALITY and a non-None value from the threshold up.
        """
        variant = variants_pb2.Variant(reference_name='chr1',
                                       start=2,
                                       end=3,
                                       reference_bases='A',
                                       alternate_bases=['C'])
        call = deepvariant_pb2.DeepVariantCall(variant=variant)
        encoder = _make_encoder()

        # Get the threshold the encoder uses.
        threshold = pileup_image.DEFAULT_MIN_BASE_QUALITY

        for middle_qual in range(0, threshold + 5):
            read = test_utils.make_read(
                'AAA',
                start=1,
                cigar='3M',
                quals=[threshold - 1, middle_qual, threshold + 1])
            row = encoder.encode_read(call, 'AACAG', read, 1, 'C')

            too_low = middle_qual < threshold
            check = self.assertIsNone if too_low else self.assertIsNotNone
            check(row)
예제 #22
0
    def setUp(self):
        """Build an image creator whose encoder is replaced by a mock.

        Sets up the variant call, a 3x4 image creator with a reference band
        of height 2, four 'AGC' reads starting at positions 0..3, and the
        canned rows the mocked encoder hands back for the reference and for
        each read (looked up by the read's fragment_name).
        """
        self.alt_allele = 'C'
        self.dv_call = _make_dv_call(ref_bases='G', alt_bases=self.alt_allele)
        self.pic = _make_image_creator(None,
                                       None,
                                       width=3,
                                       height=4,
                                       reference_band_height=2)
        self.ref = 'AGC'

        def agc_read(start, name):
            return test_utils.make_read('AGC',
                                        start=start,
                                        cigar='3M',
                                        name=name)

        self.read1 = agc_read(0, 'read1')
        self.read2 = agc_read(1, 'read2')
        self.read3 = agc_read(2, 'read3')
        self.read4 = agc_read(3, 'read4')

        row_shape = (1, 3, pileup_image.DEFAULT_NUM_CHANNEL)
        self.expected_rows = {
            'ref': np.asarray(range(0, 21), np.uint8).reshape(row_shape),
            'empty': np.zeros(row_shape, dtype=np.uint8),
            'read1': np.full(row_shape, 1, dtype=np.uint8),
            'read2': np.full(row_shape, 2, dtype=np.uint8),
            # read3 has no row: the mocked encoder returns None for it.
            'read3': None,
            'read4': np.full(row_shape, 3, dtype=np.uint8),
        }

        # Setup our shared mocks.
        fake_encoder = mock.Mock(spec=['encode_read', 'encode_reference'])
        fake_encoder.encode_reference.return_value = self.expected_rows['ref']

        # pylint: disable=unused-argument
        def canned_row(dv_call, refbases, read, pos, alt_allele):
            return self.expected_rows[read.fragment_name]

        fake_encoder.encode_read.side_effect = canned_row

        self.mock_enc_ref = fake_encoder.encode_reference
        self.mock_enc_read = fake_encoder.encode_read

        self.pic._encoder = fake_encoder