def test_k_exceeds_ref_length(self):
     """Regression test for b/64564513: k must be shorter than the reference.

     The reference has 7 bases, so k=7 (equal) and k=8 (longer) are both
     expected to make `debruijn_graph.build` return None. Per the original
     note, this input crashed before the bugfix.
     """
     reference = 'GATTACA'
     # No reads are supplied (empty list) for either k.
     for kmer_size in (7, 8):
         options = self.single_k_dbg_options(kmer_size)
         graph = debruijn_graph.build(reference, [], options)
         self.assertIsNone(graph)
 def test_k_exceeds_ref_length(self):
   """Regression test for b/64564513 (k >= reference length is rejected).

   Builds with k equal to the reference length (7) and one larger (8), and
   expects None each time. Per the original note, this crashed before the
   bugfix.
   """
   reference = 'GATTACA'
   for extra in (0, 1):
     # len('GATTACA') == 7, so this tries k=7 and then k=8.
     options = self.single_k_dbg_options(len(reference) + extra)
     self.assertIsNone(debruijn_graph.build(reference, [], options))
  def test_filtering_by_qual(self):
    """Verifies the graph expected when a read contains a low-quality base.

    The read 'GATGTACA' is given quality 1 at index 3 and 30 elsewhere. The
    expected graph holds only the six 2-mers of the reference; edges 1->2 and
    2->3 carry label 1, while the remaining edges carry label 3.
    """
    reference = 'GATTACA'
    read_bases = 'GATGTACA'
    base_quals = [30, 30, 30, 1, 30, 30, 30, 30]
    low_qual_read = test_utils.make_read(
        read_bases,
        chrom='chr20',
        start=1,
        cigar=[(len(read_bases), 'M')],
        quals=base_quals,
        name='read')

    expected_dot = """\
        digraph G {
        0[label=GA];
        1[label=AT];
        2[label=TT];
        3[label=TA];
        4[label=AC];
        5[label=CA];
        0->1 [label=3 color=red];
        1->2 [label=1 color=red];
        2->3 [label=1 color=red];
        3->4 [label=3 color=red];
        4->5 [label=3 color=red];
        }
        """

    # The same read is passed twice so that the read path is not pruned.
    options = self.single_k_dbg_options(2)
    graph = debruijn_graph.build(reference, [low_qual_read] * 2, options)
    self.assertGraphEqual(expected_dot, graph)
  def test_pruning_2(self):
    """Checks that edges outside the source-to-sink span are pruned away.

    The read 'CCGATGACACC' extends past both ends of the reference. The
    expected graph contains no node for those flanking k-mers: only the
    reference path (label 1) and the alternate path 0->5->6->7->4 (label 2).
    """
    reference = 'GATTACA'
    flanked_bases = 'CCGATGACACC'
    quals = [30] * len(flanked_bases)
    flanked_read = test_utils.make_read(
        flanked_bases,
        chrom='chr20',
        start=1,
        cigar=[(len(flanked_bases), 'M')],
        quals=quals,
        name='read')

    expected_dot = """\
        digraph G {
        0[label=GAT];
        1[label=ATT];
        2[label=TTA];
        3[label=TAC];
        4[label=ACA];
        5[label=ATG];
        6[label=TGA];
        7[label=GAC];
        0->1 [label=1 color=red];
        1->2 [label=1 color=red];
        2->3 [label=1 color=red];
        3->4 [label=1 color=red];
        0->5 [label=2];
        5->6 [label=2];
        6->7 [label=2];
        7->4 [label=2];
        }
        """

    # Two copies of the read keep the read path from being pruned.
    options = self.single_k_dbg_options(3)
    graph = debruijn_graph.build(reference, [flanked_read] * 2, options)
    self.assertGraphEqual(expected_dot, graph)
 def test_pruning_1(self):
   """Checks that a path supported by a single read is absent from the graph.

   Only one copy of the read 'GATGACA' is supplied, and the expected graph
   contains just the five reference 3-mers joined by label-1 edges.
   """
   reference = 'GATTACA'
   variant_bases = 'GATGACA'
   quals = [30] * len(variant_bases)
   lone_read = test_utils.make_read(
       variant_bases,
       chrom='chr20',
       start=1,
       cigar=[(len(variant_bases), 'M')],
       quals=quals,
       name='read')

   expected_dot = """\
       digraph G {
       0[label=GAT];
       1[label=ATT];
       2[label=TTA];
       3[label=TAC];
       4[label=ACA];
       0->1 [label=1 color=red];
       1->2 [label=1 color=red];
       2->3 [label=1 color=red];
       3->4 [label=1 color=red];
       }
       """
   options = self.single_k_dbg_options(3)
   graph = debruijn_graph.build(reference, [lone_read], options)
   self.assertGraphEqual(expected_dot, graph)
Пример #6
0
  def call_debruijn_graph(self, windows, reads):
    """Builds a de Bruijn graph per window and collects non-trivial haplotypes.

    Args:
      windows: iterable of window ranges; each must expose `start` and `end`.
      reads: reads handed to `sam.InMemorySamReader`, which is then queried
        once per processed window.

    Returns:
      A list of `realigner_pb2.CandidateHaplotypes`, one for each processed
      window whose candidate haplotypes are non-empty and differ from `[ref]`.
    """
    in_memory_reads = sam.InMemorySamReader(reads)
    results = []

    for window in windows:
      # Skip windows that are too wide or that the reference reader rejects.
      oversized = (
          window.end - window.start > self.config.ws_config.max_window_size)
      if oversized or not self.ref_reader.is_valid(window):
        continue

      ref = self.ref_reader.query(window)
      window_reads = list(in_memory_reads.query(window))

      with timer.Timer() as build_timer:
        graph = debruijn_graph.build(ref, window_reads, self.config.dbg_config)
      build_duration = build_timer.GetDuration()

      # A falsy graph (e.g. None) falls back to the reference alone.
      haplotypes = graph.candidate_haplotypes() if graph else [ref]
      if haplotypes and haplotypes != [ref]:
        results.append(
            realigner_pb2.CandidateHaplotypes(
                span=window, haplotypes=haplotypes))

      # Metrics are logged for every processed window, fallback or not.
      self.diagnostic_logger.log_graph_metrics(
          window, graph, haplotypes, build_duration)

    return results
Пример #7
0
    def call_debruijn_graph(self, windows, reads):
        """Builds a de Bruijn graph per window and gathers its haplotypes.

        Args:
            windows: iterable of window ranges exposing `start` and `end`.
            reads: iterable of reads; it is re-scanned for every processed
                window to pick the reads overlapping that window.

        Returns:
            A list of `realigner_pb2.CandidateHaplotypes`, one per processed
            window whose candidate haplotypes are non-empty and differ from
            `[ref]`.
        """
        results = []
        for window in windows:
            # Skip windows that are too wide or rejected by the ref reader.
            oversized = (window.end - window.start >
                         self.config.ws_config.max_window_size)
            if oversized or not self.ref_reader.is_valid_interval(window):
                continue

            ref = self.ref_reader.bases(window)

            # Keep only the reads whose range overlaps this window.
            dbg_reads = []
            for read in reads:
                if ranges.ranges_overlap(window, utils.read_range(read)):
                    dbg_reads.append(read)

            with timer.Timer() as build_timer:
                graph = debruijn_graph.build(ref, dbg_reads,
                                             self.config.dbg_config)
            build_duration = build_timer.GetDuration()

            # A falsy graph (e.g. None) falls back to the reference alone.
            haplotypes = graph.candidate_haplotypes() if graph else [ref]
            if haplotypes and haplotypes != [ref]:
                results.append(
                    realigner_pb2.CandidateHaplotypes(span=window,
                                                      haplotypes=haplotypes))

            # Metrics are logged for every processed window.
            self.diagnostic_logger.log_graph_metrics(window, graph,
                                                     haplotypes,
                                                     build_duration)

        return results
 def test_pruning_1(self):
     """Checks that a path traced by just one read is pruned from the graph.

     A single copy of the read 'GATGACA' is supplied; the expected graph
     holds only the five reference 3-mers, connected by label-1 edges.
     """
     reference = 'GATTACA'
     variant_bases = 'GATGACA'
     quals = [30] * len(variant_bases)
     cigar = [(len(variant_bases), 'M')]
     lone_read = test_utils.make_read(variant_bases,
                                      chrom='chr20',
                                      start=1,
                                      cigar=cigar,
                                      quals=quals,
                                      name='read')

     expected_dot = """\
     digraph G {
     0[label=GAT];
     1[label=ATT];
     2[label=TTA];
     3[label=TAC];
     4[label=ACA];
     0->1 [label=1 color=red];
     1->2 [label=1 color=red];
     2->3 [label=1 color=red];
     3->4 [label=1 color=red];
     }
     """
     options = self.single_k_dbg_options(3)
     graph = debruijn_graph.build(reference, [lone_read], options)
     self.assertGraphEqual(expected_dot, graph)
    def test_pruning_2(self):
        """Checks that edges outside the source-to-sink span are pruned.

        The read 'CCGATGACACC' extends past both ends of the reference. The
        expected graph has no node for those flanking k-mers: only the
        reference path (label 1) and the path 0->5->6->7->4 (label 2).
        """
        reference = 'GATTACA'
        flanked_bases = 'CCGATGACACC'
        quals = [30] * len(flanked_bases)
        cigar = [(len(flanked_bases), 'M')]
        flanked_read = test_utils.make_read(flanked_bases,
                                            chrom='chr20',
                                            start=1,
                                            cigar=cigar,
                                            quals=quals,
                                            name='read')

        expected_dot = """\
        digraph G {
        0[label=GAT];
        1[label=ATT];
        2[label=TTA];
        3[label=TAC];
        4[label=ACA];
        5[label=ATG];
        6[label=TGA];
        7[label=GAC];
        0->1 [label=1 color=red];
        1->2 [label=1 color=red];
        2->3 [label=1 color=red];
        3->4 [label=1 color=red];
        0->5 [label=2];
        5->6 [label=2];
        6->7 [label=2];
        7->4 [label=2];
        }
        """

        # Two copies of the read keep the read path from being pruned.
        options = self.single_k_dbg_options(3)
        graph = debruijn_graph.build(reference, [flanked_read] * 2, options)
        self.assertGraphEqual(expected_dot, graph)
Пример #10
0
    def test_filtering_by_qual(self):
        """Verifies the graph expected when a read has a low-quality base.

        The read 'GATGTACA' gets quality 1 at index 3 and 30 elsewhere. The
        expected graph holds only the six 2-mers of the reference; edges
        1->2 and 2->3 carry label 1, the remaining edges carry label 3.
        """
        reference = 'GATTACA'
        read_bases = 'GATGTACA'
        base_quals = [30, 30, 30, 1, 30, 30, 30, 30]
        cigar = [(len(read_bases), 'M')]
        low_qual_read = test_utils.make_read(read_bases,
                                             chrom='chr20',
                                             start=1,
                                             cigar=cigar,
                                             quals=base_quals,
                                             name='read')

        expected_dot = """\
        digraph G {
        0[label=GA];
        1[label=AT];
        2[label=TT];
        3[label=TA];
        4[label=AC];
        5[label=CA];
        0->1 [label=3 color=red];
        1->2 [label=1 color=red];
        2->3 [label=1 color=red];
        3->4 [label=3 color=red];
        4->5 [label=3 color=red];
        }
        """

        # The same read is passed twice so the read path is not pruned.
        options = self.single_k_dbg_options(2)
        graph = debruijn_graph.build(reference, [low_qual_read] * 2, options)
        self.assertGraphEqual(expected_dot, graph)
Пример #11
0
  def call_debruijn_graph(self, windows, reads):
    """Builds a de Bruijn graph per window and gathers its haplotypes.

    Args:
      windows: iterable of window ranges exposing `start` and `end`.
      reads: iterable of reads; it is re-scanned for every processed window
        to select the reads overlapping that window.

    Returns:
      A list of `realigner_pb2.CandidateHaplotypes`, one per processed window
      whose candidate haplotypes are non-empty and differ from `[ref]`.
    """
    results = []
    for window in windows:
      # Skip windows that are too wide or that the reference reader rejects.
      oversized = (
          window.end - window.start > self.config.ws_config.max_window_size)
      if oversized or not self.ref_reader.is_valid(window):
        continue

      ref = self.ref_reader.query(window)

      # Keep only the reads whose range overlaps this window.
      dbg_reads = []
      for read in reads:
        if ranges.ranges_overlap(window, utils.read_range(read)):
          dbg_reads.append(read)

      with timer.Timer() as build_timer:
        graph = debruijn_graph.build(ref, dbg_reads, self.config.dbg_config)
      build_duration = build_timer.GetDuration()

      # A falsy graph (e.g. None) falls back to the reference alone.
      haplotypes = graph.candidate_haplotypes() if graph else [ref]
      if haplotypes and haplotypes != [ref]:
        results.append(
            realigner_pb2.CandidateHaplotypes(
                span=window, haplotypes=haplotypes))

      # Metrics are logged for every processed window.
      self.diagnostic_logger.log_graph_metrics(
          window, graph, haplotypes, build_duration)

    return results
Пример #12
0
    def test_adding_edges_with_bad_positions(self, bad_position,
                                             dropped_edges):
        """Checks edge counts when one read position is low-quality or 'N'.

        The scenario runs twice: once with quality 1 at `bad_position` and
        once with the base there replaced by 'N'. Both runs are compared
        against the same expected graph.

        Args:
            bad_position: index into the 7-base read to corrupt, or None to
                leave the read untouched.
            dropped_edges: iterable of 'K1->K2' 2-mer edge strings that are
                expected to carry label 1 instead of 3. Presumably supplied
                by a parameterized-test decorator outside this view.
        """
        ref_str = 'GATTACA'
        read_str = 'GATTACA'

        # Node index of each reference 2-mer, in order of appearance.
        kmer_order = ['GA', 'AT', 'TT', 'TA', 'AC', 'CA']
        index_of = {kmer: idx for idx, kmer in enumerate(kmer_order)}

        # Translate 'GA->AT'-style edges into '0->1'-style index edges.
        dropped_index_edges = set()
        for kmer_edge in dropped_edges:
            src, dst = kmer_edge.split('->')
            dropped_index_edges.add('{}->{}'.format(index_of[src],
                                                    index_of[dst]))

        # The expectation does not depend on how the position is corrupted.
        edge_lines = []
        for edge in ['0->1', '1->2', '2->3', '3->4', '4->5']:
            label = 1 if edge in dropped_index_edges else 3
            edge_lines.append('{} [label={} color=red];'.format(edge, label))
        expected_edges = '\n'.join(edge_lines)
        expected_dot = """\
            digraph G {
            0[label=GA];
            1[label=AT];
            2[label=TT];
            3[label=TA];
            4[label=AC];
            5[label=CA];
            %s
            }
            """ % expected_edges

        for bad_type in ['qual', 'base']:
            bases = list(read_str)
            quals = [30] * len(bases)
            cigar = [(len(bases), 'M')]
            if bad_position is not None:
                if bad_type == 'qual':
                    quals[bad_position] = 1
                elif bad_type == 'base':
                    bases[bad_position] = 'N'
                else:
                    raise ValueError('Unexpected base type')

            read = test_utils.make_read(''.join(bases),
                                        start=0,
                                        cigar=cigar,
                                        quals=quals)

            # Two copies of the read keep the read path from being pruned.
            options = self.single_k_dbg_options(2)
            graph = debruijn_graph.build(ref_str, [read] * 2, options)
            self.assertGraphEqual(expected_dot, graph)
Пример #13
0
  def test_straightforward_region(self):
    """Checks that chr20:10,000,000-10,000,100 yields only the reference.

    Builds a graph with k=30 from all reads the BAM returns for the region
    and expects the reference sequence to be the sole candidate haplotype.
    """
    region = ranges.parse_literal('chr20:10,000,000-10,000,100')
    ref_seq = fasta.RefFastaReader(testdata.CHR20_FASTA).query(region)
    region_reads = list(sam.SamReader(testdata.CHR20_BAM).query(region))

    options = self.single_k_dbg_options(30)
    graph = debruijn_graph.build(ref_seq, region_reads, options)

    self.assertIsNotNone(graph)
    self.assertEqual([ref_seq], graph.candidate_haplotypes())
Пример #14
0
    def test_straightforward_region(self):
        """Checks that chr20:10,000,000-10,000,100 yields only the reference.

        Builds a graph with k=30 from all reads the BAM returns for the
        region and expects the reference sequence to be the sole candidate
        haplotype.
        """
        region = ranges.parse_literal('chr20:10,000,000-10,000,100')
        ref_seq = fasta.IndexedFastaReader(testdata.CHR20_FASTA).query(region)
        region_reads = list(sam.SamReader(testdata.CHR20_BAM).query(region))

        options = self.single_k_dbg_options(30)
        graph = debruijn_graph.build(ref_seq, region_reads, options)

        self.assertIsNotNone(graph)
        self.assertEqual([ref_seq], graph.candidate_haplotypes())
Пример #15
0
 def test_ref_cycle_detector(self, ref, smallest_good_k):
   """Checks that the build result flips from None to a graph at the given k.

   Args:
     ref: reference sequence to build the graph from (no reads are used).
     smallest_good_k: smallest k for which a non-None graph is expected.

   The k values tried run from max(smallest_good_k - 5, 1) up to, but not
   including, min(smallest_good_k + 5, len(ref)).
   """
   lowest_k = max(smallest_good_k - 5, 1)
   k_limit = min(smallest_good_k + 5, len(ref))
   for k in range(lowest_k, k_limit):
     graph = debruijn_graph.build(ref, [], self.single_k_dbg_options(k))
     if k >= smallest_good_k:
       # From smallest_good_k onwards a real, non-None instance is expected.
       self.assertIsNotNone(graph, 'False cycle detected for k={}'.format(k))
       continue
     # Below smallest_good_k the build is expected to fail and return None.
     self.assertIsNone(graph, 'Cycle not detected for k={}'.format(k))
Пример #16
0
 def test_complex_region(self):
     """Checks the graph for chr20:10,095,379-10,095,500.

     Per the original note, this region has a heterozygous 9 bp deletion of
     a tandem TGA repeat. With the default options the test expects a
     k-mer size of 44 and two candidate haplotypes, one being the reference.
     """
     region = ranges.parse_literal('chr20:10,095,379-10,095,500')
     ref_seq = fasta.IndexedFastaReader(testdata.CHR20_FASTA).query(region)
     region_reads = list(sam.SamReader(testdata.CHR20_BAM).query(region))

     graph = debruijn_graph.build(ref_seq, region_reads, self.dbg_options())

     self.assertIsNotNone(graph)
     self.assertEqual(44, graph.kmer_size)
     haplotype_count = len(graph.candidate_haplotypes())
     self.assertEqual(2, haplotype_count)
     self.assertIn(ref_seq, graph.candidate_haplotypes())
Пример #17
0
 def test_complex_region(self):
   """Checks the graph for chr20:10,095,379-10,095,500.

   Per the original note, this region has a heterozygous 9 bp deletion of a
   tandem TGA repeat. With the default options the test expects a k-mer size
   of 44 and two candidate haplotypes, one of which is the reference.
   """
   region = ranges.parse_literal('chr20:10,095,379-10,095,500')
   ref_seq = fasta.RefFastaReader(testdata.CHR20_FASTA).query(region)
   region_reads = list(sam.SamReader(testdata.CHR20_BAM).query(region))

   graph = debruijn_graph.build(ref_seq, region_reads, self.dbg_options())

   self.assertIsNotNone(graph)
   self.assertEqual(44, graph.kmer_size)
   haplotype_count = len(graph.candidate_haplotypes())
   self.assertEqual(2, haplotype_count)
   self.assertIn(ref_seq, graph.candidate_haplotypes())
Пример #18
0
 def test_k_exceeds_read_length(self):
     """Regression test for b/64564513: k longer than the read must not crash.

     The read has 7 bases and k is 8. Per the original note, such a read
     adds no edges to the graph and this input crashed before the bugfix.
     The 9-base reference is longer than k, and a graph is still expected.
     """
     reference = 'GATTACATG'
     short_bases = 'GATGACA'
     quals = [30] * len(short_bases)
     cigar = [(len(short_bases), 'M')]
     short_read = test_utils.make_read(short_bases,
                                       chrom='chr20',
                                       start=1,
                                       cigar=cigar,
                                       quals=quals,
                                       name='read')

     options = self.single_k_dbg_options(8)
     graph = debruijn_graph.build(reference, [short_read] * 2, options)
     self.assertIsNotNone(graph)
Пример #19
0
 def test_k_exceeds_read_length(self):
   """Regression test for b/64564513: k longer than the read must not crash.

   The read has 7 bases and k is 8. Per the original note, such a read adds
   no edges to the graph and this input crashed before the bugfix. The
   9-base reference is longer than k, and a graph is still expected.
   """
   reference = 'GATTACATG'
   short_bases = 'GATGACA'
   quals = [30] * len(short_bases)
   short_read = test_utils.make_read(
       short_bases,
       chrom='chr20',
       start=1,
       cigar=[(len(short_bases), 'M')],
       quals=quals,
       name='read')

   options = self.single_k_dbg_options(8)
   graph = debruijn_graph.build(reference, [short_read] * 2, options)
   self.assertIsNotNone(graph)
Пример #20
0
  def test_basics(self):
    """Basic example: one reference path plus one read-supported alternative.

    Checks that the single-k options report min_k == 3, that the candidate
    haplotypes are exactly the reference and the read sequence, and that
    the graph matches the expected DOT description.
    """
    reference = 'GATTACA'
    variant_bases = 'GATGACA'
    quals = [30] * len(variant_bases)
    variant_read = test_utils.make_read(
        variant_bases,
        chrom='chr20',
        start=1,
        cigar=[(len(variant_bases), 'M')],
        quals=quals,
        name='read')

    expected_dot = """\
          digraph G {
          0[label=GAT];
          1[label=ATT];
          2[label=TTA];
          3[label=TAC];
          4[label=ACA];
          5[label=ATG];
          6[label=TGA];
          7[label=GAC];
          0->1 [label=1 color=red];
          1->2 [label=1 color=red];
          2->3 [label=1 color=red];
          3->4 [label=1 color=red];
          0->5 [label=2];
          5->6 [label=2];
          6->7 [label=2];
          7->4 [label=2];
          }
          """

    self.assertEqual(self.single_k_dbg_options(3).min_k, 3)

    # Two copies of the read keep the read path from being pruned.
    build_options = self.single_k_dbg_options(3)
    graph = debruijn_graph.build(reference, [variant_read] * 2, build_options)

    self.assertItemsEqual([reference, variant_bases],
                          graph.candidate_haplotypes())
    self.assertGraphEqual(expected_dot, graph)
Пример #21
0
  def test_basics(self):
    """Basic example: one reference path plus one read-supported alternative.

    Checks that the single-k options report min_k == 3, that the candidate
    haplotypes are exactly the reference and the read sequence, and that
    the graph matches the expected DOT description.
    """
    reference = 'GATTACA'
    variant_bases = 'GATGACA'
    quals = [30] * len(variant_bases)
    variant_read = test_utils.make_read(
        variant_bases,
        chrom='chr20',
        start=1,
        cigar=[(len(variant_bases), 'M')],
        quals=quals,
        name='read')

    expected_dot = """\
          digraph G {
          0[label=GAT];
          1[label=ATT];
          2[label=TTA];
          3[label=TAC];
          4[label=ACA];
          5[label=ATG];
          6[label=TGA];
          7[label=GAC];
          0->1 [label=1 color=red];
          1->2 [label=1 color=red];
          2->3 [label=1 color=red];
          3->4 [label=1 color=red];
          0->5 [label=2];
          5->6 [label=2];
          6->7 [label=2];
          7->4 [label=2];
          }
          """

    self.assertEqual(self.single_k_dbg_options(3).min_k, 3)

    # Two copies of the read keep the read path from being pruned.
    build_options = self.single_k_dbg_options(3)
    graph = debruijn_graph.build(reference, [variant_read] * 2, build_options)

    self.assertItemsEqual([reference, variant_bases],
                          graph.candidate_haplotypes())
    self.assertGraphEqual(expected_dot, graph)