def test_parse_literal_one_bp(self):
    """Checks that 'contig:position' literals parse to one-base ranges.

    For a literal position p the expected range is make_range('1', p - 1, p);
    the last case shows that a thousands separator in the position is
    accepted.
    """
    cases = [
        ('1:10', 9, 10),
        ('1:100', 99, 100),
        ('1:1,000', 999, 1000),
    ]
    for literal, start, end in cases:
        self.assertEqual(
            ranges.parse_literal(literal), ranges.make_range('1', start, end))
def call_aligner(self, assembled_region):
    """Helper function to call the aligner module.

    Builds a reference window around the assembled region (extended on both
    sides by _REF_ALIGN_MARGIN and clamped to the contig), pads every
    haplotype with the same reference prefix/suffix, and hands the padded
    haplotypes plus the reads to aligner.Aligner.

    Args:
      assembled_region: object exposing `reads`, `region`, `read_span` and
        `haplotypes`.

    Returns:
      An empty list when the region has no reads; the original reads,
      untouched, when no reference suffix can be created; otherwise the
      result of Aligner.align_reads().
    """
    reads = assembled_region.reads
    if not reads:
        return []

    region = assembled_region.region
    span = assembled_region.read_span
    contig = region.reference_name

    # Window start: leftmost of reads/region minus the margin, never below 0.
    leftmost = min(span.start, region.start)
    ref_start = max(0, leftmost - _REF_ALIGN_MARGIN)

    # Window end: rightmost of reads/region plus the margin, never past the
    # end of the contig.
    contig_length = self.ref_reader.contig(contig).n_bases
    rightmost = max(span.end, region.end)
    ref_end = min(contig_length, rightmost + _REF_ALIGN_MARGIN)

    prefix_range = ranges.make_range(contig, ref_start, region.start)
    ref_prefix = self.ref_reader.bases(prefix_range)
    ref = self.ref_reader.bases(region)

    # If we can't create the ref suffix then return the original alignments.
    if ref_end <= region.end:
        return reads

    suffix_range = ranges.make_range(contig, region.end, ref_end)
    ref_suffix = self.ref_reader.bases(suffix_range)

    ref_region = ranges.make_range(contig, ref_start, ref_end)
    ref_seq = ref_prefix + ref + ref_suffix
    reads_aligner = aligner.Aligner(self.config.aln_config, ref_region, ref_seq)

    # Haplotypes only cover the region itself, so pad them to the window.
    padded_haplotypes = [
        ref_prefix + haplotype + ref_suffix
        for haplotype in assembled_region.haplotypes
    ]
    return reads_aligner.align_reads(padded_haplotypes, reads)
def test_wrap(self, fasta_filename):
    """Exercises the GenomeReferenceFai accessors on a three-contig FASTA.

    Args:
      fasta_filename: name of the FASTA test file; its index is expected at
        the same name with a '.fai' suffix.
    """
    expected_names = ['chrM', 'chr1', 'chr2']
    expected_lengths = [100, 76, 121]
    fasta_path = test_utils.genomics_core_testdata(fasta_filename)
    fai_path = test_utils.genomics_core_testdata(fasta_filename + '.fai')

    with reference_fai.GenomeReferenceFai.from_file(fasta_path, fai_path) as ref:
        # Summary properties of the reader.
        self.assertEqual(ref.n_contigs, 3)
        self.assertIn(fasta_path, ref.fasta_path)
        self.assertIn('GenomeReference backed by htslib FAI index', str(ref))
        self.assertEqual(ref.contig_names, expected_names)
        self.assertEqual(ref.n_bp, sum(expected_lengths))

        # Base retrieval and interval validation.
        self.assertEqual(
            ref.bases(ranges.make_range('chrM', 1, 10)), 'ATCACAGGT')
        in_bounds = ranges.make_range('chrM', 1, 10)
        self.assertTrue(ref.is_valid_interval(in_bounds))
        out_of_bounds = ranges.make_range('chrM', 1, 100000)
        self.assertFalse(ref.is_valid_interval(out_of_bounds))

        # Per-contig metadata.
        self.assertEqual(len(ref.contigs), 3)
        self.assertEqual([c.name for c in ref.contigs], expected_names)
        self.assertEqual([c.n_bases for c in ref.contigs], expected_lengths)
        for contig in ref.contigs:
            name = contig.name
            self.assertEqual(ref.contig(name), contig)
            self.assertTrue(ref.has_contig(name))
            self.assertFalse(ref.has_contig(name + '.unknown'))
def test_detector_ranges(self):
    """Probes RangeSet.overlaps() position by position around each range."""
    range_set = ranges.RangeSet([
        ranges.make_range('chr1', 0, 5),
        ranges.make_range('chr1', 8, 10),
        ranges.make_range('chr1', 12, 13),
        ranges.make_range('chr2', 2, 5),
    ])
    self.assertEqual(bool(range_set), True)
    self.assertEqual(len(range_set), 4)

    # (contig, position, expected overlaps() result). The expectations show
    # that a range's start position overlaps while its end position does not.
    overlap_cases = [
        ('chr1', 0, True),
        ('chr1', 1, True),
        ('chr1', 2, True),
        ('chr1', 3, True),
        ('chr1', 4, True),
        ('chr1', 5, False),
        ('chr1', 6, False),
        ('chr1', 7, False),
        ('chr1', 8, True),
        ('chr1', 9, True),
        ('chr1', 10, False),
        ('chr1', 11, False),
        ('chr1', 12, True),
        ('chr1', 13, False),
        ('chr1', 100, False),
        ('chr1', 1000, False),
        ('chr2', 0, False),
        ('chr2', 1, False),
        ('chr2', 2, True),
        ('chr2', 3, True),
        ('chr2', 4, True),
        ('chr2', 5, False),
        ('chr2', 6, False),
        # A contig that is absent from the set never overlaps.
        ('chr3', 3, False),
    ]
    for contig, position, expected in overlap_cases:
        self.assertEqual(range_set.overlaps(contig, position), expected)
def test_partitions(self, interval_size, expected):
    """Partitions three whole-contig ranges and compares to `expected`.

    Args:
      interval_size: value passed to RangeSet.partition().
      expected: iterable of argument tuples for ranges.make_range(), one per
        expected partition; order is not significant.
    """
    contig_lengths = [('chrM', 100), ('chr1', 76), ('chr2', 121)]
    rangeset = ranges.RangeSet(
        [ranges.make_range(name, 0, length) for name, length in contig_lengths])
    expected_ranges = [ranges.make_range(*args) for args in expected]
    self.assertCountEqual(expected_ranges, rangeset.partition(interval_size))
def test_partition_of_multiple_intervals(self, interval_size, expected):
    """Partitions several disjoint ranges on one contig.

    Args:
      interval_size: value passed to RangeSet.partition().
      expected: iterable of argument tuples for ranges.make_range(), one per
        expected partition; order is not significant.
    """
    bounds = [(0, 10), (20, 40), (45, 50)]
    rangeset = ranges.RangeSet(
        [ranges.make_range('1', start, end) for start, end in bounds])
    expected_ranges = [ranges.make_range(*args) for args in expected]
    self.assertCountEqual(expected_ranges, rangeset.partition(interval_size))
def test_from_bed(self):
    """RangeSet.from_bed() yields the ranges listed in the test.bed fixture."""
    expected_ranges = [
        ranges.make_range('chr1', 1, 10),
        ranges.make_range('chr2', 20, 30),
        ranges.make_range('chr2', 40, 60),
        ranges.make_range('chr3', 80, 90),
    ]
    bed_path = test_utils.genomics_core_testdata('test.bed')
    # Order-insensitive comparison of the ranges in the set.
    self.assertCountEqual(expected_ranges, ranges.RangeSet.from_bed(bed_path))
def test_find_max_overlapping_returns_least_index(self):
    """When two candidates tie on overlap, index 0 is reported.

    Both search ranges overlap the query by the same amount, so the result
    must be 0 regardless of which of them comes first.
    """
    query_range = ranges.make_range('1', 0, 10)
    forward = [ranges.make_range('1', 0, 5), ranges.make_range('1', 5, 10)]
    backward = forward[::-1]
    for candidates in (forward, backward):
        found = ranges.find_max_overlapping(query_range, candidates)
        self.assertEqual(0, found)
def test_from_contigs(self):
    """RangeSet.from_contigs() yields one range [0, n_bases) per contig."""
    contig_sizes = [('chr1', 10), ('chr2', 5)]
    contigs = [
        core_pb2.ContigInfo(name=name, n_bases=n_bases)
        for name, n_bases in contig_sizes
    ]
    expected_ranges = [
        ranges.make_range(name, 0, n_bases) for name, n_bases in contig_sizes
    ]
    self.assertCountEqual(expected_ranges,
                          ranges.RangeSet.from_contigs(contigs))
def test_from_regions_not_empty(self):
    """RangeSet.from_regions() resolves region literals via the contig map.

    A bare contig name is expected to expand to the whole contig (using
    n_bases from the contig map), while 'chr2:10-20' is expected to become
    make_range('chr2', 9, 20).
    """
    literals = ['chr1', 'chr2:10-20']
    contig_map = {
        'chr1': core_pb2.ContigInfo(name='chr1', n_bases=10),
        'chr2': core_pb2.ContigInfo(name='chr2', n_bases=100),
    }
    expected_ranges = [
        ranges.make_range('chr1', 0, 10),
        ranges.make_range('chr2', 9, 20),
    ]
    # assertCountEqual is the name used by the sibling tests; the older
    # assertItemsEqual spelling does not exist in Python 3's unittest.
    self.assertCountEqual(expected_ranges,
                          ranges.RangeSet.from_regions(literals, contig_map))
def test_variant_position_and_range(self):
    """Compares variant_position() and variant_range() for a SNP and a deletion.

    Both variants start at 10, so both share the same single-base position;
    only the four-base reference allele produces a wider range.
    """
    snp = test_utils.make_variant(chrom='1', alleles=['A', 'C'], start=10)
    deletion = test_utils.make_variant(
        chrom='1', alleles=['AGCT', 'C'], start=10)
    single_base = ranges.make_range('1', 10, 11)
    four_bases = ranges.make_range('1', 10, 14)

    self.assertEqual(single_base, variantutils.variant_position(snp))
    self.assertEqual(single_base, variantutils.variant_position(deletion))
    self.assertEqual(single_base, variantutils.variant_range(snp))
    self.assertEqual(four_bases, variantutils.variant_range(deletion))
def test_bed_parser(self):
    """parse_lines(..., 'bed') turns each BED line into a range, in order."""
    data = [
        'chr20\t61724611\t61725646',
        'chr20\t61304163\t61305182',
        'chr20\t61286467\t61286789',
    ]
    expected_ranges = [
        ranges.make_range('chr20', 61724611, 61725646),
        ranges.make_range('chr20', 61304163, 61305182),
        ranges.make_range('chr20', 61286467, 61286789),
    ]
    parsed = list(ranges.parse_lines(data, 'bed'))
    self.assertEqual(parsed, expected_ranges)
def test_bedpe_parser_skips_cross_chr_events(self):
    """BEDPE records whose two ends are on different contigs are dropped.

    The first record pairs chr20 with chr21 and is expected to be skipped;
    the other two are expected to become a single range running from the
    first start to the second end.
    """
    # pylint: disable=line-too-long
    data = [
        'chr20\t25763416\t25765517\tchr21\t25825181\t25826882\tP2_PM_20_1549\t63266\t+\tTYPE:DELETION',
        'chr20\t25972820\t25972991\tchr20\t26045347\t26045538\tP2_PM_20_696\t72548\t+\tTYPE:DELETION',
        'chr20\t23719873\t23721974\tchr20\t23794822\t23796523\tP2_PM_20_1548\t76450\t+\tTYPE:DELETION',
    ]
    expected_ranges = [
        ranges.make_range('chr20', 25972820, 26045538),
        ranges.make_range('chr20', 23719873, 23796523),
    ]
    parsed = list(ranges.parse_lines(data, 'bedpe'))
    self.assertEqual(parsed, expected_ranges)
def test_find_max_overlapping_allows_unordered_search_ranges(self):
    """The best-overlapping range is found in every ordering of the input.

    For each permutation of the search ranges, find_max_overlapping() must
    return the index at which the expected best range sits in that
    permutation.
    """
    query_range = ranges.make_range('1', 4, 12)
    best_match = ranges.make_range('1', 0, 10)
    search_ranges = [
        best_match,
        ranges.make_range('1', 10, 20),
        ranges.make_range('1', 12, 20),
    ]
    for ordering in itertools.permutations(search_ranges):
        expected_index = ordering.index(best_match)
        self.assertEqual(expected_index,
                         ranges.find_max_overlapping(query_range, ordering))
def read_span(self):
    """Returns the range spanned by all reads, computing it on first use.

    The span runs from the smallest read start to the largest read end and
    is cached in self._read_span. While there are no reads nothing is
    computed and the current cached value is returned as is.
    """
    # Already computed, or nothing to compute from.
    if self._read_span is not None or not self.reads:
        return self._read_span

    read_ranges = [utils.read_range(read) for read in self.reads]
    leftmost = min(r.start for r in read_ranges)
    rightmost = max(r.end for r in read_ranges)
    # The contig name is taken from the first read only.
    # NOTE(review): presumably all reads share one contig - confirm.
    self._read_span = ranges.make_range(read_ranges[0].reference_name,
                                        leftmost, rightmost)
    return self._read_span
def test_overlaps_variant_with_ranges(self):
    """variant_overlaps() delegates to overlaps() with the variant's start.

    overlaps() is patched to always answer True, so the assertion on the
    call arguments is what verifies the delegation.
    """
    range_set = ranges.RangeSet([ranges.make_range('chr1', 0, 5)])
    variant = variants_pb2.Variant(reference_name='chr2', start=10, end=11)
    with mock.patch.object(
        range_set, 'overlaps', return_value=True) as mock_overlaps:
        result = range_set.variant_overlaps(variant)
        self.assertEqual(result, True)
        mock_overlaps.assert_called_once_with('chr2', 10)
def test_no_bad_soft_clipping(self):
    """A read with one mismatch should realign as all-M, not be soft clipped.

    Currently skipped; see the skip message for the tracking bug.
    """
    self.skipTest('Enable when b/63143285 global alignment is fixed')
    common = 'CTA'
    read_seq = common + 'GA'
    ref_seq = 'N' + common + 'CA' + 'N'
    alt_seq = 'A' + ref_seq
    read_length = len(read_seq)

    region = ranges.make_range('ref', 0, len(ref_seq))
    align_reads = self.make_test_aligner(ref_seq, region)
    read = test_utils.make_read(
        read_seq,
        chrom='ref',
        start=0,
        cigar=[(read_length, 'M')],
        quals=[35] * read_length,
        name='read')
    realigned = align_reads.align_reads([ref_seq, alt_seq], [read])[0]

    # The desired cigar is 5M:
    #   read_seq: -CTAGA-
    #   ref_seq : NCTACAN
    # But the current algorithm produces a local alignment of the read against
    # the haplotypes, and the G <=> C mismatch causes the local aligner to
    # simply skip those bases instead of incurring the mismatch penalty for
    # it, resulting in a 3M2S read (GA clipped off) instead of the better 5M
    # result.
    expected_cigar = [_cigar.to_cigar_unit(read_length, 'M')]
    self.assertEqual(expected_cigar, list(realigned.alignment.cigar))
def test_call_from_allele_counter(self):
    """Runs the caller end to end on a 1 kb window of the chr20 test data."""
    window = 1000
    window_start = 10000000
    region = ranges.make_range('chr20', window_start, window_start + window)

    ref = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
    sam_reader = genomics_io.make_sam_reader(test_utils.CHR20_BAM)
    counter_options = deepvariant_pb2.AlleleCounterOptions(
        partition_size=window)
    allele_counter = _allelecounter.AlleleCounter(ref, region, counter_options)

    caller_options = deepvariant_pb2.VariantCallerOptions(
        min_count_snps=2,
        min_count_indels=2,
        min_fraction_snps=0.12,
        min_fraction_indels=0.12,
        sample_name='sample_name',
        p_error=0.001,
        max_gq=50,
        gq_resolution=1,
        ploidy=2)
    caller = variant_calling.VariantCaller(caller_options)

    # Feed every read overlapping the region into the allele counter.
    reads = list(sam_reader.query(region))
    self.assertNotEmpty(reads)
    for read in reads:
        allele_counter.add(read)

    # Call candidates for the whole region; there must be at least one and
    # every one of them must be a DeepVariantCall.
    candidates = caller.calls_from_allele_counter(allele_counter)
    self.assertNotEmpty(candidates)
    for candidate in candidates:
        self.assertIsInstance(candidate, deepvariant_pb2.DeepVariantCall)
def test_get_reference_bases_good_region(self):
    """get_reference_bases() validates and then fetches the expected window.

    With the variant start set to 10, the reference reader is expected to be
    asked exactly once each to validate and to read positions 8 to 13.
    """
    self.dv_call.variant.start = 10
    expected_region = ranges.make_range(self.variant.reference_name, 8, 13)

    bases = self.pic.get_reference_bases(self.variant)

    self.assertEqual('ACGT', bases)
    ref_reader = self.mock_ref_reader
    ref_reader.is_valid_interval.assert_called_once_with(expected_region)
    ref_reader.bases.assert_called_once_with(expected_region)
def test_read_range(self):
    """Tests reads have their ranges calculated correctly.

    The first read carries an insertion, and its expected range covers only
    the five matched bases. The second carries a 16-base deletion, and its
    expected range is 16 bases longer.
    """
    start = 10000001
    # (cigar, expected number of reference bases covered by the read).
    cases = [
        ('2M1I3M', 5),
        ('2M16D3M', 5 + 16),
    ]
    for cigar_str, expected_length in cases:
        read = test_utils.make_read(
            'AAACAG',
            chrom='chrX',
            start=start,
            cigar=cigar_str,
            quals=range(10, 16),
            name='read1')
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(
            ranges.make_range('chrX', start, start + expected_length),
            utils.read_range(read))
def make_test_aligner(self, ref_seq=None, region=None):
    """Builds an aligner.Aligner with fixed scoring options for tests.

    Args:
      ref_seq: reference sequence string; a falsy value selects 'AAAAAAA'.
      region: range for the reference; a falsy value selects a range on
        contig 'ref' starting at 10 and as long as ref_seq.

    Returns:
      An aligner.Aligner built from the options, region and ref_seq.
    """
    options = realigner_pb2.RealignerOptions.AlignerOptions(
        match=1,
        mismatch=1,
        gap_open=2,
        gap_extend=1,
        k=3,
        error_rate=0.02)
    if not ref_seq:
        ref_seq = 'AAAAAAA'
    if not region:
        region = ranges.make_range('ref', 10, 10 + len(ref_seq))
    return aligner.Aligner(options, region, ref_seq)
def variant_range(variant):
    """Builds the Range that covers a variant.

    Args:
      variant: third_party.nucleus.protos.Variant.

    Returns:
      A new Range whose reference_name, start and end are copied from
      variant.
    """
    contig = variant.reference_name
    start, end = variant.start, variant.end
    return ranges.make_range(contig, start, end)
def read_range(read):
    """Creates a Range proto from the alignment of a read.

    Args:
      read: the read whose range is calculated.

    Returns:
      A Range on the read's reference contig that starts at the alignment
      position and extends by the alignment length of the read's cigar.
    """
    alignment = read.alignment
    position = alignment.position
    start = position.position
    aligned_length = cigar.alignment_length(alignment.cigar)
    return ranges.make_range(position.reference_name, start,
                             start + aligned_length)
def read_range(read):
    """Creates a Range proto from the alignment of a read.

    Args:
      read: the read whose range is calculated.

    Returns:
      A third_party.nucleus.protos.Range that begins at the read's alignment
      position and whose length is the alignment length of its cigar.
    """
    aln = read.alignment
    begin = aln.position.position
    span = cigar.alignment_length(aln.cigar)
    contig = aln.position.reference_name
    return ranges.make_range(contig, begin, begin + span)
def variant_position(variant):
    """Builds a one-base Range at the start of a variant.

    Args:
      variant: third_party.nucleus.protos.Variant.

    Returns:
      A new Range with variant's reference_name and start, and an end of
      start + 1, i.e. just the first basepair of the variant - hence the
      name "position".
    """
    first_base = variant.start
    return ranges.make_range(variant.reference_name, first_base,
                             first_base + 1)
def get_reads(self, variant):
    """Fetches the reads used to build the pileup image around variant.

    The query window is the variant's own [start, end) widened on both sides
    by the configured read_overlap_buffer_bp.

    Args:
      variant: A third_party.nucleus.protos.Variant proto describing the
        variant we are creating the pileup image of.

    Returns:
      A list of third_party.nucleus.protos.Read protos.
    """
    padding = self._options.read_overlap_buffer_bp
    window = ranges.make_range(variant.reference_name,
                               variant.start - padding,
                               variant.end + padding)
    return list(self._sam_reader.query(window))
def test_align_reads_simple(self, read_seq, expected_align_pos,
                            expected_cigar, comment):
    """Test Aligner.align_reads(). Simple tests.

    Targets consist of
      - original reference sequence.
      - a sequence with 'AA' insertion at position 14 and
      - 'T' deletion at position 19.

    The read is aligned twice: once with a plain all-match cigar, and once
    with hard clips on both ends, which must be carried over unchanged.

    Args:
      read_seq: str, read sequence.
      expected_align_pos: int, expected aligned position
      expected_cigar: [(int, str)], expected cigar information.
      comment: str, test comment.
    """
    ref_seq = 'AAAAAAAAAAAAATGCATGGGGGATTTTTTTTTTT'
    alt_seq = 'AAAAAAAAAAAAATAAGCAGGGGGATTTTTTTTTTT'
    region = ranges.make_range('ref', 10, 10 + len(ref_seq))
    align_reads = self.make_test_aligner(ref_seq, region)

    # For local alignment, the shared flanks ensure that there are enough
    # exact matches between the reference and target for end-to-end
    # alignment.
    targets = [ref_seq, alt_seq]
    read_length = len(read_seq)
    base_quals = [64] * read_length

    # Case 1: read without clipping.
    read = test_utils.make_read(
        read_seq,
        chrom='ref',
        start=0,
        cigar=[(read_length, 'M')],
        quals=base_quals,
        name='read')
    aligned_reads = align_reads.align_reads(targets, [read])
    realigned = aligned_reads[0].alignment
    self.assertEqual(expected_align_pos, realigned.position.position, comment)
    self.assertEqual(
        _cigar.to_cigar_units(expected_cigar), list(realigned.cigar), comment)

    # Case 2: same read with hard clips, which must surround the new cigar.
    read = test_utils.make_read(
        read_seq,
        chrom='ref',
        start=0,
        cigar=[(2, 'H'), (read_length, 'M'), (1, 'H')],
        quals=base_quals,
        name='read')
    aligned_reads = align_reads.align_reads(targets, [read])
    expected_cigar_w_hard_clip = [(2, 'H')] + expected_cigar + [(1, 'H')]
    self.assertEqual(
        _cigar.to_cigar_units(expected_cigar_w_hard_clip),
        list(aligned_reads[0].alignment.cigar), comment)
def get_reference_bases(self, variant):
    """Fetches the reference bases for the pileup image around variant.

    The window starts half_width bases before the variant start and is
    `width` bases long.

    Args:
      variant: A third_party.nucleus.protos.Variant proto describing the
        variant we are creating the pileup image of.

    Returns:
      A string of reference bases, or None when the reference reader reports
      the window as not being a valid interval.
    """
    window_start = variant.start - self.half_width
    window_end = window_start + self._options.width
    window = ranges.make_range(variant.reference_name, window_start,
                               window_end)
    if not self._ref_reader.is_valid_interval(window):
        return None
    return self._ref_reader.bases(window)
def test_sanity_check_readalignment(self, ref_name, ref_start, ref_end,
                                    read_chrom, read_start, read_len,
                                    read_cigar, exception_msg):
    """Test Aligner.sanity_check_readalignment().

    Args:
      ref_name: contig name of the aligner's reference region.
      ref_start: start of the reference region.
      ref_end: end of the reference region.
      read_chrom: contig name the read is placed on.
      read_start: start position of the read.
      read_len: number of bases in the read.
      read_cigar: cigar given to the read.
      exception_msg: regex the ValueError message must match, or a falsy
        value when the check is expected to pass without raising.
    """
    region = ranges.make_range(ref_name, ref_start, ref_end)
    ref_seq = 'A' * (ref_end - ref_start)
    align_reads = self.make_test_aligner(ref_seq, region)
    read = test_utils.make_read(
        'A' * read_len,
        chrom=read_chrom,
        start=read_start,
        cigar=read_cigar,
        quals=[64] * read_len,
        name='read')

    if not exception_msg:
        # Must simply not raise.
        align_reads.sanity_check_readalignment(read)
        return

    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which was removed from unittest in Python 3.12.
    with self.assertRaisesRegex(ValueError, exception_msg):
        align_reads.sanity_check_readalignment(read)
def make_example(variant, alt_alleles, encoded_image, shape, image_format):
    """Creates a new tf.Example suitable for use with DeepVariant.

    Args:
      variant: third_party.nucleus.protos.Variant protobuf containing
        information about a candidate variant call.
      alt_alleles: A set of strings. Indicates the alternate alleles used as
        "alt" when constructing the image. Each must occur in
        variant.alternate_bases.
      encoded_image: a Tensor of type tf.string. Should contain an image
        encoding the reference and read data supporting variant. The
        encoding should be consistent with the image_format argument.
      shape: a list of (width, height, channel).
      image_format: string. The scheme used to encode our image.

    Returns:
      A tf.Example proto containing the standard DeepVariant features:
      'locus', 'variant/encoded', 'alt_allele_indices/encoded',
      'image/encoded', 'image/format' and 'image/shape'.
    """
    example = example_pb2.Example()
    feature = example.features.feature

    # Where the variant is and the variant itself.
    locus = ranges.to_literal(
        ranges.make_range(variant.reference_name, variant.start, variant.end))
    feature['locus'].bytes_list.value.append(locus)
    feature['variant/encoded'].bytes_list.value.append(
        variant.SerializeToString())

    # Sorted positions of the chosen alts within the variant's alternate
    # bases.
    all_alts = list(variant.alternate_bases)
    alt_indices = sorted([all_alts.index(alt) for alt in alt_alleles])
    encoded_indices = deepvariant_pb2.CallVariantsOutput.AltAlleleIndices(
        indices=alt_indices).SerializeToString()
    feature['alt_allele_indices/encoded'].bytes_list.value.append(
        encoded_indices)

    # The pileup image and how to decode it.
    feature['image/encoded'].bytes_list.value.append(encoded_image)
    feature['image/format'].bytes_list.value.append(image_format)
    feature['image/shape'].int64_list.value.extend(shape)
    return example