示例#1
0
    def test_wrap(self, fasta_filename):
        """Exercises the FAI-backed genome reference on a three-contig FASTA.

        Args:
          fasta_filename: name of the FASTA test-data file; its index is
            looked up under the same name with a '.fai' suffix.
        """
        expected_names = ['chrM', 'chr1', 'chr2']
        expected_lengths = [100, 76, 121]
        fasta = test_utils.genomics_core_testdata(fasta_filename)
        fai = test_utils.genomics_core_testdata(fasta_filename + '.fai')
        with reference_fai.GenomeReferenceFai.from_file(fasta, fai) as ref:
            # Summary metadata exposed by the reference.
            self.assertEqual(ref.n_contigs, 3)
            self.assertIn(fasta, ref.fasta_path)
            self.assertIn('GenomeReference backed by htslib FAI index',
                          str(ref))
            self.assertEqual(ref.contig_names, expected_names)
            self.assertEqual(ref.n_bp, sum(expected_lengths))

            # Base retrieval and interval validation on chrM.
            first_bases = ref.bases(ranges.make_range('chrM', 1, 10))
            self.assertEqual(first_bases, 'ATCACAGGT')
            in_bounds = ranges.make_range('chrM', 1, 10)
            self.assertTrue(ref.is_valid_interval(in_bounds))
            out_of_bounds = ranges.make_range('chrM', 1, 100000)
            self.assertFalse(ref.is_valid_interval(out_of_bounds))

            # Per-contig records must agree with the expected names/lengths
            # and be retrievable by name.
            self.assertEqual(len(ref.contigs), 3)
            observed_names = [entry.name for entry in ref.contigs]
            self.assertEqual(observed_names, expected_names)
            observed_lengths = [entry.n_bases for entry in ref.contigs]
            self.assertEqual(observed_lengths, expected_lengths)
            for entry in ref.contigs:
                name = entry.name
                self.assertEqual(ref.contig(name), entry)
                self.assertTrue(ref.has_contig(name))
                self.assertFalse(ref.has_contig(name + '.unknown'))
示例#2
0
    def test_detector_ranges(self):
        """Checks RangeSet.overlaps position by position on a small range set."""
        range_set = ranges.RangeSet([
            ranges.make_range('chr1', 0, 5),
            ranges.make_range('chr1', 8, 10),
            ranges.make_range('chr1', 12, 13),
            ranges.make_range('chr2', 2, 5),
        ])
        self.assertEqual(bool(range_set), True)
        self.assertEqual(len(range_set), 4)

        # (contig, position, expected overlaps() result), checked in order.
        expectations = [
            ('chr1', 0, True),
            ('chr1', 1, True),
            ('chr1', 2, True),
            ('chr1', 3, True),
            ('chr1', 4, True),
            ('chr1', 5, False),
            ('chr1', 6, False),
            ('chr1', 7, False),
            ('chr1', 8, True),
            ('chr1', 9, True),
            ('chr1', 10, False),
            ('chr1', 11, False),
            ('chr1', 12, True),
            ('chr1', 13, False),
            ('chr1', 100, False),
            ('chr1', 1000, False),
            ('chr2', 0, False),
            ('chr2', 1, False),
            ('chr2', 2, True),
            ('chr2', 3, True),
            ('chr2', 4, True),
            ('chr2', 5, False),
            ('chr2', 6, False),
            ('chr3', 3, False),
        ]
        for contig, position, expected in expectations:
            self.assertEqual(range_set.overlaps(contig, position), expected)
示例#3
0
    def test_envelops(self):
        """Checks RangeSet.envelops for two separate ranges on chr1."""
        first_start, first_end = 5, 10
        second_start = first_end + 1
        second_end = first_end + 5
        first = ranges.make_range('chr1', first_start, first_end)
        second = ranges.make_range('chr1', second_start, second_end)
        range_set = ranges.RangeSet([first, second])

        # A query starting before the first range is never enveloped.
        for begin in range(first_start):
            enveloped = range_set.envelops('chr1', begin, first_start + 1)
            self.assertFalse(enveloped)

        # Every sub-interval of the first range is enveloped.
        for begin in range(first_start, first_end):
            for finish in range(begin, first_end + 1):
                message = 'chr1 {} {} not enveloped'.format(begin, finish)
                self.assertTrue(
                    range_set.envelops('chr1', begin, finish), message)

        # A query starting in the first range and ending in the second is
        # not enveloped.
        for begin in range(first_start, first_end):
            for finish in range(second_start, second_end + 1):
                self.assertFalse(range_set.envelops('chr1', begin, finish))

        # A contig absent from the set is never enveloped.
        self.assertFalse(
            range_set.envelops('chr2', first_start, first_start + 1))
示例#4
0
 def test_read_range(self, update_cached_read_end_first):
   """Checks utils.read_range for reads with an insertion and a deletion.

   Args:
     update_cached_read_end_first: if true, read.cached_end is filled in from
       utils.read_end before read_range is called.
   """
   start = 10000001
   # (cigar, expected end offset from start): the expected range excludes the
   # 1-base insertion and includes the 16-base deletion.
   cases = [
       ('2M1I3M', 5),
       ('2M16D3M', 5 + 16),
   ]
   for cigar, end_offset in cases:
     read = test_utils.make_read(
         'AAACAG',
         chrom='chrX',
         start=start,
         cigar=cigar,
         quals=range(10, 16),
         name='read1')
     if update_cached_read_end_first:
       # Explicitly update cached_end.
       read.cached_end = utils.read_end(read, use_cached_read_end=False)
     expected = ranges.make_range('chrX', start, start + end_offset)
     self.assertEqual(expected, utils.read_range(read))
示例#5
0
 def test_from_regions_not_empty(self):
   """Checks RangeSet.from_regions on a bare contig and a sub-range literal.

   'chr1' is expected to yield chr1:0-10 and 'chr2:10-20' to yield chr2:9-20.
   The contig map comes from _TEST_CONTIGS, which is defined outside this
   block.
   """
   literals = ['chr1', 'chr2:10-20']
   expected = [
       ranges.make_range('chr1', 0, 10),
       ranges.make_range('chr2', 9, 20),
   ]
   # assertCountEqual is the Python 3 name of the order-insensitive
   # comparison; assertItemsEqual exists only on Python 2.
   self.assertCountEqual(
       expected,
       ranges.RangeSet.from_regions(
           literals, ranges.contigs_dict(_TEST_CONTIGS)))
示例#6
0
 def test_parse_literal_one_bp(self):
     """Checks parse_literal on literals that name a single position."""
     # Each expected range starts one below the literal position and ends at
     # it; the last literal uses a comma as a thousands separator.
     cases = [
         ('1:10', ('1', 9, 10)),
         ('1:100', ('1', 99, 100)),
         ('1:1,000', ('1', 999, 1000)),
     ]
     for literal, expected_args in cases:
         self.assertEqual(ranges.parse_literal(literal),
                          ranges.make_range(*expected_args))
示例#7
0
 def test_from_bed(self, bed_filename):
     """Checks that RangeSet.from_bed yields the four expected ranges.

     Args:
       bed_filename: name of the BED test-data file to load.
     """
     expected = [
         ranges.make_range('chr1', 1, 10),
         ranges.make_range('chr2', 20, 30),
         ranges.make_range('chr2', 40, 60),
         ranges.make_range('chr3', 80, 90),
     ]
     bed_path = test_utils.genomics_core_testdata(bed_filename)
     self.assertCountEqual(expected, ranges.RangeSet.from_bed(bed_path))
示例#8
0
 def test_partitions(self, interval_size, expected):
     """Checks RangeSet.partition output, in order, for three whole contigs.

     Args:
       interval_size: value passed to partition().
       expected: iterable of make_range argument tuples describing the
         partitions expected, in order.
     """
     contig_spans = [
         ('chrM', 0, 100),
         ('chr1', 0, 76),
         ('chr2', 0, 121),
     ]
     rangeset = ranges.RangeSet(
         [ranges.make_range(*span) for span in contig_spans])
     expected_ranges = [ranges.make_range(*args) for args in expected]
     actual_ranges = list(rangeset.partition(interval_size))
     self.assertEqual(expected_ranges, actual_ranges)
示例#9
0
 def test_partition_of_multiple_intervals(self, interval_size, expected):
     """Checks RangeSet.partition on three separate intervals of contig '1'.

     Args:
       interval_size: value passed to partition().
       expected: iterable of make_range argument tuples; compared to the
         partitions without regard to order.
     """
     interval_bounds = [(0, 10), (20, 40), (45, 50)]
     rangeset = ranges.RangeSet(
         [ranges.make_range('1', lo, hi) for lo, hi in interval_bounds])
     expected_ranges = [ranges.make_range(*args) for args in expected]
     self.assertCountEqual(expected_ranges,
                           rangeset.partition(interval_size))
示例#10
0
 def test_query_edge_cases(self):
     """Checks boundary queries against a four-base in-memory contig."""
     reader = fasta.InMemoryFastaReader([('1', 0, 'ACGT')])
     cases = [
         (0, 1, 'A'),     # The first base.
         (3, 4, 'T'),     # The last base.
         (0, 4, 'ACGT'),  # The entire sequence.
     ]
     for start, end, expected_bases in cases:
         region = ranges.make_range('1', start, end)
         self.assertEqual(reader.query(region), expected_bases)
示例#11
0
 def test_from_contigs(self):
     """Checks that from_contigs yields one 0..n_bases range per contig."""
     contig_sizes = [('chr1', 10), ('chr2', 5)]
     contigs = [
         reference_pb2.ContigInfo(name=name, n_bases=n_bases)
         for name, n_bases in contig_sizes
     ]
     expected = [
         ranges.make_range(name, 0, n_bases)
         for name, n_bases in contig_sizes
     ]
     self.assertCountEqual(expected, ranges.RangeSet.from_contigs(contigs))
示例#12
0
  def test_find_max_overlapping_returns_least_index(self):
    """Checks that index 0 is returned for both orderings of the two halves."""
    query_range = ranges.make_range('1', 0, 10)
    lower_half = ranges.make_range('1', 0, 5)
    upper_half = ranges.make_range('1', 5, 10)
    forward = [lower_half, upper_half]
    backward = forward[::-1]

    # Whichever half comes first, index 0 is the expected answer.
    for candidates in (forward, backward):
      found = ranges.find_max_overlapping(query_range, candidates)
      self.assertEqual(0, found)
示例#13
0
 def test_partitions_bad_interval_size_raises(self):
     """Checks that partition() rejects negative and zero sizes.

     Both sizes must raise a ValueError whose message matches 'max_size'.
     """
     for bad_size in (-10, 0):
         rangeset = ranges.RangeSet([ranges.make_range('chrM', 0, 100)])
         # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
         # which newer Python versions no longer provide.
         with self.assertRaisesRegex(ValueError, 'max_size'):
             # list() is necessary to force the generator to execute.
             list(rangeset.partition(bad_size))
示例#14
0
 def test_from_regions_not_empty(self):
   """Checks RangeSet.from_regions on a bare contig and a sub-range literal.

   'chr1' is expected to yield chr1:0-10 (the contig map gives it 10 bases)
   and 'chr2:10-20' to yield chr2:9-20.
   """
   literals = ['chr1', 'chr2:10-20']
   contig_map = {
       'chr1': reference_pb2.ContigInfo(name='chr1', n_bases=10),
       'chr2': reference_pb2.ContigInfo(name='chr2', n_bases=100),
   }
   expected = [
       ranges.make_range('chr1', 0, 10),
       ranges.make_range('chr2', 9, 20),
   ]
   # assertCountEqual is the Python 3 name of the order-insensitive
   # comparison; assertItemsEqual exists only on Python 2.
   self.assertCountEqual(
       expected, ranges.RangeSet.from_regions(literals, contig_map))
示例#15
0
 def test_bed_parser(self):
     """Checks that bed_parser reads three BED lines from a temp file in order."""
     bed_lines = [
         'chr20\t61724611\t61725646',
         'chr20\t61304163\t61305182',
         'chr20\t61286467\t61286789',
     ]
     test_bed_path = test_utils.test_tmpfile('test_bed_parser.bed',
                                             '\n'.join(bed_lines))
     expected = [
         ranges.make_range('chr20', 61724611, 61725646),
         ranges.make_range('chr20', 61304163, 61305182),
         ranges.make_range('chr20', 61286467, 61286789),
     ]
     parsed = list(ranges.bed_parser(test_bed_path))
     self.assertEqual(parsed, expected)
示例#16
0
 def test_bedpe_parser_skips_cross_chr_events(self):
     """Checks that parse_lines('bedpe') drops the chr20/chr21 record.

     The two same-contig records are each expected as one range running from
     the first start column to the last end column.
     """
     # pylint: disable=line-too-long
     cross_contig = 'chr20\t25763416\t25765517\tchr21\t25825181\t25826882\tP2_PM_20_1549\t63266\t+\tTYPE:DELETION'
     same_contig_a = 'chr20\t25972820\t25972991\tchr20\t26045347\t26045538\tP2_PM_20_696\t72548\t+\tTYPE:DELETION'
     same_contig_b = 'chr20\t23719873\t23721974\tchr20\t23794822\t23796523\tP2_PM_20_1548\t76450\t+\tTYPE:DELETION'
     data = [cross_contig, same_contig_a, same_contig_b]
     expected = [
         ranges.make_range('chr20', 25972820, 26045538),
         ranges.make_range('chr20', 23719873, 23796523),
     ]
     parsed = list(ranges.parse_lines(data, 'bedpe'))
     self.assertEqual(parsed, expected)
示例#17
0
 def test_bed_parser(self):
     """Checks that parse_lines('bed') turns three BED lines into ranges in order."""
     bed_lines = [
         'chr20\t61724611\t61725646',
         'chr20\t61304163\t61305182',
         'chr20\t61286467\t61286789',
     ]
     expected = [
         ranges.make_range('chr20', 61724611, 61725646),
         ranges.make_range('chr20', 61304163, 61305182),
         ranges.make_range('chr20', 61286467, 61286789),
     ]
     parsed = list(ranges.parse_lines(bed_lines, 'bed'))
     self.assertEqual(parsed, expected)
示例#18
0
 def test_dispatching_reader(self):
   """Checks FastaReader.query on an indexed and an unindexed FASTA file."""
   indexed_path = test_utils.genomics_core_testdata('test.fasta')
   with fasta.FastaReader(indexed_path) as reader:
     # For the indexed file, query() is expected to work and return bases.
     region = ranges.make_range('chrM', 1, 6)
     self.assertEqual(reader.query(region), 'ATCAC')

   unindexed_path = test_utils.genomics_core_testdata('unindexed.fasta')
   with fasta.FastaReader(unindexed_path) as reader:
     # For the unindexed file, query() is expected to be unsupported.
     with self.assertRaises(NotImplementedError):
       reader.query(ranges.make_range('chrM', 1, 5))
示例#19
0
 def setUp(self):
     """Creates the GFF writer under test and loads the expected output.

     Sets self.writer (writing to a temp 'output.gff'),
     self.expected_gff_content (the lines of the 'test_features.gff' test
     data file), self.header and self.record.
     """
     out_fname = test_utils.test_tmpfile('output.gff')
     self.writer = gff_writer.GffWriter.to_file(out_fname,
                                                gff_pb2.GffHeader(),
                                                gff_pb2.GffWriterOptions())
     # Read the golden file inside a context manager so the handle is closed
     # deterministically rather than left to garbage collection.
     golden_path = test_utils.genomics_core_testdata('test_features.gff')
     with open(golden_path) as golden_file:
         self.expected_gff_content = golden_file.readlines()
     self.header = gff_pb2.GffHeader(
         sequence_regions=[ranges.make_range('ctg123', 0, 1497228)])
     self.record = gff_pb2.GffRecord(
         range=ranges.make_range('ctg123', 1000, 1100))
示例#20
0
    def test_expand_raises_with_missing_contig_in_map(self):
        """Checks that expand() raises KeyError when contig '1' is not mapped."""
        contig_maps = [
            # Empty contig_map should raise.
            {},
            # Missing '1' from the contig map should raise.
            {'2': reference_pb2.ContigInfo(name='2', n_bases=50)},
        ]
        for contig_map in contig_maps:
            with self.assertRaises(KeyError):
                ranges.expand(ranges.make_range('1', 10, 20),
                              1,
                              contig_map=contig_map)
示例#21
0
 def test_bedpe_parser_skips_cross_chr_events(self):
     """Checks that bedpe_parser drops the chr20/chr21 record read from a file.

     The two same-contig records are each expected as one range running from
     the first start column to the last end column.
     """
     # pylint: disable=line-too-long
     bedpe_lines = [
         'chr20\t25763416\t25765517\tchr21\t25825181\t25826882\tP2_PM_20_1549\t63266\t+\tTYPE:DELETION',
         'chr20\t25972820\t25972991\tchr20\t26045347\t26045538\tP2_PM_20_696\t72548\t+\tTYPE:DELETION',
         'chr20\t23719873\t23721974\tchr20\t23794822\t23796523\tP2_PM_20_1548\t76450\t+\tTYPE:DELETION',
     ]
     data = '\n'.join(bedpe_lines)
     test_bedpe_path = test_utils.test_tmpfile('test_bedpe_parser2.bedpe',
                                               data)
     expected = [
         ranges.make_range('chr20', 25972820, 26045538),
         ranges.make_range('chr20', 23719873, 23796523),
     ]
     parsed = list(ranges.bedpe_parser(test_bedpe_path))
     self.assertEqual(parsed, expected)
示例#22
0
    def test_find_max_overlapping_allows_unordered_search_ranges(self):
        """Checks find_max_overlapping across every ordering of three ranges.

        The range 1:0-10 is the expected answer; the function must return its
        index in whichever permutation it is given.
        """
        query_range = ranges.make_range('1', 4, 12)
        best_match = ranges.make_range('1', 0, 10)
        search_ranges = [
            best_match,
            ranges.make_range('1', 10, 20),
            ranges.make_range('1', 12, 20),
        ]

        for ordering in itertools.permutations(search_ranges):
            expected_index = ordering.index(best_match)
            found_index = ranges.find_max_overlapping(query_range, ordering)
            self.assertEqual(expected_index, found_index)
示例#23
0
 def test_variant_position_and_range(self):
   """Checks variant_position, variant_range and variant_range_tuple.

   Uses a one-base-reference variant and a four-base-reference variant, both
   starting at position 10 of contig '1'.
   """
   short_ref = test_utils.make_variant(chrom='1', alleles=['A', 'C'], start=10)
   long_ref = test_utils.make_variant(
       chrom='1', alleles=['AGCT', 'C'], start=10)
   one_base = ranges.make_range('1', 10, 11)
   four_bases = ranges.make_range('1', 10, 14)

   # The position is the single starting base for both variants.
   self.assertEqual(one_base, variant_utils.variant_position(short_ref))
   self.assertEqual(one_base, variant_utils.variant_position(long_ref))
   # The range is expected to span the whole first allele.
   self.assertEqual(one_base, variant_utils.variant_range(short_ref))
   self.assertEqual(four_bases, variant_utils.variant_range(long_ref))
   # The tuple form carries the same coordinates as the range.
   self.assertEqual(('1', 10, 11),
                    variant_utils.variant_range_tuple(short_ref))
   self.assertEqual(('1', 10, 14),
                    variant_utils.variant_range_tuple(long_ref))
示例#24
0
 def test_good_query(self):
     """Checks the in-memory reader against the FASTA reader on many regions.

     For each contig, every (start, end) pair with
     0 <= start <= end < n_bases is queried on both readers.
     """
     for contig in self.fasta_reader.header.contigs:
         n_bases = contig.n_bases
         regions = (
             ranges.make_range(contig.name, start, end)
             for start in range(n_bases)
             for end in range(start, n_bases)
         )
         for region in regions:
             from_memory = self.in_mem.query(region)
             from_file = self.fasta_reader.query(region)
             self.assertEqual(from_memory, from_file)
示例#25
0
  def __init__(self, chromosomes):
    """Initializes an InMemoryFastaReader using data from chromosomes.

    Args:
      chromosomes: list[tuple]. The chromosomes we are caching in memory as a
        list of tuples. Each tuple must be exactly three elements in length,
        containing (chromosome name [str], start [int], bases [str]).

    Raises:
      ValueError: If any tuple has a negative start or an empty bases string.
    """
    super(InMemoryFastaReader, self).__init__()

    ref_seqs = []
    contigs = []
    for i, (contig_name, start, bases) in enumerate(chromosomes):
      if start < 0:
        # The message states the actual lower bound (0).
        raise ValueError('start={} must be >= 0 for chromosome={}'.format(
            start, contig_name))
      if not bases:
        raise ValueError(
            'Bases must contain at least one base, but got "{}"'.format(bases))

      end = start + len(bases)
      ref_seqs.append(reference_pb2.ReferenceSequence(
          region=ranges.make_range(contig_name, start, end), bases=bases))
      # The contig length is recorded as the end of the cached bases
      # (start + len(bases)), not len(bases), and pos_in_fasta is the tuple's
      # index in `chromosomes`.
      contigs.append(
          reference_pb2.ContigInfo(
              name=contig_name, n_bases=end, pos_in_fasta=i))

    self._reader = in_memory_fasta_reader.InMemoryFastaReader.create(
        contigs, ref_seqs)
    self.header = RefFastaHeader(contigs=self._reader.contigs)
示例#26
0
 def test_overlaps_variant_with_ranges(self):
     """Checks that variant_overlaps delegates to overlaps(contig, start).

     overlaps() is mocked to return True, so the result reflects only the
     delegation, not the contents of the range set.
     """
     range_set = ranges.RangeSet([ranges.make_range('chr1', 0, 5)])
     variant = variants_pb2.Variant(reference_name='chr2', start=10, end=11)
     patcher = mock.patch.object(range_set, 'overlaps', return_value=True)
     with patcher as mock_overlaps:
         result = range_set.variant_overlaps(variant)
         self.assertEqual(result, True)
         mock_overlaps.assert_called_once_with('chr2', 10)
示例#27
0
 def setUp(self):
     """Loads the GffRecord test protos and builds the matching header.

     Sets self.records (all records read from 'test_features.gff.tfrecord')
     and self.header (one sequence region, ctg123:0-1497228).
     """
     records_path = test_utils.genomics_core_testdata(
         'test_features.gff.tfrecord')
     record_iter = io_utils.read_tfrecords(records_path,
                                           proto=gff_pb2.GffRecord)
     self.records = list(record_iter)
     whole_sequence = ranges.make_range('ctg123', 0, 1497228)
     self.header = gff_pb2.GffHeader(sequence_regions=[whole_sequence])
示例#28
0
def make_ngs_error_examples(ref_path, vcf_path, bam_path):
  """Yields tf.Examples for training an ML model, one per usable read.

  Each tf.Example contains relevant features about the NGS read.

  Args:
    ref_path: str. A path to an indexed fasta file.
    vcf_path: str. A path to an indexed VCF file.
    bam_path: str. A path to a SAM/BAM file.

  Yields:
    A tuple (example, ngs_read_length, has_error), where example is a
    tf.Example, ngs_read_length is the length of the read's aligned sequence,
    and has_error is a boolean that is true when the aligned sequence differs
    from the reference bases over the read's range.
  """
  ref_reader = fasta.IndexedFastaReader(ref_path)
  vcf_reader = vcf.VcfReader(vcf_path)
  # An empty ReadRequirements proto is passed so the reader filters reads
  # using the proto's default values.
  sam_reader = sam.SamReader(
      bam_path, read_requirements=reads_pb2.ReadRequirements())

  # The readers are context managers; the `with` statement closes them once
  # iteration over the reads is finished.
  with ref_reader, vcf_reader, sam_reader:
    for read in sam_reader.iterate():
      cigar_units = read.alignment.cigar
      assert len(cigar_units) > 0
      leading_unit = cigar_units[0]

      # When the read begins with a soft clip, the sequence starts that many
      # bases before the alignment position.
      start = read.alignment.position.position
      if leading_unit.operation == cigar_pb2.CigarUnit.CLIP_SOFT:
        start -= leading_unit.operation_length
      end = start + len(read.aligned_sequence)
      read_range = ranges.make_range(
          read.alignment.position.reference_name, start, end)

      # Variants overlapping the read, then the reference bases it spans.
      variants = list(vcf_reader.query(read_range))
      ref_bases = ref_reader.query(read_range)

      # Skip reads that cannot be used as a training example.
      if not is_usable_training_example(read, variants, ref_bases):
        continue

      example = make_example(read, ref_bases)
      read_length = len(read.aligned_sequence)
      has_error = read.aligned_sequence != ref_bases
      yield example, read_length, has_error
示例#29
0
 def check_overlaps(chr1, start1, end1, chr2, start2, end2, expected):
   """Asserts whether a read on chr1:start1-end1 overlaps chr2:start2-end2.

   The read is built as an all-'A' sequence with a single match cigar of
   length end1 - start1. Both utils.read_overlaps_region and
   ranges.ranges_overlap must return `expected`.
   """
   read_length = end1 - start1
   match_cigar = '{}M'.format(read_length)
   read = test_utils.make_read(
       'A' * read_length, chrom=chr1, start=start1, cigar=match_cigar)
   region = ranges.make_range(chr2, start2, end2)

   via_utils = utils.read_overlaps_region(read, region)
   self.assertEqual(via_utils, expected)

   # ranges.ranges_overlap must agree with the result above.
   via_ranges = ranges.ranges_overlap(region, utils.read_range(read))
   self.assertEqual(via_ranges, expected)
示例#30
0
  def setUpClass(cls):
    """Builds a FASTA reader and an in-memory reader over the same contigs.

    cls.in_mem is populated with each contig's bases from 0 to n_bases, as
    returned by cls.fasta_reader for 'test.fasta'.
    """
    cls.fasta_reader = fasta.RefFastaReader(
        test_utils.genomics_core_testdata('test.fasta'))

    chromosomes = []
    for contig in cls.fasta_reader.header.contigs:
      whole_contig = ranges.make_range(contig.name, 0, contig.n_bases)
      bases = cls.fasta_reader.query(whole_contig)
      chromosomes.append((contig.name, 0, bases))
    cls.in_mem = fasta.InMemoryRefReader(chromosomes)