Example #1
 def test_roundtrip_writer(self, filename):
   output_path = test_utils.test_tmpfile(filename)
   original_reader = sam.SamReader(test_utils.genomics_core_testdata(filename))
   original_records = list(original_reader.iterate())
   with sam.SamWriter(output_path, header=original_reader.header) as writer:
     for record in original_records:
       writer.write(record)
   with sam.SamReader(output_path) as new_reader:
     self.assertEqual(original_records, list(new_reader.iterate()))
Example #2
 def _make_reader(self, filename, has_embedded_ref):
     if has_embedded_ref:
         # If we have an embedded reference, force the reader to use it by not
         # providing an argument for ref_path.
         return sam.SamReader(test_utils.genomics_core_testdata(filename))
     else:
         # Otherwise we need to explicitly override the reference encoded in
         # the UR tag of the CRAM file with the path to our test.fasta.
         return sam.SamReader(
             test_utils.genomics_core_testdata(filename),
             ref_path=test_utils.genomics_core_testdata('test.fasta'))
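For context, a sketch of how this helper might be exercised in a test; the CRAM filename is a hypothetical placeholder, not a file from the real test suite:
 def test_cram_iterate(self):
     # Hypothetical test built on the _make_reader helper above; the filename
     # is a placeholder for a CRAM file with an embedded reference.
     with self._make_reader('test.cram', has_embedded_ref=True) as reader:
         self.assertGreater(test_utils.iterable_len(reader.iterate()), 0)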
Example #3
def make_ngs_error_examples(ref_path, vcf_path, bam_path):
  """Yields tf.Examples for training an ML model.

  Each tf.Example contains relevant features about the NGS read.

  Args:
    ref_path: str. A path to an indexed fasta file.
    vcf_path: str. A path to an indexed VCF file.
    bam_path: str. A path to a SAM/BAM file.

  Yields:
    A tuple (example, ngs_read_length, has_error), where example is a
    tf.Example, ngs_read_length is the length of the read generated by the
    sequencer, and has_error is a boolean specifying whether the example
    contains a read error.
  """

  # Create a ref_reader backed by ref.
  ref_reader = fasta.IndexedFastaReader(ref_path)

  # Create a vcf_reader backed by vcf.
  vcf_reader = vcf.VcfReader(vcf_path)

  # Create a sam_reader backed by bam. Provide an empty ReadRequirements
  # proto to the reader so it enables standard filtering based on the default
  # values of ReadRequirements. Also explicitly allow the reader to access an
  # unindexed BAM, so only the iterate() function is enabled.
  read_requirements = reads_pb2.ReadRequirements()
  sam_reader = sam.SamReader(bam_path, read_requirements=read_requirements)

  # All our readers and writers are context managers, so use the `with`
  # construct to open all of the inputs/outputs and close them when we are done
  # looping over our reads.
  with ref_reader, vcf_reader, sam_reader:
    # Loop over the reads in our BAM file:
    for read in sam_reader.iterate():
      # Get the Range proto describing the chrom/start/stop spanned by our read.
      assert len(read.alignment.cigar) > 0
      first_cigar = read.alignment.cigar[0]
      # If the first cigar operation is a CLIP_SOFT, the sequence starts
      # operation_length bases before the alignment position.
      start = read.alignment.position.position
      if first_cigar.operation == cigar_pb2.CigarUnit.CLIP_SOFT:
        start -= first_cigar.operation_length
      read_range = ranges.make_range(read.alignment.position.reference_name,
                                     start, start + len(read.aligned_sequence))

      # Get all of the variants that overlap our read range.
      variants = list(vcf_reader.query(read_range))

      # Get the reference bases spanned by our read.
      ref_bases = ref_reader.query(read_range)

      # Check that we can use our read for generating an example.
      if is_usable_training_example(read, variants, ref_bases):
        # Convert read and ref_bases to a tf.Example with make_example.
        yield make_example(read, ref_bases), len(read.aligned_sequence), (
            read.aligned_sequence != ref_bases)
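A minimal sketch of consuming this generator; the three paths below are hypothetical placeholders:
# Count how many yielded examples contain a read error; all paths are
# hypothetical placeholders.
n_total = n_errors = 0
for example, read_length, has_error in make_ngs_error_examples(
    ref_path='ref.fasta', vcf_path='variants.vcf.gz', bam_path='reads.bam'):
  n_total += 1
  if has_error:
    n_errors += 1
print('examples with read errors:', n_errors, 'of', n_total)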
Example #4
 def test_sam_query(self):
   reader = sam.SamReader(test_utils.genomics_core_testdata('test.bam'))
   expected = [(ranges.parse_literal('chr20:10,000,000-10,000,100'), 106),
               (ranges.parse_literal('chr20:10,000,000-10,000,000'), 45)]
   with reader:
     for interval, n_expected in expected:
       with reader.query(interval) as iterable:
         self.assertEqual(test_utils.iterable_len(iterable), n_expected)
Example #5
 def test_roundtrip_cram_writer(self, filename, has_embedded_ref):
   output_path = test_utils.test_tmpfile(filename)
   writer_ref_path = test_utils.genomics_core_testdata('test.fasta')
   reader_ref_path = ''
   if not has_embedded_ref:
     reader_ref_path = writer_ref_path
   original_reader = sam.SamReader(
       test_utils.genomics_core_testdata(filename), ref_path=reader_ref_path)
   original_records = list(original_reader.iterate())
   with sam.SamWriter(
       output_path,
       header=original_reader.header,
       ref_path=writer_ref_path,
       embed_ref=has_embedded_ref) as writer:
     for record in original_records:
       writer.write(record)
   with sam.SamReader(output_path, ref_path=reader_ref_path) as new_reader:
     self.assertEqual(original_records, list(new_reader.iterate()))
Example #6
 def test_bam_iterate_partially(self):
     """Verify that iteration provides results incrementally, not all at once."""
     reader = sam.SamReader(test_utils.genomics_core_testdata('test.bam'))
     with reader:
         iterable = reader.iterate()
         # We expect 106 records in total.
         for _ in range(10):
             results = list(itertools.islice(iterable, 10))
             self.assertEqual(len(results), 10)
         results = list(itertools.islice(iterable, 10))
         self.assertEqual(len(results), 6)
Example #7
 def _parse_read_with_aux_tags(self, tag_string):
     # Minimal header line to create a valid SAM file.
     header_lines = '@HD\tVN:1.3\tSO:coordinate\n@SQ\tSN:chr1\tLN:248956422\n'
     # A single stock read we'll add our AUX fields to.
     read = 'read_name\t0\tchr1\t1\t0\t3M\t*\t0\t0\tCCC\tAAA\t' + tag_string
     path = test_utils.test_tmpfile('aux_tags.bam')
     with gfile.GFile(path, 'w') as fout:
         fout.write(header_lines)
         fout.write(read + '\n')
     with sam.SamReader(path, parse_aux_fields=True) as reader:
         return list(reader.iterate())
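A sketch of calling this helper; 'HP:i:1' is standard SAM aux-field syntax (tag HP, integer type, value 1), and the test itself is a hypothetical illustration:
 def test_parses_integer_aux_tag(self):
     # Hypothetical caller of the _parse_read_with_aux_tags helper above.
     reads = self._parse_read_with_aux_tags('HP:i:1')
     self.assertEqual(len(reads), 1)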
Example #8
def make_ngs_examples(hparams):
    """Generator function that yields training, evaluation and test examples."""
    ref_reader = fasta.IndexedFastaReader(input_path=hparams.ref_path)
    vcf_reader = vcf.VcfReader(input_path=hparams.vcf_path)
    read_requirements = reads_pb2.ReadRequirements()
    sam_reader = sam.SamReader(input_path=hparams.bam_path,
                               read_requirements=read_requirements)

    # Use a separate SAM reader to query for reads falling in the pileup range.
    sam_query_reader = sam.SamReader(input_path=hparams.bam_path,
                                     read_requirements=read_requirements)
    used_pileup_ranges = set()
    with ref_reader, vcf_reader, sam_reader, sam_query_reader:
        for read in sam_reader:

            # Check that read has cigar string present and allowed alignment.
            if not read.alignment.cigar:
                print('Skipping read, no cigar alignment found')
                continue
            if not has_allowed_alignment(read):
                continue

            # Obtain window that will be used to construct an example.
            read_range = utils.read_range(read)
            ref = ref_reader.query(region=read_range)
            pileup_range = get_pileup_range(hparams, read, read_range, ref)

            # Do not construct multiple examples with the same pileup range.
            pileup_range_serialized = pileup_range.SerializeToString()
            if pileup_range_serialized in used_pileup_ranges:
                continue
            used_pileup_ranges.add(pileup_range_serialized)

            # Get reference sequence, reads, and truth variants for the pileup range.
            pileup_reads = list(sam_query_reader.query(region=pileup_range))
            pileup_ref = ref_reader.query(region=pileup_range)
            pileup_variants = list(vcf_reader.query(region=pileup_range))
            if is_usable_example(pileup_reads, pileup_variants, pileup_ref):
                yield make_example(hparams, pileup_reads, pileup_ref,
                                   pileup_range)
Example #9
 def test_downsampling(self, method, maybe_range, fraction, expected_n_reads):
   reader = sam.SamReader(
       test_utils.genomics_core_testdata('test.bam'),
       downsample_fraction=fraction,
       random_seed=12345)
   with reader:
     if method == 'iterate':
       reads_iter = reader.iterate()
     elif method == 'query':
       reads_iter = reader.query(ranges.parse_literal(maybe_range))
     else:
       self.fail('Unexpected method ' + str(method))
     self.assertEqual(test_utils.iterable_len(reads_iter), expected_n_reads)
Example #10
def main(argv):
    if len(argv) != 3:
        print('Usage: {} <input_sam> <chromosome>:<position>'.format(argv[0]))
        sys.exit(-1)
    in_sam = argv[1]
    r = ranges.parse_literal(argv[2])
    position = r.start

    with sam.SamReader(in_sam) as sam_reader:
        reads = sam_reader.query(r)
        pos_seq_pairs = sorted(
            (read.alignment.position.position, read.aligned_sequence)
            for read in reads)
        if not pos_seq_pairs:
            print('No overlapping reads found for', argv[2])
            sys.exit(0)

        left_position = pos_seq_pairs[0][0]
        for start, seq in pos_seq_pairs:
            print_read(left_position, start, position, seq)
Example #11
def ascii_pileup(sam_filename, query):
  """Returns an ASCII pileup image for the query as a list of strings.

  Args:
    sam_filename: The filename of the BAM/SAM file.
    query: The region to display, as a range-literal string.
  """
  r = ranges.parse_literal(query)
  position = r.start

  with sam.SamReader(sam_filename) as sam_reader:
    reads = sam_reader.query(r)
    pos_seq_pairs = sorted(
        (read.alignment.position.position, read.aligned_sequence)
        for read in reads)
    if not pos_seq_pairs:
      print('No overlapping reads found for', query)
      return []

    left_position = pos_seq_pairs[0][0]
    return [read_str(left_position, start, position, seq)
            for start, seq in pos_seq_pairs]
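Printing the returned rows renders the pileup; the BAM path (borrowed from Example #12) and the range literal are placeholders:
# Hypothetical invocation; the path and range are placeholders.
for row in ascii_pileup('NA12878_sliced.bam', 'chr20:10,000,000-10,000,100'):
  print(row)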
Example #12
from nucleus.io import sam

# Print every read in the BAM file.
r = sam.SamReader('NA12878_sliced.bam')
for v in r:
  print(v)
Example #13
 def test_tfbam_plugin_loads(self):
     reader = sam.SamReader('*****@*****.**', use_index=True)
     self.assertIsNotNone(reader)
Example #14
 def test_tfbam_plugin_does_not_load(self):
     with self.assertRaisesRegexp(
             ImportError,
             'tfbam_lib module not found, cannot read .tfbam files.'):
         _ = sam.SamReader('*****@*****.**')
Example #15
 def test_bam_iterate(self):
     reader = sam.SamReader(test_utils.genomics_core_testdata('test.bam'))
     with reader:
         self.assertEqual(test_utils.iterable_len(reader.iterate()), 106)
Example #16
def make_ngs_error_examples(ref_path,
                            vcf_path,
                            bam_path,
                            examples_out_path,
                            max_reads=None):
    """Driver program for ngs_errors.

    See module description for details.

    Args:
      ref_path: str. A path to an indexed fasta file.
      vcf_path: str. A path to an indexed VCF file.
      bam_path: str. A path to a SAM/BAM file.
      examples_out_path: str. A path where we will write out examples.
      max_reads: int or None. If not None, we will emit at most max_reads
        examples to examples_out_path.
    """

    # Create a ref_reader backed by ref.
    ref_reader = fasta.IndexedFastaReader(ref_path)

    # Create a vcf_reader backed by vcf.
    vcf_reader = vcf.VcfReader(vcf_path)

    # Create a sam_reader backed by bam. Provide an empty ReadRequirements
    # proto to the reader so it enables standard filtering based on the default
    # values of ReadRequirements. Also explicitly allow the reader to access an
    # unindexed BAM, so only the iterate() function is enabled.
    read_requirements = reads_pb2.ReadRequirements()
    sam_reader = sam.SamReader(bam_path, read_requirements=read_requirements)

    # Create our TFRecordWriter where we'll send our tf.Examples.
    examples_out = genomics_writer.TFRecordWriter(examples_out_path)

    # All our readers and writers are context managers, so use the `with`
    # construct to open all of the inputs/outputs and close them when we are done
    # looping over our reads.
    n_examples = 0
    with ref_reader, vcf_reader, sam_reader, examples_out:
        # Loop over the reads in our BAM file:
        for i, read in enumerate(sam_reader.iterate(), start=1):
            # Get the Range proto describing the chrom/start/stop spanned by our read.
            read_range = utils.read_range(read)

            # Get all of the variants that overlap our read range.
            variants = list(vcf_reader.query(read_range))

            # Get the reference bases spanned by our read.
            ref_bases = ref_reader.query(read_range)

            # Check that we can use our read for generating an example.
            if is_usable_training_example(read, variants, ref_bases):
                n_examples += 1

                # Convert read and ref_bases to a tf.Example with make_example.
                example = make_example(read, ref_bases)

                # And write it out to our TFRecord output file.
                examples_out.write(example)

                # Do a bit of convenient logging. This is very verbose if we convert a
                # lot of reads...
                logging.info(
                    'Added an example for read %s (span=%s) with cigar %s '
                    '[%d added of %d total reads]',
                    read.fragment_name, ranges.to_literal(read_range),
                    cigar.format_cigar_units(read.alignment.cigar),
                    n_examples, i)

                if max_reads is not None and n_examples >= max_reads:
                    return
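A sketch of invoking this driver end to end; every path below is a hypothetical placeholder:
# Hypothetical invocation of make_ngs_error_examples; all paths are
# placeholders.
make_ngs_error_examples(
    ref_path='ref.fasta',
    vcf_path='variants.vcf.gz',
    bam_path='reads.bam',
    examples_out_path='ngs_errors.tfrecord',
    max_reads=100)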
Example #17
 def test_sam_iterate(self):
     reader = sam.SamReader(test_utils.genomics_core_testdata('test.sam'),
                            use_index=False)
     with reader:
         self.assertEqual(test_utils.iterable_len(reader.iterate()), 6)