def test_roundtrip_writer(self, filename):
    """Checks that records survive a SamReader -> SamWriter -> SamReader trip.

    Reads every record of the input, writes the records to a temporary file
    using the input's header, then reads the temporary file back and asserts
    that the records equal the originals.

    Args:
      filename: Name passed to test_utils.genomics_core_testdata to locate the
        input, and to test_utils.test_tmpfile to name the output.
    """
    output_path = test_utils.test_tmpfile(filename)

    # Open the source reader in a `with` block so it is closed as soon as the
    # header and records have been captured, rather than being left open for
    # the rest of the test.
    input_path = test_utils.genomics_core_testdata(filename)
    with sam.SamReader(input_path) as original_reader:
        original_header = original_reader.header
        original_records = list(original_reader.iterate())

    with sam.SamWriter(output_path, header=original_header) as writer:
        for record in original_records:
            writer.write(record)

    with sam.SamReader(output_path) as new_reader:
        self.assertEqual(original_records, list(new_reader.iterate()))
def _make_reader(self, filename, has_embedded_ref):
    """Builds a SamReader for a test data file.

    Args:
      filename: Name passed to test_utils.genomics_core_testdata to locate the
        file to read.
      has_embedded_ref: If true, the reader is created without a ref_path;
        otherwise ref_path points at the 'test.fasta' test data file.

    Returns:
      The newly constructed sam.SamReader.
    """
    input_path = test_utils.genomics_core_testdata(filename)

    if not has_embedded_ref:
        # No embedded reference: explicitly override the reference encoded in
        # the UR of the CRAM file with the path to our test.fasta.
        fasta_path = test_utils.genomics_core_testdata('test.fasta')
        return sam.SamReader(input_path, ref_path=fasta_path)

    # Embedded reference: force the reader to use it by not providing any
    # argument for ref_path.
    return sam.SamReader(input_path)
def make_ngs_error_examples(ref_path, vcf_path, bam_path):
    """Yields tf.Examples for training a ML model.

    Each tf.Example contains relevant features about the ngs read.

    Args:
      ref_path: str. A path to an indexed fasta file.
      vcf_path: str. A path to an indexed VCF file.
      bam_path: str. A path to an SAM/BAM file.

    Yields:
      A tuple (example, ngs_read_length, has_error), where example is a
      tf.Example, ngs_read_length is the length of the read generated by the
      sequencer, and has_error is a boolean specifying whether the example
      contains a read error.
    """
    fasta_reader = fasta.IndexedFastaReader(ref_path)
    variant_reader = vcf.VcfReader(vcf_path)

    # An empty ReadRequirements proto makes the reader apply standard
    # filtering based on the default values of ReadRequirements.
    requirements = reads_pb2.ReadRequirements()
    bam_reader = sam.SamReader(bam_path, read_requirements=requirements)

    # The readers are context managers; entering them together closes all of
    # them once we stop looping over the reads.
    with fasta_reader, variant_reader, bam_reader:
        for read in bam_reader.iterate():
            assert len(read.alignment.cigar) > 0
            leading_cigar = read.alignment.cigar[0]

            # When the read begins with a soft clip, the sequence starts
            # that many bases before the alignment position.
            seq_start = read.alignment.position.position
            if leading_cigar.operation == cigar_pb2.CigarUnit.CLIP_SOFT:
                seq_start = seq_start - leading_cigar.operation_length

            # Range proto for the chrom/start/stop spanned by the read.
            seq_end = seq_start + len(read.aligned_sequence)
            read_range = ranges.make_range(
                read.alignment.position.reference_name, seq_start, seq_end)

            # Variants overlapping the read, and the reference bases under it.
            variants = list(variant_reader.query(read_range))
            ref_bases = fasta_reader.query(read_range)

            # Skip reads that cannot be turned into a training example.
            if not is_usable_training_example(read, variants, ref_bases):
                continue

            example = make_example(read, ref_bases)
            ngs_read_length = len(read.aligned_sequence)
            has_error = read.aligned_sequence != ref_bases
            yield example, ngs_read_length, has_error
def test_sam_query(self):
    """Checks the number of reads returned by query() for two intervals.

    For each (interval, count) pair, the iterable returned by reader.query is
    entered as a context manager and its length is compared with the count.
    """
    reader = sam.SamReader(test_utils.genomics_core_testdata('test.bam'))
    wide_interval = ranges.parse_literal('chr20:10,000,000-10,000,100')
    point_interval = ranges.parse_literal('chr20:10,000,000-10,000,000')
    expected_counts = [(wide_interval, 106), (point_interval, 45)]

    with reader:
        for interval, n_expected in expected_counts:
            with reader.query(interval) as iterable:
                n_found = test_utils.iterable_len(iterable)
                self.assertEqual(n_found, n_expected)
def test_roundtrip_cram_writer(self, filename, has_embedded_ref):
    """Checks that records survive a read -> write -> read trip with a ref.

    Reads every record of the input, writes the records to a temporary file
    (passing ref_path and embed_ref to the writer), then reads the temporary
    file back and asserts that the records equal the originals.

    Args:
      filename: Name passed to test_utils.genomics_core_testdata to locate the
        input, and to test_utils.test_tmpfile to name the output.
      has_embedded_ref: If true, both readers get an empty ref_path and the
        writer is asked to embed the reference; otherwise the readers use the
        same 'test.fasta' path as the writer.
    """
    output_path = test_utils.test_tmpfile(filename)
    writer_ref_path = test_utils.genomics_core_testdata('test.fasta')
    reader_ref_path = '' if has_embedded_ref else writer_ref_path

    # Open the source reader in a `with` block so it is closed as soon as the
    # header and records have been captured, rather than being left open for
    # the rest of the test.
    input_path = test_utils.genomics_core_testdata(filename)
    with sam.SamReader(input_path, ref_path=reader_ref_path) as original_reader:
        original_header = original_reader.header
        original_records = list(original_reader.iterate())

    with sam.SamWriter(
            output_path,
            header=original_header,
            ref_path=writer_ref_path,
            embed_ref=has_embedded_ref) as writer:
        for record in original_records:
            writer.write(record)

    with sam.SamReader(output_path, ref_path=reader_ref_path) as new_reader:
        self.assertEqual(original_records, list(new_reader.iterate()))
def test_bam_iterate_partially(self):
    """Verify that iteration provides results incrementally, not all at once."""
    reader = sam.SamReader(test_utils.genomics_core_testdata('test.bam'))
    with reader:
        records = reader.iterate()
        # We expect 106 records in total: ten slices of 10 records followed
        # by a final slice holding the remaining 6.
        expected_slice_sizes = [10] * 10 + [6]
        for expected_size in expected_slice_sizes:
            batch = list(itertools.islice(records, 10))
            self.assertEqual(len(batch), expected_size)
def _parse_read_with_aux_tags(self, tag_string):
    """Writes a one-read SAM text file ending in `tag_string` and parses it.

    Args:
      tag_string: Text appended after the fixed, tab-separated fields of the
        stock read line.

    Returns:
      A list of the records produced by a sam.SamReader opened on the file
      with parse_aux_fields=True.
    """
    # Minimal header lines needed to create a valid SAM file.
    header_lines = '@HD\tVN:1.3\tSO:coordinate\n@SQ\tSN:chr1\tLN:248956422\n'
    # The single stock read that the AUX fields are attached to.
    read_line = (
        'read_name\t0\tchr1\t1\t0\t3M\t*\t0\t0\tCCC\tAAA\t' + tag_string + '\n')

    path = test_utils.test_tmpfile('aux_tags.bam')
    with gfile.GFile(path, 'w') as fout:
        fout.write(header_lines)
        fout.write(read_line)

    with sam.SamReader(path, parse_aux_fields=True) as reader:
        parsed_reads = list(reader.iterate())
    return parsed_reads
def make_ngs_examples(hparams):
    """Generator function that yields training, evaluation and test examples.

    Args:
      hparams: Object providing ref_path, vcf_path and bam_path; it is also
        forwarded to get_pileup_range and make_example.

    Yields:
      The result of make_example for each usable, not yet seen pileup range.
    """
    ref_reader = fasta.IndexedFastaReader(input_path=hparams.ref_path)
    vcf_reader = vcf.VcfReader(input_path=hparams.vcf_path)
    requirements = reads_pb2.ReadRequirements()
    sam_reader = sam.SamReader(
        input_path=hparams.bam_path, read_requirements=requirements)
    # A second SAM reader on the same file serves the pileup range queries
    # while the first one is being iterated.
    sam_query_reader = sam.SamReader(
        input_path=hparams.bam_path, read_requirements=requirements)

    seen_pileup_ranges = set()
    with ref_reader, vcf_reader, sam_reader, sam_query_reader:
        for read in sam_reader:
            # The read needs a cigar string and an allowed alignment.
            if not read.alignment.cigar:
                print('Skipping read, no cigar alignment found')
                continue
            if not has_allowed_alignment(read):
                continue

            # Window used to construct the example.
            read_range = utils.read_range(read)
            ref = ref_reader.query(region=read_range)
            pileup_range = get_pileup_range(hparams, read, read_range, ref)

            # Each pileup range contributes at most one example; the
            # serialized proto is used as the set key.
            range_key = pileup_range.SerializeToString()
            if range_key in seen_pileup_ranges:
                continue
            seen_pileup_ranges.add(range_key)

            # Reads, reference sequence and truth variants for the range.
            pileup_reads = list(sam_query_reader.query(region=pileup_range))
            pileup_ref = ref_reader.query(region=pileup_range)
            pileup_variants = list(vcf_reader.query(region=pileup_range))

            if not is_usable_example(pileup_reads, pileup_variants, pileup_ref):
                continue
            yield make_example(hparams, pileup_reads, pileup_ref, pileup_range)
def test_downsampling(self, method, maybe_range, fraction, expected_n_reads):
    """Checks the number of reads seen when the reader downsamples.

    Args:
      method: 'iterate' or 'query'; any other value fails the test.
      maybe_range: Range literal parsed for the 'query' method; not used for
        'iterate'.
      fraction: Value passed to the reader as downsample_fraction.
      expected_n_reads: Expected length of the resulting iterable.
    """
    reader = sam.SamReader(
        test_utils.genomics_core_testdata('test.bam'),
        downsample_fraction=fraction,
        random_seed=12345)

    with reader:
        if method == 'iterate':
            reads_iter = reader.iterate()
        else:
            if method != 'query':
                self.fail('Unexpected method ' + str(method))
            reads_iter = reader.query(ranges.parse_literal(maybe_range))

        n_reads = test_utils.iterable_len(reads_iter)
        self.assertEqual(n_reads, expected_n_reads)
def main(argv):
    """Prints the reads overlapping a position given on the command line.

    Queries the SAM file for the requested range, sorts the overlapping reads
    by (alignment position, aligned sequence) and hands each one to
    print_read together with the leftmost position and the query position.

    Args:
      argv: Three items: program name, input SAM path, and a
        '<chromosome>:<position>' range literal. Any other length prints a
        usage message and exits with status -1.
    """
    if len(argv) != 3:
        print('Usage: {} <input_sam> <chromosome>:<position>'.format(argv[0]))
        sys.exit(-1)

    in_sam = argv[1]
    query = argv[2]
    r = ranges.parse_literal(query)
    position = r.start

    with sam.SamReader(in_sam) as sam_reader:
        pos_seq_pairs = [
            (read.alignment.position.position, read.aligned_sequence)
            for read in sam_reader.query(r)
        ]
        pos_seq_pairs.sort()

        if len(pos_seq_pairs) == 0:
            print('No overlapping reads found for', query)
            sys.exit(0)

        # After sorting, the first pair holds the leftmost start position.
        left_position = pos_seq_pairs[0][0]
        for start, seq in pos_seq_pairs:
            print_read(left_position, start, position, seq)
def ascii_pileup(sam_filename, query):
    """Returns an ASCII pileup image for the query as a list of strings.

    Args:
      sam_filename: The filename of the BAM/SAM file.
      query: String version of range.

    Returns:
      One read_str result per overlapping read, ordered by (alignment
      position, aligned sequence). When no read overlaps the range, a message
      is printed and an empty list is returned.
    """
    r = ranges.parse_literal(query)
    position = r.start

    with sam.SamReader(sam_filename) as sam_reader:
        pos_seq_pairs = [
            (read.alignment.position.position, read.aligned_sequence)
            for read in sam_reader.query(r)
        ]
        pos_seq_pairs.sort()

        if len(pos_seq_pairs) == 0:
            print('No overlapping reads found for', query)
            return []

        # After sorting, the first pair holds the leftmost start position.
        left_position = pos_seq_pairs[0][0]
        pileup_rows = []
        for start, seq in pos_seq_pairs:
            pileup_rows.append(read_str(left_position, start, position, seq))
        return pileup_rows
from nucleus.io import sam

# Open a SamReader on the sliced NA12878 BAM file.
r = sam.SamReader('NA12878_sliced.bam')

# Iterating over the reader and printing each item is left disabled:
# for v in r:
#     print(v)
def test_tfbam_plugin_loads(self):
    """Asserts that a SamReader built with use_index=True is not None."""
    tfbam_reader = sam.SamReader('*****@*****.**', use_index=True)
    self.assertIsNotNone(tfbam_reader)
def test_tfbam_plugin_does_not_load(self):
    """Asserts that constructing the reader raises the expected ImportError."""
    expected_message = 'tfbam_lib module not found, cannot read .tfbam files.'
    # NOTE(review): assertRaisesRegexp is the legacy spelling of
    # assertRaisesRegex; kept unchanged here -- confirm which one the test
    # base class in use supports before renaming.
    with self.assertRaisesRegexp(ImportError, expected_message):
        sam.SamReader('*****@*****.**')
def test_bam_iterate(self):
    """Asserts that iterating over the test.bam reader gives 106 items."""
    reader = sam.SamReader(test_utils.genomics_core_testdata('test.bam'))
    with reader:
        n_records = test_utils.iterable_len(reader.iterate())
        self.assertEqual(n_records, 106)
def make_ngs_error_examples(ref_path, vcf_path, bam_path, examples_out_path,
                            max_reads=None):
    """Driver program for ngs_errors.

    See module description for details.

    Args:
      ref_path: str. A path to an indexed fasta file.
      vcf_path: str. A path to an indexed VCF file.
      bam_path: str. A path to an SAM/BAM file.
      examples_out_path: str. A path where we will write out examples.
      max_reads: int or None. If not None, we will emit at most max_reads
        examples to examples_out_path. A value of zero or less emits no
        examples.
    """
    # Create a ref_reader backed by ref.
    ref_reader = fasta.IndexedFastaReader(ref_path)

    # Create a vcf_reader backed by vcf.
    vcf_reader = vcf.VcfReader(vcf_path)

    # Create a sam_reader backed by bam. Provide an empty ReadRequirements
    # proto to the reader so it enables standard filtering based on the
    # default values of ReadRequirements.
    read_requirements = reads_pb2.ReadRequirements()
    sam_reader = sam.SamReader(bam_path, read_requirements=read_requirements)

    # Create our TFRecordWriter where we'll send our tf.Examples.
    examples_out = genomics_writer.TFRecordWriter(examples_out_path)

    # All our readers and writers are context managers, so use the `with`
    # construct to open all of the inputs/outputs and close them when we are
    # done looping over our reads.
    n_examples = 0
    with ref_reader, vcf_reader, sam_reader, examples_out:
        # The cap below is only evaluated after an example has been written,
        # so a non-positive max_reads must be handled up front to honor the
        # "at most max_reads" contract.
        if max_reads is not None and max_reads <= 0:
            return

        # Loop over the reads in our BAM file:
        for i, read in enumerate(sam_reader.iterate(), start=1):
            # Get the Range proto describing the chrom/start/stop spanned by
            # our read.
            read_range = utils.read_range(read)

            # Get all of the variants that overlap our read range.
            variants = list(vcf_reader.query(read_range))

            # Get the reference bases spanned by our read.
            ref_bases = ref_reader.query(read_range)

            # Check that we can use our read for generating an example.
            if not is_usable_training_example(read, variants, ref_bases):
                continue

            n_examples += 1

            # Convert read and ref_bases to a tf.Example with make_example
            # and write it out to our TFRecord output file.
            example = make_example(read, ref_bases)
            examples_out.write(example)

            # Do a bit of convenient logging. This is very verbose if we
            # convert a lot of reads...
            logging.info((
                'Added an example for read %s (span=%s) with cigar %s [%d added '
                'of %d total reads]'), read.fragment_name,
                         ranges.to_literal(read_range),
                         cigar.format_cigar_units(read.alignment.cigar),
                         n_examples, i)

            # Stop as soon as the requested number of examples is reached.
            if max_reads is not None and n_examples >= max_reads:
                return
def test_sam_iterate(self):
    """Asserts that iterating over test.sam without an index gives 6 items."""
    sam_path = test_utils.genomics_core_testdata('test.sam')
    reader = sam.SamReader(sam_path, use_index=False)
    with reader:
        n_records = test_utils.iterable_len(reader.iterate())
        self.assertEqual(n_records, 6)