def test_get_input_fn(self):
    """Write one tf.Example to a TFRecord and check what get_input_fn yields.

    The example encodes a 5-base read. The test expects a feature tensor of
    shape (1, 4, 5, 3) whose last axis holds, in order: the one-hot encoded
    read bases, the ref_match flags repeated on each of the 4 base rows, and
    the base qualities repeated on each of the 4 base rows. The label is
    expected to be the true sequence as base indices.
    """
    bases = 'ACGT'
    test_file = test_utils.test_tmpfile('test.tfrecord')

    # Use a simple test example that consists of a read sequence of length 5.
    read_sequence = 'ACGTA'
    true_sequence = 'ACCTA'
    aligned_qualities = [30, 30, 20, 30, 30]
    ref_match = [1, 1, 0, 1, 1]

    example = example_pb2.Example()
    feature_map = example.features.feature
    feature_map['read_name'].bytes_list.value.append(six.b('test_seq'))
    feature_map['read_sequence'].int64_list.value.extend(
        [bases.index(base) for base in read_sequence])
    feature_map['read_qualities'].int64_list.value.extend(aligned_qualities)
    feature_map['true_sequence'].int64_list.value.extend(
        [bases.index(base) for base in true_sequence])
    feature_map['ref_match'].int64_list.value.extend(ref_match)

    with genomics_writer.TFRecordWriter(test_file) as writer:
        writer.write(example)

    input_fn = ngs_errors.get_input_fn(
        test_file,
        ngs_read_length=len(read_sequence),
        batch_size=1,
        num_epochs=1)
    features_tensor, label_tensor = input_fn()

    with tf.Session() as sess:
        features_val, label_val = sess.run([features_tensor, label_tensor])

    features_array = np.array(features_val)
    self.assertEqual((1, 4, 5, 3), features_array.shape)

    # Expected content of each channel along the last axis, in channel order.
    expected_channels = [
        # Channel 0: one-hot read bases, one row per base in 'ACGT'.
        [[[1, 0, 0, 0, 1],
          [0, 1, 0, 0, 0],
          [0, 0, 1, 0, 0],
          [0, 0, 0, 1, 0]]],
        # Channel 1: ref_match flags, identical on all 4 rows.
        [[ref_match] * 4],
        # Channel 2: base qualities, identical on all 4 rows.
        [[aligned_qualities] * 4],
    ]
    for channel, expected in enumerate(expected_channels):
        self.assertTrue(
            np.array_equal(np.array(expected),
                           features_array[:, :, :, channel]))

    expected_label = np.array([[0, 1, 1, 3, 0]])
    self.assertTrue(np.array_equal(expected_label, np.array(label_val)))
def make_ngs_error_examples(ref_path, vcf_path, bam_path, examples_out_path,
                            max_reads=None):
    """Driver program for ngs_errors.

    See module description for details.

    Iterates over the reads in the SAM/BAM file, and for every read accepted
    by is_usable_training_example() writes one tf.Example (built by
    make_example()) to a TFRecord file.

    Args:
      ref_path: str. A path to an indexed fasta file.
      vcf_path: str. A path to an indexed VCF file.
      bam_path: str. A path to an SAM/BAM file.
      examples_out_path: str. A path where we will write out examples.
      max_reads: int or None. If not None, we will emit at most max_reads
        examples to examples_out_path. A value of zero or less emits no
        examples; the output writer is still opened and closed.
    """
    # Create a ref_reader backed by ref.
    ref_reader = fasta.IndexedFastaReader(ref_path)

    # Create a vcf_reader backed by vcf.
    vcf_reader = vcf.VcfReader(vcf_path)

    # Create a sam_reader backed by bam. Provide an empty ReadRequirements
    # proto to the reader so it enables standard filtering based on the default
    # values of ReadRequirements.
    # NOTE(review): this comment previously also said the reader is explicitly
    # allowed to access an unindexed BAM, but no such option is passed here --
    # confirm whether one is needed.
    read_requirements = reads_pb2.ReadRequirements()
    sam_reader = sam.SamReader(bam_path, read_requirements=read_requirements)

    # Create our TFRecordWriter where we'll send our tf.Examples.
    examples_out = genomics_writer.TFRecordWriter(examples_out_path)

    # All our readers and writers are context managers, so use the `with`
    # construct to open all of the inputs/outputs and close them when we are
    # done looping over our reads.
    n_examples = 0
    with ref_reader, vcf_reader, sam_reader, examples_out:
        # The limit check inside the loop only runs after an example has been
        # written, so a non-positive limit must be handled up front to honor
        # "at most max_reads". Returning inside the `with` still closes all
        # readers and the writer.
        if max_reads is not None and max_reads <= 0:
            return

        # Loop over the reads in our BAM file:
        for i, read in enumerate(sam_reader.iterate(), start=1):
            # Get the Range proto describing the chrom/start/stop spanned by
            # our read.
            read_range = utils.read_range(read)

            # Get all of the variants that overlap our read range.
            variants = list(vcf_reader.query(read_range))

            # Get the reference bases spanned by our read.
            ref_bases = ref_reader.query(read_range)

            # Check that we can use our read for generating an example.
            if not is_usable_training_example(read, variants, ref_bases):
                continue

            n_examples += 1

            # Convert read and ref_bases to a tf.Example with make_example.
            example = make_example(read, ref_bases)

            # And write it out to our TFRecord output file.
            examples_out.write(example)

            # Do a bit of convenient logging. This is very verbose if we
            # convert a lot of reads...
            logging.info((
                'Added an example for read %s (span=%s) with cigar %s [%d added '
                'of %d total reads]'), read.fragment_name,
                         ranges.to_literal(read_range),
                         cigar.format_cigar_units(read.alignment.cigar),
                         n_examples, i)

            # Stop as soon as the requested number of examples is written.
            if max_reads is not None and n_examples >= max_reads:
                return
def Writer(path, compression_type=None):
    """Return a genomics_writer.TFRecordWriter for `path`.

    A convenience wrapper: both arguments are forwarded positionally to
    genomics_writer.TFRecordWriter without modification.

    Args:
      path: forwarded as the first argument of TFRecordWriter.
      compression_type: forwarded as the second argument of TFRecordWriter;
        defaults to None.

    Returns:
      The newly constructed genomics_writer.TFRecordWriter.
    """
    tfrecord_writer = genomics_writer.TFRecordWriter(path, compression_type)
    return tfrecord_writer