def write_variant_call(writer, prediction, use_tpu):
  """Builds the output proto for one prediction and hands it to the writer.

  Args:
    writer: Object exposing write(); it is called once with the serialized
      proto produced by _create_cvo_proto.
    prediction: Mapping with the model output for a single example. The keys
      read here are 'variant', 'alt_allele_indices' and 'probabilities', plus
      'label' when FLAGS.debugging_true_label_mode is set.
    use_tpu: bool. When true, the 'variant' and 'alt_allele_indices' values
      are passed through tf_utils.int_tensor_to_string before being used.

  Returns:
    Whatever writer.write() returns.
  """

  def _decode(value):
    # Only the TPU path needs the int-tensor -> string conversion.
    return tf_utils.int_tensor_to_string(value) if use_tpu else value

  variant_encoded = _decode(prediction['variant'])
  alt_indices_encoded = _decode(prediction['alt_allele_indices'])
  gls = round_gls(prediction['probabilities'], precision=_GL_PRECISION)

  # The true label is only forwarded in debugging mode.
  if FLAGS.debugging_true_label_mode:
    labels = prediction['label']
  else:
    labels = None

  proto = _create_cvo_proto(variant_encoded, gls, alt_indices_encoded, labels)
  return writer.write(proto.SerializeToString())
def test_get_batches(self, compressed_inputs, mode, use_tpu):
  """Pulls one batch from the golden dataset and sanity-checks its contents.

  Args:
    compressed_inputs: Forwarded to make_golden_dataset.
    mode: 'EVAL' selects tf.estimator.ModeKeys.EVAL; any other value selects
      tf.estimator.ModeKeys.TRAIN.
    use_tpu: Forwarded to make_golden_dataset; when true each variant is
      decoded with tf_utils.int_tensor_to_string before being parsed.
  """
  if mode == 'EVAL':
    mode = tf.estimator.ModeKeys.EVAL
  else:
    mode = tf.estimator.ModeKeys.TRAIN
  input_fn = make_golden_dataset(compressed_inputs, mode=mode, use_tpu=use_tpu)
  batch_size = 16

  with tf.Session() as sess:
    dataset = input_fn(dict(batch_size=batch_size))
    batch = dataset.make_one_shot_iterator().get_next()

    # Materialize a single batch; everything below inspects its values.
    sess.run(tf.global_variables_initializer())
    features, labels = sess.run(batch)
    variants = features['variant']
    images = features['image']

    # Images must still have the default pileup dimensions. Note carried over
    # from the original: the shape is 100, not 107, because the image is only
    # adjusted in the model_fn now, where previously it was done in the
    # input_fn.
    expected_image_shape = [batch_size] + dv_constants.PILEUP_DEFAULT_DIMS
    self.assertEqual(expected_image_shape, list(images.shape))

    # Labels are one class index per example, each in [0, NUM_CLASSES).
    self.assertEqual((batch_size,), labels.shape)
    for label in labels:
      # pylint: disable=g-generic-assert
      self.assertTrue(0 <= label < dv_constants.NUM_CLASSES)

    # Variants: one entry per example, and each must decode to a variant whose
    # reference_name is chr20.
    self.assertEqual(batch_size, variants.shape[0])
    for variant in variants:
      encoded = tf_utils.int_tensor_to_string(variant) if use_tpu else variant
      for decoded in variant_utils.decode_variants([encoded]):
        self.assertEqual(decoded.reference_name, 'chr20')
def testIntTensorToString(self):
  """Round-trips a string via string_to_int_tensor and int_tensor_to_string."""
  original = '\001\002\003\004\005\006\007'
  with tf.Session() as sess:
    # Encode as a tensor, evaluate it, then decode the evaluated value.
    encoded = sess.run(tf_utils.string_to_int_tensor(original))
    decoded = tf_utils.int_tensor_to_string(encoded)
    self.assertEqual(decoded, original)
def assertTfDataSetExamplesMatchExpected(self,
                                         input_fn,
                                         expected_dataset,
                                         use_tpu=False,
                                         workaround_list_files=False):
  """Asserts that input_fn yields the loci stored in expected_dataset's files.

  Args:
    input_fn: Callable taking a params dict; its result is iterated with a
      one-shot iterator, one example per batch.
    expected_dataset: Supplies input_file_spec (read directly as TFRecords),
      num_examples and tensor_shape. Per the original note it is the same type
      as input_fn (DeepVariantInput) and may even be the same object; here it
      is only used for those attributes.
    use_tpu: When true, each locus is decoded with
      tf_utils.int_tensor_to_string.
    workaround_list_files: When true, the observed loci are sorted before the
      comparison.
  """
  with tf.compat.v1.Session() as sess:
    dataset = input_fn({'batch_size': 1})
    next_batch = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()
    sess.run(tf.compat.v1.global_variables_initializer())
    sess.run(tf.compat.v1.local_variables_initializer())

    # Drain the iterator, keeping the locus of every example.
    observed_loci = []
    while True:
      try:
        features, _ = sess.run(next_batch)
      except tf.errors.OutOfRangeError:
        break
      locus = features['locus'][0]
      if use_tpu:
        locus = tf_utils.int_tensor_to_string(locus)
      observed_loci.append(locus)

  if workaround_list_files:
    # Sorting is only a valid workaround for loci: they are string valued and
    # expected to arrive in sorted order, which does not hold for arbitrary
    # data. Per the original note, prod has a tf version that can turn off
    # shuffling so this path is skipped there, but kokoro hits it.
    observed_loci.sort()

  # Ground truth: read the locus feature straight out of the TFRecord file(s).
  expected_loci = []
  for example in tfrecord.read_tfrecords(expected_dataset.input_file_spec):
    expected_loci.append(example.features.feature['locus'].bytes_list.value[0])
  self.assertLen(expected_loci, expected_dataset.num_examples)

  # Dump both lists on mismatch so the failure is easier to read.
  if observed_loci != expected_loci:
    print('\n\nlen expected seen', len(expected_loci), len(observed_loci))
    print('\n\nexpected=', expected_loci)
    print('\n\nseen=', observed_loci)
  self.assertEqual(expected_loci, observed_loci)

  # The expected shape comes from the golden dataset; if that data is ever
  # regenerated these values may need to be updated.
  self.assertEqual(dv_constants.PILEUP_DEFAULT_DIMS,
                   expected_dataset.tensor_shape)
def testGoldenCallingExamples(self, use_tpu):
  """Checks the batch feed against the directly-parsed golden calling examples.

  The golden TFRecord file is read once with io_utils.read_tfrecords and once
  through self.get_batch_feed; every example from the feed must match the
  record at the same position in the file. This indirectly exercises
  parse_tfexample as well.

  Args:
    use_tpu: Forwarded to get_batch_feed. When true, the image is expected to
      be int32 and the string features are expected as int32 buffers of length
      tf_utils.STRING_TO_INT_BUFFER_LENGTH; otherwise the image is expected to
      be uint8 and the string features are expected as strings.
  """
  # Canonical data, parsed without going through tf.
  expected_decoded_records = list(
      io_utils.read_tfrecords(
          testdata.GOLDEN_CALLING_EXAMPLES, proto=example_pb2.Example))

  # Data as produced by the function under test.
  batch_feed = self.get_batch_feed(batch_size=1, use_tpu=use_tpu)

  def check_and_decode(value):
    """Validates one string-valued feature and returns its encoded string."""
    if use_tpu:
      self.assertEqual(value.dtype, np.dtype('int32'))
      self.assertEqual(value.shape, (tf_utils.STRING_TO_INT_BUFFER_LENGTH,))
      return tf_utils.int_tensor_to_string(value)
    self.assertIsInstance(value, six.string_types)
    return value

  with self.test_session() as sess:
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())

    n = 0
    while True:
      # Read from batch.
      try:
        features = sess.run(batch_feed)
      except tf.errors.OutOfRangeError:
        break

      # Fail with a readable message, rather than an IndexError, if the feed
      # yields more examples than the golden file holds.
      self.assertLess(
          n, len(expected_decoded_records),
          'batch feed produced more examples than the golden file contains')
      example = expected_decoded_records[n]
      expected_alt_allele_indices_encoded = example.features.feature[
          'alt_allele_indices/encoded'].bytes_list.value[0]
      expected_variant_encoded = example.features.feature[
          'variant/encoded'].bytes_list.value[0]

      # Image: check for None before touching any attribute, then shape/dtype.
      image = features['image'][0]  # np.ndarray
      self.assertIsNotNone(image)
      self.assertEqual(list(image.shape), dv_constants.PILEUP_DEFAULT_DIMS)
      expected_image_dtype = 'int32' if use_tpu else 'uint8'
      self.assertEqual(image.dtype, np.dtype(expected_image_dtype))

      # String-valued features must match the golden record.
      self.assertEqual(expected_alt_allele_indices_encoded,
                       check_and_decode(features['alt_allele_indices'][0]))
      self.assertEqual(expected_variant_encoded,
                       check_and_decode(features['variant'][0]))

      n += 1

  # Every golden example must have been seen exactly once.
  self.assertEqual(n, testdata.N_GOLDEN_CALLING_EXAMPLES)