Пример #1
0
def write_variant_call(writer, prediction, use_tpu):
    """Serialize one prediction as a call-variants output and write it.

    Args:
      writer: An object exposing write(); it is called exactly once with the
        serialized proto built from this prediction.
      prediction: A mapping indexed by 'variant', 'alt_allele_indices' and
        'probabilities'; 'label' is additionally read when
        FLAGS.debugging_true_label_mode is set.
      use_tpu: bool. When true, the 'variant' and 'alt_allele_indices' entries
        are passed through tf_utils.int_tensor_to_string before being used.

    Returns:
      Whatever writer.write() returns.
    """

    def _maybe_decode(value):
        # With use_tpu these fields arrive in the TPU-specific int encoding
        # and need converting back to strings.
        return tf_utils.int_tensor_to_string(value) if use_tpu else value

    variant = _maybe_decode(prediction['variant'])
    alt_allele_indices = _maybe_decode(prediction['alt_allele_indices'])
    gls = round_gls(prediction['probabilities'], precision=_GL_PRECISION)

    # The true label is only forwarded when the debugging flag is on.
    if FLAGS.debugging_true_label_mode:
        labels = prediction['label']
    else:
        labels = None

    cvo = _create_cvo_proto(variant, gls, alt_allele_indices, labels)
    return writer.write(cvo.SerializeToString())
Пример #2
0
    def test_get_batches(self, compressed_inputs, mode, use_tpu):
        """Pull one batch from the golden dataset and validate its contents.

        Checks the image shape, the label shape and value range, and that
        every variant in the batch decodes to a record on 'chr20'.
        """
        if mode == 'EVAL':
            mode = tf.estimator.ModeKeys.EVAL
        else:
            mode = tf.estimator.ModeKeys.TRAIN
        input_fn = make_golden_dataset(
            compressed_inputs, mode=mode, use_tpu=use_tpu)
        batch_size = 16
        with tf.Session() as sess:
            dataset = input_fn({'batch_size': batch_size})
            next_batch = dataset.make_one_shot_iterator().get_next()

            # Fetch a single batch of features and labels to inspect.
            sess.run(tf.global_variables_initializer())
            features, labels = sess.run(next_batch)
            variants = features['variant']
            images = features['image']

            # Images keep the default pileup dimensions here: the shape is
            # 100, not 107, because the image is only adjusted in the
            # model_fn now, where previously it was done in the input_fn.
            expected_image_shape = (
                [batch_size] + dv_constants.PILEUP_DEFAULT_DIMS)
            self.assertEqual(expected_image_shape, list(images.shape))

            # One label per example, each a valid class index.
            self.assertEqual((batch_size, ), labels.shape)
            for label in labels:
                # pylint: disable=g-generic-assert
                self.assertTrue(0 <= label < dv_constants.NUM_CLASSES)

            # The variants must have one entry per example and must really
            # be variants: decode each and check its reference_name.
            self.assertEqual(batch_size, variants.shape[0])
            for raw_variant in variants:
                encoded = (tf_utils.int_tensor_to_string(raw_variant)
                           if use_tpu else raw_variant)
                for decoded in variant_utils.decode_variants([encoded]):
                    self.assertEqual(decoded.reference_name, 'chr20')
Пример #3
0
 def testIntTensorToString(self):
     """Round-trip a string through the int-tensor encoding and back."""
     original = '\001\002\003\004\005\006\007'
     with tf.Session() as sess:
         # Encode as a tensor, evaluate it, then decode the evaluated value.
         int_values = sess.run(tf_utils.string_to_int_tensor(original))
         round_tripped = tf_utils.int_tensor_to_string(int_values)
         self.assertEqual(round_tripped, original)
Пример #4
0
  def assertTfDataSetExamplesMatchExpected(self,
                                           input_fn,
                                           expected_dataset,
                                           use_tpu=False,
                                           workaround_list_files=False):
    """Assert that input_fn yields the loci stored in expected_dataset.

    Args:
      input_fn: Callable taking a params dict; its result is iterated one
        example at a time (batch_size is 1).
      expected_dataset: Supplies input_file_spec (the tfrecords to read the
        expected loci from), num_examples and tensor_shape.
      use_tpu: When true, each locus is passed through
        tf_utils.int_tensor_to_string before being recorded.
      workaround_list_files: When true, the observed loci are sorted before
        the comparison.
    """
    # input_fn provides the iterator while expected_dataset provides the
    # filename, even though both are the same type (DeepVariantInput) and may
    # even be the same object.
    observed_loci = []
    with tf.compat.v1.Session() as sess:
      iterator = tf.compat.v1.data.make_one_shot_iterator(
          input_fn({'batch_size': 1}))
      next_example = iterator.get_next()

      sess.run(tf.compat.v1.global_variables_initializer())
      sess.run(tf.compat.v1.local_variables_initializer())
      while True:
        try:
          features, _ = sess.run(next_example)
        except tf.errors.OutOfRangeError:
          # The iterator is exhausted.
          break
        # NB, features['locus'] looks like:
        # array(['chr20:10001019-10001019'], dtype=object)
        locus = features['locus'][0]
        observed_loci.append(
            tf_utils.int_tensor_to_string(locus) if use_tpu else locus)

    if workaround_list_files:
      # Sorting only works for loci, which are string valued and expected to
      # show up in sorted order; that is not true for arbitrary data.  In prod
      # the tf version lets us turn off shuffling so this path is skipped, but
      # kokoro hits this.
      observed_loci.sort()

    expected_loci = []
    for example in tfrecord.read_tfrecords(expected_dataset.input_file_spec):
      expected_loci.append(
          example.features.feature['locus'].bytes_list.value[0])
    self.assertLen(expected_loci, expected_dataset.num_examples)

    # Dump both lists before asserting so a mismatch is easy to inspect.
    if observed_loci != expected_loci:
      print('\n\nlen expected seen', len(expected_loci), len(observed_loci))
      print('\n\nexpected=', expected_loci)
      print('\n\nseen=', observed_loci)
    self.assertEqual(expected_loci, observed_loci)

    # The expected shape comes from the golden dataset; if the data is remade
    # in the future, the values might need to be modified accordingly.
    self.assertEqual(dv_constants.PILEUP_DEFAULT_DIMS,
                     expected_dataset.tensor_shape)
Пример #5
0
    def testGoldenCallingExamples(self, use_tpu):
        """Check that the batch feed reproduces the golden calling examples.

        The golden records are read directly and, in parallel, through the
        batch feed built by get_batch_feed; each fed example must match the
        golden record at the same position. This indirectly checks
        parse_tfexample as well.
        """

        def golden_bytes(example, key):
            # First bytes value stored under `key` in a parsed golden example.
            return example.features.feature[key].bytes_list.value[0]

        def check_encoded_feature(actual, expected):
            # With use_tpu the feature is an int32 buffer of fixed length that
            # must be decoded; otherwise it is compared as-is.
            if use_tpu:
                self.assertEqual(actual.dtype, np.dtype('int32'))
                self.assertEqual(actual.shape,
                                 (tf_utils.STRING_TO_INT_BUFFER_LENGTH, ))
                decoded = tf_utils.int_tensor_to_string(actual)
            else:
                self.assertIsInstance(actual, six.string_types)
                decoded = actual
            self.assertEqual(expected, decoded)

        # Canonical data, read and parsed without going through tf.
        golden_records = list(
            io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES,
                                    proto=example_pb2.Example))

        # The function under test: the same data read through tf.
        batch_feed = self.get_batch_feed(batch_size=1, use_tpu=use_tpu)
        image_dtype = np.dtype('int32') if use_tpu else np.dtype('uint8')

        with self.test_session() as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            num_seen = 0
            while True:
                try:
                    features = sess.run(batch_feed)
                except tf.errors.OutOfRangeError:
                    # The feed is exhausted.
                    break

                # Golden example at the same position as this batch.
                golden = golden_records[num_seen]
                expected_alt_allele_indices = golden_bytes(
                    golden, 'alt_allele_indices/encoded')
                expected_variant = golden_bytes(golden, 'variant/encoded')

                image = features['image'][0]  # np.ndarray
                self.assertEqual(list(image.shape),
                                 dv_constants.PILEUP_DEFAULT_DIMS)
                self.assertIsNotNone(image)
                self.assertEqual(image.dtype, image_dtype)

                check_encoded_feature(features['alt_allele_indices'][0],
                                      expected_alt_allele_indices)
                check_encoded_feature(features['variant'][0],
                                      expected_variant)

                num_seen += 1

            self.assertEqual(num_seen, testdata.N_GOLDEN_CALLING_EXAMPLES)