Exemplo n.º 1
0
 def test_make_tfrecord_options_with_bad_inputs(self, filenames):
     """Checks that make_tfrecord_options rejects the given filenames.

     Expects io.make_tfrecord_options(filenames) to raise a ValueError whose
     message contains the "Incorrect value" text built from the
     comma-joined filenames.

     Args:
       filenames: sequence of filename strings passed to
         io.make_tfrecord_options.
     """
     # Local import keeps this method self-contained.
     import re

     expected_message = (
         'Incorrect value: {}. Filenames need to be all of the same type: '
         'either all with .gz or all without .gz'.format(
             ','.join(filenames)))
     # The second argument is interpreted as a regular expression, so the
     # literal message is escaped; otherwise '.' (and any metacharacters in
     # the filenames) would act as wildcards and loosen the check.
     with self.assertRaisesRegexp(ValueError, re.escape(expected_message)):
         io.make_tfrecord_options(filenames)
    def assertDataSetExamplesMatchExpected(self, dataset, expected_dataset):
        """Asserts the loci served by a slim provider match the TFRecord source.

        Reads expected_dataset.num_examples examples, unshuffled, through a
        slim DatasetDataProvider built from expected_dataset, and compares the
        'locus' value of each with the loci read directly from the TFRecords
        at expected_dataset.source. Also checks expected_dataset.tensor_shape
        against a fixed golden shape.

        Args:
          dataset: not referenced by this method.
          expected_dataset: object providing get_slim_dataset(), source,
            num_examples and tensor_shape.
        """
        with tf.Session() as sess:
            slim_dataset = expected_dataset.get_slim_dataset()
            reader_options = io_utils.make_tfrecord_options(
                expected_dataset.source)
            provider = slim.dataset_data_provider.DatasetDataProvider(
                slim_dataset,
                shuffle=False,
                reader_kwargs={'options': reader_options})
            sess.run(tf.global_variables_initializer())
            coordinator = tf.train.Coordinator()
            runner_threads = tf.train.start_queue_runners(
                coord=coordinator, sess=sess)
            image_op, label_op, locus_op = provider.get(
                ['image', 'label', 'locus'])

            # Fetch all three tensors on every step but keep only the locus.
            observed_loci = []
            for _ in range(expected_dataset.num_examples):
                fetched = sess.run([image_op, label_op, locus_op])
                observed_loci.append(fetched[2])

            coordinator.request_stop()
            coordinator.join(runner_threads)

        # Loci read straight from the TFRecord file(s), bypassing the provider.
        golden_loci = []
        for example in io_utils.read_tfrecords(expected_dataset.source):
            locus_feature = example.features.feature['locus']
            golden_loci.append(locus_feature.bytes_list.value[0])

        self.assertEqual(len(golden_loci), expected_dataset.num_examples)
        self.assertEqual(golden_loci, observed_loci)
        # This expected shape comes from the golden dataset. If that data is
        # regenerated, the values here may need to be updated to match.
        self.assertEqual([100, 221, 7], expected_dataset.tensor_shape)
def make_training_batches(dataset, model, batch_size):
    """Builds shuffled training batches of pileup images from a dataset.

    Wraps `dataset` in a slim DatasetDataProvider, pulls the "image", "label"
    and "truth_variant" items from it, runs each image through
    model.preprocess_image(), and batches the three tensors together with
    tf.train.shuffle_batch.

    Args:
      dataset: a slim DataSet to turn into batches. Must provide the data
        items "image", "label", and "truth_variant", plus `data_sources` and
        `num_samples` attributes.
      model: a DeepVariantModel whose preprocess_image() is applied to each
        image before batching.
      batch_size: the number of images in each batch.

    Returns:
      The result of tf.train.shuffle_batch over [image, label, truth_variant]:
        images: a batch of preprocessed images (batch_size on the leading
          dimension).
        labels: the labels for each image, in the same order.
        encoded_truth_variants: the truth_variant entries (per the original
          documentation, byte-encoded learning.genomics.v1.Variant protos), in
          the same order as images and labels.
    """
    queue_capacity = 2 * batch_size
    reader_options = io_utils.make_tfrecord_options(dataset.data_sources)
    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        common_queue_capacity=queue_capacity,
        common_queue_min=batch_size,
        reader_kwargs={'options': reader_options})

    # Pull the per-example tensors and preprocess the image.
    tensor_names = ['image', 'label', 'truth_variant']
    raw_image, label, truth_variant = provider.get(tensor_names)
    preprocessed_image = model.preprocess_image(raw_image)

    # Never ask for more buffered examples than the dataset holds.
    shuffle_floor = min(1000, dataset.num_samples)
    return tf.train.shuffle_batch(
        [preprocessed_image, label, truth_variant],
        batch_size=batch_size,
        num_threads=4,
        capacity=5000,
        min_after_dequeue=shuffle_floor)
Exemplo n.º 4
0
def prepare_inputs(source_path, model, batch_size, num_readers=None):
    """Prepares batched image and encoded_variant ops.

    Reads tf.Example protos from the TFRecord file(s) matching source_path
    and extracts the image, variant, and alt-allele-indices features from
    each one. When a tensor shape can be determined for source_path, the
    image bytes are decoded as raw uint8, reshaped to that shape, and passed
    through model.preprocess_image. Every example is read once: the dataset
    is not repeated and is consumed through a one-shot iterator.

    Args:
      source_path: Path to a TFRecord file containing deepvariant tf.Example
        protos.
      model: A DeepVariantModel whose preprocess_image function will be used
        on image.
      batch_size: int > 0. Size of batches to use during inference.
      num_readers: int > 0 or None. Number of parallel calls used for the
        parsing and preprocessing map stages. If None (or otherwise falsy),
        FLAGS.num_readers is used instead.

    Returns:
      A tuple of (image, encoded_variant, encoded_alt_allele_indices) TF ops,
      each batched along a leading dimension by dataset.batch(batch_size).
      image holds the preprocessed images (or the still-encoded image
      strings when no tensor shape was available).
      encoded_variant is a tf.string tensor containing serialized Variant
      protos describing the variant call associated with each image.
      encoded_alt_allele_indices is a tf.string tensor containing serialized
      CallVariantsOutput.AltAlleleIndices protos with the alternate allele
      indices used as "alt" when constructing each image.
    """
    if not num_readers:
        num_readers = FLAGS.num_readers

    tensor_shape = tf_utils.get_shape_from_examples_path(source_path)

    def _parse_single_example(serialized_example):
        """Parses serialized example into a dictionary of de-serialized features."""
        features = tf.parse_single_example(
            serialized_example,
            features={
                'image/encoded': tf.FixedLenFeature([], tf.string),
                'variant/encoded': tf.FixedLenFeature([], tf.string),
                # deepvariant_pb2.CallVariantsOutput.AltAlleleIndices
                'alt_allele_indices/encoded': tf.FixedLenFeature([],
                                                                 tf.string),
            })
        return features

    with tf.name_scope('input'):

        def _preprocess_image(features):
            """Preprocess images (decode, reshape, and apply model-specific steps)."""
            image = features['image/encoded']
            # Bypassing the reshaping and preprocessing if there is no tensor_shape.
            # Currently that could happen when the input file is empty.
            if tensor_shape:
                image = tf.reshape(tf.decode_raw(image, tf.uint8),
                                   tensor_shape)
                image = model.preprocess_image(image)
            features['image/encoded'] = image
            return features

        files = tf.gfile.Glob(
            io_utils.NormalizeToShardedFilePattern(source_path))
        reader_options = io_utils.make_tfrecord_options(files)
        if reader_options.compression_type == (
                tf.python_io.TFRecordCompressionType.GZIP):
            compression_type = 'GZIP'
        else:
            compression_type = None
        dataset = tf.data.TFRecordDataset(files,
                                          compression_type=compression_type)
        # Use the resolved num_readers so that a caller-supplied value is
        # honored instead of always falling back to the flag.
        dataset = dataset.map(_parse_single_example,
                              num_parallel_calls=num_readers)
        dataset = dataset.map(_preprocess_image,
                              num_parallel_calls=num_readers)
        dataset = dataset.prefetch(10 * batch_size)
        dataset = dataset.batch(batch_size)
        iterator = dataset.make_one_shot_iterator()
        features = iterator.get_next()
        return (features['image/encoded'], features['variant/encoded'],
                features['alt_allele_indices/encoded'])
Exemplo n.º 5
0
 def test_make_tfrecord_options(self, filenames, expected_compression_type):
     """Checks the compression type string chosen for the given filenames.

     Builds TFRecord options from `filenames` with io.make_tfrecord_options
     and asserts that their compression type string equals
     `expected_compression_type`.
     """
     to_type_string = tf.python_io.TFRecordOptions.get_compression_type_string
     options = io.make_tfrecord_options(filenames)
     actual_compression_type = to_type_string(options)
     self.assertEqual(actual_compression_type, expected_compression_type)