예제 #1
0
 def test_make_tfrecord_options_with_bad_inputs(self, filenames):
     """Checks the ValueError raised by make_tfrecord_options for `filenames`.

     Args:
       filenames: sequence of str filenames handed to
         io.make_tfrecord_options; the call is expected to raise ValueError
         with a message that lists them joined by commas.
     """
     # Imported locally so this method does not depend on a module-level
     # import of `re`.
     import re

     expected_message = (
         'Incorrect value: {}. Filenames need to be all of the same type: '
         'either all with .gz or all without .gz'.format(
             ','.join(filenames)))
     # assertRaisesRegexp treats its second argument as a regular expression.
     # Escape the message so that the '.' characters (and anything special in
     # the filenames) are matched literally rather than as regex syntax.
     with self.assertRaisesRegexp(ValueError, re.escape(expected_message)):
         io.make_tfrecord_options(filenames)
예제 #2
0
def compression_type_of_files(files):
    """Names the compression implied by the TFRecord options for `files`.

    Args:
      files: filenames handed to io_utils.make_tfrecord_options.

    Returns:
      The string 'GZIP' when the options built for `files` use
      TFRecordCompressionType.GZIP, otherwise None.
    """
    options = io_utils.make_tfrecord_options(files)
    gzip_type = tf.python_io.TFRecordCompressionType.GZIP
    return 'GZIP' if options.compression_type == gzip_type else None
예제 #3
0
    def assertDataSetExamplesMatchExpected(self, dataset, expected_dataset):
        """Asserts that expected_dataset is read back in the expected order.

        Reads expected_dataset.num_examples examples through a slim
        DatasetDataProvider (with shuffle=False) and checks that the 'locus'
        values seen equal, in order, the ones read directly from
        expected_dataset.source via io_utils.read_tfrecords. Finally checks
        expected_dataset.tensor_shape against the golden shape.

        Args:
          dataset: Not used by this helper; kept in the signature so that
            existing callers keep working.
          expected_dataset: Object providing get_slim_dataset(), source,
            num_examples and tensor_shape.
        """
        with tf.Session() as sess:
            provider = slim.dataset_data_provider.DatasetDataProvider(
                expected_dataset.get_slim_dataset(),
                shuffle=False,
                reader_kwargs={
                    'options':
                    io_utils.make_tfrecord_options(expected_dataset.source)
                })
            sess.run(tf.global_variables_initializer())
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            try:
                image, label, locus = provider.get(['image', 'label', 'locus'])
                # All three tensors are fetched on every step, but only the
                # locus (index 2) is kept for the comparison below.
                seen = [
                    sess.run([image, label, locus])[2]
                    for _ in range(expected_dataset.num_examples)
                ]
            finally:
                # Stop and join the queue-runner threads even when a read
                # fails, so they are not left running after this helper.
                coord.request_stop()
                coord.join(threads)

        expected_loci = [
            example.features.feature['locus'].bytes_list.value[0]
            for example in io_utils.read_tfrecords(expected_dataset.source)
        ]
        self.assertEqual(len(expected_loci), expected_dataset.num_examples)
        self.assertEqual(expected_loci, seen)
        # Note that this expected shape comes from the golden dataset. If the data
        # is remade in the future, the values might need to be modified accordingly.
        self.assertEqual([100, 221, pileup_image.DEFAULT_NUM_CHANNEL],
                         expected_dataset.tensor_shape)
예제 #4
0
  def assertDataSetExamplesMatchExpected(self, dataset, expected_dataset):
    """Asserts that expected_dataset is read back in the expected order.

    Reads expected_dataset.num_examples examples through a slim
    DatasetDataProvider (with shuffle=False) and checks that the 'locus'
    values seen equal, in order, the ones read directly from
    expected_dataset.source via io_utils.read_tfrecords. Finally checks
    expected_dataset.tensor_shape against the golden shape.

    Args:
      dataset: Not used by this helper; kept in the signature so that
        existing callers keep working.
      expected_dataset: Object providing get_slim_dataset(), source,
        num_examples and tensor_shape.
    """
    with tf.Session() as sess:
      provider = slim.dataset_data_provider.DatasetDataProvider(
          expected_dataset.get_slim_dataset(),
          shuffle=False,
          reader_kwargs={
              'options': io_utils.make_tfrecord_options(expected_dataset.source)
          })
      sess.run(tf.global_variables_initializer())
      coord = tf.train.Coordinator()
      threads = tf.train.start_queue_runners(coord=coord, sess=sess)
      try:
        image, label, locus = provider.get(['image', 'label', 'locus'])
        # All three tensors are fetched on every step, but only the locus
        # (index 2) is kept for the comparison below.
        seen = [
            sess.run([image, label, locus])[2]
            for _ in range(expected_dataset.num_examples)
        ]
      finally:
        # Stop and join the queue-runner threads even when a read fails, so
        # they are not left running after this helper.
        coord.request_stop()
        coord.join(threads)

    expected_loci = [
        example.features.feature['locus'].bytes_list.value[0]
        for example in io_utils.read_tfrecords(expected_dataset.source)
    ]
    self.assertEqual(len(expected_loci), expected_dataset.num_examples)
    self.assertEqual(expected_loci, seen)
    # Note that this expected shape comes from the golden dataset. If the data
    # is remade in the future, the values might need to be modified accordingly.
    self.assertEqual([100, 221, pileup_image.DEFAULT_NUM_CHANNEL],
                     expected_dataset.tensor_shape)
예제 #5
0
def prepare_inputs(source_path, model, batch_size, num_readers=None):
  """Prepares image and encoded_variant ops.

  Reads the tf.Example protos found at source_path, parses out the
  'image/encoded', 'variant/encoded' and 'alt_allele_indices/encoded'
  features, decodes each image from its raw uint8 bytes, reshapes it and
  applies model.preprocess_image, then batches the results. The dataset is
  not repeated, so every example in source_path is read once.

  Args:
    source_path: Path to a TFRecord file containing deepvariant tf.Example
      protos. It is passed through io_utils.NormalizeToShardedFilePattern and
      globbed, and the compression type is chosen from the matching
      filenames.
    model: A DeepVariantModel whose preprocess_image function will be used on
      image.
    batch_size: int > 0. Size of batches to use during inference.
    num_readers: int > 0 or None. Number of parallel calls to use when
      parsing and preprocessing examples from source_path. If None (or any
      other falsy value), uses FLAGS.num_readers instead.

  Returns:
    A tuple of (image, encoded_variant, encoded_alt_allele_indices) TF ops,
    each produced by dataset.batch(batch_size) and therefore carrying a
    leading batch dimension.
    Each image is reshaped to the shape returned by
    tf_utils.get_shape_from_examples_path(source_path) before
    model.preprocess_image is applied; when no shape is available the
    encoded image string is passed through unchanged.
    encoded_variants is a tf.string tensor containing a serialized Variant proto
    describing the variant call associated with image.
    encoded_alt_allele_indices is a tf.string tensor containing a serialized
    CallVariantsOutput.AltAlleleIndices proto containing the
    alternate alleles indices used as "alt" when constructing the image.
  """
  if not num_readers:
    num_readers = FLAGS.num_readers

  tensor_shape = tf_utils.get_shape_from_examples_path(source_path)

  def _parse_single_example(serialized_example):
    """Parses serialized example into a dictionary of de-serialized features."""
    features = tf.parse_single_example(
        serialized_example,
        features={
            'image/encoded': tf.FixedLenFeature([], tf.string),
            'variant/encoded': tf.FixedLenFeature([], tf.string),
            # deepvariant_pb2.CallVariantsOutput.AltAlleleIndices
            'alt_allele_indices/encoded': tf.FixedLenFeature([], tf.string),
        })
    return features

  with tf.name_scope('input'):

    def _preprocess_image(features):
      """Preprocess images (decode, reshape, and apply model-specific steps)."""
      image = features['image/encoded']
      # Bypassing the reshaping and preprocessing if there is no tensor_shape.
      # Currently that could happen when the input file is empty.
      if tensor_shape:
        image = tf.reshape(tf.decode_raw(image, tf.uint8), tensor_shape)
        image = model.preprocess_image(image)
      features['image/encoded'] = image
      return features

    files = tf.gfile.Glob(io_utils.NormalizeToShardedFilePattern(source_path))
    reader_options = io_utils.make_tfrecord_options(files)
    # TFRecordDataset takes the compression type as a string (or None), so
    # translate the enum value carried by the reader options.
    if reader_options.compression_type == (
        tf.python_io.TFRecordCompressionType.GZIP):
      compression_type = 'GZIP'
    else:
      compression_type = None
    dataset = tf.data.TFRecordDataset(files, compression_type=compression_type)
    # Use the resolved num_readers (the caller's value, or the flag default)
    # rather than always reading FLAGS.num_readers, so that the argument is
    # actually honored.
    dataset = dataset.map(_parse_single_example, num_parallel_calls=num_readers)
    dataset = dataset.map(_preprocess_image, num_parallel_calls=num_readers)
    dataset = dataset.prefetch(10 * batch_size)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    features = iterator.get_next()
    return (features['image/encoded'], features['variant/encoded'],
            features['alt_allele_indices/encoded'])
예제 #6
0
def make_batches(dataset, model, batch_size, mode):
  """Builds batched (image, label, variant) tensors from a slim dataset.

  Wraps dataset in a slim DatasetDataProvider, pulls the "image", "label" and
  "variant" items from it, runs model.preprocess_image() on the image, and
  batches the three tensors together. In TRAIN mode the batches come from
  tf.train.shuffle_batch; in EVAL mode the provider uses a single reader
  without shuffling and the batches come from tf.train.batch.

  Args:
    dataset: a slim DataSet to turn into batches. Must provide the data items
      "image", "label", and "variant", plus data_sources and (in TRAIN mode)
      num_samples.
    model: a DeepVariantModel whose preprocess_image() is applied to each
      image before batching.
    batch_size: the number of examples in each batch.
    mode: str; one of TRAIN or EVAL.

  Returns:
    The result of tf.train.shuffle_batch (TRAIN) or tf.train.batch (EVAL)
    applied to [image, label, variant], i.e. the batched images, labels and
    encoded variants in that order.

  Raises:
    ValueError: if mode is not one of TRAIN or EVAL.
  """
  valid_modes = {'TRAIN', 'EVAL'}
  if mode not in valid_modes:
    raise ValueError(
        'mode is {} but must be one of TRAIN or EVAL.'.format(mode))
  training = mode == 'TRAIN'

  # Only the queue/reader settings differ between the two modes; the reader
  # options are derived from the dataset's sources in both.
  if training:
    provider_kwargs = {
        'common_queue_capacity': 2 * batch_size,
        'common_queue_min': batch_size,
    }
  else:
    provider_kwargs = {'num_readers': 1, 'shuffle': False}
  provider_kwargs['reader_kwargs'] = {
      'options': io_utils.make_tfrecord_options(dataset.data_sources)
  }
  provider = slim.dataset_data_provider.DatasetDataProvider(
      dataset, **provider_kwargs)

  # Load the data and preprocess the image before batching.
  raw_image, label, variant = provider.get(['image', 'label', 'variant'])
  tensors = [model.preprocess_image(raw_image), label, variant]

  if not training:
    return tf.train.batch(tensors, batch_size=batch_size, num_threads=1)
  return tf.train.shuffle_batch(
      tensors,
      batch_size=batch_size,
      num_threads=4,
      capacity=5000,
      min_after_dequeue=min(1000, dataset.num_samples))
예제 #7
0
def prepare_inputs(source_path, model, batch_size, num_readers=None):
  """Prepares image and encoded_variant ops.

  Reads the tf.Example protos found at source_path, parses out the
  'image/encoded', 'variant/encoded' and 'alt_allele_indices/encoded'
  features, decodes each image from its raw uint8 bytes, reshapes it and
  applies model.preprocess_image, then batches the results. The dataset is
  not repeated, so every example in source_path is read once.

  Args:
    source_path: Path to a TFRecord file containing deepvariant tf.Example
      protos. It is passed through io_utils.NormalizeToShardedFilePattern and
      globbed, and the compression type is chosen from the matching
      filenames.
    model: A DeepVariantModel whose preprocess_image function will be used on
      image.
    batch_size: int > 0. Size of batches to use during inference.
    num_readers: int > 0 or None. Number of parallel calls to use when
      parsing and preprocessing examples from source_path. If None (or any
      other falsy value), uses FLAGS.num_readers instead.

  Returns:
    A tuple of (image, encoded_variant, encoded_alt_allele_indices) TF ops,
    each produced by dataset.batch(batch_size) and therefore carrying a
    leading batch dimension.
    Each image is reshaped to the shape returned by
    tf_utils.get_shape_from_examples_path(source_path) before
    model.preprocess_image is applied; when no shape is available the
    encoded image string is passed through unchanged.
    encoded_variants is a tf.string tensor containing a serialized Variant proto
    describing the variant call associated with image.
    encoded_alt_allele_indices is a tf.string tensor containing a serialized
    CallVariantsOutput.AltAlleleIndices proto containing the
    alternate alleles indices used as "alt" when constructing the image.
  """
  if not num_readers:
    num_readers = FLAGS.num_readers

  tensor_shape = tf_utils.get_shape_from_examples_path(source_path)

  def _parse_single_example(serialized_example):
    """Parses serialized example into a dictionary of de-serialized features."""
    features = tf.parse_single_example(
        serialized_example,
        features={
            'image/encoded': tf.FixedLenFeature([], tf.string),
            'variant/encoded': tf.FixedLenFeature([], tf.string),
            # deepvariant_pb2.CallVariantsOutput.AltAlleleIndices
            'alt_allele_indices/encoded': tf.FixedLenFeature([], tf.string),
        })
    return features

  with tf.name_scope('input'):

    def _preprocess_image(features):
      """Preprocess images (decode, reshape, and apply model-specific steps)."""
      image = features['image/encoded']
      # Bypassing the reshaping and preprocessing if there is no tensor_shape.
      # Currently that could happen when the input file is empty.
      if tensor_shape:
        image = tf.reshape(tf.decode_raw(image, tf.uint8), tensor_shape)
        image = model.preprocess_image(image)
      features['image/encoded'] = image
      return features

    files = tf.gfile.Glob(io_utils.NormalizeToShardedFilePattern(source_path))
    reader_options = io_utils.make_tfrecord_options(files)
    # TFRecordDataset takes the compression type as a string (or None), so
    # translate the enum value carried by the reader options.
    if reader_options.compression_type == (
        tf.python_io.TFRecordCompressionType.GZIP):
      compression_type = 'GZIP'
    else:
      compression_type = None
    dataset = tf.data.TFRecordDataset(files, compression_type=compression_type)
    # Use the resolved num_readers (the caller's value, or the flag default)
    # rather than always reading FLAGS.num_readers, so that the argument is
    # actually honored.
    dataset = dataset.map(_parse_single_example, num_parallel_calls=num_readers)
    dataset = dataset.map(_preprocess_image, num_parallel_calls=num_readers)
    dataset = dataset.prefetch(10 * batch_size)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    features = iterator.get_next()
    return (features['image/encoded'], features['variant/encoded'],
            features['alt_allele_indices/encoded'])
예제 #8
0
def make_batches(dataset, model, batch_size, mode):
    """Builds batched (image, label, variant) tensors from a slim dataset.

    Wraps dataset in a slim DatasetDataProvider, pulls the "image", "label"
    and "variant" items from it, runs model.preprocess_image() on the image,
    and batches the three tensors together. In TRAIN mode the batches come
    from tf.train.shuffle_batch; in EVAL mode the provider uses a single
    reader without shuffling and the batches come from tf.train.batch.

    Args:
      dataset: a slim DataSet to turn into batches. Must provide the data
        items "image", "label", and "variant", plus data_sources and (in
        TRAIN mode) num_samples.
      model: a DeepVariantModel whose preprocess_image() is applied to each
        image before batching.
      batch_size: the number of examples in each batch.
      mode: str; one of TRAIN or EVAL.

    Returns:
      The result of tf.train.shuffle_batch (TRAIN) or tf.train.batch (EVAL)
      applied to [image, label, variant], i.e. the batched images, labels and
      encoded variants in that order.

    Raises:
      ValueError: if mode is not one of TRAIN or EVAL.
    """
    valid_modes = {'TRAIN', 'EVAL'}
    if mode not in valid_modes:
        raise ValueError(
            'mode is {} but must be one of TRAIN or EVAL.'.format(mode))
    training = mode == 'TRAIN'

    # Only the queue/reader settings differ between the two modes; the reader
    # options are derived from the dataset's sources in both.
    if training:
        provider_kwargs = {
            'common_queue_capacity': 2 * batch_size,
            'common_queue_min': batch_size,
        }
    else:
        provider_kwargs = {'num_readers': 1, 'shuffle': False}
    provider_kwargs['reader_kwargs'] = {
        'options': io_utils.make_tfrecord_options(dataset.data_sources)
    }
    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset, **provider_kwargs)

    # Load the data and preprocess the image before batching.
    raw_image, label, variant = provider.get(['image', 'label', 'variant'])
    tensors = [model.preprocess_image(raw_image), label, variant]

    if not training:
        return tf.train.batch(tensors, batch_size=batch_size, num_threads=1)
    return tf.train.shuffle_batch(
        tensors,
        batch_size=batch_size,
        num_threads=4,
        capacity=5000,
        min_after_dequeue=min(1000, dataset.num_samples))
예제 #9
0
 def test_make_tfrecord_options_with_bad_inputs(self, filenames):
   """Checks the ValueError raised by make_tfrecord_options for `filenames`.

   Args:
     filenames: sequence of str filenames handed to io.make_tfrecord_options;
       the call is expected to raise ValueError with a message that lists
       them joined by commas.
   """
   # Imported locally so this method does not depend on a module-level import
   # of `re`.
   import re

   expected_message = (
       'Incorrect value: {}. Filenames need to be all of the same type: '
       'either all with .gz or all without .gz'.format(','.join(filenames)))
   # assertRaisesRegexp treats its second argument as a regular expression.
   # Escape the message so that the '.' characters (and anything special in
   # the filenames) are matched literally rather than as regex syntax.
   with self.assertRaisesRegexp(ValueError, re.escape(expected_message)):
     io.make_tfrecord_options(filenames)
예제 #10
0
 def test_make_tfrecord_options(self, filenames, expected_compression_type):
   """Checks the compression type string of the options built for filenames.

   Args:
     filenames: filenames handed to io.make_tfrecord_options.
     expected_compression_type: the value that
       TFRecordOptions.get_compression_type_string should return for the
       resulting options.
   """
   options = io.make_tfrecord_options(filenames)
   actual = python_io.TFRecordOptions.get_compression_type_string(options)
   self.assertEqual(actual, expected_compression_type)