def testGetShapeFromExamplesPathInvalidPath(self):
    """Checks that a nonexistent examples path raises an error naming it."""
    missing_path = '/this/path/does/not/exist'
    # get_shape_from_examples_path globs via tf.gfile.Glob, which raises
    # errors.OpError at least on a Posix filesystem. Other filesystems
    # might instead return an empty list, which is turned into a different
    # exception -- hence the broad Exception type and the match on only a
    # fragment of the path.
    expect_failure = self.assertRaisesRegexp(Exception, '/this/path/does/not')
    with expect_failure:
        tf_utils.get_shape_from_examples_path(missing_path)
def testGetShapeFromExamplesPathInvalidPath(self, source_paths,
                                            expected_partial_message):
    """Checks that invalid example paths raise with the expected message.

    Args:
      source_paths: the path argument handed to
        tf_utils.get_shape_from_examples_path.
      expected_partial_message: regular expression that must match the
        message of the raised exception.
    """
    # The lookup goes through tf.gfile.Glob, which raises errors.OpError at
    # least on a Posix filesystem. Other filesystems might return an empty
    # list instead, which is turned into a different exception, so only
    # the generic Exception type is required here.
    raises_ctx = self.assertRaisesRegexp(Exception, expected_partial_message)
    with raises_ctx:
        tf_utils.get_shape_from_examples_path(source_paths)
def testGetShapeFromExamplesPath(self, file_name_to_write,
                                 tfrecord_path_to_match):
    """Checks that the shape stored in a written example is read back.

    Writes a single tf.Example carrying an 'image/shape' feature to a
    temporary TFRecord file, then verifies that
    tf_utils.get_shape_from_examples_path returns that same shape.

    Args:
      file_name_to_write: file name passed through test_utils.test_tmpfile
        to obtain the path the example is written to.
      tfrecord_path_to_match: name or pattern passed through
        test_utils.test_tmpfile to obtain the path that is looked up.
    """
    valid_shape = [1, 2, 3]
    example = example_pb2.Example()
    example.features.feature['image/shape'].int64_list.value.extend(
        valid_shape)

    output_file = test_utils.test_tmpfile(file_name_to_write)
    io_utils.write_tfrecords([example], output_file)

    actual_shape = tf_utils.get_shape_from_examples_path(
        test_utils.test_tmpfile(tfrecord_path_to_match))
    # Previously the result was discarded, so the test only proved that the
    # call did not raise. list() keeps the comparison independent of the
    # concrete sequence type returned.
    self.assertEqual(valid_shape, list(actual_shape))
def testGetNoneShapeFromEmptyExamplesPath(self, file_name_to_write,
                                          tfrecord_path_to_match):
    """Checks that a TFRecord file with no examples yields a None shape.

    Args:
      file_name_to_write: file name passed through test_utils.test_tmpfile
        to obtain the path of the empty TFRecord file to create.
      tfrecord_path_to_match: name or pattern passed through
        test_utils.test_tmpfile to obtain the path that is looked up.
    """
    empty_tfrecord = test_utils.test_tmpfile(file_name_to_write)
    # Writing an empty list produces a file that contains no examples.
    io_utils.write_tfrecords([], empty_tfrecord)

    lookup_path = test_utils.test_tmpfile(tfrecord_path_to_match)
    shape = tf_utils.get_shape_from_examples_path(lookup_path)
    self.assertIsNone(shape)
def __init__(self,
             name,
             source,
             num_examples,
             num_classes=DEFAULT_NUM_CLASSES,
             tensor_shape=None):
    """Creates a dataset.

    Args:
      name: str. The name of this dataset. Used to refer to this dataset
        on the command line.
      source: str or list[str]. A file path pattern or a comma-separated
        list of file path patterns pointing to the TF.Example data for
        this dataset.
      num_examples: A positive integer. The number of examples in this
        dataset.
      num_classes: A positive integer. The number of classes in the labels
        of this dataset. Defaults to DEFAULT_NUM_CLASSES.
      tensor_shape: None (which means the shape is read from the examples
        in source), or a list of int [height, width, channel], intended
        for testing.
    """
    self.name = name
    self.source = source
    self.num_examples = num_examples
    self.num_classes = num_classes
    # Any falsy tensor_shape (None, empty list) triggers the lookup from
    # source; the lookup is skipped entirely when a shape is supplied.
    self.tensor_shape = (
        tensor_shape
        if tensor_shape else tf_utils.get_shape_from_examples_path(source))
def prepare_inputs(source_path, model, batch_size, num_readers=None):
    """Prepares image and encoded_variant ops.

    Reads image / encoded_variant tuples from source_path, extracting the
    image and encoded_variant tensors from source_path. When a tensor shape
    can be read from source_path, each image is decoded from its raw uint8
    bytes, reshaped to that shape and preprocessed with
    model.preprocess_image; otherwise the encoded image is passed through
    untouched. Every example in source_path is read once (num_epoch=1).

    Args:
      source_path: Path to a TFRecord file containing deepvariant
        tf.Example protos.
      model: A DeepVariantModel whose preprocess_image function will be
        used on image.
      batch_size: int > 0. Size of batches to use during inference.
      num_readers: int > 0 or None. Number of parallel calls to use when
        parsing and preprocessing examples from source_path. If None (or
        otherwise falsy), uses FLAGS.num_readers instead.

    Returns:
      A tuple of (image, encoded_variant, encoded_alt_allele_indices) TF
      ops, each produced from a dataset batched with batch_size. Image
      holds [height, width, channel] image tensors. encoded_variants is a
      tf.string tensor containing a serialized Variant proto describing the
      variant call associated with image. encoded_alt_allele_indices is a
      tf.string tensor containing a serialized
      CallVariantsOutput.AltAlleleIndices proto containing the alternate
      alleles indices used as "alt" when constructing the image.
    """
    if not num_readers:
        num_readers = FLAGS.num_readers

    tensor_shape = tf_utils.get_shape_from_examples_path(source_path)

    def _parse_single_example(serialized_example):
        """Parses serialized example into a dict of de-serialized features."""
        features = tf.parse_single_example(
            serialized_example,
            features={
                'image/encoded': tf.FixedLenFeature([], tf.string),
                'variant/encoded': tf.FixedLenFeature([], tf.string),
                # deepvariant_pb2.CallVariantsOutput.AltAlleleIndices
                'alt_allele_indices/encoded':
                    tf.FixedLenFeature([], tf.string),
            })
        return features

    with tf.name_scope('input'):

        def _preprocess_image(features):
            """Preprocess images (decode, reshape, model-specific steps)."""
            image = features['image/encoded']
            # Bypassing the reshaping and preprocessing if there is no
            # tensor_shape. Currently that could happen when the input file
            # is empty.
            if tensor_shape:
                image = tf.reshape(
                    tf.decode_raw(image, tf.uint8), tensor_shape)
                image = model.preprocess_image(image)
            features['image/encoded'] = image
            return features

        files = tf.gfile.Glob(
            io_utils.NormalizeToShardedFilePattern(source_path))
        reader_options = io_utils.make_tfrecord_options(files)
        if reader_options.compression_type == (
                tf.python_io.TFRecordCompressionType.GZIP):
            compression_type = 'GZIP'
        else:
            compression_type = None

        dataset = tf.data.TFRecordDataset(
            files, compression_type=compression_type)
        # Use the resolved num_readers rather than FLAGS.num_readers so an
        # explicitly passed value is honored; with num_readers=None the two
        # are identical.
        dataset = dataset.map(
            _parse_single_example, num_parallel_calls=num_readers)
        dataset = dataset.map(
            _preprocess_image, num_parallel_calls=num_readers)
        dataset = dataset.prefetch(10 * batch_size)
        dataset = dataset.batch(batch_size)
        iterator = dataset.make_one_shot_iterator()
        features = iterator.get_next()
        return (features['image/encoded'], features['variant/encoded'],
                features['alt_allele_indices/encoded'])
def __init__(
        self,
        mode,
        input_file_spec,
        num_examples=None,
        num_classes=dv_constants.NUM_CLASSES,
        max_examples=None,
        tensor_shape=None,
        name=None,
        use_tpu=False,
        input_read_threads=_DEFAULT_INPUT_READ_THREADS,
        input_map_threads=_DEFAULT_INPUT_MAP_THREADS,
        shuffle_buffer_size=_DEFAULT_SHUFFLE_BUFFER_ELEMENTS,
        initial_shuffle_buffer_size=_DEFAULT_INITIAL_SHUFFLE_BUFFER_ELEMENTS,
        prefetch_dataset_buffer_size=_DEFAULT_PREFETCH_BUFFER_BYTES,
        sloppy=True,
        list_files_shuffle=True,
        debugging_true_label_mode=False):
    """Create a DeepVariantInput object, usable as an `input_fn`.

    Args:
      mode: the mode string (from `tf.estimator.ModeKeys`).
      input_file_spec: the input filename for a tfrecord[.gz] file
        containing examples. Can contain sharding designators.
      num_examples: the number of examples contained in the input file.
        Required for setting learning rate schedule in train/eval only.
      num_classes: The number of classes in the labels of this dataset.
        Defaults to dv_constants.NUM_CLASSES.
      max_examples: The maximum number of examples to use. If None, all
        examples will be used. If not None, the first
        n = min(max_examples, num_examples) will be used. This works with
        training, and the n examples will repeat over and over.
      tensor_shape: None (which means we get the shape from the first
        example in source), or list of int [height, width, channel] for
        testing.
      name: string, name of the dataset.
      use_tpu: use code paths tuned for TPU, in particular protobuf
        encoding. Default False.
      input_read_threads: number of threads for reading data. Defaults to
        _DEFAULT_INPUT_READ_THREADS.
      input_map_threads: number of threads for mapping data. Defaults to
        _DEFAULT_INPUT_MAP_THREADS.
      shuffle_buffer_size: size of the final shuffle buffer, in elements.
        Defaults to _DEFAULT_SHUFFLE_BUFFER_ELEMENTS.
      initial_shuffle_buffer_size: int; the size of the dataset.shuffle
        buffer in elements. Defaults to
        _DEFAULT_INITIAL_SHUFFLE_BUFFER_ELEMENTS.
      prefetch_dataset_buffer_size: int; the size of the TFRecordDataset
        buffer in bytes. Defaults to _DEFAULT_PREFETCH_BUFFER_BYTES.
      sloppy: boolean, allow parallel_interleave to be sloppy. Default
        True.
      list_files_shuffle: boolean, allow list_files to shuffle. Default
        True.
      debugging_true_label_mode: boolean. If true, the input examples are
        created with "training" mode. We'll parse the 'label' field even
        if the `mode` is PREDICT.

    Raises:
      ValueError: if `num_examples` is not provided in a mode other than
        PREDICT, or if `max_examples` is provided but is not > 0.
    """
    self.mode = mode
    self.input_file_spec = input_file_spec
    self.name = name
    self.num_examples = num_examples
    self.num_classes = num_classes
    self.max_examples = max_examples

    self.use_tpu = use_tpu
    self.sloppy = sloppy
    self.list_files_shuffle = list_files_shuffle
    self.input_read_threads = input_read_threads
    self.input_map_threads = input_map_threads
    self.shuffle_buffer_size = shuffle_buffer_size
    self.initial_shuffle_buffer_size = initial_shuffle_buffer_size
    self.prefetch_dataset_buffer_size = prefetch_dataset_buffer_size
    self.debugging_true_label_mode = debugging_true_label_mode

    # The flag passed here is true for TRAIN/EVAL, or whenever
    # debugging_true_label_mode forces it on regardless of mode.
    self.feature_extraction_spec = self.features_extraction_spec_for_mode(
        mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL) or
        debugging_true_label_mode)

    if num_examples is None and mode != tf.estimator.ModeKeys.PREDICT:
        # The trailing space matters: adjacent literals are concatenated
        # verbatim, and without it the message read "DeepVariantInputin".
        raise ValueError('num_examples argument required for '
                         'DeepVariantInput in TRAIN/EVAL modes.')

    if max_examples is not None:
        if max_examples <= 0:
            raise ValueError(
                'max_examples must be > 0 if not None. Got {}'.format(
                    max_examples))
        # We update our num_examples in the situation where num_examples is
        # set (i.e., is not None) to the smaller of max_examples and
        # num_examples.
        if self.num_examples is not None:
            self.num_examples = min(max_examples, self.num_examples)

    if tensor_shape:
        self.tensor_shape = tensor_shape
    else:
        # No (or an empty) shape was supplied: read it from the input.
        self.tensor_shape = tf_utils.get_shape_from_examples_path(
            input_file_spec)
    self.input_files = sharded_file_utils.glob_list_sharded_file_patterns(
        self.input_file_spec)