def testGetNoneShapeFromEmptyExamplesPath(self, file_name_to_write,
                                          tfrecord_path_to_match):
  """Checks that a TFRecord input with no records yields a None shape.

  Args:
    file_name_to_write: Name handed to test_utils.test_tmpfile for the empty
      file that gets written.
    tfrecord_path_to_match: Name handed to test_utils.test_tmpfile to build
      the path queried with tf_utils.get_shape_from_examples_path.
  """
  # Produce an input that holds zero records.
  empty_path = test_utils.test_tmpfile(file_name_to_write)
  tfrecord.write_tfrecords([], empty_path)

  lookup_path = test_utils.test_tmpfile(tfrecord_path_to_match)
  found_shape = tf_utils.get_shape_from_examples_path(lookup_path)
  self.assertIsNone(found_shape)
def test_prepare_inputs(self, filename_to_write, file_string_input):
  """Checks that prepare_inputs surfaces every variant written to disk.

  Args:
    filename_to_write: Name handed to test_utils.test_tmpfile for the file
      that self.examples is written to.
    file_string_input: A single file spec, or several separated by commas.
      Each one is resolved through test_utils.test_tmpfile before the joined
      string is given to call_variants.prepare_inputs.
  """
  written_path = test_utils.test_tmpfile(filename_to_write)
  tfrecord.write_tfrecords(self.examples, written_path)

  # file_string_input may be a comma-separated list: resolve each entry
  # through test_tmpfile, then glue the pieces back into one string.
  resolved_specs = []
  for spec in file_string_input.split(','):
    resolved_specs.append(test_utils.test_tmpfile(spec))
  file_string_input = ','.join(resolved_specs)

  with self.test_session() as sess:
    sess.run(tf.compat.v1.local_variables_initializer())
    sess.run(tf.compat.v1.global_variables_initializer())

    dataset = call_variants.prepare_inputs(file_string_input)
    _, variants, _ = _get_infer_batches(
        dataset, model=self.model, batch_size=1)

    # Drain the dataset; OutOfRangeError marks the end of the input.
    collected = []
    try:
      while True:
        collected.extend(sess.run(variants))
    except tf.errors.OutOfRangeError:
      pass

    decoded = variant_utils.decode_variants(collected)
    six.assertCountEqual(self, self.variants, decoded)
def testGetShapeFromExamplesPath(self, file_name_to_write,
                                 tfrecord_path_to_match):
  """Checks that the image/shape of a written example is read back.

  Args:
    file_name_to_write: Name handed to test_utils.test_tmpfile for the file
      the single example is written to.
    tfrecord_path_to_match: Name handed to test_utils.test_tmpfile to build
      the path queried with tf_utils.get_shape_from_examples_path.
  """
  example = example_pb2.Example()
  valid_shape = [1, 2, 3]
  example.features.feature['image/shape'].int64_list.value.extend(valid_shape)
  output_file = test_utils.test_tmpfile(file_name_to_write)
  tfrecord.write_tfrecords([example], output_file)

  shape = tf_utils.get_shape_from_examples_path(
      test_utils.test_tmpfile(tfrecord_path_to_match))
  # The result used to be discarded, so the test could only fail if the call
  # raised. Compare it against the shape stored in the example instead.
  self.assertIsNotNone(shape)
  # list() normalizes a protobuf repeated-field slice to a plain list.
  self.assertEqual(valid_shape, list(shape))
def test_call_end2end_with_empty_shards(self):
  """Runs call_variants over a sharded input with more shards than records."""
  n_shards = 15

  # Keep at most 10 of the golden examples.
  few_examples = list(
      tfrecord.read_tfrecords(
          testdata.GOLDEN_CALLING_EXAMPLES, max_records=10))

  # 15 shards for at most 10 records: several shards necessarily stay empty.
  sharded_spec = test_utils.test_tmpfile('sharded@{}'.format(n_shards))
  tfrecord.write_tfrecords(few_examples, sharded_spec)

  expected_records = len(few_examples)
  self.assertCallVariantsEmitsNRecordsForRandomGuess(sharded_spec,
                                                     expected_records)
def make_golden_dataset(compressed_inputs=False):
  """Returns the path of the golden postprocess input.

  Args:
    compressed_inputs: When true, the golden CallVariantsOutput records are
      re-written to a '.tfrecord.gz' temp file and that path is returned.
      Otherwise the golden path itself is returned untouched.

  Returns:
    The path of the dataset to read.
  """
  if not compressed_inputs:
    return testdata.GOLDEN_POSTPROCESS_INPUT

  gz_path = test_utils.test_tmpfile(
      'golden.postprocess_single_site_input.tfrecord.gz')
  golden_records = tfrecord.read_tfrecords(
      testdata.GOLDEN_POSTPROCESS_INPUT,
      proto=deepvariant_pb2.CallVariantsOutput)
  tfrecord.write_tfrecords(golden_records, gz_path)
  return gz_path
def test_call_end2end_empty_first_shard(self):
  """Runs call_variants on a two-shard input whose first shard is empty."""
  # Keep at most 10 of the golden examples.
  few_examples = list(
      tfrecord.read_tfrecords(
          testdata.GOLDEN_CALLING_EXAMPLES, max_records=10))

  # Shard 0 receives no records at all; shard 1 receives every example.
  shard_zero = test_utils.test_tmpfile('empty_1st_shard-00000-of-00002')
  tfrecord.write_tfrecords([], shard_zero)
  shard_one = test_utils.test_tmpfile('empty_1st_shard-00001-of-00002')
  tfrecord.write_tfrecords(few_examples, shard_one)

  sharded_spec = test_utils.test_tmpfile('empty_1st_shard@2')
  self.assertCallVariantsEmitsNRecordsForRandomGuess(sharded_spec,
                                                     len(few_examples))
def test_reading_empty_input_should_raise_error(self):
  """Checks that postprocess_variants rejects an input with no records.

  Both shards of the input are written empty, and main() is expected to raise
  a ValueError whose message matches 'Cannot find any records in'.
  """
  empty_shard_one = test_utils.test_tmpfile(
      'no_records.tfrecord-00000-of-00002')
  empty_shard_two = test_utils.test_tmpfile(
      'no_records.tfrecord-00001-of-00002')
  tfrecord.write_tfrecords([], empty_shard_one)
  tfrecord.write_tfrecords([], empty_shard_two)

  FLAGS.infile = test_utils.test_tmpfile('no_records.tfrecord@2')
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.outfile = test_utils.test_tmpfile('no_records.vcf')

  # assertRaisesRegexp is a deprecated alias that Python 3.12 removed; the six
  # wrapper picks whichever spelling the running interpreter provides.
  with six.assertRaisesRegex(self, ValueError, 'Cannot find any records in'):
    postprocess_variants.main(['postprocess_variants.py'])
def test_call_variants_with_no_shape(self, model):
  """Checks that an example lacking image/shape is rejected with ValueError.

  Args:
    model: The model handed to _get_infer_batches.
  """
  # Start from the first record of a valid file...
  shapeless = next(tfrecord.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))
  # ...and strip its image/shape feature.
  del shapeless.features.feature['image/shape']

  examples_path = test_utils.test_tmpfile('make_examples_out_noshape.tfrecord')
  tfrecord.write_tfrecords([shapeless], examples_path)

  expected_message = ('Invalid image/shape: we expect to find an image/shape '
                      'field with length 3.')
  with six.assertRaisesRegex(self, ValueError, expected_message):
    dataset = call_variants.prepare_inputs(examples_path)
    _ = list(_get_infer_batches(dataset, model=model, batch_size=1))
def test_get_shape_from_examples_path(self, file_name_to_write,
                                      tfrecord_path_to_match):
  """Checks DeepVariantInput.tensor_shape against the written image/shape.

  Args:
    file_name_to_write: Name handed to test_utils.test_tmpfile for the file
      the single example is written to.
    tfrecord_path_to_match: Name handed to test_utils.test_tmpfile to build
      the input_file_spec of the DeepVariantInput under test.
  """
  expected_shape = [1, 2, 3]
  proto = example_pb2.Example()
  proto.features.feature['image/shape'].int64_list.value.extend(expected_shape)

  written_path = test_utils.test_tmpfile(file_name_to_write)
  tfrecord.write_tfrecords([proto], written_path)

  input_spec = test_utils.test_tmpfile(tfrecord_path_to_match)
  provider = data_providers.DeepVariantInput(
      mode=tf.estimator.ModeKeys.PREDICT,
      name='test_shape',
      input_file_spec=input_spec,
      num_examples=1)
  self.assertEqual(expected_shape, provider.tensor_shape)
def test_reading_sharded_input_with_empty_shards_does_not_crash(self):
  """Runs postprocess_variants on a two-shard input with an empty first shard.

  There is no explicit assertion: the test passes as long as main() returns
  without raising.
  """
  golden_calls = tfrecord.read_tfrecords(
      testdata.GOLDEN_POSTPROCESS_INPUT,
      proto=deepvariant_pb2.CallVariantsOutput)

  shard_zero = test_utils.test_tmpfile(
      'reading_empty_shard.tfrecord-00000-of-00002')
  shard_one = test_utils.test_tmpfile(
      'reading_empty_shard.tfrecord-00001-of-00002')
  # Only the second shard carries records.
  tfrecord.write_tfrecords([], shard_zero)
  tfrecord.write_tfrecords(golden_calls, shard_one)

  FLAGS.infile = test_utils.test_tmpfile('reading_empty_shard.tfrecord@2')
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.outfile = test_utils.test_tmpfile('calls_reading_empty_shard.vcf')

  main_argv = ['postprocess_variants.py']
  postprocess_variants.main(main_argv)
def test_call_variants_with_empty_input(self):
  """Checks that an input with zero records is handled without crashing."""
  empty_path = test_utils.test_tmpfile('empty.tfrecord')
  tfrecord.write_tfrecords([], empty_path)

  # prepare_inputs must not crash when the input holds no records.
  dataset = call_variants.prepare_inputs(empty_path)
  random_guess = modeling.get_model('random_guess')

  infer_ops = list(
      _get_infer_batches(dataset, model=random_guess, batch_size=1))

  with self.test_session() as sess:
    sess.run(tf.compat.v1.local_variables_initializer())
    sess.run(tf.compat.v1.global_variables_initializer())
    # The API specifies that OutOfRangeError is thrown for an exhausted
    # input, so that error is tolerated here.
    try:
      _ = sess.run(infer_ops)
    except tf.errors.OutOfRangeError:
      pass
def make_golden_dataset(compressed_inputs=False,
                        mode=tf.estimator.ModeKeys.EVAL,
                        use_tpu=False):
  """Builds an input fn over the golden training examples.

  Args:
    compressed_inputs: When true, the golden examples are first re-written to
      a '.tfrecord.gz' temp file and the input fn reads from that copy.
    mode: Forwarded to data_providers.get_input_fn_from_filespec.
    use_tpu: Forwarded to data_providers.get_input_fn_from_filespec.

  Returns:
    Whatever data_providers.get_input_fn_from_filespec returns for the
    chosen file.
  """
  source_path = testdata.GOLDEN_TRAINING_EXAMPLES
  if compressed_inputs:
    source_path = test_utils.test_tmpfile('make_golden_dataset.tfrecord.gz')
    golden_records = tfrecord.read_tfrecords(
        testdata.GOLDEN_TRAINING_EXAMPLES)
    tfrecord.write_tfrecords(golden_records, source_path)

  return data_providers.get_input_fn_from_filespec(
      input_file_spec=source_path,
      num_examples=testdata.N_GOLDEN_TRAINING_EXAMPLES,
      name='labeled_golden',
      mode=mode,
      tensor_shape=None,
      use_tpu=use_tpu)
def test_call_variants_with_invalid_format(self, model, bad_format):
  """Checks that call_variants raises ValueError on a bad image/format.

  Args:
    model: The model handed to call_variants.call_variants.
    bad_format: Value written into the example's image/format field.
  """
  # Start from the first record of a valid file.
  tampered = next(tfrecord.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))
  # Replace image/format with an invalid value (anything but 'raw').
  tampered.features.feature['image/format'].bytes_list.value[0] = bad_format

  examples_path = test_utils.test_tmpfile('make_examples_output.tfrecord')
  tfrecord.write_tfrecords([tampered], examples_path)
  output_path = test_utils.test_tmpfile(
      'call_variants_invalid_format.tfrecord')

  call_kwargs = dict(
      examples_filename=examples_path,
      checkpoint_path=_LEAVE_MODEL_UNINITIALIZED,
      model=model,
      output_file=output_path,
      batch_size=1,
      max_batches=1,
      use_tpu=FLAGS.use_tpu)
  with self.assertRaises(ValueError):
    call_variants.call_variants(**call_kwargs)
def _call_end2end_helper(self, examples_path, model, shard_inputs):
  """Runs call_variants end to end and reads back what it wrote.

  Args:
    examples_path: Path of the examples to run inference on.
    model: The model handed to call_variants.call_variants.
    shard_inputs: When true, the examples are re-written to a 3-way sharded
      temp file and inference reads from that instead of examples_path.

  Returns:
    A tuple (call_variants_outputs, examples, batch_size, max_batches).
  """
  examples = list(tfrecord.read_tfrecords(examples_path))

  source_path = examples_path
  if shard_inputs:
    # Build a sharded copy of the golden examples.
    source_path = test_utils.test_tmpfile('sharded@{}'.format(3))
    tfrecord.write_tfrecords(examples, source_path)

  # If we point the test at a headless server, it will often be 2x2, which
  # has 8 replicas. Otherwise a smaller batch size is fine.
  batch_size = 8 if FLAGS.use_tpu else 4

  # The random guess model can run over everything; every other model only
  # runs a single batch of inference.
  max_batches = None if model.name == 'random_guess' else 1

  outfile = test_utils.test_tmpfile('call_variants.tfrecord')
  call_variants.call_variants(
      examples_filename=source_path,
      checkpoint_path=_LEAVE_MODEL_UNINITIALIZED,
      model=model,
      output_file=outfile,
      batch_size=batch_size,
      max_batches=max_batches,
      master='',
      use_tpu=FLAGS.use_tpu,
  )

  written_calls = tfrecord.read_tfrecords(outfile,
                                          deepvariant_pb2.CallVariantsOutput)
  call_variants_outputs = list(written_calls)
  return call_variants_outputs, examples, batch_size, max_batches
def test_reading_sharded_dataset(self, compressed_inputs, use_tpu):
  """Checks that a sharded copy of the golden dataset reads back the same.

  Args:
    compressed_inputs: Forwarded to make_golden_dataset.
    use_tpu: Forwarded to make_golden_dataset.
  """
  golden = make_golden_dataset(compressed_inputs, use_tpu=use_tpu)

  shard_count = 3
  sharded_spec = test_utils.test_tmpfile('sharded@{}'.format(shard_count))
  golden_records = tfrecord.read_tfrecords(golden.input_file_spec)
  tfrecord.write_tfrecords(golden_records, sharded_spec)

  config_path = _test_dataset_config(
      'test_sharded.pbtxt',
      name='sharded_test',
      tfrecord_path=sharded_spec,
      num_examples=golden.num_examples)
  sharded_input_fn = data_providers.get_input_fn_from_dataset(
      config_path, mode=tf.estimator.ModeKeys.EVAL)

  self.assertTfDataSetExamplesMatchExpected(
      sharded_input_fn,
      golden,
      # workaround_list_files is needed because wildcards, and so sharded
      # files, are nondeterministically ordered (for now).
      workaround_list_files=True,
  )
def write_test_protos(self, filename):
  """Writes ten ContigInfo protos, named '0' through '9', to a temp file.

  Args:
    filename: Name handed to test_utils.test_tmpfile for the output file.

  Returns:
    A tuple (protos, path): the list of protos written and the file path.
  """
  protos = []
  for index in range(10):
    protos.append(reference_pb2.ContigInfo(name=str(index)))

  path = test_utils.test_tmpfile(filename)
  tfrecord.write_tfrecords(protos, path)
  return protos, path
def test_call_end2end_zero_record_file_for_inception_v3(self):
  """Checks that a zero-record input makes InceptionV3 emit zero records."""
  file_name = 'zero_record_file'

  # Write an input file that holds no records.
  empty_path = test_utils.test_tmpfile(file_name)
  tfrecord.write_tfrecords([], empty_path)

  expected_records = 0
  self.assertCallVariantsEmitsNRecordsForInceptionV3(
      test_utils.test_tmpfile(file_name), expected_records)
def call_variants(examples_filename,
                  checkpoint_path,
                  model,
                  output_file,
                  execution_hardware='auto',
                  batch_size=16,
                  max_batches=None,
                  use_tpu=False,
                  master=''):
  """Main driver of call_variants.

  Reads one example to validate the input, checks the requested execution
  hardware, then streams predictions for the examples and writes one call per
  prediction to output_file.

  Args:
    examples_filename: Path spec of the input examples. If no record can be
      read from it, an empty output file is written and the function returns.
    checkpoint_path: Model checkpoint to load, or None (the unit tests use
      None). When given, the channel count of its first InceptionV3 layer
      must match the examples.
    model: Object providing make_estimator() and session_predict_hooks().
    output_file: Path the calls are written to.
    execution_hardware: Must be one of _ALLOW_EXECUTION_HARDWARE. 'cpu'
      sets the GPU and TPU device counts to 0; 'accelerator' requires at
      least one non-CPU device.
    batch_size: Batch size for the estimator; also used to count batches.
    max_batches: Stop once the batch counter exceeds this value; None means
      no limit.
    use_tpu: Forwarded to prepare_inputs, make_estimator and
      write_variant_call.
    master: Forwarded to model.make_estimator.

  Raises:
    ValueError: If the examples' image/format is not 'raw', if the checkpoint
      and example channel counts differ, or if execution_hardware is not an
      allowed value.
    ExecutionHardwareError: If execution_hardware is 'accelerator' but only
      CPU devices are visible.
  """
  if FLAGS.kmp_blocktime:
    os.environ['KMP_BLOCKTIME'] = FLAGS.kmp_blocktime
    logging.vlog(3, 'Set KMP_BLOCKTIME to %s', os.environ['KMP_BLOCKTIME'])

  # Read a single TFExample to make sure we're not loading an older version.
  first_example = tf_utils.get_one_example_from_examples_path(examples_filename)
  if first_example is None:
    logging.warning(
        'Unable to read any records from %s. Output will contain '
        'zero records.', examples_filename)
    tfrecord.write_tfrecords([], output_file)
    return

  example_format = tf_utils.example_image_format(first_example)
  example_shape = tf_utils.example_image_shape(first_example)
  if example_format != six.b('raw'):
    raise ValueError('The TF examples in {} has image/format \'{}\' '
                     '(expected \'raw\') which means you might need to rerun '
                     'make_examples to generate the examples again.'.format(
                         examples_filename, example_format))
  logging.info('Shape of input examples: %s', str(example_shape))

  if checkpoint_path is not None:
    reader = tf.compat.v1.train.NewCheckpointReader(checkpoint_path)
    shape_map_for_layers = reader.get_variable_to_shape_map()
    first_layer = 'InceptionV3/Conv2d_1a_3x3/weights'
    # For a shape map of [3, 3, 6, 32] for the Conv2d_1a_3x3 layer, the 6
    # is the number of channels.
    num_channels_in_checkpoint_model = shape_map_for_layers[first_layer][2]
    if num_channels_in_checkpoint_model != example_shape[2]:
      raise ValueError('The number of channels in examples and checkpoint '
                       'should match, but the checkpoint has {} channels while '
                       'the examples have {}.'.format(
                           num_channels_in_checkpoint_model, example_shape[2]))
    # The model checkpoint includes information on the number of channels but
    # unfortunately not the width or height.
    # NOTE(review): the two warnings below are assumed to belong to the
    # checkpoint branch -- confirm against the original indentation.
    if example_shape[0] not in [100, 300]:
      logging.warning('The height of the input image is not 100 (standard in '
                      'DeepVariant) or 300 (standard in DeepTrio). '
                      'Please double-check that the model is trained with the '
                      'same parameters and version of DeepVariant as you '
                      'generated the examples with. An error will not appear '
                      'when these are mismatched because of how InceptionV3 '
                      'works. Note that if you set --pileup_image_height in '
                      'DeepVariant, then you must use a model trained with '
                      'that same parameter.')
    if example_shape[1] != 221:
      logging.warning('The width of the input image is not 221 (standard in '
                      'DeepVariant). '
                      'Please double-check that the model is trained with the '
                      'same parameters and version of DeepVariant as you '
                      'generated the examples with. An error will not appear '
                      'when these are mismatched because of how InceptionV3 '
                      'works. Note that if you set --pileup_image_width in '
                      'DeepVariant, then you must use a model trained with '
                      'that same parameter.')

  # Check accelerator status.
  if execution_hardware not in _ALLOW_EXECUTION_HARDWARE:
    raise ValueError(
        'Unexpected execution_hardware={} value. Allowed values are {}'.format(
            execution_hardware, ','.join(_ALLOW_EXECUTION_HARDWARE)))
  init_op = tf.group(tf.compat.v1.global_variables_initializer(),
                     tf.compat.v1.local_variables_initializer())

  config = tf.compat.v1.ConfigProto()
  if FLAGS.config_string is not None:
    text_format.Parse(FLAGS.config_string, config)
  if execution_hardware == 'cpu':
    # Don't overwrite entire dictionary.
    config.device_count['GPU'] = 0
    config.device_count['TPU'] = 0

  # Perform sanity check.
  with tf.compat.v1.Session(config=config) as sess:
    sess.run(init_op)
    if execution_hardware == 'accelerator':
      if not any(dev.device_type != 'CPU' for dev in sess.list_devices()):
        raise ExecutionHardwareError(
            'execution_hardware is set to accelerator, but no accelerator '
            'was found')
      # redacted
      # sess.list_devices here doesn't return the correct answer. That can only
      # work later, after the device (on the other VM) has been initialized,
      # which is generally not yet.

  # Prepare input stream and estimator.
  tf_dataset = prepare_inputs(source_path=examples_filename, use_tpu=use_tpu)
  if FLAGS.use_openvino:
    ie_estimator = OpenVINOEstimator(
        checkpoint_path, input_fn=tf_dataset, model=model)
    predictions = iter(ie_estimator)
  else:
    estimator = model.make_estimator(
        batch_size=batch_size,
        master=master,
        use_tpu=use_tpu,
        session_config=config,
    )

    # Instantiate the prediction "stream", and select the EMA values from
    # the model.
    if checkpoint_path is None:
      # Unit tests use this branch.
      predict_hooks = []
    else:
      predict_hooks = [
          h(checkpoint_path) for h in model.session_predict_hooks()
      ]
    predictions = iter(
        estimator.predict(
            input_fn=tf_dataset,
            checkpoint_path=checkpoint_path,
            hooks=predict_hooks))

  # Consume predictions one at a time and write them to output_file.
  logging.info('Writing calls to %s', output_file)
  writer = tfrecord.Writer(output_file)
  with writer:
    start_time = time.time()
    n_examples, n_batches = 0, 0
    # Defined before the loop so the summary log below stays valid even when
    # the prediction stream yields nothing.
    duration = 0.0
    while max_batches is None or n_batches <= max_batches:
      try:
        prediction = next(predictions)
      except (StopIteration, tf.errors.OutOfRangeError):
        break
      write_variant_call(writer, prediction, use_tpu)
      n_examples += 1
      n_batches = n_examples // batch_size + 1
      duration = time.time() - start_time

      if not FLAGS.use_openvino:
        logging.log_every_n(
            logging.INFO,
            ('Processed %s examples in %s batches [%.3f sec per 100]'),
            _LOG_EVERY_N, n_examples, n_batches, (100 * duration) / n_examples)
    # One last log to capture the extra examples.
    if not FLAGS.use_openvino:
      # max(..., 1) avoids a ZeroDivisionError when no example was processed.
      logging.info('Processed %s examples in %s batches [%.3f sec per 100]',
                   n_examples, n_batches,
                   (100 * duration) / max(n_examples, 1))

    logging.info('Done calling variants from a total of %d examples.',
                 n_examples)
def call_variants(examples_filename,
                  checkpoint_path,
                  model,
                  output_file,
                  execution_hardware='auto',
                  batch_size=16,
                  max_batches=None,
                  use_tpu=False,
                  master=''):
  """Main driver of call_variants.

  Validates the image/format of the input examples and the requested
  execution hardware, then streams predictions from the model's estimator
  and writes one call per prediction to output_file.

  Args:
    examples_filename: Path spec of the input examples. If no record can be
      read from it, an empty output file is written and the function returns.
    checkpoint_path: Model checkpoint to load, or None (the unit tests use
      None, in which case no predict hooks are installed).
    model: Object providing make_estimator() and session_predict_hooks().
    output_file: Path the calls are written to.
    execution_hardware: Must be one of _ALLOW_EXECUTION_HARDWARE. 'cpu'
      sets the GPU and TPU device counts to 0; 'accelerator' requires at
      least one non-CPU device.
    batch_size: Batch size for the estimator; also used to count batches.
    max_batches: Stop once the batch counter exceeds this value; None means
      no limit.
    use_tpu: Forwarded to prepare_inputs, make_estimator and
      write_variant_call.
    master: Forwarded to model.make_estimator.

  Raises:
    ValueError: If the examples' image/format is not 'raw', or if
      execution_hardware is not an allowed value.
    ExecutionHardwareError: If execution_hardware is 'accelerator' but only
      CPU devices are visible.
  """
  if FLAGS.kmp_blocktime:
    os.environ['KMP_BLOCKTIME'] = FLAGS.kmp_blocktime
    logging.info('Set KMP_BLOCKTIME to %s', os.environ['KMP_BLOCKTIME'])

  # Peek at the format of one TFExample so that examples produced by an older
  # version are caught early.
  example_format = tf_utils.get_format_from_examples_path(examples_filename)
  if example_format is None:
    logging.warning(
        'Unable to read any records from %s. Output will contain '
        'zero records.', examples_filename)
    tfrecord.write_tfrecords([], output_file)
    return
  if example_format != six.b('raw'):
    format_error = (
        'The TF examples in {} has image/format \'{}\' '
        '(expected \'raw\') which means you might need to rerun '
        'make_examples to generate the examples again.')
    raise ValueError(format_error.format(examples_filename, example_format))

  # Check accelerator status.
  if execution_hardware not in _ALLOW_EXECUTION_HARDWARE:
    allowed_values = ','.join(_ALLOW_EXECUTION_HARDWARE)
    raise ValueError(
        'Unexpected execution_hardware={} value. Allowed values are {}'.format(
            execution_hardware, allowed_values))
  init_op = tf.group(tf.compat.v1.global_variables_initializer(),
                     tf.compat.v1.local_variables_initializer())

  config = tf.compat.v1.ConfigProto()
  if FLAGS.config_string is not None:
    text_format.Parse(FLAGS.config_string, config)
  if execution_hardware == 'cpu':
    # Set the individual entries rather than replacing the whole dictionary.
    for device_kind in ('GPU', 'TPU'):
      config.device_count[device_kind] = 0

  # Perform sanity check.
  with tf.compat.v1.Session(config=config) as sess:
    sess.run(init_op)
    if execution_hardware == 'accelerator':
      if all(dev.device_type == 'CPU' for dev in sess.list_devices()):
        raise ExecutionHardwareError(
            'execution_hardware is set to accelerator, but no accelerator '
            'was found')
      # redacted
      # sess.list_devices here doesn't return the correct answer. That can only
      # work later, after the device (on the other VM) has been initialized,
      # which is generally not yet.

  # Prepare input stream and estimator.
  tf_dataset = prepare_inputs(source_path=examples_filename, use_tpu=use_tpu)
  estimator = model.make_estimator(
      batch_size=batch_size,
      master=master,
      use_tpu=use_tpu,
      session_config=config,
  )

  # Instantiate the prediction "stream", and select the EMA values from
  # the model. Unit tests pass no checkpoint and therefore get no hooks.
  predict_hooks = []
  if checkpoint_path is not None:
    predict_hooks = [
        make_hook(checkpoint_path)
        for make_hook in model.session_predict_hooks()
    ]
  prediction_stream = estimator.predict(
      input_fn=tf_dataset,
      checkpoint_path=checkpoint_path,
      hooks=predict_hooks)
  predictions = iter(prediction_stream)

  # Consume predictions one at a time and write them to output_file.
  logging.info('Writing calls to %s', output_file)
  writer = tfrecord.Writer(output_file)
  with writer:
    start_time = time.time()
    n_examples = 0
    n_batches = 0
    while max_batches is None or n_batches <= max_batches:
      try:
        prediction = next(predictions)
      except (StopIteration, tf.errors.OutOfRangeError):
        # The prediction stream is exhausted.
        break
      write_variant_call(writer, prediction, use_tpu)
      n_examples += 1
      n_batches = n_examples // batch_size + 1
      duration = time.time() - start_time

      sec_per_100 = (100 * duration) / n_examples
      logging.log_every_n(
          logging.INFO,
          ('Processed %s examples in %s batches [%.3f sec per 100]'),
          _LOG_EVERY_N, n_examples, n_batches, sec_per_100)

    logging.info('Done evaluating variants')