def testGetNoneShapeFromEmptyExamplesPath(self, file_name_to_write,
                                          tfrecord_path_to_match):
   """Checks that a record-less TFRecord file yields no shape.

   Args:
     file_name_to_write: File name, resolved via test_utils.test_tmpfile, that
       is written with zero records.
     tfrecord_path_to_match: Path spec, resolved via test_utils.test_tmpfile,
       that is handed to tf_utils.get_shape_from_examples_path.
   """
   empty_path = test_utils.test_tmpfile(file_name_to_write)
   tfrecord.write_tfrecords([], empty_path)
   lookup_spec = test_utils.test_tmpfile(tfrecord_path_to_match)
   found_shape = tf_utils.get_shape_from_examples_path(lookup_spec)
   self.assertIsNone(found_shape)
예제 #2
0
    def test_prepare_inputs(self, filename_to_write, file_string_input):
        """Checks that prepare_inputs yields every variant of self.examples.

        Args:
          filename_to_write: File name, resolved via test_utils.test_tmpfile,
            that self.examples is written to.
          file_string_input: One path or a comma-separated list of paths; each
            entry is resolved via test_utils.test_tmpfile before being passed
            to call_variants.prepare_inputs.
        """
        written_path = test_utils.test_tmpfile(filename_to_write)
        tfrecord.write_tfrecords(self.examples, written_path)

        # Resolve each comma-separated entry on its own, then glue the
        # resolved paths back into a single comma-separated string.
        resolved_paths = []
        for entry in file_string_input.split(','):
            resolved_paths.append(test_utils.test_tmpfile(entry))
        file_string_input = ','.join(resolved_paths)

        with self.test_session() as sess:
            sess.run(tf.compat.v1.local_variables_initializer())
            sess.run(tf.compat.v1.global_variables_initializer())

            dataset = call_variants.prepare_inputs(file_string_input)
            batches = _get_infer_batches(
                dataset, model=self.model, batch_size=1)
            _, variants, _ = batches

            # Drain the input; OutOfRangeError marks the end of the data.
            collected = []
            try:
                while True:
                    collected.extend(sess.run(variants))
            except tf.errors.OutOfRangeError:
                pass

            decoded = variant_utils.decode_variants(collected)
            six.assertCountEqual(self, self.variants, decoded)
 def testGetShapeFromExamplesPath(self, file_name_to_write,
                                  tfrecord_path_to_match):
   """Checks that the shape stored in a written example is read back.

   Args:
     file_name_to_write: File name, resolved via test_utils.test_tmpfile, that
       a single example carrying an 'image/shape' feature is written to.
     tfrecord_path_to_match: Path spec, resolved via test_utils.test_tmpfile,
       that is handed to tf_utils.get_shape_from_examples_path.
   """
   example = example_pb2.Example()
   valid_shape = [1, 2, 3]
   example.features.feature['image/shape'].int64_list.value.extend(valid_shape)
   output_file = test_utils.test_tmpfile(file_name_to_write)
   tfrecord.write_tfrecords([example], output_file)
   observed_shape = tf_utils.get_shape_from_examples_path(
       test_utils.test_tmpfile(tfrecord_path_to_match))
   # Without these assertions the test only proved that the call does not
   # raise; it never compared the result with the shape that was written.
   self.assertIsNotNone(observed_shape)
   # list() keeps the comparison independent of the concrete sequence type
   # returned (e.g. a protobuf repeated field versus a plain list).
   self.assertEqual(valid_shape, list(observed_shape))
예제 #4
0
 def test_call_end2end_with_empty_shards(self):
     """Runs call_variants on a sharded input that has more shards than records."""
     shard_count = 15
     # At most 10 examples are read, so spreading them over 15 shards leaves
     # several shards without any record.
     few_examples = list(
         tfrecord.read_tfrecords(
             testdata.GOLDEN_CALLING_EXAMPLES, max_records=10))
     sharded_spec = test_utils.test_tmpfile('sharded@{}'.format(shard_count))
     tfrecord.write_tfrecords(few_examples, sharded_spec)
     expected_count = len(few_examples)
     self.assertCallVariantsEmitsNRecordsForRandomGuess(
         sharded_spec, expected_count)
예제 #5
0
def make_golden_dataset(compressed_inputs=False):
  """Returns the path of the golden postprocess input.

  Args:
    compressed_inputs: If true, the golden records are first copied to a
      '.tfrecord.gz' file obtained from test_utils.test_tmpfile and that path
      is returned; otherwise the golden path itself is returned.

  Returns:
    The path to read the golden CallVariantsOutput records from.
  """
  if not compressed_inputs:
    return testdata.GOLDEN_POSTPROCESS_INPUT

  gz_path = test_utils.test_tmpfile(
      'golden.postprocess_single_site_input.tfrecord.gz')
  golden_records = tfrecord.read_tfrecords(
      testdata.GOLDEN_POSTPROCESS_INPUT,
      proto=deepvariant_pb2.CallVariantsOutput)
  tfrecord.write_tfrecords(golden_records, gz_path)
  return gz_path
예제 #6
0
 def test_call_end2end_empty_first_shard(self):
   """Runs call_variants on a two-shard input whose first shard has no records."""
   # Read at most 10 golden examples; they all go into the second shard.
   few_examples = list(
       tfrecord.read_tfrecords(
           testdata.GOLDEN_CALLING_EXAMPLES, max_records=10))
   shard_contents = (
       ('empty_1st_shard-00000-of-00002', []),
       ('empty_1st_shard-00001-of-00002', few_examples),
   )
   for shard_name, shard_records in shard_contents:
     shard_path = test_utils.test_tmpfile(shard_name)
     tfrecord.write_tfrecords(shard_records, shard_path)
   sharded_spec = test_utils.test_tmpfile('empty_1st_shard@2')
   self.assertCallVariantsEmitsNRecordsForRandomGuess(
       sharded_spec, len(few_examples))
예제 #7
0
  def test_reading_empty_input_should_raise_error(self):
    """Checks that postprocess_variants rejects an input with no records.

    Writes two record-less shards, points FLAGS.infile at the sharded spec and
    expects postprocess_variants.main to raise a ValueError whose message
    matches 'Cannot find any records in'.
    """
    empty_shard_one = test_utils.test_tmpfile(
        'no_records.tfrecord-00000-of-00002')
    empty_shard_two = test_utils.test_tmpfile(
        'no_records.tfrecord-00001-of-00002')
    tfrecord.write_tfrecords([], empty_shard_one)
    tfrecord.write_tfrecords([], empty_shard_two)
    FLAGS.infile = test_utils.test_tmpfile('no_records.tfrecord@2')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.outfile = test_utils.test_tmpfile('no_records.vcf')

    # assertRaisesRegexp is a deprecated alias that recent Python releases no
    # longer provide; assertRaisesRegex is the supported name.
    with self.assertRaisesRegex(ValueError, 'Cannot find any records in'):
      postprocess_variants.main(['postprocess_variants.py'])
예제 #8
0
 def test_call_variants_with_no_shape(self, model):
   """Checks that an example lacking 'image/shape' is rejected with ValueError.

   Args:
     model: Model passed through to _get_infer_batches.
   """
   # Start from the first record of a valid file and strip its image/shape.
   golden_records = tfrecord.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES)
   shapeless_example = next(golden_records)
   del shapeless_example.features.feature['image/shape']

   written_path = test_utils.test_tmpfile('make_examples_out_noshape.tfrecord')
   tfrecord.write_tfrecords([shapeless_example], written_path)

   expected_message = (
       'Invalid image/shape: we expect to find an image/shape '
       'field with length 3.')
   with six.assertRaisesRegex(self, ValueError, expected_message):
     dataset = call_variants.prepare_inputs(written_path)
     _ = list(_get_infer_batches(dataset, model=model, batch_size=1))
예제 #9
0
 def test_get_shape_from_examples_path(self, file_name_to_write,
                                       tfrecord_path_to_match):
   """Checks that DeepVariantInput exposes the shape stored in the examples.

   Args:
     file_name_to_write: File name, resolved via test_utils.test_tmpfile, that
       a single example carrying an 'image/shape' feature is written to.
     tfrecord_path_to_match: Path spec, resolved via test_utils.test_tmpfile,
       used as the input_file_spec of the DeepVariantInput under test.
   """
   expected_shape = [1, 2, 3]
   shaped_example = example_pb2.Example()
   shape_feature = shaped_example.features.feature['image/shape']
   shape_feature.int64_list.value.extend(expected_shape)

   written_path = test_utils.test_tmpfile(file_name_to_write)
   tfrecord.write_tfrecords([shaped_example], written_path)

   provider = data_providers.DeepVariantInput(
       mode=tf.estimator.ModeKeys.PREDICT,
       name='test_shape',
       input_file_spec=test_utils.test_tmpfile(tfrecord_path_to_match),
       num_examples=1)
   self.assertEqual(expected_shape, provider.tensor_shape)
예제 #10
0
  def test_reading_sharded_input_with_empty_shards_does_not_crash(self):
    """Runs postprocess_variants on two shards, the first holding no records.

    The test makes no assertion of its own; it passes as long as
    postprocess_variants.main returns without raising.
    """
    golden_calls = tfrecord.read_tfrecords(
        testdata.GOLDEN_POSTPROCESS_INPUT,
        proto=deepvariant_pb2.CallVariantsOutput)
    first_shard = test_utils.test_tmpfile(
        'reading_empty_shard.tfrecord-00000-of-00002')
    second_shard = test_utils.test_tmpfile(
        'reading_empty_shard.tfrecord-00001-of-00002')
    # First shard gets nothing; second shard gets all the golden records.
    tfrecord.write_tfrecords([], first_shard)
    tfrecord.write_tfrecords(golden_calls, second_shard)

    FLAGS.infile = test_utils.test_tmpfile('reading_empty_shard.tfrecord@2')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.outfile = test_utils.test_tmpfile('calls_reading_empty_shard.vcf')

    program_argv = ['postprocess_variants.py']
    postprocess_variants.main(program_argv)
예제 #11
0
    def test_call_variants_with_empty_input(self):
        """Checks that a record-less input can be prepared and evaluated.

        Running the batches may raise tf.errors.OutOfRangeError; that outcome
        is accepted and silently ignored.
        """
        empty_path = test_utils.test_tmpfile('empty.tfrecord')
        tfrecord.write_tfrecords([], empty_path)

        # prepare_inputs must not crash when the input holds no records.
        dataset = call_variants.prepare_inputs(empty_path)
        random_guess_model = modeling.get_model('random_guess')
        batch_tensors = list(
            _get_infer_batches(
                dataset, model=random_guess_model, batch_size=1))

        with self.test_session() as sess:
            sess.run(tf.compat.v1.local_variables_initializer())
            sess.run(tf.compat.v1.global_variables_initializer())
            # The API specifies that OutOfRangeError is thrown in this case.
            try:
                _ = sess.run(batch_tensors)
            except tf.errors.OutOfRangeError:
                pass
예제 #12
0
def make_golden_dataset(compressed_inputs=False,
                        mode=tf.estimator.ModeKeys.EVAL,
                        use_tpu=False):
  """Builds an input fn over the golden training examples.

  Args:
    compressed_inputs: If true, the golden examples are first copied to a
      '.tfrecord.gz' file obtained from test_utils.test_tmpfile and the input
      fn reads that copy; otherwise it reads the golden file directly.
    mode: Forwarded to data_providers.get_input_fn_from_filespec.
    use_tpu: Forwarded to data_providers.get_input_fn_from_filespec.

  Returns:
    Whatever data_providers.get_input_fn_from_filespec returns for the chosen
    file spec, named 'labeled_golden'.
  """
  input_spec = testdata.GOLDEN_TRAINING_EXAMPLES
  if compressed_inputs:
    input_spec = test_utils.test_tmpfile('make_golden_dataset.tfrecord.gz')
    golden_records = tfrecord.read_tfrecords(
        testdata.GOLDEN_TRAINING_EXAMPLES)
    tfrecord.write_tfrecords(golden_records, input_spec)

  return data_providers.get_input_fn_from_filespec(
      input_file_spec=input_spec,
      num_examples=testdata.N_GOLDEN_TRAINING_EXAMPLES,
      name='labeled_golden',
      mode=mode,
      tensor_shape=None,
      use_tpu=use_tpu)
예제 #13
0
  def test_call_variants_with_invalid_format(self, model, bad_format):
    """Checks that call_variants raises ValueError for a bad image/format.

    Args:
      model: Model passed through to call_variants.call_variants.
      bad_format: Value written into the example's 'image/format' feature.
    """
    # Take the first record of a valid file and corrupt its image/format
    # field (anything but 'raw' is invalid).
    golden_records = tfrecord.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES)
    corrupted_example = next(golden_records)
    format_feature = corrupted_example.features.feature['image/format']
    format_feature.bytes_list.value[0] = bad_format

    examples_path = test_utils.test_tmpfile('make_examples_output.tfrecord')
    tfrecord.write_tfrecords([corrupted_example], examples_path)
    calls_path = test_utils.test_tmpfile(
        'call_variants_invalid_format.tfrecord')

    with self.assertRaises(ValueError):
      call_variants.call_variants(
          examples_filename=examples_path,
          checkpoint_path=_LEAVE_MODEL_UNINITIALIZED,
          model=model,
          output_file=calls_path,
          batch_size=1,
          max_batches=1,
          use_tpu=FLAGS.use_tpu)
예제 #14
0
    def _call_end2end_helper(self, examples_path, model, shard_inputs):
        """Runs call_variants end to end and reads back what it wrote.

        Args:
          examples_path: Path of the examples to read.
          model: Model passed to call_variants; its `name` decides how many
            batches are run.
          shard_inputs: If true, the examples are rewritten to a 3-way
            sharded temp file and call_variants reads that copy.

        Returns:
          A tuple (call_variants_outputs, examples, batch_size, max_batches).
        """
        examples = list(tfrecord.read_tfrecords(examples_path))

        # By default read the examples in place; optionally go through a
        # sharded copy of them instead.
        input_spec = examples_path
        if shard_inputs:
            input_spec = test_utils.test_tmpfile('sharded@{}'.format(3))
            tfrecord.write_tfrecords(examples, input_spec)

        # If we point the test at a headless server, it will often be 2x2,
        # which has 8 replicas.  Otherwise a smaller batch size is fine.
        batch_size = 8 if FLAGS.use_tpu else 4

        # The random guess model can run everything; every other model is
        # limited to a single batch of inference.
        max_batches = None if model.name == 'random_guess' else 1

        calls_path = test_utils.test_tmpfile('call_variants.tfrecord')
        call_variants.call_variants(
            examples_filename=input_spec,
            checkpoint_path=_LEAVE_MODEL_UNINITIALIZED,
            model=model,
            output_file=calls_path,
            batch_size=batch_size,
            max_batches=max_batches,
            master='',
            use_tpu=FLAGS.use_tpu,
        )

        written_calls = tfrecord.read_tfrecords(
            calls_path, deepvariant_pb2.CallVariantsOutput)
        call_variants_outputs = list(written_calls)

        return call_variants_outputs, examples, batch_size, max_batches
예제 #15
0
  def test_reading_sharded_dataset(self, compressed_inputs, use_tpu):
    """Checks that a sharded copy of the golden dataset reads back the same.

    Args:
      compressed_inputs: Forwarded to make_golden_dataset.
      use_tpu: Forwarded to make_golden_dataset.
    """
    golden = make_golden_dataset(compressed_inputs, use_tpu=use_tpu)

    shard_count = 3
    sharded_spec = test_utils.test_tmpfile('sharded@{}'.format(shard_count))
    golden_records = tfrecord.read_tfrecords(golden.input_file_spec)
    tfrecord.write_tfrecords(golden_records, sharded_spec)

    config_path = _test_dataset_config(
        'test_sharded.pbtxt',
        name='sharded_test',
        tfrecord_path=sharded_spec,
        num_examples=golden.num_examples)
    sharded_input_fn = data_providers.get_input_fn_from_dataset(
        config_path, mode=tf.estimator.ModeKeys.EVAL)

    # workaround_list_files is needed because wildcards, and so sharded
    # files, are nondeterministically ordered (for now).
    self.assertTfDataSetExamplesMatchExpected(
        sharded_input_fn,
        golden,
        workaround_list_files=True,
    )
예제 #16
0
 def write_test_protos(self, filename):
   """Writes ten ContigInfo protos named '0'..'9' to a temp file.

   Args:
     filename: File name, resolved via test_utils.test_tmpfile.

   Returns:
     A tuple (protos, path) with the protos written and where they went.
   """
   contigs = []
   for index in range(10):
     contigs.append(reference_pb2.ContigInfo(name=str(index)))
   target_path = test_utils.test_tmpfile(filename)
   tfrecord.write_tfrecords(contigs, target_path)
   return contigs, target_path
예제 #17
0
 def test_call_end2end_zero_record_file_for_inception_v3(self):
     """Checks that a record-less input makes call_variants emit zero records."""
     written_path = test_utils.test_tmpfile('zero_record_file')
     tfrecord.write_tfrecords([], written_path)
     # The input path is resolved a second time from the same file name.
     input_path = test_utils.test_tmpfile('zero_record_file')
     expected_records = 0
     self.assertCallVariantsEmitsNRecordsForInceptionV3(
         input_path, expected_records)
예제 #18
0
def call_variants(examples_filename,
                  checkpoint_path,
                  model,
                  output_file,
                  execution_hardware='auto',
                  batch_size=16,
                  max_batches=None,
                  use_tpu=False,
                  master=''):
  """Main driver of call_variants.

  Inspects the first example found in `examples_filename`, optionally checks
  it against the checkpoint, builds the prediction stream for `model` and
  writes one variant call per prediction to `output_file`.

  Args:
    examples_filename: Path spec of the examples to run inference on.
    checkpoint_path: Checkpoint to read, or None. When None the checkpoint
      consistency checks are skipped and no predict hooks are installed (unit
      tests use this).
    model: Model object; must provide `make_estimator` and
      `session_predict_hooks`.
    output_file: Path the variant calls are written to.
    execution_hardware: Must be one of _ALLOW_EXECUTION_HARDWARE. 'cpu' sets
      the GPU and TPU device counts of the session config to 0; 'accelerator'
      requires the sanity-check session to list a non-CPU device.
    batch_size: Forwarded to `model.make_estimator`; also used to convert the
      number of written examples into a batch count.
    max_batches: If not None, writing stops once the batch count derived from
      the written examples exceeds this value.
    use_tpu: Forwarded to prepare_inputs, `model.make_estimator` and
      write_variant_call.
    master: Forwarded to `model.make_estimator`.

  Raises:
    ValueError: If the first example's image/format is not 'raw', if the
      checkpoint and the examples disagree on the number of channels, or if
      `execution_hardware` is not an allowed value.
    ExecutionHardwareError: If `execution_hardware` is 'accelerator' but only
      CPU devices are listed.
  """
  if FLAGS.kmp_blocktime:
    os.environ['KMP_BLOCKTIME'] = FLAGS.kmp_blocktime
    logging.vlog(3,
                 'Set KMP_BLOCKTIME to {}'.format(os.environ['KMP_BLOCKTIME']))

  # Read a single TFExample to make sure we're not loading an older version.
  first_example = tf_utils.get_one_example_from_examples_path(examples_filename)
  if first_example is None:
    # Nothing to predict on: still produce an (empty) output file.
    logging.warning(
        'Unable to read any records from %s. Output will contain '
        'zero records.', examples_filename)
    tfrecord.write_tfrecords([], output_file)
    return

  example_format = tf_utils.example_image_format(first_example)
  example_shape = tf_utils.example_image_shape(first_example)
  if example_format != six.b('raw'):
    raise ValueError('The TF examples in {} has image/format \'{}\' '
                     '(expected \'raw\') which means you might need to rerun '
                     'make_examples to generate the examples again.'.format(
                         examples_filename, example_format))
  logging.info('Shape of input examples: %s', str(example_shape))

  if checkpoint_path is not None:
    reader = tf.compat.v1.train.NewCheckpointReader(checkpoint_path)
    shape_map_for_layers = reader.get_variable_to_shape_map()
    # NOTE(review): this lookup assumes the checkpoint contains the
    # InceptionV3 first-layer variable; confirm for other architectures.
    first_layer = 'InceptionV3/Conv2d_1a_3x3/weights'
    # For a shape map of [3, 3, 6, 32] for the Conv2d_1a_3x3 layer, the 6
    # is the number of channels.
    num_channels_in_checkpoint_model = shape_map_for_layers[first_layer][2]
    if num_channels_in_checkpoint_model != example_shape[2]:
      raise ValueError('The number of channels in examples and checkpoint '
                       'should match, but the checkpoint has {} channels while '
                       'the examples have {}.'.format(
                           num_channels_in_checkpoint_model, example_shape[2]))
    # The model checkpoint includes information on the number of channels but
    # unfortunately not the width or height.
    if example_shape[0] not in [100, 300]:
      logging.warning('The height of the input image is not 100 (standard in '
                      'DeepVariant) or 300 (standard in DeepTrio). '
                      'Please double-check that the model is trained with the '
                      'same parameters and version of DeepVariant as you '
                      'generated the examples with. An error will not appear '
                      'when these are mismatched because of how InceptionV3 '
                      'works. Note that if you set --pileup_image_height in '
                      'DeepVariant, then you must use a model trained with '
                      'that same parameter.')

    if example_shape[1] != 221:
      logging.warning('The width of the input image is not 221 (standard in '
                      'DeepVariant). '
                      'Please double-check that the model is trained with the '
                      'same parameters and version of DeepVariant as you '
                      'generated the examples with. An error will not appear '
                      'when these are mismatched because of how InceptionV3 '
                      'works. Note that if you set --pileup_image_width in '
                      'DeepVariant, then you must use a model trained with '
                      'that same parameter.')

  # Check accelerator status.
  if execution_hardware not in _ALLOW_EXECUTION_HARDWARE:
    raise ValueError(
        'Unexpected execution_hardware={} value. Allowed values are {}'.format(
            execution_hardware, ','.join(_ALLOW_EXECUTION_HARDWARE)))
  init_op = tf.group(tf.compat.v1.global_variables_initializer(),
                     tf.compat.v1.local_variables_initializer())

  config = tf.compat.v1.ConfigProto()
  if FLAGS.config_string is not None:
    text_format.Parse(FLAGS.config_string, config)
  if execution_hardware == 'cpu':
    # Don't overwrite entire dictionary.
    config.device_count['GPU'] = 0
    config.device_count['TPU'] = 0

  # Perform sanity check.
  with tf.compat.v1.Session(config=config) as sess:
    sess.run(init_op)
    if execution_hardware == 'accelerator':
      if not any(dev.device_type != 'CPU' for dev in sess.list_devices()):
        raise ExecutionHardwareError(
            'execution_hardware is set to accelerator, but no accelerator '
            'was found')
    # redacted
    # sess.list_devices here doesn't return the correct answer. That can only
    # work later, after the device (on the other VM) has been initialized,
    # which is generally not yet.

  # Prepare input stream and estimator.
  tf_dataset = prepare_inputs(source_path=examples_filename, use_tpu=use_tpu)
  if FLAGS.use_openvino:
    ie_estimator = OpenVINOEstimator(
        checkpoint_path, input_fn=tf_dataset, model=model)
    predictions = iter(ie_estimator)
  else:
    estimator = model.make_estimator(
        batch_size=batch_size,
        master=master,
        use_tpu=use_tpu,
        session_config=config,
    )

    # Instantiate the prediction "stream", and select the EMA values from
    # the model.
    if checkpoint_path is None:
      # Unit tests use this branch.
      predict_hooks = []
    else:
      predict_hooks = [
          h(checkpoint_path) for h in model.session_predict_hooks()
      ]
    predictions = iter(
        estimator.predict(
            input_fn=tf_dataset,
            checkpoint_path=checkpoint_path,
            hooks=predict_hooks))

  # Consume predictions one at a time and write them to output_file.
  logging.info('Writing calls to %s', output_file)
  writer = tfrecord.Writer(output_file)
  with writer:
    start_time = time.time()
    n_examples, n_batches = 0, 0
    # Set before the loop so the summary below stays well-defined even when
    # the prediction stream yields nothing at all.
    duration = 0.0
    while max_batches is None or n_batches <= max_batches:
      try:
        prediction = next(predictions)
      except (StopIteration, tf.errors.OutOfRangeError):
        break
      write_variant_call(writer, prediction, use_tpu)
      n_examples += 1
      # 1-based number of the batch the *next* example would fall into, so
      # the loop stops after exactly max_batches * batch_size examples.
      n_batches = n_examples // batch_size + 1
      duration = time.time() - start_time

      if not FLAGS.use_openvino:
        logging.log_every_n(
            logging.INFO,
            ('Processed %s examples in %s batches [%.3f sec per 100]'),
            _LOG_EVERY_N, n_examples, n_batches, (100 * duration) / n_examples)
    # One last log to capture the extra examples. Skipped when no example was
    # processed: there is no rate to report and the division would fail.
    if not FLAGS.use_openvino and n_examples:
      logging.info('Processed %s examples in %s batches [%.3f sec per 100]',
                   n_examples, n_batches, (100 * duration) / n_examples)

    logging.info('Done calling variants from a total of %d examples.',
                 n_examples)
예제 #19
0
def call_variants(examples_filename,
                  checkpoint_path,
                  model,
                  output_file,
                  execution_hardware='auto',
                  batch_size=16,
                  max_batches=None,
                  use_tpu=False,
                  master=''):
    """Main driver of call_variants.

    Checks the image format of the examples, builds the prediction stream for
    `model` and writes one variant call per prediction to `output_file`.

    Args:
      examples_filename: Path spec of the examples to run inference on.
      checkpoint_path: Checkpoint handed to the estimator, or None. When None
        no predict hooks are installed (unit tests use this).
      model: Model object; must provide `make_estimator` and
        `session_predict_hooks`.
      output_file: Path the variant calls are written to.
      execution_hardware: Must be one of _ALLOW_EXECUTION_HARDWARE. 'cpu'
        sets the GPU and TPU device counts of the session config to 0;
        'accelerator' requires the sanity-check session to list a non-CPU
        device.
      batch_size: Forwarded to `model.make_estimator`; also used to convert
        the number of written examples into a batch count.
      max_batches: If not None, writing stops once the batch count derived
        from the written examples exceeds this value.
      use_tpu: Forwarded to prepare_inputs, `model.make_estimator` and
        write_variant_call.
      master: Forwarded to `model.make_estimator`.

    Raises:
      ValueError: If the examples' image/format is not 'raw' or if
        `execution_hardware` is not an allowed value.
      ExecutionHardwareError: If `execution_hardware` is 'accelerator' but
        only CPU devices are listed.
    """
    blocktime = FLAGS.kmp_blocktime
    if blocktime:
        os.environ['KMP_BLOCKTIME'] = blocktime
        logging.info('Set KMP_BLOCKTIME to %s', os.environ['KMP_BLOCKTIME'])

    # Read a single TFExample to make sure we're not loading an older version.
    image_format = tf_utils.get_format_from_examples_path(examples_filename)
    if image_format is None:
        # No record could be read: emit an empty output file and stop.
        logging.warning(
            'Unable to read any records from %s. Output will contain '
            'zero records.', examples_filename)
        tfrecord.write_tfrecords([], output_file)
        return
    if image_format != six.b('raw'):
        format_error = (
            'The TF examples in {} has image/format \'{}\' '
            '(expected \'raw\') which means you might need to rerun '
            'make_examples to generate the examples again.')
        raise ValueError(format_error.format(examples_filename, image_format))

    # Check accelerator status.
    if execution_hardware not in _ALLOW_EXECUTION_HARDWARE:
        allowed_values = ','.join(_ALLOW_EXECUTION_HARDWARE)
        raise ValueError(
            'Unexpected execution_hardware={} value. Allowed values are {}'
            .format(execution_hardware, allowed_values))
    initializers = tf.group(tf.compat.v1.global_variables_initializer(),
                            tf.compat.v1.local_variables_initializer())

    session_config = tf.compat.v1.ConfigProto()
    if FLAGS.config_string is not None:
        text_format.Parse(FLAGS.config_string, session_config)
    if execution_hardware == 'cpu':
        # Zero individual entries; don't overwrite the entire dictionary.
        for device_kind in ('GPU', 'TPU'):
            session_config.device_count[device_kind] = 0

    # Perform sanity check.
    with tf.compat.v1.Session(config=session_config) as session:
        session.run(initializers)
        if execution_hardware == 'accelerator' and all(
                device.device_type == 'CPU'
                for device in session.list_devices()):
            raise ExecutionHardwareError(
                'execution_hardware is set to accelerator, but no accelerator '
                'was found')
        # redacted
        # sess.list_devices here doesn't return the correct answer. That can only
        # work later, after the device (on the other VM) has been initialized,
        # which is generally not yet.

    # Prepare input stream and estimator.
    input_fn = prepare_inputs(source_path=examples_filename, use_tpu=use_tpu)
    estimator = model.make_estimator(
        batch_size=batch_size,
        master=master,
        use_tpu=use_tpu,
        session_config=session_config,
    )

    # Instantiate the prediction "stream", and select the EMA values from
    # the model.
    if checkpoint_path is not None:
        hooks = [
            make_hook(checkpoint_path)
            for make_hook in model.session_predict_hooks()
        ]
    else:
        # Unit tests use this branch.
        hooks = []
    prediction_stream = iter(
        estimator.predict(
            input_fn=input_fn,
            checkpoint_path=checkpoint_path,
            hooks=hooks))

    # Consume predictions one at a time and write them to output_file.
    logging.info('Writing calls to %s', output_file)
    calls_writer = tfrecord.Writer(output_file)
    with calls_writer:
        started_at = time.time()
        n_examples = 0
        n_batches = 0
        while max_batches is None or n_batches <= max_batches:
            try:
                prediction = next(prediction_stream)
            except (StopIteration, tf.errors.OutOfRangeError):
                break
            write_variant_call(calls_writer, prediction, use_tpu)
            n_examples += 1
            # 1-based number of the batch the next example would fall into.
            n_batches = n_examples // batch_size + 1
            elapsed = time.time() - started_at
            seconds_per_100 = (100 * elapsed) / n_examples

            logging.log_every_n(
                logging.INFO,
                ('Processed %s examples in %s batches [%.3f sec per 100]'),
                _LOG_EVERY_N, n_examples, n_batches, seconds_per_100)

        logging.info('Done evaluating variants')