def testINT8(self):
  """Test INT8 conversion. Results may be different from native case."""
  calib_graph = self.get_trt_graph("INT8")
  result = self.run_calibration(calib_graph, self._input)
  self.assertAllEqual(self._reference, result)
  int8_graph = trt.calib_graph_to_infer_graph(calib_graph)
  result = self.run_graph(int8_graph, self._input)
  self.assertAllClose(self._reference, result, rtol=1.e-03)
  result1 = self.run_graph(int8_graph, self._input)
  self.assertAllEqual(result1, result)
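The test above exercises the standard two-phase TF-TRT INT8 flow: build a calibration graph, feed it representative data, then convert it into an inference graph. A minimal end-to-end sketch of that flow, assuming the TF 1.x contrib API and a frozen GraphDef with "input" and "output" nodes (the helper name and shapes are illustrative, not from the original test):

import tensorflow as tf
import tensorflow.contrib.tensorrt as trt

def int8_convert(graph_def, calib_batches):
    """Illustrative helper: calibrate a frozen graph and convert it to INT8."""
    # Phase 1: insert TRT calibration ops into the graph.
    calib_graph = trt.create_inference_graph(
        input_graph_def=graph_def,
        outputs=["output"],
        max_batch_size=calib_batches[0].shape[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="INT8")
    # Phase 2: run representative batches to collect quantization ranges.
    g = tf.Graph()
    with g.as_default():
        inp, out = tf.import_graph_def(
            calib_graph, return_elements=["input:0", "output:0"], name="")
    with tf.Session(graph=g) as sess:
        for batch in calib_batches:
            sess.run(out, feed_dict={inp: batch})
    # Phase 3: bake the collected ranges into an INT8 inference graph.
    return trt.calib_graph_to_infer_graph(calib_graph)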
    def _RunTest(self, graph_key, use_optimizer, precision_mode,
                 dynamic_infer_engine, dynamic_calib_engine):
        assert precision_mode in [MODE_FP32, MODE_FP16, MODE_INT8]
        input_gdef = TEST_GRAPHS[graph_key].gdef
        self._VerifyGraphDef(graph_key, input_gdef)

        # Get reference result without running trt.
        config_no_trt = self._GetConfigProto(False)
        print("Running original graph w/o trt, config:\n%s" %
              str(config_no_trt))
        ref_result = self._RunGraph(graph_key, input_gdef, self._input,
                                    config_no_trt)

        # Run calibration if necessary.
        if precision_mode == MODE_INT8:

            calib_config = self._GetConfigProto(use_optimizer, precision_mode,
                                                dynamic_calib_engine)
            print("Running calibration graph, config:\n%s" % str(calib_config))
            if use_optimizer:
                self.assertTrue(False)
                # TODO(aaroey): uncomment this and get infer_gdef when this mode is
                # supported.
                # result = self._RunCalibration(graph_key, input_gdef, self._input,
                #                               calib_config)
            else:
                calib_gdef = self._GetTrtGraph(input_gdef, precision_mode,
                                               dynamic_calib_engine)
                self._VerifyGraphDef(graph_key, calib_gdef, precision_mode,
                                     False, dynamic_calib_engine)
                result = self._RunCalibration(graph_key, calib_gdef,
                                              self._input, calib_config)
                infer_gdef = trt.calib_graph_to_infer_graph(calib_gdef)
                self._VerifyGraphDef(graph_key, infer_gdef, precision_mode,
                                     True, dynamic_calib_engine)
            self.assertAllClose(ref_result, result, rtol=1.e-03)
        else:
            infer_gdef = input_gdef

        # Run inference.
        infer_config = self._GetConfigProto(use_optimizer, precision_mode,
                                            dynamic_infer_engine)
        print("Running final inference graph, config:\n%s" % str(infer_config))
        if use_optimizer:
            result = self._RunGraph(graph_key, infer_gdef, self._input,
                                    infer_config)
        else:
            trt_infer_gdef = self._GetTrtGraph(infer_gdef, precision_mode,
                                               dynamic_infer_engine)
            self._VerifyGraphDef(graph_key, trt_infer_gdef, precision_mode,
                                 True, dynamic_infer_engine)
            result = self._RunGraph(graph_key, trt_infer_gdef, self._input,
                                    infer_config)
        self.assertAllClose(ref_result, result, rtol=1.e-03)
Example #4
def user(multi_engine,
         run_graph=execute_graph,
         run_calibration=execute_calibration):
    """Example function that converts a graph to TFTRT graph."""
    if multi_engine:
        inp_dims = (2, 3, 7, 5)
        orig_graph = get_multi_engine_graph_def()
    else:
        inp_dims = (100, 24, 24, 2)
        orig_graph = get_simple_graph_def()  # use a frozen graph for inference
    dummy_input = np.random.random_sample(inp_dims)
    # Get optimized graph
    trt_graph = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="FP32",  # TRT Engine precision "FP32","FP16" or "INT8"
        minimum_segment_size=2,  # minimum number of nodes in an engine
        is_dynamic_op=False,
        maximum_cached_engines=1,
        cached_engine_batch_sizes=[])
    o1 = run_graph(orig_graph, dummy_input)
    o2 = run_graph(trt_graph, dummy_input)
    o3 = run_graph(trt_graph, dummy_input)
    assert np.array_equal(o1, o2)
    assert np.array_equal(o3, o2)  # sanity check
    fp16_graph = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
        minimum_segment_size=2,  # minimum number of nodes in an engine
        is_dynamic_op=False,
        maximum_cached_engines=1,
        cached_engine_batch_sizes=[])
    int8_calib_gdef = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="INT8",  # TRT Engine precision "FP32","FP16" or "INT8"
        minimum_segment_size=2,  # minimum number of nodes in an engine
        is_dynamic_op=False,
        maximum_cached_engines=1,
        cached_engine_batch_sizes=[])
    o4 = run_graph(fp16_graph, dummy_input)
    _ = run_calibration(int8_calib_gdef, dummy_input)
    int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
    o5 = run_graph(int8_graph, dummy_input)
    print("Is FP32 == FP16? %s (False is possible)" % np.allclose(o1, o4))
    print("Is FP32 == INT8? %s (False is possible)" % np.allclose(o1, o5))
    print("Pass")
Example #5
def user(multi_engine,
         run_graph=execute_graph,
         run_calibration=execute_calibration):
  """Example function that converts a graph to TFTRT graph."""
  if multi_engine:
    inp_dims = (2, 3, 7, 5)
    orig_graph = get_multi_engine_graph_def()
  else:
    inp_dims = (100, 24, 24, 2)
    orig_graph = get_simple_graph_def()  # use a frozen graph for inference
  dummy_input = np.random.random_sample(inp_dims)
  # Get optimized graph
  trt_graph = trt.create_inference_graph(
      input_graph_def=orig_graph,
      outputs=["output"],
      max_batch_size=inp_dims[0],
      max_workspace_size_bytes=1 << 25,
      precision_mode="FP32",  # TRT Engine precision "FP32","FP16" or "INT8"
      minimum_segment_size=2,  # minimum number of nodes in an engine
      is_dynamic_op=False,
      maximum_cached_engines=1,
      cached_engine_batches=[])
  o1 = run_graph(orig_graph, dummy_input)
  o2 = run_graph(trt_graph, dummy_input)
  o3 = run_graph(trt_graph, dummy_input)
  assert np.array_equal(o1, o2)
  assert np.array_equal(o3, o2)  # sanity check
  fp16_graph = trt.create_inference_graph(
      input_graph_def=orig_graph,
      outputs=["output"],
      max_batch_size=inp_dims[0],
      max_workspace_size_bytes=1 << 25,
      precision_mode="FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
      minimum_segment_size=2,  # minimum number of nodes in an engine
      is_dynamic_op=False,
      maximum_cached_engines=1,
      cached_engine_batches=[])
  int8_calib_gdef = trt.create_inference_graph(
      input_graph_def=orig_graph,
      outputs=["output"],
      max_batch_size=inp_dims[0],
      max_workspace_size_bytes=1 << 25,
      precision_mode="INT8",  # TRT Engine precision "FP32","FP16" or "INT8"
      minimum_segment_size=2,  # minimum number of nodes in an engine
      is_dynamic_op=False,
      maximum_cached_engines=1,
      cached_engine_batches=[])
  o4 = run_graph(fp16_graph, dummy_input)
  _ = run_calibration(int8_calib_gdef, dummy_input)
  int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
  o5 = run_graph(int8_graph, dummy_input)
  print("Is FP32 == FP16? %s (False is possible)" % np.allclose(o1, o4))
  print("Is FP32 == INT8? %s (False is possible)" % np.allclose(o1, o5))
  print("Pass")
  def _RunTest(self, graph_key, use_optimizer, precision_mode,
               dynamic_infer_engine, dynamic_calib_engine):
    assert precision_mode in [MODE_FP32, MODE_FP16, MODE_INT8]
    input_gdef = TEST_GRAPHS[graph_key].gdef
    self._VerifyGraphDef(graph_key, input_gdef)

    # Get reference result without running trt.
    config_no_trt = self._GetConfigProto(False)
    print("Running original graph w/o trt, config:\n%s" % str(config_no_trt))
    ref_result = self._RunGraph(graph_key, input_gdef, self._input,
                                config_no_trt)

    # Run calibration if necessary.
    if precision_mode == MODE_INT8:

      calib_config = self._GetConfigProto(use_optimizer, precision_mode,
                                          dynamic_calib_engine)
      print("Running calibration graph, config:\n%s" % str(calib_config))
      if use_optimizer:
        self.assertTrue(False)
        # TODO(aaroey): uncomment this and get infer_gdef when this mode is
        # supported.
        # result = self._RunCalibration(graph_key, input_gdef, self._input,
        #                               calib_config)
      else:
        calib_gdef = self._GetTrtGraph(input_gdef, precision_mode,
                                       dynamic_calib_engine)
        self._VerifyGraphDef(graph_key, calib_gdef, precision_mode, False,
                             dynamic_calib_engine)
        result = self._RunCalibration(graph_key, calib_gdef, self._input,
                                      calib_config)
        infer_gdef = trt.calib_graph_to_infer_graph(calib_gdef)
        self._VerifyGraphDef(graph_key, infer_gdef, precision_mode, True,
                             dynamic_calib_engine)
      self.assertAllClose(ref_result, result, rtol=1.e-03)
    else:
      infer_gdef = input_gdef

    # Run inference.
    infer_config = self._GetConfigProto(use_optimizer, precision_mode,
                                        dynamic_infer_engine)
    print("Running final inference graph, config:\n%s" % str(infer_config))
    if use_optimizer:
      result = self._RunGraph(graph_key, infer_gdef, self._input, infer_config)
    else:
      trt_infer_gdef = self._GetTrtGraph(infer_gdef, precision_mode,
                                         dynamic_infer_engine)
      self._VerifyGraphDef(graph_key, trt_infer_gdef, precision_mode, True,
                           dynamic_infer_engine)
      result = self._RunGraph(graph_key, trt_infer_gdef, self._input,
                              infer_config)
    self.assertAllClose(ref_result, result, rtol=1.e-03)
Example #7
def getINT8InferenceGraph():
    with tf.gfile.FastGFile('/host_temp/test/test_INT8_batch1_trt_graph.pb', 'rb') as f:
        calibGraph = tf.GraphDef()
        calibGraph.ParseFromString(f.read())
    trt_graph = trt.calib_graph_to_infer_graph(calibGraph)
    with tf.gfile.FastGFile("CPN_TRTINT8.pb", 'wb') as f:
        f.write(trt_graph.SerializeToString())
    return trt_graph
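The returned GraphDef can be imported and run like any other frozen graph. A brief usage sketch; the tensor names and input shape below are placeholders that would have to match the CPN model (assumptions, not taken from the original):

import numpy as np
import tensorflow as tf

infer_graph_def = getINT8InferenceGraph()
g = tf.Graph()
with g.as_default():
    tf.import_graph_def(infer_graph_def, name='')
with tf.Session(graph=g) as sess:
    dummy = np.zeros((1, 256, 192, 3), dtype=np.float32)  # hypothetical input shape
    # 'inputs:0' / 'outputs:0' are hypothetical tensor names for illustration.
    _ = sess.run('outputs:0', feed_dict={'inputs:0': dummy})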
Example #8
def convert_tensorrt_speedup_graph(input_graph_path,
                                   output_graph_path,
                                   data_type="FP32",
                                   calibrate_img_dir=""):
    output_node_names = ["score_list"]
    batch_size = 1
    workspace_size = 1 << 30
    precision = data_type

    trt_graph = trt.create_inference_graph(
        input_graph_def=get_graph_definition(input_graph_path),
        outputs=output_node_names,
        max_batch_size=batch_size,
        max_workspace_size_bytes=workspace_size,
        precision_mode=precision,
        minimum_segment_size=3)

    if data_type == "FP32" or data_type == "FP16":
        # save the new graph transformed by tensorRT
        with gfile.FastGFile(output_graph_path, "wb") as f:
            f.write(trt_graph.SerializeToString())  # serialize and write the converted graph
        print("convert tensorrt {} speed up graph finished".format(data_type))
    elif data_type == "INT8":
        calib_graph = tf.Graph()
        with calib_graph.as_default():
            tf.import_graph_def(trt_graph, name='')

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        #config.gpu_options.per_process_gpu_memory_fraction = 0.50
        sess = tf.Session(graph=calib_graph, config=config)

        inputs = calib_graph.get_tensor_by_name('inputs:0')
        is_training = calib_graph.get_tensor_by_name('is_training:0')
        prediction = calib_graph.get_tensor_by_name('score_list:0')

        # calibrate graph
        image_files = glob.glob(os.path.join(calibrate_img_dir, '*.*'))
        for image_path in image_files:
            preprocess_and_inference(image_path, sess, inputs, is_training,
                                     prediction)

        infer_graph = trt.calib_graph_to_infer_graph(trt_graph)
        with gfile.FastGFile(output_graph_path, 'wb') as f:
            f.write(infer_graph.SerializeToString())
        print("convert tensorrt {} speed up graph finished".format(data_type))
    else:
        print("data_type error, return")
Example #9
    def __init__(self,
                 graph_path,
                 use_tensorrt=False,
                 fp_mode='FP32',
                 target_size=(320, 240)):
        self.target_size = target_size
        print('fp mode is {}'.format(fp_mode))
        graph_def = None
        if use_tensorrt and tf.__version__ > '1.7':
            graph_def = trt.create_inference_graph(
                input_graph_def=load_graph(graph_path),
                outputs=['Openpose/concat_stage7'],
                max_batch_size=1,
                max_workspace_size_bytes=1 << 16,  # workspace size in bytes
                precision_mode=fp_mode)
            if fp_mode == "INT8":
                graph_def = trt.calib_graph_to_infer_graph(graph_def)

        else:
            with tf.gfile.GFile(graph_path, 'rb') as f:
                graph_def = tf.GraphDef()
                graph_def.ParseFromString(f.read())

        self.graph = tf.get_default_graph()
        tf.import_graph_def(graph_def, name='')
        self.persistent_sess = tf.Session(graph=self.graph)
        #for op in self.graph.get_operations():
        #    print(op.name)

        self.tensor_image = self.graph.get_tensor_by_name('image:0')
        self.tensor_output = self.graph.get_tensor_by_name(
            'Openpose/concat_stage7:0')

        self.heatMat = self.pafMat = None

        # warm-up
        self.persistent_sess.run(self.tensor_output,
                                 feed_dict={
                                     self.tensor_image: [
                                         np.ndarray(shape=(target_size[1],
                                                           target_size[0], 3),
                                                    dtype=np.float32)
                                     ]
                                 })
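A usage sketch for the constructor above; the class name `PoseEstimator` and the graph path are illustrative assumptions:

import numpy as np

# Hypothetical class name and graph path, for illustration only.
estimator = PoseEstimator('graph_opt.pb', use_tensorrt=True, fp_mode='FP16',
                          target_size=(320, 240))
frame = np.zeros((240, 320, 3), dtype=np.float32)  # stand-in for a camera frame
heat_paf = estimator.persistent_sess.run(
    estimator.tensor_output, feed_dict={estimator.tensor_image: [frame]})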
Example #10
def user(run_graph=execute_graph, run_calibration=execute_calibration):
  """Example function that converts a graph to TFTRT graph."""

  inp_dims = (100, 24, 24, 2)
  dummy_input = np.random.random_sample(inp_dims)
  orig_graph = get_simple_graph_def()  # use a frozen graph for inference
  # Get optimized graph
  trt_graph = trt.create_inference_graph(
      input_graph_def=orig_graph,
      outputs=["output"],
      max_batch_size=inp_dims[0],
      max_workspace_size_bytes=1 << 25,
      precision_mode="FP32",  # TRT Engine precision "FP32","FP16" or "INT8"
      minimum_segment_size=2  # minimum number of nodes in an engine
  )
  o1 = run_graph(orig_graph, dummy_input)
  o2 = run_graph(trt_graph, dummy_input)
  o3 = run_graph(trt_graph, dummy_input)
  assert np.array_equal(o1, o2)
  assert np.array_equal(o3, o2)  # sanity check
  fp16_graph = trt.create_inference_graph(
      input_graph_def=orig_graph,
      outputs=["output"],
      max_batch_size=inp_dims[0],
      max_workspace_size_bytes=1 << 25,
      precision_mode="FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
      minimum_segment_size=2  # minimum number of nodes in an engine
  )
  int8_calib_gdef = trt.create_inference_graph(
      input_graph_def=orig_graph,
      outputs=["output"],
      max_batch_size=inp_dims[0],
      max_workspace_size_bytes=1 << 25,
      precision_mode="INT8",  # TRT Engine precision "FP32","FP16" or "INT8"
      minimum_segment_size=2  # minimum number of nodes in an engine
  )
  o4 = run_graph(fp16_graph, dummy_input)
  _ = run_calibration(int8_calib_gdef, dummy_input)
  int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
  o5 = run_graph(int8_graph, dummy_input)
  assert np.allclose(o1, o4)
  assert np.allclose(o1, o5)
  print("Pass")
Example #11
def user(run_graph=execute_graph, run_calibration=execute_calibration):
    """Example function that converts a graph to TFTRT graph."""

    inp_dims = (100, 24, 24, 2)
    dummy_input = np.random.random_sample(inp_dims)
    orig_graph = get_simple_graph_def()  # use a frozen graph for inference
    # Get optimized graph
    trt_graph = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="FP32",  # TRT Engine precision "FP32","FP16" or "INT8"
        minimum_segment_size=2  # minimum number of nodes in an engine
    )
    o1 = run_graph(orig_graph, dummy_input)
    o2 = run_graph(trt_graph, dummy_input)
    o3 = run_graph(trt_graph, dummy_input)
    assert np.array_equal(o1, o2)
    assert np.array_equal(o3, o2)  # sanity check
    fp16_graph = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
        minimum_segment_size=2  # minimum number of nodes in an engine
    )
    int8_calib_gdef = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="INT8",  # TRT Engine precision "FP32","FP16" or "INT8"
        minimum_segment_size=2  # minimum number of nodes in an engine
    )
    o4 = run_graph(fp16_graph, dummy_input)
    _ = run_calibration(int8_calib_gdef, dummy_input)
    int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
    o5 = run_graph(int8_graph, dummy_input)
    assert np.allclose(o1, o4)
    assert np.allclose(o1, o5)
    print("Pass")
Example #12
def get_trt_graph_from_calib(graph_name, calib_graph_def, output_dir):
    """Convert a TensorRT graph used for calibration to an inference graph."""
    trt_graph = trt.calib_graph_to_infer_graph(calib_graph_def)
    write_graph_to_file(graph_name, trt_graph, output_dir)
    return trt_graph
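`write_graph_to_file` is not shown above; a plausible minimal implementation (the name and file layout are assumptions):

import os
import tensorflow as tf

def write_graph_to_file(graph_name, graph_def, output_dir):
    """Assumed helper: serialize a GraphDef to <output_dir>/<graph_name>."""
    output_path = os.path.join(output_dir, graph_name)
    with tf.gfile.GFile(output_path, 'wb') as f:
        f.write(graph_def.SerializeToString())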
Example #13
 def build_forward_pass_graph(self,
                              input_tensors,
                              gpu_id=0,
                              checkpoint=None,
                              use_trt=False,
                              precision='FP32'):
     """Wrapper around _build_forward_pass_graph with option of using TF-TRT"""
     if use_trt:
         import tensorflow.contrib.tensorrt as trt
         # Create temporary graph which will contain the native TF graph
         tf_config = tf.ConfigProto()
         tf_config.gpu_options.allow_growth = True
         temp_graph = tf.Graph()
         with temp_graph.as_default() as tf_graph:
             with tf.Session(config=tf_config) as tf_sess:
                 input_placeholders = {
                     'source_tensors': [
                         tf.placeholder(shape=(None, None),
                                        dtype=tf.int32,
                                        name='input_map1'),
                         tf.placeholder(shape=(None, None),
                                        dtype=tf.int32,
                                        name='input_map2')
                     ]
                 }
                 loss, self._outputs[
                     gpu_id] = self._build_forward_pass_graph(
                         input_placeholders, gpu_id=gpu_id)
                 output_node_names = [
                     x.name.split(':0')[0] for x in self._outputs[gpu_id]
                 ]
                 # Restore checkpoint here because we have to freeze the graph
                 tf_saver = tf.train.Saver()
                 tf_saver.restore(save_path=checkpoint, sess=tf_sess)
                 frozen_graph = tf.graph_util.convert_variables_to_constants(
                     tf_sess,
                     tf_sess.graph_def,
                     output_node_names=output_node_names)
                 num_nodes = len(frozen_graph.node)
                 print('Converting graph using TensorFlow-TensorRT...')
                 frozen_graph = trt.create_inference_graph(
                     input_graph_def=frozen_graph,
                     outputs=output_node_names,
                     max_batch_size=64,
                     max_workspace_size_bytes=4096 << 20,
                     precision_mode=precision,
                     minimum_segment_size=3)
                 print(
                     'Total node count before and after TF-TRT conversion:',
                     num_nodes, '->', len(frozen_graph.node))
                 print(
                     'TRT node count:',
                     len([
                         1 for n in frozen_graph.node
                         if str(n.op) == 'TRTEngineOp'
                     ]))
         # Perform calibration for INT8 precision mode
         if precision == 'int8':
             with tf.Session(config=tf_config) as tf_sess:
                 calib_graph = frozen_graph
                 num_iterations = 10
                 print('Calibrating INT8...')
                 self._outputs[gpu_id] = tf.import_graph_def(
                     calib_graph,
                     input_map={
                         'input_map1': input_tensors['source_tensors'][0]
                     },
                     return_elements=[x + ':0' for x in output_node_names],
                     name='')
                 self._num_objects_per_step = [
                     self._get_num_objects_per_step(worker_id)
                     for worker_id in range(self.num_gpus)
                 ]
                 results_per_batch = iterate_data(self,
                                                  tf_sess,
                                                  compute_loss=False,
                                                  mode='infer',
                                                  verbose=False,
                                                  num_steps=num_iterations)
                 frozen_graph = trt.calib_graph_to_infer_graph(calib_graph)
                 del calib_graph
                 print('INT8 graph created.')
                 print('Nodes INT8:', len(frozen_graph.node))
         # Import TRT converted graph to default graph, mapping it to the original input tensors
         self._outputs[gpu_id] = tf.import_graph_def(
             frozen_graph,
             input_map={'input_map1': input_tensors['source_tensors'][0]},
             return_elements=[x + ':0' for x in output_node_names],
             name='')
         return loss, self._outputs[gpu_id]
     else:
         return self._build_forward_pass_graph(input_tensors, gpu_id)
Example #14
      minimum_segment_size=2  # minimum number of nodes in an engine
  )
  o1 = run_graph(orig_graph, dummy_input)
  o2 = run_graph(trt_graph, dummy_input)
  o3 = run_graph(trt_graph, dummy_input)
  assert np.array_equal(o1, o2)
  assert np.array_equal(o3, o2)  # sanity check
  fp16_graph = trt.create_inference_graph(
      input_graph_def=orig_graph,
      outputs=["output"],
      max_batch_size=inp_dims[0],
      max_workspace_size_bytes=1 << 25,
      precision_mode="FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
      minimum_segment_size=2  # minimum number of nodes in an engine
  )
  int8_calib_gdef = trt.create_inference_graph(
      input_graph_def=orig_graph,
      outputs=["output"],
      max_batch_size=inp_dims[0],
      max_workspace_size_bytes=1 << 25,
      precision_mode="INT8",  # TRT Engine precision "FP32","FP16" or "INT8"
      minimum_segment_size=2  # minimum number of nodes in an engine
  )
  o4 = run_graph(fp16_graph, dummy_input)
  _ = run_calibration(int8_calib_gdef, dummy_input)
  int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
  o5 = run_graph(int8_graph, dummy_input)
  assert np.allclose(o1, o4)
  assert np.allclose(o1, o5)
  print("Pass")
def getINT8InferenceGraph(calibGraph):
  trt_graph = trt.calib_graph_to_infer_graph(calibGraph)
  with gfile.FastGFile("resnetV250_TRTINT8_chest.pb", 'wb') as f:
    f.write(trt_graph.SerializeToString())
  return trt_graph
def get_frozen_graph(model,
                     use_trt=False,
                     use_dynamic_op=False,
                     precision='fp32',
                     batch_size=8,
                     minimum_segment_size=2,
                     calib_data_dir=None,
                     num_calib_inputs=None,
                     use_synthetic=False,
                     cache=False,
                     download_dir='./data'):
    """Retreives a frozen GraphDef from model definitions in classification.py and applies TF-TRT

    model: str, the model name (see NETS table in classification.py)
    use_trt: bool, if true, use TensorRT
    precision: str, floating point precision (fp32, fp16, or int8)
    batch_size: int, batch size for TensorRT optimizations
    returns: tensorflow.GraphDef, the TensorRT compatible frozen graph
    """
    num_nodes = {}
    times = {}

    # Load from pb file if frozen graph was already created and cached
    if cache:
        # Graph must match the model, TRT mode, precision, and batch size
        prebuilt_graph_path = "graphs/frozen_graph_%s_%d_%s_%d.pb" % (
            model, int(use_trt), precision, batch_size)
        if os.path.isfile(prebuilt_graph_path):
            print('Loading cached frozen graph from \'%s\'' %
                  prebuilt_graph_path)
            start_time = time.time()
            with tf.gfile.GFile(prebuilt_graph_path, "rb") as f:
                frozen_graph = tf.GraphDef()
                frozen_graph.ParseFromString(f.read())
            times['loading_frozen_graph'] = time.time() - start_time
            num_nodes['loaded_frozen_graph'] = len(frozen_graph.node)
            num_nodes['trt_only'] = len(
                [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp'])
            return frozen_graph, num_nodes, times

    # Build graph and load weights
    #frozen_graph = build_classification_graph(model, download_dir)
    model_dir = os.path.join(os.environ['APP_HOME'], "Modules",
                             "Deep-Learning", "packages", "models")
    frozen_graph = create_graph(model_dir, FLAGS.frozen_graph)
    num_nodes['native_tf'] = len(frozen_graph.node)

    # Convert to TensorRT graph
    if use_trt:
        start_time = time.time()
        frozen_graph = trt.create_inference_graph(
            input_graph_def=frozen_graph,
            outputs=['resnet_v1_50/SpatialSqueeze:0'],
            max_batch_size=batch_size,
            max_workspace_size_bytes=(4096 << 20) - 1000,
            precision_mode=precision,
            minimum_segment_size=minimum_segment_size,
            is_dynamic_op=use_dynamic_op)
        times['trt_conversion'] = time.time() - start_time
        num_nodes['tftrt_total'] = len(frozen_graph.node)
        num_nodes['trt_only'] = len(
            [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp'])

        if precision == 'int8':
            calib_graph = frozen_graph
            # INT8 calibration step
            print('Calibrating INT8...')
            batch_data = []
            files_dir = 'input_images'
            data_dir = os.path.join(os.environ['APP_HOME'], "Modules",
                                    "Deep-Learning", "packages", files_dir)
            files = os.listdir(data_dir)
            for f in files:
                if f.lower().endswith(('.png', '.jpg', '.jpeg')):
                    image_path = files_dir + '/' + f
                    batch_data = batch_from_image(image_path, FLAGS.batch_size,
                                                  batch_data)
                    print(image_path)
            run_inference(batch_data, calib_graph)
            #run(calib_graph, model, calib_data_dir, batch_size,
            #    num_calib_inputs // batch_size, 0, False)
            frozen_graph = trt.calib_graph_to_infer_graph(calib_graph)
            del calib_graph
            print('INT8 graph created.')

    # Cache graph to avoid long conversions each time
    if cache:
        if not os.path.exists(os.path.dirname(prebuilt_graph_path)):
            try:
                os.makedirs(os.path.dirname(prebuilt_graph_path))
            except Exception as e:
                raise e
        start_time = time.time()
        with tf.gfile.GFile(prebuilt_graph_path, "wb") as f:
            f.write(frozen_graph.SerializeToString())
        times['saving_frozen_graph'] = time.time() - start_time

    return frozen_graph, num_nodes, times
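The calibration loop above relies on `batch_from_image` and `run_inference`, which are defined elsewhere; a hedged sketch of what `batch_from_image` might do, with the image size and preprocessing as assumptions:

import numpy as np
from PIL import Image

def batch_from_image(image_path, batch_size, batch_data, image_size=224):
    """Assumed helper: load and resize an image, then append it to batch_data."""
    img = Image.open(image_path).convert('RGB').resize((image_size, image_size))
    batch_data.append(np.asarray(img, dtype=np.float32) / 255.0)
    # Keep at most batch_size images in the current calibration batch.
    return batch_data[-batch_size:]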
def get_frozen_graph(model,
                     model_dir=None,
                     use_trt=False,
                     use_dynamic_op=False,
                     precision='fp32',
                     batch_size=8,
                     minimum_segment_size=2,
                     calib_files=None,
                     num_calib_inputs=None,
                     use_synthetic=False,
                     cache=False,
                     default_models_dir='./data',
                     max_workspace_size=(1 << 32)):
    """Retreives a frozen GraphDef from model definitions in classification.py and applies TF-TRT

    model: str, the model name (see NETS table in classification.py)
    use_trt: bool, if true, use TensorRT
    precision: str, floating point precision (fp32, fp16, or int8)
    batch_size: int, batch size for TensorRT optimizations
    returns: tensorflow.GraphDef, the TensorRT compatible frozen graph
    """
    num_nodes = {}
    times = {}
    graph_sizes = {}

    # Load from pb file if frozen graph was already created and cached
    if cache:
        # Graph must match the model, TRT mode, precision, and batch size
        prebuilt_graph_path = "graphs/frozen_graph_%s_%d_%s_%d.pb" % (
            model, int(use_trt), precision, batch_size)
        if os.path.isfile(prebuilt_graph_path):
            print('Loading cached frozen graph from \'%s\'' %
                  prebuilt_graph_path)
            start_time = time.time()
            with tf.gfile.GFile(prebuilt_graph_path, "rb") as f:
                frozen_graph = tf.GraphDef()
                frozen_graph.ParseFromString(f.read())
            times['loading_frozen_graph'] = time.time() - start_time
            num_nodes['loaded_frozen_graph'] = len(frozen_graph.node)
            num_nodes['trt_only'] = len(
                [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp'])
            graph_sizes['loaded_frozen_graph'] = len(
                frozen_graph.SerializeToString())
            return frozen_graph, num_nodes, times, graph_sizes

    # Build graph and load weights
    frozen_graph = build_classification_graph(model, model_dir,
                                              default_models_dir)
    num_nodes['native_tf'] = len(frozen_graph.node)
    graph_sizes['native_tf'] = len(frozen_graph.SerializeToString())

    # Convert to TensorRT graph
    if use_trt:
        start_time = time.time()
        frozen_graph = trt.create_inference_graph(
            input_graph_def=frozen_graph,
            outputs=['logits', 'classes'],
            max_batch_size=batch_size,
            max_workspace_size_bytes=max_workspace_size,
            precision_mode=precision.upper(),
            minimum_segment_size=minimum_segment_size,
            is_dynamic_op=use_dynamic_op)
        times['trt_conversion'] = time.time() - start_time
        num_nodes['tftrt_total'] = len(frozen_graph.node)
        num_nodes['trt_only'] = len(
            [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp'])
        graph_sizes['trt'] = len(frozen_graph.SerializeToString())

        if precision == 'int8':
            calib_graph = frozen_graph
            graph_sizes['calib'] = len(calib_graph.SerializeToString())
            # INT8 calibration step
            print('Calibrating INT8...')
            start_time = time.time()
            run(calib_graph,
                model,
                calib_files,
                batch_size,
                num_calib_inputs // batch_size,
                0,
                use_synthetic=use_synthetic,
                run_calibration=True)
            times['trt_calibration'] = time.time() - start_time

            start_time = time.time()
            frozen_graph = trt.calib_graph_to_infer_graph(calib_graph)
            times['trt_int8_conversion'] = time.time() - start_time
            # This is already set but overwriting it here to ensure the right size
            graph_sizes['trt'] = len(frozen_graph.SerializeToString())

            del calib_graph
            print('INT8 graph created.')

    # Cache graph to avoid long conversions each time
    if cache:
        if not os.path.exists(os.path.dirname(prebuilt_graph_path)):
            try:
                os.makedirs(os.path.dirname(prebuilt_graph_path))
            except Exception as e:
                raise e
        start_time = time.time()
        with tf.gfile.GFile(prebuilt_graph_path, "wb") as f:
            f.write(frozen_graph.SerializeToString())
        times['saving_frozen_graph'] = time.time() - start_time

    return frozen_graph, num_nodes, times, graph_sizes
Example #18
def get_frozen_graph(model,
                     model_dir=None,
                     use_trt=False,
                     use_dynamic_op=False,
                     precision='fp32',
                     batch_size=8,
                     minimum_segment_size=2,
                     calib_files=None,
                     num_calib_inputs=None,
                     use_synthetic=False,
                     cache=False,
                     default_models_dir='./data',
                     max_workspace_size=(1 << 32)):
    """Retreives a frozen GraphDef from model definitions in classification.py and applies TF-TRT

    model: str, the model name (see NETS table in classification.py)
    use_trt: bool, if true, use TensorRT
    precision: str, floating point precision (fp32, fp16, or int8)
    batch_size: int, batch size for TensorRT optimizations
    returns: tensorflow.GraphDef, the TensorRT compatible frozen graph
    """
    num_nodes = {}
    times = {}
    graph_sizes = {}

    # Load from pb file if frozen graph was already created and cached
    if cache:
        # Graph must match the model, TRT mode, precision, and batch size
        prebuilt_graph_path = "graphs/frozen_graph_%s_%d_%s_%d.pb" % (
            model, int(use_trt), precision, batch_size)
        if os.path.isfile(prebuilt_graph_path):
            print('Loading cached frozen graph from \'%s\'' %
                  prebuilt_graph_path)
            start_time = time.time()
            with tf.gfile.GFile(prebuilt_graph_path, "rb") as f:
                frozen_graph = tf.GraphDef()
                frozen_graph.ParseFromString(f.read())
            times['loading_frozen_graph'] = time.time() - start_time
            num_nodes['loaded_frozen_graph'] = len(frozen_graph.node)
            num_nodes['trt_only'] = len(
                [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp'])
            graph_sizes['loaded_frozen_graph'] = len(
                frozen_graph.SerializeToString())
            return frozen_graph, num_nodes, times, graph_sizes

    # Build graph and load weights
    frozen_graph = build_classification_graph(model, model_dir,
                                              default_models_dir)

    num_nodes['native_tf'] = len(frozen_graph.node)
    graph_sizes['native_tf'] = len(frozen_graph.SerializeToString())

    export_dir = './saved_model4/1'
    graph_pb = './graphs/frozen_graph_inception_v3_0_fp32_8.pb'
    ''' 
    builder = tf.saved_model.builder.SavedModelBuilder(export_dir)

    with tf.gfile.GFile(graph_pb, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    sigs = {}

    with tf.Session(graph=tf.Graph()) as sess:
                    # name="" is important to ensure we don't get spurious prefixing
        tf.import_graph_def(graph_def, name="")
        #tf.summary.FileWriter('inception_v3_event', sess.graph)
        g = tf.get_default_graph()
        inp = g.get_tensor_by_name("input:0")
        out = g.get_tensor_by_name("ArgMax:0")
        
        print(inp)
        print(out)
        #worked version
        prediction_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
              inputs={'image': tf.saved_model.utils.build_tensor_info(inp)},
              outputs={'out':tf.saved_model.utils.build_tensor_info(out)},
              method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

        #legacy_init_op = tf.group(tf.tables.initializer(), name='legacy_init_op')

        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'predict_images':
                               prediction_signature,
                               })
        builder.save()
        
        #Failed method 1:
        
        image_height_tensor = tf.placeholder(tf.int32)
        image_width_tensor = tf.placeholder(tf.int32)
        #placeholder for receiving the serialized input image
        serialized_tf_example = tf.placeholder(tf.string, name='tf_example')
        feature_configs = {'x': tf.FixedLenFeature(shape=[], dtype=tf.float32), }
        tf_example = tf.parse_example(serialized_tf_example, feature_configs)

        # reshape the input image to its original dimension
        tf_example['x'] = tf.reshape(tf_example['x'], (1, image_height_tensor, image_width_tensor, 3))
        x = tf.identity(tf_example['x'], name='x')  # use tf.identity() to assign name
        inp = x
        # perform inference on the input image

        print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
        print(inp)
        print(out)

        # Creates the TensorInfo protobuf objects that encapsulates the input/output tensors
        tensor_info_input = tf.saved_model.utils.build_tensor_info(x)
        tensor_info_height = tf.saved_model.utils.build_tensor_info(image_height_tensor)
        tensor_info_width = tf.saved_model.utils.build_tensor_info(image_width_tensor)

        # output tensor info
        tensor_info_output = tf.saved_model.utils.build_tensor_info(out)

        prediction_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
              inputs={'images': tensor_info_input, 'height': tensor_info_height, 'width': tensor_info_width},
              outputs={'segmentation_map': tensor_info_output},
              method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'predict_images':
                               prediction_signature,
                               })
        builder.save()
        #failed version 2
        sigs[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = \
        tf.saved_model.signature_def_utils.predict_signature_def(
                {'images': tensor_info_input, 'height':tensor_info_height, 'width': tensor_info_width}, {"out": tensor_info_output})

        builder.add_meta_graph_and_variables(sess,
        [tf.saved_model.tag_constants.SERVING],
        signature_def_map=sigs)
        '''

    # Convert to TensorRT graph
    if use_trt:
        start_time = time.time()
        frozen_graph = trt.create_inference_graph(
            input_graph_def=frozen_graph,
            outputs=['logits', 'classes'],
            max_batch_size=batch_size,
            max_workspace_size_bytes=max_workspace_size,
            precision_mode=precision.upper(),
            minimum_segment_size=minimum_segment_size,
            is_dynamic_op=use_dynamic_op)
        times['trt_conversion'] = time.time() - start_time
        num_nodes['tftrt_total'] = len(frozen_graph.node)
        num_nodes['trt_only'] = len(
            [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp'])
        graph_sizes['trt'] = len(frozen_graph.SerializeToString())

        if precision == 'int8':
            calib_graph = frozen_graph
            graph_sizes['calib'] = len(calib_graph.SerializeToString())
            # INT8 calibration step
            print('Calibrating INT8...')
            start_time = time.time()
            run(calib_graph,
                model,
                calib_files,
                batch_size,
                num_calib_inputs // batch_size,
                0,
                use_synthetic=use_synthetic)
            times['trt_calibration'] = time.time() - start_time

            start_time = time.time()
            frozen_graph = trt.calib_graph_to_infer_graph(calib_graph)
            times['trt_int8_conversion'] = time.time() - start_time
            # This is already set but overwriting it here to ensure the right size
            graph_sizes['trt'] = len(frozen_graph.SerializeToString())

            del calib_graph
            print('INT8 graph created.')

    # Cache graph to avoid long conversions each time
    print("cache: %s" % cache)
    if cache:
        if not os.path.exists(os.path.dirname(prebuilt_graph_path)):
            try:
                os.makedirs(os.path.dirname(prebuilt_graph_path))
            except Exception as e:
                raise e
        start_time = time.time()
        with tf.gfile.GFile(prebuilt_graph_path, "wb") as f:
            f.write(frozen_graph.SerializeToString())
        times['saving_frozen_graph'] = time.time() - start_time

    return frozen_graph, num_nodes, times, graph_sizes
Example #19
 def get_trt_from_calib(self, grf_calib):
     """Convert a TensorRT graph used for calibration to an inference graph."""
     grf_trt = trt.calib_graph_to_infer_graph(grf_calib)
     return grf_trt
Example #20
def get_frozen_graph(model,
                     model_dir=None,
                     pb_name=None,
                     use_trt=False,
                     use_dynamic_op=False,
                     precision='fp32',
                     batch_size=8,
                     minimum_segment_size=2,
                     calib_files=None,
                     num_calib_inputs=None,
                     cache=False,
                     max_workspace_size=(1 << 32)):
    num_nodes = {}
    times = {}
    graph_sizes = {}
    frozen_graph = tf.GraphDef()

    # Load from pb file if frozen graph was already created and cached
    if pb_name:
        prebuilt_graph_path = os.path.join(model_dir, pb_name)
    else:
        prebuilt_graph_path = os.path.join(model_dir, 'r50_93.40_trt.pb')

    if cache:
        if os.path.isfile(prebuilt_graph_path):
            print('Loading cached frozen graph from \'%s\'' %
                  prebuilt_graph_path)
            start_time = time.time()
            with tf.gfile.GFile(prebuilt_graph_path, "rb") as f:
                frozen_graph = tf.GraphDef()
                frozen_graph.ParseFromString(f.read())
            times['loading_frozen_graph'] = time.time() - start_time
            num_nodes['loaded_frozen_graph'] = len(frozen_graph.node)
            num_nodes['trt_only'] = len(
                [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp'])
            graph_sizes['loaded_frozen_graph'] = len(
                frozen_graph.SerializeToString())
            return frozen_graph, num_nodes, times, graph_sizes

    num_nodes['native_tf'] = len(frozen_graph.node)
    graph_sizes['native_tf'] = len(frozen_graph.SerializeToString())

    # Convert to TensorRT graph
    if use_trt:
        print("Using TensorRT")
        start_time = time.time()
        frozen_graph = trt.create_inference_graph(
            input_graph_def=frozen_graph,
            outputs=PB_OUTPUTS,
            max_batch_size=batch_size,
            max_workspace_size_bytes=max_workspace_size,
            precision_mode=precision,
            minimum_segment_size=minimum_segment_size,
            is_dynamic_op=use_dynamic_op)
        times['trt_conversion'] = time.time() - start_time
        num_nodes['tftrt_total'] = len(frozen_graph.node)
        num_nodes['trt_only'] = len(
            [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp'])
        graph_sizes['trt'] = len(frozen_graph.SerializeToString())

        if precision == 'int8':
            calib_graph = frozen_graph
            graph_sizes['calib'] = len(calib_graph.SerializeToString())
            # INT8 calibration step
            print('Calibrating INT8...')
            start_time = time.time()
            run(calib_graph,
                model,
                calib_files,
                batch_size,
                num_calib_inputs // batch_size,
                0,
                False,
                run_calibration=True)
            times['trt_calibration'] = time.time() - start_time

            start_time = time.time()
            frozen_graph = trt.calib_graph_to_infer_graph(calib_graph)

            times['trt_int8_conversion'] = time.time() - start_time
            # This is already set but overwriting it here to ensure the right size
            graph_sizes['trt'] = len(frozen_graph.SerializeToString())

            del calib_graph
            print('INT8 graph created.')

    # Cache graph to avoid long conversions each time
    if cache:
        saved_pb = os.path.join(model_dir, 'r50.pb')
        if not os.path.exists(os.path.dirname(saved_pb)):
            try:
                os.makedirs(os.path.dirname(saved_pb))
            except Exception as e:
                raise e
        start_time = time.time()
        with tf.gfile.GFile(saved_pb, "wb") as f:
            f.write(frozen_graph.SerializeToString())
        times['saving_frozen_graph'] = time.time() - start_time

    return frozen_graph, num_nodes, times, graph_sizes
Example #21
def main(args):
    workspace_size_bytes = 1 << 30
    trt_gpu_ops = tf.GPUOptions(per_process_gpu_memory_fraction=0.50)
    batches = random_sequences(length_from=3,
                               length_to=10,
                               vocab_lower=2,
                               vocab_upper=10,
                               batch_size=args.batch_size)
    frozen_model = 'frozen_model.pb'

    tf.reset_default_graph()
    # parse the graph_def file
    with tf.gfile.GFile(frozen_model, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    trt_graph_def = trt.create_inference_graph(
        input_graph_def=graph_def,
        outputs=['ArgMax'],
        max_batch_size=args.batch_size,
        max_workspace_size_bytes=workspace_size_bytes,
        precision_mode=args.precision_mode)

    if args.precision_mode == 'INT8':
        trt_graph_def = trt.calib_graph_to_infer_graph(trt_graph_def)
    print('Generated TensorRT graph def')

    with tf.Graph().as_default() as graph:
        encoder_inputs, decoder_inputs, decoder_targets, decoder_prediction = tf.import_graph_def(
            graph_def,
            return_elements=[
                'encoder_inputs:0', 'decoder_inputs:0', 'decoder_targets:0',
                'ArgMax:0'
            ])
    print('Generated tensor by frozen graph')

    with tf.Session(graph=graph,
                    config=tf.ConfigProto(gpu_options=trt_gpu_ops)) as sess:
        for _ in range(args.roll):
            batch = next(batches)
            encoder_inputs_, _ = make_batch(batch)
            decoder_targets_, _ = make_batch([(sequence) + [EOS]
                                              for sequence in batch])
            decoder_inputs_, _ = make_batch([[EOS] + (sequence)
                                             for sequence in batch])
            feed_dict = {
                encoder_inputs: encoder_inputs_,
                decoder_inputs: decoder_inputs_,
                decoder_targets: decoder_targets_
            }
            start_time = time.process_time()
            predict_ = sess.run(decoder_prediction, feed_dict)
            stop_time = time.process_time()
            for i, (inp, pred) in enumerate(
                    zip(feed_dict[encoder_inputs].T, predict_.T)):
                print('input > {}'.format(inp))
                print('predicted > {}'.format(pred))
                if i >= 10:
                    break
            print('{:.2f} milliseconds'.format(
                (stop_time - start_time) * 1000))
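`make_batch` is assumed to pad variable-length sequences into the time-major int32 matrix the imported placeholders expect; a minimal sketch under that assumption (not the original helper):

import numpy as np

def make_batch(sequences, pad_value=0):
    """Assumed helper: pad sequences into a [max_time, batch] int32 matrix."""
    lengths = [len(seq) for seq in sequences]
    batch = np.full((max(lengths), len(sequences)), pad_value, dtype=np.int32)
    for i, seq in enumerate(sequences):
        batch[:len(seq), i] = seq
    return batch, lengths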
Example #22
def convert_tftrt_fp(orig_graph, batch_size, precision):
    # Convert a native TensorFlow GraphDef into a mixed TF-TRT graph.
    trt_graph = trt.create_inference_graph(
        input_graph_def=orig_graph,  # native TensorFlow GraphDef
        outputs=["output"],  # list of output node names
        max_batch_size=batch_size,  # maximum/optimum batch size for the
        # mixed TF-TRT GraphDef
        max_workspace_size_bytes=1 << 25,  # maximum workspace (in bytes) each
        # TRT engine may allocate
        precision_mode=precision,  # TRT engine precision:
        # "FP32", "FP16" or "INT8"
        minimum_segment_size=2  # minimum number of nodes in an engine;
        # subgraphs with fewer nodes than this
        # threshold are left to native TensorFlow
    )

    # allow_growth and restrict Tensorflow to claim all GPU memory
    # currently TensorRT engine uses independent memory allocation outside of TF
    config = tf.ConfigProto(gpu_options=tf.GPUOptions(
        per_process_gpu_memory_fraction=0.5, allow_growth=True))
    # we can now import trt_graph into Tensorflow and execute it. If given target
    # precision_mode as 'FP32' or 'FP16'.
    if precision == 'FP16' or precision == 'FP32':
        return trt_graph

    # 'INT8' precision would require an extra step of calibration
    int8_calib_gdef = trt_graph
    # 'INT8' precision requires calibration to retrieve proper quantization range.
    # trt.create_inference_graph returns a calibration graph def with inserted
    #   calibration op that captures input tensor during session run to feed
    #   TensorRT subgraph during engine construction

    # feed calibration data into the TF-TRT mixed graph
    # this step is just running the calibration graph with a set of representative
    #   input data. (could use a subset of validation data with even distribution
    #   of all categories)
    g = tf.Graph()
    with g.as_default():
        inp, out = tf.import_graph_def(graph_def=int8_calib_gdef,
                                       return_elements=["input", "output"])
        inp = inp.outputs[0]
        out = out.outputs[0]

    # start TF session with TF-TRT graph, execute the graph and feed it with input
    #   calibration_batch should be sharded and fed through the TF-TRT mixed network
    # Should use real data that is representative of the inference dataset for
    #   calibration to reduce quantization error.
    # For this test script it is random data.
    CALIBRATION_BATCH = 100
    inp_dims = (CALIBRATION_BATCH, 24, 24, 2)
    dummy_input = np.random.random_sample(inp_dims)

    # allow_growth and restrict Tensorflow to claim all GPU memory
    # currently TensorRT engine uses independent memory allocation outside of TF
    config = tf.ConfigProto(gpu_options=tf.GPUOptions(
        per_process_gpu_memory_fraction=0.5, allow_growth=True))

    # start session to feed calibration data
    with tf.Session(graph=g, config=config) as sess:
        iteration = int(CALIBRATION_BATCH / batch_size)
        # iterate through the calibration data, each time we feed data with
        #   batch size < BATCH_SIZE (specified during conversion)
        for i in range(iteration):
            val = sess.run(out, {inp: dummy_input[i::iteration]})

    # finished calibration, trigger calib_graph_to_infer_graph to build
    #   TF-TRT mixed graphdef for inference
    int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
    return int8_graph
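Calling the converter is then one line per precision; a brief usage sketch assuming a frozen `orig_graph` with "input" and "output" nodes as in the earlier examples:

fp16_graph = convert_tftrt_fp(orig_graph, batch_size=10, precision='FP16')
int8_graph = convert_tftrt_fp(orig_graph, batch_size=10, precision='INT8')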
Example #23
def getINT8InferenceGraph(output_prefix, calibGraph):
    trt_graph = trt.calib_graph_to_infer_graph(calibGraph)
    with gfile.FastGFile(output_prefix + '.INT8.pb', 'wb') as f:
        f.write(trt_graph.SerializeToString())
    return trt_graph
Example #24
def get_int8_infer_graph(calib_graph, output_pb):
    trt_graph = trt.calib_graph_to_infer_graph(calib_graph)
    with gfile.FastGFile(output_pb, 'wb') as f:
        f.write(trt_graph.SerializeToString())
    return trt_graph
Example #25
def main(args):
    workspace_size_bytes = 1 << 30
    trt_gpu_ops = tf.GPUOptions(per_process_gpu_memory_fraction=0.50)

    # transform images (image -> input vector)
    tf.reset_default_graph()
    g1 = tf.Graph()
    with g1.as_default():
        # create graph
        in_images = tf.placeholder(tf.string, name='in_images')
        decoded_input = tf.image.decode_png(in_images, channels=3)
        float_input = tf.cast(decoded_input, dtype=tf.float32)
        # (224, 224, 3) -> (n, 224, 224, 3)
        rgb_input = tf.expand_dims(float_input, axis=0)
        # for VGG preprocess, reduce means and convert to BGR
        slice_red = tf.slice(rgb_input, [0, 0, 0, 0], [1, 224, 224, 1])
        slice_green = tf.slice(rgb_input, [0, 0, 0, 1], [1, 224, 224, 1])
        slice_blue = tf.slice(rgb_input, [0, 0, 0, 2], [1, 224, 224, 1])
        sub_red = tf.subtract(slice_red, 123.68)
        sub_green = tf.subtract(slice_green, 116.779)
        sub_blue = tf.subtract(slice_blue, 103.939)
        transferred_input = tf.concat([sub_blue, sub_green, sub_red], 3)
        # transform to vectors
        with tf.Session(config=tf.ConfigProto(gpu_options=trt_gpu_ops)) as s1:
            with open('tiger224x224.jpg', 'rb') as f:
                data1 = f.read()
                imglist1 = s1.run([transferred_input], feed_dict={in_images: data1})
                image1 = imglist1[0]
            with open('lion224x224.jpg', 'rb') as f:
                data2 = f.read()
                imglist2 = s1.run([transferred_input], feed_dict={in_images: data2})
                image2 = imglist2[0]
            with open('orangutan224x224.jpg', 'rb') as f:
                data3 = f.read()
                imglist3 = s1.run([transferred_input], feed_dict={in_images: data3})
                image3 = imglist3[0]
    print('Loaded image vectors (tiger, lion, orangutan)')

    # Tile each image to the batch size so that batched prediction is exercised.
    image1 = np.tile(image1, (args.batch_size, 1, 1, 1))
    image2 = np.tile(image2, (args.batch_size, 1, 1, 1))
    image3 = np.tile(image3, (args.batch_size, 1, 1, 1))

    # load classification graph def
    classifier_model_file = 'resnetV150_frozen.pb'
    classifier_graph_def = tf.GraphDef()
    with tf.gfile.Open(classifier_model_file, 'rb') as f:
        data = f.read()
        classifier_graph_def.ParseFromString(data)
    print('Loaded classifier graph def')

    trt_graph_def = trt.create_inference_graph(
        input_graph_def=classifier_graph_def,
        outputs=['resnet_v1_50/predictions/Reshape_1'],
        max_batch_size=args.batch_size,
        max_workspace_size_bytes=workspace_size_bytes,
        precision_mode=args.precision_mode
    )

    if args.precision_mode == 'INT8':
        trt_graph_def = trt.calib_graph_to_infer_graph(trt_graph_def)
    print('Generated TensorRT graph def')

    # generate tensor with TensorRT graph
    tf.reset_default_graph()
    g2 = tf.Graph()
    with g2.as_default():
        trt_x, trt_y = tf.import_graph_def(
            trt_graph_def,
            return_elements=['input:0', 'resnet_v1_50/predictions/Reshape_1:0']
        )
    print('Generated tensor by TensorRT graph')

    # run classification with TensorRT graph
    with open('imagenet_classes.txt', 'rb') as f:
        labeltext = f.read()
        classes_entries = labeltext.splitlines()

    with tf.Session(graph=g2, config=tf.ConfigProto(gpu_options=trt_gpu_ops)) as s2:
        eval_list = [image1, image2, image3]
        for img in eval_list:
            start_time = time.process_time()
            result = s2.run([trt_y], feed_dict={trt_x: img})
            stop_time = time.process_time()
            # list -> 1 x n ndarray : feature's format is [[1.16643378e-06 3.12126781e-06 3.39836406e-05 ... ]]
            nd_result = result[0]
            # remove row's dimension
            onedim_result = nd_result[0,]
            # pair each probability with its class index
            indexed_result = enumerate(onedim_result)
            # sort by probability, descending
            sorted_result = sorted(indexed_result, key=lambda x: x[1], reverse=True)
            # print the class names of the top-5 probabilities
            print('********************')
            for top in sorted_result[:5]:
                print(classes_entries[top[0]], 'confidence:', top[1])
            print('{:.2f} milliseconds'.format((stop_time - start_time) * 1000))
Example #26
def optimize_model(config_path,
                   checkpoint_path,
                   use_trt=True,
                   force_nms_cpu=True,
                   replace_relu6=True,
                   remove_assert=True,
                   override_nms_score_threshold=None,
                   override_resizer_shape=[600, 600],
                   max_batch_size=1,
                   precision_mode='FP32',
                   minimum_segment_size=50,
                   max_workspace_size_bytes=1 << 32,
                   maximum_cached_engines=100,
                   calib_images_dir=None,
                   num_calib_images=None,
                   calib_image_shape=None,
                   tmp_dir='.optimize_model_tmp_dir',
                   remove_tmp_dir=True,
                   output_path=None,
                   display_every=100):
    """Optimizes an object detection model using TensorRT

    Optimizes an object detection model using TensorRT.  This method also
    performs pre-TensorRT optimizations specific to the TensorFlow Object
    Detection API models.  Please see the list of arguments for other
    optimization parameters.

    Args
    ----
        config_path: A string representing the path of the object detection
            pipeline config file.
        checkpoint_path: A string representing the path of the object
            detection model checkpoint.
        use_trt: A boolean representing whether to optimize with TensorRT. If
            False, regular TensorFlow will be used but other optimizations
            (like NMS device placement) will still be applied.
        force_nms_cpu: A boolean indicating whether to place NMS operations on
            the CPU.
        replace_relu6: A boolean indicating whether to replace relu6(x)
            operations with relu(x) - relu(x-6).
        remove_assert: A boolean indicating whether to remove Assert
            operations from the graph.
        override_nms_score_threshold: An optional float representing
            a NMS score threshold to override that specified in the object
            detection configuration file.
        override_resizer_shape: An optional list/tuple of integers
            representing a fixed shape to override the default image resizer
            specified in the object detection configuration file.
        max_batch_size: An integer representing the max batch size to use for
            TensorRT optimization.
        precision_mode: A string representing the precision mode to use for
            TensorRT optimization.  Must be one of 'FP32', 'FP16', or 'INT8'.
        minimum_segment_size: An integer representing the minimum segment size
            to use for TensorRT graph segmentation.
        max_workspace_size_bytes: An integer representing the max workspace
            size for TensorRT optimization.
        maximum_cached_engines: An integer representing the number of TRT
            engines that can be stored in the cache.
        calib_images_dir: A string representing a directory containing images
            to use for INT8 calibration.
        num_calib_images: An integer representing the number of calibration
            images to use.  If None, all images in the directory are used.
        calib_image_shape: A tuple of integers (height, width) that images
            will be resized to for calibration.
        tmp_dir: A string representing a directory for temporary files.  This
            directory will be created and removed by this function and should
            not already exist.  If the directory exists, an error will be
            thrown.
        remove_tmp_dir: A boolean indicating whether to remove tmp_dir if it
            already exists; if False and the directory exists, an error is
            raised.
        output_path: An optional string representing the path to save the
            optimized GraphDef to.
        display_every: An integer; during calibration, progress is logged
            every display_every iterations.

    Returns
    -------
        A GraphDef representing the optimized model.
    """
    if max_batch_size > 1 and calib_image_shape is None:
        raise RuntimeError(
            'Fixed calibration image shape must be provided for max_batch_size > 1'
        )
    if os.path.exists(tmp_dir):
        if not remove_tmp_dir:
            raise RuntimeError(
                'Cannot create temporary directory, path exists: %s' % tmp_dir)
        subprocess.call(['rm', '-rf', tmp_dir])

    # load config from file
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, 'r') as f:
        text_format.Merge(f.read(), config, allow_unknown_extension=True)

    # override some config parameters
    if config.model.HasField('ssd'):
        config.model.ssd.feature_extractor.override_base_feature_extractor_hyperparams = True
        if override_nms_score_threshold is not None:
            config.model.ssd.post_processing.batch_non_max_suppression.score_threshold = override_nms_score_threshold
        if override_resizer_shape is not None:
            config.model.ssd.image_resizer.fixed_shape_resizer.height = override_resizer_shape[
                0]
            config.model.ssd.image_resizer.fixed_shape_resizer.width = override_resizer_shape[
                1]
    elif config.model.HasField('faster_rcnn'):
        if override_nms_score_threshold is not None:
            config.model.faster_rcnn.second_stage_post_processing.batch_non_max_suppression.score_threshold = override_nms_score_threshold
        if override_resizer_shape is not None:
            config.model.faster_rcnn.image_resizer.fixed_shape_resizer.height = override_resizer_shape[
                0]
            config.model.faster_rcnn.image_resizer.fixed_shape_resizer.width = override_resizer_shape[
                1]
            print("***************************8 image_resizer " +
                  str(config.model.faster_rcnn.image_resizer))

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    # export inference graph to file (initial), this will create tmp_dir
    with tf.Session(config=tf_config):
        with tf.Graph().as_default():
            exporter.export_inference_graph(
                INPUT_NAME,
                config,
                checkpoint_path,
                tmp_dir,
                #input_shape=[max_batch_size, 600, 340, 3])
                input_shape=[
                    max_batch_size, override_resizer_shape[0],
                    override_resizer_shape[1], 3
                ])

    # read frozen graph from file
    frozen_graph_path = os.path.join(tmp_dir, FROZEN_GRAPH_NAME)
    frozen_graph = tf.GraphDef()
    with open(frozen_graph_path, 'rb') as f:
        frozen_graph.ParseFromString(f.read())

    # apply graph modifications
    if force_nms_cpu:
        frozen_graph = f_force_nms_cpu(frozen_graph)
    if replace_relu6:
        frozen_graph = f_replace_relu6(frozen_graph)
    if remove_assert:
        frozen_graph = f_remove_assert(frozen_graph)

    # get output names
    output_names = [BOXES_NAME, CLASSES_NAME, SCORES_NAME, NUM_DETECTIONS_NAME]

    # optionally perform TensorRT optimization
    if use_trt:

        print("**************************** using tensor RT *************")

        runtimes = []
        with tf.Graph().as_default() as tf_graph:
            with tf.Session(config=tf_config) as tf_sess:
                graph_size = len(frozen_graph.SerializeToString())
                num_nodes = len(frozen_graph.node)
                start_time = time.time()
                frozen_graph = trt.create_inference_graph(
                    input_graph_def=frozen_graph,
                    outputs=output_names,
                    max_batch_size=max_batch_size,
                    max_workspace_size_bytes=max_workspace_size_bytes,
                    precision_mode=precision_mode.encode('utf-8'),
                    minimum_segment_size=minimum_segment_size,
                    is_dynamic_op=True,
                    maximum_cached_engines=maximum_cached_engines)
                end_time = time.time()
                print("graph_size(MB)(native_tf): %.1f" % (float(graph_size) /
                                                           (1 << 20)))
                print("graph_size(MB)(trt): %.1f" %
                      (float(len(frozen_graph.SerializeToString())) /
                       (1 << 20)))
                print("num_nodes(native_tf): %d" % num_nodes)
                print("num_nodes(tftrt_total): %d" % len(frozen_graph.node))
                print("num_nodes(trt_only): %d" % len([
                    1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp'
                ]))
                print("time(s) (trt_conversion): %.4f" %
                      (end_time - start_time))

                # perform calibration for int8 precision
                if precision_mode == 'INT8':

                    if calib_images_dir is None:
                        raise ValueError(
                            'calib_images_dir must be provided for int8 optimization.'
                        )

                    tf.import_graph_def(frozen_graph, name='')
                    tf_input = tf_graph.get_tensor_by_name(INPUT_NAME + ':0')
                    tf_boxes = tf_graph.get_tensor_by_name(BOXES_NAME + ':0')
                    tf_classes = tf_graph.get_tensor_by_name(CLASSES_NAME +
                                                             ':0')
                    tf_scores = tf_graph.get_tensor_by_name(SCORES_NAME + ':0')
                    tf_num_detections = tf_graph.get_tensor_by_name(
                        NUM_DETECTIONS_NAME + ':0')

                    image_paths = glob.glob(
                        os.path.join(calib_images_dir, '*.jpg'))
                    image_paths = image_paths[0:num_calib_images]

                    for image_idx in range(0, len(image_paths),
                                           max_batch_size):

                        # read batch of images
                        batch_images = []
                        for image_path in image_paths[image_idx:image_idx +
                                                      max_batch_size]:
                            image = _read_image(image_path, calib_image_shape)
                            batch_images.append(image)

                        t0 = time.time()
                        # execute batch of images
                        boxes, classes, scores, num_detections = tf_sess.run(
                            [
                                tf_boxes, tf_classes, tf_scores,
                                tf_num_detections
                            ],
                            feed_dict={tf_input: batch_images})
                        t1 = time.time()
                        runtimes.append(float(t1 - t0))
                        if len(runtimes) % display_every == 0:
                            print("    step %d/%d, iter_time(ms)=%.4f" %
                                  (len(runtimes),
                                   (len(image_paths) + max_batch_size - 1) //
                                   max_batch_size, np.mean(runtimes) * 1000))

                    frozen_graph = trt.calib_graph_to_infer_graph(frozen_graph)

    # re-enable variable batch size, this was forced to max
    # batch size during export to enable TensorRT optimization
    #for node in frozen_graph.node:
    #    if INPUT_NAME == node.name:
    #        node.attr['shape'].shape.dim[0].size = -1

    # write optimized model to disk
    if output_path is not None:
        with open(output_path, 'wb') as f:
            f.write(frozen_graph.SerializeToString())
        export_dir = os.path.join(os.path.dirname(output_path), 'saved_model')
        subprocess.call(['rm', '-rf', export_dir])

        with tf.Session(graph=tf.Graph()) as session:
            tf.import_graph_def(frozen_graph, name='')
            tf_input = session.graph.get_tensor_by_name(INPUT_NAME + ':0')
            tf_boxes = session.graph.get_tensor_by_name(BOXES_NAME + ':0')
            tf_classes = session.graph.get_tensor_by_name(CLASSES_NAME + ':0')
            tf_scores = session.graph.get_tensor_by_name(SCORES_NAME + ':0')
            tf_num_detections = session.graph.get_tensor_by_name(
                NUM_DETECTIONS_NAME + ':0')

            tf.saved_model.simple_save(session,
                                       export_dir,
                                       inputs={'inputs': tf_input},
                                       outputs={
                                           BOXES_NAME: tf_boxes,
                                           CLASSES_NAME: tf_classes,
                                           SCORES_NAME: tf_scores,
                                           NUM_DETECTIONS_NAME:
                                           tf_num_detections
                                       })

    return frozen_graph
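A hypothetical invocation of optimize_model, sketched only to show how the arguments documented above fit together; the config, checkpoint, and output paths are placeholders, not files referenced elsewhere in this example.

optimized_graph_def = optimize_model(
    config_path='pipeline.config',        # object detection pipeline config (placeholder)
    checkpoint_path='model.ckpt',         # trained checkpoint prefix (placeholder)
    use_trt=True,
    precision_mode='FP16',                # FP32/FP16 need no calibration data
    max_batch_size=1,
    output_path='detector_trt_fp16.pb')   # also exports a SavedModel next to this file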
Example #27
def get_trt_graph_from_calib(graph_name, calib_graph_def, output_dir):
  """Convert a TensorRT graph used for calibration to an inference graph."""
  trt_graph = trt.calib_graph_to_infer_graph(calib_graph_def)
  write_graph_to_file(graph_name, trt_graph, output_dir)
  return trt_graph
Example #28
      input_graph_def=orig_graph,
      outputs=["output"],
      max_batch_size=inp_dims[0],
      max_workspace_size_bytes=1 << 25,
      precision_mode="FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
      minimum_segment_size=2  # minimum number of nodes in an engine
  )
  '''
    int8_calib_gdef = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="INT8",  # TRT Engine precision "FP32","FP16" or "INT8"
        minimum_segment_size=2  # minimum number of nodes in an engine
    )
    print("##############################################")
    #o4 = run_graph(fp16_graph, dummy_input)
    _ = run_calibration(int8_calib_gdef, dummy_input)
    #_ = run_graph(int8_calib_gdef, dummy_input)
    int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
    o5 = run_graph(int8_graph, dummy_input)
    #assert np.allclose(o1, o4)
    #assert np.allclose(o1, o5)
    print(o1)
    print(
        "----------------------------------------------------------------------------"
    )
    print(o5)
    print("Pass")
Example #29
def getINT8InferenceGraph(calibGraph):
  trt_graph = trt.calib_graph_to_infer_graph(calibGraph)
  with gfile.FastGFile("resnetV150_TRTINT8.pb", 'wb') as f:
    f.write(trt_graph.SerializeToString())
  return trt_graph
Example #30
 def build_trt_forward_pass_graph(self, input_tensors, gpu_id=0,
                                  checkpoint=None):
     """Wrapper around _build_forward_pass_graph which converts graph using
     TF-TRT"""
     import tensorflow.contrib.tensorrt as trt
     # Default parameters
     trt_params = {
         "batch_size_per_gpu": 64,
         "trt_max_workspace_size_bytes": (4096 << 20) - 1000,
         "trt_precision_mode": "FP32",
         "trt_minimum_segment_size": 10,
         "trt_is_dynamic_op": True,
         "trt_maximum_cached_engines": 1
     }
     # Update params from user config
     for key in trt_params:
         if key in self.params:
             trt_params[key] = self.params[key]
     # Create temporary graph which will contain the native TF graph
     tf_config = tf.ConfigProto()
     tf_config.gpu_options.allow_growth = True
     temp_graph = tf.Graph()
     input_map = {}
     # We have to deconstruct SparseTensors into their 3 internal tensors
     # (indices, values, dense_shape). This maps each tensor name to a list of
     # all 3 tensor names in its SparseTensor.
     output_sparse_tensor_map = {}
     with temp_graph.as_default() as tf_graph:
         with tf.Session(config=tf_config) as tf_sess:
             # Create temporary input placeholders used to build native TF graph
             input_placeholders = {'source_tensors': []}
             for i, original_input in enumerate(input_tensors['source_tensors']):
                 name = 'input_map_%d' % i
                 input_placeholders['source_tensors'].append(
                     tf.placeholder(shape=original_input.shape,
                                    dtype=original_input.dtype,
                                    name=name))
                 # And map it back to original input
                 input_map[name] = original_input
             # Build native graph
             loss, outputs = self._build_forward_pass_graph(
                 input_placeholders,
                 gpu_id=gpu_id
             )
             # Gather output tensors
             output_node_names = []
             output_node_names_and_ports = []
             for x in outputs:
                 if isinstance(x, tf.SparseTensor):
                     components = [x.indices.name,
                                   x.values.name, x.dense_shape.name]
                     fetch_names = [tensor.split(':')[0]
                                    for tensor in components]
                     # Remove duplicates (i.e. if SparseTensor is output of one node)
                     fetch_names = list(set(fetch_names))
                     output_node_names.extend(fetch_names)
                     output_node_names_and_ports.extend(components)
                     # Add all components to map so SparseTensor can be reconstructed
                     # from tensor components which will be outputs of new graph
                     for tensor in components:
                         output_sparse_tensor_map[tensor] = components
                 else:
                     output_node_names.append(x.name.split(':')[0])
                     output_node_names_and_ports.append(x.name)
             # Restore checkpoint here because we have to freeze the graph
             tf_saver = tf.train.Saver()
             tf_saver.restore(save_path=checkpoint, sess=tf_sess)
             frozen_graph = tf.graph_util.convert_variables_to_constants(
                 tf_sess,
                 tf_sess.graph_def,
                 output_node_names=output_node_names
             )
             num_nodes = len(frozen_graph.node)
             print('Converting graph using TensorFlow-TensorRT...')
             frozen_graph = trt.create_inference_graph(
                 input_graph_def=frozen_graph,
                 outputs=output_node_names,
                 max_batch_size=trt_params["batch_size_per_gpu"],
                 max_workspace_size_bytes=trt_params["trt_max_workspace_size_bytes"],
                 precision_mode=trt_params["trt_precision_mode"],
                 minimum_segment_size=trt_params["trt_minimum_segment_size"],
                 is_dynamic_op=trt_params["trt_is_dynamic_op"],
                 maximum_cached_engines=trt_params["trt_maximum_cached_engines"]
             )
             # Remove unused inputs from input_map.
             inputs_to_remove = []
             for k in input_map:
                 if k not in [node.name for node in frozen_graph.node]:
                     inputs_to_remove.append(k)
             for k in inputs_to_remove:
                 del input_map[k]
             print('Total node count before and after TF-TRT conversion:',
                   num_nodes, '->', len(frozen_graph.node))
             print('TRT node count:',
                   len([1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp']))
     # Perform calibration for INT8 precision mode
     if self.params.get("trt_precision_mode", "FP32").upper() == 'INT8':
         with tf.Session(config=tf_config) as tf_sess:
             calib_graph = frozen_graph
             num_iterations = 10
             print('Calibrating INT8...')
             outputs = tf.import_graph_def(
                 calib_graph,
                 input_map=input_map,
                 return_elements=output_node_names_and_ports,
                 name='')
             self._num_objects_per_step = [self._get_num_objects_per_step(worker_id)
                                           for worker_id in range(self.num_gpus)]
             results_per_batch = iterate_data(
                 self, tf_sess, compute_loss=False, mode='infer', verbose=False,
                 num_steps=num_iterations
             )
             frozen_graph = trt.calib_graph_to_infer_graph(calib_graph)
             del calib_graph
             print('INT8 graph created.')
             print('Nodes INT8:', len(frozen_graph.node))
     # Import TRT converted graph to default graph, mapping it to the original
     # input tensors.
     outputs = tf.import_graph_def(
         frozen_graph,
         input_map=input_map,
         return_elements=output_node_names_and_ports,
         name='')
     # Reconstruct SparseTensors
     final_outputs = []
     for tensor in outputs:
         if tensor.name in output_sparse_tensor_map:
             component_names = output_sparse_tensor_map[tensor.name]
             # Find tensors in outputs for components
             component_tensors = [[x for x in outputs if x.name == name][0]
                                  for name in component_names]
             # Remove all components from outputs so we don't create duplicates of
             # this SparseTensor
             for x in component_tensors:
                 if x in outputs:
                     outputs.remove(x)
             final_outputs.append(tf.SparseTensor(*component_tensors))
         else:
             final_outputs.append(tensor)
     return loss, final_outputs
 def get_trt_from_calib(self, grf_calib):
     grf_trt = trt.calib_graph_to_infer_graph(grf_calib)
     return grf_trt
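The TF-TRT settings consumed by build_trt_forward_pass_graph above are read from self.params. A hypothetical fragment of such a params dict is shown below; the keys are the same ones listed in trt_params, but the values are purely illustrative.

params = {
    # ... other model and run parameters ...
    "batch_size_per_gpu": 32,
    "trt_precision_mode": "INT8",            # INT8 triggers the calibration pass
    "trt_max_workspace_size_bytes": 1 << 32,
    "trt_minimum_segment_size": 5,
    "trt_is_dynamic_op": True,
    "trt_maximum_cached_engines": 1,
}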
Example #32
def convert_int8(input_model_dir, output_model_dir, batch_size, precision_mode,
                 calib_image_dir, input_tensor, output_tensor, epochs):

    # TODO: check whether a Tesla T4 is required for this conversion.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Get path to calibration data.
    calibration_files = get_calibration_files(calib_image_dir, 'validation*')

    # Create the dataset and apply preprocessing.
    # TODO: tune num_parallel_calls (currently set to the CPU count).
    dataset = tf.data.TFRecordDataset(calibration_files)
    dataset = dataset.apply(
        tf.contrib.data.map_and_batch(
            map_func=preprocess,
            batch_size=batch_size,
            num_parallel_calls=multiprocessing.cpu_count()))
    """
  Step 1: Creating the calibration graph.
  """

    # Create TF-TRT INT8 calibration graph.
    trt_int8_calib_graph = trt.create_inference_graph(
        input_graph_def=None,
        outputs=[output_tensor],
        max_batch_size=batch_size,
        input_saved_model_dir=input_model_dir,
        precision_mode=precision_mode)

    # Calibrate graph.
    with tf.Session(graph=tf.Graph(), config=config) as sess:
        tf.logging.info('preparing calibration data...')
        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()

        tf.logging.info('Loading INT8 calibration graph...')
        output_node = tf.import_graph_def(trt_int8_calib_graph,
                                          return_elements=[output_tensor],
                                          name='')

        tf.logging.info('Calibrate model with calibration data...')
        for _ in range(epochs):
            sess.run(output_node,
                     feed_dict={input_tensor: sess.run(next_element)[0]})
    """
  Step 2: Converting the calibration graph to inference graph
  """
    tf.logging.info('Creating TF-TRT INT8 inference engine...')
    trt_int8_calibrated_graph = trt.calib_graph_to_infer_graph(
        trt_int8_calib_graph)

    # Copy MetaGraph from base model.
    with tf.Session(graph=tf.Graph(), config=config) as sess:
        base_model = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], input_model_dir)

        metagraph = tf.MetaGraphDef()
        metagraph.graph_def.CopyFrom(trt_int8_calibrated_graph)
        for key in base_model.collection_def:
            if key not in [
                    'variables', 'local_variables', 'model_variables',
                    'trainable_variables', 'train_op', 'table_initializer'
            ]:
                metagraph.collection_def[key].CopyFrom(
                    base_model.collection_def[key])

        metagraph.meta_info_def.CopyFrom(base_model.meta_info_def)
        for key in base_model.signature_def:
            metagraph.signature_def[key].CopyFrom(
                base_model.signature_def[key])

    saved_model_builder = (
        tf.saved_model.builder.SavedModelBuilder(output_model_dir))

    # Write SavedModel with INT8 precision.
    with tf.Graph().as_default():
        tf.graph_util.import_graph_def(trt_int8_calibrated_graph,
                                       return_elements=[output_tensor],
                                       name='')
        with tf.Session(config=config) as sess:
            saved_model_builder.add_meta_graph_and_variables(
                sess, ('serve', ), signature_def_map=metagraph.signature_def)

    # Ignore other meta graphs from the input SavedModel.
    saved_model_builder.save()
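A hypothetical call to convert_int8; the SavedModel directories, TFRecord directory, tensor names, and epoch count below are placeholders chosen for illustration.

convert_int8(
    input_model_dir='resnet_savedmodel',             # source SavedModel (placeholder)
    output_model_dir='resnet_savedmodel_trt_int8',   # destination SavedModel (placeholder)
    batch_size=8,
    precision_mode='INT8',
    calib_image_dir='imagenet_validation_tfrecords',  # directory of validation* TFRecord files
    input_tensor='input:0',
    output_tensor='softmax_tensor:0',
    epochs=10)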
Example #33
                       graph=gx) as sess:
        # run over real calibration data here; this mimics a calibration set of
        # 200 different batches. Use as much calibration data as you want.
        for j in range(200):
            val = sess.run(out, {inp: [dumm_inp[j]]})
    return val


mnist__train, temppp = tf.keras.datasets.mnist.load_data()
imagesss, labels = mnist__train[0], mnist__train[1]
imagesss = imagesss.astype('float32')
imagesss /= 255.0
imagesss = np.reshape(imagesss, [60000, 28, 28, 1])

_ = run_calibration(trt_graph_def, imagesss)
trt_graph_def = trt.calib_graph_to_infer_graph(
    trt_graph_def)  # only needed for 'INT8' precision mode

print('Generated TensorRT graph def')

#
# Generate tensor with TensorRT graph def
#
tf.reset_default_graph()
g2 = tf.Graph()
with g2.as_default():
    trt_x, trt_y = tf.import_graph_def(trt_graph_def,
                                       return_elements=['x:0', 'output:0'])
print('Generated tensor for TensorRT optimized graph')

#