def testINT8(self):
    """Test INT8 conversion. Results may be different from native case."""
    calib_graph = self.get_trt_graph("INT8")
    result = self.run_calibration(calib_graph, self._input)
    self.assertAllEqual(self._reference, result)
    int8_graph = trt.calib_graph_to_infer_graph(calib_graph)
    result = self.run_graph(int8_graph, self._input)
    self.assertAllClose(self._reference, result, rtol=1.e-03)
    result1 = self.run_graph(int8_graph, self._input)
    self.assertAllEqual(result1, result)
def _RunTest(self, graph_key, use_optimizer, precision_mode,
             dynamic_infer_engine, dynamic_calib_engine):
    assert precision_mode in [MODE_FP32, MODE_FP16, MODE_INT8]
    input_gdef = TEST_GRAPHS[graph_key].gdef
    self._VerifyGraphDef(graph_key, input_gdef)

    # Get reference result without running trt.
    config_no_trt = self._GetConfigProto(False)
    print("Running original graph w/o trt, config:\n%s" % str(config_no_trt))
    ref_result = self._RunGraph(graph_key, input_gdef, self._input,
                                config_no_trt)

    # Run calibration if necessary.
    if precision_mode == MODE_INT8:
        calib_config = self._GetConfigProto(use_optimizer, precision_mode,
                                            dynamic_calib_engine)
        print("Running calibration graph, config:\n%s" % str(calib_config))
        if use_optimizer:
            self.assertTrue(False)
            # TODO(aaroey): uncomment this and get infer_gdef when this mode is
            # supported.
            # result = self._RunCalibration(graph_key, input_gdef, self._input,
            #                               calib_config)
        else:
            calib_gdef = self._GetTrtGraph(input_gdef, precision_mode,
                                           dynamic_calib_engine)
            self._VerifyGraphDef(graph_key, calib_gdef, precision_mode, False,
                                 dynamic_calib_engine)
            result = self._RunCalibration(graph_key, calib_gdef, self._input,
                                          calib_config)
            infer_gdef = trt.calib_graph_to_infer_graph(calib_gdef)
            self._VerifyGraphDef(graph_key, infer_gdef, precision_mode, True,
                                 dynamic_calib_engine)
        self.assertAllClose(ref_result, result, rtol=1.e-03)
    else:
        infer_gdef = input_gdef

    # Run inference.
    infer_config = self._GetConfigProto(use_optimizer, precision_mode,
                                        dynamic_infer_engine)
    print("Running final inference graph, config:\n%s" % str(infer_config))
    if use_optimizer:
        result = self._RunGraph(graph_key, infer_gdef, self._input,
                                infer_config)
    else:
        trt_infer_gdef = self._GetTrtGraph(infer_gdef, precision_mode,
                                           dynamic_infer_engine)
        self._VerifyGraphDef(graph_key, trt_infer_gdef, precision_mode, True,
                             dynamic_infer_engine)
        result = self._RunGraph(graph_key, trt_infer_gdef, self._input,
                                infer_config)
    self.assertAllClose(ref_result, result, rtol=1.e-03)
def user(multi_engine,
         run_graph=execute_graph,
         run_calibration=execute_calibration):
    """Example function that converts a graph to TFTRT graph."""
    if multi_engine:
        inp_dims = (2, 3, 7, 5)
        orig_graph = get_multi_engine_graph_def()
    else:
        inp_dims = (100, 24, 24, 2)
        orig_graph = get_simple_graph_def()  # use a frozen graph for inference
    dummy_input = np.random.random_sample(inp_dims)
    # Get optimized graph
    trt_graph = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="FP32",  # TRT Engine precision "FP32","FP16" or "INT8"
        minimum_segment_size=2,  # minimum number of nodes in an engine
        is_dynamic_op=False,
        maximum_cached_engines=1,
        cached_engine_batch_sizes=[])
    o1 = run_graph(orig_graph, dummy_input)
    o2 = run_graph(trt_graph, dummy_input)
    o3 = run_graph(trt_graph, dummy_input)
    assert np.array_equal(o1, o2)
    assert np.array_equal(o3, o2)  # sanity check
    fp16_graph = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
        minimum_segment_size=2,  # minimum number of nodes in an engine
        is_dynamic_op=False,
        maximum_cached_engines=1,
        cached_engine_batch_sizes=[])
    int8_calib_gdef = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="INT8",  # TRT Engine precision "FP32","FP16" or "INT8"
        minimum_segment_size=2,  # minimum number of nodes in an engine
        is_dynamic_op=False,
        maximum_cached_engines=1,
        cached_engine_batch_sizes=[])
    o4 = run_graph(fp16_graph, dummy_input)
    _ = run_calibration(int8_calib_gdef, dummy_input)
    int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
    o5 = run_graph(int8_graph, dummy_input)
    print("Is FP32 == FP16? %s (False is possible)" % np.allclose(o1, o4))
    print("Is FP32 == INT8? %s (False is possible)" % np.allclose(o1, o5))
    print("Pass")
def getINT8InferenceGraph():
    with tf.gfile.FastGFile('/host_temp/test/test_INT8_batch1_trt_graph.pb',
                            'rb') as f:
        calibGraph = tf.GraphDef()
        calibGraph.ParseFromString(f.read())
    trt_graph = trt.calib_graph_to_infer_graph(calibGraph)
    with gfile.FastGFile("CPN_TRTINT8.pb", 'wb') as f:
        f.write(trt_graph.SerializeToString())
    return trt_graph
def convert_tensorrt_speedup_graph(input_graph_path, output_graph_path,
                                   data_type="FP32", calibrate_img_dir=""):
    output_node_names = ["score_list"]
    batch_size = 1
    workspace_size = 1 << 30
    precision = data_type
    trt_graph = trt.create_inference_graph(
        input_graph_def=get_graph_definition(input_graph_path),
        outputs=output_node_names,
        max_batch_size=batch_size,
        max_workspace_size_bytes=workspace_size,
        precision_mode=precision,
        minimum_segment_size=3)
    if data_type == "FP32" or data_type == "FP16":
        # save the new graph transformed by tensorRT
        with gfile.FastGFile(output_graph_path, "wb") as f:
            f.write(trt_graph.SerializeToString())  # serialize and write the output graph
        print("convert tensorrt {} speed up graph finished".format(data_type))
    elif data_type == "INT8":
        calib_graph = tf.Graph()
        with calib_graph.as_default():
            tf.import_graph_def(trt_graph, name='')
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        # config.gpu_options.per_process_gpu_memory_fraction = 0.50
        sess = tf.Session(graph=calib_graph, config=config)
        inputs = calib_graph.get_tensor_by_name('inputs:0')
        is_training = calib_graph.get_tensor_by_name('is_training:0')
        prediction = calib_graph.get_tensor_by_name('score_list:0')
        # calibrate graph
        image_files = glob.glob(os.path.join(calibrate_img_dir, '*.*'))
        for image_path in image_files:
            preprocess_and_inference(image_path, sess, inputs, is_training,
                                     prediction)
        infer_graph = trt.calib_graph_to_infer_graph(trt_graph)
        with gfile.FastGFile(output_graph_path, 'wb') as f:
            f.write(infer_graph.SerializeToString())
        print("convert tensorrt {} speed up graph finished".format(data_type))
    else:
        print("data_type error, return")
def __init__(self, graph_path, use_tensorrt=False, fp_mode='FP32',
             target_size=(320, 240)):
    self.target_size = target_size
    print('fp mode is {}'.format(fp_mode))
    graph_def = None
    if use_tensorrt and tf.__version__ > '1.7':
        graph_def = trt.create_inference_graph(
            input_graph_def=load_graph(graph_path),
            outputs=['Openpose/concat_stage7'],
            max_batch_size=1,
            # original had "1 >> 16", which evaluates to 0 bytes; a left
            # shift (64 KB) is presumably what was intended
            max_workspace_size_bytes=1 << 16,
            precision_mode=fp_mode)
        if fp_mode == "INT8":
            graph_def = trt.calib_graph_to_infer_graph(graph_def)
    else:
        with tf.gfile.GFile(graph_path, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
    self.graph = tf.get_default_graph()
    tf.import_graph_def(graph_def, name='')
    self.persistent_sess = tf.Session(graph=self.graph)
    # for op in self.graph.get_operations():
    #     print(op.name)
    self.tensor_image = self.graph.get_tensor_by_name('image:0')
    self.tensor_output = self.graph.get_tensor_by_name(
        'Openpose/concat_stage7:0')
    self.heatMat = self.pafMat = None
    # warm-up
    self.persistent_sess.run(
        self.tensor_output,
        feed_dict={
            self.tensor_image: [
                np.ndarray(shape=(target_size[1], target_size[0], 3),
                           dtype=np.float32)
            ]
        })
def user(run_graph=execute_graph, run_calibration=execute_calibration):
    """Example function that converts a graph to TFTRT graph."""
    inp_dims = (100, 24, 24, 2)
    dummy_input = np.random.random_sample(inp_dims)
    orig_graph = get_simple_graph_def()  # use a frozen graph for inference
    # Get optimized graph
    trt_graph = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="FP32",  # TRT Engine precision "FP32","FP16" or "INT8"
        minimum_segment_size=2  # minimum number of nodes in an engine
    )
    o1 = run_graph(orig_graph, dummy_input)
    o2 = run_graph(trt_graph, dummy_input)
    o3 = run_graph(trt_graph, dummy_input)
    assert np.array_equal(o1, o2)
    assert np.array_equal(o3, o2)  # sanity check
    fp16_graph = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
        minimum_segment_size=2  # minimum number of nodes in an engine
    )
    int8_calib_gdef = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="INT8",  # TRT Engine precision "FP32","FP16" or "INT8"
        minimum_segment_size=2  # minimum number of nodes in an engine
    )
    o4 = run_graph(fp16_graph, dummy_input)
    _ = run_calibration(int8_calib_gdef, dummy_input)
    int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
    o5 = run_graph(int8_graph, dummy_input)
    assert np.allclose(o1, o4)
    assert np.allclose(o1, o5)
    print("Pass")
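# The example above takes run_graph/run_calibration callables (execute_graph /
# execute_calibration) that are not shown in this listing. Below is a minimal
# sketch of what such helpers could look like, assuming the graph exposes node
# names "input" and "output"; it is an illustration, not the original helpers.
import numpy as np
import tensorflow as tf


def execute_graph_sketch(gdef, dummy_input):
    """Import a GraphDef and run it once on dummy_input (hypothetical helper)."""
    g = tf.Graph()
    with g.as_default():
        inp, out = tf.import_graph_def(graph_def=gdef,
                                       return_elements=["input", "output"])
        inp, out = inp.outputs[0], out.outputs[0]
    with tf.Session(graph=g) as sess:
        return sess.run(out, {inp: dummy_input})


def execute_calibration_sketch(calib_gdef, dummy_input, num_runs=10):
    """Feed batches through the INT8 calibration graph (hypothetical helper)."""
    g = tf.Graph()
    with g.as_default():
        inp, out = tf.import_graph_def(graph_def=calib_gdef,
                                       return_elements=["input", "output"])
        inp, out = inp.outputs[0], out.outputs[0]
    val = None
    with tf.Session(graph=g) as sess:
        for _ in range(num_runs):
            val = sess.run(out, {inp: dummy_input})
    return val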
def get_trt_graph_from_calib(graph_name, calib_graph_def, output_dir):
    """Convert a TensorRT graph used for calibration to an inference graph."""
    trt_graph = trt.calib_graph_to_infer_graph(calib_graph_def)
    write_graph_to_file(graph_name, trt_graph, output_dir)
    return trt_graph
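# write_graph_to_file is referenced above but not defined in this listing; a
# minimal sketch of such a helper, assuming it simply serializes the GraphDef
# to "<output_dir>/<graph_name>" (hypothetical name and layout).
import os
import tensorflow as tf


def write_graph_to_file_sketch(graph_name, graph_def, output_dir):
    """Serialize a GraphDef to a .pb file (hypothetical helper)."""
    output_path = os.path.join(output_dir, graph_name)
    with tf.gfile.GFile(output_path, "wb") as f:
        f.write(graph_def.SerializeToString())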
def build_forward_pass_graph(self, input_tensors, gpu_id=0, checkpoint=None, use_trt=False, precision='FP32'): """Wrapper around _build_forward_pass_graph with option of using TF-TRT""" if use_trt: import tensorflow.contrib.tensorrt as trt # Create temporary graph which will contain the native TF graph tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True temp_graph = tf.Graph() with temp_graph.as_default() as tf_graph: with tf.Session(config=tf_config) as tf_sess: input_placeholders = { 'source_tensors': [ tf.placeholder(shape=(None, None), dtype=tf.int32, name='input_map1'), tf.placeholder(shape=(None, None), dtype=tf.int32, name='input_map2') ] } loss, self._outputs[ gpu_id] = self._build_forward_pass_graph( input_placeholders, gpu_id=gpu_id) output_node_names = [ x.name.split(':0')[0] for x in self._outputs[gpu_id] ] # Restore checkpoint here because we have to freeze the graph tf_saver = tf.train.Saver() tf_saver.restore(save_path=checkpoint, sess=tf_sess) frozen_graph = tf.graph_util.convert_variables_to_constants( tf_sess, tf_sess.graph_def, output_node_names=output_node_names) num_nodes = len(frozen_graph.node) print('Converting graph using TensorFlow-TensorRT...') frozen_graph = trt.create_inference_graph( input_graph_def=frozen_graph, outputs=output_node_names, max_batch_size=64, max_workspace_size_bytes=4096 << 20, precision_mode=precision, minimum_segment_size=3) print( 'Total node count before and after TF-TRT conversion:', num_nodes, '->', len(frozen_graph.node)) print( 'TRT node count:', len([ 1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp' ])) # Perform calibration for INT8 precision mode if precision == 'int8': with tf.Session(config=tf_config) as tf_sess: calib_graph = frozen_graph num_iterations = 10 print('Calibrating INT8...') self._outputs[gpu_id] = tf.import_graph_def( calib_graph, input_map={ 'input_map1': input_tensors['source_tensors'][0] }, return_elements=[x + ':0' for x in output_node_names], name='') self._num_objects_per_step = [ self._get_num_objects_per_step(worker_id) for worker_id in range(self.num_gpus) ] results_per_batch = iterate_data(self, tf_sess, compute_loss=False, mode='infer', verbose=False, num_steps=num_iterations) frozen_graph = trt.calib_graph_to_infer_graph(calib_graph) del calib_graph print('INT8 graph created.') print('Nodes INT8:', len(frozen_graph.node)) # Import TRT converted graph to default graph, mapping it to the original input tensors self._outputs[gpu_id] = tf.import_graph_def( frozen_graph, input_map={'input_map1': input_tensors['source_tensors'][0]}, return_elements=[x + ':0' for x in output_node_names], name='') return loss, self._outputs[gpu_id] else: return self._build_forward_pass_graph(input_tensors, gpu_id)
def getINT8InferenceGraph(calibGraph):
    trt_graph = trt.calib_graph_to_infer_graph(calibGraph)
    with gfile.FastGFile("resnetV250_TRTINT8_chest.pb", 'wb') as f:
        f.write(trt_graph.SerializeToString())
    return trt_graph
def get_frozen_graph(model, use_trt=False, use_dynamic_op=False, precision='fp32', batch_size=8, minimum_segment_size=2, calib_data_dir=None, num_calib_inputs=None, use_synthetic=False, cache=False, download_dir='./data'): """Retreives a frozen GraphDef from model definitions in classification.py and applies TF-TRT model: str, the model name (see NETS table in classification.py) use_trt: bool, if true, use TensorRT precision: str, floating point precision (fp32, fp16, or int8) batch_size: int, batch size for TensorRT optimizations returns: tensorflow.GraphDef, the TensorRT compatible frozen graph """ num_nodes = {} times = {} # Load from pb file if frozen graph was already created and cached if cache: # Graph must match the model, TRT mode, precision, and batch size prebuilt_graph_path = "graphs/frozen_graph_%s_%d_%s_%d.pb" % ( model, int(use_trt), precision, batch_size) if os.path.isfile(prebuilt_graph_path): print('Loading cached frozen graph from \'%s\'' % prebuilt_graph_path) start_time = time.time() with tf.gfile.GFile(prebuilt_graph_path, "rb") as f: frozen_graph = tf.GraphDef() frozen_graph.ParseFromString(f.read()) times['loading_frozen_graph'] = time.time() - start_time num_nodes['loaded_frozen_graph'] = len(frozen_graph.node) num_nodes['trt_only'] = len( [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp']) return frozen_graph, num_nodes, times # Build graph and load weights #frozen_graph = build_classification_graph(model, download_dir) model_dir = os.path.join(os.environ['APP_HOME'], "Modules", "Deep-Learning", "packages", "models") frozen_graph = create_graph(model_dir, FLAGS.frozen_graph) num_nodes['native_tf'] = len(frozen_graph.node) # Convert to TensorRT graph if use_trt: start_time = time.time() frozen_graph = trt.create_inference_graph( input_graph_def=frozen_graph, outputs=['resnet_v1_50/SpatialSqueeze:0'], max_batch_size=batch_size, max_workspace_size_bytes=(4096 << 20) - 1000, precision_mode=precision, minimum_segment_size=minimum_segment_size, is_dynamic_op=use_dynamic_op) times['trt_conversion'] = time.time() - start_time num_nodes['tftrt_total'] = len(frozen_graph.node) num_nodes['trt_only'] = len( [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp']) if precision == 'int8': calib_graph = frozen_graph # INT8 calibration step print('Calibrating INT8...') batch_data = [] files_dir = 'input_images' data_dir = os.path.join(os.environ['APP_HOME'], "Modules", "Deep-Learning", "packages", files_dir) files = os.listdir(data_dir) for f in files: if f.lower().endswith(('.png', '.jpg', '.jpeg')): image_path = files_dir + '/' + f batch_data = batch_from_image(image_path, FLAGS.batch_size, batch_data) print(image_path) run_inference(batch_data, calib_graph) #run(calib_graph, model, calib_data_dir, batch_size, # num_calib_inputs // batch_size, 0, False) frozen_graph = trt.calib_graph_to_infer_graph(calib_graph) del calib_graph print('INT8 graph created.') # Cache graph to avoid long conversions each time if cache: if not os.path.exists(os.path.dirname(prebuilt_graph_path)): try: os.makedirs(os.path.dirname(prebuilt_graph_path)) except Exception as e: raise e start_time = time.time() with tf.gfile.GFile(prebuilt_graph_path, "wb") as f: f.write(frozen_graph.SerializeToString()) times['saving_frozen_graph'] = time.time() - start_time return frozen_graph, num_nodes, times
def get_frozen_graph(model, model_dir=None, use_trt=False, use_dynamic_op=False, precision='fp32', batch_size=8, minimum_segment_size=2, calib_files=None, num_calib_inputs=None, use_synthetic=False, cache=False, default_models_dir='./data', max_workspace_size=(1 << 32)): """Retreives a frozen GraphDef from model definitions in classification.py and applies TF-TRT model: str, the model name (see NETS table in classification.py) use_trt: bool, if true, use TensorRT precision: str, floating point precision (fp32, fp16, or int8) batch_size: int, batch size for TensorRT optimizations returns: tensorflow.GraphDef, the TensorRT compatible frozen graph """ num_nodes = {} times = {} graph_sizes = {} # Load from pb file if frozen graph was already created and cached if cache: # Graph must match the model, TRT mode, precision, and batch size prebuilt_graph_path = "graphs/frozen_graph_%s_%d_%s_%d.pb" % ( model, int(use_trt), precision, batch_size) if os.path.isfile(prebuilt_graph_path): print('Loading cached frozen graph from \'%s\'' % prebuilt_graph_path) start_time = time.time() with tf.gfile.GFile(prebuilt_graph_path, "rb") as f: frozen_graph = tf.GraphDef() frozen_graph.ParseFromString(f.read()) times['loading_frozen_graph'] = time.time() - start_time num_nodes['loaded_frozen_graph'] = len(frozen_graph.node) num_nodes['trt_only'] = len( [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp']) graph_sizes['loaded_frozen_graph'] = len( frozen_graph.SerializeToString()) return frozen_graph, num_nodes, times, graph_sizes # Build graph and load weights frozen_graph = build_classification_graph(model, model_dir, default_models_dir) num_nodes['native_tf'] = len(frozen_graph.node) graph_sizes['native_tf'] = len(frozen_graph.SerializeToString()) # Convert to TensorRT graph if use_trt: start_time = time.time() frozen_graph = trt.create_inference_graph( input_graph_def=frozen_graph, outputs=['logits', 'classes'], max_batch_size=batch_size, max_workspace_size_bytes=max_workspace_size, precision_mode=precision.upper(), minimum_segment_size=minimum_segment_size, is_dynamic_op=use_dynamic_op) times['trt_conversion'] = time.time() - start_time num_nodes['tftrt_total'] = len(frozen_graph.node) num_nodes['trt_only'] = len( [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp']) graph_sizes['trt'] = len(frozen_graph.SerializeToString()) if precision == 'int8': calib_graph = frozen_graph graph_sizes['calib'] = len(calib_graph.SerializeToString()) # INT8 calibration step print('Calibrating INT8...') start_time = time.time() run(calib_graph, model, calib_files, batch_size, num_calib_inputs // batch_size, 0, use_synthetic=use_synthetic, run_calibration=True) times['trt_calibration'] = time.time() - start_time start_time = time.time() frozen_graph = trt.calib_graph_to_infer_graph(calib_graph) times['trt_int8_conversion'] = time.time() - start_time # This is already set but overwriting it here to ensure the right size graph_sizes['trt'] = len(frozen_graph.SerializeToString()) del calib_graph print('INT8 graph created.') # Cache graph to avoid long conversions each time if cache: if not os.path.exists(os.path.dirname(prebuilt_graph_path)): try: os.makedirs(os.path.dirname(prebuilt_graph_path)) except Exception as e: raise e start_time = time.time() with tf.gfile.GFile(prebuilt_graph_path, "wb") as f: f.write(frozen_graph.SerializeToString()) times['saving_frozen_graph'] = time.time() - start_time return frozen_graph, num_nodes, times, graph_sizes
def get_frozen_graph(model, model_dir=None, use_trt=False, use_dynamic_op=False, precision='fp32', batch_size=8, minimum_segment_size=2, calib_files=None, num_calib_inputs=None, use_synthetic=False, cache=False, default_models_dir='./data', max_workspace_size=(1 << 32)): """Retreives a frozen GraphDef from model definitions in classification.py and applies TF-TRT model: str, the model name (see NETS table in classification.py) use_trt: bool, if true, use TensorRT precision: str, floating point precision (fp32, fp16, or int8) batch_size: int, batch size for TensorRT optimizations returns: tensorflow.GraphDef, the TensorRT compatible frozen graph """ num_nodes = {} times = {} graph_sizes = {} # Load from pb file if frozen graph was already created and cached if cache: # Graph must match the model, TRT mode, precision, and batch size prebuilt_graph_path = "graphs/frozen_graph_%s_%d_%s_%d.pb" % ( model, int(use_trt), precision, batch_size) if os.path.isfile(prebuilt_graph_path): print('Loading cached frozen graph from \'%s\'' % prebuilt_graph_path) start_time = time.time() with tf.gfile.GFile(prebuilt_graph_path, "rb") as f: frozen_graph = tf.GraphDef() frozen_graph.ParseFromString(f.read()) times['loading_frozen_graph'] = time.time() - start_time num_nodes['loaded_frozen_graph'] = len(frozen_graph.node) num_nodes['trt_only'] = len( [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp']) graph_sizes['loaded_frozen_graph'] = len( frozen_graph.SerializeToString()) return frozen_graph, num_nodes, times, graph_sizes # Build graph and load weights frozen_graph = build_classification_graph(model, model_dir, default_models_dir) num_nodes['native_tf'] = len(frozen_graph.node) graph_sizes['native_tf'] = len(frozen_graph.SerializeToString()) export_dir = './saved_model4/1' graph_pb = './graphs/frozen_graph_inception_v3_0_fp32_8.pb' ''' builder = tf.saved_model.builder.SavedModelBuilder(export_dir) with tf.gfile.GFile(graph_pb, "rb") as f: graph_def = tf.GraphDef() graph_def.ParseFromString(f.read()) sigs = {} with tf.Session(graph=tf.Graph()) as sess: # name="" is important to ensure we don't get spurious prefixing tf.import_graph_def(graph_def, name="") #tf.summary.FileWriter('inception_v3_event', sess.graph) g = tf.get_default_graph() inp = g.get_tensor_by_name("input:0") out = g.get_tensor_by_name("ArgMax:0") print(inp) print(out) #worked version prediction_signature = ( tf.saved_model.signature_def_utils.build_signature_def( inputs={'image': tf.saved_model.utils.build_tensor_info(inp)}, outputs={'out':tf.saved_model.utils.build_tensor_info(out)}, method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)) #legacy_init_op = tf.group(tf.tables.initializer(), name='legacy_init_op') builder.add_meta_graph_and_variables( sess, [tf.saved_model.tag_constants.SERVING], signature_def_map={'predict_images': prediction_signature, }) builder.save() #Failed method 1: image_height_tensor = tf.placeholder(tf.int32) image_width_tensor = tf.placeholder(tf.int32) #placeholder for receiving the serialized input image serialized_tf_example = tf.placeholder(tf.string, name='tf_example') feature_configs = {'x': tf.FixedLenFeature(shape=[], dtype=tf.float32), } tf_example = tf.parse_example(serialized_tf_example, feature_configs) # reshape the input image to its original dimension tf_example['x'] = tf.reshape(tf_example['x'], (1, image_height_tensor, image_width_tensor, 3)) x = tf.identity(tf_example['x'], name='x') # use tf.identity() to assign name inp = x # perform inference on the input image 
print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&') print(inp) print(out) # Creates the TensorInfo protobuf objects that encapsulates the input/output tensors tensor_info_input = tf.saved_model.utils.build_tensor_info(x) tensor_info_height = tf.saved_model.utils.build_tensor_info(image_height_tensor) tensor_info_width = tf.saved_model.utils.build_tensor_info(image_width_tensor) # output tensor info tensor_info_output = tf.saved_model.utils.build_tensor_info(out) prediction_signature = ( tf.saved_model.signature_def_utils.build_signature_def( inputs={'images': tensor_info_input, 'height': tensor_info_height, 'width': tensor_info_width}, outputs={'segmentation_map': tensor_info_output}, method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)) builder.add_meta_graph_and_variables( sess, [tf.saved_model.tag_constants.SERVING], signature_def_map={'predict_images': prediction_signature, }) builder.save() #failed version 2 sigs[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = \ tf.saved_model.signature_def_utils.predict_signature_def( {'images': tensor_info_input, 'height':tensor_info_height, 'width': tensor_info_width}, {"out": tensor_info_output}) builder.add_meta_graph_and_variables(sess, [tf.saved_model.tag_constants.SERVING], signature_def_map=sigs) ''' # Convert to TensorRT graph if use_trt: start_time = time.time() frozen_graph = trt.create_inference_graph( input_graph_def=frozen_graph, outputs=['logits', 'classes'], max_batch_size=batch_size, max_workspace_size_bytes=max_workspace_size, precision_mode=precision.upper(), minimum_segment_size=minimum_segment_size, is_dynamic_op=use_dynamic_op) times['trt_conversion'] = time.time() - start_time num_nodes['tftrt_total'] = len(frozen_graph.node) num_nodes['trt_only'] = len( [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp']) graph_sizes['trt'] = len(frozen_graph.SerializeToString()) if precision == 'int8': calib_graph = frozen_graph graph_sizes['calib'] = len(calib_graph.SerializeToString()) # INT8 calibration step print('Calibrating INT8...') start_time = time.time() run(calib_graph, model, calib_files, batch_size, num_calib_inputs // batch_size, 0, use_synthetic=use_synthetic) times['trt_calibration'] = time.time() - start_time start_time = time.time() frozen_graph = trt.calib_graph_to_infer_graph(calib_graph) times['trt_int8_conversion'] = time.time() - start_time # This is already set but overwriting it here to ensure the right size graph_sizes['trt'] = len(frozen_graph.SerializeToString()) del calib_graph print('INT8 graph created.') # Cache graph to avoid long conversions each time print( "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" ) print(cache) if cache: if not os.path.exists(os.path.dirname(prebuilt_graph_path)): try: os.makedirs(os.path.dirname(prebuilt_graph_path)) except Exception as e: raise e start_time = time.time() with tf.gfile.GFile(prebuilt_graph_path, "wb") as f: f.write(frozen_graph.SerializeToString()) times['saving_frozen_graph'] = time.time() - start_time return frozen_graph, num_nodes, times, graph_sizes
def get_trt_from_calib(self, grf_calib):
    """Convert a TensorRT graph used for calibration to an inference graph."""
    grf_trt = trt.calib_graph_to_infer_graph(grf_calib)
    return grf_trt
def get_frozen_graph(model, model_dir=None, pb_name=None, use_trt=False, use_dynamic_op=False, precision='fp32', batch_size=8, minimum_segment_size=2, calib_files=None, num_calib_inputs=None, cache=False, max_workspace_size=(1 << 32)): num_nodes = {} times = {} graph_sizes = {} frozen_graph = tf.GraphDef() # Load from pb file if frozen graph was already created and cached if pb_name: prebuilt_graph_path = os.path.join(model_dir, pb_name) else: prebuilt_graph_path = os.path.join(model_dir, 'r50_93.40_trt.pb') if cache: if os.path.isfile(prebuilt_graph_path): print('Loading cached frozen graph from \'%s\'' % prebuilt_graph_path) start_time = time.time() with tf.gfile.GFile(prebuilt_graph_path, "rb") as f: frozen_graph = tf.GraphDef() frozen_graph.ParseFromString(f.read()) times['loading_frozen_graph'] = time.time() - start_time num_nodes['loaded_frozen_graph'] = len(frozen_graph.node) num_nodes['trt_only'] = len( [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp']) graph_sizes['loaded_frozen_graph'] = len( frozen_graph.SerializeToString()) return frozen_graph, num_nodes, times, graph_sizes num_nodes['native_tf'] = len(frozen_graph.node) graph_sizes['native_tf'] = len(frozen_graph.SerializeToString()) # Convert to TensorRT graph if use_trt: print("Using TensorRT") start_time = time.time() frozen_graph = trt.create_inference_graph( input_graph_def=frozen_graph, outputs=PB_OUTPUTS, max_batch_size=batch_size, max_workspace_size_bytes=max_workspace_size, precision_mode=precision, minimum_segment_size=minimum_segment_size, is_dynamic_op=use_dynamic_op) times['trt_conversion'] = time.time() - start_time num_nodes['tftrt_total'] = len(frozen_graph.node) num_nodes['trt_only'] = len( [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp']) graph_sizes['trt'] = len(frozen_graph.SerializeToString()) if precision == 'int8': calib_graph = frozen_graph graph_sizes['calib'] = len(calib_graph.SerializeToString()) # INT8 calibration step print('Calibrating INT8...') start_time = time.time() run(calib_graph, model, calib_files, batch_size, num_calib_inputs // batch_size, 0, False, run_calibration=True) times['trt_calibration'] = time.time() - start_time start_time = time.time() frozen_graph = trt.calib_graph_to_infer_graph(calib_graph) times['trt_int8_conversion'] = time.time() - start_time # This is already set but overwriting it here to ensure the right size graph_sizes['trt'] = len(frozen_graph.SerializeToString()) del calib_graph print('INT8 graph created.') # Cache graph to avoid long conversions each time if cache: saved_pb = os.path.join(model_dir, 'r50.pb') if not os.path.exists(os.path.dirname(saved_pb)): try: os.makedirs(os.path.dirname(saved_pb)) except Exception as e: raise e start_time = time.time() with tf.gfile.GFile(saved_pb, "wb") as f: f.write(frozen_graph.SerializeToString()) times['saving_frozen_graph'] = time.time() - start_time return frozen_graph, num_nodes, times, graph_sizes
def main(args):
    workspace_size_bytes = 1 << 30
    trt_gpu_ops = tf.GPUOptions(per_process_gpu_memory_fraction=0.50)
    batches = random_sequences(length_from=3, length_to=10, vocab_lower=2,
                               vocab_upper=10, batch_size=args.batch_size)
    frozen_model = 'frozen_model.pb'
    tf.reset_default_graph()
    # parse the graph_def file
    with tf.gfile.GFile(frozen_model, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    trt_graph_def = trt.create_inference_graph(
        input_graph_def=graph_def,
        outputs=['ArgMax'],
        max_batch_size=args.batch_size,
        max_workspace_size_bytes=workspace_size_bytes,
        precision_mode=args.precision_mode)
    if args.precision_mode == 'INT8':
        trt_graph_def = trt.calib_graph_to_infer_graph(trt_graph_def)
    print('Generated TensorRT graph def')
    with tf.Graph().as_default() as graph:
        encoder_inputs, decoder_inputs, decoder_targets, decoder_prediction = tf.import_graph_def(
            graph_def,
            return_elements=['encoder_inputs:0', 'decoder_inputs:0',
                             'decoder_targets:0', 'ArgMax:0'])
    print('Generated tensor by frozen graph')
    with tf.Session(graph=graph,
                    config=tf.ConfigProto(gpu_options=trt_gpu_ops)) as sess:
        for _ in range(args.roll):
            batch = next(batches)
            encoder_inputs_, _ = make_batch(batch)
            decoder_targets_, _ = make_batch(
                [(sequence) + [EOS] for sequence in batch])
            decoder_inputs_, _ = make_batch(
                [[EOS] + (sequence) for sequence in batch])
            feed_dict = {
                encoder_inputs: encoder_inputs_,
                decoder_inputs: decoder_inputs_,
                decoder_targets: decoder_targets_
            }
            start_time = time.process_time()
            predict_ = sess.run(decoder_prediction, feed_dict)
            stop_time = time.process_time()
            for i, (inp, pred) in enumerate(
                    zip(feed_dict[encoder_inputs].T, predict_.T)):
                print('input > {}'.format(inp))
                print('predicted > {}'.format(pred))
                if i >= 10:
                    break
            print('{:.2f} milliseconds'.format(
                (stop_time - start_time) * 1000))
def convert_tftrt_fp(orig_graph, batch_size, precision):
    # convert native Tensorflow graphdef into a mixed TF-TRT graph
    trt_graph = trt.create_inference_graph(
        input_graph_def=orig_graph,        # native Tensorflow graphdef
        outputs=["output"],                # list of names for output node
        max_batch_size=batch_size,         # maximum/optimum batch size for the
                                           # TF-TRT mixed graphdef
        max_workspace_size_bytes=1 << 25,  # maximum workspace in bytes (32 MB)
                                           # for each TRT engine to allocate
        precision_mode=precision,          # TRT engine precision:
                                           # "FP32", "FP16" or "INT8"
        minimum_segment_size=2             # minimum number of nodes in an engine;
                                           # this parameter allows the converter to
                                           # skip subgraphs with fewer nodes than
                                           # the threshold
    )
    # allow_growth and restrict Tensorflow from claiming all GPU memory;
    # currently the TensorRT engine uses independent memory allocation outside of TF
    config = tf.ConfigProto(gpu_options=tf.GPUOptions(
        per_process_gpu_memory_fraction=0.5, allow_growth=True))

    # we can now import trt_graph into Tensorflow and execute it, if the target
    # precision_mode is 'FP32' or 'FP16'.
    if precision == 'FP16' or precision == 'FP32':
        return trt_graph

    # 'INT8' precision requires an extra step of calibration
    int8_calib_gdef = trt_graph

    # 'INT8' precision requires calibration to retrieve proper quantization ranges.
    # trt.create_inference_graph returns a calibration graph def with an inserted
    # calibration op that captures the input tensor during session runs to feed
    # the TensorRT subgraph during engine construction.

    # Feed calibration data into the TF-TRT mixed graph. This step is just running
    # the calibration graph with a set of representative input data (could use a
    # subset of validation data with an even distribution of all categories).
    g = tf.Graph()
    with g.as_default():
        inp, out = tf.import_graph_def(graph_def=int8_calib_gdef,
                                       return_elements=["input", "output"])
        inp = inp.outputs[0]
        out = out.outputs[0]

    # Start a TF session with the TF-TRT graph, execute the graph and feed it with
    # input. The calibration batch should be sharded and fed through the TF-TRT
    # mixed network. Use real data that is representative of the inference dataset
    # for calibration to reduce quantization error. For this test script it is
    # random data.
    CALIBRATION_BATCH = 100
    inp_dims = (CALIBRATION_BATCH, 24, 24, 2)
    dummy_input = np.random.random_sample(inp_dims)

    # allow_growth and restrict Tensorflow from claiming all GPU memory;
    # currently the TensorRT engine uses independent memory allocation outside of TF
    config = tf.ConfigProto(gpu_options=tf.GPUOptions(
        per_process_gpu_memory_fraction=0.5, allow_growth=True))

    # start session to feed calibration data
    with tf.Session(graph=g, config=config) as sess:
        iteration = int(CALIBRATION_BATCH / batch_size)
        # iterate through the calibration data, each time feeding data with
        # batch size < BATCH_SIZE (specified during conversion)
        for i in range(iteration):
            val = sess.run(out, {inp: dummy_input[i::iteration]})

    # finished calibration; trigger calib_graph_to_infer_graph to build the
    # TF-TRT mixed graphdef for inference
    int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
    return int8_graph
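# A minimal usage sketch for convert_tftrt_fp above, assuming a frozen GraphDef
# on disk at a hypothetical path "frozen_graph.pb" with an output node named
# "output"; the path, batch size, and node name are illustrative only.
import tensorflow as tf


def load_frozen_graph(path):
    """Read a serialized GraphDef from disk."""
    gdef = tf.GraphDef()
    with tf.gfile.GFile(path, "rb") as f:
        gdef.ParseFromString(f.read())
    return gdef


orig_graph = load_frozen_graph("frozen_graph.pb")  # hypothetical path
fp16_graph = convert_tftrt_fp(orig_graph, batch_size=8, precision="FP16")
int8_graph = convert_tftrt_fp(orig_graph, batch_size=8, precision="INT8")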
def getINT8InferenceGraph(output_prefix, calibGraph):
    trt_graph = trt.calib_graph_to_infer_graph(calibGraph)
    with gfile.FastGFile(output_prefix + '.INT8.pb', 'wb') as f:
        f.write(trt_graph.SerializeToString())
    return trt_graph
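# The helpers above serialize the INT8 inference graph to a .pb file. A minimal
# sketch of loading such a file back and running one batch, assuming
# hypothetical tensor names "input:0" and "output:0"; adjust to the real model.
import tensorflow as tf


def run_int8_graph_sketch(pb_path, batch):
    """Load a TF-TRT INT8 inference GraphDef and run one batch (illustrative)."""
    graph_def = tf.GraphDef()
    with tf.gfile.GFile(pb_path, "rb") as f:
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as g:
        inp, out = tf.import_graph_def(
            graph_def, return_elements=["input:0", "output:0"], name="")
    with tf.Session(graph=g) as sess:
        return sess.run(out, feed_dict={inp: batch})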
def get_int8_infer_graph(calib_graph, output_pb):
    trt_graph = trt.calib_graph_to_infer_graph(calib_graph)
    with gfile.FastGFile(output_pb, 'wb') as f:
        f.write(trt_graph.SerializeToString())
    return trt_graph
def main(args):
    workspace_size_bytes = 1 << 30
    trt_gpu_ops = tf.GPUOptions(per_process_gpu_memory_fraction=0.50)

    # transform images (image -> input vector)
    tf.reset_default_graph()
    g1 = tf.Graph()
    with g1.as_default():
        # create graph
        in_images = tf.placeholder(tf.string, name='in_images')
        decoded_input = tf.image.decode_png(in_images, channels=3)
        float_input = tf.cast(decoded_input, dtype=tf.float32)
        # (224, 224, 3) -> (n, 224, 224, 3)
        rgb_input = tf.expand_dims(float_input, axis=0)
        # for VGG preprocess, reduce means and convert to BGR
        slice_red = tf.slice(rgb_input, [0, 0, 0, 0], [1, 224, 224, 1])
        slice_green = tf.slice(rgb_input, [0, 0, 0, 1], [1, 224, 224, 1])
        slice_blue = tf.slice(rgb_input, [0, 0, 0, 2], [1, 224, 224, 1])
        sub_red = tf.subtract(slice_red, 123.68)
        sub_green = tf.subtract(slice_green, 116.779)
        sub_blue = tf.subtract(slice_blue, 103.939)
        transferred_input = tf.concat([sub_blue, sub_green, sub_red], 3)

        # transform to vectors
        with tf.Session(config=tf.ConfigProto(gpu_options=trt_gpu_ops)) as s1:
            with open('tiger224x224.jpg', 'rb') as f:
                data1 = f.read()
            imglist1 = s1.run([transferred_input], feed_dict={in_images: data1})
            image1 = imglist1[0]
            with open('lion224x224.jpg', 'rb') as f:
                data2 = f.read()
            imglist2 = s1.run([transferred_input], feed_dict={in_images: data2})
            image2 = imglist2[0]
            with open('orangutan224x224.jpg', 'rb') as f:
                data3 = f.read()
            imglist3 = s1.run([transferred_input], feed_dict={in_images: data3})
            image3 = imglist3[0]
    print('Loaded image vectors (tiger, lion, orangutan)')

    # When you test batch, please uncomment here. (single prediction is executed by default)
    image1 = np.tile(image1, (args.batch_size, 1, 1, 1))
    image2 = np.tile(image2, (args.batch_size, 1, 1, 1))
    image3 = np.tile(image3, (args.batch_size, 1, 1, 1))

    # load classification graph def
    classifier_model_file = 'resnetV150_frozen.pb'
    classifier_graph_def = tf.GraphDef()
    with tf.gfile.Open(classifier_model_file, 'rb') as f:
        data = f.read()
        classifier_graph_def.ParseFromString(data)
    print('Loaded classifier graph def')

    trt_graph_def = trt.create_inference_graph(
        input_graph_def=classifier_graph_def,
        outputs=['resnet_v1_50/predictions/Reshape_1'],
        max_batch_size=args.batch_size,
        max_workspace_size_bytes=workspace_size_bytes,
        precision_mode=args.precision_mode)
    if args.precision_mode == 'INT8':
        trt_graph_def = trt.calib_graph_to_infer_graph(trt_graph_def)
    print('Generated TensorRT graph def')

    # generate tensor with TensorRT graph
    tf.reset_default_graph()
    g2 = tf.Graph()
    with g2.as_default():
        trt_x, trt_y = tf.import_graph_def(
            trt_graph_def,
            return_elements=['input:0',
                             'resnet_v1_50/predictions/Reshape_1:0'])
    print('Generated tensor by TensorRT graph')

    # run classification with TensorRT graph
    with open('imagenet_classes.txt', 'rb') as f:
        labeltext = f.read()
    classes_entries = labeltext.splitlines()
    with tf.Session(graph=g2,
                    config=tf.ConfigProto(gpu_options=trt_gpu_ops)) as s2:
        eval_list = [image1, image2, image3]
        for img in eval_list:
            start_time = time.process_time()
            result = s2.run([trt_y], feed_dict={trt_x: img})
            stop_time = time.process_time()
            # list -> 1 x n ndarray: the result's format is
            # [[1.16643378e-06 3.12126781e-06 3.39836406e-05 ... ]]
            nd_result = result[0]
            # remove row's dimension
            onedim_result = nd_result[0,]
            # set column index to array of possibilities
            indexed_result = enumerate(onedim_result)
            # sort with possibilities
            sorted_result = sorted(indexed_result, key=lambda x: x[1],
                                   reverse=True)
            # get the names of top5 possibilities
            print('********************')
            for top in sorted_result[:5]:
                print(classes_entries[top[0]], 'confidence:', top[1])
            print('{:.2f} milliseconds'.format(
                (stop_time - start_time) * 1000))
def optimize_model(config_path, checkpoint_path, use_trt=True, force_nms_cpu=True, replace_relu6=True, remove_assert=True, override_nms_score_threshold=None, override_resizer_shape=[600, 600], max_batch_size=1, precision_mode='FP32', minimum_segment_size=50, max_workspace_size_bytes=1 << 32, maximum_cached_engines=100, calib_images_dir=None, num_calib_images=None, calib_image_shape=None, tmp_dir='.optimize_model_tmp_dir', remove_tmp_dir=True, output_path=None, display_every=100): """Optimizes an object detection model using TensorRT Optimizes an object detection model using TensorRT. This method also performs pre-tensorrt optimizations specific to the TensorFlow object detection API models. Please see the list of arguments for other optimization parameters. Args ---- config_path: A string representing the path of the object detection pipeline config file. checkpoint_path: A string representing the path of the object detection model checkpoint. use_trt: A boolean representing whether to optimize with TensorRT. If False, regular TensorFlow will be used but other optimizations (like NMS device placement) will still be applied. force_nms_cpu: A boolean indicating whether to place NMS operations on the CPU. replace_relu6: A boolean indicating whether to replace relu6(x) operations with relu(x) - relu(x-6). remove_assert: A boolean indicating whether to remove Assert operations from the graph. override_nms_score_threshold: An optional float representing a NMS score threshold to override that specified in the object detection configuration file. override_resizer_shape: An optional list/tuple of integers representing a fixed shape to override the default image resizer specified in the object detection configuration file. max_batch_size: An integer representing the max batch size to use for TensorRT optimization. precision_mode: A string representing the precision mode to use for TensorRT optimization. Must be one of 'FP32', 'FP16', or 'INT8'. minimum_segment_size: An integer representing the minimum segment size to use for TensorRT graph segmentation. max_workspace_size_bytes: An integer representing the max workspace size for TensorRT optimization. maximum_cached_engines: An integer represenging the number of TRT engines that can be stored in the cache. calib_images_dir: A string representing a directory containing images to use for int8 calibration. num_calib_images: An integer representing the number of calibration images to use. If None, will use all images in directory. calib_image_shape: A tuple of integers representing the height, width that images will be resized to for calibration. tmp_dir: A string representing a directory for temporary files. This directory will be created and removed by this function and should not already exist. If the directory exists, an error will be thrown. remove_tmp_dir: A boolean indicating whether we should remove the tmp_dir or throw error. output_path: An optional string representing the path to save the optimized GraphDef to. display_every: print log for calibration every display_every iteration Returns ------- A GraphDef representing the optimized model. 
""" if max_batch_size > 1 and calib_image_shape is None: raise RuntimeError( 'Fixed calibration image shape must be provided for max_batch_size > 1' ) if os.path.exists(tmp_dir): if not remove_tmp_dir: raise RuntimeError( 'Cannot create temporary directory, path exists: %s' % tmp_dir) subprocess.call(['rm', '-rf', tmp_dir]) # load config from file config = pipeline_pb2.TrainEvalPipelineConfig() with open(config_path, 'r') as f: text_format.Merge(f.read(), config, allow_unknown_extension=True) # override some config parameters if config.model.HasField('ssd'): config.model.ssd.feature_extractor.override_base_feature_extractor_hyperparams = True if override_nms_score_threshold is not None: config.model.ssd.post_processing.batch_non_max_suppression.score_threshold = override_nms_score_threshold if override_resizer_shape is not None: config.model.ssd.image_resizer.fixed_shape_resizer.height = override_resizer_shape[ 0] config.model.ssd.image_resizer.fixed_shape_resizer.width = override_resizer_shape[ 1] elif config.model.HasField('faster_rcnn'): if override_nms_score_threshold is not None: config.model.faster_rcnn.second_stage_post_processing.batch_non_max_suppression.score_threshold = override_nms_score_threshold if override_resizer_shape is not None: config.model.faster_rcnn.image_resizer.fixed_shape_resizer.height = override_resizer_shape[ 0] config.model.faster_rcnn.image_resizer.fixed_shape_resizer.width = override_resizer_shape[ 1] print("***************************8 image_resizer " + str(config.model.faster_rcnn.image_resizer)) tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True # export inference graph to file (initial), this will create tmp_dir with tf.Session(config=tf_config): with tf.Graph().as_default(): exporter.export_inference_graph( INPUT_NAME, config, checkpoint_path, tmp_dir, #input_shape=[max_batch_size, 600, 340, 3]) input_shape=[ max_batch_size, override_resizer_shape[0], override_resizer_shape[1], 3 ]) # read frozen graph from file frozen_graph_path = os.path.join(tmp_dir, FROZEN_GRAPH_NAME) frozen_graph = tf.GraphDef() with open(frozen_graph_path, 'rb') as f: frozen_graph.ParseFromString(f.read()) # apply graph modifications if force_nms_cpu: frozen_graph = f_force_nms_cpu(frozen_graph) if replace_relu6: frozen_graph = f_replace_relu6(frozen_graph) if remove_assert: frozen_graph = f_remove_assert(frozen_graph) # get input names output_names = [BOXES_NAME, CLASSES_NAME, SCORES_NAME, NUM_DETECTIONS_NAME] # optionally perform TensorRT optimization if use_trt: print("**************************** using tensor RT *************") runtimes = [] with tf.Graph().as_default() as tf_graph: with tf.Session(config=tf_config) as tf_sess: graph_size = len(frozen_graph.SerializeToString()) num_nodes = len(frozen_graph.node) start_time = time.time() frozen_graph = trt.create_inference_graph( input_graph_def=frozen_graph, outputs=output_names, max_batch_size=max_batch_size, max_workspace_size_bytes=max_workspace_size_bytes, precision_mode=precision_mode.encode('utf-8'), minimum_segment_size=minimum_segment_size, is_dynamic_op=True, maximum_cached_engines=maximum_cached_engines) end_time = time.time() print("graph_size(MB)(native_tf): %.1f" % (float(graph_size) / (1 << 20))) print("graph_size(MB)(trt): %.1f" % (float(len(frozen_graph.SerializeToString())) / (1 << 20))) print("num_nodes(native_tf): %d" % num_nodes) print("num_nodes(tftrt_total): %d" % len(frozen_graph.node)) print("num_nodes(trt_only): %d" % len([ 1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp' 
])) print("time(s) (trt_conversion): %.4f" % (end_time - start_time)) # perform calibration for int8 precision if precision_mode == 'INT8': if calib_images_dir is None: raise ValueError( 'calib_images_dir must be provided for int8 optimization.' ) tf.import_graph_def(frozen_graph, name='') tf_input = tf_graph.get_tensor_by_name(INPUT_NAME + ':0') tf_boxes = tf_graph.get_tensor_by_name(BOXES_NAME + ':0') tf_classes = tf_graph.get_tensor_by_name(CLASSES_NAME + ':0') tf_scores = tf_graph.get_tensor_by_name(SCORES_NAME + ':0') tf_num_detections = tf_graph.get_tensor_by_name( NUM_DETECTIONS_NAME + ':0') image_paths = glob.glob( os.path.join(calib_images_dir, '*.jpg')) image_paths = image_paths[0:num_calib_images] for image_idx in range(0, len(image_paths), max_batch_size): # read batch of images batch_images = [] for image_path in image_paths[image_idx:image_idx + max_batch_size]: image = _read_image(image_path, calib_image_shape) batch_images.append(image) t0 = time.time() # execute batch of images boxes, classes, scores, num_detections = tf_sess.run( [ tf_boxes, tf_classes, tf_scores, tf_num_detections ], feed_dict={tf_input: batch_images}) t1 = time.time() runtimes.append(float(t1 - t0)) if len(runtimes) % display_every == 0: print(" step %d/%d, iter_time(ms)=%.4f" % (len(runtimes), (len(image_path) + max_batch_size - 1) / max_batch_size, np.mean(runtimes) * 1000)) frozen_graph = trt.calib_graph_to_infer_graph(frozen_graph) # re-enable variable batch size, this was forced to max # batch size during export to enable TensorRT optimization #for node in frozen_graph.node: # if INPUT_NAME == node.name: # node.attr['shape'].shape.dim[0].size = -1 # write optimized model to disk if output_path is not None: with open(output_path, 'wb') as f: f.write(frozen_graph.SerializeToString()) export_dir = os.path.join(os.path.dirname(output_path), 'saved_model') subprocess.call(['rm', '-rf', export_dir]) with tf.Session(graph=tf.Graph()) as session: tf.import_graph_def(frozen_graph, name='') tf_input = session.graph.get_tensor_by_name(INPUT_NAME + ':0') tf_boxes = session.graph.get_tensor_by_name(BOXES_NAME + ':0') tf_classes = session.graph.get_tensor_by_name(CLASSES_NAME + ':0') tf_scores = session.graph.get_tensor_by_name(SCORES_NAME + ':0') tf_num_detections = session.graph.get_tensor_by_name( NUM_DETECTIONS_NAME + ':0') tf.saved_model.simple_save(session, export_dir, inputs={'inputs': tf_input}, outputs={ BOXES_NAME: tf_boxes, CLASSES_NAME: tf_classes, SCORES_NAME: tf_scores, NUM_DETECTIONS_NAME: tf_num_detections }) return frozen_graph
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
        minimum_segment_size=2  # minimum number of nodes in an engine
    )
    '''
    int8_calib_gdef = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="INT8",  # TRT Engine precision "FP32","FP16" or "INT8"
        minimum_segment_size=2  # minimum number of nodes in an engine
    )
    print("##############################################")
    # o4 = run_graph(fp16_graph, dummy_input)
    _ = run_calibration(int8_calib_gdef, dummy_input)
    # _ = run_graph(int8_calib_gdef, dummy_input)
    int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
    o5 = run_graph(int8_graph, dummy_input)
    # assert np.allclose(o1, o4)
    # assert np.allclose(o1, o5)
    print(o1)
    print(
        "----------------------------------------------------------------------------"
    )
    print(o5)
    print("Pass")
def getINT8InferenceGraph(calibGraph):
    trt_graph = trt.calib_graph_to_infer_graph(calibGraph)
    with gfile.FastGFile("resnetV150_TRTINT8.pb", 'wb') as f:
        f.write(trt_graph.SerializeToString())
    return trt_graph
def build_trt_forward_pass_graph(self, input_tensors, gpu_id=0, checkpoint=None): """Wrapper around _build_forward_pass_graph which converts graph using TF-TRT""" import tensorflow.contrib.tensorrt as trt # Default parameters trt_params = { "batch_size_per_gpu": 64, "trt_max_workspace_size_bytes": (4096 << 20) - 1000, "trt_precision_mode": "FP32", "trt_minimum_segment_size": 10, "trt_is_dynamic_op": True, "trt_maximum_cached_engines": 1 } # Update params from user config for key in trt_params: if key in self.params: trt_params[key] = self.params[key] # Create temporary graph which will contain the native TF graph tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True temp_graph = tf.Graph() input_map = {} # We have to deconstruct SparseTensors into their 3 internal tensors # (indicies, values, dense_shape). This maps each tensor name to a list of # all 3 tensor names in its SparseTensor. output_sparse_tensor_map = {} with temp_graph.as_default() as tf_graph: with tf.Session(config=tf_config) as tf_sess: # Create temporary input placeholders used to build native TF graph input_placeholders = {'source_tensors': []} for i, original_input in enumerate(input_tensors['source_tensors']): name = 'input_map_%d' % i input_placeholders['source_tensors'].append( tf.placeholder(shape=original_input.shape, dtype=original_input.dtype, name=name)) # And map it back to original input input_map[name] = original_input # Build native graph loss, outputs = self._build_forward_pass_graph( input_placeholders, gpu_id=gpu_id ) # Gather output tensors output_node_names = [] output_node_names_and_ports = [] for x in outputs: if isinstance(x, tf.SparseTensor): components = [x.indices.name, x.values.name, x.dense_shape.name] fetch_names = [tensor.split(':')[0] for tensor in components] # Remove duplicates (i.e. if SparseTensor is output of one node) fetch_names = list(set(fetch_names)) output_node_names.extend(fetch_names) output_node_names_and_ports.extend(components) # Add all components to map so SparseTensor can be reconstructed # from tensor components which will be outputs of new graph for tensor in components: output_sparse_tensor_map[tensor] = components else: output_node_names.append(x.name.split(':')[0]) output_node_names_and_ports.append(x.name) # Restore checkpoint here because we have to freeze the graph tf_saver = tf.train.Saver() tf_saver.restore(save_path=checkpoint, sess=tf_sess) frozen_graph = tf.graph_util.convert_variables_to_constants( tf_sess, tf_sess.graph_def, output_node_names=output_node_names ) num_nodes = len(frozen_graph.node) print('Converting graph using TensorFlow-TensorRT...') frozen_graph = trt.create_inference_graph( input_graph_def=frozen_graph, outputs=output_node_names, max_batch_size=trt_params["batch_size_per_gpu"], max_workspace_size_bytes=trt_params["trt_max_workspace_size_bytes"], precision_mode=trt_params["trt_precision_mode"], minimum_segment_size=trt_params["trt_minimum_segment_size"], is_dynamic_op=trt_params["trt_is_dynamic_op"], maximum_cached_engines=trt_params["trt_maximum_cached_engines"] ) # Remove unused inputs from input_map. 
inputs_to_remove = [] for k in input_map: if k not in [node.name for node in frozen_graph.node]: inputs_to_remove.append(k) for k in inputs_to_remove: del input_map[k] print('Total node count before and after TF-TRT conversion:', num_nodes, '->', len(frozen_graph.node)) print('TRT node count:', len([1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp'])) # Perform calibration for INT8 precision mode if self.params.get("trt_precision_mode", "FP32").upper() == 'INT8': with tf.Session(config=tf_config) as tf_sess: calib_graph = frozen_graph num_iterations = 10 print('Calibrating INT8...') outputs = tf.import_graph_def( calib_graph, input_map=input_map, return_elements=output_node_names_and_ports, name='') self._num_objects_per_step = [self._get_num_objects_per_step(worker_id) for worker_id in range(self.num_gpus)] results_per_batch = iterate_data( self, tf_sess, compute_loss=False, mode='infer', verbose=False, num_steps=num_iterations ) frozen_graph = trt.calib_graph_to_infer_graph(calib_graph) del calib_graph print('INT8 graph created.') print('Nodes INT8:', len(frozen_graph.node)) # Import TRT converted graph to default graph, mapping it to the original # input tensors. outputs = tf.import_graph_def( frozen_graph, input_map=input_map, return_elements=output_node_names_and_ports, name='') # Reconstruct SparseTensors final_outputs = [] for tensor in outputs: if tensor.name in output_sparse_tensor_map: component_names = output_sparse_tensor_map[tensor.name] # Find tensors in outputs for components component_tensors = [[x for x in outputs if x.name == name][0] for name in component_names] # Remove all components from outputs so we don't create duplicates of # this SparseTensor for x in component_tensors: if x in outputs: outputs.remove(x) final_outputs.append(tf.SparseTensor(*component_tensors)) else: final_outputs.append(tensor) return loss, final_outputs
def get_trt_from_calib(self, grf_calib):
    grf_trt = trt.calib_graph_to_infer_graph(grf_calib)
    return grf_trt
def convert_int8(input_model_dir, output_model_dir, batch_size, precision_mode,
                 calib_image_dir, input_tensor, output_tensor, epochs):
    # (TODO) Need to check if we need Tesla T4 when conversion.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Get path to calibration data.
    calibration_files = get_calibration_files(calib_image_dir, 'validation*')

    # Create dataset and apply preprocess
    # (TODO) Get num cpus to set appropriate number to num_parallel_calls
    dataset = tf.data.TFRecordDataset(calibration_files)
    dataset = dataset.apply(
        tf.contrib.data.map_and_batch(
            map_func=preprocess,
            batch_size=batch_size,
            num_parallel_calls=multiprocessing.cpu_count()))

    """
    Step 1: Creating the calibration graph.
    """
    # Create TF-TRT INT8 calibration graph.
    trt_int8_calib_graph = trt.create_inference_graph(
        input_graph_def=None,
        outputs=[output_tensor],
        max_batch_size=batch_size,
        input_saved_model_dir=input_model_dir,
        precision_mode=precision_mode)

    # Calibrate graph.
    with tf.Session(graph=tf.Graph(), config=config) as sess:
        tf.logging.info('preparing calibration data...')
        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()

        tf.logging.info('Loading INT8 calibration graph...')
        output_node = tf.import_graph_def(trt_int8_calib_graph,
                                          return_elements=[output_tensor],
                                          name='')

        tf.logging.info('Calibrate model with calibration data...')
        for _ in range(epochs):
            sess.run(output_node,
                     feed_dict={input_tensor: sess.run(next_element)[0]})

    """
    Step 2: Converting the calibration graph to inference graph
    """
    tf.logging.info('Creating TF-TRT INT8 inference engine...')
    trt_int8_calibrated_graph = trt.calib_graph_to_infer_graph(
        trt_int8_calib_graph)

    # Copy MetaGraph from base model.
    with tf.Session(graph=tf.Graph(), config=config) as sess:
        base_model = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], input_model_dir)

        metagraph = tf.MetaGraphDef()
        metagraph.graph_def.CopyFrom(trt_int8_calibrated_graph)
        for key in base_model.collection_def:
            if key not in [
                    'variables', 'local_variables', 'model_variables',
                    'trainable_variables', 'train_op', 'table_initializer'
            ]:
                metagraph.collection_def[key].CopyFrom(
                    base_model.collection_def[key])

        metagraph.meta_info_def.CopyFrom(base_model.meta_info_def)
        for key in base_model.signature_def:
            metagraph.signature_def[key].CopyFrom(
                base_model.signature_def[key])

    saved_model_builder = (
        tf.saved_model.builder.SavedModelBuilder(output_model_dir))

    # Write SavedModel with INT8 precision.
    with tf.Graph().as_default():
        tf.graph_util.import_graph_def(trt_int8_calibrated_graph,
                                       return_elements=[output_tensor],
                                       name='')
        with tf.Session(config=config) as sess:
            saved_model_builder.add_meta_graph_and_variables(
                sess, ('serve',),
                signature_def_map=metagraph.signature_def)

    # Ignore other meta graphs from the input SavedModel.
    saved_model_builder.save()
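# A minimal invocation sketch for convert_int8 above; the directories, tensor
# names, and TFRecord layout are hypothetical and depend on the SavedModel
# actually being converted.
convert_int8(
    input_model_dir='saved_model_fp32',       # hypothetical input SavedModel
    output_model_dir='saved_model_int8',      # hypothetical output directory
    batch_size=8,
    precision_mode='INT8',
    calib_image_dir='calibration_tfrecords',  # hypothetical TFRecord directory
    input_tensor='input:0',                   # hypothetical tensor names
    output_tensor='predictions:0',
    epochs=1)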
                    graph=gx) as sess:
        # run over real calibration data here, we are mimicking a calibration set of
        # 30 different batches. Use as much calibration data as you want
        for j in range(200):
            val = sess.run(out, {inp: [dumm_inp[j]]})
    return val


mnist__train, temppp = tf.keras.datasets.mnist.load_data()
imagesss, labels = mnist__train[0], mnist__train[1]
imagesss = imagesss.astype('float32')
imagesss /= 255.0
imagesss = np.reshape(imagesss, [60000, 28, 28, 1])
_ = run_calibration(trt_graph_def, imagesss)
trt_graph_def = trt.calib_graph_to_infer_graph(trt_graph_def)  # For only 'INT8'
# trt_graph_def = trt.calib_graph_to_infer_graph(trt_graph_def)  # For only 'INT8'
print('Generated TensorRT graph def')

#
# Generate tensor with TensorRT graph def
#
tf.reset_default_graph()
g2 = tf.Graph()
with g2.as_default():
    trt_x, trt_y = tf.import_graph_def(trt_graph_def,
                                       return_elements=['x:0', 'output:0'])
print('Generated tensor for TensorRT optimized graph')
#