def generate_engine(uff_file, G_LOGGER):
    parser = uffparser.create_uff_parser()
    parser.register_input("input_1", (3, 416, 416), 0)
    parser.register_output("conv2d_23/BiasAdd")
    engine = trt.utils.uff_file_to_trt_engine(G_LOGGER, uff_file, parser,
                                              1, 1 << 30)
    return engine
def create_graph(self):
    uff_model = uff.from_tensorflow_frozen_model(
        self.facenet,
        ['InceptionResnetV2/Bottleneck/BatchNorm/Reshape_1'],
        list_nodes=False)
    G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
    parser = uffparser.create_uff_parser()
    parser.register_input('input_image', (3, 160, 160), 0)
    parser.register_output('InceptionResnetV2/Bottleneck/BatchNorm/Reshape_1')
    engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                         1, 1 << 31)
    parser.destroy()
    runtime = trt.infer.create_infer_runtime(G_LOGGER)
    self.context = engine.create_execution_context()
    self.output = np.empty((1, 128), dtype=np.float32)
    # Device buffers: float32 input of 3*160*160, float32 output of 128.
    self.d_input = cuda.mem_alloc(1 * 160 * 160 * 3 * 4)
    self.d_output = cuda.mem_alloc(1 * 128 * 4)
    self.bindings = [int(self.d_input), int(self.d_output)]
    self.stream = cuda.Stream()
def main():
    # Generate a test case for our engine.
    img_input = DATA + '/VOC2012/JPEGImages/2008_000016.jpg'
    img, img_id, img_w, img_h = get_testcase(img_input)  # img in PPM format
    # Convert the frozen TensorFlow model to UFF.
    uff_model = uff.from_tensorflow_frozen_model(
        '/tiny-yolo-voc/tiny-yolo-graph-tf17.pb', ["22-convolutional"])
    # Parse the UFF model for TensorRT.
    model_parser = uffparser.create_uff_parser()
    # input name, input dims (CHW), input order
    model_parser.register_input("input", (3, 416, 416), 0)
    model_parser.register_output("22-convolutional")
    # Create the engine, runtime, and execution context.
    engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, model_parser,
                                         MAX_BATCH_SIZE, MAX_WORKSPACE)
    assert engine
    runtime = trt.infer.create_infer_runtime(G_LOGGER)
    context = engine.create_execution_context()
    context.set_profiler(G_PROFILER)
    if TIMEIT:
        time_inference(context, engine, 1)
    elif VALIDATE:
        with open("/tiny-yolo-voc/2012_val.txt", "r") as f:
            for image_path in f:
                image_path = image_path.strip()
                image_jpg = image_path.split("/")[-1]
                img_input = DATA + '/VOC2012/JPEGImages/' + image_jpg
                img, img_id, img_w, img_h = get_testcase(img_input)
                # infer() uses context.enqueue(): an asynchronous launch on a
                # CUDA stream; TensorRT does not currently support profiling
                # this path.
                out = infer(context, img, OUTPUT_SIZE, 1)
                # Parse the output.
                output_parser = yoloparser.yolov2parser(
                    out, output_wd, nclass, nbox, class_name, biases)
                result = output_parser.interpret(threshold, nms, img_w, img_h)
                save_results(img_input, result, img_w, img_h, img_id,
                             "/tiny-yolo-voc/results/")
    else:
        out = infer(context, img, OUTPUT_SIZE, 1)
        # Parse the output.
        output_parser = yoloparser.yolov2parser(out, output_wd, nclass, nbox,
                                                class_name, biases)
        result = output_parser.interpret(threshold, nms, img_w, img_h)
        save_results(img_input, result, img_w, img_h, img_id,
                     "/tiny-yolo-voc/results/")
    context.destroy()
    engine.destroy()
    runtime.destroy()
def main(args):
    input_nodes = [args.input_placeholder]
    output_nodes = args.output_placeholders.split(',')
    # Wrap map() in list() so len() works on Python 3 as well.
    dims = list(map(int, args.dimensions.split(',')))
    assert len(dims) == 3, 'Input dimensions must be given in CHW format.'
    # Convert the frozen TensorFlow pb file to a UFF stream for TensorRT.
    uff_model = uff.from_tensorflow_frozen_model(frozen_file=args.frozen_file,
                                                 input_nodes=input_nodes,
                                                 output_nodes=output_nodes)
    # Create a parser for the UFF stream and register the input placeholder.
    parser = uffparser.create_uff_parser()
    parser.register_input(args.input_placeholder, dims,
                          uffparser.UffInputOrder_kNCHW)
    # Create a TensorRT engine that is ready for immediate use. For this
    # example, we serialize it for fast instantiation later.
    G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
    engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                         args.max_batch_size,
                                         1 << args.max_workspace_size,
                                         trt.infer.DataType.FLOAT)
    assert engine
    # Serialize the engine to the given file path.
    serialize_engine(engine, args.file_path)
    engine.destroy()
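# serialize_engine() is called above but not defined in this snippet. A
# minimal sketch of what it plausibly does with the same legacy utilities
# (an assumption, not the author's code):
def serialize_engine(engine, file_path):
    # Persist the built engine so it can be reloaded later with
    # trt.utils.load_engine() instead of rebuilding from UFF.
    trt.utils.write_engine_to_file(file_path, engine.serialize())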
def main():
    path = os.path.dirname(os.path.realpath(__file__))
    tf_model = lenet5.learn()
    # Convert the TensorFlow model to UFF, then build a TensorRT engine.
    uff_model = uff.from_tensorflow(tf_model, ["fc2/Relu"])
    parser = uffparser.create_uff_parser()
    parser.register_input("Placeholder", (1, 28, 28), 0)
    parser.register_output("fc2/Relu")
    engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                         MAX_BATCHSIZE, MAX_WORKSPACE)
    assert engine
    # parser.destroy()
    context = engine.create_execution_context()
    print("\n| TEST CASE | PREDICTION |")
    for i in range(ITERATIONS):
        img, label = lenet5.get_testcase()
        img = img[0]
        label = label[0]
        out = infer(context, img, 1)
        print("|-----------|------------|")
        print("| " + str(label) + " | " + str(np.argmax(out)) + " |")
def main():
    path = os.path.dirname(os.path.realpath(__file__))
    # Convert the UFF model file to a TensorRT engine.
    parser = uffparser.create_uff_parser()
    parser.register_input("Input_0", (1, 28, 28), 0)
    parser.register_output("Binary_3")
    engine = trt.utils.uff_file_to_trt_engine(G_LOGGER, MODEL, parser,
                                              MAX_BATCHSIZE, MAX_WORKSPACE,
                                              trt.infer.DataType.FLOAT)
    assert engine
    # parser.destroy()
    rand_file = randint(0, 9)
    img = get_testcase(DATA + str(rand_file) + '.pgm')
    data = normalize(img)
    print("Test case: " + str(rand_file))
    out = infer(engine, data, 1)
    print("Prediction: " + str(np.argmax(out)))
def mk_TensorRT_engine(self):
    # Train the model first if no checkpoint exists yet.
    if not tf.train.get_checkpoint_state(save_dir):
        self.fit()
    # Load the trained model.
    with tf.Session() as sess:
        saver = tf.train.Saver(tf.global_variables())
        saver.restore(sess, "save/model.ckpt")
        graph_def = sess.graph.as_graph_def()
        frozen_graph = tf.graph_util.convert_variables_to_constants(
            sess, graph_def, ["inference/softmax"])
        tf_model = tf.graph_util.remove_training_nodes(frozen_graph)
    # Convert the TensorFlow model format to UFF.
    uff_model = uff.from_tensorflow(tf_model, ["inference/softmax"])
    # Create a logger for the TensorRT engine build.
    G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
    # Create a UFF parser and register the model's inputs and outputs.
    parser = uffparser.create_uff_parser()
    parser.register_input("Placeholder", (1, 28, 28), 0)  # (channel, im_size, im_size)
    parser.register_output("inference/softmax")
    # Build the engine with the utility function (the last two arguments are
    # the max batch size and max workspace size).
    engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                         MAX_BATCH_SIZE, MAX_WORKSPACE_SIZE)
    parser.destroy()
    return engine
def main():
    args = parse_args()
    height, width, channel = 368, 432, 3
    images = []
    for name in args.images.split(','):
        # channels_first is required for TensorRT.
        x = read_imgfile(name, width, height, 'channels_first')
        images.append(x)
    model_func = _get_model_func(args.base_model)
    model_inputs, model_outputs = model_func()
    input_names = [p.name[:-2] for p in model_inputs]
    output_names = [p.name[:-2] for p in model_outputs]
    print('input names: %s' % ','.join(input_names))
    print('output names: %s' % ','.join(output_names))  # outputs/conf,outputs/paf
    # with tf.Session() as sess:
    sess = tf.InteractiveSession()
    measure(lambda: tl.files.load_and_assign_npz_dict(args.path_to_npz, sess),
            'load npz')
    frozen_graph = tf.graph_util.convert_variables_to_constants(
        sess, sess.graph_def, output_names)
    tf_model = tf.graph_util.remove_training_nodes(frozen_graph)
    uff_model = measure(lambda: uff.from_tensorflow(tf_model, output_names),
                        'uff.from_tensorflow')
    print('uff model created')
    parser = uffparser.create_uff_parser()
    # 0 == NCHW, see
    # https://docs.nvidia.com/deeplearning/sdk/tensorrt-api/c_api/_nv_uff_parser_8h_source.html
    inputOrder = 0
    parser.register_input(input_names[0], (channel, height, width), inputOrder)
    for name in output_names:
        parser.register_output(name)
    G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.INFO)
    max_batch_size = 1
    max_workspace_size = 1 << 30
    engine = measure(
        lambda: trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                            max_batch_size,
                                            max_workspace_size),
        'trt.utils.uff_to_trt_engine')
    print('engine created')
    # Integer division so the feature-map dims stay ints on Python 3.
    f_height, f_width = (height // 8, width // 8)  # TODO: derive from model_outputs
    post_process = PostProcessor((height, width), (f_height, f_width),
                                 'channels_first')
    for idx, x in enumerate(images):
        conf, paf = measure(lambda: infer(engine, x, 1), 'infer')
        humans, heat_up, paf_up = measure(lambda: post_process(conf, paf),
                                          'post_process')
        print('got %d humans' % len(humans))
        plot_humans(x.transpose([1, 2, 0]), heat_up, paf_up, humans,
                    '%02d' % (idx + 1))
def uff2engine(frozen_input_name, net_input_shape, frozen_output_name,
               uff_path, engine_path):
    with open(uff_path, 'rb') as f:
        uff_model = f.read()
    G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
    parser = uffparser.create_uff_parser()
    parser.register_input(frozen_input_name, net_input_shape, 0)
    parser.register_output(frozen_output_name)
    engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                         1, 1 << 30)
    parser.destroy()
    trt.utils.write_engine_to_file(engine_path, engine.serialize())
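# Hedged companion sketch (not from the source): reloading the plan file that
# uff2engine() wrote, using trt.utils.load_engine() the same way other
# snippets in this collection do.
def engine_from_file(engine_path):
    G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
    # Deserialize the plan file back into an engine and create a context.
    engine = trt.utils.load_engine(G_LOGGER, engine_path)
    context = engine.create_execution_context()
    return engine, context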
def create_and_save_inference_engine():
    INPUT_LAYERS = [config['input_layer']]
    OUTPUT_LAYERS = [config['output_layer']]
    INFERENCE_BATCH_SIZE = config['inference_batch_size']
    INPUT_C = 1
    INPUT_H = config['image_dim']
    INPUT_W = config['image_dim']
    # Load your newly created TensorFlow frozen model and convert it to UFF.
    uff_model = uff.from_tensorflow_frozen_model(config['frozen_model_file'],
                                                 OUTPUT_LAYERS)
    # With a UFF model in hand, create a logger for TensorRT.
    G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
    # Create a UFF parser and identify the desired input and output nodes.
    parser = uffparser.create_uff_parser()
    parser.register_input(INPUT_LAYERS[0], (INPUT_C, INPUT_H, INPUT_W), 0)
    parser.register_output(OUTPUT_LAYERS[0])
    # Build the TensorRT inference engine. This step performs (1) tensor
    # fusion, (2) reduced-precision calibration, (3) target-specific
    # autotuning, and (4) tensor memory management. Pass the logger, parser,
    # UFF model stream, and some settings (max batch size and max workspace
    # size) to a utility function that creates the engine for us.
    if config['precision'] == 'fp32':
        engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                             INFERENCE_BATCH_SIZE, 1 << 20,
                                             trt.infer.DataType.FLOAT)
    elif config['precision'] == 'fp16':
        engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                             INFERENCE_BATCH_SIZE, 1 << 20,
                                             trt.infer.DataType.HALF)
    elif config['precision'] == 'int8':
        engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                             INFERENCE_BATCH_SIZE, 1 << 20,
                                             trt.infer.DataType.INT8)
    # Serialize the TensorRT engine to a file for when you are ready to
    # deploy your model.
    save_path = str(config['engine_save_dir']) + "tf_model_batch" \
        + str(INFERENCE_BATCH_SIZE) + "_" + str(config['precision']) + ".engine"
    trt.utils.write_engine_to_file(save_path, engine.serialize())
    print("Saved TensorRT engine to {}".format(save_path))
def create_and_save_inference_engine():
    # Define network parameters, including the inference batch size and the
    # name and dimensionality of the input/output layers.
    INPUT_LAYERS = [config['input_layer']]
    OUTPUT_LAYERS = [config['out_layer']]
    INFERENCE_BATCH_SIZE = config['inference_batch_size']
    INPUT_C = 3
    INPUT_H = config['image_dim']
    INPUT_W = config['image_dim']
    # Load your newly created TensorFlow frozen model and convert it to UFF.
    uff_model = uff.from_tensorflow_frozen_model(config['frozen_model_file'],
                                                 OUTPUT_LAYERS)
    # Create a UFF parser to parse the UFF stream created from the frozen model.
    parser = uffparser.create_uff_parser()
    parser.register_input(INPUT_LAYERS[0], (INPUT_C, INPUT_H, INPUT_W), 0)
    parser.register_output(OUTPUT_LAYERS[0])
    # Build the TensorRT inference engine.
    if config['precision'] == 'fp32':
        engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                             INFERENCE_BATCH_SIZE, 1 << 20,
                                             trt.infer.DataType.FLOAT)
    elif config['precision'] == 'fp16':
        engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                             INFERENCE_BATCH_SIZE, 1 << 20,
                                             trt.infer.DataType.HALF)
    # Serialize the TensorRT engine to a file for when you are ready to
    # deploy your model.
    save_path = str(config['engine_save_dir']) + "keras_vgg19_b" \
        + str(INFERENCE_BATCH_SIZE) + "_" + str(config['precision']) + ".engine"
    trt.utils.write_engine_to_file(save_path, engine.serialize())
    print("Saved TRT engine to {}".format(save_path))
def __init__(self, model, batch_size):
    # Get the TensorFlow graph object from Keras.
    with K.get_session() as sess:
        image_batch_t = tf.placeholder(tf.float32, shape=(None, 1, 28, 28),
                                       name='image_tensor')
        K.set_learning_phase(0)
        conf_t = model(image_batch_t)
        output_names = [conf_t.name[:-2]]
        graphdef = sess.graph.as_graph_def()
        frozen_graph = tf.graph_util.convert_variables_to_constants(
            sess, graphdef, output_names)
        frozen_graph = tf.graph_util.remove_training_nodes(frozen_graph)
    # Convert to a TensorRT UFF stream.
    uff_model = uff.from_tensorflow(frozen_graph, output_names)
    G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
    parser = uffparser.create_uff_parser()
    input_shape = (1, 28, 28)
    parser.register_input("image_tensor", input_shape, 0)
    parser.register_output(output_names[0])
    # Create the TensorRT inference engine.
    engine = trt.utils.uff_to_trt_engine(G_LOGGER,
                                         stream=uff_model,
                                         parser=parser,
                                         max_batch_size=batch_size,
                                         max_workspace_size=1 << 25)
    parser.destroy()
    # Allocate the needed device buffers.
    dims = engine.get_binding_dimensions(0).to_DimsCHW()
    nbytes = (batch_size * dims.C() * dims.H() * dims.W()
              * np.dtype(np.float32).itemsize)
    self.d_src = cuda.mem_alloc(nbytes)
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    nbytes = (batch_size * dims.C() * dims.H() * dims.W()
              * np.dtype(np.float32).itemsize)
    self.d_dst = cuda.mem_alloc(nbytes)
    self.engine = engine
    self.ctx = engine.create_execution_context()
    self.batch_size = batch_size
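# Hedged companion sketch (an assumption, not the author's code): a forward()
# method for the same class, using the buffers allocated in __init__ with the
# legacy synchronous execute() call.
def forward(self, batch):
    import numpy as np
    import pycuda.driver as cuda
    # Size the host output from the engine's second binding, as __init__ did.
    dims = self.engine.get_binding_dimensions(1).to_DimsCHW()
    out = np.empty((self.batch_size, dims.C() * dims.H() * dims.W()),
                   dtype=np.float32)
    # Copy input to device, run synchronously, copy results back.
    cuda.memcpy_htod(self.d_src, np.ascontiguousarray(batch, dtype=np.float32))
    self.ctx.execute(self.batch_size, [int(self.d_src), int(self.d_dst)])
    cuda.memcpy_dtoh(out, self.d_dst)
    return out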
def parse_uff_model(self, uff_model=None, uff_path=None):
    assert uff_model or uff_path, \
        "Must pass in either a UFF model or the path to a UFF model on disk"
    if uff_path:
        with open(uff_path, 'rb') as uff_file:
            uff_model = uff_file.read()
    parser = uffparser.create_uff_parser()
    parser.register_input(self.model_input_name, (3, 224, 224), 0)  # input_1
    parser.register_output(self.model_output_name)  # dense_2/Sigmoid
    engine = trt.utils.uff_to_trt_engine(logger=trt_logger,
                                         stream=uff_model,
                                         parser=parser,
                                         max_batch_size=MAX_BATCH_SIZE,
                                         max_workspace_size=MAX_WORKSPACE_SIZE,
                                         datatype=TRT_DATATYPE)
    context = engine.create_execution_context()
    return context
def main():
    tf_freeze_model = 'car_series/frozen_graph.pb'
    input_node = 'input'
    out_node = 'InceptionV4/Logits/Predictions'
    # Convert the frozen TensorFlow model to UFF, then to a TensorRT engine.
    uff_model = uff.from_tensorflow_frozen_model(tf_freeze_model, [out_node])
    parser = uffparser.create_uff_parser()
    parser.register_input(input_node, (CHANNEL, INPUT_H, INPUT_W), 0)
    parser.register_output(out_node)
    engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                         MAX_BATCHSIZE, MAX_WORKSPACE)
    trt.utils.write_engine_to_file("car_series/car_series_tensorrt.engine",
                                   engine.serialize())
def main():
    MAX_WORKSPACE = 1 << 30
    MAX_BATCHSIZE = 1
    # If the output_filename argument is given, this returns None; otherwise
    # it returns the serialized UFF model data.
    uff_model = uff.from_tensorflow_frozen_model(
        frozen_model_path, frozen_node_name
    )  # , output_filename=UFF_PATH, text=True, list_nodes=True)
    parser = uffparser.create_uff_parser()
    # Input order 0 means NCHW; 1 means NHWC.
    parser.register_input(frozen_input_name, NET_INPUT_IMAGE_SHAPE, 0)
    parser.register_output(frozen_node_name[0])
    engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                         MAX_BATCHSIZE, MAX_WORKSPACE)
    # Save the engine.
    trt.utils.write_engine_to_file(ENGINE_PATH, engine.serialize())
    assert engine
    # parser.destroy()
    context = engine.create_execution_context()
    print("\n| TEST CASE | PREDICTION |")
    pair = imgTestData[0]
    correct = 0
    for img, label in pair:
        output = infer(context, img, 1)
        # The frozen graph outputs logits; convert them to softmax here.
        softmax = np.exp(output) / np.sum(np.exp(output))
        predict = np.argmax(softmax)
        if int(label) == predict:
            correct += 1
        print("|-------|--------|"
              "--------------------------------------------------------")
        print("| " + str(label) + " | " + str(predict) + " | "
              + str(['{:.2f}%'.format(i * 100) for i in softmax]) + " ")
    accuracy = correct / len(pair)
    print("Accuracy = ", accuracy)
def _create_engine(self, modelstream, **kwargs):
    '''Helper to create the engine when building from models.'''
    self.log_info("Parsing Model from {}".format(self.src_framework))
    if self.src_framework == "uff":
        parser = uffparser.create_uff_parser()
        for k, v in kwargs["input_nodes"].items():
            parser.register_input(k, v, 0)
        for o in kwargs["output_nodes"]:
            parser.register_output(o)
        if modelstream:
            self.engine = trt.utils.uff_to_trt_engine(
                self.logger, modelstream, parser, self.max_batch_size,
                self.max_workspace_size, self.data_type,
                None,  # TODO: figure out if plugins are supported in UFF
                kwargs.get("calibrator", None))
        else:
            self.engine = trt.utils.uff_file_to_trt_engine(
                self.logger, kwargs["path"], parser, self.max_batch_size,
                self.max_workspace_size, self.data_type,
                None,  # TODO: figure out if plugins are supported in UFF
                kwargs.get("calibrator", None))
        parser.destroy()
    elif self.src_framework == "caffe":
        self.engine = trt.utils.caffe_to_trt_engine(
            self.logger, kwargs["deployfile"], kwargs["modelfile"],
            self.max_batch_size, self.max_workspace_size,
            kwargs["output_nodes"], self.data_type,
            kwargs.get("plugins", None), kwargs.get("calibrator", None))
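# Hedged usage sketch (an assumption): how _create_engine might be invoked
# for a UFF model stream. `builder` stands in for an instance of the owning
# class, and the node names and input shape here are placeholders, not taken
# from the source.
builder._create_engine(modelstream,
                       input_nodes={"input": (3, 224, 224)},
                       output_nodes=["prob"])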
def init_models(use_device, model_file):
    if use_device == "GPU":
        os.environ['CUDA_VISIBLE_DEVICES'] = Config.TEST_GPU_ID
    # Load the UFF model.
    uff_model = open(model_file, 'rb').read()
    parser = uffparser.create_uff_parser()
    parser.register_input("input_images", (3, 768, 768), 0)
    parser.register_output("feature_fusion/concat_3")
    # Create the inference engine and context (akin to a session).
    trt_logger = trt.infer.ConsoleLogger(trt.infer.LogSeverity.INFO)
    engine = trt.utils.uff_to_trt_engine(
        logger=trt_logger,
        stream=uff_model,
        parser=parser,
        max_batch_size=1,  # 1 sample at a time
        max_workspace_size=1 << 20,  # 1 MiB GPU memory workspace
        datatype=trt.infer.DataType.FLOAT)  # precision can be set here
    context = engine.create_execution_context()
    return context
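# Hedged sketch (an assumption, not from the source): a synchronous inference
# helper for the context returned by init_models(), following the same
# pycuda + context.enqueue() pattern the other snippets in this collection
# use. `output_size` (the flat float32 output element count) is a placeholder.
def run_inference(context, image_chw, output_size):
    import numpy as np
    import pycuda.driver as cuda
    img = np.ascontiguousarray(image_chw, dtype=np.float32)
    out = np.empty(output_size, dtype=np.float32)
    d_in = cuda.mem_alloc(img.nbytes)
    d_out = cuda.mem_alloc(out.nbytes)
    stream = cuda.Stream()
    cuda.memcpy_htod_async(d_in, img, stream)   # host -> device
    context.enqueue(1, [int(d_in), int(d_out)], stream.handle, None)
    cuda.memcpy_dtoh_async(out, d_out, stream)  # device -> host
    stream.synchronize()
    return out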
def createTrtFromUFF(modelpath):
    MAX_WORKSPACE = 1 << 30
    G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.INFO)
    parser = uffparser.create_uff_parser()
    parser.register_input("enc_text", (1, VOC_LEN, 1), 0)
    parser.register_input("dec_text", (1, VOC_LEN, 1), 1)
    parser.register_input("h0_in", (1, DIM, 1), 2)
    parser.register_input("c0_in", (1, DIM, 1), 3)
    parser.register_input("h1_in", (1, DIM, 1), 4)
    parser.register_input("c1_in", (1, DIM, 1), 5)
    parser.register_output("h0_out")
    parser.register_output("c0_out")
    parser.register_output("h1_out")
    parser.register_output("c1_out")
    parser.register_output("final_output")
    engine = trt.utils.uff_file_to_trt_engine(G_LOGGER, modelpath, parser,
                                              MAX_BATCHSIZE, MAX_WORKSPACE,
                                              trt.infer.DataType.FLOAT)
    print('[ChatBot] Successfully created TensorRT engine from file '
          + modelpath)
    return engine
def main():
    args = parse_args()
    # Convert pb to UFF.
    uff_model = uff.from_tensorflow_frozen_model(args.pb_path,
                                                 [args.output_node])
    # Create the UFF parser and logger.
    parser = uffparser.create_uff_parser()
    INPUT_SIZE = [3, args.image_size, args.image_size]
    parser.register_input(args.input_node, INPUT_SIZE, 0)
    parser.register_output(args.output_node)
    G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.INFO)
    # Convert UFF to a serialized plan.
    if args.calib_images_dir:
        calibration_files = [os.path.join(args.calib_images_dir, i)
                             for i in os.listdir(args.calib_images_dir)]
    else:
        calibration_files = []
    batchstream = ImageBatchStream(args.max_batch_size, calibration_files,
                                   INPUT_SIZE)
    int8_calibrator = PythonEntropyCalibrator([args.input_node], batchstream)
    if args.int8:
        engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                             args.max_batch_size,
                                             args.max_workspace,
                                             datatype=trt.infer.DataType.INT8,
                                             calibrator=int8_calibrator)
    else:
        engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                             args.max_batch_size,
                                             args.max_workspace)
    trt.utils.write_engine_to_file(args.engine_path, engine.serialize())
def create_graph(self):
    uff_model = uff.from_tensorflow_frozen_model(
        self.model_file, ['InceptionResnetV2/Logits/Predictions'])
    G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
    parser = uffparser.create_uff_parser()
    parser.register_input('input_image', (3, 512, 512), 0)
    parser.register_output('InceptionResnetV2/Logits/Predictions')
    engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                         1, 1 << 32)
    parser.destroy()
    runtime = trt.infer.create_infer_runtime(G_LOGGER)
    self.context = engine.create_execution_context()
    self.output = np.empty(len(self.id2name), dtype=np.float32)
    # Device buffers: float32 input of 3*512*512, one float32 per class.
    self.d_input = cuda.mem_alloc(1 * 512 * 512 * 3 * 4)
    self.d_output = cuda.mem_alloc(1 * len(self.id2name) * 4)
    self.bindings = [int(self.d_input), int(self.d_output)]
    self.stream = cuda.Stream()
def create_engine(name, model_path, height, width, input_layer='image',
                  output_layer='Openpose/concat_stage7', half16=False):
    if not os.path.exists(name):
        # Load your newly created TensorFlow frozen model and convert it to UFF.
        uff_model = uff.from_tensorflow_frozen_model(model_path,
                                                     [output_layer])
        with open(name.replace('engine', 'uff'), 'wb') as dump:
            dump.write(uff_model)
        # Create a UFF parser to parse the UFF stream.
        parser = uffparser.create_uff_parser()
        parser.register_input(input_layer, (3, height, width), 0)
        parser.register_output(output_layer)
        # Build the TensorRT inference engine. This step performs (1) tensor
        # fusion, (2) reduced precision, (3) target autotuning, and
        # (4) tensor memory management.
        engine = trt.utils.uff_to_trt_engine(
            G_LOGGER, uff_model, parser, 1, 1 << 20,
            datatype=trt.infer.DataType.FLOAT
            if not half16 else trt.infer.DataType.HALF)
        trt.utils.write_engine_to_file(name, engine.serialize())
    else:
        engine = trt.utils.load_engine(G_LOGGER, name)
    return engine
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from random import randint  # generate a random test case
from PIL import Image
import time
# import system tools
import os

import uff
import tensorrt as trt
from tensorrt.parsers import uffparser

trt.utils.get_uff_version()
parser = uffparser.create_uff_parser()


def get_uff_required_version(parser):
    return (str(parser.get_uff_required_version_major()) + '.'
            + str(parser.get_uff_required_version_minor()) + '.'
            + str(parser.get_uff_required_version_patch()))


if trt.utils.get_uff_version() != get_uff_required_version(parser):
    raise ImportError("ERROR: the installed UFF version does not match the "
                      "version required by TensorRT")

# STARTER_LEARNING_RATE = 1e-4
BATCH_SIZE = 10
NUM_CLASSES = 10
MAX_STEPS = 1000
def main():
    train_X = get_data()
    tensorrt_input = train_X.reshape(3, 28, 28).astype(np.float32)
    X = tf.placeholder("float", shape=[1, 28, 28, 3])
    h_conv1 = forward_prop(X)
    # saver = tf.train.Saver()
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)
    tf.train.write_graph(sess.graph_def, '.', 'hellotensor.pbtxt')
    final_result = sess.run(h_conv1, feed_dict={X: train_X})
    # saver.save(sess, './hellotensor.ckpt')
    output_graph_name = './hellotensor.pb'
    output_node_names = 'Conv2D'
    output_graph_def = graph_util.convert_variables_to_constants(
        sess, sess.graph_def, output_node_names.split(","))
    output_graph_def = tf.graph_util.remove_training_nodes(output_graph_def)
    uff_model = uff.from_tensorflow(output_graph_def, output_nodes=['Conv2D'])
    with open('slimConv.uff', 'wb') as dump:
        dump.write(uff_model)
    # with tf.gfile.GFile(output_graph_name, "wb") as f:
    #     f.write(output_graph_def.SerializeToString())
    uff_model = open("/home/dami/TensorRt_test/slimConv.uff", 'rb').read()
    G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
    parser = uffparser.create_uff_parser()
    parser.register_input("Placeholder", (3, 28, 28), 0)
    parser.register_output("Conv2D")
    engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                         1, 1 << 20)
    parser.destroy()
    runtime = trt.infer.create_infer_runtime(G_LOGGER)
    context = engine.create_execution_context()
    dims_data = engine.get_binding_dimensions(0).to_DimsCHW()
    dims_out1 = engine.get_binding_dimensions(1).to_DimsCHW()
    _out0 = np.empty(dims_data.C() * dims_data.H() * dims_data.W(),
                     dtype=np.float32)
    _out1 = np.empty(dims_out1.C() * dims_out1.H() * dims_out1.W(),
                     dtype=np.float32)
    d_out0 = cuda.mem_alloc(1 * _out0.size * _out0.dtype.itemsize)
    d_out1 = cuda.mem_alloc(1 * _out1.size * _out1.dtype.itemsize)
    bindings = [int(d_out0), int(d_out1)]
    stream = cuda.Stream()
    # Transfer input data to the device.
    cuda.memcpy_htod_async(d_out0, tensorrt_input, stream)
    # Execute the model.
    context.enqueue(1, bindings, stream.handle, None)
    # Transfer predictions back.
    cuda.memcpy_dtoh_async(_out1, d_out1, stream)
    # Synchronize threads.
    stream.synchronize()
    if _out1.shape != final_result.shape:
        results = final_result.reshape(_out1.shape)
        print(str(compare_arrays(results, _out1)))
        print(sumArray(_out1))
        print(sumArray(results))
    context.destroy()
    engine.destroy()
    runtime.destroy()
def main(unused_argv):
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
    start_time = time.time()
    folder = 'output/'
    if not os.path.exists(folder):
        try:
            os.makedirs(folder)
        except OSError:
            pass
    print(img_path)
    test_image = np.asarray(misc.imread(img_path), dtype=np.float32)
    pp_image = test_image
    print(pp_image.shape)
    x_d = pp_image.shape[0] - tile_size + 1
    y_d = pp_image.shape[1] - tile_size + 1
    # Get a patch for every pixel in the image, unless separation > 1.
    if separation > 1:
        i_sep = int(separation)
        x_values = np.arange(0, x_d, i_sep)
        y_values = np.arange(0, y_d, i_sep)
        if (x_d - 1) % i_sep != 0:
            x_values = np.append(x_values, x_d - 1)
        if (y_d - 1) % i_sep != 0:
            y_values = np.append(y_values, y_d - 1)
    else:
        x_values = np.arange(0, x_d)
        y_values = np.arange(0, y_d)
    for x in x_values:
        for y in y_values:
            input_pipe_l.append(test_image[x:(x + tile_size),
                                           y:(y + tile_size)])
    # input_g has a shape of (num_patches, tile_size, tile_size).
    input_g = np.asarray(input_pipe_l, dtype=np.float32)
    print('Input pipeline constructed.')
    print('Input shape: ' + str(input_g.shape))
    pred_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": input_g},
                                                       num_epochs=1,
                                                       shuffle=False)
    # Lana's code for using the UFF model.
    print("====================================")
    start_time = time.time()
    num_tiles = np.size(input_g, 0)
    # Test code for visual confirmation:
    # num_tiles = 4
    img1 = np.ascontiguousarray(test_image[216:244, 216:244])
    img2 = np.ascontiguousarray(test_image[244:272, 216:244])
    img3 = np.ascontiguousarray(test_image[216:244, 244:272])
    img4 = np.ascontiguousarray(test_image[244:272, 244:272])
    imgs = np.array([img1, img2, img3, img4])
    print("Processing " + str(num_tiles) + " tiles")
    # General inference setup.
    uff_model = open('uff_no_reshape.uff', 'rb').read()
    G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
    parser = uffparser.create_uff_parser()
    parser.register_input("Reshape", (1, tile_size, tile_size), 0)
    parser.register_output("output_score/output_relu")
    engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                         1, 1 << 20)
    parser.destroy()
    runtime = trt.infer.create_infer_runtime(G_LOGGER)
    # Allocate device memory.
    nn_in = np.ascontiguousarray(input_g[0, :, :])
    nn_out = np.empty(tile_size * tile_size * 2, dtype=np.float32)
    d_input = cuda.mem_alloc(1 * nn_in.size * nn_in.dtype.itemsize)
    d_output = cuda.mem_alloc(1 * nn_out.size * nn_out.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    out_imgs_ch1 = np.empty([tile_size, tile_size, num_tiles])
    out_imgs_ch2 = np.empty([tile_size, tile_size, num_tiles])
    for i in range(0, num_tiles):
        if num_tiles == 4:
            nn_in = imgs[i]
        else:
            nn_in = np.ascontiguousarray(input_g[i, :, :])
        context = engine.create_execution_context()
        # Transfer input data to the device.
        cuda.memcpy_htod_async(d_input, nn_in, stream)
        # Execute the model.
        context.enqueue(1, bindings, stream.handle, None)
        # Transfer predictions back.
        cuda.memcpy_dtoh_async(nn_out, d_output, stream)
        # Synchronize threads.
        stream.synchronize()
        out_ch1 = np.reshape(nn_out[0:tile_size * tile_size],
                             (tile_size, tile_size))
        out_ch2 = np.reshape(nn_out[tile_size * tile_size:
                                    tile_size * tile_size * 2],
                             (tile_size, tile_size))
        context.destroy()
        out_imgs_ch1[:, :, i] = out_ch1
        out_imgs_ch2[:, :, i] = out_ch2
        # make_image(out_ch1, folder + img_name + str(i) + "_uff_out.png")
    # General inference cleanup.
    new_engine = trt.utils.load_engine(G_LOGGER, "./tf_mnist.engine")
    engine.destroy()
    new_engine.destroy()
    runtime.destroy()
    current_time = time.time() - start_time
    print("Inference complete. Time elapsed: %f seconds." % current_time)
    out0 = out_imgs_ch1[:, :, 0]
    out1 = out_imgs_ch1[:, :, 1]
    out2 = out_imgs_ch1[:, :, 2]
    out3 = out_imgs_ch1[:, :, 3]
    out_top = np.hstack((out0, out1))
    out_btm = np.hstack((out2, out3))
    out_final = np.vstack((out_top, out_btm))
    make_image(out_final, folder + img_name + "_uff_out_ch1.png")
    out0 = out_imgs_ch2[:, :, 0]
    out1 = out_imgs_ch2[:, :, 1]
    out2 = out_imgs_ch2[:, :, 2]
    out3 = out_imgs_ch2[:, :, 3]
    out_top = np.hstack((out0, out1))
    out_btm = np.hstack((out2, out3))
    out_final = np.vstack((out_top, out_btm))
    make_image(out_final, folder + img_name + "_uff_out_ch2.png")
    print("====================================")
def __init__(self, FLAGS, darknet=None):
    self.ntrain = 0
    if isinstance(FLAGS, dict):
        from ..defaults import argHandler
        newFLAGS = argHandler()
        newFLAGS.setDefaults()
        newFLAGS.update(FLAGS)
        FLAGS = newFLAGS
    self.FLAGS = FLAGS
    if self.FLAGS.tensor:
        with open(self.FLAGS.metaLoad, 'r') as fp:
            self.meta = json.load(fp)
        self.framework = create_framework(self.meta, self.FLAGS)
        MODEL_FILE = '/home/sergey/darkflow/built_graph/tiny-yolo-voc.uff'
        INPUT_NAME = "input"
        INPUT_SHAPE = (3, 416, 416)
        OUTPUT_NAME = "BiasAdd_8"
        TRT_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
        from tensorrt.parsers import uffparser
        parser = uffparser.create_uff_parser()
        parser.register_input(INPUT_NAME, INPUT_SHAPE, 0)
        parser.register_output(OUTPUT_NAME)
        import uff
        m = uff.from_tensorflow_frozen_model(MODEL_FILE, [OUTPUT_NAME])
        engine = trt.utils.uff_to_trt_engine(TRT_LOGGER, m, parser,
                                             1, 1 << 20)
        print(engine)
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in engine:
            size = (trt.volume(engine.get_binding_shape(binding))
                    * engine.max_batch_size)
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers.
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to the device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        context = engine.create_execution_context()
        img = cv2.imread(
            '/home/sergey/darkflow/sample_img/sample_computer.jpg')
        img = self.framework.resize_input(img)
        h, w, _ = img.shape  # capture dims before transposing to CHW
        img = img.transpose(2, 0, 1)
        img = img.ravel()
        np.copyto(inputs[0].host, img)
        [cuda.memcpy_htod_async(inp.device, inp.host, stream)
         for inp in inputs]
        # Run inference.
        context.execute_async(batch_size=1, bindings=bindings,
                              stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        [cuda.memcpy_dtoh_async(out.host, out.device, stream)
         for out in outputs]
        # Synchronize the stream.
        stream.synchronize()
        [result] = [out.host for out in outputs]
        print(result)
        np.savetxt('/home/sergey/q.txt', result.ravel())
        out = result.reshape((13, 13, 125))
        boxes = self.framework.findboxes(out)
        threshold = self.FLAGS.threshold
        boxesInfo = list()
        for box in boxes:
            tmpBox = self.framework.process_box(box, h, w, threshold)
            if tmpBox is None:
                continue
            boxesInfo.append({
                "label": tmpBox[4],
                "confidence": tmpBox[6],
                "topleft": {"x": tmpBox[0], "y": tmpBox[2]},
                "bottomright": {"x": tmpBox[1], "y": tmpBox[3]},
            })
        print(boxesInfo)
        return
    if self.FLAGS.pbLoad and self.FLAGS.metaLoad:
        self.say('\nLoading from .pb and .meta')
        self.graph = tf.Graph()
        device_name = FLAGS.gpuName if FLAGS.gpu > 0.0 else None
        with tf.device(device_name):
            with self.graph.as_default() as g:
                self.build_from_pb()
        return
    if darknet is None:
        darknet = Darknet(FLAGS)
        self.ntrain = len(darknet.layers)
    self.darknet = darknet
    args = [darknet.meta, FLAGS]
    self.num_layer = len(darknet.layers)
    self.framework = create_framework(*args)
    self.meta = darknet.meta
    self.say('\nBuilding net ...')
    start = time.time()
    self.graph = tf.Graph()
    device_name = FLAGS.gpuName if FLAGS.gpu > 0.0 else None
    with tf.device(device_name):
        with self.graph.as_default() as g:
            self.build_forward()
    self.setup_meta_ops()
    self.say('Finished in {}s\n'.format(time.time() - start))
import tensorrt as trt
import uff
from tensorrt.parsers import uffparser

G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.INFO)
uff_model = uff.from_tensorflow_frozen_model("final.pb", ["dense_2/Softmax"])
INFERENCE_BATCH_SIZE = 256
parser = uffparser.create_uff_parser()
parser.register_input("conv2d_1_input", (1, 28, 28), 0)
parser.register_output("dense_2/Softmax")
engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser,
                                     INFERENCE_BATCH_SIZE, 1 << 20,
                                     trt.infer.DataType.FLOAT)
trt.utils.write_engine_to_file("test_engine.engine", engine.serialize())
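# Hedged usage sketch (an assumption, not in the original): deserialize
# test_engine.engine and classify one (1, 28, 28) float32 image with the same
# legacy API. The 10-element output is an assumption about dense_2/Softmax,
# and `sample` is a placeholder input, not real data.
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit

engine = trt.utils.load_engine(G_LOGGER, "test_engine.engine")
context = engine.create_execution_context()
sample = np.zeros((1, 28, 28), dtype=np.float32)  # placeholder input image
probs = np.empty(10, dtype=np.float32)            # assumed 10-class softmax
d_in = cuda.mem_alloc(sample.nbytes)
d_out = cuda.mem_alloc(probs.nbytes)
stream = cuda.Stream()
cuda.memcpy_htod_async(d_in, sample, stream)      # host -> device
context.enqueue(1, [int(d_in), int(d_out)], stream.handle, None)
cuda.memcpy_dtoh_async(probs, d_out, stream)      # device -> host
stream.synchronize()
print("predicted digit:", int(np.argmax(probs)))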