def _apply_graph_transform_tool_rewrites(g, input_node_names,
                                         output_node_names):
    # type: (gde.Graph, List[str], List[str]) -> tf.GraphDef
    """
    Run a fixed series of pre-deployment rewrites through the
    [Graph Transform Tool](
    https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/
    graph_transforms/README.md).

    Args:
      g: GDE representation of the core graph.
      input_node_names: Names of placeholder nodes that are used as inputs to
        the graph for inference. Placeholders NOT on this list will be
        considered dead code.
      output_node_names: Names of nodes that produce tensors that are outputs
        of the graph for inference purposes. Nodes not necessary to produce
        these tensors will be considered dead code.

    Returns:
      GraphDef representation of rewritten graph.
    """
    # The set of transforms recommended in the README under "Optimizing for
    # Deployment"; they are handed to the tool in this order.
    deployment_transforms = [
        'strip_unused_nodes(type=float, shape="1,299,299,3")',
        'remove_nodes(op=Identity, op=CheckNumerics)',
        'fold_constants(ignore_errors=true)',
        'fold_batch_norms',
        'fold_old_batch_norms',
    ]

    # The Graph Transform Tool is invoked through the undocumented Python APIs
    # under tensorflow.tools.graph_transforms.
    return graph_transforms.TransformGraph(
        g.to_graph_def(),
        inputs=input_node_names,
        outputs=output_node_names,
        transforms=deployment_transforms)
def _gtt_transforms(graph_def, input_names, output_names, initializer_names,
                    transforms):
    """Run the given Graph Transform Tool transforms over a GraphDef.

    Args:
      graph_def: A GraphDef proto to be transformed.
      input_names: Names of input nodes.
      output_names: Names of output nodes.
      initializer_names: Dictionary of the "infrastructural" nodes
        (initializers, save and restore ops, etc.) that should be retained
        even if they are not transitively reachable from output nodes. The
        keys in this dictionary indicate the collection where these nodes
        were obtained from.
      transforms: A list of strings naming the graph transforms to be applied
        in order.

    Returns:
      The transformed GraphDef. When `transforms` is empty this is a copy of
      `graph_def`.
    """
    if transforms:
        # Flatten every collection's node names into one sorted list; these
        # are passed as extra outputs so the transforms keep them.
        retained_names = []
        for collection_names in initializer_names.values():
            retained_names.extend(collection_names)
        retained_names.sort()

        return _graph_transforms.TransformGraph(
            graph_def, input_names, output_names + retained_names,
            transforms)

    # Nothing to apply: hand back a copy rather than the caller's proto.
    untouched_copy = _graph_pb2.GraphDef()
    untouched_copy.CopyFrom(graph_def)
    return untouched_copy
def main(_):
    """Export a model graph file as a SavedModel with one predict signature.

    Checks the command line and flags, parses the GraphDef stored at
    FLAGS.import_path, runs it through the Graph Transform Tool, imports the
    result into the session graph and saves a SavedModel under
    FLAGS.export_dir + '/' + FLAGS.model_version. When the arguments are
    unusable a message is printed and the process exits with status -1.
    """
    no_positional_args = len(sys.argv) < 2 or sys.argv[-1].startswith('-')
    if no_positional_args:
        print(
            'Usage: model_graph_to_saved_model.py [--model_version=y] import_path export_dir'
        )
        sys.exit(-1)
    if FLAGS.import_path == '':
        print(
            'Please specify the path to the model graph you want to convert to SavedModel format.'
        )
        sys.exit(-1)
    if FLAGS.model_version <= 0:
        print('Please specify a positive value for version number.')
        sys.exit(-1)

    export_path = FLAGS.export_dir + '/' + str(FLAGS.model_version)

    with tf.Session() as sess:
        # Import model graph.
        graph_def = tf.GraphDef()
        with tf.gfile.GFile(FLAGS.import_path, 'rb') as input_file:
            graph_def.ParseFromString(input_file.read())

        # Apply transform optimizations.
        output_graph = graph_transforms.TransformGraph(
            graph_def, [INPUTS], [OUTPUTS], [OPTIMIZATION])

        # NOTE(review): the context manager returned here is never entered,
        # so this call looks like a no-op -- confirm before removing it.
        sess.graph.as_default()
        tf.import_graph_def(output_graph, name='')

        # Build the signature_def_map from the fixed tensor names
        # 'input:0' and 'predict:0'.
        build_tensor_info = tf.saved_model.utils.build_tensor_info
        in_image = sess.graph.get_tensor_by_name('input:0')
        out_classes = sess.graph.get_tensor_by_name('predict:0')
        signature = tf.saved_model.signature_def_utils.build_signature_def(
            inputs={INPUTS: build_tensor_info(in_image)},
            outputs={OUTPUTS: build_tensor_info(out_classes)},
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
        signature_def_map = {
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                signature
        }

        # Save out the SavedModel.
        print('Exporting trained model to', export_path)
        builder = tf.saved_model.builder.SavedModelBuilder(export_path)
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map=signature_def_map)
        builder.save()

    print('Done!')
def _do_transforms(graph_def, input_names, output_names, initializer_names,
                   transforms, saver_def=None, checkpoint_path=None):
    """Apply the requested transforms to a GraphDef, freezing if asked to.

    The Graph Transform Tool is run on the transforms before and after a
    'freeze_graph' entry, with graph freezing performed in between.

    Args:
      graph_def: A GraphDef proto to be transformed.
      input_names: Names of input nodes.
      output_names: Names of output nodes.
      initializer_names: Names of "infrastructural" nodes (initializers, save
        and restore ops, etc.) that should be retained even if they are not
        transitively reachable from output nodes.
      transforms: A list of strings naming the graph transforms to be applied
        in order. These transform names are exactly those supported by the
        Graph Transform Tool, with the addition of the 'freeze_graph'
        transform.
      saver_def: A SaverDef proto used for restoring a checkpoint during
        freezing, if needed (default None).
      checkpoint_path: A path to a checkpoint to restore during freezing, if
        needed (default None).

    Returns:
      The transformed GraphDef.
    """
    if not transforms:
        # Nothing requested: return a copy of the input proto.
        passthrough = _graph_pb2.GraphDef()
        passthrough.CopyFrom(graph_def)
        return passthrough

    if 'freeze_graph' not in transforms:
        # No freeze_graph requested, so do all transforms in one go, keeping
        # the infrastructural nodes alive by listing them as outputs.
        return _graph_transforms.TransformGraph(
            graph_def, input_names, output_names + initializer_names,
            transforms)

    # freeze_graph requested, possibly with transforms before and after.
    # Only the first occurrence splits the list; later ones are handled by
    # the recursive call on the tail.
    freeze_at = transforms.index('freeze_graph')
    before_freeze = transforms[:freeze_at]
    after_freeze = transforms[freeze_at + 1:]

    graph_def = _do_transforms(graph_def, input_names, output_names,
                               initializer_names, before_freeze, saver_def,
                               checkpoint_path)

    frozen_output_nodes = [_op_name(name) for name in output_names]
    graph_def = _freeze_graph_with_def_protos(
        graph_def, frozen_output_nodes, saver_def, checkpoint_path)

    # No need for saver or checkpoint anymore, and no initializer names are
    # passed on to the post-freeze phase.
    return _do_transforms(graph_def, input_names, output_names, [],
                          after_freeze)
def inference(image_path, input_graph):
    """Classify one JPEG image with a frozen graph and print the result.

    The image is decoded with TensorFlow, passed through `preprocess` when it
    is not already 224x224x3, and fed to the 'input:0' tensor of the graph
    loaded from `input_graph` (after the Graph Transform Tool has been applied
    to it). The arg-max of the 'predict:0' tensor and its label are printed.

    Args:
      image_path: Path of the JPEG file to classify.
      input_graph: Path of the serialized GraphDef (.pb) to run.
    """
    # Read the image in jpeg format. The session is only needed to evaluate
    # the decode op, so it is closed as soon as the array is available.
    img = tf.read_file(image_path)
    input_op = tf.image.decode_jpeg(img, channels=3)
    with tf.Session() as sess:
        image = sess.run(input_op)

    if image.shape[0] != 224 or image.shape[1] != 224 or image.shape[2] != 3:
        # If the shape is not 224*224*3, preprocess the image (e.g. resize).
        image = preprocess(image)
    image = image.reshape(1, 224, 224, 3)

    # Configure the inference session.
    infer_config = tf.ConfigProto()
    infer_config.intra_op_parallelism_threads = 26
    infer_config.inter_op_parallelism_threads = 1
    infer_config.use_per_session_threads = 1

    # Read the pb model into its own graph.
    infer_graph = tf.Graph()
    with infer_graph.as_default():
        graph_def = tf.GraphDef()
        with tf.gfile.GFile(input_graph, 'rb') as input_file:
            graph_def.ParseFromString(input_file.read())

        output_graph = graph_transforms.TransformGraph(
            graph_def, [INPUTS], [OUTPUTS], [OPTIMIZATION])
        tf.import_graph_def(output_graph, name='')

    # Input and output tensors of the imported graph.
    input_tensor = infer_graph.get_tensor_by_name('input:0')
    output_tensor = infer_graph.get_tensor_by_name('predict:0')

    # Use the session as a context manager so it is released even when the
    # run fails.
    with tf.Session(graph=infer_graph, config=infer_config) as infer_sess:
        predictions = infer_sess.run(output_tensor, {input_tensor: image})

    top_class = np.argmax(predictions)
    print(top_class)
    # The label table is indexed one below the predicted class id.
    print("This image belong to : \"{}\"".format(
        word_label.label[top_class - 1]))
def run(self):
    """Run the benchmark with the optimized graph.

    Builds a data graph (ImageNet validation data when
    `self.args.data_location` is set, otherwise random synthetic images) and
    an inference graph loaded from `self.args.input_graph` and rewritten by
    the Graph Transform Tool. Depending on `self.args.accuracy_only` it then
    either times `self.args.steps` iterations (excluding
    `self.args.warmup_steps` from the average) or accumulates top-1 / top-5
    accuracy over the validation set. All results are printed.

    Raises:
      ValueError: if accuracy is requested without `data_location`, because
        synthetic data carries no labels.
    """
    print("Run inference")
    batch_size = self.args.batch_size

    data_config = tf.ConfigProto()
    data_config.intra_op_parallelism_threads = self.args.data_num_intra_threads
    data_config.inter_op_parallelism_threads = self.args.data_num_inter_threads
    data_config.use_per_session_threads = 1

    infer_config = tf.ConfigProto()
    infer_config.intra_op_parallelism_threads = self.args.num_intra_threads
    infer_config.inter_op_parallelism_threads = self.args.num_inter_threads
    infer_config.use_per_session_threads = 1

    data_graph = tf.Graph()
    with data_graph.as_default():
        if self.args.data_location:
            print("Inference with real data.")
            dataset = datasets.ImagenetData(self.args.data_location)
            preprocessor = dataset.get_image_preprocessor()(
                RESNET_IMAGE_SIZE,
                RESNET_IMAGE_SIZE,
                batch_size,
                intra_threads=self.args.num_intra_threads,
                resize_method='crop')
            images, labels = preprocessor.minibatch(dataset,
                                                    subset='validation')
        else:
            print("Inference with dummy data.")
            input_shape = [batch_size, RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, 3]
            images = tf.random.uniform(input_shape, 0.0, 255.0,
                                       dtype=tf.float32,
                                       name='synthetic_images')

    infer_graph = tf.Graph()
    with infer_graph.as_default():
        # Convert the frozen graph to the optimized graph.
        graph_def = tf.GraphDef()
        with tf.gfile.GFile(self.args.input_graph, 'rb') as input_file:
            graph_def.ParseFromString(input_file.read())
        output_graph = graph_transforms.TransformGraph(
            graph_def, [INPUTS], [OUTPUTS], [OPTIMIZATION])
        tf.import_graph_def(output_graph, name='')

    # Input and output tensors of the inference graph.
    input_tensor = infer_graph.get_tensor_by_name('input:0')
    output_tensor = infer_graph.get_tensor_by_name(
        'resnet_v1_101/predictions/Reshape_1:0')

    num_processed_images = 0
    num_remaining_images = IMAGENET_VALIDATION_IMAGES

    # Both sessions are context-managed so they are closed on every exit path.
    with tf.Session(graph=data_graph, config=data_config) as data_sess, \
            tf.Session(graph=infer_graph, config=infer_config) as infer_sess:
        if not self.args.accuracy_only:
            # Performance check.
            iteration = 0
            warm_up_iteration = self.args.warmup_steps
            total_run = self.args.steps
            total_time = 0

            while (num_remaining_images >= batch_size
                   and iteration < total_run):
                iteration += 1

                # Reads and preprocess data.
                data_load_start = time.time()
                image_np = data_sess.run(images)
                data_load_time = time.time() - data_load_start

                num_processed_images += batch_size
                num_remaining_images -= batch_size

                start_time = time.time()
                infer_sess.run([output_tensor],
                               feed_dict={input_tensor: image_np})
                time_consume = time.time() - start_time

                # Only add data loading time for real data, not for dummy
                # data.
                if self.args.data_location:
                    time_consume += data_load_time

                print('Iteration %d: %.3f sec' % (iteration, time_consume))
                if iteration > warm_up_iteration:
                    total_time += time_consume

            timed_iterations = iteration - warm_up_iteration
            if timed_iterations <= 0:
                # Every iteration was a warm-up one; there is nothing to
                # average and dividing would raise ZeroDivisionError.
                print('No iterations were timed: %d ran, %d are warm-up.' %
                      (iteration, warm_up_iteration))
                return

            time_average = total_time / timed_iterations
            print('Average time: %.3f sec' % (time_average))
            print('Batch size = %d' % batch_size)
            if batch_size == 1:
                print('Latency: %.3f ms' % (time_average * 1000))
            # Print throughput for both batch size 1 and 128.
            print('Throughput: %.3f images/sec' % (batch_size / time_average))
        else:
            # Accuracy check.
            if not self.args.data_location:
                raise ValueError(
                    'Accuracy measurement needs labelled data; '
                    'data_location is not set.')

            total_accuracy1, total_accuracy5 = (0.0, 0.0)

            while num_remaining_images >= batch_size:
                # Reads and preprocess data.
                np_images, np_labels = data_sess.run([images, labels])
                # Labels are shifted down by one before being compared with
                # the predictions.
                np_labels -= 1
                num_processed_images += batch_size
                num_remaining_images -= batch_size

                # Compute inference on the preprocessed data.
                start_time = time.time()
                predictions = infer_sess.run(output_tensor,
                                             {input_tensor: np_images})
                elapsed_time = time.time() - start_time

                # The accuracy ops live in a throwaway graph so that neither
                # long-lived graph grows from one iteration to the next.
                with tf.Graph().as_default():
                    accuracy1 = tf.reduce_sum(
                        tf.cast(
                            tf.nn.in_top_k(tf.constant(predictions),
                                           tf.constant(np_labels), 1),
                            tf.float32))
                    accuracy5 = tf.reduce_sum(
                        tf.cast(
                            tf.nn.in_top_k(tf.constant(predictions),
                                           tf.constant(np_labels), 5),
                            tf.float32))
                    with tf.Session() as accu_sess:
                        np_accuracy1, np_accuracy5 = accu_sess.run(
                            [accuracy1, accuracy5])

                total_accuracy1 += np_accuracy1
                total_accuracy5 += np_accuracy5

                # elapsed_time is measured in seconds; convert it so the
                # value matches the "ms" unit in the message.
                print("Iteration time: %0.4f ms" % (elapsed_time * 1000))
                print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \
                      % (num_processed_images,
                         total_accuracy1 / num_processed_images,
                         total_accuracy5 / num_processed_images))
def run(self):
    """Run the benchmark with the optimized graph.

    Builds a data graph (the ImageNet 'calibration' or 'validation' subset
    when `self.args.data_location` is set, otherwise random synthetic images)
    and an inference graph loaded from `self.args.input_graph` and rewritten
    by the Graph Transform Tool. Depending on `self.args.accuracy_only` it
    then either times up to `self.args.steps` iterations (excluding
    `self.args.warmup_steps` from the average) or accumulates top-1 / top-5
    accuracy over the subset. Per-image results are handed to
    `self.write_results_output` after every batch.

    Raises:
      ValueError: if accuracy is requested without `data_location`, because
        synthetic data carries no labels.
    """
    print("Run inference")
    batch_size = self.args.batch_size

    data_config = tf.ConfigProto()
    data_config.intra_op_parallelism_threads = self.args.data_num_intra_threads
    data_config.inter_op_parallelism_threads = self.args.data_num_inter_threads
    data_config.use_per_session_threads = 1

    infer_config = tf.ConfigProto()
    infer_config.intra_op_parallelism_threads = self.args.num_intra_threads
    infer_config.inter_op_parallelism_threads = self.args.num_inter_threads
    infer_config.use_per_session_threads = 1

    data_graph = tf.Graph()
    with data_graph.as_default():
        if self.args.data_location:
            print("Inference with real data.")
            subset = 'calibration' if self.args.calibrate else 'validation'
            dataset = datasets.ImagenetData(self.args.data_location)
            preprocessor = dataset.get_image_preprocessor()(
                RESNET_IMAGE_SIZE,
                RESNET_IMAGE_SIZE,
                batch_size,
                num_cores=self.args.num_cores,
                resize_method='crop')
            images, labels, filenames = preprocessor.minibatch(dataset,
                                                               subset=subset)

            # If a results file path is provided, then start the prediction
            # output file with its header row.
            if self.args.results_file_path:
                with open(self.args.results_file_path, "w+") as fp:
                    fp.write("filename,actual,prediction\n")
        else:
            print("Inference with dummy data.")
            input_shape = [batch_size, RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, 3]
            images = tf.random.uniform(input_shape, 0.0, 255.0,
                                       dtype=tf.float32,
                                       name='synthetic_images')

    infer_graph = tf.Graph()
    with infer_graph.as_default():
        graph_def = tf.GraphDef()
        with tf.gfile.GFile(self.args.input_graph, 'rb') as input_file:
            graph_def.ParseFromString(input_file.read())
        output_graph = graph_transforms.TransformGraph(
            graph_def, [INPUTS], [OUTPUTS], [OPTIMIZATION])
        tf.import_graph_def(output_graph, name='')

    # Input and output tensors of the inference graph.
    input_tensor = infer_graph.get_tensor_by_name('input:0')
    output_tensor = infer_graph.get_tensor_by_name('predict:0')

    num_processed_images = 0
    if self.args.data_location:
        num_remaining_images = (
            dataset.num_examples_per_epoch(subset=subset)
            - num_processed_images)
    else:
        num_remaining_images = batch_size * self.args.steps

    # Both sessions are context-managed so they are closed on every exit path.
    with tf.Session(graph=data_graph, config=data_config) as data_sess, \
            tf.Session(graph=infer_graph, config=infer_config) as infer_sess:
        if not self.args.accuracy_only:
            iteration = 0
            warm_up_iteration = self.args.warmup_steps
            total_run = self.args.steps
            total_time = 0

            while (num_remaining_images >= batch_size
                   and iteration < total_run):
                iteration += 1
                tf_filenames = None
                np_labels = None

                # Labels and file names are only fetched when they will be
                # written to the results file.
                data_load_start = time.time()
                if self.args.results_file_path:
                    image_np, np_labels, tf_filenames = data_sess.run(
                        [images, labels, filenames])
                else:
                    image_np = data_sess.run(images)
                data_load_time = time.time() - data_load_start

                num_processed_images += batch_size
                num_remaining_images -= batch_size

                start_time = time.time()
                predictions = infer_sess.run(
                    output_tensor, feed_dict={input_tensor: image_np})
                time_consume = time.time() - start_time

                # Write out the file name, expected label, and top prediction.
                self.write_results_output(predictions, tf_filenames,
                                          np_labels)

                # Only add data loading time for real data, not for dummy
                # data.
                if self.args.data_location:
                    time_consume += data_load_time

                print('Iteration %d: %.6f sec' % (iteration, time_consume))
                if iteration > warm_up_iteration:
                    total_time += time_consume

            timed_iterations = iteration - warm_up_iteration
            if timed_iterations <= 0:
                # Every iteration was a warm-up one; there is nothing to
                # average and dividing would raise ZeroDivisionError.
                print('No iterations were timed: %d ran, %d are warm-up.' %
                      (iteration, warm_up_iteration))
                return

            time_average = total_time / timed_iterations
            print('Average time: %.6f sec' % (time_average))
            print('Batch size = %d' % batch_size)
            if batch_size == 1:
                print('Latency: %.3f ms' % (time_average * 1000))
            # Print throughput for both batch size 1 and 128.
            print('Throughput: %.3f images/sec' % (batch_size / time_average))
        else:
            # Accuracy check.
            if not self.args.data_location:
                raise ValueError(
                    'Accuracy measurement needs labelled data; '
                    'data_location is not set.')

            total_accuracy1, total_accuracy5 = (0.0, 0.0)

            while num_remaining_images >= batch_size:
                # Reads and preprocess data.
                tf_filenames = None
                if self.args.results_file_path:
                    np_images, np_labels, tf_filenames = data_sess.run(
                        [images, labels, filenames])
                else:
                    np_images, np_labels = data_sess.run([images, labels])

                num_processed_images += batch_size
                num_remaining_images -= batch_size

                # Compute inference on the preprocessed data.
                start_time = time.time()
                predictions = infer_sess.run(output_tensor,
                                             {input_tensor: np_images})
                elapsed_time = time.time() - start_time

                # Write out the file name, expected label, and top prediction.
                self.write_results_output(predictions, tf_filenames,
                                          np_labels)

                # The accuracy ops live in a throwaway graph so that neither
                # long-lived graph grows from one iteration to the next.
                with tf.Graph().as_default():
                    accuracy1 = tf.reduce_sum(
                        tf.cast(
                            tf.nn.in_top_k(tf.constant(predictions),
                                           tf.constant(np_labels), 1),
                            tf.float32))
                    accuracy5 = tf.reduce_sum(
                        tf.cast(
                            tf.nn.in_top_k(tf.constant(predictions),
                                           tf.constant(np_labels), 5),
                            tf.float32))
                    with tf.Session() as accu_sess:
                        np_accuracy1, np_accuracy5 = accu_sess.run(
                            [accuracy1, accuracy5])

                total_accuracy1 += np_accuracy1
                total_accuracy5 += np_accuracy5

                # elapsed_time is measured in seconds; convert it so the
                # value matches the "ms" unit in the message.
                print("Iteration time: %0.4f ms" % (elapsed_time * 1000))
                print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \
                      % (num_processed_images,
                         total_accuracy1 / num_processed_images,
                         total_accuracy5 / num_processed_images))
def tf_run_const_folding(self, file):
    """Constant-fold a graph file and write the result to disk.

    The graph is loaded with `self.import_graph`. Every output tensor of a
    'Placeholder' op is used as a transform input. The transform outputs are
    the input tensors of every "sink" op, i.e. an op other than 'NoOp' whose
    outputs are not consumed by any other op. The graph is then run through
    the "fold_constants" and "strip_unused_nodes" transforms and written, in
    binary form, under "./" as "<file without .pb>.const_folded.pb"; that
    name is also stored in `self.folded_graph`.

    Args:
      file: Path of the graph file to fold. A trailing ".pb" is dropped when
        building the output name; any other name is used unchanged.
    """
    print("run const folding----------------------------")
    tf.reset_default_graph()
    graph_def, graph = self.import_graph(file)
    print()
    if self.debug:
        print('Placeholders:')
    assert graph is not None
    ops = graph.get_operations()  # type: Iterable[tf.Operation]

    # Transform inputs: all tensors produced by placeholders.
    input_nodes = []
    for op in ops:
        if op.type != 'Placeholder':
            continue
        for tensor in op.outputs:
            if self.debug:
                print('- {0:20s} {1}'.format("Tensor", tensor.name))
            input_nodes.append(tensor.name)

    if self.debug:
        print()
        print('Sinks (operations without outputs):')

    # Names of all ops that feed at least one other op. A tensor name has the
    # form "<op name>:<output index>", so the part before ':' is the producer.
    consumed_ops = set()
    for op in ops:
        for tensor in op.inputs:
            consumed_ops.add(tensor.name.split(':')[0])

    # Transform outputs: the inputs of every sink op. NoOp sinks are skipped
    # and their control inputs are not followed.
    last_outputs = []
    for op in ops:
        if op.name in consumed_ops or op.type == 'NoOp':
            continue
        if self.debug:
            print('- {0:20s} {1}'.format(op.type, op.name))
        for tensor in op.inputs:
            if self.debug:
                print('<-in-- {0:20s}'.format(tensor.name))
            last_outputs.append(tensor.name)

    print(input_nodes)
    print(last_outputs)

    # NOTE(review): this imports graph_def again under an empty name scope and
    # its result is not used -- confirm whether it is still needed.
    tf.import_graph_def(graph_def, name="")
    g_def_const = graph_transforms.TransformGraph(
        graph_def, input_nodes, last_outputs,
        ["fold_constants", "strip_unused_nodes"])

    print()
    # Only strip the extension when it really is ".pb"; blindly cutting three
    # characters would mangle any other file name.
    base_name = file[:-len(".pb")] if file.endswith(".pb") else file
    self.folded_graph = base_name + ".const_folded.pb"
    print("Saving Const-folded Graph... as " + self.folded_graph)
    graph_io.write_graph(as_text=False, name=self.folded_graph, logdir="./",
                         graph_or_graph_def=g_def_const)
    print("Finished.")
def run(self):
    """Run the benchmark with the optimized graph in a single session.

    The graph stored at `self.args.input_graph` is rewritten by the Graph
    Transform Tool and imported, and the input pipeline (ImageNet validation
    data when `self.args.data_location` is set, otherwise random synthetic
    images) is built in the same graph. Depending on
    `self.args.accuracy_only` the method either times up to 40 iterations
    (the first 10 are warm-up and excluded from the average) or accumulates
    top-1 / top-5 accuracy, optionally appending per-image predictions to
    `self.args.results_file_path`.

    Raises:
      ValueError: if accuracy is requested without `data_location`, because
        synthetic data carries no labels or file names.
    """
    batch_size = self.args.batch_size

    with tf.Graph().as_default() as graph:
        config = tf.ConfigProto()
        config.allow_soft_placement = True
        config.intra_op_parallelism_threads = self.args.num_intra_threads
        config.inter_op_parallelism_threads = self.args.num_inter_threads

        with tf.Session(config=config) as sess:
            # Convert the frozen graph to the optimized graph.
            graph_def = tf.GraphDef()
            with tf.gfile.GFile(self.args.input_graph, 'rb') as input_file:
                graph_def.ParseFromString(input_file.read())
            output_graph = graph_transforms.TransformGraph(
                graph_def, [INPUTS], [OUTPUTS], [OPTIMIZATION])
            tf.import_graph_def(output_graph, name='')

            # Input and output tensors of the imported graph.
            input_tensor = graph.get_tensor_by_name('input:0')
            output_tensor = graph.get_tensor_by_name('predict:0')

            num_processed_images = 0
            num_remaining_images = IMAGENET_VALIDATION_IMAGES

            if self.args.data_location:
                print("Inference with real data.")
                dataset = datasets.ImagenetData(self.args.data_location)
                preprocessor = preprocessing.ImagePreprocessor(
                    RESNET_IMAGE_SIZE,
                    RESNET_IMAGE_SIZE,
                    batch_size,
                    1,  # device count
                    tf.float32,  # data_type for input fed to the graph
                    train=False,  # doing inference
                    resize_method='crop')
                images, labels, filenames = preprocessor.minibatch(
                    dataset, subset='validation')
                num_remaining_images = (
                    dataset.num_examples_per_epoch(subset='validation')
                    - num_processed_images)
            else:
                print("Inference with dummy data.")
                input_shape = [batch_size, RESNET_IMAGE_SIZE,
                               RESNET_IMAGE_SIZE, 3]
                images = tf.random.uniform(input_shape, 0.0, 255.0,
                                           dtype=tf.float32,
                                           name='synthetic_images')

            if not self.args.accuracy_only:
                # Performance check.
                iteration = 0
                warm_up_iteration = 10
                total_run = 40
                total_time = 0

                while (num_remaining_images >= batch_size
                       and iteration < total_run):
                    iteration += 1

                    # Reads and preprocess data. Real data comes back as a
                    # list whose first element is fed to the model.
                    if self.args.data_location:
                        image_np = sess.run(images[0])
                    else:
                        image_np = sess.run(images)

                    num_processed_images += batch_size
                    num_remaining_images -= batch_size

                    start_time = time.time()
                    sess.run([output_tensor],
                             feed_dict={input_tensor: image_np})
                    time_consume = time.time() - start_time

                    print('Iteration %d: %.3f sec' %
                          (iteration, time_consume))
                    if iteration > warm_up_iteration:
                        total_time += time_consume

                timed_iterations = iteration - warm_up_iteration
                if timed_iterations <= 0:
                    # Too little data to get past warm-up: there is nothing
                    # to average and dividing would raise ZeroDivisionError.
                    print('No iterations were timed: %d ran, %d are warm-up.'
                          % (iteration, warm_up_iteration))
                    return

                time_average = total_time / timed_iterations
                print('Average time: %.3f sec' % (time_average))
                print('Batch size = %d' % batch_size)
                if batch_size == 1:
                    print('Latency: %.3f ms' % (time_average * 1000))
                # Print throughput for both batch size 1 and 128.
                print('Throughput: %.3f images/sec' %
                      (batch_size / time_average))
            else:
                # Accuracy check.
                if not self.args.data_location:
                    raise ValueError(
                        'Accuracy measurement needs labelled data; '
                        'data_location is not set.')

                total_accuracy1, total_accuracy5 = (0.0, 0.0)

                # If a results file path is provided, then start the
                # prediction output file with its header row.
                if self.args.results_file_path:
                    with open(self.args.results_file_path, "w+") as fp:
                        fp.write("filename,actual,prediction\n")

                while num_remaining_images >= batch_size:
                    # Reads and preprocess data.
                    np_images, np_labels, tf_filenames = sess.run(
                        [images[0], labels[0], filenames[0]])

                    num_processed_images += batch_size
                    num_remaining_images -= batch_size

                    # Compute inference on the preprocessed data.
                    predictions = sess.run(output_tensor,
                                           {input_tensor: np_images})

                    # Write out the file name, expected label, and top
                    # prediction.
                    if self.args.results_file_path:
                        top_predictions = np.argmax(predictions, 1)
                        with open(self.args.results_file_path, "a") as fp:
                            for filename, expected_label, top_prediction in \
                                    zip(tf_filenames, np_labels,
                                        top_predictions):
                                fp.write("{},{},{}\n".format(
                                    filename, expected_label, top_prediction))

                    # Build the accuracy ops in a throwaway graph. Creating
                    # them in the model graph would add new constants and ops
                    # to it on every iteration.
                    with tf.Graph().as_default():
                        accuracy1 = tf.reduce_sum(
                            tf.cast(
                                tf.nn.in_top_k(tf.constant(predictions),
                                               tf.constant(np_labels), 1),
                                tf.float32))
                        accuracy5 = tf.reduce_sum(
                            tf.cast(
                                tf.nn.in_top_k(tf.constant(predictions),
                                               tf.constant(np_labels), 5),
                                tf.float32))
                        with tf.Session(config=config) as accu_sess:
                            np_accuracy1, np_accuracy5 = accu_sess.run(
                                [accuracy1, accuracy5])

                    total_accuracy1 += np_accuracy1
                    total_accuracy5 += np_accuracy5

                    print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \
                          % (num_processed_images,
                             total_accuracy1 / num_processed_images,
                             total_accuracy5 / num_processed_images))
def main(_):
    """Compare a frozen model before and after two rounds of graph rewrites.

    Steps:
      1. Clear the outputs of previous runs and obtain a frozen graph (the
         Keras or the slim variant, depending on `_USE_KERAS`).
      2. Apply TensorFlow's built-in Graph Transform Tool rewrites.
      3. Apply the GraphDef editor's batch-norm folding rewrites.
      4. Print the node count of each graph, then run all three graphs on the
         same 224x224 image with `run_graph`.

    Every intermediate graph is written to disk with `_protobuf_to_file`.
    """
    # Remove any detritus of previous runs of this script, but leave the temp
    # dir in place because the user might have a shell there.
    if not os.path.isdir(_TMP_DIR):
        os.mkdir(_TMP_DIR)
    _clear_dir(_SAVED_MODEL_DIR)
    for stale_file in _AFTER_MODEL_FILES:
        if os.path.isfile(stale_file):
            os.remove(stale_file)

    # Obtain a frozen graph for a MobileNet model.
    if _USE_KERAS:
        frozen_graph_def, input_node, output_node = get_keras_frozen_graph()
    else:
        frozen_graph_def, input_node, output_node = get_slim_frozen_graph()

    _protobuf_to_file(frozen_graph_def, _FROZEN_GRAPH_FILE, "Frozen graph")

    # Now run through some of TensorFlow's built-in graph rewrites.
    # For that we use the undocumented Python APIs under
    # tensorflow.tools.graph_transforms
    after_tf_rewrites_graph_def = graph_transforms.TransformGraph(
        frozen_graph_def,
        inputs=[input_node],
        outputs=[output_node],
        # Use the set of transforms recommended in the README under
        # "Optimizing for Deployment"
        transforms=[
            'strip_unused_nodes(type=float, shape="1,299,299,3")',
            'remove_nodes(op=Identity, op=CheckNumerics)',
            'fold_constants(ignore_errors=true)',
            'fold_batch_norms',
            'fold_old_batch_norms'
        ])

    _protobuf_to_file(after_tf_rewrites_graph_def,
                      _TF_REWRITES_GRAPH_FILE,
                      "Graph after built-in TensorFlow rewrites")

    # Now run the GraphDef editor's fold_batch_norms_up() rewrite, preceded
    # by its other two batch-norm folding rewrites.
    g = gde.Graph(after_tf_rewrites_graph_def)
    gde.rewrite.fold_batch_norms(g)
    gde.rewrite.fold_old_batch_norms(g)
    gde.rewrite.fold_batch_norms_up(g)
    after_gde_graph_def = g.to_graph_def(add_shapes=True)

    _protobuf_to_file(after_gde_graph_def,
                      _GDE_REWRITES_GRAPH_FILE,
                      "Graph after fold_batch_norms_up() rewrite")

    # Dump some statistics about the number of each type of op.
    print(" Number of ops in frozen graph: {}".format(
        len(frozen_graph_def.node)))
    print(" Number of ops after built-in rewrites: {}".format(
        len(after_tf_rewrites_graph_def.node)))
    print("Number of ops after GDE rewrites: {}".format(
        len(after_gde_graph_def.node)))

    # Run model before and after rewrite and compare results.
    if not os.path.exists(_PANDA_PIC_FILE):
        print("Downloading {} to {}".format(_PANDA_PIC_URL, _PANDA_PIC_FILE))
        urllib.request.urlretrieve(_PANDA_PIC_URL, _PANDA_PIC_FILE)

    # np.float64 is what the removed `np.float` alias resolved to. The image
    # file is closed once the resized copy has been turned into an array.
    with PIL.Image.open(_PANDA_PIC_FILE) as panda_pic:
        img = np.array(panda_pic.resize((224, 224))).astype(np.float64)

    # The channel means are only reported; the image is fed as-is.
    channel_means = np.mean(img, axis=(0, 1))
    print("Channel means are: {}".format(channel_means))
    print("Image shape is {}".format(img.shape))

    print("Frozen graph results:")
    run_graph(frozen_graph_def, img, input_node, output_node)
    print("Results after built-in rewrites:")
    run_graph(after_tf_rewrites_graph_def, img, input_node, output_node)
    print("Results after GDE rewrites:")
    run_graph(after_gde_graph_def, img, input_node, output_node)
input_nodes.append(tensor.name) print() print('Sinks (operations without outputs):') last_outputs = [] num_nodes = len(ops) name2nodeIdx_map = {} for i in range(num_nodes): name2nodeIdx_map[ops[i].name] = i node_outputs_ = [[] for i in range(num_nodes)] for n in range(num_nodes): # if len(ops[n].outputs) > 0: # last_outputs.append(ops[n].outputs[0]) op = ops[n] pending_count = len(op.inputs) for i in range(pending_count): input_name_id = op.inputs[i].name.split(':') node_outputs_[name2nodeIdx_map[input_name_id[0]]].append(n) for n in range(num_nodes): if len(node_outputs_[n]) == 0 and ops[n].type != 'NoOp' and ops[n].type != 'Assert': print('- {0:20s} {1}'.format(ops[n].type, ops[n].name)) last_outputs.append(ops[n].outputs[0].name) g_def_const = tf.import_graph_def(graph_def, name="") g_def_const = graph_transforms.TransformGraph(graph_def, input_nodes, last_outputs, ["fold_constants", "strip_unused_nodes", "merge_duplicate_nodes", "sort_by_execution_order"]) print() folded_graph = args.file[:-3] + ".const_folded.pb" print("Saving Const-folded Graph... as " + folded_graph) graph_io.write_graph(as_text=False, name=folded_graph, logdir="./",graph_or_graph_def=g_def_const) print("Finished.")
def main(_):
    """Optimize a frozen graph or a SavedModel with the Graph Transform Tool.

    With FLAGS.savedmodel_dir set, the default serving signature of the first
    meta graph supplies the input and output node names, the SavedModel is
    frozen to 'frozen_graph.pb' inside that directory, and the optimized
    graph is written next to it. Otherwise FLAGS.frozen_graph_path is read
    and FLAGS.input_names / FLAGS.output_names (comma separated) supply the
    node names. In both cases the result is written as 'optimized_graph.pb'.

    Raises:
      AttributeError: if neither a frozen graph nor a SavedModel directory is
        given, or if a frozen graph is given without input and output names.
    """
    # Validate the flags before touching them: the name flags may be unset
    # when a SavedModel directory is used.
    if not (FLAGS.frozen_graph_path or FLAGS.savedmodel_dir):
        raise AttributeError(
            'Either path to the frozen graph or directory of the SavedModel must be provided!'
        )
    if FLAGS.frozen_graph_path and not (FLAGS.input_names
                                        and FLAGS.output_names):
        raise AttributeError(
            'Input and output tensor names must be provided along with frozen graph path!'
        )

    if FLAGS.savedmodel_dir:
        path_to_pb = os.path.join(FLAGS.savedmodel_dir, 'saved_model.pb')
        saved_model = graph_def_util.saved_model_pb2.SavedModel()
        graph_def_util.read_def(path_to_pb, saved_model, 'The SavedModel')
        signature_def = saved_model.meta_graphs[0].signature_def[
            tf.saved_model.signature_constants.
            DEFAULT_SERVING_SIGNATURE_DEF_KEY]

        # Tensor names look like "<node>:<index>"; keep the node part. Using
        # split() leaves a name without ':' intact instead of dropping its
        # last character.
        input_names = sorted([
            item.name.split(':')[0]
            for _, item in signature_def.inputs.items()
        ])
        output_names = sorted([
            item.name.split(':')[0]
            for _, item in signature_def.outputs.items()
        ])

        frozen_graph_path = os.path.join(FLAGS.savedmodel_dir,
                                         'frozen_graph.pb')
        output_graph_def = freeze_graph.freeze_graph(
            input_graph=None,
            input_saver=None,
            input_checkpoint=None,
            input_binary=True,
            clear_devices=True,
            output_node_names=', '.join(output_names),
            restore_op_name=None,
            filename_tensor_name=None,
            output_graph=frozen_graph_path,
            initializer_nodes=None,
            input_saved_model_dir=FLAGS.savedmodel_dir)
        output_dir = FLAGS.savedmodel_dir
    else:
        # The checks above guarantee that frozen_graph_path and both name
        # flags are set on this branch.
        input_names = sorted(FLAGS.input_names.replace(' ', '').split(','))
        output_names = sorted(FLAGS.output_names.replace(' ', '').split(','))

        output_graph_def = graph_def_util.graph_pb2.GraphDef()
        graph_def_util.read_def(FLAGS.frozen_graph_path, output_graph_def,
                                'The frozen graph')
        output_dir = os.path.dirname(FLAGS.frozen_graph_path)

    # If you want to apply only 'optimize_for_inference' uncomment the
    # following, but don't forget to remove 'graph_transforms' optimization
    # since they are not compatible.
    #
    # output_graph_def = optimize_for_inference_lib.optimize_for_inference(
    #     input_graph_def=output_graph_def,
    #     placeholder_type_enum=tf.float32.as_datatype_enum,
    #     input_node_names=input_names, output_node_names=output_names)

    transforms = [
        'strip_unused_nodes(type=float, shape="1,299,299,3")',
        'remove_nodes(op=Identity, op=CheckNumerics)',
        'fold_constants(ignore_errors=true)',
        'fold_batch_norms',
        'fold_old_batch_norms',
        'quantize_weights',
        'quantize_nodes'
    ]

    output_graph_def = graph_transforms.TransformGraph(
        input_graph_def=output_graph_def,
        transforms=transforms,
        inputs=input_names,
        outputs=output_names)

    optimized_graph_path = os.path.join(output_dir, 'optimized_graph.pb')
    graph_def_util.write_def(optimized_graph_path, output_graph_def)
def run_inference(tfConfigParams, images, image_path):
    """Load a frozen graph, transform it, and time repeated inference runs.

    The graph file ``FLAGS.frozen_graph`` is read from the models directory
    under ``$APP_HOME``, passed through the Graph Transform Tool with the
    module-level ``OPTIMIZATION`` transform, imported, warmed up for
    ``_WARMUP_NUM_LOOPS`` runs and then executed ``FLAGS.iterations`` times.

    Args:
        tfConfigParams: Mapping of session options. Only the keys
            ``inter_op_parallelism_threads``, ``intra_op_parallelism_threads``
            and ``allow_soft_placement`` are applied; others are ignored.
        images: Value fed to the graph's input tensor on every run.
        image_path: Path of the image; only its basename is used, in the
            demo-mode CSV rows.

    Returns:
        A list with one CSV-formatted string per timed iteration:
        ``"<start>,<end>"``, or in demo mode (environment variable ``DEMO``
        equal to ``"True"``)
        ``"<start>,<end>,<image name>"`` followed by the top-5
        ``<class index>,<score>`` pairs in descending score order.

    Raises:
        ValueError: If ``FLAGS.precision`` is neither ``'int8'`` nor ``'fp32'``.
    """
    model_dir = os.path.join(os.environ['APP_HOME'], "Modules",
                             "Deep-Learning", "packages", "models")

    # Fail early on an unknown precision instead of hitting an unbound local
    # variable once the graph is being transformed.
    if FLAGS.precision == 'int8':
        input_name = 'input'
        output_name = 'predict'
    elif FLAGS.precision == 'fp32':
        input_name = 'input'
        output_name = 'resnet_v1_50/SpatialSqueeze'
    else:
        raise ValueError(
            "Unsupported precision %r; expected 'int8' or 'fp32'"
            % (FLAGS.precision,))

    device = "/" + FLAGS.aarch + ":0" if len(FLAGS.aarch) > 1 else "/cpu:0"

    # A missing DEMO variable is treated as "not demo mode".
    demo_mode = os.environ.get("DEMO") == "True"
    image_name = os.path.basename(image_path)

    timing_csv_buffer_data = []

    # open the device to run on
    with tf.device(device):
        # Nothing is written to the CSV file here; opening it in append mode
        # only makes sure it exists. Close it right away so the handle is not
        # leaked.
        with open(FLAGS.csv_file_path, "a"):
            pass

        # prepare the config
        with tf.Graph().as_default() as graph:
            config = tf.ConfigProto()
            for option in ("inter_op_parallelism_threads",
                           "intra_op_parallelism_threads",
                           "allow_soft_placement"):
                if option in tfConfigParams:
                    setattr(config, option, tfConfigParams[option])

            # open a tensorflow session and load the graph
            with tf.Session(config=config) as sess:
                graph_def = tf.GraphDef()
                with tf.gfile.GFile(model_dir + "/" + FLAGS.frozen_graph,
                                    'rb') as input_file:
                    graph_def.ParseFromString(input_file.read())

                output_graph = graph_transforms.TransformGraph(
                    graph_def, [input_name], [output_name], [OPTIMIZATION])
                tf.import_graph_def(output_graph, name='')

                # Input and output tensors of the imported graph
                input_tensor = graph.get_tensor_by_name(input_name + ':0')
                output_tensor = graph.get_tensor_by_name(output_name + ':0')

                # start inference
                tf.logging.info("Starting Warmup cycle")
                for _ in range(_WARMUP_NUM_LOOPS):
                    sess.run([output_tensor],
                             feed_dict={input_tensor: images})

                for _ in range(FLAGS.iterations):
                    tf.logging.info("Starting timing.")
                    tstart = time.time()
                    predicts = sess.run([output_tensor],
                                        feed_dict={input_tensor: images})
                    tend = time.time()

                    fields = [str(tstart), str(tend)]
                    if demo_mode:
                        predictions = np.squeeze(predicts)
                        fields.append(image_name)
                        # Indices of the five highest scores, best first.
                        for node_id in predictions.argsort()[-5:][::-1]:
                            fields.append(str(node_id))
                            fields.append(str(predictions[node_id]))
                    timing_csv_buffer_data.append(','.join(fields))

    return timing_csv_buffer_data
def _do_transforms(graph_def,
                   input_names,
                   output_names,
                   initializer_names,
                   transforms,
                   saver_def=None,
                   checkpoint_path=None):
    """Run the requested transforms over a GraphDef, freezing where asked.

    Graph Transform Tool passes are interleaved with graph freezing: the
    transform list is split around the 'freeze_graph' entry, the part before
    it is applied, the graph is frozen, and the part after it is applied.

    Args:
        graph_def: A GraphDef proto to be transformed.
        input_names: Names of input nodes.
        output_names: Names of output nodes.
        initializer_names: Dictionary of the "infrastructural" nodes
            (initializers, save and restore ops, etc.) that should be retained
            even if they are not transitively reachable from output nodes. The
            keys in this dictionary indicate the collection where these nodes
            were obtained from.
        transforms: A list of strings naming the graph transforms to be
            applied in order. These transform names are exactly those
            supported by the Graph Transform Tool, with the addition of the
            'freeze_graph' transform.
        saver_def: A SaverDef proto used for restoring a checkpoint during
            freezing, if needed (default None).
        checkpoint_path: A path to a checkpoint to restore during freezing,
            if needed (default None).

    Returns:
        The transformed GraphDef.
    """
    # Nothing to apply: hand back an independent copy of the input graph.
    if not transforms:
        graph_def_copy = _graph_pb2.GraphDef()
        graph_def_copy.CopyFrom(graph_def)
        return graph_def_copy

    # No freeze_graph requested, so do all transforms in one go.
    if _FREEZE_GRAPH_TRANSFORM_NAME not in transforms:
        retained_nodes = []
        for collection_nodes in initializer_names.values():
            retained_nodes.extend(collection_nodes)
        retained_nodes.sort()
        return _graph_transforms.TransformGraph(
            graph_def, input_names, output_names + retained_nodes, transforms)

    # freeze_graph requested, possibly with transforms before and after.
    split_at = transforms.index(_FREEZE_GRAPH_TRANSFORM_NAME)
    before_freeze = transforms[:split_at]
    after_freeze = transforms[split_at + 1:]

    graph_def = _do_transforms(graph_def, input_names, output_names,
                               initializer_names, before_freeze, saver_def,
                               checkpoint_path)

    table_key = _ops.GraphKeys.TABLE_INITIALIZERS
    legacy_init_key = _saved_model_constants.LEGACY_INIT_OP_KEY

    output_node_names = [_op_name(name) for name in output_names]
    graph_def = _freeze_graph_with_def_protos(
        graph_def, output_node_names,
        initializer_names[table_key],
        initializer_names[legacy_init_key][0],
        saver_def, checkpoint_path)

    # No need for saver or checkpoint anymore.
    # Freeze graph will prune all initializers and shared init nodes if table
    # initializers are not present. Handle this case in future GTT transforms.
    surviving_initializers = {}
    if initializer_names[table_key]:
        surviving_initializers[table_key] = initializer_names[table_key]
        surviving_initializers[legacy_init_key] = (
            initializer_names[legacy_init_key])

    return _do_transforms(graph_def, input_names, output_names,
                          surviving_initializers, after_freeze)
def run(self):
    """Run the benchmark, or the accuracy check, with the optimized graph.

    Two graphs are used: a data graph that produces image batches (ImageNet
    validation data when ``self.args.data_location`` is set, random synthetic
    images otherwise) and an inference graph loaded from
    ``self.args.input_graph`` and rewritten by the Graph Transform Tool.

    When ``self.args.accuracy_only`` is false, up to ``self.args.steps``
    batches are timed and average time, latency (batch size 1 only) and
    throughput are printed; iterations up to ``self.args.warmup_steps`` are
    excluded from the average. Otherwise running top-1 / top-5 accuracy over
    the validation set is printed after every batch.

    Raises:
        ValueError: If accuracy mode is requested without a data location
            (synthetic data has no labels).
    """
    # Local import: the session clean-up below needs it and this block cannot
    # rely on the module already importing contextlib.
    import contextlib

    print("Run inference")

    # Accuracy needs labels, which only the real dataset provides. Without
    # this check the accuracy loop would hit an undefined `labels` name.
    if self.args.accuracy_only and not self.args.data_location:
        raise ValueError(
            'Accuracy mode requires a data location; synthetic data has no labels.')

    data_config = tf.ConfigProto()
    data_config.intra_op_parallelism_threads = self.args.data_num_intra_threads
    data_config.inter_op_parallelism_threads = self.args.data_num_inter_threads
    data_config.use_per_session_threads = 1

    infer_config = tf.ConfigProto()
    infer_config.intra_op_parallelism_threads = self.args.num_intra_threads
    infer_config.inter_op_parallelism_threads = self.args.num_inter_threads
    infer_config.use_per_session_threads = 1

    data_graph = tf.Graph()
    with data_graph.as_default():
        if self.args.data_location:
            print("Inference with real data.")
            dataset = datasets.ImagenetData(self.args.data_location)
            preprocessor = dataset.get_image_preprocessor()(
                INCEPTION_V3_IMAGE_SIZE,
                INCEPTION_V3_IMAGE_SIZE,
                self.args.batch_size,
                num_cores=self.args.num_cores,
                resize_method='bilinear')
            images, labels = preprocessor.minibatch(dataset,
                                                    subset='validation')
        else:
            print("Inference with dummy data.")
            input_shape = [
                self.args.batch_size, INCEPTION_V3_IMAGE_SIZE,
                INCEPTION_V3_IMAGE_SIZE, 3
            ]
            images = tf.random.uniform(input_shape,
                                       0.0,
                                       255.0,
                                       dtype=tf.float32,
                                       name='synthetic_images')

    infer_graph = tf.Graph()
    with infer_graph.as_default():
        graph_def = tf.GraphDef()
        with tf.gfile.FastGFile(self.args.input_graph, 'rb') as input_file:
            graph_def.ParseFromString(input_file.read())

        output_graph = graph_transforms.TransformGraph(
            graph_def, [INPUTS], [OUTPUTS], [OPTIMIZATION])
        tf.import_graph_def(output_graph, name='')

    # Input and output tensors of the inference graph
    input_tensor = infer_graph.get_tensor_by_name('input:0')
    output_tensor = infer_graph.get_tensor_by_name('predict:0')

    num_processed_images = 0
    num_remaining_images = datasets.IMAGENET_NUM_VAL_IMAGES

    # closing() releases both sessions on every exit path without making
    # either of them the default session/graph.
    with contextlib.closing(
            tf.Session(graph=data_graph, config=data_config)) as data_sess, \
         contextlib.closing(
            tf.Session(graph=infer_graph, config=infer_config)) as infer_sess:

        if not self.args.accuracy_only:
            iteration = 0
            warm_up_iteration = self.args.warmup_steps
            total_run = self.args.steps
            total_time = 0

            while (num_remaining_images >= self.args.batch_size
                   and iteration < total_run):
                iteration += 1

                data_load_start = time.time()
                image_np = data_sess.run(images)
                data_load_time = time.time() - data_load_start

                num_processed_images += self.args.batch_size
                num_remaining_images -= self.args.batch_size

                start_time = time.time()
                infer_sess.run([output_tensor],
                               feed_dict={input_tensor: image_np})
                time_consume = time.time() - start_time

                # only add data loading time for real data, not for dummy data
                if self.args.data_location:
                    time_consume += data_load_time

                print('Iteration %d: %.6f sec' % (iteration, time_consume))
                if iteration > warm_up_iteration:
                    total_time += time_consume

            # With steps <= warmup_steps (or no batch run at all) there is
            # nothing to average; dividing anyway would raise
            # ZeroDivisionError or report nonsense.
            timed_iterations = iteration - warm_up_iteration
            if timed_iterations <= 0 or total_time <= 0:
                print('No timed iterations were run (iterations=%s, warmup_steps=%s); '
                      'skipping timing summary.' % (iteration, warm_up_iteration))
            else:
                time_average = total_time / timed_iterations
                print('Average time: %.6f sec' % (time_average))
                print('Batch size = %d' % self.args.batch_size)
                if self.args.batch_size == 1:
                    print('Latency: %.3f ms' % (time_average * 1000))
                print('Throughput: %.3f images/sec' %
                      (self.args.batch_size / time_average))

        else:  # accuracy check
            total_accuracy1, total_accuracy5 = (0.0, 0.0)

            while num_remaining_images >= self.args.batch_size:
                # Reads and preprocess data
                np_images, np_labels = data_sess.run([images, labels])
                num_processed_images += self.args.batch_size
                num_remaining_images -= self.args.batch_size

                # Compute inference on the preprocessed data
                predictions = infer_sess.run(output_tensor,
                                             {input_tensor: np_images})

                # The hit counts are evaluated in a throwaway graph so the
                # inference graph is left untouched.
                with tf.Graph().as_default():
                    accuracy1 = tf.reduce_sum(
                        tf.cast(
                            tf.nn.in_top_k(tf.constant(predictions),
                                           tf.constant(np_labels), 1),
                            tf.float32))
                    accuracy5 = tf.reduce_sum(
                        tf.cast(
                            tf.nn.in_top_k(tf.constant(predictions),
                                           tf.constant(np_labels), 5),
                            tf.float32))
                    with tf.Session() as accu_sess:
                        np_accuracy1, np_accuracy5 = accu_sess.run(
                            [accuracy1, accuracy5])

                total_accuracy1 += np_accuracy1
                total_accuracy5 += np_accuracy5

                print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \
                      % (num_processed_images,
                         total_accuracy1 / num_processed_images,
                         total_accuracy5 / num_processed_images))