def testFoldFusedBatchNormsWithSharedWeights(self):
    """Checks that folding removes FusedBatchNormV3 and keeps the output.

    For every (data format, convolution function) pairing, a graph is built
    with `_generate_fused_batchnorm` using a count of 2 (presumably two batch
    norms sharing weights -- confirm against the helper), folded, and then
    evaluated both before and after folding.
    """
    # NOTE(review): no explicit graph is passed to the sessions, so every
    # iteration appears to build on the same default graph -- confirm that
    # "output:0" resolves to the current iteration's node.
    cases = (
        ("NHWC", nn_ops.conv2d),
        ("NCHW", nn_ops.conv2d),
        ("NHWC", nn_ops.depthwise_conv2d_native),
        ("NCHW", nn_ops.depthwise_conv2d_native),
    )
    for data_format, conv2d_func in cases:
        # Build and evaluate the unoptimized graph.
        with tf.compat.v1.Session() as build_session:
            _generate_fused_batchnorm(data_format, conv2d_func, 2)
            source_graph_def = build_session.graph_def
            expected = build_session.run(["output:0"])

        folded_graph_def = fold_batch_norms.fold_batch_norms(source_graph_def)

        # Re-import the folded graph under a prefix and evaluate it.
        with tf.compat.v1.Session() as eval_session:
            importer.import_graph_def(
                folded_graph_def, input_map={}, name="optimized")
            actual = eval_session.run(["optimized/output:0"])

        self.assertAllClose(expected, actual, rtol=1e-04, atol=1e-06)

        # No fused batch norm op may survive the folding pass.
        for folded_node in folded_graph_def.node:
            self.assertNotEqual("FusedBatchNormV3", folded_node.op)
def testFoldFusedBatchNormWithBias(self):
    """Checks batch norm folding on graphs generated with a bias.

    After folding, the result must match the original graph's output, exactly
    one BiasAdd node must be present, and no FusedBatchNormV3 node may remain.
    """
    cases = (
        ("NHWC", nn_ops.conv2d),
        ("NHWC", nn_ops.depthwise_conv2d_native),
    )
    for data_format, conv2d_func in cases:
        # The source graph is built in a fresh graph for every case.
        with tf1.Session(graph=tf1.Graph()) as build_session:
            # Positional arguments: count=1, add_bias=True.
            _generate_fused_batchnorm(data_format, conv2d_func, 1, True)
            source_graph_def = build_session.graph_def
            expected = build_session.run(["output:0"])

        folded_graph_def = fold_batch_norms.fold_batch_norms(source_graph_def)

        with tf1.Session() as eval_session:
            importer.import_graph_def(
                folded_graph_def, input_map={}, name="optimized")
            actual = eval_session.run(["optimized/output:0"])

        self.assertAllClose(expected, actual, rtol=1e-04, atol=1e-06)

        bias_add_count = sum(
            1 for folded_node in folded_graph_def.node
            if folded_node.op == 'BiasAdd')
        self.assertEqual(bias_add_count, 1)

        for folded_node in folded_graph_def.node:
            self.assertNotEqual("FusedBatchNormV3", folded_node.op)
def testFoldBatchNorms(self):
    """Checks folding of BatchNormWithGlobalNormalization after a Conv2D.

    A small constant-only graph (conv2d followed by the legacy batch norm op)
    is evaluated before and after folding; the outputs must agree and the
    batch norm op must be gone from the folded graph.
    """

    def float_const(values, shape):
        # All constants in this test are float32 tensors built from lists.
        return constant_op.constant(
            np.array(values), shape=shape, dtype=dtypes.float32)

    with tf.compat.v1.Session() as build_session:
        input_op = float_const(
            [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6], [1, 1, 6, 2])
        weights_op = float_const(
            [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4], [1, 2, 2, 2])
        conv_op = nn_ops.conv2d(
            input_op, weights_op, [1, 1, 1, 1], padding="SAME",
            name="conv_op")
        mean_op = float_const([10, 20], [2])
        variance_op = float_const([0.25, 0.5], [2])
        beta_op = float_const([0.1, 0.6], [2])
        gamma_op = float_const([1.0, 2.0], [2])
        # The producer version is pinned to 8 before the legacy op is added.
        test_util.set_producer_version(ops.get_default_graph(), 8)
        gen_nn_ops._batch_norm_with_global_normalization(
            conv_op, mean_op, variance_op, beta_op, gamma_op, 0.00001, False,
            name="output")
        source_graph_def = build_session.graph_def
        expected = build_session.run(["output:0"])

    folded_graph_def = fold_batch_norms.fold_batch_norms(source_graph_def)

    with tf.compat.v1.Session() as eval_session:
        importer.import_graph_def(
            folded_graph_def, input_map={}, name="optimized")
        actual = eval_session.run(["optimized/output:0"])

    self.assertAllClose(expected, actual)

    for folded_node in folded_graph_def.node:
        self.assertNotEqual("BatchNormWithGlobalNormalization", folded_node.op)
def optimize_graph(graph, signature_def, output_graph, tf_version,
                   quantization_dtype=None, skip_op_check=False,
                   strip_debug_ops=False,
                   weight_shard_size_bytes=1024 * 1024 * 4):
    """Takes a Python Graph object and optimizes the graph.

    The graph is validated, run through Grappler, batch-norm folded, run
    through Grappler again for op fusing, passed through the custom fusing
    passes, validated again and finally handed to `extract_weights`.

    Args:
      graph: The frozen graph to optimize.
      signature_def: the SignatureDef of the inference graph.
      output_graph: The location of the output graph.
      tf_version: Tensorflow version of the input graph.
      quantization_dtype: An optional numpy dtype to quantize weights to for
        compression. Only np.uint8 and np.uint16 are supported.
      skip_op_check: Bool whether to skip the op check.
      strip_debug_ops: Bool whether to strip debug ops.
      weight_shard_size_bytes: Shard size (in bytes) of the weight files.
        The size of each weight file will be <= this value.

    Returns:
      The optimized graph, i.e. the same object that was passed to
      `extract_weights`.

    Raises:
      ValueError: If `validate` reports unsupported ops either before or
        after optimization.
    """
    # Add a collection 'train_op' so that Grappler knows the outputs.
    for output in signature_def.outputs.values():
        # Drop the ':<index>' suffix to get the operation name.
        name = output.name.split(':')[0]
        graph.add_to_collection('train_op', graph.get_operation_by_name(name))

    graph_def = graph.as_graph_def()

    unsupported = validate(graph_def.node, skip_op_check, strip_debug_ops)
    if unsupported:
        raise ValueError('Unsupported Ops in the model before optimization\n' +
                         ', '.join(unsupported))

    # first pass of grappler optimization, this is needed for batch norm
    # folding.
    config = config_pb2.ConfigProto()
    rewriter_config = config.graph_options.rewrite_options
    rewriter_config.optimizers[:] = [
        'pruning', 'constfold', 'arithmetic', 'dependency', 'pruning',
        'constfold', 'arithmetic', 'dependency'
    ]
    if strip_debug_ops:
        rewriter_config.optimizers.insert(0, 'debug_stripper')

    optimized_graph = _run_grappler(config, graph_def, graph, signature_def)

    # batch norm folding
    optimized_graph = fold_batch_norms.fold_batch_norms(optimized_graph)

    # set the device to CPU for all Conv2d and MatMul nodes, since grappler
    # remap optimizer only support FusedConv2D and FusedMatMul for CPU.
    for node in optimized_graph.node:
        if node.op in ('Conv2D', 'MatMul'):
            node.device = '/device:CPU:0'

    # rerun grappler to fuse conv2d/matmul
    config.graph_options.rewrite_options.optimizers[:] = [
        'remap', 'constfold', 'arithmetic', 'dependency'
    ]

    optimized_graph = _run_grappler(config, optimized_graph, graph,
                                    signature_def)
    optimized_graph = _remove_unused_control_flow_inputs(optimized_graph)

    # Because TF break the Prelu op into 6 ops, for performance we are
    # fusing those ops into a single prelu
    optimized_graph = fuse_prelu.fuse_ops_for_prelu(optimized_graph)

    # Because grappler does not support DepthwiseConv2d fusing, we have
    # implemented it here.
    optimized_graph = fuse_depthwise_conv2d.fuse_depthwise_conv2d(
        optimized_graph)

    # Since the grappler remap optimizer does not support prelu as the
    # activation function for _FusedConv2D op, we are doing it manually here.
    optimized_graph = fuse_prelu.fuse_prelu_with_fused_conv2d_or_matmul(
        optimized_graph)

    unsupported = validate(optimized_graph.node, skip_op_check,
                           strip_debug_ops)
    if unsupported:
        raise ValueError('Unsupported Ops in the model after optimization\n' +
                         ', '.join(unsupported))

    extract_weights(
        optimized_graph, output_graph, tf_version,
        signature_def, quantization_dtype, weight_shard_size_bytes)

    # Return the optimized graph; the original returned the function object
    # `optimize_graph` itself.
    return optimized_graph
def optimize_graph(graph, output_node_names, output_graph, tf_version,
                   quantization_dtype=None, skip_op_check=False,
                   strip_debug_ops=False):
    """Takes a Python Graph object and optimizes the graph.

    The graph is validated, its prelu ops are fused, it is run through
    Grappler, batch-norm folded, run through Grappler again for conv2d fusing,
    validated again and finally handed to `extract_weights`.

    Args:
      graph: The frozen graph to optimize.
      output_node_names: List of output node names.
      output_graph: The location of the output graph.
      tf_version: Tensorflow version of the input graph.
      quantization_dtype: An optional numpy dtype to quantize weights to for
        compression. Only np.uint8 and np.uint16 are supported.
      skip_op_check: Bool whether to skip the op check.
      strip_debug_ops: Bool whether to strip debug ops.

    Returns:
      The optimized graph, i.e. the same object that was passed to
      `extract_weights`.

    Raises:
      ValueError: If `validate` reports unsupported ops either before or
        after optimization.
    """
    fuse_prelu.register_prelu_func(graph)

    # Add a collection 'train_op' so that Grappler knows the outputs.
    for output in output_node_names:
        graph.add_to_collection('train_op',
                                graph.get_operation_by_name(output))

    graph_def = graph.as_graph_def()

    unsupported = validate(graph_def.node, skip_op_check, strip_debug_ops)
    if unsupported:
        raise ValueError('Unsupported Ops in the model before optimization\n' +
                         ', '.join(unsupported))

    # Because TF break the Prelu op into 6 ops, for performance we are
    # fusing those ops into a single prelu
    optimized_graph = fuse_prelu.fuse_ops_for_prelu(graph_def)

    # first pass of grappler optimization, this is needed for batch norm
    # folding.
    config = config_pb2.ConfigProto()
    rewriter_config = config.graph_options.rewrite_options
    rewriter_config.optimizers[:] = [
        'pruning', 'constfold', 'arithmetic', 'dependency', 'pruning',
        'constfold', 'arithmetic', 'dependency'
    ]
    if strip_debug_ops:
        rewriter_config.optimizers.insert(0, 'debug_stripper')

    optimized_graph = _run_grappler(config, optimized_graph, graph)

    # batch norm folding
    optimized_graph = fold_batch_norms.fold_batch_norms(optimized_graph)

    # set the device to CPU for all Conv2d nodes, since grappler remap
    # optimizer only support FusedConv2D for CPU.
    for node in optimized_graph.node:
        if node.op == 'Conv2D':
            node.device = '/device:CPU:0'

    # rerun grappler to fuse conv2d
    config.graph_options.rewrite_options.optimizers[:] = [
        'remap', 'constfold', 'arithmetic', 'dependency'
    ]

    optimized_graph = _run_grappler(config, optimized_graph, graph)

    # Since the grappler remap optimizer does not support prelu as the
    # activation function for _FusedConv2D op, we are doing it manually here.
    optimized_graph = fuse_prelu.fuse_prelu_with_fused_conv2d(optimized_graph)

    unsupported = validate(optimized_graph.node, skip_op_check,
                           strip_debug_ops)
    if unsupported:
        raise ValueError('Unsupported Ops in the model after optimization\n' +
                         ', '.join(unsupported))

    extract_weights(optimized_graph, output_graph, tf_version,
                    quantization_dtype)

    # Return the optimized graph; the original returned the function object
    # `optimize_graph` itself.
    return optimized_graph
def testFoldFusedBatchNorms(self):
    """Checks folding of FusedBatchNorm after conv2d / depthwise conv2d.

    For each (data format, convolution function) pairing a constant-only graph
    is built, evaluated, folded and evaluated again; the outputs must agree
    within tolerance and no FusedBatchNorm node may remain.
    """
    # NOTE(review): no explicit graph is passed to the sessions, so every
    # iteration appears to build on the same default graph -- confirm that
    # "output:0" resolves to the current iteration's node.

    def float_const(values, shape):
        # All constants in this test are float32 tensors built from lists.
        return constant_op.constant(
            np.array(values), shape=shape, dtype=dtypes.float32)

    cases = (
        ("NHWC", nn_ops.conv2d),
        ("NCHW", nn_ops.conv2d),
        ("NHWC", nn_ops.depthwise_conv2d_native),
        ("NCHW", nn_ops.depthwise_conv2d_native),
    )
    for data_format, conv2d_func in cases:
        with tf.compat.v1.Session() as build_session:
            # The input shape follows the requested data format.
            input_shape = (
                [1, 1, 6, 2] if data_format == "NHWC" else [1, 2, 1, 6])
            input_op = float_const(
                [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6], input_shape)

            # Regular and depthwise convolutions use different filters.
            uses_plain_conv = conv2d_func == nn_ops.conv2d
            if uses_plain_conv:
                filter_values = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4]
                filter_shape = [1, 2, 2, 2]
            else:
                filter_values = [1, 2, 0.3, 0.4]
                filter_shape = [1, 2, 2, 1]
            weights_op = float_const(filter_values, filter_shape)

            conv_op = conv2d_func(
                input_op, weights_op, [1, 1, 1, 1], padding="SAME",
                data_format=data_format, name="conv_op")
            mean_op = float_const([10, 20], [2])
            variance_op = float_const([0.25, 0.5], [2])
            beta_op = float_const([0.1, 0.6], [2])
            gamma_op = float_const([1.0, 2.0], [2])
            # The producer version is pinned to 9 before the op is added.
            ops.get_default_graph().graph_def_versions.producer = 9
            gen_nn_ops._fused_batch_norm(
                conv_op, gamma_op, beta_op, mean_op, variance_op, 0.00001,
                is_training=False, data_format=data_format, name="output")
            source_graph_def = build_session.graph_def
            expected = build_session.run(["output:0"])

        folded_graph_def = fold_batch_norms.fold_batch_norms(source_graph_def)

        with tf.compat.v1.Session() as eval_session:
            importer.import_graph_def(
                folded_graph_def, input_map={}, name="optimized")
            actual = eval_session.run(["optimized/output:0"])

        self.assertAllClose(expected, actual, rtol=1e-04, atol=1e-06)

        for folded_node in folded_graph_def.node:
            self.assertNotEqual("FusedBatchNorm", folded_node.op)