def testFuseDepthwiseConv2dNativeWithBiasAndActivation(self):
    """DepthwiseConv2D + bias + ReLU must collapse into one fused node.

    Builds a two-layer Keras model, freezes it, runs the depthwise fusing
    pass and checks that no standalone BiasAdd / DepthwiseConv2dNative /
    Relu node is left and that exactly one fused node carries both ops.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.DepthwiseConv2D(
            1, bias_initializer=tf.initializers.constant(0.25)),
        tf.keras.layers.ReLU(),
    ])
    tf.keras.backend.set_learning_phase(0)
    sample_input = tf.constant([1.0, 1.0], shape=[1, 1, 1, 2])

    @tf.function
    def execute_model(tensor):
        return model(tensor)

    concrete_fn = execute_model.get_concrete_function(sample_input)
    frozen_graph = tf_saved_model_conversion_v2._freeze_saved_model_v2(
        concrete_fn)
    fused_graph_def = fuse_depthwise_conv2d.fuse_depthwise_conv2d(
        frozen_graph.as_graph_def())

    # Ops that must have been absorbed into the fused node.
    absorbed_ops = ("BiasAdd", "DepthwiseConv2dNative", "Relu")
    fused_nodes = []
    for node in fused_graph_def.node:
        for op_name in absorbed_ops:
            self.assertNotEqual(op_name, node.op)
        if node.op == graph_rewrite_util.FUSED_DEPTHWISE_CONV2D:
            fused_nodes.append(node)

    self.assertEqual(len(fused_nodes), 1)
    fused_node = fused_nodes[-1]
    self.assertEqual(fused_node.attr['fused_ops'].list.s,
                     [b'BiasAdd', b'Relu'])
    self.assertEqual(fused_node.attr['num_args'].i, 1)
def testFusePreluWithDepthwiseConv2d(self):
    """PReLU following a DepthwiseConv2D must be folded into the fused op.

    Freezes a DepthwiseConv2D + PReLU Keras model, runs grappler, then the
    prelu and depthwise fusing passes, and checks that no standalone Prelu
    node remains and that the fused node lists BiasAdd and Prelu.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.DepthwiseConv2D(
            1, bias_initializer=tf.initializers.constant(0.25)),
        tf.keras.layers.PReLU(),
    ])
    tf.keras.backend.set_learning_phase(0)
    sample_input = tf.constant([1.0, 1.0], shape=[1, 2, 1, 1])

    @tf.function
    def execute_model(tensor):
        return model(tensor)

    concrete_fn = execute_model.get_concrete_function(sample_input)
    graph = tf_saved_model_conversion_v2._freeze_saved_model_v2(concrete_fn)
    working_graph_def = graph.as_graph_def()

    # Pin any Conv2D node to the CPU before running grappler.
    for node in working_graph_def.node:
        if node.op == 'Conv2D':
            node.device = "/CPU:0"

    config = config_pb2.ConfigProto()
    config.graph_options.rewrite_options.optimizers[:] = [
        'pruning', 'constfold', 'arithmetic', 'dependency', 'pruning',
        'remap', 'constfold', 'arithmetic', 'dependency'
    ]

    # Register the single output op in the 'train_op' collection.
    graph.add_to_collection('train_op',
                            graph.get_operation_by_name('Identity'))

    working_graph_def = tf_saved_model_conversion_v2._run_grappler(
        config, working_graph_def, graph, meta_graph_pb2.SignatureDef())

    working_graph_def = fuse_prelu.fuse_ops_for_prelu(working_graph_def)
    working_graph_def = fuse_depthwise_conv2d.fuse_depthwise_conv2d(
        working_graph_def)
    final_graph_def = fuse_prelu.fuse_prelu_with_fused_conv2d_or_matmul(
        working_graph_def)

    fused_node = None
    for node in final_graph_def.node:
        self.assertNotEqual("Prelu", node.op)
        if node.op == 'FusedDepthwiseConv2dNative':
            fused_node = node

    self.assertIsNotNone(fused_node)
    self.assertEqual(fused_node.attr['fused_ops'].list.s,
                     [b'BiasAdd', b'Prelu'])
    self.assertEqual(fused_node.attr['num_args'].i, 2)
def optimize_graph(graph, signature_def, output_graph,
                   tf_version, quantization_dtype=None,
                   skip_op_check=False, strip_debug_ops=False,
                   weight_shard_size_bytes=1024 * 1024 * 4):
    """Takes a Python Graph object and optimizes the graph.

    The graph is validated, run through two grappler passes with batch norm
    folding in between, then through the manual prelu / depthwise conv2d
    fusing passes, validated again, and finally handed to `extract_weights`.

    Args:
      graph: The frozen graph to optimize.
      signature_def: the SignatureDef of the inference graph.
      output_graph: The location of the output graph.
      tf_version: Tensorflow version of the input graph.
      quantization_dtype: An optional numpy dtype to quantize weights to for
        compression. Only np.uint8 and np.uint16 are supported.
      skip_op_check: Bool whether to skip the op check.
      strip_debug_ops: Bool whether to strip debug ops.
      weight_shard_size_bytes: Shard size (in bytes) of the weight files.
        The size of each weight file will be <= this value.

    Returns:
      The optimized graph produced by the last fusing pass (the same object
      that is passed to `extract_weights`).

    Raises:
      ValueError: If `validate` reports unsupported ops either before or
        after optimization.
    """
    # Add a collection 'train_op' so that Grappler knows the outputs.
    for _, output in signature_def.outputs.items():
        name = output.name.split(':')[0]
        graph.add_to_collection('train_op',
                                graph.get_operation_by_name(name))

    graph_def = graph.as_graph_def()

    unsupported = validate(graph_def.node, skip_op_check, strip_debug_ops)
    if unsupported:
        raise ValueError('Unsupported Ops in the model before optimization\n' +
                         ', '.join(unsupported))

    # First pass of grappler optimization, this is needed for batch norm
    # folding.
    config = config_pb2.ConfigProto()
    rewriter_config = config.graph_options.rewrite_options
    rewriter_config.optimizers[:] = [
        'pruning', 'constfold', 'arithmetic', 'dependency', 'pruning',
        'constfold', 'arithmetic', 'dependency'
    ]
    if strip_debug_ops:
        rewriter_config.optimizers.insert(0, 'debug_stripper')

    optimized_graph = _run_grappler(config, graph_def, graph, signature_def)

    # Batch norm folding.
    optimized_graph = fold_batch_norms.fold_batch_norms(optimized_graph)

    # Set the device to CPU for all Conv2D and MatMul nodes, since the
    # grappler remap optimizer only supports FusedConv2D and FusedMatMul for
    # CPU.
    for node in optimized_graph.node:
        if node.op in ('Conv2D', 'MatMul'):
            node.device = '/device:CPU:0'

    # Rerun grappler to fuse conv2d/matmul. Replacing the whole optimizer
    # list also drops 'debug_stripper' if it was added for the first pass.
    config.graph_options.rewrite_options.optimizers[:] = [
        'remap', 'constfold', 'arithmetic', 'dependency'
    ]
    optimized_graph = _run_grappler(config, optimized_graph, graph,
                                    signature_def)
    optimized_graph = _remove_unused_control_flow_inputs(optimized_graph)

    # Because TF breaks the Prelu op into 6 ops, for performance we are
    # fusing those ops into a single prelu.
    optimized_graph = fuse_prelu.fuse_ops_for_prelu(optimized_graph)

    # Because grappler does not support DepthwiseConv2d fusing, we have
    # implemented it here.
    optimized_graph = fuse_depthwise_conv2d.fuse_depthwise_conv2d(
        optimized_graph)

    # Since the grappler remap optimizer does not support prelu as the
    # activation function for the _FusedConv2D op, we are doing it manually
    # here.
    optimized_graph = fuse_prelu.fuse_prelu_with_fused_conv2d_or_matmul(
        optimized_graph)

    unsupported = validate(optimized_graph.node, skip_op_check,
                           strip_debug_ops)
    if unsupported:
        raise ValueError('Unsupported Ops in the model after optimization\n' +
                         ', '.join(unsupported))

    extract_weights(
        optimized_graph, output_graph, tf_version,
        signature_def, quantization_dtype, weight_shard_size_bytes)

    # Return the optimized graph itself; the original returned the
    # `optimize_graph` function object by mistake.
    return optimized_graph