def convert_float_to_float16(self, use_symbolic_shape_infer=True, **kwargs):
    """Convert a model to half (default) or mixed precision.
       To use mixed precision, specify which graph inputs, outputs, operator types or nodes shall be kept in float32.

       By default, symbolic shape inference is used to get shape and type information; otherwise, ONNX shape
       inference is used. Note that symbolic/ONNX shape inference might fail, and the conversion might not
       proceed without shape and type information.

    Args:
        use_symbolic_shape_infer (bool, optional): use symbolic shape inference instead of onnx shape inference.
                                                   Defaults to True.
        keep_io_types (Union[bool, List[str]], optional): boolean, or a list of float32 input/output names.
                                                          If True, model inputs/outputs are kept as float32.
                                                          Defaults to False.
        op_block_list (List[str], optional): List of operator types to leave as float32.
                                             Defaults to None, which uses `float16.DEFAULT_OP_BLOCK_LIST`.
        node_block_list (List[str], optional): List of node names to leave as float32. Defaults to None.
        force_fp16_initializers (bool): force converting all float initializers to float16.
                                        Defaults to False, which converts only the ones needed to avoid precision loss.
        min_positive_val (float, optional): minimal positive value. Defaults to 1e-7.
        max_finite_val (float, optional): maximal finite value. Defaults to 1e4.
    """
    if "keep_io_types" not in kwargs:
        kwargs["keep_io_types"] = True

    def float_to_float16_func():
        # TODO: import from onnxconverter_common when it is stable
        # try:
        #     import onnxconverter_common as oc
        #     from packaging.version import Version
        #     if Version(oc.__version__) > Version("1.9.0"):
        #         from onnxconverter_common.float16 import convert_float_to_float16
        #         return convert_float_to_float16
        # except ImportError:
        #     pass

        from float16 import convert_float_to_float16

        return convert_float_to_float16

    convert_float_to_float16 = float_to_float16_func()

    model = self.model
    if use_symbolic_shape_infer:
        # Use symbolic shape inference since custom operators (like Gelu, SkipLayerNormalization etc)
        # are not recognized by onnx shape inference.
        shape_infer_helper = SymbolicShapeInferenceHelper(model)
        model = shape_infer_helper.infer_shapes(model, auto_merge=True, guess_output_rank=False)

    parameters = {"disable_shape_infer": use_symbolic_shape_infer}
    parameters.update(
        {
            key: kwargs[key]
            for key in [
                "keep_io_types",
                "min_positive_val",
                "max_finite_val",
                "op_block_list",
                "node_block_list",
                "force_fp16_initializers",
            ]
            if key in kwargs
        }
    )

    fp16_model = convert_float_to_float16(model, **parameters)
    self.initialize(fp16_model)

    # convert_float_to_float16 might add Cast(to=10) --> Cast(to=1) when two consecutive nodes are computed in FP32.
    # The post-processing below removes those Cast nodes.

    # Remove the first Cast node in paths like --> Cast --> Cast -->
    nodes_to_remove = []
    for node in self.nodes():
        if node.op_type == "Cast":
            parent = self.get_parent(node, 0)
            if parent and parent.op_type == "Cast":
                # The parent Cast cannot be removed if its output is used by multiple nodes.
                if len(self.get_children(parent)) == 1:
                    self.replace_input_of_all_nodes(parent.output[0], parent.input[0])
                    nodes_to_remove.append(parent)

    # Remove the second Cast node.
    for node in self.nodes():
        if (
            node.op_type == "Cast"
            and OnnxModel.get_node_attribute(node, "to") == int(TensorProto.FLOAT)
            and self.get_dtype(node.input[0]) == int(TensorProto.FLOAT)
        ):
            if self.find_graph_output(node.output[0]):
                self.replace_output_of_all_nodes(node.input[0], node.output[0])
            else:
                self.replace_input_of_all_nodes(node.output[0], node.input[0])
            nodes_to_remove.append(node)

    self.remove_nodes(nodes_to_remove)
    if nodes_to_remove:
        self.prune_graph()
        print(f"removed {len(nodes_to_remove)} Cast nodes from float16 model")
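
# Usage sketch (illustrative, not part of this class): mixed-precision conversion that keeps graph
# inputs/outputs and a chosen operator type in float32. The model path, the block list entry, and the
# save_model_to_file helper are assumptions for the example, not taken from this section.
#
#   import onnx
#   onnx_model = OnnxModel(onnx.load("model.onnx"))
#   onnx_model.convert_float_to_float16(
#       keep_io_types=True,                    # graph inputs/outputs stay float32
#       op_block_list=["LayerNormalization"],  # operator types to keep in float32 (example choice)
#       node_block_list=None,                  # or a list of node names to keep in float32
#   )
#   onnx_model.save_model_to_file("model_fp16.onnx")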
def convert_model_float32_to_float16(self, cast_input_output=True, use_symbolic_shape_infer=True):
    """Convert a graph to FLOAT16. By default, the data types of inputs and outputs are kept.
       For a decoder model with past_key_values, it is recommended to set cast_input_output=False for better performance.

    Args:
        cast_input_output (bool, optional): keep the data types of inputs and outputs, and add Cast nodes to
                                            convert float32 inputs to float16, and float16 outputs back to float32.
                                            Defaults to True.
        use_symbolic_shape_infer (bool, optional): use symbolic shape inference instead of onnx shape inference.
                                                   Defaults to True.
    """
    import onnxconverter_common as oc
    from packaging.version import Version

    if Version(oc.__version__) > Version("1.7.0"):
        model = self.model
        if use_symbolic_shape_infer:
            # Use symbolic shape inference since custom operators (like Gelu, SkipLayerNormalization etc)
            # are not recognized by onnx shape inference.
            shape_infer_helper = SymbolicShapeInferenceHelper(model)
            model = shape_infer_helper.infer_shapes(model, auto_merge=True, guess_output_rank=False)
        self.model = oc.float16.convert_float_to_float16(
            model, keep_io_types=cast_input_output, disable_shape_infer=use_symbolic_shape_infer)
        return

    # Fallback for older onnxconverter_common: convert tensors and attributes in place.
    graph = self.model.graph
    initializers = graph.initializer

    # Convert float32 initializers to float16.
    for initializer in initializers:
        if initializer.data_type == 1:
            initializer.CopyFrom(
                numpy_helper.from_array(
                    numpy_helper.to_array(initializer).astype(np.float16), initializer.name))

    for node in graph.node:
        # Convert float32 constant values to float16.
        if node.op_type in ['Constant', 'ConstantOfShape']:
            for att in node.attribute:
                if att.name == 'value' and att.t.data_type == 1:
                    att.CopyFrom(
                        helper.make_attribute(
                            "value",
                            numpy_helper.from_array(numpy_helper.to_array(att.t).astype(np.float16))))
        # Redirect Cast-to-float32 nodes to cast to float16 instead.
        if node.op_type == 'Cast':
            for att in node.attribute:
                if att.name == 'to' and att.i == 1:
                    att.CopyFrom(helper.make_attribute("to", int(TensorProto.FLOAT16)))

    if not cast_input_output:
        self.change_input_output_float32_to_float16()
        return

    # Below assumes that the input and output data types are kept as float32.
    # Add Cast nodes to convert float32 inputs to float16.
    for input_value_info in graph.input:
        if input_value_info.type.tensor_type.elem_type == TensorProto.FLOAT:
            initializer = self.get_initializer(input_value_info.name)
            if initializer is not None:  # for compatibility with old converter/exporter
                input_value_info.type.tensor_type.elem_type = TensorProto.FLOAT16
            else:
                cast_input = input_value_info.name
                cast_output = input_value_info.name + '_float16'
                self.replace_input_of_all_nodes(cast_input, cast_output)
                cast_node = helper.make_node('Cast', inputs=[cast_input], outputs=[cast_output])
                cast_node.attribute.extend([helper.make_attribute("to", int(TensorProto.FLOAT16))])
                self.add_node(cast_node)

    # Add Cast nodes to convert float16 outputs back to float32.
    for output_value_info in graph.output:
        if output_value_info.type.tensor_type.elem_type == TensorProto.FLOAT:
            cast_input = output_value_info.name + '_float16'
            cast_output = output_value_info.name
            self.replace_output_of_all_nodes(cast_output, cast_input)
            self.replace_input_of_all_nodes(cast_output, cast_input)
            cast_node = helper.make_node('Cast', inputs=[cast_input], outputs=[cast_output])
            cast_node.attribute.extend([helper.make_attribute("to", int(TensorProto.FLOAT))])
            self.add_node(cast_node)
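
# Usage sketch (illustrative): for a decoder model whose past_key_values inputs/outputs should stay
# float16, skip the boundary Cast nodes as recommended in the docstring above. The model path and the
# save_model_to_file helper are assumptions for the example, not taken from this section.
#
#   import onnx
#   decoder = OnnxModel(onnx.load("decoder_with_past.onnx"))
#   decoder.convert_model_float32_to_float16(cast_input_output=False)
#   decoder.save_model_to_file("decoder_with_past_fp16.onnx")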