Example #1
    def optimize_onnx(onnx_model_path,
                      optimized_model_path,
                      is_float16,
                      num_attention_heads,
                      hidden_size,
                      use_external_data_format=False,
                      auto_mixed_precision=False,
                      **kwargs):
        """ Optimize ONNX model with an option to convert it to use mixed precision.
        """
        from fusion_options import FusionOptions
        from optimizer import optimize_model

        optimization_options = FusionOptions('gpt2')
        #optimization_options.enable_gelu = False
        #optimization_options.enable_layer_norm = False
        #optimization_options.enable_attention = False
        m = optimize_model(onnx_model_path,
                           model_type='gpt2',
                           num_heads=num_attention_heads,
                           hidden_size=hidden_size,
                           opt_level=0,
                           optimization_options=optimization_options,
                           use_gpu=False)

        if is_float16:
            if auto_mixed_precision:
                Gpt2Helper.auto_mixed_precision(m)
            else:
                m.convert_float_to_float16(use_symbolic_shape_infer=True,
                                           **kwargs)

        m.save_model_to_file(optimized_model_path, use_external_data_format)
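A minimal usage sketch for this helper. Since it calls Gpt2Helper.auto_mixed_precision, it appears to be a staticmethod of Gpt2Helper; the paths and model dimensions below are hypothetical (12 heads / hidden size 768 matches GPT-2 "small"):

    # Hypothetical paths; convert to fp16 with automatic mixed-precision tuning.
    Gpt2Helper.optimize_onnx("gpt2.onnx",
                             "gpt2_fp16.onnx",
                             is_float16=True,
                             num_attention_heads=12,
                             hidden_size=768,
                             auto_mixed_precision=True)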
Example #2
    def optimize_onnx(
        onnx_model_path: str,
        optimized_model_path: str,
        is_float16: bool,
        num_attention_heads: int,
        hidden_size: int,
        use_external_data_format: bool = False,
        auto_mixed_precision: bool = True,
    ):
        """Optimize ONNX model with an option to convert it to use mixed precision."""
        from optimizer import optimize_model  # assumed lazy import, matching the sibling helpers

        m = optimize_model(
            onnx_model_path,
            model_type="bert",
            num_heads=num_attention_heads,
            hidden_size=hidden_size,
            opt_level=0,
            optimization_options=None,
            use_gpu=False,
        )
        if is_float16:
            if auto_mixed_precision:
                T5Helper.auto_mixed_precision(m)
            else:
                m.convert_model_float32_to_float16(cast_input_output=False)

        m.save_model_to_file(optimized_model_path, use_external_data_format)
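This T5 variant differs from Example #1 mainly in the fp16 path: auto_mixed_precision defaults to True, and the fallback uses the older convert_model_float32_to_float16 API. A hedged call, assuming the method is a staticmethod of T5Helper (it calls T5Helper.auto_mixed_precision) and using hypothetical paths and dimensions:

    # Hypothetical paths; 8 heads / hidden size 512 matches t5-small.
    T5Helper.optimize_onnx("t5_encoder.onnx",
                           "t5_encoder_fp16.onnx",
                           is_float16=True,
                           num_attention_heads=8,
                           hidden_size=512)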
Example #3
    def optimize_onnx(onnx_model_path,
                      optimized_model_path,
                      is_float16,
                      num_attention_heads,
                      hidden_size,
                      use_external_data_format=False,
                      **kwargs):
        """ Optimize ONNX model with an option to convert it to use mixed precision.
        """
        from fusion_options import FusionOptions
        from optimizer import optimize_model

        optimization_options = FusionOptions('gpt2')
        #optimization_options.enable_gelu = False
        #optimization_options.enable_layer_norm = False
        #optimization_options.enable_attention = False
        m = optimize_model(onnx_model_path,
                           model_type='gpt2',
                           num_heads=num_attention_heads,
                           hidden_size=hidden_size,
                           opt_level=0,
                           optimization_options=optimization_options,
                           use_gpu=False)

        if is_float16:
            op_full_list = {node.op_type for node in m.nodes()}
            op_block_list = set(kwargs["op_block_list"]) if "op_block_list" in kwargs else set()
            op_remain_list = op_full_list.difference(op_block_list)
            logger.info(f"op_block_list={op_block_list} op_remain_list={op_remain_list}")
            m.convert_float_to_float16(use_symbolic_shape_infer=True, **kwargs)

        m.save_model_to_file(optimized_model_path, use_external_data_format)
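This variant forwards **kwargs to convert_float_to_float16, so callers can keep numerically sensitive ops in fp32 via op_block_list (which the logging above also reports). A sketch with hypothetical paths and an illustrative block list:

    # Keep LayerNormalization and Add in fp32 during the fp16 conversion.
    Gpt2Helper.optimize_onnx("gpt2.onnx",
                             "gpt2_fp16.onnx",
                             is_float16=True,
                             num_attention_heads=12,
                             hidden_size=768,
                             op_block_list=["LayerNormalization", "Add"])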
Example #4
def optimize_onnx_model(onnx_model_path, optimized_model_path, model_type,
                        num_attention_heads, hidden_size, use_gpu, fp16,
                        overwrite):
    if overwrite or not os.path.exists(optimized_model_path):
        from optimizer import optimize_model
        from onnx_model_bert import BertOptimizationOptions
        optimization_options = BertOptimizationOptions(model_type)
        if fp16:
            optimization_options.enable_gelu_approximation = True

        # Use the Python fusion script (only_onnxruntime=False) to optimize the model.
        # Use opt_level <= 1 for models that will be converted to fp16, because some fused ops
        # (like FusedGemm) have only fp32 implementations and no fp16 ones. It is better to be
        # conservative, so we use opt_level=0 here in case OnnxRuntime adds MemcpyFromHost to the graph.
        opt_model = optimize_model(onnx_model_path,
                                   model_type,
                                   num_heads=num_attention_heads,
                                   hidden_size=hidden_size,
                                   opt_level=0,
                                   optimization_options=optimization_options,
                                   use_gpu=use_gpu,
                                   only_onnxruntime=False)
        model_fusion_statistics[optimized_model_path] = opt_model.get_fused_operator_statistics()

        if fp16:
            opt_model.convert_model_float32_to_float16()
        opt_model.save_model_to_file(optimized_model_path)
    else:
        logger.info(f"Skip optimization since the optimized model already exists: {optimized_model_path}")
Example #5
    def test_keras_squad_model(self):
        input = _get_test_model_path('bert_keras_squad')

        bert_model = optimize_model(input, 'bert_keras', num_heads=2, hidden_size=8)

        print("fused_operator_statistics for test_keras_squad_model", bert_model.get_fused_operator_statistics())

        self.assertTrue(bert_model.is_fully_optimized())
Example #6
    def test_3d_attention_fusion_tf2onnx_model(self):
        model = create_tf2onnx_attention_3d()
        dir = "."
        model_path = os.path.join(dir, "bert_3d_attention.onnx")
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path, model_type="bert_tf", num_heads=4, hidden_size=16)
        os.remove(model_path)

        self.verify_fusion(optimized_model, "bert_3d_attention_opt.onnx")
Example #7
    def test_attention_fusion(self):
        model = create_bert_attention()
        dir = "."
        model_path = os.path.join(dir, "attention.onnx")
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path)
        os.remove(model_path)

        self.verify_fusion(optimized_model, "attention_opt.onnx")
Example #8
def optimize_onnx_model(
    model_name,
    onnx_model_path,
    optimized_model_path,
    model_type,
    num_attention_heads,
    hidden_size,
    use_gpu,
    precision,
    use_raw_attention_mask,
    overwrite,
    model_fusion_statistics,
    use_external_data_format,
    optimization_options=None,
):
    if overwrite or not os.path.exists(optimized_model_path):
        Path(optimized_model_path).parent.mkdir(parents=True, exist_ok=True)

        from fusion_options import FusionOptions
        from optimizer import optimize_model

        if optimization_options is None:
            optimization_options = FusionOptions(model_type)
        optimization_options.use_raw_attention_mask(use_raw_attention_mask)
        if Precision.FLOAT16 == precision:
            optimization_options.enable_gelu_approximation = True
        if Precision.INT8 == precision:
            optimization_options.enable_embed_layer_norm = False

        # Use the Python fusion script (only_onnxruntime=False) to optimize the model.
        # Use opt_level <= 1 for models that will be converted to fp16, because some fused ops
        # (like FusedGemm) have only fp32 implementations and no fp16 ones. It is better to be
        # conservative, so we use opt_level=0 here in case OnnxRuntime adds MemcpyFromHost to the graph.
        opt_model = optimize_model(
            onnx_model_path,
            model_type,
            num_heads=num_attention_heads,
            hidden_size=hidden_size,
            opt_level=0,
            optimization_options=optimization_options,
            use_gpu=use_gpu,
            only_onnxruntime=False,
        )
        if model_type == "bert_keras" or model_type == "bert_tf":
            opt_model.use_dynamic_axes()

        model_fusion_statistics[optimized_model_path] = opt_model.get_fused_operator_statistics()

        if Precision.FLOAT16 == precision:
            opt_model.convert_float_to_float16(keep_io_types=True)

        opt_model.save_model_to_file(optimized_model_path,
                                     use_external_data_format)
    else:
        logger.info(f"Skip optimization since the optimized model already exists: {optimized_model_path}")
Example #9
def optimize_onnx(input_onnx_path, optimized_onnx_path, expected_op=None):
    if find_transformers_source():
        from optimizer import optimize_model
    else:
        from onnxruntime.transformers.optimizer import optimize_model

    onnx_model = optimize_model(input_onnx_path, model_type='gpt2')
    onnx_model.save_model_to_file(optimized_onnx_path)

    if expected_op is not None:
        assert len(onnx_model.get_nodes_by_op_type(expected_op)) == 1, \
            f"Expected {expected_op} node not found in the optimized model {optimized_onnx_path}"
Example #10
    def test_attention_fusion_pruned_model(self):
        model = create_bert_attention(input_hidden_size=16,
                                      num_heads=2,
                                      pruned_qk_hidden_size=8,
                                      pruned_v_hidden_size=8)
        dir = '.'
        model_path = os.path.join(dir, "pruned_attention.onnx")
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path)
        os.remove(model_path)

        self.verify_fusion(optimized_model, 'pruned_attention_opt.onnx')
Example #11
def optimize_onnx_model(onnx_model_filename, model_type, num_attention_heads, hidden_size, use_gpu, fp16, overwrite):
    suffix = "_fp{}_{}.onnx".format(16 if fp16 else 32, "gpu" if use_gpu else "cpu")
    optimized_model_filename = onnx_model_filename.replace(".onnx", suffix)
    if overwrite or not os.path.exists(optimized_model_filename):
        from optimizer import optimize_model
        from BertOnnxModel import BertOptimizationOptions
        optimization_options = BertOptimizationOptions(model_type)
        if fp16:
            optimization_options.enable_gelu_approximation = True

        # First pass: let OnnxRuntime itself optimize the model (only_onnxruntime=True)
        # so its fusion statistics can be recorded for comparison.
        opt_model = optimize_model(onnx_model_filename,
                                   model_type,
                                   num_heads=num_attention_heads,
                                   hidden_size=hidden_size,
                                   opt_level=99,
                                   optimization_options=optimization_options,
                                   use_gpu=use_gpu,
                                   only_onnxruntime=True)
        model_fusion_statistics[onnx_model_filename] = opt_model.get_fused_operator_statistics()

        # Second pass: use the Python fusion script. Use opt_level <= 1 for models that will be
        # converted to fp16, because some fused ops (like FusedGemm) have only fp32 implementations
        # and no fp16 ones. It is better to be conservative, so we use opt_level=0 here in case
        # OnnxRuntime adds MemcpyFromHost to the graph.
        opt_model = optimize_model(onnx_model_filename,
                                   model_type,
                                   num_heads=num_attention_heads,
                                   hidden_size=hidden_size,
                                   opt_level=0,
                                   optimization_options=optimization_options,
                                   use_gpu=use_gpu,
                                   only_onnxruntime=False)
        model_fusion_statistics[optimized_model_filename] = opt_model.get_fused_operator_statistics()

        if fp16:
            opt_model.convert_model_float32_to_float16()
        opt_model.save_model_to_file(optimized_model_filename)
    else:
        logger.info(f"Skip optimization since model existed: {optimized_model_filename}")
    return optimized_model_filename
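Unlike Example #4, this variant derives the output filename from the input and returns it, so callers can chain it. A hedged call with hypothetical paths:

    # Writes e.g. model_fp16_gpu.onnx next to the input and returns its path.
    optimized_path = optimize_onnx_model("model.onnx", "bert",
                                         num_attention_heads=12, hidden_size=768,
                                         use_gpu=True, fp16=True, overwrite=False)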
Example #12
 def test_bert_tf2onnx_0(self):
     input = _get_test_model_path('bert_tf2onnx_0')
     model = optimize_model(input, 'bert_tf', num_heads=2, hidden_size=8)
     expected_node_count = {
         'EmbedLayerNormalization': 0,
         'Attention': 6,
         'Gelu': 0,
         'FastGelu': 6,
         'BiasGelu': 0,
         'LayerNormalization': 0,
         'SkipLayerNormalization': 13
     }
     self.verify_node_count(model, expected_node_count, 'test_bert_tf2onnx_0')
Example #13
    def test_attention_fusion_for_varied_qkv_dimensions(self):
        model = create_bert_attention(input_hidden_size=16,
                                      num_heads=2,
                                      pruned_qk_hidden_size=24,
                                      pruned_v_hidden_size=16)
        dir = '.'
        model_path = os.path.join(dir, "attention_with_varied_qkv.onnx")
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path)
        os.remove(model_path)

        self.verify_fusion(optimized_model,
                           'attention_with_varied_qkv_opt.onnx')
Example #14
    def test_pytorch_model_0(self):
        input = BERT_TEST_MODELS['bert_pytorch_0']
        bert_model = optimize_model(input, 'bert', num_heads=2, hidden_size=8)

        expected_node_count = {
            'EmbedLayerNormalization': 1,
            'Attention': 12,
            'SkipLayerNormalization': 24,
            'Gelu': 0,
            'FastGelu': 0,
            'BiasGelu': 12
        }
        self.verify_node_count(bert_model, expected_node_count)
Example #15
    def test_attention_fusion_pruned_model(self):
        model = create_bert_attention()
        dir = '.'
        model_path = os.path.join(dir, "pruned_attention.onnx")
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path)
        os.remove(model_path)

        expected_model_path = os.path.join(os.path.dirname(__file__),
                                           'test_data', 'fusion',
                                           'pruned_attention_opt.onnx')
        expected = onnx.load(expected_model_path)
        self.assertEqual(str(optimized_model.model.graph), str(expected.graph))
Example #16
 def test_multiple_embed(self):
     input_model_path = _get_test_model_path('multiple_embed')
     model = optimize_model(input_model_path, 'bert', num_heads=2, hidden_size=4)
     expected_node_count = {
         'EmbedLayerNormalization': 2,
         'Attention': 2,
         'Gelu': 0,
         'FastGelu': 0,
         'BiasGelu': 0,
         'LayerNormalization': 0,
         'SkipLayerNormalization': 0
     }
     self.verify_node_count(model, expected_node_count, 'test_multiple_embed')
Example #17
 def test_gpt2_past_mask(self):
     input = _get_test_model_path('gpt2_past_mask')
     model = optimize_model(input, 'gpt2', num_heads=2, hidden_size=4)
     expected_node_count = {
         'EmbedLayerNormalization': 0,
         'Attention': 1,
         'Gelu': 0,
         'FastGelu': 1,
         'BiasGelu': 0,
         'LayerNormalization': 2,
         'SkipLayerNormalization': 0
     }
     self.verify_node_count(model, expected_node_count, 'test_gpt2_past_mask')
Example #18
    def test_embed_layer_norm_fusion(self):
        onnx_files = []
        for i in [3, 8, 9]:
            onnx_files.append(f"embed_layer_norm_format{i}.onnx")
            onnx_files.append(f"embed_layer_norm_format{i}_opset13.onnx")
        onnx_files.append('embed_layer_norm_format3_no_cast.onnx')
        onnx_files.append('embed_layer_norm_format3_no_cast_opset13.onnx')

        for file in onnx_files:
            input_model_path = get_fusion_test_model(file)
            model = optimize_model(input_model_path, 'bert')
            expected_node_count = {'EmbedLayerNormalization': 1, 'Attention': 1, 'ReduceSum': 0}
            self.verify_node_count(model, expected_node_count, file)
Example #19
    def optimize_onnx(onnx_model_path, optimized_model_path, is_float16,
                      num_attention_heads, hidden_size):
        from optimizer import optimize_model
        m = optimize_model(onnx_model_path,
                           model_type='gpt2',
                           num_heads=num_attention_heads,
                           hidden_size=hidden_size,
                           opt_level=0,
                           optimization_options=None,
                           use_gpu=False)
        if is_float16:
            m.convert_model_float32_to_float16(cast_input_output=False)

        m.save_model_to_file(optimized_model_path)
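This is the simplest of the helper variants, with no fusion options, kwargs, or external-data support; like the earlier examples it is likely a staticmethod of a helper class, shown here as a direct call for brevity. A hypothetical fp32 call:

    # Hypothetical paths; keep the model in fp32.
    optimize_onnx("gpt2.onnx", "gpt2_opt.onnx", is_float16=False,
                  num_attention_heads=12, hidden_size=768)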
Example #20
 def test_gpt2_past_mask(self):
     input = _get_test_model_path("gpt2_past_mask")
     model = optimize_model(input, "gpt2", num_heads=2, hidden_size=4)
     expected_node_count = {
         "EmbedLayerNormalization": 1,
         "Attention": 1,
         "Gelu": 0,
         "FastGelu": 1,
         "BiasGelu": 0,
         "LayerNormalization": 1,
         "SkipLayerNormalization": 0,
     }
     self.verify_node_count(model, expected_node_count,
                            "test_gpt2_past_mask")
Example #21
    def test_attention_fusion_reverse_add_order(self):
        model = create_bert_attention(input_hidden_size=16,
                                      num_heads=2,
                                      pruned_qk_hidden_size=8,
                                      pruned_v_hidden_size=8,
                                      switch_add_inputs=True)
        dir = '.'
        model_path = os.path.join(dir, "bert_attention_reverse_add_order.onnx")
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path)
        os.remove(model_path)

        # Reversing the order of the Add inputs yields the same optimized model.
        self.verify_fusion(optimized_model, 'pruned_attention_opt.onnx')
Example #22
    def test_attention_fusion_reverse_add_order(self):
        model = create_bert_attention(switch_add_inputs=True)
        dir = '.'
        model_path = os.path.join(dir, "bert_attention_reverse_add_order.onnx")
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path)
        os.remove(model_path)

        # Reversing the order of the Add inputs yields the same optimized model.
        expected_model_path = os.path.join(os.path.dirname(__file__),
                                           'test_data', 'fusion',
                                           'pruned_attention_opt.onnx')
        expected = onnx.load(expected_model_path)
        self.assertEqual(str(optimized_model.model.graph), str(expected.graph))
Example #23
    def test_gpt2(self):
        input = BERT_TEST_MODELS['gpt2']
        bert_model = optimize_model(input, 'gpt2', num_heads=2, hidden_size=4)

        expected_node_count = {
            'EmbedLayerNormalization': 0,
            'Attention': 12,
            'Gelu': 0,
            'FastGelu': 12,
            'BiasGelu': 0,
            'LayerNormalization': 25,
            'SkipLayerNormalization': 0
        }
        self.verify_node_count(bert_model, expected_node_count)
Example #24
    def test_pytorch_model_2(self):
        input = _get_test_model_path('bert_squad_pytorch1.4_opset10_fp32')
        bert_model = optimize_model(input, 'bert', num_heads=2, hidden_size=8)
        print("fused_operator_statistics for test_pytorch_model_2", bert_model.get_fused_operator_statistics())
        self.assertTrue(bert_model.is_fully_optimized())

        # Test changing the graph inputs to int32.
        bert_model.change_input_to_int32()
        embed_nodes = bert_model.get_nodes_by_op_type('EmbedLayerNormalization')
        for embed_node in embed_nodes:
            bert_inputs = embed_node.input[:2] + embed_node.input[7:]
            for bert_input in bert_inputs:
                self.assertIsNotNone(bert_model.find_graph_input(bert_input))
        for input in bert_model.graph().input:
            self.assertEqual(input.type.tensor_type.elem_type, TensorProto.INT32)
Example #25
    def test_keras_model_1(self):
        input = _get_test_model_path('bert_keras_0')

        bert_model = optimize_model(input, 'bert_keras', num_heads=2, hidden_size=8)

        expected_node_count = {
            'EmbedLayerNormalization': 1,
            'Attention': 12,
            'LayerNormalization': 0,
            'SkipLayerNormalization': 24,
            'BiasGelu': 12,
            'Gelu': 0,
            'FastGelu': 0
        }
        self.verify_node_count(bert_model, expected_node_count, 'test_keras_model_1')
Example #26
    def test_3d_attention_fusion_tf2onnx_model(self):
        model = create_tf2onnx_attention_3d()
        dir = '.'
        model_path = os.path.join(dir, 'bert_3d_attention.onnx')
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path,
                                         model_type='bert_tf',
                                         num_heads=4,
                                         hidden_size=16)
        os.remove(model_path)

        expected_model_path = os.path.join(os.path.dirname(__file__),
                                           'test_data', 'fusion',
                                           'bert_3d_attention_opt.onnx')
        expected = onnx.load(expected_model_path)
        self.assertEqual(str(optimized_model.model.graph), str(expected.graph))
Example #27
 def test_fusions(self):
     for test_case in test_cases:
         source, operator, model_class = test_case
         model = model_class()
         dummy_input = torch.ones(3, dtype=torch.float32)
         test_name = f"{operator}_{source}"
         onnx_path = f"{test_name}.onnx"
         torch.onnx.export(model, (dummy_input,),
                           onnx_path,
                           input_names=['input'],
                           output_names=['output'])
         optimizer = optimize_model(onnx_path, 'bert')
         # optimizer.save_model_to_file(f"{operator}_{source}_opt.onnx")
         os.remove(onnx_path)
         expected_node_count = {operator: 1}
         self.verify_node_count(optimizer, expected_node_count, test_name)
Example #28
    def test_attention_fusion_for_varied_qkv_dimensions(self):
        model = create_bert_attention(input_hidden_size=16,
                                      num_heads=2,
                                      pruned_qk_hidden_size=24,
                                      pruned_v_hidden_size=16)
        dir = '.'
        model_path = os.path.join(dir, "attention_with_varied_qkv.onnx")
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path)
        os.remove(model_path)

        expected_model_path = os.path.join(
            os.path.dirname(__file__), 'test_data', 'models',
            'attention_with_varied_qkv_opt.onnx')
        expected = onnx.load(expected_model_path)
        self.assertEqual(str(optimized_model.model.graph), str(expected.graph))
Example #29
 def test_multiple_embed(self):
     input_model_path = _get_test_model_path("multiple_embed")
     model = optimize_model(input_model_path,
                            "bert",
                            num_heads=2,
                            hidden_size=4)
     expected_node_count = {
         "EmbedLayerNormalization": 2,
         "Attention": 2,
         "Gelu": 0,
         "FastGelu": 0,
         "BiasGelu": 0,
         "LayerNormalization": 0,
         "SkipLayerNormalization": 0,
     }
     self.verify_node_count(model, expected_node_count,
                            "test_multiple_embed")
Example #30
    def test_attention_fusion_for_varied_qkv_dimensions_with_wrong_opt_parameters(self):
        model = create_bert_attention(
            input_hidden_size=16,
            num_heads=2,
            pruned_qk_hidden_size=24,
            pruned_v_hidden_size=16,
        )
        dir = "."
        model_path = os.path.join(dir, "attention_with_varied_qkv.onnx")
        onnx.save(model, model_path)

        # wrong num_heads and hidden_size
        optimized_model = optimize_model(model_path, "bert", num_heads=8, hidden_size=8)

        os.remove(model_path)

        self.verify_fusion(optimized_model, "attention_with_varied_qkv_opt.onnx")