def quantize(onnx_model_path: Path) -> Path:
    """
    Quantize the weights of the model from float32 to in8 to allow very efficient inference on modern CPU.
    Args:
        onnx_model_path: Path to location the exported ONNX model is stored

    Returns: The Path generated for the quantized
    """
    import onnx
    from onnxruntime.quantization import QuantizationMode, quantize

    onnx_model = onnx.load(onnx_model_path.as_posix())

    # Discussed with @yufenglee from ONNX runtime, this will be addressed in the next release of onnxruntime
    print(
        "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n"
        "This limitation will be removed in the next release of onnxruntime.")

    quantized_model = quantize(
        model=onnx_model,
        quantization_mode=QuantizationMode.IntegerOps,
        force_fusions=True,
        symmetric_weight=True,
    )

    # Append "-quantized" at the end of the model's name
    quantized_model_path = generate_identified_filename(
        onnx_model_path, "-quantized")

    # Save the quantized model, then report where it was written
    onnx.save_model(quantized_model, quantized_model_path.as_posix())
    print(
        f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}"
    )

    return quantized_model_path
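A minimal usage sketch for the helper above, assuming it is importable alongside the rest of the conversion utilities and that onnx/bert-base-cased.onnx is a hypothetical, already exported model. The second call shows quantize_dynamic, the API that superseded quantize() in later onnxruntime releases.

from pathlib import Path

# Writes onnx/bert-base-cased-quantized.onnx next to the original model
quantized_path = quantize(Path("onnx/bert-base-cased.onnx"))

# Later onnxruntime releases replaced quantize() with quantize_dynamic(),
# which serializes the int8 model directly to disk (hypothetical paths):
from onnxruntime.quantization import quantize_dynamic, QuantType

quantize_dynamic(
    "onnx/bert-base-cased.onnx",             # float32 input model
    "onnx/bert-base-cased-quantized.onnx",   # int8 output model
    weight_type=QuantType.QInt8,
)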
Example No. 2
            # onnxruntime optimizations don't provide the same level of performance on TensorFlow as they do on PyTorch
            if args.framework == "tf":
                print(
                    "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n"
                    "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n"
                    "\t For more information, please refer to the onnxruntime documentation:\n"
                    "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n"
                )

            print("\n====== Optimizing ONNX model ======")

            # Quantization works best when using the optimized version of the model
            args.optimized_output = optimize(args.output)

            # Do the quantization on the right graph
            args.quantized_output = quantize(args.optimized_output)

        # And verify
        if args.check_loading:
            print("\n====== Check exported ONNX model(s) ======")
            verify(args.output)

            if hasattr(args, "optimized_output"):
                verify(args.optimized_output)

            if hasattr(args, "quantized_output"):
                verify(args.quantized_output)

    except Exception as e:
        print(f"Error while converting the model: {e}")
        exit(1)
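The verify() helper called above is defined elsewhere in the script; a minimal sketch of such a check, assuming all it needs to do is confirm that onnxruntime can parse and load the exported graph (the messages below are illustrative, not the library's output):

from pathlib import Path
from onnxruntime import InferenceSession, SessionOptions

def verify(path: Path):
    print(f"Checking ONNX model loading from: {path} ...")
    try:
        # Building an InferenceSession forces onnxruntime to validate the graph
        options = SessionOptions()
        _ = InferenceSession(path.as_posix(), options)
        print(f"Model {path} correctly loaded")
    except Exception as e:
        print(f"Error while loading the model: {e}")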
Example No. 3
            model=model_path,
            tokenizer=args.model_name,
            output=onnx_path + '/converted.onnx',
            opset=11)

    print('>> optimizing..')
    # ONNX optimization
    optimized_model = optimizer.optimize_model(onnx_path + '/converted.onnx',
                                               model_type=args.model_type,
                                               num_heads=12,
                                               hidden_size=768)

    optimized_onnx_model_path = os.path.join(onnx_path, 'bert_optimized.onnx')
    optimized_model.save_model_to_file(optimized_onnx_model_path)
    print('Optimized model saved at:', optimized_onnx_model_path)

    print('>> quantizing..')
    # Quantize the optimized graph (quantization works best on the optimized model)
    model = onnx.load(optimized_onnx_model_path)
    quantized_model = quantize(model=model,
                               quantization_mode=QuantizationMode.IntegerOps,
                               force_fusions=True,
                               symmetric_weight=True)
    optimized_quantized_onnx_model_path = os.path.join(
        os.path.dirname(optimized_onnx_model_path),
        'ONNX_model_optimized_quantized.onnx')
    onnx.save_model(quantized_model, optimized_quantized_onnx_model_path)
    print('Quantized & optimized model saved at:',
          optimized_quantized_onnx_model_path)

    # break
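A quick follow-up sketch to confirm that the int8 conversion actually shrank the graph, using only the standard library and the paths produced above:

import os

fp32_size = os.path.getsize(optimized_onnx_model_path)
int8_size = os.path.getsize(optimized_quantized_onnx_model_path)
print(f'optimized (float32): {fp32_size / 1e6:.1f} MB')
print(f'quantized (int8)   : {int8_size / 1e6:.1f} MB '
      f'({int8_size / fp32_size:.0%} of the float32 size)')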
Example No. 4
model = onnx.load(
    "C:/Users/82109/PycharmProjects/Quantization/original_model.onnx")

sess0 = onnxruntime.InferenceSession(
    "C:/Users/82109/PycharmProjects/Quantization/original_model.onnx")

input_name = sess0.get_inputs()[0].name
n = 1000
start = time.time()
pred0_onnx = sess0.run(None, {input_name: x_test[:n].astype(np.float32)})
print("ori_pred : ", (time.time() - start) / n)

snapshot = tracemalloc.take_snapshot()
top_stats = snapshot.statistics('lineno')

quantized_model = quantize(model,
                           quantization_mode=QuantizationMode.IntegerOps)
onnx.save(
    quantized_model,
    "C:/Users/82109/PycharmProjects/Quantization/orginal_model_test.onnx")

Q_model = onnx.load(
    "C:/Users/82109/PycharmProjects/Quantization/orginal_model_test.onnx")
sess = onnxruntime.InferenceSession(
    "C:/Users/82109/PycharmProjects/Quantization/orginal_model_test.onnx")

input_name = sess.get_inputs()[0].name

start = time.time()
pred_onnx = sess.run(None, {input_name: x_test[:n].astype(np.float32)})
print("Q_pred : ", (time.time() - start) / n)
Example No. 5
    args = parser.parse_args()

    # Make sure output is absolute path
    args.output = Path(args.output).absolute()

    try:
        # Convert
        convert(
            args.framework,
            args.model,
            args.output,
            args.opset,
            args.tokenizer,
            args.use_external_format,
            args.pipeline,
        )

        if args.quantize:
            args.quantized_output = quantize(args.output)

        # And verify
        if args.check_loading:
            verify(args.output)

            if hasattr(args, "quantized_output"):
                verify(args.quantized_output)

    except Exception as e:
        print(f"Error while converting the model: {e}")
        exit(1)
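The same export-quantize-verify pipeline can be driven without the argument parser; a minimal sketch, assuming convert, quantize and verify from this module are importable and using placeholder model/output values (the positional argument order mirrors the convert() call above):

from pathlib import Path

output = Path("onnx/bert-base-cased.onnx").absolute()

# framework, model, output, opset, tokenizer, use_external_format, pipeline
convert("pt", "bert-base-cased", output, 11, "bert-base-cased", False, "feature-extraction")

quantized_output = quantize(output)

verify(output)
verify(quantized_output)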